cld3 3.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72) hide show
  1. checksums.yaml +7 -0
  2. data/Gemfile +18 -0
  3. data/LICENSE +204 -0
  4. data/LICENSE_CLD3 +203 -0
  5. data/README.md +22 -0
  6. data/cld3.gemspec +35 -0
  7. data/ext/cld3/base.cc +36 -0
  8. data/ext/cld3/base.h +106 -0
  9. data/ext/cld3/casts.h +98 -0
  10. data/ext/cld3/embedding_feature_extractor.cc +51 -0
  11. data/ext/cld3/embedding_feature_extractor.h +182 -0
  12. data/ext/cld3/embedding_network.cc +196 -0
  13. data/ext/cld3/embedding_network.h +186 -0
  14. data/ext/cld3/embedding_network_params.h +285 -0
  15. data/ext/cld3/extconf.rb +49 -0
  16. data/ext/cld3/feature_extractor.cc +137 -0
  17. data/ext/cld3/feature_extractor.h +633 -0
  18. data/ext/cld3/feature_extractor.proto +50 -0
  19. data/ext/cld3/feature_types.cc +72 -0
  20. data/ext/cld3/feature_types.h +158 -0
  21. data/ext/cld3/fixunicodevalue.cc +55 -0
  22. data/ext/cld3/fixunicodevalue.h +69 -0
  23. data/ext/cld3/float16.h +58 -0
  24. data/ext/cld3/fml_parser.cc +308 -0
  25. data/ext/cld3/fml_parser.h +123 -0
  26. data/ext/cld3/generated_entities.cc +296 -0
  27. data/ext/cld3/generated_ulscript.cc +678 -0
  28. data/ext/cld3/generated_ulscript.h +142 -0
  29. data/ext/cld3/getonescriptspan.cc +1109 -0
  30. data/ext/cld3/getonescriptspan.h +124 -0
  31. data/ext/cld3/integral_types.h +37 -0
  32. data/ext/cld3/lang_id_nn_params.cc +57449 -0
  33. data/ext/cld3/lang_id_nn_params.h +178 -0
  34. data/ext/cld3/language_identifier_features.cc +165 -0
  35. data/ext/cld3/language_identifier_features.h +116 -0
  36. data/ext/cld3/nnet_language_identifier.cc +380 -0
  37. data/ext/cld3/nnet_language_identifier.h +175 -0
  38. data/ext/cld3/nnet_language_identifier_c.cc +72 -0
  39. data/ext/cld3/offsetmap.cc +478 -0
  40. data/ext/cld3/offsetmap.h +168 -0
  41. data/ext/cld3/port.h +143 -0
  42. data/ext/cld3/registry.cc +28 -0
  43. data/ext/cld3/registry.h +242 -0
  44. data/ext/cld3/relevant_script_feature.cc +89 -0
  45. data/ext/cld3/relevant_script_feature.h +49 -0
  46. data/ext/cld3/script_detector.h +156 -0
  47. data/ext/cld3/sentence.proto +77 -0
  48. data/ext/cld3/sentence_features.cc +29 -0
  49. data/ext/cld3/sentence_features.h +35 -0
  50. data/ext/cld3/simple_adder.h +72 -0
  51. data/ext/cld3/stringpiece.h +81 -0
  52. data/ext/cld3/task_context.cc +161 -0
  53. data/ext/cld3/task_context.h +81 -0
  54. data/ext/cld3/task_context_params.cc +74 -0
  55. data/ext/cld3/task_context_params.h +54 -0
  56. data/ext/cld3/task_spec.proto +98 -0
  57. data/ext/cld3/text_processing.cc +245 -0
  58. data/ext/cld3/text_processing.h +30 -0
  59. data/ext/cld3/unicodetext.cc +96 -0
  60. data/ext/cld3/unicodetext.h +144 -0
  61. data/ext/cld3/utf8acceptinterchange.h +486 -0
  62. data/ext/cld3/utf8prop_lettermarkscriptnum.h +1631 -0
  63. data/ext/cld3/utf8repl_lettermarklower.h +758 -0
  64. data/ext/cld3/utf8scannot_lettermarkspecial.h +1455 -0
  65. data/ext/cld3/utf8statetable.cc +1344 -0
  66. data/ext/cld3/utf8statetable.h +285 -0
  67. data/ext/cld3/utils.cc +241 -0
  68. data/ext/cld3/utils.h +144 -0
  69. data/ext/cld3/workspace.cc +64 -0
  70. data/ext/cld3/workspace.h +177 -0
  71. data/lib/cld3.rb +99 -0
  72. metadata +158 -0
@@ -0,0 +1,144 @@
1
+ /* Copyright 2016 Google Inc. All Rights Reserved.
2
+
3
+ Licensed under the Apache License, Version 2.0 (the "License");
4
+ you may not use this file except in compliance with the License.
5
+ You may obtain a copy of the License at
6
+
7
+ http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+ Unless required by applicable law or agreed to in writing, software
10
+ distributed under the License is distributed on an "AS IS" BASIS,
11
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ See the License for the specific language governing permissions and
13
+ limitations under the License.
14
+ ==============================================================================*/
15
+
16
+ #ifndef UTILS_H_
17
+ #define UTILS_H_
18
+
19
+ #include <stddef.h>
20
+ #include <functional>
21
+ #include <initializer_list>
22
+ #include <string>
23
+ #include <vector>
24
+
25
+ #include "base.h"
26
+ #include "script_span/stringpiece.h"
27
+
28
+ namespace chrome_lang_id {
29
+ namespace utils {
30
+
31
+ bool ParseInt32(const char *c_str, int *value);
32
+ bool ParseDouble(const char *c_str, double *value);
33
+ // Parses str with func and returns the parsed value. The bool returned by func is ignored.
34
+ template <typename T>
35
+ T ParseUsing(const string &str, std::function<bool(const char *, T *)> func) {
36
+ T value{};  // Value-initialized so the result is well defined even if func writes nothing.
37
+ func(str.c_str(), &value);
38
+ return value;
39
+ }
40
+ // Returns defval when str is empty; otherwise forwards str and func to the two-argument ParseUsing.
41
+ template <typename T>
42
+ T ParseUsing(const string &str, T defval,
43
+ std::function<bool(const char *, T *)> func) {
44
+ return str.empty() ? defval : ParseUsing<T>(str, func);
45
+ }
46
+
47
+ string CEscape(const string &src);
48
+
49
+ std::vector<string> Split(const string &text, char delim);
50
+
51
+ int RemoveLeadingWhitespace(StringPiece *text);
52
+
53
+ int RemoveTrailingWhitespace(StringPiece *text);
54
+
55
+ int RemoveWhitespaceContext(StringPiece *text);
56
+
57
+ uint32 Hash32(const char *data, size_t n, uint32 seed);
58
+
59
+ uint32 Hash32WithDefaultSeed(const string &input);
60
+
61
+ // Deletes all the elements in an STL container and clears the container. This
62
+ // function is suitable for use with a vector, set, hash_set, or any other STL
63
+ // container which defines sensible begin(), end(), and clear() methods.
64
+ // If container is NULL, this function is a no-op. Each element is passed to delete.
65
+ template <typename T>
66
+ void STLDeleteElements(T *container) {
67
+ if (!container) return;
68
+ auto it = container->begin();
69
+ while (it != container->end()) {
70
+ auto temp = it;  // Remember the current position.
71
+ ++it;  // Advance before the element at temp is deleted.
72
+ delete *temp;
73
+ }
74
+ container->clear();  // Remove the pointers that were just deleted.
75
+ }
76
+
77
+ class PunctuationUtil {  // Static helpers for recognizing punctuation characters and tags.
78
+ public:
79
+ // Unicode character ranges for punctuation characters according to CoNLL.
80
+ struct CharacterRange {
81
+ int first;  // Compared against with u < first in IsPunctuation.
82
+ int last;  // Compared against with u <= last in IsPunctuation.
83
+ };
84
+ static CharacterRange kPunctuation[];  // Declared here only; the array is defined elsewhere.
85
+
86
+ // Returns true if Unicode character is a punctuation character.
87
+ static bool IsPunctuation(int u) {
88
+ int i = 0;
89
+ while (kPunctuation[i].first > 0) {  // An entry whose first is not positive ends the scan.
90
+ if (u < kPunctuation[i].first) return false;  // Early exit; assumes ascending ranges - TODO confirm.
91
+ if (u <= kPunctuation[i].last) return true;
92
+ ++i;
93
+ }
94
+ return false;
95
+ }
96
+
97
+ // Determine if tag is a punctuation tag. Note that an empty tag yields true.
98
+ static bool IsPunctuationTag(const string &tag) {
99
+ for (size_t i = 0; i < tag.length(); ++i) {
100
+ int c = tag[i];
101
+ if (c != ',' && c != ':' && c != '.' && c != '\'' && c != '`') {
102
+ return false;
103
+ }
104
+ }
105
+ return true;
106
+ }
107
+
108
+ // Returns true if tag is non-empty and has only punctuation or parens
109
+ // symbols.
110
+ static bool IsPunctuationTagOrParens(const string &tag) {
111
+ if (tag.empty()) return false;
112
+ for (size_t i = 0; i < tag.length(); ++i) {
113
+ int c = tag[i];
114
+ if (c != '(' && c != ')' && c != ',' && c != ':' && c != '.' &&
115
+ c != '\'' && c != '`') {
116
+ return false;  // Found a character outside the accepted set.
117
+ }
118
+ }
119
+ return true;
120
+ }
121
+ };
122
+
123
+ void NormalizeDigits(string *form);
124
+
125
+ // Takes a text and converts it into a vector, where each element is a utf8
126
+ // character.
127
+ void GetUTF8Chars(const string &text, std::vector<string> *chars);
128
+
129
+ // Returns the number of bytes in the first UTF-8 char at the beginning
130
+ // of the string. It is assumed that the string is valid UTF-8. If
131
+ // the first byte of the string is null, return 0 (for backwards
132
+ // compatibility only; this use is discouraged).
133
+ int UTF8FirstLetterNumBytes(const char *in_buf);
134
+
135
+ // Returns the length (number of bytes) of the Unicode code point starting at
136
+ // src, based on inspecting just that one byte. Preconditions: src != NULL,
137
+ // *src can be read, and *src is not '\0', and src points to a well-formed UTF-8
138
+ // string.
139
+ int OneCharLen(const char *src);
140
+
141
+ } // namespace utils
142
+ } // namespace chrome_lang_id
143
+
144
+ #endif // UTILS_H_
@@ -0,0 +1,64 @@
1
+ /* Copyright 2016 Google Inc. All Rights Reserved.
2
+
3
+ Licensed under the Apache License, Version 2.0 (the "License");
4
+ you may not use this file except in compliance with the License.
5
+ You may obtain a copy of the License at
6
+
7
+ http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+ Unless required by applicable law or agreed to in writing, software
10
+ distributed under the License is distributed on an "AS IS" BASIS,
11
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ See the License for the specific language governing permissions and
13
+ limitations under the License.
14
+ ==============================================================================*/
15
+
16
+ #include "workspace.h"
17
+
18
+ #include "base.h"
19
+
20
+ namespace chrome_lang_id {
21
+
22
+ WorkspaceSet::WorkspaceSet() {}  // Starts with no workspaces.
23
+
24
+ WorkspaceSet::~WorkspaceSet() { Reset(WorkspaceRegistry()); }  // Reset deletes every held workspace.
25
+
26
+ WorkspaceRegistry::WorkspaceRegistry() {}  // Both maps start empty.
27
+
28
+ WorkspaceRegistry::~WorkspaceRegistry() {}  // Nothing to release beyond the members' own destructors.
29
+ // Builds one "<type name> :: <workspace name>" entry per registered workspace, each after a newline.
30
+ string WorkspaceRegistry::DebugString() const {
31
+ string str;
32
+ for (auto &it : workspace_names_) {
33
+ const string &type_name = workspace_types_.at(it.first);  // at() throws if the type has no name.
34
+ for (size_t index = 0; index < it.second.size(); ++index) {
35
+ const string &workspace_name = it.second[index];
36
+ str += "\n ";
37
+ str += type_name;
38
+ str += " :: ";
39
+ str += workspace_name;
40
+ }
41
+ }
42
+ return str;  // Empty when no workspace names are registered.
43
+ }
44
+
45
+ VectorIntWorkspace::~VectorIntWorkspace() {}  // elements_ cleans up after itself.
46
+
47
+ VectorIntWorkspace::VectorIntWorkspace(int size) : elements_(size) {}  // size zero-valued ints.
48
+
49
+ VectorIntWorkspace::VectorIntWorkspace(int size, int value)
50
+ : elements_(size, value) {}  // size copies of value.
51
+
52
+ VectorIntWorkspace::VectorIntWorkspace(const std::vector<int> &elements)
53
+ : elements_(elements) {}  // Copies the given vector.
54
+
55
+ string VectorIntWorkspace::TypeName() { return "Vector"; }  // Human-readable type name.
56
+
57
+ VectorVectorIntWorkspace::~VectorVectorIntWorkspace() {}  // elements_ cleans up after itself.
58
+
59
+ VectorVectorIntWorkspace::VectorVectorIntWorkspace(int size)
60
+ : elements_(size) {}  // size empty inner vectors.
61
+
62
+ string VectorVectorIntWorkspace::TypeName() { return "VectorVector"; }  // Human-readable type name.
63
+
64
+ } // namespace chrome_lang_id
@@ -0,0 +1,177 @@
1
+ /* Copyright 2016 Google Inc. All Rights Reserved.
2
+
3
+ Licensed under the Apache License, Version 2.0 (the "License");
4
+ you may not use this file except in compliance with the License.
5
+ You may obtain a copy of the License at
6
+
7
+ http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+ Unless required by applicable law or agreed to in writing, software
10
+ distributed under the License is distributed on an "AS IS" BASIS,
11
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ See the License for the specific language governing permissions and
13
+ limitations under the License.
14
+ ==============================================================================*/
15
+
16
+ // Notes on thread-safety: All of the classes here are thread-compatible. More
17
+ // specifically, the registry machinery is thread-safe, as long as each thread
18
+ // performs feature extraction on a different Sentence object.
19
+
20
+ #ifndef WORKSPACE_H_
21
+ #define WORKSPACE_H_
22
+
23
+ #include <stddef.h>
24
+ #include <string>
25
+ #include <typeindex>
26
+ #include <unordered_map>
27
+ #include <utility>
28
+ #include <vector>
29
+
30
+ #include "base.h"
31
+
32
+ namespace chrome_lang_id {
33
+
34
+ // A base class for shared workspaces. Derived classes implement a static member
35
+ // function TypeName() which returns a human readable string name for the class.
36
+ class Workspace {
37
+ public:
38
+ // Polymorphic destructor, so deleting through a Workspace pointer runs the derived destructor.
39
+ virtual ~Workspace() {}
40
+
41
+ protected:
42
+ // Create an empty workspace. Protected, so only derived classes can construct one.
43
+ Workspace() {}
44
+
45
+ private:
46
+ CLD3_DISALLOW_COPY_AND_ASSIGN(Workspace);  // Macro defined elsewhere; presumably forbids copying.
47
+ };
48
+
49
+ // A registry that keeps track of workspaces.
50
+ class WorkspaceRegistry {
51
+ public:
52
+ // Create an empty registry.
53
+ WorkspaceRegistry();
54
+ ~WorkspaceRegistry();
55
+ // Returns the registered workspace names, keyed by workspace type (read-only view).
56
+ const std::unordered_map<std::type_index, std::vector<std::string>>
57
+ &WorkspaceNames() const {
58
+ return workspace_names_;
59
+ }
60
+
61
+ // Returns a string describing the registered workspaces.
62
+ string DebugString() const;
63
+
64
+ private:
65
+ // Workspace type names, indexed as workspace_types_[typeid].
66
+ std::unordered_map<std::type_index, string> workspace_types_;
67
+
68
+ // Workspace names, indexed as workspace_names_[typeid][workspace].
69
+ std::unordered_map<std::type_index, std::vector<string>> workspace_names_;
70
+
71
+ CLD3_DISALLOW_COPY_AND_ASSIGN(WorkspaceRegistry);  // Macro defined elsewhere; presumably forbids copying.
72
+ };
73
+
74
+ // A typed collection of workspaces. The workspaces are indexed according to an
75
+ // external WorkspaceRegistry. If the WorkspaceSet is const, the contents are
76
+ // also immutable.
77
+ class WorkspaceSet {
78
+ public:
79
+ WorkspaceSet();
80
+ ~WorkspaceSet();
81
+ // Deletes every held workspace, then creates one null-filled slot vector per type in registry.
82
+ void Reset(const WorkspaceRegistry &registry) {
83
+ // Deallocate current workspaces.
84
+ for (auto &it : workspaces_) {
85
+ for (size_t index = 0; index < it.second.size(); ++index) {
86
+ delete it.second[index];  // Deleting a null slot is harmless.
87
+ }
88
+ }
89
+ workspaces_.clear();
90
+
91
+ // Allocate space for new workspaces.
92
+ for (auto &it : registry.WorkspaceNames()) {
93
+ workspaces_[it.first].resize(it.second.size());  // One slot per registered name of this type.
94
+ }
95
+ }
96
+
97
+ private:
98
+ // The set of workspaces, indexed as workspaces_[typeid][index]. NOTE(review): these raw pointers are deleted by Reset, yet copying is not disabled here; confirm a WorkspaceSet is never copied.
99
+ std::unordered_map<std::type_index, std::vector<Workspace *>> workspaces_;
100
+ };
101
+
102
+ // A workspace that wraps around a single int.
103
+ class SingletonIntWorkspace : public Workspace {
104
+ public:
105
+ // Leaves the int at 0, the value given by the member's default initializer.
106
+ SingletonIntWorkspace() {}
107
+
108
+ // Initializes the int with the given value.
109
+ explicit SingletonIntWorkspace(int value) : value_(value) {}
110
+
111
+ // Returns the name of this type of workspace.
112
+ static string TypeName() { return "SingletonInt"; }
113
+
114
+ // Returns the int value.
115
+ int get() const { return value_; }
116
+
117
+ // Sets the int value.
118
+ void set(int value) { value_ = value; }
119
+
120
+ private:
121
+ // The enclosed int; 0 unless a value is passed to the constructor or set().
122
+ int value_ = 0;
123
+ };
124
+
125
+ // A workspace that wraps around a vector of int.
126
+ class VectorIntWorkspace : public Workspace {
127
+ public:
128
+ // Creates a vector of the given size; every element starts at 0.
129
+ explicit VectorIntWorkspace(int size);
130
+
131
+ // Creates a vector initialized with a copy of the given vector.
132
+ explicit VectorIntWorkspace(const std::vector<int> &elements);
133
+
134
+ // Creates a vector of the given size, with each element initialized to the
135
+ // given value.
136
+ VectorIntWorkspace(int size, int value);
137
+
138
+ ~VectorIntWorkspace() override;
139
+
140
+ // Returns the name of this type of workspace.
141
+ static string TypeName();
142
+
143
+ // Returns the i'th element. No bounds check is performed.
144
+ int element(int i) const { return elements_[i]; }
145
+
146
+ // Sets the i'th element. No bounds check is performed.
147
+ void set_element(int i, int value) { elements_[i] = value; }
148
+
149
+ private:
150
+ // The enclosed vector.
151
+ std::vector<int> elements_;
152
+ };
153
+
154
+ // A workspace that wraps around a vector of vector of int.
155
+ class VectorVectorIntWorkspace : public Workspace {
156
+ public:
157
+ // Creates the given number of empty vectors.
158
+ explicit VectorVectorIntWorkspace(int size);
159
+ ~VectorVectorIntWorkspace() override;
160
+
161
+ // Returns the name of this type of workspace.
162
+ static string TypeName();
163
+
164
+ // Returns the i'th vector of elements. No bounds check is performed.
165
+ const std::vector<int> &elements(int i) const { return elements_[i]; }
166
+
167
+ // Mutable access to the i'th vector of elements. No bounds check is performed.
168
+ std::vector<int> *mutable_elements(int i) { return &(elements_[i]); }
169
+
170
+ private:
171
+ // The enclosed vector of vector of elements.
172
+ std::vector<std::vector<int>> elements_;
173
+ };
174
+
175
+ } // namespace chrome_lang_id
176
+
177
+ #endif // WORKSPACE_H_
@@ -0,0 +1,99 @@
1
+ # File including an implementation of the CLD3 module. Some documentation is
2
+ # extracted from ext/cld3/ext/src/nnet_language_identifier.h.
3
+ #
4
+ # Copyright 2017 Akihiko Odaki <akihiko.odaki.4i@stu.hosei.ac.jp>
5
+ # All Rights Reserved.
6
+ #
7
+ # Licensed under the Apache License, Version 2.0 (the "License");
8
+ # you may not use this file except in compliance with the License.
9
+ # You may obtain a copy of the License at
10
+ #
11
+ # http://www.apache.org/licenses/LICENSE-2.0
12
+ #
13
+ # Unless required by applicable law or agreed to in writing, software
14
+ # distributed under the License is distributed on an "AS IS" BASIS,
15
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16
+ # See the License for the specific language governing permissions and
17
+ # limitations under the License.
18
+ # ==============================================================================
19
+
20
+ require "ffi"
21
+
22
+ # Module providing an interface for Compact Language Detector v3 (CLD3)
23
+ module CLD3
24
+ # Class for detecting the language of a document.
25
+ class NNetLanguageIdentifier
26
+ # Min number of bytes needed to make a prediction if the constructor is
27
+ # called without the corresponding parameter.
28
+ MIN_NUM_BYTES_TO_CONSIDER = 140
29
+
30
+ # Max number of bytes needed to make a prediction if the constructor is
31
+ # called without the corresponding parameter.
32
+ MAX_NUM_BYTES_TO_CONSIDER = 700
33
+
34
+ # Max number of input bytes to process.
35
+ MAX_NUM_INPUT_BYTES_TO_CONSIDER = 10000
36
+
37
+ # Predictions with probability greater than or equal to this threshold are
38
+ # marked as reliable. This threshold was optimized on a set of text segments
39
+ # extracted from wikipedia, and results in an overall precision, recall,
40
+ # and f1 equal to 0.9760, 0.9624, and 0.9692, respectively.
41
+ RELIABILITY_THRESHOLD = 0.7
42
+
43
+ # Reliability threshold for the languages hr and bs.
44
+ RELIABILITY_HR_BS_THRESHOLD = 0.5
45
+
46
+ # Information about a predicted language. The leading string also names the class Struct::Result.
47
+ Result = Struct.new("Result", :language, :probability, :reliable?, :proportion)
48
+ # Creates the native identifier and wraps it so that Pointer.release frees it.
49
+ def initialize(minNumBytes = MIN_NUM_BYTES_TO_CONSIDER, maxNumBytes = MAX_NUM_BYTES_TO_CONSIDER)
50
+ @cc = Pointer.new(CLD3::Unstable.new_NNetLanguageIdentifier(minNumBytes, maxNumBytes))
51
+ end
52
+
53
+ # Finds the most likely language for the given text, along with additional
54
+ # information (e.g., probability). The prediction is based on the first N
55
+ # bytes where N is the minimum between the number of interchange valid UTF8
56
+ # bytes and max_num_bytes_. If N is less than min_num_bytes_ long, then the
57
+ # language of the returned Result is nil.
58
+ def find_language(text)
59
+ text_utf8 = text.encode(Encoding::UTF_8)
60
+ pointer = FFI::MemoryPointer.new(:char, text_utf8.bytesize)
61
+ pointer.put_bytes(0, text_utf8)  # Copy the UTF-8 bytes into native memory.
62
+
63
+ cc_result = CLD3::Unstable.NNetLanguageIdentifier_find_language(@cc, pointer, text_utf8.bytesize)
64
+ language = cc_result[:language_data].read_bytes(cc_result[:language_size])
65
+
66
+ Result.new(
67
+ language == "und" ? nil : language,  # "und" is reported as nil.
68
+ cc_result[:probability],
69
+ cc_result[:reliable?],
70
+ cc_result[:proportion])
71
+ end
72
+
73
+ private  # NOTE(review): `private` does not hide constants, so Pointer stays reachable from outside.
74
+
75
+ class Pointer < FFI::AutoPointer
76
+ def self.release(pointer)  # Frees the native identifier held by the pointer.
77
+ CLD3::Unstable.delete_NNetLanguageIdentifier(pointer)
78
+ end
79
+ end
80
+ end
81
+
82
+ # Do NOT use this module from outside. It holds the raw FFI bindings to the native library.
83
+ module Unstable
84
+ extend FFI::Library
85
+ # Loads the platform-specific "cld3" library from ../ext/cld3 relative to this file.
86
+ ffi_lib File.join(File.expand_path(File.dirname(__FILE__)), "..", "ext", "cld3", FFI.map_library_name("cld3"))
87
+ # Struct returned by value from NNetLanguageIdentifier_find_language; presumably mirrors the native struct - TODO confirm field order.
88
+ class NNetLanguageIdentifierResult < FFI::Struct
89
+ layout :language_data, :pointer, :language_size, :size_t, :probability, :float, :proportion, :float, :reliable?, :bool
90
+ end
91
+
92
+ attach_function :delete_NNetLanguageIdentifier, [ :pointer ], :void
93
+
94
+ attach_function :new_NNetLanguageIdentifier, [ :int, :int ], :pointer
95
+
96
+ attach_function :NNetLanguageIdentifier_find_language,
97
+ [ :pointer, :buffer_in, :size_t ], NNetLanguageIdentifierResult.by_value
98
+ end
99
+ end