cld3 3.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (72) hide show
  1. checksums.yaml +7 -0
  2. data/Gemfile +18 -0
  3. data/LICENSE +204 -0
  4. data/LICENSE_CLD3 +203 -0
  5. data/README.md +22 -0
  6. data/cld3.gemspec +35 -0
  7. data/ext/cld3/base.cc +36 -0
  8. data/ext/cld3/base.h +106 -0
  9. data/ext/cld3/casts.h +98 -0
  10. data/ext/cld3/embedding_feature_extractor.cc +51 -0
  11. data/ext/cld3/embedding_feature_extractor.h +182 -0
  12. data/ext/cld3/embedding_network.cc +196 -0
  13. data/ext/cld3/embedding_network.h +186 -0
  14. data/ext/cld3/embedding_network_params.h +285 -0
  15. data/ext/cld3/extconf.rb +49 -0
  16. data/ext/cld3/feature_extractor.cc +137 -0
  17. data/ext/cld3/feature_extractor.h +633 -0
  18. data/ext/cld3/feature_extractor.proto +50 -0
  19. data/ext/cld3/feature_types.cc +72 -0
  20. data/ext/cld3/feature_types.h +158 -0
  21. data/ext/cld3/fixunicodevalue.cc +55 -0
  22. data/ext/cld3/fixunicodevalue.h +69 -0
  23. data/ext/cld3/float16.h +58 -0
  24. data/ext/cld3/fml_parser.cc +308 -0
  25. data/ext/cld3/fml_parser.h +123 -0
  26. data/ext/cld3/generated_entities.cc +296 -0
  27. data/ext/cld3/generated_ulscript.cc +678 -0
  28. data/ext/cld3/generated_ulscript.h +142 -0
  29. data/ext/cld3/getonescriptspan.cc +1109 -0
  30. data/ext/cld3/getonescriptspan.h +124 -0
  31. data/ext/cld3/integral_types.h +37 -0
  32. data/ext/cld3/lang_id_nn_params.cc +57449 -0
  33. data/ext/cld3/lang_id_nn_params.h +178 -0
  34. data/ext/cld3/language_identifier_features.cc +165 -0
  35. data/ext/cld3/language_identifier_features.h +116 -0
  36. data/ext/cld3/nnet_language_identifier.cc +380 -0
  37. data/ext/cld3/nnet_language_identifier.h +175 -0
  38. data/ext/cld3/nnet_language_identifier_c.cc +72 -0
  39. data/ext/cld3/offsetmap.cc +478 -0
  40. data/ext/cld3/offsetmap.h +168 -0
  41. data/ext/cld3/port.h +143 -0
  42. data/ext/cld3/registry.cc +28 -0
  43. data/ext/cld3/registry.h +242 -0
  44. data/ext/cld3/relevant_script_feature.cc +89 -0
  45. data/ext/cld3/relevant_script_feature.h +49 -0
  46. data/ext/cld3/script_detector.h +156 -0
  47. data/ext/cld3/sentence.proto +77 -0
  48. data/ext/cld3/sentence_features.cc +29 -0
  49. data/ext/cld3/sentence_features.h +35 -0
  50. data/ext/cld3/simple_adder.h +72 -0
  51. data/ext/cld3/stringpiece.h +81 -0
  52. data/ext/cld3/task_context.cc +161 -0
  53. data/ext/cld3/task_context.h +81 -0
  54. data/ext/cld3/task_context_params.cc +74 -0
  55. data/ext/cld3/task_context_params.h +54 -0
  56. data/ext/cld3/task_spec.proto +98 -0
  57. data/ext/cld3/text_processing.cc +245 -0
  58. data/ext/cld3/text_processing.h +30 -0
  59. data/ext/cld3/unicodetext.cc +96 -0
  60. data/ext/cld3/unicodetext.h +144 -0
  61. data/ext/cld3/utf8acceptinterchange.h +486 -0
  62. data/ext/cld3/utf8prop_lettermarkscriptnum.h +1631 -0
  63. data/ext/cld3/utf8repl_lettermarklower.h +758 -0
  64. data/ext/cld3/utf8scannot_lettermarkspecial.h +1455 -0
  65. data/ext/cld3/utf8statetable.cc +1344 -0
  66. data/ext/cld3/utf8statetable.h +285 -0
  67. data/ext/cld3/utils.cc +241 -0
  68. data/ext/cld3/utils.h +144 -0
  69. data/ext/cld3/workspace.cc +64 -0
  70. data/ext/cld3/workspace.h +177 -0
  71. data/lib/cld3.rb +99 -0
  72. metadata +158 -0
@@ -0,0 +1,144 @@
1
+ /* Copyright 2016 Google Inc. All Rights Reserved.
2
+
3
+ Licensed under the Apache License, Version 2.0 (the "License");
4
+ you may not use this file except in compliance with the License.
5
+ You may obtain a copy of the License at
6
+
7
+ http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+ Unless required by applicable law or agreed to in writing, software
10
+ distributed under the License is distributed on an "AS IS" BASIS,
11
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ See the License for the specific language governing permissions and
13
+ limitations under the License.
14
+ ==============================================================================*/
15
+
16
+ #ifndef UTILS_H_
17
+ #define UTILS_H_
18
+
19
+ #include <stddef.h>
20
+ #include <functional>
21
+ #include <initializer_list>
22
+ #include <string>
23
+ #include <vector>
24
+
25
+ #include "base.h"
26
+ #include "script_span/stringpiece.h"
27
+
28
+ namespace chrome_lang_id {
29
+ namespace utils {
30
+
31
+ bool ParseInt32(const char *c_str, int *value);
32
+ bool ParseDouble(const char *c_str, double *value);
33
+
34
// Parses |str| with |func| and returns the value that |func| produced.
//
// |func| is called with the NUL-terminated contents of |str| and a pointer to
// the output slot. Its boolean result is ignored here, so callers cannot tell
// success from failure through this helper. The output slot is
// value-initialized before the call, so a parser that reports failure without
// writing anything yields T() (0 for arithmetic types) instead of an
// indeterminate value.
template <typename T>
T ParseUsing(const string &str, std::function<bool(const char *, T *)> func) {
  T value{};
  func(str.c_str(), &value);
  return value;
}
40
+
41
// Overload with a fallback: an empty |str| yields |defval| without calling
// |func|; any other input is handed to the two-argument ParseUsing overload.
template <typename T>
T ParseUsing(const string &str, T defval,
             std::function<bool(const char *, T *)> func) {
  if (str.empty()) {
    return defval;
  }
  return ParseUsing<T>(str, func);
}
46
+
47
+ string CEscape(const string &src);
48
+
49
+ std::vector<string> Split(const string &text, char delim);
50
+
51
+ int RemoveLeadingWhitespace(StringPiece *text);
52
+
53
+ int RemoveTrailingWhitespace(StringPiece *text);
54
+
55
+ int RemoveWhitespaceContext(StringPiece *text);
56
+
57
+ uint32 Hash32(const char *data, size_t n, uint32 seed);
58
+
59
+ uint32 Hash32WithDefaultSeed(const string &input);
60
+
61
// Calls delete on every element of |container| and then empties it. Works
// with any container of pointers that provides begin(), end() and clear()
// (vector, set, ...). Passing a null |container| does nothing.
template <typename T>
void STLDeleteElements(T *container) {
  if (container == nullptr) return;
  for (auto cursor = container->begin(); cursor != container->end();) {
    // Step past the element before destroying it, so the iterator has
    // already moved on by the time the pointee goes away.
    const auto doomed = cursor++;
    delete *doomed;
  }
  container->clear();
}
76
+
77
// Static helpers for recognizing punctuation code points and punctuation
// part-of-speech tags.
class PunctuationUtil {
 public:
  // An inclusive [first, last] range of Unicode code points treated as
  // punctuation according to CoNLL.
  struct CharacterRange {
    int first;
    int last;
  };

  // Table of punctuation ranges; only declared here. IsPunctuation() scans it
  // until it meets an entry whose |first| is not positive, and gives up as
  // soon as the queried code point lies below an entry's |first|.
  static CharacterRange kPunctuation[];

  // Returns true if the Unicode code point |u| falls inside one of the
  // kPunctuation ranges.
  static bool IsPunctuation(int u) {
    for (const CharacterRange *range = kPunctuation; range->first > 0;
         ++range) {
      if (u < range->first) return false;
      if (u <= range->last) return true;
    }
    return false;
  }

  // Returns true if every character of |tag| is one of , : . ' `
  // An empty tag therefore counts as a punctuation tag.
  static bool IsPunctuationTag(const string &tag) {
    return tag.find_first_not_of(",:.'`") == string::npos;
  }

  // Returns true if |tag| is non-empty and consists solely of punctuation
  // characters (, : . ' `) or parentheses.
  static bool IsPunctuationTagOrParens(const string &tag) {
    return !tag.empty() && tag.find_first_not_of("(),:.'`") == string::npos;
  }
};
122
+
123
+ void NormalizeDigits(string *form);
124
+
125
+ // Takes a text and convert it into a vector, where each element is a utf8
126
+ // character.
127
+ void GetUTF8Chars(const string &text, std::vector<string> *chars);
128
+
129
+ // Returns the number of bytes in the first UTF-8 char at the beginning
130
+ // of the string. It is assumed that the string is valid UTF-8. If
131
+ // the first byte of the string is null, return 0 (for backwards
132
+ // compatibility only; this use is discouraged).
133
+ int UTF8FirstLetterNumBytes(const char *in_buf);
134
+
135
+ // Returns the length (number of bytes) of the Unicode code point starting at
136
+ // src, based on inspecting just that one byte. Preconditions: src != NULL,
137
+ // *src can be read, and *src is not '\0', and src points to a well-formed UTF-8
138
+ // string.
139
+ int OneCharLen(const char *src);
140
+
141
+ } // namespace utils
142
+ } // namespace chrome_lang_id
143
+
144
+ #endif // UTILS_H_
@@ -0,0 +1,64 @@
1
+ /* Copyright 2016 Google Inc. All Rights Reserved.
2
+
3
+ Licensed under the Apache License, Version 2.0 (the "License");
4
+ you may not use this file except in compliance with the License.
5
+ You may obtain a copy of the License at
6
+
7
+ http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+ Unless required by applicable law or agreed to in writing, software
10
+ distributed under the License is distributed on an "AS IS" BASIS,
11
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ See the License for the specific language governing permissions and
13
+ limitations under the License.
14
+ ==============================================================================*/
15
+
16
+ #include "workspace.h"
17
+
18
+ #include "base.h"
19
+
20
+ namespace chrome_lang_id {
21
+
22
+ WorkspaceSet::WorkspaceSet() {}
23
+
24
+ WorkspaceSet::~WorkspaceSet() { Reset(WorkspaceRegistry()); }
25
+
26
+ WorkspaceRegistry::WorkspaceRegistry() {}
27
+
28
+ WorkspaceRegistry::~WorkspaceRegistry() {}
29
+
30
+ string WorkspaceRegistry::DebugString() const {
31
+ string str;
32
+ for (auto &it : workspace_names_) {
33
+ const string &type_name = workspace_types_.at(it.first);
34
+ for (size_t index = 0; index < it.second.size(); ++index) {
35
+ const string &workspace_name = it.second[index];
36
+ str += "\n ";
37
+ str += type_name;
38
+ str += " :: ";
39
+ str += workspace_name;
40
+ }
41
+ }
42
+ return str;
43
+ }
44
+
45
+ VectorIntWorkspace::~VectorIntWorkspace() {}
46
+
47
+ VectorIntWorkspace::VectorIntWorkspace(int size) : elements_(size) {}
48
+
49
+ VectorIntWorkspace::VectorIntWorkspace(int size, int value)
50
+ : elements_(size, value) {}
51
+
52
+ VectorIntWorkspace::VectorIntWorkspace(const std::vector<int> &elements)
53
+ : elements_(elements) {}
54
+
55
+ string VectorIntWorkspace::TypeName() { return "Vector"; }
56
+
57
+ VectorVectorIntWorkspace::~VectorVectorIntWorkspace() {}
58
+
59
+ VectorVectorIntWorkspace::VectorVectorIntWorkspace(int size)
60
+ : elements_(size) {}
61
+
62
+ string VectorVectorIntWorkspace::TypeName() { return "VectorVector"; }
63
+
64
+ } // namespace chrome_lang_id
@@ -0,0 +1,177 @@
1
+ /* Copyright 2016 Google Inc. All Rights Reserved.
2
+
3
+ Licensed under the Apache License, Version 2.0 (the "License");
4
+ you may not use this file except in compliance with the License.
5
+ You may obtain a copy of the License at
6
+
7
+ http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+ Unless required by applicable law or agreed to in writing, software
10
+ distributed under the License is distributed on an "AS IS" BASIS,
11
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ See the License for the specific language governing permissions and
13
+ limitations under the License.
14
+ ==============================================================================*/
15
+
16
+ // Notes on thread-safety: All of the classes here are thread-compatible. More
17
+ // specifically, the registry machinery is thread-safe, as long as each thread
18
+ // performs feature extraction on a different Sentence object.
19
+
20
+ #ifndef WORKSPACE_H_
21
+ #define WORKSPACE_H_
22
+
23
+ #include <stddef.h>
24
+ #include <string>
25
+ #include <typeindex>
26
+ #include <unordered_map>
27
+ #include <utility>
28
+ #include <vector>
29
+
30
+ #include "base.h"
31
+
32
+ namespace chrome_lang_id {
33
+
34
+ // A base class for shared workspaces. Derived classes implement a static member
35
+ // function TypeName() which returns a human readable string name for the class.
36
+ class Workspace {
37
+ public:
38
+ // Polymorphic destructor.
39
+ virtual ~Workspace() {}
40
+
41
+ protected:
42
+ // Create an empty workspace.
43
+ Workspace() {}
44
+
45
+ private:
46
+ CLD3_DISALLOW_COPY_AND_ASSIGN(Workspace);
47
+ };
48
+
49
+ // A registry that keeps track of workspaces.
50
+ class WorkspaceRegistry {
51
+ public:
52
+ // Create an empty registry.
53
+ WorkspaceRegistry();
54
+ ~WorkspaceRegistry();
55
+
56
+ const std::unordered_map<std::type_index, std::vector<std::string>>
57
+ &WorkspaceNames() const {
58
+ return workspace_names_;
59
+ }
60
+
61
+ // Returns a string describing the registered workspaces.
62
+ string DebugString() const;
63
+
64
+ private:
65
+ // Workspace type names, indexed as workspace_types_[typeid].
66
+ std::unordered_map<std::type_index, string> workspace_types_;
67
+
68
+ // Workspace names, indexed as workspace_names_[typeid][workspace].
69
+ std::unordered_map<std::type_index, std::vector<string>> workspace_names_;
70
+
71
+ CLD3_DISALLOW_COPY_AND_ASSIGN(WorkspaceRegistry);
72
+ };
73
+
74
+ // A typed collected of workspaces. The workspaces are indexed according to an
75
+ // external WorkspaceRegistry. If the WorkspaceSet is const, the contents are
76
+ // also immutable.
77
+ class WorkspaceSet {
78
+ public:
79
+ WorkspaceSet();
80
+ ~WorkspaceSet();
81
+
82
+ void Reset(const WorkspaceRegistry &registry) {
83
+ // Deallocate current workspaces.
84
+ for (auto &it : workspaces_) {
85
+ for (size_t index = 0; index < it.second.size(); ++index) {
86
+ delete it.second[index];
87
+ }
88
+ }
89
+ workspaces_.clear();
90
+
91
+ // Allocate space for new workspaces.
92
+ for (auto &it : registry.WorkspaceNames()) {
93
+ workspaces_[it.first].resize(it.second.size());
94
+ }
95
+ }
96
+
97
+ private:
98
+ // The set of workspaces, indexed as workspaces_[typeid][index].
99
+ std::unordered_map<std::type_index, std::vector<Workspace *>> workspaces_;
100
+ };
101
+
102
+ // A workspace that wraps around a single int.
103
+ class SingletonIntWorkspace : public Workspace {
104
+ public:
105
+ // Default-initializes the int value.
106
+ SingletonIntWorkspace() {}
107
+
108
+ // Initializes the int with the given value.
109
+ explicit SingletonIntWorkspace(int value) : value_(value) {}
110
+
111
+ // Returns the name of this type of workspace.
112
+ static string TypeName() { return "SingletonInt"; }
113
+
114
+ // Returns the int value.
115
+ int get() const { return value_; }
116
+
117
+ // Sets the int value.
118
+ void set(int value) { value_ = value; }
119
+
120
+ private:
121
+ // The enclosed int.
122
+ int value_ = 0;
123
+ };
124
+
125
+ // A workspace that wraps around a vector of int.
126
+ class VectorIntWorkspace : public Workspace {
127
+ public:
128
+ // Creates a vector of the given size.
129
+ explicit VectorIntWorkspace(int size);
130
+
131
+ // Creates a vector initialized with the given array.
132
+ explicit VectorIntWorkspace(const std::vector<int> &elements);
133
+
134
+ // Creates a vector of the given size, with each element initialized to the
135
+ // given value.
136
+ VectorIntWorkspace(int size, int value);
137
+
138
+ ~VectorIntWorkspace() override;
139
+
140
+ // Returns the name of this type of workspace.
141
+ static string TypeName();
142
+
143
+ // Returns the i'th element.
144
+ int element(int i) const { return elements_[i]; }
145
+
146
+ // Sets the i'th element.
147
+ void set_element(int i, int value) { elements_[i] = value; }
148
+
149
+ private:
150
+ // The enclosed vector.
151
+ std::vector<int> elements_;
152
+ };
153
+
154
+ // A workspace that wraps around a vector of vector of int.
155
+ class VectorVectorIntWorkspace : public Workspace {
156
+ public:
157
+ // Creates a vector of empty vectors of the given size.
158
+ explicit VectorVectorIntWorkspace(int size);
159
+ ~VectorVectorIntWorkspace() override;
160
+
161
+ // Returns the name of this type of workspace.
162
+ static string TypeName();
163
+
164
+ // Returns the i'th vector of elements.
165
+ const std::vector<int> &elements(int i) const { return elements_[i]; }
166
+
167
+ // Mutable access to the i'th vector of elements.
168
+ std::vector<int> *mutable_elements(int i) { return &(elements_[i]); }
169
+
170
+ private:
171
+ // The enclosed vector of vector of elements.
172
+ std::vector<std::vector<int>> elements_;
173
+ };
174
+
175
+ } // namespace chrome_lang_id
176
+
177
+ #endif // WORKSPACE_H_
@@ -0,0 +1,99 @@
1
+ # File containing an implementation of the CLD3 module. Some documentation is
2
+ # extracted from ext/cld3/ext/src/nnet_language_identifier.h.
3
+ #
4
+ # Copyright 2017 Akihiko Odaki <akihiko.odaki.4i@stu.hosei.ac.jp>
5
+ # All Rights Reserved.
6
+ #
7
+ # Licensed under the Apache License, Version 2.0 (the "License");
8
+ # you may not use this file except in compliance with the License.
9
+ # You may obtain a copy of the License at
10
+ #
11
+ # http://www.apache.org/licenses/LICENSE-2.0
12
+ #
13
+ # Unless required by applicable law or agreed to in writing, software
14
+ # distributed under the License is distributed on an "AS IS" BASIS,
15
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16
+ # See the License for the specific language governing permissions and
17
+ # limitations under the License.
18
+ # ==============================================================================
19
+
20
+ require "ffi"
21
+
22
# Module providing an interface for Compact Language Detector v3 (CLD3)
module CLD3
  # Class for detecting the language of a document.
  class NNetLanguageIdentifier
    # Min number of bytes needed to make a prediction if the constructor is
    # called without the corresponding parameter.
    MIN_NUM_BYTES_TO_CONSIDER = 140

    # Max number of bytes needed to make a prediction if the constructor is
    # called without the corresponding parameter.
    MAX_NUM_BYTES_TO_CONSIDER = 700

    # Max number of input bytes to process.
    MAX_NUM_INPUT_BYTES_TO_CONSIDER = 10000

    # Predictions with probability greater than or equal to this threshold are
    # marked as reliable. This threshold was optimized on a set of text segments
    # extracted from wikipedia, and results in an overall precision, recall,
    # and f1 equal to 0.9760, 0.9624, and 0.9692, respectively.
    RELIABILITY_THRESHOLD = 0.7

    # Reliability threshold for the languages hr and bs.
    RELIABILITY_HR_BS_THRESHOLD = 0.5

    # Information about a predicted language. +language+ is nil when the
    # native library reports the language as "und".
    Result = Struct.new("Result", :language, :probability, :reliable?, :proportion)

    # Creates a native identifier configured with the given byte limits. The
    # native object is released when this wrapper is garbage-collected.
    def initialize(minNumBytes = MIN_NUM_BYTES_TO_CONSIDER, maxNumBytes = MAX_NUM_BYTES_TO_CONSIDER)
      @cc = Pointer.new(CLD3::Unstable.new_NNetLanguageIdentifier(minNumBytes, maxNumBytes))
    end

    # Finds the most likely language for the given text, along with additional
    # information (e.g., probability). The prediction is based on the first N
    # bytes where N is the minimum between the number of interchange valid UTF8
    # bytes and max_num_bytes_.
    #
    # The text is transcoded to UTF-8 before being handed to the native
    # library. This method always returns a Result; when the native library
    # reports the language as "und", the Result's +language+ member is nil.
    # NOTE(review): the native header says "und" is what is produced when N is
    # less than min_num_bytes_ -- confirm against nnet_language_identifier.h.
    def find_language(text)
      text_utf8 = text.encode(Encoding::UTF_8)
      pointer = FFI::MemoryPointer.new(:char, text_utf8.bytesize)

      begin
        pointer.put_bytes(0, text_utf8)

        cc_result = CLD3::Unstable.NNetLanguageIdentifier_find_language(@cc, pointer, text_utf8.bytesize)
        language = cc_result[:language_data].read_bytes(cc_result[:language_size])

        Result.new(
          language == "und" ? nil : language,
          cc_result[:probability],
          cc_result[:reliable?],
          cc_result[:proportion])
      ensure
        # Release the native input buffer right away instead of waiting for
        # the garbage collector to finalize it.
        pointer.free
      end
    end

    private

    # Wraps the native identifier handle; FFI invokes +release+ with the raw
    # pointer once the wrapper is no longer referenced.
    class Pointer < FFI::AutoPointer
      def self.release(pointer)
        CLD3::Unstable.delete_NNetLanguageIdentifier(pointer)
      end
    end
  end

  # Do NOT use this module from outside.
  module Unstable
    extend FFI::Library

    ffi_lib File.join(File.expand_path(File.dirname(__FILE__)), "..", "ext", "cld3", FFI.map_library_name("cld3"))

    # Mirror of the struct returned by value from the native find_language.
    class NNetLanguageIdentifierResult < FFI::Struct
      layout :language_data, :pointer,
             :language_size, :size_t,
             :probability, :float,
             :proportion, :float,
             :reliable?, :bool
    end

    attach_function :delete_NNetLanguageIdentifier, [ :pointer ], :void

    attach_function :new_NNetLanguageIdentifier, [ :int, :int ], :pointer

    attach_function :NNetLanguageIdentifier_find_language,
      [ :pointer, :buffer_in, :size_t ], NNetLanguageIdentifierResult.by_value
  end
end