cld3 3.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (72) hide show
  1. checksums.yaml +7 -0
  2. data/Gemfile +18 -0
  3. data/LICENSE +204 -0
  4. data/LICENSE_CLD3 +203 -0
  5. data/README.md +22 -0
  6. data/cld3.gemspec +35 -0
  7. data/ext/cld3/base.cc +36 -0
  8. data/ext/cld3/base.h +106 -0
  9. data/ext/cld3/casts.h +98 -0
  10. data/ext/cld3/embedding_feature_extractor.cc +51 -0
  11. data/ext/cld3/embedding_feature_extractor.h +182 -0
  12. data/ext/cld3/embedding_network.cc +196 -0
  13. data/ext/cld3/embedding_network.h +186 -0
  14. data/ext/cld3/embedding_network_params.h +285 -0
  15. data/ext/cld3/extconf.rb +49 -0
  16. data/ext/cld3/feature_extractor.cc +137 -0
  17. data/ext/cld3/feature_extractor.h +633 -0
  18. data/ext/cld3/feature_extractor.proto +50 -0
  19. data/ext/cld3/feature_types.cc +72 -0
  20. data/ext/cld3/feature_types.h +158 -0
  21. data/ext/cld3/fixunicodevalue.cc +55 -0
  22. data/ext/cld3/fixunicodevalue.h +69 -0
  23. data/ext/cld3/float16.h +58 -0
  24. data/ext/cld3/fml_parser.cc +308 -0
  25. data/ext/cld3/fml_parser.h +123 -0
  26. data/ext/cld3/generated_entities.cc +296 -0
  27. data/ext/cld3/generated_ulscript.cc +678 -0
  28. data/ext/cld3/generated_ulscript.h +142 -0
  29. data/ext/cld3/getonescriptspan.cc +1109 -0
  30. data/ext/cld3/getonescriptspan.h +124 -0
  31. data/ext/cld3/integral_types.h +37 -0
  32. data/ext/cld3/lang_id_nn_params.cc +57449 -0
  33. data/ext/cld3/lang_id_nn_params.h +178 -0
  34. data/ext/cld3/language_identifier_features.cc +165 -0
  35. data/ext/cld3/language_identifier_features.h +116 -0
  36. data/ext/cld3/nnet_language_identifier.cc +380 -0
  37. data/ext/cld3/nnet_language_identifier.h +175 -0
  38. data/ext/cld3/nnet_language_identifier_c.cc +72 -0
  39. data/ext/cld3/offsetmap.cc +478 -0
  40. data/ext/cld3/offsetmap.h +168 -0
  41. data/ext/cld3/port.h +143 -0
  42. data/ext/cld3/registry.cc +28 -0
  43. data/ext/cld3/registry.h +242 -0
  44. data/ext/cld3/relevant_script_feature.cc +89 -0
  45. data/ext/cld3/relevant_script_feature.h +49 -0
  46. data/ext/cld3/script_detector.h +156 -0
  47. data/ext/cld3/sentence.proto +77 -0
  48. data/ext/cld3/sentence_features.cc +29 -0
  49. data/ext/cld3/sentence_features.h +35 -0
  50. data/ext/cld3/simple_adder.h +72 -0
  51. data/ext/cld3/stringpiece.h +81 -0
  52. data/ext/cld3/task_context.cc +161 -0
  53. data/ext/cld3/task_context.h +81 -0
  54. data/ext/cld3/task_context_params.cc +74 -0
  55. data/ext/cld3/task_context_params.h +54 -0
  56. data/ext/cld3/task_spec.proto +98 -0
  57. data/ext/cld3/text_processing.cc +245 -0
  58. data/ext/cld3/text_processing.h +30 -0
  59. data/ext/cld3/unicodetext.cc +96 -0
  60. data/ext/cld3/unicodetext.h +144 -0
  61. data/ext/cld3/utf8acceptinterchange.h +486 -0
  62. data/ext/cld3/utf8prop_lettermarkscriptnum.h +1631 -0
  63. data/ext/cld3/utf8repl_lettermarklower.h +758 -0
  64. data/ext/cld3/utf8scannot_lettermarkspecial.h +1455 -0
  65. data/ext/cld3/utf8statetable.cc +1344 -0
  66. data/ext/cld3/utf8statetable.h +285 -0
  67. data/ext/cld3/utils.cc +241 -0
  68. data/ext/cld3/utils.h +144 -0
  69. data/ext/cld3/workspace.cc +64 -0
  70. data/ext/cld3/workspace.h +177 -0
  71. data/lib/cld3.rb +99 -0
  72. metadata +158 -0
@@ -0,0 +1,81 @@
1
+ // Copyright 2013 Google Inc. All Rights Reserved.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+
15
+ //
16
+ // A StringPiece points to part or all of a string, double-quoted string
17
+ // literal, or other string-like object. A StringPiece does *not* own the
18
+ // string to which it points. A StringPiece is not null-terminated. [subset]
19
+ //
20
+
21
+ #ifndef SCRIPT_SPAN_STRINGPIECE_H_
22
+ #define SCRIPT_SPAN_STRINGPIECE_H_
23
+
24
+ #include <string.h>
25
+ #include <string>
26
+
27
+ namespace chrome_lang_id {
28
+
29
// Signed type used for every StringPiece length and offset.
using stringpiece_ssize_type = int;

// Non-owning view over a run of characters: a start pointer plus a length.
// The viewed bytes are neither copied nor required to be NUL-terminated.
class StringPiece {
 public:
  // Empty view that points at nothing.
  StringPiece() : ptr_(nullptr), length_(0) {}

  // Implicit on purpose, so a C string can be passed wherever a StringPiece
  // is expected. A null pointer produces an empty view.
  StringPiece(const char* str)  // NOLINT(runtime/explicit)
      : ptr_(str),
        length_(str == nullptr
                    ? 0
                    : static_cast<stringpiece_ssize_type>(strlen(str))) {}

  // Implicit on purpose, so a std::string can be passed wherever a
  // StringPiece is expected. The view covers the string's full size().
  StringPiece(const std::string& str)  // NOLINT(runtime/explicit)
      : ptr_(str.data()),
        length_(static_cast<stringpiece_ssize_type>(str.size())) {}

  // View over exactly `len` characters starting at `offset`.
  StringPiece(const char* offset, stringpiece_ssize_type len)
      : ptr_(offset), length_(len) {}

  // Drops the first `n` characters from the view. No bounds check is made.
  void remove_prefix(stringpiece_ssize_type n) {
    length_ -= n;
    ptr_ += n;
  }

  // Drops the last `n` characters from the view. No bounds check is made.
  void remove_suffix(stringpiece_ssize_type n) { length_ -= n; }

  // The buffer behind data() may hold embedded NULs and may lack a trailing
  // NUL, so handing it to a routine that expects a C string is usually wrong.
  const char* data() const { return ptr_; }

  // Number of characters in the view; size() and length() are synonyms.
  stringpiece_ssize_type size() const { return length_; }
  stringpiece_ssize_type length() const { return length_; }

  // True when the view covers zero characters.
  bool empty() const { return 0 == length_; }

 private:
  const char* ptr_;                // First viewed character (may be null).
  stringpiece_ssize_type length_;  // Count of viewed characters.
};
76
+
77
+ class StringPiece;
78
+
79
+ } // namespace chrome_lang_id
80
+
81
+ #endif // SCRIPT_SPAN_STRINGPIECE_H_
@@ -0,0 +1,161 @@
1
+ /* Copyright 2016 Google Inc. All Rights Reserved.
2
+
3
+ Licensed under the Apache License, Version 2.0 (the "License");
4
+ you may not use this file except in compliance with the License.
5
+ You may obtain a copy of the License at
6
+
7
+ http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+ Unless required by applicable law or agreed to in writing, software
10
+ distributed under the License is distributed on an "AS IS" BASIS,
11
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ See the License for the specific language governing permissions and
13
+ limitations under the License.
14
+ ==============================================================================*/
15
+
16
+ #include "task_context.h"
17
+
18
+ #include "utils.h"
19
+
20
+ namespace chrome_lang_id {
21
+
22
+ TaskContext::TaskContext() {}
23
+
24
+ TaskContext::~TaskContext() {}
25
+
26
+ TaskInput *TaskContext::GetInput(const string &name) {
27
+ // Return existing input if it exists.
28
+ for (int i = 0; i < spec_.input_size(); ++i) {
29
+ if (spec_.input(i).name() == name) return spec_.mutable_input(i);
30
+ }
31
+
32
+ // Create new input.
33
+ TaskInput *input = spec_.add_input();
34
+ input->set_name(name);
35
+ return input;
36
+ }
37
+
38
+ TaskInput *TaskContext::GetInput(const string &name, const string &file_format,
39
+ const string &record_format) {
40
+ TaskInput *input = GetInput(name);
41
+ if (!file_format.empty()) {
42
+ bool found = false;
43
+ for (int i = 0; i < input->file_format_size(); ++i) {
44
+ if (input->file_format(i) == file_format) found = true;
45
+ }
46
+ if (!found) input->add_file_format(file_format);
47
+ }
48
+ if (!record_format.empty()) {
49
+ bool found = false;
50
+ for (int i = 0; i < input->record_format_size(); ++i) {
51
+ if (input->record_format(i) == record_format) found = true;
52
+ }
53
+ if (!found) input->add_record_format(record_format);
54
+ }
55
+ return input;
56
+ }
57
+
58
+ void TaskContext::SetParameter(const string &name, const string &value) {
59
+ // If the parameter already exists update the value.
60
+ for (int i = 0; i < spec_.parameter_size(); ++i) {
61
+ if (spec_.parameter(i).name() == name) {
62
+ spec_.mutable_parameter(i)->set_value(value);
63
+ return;
64
+ }
65
+ }
66
+
67
+ // Add new parameter.
68
+ TaskSpec::Parameter *param = spec_.add_parameter();
69
+ param->set_name(name);
70
+ param->set_value(value);
71
+ }
72
+
73
+ string TaskContext::GetParameter(const string &name) const {
74
+ // First try to find parameter in task specification.
75
+ for (int i = 0; i < spec_.parameter_size(); ++i) {
76
+ if (spec_.parameter(i).name() == name) return spec_.parameter(i).value();
77
+ }
78
+
79
+ // Parameter not found, return empty string.
80
+ return "";
81
+ }
82
+
83
+ int TaskContext::GetIntParameter(const string &name) const {
84
+ string value = GetParameter(name);
85
+ return utils::ParseUsing<int>(value, 0, utils::ParseInt32);
86
+ }
87
+
88
+ bool TaskContext::GetBoolParameter(const string &name) const {
89
+ string value = GetParameter(name);
90
+ return value == "true";
91
+ }
92
+
93
+ double TaskContext::GetFloatParameter(const string &name) const {
94
+ string value = GetParameter(name);
95
+ return utils::ParseUsing<double>(value, .0, utils::ParseDouble);
96
+ }
97
+
98
+ string TaskContext::Get(const string &name, const char *defval) const {
99
+ // First try to find parameter in task specification.
100
+ for (int i = 0; i < spec_.parameter_size(); ++i) {
101
+ if (spec_.parameter(i).name() == name) return spec_.parameter(i).value();
102
+ }
103
+
104
+ // Parameter not found, return default value.
105
+ return defval;
106
+ }
107
+
108
+ string TaskContext::Get(const string &name, const string &defval) const {
109
+ return Get(name, defval.c_str());
110
+ }
111
+
112
+ int TaskContext::Get(const string &name, int defval) const {
113
+ string value = Get(name, "");
114
+ return utils::ParseUsing<int>(value, defval, utils::ParseInt32);
115
+ }
116
+
117
+ double TaskContext::Get(const string &name, double defval) const {
118
+ string value = Get(name, "");
119
+ return utils::ParseUsing<double>(value, defval, utils::ParseDouble);
120
+ }
121
+
122
+ bool TaskContext::Get(const string &name, bool defval) const {
123
+ string value = Get(name, "");
124
+ return value.empty() ? defval : value == "true";
125
+ }
126
+
127
+ string TaskContext::InputFile(const TaskInput &input) {
128
+ CLD3_CHECK(input.part_size() == 1);
129
+ return input.part(0).file_pattern();
130
+ }
131
+
132
+ bool TaskContext::Supports(const TaskInput &input, const string &file_format,
133
+ const string &record_format) {
134
+ // Check file format.
135
+ if (input.file_format_size() > 0) {
136
+ bool found = false;
137
+ for (int i = 0; i < input.file_format_size(); ++i) {
138
+ if (input.file_format(i) == file_format) {
139
+ found = true;
140
+ break;
141
+ }
142
+ }
143
+ if (!found) return false;
144
+ }
145
+
146
+ // Check record format.
147
+ if (input.record_format_size() > 0) {
148
+ bool found = false;
149
+ for (int i = 0; i < input.record_format_size(); ++i) {
150
+ if (input.record_format(i) == record_format) {
151
+ found = true;
152
+ break;
153
+ }
154
+ }
155
+ if (!found) return false;
156
+ }
157
+
158
+ return true;
159
+ }
160
+
161
+ } // namespace chrome_lang_id
@@ -0,0 +1,81 @@
1
+ /* Copyright 2016 Google Inc. All Rights Reserved.
2
+
3
+ Licensed under the Apache License, Version 2.0 (the "License");
4
+ you may not use this file except in compliance with the License.
5
+ You may obtain a copy of the License at
6
+
7
+ http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+ Unless required by applicable law or agreed to in writing, software
10
+ distributed under the License is distributed on an "AS IS" BASIS,
11
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ See the License for the specific language governing permissions and
13
+ limitations under the License.
14
+ ==============================================================================*/
15
+
16
+ #ifndef TASK_CONTEXT_H_
17
+ #define TASK_CONTEXT_H_
18
+
19
+ #include <string>
20
+ #include <vector>
21
+
22
+ #include "base.h"
23
+ #include "cld_3/protos/task_spec.pb.h"
24
+
25
+ namespace chrome_lang_id {
26
+
27
+ // A task context holds configuration information for a task. It is a thin
28
+ // wrapper around a TaskSpec protocol buffer, which it stores by value (spec_).
29
+ class TaskContext {
30
+ public:
31
+ TaskContext();
32
+ ~TaskContext();
33
+
34
+ // Read-only and mutable access to the underlying task specification protocol buffer.
35
+ const TaskSpec &spec() const { return spec_; }
36
+ TaskSpec *mutable_spec() { return &spec_; }
37
+
38
+ // Returns a named input descriptor for the task. A new input is created if
39
+ // the task context does not already have an input with that name. The three-argument overload also adds each non-empty format that the input does not list yet.
40
+ TaskInput *GetInput(const string &name);
41
+ TaskInput *GetInput(const string &name, const string &file_format,
42
+ const string &record_format);
43
+
44
+ // Sets task parameter, overwriting the value of an existing parameter with the same name.
45
+ void SetParameter(const string &name, const string &value);
46
+
47
+ // Returns task parameter. If the parameter is not in the task configuration,
48
+ // GetParameter returns an empty string and GetBoolParameter returns false; the Int/Float variants pass 0 / .0 as the default to utils::ParseUsing. No command line flag is consulted.
49
+ string GetParameter(const string &name) const;
50
+ int GetIntParameter(const string &name) const;
51
+ bool GetBoolParameter(const string &name) const;
52
+ double GetFloatParameter(const string &name) const;
53
+
54
+ // Returns task parameter. If the parameter is not in the task configuration
55
+ // the default value is returned. Parameters retrieved using these methods
56
+ // don't need to be defined with a DEFINE_*() macro. The bool overload also returns defval for an empty value and treats only the exact text "true" as true.
57
+ string Get(const string &name, const string &defval) const;
58
+ string Get(const string &name, const char *defval) const;
59
+ int Get(const string &name, int defval) const;
60
+ double Get(const string &name, double defval) const;
61
+ bool Get(const string &name, bool defval) const;
62
+
63
+ // Returns input file name (the file pattern of its only part) for a single-file task input; CLD3_CHECKs that the input has exactly one part.
64
+ static string InputFile(const TaskInput &input);
65
+
66
+ // Returns true if task input supports the file and record format. An input that lists no formats of a given kind accepts any value for it.
67
+ static bool Supports(const TaskInput &input, const string &file_format,
68
+ const string &record_format);
69
+
70
+ private:
71
+ // Underlying task specification protocol buffer.
72
+ TaskSpec spec_;
73
+
74
+ // Vector of parameters required by this task. These must be specified in the
75
+ // task rather than relying on default values. NOTE(review): no method visible in task_context.cc reads or writes this member.
76
+ std::vector<string> required_parameters_;
77
+ };
78
+
79
+ } // namespace chrome_lang_id
80
+
81
+ #endif // TASK_CONTEXT_H_
@@ -0,0 +1,74 @@
1
+ /* Copyright 2016 Google Inc. All Rights Reserved.
2
+
3
+ Licensed under the Apache License, Version 2.0 (the "License");
4
+ you may not use this file except in compliance with the License.
5
+ You may obtain a copy of the License at
6
+
7
+ http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+ Unless required by applicable law or agreed to in writing, software
10
+ distributed under the License is distributed on an "AS IS" BASIS,
11
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ See the License for the specific language governing permissions and
13
+ limitations under the License.
14
+ ==============================================================================*/
15
+
16
+ // This file contains the hard-coded parameters from the training workflow. If
17
+ // you update the binary model, you may need to update the variables below as
18
+ // well.
19
+
20
+ #include "task_context_params.h"
21
+
22
+ #include "task_context.h"
23
+
24
+ namespace chrome_lang_id {
25
+
26
+ void TaskContextParams::ToTaskContext(TaskContext *context) {
27
+ context->SetParameter("language_identifier_features",
28
+ kLanguageIdentifierFeatures);
29
+ context->SetParameter("language_identifier_embedding_names",
30
+ kLanguageIdentifierEmbeddingNames);
31
+ context->SetParameter("language_identifier_embedding_dims",
32
+ kLanguageIdentifierEmbeddingDims);
33
+ }
34
+
35
+ int TaskContextParams::GetNumLanguages() {
36
+ int i = 0;
37
+ while (kLanguageNames[i] != nullptr) {
38
+ i++;
39
+ }
40
+ return i;
41
+ }
42
+
43
+ const char *const TaskContextParams::kLanguageNames[] = {  // Language codes; entry i is what language_names(i) returns, so the order matters to callers.
44
+ "eo", "co", "eu", "ta", "de", "mt", "ps", "te", "su", "uz", "zh-Latn", "ne",
45
+ "nl", "sw", "sq", "hmn", "ja", "no", "mn", "so", "ko", "kk", "sl", "ig",
46
+ "mr", "th", "zu", "ml", "hr", "bs", "lo", "sd", "cy", "hy", "uk", "pt",
47
+ "lv", "iw", "cs", "vi", "jv", "be", "km", "mk", "tr", "fy", "am", "zh",
48
+ "da", "sv", "fi", "ht", "af", "la", "id", "fil", "sm", "ca", "el", "ka",
49
+ "sr", "it", "sk", "ru", "ru-Latn", "bg", "ny", "fa", "haw", "gl", "et",
50
+ "ms", "gd", "bg-Latn", "ha", "is", "ur", "mi", "hi", "bn", "hi-Latn", "fr",
51
+ "yi", "hu", "xh", "my", "tg", "ro", "ar", "lb", "el-Latn", "st", "ceb",
52
+ "kn", "az", "si", "ky", "mg", "en", "gu", "es", "pl", "ja-Latn", "ga", "lt",
53
+ "sn", "yo", "pa", "ku",
54
+
55
+ // Sentinel: GetNumLanguages() counts entries up to the first nullptr, so this must stay the last element.
56
+ nullptr,
57
+ };
58
+
59
+ const char TaskContextParams::kLanguageIdentifierFeatures[] =  // Feature specification text; ToTaskContext() stores it under the language_identifier_features parameter. Adjacent literals concatenate into one string.
60
+ "continuous-bag-of-ngrams(include_terminators=true,include_spaces=false,"
61
+ "use_equal_weight=false,id_dim=1000,size=2);continuous-bag-of-ngrams("
62
+ "include_terminators=true,include_spaces=false,use_equal_weight=false,id_"
63
+ "dim=5000,size=4);continuous-bag-of-relevant-scripts;script;continuous-bag-"
64
+ "of-ngrams(include_terminators=true,include_spaces=false,use_equal_weight="
65
+ "false,id_dim=5000,size=3);continuous-bag-of-ngrams(include_terminators="
66
+ "true,include_spaces=false,use_equal_weight=false,id_dim=100,size=1)";
67
+
68
+ const char TaskContextParams::kLanguageIdentifierEmbeddingNames[] =  // Semicolon-separated list of six names; ToTaskContext() stores it under the language_identifier_embedding_names parameter.
69
+ "bigrams;quadgrams;relevant-scripts;text-script;trigrams;unigrams";
70
+
71
+ const char TaskContextParams::kLanguageIdentifierEmbeddingDims[] =  // Semicolon-separated list of six sizes, stored under the language_identifier_embedding_dims parameter; presumably one per embedding name in the same order (TODO confirm).
72
+ "16;16;8;8;16;16";
73
+
74
+ } // namespace chrome_lang_id
@@ -0,0 +1,54 @@
1
+ /* Copyright 2016 Google Inc. All Rights Reserved.
2
+
3
+ Licensed under the Apache License, Version 2.0 (the "License");
4
+ you may not use this file except in compliance with the License.
5
+ You may obtain a copy of the License at
6
+
7
+ http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+ Unless required by applicable law or agreed to in writing, software
10
+ distributed under the License is distributed on an "AS IS" BASIS,
11
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ See the License for the specific language governing permissions and
13
+ limitations under the License.
14
+ ==============================================================================*/
15
+
16
+ #ifndef TASK_CONTEXT_PARAMS_H_
17
+ #define TASK_CONTEXT_PARAMS_H_
18
+
19
+ #include <string>
20
+
21
+ #include "base.h"
22
+ #include "task_context.h"
23
+
24
+ namespace chrome_lang_id {
25
+
26
+ // Encapsulates the TaskContext specifying only the parameters for the model.
27
+ // The model weights are loaded statically. All members are static, so no instance is needed.
28
+ class TaskContextParams {
29
+ public:
30
+ // Gets the name of the i'th language. No bounds check is made: i indexes kLanguageNames directly.
31
+ static const char *language_names(int i) { return kLanguageNames[i]; }
32
+
33
+ // Saves the parameters (features, embedding names, embedding dims) to the given TaskContext via SetParameter().
34
+ static void ToTaskContext(TaskContext *context);
35
+
36
+ // Gets the number of languages, counted as the entries of kLanguageNames before its nullptr terminator.
37
+ static int GetNumLanguages();
38
+
39
+ private:
40
+ // Names of all the languages, terminated by a nullptr entry.
41
+ static const char *const kLanguageNames[];
42
+
43
+ // Features in FML format.
44
+ static const char kLanguageIdentifierFeatures[];
45
+
46
+ // Names of the embedding spaces, as one semicolon-separated string.
47
+ static const char kLanguageIdentifierEmbeddingNames[];
48
+
49
+ // Dimensions of the embedding spaces, as one semicolon-separated string.
50
+ static const char kLanguageIdentifierEmbeddingDims[];
51
+ };
52
+ } // namespace chrome_lang_id
53
+
54
+ #endif // TASK_CONTEXT_PARAMS_H_