cld3 3.2.5 → 3.2.6

Sign up to get free protection for your applications and to get access to all the features.
Files changed (113)
  1. checksums.yaml +4 -4
  2. data/{ext/cld3/ext/LICENSE → LICENSE_CLD3} +0 -0
  3. data/cld3.gemspec +1 -1
  4. data/ext/cld3/Makefile +266 -0
  5. data/ext/cld3/{ext/src/base.cc → base.cc} +0 -0
  6. data/ext/cld3/{ext/src/base.h → base.h} +0 -0
  7. data/ext/cld3/base.o +0 -0
  8. data/ext/cld3/{ext/src/casts.h → casts.h} +0 -0
  9. data/ext/cld3/{ext/src/embedding_feature_extractor.cc → embedding_feature_extractor.cc} +0 -0
  10. data/ext/cld3/{ext/src/embedding_feature_extractor.h → embedding_feature_extractor.h} +0 -0
  11. data/ext/cld3/embedding_feature_extractor.o +0 -0
  12. data/ext/cld3/{ext/src/embedding_network.cc → embedding_network.cc} +0 -0
  13. data/ext/cld3/{ext/src/embedding_network.h → embedding_network.h} +0 -0
  14. data/ext/cld3/embedding_network.o +0 -0
  15. data/ext/cld3/{ext/src/embedding_network_params.h → embedding_network_params.h} +0 -0
  16. data/ext/cld3/{ext/src/feature_extractor.cc → feature_extractor.cc} +0 -0
  17. data/ext/cld3/{ext/src/feature_extractor.h → feature_extractor.h} +0 -0
  18. data/ext/cld3/feature_extractor.o +0 -0
  19. data/ext/cld3/feature_extractor.pb.o +0 -0
  20. data/ext/cld3/{ext/src/feature_extractor.proto → feature_extractor.proto} +0 -0
  21. data/ext/cld3/{ext/src/feature_types.cc → feature_types.cc} +0 -0
  22. data/ext/cld3/{ext/src/feature_types.h → feature_types.h} +0 -0
  23. data/ext/cld3/feature_types.o +0 -0
  24. data/ext/cld3/{ext/src/script_span/fixunicodevalue.cc → fixunicodevalue.cc} +0 -0
  25. data/ext/cld3/{ext/src/script_span/fixunicodevalue.h → fixunicodevalue.h} +0 -0
  26. data/ext/cld3/fixunicodevalue.o +0 -0
  27. data/ext/cld3/{ext/src/float16.h → float16.h} +0 -0
  28. data/ext/cld3/{ext/src/fml_parser.cc → fml_parser.cc} +0 -0
  29. data/ext/cld3/{ext/src/fml_parser.h → fml_parser.h} +0 -0
  30. data/ext/cld3/fml_parser.o +0 -0
  31. data/ext/cld3/{ext/src/script_span/generated_entities.cc → generated_entities.cc} +0 -0
  32. data/ext/cld3/generated_entities.o +0 -0
  33. data/ext/cld3/{ext/src/script_span/generated_ulscript.cc → generated_ulscript.cc} +0 -0
  34. data/ext/cld3/{ext/src/script_span/generated_ulscript.h → generated_ulscript.h} +0 -0
  35. data/ext/cld3/generated_ulscript.o +0 -0
  36. data/ext/cld3/{ext/src/script_span/getonescriptspan.cc → getonescriptspan.cc} +0 -0
  37. data/ext/cld3/{ext/src/script_span/getonescriptspan.h → getonescriptspan.h} +0 -0
  38. data/ext/cld3/getonescriptspan.o +0 -0
  39. data/ext/cld3/{ext/src/script_span/integral_types.h → integral_types.h} +0 -0
  40. data/ext/cld3/{ext/src/lang_id_nn_params.cc → lang_id_nn_params.cc} +0 -0
  41. data/ext/cld3/{ext/src/lang_id_nn_params.h → lang_id_nn_params.h} +0 -0
  42. data/ext/cld3/lang_id_nn_params.o +0 -0
  43. data/ext/cld3/{ext/src/language_identifier_features.cc → language_identifier_features.cc} +0 -0
  44. data/ext/cld3/{ext/src/language_identifier_features.h → language_identifier_features.h} +0 -0
  45. data/ext/cld3/language_identifier_features.o +0 -0
  46. data/ext/cld3/libcld3.so +0 -0
  47. data/ext/cld3/mkmf.log +36 -0
  48. data/ext/cld3/{ext/src/nnet_language_identifier.cc → nnet_language_identifier.cc} +0 -0
  49. data/ext/cld3/{ext/src/nnet_language_identifier.h → nnet_language_identifier.h} +0 -0
  50. data/ext/cld3/nnet_language_identifier.o +0 -0
  51. data/ext/cld3/nnet_language_identifier_c.o +0 -0
  52. data/ext/cld3/{ext/src/script_span/offsetmap.cc → offsetmap.cc} +0 -0
  53. data/ext/cld3/{ext/src/script_span/offsetmap.h → offsetmap.h} +0 -0
  54. data/ext/cld3/offsetmap.o +0 -0
  55. data/ext/cld3/{ext/src/script_span/port.h → port.h} +0 -0
  56. data/ext/cld3/{ext/src/registry.cc → registry.cc} +0 -0
  57. data/ext/cld3/{ext/src/registry.h → registry.h} +0 -0
  58. data/ext/cld3/registry.o +0 -0
  59. data/ext/cld3/{ext/src/relevant_script_feature.cc → relevant_script_feature.cc} +0 -0
  60. data/ext/cld3/{ext/src/relevant_script_feature.h → relevant_script_feature.h} +0 -0
  61. data/ext/cld3/relevant_script_feature.o +0 -0
  62. data/ext/cld3/{ext/src/script_detector.h → script_detector.h} +0 -0
  63. data/ext/cld3/sentence.pb.o +0 -0
  64. data/ext/cld3/{ext/src/sentence.proto → sentence.proto} +0 -0
  65. data/ext/cld3/{ext/src/sentence_features.cc → sentence_features.cc} +0 -0
  66. data/ext/cld3/{ext/src/sentence_features.h → sentence_features.h} +0 -0
  67. data/ext/cld3/sentence_features.o +0 -0
  68. data/ext/cld3/{ext/src/simple_adder.h → simple_adder.h} +0 -0
  69. data/ext/cld3/{ext/src/script_span/stringpiece.h → stringpiece.h} +0 -0
  70. data/ext/cld3/{ext/src/task_context.cc → task_context.cc} +0 -0
  71. data/ext/cld3/{ext/src/task_context.h → task_context.h} +0 -0
  72. data/ext/cld3/task_context.o +0 -0
  73. data/ext/cld3/{ext/src/task_context_params.cc → task_context_params.cc} +0 -0
  74. data/ext/cld3/{ext/src/task_context_params.h → task_context_params.h} +0 -0
  75. data/ext/cld3/task_context_params.o +0 -0
  76. data/ext/cld3/task_spec.pb.o +0 -0
  77. data/ext/cld3/{ext/src/task_spec.proto → task_spec.proto} +0 -0
  78. data/ext/cld3/{ext/src/script_span/text_processing.cc → text_processing.cc} +0 -0
  79. data/ext/cld3/{ext/src/script_span/text_processing.h → text_processing.h} +0 -0
  80. data/ext/cld3/text_processing.o +0 -0
  81. data/ext/cld3/{ext/src/unicodetext.cc → unicodetext.cc} +0 -0
  82. data/ext/cld3/{ext/src/unicodetext.h → unicodetext.h} +0 -0
  83. data/ext/cld3/unicodetext.o +0 -0
  84. data/ext/cld3/{ext/src/script_span/utf8acceptinterchange.h → utf8acceptinterchange.h} +0 -0
  85. data/ext/cld3/{ext/src/script_span/utf8prop_lettermarkscriptnum.h → utf8prop_lettermarkscriptnum.h} +0 -0
  86. data/ext/cld3/{ext/src/script_span/utf8repl_lettermarklower.h → utf8repl_lettermarklower.h} +0 -0
  87. data/ext/cld3/{ext/src/script_span/utf8scannot_lettermarkspecial.h → utf8scannot_lettermarkspecial.h} +0 -0
  88. data/ext/cld3/{ext/src/script_span/utf8statetable.cc → utf8statetable.cc} +0 -0
  89. data/ext/cld3/{ext/src/script_span/utf8statetable.h → utf8statetable.h} +0 -0
  90. data/ext/cld3/utf8statetable.o +0 -0
  91. data/ext/cld3/{ext/src/utils.cc → utils.cc} +0 -0
  92. data/ext/cld3/{ext/src/utils.h → utils.h} +0 -0
  93. data/ext/cld3/utils.o +0 -0
  94. data/ext/cld3/{ext/src/workspace.cc → workspace.cc} +0 -0
  95. data/ext/cld3/{ext/src/workspace.h → workspace.h} +0 -0
  96. data/ext/cld3/workspace.o +0 -0
  97. metadata +96 -81
  98. data/ext/cld3/ext/CMakeLists.txt +0 -69
  99. data/ext/cld3/ext/CONTRIBUTING.md +0 -26
  100. data/ext/cld3/ext/README.md +0 -73
  101. data/ext/cld3/ext/misc/myprotobuf.cmake +0 -58
  102. data/ext/cld3/ext/model.png +0 -0
  103. data/ext/cld3/ext/src/BUILD.gn +0 -133
  104. data/ext/cld3/ext/src/DEPS +0 -4
  105. data/ext/cld3/ext/src/language_identifier_features_test.cc +0 -261
  106. data/ext/cld3/ext/src/language_identifier_main.cc +0 -54
  107. data/ext/cld3/ext/src/nnet_lang_id_test.cc +0 -254
  108. data/ext/cld3/ext/src/nnet_lang_id_test_data.cc +0 -529
  109. data/ext/cld3/ext/src/nnet_lang_id_test_data.h +0 -117
  110. data/ext/cld3/ext/src/relevant_script_feature_test.cc +0 -259
  111. data/ext/cld3/ext/src/script_detector_test.cc +0 -161
  112. data/ext/cld3/ext/src/script_span/README.md +0 -11
  113. data/ext/cld3/ext/src/script_span/getonescriptspan_test.cc +0 -135
Binary file
@@ -1,133 +0,0 @@
1
- # Copyright 2016 Google Inc. All Rights Reserved.
2
- #
3
- # Licensed under the Apache License, Version 2.0 (the "License");
4
- # you may not use this file except in compliance with the License.
5
- # You may obtain a copy of the License at
6
- #
7
- # http://www.apache.org/licenses/LICENSE-2.0
8
- #
9
- # Unless required by applicable law or agreed to in writing, software
10
- # distributed under the License is distributed on an "AS IS" BASIS,
11
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- # See the License for the specific language governing permissions and
13
- # limitations under the License.
14
- #==============================================================================
15
-
16
- import("//third_party/protobuf/proto_library.gni")
17
-
18
- proto_library("protos") {
19
- sources = [
20
- "feature_extractor.proto",
21
- "sentence.proto",
22
- "task_spec.proto",
23
- ]
24
- proto_out_dir = "cld_3/protos"
25
- }
26
-
27
- static_library("cld_3") {
28
- sources = [
29
- "base.cc",
30
- "base.h",
31
- "casts.h",
32
- "embedding_feature_extractor.cc",
33
- "embedding_feature_extractor.h",
34
- "embedding_network.cc",
35
- "embedding_network.h",
36
- "embedding_network_params.h",
37
- "feature_extractor.cc",
38
- "feature_extractor.h",
39
- "feature_types.cc",
40
- "feature_types.h",
41
- "float16.h",
42
- "fml_parser.cc",
43
- "fml_parser.h",
44
- "language_identifier_features.cc",
45
- "language_identifier_features.h",
46
- "lang_id_nn_params.cc",
47
- "lang_id_nn_params.h",
48
- "nnet_language_identifier.cc",
49
- "nnet_language_identifier.h",
50
- "registry.cc",
51
- "registry.h",
52
- "relevant_script_feature.cc",
53
- "relevant_script_feature.h",
54
- "script_detector.h",
55
- "sentence_features.cc",
56
- "sentence_features.h",
57
- "simple_adder.h",
58
- "script_span/fixunicodevalue.cc",
59
- "script_span/fixunicodevalue.h",
60
- "script_span/generated_entities.cc",
61
- "script_span/generated_ulscript.cc",
62
- "script_span/generated_ulscript.h",
63
- "script_span/getonescriptspan.cc",
64
- "script_span/getonescriptspan.h",
65
- "script_span/integral_types.h",
66
- "script_span/offsetmap.cc",
67
- "script_span/offsetmap.h",
68
- "script_span/port.h",
69
- "script_span/stringpiece.h",
70
- "script_span/text_processing.cc",
71
- "script_span/text_processing.h",
72
- "script_span/utf8acceptinterchange.h",
73
- "script_span/utf8prop_lettermarkscriptnum.h",
74
- "script_span/utf8repl_lettermarklower.h",
75
- "script_span/utf8scannot_lettermarkspecial.h",
76
- "script_span/utf8statetable.cc",
77
- "script_span/utf8statetable.h",
78
- "task_context.cc",
79
- "task_context.h",
80
- "task_context_params.cc",
81
- "task_context_params.h",
82
- "unicodetext.cc",
83
- "unicodetext.h",
84
- "utils.cc",
85
- "utils.h",
86
- "workspace.cc",
87
- "workspace.h",
88
- ]
89
- public_deps = [
90
- "//third_party/protobuf:protobuf_lite",
91
- ":protos",
92
- ]
93
- }
94
-
95
- # The executables below are functional. Uncomment to use.
96
-
97
- #executable("language_identifier_main") {
98
- # sources = [
99
- # "language_identifier_main.cc",
100
- # ]
101
- # deps = [
102
- # ":cld_3",
103
- # ]
104
- #}
105
-
106
- #executable("getonescriptspan_test") {
107
- # sources = [
108
- # "script_span/getonescriptspan_test.cc",
109
- # ]
110
- # deps = [
111
- # ":cld_3",
112
- # ]
113
- #}
114
-
115
- #executable("language_identifier_features_test") {
116
- # sources = [
117
- # "language_identifier_features_test.cc",
118
- # ]
119
- # deps = [
120
- # ":cld_3",
121
- # ]
122
- #}
123
-
124
- #executable("nnet_lang_id_test") {
125
- # sources = [
126
- # "nnet_lang_id_test.cc",
127
- # "nnet_lang_id_test_data.cc",
128
- # "nnet_lang_id_test_data.h",
129
- # ]
130
- # deps = [
131
- # ":cld_3",
132
- # ]
133
- #}
@@ -1,4 +0,0 @@
1
- include_rules = [
2
- '+cld_3',
3
- '+script_span',
4
- ]
@@ -1,261 +0,0 @@
1
- /* Copyright 2016 Google Inc. All Rights Reserved.
2
-
3
- Licensed under the Apache License, Version 2.0 (the "License");
4
- you may not use this file except in compliance with the License.
5
- You may obtain a copy of the License at
6
-
7
- http://www.apache.org/licenses/LICENSE-2.0
8
-
9
- Unless required by applicable law or agreed to in writing, software
10
- distributed under the License is distributed on an "AS IS" BASIS,
11
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- See the License for the specific language governing permissions and
13
- limitations under the License.
14
- ==============================================================================*/
15
-
16
- #include <cmath>
17
- #include <iostream>
18
- #include <vector>
19
- #include <set>
20
-
21
- #include "base.h"
22
- #include "feature_extractor.h"
23
- #include "language_identifier_features.h"
24
- #include "nnet_language_identifier.h"
25
- #include "script_span/generated_ulscript.h"
26
- #include "cld_3/protos/sentence.pb.h"
27
- #include "task_context.h"
28
- #include "utils.h"
29
- #include "workspace.h"
30
-
31
- namespace chrome_lang_id {
32
- namespace language_identifier_features_test {
33
-
34
- static WholeSentenceFeature *cbog_factory() {
35
- return new ContinuousBagOfNgramsFunction;
36
- }
37
-
38
- static WholeSentenceFeature *sf_factory() { return new ScriptFeature; }
39
-
40
- // Class for calculating the feature weights and ids.
41
- class FeatureIdWeightCalculator {
42
- public:
43
- explicit FeatureIdWeightCalculator(TaskContext *context) {
44
- if (WholeSentenceFeature::registry() == nullptr) {
45
- // Create registry for our WholeSentenceFeature(s).
46
- RegisterableClass<WholeSentenceFeature>::CreateRegistry(
47
- "sentence feature function", "WholeSentenceFeature", __FILE__,
48
- __LINE__);
49
- }
50
-
51
- // Register our WholeSentenceFeature(s).
52
- // Register ContinuousBagOfNgramsFunction feature function.
53
- static WholeSentenceFeature::Registry::Registrar cbog_registrar(
54
- WholeSentenceFeature::registry(), "continuous-bag-of-ngrams",
55
- "ContinuousBagOfNgramsFunction", __FILE__, __LINE__, cbog_factory);
56
-
57
- // Register Script feature function.
58
- static WholeSentenceFeature::Registry::Registrar sf_registrar(
59
- WholeSentenceFeature::registry(), "script", "ScriptFeature", __FILE__,
60
- __LINE__, sf_factory);
61
-
62
- feature_extractor_.Setup(context);
63
- feature_extractor_.Init(context);
64
- }
65
-
66
- // Assumes that a single feature is specified and extracts it.
67
- void ExtractOnlyFeature(Sentence *sentence,
68
- std::vector<FeatureVector> *features) {
69
- CLD3_CHECK(features->size() == 1);
70
- WorkspaceSet workspace;
71
- workspace.Reset(workspace_registry_);
72
- feature_extractor_.Preprocess(&workspace, sentence);
73
- feature_extractor_.ExtractFeatures(workspace, *sentence, features);
74
- CLD3_CHECK(features->size() == 1);
75
- }
76
-
77
- // Returns a map from feature value id to feature value weight.
78
- std::unordered_map<int, float> GetFloatFeatureValIdsAndWeights(
79
- Sentence *sentence) {
80
- std::vector<FeatureVector> feature_vectors(1); // one feature space
81
- ExtractOnlyFeature(sentence, &feature_vectors);
82
- const FeatureVector &feature_vector = feature_vectors.at(0);
83
-
84
- // Save the (feature value id, feature value weight) pairs to a map.
85
- std::unordered_map<int, float> feature_id_weight;
86
- for (int index = 0; index < feature_vector.size(); ++index) {
87
- const FloatFeatureValue feature_value =
88
- FloatFeatureValue(feature_vector.value(index));
89
- feature_id_weight[feature_value.value.id] = feature_value.value.weight;
90
- }
91
- return feature_id_weight;
92
- }
93
-
94
- // Returns the feature value ids.
95
- std::set<int> GetFeatureValueIds(Sentence *sentence) {
96
- std::vector<FeatureVector> feature_vectors(1); // one feature space
97
- ExtractOnlyFeature(sentence, &feature_vectors);
98
- const FeatureVector &feature_vector = feature_vectors.at(0);
99
-
100
- std::set<int> ids;
101
- for (int index = 0; index < feature_vector.size(); ++index) {
102
- ids.insert(feature_vector.value(index));
103
- }
104
- return ids;
105
- }
106
-
107
- private:
108
- // The registry of shared workspaces in the feature extractor.
109
- WorkspaceRegistry workspace_registry_;
110
- LanguageIdEmbeddingFeatureExtractor feature_extractor_;
111
- };
112
-
113
- // Extracts features and checks that their ids and weights are correct.
114
- bool ExtractAndCheckFeatures(const string &features, const int id_dim,
115
- const std::vector<string> &expected_char_ngrams,
116
- const std::vector<float> &expected_weights,
117
- Sentence *sentence) {
118
- TaskContext context;
119
- context.SetParameter("language_identifier_features", features);
120
- FeatureIdWeightCalculator calc(&context);
121
-
122
- // Get the feature ids and the corresponding weights.
123
- const std::unordered_map<int, float> feature_id_weight =
124
- calc.GetFloatFeatureValIdsAndWeights(sentence);
125
- if (feature_id_weight.size() != expected_char_ngrams.size()) {
126
- std::cout << " Failure" << std::endl;
127
- std::cout << " Number of expected feature ids: "
128
- << expected_char_ngrams.size() << std::endl;
129
- std::cout << " Number of extracted feature ids: "
130
- << feature_id_weight.size() << std::endl;
131
- return false;
132
- }
133
-
134
- // Specifies how close two float values should be to be considered equal.
135
- const float epsilon = 0.0001f;
136
- bool test_successful = true;
137
- for (size_t i = 0; i < expected_char_ngrams.size(); ++i) {
138
- const int expected_id =
139
- utils::Hash32WithDefaultSeed(expected_char_ngrams.at(i)) % id_dim;
140
-
141
- // Check the ids and the weights.
142
- if (feature_id_weight.count(expected_id) == 0) {
143
- std::cout << " Failure" << std::endl;
144
- std::cout << " Feature id " << expected_id << " is missing" << std::endl;
145
- test_successful = false;
146
- } else {
147
- if (std::abs(feature_id_weight.at(expected_id) - expected_weights.at(i)) >
148
- epsilon) {
149
- std::cout << " Failure" << std::endl;
150
- std::cout << " Different weight for feature id " << expected_id
151
- << ": expected weight " << expected_weights.at(i)
152
- << ", actual weight " << feature_id_weight.at(expected_id)
153
- << std::endl;
154
- test_successful = false;
155
- }
156
- }
157
- }
158
-
159
- if (test_successful) {
160
- std::cout << " Success!" << std::endl;
161
- }
162
- return test_successful;
163
- }
164
-
165
- // Tests the case when ngram features get equal weight. Returns "true" if the
166
- // test is successful and "false" otherwise.
167
- bool TestExtractFeaturesWithEqualWeight() {
168
- std::cout << "Running " << __FUNCTION__ << std::endl;
169
-
170
- // The integer id of each char ngram is computed as follows:
171
- // utils::Hash32WithDefaultSeed(char ngram) % id_dim.
172
- const int id_dim = 100;
173
- const string features = "continuous-bag-of-ngrams(id_dim=" +
174
- std::to_string(id_dim) +
175
- ",size=2,include_terminators=true,include_" +
176
- "spaces=false,use_equal_weight=true)";
177
- Sentence sentence;
178
- sentence.set_text("aa aab");
179
- const std::vector<string> expected_char_ngrams{"ab", "b$", "^a", "aa", "a$"};
180
- const std::vector<float> expected_weights = {0.2f, 0.2f, 0.2f, 0.2f, 0.2f};
181
- return ExtractAndCheckFeatures(features, id_dim, expected_char_ngrams,
182
- expected_weights, &sentence);
183
- }
184
-
185
- // Tests the case when ngram features get weights equal to their normalized
186
- // counts. Returns "true" if the test is successful and "false" otherwise.
187
- bool TestExtractFeaturesWithNonEqualWeight() {
188
- std::cout << "Running " << __FUNCTION__ << std::endl;
189
-
190
- // The integer id of each char ngram is computed as follows:
191
- // utils::Hash32WithDefaultSeed(char ngram) % id_dim.
192
- const int id_dim = 100;
193
- const string features = "continuous-bag-of-ngrams(id_dim=" +
194
- std::to_string(id_dim) +
195
- ",size=2,include_terminators=true,include_" +
196
- "spaces=false,use_equal_weight=false)";
197
- Sentence sentence;
198
- sentence.set_text("aa aab");
199
- const std::vector<string> expected_char_ngrams{"ab", "b$", "^a", "aa", "a$"};
200
- const std::vector<float> expected_weights{0.1428f, 0.1428f, 0.2857f, 0.2857f,
201
- 0.1428f};
202
- return ExtractAndCheckFeatures(features, id_dim, expected_char_ngrams,
203
- expected_weights, &sentence);
204
- }
205
-
206
- // Tests the feature Script.
207
- bool TestScriptFeature() {
208
- std::cout << "Running " << __FUNCTION__ << std::endl;
209
-
210
- bool test_successful = true;
211
- TaskContext context;
212
- context.SetParameter("language_identifier_features", "script");
213
- FeatureIdWeightCalculator calc(&context);
214
-
215
- // Check the script of the English sentence.
216
- Sentence sentence;
217
- sentence.set_text("food");
218
- std::set<int> feature_val_ids = calc.GetFeatureValueIds(&sentence);
219
- if (feature_val_ids.size() != 1 ||
220
- feature_val_ids.count(chrome_lang_id::CLD2::ULScript_Latin) == 0) {
221
- test_successful = false;
222
- std::cout << " Failure for input: " << sentence.text() << std::endl;
223
- }
224
-
225
- // Check the script of a Chinese sentence.
226
- sentence.set_text("字");
227
- feature_val_ids = calc.GetFeatureValueIds(&sentence);
228
- if (feature_val_ids.size() != 1 ||
229
- feature_val_ids.count(chrome_lang_id::CLD2::ULScript_Hani) == 0) {
230
- test_successful = false;
231
- std::cout << " Failure for input: " << sentence.text() << std::endl;
232
- }
233
-
234
- // Check the script of a Korean sentence.
235
- sentence.set_text("워드");
236
- feature_val_ids = calc.GetFeatureValueIds(&sentence);
237
- if (feature_val_ids.size() != 1 ||
238
- feature_val_ids.count(chrome_lang_id::CLD2::NUM_ULSCRIPTS) == 0) {
239
- test_successful = false;
240
- std::cout << " Failure for input: " << sentence.text() << std::endl;
241
- }
242
-
243
- if (test_successful) {
244
- std::cout << " Success!" << std::endl;
245
- }
246
- return test_successful;
247
- }
248
-
249
- } // namespace language_identifier_features_test
250
- } // namespace chrome_lang_id
251
-
252
- // Runs the feature extraction tests.
253
- int main(int argc, char **argv) {
254
- const bool tests_successful =
255
- chrome_lang_id::language_identifier_features_test::
256
- TestExtractFeaturesWithEqualWeight() &&
257
- chrome_lang_id::language_identifier_features_test::
258
- TestExtractFeaturesWithNonEqualWeight() &&
259
- chrome_lang_id::language_identifier_features_test::TestScriptFeature();
260
- return tests_successful ? 0 : 1;
261
- }
@@ -1,54 +0,0 @@
1
- /* Copyright 2016 Google Inc. All Rights Reserved.
2
-
3
- Licensed under the Apache License, Version 2.0 (the "License");
4
- you may not use this file except in compliance with the License.
5
- You may obtain a copy of the License at
6
-
7
- http://www.apache.org/licenses/LICENSE-2.0
8
-
9
- Unless required by applicable law or agreed to in writing, software
10
- distributed under the License is distributed on an "AS IS" BASIS,
11
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- See the License for the specific language governing permissions and
13
- limitations under the License.
14
- ==============================================================================*/
15
-
16
- #include <iostream>
17
- #include <string>
18
-
19
- #include "base.h"
20
- #include "nnet_language_identifier.h"
21
-
22
- using chrome_lang_id::NNetLanguageIdentifier;
23
-
24
- // Runs a neural net model for language identification.
25
- int main(int argc, char **argv) {
26
- NNetLanguageIdentifier lang_id(/*min_num_bytes=*/0,
27
- /*max_num_bytes=*/1000);
28
-
29
- const std::vector<std::string> texts{"This text is written in English.",
30
- "Text in deutscher Sprache verfasst."};
31
- for (const std::string &text : texts) {
32
- const NNetLanguageIdentifier::Result result = lang_id.FindLanguage(text);
33
- std::cout << "text: " << text << std::endl
34
- << " language: " << result.language << std::endl
35
- << " probability: " << result.probability << std::endl
36
- << " reliable: " << result.is_reliable << std::endl
37
- << " proportion: " << result.proportion << std::endl
38
- << std::endl;
39
- }
40
-
41
- const std::string &text =
42
- "This piece of text is in English. Този текст е на Български.";
43
- std::cout << "text: " << text << std::endl;
44
- const std::vector<NNetLanguageIdentifier::Result> results =
45
- lang_id.FindTopNMostFreqLangs(text, /*num_langs*/ 3);
46
- for (const NNetLanguageIdentifier::Result &result : results) {
47
- std::cout << " language: " << result.language << std::endl
48
- << " probability: " << result.probability << std::endl
49
- << " reliable: " << result.is_reliable << std::endl
50
- << " proportion: " << result.proportion << std::endl
51
- << std::endl;
52
- }
53
- return 0;
54
- }