cld3 3.2.5 → 3.2.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (113)
  1. checksums.yaml +4 -4
  2. data/{ext/cld3/ext/LICENSE → LICENSE_CLD3} +0 -0
  3. data/cld3.gemspec +1 -1
  4. data/ext/cld3/Makefile +266 -0
  5. data/ext/cld3/{ext/src/base.cc → base.cc} +0 -0
  6. data/ext/cld3/{ext/src/base.h → base.h} +0 -0
  7. data/ext/cld3/base.o +0 -0
  8. data/ext/cld3/{ext/src/casts.h → casts.h} +0 -0
  9. data/ext/cld3/{ext/src/embedding_feature_extractor.cc → embedding_feature_extractor.cc} +0 -0
  10. data/ext/cld3/{ext/src/embedding_feature_extractor.h → embedding_feature_extractor.h} +0 -0
  11. data/ext/cld3/embedding_feature_extractor.o +0 -0
  12. data/ext/cld3/{ext/src/embedding_network.cc → embedding_network.cc} +0 -0
  13. data/ext/cld3/{ext/src/embedding_network.h → embedding_network.h} +0 -0
  14. data/ext/cld3/embedding_network.o +0 -0
  15. data/ext/cld3/{ext/src/embedding_network_params.h → embedding_network_params.h} +0 -0
  16. data/ext/cld3/{ext/src/feature_extractor.cc → feature_extractor.cc} +0 -0
  17. data/ext/cld3/{ext/src/feature_extractor.h → feature_extractor.h} +0 -0
  18. data/ext/cld3/feature_extractor.o +0 -0
  19. data/ext/cld3/feature_extractor.pb.o +0 -0
  20. data/ext/cld3/{ext/src/feature_extractor.proto → feature_extractor.proto} +0 -0
  21. data/ext/cld3/{ext/src/feature_types.cc → feature_types.cc} +0 -0
  22. data/ext/cld3/{ext/src/feature_types.h → feature_types.h} +0 -0
  23. data/ext/cld3/feature_types.o +0 -0
  24. data/ext/cld3/{ext/src/script_span/fixunicodevalue.cc → fixunicodevalue.cc} +0 -0
  25. data/ext/cld3/{ext/src/script_span/fixunicodevalue.h → fixunicodevalue.h} +0 -0
  26. data/ext/cld3/fixunicodevalue.o +0 -0
  27. data/ext/cld3/{ext/src/float16.h → float16.h} +0 -0
  28. data/ext/cld3/{ext/src/fml_parser.cc → fml_parser.cc} +0 -0
  29. data/ext/cld3/{ext/src/fml_parser.h → fml_parser.h} +0 -0
  30. data/ext/cld3/fml_parser.o +0 -0
  31. data/ext/cld3/{ext/src/script_span/generated_entities.cc → generated_entities.cc} +0 -0
  32. data/ext/cld3/generated_entities.o +0 -0
  33. data/ext/cld3/{ext/src/script_span/generated_ulscript.cc → generated_ulscript.cc} +0 -0
  34. data/ext/cld3/{ext/src/script_span/generated_ulscript.h → generated_ulscript.h} +0 -0
  35. data/ext/cld3/generated_ulscript.o +0 -0
  36. data/ext/cld3/{ext/src/script_span/getonescriptspan.cc → getonescriptspan.cc} +0 -0
  37. data/ext/cld3/{ext/src/script_span/getonescriptspan.h → getonescriptspan.h} +0 -0
  38. data/ext/cld3/getonescriptspan.o +0 -0
  39. data/ext/cld3/{ext/src/script_span/integral_types.h → integral_types.h} +0 -0
  40. data/ext/cld3/{ext/src/lang_id_nn_params.cc → lang_id_nn_params.cc} +0 -0
  41. data/ext/cld3/{ext/src/lang_id_nn_params.h → lang_id_nn_params.h} +0 -0
  42. data/ext/cld3/lang_id_nn_params.o +0 -0
  43. data/ext/cld3/{ext/src/language_identifier_features.cc → language_identifier_features.cc} +0 -0
  44. data/ext/cld3/{ext/src/language_identifier_features.h → language_identifier_features.h} +0 -0
  45. data/ext/cld3/language_identifier_features.o +0 -0
  46. data/ext/cld3/libcld3.so +0 -0
  47. data/ext/cld3/mkmf.log +36 -0
  48. data/ext/cld3/{ext/src/nnet_language_identifier.cc → nnet_language_identifier.cc} +0 -0
  49. data/ext/cld3/{ext/src/nnet_language_identifier.h → nnet_language_identifier.h} +0 -0
  50. data/ext/cld3/nnet_language_identifier.o +0 -0
  51. data/ext/cld3/nnet_language_identifier_c.o +0 -0
  52. data/ext/cld3/{ext/src/script_span/offsetmap.cc → offsetmap.cc} +0 -0
  53. data/ext/cld3/{ext/src/script_span/offsetmap.h → offsetmap.h} +0 -0
  54. data/ext/cld3/offsetmap.o +0 -0
  55. data/ext/cld3/{ext/src/script_span/port.h → port.h} +0 -0
  56. data/ext/cld3/{ext/src/registry.cc → registry.cc} +0 -0
  57. data/ext/cld3/{ext/src/registry.h → registry.h} +0 -0
  58. data/ext/cld3/registry.o +0 -0
  59. data/ext/cld3/{ext/src/relevant_script_feature.cc → relevant_script_feature.cc} +0 -0
  60. data/ext/cld3/{ext/src/relevant_script_feature.h → relevant_script_feature.h} +0 -0
  61. data/ext/cld3/relevant_script_feature.o +0 -0
  62. data/ext/cld3/{ext/src/script_detector.h → script_detector.h} +0 -0
  63. data/ext/cld3/sentence.pb.o +0 -0
  64. data/ext/cld3/{ext/src/sentence.proto → sentence.proto} +0 -0
  65. data/ext/cld3/{ext/src/sentence_features.cc → sentence_features.cc} +0 -0
  66. data/ext/cld3/{ext/src/sentence_features.h → sentence_features.h} +0 -0
  67. data/ext/cld3/sentence_features.o +0 -0
  68. data/ext/cld3/{ext/src/simple_adder.h → simple_adder.h} +0 -0
  69. data/ext/cld3/{ext/src/script_span/stringpiece.h → stringpiece.h} +0 -0
  70. data/ext/cld3/{ext/src/task_context.cc → task_context.cc} +0 -0
  71. data/ext/cld3/{ext/src/task_context.h → task_context.h} +0 -0
  72. data/ext/cld3/task_context.o +0 -0
  73. data/ext/cld3/{ext/src/task_context_params.cc → task_context_params.cc} +0 -0
  74. data/ext/cld3/{ext/src/task_context_params.h → task_context_params.h} +0 -0
  75. data/ext/cld3/task_context_params.o +0 -0
  76. data/ext/cld3/task_spec.pb.o +0 -0
  77. data/ext/cld3/{ext/src/task_spec.proto → task_spec.proto} +0 -0
  78. data/ext/cld3/{ext/src/script_span/text_processing.cc → text_processing.cc} +0 -0
  79. data/ext/cld3/{ext/src/script_span/text_processing.h → text_processing.h} +0 -0
  80. data/ext/cld3/text_processing.o +0 -0
  81. data/ext/cld3/{ext/src/unicodetext.cc → unicodetext.cc} +0 -0
  82. data/ext/cld3/{ext/src/unicodetext.h → unicodetext.h} +0 -0
  83. data/ext/cld3/unicodetext.o +0 -0
  84. data/ext/cld3/{ext/src/script_span/utf8acceptinterchange.h → utf8acceptinterchange.h} +0 -0
  85. data/ext/cld3/{ext/src/script_span/utf8prop_lettermarkscriptnum.h → utf8prop_lettermarkscriptnum.h} +0 -0
  86. data/ext/cld3/{ext/src/script_span/utf8repl_lettermarklower.h → utf8repl_lettermarklower.h} +0 -0
  87. data/ext/cld3/{ext/src/script_span/utf8scannot_lettermarkspecial.h → utf8scannot_lettermarkspecial.h} +0 -0
  88. data/ext/cld3/{ext/src/script_span/utf8statetable.cc → utf8statetable.cc} +0 -0
  89. data/ext/cld3/{ext/src/script_span/utf8statetable.h → utf8statetable.h} +0 -0
  90. data/ext/cld3/utf8statetable.o +0 -0
  91. data/ext/cld3/{ext/src/utils.cc → utils.cc} +0 -0
  92. data/ext/cld3/{ext/src/utils.h → utils.h} +0 -0
  93. data/ext/cld3/utils.o +0 -0
  94. data/ext/cld3/{ext/src/workspace.cc → workspace.cc} +0 -0
  95. data/ext/cld3/{ext/src/workspace.h → workspace.h} +0 -0
  96. data/ext/cld3/workspace.o +0 -0
  97. metadata +96 -81
  98. data/ext/cld3/ext/CMakeLists.txt +0 -69
  99. data/ext/cld3/ext/CONTRIBUTING.md +0 -26
  100. data/ext/cld3/ext/README.md +0 -73
  101. data/ext/cld3/ext/misc/myprotobuf.cmake +0 -58
  102. data/ext/cld3/ext/model.png +0 -0
  103. data/ext/cld3/ext/src/BUILD.gn +0 -133
  104. data/ext/cld3/ext/src/DEPS +0 -4
  105. data/ext/cld3/ext/src/language_identifier_features_test.cc +0 -261
  106. data/ext/cld3/ext/src/language_identifier_main.cc +0 -54
  107. data/ext/cld3/ext/src/nnet_lang_id_test.cc +0 -254
  108. data/ext/cld3/ext/src/nnet_lang_id_test_data.cc +0 -529
  109. data/ext/cld3/ext/src/nnet_lang_id_test_data.h +0 -117
  110. data/ext/cld3/ext/src/relevant_script_feature_test.cc +0 -259
  111. data/ext/cld3/ext/src/script_detector_test.cc +0 -161
  112. data/ext/cld3/ext/src/script_span/README.md +0 -11
  113. data/ext/cld3/ext/src/script_span/getonescriptspan_test.cc +0 -135
@@ -1,117 +0,0 @@
1
- /* Copyright 2016 Google Inc. All Rights Reserved.
2
-
3
- Licensed under the Apache License, Version 2.0 (the "License");
4
- you may not use this file except in compliance with the License.
5
- You may obtain a copy of the License at
6
-
7
- http://www.apache.org/licenses/LICENSE-2.0
8
-
9
- Unless required by applicable law or agreed to in writing, software
10
- distributed under the License is distributed on an "AS IS" BASIS,
11
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- See the License for the specific language governing permissions and
13
- limitations under the License.
14
- ==============================================================================*/
15
-
16
- #ifndef NNET_LANG_ID_TEST_DATA_H_
17
- #define NNET_LANG_ID_TEST_DATA_H_
18
-
19
- namespace chrome_lang_id {
20
-
21
- class NNetLangIdTestData {
22
- public:
23
- // Pieces of text in different languages.
24
- static const char *const kTestStrAF;
25
- static const char *const kTestStrAR;
26
- static const char *const kTestStrAZ;
27
- static const char *const kTestStrBE;
28
- static const char *const kTestStrBG;
29
- static const char *const kTestStrBN;
30
- static const char *const kTestStrBS;
31
- static const char *const kTestStrCA;
32
- static const char *const kTestStrCEB;
33
- static const char *const kTestStrCS;
34
- static const char *const kTestStrCY;
35
- static const char *const kTestStrDA;
36
- static const char *const kTestStrDE;
37
- static const char *const kTestStrEL;
38
- static const char *const kTestStrEN;
39
- static const char *const kTestStrEO;
40
- static const char *const kTestStrES;
41
- static const char *const kTestStrET;
42
- static const char *const kTestStrEU;
43
- static const char *const kTestStrFA;
44
- static const char *const kTestStrFI;
45
- static const char *const kTestStrFIL;
46
- static const char *const kTestStrFR;
47
- static const char *const kTestStrGA;
48
- static const char *const kTestStrGL;
49
- static const char *const kTestStrGU;
50
- static const char *const kTestStrHA;
51
- static const char *const kTestStrHI;
52
- static const char *const kTestStrHMN;
53
- static const char *const kTestStrHR;
54
- static const char *const kTestStrHT;
55
- static const char *const kTestStrHU;
56
- static const char *const kTestStrHY;
57
- static const char *const kTestStrID;
58
- static const char *const kTestStrIG;
59
- static const char *const kTestStrIS;
60
- static const char *const kTestStrIT;
61
- static const char *const kTestStrIW;
62
- static const char *const kTestStrJA;
63
- static const char *const kTestStrJV;
64
- static const char *const kTestStrKA;
65
- static const char *const kTestStrKK;
66
- static const char *const kTestStrKM;
67
- static const char *const kTestStrKN;
68
- static const char *const kTestStrKO;
69
- static const char *const kTestStrLA;
70
- static const char *const kTestStrLO;
71
- static const char *const kTestStrLT;
72
- static const char *const kTestStrLV;
73
- static const char *const kTestStrMG;
74
- static const char *const kTestStrMI;
75
- static const char *const kTestStrMK;
76
- static const char *const kTestStrML;
77
- static const char *const kTestStrMN;
78
- static const char *const kTestStrMR;
79
- static const char *const kTestStrMS;
80
- static const char *const kTestStrMT;
81
- static const char *const kTestStrMY;
82
- static const char *const kTestStrNE;
83
- static const char *const kTestStrNL;
84
- static const char *const kTestStrNO;
85
- static const char *const kTestStrNY;
86
- static const char *const kTestStrPA;
87
- static const char *const kTestStrPL;
88
- static const char *const kTestStrPT;
89
- static const char *const kTestStrRO;
90
- static const char *const kTestStrRU;
91
- static const char *const kTestStrSI;
92
- static const char *const kTestStrSK;
93
- static const char *const kTestStrSL;
94
- static const char *const kTestStrSO;
95
- static const char *const kTestStrSQ;
96
- static const char *const kTestStrSR;
97
- static const char *const kTestStrST;
98
- static const char *const kTestStrSU;
99
- static const char *const kTestStrSV;
100
- static const char *const kTestStrSW;
101
- static const char *const kTestStrTA;
102
- static const char *const kTestStrTE;
103
- static const char *const kTestStrTG;
104
- static const char *const kTestStrTH;
105
- static const char *const kTestStrTR;
106
- static const char *const kTestStrUK;
107
- static const char *const kTestStrUR;
108
- static const char *const kTestStrUZ;
109
- static const char *const kTestStrVI;
110
- static const char *const kTestStrYI;
111
- static const char *const kTestStrYO;
112
- static const char *const kTestStrZH;
113
- static const char *const kTestStrZU;
114
- };
115
- } // namespace chrome_lang_id
116
-
117
- #endif // NNET_LANG_ID_TEST_DATA_H_
@@ -1,259 +0,0 @@
1
- /* Copyright 2016 Google Inc. All Rights Reserved.
2
-
3
- Licensed under the Apache License, Version 2.0 (the "License");
4
- you may not use this file except in compliance with the License.
5
- You may obtain a copy of the License at
6
-
7
- http://www.apache.org/licenses/LICENSE-2.0
8
-
9
- Unless required by applicable law or agreed to in writing, software
10
- distributed under the License is distributed on an "AS IS" BASIS,
11
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- See the License for the specific language governing permissions and
13
- limitations under the License.
14
- ==============================================================================*/
15
-
16
- #include <algorithm>
17
- #include <cmath>
18
- #include <iostream>
19
- #include <memory>
20
-
21
- #include "feature_extractor.h"
22
- #include "feature_types.h"
23
- #include "relevant_script_feature.h"
24
- #include "script_detector.h"
25
- #include "cld_3/protos/sentence.pb.h"
26
- #include "sentence_features.h"
27
- #include "task_context.h"
28
- #include "utils.h"
29
- #include "workspace.h"
30
-
31
- namespace chrome_lang_id {
32
- namespace relevant_script_feature_test {
33
- namespace {
34
- // Checks whether the expected and actual float feature values are within 0.0001
35
- // of each other.
36
- bool FeatureValuesNear(float expected_value, float actual_value) {
37
- return std::abs(expected_value - actual_value) < 0.0001;
38
- }
39
-
40
- // Checks whether two sets of feature values are within an acceptable amount of
41
- // each other.
42
- bool FeaturesNear(const string &test_input,
43
- const std::map<int, float> &expected_features,
44
- const std::map<int, float> &actual_features) {
45
- if (expected_features.size() != actual_features.size()) {
46
- std::cout << " Failure for input: " << test_input << std::endl;
47
- return false;
48
- }
49
-
50
- for (const auto &id_and_value : expected_features) {
51
- const int id = id_and_value.first;
52
- if (actual_features.count(id) == 0 ||
53
- !FeatureValuesNear(expected_features.at(id), actual_features.at(id))) {
54
- std::cout << " Failure for input: " << test_input << std::endl;
55
- return false;
56
- }
57
- }
58
- std::cout << " Success for input: " << test_input << std::endl;
59
- return true;
60
- }
61
-
62
- // Checks whether the set of features is empty.
63
- bool CheckFeaturesEmpty(const string &input,
64
- const std::map<int, float> &actual_features) {
65
- if (!actual_features.empty()) {
66
- std::cout << " Failure for input: " << input << std::endl;
67
- return false;
68
- } else {
69
- std::cout << " Success for input: " << input << std::endl;
70
- return true;
71
- }
72
- }
73
- } // namespace
74
-
75
- static WholeSentenceFeature *rsf_factory() { return new RelevantScriptFeature; }
76
-
77
- class RelevantScriptFeatureExtractor {
78
- public:
79
- RelevantScriptFeatureExtractor() {
80
- if (WholeSentenceFeature::registry() == nullptr) {
81
- // Create registry for our WholeSentenceFeature(s).
82
- RegisterableClass<WholeSentenceFeature>::CreateRegistry(
83
- "sentence feature function", "WholeSentenceFeature", __FILE__,
84
- __LINE__);
85
- }
86
-
87
- // Register our WholeSentenceFeature(s).
88
- // Register RelevantScriptFeature feature function.
89
- static WholeSentenceFeature::Registry::Registrar rsf_registrar(
90
- WholeSentenceFeature::registry(), "continuous-bag-of-relevant-scripts",
91
- "RelevantScriptFeature", __FILE__, __LINE__, rsf_factory);
92
-
93
- feature_extractor_.Parse("continuous-bag-of-relevant-scripts");
94
- TaskContext context;
95
- feature_extractor_.Setup(&context);
96
- feature_extractor_.Init(&context);
97
- feature_extractor_.RequestWorkspaces(&workspace_registry_);
98
- }
99
-
100
- // Returns "true" if feature extraction is successful, and "false" otherwise.
101
- bool Extract(const string &text, std::map<int, float> *float_features) {
102
- float_features->clear();
103
- if (text.empty()) {
104
- return true;
105
- }
106
- Sentence sentence;
107
- sentence.set_text(text);
108
- workspace_.Reset(workspace_registry_);
109
- feature_extractor_.Preprocess(&workspace_, &sentence);
110
- FeatureVector feature_vector;
111
- feature_extractor_.ExtractFeatures(workspace_, sentence, &feature_vector);
112
-
113
- for (int index = 0; index < feature_vector.size(); ++index) {
114
- const FloatFeatureValue value =
115
- FloatFeatureValue(feature_vector.value(index));
116
- if (float_features->count(value.value.id) != 0) {
117
- std::cout << " Failure: duplicate feature" << std::endl;
118
- return false;
119
- }
120
- float_features->emplace(value.value.id, value.value.weight);
121
- }
122
- return true;
123
- }
124
-
125
- private:
126
- WorkspaceSet workspace_;
127
- WholeSentenceExtractor feature_extractor_;
128
-
129
- // The registry of shared workspaces in the feature extractor.
130
- WorkspaceRegistry workspace_registry_;
131
- };
132
-
133
- bool TestCommonCases() {
134
- std::cout << "Running " << __FUNCTION__ << std::endl;
135
-
136
- RelevantScriptFeatureExtractor extractor;
137
- std::map<int, float> float_features;
138
- bool test_successful = true;
139
-
140
- string input = "just some plain text";
141
- if (!extractor.Extract(input, &float_features) ||
142
- !FeaturesNear(input, {{chrome_lang_id::kScriptOtherUtf8OneByte, 1.00}},
143
- float_features)) {
144
- test_successful = false;
145
- }
146
-
147
- input = "ヸヂ゠ヂ";
148
- if (!extractor.Extract(input, &float_features) ||
149
- !FeaturesNear(input, {{chrome_lang_id::kScriptKatakana, 1.00}},
150
- float_features)) {
151
- test_successful = false;
152
- }
153
-
154
- // 4 Latin letters mixed with 4 Katakana letters.
155
- input = "ヸtヂe゠xtヂ";
156
- if (!extractor.Extract(input, &float_features) ||
157
- !FeaturesNear(input, {{chrome_lang_id::kScriptOtherUtf8OneByte, 0.5},
158
- {chrome_lang_id::kScriptKatakana, 0.5}},
159
- float_features)) {
160
- test_successful = false;
161
- }
162
-
163
- input = "just some 121212%^^( ヸヂ゠ヂ text";
164
- if (!extractor.Extract(input, &float_features) ||
165
- !FeaturesNear(input, {{chrome_lang_id::kScriptOtherUtf8OneByte, 0.75},
166
- {chrome_lang_id::kScriptKatakana, 0.25}},
167
- float_features)) {
168
- test_successful = false;
169
- }
170
-
171
- return test_successful;
172
- }
173
-
174
- bool TestCornerCases() {
175
- std::cout << "Running " << __FUNCTION__ << std::endl;
176
-
177
- RelevantScriptFeatureExtractor extractor;
178
- std::map<int, float> float_features;
179
- bool test_successful = true;
180
-
181
- // Empty string.
182
- string input = "";
183
- if (!extractor.Extract(input, &float_features) ||
184
- !CheckFeaturesEmpty(input, float_features)) {
185
- test_successful = false;
186
- }
187
-
188
- // Only whitespaces.
189
- input = " ";
190
- if (!extractor.Extract(input, &float_features) ||
191
- !CheckFeaturesEmpty(input, float_features)) {
192
- test_successful = false;
193
- }
194
-
195
- // Only numbers and punctuation.
196
- input = "12----)(";
197
- if (!extractor.Extract(input, &float_features) ||
198
- !CheckFeaturesEmpty(input, float_features)) {
199
- test_successful = false;
200
- }
201
-
202
- // Only numbers, punctuation, and spaces.
203
- input = "12--- - ) ( ";
204
- if (!extractor.Extract(input, &float_features) ||
205
- !CheckFeaturesEmpty(input, float_features)) {
206
- test_successful = false;
207
- }
208
-
209
- // One UTF8 character by itself.
210
- input = "ゟ";
211
- if (!extractor.Extract(input, &float_features) ||
212
- !FeaturesNear(input, {{chrome_lang_id::kScriptHiragana, 1.00}},
213
- float_features)) {
214
- test_successful = false;
215
- }
216
-
217
- input = "ה";
218
- if (!extractor.Extract(input, &float_features) ||
219
- !FeaturesNear(input, {{chrome_lang_id::kScriptHebrew, 1.00}},
220
- float_features)) {
221
- test_successful = false;
222
- }
223
-
224
- // One UTF8 character with some numbers / punctuation / spaces: character at
225
- // one extremity or in the middle.
226
- input = "1234ゟ";
227
- if (!extractor.Extract(input, &float_features) ||
228
- !FeaturesNear(input, {{chrome_lang_id::kScriptHiragana, 1.00}},
229
- float_features)) {
230
- test_successful = false;
231
- }
232
-
233
- input = "ゟ12-(";
234
- if (!extractor.Extract(input, &float_features) ||
235
- !FeaturesNear(input, {{chrome_lang_id::kScriptHiragana, 1.00}},
236
- float_features)) {
237
- test_successful = false;
238
- }
239
-
240
- input = "8*1ゟ12----";
241
- if (!extractor.Extract(input, &float_features) ||
242
- !FeaturesNear(input, {{chrome_lang_id::kScriptHiragana, 1.00}},
243
- float_features)) {
244
- test_successful = false;
245
- }
246
-
247
- return test_successful;
248
- }
249
-
250
- } // namespace relevant_script_feature_test
251
- } // namespace chrome_lang_id
252
-
253
- // Runs the feature extraction tests.
254
- int main(int argc, char **argv) {
255
- const bool tests_successful =
256
- chrome_lang_id::relevant_script_feature_test::TestCommonCases() &&
257
- chrome_lang_id::relevant_script_feature_test::TestCornerCases();
258
- return tests_successful ? 0 : 1;
259
- }
@@ -1,161 +0,0 @@
1
- /* Copyright 2016 Google Inc. All Rights Reserved.
2
-
3
- Licensed under the Apache License, Version 2.0 (the "License");
4
- you may not use this file except in compliance with the License.
5
- You may obtain a copy of the License at
6
-
7
- http://www.apache.org/licenses/LICENSE-2.0
8
-
9
- Unless required by applicable law or agreed to in writing, software
10
- distributed under the License is distributed on an "AS IS" BASIS,
11
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- See the License for the specific language governing permissions and
13
- limitations under the License.
14
- ==============================================================================*/
15
-
16
- #include "script_detector.h"
17
-
18
- #include <iostream>
19
-
20
- #include "utils.h"
21
-
22
- namespace chrome_lang_id {
23
- namespace script_detector_test {
24
-
25
- Script GetScript(const char *p) {
26
- const int num_bytes = utils::OneCharLen(p);
27
- return chrome_lang_id::GetScript(p, num_bytes);
28
- }
29
-
30
- bool PrintAndReturnStatus(bool status) {
31
- if (status) {
32
- std::cout << " Success" << std::endl;
33
- return true;
34
- } else {
35
- std::cout << " Failure" << std::endl;
36
- return false;
37
- }
38
- }
39
-
40
- bool TestGreekScript() {
41
- std::cout << "Running " << __FUNCTION__ << std::endl;
42
-
43
- // The first two conditions check first / last character from the Greek and
44
- // Coptic script. The last two ones are negative tests.
45
- return PrintAndReturnStatus(
46
- kScriptGreek == GetScript("Ͱ") && kScriptGreek == GetScript("Ͽ") &&
47
- kScriptGreek == GetScript("δ") && kScriptGreek == GetScript("Θ") &&
48
- kScriptGreek == GetScript("Δ") && kScriptGreek != GetScript("a") &&
49
- kScriptGreek != GetScript("0"));
50
- }
51
-
52
- bool TestCyrillicScript() {
53
- std::cout << "Running " << __FUNCTION__ << std::endl;
54
- return PrintAndReturnStatus(
55
- kScriptCyrillic == GetScript("Ѐ") && kScriptCyrillic == GetScript("ӿ") &&
56
- kScriptCyrillic == GetScript("ш") && kScriptCyrillic == GetScript("Б") &&
57
- kScriptCyrillic == GetScript("Ӱ"));
58
- }
59
-
60
- bool TestHebrewScript() {
61
- std::cout << "Running " << __FUNCTION__ << std::endl;
62
- return PrintAndReturnStatus(
63
- kScriptHebrew == GetScript("֑") && kScriptHebrew == GetScript("״") &&
64
- kScriptHebrew == GetScript("ד") && kScriptHebrew == GetScript("ה") &&
65
- kScriptHebrew == GetScript("צ"));
66
- }
67
-
68
- bool TestArabicScript() {
69
- std::cout << "Running " << __FUNCTION__ << std::endl;
70
- return PrintAndReturnStatus(kScriptArabic == GetScript("م") &&
71
- kScriptArabic == GetScript("خ"));
72
- }
73
-
74
- bool TestHangulJamoScript() {
75
- std::cout << "Running " << __FUNCTION__ << std::endl;
76
- return PrintAndReturnStatus(kScriptHangulJamo == GetScript("ᄀ") &&
77
- kScriptHangulJamo == GetScript("ᇿ") &&
78
- kScriptHangulJamo == GetScript("ᄡ") &&
79
- kScriptHangulJamo == GetScript("ᆅ") &&
80
- kScriptHangulJamo == GetScript("ᅘ"));
81
- }
82
-
83
- bool TestHiraganaScript() {
84
- std::cout << "Running " << __FUNCTION__ << std::endl;
85
- return PrintAndReturnStatus(kScriptHiragana == GetScript("ぁ") &&
86
- kScriptHiragana == GetScript("ゟ") &&
87
- kScriptHiragana == GetScript("こ") &&
88
- kScriptHiragana == GetScript("や") &&
89
- kScriptHiragana == GetScript("ぜ"));
90
- }
91
-
92
- bool TestKatakanaScript() {
93
- std::cout << "Running " << __FUNCTION__ << std::endl;
94
- return PrintAndReturnStatus(kScriptKatakana == GetScript("゠") &&
95
- kScriptKatakana == GetScript("ヿ") &&
96
- kScriptKatakana == GetScript("ヂ") &&
97
- kScriptKatakana == GetScript("ザ") &&
98
- kScriptKatakana == GetScript("ヸ"));
99
- }
100
-
101
- bool TestOtherScripts() {
102
- std::cout << "Running " << __FUNCTION__ << std::endl;
103
- bool test_successful = true;
104
-
105
- if (kScriptOtherUtf8OneByte != GetScript("^") ||
106
- kScriptOtherUtf8OneByte != GetScript("$")) {
107
- test_successful = false;
108
- }
109
-
110
- // Unrecognized 2-byte scripts. For info on the scripts mentioned below, see
111
- // http://www.unicode.org/charts/#scripts Note: the scripts below are uniquely
112
- // associated with a language. Still, the number of queries in those
113
- // languages is small and we didn't want to increase the code size and
114
- // latency, so (at least for now) we do not treat them specially.
115
- // The following three tests are, respectively, for Armenian, Syriac and
116
- // Thaana.
117
- if (kScriptOtherUtf8TwoBytes != GetScript("Ձ") ||
118
- kScriptOtherUtf8TwoBytes != GetScript("ܔ") ||
119
- kScriptOtherUtf8TwoBytes != GetScript("ށ")) {
120
- test_successful = false;
121
- }
122
-
123
- // Unrecognized 3-byte script: CJK Unified Ideographs: not uniquely associated
124
- // with a language.
125
- if (kScriptOtherUtf8ThreeBytes != GetScript("万") ||
126
- kScriptOtherUtf8ThreeBytes != GetScript("両")) {
127
- test_successful = false;
128
- }
129
-
130
- // Unrecognized 4-byte script: CJK Unified Ideographs Extension C. Note:
131
- // there is a nice UTF-8 encoder / decoder at https://mothereff.in/utf-8
132
- if (kScriptOtherUtf8FourBytes != GetScript("\xF0\xAA\x9C\x94")) {
133
- test_successful = false;
134
- }
135
-
136
- // Unrecognized 4-byte script: CJK Unified Ideographs Extension E
137
- if (kScriptOtherUtf8FourBytes != GetScript("\xF0\xAB\xA0\xB5") ||
138
- kScriptOtherUtf8FourBytes != GetScript("\xF0\xAC\xBA\xA1")) {
139
- test_successful = false;
140
- }
141
-
142
- return PrintAndReturnStatus(test_successful);
143
- }
144
-
145
- } // namespace script_detector_test
146
- } // namespace chrome_lang_id
147
-
148
- // Runs the feature extraction tests.
149
- int main(int argc, char **argv) {
150
- const bool tests_successful =
151
- chrome_lang_id::script_detector_test::TestGreekScript() &&
152
- chrome_lang_id::script_detector_test::TestCyrillicScript() &&
153
- chrome_lang_id::script_detector_test::TestHebrewScript() &&
154
- chrome_lang_id::script_detector_test::TestArabicScript() &&
155
- chrome_lang_id::script_detector_test::TestHangulJamoScript() &&
156
- chrome_lang_id::script_detector_test::TestHiraganaScript() &&
157
- chrome_lang_id::script_detector_test::TestKatakanaScript() &&
158
- chrome_lang_id::script_detector_test::TestOtherScripts();
159
-
160
- return tests_successful ? 0 : 1;
161
- }