cld3 3.2.4 → 3.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82)
  1. checksums.yaml +4 -4
  2. data/cld3.gemspec +4 -4
  3. data/ext/cld3/ext/CMakeLists.txt +69 -0
  4. data/ext/cld3/ext/CONTRIBUTING.md +26 -0
  5. data/{LICENSE_CLD3 → ext/cld3/ext/LICENSE} +0 -0
  6. data/ext/cld3/ext/README.md +73 -0
  7. data/ext/cld3/ext/misc/myprotobuf.cmake +58 -0
  8. data/ext/cld3/ext/model.png +0 -0
  9. data/ext/cld3/ext/src/BUILD.gn +133 -0
  10. data/ext/cld3/ext/src/DEPS +4 -0
  11. data/ext/cld3/{base.cc → ext/src/base.cc} +0 -0
  12. data/ext/cld3/{base.h → ext/src/base.h} +0 -0
  13. data/ext/cld3/{casts.h → ext/src/casts.h} +0 -0
  14. data/ext/cld3/{embedding_feature_extractor.cc → ext/src/embedding_feature_extractor.cc} +0 -0
  15. data/ext/cld3/{embedding_feature_extractor.h → ext/src/embedding_feature_extractor.h} +0 -0
  16. data/ext/cld3/{embedding_network.cc → ext/src/embedding_network.cc} +0 -0
  17. data/ext/cld3/{embedding_network.h → ext/src/embedding_network.h} +0 -0
  18. data/ext/cld3/{embedding_network_params.h → ext/src/embedding_network_params.h} +0 -0
  19. data/ext/cld3/{feature_extractor.cc → ext/src/feature_extractor.cc} +0 -0
  20. data/ext/cld3/{feature_extractor.h → ext/src/feature_extractor.h} +0 -0
  21. data/ext/cld3/{feature_extractor.proto → ext/src/feature_extractor.proto} +0 -0
  22. data/ext/cld3/{feature_types.cc → ext/src/feature_types.cc} +0 -0
  23. data/ext/cld3/{feature_types.h → ext/src/feature_types.h} +0 -0
  24. data/ext/cld3/{float16.h → ext/src/float16.h} +0 -0
  25. data/ext/cld3/{fml_parser.cc → ext/src/fml_parser.cc} +0 -0
  26. data/ext/cld3/{fml_parser.h → ext/src/fml_parser.h} +0 -0
  27. data/ext/cld3/{lang_id_nn_params.cc → ext/src/lang_id_nn_params.cc} +0 -0
  28. data/ext/cld3/{lang_id_nn_params.h → ext/src/lang_id_nn_params.h} +0 -0
  29. data/ext/cld3/{language_identifier_features.cc → ext/src/language_identifier_features.cc} +0 -0
  30. data/ext/cld3/{language_identifier_features.h → ext/src/language_identifier_features.h} +0 -0
  31. data/ext/cld3/ext/src/language_identifier_features_test.cc +261 -0
  32. data/ext/cld3/ext/src/language_identifier_main.cc +54 -0
  33. data/ext/cld3/ext/src/nnet_lang_id_test.cc +254 -0
  34. data/ext/cld3/ext/src/nnet_lang_id_test_data.cc +529 -0
  35. data/ext/cld3/ext/src/nnet_lang_id_test_data.h +117 -0
  36. data/ext/cld3/{nnet_language_identifier.cc → ext/src/nnet_language_identifier.cc} +8 -0
  37. data/ext/cld3/{nnet_language_identifier.h → ext/src/nnet_language_identifier.h} +16 -0
  38. data/ext/cld3/{registry.cc → ext/src/registry.cc} +0 -0
  39. data/ext/cld3/{registry.h → ext/src/registry.h} +0 -0
  40. data/ext/cld3/{relevant_script_feature.cc → ext/src/relevant_script_feature.cc} +0 -0
  41. data/ext/cld3/{relevant_script_feature.h → ext/src/relevant_script_feature.h} +0 -0
  42. data/ext/cld3/ext/src/relevant_script_feature_test.cc +259 -0
  43. data/ext/cld3/{script_detector.h → ext/src/script_detector.h} +0 -0
  44. data/ext/cld3/ext/src/script_detector_test.cc +161 -0
  45. data/ext/cld3/ext/src/script_span/README.md +11 -0
  46. data/ext/cld3/{fixunicodevalue.cc → ext/src/script_span/fixunicodevalue.cc} +0 -0
  47. data/ext/cld3/{fixunicodevalue.h → ext/src/script_span/fixunicodevalue.h} +0 -0
  48. data/ext/cld3/{generated_entities.cc → ext/src/script_span/generated_entities.cc} +0 -0
  49. data/ext/cld3/{generated_ulscript.cc → ext/src/script_span/generated_ulscript.cc} +0 -0
  50. data/ext/cld3/{generated_ulscript.h → ext/src/script_span/generated_ulscript.h} +0 -0
  51. data/ext/cld3/{getonescriptspan.cc → ext/src/script_span/getonescriptspan.cc} +0 -0
  52. data/ext/cld3/{getonescriptspan.h → ext/src/script_span/getonescriptspan.h} +1 -1
  53. data/ext/cld3/ext/src/script_span/getonescriptspan_test.cc +135 -0
  54. data/ext/cld3/{integral_types.h → ext/src/script_span/integral_types.h} +0 -0
  55. data/ext/cld3/{offsetmap.cc → ext/src/script_span/offsetmap.cc} +0 -0
  56. data/ext/cld3/{offsetmap.h → ext/src/script_span/offsetmap.h} +0 -0
  57. data/ext/cld3/{port.h → ext/src/script_span/port.h} +0 -0
  58. data/ext/cld3/{stringpiece.h → ext/src/script_span/stringpiece.h} +0 -0
  59. data/ext/cld3/{text_processing.cc → ext/src/script_span/text_processing.cc} +0 -0
  60. data/ext/cld3/{text_processing.h → ext/src/script_span/text_processing.h} +0 -0
  61. data/ext/cld3/{utf8acceptinterchange.h → ext/src/script_span/utf8acceptinterchange.h} +0 -0
  62. data/ext/cld3/{utf8prop_lettermarkscriptnum.h → ext/src/script_span/utf8prop_lettermarkscriptnum.h} +0 -0
  63. data/ext/cld3/{utf8repl_lettermarklower.h → ext/src/script_span/utf8repl_lettermarklower.h} +0 -0
  64. data/ext/cld3/{utf8scannot_lettermarkspecial.h → ext/src/script_span/utf8scannot_lettermarkspecial.h} +0 -0
  65. data/ext/cld3/{utf8statetable.cc → ext/src/script_span/utf8statetable.cc} +0 -0
  66. data/ext/cld3/{utf8statetable.h → ext/src/script_span/utf8statetable.h} +0 -0
  67. data/ext/cld3/{sentence.proto → ext/src/sentence.proto} +0 -0
  68. data/ext/cld3/{sentence_features.cc → ext/src/sentence_features.cc} +0 -0
  69. data/ext/cld3/{sentence_features.h → ext/src/sentence_features.h} +0 -0
  70. data/ext/cld3/{simple_adder.h → ext/src/simple_adder.h} +0 -0
  71. data/ext/cld3/{task_context.cc → ext/src/task_context.cc} +0 -0
  72. data/ext/cld3/{task_context.h → ext/src/task_context.h} +0 -0
  73. data/ext/cld3/{task_context_params.cc → ext/src/task_context_params.cc} +0 -0
  74. data/ext/cld3/{task_context_params.h → ext/src/task_context_params.h} +0 -0
  75. data/ext/cld3/{task_spec.proto → ext/src/task_spec.proto} +0 -0
  76. data/ext/cld3/{unicodetext.cc → ext/src/unicodetext.cc} +0 -0
  77. data/ext/cld3/{unicodetext.h → ext/src/unicodetext.h} +0 -0
  78. data/ext/cld3/{utils.cc → ext/src/utils.cc} +0 -0
  79. data/ext/cld3/{utils.h → ext/src/utils.h} +0 -0
  80. data/ext/cld3/{workspace.cc → ext/src/workspace.cc} +0 -0
  81. data/ext/cld3/{workspace.h → ext/src/workspace.h} +0 -0
  82. metadata +87 -71
@@ -0,0 +1,117 @@
1
+ /* Copyright 2016 Google Inc. All Rights Reserved.
2
+
3
+ Licensed under the Apache License, Version 2.0 (the "License");
4
+ you may not use this file except in compliance with the License.
5
+ You may obtain a copy of the License at
6
+
7
+ http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+ Unless required by applicable law or agreed to in writing, software
10
+ distributed under the License is distributed on an "AS IS" BASIS,
11
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ See the License for the specific language governing permissions and
13
+ limitations under the License.
14
+ ==============================================================================*/
15
+
16
+ #ifndef NNET_LANG_ID_TEST_DATA_H_
17
+ #define NNET_LANG_ID_TEST_DATA_H_
18
+
19
+ namespace chrome_lang_id {
20
+
21
+ class NNetLangIdTestData {
22
+ public:
23
+ // Pieces of text in different languages.
24
+ static const char *const kTestStrAF;
25
+ static const char *const kTestStrAR;
26
+ static const char *const kTestStrAZ;
27
+ static const char *const kTestStrBE;
28
+ static const char *const kTestStrBG;
29
+ static const char *const kTestStrBN;
30
+ static const char *const kTestStrBS;
31
+ static const char *const kTestStrCA;
32
+ static const char *const kTestStrCEB;
33
+ static const char *const kTestStrCS;
34
+ static const char *const kTestStrCY;
35
+ static const char *const kTestStrDA;
36
+ static const char *const kTestStrDE;
37
+ static const char *const kTestStrEL;
38
+ static const char *const kTestStrEN;
39
+ static const char *const kTestStrEO;
40
+ static const char *const kTestStrES;
41
+ static const char *const kTestStrET;
42
+ static const char *const kTestStrEU;
43
+ static const char *const kTestStrFA;
44
+ static const char *const kTestStrFI;
45
+ static const char *const kTestStrFIL;
46
+ static const char *const kTestStrFR;
47
+ static const char *const kTestStrGA;
48
+ static const char *const kTestStrGL;
49
+ static const char *const kTestStrGU;
50
+ static const char *const kTestStrHA;
51
+ static const char *const kTestStrHI;
52
+ static const char *const kTestStrHMN;
53
+ static const char *const kTestStrHR;
54
+ static const char *const kTestStrHT;
55
+ static const char *const kTestStrHU;
56
+ static const char *const kTestStrHY;
57
+ static const char *const kTestStrID;
58
+ static const char *const kTestStrIG;
59
+ static const char *const kTestStrIS;
60
+ static const char *const kTestStrIT;
61
+ static const char *const kTestStrIW;
62
+ static const char *const kTestStrJA;
63
+ static const char *const kTestStrJV;
64
+ static const char *const kTestStrKA;
65
+ static const char *const kTestStrKK;
66
+ static const char *const kTestStrKM;
67
+ static const char *const kTestStrKN;
68
+ static const char *const kTestStrKO;
69
+ static const char *const kTestStrLA;
70
+ static const char *const kTestStrLO;
71
+ static const char *const kTestStrLT;
72
+ static const char *const kTestStrLV;
73
+ static const char *const kTestStrMG;
74
+ static const char *const kTestStrMI;
75
+ static const char *const kTestStrMK;
76
+ static const char *const kTestStrML;
77
+ static const char *const kTestStrMN;
78
+ static const char *const kTestStrMR;
79
+ static const char *const kTestStrMS;
80
+ static const char *const kTestStrMT;
81
+ static const char *const kTestStrMY;
82
+ static const char *const kTestStrNE;
83
+ static const char *const kTestStrNL;
84
+ static const char *const kTestStrNO;
85
+ static const char *const kTestStrNY;
86
+ static const char *const kTestStrPA;
87
+ static const char *const kTestStrPL;
88
+ static const char *const kTestStrPT;
89
+ static const char *const kTestStrRO;
90
+ static const char *const kTestStrRU;
91
+ static const char *const kTestStrSI;
92
+ static const char *const kTestStrSK;
93
+ static const char *const kTestStrSL;
94
+ static const char *const kTestStrSO;
95
+ static const char *const kTestStrSQ;
96
+ static const char *const kTestStrSR;
97
+ static const char *const kTestStrST;
98
+ static const char *const kTestStrSU;
99
+ static const char *const kTestStrSV;
100
+ static const char *const kTestStrSW;
101
+ static const char *const kTestStrTA;
102
+ static const char *const kTestStrTE;
103
+ static const char *const kTestStrTG;
104
+ static const char *const kTestStrTH;
105
+ static const char *const kTestStrTR;
106
+ static const char *const kTestStrUK;
107
+ static const char *const kTestStrUR;
108
+ static const char *const kTestStrUZ;
109
+ static const char *const kTestStrVI;
110
+ static const char *const kTestStrYI;
111
+ static const char *const kTestStrYO;
112
+ static const char *const kTestStrZH;
113
+ static const char *const kTestStrZU;
114
+ };
115
+ } // namespace chrome_lang_id
116
+
117
+ #endif // NNET_LANG_ID_TEST_DATA_H_
@@ -47,6 +47,9 @@ struct LangChunksStats {
47
47
 
48
48
  // Number of chunks corresponding to the language.
49
49
  int num_chunks = 0;
50
+
51
+ // Specifies the byte ranges that the language applies to.
52
+ std::vector<NNetLanguageIdentifier::SpanInfo> byte_ranges;
50
53
  };
51
54
 
52
55
  // Compares two pairs based on their values.
@@ -298,12 +301,16 @@ NNetLanguageIdentifier::FindTopNMostFreqLangs(const string &text,
298
301
  total_num_bytes += num_original_span_bytes;
299
302
 
300
303
  const string selected_text = SelectTextGivenScriptSpan(script_span);
304
+
301
305
  result = FindLanguageOfValidUTF8(selected_text);
302
306
  language = result.language;
303
307
  lang_stats[language].byte_sum += num_original_span_bytes;
304
308
  lang_stats[language].prob_sum +=
305
309
  result.probability * num_original_span_bytes;
306
310
  lang_stats[language].num_chunks++;
311
+ // Add SpanInfo. Start and end indices are relative to original input.
312
+ lang_stats[language].byte_ranges.push_back(SpanInfo(
313
+ ss.MapBack(0), ss.MapBack(script_span.text_bytes), result.probability));
307
314
  }
308
315
 
309
316
  // Sort the languages based on the number of bytes associated with them.
@@ -329,6 +336,7 @@ NNetLanguageIdentifier::FindTopNMostFreqLangs(const string &text,
329
336
  result.probability = stats.prob_sum / stats.byte_sum;
330
337
  result.proportion = stats.byte_sum / byte_sum;
331
338
  result.is_reliable = ResultIsReliable(language, result.probability);
339
+ result.byte_ranges = stats.byte_ranges;
332
340
  results.push_back(result);
333
341
  }
334
342
 
@@ -44,6 +44,19 @@ class LanguageIdEmbeddingFeatureExtractor
44
44
  // Class for detecting the language of a document.
45
45
  class NNetLanguageIdentifier {
46
46
  public:
47
+ // Holds probability that Span, specified by start/end indices, is a given
48
+ // language. The language is not stored here; it can be found in Result, which
49
+ // holds a vector of SpanInfo.
50
+ struct SpanInfo {
51
+ SpanInfo(int start_index_val, int end_index_val, float probability_val)
52
+ : start_index(start_index_val),
53
+ end_index(end_index_val),
54
+ probability(probability_val) {}
55
+ int start_index = -1;
56
+ int end_index = -1;
57
+ float probability = 0.0;
58
+ };
59
+
47
60
  // Information about a predicted language.
48
61
  struct Result {
49
62
  string language = kUnknown;
@@ -53,6 +66,9 @@ class NNetLanguageIdentifier {
53
66
  // Proportion of bytes associated with the language. If FindLanguage is
54
67
  // called, this variable is set to 1.
55
68
  float proportion = 0.0;
69
+
70
+ // Specifies the byte ranges that |language| applies to.
71
+ std::vector<SpanInfo> byte_ranges;
56
72
  };
57
73
 
58
74
  NNetLanguageIdentifier();
@@ -0,0 +1,259 @@
1
+ /* Copyright 2016 Google Inc. All Rights Reserved.
2
+
3
+ Licensed under the Apache License, Version 2.0 (the "License");
4
+ you may not use this file except in compliance with the License.
5
+ You may obtain a copy of the License at
6
+
7
+ http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+ Unless required by applicable law or agreed to in writing, software
10
+ distributed under the License is distributed on an "AS IS" BASIS,
11
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ See the License for the specific language governing permissions and
13
+ limitations under the License.
14
+ ==============================================================================*/
15
+
16
+ #include <algorithm>
17
+ #include <cmath>
18
+ #include <iostream>
19
+ #include <memory>
20
+
21
+ #include "feature_extractor.h"
22
+ #include "feature_types.h"
23
+ #include "relevant_script_feature.h"
24
+ #include "script_detector.h"
25
+ #include "cld_3/protos/sentence.pb.h"
26
+ #include "sentence_features.h"
27
+ #include "task_context.h"
28
+ #include "utils.h"
29
+ #include "workspace.h"
30
+
31
+ namespace chrome_lang_id {
32
+ namespace relevant_script_feature_test {
33
+ namespace {
34
+ // Checks whether the expected and actual float feature values are within 0.0001
35
+ // of each other.
36
+ bool FeatureValuesNear(float expected_value, float actual_value) {
37
+ return std::abs(expected_value - actual_value) < 0.0001;
38
+ }
39
+
40
+ // Checks whether two sets of feature values are within an acceptable amount of
41
+ // each other.
42
+ bool FeaturesNear(const string &test_input,
43
+ const std::map<int, float> &expected_features,
44
+ const std::map<int, float> &actual_features) {
45
+ if (expected_features.size() != actual_features.size()) {
46
+ std::cout << " Failure for input: " << test_input << std::endl;
47
+ return false;
48
+ }
49
+
50
+ for (const auto &id_and_value : expected_features) {
51
+ const int id = id_and_value.first;
52
+ if (actual_features.count(id) == 0 ||
53
+ !FeatureValuesNear(expected_features.at(id), actual_features.at(id))) {
54
+ std::cout << " Failure for input: " << test_input << std::endl;
55
+ return false;
56
+ }
57
+ }
58
+ std::cout << " Success for input: " << test_input << std::endl;
59
+ return true;
60
+ }
61
+
62
+ // Checks whether the set of features is empty.
63
+ bool CheckFeaturesEmpty(const string &input,
64
+ const std::map<int, float> &actual_features) {
65
+ if (!actual_features.empty()) {
66
+ std::cout << " Failure for input: " << input << std::endl;
67
+ return false;
68
+ } else {
69
+ std::cout << " Success for input: " << input << std::endl;
70
+ return true;
71
+ }
72
+ }
73
+ } // namespace
74
+
75
+ static WholeSentenceFeature *rsf_factory() { return new RelevantScriptFeature; }
76
+
77
+ class RelevantScriptFeatureExtractor {
78
+ public:
79
+ RelevantScriptFeatureExtractor() {
80
+ if (WholeSentenceFeature::registry() == nullptr) {
81
+ // Create registry for our WholeSentenceFeature(s).
82
+ RegisterableClass<WholeSentenceFeature>::CreateRegistry(
83
+ "sentence feature function", "WholeSentenceFeature", __FILE__,
84
+ __LINE__);
85
+ }
86
+
87
+ // Register our WholeSentenceFeature(s).
88
+ // Register RelevantScriptFeature feature function.
89
+ static WholeSentenceFeature::Registry::Registrar rsf_registrar(
90
+ WholeSentenceFeature::registry(), "continuous-bag-of-relevant-scripts",
91
+ "RelevantScriptFeature", __FILE__, __LINE__, rsf_factory);
92
+
93
+ feature_extractor_.Parse("continuous-bag-of-relevant-scripts");
94
+ TaskContext context;
95
+ feature_extractor_.Setup(&context);
96
+ feature_extractor_.Init(&context);
97
+ feature_extractor_.RequestWorkspaces(&workspace_registry_);
98
+ }
99
+
100
+ // Returns "true" if feature extraction is successful, and "false" otherwise.
101
+ bool Extract(const string &text, std::map<int, float> *float_features) {
102
+ float_features->clear();
103
+ if (text.empty()) {
104
+ return true;
105
+ }
106
+ Sentence sentence;
107
+ sentence.set_text(text);
108
+ workspace_.Reset(workspace_registry_);
109
+ feature_extractor_.Preprocess(&workspace_, &sentence);
110
+ FeatureVector feature_vector;
111
+ feature_extractor_.ExtractFeatures(workspace_, sentence, &feature_vector);
112
+
113
+ for (int index = 0; index < feature_vector.size(); ++index) {
114
+ const FloatFeatureValue value =
115
+ FloatFeatureValue(feature_vector.value(index));
116
+ if (float_features->count(value.value.id) != 0) {
117
+ std::cout << " Failure: duplicate feature" << std::endl;
118
+ return false;
119
+ }
120
+ float_features->emplace(value.value.id, value.value.weight);
121
+ }
122
+ return true;
123
+ }
124
+
125
+ private:
126
+ WorkspaceSet workspace_;
127
+ WholeSentenceExtractor feature_extractor_;
128
+
129
+ // The registry of shared workspaces in the feature extractor.
130
+ WorkspaceRegistry workspace_registry_;
131
+ };
132
+
133
+ bool TestCommonCases() {
134
+ std::cout << "Running " << __FUNCTION__ << std::endl;
135
+
136
+ RelevantScriptFeatureExtractor extractor;
137
+ std::map<int, float> float_features;
138
+ bool test_successful = true;
139
+
140
+ string input = "just some plain text";
141
+ if (!extractor.Extract(input, &float_features) ||
142
+ !FeaturesNear(input, {{chrome_lang_id::kScriptOtherUtf8OneByte, 1.00}},
143
+ float_features)) {
144
+ test_successful = false;
145
+ }
146
+
147
+ input = "ヸヂ゠ヂ";
148
+ if (!extractor.Extract(input, &float_features) ||
149
+ !FeaturesNear(input, {{chrome_lang_id::kScriptKatakana, 1.00}},
150
+ float_features)) {
151
+ test_successful = false;
152
+ }
153
+
154
+ // 4 Latin letters mixed with 4 Katakana letters.
155
+ input = "ヸtヂe゠xtヂ";
156
+ if (!extractor.Extract(input, &float_features) ||
157
+ !FeaturesNear(input, {{chrome_lang_id::kScriptOtherUtf8OneByte, 0.5},
158
+ {chrome_lang_id::kScriptKatakana, 0.5}},
159
+ float_features)) {
160
+ test_successful = false;
161
+ }
162
+
163
+ input = "just some 121212%^^( ヸヂ゠ヂ text";
164
+ if (!extractor.Extract(input, &float_features) ||
165
+ !FeaturesNear(input, {{chrome_lang_id::kScriptOtherUtf8OneByte, 0.75},
166
+ {chrome_lang_id::kScriptKatakana, 0.25}},
167
+ float_features)) {
168
+ test_successful = false;
169
+ }
170
+
171
+ return test_successful;
172
+ }
173
+
174
+ bool TestCornerCases() {
175
+ std::cout << "Running " << __FUNCTION__ << std::endl;
176
+
177
+ RelevantScriptFeatureExtractor extractor;
178
+ std::map<int, float> float_features;
179
+ bool test_successful = true;
180
+
181
+ // Empty string.
182
+ string input = "";
183
+ if (!extractor.Extract(input, &float_features) ||
184
+ !CheckFeaturesEmpty(input, float_features)) {
185
+ test_successful = false;
186
+ }
187
+
188
+ // Only whitespaces.
189
+ input = " ";
190
+ if (!extractor.Extract(input, &float_features) ||
191
+ !CheckFeaturesEmpty(input, float_features)) {
192
+ test_successful = false;
193
+ }
194
+
195
+ // Only numbers and punctuation.
196
+ input = "12----)(";
197
+ if (!extractor.Extract(input, &float_features) ||
198
+ !CheckFeaturesEmpty(input, float_features)) {
199
+ test_successful = false;
200
+ }
201
+
202
+ // Only numbers, punctuation, and spaces.
203
+ input = "12--- - ) ( ";
204
+ if (!extractor.Extract(input, &float_features) ||
205
+ !CheckFeaturesEmpty(input, float_features)) {
206
+ test_successful = false;
207
+ }
208
+
209
+ // One UTF8 character by itself.
210
+ input = "ゟ";
211
+ if (!extractor.Extract(input, &float_features) ||
212
+ !FeaturesNear(input, {{chrome_lang_id::kScriptHiragana, 1.00}},
213
+ float_features)) {
214
+ test_successful = false;
215
+ }
216
+
217
+ input = "ה";
218
+ if (!extractor.Extract(input, &float_features) ||
219
+ !FeaturesNear(input, {{chrome_lang_id::kScriptHebrew, 1.00}},
220
+ float_features)) {
221
+ test_successful = false;
222
+ }
223
+
224
+ // One UTF8 character with some numbers / punctuation / spaces: character at
225
+ // one extremity or in the middle.
226
+ input = "1234ゟ";
227
+ if (!extractor.Extract(input, &float_features) ||
228
+ !FeaturesNear(input, {{chrome_lang_id::kScriptHiragana, 1.00}},
229
+ float_features)) {
230
+ test_successful = false;
231
+ }
232
+
233
+ input = "ゟ12-(";
234
+ if (!extractor.Extract(input, &float_features) ||
235
+ !FeaturesNear(input, {{chrome_lang_id::kScriptHiragana, 1.00}},
236
+ float_features)) {
237
+ test_successful = false;
238
+ }
239
+
240
+ input = "8*1ゟ12----";
241
+ if (!extractor.Extract(input, &float_features) ||
242
+ !FeaturesNear(input, {{chrome_lang_id::kScriptHiragana, 1.00}},
243
+ float_features)) {
244
+ test_successful = false;
245
+ }
246
+
247
+ return test_successful;
248
+ }
249
+
250
+ } // namespace relevant_script_feature_test
251
+ } // namespace chrome_lang_id
252
+
253
+ // Runs the feature extraction tests.
254
+ int main(int argc, char **argv) {
255
+ const bool tests_successful =
256
+ chrome_lang_id::relevant_script_feature_test::TestCommonCases() &&
257
+ chrome_lang_id::relevant_script_feature_test::TestCornerCases();
258
+ return tests_successful ? 0 : 1;
259
+ }