cld3 3.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (72) hide show
  1. checksums.yaml +7 -0
  2. data/Gemfile +18 -0
  3. data/LICENSE +204 -0
  4. data/LICENSE_CLD3 +203 -0
  5. data/README.md +22 -0
  6. data/cld3.gemspec +35 -0
  7. data/ext/cld3/base.cc +36 -0
  8. data/ext/cld3/base.h +106 -0
  9. data/ext/cld3/casts.h +98 -0
  10. data/ext/cld3/embedding_feature_extractor.cc +51 -0
  11. data/ext/cld3/embedding_feature_extractor.h +182 -0
  12. data/ext/cld3/embedding_network.cc +196 -0
  13. data/ext/cld3/embedding_network.h +186 -0
  14. data/ext/cld3/embedding_network_params.h +285 -0
  15. data/ext/cld3/extconf.rb +49 -0
  16. data/ext/cld3/feature_extractor.cc +137 -0
  17. data/ext/cld3/feature_extractor.h +633 -0
  18. data/ext/cld3/feature_extractor.proto +50 -0
  19. data/ext/cld3/feature_types.cc +72 -0
  20. data/ext/cld3/feature_types.h +158 -0
  21. data/ext/cld3/fixunicodevalue.cc +55 -0
  22. data/ext/cld3/fixunicodevalue.h +69 -0
  23. data/ext/cld3/float16.h +58 -0
  24. data/ext/cld3/fml_parser.cc +308 -0
  25. data/ext/cld3/fml_parser.h +123 -0
  26. data/ext/cld3/generated_entities.cc +296 -0
  27. data/ext/cld3/generated_ulscript.cc +678 -0
  28. data/ext/cld3/generated_ulscript.h +142 -0
  29. data/ext/cld3/getonescriptspan.cc +1109 -0
  30. data/ext/cld3/getonescriptspan.h +124 -0
  31. data/ext/cld3/integral_types.h +37 -0
  32. data/ext/cld3/lang_id_nn_params.cc +57449 -0
  33. data/ext/cld3/lang_id_nn_params.h +178 -0
  34. data/ext/cld3/language_identifier_features.cc +165 -0
  35. data/ext/cld3/language_identifier_features.h +116 -0
  36. data/ext/cld3/nnet_language_identifier.cc +380 -0
  37. data/ext/cld3/nnet_language_identifier.h +175 -0
  38. data/ext/cld3/nnet_language_identifier_c.cc +72 -0
  39. data/ext/cld3/offsetmap.cc +478 -0
  40. data/ext/cld3/offsetmap.h +168 -0
  41. data/ext/cld3/port.h +143 -0
  42. data/ext/cld3/registry.cc +28 -0
  43. data/ext/cld3/registry.h +242 -0
  44. data/ext/cld3/relevant_script_feature.cc +89 -0
  45. data/ext/cld3/relevant_script_feature.h +49 -0
  46. data/ext/cld3/script_detector.h +156 -0
  47. data/ext/cld3/sentence.proto +77 -0
  48. data/ext/cld3/sentence_features.cc +29 -0
  49. data/ext/cld3/sentence_features.h +35 -0
  50. data/ext/cld3/simple_adder.h +72 -0
  51. data/ext/cld3/stringpiece.h +81 -0
  52. data/ext/cld3/task_context.cc +161 -0
  53. data/ext/cld3/task_context.h +81 -0
  54. data/ext/cld3/task_context_params.cc +74 -0
  55. data/ext/cld3/task_context_params.h +54 -0
  56. data/ext/cld3/task_spec.proto +98 -0
  57. data/ext/cld3/text_processing.cc +245 -0
  58. data/ext/cld3/text_processing.h +30 -0
  59. data/ext/cld3/unicodetext.cc +96 -0
  60. data/ext/cld3/unicodetext.h +144 -0
  61. data/ext/cld3/utf8acceptinterchange.h +486 -0
  62. data/ext/cld3/utf8prop_lettermarkscriptnum.h +1631 -0
  63. data/ext/cld3/utf8repl_lettermarklower.h +758 -0
  64. data/ext/cld3/utf8scannot_lettermarkspecial.h +1455 -0
  65. data/ext/cld3/utf8statetable.cc +1344 -0
  66. data/ext/cld3/utf8statetable.h +285 -0
  67. data/ext/cld3/utils.cc +241 -0
  68. data/ext/cld3/utils.h +144 -0
  69. data/ext/cld3/workspace.cc +64 -0
  70. data/ext/cld3/workspace.h +177 -0
  71. data/lib/cld3.rb +99 -0
  72. metadata +158 -0
@@ -0,0 +1,285 @@
1
+ // Copyright 2013 Google Inc. All Rights Reserved.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+
15
+ //
16
+ // State Table follower for scanning UTF-8 strings without converting to
17
+ // 32- or 16-bit Unicode values.
18
+ //
19
+ // Author: dsites@google.com (Dick Sites)
20
+ //
21
+
22
+ #ifndef SCRIPT_SPAN_UTF8STATETABLE_H_
23
+ #define SCRIPT_SPAN_UTF8STATETABLE_H_
24
+
25
+ #include <string>
26
+
27
+ #include "integral_types.h" // for uint8, uint32, uint16
28
+ #include "stringpiece.h"
29
+
30
+ namespace chrome_lang_id {
31
+ namespace CLD2 {
32
+
33
+ class OffsetMap;
34
+
35
+
36
+ // These four-byte entries compactly encode how many bytes 0..255 to delete
37
+ // in making a string replacement, how many bytes to add 0..255, and the offset
38
+ // 0..64k-1 of the replacement string in remap_string.
39
+ struct RemapEntry {
40
+ uint8 delete_bytes;
41
+ uint8 add_bytes;
42
+ uint16 bytes_offset;
43
+ };
44
+
45
// Exit type codes for state tables with one-byte entries.
// Every code except the first gets stuffed into signed one-byte table
// entries; the first (kExitDstSpaceFull) is only generated by executable
// code. To tell them apart from next-state entries, the codes must be
// contiguous and all <= kExitNone. Values are written out explicitly so the
// numbering is visible at a glance.
typedef enum {
  kExitDstSpaceFull = 239,
  kExitIllegalStructure = 240,
  kExitOK = 241,
  kExitReject = 242,
  kExitReplace1 = 243,
  kExitReplace2 = 244,
  kExitReplace3 = 245,
  kExitReplace21 = 246,
  kExitReplace31 = 247,
  kExitReplace32 = 248,
  kExitReplaceOffset1 = 249,
  kExitReplaceOffset2 = 250,
  kExitReplace1S0 = 251,
  kExitSpecial = 252,
  kExitDoAgain = 253,
  kExitRejectAlt = 254,
  kExitNone = 255
} ExitReason;
68
+
69
// Exit type codes for state tables with two-byte entries. Same set of codes
// as ExitReason, numbered contiguously from 0x7fff through 0x800f.
typedef enum {
  kExitDstSpaceFull_2 = 32767,      // 0x7fff
  kExitIllegalStructure_2 = 32768,  // 0x8000
  kExitOK_2 = 32769,                // 0x8001
  kExitReject_2 = 32770,            // 0x8002
  kExitReplace1_2 = 32771,          // 0x8003
  kExitReplace2_2 = 32772,          // 0x8004
  kExitReplace3_2 = 32773,          // 0x8005
  kExitReplace21_2 = 32774,         // 0x8006
  kExitReplace31_2 = 32775,         // 0x8007
  kExitReplace32_2 = 32776,         // 0x8008
  kExitReplaceOffset1_2 = 32777,    // 0x8009
  kExitReplaceOffset2_2 = 32778,    // 0x800a
  kExitReplace1S0_2 = 32779,        // 0x800b
  kExitSpecial_2 = 32780,           // 0x800c
  kExitDoAgain_2 = 32781,           // 0x800d
  kExitRejectAlt_2 = 32782,         // 0x800e
  kExitNone_2 = 32783               // 0x800f
} ExitReason_2;
88
+
89
+
90
+ // This struct represents one entire state table. The three initialized byte
91
+ // areas are state_table, remap_base, and remap_string. state0 and state0_size
92
+ // give the byte offset and length within state_table of the initial state --
93
+ // table lookups are expected to start and end in this state, but for
94
+ // truncated UTF-8 strings, may end in a different state. These allow a quick
95
+ // test for that condition. entry_shift is 8 for tables subscripted by a full
96
+ // byte value and 6 for space-optimized tables subscripted by only six
97
+ // significant bits in UTF-8 continuation bytes.
98
+ typedef struct {
99
+ const uint32 state0;
100
+ const uint32 state0_size;
101
+ const uint32 total_size;
102
+ const int max_expand;
103
+ const int entry_shift;
104
+ const int bytes_per_entry;
105
+ const uint32 losub;
106
+ const uint32 hiadd;
107
+ const uint8* state_table;
108
+ const RemapEntry* remap_base;
109
+ const uint8* remap_string;
110
+ const uint8* fast_state;
111
+ } UTF8StateMachineObj;
112
+
113
+ // Near-duplicate declaration for tables with two-byte entries
114
+ typedef struct {
115
+ const uint32 state0;
116
+ const uint32 state0_size;
117
+ const uint32 total_size;
118
+ const int max_expand;
119
+ const int entry_shift;
120
+ const int bytes_per_entry;
121
+ const uint32 losub;
122
+ const uint32 hiadd;
123
+ const unsigned short* state_table;
124
+ const RemapEntry* remap_base;
125
+ const uint8* remap_string;
126
+ const uint8* fast_state;
127
+ } UTF8StateMachineObj_2;
128
+
129
+
130
+ typedef UTF8StateMachineObj UTF8PropObj;
131
+ typedef UTF8StateMachineObj UTF8ScanObj;
132
+ typedef UTF8StateMachineObj UTF8ReplaceObj;
133
+ typedef UTF8StateMachineObj_2 UTF8PropObj_2;
134
+ typedef UTF8StateMachineObj_2 UTF8ReplaceObj_2;
135
+ // NOT IMPLEMENTED typedef UTF8StateMachineObj_2 UTF8ScanObj_2;
136
+
137
+
138
+ // Look up property of one UTF-8 character and advance over it
139
+ // Return 0 if input length is zero
140
+ // Return 0 and advance one byte if input is ill-formed
141
+ uint8 UTF8GenericProperty(const UTF8PropObj* st,
142
+ const uint8** src,
143
+ int* srclen);
144
+
145
+ // Look up property of one UTF-8 character (assumed to be valid).
146
+ // (This is a faster version of UTF8GenericProperty.)
147
+ bool UTF8HasGenericProperty(const UTF8PropObj& st, const char* src);
148
+
149
+
150
+ // BigOneByte versions are needed for tables > 240 states, but most
151
+ // won't need the TwoByte versions.
152
+
153
+ // Look up property of one UTF-8 character and advance over it
154
+ // Return 0 if input length is zero
155
+ // Return 0 and advance one byte if input is ill-formed
156
+ uint8 UTF8GenericPropertyBigOneByte(const UTF8PropObj* st,
157
+ const uint8** src,
158
+ int* srclen);
159
+
160
+
161
+ // TwoByte versions are needed for tables > 240 states that don't fit onto
162
+ // BigOneByte -- rare ultimate fallback
163
+
164
+ // Look up property of one UTF-8 character (assumed to be valid).
165
+ // (This is a faster version of UTF8GenericProperty.)
166
+ bool UTF8HasGenericPropertyBigOneByte(const UTF8PropObj& st, const char* src);
167
+
168
+ // Look up property of one UTF-8 character and advance over it
169
+ // Return 0 if input length is zero
170
+ // Return 0 and advance one byte if input is ill-formed
171
+ uint8 UTF8GenericPropertyTwoByte(const UTF8PropObj_2* st,
172
+ const uint8** src,
173
+ int* srclen);
174
+
175
+ // Look up property of one UTF-8 character (assumed to be valid).
176
+ // (This is a faster version of UTF8GenericProperty.)
177
+ bool UTF8HasGenericPropertyTwoByte(const UTF8PropObj_2& st, const char* src);
178
+
179
+ // Scan a UTF-8 stringpiece based on a state table.
180
+ // Always scan complete UTF-8 characters
181
+ // Set number of bytes scanned. Return reason for exiting
182
+ int UTF8GenericScan(const UTF8ScanObj* st,
183
+ const StringPiece& str,
184
+ int* bytes_consumed);
185
+
186
+
187
+
188
+ // Scan a UTF-8 stringpiece based on state table, copying to output stringpiece
189
+ // and doing text replacements.
190
+ // Always scan complete UTF-8 characters
191
+ // Set number of bytes consumed from input, number filled to output.
192
+ // Return reason for exiting
193
+ // Also writes an optional OffsetMap. Pass NULL to skip writing one.
194
+ int UTF8GenericReplace(const UTF8ReplaceObj* st,
195
+ const StringPiece& istr,
196
+ StringPiece& ostr,
197
+ bool is_plain_text,
198
+ int* bytes_consumed,
199
+ int* bytes_filled,
200
+ int* chars_changed,
201
+ OffsetMap* offsetmap);
202
+
203
+ // Older version without offsetmap
204
+ int UTF8GenericReplace(const UTF8ReplaceObj* st,
205
+ const StringPiece& istr,
206
+ StringPiece& ostr,
207
+ bool is_plain_text,
208
+ int* bytes_consumed,
209
+ int* bytes_filled,
210
+ int* chars_changed);
211
+
212
+ // Older version without is_plain_text or offsetmap
213
+ int UTF8GenericReplace(const UTF8ReplaceObj* st,
214
+ const StringPiece& istr,
215
+ StringPiece& ostr,
216
+ int* bytes_consumed,
217
+ int* bytes_filled,
218
+ int* chars_changed);
219
+
220
+
221
+ // TwoByte version is needed for tables > about 256 states, such
222
+ // as the table for full Unicode 4.1 canonical + compatibility mapping
223
+
224
+ // Scan a UTF-8 stringpiece based on state table with two-byte entries,
225
+ // copying to output stringpiece
226
+ // and doing text replacements.
227
+ // Always scan complete UTF-8 characters
228
+ // Set number of bytes consumed from input, number filled to output.
229
+ // Return reason for exiting
230
+ // Also writes an optional OffsetMap. Pass NULL to skip writing one.
231
+ int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st,
232
+ const StringPiece& istr,
233
+ StringPiece& ostr,
234
+ bool is_plain_text,
235
+ int* bytes_consumed,
236
+ int* bytes_filled,
237
+ int* chars_changed,
238
+ OffsetMap* offsetmap);
239
+
240
+ // Older version without offsetmap
241
+ int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st,
242
+ const StringPiece& istr,
243
+ StringPiece& ostr,
244
+ bool is_plain_text,
245
+ int* bytes_consumed,
246
+ int* bytes_filled,
247
+ int* chars_changed);
248
+
249
+ // Older version without is_plain_text or offsetmap
250
+ int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st,
251
+ const StringPiece& istr,
252
+ StringPiece& ostr,
253
+ int* bytes_consumed,
254
+ int* bytes_filled,
255
+ int* chars_changed);
256
+
257
+
258
// Length in bytes of a UTF-8 character, indexed by the value of its first
// byte. Bytes 0x00..0xBF map to 1 (this includes continuation bytes
// 0x80..0xBF), 0xC0..0xDF map to 2, 0xE0..0xEF map to 3 and 0xF0..0xFF
// map to 4.
static const unsigned char kUTF8LenTbl[256] = {
  // 0x00..0x7F
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  // 0x80..0xBF
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  // 0xC0..0xDF
  2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
  2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
  // 0xE0..0xEF
  3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3,
  // 0xF0..0xFF
  4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4
};

// Returns the table-driven byte length of the UTF-8 character that starts at
// `in`. Only the first byte is read; it is treated as unsigned so that bytes
// >= 0x80 index the upper half of the table regardless of char signedness.
inline int UTF8OneCharLen(const char* in) {
  const unsigned char lead = static_cast<unsigned char>(*in);
  return kUTF8LenTbl[lead];
}
273
+
274
+ // Adjust a stringpiece to encompass complete UTF-8 characters.
275
+ // The data pointer will be increased by 0..3 bytes to get to a character
276
+ // boundary, and the length will then be decreased by 0..3 bytes
277
+ // to encompass the last complete character.
278
+ // This is useful especially when a UTF-8 string must be put into a fixed-
279
+ // maximum-size buffer cleanly, such as a MySQL buffer.
280
+ void UTF8TrimToChars(StringPiece* istr);
281
+
282
+ } // End namespace CLD2
283
+ } // End namespace chrome_lang_id
284
+
285
+ #endif // SCRIPT_SPAN_UTF8STATETABLE_H_
@@ -0,0 +1,241 @@
1
+ /* Copyright 2016 Google Inc. All Rights Reserved.
2
+
3
+ Licensed under the Apache License, Version 2.0 (the "License");
4
+ you may not use this file except in compliance with the License.
5
+ You may obtain a copy of the License at
6
+
7
+ http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+ Unless required by applicable law or agreed to in writing, software
10
+ distributed under the License is distributed on an "AS IS" BASIS,
11
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ See the License for the specific language governing permissions and
13
+ limitations under the License.
14
+ ==============================================================================*/
15
+
16
+ #include "utils.h"
17
+
18
+ #include <ctype.h>
19
+ #include <stdlib.h>
20
+
21
+ #include "script_span/stringpiece.h"
22
+
23
+ namespace chrome_lang_id {
24
+ namespace utils {
25
+
26
// Parses the whole of `c_str` as an integer and stores it in *value.
//
// The base is auto-detected by strtol (base argument 0): decimal, "0x"/"0X"
// hexadecimal, or leading-"0" octal. Leading whitespace and a sign are
// accepted as strtol accepts them.
//
// Returns true only when at least one character was converted, the
// conversion consumed the entire string, and the result fits in an int.
// *value is always written (0 when nothing could be parsed; the narrowed
// value when the input is out of range), so callers that ignore the return
// value still see a defined value.
//
// Note: where long and int have the same width, an overflowing input is
// clamped by strtol itself and cannot be detected by the narrowing check
// below.
bool ParseInt32(const char *c_str, int *value) {
  char *end = nullptr;
  const long parsed = strtol(c_str, &end, 0);  // NOLINT
  *value = static_cast<int>(parsed);

  // strtol leaves end == c_str when no conversion was performed; this covers
  // the empty string, which would otherwise look like a clean parse of 0.
  if (end == c_str) return false;

  // Trailing garbage such as "12abc" is a failure.
  if (*end != '\0') return false;

  // Reject values that were changed by narrowing long -> int.
  return static_cast<long>(*value) == parsed;
}
31
+
32
// Parses the whole of `c_str` as a floating-point number using strtod and
// stores the result in *value.
//
// Returns true only when at least one character was converted and the
// conversion consumed the entire string. *value is always written (0 when
// nothing could be parsed).
bool ParseDouble(const char *c_str, double *value) {
  char *end = nullptr;
  *value = strtod(c_str, &end);

  // strtod leaves end == c_str when no conversion was performed; this covers
  // the empty string, which would otherwise look like a clean parse of 0.
  if (end == c_str) return false;

  // Anything left over after the number is a failure.
  return *end == '\0';
}
37
+
38
// Digit characters used when emitting numeric escapes. CEscape only ever
// indexes positions 0..7 (octal digits).
static char hex_char[] = "0123456789abcdef";

// Returns a copy of `src` with characters escaped C-style:
//   * newline, carriage return, tab, double quote, single quote and backslash
//     become their two-character backslash forms;
//   * any byte >= 0x80 or any non-printable byte becomes a backslash followed
//     by exactly three octal digits;
//   * every other byte is copied unchanged.
// Because the numeric escape always has three digits, a following digit in
// the source cannot be misread as part of the escape.
string CEscape(const string &src) {
  string escaped;

  for (size_t i = 0; i < src.size(); ++i) {
    const unsigned char byte = static_cast<unsigned char>(src[i]);

    const char *named = nullptr;
    if (byte == '\n') {
      named = "\\n";
    } else if (byte == '\r') {
      named = "\\r";
    } else if (byte == '\t') {
      named = "\\t";
    } else if (byte == '\"') {
      named = "\\\"";
    } else if (byte == '\'') {
      named = "\\'";
    } else if (byte == '\\') {
      named = "\\\\";
    }
    if (named != nullptr) {
      escaped.append(named);
      continue;
    }

    if (byte < 0x80 && isprint(byte)) {
      escaped.push_back(byte);
      continue;
    }

    // Three-digit octal escape: bits 7-6, bits 5-3, bits 2-0.
    escaped.append("\\");
    escaped.push_back(hex_char[byte >> 6]);
    escaped.push_back(hex_char[(byte >> 3) & 7]);
    escaped.push_back(hex_char[byte & 7]);
  }

  return escaped;
}
81
+
82
// Splits `text` at every occurrence of `delim` and returns the pieces in
// order. Empty pieces are kept, so "a,,b" yields {"a", "", "b"} and a
// trailing delimiter yields a trailing empty piece. An empty `text` yields an
// empty vector (not a vector holding one empty string).
std::vector<string> Split(const string &text, char delim) {
  std::vector<string> pieces;
  if (text.empty()) return pieces;

  size_t piece_begin = 0;
  for (;;) {
    const size_t cut = text.find(delim, piece_begin);
    if (cut == string::npos) {
      // Last piece: everything after the final delimiter (possibly empty).
      pieces.push_back(text.substr(piece_begin));
      break;
    }
    pieces.push_back(text.substr(piece_begin, cut - piece_begin));
    piece_begin = cut + 1;
  }
  return pieces;
}
95
+
96
+ int RemoveLeadingWhitespace(StringPiece *text) {
97
+ int count = 0;
98
+ const char *ptr = text->data();
99
+ while (count < text->size() && isspace(*ptr)) {
100
+ count++;
101
+ ptr++;
102
+ }
103
+ text->remove_prefix(count);
104
+ return count;
105
+ }
106
+
107
+ int RemoveTrailingWhitespace(StringPiece *text) {
108
+ int count = 0;
109
+ const char *ptr = text->data() + text->size() - 1;
110
+ while (count < text->size() && isspace(*ptr)) {
111
+ ++count;
112
+ --ptr;
113
+ }
114
+ text->remove_suffix(count);
115
+ return count;
116
+ }
117
+
118
+ int RemoveWhitespaceContext(StringPiece *text) {
119
+ // use RemoveLeadingWhitespace() and RemoveTrailingWhitespace() to do the job
120
+ return RemoveLeadingWhitespace(text) + RemoveTrailingWhitespace(text);
121
+ }
122
+
123
+ namespace {
124
+ // Lower-level versions of Get... that read directly from a character buffer
125
+ // without any bounds checking.
126
+ inline uint32 DecodeFixed32(const char *ptr) {
127
+ return ((static_cast<uint32>(static_cast<unsigned char>(ptr[0]))) |
128
+ (static_cast<uint32>(static_cast<unsigned char>(ptr[1])) << 8) |
129
+ (static_cast<uint32>(static_cast<unsigned char>(ptr[2])) << 16) |
130
+ (static_cast<uint32>(static_cast<unsigned char>(ptr[3])) << 24));
131
+ }
132
+
133
+ // 0xff is in case char is signed.
134
+ static inline uint32 ByteAs32(char c) { return static_cast<uint32>(c) & 0xff; }
135
+ } // namespace
136
+
137
+ uint32 Hash32(const char *data, size_t n, uint32 seed) {
138
+ // 'm' and 'r' are mixing constants generated offline.
139
+ // They're not really 'magic', they just happen to work well.
140
+ const uint32 m = 0x5bd1e995;
141
+ const int r = 24;
142
+
143
+ // Initialize the hash to a 'random' value
144
+ uint32 h = static_cast<uint32>(seed ^ n);
145
+
146
+ // Mix 4 bytes at a time into the hash
147
+ while (n >= 4) {
148
+ uint32 k = DecodeFixed32(data);
149
+ k *= m;
150
+ k ^= k >> r;
151
+ k *= m;
152
+ h *= m;
153
+ h ^= k;
154
+ data += 4;
155
+ n -= 4;
156
+ }
157
+
158
+ // Handle the last few bytes of the input array
159
+ if (n == 3) {
160
+ h ^= ByteAs32(data[2]) << 16;
161
+ h ^= ByteAs32(data[1]) << 8;
162
+ h ^= ByteAs32(data[0]);
163
+ h *= m;
164
+ } else if (n == 2) {
165
+ h ^= ByteAs32(data[1]) << 8;
166
+ h ^= ByteAs32(data[0]);
167
+ h *= m;
168
+ } else if (n == 1) {
169
+ h ^= ByteAs32(data[0]);
170
+ h *= m;
171
+ }
172
+
173
+ // Do a few final mixes of the hash to ensure the last few
174
+ // bytes are well-incorporated.
175
+ h ^= h >> 13;
176
+ h *= m;
177
+ h ^= h >> 15;
178
+ return h;
179
+ }
180
+
181
+ uint32 Hash32WithDefaultSeed(const string &input) {
182
+ return Hash32(input.data(), input.size(), 0xBEEF);
183
+ }
184
+
185
+ PunctuationUtil::CharacterRange PunctuationUtil::kPunctuation[] = {
186
+ {33, 35}, {37, 42}, {44, 47}, {58, 59},
187
+ {63, 64}, {91, 93}, {95, 95}, {123, 123},
188
+ {125, 125}, {161, 161}, {171, 171}, {183, 183},
189
+ {187, 187}, {191, 191}, {894, 894}, {903, 903},
190
+ {1370, 1375}, {1417, 1418}, {1470, 1470}, {1472, 1472},
191
+ {1475, 1475}, {1478, 1478}, {1523, 1524}, {1548, 1549},
192
+ {1563, 1563}, {1566, 1567}, {1642, 1645}, {1748, 1748},
193
+ {1792, 1805}, {2404, 2405}, {2416, 2416}, {3572, 3572},
194
+ {3663, 3663}, {3674, 3675}, {3844, 3858}, {3898, 3901},
195
+ {3973, 3973}, {4048, 4049}, {4170, 4175}, {4347, 4347},
196
+ {4961, 4968}, {5741, 5742}, {5787, 5788}, {5867, 5869},
197
+ {5941, 5942}, {6100, 6102}, {6104, 6106}, {6144, 6154},
198
+ {6468, 6469}, {6622, 6623}, {6686, 6687}, {8208, 8231},
199
+ {8240, 8259}, {8261, 8273}, {8275, 8286}, {8317, 8318},
200
+ {8333, 8334}, {9001, 9002}, {9140, 9142}, {10088, 10101},
201
+ {10181, 10182}, {10214, 10219}, {10627, 10648}, {10712, 10715},
202
+ {10748, 10749}, {11513, 11516}, {11518, 11519}, {11776, 11799},
203
+ {11804, 11805}, {12289, 12291}, {12296, 12305}, {12308, 12319},
204
+ {12336, 12336}, {12349, 12349}, {12448, 12448}, {12539, 12539},
205
+ {64830, 64831}, {65040, 65049}, {65072, 65106}, {65108, 65121},
206
+ {65123, 65123}, {65128, 65128}, {65130, 65131}, {65281, 65283},
207
+ {65285, 65290}, {65292, 65295}, {65306, 65307}, {65311, 65312},
208
+ {65339, 65341}, {65343, 65343}, {65371, 65371}, {65373, 65373},
209
+ {65375, 65381}, {65792, 65793}, {66463, 66463}, {68176, 68184},
210
+ {-1, -1}};
211
+
212
// Rewrites `*form` in place so that every ASCII digit '0'..'9' becomes '9'.
// All other bytes are left untouched.
void NormalizeDigits(string *form) {
  for (char &ch : *form) {
    const bool is_ascii_digit = ('0' <= ch) && (ch <= '9');
    if (is_ascii_digit) ch = '9';
  }
}
217
+
218
// Returns the byte length (1..4) of the UTF-8 character whose first byte is
// *src, decided purely from the high four bits of that byte: 0x0..0xB -> 1,
// 0xC..0xD -> 2, 0xE -> 3, 0xF -> 4. Only the first byte is read.
int OneCharLen(const char *src) {
  // Plain char may be signed, so read the byte as unsigned char before
  // shifting; otherwise bytes >= 0x80 would index with a negative value.
  static const char kLenByHighNibble[] = "\1\1\1\1\1\1\1\1\1\1\1\1\2\2\3\4";
  return kLenByHighNibble[static_cast<unsigned char>(*src) >> 4];
}

// Returns the byte length of the first UTF-8 character of the NUL-terminated
// string `utf8_str`, or 0 when the string is empty (first byte is NUL).
int UTF8FirstLetterNumBytes(const char *utf8_str) {
  if (*utf8_str == '\0') return 0;
  return OneCharLen(utf8_str);
}

// Appends to `*chars` one string per UTF-8 character of `text`, in order.
//
// `text` is treated as a byte buffer of text.size() bytes, so two malformed
// inputs need care:
//   * An embedded NUL byte has length 0 according to
//     UTF8FirstLetterNumBytes(); it is emitted as a one-byte character so
//     the scan always makes progress.
//   * A multi-byte sequence cut short at the end of `text` is clamped to the
//     bytes actually present, so nothing is read past the end of the buffer.
void GetUTF8Chars(const string &text, std::vector<string> *chars) {
  const char *start = text.data();
  const char *end = start + text.size();
  while (start < end) {
    int char_length = UTF8FirstLetterNumBytes(start);
    if (char_length <= 0) char_length = 1;  // embedded NUL: consume it
    if (end - start < char_length) {
      char_length = static_cast<int>(end - start);  // truncated tail
    }
    chars->emplace_back(start, char_length);
    start += char_length;
  }
}
239
+
240
+ } // namespace utils
241
+ } // namespace chrome_lang_id