cld3 3.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/Gemfile +18 -0
- data/LICENSE +204 -0
- data/LICENSE_CLD3 +203 -0
- data/README.md +22 -0
- data/cld3.gemspec +35 -0
- data/ext/cld3/base.cc +36 -0
- data/ext/cld3/base.h +106 -0
- data/ext/cld3/casts.h +98 -0
- data/ext/cld3/embedding_feature_extractor.cc +51 -0
- data/ext/cld3/embedding_feature_extractor.h +182 -0
- data/ext/cld3/embedding_network.cc +196 -0
- data/ext/cld3/embedding_network.h +186 -0
- data/ext/cld3/embedding_network_params.h +285 -0
- data/ext/cld3/extconf.rb +49 -0
- data/ext/cld3/feature_extractor.cc +137 -0
- data/ext/cld3/feature_extractor.h +633 -0
- data/ext/cld3/feature_extractor.proto +50 -0
- data/ext/cld3/feature_types.cc +72 -0
- data/ext/cld3/feature_types.h +158 -0
- data/ext/cld3/fixunicodevalue.cc +55 -0
- data/ext/cld3/fixunicodevalue.h +69 -0
- data/ext/cld3/float16.h +58 -0
- data/ext/cld3/fml_parser.cc +308 -0
- data/ext/cld3/fml_parser.h +123 -0
- data/ext/cld3/generated_entities.cc +296 -0
- data/ext/cld3/generated_ulscript.cc +678 -0
- data/ext/cld3/generated_ulscript.h +142 -0
- data/ext/cld3/getonescriptspan.cc +1109 -0
- data/ext/cld3/getonescriptspan.h +124 -0
- data/ext/cld3/integral_types.h +37 -0
- data/ext/cld3/lang_id_nn_params.cc +57449 -0
- data/ext/cld3/lang_id_nn_params.h +178 -0
- data/ext/cld3/language_identifier_features.cc +165 -0
- data/ext/cld3/language_identifier_features.h +116 -0
- data/ext/cld3/nnet_language_identifier.cc +380 -0
- data/ext/cld3/nnet_language_identifier.h +175 -0
- data/ext/cld3/nnet_language_identifier_c.cc +72 -0
- data/ext/cld3/offsetmap.cc +478 -0
- data/ext/cld3/offsetmap.h +168 -0
- data/ext/cld3/port.h +143 -0
- data/ext/cld3/registry.cc +28 -0
- data/ext/cld3/registry.h +242 -0
- data/ext/cld3/relevant_script_feature.cc +89 -0
- data/ext/cld3/relevant_script_feature.h +49 -0
- data/ext/cld3/script_detector.h +156 -0
- data/ext/cld3/sentence.proto +77 -0
- data/ext/cld3/sentence_features.cc +29 -0
- data/ext/cld3/sentence_features.h +35 -0
- data/ext/cld3/simple_adder.h +72 -0
- data/ext/cld3/stringpiece.h +81 -0
- data/ext/cld3/task_context.cc +161 -0
- data/ext/cld3/task_context.h +81 -0
- data/ext/cld3/task_context_params.cc +74 -0
- data/ext/cld3/task_context_params.h +54 -0
- data/ext/cld3/task_spec.proto +98 -0
- data/ext/cld3/text_processing.cc +245 -0
- data/ext/cld3/text_processing.h +30 -0
- data/ext/cld3/unicodetext.cc +96 -0
- data/ext/cld3/unicodetext.h +144 -0
- data/ext/cld3/utf8acceptinterchange.h +486 -0
- data/ext/cld3/utf8prop_lettermarkscriptnum.h +1631 -0
- data/ext/cld3/utf8repl_lettermarklower.h +758 -0
- data/ext/cld3/utf8scannot_lettermarkspecial.h +1455 -0
- data/ext/cld3/utf8statetable.cc +1344 -0
- data/ext/cld3/utf8statetable.h +285 -0
- data/ext/cld3/utils.cc +241 -0
- data/ext/cld3/utils.h +144 -0
- data/ext/cld3/workspace.cc +64 -0
- data/ext/cld3/workspace.h +177 -0
- data/lib/cld3.rb +99 -0
- metadata +158 -0
@@ -0,0 +1,285 @@
|
|
1
|
+
// Copyright 2013 Google Inc. All Rights Reserved.
|
2
|
+
//
|
3
|
+
// Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
// you may not use this file except in compliance with the License.
|
5
|
+
// You may obtain a copy of the License at
|
6
|
+
//
|
7
|
+
// http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
//
|
9
|
+
// Unless required by applicable law or agreed to in writing, software
|
10
|
+
// distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
// See the License for the specific language governing permissions and
|
13
|
+
// limitations under the License.
|
14
|
+
|
15
|
+
//
|
16
|
+
// State Table follower for scanning UTF-8 strings without converting to
|
17
|
+
// 32- or 16-bit Unicode values.
|
18
|
+
//
|
19
|
+
// Author: dsites@google.com (Dick Sites)
|
20
|
+
//
|
21
|
+
|
22
|
+
#ifndef SCRIPT_SPAN_UTF8STATETABLE_H_
|
23
|
+
#define SCRIPT_SPAN_UTF8STATETABLE_H_
|
24
|
+
|
25
|
+
#include <string>
|
26
|
+
|
27
|
+
#include "integral_types.h" // for uint8, uint32, uint16
|
28
|
+
#include "stringpiece.h"
|
29
|
+
|
30
|
+
namespace chrome_lang_id {
|
31
|
+
namespace CLD2 {
|
32
|
+
|
33
|
+
class OffsetMap;
|
34
|
+
|
35
|
+
|
36
|
+
// These four-byte entries compactly encode how many bytes 0..255 to delete
|
37
|
+
// in making a string replacement, how many bytes to add 0..255, and the offset
|
38
|
+
// 0..64k-1 of the replacement string in remap_string.
|
39
|
+
struct RemapEntry {
|
40
|
+
uint8 delete_bytes;
|
41
|
+
uint8 add_bytes;
|
42
|
+
uint16 bytes_offset;
|
43
|
+
};
|
44
|
+
|
45
|
+
// Exit type codes for state tables. All but the first get stuffed into
|
46
|
+
// signed one-byte entries. The first is only generated by executable code.
|
47
|
+
// To distinguish from next-state entries, these must be contiguous and
|
48
|
+
// all <= kExitNone
|
49
|
+
typedef enum {
  kExitDstSpaceFull     = 239,  // Only generated by executable code.
  kExitIllegalStructure = 240,
  kExitOK               = 241,
  kExitReject           = 242,
  kExitReplace1         = 243,
  kExitReplace2         = 244,
  kExitReplace3         = 245,
  kExitReplace21        = 246,
  kExitReplace31        = 247,
  kExitReplace32        = 248,
  kExitReplaceOffset1   = 249,
  kExitReplaceOffset2   = 250,
  kExitReplace1S0       = 251,
  kExitSpecial          = 252,
  kExitDoAgain          = 253,
  kExitRejectAlt        = 254,
  kExitNone             = 255   // Largest exit code; all exit codes are <= this.
} ExitReason;
|
68
|
+
|
69
|
+
// Exit codes for state tables with two-byte entries; same ordering as
// ExitReason, but starting at 0x7fff.
typedef enum {
  kExitDstSpaceFull_2     = 32767,  // 0x7fff
  kExitIllegalStructure_2 = 32768,  // 0x8000
  kExitOK_2               = 32769,  // 0x8001
  kExitReject_2           = 32770,
  kExitReplace1_2         = 32771,
  kExitReplace2_2         = 32772,
  kExitReplace3_2         = 32773,
  kExitReplace21_2        = 32774,
  kExitReplace31_2        = 32775,
  kExitReplace32_2        = 32776,
  kExitReplaceOffset1_2   = 32777,
  kExitReplaceOffset2_2   = 32778,
  kExitReplace1S0_2       = 32779,
  kExitSpecial_2          = 32780,
  kExitDoAgain_2          = 32781,
  kExitRejectAlt_2        = 32782,
  kExitNone_2             = 32783   // 0x800f
} ExitReason_2;
|
88
|
+
|
89
|
+
|
90
|
+
// This struct represents one entire state table. The three initialized byte
|
91
|
+
// areas are state_table, remap_base, and remap_string. state0 and state0_size
|
92
|
+
// give the byte offset and length within state_table of the initial state --
|
93
|
+
// table lookups are expected to start and end in this state, but for
|
94
|
+
// truncated UTF-8 strings, may end in a different state. These allow a quick
|
95
|
+
// test for that condition. entry_shift is 8 for tables subscripted by a full
|
96
|
+
// byte value and 6 for space-optimized tables subscripted by only six
|
97
|
+
// significant bits in UTF-8 continuation bytes.
|
98
|
+
typedef struct {
|
99
|
+
const uint32 state0;
|
100
|
+
const uint32 state0_size;
|
101
|
+
const uint32 total_size;
|
102
|
+
const int max_expand;
|
103
|
+
const int entry_shift;
|
104
|
+
const int bytes_per_entry;
|
105
|
+
const uint32 losub;
|
106
|
+
const uint32 hiadd;
|
107
|
+
const uint8* state_table;
|
108
|
+
const RemapEntry* remap_base;
|
109
|
+
const uint8* remap_string;
|
110
|
+
const uint8* fast_state;
|
111
|
+
} UTF8StateMachineObj;
|
112
|
+
|
113
|
+
// Near-duplicate declaration for tables with two-byte entries
|
114
|
+
typedef struct {
|
115
|
+
const uint32 state0;
|
116
|
+
const uint32 state0_size;
|
117
|
+
const uint32 total_size;
|
118
|
+
const int max_expand;
|
119
|
+
const int entry_shift;
|
120
|
+
const int bytes_per_entry;
|
121
|
+
const uint32 losub;
|
122
|
+
const uint32 hiadd;
|
123
|
+
const unsigned short* state_table;
|
124
|
+
const RemapEntry* remap_base;
|
125
|
+
const uint8* remap_string;
|
126
|
+
const uint8* fast_state;
|
127
|
+
} UTF8StateMachineObj_2;
|
128
|
+
|
129
|
+
|
130
|
+
typedef UTF8StateMachineObj UTF8PropObj;
|
131
|
+
typedef UTF8StateMachineObj UTF8ScanObj;
|
132
|
+
typedef UTF8StateMachineObj UTF8ReplaceObj;
|
133
|
+
typedef UTF8StateMachineObj_2 UTF8PropObj_2;
|
134
|
+
typedef UTF8StateMachineObj_2 UTF8ReplaceObj_2;
|
135
|
+
// NOT IMPLEMENTED typedef UTF8StateMachineObj_2 UTF8ScanObj_2;
|
136
|
+
|
137
|
+
|
138
|
+
// Look up property of one UTF-8 character and advance over it
|
139
|
+
// Return 0 if input length is zero
|
140
|
+
// Return 0 and advance one byte if input is ill-formed
|
141
|
+
uint8 UTF8GenericProperty(const UTF8PropObj* st,
|
142
|
+
const uint8** src,
|
143
|
+
int* srclen);
|
144
|
+
|
145
|
+
// Look up property of one UTF-8 character (assumed to be valid).
|
146
|
+
// (This is a faster version of UTF8GenericProperty.)
|
147
|
+
bool UTF8HasGenericProperty(const UTF8PropObj& st, const char* src);
|
148
|
+
|
149
|
+
|
150
|
+
// BigOneByte versions are needed for tables > 240 states, but most
|
151
|
+
// won't need the TwoByte versions.
|
152
|
+
|
153
|
+
// Look up property of one UTF-8 character and advance over it
|
154
|
+
// Return 0 if input length is zero
|
155
|
+
// Return 0 and advance one byte if input is ill-formed
|
156
|
+
uint8 UTF8GenericPropertyBigOneByte(const UTF8PropObj* st,
|
157
|
+
const uint8** src,
|
158
|
+
int* srclen);
|
159
|
+
|
160
|
+
|
161
|
+
// TwoByte versions are needed for tables > 240 states that don't fit onto
|
162
|
+
// BigOneByte -- rare ultimate fallback
|
163
|
+
|
164
|
+
// Look up property of one UTF-8 character (assumed to be valid).
|
165
|
+
// (This is a faster version of UTF8GenericProperty.)
|
166
|
+
bool UTF8HasGenericPropertyBigOneByte(const UTF8PropObj& st, const char* src);
|
167
|
+
|
168
|
+
// Look up property of one UTF-8 character and advance over it
|
169
|
+
// Return 0 if input length is zero
|
170
|
+
// Return 0 and advance one byte if input is ill-formed
|
171
|
+
uint8 UTF8GenericPropertyTwoByte(const UTF8PropObj_2* st,
|
172
|
+
const uint8** src,
|
173
|
+
int* srclen);
|
174
|
+
|
175
|
+
// Look up property of one UTF-8 character (assumed to be valid).
|
176
|
+
// (This is a faster version of UTF8GenericProperty.)
|
177
|
+
bool UTF8HasGenericPropertyTwoByte(const UTF8PropObj_2& st, const char* src);
|
178
|
+
|
179
|
+
// Scan a UTF-8 stringpiece based on a state table.
|
180
|
+
// Always scan complete UTF-8 characters
|
181
|
+
// Set number of bytes scanned. Return reason for exiting
|
182
|
+
int UTF8GenericScan(const UTF8ScanObj* st,
|
183
|
+
const StringPiece& str,
|
184
|
+
int* bytes_consumed);
|
185
|
+
|
186
|
+
|
187
|
+
|
188
|
+
// Scan a UTF-8 stringpiece based on state table, copying to output stringpiece
|
189
|
+
// and doing text replacements.
|
190
|
+
// Always scan complete UTF-8 characters
|
191
|
+
// Set number of bytes consumed from input, number filled to output.
|
192
|
+
// Return reason for exiting
|
193
|
+
// Also writes an optional OffsetMap. Pass NULL to skip writing one.
|
194
|
+
int UTF8GenericReplace(const UTF8ReplaceObj* st,
|
195
|
+
const StringPiece& istr,
|
196
|
+
StringPiece& ostr,
|
197
|
+
bool is_plain_text,
|
198
|
+
int* bytes_consumed,
|
199
|
+
int* bytes_filled,
|
200
|
+
int* chars_changed,
|
201
|
+
OffsetMap* offsetmap);
|
202
|
+
|
203
|
+
// Older version without offsetmap
|
204
|
+
int UTF8GenericReplace(const UTF8ReplaceObj* st,
|
205
|
+
const StringPiece& istr,
|
206
|
+
StringPiece& ostr,
|
207
|
+
bool is_plain_text,
|
208
|
+
int* bytes_consumed,
|
209
|
+
int* bytes_filled,
|
210
|
+
int* chars_changed);
|
211
|
+
|
212
|
+
// Older version without is_plain_text or offsetmap
|
213
|
+
int UTF8GenericReplace(const UTF8ReplaceObj* st,
|
214
|
+
const StringPiece& istr,
|
215
|
+
StringPiece& ostr,
|
216
|
+
int* bytes_consumed,
|
217
|
+
int* bytes_filled,
|
218
|
+
int* chars_changed);
|
219
|
+
|
220
|
+
|
221
|
+
// TwoByte version is needed for tables > about 256 states, such
|
222
|
+
// as the table for full Unicode 4.1 canonical + compatibility mapping
|
223
|
+
|
224
|
+
// Scan a UTF-8 stringpiece based on state table with two-byte entries,
|
225
|
+
// copying to output stringpiece
|
226
|
+
// and doing text replacements.
|
227
|
+
// Always scan complete UTF-8 characters
|
228
|
+
// Set number of bytes consumed from input, number filled to output.
|
229
|
+
// Return reason for exiting
|
230
|
+
// Also writes an optional OffsetMap. Pass NULL to skip writing one.
|
231
|
+
int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st,
|
232
|
+
const StringPiece& istr,
|
233
|
+
StringPiece& ostr,
|
234
|
+
bool is_plain_text,
|
235
|
+
int* bytes_consumed,
|
236
|
+
int* bytes_filled,
|
237
|
+
int* chars_changed,
|
238
|
+
OffsetMap* offsetmap);
|
239
|
+
|
240
|
+
// Older version without offsetmap
|
241
|
+
int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st,
|
242
|
+
const StringPiece& istr,
|
243
|
+
StringPiece& ostr,
|
244
|
+
bool is_plain_text,
|
245
|
+
int* bytes_consumed,
|
246
|
+
int* bytes_filled,
|
247
|
+
int* chars_changed);
|
248
|
+
|
249
|
+
// Older version without is_plain_text or offsetmap
|
250
|
+
int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st,
|
251
|
+
const StringPiece& istr,
|
252
|
+
StringPiece& ostr,
|
253
|
+
int* bytes_consumed,
|
254
|
+
int* bytes_filled,
|
255
|
+
int* chars_changed);
|
256
|
+
|
257
|
+
|
258
|
+
// Length in bytes of a UTF-8 character, indexed by the value of its first
// byte. Bytes 0x00..0xBF map to 1, 0xC0..0xDF to 2, 0xE0..0xEF to 3 and
// 0xF0..0xFF to 4.
static const unsigned char kUTF8LenTbl[256] = {
  // 0x00..0x7F
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,

  // 0x80..0xBF
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  // 0xC0..0xDF
  2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
  // 0xE0..0xEF, then 0xF0..0xFF
  3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4
};

// Returns the kUTF8LenTbl entry for the first byte at |in|. The byte is read
// as an unsigned value so that bytes >= 0x80 index the table correctly even
// when plain char is signed.
inline int UTF8OneCharLen(const char* in) {
  const unsigned char lead = static_cast<unsigned char>(*in);
  return kUTF8LenTbl[lead];
}
|
273
|
+
|
274
|
+
// Adjust a stringpiece to encompass complete UTF-8 characters.
|
275
|
+
// The data pointer will be increased by 0..3 bytes to get to a character
|
276
|
+
// boundary, and the length will then be decreased by 0..3 bytes
|
277
|
+
// to encompass the last complete character.
|
278
|
+
// This is useful especially when a UTF-8 string must be put into a fixed-
|
279
|
+
// maximum-size buffer cleanly, such as a MySQL buffer.
|
280
|
+
void UTF8TrimToChars(StringPiece* istr);
|
281
|
+
|
282
|
+
} // End namespace CLD2
|
283
|
+
} // End namespace chrome_lang_id
|
284
|
+
|
285
|
+
#endif // SCRIPT_SPAN_UTF8STATETABLE_H_
|
data/ext/cld3/utils.cc
ADDED
@@ -0,0 +1,241 @@
|
|
1
|
+
/* Copyright 2016 Google Inc. All Rights Reserved.
|
2
|
+
|
3
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
you may not use this file except in compliance with the License.
|
5
|
+
You may obtain a copy of the License at
|
6
|
+
|
7
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
|
9
|
+
Unless required by applicable law or agreed to in writing, software
|
10
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
See the License for the specific language governing permissions and
|
13
|
+
limitations under the License.
|
14
|
+
==============================================================================*/
|
15
|
+
|
16
|
+
#include "utils.h"
|
17
|
+
|
18
|
+
#include <ctype.h>
|
19
|
+
#include <stdlib.h>
|
20
|
+
|
21
|
+
#include "script_span/stringpiece.h"
|
22
|
+
|
23
|
+
namespace chrome_lang_id {
|
24
|
+
namespace utils {
|
25
|
+
|
26
|
+
// Parses |c_str| as an integer with strtol() using base 0 (so decimal, octal
// "0..." and hexadecimal "0x..." are all accepted) and stores the result in
// |*value|.
//
// Returns true only if at least one character was converted, the whole string
// was consumed, and the parsed number is representable as an int. |*value| is
// always written, even on failure (it then holds the possibly truncated
// conversion result).
bool ParseInt32(const char *c_str, int *value) {
  char *end = nullptr;
  const long parsed = strtol(c_str, &end, 0);  // NOLINT
  *value = static_cast<int>(parsed);

  // strtol() leaves |end| == |c_str| when nothing could be converted (e.g. an
  // empty string), which must not be reported as success.
  const bool consumed_all = (end != c_str) && (*end == '\0');

  // Rejects values that were narrowed when stored into an int. Overflow of
  // long itself is only caught where long is wider than int.
  const bool fits_in_int = (static_cast<long>(*value) == parsed);

  return consumed_all && fits_in_int;
}
|
31
|
+
|
32
|
+
// Parses |c_str| as a floating point number with strtod() and stores the
// result in |*value|.
//
// Returns true only if at least one character was converted and the whole
// string was consumed. |*value| is always written, even on failure.
bool ParseDouble(const char *c_str, double *value) {
  char *end = nullptr;
  *value = strtod(c_str, &end);

  // strtod() leaves |end| == |c_str| when nothing could be converted (e.g. an
  // empty string), which must not be reported as success.
  return (end != c_str) && (*end == '\0');
}
|
37
|
+
|
38
|
+
// Digits used to spell the three-digit octal escapes emitted by CEscape().
static const char kOctalDigits[] = "01234567";

// Returns a copy of |src| in which newline, carriage return, tab, double
// quote, single quote and backslash are replaced by their two-character C
// escapes, and every other byte that is >= 0x80 or not printable is replaced
// by a backslash followed by exactly three octal digits. All remaining bytes
// are copied unchanged.
string CEscape(const string &src) {
  string dest;
  dest.reserve(src.size());  // The output is never shorter than the input.

  for (unsigned char c : src) {
    switch (c) {
      case '\n':
        dest.append("\\n");
        break;
      case '\r':
        dest.append("\\r");
        break;
      case '\t':
        dest.append("\\t");
        break;
      case '\"':
        dest.append("\\\"");
        break;
      case '\'':
        dest.append("\\'");
        break;
      case '\\':
        dest.append("\\\\");
        break;
      default:
        if ((c >= 0x80) || !isprint(c)) {
          // Always emit all three octal digits: a C octal escape takes at
          // most three, so a digit that follows in |src| cannot be absorbed
          // into the escape and needs no extra escaping.
          dest.push_back('\\');
          dest.push_back(kOctalDigits[c / 64]);
          dest.push_back(kOctalDigits[(c % 64) / 8]);
          dest.push_back(kOctalDigits[c % 8]);
        } else {
          dest.push_back(static_cast<char>(c));
        }
        break;
    }
  }

  return dest;
}
|
81
|
+
|
82
|
+
// Splits |text| at every occurrence of |delim| and returns the pieces in
// order. Empty pieces are kept (so "a,,b" yields three pieces and a trailing
// delimiter yields a trailing empty piece). An empty |text| yields an empty
// vector rather than a single empty piece.
std::vector<string> Split(const string &text, char delim) {
  std::vector<string> pieces;
  if (text.empty()) return pieces;

  size_t begin = 0;
  while (true) {
    const size_t hit = text.find(delim, begin);
    if (hit == string::npos) {
      // Last piece: everything after the final delimiter (possibly empty).
      pieces.push_back(text.substr(begin));
      break;
    }
    pieces.push_back(text.substr(begin, hit - begin));
    begin = hit + 1;
  }
  return pieces;
}
|
95
|
+
|
96
|
+
// Removes leading whitespace (as classified by isspace()) from |text| by
// advancing its start, and returns the number of bytes removed.
int RemoveLeadingWhitespace(StringPiece *text) {
  const char *ptr = text->data();
  const int size = static_cast<int>(text->size());
  int count = 0;
  // isspace() has undefined behavior for negative arguments other than EOF,
  // so bytes >= 0x80 (e.g. UTF-8 lead and continuation bytes) must be passed
  // as unsigned char values.
  while (count < size && isspace(static_cast<unsigned char>(*ptr))) {
    count++;
    ptr++;
  }
  text->remove_prefix(count);
  return count;
}
|
106
|
+
|
107
|
+
// Removes trailing whitespace (as classified by isspace()) from |text| by
// shortening it, and returns the number of bytes removed.
int RemoveTrailingWhitespace(StringPiece *text) {
  const char *data = text->data();
  const int size = static_cast<int>(text->size());
  int count = 0;
  // Index from the end instead of keeping a pointer to the last byte, so no
  // pointer before the start of the buffer is formed for empty input.
  // isspace() has undefined behavior for negative arguments other than EOF,
  // so each byte is passed as an unsigned char value.
  while (count < size &&
         isspace(static_cast<unsigned char>(data[size - 1 - count]))) {
    ++count;
  }
  text->remove_suffix(count);
  return count;
}
|
117
|
+
|
118
|
+
// Removes both leading and trailing whitespace from |text| and returns the
// total number of bytes removed.
int RemoveWhitespaceContext(StringPiece *text) {
  // Both helpers modify |text|, and the operands of '+' are evaluated in an
  // unspecified order, so the calls are sequenced explicitly: leading first,
  // then trailing.
  const int leading = RemoveLeadingWhitespace(text);
  const int trailing = RemoveTrailingWhitespace(text);
  return leading + trailing;
}
|
122
|
+
|
123
|
+
namespace {
// Widens one byte to 32 bits. The mask discards the sign extension that
// occurs when plain char is signed.
inline uint32 ByteAs32(char c) { return static_cast<uint32>(c) & 0xff; }

// Assembles four consecutive bytes, least significant byte first, into a
// 32-bit value. Reads ptr[0..3] without any bounds checking.
inline uint32 DecodeFixed32(const char *ptr) {
  uint32 word = 0;
  for (int i = 3; i >= 0; --i) {
    word = (word << 8) | static_cast<uint32>(static_cast<unsigned char>(ptr[i]));
  }
  return word;
}
}  // namespace

// Computes a 32-bit hash of the |n| bytes starting at |data|, mixed with
// |seed|.
uint32 Hash32(const char *data, size_t n, uint32 seed) {
  // Mixing constants (multiplier and shift) generated offline.
  const uint32 m = 0x5bd1e995;
  const int r = 24;

  // Start from a value that depends on both the seed and the length.
  uint32 h = static_cast<uint32>(seed ^ n);

  // Consume the input four bytes at a time.
  for (; n >= 4; n -= 4, data += 4) {
    uint32 k = DecodeFixed32(data);
    k *= m;
    k ^= k >> r;
    k *= m;
    h *= m;
    h ^= k;
  }

  // Fold in the 1..3 leftover bytes, if any. The bytes occupy disjoint bit
  // positions, so combining them first and XOR-ing once gives the same result
  // as XOR-ing them into the hash one by one.
  if (n > 0) {
    uint32 tail = 0;
    for (size_t i = 0; i < n; ++i) {
      tail |= ByteAs32(data[i]) << (8 * i);
    }
    h ^= tail;
    h *= m;
  }

  // Final mixing so that the last bytes influence the whole hash.
  h ^= h >> 13;
  h *= m;
  h ^= h >> 15;
  return h;
}
|
180
|
+
|
181
|
+
uint32 Hash32WithDefaultSeed(const string &input) {
|
182
|
+
return Hash32(input.data(), input.size(), 0xBEEF);
|
183
|
+
}
|
184
|
+
|
185
|
+
// Table of character ranges, each given as a {first, last} pair (single
// characters appear as {x, x}), listed in increasing order and terminated by
// the sentinel entry {-1, -1}.
// NOTE(review): the values look like Unicode code points of punctuation
// characters -- confirm against the users of PunctuationUtil.
PunctuationUtil::CharacterRange PunctuationUtil::kPunctuation[] = {
    {33, 35},       {37, 42},       {44, 47},       {58, 59},
    {63, 64},       {91, 93},       {95, 95},       {123, 123},
    {125, 125},     {161, 161},     {171, 171},     {183, 183},
    {187, 187},     {191, 191},     {894, 894},     {903, 903},
    {1370, 1375},   {1417, 1418},   {1470, 1470},   {1472, 1472},
    {1475, 1475},   {1478, 1478},   {1523, 1524},   {1548, 1549},
    {1563, 1563},   {1566, 1567},   {1642, 1645},   {1748, 1748},
    {1792, 1805},   {2404, 2405},   {2416, 2416},   {3572, 3572},
    {3663, 3663},   {3674, 3675},   {3844, 3858},   {3898, 3901},
    {3973, 3973},   {4048, 4049},   {4170, 4175},   {4347, 4347},
    {4961, 4968},   {5741, 5742},   {5787, 5788},   {5867, 5869},
    {5941, 5942},   {6100, 6102},   {6104, 6106},   {6144, 6154},
    {6468, 6469},   {6622, 6623},   {6686, 6687},   {8208, 8231},
    {8240, 8259},   {8261, 8273},   {8275, 8286},   {8317, 8318},
    {8333, 8334},   {9001, 9002},   {9140, 9142},   {10088, 10101},
    {10181, 10182}, {10214, 10219}, {10627, 10648}, {10712, 10715},
    {10748, 10749}, {11513, 11516}, {11518, 11519}, {11776, 11799},
    {11804, 11805}, {12289, 12291}, {12296, 12305}, {12308, 12319},
    {12336, 12336}, {12349, 12349}, {12448, 12448}, {12539, 12539},
    {64830, 64831}, {65040, 65049}, {65072, 65106}, {65108, 65121},
    {65123, 65123}, {65128, 65128}, {65130, 65131}, {65281, 65283},
    {65285, 65290}, {65292, 65295}, {65306, 65307}, {65311, 65312},
    {65339, 65341}, {65343, 65343}, {65371, 65371}, {65373, 65373},
    {65375, 65381}, {65792, 65793}, {66463, 66463}, {68176, 68184},
    {-1, -1}};
|
211
|
+
|
212
|
+
// Replaces every ASCII digit ('0'..'9') in |*form| with '9', in place. All
// other characters are left untouched.
void NormalizeDigits(string *form) {
  for (char &ch : *form) {
    const bool is_ascii_digit = !(ch < '0' || ch > '9');
    if (is_ascii_digit) ch = '9';
  }
}
|
217
|
+
|
218
|
+
// Returns the length in bytes of the UTF-8 character that starts with the
// byte at |src|, looked up from the high four bits of that byte: 1 for
// 0x00..0xBF, 2 for 0xC0..0xDF, 3 for 0xE0..0xEF and 4 for 0xF0..0xFF.
int OneCharLen(const char *src) {
  // Plain char may be signed, so the byte is converted to unsigned char
  // before it is shifted and used as an index.
  return "\1\1\1\1\1\1\1\1\1\1\1\1\2\2\3\4"
      [static_cast<unsigned char>(*src) >> 4];
}

// Returns the number of bytes of the first UTF-8 character of |utf8_str|, or
// 0 if |utf8_str| starts with a NUL byte.
int UTF8FirstLetterNumBytes(const char *utf8_str) {
  if (*utf8_str == '\0') return 0;
  return OneCharLen(utf8_str);
}

// Appends to |*chars| one string per UTF-8 character of |text|, in order.
//
// Character lengths come from UTF8FirstLetterNumBytes(); the input is not
// validated. Two cases are handled so that the scan always makes progress
// and never reads outside |text|:
//   - an embedded NUL byte (reported as length 0) becomes a one-byte entry;
//   - a multi-byte sequence truncated by the end of |text| becomes an entry
//     holding only the bytes that are actually present.
void GetUTF8Chars(const string &text, std::vector<string> *chars) {
  const char *start = text.data();
  const char *const end = start + text.size();
  while (start < end) {
    const size_t remaining = static_cast<size_t>(end - start);
    size_t char_length = static_cast<size_t>(UTF8FirstLetterNumBytes(start));
    if (char_length == 0) char_length = 1;  // Embedded NUL: avoid looping forever.
    if (char_length > remaining) char_length = remaining;  // Truncated tail.
    chars->emplace_back(start, char_length);
    start += char_length;
  }
}
|
239
|
+
|
240
|
+
} // namespace utils
|
241
|
+
} // namespace chrome_lang_id
|