cld3 3.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72) hide show
  1. checksums.yaml +7 -0
  2. data/Gemfile +18 -0
  3. data/LICENSE +204 -0
  4. data/LICENSE_CLD3 +203 -0
  5. data/README.md +22 -0
  6. data/cld3.gemspec +35 -0
  7. data/ext/cld3/base.cc +36 -0
  8. data/ext/cld3/base.h +106 -0
  9. data/ext/cld3/casts.h +98 -0
  10. data/ext/cld3/embedding_feature_extractor.cc +51 -0
  11. data/ext/cld3/embedding_feature_extractor.h +182 -0
  12. data/ext/cld3/embedding_network.cc +196 -0
  13. data/ext/cld3/embedding_network.h +186 -0
  14. data/ext/cld3/embedding_network_params.h +285 -0
  15. data/ext/cld3/extconf.rb +49 -0
  16. data/ext/cld3/feature_extractor.cc +137 -0
  17. data/ext/cld3/feature_extractor.h +633 -0
  18. data/ext/cld3/feature_extractor.proto +50 -0
  19. data/ext/cld3/feature_types.cc +72 -0
  20. data/ext/cld3/feature_types.h +158 -0
  21. data/ext/cld3/fixunicodevalue.cc +55 -0
  22. data/ext/cld3/fixunicodevalue.h +69 -0
  23. data/ext/cld3/float16.h +58 -0
  24. data/ext/cld3/fml_parser.cc +308 -0
  25. data/ext/cld3/fml_parser.h +123 -0
  26. data/ext/cld3/generated_entities.cc +296 -0
  27. data/ext/cld3/generated_ulscript.cc +678 -0
  28. data/ext/cld3/generated_ulscript.h +142 -0
  29. data/ext/cld3/getonescriptspan.cc +1109 -0
  30. data/ext/cld3/getonescriptspan.h +124 -0
  31. data/ext/cld3/integral_types.h +37 -0
  32. data/ext/cld3/lang_id_nn_params.cc +57449 -0
  33. data/ext/cld3/lang_id_nn_params.h +178 -0
  34. data/ext/cld3/language_identifier_features.cc +165 -0
  35. data/ext/cld3/language_identifier_features.h +116 -0
  36. data/ext/cld3/nnet_language_identifier.cc +380 -0
  37. data/ext/cld3/nnet_language_identifier.h +175 -0
  38. data/ext/cld3/nnet_language_identifier_c.cc +72 -0
  39. data/ext/cld3/offsetmap.cc +478 -0
  40. data/ext/cld3/offsetmap.h +168 -0
  41. data/ext/cld3/port.h +143 -0
  42. data/ext/cld3/registry.cc +28 -0
  43. data/ext/cld3/registry.h +242 -0
  44. data/ext/cld3/relevant_script_feature.cc +89 -0
  45. data/ext/cld3/relevant_script_feature.h +49 -0
  46. data/ext/cld3/script_detector.h +156 -0
  47. data/ext/cld3/sentence.proto +77 -0
  48. data/ext/cld3/sentence_features.cc +29 -0
  49. data/ext/cld3/sentence_features.h +35 -0
  50. data/ext/cld3/simple_adder.h +72 -0
  51. data/ext/cld3/stringpiece.h +81 -0
  52. data/ext/cld3/task_context.cc +161 -0
  53. data/ext/cld3/task_context.h +81 -0
  54. data/ext/cld3/task_context_params.cc +74 -0
  55. data/ext/cld3/task_context_params.h +54 -0
  56. data/ext/cld3/task_spec.proto +98 -0
  57. data/ext/cld3/text_processing.cc +245 -0
  58. data/ext/cld3/text_processing.h +30 -0
  59. data/ext/cld3/unicodetext.cc +96 -0
  60. data/ext/cld3/unicodetext.h +144 -0
  61. data/ext/cld3/utf8acceptinterchange.h +486 -0
  62. data/ext/cld3/utf8prop_lettermarkscriptnum.h +1631 -0
  63. data/ext/cld3/utf8repl_lettermarklower.h +758 -0
  64. data/ext/cld3/utf8scannot_lettermarkspecial.h +1455 -0
  65. data/ext/cld3/utf8statetable.cc +1344 -0
  66. data/ext/cld3/utf8statetable.h +285 -0
  67. data/ext/cld3/utils.cc +241 -0
  68. data/ext/cld3/utils.h +144 -0
  69. data/ext/cld3/workspace.cc +64 -0
  70. data/ext/cld3/workspace.h +177 -0
  71. data/lib/cld3.rb +99 -0
  72. metadata +158 -0
@@ -0,0 +1,98 @@
1
+ /* Copyright 2016 Google Inc. All Rights Reserved.
2
+
3
+ Licensed under the Apache License, Version 2.0 (the "License");
4
+ you may not use this file except in compliance with the License.
5
+ You may obtain a copy of the License at
6
+
7
+ http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+ Unless required by applicable law or agreed to in writing, software
10
+ distributed under the License is distributed on an "AS IS" BASIS,
11
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ See the License for the specific language governing permissions and
13
+ limitations under the License.
14
+ ==============================================================================*/
15
+
16
+ // LINT: ALLOW_GROUPS
17
+ // Protocol buffer specifications for task configuration.
18
+
19
+ syntax = "proto2";
20
+ option optimize_for = LITE_RUNTIME;
21
+
22
+ package chrome_lang_id;
23
+
24
+ // Task input descriptor: names one input resource and describes its formats.
25
+ message TaskInput {
26
+ // Name of the input resource (required).
27
+ required string name = 1;
28
+
29
+ // Name of the stage responsible for creating this resource.
30
+ optional string creator = 2;
31
+
32
+ // File format(s) for the resource; may be repeated.
33
+ repeated string file_format = 3;
34
+
35
+ // Record format(s) for the resource; may be repeated.
36
+ repeated string record_format = 4;
37
+
38
+ // Whether this resource is multi-file. Defaults to false.
39
+ optional bool multi_file = 5 [default = false];
40
+
41
+ // An input can consist of multiple file sets; each Part describes one set.
42
+ repeated group Part = 6 {
43
+ // File pattern for the file set.
44
+ optional string file_pattern = 7;
45
+
46
+ // File format for the file set.
47
+ optional string file_format = 8;
48
+
49
+ // Record format for the file set.
50
+ optional string record_format = 9;
51
+ }
52
+ }
53
+
54
+ // Task output descriptor: names one output resource and describes its files.
55
+ message TaskOutput {
56
+ // Name of the output resource (required).
57
+ required string name = 1;
58
+
59
+ // File format for the output resource.
60
+ optional string file_format = 2;
61
+
62
+ // Record format for the output resource.
63
+ optional string record_format = 3;
64
+
65
+ // Number of shards in the output. A non-zero value means the output is
66
+ // sharded. A value of -1 means that the output is sharded but the number of
67
+ // shards is unknown; the files are then named 'base-*-of-*'. Defaults to 0
68
+ // (not sharded).
69
+ optional int32 shards = 4 [default = 0];
70
+
71
+ // Base file name for the output resource. If this is not set by the task
72
+ // component, it is set to a default value by the workflow engine.
73
+ optional string file_base = 5;
74
+
75
+ // Optional extension added to the file name.
76
+ optional string file_extension = 6;
77
+ }
78
+
79
+ // A task specification describes the execution parameters of a task.
80
+ message TaskSpec {
81
+ // Name of the task.
82
+ optional string task_name = 1;
83
+
84
+ // Workflow task type.
85
+ optional string task_type = 2;
86
+
87
+ // Task parameters, each a required name with an optional value.
88
+ repeated group Parameter = 3 {
89
+ required string name = 4;
90
+ optional string value = 5;
91
+ }
92
+
93
+ // Task inputs (see TaskInput).
94
+ repeated TaskInput input = 6;
95
+
96
+ // Task outputs (see TaskOutput).
97
+ repeated TaskOutput output = 7;
98
+ }
@@ -0,0 +1,245 @@
1
+ // Copyright 2013 Google Inc. All Rights Reserved.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+
15
+ #include "text_processing.h"
16
+
17
+ #include <stdio.h>
18
+ #include <string.h>
19
+
20
+ namespace chrome_lang_id {
21
+ namespace CLD2 {
22
+ namespace {
23
+
24
// Upper bound, in bytes, on how far the word-boundary scans below will look.
static const int kMaxSpaceScan = 32;

// Returns the smaller of the two arguments.
int minint(int a, int b) {
  if (a < b) {
    return a;
  }
  return b;
}
27
+
28
// Counts the ASCII space bytes in the leading part of src whose length is
// src_len rounded down to a multiple of four. The 0..3 leftover bytes at the
// end are deliberately not examined.
int CountSpaces4(const char *src, int src_len) {
  // Clear the two low bits: the largest multiple of 4 not exceeding src_len.
  const int counted_len = src_len & ~3;

  int spaces = 0;
  for (int pos = 0; pos < counted_len; ++pos) {
    if (src[pos] == ' ') {
      ++spaces;
    }
  }
  return spaces;
}
40
+
41
// Uses a cheap predictor to get a measure of compression, and hence a measure
// of repetitiveness. It works on complete UTF-8 characters instead of bytes,
// because three-byte UTF-8 Indic, etc. text compresses highly all the time
// when done with a byte-based count.
//
// To allow running prediction across multiple chunks, the caller passes in the
// current 12-bit hash value (*hash) and an int[4096] prediction table (tbl).
// The caller initializes both to 0 before the first chunk; *hash is updated on
// return so the next chunk continues where this one stopped.
//
// Returns the number of *bytes* correctly predicted; the count grows by 1..4
// for each correctly predicted character.
//
// A character whose lead byte announces more bytes than remain before
// isrc + src_len is truncated to the bytes that are actually available, so
// this function never reads past the end of the chunk and never returns more
// than src_len.
//
// TODO(dsites) make this use just one byte per UTF-8 char and incr by charlen
int CountPredictedBytes(const char *isrc, int src_len, int *hash, int *tbl) {
  typedef unsigned char uint8;

  const uint8 *src = reinterpret_cast<const uint8 *>(isrc);
  const uint8 *const srclimit = src + src_len;
  int p_count = 0;
  int local_hash = *hash;

  while (src < srclimit) {
    const unsigned int lead = src[0];

    // Length of the character as announced by its first byte.
    int incr = 1;
    if (lead < 0xc0) {
      // One-byte or continuation byte: 00xxxxxx 01xxxxxx 10xxxxxx
    } else if ((lead & 0xe0) == 0xc0) {
      incr = 2;  // Two-byte
    } else if ((lead & 0xf0) == 0xe0) {
      incr = 3;  // Three-byte
    } else {
      incr = 4;  // Four-byte
    }

    // Do not run off the end of the chunk on a truncated character.
    const int available = static_cast<int>(srclimit - src);
    if (incr > available) {
      incr = available;
    }

    // Pack the character's bytes big-endian into one value. Unsigned
    // arithmetic keeps the shift well defined when the top bit gets set.
    unsigned int packed = lead;
    for (int i = 1; i < incr; ++i) {
      packed = (packed << 8) | src[i];
    }
    const int c = static_cast<int>(packed);
    src += incr;

    const int p = tbl[local_hash];  // Prediction
    tbl[local_hash] = c;            // Update prediction
    if (c == p) {
      p_count += incr;  // Count bytes of good predictions
    }

    local_hash = ((local_hash << 4) ^ c) & 0xfff;
  }
  *hash = local_hash;
  return p_count;
}
99
+
100
+ // Backscan to word boundary, returning how many bytes n to go back
101
+ // so that src - n is non-space ans src - n - 1 is space.
102
+ // If not found in kMaxSpaceScan bytes, return 0..3 to a clean UTF-8 boundary
103
+ int BackscanToSpace(const char *src, int limit) {
104
+ int n = 0;
105
+ limit = minint(limit, kMaxSpaceScan);
106
+ while (n < limit) {
107
+ if (src[-n - 1] == ' ') {
108
+ return n;
109
+ } // We are at _X
110
+ ++n;
111
+ }
112
+ n = 0;
113
+ while (n < limit) {
114
+ if ((src[-n] & 0xc0) != 0x80) {
115
+ return n;
116
+ } // We are at char begin
117
+ ++n;
118
+ }
119
+ return 0;
120
+ }
121
+
122
+ // Forwardscan to word boundary, returning how many bytes n to go forward
123
+ // so that src + n is non-space ans src + n - 1 is space.
124
+ // If not found in kMaxSpaceScan bytes, return 0..3 to a clean UTF-8 boundary
125
+ int ForwardscanToSpace(const char *src, int limit) {
126
+ int n = 0;
127
+ limit = minint(limit, kMaxSpaceScan);
128
+ while (n < limit) {
129
+ if (src[n] == ' ') {
130
+ return n + 1;
131
+ } // We are at _X
132
+ ++n;
133
+ }
134
+ n = 0;
135
+ while (n < limit) {
136
+ if ((src[n] & 0xc0) != 0x80) {
137
+ return n;
138
+ } // We are at char begin
139
+ ++n;
140
+ }
141
+ return 0;
142
+ }
143
+
144
+ } // namespace
145
+
146
+ // Must be exactly 4096 for cheap compressor.
147
+ static const int kPredictionTableSize = 4096;
148
+ static const int kChunksizeDefault = 48; // Squeeze 48-byte chunks
149
+ static const int kSpacesThreshPercent = 30; // Squeeze if >=30% spaces
150
+ static const int kPredictThreshPercent = 40; // Squeeze if >=40% predicted
151
+
152
+ // Remove portions of text that have a high density of spaces, or that are
153
+ // overly repetitive, squeezing the remaining text in-place to the front of the
154
+ // input buffer.
155
+ //
156
+ // Squeezing looks at density of space/prediced chars in fixed-size chunks,
157
+ // specified by chunksize. A chunksize <= 0 uses the default size of 48 bytes.
158
+ //
159
+ // Return the new, possibly-shorter length
160
+ //
161
+ // Result Buffer ALWAYS has leading space and trailing space space space NUL,
162
+ // if input does
163
+ //
164
+ int CheapSqueezeInplace(char *isrc, int src_len, int ichunksize) {
165
+ char *src = isrc;
166
+ char *dst = src;
167
+ char *srclimit = src + src_len;
168
+ bool skipping = false;
169
+
170
+ int hash = 0;
171
+
172
+ // Allocate local prediction table.
173
+ int *predict_tbl = new int[kPredictionTableSize];
174
+ memset(predict_tbl, 0, kPredictionTableSize * sizeof(predict_tbl[0]));
175
+
176
+ int chunksize = ichunksize;
177
+ if (chunksize == 0) {
178
+ chunksize = kChunksizeDefault;
179
+ }
180
+ int space_thresh = (chunksize * kSpacesThreshPercent) / 100;
181
+ int predict_thresh = (chunksize * kPredictThreshPercent) / 100;
182
+
183
+ while (src < srclimit) {
184
+ int remaining_bytes = srclimit - src;
185
+ int len = minint(chunksize, remaining_bytes);
186
+
187
+ // Make len land us on a UTF-8 character boundary.
188
+ // Ah. Also fixes mispredict because we could get out of phase
189
+ // Loop always terminates at trailing space in buffer
190
+ while ((src[len] & 0xc0) == 0x80) {
191
+ ++len;
192
+ } // Move past continuation bytes
193
+
194
+ int space_n = CountSpaces4(src, len);
195
+ int predb_n = CountPredictedBytes(src, len, &hash, predict_tbl);
196
+ if ((space_n >= space_thresh) || (predb_n >= predict_thresh)) {
197
+ // Skip the text
198
+ if (!skipping) {
199
+ // Keeping-to-skipping transition; do it at a space
200
+ int n = BackscanToSpace(dst, static_cast<int>(dst - isrc));
201
+ dst -= n;
202
+ if (dst == isrc) {
203
+ // Force a leading space if the first chunk is deleted
204
+ *dst++ = ' ';
205
+ }
206
+ skipping = true;
207
+ }
208
+ } else {
209
+ // Keep the text
210
+ if (skipping) {
211
+ // Skipping-to-keeping transition; do it at a space
212
+ int n = ForwardscanToSpace(src, len);
213
+ src += n;
214
+ remaining_bytes -= n; // Shrink remaining length
215
+ len -= n;
216
+ skipping = false;
217
+ }
218
+
219
+ // "len" can be negative in some cases
220
+ if (len > 0) {
221
+ memmove(dst, src, len);
222
+ dst += len;
223
+ }
224
+ }
225
+ src += len;
226
+ }
227
+
228
+ if ((dst - isrc) < (src_len - 3)) {
229
+ // Pad and make last char clean UTF-8 by putting following spaces
230
+ dst[0] = ' ';
231
+ dst[1] = ' ';
232
+ dst[2] = ' ';
233
+ dst[3] = '\0';
234
+ } else if ((dst - isrc) < src_len) {
235
+ // Make last char clean UTF-8 by putting following space off the end
236
+ dst[0] = ' ';
237
+ }
238
+
239
+ // Deallocate local prediction table
240
+ delete[] predict_tbl;
241
+ return static_cast<int>(dst - isrc);
242
+ }
243
+
244
+ } // namespace CLD2
245
+ } // namespace chrome_lang_id
@@ -0,0 +1,30 @@
1
+ // Copyright 2013 Google Inc. All Rights Reserved.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+
15
+ #ifndef SCRIPT_SPAN_TEXT_PROCESSING_H_
16
+ #define SCRIPT_SPAN_TEXT_PROCESSING_H_
17
+
18
+ namespace chrome_lang_id {
19
+ namespace CLD2 {
20
+
21
+ // Remove portions of text that have a high density of spaces, or that are
22
+ // overly repetitive, squeezing the remaining text in-place to the front
23
+ // of the input buffer.
24
+ // Return the new, possibly-shorter length
25
+ int CheapSqueezeInplace(char *isrc, int srclen, int ichunksize);
26
+
27
+ } // namespace CLD2
28
+ } // namespace chrome_lang_id
29
+
30
+ #endif // SCRIPT_SPAN_TEXT_PROCESSING_H_
@@ -0,0 +1,96 @@
1
+ // Copyright (C) 2006 Google Inc.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+
15
+ // Author: Jim Meehan
16
+
17
+ #include "unicodetext.h"
18
+
19
+ #include "base.h"
20
+ #include "utils.h"
21
+
22
+ namespace chrome_lang_id {
23
+
24
+ // *************** Data representation **********
25
+ // Note: the copy constructor is undefined.
26
+
27
+ void UnicodeText::Repr::PointTo(const char *data, int size) {
28
+ if (ours_ && data_) delete[] data_; // If we owned the old buffer, free it.
29
+ data_ = const_cast<char *>(data);
30
+ size_ = size;
31
+ capacity_ = size;
32
+ ours_ = false;
33
+ }
34
+
35
+ // *************** UnicodeText ******************
36
+
37
+ UnicodeText::UnicodeText() {}
38
+
39
+ UnicodeText &UnicodeText::PointToUTF8(const char *buffer, int byte_length) {
40
+ repr_.PointTo(buffer, byte_length);
41
+ return *this;
42
+ }
43
+
44
+ UnicodeText::~UnicodeText() {}
45
+
46
+ // ******************* UnicodeText::const_iterator *********************
47
+
48
+ // The implementation of const_iterator would be nicer if it
49
+ // inherited from boost::iterator_facade
50
+ // (http://boost.org/libs/iterator/doc/iterator_facade.html).
51
+
52
+ UnicodeText::const_iterator::const_iterator() : it_(0) {}
53
+
54
+ UnicodeText::const_iterator &UnicodeText::const_iterator::operator=(
55
+ const const_iterator &other) {
56
+ if (&other != this) it_ = other.it_;
57
+ return *this;
58
+ }
59
+
60
+ UnicodeText::const_iterator UnicodeText::begin() const {
61
+ return const_iterator(repr_.data_);
62
+ }
63
+
64
+ UnicodeText::const_iterator UnicodeText::end() const {
65
+ return const_iterator(repr_.data_ + repr_.size_);
66
+ }
67
+
68
+ char32 UnicodeText::const_iterator::operator*() const {
69
+ // (We could call chartorune here, but that does some
70
+ // error-checking, and we're guaranteed that our data is valid
71
+ // UTF-8. Also, we expect this routine to be called very often. So
72
+ // for speed, we do the calculation ourselves.)
73
+
74
+ // Convert from UTF-8
75
+ unsigned char byte1 = static_cast<unsigned char>(it_[0]);
76
+ if (byte1 < 0x80) return byte1;
77
+
78
+ unsigned char byte2 = static_cast<unsigned char>(it_[1]);
79
+ if (byte1 < 0xE0) return ((byte1 & 0x1F) << 6) | (byte2 & 0x3F);
80
+
81
+ unsigned char byte3 = static_cast<unsigned char>(it_[2]);
82
+ if (byte1 < 0xF0) {
83
+ return ((byte1 & 0x0F) << 12) | ((byte2 & 0x3F) << 6) | (byte3 & 0x3F);
84
+ }
85
+
86
+ unsigned char byte4 = static_cast<unsigned char>(it_[3]);
87
+ return ((byte1 & 0x07) << 18) | ((byte2 & 0x3F) << 12) |
88
+ ((byte3 & 0x3F) << 6) | (byte4 & 0x3F);
89
+ }
90
+
91
+ UnicodeText::const_iterator &UnicodeText::const_iterator::operator++() {
92
+ it_ += chrome_lang_id::utils::OneCharLen(it_);
93
+ return *this;
94
+ }
95
+
96
+ } // namespace chrome_lang_id