cld3 3.5.0 → 3.5.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (50) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +0 -8
  3. data/cld3.gemspec +6 -6
  4. data/ext/cld3/extconf.rb +1 -2
  5. data/ext/cld3/nnet_language_identifier_c.cc +162 -70
  6. data/lib/cld3.rb +14 -102
  7. data/sig/cld3.rbs +2 -0
  8. metadata +15 -77
  9. data/ext/cld3/Makefile +0 -268
  10. data/ext/cld3/base.o +0 -0
  11. data/ext/cld3/embedding_feature_extractor.o +0 -0
  12. data/ext/cld3/embedding_network.o +0 -0
  13. data/ext/cld3/feature_extractor.o +0 -0
  14. data/ext/cld3/feature_types.o +0 -0
  15. data/ext/cld3/fixunicodevalue.o +0 -0
  16. data/ext/cld3/fml_parser.o +0 -0
  17. data/ext/cld3/generated_entities.o +0 -0
  18. data/ext/cld3/generated_ulscript.o +0 -0
  19. data/ext/cld3/getonescriptspan.o +0 -0
  20. data/ext/cld3/lang_id_nn_params.o +0 -0
  21. data/ext/cld3/language_identifier_features.o +0 -0
  22. data/ext/cld3/libcld3.def +0 -8
  23. data/ext/cld3/libcld3.so +0 -0
  24. data/ext/cld3/nnet_language_identifier.o +0 -0
  25. data/ext/cld3/nnet_language_identifier_c.o +0 -0
  26. data/ext/cld3/offsetmap.o +0 -0
  27. data/ext/cld3/registry.o +0 -0
  28. data/ext/cld3/relevant_script_feature.o +0 -0
  29. data/ext/cld3/script_span/fixunicodevalue.h +0 -69
  30. data/ext/cld3/script_span/generated_ulscript.h +0 -142
  31. data/ext/cld3/script_span/getonescriptspan.h +0 -124
  32. data/ext/cld3/script_span/integral_types.h +0 -37
  33. data/ext/cld3/script_span/offsetmap.h +0 -168
  34. data/ext/cld3/script_span/port.h +0 -143
  35. data/ext/cld3/script_span/stringpiece.h +0 -81
  36. data/ext/cld3/script_span/text_processing.h +0 -30
  37. data/ext/cld3/script_span/utf8acceptinterchange.h +0 -486
  38. data/ext/cld3/script_span/utf8prop_lettermarkscriptnum.h +0 -1631
  39. data/ext/cld3/script_span/utf8repl_lettermarklower.h +0 -758
  40. data/ext/cld3/script_span/utf8scannot_lettermarkspecial.h +0 -1455
  41. data/ext/cld3/script_span/utf8statetable.h +0 -285
  42. data/ext/cld3/sentence_features.o +0 -0
  43. data/ext/cld3/task_context.o +0 -0
  44. data/ext/cld3/task_context_params.o +0 -0
  45. data/ext/cld3/text_processing.o +0 -0
  46. data/ext/cld3/unicodetext.o +0 -0
  47. data/ext/cld3/utf8statetable.o +0 -0
  48. data/ext/cld3/utils.o +0 -0
  49. data/ext/cld3/workspace.o +0 -0
  50. data/lib/cld3/unstable.rb +0 -58
@@ -1,285 +0,0 @@
1
- // Copyright 2013 Google Inc. All Rights Reserved.
2
- //
3
- // Licensed under the Apache License, Version 2.0 (the "License");
4
- // you may not use this file except in compliance with the License.
5
- // You may obtain a copy of the License at
6
- //
7
- // http://www.apache.org/licenses/LICENSE-2.0
8
- //
9
- // Unless required by applicable law or agreed to in writing, software
10
- // distributed under the License is distributed on an "AS IS" BASIS,
11
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- // See the License for the specific language governing permissions and
13
- // limitations under the License.
14
-
15
- //
16
- // State Table follower for scanning UTF-8 strings without converting to
17
- // 32- or 16-bit Unicode values.
18
- //
19
- // Author: dsites@google.com (Dick Sites)
20
- //
21
-
22
- #ifndef SCRIPT_SPAN_UTF8STATETABLE_H_
23
- #define SCRIPT_SPAN_UTF8STATETABLE_H_
24
-
25
- #include <string>
26
-
27
- #include "integral_types.h" // for uint8, uint32, uint16
28
- #include "stringpiece.h"
29
-
30
- namespace chrome_lang_id {
31
- namespace CLD2 {
32
-
33
- class OffsetMap;
34
-
35
-
36
- // These four-byte entries compactly encode how many bytes 0..255 to delete
37
- // in making a string replacement, how many bytes to add 0..255, and the offset
38
- // 0..64k-1 of the replacement string in remap_string.
39
- struct RemapEntry {
40
- uint8 delete_bytes;
41
- uint8 add_bytes;
42
- uint16 bytes_offset;
43
- };
44
-
45
- // Exit type codes for state tables. All but the first get stuffed into
46
- // signed one-byte entries. The first is only generated by executable code.
47
- // To distinguish from next-state entries, these must be contiguous and
48
- // all <= kExitNone
49
- typedef enum {
50
- kExitDstSpaceFull = 239,
51
- kExitIllegalStructure, // 240
52
- kExitOK, // 241
53
- kExitReject, // ...
54
- kExitReplace1,
55
- kExitReplace2,
56
- kExitReplace3,
57
- kExitReplace21,
58
- kExitReplace31,
59
- kExitReplace32,
60
- kExitReplaceOffset1,
61
- kExitReplaceOffset2,
62
- kExitReplace1S0,
63
- kExitSpecial,
64
- kExitDoAgain,
65
- kExitRejectAlt,
66
- kExitNone // 255
67
- } ExitReason;
68
-
69
- typedef enum {
70
- kExitDstSpaceFull_2 = 32767, // 0x7fff
71
- kExitIllegalStructure_2, // 32768 0x8000
72
- kExitOK_2, // 32769 0x8001
73
- kExitReject_2, // ...
74
- kExitReplace1_2,
75
- kExitReplace2_2,
76
- kExitReplace3_2,
77
- kExitReplace21_2,
78
- kExitReplace31_2,
79
- kExitReplace32_2,
80
- kExitReplaceOffset1_2,
81
- kExitReplaceOffset2_2,
82
- kExitReplace1S0_2,
83
- kExitSpecial_2,
84
- kExitDoAgain_2,
85
- kExitRejectAlt_2,
86
- kExitNone_2 // 32783 0x800f
87
- } ExitReason_2;
88
-
89
-
90
- // This struct represents one entire state table. The three initialized byte
91
- // areas are state_table, remap_base, and remap_string. state0 and state0_size
92
- // give the byte offset and length within state_table of the initial state --
93
- // table lookups are expected to start and end in this state, but for
94
- // truncated UTF-8 strings, may end in a different state. These allow a quick
95
- // test for that condition. entry_shift is 8 for tables subscripted by a full
96
- // byte value and 6 for space-optimized tables subscripted by only six
97
- // significant bits in UTF-8 continuation bytes.
98
- typedef struct {
99
- const uint32 state0;
100
- const uint32 state0_size;
101
- const uint32 total_size;
102
- const int max_expand;
103
- const int entry_shift;
104
- const int bytes_per_entry;
105
- const uint32 losub;
106
- const uint32 hiadd;
107
- const uint8* state_table;
108
- const RemapEntry* remap_base;
109
- const uint8* remap_string;
110
- const uint8* fast_state;
111
- } UTF8StateMachineObj;
112
-
113
- // Near-duplicate declaration for tables with two-byte entries
114
- typedef struct {
115
- const uint32 state0;
116
- const uint32 state0_size;
117
- const uint32 total_size;
118
- const int max_expand;
119
- const int entry_shift;
120
- const int bytes_per_entry;
121
- const uint32 losub;
122
- const uint32 hiadd;
123
- const unsigned short* state_table;
124
- const RemapEntry* remap_base;
125
- const uint8* remap_string;
126
- const uint8* fast_state;
127
- } UTF8StateMachineObj_2;
128
-
129
-
130
- typedef UTF8StateMachineObj UTF8PropObj;
131
- typedef UTF8StateMachineObj UTF8ScanObj;
132
- typedef UTF8StateMachineObj UTF8ReplaceObj;
133
- typedef UTF8StateMachineObj_2 UTF8PropObj_2;
134
- typedef UTF8StateMachineObj_2 UTF8ReplaceObj_2;
135
- // NOT IMPLEMENTED typedef UTF8StateMachineObj_2 UTF8ScanObj_2;
136
-
137
-
138
- // Look up property of one UTF-8 character and advance over it
139
- // Return 0 if input length is zero
140
- // Return 0 and advance one byte if input is ill-formed
141
- uint8 UTF8GenericProperty(const UTF8PropObj* st,
142
- const uint8** src,
143
- int* srclen);
144
-
145
- // Look up property of one UTF-8 character (assumed to be valid).
146
- // (This is a faster version of UTF8GenericProperty.)
147
- bool UTF8HasGenericProperty(const UTF8PropObj& st, const char* src);
148
-
149
-
150
- // BigOneByte versions are needed for tables > 240 states, but most
151
- // won't need the TwoByte versions.
152
-
153
- // Look up property of one UTF-8 character and advance over it
154
- // Return 0 if input length is zero
155
- // Return 0 and advance one byte if input is ill-formed
156
- uint8 UTF8GenericPropertyBigOneByte(const UTF8PropObj* st,
157
- const uint8** src,
158
- int* srclen);
159
-
160
-
161
- // TwoByte versions are needed for tables > 240 states that don't fit onto
162
- // BigOneByte -- rare ultimate fallback
163
-
164
- // Look up property of one UTF-8 character (assumed to be valid).
165
- // (This is a faster version of UTF8GenericProperty.)
166
- bool UTF8HasGenericPropertyBigOneByte(const UTF8PropObj& st, const char* src);
167
-
168
- // Look up property of one UTF-8 character and advance over it
169
- // Return 0 if input length is zero
170
- // Return 0 and advance one byte if input is ill-formed
171
- uint8 UTF8GenericPropertyTwoByte(const UTF8PropObj_2* st,
172
- const uint8** src,
173
- int* srclen);
174
-
175
- // Look up property of one UTF-8 character (assumed to be valid).
176
- // (This is a faster version of UTF8GenericProperty.)
177
- bool UTF8HasGenericPropertyTwoByte(const UTF8PropObj_2& st, const char* src);
178
-
179
- // Scan a UTF-8 stringpiece based on a state table.
180
- // Always scan complete UTF-8 characters
181
- // Set number of bytes scanned. Return reason for exiting
182
- int UTF8GenericScan(const UTF8ScanObj* st,
183
- const StringPiece& str,
184
- int* bytes_consumed);
185
-
186
-
187
-
188
- // Scan a UTF-8 stringpiece based on state table, copying to output stringpiece
189
- // and doing text replacements.
190
- // Always scan complete UTF-8 characters
191
- // Set number of bytes consumed from input, number filled to output.
192
- // Return reason for exiting
193
- // Also writes an optional OffsetMap. Pass NULL to skip writing one.
194
- int UTF8GenericReplace(const UTF8ReplaceObj* st,
195
- const StringPiece& istr,
196
- StringPiece& ostr,
197
- bool is_plain_text,
198
- int* bytes_consumed,
199
- int* bytes_filled,
200
- int* chars_changed,
201
- OffsetMap* offsetmap);
202
-
203
- // Older version without offsetmap
204
- int UTF8GenericReplace(const UTF8ReplaceObj* st,
205
- const StringPiece& istr,
206
- StringPiece& ostr,
207
- bool is_plain_text,
208
- int* bytes_consumed,
209
- int* bytes_filled,
210
- int* chars_changed);
211
-
212
- // Older version without is_plain_text or offsetmap
213
- int UTF8GenericReplace(const UTF8ReplaceObj* st,
214
- const StringPiece& istr,
215
- StringPiece& ostr,
216
- int* bytes_consumed,
217
- int* bytes_filled,
218
- int* chars_changed);
219
-
220
-
221
- // TwoByte version is needed for tables > about 256 states, such
222
- // as the table for full Unicode 4.1 canonical + compatibility mapping
223
-
224
- // Scan a UTF-8 stringpiece based on state table with two-byte entries,
225
- // copying to output stringpiece
226
- // and doing text replacements.
227
- // Always scan complete UTF-8 characters
228
- // Set number of bytes consumed from input, number filled to output.
229
- // Return reason for exiting
230
- // Also writes an optional OffsetMap. Pass NULL to skip writing one.
231
- int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st,
232
- const StringPiece& istr,
233
- StringPiece& ostr,
234
- bool is_plain_text,
235
- int* bytes_consumed,
236
- int* bytes_filled,
237
- int* chars_changed,
238
- OffsetMap* offsetmap);
239
-
240
- // Older version without offsetmap
241
- int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st,
242
- const StringPiece& istr,
243
- StringPiece& ostr,
244
- bool is_plain_text,
245
- int* bytes_consumed,
246
- int* bytes_filled,
247
- int* chars_changed);
248
-
249
- // Older version without is_plain_text or offsetmap
250
- int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st,
251
- const StringPiece& istr,
252
- StringPiece& ostr,
253
- int* bytes_consumed,
254
- int* bytes_filled,
255
- int* chars_changed);
256
-
257
-
258
- static const unsigned char kUTF8LenTbl[256] = {
259
- 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
260
- 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
261
- 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
262
- 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
263
-
264
- 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
265
- 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
266
- 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
267
- 3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4
268
- };
269
-
270
- inline int UTF8OneCharLen(const char* in) {
271
- return kUTF8LenTbl[*reinterpret_cast<const uint8*>(in)];
272
- }
273
-
274
- // Adjust a stringpiece to encompass complete UTF-8 characters.
275
- // The data pointer will be increased by 0..3 bytes to get to a character
276
- // boundary, and the length will then be decreased by 0..3 bytes
277
- // to encompass the last complete character.
278
- // This is useful especially when a UTF-8 string must be put into a fixed-
279
- // maximum-size buffer cleanly, such as a MySQL buffer.
280
- void UTF8TrimToChars(StringPiece* istr);
281
-
282
- } // End namespace CLD2
283
- } // End namespace chrome_lang_id
284
-
285
- #endif // SCRIPT_SPAN_UTF8STATETABLE_H_
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
data/ext/cld3/utils.o DELETED
Binary file
data/ext/cld3/workspace.o DELETED
Binary file
data/lib/cld3/unstable.rb DELETED
@@ -1,58 +0,0 @@
1
-
2
- # Copyright 2021 Akihiko Odaki <akihiko.odaki@gmail.com>
3
- # All Rights Reserved.
4
- #
5
- # Licensed under the Apache License, Version 2.0 (the "License");
6
- # you may not use this file except in compliance with the License.
7
- # You may obtain a copy of the License at
8
- #
9
- # http://www.apache.org/licenses/LICENSE-2.0
10
- #
11
- # Unless required by applicable law or agreed to in writing, software
12
- # distributed under the License is distributed on an "AS IS" BASIS,
13
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
- # See the License for the specific language governing permissions and
15
- # limitations under the License.
16
- # ==============================================================================
17
-
18
- module CLD3
19
- module Unstable
20
- extend FFI::Library
21
-
22
- ffi_lib File.join(__dir__, "..", "..", "ext", "cld3", "libcld3." + RbConfig::CONFIG["DLEXT"])
23
-
24
- module NNetLanguageIdentifier
25
- class Pointer < FFI::AutoPointer
26
- def self.release(pointer)
27
- Unstable.delete_NNetLanguageIdentifier(pointer)
28
- end
29
- end
30
-
31
- class SpanInfo < FFI::Struct
32
- layout :start_index, :int, :end_index, :int, :probability, :float
33
- end
34
-
35
- class Result < FFI::Struct
36
- layout :language_data, :pointer, :language_size, :size_t, :byte_ranges_data, :pointer, :byte_ranges_size, :size_t, :probability, :float, :proportion, :float, :reliable?, :bool
37
- end
38
- end
39
-
40
- attach_function :delete_NNetLanguageIdentifier, [ :pointer ], :void
41
-
42
- attach_function :delete_result, [ :pointer ], :void
43
-
44
- attach_function :delete_results, [ :pointer ], :void
45
-
46
- attach_function :new_NNetLanguageIdentifier, [ :int, :int ], :pointer
47
-
48
- attach_function :refer_to_nth_result, [ :pointer, :size_t ], NNetLanguageIdentifier::Result.by_value
49
-
50
- attach_function :NNetLanguageIdentifier_find_language,
51
- [ :pointer, :buffer_in, :size_t ], :pointer
52
-
53
- attach_function :NNetLanguageIdentifier_find_top_n_most_freq_langs,
54
- [ :pointer, :buffer_in, :size_t, :int ], :pointer
55
- end
56
-
57
- private_constant :Unstable
58
- end