cld3 3.5.0 → 3.5.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +0 -8
- data/cld3.gemspec +6 -6
- data/ext/cld3/extconf.rb +1 -2
- data/ext/cld3/nnet_language_identifier_c.cc +162 -70
- data/lib/cld3.rb +14 -102
- data/sig/cld3.rbs +2 -0
- metadata +15 -77
- data/ext/cld3/Makefile +0 -268
- data/ext/cld3/base.o +0 -0
- data/ext/cld3/embedding_feature_extractor.o +0 -0
- data/ext/cld3/embedding_network.o +0 -0
- data/ext/cld3/feature_extractor.o +0 -0
- data/ext/cld3/feature_types.o +0 -0
- data/ext/cld3/fixunicodevalue.o +0 -0
- data/ext/cld3/fml_parser.o +0 -0
- data/ext/cld3/generated_entities.o +0 -0
- data/ext/cld3/generated_ulscript.o +0 -0
- data/ext/cld3/getonescriptspan.o +0 -0
- data/ext/cld3/lang_id_nn_params.o +0 -0
- data/ext/cld3/language_identifier_features.o +0 -0
- data/ext/cld3/libcld3.def +0 -8
- data/ext/cld3/libcld3.so +0 -0
- data/ext/cld3/nnet_language_identifier.o +0 -0
- data/ext/cld3/nnet_language_identifier_c.o +0 -0
- data/ext/cld3/offsetmap.o +0 -0
- data/ext/cld3/registry.o +0 -0
- data/ext/cld3/relevant_script_feature.o +0 -0
- data/ext/cld3/script_span/fixunicodevalue.h +0 -69
- data/ext/cld3/script_span/generated_ulscript.h +0 -142
- data/ext/cld3/script_span/getonescriptspan.h +0 -124
- data/ext/cld3/script_span/integral_types.h +0 -37
- data/ext/cld3/script_span/offsetmap.h +0 -168
- data/ext/cld3/script_span/port.h +0 -143
- data/ext/cld3/script_span/stringpiece.h +0 -81
- data/ext/cld3/script_span/text_processing.h +0 -30
- data/ext/cld3/script_span/utf8acceptinterchange.h +0 -486
- data/ext/cld3/script_span/utf8prop_lettermarkscriptnum.h +0 -1631
- data/ext/cld3/script_span/utf8repl_lettermarklower.h +0 -758
- data/ext/cld3/script_span/utf8scannot_lettermarkspecial.h +0 -1455
- data/ext/cld3/script_span/utf8statetable.h +0 -285
- data/ext/cld3/sentence_features.o +0 -0
- data/ext/cld3/task_context.o +0 -0
- data/ext/cld3/task_context_params.o +0 -0
- data/ext/cld3/text_processing.o +0 -0
- data/ext/cld3/unicodetext.o +0 -0
- data/ext/cld3/utf8statetable.o +0 -0
- data/ext/cld3/utils.o +0 -0
- data/ext/cld3/workspace.o +0 -0
- data/lib/cld3/unstable.rb +0 -58
@@ -1,285 +0,0 @@
|
|
1
|
-
// Copyright 2013 Google Inc. All Rights Reserved.
|
2
|
-
//
|
3
|
-
// Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
-
// you may not use this file except in compliance with the License.
|
5
|
-
// You may obtain a copy of the License at
|
6
|
-
//
|
7
|
-
// http://www.apache.org/licenses/LICENSE-2.0
|
8
|
-
//
|
9
|
-
// Unless required by applicable law or agreed to in writing, software
|
10
|
-
// distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
-
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
-
// See the License for the specific language governing permissions and
|
13
|
-
// limitations under the License.
|
14
|
-
|
15
|
-
//
|
16
|
-
// State Table follower for scanning UTF-8 strings without converting to
|
17
|
-
// 32- or 16-bit Unicode values.
|
18
|
-
//
|
19
|
-
// Author: dsites@google.com (Dick Sites)
|
20
|
-
//
|
21
|
-
|
22
|
-
#ifndef SCRIPT_SPAN_UTF8STATETABLE_H_
|
23
|
-
#define SCRIPT_SPAN_UTF8STATETABLE_H_
|
24
|
-
|
25
|
-
#include <string>
|
26
|
-
|
27
|
-
#include "integral_types.h" // for uint8, uint32, uint16
|
28
|
-
#include "stringpiece.h"
|
29
|
-
|
30
|
-
namespace chrome_lang_id {
|
31
|
-
namespace CLD2 {
|
32
|
-
|
33
|
-
class OffsetMap;
|
34
|
-
|
35
|
-
|
36
|
-
// These four-byte entries compactly encode how many bytes 0..255 to delete
|
37
|
-
// in making a string replacement, how many bytes to add 0..255, and the offset
|
38
|
-
// 0..64k-1 of the replacement string in remap_string.
|
39
|
-
struct RemapEntry {
|
40
|
-
uint8 delete_bytes;
|
41
|
-
uint8 add_bytes;
|
42
|
-
uint16 bytes_offset;
|
43
|
-
};
|
44
|
-
|
45
|
-
// Exit type codes for state tables. All but the first get stuffed into
|
46
|
-
// signed one-byte entries. The first is only generated by executable code.
|
47
|
-
// To distinguish from next-state entries, these must be contiguous and
|
48
|
-
// all <= kExitNone
|
49
|
-
typedef enum {
|
50
|
-
kExitDstSpaceFull = 239,
|
51
|
-
kExitIllegalStructure, // 240
|
52
|
-
kExitOK, // 241
|
53
|
-
kExitReject, // ...
|
54
|
-
kExitReplace1,
|
55
|
-
kExitReplace2,
|
56
|
-
kExitReplace3,
|
57
|
-
kExitReplace21,
|
58
|
-
kExitReplace31,
|
59
|
-
kExitReplace32,
|
60
|
-
kExitReplaceOffset1,
|
61
|
-
kExitReplaceOffset2,
|
62
|
-
kExitReplace1S0,
|
63
|
-
kExitSpecial,
|
64
|
-
kExitDoAgain,
|
65
|
-
kExitRejectAlt,
|
66
|
-
kExitNone // 255
|
67
|
-
} ExitReason;
|
68
|
-
|
69
|
-
typedef enum {
|
70
|
-
kExitDstSpaceFull_2 = 32767, // 0x7fff
|
71
|
-
kExitIllegalStructure_2, // 32768 0x8000
|
72
|
-
kExitOK_2, // 32769 0x8001
|
73
|
-
kExitReject_2, // ...
|
74
|
-
kExitReplace1_2,
|
75
|
-
kExitReplace2_2,
|
76
|
-
kExitReplace3_2,
|
77
|
-
kExitReplace21_2,
|
78
|
-
kExitReplace31_2,
|
79
|
-
kExitReplace32_2,
|
80
|
-
kExitReplaceOffset1_2,
|
81
|
-
kExitReplaceOffset2_2,
|
82
|
-
kExitReplace1S0_2,
|
83
|
-
kExitSpecial_2,
|
84
|
-
kExitDoAgain_2,
|
85
|
-
kExitRejectAlt_2,
|
86
|
-
kExitNone_2 // 32783 0x800f
|
87
|
-
} ExitReason_2;
|
88
|
-
|
89
|
-
|
90
|
-
// This struct represents one entire state table. The three initialized byte
|
91
|
-
// areas are state_table, remap_base, and remap_string. state0 and state0_size
|
92
|
-
// give the byte offset and length within state_table of the initial state --
|
93
|
-
// table lookups are expected to start and end in this state, but for
|
94
|
-
// truncated UTF-8 strings, may end in a different state. These allow a quick
|
95
|
-
// test for that condition. entry_shift is 8 for tables subscripted by a full
|
96
|
-
// byte value and 6 for space-optimized tables subscripted by only six
|
97
|
-
// significant bits in UTF-8 continuation bytes.
|
98
|
-
typedef struct {
|
99
|
-
const uint32 state0;
|
100
|
-
const uint32 state0_size;
|
101
|
-
const uint32 total_size;
|
102
|
-
const int max_expand;
|
103
|
-
const int entry_shift;
|
104
|
-
const int bytes_per_entry;
|
105
|
-
const uint32 losub;
|
106
|
-
const uint32 hiadd;
|
107
|
-
const uint8* state_table;
|
108
|
-
const RemapEntry* remap_base;
|
109
|
-
const uint8* remap_string;
|
110
|
-
const uint8* fast_state;
|
111
|
-
} UTF8StateMachineObj;
|
112
|
-
|
113
|
-
// Near-duplicate declaration for tables with two-byte entries
|
114
|
-
typedef struct {
|
115
|
-
const uint32 state0;
|
116
|
-
const uint32 state0_size;
|
117
|
-
const uint32 total_size;
|
118
|
-
const int max_expand;
|
119
|
-
const int entry_shift;
|
120
|
-
const int bytes_per_entry;
|
121
|
-
const uint32 losub;
|
122
|
-
const uint32 hiadd;
|
123
|
-
const unsigned short* state_table;
|
124
|
-
const RemapEntry* remap_base;
|
125
|
-
const uint8* remap_string;
|
126
|
-
const uint8* fast_state;
|
127
|
-
} UTF8StateMachineObj_2;
|
128
|
-
|
129
|
-
|
130
|
-
typedef UTF8StateMachineObj UTF8PropObj;
|
131
|
-
typedef UTF8StateMachineObj UTF8ScanObj;
|
132
|
-
typedef UTF8StateMachineObj UTF8ReplaceObj;
|
133
|
-
typedef UTF8StateMachineObj_2 UTF8PropObj_2;
|
134
|
-
typedef UTF8StateMachineObj_2 UTF8ReplaceObj_2;
|
135
|
-
// NOT IMPLEMENTED typedef UTF8StateMachineObj_2 UTF8ScanObj_2;
|
136
|
-
|
137
|
-
|
138
|
-
// Look up property of one UTF-8 character and advance over it
|
139
|
-
// Return 0 if input length is zero
|
140
|
-
// Return 0 and advance one byte if input is ill-formed
|
141
|
-
uint8 UTF8GenericProperty(const UTF8PropObj* st,
|
142
|
-
const uint8** src,
|
143
|
-
int* srclen);
|
144
|
-
|
145
|
-
// Look up property of one UTF-8 character (assumed to be valid).
|
146
|
-
// (This is a faster version of UTF8GenericProperty.)
|
147
|
-
bool UTF8HasGenericProperty(const UTF8PropObj& st, const char* src);
|
148
|
-
|
149
|
-
|
150
|
-
// BigOneByte versions are needed for tables > 240 states, but most
|
151
|
-
// won't need the TwoByte versions.
|
152
|
-
|
153
|
-
// Look up property of one UTF-8 character and advance over it
|
154
|
-
// Return 0 if input length is zero
|
155
|
-
// Return 0 and advance one byte if input is ill-formed
|
156
|
-
uint8 UTF8GenericPropertyBigOneByte(const UTF8PropObj* st,
|
157
|
-
const uint8** src,
|
158
|
-
int* srclen);
|
159
|
-
|
160
|
-
|
161
|
-
// TwoByte versions are needed for tables > 240 states that don't fit onto
|
162
|
-
// BigOneByte -- rare ultimate fallback
|
163
|
-
|
164
|
-
// Look up property of one UTF-8 character (assumed to be valid).
|
165
|
-
// (This is a faster version of UTF8GenericProperty.)
|
166
|
-
bool UTF8HasGenericPropertyBigOneByte(const UTF8PropObj& st, const char* src);
|
167
|
-
|
168
|
-
// Look up property of one UTF-8 character and advance over it
|
169
|
-
// Return 0 if input length is zero
|
170
|
-
// Return 0 and advance one byte if input is ill-formed
|
171
|
-
uint8 UTF8GenericPropertyTwoByte(const UTF8PropObj_2* st,
|
172
|
-
const uint8** src,
|
173
|
-
int* srclen);
|
174
|
-
|
175
|
-
// Look up property of one UTF-8 character (assumed to be valid).
|
176
|
-
// (This is a faster version of UTF8GenericProperty.)
|
177
|
-
bool UTF8HasGenericPropertyTwoByte(const UTF8PropObj_2& st, const char* src);
|
178
|
-
|
179
|
-
// Scan a UTF-8 stringpiece based on a state table.
|
180
|
-
// Always scan complete UTF-8 characters
|
181
|
-
// Set number of bytes scanned. Return reason for exiting
|
182
|
-
int UTF8GenericScan(const UTF8ScanObj* st,
|
183
|
-
const StringPiece& str,
|
184
|
-
int* bytes_consumed);
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
// Scan a UTF-8 stringpiece based on state table, copying to output stringpiece
|
189
|
-
// and doing text replacements.
|
190
|
-
// Always scan complete UTF-8 characters
|
191
|
-
// Set number of bytes consumed from input, number filled to output.
|
192
|
-
// Return reason for exiting
|
193
|
-
// Also writes an optional OffsetMap. Pass NULL to skip writing one.
|
194
|
-
int UTF8GenericReplace(const UTF8ReplaceObj* st,
|
195
|
-
const StringPiece& istr,
|
196
|
-
StringPiece& ostr,
|
197
|
-
bool is_plain_text,
|
198
|
-
int* bytes_consumed,
|
199
|
-
int* bytes_filled,
|
200
|
-
int* chars_changed,
|
201
|
-
OffsetMap* offsetmap);
|
202
|
-
|
203
|
-
// Older version without offsetmap
|
204
|
-
int UTF8GenericReplace(const UTF8ReplaceObj* st,
|
205
|
-
const StringPiece& istr,
|
206
|
-
StringPiece& ostr,
|
207
|
-
bool is_plain_text,
|
208
|
-
int* bytes_consumed,
|
209
|
-
int* bytes_filled,
|
210
|
-
int* chars_changed);
|
211
|
-
|
212
|
-
// Older version without is_plain_text or offsetmap
|
213
|
-
int UTF8GenericReplace(const UTF8ReplaceObj* st,
|
214
|
-
const StringPiece& istr,
|
215
|
-
StringPiece& ostr,
|
216
|
-
int* bytes_consumed,
|
217
|
-
int* bytes_filled,
|
218
|
-
int* chars_changed);
|
219
|
-
|
220
|
-
|
221
|
-
// TwoByte version is needed for tables > about 256 states, such
|
222
|
-
// as the table for full Unicode 4.1 canonical + compatibility mapping
|
223
|
-
|
224
|
-
// Scan a UTF-8 stringpiece based on state table with two-byte entries,
|
225
|
-
// copying to output stringpiece
|
226
|
-
// and doing text replacements.
|
227
|
-
// Always scan complete UTF-8 characters
|
228
|
-
// Set number of bytes consumed from input, number filled to output.
|
229
|
-
// Return reason for exiting
|
230
|
-
// Also writes an optional OffsetMap. Pass NULL to skip writing one.
|
231
|
-
int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st,
|
232
|
-
const StringPiece& istr,
|
233
|
-
StringPiece& ostr,
|
234
|
-
bool is_plain_text,
|
235
|
-
int* bytes_consumed,
|
236
|
-
int* bytes_filled,
|
237
|
-
int* chars_changed,
|
238
|
-
OffsetMap* offsetmap);
|
239
|
-
|
240
|
-
// Older version without offsetmap
|
241
|
-
int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st,
|
242
|
-
const StringPiece& istr,
|
243
|
-
StringPiece& ostr,
|
244
|
-
bool is_plain_text,
|
245
|
-
int* bytes_consumed,
|
246
|
-
int* bytes_filled,
|
247
|
-
int* chars_changed);
|
248
|
-
|
249
|
-
// Older version without is_plain_text or offsetmap
|
250
|
-
int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st,
|
251
|
-
const StringPiece& istr,
|
252
|
-
StringPiece& ostr,
|
253
|
-
int* bytes_consumed,
|
254
|
-
int* bytes_filled,
|
255
|
-
int* chars_changed);
|
256
|
-
|
257
|
-
|
258
|
-
static const unsigned char kUTF8LenTbl[256] = {
|
259
|
-
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
|
260
|
-
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
|
261
|
-
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
|
262
|
-
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
|
263
|
-
|
264
|
-
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
|
265
|
-
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
|
266
|
-
2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
|
267
|
-
3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4
|
268
|
-
};
|
269
|
-
|
270
|
-
inline int UTF8OneCharLen(const char* in) {
|
271
|
-
return kUTF8LenTbl[*reinterpret_cast<const uint8*>(in)];
|
272
|
-
}
|
273
|
-
|
274
|
-
// Adjust a stringpiece to encompass complete UTF-8 characters.
|
275
|
-
// The data pointer will be increased by 0..3 bytes to get to a character
|
276
|
-
// boundary, and the length will then be decreased by 0..3 bytes
|
277
|
-
// to encompass the last complete character.
|
278
|
-
// This is useful especially when a UTF-8 string must be put into a fixed-
|
279
|
-
// maximum-size buffer cleanly, such as a MySQL buffer.
|
280
|
-
void UTF8TrimToChars(StringPiece* istr);
|
281
|
-
|
282
|
-
} // End namespace CLD2
|
283
|
-
} // End namespace chrome_lang_id
|
284
|
-
|
285
|
-
#endif // SCRIPT_SPAN_UTF8STATETABLE_H_
|
Binary file
|
data/ext/cld3/task_context.o
DELETED
Binary file
|
Binary file
|
data/ext/cld3/text_processing.o
DELETED
Binary file
|
data/ext/cld3/unicodetext.o
DELETED
Binary file
|
data/ext/cld3/utf8statetable.o
DELETED
Binary file
|
data/ext/cld3/utils.o
DELETED
Binary file
|
data/ext/cld3/workspace.o
DELETED
Binary file
|
data/lib/cld3/unstable.rb
DELETED
@@ -1,58 +0,0 @@
|
|
1
|
-
|
2
|
-
# Copyright 2021 Akihiko Odaki <akihiko.odaki@gmail.com>
|
3
|
-
# All Rights Reserved.
|
4
|
-
#
|
5
|
-
# Licensed under the Apache License, Version 2.0 (the "License");
|
6
|
-
# you may not use this file except in compliance with the License.
|
7
|
-
# You may obtain a copy of the License at
|
8
|
-
#
|
9
|
-
# http://www.apache.org/licenses/LICENSE-2.0
|
10
|
-
#
|
11
|
-
# Unless required by applicable law or agreed to in writing, software
|
12
|
-
# distributed under the License is distributed on an "AS IS" BASIS,
|
13
|
-
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
14
|
-
# See the License for the specific language governing permissions and
|
15
|
-
# limitations under the License.
|
16
|
-
# ==============================================================================
|
17
|
-
|
18
|
-
module CLD3
|
19
|
-
module Unstable
|
20
|
-
extend FFI::Library
|
21
|
-
|
22
|
-
ffi_lib File.join(__dir__, "..", "..", "ext", "cld3", "libcld3." + RbConfig::CONFIG["DLEXT"])
|
23
|
-
|
24
|
-
module NNetLanguageIdentifier
|
25
|
-
class Pointer < FFI::AutoPointer
|
26
|
-
def self.release(pointer)
|
27
|
-
Unstable.delete_NNetLanguageIdentifier(pointer)
|
28
|
-
end
|
29
|
-
end
|
30
|
-
|
31
|
-
class SpanInfo < FFI::Struct
|
32
|
-
layout :start_index, :int, :end_index, :int, :probability, :float
|
33
|
-
end
|
34
|
-
|
35
|
-
class Result < FFI::Struct
|
36
|
-
layout :language_data, :pointer, :language_size, :size_t, :byte_ranges_data, :pointer, :byte_ranges_size, :size_t, :probability, :float, :proportion, :float, :reliable?, :bool
|
37
|
-
end
|
38
|
-
end
|
39
|
-
|
40
|
-
attach_function :delete_NNetLanguageIdentifier, [ :pointer ], :void
|
41
|
-
|
42
|
-
attach_function :delete_result, [ :pointer ], :void
|
43
|
-
|
44
|
-
attach_function :delete_results, [ :pointer ], :void
|
45
|
-
|
46
|
-
attach_function :new_NNetLanguageIdentifier, [ :int, :int ], :pointer
|
47
|
-
|
48
|
-
attach_function :refer_to_nth_result, [ :pointer, :size_t ], NNetLanguageIdentifier::Result.by_value
|
49
|
-
|
50
|
-
attach_function :NNetLanguageIdentifier_find_language,
|
51
|
-
[ :pointer, :buffer_in, :size_t ], :pointer
|
52
|
-
|
53
|
-
attach_function :NNetLanguageIdentifier_find_top_n_most_freq_langs,
|
54
|
-
[ :pointer, :buffer_in, :size_t, :int ], :pointer
|
55
|
-
end
|
56
|
-
|
57
|
-
private_constant :Unstable
|
58
|
-
end
|