cld3 3.4.4 → 3.5.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile +0 -1
- data/README.md +4 -7
- data/cld3.gemspec +5 -5
- data/ext/cld3/Makefile +17 -16
- data/ext/cld3/base.o +0 -0
- data/ext/cld3/cld_3/protos/feature_extractor.pb.h +100 -0
- data/ext/cld3/cld_3/protos/sentence.pb.h +35 -0
- data/ext/cld3/cld_3/protos/task_spec.pb.h +106 -0
- data/ext/cld3/embedding_feature_extractor.o +0 -0
- data/ext/cld3/embedding_network.o +0 -0
- data/ext/cld3/extconf.rb +1 -10
- data/ext/cld3/feature_extractor.o +0 -0
- data/ext/cld3/feature_types.o +0 -0
- data/ext/cld3/fixunicodevalue.o +0 -0
- data/ext/cld3/fml_parser.o +0 -0
- data/ext/cld3/generated_entities.o +0 -0
- data/ext/cld3/generated_ulscript.o +0 -0
- data/ext/cld3/getonescriptspan.cc +0 -2
- data/ext/cld3/getonescriptspan.o +0 -0
- data/ext/cld3/lang_id_nn_params.o +0 -0
- data/ext/cld3/language_identifier_features.o +0 -0
- data/ext/cld3/libcld3.so +0 -0
- data/ext/cld3/nnet_language_identifier.o +0 -0
- data/ext/cld3/nnet_language_identifier_c.o +0 -0
- data/ext/cld3/offsetmap.o +0 -0
- data/ext/cld3/registry.o +0 -0
- data/ext/cld3/relevant_script_feature.o +0 -0
- data/ext/cld3/script_span/fixunicodevalue.h +69 -0
- data/ext/cld3/script_span/generated_ulscript.h +142 -0
- data/ext/cld3/script_span/getonescriptspan.h +124 -0
- data/ext/cld3/script_span/integral_types.h +37 -0
- data/ext/cld3/script_span/offsetmap.h +168 -0
- data/ext/cld3/script_span/port.h +143 -0
- data/ext/cld3/script_span/stringpiece.h +81 -0
- data/ext/cld3/script_span/text_processing.h +30 -0
- data/ext/cld3/script_span/utf8acceptinterchange.h +486 -0
- data/ext/cld3/script_span/utf8prop_lettermarkscriptnum.h +1631 -0
- data/ext/cld3/script_span/utf8repl_lettermarklower.h +758 -0
- data/ext/cld3/script_span/utf8scannot_lettermarkspecial.h +1455 -0
- data/ext/cld3/script_span/utf8statetable.h +285 -0
- data/ext/cld3/sentence_features.o +0 -0
- data/ext/cld3/task_context.o +0 -0
- data/ext/cld3/task_context_params.o +0 -0
- data/ext/cld3/text_processing.o +0 -0
- data/ext/cld3/unicodetext.o +0 -0
- data/ext/cld3/utf8statetable.o +0 -0
- data/ext/cld3/utils.o +0 -0
- data/ext/cld3/workspace.o +0 -0
- data/lib/cld3.rb +4 -1
- metadata +33 -25
- data/ext/cld3/feature_extractor.pb.o +0 -0
- data/ext/cld3/feature_extractor.proto +0 -50
- data/ext/cld3/mkmf.log +0 -37
- data/ext/cld3/sentence.pb.o +0 -0
- data/ext/cld3/sentence.proto +0 -77
- data/ext/cld3/task_spec.pb.o +0 -0
- data/ext/cld3/task_spec.proto +0 -98
- data/lib/a.rb +0 -24
@@ -0,0 +1,285 @@
|
|
1
|
+
// Copyright 2013 Google Inc. All Rights Reserved.
|
2
|
+
//
|
3
|
+
// Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
// you may not use this file except in compliance with the License.
|
5
|
+
// You may obtain a copy of the License at
|
6
|
+
//
|
7
|
+
// http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
//
|
9
|
+
// Unless required by applicable law or agreed to in writing, software
|
10
|
+
// distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
// See the License for the specific language governing permissions and
|
13
|
+
// limitations under the License.
|
14
|
+
|
15
|
+
//
|
16
|
+
// State Table follower for scanning UTF-8 strings without converting to
|
17
|
+
// 32- or 16-bit Unicode values.
|
18
|
+
//
|
19
|
+
// Author: dsites@google.com (Dick Sites)
|
20
|
+
//
|
21
|
+
|
22
|
+
#ifndef SCRIPT_SPAN_UTF8STATETABLE_H_
|
23
|
+
#define SCRIPT_SPAN_UTF8STATETABLE_H_
|
24
|
+
|
25
|
+
#include <string>
|
26
|
+
|
27
|
+
#include "integral_types.h" // for uint8, uint32, uint16
|
28
|
+
#include "stringpiece.h"
|
29
|
+
|
30
|
+
namespace chrome_lang_id {
|
31
|
+
namespace CLD2 {
|
32
|
+
|
33
|
+
class OffsetMap;
|
34
|
+
|
35
|
+
|
36
|
+
// These four-byte entries compactly encode how many bytes 0..255 to delete
|
37
|
+
// in making a string replacement, how many bytes to add 0..255, and the offset
|
38
|
+
// 0..64k-1 of the replacement string in remap_string.
|
39
|
+
struct RemapEntry {
|
40
|
+
uint8 delete_bytes;
|
41
|
+
uint8 add_bytes;
|
42
|
+
uint16 bytes_offset;
|
43
|
+
};
|
44
|
+
|
45
|
+
// Exit type codes for state tables. All but the first get stuffed into
|
46
|
+
// signed one-byte entries. The first is only generated by executable code.
|
47
|
+
// To distinguish from next-state entries, these must be contiguous and
|
48
|
+
// all <= kExitNone
|
49
|
+
typedef enum {
|
50
|
+
kExitDstSpaceFull = 239,
|
51
|
+
kExitIllegalStructure, // 240
|
52
|
+
kExitOK, // 241
|
53
|
+
kExitReject, // ...
|
54
|
+
kExitReplace1,
|
55
|
+
kExitReplace2,
|
56
|
+
kExitReplace3,
|
57
|
+
kExitReplace21,
|
58
|
+
kExitReplace31,
|
59
|
+
kExitReplace32,
|
60
|
+
kExitReplaceOffset1,
|
61
|
+
kExitReplaceOffset2,
|
62
|
+
kExitReplace1S0,
|
63
|
+
kExitSpecial,
|
64
|
+
kExitDoAgain,
|
65
|
+
kExitRejectAlt,
|
66
|
+
kExitNone // 255
|
67
|
+
} ExitReason;
|
68
|
+
|
69
|
+
typedef enum {
|
70
|
+
kExitDstSpaceFull_2 = 32767, // 0x7fff
|
71
|
+
kExitIllegalStructure_2, // 32768 0x8000
|
72
|
+
kExitOK_2, // 32769 0x8001
|
73
|
+
kExitReject_2, // ...
|
74
|
+
kExitReplace1_2,
|
75
|
+
kExitReplace2_2,
|
76
|
+
kExitReplace3_2,
|
77
|
+
kExitReplace21_2,
|
78
|
+
kExitReplace31_2,
|
79
|
+
kExitReplace32_2,
|
80
|
+
kExitReplaceOffset1_2,
|
81
|
+
kExitReplaceOffset2_2,
|
82
|
+
kExitReplace1S0_2,
|
83
|
+
kExitSpecial_2,
|
84
|
+
kExitDoAgain_2,
|
85
|
+
kExitRejectAlt_2,
|
86
|
+
kExitNone_2 // 32783 0x800f
|
87
|
+
} ExitReason_2;
|
88
|
+
|
89
|
+
|
90
|
+
// This struct represents one entire state table. The three initialized byte
|
91
|
+
// areas are state_table, remap_base, and remap_string. state0 and state0_size
|
92
|
+
// give the byte offset and length within state_table of the initial state --
|
93
|
+
// table lookups are expected to start and end in this state, but for
|
94
|
+
// truncated UTF-8 strings, may end in a different state. These allow a quick
|
95
|
+
// test for that condition. entry_shift is 8 for tables subscripted by a full
|
96
|
+
// byte value and 6 for space-optimized tables subscripted by only six
|
97
|
+
// significant bits in UTF-8 continuation bytes.
|
98
|
+
typedef struct {
|
99
|
+
const uint32 state0;
|
100
|
+
const uint32 state0_size;
|
101
|
+
const uint32 total_size;
|
102
|
+
const int max_expand;
|
103
|
+
const int entry_shift;
|
104
|
+
const int bytes_per_entry;
|
105
|
+
const uint32 losub;
|
106
|
+
const uint32 hiadd;
|
107
|
+
const uint8* state_table;
|
108
|
+
const RemapEntry* remap_base;
|
109
|
+
const uint8* remap_string;
|
110
|
+
const uint8* fast_state;
|
111
|
+
} UTF8StateMachineObj;
|
112
|
+
|
113
|
+
// Near-duplicate declaration for tables with two-byte entries
|
114
|
+
typedef struct {
|
115
|
+
const uint32 state0;
|
116
|
+
const uint32 state0_size;
|
117
|
+
const uint32 total_size;
|
118
|
+
const int max_expand;
|
119
|
+
const int entry_shift;
|
120
|
+
const int bytes_per_entry;
|
121
|
+
const uint32 losub;
|
122
|
+
const uint32 hiadd;
|
123
|
+
const unsigned short* state_table;
|
124
|
+
const RemapEntry* remap_base;
|
125
|
+
const uint8* remap_string;
|
126
|
+
const uint8* fast_state;
|
127
|
+
} UTF8StateMachineObj_2;
|
128
|
+
|
129
|
+
|
130
|
+
typedef UTF8StateMachineObj UTF8PropObj;
|
131
|
+
typedef UTF8StateMachineObj UTF8ScanObj;
|
132
|
+
typedef UTF8StateMachineObj UTF8ReplaceObj;
|
133
|
+
typedef UTF8StateMachineObj_2 UTF8PropObj_2;
|
134
|
+
typedef UTF8StateMachineObj_2 UTF8ReplaceObj_2;
|
135
|
+
// NOT IMPLEMENTED typedef UTF8StateMachineObj_2 UTF8ScanObj_2;
|
136
|
+
|
137
|
+
|
138
|
+
// Look up property of one UTF-8 character and advance over it
|
139
|
+
// Return 0 if input length is zero
|
140
|
+
// Return 0 and advance one byte if input is ill-formed
|
141
|
+
uint8 UTF8GenericProperty(const UTF8PropObj* st,
|
142
|
+
const uint8** src,
|
143
|
+
int* srclen);
|
144
|
+
|
145
|
+
// Look up property of one UTF-8 character (assumed to be valid).
|
146
|
+
// (This is a faster version of UTF8GenericProperty.)
|
147
|
+
bool UTF8HasGenericProperty(const UTF8PropObj& st, const char* src);
|
148
|
+
|
149
|
+
|
150
|
+
// BigOneByte versions are needed for tables > 240 states, but most
|
151
|
+
// won't need the TwoByte versions.
|
152
|
+
|
153
|
+
// Look up property of one UTF-8 character and advance over it
|
154
|
+
// Return 0 if input length is zero
|
155
|
+
// Return 0 and advance one byte if input is ill-formed
|
156
|
+
uint8 UTF8GenericPropertyBigOneByte(const UTF8PropObj* st,
|
157
|
+
const uint8** src,
|
158
|
+
int* srclen);
|
159
|
+
|
160
|
+
|
161
|
+
// TwoByte versions are needed for tables > 240 states that don't fit onto
|
162
|
+
// BigOneByte -- rare ultimate fallback
|
163
|
+
|
164
|
+
// Look up property of one UTF-8 character (assumed to be valid).
|
165
|
+
// (This is a faster version of UTF8GenericPropertyBigOneByte.)
|
166
|
+
bool UTF8HasGenericPropertyBigOneByte(const UTF8PropObj& st, const char* src);
|
167
|
+
|
168
|
+
// Look up property of one UTF-8 character and advance over it
|
169
|
+
// Return 0 if input length is zero
|
170
|
+
// Return 0 and advance one byte if input is ill-formed
|
171
|
+
uint8 UTF8GenericPropertyTwoByte(const UTF8PropObj_2* st,
|
172
|
+
const uint8** src,
|
173
|
+
int* srclen);
|
174
|
+
|
175
|
+
// Look up property of one UTF-8 character (assumed to be valid).
|
176
|
+
// (This is a faster version of UTF8GenericPropertyTwoByte.)
|
177
|
+
bool UTF8HasGenericPropertyTwoByte(const UTF8PropObj_2& st, const char* src);
|
178
|
+
|
179
|
+
// Scan a UTF-8 stringpiece based on a state table.
|
180
|
+
// Always scan complete UTF-8 characters
|
181
|
+
// Set number of bytes scanned. Return reason for exiting
|
182
|
+
int UTF8GenericScan(const UTF8ScanObj* st,
|
183
|
+
const StringPiece& str,
|
184
|
+
int* bytes_consumed);
|
185
|
+
|
186
|
+
|
187
|
+
|
188
|
+
// Scan a UTF-8 stringpiece based on state table, copying to output stringpiece
|
189
|
+
// and doing text replacements.
|
190
|
+
// Always scan complete UTF-8 characters
|
191
|
+
// Set number of bytes consumed from input, number filled to output.
|
192
|
+
// Return reason for exiting
|
193
|
+
// Also writes an optional OffsetMap. Pass NULL to skip writing one.
|
194
|
+
int UTF8GenericReplace(const UTF8ReplaceObj* st,
|
195
|
+
const StringPiece& istr,
|
196
|
+
StringPiece& ostr,
|
197
|
+
bool is_plain_text,
|
198
|
+
int* bytes_consumed,
|
199
|
+
int* bytes_filled,
|
200
|
+
int* chars_changed,
|
201
|
+
OffsetMap* offsetmap);
|
202
|
+
|
203
|
+
// Older version without offsetmap
|
204
|
+
int UTF8GenericReplace(const UTF8ReplaceObj* st,
|
205
|
+
const StringPiece& istr,
|
206
|
+
StringPiece& ostr,
|
207
|
+
bool is_plain_text,
|
208
|
+
int* bytes_consumed,
|
209
|
+
int* bytes_filled,
|
210
|
+
int* chars_changed);
|
211
|
+
|
212
|
+
// Older version without is_plain_text or offsetmap
|
213
|
+
int UTF8GenericReplace(const UTF8ReplaceObj* st,
|
214
|
+
const StringPiece& istr,
|
215
|
+
StringPiece& ostr,
|
216
|
+
int* bytes_consumed,
|
217
|
+
int* bytes_filled,
|
218
|
+
int* chars_changed);
|
219
|
+
|
220
|
+
|
221
|
+
// TwoByte version is needed for tables with more than about 256 states, such
|
222
|
+
// as the table for full Unicode 4.1 canonical + compatibility mapping
|
223
|
+
|
224
|
+
// Scan a UTF-8 stringpiece based on state table with two-byte entries,
|
225
|
+
// copying to output stringpiece
|
226
|
+
// and doing text replacements.
|
227
|
+
// Always scan complete UTF-8 characters
|
228
|
+
// Set number of bytes consumed from input, number filled to output.
|
229
|
+
// Return reason for exiting
|
230
|
+
// Also writes an optional OffsetMap. Pass NULL to skip writing one.
|
231
|
+
int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st,
|
232
|
+
const StringPiece& istr,
|
233
|
+
StringPiece& ostr,
|
234
|
+
bool is_plain_text,
|
235
|
+
int* bytes_consumed,
|
236
|
+
int* bytes_filled,
|
237
|
+
int* chars_changed,
|
238
|
+
OffsetMap* offsetmap);
|
239
|
+
|
240
|
+
// Older version without offsetmap
|
241
|
+
int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st,
|
242
|
+
const StringPiece& istr,
|
243
|
+
StringPiece& ostr,
|
244
|
+
bool is_plain_text,
|
245
|
+
int* bytes_consumed,
|
246
|
+
int* bytes_filled,
|
247
|
+
int* chars_changed);
|
248
|
+
|
249
|
+
// Older version without is_plain_text or offsetmap
|
250
|
+
int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st,
|
251
|
+
const StringPiece& istr,
|
252
|
+
StringPiece& ostr,
|
253
|
+
int* bytes_consumed,
|
254
|
+
int* bytes_filled,
|
255
|
+
int* chars_changed);
|
256
|
+
|
257
|
+
|
258
|
+
static const unsigned char kUTF8LenTbl[256] = {
|
259
|
+
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
|
260
|
+
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
|
261
|
+
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
|
262
|
+
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
|
263
|
+
|
264
|
+
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
|
265
|
+
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
|
266
|
+
2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
|
267
|
+
3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4
|
268
|
+
};
|
269
|
+
|
270
|
+
inline int UTF8OneCharLen(const char* in) {
|
271
|
+
return kUTF8LenTbl[*reinterpret_cast<const uint8*>(in)];
|
272
|
+
}
|
273
|
+
|
274
|
+
// Adjust a stringpiece to encompass complete UTF-8 characters.
|
275
|
+
// The data pointer will be increased by 0..3 bytes to get to a character
|
276
|
+
// boundary, and the length will then be decreased by 0..3 bytes
|
277
|
+
// to encompass the last complete character.
|
278
|
+
// This is useful especially when a UTF-8 string must be put into a fixed-
|
279
|
+
// maximum-size buffer cleanly, such as a MySQL buffer.
|
280
|
+
void UTF8TrimToChars(StringPiece* istr);
|
281
|
+
|
282
|
+
} // End namespace CLD2
|
283
|
+
} // End namespace chrome_lang_id
|
284
|
+
|
285
|
+
#endif // SCRIPT_SPAN_UTF8STATETABLE_H_
|
Binary file
|
data/ext/cld3/task_context.o
CHANGED
Binary file
|
Binary file
|
data/ext/cld3/text_processing.o
CHANGED
Binary file
|
data/ext/cld3/unicodetext.o
CHANGED
Binary file
|
data/ext/cld3/utf8statetable.o
CHANGED
Binary file
|
data/ext/cld3/utils.o
CHANGED
Binary file
|
data/ext/cld3/workspace.o
CHANGED
Binary file
|
data/lib/cld3.rb
CHANGED
@@ -76,7 +76,7 @@ module CLD3
|
|
76
76
|
|
77
77
|
# The arguments are two Numeric objects.
|
78
78
|
def initialize(min_num_bytes = MIN_NUM_BYTES_TO_CONSIDER, max_num_bytes = MAX_NUM_BYTES_TO_CONSIDER)
|
79
|
-
raise ArgumentError if
|
79
|
+
raise ArgumentError if min_num_bytes < 0 || min_num_bytes >= max_num_bytes
|
80
80
|
@cc = Unstable::NNetLanguageIdentifier::Pointer.new(Unstable.new_NNetLanguageIdentifier(min_num_bytes, max_num_bytes))
|
81
81
|
end
|
82
82
|
|
@@ -88,6 +88,8 @@ module CLD3
|
|
88
88
|
# The argument is a String object.
|
89
89
|
# The returned value of this function is an instance of Result.
|
90
90
|
def find_language(text)
|
91
|
+
# @type const FFI: untyped
|
92
|
+
|
91
93
|
text_utf8 = text.encode(Encoding::UTF_8)
|
92
94
|
pointer = FFI::MemoryPointer.new(:char, text_utf8.bytesize)
|
93
95
|
|
@@ -119,6 +121,7 @@ module CLD3
|
|
119
121
|
# The second argument is a Numeric object.
|
120
122
|
# The returned value of this function is an Array of Result instances.
|
121
123
|
def find_top_n_most_freq_langs(text, num_langs)
|
124
|
+
# @type const FFI: untyped
|
122
125
|
# @type var a: untyped
|
123
126
|
|
124
127
|
text_utf8 = text.encode(Encoding::UTF_8)
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: cld3
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 3.4.4
|
4
|
+
version: 3.5.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Akihiko Odaki
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2022-
|
11
|
+
date: 2022-07-19 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: ffi
|
@@ -36,60 +36,60 @@ dependencies:
|
|
36
36
|
requirements:
|
37
37
|
- - ">="
|
38
38
|
- !ruby/object:Gem::Version
|
39
|
-
version:
|
39
|
+
version: 2.6.0
|
40
40
|
- - "<"
|
41
41
|
- !ruby/object:Gem::Version
|
42
|
-
version:
|
42
|
+
version: 2.7.0
|
43
43
|
type: :development
|
44
44
|
prerelease: false
|
45
45
|
version_requirements: !ruby/object:Gem::Requirement
|
46
46
|
requirements:
|
47
47
|
- - ">="
|
48
48
|
- !ruby/object:Gem::Version
|
49
|
-
version:
|
49
|
+
version: 2.6.0
|
50
50
|
- - "<"
|
51
51
|
- !ruby/object:Gem::Version
|
52
|
-
version:
|
52
|
+
version: 2.7.0
|
53
53
|
- !ruby/object:Gem::Dependency
|
54
54
|
name: rspec
|
55
55
|
requirement: !ruby/object:Gem::Requirement
|
56
56
|
requirements:
|
57
57
|
- - ">="
|
58
58
|
- !ruby/object:Gem::Version
|
59
|
-
version: 3.
|
59
|
+
version: 3.11.0
|
60
60
|
- - "<"
|
61
61
|
- !ruby/object:Gem::Version
|
62
|
-
version: 3.
|
62
|
+
version: 3.12.0
|
63
63
|
type: :development
|
64
64
|
prerelease: false
|
65
65
|
version_requirements: !ruby/object:Gem::Requirement
|
66
66
|
requirements:
|
67
67
|
- - ">="
|
68
68
|
- !ruby/object:Gem::Version
|
69
|
-
version: 3.
|
69
|
+
version: 3.11.0
|
70
70
|
- - "<"
|
71
71
|
- !ruby/object:Gem::Version
|
72
|
-
version: 3.
|
72
|
+
version: 3.12.0
|
73
73
|
- !ruby/object:Gem::Dependency
|
74
74
|
name: steep
|
75
75
|
requirement: !ruby/object:Gem::Requirement
|
76
76
|
requirements:
|
77
77
|
- - ">="
|
78
78
|
- !ruby/object:Gem::Version
|
79
|
-
version: 0.
|
79
|
+
version: 1.0.0
|
80
80
|
- - "<"
|
81
81
|
- !ruby/object:Gem::Version
|
82
|
-
version:
|
82
|
+
version: 1.1.0
|
83
83
|
type: :development
|
84
84
|
prerelease: false
|
85
85
|
version_requirements: !ruby/object:Gem::Requirement
|
86
86
|
requirements:
|
87
87
|
- - ">="
|
88
88
|
- !ruby/object:Gem::Version
|
89
|
-
version: 0.
|
89
|
+
version: 1.0.0
|
90
90
|
- - "<"
|
91
91
|
- !ruby/object:Gem::Version
|
92
|
-
version:
|
92
|
+
version: 1.1.0
|
93
93
|
description: Compact Language Detector v3 (CLD3) is a neural network model for language
|
94
94
|
identification.
|
95
95
|
email: akihiko.odaki@gmail.com
|
@@ -108,6 +108,9 @@ files:
|
|
108
108
|
- ext/cld3/base.h
|
109
109
|
- ext/cld3/base.o
|
110
110
|
- ext/cld3/casts.h
|
111
|
+
- ext/cld3/cld_3/protos/feature_extractor.pb.h
|
112
|
+
- ext/cld3/cld_3/protos/sentence.pb.h
|
113
|
+
- ext/cld3/cld_3/protos/task_spec.pb.h
|
111
114
|
- ext/cld3/embedding_feature_extractor.cc
|
112
115
|
- ext/cld3/embedding_feature_extractor.h
|
113
116
|
- ext/cld3/embedding_feature_extractor.o
|
@@ -119,8 +122,6 @@ files:
|
|
119
122
|
- ext/cld3/feature_extractor.cc
|
120
123
|
- ext/cld3/feature_extractor.h
|
121
124
|
- ext/cld3/feature_extractor.o
|
122
|
-
- ext/cld3/feature_extractor.pb.o
|
123
|
-
- ext/cld3/feature_extractor.proto
|
124
125
|
- ext/cld3/feature_types.cc
|
125
126
|
- ext/cld3/feature_types.h
|
126
127
|
- ext/cld3/feature_types.o
|
@@ -148,7 +149,6 @@ files:
|
|
148
149
|
- ext/cld3/language_identifier_features.o
|
149
150
|
- ext/cld3/libcld3.def
|
150
151
|
- ext/cld3/libcld3.so
|
151
|
-
- ext/cld3/mkmf.log
|
152
152
|
- ext/cld3/nnet_language_identifier.cc
|
153
153
|
- ext/cld3/nnet_language_identifier.h
|
154
154
|
- ext/cld3/nnet_language_identifier.o
|
@@ -165,8 +165,19 @@ files:
|
|
165
165
|
- ext/cld3/relevant_script_feature.h
|
166
166
|
- ext/cld3/relevant_script_feature.o
|
167
167
|
- ext/cld3/script_detector.h
|
168
|
-
- ext/cld3/
|
169
|
-
- ext/cld3/
|
168
|
+
- ext/cld3/script_span/fixunicodevalue.h
|
169
|
+
- ext/cld3/script_span/generated_ulscript.h
|
170
|
+
- ext/cld3/script_span/getonescriptspan.h
|
171
|
+
- ext/cld3/script_span/integral_types.h
|
172
|
+
- ext/cld3/script_span/offsetmap.h
|
173
|
+
- ext/cld3/script_span/port.h
|
174
|
+
- ext/cld3/script_span/stringpiece.h
|
175
|
+
- ext/cld3/script_span/text_processing.h
|
176
|
+
- ext/cld3/script_span/utf8acceptinterchange.h
|
177
|
+
- ext/cld3/script_span/utf8prop_lettermarkscriptnum.h
|
178
|
+
- ext/cld3/script_span/utf8repl_lettermarklower.h
|
179
|
+
- ext/cld3/script_span/utf8scannot_lettermarkspecial.h
|
180
|
+
- ext/cld3/script_span/utf8statetable.h
|
170
181
|
- ext/cld3/sentence_features.cc
|
171
182
|
- ext/cld3/sentence_features.h
|
172
183
|
- ext/cld3/sentence_features.o
|
@@ -178,8 +189,6 @@ files:
|
|
178
189
|
- ext/cld3/task_context_params.cc
|
179
190
|
- ext/cld3/task_context_params.h
|
180
191
|
- ext/cld3/task_context_params.o
|
181
|
-
- ext/cld3/task_spec.pb.o
|
182
|
-
- ext/cld3/task_spec.proto
|
183
192
|
- ext/cld3/text_processing.cc
|
184
193
|
- ext/cld3/text_processing.h
|
185
194
|
- ext/cld3/text_processing.o
|
@@ -199,7 +208,6 @@ files:
|
|
199
208
|
- ext/cld3/workspace.cc
|
200
209
|
- ext/cld3/workspace.h
|
201
210
|
- ext/cld3/workspace.o
|
202
|
-
- lib/a.rb
|
203
211
|
- lib/cld3.rb
|
204
212
|
- lib/cld3/unstable.rb
|
205
213
|
- sig/cld3.rbs
|
@@ -215,17 +223,17 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
215
223
|
requirements:
|
216
224
|
- - ">="
|
217
225
|
- !ruby/object:Gem::Version
|
218
|
-
version: 2.
|
226
|
+
version: 2.7.0
|
219
227
|
- - "<"
|
220
228
|
- !ruby/object:Gem::Version
|
221
|
-
version: 3.
|
229
|
+
version: 3.3.0
|
222
230
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
223
231
|
requirements:
|
224
232
|
- - ">="
|
225
233
|
- !ruby/object:Gem::Version
|
226
234
|
version: '0'
|
227
235
|
requirements: []
|
228
|
-
rubygems_version: 3.
|
236
|
+
rubygems_version: 3.3.7
|
229
237
|
signing_key:
|
230
238
|
specification_version: 4
|
231
239
|
summary: Compact Language Detector v3 (CLD3)
|
Binary file
|
@@ -1,50 +0,0 @@
|
|
1
|
-
/* Copyright 2016 Google Inc. All Rights Reserved.
|
2
|
-
|
3
|
-
Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
-
you may not use this file except in compliance with the License.
|
5
|
-
You may obtain a copy of the License at
|
6
|
-
|
7
|
-
http://www.apache.org/licenses/LICENSE-2.0
|
8
|
-
|
9
|
-
Unless required by applicable law or agreed to in writing, software
|
10
|
-
distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
-
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
-
See the License for the specific language governing permissions and
|
13
|
-
limitations under the License.
|
14
|
-
==============================================================================*/
|
15
|
-
|
16
|
-
// Protocol buffers for feature extractor.
|
17
|
-
|
18
|
-
syntax = "proto2";
|
19
|
-
option optimize_for = LITE_RUNTIME;
|
20
|
-
|
21
|
-
package chrome_lang_id;
|
22
|
-
|
23
|
-
message Parameter {
|
24
|
-
optional string name = 1;
|
25
|
-
optional string value = 2;
|
26
|
-
}
|
27
|
-
|
28
|
-
// Descriptor for feature function.
|
29
|
-
message FeatureFunctionDescriptor {
|
30
|
-
// Feature function type.
|
31
|
-
required string type = 1;
|
32
|
-
|
33
|
-
// Feature function name.
|
34
|
-
optional string name = 2;
|
35
|
-
|
36
|
-
// Default argument for feature function.
|
37
|
-
optional int32 argument = 3 [default = 0];
|
38
|
-
|
39
|
-
// Named parameters for feature descriptor.
|
40
|
-
repeated Parameter parameter = 4;
|
41
|
-
|
42
|
-
// Nested sub-feature function descriptors.
|
43
|
-
repeated FeatureFunctionDescriptor feature = 7;
|
44
|
-
};
|
45
|
-
|
46
|
-
// Descriptor for feature extractor.
|
47
|
-
message FeatureExtractorDescriptor {
|
48
|
-
// Top-level feature function for extractor.
|
49
|
-
repeated FeatureFunctionDescriptor feature = 1;
|
50
|
-
};
|
data/ext/cld3/mkmf.log
DELETED
@@ -1,37 +0,0 @@
|
|
1
|
-
"pkg-config --exists protobuf"
|
2
|
-
| pkg-config --libs protobuf
|
3
|
-
=> "-lprotobuf -lpthread \n"
|
4
|
-
"gcc -o conftest -I/usr/include -I/usr/include/ruby/backward -I/usr/include -I. -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/redhat/redhat-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/redhat/redhat-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -fPIC conftest.c -L. -L/usr/lib64 -L. -Wl,-z,relro -Wl,--as-needed -Wl,-z,now -specs=/usr/lib/rpm/redhat/redhat-hardened-ld -fstack-protector-strong -rdynamic -Wl,-export-dynamic -lruby -lm -lc"
|
5
|
-
checked program was:
|
6
|
-
/* begin */
|
7
|
-
1: #include "ruby.h"
|
8
|
-
2:
|
9
|
-
3: int main(int argc, char **argv)
|
10
|
-
4: {
|
11
|
-
5: return !!argv[argc];
|
12
|
-
6: }
|
13
|
-
/* end */
|
14
|
-
|
15
|
-
"gcc -o conftest -I/usr/include -I/usr/include/ruby/backward -I/usr/include -I. -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/redhat/redhat-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/redhat/redhat-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -fPIC conftest.c -L. -L/usr/lib64 -L. -Wl,-z,relro -Wl,--as-needed -Wl,-z,now -specs=/usr/lib/rpm/redhat/redhat-hardened-ld -fstack-protector-strong -rdynamic -Wl,-export-dynamic -lruby -lprotobuf -lpthread -lm -lc"
|
16
|
-
checked program was:
|
17
|
-
/* begin */
|
18
|
-
1: #include "ruby.h"
|
19
|
-
2:
|
20
|
-
3: int main(int argc, char **argv)
|
21
|
-
4: {
|
22
|
-
5: return !!argv[argc];
|
23
|
-
6: }
|
24
|
-
/* end */
|
25
|
-
|
26
|
-
| pkg-config --cflags-only-I protobuf
|
27
|
-
=> "\n"
|
28
|
-
| pkg-config --cflags-only-other protobuf
|
29
|
-
=> "\n"
|
30
|
-
| pkg-config --libs-only-l protobuf
|
31
|
-
=> "-lprotobuf -lpthread \n"
|
32
|
-
package configuration for protobuf
|
33
|
-
incflags:
|
34
|
-
cflags:
|
35
|
-
ldflags:
|
36
|
-
libs: -lprotobuf -lpthread
|
37
|
-
|
data/ext/cld3/sentence.pb.o
DELETED
Binary file
|
data/ext/cld3/sentence.proto
DELETED
@@ -1,77 +0,0 @@
|
|
1
|
-
/* Copyright 2016 Google Inc. All Rights Reserved.
|
2
|
-
|
3
|
-
Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
-
you may not use this file except in compliance with the License.
|
5
|
-
You may obtain a copy of the License at
|
6
|
-
|
7
|
-
http://www.apache.org/licenses/LICENSE-2.0
|
8
|
-
|
9
|
-
Unless required by applicable law or agreed to in writing, software
|
10
|
-
distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
-
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
-
See the License for the specific language governing permissions and
|
13
|
-
limitations under the License.
|
14
|
-
==============================================================================*/
|
15
|
-
|
16
|
-
// Protocol buffer specification for sentence analysis.
|
17
|
-
|
18
|
-
syntax = "proto2";
|
19
|
-
option optimize_for = LITE_RUNTIME;
|
20
|
-
|
21
|
-
package chrome_lang_id;
|
22
|
-
|
23
|
-
// A Sentence contains the raw text contents of a sentence, as well as an
|
24
|
-
// analysis.
|
25
|
-
message Sentence {
|
26
|
-
// Identifier for sentence.
|
27
|
-
optional string id = 1;
|
28
|
-
|
29
|
-
// Raw text contents of the sentence.
|
30
|
-
optional string text = 2;
|
31
|
-
|
32
|
-
// Tokenization of the sentence.
|
33
|
-
repeated Token token = 3;
|
34
|
-
|
35
|
-
extensions 1000 to max;
|
36
|
-
}
|
37
|
-
|
38
|
-
// A sentence token marks a span of bytes in the sentence text as a token
|
39
|
-
// or word.
|
40
|
-
message Token {
|
41
|
-
// Token word form.
|
42
|
-
required string word = 1;
|
43
|
-
|
44
|
-
// Start position of token in text.
|
45
|
-
required int32 start = 2;
|
46
|
-
|
47
|
-
// End position of token in text. Gives index of last byte, not one past
|
48
|
-
// the last byte. If token came from lexer, excludes any trailing HTML tags.
|
49
|
-
required int32 end = 3;
|
50
|
-
|
51
|
-
// Head of this token in the dependency tree: the id of the token which has an
|
52
|
-
// arc going to this one. If it is the root token of a sentence, then it is
|
53
|
-
// set to -1.
|
54
|
-
optional int32 head = 4 [default = -1];
|
55
|
-
|
56
|
-
// Part-of-speech tag for token.
|
57
|
-
optional string tag = 5;
|
58
|
-
|
59
|
-
// Coarse-grained word category for token.
|
60
|
-
optional string category = 6;
|
61
|
-
|
62
|
-
// Label for dependency relation between this token and its head.
|
63
|
-
optional string label = 7;
|
64
|
-
|
65
|
-
// Break level for tokens that indicates how it was separated from the
|
66
|
-
// previous token in the text.
|
67
|
-
enum BreakLevel {
|
68
|
-
NO_BREAK = 0; // No separation between tokens.
|
69
|
-
SPACE_BREAK = 1; // Tokens separated by space.
|
70
|
-
LINE_BREAK = 2; // Tokens separated by line break.
|
71
|
-
SENTENCE_BREAK = 3; // Tokens separated by sentence break.
|
72
|
-
}
|
73
|
-
|
74
|
-
optional BreakLevel break_level = 8 [default = SPACE_BREAK];
|
75
|
-
|
76
|
-
extensions 1000 to max;
|
77
|
-
}
|
data/ext/cld3/task_spec.pb.o
DELETED
Binary file
|