cld3 3.4.4 → 3.5.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (59) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile +0 -1
  3. data/README.md +4 -7
  4. data/cld3.gemspec +5 -5
  5. data/ext/cld3/Makefile +17 -16
  6. data/ext/cld3/base.o +0 -0
  7. data/ext/cld3/cld_3/protos/feature_extractor.pb.h +100 -0
  8. data/ext/cld3/cld_3/protos/sentence.pb.h +35 -0
  9. data/ext/cld3/cld_3/protos/task_spec.pb.h +106 -0
  10. data/ext/cld3/embedding_feature_extractor.o +0 -0
  11. data/ext/cld3/embedding_network.o +0 -0
  12. data/ext/cld3/extconf.rb +1 -10
  13. data/ext/cld3/feature_extractor.o +0 -0
  14. data/ext/cld3/feature_types.o +0 -0
  15. data/ext/cld3/fixunicodevalue.o +0 -0
  16. data/ext/cld3/fml_parser.o +0 -0
  17. data/ext/cld3/generated_entities.o +0 -0
  18. data/ext/cld3/generated_ulscript.o +0 -0
  19. data/ext/cld3/getonescriptspan.cc +0 -2
  20. data/ext/cld3/getonescriptspan.o +0 -0
  21. data/ext/cld3/lang_id_nn_params.o +0 -0
  22. data/ext/cld3/language_identifier_features.o +0 -0
  23. data/ext/cld3/libcld3.so +0 -0
  24. data/ext/cld3/nnet_language_identifier.o +0 -0
  25. data/ext/cld3/nnet_language_identifier_c.o +0 -0
  26. data/ext/cld3/offsetmap.o +0 -0
  27. data/ext/cld3/registry.o +0 -0
  28. data/ext/cld3/relevant_script_feature.o +0 -0
  29. data/ext/cld3/script_span/fixunicodevalue.h +69 -0
  30. data/ext/cld3/script_span/generated_ulscript.h +142 -0
  31. data/ext/cld3/script_span/getonescriptspan.h +124 -0
  32. data/ext/cld3/script_span/integral_types.h +37 -0
  33. data/ext/cld3/script_span/offsetmap.h +168 -0
  34. data/ext/cld3/script_span/port.h +143 -0
  35. data/ext/cld3/script_span/stringpiece.h +81 -0
  36. data/ext/cld3/script_span/text_processing.h +30 -0
  37. data/ext/cld3/script_span/utf8acceptinterchange.h +486 -0
  38. data/ext/cld3/script_span/utf8prop_lettermarkscriptnum.h +1631 -0
  39. data/ext/cld3/script_span/utf8repl_lettermarklower.h +758 -0
  40. data/ext/cld3/script_span/utf8scannot_lettermarkspecial.h +1455 -0
  41. data/ext/cld3/script_span/utf8statetable.h +285 -0
  42. data/ext/cld3/sentence_features.o +0 -0
  43. data/ext/cld3/task_context.o +0 -0
  44. data/ext/cld3/task_context_params.o +0 -0
  45. data/ext/cld3/text_processing.o +0 -0
  46. data/ext/cld3/unicodetext.o +0 -0
  47. data/ext/cld3/utf8statetable.o +0 -0
  48. data/ext/cld3/utils.o +0 -0
  49. data/ext/cld3/workspace.o +0 -0
  50. data/lib/cld3.rb +4 -1
  51. metadata +33 -25
  52. data/ext/cld3/feature_extractor.pb.o +0 -0
  53. data/ext/cld3/feature_extractor.proto +0 -50
  54. data/ext/cld3/mkmf.log +0 -37
  55. data/ext/cld3/sentence.pb.o +0 -0
  56. data/ext/cld3/sentence.proto +0 -77
  57. data/ext/cld3/task_spec.pb.o +0 -0
  58. data/ext/cld3/task_spec.proto +0 -98
  59. data/lib/a.rb +0 -24
@@ -0,0 +1,285 @@
1
+ // Copyright 2013 Google Inc. All Rights Reserved.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+
15
+ //
16
+ // State Table follower for scanning UTF-8 strings without converting to
17
+ // 32- or 16-bit Unicode values.
18
+ //
19
+ // Author: dsites@google.com (Dick Sites)
20
+ //
21
+
22
+ #ifndef SCRIPT_SPAN_UTF8STATETABLE_H_
23
+ #define SCRIPT_SPAN_UTF8STATETABLE_H_
24
+
25
+ #include <string>
26
+
27
+ #include "integral_types.h" // for uint8, uint32, uint16
28
+ #include "stringpiece.h"
29
+
30
+ namespace chrome_lang_id {
31
+ namespace CLD2 {
32
+
33
+ class OffsetMap;
34
+
35
+
36
+ // These four-byte entries compactly encode how many bytes 0..255 to delete
37
+ // in making a string replacement, how many bytes to add 0..255, and the offset
38
+ // 0..64k-1 of the replacement string in remap_string.
39
+ struct RemapEntry {
40
+ uint8 delete_bytes;
41
+ uint8 add_bytes;
42
+ uint16 bytes_offset;
43
+ };
44
+
45
+ // Exit type codes for state tables. All but the first get stuffed into
46
+ // signed one-byte entries. The first is only generated by executable code.
47
+ // To distinguish from next-state entries, these must be contiguous and
48
+ // all <= kExitNone
49
+ typedef enum {
50
+ kExitDstSpaceFull = 239,
51
+ kExitIllegalStructure, // 240
52
+ kExitOK, // 241
53
+ kExitReject, // ...
54
+ kExitReplace1,
55
+ kExitReplace2,
56
+ kExitReplace3,
57
+ kExitReplace21,
58
+ kExitReplace31,
59
+ kExitReplace32,
60
+ kExitReplaceOffset1,
61
+ kExitReplaceOffset2,
62
+ kExitReplace1S0,
63
+ kExitSpecial,
64
+ kExitDoAgain,
65
+ kExitRejectAlt,
66
+ kExitNone // 255
67
+ } ExitReason;
68
+
69
+ typedef enum {
70
+ kExitDstSpaceFull_2 = 32767, // 0x7fff
71
+ kExitIllegalStructure_2, // 32768 0x8000
72
+ kExitOK_2, // 32769 0x8001
73
+ kExitReject_2, // ...
74
+ kExitReplace1_2,
75
+ kExitReplace2_2,
76
+ kExitReplace3_2,
77
+ kExitReplace21_2,
78
+ kExitReplace31_2,
79
+ kExitReplace32_2,
80
+ kExitReplaceOffset1_2,
81
+ kExitReplaceOffset2_2,
82
+ kExitReplace1S0_2,
83
+ kExitSpecial_2,
84
+ kExitDoAgain_2,
85
+ kExitRejectAlt_2,
86
+ kExitNone_2 // 32783 0x800f
87
+ } ExitReason_2;
88
+
89
+
90
+ // This struct represents one entire state table. The three initialized byte
91
+ // areas are state_table, remap_base, and remap_string. state0 and state0_size
92
+ // give the byte offset and length within state_table of the initial state --
93
+ // table lookups are expected to start and end in this state, but for
94
+ // truncated UTF-8 strings, may end in a different state. These allow a quick
95
+ // test for that condition. entry_shift is 8 for tables subscripted by a full
96
+ // byte value and 6 for space-optimized tables subscripted by only six
97
+ // significant bits in UTF-8 continuation bytes.
98
+ typedef struct {
99
+ const uint32 state0;
100
+ const uint32 state0_size;
101
+ const uint32 total_size;
102
+ const int max_expand;
103
+ const int entry_shift;
104
+ const int bytes_per_entry;
105
+ const uint32 losub;
106
+ const uint32 hiadd;
107
+ const uint8* state_table;
108
+ const RemapEntry* remap_base;
109
+ const uint8* remap_string;
110
+ const uint8* fast_state;
111
+ } UTF8StateMachineObj;
112
+
113
+ // Near-duplicate declaration for tables with two-byte entries
114
+ typedef struct {
115
+ const uint32 state0;
116
+ const uint32 state0_size;
117
+ const uint32 total_size;
118
+ const int max_expand;
119
+ const int entry_shift;
120
+ const int bytes_per_entry;
121
+ const uint32 losub;
122
+ const uint32 hiadd;
123
+ const unsigned short* state_table;
124
+ const RemapEntry* remap_base;
125
+ const uint8* remap_string;
126
+ const uint8* fast_state;
127
+ } UTF8StateMachineObj_2;
128
+
129
+
130
+ typedef UTF8StateMachineObj UTF8PropObj;
131
+ typedef UTF8StateMachineObj UTF8ScanObj;
132
+ typedef UTF8StateMachineObj UTF8ReplaceObj;
133
+ typedef UTF8StateMachineObj_2 UTF8PropObj_2;
134
+ typedef UTF8StateMachineObj_2 UTF8ReplaceObj_2;
135
+ // NOT IMPLEMENTED typedef UTF8StateMachineObj_2 UTF8ScanObj_2;
136
+
137
+
138
+ // Look up property of one UTF-8 character and advance over it
139
+ // Return 0 if input length is zero
140
+ // Return 0 and advance one byte if input is ill-formed
141
+ uint8 UTF8GenericProperty(const UTF8PropObj* st,
142
+ const uint8** src,
143
+ int* srclen);
144
+
145
+ // Look up property of one UTF-8 character (assumed to be valid).
146
+ // (This is a faster version of UTF8GenericProperty.)
147
+ bool UTF8HasGenericProperty(const UTF8PropObj& st, const char* src);
148
+
149
+
150
+ // BigOneByte versions are needed for tables > 240 states, but most
151
+ // won't need the TwoByte versions.
152
+
153
+ // Look up property of one UTF-8 character and advance over it
154
+ // Return 0 if input length is zero
155
+ // Return 0 and advance one byte if input is ill-formed
156
+ uint8 UTF8GenericPropertyBigOneByte(const UTF8PropObj* st,
157
+ const uint8** src,
158
+ int* srclen);
159
+
160
+
161
+ // TwoByte versions are needed for tables > 240 states that don't fit onto
162
+ // BigOneByte -- rare ultimate fallback
163
+
164
+ // Look up property of one UTF-8 character (assumed to be valid).
165
+ // (This is a faster version of UTF8GenericPropertyBigOneByte.)
166
+ bool UTF8HasGenericPropertyBigOneByte(const UTF8PropObj& st, const char* src);
167
+
168
+ // Look up property of one UTF-8 character and advance over it
169
+ // Return 0 if input length is zero
170
+ // Return 0 and advance one byte if input is ill-formed
171
+ uint8 UTF8GenericPropertyTwoByte(const UTF8PropObj_2* st,
172
+ const uint8** src,
173
+ int* srclen);
174
+
175
+ // Look up property of one UTF-8 character (assumed to be valid).
176
+ // (This is a faster version of UTF8GenericPropertyTwoByte.)
177
+ bool UTF8HasGenericPropertyTwoByte(const UTF8PropObj_2& st, const char* src);
178
+
179
+ // Scan a UTF-8 stringpiece based on a state table.
180
+ // Always scan complete UTF-8 characters
181
+ // Set number of bytes scanned. Return reason for exiting
182
+ int UTF8GenericScan(const UTF8ScanObj* st,
183
+ const StringPiece& str,
184
+ int* bytes_consumed);
185
+
186
+
187
+
188
+ // Scan a UTF-8 stringpiece based on state table, copying to output stringpiece
189
+ // and doing text replacements.
190
+ // Always scan complete UTF-8 characters
191
+ // Set number of bytes consumed from input, number filled to output.
192
+ // Return reason for exiting
193
+ // Also writes an optional OffsetMap. Pass NULL to skip writing one.
194
+ int UTF8GenericReplace(const UTF8ReplaceObj* st,
195
+ const StringPiece& istr,
196
+ StringPiece& ostr,
197
+ bool is_plain_text,
198
+ int* bytes_consumed,
199
+ int* bytes_filled,
200
+ int* chars_changed,
201
+ OffsetMap* offsetmap);
202
+
203
+ // Older version without offsetmap
204
+ int UTF8GenericReplace(const UTF8ReplaceObj* st,
205
+ const StringPiece& istr,
206
+ StringPiece& ostr,
207
+ bool is_plain_text,
208
+ int* bytes_consumed,
209
+ int* bytes_filled,
210
+ int* chars_changed);
211
+
212
+ // Older version without is_plain_text or offsetmap
213
+ int UTF8GenericReplace(const UTF8ReplaceObj* st,
214
+ const StringPiece& istr,
215
+ StringPiece& ostr,
216
+ int* bytes_consumed,
217
+ int* bytes_filled,
218
+ int* chars_changed);
219
+
220
+
221
+ // TwoByte version is needed for tables > about 256 states, such
222
+ // as the table for full Unicode 4.1 canonical + compatibility mapping
223
+
224
+ // Scan a UTF-8 stringpiece based on state table with two-byte entries,
225
+ // copying to output stringpiece
226
+ // and doing text replacements.
227
+ // Always scan complete UTF-8 characters
228
+ // Set number of bytes consumed from input, number filled to output.
229
+ // Return reason for exiting
230
+ // Also writes an optional OffsetMap. Pass NULL to skip writing one.
231
+ int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st,
232
+ const StringPiece& istr,
233
+ StringPiece& ostr,
234
+ bool is_plain_text,
235
+ int* bytes_consumed,
236
+ int* bytes_filled,
237
+ int* chars_changed,
238
+ OffsetMap* offsetmap);
239
+
240
+ // Older version without offsetmap
241
+ int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st,
242
+ const StringPiece& istr,
243
+ StringPiece& ostr,
244
+ bool is_plain_text,
245
+ int* bytes_consumed,
246
+ int* bytes_filled,
247
+ int* chars_changed);
248
+
249
+ // Older version without is_plain_text or offsetmap
250
+ int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st,
251
+ const StringPiece& istr,
252
+ StringPiece& ostr,
253
+ int* bytes_consumed,
254
+ int* bytes_filled,
255
+ int* chars_changed);
256
+
257
+
258
+ static const unsigned char kUTF8LenTbl[256] = {
259
+ 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
260
+ 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
261
+ 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
262
+ 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
263
+
264
+ 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
265
+ 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
266
+ 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
267
+ 3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4
268
+ };
269
+
270
+ inline int UTF8OneCharLen(const char* in) {
271
+ return kUTF8LenTbl[*reinterpret_cast<const uint8*>(in)];
272
+ }
273
+
274
+ // Adjust a stringpiece to encompass complete UTF-8 characters.
275
+ // The data pointer will be increased by 0..3 bytes to get to a character
276
+ // boundary, and the length will then be decreased by 0..3 bytes
277
+ // to encompass the last complete character.
278
+ // This is useful especially when a UTF-8 string must be put into a fixed-
279
+ // maximum-size buffer cleanly, such as a MySQL buffer.
280
+ void UTF8TrimToChars(StringPiece* istr);
281
+
282
+ } // End namespace CLD2
283
+ } // End namespace chrome_lang_id
284
+
285
+ #endif // SCRIPT_SPAN_UTF8STATETABLE_H_
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
data/ext/cld3/utils.o CHANGED
Binary file
data/ext/cld3/workspace.o CHANGED
Binary file
data/lib/cld3.rb CHANGED
@@ -76,7 +76,7 @@ module CLD3
76
76
 
77
77
  # The arguments are two Numeric objects.
78
78
  def initialize(min_num_bytes = MIN_NUM_BYTES_TO_CONSIDER, max_num_bytes = MAX_NUM_BYTES_TO_CONSIDER)
79
- raise ArgumentError if max_num_bytes <= 0 || min_num_bytes < 0 || min_num_bytes >= max_num_bytes
79
+ raise ArgumentError if min_num_bytes < 0 || min_num_bytes >= max_num_bytes
80
80
  @cc = Unstable::NNetLanguageIdentifier::Pointer.new(Unstable.new_NNetLanguageIdentifier(min_num_bytes, max_num_bytes))
81
81
  end
82
82
 
@@ -88,6 +88,8 @@ module CLD3
88
88
  # The argument is a String object.
89
89
  # The returned value of this function is an instance of Result.
90
90
  def find_language(text)
91
+ # @type const FFI: untyped
92
+
91
93
  text_utf8 = text.encode(Encoding::UTF_8)
92
94
  pointer = FFI::MemoryPointer.new(:char, text_utf8.bytesize)
93
95
 
@@ -119,6 +121,7 @@ module CLD3
119
121
  # The second argument is a Numeric object.
120
122
  # The returned value of this function is an Array of Result instances.
121
123
  def find_top_n_most_freq_langs(text, num_langs)
124
+ # @type const FFI: untyped
122
125
  # @type var a: untyped
123
126
 
124
127
  text_utf8 = text.encode(Encoding::UTF_8)
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cld3
3
3
  version: !ruby/object:Gem::Version
4
- version: 3.4.4
4
+ version: 3.5.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Akihiko Odaki
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2022-01-20 00:00:00.000000000 Z
11
+ date: 2022-07-19 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: ffi
@@ -36,60 +36,60 @@ dependencies:
36
36
  requirements:
37
37
  - - ">="
38
38
  - !ruby/object:Gem::Version
39
- version: 1.7.0
39
+ version: 2.6.0
40
40
  - - "<"
41
41
  - !ruby/object:Gem::Version
42
- version: 1.8.0
42
+ version: 2.7.0
43
43
  type: :development
44
44
  prerelease: false
45
45
  version_requirements: !ruby/object:Gem::Requirement
46
46
  requirements:
47
47
  - - ">="
48
48
  - !ruby/object:Gem::Version
49
- version: 1.7.0
49
+ version: 2.6.0
50
50
  - - "<"
51
51
  - !ruby/object:Gem::Version
52
- version: 1.8.0
52
+ version: 2.7.0
53
53
  - !ruby/object:Gem::Dependency
54
54
  name: rspec
55
55
  requirement: !ruby/object:Gem::Requirement
56
56
  requirements:
57
57
  - - ">="
58
58
  - !ruby/object:Gem::Version
59
- version: 3.0.0
59
+ version: 3.11.0
60
60
  - - "<"
61
61
  - !ruby/object:Gem::Version
62
- version: 3.11.0
62
+ version: 3.12.0
63
63
  type: :development
64
64
  prerelease: false
65
65
  version_requirements: !ruby/object:Gem::Requirement
66
66
  requirements:
67
67
  - - ">="
68
68
  - !ruby/object:Gem::Version
69
- version: 3.0.0
69
+ version: 3.11.0
70
70
  - - "<"
71
71
  - !ruby/object:Gem::Version
72
- version: 3.11.0
72
+ version: 3.12.0
73
73
  - !ruby/object:Gem::Dependency
74
74
  name: steep
75
75
  requirement: !ruby/object:Gem::Requirement
76
76
  requirements:
77
77
  - - ">="
78
78
  - !ruby/object:Gem::Version
79
- version: 0.47.0
79
+ version: 1.0.0
80
80
  - - "<"
81
81
  - !ruby/object:Gem::Version
82
- version: 0.48.0
82
+ version: 1.1.0
83
83
  type: :development
84
84
  prerelease: false
85
85
  version_requirements: !ruby/object:Gem::Requirement
86
86
  requirements:
87
87
  - - ">="
88
88
  - !ruby/object:Gem::Version
89
- version: 0.47.0
89
+ version: 1.0.0
90
90
  - - "<"
91
91
  - !ruby/object:Gem::Version
92
- version: 0.48.0
92
+ version: 1.1.0
93
93
  description: Compact Language Detector v3 (CLD3) is a neural network model for language
94
94
  identification.
95
95
  email: akihiko.odaki@gmail.com
@@ -108,6 +108,9 @@ files:
108
108
  - ext/cld3/base.h
109
109
  - ext/cld3/base.o
110
110
  - ext/cld3/casts.h
111
+ - ext/cld3/cld_3/protos/feature_extractor.pb.h
112
+ - ext/cld3/cld_3/protos/sentence.pb.h
113
+ - ext/cld3/cld_3/protos/task_spec.pb.h
111
114
  - ext/cld3/embedding_feature_extractor.cc
112
115
  - ext/cld3/embedding_feature_extractor.h
113
116
  - ext/cld3/embedding_feature_extractor.o
@@ -119,8 +122,6 @@ files:
119
122
  - ext/cld3/feature_extractor.cc
120
123
  - ext/cld3/feature_extractor.h
121
124
  - ext/cld3/feature_extractor.o
122
- - ext/cld3/feature_extractor.pb.o
123
- - ext/cld3/feature_extractor.proto
124
125
  - ext/cld3/feature_types.cc
125
126
  - ext/cld3/feature_types.h
126
127
  - ext/cld3/feature_types.o
@@ -148,7 +149,6 @@ files:
148
149
  - ext/cld3/language_identifier_features.o
149
150
  - ext/cld3/libcld3.def
150
151
  - ext/cld3/libcld3.so
151
- - ext/cld3/mkmf.log
152
152
  - ext/cld3/nnet_language_identifier.cc
153
153
  - ext/cld3/nnet_language_identifier.h
154
154
  - ext/cld3/nnet_language_identifier.o
@@ -165,8 +165,19 @@ files:
165
165
  - ext/cld3/relevant_script_feature.h
166
166
  - ext/cld3/relevant_script_feature.o
167
167
  - ext/cld3/script_detector.h
168
- - ext/cld3/sentence.pb.o
169
- - ext/cld3/sentence.proto
168
+ - ext/cld3/script_span/fixunicodevalue.h
169
+ - ext/cld3/script_span/generated_ulscript.h
170
+ - ext/cld3/script_span/getonescriptspan.h
171
+ - ext/cld3/script_span/integral_types.h
172
+ - ext/cld3/script_span/offsetmap.h
173
+ - ext/cld3/script_span/port.h
174
+ - ext/cld3/script_span/stringpiece.h
175
+ - ext/cld3/script_span/text_processing.h
176
+ - ext/cld3/script_span/utf8acceptinterchange.h
177
+ - ext/cld3/script_span/utf8prop_lettermarkscriptnum.h
178
+ - ext/cld3/script_span/utf8repl_lettermarklower.h
179
+ - ext/cld3/script_span/utf8scannot_lettermarkspecial.h
180
+ - ext/cld3/script_span/utf8statetable.h
170
181
  - ext/cld3/sentence_features.cc
171
182
  - ext/cld3/sentence_features.h
172
183
  - ext/cld3/sentence_features.o
@@ -178,8 +189,6 @@ files:
178
189
  - ext/cld3/task_context_params.cc
179
190
  - ext/cld3/task_context_params.h
180
191
  - ext/cld3/task_context_params.o
181
- - ext/cld3/task_spec.pb.o
182
- - ext/cld3/task_spec.proto
183
192
  - ext/cld3/text_processing.cc
184
193
  - ext/cld3/text_processing.h
185
194
  - ext/cld3/text_processing.o
@@ -199,7 +208,6 @@ files:
199
208
  - ext/cld3/workspace.cc
200
209
  - ext/cld3/workspace.h
201
210
  - ext/cld3/workspace.o
202
- - lib/a.rb
203
211
  - lib/cld3.rb
204
212
  - lib/cld3/unstable.rb
205
213
  - sig/cld3.rbs
@@ -215,17 +223,17 @@ required_ruby_version: !ruby/object:Gem::Requirement
215
223
  requirements:
216
224
  - - ">="
217
225
  - !ruby/object:Gem::Version
218
- version: 2.6.0
226
+ version: 2.7.0
219
227
  - - "<"
220
228
  - !ruby/object:Gem::Version
221
- version: 3.2.0
229
+ version: 3.3.0
222
230
  required_rubygems_version: !ruby/object:Gem::Requirement
223
231
  requirements:
224
232
  - - ">="
225
233
  - !ruby/object:Gem::Version
226
234
  version: '0'
227
235
  requirements: []
228
- rubygems_version: 3.2.22
236
+ rubygems_version: 3.3.7
229
237
  signing_key:
230
238
  specification_version: 4
231
239
  summary: Compact Language Detector v3 (CLD3)
Binary file
@@ -1,50 +0,0 @@
1
- /* Copyright 2016 Google Inc. All Rights Reserved.
2
-
3
- Licensed under the Apache License, Version 2.0 (the "License");
4
- you may not use this file except in compliance with the License.
5
- You may obtain a copy of the License at
6
-
7
- http://www.apache.org/licenses/LICENSE-2.0
8
-
9
- Unless required by applicable law or agreed to in writing, software
10
- distributed under the License is distributed on an "AS IS" BASIS,
11
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- See the License for the specific language governing permissions and
13
- limitations under the License.
14
- ==============================================================================*/
15
-
16
- // Protocol buffers for feature extractor.
17
-
18
- syntax = "proto2";
19
- option optimize_for = LITE_RUNTIME;
20
-
21
- package chrome_lang_id;
22
-
23
- message Parameter {
24
- optional string name = 1;
25
- optional string value = 2;
26
- }
27
-
28
- // Descriptor for feature function.
29
- message FeatureFunctionDescriptor {
30
- // Feature function type.
31
- required string type = 1;
32
-
33
- // Feature function name.
34
- optional string name = 2;
35
-
36
- // Default argument for feature function.
37
- optional int32 argument = 3 [default = 0];
38
-
39
- // Named parameters for feature descriptor.
40
- repeated Parameter parameter = 4;
41
-
42
- // Nested sub-feature function descriptors.
43
- repeated FeatureFunctionDescriptor feature = 7;
44
- };
45
-
46
- // Descriptor for feature extractor.
47
- message FeatureExtractorDescriptor {
48
- // Top-level feature function for extractor.
49
- repeated FeatureFunctionDescriptor feature = 1;
50
- };
data/ext/cld3/mkmf.log DELETED
@@ -1,37 +0,0 @@
1
- "pkg-config --exists protobuf"
2
- | pkg-config --libs protobuf
3
- => "-lprotobuf -lpthread \n"
4
- "gcc -o conftest -I/usr/include -I/usr/include/ruby/backward -I/usr/include -I. -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/redhat/redhat-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/redhat/redhat-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -fPIC conftest.c -L. -L/usr/lib64 -L. -Wl,-z,relro -Wl,--as-needed -Wl,-z,now -specs=/usr/lib/rpm/redhat/redhat-hardened-ld -fstack-protector-strong -rdynamic -Wl,-export-dynamic -lruby -lm -lc"
5
- checked program was:
6
- /* begin */
7
- 1: #include "ruby.h"
8
- 2:
9
- 3: int main(int argc, char **argv)
10
- 4: {
11
- 5: return !!argv[argc];
12
- 6: }
13
- /* end */
14
-
15
- "gcc -o conftest -I/usr/include -I/usr/include/ruby/backward -I/usr/include -I. -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/redhat/redhat-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/redhat/redhat-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -fPIC conftest.c -L. -L/usr/lib64 -L. -Wl,-z,relro -Wl,--as-needed -Wl,-z,now -specs=/usr/lib/rpm/redhat/redhat-hardened-ld -fstack-protector-strong -rdynamic -Wl,-export-dynamic -lruby -lprotobuf -lpthread -lm -lc"
16
- checked program was:
17
- /* begin */
18
- 1: #include "ruby.h"
19
- 2:
20
- 3: int main(int argc, char **argv)
21
- 4: {
22
- 5: return !!argv[argc];
23
- 6: }
24
- /* end */
25
-
26
- | pkg-config --cflags-only-I protobuf
27
- => "\n"
28
- | pkg-config --cflags-only-other protobuf
29
- => "\n"
30
- | pkg-config --libs-only-l protobuf
31
- => "-lprotobuf -lpthread \n"
32
- package configuration for protobuf
33
- incflags:
34
- cflags:
35
- ldflags:
36
- libs: -lprotobuf -lpthread
37
-
Binary file
@@ -1,77 +0,0 @@
1
- /* Copyright 2016 Google Inc. All Rights Reserved.
2
-
3
- Licensed under the Apache License, Version 2.0 (the "License");
4
- you may not use this file except in compliance with the License.
5
- You may obtain a copy of the License at
6
-
7
- http://www.apache.org/licenses/LICENSE-2.0
8
-
9
- Unless required by applicable law or agreed to in writing, software
10
- distributed under the License is distributed on an "AS IS" BASIS,
11
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- See the License for the specific language governing permissions and
13
- limitations under the License.
14
- ==============================================================================*/
15
-
16
- // Protocol buffer specification for sentence analysis.
17
-
18
- syntax = "proto2";
19
- option optimize_for = LITE_RUNTIME;
20
-
21
- package chrome_lang_id;
22
-
23
- // A Sentence contains the raw text contents of a sentence, as well as an
24
- // analysis.
25
- message Sentence {
26
- // Identifier for sentence.
27
- optional string id = 1;
28
-
29
- // Raw text contents of the sentence.
30
- optional string text = 2;
31
-
32
- // Tokenization of the sentence.
33
- repeated Token token = 3;
34
-
35
- extensions 1000 to max;
36
- }
37
-
38
- // A sentence token marks a span of bytes in the sentence text as a token
39
- // or word.
40
- message Token {
41
- // Token word form.
42
- required string word = 1;
43
-
44
- // Start position of token in text.
45
- required int32 start = 2;
46
-
47
- // End position of token in text. Gives index of last byte, not one past
48
- // the last byte. If token came from lexer, excludes any trailing HTML tags.
49
- required int32 end = 3;
50
-
51
- // Head of this token in the dependency tree: the id of the token which has an
52
- // arc going to this one. If it is the root token of a sentence, then it is
53
- // set to -1.
54
- optional int32 head = 4 [default = -1];
55
-
56
- // Part-of-speech tag for token.
57
- optional string tag = 5;
58
-
59
- // Coarse-grained word category for token.
60
- optional string category = 6;
61
-
62
- // Label for dependency relation between this token and its head.
63
- optional string label = 7;
64
-
65
- // Break level for tokens that indicates how it was separated from the
66
- // previous token in the text.
67
- enum BreakLevel {
68
- NO_BREAK = 0; // No separation between tokens.
69
- SPACE_BREAK = 1; // Tokens separated by space.
70
- LINE_BREAK = 2; // Tokens separated by line break.
71
- SENTENCE_BREAK = 3; // Tokens separated by sentence break.
72
- }
73
-
74
- optional BreakLevel break_level = 8 [default = SPACE_BREAK];
75
-
76
- extensions 1000 to max;
77
- }
Binary file