cld3 3.5.0 → 3.5.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +0 -8
- data/cld3.gemspec +6 -6
- data/ext/cld3/extconf.rb +1 -2
- data/ext/cld3/nnet_language_identifier_c.cc +162 -70
- data/lib/cld3.rb +14 -102
- data/sig/cld3.rbs +2 -0
- metadata +15 -77
- data/ext/cld3/Makefile +0 -268
- data/ext/cld3/base.o +0 -0
- data/ext/cld3/embedding_feature_extractor.o +0 -0
- data/ext/cld3/embedding_network.o +0 -0
- data/ext/cld3/feature_extractor.o +0 -0
- data/ext/cld3/feature_types.o +0 -0
- data/ext/cld3/fixunicodevalue.o +0 -0
- data/ext/cld3/fml_parser.o +0 -0
- data/ext/cld3/generated_entities.o +0 -0
- data/ext/cld3/generated_ulscript.o +0 -0
- data/ext/cld3/getonescriptspan.o +0 -0
- data/ext/cld3/lang_id_nn_params.o +0 -0
- data/ext/cld3/language_identifier_features.o +0 -0
- data/ext/cld3/libcld3.def +0 -8
- data/ext/cld3/libcld3.so +0 -0
- data/ext/cld3/nnet_language_identifier.o +0 -0
- data/ext/cld3/nnet_language_identifier_c.o +0 -0
- data/ext/cld3/offsetmap.o +0 -0
- data/ext/cld3/registry.o +0 -0
- data/ext/cld3/relevant_script_feature.o +0 -0
- data/ext/cld3/script_span/fixunicodevalue.h +0 -69
- data/ext/cld3/script_span/generated_ulscript.h +0 -142
- data/ext/cld3/script_span/getonescriptspan.h +0 -124
- data/ext/cld3/script_span/integral_types.h +0 -37
- data/ext/cld3/script_span/offsetmap.h +0 -168
- data/ext/cld3/script_span/port.h +0 -143
- data/ext/cld3/script_span/stringpiece.h +0 -81
- data/ext/cld3/script_span/text_processing.h +0 -30
- data/ext/cld3/script_span/utf8acceptinterchange.h +0 -486
- data/ext/cld3/script_span/utf8prop_lettermarkscriptnum.h +0 -1631
- data/ext/cld3/script_span/utf8repl_lettermarklower.h +0 -758
- data/ext/cld3/script_span/utf8scannot_lettermarkspecial.h +0 -1455
- data/ext/cld3/script_span/utf8statetable.h +0 -285
- data/ext/cld3/sentence_features.o +0 -0
- data/ext/cld3/task_context.o +0 -0
- data/ext/cld3/task_context_params.o +0 -0
- data/ext/cld3/text_processing.o +0 -0
- data/ext/cld3/unicodetext.o +0 -0
- data/ext/cld3/utf8statetable.o +0 -0
- data/ext/cld3/utils.o +0 -0
- data/ext/cld3/workspace.o +0 -0
- data/lib/cld3/unstable.rb +0 -58
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 730f1cc6022bc381311f03ee67a87cdcaab01bcfc6c9bc5d1b056871acef197c
|
4
|
+
data.tar.gz: d1f640c24df2a95cdd2605e4078c5b820ec15ddbe775b7c281004d6f3772df8f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 41aedd4699c653dcfcc97f34adde401918928c7bf27cfe48fd5df4c108476be837e5c43add574823577afa35066c0386b31ee97f28c83660d35c753103955b20
|
7
|
+
data.tar.gz: 6f3b389ac6b334980e5899fd1a40664ee67011bb998deef5ccef4794f67d1613b4777243b97c7e87b8aab6118332907712700799437e3e261e0eb6ce85cd6952
|
data/README.md
CHANGED
@@ -35,19 +35,11 @@ FreeBSD port is available as `rubygem-cld3` in `textproc` category.
|
|
35
35
|
|
36
36
|
https://svnweb.freebsd.org/ports/head/textproc/rubygem-cld3/
|
37
37
|
|
38
|
-
#### JRuby
|
39
|
-
JRuby has a bug which prevents the feature detection. Apply the following
|
40
|
-
change:
|
41
|
-
https://github.com/jruby/jruby/pull/4118/commits/edad375ef4dcf195b19ce0afe4befac66468c736
|
42
|
-
|
43
38
|
### Troubleshooting
|
44
39
|
`gem install cld3` triggers native library building. If it fails, it is likely
|
45
40
|
that some required facilities are missing. Make sure C++ compiler is installed.
|
46
41
|
I recommend [GCC](https://gcc.gnu.org/) as a C++ compiler.
|
47
42
|
|
48
|
-
Runtime errors are likely to be issues of [FFI](https://github.com/ffi/ffi) or
|
49
|
-
programming errors. Make sure they are all correct.
|
50
|
-
|
51
43
|
If you cannot identify the cause of your problem, run spec of this library and
|
52
44
|
see whether the problem is reproducible with it or not. Spec is not included in
|
53
45
|
the gem, so clone the source code repository and then run `rake spec`.
|
data/cld3.gemspec
CHANGED
@@ -16,7 +16,7 @@
|
|
16
16
|
|
17
17
|
Gem::Specification.new do |gem|
|
18
18
|
gem.name = "cld3"
|
19
|
-
gem.version = "3.5.0"
|
19
|
+
gem.version = "3.5.1"
|
20
20
|
gem.summary = "Compact Language Detector v3 (CLD3)"
|
21
21
|
gem.description = "Compact Language Detector v3 (CLD3) is a neural network model for language identification."
|
22
22
|
gem.license = "Apache-2.0"
|
@@ -24,13 +24,13 @@ Gem::Specification.new do |gem|
|
|
24
24
|
gem.author = "Akihiko Odaki"
|
25
25
|
gem.email = "akihiko.odaki@gmail.com"
|
26
26
|
gem.required_ruby_version = [ ">= 2.7.0", "< 3.3.0" ]
|
27
|
-
gem.
|
28
|
-
gem.add_development_dependency "
|
29
|
-
gem.add_development_dependency "
|
30
|
-
gem.add_development_dependency "steep", [ ">= 1.0.0", "< 1.1.0" ]
|
27
|
+
gem.add_development_dependency "rbs", [ ">= 2.8.0", "< 2.9.0" ]
|
28
|
+
gem.add_development_dependency "rspec", [ ">= 3.12.0", "< 3.13.0" ]
|
29
|
+
gem.add_development_dependency "steep", [ ">= 1.3.0", "< 1.4.0" ]
|
31
30
|
gem.files = Dir[
|
32
31
|
"Gemfile", "LICENSE", "LICENSE_CLD3", "README.md",
|
33
|
-
"cld3.gemspec", "ext
|
32
|
+
"cld3.gemspec", "ext/**/*.c", "ext/**/*.cc", "ext/**/*.h",
|
33
|
+
"lib/**/*.rb", "sig/**/*"
|
34
34
|
]
|
35
35
|
gem.require_paths = [ "lib" ]
|
36
36
|
gem.extensions = [ "ext/cld3/extconf.rb" ]
|
data/ext/cld3/extconf.rb
CHANGED
@@ -18,6 +18,7 @@ limitations under the License.
|
|
18
18
|
#include <iostream>
|
19
19
|
#include <string>
|
20
20
|
#include <utility>
|
21
|
+
#include <ruby.h>
|
21
22
|
#include "nnet_language_identifier.h"
|
22
23
|
|
23
24
|
#if defined _WIN32 || defined __CYGWIN__
|
@@ -27,89 +28,180 @@ limitations under the License.
|
|
27
28
|
#endif
|
28
29
|
|
29
30
|
struct Result {
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
31
|
+
VALUE result_klass;
|
32
|
+
VALUE span_info_klass;
|
33
|
+
const chrome_lang_id::NNetLanguageIdentifier::Result& data;
|
34
|
+
|
35
|
+
VALUE convert() const {
|
36
|
+
if (data.language == chrome_lang_id::NNetLanguageIdentifier::kUnknown)
|
37
|
+
return Qnil;
|
38
|
+
|
39
|
+
VALUE byte_ranges = rb_ary_new2(data.byte_ranges.size());
|
40
|
+
for (auto& byte_range_data : data.byte_ranges) {
|
41
|
+
VALUE argv[] = {
|
42
|
+
INT2NUM(byte_range_data.start_index),
|
43
|
+
INT2NUM(byte_range_data.end_index),
|
44
|
+
DBL2NUM(byte_range_data.probability),
|
45
|
+
};
|
46
|
+
|
47
|
+
VALUE byte_range = rb_class_new_instance(sizeof(argv) / sizeof(*argv),
|
48
|
+
argv,
|
49
|
+
span_info_klass);
|
50
|
+
rb_ary_push(byte_ranges, byte_range);
|
51
|
+
}
|
52
|
+
|
53
|
+
VALUE argv[] = {
|
54
|
+
ID2SYM(rb_intern2(data.language.data(), data.language.size())),
|
55
|
+
DBL2NUM(data.probability),
|
56
|
+
data.is_reliable ? Qtrue : Qfalse,
|
57
|
+
DBL2NUM(data.proportion),
|
58
|
+
byte_ranges,
|
59
|
+
};
|
60
|
+
|
61
|
+
return rb_class_new_instance(sizeof(argv) / sizeof(*argv), argv,
|
62
|
+
result_klass);
|
63
|
+
}
|
41
64
|
};
|
42
65
|
|
43
|
-
struct
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
66
|
+
struct ResultVector {
|
67
|
+
VALUE result_klass;
|
68
|
+
VALUE span_info_klass;
|
69
|
+
VALUE buffer;
|
70
|
+
const std::vector<chrome_lang_id::NNetLanguageIdentifier::Result>& data;
|
71
|
+
|
72
|
+
VALUE convert() const {
|
73
|
+
for (auto& element_data : data) {
|
74
|
+
Result result { result_klass, span_info_klass, element_data };
|
75
|
+
VALUE element = result.convert();
|
76
|
+
if (element == Qnil)
|
77
|
+
break;
|
78
|
+
|
79
|
+
rb_ary_push(buffer, element);
|
80
|
+
}
|
81
|
+
|
82
|
+
return buffer;
|
54
83
|
}
|
84
|
+
};
|
55
85
|
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
86
|
+
template<typename T>
|
87
|
+
VALUE convert_protected(VALUE arg)
|
88
|
+
{
|
89
|
+
auto result = reinterpret_cast<const T *>(arg);
|
90
|
+
return result->convert();
|
91
|
+
}
|
92
|
+
|
93
|
+
static void dfree(void *arg) {
|
94
|
+
auto data = static_cast<chrome_lang_id::NNetLanguageIdentifier *>(arg);
|
95
|
+
data->~NNetLanguageIdentifier();
|
96
|
+
xfree(arg);
|
97
|
+
}
|
98
|
+
|
99
|
+
static size_t dsize(const void *data) {
|
100
|
+
return sizeof(chrome_lang_id::NNetLanguageIdentifier);
|
101
|
+
}
|
102
|
+
|
103
|
+
static const rb_data_type_t data_type = {
|
104
|
+
.wrap_struct_name = "CLD3::NNetLanguageIdentifier",
|
105
|
+
.function = {
|
106
|
+
.dfree = dfree,
|
107
|
+
.dsize = dsize,
|
108
|
+
},
|
109
|
+
.flags = RUBY_TYPED_FREE_IMMEDIATELY
|
61
110
|
};
|
62
111
|
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
112
|
+
static VALUE find_language(VALUE obj,
|
113
|
+
VALUE result_klass, VALUE span_info_klass,
|
114
|
+
VALUE text) {
|
115
|
+
int state;
|
116
|
+
VALUE converted;
|
117
|
+
|
118
|
+
{
|
119
|
+
chrome_lang_id::NNetLanguageIdentifier *data;
|
120
|
+
TypedData_Get_Struct(obj, chrome_lang_id::NNetLanguageIdentifier,
|
121
|
+
&data_type, data);
|
122
|
+
std::string text_string = std::string(RSTRING_PTR(text), RSTRING_LEN(text));
|
123
|
+
auto result_data = data->FindLanguage(text_string);
|
124
|
+
Result result { result_klass, span_info_klass, result_data };
|
125
|
+
|
126
|
+
converted = rb_protect(convert_protected<Result>,
|
127
|
+
reinterpret_cast<VALUE>(&result),
|
128
|
+
&state);
|
69
129
|
}
|
70
130
|
|
71
|
-
|
72
|
-
|
73
|
-
chrome_lang_id::NNetLanguageIdentifier *instance,
|
74
|
-
const char *data, std::size_t size, int num_langs) {
|
75
|
-
std::string text(data, size);
|
76
|
-
return new auto(instance->FindTopNMostFreqLangs(text, num_langs));
|
77
|
-
}
|
131
|
+
if (state)
|
132
|
+
rb_jump_tag(state);
|
78
133
|
|
79
|
-
|
80
|
-
|
81
|
-
delete pointer;
|
82
|
-
}
|
134
|
+
return converted;
|
135
|
+
}
|
83
136
|
|
84
|
-
|
85
|
-
|
137
|
+
static VALUE find_top_n_most_freq_langs(VALUE obj,
|
138
|
+
VALUE result_klass,
|
139
|
+
VALUE span_info_klass,
|
140
|
+
VALUE text,
|
141
|
+
VALUE num_langs) {
|
142
|
+
int state;
|
143
|
+
VALUE converted;
|
144
|
+
|
145
|
+
{
|
146
|
+
chrome_lang_id::NNetLanguageIdentifier *data;
|
147
|
+
TypedData_Get_Struct(obj, chrome_lang_id::NNetLanguageIdentifier,
|
148
|
+
&data_type, data);
|
149
|
+
VALUE buffer = rb_ary_new2(NUM2INT(num_langs));
|
150
|
+
std::string text_string = std::string(RSTRING_PTR(text), RSTRING_LEN(text));
|
151
|
+
auto result_data = data->FindTopNMostFreqLangs(text_string, num_langs);
|
152
|
+
ResultVector result { result_klass, span_info_klass, buffer, result_data };
|
153
|
+
|
154
|
+
converted = rb_protect(convert_protected<ResultVector>,
|
155
|
+
reinterpret_cast<VALUE>(&result),
|
156
|
+
&state);
|
86
157
|
}
|
87
158
|
|
88
|
-
|
89
|
-
|
90
|
-
delete pointer;
|
91
|
-
}
|
159
|
+
if (state)
|
160
|
+
rb_jump_tag(state);
|
92
161
|
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
162
|
+
return converted;
|
163
|
+
}
|
164
|
+
|
165
|
+
static VALUE make(VALUE klass, VALUE min_num_bytes, VALUE max_num_bytes) {
|
166
|
+
int min_num_bytes_int = NUM2INT(min_num_bytes);
|
167
|
+
int max_num_bytes_int = NUM2INT(max_num_bytes);
|
168
|
+
chrome_lang_id::NNetLanguageIdentifier *data;
|
169
|
+
VALUE value = TypedData_Make_Struct(klass,
|
170
|
+
chrome_lang_id::NNetLanguageIdentifier,
|
171
|
+
&data_type, data);
|
172
|
+
new (data) chrome_lang_id::NNetLanguageIdentifier(min_num_bytes_int, max_num_bytes_int);
|
173
|
+
return value;
|
174
|
+
}
|
98
175
|
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
176
|
+
extern "C" EXPORT void Init_cld3_ext() {
|
177
|
+
VALUE cld3 = rb_const_get(rb_cObject, rb_intern("CLD3"));
|
178
|
+
VALUE identifier =
|
179
|
+
rb_const_get(cld3, rb_intern("NNetLanguageIdentifier"));
|
180
|
+
VALUE unstable = rb_const_get(identifier, rb_intern("Unstable"));
|
181
|
+
VALUE params = rb_const_get(cld3, rb_intern("TaskContextParams"));
|
182
|
+
VALUE language_names = rb_const_get(params, rb_intern("LANGUAGE_NAMES"));
|
183
|
+
|
184
|
+
rb_define_const(identifier, "MIN_NUM_BYTES_TO_CONSIDER",
|
185
|
+
INT2NUM(chrome_lang_id::NNetLanguageIdentifier::kMinNumBytesToConsider));
|
186
|
+
rb_define_const(identifier, "MAX_NUM_BYTES_TO_CONSIDER",
|
187
|
+
INT2NUM(chrome_lang_id::NNetLanguageIdentifier::kMaxNumBytesToConsider));
|
188
|
+
rb_define_const(identifier, "MAX_NUM_INPUT_BYTES_TO_CONSIDER",
|
189
|
+
INT2NUM(chrome_lang_id::NNetLanguageIdentifier::kMaxNumInputBytesToConsider));
|
190
|
+
rb_define_const(identifier, "RELIABILITY_THRESHOLD",
|
191
|
+
DBL2NUM(chrome_lang_id::NNetLanguageIdentifier::kReliabilityThreshold));
|
192
|
+
rb_define_const(identifier, "RELIABILITY_HR_BS_THRESHOLD",
|
193
|
+
DBL2NUM(chrome_lang_id::NNetLanguageIdentifier::kReliabilityHrBsThreshold));
|
194
|
+
|
195
|
+
rb_define_singleton_method(unstable, "make", make, 2);
|
196
|
+
rb_define_method(unstable, "find_language", find_language, 3);
|
197
|
+
rb_define_method(unstable, "find_top_n_most_freq_langs",
|
198
|
+
find_top_n_most_freq_langs, 4);
|
199
|
+
|
200
|
+
for (int i = 0; ; i++) {
|
201
|
+
const char *name = chrome_lang_id::TaskContextParams::language_names(i);
|
202
|
+
if (!name)
|
203
|
+
break;
|
204
|
+
|
205
|
+
rb_ary_push(language_names, ID2SYM(rb_intern(name)));
|
114
206
|
}
|
115
207
|
}
|
data/lib/cld3.rb
CHANGED
@@ -17,39 +17,10 @@
|
|
17
17
|
# limitations under the License.
|
18
18
|
# ==============================================================================
|
19
19
|
|
20
|
-
require "ffi"
|
21
|
-
require "rbconfig"
|
22
|
-
require "cld3/unstable"
|
23
|
-
|
24
20
|
# Module providing an interface for Compact Language Detector v3 (CLD3)
|
25
21
|
module CLD3
|
26
22
|
# Class for detecting the language of a document.
|
27
23
|
class NNetLanguageIdentifier
|
28
|
-
# Min number of bytes needed to make a prediction if the constructor is
|
29
|
-
# called without the corresponding parameter.
|
30
|
-
# This is Numeric object.
|
31
|
-
MIN_NUM_BYTES_TO_CONSIDER = 140
|
32
|
-
|
33
|
-
# Max number of bytes needed to make a prediction if the constructor is
|
34
|
-
# called without the corresponding parameter.
|
35
|
-
# This is Numeric object.
|
36
|
-
MAX_NUM_BYTES_TO_CONSIDER = 700
|
37
|
-
|
38
|
-
# Max number of input bytes to process.
|
39
|
-
# This is Numeric object.
|
40
|
-
MAX_NUM_INPUT_BYTES_TO_CONSIDER = 10000
|
41
|
-
|
42
|
-
# Predictions with probability greater than or equal to this threshold are
|
43
|
-
# marked as reliable. This threshold was optimized on a set of text segments
|
44
|
-
# extracted from wikipedia, and results in an overall precision, recall,
|
45
|
-
# and f1 equal to 0.9760, 0.9624, and 0.9692, respectively.
|
46
|
-
# This is Numeric object.
|
47
|
-
RELIABILITY_THRESHOLD = 0.7
|
48
|
-
|
49
|
-
# Reliability threshold for the languages hr and bs.
|
50
|
-
# This is Numeric object.
|
51
|
-
RELIABILITY_HR_BS_THRESHOLD = 0.5
|
52
|
-
|
53
24
|
# Holds probability that Span, specified by start/end indices, is a given
|
54
25
|
# language. The language is not stored here; it can be found in Result, which
|
55
26
|
# holds an Array of SpanInfo.
|
@@ -76,8 +47,10 @@ module CLD3
|
|
76
47
|
|
77
48
|
# The arguments are two Numeric objects.
|
78
49
|
def initialize(min_num_bytes = MIN_NUM_BYTES_TO_CONSIDER, max_num_bytes = MAX_NUM_BYTES_TO_CONSIDER)
|
50
|
+
min_num_bytes = min_num_bytes.ceil
|
51
|
+
max_num_bytes = max_num_bytes.floor
|
79
52
|
raise ArgumentError if min_num_bytes < 0 || min_num_bytes >= max_num_bytes
|
80
|
-
@cc = Unstable
|
53
|
+
@cc = Unstable.make(min_num_bytes, max_num_bytes)
|
81
54
|
end
|
82
55
|
|
83
56
|
# Finds the most likely language for the given text, along with additional
|
@@ -88,23 +61,7 @@ module CLD3
|
|
88
61
|
# The argument is a String object.
|
89
62
|
# The returned value of this function is an instance of Result.
|
90
63
|
def find_language(text)
|
91
|
-
|
92
|
-
|
93
|
-
text_utf8 = text.encode(Encoding::UTF_8)
|
94
|
-
pointer = FFI::MemoryPointer.new(:char, text_utf8.bytesize)
|
95
|
-
|
96
|
-
begin
|
97
|
-
pointer.put_bytes(0, text_utf8)
|
98
|
-
|
99
|
-
result = Unstable.NNetLanguageIdentifier_find_language(@cc, pointer, text_utf8.bytesize)
|
100
|
-
begin
|
101
|
-
convert_result Unstable::NNetLanguageIdentifier::Result.new(result)
|
102
|
-
ensure
|
103
|
-
Unstable.delete_result result
|
104
|
-
end
|
105
|
-
ensure
|
106
|
-
pointer.free
|
107
|
-
end
|
64
|
+
@cc.find_language(Result, SpanInfo, text.encode(Encoding::UTF_8))
|
108
65
|
end
|
109
66
|
|
110
67
|
# Splits the input text (up to the first byte, if any, that is not
|
@@ -121,52 +78,15 @@ module CLD3
|
|
121
78
|
# The second argument is Numeric object.
|
122
79
|
# The returned value of this function is an Array of Result instances.
|
123
80
|
def find_top_n_most_freq_langs(text, num_langs)
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
text_utf8 = text.encode(Encoding::UTF_8)
|
128
|
-
pointer = FFI::MemoryPointer.new(:char, text_utf8.bytesize)
|
129
|
-
|
130
|
-
begin
|
131
|
-
pointer.put_bytes(0, text_utf8)
|
132
|
-
|
133
|
-
results = Unstable.NNetLanguageIdentifier_find_top_n_most_freq_langs(@cc, pointer, text_utf8.bytesize, num_langs)
|
134
|
-
begin
|
135
|
-
a = num_langs.times
|
136
|
-
.lazy
|
137
|
-
.map { |index| convert_result Unstable.refer_to_nth_result(results, index) }
|
138
|
-
.take_while { |result| !result.nil? }
|
139
|
-
.to_a
|
140
|
-
|
141
|
-
a
|
142
|
-
ensure
|
143
|
-
Unstable.delete_results results
|
144
|
-
end
|
145
|
-
ensure
|
146
|
-
pointer.free
|
147
|
-
end
|
81
|
+
@cc.find_top_n_most_freq_langs(Result, SpanInfo,
|
82
|
+
text.encode(Encoding::UTF_8),
|
83
|
+
num_langs)
|
148
84
|
end
|
149
85
|
|
150
|
-
|
151
|
-
|
152
|
-
def convert_result(result)
|
153
|
-
language = result[:language_data].read_bytes(result[:language_size])
|
154
|
-
return nil if language == "und"
|
155
|
-
|
156
|
-
cursor = result[:byte_ranges_data]
|
157
|
-
byte_ranges = result[:byte_ranges_size].times.map do
|
158
|
-
info = Unstable::NNetLanguageIdentifier::SpanInfo.new(cursor)
|
159
|
-
cursor += Unstable::NNetLanguageIdentifier::SpanInfo.size
|
160
|
-
SpanInfo.new(info[:start_index], info[:end_index], info[:probability])
|
161
|
-
end
|
162
|
-
|
163
|
-
Result.new(
|
164
|
-
language.to_sym,
|
165
|
-
result[:probability],
|
166
|
-
result[:reliable?],
|
167
|
-
result[:proportion],
|
168
|
-
byte_ranges)
|
86
|
+
class Unstable
|
169
87
|
end
|
88
|
+
|
89
|
+
private_constant :Unstable
|
170
90
|
end
|
171
91
|
|
172
92
|
# Encapsulates the TaskContext specifying only the parameters for the model.
|
@@ -174,17 +94,9 @@ module CLD3
|
|
174
94
|
module TaskContextParams
|
175
95
|
# This is a frozen Array object containing symbols.
|
176
96
|
# @type const LANGUAGE_NAMES: untyped
|
177
|
-
LANGUAGE_NAMES = [
|
178
|
-
:eo, :co, :eu, :ta, :de, :mt, :ps, :te, :su, :uz, :'zh-Latn', :ne,
|
179
|
-
:nl, :sw, :sq, :hmn, :ja, :no, :mn, :so, :ko, :kk, :sl, :ig,
|
180
|
-
:mr, :th, :zu, :ml, :hr, :bs, :lo, :sd, :cy, :hy, :uk, :pt,
|
181
|
-
:lv, :iw, :cs, :vi, :jv, :be, :km, :mk, :tr, :fy, :am, :zh,
|
182
|
-
:da, :sv, :fi, :ht, :af, :la, :id, :fil, :sm, :ca, :el, :ka,
|
183
|
-
:sr, :it, :sk, :ru, :'ru-Latn', :bg, :ny, :fa, :haw, :gl, :et,
|
184
|
-
:ms, :gd, :'bg-Latn', :ha, :is, :ur, :mi, :hi, :bn, :'hi-Latn', :fr,
|
185
|
-
:yi, :hu, :xh, :my, :tg, :ro, :ar, :lb, :'el-Latn', :st, :ceb,
|
186
|
-
:kn, :az, :si, :ky, :mg, :en, :gu, :es, :pl, :'ja-Latn', :ga, :lt,
|
187
|
-
:sn, :yo, :pa, :ku,
|
188
|
-
].freeze
|
97
|
+
LANGUAGE_NAMES = []
|
189
98
|
end
|
190
99
|
end
|
100
|
+
|
101
|
+
require "cld3_ext"
|
102
|
+
CLD3::TaskContextParams::LANGUAGE_NAMES.freeze
|
data/sig/cld3.rbs
CHANGED