cld3 3.5.0 → 3.5.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +0 -8
- data/cld3.gemspec +6 -6
- data/ext/cld3/extconf.rb +1 -2
- data/ext/cld3/nnet_language_identifier_c.cc +163 -70
- data/lib/cld3.rb +14 -102
- data/sig/cld3.rbs +2 -0
- metadata +15 -77
- data/ext/cld3/Makefile +0 -268
- data/ext/cld3/base.o +0 -0
- data/ext/cld3/embedding_feature_extractor.o +0 -0
- data/ext/cld3/embedding_network.o +0 -0
- data/ext/cld3/feature_extractor.o +0 -0
- data/ext/cld3/feature_types.o +0 -0
- data/ext/cld3/fixunicodevalue.o +0 -0
- data/ext/cld3/fml_parser.o +0 -0
- data/ext/cld3/generated_entities.o +0 -0
- data/ext/cld3/generated_ulscript.o +0 -0
- data/ext/cld3/getonescriptspan.o +0 -0
- data/ext/cld3/lang_id_nn_params.o +0 -0
- data/ext/cld3/language_identifier_features.o +0 -0
- data/ext/cld3/libcld3.def +0 -8
- data/ext/cld3/libcld3.so +0 -0
- data/ext/cld3/nnet_language_identifier.o +0 -0
- data/ext/cld3/nnet_language_identifier_c.o +0 -0
- data/ext/cld3/offsetmap.o +0 -0
- data/ext/cld3/registry.o +0 -0
- data/ext/cld3/relevant_script_feature.o +0 -0
- data/ext/cld3/script_span/fixunicodevalue.h +0 -69
- data/ext/cld3/script_span/generated_ulscript.h +0 -142
- data/ext/cld3/script_span/getonescriptspan.h +0 -124
- data/ext/cld3/script_span/integral_types.h +0 -37
- data/ext/cld3/script_span/offsetmap.h +0 -168
- data/ext/cld3/script_span/port.h +0 -143
- data/ext/cld3/script_span/stringpiece.h +0 -81
- data/ext/cld3/script_span/text_processing.h +0 -30
- data/ext/cld3/script_span/utf8acceptinterchange.h +0 -486
- data/ext/cld3/script_span/utf8prop_lettermarkscriptnum.h +0 -1631
- data/ext/cld3/script_span/utf8repl_lettermarklower.h +0 -758
- data/ext/cld3/script_span/utf8scannot_lettermarkspecial.h +0 -1455
- data/ext/cld3/script_span/utf8statetable.h +0 -285
- data/ext/cld3/sentence_features.o +0 -0
- data/ext/cld3/task_context.o +0 -0
- data/ext/cld3/task_context_params.o +0 -0
- data/ext/cld3/text_processing.o +0 -0
- data/ext/cld3/unicodetext.o +0 -0
- data/ext/cld3/utf8statetable.o +0 -0
- data/ext/cld3/utils.o +0 -0
- data/ext/cld3/workspace.o +0 -0
- data/lib/cld3/unstable.rb +0 -58
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 1f7c737ba1af7401c90400054cd2908557e8957f7444777bd18f7781e8467da6
|
4
|
+
data.tar.gz: 7f7db14d8cbb0ecb70ee12a77e5e0f57f0b247118567f902bd8fc3e583ace749
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: c04a26422eed82c14f7e3a60b6a738736f4dc46edc2d79e77a016ca08cbb4c60a2f78f176597bfc6d3dc470e6a2a09697657277ff7703516d9bd85a2c7ae4321
|
7
|
+
data.tar.gz: d9752e2366d44b29dd9493ae8aa5e92af27af7e790424897fc7ec072e9e9fa4aa08ca49b237f2e7efb39b21e6d2250c61355aba48b69afe9ebb34bb1ca6c7e93
|
data/README.md
CHANGED
@@ -35,19 +35,11 @@ FreeBSD port is available as `rubygem-cld3` in `textproc` category.
|
|
35
35
|
|
36
36
|
https://svnweb.freebsd.org/ports/head/textproc/rubygem-cld3/
|
37
37
|
|
38
|
-
#### JRuby
|
39
|
-
JRuby has a bug which prevents the feature detection. Apply the following
|
40
|
-
change:
|
41
|
-
https://github.com/jruby/jruby/pull/4118/commits/edad375ef4dcf195b19ce0afe4befac66468c736
|
42
|
-
|
43
38
|
### Troubleshooting
|
44
39
|
`gem install cld3` triggers native library building. If it fails, it is likely
|
45
40
|
that some required facilities are missing. Make sure C++ compiler is installed.
|
46
41
|
I recommend [GCC](https://gcc.gnu.org/) as a C++ compiler.
|
47
42
|
|
48
|
-
Runtime errors are likely to be issues of [FFI](https://github.com/ffi/ffi) or
|
49
|
-
programming errors. Make sure they are all correct.
|
50
|
-
|
51
43
|
If you cannot identify the cause of your problem, run spec of this library and
|
52
44
|
see whether the problem is reproducible with it or not. Spec is not included in
|
53
45
|
the gem, so clone the source code repository and then run `rake spec`.
|
data/cld3.gemspec
CHANGED
@@ -16,7 +16,7 @@
|
|
16
16
|
|
17
17
|
Gem::Specification.new do |gem|
|
18
18
|
gem.name = "cld3"
|
19
|
-
gem.version = "3.5.0"
|
19
|
+
gem.version = "3.5.2"
|
20
20
|
gem.summary = "Compact Language Detector v3 (CLD3)"
|
21
21
|
gem.description = "Compact Language Detector v3 (CLD3) is a neural network model for language identification."
|
22
22
|
gem.license = "Apache-2.0"
|
@@ -24,13 +24,13 @@ Gem::Specification.new do |gem|
|
|
24
24
|
gem.author = "Akihiko Odaki"
|
25
25
|
gem.email = "akihiko.odaki@gmail.com"
|
26
26
|
gem.required_ruby_version = [ ">= 2.7.0", "< 3.3.0" ]
|
27
|
-
gem.
|
28
|
-
gem.add_development_dependency "
|
29
|
-
gem.add_development_dependency "
|
30
|
-
gem.add_development_dependency "steep", [ ">= 1.0.0", "< 1.1.0" ]
|
27
|
+
gem.add_development_dependency "rbs", [ ">= 2.8.0", "< 2.9.0" ]
|
28
|
+
gem.add_development_dependency "rspec", [ ">= 3.12.0", "< 3.13.0" ]
|
29
|
+
gem.add_development_dependency "steep", [ ">= 1.3.0", "< 1.4.0" ]
|
31
30
|
gem.files = Dir[
|
32
31
|
"Gemfile", "LICENSE", "LICENSE_CLD3", "README.md",
|
33
|
-
"cld3.gemspec", "ext
|
32
|
+
"cld3.gemspec", "ext/**/*.c", "ext/**/*.cc", "ext/**/*.h",
|
33
|
+
"lib/**/*.rb", "sig/**/*"
|
34
34
|
]
|
35
35
|
gem.require_paths = [ "lib" ]
|
36
36
|
gem.extensions = [ "ext/cld3/extconf.rb" ]
|
data/ext/cld3/extconf.rb
CHANGED
data/ext/cld3/nnet_language_identifier_c.cc
CHANGED
@@ -18,6 +18,7 @@ limitations under the License.
|
|
18
18
|
#include <iostream>
|
19
19
|
#include <string>
|
20
20
|
#include <utility>
|
21
|
+
#include <ruby.h>
|
21
22
|
#include "nnet_language_identifier.h"
|
22
23
|
|
23
24
|
#if defined _WIN32 || defined __CYGWIN__
|
@@ -27,89 +28,181 @@ limitations under the License.
|
|
27
28
|
#endif
|
28
29
|
|
29
30
|
struct Result {
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
31
|
+
VALUE result_klass;
|
32
|
+
VALUE span_info_klass;
|
33
|
+
const chrome_lang_id::NNetLanguageIdentifier::Result& data;
|
34
|
+
|
35
|
+
VALUE convert() const {
|
36
|
+
if (data.language == chrome_lang_id::NNetLanguageIdentifier::kUnknown)
|
37
|
+
return Qnil;
|
38
|
+
|
39
|
+
VALUE byte_ranges = rb_ary_new2(data.byte_ranges.size());
|
40
|
+
for (auto& byte_range_data : data.byte_ranges) {
|
41
|
+
VALUE argv[] = {
|
42
|
+
INT2NUM(byte_range_data.start_index),
|
43
|
+
INT2NUM(byte_range_data.end_index),
|
44
|
+
DBL2NUM(byte_range_data.probability),
|
45
|
+
};
|
46
|
+
|
47
|
+
VALUE byte_range = rb_class_new_instance(sizeof(argv) / sizeof(*argv),
|
48
|
+
argv,
|
49
|
+
span_info_klass);
|
50
|
+
rb_ary_push(byte_ranges, byte_range);
|
51
|
+
}
|
52
|
+
|
53
|
+
VALUE argv[] = {
|
54
|
+
ID2SYM(rb_intern2(data.language.data(), data.language.size())),
|
55
|
+
DBL2NUM(data.probability),
|
56
|
+
data.is_reliable ? Qtrue : Qfalse,
|
57
|
+
DBL2NUM(data.proportion),
|
58
|
+
byte_ranges,
|
59
|
+
};
|
60
|
+
|
61
|
+
return rb_class_new_instance(sizeof(argv) / sizeof(*argv), argv,
|
62
|
+
result_klass);
|
63
|
+
}
|
41
64
|
};
|
42
65
|
|
43
|
-
struct
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
66
|
+
struct ResultVector {
|
67
|
+
VALUE result_klass;
|
68
|
+
VALUE span_info_klass;
|
69
|
+
VALUE buffer;
|
70
|
+
const std::vector<chrome_lang_id::NNetLanguageIdentifier::Result>& data;
|
71
|
+
|
72
|
+
VALUE convert() const {
|
73
|
+
for (auto& element_data : data) {
|
74
|
+
Result result { result_klass, span_info_klass, element_data };
|
75
|
+
VALUE element = result.convert();
|
76
|
+
if (element == Qnil)
|
77
|
+
break;
|
78
|
+
|
79
|
+
rb_ary_push(buffer, element);
|
80
|
+
}
|
81
|
+
|
82
|
+
return buffer;
|
54
83
|
}
|
84
|
+
};
|
55
85
|
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
86
|
+
template<typename T>
|
87
|
+
VALUE convert_protected(VALUE arg)
|
88
|
+
{
|
89
|
+
auto result = reinterpret_cast<const T *>(arg);
|
90
|
+
return result->convert();
|
91
|
+
}
|
92
|
+
|
93
|
+
static void dfree(void *arg) {
|
94
|
+
auto data = static_cast<chrome_lang_id::NNetLanguageIdentifier *>(arg);
|
95
|
+
data->~NNetLanguageIdentifier();
|
96
|
+
xfree(arg);
|
97
|
+
}
|
98
|
+
|
99
|
+
static size_t dsize(const void *data) {
|
100
|
+
return sizeof(chrome_lang_id::NNetLanguageIdentifier);
|
101
|
+
}
|
102
|
+
|
103
|
+
static const rb_data_type_t data_type = {
|
104
|
+
.wrap_struct_name = "CLD3::NNetLanguageIdentifier",
|
105
|
+
.function = {
|
106
|
+
.dfree = dfree,
|
107
|
+
.dsize = dsize,
|
108
|
+
},
|
109
|
+
.flags = RUBY_TYPED_FREE_IMMEDIATELY
|
61
110
|
};
|
62
111
|
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
112
|
+
static VALUE find_language(VALUE obj,
|
113
|
+
VALUE result_klass, VALUE span_info_klass,
|
114
|
+
VALUE text) {
|
115
|
+
int state;
|
116
|
+
VALUE converted;
|
117
|
+
|
118
|
+
{
|
119
|
+
chrome_lang_id::NNetLanguageIdentifier *data;
|
120
|
+
TypedData_Get_Struct(obj, chrome_lang_id::NNetLanguageIdentifier,
|
121
|
+
&data_type, data);
|
122
|
+
std::string text_string = std::string(RSTRING_PTR(text), RSTRING_LEN(text));
|
123
|
+
auto result_data = data->FindLanguage(text_string);
|
124
|
+
Result result { result_klass, span_info_klass, result_data };
|
125
|
+
|
126
|
+
converted = rb_protect(convert_protected<Result>,
|
127
|
+
reinterpret_cast<VALUE>(&result),
|
128
|
+
&state);
|
69
129
|
}
|
70
130
|
|
71
|
-
|
72
|
-
|
73
|
-
chrome_lang_id::NNetLanguageIdentifier *instance,
|
74
|
-
const char *data, std::size_t size, int num_langs) {
|
75
|
-
std::string text(data, size);
|
76
|
-
return new auto(instance->FindTopNMostFreqLangs(text, num_langs));
|
77
|
-
}
|
131
|
+
if (state)
|
132
|
+
rb_jump_tag(state);
|
78
133
|
|
79
|
-
|
80
|
-
|
81
|
-
delete pointer;
|
82
|
-
}
|
134
|
+
return converted;
|
135
|
+
}
|
83
136
|
|
84
|
-
|
85
|
-
|
137
|
+
static VALUE find_top_n_most_freq_langs(VALUE obj,
|
138
|
+
VALUE result_klass,
|
139
|
+
VALUE span_info_klass,
|
140
|
+
VALUE text,
|
141
|
+
VALUE num_langs) {
|
142
|
+
int state;
|
143
|
+
VALUE converted;
|
144
|
+
|
145
|
+
{
|
146
|
+
chrome_lang_id::NNetLanguageIdentifier *data;
|
147
|
+
TypedData_Get_Struct(obj, chrome_lang_id::NNetLanguageIdentifier,
|
148
|
+
&data_type, data);
|
149
|
+
VALUE buffer = rb_ary_new2(NUM2INT(num_langs));
|
150
|
+
std::string text_string = std::string(RSTRING_PTR(text), RSTRING_LEN(text));
|
151
|
+
auto result_data = data->FindTopNMostFreqLangs(text_string, num_langs);
|
152
|
+
ResultVector result { result_klass, span_info_klass, buffer, result_data };
|
153
|
+
|
154
|
+
converted = rb_protect(convert_protected<ResultVector>,
|
155
|
+
reinterpret_cast<VALUE>(&result),
|
156
|
+
&state);
|
86
157
|
}
|
87
158
|
|
88
|
-
|
89
|
-
|
90
|
-
delete pointer;
|
91
|
-
}
|
159
|
+
if (state)
|
160
|
+
rb_jump_tag(state);
|
92
161
|
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
162
|
+
return converted;
|
163
|
+
}
|
164
|
+
|
165
|
+
static VALUE make(VALUE klass, VALUE min_num_bytes, VALUE max_num_bytes) {
|
166
|
+
int min_num_bytes_int = NUM2INT(min_num_bytes);
|
167
|
+
int max_num_bytes_int = NUM2INT(max_num_bytes);
|
168
|
+
chrome_lang_id::NNetLanguageIdentifier *data;
|
169
|
+
VALUE value = TypedData_Make_Struct(klass,
|
170
|
+
chrome_lang_id::NNetLanguageIdentifier,
|
171
|
+
&data_type, data);
|
172
|
+
new (data) chrome_lang_id::NNetLanguageIdentifier(min_num_bytes_int, max_num_bytes_int);
|
173
|
+
return value;
|
174
|
+
}
|
98
175
|
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
176
|
+
extern "C" EXPORT void Init_cld3_ext() {
|
177
|
+
VALUE cld3 = rb_const_get(rb_cObject, rb_intern("CLD3"));
|
178
|
+
VALUE identifier =
|
179
|
+
rb_const_get(cld3, rb_intern("NNetLanguageIdentifier"));
|
180
|
+
VALUE unstable = rb_const_get(identifier, rb_intern("Unstable"));
|
181
|
+
VALUE params = rb_const_get(cld3, rb_intern("TaskContextParams"));
|
182
|
+
VALUE language_names = rb_const_get(params, rb_intern("LANGUAGE_NAMES"));
|
183
|
+
|
184
|
+
rb_define_const(identifier, "MIN_NUM_BYTES_TO_CONSIDER",
|
185
|
+
INT2NUM(chrome_lang_id::NNetLanguageIdentifier::kMinNumBytesToConsider));
|
186
|
+
rb_define_const(identifier, "MAX_NUM_BYTES_TO_CONSIDER",
|
187
|
+
INT2NUM(chrome_lang_id::NNetLanguageIdentifier::kMaxNumBytesToConsider));
|
188
|
+
rb_define_const(identifier, "MAX_NUM_INPUT_BYTES_TO_CONSIDER",
|
189
|
+
INT2NUM(chrome_lang_id::NNetLanguageIdentifier::kMaxNumInputBytesToConsider));
|
190
|
+
rb_define_const(identifier, "RELIABILITY_THRESHOLD",
|
191
|
+
DBL2NUM(chrome_lang_id::NNetLanguageIdentifier::kReliabilityThreshold));
|
192
|
+
rb_define_const(identifier, "RELIABILITY_HR_BS_THRESHOLD",
|
193
|
+
DBL2NUM(chrome_lang_id::NNetLanguageIdentifier::kReliabilityHrBsThreshold));
|
194
|
+
|
195
|
+
rb_undef_alloc_func(unstable);
|
196
|
+
rb_define_singleton_method(unstable, "make", make, 2);
|
197
|
+
rb_define_method(unstable, "find_language", find_language, 3);
|
198
|
+
rb_define_method(unstable, "find_top_n_most_freq_langs",
|
199
|
+
find_top_n_most_freq_langs, 4);
|
200
|
+
|
201
|
+
for (int i = 0; ; i++) {
|
202
|
+
const char *name = chrome_lang_id::TaskContextParams::language_names(i);
|
203
|
+
if (!name)
|
204
|
+
break;
|
205
|
+
|
206
|
+
rb_ary_push(language_names, ID2SYM(rb_intern(name)));
|
114
207
|
}
|
115
208
|
}
|
data/lib/cld3.rb
CHANGED
@@ -17,39 +17,10 @@
|
|
17
17
|
# limitations under the License.
|
18
18
|
# ==============================================================================
|
19
19
|
|
20
|
-
require "ffi"
|
21
|
-
require "rbconfig"
|
22
|
-
require "cld3/unstable"
|
23
|
-
|
24
20
|
# Module providing an interface for Compact Language Detector v3 (CLD3)
|
25
21
|
module CLD3
|
26
22
|
# Class for detecting the language of a document.
|
27
23
|
class NNetLanguageIdentifier
|
28
|
-
# Min number of bytes needed to make a prediction if the construcotr is
|
29
|
-
# called without the corresponding parameter.
|
30
|
-
# This is Numeric object.
|
31
|
-
MIN_NUM_BYTES_TO_CONSIDER = 140
|
32
|
-
|
33
|
-
# Max number of bytes needed to make a prediction if the construcotr is
|
34
|
-
# called without the corresponding parameter.
|
35
|
-
# This is Numeric object.
|
36
|
-
MAX_NUM_BYTES_TO_CONSIDER = 700
|
37
|
-
|
38
|
-
# Max number of input bytes to process.
|
39
|
-
# This is Numeric object.
|
40
|
-
MAX_NUM_INPUT_BYTES_TO_CONSIDER = 10000
|
41
|
-
|
42
|
-
# Predictions with probability greater than or equal to this threshold are
|
43
|
-
# marked as reliable. This threshold was optimized on a set of text segments
|
44
|
-
# extracted from wikipedia, and results in an overall precision, recall,
|
45
|
-
# and f1 equal to 0.9760, 0.9624, and 0.9692, respectively.
|
46
|
-
# This is Numeric object.
|
47
|
-
RELIABILITY_THRESHOLD = 0.7
|
48
|
-
|
49
|
-
# Reliability threshold for the languages hr and bs.
|
50
|
-
# This is Numeric object.
|
51
|
-
RELIABILITY_HR_BS_THRESHOLD = 0.5
|
52
|
-
|
53
24
|
# Holds probability that Span, specified by start/end indices, is a given
|
54
25
|
# language. The langauge is not stored here; it can be found in Result, which
|
55
26
|
# holds an Array of SpanInfo.
|
@@ -76,8 +47,10 @@ module CLD3
|
|
76
47
|
|
77
48
|
# The arguments are two Numeric objects.
|
78
49
|
def initialize(min_num_bytes = MIN_NUM_BYTES_TO_CONSIDER, max_num_bytes = MAX_NUM_BYTES_TO_CONSIDER)
|
50
|
+
min_num_bytes = min_num_bytes.ceil
|
51
|
+
max_num_bytes = max_num_bytes.floor
|
79
52
|
raise ArgumentError if min_num_bytes < 0 || min_num_bytes >= max_num_bytes
|
80
|
-
@cc = Unstable
|
53
|
+
@cc = Unstable.make(min_num_bytes, max_num_bytes)
|
81
54
|
end
|
82
55
|
|
83
56
|
# Finds the most likely language for the given text, along with additional
|
@@ -88,23 +61,7 @@ module CLD3
|
|
88
61
|
# The argument is a String object.
|
89
62
|
# The returned value of this function is an instance of Result.
|
90
63
|
def find_language(text)
|
91
|
-
|
92
|
-
|
93
|
-
text_utf8 = text.encode(Encoding::UTF_8)
|
94
|
-
pointer = FFI::MemoryPointer.new(:char, text_utf8.bytesize)
|
95
|
-
|
96
|
-
begin
|
97
|
-
pointer.put_bytes(0, text_utf8)
|
98
|
-
|
99
|
-
result = Unstable.NNetLanguageIdentifier_find_language(@cc, pointer, text_utf8.bytesize)
|
100
|
-
begin
|
101
|
-
convert_result Unstable::NNetLanguageIdentifier::Result.new(result)
|
102
|
-
ensure
|
103
|
-
Unstable.delete_result result
|
104
|
-
end
|
105
|
-
ensure
|
106
|
-
pointer.free
|
107
|
-
end
|
64
|
+
@cc.find_language(Result, SpanInfo, text.encode(Encoding::UTF_8))
|
108
65
|
end
|
109
66
|
|
110
67
|
# Splits the input text (up to the first byte, if any, that is not
|
@@ -121,52 +78,15 @@ module CLD3
|
|
121
78
|
# The second argument is Numeric object.
|
122
79
|
# The returned value of this functions is an Array of Result instances.
|
123
80
|
def find_top_n_most_freq_langs(text, num_langs)
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
text_utf8 = text.encode(Encoding::UTF_8)
|
128
|
-
pointer = FFI::MemoryPointer.new(:char, text_utf8.bytesize)
|
129
|
-
|
130
|
-
begin
|
131
|
-
pointer.put_bytes(0, text_utf8)
|
132
|
-
|
133
|
-
results = Unstable.NNetLanguageIdentifier_find_top_n_most_freq_langs(@cc, pointer, text_utf8.bytesize, num_langs)
|
134
|
-
begin
|
135
|
-
a = num_langs.times
|
136
|
-
.lazy
|
137
|
-
.map { |index| convert_result Unstable.refer_to_nth_result(results, index) }
|
138
|
-
.take_while { |result| !result.nil? }
|
139
|
-
.to_a
|
140
|
-
|
141
|
-
a
|
142
|
-
ensure
|
143
|
-
Unstable.delete_results results
|
144
|
-
end
|
145
|
-
ensure
|
146
|
-
pointer.free
|
147
|
-
end
|
81
|
+
@cc.find_top_n_most_freq_langs(Result, SpanInfo,
|
82
|
+
text.encode(Encoding::UTF_8),
|
83
|
+
num_langs)
|
148
84
|
end
|
149
85
|
|
150
|
-
|
151
|
-
|
152
|
-
def convert_result(result)
|
153
|
-
language = result[:language_data].read_bytes(result[:language_size])
|
154
|
-
return nil if language == "und"
|
155
|
-
|
156
|
-
cursor = result[:byte_ranges_data]
|
157
|
-
byte_ranges = result[:byte_ranges_size].times.map do
|
158
|
-
info = Unstable::NNetLanguageIdentifier::SpanInfo.new(cursor)
|
159
|
-
cursor += Unstable::NNetLanguageIdentifier::SpanInfo.size
|
160
|
-
SpanInfo.new(info[:start_index], info[:end_index], info[:probability])
|
161
|
-
end
|
162
|
-
|
163
|
-
Result.new(
|
164
|
-
language.to_sym,
|
165
|
-
result[:probability],
|
166
|
-
result[:reliable?],
|
167
|
-
result[:proportion],
|
168
|
-
byte_ranges)
|
86
|
+
class Unstable
|
169
87
|
end
|
88
|
+
|
89
|
+
private_constant :Unstable
|
170
90
|
end
|
171
91
|
|
172
92
|
# Encapsulates the TaskContext specifying only the parameters for the model.
|
@@ -174,17 +94,9 @@ module CLD3
|
|
174
94
|
module TaskContextParams
|
175
95
|
# This is an frozen Array object containing symbols.
|
176
96
|
# @type const LANGUAGE_NAMES: untyped
|
177
|
-
LANGUAGE_NAMES = [
|
178
|
-
:eo, :co, :eu, :ta, :de, :mt, :ps, :te, :su, :uz, :'zh-Latn', :ne,
|
179
|
-
:nl, :sw, :sq, :hmn, :ja, :no, :mn, :so, :ko, :kk, :sl, :ig,
|
180
|
-
:mr, :th, :zu, :ml, :hr, :bs, :lo, :sd, :cy, :hy, :uk, :pt,
|
181
|
-
:lv, :iw, :cs, :vi, :jv, :be, :km, :mk, :tr, :fy, :am, :zh,
|
182
|
-
:da, :sv, :fi, :ht, :af, :la, :id, :fil, :sm, :ca, :el, :ka,
|
183
|
-
:sr, :it, :sk, :ru, :'ru-Latn', :bg, :ny, :fa, :haw, :gl, :et,
|
184
|
-
:ms, :gd, :'bg-Latn', :ha, :is, :ur, :mi, :hi, :bn, :'hi-Latn', :fr,
|
185
|
-
:yi, :hu, :xh, :my, :tg, :ro, :ar, :lb, :'el-Latn', :st, :ceb,
|
186
|
-
:kn, :az, :si, :ky, :mg, :en, :gu, :es, :pl, :'ja-Latn', :ga, :lt,
|
187
|
-
:sn, :yo, :pa, :ku,
|
188
|
-
].freeze
|
97
|
+
LANGUAGE_NAMES = []
|
189
98
|
end
|
190
99
|
end
|
100
|
+
|
101
|
+
require "cld3_ext"
|
102
|
+
CLD3::TaskContextParams::LANGUAGE_NAMES.freeze
|
data/sig/cld3.rbs
CHANGED