cld3 3.5.0 → 3.5.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (50) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +0 -8
  3. data/cld3.gemspec +6 -6
  4. data/ext/cld3/extconf.rb +1 -2
  5. data/ext/cld3/nnet_language_identifier_c.cc +162 -70
  6. data/lib/cld3.rb +14 -102
  7. data/sig/cld3.rbs +2 -0
  8. metadata +15 -77
  9. data/ext/cld3/Makefile +0 -268
  10. data/ext/cld3/base.o +0 -0
  11. data/ext/cld3/embedding_feature_extractor.o +0 -0
  12. data/ext/cld3/embedding_network.o +0 -0
  13. data/ext/cld3/feature_extractor.o +0 -0
  14. data/ext/cld3/feature_types.o +0 -0
  15. data/ext/cld3/fixunicodevalue.o +0 -0
  16. data/ext/cld3/fml_parser.o +0 -0
  17. data/ext/cld3/generated_entities.o +0 -0
  18. data/ext/cld3/generated_ulscript.o +0 -0
  19. data/ext/cld3/getonescriptspan.o +0 -0
  20. data/ext/cld3/lang_id_nn_params.o +0 -0
  21. data/ext/cld3/language_identifier_features.o +0 -0
  22. data/ext/cld3/libcld3.def +0 -8
  23. data/ext/cld3/libcld3.so +0 -0
  24. data/ext/cld3/nnet_language_identifier.o +0 -0
  25. data/ext/cld3/nnet_language_identifier_c.o +0 -0
  26. data/ext/cld3/offsetmap.o +0 -0
  27. data/ext/cld3/registry.o +0 -0
  28. data/ext/cld3/relevant_script_feature.o +0 -0
  29. data/ext/cld3/script_span/fixunicodevalue.h +0 -69
  30. data/ext/cld3/script_span/generated_ulscript.h +0 -142
  31. data/ext/cld3/script_span/getonescriptspan.h +0 -124
  32. data/ext/cld3/script_span/integral_types.h +0 -37
  33. data/ext/cld3/script_span/offsetmap.h +0 -168
  34. data/ext/cld3/script_span/port.h +0 -143
  35. data/ext/cld3/script_span/stringpiece.h +0 -81
  36. data/ext/cld3/script_span/text_processing.h +0 -30
  37. data/ext/cld3/script_span/utf8acceptinterchange.h +0 -486
  38. data/ext/cld3/script_span/utf8prop_lettermarkscriptnum.h +0 -1631
  39. data/ext/cld3/script_span/utf8repl_lettermarklower.h +0 -758
  40. data/ext/cld3/script_span/utf8scannot_lettermarkspecial.h +0 -1455
  41. data/ext/cld3/script_span/utf8statetable.h +0 -285
  42. data/ext/cld3/sentence_features.o +0 -0
  43. data/ext/cld3/task_context.o +0 -0
  44. data/ext/cld3/task_context_params.o +0 -0
  45. data/ext/cld3/text_processing.o +0 -0
  46. data/ext/cld3/unicodetext.o +0 -0
  47. data/ext/cld3/utf8statetable.o +0 -0
  48. data/ext/cld3/utils.o +0 -0
  49. data/ext/cld3/workspace.o +0 -0
  50. data/lib/cld3/unstable.rb +0 -58
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: e12ff58a2cc0242896e307f33cb1adb2e210a1889244861e2c751b0399cb6415
4
- data.tar.gz: 3b0c348b00126c8ee825a76eed0919e074a7f97e7dc7e647614f7a754c69d716
3
+ metadata.gz: 730f1cc6022bc381311f03ee67a87cdcaab01bcfc6c9bc5d1b056871acef197c
4
+ data.tar.gz: d1f640c24df2a95cdd2605e4078c5b820ec15ddbe775b7c281004d6f3772df8f
5
5
  SHA512:
6
- metadata.gz: 690c6e6f0cc8f0cc369b05a611daeed74adc273cd9c02f780233921ed0c220a425552e7020c127ff0503e88d473f57088cafd3b0d2143f3f86527d886d3eaf63
7
- data.tar.gz: 6c3b7c105fa799fb918a077d9d1f47c948ec29bad86e57f452ea9eae82fc4f5bfc2686654a2a68ebc7102dfcb34e2a1ca55648d75a1f6bcf97b532de482de76e
6
+ metadata.gz: 41aedd4699c653dcfcc97f34adde401918928c7bf27cfe48fd5df4c108476be837e5c43add574823577afa35066c0386b31ee97f28c83660d35c753103955b20
7
+ data.tar.gz: 6f3b389ac6b334980e5899fd1a40664ee67011bb998deef5ccef4794f67d1613b4777243b97c7e87b8aab6118332907712700799437e3e261e0eb6ce85cd6952
data/README.md CHANGED
@@ -35,19 +35,11 @@ FreeBSD port is available as `rubygem-cld3` in `textproc` category.
35
35
 
36
36
  https://svnweb.freebsd.org/ports/head/textproc/rubygem-cld3/
37
37
 
38
- #### JRuby
39
- JRuby has a bug which prevents the feature detection. Apply the following
40
- change:
41
- https://github.com/jruby/jruby/pull/4118/commits/edad375ef4dcf195b19ce0afe4befac66468c736
42
-
43
38
  ### Troubleshooting
44
39
  `gem install cld3` triggers native library building. If it fails, it is likely
45
40
  that some required facilities are missing. Make sure a C++ compiler is installed.
46
41
  I recommend [GCC](https://gcc.gnu.org/) as a C++ compiler.
47
42
 
48
- Runtime errors are likely to be issues of [FFI](https://github.com/ffi/ffi) or
49
- programming errors. Make sure they are all correct.
50
-
51
43
  If you cannot identify the cause of your problem, run spec of this library and
52
44
  see whether the problem is reproducible with it or not. Spec is not included in
53
45
  the gem, so clone the source code repository and then run `rake spec`.
data/cld3.gemspec CHANGED
@@ -16,7 +16,7 @@
16
16
 
17
17
  Gem::Specification.new do |gem|
18
18
  gem.name = "cld3"
19
- gem.version = "3.5.0"
19
+ gem.version = "3.5.1"
20
20
  gem.summary = "Compact Language Detector v3 (CLD3)"
21
21
  gem.description = "Compact Language Detector v3 (CLD3) is a neural network model for language identification."
22
22
  gem.license = "Apache-2.0"
@@ -24,13 +24,13 @@ Gem::Specification.new do |gem|
24
24
  gem.author = "Akihiko Odaki"
25
25
  gem.email = "akihiko.odaki@gmail.com"
26
26
  gem.required_ruby_version = [ ">= 2.7.0", "< 3.3.0" ]
27
- gem.add_dependency "ffi", [ ">= 1.1.0", "< 1.16.0" ]
28
- gem.add_development_dependency "rbs", [ ">= 2.6.0", "< 2.7.0" ]
29
- gem.add_development_dependency "rspec", [ ">= 3.11.0", "< 3.12.0" ]
30
- gem.add_development_dependency "steep", [ ">= 1.0.0", "< 1.1.0" ]
27
+ gem.add_development_dependency "rbs", [ ">= 2.8.0", "< 2.9.0" ]
28
+ gem.add_development_dependency "rspec", [ ">= 3.12.0", "< 3.13.0" ]
29
+ gem.add_development_dependency "steep", [ ">= 1.3.0", "< 1.4.0" ]
31
30
  gem.files = Dir[
32
31
  "Gemfile", "LICENSE", "LICENSE_CLD3", "README.md",
33
- "cld3.gemspec", "ext/**/*", "lib/**/*", "sig/**/*"
32
+ "cld3.gemspec", "ext/**/*.c", "ext/**/*.cc", "ext/**/*.h",
33
+ "lib/**/*.rb", "sig/**/*"
34
34
  ]
35
35
  gem.require_paths = [ "lib" ]
36
36
  gem.extensions = [ "ext/cld3/extconf.rb" ]
data/ext/cld3/extconf.rb CHANGED
@@ -47,5 +47,4 @@ FileUtils.mkdir_p("script_span")
47
47
  }
48
48
 
49
49
  $CXXFLAGS += " -fvisibility=hidden -std=c++17"
50
- $LIBRUBYARG = ""
51
- create_makefile("libcld3")
50
+ create_makefile("cld3_ext")
@@ -18,6 +18,7 @@ limitations under the License.
18
18
  #include <iostream>
19
19
  #include <string>
20
20
  #include <utility>
21
+ #include <ruby.h>
21
22
  #include "nnet_language_identifier.h"
22
23
 
23
24
  #if defined _WIN32 || defined __CYGWIN__
@@ -27,89 +28,180 @@ limitations under the License.
27
28
  #endif
28
29
 
29
30
  struct Result {
30
- struct {
31
- const char *data;
32
- std::size_t size;
33
- } language;
34
- struct {
35
- const chrome_lang_id::NNetLanguageIdentifier::SpanInfo *data;
36
- std::size_t size;
37
- } byte_ranges;
38
- float probability;
39
- float proportion;
40
- bool is_reliable;
31
+ VALUE result_klass;
32
+ VALUE span_info_klass;
33
+ const chrome_lang_id::NNetLanguageIdentifier::Result& data;
34
+
35
+ VALUE convert() const {
36
+ if (data.language == chrome_lang_id::NNetLanguageIdentifier::kUnknown)
37
+ return Qnil;
38
+
39
+ VALUE byte_ranges = rb_ary_new2(data.byte_ranges.size());
40
+ for (auto& byte_range_data : data.byte_ranges) {
41
+ VALUE argv[] = {
42
+ INT2NUM(byte_range_data.start_index),
43
+ INT2NUM(byte_range_data.end_index),
44
+ DBL2NUM(byte_range_data.probability),
45
+ };
46
+
47
+ VALUE byte_range = rb_class_new_instance(sizeof(argv) / sizeof(*argv),
48
+ argv,
49
+ span_info_klass);
50
+ rb_ary_push(byte_ranges, byte_range);
51
+ }
52
+
53
+ VALUE argv[] = {
54
+ ID2SYM(rb_intern2(data.language.data(), data.language.size())),
55
+ DBL2NUM(data.probability),
56
+ data.is_reliable ? Qtrue : Qfalse,
57
+ DBL2NUM(data.proportion),
58
+ byte_ranges,
59
+ };
60
+
61
+ return rb_class_new_instance(sizeof(argv) / sizeof(*argv), argv,
62
+ result_klass);
63
+ }
41
64
  };
42
65
 
43
- struct OwningResult {
44
- OwningResult(chrome_lang_id::NNetLanguageIdentifier::Result&& result) {
45
- references.language = std::move(result.language);
46
- references.byte_ranges = std::move(result.byte_ranges);
47
- plain.language.data = references.language.data();
48
- plain.language.size = references.language.size();
49
- plain.byte_ranges.data = references.byte_ranges.data();
50
- plain.byte_ranges.size = references.byte_ranges.size();
51
- plain.probability = result.probability;
52
- plain.proportion = result.proportion;
53
- plain.is_reliable = result.is_reliable;
66
+ struct ResultVector {
67
+ VALUE result_klass;
68
+ VALUE span_info_klass;
69
+ VALUE buffer;
70
+ const std::vector<chrome_lang_id::NNetLanguageIdentifier::Result>& data;
71
+
72
+ VALUE convert() const {
73
+ for (auto& element_data : data) {
74
+ Result result { result_klass, span_info_klass, element_data };
75
+ VALUE element = result.convert();
76
+ if (element == Qnil)
77
+ break;
78
+
79
+ rb_ary_push(buffer, element);
80
+ }
81
+
82
+ return buffer;
54
83
  }
84
+ };
55
85
 
56
- Result plain;
57
- struct {
58
- std::string language;
59
- std::vector<chrome_lang_id::NNetLanguageIdentifier::SpanInfo> byte_ranges;
60
- } references;
86
+ template<typename T>
87
+ VALUE convert_protected(VALUE arg)
88
+ {
89
+ auto result = reinterpret_cast<const T *>(arg);
90
+ return result->convert();
91
+ }
92
+
93
+ static void dfree(void *arg) {
94
+ auto data = static_cast<chrome_lang_id::NNetLanguageIdentifier *>(arg);
95
+ data->~NNetLanguageIdentifier();
96
+ xfree(arg);
97
+ }
98
+
99
+ static size_t dsize(const void *data) {
100
+ return sizeof(chrome_lang_id::NNetLanguageIdentifier);
101
+ }
102
+
103
+ static const rb_data_type_t data_type = {
104
+ .wrap_struct_name = "CLD3::NNetLanguageIdentifier",
105
+ .function = {
106
+ .dfree = dfree,
107
+ .dsize = dsize,
108
+ },
109
+ .flags = RUBY_TYPED_FREE_IMMEDIATELY
61
110
  };
62
111
 
63
- extern "C" {
64
- EXPORT OwningResult *NNetLanguageIdentifier_find_language(
65
- chrome_lang_id::NNetLanguageIdentifier *instance,
66
- const char *data,
67
- std::size_t size) {
68
- return new OwningResult(instance->FindLanguage(std::string(data, size)));
112
+ static VALUE find_language(VALUE obj,
113
+ VALUE result_klass, VALUE span_info_klass,
114
+ VALUE text) {
115
+ int state;
116
+ VALUE converted;
117
+
118
+ {
119
+ chrome_lang_id::NNetLanguageIdentifier *data;
120
+ TypedData_Get_Struct(obj, chrome_lang_id::NNetLanguageIdentifier,
121
+ &data_type, data);
122
+ std::string text_string = std::string(RSTRING_PTR(text), RSTRING_LEN(text));
123
+ auto result_data = data->FindLanguage(text_string);
124
+ Result result { result_klass, span_info_klass, result_data };
125
+
126
+ converted = rb_protect(convert_protected<Result>,
127
+ reinterpret_cast<VALUE>(&result),
128
+ &state);
69
129
  }
70
130
 
71
- EXPORT std::vector<chrome_lang_id::NNetLanguageIdentifier::Result>*
72
- NNetLanguageIdentifier_find_top_n_most_freq_langs(
73
- chrome_lang_id::NNetLanguageIdentifier *instance,
74
- const char *data, std::size_t size, int num_langs) {
75
- std::string text(data, size);
76
- return new auto(instance->FindTopNMostFreqLangs(text, num_langs));
77
- }
131
+ if (state)
132
+ rb_jump_tag(state);
78
133
 
79
- EXPORT void delete_NNetLanguageIdentifier(
80
- chrome_lang_id::NNetLanguageIdentifier *pointer) {
81
- delete pointer;
82
- }
134
+ return converted;
135
+ }
83
136
 
84
- EXPORT void delete_result(OwningResult *pointer) {
85
- delete pointer;
137
+ static VALUE find_top_n_most_freq_langs(VALUE obj,
138
+ VALUE result_klass,
139
+ VALUE span_info_klass,
140
+ VALUE text,
141
+ VALUE num_langs) {
142
+ int state;
143
+ VALUE converted;
144
+
145
+ {
146
+ chrome_lang_id::NNetLanguageIdentifier *data;
147
+ TypedData_Get_Struct(obj, chrome_lang_id::NNetLanguageIdentifier,
148
+ &data_type, data);
149
+ VALUE buffer = rb_ary_new2(NUM2INT(num_langs));
150
+ std::string text_string = std::string(RSTRING_PTR(text), RSTRING_LEN(text));
151
+ auto result_data = data->FindTopNMostFreqLangs(text_string, num_langs);
152
+ ResultVector result { result_klass, span_info_klass, buffer, result_data };
153
+
154
+ converted = rb_protect(convert_protected<ResultVector>,
155
+ reinterpret_cast<VALUE>(&result),
156
+ &state);
86
157
  }
87
158
 
88
- EXPORT void delete_results(
89
- std::vector<chrome_lang_id::NNetLanguageIdentifier::Result> *pointer) {
90
- delete pointer;
91
- }
159
+ if (state)
160
+ rb_jump_tag(state);
92
161
 
93
- EXPORT chrome_lang_id::NNetLanguageIdentifier *new_NNetLanguageIdentifier(
94
- int min_num_bytes, int max_num_bytes) {
95
- return new chrome_lang_id::NNetLanguageIdentifier(
96
- min_num_bytes, max_num_bytes);
97
- }
162
+ return converted;
163
+ }
164
+
165
+ static VALUE make(VALUE klass, VALUE min_num_bytes, VALUE max_num_bytes) {
166
+ int min_num_bytes_int = NUM2INT(min_num_bytes);
167
+ int max_num_bytes_int = NUM2INT(max_num_bytes);
168
+ chrome_lang_id::NNetLanguageIdentifier *data;
169
+ VALUE value = TypedData_Make_Struct(klass,
170
+ chrome_lang_id::NNetLanguageIdentifier,
171
+ &data_type, data);
172
+ new (data) chrome_lang_id::NNetLanguageIdentifier(min_num_bytes_int, max_num_bytes_int);
173
+ return value;
174
+ }
98
175
 
99
- EXPORT Result refer_to_nth_result(
100
- std::vector<chrome_lang_id::NNetLanguageIdentifier::Result> *results,
101
- std::size_t index) {
102
- Result c;
103
- auto& cc = (*results)[index];
104
-
105
- c.language.data = cc.language.data();
106
- c.language.size = cc.language.size();
107
- c.byte_ranges.data = cc.byte_ranges.data();
108
- c.byte_ranges.size = cc.byte_ranges.size();
109
- c.probability = cc.probability;
110
- c.proportion = cc.proportion;
111
- c.is_reliable = cc.is_reliable;
112
-
113
- return c;
176
+ extern "C" EXPORT void Init_cld3_ext() {
177
+ VALUE cld3 = rb_const_get(rb_cObject, rb_intern("CLD3"));
178
+ VALUE identifier =
179
+ rb_const_get(cld3, rb_intern("NNetLanguageIdentifier"));
180
+ VALUE unstable = rb_const_get(identifier, rb_intern("Unstable"));
181
+ VALUE params = rb_const_get(cld3, rb_intern("TaskContextParams"));
182
+ VALUE language_names = rb_const_get(params, rb_intern("LANGUAGE_NAMES"));
183
+
184
+ rb_define_const(identifier, "MIN_NUM_BYTES_TO_CONSIDER",
185
+ INT2NUM(chrome_lang_id::NNetLanguageIdentifier::kMinNumBytesToConsider));
186
+ rb_define_const(identifier, "MAX_NUM_BYTES_TO_CONSIDER",
187
+ INT2NUM(chrome_lang_id::NNetLanguageIdentifier::kMaxNumBytesToConsider));
188
+ rb_define_const(identifier, "MAX_NUM_INPUT_BYTES_TO_CONSIDER",
189
+ INT2NUM(chrome_lang_id::NNetLanguageIdentifier::kMaxNumInputBytesToConsider));
190
+ rb_define_const(identifier, "RELIABILITY_THRESHOLD",
191
+ DBL2NUM(chrome_lang_id::NNetLanguageIdentifier::kReliabilityThreshold));
192
+ rb_define_const(identifier, "RELIABILITY_HR_BS_THRESHOLD",
193
+ DBL2NUM(chrome_lang_id::NNetLanguageIdentifier::kReliabilityHrBsThreshold));
194
+
195
+ rb_define_singleton_method(unstable, "make", make, 2);
196
+ rb_define_method(unstable, "find_language", find_language, 3);
197
+ rb_define_method(unstable, "find_top_n_most_freq_langs",
198
+ find_top_n_most_freq_langs, 4);
199
+
200
+ for (int i = 0; ; i++) {
201
+ const char *name = chrome_lang_id::TaskContextParams::language_names(i);
202
+ if (!name)
203
+ break;
204
+
205
+ rb_ary_push(language_names, ID2SYM(rb_intern(name)));
114
206
  }
115
207
  }
data/lib/cld3.rb CHANGED
@@ -17,39 +17,10 @@
17
17
  # limitations under the License.
18
18
  # ==============================================================================
19
19
 
20
- require "ffi"
21
- require "rbconfig"
22
- require "cld3/unstable"
23
-
24
20
  # Module providing an interface for Compact Language Detector v3 (CLD3)
25
21
  module CLD3
26
22
  # Class for detecting the language of a document.
27
23
  class NNetLanguageIdentifier
28
- # Min number of bytes needed to make a prediction if the construcotr is
29
- # called without the corresponding parameter.
30
- # This is Numeric object.
31
- MIN_NUM_BYTES_TO_CONSIDER = 140
32
-
33
- # Max number of bytes needed to make a prediction if the construcotr is
34
- # called without the corresponding parameter.
35
- # This is Numeric object.
36
- MAX_NUM_BYTES_TO_CONSIDER = 700
37
-
38
- # Max number of input bytes to process.
39
- # This is Numeric object.
40
- MAX_NUM_INPUT_BYTES_TO_CONSIDER = 10000
41
-
42
- # Predictions with probability greater than or equal to this threshold are
43
- # marked as reliable. This threshold was optimized on a set of text segments
44
- # extracted from wikipedia, and results in an overall precision, recall,
45
- # and f1 equal to 0.9760, 0.9624, and 0.9692, respectively.
46
- # This is Numeric object.
47
- RELIABILITY_THRESHOLD = 0.7
48
-
49
- # Reliability threshold for the languages hr and bs.
50
- # This is Numeric object.
51
- RELIABILITY_HR_BS_THRESHOLD = 0.5
52
-
53
24
  # Holds probability that Span, specified by start/end indices, is a given
54
25
  # language. The language is not stored here; it can be found in Result, which
55
26
  # holds an Array of SpanInfo.
@@ -76,8 +47,10 @@ module CLD3
76
47
 
77
48
  # The arguments are two Numeric objects.
78
49
  def initialize(min_num_bytes = MIN_NUM_BYTES_TO_CONSIDER, max_num_bytes = MAX_NUM_BYTES_TO_CONSIDER)
50
+ min_num_bytes = min_num_bytes.ceil
51
+ max_num_bytes = max_num_bytes.floor
79
52
  raise ArgumentError if min_num_bytes < 0 || min_num_bytes >= max_num_bytes
80
- @cc = Unstable::NNetLanguageIdentifier::Pointer.new(Unstable.new_NNetLanguageIdentifier(min_num_bytes, max_num_bytes))
53
+ @cc = Unstable.make(min_num_bytes, max_num_bytes)
81
54
  end
82
55
 
83
56
  # Finds the most likely language for the given text, along with additional
@@ -88,23 +61,7 @@ module CLD3
88
61
  # The argument is a String object.
89
62
  # The returned value of this function is an instance of Result.
90
63
  def find_language(text)
91
- # @type const FFI: untyped
92
-
93
- text_utf8 = text.encode(Encoding::UTF_8)
94
- pointer = FFI::MemoryPointer.new(:char, text_utf8.bytesize)
95
-
96
- begin
97
- pointer.put_bytes(0, text_utf8)
98
-
99
- result = Unstable.NNetLanguageIdentifier_find_language(@cc, pointer, text_utf8.bytesize)
100
- begin
101
- convert_result Unstable::NNetLanguageIdentifier::Result.new(result)
102
- ensure
103
- Unstable.delete_result result
104
- end
105
- ensure
106
- pointer.free
107
- end
64
+ @cc.find_language(Result, SpanInfo, text.encode(Encoding::UTF_8))
108
65
  end
109
66
 
110
67
  # Splits the input text (up to the first byte, if any, that is not
@@ -121,52 +78,15 @@ module CLD3
121
78
  # The second argument is Numeric object.
122
79
  # The returned value of this function is an Array of Result instances.
123
80
  def find_top_n_most_freq_langs(text, num_langs)
124
- # @type const FFI: untyped
125
- # @type var a: untyped
126
-
127
- text_utf8 = text.encode(Encoding::UTF_8)
128
- pointer = FFI::MemoryPointer.new(:char, text_utf8.bytesize)
129
-
130
- begin
131
- pointer.put_bytes(0, text_utf8)
132
-
133
- results = Unstable.NNetLanguageIdentifier_find_top_n_most_freq_langs(@cc, pointer, text_utf8.bytesize, num_langs)
134
- begin
135
- a = num_langs.times
136
- .lazy
137
- .map { |index| convert_result Unstable.refer_to_nth_result(results, index) }
138
- .take_while { |result| !result.nil? }
139
- .to_a
140
-
141
- a
142
- ensure
143
- Unstable.delete_results results
144
- end
145
- ensure
146
- pointer.free
147
- end
81
+ @cc.find_top_n_most_freq_langs(Result, SpanInfo,
82
+ text.encode(Encoding::UTF_8),
83
+ num_langs)
148
84
  end
149
85
 
150
- private
151
-
152
- def convert_result(result)
153
- language = result[:language_data].read_bytes(result[:language_size])
154
- return nil if language == "und"
155
-
156
- cursor = result[:byte_ranges_data]
157
- byte_ranges = result[:byte_ranges_size].times.map do
158
- info = Unstable::NNetLanguageIdentifier::SpanInfo.new(cursor)
159
- cursor += Unstable::NNetLanguageIdentifier::SpanInfo.size
160
- SpanInfo.new(info[:start_index], info[:end_index], info[:probability])
161
- end
162
-
163
- Result.new(
164
- language.to_sym,
165
- result[:probability],
166
- result[:reliable?],
167
- result[:proportion],
168
- byte_ranges)
86
+ class Unstable
169
87
  end
88
+
89
+ private_constant :Unstable
170
90
  end
171
91
 
172
92
  # Encapsulates the TaskContext specifying only the parameters for the model.
@@ -174,17 +94,9 @@ module CLD3
174
94
  module TaskContextParams
175
95
  # This is a frozen Array object containing symbols.
176
96
  # @type const LANGUAGE_NAMES: untyped
177
- LANGUAGE_NAMES = [
178
- :eo, :co, :eu, :ta, :de, :mt, :ps, :te, :su, :uz, :'zh-Latn', :ne,
179
- :nl, :sw, :sq, :hmn, :ja, :no, :mn, :so, :ko, :kk, :sl, :ig,
180
- :mr, :th, :zu, :ml, :hr, :bs, :lo, :sd, :cy, :hy, :uk, :pt,
181
- :lv, :iw, :cs, :vi, :jv, :be, :km, :mk, :tr, :fy, :am, :zh,
182
- :da, :sv, :fi, :ht, :af, :la, :id, :fil, :sm, :ca, :el, :ka,
183
- :sr, :it, :sk, :ru, :'ru-Latn', :bg, :ny, :fa, :haw, :gl, :et,
184
- :ms, :gd, :'bg-Latn', :ha, :is, :ur, :mi, :hi, :bn, :'hi-Latn', :fr,
185
- :yi, :hu, :xh, :my, :tg, :ro, :ar, :lb, :'el-Latn', :st, :ceb,
186
- :kn, :az, :si, :ky, :mg, :en, :gu, :es, :pl, :'ja-Latn', :ga, :lt,
187
- :sn, :yo, :pa, :ku,
188
- ].freeze
97
+ LANGUAGE_NAMES = []
189
98
  end
190
99
  end
100
+
101
+ require "cld3_ext"
102
+ CLD3::TaskContextParams::LANGUAGE_NAMES.freeze
data/sig/cld3.rbs CHANGED
@@ -36,6 +36,8 @@ module CLD3
36
36
  attr_accessor byte_ranges(): Array[SpanInfo]
37
37
  end
38
38
 
39
+ @cc: untyped
40
+
39
41
  def initialize: (?Integer, ?Integer) -> void
40
42
  def find_language: (String) -> Result?
41
43
  def find_top_n_most_freq_langs: (String, Integer) -> Array[Result]