cld3 3.4.2 → 3.4.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (41) hide show
  1. checksums.yaml +4 -4
  2. data/cld3.gemspec +5 -3
  3. data/ext/cld3/Makefile +6 -5
  4. data/ext/cld3/base.o +0 -0
  5. data/ext/cld3/embedding_feature_extractor.o +0 -0
  6. data/ext/cld3/embedding_network.cc +1 -0
  7. data/ext/cld3/embedding_network.o +0 -0
  8. data/ext/cld3/feature_extractor.o +0 -0
  9. data/ext/cld3/feature_extractor.pb.o +0 -0
  10. data/ext/cld3/feature_types.o +0 -0
  11. data/ext/cld3/fixunicodevalue.o +0 -0
  12. data/ext/cld3/fml_parser.o +0 -0
  13. data/ext/cld3/generated_entities.o +0 -0
  14. data/ext/cld3/generated_ulscript.o +0 -0
  15. data/ext/cld3/getonescriptspan.h +2 -2
  16. data/ext/cld3/getonescriptspan.o +0 -0
  17. data/ext/cld3/lang_id_nn_params.o +0 -0
  18. data/ext/cld3/language_identifier_features.o +0 -0
  19. data/ext/cld3/libcld3.so +0 -0
  20. data/ext/cld3/nnet_language_identifier.cc +3 -5
  21. data/ext/cld3/nnet_language_identifier.o +0 -0
  22. data/ext/cld3/nnet_language_identifier_c.o +0 -0
  23. data/ext/cld3/offsetmap.o +0 -0
  24. data/ext/cld3/registry.o +0 -0
  25. data/ext/cld3/relevant_script_feature.o +0 -0
  26. data/ext/cld3/sentence.pb.o +0 -0
  27. data/ext/cld3/sentence_features.cc +4 -4
  28. data/ext/cld3/sentence_features.h +13 -3
  29. data/ext/cld3/sentence_features.o +0 -0
  30. data/ext/cld3/task_context.o +0 -0
  31. data/ext/cld3/task_context_params.o +0 -0
  32. data/ext/cld3/task_spec.pb.o +0 -0
  33. data/ext/cld3/text_processing.o +0 -0
  34. data/ext/cld3/unicodetext.o +0 -0
  35. data/ext/cld3/utf8statetable.o +0 -0
  36. data/ext/cld3/utils.o +0 -0
  37. data/ext/cld3/workspace.o +0 -0
  38. data/lib/cld3/unstable.rb +58 -0
  39. data/lib/cld3.rb +9 -41
  40. data/sig/cld3.rbs +65 -0
  41. metadata +46 -4
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: f5b3cc203abda97cb85d5dee0983b7f63c626397b8af8b90e2110bb5fedbbdec
4
- data.tar.gz: 197f66798925404ded7af722d0194a705018d6953b11f4576c4e180ea093675d
3
+ metadata.gz: 2c161cbf12d260074efd2e9db3981b6615af20ee04c234d6b2710bd52a283a4e
4
+ data.tar.gz: c388ae6b529d95e015ecdb7d21cdd7f1ceaca72d167d0f8008b5477d5bce5b3c
5
5
  SHA512:
6
- metadata.gz: 855e8ee464a2842906bfef211e2afb21820fe9a7449b58d91b9ab1908c997966b9dd4c2d5d51f82ceb84b65b5a118736a5aa4eff6ea9548b9a9abc61b297a9d0
7
- data.tar.gz: e38ddfd81489aeb83bccc7b509dd17ea79c56ba641de37cac2d800d3428ed31e5ac57066016bd118e9e71c30c78d31b4c38a266abe012065495558adf07e68f5
6
+ metadata.gz: 8e3c1c07283730e722c450acc308a497756fd501595a02a7fc066d0b3e59b96e1ab1e7941549293b02e41274b176772bdae3779a041eb28f8ae53f5c44308cc0
7
+ data.tar.gz: 52e95027de7a595b2eabc49745a11f664e305c18f9926bc9d649642a92fea9846efdd23da699529795d80609b8871b00e77f9379449d2e4f6cb79ecbcf2785db
data/cld3.gemspec CHANGED
@@ -16,19 +16,21 @@
16
16
 
17
17
  Gem::Specification.new do |gem|
18
18
  gem.name = "cld3"
19
- gem.version = "3.4.2"
19
+ gem.version = "3.4.3"
20
20
  gem.summary = "Compact Language Detector v3 (CLD3)"
21
21
  gem.description = "Compact Language Detector v3 (CLD3) is a neural network model for language identification."
22
22
  gem.license = "Apache-2.0"
23
23
  gem.homepage = "https://github.com/akihikodaki/cld3-ruby"
24
24
  gem.author = "Akihiko Odaki"
25
25
  gem.email = "akihiko.odaki@gmail.com"
26
- gem.required_ruby_version = [ ">= 2.6.0", "< 3.1.0" ]
26
+ gem.required_ruby_version = [ ">= 2.6.0", "< 3.2.0" ]
27
27
  gem.add_dependency "ffi", [ ">= 1.1.0", "< 1.16.0" ]
28
+ gem.add_development_dependency "rbs", [ ">= 1.7.0", "< 1.8.0" ]
28
29
  gem.add_development_dependency "rspec", [ ">=3.0.0", "< 3.11.0" ]
30
+ gem.add_development_dependency "steep", [ ">= 0.46.0", "< 0.47.0" ]
29
31
  gem.files = Dir[
30
32
  "Gemfile", "LICENSE", "LICENSE_CLD3", "README.md",
31
- "cld3.gemspec", "ext/**/*", "lib/**/*"
33
+ "cld3.gemspec", "ext/**/*", "lib/**/*", "sig/**/*"
32
34
  ]
33
35
  gem.require_paths = [ "lib" ]
34
36
  gem.extensions = [ "ext/cld3/extconf.rb" ]
data/ext/cld3/Makefile CHANGED
@@ -53,6 +53,7 @@ infodir = $(DESTDIR)/usr/share/info
53
53
  docdir = $(datarootdir)/doc/$(PACKAGE)
54
54
  oldincludedir = $(DESTDIR)/usr/include
55
55
  includedir = $(DESTDIR)/usr/include
56
+ runstatedir = $(localstatedir)/run
56
57
  localstatedir = $(DESTDIR)/var
57
58
  sharedstatedir = $(DESTDIR)/var/lib
58
59
  sysconfdir = $(DESTDIR)/etc
@@ -95,7 +96,7 @@ ARCH_FLAG =
95
96
  DLDFLAGS = $(ldflags) $(dldflags) $(ARCH_FLAG)
96
97
  LDSHARED = $(CC) -shared
97
98
  LDSHAREDXX = $(CXX) -shared
98
- AR = ar
99
+ AR = gcc-ar
99
100
  EXEEXT =
100
101
 
101
102
  RUBY_INSTALL_NAME = $(RUBY_BASE_NAME)
@@ -107,7 +108,7 @@ RUBY_BASE_NAME = ruby
107
108
 
108
109
  arch = aarch64-linux
109
110
  sitearch = $(arch)
110
- ruby_version = 2.7.0
111
+ ruby_version = 3.0.0
111
112
  ruby = $(bindir)/$(RUBY_BASE_NAME)
112
113
  RUBY = $(ruby)
113
114
  ruby_headers = $(hdrdir)/ruby.h $(hdrdir)/ruby/backward.h $(hdrdir)/ruby/ruby.h $(hdrdir)/ruby/defines.h $(hdrdir)/ruby/missing.h $(hdrdir)/ruby/intern.h $(hdrdir)/ruby/st.h $(hdrdir)/ruby/subst.h $(arch_hdrdir)/ruby/config.h
@@ -141,7 +142,7 @@ LIBS = -lprotobuf -lpthread -lm -lc
141
142
  ORIG_SRCS = base.cc embedding_feature_extractor.cc embedding_network.cc feature_extractor.cc feature_extractor.pb.cc feature_types.cc fixunicodevalue.cc fml_parser.cc generated_entities.cc generated_ulscript.cc getonescriptspan.cc lang_id_nn_params.cc language_identifier_features.cc nnet_language_identifier.cc nnet_language_identifier_c.cc offsetmap.cc registry.cc relevant_script_feature.cc sentence.pb.cc sentence_features.cc task_context.cc task_context_params.cc task_spec.pb.cc text_processing.cc unicodetext.cc utf8statetable.cc utils.cc workspace.cc
142
143
  SRCS = $(ORIG_SRCS)
143
144
  OBJS = base.o embedding_feature_extractor.o embedding_network.o feature_extractor.o feature_extractor.pb.o feature_types.o fixunicodevalue.o fml_parser.o generated_entities.o generated_ulscript.o getonescriptspan.o lang_id_nn_params.o language_identifier_features.o nnet_language_identifier.o nnet_language_identifier_c.o offsetmap.o registry.o relevant_script_feature.o sentence.pb.o sentence_features.o task_context.o task_context_params.o task_spec.pb.o text_processing.o unicodetext.o utf8statetable.o utils.o workspace.o
144
- HDRS = $(srcdir)/base.h $(srcdir)/casts.h $(srcdir)/embedding_feature_extractor.h $(srcdir)/embedding_network.h $(srcdir)/embedding_network_params.h $(srcdir)/feature_extractor.h $(srcdir)/feature_types.h $(srcdir)/float16.h $(srcdir)/fml_parser.h $(srcdir)/language_identifier_features.h $(srcdir)/lang_id_nn_params.h $(srcdir)/nnet_language_identifier.h $(srcdir)/registry.h $(srcdir)/relevant_script_feature.h $(srcdir)/script_detector.h $(srcdir)/sentence_features.h $(srcdir)/simple_adder.h $(srcdir)/fixunicodevalue.h $(srcdir)/generated_ulscript.h $(srcdir)/getonescriptspan.h $(srcdir)/integral_types.h $(srcdir)/offsetmap.h $(srcdir)/port.h $(srcdir)/stringpiece.h $(srcdir)/text_processing.h $(srcdir)/utf8acceptinterchange.h $(srcdir)/utf8prop_lettermarkscriptnum.h $(srcdir)/utf8repl_lettermarklower.h $(srcdir)/utf8scannot_lettermarkspecial.h $(srcdir)/utf8statetable.h $(srcdir)/task_context.h $(srcdir)/task_context_params.h $(srcdir)/unicodetext.h $(srcdir)/utils.h $(srcdir)/workspace.h $(srcdir)/feature_extractor.pb.h $(srcdir)/sentence.pb.h $(srcdir)/task_spec.pb.h
145
+ HDRS = $(srcdir)/base.h $(srcdir)/casts.h $(srcdir)/embedding_feature_extractor.h $(srcdir)/embedding_network.h $(srcdir)/embedding_network_params.h $(srcdir)/feature_extractor.h $(srcdir)/feature_extractor.pb.h $(srcdir)/feature_types.h $(srcdir)/fixunicodevalue.h $(srcdir)/float16.h $(srcdir)/fml_parser.h $(srcdir)/generated_ulscript.h $(srcdir)/getonescriptspan.h $(srcdir)/integral_types.h $(srcdir)/lang_id_nn_params.h $(srcdir)/language_identifier_features.h $(srcdir)/nnet_language_identifier.h $(srcdir)/offsetmap.h $(srcdir)/port.h $(srcdir)/registry.h $(srcdir)/relevant_script_feature.h $(srcdir)/script_detector.h $(srcdir)/sentence.pb.h $(srcdir)/sentence_features.h $(srcdir)/simple_adder.h $(srcdir)/stringpiece.h $(srcdir)/task_context.h $(srcdir)/task_context_params.h $(srcdir)/task_spec.pb.h $(srcdir)/text_processing.h $(srcdir)/unicodetext.h $(srcdir)/utf8acceptinterchange.h $(srcdir)/utf8prop_lettermarkscriptnum.h $(srcdir)/utf8repl_lettermarklower.h $(srcdir)/utf8scannot_lettermarkspecial.h $(srcdir)/utf8statetable.h $(srcdir)/utils.h $(srcdir)/workspace.h
145
146
  LOCAL_HDRS =
146
147
  TARGET = libcld3
147
148
  TARGET_NAME = libcld3
@@ -155,8 +156,8 @@ BINDIR = $(bindir)
155
156
  RUBYCOMMONDIR = $(sitedir)$(target_prefix)
156
157
  RUBYLIBDIR = $(sitelibdir)$(target_prefix)
157
158
  RUBYARCHDIR = $(sitearchdir)$(target_prefix)
158
- HDRDIR = $(rubyhdrdir)/ruby$(target_prefix)
159
- ARCHHDRDIR = $(rubyhdrdir)/$(arch)/ruby$(target_prefix)
159
+ HDRDIR = $(sitehdrdir)$(target_prefix)
160
+ ARCHHDRDIR = $(sitearchhdrdir)$(target_prefix)
160
161
  TARGET_SO_DIR =
161
162
  TARGET_SO = $(TARGET_SO_DIR)$(DLLIB)
162
163
  CLEANLIBS = $(TARGET_SO)
data/ext/cld3/base.o CHANGED
Binary file
Binary file
@@ -167,6 +167,7 @@ EmbeddingNetwork::EmbeddingNetwork(const EmbeddingNetworkParams *model)
167
167
  for (int i = 0; i < model_->embedding_dim_size(); ++i) {
168
168
  CLD3_DCHECK(offset_sum == model_->concat_offset(i));
169
169
  offset_sum += model_->embedding_dim(i) * model_->embedding_num_features(i);
170
+ (void)offset_sum; // Avoid compiler warning for "unused" variable.
170
171
  embedding_matrices_.emplace_back(model_->GetEmbeddingMatrix(i));
171
172
  }
172
173
 
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
@@ -33,14 +33,14 @@ static const int kMaxScriptBytes = kMaxScriptBuffer - 32; // Leave some room
33
33
  static const int kWithinScriptTail = 32; // Stop at word space in last
34
34
  // N bytes of script buffer
35
35
 
36
- typedef struct {
36
+ struct LangSpan {
37
37
  char* text = nullptr; // Pointer to the span, somewhere
38
38
  int text_bytes = 0; // Number of bytes of text in the span
39
39
  int offset = 0; // Offset of start of span in original input buffer
40
40
  ULScript ulscript = UNKNOWN_ULSCRIPT; // Unicode Letters Script of this span
41
41
  bool truncated = false; // true if buffer filled up before a
42
42
  // different script or EOF was found
43
- } LangSpan;
43
+ };
44
44
 
45
45
  static inline bool IsContinuationByte(char c) {
46
46
  return static_cast<signed char>(c) < -64;
Binary file
Binary file
data/ext/cld3/libcld3.so CHANGED
Binary file
@@ -284,8 +284,6 @@ NNetLanguageIdentifier::FindTopNMostFreqLangs(const string &text,
284
284
  CLD2::LangSpan script_span;
285
285
  std::unordered_map<string, LangChunksStats> lang_stats;
286
286
  int total_num_bytes = 0;
287
- Result result;
288
- string language;
289
287
  int chunk_size = 0; // Use the default.
290
288
  while (ss.GetOneScriptSpanLower(&script_span)) {
291
289
  const int num_original_span_bytes = script_span.text_bytes;
@@ -302,8 +300,8 @@ NNetLanguageIdentifier::FindTopNMostFreqLangs(const string &text,
302
300
 
303
301
  const string selected_text = SelectTextGivenScriptSpan(script_span);
304
302
 
305
- result = FindLanguageOfValidUTF8(selected_text);
306
- language = result.language;
303
+ Result result = FindLanguageOfValidUTF8(selected_text);
304
+ string language = result.language;
307
305
  lang_stats[language].byte_sum += num_original_span_bytes;
308
306
  lang_stats[language].prob_sum +=
309
307
  result.probability * num_original_span_bytes;
@@ -356,7 +354,7 @@ string NNetLanguageIdentifier::SelectTextGivenBeginAndSize(
356
354
  const char *text_begin, int text_size) {
357
355
  string output_text;
358
356
 
359
- // If the size of the input is greater than the maxium number of bytes needed
357
+ // If the size of the input is greater than the maximum number of bytes needed
360
358
  // for a prediction, then concatenate snippets that are equally spread out
361
359
  // throughout the input.
362
360
  if (text_size > max_num_bytes_) {
Binary file
Binary file
data/ext/cld3/offsetmap.o CHANGED
Binary file
data/ext/cld3/registry.o CHANGED
Binary file
Binary file
Binary file
@@ -19,11 +19,11 @@ limitations under the License.
19
19
 
20
20
  namespace chrome_lang_id {
21
21
 
22
- // Declare registry for the whole Sentence feature functions. NOTE: this is not
22
+ // Define registry for the whole Sentence feature functions. NOTE: this is not
23
23
  // yet set to anything meaningful. It will be set so in NNetLanguageIdentifier
24
24
  // constructor, *before* we use any feature.
25
25
  template <>
26
- WholeSentenceFeature::Registry
27
- *RegisterableClass<WholeSentenceFeature>::registry_ = nullptr;
26
+ WholeSentenceFeature::Registry*
27
+ RegisterableClass<WholeSentenceFeature>::registry_ = nullptr;
28
28
 
29
- } // namespace chrome_lang_id
29
+ } // namespace chrome_lang_id
@@ -26,9 +26,19 @@ limitations under the License.
26
26
  namespace chrome_lang_id {
27
27
 
28
28
  // Feature function that extracts features for the full Sentence.
29
- typedef FeatureFunction<Sentence> WholeSentenceFeature;
30
-
31
- typedef FeatureExtractor<Sentence> WholeSentenceExtractor;
29
+ using WholeSentenceFeature = FeatureFunction<Sentence>;
30
+
31
+ using WholeSentenceExtractor = FeatureExtractor<Sentence>;
32
+
33
+ // Declare registry for the whole Sentence feature functions. This is required
34
+ // for clang's -Wundefined-var-template. However, MSVC has a bug which treats
35
+ // this declaration as a definition, leading to multiple definition errors, so
36
+ // omit this on MSVC.
37
+ #if !defined(COMPILER_MSVC)
38
+ template <>
39
+ WholeSentenceFeature::Registry
40
+ *RegisterableClass<WholeSentenceFeature>::registry_;
41
+ #endif
32
42
 
33
43
  } // namespace chrome_lang_id
34
44
 
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
data/ext/cld3/utils.o CHANGED
Binary file
data/ext/cld3/workspace.o CHANGED
Binary file
@@ -0,0 +1,58 @@
1
+
2
+ # Copyright 2021 Akihiko Odaki <akihiko.odaki@gmail.com>
3
+ # All Rights Reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ # ==============================================================================
17
+
18
+ module CLD3
19
+ module Unstable
20
+ extend FFI::Library
21
+
22
+ ffi_lib File.join(__dir__, "..", "..", "ext", "cld3", "libcld3." + RbConfig::CONFIG["DLEXT"])
23
+
24
+ module NNetLanguageIdentifier
25
+ class Pointer < FFI::AutoPointer
26
+ def self.release(pointer)
27
+ Unstable.delete_NNetLanguageIdentifier(pointer)
28
+ end
29
+ end
30
+
31
+ class SpanInfo < FFI::Struct
32
+ layout :start_index, :int, :end_index, :int, :probability, :float
33
+ end
34
+
35
+ class Result < FFI::Struct
36
+ layout :language_data, :pointer, :language_size, :size_t, :byte_ranges_data, :pointer, :byte_ranges_size, :size_t, :probability, :float, :proportion, :float, :reliable?, :bool
37
+ end
38
+ end
39
+
40
+ attach_function :delete_NNetLanguageIdentifier, [ :pointer ], :void
41
+
42
+ attach_function :delete_result, [ :pointer ], :void
43
+
44
+ attach_function :delete_results, [ :pointer ], :void
45
+
46
+ attach_function :new_NNetLanguageIdentifier, [ :int, :int ], :pointer
47
+
48
+ attach_function :refer_to_nth_result, [ :pointer, :size_t ], NNetLanguageIdentifier::Result.by_value
49
+
50
+ attach_function :NNetLanguageIdentifier_find_language,
51
+ [ :pointer, :buffer_in, :size_t ], :pointer
52
+
53
+ attach_function :NNetLanguageIdentifier_find_top_n_most_freq_langs,
54
+ [ :pointer, :buffer_in, :size_t, :int ], :pointer
55
+ end
56
+
57
+ private_constant :Unstable
58
+ end
data/lib/cld3.rb CHANGED
@@ -19,6 +19,7 @@
19
19
 
20
20
  require "ffi"
21
21
  require "rbconfig"
22
+ require "cld3/unstable"
22
23
 
23
24
  # Module providing an interface for Compact Language Detector v3 (CLD3)
24
25
  module CLD3
@@ -52,6 +53,7 @@ module CLD3
52
53
  # Holds probability that Span, specified by start/end indices, is a given
53
54
  # language. The langauge is not stored here; it can be found in Result, which
54
55
  # holds an Array of SpanInfo.
56
+ # @type const SpanInfo: untyped
55
57
  SpanInfo = Struct.new(:start_index, :end_index, :probability)
56
58
 
57
59
  # Information about a predicted language.
@@ -69,6 +71,7 @@ module CLD3
69
71
  #
70
72
  # [byte_ranges] Specifies the byte ranges in UTF-8 that |language| applies to.
71
73
  # This is an Array of SpanInfo.
74
+ # @type const Result: untyped
72
75
  Result = Struct.new(:language, :probability, :reliable?, :proportion, :byte_ranges)
73
76
 
74
77
  # The arguments are two String objects.
@@ -115,6 +118,8 @@ module CLD3
115
118
  # The second argument is Numeric object.
116
119
  # The returned value of this function is an Array of Result instances.
117
120
  def find_top_n_most_freq_langs(text, num_langs)
121
+ # @type var a: untyped
122
+
118
123
  text_utf8 = text.encode(Encoding::UTF_8)
119
124
  pointer = FFI::MemoryPointer.new(:char, text_utf8.bytesize)
120
125
 
@@ -123,11 +128,13 @@ module CLD3
123
128
 
124
129
  results = Unstable.NNetLanguageIdentifier_find_top_n_most_freq_langs(@cc, pointer, text_utf8.bytesize, num_langs)
125
130
  begin
126
- num_langs.times
131
+ a = num_langs.times
127
132
  .lazy
128
133
  .map { |index| convert_result Unstable.refer_to_nth_result(results, index) }
129
134
  .take_while { |result| !result.nil? }
130
135
  .to_a
136
+
137
+ a
131
138
  ensure
132
139
  Unstable.delete_results results
133
140
  end
@@ -162,6 +169,7 @@ module CLD3
162
169
  # The model weights are loaded statically.
163
170
  module TaskContextParams
164
171
  # This is a frozen Array object containing symbols.
172
+ # @type const LANGUAGE_NAMES: untyped
165
173
  LANGUAGE_NAMES = [
166
174
  :eo, :co, :eu, :ta, :de, :mt, :ps, :te, :su, :uz, :'zh-Latn', :ne,
167
175
  :nl, :sw, :sq, :hmn, :ja, :no, :mn, :so, :ko, :kk, :sl, :ig,
@@ -175,44 +183,4 @@ module CLD3
175
183
  :sn, :yo, :pa, :ku,
176
184
  ].freeze
177
185
  end
178
-
179
- module Unstable
180
- extend FFI::Library
181
-
182
- ffi_lib File.join(__dir__, "..", "ext", "cld3", "libcld3." + RbConfig::CONFIG["DLEXT"])
183
-
184
- module NNetLanguageIdentifier
185
- class Pointer < FFI::AutoPointer
186
- def self.release(pointer)
187
- Unstable.delete_NNetLanguageIdentifier(pointer)
188
- end
189
- end
190
-
191
- class SpanInfo < FFI::Struct
192
- layout :start_index, :int, :end_index, :int, :probability, :float
193
- end
194
-
195
- class Result < FFI::Struct
196
- layout :language_data, :pointer, :language_size, :size_t, :byte_ranges_data, :pointer, :byte_ranges_size, :size_t, :probability, :float, :proportion, :float, :reliable?, :bool
197
- end
198
- end
199
-
200
- attach_function :delete_NNetLanguageIdentifier, [ :pointer ], :void
201
-
202
- attach_function :delete_result, [ :pointer ], :void
203
-
204
- attach_function :delete_results, [ :pointer ], :void
205
-
206
- attach_function :new_NNetLanguageIdentifier, [ :int, :int ], :pointer
207
-
208
- attach_function :refer_to_nth_result, [ :pointer, :size_t ], NNetLanguageIdentifier::Result.by_value
209
-
210
- attach_function :NNetLanguageIdentifier_find_language,
211
- [ :pointer, :buffer_in, :size_t ], :pointer
212
-
213
- attach_function :NNetLanguageIdentifier_find_top_n_most_freq_langs,
214
- [ :pointer, :buffer_in, :size_t, :int ], :pointer
215
- end
216
-
217
- private_constant :Unstable
218
186
  end
data/sig/cld3.rbs ADDED
@@ -0,0 +1,65 @@
1
+ # Copyright 2021 Akihiko Odaki <akihiko.odaki@gmail.com>
2
+ # All Rights Reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ # ==============================================================================
16
+
17
+ module CLD3
18
+ class NNetLanguageIdentifier
19
+ MIN_NUM_BYTES_TO_CONSIDER: Integer
20
+ MAX_NUM_BYTES_TO_CONSIDER: Integer
21
+ MAX_NUM_INPUT_BYTES_TO_CONSIDER: Integer
22
+ RELIABILITY_THRESHOLD: Float
23
+ RELIABILITY_HR_BS_THRESHOLD: Float
24
+
25
+ class SpanInfo < Struct[Float | Integer]
26
+ attr_accessor start_index(): Integer
27
+ attr_accessor end_index(): Integer
28
+ attr_accessor probability(): Float
29
+ end
30
+
31
+ class Result < Struct[Array[SpanInfo] | Float | TaskContextParams::language_names | bool]
32
+ attr_accessor language(): TaskContextParams::language_names
33
+ attr_accessor probability(): Float
34
+ attr_accessor reliable?(): bool
35
+ attr_accessor proportion(): Float
36
+ attr_accessor byte_ranges(): Array[SpanInfo]
37
+ end
38
+
39
+ def initialize: (?Integer, ?Integer) -> void
40
+ def find_language: (String) -> Result?
41
+ def find_top_n_most_freq_langs: (String, Integer) -> Array[Result]
42
+
43
+ private
44
+
45
+ def convert_result: (untyped) -> Result?
46
+ end
47
+
48
+ module TaskContextParams
49
+ type language_names =
50
+ :eo | :co | :eu | :ta | :de | :mt | :ps | :te | :su | :uz | :'zh-Latn' | :ne |
51
+ :nl | :sw | :sq | :hmn | :ja | :no | :mn | :so | :ko | :kk | :sl | :ig |
52
+ :mr | :th | :zu | :ml | :hr | :bs | :lo | :sd | :cy | :hy | :uk | :pt |
53
+ :lv | :iw | :cs | :vi | :jv | :be | :km | :mk | :tr | :fy | :am | :zh |
54
+ :da | :sv | :fi | :ht | :af | :la | :id | :fil | :sm | :ca | :el | :ka |
55
+ :sr | :it | :sk | :ru | :'ru-Latn' | :bg | :ny | :fa | :haw | :gl | :et |
56
+ :ms | :gd | :'bg-Latn' | :ha | :is | :ur | :mi | :hi | :bn | :'hi-Latn' | :fr |
57
+ :yi | :hu | :xh | :my | :tg | :ro | :ar | :lb | :'el-Latn' | :st | :ceb |
58
+ :kn | :az | :si | :ky | :mg | :en | :gu | :es | :pl | :'ja-Latn' | :ga | :lt |
59
+ :sn | :yo | :pa | :ku
60
+
61
+ LANGUAGE_NAMES: Array[language_names]
62
+ end
63
+
64
+ Unstable: untyped
65
+ end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cld3
3
3
  version: !ruby/object:Gem::Version
4
- version: 3.4.2
4
+ version: 3.4.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Akihiko Odaki
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-04-17 00:00:00.000000000 Z
11
+ date: 2021-11-25 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: ffi
@@ -30,6 +30,26 @@ dependencies:
30
30
  - - "<"
31
31
  - !ruby/object:Gem::Version
32
32
  version: 1.16.0
33
+ - !ruby/object:Gem::Dependency
34
+ name: rbs
35
+ requirement: !ruby/object:Gem::Requirement
36
+ requirements:
37
+ - - ">="
38
+ - !ruby/object:Gem::Version
39
+ version: 1.7.0
40
+ - - "<"
41
+ - !ruby/object:Gem::Version
42
+ version: 1.8.0
43
+ type: :development
44
+ prerelease: false
45
+ version_requirements: !ruby/object:Gem::Requirement
46
+ requirements:
47
+ - - ">="
48
+ - !ruby/object:Gem::Version
49
+ version: 1.7.0
50
+ - - "<"
51
+ - !ruby/object:Gem::Version
52
+ version: 1.8.0
33
53
  - !ruby/object:Gem::Dependency
34
54
  name: rspec
35
55
  requirement: !ruby/object:Gem::Requirement
@@ -50,6 +70,26 @@ dependencies:
50
70
  - - "<"
51
71
  - !ruby/object:Gem::Version
52
72
  version: 3.11.0
73
+ - !ruby/object:Gem::Dependency
74
+ name: steep
75
+ requirement: !ruby/object:Gem::Requirement
76
+ requirements:
77
+ - - ">="
78
+ - !ruby/object:Gem::Version
79
+ version: 0.46.0
80
+ - - "<"
81
+ - !ruby/object:Gem::Version
82
+ version: 0.47.0
83
+ type: :development
84
+ prerelease: false
85
+ version_requirements: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: 0.46.0
90
+ - - "<"
91
+ - !ruby/object:Gem::Version
92
+ version: 0.47.0
53
93
  description: Compact Language Detector v3 (CLD3) is a neural network model for language
54
94
  identification.
55
95
  email: akihiko.odaki@gmail.com
@@ -160,6 +200,8 @@ files:
160
200
  - ext/cld3/workspace.h
161
201
  - ext/cld3/workspace.o
162
202
  - lib/cld3.rb
203
+ - lib/cld3/unstable.rb
204
+ - sig/cld3.rbs
163
205
  homepage: https://github.com/akihikodaki/cld3-ruby
164
206
  licenses:
165
207
  - Apache-2.0
@@ -175,14 +217,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
175
217
  version: 2.6.0
176
218
  - - "<"
177
219
  - !ruby/object:Gem::Version
178
- version: 3.1.0
220
+ version: 3.2.0
179
221
  required_rubygems_version: !ruby/object:Gem::Requirement
180
222
  requirements:
181
223
  - - ">="
182
224
  - !ruby/object:Gem::Version
183
225
  version: '0'
184
226
  requirements: []
185
- rubygems_version: 3.1.4
227
+ rubygems_version: 3.2.22
186
228
  signing_key:
187
229
  specification_version: 4
188
230
  summary: Compact Language Detector v3 (CLD3)