cld3 3.4.2 → 3.4.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. checksums.yaml +4 -4
  2. data/cld3.gemspec +5 -3
  3. data/ext/cld3/Makefile +6 -5
  4. data/ext/cld3/base.o +0 -0
  5. data/ext/cld3/embedding_feature_extractor.o +0 -0
  6. data/ext/cld3/embedding_network.cc +1 -0
  7. data/ext/cld3/embedding_network.o +0 -0
  8. data/ext/cld3/feature_extractor.o +0 -0
  9. data/ext/cld3/feature_extractor.pb.o +0 -0
  10. data/ext/cld3/feature_types.o +0 -0
  11. data/ext/cld3/fixunicodevalue.o +0 -0
  12. data/ext/cld3/fml_parser.o +0 -0
  13. data/ext/cld3/generated_entities.o +0 -0
  14. data/ext/cld3/generated_ulscript.o +0 -0
  15. data/ext/cld3/getonescriptspan.h +2 -2
  16. data/ext/cld3/getonescriptspan.o +0 -0
  17. data/ext/cld3/lang_id_nn_params.o +0 -0
  18. data/ext/cld3/language_identifier_features.o +0 -0
  19. data/ext/cld3/libcld3.so +0 -0
  20. data/ext/cld3/nnet_language_identifier.cc +3 -5
  21. data/ext/cld3/nnet_language_identifier.o +0 -0
  22. data/ext/cld3/nnet_language_identifier_c.o +0 -0
  23. data/ext/cld3/offsetmap.o +0 -0
  24. data/ext/cld3/registry.o +0 -0
  25. data/ext/cld3/relevant_script_feature.o +0 -0
  26. data/ext/cld3/sentence.pb.o +0 -0
  27. data/ext/cld3/sentence_features.cc +4 -4
  28. data/ext/cld3/sentence_features.h +13 -3
  29. data/ext/cld3/sentence_features.o +0 -0
  30. data/ext/cld3/task_context.o +0 -0
  31. data/ext/cld3/task_context_params.o +0 -0
  32. data/ext/cld3/task_spec.pb.o +0 -0
  33. data/ext/cld3/text_processing.o +0 -0
  34. data/ext/cld3/unicodetext.o +0 -0
  35. data/ext/cld3/utf8statetable.o +0 -0
  36. data/ext/cld3/utils.o +0 -0
  37. data/ext/cld3/workspace.o +0 -0
  38. data/lib/cld3/unstable.rb +58 -0
  39. data/lib/cld3.rb +9 -41
  40. data/sig/cld3.rbs +65 -0
  41. metadata +46 -4
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: f5b3cc203abda97cb85d5dee0983b7f63c626397b8af8b90e2110bb5fedbbdec
4
- data.tar.gz: 197f66798925404ded7af722d0194a705018d6953b11f4576c4e180ea093675d
3
+ metadata.gz: 2c161cbf12d260074efd2e9db3981b6615af20ee04c234d6b2710bd52a283a4e
4
+ data.tar.gz: c388ae6b529d95e015ecdb7d21cdd7f1ceaca72d167d0f8008b5477d5bce5b3c
5
5
  SHA512:
6
- metadata.gz: 855e8ee464a2842906bfef211e2afb21820fe9a7449b58d91b9ab1908c997966b9dd4c2d5d51f82ceb84b65b5a118736a5aa4eff6ea9548b9a9abc61b297a9d0
7
- data.tar.gz: e38ddfd81489aeb83bccc7b509dd17ea79c56ba641de37cac2d800d3428ed31e5ac57066016bd118e9e71c30c78d31b4c38a266abe012065495558adf07e68f5
6
+ metadata.gz: 8e3c1c07283730e722c450acc308a497756fd501595a02a7fc066d0b3e59b96e1ab1e7941549293b02e41274b176772bdae3779a041eb28f8ae53f5c44308cc0
7
+ data.tar.gz: 52e95027de7a595b2eabc49745a11f664e305c18f9926bc9d649642a92fea9846efdd23da699529795d80609b8871b00e77f9379449d2e4f6cb79ecbcf2785db
data/cld3.gemspec CHANGED
@@ -16,19 +16,21 @@
16
16
 
17
17
  Gem::Specification.new do |gem|
18
18
  gem.name = "cld3"
19
- gem.version = "3.4.2"
19
+ gem.version = "3.4.3"
20
20
  gem.summary = "Compact Language Detector v3 (CLD3)"
21
21
  gem.description = "Compact Language Detector v3 (CLD3) is a neural network model for language identification."
22
22
  gem.license = "Apache-2.0"
23
23
  gem.homepage = "https://github.com/akihikodaki/cld3-ruby"
24
24
  gem.author = "Akihiko Odaki"
25
25
  gem.email = "akihiko.odaki@gmail.com"
26
- gem.required_ruby_version = [ ">= 2.6.0", "< 3.1.0" ]
26
+ gem.required_ruby_version = [ ">= 2.6.0", "< 3.2.0" ]
27
27
  gem.add_dependency "ffi", [ ">= 1.1.0", "< 1.16.0" ]
28
+ gem.add_development_dependency "rbs", [ ">= 1.7.0", "< 1.8.0" ]
28
29
  gem.add_development_dependency "rspec", [ ">=3.0.0", "< 3.11.0" ]
30
+ gem.add_development_dependency "steep", [ ">= 0.46.0", "< 0.47.0" ]
29
31
  gem.files = Dir[
30
32
  "Gemfile", "LICENSE", "LICENSE_CLD3", "README.md",
31
- "cld3.gemspec", "ext/**/*", "lib/**/*"
33
+ "cld3.gemspec", "ext/**/*", "lib/**/*", "sig/**/*"
32
34
  ]
33
35
  gem.require_paths = [ "lib" ]
34
36
  gem.extensions = [ "ext/cld3/extconf.rb" ]
data/ext/cld3/Makefile CHANGED
@@ -53,6 +53,7 @@ infodir = $(DESTDIR)/usr/share/info
53
53
  docdir = $(datarootdir)/doc/$(PACKAGE)
54
54
  oldincludedir = $(DESTDIR)/usr/include
55
55
  includedir = $(DESTDIR)/usr/include
56
+ runstatedir = $(localstatedir)/run
56
57
  localstatedir = $(DESTDIR)/var
57
58
  sharedstatedir = $(DESTDIR)/var/lib
58
59
  sysconfdir = $(DESTDIR)/etc
@@ -95,7 +96,7 @@ ARCH_FLAG =
95
96
  DLDFLAGS = $(ldflags) $(dldflags) $(ARCH_FLAG)
96
97
  LDSHARED = $(CC) -shared
97
98
  LDSHAREDXX = $(CXX) -shared
98
- AR = ar
99
+ AR = gcc-ar
99
100
  EXEEXT =
100
101
 
101
102
  RUBY_INSTALL_NAME = $(RUBY_BASE_NAME)
@@ -107,7 +108,7 @@ RUBY_BASE_NAME = ruby
107
108
 
108
109
  arch = aarch64-linux
109
110
  sitearch = $(arch)
110
- ruby_version = 2.7.0
111
+ ruby_version = 3.0.0
111
112
  ruby = $(bindir)/$(RUBY_BASE_NAME)
112
113
  RUBY = $(ruby)
113
114
  ruby_headers = $(hdrdir)/ruby.h $(hdrdir)/ruby/backward.h $(hdrdir)/ruby/ruby.h $(hdrdir)/ruby/defines.h $(hdrdir)/ruby/missing.h $(hdrdir)/ruby/intern.h $(hdrdir)/ruby/st.h $(hdrdir)/ruby/subst.h $(arch_hdrdir)/ruby/config.h
@@ -141,7 +142,7 @@ LIBS = -lprotobuf -lpthread -lm -lc
141
142
  ORIG_SRCS = base.cc embedding_feature_extractor.cc embedding_network.cc feature_extractor.cc feature_extractor.pb.cc feature_types.cc fixunicodevalue.cc fml_parser.cc generated_entities.cc generated_ulscript.cc getonescriptspan.cc lang_id_nn_params.cc language_identifier_features.cc nnet_language_identifier.cc nnet_language_identifier_c.cc offsetmap.cc registry.cc relevant_script_feature.cc sentence.pb.cc sentence_features.cc task_context.cc task_context_params.cc task_spec.pb.cc text_processing.cc unicodetext.cc utf8statetable.cc utils.cc workspace.cc
142
143
  SRCS = $(ORIG_SRCS)
143
144
  OBJS = base.o embedding_feature_extractor.o embedding_network.o feature_extractor.o feature_extractor.pb.o feature_types.o fixunicodevalue.o fml_parser.o generated_entities.o generated_ulscript.o getonescriptspan.o lang_id_nn_params.o language_identifier_features.o nnet_language_identifier.o nnet_language_identifier_c.o offsetmap.o registry.o relevant_script_feature.o sentence.pb.o sentence_features.o task_context.o task_context_params.o task_spec.pb.o text_processing.o unicodetext.o utf8statetable.o utils.o workspace.o
144
- HDRS = $(srcdir)/base.h $(srcdir)/casts.h $(srcdir)/embedding_feature_extractor.h $(srcdir)/embedding_network.h $(srcdir)/embedding_network_params.h $(srcdir)/feature_extractor.h $(srcdir)/feature_types.h $(srcdir)/float16.h $(srcdir)/fml_parser.h $(srcdir)/language_identifier_features.h $(srcdir)/lang_id_nn_params.h $(srcdir)/nnet_language_identifier.h $(srcdir)/registry.h $(srcdir)/relevant_script_feature.h $(srcdir)/script_detector.h $(srcdir)/sentence_features.h $(srcdir)/simple_adder.h $(srcdir)/fixunicodevalue.h $(srcdir)/generated_ulscript.h $(srcdir)/getonescriptspan.h $(srcdir)/integral_types.h $(srcdir)/offsetmap.h $(srcdir)/port.h $(srcdir)/stringpiece.h $(srcdir)/text_processing.h $(srcdir)/utf8acceptinterchange.h $(srcdir)/utf8prop_lettermarkscriptnum.h $(srcdir)/utf8repl_lettermarklower.h $(srcdir)/utf8scannot_lettermarkspecial.h $(srcdir)/utf8statetable.h $(srcdir)/task_context.h $(srcdir)/task_context_params.h $(srcdir)/unicodetext.h $(srcdir)/utils.h $(srcdir)/workspace.h $(srcdir)/feature_extractor.pb.h $(srcdir)/sentence.pb.h $(srcdir)/task_spec.pb.h
145
+ HDRS = $(srcdir)/base.h $(srcdir)/casts.h $(srcdir)/embedding_feature_extractor.h $(srcdir)/embedding_network.h $(srcdir)/embedding_network_params.h $(srcdir)/feature_extractor.h $(srcdir)/feature_extractor.pb.h $(srcdir)/feature_types.h $(srcdir)/fixunicodevalue.h $(srcdir)/float16.h $(srcdir)/fml_parser.h $(srcdir)/generated_ulscript.h $(srcdir)/getonescriptspan.h $(srcdir)/integral_types.h $(srcdir)/lang_id_nn_params.h $(srcdir)/language_identifier_features.h $(srcdir)/nnet_language_identifier.h $(srcdir)/offsetmap.h $(srcdir)/port.h $(srcdir)/registry.h $(srcdir)/relevant_script_feature.h $(srcdir)/script_detector.h $(srcdir)/sentence.pb.h $(srcdir)/sentence_features.h $(srcdir)/simple_adder.h $(srcdir)/stringpiece.h $(srcdir)/task_context.h $(srcdir)/task_context_params.h $(srcdir)/task_spec.pb.h $(srcdir)/text_processing.h $(srcdir)/unicodetext.h $(srcdir)/utf8acceptinterchange.h $(srcdir)/utf8prop_lettermarkscriptnum.h $(srcdir)/utf8repl_lettermarklower.h $(srcdir)/utf8scannot_lettermarkspecial.h $(srcdir)/utf8statetable.h $(srcdir)/utils.h $(srcdir)/workspace.h
145
146
  LOCAL_HDRS =
146
147
  TARGET = libcld3
147
148
  TARGET_NAME = libcld3
@@ -155,8 +156,8 @@ BINDIR = $(bindir)
155
156
  RUBYCOMMONDIR = $(sitedir)$(target_prefix)
156
157
  RUBYLIBDIR = $(sitelibdir)$(target_prefix)
157
158
  RUBYARCHDIR = $(sitearchdir)$(target_prefix)
158
- HDRDIR = $(rubyhdrdir)/ruby$(target_prefix)
159
- ARCHHDRDIR = $(rubyhdrdir)/$(arch)/ruby$(target_prefix)
159
+ HDRDIR = $(sitehdrdir)$(target_prefix)
160
+ ARCHHDRDIR = $(sitearchhdrdir)$(target_prefix)
160
161
  TARGET_SO_DIR =
161
162
  TARGET_SO = $(TARGET_SO_DIR)$(DLLIB)
162
163
  CLEANLIBS = $(TARGET_SO)
data/ext/cld3/base.o CHANGED
Binary file
Binary file
@@ -167,6 +167,7 @@ EmbeddingNetwork::EmbeddingNetwork(const EmbeddingNetworkParams *model)
167
167
  for (int i = 0; i < model_->embedding_dim_size(); ++i) {
168
168
  CLD3_DCHECK(offset_sum == model_->concat_offset(i));
169
169
  offset_sum += model_->embedding_dim(i) * model_->embedding_num_features(i);
170
+ (void)offset_sum; // Avoid compiler warning for "unused" variable.
170
171
  embedding_matrices_.emplace_back(model_->GetEmbeddingMatrix(i));
171
172
  }
172
173
 
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
@@ -33,14 +33,14 @@ static const int kMaxScriptBytes = kMaxScriptBuffer - 32; // Leave some room
33
33
  static const int kWithinScriptTail = 32; // Stop at word space in last
34
34
  // N bytes of script buffer
35
35
 
36
- typedef struct {
36
+ struct LangSpan {
37
37
  char* text = nullptr; // Pointer to the span, somewhere
38
38
  int text_bytes = 0; // Number of bytes of text in the span
39
39
  int offset = 0; // Offset of start of span in original input buffer
40
40
  ULScript ulscript = UNKNOWN_ULSCRIPT; // Unicode Letters Script of this span
41
41
  bool truncated = false; // true if buffer filled up before a
42
42
  // different script or EOF was found
43
- } LangSpan;
43
+ };
44
44
 
45
45
  static inline bool IsContinuationByte(char c) {
46
46
  return static_cast<signed char>(c) < -64;
Binary file
Binary file
data/ext/cld3/libcld3.so CHANGED
Binary file
@@ -284,8 +284,6 @@ NNetLanguageIdentifier::FindTopNMostFreqLangs(const string &text,
284
284
  CLD2::LangSpan script_span;
285
285
  std::unordered_map<string, LangChunksStats> lang_stats;
286
286
  int total_num_bytes = 0;
287
- Result result;
288
- string language;
289
287
  int chunk_size = 0; // Use the default.
290
288
  while (ss.GetOneScriptSpanLower(&script_span)) {
291
289
  const int num_original_span_bytes = script_span.text_bytes;
@@ -302,8 +300,8 @@ NNetLanguageIdentifier::FindTopNMostFreqLangs(const string &text,
302
300
 
303
301
  const string selected_text = SelectTextGivenScriptSpan(script_span);
304
302
 
305
- result = FindLanguageOfValidUTF8(selected_text);
306
- language = result.language;
303
+ Result result = FindLanguageOfValidUTF8(selected_text);
304
+ string language = result.language;
307
305
  lang_stats[language].byte_sum += num_original_span_bytes;
308
306
  lang_stats[language].prob_sum +=
309
307
  result.probability * num_original_span_bytes;
@@ -356,7 +354,7 @@ string NNetLanguageIdentifier::SelectTextGivenBeginAndSize(
356
354
  const char *text_begin, int text_size) {
357
355
  string output_text;
358
356
 
359
- // If the size of the input is greater than the maxium number of bytes needed
357
+ // If the size of the input is greater than the maximum number of bytes needed
360
358
  // for a prediction, then concatenate snippets that are equally spread out
361
359
  // throughout the input.
362
360
  if (text_size > max_num_bytes_) {
Binary file
Binary file
data/ext/cld3/offsetmap.o CHANGED
Binary file
data/ext/cld3/registry.o CHANGED
Binary file
Binary file
Binary file
@@ -19,11 +19,11 @@ limitations under the License.
19
19
 
20
20
  namespace chrome_lang_id {
21
21
 
22
- // Declare registry for the whole Sentence feature functions. NOTE: this is not
22
+ // Define registry for the whole Sentence feature functions. NOTE: this is not
23
23
  // yet set to anything meaningful. It will be set so in NNetLanguageIdentifier
24
24
  // constructor, *before* we use any feature.
25
25
  template <>
26
- WholeSentenceFeature::Registry
27
- *RegisterableClass<WholeSentenceFeature>::registry_ = nullptr;
26
+ WholeSentenceFeature::Registry*
27
+ RegisterableClass<WholeSentenceFeature>::registry_ = nullptr;
28
28
 
29
- } // namespace chrome_lang_id
29
+ } // namespace chrome_lang_id
@@ -26,9 +26,19 @@ limitations under the License.
26
26
  namespace chrome_lang_id {
27
27
 
28
28
  // Feature function that extracts features for the full Sentence.
29
- typedef FeatureFunction<Sentence> WholeSentenceFeature;
30
-
31
- typedef FeatureExtractor<Sentence> WholeSentenceExtractor;
29
+ using WholeSentenceFeature = FeatureFunction<Sentence>;
30
+
31
+ using WholeSentenceExtractor = FeatureExtractor<Sentence>;
32
+
33
+ // Declare registry for the whole Sentence feature functions. This is required
34
+ // for clang's -Wundefined-var-template. However, MSVC has a bug which treats
35
+ // this declaration as a definition, leading to multiple definition errors, so
36
+ // omit this on MSVC.
37
+ #if !defined(COMPILER_MSVC)
38
+ template <>
39
+ WholeSentenceFeature::Registry
40
+ *RegisterableClass<WholeSentenceFeature>::registry_;
41
+ #endif
32
42
 
33
43
  } // namespace chrome_lang_id
34
44
 
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
data/ext/cld3/utils.o CHANGED
Binary file
data/ext/cld3/workspace.o CHANGED
Binary file
@@ -0,0 +1,58 @@
1
+
2
+ # Copyright 2021 Akihiko Odaki <akihiko.odaki@gmail.com>
3
+ # All Rights Reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ # ==============================================================================
17
+
18
+ module CLD3
19
+ module Unstable
20
+ extend FFI::Library
21
+
22
+ ffi_lib File.join(__dir__, "..", "..", "ext", "cld3", "libcld3." + RbConfig::CONFIG["DLEXT"])
23
+
24
+ module NNetLanguageIdentifier
25
+ class Pointer < FFI::AutoPointer
26
+ def self.release(pointer)
27
+ Unstable.delete_NNetLanguageIdentifier(pointer)
28
+ end
29
+ end
30
+
31
+ class SpanInfo < FFI::Struct
32
+ layout :start_index, :int, :end_index, :int, :probability, :float
33
+ end
34
+
35
+ class Result < FFI::Struct
36
+ layout :language_data, :pointer, :language_size, :size_t, :byte_ranges_data, :pointer, :byte_ranges_size, :size_t, :probability, :float, :proportion, :float, :reliable?, :bool
37
+ end
38
+ end
39
+
40
+ attach_function :delete_NNetLanguageIdentifier, [ :pointer ], :void
41
+
42
+ attach_function :delete_result, [ :pointer ], :void
43
+
44
+ attach_function :delete_results, [ :pointer ], :void
45
+
46
+ attach_function :new_NNetLanguageIdentifier, [ :int, :int ], :pointer
47
+
48
+ attach_function :refer_to_nth_result, [ :pointer, :size_t ], NNetLanguageIdentifier::Result.by_value
49
+
50
+ attach_function :NNetLanguageIdentifier_find_language,
51
+ [ :pointer, :buffer_in, :size_t ], :pointer
52
+
53
+ attach_function :NNetLanguageIdentifier_find_top_n_most_freq_langs,
54
+ [ :pointer, :buffer_in, :size_t, :int ], :pointer
55
+ end
56
+
57
+ private_constant :Unstable
58
+ end
data/lib/cld3.rb CHANGED
@@ -19,6 +19,7 @@
19
19
 
20
20
  require "ffi"
21
21
  require "rbconfig"
22
+ require "cld3/unstable"
22
23
 
23
24
  # Module providing an interface for Compact Language Detector v3 (CLD3)
24
25
  module CLD3
@@ -52,6 +53,7 @@ module CLD3
52
53
  # Holds probability that Span, specified by start/end indices, is a given
53
54
  # language. The language is not stored here; it can be found in Result, which
54
55
  # holds an Array of SpanInfo.
56
+ # @type const SpanInfo: untyped
55
57
  SpanInfo = Struct.new(:start_index, :end_index, :probability)
56
58
 
57
59
  # Information about a predicted language.
@@ -69,6 +71,7 @@ module CLD3
69
71
  #
70
72
  # [byte_ranges] Specifies the byte ranges in UTF-8 that |language| applies to.
71
73
  # This is an Array of SpanInfo.
74
+ # @type const Result: untyped
72
75
  Result = Struct.new(:language, :probability, :reliable?, :proportion, :byte_ranges)
73
76
 
74
77
  # The arguments are two String objects.
@@ -115,6 +118,8 @@ module CLD3
115
118
  # The second argument is Numeric object.
116
119
  # The returned value of this function is an Array of Result instances.
117
120
  def find_top_n_most_freq_langs(text, num_langs)
121
+ # @type var a: untyped
122
+
118
123
  text_utf8 = text.encode(Encoding::UTF_8)
119
124
  pointer = FFI::MemoryPointer.new(:char, text_utf8.bytesize)
120
125
 
@@ -123,11 +128,13 @@ module CLD3
123
128
 
124
129
  results = Unstable.NNetLanguageIdentifier_find_top_n_most_freq_langs(@cc, pointer, text_utf8.bytesize, num_langs)
125
130
  begin
126
- num_langs.times
131
+ a = num_langs.times
127
132
  .lazy
128
133
  .map { |index| convert_result Unstable.refer_to_nth_result(results, index) }
129
134
  .take_while { |result| !result.nil? }
130
135
  .to_a
136
+
137
+ a
131
138
  ensure
132
139
  Unstable.delete_results results
133
140
  end
@@ -162,6 +169,7 @@ module CLD3
162
169
  # The model weights are loaded statically.
163
170
  module TaskContextParams
164
171
  # This is a frozen Array object containing symbols.
172
+ # @type const LANGUAGE_NAMES: untyped
165
173
  LANGUAGE_NAMES = [
166
174
  :eo, :co, :eu, :ta, :de, :mt, :ps, :te, :su, :uz, :'zh-Latn', :ne,
167
175
  :nl, :sw, :sq, :hmn, :ja, :no, :mn, :so, :ko, :kk, :sl, :ig,
@@ -175,44 +183,4 @@ module CLD3
175
183
  :sn, :yo, :pa, :ku,
176
184
  ].freeze
177
185
  end
178
-
179
- module Unstable
180
- extend FFI::Library
181
-
182
- ffi_lib File.join(__dir__, "..", "ext", "cld3", "libcld3." + RbConfig::CONFIG["DLEXT"])
183
-
184
- module NNetLanguageIdentifier
185
- class Pointer < FFI::AutoPointer
186
- def self.release(pointer)
187
- Unstable.delete_NNetLanguageIdentifier(pointer)
188
- end
189
- end
190
-
191
- class SpanInfo < FFI::Struct
192
- layout :start_index, :int, :end_index, :int, :probability, :float
193
- end
194
-
195
- class Result < FFI::Struct
196
- layout :language_data, :pointer, :language_size, :size_t, :byte_ranges_data, :pointer, :byte_ranges_size, :size_t, :probability, :float, :proportion, :float, :reliable?, :bool
197
- end
198
- end
199
-
200
- attach_function :delete_NNetLanguageIdentifier, [ :pointer ], :void
201
-
202
- attach_function :delete_result, [ :pointer ], :void
203
-
204
- attach_function :delete_results, [ :pointer ], :void
205
-
206
- attach_function :new_NNetLanguageIdentifier, [ :int, :int ], :pointer
207
-
208
- attach_function :refer_to_nth_result, [ :pointer, :size_t ], NNetLanguageIdentifier::Result.by_value
209
-
210
- attach_function :NNetLanguageIdentifier_find_language,
211
- [ :pointer, :buffer_in, :size_t ], :pointer
212
-
213
- attach_function :NNetLanguageIdentifier_find_top_n_most_freq_langs,
214
- [ :pointer, :buffer_in, :size_t, :int ], :pointer
215
- end
216
-
217
- private_constant :Unstable
218
186
  end
data/sig/cld3.rbs ADDED
@@ -0,0 +1,65 @@
1
+ # Copyright 2021 Akihiko Odaki <akihiko.odaki@gmail.com>
2
+ # All Rights Reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ # ==============================================================================
16
+
17
+ module CLD3
18
+ class NNetLanguageIdentifier
19
+ MIN_NUM_BYTES_TO_CONSIDER: Integer
20
+ MAX_NUM_BYTES_TO_CONSIDER: Integer
21
+ MAX_NUM_INPUT_BYTES_TO_CONSIDER: Integer
22
+ RELIABILITY_THRESHOLD: Float
23
+ RELIABILITY_HR_BS_THRESHOLD: Float
24
+
25
+ class SpanInfo < Struct[Float | Integer]
26
+ attr_accessor start_index(): Integer
27
+ attr_accessor end_index(): Integer
28
+ attr_accessor probability(): Float
29
+ end
30
+
31
+ class Result < Struct[Array[SpanInfo] | Float | TaskContextParams::language_names | bool]
32
+ attr_accessor language(): TaskContextParams::language_names
33
+ attr_accessor probability(): Float
34
+ attr_accessor reliable?(): bool
35
+ attr_accessor proportion(): Float
36
+ attr_accessor byte_ranges(): Array[SpanInfo]
37
+ end
38
+
39
+ def initialize: (?Integer, ?Integer) -> void
40
+ def find_language: (String) -> Result?
41
+ def find_top_n_most_freq_langs: (String, Integer) -> Array[Result]
42
+
43
+ private
44
+
45
+ def convert_result: (untyped) -> Result?
46
+ end
47
+
48
+ module TaskContextParams
49
+ type language_names =
50
+ :eo | :co | :eu | :ta | :de | :mt | :ps | :te | :su | :uz | :'zh-Latn' | :ne |
51
+ :nl | :sw | :sq | :hmn | :ja | :no | :mn | :so | :ko | :kk | :sl | :ig |
52
+ :mr | :th | :zu | :ml | :hr | :bs | :lo | :sd | :cy | :hy | :uk | :pt |
53
+ :lv | :iw | :cs | :vi | :jv | :be | :km | :mk | :tr | :fy | :am | :zh |
54
+ :da | :sv | :fi | :ht | :af | :la | :id | :fil | :sm | :ca | :el | :ka |
55
+ :sr | :it | :sk | :ru | :'ru-Latn' | :bg | :ny | :fa | :haw | :gl | :et |
56
+ :ms | :gd | :'bg-Latn' | :ha | :is | :ur | :mi | :hi | :bn | :'hi-Latn' | :fr |
57
+ :yi | :hu | :xh | :my | :tg | :ro | :ar | :lb | :'el-Latn' | :st | :ceb |
58
+ :kn | :az | :si | :ky | :mg | :en | :gu | :es | :pl | :'ja-Latn' | :ga | :lt |
59
+ :sn | :yo | :pa | :ku
60
+
61
+ LANGUAGE_NAMES: Array[language_names]
62
+ end
63
+
64
+ Unstable: untyped
65
+ end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cld3
3
3
  version: !ruby/object:Gem::Version
4
- version: 3.4.2
4
+ version: 3.4.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Akihiko Odaki
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-04-17 00:00:00.000000000 Z
11
+ date: 2021-11-25 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: ffi
@@ -30,6 +30,26 @@ dependencies:
30
30
  - - "<"
31
31
  - !ruby/object:Gem::Version
32
32
  version: 1.16.0
33
+ - !ruby/object:Gem::Dependency
34
+ name: rbs
35
+ requirement: !ruby/object:Gem::Requirement
36
+ requirements:
37
+ - - ">="
38
+ - !ruby/object:Gem::Version
39
+ version: 1.7.0
40
+ - - "<"
41
+ - !ruby/object:Gem::Version
42
+ version: 1.8.0
43
+ type: :development
44
+ prerelease: false
45
+ version_requirements: !ruby/object:Gem::Requirement
46
+ requirements:
47
+ - - ">="
48
+ - !ruby/object:Gem::Version
49
+ version: 1.7.0
50
+ - - "<"
51
+ - !ruby/object:Gem::Version
52
+ version: 1.8.0
33
53
  - !ruby/object:Gem::Dependency
34
54
  name: rspec
35
55
  requirement: !ruby/object:Gem::Requirement
@@ -50,6 +70,26 @@ dependencies:
50
70
  - - "<"
51
71
  - !ruby/object:Gem::Version
52
72
  version: 3.11.0
73
+ - !ruby/object:Gem::Dependency
74
+ name: steep
75
+ requirement: !ruby/object:Gem::Requirement
76
+ requirements:
77
+ - - ">="
78
+ - !ruby/object:Gem::Version
79
+ version: 0.46.0
80
+ - - "<"
81
+ - !ruby/object:Gem::Version
82
+ version: 0.47.0
83
+ type: :development
84
+ prerelease: false
85
+ version_requirements: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: 0.46.0
90
+ - - "<"
91
+ - !ruby/object:Gem::Version
92
+ version: 0.47.0
53
93
  description: Compact Language Detector v3 (CLD3) is a neural network model for language
54
94
  identification.
55
95
  email: akihiko.odaki@gmail.com
@@ -160,6 +200,8 @@ files:
160
200
  - ext/cld3/workspace.h
161
201
  - ext/cld3/workspace.o
162
202
  - lib/cld3.rb
203
+ - lib/cld3/unstable.rb
204
+ - sig/cld3.rbs
163
205
  homepage: https://github.com/akihikodaki/cld3-ruby
164
206
  licenses:
165
207
  - Apache-2.0
@@ -175,14 +217,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
175
217
  version: 2.6.0
176
218
  - - "<"
177
219
  - !ruby/object:Gem::Version
178
- version: 3.1.0
220
+ version: 3.2.0
179
221
  required_rubygems_version: !ruby/object:Gem::Requirement
180
222
  requirements:
181
223
  - - ">="
182
224
  - !ruby/object:Gem::Version
183
225
  version: '0'
184
226
  requirements: []
185
- rubygems_version: 3.1.4
227
+ rubygems_version: 3.2.22
186
228
  signing_key:
187
229
  specification_version: 4
188
230
  summary: Compact Language Detector v3 (CLD3)