cld3 3.4.2 → 3.4.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/cld3.gemspec +5 -3
- data/ext/cld3/Makefile +6 -5
- data/ext/cld3/base.o +0 -0
- data/ext/cld3/embedding_feature_extractor.o +0 -0
- data/ext/cld3/embedding_network.cc +1 -0
- data/ext/cld3/embedding_network.o +0 -0
- data/ext/cld3/feature_extractor.o +0 -0
- data/ext/cld3/feature_extractor.pb.o +0 -0
- data/ext/cld3/feature_types.o +0 -0
- data/ext/cld3/fixunicodevalue.o +0 -0
- data/ext/cld3/fml_parser.o +0 -0
- data/ext/cld3/generated_entities.o +0 -0
- data/ext/cld3/generated_ulscript.o +0 -0
- data/ext/cld3/getonescriptspan.h +2 -2
- data/ext/cld3/getonescriptspan.o +0 -0
- data/ext/cld3/lang_id_nn_params.o +0 -0
- data/ext/cld3/language_identifier_features.o +0 -0
- data/ext/cld3/libcld3.so +0 -0
- data/ext/cld3/nnet_language_identifier.cc +3 -5
- data/ext/cld3/nnet_language_identifier.o +0 -0
- data/ext/cld3/nnet_language_identifier_c.o +0 -0
- data/ext/cld3/offsetmap.o +0 -0
- data/ext/cld3/registry.o +0 -0
- data/ext/cld3/relevant_script_feature.o +0 -0
- data/ext/cld3/sentence.pb.o +0 -0
- data/ext/cld3/sentence_features.cc +4 -4
- data/ext/cld3/sentence_features.h +13 -3
- data/ext/cld3/sentence_features.o +0 -0
- data/ext/cld3/task_context.o +0 -0
- data/ext/cld3/task_context_params.o +0 -0
- data/ext/cld3/task_spec.pb.o +0 -0
- data/ext/cld3/text_processing.o +0 -0
- data/ext/cld3/unicodetext.o +0 -0
- data/ext/cld3/utf8statetable.o +0 -0
- data/ext/cld3/utils.o +0 -0
- data/ext/cld3/workspace.o +0 -0
- data/lib/cld3/unstable.rb +58 -0
- data/lib/cld3.rb +9 -41
- data/sig/cld3.rbs +65 -0
- metadata +46 -4
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 2c161cbf12d260074efd2e9db3981b6615af20ee04c234d6b2710bd52a283a4e
|
|
4
|
+
data.tar.gz: c388ae6b529d95e015ecdb7d21cdd7f1ceaca72d167d0f8008b5477d5bce5b3c
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 8e3c1c07283730e722c450acc308a497756fd501595a02a7fc066d0b3e59b96e1ab1e7941549293b02e41274b176772bdae3779a041eb28f8ae53f5c44308cc0
|
|
7
|
+
data.tar.gz: 52e95027de7a595b2eabc49745a11f664e305c18f9926bc9d649642a92fea9846efdd23da699529795d80609b8871b00e77f9379449d2e4f6cb79ecbcf2785db
|
data/cld3.gemspec
CHANGED
|
@@ -16,19 +16,21 @@
|
|
|
16
16
|
|
|
17
17
|
Gem::Specification.new do |gem|
|
|
18
18
|
gem.name = "cld3"
|
|
19
|
-
gem.version = "3.4.2"
|
|
19
|
+
gem.version = "3.4.3"
|
|
20
20
|
gem.summary = "Compact Language Detector v3 (CLD3)"
|
|
21
21
|
gem.description = "Compact Language Detector v3 (CLD3) is a neural network model for language identification."
|
|
22
22
|
gem.license = "Apache-2.0"
|
|
23
23
|
gem.homepage = "https://github.com/akihikodaki/cld3-ruby"
|
|
24
24
|
gem.author = "Akihiko Odaki"
|
|
25
25
|
gem.email = "akihiko.odaki@gmail.com"
|
|
26
|
-
gem.required_ruby_version = [ ">= 2.6.0", "< 3.
|
|
26
|
+
gem.required_ruby_version = [ ">= 2.6.0", "< 3.2.0" ]
|
|
27
27
|
gem.add_dependency "ffi", [ ">= 1.1.0", "< 1.16.0" ]
|
|
28
|
+
gem.add_development_dependency "rbs", [ ">= 1.7.0", "< 1.8.0" ]
|
|
28
29
|
gem.add_development_dependency "rspec", [ ">=3.0.0", "< 3.11.0" ]
|
|
30
|
+
gem.add_development_dependency "steep", [ ">= 0.46.0", "< 0.47.0" ]
|
|
29
31
|
gem.files = Dir[
|
|
30
32
|
"Gemfile", "LICENSE", "LICENSE_CLD3", "README.md",
|
|
31
|
-
"cld3.gemspec", "ext/**/*", "lib/**/*"
|
|
33
|
+
"cld3.gemspec", "ext/**/*", "lib/**/*", "sig/**/*"
|
|
32
34
|
]
|
|
33
35
|
gem.require_paths = [ "lib" ]
|
|
34
36
|
gem.extensions = [ "ext/cld3/extconf.rb" ]
|
data/ext/cld3/Makefile
CHANGED
|
@@ -53,6 +53,7 @@ infodir = $(DESTDIR)/usr/share/info
|
|
|
53
53
|
docdir = $(datarootdir)/doc/$(PACKAGE)
|
|
54
54
|
oldincludedir = $(DESTDIR)/usr/include
|
|
55
55
|
includedir = $(DESTDIR)/usr/include
|
|
56
|
+
runstatedir = $(localstatedir)/run
|
|
56
57
|
localstatedir = $(DESTDIR)/var
|
|
57
58
|
sharedstatedir = $(DESTDIR)/var/lib
|
|
58
59
|
sysconfdir = $(DESTDIR)/etc
|
|
@@ -95,7 +96,7 @@ ARCH_FLAG =
|
|
|
95
96
|
DLDFLAGS = $(ldflags) $(dldflags) $(ARCH_FLAG)
|
|
96
97
|
LDSHARED = $(CC) -shared
|
|
97
98
|
LDSHAREDXX = $(CXX) -shared
|
|
98
|
-
AR = ar
|
|
99
|
+
AR = gcc-ar
|
|
99
100
|
EXEEXT =
|
|
100
101
|
|
|
101
102
|
RUBY_INSTALL_NAME = $(RUBY_BASE_NAME)
|
|
@@ -107,7 +108,7 @@ RUBY_BASE_NAME = ruby
|
|
|
107
108
|
|
|
108
109
|
arch = aarch64-linux
|
|
109
110
|
sitearch = $(arch)
|
|
110
|
-
ruby_version =
|
|
111
|
+
ruby_version = 3.0.0
|
|
111
112
|
ruby = $(bindir)/$(RUBY_BASE_NAME)
|
|
112
113
|
RUBY = $(ruby)
|
|
113
114
|
ruby_headers = $(hdrdir)/ruby.h $(hdrdir)/ruby/backward.h $(hdrdir)/ruby/ruby.h $(hdrdir)/ruby/defines.h $(hdrdir)/ruby/missing.h $(hdrdir)/ruby/intern.h $(hdrdir)/ruby/st.h $(hdrdir)/ruby/subst.h $(arch_hdrdir)/ruby/config.h
|
|
@@ -141,7 +142,7 @@ LIBS = -lprotobuf -lpthread -lm -lc
|
|
|
141
142
|
ORIG_SRCS = base.cc embedding_feature_extractor.cc embedding_network.cc feature_extractor.cc feature_extractor.pb.cc feature_types.cc fixunicodevalue.cc fml_parser.cc generated_entities.cc generated_ulscript.cc getonescriptspan.cc lang_id_nn_params.cc language_identifier_features.cc nnet_language_identifier.cc nnet_language_identifier_c.cc offsetmap.cc registry.cc relevant_script_feature.cc sentence.pb.cc sentence_features.cc task_context.cc task_context_params.cc task_spec.pb.cc text_processing.cc unicodetext.cc utf8statetable.cc utils.cc workspace.cc
|
|
142
143
|
SRCS = $(ORIG_SRCS)
|
|
143
144
|
OBJS = base.o embedding_feature_extractor.o embedding_network.o feature_extractor.o feature_extractor.pb.o feature_types.o fixunicodevalue.o fml_parser.o generated_entities.o generated_ulscript.o getonescriptspan.o lang_id_nn_params.o language_identifier_features.o nnet_language_identifier.o nnet_language_identifier_c.o offsetmap.o registry.o relevant_script_feature.o sentence.pb.o sentence_features.o task_context.o task_context_params.o task_spec.pb.o text_processing.o unicodetext.o utf8statetable.o utils.o workspace.o
|
|
144
|
-
HDRS = $(srcdir)/base.h $(srcdir)/casts.h $(srcdir)/embedding_feature_extractor.h $(srcdir)/embedding_network.h $(srcdir)/embedding_network_params.h $(srcdir)/feature_extractor.h $(srcdir)/
|
|
145
|
+
HDRS = $(srcdir)/base.h $(srcdir)/casts.h $(srcdir)/embedding_feature_extractor.h $(srcdir)/embedding_network.h $(srcdir)/embedding_network_params.h $(srcdir)/feature_extractor.h $(srcdir)/feature_extractor.pb.h $(srcdir)/feature_types.h $(srcdir)/fixunicodevalue.h $(srcdir)/float16.h $(srcdir)/fml_parser.h $(srcdir)/generated_ulscript.h $(srcdir)/getonescriptspan.h $(srcdir)/integral_types.h $(srcdir)/lang_id_nn_params.h $(srcdir)/language_identifier_features.h $(srcdir)/nnet_language_identifier.h $(srcdir)/offsetmap.h $(srcdir)/port.h $(srcdir)/registry.h $(srcdir)/relevant_script_feature.h $(srcdir)/script_detector.h $(srcdir)/sentence.pb.h $(srcdir)/sentence_features.h $(srcdir)/simple_adder.h $(srcdir)/stringpiece.h $(srcdir)/task_context.h $(srcdir)/task_context_params.h $(srcdir)/task_spec.pb.h $(srcdir)/text_processing.h $(srcdir)/unicodetext.h $(srcdir)/utf8acceptinterchange.h $(srcdir)/utf8prop_lettermarkscriptnum.h $(srcdir)/utf8repl_lettermarklower.h $(srcdir)/utf8scannot_lettermarkspecial.h $(srcdir)/utf8statetable.h $(srcdir)/utils.h $(srcdir)/workspace.h
|
|
145
146
|
LOCAL_HDRS =
|
|
146
147
|
TARGET = libcld3
|
|
147
148
|
TARGET_NAME = libcld3
|
|
@@ -155,8 +156,8 @@ BINDIR = $(bindir)
|
|
|
155
156
|
RUBYCOMMONDIR = $(sitedir)$(target_prefix)
|
|
156
157
|
RUBYLIBDIR = $(sitelibdir)$(target_prefix)
|
|
157
158
|
RUBYARCHDIR = $(sitearchdir)$(target_prefix)
|
|
158
|
-
HDRDIR = $(
|
|
159
|
-
ARCHHDRDIR = $(
|
|
159
|
+
HDRDIR = $(sitehdrdir)$(target_prefix)
|
|
160
|
+
ARCHHDRDIR = $(sitearchhdrdir)$(target_prefix)
|
|
160
161
|
TARGET_SO_DIR =
|
|
161
162
|
TARGET_SO = $(TARGET_SO_DIR)$(DLLIB)
|
|
162
163
|
CLEANLIBS = $(TARGET_SO)
|
data/ext/cld3/base.o
CHANGED
|
Binary file
|
|
Binary file
|
|
@@ -167,6 +167,7 @@ EmbeddingNetwork::EmbeddingNetwork(const EmbeddingNetworkParams *model)
|
|
|
167
167
|
for (int i = 0; i < model_->embedding_dim_size(); ++i) {
|
|
168
168
|
CLD3_DCHECK(offset_sum == model_->concat_offset(i));
|
|
169
169
|
offset_sum += model_->embedding_dim(i) * model_->embedding_num_features(i);
|
|
170
|
+
(void)offset_sum; // Avoid compiler warning for "unused" variable.
|
|
170
171
|
embedding_matrices_.emplace_back(model_->GetEmbeddingMatrix(i));
|
|
171
172
|
}
|
|
172
173
|
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
data/ext/cld3/feature_types.o
CHANGED
|
Binary file
|
data/ext/cld3/fixunicodevalue.o
CHANGED
|
Binary file
|
data/ext/cld3/fml_parser.o
CHANGED
|
Binary file
|
|
Binary file
|
|
Binary file
|
data/ext/cld3/getonescriptspan.h
CHANGED
|
@@ -33,14 +33,14 @@ static const int kMaxScriptBytes = kMaxScriptBuffer - 32; // Leave some room
|
|
|
33
33
|
static const int kWithinScriptTail = 32; // Stop at word space in last
|
|
34
34
|
// N bytes of script buffer
|
|
35
35
|
|
|
36
|
-
|
|
36
|
+
struct LangSpan {
|
|
37
37
|
char* text = nullptr; // Pointer to the span, somewhere
|
|
38
38
|
int text_bytes = 0; // Number of bytes of text in the span
|
|
39
39
|
int offset = 0; // Offset of start of span in original input buffer
|
|
40
40
|
ULScript ulscript = UNKNOWN_ULSCRIPT; // Unicode Letters Script of this span
|
|
41
41
|
bool truncated = false; // true if buffer filled up before a
|
|
42
42
|
// different script or EOF was found
|
|
43
|
-
}
|
|
43
|
+
};
|
|
44
44
|
|
|
45
45
|
static inline bool IsContinuationByte(char c) {
|
|
46
46
|
return static_cast<signed char>(c) < -64;
|
data/ext/cld3/getonescriptspan.o
CHANGED
|
Binary file
|
|
Binary file
|
|
Binary file
|
data/ext/cld3/libcld3.so
CHANGED
|
Binary file
|
|
@@ -284,8 +284,6 @@ NNetLanguageIdentifier::FindTopNMostFreqLangs(const string &text,
|
|
|
284
284
|
CLD2::LangSpan script_span;
|
|
285
285
|
std::unordered_map<string, LangChunksStats> lang_stats;
|
|
286
286
|
int total_num_bytes = 0;
|
|
287
|
-
Result result;
|
|
288
|
-
string language;
|
|
289
287
|
int chunk_size = 0; // Use the default.
|
|
290
288
|
while (ss.GetOneScriptSpanLower(&script_span)) {
|
|
291
289
|
const int num_original_span_bytes = script_span.text_bytes;
|
|
@@ -302,8 +300,8 @@ NNetLanguageIdentifier::FindTopNMostFreqLangs(const string &text,
|
|
|
302
300
|
|
|
303
301
|
const string selected_text = SelectTextGivenScriptSpan(script_span);
|
|
304
302
|
|
|
305
|
-
result = FindLanguageOfValidUTF8(selected_text);
|
|
306
|
-
language = result.language;
|
|
303
|
+
Result result = FindLanguageOfValidUTF8(selected_text);
|
|
304
|
+
string language = result.language;
|
|
307
305
|
lang_stats[language].byte_sum += num_original_span_bytes;
|
|
308
306
|
lang_stats[language].prob_sum +=
|
|
309
307
|
result.probability * num_original_span_bytes;
|
|
@@ -356,7 +354,7 @@ string NNetLanguageIdentifier::SelectTextGivenBeginAndSize(
|
|
|
356
354
|
const char *text_begin, int text_size) {
|
|
357
355
|
string output_text;
|
|
358
356
|
|
|
359
|
-
// If the size of the input is greater than the
|
|
357
|
+
// If the size of the input is greater than the maximum number of bytes needed
|
|
360
358
|
// for a prediction, then concatenate snippets that are equally spread out
|
|
361
359
|
// throughout the input.
|
|
362
360
|
if (text_size > max_num_bytes_) {
|
|
Binary file
|
|
Binary file
|
data/ext/cld3/offsetmap.o
CHANGED
|
Binary file
|
data/ext/cld3/registry.o
CHANGED
|
Binary file
|
|
Binary file
|
data/ext/cld3/sentence.pb.o
CHANGED
|
Binary file
|
|
@@ -19,11 +19,11 @@ limitations under the License.
|
|
|
19
19
|
|
|
20
20
|
namespace chrome_lang_id {
|
|
21
21
|
|
|
22
|
-
//
|
|
22
|
+
// Define registry for the whole Sentence feature functions. NOTE: this is not
|
|
23
23
|
// yet set to anything meaningful. It will be set so in NNetLanguageIdentifier
|
|
24
24
|
// constructor, *before* we use any feature.
|
|
25
25
|
template <>
|
|
26
|
-
WholeSentenceFeature::Registry
|
|
27
|
-
|
|
26
|
+
WholeSentenceFeature::Registry*
|
|
27
|
+
RegisterableClass<WholeSentenceFeature>::registry_ = nullptr;
|
|
28
28
|
|
|
29
|
-
} // namespace chrome_lang_id
|
|
29
|
+
} // namespace chrome_lang_id
|
|
@@ -26,9 +26,19 @@ limitations under the License.
|
|
|
26
26
|
namespace chrome_lang_id {
|
|
27
27
|
|
|
28
28
|
// Feature function that extracts features for the full Sentence.
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
29
|
+
using WholeSentenceFeature = FeatureFunction<Sentence>;
|
|
30
|
+
|
|
31
|
+
using WholeSentenceExtractor = FeatureExtractor<Sentence>;
|
|
32
|
+
|
|
33
|
+
// Declare registry for the whole Sentence feature functions. This is required
|
|
34
|
+
// for clang's -Wundefined-var-template. However, MSVC has a bug which treats
|
|
35
|
+
// this declaration as a definition, leading to multiple definition errors, so
|
|
36
|
+
// omit this on MSVC.
|
|
37
|
+
#if !defined(COMPILER_MSVC)
|
|
38
|
+
template <>
|
|
39
|
+
WholeSentenceFeature::Registry
|
|
40
|
+
*RegisterableClass<WholeSentenceFeature>::registry_;
|
|
41
|
+
#endif
|
|
32
42
|
|
|
33
43
|
} // namespace chrome_lang_id
|
|
34
44
|
|
|
Binary file
|
data/ext/cld3/task_context.o
CHANGED
|
Binary file
|
|
Binary file
|
data/ext/cld3/task_spec.pb.o
CHANGED
|
Binary file
|
data/ext/cld3/text_processing.o
CHANGED
|
Binary file
|
data/ext/cld3/unicodetext.o
CHANGED
|
Binary file
|
data/ext/cld3/utf8statetable.o
CHANGED
|
Binary file
|
data/ext/cld3/utils.o
CHANGED
|
Binary file
|
data/ext/cld3/workspace.o
CHANGED
|
Binary file
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
|
|
2
|
+
# Copyright 2021 Akihiko Odaki <akihiko.odaki@gmail.com>
|
|
3
|
+
# All Rights Reserved.
|
|
4
|
+
#
|
|
5
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
6
|
+
# you may not use this file except in compliance with the License.
|
|
7
|
+
# You may obtain a copy of the License at
|
|
8
|
+
#
|
|
9
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
10
|
+
#
|
|
11
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
12
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
13
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
14
|
+
# See the License for the specific language governing permissions and
|
|
15
|
+
# limitations under the License.
|
|
16
|
+
# ==============================================================================
|
|
17
|
+
|
|
18
|
+
module CLD3
|
|
19
|
+
module Unstable
|
|
20
|
+
extend FFI::Library
|
|
21
|
+
|
|
22
|
+
ffi_lib File.join(__dir__, "..", "..", "ext", "cld3", "libcld3." + RbConfig::CONFIG["DLEXT"])
|
|
23
|
+
|
|
24
|
+
module NNetLanguageIdentifier
|
|
25
|
+
class Pointer < FFI::AutoPointer
|
|
26
|
+
def self.release(pointer)
|
|
27
|
+
Unstable.delete_NNetLanguageIdentifier(pointer)
|
|
28
|
+
end
|
|
29
|
+
end
|
|
30
|
+
|
|
31
|
+
class SpanInfo < FFI::Struct
|
|
32
|
+
layout :start_index, :int, :end_index, :int, :probability, :float
|
|
33
|
+
end
|
|
34
|
+
|
|
35
|
+
class Result < FFI::Struct
|
|
36
|
+
layout :language_data, :pointer, :language_size, :size_t, :byte_ranges_data, :pointer, :byte_ranges_size, :size_t, :probability, :float, :proportion, :float, :reliable?, :bool
|
|
37
|
+
end
|
|
38
|
+
end
|
|
39
|
+
|
|
40
|
+
attach_function :delete_NNetLanguageIdentifier, [ :pointer ], :void
|
|
41
|
+
|
|
42
|
+
attach_function :delete_result, [ :pointer ], :void
|
|
43
|
+
|
|
44
|
+
attach_function :delete_results, [ :pointer ], :void
|
|
45
|
+
|
|
46
|
+
attach_function :new_NNetLanguageIdentifier, [ :int, :int ], :pointer
|
|
47
|
+
|
|
48
|
+
attach_function :refer_to_nth_result, [ :pointer, :size_t ], NNetLanguageIdentifier::Result.by_value
|
|
49
|
+
|
|
50
|
+
attach_function :NNetLanguageIdentifier_find_language,
|
|
51
|
+
[ :pointer, :buffer_in, :size_t ], :pointer
|
|
52
|
+
|
|
53
|
+
attach_function :NNetLanguageIdentifier_find_top_n_most_freq_langs,
|
|
54
|
+
[ :pointer, :buffer_in, :size_t, :int ], :pointer
|
|
55
|
+
end
|
|
56
|
+
|
|
57
|
+
private_constant :Unstable
|
|
58
|
+
end
|
data/lib/cld3.rb
CHANGED
|
@@ -19,6 +19,7 @@
|
|
|
19
19
|
|
|
20
20
|
require "ffi"
|
|
21
21
|
require "rbconfig"
|
|
22
|
+
require "cld3/unstable"
|
|
22
23
|
|
|
23
24
|
# Module providing an interface for Compact Language Detector v3 (CLD3)
|
|
24
25
|
module CLD3
|
|
@@ -52,6 +53,7 @@ module CLD3
|
|
|
52
53
|
# Holds probability that Span, specified by start/end indices, is a given
|
|
53
54
|
# language. The language is not stored here; it can be found in Result, which
|
|
54
55
|
# holds an Array of SpanInfo.
|
|
56
|
+
# @type const SpanInfo: untyped
|
|
55
57
|
SpanInfo = Struct.new(:start_index, :end_index, :probability)
|
|
56
58
|
|
|
57
59
|
# Information about a predicted language.
|
|
@@ -69,6 +71,7 @@ module CLD3
|
|
|
69
71
|
#
|
|
70
72
|
# [byte_ranges] Specifies the byte ranges in UTF-8 that |language| applies to.
|
|
71
73
|
# This is an Array of SpanInfo.
|
|
74
|
+
# @type const Result: untyped
|
|
72
75
|
Result = Struct.new(:language, :probability, :reliable?, :proportion, :byte_ranges)
|
|
73
76
|
|
|
74
77
|
# The arguments are two String objects.
|
|
@@ -115,6 +118,8 @@ module CLD3
|
|
|
115
118
|
# The second argument is Numeric object.
|
|
116
119
|
# The returned value of this function is an Array of Result instances.
|
|
117
120
|
def find_top_n_most_freq_langs(text, num_langs)
|
|
121
|
+
# @type var a: untyped
|
|
122
|
+
|
|
118
123
|
text_utf8 = text.encode(Encoding::UTF_8)
|
|
119
124
|
pointer = FFI::MemoryPointer.new(:char, text_utf8.bytesize)
|
|
120
125
|
|
|
@@ -123,11 +128,13 @@ module CLD3
|
|
|
123
128
|
|
|
124
129
|
results = Unstable.NNetLanguageIdentifier_find_top_n_most_freq_langs(@cc, pointer, text_utf8.bytesize, num_langs)
|
|
125
130
|
begin
|
|
126
|
-
num_langs.times
|
|
131
|
+
a = num_langs.times
|
|
127
132
|
.lazy
|
|
128
133
|
.map { |index| convert_result Unstable.refer_to_nth_result(results, index) }
|
|
129
134
|
.take_while { |result| !result.nil? }
|
|
130
135
|
.to_a
|
|
136
|
+
|
|
137
|
+
a
|
|
131
138
|
ensure
|
|
132
139
|
Unstable.delete_results results
|
|
133
140
|
end
|
|
@@ -162,6 +169,7 @@ module CLD3
|
|
|
162
169
|
# The model weights are loaded statically.
|
|
163
170
|
module TaskContextParams
|
|
164
171
|
# This is a frozen Array object containing symbols.
|
|
172
|
+
# @type const LANGUAGE_NAMES: untyped
|
|
165
173
|
LANGUAGE_NAMES = [
|
|
166
174
|
:eo, :co, :eu, :ta, :de, :mt, :ps, :te, :su, :uz, :'zh-Latn', :ne,
|
|
167
175
|
:nl, :sw, :sq, :hmn, :ja, :no, :mn, :so, :ko, :kk, :sl, :ig,
|
|
@@ -175,44 +183,4 @@ module CLD3
|
|
|
175
183
|
:sn, :yo, :pa, :ku,
|
|
176
184
|
].freeze
|
|
177
185
|
end
|
|
178
|
-
|
|
179
|
-
module Unstable
|
|
180
|
-
extend FFI::Library
|
|
181
|
-
|
|
182
|
-
ffi_lib File.join(__dir__, "..", "ext", "cld3", "libcld3." + RbConfig::CONFIG["DLEXT"])
|
|
183
|
-
|
|
184
|
-
module NNetLanguageIdentifier
|
|
185
|
-
class Pointer < FFI::AutoPointer
|
|
186
|
-
def self.release(pointer)
|
|
187
|
-
Unstable.delete_NNetLanguageIdentifier(pointer)
|
|
188
|
-
end
|
|
189
|
-
end
|
|
190
|
-
|
|
191
|
-
class SpanInfo < FFI::Struct
|
|
192
|
-
layout :start_index, :int, :end_index, :int, :probability, :float
|
|
193
|
-
end
|
|
194
|
-
|
|
195
|
-
class Result < FFI::Struct
|
|
196
|
-
layout :language_data, :pointer, :language_size, :size_t, :byte_ranges_data, :pointer, :byte_ranges_size, :size_t, :probability, :float, :proportion, :float, :reliable?, :bool
|
|
197
|
-
end
|
|
198
|
-
end
|
|
199
|
-
|
|
200
|
-
attach_function :delete_NNetLanguageIdentifier, [ :pointer ], :void
|
|
201
|
-
|
|
202
|
-
attach_function :delete_result, [ :pointer ], :void
|
|
203
|
-
|
|
204
|
-
attach_function :delete_results, [ :pointer ], :void
|
|
205
|
-
|
|
206
|
-
attach_function :new_NNetLanguageIdentifier, [ :int, :int ], :pointer
|
|
207
|
-
|
|
208
|
-
attach_function :refer_to_nth_result, [ :pointer, :size_t ], NNetLanguageIdentifier::Result.by_value
|
|
209
|
-
|
|
210
|
-
attach_function :NNetLanguageIdentifier_find_language,
|
|
211
|
-
[ :pointer, :buffer_in, :size_t ], :pointer
|
|
212
|
-
|
|
213
|
-
attach_function :NNetLanguageIdentifier_find_top_n_most_freq_langs,
|
|
214
|
-
[ :pointer, :buffer_in, :size_t, :int ], :pointer
|
|
215
|
-
end
|
|
216
|
-
|
|
217
|
-
private_constant :Unstable
|
|
218
186
|
end
|
data/sig/cld3.rbs
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
# Copyright 2021 Akihiko Odaki <akihiko.odaki@gmail.com>
|
|
2
|
+
# All Rights Reserved.
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
# ==============================================================================
|
|
16
|
+
|
|
17
|
+
module CLD3
|
|
18
|
+
class NNetLanguageIdentifier
|
|
19
|
+
MIN_NUM_BYTES_TO_CONSIDER: Integer
|
|
20
|
+
MAX_NUM_BYTES_TO_CONSIDER: Integer
|
|
21
|
+
MAX_NUM_INPUT_BYTES_TO_CONSIDER: Integer
|
|
22
|
+
RELIABILITY_THRESHOLD: Float
|
|
23
|
+
RELIABILITY_HR_BS_THRESHOLD: Float
|
|
24
|
+
|
|
25
|
+
class SpanInfo < Struct[Float | Integer]
|
|
26
|
+
attr_accessor start_index(): Integer
|
|
27
|
+
attr_accessor end_index(): Integer
|
|
28
|
+
attr_accessor probability(): Float
|
|
29
|
+
end
|
|
30
|
+
|
|
31
|
+
class Result < Struct[Array[SpanInfo] | Float | TaskContextParams::language_names | bool]
|
|
32
|
+
attr_accessor language(): TaskContextParams::language_names
|
|
33
|
+
attr_accessor probability(): Float
|
|
34
|
+
attr_accessor reliable?(): bool
|
|
35
|
+
attr_accessor proportion(): Float
|
|
36
|
+
attr_accessor byte_ranges(): Array[SpanInfo]
|
|
37
|
+
end
|
|
38
|
+
|
|
39
|
+
def initialize: (?Integer, ?Integer) -> void
|
|
40
|
+
def find_language: (String) -> Result?
|
|
41
|
+
def find_top_n_most_freq_langs: (String, Integer) -> Array[Result]
|
|
42
|
+
|
|
43
|
+
private
|
|
44
|
+
|
|
45
|
+
def convert_result: (untyped) -> Result?
|
|
46
|
+
end
|
|
47
|
+
|
|
48
|
+
module TaskContextParams
|
|
49
|
+
type language_names =
|
|
50
|
+
:eo | :co | :eu | :ta | :de | :mt | :ps | :te | :su | :uz | :'zh-Latn' | :ne |
|
|
51
|
+
:nl | :sw | :sq | :hmn | :ja | :no | :mn | :so | :ko | :kk | :sl | :ig |
|
|
52
|
+
:mr | :th | :zu | :ml | :hr | :bs | :lo | :sd | :cy | :hy | :uk | :pt |
|
|
53
|
+
:lv | :iw | :cs | :vi | :jv | :be | :km | :mk | :tr | :fy | :am | :zh |
|
|
54
|
+
:da | :sv | :fi | :ht | :af | :la | :id | :fil | :sm | :ca | :el | :ka |
|
|
55
|
+
:sr | :it | :sk | :ru | :'ru-Latn' | :bg | :ny | :fa | :haw | :gl | :et |
|
|
56
|
+
:ms | :gd | :'bg-Latn' | :ha | :is | :ur | :mi | :hi | :bn | :'hi-Latn' | :fr |
|
|
57
|
+
:yi | :hu | :xh | :my | :tg | :ro | :ar | :lb | :'el-Latn' | :st | :ceb |
|
|
58
|
+
:kn | :az | :si | :ky | :mg | :en | :gu | :es | :pl | :'ja-Latn' | :ga | :lt |
|
|
59
|
+
:sn | :yo | :pa | :ku
|
|
60
|
+
|
|
61
|
+
LANGUAGE_NAMES: Array[language_names]
|
|
62
|
+
end
|
|
63
|
+
|
|
64
|
+
Unstable: untyped
|
|
65
|
+
end
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: cld3
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 3.4.2
|
|
4
|
+
version: 3.4.3
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Akihiko Odaki
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2021-
|
|
11
|
+
date: 2021-11-25 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: ffi
|
|
@@ -30,6 +30,26 @@ dependencies:
|
|
|
30
30
|
- - "<"
|
|
31
31
|
- !ruby/object:Gem::Version
|
|
32
32
|
version: 1.16.0
|
|
33
|
+
- !ruby/object:Gem::Dependency
|
|
34
|
+
name: rbs
|
|
35
|
+
requirement: !ruby/object:Gem::Requirement
|
|
36
|
+
requirements:
|
|
37
|
+
- - ">="
|
|
38
|
+
- !ruby/object:Gem::Version
|
|
39
|
+
version: 1.7.0
|
|
40
|
+
- - "<"
|
|
41
|
+
- !ruby/object:Gem::Version
|
|
42
|
+
version: 1.8.0
|
|
43
|
+
type: :development
|
|
44
|
+
prerelease: false
|
|
45
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
46
|
+
requirements:
|
|
47
|
+
- - ">="
|
|
48
|
+
- !ruby/object:Gem::Version
|
|
49
|
+
version: 1.7.0
|
|
50
|
+
- - "<"
|
|
51
|
+
- !ruby/object:Gem::Version
|
|
52
|
+
version: 1.8.0
|
|
33
53
|
- !ruby/object:Gem::Dependency
|
|
34
54
|
name: rspec
|
|
35
55
|
requirement: !ruby/object:Gem::Requirement
|
|
@@ -50,6 +70,26 @@ dependencies:
|
|
|
50
70
|
- - "<"
|
|
51
71
|
- !ruby/object:Gem::Version
|
|
52
72
|
version: 3.11.0
|
|
73
|
+
- !ruby/object:Gem::Dependency
|
|
74
|
+
name: steep
|
|
75
|
+
requirement: !ruby/object:Gem::Requirement
|
|
76
|
+
requirements:
|
|
77
|
+
- - ">="
|
|
78
|
+
- !ruby/object:Gem::Version
|
|
79
|
+
version: 0.46.0
|
|
80
|
+
- - "<"
|
|
81
|
+
- !ruby/object:Gem::Version
|
|
82
|
+
version: 0.47.0
|
|
83
|
+
type: :development
|
|
84
|
+
prerelease: false
|
|
85
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
86
|
+
requirements:
|
|
87
|
+
- - ">="
|
|
88
|
+
- !ruby/object:Gem::Version
|
|
89
|
+
version: 0.46.0
|
|
90
|
+
- - "<"
|
|
91
|
+
- !ruby/object:Gem::Version
|
|
92
|
+
version: 0.47.0
|
|
53
93
|
description: Compact Language Detector v3 (CLD3) is a neural network model for language
|
|
54
94
|
identification.
|
|
55
95
|
email: akihiko.odaki@gmail.com
|
|
@@ -160,6 +200,8 @@ files:
|
|
|
160
200
|
- ext/cld3/workspace.h
|
|
161
201
|
- ext/cld3/workspace.o
|
|
162
202
|
- lib/cld3.rb
|
|
203
|
+
- lib/cld3/unstable.rb
|
|
204
|
+
- sig/cld3.rbs
|
|
163
205
|
homepage: https://github.com/akihikodaki/cld3-ruby
|
|
164
206
|
licenses:
|
|
165
207
|
- Apache-2.0
|
|
@@ -175,14 +217,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
|
175
217
|
version: 2.6.0
|
|
176
218
|
- - "<"
|
|
177
219
|
- !ruby/object:Gem::Version
|
|
178
|
-
version: 3.
|
|
220
|
+
version: 3.2.0
|
|
179
221
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
180
222
|
requirements:
|
|
181
223
|
- - ">="
|
|
182
224
|
- !ruby/object:Gem::Version
|
|
183
225
|
version: '0'
|
|
184
226
|
requirements: []
|
|
185
|
-
rubygems_version: 3.
|
|
227
|
+
rubygems_version: 3.2.22
|
|
186
228
|
signing_key:
|
|
187
229
|
specification_version: 4
|
|
188
230
|
summary: Compact Language Detector v3 (CLD3)
|