cld3 3.4.2 → 3.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +4 -25
- data/cld3.gemspec +6 -4
- data/ext/cld3/Makefile +21 -19
- data/ext/cld3/base.o +0 -0
- data/ext/cld3/cld_3/protos/feature_extractor.pb.h +100 -0
- data/ext/cld3/cld_3/protos/sentence.pb.h +35 -0
- data/ext/cld3/cld_3/protos/task_spec.pb.h +106 -0
- data/ext/cld3/embedding_feature_extractor.o +0 -0
- data/ext/cld3/embedding_network.cc +1 -0
- data/ext/cld3/embedding_network.o +0 -0
- data/ext/cld3/extconf.rb +1 -10
- data/ext/cld3/feature_extractor.o +0 -0
- data/ext/cld3/feature_types.o +0 -0
- data/ext/cld3/fixunicodevalue.o +0 -0
- data/ext/cld3/fml_parser.o +0 -0
- data/ext/cld3/generated_entities.o +0 -0
- data/ext/cld3/generated_ulscript.o +0 -0
- data/ext/cld3/getonescriptspan.cc +0 -2
- data/ext/cld3/getonescriptspan.h +2 -2
- data/ext/cld3/getonescriptspan.o +0 -0
- data/ext/cld3/lang_id_nn_params.o +0 -0
- data/ext/cld3/language_identifier_features.o +0 -0
- data/ext/cld3/libcld3.so +0 -0
- data/ext/cld3/nnet_language_identifier.cc +3 -5
- data/ext/cld3/nnet_language_identifier.o +0 -0
- data/ext/cld3/nnet_language_identifier_c.o +0 -0
- data/ext/cld3/offsetmap.o +0 -0
- data/ext/cld3/registry.o +0 -0
- data/ext/cld3/relevant_script_feature.o +0 -0
- data/ext/cld3/script_span/fixunicodevalue.h +69 -0
- data/ext/cld3/script_span/generated_ulscript.h +142 -0
- data/ext/cld3/script_span/getonescriptspan.h +124 -0
- data/ext/cld3/script_span/integral_types.h +37 -0
- data/ext/cld3/script_span/offsetmap.h +168 -0
- data/ext/cld3/script_span/port.h +143 -0
- data/ext/cld3/script_span/stringpiece.h +81 -0
- data/ext/cld3/script_span/text_processing.h +30 -0
- data/ext/cld3/script_span/utf8acceptinterchange.h +486 -0
- data/ext/cld3/script_span/utf8prop_lettermarkscriptnum.h +1631 -0
- data/ext/cld3/script_span/utf8repl_lettermarklower.h +758 -0
- data/ext/cld3/script_span/utf8scannot_lettermarkspecial.h +1455 -0
- data/ext/cld3/script_span/utf8statetable.h +285 -0
- data/ext/cld3/sentence_features.cc +4 -4
- data/ext/cld3/sentence_features.h +13 -3
- data/ext/cld3/sentence_features.o +0 -0
- data/ext/cld3/task_context.o +0 -0
- data/ext/cld3/task_context_params.o +0 -0
- data/ext/cld3/text_processing.o +0 -0
- data/ext/cld3/unicodetext.o +0 -0
- data/ext/cld3/utf8statetable.o +0 -0
- data/ext/cld3/utils.o +0 -0
- data/ext/cld3/workspace.o +0 -0
- data/lib/cld3/unstable.rb +58 -0
- data/lib/cld3.rb +15 -43
- data/sig/cld3.rbs +65 -0
- metadata +66 -15
- data/ext/cld3/feature_extractor.pb.o +0 -0
- data/ext/cld3/feature_extractor.proto +0 -50
- data/ext/cld3/mkmf.log +0 -37
- data/ext/cld3/sentence.pb.o +0 -0
- data/ext/cld3/sentence.proto +0 -77
- data/ext/cld3/task_spec.pb.o +0 -0
- data/ext/cld3/task_spec.proto +0 -98
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e12ff58a2cc0242896e307f33cb1adb2e210a1889244861e2c751b0399cb6415
|
4
|
+
data.tar.gz: 3b0c348b00126c8ee825a76eed0919e074a7f97e7dc7e647614f7a754c69d716
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 690c6e6f0cc8f0cc369b05a611daeed74adc273cd9c02f780233921ed0c220a425552e7020c127ff0503e88d473f57088cafd3b0d2143f3f86527d886d3eaf63
|
7
|
+
data.tar.gz: 6c3b7c105fa799fb918a077d9d1f47c948ec29bad86e57f452ea9eae82fc4f5bfc2686654a2a68ebc7102dfcb34e2a1ca55648d75a1f6bcf97b532de482de76e
|
data/README.md
CHANGED
@@ -19,7 +19,6 @@ cld3.find_language("здравствуйте") # => #<struct Struct::Result lang
|
|
19
19
|
### Prerequisites
|
20
20
|
* [Bundler](http://bundler.io/)
|
21
21
|
* C++ compiler
|
22
|
-
* [Protocol buffers](https://developers.google.com/protocol-buffers/)
|
23
22
|
* [Rake](https://ruby.github.io/rake/)
|
24
23
|
* [RubyGems](https://rubygems.org/)
|
25
24
|
|
@@ -41,36 +40,16 @@ JRuby has a bug which prevents the feature detection. Apply the following
|
|
41
40
|
change:
|
42
41
|
https://github.com/jruby/jruby/pull/4118/commits/edad375ef4dcf195b19ce0afe4befac66468c736
|
43
42
|
|
44
|
-
#### OpenBSD
|
45
|
-
Ruby has a bug which recognizes non-fatal linker warnings as fatal. Apply the
|
46
|
-
following patch to Ruby to workaround the bug.
|
47
|
-
|
48
|
-
```diff
|
49
|
-
--- a/lib/mkmf.rb
|
50
|
-
+++ b/lib/mkmf.rb
|
51
|
-
@@ -657,7 +657,7 @@ def with_ldflags(flags)
|
52
|
-
end
|
53
|
-
|
54
|
-
def try_ldflags(flags, opts = {})
|
55
|
-
- try_link(MAIN_DOES_NOTHING, flags, {:werror => true}.update(opts))
|
56
|
-
+ try_link(MAIN_DOES_NOTHING, flags, {:werror => false}.update(opts))
|
57
|
-
end
|
58
|
-
|
59
|
-
def append_ldflags(flags, *opts)
|
60
|
-
```
|
61
|
-
|
62
43
|
### Troubleshooting
|
63
|
-
`gem install cld3` triggers native library building. If it fails,
|
64
|
-
|
65
|
-
|
66
|
-
likely to need [pkg-config](https://www.freedesktop.org/wiki/Software/pkg-config/)
|
67
|
-
as well.
|
44
|
+
`gem install cld3` triggers native library building. If it fails, it is likely
|
45
|
+
that some required facilities are missing. Make sure C++ compiler is installed.
|
46
|
+
I recommend [GCC](https://gcc.gnu.org/) as a C++ compiler.
|
68
47
|
|
69
48
|
Runtime errors are likely to be issues of [FFI](https://github.com/ffi/ffi) or
|
70
49
|
programming errors. Make sure they are all correct.
|
71
50
|
|
72
51
|
If you cannot identify the cause of your problem, run spec of this library and
|
73
|
-
see whether the problem is
|
52
|
+
see whether the problem is reproducible with it or not. Spec is not included in
|
74
53
|
the gem, so clone the source code repository and then run `rake spec`.
|
75
54
|
The source code repository is at
|
76
55
|
https://github.com/akihikodaki/cld3-ruby.
|
data/cld3.gemspec
CHANGED
@@ -16,19 +16,21 @@
|
|
16
16
|
|
17
17
|
Gem::Specification.new do |gem|
|
18
18
|
gem.name = "cld3"
|
19
|
-
gem.version = "3.4.2"
|
19
|
+
gem.version = "3.5.0"
|
20
20
|
gem.summary = "Compact Language Detector v3 (CLD3)"
|
21
21
|
gem.description = "Compact Language Detector v3 (CLD3) is a neural network model for language identification."
|
22
22
|
gem.license = "Apache-2.0"
|
23
23
|
gem.homepage = "https://github.com/akihikodaki/cld3-ruby"
|
24
24
|
gem.author = "Akihiko Odaki"
|
25
25
|
gem.email = "akihiko.odaki@gmail.com"
|
26
|
-
gem.required_ruby_version = [ ">= 2.
|
26
|
+
gem.required_ruby_version = [ ">= 2.7.0", "< 3.3.0" ]
|
27
27
|
gem.add_dependency "ffi", [ ">= 1.1.0", "< 1.16.0" ]
|
28
|
-
gem.add_development_dependency "
|
28
|
+
gem.add_development_dependency "rbs", [ ">= 2.6.0", "< 2.7.0" ]
|
29
|
+
gem.add_development_dependency "rspec", [ ">= 3.11.0", "< 3.12.0" ]
|
30
|
+
gem.add_development_dependency "steep", [ ">= 1.0.0", "< 1.1.0" ]
|
29
31
|
gem.files = Dir[
|
30
32
|
"Gemfile", "LICENSE", "LICENSE_CLD3", "README.md",
|
31
|
-
"cld3.gemspec", "ext/**/*", "lib/**/*"
|
33
|
+
"cld3.gemspec", "ext/**/*", "lib/**/*", "sig/**/*"
|
32
34
|
]
|
33
35
|
gem.require_paths = [ "lib" ]
|
34
36
|
gem.extensions = [ "ext/cld3/extconf.rb" ]
|
data/ext/cld3/Makefile
CHANGED
@@ -3,6 +3,7 @@ SHELL = /bin/sh
|
|
3
3
|
|
4
4
|
# V=0 quiet, V=1 verbose. other values don't work.
|
5
5
|
V = 1
|
6
|
+
V0 = $(V:0=)
|
6
7
|
Q1 = $(V:1=)
|
7
8
|
Q = $(Q1:0=@)
|
8
9
|
ECHO1 = $(V:1=@ :)
|
@@ -52,7 +53,8 @@ htmldir = $(docdir)
|
|
52
53
|
infodir = $(DESTDIR)/usr/share/info
|
53
54
|
docdir = $(datarootdir)/doc/$(PACKAGE)
|
54
55
|
oldincludedir = $(DESTDIR)/usr/include
|
55
|
-
includedir = $(
|
56
|
+
includedir = $(exec_prefix)/include
|
57
|
+
runstatedir = $(localstatedir)/run
|
56
58
|
localstatedir = $(DESTDIR)/var
|
57
59
|
sharedstatedir = $(DESTDIR)/var/lib
|
58
60
|
sysconfdir = $(DESTDIR)/etc
|
@@ -79,23 +81,23 @@ CSRCFLAG = $(empty)
|
|
79
81
|
RUBY_EXTCONF_H =
|
80
82
|
cflags = $(optflags) $(debugflags) $(warnflags)
|
81
83
|
cxxflags =
|
82
|
-
optflags = -O3
|
84
|
+
optflags = -O3 -fno-fast-math
|
83
85
|
debugflags = -ggdb3
|
84
|
-
warnflags = -Wall -Wextra -Wdeprecated-declarations -Wduplicated-cond -Wimplicit-function-declaration -Wimplicit-int -Wmisleading-indentation -Wpointer-arith -Wwrite-strings -Wimplicit-fallthrough=0 -Wmissing-noreturn -Wno-cast-function-type -Wno-constant-logical-operand -Wno-long-long -Wno-missing-field-initializers -Wno-overlength-strings -Wno-packed-bitfield-compat -Wno-parentheses-equality -Wno-self-assign -Wno-tautological-compare -Wno-unused-parameter -Wno-unused-value -Wsuggest-attribute=format -Wsuggest-attribute=noreturn -Wunused-variable
|
86
|
+
warnflags = -Wall -Wextra -Wdeprecated-declarations -Wduplicated-cond -Wimplicit-function-declaration -Wimplicit-int -Wmisleading-indentation -Wpointer-arith -Wwrite-strings -Wold-style-definition -Wimplicit-fallthrough=0 -Wmissing-noreturn -Wno-cast-function-type -Wno-constant-logical-operand -Wno-long-long -Wno-missing-field-initializers -Wno-overlength-strings -Wno-packed-bitfield-compat -Wno-parentheses-equality -Wno-self-assign -Wno-tautological-compare -Wno-unused-parameter -Wno-unused-value -Wsuggest-attribute=format -Wsuggest-attribute=noreturn -Wunused-variable -Wundef
|
85
87
|
cppflags =
|
86
88
|
CCDLFLAGS = -fPIC
|
87
|
-
CFLAGS = $(CCDLFLAGS) -O2
|
88
|
-
INCFLAGS = -I. -I$(arch_hdrdir) -I$(hdrdir)/ruby/backward -I$(hdrdir) -I$(srcdir)
|
89
|
+
CFLAGS = $(CCDLFLAGS) -O2 -flto=auto -ffat-lto-objects -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/redhat/redhat-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/redhat/redhat-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -fPIC $(ARCH_FLAG)
|
90
|
+
INCFLAGS = -I. -I$(arch_hdrdir) -I$(hdrdir)/ruby/backward -I$(hdrdir) -I$(srcdir)
|
89
91
|
DEFS =
|
90
92
|
CPPFLAGS = $(DEFS) $(cppflags)
|
91
|
-
CXXFLAGS = $(CCDLFLAGS) -O2
|
92
|
-
ldflags = -L. -Wl,-z,relro -Wl,--as-needed -Wl,-z,now -specs=/usr/lib/rpm/redhat/redhat-hardened-ld -fstack-protector-strong -rdynamic -Wl,-export-dynamic
|
93
|
-
dldflags = -Wl,-z,relro -Wl,--as-needed -Wl,-z,now -specs=/usr/lib/rpm/redhat/redhat-hardened-ld
|
93
|
+
CXXFLAGS = $(CCDLFLAGS) -O2 -flto=auto -ffat-lto-objects -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/redhat/redhat-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/redhat/redhat-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -fvisibility=hidden -std=c++17 $(ARCH_FLAG)
|
94
|
+
ldflags = -L. -Wl,-z,relro -Wl,--as-needed -Wl,-z,now -specs=/usr/lib/rpm/redhat/redhat-hardened-ld -specs=/usr/lib/rpm/redhat/redhat-annobin-cc1 -Wl,--build-id=sha1 -fstack-protector-strong -rdynamic -Wl,-export-dynamic -Wl,--no-as-needed
|
95
|
+
dldflags = -Wl,-z,relro -Wl,--as-needed -Wl,-z,now -specs=/usr/lib/rpm/redhat/redhat-hardened-ld -specs=/usr/lib/rpm/redhat/redhat-annobin-cc1 -Wl,--build-id=sha1
|
94
96
|
ARCH_FLAG =
|
95
97
|
DLDFLAGS = $(ldflags) $(dldflags) $(ARCH_FLAG)
|
96
98
|
LDSHARED = $(CC) -shared
|
97
99
|
LDSHAREDXX = $(CXX) -shared
|
98
|
-
AR = ar
|
100
|
+
AR = gcc-ar
|
99
101
|
EXEEXT =
|
100
102
|
|
101
103
|
RUBY_INSTALL_NAME = $(RUBY_BASE_NAME)
|
@@ -107,13 +109,13 @@ RUBY_BASE_NAME = ruby
|
|
107
109
|
|
108
110
|
arch = aarch64-linux
|
109
111
|
sitearch = $(arch)
|
110
|
-
ruby_version =
|
112
|
+
ruby_version = 3.1.0
|
111
113
|
ruby = $(bindir)/$(RUBY_BASE_NAME)
|
112
114
|
RUBY = $(ruby)
|
113
115
|
ruby_headers = $(hdrdir)/ruby.h $(hdrdir)/ruby/backward.h $(hdrdir)/ruby/ruby.h $(hdrdir)/ruby/defines.h $(hdrdir)/ruby/missing.h $(hdrdir)/ruby/intern.h $(hdrdir)/ruby/st.h $(hdrdir)/ruby/subst.h $(arch_hdrdir)/ruby/config.h
|
114
116
|
|
115
117
|
RM = rm -f
|
116
|
-
RM_RF =
|
118
|
+
RM_RF = rm -fr
|
117
119
|
RMDIRS = rmdir --ignore-fail-on-non-empty -p
|
118
120
|
MAKEDIRS = /usr/bin/mkdir -p
|
119
121
|
INSTALL = /usr/bin/install -c
|
@@ -137,11 +139,11 @@ extout =
|
|
137
139
|
extout_prefix =
|
138
140
|
target_prefix =
|
139
141
|
LOCAL_LIBS =
|
140
|
-
LIBS = -
|
141
|
-
ORIG_SRCS = base.cc embedding_feature_extractor.cc embedding_network.cc feature_extractor.cc
|
142
|
+
LIBS = -lm -lc
|
143
|
+
ORIG_SRCS = base.cc embedding_feature_extractor.cc embedding_network.cc feature_extractor.cc feature_types.cc fixunicodevalue.cc fml_parser.cc generated_entities.cc generated_ulscript.cc getonescriptspan.cc lang_id_nn_params.cc language_identifier_features.cc nnet_language_identifier.cc nnet_language_identifier_c.cc offsetmap.cc registry.cc relevant_script_feature.cc sentence_features.cc task_context.cc task_context_params.cc text_processing.cc unicodetext.cc utf8statetable.cc utils.cc workspace.cc
|
142
144
|
SRCS = $(ORIG_SRCS)
|
143
|
-
OBJS = base.o embedding_feature_extractor.o embedding_network.o feature_extractor.o
|
144
|
-
HDRS = $(srcdir)/base.h $(srcdir)/casts.h $(srcdir)/embedding_feature_extractor.h $(srcdir)/embedding_network.h $(srcdir)/embedding_network_params.h $(srcdir)/feature_extractor.h $(srcdir)/feature_types.h $(srcdir)/
|
145
|
+
OBJS = base.o embedding_feature_extractor.o embedding_network.o feature_extractor.o feature_types.o fixunicodevalue.o fml_parser.o generated_entities.o generated_ulscript.o getonescriptspan.o lang_id_nn_params.o language_identifier_features.o nnet_language_identifier.o nnet_language_identifier_c.o offsetmap.o registry.o relevant_script_feature.o sentence_features.o task_context.o task_context_params.o text_processing.o unicodetext.o utf8statetable.o utils.o workspace.o
|
146
|
+
HDRS = $(srcdir)/base.h $(srcdir)/casts.h $(srcdir)/embedding_feature_extractor.h $(srcdir)/embedding_network.h $(srcdir)/embedding_network_params.h $(srcdir)/feature_extractor.h $(srcdir)/feature_types.h $(srcdir)/fixunicodevalue.h $(srcdir)/float16.h $(srcdir)/fml_parser.h $(srcdir)/generated_ulscript.h $(srcdir)/getonescriptspan.h $(srcdir)/integral_types.h $(srcdir)/lang_id_nn_params.h $(srcdir)/language_identifier_features.h $(srcdir)/nnet_language_identifier.h $(srcdir)/offsetmap.h $(srcdir)/port.h $(srcdir)/registry.h $(srcdir)/relevant_script_feature.h $(srcdir)/script_detector.h $(srcdir)/sentence_features.h $(srcdir)/simple_adder.h $(srcdir)/stringpiece.h $(srcdir)/task_context.h $(srcdir)/task_context_params.h $(srcdir)/text_processing.h $(srcdir)/unicodetext.h $(srcdir)/utf8acceptinterchange.h $(srcdir)/utf8prop_lettermarkscriptnum.h $(srcdir)/utf8repl_lettermarklower.h $(srcdir)/utf8scannot_lettermarkspecial.h $(srcdir)/utf8statetable.h $(srcdir)/utils.h $(srcdir)/workspace.h
|
145
147
|
LOCAL_HDRS =
|
146
148
|
TARGET = libcld3
|
147
149
|
TARGET_NAME = libcld3
|
@@ -155,11 +157,11 @@ BINDIR = $(bindir)
|
|
155
157
|
RUBYCOMMONDIR = $(sitedir)$(target_prefix)
|
156
158
|
RUBYLIBDIR = $(sitelibdir)$(target_prefix)
|
157
159
|
RUBYARCHDIR = $(sitearchdir)$(target_prefix)
|
158
|
-
HDRDIR = $(
|
159
|
-
ARCHHDRDIR = $(
|
160
|
+
HDRDIR = $(sitehdrdir)$(target_prefix)
|
161
|
+
ARCHHDRDIR = $(sitearchhdrdir)$(target_prefix)
|
160
162
|
TARGET_SO_DIR =
|
161
163
|
TARGET_SO = $(TARGET_SO_DIR)$(DLLIB)
|
162
|
-
CLEANLIBS = $(TARGET_SO)
|
164
|
+
CLEANLIBS = $(TARGET_SO) false
|
163
165
|
CLEANOBJS = *.o *.bak
|
164
166
|
|
165
167
|
all: $(DLLIB)
|
@@ -172,7 +174,7 @@ clean-rb-default::
|
|
172
174
|
clean-rb::
|
173
175
|
clean-so::
|
174
176
|
clean: clean-so clean-static clean-rb-default clean-rb
|
175
|
-
-$(Q)$(
|
177
|
+
-$(Q)$(RM_RF) $(CLEANLIBS) $(CLEANOBJS) $(CLEANFILES) .*.time
|
176
178
|
|
177
179
|
distclean-rb-default::
|
178
180
|
distclean-rb::
|
data/ext/cld3/base.o
CHANGED
Binary file
|
@@ -0,0 +1,100 @@
|
|
1
|
+
/* Copyright 2022 Akihiko Odaki <akihiko.odaki@gmail.com>
|
2
|
+
All Rights Reserved.
|
3
|
+
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
5
|
+
you may not use this file except in compliance with the License.
|
6
|
+
You may obtain a copy of the License at
|
7
|
+
|
8
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
13
|
+
See the License for the specific language governing permissions and
|
14
|
+
limitations under the License.
|
15
|
+
==============================================================================*/
|
16
|
+
|
17
|
+
#ifndef FEATURE_EXTRACTOR_PB_H_
|
18
|
+
#define FEATURE_EXTRACTOR_PB_H_
|
19
|
+
|
20
|
+
#include <cstdint>
|
21
|
+
#include <string>
|
22
|
+
#include <vector>
|
23
|
+
|
24
|
+
namespace chrome_lang_id {
|
25
|
+
|
26
|
+
class Parameter {
|
27
|
+
public:
|
28
|
+
const std::string& name() const { return name_; }
|
29
|
+
void set_name(std::string value) { name_ = std::move(value); }
|
30
|
+
const std::string& value() const { return value_; }
|
31
|
+
void set_value(std::string value) { value_ = std::move(value); }
|
32
|
+
|
33
|
+
private:
|
34
|
+
std::string name_;
|
35
|
+
std::string value_;
|
36
|
+
};
|
37
|
+
|
38
|
+
class FeatureFunctionDescriptor {
|
39
|
+
public:
|
40
|
+
const std::string& type() const { return type_; }
|
41
|
+
|
42
|
+
void set_type(std::string value) { type_ = std::move(value); }
|
43
|
+
|
44
|
+
const std::string& name() const { return name_; }
|
45
|
+
|
46
|
+
void set_name(std::string value) { name_ = std::move(value); }
|
47
|
+
|
48
|
+
bool has_argument() const { return true; }
|
49
|
+
|
50
|
+
std::int32_t argument() const { return argument_; }
|
51
|
+
|
52
|
+
void set_argument(int32_t value) { argument_ = value; }
|
53
|
+
|
54
|
+
int parameter_size() const { return parameter_.size(); }
|
55
|
+
|
56
|
+
const Parameter& parameter(int index) const { return parameter_[index]; }
|
57
|
+
|
58
|
+
Parameter* add_parameter() { return &parameter_.emplace_back(); }
|
59
|
+
|
60
|
+
int feature_size() const { return feature_.size(); }
|
61
|
+
|
62
|
+
FeatureFunctionDescriptor* mutable_feature(int index) {
|
63
|
+
return &feature_[index];
|
64
|
+
}
|
65
|
+
|
66
|
+
const FeatureFunctionDescriptor& feature(int index) const {
|
67
|
+
return feature_[index];
|
68
|
+
}
|
69
|
+
|
70
|
+
FeatureFunctionDescriptor* add_feature() { return &feature_.emplace_back(); }
|
71
|
+
|
72
|
+
private:
|
73
|
+
std::string type_;
|
74
|
+
std::string name_;
|
75
|
+
std::int32_t argument_;
|
76
|
+
std::vector<Parameter> parameter_;
|
77
|
+
std::vector<FeatureFunctionDescriptor> feature_;
|
78
|
+
};
|
79
|
+
|
80
|
+
class FeatureExtractorDescriptor {
|
81
|
+
public:
|
82
|
+
int feature_size() const { return feature_.size(); }
|
83
|
+
|
84
|
+
FeatureFunctionDescriptor* mutable_feature(int index) {
|
85
|
+
return &feature_[index];
|
86
|
+
}
|
87
|
+
|
88
|
+
const FeatureFunctionDescriptor& feature(int index) const {
|
89
|
+
return feature_[index];
|
90
|
+
}
|
91
|
+
|
92
|
+
FeatureFunctionDescriptor* add_feature() { return &feature_.emplace_back(); }
|
93
|
+
|
94
|
+
private:
|
95
|
+
std::vector<FeatureFunctionDescriptor> feature_;
|
96
|
+
};
|
97
|
+
|
98
|
+
}
|
99
|
+
|
100
|
+
#endif
|
@@ -0,0 +1,35 @@
|
|
1
|
+
/* Copyright 2022 Akihiko Odaki <akihiko.odaki@gmail.com>
|
2
|
+
All Rights Reserved.
|
3
|
+
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
5
|
+
you may not use this file except in compliance with the License.
|
6
|
+
You may obtain a copy of the License at
|
7
|
+
|
8
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
13
|
+
See the License for the specific language governing permissions and
|
14
|
+
limitations under the License.
|
15
|
+
==============================================================================*/
|
16
|
+
|
17
|
+
#ifndef SENTENCE_PB_H_
|
18
|
+
#define SENTENCE_PB_H_
|
19
|
+
|
20
|
+
#include <string>
|
21
|
+
|
22
|
+
namespace chrome_lang_id {
|
23
|
+
|
24
|
+
class Sentence {
|
25
|
+
public:
|
26
|
+
const std::string& text() const { return text_; }
|
27
|
+
void set_text(std::string value) { text_ = std::move(value); }
|
28
|
+
|
29
|
+
private:
|
30
|
+
std::string text_;
|
31
|
+
};
|
32
|
+
|
33
|
+
}
|
34
|
+
|
35
|
+
#endif
|
@@ -0,0 +1,106 @@
|
|
1
|
+
/* Copyright 2022 Akihiko Odaki <akihiko.odaki@gmail.com>
|
2
|
+
All Rights Reserved.
|
3
|
+
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
5
|
+
you may not use this file except in compliance with the License.
|
6
|
+
You may obtain a copy of the License at
|
7
|
+
|
8
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
13
|
+
See the License for the specific language governing permissions and
|
14
|
+
limitations under the License.
|
15
|
+
==============================================================================*/
|
16
|
+
|
17
|
+
#ifndef TASK_SPEC_PB_H_
|
18
|
+
#define TASK_SPEC_PB_H_
|
19
|
+
|
20
|
+
#include <string>
|
21
|
+
#include <vector>
|
22
|
+
|
23
|
+
namespace chrome_lang_id {
|
24
|
+
|
25
|
+
class TaskInput {
|
26
|
+
public:
|
27
|
+
class Part {
|
28
|
+
public:
|
29
|
+
const std::string& file_pattern() const { return file_pattern_; }
|
30
|
+
|
31
|
+
private:
|
32
|
+
std::string file_pattern_;
|
33
|
+
};
|
34
|
+
|
35
|
+
const std::string& name() const { return name_; }
|
36
|
+
|
37
|
+
void set_name(std::string value) { name_ = value; }
|
38
|
+
|
39
|
+
int file_format_size() const { return file_format_.size(); }
|
40
|
+
|
41
|
+
const std::string& file_format(int index) const {
|
42
|
+
return file_format_[index];
|
43
|
+
}
|
44
|
+
|
45
|
+
void add_file_format(std::string value) {
|
46
|
+
file_format_.push_back(std::move(value));
|
47
|
+
}
|
48
|
+
|
49
|
+
int record_format_size() const { return record_format_.size(); }
|
50
|
+
|
51
|
+
const std::string& record_format(int index) const {
|
52
|
+
return record_format_[index];
|
53
|
+
}
|
54
|
+
|
55
|
+
void add_record_format(std::string value) {
|
56
|
+
record_format_.push_back(std::move(value));
|
57
|
+
}
|
58
|
+
|
59
|
+
int part_size() const { return part_.size(); }
|
60
|
+
const Part& part(int index) const { return part_[index]; }
|
61
|
+
|
62
|
+
private:
|
63
|
+
std::string name_;
|
64
|
+
std::vector<std::string> file_format_;
|
65
|
+
std::vector<std::string> record_format_;
|
66
|
+
std::vector<Part> part_;
|
67
|
+
};
|
68
|
+
|
69
|
+
class TaskSpec {
|
70
|
+
public:
|
71
|
+
class Parameter {
|
72
|
+
public:
|
73
|
+
const std::string& name() const { return name_; }
|
74
|
+
void set_name(std::string value) { name_ = std::move(value); }
|
75
|
+
const std::string& value() const { return value_; }
|
76
|
+
void set_value(std::string value) { value_ = std::move(value); }
|
77
|
+
|
78
|
+
private:
|
79
|
+
std::string name_;
|
80
|
+
std::string value_;
|
81
|
+
};
|
82
|
+
|
83
|
+
int parameter_size() const { return parameter_.size(); }
|
84
|
+
|
85
|
+
Parameter* mutable_parameter(int index) { return &parameter_[index]; }
|
86
|
+
|
87
|
+
const Parameter& parameter(int index) const { return parameter_[index]; }
|
88
|
+
|
89
|
+
Parameter* add_parameter() { return &parameter_.emplace_back(); }
|
90
|
+
|
91
|
+
int input_size() const { return input_.size(); }
|
92
|
+
|
93
|
+
TaskInput* mutable_input(int index) { return &input_[index]; }
|
94
|
+
|
95
|
+
const TaskInput& input(int index) const { return input_[index]; }
|
96
|
+
|
97
|
+
TaskInput* add_input() { return &input_.emplace_back(); }
|
98
|
+
|
99
|
+
private:
|
100
|
+
std::vector<Parameter> parameter_;
|
101
|
+
std::vector<TaskInput> input_;
|
102
|
+
};
|
103
|
+
|
104
|
+
}
|
105
|
+
|
106
|
+
#endif
|
Binary file
|
@@ -167,6 +167,7 @@ EmbeddingNetwork::EmbeddingNetwork(const EmbeddingNetworkParams *model)
|
|
167
167
|
for (int i = 0; i < model_->embedding_dim_size(); ++i) {
|
168
168
|
CLD3_DCHECK(offset_sum == model_->concat_offset(i));
|
169
169
|
offset_sum += model_->embedding_dim(i) * model_->embedding_num_features(i);
|
170
|
+
(void)offset_sum; // Avoid compiler warning for "unused" variable.
|
170
171
|
embedding_matrices_.emplace_back(model_->GetEmbeddingMatrix(i));
|
171
172
|
}
|
172
173
|
|
Binary file
|
data/ext/cld3/extconf.rb
CHANGED
@@ -26,17 +26,8 @@ rescue
|
|
26
26
|
end
|
27
27
|
end
|
28
28
|
|
29
|
-
# Check pkg-config first to inform the library is missing if so.
|
30
|
-
pkg_config("protobuf") or abort "Failed to locate protobuf"
|
31
|
-
|
32
|
-
FileUtils.mkdir_p("cld_3/protos")
|
33
29
|
FileUtils.mkdir_p("script_span")
|
34
30
|
|
35
|
-
[ "feature_extractor", "sentence", "task_spec" ].each {|name|
|
36
|
-
system "protoc", "#{name}.proto", "--cpp_out=.", exception: true
|
37
|
-
ln_fallback("#{name}.pb.h", "cld_3/protos/#{name}.pb.h")
|
38
|
-
}
|
39
|
-
|
40
31
|
[
|
41
32
|
"fixunicodevalue.h",
|
42
33
|
"generated_ulscript.h",
|
@@ -55,6 +46,6 @@ FileUtils.mkdir_p("script_span")
|
|
55
46
|
ln_fallback("#{name}", "script_span/#{name}")
|
56
47
|
}
|
57
48
|
|
58
|
-
$CXXFLAGS += " -fvisibility=hidden -std=c++
|
49
|
+
$CXXFLAGS += " -fvisibility=hidden -std=c++17"
|
59
50
|
$LIBRUBYARG = ""
|
60
51
|
create_makefile("libcld3")
|
Binary file
|
data/ext/cld3/feature_types.o
CHANGED
Binary file
|
data/ext/cld3/fixunicodevalue.o
CHANGED
Binary file
|
data/ext/cld3/fml_parser.o
CHANGED
Binary file
|
Binary file
|
Binary file
|
@@ -878,7 +878,6 @@ bool ScriptScanner::GetOneScriptSpan(LangSpan* span) {
|
|
878
878
|
// copying letters to buffer with single spaces for each run of non-letters
|
879
879
|
while (take < byte_length_) {
|
880
880
|
// Copy run of letters in same script (&LS | LS)*
|
881
|
-
int letter_count = 0; // Keep track of word length
|
882
881
|
bool need_break = false;
|
883
882
|
|
884
883
|
while (take < byte_length_) {
|
@@ -963,7 +962,6 @@ bool ScriptScanner::GetOneScriptSpan(LangSpan* span) {
|
|
963
962
|
map2original_.Delete(tlen - plen);
|
964
963
|
}
|
965
964
|
|
966
|
-
++letter_count;
|
967
965
|
if (put >= kMaxScriptBytes) {
|
968
966
|
// Buffer is full
|
969
967
|
span->truncated = true;
|
data/ext/cld3/getonescriptspan.h
CHANGED
@@ -33,14 +33,14 @@ static const int kMaxScriptBytes = kMaxScriptBuffer - 32; // Leave some room
|
|
33
33
|
static const int kWithinScriptTail = 32; // Stop at word space in last
|
34
34
|
// N bytes of script buffer
|
35
35
|
|
36
|
-
|
36
|
+
struct LangSpan {
|
37
37
|
char* text = nullptr; // Pointer to the span, somewhere
|
38
38
|
int text_bytes = 0; // Number of bytes of text in the span
|
39
39
|
int offset = 0; // Offset of start of span in original input buffer
|
40
40
|
ULScript ulscript = UNKNOWN_ULSCRIPT; // Unicode Letters Script of this span
|
41
41
|
bool truncated = false; // true if buffer filled up before a
|
42
42
|
// different script or EOF was found
|
43
|
-
}
|
43
|
+
};
|
44
44
|
|
45
45
|
static inline bool IsContinuationByte(char c) {
|
46
46
|
return static_cast<signed char>(c) < -64;
|
data/ext/cld3/getonescriptspan.o
CHANGED
Binary file
|
Binary file
|
Binary file
|
data/ext/cld3/libcld3.so
CHANGED
Binary file
|
@@ -284,8 +284,6 @@ NNetLanguageIdentifier::FindTopNMostFreqLangs(const string &text,
|
|
284
284
|
CLD2::LangSpan script_span;
|
285
285
|
std::unordered_map<string, LangChunksStats> lang_stats;
|
286
286
|
int total_num_bytes = 0;
|
287
|
-
Result result;
|
288
|
-
string language;
|
289
287
|
int chunk_size = 0; // Use the default.
|
290
288
|
while (ss.GetOneScriptSpanLower(&script_span)) {
|
291
289
|
const int num_original_span_bytes = script_span.text_bytes;
|
@@ -302,8 +300,8 @@ NNetLanguageIdentifier::FindTopNMostFreqLangs(const string &text,
|
|
302
300
|
|
303
301
|
const string selected_text = SelectTextGivenScriptSpan(script_span);
|
304
302
|
|
305
|
-
result = FindLanguageOfValidUTF8(selected_text);
|
306
|
-
language = result.language;
|
303
|
+
Result result = FindLanguageOfValidUTF8(selected_text);
|
304
|
+
string language = result.language;
|
307
305
|
lang_stats[language].byte_sum += num_original_span_bytes;
|
308
306
|
lang_stats[language].prob_sum +=
|
309
307
|
result.probability * num_original_span_bytes;
|
@@ -356,7 +354,7 @@ string NNetLanguageIdentifier::SelectTextGivenBeginAndSize(
|
|
356
354
|
const char *text_begin, int text_size) {
|
357
355
|
string output_text;
|
358
356
|
|
359
|
-
// If the size of the input is greater than the
|
357
|
+
// If the size of the input is greater than the maximum number of bytes needed
|
360
358
|
// for a prediction, then concatenate snippets that are equally spread out
|
361
359
|
// throughout the input.
|
362
360
|
if (text_size > max_num_bytes_) {
|
Binary file
|
Binary file
|
data/ext/cld3/offsetmap.o
CHANGED
Binary file
|
data/ext/cld3/registry.o
CHANGED
Binary file
|
Binary file
|
@@ -0,0 +1,69 @@
|
|
1
|
+
// Copyright 2013 Google Inc. All Rights Reserved.
|
2
|
+
//
|
3
|
+
// Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
// you may not use this file except in compliance with the License.
|
5
|
+
// You may obtain a copy of the License at
|
6
|
+
//
|
7
|
+
// http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
//
|
9
|
+
// Unless required by applicable law or agreed to in writing, software
|
10
|
+
// distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
// See the License for the specific language governing permissions and
|
13
|
+
// limitations under the License.
|
14
|
+
|
15
|
+
//
|
16
|
+
// Routine that maps a Unicode code point to an interchange-valid one
|
17
|
+
//
|
18
|
+
// Table that maps MS CP1252 bytes 00-FF to their corresponding Unicode
|
19
|
+
// code points. C0 and C1 control codes that are not interchange-valid
|
20
|
+
// are mapped to spaces.
|
21
|
+
|
22
|
+
|
23
|
+
#ifndef SCRIPT_SPAN_FIXUNICODEVALUE_H_
|
24
|
+
#define SCRIPT_SPAN_FIXUNICODEVALUE_H_
|
25
|
+
|
26
|
+
#include "integral_types.h" // for char32
|
27
|
+
#include "port.h"
|
28
|
+
|
29
|
+
namespace chrome_lang_id {
|
30
|
+
namespace CLD2 {
|
31
|
+
|
32
|
+
// Map byte value 0000-00FF to char32
|
33
|
+
// Maps C0 control codes (other than CR LF HT FF) to space [29 instances including DEL=0x7F]
|
34
|
+
// Maps C1 control codes to CP1252 [27 instances] or space [5 instances]
|
35
|
+
static const char32 kMapFullMicrosoft1252OrSpace[256] = {
|
36
|
+
0x20,0x20,0x20,0x20, 0x20,0x20,0x20,0x20, 0x20,0x09,0x0a,0x20, 0x0c,0x0d,0x20,0x20, // 00
|
37
|
+
0x20,0x20,0x20,0x20, 0x20,0x20,0x20,0x20, 0x20,0x20,0x20,0x20, 0x20,0x20,0x20,0x20,
|
38
|
+
0x20,0x21,0x22,0x23, 0x24,0x25,0x26,0x27, 0x28,0x29,0x2a,0x2b, 0x2c,0x2d,0x2e,0x2f,
|
39
|
+
0x30,0x31,0x32,0x33, 0x34,0x35,0x36,0x37, 0x38,0x39,0x3a,0x3b, 0x3c,0x3d,0x3e,0x3f,
|
40
|
+
|
41
|
+
0x40,0x41,0x42,0x43, 0x44,0x45,0x46,0x47, 0x48,0x49,0x4a,0x4b, 0x4c,0x4d,0x4e,0x4f, // 40
|
42
|
+
0x50,0x51,0x52,0x53, 0x54,0x55,0x56,0x57, 0x58,0x59,0x5a,0x5b, 0x5c,0x5d,0x5e,0x5f,
|
43
|
+
0x60,0x61,0x62,0x63, 0x64,0x65,0x66,0x67, 0x68,0x69,0x6a,0x6b, 0x6c,0x6d,0x6e,0x6f,
|
44
|
+
0x70,0x71,0x72,0x73, 0x74,0x75,0x76,0x77, 0x78,0x79,0x7a,0x7b, 0x7c,0x7d,0x7e,0x20,
|
45
|
+
|
46
|
+
0x20ac,0x20,0x201a,0x0192, 0x201e,0x2026,0x2020,0x2021, // 80
|
47
|
+
0x02c6,0x2030,0x0160,0x2039, 0x0152,0x20,0x017d,0x20,
|
48
|
+
0x20,0x2018,0x2019,0x201c, 0x201d,0x2022,0x2013,0x2014,
|
49
|
+
0x02dc,0x2122,0x0161,0x203a, 0x0153,0x20,0x017e,0x0178,
|
50
|
+
0xa0,0xa1,0xa2,0xa3, 0xa4,0xa5,0xa6,0xa7, 0xa8,0xa9,0xaa,0xab, 0xac,0xad,0xae,0xaf, // A0
|
51
|
+
0xb0,0xb1,0xb2,0xb3, 0xb4,0xb5,0xb6,0xb7, 0xb8,0xb9,0xba,0xbb, 0xbc,0xbd,0xbe,0xbf,
|
52
|
+
|
53
|
+
0xc0,0xc1,0xc2,0xc3, 0xc4,0xc5,0xc6,0xc7, 0xc8,0xc9,0xca,0xcb, 0xcc,0xcd,0xce,0xcf, // C0
|
54
|
+
0xd0,0xd1,0xd2,0xd3, 0xd4,0xd5,0xd6,0xd7, 0xd8,0xd9,0xda,0xdb, 0xdc,0xdd,0xde,0xdf,
|
55
|
+
0xe0,0xe1,0xe2,0xe3, 0xe4,0xe5,0xe6,0xe7, 0xe8,0xe9,0xea,0xeb, 0xec,0xed,0xee,0xef,
|
56
|
+
0xf0,0xf1,0xf2,0xf3, 0xf4,0xf5,0xf6,0xf7, 0xf8,0xf9,0xfa,0xfb, 0xfc,0xfd,0xfe,0xff,
|
57
|
+
};
|
58
|
+
|
59
|
+
// Guarantees that the resulting output value is interchange valid
|
60
|
+
// 00-FF; map to spaces or MS CP1252
|
61
|
+
// D800-DFFF; surrogates
|
62
|
+
// FDD0-FDEF; non-characters
|
63
|
+
// xxFFFE-xxFFFF; non-characters
|
64
|
+
char32 FixUnicodeValue(char32 uv);
|
65
|
+
|
66
|
+
} // End namespace CLD2
|
67
|
+
} // End namespace chrome_lang_id
|
68
|
+
|
69
|
+
#endif // SCRIPT_SPAN_FIXUNICODEVALUE_H_
|