cld3 3.4.4 → 3.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +0 -1
- data/README.md +4 -7
- data/cld3.gemspec +5 -5
- data/ext/cld3/Makefile +17 -16
- data/ext/cld3/base.o +0 -0
- data/ext/cld3/cld_3/protos/feature_extractor.pb.h +100 -0
- data/ext/cld3/cld_3/protos/sentence.pb.h +35 -0
- data/ext/cld3/cld_3/protos/task_spec.pb.h +106 -0
- data/ext/cld3/embedding_feature_extractor.o +0 -0
- data/ext/cld3/embedding_network.o +0 -0
- data/ext/cld3/extconf.rb +1 -10
- data/ext/cld3/feature_extractor.o +0 -0
- data/ext/cld3/feature_types.o +0 -0
- data/ext/cld3/fixunicodevalue.o +0 -0
- data/ext/cld3/fml_parser.o +0 -0
- data/ext/cld3/generated_entities.o +0 -0
- data/ext/cld3/generated_ulscript.o +0 -0
- data/ext/cld3/getonescriptspan.cc +0 -2
- data/ext/cld3/getonescriptspan.o +0 -0
- data/ext/cld3/lang_id_nn_params.o +0 -0
- data/ext/cld3/language_identifier_features.o +0 -0
- data/ext/cld3/libcld3.so +0 -0
- data/ext/cld3/nnet_language_identifier.o +0 -0
- data/ext/cld3/nnet_language_identifier_c.o +0 -0
- data/ext/cld3/offsetmap.o +0 -0
- data/ext/cld3/registry.o +0 -0
- data/ext/cld3/relevant_script_feature.o +0 -0
- data/ext/cld3/script_span/fixunicodevalue.h +69 -0
- data/ext/cld3/script_span/generated_ulscript.h +142 -0
- data/ext/cld3/script_span/getonescriptspan.h +124 -0
- data/ext/cld3/script_span/integral_types.h +37 -0
- data/ext/cld3/script_span/offsetmap.h +168 -0
- data/ext/cld3/script_span/port.h +143 -0
- data/ext/cld3/script_span/stringpiece.h +81 -0
- data/ext/cld3/script_span/text_processing.h +30 -0
- data/ext/cld3/script_span/utf8acceptinterchange.h +486 -0
- data/ext/cld3/script_span/utf8prop_lettermarkscriptnum.h +1631 -0
- data/ext/cld3/script_span/utf8repl_lettermarklower.h +758 -0
- data/ext/cld3/script_span/utf8scannot_lettermarkspecial.h +1455 -0
- data/ext/cld3/script_span/utf8statetable.h +285 -0
- data/ext/cld3/sentence_features.o +0 -0
- data/ext/cld3/task_context.o +0 -0
- data/ext/cld3/task_context_params.o +0 -0
- data/ext/cld3/text_processing.o +0 -0
- data/ext/cld3/unicodetext.o +0 -0
- data/ext/cld3/utf8statetable.o +0 -0
- data/ext/cld3/utils.o +0 -0
- data/ext/cld3/workspace.o +0 -0
- data/lib/cld3.rb +4 -1
- metadata +33 -25
- data/ext/cld3/feature_extractor.pb.o +0 -0
- data/ext/cld3/feature_extractor.proto +0 -50
- data/ext/cld3/mkmf.log +0 -37
- data/ext/cld3/sentence.pb.o +0 -0
- data/ext/cld3/sentence.proto +0 -77
- data/ext/cld3/task_spec.pb.o +0 -0
- data/ext/cld3/task_spec.proto +0 -98
- data/lib/a.rb +0 -24
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e12ff58a2cc0242896e307f33cb1adb2e210a1889244861e2c751b0399cb6415
|
4
|
+
data.tar.gz: 3b0c348b00126c8ee825a76eed0919e074a7f97e7dc7e647614f7a754c69d716
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 690c6e6f0cc8f0cc369b05a611daeed74adc273cd9c02f780233921ed0c220a425552e7020c127ff0503e88d473f57088cafd3b0d2143f3f86527d886d3eaf63
|
7
|
+
data.tar.gz: 6c3b7c105fa799fb918a077d9d1f47c948ec29bad86e57f452ea9eae82fc4f5bfc2686654a2a68ebc7102dfcb34e2a1ca55648d75a1f6bcf97b532de482de76e
|
data/Gemfile
CHANGED
data/README.md
CHANGED
@@ -19,7 +19,6 @@ cld3.find_language("здравствуйте") # => #<struct Struct::Result lang
|
|
19
19
|
### Prerequisites
|
20
20
|
* [Bundler](http://bundler.io/)
|
21
21
|
* C++ compiler
|
22
|
-
* [Protocol buffers](https://developers.google.com/protocol-buffers/)
|
23
22
|
* [Rake](https://ruby.github.io/rake/)
|
24
23
|
* [RubyGems](https://rubygems.org/)
|
25
24
|
|
@@ -42,17 +41,15 @@ change:
|
|
42
41
|
https://github.com/jruby/jruby/pull/4118/commits/edad375ef4dcf195b19ce0afe4befac66468c736
|
43
42
|
|
44
43
|
### Troubleshooting
|
45
|
-
`gem install cld3` triggers native library building. If it fails,
|
46
|
-
|
47
|
-
|
48
|
-
likely to need [pkg-config](https://www.freedesktop.org/wiki/Software/pkg-config/)
|
49
|
-
as well.
|
44
|
+
`gem install cld3` triggers native library building. If it fails, it is likely
|
45
|
+
that some required facilities are missing. Make sure C++ compiler is installed.
|
46
|
+
I recommend [GCC](https://gcc.gnu.org/) as a C++ compiler.
|
50
47
|
|
51
48
|
Runtime errors are likely to be issues of [FFI](https://github.com/ffi/ffi) or
|
52
49
|
programming errors. Make sure they are all correct.
|
53
50
|
|
54
51
|
If you cannot identify the cause of your problem, run spec of this library and
|
55
|
-
see whether the problem is
|
52
|
+
see whether the problem is reproducible with it or not. Spec is not included in
|
56
53
|
the gem, so clone the source code repository and then run `rake spec`.
|
57
54
|
The source code repository is at
|
58
55
|
https://github.com/akihikodaki/cld3-ruby.
|
data/cld3.gemspec
CHANGED
@@ -16,18 +16,18 @@
|
|
16
16
|
|
17
17
|
Gem::Specification.new do |gem|
|
18
18
|
gem.name = "cld3"
|
19
|
-
gem.version = "3.4.4"
|
19
|
+
gem.version = "3.5.0"
|
20
20
|
gem.summary = "Compact Language Detector v3 (CLD3)"
|
21
21
|
gem.description = "Compact Language Detector v3 (CLD3) is a neural network model for language identification."
|
22
22
|
gem.license = "Apache-2.0"
|
23
23
|
gem.homepage = "https://github.com/akihikodaki/cld3-ruby"
|
24
24
|
gem.author = "Akihiko Odaki"
|
25
25
|
gem.email = "akihiko.odaki@gmail.com"
|
26
|
-
gem.required_ruby_version = [ ">= 2.
|
26
|
+
gem.required_ruby_version = [ ">= 2.7.0", "< 3.3.0" ]
|
27
27
|
gem.add_dependency "ffi", [ ">= 1.1.0", "< 1.16.0" ]
|
28
|
-
gem.add_development_dependency "rbs", [ ">=
|
29
|
-
gem.add_development_dependency "rspec", [ ">=3.
|
30
|
-
gem.add_development_dependency "steep", [ ">= 0.
|
28
|
+
gem.add_development_dependency "rbs", [ ">= 2.6.0", "< 2.7.0" ]
|
29
|
+
gem.add_development_dependency "rspec", [ ">= 3.11.0", "< 3.12.0" ]
|
30
|
+
gem.add_development_dependency "steep", [ ">= 1.0.0", "< 1.1.0" ]
|
31
31
|
gem.files = Dir[
|
32
32
|
"Gemfile", "LICENSE", "LICENSE_CLD3", "README.md",
|
33
33
|
"cld3.gemspec", "ext/**/*", "lib/**/*", "sig/**/*"
|
data/ext/cld3/Makefile
CHANGED
@@ -3,6 +3,7 @@ SHELL = /bin/sh
|
|
3
3
|
|
4
4
|
# V=0 quiet, V=1 verbose. other values don't work.
|
5
5
|
V = 1
|
6
|
+
V0 = $(V:0=)
|
6
7
|
Q1 = $(V:1=)
|
7
8
|
Q = $(Q1:0=@)
|
8
9
|
ECHO1 = $(V:1=@ :)
|
@@ -52,7 +53,7 @@ htmldir = $(docdir)
|
|
52
53
|
infodir = $(DESTDIR)/usr/share/info
|
53
54
|
docdir = $(datarootdir)/doc/$(PACKAGE)
|
54
55
|
oldincludedir = $(DESTDIR)/usr/include
|
55
|
-
includedir = $(
|
56
|
+
includedir = $(exec_prefix)/include
|
56
57
|
runstatedir = $(localstatedir)/run
|
57
58
|
localstatedir = $(DESTDIR)/var
|
58
59
|
sharedstatedir = $(DESTDIR)/var/lib
|
@@ -80,18 +81,18 @@ CSRCFLAG = $(empty)
|
|
80
81
|
RUBY_EXTCONF_H =
|
81
82
|
cflags = $(optflags) $(debugflags) $(warnflags)
|
82
83
|
cxxflags =
|
83
|
-
optflags = -O3
|
84
|
+
optflags = -O3 -fno-fast-math
|
84
85
|
debugflags = -ggdb3
|
85
|
-
warnflags = -Wall -Wextra -Wdeprecated-declarations -Wduplicated-cond -Wimplicit-function-declaration -Wimplicit-int -Wmisleading-indentation -Wpointer-arith -Wwrite-strings -Wimplicit-fallthrough=0 -Wmissing-noreturn -Wno-cast-function-type -Wno-constant-logical-operand -Wno-long-long -Wno-missing-field-initializers -Wno-overlength-strings -Wno-packed-bitfield-compat -Wno-parentheses-equality -Wno-self-assign -Wno-tautological-compare -Wno-unused-parameter -Wno-unused-value -Wsuggest-attribute=format -Wsuggest-attribute=noreturn -Wunused-variable
|
86
|
+
warnflags = -Wall -Wextra -Wdeprecated-declarations -Wduplicated-cond -Wimplicit-function-declaration -Wimplicit-int -Wmisleading-indentation -Wpointer-arith -Wwrite-strings -Wold-style-definition -Wimplicit-fallthrough=0 -Wmissing-noreturn -Wno-cast-function-type -Wno-constant-logical-operand -Wno-long-long -Wno-missing-field-initializers -Wno-overlength-strings -Wno-packed-bitfield-compat -Wno-parentheses-equality -Wno-self-assign -Wno-tautological-compare -Wno-unused-parameter -Wno-unused-value -Wsuggest-attribute=format -Wsuggest-attribute=noreturn -Wunused-variable -Wundef
|
86
87
|
cppflags =
|
87
88
|
CCDLFLAGS = -fPIC
|
88
|
-
CFLAGS = $(CCDLFLAGS) -O2
|
89
|
-
INCFLAGS = -I. -I$(arch_hdrdir) -I$(hdrdir)/ruby/backward -I$(hdrdir) -I$(srcdir)
|
89
|
+
CFLAGS = $(CCDLFLAGS) -O2 -flto=auto -ffat-lto-objects -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/redhat/redhat-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/redhat/redhat-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -fPIC $(ARCH_FLAG)
|
90
|
+
INCFLAGS = -I. -I$(arch_hdrdir) -I$(hdrdir)/ruby/backward -I$(hdrdir) -I$(srcdir)
|
90
91
|
DEFS =
|
91
92
|
CPPFLAGS = $(DEFS) $(cppflags)
|
92
|
-
CXXFLAGS = $(CCDLFLAGS) -O2
|
93
|
-
ldflags = -L. -Wl,-z,relro -Wl,--as-needed -Wl,-z,now -specs=/usr/lib/rpm/redhat/redhat-hardened-ld -fstack-protector-strong -rdynamic -Wl,-export-dynamic
|
94
|
-
dldflags = -Wl,-z,relro -Wl,--as-needed -Wl,-z,now -specs=/usr/lib/rpm/redhat/redhat-hardened-ld
|
93
|
+
CXXFLAGS = $(CCDLFLAGS) -O2 -flto=auto -ffat-lto-objects -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/redhat/redhat-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/redhat/redhat-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -fvisibility=hidden -std=c++17 $(ARCH_FLAG)
|
94
|
+
ldflags = -L. -Wl,-z,relro -Wl,--as-needed -Wl,-z,now -specs=/usr/lib/rpm/redhat/redhat-hardened-ld -specs=/usr/lib/rpm/redhat/redhat-annobin-cc1 -Wl,--build-id=sha1 -fstack-protector-strong -rdynamic -Wl,-export-dynamic -Wl,--no-as-needed
|
95
|
+
dldflags = -Wl,-z,relro -Wl,--as-needed -Wl,-z,now -specs=/usr/lib/rpm/redhat/redhat-hardened-ld -specs=/usr/lib/rpm/redhat/redhat-annobin-cc1 -Wl,--build-id=sha1
|
95
96
|
ARCH_FLAG =
|
96
97
|
DLDFLAGS = $(ldflags) $(dldflags) $(ARCH_FLAG)
|
97
98
|
LDSHARED = $(CC) -shared
|
@@ -108,13 +109,13 @@ RUBY_BASE_NAME = ruby
|
|
108
109
|
|
109
110
|
arch = aarch64-linux
|
110
111
|
sitearch = $(arch)
|
111
|
-
ruby_version = 3.
|
112
|
+
ruby_version = 3.1.0
|
112
113
|
ruby = $(bindir)/$(RUBY_BASE_NAME)
|
113
114
|
RUBY = $(ruby)
|
114
115
|
ruby_headers = $(hdrdir)/ruby.h $(hdrdir)/ruby/backward.h $(hdrdir)/ruby/ruby.h $(hdrdir)/ruby/defines.h $(hdrdir)/ruby/missing.h $(hdrdir)/ruby/intern.h $(hdrdir)/ruby/st.h $(hdrdir)/ruby/subst.h $(arch_hdrdir)/ruby/config.h
|
115
116
|
|
116
117
|
RM = rm -f
|
117
|
-
RM_RF =
|
118
|
+
RM_RF = rm -fr
|
118
119
|
RMDIRS = rmdir --ignore-fail-on-non-empty -p
|
119
120
|
MAKEDIRS = /usr/bin/mkdir -p
|
120
121
|
INSTALL = /usr/bin/install -c
|
@@ -138,11 +139,11 @@ extout =
|
|
138
139
|
extout_prefix =
|
139
140
|
target_prefix =
|
140
141
|
LOCAL_LIBS =
|
141
|
-
LIBS = -
|
142
|
-
ORIG_SRCS = base.cc embedding_feature_extractor.cc embedding_network.cc feature_extractor.cc
|
142
|
+
LIBS = -lm -lc
|
143
|
+
ORIG_SRCS = base.cc embedding_feature_extractor.cc embedding_network.cc feature_extractor.cc feature_types.cc fixunicodevalue.cc fml_parser.cc generated_entities.cc generated_ulscript.cc getonescriptspan.cc lang_id_nn_params.cc language_identifier_features.cc nnet_language_identifier.cc nnet_language_identifier_c.cc offsetmap.cc registry.cc relevant_script_feature.cc sentence_features.cc task_context.cc task_context_params.cc text_processing.cc unicodetext.cc utf8statetable.cc utils.cc workspace.cc
|
143
144
|
SRCS = $(ORIG_SRCS)
|
144
|
-
OBJS = base.o embedding_feature_extractor.o embedding_network.o feature_extractor.o
|
145
|
-
HDRS = $(srcdir)/base.h $(srcdir)/casts.h $(srcdir)/embedding_feature_extractor.h $(srcdir)/embedding_network.h $(srcdir)/embedding_network_params.h $(srcdir)/feature_extractor.h $(srcdir)/
|
145
|
+
OBJS = base.o embedding_feature_extractor.o embedding_network.o feature_extractor.o feature_types.o fixunicodevalue.o fml_parser.o generated_entities.o generated_ulscript.o getonescriptspan.o lang_id_nn_params.o language_identifier_features.o nnet_language_identifier.o nnet_language_identifier_c.o offsetmap.o registry.o relevant_script_feature.o sentence_features.o task_context.o task_context_params.o text_processing.o unicodetext.o utf8statetable.o utils.o workspace.o
|
146
|
+
HDRS = $(srcdir)/base.h $(srcdir)/casts.h $(srcdir)/embedding_feature_extractor.h $(srcdir)/embedding_network.h $(srcdir)/embedding_network_params.h $(srcdir)/feature_extractor.h $(srcdir)/feature_types.h $(srcdir)/fixunicodevalue.h $(srcdir)/float16.h $(srcdir)/fml_parser.h $(srcdir)/generated_ulscript.h $(srcdir)/getonescriptspan.h $(srcdir)/integral_types.h $(srcdir)/lang_id_nn_params.h $(srcdir)/language_identifier_features.h $(srcdir)/nnet_language_identifier.h $(srcdir)/offsetmap.h $(srcdir)/port.h $(srcdir)/registry.h $(srcdir)/relevant_script_feature.h $(srcdir)/script_detector.h $(srcdir)/sentence_features.h $(srcdir)/simple_adder.h $(srcdir)/stringpiece.h $(srcdir)/task_context.h $(srcdir)/task_context_params.h $(srcdir)/text_processing.h $(srcdir)/unicodetext.h $(srcdir)/utf8acceptinterchange.h $(srcdir)/utf8prop_lettermarkscriptnum.h $(srcdir)/utf8repl_lettermarklower.h $(srcdir)/utf8scannot_lettermarkspecial.h $(srcdir)/utf8statetable.h $(srcdir)/utils.h $(srcdir)/workspace.h
|
146
147
|
LOCAL_HDRS =
|
147
148
|
TARGET = libcld3
|
148
149
|
TARGET_NAME = libcld3
|
@@ -160,7 +161,7 @@ HDRDIR = $(sitehdrdir)$(target_prefix)
|
|
160
161
|
ARCHHDRDIR = $(sitearchhdrdir)$(target_prefix)
|
161
162
|
TARGET_SO_DIR =
|
162
163
|
TARGET_SO = $(TARGET_SO_DIR)$(DLLIB)
|
163
|
-
CLEANLIBS = $(TARGET_SO)
|
164
|
+
CLEANLIBS = $(TARGET_SO) false
|
164
165
|
CLEANOBJS = *.o *.bak
|
165
166
|
|
166
167
|
all: $(DLLIB)
|
@@ -173,7 +174,7 @@ clean-rb-default::
|
|
173
174
|
clean-rb::
|
174
175
|
clean-so::
|
175
176
|
clean: clean-so clean-static clean-rb-default clean-rb
|
176
|
-
-$(Q)$(
|
177
|
+
-$(Q)$(RM_RF) $(CLEANLIBS) $(CLEANOBJS) $(CLEANFILES) .*.time
|
177
178
|
|
178
179
|
distclean-rb-default::
|
179
180
|
distclean-rb::
|
data/ext/cld3/base.o
CHANGED
Binary file
|
@@ -0,0 +1,100 @@
|
|
1
|
+
/* Copyright 2022 Akihiko Odaki <akihiko.odaki@gmail.com>
|
2
|
+
All Rights Reserved.
|
3
|
+
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
5
|
+
you may not use this file except in compliance with the License.
|
6
|
+
You may obtain a copy of the License at
|
7
|
+
|
8
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
13
|
+
See the License for the specific language governing permissions and
|
14
|
+
limitations under the License.
|
15
|
+
==============================================================================*/
|
16
|
+
|
17
|
+
#ifndef FEATURE_EXTRACTOR_PB_H_
|
18
|
+
#define FEATURE_EXTRACTOR_PB_H_
|
19
|
+
|
20
|
+
#include <cstdint>
|
21
|
+
#include <string>
|
22
|
+
#include <vector>
|
23
|
+
|
24
|
+
namespace chrome_lang_id {
|
25
|
+
|
26
|
+
class Parameter {
|
27
|
+
public:
|
28
|
+
const std::string& name() const { return name_; }
|
29
|
+
void set_name(std::string value) { name_ = std::move(value); }
|
30
|
+
const std::string& value() const { return value_; }
|
31
|
+
void set_value(std::string value) { value_ = std::move(value); }
|
32
|
+
|
33
|
+
private:
|
34
|
+
std::string name_;
|
35
|
+
std::string value_;
|
36
|
+
};
|
37
|
+
|
38
|
+
class FeatureFunctionDescriptor {
|
39
|
+
public:
|
40
|
+
const std::string& type() const { return type_; }
|
41
|
+
|
42
|
+
void set_type(std::string value) { type_ = std::move(value); }
|
43
|
+
|
44
|
+
const std::string& name() const { return name_; }
|
45
|
+
|
46
|
+
void set_name(std::string value) { name_ = std::move(value); }
|
47
|
+
|
48
|
+
bool has_argument() const { return true; }
|
49
|
+
|
50
|
+
std::int32_t argument() const { return argument_; }
|
51
|
+
|
52
|
+
void set_argument(int32_t value) { argument_ = value; }
|
53
|
+
|
54
|
+
int parameter_size() const { return parameter_.size(); }
|
55
|
+
|
56
|
+
const Parameter& parameter(int index) const { return parameter_[index]; }
|
57
|
+
|
58
|
+
Parameter* add_parameter() { return &parameter_.emplace_back(); }
|
59
|
+
|
60
|
+
int feature_size() const { return feature_.size(); }
|
61
|
+
|
62
|
+
FeatureFunctionDescriptor* mutable_feature(int index) {
|
63
|
+
return &feature_[index];
|
64
|
+
}
|
65
|
+
|
66
|
+
const FeatureFunctionDescriptor& feature(int index) const {
|
67
|
+
return feature_[index];
|
68
|
+
}
|
69
|
+
|
70
|
+
FeatureFunctionDescriptor* add_feature() { return &feature_.emplace_back(); }
|
71
|
+
|
72
|
+
private:
|
73
|
+
std::string type_;
|
74
|
+
std::string name_;
|
75
|
+
std::int32_t argument_;
|
76
|
+
std::vector<Parameter> parameter_;
|
77
|
+
std::vector<FeatureFunctionDescriptor> feature_;
|
78
|
+
};
|
79
|
+
|
80
|
+
class FeatureExtractorDescriptor {
|
81
|
+
public:
|
82
|
+
int feature_size() const { return feature_.size(); }
|
83
|
+
|
84
|
+
FeatureFunctionDescriptor* mutable_feature(int index) {
|
85
|
+
return &feature_[index];
|
86
|
+
}
|
87
|
+
|
88
|
+
const FeatureFunctionDescriptor& feature(int index) const {
|
89
|
+
return feature_[index];
|
90
|
+
}
|
91
|
+
|
92
|
+
FeatureFunctionDescriptor* add_feature() { return &feature_.emplace_back(); }
|
93
|
+
|
94
|
+
private:
|
95
|
+
std::vector<FeatureFunctionDescriptor> feature_;
|
96
|
+
};
|
97
|
+
|
98
|
+
}
|
99
|
+
|
100
|
+
#endif
|
@@ -0,0 +1,35 @@
|
|
1
|
+
/* Copyright 2022 Akihiko Odaki <akihiko.odaki@gmail.com>
|
2
|
+
All Rights Reserved.
|
3
|
+
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
5
|
+
you may not use this file except in compliance with the License.
|
6
|
+
You may obtain a copy of the License at
|
7
|
+
|
8
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
13
|
+
See the License for the specific language governing permissions and
|
14
|
+
limitations under the License.
|
15
|
+
==============================================================================*/
|
16
|
+
|
17
|
+
#ifndef SENTENCE_PB_H_
|
18
|
+
#define SENTENCE_PB_H_
|
19
|
+
|
20
|
+
#include <string>
|
21
|
+
|
22
|
+
namespace chrome_lang_id {
|
23
|
+
|
24
|
+
class Sentence {
|
25
|
+
public:
|
26
|
+
const std::string& text() const { return text_; }
|
27
|
+
void set_text(std::string value) { text_ = std::move(value); }
|
28
|
+
|
29
|
+
private:
|
30
|
+
std::string text_;
|
31
|
+
};
|
32
|
+
|
33
|
+
}
|
34
|
+
|
35
|
+
#endif
|
@@ -0,0 +1,106 @@
|
|
1
|
+
/* Copyright 2022 Akihiko Odaki <akihiko.odaki@gmail.com>
|
2
|
+
All Rights Reserved.
|
3
|
+
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
5
|
+
you may not use this file except in compliance with the License.
|
6
|
+
You may obtain a copy of the License at
|
7
|
+
|
8
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
13
|
+
See the License for the specific language governing permissions and
|
14
|
+
limitations under the License.
|
15
|
+
==============================================================================*/
|
16
|
+
|
17
|
+
#ifndef TASK_SPEC_PB_H_
|
18
|
+
#define TASK_SPEC_PB_H_
|
19
|
+
|
20
|
+
#include <string>
|
21
|
+
#include <vector>
|
22
|
+
|
23
|
+
namespace chrome_lang_id {
|
24
|
+
|
25
|
+
class TaskInput {
|
26
|
+
public:
|
27
|
+
class Part {
|
28
|
+
public:
|
29
|
+
const std::string& file_pattern() const { return file_pattern_; }
|
30
|
+
|
31
|
+
private:
|
32
|
+
std::string file_pattern_;
|
33
|
+
};
|
34
|
+
|
35
|
+
const std::string& name() const { return name_; }
|
36
|
+
|
37
|
+
void set_name(std::string value) { name_ = value; }
|
38
|
+
|
39
|
+
int file_format_size() const { return file_format_.size(); }
|
40
|
+
|
41
|
+
const std::string& file_format(int index) const {
|
42
|
+
return file_format_[index];
|
43
|
+
}
|
44
|
+
|
45
|
+
void add_file_format(std::string value) {
|
46
|
+
file_format_.push_back(std::move(value));
|
47
|
+
}
|
48
|
+
|
49
|
+
int record_format_size() const { return record_format_.size(); }
|
50
|
+
|
51
|
+
const std::string& record_format(int index) const {
|
52
|
+
return record_format_[index];
|
53
|
+
}
|
54
|
+
|
55
|
+
void add_record_format(std::string value) {
|
56
|
+
record_format_.push_back(std::move(value));
|
57
|
+
}
|
58
|
+
|
59
|
+
int part_size() const { return part_.size(); }
|
60
|
+
const Part& part(int index) const { return part_[index]; }
|
61
|
+
|
62
|
+
private:
|
63
|
+
std::string name_;
|
64
|
+
std::vector<std::string> file_format_;
|
65
|
+
std::vector<std::string> record_format_;
|
66
|
+
std::vector<Part> part_;
|
67
|
+
};
|
68
|
+
|
69
|
+
class TaskSpec {
|
70
|
+
public:
|
71
|
+
class Parameter {
|
72
|
+
public:
|
73
|
+
const std::string& name() const { return name_; }
|
74
|
+
void set_name(std::string value) { name_ = std::move(value); }
|
75
|
+
const std::string& value() const { return value_; }
|
76
|
+
void set_value(std::string value) { value_ = std::move(value); }
|
77
|
+
|
78
|
+
private:
|
79
|
+
std::string name_;
|
80
|
+
std::string value_;
|
81
|
+
};
|
82
|
+
|
83
|
+
int parameter_size() const { return parameter_.size(); }
|
84
|
+
|
85
|
+
Parameter* mutable_parameter(int index) { return &parameter_[index]; }
|
86
|
+
|
87
|
+
const Parameter& parameter(int index) const { return parameter_[index]; }
|
88
|
+
|
89
|
+
Parameter* add_parameter() { return &parameter_.emplace_back(); }
|
90
|
+
|
91
|
+
int input_size() const { return input_.size(); }
|
92
|
+
|
93
|
+
TaskInput* mutable_input(int index) { return &input_[index]; }
|
94
|
+
|
95
|
+
const TaskInput& input(int index) const { return input_[index]; }
|
96
|
+
|
97
|
+
TaskInput* add_input() { return &input_.emplace_back(); }
|
98
|
+
|
99
|
+
private:
|
100
|
+
std::vector<Parameter> parameter_;
|
101
|
+
std::vector<TaskInput> input_;
|
102
|
+
};
|
103
|
+
|
104
|
+
}
|
105
|
+
|
106
|
+
#endif
|
Binary file
|
Binary file
|
data/ext/cld3/extconf.rb
CHANGED
@@ -26,17 +26,8 @@ rescue
|
|
26
26
|
end
|
27
27
|
end
|
28
28
|
|
29
|
-
# Check pkg-config first to inform the library is missing if so.
|
30
|
-
pkg_config("protobuf") or abort "Failed to locate protobuf"
|
31
|
-
|
32
|
-
FileUtils.mkdir_p("cld_3/protos")
|
33
29
|
FileUtils.mkdir_p("script_span")
|
34
30
|
|
35
|
-
[ "feature_extractor", "sentence", "task_spec" ].each {|name|
|
36
|
-
system "protoc", "#{name}.proto", "--cpp_out=.", exception: true
|
37
|
-
ln_fallback("#{name}.pb.h", "cld_3/protos/#{name}.pb.h")
|
38
|
-
}
|
39
|
-
|
40
31
|
[
|
41
32
|
"fixunicodevalue.h",
|
42
33
|
"generated_ulscript.h",
|
@@ -55,6 +46,6 @@ FileUtils.mkdir_p("script_span")
|
|
55
46
|
ln_fallback("#{name}", "script_span/#{name}")
|
56
47
|
}
|
57
48
|
|
58
|
-
$CXXFLAGS += " -fvisibility=hidden -std=c++
|
49
|
+
$CXXFLAGS += " -fvisibility=hidden -std=c++17"
|
59
50
|
$LIBRUBYARG = ""
|
60
51
|
create_makefile("libcld3")
|
Binary file
|
data/ext/cld3/feature_types.o
CHANGED
Binary file
|
data/ext/cld3/fixunicodevalue.o
CHANGED
Binary file
|
data/ext/cld3/fml_parser.o
CHANGED
Binary file
|
Binary file
|
Binary file
|
@@ -878,7 +878,6 @@ bool ScriptScanner::GetOneScriptSpan(LangSpan* span) {
|
|
878
878
|
// copying letters to buffer with single spaces for each run of non-letters
|
879
879
|
while (take < byte_length_) {
|
880
880
|
// Copy run of letters in same script (&LS | LS)*
|
881
|
-
int letter_count = 0; // Keep track of word length
|
882
881
|
bool need_break = false;
|
883
882
|
|
884
883
|
while (take < byte_length_) {
|
@@ -963,7 +962,6 @@ bool ScriptScanner::GetOneScriptSpan(LangSpan* span) {
|
|
963
962
|
map2original_.Delete(tlen - plen);
|
964
963
|
}
|
965
964
|
|
966
|
-
++letter_count;
|
967
965
|
if (put >= kMaxScriptBytes) {
|
968
966
|
// Buffer is full
|
969
967
|
span->truncated = true;
|
data/ext/cld3/getonescriptspan.o
CHANGED
Binary file
|
Binary file
|
Binary file
|
data/ext/cld3/libcld3.so
CHANGED
Binary file
|
Binary file
|
Binary file
|
data/ext/cld3/offsetmap.o
CHANGED
Binary file
|
data/ext/cld3/registry.o
CHANGED
Binary file
|
Binary file
|
@@ -0,0 +1,69 @@
|
|
1
|
+
// Copyright 2013 Google Inc. All Rights Reserved.
|
2
|
+
//
|
3
|
+
// Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
// you may not use this file except in compliance with the License.
|
5
|
+
// You may obtain a copy of the License at
|
6
|
+
//
|
7
|
+
// http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
//
|
9
|
+
// Unless required by applicable law or agreed to in writing, software
|
10
|
+
// distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
// See the License for the specific language governing permissions and
|
13
|
+
// limitations under the License.
|
14
|
+
|
15
|
+
//
|
16
|
+
// Routine that maps a Unicode code point to an interchange-valid one
|
17
|
+
//
|
18
|
+
// Table that maps MS CP1252 bytes 00-FF to their corresponding Unicode
|
19
|
+
// code points. C0 and C1 control codes that are not interchange-valid
|
20
|
+
// are mapped to spaces.
|
21
|
+
|
22
|
+
|
23
|
+
#ifndef SCRIPT_SPAN_FIXUNICODEVALUE_H_
|
24
|
+
#define SCRIPT_SPAN_FIXUNICODEVALUE_H_
|
25
|
+
|
26
|
+
#include "integral_types.h" // for char32
|
27
|
+
#include "port.h"
|
28
|
+
|
29
|
+
namespace chrome_lang_id {
|
30
|
+
namespace CLD2 {
|
31
|
+
|
32
|
+
// Map byte value 0000-00FF to char32
|
33
|
+
// Maps C0 control codes (other than CR LF HT FF) to space [29 instances including DEL=0x7F]
|
34
|
+
// Maps C1 control codes to CP1252 [27 instances] or space [5 instances]
|
35
|
+
static const char32 kMapFullMicrosoft1252OrSpace[256] = {
|
36
|
+
0x20,0x20,0x20,0x20, 0x20,0x20,0x20,0x20, 0x20,0x09,0x0a,0x20, 0x0c,0x0d,0x20,0x20, // 00
|
37
|
+
0x20,0x20,0x20,0x20, 0x20,0x20,0x20,0x20, 0x20,0x20,0x20,0x20, 0x20,0x20,0x20,0x20,
|
38
|
+
0x20,0x21,0x22,0x23, 0x24,0x25,0x26,0x27, 0x28,0x29,0x2a,0x2b, 0x2c,0x2d,0x2e,0x2f,
|
39
|
+
0x30,0x31,0x32,0x33, 0x34,0x35,0x36,0x37, 0x38,0x39,0x3a,0x3b, 0x3c,0x3d,0x3e,0x3f,
|
40
|
+
|
41
|
+
0x40,0x41,0x42,0x43, 0x44,0x45,0x46,0x47, 0x48,0x49,0x4a,0x4b, 0x4c,0x4d,0x4e,0x4f, // 40
|
42
|
+
0x50,0x51,0x52,0x53, 0x54,0x55,0x56,0x57, 0x58,0x59,0x5a,0x5b, 0x5c,0x5d,0x5e,0x5f,
|
43
|
+
0x60,0x61,0x62,0x63, 0x64,0x65,0x66,0x67, 0x68,0x69,0x6a,0x6b, 0x6c,0x6d,0x6e,0x6f,
|
44
|
+
0x70,0x71,0x72,0x73, 0x74,0x75,0x76,0x77, 0x78,0x79,0x7a,0x7b, 0x7c,0x7d,0x7e,0x20,
|
45
|
+
|
46
|
+
0x20ac,0x20,0x201a,0x0192, 0x201e,0x2026,0x2020,0x2021, // 80
|
47
|
+
0x02c6,0x2030,0x0160,0x2039, 0x0152,0x20,0x017d,0x20,
|
48
|
+
0x20,0x2018,0x2019,0x201c, 0x201d,0x2022,0x2013,0x2014,
|
49
|
+
0x02dc,0x2122,0x0161,0x203a, 0x0153,0x20,0x017e,0x0178,
|
50
|
+
0xa0,0xa1,0xa2,0xa3, 0xa4,0xa5,0xa6,0xa7, 0xa8,0xa9,0xaa,0xab, 0xac,0xad,0xae,0xaf, // A0
|
51
|
+
0xb0,0xb1,0xb2,0xb3, 0xb4,0xb5,0xb6,0xb7, 0xb8,0xb9,0xba,0xbb, 0xbc,0xbd,0xbe,0xbf,
|
52
|
+
|
53
|
+
0xc0,0xc1,0xc2,0xc3, 0xc4,0xc5,0xc6,0xc7, 0xc8,0xc9,0xca,0xcb, 0xcc,0xcd,0xce,0xcf, // C0
|
54
|
+
0xd0,0xd1,0xd2,0xd3, 0xd4,0xd5,0xd6,0xd7, 0xd8,0xd9,0xda,0xdb, 0xdc,0xdd,0xde,0xdf,
|
55
|
+
0xe0,0xe1,0xe2,0xe3, 0xe4,0xe5,0xe6,0xe7, 0xe8,0xe9,0xea,0xeb, 0xec,0xed,0xee,0xef,
|
56
|
+
0xf0,0xf1,0xf2,0xf3, 0xf4,0xf5,0xf6,0xf7, 0xf8,0xf9,0xfa,0xfb, 0xfc,0xfd,0xfe,0xff,
|
57
|
+
};
|
58
|
+
|
59
|
+
// Guarantees that the resulting output value is interchange valid
|
60
|
+
// 00-FF; map to spaces or MS CP1252
|
61
|
+
// D800-DFFF; surrogates
|
62
|
+
// FDD0-FDEF; non-characters
|
63
|
+
// xxFFFE-xxFFFF; non-characters
|
64
|
+
char32 FixUnicodeValue(char32 uv);
|
65
|
+
|
66
|
+
} // End namespace CLD2
|
67
|
+
} // End namespace chrome_lang_id
|
68
|
+
|
69
|
+
#endif // SCRIPT_SPAN_FIXUNICODEVALUE_H_
|