cld3 3.4.4 → 3.5.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile +0 -1
- data/README.md +4 -7
- data/cld3.gemspec +5 -5
- data/ext/cld3/Makefile +17 -16
- data/ext/cld3/base.o +0 -0
- data/ext/cld3/cld_3/protos/feature_extractor.pb.h +100 -0
- data/ext/cld3/cld_3/protos/sentence.pb.h +35 -0
- data/ext/cld3/cld_3/protos/task_spec.pb.h +106 -0
- data/ext/cld3/embedding_feature_extractor.o +0 -0
- data/ext/cld3/embedding_network.o +0 -0
- data/ext/cld3/extconf.rb +1 -10
- data/ext/cld3/feature_extractor.o +0 -0
- data/ext/cld3/feature_types.o +0 -0
- data/ext/cld3/fixunicodevalue.o +0 -0
- data/ext/cld3/fml_parser.o +0 -0
- data/ext/cld3/generated_entities.o +0 -0
- data/ext/cld3/generated_ulscript.o +0 -0
- data/ext/cld3/getonescriptspan.cc +0 -2
- data/ext/cld3/getonescriptspan.o +0 -0
- data/ext/cld3/lang_id_nn_params.o +0 -0
- data/ext/cld3/language_identifier_features.o +0 -0
- data/ext/cld3/libcld3.so +0 -0
- data/ext/cld3/nnet_language_identifier.o +0 -0
- data/ext/cld3/nnet_language_identifier_c.o +0 -0
- data/ext/cld3/offsetmap.o +0 -0
- data/ext/cld3/registry.o +0 -0
- data/ext/cld3/relevant_script_feature.o +0 -0
- data/ext/cld3/script_span/fixunicodevalue.h +69 -0
- data/ext/cld3/script_span/generated_ulscript.h +142 -0
- data/ext/cld3/script_span/getonescriptspan.h +124 -0
- data/ext/cld3/script_span/integral_types.h +37 -0
- data/ext/cld3/script_span/offsetmap.h +168 -0
- data/ext/cld3/script_span/port.h +143 -0
- data/ext/cld3/script_span/stringpiece.h +81 -0
- data/ext/cld3/script_span/text_processing.h +30 -0
- data/ext/cld3/script_span/utf8acceptinterchange.h +486 -0
- data/ext/cld3/script_span/utf8prop_lettermarkscriptnum.h +1631 -0
- data/ext/cld3/script_span/utf8repl_lettermarklower.h +758 -0
- data/ext/cld3/script_span/utf8scannot_lettermarkspecial.h +1455 -0
- data/ext/cld3/script_span/utf8statetable.h +285 -0
- data/ext/cld3/sentence_features.o +0 -0
- data/ext/cld3/task_context.o +0 -0
- data/ext/cld3/task_context_params.o +0 -0
- data/ext/cld3/text_processing.o +0 -0
- data/ext/cld3/unicodetext.o +0 -0
- data/ext/cld3/utf8statetable.o +0 -0
- data/ext/cld3/utils.o +0 -0
- data/ext/cld3/workspace.o +0 -0
- data/lib/cld3.rb +4 -1
- metadata +33 -25
- data/ext/cld3/feature_extractor.pb.o +0 -0
- data/ext/cld3/feature_extractor.proto +0 -50
- data/ext/cld3/mkmf.log +0 -37
- data/ext/cld3/sentence.pb.o +0 -0
- data/ext/cld3/sentence.proto +0 -77
- data/ext/cld3/task_spec.pb.o +0 -0
- data/ext/cld3/task_spec.proto +0 -98
- data/lib/a.rb +0 -24
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e12ff58a2cc0242896e307f33cb1adb2e210a1889244861e2c751b0399cb6415
|
4
|
+
data.tar.gz: 3b0c348b00126c8ee825a76eed0919e074a7f97e7dc7e647614f7a754c69d716
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 690c6e6f0cc8f0cc369b05a611daeed74adc273cd9c02f780233921ed0c220a425552e7020c127ff0503e88d473f57088cafd3b0d2143f3f86527d886d3eaf63
|
7
|
+
data.tar.gz: 6c3b7c105fa799fb918a077d9d1f47c948ec29bad86e57f452ea9eae82fc4f5bfc2686654a2a68ebc7102dfcb34e2a1ca55648d75a1f6bcf97b532de482de76e
|
data/Gemfile
CHANGED
data/README.md
CHANGED
@@ -19,7 +19,6 @@ cld3.find_language("здравствуйте") # => #<struct Struct::Result lang
|
|
19
19
|
### Prerequisites
|
20
20
|
* [Bundler](http://bundler.io/)
|
21
21
|
* C++ compiler
|
22
|
-
* [Protocol buffers](https://developers.google.com/protocol-buffers/)
|
23
22
|
* [Rake](https://ruby.github.io/rake/)
|
24
23
|
* [RubyGems](https://rubygems.org/)
|
25
24
|
|
@@ -42,17 +41,15 @@ change:
|
|
42
41
|
https://github.com/jruby/jruby/pull/4118/commits/edad375ef4dcf195b19ce0afe4befac66468c736
|
43
42
|
|
44
43
|
### Troubleshooting
|
45
|
-
`gem install cld3` triggers native library building. If it fails,
|
46
|
-
|
47
|
-
|
48
|
-
likely to need [pkg-config](https://www.freedesktop.org/wiki/Software/pkg-config/)
|
49
|
-
as well.
|
44
|
+
`gem install cld3` triggers native library building. If it fails, it is likely
|
45
|
+
that some required facilities are missing. Make sure C++ compiler is installed.
|
46
|
+
I recommend [GCC](https://gcc.gnu.org/) as a C++ compiler.
|
50
47
|
|
51
48
|
Runtime errors are likely to be issues of [FFI](https://github.com/ffi/ffi) or
|
52
49
|
programming errors. Make sure they are all correct.
|
53
50
|
|
54
51
|
If you cannot identify the cause of your problem, run spec of this library and
|
55
|
-
see whether the problem is
|
52
|
+
see whether the problem is reproducible with it or not. Spec is not included in
|
56
53
|
the gem, so clone the source code repository and then run `rake spec`.
|
57
54
|
The source code repository is at
|
58
55
|
https://github.com/akihikodaki/cld3-ruby.
|
data/cld3.gemspec
CHANGED
@@ -16,18 +16,18 @@
|
|
16
16
|
|
17
17
|
Gem::Specification.new do |gem|
|
18
18
|
gem.name = "cld3"
|
19
|
-
gem.version = "3.4.4"
|
19
|
+
gem.version = "3.5.0"
|
20
20
|
gem.summary = "Compact Language Detector v3 (CLD3)"
|
21
21
|
gem.description = "Compact Language Detector v3 (CLD3) is a neural network model for language identification."
|
22
22
|
gem.license = "Apache-2.0"
|
23
23
|
gem.homepage = "https://github.com/akihikodaki/cld3-ruby"
|
24
24
|
gem.author = "Akihiko Odaki"
|
25
25
|
gem.email = "akihiko.odaki@gmail.com"
|
26
|
-
gem.required_ruby_version = [ ">= 2.
|
26
|
+
gem.required_ruby_version = [ ">= 2.7.0", "< 3.3.0" ]
|
27
27
|
gem.add_dependency "ffi", [ ">= 1.1.0", "< 1.16.0" ]
|
28
|
-
gem.add_development_dependency "rbs", [ ">=
|
29
|
-
gem.add_development_dependency "rspec", [ ">=3.
|
30
|
-
gem.add_development_dependency "steep", [ ">= 0.
|
28
|
+
gem.add_development_dependency "rbs", [ ">= 2.6.0", "< 2.7.0" ]
|
29
|
+
gem.add_development_dependency "rspec", [ ">= 3.11.0", "< 3.12.0" ]
|
30
|
+
gem.add_development_dependency "steep", [ ">= 1.0.0", "< 1.1.0" ]
|
31
31
|
gem.files = Dir[
|
32
32
|
"Gemfile", "LICENSE", "LICENSE_CLD3", "README.md",
|
33
33
|
"cld3.gemspec", "ext/**/*", "lib/**/*", "sig/**/*"
|
data/ext/cld3/Makefile
CHANGED
@@ -3,6 +3,7 @@ SHELL = /bin/sh
|
|
3
3
|
|
4
4
|
# V=0 quiet, V=1 verbose. other values don't work.
|
5
5
|
V = 1
|
6
|
+
V0 = $(V:0=)
|
6
7
|
Q1 = $(V:1=)
|
7
8
|
Q = $(Q1:0=@)
|
8
9
|
ECHO1 = $(V:1=@ :)
|
@@ -52,7 +53,7 @@ htmldir = $(docdir)
|
|
52
53
|
infodir = $(DESTDIR)/usr/share/info
|
53
54
|
docdir = $(datarootdir)/doc/$(PACKAGE)
|
54
55
|
oldincludedir = $(DESTDIR)/usr/include
|
55
|
-
includedir = $(
|
56
|
+
includedir = $(exec_prefix)/include
|
56
57
|
runstatedir = $(localstatedir)/run
|
57
58
|
localstatedir = $(DESTDIR)/var
|
58
59
|
sharedstatedir = $(DESTDIR)/var/lib
|
@@ -80,18 +81,18 @@ CSRCFLAG = $(empty)
|
|
80
81
|
RUBY_EXTCONF_H =
|
81
82
|
cflags = $(optflags) $(debugflags) $(warnflags)
|
82
83
|
cxxflags =
|
83
|
-
optflags = -O3
|
84
|
+
optflags = -O3 -fno-fast-math
|
84
85
|
debugflags = -ggdb3
|
85
|
-
warnflags = -Wall -Wextra -Wdeprecated-declarations -Wduplicated-cond -Wimplicit-function-declaration -Wimplicit-int -Wmisleading-indentation -Wpointer-arith -Wwrite-strings -Wimplicit-fallthrough=0 -Wmissing-noreturn -Wno-cast-function-type -Wno-constant-logical-operand -Wno-long-long -Wno-missing-field-initializers -Wno-overlength-strings -Wno-packed-bitfield-compat -Wno-parentheses-equality -Wno-self-assign -Wno-tautological-compare -Wno-unused-parameter -Wno-unused-value -Wsuggest-attribute=format -Wsuggest-attribute=noreturn -Wunused-variable
|
86
|
+
warnflags = -Wall -Wextra -Wdeprecated-declarations -Wduplicated-cond -Wimplicit-function-declaration -Wimplicit-int -Wmisleading-indentation -Wpointer-arith -Wwrite-strings -Wold-style-definition -Wimplicit-fallthrough=0 -Wmissing-noreturn -Wno-cast-function-type -Wno-constant-logical-operand -Wno-long-long -Wno-missing-field-initializers -Wno-overlength-strings -Wno-packed-bitfield-compat -Wno-parentheses-equality -Wno-self-assign -Wno-tautological-compare -Wno-unused-parameter -Wno-unused-value -Wsuggest-attribute=format -Wsuggest-attribute=noreturn -Wunused-variable -Wundef
|
86
87
|
cppflags =
|
87
88
|
CCDLFLAGS = -fPIC
|
88
|
-
CFLAGS = $(CCDLFLAGS) -O2
|
89
|
-
INCFLAGS = -I. -I$(arch_hdrdir) -I$(hdrdir)/ruby/backward -I$(hdrdir) -I$(srcdir)
|
89
|
+
CFLAGS = $(CCDLFLAGS) -O2 -flto=auto -ffat-lto-objects -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/redhat/redhat-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/redhat/redhat-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -fPIC $(ARCH_FLAG)
|
90
|
+
INCFLAGS = -I. -I$(arch_hdrdir) -I$(hdrdir)/ruby/backward -I$(hdrdir) -I$(srcdir)
|
90
91
|
DEFS =
|
91
92
|
CPPFLAGS = $(DEFS) $(cppflags)
|
92
|
-
CXXFLAGS = $(CCDLFLAGS) -O2
|
93
|
-
ldflags = -L. -Wl,-z,relro -Wl,--as-needed -Wl,-z,now -specs=/usr/lib/rpm/redhat/redhat-hardened-ld -fstack-protector-strong -rdynamic -Wl,-export-dynamic
|
94
|
-
dldflags = -Wl,-z,relro -Wl,--as-needed -Wl,-z,now -specs=/usr/lib/rpm/redhat/redhat-hardened-ld
|
93
|
+
CXXFLAGS = $(CCDLFLAGS) -O2 -flto=auto -ffat-lto-objects -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/redhat/redhat-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/redhat/redhat-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -fvisibility=hidden -std=c++17 $(ARCH_FLAG)
|
94
|
+
ldflags = -L. -Wl,-z,relro -Wl,--as-needed -Wl,-z,now -specs=/usr/lib/rpm/redhat/redhat-hardened-ld -specs=/usr/lib/rpm/redhat/redhat-annobin-cc1 -Wl,--build-id=sha1 -fstack-protector-strong -rdynamic -Wl,-export-dynamic -Wl,--no-as-needed
|
95
|
+
dldflags = -Wl,-z,relro -Wl,--as-needed -Wl,-z,now -specs=/usr/lib/rpm/redhat/redhat-hardened-ld -specs=/usr/lib/rpm/redhat/redhat-annobin-cc1 -Wl,--build-id=sha1
|
95
96
|
ARCH_FLAG =
|
96
97
|
DLDFLAGS = $(ldflags) $(dldflags) $(ARCH_FLAG)
|
97
98
|
LDSHARED = $(CC) -shared
|
@@ -108,13 +109,13 @@ RUBY_BASE_NAME = ruby
|
|
108
109
|
|
109
110
|
arch = aarch64-linux
|
110
111
|
sitearch = $(arch)
|
111
|
-
ruby_version = 3.
|
112
|
+
ruby_version = 3.1.0
|
112
113
|
ruby = $(bindir)/$(RUBY_BASE_NAME)
|
113
114
|
RUBY = $(ruby)
|
114
115
|
ruby_headers = $(hdrdir)/ruby.h $(hdrdir)/ruby/backward.h $(hdrdir)/ruby/ruby.h $(hdrdir)/ruby/defines.h $(hdrdir)/ruby/missing.h $(hdrdir)/ruby/intern.h $(hdrdir)/ruby/st.h $(hdrdir)/ruby/subst.h $(arch_hdrdir)/ruby/config.h
|
115
116
|
|
116
117
|
RM = rm -f
|
117
|
-
RM_RF =
|
118
|
+
RM_RF = rm -fr
|
118
119
|
RMDIRS = rmdir --ignore-fail-on-non-empty -p
|
119
120
|
MAKEDIRS = /usr/bin/mkdir -p
|
120
121
|
INSTALL = /usr/bin/install -c
|
@@ -138,11 +139,11 @@ extout =
|
|
138
139
|
extout_prefix =
|
139
140
|
target_prefix =
|
140
141
|
LOCAL_LIBS =
|
141
|
-
LIBS = -
|
142
|
-
ORIG_SRCS = base.cc embedding_feature_extractor.cc embedding_network.cc feature_extractor.cc
|
142
|
+
LIBS = -lm -lc
|
143
|
+
ORIG_SRCS = base.cc embedding_feature_extractor.cc embedding_network.cc feature_extractor.cc feature_types.cc fixunicodevalue.cc fml_parser.cc generated_entities.cc generated_ulscript.cc getonescriptspan.cc lang_id_nn_params.cc language_identifier_features.cc nnet_language_identifier.cc nnet_language_identifier_c.cc offsetmap.cc registry.cc relevant_script_feature.cc sentence_features.cc task_context.cc task_context_params.cc text_processing.cc unicodetext.cc utf8statetable.cc utils.cc workspace.cc
|
143
144
|
SRCS = $(ORIG_SRCS)
|
144
|
-
OBJS = base.o embedding_feature_extractor.o embedding_network.o feature_extractor.o
|
145
|
-
HDRS = $(srcdir)/base.h $(srcdir)/casts.h $(srcdir)/embedding_feature_extractor.h $(srcdir)/embedding_network.h $(srcdir)/embedding_network_params.h $(srcdir)/feature_extractor.h $(srcdir)/
|
145
|
+
OBJS = base.o embedding_feature_extractor.o embedding_network.o feature_extractor.o feature_types.o fixunicodevalue.o fml_parser.o generated_entities.o generated_ulscript.o getonescriptspan.o lang_id_nn_params.o language_identifier_features.o nnet_language_identifier.o nnet_language_identifier_c.o offsetmap.o registry.o relevant_script_feature.o sentence_features.o task_context.o task_context_params.o text_processing.o unicodetext.o utf8statetable.o utils.o workspace.o
|
146
|
+
HDRS = $(srcdir)/base.h $(srcdir)/casts.h $(srcdir)/embedding_feature_extractor.h $(srcdir)/embedding_network.h $(srcdir)/embedding_network_params.h $(srcdir)/feature_extractor.h $(srcdir)/feature_types.h $(srcdir)/fixunicodevalue.h $(srcdir)/float16.h $(srcdir)/fml_parser.h $(srcdir)/generated_ulscript.h $(srcdir)/getonescriptspan.h $(srcdir)/integral_types.h $(srcdir)/lang_id_nn_params.h $(srcdir)/language_identifier_features.h $(srcdir)/nnet_language_identifier.h $(srcdir)/offsetmap.h $(srcdir)/port.h $(srcdir)/registry.h $(srcdir)/relevant_script_feature.h $(srcdir)/script_detector.h $(srcdir)/sentence_features.h $(srcdir)/simple_adder.h $(srcdir)/stringpiece.h $(srcdir)/task_context.h $(srcdir)/task_context_params.h $(srcdir)/text_processing.h $(srcdir)/unicodetext.h $(srcdir)/utf8acceptinterchange.h $(srcdir)/utf8prop_lettermarkscriptnum.h $(srcdir)/utf8repl_lettermarklower.h $(srcdir)/utf8scannot_lettermarkspecial.h $(srcdir)/utf8statetable.h $(srcdir)/utils.h $(srcdir)/workspace.h
|
146
147
|
LOCAL_HDRS =
|
147
148
|
TARGET = libcld3
|
148
149
|
TARGET_NAME = libcld3
|
@@ -160,7 +161,7 @@ HDRDIR = $(sitehdrdir)$(target_prefix)
|
|
160
161
|
ARCHHDRDIR = $(sitearchhdrdir)$(target_prefix)
|
161
162
|
TARGET_SO_DIR =
|
162
163
|
TARGET_SO = $(TARGET_SO_DIR)$(DLLIB)
|
163
|
-
CLEANLIBS = $(TARGET_SO)
|
164
|
+
CLEANLIBS = $(TARGET_SO) false
|
164
165
|
CLEANOBJS = *.o *.bak
|
165
166
|
|
166
167
|
all: $(DLLIB)
|
@@ -173,7 +174,7 @@ clean-rb-default::
|
|
173
174
|
clean-rb::
|
174
175
|
clean-so::
|
175
176
|
clean: clean-so clean-static clean-rb-default clean-rb
|
176
|
-
-$(Q)$(
|
177
|
+
-$(Q)$(RM_RF) $(CLEANLIBS) $(CLEANOBJS) $(CLEANFILES) .*.time
|
177
178
|
|
178
179
|
distclean-rb-default::
|
179
180
|
distclean-rb::
|
data/ext/cld3/base.o
CHANGED
Binary file
|
@@ -0,0 +1,100 @@
|
|
1
|
+
/* Copyright 2022 Akihiko Odaki <akihiko.odaki@gmail.com>
|
2
|
+
All Rights Reserved.
|
3
|
+
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
5
|
+
you may not use this file except in compliance with the License.
|
6
|
+
You may obtain a copy of the License at
|
7
|
+
|
8
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
13
|
+
See the License for the specific language governing permissions and
|
14
|
+
limitations under the License.
|
15
|
+
==============================================================================*/
|
16
|
+
|
17
|
+
#ifndef FEATURE_EXTRACTOR_PB_H_
|
18
|
+
#define FEATURE_EXTRACTOR_PB_H_
|
19
|
+
|
20
|
+
#include <cstdint>
|
21
|
+
#include <string>
|
22
|
+
#include <vector>
|
23
|
+
|
24
|
+
namespace chrome_lang_id {
|
25
|
+
|
26
|
+
class Parameter {
|
27
|
+
public:
|
28
|
+
const std::string& name() const { return name_; }
|
29
|
+
void set_name(std::string value) { name_ = std::move(value); }
|
30
|
+
const std::string& value() const { return value_; }
|
31
|
+
void set_value(std::string value) { value_ = std::move(value); }
|
32
|
+
|
33
|
+
private:
|
34
|
+
std::string name_;
|
35
|
+
std::string value_;
|
36
|
+
};
|
37
|
+
|
38
|
+
class FeatureFunctionDescriptor {
|
39
|
+
public:
|
40
|
+
const std::string& type() const { return type_; }
|
41
|
+
|
42
|
+
void set_type(std::string value) { type_ = std::move(value); }
|
43
|
+
|
44
|
+
const std::string& name() const { return name_; }
|
45
|
+
|
46
|
+
void set_name(std::string value) { name_ = std::move(value); }
|
47
|
+
|
48
|
+
bool has_argument() const { return true; }
|
49
|
+
|
50
|
+
std::int32_t argument() const { return argument_; }
|
51
|
+
|
52
|
+
void set_argument(int32_t value) { argument_ = value; }
|
53
|
+
|
54
|
+
int parameter_size() const { return parameter_.size(); }
|
55
|
+
|
56
|
+
const Parameter& parameter(int index) const { return parameter_[index]; }
|
57
|
+
|
58
|
+
Parameter* add_parameter() { return &parameter_.emplace_back(); }
|
59
|
+
|
60
|
+
int feature_size() const { return feature_.size(); }
|
61
|
+
|
62
|
+
FeatureFunctionDescriptor* mutable_feature(int index) {
|
63
|
+
return &feature_[index];
|
64
|
+
}
|
65
|
+
|
66
|
+
const FeatureFunctionDescriptor& feature(int index) const {
|
67
|
+
return feature_[index];
|
68
|
+
}
|
69
|
+
|
70
|
+
FeatureFunctionDescriptor* add_feature() { return &feature_.emplace_back(); }
|
71
|
+
|
72
|
+
private:
|
73
|
+
std::string type_;
|
74
|
+
std::string name_;
|
75
|
+
std::int32_t argument_;
|
76
|
+
std::vector<Parameter> parameter_;
|
77
|
+
std::vector<FeatureFunctionDescriptor> feature_;
|
78
|
+
};
|
79
|
+
|
80
|
+
class FeatureExtractorDescriptor {
|
81
|
+
public:
|
82
|
+
int feature_size() const { return feature_.size(); }
|
83
|
+
|
84
|
+
FeatureFunctionDescriptor* mutable_feature(int index) {
|
85
|
+
return &feature_[index];
|
86
|
+
}
|
87
|
+
|
88
|
+
const FeatureFunctionDescriptor& feature(int index) const {
|
89
|
+
return feature_[index];
|
90
|
+
}
|
91
|
+
|
92
|
+
FeatureFunctionDescriptor* add_feature() { return &feature_.emplace_back(); }
|
93
|
+
|
94
|
+
private:
|
95
|
+
std::vector<FeatureFunctionDescriptor> feature_;
|
96
|
+
};
|
97
|
+
|
98
|
+
}
|
99
|
+
|
100
|
+
#endif
|
@@ -0,0 +1,35 @@
|
|
1
|
+
/* Copyright 2022 Akihiko Odaki <akihiko.odaki@gmail.com>
|
2
|
+
All Rights Reserved.
|
3
|
+
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
5
|
+
you may not use this file except in compliance with the License.
|
6
|
+
You may obtain a copy of the License at
|
7
|
+
|
8
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
13
|
+
See the License for the specific language governing permissions and
|
14
|
+
limitations under the License.
|
15
|
+
==============================================================================*/
|
16
|
+
|
17
|
+
#ifndef SENTENCE_PB_H_
|
18
|
+
#define SENTENCE_PB_H_
|
19
|
+
|
20
|
+
#include <string>
|
21
|
+
|
22
|
+
namespace chrome_lang_id {
|
23
|
+
|
24
|
+
class Sentence {
|
25
|
+
public:
|
26
|
+
const std::string& text() const { return text_; }
|
27
|
+
void set_text(std::string value) { text_ = std::move(value); }
|
28
|
+
|
29
|
+
private:
|
30
|
+
std::string text_;
|
31
|
+
};
|
32
|
+
|
33
|
+
}
|
34
|
+
|
35
|
+
#endif
|
@@ -0,0 +1,106 @@
|
|
1
|
+
/* Copyright 2022 Akihiko Odaki <akihiko.odaki@gmail.com>
|
2
|
+
All Rights Reserved.
|
3
|
+
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
5
|
+
you may not use this file except in compliance with the License.
|
6
|
+
You may obtain a copy of the License at
|
7
|
+
|
8
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
13
|
+
See the License for the specific language governing permissions and
|
14
|
+
limitations under the License.
|
15
|
+
==============================================================================*/
|
16
|
+
|
17
|
+
#ifndef TASK_SPEC_PB_H_
|
18
|
+
#define TASK_SPEC_PB_H_
|
19
|
+
|
20
|
+
#include <string>
|
21
|
+
#include <vector>
|
22
|
+
|
23
|
+
namespace chrome_lang_id {
|
24
|
+
|
25
|
+
class TaskInput {
|
26
|
+
public:
|
27
|
+
class Part {
|
28
|
+
public:
|
29
|
+
const std::string& file_pattern() const { return file_pattern_; }
|
30
|
+
|
31
|
+
private:
|
32
|
+
std::string file_pattern_;
|
33
|
+
};
|
34
|
+
|
35
|
+
const std::string& name() const { return name_; }
|
36
|
+
|
37
|
+
void set_name(std::string value) { name_ = value; }
|
38
|
+
|
39
|
+
int file_format_size() const { return file_format_.size(); }
|
40
|
+
|
41
|
+
const std::string& file_format(int index) const {
|
42
|
+
return file_format_[index];
|
43
|
+
}
|
44
|
+
|
45
|
+
void add_file_format(std::string value) {
|
46
|
+
file_format_.push_back(std::move(value));
|
47
|
+
}
|
48
|
+
|
49
|
+
int record_format_size() const { return record_format_.size(); }
|
50
|
+
|
51
|
+
const std::string& record_format(int index) const {
|
52
|
+
return record_format_[index];
|
53
|
+
}
|
54
|
+
|
55
|
+
void add_record_format(std::string value) {
|
56
|
+
record_format_.push_back(std::move(value));
|
57
|
+
}
|
58
|
+
|
59
|
+
int part_size() const { return part_.size(); }
|
60
|
+
const Part& part(int index) const { return part_[index]; }
|
61
|
+
|
62
|
+
private:
|
63
|
+
std::string name_;
|
64
|
+
std::vector<std::string> file_format_;
|
65
|
+
std::vector<std::string> record_format_;
|
66
|
+
std::vector<Part> part_;
|
67
|
+
};
|
68
|
+
|
69
|
+
class TaskSpec {
|
70
|
+
public:
|
71
|
+
class Parameter {
|
72
|
+
public:
|
73
|
+
const std::string& name() const { return name_; }
|
74
|
+
void set_name(std::string value) { name_ = std::move(value); }
|
75
|
+
const std::string& value() const { return value_; }
|
76
|
+
void set_value(std::string value) { value_ = std::move(value); }
|
77
|
+
|
78
|
+
private:
|
79
|
+
std::string name_;
|
80
|
+
std::string value_;
|
81
|
+
};
|
82
|
+
|
83
|
+
int parameter_size() const { return parameter_.size(); }
|
84
|
+
|
85
|
+
Parameter* mutable_parameter(int index) { return &parameter_[index]; }
|
86
|
+
|
87
|
+
const Parameter& parameter(int index) const { return parameter_[index]; }
|
88
|
+
|
89
|
+
Parameter* add_parameter() { return &parameter_.emplace_back(); }
|
90
|
+
|
91
|
+
int input_size() const { return input_.size(); }
|
92
|
+
|
93
|
+
TaskInput* mutable_input(int index) { return &input_[index]; }
|
94
|
+
|
95
|
+
const TaskInput& input(int index) const { return input_[index]; }
|
96
|
+
|
97
|
+
TaskInput* add_input() { return &input_.emplace_back(); }
|
98
|
+
|
99
|
+
private:
|
100
|
+
std::vector<Parameter> parameter_;
|
101
|
+
std::vector<TaskInput> input_;
|
102
|
+
};
|
103
|
+
|
104
|
+
}
|
105
|
+
|
106
|
+
#endif
|
Binary file
|
Binary file
|
data/ext/cld3/extconf.rb
CHANGED
@@ -26,17 +26,8 @@ rescue
|
|
26
26
|
end
|
27
27
|
end
|
28
28
|
|
29
|
-
# Check pkg-config first to inform the library is missing if so.
|
30
|
-
pkg_config("protobuf") or abort "Failed to locate protobuf"
|
31
|
-
|
32
|
-
FileUtils.mkdir_p("cld_3/protos")
|
33
29
|
FileUtils.mkdir_p("script_span")
|
34
30
|
|
35
|
-
[ "feature_extractor", "sentence", "task_spec" ].each {|name|
|
36
|
-
system "protoc", "#{name}.proto", "--cpp_out=.", exception: true
|
37
|
-
ln_fallback("#{name}.pb.h", "cld_3/protos/#{name}.pb.h")
|
38
|
-
}
|
39
|
-
|
40
31
|
[
|
41
32
|
"fixunicodevalue.h",
|
42
33
|
"generated_ulscript.h",
|
@@ -55,6 +46,6 @@ FileUtils.mkdir_p("script_span")
|
|
55
46
|
ln_fallback("#{name}", "script_span/#{name}")
|
56
47
|
}
|
57
48
|
|
58
|
-
$CXXFLAGS += " -fvisibility=hidden -std=c++
|
49
|
+
$CXXFLAGS += " -fvisibility=hidden -std=c++17"
|
59
50
|
$LIBRUBYARG = ""
|
60
51
|
create_makefile("libcld3")
|
Binary file
|
data/ext/cld3/feature_types.o
CHANGED
Binary file
|
data/ext/cld3/fixunicodevalue.o
CHANGED
Binary file
|
data/ext/cld3/fml_parser.o
CHANGED
Binary file
|
Binary file
|
Binary file
|
@@ -878,7 +878,6 @@ bool ScriptScanner::GetOneScriptSpan(LangSpan* span) {
|
|
878
878
|
// copying letters to buffer with single spaces for each run of non-letters
|
879
879
|
while (take < byte_length_) {
|
880
880
|
// Copy run of letters in same script (&LS | LS)*
|
881
|
-
int letter_count = 0; // Keep track of word length
|
882
881
|
bool need_break = false;
|
883
882
|
|
884
883
|
while (take < byte_length_) {
|
@@ -963,7 +962,6 @@ bool ScriptScanner::GetOneScriptSpan(LangSpan* span) {
|
|
963
962
|
map2original_.Delete(tlen - plen);
|
964
963
|
}
|
965
964
|
|
966
|
-
++letter_count;
|
967
965
|
if (put >= kMaxScriptBytes) {
|
968
966
|
// Buffer is full
|
969
967
|
span->truncated = true;
|
data/ext/cld3/getonescriptspan.o
CHANGED
Binary file
|
Binary file
|
Binary file
|
data/ext/cld3/libcld3.so
CHANGED
Binary file
|
Binary file
|
Binary file
|
data/ext/cld3/offsetmap.o
CHANGED
Binary file
|
data/ext/cld3/registry.o
CHANGED
Binary file
|
Binary file
|
@@ -0,0 +1,69 @@
|
|
1
|
+
// Copyright 2013 Google Inc. All Rights Reserved.
|
2
|
+
//
|
3
|
+
// Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
// you may not use this file except in compliance with the License.
|
5
|
+
// You may obtain a copy of the License at
|
6
|
+
//
|
7
|
+
// http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
//
|
9
|
+
// Unless required by applicable law or agreed to in writing, software
|
10
|
+
// distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
// See the License for the specific language governing permissions and
|
13
|
+
// limitations under the License.
|
14
|
+
|
15
|
+
//
|
16
|
+
// Routine that maps a Unicode code point to an interchange-valid one
|
17
|
+
//
|
18
|
+
// Table that maps MS CP1252 bytes 00-FF to their corresponding Unicode
|
19
|
+
// code points. C0 and C1 control codes that are not interchange-valid
|
20
|
+
// are mapped to spaces.
|
21
|
+
|
22
|
+
|
23
|
+
#ifndef SCRIPT_SPAN_FIXUNICODEVALUE_H_
|
24
|
+
#define SCRIPT_SPAN_FIXUNICODEVALUE_H_
|
25
|
+
|
26
|
+
#include "integral_types.h" // for char32
|
27
|
+
#include "port.h"
|
28
|
+
|
29
|
+
namespace chrome_lang_id {
|
30
|
+
namespace CLD2 {
|
31
|
+
|
32
|
+
// Map byte value 0000-00FF to char32
|
33
|
+
// Maps C0 control codes (other than CR LF HT FF) to space [29 instances including DEL=0x7F]
|
34
|
+
// Maps C1 control codes to CP1252 [27 instances] or space [5 instances]
|
35
|
+
static const char32 kMapFullMicrosoft1252OrSpace[256] = {
|
36
|
+
0x20,0x20,0x20,0x20, 0x20,0x20,0x20,0x20, 0x20,0x09,0x0a,0x20, 0x0c,0x0d,0x20,0x20, // 00
|
37
|
+
0x20,0x20,0x20,0x20, 0x20,0x20,0x20,0x20, 0x20,0x20,0x20,0x20, 0x20,0x20,0x20,0x20,
|
38
|
+
0x20,0x21,0x22,0x23, 0x24,0x25,0x26,0x27, 0x28,0x29,0x2a,0x2b, 0x2c,0x2d,0x2e,0x2f,
|
39
|
+
0x30,0x31,0x32,0x33, 0x34,0x35,0x36,0x37, 0x38,0x39,0x3a,0x3b, 0x3c,0x3d,0x3e,0x3f,
|
40
|
+
|
41
|
+
0x40,0x41,0x42,0x43, 0x44,0x45,0x46,0x47, 0x48,0x49,0x4a,0x4b, 0x4c,0x4d,0x4e,0x4f, // 40
|
42
|
+
0x50,0x51,0x52,0x53, 0x54,0x55,0x56,0x57, 0x58,0x59,0x5a,0x5b, 0x5c,0x5d,0x5e,0x5f,
|
43
|
+
0x60,0x61,0x62,0x63, 0x64,0x65,0x66,0x67, 0x68,0x69,0x6a,0x6b, 0x6c,0x6d,0x6e,0x6f,
|
44
|
+
0x70,0x71,0x72,0x73, 0x74,0x75,0x76,0x77, 0x78,0x79,0x7a,0x7b, 0x7c,0x7d,0x7e,0x20,
|
45
|
+
|
46
|
+
0x20ac,0x20,0x201a,0x0192, 0x201e,0x2026,0x2020,0x2021, // 80
|
47
|
+
0x02c6,0x2030,0x0160,0x2039, 0x0152,0x20,0x017d,0x20,
|
48
|
+
0x20,0x2018,0x2019,0x201c, 0x201d,0x2022,0x2013,0x2014,
|
49
|
+
0x02dc,0x2122,0x0161,0x203a, 0x0153,0x20,0x017e,0x0178,
|
50
|
+
0xa0,0xa1,0xa2,0xa3, 0xa4,0xa5,0xa6,0xa7, 0xa8,0xa9,0xaa,0xab, 0xac,0xad,0xae,0xaf, // A0
|
51
|
+
0xb0,0xb1,0xb2,0xb3, 0xb4,0xb5,0xb6,0xb7, 0xb8,0xb9,0xba,0xbb, 0xbc,0xbd,0xbe,0xbf,
|
52
|
+
|
53
|
+
0xc0,0xc1,0xc2,0xc3, 0xc4,0xc5,0xc6,0xc7, 0xc8,0xc9,0xca,0xcb, 0xcc,0xcd,0xce,0xcf, // C0
|
54
|
+
0xd0,0xd1,0xd2,0xd3, 0xd4,0xd5,0xd6,0xd7, 0xd8,0xd9,0xda,0xdb, 0xdc,0xdd,0xde,0xdf,
|
55
|
+
0xe0,0xe1,0xe2,0xe3, 0xe4,0xe5,0xe6,0xe7, 0xe8,0xe9,0xea,0xeb, 0xec,0xed,0xee,0xef,
|
56
|
+
0xf0,0xf1,0xf2,0xf3, 0xf4,0xf5,0xf6,0xf7, 0xf8,0xf9,0xfa,0xfb, 0xfc,0xfd,0xfe,0xff,
|
57
|
+
};
|
58
|
+
|
59
|
+
// Guarantees that the resulting output value is interchange valid
|
60
|
+
// 00-FF; map to spaces or MS CP1252
|
61
|
+
// D800-DFFF; surrogates
|
62
|
+
// FDD0-FDEF; non-characters
|
63
|
+
// xxFFFE-xxFFFF; non-characters
|
64
|
+
char32 FixUnicodeValue(char32 uv);
|
65
|
+
|
66
|
+
} // End namespace CLD2
|
67
|
+
} // End namespace chrome_lang_id
|
68
|
+
|
69
|
+
#endif // SCRIPT_SPAN_FIXUNICODEVALUE_H_
|