cld3 3.4.4 → 3.5.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (59) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile +0 -1
  3. data/README.md +4 -7
  4. data/cld3.gemspec +5 -5
  5. data/ext/cld3/Makefile +17 -16
  6. data/ext/cld3/base.o +0 -0
  7. data/ext/cld3/cld_3/protos/feature_extractor.pb.h +100 -0
  8. data/ext/cld3/cld_3/protos/sentence.pb.h +35 -0
  9. data/ext/cld3/cld_3/protos/task_spec.pb.h +106 -0
  10. data/ext/cld3/embedding_feature_extractor.o +0 -0
  11. data/ext/cld3/embedding_network.o +0 -0
  12. data/ext/cld3/extconf.rb +1 -10
  13. data/ext/cld3/feature_extractor.o +0 -0
  14. data/ext/cld3/feature_types.o +0 -0
  15. data/ext/cld3/fixunicodevalue.o +0 -0
  16. data/ext/cld3/fml_parser.o +0 -0
  17. data/ext/cld3/generated_entities.o +0 -0
  18. data/ext/cld3/generated_ulscript.o +0 -0
  19. data/ext/cld3/getonescriptspan.cc +0 -2
  20. data/ext/cld3/getonescriptspan.o +0 -0
  21. data/ext/cld3/lang_id_nn_params.o +0 -0
  22. data/ext/cld3/language_identifier_features.o +0 -0
  23. data/ext/cld3/libcld3.so +0 -0
  24. data/ext/cld3/nnet_language_identifier.o +0 -0
  25. data/ext/cld3/nnet_language_identifier_c.o +0 -0
  26. data/ext/cld3/offsetmap.o +0 -0
  27. data/ext/cld3/registry.o +0 -0
  28. data/ext/cld3/relevant_script_feature.o +0 -0
  29. data/ext/cld3/script_span/fixunicodevalue.h +69 -0
  30. data/ext/cld3/script_span/generated_ulscript.h +142 -0
  31. data/ext/cld3/script_span/getonescriptspan.h +124 -0
  32. data/ext/cld3/script_span/integral_types.h +37 -0
  33. data/ext/cld3/script_span/offsetmap.h +168 -0
  34. data/ext/cld3/script_span/port.h +143 -0
  35. data/ext/cld3/script_span/stringpiece.h +81 -0
  36. data/ext/cld3/script_span/text_processing.h +30 -0
  37. data/ext/cld3/script_span/utf8acceptinterchange.h +486 -0
  38. data/ext/cld3/script_span/utf8prop_lettermarkscriptnum.h +1631 -0
  39. data/ext/cld3/script_span/utf8repl_lettermarklower.h +758 -0
  40. data/ext/cld3/script_span/utf8scannot_lettermarkspecial.h +1455 -0
  41. data/ext/cld3/script_span/utf8statetable.h +285 -0
  42. data/ext/cld3/sentence_features.o +0 -0
  43. data/ext/cld3/task_context.o +0 -0
  44. data/ext/cld3/task_context_params.o +0 -0
  45. data/ext/cld3/text_processing.o +0 -0
  46. data/ext/cld3/unicodetext.o +0 -0
  47. data/ext/cld3/utf8statetable.o +0 -0
  48. data/ext/cld3/utils.o +0 -0
  49. data/ext/cld3/workspace.o +0 -0
  50. data/lib/cld3.rb +4 -1
  51. metadata +33 -25
  52. data/ext/cld3/feature_extractor.pb.o +0 -0
  53. data/ext/cld3/feature_extractor.proto +0 -50
  54. data/ext/cld3/mkmf.log +0 -37
  55. data/ext/cld3/sentence.pb.o +0 -0
  56. data/ext/cld3/sentence.proto +0 -77
  57. data/ext/cld3/task_spec.pb.o +0 -0
  58. data/ext/cld3/task_spec.proto +0 -98
  59. data/lib/a.rb +0 -24
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: f40e4947fea97543686caceba0082bdba30b5ae0485a25b41004ad048057b0ad
4
- data.tar.gz: e45c60300550caf513fdde6bcbc05e68e1063bf9ad8074626bf5f88f4a6f77bd
3
+ metadata.gz: e12ff58a2cc0242896e307f33cb1adb2e210a1889244861e2c751b0399cb6415
4
+ data.tar.gz: 3b0c348b00126c8ee825a76eed0919e074a7f97e7dc7e647614f7a754c69d716
5
5
  SHA512:
6
- metadata.gz: 393fc138a279ee42c3de90c49bcc982e55860f74e2796d4c895d0f2f175894bcb1ec1bbe796811f896a16be9cc97943e1309cbe175bc029a510b4c51b2f700da
7
- data.tar.gz: d16e8c87e7d12cc90cc1a4babb4873df8f553d9527e1d69a548a250ae0b240f79a6338070bbc88cbb0e23db48c23ef0393cd4b62e0ac673722ace81ce1564895
6
+ metadata.gz: 690c6e6f0cc8f0cc369b05a611daeed74adc273cd9c02f780233921ed0c220a425552e7020c127ff0503e88d473f57088cafd3b0d2143f3f86527d886d3eaf63
7
+ data.tar.gz: 6c3b7c105fa799fb918a077d9d1f47c948ec29bad86e57f452ea9eae82fc4f5bfc2686654a2a68ebc7102dfcb34e2a1ca55648d75a1f6bcf97b532de482de76e
data/Gemfile CHANGED
@@ -15,5 +15,4 @@
15
15
  #==============================================================================
16
16
 
17
17
  source 'https://rubygems.org'
18
- gem 'steep', github: 'akihikodaki/steep', branch: 'cld3'
19
18
  gemspec
data/README.md CHANGED
@@ -19,7 +19,6 @@ cld3.find_language("здравствуйте") # => #<struct Struct::Result lang
19
19
  ### Prerequisites
20
20
  * [Bundler](http://bundler.io/)
21
21
  * C++ compiler
22
- * [Protocol buffers](https://developers.google.com/protocol-buffers/)
23
22
  * [Rake](https://ruby.github.io/rake/)
24
23
  * [RubyGems](https://rubygems.org/)
25
24
 
@@ -42,17 +41,15 @@ change:
42
41
  https://github.com/jruby/jruby/pull/4118/commits/edad375ef4dcf195b19ce0afe4befac66468c736
43
42
 
44
43
  ### Troubleshooting
45
- `gem install cld3` triggers native library building. If it fails, you are likely
46
- to missing required facilities. Make sure C++ compiler and protocol buffers
47
- is installed. I recommend [GCC](https://gcc.gnu.org/) as a C++ compiler. Ruby is
48
- likely to need [pkg-config](https://www.freedesktop.org/wiki/Software/pkg-config/)
49
- as well.
44
+ `gem install cld3` triggers native library building. If it fails, it is likely
45
+ that some required facilities are missing. Make sure C++ compiler is installed.
46
+ I recommend [GCC](https://gcc.gnu.org/) as a C++ compiler.
50
47
 
51
48
  Runtime errors are likely to be issues of [FFI](https://github.com/ffi/ffi) or
52
49
  programming errors. Make sure they are all correct.
53
50
 
54
51
  If you cannot identify the cause of your problem, run spec of this library and
55
- see whether the problem is reproducable with it or not. Spec is not included in
52
+ see whether the problem is reproducible with it or not. Spec is not included in
56
53
  the gem, so clone the source code repository and then run `rake spec`.
57
54
  The source code repository is at
58
55
  https://github.com/akihikodaki/cld3-ruby.
data/cld3.gemspec CHANGED
@@ -16,18 +16,18 @@
16
16
 
17
17
  Gem::Specification.new do |gem|
18
18
  gem.name = "cld3"
19
- gem.version = "3.4.4"
19
+ gem.version = "3.5.0"
20
20
  gem.summary = "Compact Language Detector v3 (CLD3)"
21
21
  gem.description = "Compact Language Detector v3 (CLD3) is a neural network model for language identification."
22
22
  gem.license = "Apache-2.0"
23
23
  gem.homepage = "https://github.com/akihikodaki/cld3-ruby"
24
24
  gem.author = "Akihiko Odaki"
25
25
  gem.email = "akihiko.odaki@gmail.com"
26
- gem.required_ruby_version = [ ">= 2.6.0", "< 3.2.0" ]
26
+ gem.required_ruby_version = [ ">= 2.7.0", "< 3.3.0" ]
27
27
  gem.add_dependency "ffi", [ ">= 1.1.0", "< 1.16.0" ]
28
- gem.add_development_dependency "rbs", [ ">= 1.7.0", "< 1.8.0" ]
29
- gem.add_development_dependency "rspec", [ ">=3.0.0", "< 3.11.0" ]
30
- gem.add_development_dependency "steep", [ ">= 0.47.0", "< 0.48.0" ]
28
+ gem.add_development_dependency "rbs", [ ">= 2.6.0", "< 2.7.0" ]
29
+ gem.add_development_dependency "rspec", [ ">= 3.11.0", "< 3.12.0" ]
30
+ gem.add_development_dependency "steep", [ ">= 1.0.0", "< 1.1.0" ]
31
31
  gem.files = Dir[
32
32
  "Gemfile", "LICENSE", "LICENSE_CLD3", "README.md",
33
33
  "cld3.gemspec", "ext/**/*", "lib/**/*", "sig/**/*"
data/ext/cld3/Makefile CHANGED
@@ -3,6 +3,7 @@ SHELL = /bin/sh
3
3
 
4
4
  # V=0 quiet, V=1 verbose. other values don't work.
5
5
  V = 1
6
+ V0 = $(V:0=)
6
7
  Q1 = $(V:1=)
7
8
  Q = $(Q1:0=@)
8
9
  ECHO1 = $(V:1=@ :)
@@ -52,7 +53,7 @@ htmldir = $(docdir)
52
53
  infodir = $(DESTDIR)/usr/share/info
53
54
  docdir = $(datarootdir)/doc/$(PACKAGE)
54
55
  oldincludedir = $(DESTDIR)/usr/include
55
- includedir = $(DESTDIR)/usr/include
56
+ includedir = $(exec_prefix)/include
56
57
  runstatedir = $(localstatedir)/run
57
58
  localstatedir = $(DESTDIR)/var
58
59
  sharedstatedir = $(DESTDIR)/var/lib
@@ -80,18 +81,18 @@ CSRCFLAG = $(empty)
80
81
  RUBY_EXTCONF_H =
81
82
  cflags = $(optflags) $(debugflags) $(warnflags)
82
83
  cxxflags =
83
- optflags = -O3
84
+ optflags = -O3 -fno-fast-math
84
85
  debugflags = -ggdb3
85
- warnflags = -Wall -Wextra -Wdeprecated-declarations -Wduplicated-cond -Wimplicit-function-declaration -Wimplicit-int -Wmisleading-indentation -Wpointer-arith -Wwrite-strings -Wimplicit-fallthrough=0 -Wmissing-noreturn -Wno-cast-function-type -Wno-constant-logical-operand -Wno-long-long -Wno-missing-field-initializers -Wno-overlength-strings -Wno-packed-bitfield-compat -Wno-parentheses-equality -Wno-self-assign -Wno-tautological-compare -Wno-unused-parameter -Wno-unused-value -Wsuggest-attribute=format -Wsuggest-attribute=noreturn -Wunused-variable
86
+ warnflags = -Wall -Wextra -Wdeprecated-declarations -Wduplicated-cond -Wimplicit-function-declaration -Wimplicit-int -Wmisleading-indentation -Wpointer-arith -Wwrite-strings -Wold-style-definition -Wimplicit-fallthrough=0 -Wmissing-noreturn -Wno-cast-function-type -Wno-constant-logical-operand -Wno-long-long -Wno-missing-field-initializers -Wno-overlength-strings -Wno-packed-bitfield-compat -Wno-parentheses-equality -Wno-self-assign -Wno-tautological-compare -Wno-unused-parameter -Wno-unused-value -Wsuggest-attribute=format -Wsuggest-attribute=noreturn -Wunused-variable -Wundef
86
87
  cppflags =
87
88
  CCDLFLAGS = -fPIC
88
- CFLAGS = $(CCDLFLAGS) -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/redhat/redhat-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/redhat/redhat-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -fPIC $(ARCH_FLAG)
89
- INCFLAGS = -I. -I$(arch_hdrdir) -I$(hdrdir)/ruby/backward -I$(hdrdir) -I$(srcdir)
89
+ CFLAGS = $(CCDLFLAGS) -O2 -flto=auto -ffat-lto-objects -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/redhat/redhat-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/redhat/redhat-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -fPIC $(ARCH_FLAG)
90
+ INCFLAGS = -I. -I$(arch_hdrdir) -I$(hdrdir)/ruby/backward -I$(hdrdir) -I$(srcdir)
90
91
  DEFS =
91
92
  CPPFLAGS = $(DEFS) $(cppflags)
92
- CXXFLAGS = $(CCDLFLAGS) -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/redhat/redhat-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/redhat/redhat-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -fvisibility=hidden -std=c++11 $(ARCH_FLAG)
93
- ldflags = -L. -Wl,-z,relro -Wl,--as-needed -Wl,-z,now -specs=/usr/lib/rpm/redhat/redhat-hardened-ld -fstack-protector-strong -rdynamic -Wl,-export-dynamic
94
- dldflags = -Wl,-z,relro -Wl,--as-needed -Wl,-z,now -specs=/usr/lib/rpm/redhat/redhat-hardened-ld
93
+ CXXFLAGS = $(CCDLFLAGS) -O2 -flto=auto -ffat-lto-objects -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/redhat/redhat-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/redhat/redhat-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -fvisibility=hidden -std=c++17 $(ARCH_FLAG)
94
+ ldflags = -L. -Wl,-z,relro -Wl,--as-needed -Wl,-z,now -specs=/usr/lib/rpm/redhat/redhat-hardened-ld -specs=/usr/lib/rpm/redhat/redhat-annobin-cc1 -Wl,--build-id=sha1 -fstack-protector-strong -rdynamic -Wl,-export-dynamic -Wl,--no-as-needed
95
+ dldflags = -Wl,-z,relro -Wl,--as-needed -Wl,-z,now -specs=/usr/lib/rpm/redhat/redhat-hardened-ld -specs=/usr/lib/rpm/redhat/redhat-annobin-cc1 -Wl,--build-id=sha1
95
96
  ARCH_FLAG =
96
97
  DLDFLAGS = $(ldflags) $(dldflags) $(ARCH_FLAG)
97
98
  LDSHARED = $(CC) -shared
@@ -108,13 +109,13 @@ RUBY_BASE_NAME = ruby
108
109
 
109
110
  arch = aarch64-linux
110
111
  sitearch = $(arch)
111
- ruby_version = 3.0.0
112
+ ruby_version = 3.1.0
112
113
  ruby = $(bindir)/$(RUBY_BASE_NAME)
113
114
  RUBY = $(ruby)
114
115
  ruby_headers = $(hdrdir)/ruby.h $(hdrdir)/ruby/backward.h $(hdrdir)/ruby/ruby.h $(hdrdir)/ruby/defines.h $(hdrdir)/ruby/missing.h $(hdrdir)/ruby/intern.h $(hdrdir)/ruby/st.h $(hdrdir)/ruby/subst.h $(arch_hdrdir)/ruby/config.h
115
116
 
116
117
  RM = rm -f
117
- RM_RF = $(RUBY) -run -e rm -- -rf
118
+ RM_RF = rm -fr
118
119
  RMDIRS = rmdir --ignore-fail-on-non-empty -p
119
120
  MAKEDIRS = /usr/bin/mkdir -p
120
121
  INSTALL = /usr/bin/install -c
@@ -138,11 +139,11 @@ extout =
138
139
  extout_prefix =
139
140
  target_prefix =
140
141
  LOCAL_LIBS =
141
- LIBS = -lprotobuf -lpthread -lm -lc
142
- ORIG_SRCS = base.cc embedding_feature_extractor.cc embedding_network.cc feature_extractor.cc feature_extractor.pb.cc feature_types.cc fixunicodevalue.cc fml_parser.cc generated_entities.cc generated_ulscript.cc getonescriptspan.cc lang_id_nn_params.cc language_identifier_features.cc nnet_language_identifier.cc nnet_language_identifier_c.cc offsetmap.cc registry.cc relevant_script_feature.cc sentence.pb.cc sentence_features.cc task_context.cc task_context_params.cc task_spec.pb.cc text_processing.cc unicodetext.cc utf8statetable.cc utils.cc workspace.cc
142
+ LIBS = -lm -lc
143
+ ORIG_SRCS = base.cc embedding_feature_extractor.cc embedding_network.cc feature_extractor.cc feature_types.cc fixunicodevalue.cc fml_parser.cc generated_entities.cc generated_ulscript.cc getonescriptspan.cc lang_id_nn_params.cc language_identifier_features.cc nnet_language_identifier.cc nnet_language_identifier_c.cc offsetmap.cc registry.cc relevant_script_feature.cc sentence_features.cc task_context.cc task_context_params.cc text_processing.cc unicodetext.cc utf8statetable.cc utils.cc workspace.cc
143
144
  SRCS = $(ORIG_SRCS)
144
- OBJS = base.o embedding_feature_extractor.o embedding_network.o feature_extractor.o feature_extractor.pb.o feature_types.o fixunicodevalue.o fml_parser.o generated_entities.o generated_ulscript.o getonescriptspan.o lang_id_nn_params.o language_identifier_features.o nnet_language_identifier.o nnet_language_identifier_c.o offsetmap.o registry.o relevant_script_feature.o sentence.pb.o sentence_features.o task_context.o task_context_params.o task_spec.pb.o text_processing.o unicodetext.o utf8statetable.o utils.o workspace.o
145
- HDRS = $(srcdir)/base.h $(srcdir)/casts.h $(srcdir)/embedding_feature_extractor.h $(srcdir)/embedding_network.h $(srcdir)/embedding_network_params.h $(srcdir)/feature_extractor.h $(srcdir)/feature_extractor.pb.h $(srcdir)/feature_types.h $(srcdir)/fixunicodevalue.h $(srcdir)/float16.h $(srcdir)/fml_parser.h $(srcdir)/generated_ulscript.h $(srcdir)/getonescriptspan.h $(srcdir)/integral_types.h $(srcdir)/lang_id_nn_params.h $(srcdir)/language_identifier_features.h $(srcdir)/nnet_language_identifier.h $(srcdir)/offsetmap.h $(srcdir)/port.h $(srcdir)/registry.h $(srcdir)/relevant_script_feature.h $(srcdir)/script_detector.h $(srcdir)/sentence.pb.h $(srcdir)/sentence_features.h $(srcdir)/simple_adder.h $(srcdir)/stringpiece.h $(srcdir)/task_context.h $(srcdir)/task_context_params.h $(srcdir)/task_spec.pb.h $(srcdir)/text_processing.h $(srcdir)/unicodetext.h $(srcdir)/utf8acceptinterchange.h $(srcdir)/utf8prop_lettermarkscriptnum.h $(srcdir)/utf8repl_lettermarklower.h $(srcdir)/utf8scannot_lettermarkspecial.h $(srcdir)/utf8statetable.h $(srcdir)/utils.h $(srcdir)/workspace.h
145
+ OBJS = base.o embedding_feature_extractor.o embedding_network.o feature_extractor.o feature_types.o fixunicodevalue.o fml_parser.o generated_entities.o generated_ulscript.o getonescriptspan.o lang_id_nn_params.o language_identifier_features.o nnet_language_identifier.o nnet_language_identifier_c.o offsetmap.o registry.o relevant_script_feature.o sentence_features.o task_context.o task_context_params.o text_processing.o unicodetext.o utf8statetable.o utils.o workspace.o
146
+ HDRS = $(srcdir)/base.h $(srcdir)/casts.h $(srcdir)/embedding_feature_extractor.h $(srcdir)/embedding_network.h $(srcdir)/embedding_network_params.h $(srcdir)/feature_extractor.h $(srcdir)/feature_types.h $(srcdir)/fixunicodevalue.h $(srcdir)/float16.h $(srcdir)/fml_parser.h $(srcdir)/generated_ulscript.h $(srcdir)/getonescriptspan.h $(srcdir)/integral_types.h $(srcdir)/lang_id_nn_params.h $(srcdir)/language_identifier_features.h $(srcdir)/nnet_language_identifier.h $(srcdir)/offsetmap.h $(srcdir)/port.h $(srcdir)/registry.h $(srcdir)/relevant_script_feature.h $(srcdir)/script_detector.h $(srcdir)/sentence_features.h $(srcdir)/simple_adder.h $(srcdir)/stringpiece.h $(srcdir)/task_context.h $(srcdir)/task_context_params.h $(srcdir)/text_processing.h $(srcdir)/unicodetext.h $(srcdir)/utf8acceptinterchange.h $(srcdir)/utf8prop_lettermarkscriptnum.h $(srcdir)/utf8repl_lettermarklower.h $(srcdir)/utf8scannot_lettermarkspecial.h $(srcdir)/utf8statetable.h $(srcdir)/utils.h $(srcdir)/workspace.h
146
147
  LOCAL_HDRS =
147
148
  TARGET = libcld3
148
149
  TARGET_NAME = libcld3
@@ -160,7 +161,7 @@ HDRDIR = $(sitehdrdir)$(target_prefix)
160
161
  ARCHHDRDIR = $(sitearchhdrdir)$(target_prefix)
161
162
  TARGET_SO_DIR =
162
163
  TARGET_SO = $(TARGET_SO_DIR)$(DLLIB)
163
- CLEANLIBS = $(TARGET_SO)
164
+ CLEANLIBS = $(TARGET_SO) false
164
165
  CLEANOBJS = *.o *.bak
165
166
 
166
167
  all: $(DLLIB)
@@ -173,7 +174,7 @@ clean-rb-default::
173
174
  clean-rb::
174
175
  clean-so::
175
176
  clean: clean-so clean-static clean-rb-default clean-rb
176
- -$(Q)$(RM) $(CLEANLIBS) $(CLEANOBJS) $(CLEANFILES) .*.time
177
+ -$(Q)$(RM_RF) $(CLEANLIBS) $(CLEANOBJS) $(CLEANFILES) .*.time
177
178
 
178
179
  distclean-rb-default::
179
180
  distclean-rb::
data/ext/cld3/base.o CHANGED
Binary file
@@ -0,0 +1,100 @@
1
+ /* Copyright 2022 Akihiko Odaki <akihiko.odaki@gmail.com>
2
+ All Rights Reserved.
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ http://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ ==============================================================================*/
16
+
17
+ #ifndef FEATURE_EXTRACTOR_PB_H_
18
+ #define FEATURE_EXTRACTOR_PB_H_
19
+
20
+ #include <cstdint>
21
+ #include <string>
22
+ #include <vector>
23
+
24
+ namespace chrome_lang_id {
25
+
26
+ class Parameter {
27
+ public:
28
+ const std::string& name() const { return name_; }
29
+ void set_name(std::string value) { name_ = std::move(value); }
30
+ const std::string& value() const { return value_; }
31
+ void set_value(std::string value) { value_ = std::move(value); }
32
+
33
+ private:
34
+ std::string name_;
35
+ std::string value_;
36
+ };
37
+
38
+ class FeatureFunctionDescriptor {
39
+ public:
40
+ const std::string& type() const { return type_; }
41
+
42
+ void set_type(std::string value) { type_ = std::move(value); }
43
+
44
+ const std::string& name() const { return name_; }
45
+
46
+ void set_name(std::string value) { name_ = std::move(value); }
47
+
48
+ bool has_argument() const { return true; }
49
+
50
+ std::int32_t argument() const { return argument_; }
51
+
52
+ void set_argument(int32_t value) { argument_ = value; }
53
+
54
+ int parameter_size() const { return parameter_.size(); }
55
+
56
+ const Parameter& parameter(int index) const { return parameter_[index]; }
57
+
58
+ Parameter* add_parameter() { return &parameter_.emplace_back(); }
59
+
60
+ int feature_size() const { return feature_.size(); }
61
+
62
+ FeatureFunctionDescriptor* mutable_feature(int index) {
63
+ return &feature_[index];
64
+ }
65
+
66
+ const FeatureFunctionDescriptor& feature(int index) const {
67
+ return feature_[index];
68
+ }
69
+
70
+ FeatureFunctionDescriptor* add_feature() { return &feature_.emplace_back(); }
71
+
72
+ private:
73
+ std::string type_;
74
+ std::string name_;
75
+ std::int32_t argument_;
76
+ std::vector<Parameter> parameter_;
77
+ std::vector<FeatureFunctionDescriptor> feature_;
78
+ };
79
+
80
+ class FeatureExtractorDescriptor {
81
+ public:
82
+ int feature_size() const { return feature_.size(); }
83
+
84
+ FeatureFunctionDescriptor* mutable_feature(int index) {
85
+ return &feature_[index];
86
+ }
87
+
88
+ const FeatureFunctionDescriptor& feature(int index) const {
89
+ return feature_[index];
90
+ }
91
+
92
+ FeatureFunctionDescriptor* add_feature() { return &feature_.emplace_back(); }
93
+
94
+ private:
95
+ std::vector<FeatureFunctionDescriptor> feature_;
96
+ };
97
+
98
+ }
99
+
100
+ #endif
@@ -0,0 +1,35 @@
1
+ /* Copyright 2022 Akihiko Odaki <akihiko.odaki@gmail.com>
2
+ All Rights Reserved.
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ http://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ ==============================================================================*/
16
+
17
+ #ifndef SENTENCE_PB_H_
18
+ #define SENTENCE_PB_H_
19
+
20
+ #include <string>
21
+
22
+ namespace chrome_lang_id {
23
+
24
+ class Sentence {
25
+ public:
26
+ const std::string& text() const { return text_; }
27
+ void set_text(std::string value) { text_ = std::move(value); }
28
+
29
+ private:
30
+ std::string text_;
31
+ };
32
+
33
+ }
34
+
35
+ #endif
@@ -0,0 +1,106 @@
1
+ /* Copyright 2022 Akihiko Odaki <akihiko.odaki@gmail.com>
2
+ All Rights Reserved.
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ http://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ ==============================================================================*/
16
+
17
+ #ifndef TASK_SPEC_PB_H_
18
+ #define TASK_SPEC_PB_H_
19
+
20
+ #include <string>
21
+ #include <vector>
22
+
23
+ namespace chrome_lang_id {
24
+
25
+ class TaskInput {
26
+ public:
27
+ class Part {
28
+ public:
29
+ const std::string& file_pattern() const { return file_pattern_; }
30
+
31
+ private:
32
+ std::string file_pattern_;
33
+ };
34
+
35
+ const std::string& name() const { return name_; }
36
+
37
+ void set_name(std::string value) { name_ = value; }
38
+
39
+ int file_format_size() const { return file_format_.size(); }
40
+
41
+ const std::string& file_format(int index) const {
42
+ return file_format_[index];
43
+ }
44
+
45
+ void add_file_format(std::string value) {
46
+ file_format_.push_back(std::move(value));
47
+ }
48
+
49
+ int record_format_size() const { return record_format_.size(); }
50
+
51
+ const std::string& record_format(int index) const {
52
+ return record_format_[index];
53
+ }
54
+
55
+ void add_record_format(std::string value) {
56
+ record_format_.push_back(std::move(value));
57
+ }
58
+
59
+ int part_size() const { return part_.size(); }
60
+ const Part& part(int index) const { return part_[index]; }
61
+
62
+ private:
63
+ std::string name_;
64
+ std::vector<std::string> file_format_;
65
+ std::vector<std::string> record_format_;
66
+ std::vector<Part> part_;
67
+ };
68
+
69
+ class TaskSpec {
70
+ public:
71
+ class Parameter {
72
+ public:
73
+ const std::string& name() const { return name_; }
74
+ void set_name(std::string value) { name_ = std::move(value); }
75
+ const std::string& value() const { return value_; }
76
+ void set_value(std::string value) { value_ = std::move(value); }
77
+
78
+ private:
79
+ std::string name_;
80
+ std::string value_;
81
+ };
82
+
83
+ int parameter_size() const { return parameter_.size(); }
84
+
85
+ Parameter* mutable_parameter(int index) { return &parameter_[index]; }
86
+
87
+ const Parameter& parameter(int index) const { return parameter_[index]; }
88
+
89
+ Parameter* add_parameter() { return &parameter_.emplace_back(); }
90
+
91
+ int input_size() const { return input_.size(); }
92
+
93
+ TaskInput* mutable_input(int index) { return &input_[index]; }
94
+
95
+ const TaskInput& input(int index) const { return input_[index]; }
96
+
97
+ TaskInput* add_input() { return &input_.emplace_back(); }
98
+
99
+ private:
100
+ std::vector<Parameter> parameter_;
101
+ std::vector<TaskInput> input_;
102
+ };
103
+
104
+ }
105
+
106
+ #endif
Binary file
Binary file
data/ext/cld3/extconf.rb CHANGED
@@ -26,17 +26,8 @@ rescue
26
26
  end
27
27
  end
28
28
 
29
- # Check pkg-config first to inform the library is missing if so.
30
- pkg_config("protobuf") or abort "Failed to locate protobuf"
31
-
32
- FileUtils.mkdir_p("cld_3/protos")
33
29
  FileUtils.mkdir_p("script_span")
34
30
 
35
- [ "feature_extractor", "sentence", "task_spec" ].each {|name|
36
- system "protoc", "#{name}.proto", "--cpp_out=.", exception: true
37
- ln_fallback("#{name}.pb.h", "cld_3/protos/#{name}.pb.h")
38
- }
39
-
40
31
  [
41
32
  "fixunicodevalue.h",
42
33
  "generated_ulscript.h",
@@ -55,6 +46,6 @@ FileUtils.mkdir_p("script_span")
55
46
  ln_fallback("#{name}", "script_span/#{name}")
56
47
  }
57
48
 
58
- $CXXFLAGS += " -fvisibility=hidden -std=c++11"
49
+ $CXXFLAGS += " -fvisibility=hidden -std=c++17"
59
50
  $LIBRUBYARG = ""
60
51
  create_makefile("libcld3")
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
@@ -878,7 +878,6 @@ bool ScriptScanner::GetOneScriptSpan(LangSpan* span) {
878
878
  // copying letters to buffer with single spaces for each run of non-letters
879
879
  while (take < byte_length_) {
880
880
  // Copy run of letters in same script (&LS | LS)*
881
- int letter_count = 0; // Keep track of word length
882
881
  bool need_break = false;
883
882
 
884
883
  while (take < byte_length_) {
@@ -963,7 +962,6 @@ bool ScriptScanner::GetOneScriptSpan(LangSpan* span) {
963
962
  map2original_.Delete(tlen - plen);
964
963
  }
965
964
 
966
- ++letter_count;
967
965
  if (put >= kMaxScriptBytes) {
968
966
  // Buffer is full
969
967
  span->truncated = true;
Binary file
Binary file
data/ext/cld3/libcld3.so CHANGED
Binary file
Binary file
Binary file
data/ext/cld3/offsetmap.o CHANGED
Binary file
data/ext/cld3/registry.o CHANGED
Binary file
Binary file
@@ -0,0 +1,69 @@
1
+ // Copyright 2013 Google Inc. All Rights Reserved.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+
15
+ //
16
+ // Routine that maps a Unicode code point to an interchange-valid one
17
+ //
18
+ // Table that maps MS CP1252 bytes 00-FF to their corresponding Unicode
19
+ // code points. C0 and C1 control codes that are not interchange-valid
20
+ // are mapped to spaces.
21
+
22
+
23
+ #ifndef SCRIPT_SPAN_FIXUNICODEVALUE_H_
24
+ #define SCRIPT_SPAN_FIXUNICODEVALUE_H_
25
+
26
+ #include "integral_types.h" // for char32
27
+ #include "port.h"
28
+
29
+ namespace chrome_lang_id {
30
+ namespace CLD2 {
31
+
32
+ // Map byte value 0000-00FF to char32
33
+ // Maps C0 control codes (other than CR LF HT FF) to space [29 instances including DEL=0x7F]
34
+ // Maps C1 control codes to CP1252 [27 instances] or space [5 instances]
35
+ static const char32 kMapFullMicrosoft1252OrSpace[256] = {
36
+ 0x20,0x20,0x20,0x20, 0x20,0x20,0x20,0x20, 0x20,0x09,0x0a,0x20, 0x0c,0x0d,0x20,0x20, // 00
37
+ 0x20,0x20,0x20,0x20, 0x20,0x20,0x20,0x20, 0x20,0x20,0x20,0x20, 0x20,0x20,0x20,0x20,
38
+ 0x20,0x21,0x22,0x23, 0x24,0x25,0x26,0x27, 0x28,0x29,0x2a,0x2b, 0x2c,0x2d,0x2e,0x2f,
39
+ 0x30,0x31,0x32,0x33, 0x34,0x35,0x36,0x37, 0x38,0x39,0x3a,0x3b, 0x3c,0x3d,0x3e,0x3f,
40
+
41
+ 0x40,0x41,0x42,0x43, 0x44,0x45,0x46,0x47, 0x48,0x49,0x4a,0x4b, 0x4c,0x4d,0x4e,0x4f, // 40
42
+ 0x50,0x51,0x52,0x53, 0x54,0x55,0x56,0x57, 0x58,0x59,0x5a,0x5b, 0x5c,0x5d,0x5e,0x5f,
43
+ 0x60,0x61,0x62,0x63, 0x64,0x65,0x66,0x67, 0x68,0x69,0x6a,0x6b, 0x6c,0x6d,0x6e,0x6f,
44
+ 0x70,0x71,0x72,0x73, 0x74,0x75,0x76,0x77, 0x78,0x79,0x7a,0x7b, 0x7c,0x7d,0x7e,0x20,
45
+
46
+ 0x20ac,0x20,0x201a,0x0192, 0x201e,0x2026,0x2020,0x2021, // 80
47
+ 0x02c6,0x2030,0x0160,0x2039, 0x0152,0x20,0x017d,0x20,
48
+ 0x20,0x2018,0x2019,0x201c, 0x201d,0x2022,0x2013,0x2014,
49
+ 0x02dc,0x2122,0x0161,0x203a, 0x0153,0x20,0x017e,0x0178,
50
+ 0xa0,0xa1,0xa2,0xa3, 0xa4,0xa5,0xa6,0xa7, 0xa8,0xa9,0xaa,0xab, 0xac,0xad,0xae,0xaf, // A0
51
+ 0xb0,0xb1,0xb2,0xb3, 0xb4,0xb5,0xb6,0xb7, 0xb8,0xb9,0xba,0xbb, 0xbc,0xbd,0xbe,0xbf,
52
+
53
+ 0xc0,0xc1,0xc2,0xc3, 0xc4,0xc5,0xc6,0xc7, 0xc8,0xc9,0xca,0xcb, 0xcc,0xcd,0xce,0xcf, // C0
54
+ 0xd0,0xd1,0xd2,0xd3, 0xd4,0xd5,0xd6,0xd7, 0xd8,0xd9,0xda,0xdb, 0xdc,0xdd,0xde,0xdf,
55
+ 0xe0,0xe1,0xe2,0xe3, 0xe4,0xe5,0xe6,0xe7, 0xe8,0xe9,0xea,0xeb, 0xec,0xed,0xee,0xef,
56
+ 0xf0,0xf1,0xf2,0xf3, 0xf4,0xf5,0xf6,0xf7, 0xf8,0xf9,0xfa,0xfb, 0xfc,0xfd,0xfe,0xff,
57
+ };
58
+
59
+ // Guarantees that the resulting output value is interchange valid
60
+ // 00-FF; map to spaces or MS CP1252
61
+ // D800-DFFF; surrogates
62
+ // FDD0-FDEF; non-characters
63
+ // xxFFFE-xxFFFF; non-characters
64
+ char32 FixUnicodeValue(char32 uv);
65
+
66
+ } // End namespace CLD2
67
+ } // End namespace chrome_lang_id
68
+
69
+ #endif // SCRIPT_SPAN_FIXUNICODEVALUE_H_