cld3 3.4.4 → 3.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile +0 -1
  3. data/README.md +4 -7
  4. data/cld3.gemspec +5 -5
  5. data/ext/cld3/Makefile +17 -16
  6. data/ext/cld3/base.o +0 -0
  7. data/ext/cld3/cld_3/protos/feature_extractor.pb.h +100 -0
  8. data/ext/cld3/cld_3/protos/sentence.pb.h +35 -0
  9. data/ext/cld3/cld_3/protos/task_spec.pb.h +106 -0
  10. data/ext/cld3/embedding_feature_extractor.o +0 -0
  11. data/ext/cld3/embedding_network.o +0 -0
  12. data/ext/cld3/extconf.rb +1 -10
  13. data/ext/cld3/feature_extractor.o +0 -0
  14. data/ext/cld3/feature_types.o +0 -0
  15. data/ext/cld3/fixunicodevalue.o +0 -0
  16. data/ext/cld3/fml_parser.o +0 -0
  17. data/ext/cld3/generated_entities.o +0 -0
  18. data/ext/cld3/generated_ulscript.o +0 -0
  19. data/ext/cld3/getonescriptspan.cc +0 -2
  20. data/ext/cld3/getonescriptspan.o +0 -0
  21. data/ext/cld3/lang_id_nn_params.o +0 -0
  22. data/ext/cld3/language_identifier_features.o +0 -0
  23. data/ext/cld3/libcld3.so +0 -0
  24. data/ext/cld3/nnet_language_identifier.o +0 -0
  25. data/ext/cld3/nnet_language_identifier_c.o +0 -0
  26. data/ext/cld3/offsetmap.o +0 -0
  27. data/ext/cld3/registry.o +0 -0
  28. data/ext/cld3/relevant_script_feature.o +0 -0
  29. data/ext/cld3/script_span/fixunicodevalue.h +69 -0
  30. data/ext/cld3/script_span/generated_ulscript.h +142 -0
  31. data/ext/cld3/script_span/getonescriptspan.h +124 -0
  32. data/ext/cld3/script_span/integral_types.h +37 -0
  33. data/ext/cld3/script_span/offsetmap.h +168 -0
  34. data/ext/cld3/script_span/port.h +143 -0
  35. data/ext/cld3/script_span/stringpiece.h +81 -0
  36. data/ext/cld3/script_span/text_processing.h +30 -0
  37. data/ext/cld3/script_span/utf8acceptinterchange.h +486 -0
  38. data/ext/cld3/script_span/utf8prop_lettermarkscriptnum.h +1631 -0
  39. data/ext/cld3/script_span/utf8repl_lettermarklower.h +758 -0
  40. data/ext/cld3/script_span/utf8scannot_lettermarkspecial.h +1455 -0
  41. data/ext/cld3/script_span/utf8statetable.h +285 -0
  42. data/ext/cld3/sentence_features.o +0 -0
  43. data/ext/cld3/task_context.o +0 -0
  44. data/ext/cld3/task_context_params.o +0 -0
  45. data/ext/cld3/text_processing.o +0 -0
  46. data/ext/cld3/unicodetext.o +0 -0
  47. data/ext/cld3/utf8statetable.o +0 -0
  48. data/ext/cld3/utils.o +0 -0
  49. data/ext/cld3/workspace.o +0 -0
  50. data/lib/cld3.rb +4 -1
  51. metadata +33 -25
  52. data/ext/cld3/feature_extractor.pb.o +0 -0
  53. data/ext/cld3/feature_extractor.proto +0 -50
  54. data/ext/cld3/mkmf.log +0 -37
  55. data/ext/cld3/sentence.pb.o +0 -0
  56. data/ext/cld3/sentence.proto +0 -77
  57. data/ext/cld3/task_spec.pb.o +0 -0
  58. data/ext/cld3/task_spec.proto +0 -98
  59. data/lib/a.rb +0 -24
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: f40e4947fea97543686caceba0082bdba30b5ae0485a25b41004ad048057b0ad
4
- data.tar.gz: e45c60300550caf513fdde6bcbc05e68e1063bf9ad8074626bf5f88f4a6f77bd
3
+ metadata.gz: e12ff58a2cc0242896e307f33cb1adb2e210a1889244861e2c751b0399cb6415
4
+ data.tar.gz: 3b0c348b00126c8ee825a76eed0919e074a7f97e7dc7e647614f7a754c69d716
5
5
  SHA512:
6
- metadata.gz: 393fc138a279ee42c3de90c49bcc982e55860f74e2796d4c895d0f2f175894bcb1ec1bbe796811f896a16be9cc97943e1309cbe175bc029a510b4c51b2f700da
7
- data.tar.gz: d16e8c87e7d12cc90cc1a4babb4873df8f553d9527e1d69a548a250ae0b240f79a6338070bbc88cbb0e23db48c23ef0393cd4b62e0ac673722ace81ce1564895
6
+ metadata.gz: 690c6e6f0cc8f0cc369b05a611daeed74adc273cd9c02f780233921ed0c220a425552e7020c127ff0503e88d473f57088cafd3b0d2143f3f86527d886d3eaf63
7
+ data.tar.gz: 6c3b7c105fa799fb918a077d9d1f47c948ec29bad86e57f452ea9eae82fc4f5bfc2686654a2a68ebc7102dfcb34e2a1ca55648d75a1f6bcf97b532de482de76e
data/Gemfile CHANGED
@@ -15,5 +15,4 @@
15
15
  #==============================================================================
16
16
 
17
17
  source 'https://rubygems.org'
18
- gem 'steep', github: 'akihikodaki/steep', branch: 'cld3'
19
18
  gemspec
data/README.md CHANGED
@@ -19,7 +19,6 @@ cld3.find_language("здравствуйте") # => #<struct Struct::Result lang
19
19
  ### Prerequisites
20
20
  * [Bundler](http://bundler.io/)
21
21
  * C++ compiler
22
- * [Protocol buffers](https://developers.google.com/protocol-buffers/)
23
22
  * [Rake](https://ruby.github.io/rake/)
24
23
  * [RubyGems](https://rubygems.org/)
25
24
 
@@ -42,17 +41,15 @@ change:
42
41
  https://github.com/jruby/jruby/pull/4118/commits/edad375ef4dcf195b19ce0afe4befac66468c736
43
42
 
44
43
  ### Troubleshooting
45
- `gem install cld3` triggers native library building. If it fails, you are likely
46
- to missing required facilities. Make sure C++ compiler and protocol buffers
47
- is installed. I recommend [GCC](https://gcc.gnu.org/) as a C++ compiler. Ruby is
48
- likely to need [pkg-config](https://www.freedesktop.org/wiki/Software/pkg-config/)
49
- as well.
44
+ `gem install cld3` triggers native library building. If it fails, it is likely
45
+ that some required facilities are missing. Make sure a C++ compiler is installed.
46
+ I recommend [GCC](https://gcc.gnu.org/) as a C++ compiler.
50
47
 
51
48
  Runtime errors are likely to be issues of [FFI](https://github.com/ffi/ffi) or
52
49
  programming errors. Make sure they are all correct.
53
50
 
54
51
  If you cannot identify the cause of your problem, run the spec of this library and
55
- see whether the problem is reproducable with it or not. Spec is not included in
52
+ see whether the problem is reproducible with it or not. Spec is not included in
56
53
  the gem, so clone the source code repository and then run `rake spec`.
57
54
  The source code repository is at
58
55
  https://github.com/akihikodaki/cld3-ruby.
data/cld3.gemspec CHANGED
@@ -16,18 +16,18 @@
16
16
 
17
17
  Gem::Specification.new do |gem|
18
18
  gem.name = "cld3"
19
- gem.version = "3.4.4"
19
+ gem.version = "3.5.0"
20
20
  gem.summary = "Compact Language Detector v3 (CLD3)"
21
21
  gem.description = "Compact Language Detector v3 (CLD3) is a neural network model for language identification."
22
22
  gem.license = "Apache-2.0"
23
23
  gem.homepage = "https://github.com/akihikodaki/cld3-ruby"
24
24
  gem.author = "Akihiko Odaki"
25
25
  gem.email = "akihiko.odaki@gmail.com"
26
- gem.required_ruby_version = [ ">= 2.6.0", "< 3.2.0" ]
26
+ gem.required_ruby_version = [ ">= 2.7.0", "< 3.3.0" ]
27
27
  gem.add_dependency "ffi", [ ">= 1.1.0", "< 1.16.0" ]
28
- gem.add_development_dependency "rbs", [ ">= 1.7.0", "< 1.8.0" ]
29
- gem.add_development_dependency "rspec", [ ">=3.0.0", "< 3.11.0" ]
30
- gem.add_development_dependency "steep", [ ">= 0.47.0", "< 0.48.0" ]
28
+ gem.add_development_dependency "rbs", [ ">= 2.6.0", "< 2.7.0" ]
29
+ gem.add_development_dependency "rspec", [ ">= 3.11.0", "< 3.12.0" ]
30
+ gem.add_development_dependency "steep", [ ">= 1.0.0", "< 1.1.0" ]
31
31
  gem.files = Dir[
32
32
  "Gemfile", "LICENSE", "LICENSE_CLD3", "README.md",
33
33
  "cld3.gemspec", "ext/**/*", "lib/**/*", "sig/**/*"
data/ext/cld3/Makefile CHANGED
@@ -3,6 +3,7 @@ SHELL = /bin/sh
3
3
 
4
4
  # V=0 quiet, V=1 verbose. other values don't work.
5
5
  V = 1
6
+ V0 = $(V:0=)
6
7
  Q1 = $(V:1=)
7
8
  Q = $(Q1:0=@)
8
9
  ECHO1 = $(V:1=@ :)
@@ -52,7 +53,7 @@ htmldir = $(docdir)
52
53
  infodir = $(DESTDIR)/usr/share/info
53
54
  docdir = $(datarootdir)/doc/$(PACKAGE)
54
55
  oldincludedir = $(DESTDIR)/usr/include
55
- includedir = $(DESTDIR)/usr/include
56
+ includedir = $(exec_prefix)/include
56
57
  runstatedir = $(localstatedir)/run
57
58
  localstatedir = $(DESTDIR)/var
58
59
  sharedstatedir = $(DESTDIR)/var/lib
@@ -80,18 +81,18 @@ CSRCFLAG = $(empty)
80
81
  RUBY_EXTCONF_H =
81
82
  cflags = $(optflags) $(debugflags) $(warnflags)
82
83
  cxxflags =
83
- optflags = -O3
84
+ optflags = -O3 -fno-fast-math
84
85
  debugflags = -ggdb3
85
- warnflags = -Wall -Wextra -Wdeprecated-declarations -Wduplicated-cond -Wimplicit-function-declaration -Wimplicit-int -Wmisleading-indentation -Wpointer-arith -Wwrite-strings -Wimplicit-fallthrough=0 -Wmissing-noreturn -Wno-cast-function-type -Wno-constant-logical-operand -Wno-long-long -Wno-missing-field-initializers -Wno-overlength-strings -Wno-packed-bitfield-compat -Wno-parentheses-equality -Wno-self-assign -Wno-tautological-compare -Wno-unused-parameter -Wno-unused-value -Wsuggest-attribute=format -Wsuggest-attribute=noreturn -Wunused-variable
86
+ warnflags = -Wall -Wextra -Wdeprecated-declarations -Wduplicated-cond -Wimplicit-function-declaration -Wimplicit-int -Wmisleading-indentation -Wpointer-arith -Wwrite-strings -Wold-style-definition -Wimplicit-fallthrough=0 -Wmissing-noreturn -Wno-cast-function-type -Wno-constant-logical-operand -Wno-long-long -Wno-missing-field-initializers -Wno-overlength-strings -Wno-packed-bitfield-compat -Wno-parentheses-equality -Wno-self-assign -Wno-tautological-compare -Wno-unused-parameter -Wno-unused-value -Wsuggest-attribute=format -Wsuggest-attribute=noreturn -Wunused-variable -Wundef
86
87
  cppflags =
87
88
  CCDLFLAGS = -fPIC
88
- CFLAGS = $(CCDLFLAGS) -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/redhat/redhat-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/redhat/redhat-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -fPIC $(ARCH_FLAG)
89
- INCFLAGS = -I. -I$(arch_hdrdir) -I$(hdrdir)/ruby/backward -I$(hdrdir) -I$(srcdir)
89
+ CFLAGS = $(CCDLFLAGS) -O2 -flto=auto -ffat-lto-objects -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/redhat/redhat-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/redhat/redhat-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -fPIC $(ARCH_FLAG)
90
+ INCFLAGS = -I. -I$(arch_hdrdir) -I$(hdrdir)/ruby/backward -I$(hdrdir) -I$(srcdir)
90
91
  DEFS =
91
92
  CPPFLAGS = $(DEFS) $(cppflags)
92
- CXXFLAGS = $(CCDLFLAGS) -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/redhat/redhat-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/redhat/redhat-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -fvisibility=hidden -std=c++11 $(ARCH_FLAG)
93
- ldflags = -L. -Wl,-z,relro -Wl,--as-needed -Wl,-z,now -specs=/usr/lib/rpm/redhat/redhat-hardened-ld -fstack-protector-strong -rdynamic -Wl,-export-dynamic
94
- dldflags = -Wl,-z,relro -Wl,--as-needed -Wl,-z,now -specs=/usr/lib/rpm/redhat/redhat-hardened-ld
93
+ CXXFLAGS = $(CCDLFLAGS) -O2 -flto=auto -ffat-lto-objects -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/redhat/redhat-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/redhat/redhat-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -fvisibility=hidden -std=c++17 $(ARCH_FLAG)
94
+ ldflags = -L. -Wl,-z,relro -Wl,--as-needed -Wl,-z,now -specs=/usr/lib/rpm/redhat/redhat-hardened-ld -specs=/usr/lib/rpm/redhat/redhat-annobin-cc1 -Wl,--build-id=sha1 -fstack-protector-strong -rdynamic -Wl,-export-dynamic -Wl,--no-as-needed
95
+ dldflags = -Wl,-z,relro -Wl,--as-needed -Wl,-z,now -specs=/usr/lib/rpm/redhat/redhat-hardened-ld -specs=/usr/lib/rpm/redhat/redhat-annobin-cc1 -Wl,--build-id=sha1
95
96
  ARCH_FLAG =
96
97
  DLDFLAGS = $(ldflags) $(dldflags) $(ARCH_FLAG)
97
98
  LDSHARED = $(CC) -shared
@@ -108,13 +109,13 @@ RUBY_BASE_NAME = ruby
108
109
 
109
110
  arch = aarch64-linux
110
111
  sitearch = $(arch)
111
- ruby_version = 3.0.0
112
+ ruby_version = 3.1.0
112
113
  ruby = $(bindir)/$(RUBY_BASE_NAME)
113
114
  RUBY = $(ruby)
114
115
  ruby_headers = $(hdrdir)/ruby.h $(hdrdir)/ruby/backward.h $(hdrdir)/ruby/ruby.h $(hdrdir)/ruby/defines.h $(hdrdir)/ruby/missing.h $(hdrdir)/ruby/intern.h $(hdrdir)/ruby/st.h $(hdrdir)/ruby/subst.h $(arch_hdrdir)/ruby/config.h
115
116
 
116
117
  RM = rm -f
117
- RM_RF = $(RUBY) -run -e rm -- -rf
118
+ RM_RF = rm -fr
118
119
  RMDIRS = rmdir --ignore-fail-on-non-empty -p
119
120
  MAKEDIRS = /usr/bin/mkdir -p
120
121
  INSTALL = /usr/bin/install -c
@@ -138,11 +139,11 @@ extout =
138
139
  extout_prefix =
139
140
  target_prefix =
140
141
  LOCAL_LIBS =
141
- LIBS = -lprotobuf -lpthread -lm -lc
142
- ORIG_SRCS = base.cc embedding_feature_extractor.cc embedding_network.cc feature_extractor.cc feature_extractor.pb.cc feature_types.cc fixunicodevalue.cc fml_parser.cc generated_entities.cc generated_ulscript.cc getonescriptspan.cc lang_id_nn_params.cc language_identifier_features.cc nnet_language_identifier.cc nnet_language_identifier_c.cc offsetmap.cc registry.cc relevant_script_feature.cc sentence.pb.cc sentence_features.cc task_context.cc task_context_params.cc task_spec.pb.cc text_processing.cc unicodetext.cc utf8statetable.cc utils.cc workspace.cc
142
+ LIBS = -lm -lc
143
+ ORIG_SRCS = base.cc embedding_feature_extractor.cc embedding_network.cc feature_extractor.cc feature_types.cc fixunicodevalue.cc fml_parser.cc generated_entities.cc generated_ulscript.cc getonescriptspan.cc lang_id_nn_params.cc language_identifier_features.cc nnet_language_identifier.cc nnet_language_identifier_c.cc offsetmap.cc registry.cc relevant_script_feature.cc sentence_features.cc task_context.cc task_context_params.cc text_processing.cc unicodetext.cc utf8statetable.cc utils.cc workspace.cc
143
144
  SRCS = $(ORIG_SRCS)
144
- OBJS = base.o embedding_feature_extractor.o embedding_network.o feature_extractor.o feature_extractor.pb.o feature_types.o fixunicodevalue.o fml_parser.o generated_entities.o generated_ulscript.o getonescriptspan.o lang_id_nn_params.o language_identifier_features.o nnet_language_identifier.o nnet_language_identifier_c.o offsetmap.o registry.o relevant_script_feature.o sentence.pb.o sentence_features.o task_context.o task_context_params.o task_spec.pb.o text_processing.o unicodetext.o utf8statetable.o utils.o workspace.o
145
- HDRS = $(srcdir)/base.h $(srcdir)/casts.h $(srcdir)/embedding_feature_extractor.h $(srcdir)/embedding_network.h $(srcdir)/embedding_network_params.h $(srcdir)/feature_extractor.h $(srcdir)/feature_extractor.pb.h $(srcdir)/feature_types.h $(srcdir)/fixunicodevalue.h $(srcdir)/float16.h $(srcdir)/fml_parser.h $(srcdir)/generated_ulscript.h $(srcdir)/getonescriptspan.h $(srcdir)/integral_types.h $(srcdir)/lang_id_nn_params.h $(srcdir)/language_identifier_features.h $(srcdir)/nnet_language_identifier.h $(srcdir)/offsetmap.h $(srcdir)/port.h $(srcdir)/registry.h $(srcdir)/relevant_script_feature.h $(srcdir)/script_detector.h $(srcdir)/sentence.pb.h $(srcdir)/sentence_features.h $(srcdir)/simple_adder.h $(srcdir)/stringpiece.h $(srcdir)/task_context.h $(srcdir)/task_context_params.h $(srcdir)/task_spec.pb.h $(srcdir)/text_processing.h $(srcdir)/unicodetext.h $(srcdir)/utf8acceptinterchange.h $(srcdir)/utf8prop_lettermarkscriptnum.h $(srcdir)/utf8repl_lettermarklower.h $(srcdir)/utf8scannot_lettermarkspecial.h $(srcdir)/utf8statetable.h $(srcdir)/utils.h $(srcdir)/workspace.h
145
+ OBJS = base.o embedding_feature_extractor.o embedding_network.o feature_extractor.o feature_types.o fixunicodevalue.o fml_parser.o generated_entities.o generated_ulscript.o getonescriptspan.o lang_id_nn_params.o language_identifier_features.o nnet_language_identifier.o nnet_language_identifier_c.o offsetmap.o registry.o relevant_script_feature.o sentence_features.o task_context.o task_context_params.o text_processing.o unicodetext.o utf8statetable.o utils.o workspace.o
146
+ HDRS = $(srcdir)/base.h $(srcdir)/casts.h $(srcdir)/embedding_feature_extractor.h $(srcdir)/embedding_network.h $(srcdir)/embedding_network_params.h $(srcdir)/feature_extractor.h $(srcdir)/feature_types.h $(srcdir)/fixunicodevalue.h $(srcdir)/float16.h $(srcdir)/fml_parser.h $(srcdir)/generated_ulscript.h $(srcdir)/getonescriptspan.h $(srcdir)/integral_types.h $(srcdir)/lang_id_nn_params.h $(srcdir)/language_identifier_features.h $(srcdir)/nnet_language_identifier.h $(srcdir)/offsetmap.h $(srcdir)/port.h $(srcdir)/registry.h $(srcdir)/relevant_script_feature.h $(srcdir)/script_detector.h $(srcdir)/sentence_features.h $(srcdir)/simple_adder.h $(srcdir)/stringpiece.h $(srcdir)/task_context.h $(srcdir)/task_context_params.h $(srcdir)/text_processing.h $(srcdir)/unicodetext.h $(srcdir)/utf8acceptinterchange.h $(srcdir)/utf8prop_lettermarkscriptnum.h $(srcdir)/utf8repl_lettermarklower.h $(srcdir)/utf8scannot_lettermarkspecial.h $(srcdir)/utf8statetable.h $(srcdir)/utils.h $(srcdir)/workspace.h
146
147
  LOCAL_HDRS =
147
148
  TARGET = libcld3
148
149
  TARGET_NAME = libcld3
@@ -160,7 +161,7 @@ HDRDIR = $(sitehdrdir)$(target_prefix)
160
161
  ARCHHDRDIR = $(sitearchhdrdir)$(target_prefix)
161
162
  TARGET_SO_DIR =
162
163
  TARGET_SO = $(TARGET_SO_DIR)$(DLLIB)
163
- CLEANLIBS = $(TARGET_SO)
164
+ CLEANLIBS = $(TARGET_SO) false
164
165
  CLEANOBJS = *.o *.bak
165
166
 
166
167
  all: $(DLLIB)
@@ -173,7 +174,7 @@ clean-rb-default::
173
174
  clean-rb::
174
175
  clean-so::
175
176
  clean: clean-so clean-static clean-rb-default clean-rb
176
- -$(Q)$(RM) $(CLEANLIBS) $(CLEANOBJS) $(CLEANFILES) .*.time
177
+ -$(Q)$(RM_RF) $(CLEANLIBS) $(CLEANOBJS) $(CLEANFILES) .*.time
177
178
 
178
179
  distclean-rb-default::
179
180
  distclean-rb::
data/ext/cld3/base.o CHANGED
Binary file
@@ -0,0 +1,100 @@
1
+ /* Copyright 2022 Akihiko Odaki <akihiko.odaki@gmail.com>
2
+ All Rights Reserved.
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ http://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ ==============================================================================*/
16
+
17
+ #ifndef FEATURE_EXTRACTOR_PB_H_
18
+ #define FEATURE_EXTRACTOR_PB_H_
19
+
20
+ #include <cstdint>
21
+ #include <string>
22
+ #include <vector>
23
+
24
+ namespace chrome_lang_id {
25
+
26
// A name/value pair of strings. Both fields are empty on construction.
class Parameter {
 public:
  // Read access to the stored name.
  const std::string& name() const {
    return name_;
  }

  // Read access to the stored value.
  const std::string& value() const {
    return value_;
  }

  // Replaces the stored name. The argument is taken by value and moved in,
  // so callers passing a temporary pay for no extra copy.
  void set_name(std::string value) {
    name_ = std::move(value);
  }

  // Replaces the stored value; same ownership convention as set_name().
  void set_value(std::string value) {
    value_ = std::move(value);
  }

 private:
  std::string name_;
  std::string value_;
};
37
+
38
+ class FeatureFunctionDescriptor {
39
+ public:
40
+ const std::string& type() const { return type_; }
41
+
42
+ void set_type(std::string value) { type_ = std::move(value); }
43
+
44
+ const std::string& name() const { return name_; }
45
+
46
+ void set_name(std::string value) { name_ = std::move(value); }
47
+
48
+ bool has_argument() const { return true; }
49
+
50
+ std::int32_t argument() const { return argument_; }
51
+
52
+ void set_argument(int32_t value) { argument_ = value; }
53
+
54
+ int parameter_size() const { return parameter_.size(); }
55
+
56
+ const Parameter& parameter(int index) const { return parameter_[index]; }
57
+
58
+ Parameter* add_parameter() { return &parameter_.emplace_back(); }
59
+
60
+ int feature_size() const { return feature_.size(); }
61
+
62
+ FeatureFunctionDescriptor* mutable_feature(int index) {
63
+ return &feature_[index];
64
+ }
65
+
66
+ const FeatureFunctionDescriptor& feature(int index) const {
67
+ return feature_[index];
68
+ }
69
+
70
+ FeatureFunctionDescriptor* add_feature() { return &feature_.emplace_back(); }
71
+
72
+ private:
73
+ std::string type_;
74
+ std::string name_;
75
+ std::int32_t argument_;
76
+ std::vector<Parameter> parameter_;
77
+ std::vector<FeatureFunctionDescriptor> feature_;
78
+ };
79
+
80
+ class FeatureExtractorDescriptor {
81
+ public:
82
+ int feature_size() const { return feature_.size(); }
83
+
84
+ FeatureFunctionDescriptor* mutable_feature(int index) {
85
+ return &feature_[index];
86
+ }
87
+
88
+ const FeatureFunctionDescriptor& feature(int index) const {
89
+ return feature_[index];
90
+ }
91
+
92
+ FeatureFunctionDescriptor* add_feature() { return &feature_.emplace_back(); }
93
+
94
+ private:
95
+ std::vector<FeatureFunctionDescriptor> feature_;
96
+ };
97
+
98
+ }
99
+
100
+ #endif
@@ -0,0 +1,35 @@
1
+ /* Copyright 2022 Akihiko Odaki <akihiko.odaki@gmail.com>
2
+ All Rights Reserved.
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ http://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ ==============================================================================*/
16
+
17
+ #ifndef SENTENCE_PB_H_
18
+ #define SENTENCE_PB_H_
19
+
20
+ #include <string>
21
+
22
+ namespace chrome_lang_id {
23
+
24
// Minimal value type wrapping a single text string, empty on construction.
class Sentence {
 public:
  // Read access to the stored text.
  const std::string& text() const {
    return text_;
  }

  // Replaces the stored text; the by-value argument is moved into place.
  void set_text(std::string value) {
    text_ = std::move(value);
  }

 private:
  std::string text_;
};
32
+
33
+ }
34
+
35
+ #endif
@@ -0,0 +1,106 @@
1
+ /* Copyright 2022 Akihiko Odaki <akihiko.odaki@gmail.com>
2
+ All Rights Reserved.
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ http://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ ==============================================================================*/
16
+
17
+ #ifndef TASK_SPEC_PB_H_
18
+ #define TASK_SPEC_PB_H_
19
+
20
+ #include <string>
21
+ #include <vector>
22
+
23
+ namespace chrome_lang_id {
24
+
25
// Describes one named task input: its name, the file formats and record
// formats it accepts, and a list of parts.
class TaskInput {
 public:
  // One part of an input, identified by a file pattern string.
  class Part {
   public:
    const std::string& file_pattern() const { return file_pattern_; }

   private:
    std::string file_pattern_;
  };

  const std::string& name() const { return name_; }

  // Takes the string by value and moves it into place, like the add_*
  // functions below, instead of copying it a second time.
  void set_name(std::string value) { name_ = std::move(value); }

  // Number of file formats, narrowed to int to keep the existing signature.
  int file_format_size() const {
    return static_cast<int>(file_format_.size());
  }

  // `index` must be in [0, file_format_size()).
  const std::string& file_format(int index) const {
    return file_format_[index];
  }

  void add_file_format(std::string value) {
    file_format_.push_back(std::move(value));
  }

  // Number of record formats, narrowed to int to keep the existing signature.
  int record_format_size() const {
    return static_cast<int>(record_format_.size());
  }

  // `index` must be in [0, record_format_size()).
  const std::string& record_format(int index) const {
    return record_format_[index];
  }

  void add_record_format(std::string value) {
    record_format_.push_back(std::move(value));
  }

  // This class exposes no mutator for the part list, so it is only ever read.
  int part_size() const { return static_cast<int>(part_.size()); }

  // `index` must be in [0, part_size()).
  const Part& part(int index) const { return part_[index]; }

 private:
  std::string name_;
  std::vector<std::string> file_format_;
  std::vector<std::string> record_format_;
  std::vector<Part> part_;
};
68
+
69
+ class TaskSpec {
70
+ public:
71
+ class Parameter {
72
+ public:
73
+ const std::string& name() const { return name_; }
74
+ void set_name(std::string value) { name_ = std::move(value); }
75
+ const std::string& value() const { return value_; }
76
+ void set_value(std::string value) { value_ = std::move(value); }
77
+
78
+ private:
79
+ std::string name_;
80
+ std::string value_;
81
+ };
82
+
83
+ int parameter_size() const { return parameter_.size(); }
84
+
85
+ Parameter* mutable_parameter(int index) { return &parameter_[index]; }
86
+
87
+ const Parameter& parameter(int index) const { return parameter_[index]; }
88
+
89
+ Parameter* add_parameter() { return &parameter_.emplace_back(); }
90
+
91
+ int input_size() const { return input_.size(); }
92
+
93
+ TaskInput* mutable_input(int index) { return &input_[index]; }
94
+
95
+ const TaskInput& input(int index) const { return input_[index]; }
96
+
97
+ TaskInput* add_input() { return &input_.emplace_back(); }
98
+
99
+ private:
100
+ std::vector<Parameter> parameter_;
101
+ std::vector<TaskInput> input_;
102
+ };
103
+
104
+ }
105
+
106
+ #endif
Binary file
Binary file
data/ext/cld3/extconf.rb CHANGED
@@ -26,17 +26,8 @@ rescue
26
26
  end
27
27
  end
28
28
 
29
- # Check pkg-config first to inform the library is missing if so.
30
- pkg_config("protobuf") or abort "Failed to locate protobuf"
31
-
32
- FileUtils.mkdir_p("cld_3/protos")
33
29
  FileUtils.mkdir_p("script_span")
34
30
 
35
- [ "feature_extractor", "sentence", "task_spec" ].each {|name|
36
- system "protoc", "#{name}.proto", "--cpp_out=.", exception: true
37
- ln_fallback("#{name}.pb.h", "cld_3/protos/#{name}.pb.h")
38
- }
39
-
40
31
  [
41
32
  "fixunicodevalue.h",
42
33
  "generated_ulscript.h",
@@ -55,6 +46,6 @@ FileUtils.mkdir_p("script_span")
55
46
  ln_fallback("#{name}", "script_span/#{name}")
56
47
  }
57
48
 
58
- $CXXFLAGS += " -fvisibility=hidden -std=c++11"
49
+ $CXXFLAGS += " -fvisibility=hidden -std=c++17"
59
50
  $LIBRUBYARG = ""
60
51
  create_makefile("libcld3")
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
@@ -878,7 +878,6 @@ bool ScriptScanner::GetOneScriptSpan(LangSpan* span) {
878
878
  // copying letters to buffer with single spaces for each run of non-letters
879
879
  while (take < byte_length_) {
880
880
  // Copy run of letters in same script (&LS | LS)*
881
- int letter_count = 0; // Keep track of word length
882
881
  bool need_break = false;
883
882
 
884
883
  while (take < byte_length_) {
@@ -963,7 +962,6 @@ bool ScriptScanner::GetOneScriptSpan(LangSpan* span) {
963
962
  map2original_.Delete(tlen - plen);
964
963
  }
965
964
 
966
- ++letter_count;
967
965
  if (put >= kMaxScriptBytes) {
968
966
  // Buffer is full
969
967
  span->truncated = true;
Binary file
Binary file
data/ext/cld3/libcld3.so CHANGED
Binary file
Binary file
Binary file
data/ext/cld3/offsetmap.o CHANGED
Binary file
data/ext/cld3/registry.o CHANGED
Binary file
Binary file
@@ -0,0 +1,69 @@
1
+ // Copyright 2013 Google Inc. All Rights Reserved.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+
15
+ //
16
+ // Routine that maps a Unicode code point to an interchange-valid one
17
+ //
18
+ // Table that maps MS CP1252 bytes 00-FF to their corresponding Unicode
19
+ // code points. C0 and C1 control codes that are not interchange-valid
20
+ // are mapped to spaces.
21
+
22
+
23
+ #ifndef SCRIPT_SPAN_FIXUNICODEVALUE_H_
24
+ #define SCRIPT_SPAN_FIXUNICODEVALUE_H_
25
+
26
+ #include "integral_types.h" // for char32
27
+ #include "port.h"
28
+
29
+ namespace chrome_lang_id {
30
+ namespace CLD2 {
31
+
32
+ // Lookup table indexed by byte value 0x00-0xFF, giving the char32 code point substituted for that byte; 0x20-0x7E and 0xA0-0xFF map to themselves.
33
+ // C0 control codes other than HT (0x09), LF (0x0a), FF (0x0c) and CR (0x0d) map to space (0x20) [29 instances including DEL=0x7F]
34
+ // C1 bytes 0x80-0x9F map to their CP1252 code points [27 instances] or to space [5 instances: 0x81, 0x8d, 0x8f, 0x90, 0x9d]
35
+ static const char32 kMapFullMicrosoft1252OrSpace[256] = {
36
+ 0x20,0x20,0x20,0x20, 0x20,0x20,0x20,0x20, 0x20,0x09,0x0a,0x20, 0x0c,0x0d,0x20,0x20, // 0x00-0x3F: this row and the next three
37
+ 0x20,0x20,0x20,0x20, 0x20,0x20,0x20,0x20, 0x20,0x20,0x20,0x20, 0x20,0x20,0x20,0x20,
38
+ 0x20,0x21,0x22,0x23, 0x24,0x25,0x26,0x27, 0x28,0x29,0x2a,0x2b, 0x2c,0x2d,0x2e,0x2f,
39
+ 0x30,0x31,0x32,0x33, 0x34,0x35,0x36,0x37, 0x38,0x39,0x3a,0x3b, 0x3c,0x3d,0x3e,0x3f,
40
+
41
+ 0x40,0x41,0x42,0x43, 0x44,0x45,0x46,0x47, 0x48,0x49,0x4a,0x4b, 0x4c,0x4d,0x4e,0x4f, // 0x40-0x7F: this row and the next three
42
+ 0x50,0x51,0x52,0x53, 0x54,0x55,0x56,0x57, 0x58,0x59,0x5a,0x5b, 0x5c,0x5d,0x5e,0x5f,
43
+ 0x60,0x61,0x62,0x63, 0x64,0x65,0x66,0x67, 0x68,0x69,0x6a,0x6b, 0x6c,0x6d,0x6e,0x6f,
44
+ 0x70,0x71,0x72,0x73, 0x74,0x75,0x76,0x77, 0x78,0x79,0x7a,0x7b, 0x7c,0x7d,0x7e,0x20,
45
+
46
+ 0x20ac,0x20,0x201a,0x0192, 0x201e,0x2026,0x2020,0x2021, // 0x80-0x9F: four rows of eight entries
47
+ 0x02c6,0x2030,0x0160,0x2039, 0x0152,0x20,0x017d,0x20,
48
+ 0x20,0x2018,0x2019,0x201c, 0x201d,0x2022,0x2013,0x2014,
49
+ 0x02dc,0x2122,0x0161,0x203a, 0x0153,0x20,0x017e,0x0178,
50
+ 0xa0,0xa1,0xa2,0xa3, 0xa4,0xa5,0xa6,0xa7, 0xa8,0xa9,0xaa,0xab, 0xac,0xad,0xae,0xaf, // 0xA0-0xBF: this row and the next
51
+ 0xb0,0xb1,0xb2,0xb3, 0xb4,0xb5,0xb6,0xb7, 0xb8,0xb9,0xba,0xbb, 0xbc,0xbd,0xbe,0xbf,
52
+
53
+ 0xc0,0xc1,0xc2,0xc3, 0xc4,0xc5,0xc6,0xc7, 0xc8,0xc9,0xca,0xcb, 0xcc,0xcd,0xce,0xcf, // 0xC0-0xFF: this row and the next three
54
+ 0xd0,0xd1,0xd2,0xd3, 0xd4,0xd5,0xd6,0xd7, 0xd8,0xd9,0xda,0xdb, 0xdc,0xdd,0xde,0xdf,
55
+ 0xe0,0xe1,0xe2,0xe3, 0xe4,0xe5,0xe6,0xe7, 0xe8,0xe9,0xea,0xeb, 0xec,0xed,0xee,0xef,
56
+ 0xf0,0xf1,0xf2,0xf3, 0xf4,0xf5,0xf6,0xf7, 0xf8,0xf9,0xfa,0xfb, 0xfc,0xfd,0xfe,0xff,
57
+ };
58
+
59
+ // Guarantees that the resulting output value is interchange valid
60
+ // 00-FF; map to spaces or MS CP1252
61
+ // D800-DFFF; surrogates
62
+ // FDD0-FDEF; non-characters
63
+ // xxFFFE-xxFFFF; non-characters
64
+ char32 FixUnicodeValue(char32 uv);
65
+
66
+ } // End namespace CLD2
67
+ } // End namespace chrome_lang_id
68
+
69
+ #endif // SCRIPT_SPAN_FIXUNICODEVALUE_H_