cld3 3.4.2 → 3.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. checksums.yaml +4 -4
  2. data/README.md +4 -25
  3. data/cld3.gemspec +6 -4
  4. data/ext/cld3/Makefile +21 -19
  5. data/ext/cld3/base.o +0 -0
  6. data/ext/cld3/cld_3/protos/feature_extractor.pb.h +100 -0
  7. data/ext/cld3/cld_3/protos/sentence.pb.h +35 -0
  8. data/ext/cld3/cld_3/protos/task_spec.pb.h +106 -0
  9. data/ext/cld3/embedding_feature_extractor.o +0 -0
  10. data/ext/cld3/embedding_network.cc +1 -0
  11. data/ext/cld3/embedding_network.o +0 -0
  12. data/ext/cld3/extconf.rb +1 -10
  13. data/ext/cld3/feature_extractor.o +0 -0
  14. data/ext/cld3/feature_types.o +0 -0
  15. data/ext/cld3/fixunicodevalue.o +0 -0
  16. data/ext/cld3/fml_parser.o +0 -0
  17. data/ext/cld3/generated_entities.o +0 -0
  18. data/ext/cld3/generated_ulscript.o +0 -0
  19. data/ext/cld3/getonescriptspan.cc +0 -2
  20. data/ext/cld3/getonescriptspan.h +2 -2
  21. data/ext/cld3/getonescriptspan.o +0 -0
  22. data/ext/cld3/lang_id_nn_params.o +0 -0
  23. data/ext/cld3/language_identifier_features.o +0 -0
  24. data/ext/cld3/libcld3.so +0 -0
  25. data/ext/cld3/nnet_language_identifier.cc +3 -5
  26. data/ext/cld3/nnet_language_identifier.o +0 -0
  27. data/ext/cld3/nnet_language_identifier_c.o +0 -0
  28. data/ext/cld3/offsetmap.o +0 -0
  29. data/ext/cld3/registry.o +0 -0
  30. data/ext/cld3/relevant_script_feature.o +0 -0
  31. data/ext/cld3/script_span/fixunicodevalue.h +69 -0
  32. data/ext/cld3/script_span/generated_ulscript.h +142 -0
  33. data/ext/cld3/script_span/getonescriptspan.h +124 -0
  34. data/ext/cld3/script_span/integral_types.h +37 -0
  35. data/ext/cld3/script_span/offsetmap.h +168 -0
  36. data/ext/cld3/script_span/port.h +143 -0
  37. data/ext/cld3/script_span/stringpiece.h +81 -0
  38. data/ext/cld3/script_span/text_processing.h +30 -0
  39. data/ext/cld3/script_span/utf8acceptinterchange.h +486 -0
  40. data/ext/cld3/script_span/utf8prop_lettermarkscriptnum.h +1631 -0
  41. data/ext/cld3/script_span/utf8repl_lettermarklower.h +758 -0
  42. data/ext/cld3/script_span/utf8scannot_lettermarkspecial.h +1455 -0
  43. data/ext/cld3/script_span/utf8statetable.h +285 -0
  44. data/ext/cld3/sentence_features.cc +4 -4
  45. data/ext/cld3/sentence_features.h +13 -3
  46. data/ext/cld3/sentence_features.o +0 -0
  47. data/ext/cld3/task_context.o +0 -0
  48. data/ext/cld3/task_context_params.o +0 -0
  49. data/ext/cld3/text_processing.o +0 -0
  50. data/ext/cld3/unicodetext.o +0 -0
  51. data/ext/cld3/utf8statetable.o +0 -0
  52. data/ext/cld3/utils.o +0 -0
  53. data/ext/cld3/workspace.o +0 -0
  54. data/lib/cld3/unstable.rb +58 -0
  55. data/lib/cld3.rb +15 -43
  56. data/sig/cld3.rbs +65 -0
  57. metadata +66 -15
  58. data/ext/cld3/feature_extractor.pb.o +0 -0
  59. data/ext/cld3/feature_extractor.proto +0 -50
  60. data/ext/cld3/mkmf.log +0 -37
  61. data/ext/cld3/sentence.pb.o +0 -0
  62. data/ext/cld3/sentence.proto +0 -77
  63. data/ext/cld3/task_spec.pb.o +0 -0
  64. data/ext/cld3/task_spec.proto +0 -98
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: f5b3cc203abda97cb85d5dee0983b7f63c626397b8af8b90e2110bb5fedbbdec
4
- data.tar.gz: 197f66798925404ded7af722d0194a705018d6953b11f4576c4e180ea093675d
3
+ metadata.gz: e12ff58a2cc0242896e307f33cb1adb2e210a1889244861e2c751b0399cb6415
4
+ data.tar.gz: 3b0c348b00126c8ee825a76eed0919e074a7f97e7dc7e647614f7a754c69d716
5
5
  SHA512:
6
- metadata.gz: 855e8ee464a2842906bfef211e2afb21820fe9a7449b58d91b9ab1908c997966b9dd4c2d5d51f82ceb84b65b5a118736a5aa4eff6ea9548b9a9abc61b297a9d0
7
- data.tar.gz: e38ddfd81489aeb83bccc7b509dd17ea79c56ba641de37cac2d800d3428ed31e5ac57066016bd118e9e71c30c78d31b4c38a266abe012065495558adf07e68f5
6
+ metadata.gz: 690c6e6f0cc8f0cc369b05a611daeed74adc273cd9c02f780233921ed0c220a425552e7020c127ff0503e88d473f57088cafd3b0d2143f3f86527d886d3eaf63
7
+ data.tar.gz: 6c3b7c105fa799fb918a077d9d1f47c948ec29bad86e57f452ea9eae82fc4f5bfc2686654a2a68ebc7102dfcb34e2a1ca55648d75a1f6bcf97b532de482de76e
data/README.md CHANGED
@@ -19,7 +19,6 @@ cld3.find_language("здравствуйте") # => #<struct Struct::Result lang
19
19
  ### Prerequisites
20
20
  * [Bundler](http://bundler.io/)
21
21
  * C++ compiler
22
- * [Protocol buffers](https://developers.google.com/protocol-buffers/)
23
22
  * [Rake](https://ruby.github.io/rake/)
24
23
  * [RubyGems](https://rubygems.org/)
25
24
 
@@ -41,36 +40,16 @@ JRuby has a bug which prevents the feature detection. Apply the following
41
40
  change:
42
41
  https://github.com/jruby/jruby/pull/4118/commits/edad375ef4dcf195b19ce0afe4befac66468c736
43
42
 
44
- #### OpenBSD
45
- Ruby has a bug which recognizes non-fatal linker warnings as fatal. Apply the
46
- following patch to Ruby to workaround the bug.
47
-
48
- ```diff
49
- --- a/lib/mkmf.rb
50
- +++ b/lib/mkmf.rb
51
- @@ -657,7 +657,7 @@ def with_ldflags(flags)
52
- end
53
-
54
- def try_ldflags(flags, opts = {})
55
- - try_link(MAIN_DOES_NOTHING, flags, {:werror => true}.update(opts))
56
- + try_link(MAIN_DOES_NOTHING, flags, {:werror => false}.update(opts))
57
- end
58
-
59
- def append_ldflags(flags, *opts)
60
- ```
61
-
62
43
  ### Troubleshooting
63
- `gem install cld3` triggers native library building. If it fails, you are likely
64
- to missing required facilities. Make sure C++ compiler and protocol buffers
65
- is installed. I recommend [GCC](https://gcc.gnu.org/) as a C++ compiler. Ruby is
66
- likely to need [pkg-config](https://www.freedesktop.org/wiki/Software/pkg-config/)
67
- as well.
44
+ `gem install cld3` triggers native library building. If it fails, it is likely
45
+ that some required facilities are missing. Make sure C++ compiler is installed.
46
+ I recommend [GCC](https://gcc.gnu.org/) as a C++ compiler.
68
47
 
69
48
  Runtime errors are likely to be issues of [FFI](https://github.com/ffi/ffi) or
70
49
  programming errors. Make sure they are all correct.
71
50
 
72
51
  If you cannot identify the cause of your problem, run spec of this library and
73
- see whether the problem is reproducable with it or not. Spec is not included in
52
+ see whether the problem is reproducible with it or not. Spec is not included in
74
53
  the gem, so clone the source code repository and then run `rake spec`.
75
54
  The source code repository is at
76
55
  https://github.com/akihikodaki/cld3-ruby.
data/cld3.gemspec CHANGED
@@ -16,19 +16,21 @@
16
16
 
17
17
  Gem::Specification.new do |gem|
18
18
  gem.name = "cld3"
19
- gem.version = "3.4.2"
19
+ gem.version = "3.5.0"
20
20
  gem.summary = "Compact Language Detector v3 (CLD3)"
21
21
  gem.description = "Compact Language Detector v3 (CLD3) is a neural network model for language identification."
22
22
  gem.license = "Apache-2.0"
23
23
  gem.homepage = "https://github.com/akihikodaki/cld3-ruby"
24
24
  gem.author = "Akihiko Odaki"
25
25
  gem.email = "akihiko.odaki@gmail.com"
26
- gem.required_ruby_version = [ ">= 2.6.0", "< 3.1.0" ]
26
+ gem.required_ruby_version = [ ">= 2.7.0", "< 3.3.0" ]
27
27
  gem.add_dependency "ffi", [ ">= 1.1.0", "< 1.16.0" ]
28
- gem.add_development_dependency "rspec", [ ">=3.0.0", "< 3.11.0" ]
28
+ gem.add_development_dependency "rbs", [ ">= 2.6.0", "< 2.7.0" ]
29
+ gem.add_development_dependency "rspec", [ ">= 3.11.0", "< 3.12.0" ]
30
+ gem.add_development_dependency "steep", [ ">= 1.0.0", "< 1.1.0" ]
29
31
  gem.files = Dir[
30
32
  "Gemfile", "LICENSE", "LICENSE_CLD3", "README.md",
31
- "cld3.gemspec", "ext/**/*", "lib/**/*"
33
+ "cld3.gemspec", "ext/**/*", "lib/**/*", "sig/**/*"
32
34
  ]
33
35
  gem.require_paths = [ "lib" ]
34
36
  gem.extensions = [ "ext/cld3/extconf.rb" ]
data/ext/cld3/Makefile CHANGED
@@ -3,6 +3,7 @@ SHELL = /bin/sh
3
3
 
4
4
  # V=0 quiet, V=1 verbose. other values don't work.
5
5
  V = 1
6
+ V0 = $(V:0=)
6
7
  Q1 = $(V:1=)
7
8
  Q = $(Q1:0=@)
8
9
  ECHO1 = $(V:1=@ :)
@@ -52,7 +53,8 @@ htmldir = $(docdir)
52
53
  infodir = $(DESTDIR)/usr/share/info
53
54
  docdir = $(datarootdir)/doc/$(PACKAGE)
54
55
  oldincludedir = $(DESTDIR)/usr/include
55
- includedir = $(DESTDIR)/usr/include
56
+ includedir = $(exec_prefix)/include
57
+ runstatedir = $(localstatedir)/run
56
58
  localstatedir = $(DESTDIR)/var
57
59
  sharedstatedir = $(DESTDIR)/var/lib
58
60
  sysconfdir = $(DESTDIR)/etc
@@ -79,23 +81,23 @@ CSRCFLAG = $(empty)
79
81
  RUBY_EXTCONF_H =
80
82
  cflags = $(optflags) $(debugflags) $(warnflags)
81
83
  cxxflags =
82
- optflags = -O3
84
+ optflags = -O3 -fno-fast-math
83
85
  debugflags = -ggdb3
84
- warnflags = -Wall -Wextra -Wdeprecated-declarations -Wduplicated-cond -Wimplicit-function-declaration -Wimplicit-int -Wmisleading-indentation -Wpointer-arith -Wwrite-strings -Wimplicit-fallthrough=0 -Wmissing-noreturn -Wno-cast-function-type -Wno-constant-logical-operand -Wno-long-long -Wno-missing-field-initializers -Wno-overlength-strings -Wno-packed-bitfield-compat -Wno-parentheses-equality -Wno-self-assign -Wno-tautological-compare -Wno-unused-parameter -Wno-unused-value -Wsuggest-attribute=format -Wsuggest-attribute=noreturn -Wunused-variable
86
+ warnflags = -Wall -Wextra -Wdeprecated-declarations -Wduplicated-cond -Wimplicit-function-declaration -Wimplicit-int -Wmisleading-indentation -Wpointer-arith -Wwrite-strings -Wold-style-definition -Wimplicit-fallthrough=0 -Wmissing-noreturn -Wno-cast-function-type -Wno-constant-logical-operand -Wno-long-long -Wno-missing-field-initializers -Wno-overlength-strings -Wno-packed-bitfield-compat -Wno-parentheses-equality -Wno-self-assign -Wno-tautological-compare -Wno-unused-parameter -Wno-unused-value -Wsuggest-attribute=format -Wsuggest-attribute=noreturn -Wunused-variable -Wundef
85
87
  cppflags =
86
88
  CCDLFLAGS = -fPIC
87
- CFLAGS = $(CCDLFLAGS) -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/redhat/redhat-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/redhat/redhat-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -fPIC $(ARCH_FLAG)
88
- INCFLAGS = -I. -I$(arch_hdrdir) -I$(hdrdir)/ruby/backward -I$(hdrdir) -I$(srcdir)
89
+ CFLAGS = $(CCDLFLAGS) -O2 -flto=auto -ffat-lto-objects -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/redhat/redhat-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/redhat/redhat-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -fPIC $(ARCH_FLAG)
90
+ INCFLAGS = -I. -I$(arch_hdrdir) -I$(hdrdir)/ruby/backward -I$(hdrdir) -I$(srcdir)
89
91
  DEFS =
90
92
  CPPFLAGS = $(DEFS) $(cppflags)
91
- CXXFLAGS = $(CCDLFLAGS) -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/redhat/redhat-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/redhat/redhat-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -fvisibility=hidden -std=c++11 $(ARCH_FLAG)
92
- ldflags = -L. -Wl,-z,relro -Wl,--as-needed -Wl,-z,now -specs=/usr/lib/rpm/redhat/redhat-hardened-ld -fstack-protector-strong -rdynamic -Wl,-export-dynamic
93
- dldflags = -Wl,-z,relro -Wl,--as-needed -Wl,-z,now -specs=/usr/lib/rpm/redhat/redhat-hardened-ld
93
+ CXXFLAGS = $(CCDLFLAGS) -O2 -flto=auto -ffat-lto-objects -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/redhat/redhat-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/redhat/redhat-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -fvisibility=hidden -std=c++17 $(ARCH_FLAG)
94
+ ldflags = -L. -Wl,-z,relro -Wl,--as-needed -Wl,-z,now -specs=/usr/lib/rpm/redhat/redhat-hardened-ld -specs=/usr/lib/rpm/redhat/redhat-annobin-cc1 -Wl,--build-id=sha1 -fstack-protector-strong -rdynamic -Wl,-export-dynamic -Wl,--no-as-needed
95
+ dldflags = -Wl,-z,relro -Wl,--as-needed -Wl,-z,now -specs=/usr/lib/rpm/redhat/redhat-hardened-ld -specs=/usr/lib/rpm/redhat/redhat-annobin-cc1 -Wl,--build-id=sha1
94
96
  ARCH_FLAG =
95
97
  DLDFLAGS = $(ldflags) $(dldflags) $(ARCH_FLAG)
96
98
  LDSHARED = $(CC) -shared
97
99
  LDSHAREDXX = $(CXX) -shared
98
- AR = ar
100
+ AR = gcc-ar
99
101
  EXEEXT =
100
102
 
101
103
  RUBY_INSTALL_NAME = $(RUBY_BASE_NAME)
@@ -107,13 +109,13 @@ RUBY_BASE_NAME = ruby
107
109
 
108
110
  arch = aarch64-linux
109
111
  sitearch = $(arch)
110
- ruby_version = 2.7.0
112
+ ruby_version = 3.1.0
111
113
  ruby = $(bindir)/$(RUBY_BASE_NAME)
112
114
  RUBY = $(ruby)
113
115
  ruby_headers = $(hdrdir)/ruby.h $(hdrdir)/ruby/backward.h $(hdrdir)/ruby/ruby.h $(hdrdir)/ruby/defines.h $(hdrdir)/ruby/missing.h $(hdrdir)/ruby/intern.h $(hdrdir)/ruby/st.h $(hdrdir)/ruby/subst.h $(arch_hdrdir)/ruby/config.h
114
116
 
115
117
  RM = rm -f
116
- RM_RF = $(RUBY) -run -e rm -- -rf
118
+ RM_RF = rm -fr
117
119
  RMDIRS = rmdir --ignore-fail-on-non-empty -p
118
120
  MAKEDIRS = /usr/bin/mkdir -p
119
121
  INSTALL = /usr/bin/install -c
@@ -137,11 +139,11 @@ extout =
137
139
  extout_prefix =
138
140
  target_prefix =
139
141
  LOCAL_LIBS =
140
- LIBS = -lprotobuf -lpthread -lm -lc
141
- ORIG_SRCS = base.cc embedding_feature_extractor.cc embedding_network.cc feature_extractor.cc feature_extractor.pb.cc feature_types.cc fixunicodevalue.cc fml_parser.cc generated_entities.cc generated_ulscript.cc getonescriptspan.cc lang_id_nn_params.cc language_identifier_features.cc nnet_language_identifier.cc nnet_language_identifier_c.cc offsetmap.cc registry.cc relevant_script_feature.cc sentence.pb.cc sentence_features.cc task_context.cc task_context_params.cc task_spec.pb.cc text_processing.cc unicodetext.cc utf8statetable.cc utils.cc workspace.cc
142
+ LIBS = -lm -lc
143
+ ORIG_SRCS = base.cc embedding_feature_extractor.cc embedding_network.cc feature_extractor.cc feature_types.cc fixunicodevalue.cc fml_parser.cc generated_entities.cc generated_ulscript.cc getonescriptspan.cc lang_id_nn_params.cc language_identifier_features.cc nnet_language_identifier.cc nnet_language_identifier_c.cc offsetmap.cc registry.cc relevant_script_feature.cc sentence_features.cc task_context.cc task_context_params.cc text_processing.cc unicodetext.cc utf8statetable.cc utils.cc workspace.cc
142
144
  SRCS = $(ORIG_SRCS)
143
- OBJS = base.o embedding_feature_extractor.o embedding_network.o feature_extractor.o feature_extractor.pb.o feature_types.o fixunicodevalue.o fml_parser.o generated_entities.o generated_ulscript.o getonescriptspan.o lang_id_nn_params.o language_identifier_features.o nnet_language_identifier.o nnet_language_identifier_c.o offsetmap.o registry.o relevant_script_feature.o sentence.pb.o sentence_features.o task_context.o task_context_params.o task_spec.pb.o text_processing.o unicodetext.o utf8statetable.o utils.o workspace.o
144
- HDRS = $(srcdir)/base.h $(srcdir)/casts.h $(srcdir)/embedding_feature_extractor.h $(srcdir)/embedding_network.h $(srcdir)/embedding_network_params.h $(srcdir)/feature_extractor.h $(srcdir)/feature_types.h $(srcdir)/float16.h $(srcdir)/fml_parser.h $(srcdir)/language_identifier_features.h $(srcdir)/lang_id_nn_params.h $(srcdir)/nnet_language_identifier.h $(srcdir)/registry.h $(srcdir)/relevant_script_feature.h $(srcdir)/script_detector.h $(srcdir)/sentence_features.h $(srcdir)/simple_adder.h $(srcdir)/fixunicodevalue.h $(srcdir)/generated_ulscript.h $(srcdir)/getonescriptspan.h $(srcdir)/integral_types.h $(srcdir)/offsetmap.h $(srcdir)/port.h $(srcdir)/stringpiece.h $(srcdir)/text_processing.h $(srcdir)/utf8acceptinterchange.h $(srcdir)/utf8prop_lettermarkscriptnum.h $(srcdir)/utf8repl_lettermarklower.h $(srcdir)/utf8scannot_lettermarkspecial.h $(srcdir)/utf8statetable.h $(srcdir)/task_context.h $(srcdir)/task_context_params.h $(srcdir)/unicodetext.h $(srcdir)/utils.h $(srcdir)/workspace.h $(srcdir)/feature_extractor.pb.h $(srcdir)/sentence.pb.h $(srcdir)/task_spec.pb.h
145
+ OBJS = base.o embedding_feature_extractor.o embedding_network.o feature_extractor.o feature_types.o fixunicodevalue.o fml_parser.o generated_entities.o generated_ulscript.o getonescriptspan.o lang_id_nn_params.o language_identifier_features.o nnet_language_identifier.o nnet_language_identifier_c.o offsetmap.o registry.o relevant_script_feature.o sentence_features.o task_context.o task_context_params.o text_processing.o unicodetext.o utf8statetable.o utils.o workspace.o
146
+ HDRS = $(srcdir)/base.h $(srcdir)/casts.h $(srcdir)/embedding_feature_extractor.h $(srcdir)/embedding_network.h $(srcdir)/embedding_network_params.h $(srcdir)/feature_extractor.h $(srcdir)/feature_types.h $(srcdir)/fixunicodevalue.h $(srcdir)/float16.h $(srcdir)/fml_parser.h $(srcdir)/generated_ulscript.h $(srcdir)/getonescriptspan.h $(srcdir)/integral_types.h $(srcdir)/lang_id_nn_params.h $(srcdir)/language_identifier_features.h $(srcdir)/nnet_language_identifier.h $(srcdir)/offsetmap.h $(srcdir)/port.h $(srcdir)/registry.h $(srcdir)/relevant_script_feature.h $(srcdir)/script_detector.h $(srcdir)/sentence_features.h $(srcdir)/simple_adder.h $(srcdir)/stringpiece.h $(srcdir)/task_context.h $(srcdir)/task_context_params.h $(srcdir)/text_processing.h $(srcdir)/unicodetext.h $(srcdir)/utf8acceptinterchange.h $(srcdir)/utf8prop_lettermarkscriptnum.h $(srcdir)/utf8repl_lettermarklower.h $(srcdir)/utf8scannot_lettermarkspecial.h $(srcdir)/utf8statetable.h $(srcdir)/utils.h $(srcdir)/workspace.h
145
147
  LOCAL_HDRS =
146
148
  TARGET = libcld3
147
149
  TARGET_NAME = libcld3
@@ -155,11 +157,11 @@ BINDIR = $(bindir)
155
157
  RUBYCOMMONDIR = $(sitedir)$(target_prefix)
156
158
  RUBYLIBDIR = $(sitelibdir)$(target_prefix)
157
159
  RUBYARCHDIR = $(sitearchdir)$(target_prefix)
158
- HDRDIR = $(rubyhdrdir)/ruby$(target_prefix)
159
- ARCHHDRDIR = $(rubyhdrdir)/$(arch)/ruby$(target_prefix)
160
+ HDRDIR = $(sitehdrdir)$(target_prefix)
161
+ ARCHHDRDIR = $(sitearchhdrdir)$(target_prefix)
160
162
  TARGET_SO_DIR =
161
163
  TARGET_SO = $(TARGET_SO_DIR)$(DLLIB)
162
- CLEANLIBS = $(TARGET_SO)
164
+ CLEANLIBS = $(TARGET_SO) false
163
165
  CLEANOBJS = *.o *.bak
164
166
 
165
167
  all: $(DLLIB)
@@ -172,7 +174,7 @@ clean-rb-default::
172
174
  clean-rb::
173
175
  clean-so::
174
176
  clean: clean-so clean-static clean-rb-default clean-rb
175
- -$(Q)$(RM) $(CLEANLIBS) $(CLEANOBJS) $(CLEANFILES) .*.time
177
+ -$(Q)$(RM_RF) $(CLEANLIBS) $(CLEANOBJS) $(CLEANFILES) .*.time
176
178
 
177
179
  distclean-rb-default::
178
180
  distclean-rb::
data/ext/cld3/base.o CHANGED
Binary file
data/ext/cld3/cld_3/protos/feature_extractor.pb.h ADDED
@@ -0,0 +1,100 @@
1
+ /* Copyright 2022 Akihiko Odaki <akihiko.odaki@gmail.com>
2
+ All Rights Reserved.
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ http://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ ==============================================================================*/
16
+
17
+ #ifndef FEATURE_EXTRACTOR_PB_H_
18
+ #define FEATURE_EXTRACTOR_PB_H_
19
+
20
+ #include <cstdint>
21
+ #include <string>
22
+ #include <vector>
23
+
24
+ namespace chrome_lang_id {
25
+
26
+ class Parameter {
27
+ public:
28
+ const std::string& name() const { return name_; }
29
+ void set_name(std::string value) { name_ = std::move(value); }
30
+ const std::string& value() const { return value_; }
31
+ void set_value(std::string value) { value_ = std::move(value); }
32
+
33
+ private:
34
+ std::string name_;
35
+ std::string value_;
36
+ };
37
+
38
+ class FeatureFunctionDescriptor {
39
+ public:
40
+ const std::string& type() const { return type_; }
41
+
42
+ void set_type(std::string value) { type_ = std::move(value); }
43
+
44
+ const std::string& name() const { return name_; }
45
+
46
+ void set_name(std::string value) { name_ = std::move(value); }
47
+
48
+ bool has_argument() const { return true; }
49
+
50
+ std::int32_t argument() const { return argument_; }
51
+
52
+ void set_argument(int32_t value) { argument_ = value; }
53
+
54
+ int parameter_size() const { return parameter_.size(); }
55
+
56
+ const Parameter& parameter(int index) const { return parameter_[index]; }
57
+
58
+ Parameter* add_parameter() { return &parameter_.emplace_back(); }
59
+
60
+ int feature_size() const { return feature_.size(); }
61
+
62
+ FeatureFunctionDescriptor* mutable_feature(int index) {
63
+ return &feature_[index];
64
+ }
65
+
66
+ const FeatureFunctionDescriptor& feature(int index) const {
67
+ return feature_[index];
68
+ }
69
+
70
+ FeatureFunctionDescriptor* add_feature() { return &feature_.emplace_back(); }
71
+
72
+ private:
73
+ std::string type_;
74
+ std::string name_;
75
+ std::int32_t argument_;
76
+ std::vector<Parameter> parameter_;
77
+ std::vector<FeatureFunctionDescriptor> feature_;
78
+ };
79
+
80
+ class FeatureExtractorDescriptor {
81
+ public:
82
+ int feature_size() const { return feature_.size(); }
83
+
84
+ FeatureFunctionDescriptor* mutable_feature(int index) {
85
+ return &feature_[index];
86
+ }
87
+
88
+ const FeatureFunctionDescriptor& feature(int index) const {
89
+ return feature_[index];
90
+ }
91
+
92
+ FeatureFunctionDescriptor* add_feature() { return &feature_.emplace_back(); }
93
+
94
+ private:
95
+ std::vector<FeatureFunctionDescriptor> feature_;
96
+ };
97
+
98
+ }
99
+
100
+ #endif
data/ext/cld3/cld_3/protos/sentence.pb.h ADDED
@@ -0,0 +1,35 @@
1
+ /* Copyright 2022 Akihiko Odaki <akihiko.odaki@gmail.com>
2
+ All Rights Reserved.
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ http://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ ==============================================================================*/
16
+
17
+ #ifndef SENTENCE_PB_H_
18
+ #define SENTENCE_PB_H_
19
+
20
+ #include <string>
21
+
22
+ namespace chrome_lang_id {
23
+
24
+ class Sentence {
25
+ public:
26
+ const std::string& text() const { return text_; }
27
+ void set_text(std::string value) { text_ = std::move(value); }
28
+
29
+ private:
30
+ std::string text_;
31
+ };
32
+
33
+ }
34
+
35
+ #endif
data/ext/cld3/cld_3/protos/task_spec.pb.h ADDED
@@ -0,0 +1,106 @@
1
+ /* Copyright 2022 Akihiko Odaki <akihiko.odaki@gmail.com>
2
+ All Rights Reserved.
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ http://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ ==============================================================================*/
16
+
17
+ #ifndef TASK_SPEC_PB_H_
18
+ #define TASK_SPEC_PB_H_
19
+
20
+ #include <string>
21
+ #include <vector>
22
+
23
+ namespace chrome_lang_id {
24
+
25
+ class TaskInput {
26
+ public:
27
+ class Part {
28
+ public:
29
+ const std::string& file_pattern() const { return file_pattern_; }
30
+
31
+ private:
32
+ std::string file_pattern_;
33
+ };
34
+
35
+ const std::string& name() const { return name_; }
36
+
37
+ void set_name(std::string value) { name_ = value; }
38
+
39
+ int file_format_size() const { return file_format_.size(); }
40
+
41
+ const std::string& file_format(int index) const {
42
+ return file_format_[index];
43
+ }
44
+
45
+ void add_file_format(std::string value) {
46
+ file_format_.push_back(std::move(value));
47
+ }
48
+
49
+ int record_format_size() const { return record_format_.size(); }
50
+
51
+ const std::string& record_format(int index) const {
52
+ return record_format_[index];
53
+ }
54
+
55
+ void add_record_format(std::string value) {
56
+ record_format_.push_back(std::move(value));
57
+ }
58
+
59
+ int part_size() const { return part_.size(); }
60
+ const Part& part(int index) const { return part_[index]; }
61
+
62
+ private:
63
+ std::string name_;
64
+ std::vector<std::string> file_format_;
65
+ std::vector<std::string> record_format_;
66
+ std::vector<Part> part_;
67
+ };
68
+
69
+ class TaskSpec {
70
+ public:
71
+ class Parameter {
72
+ public:
73
+ const std::string& name() const { return name_; }
74
+ void set_name(std::string value) { name_ = std::move(value); }
75
+ const std::string& value() const { return value_; }
76
+ void set_value(std::string value) { value_ = std::move(value); }
77
+
78
+ private:
79
+ std::string name_;
80
+ std::string value_;
81
+ };
82
+
83
+ int parameter_size() const { return parameter_.size(); }
84
+
85
+ Parameter* mutable_parameter(int index) { return &parameter_[index]; }
86
+
87
+ const Parameter& parameter(int index) const { return parameter_[index]; }
88
+
89
+ Parameter* add_parameter() { return &parameter_.emplace_back(); }
90
+
91
+ int input_size() const { return input_.size(); }
92
+
93
+ TaskInput* mutable_input(int index) { return &input_[index]; }
94
+
95
+ const TaskInput& input(int index) const { return input_[index]; }
96
+
97
+ TaskInput* add_input() { return &input_.emplace_back(); }
98
+
99
+ private:
100
+ std::vector<Parameter> parameter_;
101
+ std::vector<TaskInput> input_;
102
+ };
103
+
104
+ }
105
+
106
+ #endif
data/ext/cld3/embedding_feature_extractor.o CHANGED
Binary file
data/ext/cld3/embedding_network.cc CHANGED
@@ -167,6 +167,7 @@ EmbeddingNetwork::EmbeddingNetwork(const EmbeddingNetworkParams *model)
167
167
  for (int i = 0; i < model_->embedding_dim_size(); ++i) {
168
168
  CLD3_DCHECK(offset_sum == model_->concat_offset(i));
169
169
  offset_sum += model_->embedding_dim(i) * model_->embedding_num_features(i);
170
+ (void)offset_sum; // Avoid compiler warning for "unused" variable.
170
171
  embedding_matrices_.emplace_back(model_->GetEmbeddingMatrix(i));
171
172
  }
172
173
 
data/ext/cld3/embedding_network.o CHANGED
Binary file
data/ext/cld3/extconf.rb CHANGED
@@ -26,17 +26,8 @@ rescue
26
26
  end
27
27
  end
28
28
 
29
- # Check pkg-config first to inform the library is missing if so.
30
- pkg_config("protobuf") or abort "Failed to locate protobuf"
31
-
32
- FileUtils.mkdir_p("cld_3/protos")
33
29
  FileUtils.mkdir_p("script_span")
34
30
 
35
- [ "feature_extractor", "sentence", "task_spec" ].each {|name|
36
- system "protoc", "#{name}.proto", "--cpp_out=.", exception: true
37
- ln_fallback("#{name}.pb.h", "cld_3/protos/#{name}.pb.h")
38
- }
39
-
40
31
  [
41
32
  "fixunicodevalue.h",
42
33
  "generated_ulscript.h",
@@ -55,6 +46,6 @@ FileUtils.mkdir_p("script_span")
55
46
  ln_fallback("#{name}", "script_span/#{name}")
56
47
  }
57
48
 
58
- $CXXFLAGS += " -fvisibility=hidden -std=c++11"
49
+ $CXXFLAGS += " -fvisibility=hidden -std=c++17"
59
50
  $LIBRUBYARG = ""
60
51
  create_makefile("libcld3")
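data/ext/cld3/feature_extractor.o CHANGED
Binary file
data/ext/cld3/feature_types.o CHANGED
Binary file
data/ext/cld3/fixunicodevalue.o CHANGED
Binary file
data/ext/cld3/fml_parser.o CHANGED
Binary file
data/ext/cld3/generated_entities.o CHANGED
Binary file
data/ext/cld3/generated_ulscript.o CHANGED
Binary file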
data/ext/cld3/getonescriptspan.cc CHANGED
@@ -878,7 +878,6 @@ bool ScriptScanner::GetOneScriptSpan(LangSpan* span) {
878
878
  // copying letters to buffer with single spaces for each run of non-letters
879
879
  while (take < byte_length_) {
880
880
  // Copy run of letters in same script (&LS | LS)*
881
- int letter_count = 0; // Keep track of word length
882
881
  bool need_break = false;
883
882
 
884
883
  while (take < byte_length_) {
@@ -963,7 +962,6 @@ bool ScriptScanner::GetOneScriptSpan(LangSpan* span) {
963
962
  map2original_.Delete(tlen - plen);
964
963
  }
965
964
 
966
- ++letter_count;
967
965
  if (put >= kMaxScriptBytes) {
968
966
  // Buffer is full
969
967
  span->truncated = true;
data/ext/cld3/getonescriptspan.h CHANGED
@@ -33,14 +33,14 @@ static const int kMaxScriptBytes = kMaxScriptBuffer - 32; // Leave some room
33
33
  static const int kWithinScriptTail = 32; // Stop at word space in last
34
34
  // N bytes of script buffer
35
35
 
36
- typedef struct {
36
+ struct LangSpan {
37
37
  char* text = nullptr; // Pointer to the span, somewhere
38
38
  int text_bytes = 0; // Number of bytes of text in the span
39
39
  int offset = 0; // Offset of start of span in original input buffer
40
40
  ULScript ulscript = UNKNOWN_ULSCRIPT; // Unicode Letters Script of this span
41
41
  bool truncated = false; // true if buffer filled up before a
42
42
  // different script or EOF was found
43
- } LangSpan;
43
+ };
44
44
 
45
45
  static inline bool IsContinuationByte(char c) {
46
46
  return static_cast<signed char>(c) < -64;
Binary file
Binary file
data/ext/cld3/libcld3.so CHANGED
Binary file
data/ext/cld3/nnet_language_identifier.cc CHANGED
@@ -284,8 +284,6 @@ NNetLanguageIdentifier::FindTopNMostFreqLangs(const string &text,
284
284
  CLD2::LangSpan script_span;
285
285
  std::unordered_map<string, LangChunksStats> lang_stats;
286
286
  int total_num_bytes = 0;
287
- Result result;
288
- string language;
289
287
  int chunk_size = 0; // Use the default.
290
288
  while (ss.GetOneScriptSpanLower(&script_span)) {
291
289
  const int num_original_span_bytes = script_span.text_bytes;
@@ -302,8 +300,8 @@ NNetLanguageIdentifier::FindTopNMostFreqLangs(const string &text,
302
300
 
303
301
  const string selected_text = SelectTextGivenScriptSpan(script_span);
304
302
 
305
- result = FindLanguageOfValidUTF8(selected_text);
306
- language = result.language;
303
+ Result result = FindLanguageOfValidUTF8(selected_text);
304
+ string language = result.language;
307
305
  lang_stats[language].byte_sum += num_original_span_bytes;
308
306
  lang_stats[language].prob_sum +=
309
307
  result.probability * num_original_span_bytes;
@@ -356,7 +354,7 @@ string NNetLanguageIdentifier::SelectTextGivenBeginAndSize(
356
354
  const char *text_begin, int text_size) {
357
355
  string output_text;
358
356
 
359
- // If the size of the input is greater than the maxium number of bytes needed
357
+ // If the size of the input is greater than the maximum number of bytes needed
360
358
  // for a prediction, then concatenate snippets that are equally spread out
361
359
  // throughout the input.
362
360
  if (text_size > max_num_bytes_) {
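data/ext/cld3/nnet_language_identifier.o CHANGED
Binary file
data/ext/cld3/nnet_language_identifier_c.o CHANGED
Binary file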
data/ext/cld3/offsetmap.o CHANGED
Binary file
data/ext/cld3/registry.o CHANGED
Binary file
data/ext/cld3/relevant_script_feature.o CHANGED
Binary file
data/ext/cld3/script_span/fixunicodevalue.h ADDED
@@ -0,0 +1,69 @@
1
+ // Copyright 2013 Google Inc. All Rights Reserved.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+
15
+ //
16
+ // Routine that maps a Unicode code point to an interchange-valid one
17
+ //
18
+ // Table that maps MS CP1252 bytes 00-FF to their corresponding Unicode
19
+ // code points. C0 and C1 control codes that are not interchange-valid
20
+ // are mapped to spaces.
21
+
22
+
23
+ #ifndef SCRIPT_SPAN_FIXUNICODEVALUE_H_
24
+ #define SCRIPT_SPAN_FIXUNICODEVALUE_H_
25
+
26
+ #include "integral_types.h" // for char32
27
+ #include "port.h"
28
+
29
+ namespace chrome_lang_id {
30
+ namespace CLD2 {
31
+
32
+ // Map byte value 0000-00FF to char32
33
+ // Maps C0 control codes (other than CR LF HT FF) to space [29 instances including DEL=0x7F]
34
+ // Maps C1 control codes to CP1252 [27 instances] or space [5 instances]
35
+ static const char32 kMapFullMicrosoft1252OrSpace[256] = {
36
+ 0x20,0x20,0x20,0x20, 0x20,0x20,0x20,0x20, 0x20,0x09,0x0a,0x20, 0x0c,0x0d,0x20,0x20, // 00
37
+ 0x20,0x20,0x20,0x20, 0x20,0x20,0x20,0x20, 0x20,0x20,0x20,0x20, 0x20,0x20,0x20,0x20,
38
+ 0x20,0x21,0x22,0x23, 0x24,0x25,0x26,0x27, 0x28,0x29,0x2a,0x2b, 0x2c,0x2d,0x2e,0x2f,
39
+ 0x30,0x31,0x32,0x33, 0x34,0x35,0x36,0x37, 0x38,0x39,0x3a,0x3b, 0x3c,0x3d,0x3e,0x3f,
40
+
41
+ 0x40,0x41,0x42,0x43, 0x44,0x45,0x46,0x47, 0x48,0x49,0x4a,0x4b, 0x4c,0x4d,0x4e,0x4f, // 40
42
+ 0x50,0x51,0x52,0x53, 0x54,0x55,0x56,0x57, 0x58,0x59,0x5a,0x5b, 0x5c,0x5d,0x5e,0x5f,
43
+ 0x60,0x61,0x62,0x63, 0x64,0x65,0x66,0x67, 0x68,0x69,0x6a,0x6b, 0x6c,0x6d,0x6e,0x6f,
44
+ 0x70,0x71,0x72,0x73, 0x74,0x75,0x76,0x77, 0x78,0x79,0x7a,0x7b, 0x7c,0x7d,0x7e,0x20,
45
+
46
+ 0x20ac,0x20,0x201a,0x0192, 0x201e,0x2026,0x2020,0x2021, // 80
47
+ 0x02c6,0x2030,0x0160,0x2039, 0x0152,0x20,0x017d,0x20,
48
+ 0x20,0x2018,0x2019,0x201c, 0x201d,0x2022,0x2013,0x2014,
49
+ 0x02dc,0x2122,0x0161,0x203a, 0x0153,0x20,0x017e,0x0178,
50
+ 0xa0,0xa1,0xa2,0xa3, 0xa4,0xa5,0xa6,0xa7, 0xa8,0xa9,0xaa,0xab, 0xac,0xad,0xae,0xaf, // A0
51
+ 0xb0,0xb1,0xb2,0xb3, 0xb4,0xb5,0xb6,0xb7, 0xb8,0xb9,0xba,0xbb, 0xbc,0xbd,0xbe,0xbf,
52
+
53
+ 0xc0,0xc1,0xc2,0xc3, 0xc4,0xc5,0xc6,0xc7, 0xc8,0xc9,0xca,0xcb, 0xcc,0xcd,0xce,0xcf, // C0
54
+ 0xd0,0xd1,0xd2,0xd3, 0xd4,0xd5,0xd6,0xd7, 0xd8,0xd9,0xda,0xdb, 0xdc,0xdd,0xde,0xdf,
55
+ 0xe0,0xe1,0xe2,0xe3, 0xe4,0xe5,0xe6,0xe7, 0xe8,0xe9,0xea,0xeb, 0xec,0xed,0xee,0xef,
56
+ 0xf0,0xf1,0xf2,0xf3, 0xf4,0xf5,0xf6,0xf7, 0xf8,0xf9,0xfa,0xfb, 0xfc,0xfd,0xfe,0xff,
57
+ };
58
+
59
+ // Guarantees that the resulting output value is interchange valid
60
+ // 00-FF; map to spaces or MS CP1252
61
+ // D800-DFFF; surrogates
62
+ // FDD0-FDEF; non-characters
63
+ // xxFFFE-xxFFFF; non-characters
64
+ char32 FixUnicodeValue(char32 uv);
65
+
66
+ } // End namespace CLD2
67
+ } // End namespace chrome_lang_id
68
+
69
+ #endif // SCRIPT_SPAN_FIXUNICODEVALUE_H_