cld3 3.3.0 → 3.4.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. checksums.yaml +4 -4
  2. data/Gemfile +2 -1
  3. data/LICENSE +2 -2
  4. data/README.md +0 -18
  5. data/cld3.gemspec +9 -7
  6. data/ext/cld3/Makefile +267 -0
  7. data/ext/cld3/base.o +0 -0
  8. data/ext/cld3/embedding_feature_extractor.o +0 -0
  9. data/ext/cld3/embedding_network.cc +1 -0
  10. data/ext/cld3/embedding_network.o +0 -0
  11. data/ext/cld3/extconf.rb +3 -2
  12. data/ext/cld3/feature_extractor.o +0 -0
  13. data/ext/cld3/feature_extractor.pb.o +0 -0
  14. data/ext/cld3/feature_types.o +0 -0
  15. data/ext/cld3/fixunicodevalue.o +0 -0
  16. data/ext/cld3/fml_parser.o +0 -0
  17. data/ext/cld3/generated_entities.o +0 -0
  18. data/ext/cld3/generated_ulscript.o +0 -0
  19. data/ext/cld3/getonescriptspan.h +2 -2
  20. data/ext/cld3/getonescriptspan.o +0 -0
  21. data/ext/cld3/lang_id_nn_params.o +0 -0
  22. data/ext/cld3/language_identifier_features.o +0 -0
  23. data/ext/cld3/libcld3.def +8 -0
  24. data/ext/cld3/libcld3.so +0 -0
  25. data/ext/cld3/mkmf.log +37 -0
  26. data/ext/cld3/nnet_language_identifier.cc +3 -5
  27. data/ext/cld3/nnet_language_identifier.o +0 -0
  28. data/ext/cld3/nnet_language_identifier_c.cc +1 -1
  29. data/ext/cld3/nnet_language_identifier_c.o +0 -0
  30. data/ext/cld3/offsetmap.o +0 -0
  31. data/ext/cld3/registry.o +0 -0
  32. data/ext/cld3/relevant_script_feature.o +0 -0
  33. data/ext/cld3/sentence.pb.o +0 -0
  34. data/ext/cld3/sentence_features.cc +4 -4
  35. data/ext/cld3/sentence_features.h +13 -3
  36. data/ext/cld3/sentence_features.o +0 -0
  37. data/ext/cld3/task_context.o +0 -0
  38. data/ext/cld3/task_context_params.o +0 -0
  39. data/ext/cld3/task_spec.pb.o +0 -0
  40. data/ext/cld3/text_processing.o +0 -0
  41. data/ext/cld3/unicodetext.o +0 -0
  42. data/ext/cld3/utf8statetable.o +0 -0
  43. data/ext/cld3/utils.o +0 -0
  44. data/ext/cld3/workspace.o +0 -0
  45. data/lib/a.rb +24 -0
  46. data/lib/cld3/unstable.rb +58 -0
  47. data/lib/cld3.rb +13 -44
  48. data/sig/cld3.rbs +65 -0
  49. metadata +88 -13
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 44d2292a62861aa8551a46b69ebf0d55f518bf07ab7b20605ec7db61cd58d6c4
4
- data.tar.gz: e706b2bc83c2f4915c95c0f9a9d60b8f4728626ca47f705cd3bd0dbc2c555c11
3
+ metadata.gz: f40e4947fea97543686caceba0082bdba30b5ae0485a25b41004ad048057b0ad
4
+ data.tar.gz: e45c60300550caf513fdde6bcbc05e68e1063bf9ad8074626bf5f88f4a6f77bd
5
5
  SHA512:
6
- metadata.gz: 1f70a575dbb1c18ceb0c9f79588e1cdd1d15a09fc2b0ea8cd0ea6dbc24837d2a2d9619bc555349552b64e9b3ec29b51e0dd062384a2f798ae27aa2ddd3803cb7
7
- data.tar.gz: 59952aaf853da6e5acc5a1043f1ccf1777ed708848d9be7ba51e64fc0f1667a4f9319faaa118e07c1fb1ef2eac80f8cc0112fe5d9224b0c3fa733bb33221498c
6
+ metadata.gz: 393fc138a279ee42c3de90c49bcc982e55860f74e2796d4c895d0f2f175894bcb1ec1bbe796811f896a16be9cc97943e1309cbe175bc029a510b4c51b2f700da
7
+ data.tar.gz: d16e8c87e7d12cc90cc1a4babb4873df8f553d9527e1d69a548a250ae0b240f79a6338070bbc88cbb0e23db48c23ef0393cd4b62e0ac673722ace81ce1564895
data/Gemfile CHANGED
@@ -1,4 +1,4 @@
1
- # Copyright 2017 Akihiko Odaki <akihiko.odaki.4i@stu.hosei.ac.jp>
1
+ # Copyright 2017 Akihiko Odaki <akihiko.odaki@gmail.com>
2
2
  # All Rights Reserved.
3
3
  #
4
4
  # Licensed under the Apache License, Version 2.0 (the "License");
@@ -15,4 +15,5 @@
15
15
  #==============================================================================
16
16
 
17
17
  source 'https://rubygems.org'
18
+ gem 'steep', github: 'akihikodaki/steep', branch: 'cld3'
18
19
  gemspec
data/LICENSE CHANGED
@@ -1,4 +1,4 @@
1
- Copyright 2017 Akihiko Odaki <akihiko.odaki.4i@stu.hosei.ac.jp>
1
+ Copyright 2017 Akihiko Odaki <akihiko.odaki@gmail.com>
2
2
  All rights reserved.
3
3
 
4
4
  Apache License
@@ -189,7 +189,7 @@ All rights reserved.
189
189
  same "printed page" as the copyright notice for easier
190
190
  identification within third-party archives.
191
191
 
192
- Copyright 2017, Akihiko Odaki <akihiko.odaki.4i@stu.hosei.ac.jp>
192
+ Copyright 2017, Akihiko Odaki <akihiko.odaki@gmail.com>
193
193
 
194
194
  Licensed under the Apache License, Version 2.0 (the "License");
195
195
  you may not use this file except in compliance with the License.
data/README.md CHANGED
@@ -41,24 +41,6 @@ JRuby has a bug which prevents the feature detection. Apply the following
41
41
  change:
42
42
  https://github.com/jruby/jruby/pull/4118/commits/edad375ef4dcf195b19ce0afe4befac66468c736
43
43
 
44
- #### OpenBSD
45
- Ruby has a bug which recognizes non-fatal linker warnings as fatal. Apply the
46
- following patch to Ruby to workaround the bug.
47
-
48
- ```diff
49
- --- a/lib/mkmf.rb
50
- +++ b/lib/mkmf.rb
51
- @@ -657,7 +657,7 @@ def with_ldflags(flags)
52
- end
53
-
54
- def try_ldflags(flags, opts = {})
55
- - try_link(MAIN_DOES_NOTHING, flags, {:werror => true}.update(opts))
56
- + try_link(MAIN_DOES_NOTHING, flags, {:werror => false}.update(opts))
57
- end
58
-
59
- def append_ldflags(flags, *opts)
60
- ```
61
-
62
44
  ### Troubleshooting
63
45
  `gem install cld3` triggers native library building. If it fails, you are likely
64
46
  to be missing required facilities. Make sure a C++ compiler and protocol buffers
data/cld3.gemspec CHANGED
@@ -1,4 +1,4 @@
1
- # Copyright 2017 Akihiko Odaki <akihiko.odaki.4i@stu.hosei.ac.jp>
1
+ # Copyright 2017 Akihiko Odaki <akihiko.odaki@gmail.com>
2
2
  # All Rights Reserved.
3
3
  #
4
4
  # Licensed under the Apache License, Version 2.0 (the "License");
@@ -16,19 +16,21 @@
16
16
 
17
17
  Gem::Specification.new do |gem|
18
18
  gem.name = "cld3"
19
- gem.version = "3.3.0"
19
+ gem.version = "3.4.4"
20
20
  gem.summary = "Compact Language Detector v3 (CLD3)"
21
21
  gem.description = "Compact Language Detector v3 (CLD3) is a neural network model for language identification."
22
22
  gem.license = "Apache-2.0"
23
23
  gem.homepage = "https://github.com/akihikodaki/cld3-ruby"
24
24
  gem.author = "Akihiko Odaki"
25
- gem.email = "akihiko.odaki.4i@stu.hosei.ac.jp"
26
- gem.required_ruby_version = [ ">= 2.3.0", "< 2.8.0" ]
27
- gem.add_dependency "ffi", [ ">= 1.1.0", "< 1.12.0" ]
28
- gem.add_development_dependency "rspec", [ ">=3.0.0", "< 3.10.0" ]
25
+ gem.email = "akihiko.odaki@gmail.com"
26
+ gem.required_ruby_version = [ ">= 2.6.0", "< 3.2.0" ]
27
+ gem.add_dependency "ffi", [ ">= 1.1.0", "< 1.16.0" ]
28
+ gem.add_development_dependency "rbs", [ ">= 1.7.0", "< 1.8.0" ]
29
+ gem.add_development_dependency "rspec", [ ">=3.0.0", "< 3.11.0" ]
30
+ gem.add_development_dependency "steep", [ ">= 0.47.0", "< 0.48.0" ]
29
31
  gem.files = Dir[
30
32
  "Gemfile", "LICENSE", "LICENSE_CLD3", "README.md",
31
- "cld3.gemspec", "ext/**/*", "lib/**/*"
33
+ "cld3.gemspec", "ext/**/*", "lib/**/*", "sig/**/*"
32
34
  ]
33
35
  gem.require_paths = [ "lib" ]
34
36
  gem.extensions = [ "ext/cld3/extconf.rb" ]
data/ext/cld3/Makefile ADDED
@@ -0,0 +1,267 @@
1
+
2
+ SHELL = /bin/sh
3
+
4
+ # V=0 quiet, V=1 verbose. other values don't work.
5
+ V = 1
6
+ Q1 = $(V:1=)
7
+ Q = $(Q1:0=@)
8
+ ECHO1 = $(V:1=@ :)
9
+ ECHO = $(ECHO1:0=@ echo)
10
+ NULLCMD = :
11
+
12
+ #### Start of system configuration section. ####
13
+
14
+ srcdir = .
15
+ topdir = /usr/include
16
+ hdrdir = $(topdir)
17
+ arch_hdrdir = /usr/include
18
+ PATH_SEPARATOR = :
19
+ VPATH = $(srcdir):$(arch_hdrdir)/ruby:$(hdrdir)/ruby
20
+ prefix = $(DESTDIR)/usr
21
+ rubysitearchprefix = $(sitearchlibdir)/$(RUBY_BASE_NAME)
22
+ rubyarchprefix = $(DESTDIR)/usr/lib64/ruby
23
+ rubylibprefix = $(exec_prefix)/share/ruby
24
+ exec_prefix = $(DESTDIR)/usr
25
+ vendorarchhdrdir = $(vendorhdrdir)/$(arch)
26
+ sitearchhdrdir = $(sitehdrdir)/$(arch)
27
+ rubyarchhdrdir = $(DESTDIR)/usr/include
28
+ vendorhdrdir = $(rubyhdrdir)/vendor_ruby
29
+ sitehdrdir = $(rubyhdrdir)/site_ruby
30
+ rubyhdrdir = $(DESTDIR)/usr/include
31
+ rubygemsdir = $(DESTDIR)/usr/share/rubygems
32
+ vendorarchdir = $(DESTDIR)/usr/lib64/ruby/vendor_ruby
33
+ vendorlibdir = $(vendordir)
34
+ vendordir = $(DESTDIR)/usr/share/ruby/vendor_ruby
35
+ sitearchdir = $(DESTDIR)/usr/local/lib64/ruby/site_ruby
36
+ sitelibdir = $(sitedir)
37
+ sitedir = $(DESTDIR)/usr/local/share/ruby/site_ruby
38
+ rubyarchdir = $(rubyarchprefix)
39
+ rubylibdir = $(rubylibprefix)
40
+ sitearchincludedir = $(includedir)/$(sitearch)
41
+ archincludedir = $(includedir)/$(arch)
42
+ sitearchlibdir = $(libdir)/$(sitearch)
43
+ archlibdir = $(DESTDIR)/usr/lib64
44
+ ridir = $(datarootdir)/$(RI_BASE_NAME)
45
+ mandir = $(DESTDIR)/usr/share/man
46
+ localedir = $(datarootdir)/locale
47
+ libdir = $(exec_prefix)/lib64
48
+ psdir = $(docdir)
49
+ pdfdir = $(docdir)
50
+ dvidir = $(docdir)
51
+ htmldir = $(docdir)
52
+ infodir = $(DESTDIR)/usr/share/info
53
+ docdir = $(datarootdir)/doc/$(PACKAGE)
54
+ oldincludedir = $(DESTDIR)/usr/include
55
+ includedir = $(DESTDIR)/usr/include
56
+ runstatedir = $(localstatedir)/run
57
+ localstatedir = $(DESTDIR)/var
58
+ sharedstatedir = $(DESTDIR)/var/lib
59
+ sysconfdir = $(DESTDIR)/etc
60
+ datadir = $(DESTDIR)/usr/share
61
+ datarootdir = $(prefix)/share
62
+ libexecdir = $(DESTDIR)/usr/libexec
63
+ sbindir = $(DESTDIR)/usr/sbin
64
+ bindir = $(exec_prefix)/bin
65
+ archdir = $(rubyarchdir)
66
+
67
+
68
+ CC_WRAPPER =
69
+ CC = gcc
70
+ CXX = g++
71
+ LIBRUBY = $(LIBRUBY_SO)
72
+ LIBRUBY_A = lib$(RUBY_SO_NAME)-static.a
73
+ LIBRUBYARG_SHARED = -l$(RUBY_SO_NAME)
74
+ LIBRUBYARG_STATIC = -l$(RUBY_SO_NAME)-static $(MAINLIBS)
75
+ empty =
76
+ OUTFLAG = -o $(empty)
77
+ COUTFLAG = -o $(empty)
78
+ CSRCFLAG = $(empty)
79
+
80
+ RUBY_EXTCONF_H =
81
+ cflags = $(optflags) $(debugflags) $(warnflags)
82
+ cxxflags =
83
+ optflags = -O3
84
+ debugflags = -ggdb3
85
+ warnflags = -Wall -Wextra -Wdeprecated-declarations -Wduplicated-cond -Wimplicit-function-declaration -Wimplicit-int -Wmisleading-indentation -Wpointer-arith -Wwrite-strings -Wimplicit-fallthrough=0 -Wmissing-noreturn -Wno-cast-function-type -Wno-constant-logical-operand -Wno-long-long -Wno-missing-field-initializers -Wno-overlength-strings -Wno-packed-bitfield-compat -Wno-parentheses-equality -Wno-self-assign -Wno-tautological-compare -Wno-unused-parameter -Wno-unused-value -Wsuggest-attribute=format -Wsuggest-attribute=noreturn -Wunused-variable
86
+ cppflags =
87
+ CCDLFLAGS = -fPIC
88
+ CFLAGS = $(CCDLFLAGS) -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/redhat/redhat-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/redhat/redhat-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -fPIC $(ARCH_FLAG)
89
+ INCFLAGS = -I. -I$(arch_hdrdir) -I$(hdrdir)/ruby/backward -I$(hdrdir) -I$(srcdir)
90
+ DEFS =
91
+ CPPFLAGS = $(DEFS) $(cppflags)
92
+ CXXFLAGS = $(CCDLFLAGS) -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/redhat/redhat-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/redhat/redhat-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -fvisibility=hidden -std=c++11 $(ARCH_FLAG)
93
+ ldflags = -L. -Wl,-z,relro -Wl,--as-needed -Wl,-z,now -specs=/usr/lib/rpm/redhat/redhat-hardened-ld -fstack-protector-strong -rdynamic -Wl,-export-dynamic
94
+ dldflags = -Wl,-z,relro -Wl,--as-needed -Wl,-z,now -specs=/usr/lib/rpm/redhat/redhat-hardened-ld
95
+ ARCH_FLAG =
96
+ DLDFLAGS = $(ldflags) $(dldflags) $(ARCH_FLAG)
97
+ LDSHARED = $(CC) -shared
98
+ LDSHAREDXX = $(CXX) -shared
99
+ AR = gcc-ar
100
+ EXEEXT =
101
+
102
+ RUBY_INSTALL_NAME = $(RUBY_BASE_NAME)
103
+ RUBY_SO_NAME = ruby
104
+ RUBYW_INSTALL_NAME =
105
+ RUBY_VERSION_NAME = $(RUBY_BASE_NAME)-$(ruby_version_dir_name)
106
+ RUBYW_BASE_NAME = rubyw
107
+ RUBY_BASE_NAME = ruby
108
+
109
+ arch = aarch64-linux
110
+ sitearch = $(arch)
111
+ ruby_version = 3.0.0
112
+ ruby = $(bindir)/$(RUBY_BASE_NAME)
113
+ RUBY = $(ruby)
114
+ ruby_headers = $(hdrdir)/ruby.h $(hdrdir)/ruby/backward.h $(hdrdir)/ruby/ruby.h $(hdrdir)/ruby/defines.h $(hdrdir)/ruby/missing.h $(hdrdir)/ruby/intern.h $(hdrdir)/ruby/st.h $(hdrdir)/ruby/subst.h $(arch_hdrdir)/ruby/config.h
115
+
116
+ RM = rm -f
117
+ RM_RF = $(RUBY) -run -e rm -- -rf
118
+ RMDIRS = rmdir --ignore-fail-on-non-empty -p
119
+ MAKEDIRS = /usr/bin/mkdir -p
120
+ INSTALL = /usr/bin/install -c
121
+ INSTALL_PROG = $(INSTALL) -m 0755
122
+ INSTALL_DATA = $(INSTALL) -m 644
123
+ COPY = cp
124
+ TOUCH = exit >
125
+
126
+ #### End of system configuration section. ####
127
+
128
+ preload =
129
+ libpath = . $(archlibdir)
130
+ LIBPATH = -L. -L$(archlibdir)
131
+ DEFFILE =
132
+
133
+ CLEANFILES = mkmf.log
134
+ DISTCLEANFILES =
135
+ DISTCLEANDIRS =
136
+
137
+ extout =
138
+ extout_prefix =
139
+ target_prefix =
140
+ LOCAL_LIBS =
141
+ LIBS = -lprotobuf -lpthread -lm -lc
142
+ ORIG_SRCS = base.cc embedding_feature_extractor.cc embedding_network.cc feature_extractor.cc feature_extractor.pb.cc feature_types.cc fixunicodevalue.cc fml_parser.cc generated_entities.cc generated_ulscript.cc getonescriptspan.cc lang_id_nn_params.cc language_identifier_features.cc nnet_language_identifier.cc nnet_language_identifier_c.cc offsetmap.cc registry.cc relevant_script_feature.cc sentence.pb.cc sentence_features.cc task_context.cc task_context_params.cc task_spec.pb.cc text_processing.cc unicodetext.cc utf8statetable.cc utils.cc workspace.cc
143
+ SRCS = $(ORIG_SRCS)
144
+ OBJS = base.o embedding_feature_extractor.o embedding_network.o feature_extractor.o feature_extractor.pb.o feature_types.o fixunicodevalue.o fml_parser.o generated_entities.o generated_ulscript.o getonescriptspan.o lang_id_nn_params.o language_identifier_features.o nnet_language_identifier.o nnet_language_identifier_c.o offsetmap.o registry.o relevant_script_feature.o sentence.pb.o sentence_features.o task_context.o task_context_params.o task_spec.pb.o text_processing.o unicodetext.o utf8statetable.o utils.o workspace.o
145
+ HDRS = $(srcdir)/base.h $(srcdir)/casts.h $(srcdir)/embedding_feature_extractor.h $(srcdir)/embedding_network.h $(srcdir)/embedding_network_params.h $(srcdir)/feature_extractor.h $(srcdir)/feature_extractor.pb.h $(srcdir)/feature_types.h $(srcdir)/fixunicodevalue.h $(srcdir)/float16.h $(srcdir)/fml_parser.h $(srcdir)/generated_ulscript.h $(srcdir)/getonescriptspan.h $(srcdir)/integral_types.h $(srcdir)/lang_id_nn_params.h $(srcdir)/language_identifier_features.h $(srcdir)/nnet_language_identifier.h $(srcdir)/offsetmap.h $(srcdir)/port.h $(srcdir)/registry.h $(srcdir)/relevant_script_feature.h $(srcdir)/script_detector.h $(srcdir)/sentence.pb.h $(srcdir)/sentence_features.h $(srcdir)/simple_adder.h $(srcdir)/stringpiece.h $(srcdir)/task_context.h $(srcdir)/task_context_params.h $(srcdir)/task_spec.pb.h $(srcdir)/text_processing.h $(srcdir)/unicodetext.h $(srcdir)/utf8acceptinterchange.h $(srcdir)/utf8prop_lettermarkscriptnum.h $(srcdir)/utf8repl_lettermarklower.h $(srcdir)/utf8scannot_lettermarkspecial.h $(srcdir)/utf8statetable.h $(srcdir)/utils.h $(srcdir)/workspace.h
146
+ LOCAL_HDRS =
147
+ TARGET = libcld3
148
+ TARGET_NAME = libcld3
149
+ TARGET_ENTRY = Init_$(TARGET_NAME)
150
+ DLLIB = $(TARGET).so
151
+ EXTSTATIC =
152
+ STATIC_LIB =
153
+
154
+ TIMESTAMP_DIR = .
155
+ BINDIR = $(bindir)
156
+ RUBYCOMMONDIR = $(sitedir)$(target_prefix)
157
+ RUBYLIBDIR = $(sitelibdir)$(target_prefix)
158
+ RUBYARCHDIR = $(sitearchdir)$(target_prefix)
159
+ HDRDIR = $(sitehdrdir)$(target_prefix)
160
+ ARCHHDRDIR = $(sitearchhdrdir)$(target_prefix)
161
+ TARGET_SO_DIR =
162
+ TARGET_SO = $(TARGET_SO_DIR)$(DLLIB)
163
+ CLEANLIBS = $(TARGET_SO)
164
+ CLEANOBJS = *.o *.bak
165
+
166
+ all: $(DLLIB)
167
+ static: $(STATIC_LIB)
168
+ .PHONY: all install static install-so install-rb
169
+ .PHONY: clean clean-so clean-static clean-rb
170
+
171
+ clean-static::
172
+ clean-rb-default::
173
+ clean-rb::
174
+ clean-so::
175
+ clean: clean-so clean-static clean-rb-default clean-rb
176
+ -$(Q)$(RM) $(CLEANLIBS) $(CLEANOBJS) $(CLEANFILES) .*.time
177
+
178
+ distclean-rb-default::
179
+ distclean-rb::
180
+ distclean-so::
181
+ distclean-static::
182
+ distclean: clean distclean-so distclean-static distclean-rb-default distclean-rb
183
+ -$(Q)$(RM) Makefile $(RUBY_EXTCONF_H) conftest.* mkmf.log
184
+ -$(Q)$(RM) core ruby$(EXEEXT) *~ $(DISTCLEANFILES)
185
+ -$(Q)$(RMDIRS) $(DISTCLEANDIRS) 2> /dev/null || true
186
+
187
+ realclean: distclean
188
+ install: install-so install-rb
189
+
190
+ install-so: $(DLLIB) $(TIMESTAMP_DIR)/.sitearchdir.time
191
+ $(INSTALL_PROG) $(DLLIB) $(RUBYARCHDIR)
192
+ clean-static::
193
+ -$(Q)$(RM) $(STATIC_LIB)
194
+ install-rb: pre-install-rb do-install-rb install-rb-default
195
+ install-rb-default: pre-install-rb-default do-install-rb-default
196
+ pre-install-rb: Makefile
197
+ pre-install-rb-default: Makefile
198
+ do-install-rb:
199
+ do-install-rb-default:
200
+ pre-install-rb-default:
201
+ @$(NULLCMD)
202
+ $(TIMESTAMP_DIR)/.sitearchdir.time:
203
+ $(Q) $(MAKEDIRS) $(@D) $(RUBYARCHDIR)
204
+ $(Q) $(TOUCH) $@
205
+
206
+ site-install: site-install-so site-install-rb
207
+ site-install-so: install-so
208
+ site-install-rb: install-rb
209
+
210
+ .SUFFIXES: .c .m .cc .mm .cxx .cpp .o .S
211
+
212
+ .cc.o:
213
+ $(ECHO) compiling $(<)
214
+ $(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -c $(CSRCFLAG)$<
215
+
216
+ .cc.S:
217
+ $(ECHO) translating $(<)
218
+ $(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -S $(CSRCFLAG)$<
219
+
220
+ .mm.o:
221
+ $(ECHO) compiling $(<)
222
+ $(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -c $(CSRCFLAG)$<
223
+
224
+ .mm.S:
225
+ $(ECHO) translating $(<)
226
+ $(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -S $(CSRCFLAG)$<
227
+
228
+ .cxx.o:
229
+ $(ECHO) compiling $(<)
230
+ $(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -c $(CSRCFLAG)$<
231
+
232
+ .cxx.S:
233
+ $(ECHO) translating $(<)
234
+ $(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -S $(CSRCFLAG)$<
235
+
236
+ .cpp.o:
237
+ $(ECHO) compiling $(<)
238
+ $(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -c $(CSRCFLAG)$<
239
+
240
+ .cpp.S:
241
+ $(ECHO) translating $(<)
242
+ $(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -S $(CSRCFLAG)$<
243
+
244
+ .c.o:
245
+ $(ECHO) compiling $(<)
246
+ $(Q) $(CC) $(INCFLAGS) $(CPPFLAGS) $(CFLAGS) $(COUTFLAG)$@ -c $(CSRCFLAG)$<
247
+
248
+ .c.S:
249
+ $(ECHO) translating $(<)
250
+ $(Q) $(CC) $(INCFLAGS) $(CPPFLAGS) $(CFLAGS) $(COUTFLAG)$@ -S $(CSRCFLAG)$<
251
+
252
+ .m.o:
253
+ $(ECHO) compiling $(<)
254
+ $(Q) $(CC) $(INCFLAGS) $(CPPFLAGS) $(CFLAGS) $(COUTFLAG)$@ -c $(CSRCFLAG)$<
255
+
256
+ .m.S:
257
+ $(ECHO) translating $(<)
258
+ $(Q) $(CC) $(INCFLAGS) $(CPPFLAGS) $(CFLAGS) $(COUTFLAG)$@ -S $(CSRCFLAG)$<
259
+
260
+ $(TARGET_SO): $(OBJS) Makefile
261
+ $(ECHO) linking shared-object $(DLLIB)
262
+ -$(Q)$(RM) $(@)
263
+ $(Q) $(LDSHAREDXX) -o $@ $(OBJS) $(LIBPATH) $(DLDFLAGS) $(LOCAL_LIBS) $(LIBS)
264
+
265
+
266
+
267
+ $(OBJS): $(HDRS) $(ruby_headers)
data/ext/cld3/base.o ADDED
Binary file
@@ -167,6 +167,7 @@ EmbeddingNetwork::EmbeddingNetwork(const EmbeddingNetworkParams *model)
167
167
  for (int i = 0; i < model_->embedding_dim_size(); ++i) {
168
168
  CLD3_DCHECK(offset_sum == model_->concat_offset(i));
169
169
  offset_sum += model_->embedding_dim(i) * model_->embedding_num_features(i);
170
+ (void)offset_sum; // Avoid compiler warning for "unused" variable.
170
171
  embedding_matrices_.emplace_back(model_->GetEmbeddingMatrix(i));
171
172
  }
172
173
 
Binary file
data/ext/cld3/extconf.rb CHANGED
@@ -1,4 +1,4 @@
1
- # Copyright 2017 Akihiko Odaki <akihiko.odaki.4i@stu.hosei.ac.jp>
1
+ # Copyright 2017 Akihiko Odaki <akihiko.odaki@gmail.com>
2
2
  # All Rights Reserved.
3
3
  #
4
4
  # Licensed under the Apache License, Version 2.0 (the "License");
@@ -33,7 +33,7 @@ FileUtils.mkdir_p("cld_3/protos")
33
33
  FileUtils.mkdir_p("script_span")
34
34
 
35
35
  [ "feature_extractor", "sentence", "task_spec" ].each {|name|
36
- `protoc '#{name}.proto' --cpp_out=.`
36
+ system "protoc", "#{name}.proto", "--cpp_out=.", exception: true
37
37
  ln_fallback("#{name}.pb.h", "cld_3/protos/#{name}.pb.h")
38
38
  }
39
39
 
@@ -56,4 +56,5 @@ FileUtils.mkdir_p("script_span")
56
56
  }
57
57
 
58
58
  $CXXFLAGS += " -fvisibility=hidden -std=c++11"
59
+ $LIBRUBYARG = ""
59
60
  create_makefile("libcld3")
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
@@ -33,14 +33,14 @@ static const int kMaxScriptBytes = kMaxScriptBuffer - 32; // Leave some room
33
33
  static const int kWithinScriptTail = 32; // Stop at word space in last
34
34
  // N bytes of script buffer
35
35
 
36
- typedef struct {
36
+ struct LangSpan {
37
37
  char* text = nullptr; // Pointer to the span, somewhere
38
38
  int text_bytes = 0; // Number of bytes of text in the span
39
39
  int offset = 0; // Offset of start of span in original input buffer
40
40
  ULScript ulscript = UNKNOWN_ULSCRIPT; // Unicode Letters Script of this span
41
41
  bool truncated = false; // true if buffer filled up before a
42
42
  // different script or EOF was found
43
- } LangSpan;
43
+ };
44
44
 
45
45
  static inline bool IsContinuationByte(char c) {
46
46
  return static_cast<signed char>(c) < -64;
Binary file
Binary file
@@ -0,0 +1,8 @@
1
+ EXPORTS
2
+ NNetLanguageIdentifier_find_language
3
+ NNetLanguageIdentifier_find_top_n_most_freq_langs
4
+ delete_NNetLanguageIdentifier
5
+ delete_result
6
+ delete_results
7
+ new_NNetLanguageIdentifier
8
+ refer_to_nth_result
Binary file
data/ext/cld3/mkmf.log ADDED
@@ -0,0 +1,37 @@
1
+ "pkg-config --exists protobuf"
2
+ | pkg-config --libs protobuf
3
+ => "-lprotobuf -lpthread \n"
4
+ "gcc -o conftest -I/usr/include -I/usr/include/ruby/backward -I/usr/include -I. -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/redhat/redhat-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/redhat/redhat-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -fPIC conftest.c -L. -L/usr/lib64 -L. -Wl,-z,relro -Wl,--as-needed -Wl,-z,now -specs=/usr/lib/rpm/redhat/redhat-hardened-ld -fstack-protector-strong -rdynamic -Wl,-export-dynamic -lruby -lm -lc"
5
+ checked program was:
6
+ /* begin */
7
+ 1: #include "ruby.h"
8
+ 2:
9
+ 3: int main(int argc, char **argv)
10
+ 4: {
11
+ 5: return !!argv[argc];
12
+ 6: }
13
+ /* end */
14
+
15
+ "gcc -o conftest -I/usr/include -I/usr/include/ruby/backward -I/usr/include -I. -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/redhat/redhat-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/redhat/redhat-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -fPIC conftest.c -L. -L/usr/lib64 -L. -Wl,-z,relro -Wl,--as-needed -Wl,-z,now -specs=/usr/lib/rpm/redhat/redhat-hardened-ld -fstack-protector-strong -rdynamic -Wl,-export-dynamic -lruby -lprotobuf -lpthread -lm -lc"
16
+ checked program was:
17
+ /* begin */
18
+ 1: #include "ruby.h"
19
+ 2:
20
+ 3: int main(int argc, char **argv)
21
+ 4: {
22
+ 5: return !!argv[argc];
23
+ 6: }
24
+ /* end */
25
+
26
+ | pkg-config --cflags-only-I protobuf
27
+ => "\n"
28
+ | pkg-config --cflags-only-other protobuf
29
+ => "\n"
30
+ | pkg-config --libs-only-l protobuf
31
+ => "-lprotobuf -lpthread \n"
32
+ package configuration for protobuf
33
+ incflags:
34
+ cflags:
35
+ ldflags:
36
+ libs: -lprotobuf -lpthread
37
+
@@ -284,8 +284,6 @@ NNetLanguageIdentifier::FindTopNMostFreqLangs(const string &text,
284
284
  CLD2::LangSpan script_span;
285
285
  std::unordered_map<string, LangChunksStats> lang_stats;
286
286
  int total_num_bytes = 0;
287
- Result result;
288
- string language;
289
287
  int chunk_size = 0; // Use the default.
290
288
  while (ss.GetOneScriptSpanLower(&script_span)) {
291
289
  const int num_original_span_bytes = script_span.text_bytes;
@@ -302,8 +300,8 @@ NNetLanguageIdentifier::FindTopNMostFreqLangs(const string &text,
302
300
 
303
301
  const string selected_text = SelectTextGivenScriptSpan(script_span);
304
302
 
305
- result = FindLanguageOfValidUTF8(selected_text);
306
- language = result.language;
303
+ Result result = FindLanguageOfValidUTF8(selected_text);
304
+ string language = result.language;
307
305
  lang_stats[language].byte_sum += num_original_span_bytes;
308
306
  lang_stats[language].prob_sum +=
309
307
  result.probability * num_original_span_bytes;
@@ -356,7 +354,7 @@ string NNetLanguageIdentifier::SelectTextGivenBeginAndSize(
356
354
  const char *text_begin, int text_size) {
357
355
  string output_text;
358
356
 
359
- // If the size of the input is greater than the maxium number of bytes needed
357
+ // If the size of the input is greater than the maximum number of bytes needed
360
358
  // for a prediction, then concatenate snippets that are equally spread out
361
359
  // throughout the input.
362
360
  if (text_size > max_num_bytes_) {
Binary file
@@ -1,4 +1,4 @@
1
- /* Copyright 2017 Akihiko Odaki <akihiko.odaki.4i@stu.hosei.ac.jp>
1
+ /* Copyright 2017 Akihiko Odaki <akihiko.odaki@gmail.com>
2
2
  All Rights Reserved.
3
3
 
4
4
  Licensed under the Apache License, Version 2.0 (the "License");
Binary file
Binary file
Binary file
Binary file
@@ -19,11 +19,11 @@ limitations under the License.
19
19
 
20
20
  namespace chrome_lang_id {
21
21
 
22
- // Declare registry for the whole Sentence feature functions. NOTE: this is not
22
+ // Define registry for the whole Sentence feature functions. NOTE: this is not
23
23
  // yet set to anything meaningful. It will be set so in NNetLanguageIdentifier
24
24
  // constructor, *before* we use any feature.
25
25
  template <>
26
- WholeSentenceFeature::Registry
27
- *RegisterableClass<WholeSentenceFeature>::registry_ = nullptr;
26
+ WholeSentenceFeature::Registry*
27
+ RegisterableClass<WholeSentenceFeature>::registry_ = nullptr;
28
28
 
29
- } // namespace chrome_lang_id
29
+ } // namespace chrome_lang_id
@@ -26,9 +26,19 @@ limitations under the License.
26
26
  namespace chrome_lang_id {
27
27
 
28
28
  // Feature function that extracts features for the full Sentence.
29
- typedef FeatureFunction<Sentence> WholeSentenceFeature;
30
-
31
- typedef FeatureExtractor<Sentence> WholeSentenceExtractor;
29
+ using WholeSentenceFeature = FeatureFunction<Sentence>;
30
+
31
+ using WholeSentenceExtractor = FeatureExtractor<Sentence>;
32
+
33
+ // Declare registry for the whole Sentence feature functions. This is required
34
+ // for clang's -Wundefined-var-template. However, MSVC has a bug which treats
35
+ // this declaration as a definition, leading to multiple definition errors, so
36
+ // omit this on MSVC.
37
+ #if !defined(COMPILER_MSVC)
38
+ template <>
39
+ WholeSentenceFeature::Registry
40
+ *RegisterableClass<WholeSentenceFeature>::registry_;
41
+ #endif
32
42
 
33
43
  } // namespace chrome_lang_id
34
44
 
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
data/ext/cld3/utils.o ADDED
Binary file
Binary file
data/lib/a.rb ADDED
@@ -0,0 +1,24 @@
1
+ require "cld3"
2
+
3
+ # Kafka text as an example + the word Velcro
4
+ text = "Πολυαγαπημένε πατέρα πρόσφατα Velcro με ρώτησες κάποια φορά γιατί ισχυρίζομαι πως σε φοβάμαι. Εγώ δεν ήξερα, ως συνήθως, τι να σου απαντήσω, εν μέρει ακριβώς λόγω του φόβου που νιώθω για σένα, εν μέρει επειδή στην αιτιολόγηση του φόβου αυτού συγκαταλέγονται πάρα πολλές λεπτομέρειες, που εν τη ρύμη του λόγου εγώ ούτε κατά το ήμισυ δεν θα μπορούσα να τις συγκρατήσω. Κι αν εδώ προσπαθώ να σου απαντήσω γραπτώς, μόνο ανολοκλήρωτο κατά πολύ θα αποβεί και τούτο, επειδή και κατά τη γραφή ο φόβος και οι συνέπειές του με κωλύουν έναντί σου κι επειδή το μέγεθος του υλικού εν γένει υπερβαίνει κατά πολύ τη μνήμη μου και το λογικό μου. Για σένα το ζήτημα αποδεικνυόταν πάντοτε πολύ απλό, τουλάχιστον στον βαθμό που μιλούσες εσύ γι’ αυτό ενώπιόν μου και, αδιακρίτως, ενώπιον πολλών άλλων. Εσένα σου φαινόταν να είναι κάπως έτσι: Εσύ εργαζόσουν σκληρά σ’ όλη σου τη ζωή, τα πάντα για τα παιδιά σου, προ πάντων για εμένα τα θυσίαζες, εγώ έκαμνα συνεπώς «ζωή χαρισάμενη», είχα πλήρη ελευθερία να μάθω ό,τι ήθελα, κανέναν λόγο δεν είχα να έχω έγνοιες για την καθημερινή διατροφή, να έχω έγνοιες συνεπώς εν γένει• εσύ αντ’ αυτών καμμίαν ευγνωμοσύνη δεν αξίωνες, γνωρίζεις «την ευγνωμοσύνη των παιδιών, αλλά εν τούτοις τουλάχιστον μια "
5
+ pp text.bytesize
6
+
7
+ 200.times { |i|
8
+ max_bytes = 500 + i * 10
9
+ cld3 = CLD3::NNetLanguageIdentifier.new("foo", max_bytes)
10
+
11
+ lang = cld3.find_language(text)
12
+ lang2 = cld3.find_top_n_most_freq_langs(text, 1)
13
+
14
+ puts "When max_bytes is #{max_bytes} probability is less than 0.999: #{lang.probability}" if lang.probability < 0.999
15
+
16
+ if lang.language != :el
17
+ puts "When max_bytes is #{max_bytes} then cld3::find_language returns #{lang.language},
18
+ find_top_n_most_freq_langs returns #{lang2.first.language}"
19
+ #pp lang
20
+ #pp lang2
21
+ end
22
+ }
23
+
24
+ puts "Size: #{text.length} - Bytesize: #{text.encode(Encoding::UTF_8).bytesize}"
@@ -0,0 +1,58 @@
1
+
2
+ # Copyright 2021 Akihiko Odaki <akihiko.odaki@gmail.com>
3
+ # All Rights Reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ # ==============================================================================
17
+
18
+ module CLD3
19
+ module Unstable
20
+ extend FFI::Library
21
+
22
+ ffi_lib File.join(__dir__, "..", "..", "ext", "cld3", "libcld3." + RbConfig::CONFIG["DLEXT"])
23
+
24
+ module NNetLanguageIdentifier
25
+ class Pointer < FFI::AutoPointer
26
+ def self.release(pointer)
27
+ Unstable.delete_NNetLanguageIdentifier(pointer)
28
+ end
29
+ end
30
+
31
+ class SpanInfo < FFI::Struct
32
+ layout :start_index, :int, :end_index, :int, :probability, :float
33
+ end
34
+
35
+ class Result < FFI::Struct
36
+ layout :language_data, :pointer, :language_size, :size_t, :byte_ranges_data, :pointer, :byte_ranges_size, :size_t, :probability, :float, :proportion, :float, :reliable?, :bool
37
+ end
38
+ end
39
+
40
+ attach_function :delete_NNetLanguageIdentifier, [ :pointer ], :void
41
+
42
+ attach_function :delete_result, [ :pointer ], :void
43
+
44
+ attach_function :delete_results, [ :pointer ], :void
45
+
46
+ attach_function :new_NNetLanguageIdentifier, [ :int, :int ], :pointer
47
+
48
+ attach_function :refer_to_nth_result, [ :pointer, :size_t ], NNetLanguageIdentifier::Result.by_value
49
+
50
+ attach_function :NNetLanguageIdentifier_find_language,
51
+ [ :pointer, :buffer_in, :size_t ], :pointer
52
+
53
+ attach_function :NNetLanguageIdentifier_find_top_n_most_freq_langs,
54
+ [ :pointer, :buffer_in, :size_t, :int ], :pointer
55
+ end
56
+
57
+ private_constant :Unstable
58
+ end
data/lib/cld3.rb CHANGED
@@ -1,7 +1,7 @@
1
1
  # File including an implementation of the CLD3 module. Some documentation is
2
2
  # extracted from ext/cld3/ext/src/nnet_language_identifier.h.
3
3
  #
4
- # Copyright 2017 Akihiko Odaki <akihiko.odaki.4i@stu.hosei.ac.jp>
4
+ # Copyright 2017 Akihiko Odaki <akihiko.odaki@gmail.com>
5
5
  # All Rights Reserved.
6
6
  #
7
7
  # Licensed under the Apache License, Version 2.0 (the "License");
@@ -19,6 +19,7 @@
19
19
 
20
20
  require "ffi"
21
21
  require "rbconfig"
22
+ require "cld3/unstable"
22
23
 
23
24
  # Module providing an interface for Compact Language Detector v3 (CLD3)
24
25
  module CLD3
@@ -52,6 +53,7 @@ module CLD3
52
53
  # Holds probability that Span, specified by start/end indices, is a given
53
54
  # language. The language is not stored here; it can be found in Result, which
54
55
  # holds an Array of SpanInfo.
56
+ # @type const SpanInfo: untyped
55
57
  SpanInfo = Struct.new(:start_index, :end_index, :probability)
56
58
 
57
59
  # Information about a predicted language.
@@ -69,16 +71,18 @@ module CLD3
69
71
  #
70
72
  # [byte_ranges] Specifies the byte ranges in UTF-8 that |language| applies to.
71
73
  # This is an Array of SpanInfo.
74
+ # @type const Result: untyped
72
75
  Result = Struct.new(:language, :probability, :reliable?, :proportion, :byte_ranges)
73
76
 
74
- # The arguments are two String objects.
77
+ # The arguments are two Numeric objects.
75
78
  def initialize(min_num_bytes = MIN_NUM_BYTES_TO_CONSIDER, max_num_bytes = MAX_NUM_BYTES_TO_CONSIDER)
79
+ raise ArgumentError if max_num_bytes <= 0 || min_num_bytes < 0 || min_num_bytes >= max_num_bytes
76
80
  @cc = Unstable::NNetLanguageIdentifier::Pointer.new(Unstable.new_NNetLanguageIdentifier(min_num_bytes, max_num_bytes))
77
81
  end
78
82
 
79
83
  # Finds the most likely language for the given text, along with additional
80
84
  # information (e.g., probability). The prediction is based on the first N
81
- # bytes where N is the minumum between the number of interchange valid UTF8
85
+ # bytes where N is the minimum between the number of interchange valid UTF8
82
86
  # bytes and +max_num_bytes_+. If N is less than +min_num_bytes_+ long, then
83
87
  # this function returns nil.
84
88
  # The argument is a String object.
@@ -115,6 +119,8 @@ module CLD3
115
119
  # The second argument is a Numeric object.
116
120
  # The return value of this function is an Array of Result instances.
117
121
  def find_top_n_most_freq_langs(text, num_langs)
122
+ # @type var a: untyped
123
+
118
124
  text_utf8 = text.encode(Encoding::UTF_8)
119
125
  pointer = FFI::MemoryPointer.new(:char, text_utf8.bytesize)
120
126
 
@@ -123,11 +129,13 @@ module CLD3
123
129
 
124
130
  results = Unstable.NNetLanguageIdentifier_find_top_n_most_freq_langs(@cc, pointer, text_utf8.bytesize, num_langs)
125
131
  begin
126
- num_langs.times
132
+ a = num_langs.times
127
133
  .lazy
128
134
  .map { |index| convert_result Unstable.refer_to_nth_result(results, index) }
129
135
  .take_while { |result| !result.nil? }
130
136
  .to_a
137
+
138
+ a
131
139
  ensure
132
140
  Unstable.delete_results results
133
141
  end
@@ -162,6 +170,7 @@ module CLD3
162
170
  # The model weights are loaded statically.
163
171
  module TaskContextParams
164
172
  # This is a frozen Array object containing symbols.
173
+ # @type const LANGUAGE_NAMES: untyped
165
174
  LANGUAGE_NAMES = [
166
175
  :eo, :co, :eu, :ta, :de, :mt, :ps, :te, :su, :uz, :'zh-Latn', :ne,
167
176
  :nl, :sw, :sq, :hmn, :ja, :no, :mn, :so, :ko, :kk, :sl, :ig,
@@ -175,44 +184,4 @@ module CLD3
175
184
  :sn, :yo, :pa, :ku,
176
185
  ].freeze
177
186
  end
178
-
179
- module Unstable
180
- extend FFI::Library
181
-
182
- ffi_lib File.join(File.expand_path(File.dirname(__FILE__)), "..", "ext", "cld3", "libcld3." + RbConfig::CONFIG["DLEXT"])
183
-
184
- module NNetLanguageIdentifier
185
- class Pointer < FFI::AutoPointer
186
- def self.release(pointer)
187
- Unstable.delete_NNetLanguageIdentifier(pointer)
188
- end
189
- end
190
-
191
- class SpanInfo < FFI::Struct
192
- layout :start_index, :int, :end_index, :int, :probability, :float
193
- end
194
-
195
- class Result < FFI::Struct
196
- layout :language_data, :pointer, :language_size, :size_t, :byte_ranges_data, :pointer, :byte_ranges_size, :size_t, :probability, :float, :proportion, :float, :reliable?, :bool
197
- end
198
- end
199
-
200
- attach_function :delete_NNetLanguageIdentifier, [ :pointer ], :void
201
-
202
- attach_function :delete_result, [ :pointer ], :void
203
-
204
- attach_function :delete_results, [ :pointer ], :void
205
-
206
- attach_function :new_NNetLanguageIdentifier, [ :int, :int ], :pointer
207
-
208
- attach_function :refer_to_nth_result, [ :pointer, :size_t ], NNetLanguageIdentifier::Result.by_value
209
-
210
- attach_function :NNetLanguageIdentifier_find_language,
211
- [ :pointer, :buffer_in, :size_t ], :pointer
212
-
213
- attach_function :NNetLanguageIdentifier_find_top_n_most_freq_langs,
214
- [ :pointer, :buffer_in, :size_t, :int ], :pointer
215
- end
216
-
217
- private_constant :Unstable
218
187
  end
data/sig/cld3.rbs ADDED
@@ -0,0 +1,65 @@
1
+ # Copyright 2021 Akihiko Odaki <akihiko.odaki@gmail.com>
2
+ # All Rights Reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ # ==============================================================================
16
+
17
+ module CLD3
18
+ class NNetLanguageIdentifier
19
+ MIN_NUM_BYTES_TO_CONSIDER: Integer
20
+ MAX_NUM_BYTES_TO_CONSIDER: Integer
21
+ MAX_NUM_INPUT_BYTES_TO_CONSIDER: Integer
22
+ RELIABILITY_THRESHOLD: Float
23
+ RELIABILITY_HR_BS_THRESHOLD: Float
24
+
25
+ class SpanInfo < Struct[Float | Integer]
26
+ attr_accessor start_index(): Integer
27
+ attr_accessor end_index(): Integer
28
+ attr_accessor probability(): Float
29
+ end
30
+
31
+ class Result < Struct[Array[SpanInfo] | Float | TaskContextParams::language_names | bool]
32
+ attr_accessor language(): TaskContextParams::language_names
33
+ attr_accessor probability(): Float
34
+ attr_accessor reliable?(): bool
35
+ attr_accessor proportion(): Float
36
+ attr_accessor byte_ranges(): Array[SpanInfo]
37
+ end
38
+
39
+ def initialize: (?Integer, ?Integer) -> void
40
+ def find_language: (String) -> Result?
41
+ def find_top_n_most_freq_langs: (String, Integer) -> Array[Result]
42
+
43
+ private
44
+
45
+ def convert_result: (untyped) -> Result?
46
+ end
47
+
48
+ module TaskContextParams
49
+ type language_names =
50
+ :eo | :co | :eu | :ta | :de | :mt | :ps | :te | :su | :uz | :'zh-Latn' | :ne |
51
+ :nl | :sw | :sq | :hmn | :ja | :no | :mn | :so | :ko | :kk | :sl | :ig |
52
+ :mr | :th | :zu | :ml | :hr | :bs | :lo | :sd | :cy | :hy | :uk | :pt |
53
+ :lv | :iw | :cs | :vi | :jv | :be | :km | :mk | :tr | :fy | :am | :zh |
54
+ :da | :sv | :fi | :ht | :af | :la | :id | :fil | :sm | :ca | :el | :ka |
55
+ :sr | :it | :sk | :ru | :'ru-Latn' | :bg | :ny | :fa | :haw | :gl | :et |
56
+ :ms | :gd | :'bg-Latn' | :ha | :is | :ur | :mi | :hi | :bn | :'hi-Latn' | :fr |
57
+ :yi | :hu | :xh | :my | :tg | :ro | :ar | :lb | :'el-Latn' | :st | :ceb |
58
+ :kn | :az | :si | :ky | :mg | :en | :gu | :es | :pl | :'ja-Latn' | :ga | :lt |
59
+ :sn | :yo | :pa | :ku
60
+
61
+ LANGUAGE_NAMES: Array[language_names]
62
+ end
63
+
64
+ Unstable: untyped
65
+ end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cld3
3
3
  version: !ruby/object:Gem::Version
4
- version: 3.3.0
4
+ version: 3.4.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Akihiko Odaki
8
- autorequire:
8
+ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-02-09 00:00:00.000000000 Z
11
+ date: 2022-01-20 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: ffi
@@ -19,7 +19,7 @@ dependencies:
19
19
  version: 1.1.0
20
20
  - - "<"
21
21
  - !ruby/object:Gem::Version
22
- version: 1.12.0
22
+ version: 1.16.0
23
23
  type: :runtime
24
24
  prerelease: false
25
25
  version_requirements: !ruby/object:Gem::Requirement
@@ -29,7 +29,27 @@ dependencies:
29
29
  version: 1.1.0
30
30
  - - "<"
31
31
  - !ruby/object:Gem::Version
32
- version: 1.12.0
32
+ version: 1.16.0
33
+ - !ruby/object:Gem::Dependency
34
+ name: rbs
35
+ requirement: !ruby/object:Gem::Requirement
36
+ requirements:
37
+ - - ">="
38
+ - !ruby/object:Gem::Version
39
+ version: 1.7.0
40
+ - - "<"
41
+ - !ruby/object:Gem::Version
42
+ version: 1.8.0
43
+ type: :development
44
+ prerelease: false
45
+ version_requirements: !ruby/object:Gem::Requirement
46
+ requirements:
47
+ - - ">="
48
+ - !ruby/object:Gem::Version
49
+ version: 1.7.0
50
+ - - "<"
51
+ - !ruby/object:Gem::Version
52
+ version: 1.8.0
33
53
  - !ruby/object:Gem::Dependency
34
54
  name: rspec
35
55
  requirement: !ruby/object:Gem::Requirement
@@ -39,7 +59,7 @@ dependencies:
39
59
  version: 3.0.0
40
60
  - - "<"
41
61
  - !ruby/object:Gem::Version
42
- version: 3.10.0
62
+ version: 3.11.0
43
63
  type: :development
44
64
  prerelease: false
45
65
  version_requirements: !ruby/object:Gem::Requirement
@@ -49,10 +69,30 @@ dependencies:
49
69
  version: 3.0.0
50
70
  - - "<"
51
71
  - !ruby/object:Gem::Version
52
- version: 3.10.0
72
+ version: 3.11.0
73
+ - !ruby/object:Gem::Dependency
74
+ name: steep
75
+ requirement: !ruby/object:Gem::Requirement
76
+ requirements:
77
+ - - ">="
78
+ - !ruby/object:Gem::Version
79
+ version: 0.47.0
80
+ - - "<"
81
+ - !ruby/object:Gem::Version
82
+ version: 0.48.0
83
+ type: :development
84
+ prerelease: false
85
+ version_requirements: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: 0.47.0
90
+ - - "<"
91
+ - !ruby/object:Gem::Version
92
+ version: 0.48.0
53
93
  description: Compact Language Detector v3 (CLD3) is a neural network model for language
54
94
  identification.
55
- email: akihiko.odaki.4i@stu.hosei.ac.jp
95
+ email: akihiko.odaki@gmail.com
56
96
  executables: []
57
97
  extensions:
58
98
  - ext/cld3/extconf.rb
@@ -63,76 +103,111 @@ files:
63
103
  - LICENSE_CLD3
64
104
  - README.md
65
105
  - cld3.gemspec
106
+ - ext/cld3/Makefile
66
107
  - ext/cld3/base.cc
67
108
  - ext/cld3/base.h
109
+ - ext/cld3/base.o
68
110
  - ext/cld3/casts.h
69
111
  - ext/cld3/embedding_feature_extractor.cc
70
112
  - ext/cld3/embedding_feature_extractor.h
113
+ - ext/cld3/embedding_feature_extractor.o
71
114
  - ext/cld3/embedding_network.cc
72
115
  - ext/cld3/embedding_network.h
116
+ - ext/cld3/embedding_network.o
73
117
  - ext/cld3/embedding_network_params.h
74
118
  - ext/cld3/extconf.rb
75
119
  - ext/cld3/feature_extractor.cc
76
120
  - ext/cld3/feature_extractor.h
121
+ - ext/cld3/feature_extractor.o
122
+ - ext/cld3/feature_extractor.pb.o
77
123
  - ext/cld3/feature_extractor.proto
78
124
  - ext/cld3/feature_types.cc
79
125
  - ext/cld3/feature_types.h
126
+ - ext/cld3/feature_types.o
80
127
  - ext/cld3/fixunicodevalue.cc
81
128
  - ext/cld3/fixunicodevalue.h
129
+ - ext/cld3/fixunicodevalue.o
82
130
  - ext/cld3/float16.h
83
131
  - ext/cld3/fml_parser.cc
84
132
  - ext/cld3/fml_parser.h
133
+ - ext/cld3/fml_parser.o
85
134
  - ext/cld3/generated_entities.cc
135
+ - ext/cld3/generated_entities.o
86
136
  - ext/cld3/generated_ulscript.cc
87
137
  - ext/cld3/generated_ulscript.h
138
+ - ext/cld3/generated_ulscript.o
88
139
  - ext/cld3/getonescriptspan.cc
89
140
  - ext/cld3/getonescriptspan.h
141
+ - ext/cld3/getonescriptspan.o
90
142
  - ext/cld3/integral_types.h
91
143
  - ext/cld3/lang_id_nn_params.cc
92
144
  - ext/cld3/lang_id_nn_params.h
145
+ - ext/cld3/lang_id_nn_params.o
93
146
  - ext/cld3/language_identifier_features.cc
94
147
  - ext/cld3/language_identifier_features.h
148
+ - ext/cld3/language_identifier_features.o
149
+ - ext/cld3/libcld3.def
150
+ - ext/cld3/libcld3.so
151
+ - ext/cld3/mkmf.log
95
152
  - ext/cld3/nnet_language_identifier.cc
96
153
  - ext/cld3/nnet_language_identifier.h
154
+ - ext/cld3/nnet_language_identifier.o
97
155
  - ext/cld3/nnet_language_identifier_c.cc
156
+ - ext/cld3/nnet_language_identifier_c.o
98
157
  - ext/cld3/offsetmap.cc
99
158
  - ext/cld3/offsetmap.h
159
+ - ext/cld3/offsetmap.o
100
160
  - ext/cld3/port.h
101
161
  - ext/cld3/registry.cc
102
162
  - ext/cld3/registry.h
163
+ - ext/cld3/registry.o
103
164
  - ext/cld3/relevant_script_feature.cc
104
165
  - ext/cld3/relevant_script_feature.h
166
+ - ext/cld3/relevant_script_feature.o
105
167
  - ext/cld3/script_detector.h
168
+ - ext/cld3/sentence.pb.o
106
169
  - ext/cld3/sentence.proto
107
170
  - ext/cld3/sentence_features.cc
108
171
  - ext/cld3/sentence_features.h
172
+ - ext/cld3/sentence_features.o
109
173
  - ext/cld3/simple_adder.h
110
174
  - ext/cld3/stringpiece.h
111
175
  - ext/cld3/task_context.cc
112
176
  - ext/cld3/task_context.h
177
+ - ext/cld3/task_context.o
113
178
  - ext/cld3/task_context_params.cc
114
179
  - ext/cld3/task_context_params.h
180
+ - ext/cld3/task_context_params.o
181
+ - ext/cld3/task_spec.pb.o
115
182
  - ext/cld3/task_spec.proto
116
183
  - ext/cld3/text_processing.cc
117
184
  - ext/cld3/text_processing.h
185
+ - ext/cld3/text_processing.o
118
186
  - ext/cld3/unicodetext.cc
119
187
  - ext/cld3/unicodetext.h
188
+ - ext/cld3/unicodetext.o
120
189
  - ext/cld3/utf8acceptinterchange.h
121
190
  - ext/cld3/utf8prop_lettermarkscriptnum.h
122
191
  - ext/cld3/utf8repl_lettermarklower.h
123
192
  - ext/cld3/utf8scannot_lettermarkspecial.h
124
193
  - ext/cld3/utf8statetable.cc
125
194
  - ext/cld3/utf8statetable.h
195
+ - ext/cld3/utf8statetable.o
126
196
  - ext/cld3/utils.cc
127
197
  - ext/cld3/utils.h
198
+ - ext/cld3/utils.o
128
199
  - ext/cld3/workspace.cc
129
200
  - ext/cld3/workspace.h
201
+ - ext/cld3/workspace.o
202
+ - lib/a.rb
130
203
  - lib/cld3.rb
204
+ - lib/cld3/unstable.rb
205
+ - sig/cld3.rbs
131
206
  homepage: https://github.com/akihikodaki/cld3-ruby
132
207
  licenses:
133
208
  - Apache-2.0
134
209
  metadata: {}
135
- post_install_message:
210
+ post_install_message:
136
211
  rdoc_options: []
137
212
  require_paths:
138
213
  - lib
@@ -140,18 +215,18 @@ required_ruby_version: !ruby/object:Gem::Requirement
140
215
  requirements:
141
216
  - - ">="
142
217
  - !ruby/object:Gem::Version
143
- version: 2.3.0
218
+ version: 2.6.0
144
219
  - - "<"
145
220
  - !ruby/object:Gem::Version
146
- version: 2.8.0
221
+ version: 3.2.0
147
222
  required_rubygems_version: !ruby/object:Gem::Requirement
148
223
  requirements:
149
224
  - - ">="
150
225
  - !ruby/object:Gem::Version
151
226
  version: '0'
152
227
  requirements: []
153
- rubygems_version: 3.1.2
154
- signing_key:
228
+ rubygems_version: 3.2.22
229
+ signing_key:
155
230
  specification_version: 4
156
231
  summary: Compact Language Detector v3 (CLD3)
157
232
  test_files: []