cld3 3.3.0 → 3.4.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +2 -1
- data/LICENSE +2 -2
- data/README.md +0 -18
- data/cld3.gemspec +9 -7
- data/ext/cld3/Makefile +267 -0
- data/ext/cld3/base.o +0 -0
- data/ext/cld3/embedding_feature_extractor.o +0 -0
- data/ext/cld3/embedding_network.cc +1 -0
- data/ext/cld3/embedding_network.o +0 -0
- data/ext/cld3/extconf.rb +3 -2
- data/ext/cld3/feature_extractor.o +0 -0
- data/ext/cld3/feature_extractor.pb.o +0 -0
- data/ext/cld3/feature_types.o +0 -0
- data/ext/cld3/fixunicodevalue.o +0 -0
- data/ext/cld3/fml_parser.o +0 -0
- data/ext/cld3/generated_entities.o +0 -0
- data/ext/cld3/generated_ulscript.o +0 -0
- data/ext/cld3/getonescriptspan.h +2 -2
- data/ext/cld3/getonescriptspan.o +0 -0
- data/ext/cld3/lang_id_nn_params.o +0 -0
- data/ext/cld3/language_identifier_features.o +0 -0
- data/ext/cld3/libcld3.def +8 -0
- data/ext/cld3/libcld3.so +0 -0
- data/ext/cld3/mkmf.log +37 -0
- data/ext/cld3/nnet_language_identifier.cc +3 -5
- data/ext/cld3/nnet_language_identifier.o +0 -0
- data/ext/cld3/nnet_language_identifier_c.cc +1 -1
- data/ext/cld3/nnet_language_identifier_c.o +0 -0
- data/ext/cld3/offsetmap.o +0 -0
- data/ext/cld3/registry.o +0 -0
- data/ext/cld3/relevant_script_feature.o +0 -0
- data/ext/cld3/sentence.pb.o +0 -0
- data/ext/cld3/sentence_features.cc +4 -4
- data/ext/cld3/sentence_features.h +13 -3
- data/ext/cld3/sentence_features.o +0 -0
- data/ext/cld3/task_context.o +0 -0
- data/ext/cld3/task_context_params.o +0 -0
- data/ext/cld3/task_spec.pb.o +0 -0
- data/ext/cld3/text_processing.o +0 -0
- data/ext/cld3/unicodetext.o +0 -0
- data/ext/cld3/utf8statetable.o +0 -0
- data/ext/cld3/utils.o +0 -0
- data/ext/cld3/workspace.o +0 -0
- data/lib/a.rb +24 -0
- data/lib/cld3/unstable.rb +58 -0
- data/lib/cld3.rb +13 -44
- data/sig/cld3.rbs +65 -0
- metadata +88 -13
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f40e4947fea97543686caceba0082bdba30b5ae0485a25b41004ad048057b0ad
|
4
|
+
data.tar.gz: e45c60300550caf513fdde6bcbc05e68e1063bf9ad8074626bf5f88f4a6f77bd
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 393fc138a279ee42c3de90c49bcc982e55860f74e2796d4c895d0f2f175894bcb1ec1bbe796811f896a16be9cc97943e1309cbe175bc029a510b4c51b2f700da
|
7
|
+
data.tar.gz: d16e8c87e7d12cc90cc1a4babb4873df8f553d9527e1d69a548a250ae0b240f79a6338070bbc88cbb0e23db48c23ef0393cd4b62e0ac673722ace81ce1564895
|
data/Gemfile
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
# Copyright 2017 Akihiko Odaki <akihiko.odaki
|
1
|
+
# Copyright 2017 Akihiko Odaki <akihiko.odaki@gmail.com>
|
2
2
|
# All Rights Reserved.
|
3
3
|
#
|
4
4
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
@@ -15,4 +15,5 @@
|
|
15
15
|
#==============================================================================
|
16
16
|
|
17
17
|
source 'https://rubygems.org'
|
18
|
+
gem 'steep', github: 'akihikodaki/steep', branch: 'cld3'
|
18
19
|
gemspec
|
data/LICENSE
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
Copyright 2017 Akihiko Odaki <akihiko.odaki
|
1
|
+
Copyright 2017 Akihiko Odaki <akihiko.odaki@gmail.com>
|
2
2
|
All rights reserved.
|
3
3
|
|
4
4
|
Apache License
|
@@ -189,7 +189,7 @@ All rights reserved.
|
|
189
189
|
same "printed page" as the copyright notice for easier
|
190
190
|
identification within third-party archives.
|
191
191
|
|
192
|
-
Copyright 2017, Akihiko Odaki <akihiko.odaki
|
192
|
+
Copyright 2017, Akihiko Odaki <akihiko.odaki@gmail.com>
|
193
193
|
|
194
194
|
Licensed under the Apache License, Version 2.0 (the "License");
|
195
195
|
you may not use this file except in compliance with the License.
|
data/README.md
CHANGED
@@ -41,24 +41,6 @@ JRuby has a bug which prevents the feature detection. Apply the following
|
|
41
41
|
change:
|
42
42
|
https://github.com/jruby/jruby/pull/4118/commits/edad375ef4dcf195b19ce0afe4befac66468c736
|
43
43
|
|
44
|
-
#### OpenBSD
|
45
|
-
Ruby has a bug which recognizes non-fatal linker warnings as fatal. Apply the
|
46
|
-
following patch to Ruby to workaround the bug.
|
47
|
-
|
48
|
-
```diff
|
49
|
-
--- a/lib/mkmf.rb
|
50
|
-
+++ b/lib/mkmf.rb
|
51
|
-
@@ -657,7 +657,7 @@ def with_ldflags(flags)
|
52
|
-
end
|
53
|
-
|
54
|
-
def try_ldflags(flags, opts = {})
|
55
|
-
- try_link(MAIN_DOES_NOTHING, flags, {:werror => true}.update(opts))
|
56
|
-
+ try_link(MAIN_DOES_NOTHING, flags, {:werror => false}.update(opts))
|
57
|
-
end
|
58
|
-
|
59
|
-
def append_ldflags(flags, *opts)
|
60
|
-
```
|
61
|
-
|
62
44
|
### Troubleshooting
|
63
45
|
`gem install cld3` triggers native library building. If it fails, you are likely
|
64
46
|
to missing required facilities. Make sure C++ compiler and protocol buffers
|
data/cld3.gemspec
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
# Copyright 2017 Akihiko Odaki <akihiko.odaki
|
1
|
+
# Copyright 2017 Akihiko Odaki <akihiko.odaki@gmail.com>
|
2
2
|
# All Rights Reserved.
|
3
3
|
#
|
4
4
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
@@ -16,19 +16,21 @@
|
|
16
16
|
|
17
17
|
Gem::Specification.new do |gem|
|
18
18
|
gem.name = "cld3"
|
19
|
-
gem.version = "3.
|
19
|
+
gem.version = "3.4.4"
|
20
20
|
gem.summary = "Compact Language Detector v3 (CLD3)"
|
21
21
|
gem.description = "Compact Language Detector v3 (CLD3) is a neural network model for language identification."
|
22
22
|
gem.license = "Apache-2.0"
|
23
23
|
gem.homepage = "https://github.com/akihikodaki/cld3-ruby"
|
24
24
|
gem.author = "Akihiko Odaki"
|
25
|
-
gem.email = "akihiko.odaki
|
26
|
-
gem.required_ruby_version = [ ">= 2.
|
27
|
-
gem.add_dependency "ffi", [ ">= 1.1.0", "< 1.
|
28
|
-
gem.add_development_dependency "
|
25
|
+
gem.email = "akihiko.odaki@gmail.com"
|
26
|
+
gem.required_ruby_version = [ ">= 2.6.0", "< 3.2.0" ]
|
27
|
+
gem.add_dependency "ffi", [ ">= 1.1.0", "< 1.16.0" ]
|
28
|
+
gem.add_development_dependency "rbs", [ ">= 1.7.0", "< 1.8.0" ]
|
29
|
+
gem.add_development_dependency "rspec", [ ">=3.0.0", "< 3.11.0" ]
|
30
|
+
gem.add_development_dependency "steep", [ ">= 0.47.0", "< 0.48.0" ]
|
29
31
|
gem.files = Dir[
|
30
32
|
"Gemfile", "LICENSE", "LICENSE_CLD3", "README.md",
|
31
|
-
"cld3.gemspec", "ext/**/*", "lib/**/*"
|
33
|
+
"cld3.gemspec", "ext/**/*", "lib/**/*", "sig/**/*"
|
32
34
|
]
|
33
35
|
gem.require_paths = [ "lib" ]
|
34
36
|
gem.extensions = [ "ext/cld3/extconf.rb" ]
|
data/ext/cld3/Makefile
ADDED
@@ -0,0 +1,267 @@
|
|
1
|
+
|
2
|
+
SHELL = /bin/sh
|
3
|
+
|
4
|
+
# V=0 quiet, V=1 verbose. other values don't work.
|
5
|
+
V = 1
|
6
|
+
Q1 = $(V:1=)
|
7
|
+
Q = $(Q1:0=@)
|
8
|
+
ECHO1 = $(V:1=@ :)
|
9
|
+
ECHO = $(ECHO1:0=@ echo)
|
10
|
+
NULLCMD = :
|
11
|
+
|
12
|
+
#### Start of system configuration section. ####
|
13
|
+
|
14
|
+
srcdir = .
|
15
|
+
topdir = /usr/include
|
16
|
+
hdrdir = $(topdir)
|
17
|
+
arch_hdrdir = /usr/include
|
18
|
+
PATH_SEPARATOR = :
|
19
|
+
VPATH = $(srcdir):$(arch_hdrdir)/ruby:$(hdrdir)/ruby
|
20
|
+
prefix = $(DESTDIR)/usr
|
21
|
+
rubysitearchprefix = $(sitearchlibdir)/$(RUBY_BASE_NAME)
|
22
|
+
rubyarchprefix = $(DESTDIR)/usr/lib64/ruby
|
23
|
+
rubylibprefix = $(exec_prefix)/share/ruby
|
24
|
+
exec_prefix = $(DESTDIR)/usr
|
25
|
+
vendorarchhdrdir = $(vendorhdrdir)/$(arch)
|
26
|
+
sitearchhdrdir = $(sitehdrdir)/$(arch)
|
27
|
+
rubyarchhdrdir = $(DESTDIR)/usr/include
|
28
|
+
vendorhdrdir = $(rubyhdrdir)/vendor_ruby
|
29
|
+
sitehdrdir = $(rubyhdrdir)/site_ruby
|
30
|
+
rubyhdrdir = $(DESTDIR)/usr/include
|
31
|
+
rubygemsdir = $(DESTDIR)/usr/share/rubygems
|
32
|
+
vendorarchdir = $(DESTDIR)/usr/lib64/ruby/vendor_ruby
|
33
|
+
vendorlibdir = $(vendordir)
|
34
|
+
vendordir = $(DESTDIR)/usr/share/ruby/vendor_ruby
|
35
|
+
sitearchdir = $(DESTDIR)/usr/local/lib64/ruby/site_ruby
|
36
|
+
sitelibdir = $(sitedir)
|
37
|
+
sitedir = $(DESTDIR)/usr/local/share/ruby/site_ruby
|
38
|
+
rubyarchdir = $(rubyarchprefix)
|
39
|
+
rubylibdir = $(rubylibprefix)
|
40
|
+
sitearchincludedir = $(includedir)/$(sitearch)
|
41
|
+
archincludedir = $(includedir)/$(arch)
|
42
|
+
sitearchlibdir = $(libdir)/$(sitearch)
|
43
|
+
archlibdir = $(DESTDIR)/usr/lib64
|
44
|
+
ridir = $(datarootdir)/$(RI_BASE_NAME)
|
45
|
+
mandir = $(DESTDIR)/usr/share/man
|
46
|
+
localedir = $(datarootdir)/locale
|
47
|
+
libdir = $(exec_prefix)/lib64
|
48
|
+
psdir = $(docdir)
|
49
|
+
pdfdir = $(docdir)
|
50
|
+
dvidir = $(docdir)
|
51
|
+
htmldir = $(docdir)
|
52
|
+
infodir = $(DESTDIR)/usr/share/info
|
53
|
+
docdir = $(datarootdir)/doc/$(PACKAGE)
|
54
|
+
oldincludedir = $(DESTDIR)/usr/include
|
55
|
+
includedir = $(DESTDIR)/usr/include
|
56
|
+
runstatedir = $(localstatedir)/run
|
57
|
+
localstatedir = $(DESTDIR)/var
|
58
|
+
sharedstatedir = $(DESTDIR)/var/lib
|
59
|
+
sysconfdir = $(DESTDIR)/etc
|
60
|
+
datadir = $(DESTDIR)/usr/share
|
61
|
+
datarootdir = $(prefix)/share
|
62
|
+
libexecdir = $(DESTDIR)/usr/libexec
|
63
|
+
sbindir = $(DESTDIR)/usr/sbin
|
64
|
+
bindir = $(exec_prefix)/bin
|
65
|
+
archdir = $(rubyarchdir)
|
66
|
+
|
67
|
+
|
68
|
+
CC_WRAPPER =
|
69
|
+
CC = gcc
|
70
|
+
CXX = g++
|
71
|
+
LIBRUBY = $(LIBRUBY_SO)
|
72
|
+
LIBRUBY_A = lib$(RUBY_SO_NAME)-static.a
|
73
|
+
LIBRUBYARG_SHARED = -l$(RUBY_SO_NAME)
|
74
|
+
LIBRUBYARG_STATIC = -l$(RUBY_SO_NAME)-static $(MAINLIBS)
|
75
|
+
empty =
|
76
|
+
OUTFLAG = -o $(empty)
|
77
|
+
COUTFLAG = -o $(empty)
|
78
|
+
CSRCFLAG = $(empty)
|
79
|
+
|
80
|
+
RUBY_EXTCONF_H =
|
81
|
+
cflags = $(optflags) $(debugflags) $(warnflags)
|
82
|
+
cxxflags =
|
83
|
+
optflags = -O3
|
84
|
+
debugflags = -ggdb3
|
85
|
+
warnflags = -Wall -Wextra -Wdeprecated-declarations -Wduplicated-cond -Wimplicit-function-declaration -Wimplicit-int -Wmisleading-indentation -Wpointer-arith -Wwrite-strings -Wimplicit-fallthrough=0 -Wmissing-noreturn -Wno-cast-function-type -Wno-constant-logical-operand -Wno-long-long -Wno-missing-field-initializers -Wno-overlength-strings -Wno-packed-bitfield-compat -Wno-parentheses-equality -Wno-self-assign -Wno-tautological-compare -Wno-unused-parameter -Wno-unused-value -Wsuggest-attribute=format -Wsuggest-attribute=noreturn -Wunused-variable
|
86
|
+
cppflags =
|
87
|
+
CCDLFLAGS = -fPIC
|
88
|
+
CFLAGS = $(CCDLFLAGS) -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/redhat/redhat-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/redhat/redhat-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -fPIC $(ARCH_FLAG)
|
89
|
+
INCFLAGS = -I. -I$(arch_hdrdir) -I$(hdrdir)/ruby/backward -I$(hdrdir) -I$(srcdir)
|
90
|
+
DEFS =
|
91
|
+
CPPFLAGS = $(DEFS) $(cppflags)
|
92
|
+
CXXFLAGS = $(CCDLFLAGS) -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/redhat/redhat-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/redhat/redhat-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -fvisibility=hidden -std=c++11 $(ARCH_FLAG)
|
93
|
+
ldflags = -L. -Wl,-z,relro -Wl,--as-needed -Wl,-z,now -specs=/usr/lib/rpm/redhat/redhat-hardened-ld -fstack-protector-strong -rdynamic -Wl,-export-dynamic
|
94
|
+
dldflags = -Wl,-z,relro -Wl,--as-needed -Wl,-z,now -specs=/usr/lib/rpm/redhat/redhat-hardened-ld
|
95
|
+
ARCH_FLAG =
|
96
|
+
DLDFLAGS = $(ldflags) $(dldflags) $(ARCH_FLAG)
|
97
|
+
LDSHARED = $(CC) -shared
|
98
|
+
LDSHAREDXX = $(CXX) -shared
|
99
|
+
AR = gcc-ar
|
100
|
+
EXEEXT =
|
101
|
+
|
102
|
+
RUBY_INSTALL_NAME = $(RUBY_BASE_NAME)
|
103
|
+
RUBY_SO_NAME = ruby
|
104
|
+
RUBYW_INSTALL_NAME =
|
105
|
+
RUBY_VERSION_NAME = $(RUBY_BASE_NAME)-$(ruby_version_dir_name)
|
106
|
+
RUBYW_BASE_NAME = rubyw
|
107
|
+
RUBY_BASE_NAME = ruby
|
108
|
+
|
109
|
+
arch = aarch64-linux
|
110
|
+
sitearch = $(arch)
|
111
|
+
ruby_version = 3.0.0
|
112
|
+
ruby = $(bindir)/$(RUBY_BASE_NAME)
|
113
|
+
RUBY = $(ruby)
|
114
|
+
ruby_headers = $(hdrdir)/ruby.h $(hdrdir)/ruby/backward.h $(hdrdir)/ruby/ruby.h $(hdrdir)/ruby/defines.h $(hdrdir)/ruby/missing.h $(hdrdir)/ruby/intern.h $(hdrdir)/ruby/st.h $(hdrdir)/ruby/subst.h $(arch_hdrdir)/ruby/config.h
|
115
|
+
|
116
|
+
RM = rm -f
|
117
|
+
RM_RF = $(RUBY) -run -e rm -- -rf
|
118
|
+
RMDIRS = rmdir --ignore-fail-on-non-empty -p
|
119
|
+
MAKEDIRS = /usr/bin/mkdir -p
|
120
|
+
INSTALL = /usr/bin/install -c
|
121
|
+
INSTALL_PROG = $(INSTALL) -m 0755
|
122
|
+
INSTALL_DATA = $(INSTALL) -m 644
|
123
|
+
COPY = cp
|
124
|
+
TOUCH = exit >
|
125
|
+
|
126
|
+
#### End of system configuration section. ####
|
127
|
+
|
128
|
+
preload =
|
129
|
+
libpath = . $(archlibdir)
|
130
|
+
LIBPATH = -L. -L$(archlibdir)
|
131
|
+
DEFFILE =
|
132
|
+
|
133
|
+
CLEANFILES = mkmf.log
|
134
|
+
DISTCLEANFILES =
|
135
|
+
DISTCLEANDIRS =
|
136
|
+
|
137
|
+
extout =
|
138
|
+
extout_prefix =
|
139
|
+
target_prefix =
|
140
|
+
LOCAL_LIBS =
|
141
|
+
LIBS = -lprotobuf -lpthread -lm -lc
|
142
|
+
ORIG_SRCS = base.cc embedding_feature_extractor.cc embedding_network.cc feature_extractor.cc feature_extractor.pb.cc feature_types.cc fixunicodevalue.cc fml_parser.cc generated_entities.cc generated_ulscript.cc getonescriptspan.cc lang_id_nn_params.cc language_identifier_features.cc nnet_language_identifier.cc nnet_language_identifier_c.cc offsetmap.cc registry.cc relevant_script_feature.cc sentence.pb.cc sentence_features.cc task_context.cc task_context_params.cc task_spec.pb.cc text_processing.cc unicodetext.cc utf8statetable.cc utils.cc workspace.cc
|
143
|
+
SRCS = $(ORIG_SRCS)
|
144
|
+
OBJS = base.o embedding_feature_extractor.o embedding_network.o feature_extractor.o feature_extractor.pb.o feature_types.o fixunicodevalue.o fml_parser.o generated_entities.o generated_ulscript.o getonescriptspan.o lang_id_nn_params.o language_identifier_features.o nnet_language_identifier.o nnet_language_identifier_c.o offsetmap.o registry.o relevant_script_feature.o sentence.pb.o sentence_features.o task_context.o task_context_params.o task_spec.pb.o text_processing.o unicodetext.o utf8statetable.o utils.o workspace.o
|
145
|
+
HDRS = $(srcdir)/base.h $(srcdir)/casts.h $(srcdir)/embedding_feature_extractor.h $(srcdir)/embedding_network.h $(srcdir)/embedding_network_params.h $(srcdir)/feature_extractor.h $(srcdir)/feature_extractor.pb.h $(srcdir)/feature_types.h $(srcdir)/fixunicodevalue.h $(srcdir)/float16.h $(srcdir)/fml_parser.h $(srcdir)/generated_ulscript.h $(srcdir)/getonescriptspan.h $(srcdir)/integral_types.h $(srcdir)/lang_id_nn_params.h $(srcdir)/language_identifier_features.h $(srcdir)/nnet_language_identifier.h $(srcdir)/offsetmap.h $(srcdir)/port.h $(srcdir)/registry.h $(srcdir)/relevant_script_feature.h $(srcdir)/script_detector.h $(srcdir)/sentence.pb.h $(srcdir)/sentence_features.h $(srcdir)/simple_adder.h $(srcdir)/stringpiece.h $(srcdir)/task_context.h $(srcdir)/task_context_params.h $(srcdir)/task_spec.pb.h $(srcdir)/text_processing.h $(srcdir)/unicodetext.h $(srcdir)/utf8acceptinterchange.h $(srcdir)/utf8prop_lettermarkscriptnum.h $(srcdir)/utf8repl_lettermarklower.h $(srcdir)/utf8scannot_lettermarkspecial.h $(srcdir)/utf8statetable.h $(srcdir)/utils.h $(srcdir)/workspace.h
|
146
|
+
LOCAL_HDRS =
|
147
|
+
TARGET = libcld3
|
148
|
+
TARGET_NAME = libcld3
|
149
|
+
TARGET_ENTRY = Init_$(TARGET_NAME)
|
150
|
+
DLLIB = $(TARGET).so
|
151
|
+
EXTSTATIC =
|
152
|
+
STATIC_LIB =
|
153
|
+
|
154
|
+
TIMESTAMP_DIR = .
|
155
|
+
BINDIR = $(bindir)
|
156
|
+
RUBYCOMMONDIR = $(sitedir)$(target_prefix)
|
157
|
+
RUBYLIBDIR = $(sitelibdir)$(target_prefix)
|
158
|
+
RUBYARCHDIR = $(sitearchdir)$(target_prefix)
|
159
|
+
HDRDIR = $(sitehdrdir)$(target_prefix)
|
160
|
+
ARCHHDRDIR = $(sitearchhdrdir)$(target_prefix)
|
161
|
+
TARGET_SO_DIR =
|
162
|
+
TARGET_SO = $(TARGET_SO_DIR)$(DLLIB)
|
163
|
+
CLEANLIBS = $(TARGET_SO)
|
164
|
+
CLEANOBJS = *.o *.bak
|
165
|
+
|
166
|
+
all: $(DLLIB)
|
167
|
+
static: $(STATIC_LIB)
|
168
|
+
.PHONY: all install static install-so install-rb
|
169
|
+
.PHONY: clean clean-so clean-static clean-rb
|
170
|
+
|
171
|
+
clean-static::
|
172
|
+
clean-rb-default::
|
173
|
+
clean-rb::
|
174
|
+
clean-so::
|
175
|
+
clean: clean-so clean-static clean-rb-default clean-rb
|
176
|
+
-$(Q)$(RM) $(CLEANLIBS) $(CLEANOBJS) $(CLEANFILES) .*.time
|
177
|
+
|
178
|
+
distclean-rb-default::
|
179
|
+
distclean-rb::
|
180
|
+
distclean-so::
|
181
|
+
distclean-static::
|
182
|
+
distclean: clean distclean-so distclean-static distclean-rb-default distclean-rb
|
183
|
+
-$(Q)$(RM) Makefile $(RUBY_EXTCONF_H) conftest.* mkmf.log
|
184
|
+
-$(Q)$(RM) core ruby$(EXEEXT) *~ $(DISTCLEANFILES)
|
185
|
+
-$(Q)$(RMDIRS) $(DISTCLEANDIRS) 2> /dev/null || true
|
186
|
+
|
187
|
+
realclean: distclean
|
188
|
+
install: install-so install-rb
|
189
|
+
|
190
|
+
install-so: $(DLLIB) $(TIMESTAMP_DIR)/.sitearchdir.time
|
191
|
+
$(INSTALL_PROG) $(DLLIB) $(RUBYARCHDIR)
|
192
|
+
clean-static::
|
193
|
+
-$(Q)$(RM) $(STATIC_LIB)
|
194
|
+
install-rb: pre-install-rb do-install-rb install-rb-default
|
195
|
+
install-rb-default: pre-install-rb-default do-install-rb-default
|
196
|
+
pre-install-rb: Makefile
|
197
|
+
pre-install-rb-default: Makefile
|
198
|
+
do-install-rb:
|
199
|
+
do-install-rb-default:
|
200
|
+
pre-install-rb-default:
|
201
|
+
@$(NULLCMD)
|
202
|
+
$(TIMESTAMP_DIR)/.sitearchdir.time:
|
203
|
+
$(Q) $(MAKEDIRS) $(@D) $(RUBYARCHDIR)
|
204
|
+
$(Q) $(TOUCH) $@
|
205
|
+
|
206
|
+
site-install: site-install-so site-install-rb
|
207
|
+
site-install-so: install-so
|
208
|
+
site-install-rb: install-rb
|
209
|
+
|
210
|
+
.SUFFIXES: .c .m .cc .mm .cxx .cpp .o .S
|
211
|
+
|
212
|
+
.cc.o:
|
213
|
+
$(ECHO) compiling $(<)
|
214
|
+
$(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -c $(CSRCFLAG)$<
|
215
|
+
|
216
|
+
.cc.S:
|
217
|
+
$(ECHO) translating $(<)
|
218
|
+
$(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -S $(CSRCFLAG)$<
|
219
|
+
|
220
|
+
.mm.o:
|
221
|
+
$(ECHO) compiling $(<)
|
222
|
+
$(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -c $(CSRCFLAG)$<
|
223
|
+
|
224
|
+
.mm.S:
|
225
|
+
$(ECHO) translating $(<)
|
226
|
+
$(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -S $(CSRCFLAG)$<
|
227
|
+
|
228
|
+
.cxx.o:
|
229
|
+
$(ECHO) compiling $(<)
|
230
|
+
$(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -c $(CSRCFLAG)$<
|
231
|
+
|
232
|
+
.cxx.S:
|
233
|
+
$(ECHO) translating $(<)
|
234
|
+
$(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -S $(CSRCFLAG)$<
|
235
|
+
|
236
|
+
.cpp.o:
|
237
|
+
$(ECHO) compiling $(<)
|
238
|
+
$(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -c $(CSRCFLAG)$<
|
239
|
+
|
240
|
+
.cpp.S:
|
241
|
+
$(ECHO) translating $(<)
|
242
|
+
$(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -S $(CSRCFLAG)$<
|
243
|
+
|
244
|
+
.c.o:
|
245
|
+
$(ECHO) compiling $(<)
|
246
|
+
$(Q) $(CC) $(INCFLAGS) $(CPPFLAGS) $(CFLAGS) $(COUTFLAG)$@ -c $(CSRCFLAG)$<
|
247
|
+
|
248
|
+
.c.S:
|
249
|
+
$(ECHO) translating $(<)
|
250
|
+
$(Q) $(CC) $(INCFLAGS) $(CPPFLAGS) $(CFLAGS) $(COUTFLAG)$@ -S $(CSRCFLAG)$<
|
251
|
+
|
252
|
+
.m.o:
|
253
|
+
$(ECHO) compiling $(<)
|
254
|
+
$(Q) $(CC) $(INCFLAGS) $(CPPFLAGS) $(CFLAGS) $(COUTFLAG)$@ -c $(CSRCFLAG)$<
|
255
|
+
|
256
|
+
.m.S:
|
257
|
+
$(ECHO) translating $(<)
|
258
|
+
$(Q) $(CC) $(INCFLAGS) $(CPPFLAGS) $(CFLAGS) $(COUTFLAG)$@ -S $(CSRCFLAG)$<
|
259
|
+
|
260
|
+
$(TARGET_SO): $(OBJS) Makefile
|
261
|
+
$(ECHO) linking shared-object $(DLLIB)
|
262
|
+
-$(Q)$(RM) $(@)
|
263
|
+
$(Q) $(LDSHAREDXX) -o $@ $(OBJS) $(LIBPATH) $(DLDFLAGS) $(LOCAL_LIBS) $(LIBS)
|
264
|
+
|
265
|
+
|
266
|
+
|
267
|
+
$(OBJS): $(HDRS) $(ruby_headers)
|
data/ext/cld3/base.o
ADDED
Binary file
|
Binary file
|
@@ -167,6 +167,7 @@ EmbeddingNetwork::EmbeddingNetwork(const EmbeddingNetworkParams *model)
|
|
167
167
|
for (int i = 0; i < model_->embedding_dim_size(); ++i) {
|
168
168
|
CLD3_DCHECK(offset_sum == model_->concat_offset(i));
|
169
169
|
offset_sum += model_->embedding_dim(i) * model_->embedding_num_features(i);
|
170
|
+
(void)offset_sum; // Avoid compiler warning for "unused" variable.
|
170
171
|
embedding_matrices_.emplace_back(model_->GetEmbeddingMatrix(i));
|
171
172
|
}
|
172
173
|
|
Binary file
|
data/ext/cld3/extconf.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
# Copyright 2017 Akihiko Odaki <akihiko.odaki
|
1
|
+
# Copyright 2017 Akihiko Odaki <akihiko.odaki@gmail.com>
|
2
2
|
# All Rights Reserved.
|
3
3
|
#
|
4
4
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
@@ -33,7 +33,7 @@ FileUtils.mkdir_p("cld_3/protos")
|
|
33
33
|
FileUtils.mkdir_p("script_span")
|
34
34
|
|
35
35
|
[ "feature_extractor", "sentence", "task_spec" ].each {|name|
|
36
|
-
|
36
|
+
system "protoc", "#{name}.proto", "--cpp_out=.", exception: true
|
37
37
|
ln_fallback("#{name}.pb.h", "cld_3/protos/#{name}.pb.h")
|
38
38
|
}
|
39
39
|
|
@@ -56,4 +56,5 @@ FileUtils.mkdir_p("script_span")
|
|
56
56
|
}
|
57
57
|
|
58
58
|
$CXXFLAGS += " -fvisibility=hidden -std=c++11"
|
59
|
+
$LIBRUBYARG = ""
|
59
60
|
create_makefile("libcld3")
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
data/ext/cld3/getonescriptspan.h
CHANGED
@@ -33,14 +33,14 @@ static const int kMaxScriptBytes = kMaxScriptBuffer - 32; // Leave some room
|
|
33
33
|
static const int kWithinScriptTail = 32; // Stop at word space in last
|
34
34
|
// N bytes of script buffer
|
35
35
|
|
36
|
-
|
36
|
+
struct LangSpan {
|
37
37
|
char* text = nullptr; // Pointer to the span, somewhere
|
38
38
|
int text_bytes = 0; // Number of bytes of text in the span
|
39
39
|
int offset = 0; // Offset of start of span in original input buffer
|
40
40
|
ULScript ulscript = UNKNOWN_ULSCRIPT; // Unicode Letters Script of this span
|
41
41
|
bool truncated = false; // true if buffer filled up before a
|
42
42
|
// different script or EOF was found
|
43
|
-
}
|
43
|
+
};
|
44
44
|
|
45
45
|
static inline bool IsContinuationByte(char c) {
|
46
46
|
return static_cast<signed char>(c) < -64;
|
Binary file
|
Binary file
|
Binary file
|
data/ext/cld3/libcld3.so
ADDED
Binary file
|
data/ext/cld3/mkmf.log
ADDED
@@ -0,0 +1,37 @@
|
|
1
|
+
"pkg-config --exists protobuf"
|
2
|
+
| pkg-config --libs protobuf
|
3
|
+
=> "-lprotobuf -lpthread \n"
|
4
|
+
"gcc -o conftest -I/usr/include -I/usr/include/ruby/backward -I/usr/include -I. -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/redhat/redhat-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/redhat/redhat-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -fPIC conftest.c -L. -L/usr/lib64 -L. -Wl,-z,relro -Wl,--as-needed -Wl,-z,now -specs=/usr/lib/rpm/redhat/redhat-hardened-ld -fstack-protector-strong -rdynamic -Wl,-export-dynamic -lruby -lm -lc"
|
5
|
+
checked program was:
|
6
|
+
/* begin */
|
7
|
+
1: #include "ruby.h"
|
8
|
+
2:
|
9
|
+
3: int main(int argc, char **argv)
|
10
|
+
4: {
|
11
|
+
5: return !!argv[argc];
|
12
|
+
6: }
|
13
|
+
/* end */
|
14
|
+
|
15
|
+
"gcc -o conftest -I/usr/include -I/usr/include/ruby/backward -I/usr/include -I. -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/redhat/redhat-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/redhat/redhat-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -fPIC conftest.c -L. -L/usr/lib64 -L. -Wl,-z,relro -Wl,--as-needed -Wl,-z,now -specs=/usr/lib/rpm/redhat/redhat-hardened-ld -fstack-protector-strong -rdynamic -Wl,-export-dynamic -lruby -lprotobuf -lpthread -lm -lc"
|
16
|
+
checked program was:
|
17
|
+
/* begin */
|
18
|
+
1: #include "ruby.h"
|
19
|
+
2:
|
20
|
+
3: int main(int argc, char **argv)
|
21
|
+
4: {
|
22
|
+
5: return !!argv[argc];
|
23
|
+
6: }
|
24
|
+
/* end */
|
25
|
+
|
26
|
+
| pkg-config --cflags-only-I protobuf
|
27
|
+
=> "\n"
|
28
|
+
| pkg-config --cflags-only-other protobuf
|
29
|
+
=> "\n"
|
30
|
+
| pkg-config --libs-only-l protobuf
|
31
|
+
=> "-lprotobuf -lpthread \n"
|
32
|
+
package configuration for protobuf
|
33
|
+
incflags:
|
34
|
+
cflags:
|
35
|
+
ldflags:
|
36
|
+
libs: -lprotobuf -lpthread
|
37
|
+
|
@@ -284,8 +284,6 @@ NNetLanguageIdentifier::FindTopNMostFreqLangs(const string &text,
|
|
284
284
|
CLD2::LangSpan script_span;
|
285
285
|
std::unordered_map<string, LangChunksStats> lang_stats;
|
286
286
|
int total_num_bytes = 0;
|
287
|
-
Result result;
|
288
|
-
string language;
|
289
287
|
int chunk_size = 0; // Use the default.
|
290
288
|
while (ss.GetOneScriptSpanLower(&script_span)) {
|
291
289
|
const int num_original_span_bytes = script_span.text_bytes;
|
@@ -302,8 +300,8 @@ NNetLanguageIdentifier::FindTopNMostFreqLangs(const string &text,
|
|
302
300
|
|
303
301
|
const string selected_text = SelectTextGivenScriptSpan(script_span);
|
304
302
|
|
305
|
-
result = FindLanguageOfValidUTF8(selected_text);
|
306
|
-
language = result.language;
|
303
|
+
Result result = FindLanguageOfValidUTF8(selected_text);
|
304
|
+
string language = result.language;
|
307
305
|
lang_stats[language].byte_sum += num_original_span_bytes;
|
308
306
|
lang_stats[language].prob_sum +=
|
309
307
|
result.probability * num_original_span_bytes;
|
@@ -356,7 +354,7 @@ string NNetLanguageIdentifier::SelectTextGivenBeginAndSize(
|
|
356
354
|
const char *text_begin, int text_size) {
|
357
355
|
string output_text;
|
358
356
|
|
359
|
-
// If the size of the input is greater than the
|
357
|
+
// If the size of the input is greater than the maximum number of bytes needed
|
360
358
|
// for a prediction, then concatenate snippets that are equally spread out
|
361
359
|
// throughout the input.
|
362
360
|
if (text_size > max_num_bytes_) {
|
Binary file
|
Binary file
|
Binary file
|
data/ext/cld3/registry.o
ADDED
Binary file
|
Binary file
|
Binary file
|
@@ -19,11 +19,11 @@ limitations under the License.
|
|
19
19
|
|
20
20
|
namespace chrome_lang_id {
|
21
21
|
|
22
|
-
//
|
22
|
+
// Define registry for the whole Sentence feature functions. NOTE: this is not
|
23
23
|
// yet set to anything meaningful. It will be set so in NNetLanguageIdentifier
|
24
24
|
// constructor, *before* we use any feature.
|
25
25
|
template <>
|
26
|
-
WholeSentenceFeature::Registry
|
27
|
-
|
26
|
+
WholeSentenceFeature::Registry*
|
27
|
+
RegisterableClass<WholeSentenceFeature>::registry_ = nullptr;
|
28
28
|
|
29
|
-
} // namespace chrome_lang_id
|
29
|
+
} // namespace chrome_lang_id
|
@@ -26,9 +26,19 @@ limitations under the License.
|
|
26
26
|
namespace chrome_lang_id {
|
27
27
|
|
28
28
|
// Feature function that extracts features for the full Sentence.
|
29
|
-
|
30
|
-
|
31
|
-
|
29
|
+
using WholeSentenceFeature = FeatureFunction<Sentence>;
|
30
|
+
|
31
|
+
using WholeSentenceExtractor = FeatureExtractor<Sentence>;
|
32
|
+
|
33
|
+
// Declare registry for the whole Sentence feature functions. This is required
|
34
|
+
// for clang's -Wundefined-var-template. However, MSVC has a bug which treats
|
35
|
+
// this declaration as a definition, leading to multiple definition errors, so
|
36
|
+
// omit this on MSVC.
|
37
|
+
#if !defined(COMPILER_MSVC)
|
38
|
+
template <>
|
39
|
+
WholeSentenceFeature::Registry
|
40
|
+
*RegisterableClass<WholeSentenceFeature>::registry_;
|
41
|
+
#endif
|
32
42
|
|
33
43
|
} // namespace chrome_lang_id
|
34
44
|
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
data/ext/cld3/utils.o
ADDED
Binary file
|
Binary file
|
data/lib/a.rb
ADDED
@@ -0,0 +1,24 @@
|
|
1
|
+
require "cld3"
|
2
|
+
|
3
|
+
# Kafka text as an example + the word Velcro
|
4
|
+
text = "Πολυαγαπημένε πατέρα πρόσφατα Velcro με ρώτησες κάποια φορά γιατί ισχυρίζομαι πως σε φοβάμαι. Εγώ δεν ήξερα, ως συνήθως, τι να σου απαντήσω, εν μέρει ακριβώς λόγω του φόβου που νιώθω για σένα, εν μέρει επειδή στην αιτιολόγηση του φόβου αυτού συγκαταλέγονται πάρα πολλές λεπτομέρειες, που εν τη ρύμη του λόγου εγώ ούτε κατά το ήμισυ δεν θα μπορούσα να τις συγκρατήσω. Κι αν εδώ προσπαθώ να σου απαντήσω γραπτώς, μόνο ανολοκλήρωτο κατά πολύ θα αποβεί και τούτο, επειδή και κατά τη γραφή ο φόβος και οι συνέπειές του με κωλύουν έναντί σου κι επειδή το μέγεθος του υλικού εν γένει υπερβαίνει κατά πολύ τη μνήμη μου και το λογικό μου. Για σένα το ζήτημα αποδεικνυόταν πάντοτε πολύ απλό, τουλάχιστον στον βαθμό που μιλούσες εσύ γι’ αυτό ενώπιόν μου και, αδιακρίτως, ενώπιον πολλών άλλων. Εσένα σου φαινόταν να είναι κάπως έτσι: Εσύ εργαζόσουν σκληρά σ’ όλη σου τη ζωή, τα πάντα για τα παιδιά σου, προ πάντων για εμένα τα θυσίαζες, εγώ έκαμνα συνεπώς «ζωή χαρισάμενη», είχα πλήρη ελευθερία να μάθω ό,τι ήθελα, κανέναν λόγο δεν είχα να έχω έγνοιες για την καθημερινή διατροφή, να έχω έγνοιες συνεπώς εν γένει• εσύ αντ’ αυτών καμμίαν ευγνωμοσύνη δεν αξίωνες, γνωρίζεις «την ευγνωμοσύνη των παιδιών, αλλά εν τούτοις τουλάχιστον μια "
|
5
|
+
pp text.bytesize
|
6
|
+
|
7
|
+
200.times { |i|
|
8
|
+
max_bytes = 500 + i * 10
|
9
|
+
cld3 = CLD3::NNetLanguageIdentifier.new("foo", max_bytes)
|
10
|
+
|
11
|
+
lang = cld3.find_language(text)
|
12
|
+
lang2 = cld3.find_top_n_most_freq_langs(text, 1)
|
13
|
+
|
14
|
+
puts "When max_bytes is #{max_bytes} probability is less than 0.999: #{lang.probability}" if lang.probability < 0.999
|
15
|
+
|
16
|
+
if lang.language != :el
|
17
|
+
puts "When max_bytes is #{max_bytes} then cld3::find_language returns #{lang.language},
|
18
|
+
find_top_n_most_freq_langs returns #{lang2.first.language}"
|
19
|
+
#pp lang
|
20
|
+
#pp lang2
|
21
|
+
end
|
22
|
+
}
|
23
|
+
|
24
|
+
puts "Size: #{text.length} - Bytesize: #{text.encode(Encoding::UTF_8).bytesize}"
|
@@ -0,0 +1,58 @@
|
|
1
|
+
|
2
|
+
# Copyright 2021 Akihiko Odaki <akihiko.odaki@gmail.com>
|
3
|
+
# All Rights Reserved.
|
4
|
+
#
|
5
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
6
|
+
# you may not use this file except in compliance with the License.
|
7
|
+
# You may obtain a copy of the License at
|
8
|
+
#
|
9
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
10
|
+
#
|
11
|
+
# Unless required by applicable law or agreed to in writing, software
|
12
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
13
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
14
|
+
# See the License for the specific language governing permissions and
|
15
|
+
# limitations under the License.
|
16
|
+
# ==============================================================================
|
17
|
+
|
18
|
+
module CLD3
|
19
|
+
module Unstable
|
20
|
+
extend FFI::Library
|
21
|
+
|
22
|
+
ffi_lib File.join(__dir__, "..", "..", "ext", "cld3", "libcld3." + RbConfig::CONFIG["DLEXT"])
|
23
|
+
|
24
|
+
module NNetLanguageIdentifier
|
25
|
+
class Pointer < FFI::AutoPointer
|
26
|
+
def self.release(pointer)
|
27
|
+
Unstable.delete_NNetLanguageIdentifier(pointer)
|
28
|
+
end
|
29
|
+
end
|
30
|
+
|
31
|
+
class SpanInfo < FFI::Struct
|
32
|
+
layout :start_index, :int, :end_index, :int, :probability, :float
|
33
|
+
end
|
34
|
+
|
35
|
+
class Result < FFI::Struct
|
36
|
+
layout :language_data, :pointer, :language_size, :size_t, :byte_ranges_data, :pointer, :byte_ranges_size, :size_t, :probability, :float, :proportion, :float, :reliable?, :bool
|
37
|
+
end
|
38
|
+
end
|
39
|
+
|
40
|
+
attach_function :delete_NNetLanguageIdentifier, [ :pointer ], :void
|
41
|
+
|
42
|
+
attach_function :delete_result, [ :pointer ], :void
|
43
|
+
|
44
|
+
attach_function :delete_results, [ :pointer ], :void
|
45
|
+
|
46
|
+
attach_function :new_NNetLanguageIdentifier, [ :int, :int ], :pointer
|
47
|
+
|
48
|
+
attach_function :refer_to_nth_result, [ :pointer, :size_t ], NNetLanguageIdentifier::Result.by_value
|
49
|
+
|
50
|
+
attach_function :NNetLanguageIdentifier_find_language,
|
51
|
+
[ :pointer, :buffer_in, :size_t ], :pointer
|
52
|
+
|
53
|
+
attach_function :NNetLanguageIdentifier_find_top_n_most_freq_langs,
|
54
|
+
[ :pointer, :buffer_in, :size_t, :int ], :pointer
|
55
|
+
end
|
56
|
+
|
57
|
+
private_constant :Unstable
|
58
|
+
end
|
data/lib/cld3.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
# File including an implementation of the CLD3 module. Some documentation is
|
2
2
|
# extracted from ext/cld3/ext/src/nnet_language_identifier.h.
|
3
3
|
#
|
4
|
-
# Copyright 2017 Akihiko Odaki <akihiko.odaki
|
4
|
+
# Copyright 2017 Akihiko Odaki <akihiko.odaki@gmail.com>
|
5
5
|
# All Rights Reserved.
|
6
6
|
#
|
7
7
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
@@ -19,6 +19,7 @@
|
|
19
19
|
|
20
20
|
require "ffi"
|
21
21
|
require "rbconfig"
|
22
|
+
require "cld3/unstable"
|
22
23
|
|
23
24
|
# Module providing an interface for Compact Language Detector v3 (CLD3)
|
24
25
|
module CLD3
|
@@ -52,6 +53,7 @@ module CLD3
|
|
52
53
|
# Holds probability that Span, specified by start/end indices, is a given
|
53
54
|
# language. The language is not stored here; it can be found in Result, which
|
54
55
|
# holds an Array of SpanInfo.
|
56
|
+
# @type const SpanInfo: untyped
|
55
57
|
SpanInfo = Struct.new(:start_index, :end_index, :probability)
|
56
58
|
|
57
59
|
# Information about a predicted language.
|
@@ -69,16 +71,18 @@ module CLD3
|
|
69
71
|
#
|
70
72
|
# [byte_ranges] Specifies the byte ranges in UTF-8 that |language| applies to.
|
71
73
|
# This is an Array of SpanInfo.
|
74
|
+
# @type const Result: untyped
|
72
75
|
Result = Struct.new(:language, :probability, :reliable?, :proportion, :byte_ranges)
|
73
76
|
|
74
|
-
# The arguments are two
|
77
|
+
# The arguments are two Numeric objects.
|
75
78
|
def initialize(min_num_bytes = MIN_NUM_BYTES_TO_CONSIDER, max_num_bytes = MAX_NUM_BYTES_TO_CONSIDER)
|
79
|
+
raise ArgumentError if max_num_bytes <= 0 || min_num_bytes < 0 || min_num_bytes >= max_num_bytes
|
76
80
|
@cc = Unstable::NNetLanguageIdentifier::Pointer.new(Unstable.new_NNetLanguageIdentifier(min_num_bytes, max_num_bytes))
|
77
81
|
end
|
78
82
|
|
79
83
|
# Finds the most likely language for the given text, along with additional
|
80
84
|
# information (e.g., probability). The prediction is based on the first N
|
81
|
-
# bytes where N is the
|
85
|
+
# bytes where N is the minimum between the number of interchange valid UTF8
|
82
86
|
# bytes and +max_num_bytes_+. If N is less than +min_num_bytes_+ long, then
|
83
87
|
# this function returns nil.
|
84
88
|
# The argument is a String object.
|
@@ -115,6 +119,8 @@ module CLD3
|
|
115
119
|
# The second argument is a Numeric object.
|
116
120
|
# The return value of this function is an Array of Result instances.
|
117
121
|
def find_top_n_most_freq_langs(text, num_langs)
|
122
|
+
# @type var a: untyped
|
123
|
+
|
118
124
|
text_utf8 = text.encode(Encoding::UTF_8)
|
119
125
|
pointer = FFI::MemoryPointer.new(:char, text_utf8.bytesize)
|
120
126
|
|
@@ -123,11 +129,13 @@ module CLD3
|
|
123
129
|
|
124
130
|
results = Unstable.NNetLanguageIdentifier_find_top_n_most_freq_langs(@cc, pointer, text_utf8.bytesize, num_langs)
|
125
131
|
begin
|
126
|
-
num_langs.times
|
132
|
+
a = num_langs.times
|
127
133
|
.lazy
|
128
134
|
.map { |index| convert_result Unstable.refer_to_nth_result(results, index) }
|
129
135
|
.take_while { |result| !result.nil? }
|
130
136
|
.to_a
|
137
|
+
|
138
|
+
a
|
131
139
|
ensure
|
132
140
|
Unstable.delete_results results
|
133
141
|
end
|
@@ -162,6 +170,7 @@ module CLD3
|
|
162
170
|
# The model weights are loaded statically.
|
163
171
|
module TaskContextParams
|
164
172
|
# This is a frozen Array object containing symbols.
|
173
|
+
# @type const LANGUAGE_NAMES: untyped
|
165
174
|
LANGUAGE_NAMES = [
|
166
175
|
:eo, :co, :eu, :ta, :de, :mt, :ps, :te, :su, :uz, :'zh-Latn', :ne,
|
167
176
|
:nl, :sw, :sq, :hmn, :ja, :no, :mn, :so, :ko, :kk, :sl, :ig,
|
@@ -175,44 +184,4 @@ module CLD3
|
|
175
184
|
:sn, :yo, :pa, :ku,
|
176
185
|
].freeze
|
177
186
|
end
|
178
|
-
|
179
|
-
module Unstable
|
180
|
-
extend FFI::Library
|
181
|
-
|
182
|
-
ffi_lib File.join(File.expand_path(File.dirname(__FILE__)), "..", "ext", "cld3", "libcld3." + RbConfig::CONFIG["DLEXT"])
|
183
|
-
|
184
|
-
module NNetLanguageIdentifier
|
185
|
-
class Pointer < FFI::AutoPointer
|
186
|
-
def self.release(pointer)
|
187
|
-
Unstable.delete_NNetLanguageIdentifier(pointer)
|
188
|
-
end
|
189
|
-
end
|
190
|
-
|
191
|
-
class SpanInfo < FFI::Struct
|
192
|
-
layout :start_index, :int, :end_index, :int, :probability, :float
|
193
|
-
end
|
194
|
-
|
195
|
-
class Result < FFI::Struct
|
196
|
-
layout :language_data, :pointer, :language_size, :size_t, :byte_ranges_data, :pointer, :byte_ranges_size, :size_t, :probability, :float, :proportion, :float, :reliable?, :bool
|
197
|
-
end
|
198
|
-
end
|
199
|
-
|
200
|
-
attach_function :delete_NNetLanguageIdentifier, [ :pointer ], :void
|
201
|
-
|
202
|
-
attach_function :delete_result, [ :pointer ], :void
|
203
|
-
|
204
|
-
attach_function :delete_results, [ :pointer ], :void
|
205
|
-
|
206
|
-
attach_function :new_NNetLanguageIdentifier, [ :int, :int ], :pointer
|
207
|
-
|
208
|
-
attach_function :refer_to_nth_result, [ :pointer, :size_t ], NNetLanguageIdentifier::Result.by_value
|
209
|
-
|
210
|
-
attach_function :NNetLanguageIdentifier_find_language,
|
211
|
-
[ :pointer, :buffer_in, :size_t ], :pointer
|
212
|
-
|
213
|
-
attach_function :NNetLanguageIdentifier_find_top_n_most_freq_langs,
|
214
|
-
[ :pointer, :buffer_in, :size_t, :int ], :pointer
|
215
|
-
end
|
216
|
-
|
217
|
-
private_constant :Unstable
|
218
187
|
end
|
data/sig/cld3.rbs
ADDED
@@ -0,0 +1,65 @@
|
|
1
|
+
# Copyright 2021 Akihiko Odaki <akihiko.odaki@gmail.com>
|
2
|
+
# All Rights Reserved.
|
3
|
+
#
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
5
|
+
# you may not use this file except in compliance with the License.
|
6
|
+
# You may obtain a copy of the License at
|
7
|
+
#
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
#
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
13
|
+
# See the License for the specific language governing permissions and
|
14
|
+
# limitations under the License.
|
15
|
+
# ==============================================================================
|
16
|
+
|
17
|
+
module CLD3
|
18
|
+
class NNetLanguageIdentifier
|
19
|
+
MIN_NUM_BYTES_TO_CONSIDER: Integer
|
20
|
+
MAX_NUM_BYTES_TO_CONSIDER: Integer
|
21
|
+
MAX_NUM_INPUT_BYTES_TO_CONSIDER: Integer
|
22
|
+
RELIABILITY_THRESHOLD: Float
|
23
|
+
RELIABILITY_HR_BS_THRESHOLD: Float
|
24
|
+
|
25
|
+
class SpanInfo < Struct[Float | Integer]
|
26
|
+
attr_accessor start_index(): Integer
|
27
|
+
attr_accessor end_index(): Integer
|
28
|
+
attr_accessor probability(): Float
|
29
|
+
end
|
30
|
+
|
31
|
+
class Result < Struct[Array[SpanInfo] | Float | TaskContextParams::language_names | bool]
|
32
|
+
attr_accessor language(): TaskContextParams::language_names
|
33
|
+
attr_accessor probability(): Float
|
34
|
+
attr_accessor reliable?(): bool
|
35
|
+
attr_accessor proportion(): Float
|
36
|
+
attr_accessor byte_ranges(): Array[SpanInfo]
|
37
|
+
end
|
38
|
+
|
39
|
+
def initialize: (?Integer, ?Integer) -> void
|
40
|
+
def find_language: (String) -> Result?
|
41
|
+
def find_top_n_most_freq_langs: (String, Integer) -> Array[Result]
|
42
|
+
|
43
|
+
private
|
44
|
+
|
45
|
+
def convert_result: (untyped) -> Result?
|
46
|
+
end
|
47
|
+
|
48
|
+
module TaskContextParams
|
49
|
+
type language_names =
|
50
|
+
:eo | :co | :eu | :ta | :de | :mt | :ps | :te | :su | :uz | :'zh-Latn' | :ne |
|
51
|
+
:nl | :sw | :sq | :hmn | :ja | :no | :mn | :so | :ko | :kk | :sl | :ig |
|
52
|
+
:mr | :th | :zu | :ml | :hr | :bs | :lo | :sd | :cy | :hy | :uk | :pt |
|
53
|
+
:lv | :iw | :cs | :vi | :jv | :be | :km | :mk | :tr | :fy | :am | :zh |
|
54
|
+
:da | :sv | :fi | :ht | :af | :la | :id | :fil | :sm | :ca | :el | :ka |
|
55
|
+
:sr | :it | :sk | :ru | :'ru-Latn' | :bg | :ny | :fa | :haw | :gl | :et |
|
56
|
+
:ms | :gd | :'bg-Latn' | :ha | :is | :ur | :mi | :hi | :bn | :'hi-Latn' | :fr |
|
57
|
+
:yi | :hu | :xh | :my | :tg | :ro | :ar | :lb | :'el-Latn' | :st | :ceb |
|
58
|
+
:kn | :az | :si | :ky | :mg | :en | :gu | :es | :pl | :'ja-Latn' | :ga | :lt |
|
59
|
+
:sn | :yo | :pa | :ku
|
60
|
+
|
61
|
+
LANGUAGE_NAMES: Array[language_names]
|
62
|
+
end
|
63
|
+
|
64
|
+
Unstable: untyped
|
65
|
+
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: cld3
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 3.
|
4
|
+
version: 3.4.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Akihiko Odaki
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2022-01-20 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: ffi
|
@@ -19,7 +19,7 @@ dependencies:
|
|
19
19
|
version: 1.1.0
|
20
20
|
- - "<"
|
21
21
|
- !ruby/object:Gem::Version
|
22
|
-
version: 1.
|
22
|
+
version: 1.16.0
|
23
23
|
type: :runtime
|
24
24
|
prerelease: false
|
25
25
|
version_requirements: !ruby/object:Gem::Requirement
|
@@ -29,7 +29,27 @@ dependencies:
|
|
29
29
|
version: 1.1.0
|
30
30
|
- - "<"
|
31
31
|
- !ruby/object:Gem::Version
|
32
|
-
version: 1.
|
32
|
+
version: 1.16.0
|
33
|
+
- !ruby/object:Gem::Dependency
|
34
|
+
name: rbs
|
35
|
+
requirement: !ruby/object:Gem::Requirement
|
36
|
+
requirements:
|
37
|
+
- - ">="
|
38
|
+
- !ruby/object:Gem::Version
|
39
|
+
version: 1.7.0
|
40
|
+
- - "<"
|
41
|
+
- !ruby/object:Gem::Version
|
42
|
+
version: 1.8.0
|
43
|
+
type: :development
|
44
|
+
prerelease: false
|
45
|
+
version_requirements: !ruby/object:Gem::Requirement
|
46
|
+
requirements:
|
47
|
+
- - ">="
|
48
|
+
- !ruby/object:Gem::Version
|
49
|
+
version: 1.7.0
|
50
|
+
- - "<"
|
51
|
+
- !ruby/object:Gem::Version
|
52
|
+
version: 1.8.0
|
33
53
|
- !ruby/object:Gem::Dependency
|
34
54
|
name: rspec
|
35
55
|
requirement: !ruby/object:Gem::Requirement
|
@@ -39,7 +59,7 @@ dependencies:
|
|
39
59
|
version: 3.0.0
|
40
60
|
- - "<"
|
41
61
|
- !ruby/object:Gem::Version
|
42
|
-
version: 3.
|
62
|
+
version: 3.11.0
|
43
63
|
type: :development
|
44
64
|
prerelease: false
|
45
65
|
version_requirements: !ruby/object:Gem::Requirement
|
@@ -49,10 +69,30 @@ dependencies:
|
|
49
69
|
version: 3.0.0
|
50
70
|
- - "<"
|
51
71
|
- !ruby/object:Gem::Version
|
52
|
-
version: 3.
|
72
|
+
version: 3.11.0
|
73
|
+
- !ruby/object:Gem::Dependency
|
74
|
+
name: steep
|
75
|
+
requirement: !ruby/object:Gem::Requirement
|
76
|
+
requirements:
|
77
|
+
- - ">="
|
78
|
+
- !ruby/object:Gem::Version
|
79
|
+
version: 0.47.0
|
80
|
+
- - "<"
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: 0.48.0
|
83
|
+
type: :development
|
84
|
+
prerelease: false
|
85
|
+
version_requirements: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - ">="
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: 0.47.0
|
90
|
+
- - "<"
|
91
|
+
- !ruby/object:Gem::Version
|
92
|
+
version: 0.48.0
|
53
93
|
description: Compact Language Detector v3 (CLD3) is a neural network model for language
|
54
94
|
identification.
|
55
|
-
email: akihiko.odaki
|
95
|
+
email: akihiko.odaki@gmail.com
|
56
96
|
executables: []
|
57
97
|
extensions:
|
58
98
|
- ext/cld3/extconf.rb
|
@@ -63,76 +103,111 @@ files:
|
|
63
103
|
- LICENSE_CLD3
|
64
104
|
- README.md
|
65
105
|
- cld3.gemspec
|
106
|
+
- ext/cld3/Makefile
|
66
107
|
- ext/cld3/base.cc
|
67
108
|
- ext/cld3/base.h
|
109
|
+
- ext/cld3/base.o
|
68
110
|
- ext/cld3/casts.h
|
69
111
|
- ext/cld3/embedding_feature_extractor.cc
|
70
112
|
- ext/cld3/embedding_feature_extractor.h
|
113
|
+
- ext/cld3/embedding_feature_extractor.o
|
71
114
|
- ext/cld3/embedding_network.cc
|
72
115
|
- ext/cld3/embedding_network.h
|
116
|
+
- ext/cld3/embedding_network.o
|
73
117
|
- ext/cld3/embedding_network_params.h
|
74
118
|
- ext/cld3/extconf.rb
|
75
119
|
- ext/cld3/feature_extractor.cc
|
76
120
|
- ext/cld3/feature_extractor.h
|
121
|
+
- ext/cld3/feature_extractor.o
|
122
|
+
- ext/cld3/feature_extractor.pb.o
|
77
123
|
- ext/cld3/feature_extractor.proto
|
78
124
|
- ext/cld3/feature_types.cc
|
79
125
|
- ext/cld3/feature_types.h
|
126
|
+
- ext/cld3/feature_types.o
|
80
127
|
- ext/cld3/fixunicodevalue.cc
|
81
128
|
- ext/cld3/fixunicodevalue.h
|
129
|
+
- ext/cld3/fixunicodevalue.o
|
82
130
|
- ext/cld3/float16.h
|
83
131
|
- ext/cld3/fml_parser.cc
|
84
132
|
- ext/cld3/fml_parser.h
|
133
|
+
- ext/cld3/fml_parser.o
|
85
134
|
- ext/cld3/generated_entities.cc
|
135
|
+
- ext/cld3/generated_entities.o
|
86
136
|
- ext/cld3/generated_ulscript.cc
|
87
137
|
- ext/cld3/generated_ulscript.h
|
138
|
+
- ext/cld3/generated_ulscript.o
|
88
139
|
- ext/cld3/getonescriptspan.cc
|
89
140
|
- ext/cld3/getonescriptspan.h
|
141
|
+
- ext/cld3/getonescriptspan.o
|
90
142
|
- ext/cld3/integral_types.h
|
91
143
|
- ext/cld3/lang_id_nn_params.cc
|
92
144
|
- ext/cld3/lang_id_nn_params.h
|
145
|
+
- ext/cld3/lang_id_nn_params.o
|
93
146
|
- ext/cld3/language_identifier_features.cc
|
94
147
|
- ext/cld3/language_identifier_features.h
|
148
|
+
- ext/cld3/language_identifier_features.o
|
149
|
+
- ext/cld3/libcld3.def
|
150
|
+
- ext/cld3/libcld3.so
|
151
|
+
- ext/cld3/mkmf.log
|
95
152
|
- ext/cld3/nnet_language_identifier.cc
|
96
153
|
- ext/cld3/nnet_language_identifier.h
|
154
|
+
- ext/cld3/nnet_language_identifier.o
|
97
155
|
- ext/cld3/nnet_language_identifier_c.cc
|
156
|
+
- ext/cld3/nnet_language_identifier_c.o
|
98
157
|
- ext/cld3/offsetmap.cc
|
99
158
|
- ext/cld3/offsetmap.h
|
159
|
+
- ext/cld3/offsetmap.o
|
100
160
|
- ext/cld3/port.h
|
101
161
|
- ext/cld3/registry.cc
|
102
162
|
- ext/cld3/registry.h
|
163
|
+
- ext/cld3/registry.o
|
103
164
|
- ext/cld3/relevant_script_feature.cc
|
104
165
|
- ext/cld3/relevant_script_feature.h
|
166
|
+
- ext/cld3/relevant_script_feature.o
|
105
167
|
- ext/cld3/script_detector.h
|
168
|
+
- ext/cld3/sentence.pb.o
|
106
169
|
- ext/cld3/sentence.proto
|
107
170
|
- ext/cld3/sentence_features.cc
|
108
171
|
- ext/cld3/sentence_features.h
|
172
|
+
- ext/cld3/sentence_features.o
|
109
173
|
- ext/cld3/simple_adder.h
|
110
174
|
- ext/cld3/stringpiece.h
|
111
175
|
- ext/cld3/task_context.cc
|
112
176
|
- ext/cld3/task_context.h
|
177
|
+
- ext/cld3/task_context.o
|
113
178
|
- ext/cld3/task_context_params.cc
|
114
179
|
- ext/cld3/task_context_params.h
|
180
|
+
- ext/cld3/task_context_params.o
|
181
|
+
- ext/cld3/task_spec.pb.o
|
115
182
|
- ext/cld3/task_spec.proto
|
116
183
|
- ext/cld3/text_processing.cc
|
117
184
|
- ext/cld3/text_processing.h
|
185
|
+
- ext/cld3/text_processing.o
|
118
186
|
- ext/cld3/unicodetext.cc
|
119
187
|
- ext/cld3/unicodetext.h
|
188
|
+
- ext/cld3/unicodetext.o
|
120
189
|
- ext/cld3/utf8acceptinterchange.h
|
121
190
|
- ext/cld3/utf8prop_lettermarkscriptnum.h
|
122
191
|
- ext/cld3/utf8repl_lettermarklower.h
|
123
192
|
- ext/cld3/utf8scannot_lettermarkspecial.h
|
124
193
|
- ext/cld3/utf8statetable.cc
|
125
194
|
- ext/cld3/utf8statetable.h
|
195
|
+
- ext/cld3/utf8statetable.o
|
126
196
|
- ext/cld3/utils.cc
|
127
197
|
- ext/cld3/utils.h
|
198
|
+
- ext/cld3/utils.o
|
128
199
|
- ext/cld3/workspace.cc
|
129
200
|
- ext/cld3/workspace.h
|
201
|
+
- ext/cld3/workspace.o
|
202
|
+
- lib/a.rb
|
130
203
|
- lib/cld3.rb
|
204
|
+
- lib/cld3/unstable.rb
|
205
|
+
- sig/cld3.rbs
|
131
206
|
homepage: https://github.com/akihikodaki/cld3-ruby
|
132
207
|
licenses:
|
133
208
|
- Apache-2.0
|
134
209
|
metadata: {}
|
135
|
-
post_install_message:
|
210
|
+
post_install_message:
|
136
211
|
rdoc_options: []
|
137
212
|
require_paths:
|
138
213
|
- lib
|
@@ -140,18 +215,18 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
140
215
|
requirements:
|
141
216
|
- - ">="
|
142
217
|
- !ruby/object:Gem::Version
|
143
|
-
version: 2.
|
218
|
+
version: 2.6.0
|
144
219
|
- - "<"
|
145
220
|
- !ruby/object:Gem::Version
|
146
|
-
version: 2.
|
221
|
+
version: 3.2.0
|
147
222
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
148
223
|
requirements:
|
149
224
|
- - ">="
|
150
225
|
- !ruby/object:Gem::Version
|
151
226
|
version: '0'
|
152
227
|
requirements: []
|
153
|
-
rubygems_version: 3.
|
154
|
-
signing_key:
|
228
|
+
rubygems_version: 3.2.22
|
229
|
+
signing_key:
|
155
230
|
specification_version: 4
|
156
231
|
summary: Compact Language Detector v3 (CLD3)
|
157
232
|
test_files: []
|