cld3 3.2.4 → 3.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +1 -1
- data/LICENSE +2 -2
- data/README.md +3 -3
- data/cld3.gemspec +6 -6
- data/ext/cld3/Makefile +266 -0
- data/ext/cld3/base.o +0 -0
- data/ext/cld3/embedding_feature_extractor.o +0 -0
- data/ext/cld3/embedding_network.o +0 -0
- data/ext/cld3/extconf.rb +3 -2
- data/ext/cld3/feature_extractor.o +0 -0
- data/ext/cld3/feature_extractor.pb.o +0 -0
- data/ext/cld3/feature_types.o +0 -0
- data/ext/cld3/fixunicodevalue.o +0 -0
- data/ext/cld3/fml_parser.o +0 -0
- data/ext/cld3/generated_entities.o +0 -0
- data/ext/cld3/generated_ulscript.o +0 -0
- data/ext/cld3/getonescriptspan.h +1 -1
- data/ext/cld3/getonescriptspan.o +0 -0
- data/ext/cld3/lang_id_nn_params.o +0 -0
- data/ext/cld3/language_identifier_features.o +0 -0
- data/ext/cld3/libcld3.def +8 -0
- data/ext/cld3/libcld3.so +0 -0
- data/ext/cld3/mkmf.log +37 -0
- data/ext/cld3/nnet_language_identifier.cc +8 -0
- data/ext/cld3/nnet_language_identifier.h +16 -0
- data/ext/cld3/nnet_language_identifier.o +0 -0
- data/ext/cld3/nnet_language_identifier_c.cc +71 -23
- data/ext/cld3/nnet_language_identifier_c.o +0 -0
- data/ext/cld3/offsetmap.o +0 -0
- data/ext/cld3/registry.o +0 -0
- data/ext/cld3/relevant_script_feature.o +0 -0
- data/ext/cld3/sentence.pb.o +0 -0
- data/ext/cld3/sentence_features.o +0 -0
- data/ext/cld3/task_context.o +0 -0
- data/ext/cld3/task_context_params.o +0 -0
- data/ext/cld3/task_spec.pb.o +0 -0
- data/ext/cld3/text_processing.o +0 -0
- data/ext/cld3/unicodetext.o +0 -0
- data/ext/cld3/utf8statetable.o +0 -0
- data/ext/cld3/utils.o +0 -0
- data/ext/cld3/workspace.o +0 -0
- data/lib/cld3.rb +96 -16
- metadata +45 -13
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f5b3cc203abda97cb85d5dee0983b7f63c626397b8af8b90e2110bb5fedbbdec
|
4
|
+
data.tar.gz: 197f66798925404ded7af722d0194a705018d6953b11f4576c4e180ea093675d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 855e8ee464a2842906bfef211e2afb21820fe9a7449b58d91b9ab1908c997966b9dd4c2d5d51f82ceb84b65b5a118736a5aa4eff6ea9548b9a9abc61b297a9d0
|
7
|
+
data.tar.gz: e38ddfd81489aeb83bccc7b509dd17ea79c56ba641de37cac2d800d3428ed31e5ac57066016bd118e9e71c30c78d31b4c38a266abe012065495558adf07e68f5
|
data/Gemfile
CHANGED
data/LICENSE
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
Copyright 2017 Akihiko Odaki <akihiko.odaki
|
1
|
+
Copyright 2017 Akihiko Odaki <akihiko.odaki@gmail.com>
|
2
2
|
All rights reserved.
|
3
3
|
|
4
4
|
Apache License
|
@@ -189,7 +189,7 @@ All rights reserved.
|
|
189
189
|
same "printed page" as the copyright notice for easier
|
190
190
|
identification within third-party archives.
|
191
191
|
|
192
|
-
Copyright 2017, Akihiko Odaki <akihiko.odaki
|
192
|
+
Copyright 2017, Akihiko Odaki <akihiko.odaki@gmail.com>
|
193
193
|
|
194
194
|
Licensed under the Apache License, Version 2.0 (the "License");
|
195
195
|
you may not use this file except in compliance with the License.
|
data/README.md
CHANGED
@@ -8,11 +8,11 @@ require 'cld3'
|
|
8
8
|
|
9
9
|
cld3 = CLD3::NNetLanguageIdentifier.new(0, 1000)
|
10
10
|
|
11
|
-
cld3.find_language("こんにちは") # => #<struct Struct::Result language=:ja, probability=1.0, reliable?=true, proportion=1.0>
|
11
|
+
cld3.find_language("こんにちは") # => #<struct Struct::Result language=:ja, probability=1.0, reliable?=true, proportion=1.0, byte_ranges=[]>
|
12
12
|
|
13
|
-
cld3.find_language("This is a pen.") # => #<struct Struct::Result language=:en, probability=0.9999408721923828, reliable?=true, proportion=1.0>
|
13
|
+
cld3.find_language("This is a pen.") # => #<struct Struct::Result language=:en, probability=0.9999408721923828, reliable?=true, proportion=1.0, byte_ranges=[]>
|
14
14
|
|
15
|
-
cld3.find_language("здравствуйте") # => #<struct Struct::Result language=:ru, probability=0.3140212297439575, reliable?=false, proportion=1.0>
|
15
|
+
cld3.find_language("здравствуйте") # => #<struct Struct::Result language=:ru, probability=0.3140212297439575, reliable?=false, proportion=1.0, byte_ranges=[]>
|
16
16
|
```
|
17
17
|
|
18
18
|
## Installation
|
data/cld3.gemspec
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
# Copyright 2017 Akihiko Odaki <akihiko.odaki
|
1
|
+
# Copyright 2017 Akihiko Odaki <akihiko.odaki@gmail.com>
|
2
2
|
# All Rights Reserved.
|
3
3
|
#
|
4
4
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
@@ -16,16 +16,16 @@
|
|
16
16
|
|
17
17
|
Gem::Specification.new do |gem|
|
18
18
|
gem.name = "cld3"
|
19
|
-
gem.version = "3.2
|
19
|
+
gem.version = "3.4.2"
|
20
20
|
gem.summary = "Compact Language Detector v3 (CLD3)"
|
21
21
|
gem.description = "Compact Language Detector v3 (CLD3) is a neural network model for language identification."
|
22
22
|
gem.license = "Apache-2.0"
|
23
23
|
gem.homepage = "https://github.com/akihikodaki/cld3-ruby"
|
24
24
|
gem.author = "Akihiko Odaki"
|
25
|
-
gem.email = "akihiko.odaki
|
26
|
-
gem.required_ruby_version = [ ">= 2.
|
27
|
-
gem.add_dependency "ffi", [ ">= 1.1.0", "< 1.
|
28
|
-
gem.add_development_dependency "rspec", [ ">=3.0.0", "< 3.
|
25
|
+
gem.email = "akihiko.odaki@gmail.com"
|
26
|
+
gem.required_ruby_version = [ ">= 2.6.0", "< 3.1.0" ]
|
27
|
+
gem.add_dependency "ffi", [ ">= 1.1.0", "< 1.16.0" ]
|
28
|
+
gem.add_development_dependency "rspec", [ ">=3.0.0", "< 3.11.0" ]
|
29
29
|
gem.files = Dir[
|
30
30
|
"Gemfile", "LICENSE", "LICENSE_CLD3", "README.md",
|
31
31
|
"cld3.gemspec", "ext/**/*", "lib/**/*"
|
data/ext/cld3/Makefile
ADDED
@@ -0,0 +1,266 @@
|
|
1
|
+
|
2
|
+
SHELL = /bin/sh
|
3
|
+
|
4
|
+
# V=0 quiet, V=1 verbose. other values don't work.
|
5
|
+
V = 1
|
6
|
+
Q1 = $(V:1=)
|
7
|
+
Q = $(Q1:0=@)
|
8
|
+
ECHO1 = $(V:1=@ :)
|
9
|
+
ECHO = $(ECHO1:0=@ echo)
|
10
|
+
NULLCMD = :
|
11
|
+
|
12
|
+
#### Start of system configuration section. ####
|
13
|
+
|
14
|
+
srcdir = .
|
15
|
+
topdir = /usr/include
|
16
|
+
hdrdir = $(topdir)
|
17
|
+
arch_hdrdir = /usr/include
|
18
|
+
PATH_SEPARATOR = :
|
19
|
+
VPATH = $(srcdir):$(arch_hdrdir)/ruby:$(hdrdir)/ruby
|
20
|
+
prefix = $(DESTDIR)/usr
|
21
|
+
rubysitearchprefix = $(sitearchlibdir)/$(RUBY_BASE_NAME)
|
22
|
+
rubyarchprefix = $(DESTDIR)/usr/lib64/ruby
|
23
|
+
rubylibprefix = $(exec_prefix)/share/ruby
|
24
|
+
exec_prefix = $(DESTDIR)/usr
|
25
|
+
vendorarchhdrdir = $(vendorhdrdir)/$(arch)
|
26
|
+
sitearchhdrdir = $(sitehdrdir)/$(arch)
|
27
|
+
rubyarchhdrdir = $(DESTDIR)/usr/include
|
28
|
+
vendorhdrdir = $(rubyhdrdir)/vendor_ruby
|
29
|
+
sitehdrdir = $(rubyhdrdir)/site_ruby
|
30
|
+
rubyhdrdir = $(DESTDIR)/usr/include
|
31
|
+
rubygemsdir = $(DESTDIR)/usr/share/rubygems
|
32
|
+
vendorarchdir = $(DESTDIR)/usr/lib64/ruby/vendor_ruby
|
33
|
+
vendorlibdir = $(vendordir)
|
34
|
+
vendordir = $(DESTDIR)/usr/share/ruby/vendor_ruby
|
35
|
+
sitearchdir = $(DESTDIR)/usr/local/lib64/ruby/site_ruby
|
36
|
+
sitelibdir = $(sitedir)
|
37
|
+
sitedir = $(DESTDIR)/usr/local/share/ruby/site_ruby
|
38
|
+
rubyarchdir = $(rubyarchprefix)
|
39
|
+
rubylibdir = $(rubylibprefix)
|
40
|
+
sitearchincludedir = $(includedir)/$(sitearch)
|
41
|
+
archincludedir = $(includedir)/$(arch)
|
42
|
+
sitearchlibdir = $(libdir)/$(sitearch)
|
43
|
+
archlibdir = $(DESTDIR)/usr/lib64
|
44
|
+
ridir = $(datarootdir)/$(RI_BASE_NAME)
|
45
|
+
mandir = $(DESTDIR)/usr/share/man
|
46
|
+
localedir = $(datarootdir)/locale
|
47
|
+
libdir = $(exec_prefix)/lib64
|
48
|
+
psdir = $(docdir)
|
49
|
+
pdfdir = $(docdir)
|
50
|
+
dvidir = $(docdir)
|
51
|
+
htmldir = $(docdir)
|
52
|
+
infodir = $(DESTDIR)/usr/share/info
|
53
|
+
docdir = $(datarootdir)/doc/$(PACKAGE)
|
54
|
+
oldincludedir = $(DESTDIR)/usr/include
|
55
|
+
includedir = $(DESTDIR)/usr/include
|
56
|
+
localstatedir = $(DESTDIR)/var
|
57
|
+
sharedstatedir = $(DESTDIR)/var/lib
|
58
|
+
sysconfdir = $(DESTDIR)/etc
|
59
|
+
datadir = $(DESTDIR)/usr/share
|
60
|
+
datarootdir = $(prefix)/share
|
61
|
+
libexecdir = $(DESTDIR)/usr/libexec
|
62
|
+
sbindir = $(DESTDIR)/usr/sbin
|
63
|
+
bindir = $(exec_prefix)/bin
|
64
|
+
archdir = $(rubyarchdir)
|
65
|
+
|
66
|
+
|
67
|
+
CC_WRAPPER =
|
68
|
+
CC = gcc
|
69
|
+
CXX = g++
|
70
|
+
LIBRUBY = $(LIBRUBY_SO)
|
71
|
+
LIBRUBY_A = lib$(RUBY_SO_NAME)-static.a
|
72
|
+
LIBRUBYARG_SHARED = -l$(RUBY_SO_NAME)
|
73
|
+
LIBRUBYARG_STATIC = -l$(RUBY_SO_NAME)-static $(MAINLIBS)
|
74
|
+
empty =
|
75
|
+
OUTFLAG = -o $(empty)
|
76
|
+
COUTFLAG = -o $(empty)
|
77
|
+
CSRCFLAG = $(empty)
|
78
|
+
|
79
|
+
RUBY_EXTCONF_H =
|
80
|
+
cflags = $(optflags) $(debugflags) $(warnflags)
|
81
|
+
cxxflags =
|
82
|
+
optflags = -O3
|
83
|
+
debugflags = -ggdb3
|
84
|
+
warnflags = -Wall -Wextra -Wdeprecated-declarations -Wduplicated-cond -Wimplicit-function-declaration -Wimplicit-int -Wmisleading-indentation -Wpointer-arith -Wwrite-strings -Wimplicit-fallthrough=0 -Wmissing-noreturn -Wno-cast-function-type -Wno-constant-logical-operand -Wno-long-long -Wno-missing-field-initializers -Wno-overlength-strings -Wno-packed-bitfield-compat -Wno-parentheses-equality -Wno-self-assign -Wno-tautological-compare -Wno-unused-parameter -Wno-unused-value -Wsuggest-attribute=format -Wsuggest-attribute=noreturn -Wunused-variable
|
85
|
+
cppflags =
|
86
|
+
CCDLFLAGS = -fPIC
|
87
|
+
CFLAGS = $(CCDLFLAGS) -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/redhat/redhat-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/redhat/redhat-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -fPIC $(ARCH_FLAG)
|
88
|
+
INCFLAGS = -I. -I$(arch_hdrdir) -I$(hdrdir)/ruby/backward -I$(hdrdir) -I$(srcdir)
|
89
|
+
DEFS =
|
90
|
+
CPPFLAGS = $(DEFS) $(cppflags)
|
91
|
+
CXXFLAGS = $(CCDLFLAGS) -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/redhat/redhat-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/redhat/redhat-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -fvisibility=hidden -std=c++11 $(ARCH_FLAG)
|
92
|
+
ldflags = -L. -Wl,-z,relro -Wl,--as-needed -Wl,-z,now -specs=/usr/lib/rpm/redhat/redhat-hardened-ld -fstack-protector-strong -rdynamic -Wl,-export-dynamic
|
93
|
+
dldflags = -Wl,-z,relro -Wl,--as-needed -Wl,-z,now -specs=/usr/lib/rpm/redhat/redhat-hardened-ld
|
94
|
+
ARCH_FLAG =
|
95
|
+
DLDFLAGS = $(ldflags) $(dldflags) $(ARCH_FLAG)
|
96
|
+
LDSHARED = $(CC) -shared
|
97
|
+
LDSHAREDXX = $(CXX) -shared
|
98
|
+
AR = ar
|
99
|
+
EXEEXT =
|
100
|
+
|
101
|
+
RUBY_INSTALL_NAME = $(RUBY_BASE_NAME)
|
102
|
+
RUBY_SO_NAME = ruby
|
103
|
+
RUBYW_INSTALL_NAME =
|
104
|
+
RUBY_VERSION_NAME = $(RUBY_BASE_NAME)-$(ruby_version_dir_name)
|
105
|
+
RUBYW_BASE_NAME = rubyw
|
106
|
+
RUBY_BASE_NAME = ruby
|
107
|
+
|
108
|
+
arch = aarch64-linux
|
109
|
+
sitearch = $(arch)
|
110
|
+
ruby_version = 2.7.0
|
111
|
+
ruby = $(bindir)/$(RUBY_BASE_NAME)
|
112
|
+
RUBY = $(ruby)
|
113
|
+
ruby_headers = $(hdrdir)/ruby.h $(hdrdir)/ruby/backward.h $(hdrdir)/ruby/ruby.h $(hdrdir)/ruby/defines.h $(hdrdir)/ruby/missing.h $(hdrdir)/ruby/intern.h $(hdrdir)/ruby/st.h $(hdrdir)/ruby/subst.h $(arch_hdrdir)/ruby/config.h
|
114
|
+
|
115
|
+
RM = rm -f
|
116
|
+
RM_RF = $(RUBY) -run -e rm -- -rf
|
117
|
+
RMDIRS = rmdir --ignore-fail-on-non-empty -p
|
118
|
+
MAKEDIRS = /usr/bin/mkdir -p
|
119
|
+
INSTALL = /usr/bin/install -c
|
120
|
+
INSTALL_PROG = $(INSTALL) -m 0755
|
121
|
+
INSTALL_DATA = $(INSTALL) -m 644
|
122
|
+
COPY = cp
|
123
|
+
TOUCH = exit >
|
124
|
+
|
125
|
+
#### End of system configuration section. ####
|
126
|
+
|
127
|
+
preload =
|
128
|
+
libpath = . $(archlibdir)
|
129
|
+
LIBPATH = -L. -L$(archlibdir)
|
130
|
+
DEFFILE =
|
131
|
+
|
132
|
+
CLEANFILES = mkmf.log
|
133
|
+
DISTCLEANFILES =
|
134
|
+
DISTCLEANDIRS =
|
135
|
+
|
136
|
+
extout =
|
137
|
+
extout_prefix =
|
138
|
+
target_prefix =
|
139
|
+
LOCAL_LIBS =
|
140
|
+
LIBS = -lprotobuf -lpthread -lm -lc
|
141
|
+
ORIG_SRCS = base.cc embedding_feature_extractor.cc embedding_network.cc feature_extractor.cc feature_extractor.pb.cc feature_types.cc fixunicodevalue.cc fml_parser.cc generated_entities.cc generated_ulscript.cc getonescriptspan.cc lang_id_nn_params.cc language_identifier_features.cc nnet_language_identifier.cc nnet_language_identifier_c.cc offsetmap.cc registry.cc relevant_script_feature.cc sentence.pb.cc sentence_features.cc task_context.cc task_context_params.cc task_spec.pb.cc text_processing.cc unicodetext.cc utf8statetable.cc utils.cc workspace.cc
|
142
|
+
SRCS = $(ORIG_SRCS)
|
143
|
+
OBJS = base.o embedding_feature_extractor.o embedding_network.o feature_extractor.o feature_extractor.pb.o feature_types.o fixunicodevalue.o fml_parser.o generated_entities.o generated_ulscript.o getonescriptspan.o lang_id_nn_params.o language_identifier_features.o nnet_language_identifier.o nnet_language_identifier_c.o offsetmap.o registry.o relevant_script_feature.o sentence.pb.o sentence_features.o task_context.o task_context_params.o task_spec.pb.o text_processing.o unicodetext.o utf8statetable.o utils.o workspace.o
|
144
|
+
HDRS = $(srcdir)/base.h $(srcdir)/casts.h $(srcdir)/embedding_feature_extractor.h $(srcdir)/embedding_network.h $(srcdir)/embedding_network_params.h $(srcdir)/feature_extractor.h $(srcdir)/feature_types.h $(srcdir)/float16.h $(srcdir)/fml_parser.h $(srcdir)/language_identifier_features.h $(srcdir)/lang_id_nn_params.h $(srcdir)/nnet_language_identifier.h $(srcdir)/registry.h $(srcdir)/relevant_script_feature.h $(srcdir)/script_detector.h $(srcdir)/sentence_features.h $(srcdir)/simple_adder.h $(srcdir)/fixunicodevalue.h $(srcdir)/generated_ulscript.h $(srcdir)/getonescriptspan.h $(srcdir)/integral_types.h $(srcdir)/offsetmap.h $(srcdir)/port.h $(srcdir)/stringpiece.h $(srcdir)/text_processing.h $(srcdir)/utf8acceptinterchange.h $(srcdir)/utf8prop_lettermarkscriptnum.h $(srcdir)/utf8repl_lettermarklower.h $(srcdir)/utf8scannot_lettermarkspecial.h $(srcdir)/utf8statetable.h $(srcdir)/task_context.h $(srcdir)/task_context_params.h $(srcdir)/unicodetext.h $(srcdir)/utils.h $(srcdir)/workspace.h $(srcdir)/feature_extractor.pb.h $(srcdir)/sentence.pb.h $(srcdir)/task_spec.pb.h
|
145
|
+
LOCAL_HDRS =
|
146
|
+
TARGET = libcld3
|
147
|
+
TARGET_NAME = libcld3
|
148
|
+
TARGET_ENTRY = Init_$(TARGET_NAME)
|
149
|
+
DLLIB = $(TARGET).so
|
150
|
+
EXTSTATIC =
|
151
|
+
STATIC_LIB =
|
152
|
+
|
153
|
+
TIMESTAMP_DIR = .
|
154
|
+
BINDIR = $(bindir)
|
155
|
+
RUBYCOMMONDIR = $(sitedir)$(target_prefix)
|
156
|
+
RUBYLIBDIR = $(sitelibdir)$(target_prefix)
|
157
|
+
RUBYARCHDIR = $(sitearchdir)$(target_prefix)
|
158
|
+
HDRDIR = $(rubyhdrdir)/ruby$(target_prefix)
|
159
|
+
ARCHHDRDIR = $(rubyhdrdir)/$(arch)/ruby$(target_prefix)
|
160
|
+
TARGET_SO_DIR =
|
161
|
+
TARGET_SO = $(TARGET_SO_DIR)$(DLLIB)
|
162
|
+
CLEANLIBS = $(TARGET_SO)
|
163
|
+
CLEANOBJS = *.o *.bak
|
164
|
+
|
165
|
+
all: $(DLLIB)
|
166
|
+
static: $(STATIC_LIB)
|
167
|
+
.PHONY: all install static install-so install-rb
|
168
|
+
.PHONY: clean clean-so clean-static clean-rb
|
169
|
+
|
170
|
+
clean-static::
|
171
|
+
clean-rb-default::
|
172
|
+
clean-rb::
|
173
|
+
clean-so::
|
174
|
+
clean: clean-so clean-static clean-rb-default clean-rb
|
175
|
+
-$(Q)$(RM) $(CLEANLIBS) $(CLEANOBJS) $(CLEANFILES) .*.time
|
176
|
+
|
177
|
+
distclean-rb-default::
|
178
|
+
distclean-rb::
|
179
|
+
distclean-so::
|
180
|
+
distclean-static::
|
181
|
+
distclean: clean distclean-so distclean-static distclean-rb-default distclean-rb
|
182
|
+
-$(Q)$(RM) Makefile $(RUBY_EXTCONF_H) conftest.* mkmf.log
|
183
|
+
-$(Q)$(RM) core ruby$(EXEEXT) *~ $(DISTCLEANFILES)
|
184
|
+
-$(Q)$(RMDIRS) $(DISTCLEANDIRS) 2> /dev/null || true
|
185
|
+
|
186
|
+
realclean: distclean
|
187
|
+
install: install-so install-rb
|
188
|
+
|
189
|
+
install-so: $(DLLIB) $(TIMESTAMP_DIR)/.sitearchdir.time
|
190
|
+
$(INSTALL_PROG) $(DLLIB) $(RUBYARCHDIR)
|
191
|
+
clean-static::
|
192
|
+
-$(Q)$(RM) $(STATIC_LIB)
|
193
|
+
install-rb: pre-install-rb do-install-rb install-rb-default
|
194
|
+
install-rb-default: pre-install-rb-default do-install-rb-default
|
195
|
+
pre-install-rb: Makefile
|
196
|
+
pre-install-rb-default: Makefile
|
197
|
+
do-install-rb:
|
198
|
+
do-install-rb-default:
|
199
|
+
pre-install-rb-default:
|
200
|
+
@$(NULLCMD)
|
201
|
+
$(TIMESTAMP_DIR)/.sitearchdir.time:
|
202
|
+
$(Q) $(MAKEDIRS) $(@D) $(RUBYARCHDIR)
|
203
|
+
$(Q) $(TOUCH) $@
|
204
|
+
|
205
|
+
site-install: site-install-so site-install-rb
|
206
|
+
site-install-so: install-so
|
207
|
+
site-install-rb: install-rb
|
208
|
+
|
209
|
+
.SUFFIXES: .c .m .cc .mm .cxx .cpp .o .S
|
210
|
+
|
211
|
+
.cc.o:
|
212
|
+
$(ECHO) compiling $(<)
|
213
|
+
$(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -c $(CSRCFLAG)$<
|
214
|
+
|
215
|
+
.cc.S:
|
216
|
+
$(ECHO) translating $(<)
|
217
|
+
$(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -S $(CSRCFLAG)$<
|
218
|
+
|
219
|
+
.mm.o:
|
220
|
+
$(ECHO) compiling $(<)
|
221
|
+
$(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -c $(CSRCFLAG)$<
|
222
|
+
|
223
|
+
.mm.S:
|
224
|
+
$(ECHO) translating $(<)
|
225
|
+
$(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -S $(CSRCFLAG)$<
|
226
|
+
|
227
|
+
.cxx.o:
|
228
|
+
$(ECHO) compiling $(<)
|
229
|
+
$(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -c $(CSRCFLAG)$<
|
230
|
+
|
231
|
+
.cxx.S:
|
232
|
+
$(ECHO) translating $(<)
|
233
|
+
$(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -S $(CSRCFLAG)$<
|
234
|
+
|
235
|
+
.cpp.o:
|
236
|
+
$(ECHO) compiling $(<)
|
237
|
+
$(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -c $(CSRCFLAG)$<
|
238
|
+
|
239
|
+
.cpp.S:
|
240
|
+
$(ECHO) translating $(<)
|
241
|
+
$(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -S $(CSRCFLAG)$<
|
242
|
+
|
243
|
+
.c.o:
|
244
|
+
$(ECHO) compiling $(<)
|
245
|
+
$(Q) $(CC) $(INCFLAGS) $(CPPFLAGS) $(CFLAGS) $(COUTFLAG)$@ -c $(CSRCFLAG)$<
|
246
|
+
|
247
|
+
.c.S:
|
248
|
+
$(ECHO) translating $(<)
|
249
|
+
$(Q) $(CC) $(INCFLAGS) $(CPPFLAGS) $(CFLAGS) $(COUTFLAG)$@ -S $(CSRCFLAG)$<
|
250
|
+
|
251
|
+
.m.o:
|
252
|
+
$(ECHO) compiling $(<)
|
253
|
+
$(Q) $(CC) $(INCFLAGS) $(CPPFLAGS) $(CFLAGS) $(COUTFLAG)$@ -c $(CSRCFLAG)$<
|
254
|
+
|
255
|
+
.m.S:
|
256
|
+
$(ECHO) translating $(<)
|
257
|
+
$(Q) $(CC) $(INCFLAGS) $(CPPFLAGS) $(CFLAGS) $(COUTFLAG)$@ -S $(CSRCFLAG)$<
|
258
|
+
|
259
|
+
$(TARGET_SO): $(OBJS) Makefile
|
260
|
+
$(ECHO) linking shared-object $(DLLIB)
|
261
|
+
-$(Q)$(RM) $(@)
|
262
|
+
$(Q) $(LDSHAREDXX) -o $@ $(OBJS) $(LIBPATH) $(DLDFLAGS) $(LOCAL_LIBS) $(LIBS)
|
263
|
+
|
264
|
+
|
265
|
+
|
266
|
+
$(OBJS): $(HDRS) $(ruby_headers)
|
data/ext/cld3/base.o
ADDED
Binary file
|
Binary file
|
Binary file
|
data/ext/cld3/extconf.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
# Copyright 2017 Akihiko Odaki <akihiko.odaki
|
1
|
+
# Copyright 2017 Akihiko Odaki <akihiko.odaki@gmail.com>
|
2
2
|
# All Rights Reserved.
|
3
3
|
#
|
4
4
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
@@ -33,7 +33,7 @@ FileUtils.mkdir_p("cld_3/protos")
|
|
33
33
|
FileUtils.mkdir_p("script_span")
|
34
34
|
|
35
35
|
[ "feature_extractor", "sentence", "task_spec" ].each {|name|
|
36
|
-
|
36
|
+
system "protoc", "#{name}.proto", "--cpp_out=.", exception: true
|
37
37
|
ln_fallback("#{name}.pb.h", "cld_3/protos/#{name}.pb.h")
|
38
38
|
}
|
39
39
|
|
@@ -56,4 +56,5 @@ FileUtils.mkdir_p("script_span")
|
|
56
56
|
}
|
57
57
|
|
58
58
|
$CXXFLAGS += " -fvisibility=hidden -std=c++11"
|
59
|
+
$LIBRUBYARG = ""
|
59
60
|
create_makefile("libcld3")
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
data/ext/cld3/getonescriptspan.h
CHANGED
@@ -93,7 +93,7 @@ class ScriptScanner {
|
|
93
93
|
// again with the first byte of the following range.
|
94
94
|
int MapBack(int text_offset);
|
95
95
|
|
96
|
-
const char* GetBufferStart() {return start_byte_;}
|
96
|
+
const char* GetBufferStart() {return start_byte_;}
|
97
97
|
|
98
98
|
private:
|
99
99
|
// Skip over tags and non-letters
|
Binary file
|
Binary file
|
Binary file
|
data/ext/cld3/libcld3.so
ADDED
Binary file
|
data/ext/cld3/mkmf.log
ADDED
@@ -0,0 +1,37 @@
|
|
1
|
+
"pkg-config --exists protobuf"
|
2
|
+
| pkg-config --libs protobuf
|
3
|
+
=> "-lprotobuf -lpthread \n"
|
4
|
+
"gcc -o conftest -I/usr/include -I/usr/include/ruby/backward -I/usr/include -I. -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/redhat/redhat-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/redhat/redhat-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -fPIC conftest.c -L. -L/usr/lib64 -L. -Wl,-z,relro -Wl,--as-needed -Wl,-z,now -specs=/usr/lib/rpm/redhat/redhat-hardened-ld -fstack-protector-strong -rdynamic -Wl,-export-dynamic -lruby -lm -lc"
|
5
|
+
checked program was:
|
6
|
+
/* begin */
|
7
|
+
1: #include "ruby.h"
|
8
|
+
2:
|
9
|
+
3: int main(int argc, char **argv)
|
10
|
+
4: {
|
11
|
+
5: return !!argv[argc];
|
12
|
+
6: }
|
13
|
+
/* end */
|
14
|
+
|
15
|
+
"gcc -o conftest -I/usr/include -I/usr/include/ruby/backward -I/usr/include -I. -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/redhat/redhat-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/redhat/redhat-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -fPIC conftest.c -L. -L/usr/lib64 -L. -Wl,-z,relro -Wl,--as-needed -Wl,-z,now -specs=/usr/lib/rpm/redhat/redhat-hardened-ld -fstack-protector-strong -rdynamic -Wl,-export-dynamic -lruby -lprotobuf -lpthread -lm -lc"
|
16
|
+
checked program was:
|
17
|
+
/* begin */
|
18
|
+
1: #include "ruby.h"
|
19
|
+
2:
|
20
|
+
3: int main(int argc, char **argv)
|
21
|
+
4: {
|
22
|
+
5: return !!argv[argc];
|
23
|
+
6: }
|
24
|
+
/* end */
|
25
|
+
|
26
|
+
| pkg-config --cflags-only-I protobuf
|
27
|
+
=> "\n"
|
28
|
+
| pkg-config --cflags-only-other protobuf
|
29
|
+
=> "\n"
|
30
|
+
| pkg-config --libs-only-l protobuf
|
31
|
+
=> "-lprotobuf -lpthread \n"
|
32
|
+
package configuration for protobuf
|
33
|
+
incflags:
|
34
|
+
cflags:
|
35
|
+
ldflags:
|
36
|
+
libs: -lprotobuf -lpthread
|
37
|
+
|
@@ -47,6 +47,9 @@ struct LangChunksStats {
|
|
47
47
|
|
48
48
|
// Number of chunks corresponding to the language.
|
49
49
|
int num_chunks = 0;
|
50
|
+
|
51
|
+
// Specifies the byte ranges that language applies to.
|
52
|
+
std::vector<NNetLanguageIdentifier::SpanInfo> byte_ranges;
|
50
53
|
};
|
51
54
|
|
52
55
|
// Compares two pairs based on their values.
|
@@ -298,12 +301,16 @@ NNetLanguageIdentifier::FindTopNMostFreqLangs(const string &text,
|
|
298
301
|
total_num_bytes += num_original_span_bytes;
|
299
302
|
|
300
303
|
const string selected_text = SelectTextGivenScriptSpan(script_span);
|
304
|
+
|
301
305
|
result = FindLanguageOfValidUTF8(selected_text);
|
302
306
|
language = result.language;
|
303
307
|
lang_stats[language].byte_sum += num_original_span_bytes;
|
304
308
|
lang_stats[language].prob_sum +=
|
305
309
|
result.probability * num_original_span_bytes;
|
306
310
|
lang_stats[language].num_chunks++;
|
311
|
+
// Add SpanInfo. Start and end indices are relative to original input.
|
312
|
+
lang_stats[language].byte_ranges.push_back(SpanInfo(
|
313
|
+
ss.MapBack(0), ss.MapBack(script_span.text_bytes), result.probability));
|
307
314
|
}
|
308
315
|
|
309
316
|
// Sort the languages based on the number of bytes associated with them.
|
@@ -329,6 +336,7 @@ NNetLanguageIdentifier::FindTopNMostFreqLangs(const string &text,
|
|
329
336
|
result.probability = stats.prob_sum / stats.byte_sum;
|
330
337
|
result.proportion = stats.byte_sum / byte_sum;
|
331
338
|
result.is_reliable = ResultIsReliable(language, result.probability);
|
339
|
+
result.byte_ranges = stats.byte_ranges;
|
332
340
|
results.push_back(result);
|
333
341
|
}
|
334
342
|
|
@@ -44,6 +44,19 @@ class LanguageIdEmbeddingFeatureExtractor
|
|
44
44
|
// Class for detecting the language of a document.
|
45
45
|
class NNetLanguageIdentifier {
|
46
46
|
public:
|
47
|
+
// Holds probability that Span, specified by start/end indices, is a given
|
48
|
+
// language. The language is not stored here; it can be found in Result, which
|
49
|
+
// holds a vector of SpanInfo.
|
50
|
+
struct SpanInfo {
|
51
|
+
SpanInfo(int start_index_val, int end_index_val, float probability_val)
|
52
|
+
: start_index(start_index_val),
|
53
|
+
end_index(end_index_val),
|
54
|
+
probability(probability_val) {}
|
55
|
+
int start_index = -1;
|
56
|
+
int end_index = -1;
|
57
|
+
float probability = 0.0;
|
58
|
+
};
|
59
|
+
|
47
60
|
// Information about a predicted language.
|
48
61
|
struct Result {
|
49
62
|
string language = kUnknown;
|
@@ -53,6 +66,9 @@ class NNetLanguageIdentifier {
|
|
53
66
|
// Proportion of bytes associated with the language. If FindLanguage is
|
54
67
|
// called, this variable is set to 1.
|
55
68
|
float proportion = 0.0;
|
69
|
+
|
70
|
+
// Specifies the byte ranges that |language| applies to.
|
71
|
+
std::vector<SpanInfo> byte_ranges;
|
56
72
|
};
|
57
73
|
|
58
74
|
NNetLanguageIdentifier();
|
Binary file
|
@@ -1,4 +1,4 @@
|
|
1
|
-
/* Copyright 2017 Akihiko Odaki <akihiko.odaki
|
1
|
+
/* Copyright 2017 Akihiko Odaki <akihiko.odaki@gmail.com>
|
2
2
|
All Rights Reserved.
|
3
3
|
|
4
4
|
Licensed under the Apache License, Version 2.0 (the "License");
|
@@ -26,42 +26,90 @@ limitations under the License.
|
|
26
26
|
#define EXPORT __attribute__ ((visibility ("default")))
|
27
27
|
#endif
|
28
28
|
|
29
|
-
struct NNetLanguageIdentifier {
|
30
|
-
chrome_lang_id::NNetLanguageIdentifier context;
|
31
|
-
std::string language;
|
32
|
-
};
|
33
|
-
|
34
29
|
struct Result {
|
35
30
|
struct {
|
36
31
|
const char *data;
|
37
32
|
std::size_t size;
|
38
33
|
} language;
|
34
|
+
struct {
|
35
|
+
const chrome_lang_id::NNetLanguageIdentifier::SpanInfo *data;
|
36
|
+
std::size_t size;
|
37
|
+
} byte_ranges;
|
39
38
|
float probability;
|
40
39
|
float proportion;
|
41
40
|
bool is_reliable;
|
42
41
|
};
|
43
42
|
|
43
|
+
struct OwningResult {
|
44
|
+
OwningResult(chrome_lang_id::NNetLanguageIdentifier::Result&& result) {
|
45
|
+
references.language = std::move(result.language);
|
46
|
+
references.byte_ranges = std::move(result.byte_ranges);
|
47
|
+
plain.language.data = references.language.data();
|
48
|
+
plain.language.size = references.language.size();
|
49
|
+
plain.byte_ranges.data = references.byte_ranges.data();
|
50
|
+
plain.byte_ranges.size = references.byte_ranges.size();
|
51
|
+
plain.probability = result.probability;
|
52
|
+
plain.proportion = result.proportion;
|
53
|
+
plain.is_reliable = result.is_reliable;
|
54
|
+
}
|
55
|
+
|
56
|
+
Result plain;
|
57
|
+
struct {
|
58
|
+
std::string language;
|
59
|
+
std::vector<chrome_lang_id::NNetLanguageIdentifier::SpanInfo> byte_ranges;
|
60
|
+
} references;
|
61
|
+
};
|
62
|
+
|
44
63
|
extern "C" {
|
45
|
-
EXPORT
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
instance->language = std::move(result.language);
|
51
|
-
|
52
|
-
return Result {
|
53
|
-
{ instance->language.data(), instance->language.size() },
|
54
|
-
result.probability,
|
55
|
-
result.proportion,
|
56
|
-
result.is_reliable
|
57
|
-
};
|
64
|
+
EXPORT OwningResult *NNetLanguageIdentifier_find_language(
|
65
|
+
chrome_lang_id::NNetLanguageIdentifier *instance,
|
66
|
+
const char *data,
|
67
|
+
std::size_t size) {
|
68
|
+
return new OwningResult(instance->FindLanguage(std::string(data, size)));
|
58
69
|
}
|
59
70
|
|
60
|
-
EXPORT
|
61
|
-
|
71
|
+
EXPORT std::vector<chrome_lang_id::NNetLanguageIdentifier::Result>*
|
72
|
+
NNetLanguageIdentifier_find_top_n_most_freq_langs(
|
73
|
+
chrome_lang_id::NNetLanguageIdentifier *instance,
|
74
|
+
const char *data, std::size_t size, int num_langs) {
|
75
|
+
std::string text(data, size);
|
76
|
+
return new auto(instance->FindTopNMostFreqLangs(text, num_langs));
|
62
77
|
}
|
63
78
|
|
64
|
-
EXPORT void
|
65
|
-
|
79
|
+
EXPORT void delete_NNetLanguageIdentifier(
|
80
|
+
chrome_lang_id::NNetLanguageIdentifier *pointer) {
|
81
|
+
delete pointer;
|
82
|
+
}
|
83
|
+
|
84
|
+
EXPORT void delete_result(OwningResult *pointer) {
|
85
|
+
delete pointer;
|
86
|
+
}
|
87
|
+
|
88
|
+
EXPORT void delete_results(
|
89
|
+
std::vector<chrome_lang_id::NNetLanguageIdentifier::Result> *pointer) {
|
90
|
+
delete pointer;
|
91
|
+
}
|
92
|
+
|
93
|
+
EXPORT chrome_lang_id::NNetLanguageIdentifier *new_NNetLanguageIdentifier(
|
94
|
+
int min_num_bytes, int max_num_bytes) {
|
95
|
+
return new chrome_lang_id::NNetLanguageIdentifier(
|
96
|
+
min_num_bytes, max_num_bytes);
|
97
|
+
}
|
98
|
+
|
99
|
+
EXPORT Result refer_to_nth_result(
|
100
|
+
std::vector<chrome_lang_id::NNetLanguageIdentifier::Result> *results,
|
101
|
+
std::size_t index) {
|
102
|
+
Result c;
|
103
|
+
auto& cc = (*results)[index];
|
104
|
+
|
105
|
+
c.language.data = cc.language.data();
|
106
|
+
c.language.size = cc.language.size();
|
107
|
+
c.byte_ranges.data = cc.byte_ranges.data();
|
108
|
+
c.byte_ranges.size = cc.byte_ranges.size();
|
109
|
+
c.probability = cc.probability;
|
110
|
+
c.proportion = cc.proportion;
|
111
|
+
c.is_reliable = cc.is_reliable;
|
112
|
+
|
113
|
+
return c;
|
66
114
|
}
|
67
115
|
}
|
Binary file
|
Binary file
|
data/ext/cld3/registry.o
ADDED
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
data/ext/cld3/utils.o
ADDED
Binary file
|
Binary file
|
data/lib/cld3.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
# File including an implementation of CLD3 module. Some documentations are
|
2
2
|
# extracted from ext/cld3/ext/src/nnet_language_identifier.h.
|
3
3
|
#
|
4
|
-
# Copyright 2017 Akihiko Odaki <akihiko.odaki
|
4
|
+
# Copyright 2017 Akihiko Odaki <akihiko.odaki@gmail.com>
|
5
5
|
# All Rights Reserved.
|
6
6
|
#
|
7
7
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
@@ -49,10 +49,15 @@ module CLD3
|
|
49
49
|
# This is Numeric object.
|
50
50
|
RELIABILITY_HR_BS_THRESHOLD = 0.5
|
51
51
|
|
52
|
+
# Holds probability that Span, specified by start/end indices, is a given
|
53
|
+
# language. The language is not stored here; it can be found in Result, which
|
54
|
+
# holds an Array of SpanInfo.
|
55
|
+
SpanInfo = Struct.new(:start_index, :end_index, :probability)
|
56
|
+
|
52
57
|
# Information about a predicted language.
|
53
58
|
# This is an instance of Struct with the following members:
|
54
59
|
#
|
55
|
-
# [language] This is symbol
|
60
|
+
# [language] This is symbol.
|
56
61
|
#
|
57
62
|
# [probability] Language probability. This is Numeric object.
|
58
63
|
#
|
@@ -61,33 +66,95 @@ module CLD3
|
|
61
66
|
# [proportion] Proportion of bytes associated with the language. If
|
62
67
|
# #find_language is called, this variable is set to 1.
|
63
68
|
# This is Numeric object.
|
64
|
-
|
69
|
+
#
|
70
|
+
# [byte_ranges] Specifies the byte ranges in UTF-8 that |language| applies to.
|
71
|
+
# This is an Array of SpanInfo.
|
72
|
+
Result = Struct.new(:language, :probability, :reliable?, :proportion, :byte_ranges)
|
65
73
|
|
66
74
|
# The arguments are two Integer objects: the minimum and maximum number of bytes to consider.
|
67
|
-
def initialize(
|
68
|
-
@cc = Unstable::NNetLanguageIdentifier::Pointer.new(Unstable.new_NNetLanguageIdentifier(
|
75
|
+
def initialize(min_num_bytes = MIN_NUM_BYTES_TO_CONSIDER, max_num_bytes = MAX_NUM_BYTES_TO_CONSIDER)
|
76
|
+
@cc = Unstable::NNetLanguageIdentifier::Pointer.new(Unstable.new_NNetLanguageIdentifier(min_num_bytes, max_num_bytes))
|
69
77
|
end
|
70
78
|
|
71
79
|
# Finds the most likely language for the given text, along with additional
|
72
80
|
# information (e.g., probability). The prediction is based on the first N
|
73
81
|
# bytes where N is the minimum between the number of interchange valid UTF8
|
74
82
|
# bytes and +max_num_bytes_+. If N is less than +min_num_bytes_+ long, then
|
75
|
-
# this function returns nil
|
83
|
+
# this function returns nil.
|
76
84
|
# The argument is a String object.
|
77
85
|
# The returned value of this function is an instance of Result.
|
78
86
|
def find_language(text)
|
79
87
|
text_utf8 = text.encode(Encoding::UTF_8)
|
80
88
|
pointer = FFI::MemoryPointer.new(:char, text_utf8.bytesize)
|
81
|
-
pointer.put_bytes(0, text_utf8)
|
82
89
|
|
83
|
-
|
84
|
-
|
90
|
+
begin
|
91
|
+
pointer.put_bytes(0, text_utf8)
|
92
|
+
|
93
|
+
result = Unstable.NNetLanguageIdentifier_find_language(@cc, pointer, text_utf8.bytesize)
|
94
|
+
begin
|
95
|
+
convert_result Unstable::NNetLanguageIdentifier::Result.new(result)
|
96
|
+
ensure
|
97
|
+
Unstable.delete_result result
|
98
|
+
end
|
99
|
+
ensure
|
100
|
+
pointer.free
|
101
|
+
end
|
102
|
+
end
|
103
|
+
|
104
|
+
# Splits the input text (up to the first byte, if any, that is not
|
105
|
+
# interchange valid UTF8) into spans based on the script, predicts a language
|
106
|
+
# for each span, and returns a vector storing the top num_langs most frequent
|
107
|
+
# languages along with additional information (e.g., proportions). The number
|
108
|
+
# of bytes considered for each span is the minimum between the size of the
|
109
|
+
# span and +max_num_bytes_+. If more languages are requested than what is
|
110
|
+
# available in the input, then the number of the returned elements will be
|
111
|
+
# the number of the latter. Also, if the size of the span is less than
|
112
|
+
# +min_num_bytes_+ long, then the span is skipped. If the input text is too
|
113
|
+
# long, only the first +MAX_NUM_INPUT_BYTES_TO_CONSIDER+ bytes are processed.
|
114
|
+
# The first argument is a String object.
|
115
|
+
# The second argument is Numeric object.
|
116
|
+
# The returned value of this function is an Array of Result instances.
|
117
|
+
def find_top_n_most_freq_langs(text, num_langs)
|
118
|
+
text_utf8 = text.encode(Encoding::UTF_8)
|
119
|
+
pointer = FFI::MemoryPointer.new(:char, text_utf8.bytesize)
|
120
|
+
|
121
|
+
begin
|
122
|
+
pointer.put_bytes(0, text_utf8)
|
123
|
+
|
124
|
+
results = Unstable.NNetLanguageIdentifier_find_top_n_most_freq_langs(@cc, pointer, text_utf8.bytesize, num_langs)
|
125
|
+
begin
|
126
|
+
num_langs.times
|
127
|
+
.lazy
|
128
|
+
.map { |index| convert_result Unstable.refer_to_nth_result(results, index) }
|
129
|
+
.take_while { |result| !result.nil? }
|
130
|
+
.to_a
|
131
|
+
ensure
|
132
|
+
Unstable.delete_results results
|
133
|
+
end
|
134
|
+
ensure
|
135
|
+
pointer.free
|
136
|
+
end
|
137
|
+
end
|
138
|
+
|
139
|
+
private
|
140
|
+
|
141
|
+
def convert_result(result)
|
142
|
+
language = result[:language_data].read_bytes(result[:language_size])
|
143
|
+
return nil if language == "und"
|
144
|
+
|
145
|
+
cursor = result[:byte_ranges_data]
|
146
|
+
byte_ranges = result[:byte_ranges_size].times.map do
|
147
|
+
info = Unstable::NNetLanguageIdentifier::SpanInfo.new(cursor)
|
148
|
+
cursor += Unstable::NNetLanguageIdentifier::SpanInfo.size
|
149
|
+
SpanInfo.new(info[:start_index], info[:end_index], info[:probability])
|
150
|
+
end
|
85
151
|
|
86
152
|
Result.new(
|
87
|
-
language
|
88
|
-
|
89
|
-
|
90
|
-
|
153
|
+
language.to_sym,
|
154
|
+
result[:probability],
|
155
|
+
result[:reliable?],
|
156
|
+
result[:proportion],
|
157
|
+
byte_ranges)
|
91
158
|
end
|
92
159
|
end
|
93
160
|
|
@@ -112,7 +179,7 @@ module CLD3
|
|
112
179
|
module Unstable
|
113
180
|
extend FFI::Library
|
114
181
|
|
115
|
-
ffi_lib File.join(
|
182
|
+
ffi_lib File.join(__dir__, "..", "ext", "cld3", "libcld3." + RbConfig::CONFIG["DLEXT"])
|
116
183
|
|
117
184
|
module NNetLanguageIdentifier
|
118
185
|
class Pointer < FFI::AutoPointer
|
@@ -121,17 +188,30 @@ module CLD3
|
|
121
188
|
end
|
122
189
|
end
|
123
190
|
|
191
|
+
class SpanInfo < FFI::Struct
|
192
|
+
layout :start_index, :int, :end_index, :int, :probability, :float
|
193
|
+
end
|
194
|
+
|
124
195
|
class Result < FFI::Struct
|
125
|
-
layout :language_data, :pointer, :language_size, :size_t, :probability, :float, :proportion, :float, :reliable?, :bool
|
196
|
+
layout :language_data, :pointer, :language_size, :size_t, :byte_ranges_data, :pointer, :byte_ranges_size, :size_t, :probability, :float, :proportion, :float, :reliable?, :bool
|
126
197
|
end
|
127
198
|
end
|
128
199
|
|
129
200
|
attach_function :delete_NNetLanguageIdentifier, [ :pointer ], :void
|
130
201
|
|
202
|
+
attach_function :delete_result, [ :pointer ], :void
|
203
|
+
|
204
|
+
attach_function :delete_results, [ :pointer ], :void
|
205
|
+
|
131
206
|
attach_function :new_NNetLanguageIdentifier, [ :int, :int ], :pointer
|
132
207
|
|
208
|
+
attach_function :refer_to_nth_result, [ :pointer, :size_t ], NNetLanguageIdentifier::Result.by_value
|
209
|
+
|
133
210
|
attach_function :NNetLanguageIdentifier_find_language,
|
134
|
-
[ :pointer, :buffer_in, :size_t ],
|
211
|
+
[ :pointer, :buffer_in, :size_t ], :pointer
|
212
|
+
|
213
|
+
attach_function :NNetLanguageIdentifier_find_top_n_most_freq_langs,
|
214
|
+
[ :pointer, :buffer_in, :size_t, :int ], :pointer
|
135
215
|
end
|
136
216
|
|
137
217
|
private_constant :Unstable
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: cld3
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 3.2
|
4
|
+
version: 3.4.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Akihiko Odaki
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2021-04-17 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: ffi
|
@@ -19,7 +19,7 @@ dependencies:
|
|
19
19
|
version: 1.1.0
|
20
20
|
- - "<"
|
21
21
|
- !ruby/object:Gem::Version
|
22
|
-
version: 1.
|
22
|
+
version: 1.16.0
|
23
23
|
type: :runtime
|
24
24
|
prerelease: false
|
25
25
|
version_requirements: !ruby/object:Gem::Requirement
|
@@ -29,7 +29,7 @@ dependencies:
|
|
29
29
|
version: 1.1.0
|
30
30
|
- - "<"
|
31
31
|
- !ruby/object:Gem::Version
|
32
|
-
version: 1.
|
32
|
+
version: 1.16.0
|
33
33
|
- !ruby/object:Gem::Dependency
|
34
34
|
name: rspec
|
35
35
|
requirement: !ruby/object:Gem::Requirement
|
@@ -39,7 +39,7 @@ dependencies:
|
|
39
39
|
version: 3.0.0
|
40
40
|
- - "<"
|
41
41
|
- !ruby/object:Gem::Version
|
42
|
-
version: 3.
|
42
|
+
version: 3.11.0
|
43
43
|
type: :development
|
44
44
|
prerelease: false
|
45
45
|
version_requirements: !ruby/object:Gem::Requirement
|
@@ -49,10 +49,10 @@ dependencies:
|
|
49
49
|
version: 3.0.0
|
50
50
|
- - "<"
|
51
51
|
- !ruby/object:Gem::Version
|
52
|
-
version: 3.
|
52
|
+
version: 3.11.0
|
53
53
|
description: Compact Language Detector v3 (CLD3) is a neural network model for language
|
54
54
|
identification.
|
55
|
-
email: akihiko.odaki
|
55
|
+
email: akihiko.odaki@gmail.com
|
56
56
|
executables: []
|
57
57
|
extensions:
|
58
58
|
- ext/cld3/extconf.rb
|
@@ -63,76 +63,108 @@ files:
|
|
63
63
|
- LICENSE_CLD3
|
64
64
|
- README.md
|
65
65
|
- cld3.gemspec
|
66
|
+
- ext/cld3/Makefile
|
66
67
|
- ext/cld3/base.cc
|
67
68
|
- ext/cld3/base.h
|
69
|
+
- ext/cld3/base.o
|
68
70
|
- ext/cld3/casts.h
|
69
71
|
- ext/cld3/embedding_feature_extractor.cc
|
70
72
|
- ext/cld3/embedding_feature_extractor.h
|
73
|
+
- ext/cld3/embedding_feature_extractor.o
|
71
74
|
- ext/cld3/embedding_network.cc
|
72
75
|
- ext/cld3/embedding_network.h
|
76
|
+
- ext/cld3/embedding_network.o
|
73
77
|
- ext/cld3/embedding_network_params.h
|
74
78
|
- ext/cld3/extconf.rb
|
75
79
|
- ext/cld3/feature_extractor.cc
|
76
80
|
- ext/cld3/feature_extractor.h
|
81
|
+
- ext/cld3/feature_extractor.o
|
82
|
+
- ext/cld3/feature_extractor.pb.o
|
77
83
|
- ext/cld3/feature_extractor.proto
|
78
84
|
- ext/cld3/feature_types.cc
|
79
85
|
- ext/cld3/feature_types.h
|
86
|
+
- ext/cld3/feature_types.o
|
80
87
|
- ext/cld3/fixunicodevalue.cc
|
81
88
|
- ext/cld3/fixunicodevalue.h
|
89
|
+
- ext/cld3/fixunicodevalue.o
|
82
90
|
- ext/cld3/float16.h
|
83
91
|
- ext/cld3/fml_parser.cc
|
84
92
|
- ext/cld3/fml_parser.h
|
93
|
+
- ext/cld3/fml_parser.o
|
85
94
|
- ext/cld3/generated_entities.cc
|
95
|
+
- ext/cld3/generated_entities.o
|
86
96
|
- ext/cld3/generated_ulscript.cc
|
87
97
|
- ext/cld3/generated_ulscript.h
|
98
|
+
- ext/cld3/generated_ulscript.o
|
88
99
|
- ext/cld3/getonescriptspan.cc
|
89
100
|
- ext/cld3/getonescriptspan.h
|
101
|
+
- ext/cld3/getonescriptspan.o
|
90
102
|
- ext/cld3/integral_types.h
|
91
103
|
- ext/cld3/lang_id_nn_params.cc
|
92
104
|
- ext/cld3/lang_id_nn_params.h
|
105
|
+
- ext/cld3/lang_id_nn_params.o
|
93
106
|
- ext/cld3/language_identifier_features.cc
|
94
107
|
- ext/cld3/language_identifier_features.h
|
108
|
+
- ext/cld3/language_identifier_features.o
|
109
|
+
- ext/cld3/libcld3.def
|
110
|
+
- ext/cld3/libcld3.so
|
111
|
+
- ext/cld3/mkmf.log
|
95
112
|
- ext/cld3/nnet_language_identifier.cc
|
96
113
|
- ext/cld3/nnet_language_identifier.h
|
114
|
+
- ext/cld3/nnet_language_identifier.o
|
97
115
|
- ext/cld3/nnet_language_identifier_c.cc
|
116
|
+
- ext/cld3/nnet_language_identifier_c.o
|
98
117
|
- ext/cld3/offsetmap.cc
|
99
118
|
- ext/cld3/offsetmap.h
|
119
|
+
- ext/cld3/offsetmap.o
|
100
120
|
- ext/cld3/port.h
|
101
121
|
- ext/cld3/registry.cc
|
102
122
|
- ext/cld3/registry.h
|
123
|
+
- ext/cld3/registry.o
|
103
124
|
- ext/cld3/relevant_script_feature.cc
|
104
125
|
- ext/cld3/relevant_script_feature.h
|
126
|
+
- ext/cld3/relevant_script_feature.o
|
105
127
|
- ext/cld3/script_detector.h
|
128
|
+
- ext/cld3/sentence.pb.o
|
106
129
|
- ext/cld3/sentence.proto
|
107
130
|
- ext/cld3/sentence_features.cc
|
108
131
|
- ext/cld3/sentence_features.h
|
132
|
+
- ext/cld3/sentence_features.o
|
109
133
|
- ext/cld3/simple_adder.h
|
110
134
|
- ext/cld3/stringpiece.h
|
111
135
|
- ext/cld3/task_context.cc
|
112
136
|
- ext/cld3/task_context.h
|
137
|
+
- ext/cld3/task_context.o
|
113
138
|
- ext/cld3/task_context_params.cc
|
114
139
|
- ext/cld3/task_context_params.h
|
140
|
+
- ext/cld3/task_context_params.o
|
141
|
+
- ext/cld3/task_spec.pb.o
|
115
142
|
- ext/cld3/task_spec.proto
|
116
143
|
- ext/cld3/text_processing.cc
|
117
144
|
- ext/cld3/text_processing.h
|
145
|
+
- ext/cld3/text_processing.o
|
118
146
|
- ext/cld3/unicodetext.cc
|
119
147
|
- ext/cld3/unicodetext.h
|
148
|
+
- ext/cld3/unicodetext.o
|
120
149
|
- ext/cld3/utf8acceptinterchange.h
|
121
150
|
- ext/cld3/utf8prop_lettermarkscriptnum.h
|
122
151
|
- ext/cld3/utf8repl_lettermarklower.h
|
123
152
|
- ext/cld3/utf8scannot_lettermarkspecial.h
|
124
153
|
- ext/cld3/utf8statetable.cc
|
125
154
|
- ext/cld3/utf8statetable.h
|
155
|
+
- ext/cld3/utf8statetable.o
|
126
156
|
- ext/cld3/utils.cc
|
127
157
|
- ext/cld3/utils.h
|
158
|
+
- ext/cld3/utils.o
|
128
159
|
- ext/cld3/workspace.cc
|
129
160
|
- ext/cld3/workspace.h
|
161
|
+
- ext/cld3/workspace.o
|
130
162
|
- lib/cld3.rb
|
131
163
|
homepage: https://github.com/akihikodaki/cld3-ruby
|
132
164
|
licenses:
|
133
165
|
- Apache-2.0
|
134
166
|
metadata: {}
|
135
|
-
post_install_message:
|
167
|
+
post_install_message:
|
136
168
|
rdoc_options: []
|
137
169
|
require_paths:
|
138
170
|
- lib
|
@@ -140,18 +172,18 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
140
172
|
requirements:
|
141
173
|
- - ">="
|
142
174
|
- !ruby/object:Gem::Version
|
143
|
-
version: 2.
|
175
|
+
version: 2.6.0
|
144
176
|
- - "<"
|
145
177
|
- !ruby/object:Gem::Version
|
146
|
-
version:
|
178
|
+
version: 3.1.0
|
147
179
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
148
180
|
requirements:
|
149
181
|
- - ">="
|
150
182
|
- !ruby/object:Gem::Version
|
151
183
|
version: '0'
|
152
184
|
requirements: []
|
153
|
-
rubygems_version: 3.
|
154
|
-
signing_key:
|
185
|
+
rubygems_version: 3.1.4
|
186
|
+
signing_key:
|
155
187
|
specification_version: 4
|
156
188
|
summary: Compact Language Detector v3 (CLD3)
|
157
189
|
test_files: []
|