cld3 3.2.6 → 3.4.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +1 -1
- data/LICENSE +2 -2
- data/README.md +3 -3
- data/cld3.gemspec +9 -7
- data/ext/cld3/Makefile +45 -44
- data/ext/cld3/base.o +0 -0
- data/ext/cld3/embedding_feature_extractor.o +0 -0
- data/ext/cld3/embedding_network.cc +1 -0
- data/ext/cld3/embedding_network.o +0 -0
- data/ext/cld3/extconf.rb +3 -2
- data/ext/cld3/feature_extractor.o +0 -0
- data/ext/cld3/feature_extractor.pb.o +0 -0
- data/ext/cld3/feature_types.o +0 -0
- data/ext/cld3/fixunicodevalue.o +0 -0
- data/ext/cld3/fml_parser.o +0 -0
- data/ext/cld3/generated_entities.o +0 -0
- data/ext/cld3/generated_ulscript.o +0 -0
- data/ext/cld3/getonescriptspan.h +2 -2
- data/ext/cld3/getonescriptspan.o +0 -0
- data/ext/cld3/lang_id_nn_params.o +0 -0
- data/ext/cld3/language_identifier_features.o +0 -0
- data/ext/cld3/libcld3.def +8 -0
- data/ext/cld3/libcld3.so +0 -0
- data/ext/cld3/mkmf.log +10 -9
- data/ext/cld3/nnet_language_identifier.cc +3 -5
- data/ext/cld3/nnet_language_identifier.o +0 -0
- data/ext/cld3/nnet_language_identifier_c.cc +71 -23
- data/ext/cld3/nnet_language_identifier_c.o +0 -0
- data/ext/cld3/offsetmap.o +0 -0
- data/ext/cld3/registry.o +0 -0
- data/ext/cld3/relevant_script_feature.o +0 -0
- data/ext/cld3/sentence.pb.o +0 -0
- data/ext/cld3/sentence_features.cc +4 -4
- data/ext/cld3/sentence_features.h +13 -3
- data/ext/cld3/sentence_features.o +0 -0
- data/ext/cld3/task_context.o +0 -0
- data/ext/cld3/task_context_params.o +0 -0
- data/ext/cld3/task_spec.pb.o +0 -0
- data/ext/cld3/text_processing.o +0 -0
- data/ext/cld3/unicodetext.o +0 -0
- data/ext/cld3/utf8statetable.o +0 -0
- data/ext/cld3/utils.o +0 -0
- data/ext/cld3/workspace.o +0 -0
- data/lib/cld3/unstable.rb +58 -0
- data/lib/cld3.rb +88 -40
- data/sig/cld3.rbs +65 -0
- metadata +56 -13
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 2c161cbf12d260074efd2e9db3981b6615af20ee04c234d6b2710bd52a283a4e
|
|
4
|
+
data.tar.gz: c388ae6b529d95e015ecdb7d21cdd7f1ceaca72d167d0f8008b5477d5bce5b3c
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 8e3c1c07283730e722c450acc308a497756fd501595a02a7fc066d0b3e59b96e1ab1e7941549293b02e41274b176772bdae3779a041eb28f8ae53f5c44308cc0
|
|
7
|
+
data.tar.gz: 52e95027de7a595b2eabc49745a11f664e305c18f9926bc9d649642a92fea9846efdd23da699529795d80609b8871b00e77f9379449d2e4f6cb79ecbcf2785db
|
data/Gemfile
CHANGED
data/LICENSE
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
Copyright 2017 Akihiko Odaki <akihiko.odaki
|
|
1
|
+
Copyright 2017 Akihiko Odaki <akihiko.odaki@gmail.com>
|
|
2
2
|
All rights reserved.
|
|
3
3
|
|
|
4
4
|
Apache License
|
|
@@ -189,7 +189,7 @@ All rights reserved.
|
|
|
189
189
|
same "printed page" as the copyright notice for easier
|
|
190
190
|
identification within third-party archives.
|
|
191
191
|
|
|
192
|
-
Copyright 2017, Akihiko Odaki <akihiko.odaki
|
|
192
|
+
Copyright 2017, Akihiko Odaki <akihiko.odaki@gmail.com>
|
|
193
193
|
|
|
194
194
|
Licensed under the Apache License, Version 2.0 (the "License");
|
|
195
195
|
you may not use this file except in compliance with the License.
|
data/README.md
CHANGED
|
@@ -8,11 +8,11 @@ require 'cld3'
|
|
|
8
8
|
|
|
9
9
|
cld3 = CLD3::NNetLanguageIdentifier.new(0, 1000)
|
|
10
10
|
|
|
11
|
-
cld3.find_language("こんにちは") # => #<struct Struct::Result language=:ja, probability=1.0, reliable?=true, proportion=1.0>
|
|
11
|
+
cld3.find_language("こんにちは") # => #<struct Struct::Result language=:ja, probability=1.0, reliable?=true, proportion=1.0, byte_ranges=[]>
|
|
12
12
|
|
|
13
|
-
cld3.find_language("This is a pen.") # => #<struct Struct::Result language=:en, probability=0.9999408721923828, reliable?=true, proportion=1.0>
|
|
13
|
+
cld3.find_language("This is a pen.") # => #<struct Struct::Result language=:en, probability=0.9999408721923828, reliable?=true, proportion=1.0, byte_ranges=[]>
|
|
14
14
|
|
|
15
|
-
cld3.find_language("здравствуйте") # => #<struct Struct::Result language=:ru, probability=0.3140212297439575, reliable?=false, proportion=1.0>
|
|
15
|
+
cld3.find_language("здравствуйте") # => #<struct Struct::Result language=:ru, probability=0.3140212297439575, reliable?=false, proportion=1.0, byte_ranges=[]>
|
|
16
16
|
```
|
|
17
17
|
|
|
18
18
|
## Installation
|
data/cld3.gemspec
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
# Copyright 2017 Akihiko Odaki <akihiko.odaki
|
|
1
|
+
# Copyright 2017 Akihiko Odaki <akihiko.odaki@gmail.com>
|
|
2
2
|
# All Rights Reserved.
|
|
3
3
|
#
|
|
4
4
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
@@ -16,19 +16,21 @@
|
|
|
16
16
|
|
|
17
17
|
Gem::Specification.new do |gem|
|
|
18
18
|
gem.name = "cld3"
|
|
19
|
-
gem.version = "3.2.6"
|
|
19
|
+
gem.version = "3.4.3"
|
|
20
20
|
gem.summary = "Compact Language Detector v3 (CLD3)"
|
|
21
21
|
gem.description = "Compact Language Detector v3 (CLD3) is a neural network model for language identification."
|
|
22
22
|
gem.license = "Apache-2.0"
|
|
23
23
|
gem.homepage = "https://github.com/akihikodaki/cld3-ruby"
|
|
24
24
|
gem.author = "Akihiko Odaki"
|
|
25
|
-
gem.email = "akihiko.odaki
|
|
26
|
-
gem.required_ruby_version = [ ">= 2.
|
|
27
|
-
gem.add_dependency "ffi", [ ">= 1.1.0", "< 1.
|
|
28
|
-
gem.add_development_dependency "
|
|
25
|
+
gem.email = "akihiko.odaki@gmail.com"
|
|
26
|
+
gem.required_ruby_version = [ ">= 2.6.0", "< 3.2.0" ]
|
|
27
|
+
gem.add_dependency "ffi", [ ">= 1.1.0", "< 1.16.0" ]
|
|
28
|
+
gem.add_development_dependency "rbs", [ ">= 1.7.0", "< 1.8.0" ]
|
|
29
|
+
gem.add_development_dependency "rspec", [ ">=3.0.0", "< 3.11.0" ]
|
|
30
|
+
gem.add_development_dependency "steep", [ ">= 0.46.0", "< 0.47.0" ]
|
|
29
31
|
gem.files = Dir[
|
|
30
32
|
"Gemfile", "LICENSE", "LICENSE_CLD3", "README.md",
|
|
31
|
-
"cld3.gemspec", "ext/**/*", "lib/**/*"
|
|
33
|
+
"cld3.gemspec", "ext/**/*", "lib/**/*", "sig/**/*"
|
|
32
34
|
]
|
|
33
35
|
gem.require_paths = [ "lib" ]
|
|
34
36
|
gem.extensions = [ "ext/cld3/extconf.rb" ]
|
data/ext/cld3/Makefile
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
SHELL = /bin/sh
|
|
3
3
|
|
|
4
4
|
# V=0 quiet, V=1 verbose. other values don't work.
|
|
5
|
-
V = 0
|
|
5
|
+
V = 1
|
|
6
6
|
Q1 = $(V:1=)
|
|
7
7
|
Q = $(Q1:0=@)
|
|
8
8
|
ECHO1 = $(V:1=@ :)
|
|
@@ -12,54 +12,55 @@ NULLCMD = :
|
|
|
12
12
|
#### Start of system configuration section. ####
|
|
13
13
|
|
|
14
14
|
srcdir = .
|
|
15
|
-
topdir = /usr/include
|
|
15
|
+
topdir = /usr/include
|
|
16
16
|
hdrdir = $(topdir)
|
|
17
|
-
arch_hdrdir = /usr/include
|
|
17
|
+
arch_hdrdir = /usr/include
|
|
18
18
|
PATH_SEPARATOR = :
|
|
19
19
|
VPATH = $(srcdir):$(arch_hdrdir)/ruby:$(hdrdir)/ruby
|
|
20
20
|
prefix = $(DESTDIR)/usr
|
|
21
|
-
rubysitearchprefix = $(
|
|
22
|
-
rubyarchprefix = $(
|
|
23
|
-
rubylibprefix = $(
|
|
24
|
-
exec_prefix = $(
|
|
25
|
-
vendorarchhdrdir = $(vendorhdrdir)/$(
|
|
26
|
-
sitearchhdrdir = $(sitehdrdir)/$(
|
|
27
|
-
rubyarchhdrdir = $(
|
|
21
|
+
rubysitearchprefix = $(sitearchlibdir)/$(RUBY_BASE_NAME)
|
|
22
|
+
rubyarchprefix = $(DESTDIR)/usr/lib64/ruby
|
|
23
|
+
rubylibprefix = $(exec_prefix)/share/ruby
|
|
24
|
+
exec_prefix = $(DESTDIR)/usr
|
|
25
|
+
vendorarchhdrdir = $(vendorhdrdir)/$(arch)
|
|
26
|
+
sitearchhdrdir = $(sitehdrdir)/$(arch)
|
|
27
|
+
rubyarchhdrdir = $(DESTDIR)/usr/include
|
|
28
28
|
vendorhdrdir = $(rubyhdrdir)/vendor_ruby
|
|
29
29
|
sitehdrdir = $(rubyhdrdir)/site_ruby
|
|
30
|
-
rubyhdrdir = $(
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
30
|
+
rubyhdrdir = $(DESTDIR)/usr/include
|
|
31
|
+
rubygemsdir = $(DESTDIR)/usr/share/rubygems
|
|
32
|
+
vendorarchdir = $(DESTDIR)/usr/lib64/ruby/vendor_ruby
|
|
33
|
+
vendorlibdir = $(vendordir)
|
|
34
|
+
vendordir = $(DESTDIR)/usr/share/ruby/vendor_ruby
|
|
35
|
+
sitearchdir = $(DESTDIR)/usr/local/lib64/ruby/site_ruby
|
|
36
|
+
sitelibdir = $(sitedir)
|
|
37
|
+
sitedir = $(DESTDIR)/usr/local/share/ruby/site_ruby
|
|
38
|
+
rubyarchdir = $(rubyarchprefix)
|
|
39
|
+
rubylibdir = $(rubylibprefix)
|
|
39
40
|
sitearchincludedir = $(includedir)/$(sitearch)
|
|
40
41
|
archincludedir = $(includedir)/$(arch)
|
|
41
42
|
sitearchlibdir = $(libdir)/$(sitearch)
|
|
42
|
-
archlibdir = $(
|
|
43
|
+
archlibdir = $(DESTDIR)/usr/lib64
|
|
43
44
|
ridir = $(datarootdir)/$(RI_BASE_NAME)
|
|
44
|
-
mandir = $(
|
|
45
|
+
mandir = $(DESTDIR)/usr/share/man
|
|
45
46
|
localedir = $(datarootdir)/locale
|
|
46
|
-
libdir = $(exec_prefix)/
|
|
47
|
+
libdir = $(exec_prefix)/lib64
|
|
47
48
|
psdir = $(docdir)
|
|
48
49
|
pdfdir = $(docdir)
|
|
49
50
|
dvidir = $(docdir)
|
|
50
51
|
htmldir = $(docdir)
|
|
51
|
-
infodir = $(
|
|
52
|
+
infodir = $(DESTDIR)/usr/share/info
|
|
52
53
|
docdir = $(datarootdir)/doc/$(PACKAGE)
|
|
53
54
|
oldincludedir = $(DESTDIR)/usr/include
|
|
54
|
-
includedir = $(
|
|
55
|
+
includedir = $(DESTDIR)/usr/include
|
|
55
56
|
runstatedir = $(localstatedir)/run
|
|
56
57
|
localstatedir = $(DESTDIR)/var
|
|
57
58
|
sharedstatedir = $(DESTDIR)/var/lib
|
|
58
59
|
sysconfdir = $(DESTDIR)/etc
|
|
59
|
-
datadir = $(
|
|
60
|
+
datadir = $(DESTDIR)/usr/share
|
|
60
61
|
datarootdir = $(prefix)/share
|
|
61
|
-
libexecdir = $(DESTDIR)/usr/
|
|
62
|
-
sbindir = $(
|
|
62
|
+
libexecdir = $(DESTDIR)/usr/libexec
|
|
63
|
+
sbindir = $(DESTDIR)/usr/sbin
|
|
63
64
|
bindir = $(exec_prefix)/bin
|
|
64
65
|
archdir = $(rubyarchdir)
|
|
65
66
|
|
|
@@ -78,36 +79,36 @@ CSRCFLAG = $(empty)
|
|
|
78
79
|
|
|
79
80
|
RUBY_EXTCONF_H =
|
|
80
81
|
cflags = $(optflags) $(debugflags) $(warnflags)
|
|
81
|
-
cxxflags =
|
|
82
|
+
cxxflags =
|
|
82
83
|
optflags = -O3
|
|
83
84
|
debugflags = -ggdb3
|
|
84
|
-
warnflags = -Wall -Wextra -
|
|
85
|
+
warnflags = -Wall -Wextra -Wdeprecated-declarations -Wduplicated-cond -Wimplicit-function-declaration -Wimplicit-int -Wmisleading-indentation -Wpointer-arith -Wwrite-strings -Wimplicit-fallthrough=0 -Wmissing-noreturn -Wno-cast-function-type -Wno-constant-logical-operand -Wno-long-long -Wno-missing-field-initializers -Wno-overlength-strings -Wno-packed-bitfield-compat -Wno-parentheses-equality -Wno-self-assign -Wno-tautological-compare -Wno-unused-parameter -Wno-unused-value -Wsuggest-attribute=format -Wsuggest-attribute=noreturn -Wunused-variable
|
|
85
86
|
cppflags =
|
|
86
87
|
CCDLFLAGS = -fPIC
|
|
87
|
-
CFLAGS = $(CCDLFLAGS) -
|
|
88
|
+
CFLAGS = $(CCDLFLAGS) -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/redhat/redhat-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/redhat/redhat-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -fPIC $(ARCH_FLAG)
|
|
88
89
|
INCFLAGS = -I. -I$(arch_hdrdir) -I$(hdrdir)/ruby/backward -I$(hdrdir) -I$(srcdir)
|
|
89
90
|
DEFS =
|
|
90
|
-
CPPFLAGS =
|
|
91
|
-
CXXFLAGS = $(CCDLFLAGS) -
|
|
92
|
-
ldflags = -L. -Wl,-
|
|
93
|
-
dldflags = -Wl,-
|
|
91
|
+
CPPFLAGS = $(DEFS) $(cppflags)
|
|
92
|
+
CXXFLAGS = $(CCDLFLAGS) -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/redhat/redhat-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/redhat/redhat-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -fvisibility=hidden -std=c++11 $(ARCH_FLAG)
|
|
93
|
+
ldflags = -L. -Wl,-z,relro -Wl,--as-needed -Wl,-z,now -specs=/usr/lib/rpm/redhat/redhat-hardened-ld -fstack-protector-strong -rdynamic -Wl,-export-dynamic
|
|
94
|
+
dldflags = -Wl,-z,relro -Wl,--as-needed -Wl,-z,now -specs=/usr/lib/rpm/redhat/redhat-hardened-ld
|
|
94
95
|
ARCH_FLAG =
|
|
95
96
|
DLDFLAGS = $(ldflags) $(dldflags) $(ARCH_FLAG)
|
|
96
97
|
LDSHARED = $(CC) -shared
|
|
97
98
|
LDSHAREDXX = $(CXX) -shared
|
|
98
|
-
AR = ar
|
|
99
|
+
AR = gcc-ar
|
|
99
100
|
EXEEXT =
|
|
100
101
|
|
|
101
102
|
RUBY_INSTALL_NAME = $(RUBY_BASE_NAME)
|
|
102
103
|
RUBY_SO_NAME = ruby
|
|
103
104
|
RUBYW_INSTALL_NAME =
|
|
104
|
-
RUBY_VERSION_NAME = $(RUBY_BASE_NAME)-$(
|
|
105
|
+
RUBY_VERSION_NAME = $(RUBY_BASE_NAME)-$(ruby_version_dir_name)
|
|
105
106
|
RUBYW_BASE_NAME = rubyw
|
|
106
107
|
RUBY_BASE_NAME = ruby
|
|
107
108
|
|
|
108
|
-
arch =
|
|
109
|
+
arch = aarch64-linux
|
|
109
110
|
sitearch = $(arch)
|
|
110
|
-
ruby_version =
|
|
111
|
+
ruby_version = 3.0.0
|
|
111
112
|
ruby = $(bindir)/$(RUBY_BASE_NAME)
|
|
112
113
|
RUBY = $(ruby)
|
|
113
114
|
ruby_headers = $(hdrdir)/ruby.h $(hdrdir)/ruby/backward.h $(hdrdir)/ruby/ruby.h $(hdrdir)/ruby/defines.h $(hdrdir)/ruby/missing.h $(hdrdir)/ruby/intern.h $(hdrdir)/ruby/st.h $(hdrdir)/ruby/subst.h $(arch_hdrdir)/ruby/config.h
|
|
@@ -125,8 +126,8 @@ TOUCH = exit >
|
|
|
125
126
|
#### End of system configuration section. ####
|
|
126
127
|
|
|
127
128
|
preload =
|
|
128
|
-
libpath = . $(
|
|
129
|
-
LIBPATH = -L. -L$(
|
|
129
|
+
libpath = . $(archlibdir)
|
|
130
|
+
LIBPATH = -L. -L$(archlibdir)
|
|
130
131
|
DEFFILE =
|
|
131
132
|
|
|
132
133
|
CLEANFILES = mkmf.log
|
|
@@ -137,11 +138,11 @@ extout =
|
|
|
137
138
|
extout_prefix =
|
|
138
139
|
target_prefix =
|
|
139
140
|
LOCAL_LIBS =
|
|
140
|
-
LIBS =
|
|
141
|
+
LIBS = -lprotobuf -lpthread -lm -lc
|
|
141
142
|
ORIG_SRCS = base.cc embedding_feature_extractor.cc embedding_network.cc feature_extractor.cc feature_extractor.pb.cc feature_types.cc fixunicodevalue.cc fml_parser.cc generated_entities.cc generated_ulscript.cc getonescriptspan.cc lang_id_nn_params.cc language_identifier_features.cc nnet_language_identifier.cc nnet_language_identifier_c.cc offsetmap.cc registry.cc relevant_script_feature.cc sentence.pb.cc sentence_features.cc task_context.cc task_context_params.cc task_spec.pb.cc text_processing.cc unicodetext.cc utf8statetable.cc utils.cc workspace.cc
|
|
142
143
|
SRCS = $(ORIG_SRCS)
|
|
143
144
|
OBJS = base.o embedding_feature_extractor.o embedding_network.o feature_extractor.o feature_extractor.pb.o feature_types.o fixunicodevalue.o fml_parser.o generated_entities.o generated_ulscript.o getonescriptspan.o lang_id_nn_params.o language_identifier_features.o nnet_language_identifier.o nnet_language_identifier_c.o offsetmap.o registry.o relevant_script_feature.o sentence.pb.o sentence_features.o task_context.o task_context_params.o task_spec.pb.o text_processing.o unicodetext.o utf8statetable.o utils.o workspace.o
|
|
144
|
-
HDRS = $(srcdir)/base.h $(srcdir)/casts.h $(srcdir)/embedding_feature_extractor.h $(srcdir)/embedding_network.h $(srcdir)/embedding_network_params.h $(srcdir)/feature_extractor.h $(srcdir)/
|
|
145
|
+
HDRS = $(srcdir)/base.h $(srcdir)/casts.h $(srcdir)/embedding_feature_extractor.h $(srcdir)/embedding_network.h $(srcdir)/embedding_network_params.h $(srcdir)/feature_extractor.h $(srcdir)/feature_extractor.pb.h $(srcdir)/feature_types.h $(srcdir)/fixunicodevalue.h $(srcdir)/float16.h $(srcdir)/fml_parser.h $(srcdir)/generated_ulscript.h $(srcdir)/getonescriptspan.h $(srcdir)/integral_types.h $(srcdir)/lang_id_nn_params.h $(srcdir)/language_identifier_features.h $(srcdir)/nnet_language_identifier.h $(srcdir)/offsetmap.h $(srcdir)/port.h $(srcdir)/registry.h $(srcdir)/relevant_script_feature.h $(srcdir)/script_detector.h $(srcdir)/sentence.pb.h $(srcdir)/sentence_features.h $(srcdir)/simple_adder.h $(srcdir)/stringpiece.h $(srcdir)/task_context.h $(srcdir)/task_context_params.h $(srcdir)/task_spec.pb.h $(srcdir)/text_processing.h $(srcdir)/unicodetext.h $(srcdir)/utf8acceptinterchange.h $(srcdir)/utf8prop_lettermarkscriptnum.h $(srcdir)/utf8repl_lettermarklower.h $(srcdir)/utf8scannot_lettermarkspecial.h $(srcdir)/utf8statetable.h $(srcdir)/utils.h $(srcdir)/workspace.h
|
|
145
146
|
LOCAL_HDRS =
|
|
146
147
|
TARGET = libcld3
|
|
147
148
|
TARGET_NAME = libcld3
|
|
@@ -155,8 +156,8 @@ BINDIR = $(bindir)
|
|
|
155
156
|
RUBYCOMMONDIR = $(sitedir)$(target_prefix)
|
|
156
157
|
RUBYLIBDIR = $(sitelibdir)$(target_prefix)
|
|
157
158
|
RUBYARCHDIR = $(sitearchdir)$(target_prefix)
|
|
158
|
-
HDRDIR = $(
|
|
159
|
-
ARCHHDRDIR = $(
|
|
159
|
+
HDRDIR = $(sitehdrdir)$(target_prefix)
|
|
160
|
+
ARCHHDRDIR = $(sitearchhdrdir)$(target_prefix)
|
|
160
161
|
TARGET_SO_DIR =
|
|
161
162
|
TARGET_SO = $(TARGET_SO_DIR)$(DLLIB)
|
|
162
163
|
CLEANLIBS = $(TARGET_SO)
|
data/ext/cld3/base.o
CHANGED
|
Binary file
|
|
Binary file
|
|
@@ -167,6 +167,7 @@ EmbeddingNetwork::EmbeddingNetwork(const EmbeddingNetworkParams *model)
|
|
|
167
167
|
for (int i = 0; i < model_->embedding_dim_size(); ++i) {
|
|
168
168
|
CLD3_DCHECK(offset_sum == model_->concat_offset(i));
|
|
169
169
|
offset_sum += model_->embedding_dim(i) * model_->embedding_num_features(i);
|
|
170
|
+
(void)offset_sum; // Avoid compiler warning for "unused" variable.
|
|
170
171
|
embedding_matrices_.emplace_back(model_->GetEmbeddingMatrix(i));
|
|
171
172
|
}
|
|
172
173
|
|
|
Binary file
|
data/ext/cld3/extconf.rb
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
# Copyright 2017 Akihiko Odaki <akihiko.odaki
|
|
1
|
+
# Copyright 2017 Akihiko Odaki <akihiko.odaki@gmail.com>
|
|
2
2
|
# All Rights Reserved.
|
|
3
3
|
#
|
|
4
4
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
@@ -33,7 +33,7 @@ FileUtils.mkdir_p("cld_3/protos")
|
|
|
33
33
|
FileUtils.mkdir_p("script_span")
|
|
34
34
|
|
|
35
35
|
[ "feature_extractor", "sentence", "task_spec" ].each {|name|
|
|
36
|
-
|
|
36
|
+
system "protoc", "#{name}.proto", "--cpp_out=.", exception: true
|
|
37
37
|
ln_fallback("#{name}.pb.h", "cld_3/protos/#{name}.pb.h")
|
|
38
38
|
}
|
|
39
39
|
|
|
@@ -56,4 +56,5 @@ FileUtils.mkdir_p("script_span")
|
|
|
56
56
|
}
|
|
57
57
|
|
|
58
58
|
$CXXFLAGS += " -fvisibility=hidden -std=c++11"
|
|
59
|
+
$LIBRUBYARG = ""
|
|
59
60
|
create_makefile("libcld3")
|
|
Binary file
|
|
Binary file
|
data/ext/cld3/feature_types.o
CHANGED
|
Binary file
|
data/ext/cld3/fixunicodevalue.o
CHANGED
|
Binary file
|
data/ext/cld3/fml_parser.o
CHANGED
|
Binary file
|
|
Binary file
|
|
Binary file
|
data/ext/cld3/getonescriptspan.h
CHANGED
|
@@ -33,14 +33,14 @@ static const int kMaxScriptBytes = kMaxScriptBuffer - 32; // Leave some room
|
|
|
33
33
|
static const int kWithinScriptTail = 32; // Stop at word space in last
|
|
34
34
|
// N bytes of script buffer
|
|
35
35
|
|
|
36
|
-
|
|
36
|
+
struct LangSpan {
|
|
37
37
|
char* text = nullptr; // Pointer to the span, somewhere
|
|
38
38
|
int text_bytes = 0; // Number of bytes of text in the span
|
|
39
39
|
int offset = 0; // Offset of start of span in original input buffer
|
|
40
40
|
ULScript ulscript = UNKNOWN_ULSCRIPT; // Unicode Letters Script of this span
|
|
41
41
|
bool truncated = false; // true if buffer filled up before a
|
|
42
42
|
// different script or EOF was found
|
|
43
|
-
}
|
|
43
|
+
};
|
|
44
44
|
|
|
45
45
|
static inline bool IsContinuationByte(char c) {
|
|
46
46
|
return static_cast<signed char>(c) < -64;
|
data/ext/cld3/getonescriptspan.o
CHANGED
|
Binary file
|
|
Binary file
|
|
Binary file
|
data/ext/cld3/libcld3.so
CHANGED
|
Binary file
|
data/ext/cld3/mkmf.log
CHANGED
|
@@ -1,36 +1,37 @@
|
|
|
1
1
|
"pkg-config --exists protobuf"
|
|
2
2
|
| pkg-config --libs protobuf
|
|
3
|
-
=> "-lprotobuf \n"
|
|
4
|
-
"gcc -o conftest -I/usr/include
|
|
3
|
+
=> "-lprotobuf -lpthread \n"
|
|
4
|
+
"gcc -o conftest -I/usr/include -I/usr/include/ruby/backward -I/usr/include -I. -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/redhat/redhat-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/redhat/redhat-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -fPIC conftest.c -L. -L/usr/lib64 -L. -Wl,-z,relro -Wl,--as-needed -Wl,-z,now -specs=/usr/lib/rpm/redhat/redhat-hardened-ld -fstack-protector-strong -rdynamic -Wl,-export-dynamic -lruby -lm -lc"
|
|
5
5
|
checked program was:
|
|
6
6
|
/* begin */
|
|
7
7
|
1: #include "ruby.h"
|
|
8
8
|
2:
|
|
9
9
|
3: int main(int argc, char **argv)
|
|
10
10
|
4: {
|
|
11
|
-
5: return
|
|
11
|
+
5: return !!argv[argc];
|
|
12
12
|
6: }
|
|
13
13
|
/* end */
|
|
14
14
|
|
|
15
|
-
"gcc -o conftest -I/usr/include
|
|
15
|
+
"gcc -o conftest -I/usr/include -I/usr/include/ruby/backward -I/usr/include -I. -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/redhat/redhat-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/redhat/redhat-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -fPIC conftest.c -L. -L/usr/lib64 -L. -Wl,-z,relro -Wl,--as-needed -Wl,-z,now -specs=/usr/lib/rpm/redhat/redhat-hardened-ld -fstack-protector-strong -rdynamic -Wl,-export-dynamic -lruby -lprotobuf -lpthread -lm -lc"
|
|
16
16
|
checked program was:
|
|
17
17
|
/* begin */
|
|
18
18
|
1: #include "ruby.h"
|
|
19
19
|
2:
|
|
20
20
|
3: int main(int argc, char **argv)
|
|
21
21
|
4: {
|
|
22
|
-
5: return
|
|
22
|
+
5: return !!argv[argc];
|
|
23
23
|
6: }
|
|
24
24
|
/* end */
|
|
25
25
|
|
|
26
26
|
| pkg-config --cflags-only-I protobuf
|
|
27
27
|
=> "\n"
|
|
28
28
|
| pkg-config --cflags-only-other protobuf
|
|
29
|
-
=> "
|
|
29
|
+
=> "\n"
|
|
30
30
|
| pkg-config --libs-only-l protobuf
|
|
31
|
-
=> "-lprotobuf \n"
|
|
31
|
+
=> "-lprotobuf -lpthread \n"
|
|
32
32
|
package configuration for protobuf
|
|
33
|
-
|
|
33
|
+
incflags:
|
|
34
|
+
cflags:
|
|
34
35
|
ldflags:
|
|
35
|
-
libs: -lprotobuf
|
|
36
|
+
libs: -lprotobuf -lpthread
|
|
36
37
|
|
|
@@ -284,8 +284,6 @@ NNetLanguageIdentifier::FindTopNMostFreqLangs(const string &text,
|
|
|
284
284
|
CLD2::LangSpan script_span;
|
|
285
285
|
std::unordered_map<string, LangChunksStats> lang_stats;
|
|
286
286
|
int total_num_bytes = 0;
|
|
287
|
-
Result result;
|
|
288
|
-
string language;
|
|
289
287
|
int chunk_size = 0; // Use the default.
|
|
290
288
|
while (ss.GetOneScriptSpanLower(&script_span)) {
|
|
291
289
|
const int num_original_span_bytes = script_span.text_bytes;
|
|
@@ -302,8 +300,8 @@ NNetLanguageIdentifier::FindTopNMostFreqLangs(const string &text,
|
|
|
302
300
|
|
|
303
301
|
const string selected_text = SelectTextGivenScriptSpan(script_span);
|
|
304
302
|
|
|
305
|
-
result = FindLanguageOfValidUTF8(selected_text);
|
|
306
|
-
language = result.language;
|
|
303
|
+
Result result = FindLanguageOfValidUTF8(selected_text);
|
|
304
|
+
string language = result.language;
|
|
307
305
|
lang_stats[language].byte_sum += num_original_span_bytes;
|
|
308
306
|
lang_stats[language].prob_sum +=
|
|
309
307
|
result.probability * num_original_span_bytes;
|
|
@@ -356,7 +354,7 @@ string NNetLanguageIdentifier::SelectTextGivenBeginAndSize(
|
|
|
356
354
|
const char *text_begin, int text_size) {
|
|
357
355
|
string output_text;
|
|
358
356
|
|
|
359
|
-
// If the size of the input is greater than the
|
|
357
|
+
// If the size of the input is greater than the maximum number of bytes needed
|
|
360
358
|
// for a prediction, then concatenate snippets that are equally spread out
|
|
361
359
|
// throughout the input.
|
|
362
360
|
if (text_size > max_num_bytes_) {
|
|
Binary file
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
/* Copyright 2017 Akihiko Odaki <akihiko.odaki
|
|
1
|
+
/* Copyright 2017 Akihiko Odaki <akihiko.odaki@gmail.com>
|
|
2
2
|
All Rights Reserved.
|
|
3
3
|
|
|
4
4
|
Licensed under the Apache License, Version 2.0 (the "License");
|
|
@@ -26,42 +26,90 @@ limitations under the License.
|
|
|
26
26
|
#define EXPORT __attribute__ ((visibility ("default")))
|
|
27
27
|
#endif
|
|
28
28
|
|
|
29
|
-
struct NNetLanguageIdentifier {
|
|
30
|
-
chrome_lang_id::NNetLanguageIdentifier context;
|
|
31
|
-
std::string language;
|
|
32
|
-
};
|
|
33
|
-
|
|
34
29
|
struct Result {
|
|
35
30
|
struct {
|
|
36
31
|
const char *data;
|
|
37
32
|
std::size_t size;
|
|
38
33
|
} language;
|
|
34
|
+
struct {
|
|
35
|
+
const chrome_lang_id::NNetLanguageIdentifier::SpanInfo *data;
|
|
36
|
+
std::size_t size;
|
|
37
|
+
} byte_ranges;
|
|
39
38
|
float probability;
|
|
40
39
|
float proportion;
|
|
41
40
|
bool is_reliable;
|
|
42
41
|
};
|
|
43
42
|
|
|
43
|
+
struct OwningResult {
|
|
44
|
+
OwningResult(chrome_lang_id::NNetLanguageIdentifier::Result&& result) {
|
|
45
|
+
references.language = std::move(result.language);
|
|
46
|
+
references.byte_ranges = std::move(result.byte_ranges);
|
|
47
|
+
plain.language.data = references.language.data();
|
|
48
|
+
plain.language.size = references.language.size();
|
|
49
|
+
plain.byte_ranges.data = references.byte_ranges.data();
|
|
50
|
+
plain.byte_ranges.size = references.byte_ranges.size();
|
|
51
|
+
plain.probability = result.probability;
|
|
52
|
+
plain.proportion = result.proportion;
|
|
53
|
+
plain.is_reliable = result.is_reliable;
|
|
54
|
+
}
|
|
55
|
+
|
|
56
|
+
Result plain;
|
|
57
|
+
struct {
|
|
58
|
+
std::string language;
|
|
59
|
+
std::vector<chrome_lang_id::NNetLanguageIdentifier::SpanInfo> byte_ranges;
|
|
60
|
+
} references;
|
|
61
|
+
};
|
|
62
|
+
|
|
44
63
|
extern "C" {
|
|
45
|
-
EXPORT
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
instance->language = std::move(result.language);
|
|
51
|
-
|
|
52
|
-
return Result {
|
|
53
|
-
{ instance->language.data(), instance->language.size() },
|
|
54
|
-
result.probability,
|
|
55
|
-
result.proportion,
|
|
56
|
-
result.is_reliable
|
|
57
|
-
};
|
|
64
|
+
EXPORT OwningResult *NNetLanguageIdentifier_find_language(
|
|
65
|
+
chrome_lang_id::NNetLanguageIdentifier *instance,
|
|
66
|
+
const char *data,
|
|
67
|
+
std::size_t size) {
|
|
68
|
+
return new OwningResult(instance->FindLanguage(std::string(data, size)));
|
|
58
69
|
}
|
|
59
70
|
|
|
60
|
-
EXPORT
|
|
61
|
-
|
|
71
|
+
EXPORT std::vector<chrome_lang_id::NNetLanguageIdentifier::Result>*
|
|
72
|
+
NNetLanguageIdentifier_find_top_n_most_freq_langs(
|
|
73
|
+
chrome_lang_id::NNetLanguageIdentifier *instance,
|
|
74
|
+
const char *data, std::size_t size, int num_langs) {
|
|
75
|
+
std::string text(data, size);
|
|
76
|
+
return new auto(instance->FindTopNMostFreqLangs(text, num_langs));
|
|
62
77
|
}
|
|
63
78
|
|
|
64
|
-
EXPORT void
|
|
65
|
-
|
|
79
|
+
EXPORT void delete_NNetLanguageIdentifier(
|
|
80
|
+
chrome_lang_id::NNetLanguageIdentifier *pointer) {
|
|
81
|
+
delete pointer;
|
|
82
|
+
}
|
|
83
|
+
|
|
84
|
+
EXPORT void delete_result(OwningResult *pointer) {
|
|
85
|
+
delete pointer;
|
|
86
|
+
}
|
|
87
|
+
|
|
88
|
+
EXPORT void delete_results(
|
|
89
|
+
std::vector<chrome_lang_id::NNetLanguageIdentifier::Result> *pointer) {
|
|
90
|
+
delete pointer;
|
|
91
|
+
}
|
|
92
|
+
|
|
93
|
+
EXPORT chrome_lang_id::NNetLanguageIdentifier *new_NNetLanguageIdentifier(
|
|
94
|
+
int min_num_bytes, int max_num_bytes) {
|
|
95
|
+
return new chrome_lang_id::NNetLanguageIdentifier(
|
|
96
|
+
min_num_bytes, max_num_bytes);
|
|
97
|
+
}
|
|
98
|
+
|
|
99
|
+
EXPORT Result refer_to_nth_result(
|
|
100
|
+
std::vector<chrome_lang_id::NNetLanguageIdentifier::Result> *results,
|
|
101
|
+
std::size_t index) {
|
|
102
|
+
Result c;
|
|
103
|
+
auto& cc = (*results)[index];
|
|
104
|
+
|
|
105
|
+
c.language.data = cc.language.data();
|
|
106
|
+
c.language.size = cc.language.size();
|
|
107
|
+
c.byte_ranges.data = cc.byte_ranges.data();
|
|
108
|
+
c.byte_ranges.size = cc.byte_ranges.size();
|
|
109
|
+
c.probability = cc.probability;
|
|
110
|
+
c.proportion = cc.proportion;
|
|
111
|
+
c.is_reliable = cc.is_reliable;
|
|
112
|
+
|
|
113
|
+
return c;
|
|
66
114
|
}
|
|
67
115
|
}
|
|
Binary file
|
data/ext/cld3/offsetmap.o
CHANGED
|
Binary file
|
data/ext/cld3/registry.o
CHANGED
|
Binary file
|
|
Binary file
|
data/ext/cld3/sentence.pb.o
CHANGED
|
Binary file
|
|
@@ -19,11 +19,11 @@ limitations under the License.
|
|
|
19
19
|
|
|
20
20
|
namespace chrome_lang_id {
|
|
21
21
|
|
|
22
|
-
//
|
|
22
|
+
// Define registry for the whole Sentence feature functions. NOTE: this is not
|
|
23
23
|
// yet set to anything meaningful. It will be set so in NNetLanguageIdentifier
|
|
24
24
|
// constructor, *before* we use any feature.
|
|
25
25
|
template <>
|
|
26
|
-
WholeSentenceFeature::Registry
|
|
27
|
-
|
|
26
|
+
WholeSentenceFeature::Registry*
|
|
27
|
+
RegisterableClass<WholeSentenceFeature>::registry_ = nullptr;
|
|
28
28
|
|
|
29
|
-
} // namespace chrome_lang_id
|
|
29
|
+
} // namespace chrome_lang_id
|
|
@@ -26,9 +26,19 @@ limitations under the License.
|
|
|
26
26
|
namespace chrome_lang_id {
|
|
27
27
|
|
|
28
28
|
// Feature function that extracts features for the full Sentence.
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
29
|
+
using WholeSentenceFeature = FeatureFunction<Sentence>;
|
|
30
|
+
|
|
31
|
+
using WholeSentenceExtractor = FeatureExtractor<Sentence>;
|
|
32
|
+
|
|
33
|
+
// Declare registry for the whole Sentence feature functions. This is required
|
|
34
|
+
// for clang's -Wundefined-var-template. However, MSVC has a bug which treats
|
|
35
|
+
// this declaration as a definition, leading to multiple definition errors, so
|
|
36
|
+
// omit this on MSVC.
|
|
37
|
+
#if !defined(COMPILER_MSVC)
|
|
38
|
+
template <>
|
|
39
|
+
WholeSentenceFeature::Registry
|
|
40
|
+
*RegisterableClass<WholeSentenceFeature>::registry_;
|
|
41
|
+
#endif
|
|
32
42
|
|
|
33
43
|
} // namespace chrome_lang_id
|
|
34
44
|
|
|
Binary file
|
data/ext/cld3/task_context.o
CHANGED
|
Binary file
|
|
Binary file
|
data/ext/cld3/task_spec.pb.o
CHANGED
|
Binary file
|
data/ext/cld3/text_processing.o
CHANGED
|
Binary file
|
data/ext/cld3/unicodetext.o
CHANGED
|
Binary file
|
data/ext/cld3/utf8statetable.o
CHANGED
|
Binary file
|
data/ext/cld3/utils.o
CHANGED
|
Binary file
|
data/ext/cld3/workspace.o
CHANGED
|
Binary file
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
|
|
2
|
+
# Copyright 2021 Akihiko Odaki <akihiko.odaki@gmail.com>
|
|
3
|
+
# All Rights Reserved.
|
|
4
|
+
#
|
|
5
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
6
|
+
# you may not use this file except in compliance with the License.
|
|
7
|
+
# You may obtain a copy of the License at
|
|
8
|
+
#
|
|
9
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
10
|
+
#
|
|
11
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
12
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
13
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
14
|
+
# See the License for the specific language governing permissions and
|
|
15
|
+
# limitations under the License.
|
|
16
|
+
# ==============================================================================
|
|
17
|
+
|
|
18
|
+
module CLD3
|
|
19
|
+
module Unstable
|
|
20
|
+
extend FFI::Library
|
|
21
|
+
|
|
22
|
+
ffi_lib File.join(__dir__, "..", "..", "ext", "cld3", "libcld3." + RbConfig::CONFIG["DLEXT"])
|
|
23
|
+
|
|
24
|
+
module NNetLanguageIdentifier
|
|
25
|
+
class Pointer < FFI::AutoPointer
|
|
26
|
+
def self.release(pointer)
|
|
27
|
+
Unstable.delete_NNetLanguageIdentifier(pointer)
|
|
28
|
+
end
|
|
29
|
+
end
|
|
30
|
+
|
|
31
|
+
class SpanInfo < FFI::Struct
|
|
32
|
+
layout :start_index, :int, :end_index, :int, :probability, :float
|
|
33
|
+
end
|
|
34
|
+
|
|
35
|
+
class Result < FFI::Struct
|
|
36
|
+
layout :language_data, :pointer, :language_size, :size_t, :byte_ranges_data, :pointer, :byte_ranges_size, :size_t, :probability, :float, :proportion, :float, :reliable?, :bool
|
|
37
|
+
end
|
|
38
|
+
end
|
|
39
|
+
|
|
40
|
+
attach_function :delete_NNetLanguageIdentifier, [ :pointer ], :void
|
|
41
|
+
|
|
42
|
+
attach_function :delete_result, [ :pointer ], :void
|
|
43
|
+
|
|
44
|
+
attach_function :delete_results, [ :pointer ], :void
|
|
45
|
+
|
|
46
|
+
attach_function :new_NNetLanguageIdentifier, [ :int, :int ], :pointer
|
|
47
|
+
|
|
48
|
+
attach_function :refer_to_nth_result, [ :pointer, :size_t ], NNetLanguageIdentifier::Result.by_value
|
|
49
|
+
|
|
50
|
+
attach_function :NNetLanguageIdentifier_find_language,
|
|
51
|
+
[ :pointer, :buffer_in, :size_t ], :pointer
|
|
52
|
+
|
|
53
|
+
attach_function :NNetLanguageIdentifier_find_top_n_most_freq_langs,
|
|
54
|
+
[ :pointer, :buffer_in, :size_t, :int ], :pointer
|
|
55
|
+
end
|
|
56
|
+
|
|
57
|
+
private_constant :Unstable
|
|
58
|
+
end
|
data/lib/cld3.rb
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
# File including an implementation of CLD3 module. Some documentations are
|
|
2
2
|
# extracted from ext/cld3/ext/src/nnet_language_identifier.h.
|
|
3
3
|
#
|
|
4
|
-
# Copyright 2017 Akihiko Odaki <akihiko.odaki
|
|
4
|
+
# Copyright 2017 Akihiko Odaki <akihiko.odaki@gmail.com>
|
|
5
5
|
# All Rights Reserved.
|
|
6
6
|
#
|
|
7
7
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
@@ -19,6 +19,7 @@
|
|
|
19
19
|
|
|
20
20
|
require "ffi"
|
|
21
21
|
require "rbconfig"
|
|
22
|
+
require "cld3/unstable"
|
|
22
23
|
|
|
23
24
|
# Module providing an interface for Compact Language Detector v3 (CLD3)
|
|
24
25
|
module CLD3
|
|
@@ -49,10 +50,16 @@ module CLD3
|
|
|
49
50
|
# This is Numeric object.
|
|
50
51
|
RELIABILITY_HR_BS_THRESHOLD = 0.5
|
|
51
52
|
|
|
53
|
+
# Holds probability that Span, specified by start/end indices, is a given
|
|
54
|
+
# language. The language is not stored here; it can be found in Result, which
|
|
55
|
+
# holds an Array of SpanInfo.
|
|
56
|
+
# @type const SpanInfo: untyped
|
|
57
|
+
SpanInfo = Struct.new(:start_index, :end_index, :probability)
|
|
58
|
+
|
|
52
59
|
# Information about a predicted language.
|
|
53
60
|
# This is an instance of Struct with the following members:
|
|
54
61
|
#
|
|
55
|
-
# [language] This is symbol
|
|
62
|
+
# [language] This is symbol.
|
|
56
63
|
#
|
|
57
64
|
# [probability] Language probability. This is Numeric object.
|
|
58
65
|
#
|
|
@@ -61,33 +68,100 @@ module CLD3
|
|
|
61
68
|
# [proportion] Proportion of bytes associated with the language. If
|
|
62
69
|
# #find_language is called, this variable is set to 1.
|
|
63
70
|
# This is Numeric object.
|
|
64
|
-
|
|
71
|
+
#
|
|
72
|
+
# [byte_ranges] Specifies the byte ranges in UTF-8 that |language| applies to.
|
|
73
|
+
# This is an Array of SpanInfo.
|
|
74
|
+
# @type const Result: untyped
|
|
75
|
+
Result = Struct.new(:language, :probability, :reliable?, :proportion, :byte_ranges)
|
|
65
76
|
|
|
66
77
|
# The arguments are two Integer objects.
|
|
67
|
-
def initialize(
|
|
68
|
-
@cc = Unstable::NNetLanguageIdentifier::Pointer.new(Unstable.new_NNetLanguageIdentifier(
|
|
78
|
+
def initialize(min_num_bytes = MIN_NUM_BYTES_TO_CONSIDER, max_num_bytes = MAX_NUM_BYTES_TO_CONSIDER)
|
|
79
|
+
@cc = Unstable::NNetLanguageIdentifier::Pointer.new(Unstable.new_NNetLanguageIdentifier(min_num_bytes, max_num_bytes))
|
|
69
80
|
end
|
|
70
81
|
|
|
71
82
|
# Finds the most likely language for the given text, along with additional
|
|
72
83
|
# information (e.g., probability). The prediction is based on the first N
|
|
73
84
|
# bytes where N is the minimum between the number of interchange valid UTF8
|
|
74
85
|
# bytes and +max_num_bytes_+. If N is less than +min_num_bytes_+ long, then
|
|
75
|
-
# this function returns nil
|
|
86
|
+
# this function returns nil.
|
|
76
87
|
# The argument is a String object.
|
|
77
88
|
# The returned value of this function is an instance of Result.
|
|
78
89
|
def find_language(text)
|
|
79
90
|
text_utf8 = text.encode(Encoding::UTF_8)
|
|
80
91
|
pointer = FFI::MemoryPointer.new(:char, text_utf8.bytesize)
|
|
81
|
-
pointer.put_bytes(0, text_utf8)
|
|
82
92
|
|
|
83
|
-
|
|
84
|
-
|
|
93
|
+
begin
|
|
94
|
+
pointer.put_bytes(0, text_utf8)
|
|
95
|
+
|
|
96
|
+
result = Unstable.NNetLanguageIdentifier_find_language(@cc, pointer, text_utf8.bytesize)
|
|
97
|
+
begin
|
|
98
|
+
convert_result Unstable::NNetLanguageIdentifier::Result.new(result)
|
|
99
|
+
ensure
|
|
100
|
+
Unstable.delete_result result
|
|
101
|
+
end
|
|
102
|
+
ensure
|
|
103
|
+
pointer.free
|
|
104
|
+
end
|
|
105
|
+
end
|
|
106
|
+
|
|
107
|
+
# Splits the input text (up to the first byte, if any, that is not
|
|
108
|
+
# interchange valid UTF8) into spans based on the script, predicts a language
|
|
109
|
+
# for each span, and returns a vector storing the top num_langs most frequent
|
|
110
|
+
# languages along with additional information (e.g., proportions). The number
|
|
111
|
+
# of bytes considered for each span is the minimum between the size of the
|
|
112
|
+
# span and +max_num_bytes_+. If more languages are requested than what is
|
|
113
|
+
# available in the input, then the number of the returned elements will be
|
|
114
|
+
# the number of the latter. Also, if the size of the span is less than
|
|
115
|
+
# +min_num_bytes_+ long, then the span is skipped. If the input text is too
|
|
116
|
+
# long, only the first +MAX_NUM_INPUT_BYTES_TO_CONSIDER+ bytes are processed.
|
|
117
|
+
# The first argument is a String object.
|
|
118
|
+
# The second argument is Numeric object.
|
|
119
|
+
# The returned value of this function is an Array of Result instances.
|
|
120
|
+
def find_top_n_most_freq_langs(text, num_langs)
|
|
121
|
+
# @type var a: untyped
|
|
122
|
+
|
|
123
|
+
text_utf8 = text.encode(Encoding::UTF_8)
|
|
124
|
+
pointer = FFI::MemoryPointer.new(:char, text_utf8.bytesize)
|
|
125
|
+
|
|
126
|
+
begin
|
|
127
|
+
pointer.put_bytes(0, text_utf8)
|
|
128
|
+
|
|
129
|
+
results = Unstable.NNetLanguageIdentifier_find_top_n_most_freq_langs(@cc, pointer, text_utf8.bytesize, num_langs)
|
|
130
|
+
begin
|
|
131
|
+
a = num_langs.times
|
|
132
|
+
.lazy
|
|
133
|
+
.map { |index| convert_result Unstable.refer_to_nth_result(results, index) }
|
|
134
|
+
.take_while { |result| !result.nil? }
|
|
135
|
+
.to_a
|
|
136
|
+
|
|
137
|
+
a
|
|
138
|
+
ensure
|
|
139
|
+
Unstable.delete_results results
|
|
140
|
+
end
|
|
141
|
+
ensure
|
|
142
|
+
pointer.free
|
|
143
|
+
end
|
|
144
|
+
end
|
|
145
|
+
|
|
146
|
+
private
|
|
147
|
+
|
|
148
|
+
def convert_result(result)
|
|
149
|
+
language = result[:language_data].read_bytes(result[:language_size])
|
|
150
|
+
return nil if language == "und"
|
|
151
|
+
|
|
152
|
+
cursor = result[:byte_ranges_data]
|
|
153
|
+
byte_ranges = result[:byte_ranges_size].times.map do
|
|
154
|
+
info = Unstable::NNetLanguageIdentifier::SpanInfo.new(cursor)
|
|
155
|
+
cursor += Unstable::NNetLanguageIdentifier::SpanInfo.size
|
|
156
|
+
SpanInfo.new(info[:start_index], info[:end_index], info[:probability])
|
|
157
|
+
end
|
|
85
158
|
|
|
86
159
|
Result.new(
|
|
87
|
-
language
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
160
|
+
language.to_sym,
|
|
161
|
+
result[:probability],
|
|
162
|
+
result[:reliable?],
|
|
163
|
+
result[:proportion],
|
|
164
|
+
byte_ranges)
|
|
91
165
|
end
|
|
92
166
|
end
|
|
93
167
|
|
|
@@ -95,6 +169,7 @@ module CLD3
|
|
|
95
169
|
# The model weights are loaded statically.
|
|
96
170
|
module TaskContextParams
|
|
97
171
|
# This is a frozen Array object containing symbols.
|
|
172
|
+
# @type const LANGUAGE_NAMES: untyped
|
|
98
173
|
LANGUAGE_NAMES = [
|
|
99
174
|
:eo, :co, :eu, :ta, :de, :mt, :ps, :te, :su, :uz, :'zh-Latn', :ne,
|
|
100
175
|
:nl, :sw, :sq, :hmn, :ja, :no, :mn, :so, :ko, :kk, :sl, :ig,
|
|
@@ -108,31 +183,4 @@ module CLD3
|
|
|
108
183
|
:sn, :yo, :pa, :ku,
|
|
109
184
|
].freeze
|
|
110
185
|
end
|
|
111
|
-
|
|
112
|
-
module Unstable
|
|
113
|
-
extend FFI::Library
|
|
114
|
-
|
|
115
|
-
ffi_lib File.join(File.expand_path(File.dirname(__FILE__)), "..", "ext", "cld3", "libcld3." + RbConfig::CONFIG["DLEXT"])
|
|
116
|
-
|
|
117
|
-
module NNetLanguageIdentifier
|
|
118
|
-
class Pointer < FFI::AutoPointer
|
|
119
|
-
def self.release(pointer)
|
|
120
|
-
Unstable.delete_NNetLanguageIdentifier(pointer)
|
|
121
|
-
end
|
|
122
|
-
end
|
|
123
|
-
|
|
124
|
-
class Result < FFI::Struct
|
|
125
|
-
layout :language_data, :pointer, :language_size, :size_t, :probability, :float, :proportion, :float, :reliable?, :bool
|
|
126
|
-
end
|
|
127
|
-
end
|
|
128
|
-
|
|
129
|
-
attach_function :delete_NNetLanguageIdentifier, [ :pointer ], :void
|
|
130
|
-
|
|
131
|
-
attach_function :new_NNetLanguageIdentifier, [ :int, :int ], :pointer
|
|
132
|
-
|
|
133
|
-
attach_function :NNetLanguageIdentifier_find_language,
|
|
134
|
-
[ :pointer, :buffer_in, :size_t ], NNetLanguageIdentifier::Result.by_value
|
|
135
|
-
end
|
|
136
|
-
|
|
137
|
-
private_constant :Unstable
|
|
138
186
|
end
|
data/sig/cld3.rbs
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
# Copyright 2021 Akihiko Odaki <akihiko.odaki@gmail.com>
|
|
2
|
+
# All Rights Reserved.
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
# ==============================================================================
|
|
16
|
+
|
|
17
|
+
module CLD3
|
|
18
|
+
class NNetLanguageIdentifier
|
|
19
|
+
MIN_NUM_BYTES_TO_CONSIDER: Integer
|
|
20
|
+
MAX_NUM_BYTES_TO_CONSIDER: Integer
|
|
21
|
+
MAX_NUM_INPUT_BYTES_TO_CONSIDER: Integer
|
|
22
|
+
RELIABILITY_THRESHOLD: Float
|
|
23
|
+
RELIABILITY_HR_BS_THRESHOLD: Float
|
|
24
|
+
|
|
25
|
+
class SpanInfo < Struct[Float | Integer]
|
|
26
|
+
attr_accessor start_index(): Integer
|
|
27
|
+
attr_accessor end_index(): Integer
|
|
28
|
+
attr_accessor probability(): Float
|
|
29
|
+
end
|
|
30
|
+
|
|
31
|
+
class Result < Struct[Array[SpanInfo] | Float | TaskContextParams::language_names | bool]
|
|
32
|
+
attr_accessor language(): TaskContextParams::language_names
|
|
33
|
+
attr_accessor probability(): Float
|
|
34
|
+
attr_accessor reliable?(): bool
|
|
35
|
+
attr_accessor proportion(): Float
|
|
36
|
+
attr_accessor byte_ranges(): Array[SpanInfo]
|
|
37
|
+
end
|
|
38
|
+
|
|
39
|
+
def initialize: (?Integer, ?Integer) -> void
|
|
40
|
+
def find_language: (String) -> Result?
|
|
41
|
+
def find_top_n_most_freq_langs: (String, Integer) -> Array[Result]
|
|
42
|
+
|
|
43
|
+
private
|
|
44
|
+
|
|
45
|
+
def convert_result: (untyped) -> Result?
|
|
46
|
+
end
|
|
47
|
+
|
|
48
|
+
module TaskContextParams
|
|
49
|
+
type language_names =
|
|
50
|
+
:eo | :co | :eu | :ta | :de | :mt | :ps | :te | :su | :uz | :'zh-Latn' | :ne |
|
|
51
|
+
:nl | :sw | :sq | :hmn | :ja | :no | :mn | :so | :ko | :kk | :sl | :ig |
|
|
52
|
+
:mr | :th | :zu | :ml | :hr | :bs | :lo | :sd | :cy | :hy | :uk | :pt |
|
|
53
|
+
:lv | :iw | :cs | :vi | :jv | :be | :km | :mk | :tr | :fy | :am | :zh |
|
|
54
|
+
:da | :sv | :fi | :ht | :af | :la | :id | :fil | :sm | :ca | :el | :ka |
|
|
55
|
+
:sr | :it | :sk | :ru | :'ru-Latn' | :bg | :ny | :fa | :haw | :gl | :et |
|
|
56
|
+
:ms | :gd | :'bg-Latn' | :ha | :is | :ur | :mi | :hi | :bn | :'hi-Latn' | :fr |
|
|
57
|
+
:yi | :hu | :xh | :my | :tg | :ro | :ar | :lb | :'el-Latn' | :st | :ceb |
|
|
58
|
+
:kn | :az | :si | :ky | :mg | :en | :gu | :es | :pl | :'ja-Latn' | :ga | :lt |
|
|
59
|
+
:sn | :yo | :pa | :ku
|
|
60
|
+
|
|
61
|
+
LANGUAGE_NAMES: Array[language_names]
|
|
62
|
+
end
|
|
63
|
+
|
|
64
|
+
Unstable: untyped
|
|
65
|
+
end
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: cld3
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 3.2.6
|
|
4
|
+
version: 3.4.3
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Akihiko Odaki
|
|
8
|
-
autorequire:
|
|
8
|
+
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date:
|
|
11
|
+
date: 2021-11-25 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: ffi
|
|
@@ -19,7 +19,7 @@ dependencies:
|
|
|
19
19
|
version: 1.1.0
|
|
20
20
|
- - "<"
|
|
21
21
|
- !ruby/object:Gem::Version
|
|
22
|
-
version: 1.
|
|
22
|
+
version: 1.16.0
|
|
23
23
|
type: :runtime
|
|
24
24
|
prerelease: false
|
|
25
25
|
version_requirements: !ruby/object:Gem::Requirement
|
|
@@ -29,7 +29,27 @@ dependencies:
|
|
|
29
29
|
version: 1.1.0
|
|
30
30
|
- - "<"
|
|
31
31
|
- !ruby/object:Gem::Version
|
|
32
|
-
version: 1.
|
|
32
|
+
version: 1.16.0
|
|
33
|
+
- !ruby/object:Gem::Dependency
|
|
34
|
+
name: rbs
|
|
35
|
+
requirement: !ruby/object:Gem::Requirement
|
|
36
|
+
requirements:
|
|
37
|
+
- - ">="
|
|
38
|
+
- !ruby/object:Gem::Version
|
|
39
|
+
version: 1.7.0
|
|
40
|
+
- - "<"
|
|
41
|
+
- !ruby/object:Gem::Version
|
|
42
|
+
version: 1.8.0
|
|
43
|
+
type: :development
|
|
44
|
+
prerelease: false
|
|
45
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
46
|
+
requirements:
|
|
47
|
+
- - ">="
|
|
48
|
+
- !ruby/object:Gem::Version
|
|
49
|
+
version: 1.7.0
|
|
50
|
+
- - "<"
|
|
51
|
+
- !ruby/object:Gem::Version
|
|
52
|
+
version: 1.8.0
|
|
33
53
|
- !ruby/object:Gem::Dependency
|
|
34
54
|
name: rspec
|
|
35
55
|
requirement: !ruby/object:Gem::Requirement
|
|
@@ -39,7 +59,7 @@ dependencies:
|
|
|
39
59
|
version: 3.0.0
|
|
40
60
|
- - "<"
|
|
41
61
|
- !ruby/object:Gem::Version
|
|
42
|
-
version: 3.
|
|
62
|
+
version: 3.11.0
|
|
43
63
|
type: :development
|
|
44
64
|
prerelease: false
|
|
45
65
|
version_requirements: !ruby/object:Gem::Requirement
|
|
@@ -49,10 +69,30 @@ dependencies:
|
|
|
49
69
|
version: 3.0.0
|
|
50
70
|
- - "<"
|
|
51
71
|
- !ruby/object:Gem::Version
|
|
52
|
-
version: 3.
|
|
72
|
+
version: 3.11.0
|
|
73
|
+
- !ruby/object:Gem::Dependency
|
|
74
|
+
name: steep
|
|
75
|
+
requirement: !ruby/object:Gem::Requirement
|
|
76
|
+
requirements:
|
|
77
|
+
- - ">="
|
|
78
|
+
- !ruby/object:Gem::Version
|
|
79
|
+
version: 0.46.0
|
|
80
|
+
- - "<"
|
|
81
|
+
- !ruby/object:Gem::Version
|
|
82
|
+
version: 0.47.0
|
|
83
|
+
type: :development
|
|
84
|
+
prerelease: false
|
|
85
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
86
|
+
requirements:
|
|
87
|
+
- - ">="
|
|
88
|
+
- !ruby/object:Gem::Version
|
|
89
|
+
version: 0.46.0
|
|
90
|
+
- - "<"
|
|
91
|
+
- !ruby/object:Gem::Version
|
|
92
|
+
version: 0.47.0
|
|
53
93
|
description: Compact Language Detector v3 (CLD3) is a neural network model for language
|
|
54
94
|
identification.
|
|
55
|
-
email: akihiko.odaki
|
|
95
|
+
email: akihiko.odaki@gmail.com
|
|
56
96
|
executables: []
|
|
57
97
|
extensions:
|
|
58
98
|
- ext/cld3/extconf.rb
|
|
@@ -106,6 +146,7 @@ files:
|
|
|
106
146
|
- ext/cld3/language_identifier_features.cc
|
|
107
147
|
- ext/cld3/language_identifier_features.h
|
|
108
148
|
- ext/cld3/language_identifier_features.o
|
|
149
|
+
- ext/cld3/libcld3.def
|
|
109
150
|
- ext/cld3/libcld3.so
|
|
110
151
|
- ext/cld3/mkmf.log
|
|
111
152
|
- ext/cld3/nnet_language_identifier.cc
|
|
@@ -159,11 +200,13 @@ files:
|
|
|
159
200
|
- ext/cld3/workspace.h
|
|
160
201
|
- ext/cld3/workspace.o
|
|
161
202
|
- lib/cld3.rb
|
|
203
|
+
- lib/cld3/unstable.rb
|
|
204
|
+
- sig/cld3.rbs
|
|
162
205
|
homepage: https://github.com/akihikodaki/cld3-ruby
|
|
163
206
|
licenses:
|
|
164
207
|
- Apache-2.0
|
|
165
208
|
metadata: {}
|
|
166
|
-
post_install_message:
|
|
209
|
+
post_install_message:
|
|
167
210
|
rdoc_options: []
|
|
168
211
|
require_paths:
|
|
169
212
|
- lib
|
|
@@ -171,18 +214,18 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
|
171
214
|
requirements:
|
|
172
215
|
- - ">="
|
|
173
216
|
- !ruby/object:Gem::Version
|
|
174
|
-
version: 2.
|
|
217
|
+
version: 2.6.0
|
|
175
218
|
- - "<"
|
|
176
219
|
- !ruby/object:Gem::Version
|
|
177
|
-
version: 2.
|
|
220
|
+
version: 3.2.0
|
|
178
221
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
179
222
|
requirements:
|
|
180
223
|
- - ">="
|
|
181
224
|
- !ruby/object:Gem::Version
|
|
182
225
|
version: '0'
|
|
183
226
|
requirements: []
|
|
184
|
-
rubygems_version: 3.
|
|
185
|
-
signing_key:
|
|
227
|
+
rubygems_version: 3.2.22
|
|
228
|
+
signing_key:
|
|
186
229
|
specification_version: 4
|
|
187
230
|
summary: Compact Language Detector v3 (CLD3)
|
|
188
231
|
test_files: []
|