cld3 3.2.6 → 3.4.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (48) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile +1 -1
  3. data/LICENSE +2 -2
  4. data/README.md +3 -3
  5. data/cld3.gemspec +9 -7
  6. data/ext/cld3/Makefile +45 -44
  7. data/ext/cld3/base.o +0 -0
  8. data/ext/cld3/embedding_feature_extractor.o +0 -0
  9. data/ext/cld3/embedding_network.cc +1 -0
  10. data/ext/cld3/embedding_network.o +0 -0
  11. data/ext/cld3/extconf.rb +3 -2
  12. data/ext/cld3/feature_extractor.o +0 -0
  13. data/ext/cld3/feature_extractor.pb.o +0 -0
  14. data/ext/cld3/feature_types.o +0 -0
  15. data/ext/cld3/fixunicodevalue.o +0 -0
  16. data/ext/cld3/fml_parser.o +0 -0
  17. data/ext/cld3/generated_entities.o +0 -0
  18. data/ext/cld3/generated_ulscript.o +0 -0
  19. data/ext/cld3/getonescriptspan.h +2 -2
  20. data/ext/cld3/getonescriptspan.o +0 -0
  21. data/ext/cld3/lang_id_nn_params.o +0 -0
  22. data/ext/cld3/language_identifier_features.o +0 -0
  23. data/ext/cld3/libcld3.def +8 -0
  24. data/ext/cld3/libcld3.so +0 -0
  25. data/ext/cld3/mkmf.log +10 -9
  26. data/ext/cld3/nnet_language_identifier.cc +3 -5
  27. data/ext/cld3/nnet_language_identifier.o +0 -0
  28. data/ext/cld3/nnet_language_identifier_c.cc +71 -23
  29. data/ext/cld3/nnet_language_identifier_c.o +0 -0
  30. data/ext/cld3/offsetmap.o +0 -0
  31. data/ext/cld3/registry.o +0 -0
  32. data/ext/cld3/relevant_script_feature.o +0 -0
  33. data/ext/cld3/sentence.pb.o +0 -0
  34. data/ext/cld3/sentence_features.cc +4 -4
  35. data/ext/cld3/sentence_features.h +13 -3
  36. data/ext/cld3/sentence_features.o +0 -0
  37. data/ext/cld3/task_context.o +0 -0
  38. data/ext/cld3/task_context_params.o +0 -0
  39. data/ext/cld3/task_spec.pb.o +0 -0
  40. data/ext/cld3/text_processing.o +0 -0
  41. data/ext/cld3/unicodetext.o +0 -0
  42. data/ext/cld3/utf8statetable.o +0 -0
  43. data/ext/cld3/utils.o +0 -0
  44. data/ext/cld3/workspace.o +0 -0
  45. data/lib/cld3/unstable.rb +58 -0
  46. data/lib/cld3.rb +88 -40
  47. data/sig/cld3.rbs +65 -0
  48. metadata +56 -13
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 0ee3c6166247aaf958310ffa9976400fcfa5050eb1969dd17e186e3500dd06d9
4
- data.tar.gz: bfa75958e205683dfa2429d388bb40d18d237ef5c2b5877a9fd718489f95b7bd
3
+ metadata.gz: 2c161cbf12d260074efd2e9db3981b6615af20ee04c234d6b2710bd52a283a4e
4
+ data.tar.gz: c388ae6b529d95e015ecdb7d21cdd7f1ceaca72d167d0f8008b5477d5bce5b3c
5
5
  SHA512:
6
- metadata.gz: ac1fb08ebf438995878bb7c992bc2e2a71adf9d0f06f01316121b6d8d48f5b8f2f1ea9a3f68f501dad6682168b7a3e16b0137be16cae7ad876d0bce9f6d866e7
7
- data.tar.gz: 40e1036c1c7e08af1caed6efd187a04bb8883f9ff427c61824193f72f58e696279611575f7a63b8cbd80fff2c9f20da644807e71be2ea281a8d870e3721410bd
6
+ metadata.gz: 8e3c1c07283730e722c450acc308a497756fd501595a02a7fc066d0b3e59b96e1ab1e7941549293b02e41274b176772bdae3779a041eb28f8ae53f5c44308cc0
7
+ data.tar.gz: 52e95027de7a595b2eabc49745a11f664e305c18f9926bc9d649642a92fea9846efdd23da699529795d80609b8871b00e77f9379449d2e4f6cb79ecbcf2785db
data/Gemfile CHANGED
@@ -1,4 +1,4 @@
1
- # Copyright 2017 Akihiko Odaki <akihiko.odaki.4i@stu.hosei.ac.jp>
1
+ # Copyright 2017 Akihiko Odaki <akihiko.odaki@gmail.com>
2
2
  # All Rights Reserved.
3
3
  #
4
4
  # Licensed under the Apache License, Version 2.0 (the "License");
data/LICENSE CHANGED
@@ -1,4 +1,4 @@
1
- Copyright 2017 Akihiko Odaki <akihiko.odaki.4i@stu.hosei.ac.jp>
1
+ Copyright 2017 Akihiko Odaki <akihiko.odaki@gmail.com>
2
2
  All rights reserved.
3
3
 
4
4
  Apache License
@@ -189,7 +189,7 @@ All rights reserved.
189
189
  same "printed page" as the copyright notice for easier
190
190
  identification within third-party archives.
191
191
 
192
- Copyright 2017, Akihiko Odaki <akihiko.odaki.4i@stu.hosei.ac.jp>
192
+ Copyright 2017, Akihiko Odaki <akihiko.odaki@gmail.com>
193
193
 
194
194
  Licensed under the Apache License, Version 2.0 (the "License");
195
195
  you may not use this file except in compliance with the License.
data/README.md CHANGED
@@ -8,11 +8,11 @@ require 'cld3'
8
8
 
9
9
  cld3 = CLD3::NNetLanguageIdentifier.new(0, 1000)
10
10
 
11
- cld3.find_language("こんにちは") # => #<struct Struct::Result language=:ja, probability=1.0, reliable?=true, proportion=1.0>
11
+ cld3.find_language("こんにちは") # => #<struct Struct::Result language=:ja, probability=1.0, reliable?=true, proportion=1.0, byte_ranges=[]>
12
12
 
13
- cld3.find_language("This is a pen.") # => #<struct Struct::Result language=:en, probability=0.9999408721923828, reliable?=true, proportion=1.0>
13
+ cld3.find_language("This is a pen.") # => #<struct Struct::Result language=:en, probability=0.9999408721923828, reliable?=true, proportion=1.0, byte_ranges=[]>
14
14
 
15
- cld3.find_language("здравствуйте") # => #<struct Struct::Result language=:ru, probability=0.3140212297439575, reliable?=false, proportion=1.0>
15
+ cld3.find_language("здравствуйте") # => #<struct Struct::Result language=:ru, probability=0.3140212297439575, reliable?=false, proportion=1.0, byte_ranges=[]>
16
16
  ```
17
17
 
18
18
  ## Installation
data/cld3.gemspec CHANGED
@@ -1,4 +1,4 @@
1
- # Copyright 2017 Akihiko Odaki <akihiko.odaki.4i@stu.hosei.ac.jp>
1
+ # Copyright 2017 Akihiko Odaki <akihiko.odaki@gmail.com>
2
2
  # All Rights Reserved.
3
3
  #
4
4
  # Licensed under the Apache License, Version 2.0 (the "License");
@@ -16,19 +16,21 @@
16
16
 
17
17
  Gem::Specification.new do |gem|
18
18
  gem.name = "cld3"
19
- gem.version = "3.2.6"
19
+ gem.version = "3.4.3"
20
20
  gem.summary = "Compact Language Detector v3 (CLD3)"
21
21
  gem.description = "Compact Language Detector v3 (CLD3) is a neural network model for language identification."
22
22
  gem.license = "Apache-2.0"
23
23
  gem.homepage = "https://github.com/akihikodaki/cld3-ruby"
24
24
  gem.author = "Akihiko Odaki"
25
- gem.email = "akihiko.odaki.4i@stu.hosei.ac.jp"
26
- gem.required_ruby_version = [ ">= 2.3.0", "< 2.8.0" ]
27
- gem.add_dependency "ffi", [ ">= 1.1.0", "< 1.12.0" ]
28
- gem.add_development_dependency "rspec", [ ">=3.0.0", "< 3.10.0" ]
25
+ gem.email = "akihiko.odaki@gmail.com"
26
+ gem.required_ruby_version = [ ">= 2.6.0", "< 3.2.0" ]
27
+ gem.add_dependency "ffi", [ ">= 1.1.0", "< 1.16.0" ]
28
+ gem.add_development_dependency "rbs", [ ">= 1.7.0", "< 1.8.0" ]
29
+ gem.add_development_dependency "rspec", [ ">=3.0.0", "< 3.11.0" ]
30
+ gem.add_development_dependency "steep", [ ">= 0.46.0", "< 0.47.0" ]
29
31
  gem.files = Dir[
30
32
  "Gemfile", "LICENSE", "LICENSE_CLD3", "README.md",
31
- "cld3.gemspec", "ext/**/*", "lib/**/*"
33
+ "cld3.gemspec", "ext/**/*", "lib/**/*", "sig/**/*"
32
34
  ]
33
35
  gem.require_paths = [ "lib" ]
34
36
  gem.extensions = [ "ext/cld3/extconf.rb" ]
data/ext/cld3/Makefile CHANGED
@@ -2,7 +2,7 @@
2
2
  SHELL = /bin/sh
3
3
 
4
4
  # V=0 quiet, V=1 verbose. other values don't work.
5
- V = 0
5
+ V = 1
6
6
  Q1 = $(V:1=)
7
7
  Q = $(Q1:0=@)
8
8
  ECHO1 = $(V:1=@ :)
@@ -12,54 +12,55 @@ NULLCMD = :
12
12
  #### Start of system configuration section. ####
13
13
 
14
14
  srcdir = .
15
- topdir = /usr/include/ruby-2.6.0
15
+ topdir = /usr/include
16
16
  hdrdir = $(topdir)
17
- arch_hdrdir = /usr/include/ruby-2.6.0/x86_64-linux
17
+ arch_hdrdir = /usr/include
18
18
  PATH_SEPARATOR = :
19
19
  VPATH = $(srcdir):$(arch_hdrdir)/ruby:$(hdrdir)/ruby
20
20
  prefix = $(DESTDIR)/usr
21
- rubysitearchprefix = $(rubylibprefix)/$(sitearch)
22
- rubyarchprefix = $(rubylibprefix)/$(arch)
23
- rubylibprefix = $(libdir)/$(RUBY_BASE_NAME)
24
- exec_prefix = $(prefix)
25
- vendorarchhdrdir = $(vendorhdrdir)/$(sitearch)
26
- sitearchhdrdir = $(sitehdrdir)/$(sitearch)
27
- rubyarchhdrdir = $(rubyhdrdir)/$(arch)
21
+ rubysitearchprefix = $(sitearchlibdir)/$(RUBY_BASE_NAME)
22
+ rubyarchprefix = $(DESTDIR)/usr/lib64/ruby
23
+ rubylibprefix = $(exec_prefix)/share/ruby
24
+ exec_prefix = $(DESTDIR)/usr
25
+ vendorarchhdrdir = $(vendorhdrdir)/$(arch)
26
+ sitearchhdrdir = $(sitehdrdir)/$(arch)
27
+ rubyarchhdrdir = $(DESTDIR)/usr/include
28
28
  vendorhdrdir = $(rubyhdrdir)/vendor_ruby
29
29
  sitehdrdir = $(rubyhdrdir)/site_ruby
30
- rubyhdrdir = $(includedir)/$(RUBY_VERSION_NAME)
31
- vendorarchdir = $(vendorlibdir)/$(sitearch)
32
- vendorlibdir = $(vendordir)/$(ruby_version)
33
- vendordir = $(rubylibprefix)/vendor_ruby
34
- sitearchdir = $(sitelibdir)/$(sitearch)
35
- sitelibdir = $(sitedir)/$(ruby_version)
36
- sitedir = $(rubylibprefix)/site_ruby
37
- rubyarchdir = $(rubylibdir)/$(arch)
38
- rubylibdir = $(rubylibprefix)/$(ruby_version)
30
+ rubyhdrdir = $(DESTDIR)/usr/include
31
+ rubygemsdir = $(DESTDIR)/usr/share/rubygems
32
+ vendorarchdir = $(DESTDIR)/usr/lib64/ruby/vendor_ruby
33
+ vendorlibdir = $(vendordir)
34
+ vendordir = $(DESTDIR)/usr/share/ruby/vendor_ruby
35
+ sitearchdir = $(DESTDIR)/usr/local/lib64/ruby/site_ruby
36
+ sitelibdir = $(sitedir)
37
+ sitedir = $(DESTDIR)/usr/local/share/ruby/site_ruby
38
+ rubyarchdir = $(rubyarchprefix)
39
+ rubylibdir = $(rubylibprefix)
39
40
  sitearchincludedir = $(includedir)/$(sitearch)
40
41
  archincludedir = $(includedir)/$(arch)
41
42
  sitearchlibdir = $(libdir)/$(sitearch)
42
- archlibdir = $(libdir)/$(arch)
43
+ archlibdir = $(DESTDIR)/usr/lib64
43
44
  ridir = $(datarootdir)/$(RI_BASE_NAME)
44
- mandir = $(datarootdir)/man
45
+ mandir = $(DESTDIR)/usr/share/man
45
46
  localedir = $(datarootdir)/locale
46
- libdir = $(exec_prefix)/lib
47
+ libdir = $(exec_prefix)/lib64
47
48
  psdir = $(docdir)
48
49
  pdfdir = $(docdir)
49
50
  dvidir = $(docdir)
50
51
  htmldir = $(docdir)
51
- infodir = $(datarootdir)/info
52
+ infodir = $(DESTDIR)/usr/share/info
52
53
  docdir = $(datarootdir)/doc/$(PACKAGE)
53
54
  oldincludedir = $(DESTDIR)/usr/include
54
- includedir = $(prefix)/include
55
+ includedir = $(DESTDIR)/usr/include
55
56
  runstatedir = $(localstatedir)/run
56
57
  localstatedir = $(DESTDIR)/var
57
58
  sharedstatedir = $(DESTDIR)/var/lib
58
59
  sysconfdir = $(DESTDIR)/etc
59
- datadir = $(datarootdir)
60
+ datadir = $(DESTDIR)/usr/share
60
61
  datarootdir = $(prefix)/share
61
- libexecdir = $(DESTDIR)/usr/lib/ruby
62
- sbindir = $(exec_prefix)/sbin
62
+ libexecdir = $(DESTDIR)/usr/libexec
63
+ sbindir = $(DESTDIR)/usr/sbin
63
64
  bindir = $(exec_prefix)/bin
64
65
  archdir = $(rubyarchdir)
65
66
 
@@ -78,36 +79,36 @@ CSRCFLAG = $(empty)
78
79
 
79
80
  RUBY_EXTCONF_H =
80
81
  cflags = $(optflags) $(debugflags) $(warnflags)
81
- cxxflags = $(optflags) $(debugflags) $(warnflags)
82
+ cxxflags =
82
83
  optflags = -O3
83
84
  debugflags = -ggdb3
84
- warnflags = -Wall -Wextra -Wdeclaration-after-statement -Wdeprecated-declarations -Wduplicated-cond -Wimplicit-function-declaration -Wimplicit-int -Wmisleading-indentation -Wpointer-arith -Wrestrict -Wwrite-strings -Wimplicit-fallthrough=0 -Wmissing-noreturn -Wno-cast-function-type -Wno-constant-logical-operand -Wno-long-long -Wno-missing-field-initializers -Wno-overlength-strings -Wno-packed-bitfield-compat -Wno-parentheses-equality -Wno-self-assign -Wno-tautological-compare -Wno-unused-parameter -Wno-unused-value -Wsuggest-attribute=format -Wsuggest-attribute=noreturn -Wunused-variable
85
+ warnflags = -Wall -Wextra -Wdeprecated-declarations -Wduplicated-cond -Wimplicit-function-declaration -Wimplicit-int -Wmisleading-indentation -Wpointer-arith -Wwrite-strings -Wimplicit-fallthrough=0 -Wmissing-noreturn -Wno-cast-function-type -Wno-constant-logical-operand -Wno-long-long -Wno-missing-field-initializers -Wno-overlength-strings -Wno-packed-bitfield-compat -Wno-parentheses-equality -Wno-self-assign -Wno-tautological-compare -Wno-unused-parameter -Wno-unused-value -Wsuggest-attribute=format -Wsuggest-attribute=noreturn -Wunused-variable
85
86
  cppflags =
86
87
  CCDLFLAGS = -fPIC
87
- CFLAGS = $(CCDLFLAGS) -march=x86-64 -mtune=generic -O2 -pipe -fno-plt -fPIC -pthread $(ARCH_FLAG)
88
+ CFLAGS = $(CCDLFLAGS) -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/redhat/redhat-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/redhat/redhat-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -fPIC $(ARCH_FLAG)
88
89
  INCFLAGS = -I. -I$(arch_hdrdir) -I$(hdrdir)/ruby/backward -I$(hdrdir) -I$(srcdir)
89
90
  DEFS =
90
- CPPFLAGS = -D_FORTIFY_SOURCE=2 $(DEFS) $(cppflags)
91
- CXXFLAGS = $(CCDLFLAGS) -march=x86-64 -mtune=generic -O2 -pipe -fno-plt -pthread -fvisibility=hidden -std=c++11 $(ARCH_FLAG)
92
- ldflags = -L. -Wl,-O1,--sort-common,--as-needed,-z,relro,-z,now -fstack-protector-strong -rdynamic -Wl,-export-dynamic
93
- dldflags = -Wl,-O1,--sort-common,--as-needed,-z,relro,-z,now -Wl,--compress-debug-sections=zlib
91
+ CPPFLAGS = $(DEFS) $(cppflags)
92
+ CXXFLAGS = $(CCDLFLAGS) -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/redhat/redhat-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/redhat/redhat-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -fvisibility=hidden -std=c++11 $(ARCH_FLAG)
93
+ ldflags = -L. -Wl,-z,relro -Wl,--as-needed -Wl,-z,now -specs=/usr/lib/rpm/redhat/redhat-hardened-ld -fstack-protector-strong -rdynamic -Wl,-export-dynamic
94
+ dldflags = -Wl,-z,relro -Wl,--as-needed -Wl,-z,now -specs=/usr/lib/rpm/redhat/redhat-hardened-ld
94
95
  ARCH_FLAG =
95
96
  DLDFLAGS = $(ldflags) $(dldflags) $(ARCH_FLAG)
96
97
  LDSHARED = $(CC) -shared
97
98
  LDSHAREDXX = $(CXX) -shared
98
- AR = ar
99
+ AR = gcc-ar
99
100
  EXEEXT =
100
101
 
101
102
  RUBY_INSTALL_NAME = $(RUBY_BASE_NAME)
102
103
  RUBY_SO_NAME = ruby
103
104
  RUBYW_INSTALL_NAME =
104
- RUBY_VERSION_NAME = $(RUBY_BASE_NAME)-$(ruby_version)
105
+ RUBY_VERSION_NAME = $(RUBY_BASE_NAME)-$(ruby_version_dir_name)
105
106
  RUBYW_BASE_NAME = rubyw
106
107
  RUBY_BASE_NAME = ruby
107
108
 
108
- arch = x86_64-linux
109
+ arch = aarch64-linux
109
110
  sitearch = $(arch)
110
- ruby_version = 2.6.0
111
+ ruby_version = 3.0.0
111
112
  ruby = $(bindir)/$(RUBY_BASE_NAME)
112
113
  RUBY = $(ruby)
113
114
  ruby_headers = $(hdrdir)/ruby.h $(hdrdir)/ruby/backward.h $(hdrdir)/ruby/ruby.h $(hdrdir)/ruby/defines.h $(hdrdir)/ruby/missing.h $(hdrdir)/ruby/intern.h $(hdrdir)/ruby/st.h $(hdrdir)/ruby/subst.h $(arch_hdrdir)/ruby/config.h
@@ -125,8 +126,8 @@ TOUCH = exit >
125
126
  #### End of system configuration section. ####
126
127
 
127
128
  preload =
128
- libpath = . $(libdir)
129
- LIBPATH = -L. -L$(libdir)
129
+ libpath = . $(archlibdir)
130
+ LIBPATH = -L. -L$(archlibdir)
130
131
  DEFFILE =
131
132
 
132
133
  CLEANFILES = mkmf.log
@@ -137,11 +138,11 @@ extout =
137
138
  extout_prefix =
138
139
  target_prefix =
139
140
  LOCAL_LIBS =
140
- LIBS = $(LIBRUBYARG_SHARED) -lprotobuf -lm -lc
141
+ LIBS = -lprotobuf -lpthread -lm -lc
141
142
  ORIG_SRCS = base.cc embedding_feature_extractor.cc embedding_network.cc feature_extractor.cc feature_extractor.pb.cc feature_types.cc fixunicodevalue.cc fml_parser.cc generated_entities.cc generated_ulscript.cc getonescriptspan.cc lang_id_nn_params.cc language_identifier_features.cc nnet_language_identifier.cc nnet_language_identifier_c.cc offsetmap.cc registry.cc relevant_script_feature.cc sentence.pb.cc sentence_features.cc task_context.cc task_context_params.cc task_spec.pb.cc text_processing.cc unicodetext.cc utf8statetable.cc utils.cc workspace.cc
142
143
  SRCS = $(ORIG_SRCS)
143
144
  OBJS = base.o embedding_feature_extractor.o embedding_network.o feature_extractor.o feature_extractor.pb.o feature_types.o fixunicodevalue.o fml_parser.o generated_entities.o generated_ulscript.o getonescriptspan.o lang_id_nn_params.o language_identifier_features.o nnet_language_identifier.o nnet_language_identifier_c.o offsetmap.o registry.o relevant_script_feature.o sentence.pb.o sentence_features.o task_context.o task_context_params.o task_spec.pb.o text_processing.o unicodetext.o utf8statetable.o utils.o workspace.o
144
- HDRS = $(srcdir)/base.h $(srcdir)/casts.h $(srcdir)/embedding_feature_extractor.h $(srcdir)/embedding_network.h $(srcdir)/embedding_network_params.h $(srcdir)/feature_extractor.h $(srcdir)/feature_types.h $(srcdir)/float16.h $(srcdir)/fml_parser.h $(srcdir)/language_identifier_features.h $(srcdir)/lang_id_nn_params.h $(srcdir)/nnet_language_identifier.h $(srcdir)/registry.h $(srcdir)/relevant_script_feature.h $(srcdir)/script_detector.h $(srcdir)/sentence_features.h $(srcdir)/simple_adder.h $(srcdir)/fixunicodevalue.h $(srcdir)/generated_ulscript.h $(srcdir)/getonescriptspan.h $(srcdir)/integral_types.h $(srcdir)/offsetmap.h $(srcdir)/port.h $(srcdir)/stringpiece.h $(srcdir)/text_processing.h $(srcdir)/utf8acceptinterchange.h $(srcdir)/utf8prop_lettermarkscriptnum.h $(srcdir)/utf8repl_lettermarklower.h $(srcdir)/utf8scannot_lettermarkspecial.h $(srcdir)/utf8statetable.h $(srcdir)/task_context.h $(srcdir)/task_context_params.h $(srcdir)/unicodetext.h $(srcdir)/utils.h $(srcdir)/workspace.h $(srcdir)/feature_extractor.pb.h $(srcdir)/sentence.pb.h $(srcdir)/task_spec.pb.h
145
+ HDRS = $(srcdir)/base.h $(srcdir)/casts.h $(srcdir)/embedding_feature_extractor.h $(srcdir)/embedding_network.h $(srcdir)/embedding_network_params.h $(srcdir)/feature_extractor.h $(srcdir)/feature_extractor.pb.h $(srcdir)/feature_types.h $(srcdir)/fixunicodevalue.h $(srcdir)/float16.h $(srcdir)/fml_parser.h $(srcdir)/generated_ulscript.h $(srcdir)/getonescriptspan.h $(srcdir)/integral_types.h $(srcdir)/lang_id_nn_params.h $(srcdir)/language_identifier_features.h $(srcdir)/nnet_language_identifier.h $(srcdir)/offsetmap.h $(srcdir)/port.h $(srcdir)/registry.h $(srcdir)/relevant_script_feature.h $(srcdir)/script_detector.h $(srcdir)/sentence.pb.h $(srcdir)/sentence_features.h $(srcdir)/simple_adder.h $(srcdir)/stringpiece.h $(srcdir)/task_context.h $(srcdir)/task_context_params.h $(srcdir)/task_spec.pb.h $(srcdir)/text_processing.h $(srcdir)/unicodetext.h $(srcdir)/utf8acceptinterchange.h $(srcdir)/utf8prop_lettermarkscriptnum.h $(srcdir)/utf8repl_lettermarklower.h $(srcdir)/utf8scannot_lettermarkspecial.h $(srcdir)/utf8statetable.h $(srcdir)/utils.h $(srcdir)/workspace.h
145
146
  LOCAL_HDRS =
146
147
  TARGET = libcld3
147
148
  TARGET_NAME = libcld3
@@ -155,8 +156,8 @@ BINDIR = $(bindir)
155
156
  RUBYCOMMONDIR = $(sitedir)$(target_prefix)
156
157
  RUBYLIBDIR = $(sitelibdir)$(target_prefix)
157
158
  RUBYARCHDIR = $(sitearchdir)$(target_prefix)
158
- HDRDIR = $(rubyhdrdir)/ruby$(target_prefix)
159
- ARCHHDRDIR = $(rubyhdrdir)/$(arch)/ruby$(target_prefix)
159
+ HDRDIR = $(sitehdrdir)$(target_prefix)
160
+ ARCHHDRDIR = $(sitearchhdrdir)$(target_prefix)
160
161
  TARGET_SO_DIR =
161
162
  TARGET_SO = $(TARGET_SO_DIR)$(DLLIB)
162
163
  CLEANLIBS = $(TARGET_SO)
data/ext/cld3/base.o CHANGED
Binary file
Binary file
@@ -167,6 +167,7 @@ EmbeddingNetwork::EmbeddingNetwork(const EmbeddingNetworkParams *model)
167
167
  for (int i = 0; i < model_->embedding_dim_size(); ++i) {
168
168
  CLD3_DCHECK(offset_sum == model_->concat_offset(i));
169
169
  offset_sum += model_->embedding_dim(i) * model_->embedding_num_features(i);
170
+ (void)offset_sum; // Avoid compiler warning for "unused" variable.
170
171
  embedding_matrices_.emplace_back(model_->GetEmbeddingMatrix(i));
171
172
  }
172
173
 
Binary file
data/ext/cld3/extconf.rb CHANGED
@@ -1,4 +1,4 @@
1
- # Copyright 2017 Akihiko Odaki <akihiko.odaki.4i@stu.hosei.ac.jp>
1
+ # Copyright 2017 Akihiko Odaki <akihiko.odaki@gmail.com>
2
2
  # All Rights Reserved.
3
3
  #
4
4
  # Licensed under the Apache License, Version 2.0 (the "License");
@@ -33,7 +33,7 @@ FileUtils.mkdir_p("cld_3/protos")
33
33
  FileUtils.mkdir_p("script_span")
34
34
 
35
35
  [ "feature_extractor", "sentence", "task_spec" ].each {|name|
36
- `protoc '#{name}.proto' --cpp_out=.`
36
+ system "protoc", "#{name}.proto", "--cpp_out=.", exception: true
37
37
  ln_fallback("#{name}.pb.h", "cld_3/protos/#{name}.pb.h")
38
38
  }
39
39
 
@@ -56,4 +56,5 @@ FileUtils.mkdir_p("script_span")
56
56
  }
57
57
 
58
58
  $CXXFLAGS += " -fvisibility=hidden -std=c++11"
59
+ $LIBRUBYARG = ""
59
60
  create_makefile("libcld3")
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
@@ -33,14 +33,14 @@ static const int kMaxScriptBytes = kMaxScriptBuffer - 32; // Leave some room
33
33
  static const int kWithinScriptTail = 32; // Stop at word space in last
34
34
  // N bytes of script buffer
35
35
 
36
- typedef struct {
36
+ struct LangSpan {
37
37
  char* text = nullptr; // Pointer to the span, somewhere
38
38
  int text_bytes = 0; // Number of bytes of text in the span
39
39
  int offset = 0; // Offset of start of span in original input buffer
40
40
  ULScript ulscript = UNKNOWN_ULSCRIPT; // Unicode Letters Script of this span
41
41
  bool truncated = false; // true if buffer filled up before a
42
42
  // different script or EOF was found
43
- } LangSpan;
43
+ };
44
44
 
45
45
  static inline bool IsContinuationByte(char c) {
46
46
  return static_cast<signed char>(c) < -64;
Binary file
Binary file
@@ -0,0 +1,8 @@
1
+ EXPORTS
2
+ NNetLanguageIdentifier_find_language
3
+ NNetLanguageIdentifier_find_top_n_most_freq_langs
4
+ delete_NNetLanguageIdentifier
5
+ delete_result
6
+ delete_results
7
+ new_NNetLanguageIdentifier
8
+ refer_to_nth_result
data/ext/cld3/libcld3.so CHANGED
Binary file
data/ext/cld3/mkmf.log CHANGED
@@ -1,36 +1,37 @@
1
1
  "pkg-config --exists protobuf"
2
2
  | pkg-config --libs protobuf
3
- => "-lprotobuf \n"
4
- "gcc -o conftest -I/usr/include/ruby-2.6.0/x86_64-linux -I/usr/include/ruby-2.6.0/ruby/backward -I/usr/include/ruby-2.6.0 -I. -D_FORTIFY_SOURCE=2 -march=x86-64 -mtune=generic -O2 -pipe -fno-plt -fPIC conftest.c -L. -L/usr/lib -L. -Wl,-O1,--sort-common,--as-needed,-z,relro,-z,now -fstack-protector-strong -rdynamic -Wl,-export-dynamic -lruby -lm -lc"
3
+ => "-lprotobuf -lpthread \n"
4
+ "gcc -o conftest -I/usr/include -I/usr/include/ruby/backward -I/usr/include -I. -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/redhat/redhat-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/redhat/redhat-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -fPIC conftest.c -L. -L/usr/lib64 -L. -Wl,-z,relro -Wl,--as-needed -Wl,-z,now -specs=/usr/lib/rpm/redhat/redhat-hardened-ld -fstack-protector-strong -rdynamic -Wl,-export-dynamic -lruby -lm -lc"
5
5
  checked program was:
6
6
  /* begin */
7
7
  1: #include "ruby.h"
8
8
  2:
9
9
  3: int main(int argc, char **argv)
10
10
  4: {
11
- 5: return 0;
11
+ 5: return !!argv[argc];
12
12
  6: }
13
13
  /* end */
14
14
 
15
- "gcc -o conftest -I/usr/include/ruby-2.6.0/x86_64-linux -I/usr/include/ruby-2.6.0/ruby/backward -I/usr/include/ruby-2.6.0 -I. -D_FORTIFY_SOURCE=2 -march=x86-64 -mtune=generic -O2 -pipe -fno-plt -fPIC conftest.c -L. -L/usr/lib -L. -Wl,-O1,--sort-common,--as-needed,-z,relro,-z,now -fstack-protector-strong -rdynamic -Wl,-export-dynamic -lruby -lprotobuf -lm -lc"
15
+ "gcc -o conftest -I/usr/include -I/usr/include/ruby/backward -I/usr/include -I. -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/redhat/redhat-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/redhat/redhat-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -fPIC conftest.c -L. -L/usr/lib64 -L. -Wl,-z,relro -Wl,--as-needed -Wl,-z,now -specs=/usr/lib/rpm/redhat/redhat-hardened-ld -fstack-protector-strong -rdynamic -Wl,-export-dynamic -lruby -lprotobuf -lpthread -lm -lc"
16
16
  checked program was:
17
17
  /* begin */
18
18
  1: #include "ruby.h"
19
19
  2:
20
20
  3: int main(int argc, char **argv)
21
21
  4: {
22
- 5: return 0;
22
+ 5: return !!argv[argc];
23
23
  6: }
24
24
  /* end */
25
25
 
26
26
  | pkg-config --cflags-only-I protobuf
27
27
  => "\n"
28
28
  | pkg-config --cflags-only-other protobuf
29
- => "-pthread \n"
29
+ => "\n"
30
30
  | pkg-config --libs-only-l protobuf
31
- => "-lprotobuf \n"
31
+ => "-lprotobuf -lpthread \n"
32
32
  package configuration for protobuf
33
- cflags: -pthread
33
+ incflags:
34
+ cflags:
34
35
  ldflags:
35
- libs: -lprotobuf
36
+ libs: -lprotobuf -lpthread
36
37
 
@@ -284,8 +284,6 @@ NNetLanguageIdentifier::FindTopNMostFreqLangs(const string &text,
284
284
  CLD2::LangSpan script_span;
285
285
  std::unordered_map<string, LangChunksStats> lang_stats;
286
286
  int total_num_bytes = 0;
287
- Result result;
288
- string language;
289
287
  int chunk_size = 0; // Use the default.
290
288
  while (ss.GetOneScriptSpanLower(&script_span)) {
291
289
  const int num_original_span_bytes = script_span.text_bytes;
@@ -302,8 +300,8 @@ NNetLanguageIdentifier::FindTopNMostFreqLangs(const string &text,
302
300
 
303
301
  const string selected_text = SelectTextGivenScriptSpan(script_span);
304
302
 
305
- result = FindLanguageOfValidUTF8(selected_text);
306
- language = result.language;
303
+ Result result = FindLanguageOfValidUTF8(selected_text);
304
+ string language = result.language;
307
305
  lang_stats[language].byte_sum += num_original_span_bytes;
308
306
  lang_stats[language].prob_sum +=
309
307
  result.probability * num_original_span_bytes;
@@ -356,7 +354,7 @@ string NNetLanguageIdentifier::SelectTextGivenBeginAndSize(
356
354
  const char *text_begin, int text_size) {
357
355
  string output_text;
358
356
 
359
- // If the size of the input is greater than the maxium number of bytes needed
357
+ // If the size of the input is greater than the maximum number of bytes needed
360
358
  // for a prediction, then concatenate snippets that are equally spread out
361
359
  // throughout the input.
362
360
  if (text_size > max_num_bytes_) {
Binary file
@@ -1,4 +1,4 @@
1
- /* Copyright 2017 Akihiko Odaki <akihiko.odaki.4i@stu.hosei.ac.jp>
1
+ /* Copyright 2017 Akihiko Odaki <akihiko.odaki@gmail.com>
2
2
  All Rights Reserved.
3
3
 
4
4
  Licensed under the Apache License, Version 2.0 (the "License");
@@ -26,42 +26,90 @@ limitations under the License.
26
26
  #define EXPORT __attribute__ ((visibility ("default")))
27
27
  #endif
28
28
 
29
- struct NNetLanguageIdentifier {
30
- chrome_lang_id::NNetLanguageIdentifier context;
31
- std::string language;
32
- };
33
-
34
29
  struct Result {
35
30
  struct {
36
31
  const char *data;
37
32
  std::size_t size;
38
33
  } language;
34
+ struct {
35
+ const chrome_lang_id::NNetLanguageIdentifier::SpanInfo *data;
36
+ std::size_t size;
37
+ } byte_ranges;
39
38
  float probability;
40
39
  float proportion;
41
40
  bool is_reliable;
42
41
  };
43
42
 
43
+ struct OwningResult {
44
+ OwningResult(chrome_lang_id::NNetLanguageIdentifier::Result&& result) {
45
+ references.language = std::move(result.language);
46
+ references.byte_ranges = std::move(result.byte_ranges);
47
+ plain.language.data = references.language.data();
48
+ plain.language.size = references.language.size();
49
+ plain.byte_ranges.data = references.byte_ranges.data();
50
+ plain.byte_ranges.size = references.byte_ranges.size();
51
+ plain.probability = result.probability;
52
+ plain.proportion = result.proportion;
53
+ plain.is_reliable = result.is_reliable;
54
+ }
55
+
56
+ Result plain;
57
+ struct {
58
+ std::string language;
59
+ std::vector<chrome_lang_id::NNetLanguageIdentifier::SpanInfo> byte_ranges;
60
+ } references;
61
+ };
62
+
44
63
  extern "C" {
45
- EXPORT Result NNetLanguageIdentifier_find_language(void *pointer,
46
- const char *data,
47
- std::size_t size) {
48
- auto instance = static_cast<NNetLanguageIdentifier *>(pointer);
49
- auto result = instance->context.FindLanguage(std::string(data, size));
50
- instance->language = std::move(result.language);
51
-
52
- return Result {
53
- { instance->language.data(), instance->language.size() },
54
- result.probability,
55
- result.proportion,
56
- result.is_reliable
57
- };
64
+ EXPORT OwningResult *NNetLanguageIdentifier_find_language(
65
+ chrome_lang_id::NNetLanguageIdentifier *instance,
66
+ const char *data,
67
+ std::size_t size) {
68
+ return new OwningResult(instance->FindLanguage(std::string(data, size)));
58
69
  }
59
70
 
60
- EXPORT void delete_NNetLanguageIdentifier(void *pointer) {
61
- delete static_cast<NNetLanguageIdentifier *>(pointer);
71
+ EXPORT std::vector<chrome_lang_id::NNetLanguageIdentifier::Result>*
72
+ NNetLanguageIdentifier_find_top_n_most_freq_langs(
73
+ chrome_lang_id::NNetLanguageIdentifier *instance,
74
+ const char *data, std::size_t size, int num_langs) {
75
+ std::string text(data, size);
76
+ return new auto(instance->FindTopNMostFreqLangs(text, num_langs));
62
77
  }
63
78
 
64
- EXPORT void *new_NNetLanguageIdentifier(int min_num_bytes, int max_num_bytes) {
65
- return new NNetLanguageIdentifier{{min_num_bytes, max_num_bytes}, {}};
79
+ EXPORT void delete_NNetLanguageIdentifier(
80
+ chrome_lang_id::NNetLanguageIdentifier *pointer) {
81
+ delete pointer;
82
+ }
83
+
84
+ EXPORT void delete_result(OwningResult *pointer) {
85
+ delete pointer;
86
+ }
87
+
88
+ EXPORT void delete_results(
89
+ std::vector<chrome_lang_id::NNetLanguageIdentifier::Result> *pointer) {
90
+ delete pointer;
91
+ }
92
+
93
+ EXPORT chrome_lang_id::NNetLanguageIdentifier *new_NNetLanguageIdentifier(
94
+ int min_num_bytes, int max_num_bytes) {
95
+ return new chrome_lang_id::NNetLanguageIdentifier(
96
+ min_num_bytes, max_num_bytes);
97
+ }
98
+
99
+ EXPORT Result refer_to_nth_result(
100
+ std::vector<chrome_lang_id::NNetLanguageIdentifier::Result> *results,
101
+ std::size_t index) {
102
+ Result c;
103
+ auto& cc = (*results)[index];
104
+
105
+ c.language.data = cc.language.data();
106
+ c.language.size = cc.language.size();
107
+ c.byte_ranges.data = cc.byte_ranges.data();
108
+ c.byte_ranges.size = cc.byte_ranges.size();
109
+ c.probability = cc.probability;
110
+ c.proportion = cc.proportion;
111
+ c.is_reliable = cc.is_reliable;
112
+
113
+ return c;
66
114
  }
67
115
  }
Binary file
data/ext/cld3/offsetmap.o CHANGED
Binary file
data/ext/cld3/registry.o CHANGED
Binary file
Binary file
Binary file
@@ -19,11 +19,11 @@ limitations under the License.
19
19
 
20
20
  namespace chrome_lang_id {
21
21
 
22
- // Declare registry for the whole Sentence feature functions. NOTE: this is not
22
+ // Define registry for the whole Sentence feature functions. NOTE: this is not
23
23
  // yet set to anything meaningful. It will be set so in NNetLanguageIdentifier
24
24
  // constructor, *before* we use any feature.
25
25
  template <>
26
- WholeSentenceFeature::Registry
27
- *RegisterableClass<WholeSentenceFeature>::registry_ = nullptr;
26
+ WholeSentenceFeature::Registry*
27
+ RegisterableClass<WholeSentenceFeature>::registry_ = nullptr;
28
28
 
29
- } // namespace chrome_lang_id
29
+ } // namespace chrome_lang_id
@@ -26,9 +26,19 @@ limitations under the License.
26
26
  namespace chrome_lang_id {
27
27
 
28
28
  // Feature function that extracts features for the full Sentence.
29
- typedef FeatureFunction<Sentence> WholeSentenceFeature;
30
-
31
- typedef FeatureExtractor<Sentence> WholeSentenceExtractor;
29
+ using WholeSentenceFeature = FeatureFunction<Sentence>;
30
+
31
+ using WholeSentenceExtractor = FeatureExtractor<Sentence>;
32
+
33
+ // Declare registry for the whole Sentence feature functions. This is required
34
+ // for clang's -Wundefined-var-template. However, MSVC has a bug which treats
35
+ // this declaration as a definition, leading to multiple definition errors, so
36
+ // omit this on MSVC.
37
+ #if !defined(COMPILER_MSVC)
38
+ template <>
39
+ WholeSentenceFeature::Registry
40
+ *RegisterableClass<WholeSentenceFeature>::registry_;
41
+ #endif
32
42
 
33
43
  } // namespace chrome_lang_id
34
44
 
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
data/ext/cld3/utils.o CHANGED
Binary file
data/ext/cld3/workspace.o CHANGED
Binary file
@@ -0,0 +1,58 @@
1
+
2
+ # Copyright 2021 Akihiko Odaki <akihiko.odaki@gmail.com>
3
+ # All Rights Reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ # ==============================================================================
17
+
18
+ module CLD3
19
+ module Unstable
20
+ extend FFI::Library
21
+
22
+ ffi_lib File.join(__dir__, "..", "..", "ext", "cld3", "libcld3." + RbConfig::CONFIG["DLEXT"])
23
+
24
+ module NNetLanguageIdentifier
25
+ class Pointer < FFI::AutoPointer
26
+ def self.release(pointer)
27
+ Unstable.delete_NNetLanguageIdentifier(pointer)
28
+ end
29
+ end
30
+
31
+ class SpanInfo < FFI::Struct
32
+ layout :start_index, :int, :end_index, :int, :probability, :float
33
+ end
34
+
35
+ class Result < FFI::Struct
36
+ layout :language_data, :pointer, :language_size, :size_t, :byte_ranges_data, :pointer, :byte_ranges_size, :size_t, :probability, :float, :proportion, :float, :reliable?, :bool
37
+ end
38
+ end
39
+
40
+ attach_function :delete_NNetLanguageIdentifier, [ :pointer ], :void
41
+
42
+ attach_function :delete_result, [ :pointer ], :void
43
+
44
+ attach_function :delete_results, [ :pointer ], :void
45
+
46
+ attach_function :new_NNetLanguageIdentifier, [ :int, :int ], :pointer
47
+
48
+ attach_function :refer_to_nth_result, [ :pointer, :size_t ], NNetLanguageIdentifier::Result.by_value
49
+
50
+ attach_function :NNetLanguageIdentifier_find_language,
51
+ [ :pointer, :buffer_in, :size_t ], :pointer
52
+
53
+ attach_function :NNetLanguageIdentifier_find_top_n_most_freq_langs,
54
+ [ :pointer, :buffer_in, :size_t, :int ], :pointer
55
+ end
56
+
57
+ private_constant :Unstable
58
+ end
data/lib/cld3.rb CHANGED
@@ -1,7 +1,7 @@
1
1
  # File including an implementation of CLD3 module. Some documentation is
2
2
  # extracted from ext/cld3/ext/src/nnet_language_identifier.h.
3
3
  #
4
- # Copyright 2017 Akihiko Odaki <akihiko.odaki.4i@stu.hosei.ac.jp>
4
+ # Copyright 2017 Akihiko Odaki <akihiko.odaki@gmail.com>
5
5
  # All Rights Reserved.
6
6
  #
7
7
  # Licensed under the Apache License, Version 2.0 (the "License");
@@ -19,6 +19,7 @@
19
19
 
20
20
  require "ffi"
21
21
  require "rbconfig"
22
+ require "cld3/unstable"
22
23
 
23
24
  # Module providing an interface for Compact Language Detector v3 (CLD3)
24
25
  module CLD3
@@ -49,10 +50,16 @@ module CLD3
49
50
  # This is Numeric object.
50
51
  RELIABILITY_HR_BS_THRESHOLD = 0.5
51
52
 
53
+ # Holds probability that Span, specified by start/end indices, is a given
54
+ # language. The language is not stored here; it can be found in Result, which
55
+ # holds an Array of SpanInfo.
56
+ # @type const SpanInfo: untyped
57
+ SpanInfo = Struct.new(:start_index, :end_index, :probability)
58
+
52
59
  # Information about a predicted language.
53
60
  # This is an instance of Struct with the following members:
54
61
  #
55
- # [language] This is symbol or nil.
62
+ # [language] This is symbol.
56
63
  #
57
64
  # [probability] Language probability. This is Numeric object.
58
65
  #
@@ -61,33 +68,100 @@ module CLD3
61
68
  # [proportion] Proportion of bytes associated with the language. If
62
69
  # #find_language is called, this variable is set to 1.
63
70
  # This is Numeric object.
64
- Result = Struct.new("Result", :language, :probability, :reliable?, :proportion)
71
+ #
72
+ # [byte_ranges] Specifies the byte ranges in UTF-8 that |language| applies to.
73
+ # This is an Array of SpanInfo.
74
+ # @type const Result: untyped
75
+ Result = Struct.new(:language, :probability, :reliable?, :proportion, :byte_ranges)
65
76
 
66
77
  # The arguments are two Integer objects.
67
- def initialize(minNumBytes = MIN_NUM_BYTES_TO_CONSIDER, maxNumBytes = MAX_NUM_BYTES_TO_CONSIDER)
68
- @cc = Unstable::NNetLanguageIdentifier::Pointer.new(Unstable.new_NNetLanguageIdentifier(minNumBytes, maxNumBytes))
78
+ def initialize(min_num_bytes = MIN_NUM_BYTES_TO_CONSIDER, max_num_bytes = MAX_NUM_BYTES_TO_CONSIDER)
79
+ @cc = Unstable::NNetLanguageIdentifier::Pointer.new(Unstable.new_NNetLanguageIdentifier(min_num_bytes, max_num_bytes))
69
80
  end
70
81
 
71
82
  # Finds the most likely language for the given text, along with additional
72
83
  # information (e.g., probability). The prediction is based on the first N
73
84
  # bytes where N is the minimum between the number of interchange valid UTF8
74
85
  # bytes and +max_num_bytes_+. If N is less than +min_num_bytes_+ long, then
75
- # this function returns nil as language.
86
+ # this function returns nil.
76
87
  # The argument is a String object.
77
88
  # The returned value of this function is an instance of Result.
78
89
  def find_language(text)
79
90
  text_utf8 = text.encode(Encoding::UTF_8)
80
91
  pointer = FFI::MemoryPointer.new(:char, text_utf8.bytesize)
81
- pointer.put_bytes(0, text_utf8)
82
92
 
83
- cc_result = Unstable.NNetLanguageIdentifier_find_language(@cc, pointer, text_utf8.bytesize)
84
- language = cc_result[:language_data].read_bytes(cc_result[:language_size])
93
+ begin
94
+ pointer.put_bytes(0, text_utf8)
95
+
96
+ result = Unstable.NNetLanguageIdentifier_find_language(@cc, pointer, text_utf8.bytesize)
97
+ begin
98
+ convert_result Unstable::NNetLanguageIdentifier::Result.new(result)
99
+ ensure
100
+ Unstable.delete_result result
101
+ end
102
+ ensure
103
+ pointer.free
104
+ end
105
+ end
106
+
107
+ # Splits the input text (up to the first byte, if any, that is not
108
+ # interchange valid UTF8) into spans based on the script, predicts a language
109
+ # for each span, and returns a vector storing the top num_langs most frequent
110
+ # languages along with additional information (e.g., proportions). The number
111
+ # of bytes considered for each span is the minimum between the size of the
112
+ # span and +max_num_bytes_+. If more languages are requested than what is
113
+ # available in the input, then the number of the returned elements will be
114
+ # the number of the latter. Also, if the size of the span is less than
115
+ # +min_num_bytes_+ long, then the span is skipped. If the input text is too
116
+ # long, only the first +MAX_NUM_INPUT_BYTES_TO_CONSIDER+ bytes are processed.
117
+ # The first argument is a String object.
118
+ # The second argument is Numeric object.
119
+ # The returned value of this function is an Array of Result instances.
120
+ def find_top_n_most_freq_langs(text, num_langs)
121
+ # @type var a: untyped
122
+
123
+ text_utf8 = text.encode(Encoding::UTF_8)
124
+ pointer = FFI::MemoryPointer.new(:char, text_utf8.bytesize)
125
+
126
+ begin
127
+ pointer.put_bytes(0, text_utf8)
128
+
129
+ results = Unstable.NNetLanguageIdentifier_find_top_n_most_freq_langs(@cc, pointer, text_utf8.bytesize, num_langs)
130
+ begin
131
+ a = num_langs.times
132
+ .lazy
133
+ .map { |index| convert_result Unstable.refer_to_nth_result(results, index) }
134
+ .take_while { |result| !result.nil? }
135
+ .to_a
136
+
137
+ a
138
+ ensure
139
+ Unstable.delete_results results
140
+ end
141
+ ensure
142
+ pointer.free
143
+ end
144
+ end
145
+
146
+ private
147
+
148
+ def convert_result(result)
149
+ language = result[:language_data].read_bytes(result[:language_size])
150
+ return nil if language == "und"
151
+
152
+ cursor = result[:byte_ranges_data]
153
+ byte_ranges = result[:byte_ranges_size].times.map do
154
+ info = Unstable::NNetLanguageIdentifier::SpanInfo.new(cursor)
155
+ cursor += Unstable::NNetLanguageIdentifier::SpanInfo.size
156
+ SpanInfo.new(info[:start_index], info[:end_index], info[:probability])
157
+ end
85
158
 
86
159
  Result.new(
87
- language == "und" ? nil : language.to_sym,
88
- cc_result[:probability],
89
- cc_result[:reliable?],
90
- cc_result[:proportion])
160
+ language.to_sym,
161
+ result[:probability],
162
+ result[:reliable?],
163
+ result[:proportion],
164
+ byte_ranges)
91
165
  end
92
166
  end
93
167
 
@@ -95,6 +169,7 @@ module CLD3
95
169
  # The model weights are loaded statically.
96
170
  module TaskContextParams
97
171
  # This is a frozen Array object containing symbols.
172
+ # @type const LANGUAGE_NAMES: untyped
98
173
  LANGUAGE_NAMES = [
99
174
  :eo, :co, :eu, :ta, :de, :mt, :ps, :te, :su, :uz, :'zh-Latn', :ne,
100
175
  :nl, :sw, :sq, :hmn, :ja, :no, :mn, :so, :ko, :kk, :sl, :ig,
@@ -108,31 +183,4 @@ module CLD3
108
183
  :sn, :yo, :pa, :ku,
109
184
  ].freeze
110
185
  end
111
-
112
- module Unstable
113
- extend FFI::Library
114
-
115
- ffi_lib File.join(File.expand_path(File.dirname(__FILE__)), "..", "ext", "cld3", "libcld3." + RbConfig::CONFIG["DLEXT"])
116
-
117
- module NNetLanguageIdentifier
118
- class Pointer < FFI::AutoPointer
119
- def self.release(pointer)
120
- Unstable.delete_NNetLanguageIdentifier(pointer)
121
- end
122
- end
123
-
124
- class Result < FFI::Struct
125
- layout :language_data, :pointer, :language_size, :size_t, :probability, :float, :proportion, :float, :reliable?, :bool
126
- end
127
- end
128
-
129
- attach_function :delete_NNetLanguageIdentifier, [ :pointer ], :void
130
-
131
- attach_function :new_NNetLanguageIdentifier, [ :int, :int ], :pointer
132
-
133
- attach_function :NNetLanguageIdentifier_find_language,
134
- [ :pointer, :buffer_in, :size_t ], NNetLanguageIdentifier::Result.by_value
135
- end
136
-
137
- private_constant :Unstable
138
186
  end
data/sig/cld3.rbs ADDED
@@ -0,0 +1,65 @@
1
+ # Copyright 2021 Akihiko Odaki <akihiko.odaki@gmail.com>
2
+ # All Rights Reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ # ==============================================================================
16
+
17
+ module CLD3
18
+ class NNetLanguageIdentifier
19
+ MIN_NUM_BYTES_TO_CONSIDER: Integer
20
+ MAX_NUM_BYTES_TO_CONSIDER: Integer
21
+ MAX_NUM_INPUT_BYTES_TO_CONSIDER: Integer
22
+ RELIABILITY_THRESHOLD: Float
23
+ RELIABILITY_HR_BS_THRESHOLD: Float
24
+
25
+ class SpanInfo < Struct[Float | Integer]
26
+ attr_accessor start_index(): Integer
27
+ attr_accessor end_index(): Integer
28
+ attr_accessor probability(): Float
29
+ end
30
+
31
+ class Result < Struct[Array[SpanInfo] | Float | TaskContextParams::language_names | bool]
32
+ attr_accessor language(): TaskContextParams::language_names
33
+ attr_accessor probability(): Float
34
+ attr_accessor reliable?(): bool
35
+ attr_accessor proportion(): Float
36
+ attr_accessor byte_ranges(): Array[SpanInfo]
37
+ end
38
+
39
+ def initialize: (?Integer, ?Integer) -> void
40
+ def find_language: (String) -> Result?
41
+ def find_top_n_most_freq_langs: (String, Integer) -> Array[Result]
42
+
43
+ private
44
+
45
+ def convert_result: (untyped) -> Result?
46
+ end
47
+
48
+ module TaskContextParams
49
+ type language_names =
50
+ :eo | :co | :eu | :ta | :de | :mt | :ps | :te | :su | :uz | :'zh-Latn' | :ne |
51
+ :nl | :sw | :sq | :hmn | :ja | :no | :mn | :so | :ko | :kk | :sl | :ig |
52
+ :mr | :th | :zu | :ml | :hr | :bs | :lo | :sd | :cy | :hy | :uk | :pt |
53
+ :lv | :iw | :cs | :vi | :jv | :be | :km | :mk | :tr | :fy | :am | :zh |
54
+ :da | :sv | :fi | :ht | :af | :la | :id | :fil | :sm | :ca | :el | :ka |
55
+ :sr | :it | :sk | :ru | :'ru-Latn' | :bg | :ny | :fa | :haw | :gl | :et |
56
+ :ms | :gd | :'bg-Latn' | :ha | :is | :ur | :mi | :hi | :bn | :'hi-Latn' | :fr |
57
+ :yi | :hu | :xh | :my | :tg | :ro | :ar | :lb | :'el-Latn' | :st | :ceb |
58
+ :kn | :az | :si | :ky | :mg | :en | :gu | :es | :pl | :'ja-Latn' | :ga | :lt |
59
+ :sn | :yo | :pa | :ku
60
+
61
+ LANGUAGE_NAMES: Array[language_names]
62
+ end
63
+
64
+ Unstable: untyped
65
+ end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cld3
3
3
  version: !ruby/object:Gem::Version
4
- version: 3.2.6
4
+ version: 3.4.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Akihiko Odaki
8
- autorequire:
8
+ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-01-04 00:00:00.000000000 Z
11
+ date: 2021-11-25 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: ffi
@@ -19,7 +19,7 @@ dependencies:
19
19
  version: 1.1.0
20
20
  - - "<"
21
21
  - !ruby/object:Gem::Version
22
- version: 1.12.0
22
+ version: 1.16.0
23
23
  type: :runtime
24
24
  prerelease: false
25
25
  version_requirements: !ruby/object:Gem::Requirement
@@ -29,7 +29,27 @@ dependencies:
29
29
  version: 1.1.0
30
30
  - - "<"
31
31
  - !ruby/object:Gem::Version
32
- version: 1.12.0
32
+ version: 1.16.0
33
+ - !ruby/object:Gem::Dependency
34
+ name: rbs
35
+ requirement: !ruby/object:Gem::Requirement
36
+ requirements:
37
+ - - ">="
38
+ - !ruby/object:Gem::Version
39
+ version: 1.7.0
40
+ - - "<"
41
+ - !ruby/object:Gem::Version
42
+ version: 1.8.0
43
+ type: :development
44
+ prerelease: false
45
+ version_requirements: !ruby/object:Gem::Requirement
46
+ requirements:
47
+ - - ">="
48
+ - !ruby/object:Gem::Version
49
+ version: 1.7.0
50
+ - - "<"
51
+ - !ruby/object:Gem::Version
52
+ version: 1.8.0
33
53
  - !ruby/object:Gem::Dependency
34
54
  name: rspec
35
55
  requirement: !ruby/object:Gem::Requirement
@@ -39,7 +59,7 @@ dependencies:
39
59
  version: 3.0.0
40
60
  - - "<"
41
61
  - !ruby/object:Gem::Version
42
- version: 3.10.0
62
+ version: 3.11.0
43
63
  type: :development
44
64
  prerelease: false
45
65
  version_requirements: !ruby/object:Gem::Requirement
@@ -49,10 +69,30 @@ dependencies:
49
69
  version: 3.0.0
50
70
  - - "<"
51
71
  - !ruby/object:Gem::Version
52
- version: 3.10.0
72
+ version: 3.11.0
73
+ - !ruby/object:Gem::Dependency
74
+ name: steep
75
+ requirement: !ruby/object:Gem::Requirement
76
+ requirements:
77
+ - - ">="
78
+ - !ruby/object:Gem::Version
79
+ version: 0.46.0
80
+ - - "<"
81
+ - !ruby/object:Gem::Version
82
+ version: 0.47.0
83
+ type: :development
84
+ prerelease: false
85
+ version_requirements: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: 0.46.0
90
+ - - "<"
91
+ - !ruby/object:Gem::Version
92
+ version: 0.47.0
53
93
  description: Compact Language Detector v3 (CLD3) is a neural network model for language
54
94
  identification.
55
- email: akihiko.odaki.4i@stu.hosei.ac.jp
95
+ email: akihiko.odaki@gmail.com
56
96
  executables: []
57
97
  extensions:
58
98
  - ext/cld3/extconf.rb
@@ -106,6 +146,7 @@ files:
106
146
  - ext/cld3/language_identifier_features.cc
107
147
  - ext/cld3/language_identifier_features.h
108
148
  - ext/cld3/language_identifier_features.o
149
+ - ext/cld3/libcld3.def
109
150
  - ext/cld3/libcld3.so
110
151
  - ext/cld3/mkmf.log
111
152
  - ext/cld3/nnet_language_identifier.cc
@@ -159,11 +200,13 @@ files:
159
200
  - ext/cld3/workspace.h
160
201
  - ext/cld3/workspace.o
161
202
  - lib/cld3.rb
203
+ - lib/cld3/unstable.rb
204
+ - sig/cld3.rbs
162
205
  homepage: https://github.com/akihikodaki/cld3-ruby
163
206
  licenses:
164
207
  - Apache-2.0
165
208
  metadata: {}
166
- post_install_message:
209
+ post_install_message:
167
210
  rdoc_options: []
168
211
  require_paths:
169
212
  - lib
@@ -171,18 +214,18 @@ required_ruby_version: !ruby/object:Gem::Requirement
171
214
  requirements:
172
215
  - - ">="
173
216
  - !ruby/object:Gem::Version
174
- version: 2.3.0
217
+ version: 2.6.0
175
218
  - - "<"
176
219
  - !ruby/object:Gem::Version
177
- version: 2.8.0
220
+ version: 3.2.0
178
221
  required_rubygems_version: !ruby/object:Gem::Requirement
179
222
  requirements:
180
223
  - - ">="
181
224
  - !ruby/object:Gem::Version
182
225
  version: '0'
183
226
  requirements: []
184
- rubygems_version: 3.0.6
185
- signing_key:
227
+ rubygems_version: 3.2.22
228
+ signing_key:
186
229
  specification_version: 4
187
230
  summary: Compact Language Detector v3 (CLD3)
188
231
  test_files: []