cld3 3.2.6 → 3.4.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. checksums.yaml +4 -4
  2. data/Gemfile +1 -1
  3. data/LICENSE +2 -2
  4. data/README.md +3 -3
  5. data/cld3.gemspec +9 -7
  6. data/ext/cld3/Makefile +45 -44
  7. data/ext/cld3/base.o +0 -0
  8. data/ext/cld3/embedding_feature_extractor.o +0 -0
  9. data/ext/cld3/embedding_network.cc +1 -0
  10. data/ext/cld3/embedding_network.o +0 -0
  11. data/ext/cld3/extconf.rb +3 -2
  12. data/ext/cld3/feature_extractor.o +0 -0
  13. data/ext/cld3/feature_extractor.pb.o +0 -0
  14. data/ext/cld3/feature_types.o +0 -0
  15. data/ext/cld3/fixunicodevalue.o +0 -0
  16. data/ext/cld3/fml_parser.o +0 -0
  17. data/ext/cld3/generated_entities.o +0 -0
  18. data/ext/cld3/generated_ulscript.o +0 -0
  19. data/ext/cld3/getonescriptspan.h +2 -2
  20. data/ext/cld3/getonescriptspan.o +0 -0
  21. data/ext/cld3/lang_id_nn_params.o +0 -0
  22. data/ext/cld3/language_identifier_features.o +0 -0
  23. data/ext/cld3/libcld3.def +8 -0
  24. data/ext/cld3/libcld3.so +0 -0
  25. data/ext/cld3/mkmf.log +10 -9
  26. data/ext/cld3/nnet_language_identifier.cc +3 -5
  27. data/ext/cld3/nnet_language_identifier.o +0 -0
  28. data/ext/cld3/nnet_language_identifier_c.cc +71 -23
  29. data/ext/cld3/nnet_language_identifier_c.o +0 -0
  30. data/ext/cld3/offsetmap.o +0 -0
  31. data/ext/cld3/registry.o +0 -0
  32. data/ext/cld3/relevant_script_feature.o +0 -0
  33. data/ext/cld3/sentence.pb.o +0 -0
  34. data/ext/cld3/sentence_features.cc +4 -4
  35. data/ext/cld3/sentence_features.h +13 -3
  36. data/ext/cld3/sentence_features.o +0 -0
  37. data/ext/cld3/task_context.o +0 -0
  38. data/ext/cld3/task_context_params.o +0 -0
  39. data/ext/cld3/task_spec.pb.o +0 -0
  40. data/ext/cld3/text_processing.o +0 -0
  41. data/ext/cld3/unicodetext.o +0 -0
  42. data/ext/cld3/utf8statetable.o +0 -0
  43. data/ext/cld3/utils.o +0 -0
  44. data/ext/cld3/workspace.o +0 -0
  45. data/lib/cld3/unstable.rb +58 -0
  46. data/lib/cld3.rb +88 -40
  47. data/sig/cld3.rbs +65 -0
  48. metadata +56 -13
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 0ee3c6166247aaf958310ffa9976400fcfa5050eb1969dd17e186e3500dd06d9
4
- data.tar.gz: bfa75958e205683dfa2429d388bb40d18d237ef5c2b5877a9fd718489f95b7bd
3
+ metadata.gz: 2c161cbf12d260074efd2e9db3981b6615af20ee04c234d6b2710bd52a283a4e
4
+ data.tar.gz: c388ae6b529d95e015ecdb7d21cdd7f1ceaca72d167d0f8008b5477d5bce5b3c
5
5
  SHA512:
6
- metadata.gz: ac1fb08ebf438995878bb7c992bc2e2a71adf9d0f06f01316121b6d8d48f5b8f2f1ea9a3f68f501dad6682168b7a3e16b0137be16cae7ad876d0bce9f6d866e7
7
- data.tar.gz: 40e1036c1c7e08af1caed6efd187a04bb8883f9ff427c61824193f72f58e696279611575f7a63b8cbd80fff2c9f20da644807e71be2ea281a8d870e3721410bd
6
+ metadata.gz: 8e3c1c07283730e722c450acc308a497756fd501595a02a7fc066d0b3e59b96e1ab1e7941549293b02e41274b176772bdae3779a041eb28f8ae53f5c44308cc0
7
+ data.tar.gz: 52e95027de7a595b2eabc49745a11f664e305c18f9926bc9d649642a92fea9846efdd23da699529795d80609b8871b00e77f9379449d2e4f6cb79ecbcf2785db
data/Gemfile CHANGED
@@ -1,4 +1,4 @@
1
- # Copyright 2017 Akihiko Odaki <akihiko.odaki.4i@stu.hosei.ac.jp>
1
+ # Copyright 2017 Akihiko Odaki <akihiko.odaki@gmail.com>
2
2
  # All Rights Reserved.
3
3
  #
4
4
  # Licensed under the Apache License, Version 2.0 (the "License");
data/LICENSE CHANGED
@@ -1,4 +1,4 @@
1
- Copyright 2017 Akihiko Odaki <akihiko.odaki.4i@stu.hosei.ac.jp>
1
+ Copyright 2017 Akihiko Odaki <akihiko.odaki@gmail.com>
2
2
  All rights reserved.
3
3
 
4
4
  Apache License
@@ -189,7 +189,7 @@ All rights reserved.
189
189
  same "printed page" as the copyright notice for easier
190
190
  identification within third-party archives.
191
191
 
192
- Copyright 2017, Akihiko Odaki <akihiko.odaki.4i@stu.hosei.ac.jp>
192
+ Copyright 2017, Akihiko Odaki <akihiko.odaki@gmail.com>
193
193
 
194
194
  Licensed under the Apache License, Version 2.0 (the "License");
195
195
  you may not use this file except in compliance with the License.
data/README.md CHANGED
@@ -8,11 +8,11 @@ require 'cld3'
8
8
 
9
9
  cld3 = CLD3::NNetLanguageIdentifier.new(0, 1000)
10
10
 
11
- cld3.find_language("こんにちは") # => #<struct Struct::Result language=:ja, probability=1.0, reliable?=true, proportion=1.0>
11
+ cld3.find_language("こんにちは") # => #<struct Struct::Result language=:ja, probability=1.0, reliable?=true, proportion=1.0, byte_ranges=[]>
12
12
 
13
- cld3.find_language("This is a pen.") # => #<struct Struct::Result language=:en, probability=0.9999408721923828, reliable?=true, proportion=1.0>
13
+ cld3.find_language("This is a pen.") # => #<struct Struct::Result language=:en, probability=0.9999408721923828, reliable?=true, proportion=1.0, byte_ranges=[]>
14
14
 
15
- cld3.find_language("здравствуйте") # => #<struct Struct::Result language=:ru, probability=0.3140212297439575, reliable?=false, proportion=1.0>
15
+ cld3.find_language("здравствуйте") # => #<struct Struct::Result language=:ru, probability=0.3140212297439575, reliable?=false, proportion=1.0, byte_ranges=[]>
16
16
  ```
17
17
 
18
18
  ## Installation
data/cld3.gemspec CHANGED
@@ -1,4 +1,4 @@
1
- # Copyright 2017 Akihiko Odaki <akihiko.odaki.4i@stu.hosei.ac.jp>
1
+ # Copyright 2017 Akihiko Odaki <akihiko.odaki@gmail.com>
2
2
  # All Rights Reserved.
3
3
  #
4
4
  # Licensed under the Apache License, Version 2.0 (the "License");
@@ -16,19 +16,21 @@
16
16
 
17
17
  Gem::Specification.new do |gem|
18
18
  gem.name = "cld3"
19
- gem.version = "3.2.6"
19
+ gem.version = "3.4.3"
20
20
  gem.summary = "Compact Language Detector v3 (CLD3)"
21
21
  gem.description = "Compact Language Detector v3 (CLD3) is a neural network model for language identification."
22
22
  gem.license = "Apache-2.0"
23
23
  gem.homepage = "https://github.com/akihikodaki/cld3-ruby"
24
24
  gem.author = "Akihiko Odaki"
25
- gem.email = "akihiko.odaki.4i@stu.hosei.ac.jp"
26
- gem.required_ruby_version = [ ">= 2.3.0", "< 2.8.0" ]
27
- gem.add_dependency "ffi", [ ">= 1.1.0", "< 1.12.0" ]
28
- gem.add_development_dependency "rspec", [ ">=3.0.0", "< 3.10.0" ]
25
+ gem.email = "akihiko.odaki@gmail.com"
26
+ gem.required_ruby_version = [ ">= 2.6.0", "< 3.2.0" ]
27
+ gem.add_dependency "ffi", [ ">= 1.1.0", "< 1.16.0" ]
28
+ gem.add_development_dependency "rbs", [ ">= 1.7.0", "< 1.8.0" ]
29
+ gem.add_development_dependency "rspec", [ ">=3.0.0", "< 3.11.0" ]
30
+ gem.add_development_dependency "steep", [ ">= 0.46.0", "< 0.47.0" ]
29
31
  gem.files = Dir[
30
32
  "Gemfile", "LICENSE", "LICENSE_CLD3", "README.md",
31
- "cld3.gemspec", "ext/**/*", "lib/**/*"
33
+ "cld3.gemspec", "ext/**/*", "lib/**/*", "sig/**/*"
32
34
  ]
33
35
  gem.require_paths = [ "lib" ]
34
36
  gem.extensions = [ "ext/cld3/extconf.rb" ]
data/ext/cld3/Makefile CHANGED
@@ -2,7 +2,7 @@
2
2
  SHELL = /bin/sh
3
3
 
4
4
  # V=0 quiet, V=1 verbose. other values don't work.
5
- V = 0
5
+ V = 1
6
6
  Q1 = $(V:1=)
7
7
  Q = $(Q1:0=@)
8
8
  ECHO1 = $(V:1=@ :)
@@ -12,54 +12,55 @@ NULLCMD = :
12
12
  #### Start of system configuration section. ####
13
13
 
14
14
  srcdir = .
15
- topdir = /usr/include/ruby-2.6.0
15
+ topdir = /usr/include
16
16
  hdrdir = $(topdir)
17
- arch_hdrdir = /usr/include/ruby-2.6.0/x86_64-linux
17
+ arch_hdrdir = /usr/include
18
18
  PATH_SEPARATOR = :
19
19
  VPATH = $(srcdir):$(arch_hdrdir)/ruby:$(hdrdir)/ruby
20
20
  prefix = $(DESTDIR)/usr
21
- rubysitearchprefix = $(rubylibprefix)/$(sitearch)
22
- rubyarchprefix = $(rubylibprefix)/$(arch)
23
- rubylibprefix = $(libdir)/$(RUBY_BASE_NAME)
24
- exec_prefix = $(prefix)
25
- vendorarchhdrdir = $(vendorhdrdir)/$(sitearch)
26
- sitearchhdrdir = $(sitehdrdir)/$(sitearch)
27
- rubyarchhdrdir = $(rubyhdrdir)/$(arch)
21
+ rubysitearchprefix = $(sitearchlibdir)/$(RUBY_BASE_NAME)
22
+ rubyarchprefix = $(DESTDIR)/usr/lib64/ruby
23
+ rubylibprefix = $(exec_prefix)/share/ruby
24
+ exec_prefix = $(DESTDIR)/usr
25
+ vendorarchhdrdir = $(vendorhdrdir)/$(arch)
26
+ sitearchhdrdir = $(sitehdrdir)/$(arch)
27
+ rubyarchhdrdir = $(DESTDIR)/usr/include
28
28
  vendorhdrdir = $(rubyhdrdir)/vendor_ruby
29
29
  sitehdrdir = $(rubyhdrdir)/site_ruby
30
- rubyhdrdir = $(includedir)/$(RUBY_VERSION_NAME)
31
- vendorarchdir = $(vendorlibdir)/$(sitearch)
32
- vendorlibdir = $(vendordir)/$(ruby_version)
33
- vendordir = $(rubylibprefix)/vendor_ruby
34
- sitearchdir = $(sitelibdir)/$(sitearch)
35
- sitelibdir = $(sitedir)/$(ruby_version)
36
- sitedir = $(rubylibprefix)/site_ruby
37
- rubyarchdir = $(rubylibdir)/$(arch)
38
- rubylibdir = $(rubylibprefix)/$(ruby_version)
30
+ rubyhdrdir = $(DESTDIR)/usr/include
31
+ rubygemsdir = $(DESTDIR)/usr/share/rubygems
32
+ vendorarchdir = $(DESTDIR)/usr/lib64/ruby/vendor_ruby
33
+ vendorlibdir = $(vendordir)
34
+ vendordir = $(DESTDIR)/usr/share/ruby/vendor_ruby
35
+ sitearchdir = $(DESTDIR)/usr/local/lib64/ruby/site_ruby
36
+ sitelibdir = $(sitedir)
37
+ sitedir = $(DESTDIR)/usr/local/share/ruby/site_ruby
38
+ rubyarchdir = $(rubyarchprefix)
39
+ rubylibdir = $(rubylibprefix)
39
40
  sitearchincludedir = $(includedir)/$(sitearch)
40
41
  archincludedir = $(includedir)/$(arch)
41
42
  sitearchlibdir = $(libdir)/$(sitearch)
42
- archlibdir = $(libdir)/$(arch)
43
+ archlibdir = $(DESTDIR)/usr/lib64
43
44
  ridir = $(datarootdir)/$(RI_BASE_NAME)
44
- mandir = $(datarootdir)/man
45
+ mandir = $(DESTDIR)/usr/share/man
45
46
  localedir = $(datarootdir)/locale
46
- libdir = $(exec_prefix)/lib
47
+ libdir = $(exec_prefix)/lib64
47
48
  psdir = $(docdir)
48
49
  pdfdir = $(docdir)
49
50
  dvidir = $(docdir)
50
51
  htmldir = $(docdir)
51
- infodir = $(datarootdir)/info
52
+ infodir = $(DESTDIR)/usr/share/info
52
53
  docdir = $(datarootdir)/doc/$(PACKAGE)
53
54
  oldincludedir = $(DESTDIR)/usr/include
54
- includedir = $(prefix)/include
55
+ includedir = $(DESTDIR)/usr/include
55
56
  runstatedir = $(localstatedir)/run
56
57
  localstatedir = $(DESTDIR)/var
57
58
  sharedstatedir = $(DESTDIR)/var/lib
58
59
  sysconfdir = $(DESTDIR)/etc
59
- datadir = $(datarootdir)
60
+ datadir = $(DESTDIR)/usr/share
60
61
  datarootdir = $(prefix)/share
61
- libexecdir = $(DESTDIR)/usr/lib/ruby
62
- sbindir = $(exec_prefix)/sbin
62
+ libexecdir = $(DESTDIR)/usr/libexec
63
+ sbindir = $(DESTDIR)/usr/sbin
63
64
  bindir = $(exec_prefix)/bin
64
65
  archdir = $(rubyarchdir)
65
66
 
@@ -78,36 +79,36 @@ CSRCFLAG = $(empty)
78
79
 
79
80
  RUBY_EXTCONF_H =
80
81
  cflags = $(optflags) $(debugflags) $(warnflags)
81
- cxxflags = $(optflags) $(debugflags) $(warnflags)
82
+ cxxflags =
82
83
  optflags = -O3
83
84
  debugflags = -ggdb3
84
- warnflags = -Wall -Wextra -Wdeclaration-after-statement -Wdeprecated-declarations -Wduplicated-cond -Wimplicit-function-declaration -Wimplicit-int -Wmisleading-indentation -Wpointer-arith -Wrestrict -Wwrite-strings -Wimplicit-fallthrough=0 -Wmissing-noreturn -Wno-cast-function-type -Wno-constant-logical-operand -Wno-long-long -Wno-missing-field-initializers -Wno-overlength-strings -Wno-packed-bitfield-compat -Wno-parentheses-equality -Wno-self-assign -Wno-tautological-compare -Wno-unused-parameter -Wno-unused-value -Wsuggest-attribute=format -Wsuggest-attribute=noreturn -Wunused-variable
85
+ warnflags = -Wall -Wextra -Wdeprecated-declarations -Wduplicated-cond -Wimplicit-function-declaration -Wimplicit-int -Wmisleading-indentation -Wpointer-arith -Wwrite-strings -Wimplicit-fallthrough=0 -Wmissing-noreturn -Wno-cast-function-type -Wno-constant-logical-operand -Wno-long-long -Wno-missing-field-initializers -Wno-overlength-strings -Wno-packed-bitfield-compat -Wno-parentheses-equality -Wno-self-assign -Wno-tautological-compare -Wno-unused-parameter -Wno-unused-value -Wsuggest-attribute=format -Wsuggest-attribute=noreturn -Wunused-variable
85
86
  cppflags =
86
87
  CCDLFLAGS = -fPIC
87
- CFLAGS = $(CCDLFLAGS) -march=x86-64 -mtune=generic -O2 -pipe -fno-plt -fPIC -pthread $(ARCH_FLAG)
88
+ CFLAGS = $(CCDLFLAGS) -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/redhat/redhat-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/redhat/redhat-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -fPIC $(ARCH_FLAG)
88
89
  INCFLAGS = -I. -I$(arch_hdrdir) -I$(hdrdir)/ruby/backward -I$(hdrdir) -I$(srcdir)
89
90
  DEFS =
90
- CPPFLAGS = -D_FORTIFY_SOURCE=2 $(DEFS) $(cppflags)
91
- CXXFLAGS = $(CCDLFLAGS) -march=x86-64 -mtune=generic -O2 -pipe -fno-plt -pthread -fvisibility=hidden -std=c++11 $(ARCH_FLAG)
92
- ldflags = -L. -Wl,-O1,--sort-common,--as-needed,-z,relro,-z,now -fstack-protector-strong -rdynamic -Wl,-export-dynamic
93
- dldflags = -Wl,-O1,--sort-common,--as-needed,-z,relro,-z,now -Wl,--compress-debug-sections=zlib
91
+ CPPFLAGS = $(DEFS) $(cppflags)
92
+ CXXFLAGS = $(CCDLFLAGS) -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/redhat/redhat-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/redhat/redhat-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -fvisibility=hidden -std=c++11 $(ARCH_FLAG)
93
+ ldflags = -L. -Wl,-z,relro -Wl,--as-needed -Wl,-z,now -specs=/usr/lib/rpm/redhat/redhat-hardened-ld -fstack-protector-strong -rdynamic -Wl,-export-dynamic
94
+ dldflags = -Wl,-z,relro -Wl,--as-needed -Wl,-z,now -specs=/usr/lib/rpm/redhat/redhat-hardened-ld
94
95
  ARCH_FLAG =
95
96
  DLDFLAGS = $(ldflags) $(dldflags) $(ARCH_FLAG)
96
97
  LDSHARED = $(CC) -shared
97
98
  LDSHAREDXX = $(CXX) -shared
98
- AR = ar
99
+ AR = gcc-ar
99
100
  EXEEXT =
100
101
 
101
102
  RUBY_INSTALL_NAME = $(RUBY_BASE_NAME)
102
103
  RUBY_SO_NAME = ruby
103
104
  RUBYW_INSTALL_NAME =
104
- RUBY_VERSION_NAME = $(RUBY_BASE_NAME)-$(ruby_version)
105
+ RUBY_VERSION_NAME = $(RUBY_BASE_NAME)-$(ruby_version_dir_name)
105
106
  RUBYW_BASE_NAME = rubyw
106
107
  RUBY_BASE_NAME = ruby
107
108
 
108
- arch = x86_64-linux
109
+ arch = aarch64-linux
109
110
  sitearch = $(arch)
110
- ruby_version = 2.6.0
111
+ ruby_version = 3.0.0
111
112
  ruby = $(bindir)/$(RUBY_BASE_NAME)
112
113
  RUBY = $(ruby)
113
114
  ruby_headers = $(hdrdir)/ruby.h $(hdrdir)/ruby/backward.h $(hdrdir)/ruby/ruby.h $(hdrdir)/ruby/defines.h $(hdrdir)/ruby/missing.h $(hdrdir)/ruby/intern.h $(hdrdir)/ruby/st.h $(hdrdir)/ruby/subst.h $(arch_hdrdir)/ruby/config.h
@@ -125,8 +126,8 @@ TOUCH = exit >
125
126
  #### End of system configuration section. ####
126
127
 
127
128
  preload =
128
- libpath = . $(libdir)
129
- LIBPATH = -L. -L$(libdir)
129
+ libpath = . $(archlibdir)
130
+ LIBPATH = -L. -L$(archlibdir)
130
131
  DEFFILE =
131
132
 
132
133
  CLEANFILES = mkmf.log
@@ -137,11 +138,11 @@ extout =
137
138
  extout_prefix =
138
139
  target_prefix =
139
140
  LOCAL_LIBS =
140
- LIBS = $(LIBRUBYARG_SHARED) -lprotobuf -lm -lc
141
+ LIBS = -lprotobuf -lpthread -lm -lc
141
142
  ORIG_SRCS = base.cc embedding_feature_extractor.cc embedding_network.cc feature_extractor.cc feature_extractor.pb.cc feature_types.cc fixunicodevalue.cc fml_parser.cc generated_entities.cc generated_ulscript.cc getonescriptspan.cc lang_id_nn_params.cc language_identifier_features.cc nnet_language_identifier.cc nnet_language_identifier_c.cc offsetmap.cc registry.cc relevant_script_feature.cc sentence.pb.cc sentence_features.cc task_context.cc task_context_params.cc task_spec.pb.cc text_processing.cc unicodetext.cc utf8statetable.cc utils.cc workspace.cc
142
143
  SRCS = $(ORIG_SRCS)
143
144
  OBJS = base.o embedding_feature_extractor.o embedding_network.o feature_extractor.o feature_extractor.pb.o feature_types.o fixunicodevalue.o fml_parser.o generated_entities.o generated_ulscript.o getonescriptspan.o lang_id_nn_params.o language_identifier_features.o nnet_language_identifier.o nnet_language_identifier_c.o offsetmap.o registry.o relevant_script_feature.o sentence.pb.o sentence_features.o task_context.o task_context_params.o task_spec.pb.o text_processing.o unicodetext.o utf8statetable.o utils.o workspace.o
144
- HDRS = $(srcdir)/base.h $(srcdir)/casts.h $(srcdir)/embedding_feature_extractor.h $(srcdir)/embedding_network.h $(srcdir)/embedding_network_params.h $(srcdir)/feature_extractor.h $(srcdir)/feature_types.h $(srcdir)/float16.h $(srcdir)/fml_parser.h $(srcdir)/language_identifier_features.h $(srcdir)/lang_id_nn_params.h $(srcdir)/nnet_language_identifier.h $(srcdir)/registry.h $(srcdir)/relevant_script_feature.h $(srcdir)/script_detector.h $(srcdir)/sentence_features.h $(srcdir)/simple_adder.h $(srcdir)/fixunicodevalue.h $(srcdir)/generated_ulscript.h $(srcdir)/getonescriptspan.h $(srcdir)/integral_types.h $(srcdir)/offsetmap.h $(srcdir)/port.h $(srcdir)/stringpiece.h $(srcdir)/text_processing.h $(srcdir)/utf8acceptinterchange.h $(srcdir)/utf8prop_lettermarkscriptnum.h $(srcdir)/utf8repl_lettermarklower.h $(srcdir)/utf8scannot_lettermarkspecial.h $(srcdir)/utf8statetable.h $(srcdir)/task_context.h $(srcdir)/task_context_params.h $(srcdir)/unicodetext.h $(srcdir)/utils.h $(srcdir)/workspace.h $(srcdir)/feature_extractor.pb.h $(srcdir)/sentence.pb.h $(srcdir)/task_spec.pb.h
145
+ HDRS = $(srcdir)/base.h $(srcdir)/casts.h $(srcdir)/embedding_feature_extractor.h $(srcdir)/embedding_network.h $(srcdir)/embedding_network_params.h $(srcdir)/feature_extractor.h $(srcdir)/feature_extractor.pb.h $(srcdir)/feature_types.h $(srcdir)/fixunicodevalue.h $(srcdir)/float16.h $(srcdir)/fml_parser.h $(srcdir)/generated_ulscript.h $(srcdir)/getonescriptspan.h $(srcdir)/integral_types.h $(srcdir)/lang_id_nn_params.h $(srcdir)/language_identifier_features.h $(srcdir)/nnet_language_identifier.h $(srcdir)/offsetmap.h $(srcdir)/port.h $(srcdir)/registry.h $(srcdir)/relevant_script_feature.h $(srcdir)/script_detector.h $(srcdir)/sentence.pb.h $(srcdir)/sentence_features.h $(srcdir)/simple_adder.h $(srcdir)/stringpiece.h $(srcdir)/task_context.h $(srcdir)/task_context_params.h $(srcdir)/task_spec.pb.h $(srcdir)/text_processing.h $(srcdir)/unicodetext.h $(srcdir)/utf8acceptinterchange.h $(srcdir)/utf8prop_lettermarkscriptnum.h $(srcdir)/utf8repl_lettermarklower.h $(srcdir)/utf8scannot_lettermarkspecial.h $(srcdir)/utf8statetable.h $(srcdir)/utils.h $(srcdir)/workspace.h
145
146
  LOCAL_HDRS =
146
147
  TARGET = libcld3
147
148
  TARGET_NAME = libcld3
@@ -155,8 +156,8 @@ BINDIR = $(bindir)
155
156
  RUBYCOMMONDIR = $(sitedir)$(target_prefix)
156
157
  RUBYLIBDIR = $(sitelibdir)$(target_prefix)
157
158
  RUBYARCHDIR = $(sitearchdir)$(target_prefix)
158
- HDRDIR = $(rubyhdrdir)/ruby$(target_prefix)
159
- ARCHHDRDIR = $(rubyhdrdir)/$(arch)/ruby$(target_prefix)
159
+ HDRDIR = $(sitehdrdir)$(target_prefix)
160
+ ARCHHDRDIR = $(sitearchhdrdir)$(target_prefix)
160
161
  TARGET_SO_DIR =
161
162
  TARGET_SO = $(TARGET_SO_DIR)$(DLLIB)
162
163
  CLEANLIBS = $(TARGET_SO)
data/ext/cld3/base.o CHANGED
Binary file
Binary file
@@ -167,6 +167,7 @@ EmbeddingNetwork::EmbeddingNetwork(const EmbeddingNetworkParams *model)
167
167
  for (int i = 0; i < model_->embedding_dim_size(); ++i) {
168
168
  CLD3_DCHECK(offset_sum == model_->concat_offset(i));
169
169
  offset_sum += model_->embedding_dim(i) * model_->embedding_num_features(i);
170
+ (void)offset_sum; // Avoid compiler warning for "unused" variable.
170
171
  embedding_matrices_.emplace_back(model_->GetEmbeddingMatrix(i));
171
172
  }
172
173
 
Binary file
data/ext/cld3/extconf.rb CHANGED
@@ -1,4 +1,4 @@
1
- # Copyright 2017 Akihiko Odaki <akihiko.odaki.4i@stu.hosei.ac.jp>
1
+ # Copyright 2017 Akihiko Odaki <akihiko.odaki@gmail.com>
2
2
  # All Rights Reserved.
3
3
  #
4
4
  # Licensed under the Apache License, Version 2.0 (the "License");
@@ -33,7 +33,7 @@ FileUtils.mkdir_p("cld_3/protos")
33
33
  FileUtils.mkdir_p("script_span")
34
34
 
35
35
  [ "feature_extractor", "sentence", "task_spec" ].each {|name|
36
- `protoc '#{name}.proto' --cpp_out=.`
36
+ system "protoc", "#{name}.proto", "--cpp_out=.", exception: true
37
37
  ln_fallback("#{name}.pb.h", "cld_3/protos/#{name}.pb.h")
38
38
  }
39
39
 
@@ -56,4 +56,5 @@ FileUtils.mkdir_p("script_span")
56
56
  }
57
57
 
58
58
  $CXXFLAGS += " -fvisibility=hidden -std=c++11"
59
+ $LIBRUBYARG = ""
59
60
  create_makefile("libcld3")
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
@@ -33,14 +33,14 @@ static const int kMaxScriptBytes = kMaxScriptBuffer - 32; // Leave some room
33
33
  static const int kWithinScriptTail = 32; // Stop at word space in last
34
34
  // N bytes of script buffer
35
35
 
36
- typedef struct {
36
+ struct LangSpan {
37
37
  char* text = nullptr; // Pointer to the span, somewhere
38
38
  int text_bytes = 0; // Number of bytes of text in the span
39
39
  int offset = 0; // Offset of start of span in original input buffer
40
40
  ULScript ulscript = UNKNOWN_ULSCRIPT; // Unicode Letters Script of this span
41
41
  bool truncated = false; // true if buffer filled up before a
42
42
  // different script or EOF was found
43
- } LangSpan;
43
+ };
44
44
 
45
45
  static inline bool IsContinuationByte(char c) {
46
46
  return static_cast<signed char>(c) < -64;
Binary file
Binary file
@@ -0,0 +1,8 @@
1
+ EXPORTS
2
+ NNetLanguageIdentifier_find_language
3
+ NNetLanguageIdentifier_find_top_n_most_freq_langs
4
+ delete_NNetLanguageIdentifier
5
+ delete_result
6
+ delete_results
7
+ new_NNetLanguageIdentifier
8
+ refer_to_nth_result
data/ext/cld3/libcld3.so CHANGED
Binary file
data/ext/cld3/mkmf.log CHANGED
@@ -1,36 +1,37 @@
1
1
  "pkg-config --exists protobuf"
2
2
  | pkg-config --libs protobuf
3
- => "-lprotobuf \n"
4
- "gcc -o conftest -I/usr/include/ruby-2.6.0/x86_64-linux -I/usr/include/ruby-2.6.0/ruby/backward -I/usr/include/ruby-2.6.0 -I. -D_FORTIFY_SOURCE=2 -march=x86-64 -mtune=generic -O2 -pipe -fno-plt -fPIC conftest.c -L. -L/usr/lib -L. -Wl,-O1,--sort-common,--as-needed,-z,relro,-z,now -fstack-protector-strong -rdynamic -Wl,-export-dynamic -lruby -lm -lc"
3
+ => "-lprotobuf -lpthread \n"
4
+ "gcc -o conftest -I/usr/include -I/usr/include/ruby/backward -I/usr/include -I. -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/redhat/redhat-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/redhat/redhat-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -fPIC conftest.c -L. -L/usr/lib64 -L. -Wl,-z,relro -Wl,--as-needed -Wl,-z,now -specs=/usr/lib/rpm/redhat/redhat-hardened-ld -fstack-protector-strong -rdynamic -Wl,-export-dynamic -lruby -lm -lc"
5
5
  checked program was:
6
6
  /* begin */
7
7
  1: #include "ruby.h"
8
8
  2:
9
9
  3: int main(int argc, char **argv)
10
10
  4: {
11
- 5: return 0;
11
+ 5: return !!argv[argc];
12
12
  6: }
13
13
  /* end */
14
14
 
15
- "gcc -o conftest -I/usr/include/ruby-2.6.0/x86_64-linux -I/usr/include/ruby-2.6.0/ruby/backward -I/usr/include/ruby-2.6.0 -I. -D_FORTIFY_SOURCE=2 -march=x86-64 -mtune=generic -O2 -pipe -fno-plt -fPIC conftest.c -L. -L/usr/lib -L. -Wl,-O1,--sort-common,--as-needed,-z,relro,-z,now -fstack-protector-strong -rdynamic -Wl,-export-dynamic -lruby -lprotobuf -lm -lc"
15
+ "gcc -o conftest -I/usr/include -I/usr/include/ruby/backward -I/usr/include -I. -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/redhat/redhat-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/redhat/redhat-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -fPIC conftest.c -L. -L/usr/lib64 -L. -Wl,-z,relro -Wl,--as-needed -Wl,-z,now -specs=/usr/lib/rpm/redhat/redhat-hardened-ld -fstack-protector-strong -rdynamic -Wl,-export-dynamic -lruby -lprotobuf -lpthread -lm -lc"
16
16
  checked program was:
17
17
  /* begin */
18
18
  1: #include "ruby.h"
19
19
  2:
20
20
  3: int main(int argc, char **argv)
21
21
  4: {
22
- 5: return 0;
22
+ 5: return !!argv[argc];
23
23
  6: }
24
24
  /* end */
25
25
 
26
26
  | pkg-config --cflags-only-I protobuf
27
27
  => "\n"
28
28
  | pkg-config --cflags-only-other protobuf
29
- => "-pthread \n"
29
+ => "\n"
30
30
  | pkg-config --libs-only-l protobuf
31
- => "-lprotobuf \n"
31
+ => "-lprotobuf -lpthread \n"
32
32
  package configuration for protobuf
33
- cflags: -pthread
33
+ incflags:
34
+ cflags:
34
35
  ldflags:
35
- libs: -lprotobuf
36
+ libs: -lprotobuf -lpthread
36
37
 
@@ -284,8 +284,6 @@ NNetLanguageIdentifier::FindTopNMostFreqLangs(const string &text,
284
284
  CLD2::LangSpan script_span;
285
285
  std::unordered_map<string, LangChunksStats> lang_stats;
286
286
  int total_num_bytes = 0;
287
- Result result;
288
- string language;
289
287
  int chunk_size = 0; // Use the default.
290
288
  while (ss.GetOneScriptSpanLower(&script_span)) {
291
289
  const int num_original_span_bytes = script_span.text_bytes;
@@ -302,8 +300,8 @@ NNetLanguageIdentifier::FindTopNMostFreqLangs(const string &text,
302
300
 
303
301
  const string selected_text = SelectTextGivenScriptSpan(script_span);
304
302
 
305
- result = FindLanguageOfValidUTF8(selected_text);
306
- language = result.language;
303
+ Result result = FindLanguageOfValidUTF8(selected_text);
304
+ string language = result.language;
307
305
  lang_stats[language].byte_sum += num_original_span_bytes;
308
306
  lang_stats[language].prob_sum +=
309
307
  result.probability * num_original_span_bytes;
@@ -356,7 +354,7 @@ string NNetLanguageIdentifier::SelectTextGivenBeginAndSize(
356
354
  const char *text_begin, int text_size) {
357
355
  string output_text;
358
356
 
359
- // If the size of the input is greater than the maxium number of bytes needed
357
+ // If the size of the input is greater than the maximum number of bytes needed
360
358
  // for a prediction, then concatenate snippets that are equally spread out
361
359
  // throughout the input.
362
360
  if (text_size > max_num_bytes_) {
Binary file
@@ -1,4 +1,4 @@
1
- /* Copyright 2017 Akihiko Odaki <akihiko.odaki.4i@stu.hosei.ac.jp>
1
+ /* Copyright 2017 Akihiko Odaki <akihiko.odaki@gmail.com>
2
2
  All Rights Reserved.
3
3
 
4
4
  Licensed under the Apache License, Version 2.0 (the "License");
@@ -26,42 +26,90 @@ limitations under the License.
26
26
  #define EXPORT __attribute__ ((visibility ("default")))
27
27
  #endif
28
28
 
29
- struct NNetLanguageIdentifier {
30
- chrome_lang_id::NNetLanguageIdentifier context;
31
- std::string language;
32
- };
33
-
34
29
  struct Result {
35
30
  struct {
36
31
  const char *data;
37
32
  std::size_t size;
38
33
  } language;
34
+ struct {
35
+ const chrome_lang_id::NNetLanguageIdentifier::SpanInfo *data;
36
+ std::size_t size;
37
+ } byte_ranges;
39
38
  float probability;
40
39
  float proportion;
41
40
  bool is_reliable;
42
41
  };
43
42
 
43
+ struct OwningResult {
44
+ OwningResult(chrome_lang_id::NNetLanguageIdentifier::Result&& result) {
45
+ references.language = std::move(result.language);
46
+ references.byte_ranges = std::move(result.byte_ranges);
47
+ plain.language.data = references.language.data();
48
+ plain.language.size = references.language.size();
49
+ plain.byte_ranges.data = references.byte_ranges.data();
50
+ plain.byte_ranges.size = references.byte_ranges.size();
51
+ plain.probability = result.probability;
52
+ plain.proportion = result.proportion;
53
+ plain.is_reliable = result.is_reliable;
54
+ }
55
+
56
+ Result plain;
57
+ struct {
58
+ std::string language;
59
+ std::vector<chrome_lang_id::NNetLanguageIdentifier::SpanInfo> byte_ranges;
60
+ } references;
61
+ };
62
+
44
63
  extern "C" {
45
- EXPORT Result NNetLanguageIdentifier_find_language(void *pointer,
46
- const char *data,
47
- std::size_t size) {
48
- auto instance = static_cast<NNetLanguageIdentifier *>(pointer);
49
- auto result = instance->context.FindLanguage(std::string(data, size));
50
- instance->language = std::move(result.language);
51
-
52
- return Result {
53
- { instance->language.data(), instance->language.size() },
54
- result.probability,
55
- result.proportion,
56
- result.is_reliable
57
- };
64
+ EXPORT OwningResult *NNetLanguageIdentifier_find_language(
65
+ chrome_lang_id::NNetLanguageIdentifier *instance,
66
+ const char *data,
67
+ std::size_t size) {
68
+ return new OwningResult(instance->FindLanguage(std::string(data, size)));
58
69
  }
59
70
 
60
- EXPORT void delete_NNetLanguageIdentifier(void *pointer) {
61
- delete static_cast<NNetLanguageIdentifier *>(pointer);
71
+ EXPORT std::vector<chrome_lang_id::NNetLanguageIdentifier::Result>*
72
+ NNetLanguageIdentifier_find_top_n_most_freq_langs(
73
+ chrome_lang_id::NNetLanguageIdentifier *instance,
74
+ const char *data, std::size_t size, int num_langs) {
75
+ std::string text(data, size);
76
+ return new auto(instance->FindTopNMostFreqLangs(text, num_langs));
62
77
  }
63
78
 
64
- EXPORT void *new_NNetLanguageIdentifier(int min_num_bytes, int max_num_bytes) {
65
- return new NNetLanguageIdentifier{{min_num_bytes, max_num_bytes}, {}};
79
+ EXPORT void delete_NNetLanguageIdentifier(
80
+ chrome_lang_id::NNetLanguageIdentifier *pointer) {
81
+ delete pointer;
82
+ }
83
+
84
+ EXPORT void delete_result(OwningResult *pointer) {
85
+ delete pointer;
86
+ }
87
+
88
+ EXPORT void delete_results(
89
+ std::vector<chrome_lang_id::NNetLanguageIdentifier::Result> *pointer) {
90
+ delete pointer;
91
+ }
92
+
93
+ EXPORT chrome_lang_id::NNetLanguageIdentifier *new_NNetLanguageIdentifier(
94
+ int min_num_bytes, int max_num_bytes) {
95
+ return new chrome_lang_id::NNetLanguageIdentifier(
96
+ min_num_bytes, max_num_bytes);
97
+ }
98
+
99
+ EXPORT Result refer_to_nth_result(
100
+ std::vector<chrome_lang_id::NNetLanguageIdentifier::Result> *results,
101
+ std::size_t index) {
102
+ Result c;
103
+ auto& cc = (*results)[index];
104
+
105
+ c.language.data = cc.language.data();
106
+ c.language.size = cc.language.size();
107
+ c.byte_ranges.data = cc.byte_ranges.data();
108
+ c.byte_ranges.size = cc.byte_ranges.size();
109
+ c.probability = cc.probability;
110
+ c.proportion = cc.proportion;
111
+ c.is_reliable = cc.is_reliable;
112
+
113
+ return c;
66
114
  }
67
115
  }
Binary file
data/ext/cld3/offsetmap.o CHANGED
Binary file
data/ext/cld3/registry.o CHANGED
Binary file
Binary file
Binary file
@@ -19,11 +19,11 @@ limitations under the License.
19
19
 
20
20
  namespace chrome_lang_id {
21
21
 
22
- // Declare registry for the whole Sentence feature functions. NOTE: this is not
22
+ // Define registry for the whole Sentence feature functions. NOTE: this is not
23
23
  // yet set to anything meaningful. It will be set so in NNetLanguageIdentifier
24
24
  // constructor, *before* we use any feature.
25
25
  template <>
26
- WholeSentenceFeature::Registry
27
- *RegisterableClass<WholeSentenceFeature>::registry_ = nullptr;
26
+ WholeSentenceFeature::Registry*
27
+ RegisterableClass<WholeSentenceFeature>::registry_ = nullptr;
28
28
 
29
- } // namespace chrome_lang_id
29
+ } // namespace chrome_lang_id
@@ -26,9 +26,19 @@ limitations under the License.
26
26
  namespace chrome_lang_id {
27
27
 
28
28
  // Feature function that extracts features for the full Sentence.
29
- typedef FeatureFunction<Sentence> WholeSentenceFeature;
30
-
31
- typedef FeatureExtractor<Sentence> WholeSentenceExtractor;
29
+ using WholeSentenceFeature = FeatureFunction<Sentence>;
30
+
31
+ using WholeSentenceExtractor = FeatureExtractor<Sentence>;
32
+
33
+ // Declare registry for the whole Sentence feature functions. This is required
34
+ // for clang's -Wundefined-var-template. However, MSVC has a bug which treats
35
+ // this declaration as a definition, leading to multiple definition errors, so
36
+ // omit this on MSVC.
37
+ #if !defined(COMPILER_MSVC)
38
+ template <>
39
+ WholeSentenceFeature::Registry
40
+ *RegisterableClass<WholeSentenceFeature>::registry_;
41
+ #endif
32
42
 
33
43
  } // namespace chrome_lang_id
34
44
 
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
data/ext/cld3/utils.o CHANGED
Binary file
data/ext/cld3/workspace.o CHANGED
Binary file
@@ -0,0 +1,58 @@
1
+
2
+ # Copyright 2021 Akihiko Odaki <akihiko.odaki@gmail.com>
3
+ # All Rights Reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ # ==============================================================================
17
+
18
+ module CLD3
19
+ module Unstable
20
+ extend FFI::Library
21
+
22
+ ffi_lib File.join(__dir__, "..", "..", "ext", "cld3", "libcld3." + RbConfig::CONFIG["DLEXT"])
23
+
24
+ module NNetLanguageIdentifier
25
+ class Pointer < FFI::AutoPointer
26
+ def self.release(pointer)
27
+ Unstable.delete_NNetLanguageIdentifier(pointer)
28
+ end
29
+ end
30
+
31
+ class SpanInfo < FFI::Struct
32
+ layout :start_index, :int, :end_index, :int, :probability, :float
33
+ end
34
+
35
+ class Result < FFI::Struct
36
+ layout :language_data, :pointer, :language_size, :size_t, :byte_ranges_data, :pointer, :byte_ranges_size, :size_t, :probability, :float, :proportion, :float, :reliable?, :bool
37
+ end
38
+ end
39
+
40
+ attach_function :delete_NNetLanguageIdentifier, [ :pointer ], :void
41
+
42
+ attach_function :delete_result, [ :pointer ], :void
43
+
44
+ attach_function :delete_results, [ :pointer ], :void
45
+
46
+ attach_function :new_NNetLanguageIdentifier, [ :int, :int ], :pointer
47
+
48
+ attach_function :refer_to_nth_result, [ :pointer, :size_t ], NNetLanguageIdentifier::Result.by_value
49
+
50
+ attach_function :NNetLanguageIdentifier_find_language,
51
+ [ :pointer, :buffer_in, :size_t ], :pointer
52
+
53
+ attach_function :NNetLanguageIdentifier_find_top_n_most_freq_langs,
54
+ [ :pointer, :buffer_in, :size_t, :int ], :pointer
55
+ end
56
+
57
+ private_constant :Unstable
58
+ end
data/lib/cld3.rb CHANGED
@@ -1,7 +1,7 @@
1
1
  # File including an implementation of CLD3 module. Some documentation is
2
2
  # extracted from ext/cld3/ext/src/nnet_language_identifier.h.
3
3
  #
4
- # Copyright 2017 Akihiko Odaki <akihiko.odaki.4i@stu.hosei.ac.jp>
4
+ # Copyright 2017 Akihiko Odaki <akihiko.odaki@gmail.com>
5
5
  # All Rights Reserved.
6
6
  #
7
7
  # Licensed under the Apache License, Version 2.0 (the "License");
@@ -19,6 +19,7 @@
19
19
 
20
20
  require "ffi"
21
21
  require "rbconfig"
22
+ require "cld3/unstable"
22
23
 
23
24
  # Module providing an interface for Compact Language Detector v3 (CLD3)
24
25
  module CLD3
@@ -49,10 +50,16 @@ module CLD3
49
50
  # This is Numeric object.
50
51
  RELIABILITY_HR_BS_THRESHOLD = 0.5
51
52
 
53
+ # Holds probability that Span, specified by start/end indices, is a given
54
+ # language. The language is not stored here; it can be found in Result, which
55
+ # holds an Array of SpanInfo.
56
+ # @type const SpanInfo: untyped
57
+ SpanInfo = Struct.new(:start_index, :end_index, :probability)
58
+
52
59
  # Information about a predicted language.
53
60
  # This is an instance of Struct with the following members:
54
61
  #
55
- # [language] This is symbol or nil.
62
+ # [language] This is symbol.
56
63
  #
57
64
  # [probability] Language probability. This is Numeric object.
58
65
  #
@@ -61,33 +68,100 @@ module CLD3
61
68
  # [proportion] Proportion of bytes associated with the language. If
62
69
  # #find_language is called, this variable is set to 1.
63
70
  # This is Numeric object.
64
- Result = Struct.new("Result", :language, :probability, :reliable?, :proportion)
71
+ #
72
+ # [byte_ranges] Specifies the byte ranges in UTF-8 that |language| applies to.
73
+ # This is an Array of SpanInfo.
74
+ # @type const Result: untyped
75
+ Result = Struct.new(:language, :probability, :reliable?, :proportion, :byte_ranges)
65
76
 
66
77
  # The arguments are two Integer objects.
67
- def initialize(minNumBytes = MIN_NUM_BYTES_TO_CONSIDER, maxNumBytes = MAX_NUM_BYTES_TO_CONSIDER)
68
- @cc = Unstable::NNetLanguageIdentifier::Pointer.new(Unstable.new_NNetLanguageIdentifier(minNumBytes, maxNumBytes))
78
+ def initialize(min_num_bytes = MIN_NUM_BYTES_TO_CONSIDER, max_num_bytes = MAX_NUM_BYTES_TO_CONSIDER)
79
+ @cc = Unstable::NNetLanguageIdentifier::Pointer.new(Unstable.new_NNetLanguageIdentifier(min_num_bytes, max_num_bytes))
69
80
  end
70
81
 
71
82
  # Finds the most likely language for the given text, along with additional
72
83
  # information (e.g., probability). The prediction is based on the first N
73
84
  # bytes where N is the minimum between the number of interchange valid UTF8
74
85
  # bytes and +max_num_bytes_+. If N is less than +min_num_bytes_+ long, then
75
- # this function returns nil as language.
86
+ # this function returns nil.
76
87
  # The argument is a String object.
77
88
  # The returned value of this function is an instance of Result.
78
89
  def find_language(text)
79
90
  text_utf8 = text.encode(Encoding::UTF_8)
80
91
  pointer = FFI::MemoryPointer.new(:char, text_utf8.bytesize)
81
- pointer.put_bytes(0, text_utf8)
82
92
 
83
- cc_result = Unstable.NNetLanguageIdentifier_find_language(@cc, pointer, text_utf8.bytesize)
84
- language = cc_result[:language_data].read_bytes(cc_result[:language_size])
93
+ begin
94
+ pointer.put_bytes(0, text_utf8)
95
+
96
+ result = Unstable.NNetLanguageIdentifier_find_language(@cc, pointer, text_utf8.bytesize)
97
+ begin
98
+ convert_result Unstable::NNetLanguageIdentifier::Result.new(result)
99
+ ensure
100
+ Unstable.delete_result result
101
+ end
102
+ ensure
103
+ pointer.free
104
+ end
105
+ end
106
+
107
+ # Splits the input text (up to the first byte, if any, that is not
108
+ # interchange valid UTF8) into spans based on the script, predicts a language
109
+ # for each span, and returns a vector storing the top num_langs most frequent
110
+ # languages along with additional information (e.g., proportions). The number
111
+ # of bytes considered for each span is the minimum between the size of the
112
+ # span and +max_num_bytes_+. If more languages are requested than what is
113
+ # available in the input, then the number of the returned elements will be
114
+ # the number of the latter. Also, if the size of the span is less than
115
+ # +min_num_bytes_+ long, then the span is skipped. If the input text is too
116
+ # long, only the first +MAX_NUM_INPUT_BYTES_TO_CONSIDER+ bytes are processed.
117
+ # The first argument is a String object.
118
+ # The second argument is a Numeric object.
119
+ # The returned value of this function is an Array of Result instances.
120
+ def find_top_n_most_freq_langs(text, num_langs)
121
+ # @type var a: untyped
122
+
123
+ text_utf8 = text.encode(Encoding::UTF_8)
124
+ pointer = FFI::MemoryPointer.new(:char, text_utf8.bytesize)
125
+
126
+ begin
127
+ pointer.put_bytes(0, text_utf8)
128
+
129
+ results = Unstable.NNetLanguageIdentifier_find_top_n_most_freq_langs(@cc, pointer, text_utf8.bytesize, num_langs)
130
+ begin
131
+ a = num_langs.times
132
+ .lazy
133
+ .map { |index| convert_result Unstable.refer_to_nth_result(results, index) }
134
+ .take_while { |result| !result.nil? }
135
+ .to_a
136
+
137
+ a
138
+ ensure
139
+ Unstable.delete_results results
140
+ end
141
+ ensure
142
+ pointer.free
143
+ end
144
+ end
145
+
146
+ private
147
+
148
+ def convert_result(result)
149
+ language = result[:language_data].read_bytes(result[:language_size])
150
+ return nil if language == "und"
151
+
152
+ cursor = result[:byte_ranges_data]
153
+ byte_ranges = result[:byte_ranges_size].times.map do
154
+ info = Unstable::NNetLanguageIdentifier::SpanInfo.new(cursor)
155
+ cursor += Unstable::NNetLanguageIdentifier::SpanInfo.size
156
+ SpanInfo.new(info[:start_index], info[:end_index], info[:probability])
157
+ end
85
158
 
86
159
  Result.new(
87
- language == "und" ? nil : language.to_sym,
88
- cc_result[:probability],
89
- cc_result[:reliable?],
90
- cc_result[:proportion])
160
+ language.to_sym,
161
+ result[:probability],
162
+ result[:reliable?],
163
+ result[:proportion],
164
+ byte_ranges)
91
165
  end
92
166
  end
93
167
 
@@ -95,6 +169,7 @@ module CLD3
95
169
  # The model weights are loaded statically.
96
170
  module TaskContextParams
97
171
  # This is a frozen Array object containing symbols.
172
+ # @type const LANGUAGE_NAMES: untyped
98
173
  LANGUAGE_NAMES = [
99
174
  :eo, :co, :eu, :ta, :de, :mt, :ps, :te, :su, :uz, :'zh-Latn', :ne,
100
175
  :nl, :sw, :sq, :hmn, :ja, :no, :mn, :so, :ko, :kk, :sl, :ig,
@@ -108,31 +183,4 @@ module CLD3
108
183
  :sn, :yo, :pa, :ku,
109
184
  ].freeze
110
185
  end
111
-
112
- module Unstable
113
- extend FFI::Library
114
-
115
- ffi_lib File.join(File.expand_path(File.dirname(__FILE__)), "..", "ext", "cld3", "libcld3." + RbConfig::CONFIG["DLEXT"])
116
-
117
- module NNetLanguageIdentifier
118
- class Pointer < FFI::AutoPointer
119
- def self.release(pointer)
120
- Unstable.delete_NNetLanguageIdentifier(pointer)
121
- end
122
- end
123
-
124
- class Result < FFI::Struct
125
- layout :language_data, :pointer, :language_size, :size_t, :probability, :float, :proportion, :float, :reliable?, :bool
126
- end
127
- end
128
-
129
- attach_function :delete_NNetLanguageIdentifier, [ :pointer ], :void
130
-
131
- attach_function :new_NNetLanguageIdentifier, [ :int, :int ], :pointer
132
-
133
- attach_function :NNetLanguageIdentifier_find_language,
134
- [ :pointer, :buffer_in, :size_t ], NNetLanguageIdentifier::Result.by_value
135
- end
136
-
137
- private_constant :Unstable
138
186
  end
data/sig/cld3.rbs ADDED
@@ -0,0 +1,65 @@
1
+ # Copyright 2021 Akihiko Odaki <akihiko.odaki@gmail.com>
2
+ # All Rights Reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ # ==============================================================================
16
+
17
+ module CLD3
18
+ class NNetLanguageIdentifier
19
+ MIN_NUM_BYTES_TO_CONSIDER: Integer
20
+ MAX_NUM_BYTES_TO_CONSIDER: Integer
21
+ MAX_NUM_INPUT_BYTES_TO_CONSIDER: Integer
22
+ RELIABILITY_THRESHOLD: Float
23
+ RELIABILITY_HR_BS_THRESHOLD: Float
24
+
25
+ class SpanInfo < Struct[Float | Integer]
26
+ attr_accessor start_index(): Integer
27
+ attr_accessor end_index(): Integer
28
+ attr_accessor probability(): Float
29
+ end
30
+
31
+ class Result < Struct[Array[SpanInfo] | Float | TaskContextParams::language_names | bool]
32
+ attr_accessor language(): TaskContextParams::language_names
33
+ attr_accessor probability(): Float
34
+ attr_accessor reliable?(): bool
35
+ attr_accessor proportion(): Float
36
+ attr_accessor byte_ranges(): Array[SpanInfo]
37
+ end
38
+
39
+ def initialize: (?Integer, ?Integer) -> void
40
+ def find_language: (String) -> Result?
41
+ def find_top_n_most_freq_langs: (String, Integer) -> Array[Result]
42
+
43
+ private
44
+
45
+ def convert_result: (untyped) -> Result?
46
+ end
47
+
48
+ module TaskContextParams
49
+ type language_names =
50
+ :eo | :co | :eu | :ta | :de | :mt | :ps | :te | :su | :uz | :'zh-Latn' | :ne |
51
+ :nl | :sw | :sq | :hmn | :ja | :no | :mn | :so | :ko | :kk | :sl | :ig |
52
+ :mr | :th | :zu | :ml | :hr | :bs | :lo | :sd | :cy | :hy | :uk | :pt |
53
+ :lv | :iw | :cs | :vi | :jv | :be | :km | :mk | :tr | :fy | :am | :zh |
54
+ :da | :sv | :fi | :ht | :af | :la | :id | :fil | :sm | :ca | :el | :ka |
55
+ :sr | :it | :sk | :ru | :'ru-Latn' | :bg | :ny | :fa | :haw | :gl | :et |
56
+ :ms | :gd | :'bg-Latn' | :ha | :is | :ur | :mi | :hi | :bn | :'hi-Latn' | :fr |
57
+ :yi | :hu | :xh | :my | :tg | :ro | :ar | :lb | :'el-Latn' | :st | :ceb |
58
+ :kn | :az | :si | :ky | :mg | :en | :gu | :es | :pl | :'ja-Latn' | :ga | :lt |
59
+ :sn | :yo | :pa | :ku
60
+
61
+ LANGUAGE_NAMES: Array[language_names]
62
+ end
63
+
64
+ Unstable: untyped
65
+ end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cld3
3
3
  version: !ruby/object:Gem::Version
4
- version: 3.2.6
4
+ version: 3.4.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Akihiko Odaki
8
- autorequire:
8
+ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-01-04 00:00:00.000000000 Z
11
+ date: 2021-11-25 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: ffi
@@ -19,7 +19,7 @@ dependencies:
19
19
  version: 1.1.0
20
20
  - - "<"
21
21
  - !ruby/object:Gem::Version
22
- version: 1.12.0
22
+ version: 1.16.0
23
23
  type: :runtime
24
24
  prerelease: false
25
25
  version_requirements: !ruby/object:Gem::Requirement
@@ -29,7 +29,27 @@ dependencies:
29
29
  version: 1.1.0
30
30
  - - "<"
31
31
  - !ruby/object:Gem::Version
32
- version: 1.12.0
32
+ version: 1.16.0
33
+ - !ruby/object:Gem::Dependency
34
+ name: rbs
35
+ requirement: !ruby/object:Gem::Requirement
36
+ requirements:
37
+ - - ">="
38
+ - !ruby/object:Gem::Version
39
+ version: 1.7.0
40
+ - - "<"
41
+ - !ruby/object:Gem::Version
42
+ version: 1.8.0
43
+ type: :development
44
+ prerelease: false
45
+ version_requirements: !ruby/object:Gem::Requirement
46
+ requirements:
47
+ - - ">="
48
+ - !ruby/object:Gem::Version
49
+ version: 1.7.0
50
+ - - "<"
51
+ - !ruby/object:Gem::Version
52
+ version: 1.8.0
33
53
  - !ruby/object:Gem::Dependency
34
54
  name: rspec
35
55
  requirement: !ruby/object:Gem::Requirement
@@ -39,7 +59,7 @@ dependencies:
39
59
  version: 3.0.0
40
60
  - - "<"
41
61
  - !ruby/object:Gem::Version
42
- version: 3.10.0
62
+ version: 3.11.0
43
63
  type: :development
44
64
  prerelease: false
45
65
  version_requirements: !ruby/object:Gem::Requirement
@@ -49,10 +69,30 @@ dependencies:
49
69
  version: 3.0.0
50
70
  - - "<"
51
71
  - !ruby/object:Gem::Version
52
- version: 3.10.0
72
+ version: 3.11.0
73
+ - !ruby/object:Gem::Dependency
74
+ name: steep
75
+ requirement: !ruby/object:Gem::Requirement
76
+ requirements:
77
+ - - ">="
78
+ - !ruby/object:Gem::Version
79
+ version: 0.46.0
80
+ - - "<"
81
+ - !ruby/object:Gem::Version
82
+ version: 0.47.0
83
+ type: :development
84
+ prerelease: false
85
+ version_requirements: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: 0.46.0
90
+ - - "<"
91
+ - !ruby/object:Gem::Version
92
+ version: 0.47.0
53
93
  description: Compact Language Detector v3 (CLD3) is a neural network model for language
54
94
  identification.
55
- email: akihiko.odaki.4i@stu.hosei.ac.jp
95
+ email: akihiko.odaki@gmail.com
56
96
  executables: []
57
97
  extensions:
58
98
  - ext/cld3/extconf.rb
@@ -106,6 +146,7 @@ files:
106
146
  - ext/cld3/language_identifier_features.cc
107
147
  - ext/cld3/language_identifier_features.h
108
148
  - ext/cld3/language_identifier_features.o
149
+ - ext/cld3/libcld3.def
109
150
  - ext/cld3/libcld3.so
110
151
  - ext/cld3/mkmf.log
111
152
  - ext/cld3/nnet_language_identifier.cc
@@ -159,11 +200,13 @@ files:
159
200
  - ext/cld3/workspace.h
160
201
  - ext/cld3/workspace.o
161
202
  - lib/cld3.rb
203
+ - lib/cld3/unstable.rb
204
+ - sig/cld3.rbs
162
205
  homepage: https://github.com/akihikodaki/cld3-ruby
163
206
  licenses:
164
207
  - Apache-2.0
165
208
  metadata: {}
166
- post_install_message:
209
+ post_install_message:
167
210
  rdoc_options: []
168
211
  require_paths:
169
212
  - lib
@@ -171,18 +214,18 @@ required_ruby_version: !ruby/object:Gem::Requirement
171
214
  requirements:
172
215
  - - ">="
173
216
  - !ruby/object:Gem::Version
174
- version: 2.3.0
217
+ version: 2.6.0
175
218
  - - "<"
176
219
  - !ruby/object:Gem::Version
177
- version: 2.8.0
220
+ version: 3.2.0
178
221
  required_rubygems_version: !ruby/object:Gem::Requirement
179
222
  requirements:
180
223
  - - ">="
181
224
  - !ruby/object:Gem::Version
182
225
  version: '0'
183
226
  requirements: []
184
- rubygems_version: 3.0.6
185
- signing_key:
227
+ rubygems_version: 3.2.22
228
+ signing_key:
186
229
  specification_version: 4
187
230
  summary: Compact Language Detector v3 (CLD3)
188
231
  test_files: []