RubyGems - cld3 - Versions diffs - 3.2.6 → 3.4.3 - Mend

cld3 3.2.6 → 3.4.3

Files changed (48) hide show

checksums.yaml +4 -4
data/Gemfile +1 -1
data/LICENSE +2 -2
data/README.md +3 -3
data/cld3.gemspec +9 -7
data/ext/cld3/Makefile +45 -44
data/ext/cld3/base.o +0 -0
data/ext/cld3/embedding_feature_extractor.o +0 -0
data/ext/cld3/embedding_network.cc +1 -0
data/ext/cld3/embedding_network.o +0 -0
data/ext/cld3/extconf.rb +3 -2
data/ext/cld3/feature_extractor.o +0 -0
data/ext/cld3/feature_extractor.pb.o +0 -0
data/ext/cld3/feature_types.o +0 -0
data/ext/cld3/fixunicodevalue.o +0 -0
data/ext/cld3/fml_parser.o +0 -0
data/ext/cld3/generated_entities.o +0 -0
data/ext/cld3/generated_ulscript.o +0 -0
data/ext/cld3/getonescriptspan.h +2 -2
data/ext/cld3/getonescriptspan.o +0 -0
data/ext/cld3/lang_id_nn_params.o +0 -0
data/ext/cld3/language_identifier_features.o +0 -0
data/ext/cld3/libcld3.def +8 -0
data/ext/cld3/libcld3.so +0 -0
data/ext/cld3/mkmf.log +10 -9
data/ext/cld3/nnet_language_identifier.cc +3 -5
data/ext/cld3/nnet_language_identifier.o +0 -0
data/ext/cld3/nnet_language_identifier_c.cc +71 -23
data/ext/cld3/nnet_language_identifier_c.o +0 -0
data/ext/cld3/offsetmap.o +0 -0
data/ext/cld3/registry.o +0 -0
data/ext/cld3/relevant_script_feature.o +0 -0
data/ext/cld3/sentence.pb.o +0 -0
data/ext/cld3/sentence_features.cc +4 -4
data/ext/cld3/sentence_features.h +13 -3
data/ext/cld3/sentence_features.o +0 -0
data/ext/cld3/task_context.o +0 -0
data/ext/cld3/task_context_params.o +0 -0
data/ext/cld3/task_spec.pb.o +0 -0
data/ext/cld3/text_processing.o +0 -0
data/ext/cld3/unicodetext.o +0 -0
data/ext/cld3/utf8statetable.o +0 -0
data/ext/cld3/utils.o +0 -0
data/ext/cld3/workspace.o +0 -0
data/lib/cld3/unstable.rb +58 -0
data/lib/cld3.rb +88 -40
data/sig/cld3.rbs +65 -0
metadata +56 -13

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 0ee3c6166247aaf958310ffa9976400fcfa5050eb1969dd17e186e3500dd06d9
-  data.tar.gz: bfa75958e205683dfa2429d388bb40d18d237ef5c2b5877a9fd718489f95b7bd
+  metadata.gz: 2c161cbf12d260074efd2e9db3981b6615af20ee04c234d6b2710bd52a283a4e
+  data.tar.gz: c388ae6b529d95e015ecdb7d21cdd7f1ceaca72d167d0f8008b5477d5bce5b3c
 SHA512:
-  metadata.gz: ac1fb08ebf438995878bb7c992bc2e2a71adf9d0f06f01316121b6d8d48f5b8f2f1ea9a3f68f501dad6682168b7a3e16b0137be16cae7ad876d0bce9f6d866e7
-  data.tar.gz: 40e1036c1c7e08af1caed6efd187a04bb8883f9ff427c61824193f72f58e696279611575f7a63b8cbd80fff2c9f20da644807e71be2ea281a8d870e3721410bd
+  metadata.gz: 8e3c1c07283730e722c450acc308a497756fd501595a02a7fc066d0b3e59b96e1ab1e7941549293b02e41274b176772bdae3779a041eb28f8ae53f5c44308cc0
+  data.tar.gz: 52e95027de7a595b2eabc49745a11f664e305c18f9926bc9d649642a92fea9846efdd23da699529795d80609b8871b00e77f9379449d2e4f6cb79ecbcf2785db

data/Gemfile CHANGED Viewed

@@ -1,4 +1,4 @@
-# Copyright 2017 Akihiko Odaki <akihiko.odaki.4i@stu.hosei.ac.jp>
+# Copyright 2017 Akihiko Odaki <akihiko.odaki@gmail.com>
 # All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");

data/LICENSE CHANGED Viewed

@@ -1,4 +1,4 @@
-Copyright 2017 Akihiko Odaki <akihiko.odaki.4i@stu.hosei.ac.jp>
+Copyright 2017 Akihiko Odaki <akihiko.odaki@gmail.com>
 All rights reserved.
                                  Apache License
@@ -189,7 +189,7 @@ All rights reserved.
       same "printed page" as the copyright notice for easier
       identification within third-party archives.
-   Copyright 2017, Akihiko Odaki <akihiko.odaki.4i@stu.hosei.ac.jp>
+   Copyright 2017, Akihiko Odaki <akihiko.odaki@gmail.com>
    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.

data/README.md CHANGED Viewed

@@ -8,11 +8,11 @@ require 'cld3'
 cld3 = CLD3::NNetLanguageIdentifier.new(0, 1000)
-cld3.find_language("こんにちは") # => #<struct Struct::Result language=:ja, probability=1.0, reliable?=true, proportion=1.0>
+cld3.find_language("こんにちは") # => #<struct Struct::Result language=:ja, probability=1.0, reliable?=true, proportion=1.0, byte_ranges=[]>
-cld3.find_language("This is a pen.") # => #<struct Struct::Result language=:en, probability=0.9999408721923828, reliable?=true, proportion=1.0>
+cld3.find_language("This is a pen.") # => #<struct Struct::Result language=:en, probability=0.9999408721923828, reliable?=true, proportion=1.0, byte_ranges=[]>
-cld3.find_language("здравствуйте") # => #<struct Struct::Result language=:ru, probability=0.3140212297439575, reliable?=false, proportion=1.0>
+cld3.find_language("здравствуйте") # => #<struct Struct::Result language=:ru, probability=0.3140212297439575, reliable?=false, proportion=1.0, byte_ranges=[]>
 ```
 ## Installation

data/cld3.gemspec CHANGED Viewed

@@ -1,4 +1,4 @@
-# Copyright 2017 Akihiko Odaki <akihiko.odaki.4i@stu.hosei.ac.jp>
+# Copyright 2017 Akihiko Odaki <akihiko.odaki@gmail.com>
 # All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -16,19 +16,21 @@
 Gem::Specification.new do |gem|
   gem.name = "cld3"
-  gem.version = "3.2.6"
+  gem.version = "3.4.3"
   gem.summary = "Compact Language Detector v3 (CLD3)"
   gem.description = "Compact Language Detector v3 (CLD3) is a neural network model for language identification."
   gem.license = "Apache-2.0"
   gem.homepage = "https://github.com/akihikodaki/cld3-ruby"
   gem.author = "Akihiko Odaki"
-  gem.email = "akihiko.odaki.4i@stu.hosei.ac.jp"
-  gem.required_ruby_version = [ ">= 2.3.0", "< 2.8.0" ]
-  gem.add_dependency "ffi", [ ">= 1.1.0", "< 1.12.0" ]
-  gem.add_development_dependency "rspec", [ ">=3.0.0", "< 3.10.0" ]
+  gem.email = "akihiko.odaki@gmail.com"
+  gem.required_ruby_version = [ ">= 2.6.0", "< 3.2.0" ]
+  gem.add_dependency "ffi", [ ">= 1.1.0", "< 1.16.0" ]
+  gem.add_development_dependency "rbs", [ ">= 1.7.0", "< 1.8.0" ]
+  gem.add_development_dependency "rspec", [ ">=3.0.0", "< 3.11.0" ]
+  gem.add_development_dependency "steep", [ ">= 0.46.0", "< 0.47.0" ]
   gem.files = Dir[
     "Gemfile", "LICENSE", "LICENSE_CLD3", "README.md",
-    "cld3.gemspec", "ext/**/*", "lib/**/*"
+    "cld3.gemspec", "ext/**/*", "lib/**/*", "sig/**/*"
   ]
   gem.require_paths = [ "lib" ]
   gem.extensions = [ "ext/cld3/extconf.rb" ]

data/ext/cld3/Makefile CHANGED Viewed

@@ -2,7 +2,7 @@
 SHELL = /bin/sh
 # V=0 quiet, V=1 verbose.  other values don't work.
-V = 0
+V = 1
 Q1 = $(V:1=)
 Q = $(Q1:0=@)
 ECHO1 = $(V:1=@ :)
@@ -12,54 +12,55 @@ NULLCMD = :
 #### Start of system configuration section. ####
 srcdir = .
-topdir = /usr/include/ruby-2.6.0
+topdir = /usr/include
 hdrdir = $(topdir)
-arch_hdrdir = /usr/include/ruby-2.6.0/x86_64-linux
+arch_hdrdir = /usr/include
 PATH_SEPARATOR = :
 VPATH = $(srcdir):$(arch_hdrdir)/ruby:$(hdrdir)/ruby
 prefix = $(DESTDIR)/usr
-rubysitearchprefix = $(rubylibprefix)/$(sitearch)
-rubyarchprefix = $(rubylibprefix)/$(arch)
-rubylibprefix = $(libdir)/$(RUBY_BASE_NAME)
-exec_prefix = $(prefix)
-vendorarchhdrdir = $(vendorhdrdir)/$(sitearch)
-sitearchhdrdir = $(sitehdrdir)/$(sitearch)
-rubyarchhdrdir = $(rubyhdrdir)/$(arch)
+rubysitearchprefix = $(sitearchlibdir)/$(RUBY_BASE_NAME)
+rubyarchprefix = $(DESTDIR)/usr/lib64/ruby
+rubylibprefix = $(exec_prefix)/share/ruby
+exec_prefix = $(DESTDIR)/usr
+vendorarchhdrdir = $(vendorhdrdir)/$(arch)
+sitearchhdrdir = $(sitehdrdir)/$(arch)
+rubyarchhdrdir = $(DESTDIR)/usr/include
 vendorhdrdir = $(rubyhdrdir)/vendor_ruby
 sitehdrdir = $(rubyhdrdir)/site_ruby
-rubyhdrdir = $(includedir)/$(RUBY_VERSION_NAME)
-vendorarchdir = $(vendorlibdir)/$(sitearch)
-vendorlibdir = $(vendordir)/$(ruby_version)
-vendordir = $(rubylibprefix)/vendor_ruby
-sitearchdir = $(sitelibdir)/$(sitearch)
-sitelibdir = $(sitedir)/$(ruby_version)
-sitedir = $(rubylibprefix)/site_ruby
-rubyarchdir = $(rubylibdir)/$(arch)
-rubylibdir = $(rubylibprefix)/$(ruby_version)
+rubyhdrdir = $(DESTDIR)/usr/include
+rubygemsdir = $(DESTDIR)/usr/share/rubygems
+vendorarchdir = $(DESTDIR)/usr/lib64/ruby/vendor_ruby
+vendorlibdir = $(vendordir)
+vendordir = $(DESTDIR)/usr/share/ruby/vendor_ruby
+sitearchdir = $(DESTDIR)/usr/local/lib64/ruby/site_ruby
+sitelibdir = $(sitedir)
+sitedir = $(DESTDIR)/usr/local/share/ruby/site_ruby
+rubyarchdir = $(rubyarchprefix)
+rubylibdir = $(rubylibprefix)
 sitearchincludedir = $(includedir)/$(sitearch)
 archincludedir = $(includedir)/$(arch)
 sitearchlibdir = $(libdir)/$(sitearch)
-archlibdir = $(libdir)/$(arch)
+archlibdir = $(DESTDIR)/usr/lib64
 ridir = $(datarootdir)/$(RI_BASE_NAME)
-mandir = $(datarootdir)/man
+mandir = $(DESTDIR)/usr/share/man
 localedir = $(datarootdir)/locale
-libdir = $(exec_prefix)/lib
+libdir = $(exec_prefix)/lib64
 psdir = $(docdir)
 pdfdir = $(docdir)
 dvidir = $(docdir)
 htmldir = $(docdir)
-infodir = $(datarootdir)/info
+infodir = $(DESTDIR)/usr/share/info
 docdir = $(datarootdir)/doc/$(PACKAGE)
 oldincludedir = $(DESTDIR)/usr/include
-includedir = $(prefix)/include
+includedir = $(DESTDIR)/usr/include
 runstatedir = $(localstatedir)/run
 localstatedir = $(DESTDIR)/var
 sharedstatedir = $(DESTDIR)/var/lib
 sysconfdir = $(DESTDIR)/etc
-datadir = $(datarootdir)
+datadir = $(DESTDIR)/usr/share
 datarootdir = $(prefix)/share
-libexecdir = $(DESTDIR)/usr/lib/ruby
-sbindir = $(exec_prefix)/sbin
+libexecdir = $(DESTDIR)/usr/libexec
+sbindir = $(DESTDIR)/usr/sbin
 bindir = $(exec_prefix)/bin
 archdir = $(rubyarchdir)
@@ -78,36 +79,36 @@ CSRCFLAG = $(empty)
 RUBY_EXTCONF_H =
 cflags   = $(optflags) $(debugflags) $(warnflags)
-cxxflags = $(optflags) $(debugflags) $(warnflags)
+cxxflags =
 optflags = -O3
 debugflags = -ggdb3
-warnflags = -Wall -Wextra -Wdeclaration-after-statement -Wdeprecated-declarations -Wduplicated-cond -Wimplicit-function-declaration -Wimplicit-int -Wmisleading-indentation -Wpointer-arith -Wrestrict -Wwrite-strings -Wimplicit-fallthrough=0 -Wmissing-noreturn -Wno-cast-function-type -Wno-constant-logical-operand -Wno-long-long -Wno-missing-field-initializers -Wno-overlength-strings -Wno-packed-bitfield-compat -Wno-parentheses-equality -Wno-self-assign -Wno-tautological-compare -Wno-unused-parameter -Wno-unused-value -Wsuggest-attribute=format -Wsuggest-attribute=noreturn -Wunused-variable
+warnflags = -Wall -Wextra -Wdeprecated-declarations -Wduplicated-cond -Wimplicit-function-declaration -Wimplicit-int -Wmisleading-indentation -Wpointer-arith -Wwrite-strings -Wimplicit-fallthrough=0 -Wmissing-noreturn -Wno-cast-function-type -Wno-constant-logical-operand -Wno-long-long -Wno-missing-field-initializers -Wno-overlength-strings -Wno-packed-bitfield-compat -Wno-parentheses-equality -Wno-self-assign -Wno-tautological-compare -Wno-unused-parameter -Wno-unused-value -Wsuggest-attribute=format -Wsuggest-attribute=noreturn -Wunused-variable
 cppflags =
 CCDLFLAGS = -fPIC
-CFLAGS   = $(CCDLFLAGS) -march=x86-64 -mtune=generic -O2 -pipe -fno-plt -fPIC -pthread $(ARCH_FLAG)
+CFLAGS   = $(CCDLFLAGS) -O2  -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/redhat/redhat-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/redhat/redhat-annobin-cc1  -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -fPIC  $(ARCH_FLAG)
 INCFLAGS = -I. -I$(arch_hdrdir) -I$(hdrdir)/ruby/backward -I$(hdrdir) -I$(srcdir)
 DEFS     =
-CPPFLAGS =  -D_FORTIFY_SOURCE=2 $(DEFS) $(cppflags)
-CXXFLAGS = $(CCDLFLAGS) -march=x86-64 -mtune=generic -O2 -pipe -fno-plt -pthread -fvisibility=hidden -std=c++11 $(ARCH_FLAG)
-ldflags  = -L. -Wl,-O1,--sort-common,--as-needed,-z,relro,-z,now -fstack-protector-strong -rdynamic -Wl,-export-dynamic
-dldflags = -Wl,-O1,--sort-common,--as-needed,-z,relro,-z,now -Wl,--compress-debug-sections=zlib
+CPPFLAGS =   $(DEFS) $(cppflags)
+CXXFLAGS = $(CCDLFLAGS) -O2  -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/redhat/redhat-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/redhat/redhat-annobin-cc1  -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection  -fvisibility=hidden -std=c++11 $(ARCH_FLAG)
+ldflags  = -L. -Wl,-z,relro -Wl,--as-needed  -Wl,-z,now -specs=/usr/lib/rpm/redhat/redhat-hardened-ld  -fstack-protector-strong -rdynamic -Wl,-export-dynamic
+dldflags = -Wl,-z,relro -Wl,--as-needed  -Wl,-z,now -specs=/usr/lib/rpm/redhat/redhat-hardened-ld
 ARCH_FLAG =
 DLDFLAGS = $(ldflags) $(dldflags) $(ARCH_FLAG)
 LDSHARED = $(CC) -shared
 LDSHAREDXX = $(CXX) -shared
-AR = ar
+AR = gcc-ar
 EXEEXT =
 RUBY_INSTALL_NAME = $(RUBY_BASE_NAME)
 RUBY_SO_NAME = ruby
 RUBYW_INSTALL_NAME =
-RUBY_VERSION_NAME = $(RUBY_BASE_NAME)-$(ruby_version)
+RUBY_VERSION_NAME = $(RUBY_BASE_NAME)-$(ruby_version_dir_name)
 RUBYW_BASE_NAME = rubyw
 RUBY_BASE_NAME = ruby
-arch = x86_64-linux
+arch = aarch64-linux
 sitearch = $(arch)
-ruby_version = 2.6.0
+ruby_version = 3.0.0
 ruby = $(bindir)/$(RUBY_BASE_NAME)
 RUBY = $(ruby)
 ruby_headers = $(hdrdir)/ruby.h $(hdrdir)/ruby/backward.h $(hdrdir)/ruby/ruby.h $(hdrdir)/ruby/defines.h $(hdrdir)/ruby/missing.h $(hdrdir)/ruby/intern.h $(hdrdir)/ruby/st.h $(hdrdir)/ruby/subst.h $(arch_hdrdir)/ruby/config.h
@@ -125,8 +126,8 @@ TOUCH = exit >
 #### End of system configuration section. ####
 preload =
-libpath = . $(libdir)
-LIBPATH =  -L. -L$(libdir)
+libpath = . $(archlibdir)
+LIBPATH =  -L. -L$(archlibdir)
 DEFFILE =
 CLEANFILES = mkmf.log
@@ -137,11 +138,11 @@ extout =
 extout_prefix =
 target_prefix =
 LOCAL_LIBS =
-LIBS = $(LIBRUBYARG_SHARED)  -lprotobuf -lm   -lc
+LIBS =   -lprotobuf -lpthread -lm   -lc
 ORIG_SRCS = base.cc embedding_feature_extractor.cc embedding_network.cc feature_extractor.cc feature_extractor.pb.cc feature_types.cc fixunicodevalue.cc fml_parser.cc generated_entities.cc generated_ulscript.cc getonescriptspan.cc lang_id_nn_params.cc language_identifier_features.cc nnet_language_identifier.cc nnet_language_identifier_c.cc offsetmap.cc registry.cc relevant_script_feature.cc sentence.pb.cc sentence_features.cc task_context.cc task_context_params.cc task_spec.pb.cc text_processing.cc unicodetext.cc utf8statetable.cc utils.cc workspace.cc
 SRCS = $(ORIG_SRCS)
 OBJS = base.o embedding_feature_extractor.o embedding_network.o feature_extractor.o feature_extractor.pb.o feature_types.o fixunicodevalue.o fml_parser.o generated_entities.o generated_ulscript.o getonescriptspan.o lang_id_nn_params.o language_identifier_features.o nnet_language_identifier.o nnet_language_identifier_c.o offsetmap.o registry.o relevant_script_feature.o sentence.pb.o sentence_features.o task_context.o task_context_params.o task_spec.pb.o text_processing.o unicodetext.o utf8statetable.o utils.o workspace.o
-HDRS = $(srcdir)/base.h $(srcdir)/casts.h $(srcdir)/embedding_feature_extractor.h $(srcdir)/embedding_network.h $(srcdir)/embedding_network_params.h $(srcdir)/feature_extractor.h $(srcdir)/feature_types.h $(srcdir)/float16.h $(srcdir)/fml_parser.h $(srcdir)/language_identifier_features.h $(srcdir)/lang_id_nn_params.h $(srcdir)/nnet_language_identifier.h $(srcdir)/registry.h $(srcdir)/relevant_script_feature.h $(srcdir)/script_detector.h $(srcdir)/sentence_features.h $(srcdir)/simple_adder.h $(srcdir)/fixunicodevalue.h $(srcdir)/generated_ulscript.h $(srcdir)/getonescriptspan.h $(srcdir)/integral_types.h $(srcdir)/offsetmap.h $(srcdir)/port.h $(srcdir)/stringpiece.h $(srcdir)/text_processing.h $(srcdir)/utf8acceptinterchange.h $(srcdir)/utf8prop_lettermarkscriptnum.h $(srcdir)/utf8repl_lettermarklower.h $(srcdir)/utf8scannot_lettermarkspecial.h $(srcdir)/utf8statetable.h $(srcdir)/task_context.h $(srcdir)/task_context_params.h $(srcdir)/unicodetext.h $(srcdir)/utils.h $(srcdir)/workspace.h $(srcdir)/feature_extractor.pb.h $(srcdir)/sentence.pb.h $(srcdir)/task_spec.pb.h
+HDRS = $(srcdir)/base.h $(srcdir)/casts.h $(srcdir)/embedding_feature_extractor.h $(srcdir)/embedding_network.h $(srcdir)/embedding_network_params.h $(srcdir)/feature_extractor.h $(srcdir)/feature_extractor.pb.h $(srcdir)/feature_types.h $(srcdir)/fixunicodevalue.h $(srcdir)/float16.h $(srcdir)/fml_parser.h $(srcdir)/generated_ulscript.h $(srcdir)/getonescriptspan.h $(srcdir)/integral_types.h $(srcdir)/lang_id_nn_params.h $(srcdir)/language_identifier_features.h $(srcdir)/nnet_language_identifier.h $(srcdir)/offsetmap.h $(srcdir)/port.h $(srcdir)/registry.h $(srcdir)/relevant_script_feature.h $(srcdir)/script_detector.h $(srcdir)/sentence.pb.h $(srcdir)/sentence_features.h $(srcdir)/simple_adder.h $(srcdir)/stringpiece.h $(srcdir)/task_context.h $(srcdir)/task_context_params.h $(srcdir)/task_spec.pb.h $(srcdir)/text_processing.h $(srcdir)/unicodetext.h $(srcdir)/utf8acceptinterchange.h $(srcdir)/utf8prop_lettermarkscriptnum.h $(srcdir)/utf8repl_lettermarklower.h $(srcdir)/utf8scannot_lettermarkspecial.h $(srcdir)/utf8statetable.h $(srcdir)/utils.h $(srcdir)/workspace.h
 LOCAL_HDRS =
 TARGET = libcld3
 TARGET_NAME = libcld3
@@ -155,8 +156,8 @@ BINDIR        = $(bindir)
 RUBYCOMMONDIR = $(sitedir)$(target_prefix)
 RUBYLIBDIR    = $(sitelibdir)$(target_prefix)
 RUBYARCHDIR   = $(sitearchdir)$(target_prefix)
-HDRDIR        = $(rubyhdrdir)/ruby$(target_prefix)
-ARCHHDRDIR    = $(rubyhdrdir)/$(arch)/ruby$(target_prefix)
+HDRDIR        = $(sitehdrdir)$(target_prefix)
+ARCHHDRDIR    = $(sitearchhdrdir)$(target_prefix)
 TARGET_SO_DIR =
 TARGET_SO     = $(TARGET_SO_DIR)$(DLLIB)
 CLEANLIBS     = $(TARGET_SO)

data/ext/cld3/base.o CHANGED Viewed

Binary file

data/ext/cld3/embedding_feature_extractor.o CHANGED Viewed

Binary file

data/ext/cld3/embedding_network.cc CHANGED Viewed

@@ -167,6 +167,7 @@ EmbeddingNetwork::EmbeddingNetwork(const EmbeddingNetworkParams *model)
   for (int i = 0; i < model_->embedding_dim_size(); ++i) {
     CLD3_DCHECK(offset_sum == model_->concat_offset(i));
     offset_sum += model_->embedding_dim(i) * model_->embedding_num_features(i);
+    (void)offset_sum;  // Avoid compiler warning for "unused" variable.
     embedding_matrices_.emplace_back(model_->GetEmbeddingMatrix(i));
   }

data/ext/cld3/embedding_network.o CHANGED Viewed

Binary file

data/ext/cld3/extconf.rb CHANGED Viewed

@@ -1,4 +1,4 @@
-# Copyright 2017 Akihiko Odaki <akihiko.odaki.4i@stu.hosei.ac.jp>
+# Copyright 2017 Akihiko Odaki <akihiko.odaki@gmail.com>
 # All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -33,7 +33,7 @@ FileUtils.mkdir_p("cld_3/protos")
 FileUtils.mkdir_p("script_span")
 [ "feature_extractor", "sentence", "task_spec" ].each {|name|
-  `protoc '#{name}.proto' --cpp_out=.`
+  system "protoc", "#{name}.proto", "--cpp_out=.", exception: true
   ln_fallback("#{name}.pb.h", "cld_3/protos/#{name}.pb.h")
 }
@@ -56,4 +56,5 @@ FileUtils.mkdir_p("script_span")
 }
 $CXXFLAGS += " -fvisibility=hidden -std=c++11"
+$LIBRUBYARG = ""
 create_makefile("libcld3")

data/ext/cld3/feature_extractor.o CHANGED Viewed

Binary file

data/ext/cld3/feature_extractor.pb.o CHANGED Viewed

Binary file

data/ext/cld3/feature_types.o CHANGED Viewed

Binary file

data/ext/cld3/fixunicodevalue.o CHANGED Viewed

Binary file

data/ext/cld3/fml_parser.o CHANGED Viewed

Binary file

data/ext/cld3/generated_entities.o CHANGED Viewed

Binary file

data/ext/cld3/generated_ulscript.o CHANGED Viewed

Binary file

data/ext/cld3/getonescriptspan.h CHANGED Viewed

@@ -33,14 +33,14 @@ static const int kMaxScriptBytes = kMaxScriptBuffer - 32;   // Leave some room
 static const int kWithinScriptTail = 32;    // Stop at word space in last
                                             // N bytes of script buffer
-typedef struct {
+struct LangSpan {
   char* text = nullptr;   // Pointer to the span, somewhere
   int text_bytes = 0;     // Number of bytes of text in the span
   int offset = 0;         // Offset of start of span in original input buffer
   ULScript ulscript = UNKNOWN_ULSCRIPT; // Unicode Letters Script of this span
   bool truncated = false; // true if buffer filled up before a
                           // different script or EOF was found
-} LangSpan;
+};
 static inline bool IsContinuationByte(char c) {
   return static_cast<signed char>(c) < -64;

data/ext/cld3/getonescriptspan.o CHANGED Viewed

Binary file

data/ext/cld3/lang_id_nn_params.o CHANGED Viewed

Binary file

data/ext/cld3/language_identifier_features.o CHANGED Viewed

Binary file

data/ext/cld3/libcld3.def ADDED Viewed

@@ -0,0 +1,8 @@
+EXPORTS
+  NNetLanguageIdentifier_find_language
+  NNetLanguageIdentifier_find_top_n_most_freq_langs
+  delete_NNetLanguageIdentifier
+  delete_result
+  delete_results
+  new_NNetLanguageIdentifier
+  refer_to_nth_result

data/ext/cld3/libcld3.so CHANGED Viewed

Binary file

data/ext/cld3/mkmf.log CHANGED Viewed

@@ -1,36 +1,37 @@
 "pkg-config --exists protobuf"
 | pkg-config --libs protobuf
-=> "-lprotobuf \n"
-"gcc -o conftest -I/usr/include/ruby-2.6.0/x86_64-linux -I/usr/include/ruby-2.6.0/ruby/backward -I/usr/include/ruby-2.6.0 -I. -D_FORTIFY_SOURCE=2   -march=x86-64 -mtune=generic -O2 -pipe -fno-plt -fPIC conftest.c  -L. -L/usr/lib -L. -Wl,-O1,--sort-common,--as-needed,-z,relro,-z,now -fstack-protector-strong -rdynamic -Wl,-export-dynamic     -lruby  -lm   -lc"
+=> "-lprotobuf -lpthread \n"
+"gcc -o conftest -I/usr/include -I/usr/include/ruby/backward -I/usr/include -I.    -O2  -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/redhat/redhat-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/redhat/redhat-annobin-cc1  -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -fPIC conftest.c  -L. -L/usr/lib64 -L. -Wl,-z,relro -Wl,--as-needed  -Wl,-z,now -specs=/usr/lib/rpm/redhat/redhat-hardened-ld  -fstack-protector-strong -rdynamic -Wl,-export-dynamic     -lruby  -lm   -lc"
 checked program was:
 /* begin */
 1: #include "ruby.h"
 2:
 3: int main(int argc, char **argv)
 4: {
-5:   return 0;
+5:   return !!argv[argc];
 6: }
 /* end */
-"gcc -o conftest -I/usr/include/ruby-2.6.0/x86_64-linux -I/usr/include/ruby-2.6.0/ruby/backward -I/usr/include/ruby-2.6.0 -I. -D_FORTIFY_SOURCE=2   -march=x86-64 -mtune=generic -O2 -pipe -fno-plt -fPIC conftest.c  -L. -L/usr/lib -L. -Wl,-O1,--sort-common,--as-needed,-z,relro,-z,now -fstack-protector-strong -rdynamic -Wl,-export-dynamic     -lruby -lprotobuf -lm   -lc"
+"gcc -o conftest -I/usr/include -I/usr/include/ruby/backward -I/usr/include -I.    -O2  -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/redhat/redhat-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/redhat/redhat-annobin-cc1  -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -fPIC conftest.c  -L. -L/usr/lib64 -L. -Wl,-z,relro -Wl,--as-needed  -Wl,-z,now -specs=/usr/lib/rpm/redhat/redhat-hardened-ld  -fstack-protector-strong -rdynamic -Wl,-export-dynamic     -lruby -lprotobuf -lpthread -lm   -lc"
 checked program was:
 /* begin */
 1: #include "ruby.h"
 2:
 3: int main(int argc, char **argv)
 4: {
-5:   return 0;
+5:   return !!argv[argc];
 6: }
 /* end */
 | pkg-config --cflags-only-I protobuf
 => "\n"
 | pkg-config --cflags-only-other protobuf
-=> "-pthread \n"
+=> "\n"
 | pkg-config --libs-only-l protobuf
-=> "-lprotobuf \n"
+=> "-lprotobuf -lpthread \n"
 package configuration for protobuf
-cflags: -pthread
+incflags:
+cflags:
 ldflags:
-libs: -lprotobuf
+libs: -lprotobuf -lpthread

data/ext/cld3/nnet_language_identifier.cc CHANGED Viewed

@@ -284,8 +284,6 @@ NNetLanguageIdentifier::FindTopNMostFreqLangs(const string &text,
   CLD2::LangSpan script_span;
   std::unordered_map<string, LangChunksStats> lang_stats;
   int total_num_bytes = 0;
-  Result result;
-  string language;
   int chunk_size = 0;  // Use the default.
   while (ss.GetOneScriptSpanLower(&script_span)) {
     const int num_original_span_bytes = script_span.text_bytes;
@@ -302,8 +300,8 @@ NNetLanguageIdentifier::FindTopNMostFreqLangs(const string &text,
     const string selected_text = SelectTextGivenScriptSpan(script_span);
-    result = FindLanguageOfValidUTF8(selected_text);
-    language = result.language;
+    Result result = FindLanguageOfValidUTF8(selected_text);
+    string language = result.language;
     lang_stats[language].byte_sum += num_original_span_bytes;
     lang_stats[language].prob_sum +=
         result.probability * num_original_span_bytes;
@@ -356,7 +354,7 @@ string NNetLanguageIdentifier::SelectTextGivenBeginAndSize(
     const char *text_begin, int text_size) {
   string output_text;
-  // If the size of the input is greater than the maxium number of bytes needed
+  // If the size of the input is greater than the maximum number of bytes needed
   // for a prediction, then concatenate snippets that are equally spread out
   // throughout the input.
   if (text_size > max_num_bytes_) {

data/ext/cld3/nnet_language_identifier.o CHANGED Viewed

Binary file

data/ext/cld3/nnet_language_identifier_c.cc CHANGED Viewed

@@ -1,4 +1,4 @@
-/* Copyright 2017 Akihiko Odaki <akihiko.odaki.4i@stu.hosei.ac.jp>
+/* Copyright 2017 Akihiko Odaki <akihiko.odaki@gmail.com>
 All Rights Reserved.
 Licensed under the Apache License, Version 2.0 (the "License");
@@ -26,42 +26,90 @@ limitations under the License.
   #define EXPORT __attribute__ ((visibility ("default")))
 #endif
-struct NNetLanguageIdentifier {
-  chrome_lang_id::NNetLanguageIdentifier context;
-  std::string language;
-};
 struct Result {
   struct {
     const char *data;
     std::size_t size;
   } language;
+  struct {
+    const chrome_lang_id::NNetLanguageIdentifier::SpanInfo *data;
+    std::size_t size;
+  } byte_ranges;
   float probability;
   float proportion;
   bool is_reliable;
 };
+struct OwningResult {
+  OwningResult(chrome_lang_id::NNetLanguageIdentifier::Result&& result) {
+    references.language = std::move(result.language);
+    references.byte_ranges = std::move(result.byte_ranges);
+    plain.language.data = references.language.data();
+    plain.language.size = references.language.size();
+    plain.byte_ranges.data = references.byte_ranges.data();
+    plain.byte_ranges.size = references.byte_ranges.size();
+    plain.probability = result.probability;
+    plain.proportion = result.proportion;
+    plain.is_reliable = result.is_reliable;
+  }
+  Result plain;
+  struct {
+    std::string language;
+    std::vector<chrome_lang_id::NNetLanguageIdentifier::SpanInfo> byte_ranges;
+  } references;
+};
 extern "C" {
-  EXPORT Result NNetLanguageIdentifier_find_language(void *pointer,
-                                                     const char *data,
-                                                     std::size_t size) {
-    auto instance = static_cast<NNetLanguageIdentifier *>(pointer);
-    auto result = instance->context.FindLanguage(std::string(data, size));
-    instance->language = std::move(result.language);
-    return Result {
-        { instance->language.data(), instance->language.size() },
-        result.probability,
-        result.proportion,
-        result.is_reliable
-    };
+  EXPORT OwningResult *NNetLanguageIdentifier_find_language(
+      chrome_lang_id::NNetLanguageIdentifier *instance,
+      const char *data,
+      std::size_t size) {
+    return new OwningResult(instance->FindLanguage(std::string(data, size)));
   }
-  EXPORT void delete_NNetLanguageIdentifier(void *pointer) {
-    delete static_cast<NNetLanguageIdentifier *>(pointer);
+  EXPORT std::vector<chrome_lang_id::NNetLanguageIdentifier::Result>*
+  NNetLanguageIdentifier_find_top_n_most_freq_langs(
+      chrome_lang_id::NNetLanguageIdentifier *instance,
+      const char *data, std::size_t size, int num_langs) {
+    std::string text(data, size);
+    return new auto(instance->FindTopNMostFreqLangs(text, num_langs));
   }
-  EXPORT void *new_NNetLanguageIdentifier(int min_num_bytes, int max_num_bytes) {
-    return new NNetLanguageIdentifier{{min_num_bytes, max_num_bytes}, {}};
+  EXPORT void delete_NNetLanguageIdentifier(
+      chrome_lang_id::NNetLanguageIdentifier *pointer) {
+    delete pointer;
+  }
+  EXPORT void delete_result(OwningResult *pointer) {
+    delete pointer;
+  }
+  EXPORT void delete_results(
+      std::vector<chrome_lang_id::NNetLanguageIdentifier::Result> *pointer) {
+    delete pointer;
+  }
+  EXPORT chrome_lang_id::NNetLanguageIdentifier *new_NNetLanguageIdentifier(
+      int min_num_bytes, int max_num_bytes) {
+    return new chrome_lang_id::NNetLanguageIdentifier(
+        min_num_bytes, max_num_bytes);
+  }
+  EXPORT Result refer_to_nth_result(
+      std::vector<chrome_lang_id::NNetLanguageIdentifier::Result> *results,
+      std::size_t index) {
+    Result c;
+    auto& cc = (*results)[index];
+    c.language.data = cc.language.data();
+    c.language.size = cc.language.size();
+    c.byte_ranges.data = cc.byte_ranges.data();
+    c.byte_ranges.size = cc.byte_ranges.size();
+    c.probability = cc.probability;
+    c.proportion = cc.proportion;
+    c.is_reliable = cc.is_reliable;
+    return c;
   }
 }

data/ext/cld3/nnet_language_identifier_c.o CHANGED Viewed

Binary file

data/ext/cld3/offsetmap.o CHANGED Viewed

Binary file

data/ext/cld3/registry.o CHANGED Viewed

Binary file

data/ext/cld3/relevant_script_feature.o CHANGED Viewed

Binary file

data/ext/cld3/sentence.pb.o CHANGED Viewed

Binary file

data/ext/cld3/sentence_features.cc CHANGED Viewed

@@ -19,11 +19,11 @@ limitations under the License.
 namespace chrome_lang_id {
-// Declare registry for the whole Sentence feature functions.  NOTE: this is not
+// Define registry for the whole Sentence feature functions.  NOTE: this is not
 // yet set to anything meaningful.  It will be set so in NNetLanguageIdentifier
 // constructor, *before* we use any feature.
 template <>
-WholeSentenceFeature::Registry
-    *RegisterableClass<WholeSentenceFeature>::registry_ = nullptr;
+WholeSentenceFeature::Registry*
+    RegisterableClass<WholeSentenceFeature>::registry_ = nullptr;
-}  // namespace chrome_lang_id
+}  // namespace chrome_lang_id

data/ext/cld3/sentence_features.h CHANGED Viewed

@@ -26,9 +26,19 @@ limitations under the License.
 namespace chrome_lang_id {
 // Feature function that extracts features for the full Sentence.
-typedef FeatureFunction<Sentence> WholeSentenceFeature;
-typedef FeatureExtractor<Sentence> WholeSentenceExtractor;
+using WholeSentenceFeature = FeatureFunction<Sentence>;
+using WholeSentenceExtractor = FeatureExtractor<Sentence>;
+// Declare registry for the whole Sentence feature functions.  This is required
+// for clang's -Wundefined-var-template.  However, MSVC has a bug which treats
+// this declaration as a definition, leading to multiple definition errors, so
+// omit this on MSVC.
+#if !defined(COMPILER_MSVC)
+template <>
+WholeSentenceFeature::Registry
+    *RegisterableClass<WholeSentenceFeature>::registry_;
+#endif
 }  // namespace chrome_lang_id

data/ext/cld3/sentence_features.o CHANGED Viewed

Binary file

data/ext/cld3/task_context.o CHANGED Viewed

Binary file

data/ext/cld3/task_context_params.o CHANGED Viewed

Binary file

data/ext/cld3/task_spec.pb.o CHANGED Viewed

Binary file

data/ext/cld3/text_processing.o CHANGED Viewed

Binary file

data/ext/cld3/unicodetext.o CHANGED Viewed

Binary file

data/ext/cld3/utf8statetable.o CHANGED Viewed

Binary file

data/ext/cld3/utils.o CHANGED Viewed

Binary file

data/ext/cld3/workspace.o CHANGED Viewed

Binary file

data/lib/cld3/unstable.rb ADDED Viewed

@@ -0,0 +1,58 @@
+# Copyright 2021 Akihiko Odaki <akihiko.odaki@gmail.com>
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+module CLD3
+  module Unstable
+    extend FFI::Library
+    ffi_lib File.join(__dir__, "..", "..", "ext", "cld3", "libcld3." + RbConfig::CONFIG["DLEXT"])
+    module NNetLanguageIdentifier
+      class Pointer < FFI::AutoPointer
+        def self.release(pointer)
+          Unstable.delete_NNetLanguageIdentifier(pointer)
+        end
+      end
+      class SpanInfo < FFI::Struct
+        layout :start_index, :int, :end_index, :int, :probability, :float
+      end
+      class Result < FFI::Struct
+        layout :language_data, :pointer, :language_size, :size_t, :byte_ranges_data, :pointer, :byte_ranges_size, :size_t, :probability, :float, :proportion, :float, :reliable?, :bool
+      end
+    end
+    attach_function :delete_NNetLanguageIdentifier, [ :pointer ], :void
+    attach_function :delete_result, [ :pointer ], :void
+    attach_function :delete_results, [ :pointer ], :void
+    attach_function :new_NNetLanguageIdentifier, [ :int, :int ], :pointer
+    attach_function :refer_to_nth_result, [ :pointer, :size_t ], NNetLanguageIdentifier::Result.by_value
+    attach_function :NNetLanguageIdentifier_find_language,
+        [ :pointer, :buffer_in, :size_t ], :pointer
+    attach_function :NNetLanguageIdentifier_find_top_n_most_freq_langs,
+        [ :pointer, :buffer_in, :size_t, :int ], :pointer
+  end
+  private_constant :Unstable
+end

data/lib/cld3.rb CHANGED Viewed

@@ -1,7 +1,7 @@
 # File including an implementation of CLD3 module. Some documentations are
 # extracted from ext/cld3/ext/src/nnet_language_identifier.h.
 #
-# Copyright 2017 Akihiko Odaki <akihiko.odaki.4i@stu.hosei.ac.jp>
+# Copyright 2017 Akihiko Odaki <akihiko.odaki@gmail.com>
 # All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -19,6 +19,7 @@
 require "ffi"
 require "rbconfig"
+require "cld3/unstable"
 # Module providing an interface for Compact Language Detector v3 (CLD3)
 module CLD3
@@ -49,10 +50,16 @@ module CLD3
     # This is Numeric object.
     RELIABILITY_HR_BS_THRESHOLD = 0.5
+    # Holds probability that Span, specified by start/end indices, is a given
+    # language. The language is not stored here; it can be found in Result, which
+    # holds an Array of SpanInfo.
+    # @type const SpanInfo: untyped
+    SpanInfo = Struct.new(:start_index, :end_index, :probability)
     # Information about a predicted language.
     # This is an instance of Struct with the following members:
     #
-    # [language]    This is symbol or nil.
+    # [language]    This is symbol.
     #
     # [probability] Language probability. This is Numeric object.
     #
@@ -61,33 +68,100 @@ module CLD3
     # [proportion]  Proportion of bytes associated with the language. If
     #               #find_language is called, this variable is set to 1.
     #               This is Numeric object.
-    Result = Struct.new("Result", :language, :probability, :reliable?, :proportion)
+    #
+    # [byte_ranges] Specifies the byte ranges in UTF-8 that |language| applies to.
+    #               This is an Array of SpanInfo.
+    # @type const Result: untyped
+    Result = Struct.new(:language, :probability, :reliable?, :proportion, :byte_ranges)
     # The arguments are two Integer objects.
-    def initialize(minNumBytes = MIN_NUM_BYTES_TO_CONSIDER, maxNumBytes = MAX_NUM_BYTES_TO_CONSIDER)
-      @cc = Unstable::NNetLanguageIdentifier::Pointer.new(Unstable.new_NNetLanguageIdentifier(minNumBytes, maxNumBytes))
+    def initialize(min_num_bytes = MIN_NUM_BYTES_TO_CONSIDER, max_num_bytes = MAX_NUM_BYTES_TO_CONSIDER)
+      @cc = Unstable::NNetLanguageIdentifier::Pointer.new(Unstable.new_NNetLanguageIdentifier(min_num_bytes, max_num_bytes))
     end
     # Finds the most likely language for the given text, along with additional
     # information (e.g., probability). The prediction is based on the first N
     # bytes where N is the minimum between the number of interchange valid UTF8
     # bytes and +max_num_bytes_+. If N is less than +min_num_bytes_+ long, then
-    # this function returns nil as language.
+    # this function returns nil.
     # The argument is a String object.
     # The returned value of this function is an instance of Result.
     def find_language(text)
       text_utf8 = text.encode(Encoding::UTF_8)
       pointer = FFI::MemoryPointer.new(:char, text_utf8.bytesize)
-      pointer.put_bytes(0, text_utf8)
-      cc_result = Unstable.NNetLanguageIdentifier_find_language(@cc, pointer, text_utf8.bytesize)
-      language = cc_result[:language_data].read_bytes(cc_result[:language_size])
+      begin
+        pointer.put_bytes(0, text_utf8)
+        result = Unstable.NNetLanguageIdentifier_find_language(@cc, pointer, text_utf8.bytesize)
+        begin
+          convert_result Unstable::NNetLanguageIdentifier::Result.new(result)
+        ensure
+          Unstable.delete_result result
+        end
+      ensure
+        pointer.free
+      end
+    end
+    # Splits the input text (up to the first byte, if any, that is not
+    # interchange valid UTF8) into spans based on the script, predicts a language
+    # for each span, and returns a vector storing the top num_langs most frequent
+    # languages along with additional information (e.g., proportions). The number
+    # of bytes considered for each span is the minimum between the size of the
+    # span and +max_num_bytes_+. If more languages are requested than what is
+    # available in the input, then the number of the returned elements will be
+    # the number of the latter. Also, if the size of the span is less than
+    # +min_num_bytes_+ long, then the span is skipped. If the input text is too
+    # long, only the first +MAX_NUM_INPUT_BYTES_TO_CONSIDER+ bytes are processed.
+    # The first argument is a String object.
+    # The second argument is a Numeric object.
+    # The returned value of this function is an Array of Result instances.
+    def find_top_n_most_freq_langs(text, num_langs)
+      # @type var a: untyped
+      text_utf8 = text.encode(Encoding::UTF_8)
+      pointer = FFI::MemoryPointer.new(:char, text_utf8.bytesize)
+      begin
+        pointer.put_bytes(0, text_utf8)
+        results = Unstable.NNetLanguageIdentifier_find_top_n_most_freq_langs(@cc, pointer, text_utf8.bytesize, num_langs)
+        begin
+          a = num_langs.times
+            .lazy
+            .map { |index| convert_result Unstable.refer_to_nth_result(results, index) }
+            .take_while { |result| !result.nil? }
+            .to_a
+          a
+        ensure
+          Unstable.delete_results results
+        end
+      ensure
+        pointer.free
+      end
+    end
+    private
+    def convert_result(result)
+      language = result[:language_data].read_bytes(result[:language_size])
+      return nil if language == "und"
+      cursor = result[:byte_ranges_data]
+      byte_ranges = result[:byte_ranges_size].times.map do
+        info = Unstable::NNetLanguageIdentifier::SpanInfo.new(cursor)
+        cursor += Unstable::NNetLanguageIdentifier::SpanInfo.size
+        SpanInfo.new(info[:start_index], info[:end_index], info[:probability])
+      end
       Result.new(
-          language == "und" ? nil : language.to_sym,
-          cc_result[:probability],
-          cc_result[:reliable?],
-          cc_result[:proportion])
+          language.to_sym,
+          result[:probability],
+          result[:reliable?],
+          result[:proportion],
+          byte_ranges)
     end
   end
@@ -95,6 +169,7 @@ module CLD3
   # The model weights are loaded statically.
   module TaskContextParams
     # This is a frozen Array object containing symbols.
+    # @type const LANGUAGE_NAMES: untyped
     LANGUAGE_NAMES = [
       :eo, :co, :eu, :ta, :de, :mt, :ps, :te, :su, :uz, :'zh-Latn', :ne,
       :nl, :sw, :sq, :hmn, :ja, :no, :mn, :so, :ko, :kk, :sl, :ig,
@@ -108,31 +183,4 @@ module CLD3
       :sn, :yo, :pa, :ku,
     ].freeze
   end
-  module Unstable
-    extend FFI::Library
-    ffi_lib File.join(File.expand_path(File.dirname(__FILE__)), "..", "ext", "cld3", "libcld3." + RbConfig::CONFIG["DLEXT"])
-    module NNetLanguageIdentifier
-      class Pointer < FFI::AutoPointer
-        def self.release(pointer)
-          Unstable.delete_NNetLanguageIdentifier(pointer)
-        end
-      end
-      class Result < FFI::Struct
-        layout :language_data, :pointer, :language_size, :size_t, :probability, :float, :proportion, :float, :reliable?, :bool
-      end
-    end
-    attach_function :delete_NNetLanguageIdentifier, [ :pointer ], :void
-    attach_function :new_NNetLanguageIdentifier, [ :int, :int ], :pointer
-    attach_function :NNetLanguageIdentifier_find_language,
-        [ :pointer, :buffer_in, :size_t ], NNetLanguageIdentifier::Result.by_value
-  end
-  private_constant :Unstable
 end

data/sig/cld3.rbs ADDED Viewed

@@ -0,0 +1,65 @@
+# Copyright 2021 Akihiko Odaki <akihiko.odaki@gmail.com>
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+module CLD3
+  class NNetLanguageIdentifier
+    MIN_NUM_BYTES_TO_CONSIDER: Integer
+    MAX_NUM_BYTES_TO_CONSIDER: Integer
+    MAX_NUM_INPUT_BYTES_TO_CONSIDER: Integer
+    RELIABILITY_THRESHOLD: Float
+    RELIABILITY_HR_BS_THRESHOLD: Float
+    class SpanInfo < Struct[Float | Integer]
+      attr_accessor start_index(): Integer
+      attr_accessor end_index(): Integer
+      attr_accessor probability(): Float
+    end
+    class Result < Struct[Array[SpanInfo] | Float | TaskContextParams::language_names | bool]
+      attr_accessor language(): TaskContextParams::language_names
+      attr_accessor probability(): Float
+      attr_accessor reliable?(): bool
+      attr_accessor proportion(): Float
+      attr_accessor byte_ranges(): Array[SpanInfo]
+    end
+    def initialize: (?Integer, ?Integer) -> void
+    def find_language: (String) -> Result?
+    def find_top_n_most_freq_langs: (String, Integer) -> Array[Result]
+    private
+    def convert_result: (untyped) -> Result?
+  end
+  module TaskContextParams
+    type language_names =
+      :eo | :co | :eu | :ta | :de | :mt | :ps | :te | :su | :uz | :'zh-Latn' | :ne |
+      :nl | :sw | :sq | :hmn | :ja | :no | :mn | :so | :ko | :kk | :sl | :ig |
+      :mr | :th | :zu | :ml | :hr | :bs | :lo | :sd | :cy | :hy | :uk | :pt |
+      :lv | :iw | :cs | :vi | :jv | :be | :km | :mk | :tr | :fy | :am | :zh |
+      :da | :sv | :fi | :ht | :af | :la | :id | :fil | :sm | :ca | :el | :ka |
+      :sr | :it | :sk | :ru | :'ru-Latn' | :bg | :ny | :fa | :haw | :gl | :et |
+      :ms | :gd | :'bg-Latn' | :ha | :is | :ur | :mi | :hi | :bn | :'hi-Latn' | :fr |
+      :yi | :hu | :xh | :my | :tg | :ro | :ar | :lb | :'el-Latn' | :st | :ceb |
+      :kn | :az | :si | :ky | :mg | :en | :gu | :es | :pl | :'ja-Latn' | :ga | :lt |
+      :sn | :yo | :pa | :ku
+    LANGUAGE_NAMES: Array[language_names]
+  end
+  Unstable: untyped
+end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: cld3
 version: !ruby/object:Gem::Version
-  version: 3.2.6
+  version: 3.4.3
 platform: ruby
 authors:
 - Akihiko Odaki
-autorequire:
+autorequire:
 bindir: bin
 cert_chain: []
-date: 2020-01-04 00:00:00.000000000 Z
+date: 2021-11-25 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: ffi
@@ -19,7 +19,7 @@ dependencies:
         version: 1.1.0
     - - "<"
       - !ruby/object:Gem::Version
-        version: 1.12.0
+        version: 1.16.0
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
@@ -29,7 +29,27 @@ dependencies:
         version: 1.1.0
     - - "<"
       - !ruby/object:Gem::Version
-        version: 1.12.0
+        version: 1.16.0
+- !ruby/object:Gem::Dependency
+  name: rbs
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 1.7.0
+    - - "<"
+      - !ruby/object:Gem::Version
+        version: 1.8.0
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 1.7.0
+    - - "<"
+      - !ruby/object:Gem::Version
+        version: 1.8.0
 - !ruby/object:Gem::Dependency
   name: rspec
   requirement: !ruby/object:Gem::Requirement
@@ -39,7 +59,7 @@ dependencies:
         version: 3.0.0
     - - "<"
       - !ruby/object:Gem::Version
-        version: 3.10.0
+        version: 3.11.0
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
@@ -49,10 +69,30 @@ dependencies:
         version: 3.0.0
     - - "<"
       - !ruby/object:Gem::Version
-        version: 3.10.0
+        version: 3.11.0
+- !ruby/object:Gem::Dependency
+  name: steep
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 0.46.0
+    - - "<"
+      - !ruby/object:Gem::Version
+        version: 0.47.0
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 0.46.0
+    - - "<"
+      - !ruby/object:Gem::Version
+        version: 0.47.0
 description: Compact Language Detector v3 (CLD3) is a neural network model for language
   identification.
-email: akihiko.odaki.4i@stu.hosei.ac.jp
+email: akihiko.odaki@gmail.com
 executables: []
 extensions:
 - ext/cld3/extconf.rb
@@ -106,6 +146,7 @@ files:
 - ext/cld3/language_identifier_features.cc
 - ext/cld3/language_identifier_features.h
 - ext/cld3/language_identifier_features.o
+- ext/cld3/libcld3.def
 - ext/cld3/libcld3.so
 - ext/cld3/mkmf.log
 - ext/cld3/nnet_language_identifier.cc
@@ -159,11 +200,13 @@ files:
 - ext/cld3/workspace.h
 - ext/cld3/workspace.o
 - lib/cld3.rb
+- lib/cld3/unstable.rb
+- sig/cld3.rbs
 homepage: https://github.com/akihikodaki/cld3-ruby
 licenses:
 - Apache-2.0
 metadata: {}
-post_install_message:
+post_install_message:
 rdoc_options: []
 require_paths:
 - lib
@@ -171,18 +214,18 @@ required_ruby_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
-      version: 2.3.0
+      version: 2.6.0
   - - "<"
     - !ruby/object:Gem::Version
-      version: 2.8.0
+      version: 3.2.0
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubygems_version: 3.0.6
-signing_key:
+rubygems_version: 3.2.22
+signing_key:
 specification_version: 4
 summary: Compact Language Detector v3 (CLD3)
 test_files: []