cld3 3.4.4 → 3.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50)
  1. checksums.yaml +4 -4
  2. data/Gemfile +0 -1
  3. data/README.md +4 -15
  4. data/cld3.gemspec +7 -7
  5. data/ext/cld3/cld_3/protos/feature_extractor.pb.h +100 -0
  6. data/ext/cld3/cld_3/protos/sentence.pb.h +35 -0
  7. data/ext/cld3/cld_3/protos/task_spec.pb.h +106 -0
  8. data/ext/cld3/extconf.rb +2 -12
  9. data/ext/cld3/getonescriptspan.cc +0 -2
  10. data/ext/cld3/nnet_language_identifier_c.cc +162 -70
  11. data/lib/cld3.rb +15 -100
  12. data/sig/cld3.rbs +2 -0
  13. metadata +20 -74
  14. data/ext/cld3/Makefile +0 -267
  15. data/ext/cld3/base.o +0 -0
  16. data/ext/cld3/embedding_feature_extractor.o +0 -0
  17. data/ext/cld3/embedding_network.o +0 -0
  18. data/ext/cld3/feature_extractor.o +0 -0
  19. data/ext/cld3/feature_extractor.pb.o +0 -0
  20. data/ext/cld3/feature_extractor.proto +0 -50
  21. data/ext/cld3/feature_types.o +0 -0
  22. data/ext/cld3/fixunicodevalue.o +0 -0
  23. data/ext/cld3/fml_parser.o +0 -0
  24. data/ext/cld3/generated_entities.o +0 -0
  25. data/ext/cld3/generated_ulscript.o +0 -0
  26. data/ext/cld3/getonescriptspan.o +0 -0
  27. data/ext/cld3/lang_id_nn_params.o +0 -0
  28. data/ext/cld3/language_identifier_features.o +0 -0
  29. data/ext/cld3/libcld3.def +0 -8
  30. data/ext/cld3/libcld3.so +0 -0
  31. data/ext/cld3/mkmf.log +0 -37
  32. data/ext/cld3/nnet_language_identifier.o +0 -0
  33. data/ext/cld3/nnet_language_identifier_c.o +0 -0
  34. data/ext/cld3/offsetmap.o +0 -0
  35. data/ext/cld3/registry.o +0 -0
  36. data/ext/cld3/relevant_script_feature.o +0 -0
  37. data/ext/cld3/sentence.pb.o +0 -0
  38. data/ext/cld3/sentence.proto +0 -77
  39. data/ext/cld3/sentence_features.o +0 -0
  40. data/ext/cld3/task_context.o +0 -0
  41. data/ext/cld3/task_context_params.o +0 -0
  42. data/ext/cld3/task_spec.pb.o +0 -0
  43. data/ext/cld3/task_spec.proto +0 -98
  44. data/ext/cld3/text_processing.o +0 -0
  45. data/ext/cld3/unicodetext.o +0 -0
  46. data/ext/cld3/utf8statetable.o +0 -0
  47. data/ext/cld3/utils.o +0 -0
  48. data/ext/cld3/workspace.o +0 -0
  49. data/lib/a.rb +0 -24
  50. data/lib/cld3/unstable.rb +0 -58
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: f40e4947fea97543686caceba0082bdba30b5ae0485a25b41004ad048057b0ad
4
- data.tar.gz: e45c60300550caf513fdde6bcbc05e68e1063bf9ad8074626bf5f88f4a6f77bd
3
+ metadata.gz: 730f1cc6022bc381311f03ee67a87cdcaab01bcfc6c9bc5d1b056871acef197c
4
+ data.tar.gz: d1f640c24df2a95cdd2605e4078c5b820ec15ddbe775b7c281004d6f3772df8f
5
5
  SHA512:
6
- metadata.gz: 393fc138a279ee42c3de90c49bcc982e55860f74e2796d4c895d0f2f175894bcb1ec1bbe796811f896a16be9cc97943e1309cbe175bc029a510b4c51b2f700da
7
- data.tar.gz: d16e8c87e7d12cc90cc1a4babb4873df8f553d9527e1d69a548a250ae0b240f79a6338070bbc88cbb0e23db48c23ef0393cd4b62e0ac673722ace81ce1564895
6
+ metadata.gz: 41aedd4699c653dcfcc97f34adde401918928c7bf27cfe48fd5df4c108476be837e5c43add574823577afa35066c0386b31ee97f28c83660d35c753103955b20
7
+ data.tar.gz: 6f3b389ac6b334980e5899fd1a40664ee67011bb998deef5ccef4794f67d1613b4777243b97c7e87b8aab6118332907712700799437e3e261e0eb6ce85cd6952
data/Gemfile CHANGED
@@ -15,5 +15,4 @@
15
15
  #==============================================================================
16
16
 
17
17
  source 'https://rubygems.org'
18
- gem 'steep', github: 'akihikodaki/steep', branch: 'cld3'
19
18
  gemspec
data/README.md CHANGED
@@ -19,7 +19,6 @@ cld3.find_language("здравствуйте") # => #<struct Struct::Result lang
19
19
  ### Prerequisites
20
20
  * [Bundler](http://bundler.io/)
21
21
  * C++ compiler
22
- * [Protocol buffers](https://developers.google.com/protocol-buffers/)
23
22
  * [Rake](https://ruby.github.io/rake/)
24
23
  * [RubyGems](https://rubygems.org/)
25
24
 
@@ -36,23 +35,13 @@ FreeBSD port is available as `rubygem-cld3` in `textproc` category.
36
35
 
37
36
  https://svnweb.freebsd.org/ports/head/textproc/rubygem-cld3/
38
37
 
39
- #### JRuby
40
- JRuby has a bug which prevents the feature detection. Apply the following
41
- change:
42
- https://github.com/jruby/jruby/pull/4118/commits/edad375ef4dcf195b19ce0afe4befac66468c736
43
-
44
38
  ### Troubleshooting
45
- `gem install cld3` triggers native library building. If it fails, you are likely
46
- to missing required facilities. Make sure C++ compiler and protocol buffers
47
- is installed. I recommend [GCC](https://gcc.gnu.org/) as a C++ compiler. Ruby is
48
- likely to need [pkg-config](https://www.freedesktop.org/wiki/Software/pkg-config/)
49
- as well.
50
-
51
- Runtime errors are likely to be issues of [FFI](https://github.com/ffi/ffi) or
52
- programming errors. Make sure they are all correct.
39
+ `gem install cld3` triggers native library building. If it fails, it is likely
40
+ that some required facilities are missing. Make sure C++ compiler is installed.
41
+ I recommend [GCC](https://gcc.gnu.org/) as a C++ compiler.
53
42
 
54
43
  If you cannot identify the cause of your problem, run spec of this library and
55
- see whether the problem is reproducable with it or not. Spec is not included in
44
+ see whether the problem is reproducible with it or not. Spec is not included in
56
45
  the gem, so clone the source code repository and then run `rake spec`.
57
46
  The source code repository is at
58
47
  https://github.com/akihikodaki/cld3-ruby.
data/cld3.gemspec CHANGED
@@ -16,21 +16,21 @@
16
16
 
17
17
  Gem::Specification.new do |gem|
18
18
  gem.name = "cld3"
19
- gem.version = "3.4.4"
19
+ gem.version = "3.5.1"
20
20
  gem.summary = "Compact Language Detector v3 (CLD3)"
21
21
  gem.description = "Compact Language Detector v3 (CLD3) is a neural network model for language identification."
22
22
  gem.license = "Apache-2.0"
23
23
  gem.homepage = "https://github.com/akihikodaki/cld3-ruby"
24
24
  gem.author = "Akihiko Odaki"
25
25
  gem.email = "akihiko.odaki@gmail.com"
26
- gem.required_ruby_version = [ ">= 2.6.0", "< 3.2.0" ]
27
- gem.add_dependency "ffi", [ ">= 1.1.0", "< 1.16.0" ]
28
- gem.add_development_dependency "rbs", [ ">= 1.7.0", "< 1.8.0" ]
29
- gem.add_development_dependency "rspec", [ ">=3.0.0", "< 3.11.0" ]
30
- gem.add_development_dependency "steep", [ ">= 0.47.0", "< 0.48.0" ]
26
+ gem.required_ruby_version = [ ">= 2.7.0", "< 3.3.0" ]
27
+ gem.add_development_dependency "rbs", [ ">= 2.8.0", "< 2.9.0" ]
28
+ gem.add_development_dependency "rspec", [ ">= 3.12.0", "< 3.13.0" ]
29
+ gem.add_development_dependency "steep", [ ">= 1.3.0", "< 1.4.0" ]
31
30
  gem.files = Dir[
32
31
  "Gemfile", "LICENSE", "LICENSE_CLD3", "README.md",
33
- "cld3.gemspec", "ext/**/*", "lib/**/*", "sig/**/*"
32
+ "cld3.gemspec", "ext/**/*.c", "ext/**/*.cc", "ext/**/*.h",
33
+ "lib/**/*.rb", "sig/**/*"
34
34
  ]
35
35
  gem.require_paths = [ "lib" ]
36
36
  gem.extensions = [ "ext/cld3/extconf.rb" ]
data/ext/cld3/cld_3/protos/feature_extractor.pb.h ADDED
@@ -0,0 +1,100 @@
1
+ /* Copyright 2022 Akihiko Odaki <akihiko.odaki@gmail.com>
2
+ All Rights Reserved.
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ http://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ ==============================================================================*/
16
+
17
+ #ifndef FEATURE_EXTRACTOR_PB_H_
18
+ #define FEATURE_EXTRACTOR_PB_H_
19
+
20
+ #include <cstdint>
21
+ #include <string>
22
+ #include <vector>
23
+
24
+ namespace chrome_lang_id {
25
+
26
+ class Parameter {
27
+ public:
28
+ const std::string& name() const { return name_; }
29
+ void set_name(std::string value) { name_ = std::move(value); }
30
+ const std::string& value() const { return value_; }
31
+ void set_value(std::string value) { value_ = std::move(value); }
32
+
33
+ private:
34
+ std::string name_;
35
+ std::string value_;
36
+ };
37
+
38
+ class FeatureFunctionDescriptor {
39
+ public:
40
+ const std::string& type() const { return type_; }
41
+
42
+ void set_type(std::string value) { type_ = std::move(value); }
43
+
44
+ const std::string& name() const { return name_; }
45
+
46
+ void set_name(std::string value) { name_ = std::move(value); }
47
+
48
+ bool has_argument() const { return true; }
49
+
50
+ std::int32_t argument() const { return argument_; }
51
+
52
+ void set_argument(int32_t value) { argument_ = value; }
53
+
54
+ int parameter_size() const { return parameter_.size(); }
55
+
56
+ const Parameter& parameter(int index) const { return parameter_[index]; }
57
+
58
+ Parameter* add_parameter() { return &parameter_.emplace_back(); }
59
+
60
+ int feature_size() const { return feature_.size(); }
61
+
62
+ FeatureFunctionDescriptor* mutable_feature(int index) {
63
+ return &feature_[index];
64
+ }
65
+
66
+ const FeatureFunctionDescriptor& feature(int index) const {
67
+ return feature_[index];
68
+ }
69
+
70
+ FeatureFunctionDescriptor* add_feature() { return &feature_.emplace_back(); }
71
+
72
+ private:
73
+ std::string type_;
74
+ std::string name_;
75
+ std::int32_t argument_;
76
+ std::vector<Parameter> parameter_;
77
+ std::vector<FeatureFunctionDescriptor> feature_;
78
+ };
79
+
80
+ class FeatureExtractorDescriptor {
81
+ public:
82
+ int feature_size() const { return feature_.size(); }
83
+
84
+ FeatureFunctionDescriptor* mutable_feature(int index) {
85
+ return &feature_[index];
86
+ }
87
+
88
+ const FeatureFunctionDescriptor& feature(int index) const {
89
+ return feature_[index];
90
+ }
91
+
92
+ FeatureFunctionDescriptor* add_feature() { return &feature_.emplace_back(); }
93
+
94
+ private:
95
+ std::vector<FeatureFunctionDescriptor> feature_;
96
+ };
97
+
98
+ }
99
+
100
+ #endif
data/ext/cld3/cld_3/protos/sentence.pb.h ADDED
@@ -0,0 +1,35 @@
1
+ /* Copyright 2022 Akihiko Odaki <akihiko.odaki@gmail.com>
2
+ All Rights Reserved.
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ http://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ ==============================================================================*/
16
+
17
+ #ifndef SENTENCE_PB_H_
18
+ #define SENTENCE_PB_H_
19
+
20
+ #include <string>
21
+
22
+ namespace chrome_lang_id {
23
+
24
+ class Sentence {
25
+ public:
26
+ const std::string& text() const { return text_; }
27
+ void set_text(std::string value) { text_ = std::move(value); }
28
+
29
+ private:
30
+ std::string text_;
31
+ };
32
+
33
+ }
34
+
35
+ #endif
data/ext/cld3/cld_3/protos/task_spec.pb.h ADDED
@@ -0,0 +1,106 @@
1
+ /* Copyright 2022 Akihiko Odaki <akihiko.odaki@gmail.com>
2
+ All Rights Reserved.
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ http://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ ==============================================================================*/
16
+
17
+ #ifndef TASK_SPEC_PB_H_
18
+ #define TASK_SPEC_PB_H_
19
+
20
+ #include <string>
21
+ #include <vector>
22
+
23
+ namespace chrome_lang_id {
24
+
25
+ class TaskInput {
26
+ public:
27
+ class Part {
28
+ public:
29
+ const std::string& file_pattern() const { return file_pattern_; }
30
+
31
+ private:
32
+ std::string file_pattern_;
33
+ };
34
+
35
+ const std::string& name() const { return name_; }
36
+
37
+ void set_name(std::string value) { name_ = value; }
38
+
39
+ int file_format_size() const { return file_format_.size(); }
40
+
41
+ const std::string& file_format(int index) const {
42
+ return file_format_[index];
43
+ }
44
+
45
+ void add_file_format(std::string value) {
46
+ file_format_.push_back(std::move(value));
47
+ }
48
+
49
+ int record_format_size() const { return record_format_.size(); }
50
+
51
+ const std::string& record_format(int index) const {
52
+ return record_format_[index];
53
+ }
54
+
55
+ void add_record_format(std::string value) {
56
+ record_format_.push_back(std::move(value));
57
+ }
58
+
59
+ int part_size() const { return part_.size(); }
60
+ const Part& part(int index) const { return part_[index]; }
61
+
62
+ private:
63
+ std::string name_;
64
+ std::vector<std::string> file_format_;
65
+ std::vector<std::string> record_format_;
66
+ std::vector<Part> part_;
67
+ };
68
+
69
+ class TaskSpec {
70
+ public:
71
+ class Parameter {
72
+ public:
73
+ const std::string& name() const { return name_; }
74
+ void set_name(std::string value) { name_ = std::move(value); }
75
+ const std::string& value() const { return value_; }
76
+ void set_value(std::string value) { value_ = std::move(value); }
77
+
78
+ private:
79
+ std::string name_;
80
+ std::string value_;
81
+ };
82
+
83
+ int parameter_size() const { return parameter_.size(); }
84
+
85
+ Parameter* mutable_parameter(int index) { return &parameter_[index]; }
86
+
87
+ const Parameter& parameter(int index) const { return parameter_[index]; }
88
+
89
+ Parameter* add_parameter() { return &parameter_.emplace_back(); }
90
+
91
+ int input_size() const { return input_.size(); }
92
+
93
+ TaskInput* mutable_input(int index) { return &input_[index]; }
94
+
95
+ const TaskInput& input(int index) const { return input_[index]; }
96
+
97
+ TaskInput* add_input() { return &input_.emplace_back(); }
98
+
99
+ private:
100
+ std::vector<Parameter> parameter_;
101
+ std::vector<TaskInput> input_;
102
+ };
103
+
104
+ }
105
+
106
+ #endif
data/ext/cld3/extconf.rb CHANGED
@@ -26,17 +26,8 @@ rescue
26
26
  end
27
27
  end
28
28
 
29
- # Check pkg-config first to inform the library is missing if so.
30
- pkg_config("protobuf") or abort "Failed to locate protobuf"
31
-
32
- FileUtils.mkdir_p("cld_3/protos")
33
29
  FileUtils.mkdir_p("script_span")
34
30
 
35
- [ "feature_extractor", "sentence", "task_spec" ].each {|name|
36
- system "protoc", "#{name}.proto", "--cpp_out=.", exception: true
37
- ln_fallback("#{name}.pb.h", "cld_3/protos/#{name}.pb.h")
38
- }
39
-
40
31
  [
41
32
  "fixunicodevalue.h",
42
33
  "generated_ulscript.h",
@@ -55,6 +46,5 @@ FileUtils.mkdir_p("script_span")
55
46
  ln_fallback("#{name}", "script_span/#{name}")
56
47
  }
57
48
 
58
- $CXXFLAGS += " -fvisibility=hidden -std=c++11"
59
- $LIBRUBYARG = ""
60
- create_makefile("libcld3")
49
+ $CXXFLAGS += " -fvisibility=hidden -std=c++17"
50
+ create_makefile("cld3_ext")
data/ext/cld3/getonescriptspan.cc CHANGED
@@ -878,7 +878,6 @@ bool ScriptScanner::GetOneScriptSpan(LangSpan* span) {
878
878
  // copying letters to buffer with single spaces for each run of non-letters
879
879
  while (take < byte_length_) {
880
880
  // Copy run of letters in same script (&LS | LS)*
881
- int letter_count = 0; // Keep track of word length
882
881
  bool need_break = false;
883
882
 
884
883
  while (take < byte_length_) {
@@ -963,7 +962,6 @@ bool ScriptScanner::GetOneScriptSpan(LangSpan* span) {
963
962
  map2original_.Delete(tlen - plen);
964
963
  }
965
964
 
966
- ++letter_count;
967
965
  if (put >= kMaxScriptBytes) {
968
966
  // Buffer is full
969
967
  span->truncated = true;
data/ext/cld3/nnet_language_identifier_c.cc CHANGED
@@ -18,6 +18,7 @@ limitations under the License.
18
18
  #include <iostream>
19
19
  #include <string>
20
20
  #include <utility>
21
+ #include <ruby.h>
21
22
  #include "nnet_language_identifier.h"
22
23
 
23
24
  #if defined _WIN32 || defined __CYGWIN__
@@ -27,89 +28,180 @@ limitations under the License.
27
28
  #endif
28
29
 
29
30
  struct Result {
30
- struct {
31
- const char *data;
32
- std::size_t size;
33
- } language;
34
- struct {
35
- const chrome_lang_id::NNetLanguageIdentifier::SpanInfo *data;
36
- std::size_t size;
37
- } byte_ranges;
38
- float probability;
39
- float proportion;
40
- bool is_reliable;
31
+ VALUE result_klass;
32
+ VALUE span_info_klass;
33
+ const chrome_lang_id::NNetLanguageIdentifier::Result& data;
34
+
35
+ VALUE convert() const {
36
+ if (data.language == chrome_lang_id::NNetLanguageIdentifier::kUnknown)
37
+ return Qnil;
38
+
39
+ VALUE byte_ranges = rb_ary_new2(data.byte_ranges.size());
40
+ for (auto& byte_range_data : data.byte_ranges) {
41
+ VALUE argv[] = {
42
+ INT2NUM(byte_range_data.start_index),
43
+ INT2NUM(byte_range_data.end_index),
44
+ DBL2NUM(byte_range_data.probability),
45
+ };
46
+
47
+ VALUE byte_range = rb_class_new_instance(sizeof(argv) / sizeof(*argv),
48
+ argv,
49
+ span_info_klass);
50
+ rb_ary_push(byte_ranges, byte_range);
51
+ }
52
+
53
+ VALUE argv[] = {
54
+ ID2SYM(rb_intern2(data.language.data(), data.language.size())),
55
+ DBL2NUM(data.probability),
56
+ data.is_reliable ? Qtrue : Qfalse,
57
+ DBL2NUM(data.proportion),
58
+ byte_ranges,
59
+ };
60
+
61
+ return rb_class_new_instance(sizeof(argv) / sizeof(*argv), argv,
62
+ result_klass);
63
+ }
41
64
  };
42
65
 
43
- struct OwningResult {
44
- OwningResult(chrome_lang_id::NNetLanguageIdentifier::Result&& result) {
45
- references.language = std::move(result.language);
46
- references.byte_ranges = std::move(result.byte_ranges);
47
- plain.language.data = references.language.data();
48
- plain.language.size = references.language.size();
49
- plain.byte_ranges.data = references.byte_ranges.data();
50
- plain.byte_ranges.size = references.byte_ranges.size();
51
- plain.probability = result.probability;
52
- plain.proportion = result.proportion;
53
- plain.is_reliable = result.is_reliable;
66
+ struct ResultVector {
67
+ VALUE result_klass;
68
+ VALUE span_info_klass;
69
+ VALUE buffer;
70
+ const std::vector<chrome_lang_id::NNetLanguageIdentifier::Result>& data;
71
+
72
+ VALUE convert() const {
73
+ for (auto& element_data : data) {
74
+ Result result { result_klass, span_info_klass, element_data };
75
+ VALUE element = result.convert();
76
+ if (element == Qnil)
77
+ break;
78
+
79
+ rb_ary_push(buffer, element);
80
+ }
81
+
82
+ return buffer;
54
83
  }
84
+ };
55
85
 
56
- Result plain;
57
- struct {
58
- std::string language;
59
- std::vector<chrome_lang_id::NNetLanguageIdentifier::SpanInfo> byte_ranges;
60
- } references;
86
+ template<typename T>
87
+ VALUE convert_protected(VALUE arg)
88
+ {
89
+ auto result = reinterpret_cast<const T *>(arg);
90
+ return result->convert();
91
+ }
92
+
93
+ static void dfree(void *arg) {
94
+ auto data = static_cast<chrome_lang_id::NNetLanguageIdentifier *>(arg);
95
+ data->~NNetLanguageIdentifier();
96
+ xfree(arg);
97
+ }
98
+
99
+ static size_t dsize(const void *data) {
100
+ return sizeof(chrome_lang_id::NNetLanguageIdentifier);
101
+ }
102
+
103
+ static const rb_data_type_t data_type = {
104
+ .wrap_struct_name = "CLD3::NNetLanguageIdentifier",
105
+ .function = {
106
+ .dfree = dfree,
107
+ .dsize = dsize,
108
+ },
109
+ .flags = RUBY_TYPED_FREE_IMMEDIATELY
61
110
  };
62
111
 
63
- extern "C" {
64
- EXPORT OwningResult *NNetLanguageIdentifier_find_language(
65
- chrome_lang_id::NNetLanguageIdentifier *instance,
66
- const char *data,
67
- std::size_t size) {
68
- return new OwningResult(instance->FindLanguage(std::string(data, size)));
112
+ static VALUE find_language(VALUE obj,
113
+ VALUE result_klass, VALUE span_info_klass,
114
+ VALUE text) {
115
+ int state;
116
+ VALUE converted;
117
+
118
+ {
119
+ chrome_lang_id::NNetLanguageIdentifier *data;
120
+ TypedData_Get_Struct(obj, chrome_lang_id::NNetLanguageIdentifier,
121
+ &data_type, data);
122
+ std::string text_string = std::string(RSTRING_PTR(text), RSTRING_LEN(text));
123
+ auto result_data = data->FindLanguage(text_string);
124
+ Result result { result_klass, span_info_klass, result_data };
125
+
126
+ converted = rb_protect(convert_protected<Result>,
127
+ reinterpret_cast<VALUE>(&result),
128
+ &state);
69
129
  }
70
130
 
71
- EXPORT std::vector<chrome_lang_id::NNetLanguageIdentifier::Result>*
72
- NNetLanguageIdentifier_find_top_n_most_freq_langs(
73
- chrome_lang_id::NNetLanguageIdentifier *instance,
74
- const char *data, std::size_t size, int num_langs) {
75
- std::string text(data, size);
76
- return new auto(instance->FindTopNMostFreqLangs(text, num_langs));
77
- }
131
+ if (state)
132
+ rb_jump_tag(state);
78
133
 
79
- EXPORT void delete_NNetLanguageIdentifier(
80
- chrome_lang_id::NNetLanguageIdentifier *pointer) {
81
- delete pointer;
82
- }
134
+ return converted;
135
+ }
83
136
 
84
- EXPORT void delete_result(OwningResult *pointer) {
85
- delete pointer;
137
+ static VALUE find_top_n_most_freq_langs(VALUE obj,
138
+ VALUE result_klass,
139
+ VALUE span_info_klass,
140
+ VALUE text,
141
+ VALUE num_langs) {
142
+ int state;
143
+ VALUE converted;
144
+
145
+ {
146
+ chrome_lang_id::NNetLanguageIdentifier *data;
147
+ TypedData_Get_Struct(obj, chrome_lang_id::NNetLanguageIdentifier,
148
+ &data_type, data);
149
+ VALUE buffer = rb_ary_new2(NUM2INT(num_langs));
150
+ std::string text_string = std::string(RSTRING_PTR(text), RSTRING_LEN(text));
151
+ auto result_data = data->FindTopNMostFreqLangs(text_string, num_langs);
152
+ ResultVector result { result_klass, span_info_klass, buffer, result_data };
153
+
154
+ converted = rb_protect(convert_protected<ResultVector>,
155
+ reinterpret_cast<VALUE>(&result),
156
+ &state);
86
157
  }
87
158
 
88
- EXPORT void delete_results(
89
- std::vector<chrome_lang_id::NNetLanguageIdentifier::Result> *pointer) {
90
- delete pointer;
91
- }
159
+ if (state)
160
+ rb_jump_tag(state);
92
161
 
93
- EXPORT chrome_lang_id::NNetLanguageIdentifier *new_NNetLanguageIdentifier(
94
- int min_num_bytes, int max_num_bytes) {
95
- return new chrome_lang_id::NNetLanguageIdentifier(
96
- min_num_bytes, max_num_bytes);
97
- }
162
+ return converted;
163
+ }
164
+
165
+ static VALUE make(VALUE klass, VALUE min_num_bytes, VALUE max_num_bytes) {
166
+ int min_num_bytes_int = NUM2INT(min_num_bytes);
167
+ int max_num_bytes_int = NUM2INT(max_num_bytes);
168
+ chrome_lang_id::NNetLanguageIdentifier *data;
169
+ VALUE value = TypedData_Make_Struct(klass,
170
+ chrome_lang_id::NNetLanguageIdentifier,
171
+ &data_type, data);
172
+ new (data) chrome_lang_id::NNetLanguageIdentifier(min_num_bytes_int, max_num_bytes_int);
173
+ return value;
174
+ }
98
175
 
99
- EXPORT Result refer_to_nth_result(
100
- std::vector<chrome_lang_id::NNetLanguageIdentifier::Result> *results,
101
- std::size_t index) {
102
- Result c;
103
- auto& cc = (*results)[index];
104
-
105
- c.language.data = cc.language.data();
106
- c.language.size = cc.language.size();
107
- c.byte_ranges.data = cc.byte_ranges.data();
108
- c.byte_ranges.size = cc.byte_ranges.size();
109
- c.probability = cc.probability;
110
- c.proportion = cc.proportion;
111
- c.is_reliable = cc.is_reliable;
112
-
113
- return c;
176
+ extern "C" EXPORT void Init_cld3_ext() {
177
+ VALUE cld3 = rb_const_get(rb_cObject, rb_intern("CLD3"));
178
+ VALUE identifier =
179
+ rb_const_get(cld3, rb_intern("NNetLanguageIdentifier"));
180
+ VALUE unstable = rb_const_get(identifier, rb_intern("Unstable"));
181
+ VALUE params = rb_const_get(cld3, rb_intern("TaskContextParams"));
182
+ VALUE language_names = rb_const_get(params, rb_intern("LANGUAGE_NAMES"));
183
+
184
+ rb_define_const(identifier, "MIN_NUM_BYTES_TO_CONSIDER",
185
+ INT2NUM(chrome_lang_id::NNetLanguageIdentifier::kMinNumBytesToConsider));
186
+ rb_define_const(identifier, "MAX_NUM_BYTES_TO_CONSIDER",
187
+ INT2NUM(chrome_lang_id::NNetLanguageIdentifier::kMaxNumBytesToConsider));
188
+ rb_define_const(identifier, "MAX_NUM_INPUT_BYTES_TO_CONSIDER",
189
+ INT2NUM(chrome_lang_id::NNetLanguageIdentifier::kMaxNumInputBytesToConsider));
190
+ rb_define_const(identifier, "RELIABILITY_THRESHOLD",
191
+ DBL2NUM(chrome_lang_id::NNetLanguageIdentifier::kReliabilityThreshold));
192
+ rb_define_const(identifier, "RELIABILITY_HR_BS_THRESHOLD",
193
+ DBL2NUM(chrome_lang_id::NNetLanguageIdentifier::kReliabilityHrBsThreshold));
194
+
195
+ rb_define_singleton_method(unstable, "make", make, 2);
196
+ rb_define_method(unstable, "find_language", find_language, 3);
197
+ rb_define_method(unstable, "find_top_n_most_freq_langs",
198
+ find_top_n_most_freq_langs, 4);
199
+
200
+ for (int i = 0; ; i++) {
201
+ const char *name = chrome_lang_id::TaskContextParams::language_names(i);
202
+ if (!name)
203
+ break;
204
+
205
+ rb_ary_push(language_names, ID2SYM(rb_intern(name)));
114
206
  }
115
207
  }