cld3 3.4.4 → 3.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +0 -1
- data/README.md +4 -15
- data/cld3.gemspec +7 -7
- data/ext/cld3/cld_3/protos/feature_extractor.pb.h +100 -0
- data/ext/cld3/cld_3/protos/sentence.pb.h +35 -0
- data/ext/cld3/cld_3/protos/task_spec.pb.h +106 -0
- data/ext/cld3/extconf.rb +2 -12
- data/ext/cld3/getonescriptspan.cc +0 -2
- data/ext/cld3/nnet_language_identifier_c.cc +162 -70
- data/lib/cld3.rb +15 -100
- data/sig/cld3.rbs +2 -0
- metadata +20 -74
- data/ext/cld3/Makefile +0 -267
- data/ext/cld3/base.o +0 -0
- data/ext/cld3/embedding_feature_extractor.o +0 -0
- data/ext/cld3/embedding_network.o +0 -0
- data/ext/cld3/feature_extractor.o +0 -0
- data/ext/cld3/feature_extractor.pb.o +0 -0
- data/ext/cld3/feature_extractor.proto +0 -50
- data/ext/cld3/feature_types.o +0 -0
- data/ext/cld3/fixunicodevalue.o +0 -0
- data/ext/cld3/fml_parser.o +0 -0
- data/ext/cld3/generated_entities.o +0 -0
- data/ext/cld3/generated_ulscript.o +0 -0
- data/ext/cld3/getonescriptspan.o +0 -0
- data/ext/cld3/lang_id_nn_params.o +0 -0
- data/ext/cld3/language_identifier_features.o +0 -0
- data/ext/cld3/libcld3.def +0 -8
- data/ext/cld3/libcld3.so +0 -0
- data/ext/cld3/mkmf.log +0 -37
- data/ext/cld3/nnet_language_identifier.o +0 -0
- data/ext/cld3/nnet_language_identifier_c.o +0 -0
- data/ext/cld3/offsetmap.o +0 -0
- data/ext/cld3/registry.o +0 -0
- data/ext/cld3/relevant_script_feature.o +0 -0
- data/ext/cld3/sentence.pb.o +0 -0
- data/ext/cld3/sentence.proto +0 -77
- data/ext/cld3/sentence_features.o +0 -0
- data/ext/cld3/task_context.o +0 -0
- data/ext/cld3/task_context_params.o +0 -0
- data/ext/cld3/task_spec.pb.o +0 -0
- data/ext/cld3/task_spec.proto +0 -98
- data/ext/cld3/text_processing.o +0 -0
- data/ext/cld3/unicodetext.o +0 -0
- data/ext/cld3/utf8statetable.o +0 -0
- data/ext/cld3/utils.o +0 -0
- data/ext/cld3/workspace.o +0 -0
- data/lib/a.rb +0 -24
- data/lib/cld3/unstable.rb +0 -58
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 730f1cc6022bc381311f03ee67a87cdcaab01bcfc6c9bc5d1b056871acef197c
|
4
|
+
data.tar.gz: d1f640c24df2a95cdd2605e4078c5b820ec15ddbe775b7c281004d6f3772df8f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 41aedd4699c653dcfcc97f34adde401918928c7bf27cfe48fd5df4c108476be837e5c43add574823577afa35066c0386b31ee97f28c83660d35c753103955b20
|
7
|
+
data.tar.gz: 6f3b389ac6b334980e5899fd1a40664ee67011bb998deef5ccef4794f67d1613b4777243b97c7e87b8aab6118332907712700799437e3e261e0eb6ce85cd6952
|
data/Gemfile
CHANGED
data/README.md
CHANGED
@@ -19,7 +19,6 @@ cld3.find_language("здравствуйте") # => #<struct Struct::Result lang
|
|
19
19
|
### Prerequisites
|
20
20
|
* [Bundler](http://bundler.io/)
|
21
21
|
* C++ compiler
|
22
|
-
* [Protocol buffers](https://developers.google.com/protocol-buffers/)
|
23
22
|
* [Rake](https://ruby.github.io/rake/)
|
24
23
|
* [RubyGems](https://rubygems.org/)
|
25
24
|
|
@@ -36,23 +35,13 @@ FreeBSD port is available as `rubygem-cld3` in `textproc` category.
|
|
36
35
|
|
37
36
|
https://svnweb.freebsd.org/ports/head/textproc/rubygem-cld3/
|
38
37
|
|
39
|
-
#### JRuby
|
40
|
-
JRuby has a bug which prevents the feature detection. Apply the following
|
41
|
-
change:
|
42
|
-
https://github.com/jruby/jruby/pull/4118/commits/edad375ef4dcf195b19ce0afe4befac66468c736
|
43
|
-
|
44
38
|
### Troubleshooting
|
45
|
-
`gem install cld3` triggers native library building. If it fails,
|
46
|
-
|
47
|
-
|
48
|
-
likely to need [pkg-config](https://www.freedesktop.org/wiki/Software/pkg-config/)
|
49
|
-
as well.
|
50
|
-
|
51
|
-
Runtime errors are likely to be issues of [FFI](https://github.com/ffi/ffi) or
|
52
|
-
programming errors. Make sure they are all correct.
|
39
|
+
`gem install cld3` triggers native library building. If it fails, it is likely
|
40
|
+
that some required facilities are missing. Make sure C++ compiler is installed.
|
41
|
+
I recommend [GCC](https://gcc.gnu.org/) as a C++ compiler.
|
53
42
|
|
54
43
|
If you cannot identify the cause of your problem, run spec of this library and
|
55
|
-
see whether the problem is
|
44
|
+
see whether the problem is reproducible with it or not. Spec is not included in
|
56
45
|
the gem, so clone the source code repository and then run `rake spec`.
|
57
46
|
The source code repository is at
|
58
47
|
https://github.com/akihikodaki/cld3-ruby.
|
data/cld3.gemspec
CHANGED
@@ -16,21 +16,21 @@
|
|
16
16
|
|
17
17
|
Gem::Specification.new do |gem|
|
18
18
|
gem.name = "cld3"
|
19
|
-
gem.version = "3.4.4"
|
19
|
+
gem.version = "3.5.1"
|
20
20
|
gem.summary = "Compact Language Detector v3 (CLD3)"
|
21
21
|
gem.description = "Compact Language Detector v3 (CLD3) is a neural network model for language identification."
|
22
22
|
gem.license = "Apache-2.0"
|
23
23
|
gem.homepage = "https://github.com/akihikodaki/cld3-ruby"
|
24
24
|
gem.author = "Akihiko Odaki"
|
25
25
|
gem.email = "akihiko.odaki@gmail.com"
|
26
|
-
gem.required_ruby_version = [ ">= 2.
|
27
|
-
gem.
|
28
|
-
gem.add_development_dependency "
|
29
|
-
gem.add_development_dependency "
|
30
|
-
gem.add_development_dependency "steep", [ ">= 0.47.0", "< 0.48.0" ]
|
26
|
+
gem.required_ruby_version = [ ">= 2.7.0", "< 3.3.0" ]
|
27
|
+
gem.add_development_dependency "rbs", [ ">= 2.8.0", "< 2.9.0" ]
|
28
|
+
gem.add_development_dependency "rspec", [ ">= 3.12.0", "< 3.13.0" ]
|
29
|
+
gem.add_development_dependency "steep", [ ">= 1.3.0", "< 1.4.0" ]
|
31
30
|
gem.files = Dir[
|
32
31
|
"Gemfile", "LICENSE", "LICENSE_CLD3", "README.md",
|
33
|
-
"cld3.gemspec", "ext
|
32
|
+
"cld3.gemspec", "ext/**/*.c", "ext/**/*.cc", "ext/**/*.h",
|
33
|
+
"lib/**/*.rb", "sig/**/*"
|
34
34
|
]
|
35
35
|
gem.require_paths = [ "lib" ]
|
36
36
|
gem.extensions = [ "ext/cld3/extconf.rb" ]
|
@@ -0,0 +1,100 @@
|
|
1
|
+
/* Copyright 2022 Akihiko Odaki <akihiko.odaki@gmail.com>
|
2
|
+
All Rights Reserved.
|
3
|
+
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
5
|
+
you may not use this file except in compliance with the License.
|
6
|
+
You may obtain a copy of the License at
|
7
|
+
|
8
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
13
|
+
See the License for the specific language governing permissions and
|
14
|
+
limitations under the License.
|
15
|
+
==============================================================================*/
|
16
|
+
|
17
|
+
#ifndef FEATURE_EXTRACTOR_PB_H_
|
18
|
+
#define FEATURE_EXTRACTOR_PB_H_
|
19
|
+
|
20
|
+
#include <cstdint>
|
21
|
+
#include <string>
|
22
|
+
#include <vector>
|
23
|
+
|
24
|
+
namespace chrome_lang_id {
|
25
|
+
|
26
|
+
class Parameter {
|
27
|
+
public:
|
28
|
+
const std::string& name() const { return name_; }
|
29
|
+
void set_name(std::string value) { name_ = std::move(value); }
|
30
|
+
const std::string& value() const { return value_; }
|
31
|
+
void set_value(std::string value) { value_ = std::move(value); }
|
32
|
+
|
33
|
+
private:
|
34
|
+
std::string name_;
|
35
|
+
std::string value_;
|
36
|
+
};
|
37
|
+
|
38
|
+
class FeatureFunctionDescriptor {
|
39
|
+
public:
|
40
|
+
const std::string& type() const { return type_; }
|
41
|
+
|
42
|
+
void set_type(std::string value) { type_ = std::move(value); }
|
43
|
+
|
44
|
+
const std::string& name() const { return name_; }
|
45
|
+
|
46
|
+
void set_name(std::string value) { name_ = std::move(value); }
|
47
|
+
|
48
|
+
bool has_argument() const { return true; }
|
49
|
+
|
50
|
+
std::int32_t argument() const { return argument_; }
|
51
|
+
|
52
|
+
void set_argument(int32_t value) { argument_ = value; }
|
53
|
+
|
54
|
+
int parameter_size() const { return parameter_.size(); }
|
55
|
+
|
56
|
+
const Parameter& parameter(int index) const { return parameter_[index]; }
|
57
|
+
|
58
|
+
Parameter* add_parameter() { return &parameter_.emplace_back(); }
|
59
|
+
|
60
|
+
int feature_size() const { return feature_.size(); }
|
61
|
+
|
62
|
+
FeatureFunctionDescriptor* mutable_feature(int index) {
|
63
|
+
return &feature_[index];
|
64
|
+
}
|
65
|
+
|
66
|
+
const FeatureFunctionDescriptor& feature(int index) const {
|
67
|
+
return feature_[index];
|
68
|
+
}
|
69
|
+
|
70
|
+
FeatureFunctionDescriptor* add_feature() { return &feature_.emplace_back(); }
|
71
|
+
|
72
|
+
private:
|
73
|
+
std::string type_;
|
74
|
+
std::string name_;
|
75
|
+
std::int32_t argument_;
|
76
|
+
std::vector<Parameter> parameter_;
|
77
|
+
std::vector<FeatureFunctionDescriptor> feature_;
|
78
|
+
};
|
79
|
+
|
80
|
+
class FeatureExtractorDescriptor {
|
81
|
+
public:
|
82
|
+
int feature_size() const { return feature_.size(); }
|
83
|
+
|
84
|
+
FeatureFunctionDescriptor* mutable_feature(int index) {
|
85
|
+
return &feature_[index];
|
86
|
+
}
|
87
|
+
|
88
|
+
const FeatureFunctionDescriptor& feature(int index) const {
|
89
|
+
return feature_[index];
|
90
|
+
}
|
91
|
+
|
92
|
+
FeatureFunctionDescriptor* add_feature() { return &feature_.emplace_back(); }
|
93
|
+
|
94
|
+
private:
|
95
|
+
std::vector<FeatureFunctionDescriptor> feature_;
|
96
|
+
};
|
97
|
+
|
98
|
+
}
|
99
|
+
|
100
|
+
#endif
|
@@ -0,0 +1,35 @@
|
|
1
|
+
/* Copyright 2022 Akihiko Odaki <akihiko.odaki@gmail.com>
|
2
|
+
All Rights Reserved.
|
3
|
+
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
5
|
+
you may not use this file except in compliance with the License.
|
6
|
+
You may obtain a copy of the License at
|
7
|
+
|
8
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
13
|
+
See the License for the specific language governing permissions and
|
14
|
+
limitations under the License.
|
15
|
+
==============================================================================*/
|
16
|
+
|
17
|
+
#ifndef SENTENCE_PB_H_
|
18
|
+
#define SENTENCE_PB_H_
|
19
|
+
|
20
|
+
#include <string>
|
21
|
+
|
22
|
+
namespace chrome_lang_id {
|
23
|
+
|
24
|
+
class Sentence {
|
25
|
+
public:
|
26
|
+
const std::string& text() const { return text_; }
|
27
|
+
void set_text(std::string value) { text_ = std::move(value); }
|
28
|
+
|
29
|
+
private:
|
30
|
+
std::string text_;
|
31
|
+
};
|
32
|
+
|
33
|
+
}
|
34
|
+
|
35
|
+
#endif
|
@@ -0,0 +1,106 @@
|
|
1
|
+
/* Copyright 2022 Akihiko Odaki <akihiko.odaki@gmail.com>
|
2
|
+
All Rights Reserved.
|
3
|
+
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
5
|
+
you may not use this file except in compliance with the License.
|
6
|
+
You may obtain a copy of the License at
|
7
|
+
|
8
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
13
|
+
See the License for the specific language governing permissions and
|
14
|
+
limitations under the License.
|
15
|
+
==============================================================================*/
|
16
|
+
|
17
|
+
#ifndef TASK_SPEC_PB_H_
|
18
|
+
#define TASK_SPEC_PB_H_
|
19
|
+
|
20
|
+
#include <string>
|
21
|
+
#include <vector>
|
22
|
+
|
23
|
+
namespace chrome_lang_id {
|
24
|
+
|
25
|
+
class TaskInput {
|
26
|
+
public:
|
27
|
+
class Part {
|
28
|
+
public:
|
29
|
+
const std::string& file_pattern() const { return file_pattern_; }
|
30
|
+
|
31
|
+
private:
|
32
|
+
std::string file_pattern_;
|
33
|
+
};
|
34
|
+
|
35
|
+
const std::string& name() const { return name_; }
|
36
|
+
|
37
|
+
void set_name(std::string value) { name_ = value; }
|
38
|
+
|
39
|
+
int file_format_size() const { return file_format_.size(); }
|
40
|
+
|
41
|
+
const std::string& file_format(int index) const {
|
42
|
+
return file_format_[index];
|
43
|
+
}
|
44
|
+
|
45
|
+
void add_file_format(std::string value) {
|
46
|
+
file_format_.push_back(std::move(value));
|
47
|
+
}
|
48
|
+
|
49
|
+
int record_format_size() const { return record_format_.size(); }
|
50
|
+
|
51
|
+
const std::string& record_format(int index) const {
|
52
|
+
return record_format_[index];
|
53
|
+
}
|
54
|
+
|
55
|
+
void add_record_format(std::string value) {
|
56
|
+
record_format_.push_back(std::move(value));
|
57
|
+
}
|
58
|
+
|
59
|
+
int part_size() const { return part_.size(); }
|
60
|
+
const Part& part(int index) const { return part_[index]; }
|
61
|
+
|
62
|
+
private:
|
63
|
+
std::string name_;
|
64
|
+
std::vector<std::string> file_format_;
|
65
|
+
std::vector<std::string> record_format_;
|
66
|
+
std::vector<Part> part_;
|
67
|
+
};
|
68
|
+
|
69
|
+
class TaskSpec {
|
70
|
+
public:
|
71
|
+
class Parameter {
|
72
|
+
public:
|
73
|
+
const std::string& name() const { return name_; }
|
74
|
+
void set_name(std::string value) { name_ = std::move(value); }
|
75
|
+
const std::string& value() const { return value_; }
|
76
|
+
void set_value(std::string value) { value_ = std::move(value); }
|
77
|
+
|
78
|
+
private:
|
79
|
+
std::string name_;
|
80
|
+
std::string value_;
|
81
|
+
};
|
82
|
+
|
83
|
+
int parameter_size() const { return parameter_.size(); }
|
84
|
+
|
85
|
+
Parameter* mutable_parameter(int index) { return &parameter_[index]; }
|
86
|
+
|
87
|
+
const Parameter& parameter(int index) const { return parameter_[index]; }
|
88
|
+
|
89
|
+
Parameter* add_parameter() { return &parameter_.emplace_back(); }
|
90
|
+
|
91
|
+
int input_size() const { return input_.size(); }
|
92
|
+
|
93
|
+
TaskInput* mutable_input(int index) { return &input_[index]; }
|
94
|
+
|
95
|
+
const TaskInput& input(int index) const { return input_[index]; }
|
96
|
+
|
97
|
+
TaskInput* add_input() { return &input_.emplace_back(); }
|
98
|
+
|
99
|
+
private:
|
100
|
+
std::vector<Parameter> parameter_;
|
101
|
+
std::vector<TaskInput> input_;
|
102
|
+
};
|
103
|
+
|
104
|
+
}
|
105
|
+
|
106
|
+
#endif
|
data/ext/cld3/extconf.rb
CHANGED
@@ -26,17 +26,8 @@ rescue
|
|
26
26
|
end
|
27
27
|
end
|
28
28
|
|
29
|
-
# Check pkg-config first to inform the library is missing if so.
|
30
|
-
pkg_config("protobuf") or abort "Failed to locate protobuf"
|
31
|
-
|
32
|
-
FileUtils.mkdir_p("cld_3/protos")
|
33
29
|
FileUtils.mkdir_p("script_span")
|
34
30
|
|
35
|
-
[ "feature_extractor", "sentence", "task_spec" ].each {|name|
|
36
|
-
system "protoc", "#{name}.proto", "--cpp_out=.", exception: true
|
37
|
-
ln_fallback("#{name}.pb.h", "cld_3/protos/#{name}.pb.h")
|
38
|
-
}
|
39
|
-
|
40
31
|
[
|
41
32
|
"fixunicodevalue.h",
|
42
33
|
"generated_ulscript.h",
|
@@ -55,6 +46,5 @@ FileUtils.mkdir_p("script_span")
|
|
55
46
|
ln_fallback("#{name}", "script_span/#{name}")
|
56
47
|
}
|
57
48
|
|
58
|
-
$CXXFLAGS += " -fvisibility=hidden -std=c++
|
59
|
-
|
60
|
-
create_makefile("libcld3")
|
49
|
+
$CXXFLAGS += " -fvisibility=hidden -std=c++17"
|
50
|
+
create_makefile("cld3_ext")
|
@@ -878,7 +878,6 @@ bool ScriptScanner::GetOneScriptSpan(LangSpan* span) {
|
|
878
878
|
// copying letters to buffer with single spaces for each run of non-letters
|
879
879
|
while (take < byte_length_) {
|
880
880
|
// Copy run of letters in same script (&LS | LS)*
|
881
|
-
int letter_count = 0; // Keep track of word length
|
882
881
|
bool need_break = false;
|
883
882
|
|
884
883
|
while (take < byte_length_) {
|
@@ -963,7 +962,6 @@ bool ScriptScanner::GetOneScriptSpan(LangSpan* span) {
|
|
963
962
|
map2original_.Delete(tlen - plen);
|
964
963
|
}
|
965
964
|
|
966
|
-
++letter_count;
|
967
965
|
if (put >= kMaxScriptBytes) {
|
968
966
|
// Buffer is full
|
969
967
|
span->truncated = true;
|
@@ -18,6 +18,7 @@ limitations under the License.
|
|
18
18
|
#include <iostream>
|
19
19
|
#include <string>
|
20
20
|
#include <utility>
|
21
|
+
#include <ruby.h>
|
21
22
|
#include "nnet_language_identifier.h"
|
22
23
|
|
23
24
|
#if defined _WIN32 || defined __CYGWIN__
|
@@ -27,89 +28,180 @@ limitations under the License.
|
|
27
28
|
#endif
|
28
29
|
|
29
30
|
struct Result {
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
31
|
+
VALUE result_klass;
|
32
|
+
VALUE span_info_klass;
|
33
|
+
const chrome_lang_id::NNetLanguageIdentifier::Result& data;
|
34
|
+
|
35
|
+
VALUE convert() const {
|
36
|
+
if (data.language == chrome_lang_id::NNetLanguageIdentifier::kUnknown)
|
37
|
+
return Qnil;
|
38
|
+
|
39
|
+
VALUE byte_ranges = rb_ary_new2(data.byte_ranges.size());
|
40
|
+
for (auto& byte_range_data : data.byte_ranges) {
|
41
|
+
VALUE argv[] = {
|
42
|
+
INT2NUM(byte_range_data.start_index),
|
43
|
+
INT2NUM(byte_range_data.end_index),
|
44
|
+
DBL2NUM(byte_range_data.probability),
|
45
|
+
};
|
46
|
+
|
47
|
+
VALUE byte_range = rb_class_new_instance(sizeof(argv) / sizeof(*argv),
|
48
|
+
argv,
|
49
|
+
span_info_klass);
|
50
|
+
rb_ary_push(byte_ranges, byte_range);
|
51
|
+
}
|
52
|
+
|
53
|
+
VALUE argv[] = {
|
54
|
+
ID2SYM(rb_intern2(data.language.data(), data.language.size())),
|
55
|
+
DBL2NUM(data.probability),
|
56
|
+
data.is_reliable ? Qtrue : Qfalse,
|
57
|
+
DBL2NUM(data.proportion),
|
58
|
+
byte_ranges,
|
59
|
+
};
|
60
|
+
|
61
|
+
return rb_class_new_instance(sizeof(argv) / sizeof(*argv), argv,
|
62
|
+
result_klass);
|
63
|
+
}
|
41
64
|
};
|
42
65
|
|
43
|
-
struct
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
66
|
+
struct ResultVector {
|
67
|
+
VALUE result_klass;
|
68
|
+
VALUE span_info_klass;
|
69
|
+
VALUE buffer;
|
70
|
+
const std::vector<chrome_lang_id::NNetLanguageIdentifier::Result>& data;
|
71
|
+
|
72
|
+
VALUE convert() const {
|
73
|
+
for (auto& element_data : data) {
|
74
|
+
Result result { result_klass, span_info_klass, element_data };
|
75
|
+
VALUE element = result.convert();
|
76
|
+
if (element == Qnil)
|
77
|
+
break;
|
78
|
+
|
79
|
+
rb_ary_push(buffer, element);
|
80
|
+
}
|
81
|
+
|
82
|
+
return buffer;
|
54
83
|
}
|
84
|
+
};
|
55
85
|
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
86
|
+
template<typename T>
|
87
|
+
VALUE convert_protected(VALUE arg)
|
88
|
+
{
|
89
|
+
auto result = reinterpret_cast<const T *>(arg);
|
90
|
+
return result->convert();
|
91
|
+
}
|
92
|
+
|
93
|
+
static void dfree(void *arg) {
|
94
|
+
auto data = static_cast<chrome_lang_id::NNetLanguageIdentifier *>(arg);
|
95
|
+
data->~NNetLanguageIdentifier();
|
96
|
+
xfree(arg);
|
97
|
+
}
|
98
|
+
|
99
|
+
static size_t dsize(const void *data) {
|
100
|
+
return sizeof(chrome_lang_id::NNetLanguageIdentifier);
|
101
|
+
}
|
102
|
+
|
103
|
+
static const rb_data_type_t data_type = {
|
104
|
+
.wrap_struct_name = "CLD3::NNetLanguageIdentifier",
|
105
|
+
.function = {
|
106
|
+
.dfree = dfree,
|
107
|
+
.dsize = dsize,
|
108
|
+
},
|
109
|
+
.flags = RUBY_TYPED_FREE_IMMEDIATELY
|
61
110
|
};
|
62
111
|
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
112
|
+
static VALUE find_language(VALUE obj,
|
113
|
+
VALUE result_klass, VALUE span_info_klass,
|
114
|
+
VALUE text) {
|
115
|
+
int state;
|
116
|
+
VALUE converted;
|
117
|
+
|
118
|
+
{
|
119
|
+
chrome_lang_id::NNetLanguageIdentifier *data;
|
120
|
+
TypedData_Get_Struct(obj, chrome_lang_id::NNetLanguageIdentifier,
|
121
|
+
&data_type, data);
|
122
|
+
std::string text_string = std::string(RSTRING_PTR(text), RSTRING_LEN(text));
|
123
|
+
auto result_data = data->FindLanguage(text_string);
|
124
|
+
Result result { result_klass, span_info_klass, result_data };
|
125
|
+
|
126
|
+
converted = rb_protect(convert_protected<Result>,
|
127
|
+
reinterpret_cast<VALUE>(&result),
|
128
|
+
&state);
|
69
129
|
}
|
70
130
|
|
71
|
-
|
72
|
-
|
73
|
-
chrome_lang_id::NNetLanguageIdentifier *instance,
|
74
|
-
const char *data, std::size_t size, int num_langs) {
|
75
|
-
std::string text(data, size);
|
76
|
-
return new auto(instance->FindTopNMostFreqLangs(text, num_langs));
|
77
|
-
}
|
131
|
+
if (state)
|
132
|
+
rb_jump_tag(state);
|
78
133
|
|
79
|
-
|
80
|
-
|
81
|
-
delete pointer;
|
82
|
-
}
|
134
|
+
return converted;
|
135
|
+
}
|
83
136
|
|
84
|
-
|
85
|
-
|
137
|
+
static VALUE find_top_n_most_freq_langs(VALUE obj,
|
138
|
+
VALUE result_klass,
|
139
|
+
VALUE span_info_klass,
|
140
|
+
VALUE text,
|
141
|
+
VALUE num_langs) {
|
142
|
+
int state;
|
143
|
+
VALUE converted;
|
144
|
+
|
145
|
+
{
|
146
|
+
chrome_lang_id::NNetLanguageIdentifier *data;
|
147
|
+
TypedData_Get_Struct(obj, chrome_lang_id::NNetLanguageIdentifier,
|
148
|
+
&data_type, data);
|
149
|
+
VALUE buffer = rb_ary_new2(NUM2INT(num_langs));
|
150
|
+
std::string text_string = std::string(RSTRING_PTR(text), RSTRING_LEN(text));
|
151
|
+
auto result_data = data->FindTopNMostFreqLangs(text_string, num_langs);
|
152
|
+
ResultVector result { result_klass, span_info_klass, buffer, result_data };
|
153
|
+
|
154
|
+
converted = rb_protect(convert_protected<ResultVector>,
|
155
|
+
reinterpret_cast<VALUE>(&result),
|
156
|
+
&state);
|
86
157
|
}
|
87
158
|
|
88
|
-
|
89
|
-
|
90
|
-
delete pointer;
|
91
|
-
}
|
159
|
+
if (state)
|
160
|
+
rb_jump_tag(state);
|
92
161
|
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
162
|
+
return converted;
|
163
|
+
}
|
164
|
+
|
165
|
+
static VALUE make(VALUE klass, VALUE min_num_bytes, VALUE max_num_bytes) {
|
166
|
+
int min_num_bytes_int = NUM2INT(min_num_bytes);
|
167
|
+
int max_num_bytes_int = NUM2INT(max_num_bytes);
|
168
|
+
chrome_lang_id::NNetLanguageIdentifier *data;
|
169
|
+
VALUE value = TypedData_Make_Struct(klass,
|
170
|
+
chrome_lang_id::NNetLanguageIdentifier,
|
171
|
+
&data_type, data);
|
172
|
+
new (data) chrome_lang_id::NNetLanguageIdentifier(min_num_bytes_int, max_num_bytes_int);
|
173
|
+
return value;
|
174
|
+
}
|
98
175
|
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
176
|
+
extern "C" EXPORT void Init_cld3_ext() {
|
177
|
+
VALUE cld3 = rb_const_get(rb_cObject, rb_intern("CLD3"));
|
178
|
+
VALUE identifier =
|
179
|
+
rb_const_get(cld3, rb_intern("NNetLanguageIdentifier"));
|
180
|
+
VALUE unstable = rb_const_get(identifier, rb_intern("Unstable"));
|
181
|
+
VALUE params = rb_const_get(cld3, rb_intern("TaskContextParams"));
|
182
|
+
VALUE language_names = rb_const_get(params, rb_intern("LANGUAGE_NAMES"));
|
183
|
+
|
184
|
+
rb_define_const(identifier, "MIN_NUM_BYTES_TO_CONSIDER",
|
185
|
+
INT2NUM(chrome_lang_id::NNetLanguageIdentifier::kMinNumBytesToConsider));
|
186
|
+
rb_define_const(identifier, "MAX_NUM_BYTES_TO_CONSIDER",
|
187
|
+
INT2NUM(chrome_lang_id::NNetLanguageIdentifier::kMaxNumBytesToConsider));
|
188
|
+
rb_define_const(identifier, "MAX_NUM_INPUT_BYTES_TO_CONSIDER",
|
189
|
+
INT2NUM(chrome_lang_id::NNetLanguageIdentifier::kMaxNumInputBytesToConsider));
|
190
|
+
rb_define_const(identifier, "RELIABILITY_THRESHOLD",
|
191
|
+
DBL2NUM(chrome_lang_id::NNetLanguageIdentifier::kReliabilityThreshold));
|
192
|
+
rb_define_const(identifier, "RELIABILITY_HR_BS_THRESHOLD",
|
193
|
+
DBL2NUM(chrome_lang_id::NNetLanguageIdentifier::kReliabilityHrBsThreshold));
|
194
|
+
|
195
|
+
rb_define_singleton_method(unstable, "make", make, 2);
|
196
|
+
rb_define_method(unstable, "find_language", find_language, 3);
|
197
|
+
rb_define_method(unstable, "find_top_n_most_freq_langs",
|
198
|
+
find_top_n_most_freq_langs, 4);
|
199
|
+
|
200
|
+
for (int i = 0; ; i++) {
|
201
|
+
const char *name = chrome_lang_id::TaskContextParams::language_names(i);
|
202
|
+
if (!name)
|
203
|
+
break;
|
204
|
+
|
205
|
+
rb_ary_push(language_names, ID2SYM(rb_intern(name)));
|
114
206
|
}
|
115
207
|
}
|