cld3 3.5.2 → 3.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +4 -3
- data/{Gemfile → Steepfile} +5 -3
- data/cld3.gemspec +7 -7
- data/ext/cld3/nnet_language_identifier_c.cc +6 -11
- data/sig/cld3.rbs +1 -2
- data/spec/cld3_spec.rb +99 -0
- metadata +16 -33
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a1a78e26ab2a3e05e4b030e92aac7285fef10cea9d5fe82c0cb60718cc4212e7
|
4
|
+
data.tar.gz: fa95eaf115c80978b13718b2c5e9cd34d379cb32947a20860a24f7a78a9be5aa
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e4e3870a821d1099b50db929e1ba330f451ca2af011f3aa601a6749f2276af3fb7db46fcd797e807c779d6aa7ce0a9a042a6c9233ed43a88ae7d47b5a8922484
|
7
|
+
data.tar.gz: 84894909ef06022f4ba4e0b3fd7c3e816accb058bd434c28dabd4db1cb2a851cd8c66c672e1ab56d0aa41ab8e4572c4beb85964e84c2fca7d52f58940f2e6cac
|
data/README.md
CHANGED
@@ -19,14 +19,14 @@ cld3.find_language("здравствуйте") # => #<struct Struct::Result lang
|
|
19
19
|
### Prerequisites
|
20
20
|
* [Bundler](http://bundler.io/)
|
21
21
|
* C++ compiler
|
22
|
-
* [Rake](https://ruby.github.io/rake/)
|
23
22
|
* [RubyGems](https://rubygems.org/)
|
24
23
|
|
25
24
|
### Instructions
|
26
25
|
I (Akihiko Odaki) recommend to setup this library installing via `gem`.
|
27
26
|
|
28
27
|
You can also build this library by yourself. `Rakefile` includes a Rake task to
|
29
|
-
put this code into files buildable as a gem. Build a gem with `rake`
|
28
|
+
put this code into files buildable as a gem. Build a gem with `bundle exec rake`
|
29
|
+
command.
|
30
30
|
|
31
31
|
### Platform-specific information
|
32
32
|
|
@@ -42,7 +42,8 @@ I recommend [GCC](https://gcc.gnu.org/) as a C++ compiler.
|
|
42
42
|
|
43
43
|
If you cannot identify the cause of your problem, run spec of this library and
|
44
44
|
see whether the problem is reproducible with it or not. Spec is not included in
|
45
|
-
the gem, so clone the source code repository and then run
|
45
|
+
the gem, so clone the source code repository and then run
|
46
|
+
`bundle exec rake spec`.
|
46
47
|
The source code repository is at
|
47
48
|
https://github.com/akihikodaki/cld3-ruby.
|
48
49
|
|
data/{Gemfile → Steepfile}
RENAMED
@@ -1,4 +1,4 @@
|
|
1
|
-
# Copyright
|
1
|
+
# Copyright 2021 Akihiko Odaki <akihiko.odaki@gmail.com>
|
2
2
|
# All Rights Reserved.
|
3
3
|
#
|
4
4
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
@@ -14,5 +14,7 @@
|
|
14
14
|
# limitations under the License.
|
15
15
|
#==============================================================================
|
16
16
|
|
17
|
-
|
18
|
-
|
17
|
+
target :lib do
|
18
|
+
signature "sig"
|
19
|
+
check "lib/cld3.rb"
|
20
|
+
end
|
data/cld3.gemspec
CHANGED
@@ -16,21 +16,21 @@
|
|
16
16
|
|
17
17
|
Gem::Specification.new do |gem|
|
18
18
|
gem.name = "cld3"
|
19
|
-
gem.version = "3.5.2"
|
19
|
+
gem.version = "3.6.0"
|
20
20
|
gem.summary = "Compact Language Detector v3 (CLD3)"
|
21
21
|
gem.description = "Compact Language Detector v3 (CLD3) is a neural network model for language identification."
|
22
22
|
gem.license = "Apache-2.0"
|
23
23
|
gem.homepage = "https://github.com/akihikodaki/cld3-ruby"
|
24
24
|
gem.author = "Akihiko Odaki"
|
25
25
|
gem.email = "akihiko.odaki@gmail.com"
|
26
|
-
gem.required_ruby_version = [ ">=
|
27
|
-
gem.add_development_dependency "rbs",
|
28
|
-
gem.add_development_dependency "rspec",
|
29
|
-
gem.add_development_dependency "steep",
|
26
|
+
gem.required_ruby_version = [ ">= 3.0.0", "< 3.4.0" ]
|
27
|
+
gem.add_development_dependency "rbs", "~> 3.1.0"
|
28
|
+
gem.add_development_dependency "rspec", "~> 3.12.0"
|
29
|
+
gem.add_development_dependency "steep", "~> 1.5.0"
|
30
30
|
gem.files = Dir[
|
31
|
-
"
|
31
|
+
"LICENSE", "LICENSE_CLD3", "README.md", "Steepfile",
|
32
32
|
"cld3.gemspec", "ext/**/*.c", "ext/**/*.cc", "ext/**/*.h",
|
33
|
-
"lib/**/*.rb", "sig
|
33
|
+
"lib/**/*.rb", "sig/**/*.rbs", "spec/**/*.rb"
|
34
34
|
]
|
35
35
|
gem.require_paths = [ "lib" ]
|
36
36
|
gem.extensions = [ "ext/cld3/extconf.rb" ]
|
data/ext/cld3/nnet_language_identifier_c.cc
CHANGED
@@ -24,7 +24,7 @@ limitations under the License.
|
|
24
24
|
#if defined _WIN32 || defined __CYGWIN__
|
25
25
|
#define EXPORT __declspec(dllexport)
|
26
26
|
#else
|
27
|
-
#define EXPORT __attribute__
|
27
|
+
#define EXPORT __attribute__((visibility("default")))
|
28
28
|
#endif
|
29
29
|
|
30
30
|
struct Result {
|
@@ -84,10 +84,9 @@ struct ResultVector {
|
|
84
84
|
};
|
85
85
|
|
86
86
|
template<typename T>
|
87
|
-
VALUE convert_protected(VALUE arg)
|
88
|
-
|
89
|
-
|
90
|
-
return result->convert();
|
87
|
+
VALUE convert_protected(VALUE arg) {
|
88
|
+
auto result = reinterpret_cast<const T *>(arg);
|
89
|
+
return result->convert();
|
91
90
|
}
|
92
91
|
|
93
92
|
static void dfree(void *arg) {
|
@@ -101,12 +100,8 @@ static size_t dsize(const void *data) {
|
|
101
100
|
}
|
102
101
|
|
103
102
|
static const rb_data_type_t data_type = {
|
104
|
-
|
105
|
-
|
106
|
-
.dfree = dfree,
|
107
|
-
.dsize = dsize,
|
108
|
-
},
|
109
|
-
.flags = RUBY_TYPED_FREE_IMMEDIATELY
|
103
|
+
"CLD3::NNetLanguageIdentifier", { nullptr, dfree, dsize }, nullptr, nullptr,
|
104
|
+
RUBY_TYPED_FREE_IMMEDIATELY
|
110
105
|
};
|
111
106
|
|
112
107
|
static VALUE find_language(VALUE obj,
|
data/sig/cld3.rbs
CHANGED
@@ -36,6 +36,7 @@ module CLD3
|
|
36
36
|
attr_accessor byte_ranges(): Array[SpanInfo]
|
37
37
|
end
|
38
38
|
|
39
|
+
Unstable: untyped
|
39
40
|
@cc: untyped
|
40
41
|
|
41
42
|
def initialize: (?Integer, ?Integer) -> void
|
@@ -62,6 +63,4 @@ module CLD3
|
|
62
63
|
|
63
64
|
LANGUAGE_NAMES: Array[language_names]
|
64
65
|
end
|
65
|
-
|
66
|
-
Unstable: untyped
|
67
66
|
end
|
data/spec/cld3_spec.rb
ADDED
@@ -0,0 +1,99 @@
|
|
1
|
+
# Copyright 2017 Akihiko Odaki <akihiko.odaki@gmail.com>
|
2
|
+
# All Rights Reserved.
|
3
|
+
#
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
5
|
+
# you may not use this file except in compliance with the License.
|
6
|
+
# You may obtain a copy of the License at
|
7
|
+
#
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
#
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
13
|
+
# See the License for the specific language governing permissions and
|
14
|
+
# limitations under the License.
|
15
|
+
#==============================================================================
|
16
|
+
|
17
|
+
require "bundler/setup"
|
18
|
+
Bundler.setup
|
19
|
+
|
20
|
+
require "rbs/test/setup"
|
21
|
+
require "cld3"
|
22
|
+
|
23
|
+
describe CLD3::NNetLanguageIdentifier do
|
24
|
+
describe "#initialize" do
|
25
|
+
it "is expected to raise ArgumentError with negative min_num_bytes" do
|
26
|
+
expect { described_class.new(-1, 1000) }.to raise_error(ArgumentError)
|
27
|
+
end
|
28
|
+
|
29
|
+
it "is expected to raise ArgumentError with min_num_bytes <= max_num_bytes" do
|
30
|
+
expect { described_class.new(0, 0) }.to raise_error(ArgumentError)
|
31
|
+
end
|
32
|
+
end
|
33
|
+
|
34
|
+
context "initialized without parameters" do
|
35
|
+
let(:lang_id) { described_class.new }
|
36
|
+
|
37
|
+
describe "#find_language" do
|
38
|
+
subject { lang_id.find_language("This text is written in English.") }
|
39
|
+
it { is_expected.to be_nil }
|
40
|
+
end
|
41
|
+
end
|
42
|
+
|
43
|
+
# See ext/cld3/ext/src/language_identifier_main.cc
|
44
|
+
context "initialized with custom parameters" do
|
45
|
+
let(:lang_id) { described_class.new(0, 1000) }
|
46
|
+
|
47
|
+
describe "#find_language" do
|
48
|
+
subject { lang_id.find_language text }
|
49
|
+
|
50
|
+
context "with an English text" do
|
51
|
+
let(:text) { "This text is written in English." }
|
52
|
+
it {
|
53
|
+
is_expected.to satisfy { |result|
|
54
|
+
result.language == :en &&
|
55
|
+
result.probability > 0 &&
|
56
|
+
result.probability < 1 &&
|
57
|
+
result.reliable? &&
|
58
|
+
result.proportion == 1 &&
|
59
|
+
result.byte_ranges == []
|
60
|
+
}
|
61
|
+
}
|
62
|
+
end
|
63
|
+
end
|
64
|
+
|
65
|
+
describe "#find_top_n_most_freq_langs" do
|
66
|
+
subject { lang_id.find_top_n_most_freq_langs text, 3 }
|
67
|
+
|
68
|
+
context "with an English text followed by a Russian text" do
|
69
|
+
let(:text) { "This piece of text is in English. Този текст е на Български." }
|
70
|
+
it {
|
71
|
+
is_expected.to satisfy { |results|
|
72
|
+
results.size == 2 &&
|
73
|
+
results[0].language == :bg &&
|
74
|
+
results[0].probability > 0 &&
|
75
|
+
results[0].probability < 1 &&
|
76
|
+
results[0].reliable? &&
|
77
|
+
results[0].proportion > 0 &&
|
78
|
+
results[0].proportion < 1 &&
|
79
|
+
results[0].byte_ranges.size == 1 &&
|
80
|
+
results[0].byte_ranges[0].start_index == 34 &&
|
81
|
+
results[0].byte_ranges[0].end_index == 81 &&
|
82
|
+
results[0].byte_ranges[0].probability == results[0].probability &&
|
83
|
+
results.size == 2 &&
|
84
|
+
results[1].language == :en &&
|
85
|
+
results[1].probability > 0 &&
|
86
|
+
results[1].probability < 1 &&
|
87
|
+
results[1].reliable? &&
|
88
|
+
results[1].proportion > 0 &&
|
89
|
+
results[1].proportion < 1 &&
|
90
|
+
results[1].byte_ranges.size == 1 &&
|
91
|
+
results[1].byte_ranges[0].start_index == 0 &&
|
92
|
+
results[1].byte_ranges[0].end_index == 34 &&
|
93
|
+
results[1].byte_ranges[0].probability == results[1].probability
|
94
|
+
}
|
95
|
+
}
|
96
|
+
end
|
97
|
+
end
|
98
|
+
end
|
99
|
+
end
|
metadata
CHANGED
@@ -1,75 +1,57 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: cld3
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 3.5.2
|
4
|
+
version: 3.6.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Akihiko Odaki
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-
|
11
|
+
date: 2023-07-17 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rbs
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
|
-
- - "
|
17
|
+
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version:
|
20
|
-
- - "<"
|
21
|
-
- !ruby/object:Gem::Version
|
22
|
-
version: 2.9.0
|
19
|
+
version: 3.1.0
|
23
20
|
type: :development
|
24
21
|
prerelease: false
|
25
22
|
version_requirements: !ruby/object:Gem::Requirement
|
26
23
|
requirements:
|
27
|
-
- - "
|
28
|
-
- !ruby/object:Gem::Version
|
29
|
-
version: 2.8.0
|
30
|
-
- - "<"
|
24
|
+
- - "~>"
|
31
25
|
- !ruby/object:Gem::Version
|
32
|
-
version:
|
26
|
+
version: 3.1.0
|
33
27
|
- !ruby/object:Gem::Dependency
|
34
28
|
name: rspec
|
35
29
|
requirement: !ruby/object:Gem::Requirement
|
36
30
|
requirements:
|
37
|
-
- - "
|
31
|
+
- - "~>"
|
38
32
|
- !ruby/object:Gem::Version
|
39
33
|
version: 3.12.0
|
40
|
-
- - "<"
|
41
|
-
- !ruby/object:Gem::Version
|
42
|
-
version: 3.13.0
|
43
34
|
type: :development
|
44
35
|
prerelease: false
|
45
36
|
version_requirements: !ruby/object:Gem::Requirement
|
46
37
|
requirements:
|
47
|
-
- - "
|
38
|
+
- - "~>"
|
48
39
|
- !ruby/object:Gem::Version
|
49
40
|
version: 3.12.0
|
50
|
-
- - "<"
|
51
|
-
- !ruby/object:Gem::Version
|
52
|
-
version: 3.13.0
|
53
41
|
- !ruby/object:Gem::Dependency
|
54
42
|
name: steep
|
55
43
|
requirement: !ruby/object:Gem::Requirement
|
56
44
|
requirements:
|
57
|
-
- - "
|
58
|
-
- !ruby/object:Gem::Version
|
59
|
-
version: 1.3.0
|
60
|
-
- - "<"
|
45
|
+
- - "~>"
|
61
46
|
- !ruby/object:Gem::Version
|
62
|
-
version: 1.
|
47
|
+
version: 1.5.0
|
63
48
|
type: :development
|
64
49
|
prerelease: false
|
65
50
|
version_requirements: !ruby/object:Gem::Requirement
|
66
51
|
requirements:
|
67
|
-
- - "
|
68
|
-
- !ruby/object:Gem::Version
|
69
|
-
version: 1.3.0
|
70
|
-
- - "<"
|
52
|
+
- - "~>"
|
71
53
|
- !ruby/object:Gem::Version
|
72
|
-
version: 1.
|
54
|
+
version: 1.5.0
|
73
55
|
description: Compact Language Detector v3 (CLD3) is a neural network model for language
|
74
56
|
identification.
|
75
57
|
email: akihiko.odaki@gmail.com
|
@@ -78,10 +60,10 @@ extensions:
|
|
78
60
|
- ext/cld3/extconf.rb
|
79
61
|
extra_rdoc_files: []
|
80
62
|
files:
|
81
|
-
- Gemfile
|
82
63
|
- LICENSE
|
83
64
|
- LICENSE_CLD3
|
84
65
|
- README.md
|
66
|
+
- Steepfile
|
85
67
|
- cld3.gemspec
|
86
68
|
- ext/cld3/base.cc
|
87
69
|
- ext/cld3/base.h
|
@@ -149,6 +131,7 @@ files:
|
|
149
131
|
- ext/cld3/workspace.h
|
150
132
|
- lib/cld3.rb
|
151
133
|
- sig/cld3.rbs
|
134
|
+
- spec/cld3_spec.rb
|
152
135
|
homepage: https://github.com/akihikodaki/cld3-ruby
|
153
136
|
licenses:
|
154
137
|
- Apache-2.0
|
@@ -161,10 +144,10 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
161
144
|
requirements:
|
162
145
|
- - ">="
|
163
146
|
- !ruby/object:Gem::Version
|
164
|
-
version:
|
147
|
+
version: 3.0.0
|
165
148
|
- - "<"
|
166
149
|
- !ruby/object:Gem::Version
|
167
|
-
version: 3.
|
150
|
+
version: 3.4.0
|
168
151
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
169
152
|
requirements:
|
170
153
|
- - ">="
|