cld3 3.1.3 → 3.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +38 -0
- data/cld3.gemspec +1 -1
- data/lib/cld3.rb +52 -14
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 63df114521c24a9f75617d88ca7bcd2f62643884
|
4
|
+
data.tar.gz: e1c9bff9042b97ec6974506065ca6ccb634a4702
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4c653d6888313436fc2a90c2cd8c7e39c92bc1e8cf9ce09520cac466139b16e2dac285bf7fc587a39fa5313ee0c28da86963588bf39433c655a335d46e5ce419
|
7
|
+
data.tar.gz: d21670ff37a9404f2c07d8d2e17385dfe271f26eb436e76766aec5c6b372c0547bd975918f7d6a1d8d98ce811f218353e24d85fb07050b382c06f252a5b64b8c
|
data/README.md
CHANGED
@@ -13,6 +13,44 @@ cld3-ruby is an interface of Compact Language Detector v3 (CLD3) for Ruby.
|
|
13
13
|
`Rakefile` includes a Rake task to put this code into files buildable as a gem.
|
14
14
|
Build a gem with `rake` command.
|
15
15
|
|
16
|
+
## Troubleshooting Setup Problems
|
17
|
+
I (Akihiko Odaki) recommend setting up this library by installing it via `gem`.
|
18
|
+
|
19
|
+
`gem install cld3` triggers native library building. If it fails, you are likely
|
20
|
+
to be missing required facilities. Make sure a C++ compiler and protocol buffers
|
21
|
+
are installed. I recommend [GCC](https://gcc.gnu.org/) as a C++ compiler. Ruby is
|
22
|
+
likely to need [pkg-config](https://www.freedesktop.org/wiki/Software/pkg-config/)
|
23
|
+
as well.
|
24
|
+
|
25
|
+
Runtime errors are likely to be issues of [FFI](https://github.com/ffi/ffi) or
|
26
|
+
programming errors. Make sure they are all correct.
|
27
|
+
|
28
|
+
If you cannot identify the cause of your problem, run spec of this library and
|
29
|
+
see whether the problem is reproducible with it or not. The spec is not included in
|
30
|
+
the gem, so clone the source code repository and then run `rake spec`.
|
31
|
+
The source code repository is at
|
32
|
+
https://github.com/akihikodaki/cld3-ruby.
|
33
|
+
|
34
|
+
If you cannot solve your problem by yourself and would otherwise have to give up, or
|
35
|
+
find an issue in this library, please open an issue at
|
36
|
+
https://github.com/akihikodaki/cld3-ruby/issues.
|
37
|
+
|
38
|
+
If you found an issue and managed to fix it and agree to share the fix under
|
39
|
+
[Apache-2.0](https://www.apache.org/licenses/LICENSE-2.0), please open a pull
|
40
|
+
request at https://github.com/akihikodaki/cld3-ruby/pulls. Your contribution
|
41
|
+
would be appreciated by other users and recorded with Git.
|
42
|
+
|
43
|
+
## Versioning
|
44
|
+
|
45
|
+
The version has 3 parts: major, minor, and patch. They are joined with `.` as
|
46
|
+
delimiters in that order.
|
47
|
+
|
48
|
+
The increment of the major version and the minor version indicates it can involve
|
49
|
+
any change.
|
50
|
+
|
51
|
+
The increment of the patch version indicates there is no change of the supported
|
52
|
+
languages and no change of the existing APIs except `CLD3::Unstable`.
|
53
|
+
|
16
54
|
## Contact
|
17
55
|
|
18
56
|
To ask questions or report issues please open issues at
|
data/cld3.gemspec
CHANGED
@@ -16,7 +16,7 @@
|
|
16
16
|
|
17
17
|
Gem::Specification.new do |gem|
|
18
18
|
gem.name = "cld3"
|
19
|
-
gem.version = "3.
|
19
|
+
gem.version = "3.2.0"
|
20
20
|
gem.summary = "Compact Language Detector v3 (CLD3)"
|
21
21
|
gem.description = "Compact Language Detector v3 (CLD3) is a neural network model for language identification."
|
22
22
|
gem.license = "Apache-2.0"
|
data/lib/cld3.rb
CHANGED
@@ -26,36 +26,55 @@ module CLD3
|
|
26
26
|
class NNetLanguageIdentifier
|
27
27
|
# Min number of bytes needed to make a prediction if the constructor is
|
28
28
|
# called without the corresponding parameter.
|
29
|
+
# This is Numeric object.
|
29
30
|
MIN_NUM_BYTES_TO_CONSIDER = 140
|
30
31
|
|
31
32
|
# Max number of bytes needed to make a prediction if the constructor is
|
32
33
|
# called without the corresponding parameter.
|
34
|
+
# This is Numeric object.
|
33
35
|
MAX_NUM_BYTES_TO_CONSIDER = 700
|
34
36
|
|
35
37
|
# Max number of input bytes to process.
|
38
|
+
# This is Numeric object.
|
36
39
|
MAX_NUM_INPUT_BYTES_TO_CONSIDER = 10000
|
37
40
|
|
38
41
|
# Predictions with probability greater than or equal to this threshold are
|
39
42
|
# marked as reliable. This threshold was optimized on a set of text segments
|
40
43
|
# extracted from wikipedia, and results in an overall precision, recall,
|
41
44
|
# and f1 equal to 0.9760, 0.9624, and 0.9692, respectively.
|
45
|
+
# This is Numeric object.
|
42
46
|
RELIABILITY_THRESHOLD = 0.7
|
43
47
|
|
44
48
|
# Reliability threshold for the languages hr and bs.
|
49
|
+
# This is Numeric object.
|
45
50
|
RELIABILITY_HR_BS_THRESHOLD = 0.5
|
46
51
|
|
47
52
|
# Information about a predicted language.
|
53
|
+
# This is an instance of Struct with the following members:
|
54
|
+
#
|
55
|
+
# [language] This is a Symbol or nil.
|
56
|
+
#
|
57
|
+
# [probability] Language probability. This is Numeric object.
|
58
|
+
#
|
59
|
+
# [reliable?] Whether the prediction is reliable. This is true or false.
|
60
|
+
#
|
61
|
+
# [proportion] Proportion of bytes associated with the language. If
|
62
|
+
# #find_language is called, this variable is set to 1.
|
63
|
+
# This is Numeric object.
|
48
64
|
Result = Struct.new("Result", :language, :probability, :reliable?, :proportion)
|
49
65
|
|
66
|
+
# The arguments are two Numeric objects.
|
50
67
|
def initialize(minNumBytes = MIN_NUM_BYTES_TO_CONSIDER, maxNumBytes = MAX_NUM_BYTES_TO_CONSIDER)
|
51
|
-
@cc = Pointer.new(CLD3::Unstable.new_NNetLanguageIdentifier(minNumBytes, maxNumBytes))
|
68
|
+
@cc = CLD3::Unstable::NNetLanguageIdentifier::Pointer.new(CLD3::Unstable.new_NNetLanguageIdentifier(minNumBytes, maxNumBytes))
|
52
69
|
end
|
53
70
|
|
54
71
|
# Finds the most likely language for the given text, along with additional
|
55
72
|
# information (e.g., probability). The prediction is based on the first N
|
56
73
|
# bytes where N is the minimum between the number of interchange valid UTF8
|
57
|
-
# bytes and max_num_bytes_
|
58
|
-
# function returns nil.
|
74
|
+
# bytes and +max_num_bytes_+. If N is less than +min_num_bytes_+ long, then
|
75
|
+
# this function returns nil as language.
|
76
|
+
# The argument is a String object.
|
77
|
+
# The returned value of this function is an instance of Result.
|
59
78
|
def find_language(text)
|
60
79
|
text_utf8 = text.encode(Encoding::UTF_8)
|
61
80
|
pointer = FFI::MemoryPointer.new(:char, text_utf8.bytesize)
|
@@ -65,29 +84,48 @@ module CLD3
|
|
65
84
|
language = cc_result[:language_data].read_bytes(cc_result[:language_size])
|
66
85
|
|
67
86
|
Result.new(
|
68
|
-
language == "und" ? nil : language,
|
87
|
+
language == "und" ? nil : language.to_sym,
|
69
88
|
cc_result[:probability],
|
70
89
|
cc_result[:reliable?],
|
71
90
|
cc_result[:proportion])
|
72
91
|
end
|
92
|
+
end
|
73
93
|
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
94
|
+
# Encapsulates the TaskContext specifying only the parameters for the model.
|
95
|
+
# The model weights are loaded statically.
|
96
|
+
module TaskContextParams
|
97
|
+
# This is a frozen Array object containing symbols.
|
98
|
+
LANGUAGE_NAMES = [
|
99
|
+
:eo, :co, :eu, :ta, :de, :mt, :ps, :te, :su, :uz, :'zh-Latn', :ne,
|
100
|
+
:nl, :sw, :sq, :hmn, :ja, :no, :mn, :so, :ko, :kk, :sl, :ig,
|
101
|
+
:mr, :th, :zu, :ml, :hr, :bs, :lo, :sd, :cy, :hy, :uk, :pt,
|
102
|
+
:lv, :iw, :cs, :vi, :jv, :be, :km, :mk, :tr, :fy, :am, :zh,
|
103
|
+
:da, :sv, :fi, :ht, :af, :la, :id, :fil, :sm, :ca, :el, :ka,
|
104
|
+
:sr, :it, :sk, :ru, :'ru-Latn', :bg, :ny, :fa, :haw, :gl, :et,
|
105
|
+
:ms, :gd, :'bg-Latn', :ha, :is, :ur, :mi, :hi, :bn, :'hi-Latn', :fr,
|
106
|
+
:yi, :hu, :xh, :my, :tg, :ro, :ar, :lb, :'el-Latn', :st, :ceb,
|
107
|
+
:kn, :az, :si, :ky, :mg, :en, :gu, :es, :pl, :'ja-Latn', :ga, :lt,
|
108
|
+
:sn, :yo, :pa, :ku,
|
109
|
+
].freeze
|
81
110
|
end
|
82
111
|
|
112
|
+
# :nodoc: all
|
83
113
|
# Do NOT use this module from outside.
|
84
114
|
module Unstable
|
85
115
|
extend FFI::Library
|
86
116
|
|
87
117
|
ffi_lib File.join(File.expand_path(File.dirname(__FILE__)), "..", "ext", "cld3", "libcld3." + RbConfig::CONFIG["DLEXT"])
|
88
118
|
|
89
|
-
|
90
|
-
|
119
|
+
module NNetLanguageIdentifier
|
120
|
+
class Pointer < FFI::AutoPointer
|
121
|
+
def self.release(pointer)
|
122
|
+
CLD3::Unstable.delete_NNetLanguageIdentifier(pointer)
|
123
|
+
end
|
124
|
+
end
|
125
|
+
|
126
|
+
class Result < FFI::Struct
|
127
|
+
layout :language_data, :pointer, :language_size, :size_t, :probability, :float, :proportion, :float, :reliable?, :bool
|
128
|
+
end
|
91
129
|
end
|
92
130
|
|
93
131
|
attach_function :delete_NNetLanguageIdentifier, [ :pointer ], :void
|
@@ -95,6 +133,6 @@ module CLD3
|
|
95
133
|
attach_function :new_NNetLanguageIdentifier, [ :int, :int ], :pointer
|
96
134
|
|
97
135
|
attach_function :NNetLanguageIdentifier_find_language,
|
98
|
-
[ :pointer, :buffer_in, :size_t ],
|
136
|
+
[ :pointer, :buffer_in, :size_t ], NNetLanguageIdentifier::Result.by_value
|
99
137
|
end
|
100
138
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: cld3
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 3.
|
4
|
+
version: 3.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Akihiko Odaki
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-
|
11
|
+
date: 2017-09-16 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: ffi
|