cld3 3.1.3 → 3.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +38 -0
- data/cld3.gemspec +1 -1
- data/lib/cld3.rb +52 -14
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 63df114521c24a9f75617d88ca7bcd2f62643884
|
4
|
+
data.tar.gz: e1c9bff9042b97ec6974506065ca6ccb634a4702
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4c653d6888313436fc2a90c2cd8c7e39c92bc1e8cf9ce09520cac466139b16e2dac285bf7fc587a39fa5313ee0c28da86963588bf39433c655a335d46e5ce419
|
7
|
+
data.tar.gz: d21670ff37a9404f2c07d8d2e17385dfe271f26eb436e76766aec5c6b372c0547bd975918f7d6a1d8d98ce811f218353e24d85fb07050b382c06f252a5b64b8c
|
data/README.md
CHANGED
@@ -13,6 +13,44 @@ cld3-ruby is an interface of Compact Language Detector v3 (CLD3) for Ruby.
|
|
13
13
|
`Rakefile` includes a Rake task to put this code into files buildable as a gem.
|
14
14
|
Build a gem with `rake` command.
|
15
15
|
|
16
|
+
## Troubleshooting Setup Problems
|
17
|
+
I (Akihiko Odaki) recommend setting up this library by installing it via `gem`.
|
18
|
+
|
19
|
+
`gem install cld3` triggers native library building. If it fails, you are likely
|
20
|
+
to be missing required facilities. Make sure a C++ compiler and protocol buffers
|
21
|
+
are installed. I recommend [GCC](https://gcc.gnu.org/) as a C++ compiler. Ruby is
|
22
|
+
likely to need [pkg-config](https://www.freedesktop.org/wiki/Software/pkg-config/)
|
23
|
+
as well.
|
24
|
+
|
25
|
+
Runtime errors are likely to be issues of [FFI](https://github.com/ffi/ffi) or
|
26
|
+
programming errors. Make sure they are all correct.
|
27
|
+
|
28
|
+
If you cannot identify the cause of your problem, run the spec of this library and
|
29
|
+
see whether the problem is reproducible with it or not. The spec is not included in
|
30
|
+
the gem, so clone the source code repository and then run `rake spec`.
|
31
|
+
The source code repository is at
|
32
|
+
https://github.com/akihikodaki/cld3-ruby.
|
33
|
+
|
34
|
+
If you cannot solve your problem by yourself and would otherwise have to give up, or
|
35
|
+
find an issue in this library, please open an issue at
|
36
|
+
https://github.com/akihikodaki/cld3-ruby/issues.
|
37
|
+
|
38
|
+
If you found an issue and managed to fix it and agree to share the fix under
|
39
|
+
[Apache-2.0](https://www.apache.org/licenses/LICENSE-2.0), please open a pull
|
40
|
+
request at https://github.com/akihikodaki/cld3-ruby/pulls. Your contribution
|
41
|
+
would be appreciated by other users and recorded with Git.
|
42
|
+
|
43
|
+
## Versioning
|
44
|
+
|
45
|
+
The version has 3 parts: major, minor, and patch. They are joined with `.` as
|
46
|
+
delimiters in that order.
|
47
|
+
|
48
|
+
An increment of the major version or the minor version indicates that it can involve
|
49
|
+
any change.
|
50
|
+
|
51
|
+
An increment of the patch version indicates there is no change to the supported
|
52
|
+
languages and no change to the existing APIs except `CLD3::Unstable`.
|
53
|
+
|
16
54
|
## Contact
|
17
55
|
|
18
56
|
To ask questions or report issues please open issues at
|
data/cld3.gemspec
CHANGED
@@ -16,7 +16,7 @@
|
|
16
16
|
|
17
17
|
Gem::Specification.new do |gem|
|
18
18
|
gem.name = "cld3"
|
19
|
-
gem.version = "3.1.3"
|
19
|
+
gem.version = "3.2.0"
|
20
20
|
gem.summary = "Compact Language Detector v3 (CLD3)"
|
21
21
|
gem.description = "Compact Language Detector v3 (CLD3) is a neural network model for language identification."
|
22
22
|
gem.license = "Apache-2.0"
|
data/lib/cld3.rb
CHANGED
@@ -26,36 +26,55 @@ module CLD3
|
|
26
26
|
class NNetLanguageIdentifier
|
27
27
|
# Min number of bytes needed to make a prediction if the constructor is
|
28
28
|
# called without the corresponding parameter.
|
29
|
+
# This is a Numeric object.
|
29
30
|
MIN_NUM_BYTES_TO_CONSIDER = 140
|
30
31
|
|
31
32
|
# Max number of bytes needed to make a prediction if the constructor is
|
32
33
|
# called without the corresponding parameter.
|
34
|
+
# This is a Numeric object.
|
33
35
|
MAX_NUM_BYTES_TO_CONSIDER = 700
|
34
36
|
|
35
37
|
# Max number of input bytes to process.
|
38
|
+
# This is a Numeric object.
|
36
39
|
MAX_NUM_INPUT_BYTES_TO_CONSIDER = 10000
|
37
40
|
|
38
41
|
# Predictions with probability greater than or equal to this threshold are
|
39
42
|
# marked as reliable. This threshold was optimized on a set of text segments
|
40
43
|
# extracted from wikipedia, and results in an overall precision, recall,
|
41
44
|
# and f1 equal to 0.9760, 0.9624, and 0.9692, respectively.
|
45
|
+
# This is a Numeric object.
|
42
46
|
RELIABILITY_THRESHOLD = 0.7
|
43
47
|
|
44
48
|
# Reliability threshold for the languages hr and bs.
|
49
|
+
# This is a Numeric object.
|
45
50
|
RELIABILITY_HR_BS_THRESHOLD = 0.5
|
46
51
|
|
47
52
|
# Information about a predicted language.
|
53
|
+
# This is an instance of Struct with the following members:
|
54
|
+
#
|
55
|
+
# [language] This is a Symbol or nil.
|
56
|
+
#
|
57
|
+
# [probability] Language probability. This is a Numeric object.
|
58
|
+
#
|
59
|
+
# [reliable?] Whether the prediction is reliable. This is true or false.
|
60
|
+
#
|
61
|
+
# [proportion] Proportion of bytes associated with the language. If
|
62
|
+
# #find_language is called, this variable is set to 1.
|
63
|
+
# This is a Numeric object.
|
48
64
|
Result = Struct.new("Result", :language, :probability, :reliable?, :proportion)
|
49
65
|
|
66
|
+
# The arguments are two Numeric objects (the minimum and maximum number of bytes to consider).
|
50
67
|
def initialize(minNumBytes = MIN_NUM_BYTES_TO_CONSIDER, maxNumBytes = MAX_NUM_BYTES_TO_CONSIDER)
|
51
|
-
@cc = Pointer.new(CLD3::Unstable.new_NNetLanguageIdentifier(minNumBytes, maxNumBytes))
|
68
|
+
@cc = CLD3::Unstable::NNetLanguageIdentifier::Pointer.new(CLD3::Unstable.new_NNetLanguageIdentifier(minNumBytes, maxNumBytes))
|
52
69
|
end
|
53
70
|
|
54
71
|
# Finds the most likely language for the given text, along with additional
|
55
72
|
# information (e.g., probability). The prediction is based on the first N
|
56
73
|
# bytes where N is the minimum between the number of interchange valid UTF8
|
57
|
-
# bytes and max_num_bytes_
|
58
|
-
# function returns nil.
|
74
|
+
# bytes and +max_num_bytes_+. If N is less than +min_num_bytes_+ long, then
|
75
|
+
# this function returns nil as language.
|
76
|
+
# The argument is a String object.
|
77
|
+
# The returned value of this function is an instance of Result.
|
59
78
|
def find_language(text)
|
60
79
|
text_utf8 = text.encode(Encoding::UTF_8)
|
61
80
|
pointer = FFI::MemoryPointer.new(:char, text_utf8.bytesize)
|
@@ -65,29 +84,48 @@ module CLD3
|
|
65
84
|
language = cc_result[:language_data].read_bytes(cc_result[:language_size])
|
66
85
|
|
67
86
|
Result.new(
|
68
|
-
language == "und" ? nil : language,
|
87
|
+
language == "und" ? nil : language.to_sym,
|
69
88
|
cc_result[:probability],
|
70
89
|
cc_result[:reliable?],
|
71
90
|
cc_result[:proportion])
|
72
91
|
end
|
92
|
+
end
|
73
93
|
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
94
|
+
# Encapsulates the TaskContext specifying only the parameters for the model.
|
95
|
+
# The model weights are loaded statically.
|
96
|
+
module TaskContextParams
|
97
|
+
# This is a frozen Array object containing symbols.
|
98
|
+
LANGUAGE_NAMES = [
|
99
|
+
:eo, :co, :eu, :ta, :de, :mt, :ps, :te, :su, :uz, :'zh-Latn', :ne,
|
100
|
+
:nl, :sw, :sq, :hmn, :ja, :no, :mn, :so, :ko, :kk, :sl, :ig,
|
101
|
+
:mr, :th, :zu, :ml, :hr, :bs, :lo, :sd, :cy, :hy, :uk, :pt,
|
102
|
+
:lv, :iw, :cs, :vi, :jv, :be, :km, :mk, :tr, :fy, :am, :zh,
|
103
|
+
:da, :sv, :fi, :ht, :af, :la, :id, :fil, :sm, :ca, :el, :ka,
|
104
|
+
:sr, :it, :sk, :ru, :'ru-Latn', :bg, :ny, :fa, :haw, :gl, :et,
|
105
|
+
:ms, :gd, :'bg-Latn', :ha, :is, :ur, :mi, :hi, :bn, :'hi-Latn', :fr,
|
106
|
+
:yi, :hu, :xh, :my, :tg, :ro, :ar, :lb, :'el-Latn', :st, :ceb,
|
107
|
+
:kn, :az, :si, :ky, :mg, :en, :gu, :es, :pl, :'ja-Latn', :ga, :lt,
|
108
|
+
:sn, :yo, :pa, :ku,
|
109
|
+
].freeze
|
81
110
|
end
|
82
111
|
|
112
|
+
# :nodoc: all
|
83
113
|
# Do NOT use this module from outside.
|
84
114
|
module Unstable
|
85
115
|
extend FFI::Library
|
86
116
|
|
87
117
|
ffi_lib File.join(File.expand_path(File.dirname(__FILE__)), "..", "ext", "cld3", "libcld3." + RbConfig::CONFIG["DLEXT"])
|
88
118
|
|
89
|
-
|
90
|
-
|
119
|
+
module NNetLanguageIdentifier
|
120
|
+
class Pointer < FFI::AutoPointer
|
121
|
+
def self.release(pointer)
|
122
|
+
CLD3::Unstable.delete_NNetLanguageIdentifier(pointer)
|
123
|
+
end
|
124
|
+
end
|
125
|
+
|
126
|
+
class Result < FFI::Struct
|
127
|
+
layout :language_data, :pointer, :language_size, :size_t, :probability, :float, :proportion, :float, :reliable?, :bool
|
128
|
+
end
|
91
129
|
end
|
92
130
|
|
93
131
|
attach_function :delete_NNetLanguageIdentifier, [ :pointer ], :void
|
@@ -95,6 +133,6 @@ module CLD3
|
|
95
133
|
attach_function :new_NNetLanguageIdentifier, [ :int, :int ], :pointer
|
96
134
|
|
97
135
|
attach_function :NNetLanguageIdentifier_find_language,
|
98
|
-
[ :pointer, :buffer_in, :size_t ],
|
136
|
+
[ :pointer, :buffer_in, :size_t ], NNetLanguageIdentifier::Result.by_value
|
99
137
|
end
|
100
138
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: cld3
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 3.1.3
|
4
|
+
version: 3.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Akihiko Odaki
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-
|
11
|
+
date: 2017-09-16 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: ffi
|