cld3 3.4.3 → 3.4.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile +1 -0
- data/README.md +0 -18
- data/cld3.gemspec +2 -2
- data/ext/cld3/base.o +0 -0
- data/ext/cld3/embedding_feature_extractor.o +0 -0
- data/ext/cld3/embedding_network.o +0 -0
- data/ext/cld3/feature_extractor.o +0 -0
- data/ext/cld3/feature_extractor.pb.o +0 -0
- data/ext/cld3/feature_types.o +0 -0
- data/ext/cld3/fixunicodevalue.o +0 -0
- data/ext/cld3/fml_parser.o +0 -0
- data/ext/cld3/generated_entities.o +0 -0
- data/ext/cld3/generated_ulscript.o +0 -0
- data/ext/cld3/getonescriptspan.o +0 -0
- data/ext/cld3/lang_id_nn_params.o +0 -0
- data/ext/cld3/language_identifier_features.o +0 -0
- data/ext/cld3/libcld3.so +0 -0
- data/ext/cld3/nnet_language_identifier.o +0 -0
- data/ext/cld3/nnet_language_identifier_c.o +0 -0
- data/ext/cld3/offsetmap.o +0 -0
- data/ext/cld3/registry.o +0 -0
- data/ext/cld3/relevant_script_feature.o +0 -0
- data/ext/cld3/sentence.pb.o +0 -0
- data/ext/cld3/sentence_features.o +0 -0
- data/ext/cld3/task_context.o +0 -0
- data/ext/cld3/task_context_params.o +0 -0
- data/ext/cld3/task_spec.pb.o +0 -0
- data/ext/cld3/text_processing.o +0 -0
- data/ext/cld3/unicodetext.o +0 -0
- data/ext/cld3/utf8statetable.o +0 -0
- data/ext/cld3/utils.o +0 -0
- data/ext/cld3/workspace.o +0 -0
- data/lib/a.rb +24 -0
- data/lib/cld3.rb +3 -2
- metadata +7 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f40e4947fea97543686caceba0082bdba30b5ae0485a25b41004ad048057b0ad
|
4
|
+
data.tar.gz: e45c60300550caf513fdde6bcbc05e68e1063bf9ad8074626bf5f88f4a6f77bd
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 393fc138a279ee42c3de90c49bcc982e55860f74e2796d4c895d0f2f175894bcb1ec1bbe796811f896a16be9cc97943e1309cbe175bc029a510b4c51b2f700da
|
7
|
+
data.tar.gz: d16e8c87e7d12cc90cc1a4babb4873df8f553d9527e1d69a548a250ae0b240f79a6338070bbc88cbb0e23db48c23ef0393cd4b62e0ac673722ace81ce1564895
|
data/Gemfile
CHANGED
data/README.md
CHANGED
@@ -41,24 +41,6 @@ JRuby has a bug which prevents the feature detection. Apply the following
|
|
41
41
|
change:
|
42
42
|
https://github.com/jruby/jruby/pull/4118/commits/edad375ef4dcf195b19ce0afe4befac66468c736
|
43
43
|
|
44
|
-
#### OpenBSD
|
45
|
-
Ruby has a bug which recognizes non-fatal linker warnings as fatal. Apply the
|
46
|
-
following patch to Ruby to workaround the bug.
|
47
|
-
|
48
|
-
```diff
|
49
|
-
--- a/lib/mkmf.rb
|
50
|
-
+++ b/lib/mkmf.rb
|
51
|
-
@@ -657,7 +657,7 @@ def with_ldflags(flags)
|
52
|
-
end
|
53
|
-
|
54
|
-
def try_ldflags(flags, opts = {})
|
55
|
-
- try_link(MAIN_DOES_NOTHING, flags, {:werror => true}.update(opts))
|
56
|
-
+ try_link(MAIN_DOES_NOTHING, flags, {:werror => false}.update(opts))
|
57
|
-
end
|
58
|
-
|
59
|
-
def append_ldflags(flags, *opts)
|
60
|
-
```
|
61
|
-
|
62
44
|
### Troubleshooting
|
63
45
|
`gem install cld3` triggers native library building. If it fails, you are likely
|
64
46
|
to missing required facilities. Make sure C++ compiler and protocol buffers
|
data/cld3.gemspec
CHANGED
@@ -16,7 +16,7 @@
|
|
16
16
|
|
17
17
|
Gem::Specification.new do |gem|
|
18
18
|
gem.name = "cld3"
|
19
|
-
gem.version = "3.4.3"
|
19
|
+
gem.version = "3.4.4"
|
20
20
|
gem.summary = "Compact Language Detector v3 (CLD3)"
|
21
21
|
gem.description = "Compact Language Detector v3 (CLD3) is a neural network model for language identification."
|
22
22
|
gem.license = "Apache-2.0"
|
@@ -27,7 +27,7 @@ Gem::Specification.new do |gem|
|
|
27
27
|
gem.add_dependency "ffi", [ ">= 1.1.0", "< 1.16.0" ]
|
28
28
|
gem.add_development_dependency "rbs", [ ">= 1.7.0", "< 1.8.0" ]
|
29
29
|
gem.add_development_dependency "rspec", [ ">=3.0.0", "< 3.11.0" ]
|
30
|
-
gem.add_development_dependency "steep", [ ">= 0.
|
30
|
+
gem.add_development_dependency "steep", [ ">= 0.47.0", "< 0.48.0" ]
|
31
31
|
gem.files = Dir[
|
32
32
|
"Gemfile", "LICENSE", "LICENSE_CLD3", "README.md",
|
33
33
|
"cld3.gemspec", "ext/**/*", "lib/**/*", "sig/**/*"
|
data/ext/cld3/base.o
CHANGED
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
data/ext/cld3/feature_types.o
CHANGED
Binary file
|
data/ext/cld3/fixunicodevalue.o
CHANGED
Binary file
|
data/ext/cld3/fml_parser.o
CHANGED
Binary file
|
Binary file
|
Binary file
|
data/ext/cld3/getonescriptspan.o
CHANGED
Binary file
|
Binary file
|
Binary file
|
data/ext/cld3/libcld3.so
CHANGED
Binary file
|
Binary file
|
Binary file
|
data/ext/cld3/offsetmap.o
CHANGED
Binary file
|
data/ext/cld3/registry.o
CHANGED
Binary file
|
Binary file
|
data/ext/cld3/sentence.pb.o
CHANGED
Binary file
|
Binary file
|
data/ext/cld3/task_context.o
CHANGED
Binary file
|
Binary file
|
data/ext/cld3/task_spec.pb.o
CHANGED
Binary file
|
data/ext/cld3/text_processing.o
CHANGED
Binary file
|
data/ext/cld3/unicodetext.o
CHANGED
Binary file
|
data/ext/cld3/utf8statetable.o
CHANGED
Binary file
|
data/ext/cld3/utils.o
CHANGED
Binary file
|
data/ext/cld3/workspace.o
CHANGED
Binary file
|
data/lib/a.rb
ADDED
@@ -0,0 +1,24 @@
|
|
1
|
+
require "cld3"
|
2
|
+
|
3
|
+
# Kafka text as an example + the word Velcro
|
4
|
+
text = "Πολυαγαπημένε πατέρα πρόσφατα Velcro με ρώτησες κάποια φορά γιατί ισχυρίζομαι πως σε φοβάμαι. Εγώ δεν ήξερα, ως συνήθως, τι να σου απαντήσω, εν μέρει ακριβώς λόγω του φόβου που νιώθω για σένα, εν μέρει επειδή στην αιτιολόγηση του φόβου αυτού συγκαταλέγονται πάρα πολλές λεπτομέρειες, που εν τη ρύμη του λόγου εγώ ούτε κατά το ήμισυ δεν θα μπορούσα να τις συγκρατήσω. Κι αν εδώ προσπαθώ να σου απαντήσω γραπτώς, μόνο ανολοκλήρωτο κατά πολύ θα αποβεί και τούτο, επειδή και κατά τη γραφή ο φόβος και οι συνέπειές του με κωλύουν έναντί σου κι επειδή το μέγεθος του υλικού εν γένει υπερβαίνει κατά πολύ τη μνήμη μου και το λογικό μου. Για σένα το ζήτημα αποδεικνυόταν πάντοτε πολύ απλό, τουλάχιστον στον βαθμό που μιλούσες εσύ γι’ αυτό ενώπιόν μου και, αδιακρίτως, ενώπιον πολλών άλλων. Εσένα σου φαινόταν να είναι κάπως έτσι: Εσύ εργαζόσουν σκληρά σ’ όλη σου τη ζωή, τα πάντα για τα παιδιά σου, προ πάντων για εμένα τα θυσίαζες, εγώ έκαμνα συνεπώς «ζωή χαρισάμενη», είχα πλήρη ελευθερία να μάθω ό,τι ήθελα, κανέναν λόγο δεν είχα να έχω έγνοιες για την καθημερινή διατροφή, να έχω έγνοιες συνεπώς εν γένει• εσύ αντ’ αυτών καμμίαν ευγνωμοσύνη δεν αξίωνες, γνωρίζεις «την ευγνωμοσύνη των παιδιών, αλλά εν τούτοις τουλάχιστον μια "
|
5
|
+
pp text.bytesize
|
6
|
+
|
7
|
+
200.times { |i|
|
8
|
+
max_bytes = 500 + i * 10
|
9
|
+
cld3 = CLD3::NNetLanguageIdentifier.new("foo", max_bytes)
|
10
|
+
|
11
|
+
lang = cld3.find_language(text)
|
12
|
+
lang2 = cld3.find_top_n_most_freq_langs(text, 1)
|
13
|
+
|
14
|
+
puts "When max_bytes is #{max_bytes} probability is less than 0.999: #{lang.probability}" if lang.probability < 0.999
|
15
|
+
|
16
|
+
if lang.language != :el
|
17
|
+
puts "When max_bytes is #{max_bytes} then cld3::find_language returns #{lang.language},
|
18
|
+
find_top_n_most_freq_langs returns #{lang2.first.language}"
|
19
|
+
#pp lang
|
20
|
+
#pp lang2
|
21
|
+
end
|
22
|
+
}
|
23
|
+
|
24
|
+
puts "Size: #{text.length} - Bytesize: #{text.encode(Encoding::UTF_8).bytesize}"
|
data/lib/cld3.rb
CHANGED
@@ -74,14 +74,15 @@ module CLD3
|
|
74
74
|
# @type const Result: untyped
|
75
75
|
Result = Struct.new(:language, :probability, :reliable?, :proportion, :byte_ranges)
|
76
76
|
|
77
|
-
# The arguments are two
|
77
|
+
# The arguments are two Numeric objects.
|
78
78
|
def initialize(min_num_bytes = MIN_NUM_BYTES_TO_CONSIDER, max_num_bytes = MAX_NUM_BYTES_TO_CONSIDER)
|
79
|
+
raise ArgumentError if max_num_bytes <= 0 || min_num_bytes < 0 || min_num_bytes >= max_num_bytes
|
79
80
|
@cc = Unstable::NNetLanguageIdentifier::Pointer.new(Unstable.new_NNetLanguageIdentifier(min_num_bytes, max_num_bytes))
|
80
81
|
end
|
81
82
|
|
82
83
|
# Finds the most likely language for the given text, along with additional
|
83
84
|
# information (e.g., probability). The prediction is based on the first N
|
84
|
-
# bytes where N is the
|
85
|
+
# bytes where N is the minimum between the number of interchange valid UTF8
|
85
86
|
# bytes and +max_num_bytes_+. If N is less than +min_num_bytes_+ long, then
|
86
87
|
# this function returns nil.
|
87
88
|
# The argument is a String object.
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: cld3
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 3.4.3
|
4
|
+
version: 3.4.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Akihiko Odaki
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2022-01-20 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: ffi
|
@@ -76,20 +76,20 @@ dependencies:
|
|
76
76
|
requirements:
|
77
77
|
- - ">="
|
78
78
|
- !ruby/object:Gem::Version
|
79
|
-
version: 0.
|
79
|
+
version: 0.47.0
|
80
80
|
- - "<"
|
81
81
|
- !ruby/object:Gem::Version
|
82
|
-
version: 0.
|
82
|
+
version: 0.48.0
|
83
83
|
type: :development
|
84
84
|
prerelease: false
|
85
85
|
version_requirements: !ruby/object:Gem::Requirement
|
86
86
|
requirements:
|
87
87
|
- - ">="
|
88
88
|
- !ruby/object:Gem::Version
|
89
|
-
version: 0.
|
89
|
+
version: 0.47.0
|
90
90
|
- - "<"
|
91
91
|
- !ruby/object:Gem::Version
|
92
|
-
version: 0.
|
92
|
+
version: 0.48.0
|
93
93
|
description: Compact Language Detector v3 (CLD3) is a neural network model for language
|
94
94
|
identification.
|
95
95
|
email: akihiko.odaki@gmail.com
|
@@ -199,6 +199,7 @@ files:
|
|
199
199
|
- ext/cld3/workspace.cc
|
200
200
|
- ext/cld3/workspace.h
|
201
201
|
- ext/cld3/workspace.o
|
202
|
+
- lib/a.rb
|
202
203
|
- lib/cld3.rb
|
203
204
|
- lib/cld3/unstable.rb
|
204
205
|
- sig/cld3.rbs
|