cld2-small 2.0.0 → 2.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +39 -0
- data/ext/cld/thunk.cc +22 -7
- data/lib/cld/version.rb +1 -1
- data/lib/cld.rb +17 -4
- metadata +2 -26
- data/ext/cld/cld2/internal/cld2_generated_cjk_compatible.o +0 -0
- data/ext/cld/cld2/internal/cld2_generated_deltaoctachrome.o +0 -0
- data/ext/cld/cld2/internal/cld2_generated_distinctoctachrome.o +0 -0
- data/ext/cld/cld2/internal/cld2_generated_quadchrome_2.o +0 -0
- data/ext/cld/cld2/internal/cld_generated_cjk_delta_bi_4.o +0 -0
- data/ext/cld/cld2/internal/cld_generated_cjk_uni_prop_80.o +0 -0
- data/ext/cld/cld2/internal/cld_generated_score_quad_octa_2.o +0 -0
- data/ext/cld/cld2/internal/cldutil.o +0 -0
- data/ext/cld/cld2/internal/cldutil_shared.o +0 -0
- data/ext/cld/cld2/internal/compact_lang_det.o +0 -0
- data/ext/cld/cld2/internal/compact_lang_det_hint_code.o +0 -0
- data/ext/cld/cld2/internal/compact_lang_det_impl.o +0 -0
- data/ext/cld/cld2/internal/debug.o +0 -0
- data/ext/cld/cld2/internal/fixunicodevalue.o +0 -0
- data/ext/cld/cld2/internal/generated_distinct_bi_0.o +0 -0
- data/ext/cld/cld2/internal/generated_entities.o +0 -0
- data/ext/cld/cld2/internal/generated_language.o +0 -0
- data/ext/cld/cld2/internal/generated_ulscript.o +0 -0
- data/ext/cld/cld2/internal/getonescriptspan.o +0 -0
- data/ext/cld/cld2/internal/lang_script.o +0 -0
- data/ext/cld/cld2/internal/offsetmap.o +0 -0
- data/ext/cld/cld2/internal/scoreonescriptspan.o +0 -0
- data/ext/cld/cld2/internal/tote.o +0 -0
- data/ext/cld/cld2/internal/utf8statetable.o +0 -0
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 6f10fe04b1de4166a0dabfbae2294879bc24e749312461fa882fc2f0de8176a1
|
|
4
|
+
data.tar.gz: 9de3c4cecb30494f28924e98c76303192ebbee33105b778b3d606a390a4a2f69
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: '00418d5b352b925de52c3e82c4ec7651f5089a15612b8a0a2a43c76b709efae082c219c0e4513714cd0b4de7dba83fd215a769d3d67cf0995c7e0893588a5eaf'
|
|
7
|
+
data.tar.gz: '078fd9c608432d46f03adc3592438e16f54dcd7a9118129cc6af12cd2344ff010862a7bb799262e434d475a5a7f342da34609413277c5db93433dc41fb116b0e'
|
data/README.md
CHANGED
|
@@ -10,6 +10,8 @@ Blazing-fast language detection for Ruby provided using Compact Language Detecto
|
|
|
10
10
|
|
|
11
11
|
## How to Use
|
|
12
12
|
|
|
13
|
+
The `detect_language` method returns a hash with the language name, code, and reliability.
|
|
14
|
+
|
|
13
15
|
```ruby
|
|
14
16
|
CLD.detect_language("Working as expected")
|
|
15
17
|
# => {:name => "ENGLISH", :code => "en", :reliable => true}
|
|
@@ -18,6 +20,43 @@ CLD.detect_language("plus ça change, plus c'est la même chose")
|
|
|
18
20
|
# => {:name => "FRENCH", :code => "fr", :reliable => true}
|
|
19
21
|
```
|
|
20
22
|
|
|
23
|
+
**Options**
|
|
24
|
+
|
|
25
|
+
You can pass an options hash as the second argument to `detect_language`:
|
|
26
|
+
|
|
27
|
+
```ruby
|
|
28
|
+
CLD.detect_language(text, options = {})
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
Available options:
|
|
32
|
+
|
|
33
|
+
* `:is_plain_text` (Boolean, default: `true`): Set to `false` if the input text is HTML. CLD2 will then try to skip HTML tags.
|
|
34
|
+
* `:best_effort` (Boolean, default: `false`): If `true`, CLD2 will give its best-effort answer, even on short or ambiguous text, instead of potentially returning "Unknown".
|
|
35
|
+
* `:tld_hint` (String, default: `nil`): A Top-Level Domain hint (e.g., `"id"`, `"us"`) to boost detection accuracy for languages associated with that TLD.
|
|
36
|
+
* `:content_language_hint` (String, default: `nil`): An HTTP Content-Language header style hint (e.g., `"en,fr"`) to boost detection for the specified languages.
|
|
37
|
+
* `:score_as_quads` (Boolean, default: `false`): Forces CLD2 to use quadgram-based scoring for certain languages that are normally detected by script alone. This can be a refinement for more meaningful text detection in those languages but depends on CLD2's internal data tables.
|
|
38
|
+
|
|
39
|
+
**Examples with Options:**
|
|
40
|
+
|
|
41
|
+
```ruby
|
|
42
|
+
# Using best_effort for short text
|
|
43
|
+
CLD.detect_language("test", best_effort: true)
|
|
44
|
+
# => Might return a language like {:name => "ENGLISH", :code => "en", :reliable => false}
|
|
45
|
+
# instead of "Unknown"
|
|
46
|
+
|
|
47
|
+
# Providing a TLD hint
|
|
48
|
+
CLD.detect_language("Ini adalah teks dalam bahasa Indonesia", tld_hint: "id")
|
|
49
|
+
# => {:name => "INDONESIAN", :code => "id", :reliable => true}
|
|
50
|
+
|
|
51
|
+
# Providing a content language hint
|
|
52
|
+
CLD.detect_language("Ceci est un texte en français.", content_language_hint: "fr,en")
|
|
53
|
+
# => {:name => "FRENCH", :code => "fr", :reliable => true}
|
|
54
|
+
|
|
55
|
+
# Using score_as_quads (example, effect depends on text and CLD2 tables)
|
|
56
|
+
CLD.detect_language("Ελληνικό κείμενο", score_as_quads: true)
|
|
57
|
+
# => May provide a more refined result for Greek
|
|
58
|
+
```
|
|
59
|
+
|
|
21
60
|
## Installation
|
|
22
61
|
|
|
23
62
|
Add this line to your application's Gemfile:
|
data/ext/cld/thunk.cc
CHANGED
|
@@ -13,28 +13,43 @@ typedef struct {
|
|
|
13
13
|
} RESULT;
|
|
14
14
|
|
|
15
15
|
extern "C" {
|
|
16
|
-
RESULT detectLanguageThunkInt(const char * src,
|
|
16
|
+
RESULT detectLanguageThunkInt(const char * src,
|
|
17
|
+
bool is_plain_text,
|
|
18
|
+
bool best_effort,
|
|
19
|
+
const char* tld_hint_param,
|
|
20
|
+
const char* content_lang_hint_param,
|
|
21
|
+
bool score_as_quads_param) {
|
|
17
22
|
bool is_reliable;
|
|
18
23
|
Language plus_one = UNKNOWN_LANGUAGE;
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
24
|
+
int encoding_hint_val = UNKNOWN_ENCODING;
|
|
25
|
+
Language language_hint_val = UNKNOWN_LANGUAGE;
|
|
26
|
+
|
|
27
|
+
CLDHints cld_hints = {content_lang_hint_param, tld_hint_param, encoding_hint_val, language_hint_val};
|
|
28
|
+
|
|
29
|
+
int flags = 0;
|
|
30
|
+
if (best_effort) {
|
|
31
|
+
flags |= kCLDFlagBestEffort;
|
|
32
|
+
}
|
|
33
|
+
if (score_as_quads_param) {
|
|
34
|
+
flags |= kCLDFlagScoreAsQuads;
|
|
35
|
+
}
|
|
22
36
|
|
|
23
37
|
double normalized_score3[3];
|
|
24
38
|
Language language3[3];
|
|
25
39
|
int percent3[3];
|
|
26
40
|
int text_bytes;
|
|
41
|
+
ResultChunkVector resultchunkvector;
|
|
27
42
|
|
|
28
43
|
Language lang;
|
|
29
44
|
lang = ExtDetectLanguageSummary(src,
|
|
30
45
|
strlen(src),
|
|
31
46
|
is_plain_text,
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
language_hint,
|
|
47
|
+
&cld_hints,
|
|
48
|
+
flags,
|
|
35
49
|
language3,
|
|
36
50
|
percent3,
|
|
37
51
|
normalized_score3,
|
|
52
|
+
&resultchunkvector,
|
|
38
53
|
&text_bytes,
|
|
39
54
|
&is_reliable);
|
|
40
55
|
|
data/lib/cld/version.rb
CHANGED
data/lib/cld.rb
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
|
|
1
|
+
require_relative "cld/version"
|
|
2
2
|
require "ffi"
|
|
3
3
|
|
|
4
4
|
module CLD
|
|
@@ -23,8 +23,19 @@ module CLD
|
|
|
23
23
|
|
|
24
24
|
ffi_lib lib_path
|
|
25
25
|
|
|
26
|
-
def self.detect_language(text,
|
|
27
|
-
|
|
26
|
+
def self.detect_language(text, options = {})
|
|
27
|
+
is_plain_text = options.fetch(:is_plain_text, true)
|
|
28
|
+
best_effort = options.fetch(:best_effort, false)
|
|
29
|
+
tld_hint = options.fetch(:tld_hint, nil)
|
|
30
|
+
content_language_hint = options.fetch(:content_language_hint, nil)
|
|
31
|
+
score_as_quads = options.fetch(:score_as_quads, false)
|
|
32
|
+
|
|
33
|
+
result = detect_language_ext(text.to_s,
|
|
34
|
+
is_plain_text,
|
|
35
|
+
best_effort,
|
|
36
|
+
tld_hint,
|
|
37
|
+
content_language_hint,
|
|
38
|
+
score_as_quads)
|
|
28
39
|
Hash[ result.members.map {|member| [member.to_sym, result[member]]} ]
|
|
29
40
|
end
|
|
30
41
|
|
|
@@ -34,5 +45,7 @@ module CLD
|
|
|
34
45
|
layout :name, :string, :code, :string, :reliable, :bool
|
|
35
46
|
end
|
|
36
47
|
|
|
37
|
-
attach_function "detect_language_ext", "detectLanguageThunkInt",
|
|
48
|
+
attach_function "detect_language_ext", "detectLanguageThunkInt",
|
|
49
|
+
[:buffer_in, :bool, :bool, :string, :string, :bool],
|
|
50
|
+
ReturnValue.by_value
|
|
38
51
|
end
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: cld2-small
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 2.0.0
|
|
4
|
+
version: 2.0.5
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Alessandro Dal Grande
|
|
@@ -10,7 +10,7 @@ authors:
|
|
|
10
10
|
autorequire:
|
|
11
11
|
bindir: bin
|
|
12
12
|
cert_chain: []
|
|
13
|
-
date: 2025-
|
|
13
|
+
date: 2025-05-22 00:00:00.000000000 Z
|
|
14
14
|
dependencies:
|
|
15
15
|
- !ruby/object:Gem::Dependency
|
|
16
16
|
name: ffi
|
|
@@ -74,51 +74,39 @@ files:
|
|
|
74
74
|
- ext/cld/cld2/internal/cld2_dynamic_data_loader.h
|
|
75
75
|
- ext/cld/cld2/internal/cld2_dynamic_data_tool.cc
|
|
76
76
|
- ext/cld/cld2/internal/cld2_generated_cjk_compatible.cc
|
|
77
|
-
- ext/cld/cld2/internal/cld2_generated_cjk_compatible.o
|
|
78
77
|
- ext/cld/cld2/internal/cld2_generated_deltaocta0122.cc
|
|
79
78
|
- ext/cld/cld2/internal/cld2_generated_deltaocta0527.cc
|
|
80
79
|
- ext/cld/cld2/internal/cld2_generated_deltaoctachrome.cc
|
|
81
|
-
- ext/cld/cld2/internal/cld2_generated_deltaoctachrome.o
|
|
82
80
|
- ext/cld/cld2/internal/cld2_generated_distinctocta0122.cc
|
|
83
81
|
- ext/cld/cld2/internal/cld2_generated_distinctocta0527.cc
|
|
84
82
|
- ext/cld/cld2/internal/cld2_generated_distinctoctachrome.cc
|
|
85
|
-
- ext/cld/cld2/internal/cld2_generated_distinctoctachrome.o
|
|
86
83
|
- ext/cld/cld2/internal/cld2_generated_octa2_dummy.cc
|
|
87
84
|
- ext/cld/cld2/internal/cld2_generated_quad0122.cc
|
|
88
85
|
- ext/cld/cld2/internal/cld2_generated_quad0720.cc
|
|
89
86
|
- ext/cld/cld2/internal/cld2_generated_quadchrome_16.cc
|
|
90
87
|
- ext/cld/cld2/internal/cld2_generated_quadchrome_2.cc
|
|
91
|
-
- ext/cld/cld2/internal/cld2_generated_quadchrome_2.o
|
|
92
88
|
- ext/cld/cld2/internal/cld2_unittest.cc
|
|
93
89
|
- ext/cld/cld2/internal/cld2_unittest_full.cc
|
|
94
90
|
- ext/cld/cld2/internal/cld2tablesummary.h
|
|
95
91
|
- ext/cld/cld2/internal/cld_generated_cjk_delta_bi_32.cc
|
|
96
92
|
- ext/cld/cld2/internal/cld_generated_cjk_delta_bi_4.cc
|
|
97
|
-
- ext/cld/cld2/internal/cld_generated_cjk_delta_bi_4.o
|
|
98
93
|
- ext/cld/cld2/internal/cld_generated_cjk_uni_prop_80.cc
|
|
99
|
-
- ext/cld/cld2/internal/cld_generated_cjk_uni_prop_80.o
|
|
100
94
|
- ext/cld/cld2/internal/cld_generated_score_quad_octa_0122.cc
|
|
101
95
|
- ext/cld/cld2/internal/cld_generated_score_quad_octa_0122_2.cc
|
|
102
96
|
- ext/cld/cld2/internal/cld_generated_score_quad_octa_1024_256.cc
|
|
103
97
|
- ext/cld/cld2/internal/cld_generated_score_quad_octa_2.cc
|
|
104
|
-
- ext/cld/cld2/internal/cld_generated_score_quad_octa_2.o
|
|
105
98
|
- ext/cld/cld2/internal/cldutil.cc
|
|
106
99
|
- ext/cld/cld2/internal/cldutil.h
|
|
107
|
-
- ext/cld/cld2/internal/cldutil.o
|
|
108
100
|
- ext/cld/cld2/internal/cldutil_offline.cc
|
|
109
101
|
- ext/cld/cld2/internal/cldutil_offline.h
|
|
110
102
|
- ext/cld/cld2/internal/cldutil_shared.cc
|
|
111
103
|
- ext/cld/cld2/internal/cldutil_shared.h
|
|
112
|
-
- ext/cld/cld2/internal/cldutil_shared.o
|
|
113
104
|
- ext/cld/cld2/internal/clean.sh
|
|
114
105
|
- ext/cld/cld2/internal/compact_lang_det.cc
|
|
115
|
-
- ext/cld/cld2/internal/compact_lang_det.o
|
|
116
106
|
- ext/cld/cld2/internal/compact_lang_det_hint_code.cc
|
|
117
107
|
- ext/cld/cld2/internal/compact_lang_det_hint_code.h
|
|
118
|
-
- ext/cld/cld2/internal/compact_lang_det_hint_code.o
|
|
119
108
|
- ext/cld/cld2/internal/compact_lang_det_impl.cc
|
|
120
109
|
- ext/cld/cld2/internal/compact_lang_det_impl.h
|
|
121
|
-
- ext/cld/cld2/internal/compact_lang_det_impl.o
|
|
122
110
|
- ext/cld/cld2/internal/compact_lang_det_test.cc
|
|
123
111
|
- ext/cld/cld2/internal/compile.sh
|
|
124
112
|
- ext/cld/cld2/internal/compile_and_test_all.sh
|
|
@@ -127,42 +115,31 @@ files:
|
|
|
127
115
|
- ext/cld/cld2/internal/compile_libs.sh
|
|
128
116
|
- ext/cld/cld2/internal/debug.cc
|
|
129
117
|
- ext/cld/cld2/internal/debug.h
|
|
130
|
-
- ext/cld/cld2/internal/debug.o
|
|
131
118
|
- ext/cld/cld2/internal/debug_empty.cc
|
|
132
119
|
- ext/cld/cld2/internal/fixunicodevalue.cc
|
|
133
120
|
- ext/cld/cld2/internal/fixunicodevalue.h
|
|
134
|
-
- ext/cld/cld2/internal/fixunicodevalue.o
|
|
135
121
|
- ext/cld/cld2/internal/generated_distinct_bi_0.cc
|
|
136
|
-
- ext/cld/cld2/internal/generated_distinct_bi_0.o
|
|
137
122
|
- ext/cld/cld2/internal/generated_entities.cc
|
|
138
|
-
- ext/cld/cld2/internal/generated_entities.o
|
|
139
123
|
- ext/cld/cld2/internal/generated_language.cc
|
|
140
124
|
- ext/cld/cld2/internal/generated_language.h
|
|
141
|
-
- ext/cld/cld2/internal/generated_language.o
|
|
142
125
|
- ext/cld/cld2/internal/generated_ulscript.cc
|
|
143
126
|
- ext/cld/cld2/internal/generated_ulscript.h
|
|
144
|
-
- ext/cld/cld2/internal/generated_ulscript.o
|
|
145
127
|
- ext/cld/cld2/internal/getonescriptspan.cc
|
|
146
128
|
- ext/cld/cld2/internal/getonescriptspan.h
|
|
147
|
-
- ext/cld/cld2/internal/getonescriptspan.o
|
|
148
129
|
- ext/cld/cld2/internal/integral_types.h
|
|
149
130
|
- ext/cld/cld2/internal/lang_script.cc
|
|
150
131
|
- ext/cld/cld2/internal/lang_script.h
|
|
151
|
-
- ext/cld/cld2/internal/lang_script.o
|
|
152
132
|
- ext/cld/cld2/internal/langspan.h
|
|
153
133
|
- ext/cld/cld2/internal/offsetmap.cc
|
|
154
134
|
- ext/cld/cld2/internal/offsetmap.h
|
|
155
|
-
- ext/cld/cld2/internal/offsetmap.o
|
|
156
135
|
- ext/cld/cld2/internal/port.h
|
|
157
136
|
- ext/cld/cld2/internal/scoreonescriptspan.cc
|
|
158
137
|
- ext/cld/cld2/internal/scoreonescriptspan.h
|
|
159
|
-
- ext/cld/cld2/internal/scoreonescriptspan.o
|
|
160
138
|
- ext/cld/cld2/internal/scoreutf8text.cc
|
|
161
139
|
- ext/cld/cld2/internal/stringpiece.h
|
|
162
140
|
- ext/cld/cld2/internal/test_shuffle_1000_48_666.utf8.gz
|
|
163
141
|
- ext/cld/cld2/internal/tote.cc
|
|
164
142
|
- ext/cld/cld2/internal/tote.h
|
|
165
|
-
- ext/cld/cld2/internal/tote.o
|
|
166
143
|
- ext/cld/cld2/internal/unittest_data.h
|
|
167
144
|
- ext/cld/cld2/internal/utf8acceptinterchange.h
|
|
168
145
|
- ext/cld/cld2/internal/utf8prop_lettermarkscriptnum.h
|
|
@@ -170,7 +147,6 @@ files:
|
|
|
170
147
|
- ext/cld/cld2/internal/utf8scannot_lettermarkspecial.h
|
|
171
148
|
- ext/cld/cld2/internal/utf8statetable.cc
|
|
172
149
|
- ext/cld/cld2/internal/utf8statetable.h
|
|
173
|
-
- ext/cld/cld2/internal/utf8statetable.o
|
|
174
150
|
- ext/cld/cld2/public/compact_lang_det.h
|
|
175
151
|
- ext/cld/cld2/public/encodings.h
|
|
176
152
|
- ext/cld/extconf.rb
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|