cld2-small 2.0.0 → 2.0.5

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: f88610ddab27246200e533e2a863ecfa32adbf0542490bfbe42deb1f54653b7a
4
- data.tar.gz: fbecb14a1eac61c643a6bf8c347005d5dabed88e83a301e12708ec056336ddd0
3
+ metadata.gz: 6f10fe04b1de4166a0dabfbae2294879bc24e749312461fa882fc2f0de8176a1
4
+ data.tar.gz: 9de3c4cecb30494f28924e98c76303192ebbee33105b778b3d606a390a4a2f69
5
5
  SHA512:
6
- metadata.gz: 3dd02811b0fe15197f8640019ce5235c2531d8af0a48a97cf944fb27d30797813ca91ca4ee2e80e703b8d197e2e3d5f180f45733e6d39139a830c15ea2bc95f9
7
- data.tar.gz: ff09ffa0689e7e9ee426f0a383cc82b238d5f891ed0f81d1478720b9ea24ae6f4a469bb1bab43255e610847348f07416a2f3dbc07f072c0c0861ef40fc0c16ed
6
+ metadata.gz: '00418d5b352b925de52c3e82c4ec7651f5089a15612b8a0a2a43c76b709efae082c219c0e4513714cd0b4de7dba83fd215a769d3d67cf0995c7e0893588a5eaf'
7
+ data.tar.gz: '078fd9c608432d46f03adc3592438e16f54dcd7a9118129cc6af12cd2344ff010862a7bb799262e434d475a5a7f342da34609413277c5db93433dc41fb116b0e'
data/README.md CHANGED
@@ -10,6 +10,8 @@ Blazing-fast language detection for Ruby provided using Compact Language Detecto
10
10
 
11
11
  ## How to Use
12
12
 
13
+ The `detect_language` method returns a hash with the detected language's name (`:name`), its code (`:code`), and a flag indicating whether the detection is reliable (`:reliable`).
14
+
13
15
  ```ruby
14
16
  CLD.detect_language("Working as expected")
15
17
  # => {:name => "ENGLISH", :code => "en", :reliable => true}
@@ -18,6 +20,43 @@ CLD.detect_language("plus ça change, plus c'est la même chose")
18
20
  # => {:name => "FRENCH", :code => "fr", :reliable => true}
19
21
  ```
20
22
 
23
+ **Options**
24
+
25
+ You can pass an options hash as the second argument to `detect_language`:
26
+
27
+ ```ruby
28
+ CLD.detect_language(text, options = {})
29
+ ```
30
+
31
+ Available options:
32
+
33
+ * `:is_plain_text` (Boolean, default: `true`): Set to `false` if the input text is HTML. CLD2 will then try to skip HTML tags.
34
+ * `:best_effort` (Boolean, default: `false`): If `true`, CLD2 will give its best-effort answer, even on short or ambiguous text, instead of potentially returning "Unknown".
35
+ * `:tld_hint` (String, default: `nil`): A Top-Level Domain hint (e.g., `"id"`, `"us"`) to boost detection accuracy for languages associated with that TLD.
36
+ * `:content_language_hint` (String, default: `nil`): An HTTP Content-Language header style hint (e.g., `"en,fr"`) to boost detection for the specified languages.
37
+ * `:score_as_quads` (Boolean, default: `false`): Forces CLD2 to use quadgram-based scoring for certain languages that are normally detected by script alone. This can refine detection for text in those languages, but its effect depends on CLD2's internal data tables.
38
+
39
+ **Examples with Options:**
40
+
41
+ ```ruby
42
+ # Using best_effort for short text
43
+ CLD.detect_language("test", best_effort: true)
44
+ # => Might return a language like {:name => "ENGLISH", :code => "en", :reliable => false}
45
+ # instead of "Unknown"
46
+
47
+ # Providing a TLD hint
48
+ CLD.detect_language("Ini adalah teks dalam bahasa Indonesia", tld_hint: "id")
49
+ # => {:name => "INDONESIAN", :code => "id", :reliable => true}
50
+
51
+ # Providing a content language hint
52
+ CLD.detect_language("Ceci est un texte en français.", content_language_hint: "fr,en")
53
+ # => {:name => "FRENCH", :code => "fr", :reliable => true}
54
+
55
+ # Using score_as_quads (example, effect depends on text and CLD2 tables)
56
+ CLD.detect_language("Ελληνικό κείμενο", score_as_quads: true)
57
+ # => May provide a more refined result for Greek
58
+ ```
59
+
21
60
  ## Installation
22
61
 
23
62
  Add this line to your application's Gemfile:
data/ext/cld/thunk.cc CHANGED
@@ -13,28 +13,43 @@ typedef struct {
13
13
  } RESULT;
14
14
 
15
15
  extern "C" {
16
- RESULT detectLanguageThunkInt(const char * src, bool is_plain_text) {
16
+ RESULT detectLanguageThunkInt(const char * src,
17
+ bool is_plain_text,
18
+ bool best_effort,
19
+ const char* tld_hint_param,
20
+ const char* content_lang_hint_param,
21
+ bool score_as_quads_param) {
17
22
  bool is_reliable;
18
23
  Language plus_one = UNKNOWN_LANGUAGE;
19
- const char* tld_hint = NULL;
20
- int encoding_hint = UNKNOWN_ENCODING;
21
- Language language_hint = UNKNOWN_LANGUAGE;
24
+ int encoding_hint_val = UNKNOWN_ENCODING;
25
+ Language language_hint_val = UNKNOWN_LANGUAGE;
26
+
27
+ CLDHints cld_hints = {content_lang_hint_param, tld_hint_param, encoding_hint_val, language_hint_val};
28
+
29
+ int flags = 0;
30
+ if (best_effort) {
31
+ flags |= kCLDFlagBestEffort;
32
+ }
33
+ if (score_as_quads_param) {
34
+ flags |= kCLDFlagScoreAsQuads;
35
+ }
22
36
 
23
37
  double normalized_score3[3];
24
38
  Language language3[3];
25
39
  int percent3[3];
26
40
  int text_bytes;
41
+ ResultChunkVector resultchunkvector;
27
42
 
28
43
  Language lang;
29
44
  lang = ExtDetectLanguageSummary(src,
30
45
  strlen(src),
31
46
  is_plain_text,
32
- tld_hint,
33
- encoding_hint,
34
- language_hint,
47
+ &cld_hints,
48
+ flags,
35
49
  language3,
36
50
  percent3,
37
51
  normalized_score3,
52
+ &resultchunkvector,
38
53
  &text_bytes,
39
54
  &is_reliable);
40
55
 
data/lib/cld/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module CLD
2
- VERSION = "2.0.0"
2
+ VERSION = "2.0.5"
3
3
  end
data/lib/cld.rb CHANGED
@@ -1,4 +1,4 @@
1
- require "cld/version"
1
+ require_relative "cld/version"
2
2
  require "ffi"
3
3
 
4
4
  module CLD
@@ -23,8 +23,19 @@ module CLD
23
23
 
24
24
  ffi_lib lib_path
25
25
 
26
- def self.detect_language(text, is_plain_text=true)
27
- result = detect_language_ext(text.to_s, is_plain_text)
26
+ def self.detect_language(text, options = {})
27
+ is_plain_text = options.fetch(:is_plain_text, true)
28
+ best_effort = options.fetch(:best_effort, false)
29
+ tld_hint = options.fetch(:tld_hint, nil)
30
+ content_language_hint = options.fetch(:content_language_hint, nil)
31
+ score_as_quads = options.fetch(:score_as_quads, false)
32
+
33
+ result = detect_language_ext(text.to_s,
34
+ is_plain_text,
35
+ best_effort,
36
+ tld_hint,
37
+ content_language_hint,
38
+ score_as_quads)
28
39
  Hash[ result.members.map {|member| [member.to_sym, result[member]]} ]
29
40
  end
30
41
 
@@ -34,5 +45,7 @@ module CLD
34
45
  layout :name, :string, :code, :string, :reliable, :bool
35
46
  end
36
47
 
37
- attach_function "detect_language_ext", "detectLanguageThunkInt", [:buffer_in, :bool], ReturnValue.by_value
48
+ attach_function "detect_language_ext", "detectLanguageThunkInt",
49
+ [:buffer_in, :bool, :bool, :string, :string, :bool],
50
+ ReturnValue.by_value
38
51
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cld2-small
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.0.0
4
+ version: 2.0.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Alessandro Dal Grande
@@ -10,7 +10,7 @@ authors:
10
10
  autorequire:
11
11
  bindir: bin
12
12
  cert_chain: []
13
- date: 2025-03-10 00:00:00.000000000 Z
13
+ date: 2025-05-22 00:00:00.000000000 Z
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
16
16
  name: ffi
@@ -74,51 +74,39 @@ files:
74
74
  - ext/cld/cld2/internal/cld2_dynamic_data_loader.h
75
75
  - ext/cld/cld2/internal/cld2_dynamic_data_tool.cc
76
76
  - ext/cld/cld2/internal/cld2_generated_cjk_compatible.cc
77
- - ext/cld/cld2/internal/cld2_generated_cjk_compatible.o
78
77
  - ext/cld/cld2/internal/cld2_generated_deltaocta0122.cc
79
78
  - ext/cld/cld2/internal/cld2_generated_deltaocta0527.cc
80
79
  - ext/cld/cld2/internal/cld2_generated_deltaoctachrome.cc
81
- - ext/cld/cld2/internal/cld2_generated_deltaoctachrome.o
82
80
  - ext/cld/cld2/internal/cld2_generated_distinctocta0122.cc
83
81
  - ext/cld/cld2/internal/cld2_generated_distinctocta0527.cc
84
82
  - ext/cld/cld2/internal/cld2_generated_distinctoctachrome.cc
85
- - ext/cld/cld2/internal/cld2_generated_distinctoctachrome.o
86
83
  - ext/cld/cld2/internal/cld2_generated_octa2_dummy.cc
87
84
  - ext/cld/cld2/internal/cld2_generated_quad0122.cc
88
85
  - ext/cld/cld2/internal/cld2_generated_quad0720.cc
89
86
  - ext/cld/cld2/internal/cld2_generated_quadchrome_16.cc
90
87
  - ext/cld/cld2/internal/cld2_generated_quadchrome_2.cc
91
- - ext/cld/cld2/internal/cld2_generated_quadchrome_2.o
92
88
  - ext/cld/cld2/internal/cld2_unittest.cc
93
89
  - ext/cld/cld2/internal/cld2_unittest_full.cc
94
90
  - ext/cld/cld2/internal/cld2tablesummary.h
95
91
  - ext/cld/cld2/internal/cld_generated_cjk_delta_bi_32.cc
96
92
  - ext/cld/cld2/internal/cld_generated_cjk_delta_bi_4.cc
97
- - ext/cld/cld2/internal/cld_generated_cjk_delta_bi_4.o
98
93
  - ext/cld/cld2/internal/cld_generated_cjk_uni_prop_80.cc
99
- - ext/cld/cld2/internal/cld_generated_cjk_uni_prop_80.o
100
94
  - ext/cld/cld2/internal/cld_generated_score_quad_octa_0122.cc
101
95
  - ext/cld/cld2/internal/cld_generated_score_quad_octa_0122_2.cc
102
96
  - ext/cld/cld2/internal/cld_generated_score_quad_octa_1024_256.cc
103
97
  - ext/cld/cld2/internal/cld_generated_score_quad_octa_2.cc
104
- - ext/cld/cld2/internal/cld_generated_score_quad_octa_2.o
105
98
  - ext/cld/cld2/internal/cldutil.cc
106
99
  - ext/cld/cld2/internal/cldutil.h
107
- - ext/cld/cld2/internal/cldutil.o
108
100
  - ext/cld/cld2/internal/cldutil_offline.cc
109
101
  - ext/cld/cld2/internal/cldutil_offline.h
110
102
  - ext/cld/cld2/internal/cldutil_shared.cc
111
103
  - ext/cld/cld2/internal/cldutil_shared.h
112
- - ext/cld/cld2/internal/cldutil_shared.o
113
104
  - ext/cld/cld2/internal/clean.sh
114
105
  - ext/cld/cld2/internal/compact_lang_det.cc
115
- - ext/cld/cld2/internal/compact_lang_det.o
116
106
  - ext/cld/cld2/internal/compact_lang_det_hint_code.cc
117
107
  - ext/cld/cld2/internal/compact_lang_det_hint_code.h
118
- - ext/cld/cld2/internal/compact_lang_det_hint_code.o
119
108
  - ext/cld/cld2/internal/compact_lang_det_impl.cc
120
109
  - ext/cld/cld2/internal/compact_lang_det_impl.h
121
- - ext/cld/cld2/internal/compact_lang_det_impl.o
122
110
  - ext/cld/cld2/internal/compact_lang_det_test.cc
123
111
  - ext/cld/cld2/internal/compile.sh
124
112
  - ext/cld/cld2/internal/compile_and_test_all.sh
@@ -127,42 +115,31 @@ files:
127
115
  - ext/cld/cld2/internal/compile_libs.sh
128
116
  - ext/cld/cld2/internal/debug.cc
129
117
  - ext/cld/cld2/internal/debug.h
130
- - ext/cld/cld2/internal/debug.o
131
118
  - ext/cld/cld2/internal/debug_empty.cc
132
119
  - ext/cld/cld2/internal/fixunicodevalue.cc
133
120
  - ext/cld/cld2/internal/fixunicodevalue.h
134
- - ext/cld/cld2/internal/fixunicodevalue.o
135
121
  - ext/cld/cld2/internal/generated_distinct_bi_0.cc
136
- - ext/cld/cld2/internal/generated_distinct_bi_0.o
137
122
  - ext/cld/cld2/internal/generated_entities.cc
138
- - ext/cld/cld2/internal/generated_entities.o
139
123
  - ext/cld/cld2/internal/generated_language.cc
140
124
  - ext/cld/cld2/internal/generated_language.h
141
- - ext/cld/cld2/internal/generated_language.o
142
125
  - ext/cld/cld2/internal/generated_ulscript.cc
143
126
  - ext/cld/cld2/internal/generated_ulscript.h
144
- - ext/cld/cld2/internal/generated_ulscript.o
145
127
  - ext/cld/cld2/internal/getonescriptspan.cc
146
128
  - ext/cld/cld2/internal/getonescriptspan.h
147
- - ext/cld/cld2/internal/getonescriptspan.o
148
129
  - ext/cld/cld2/internal/integral_types.h
149
130
  - ext/cld/cld2/internal/lang_script.cc
150
131
  - ext/cld/cld2/internal/lang_script.h
151
- - ext/cld/cld2/internal/lang_script.o
152
132
  - ext/cld/cld2/internal/langspan.h
153
133
  - ext/cld/cld2/internal/offsetmap.cc
154
134
  - ext/cld/cld2/internal/offsetmap.h
155
- - ext/cld/cld2/internal/offsetmap.o
156
135
  - ext/cld/cld2/internal/port.h
157
136
  - ext/cld/cld2/internal/scoreonescriptspan.cc
158
137
  - ext/cld/cld2/internal/scoreonescriptspan.h
159
- - ext/cld/cld2/internal/scoreonescriptspan.o
160
138
  - ext/cld/cld2/internal/scoreutf8text.cc
161
139
  - ext/cld/cld2/internal/stringpiece.h
162
140
  - ext/cld/cld2/internal/test_shuffle_1000_48_666.utf8.gz
163
141
  - ext/cld/cld2/internal/tote.cc
164
142
  - ext/cld/cld2/internal/tote.h
165
- - ext/cld/cld2/internal/tote.o
166
143
  - ext/cld/cld2/internal/unittest_data.h
167
144
  - ext/cld/cld2/internal/utf8acceptinterchange.h
168
145
  - ext/cld/cld2/internal/utf8prop_lettermarkscriptnum.h
@@ -170,7 +147,6 @@ files:
170
147
  - ext/cld/cld2/internal/utf8scannot_lettermarkspecial.h
171
148
  - ext/cld/cld2/internal/utf8statetable.cc
172
149
  - ext/cld/cld2/internal/utf8statetable.h
173
- - ext/cld/cld2/internal/utf8statetable.o
174
150
  - ext/cld/cld2/public/compact_lang_det.h
175
151
  - ext/cld/cld2/public/encodings.h
176
152
  - ext/cld/extconf.rb
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file