lingua_rs 0.3.0 → 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: e301297292f2de86fd62416f6f06765c7ccd373c81020e2ad365e0c0151a55bf
4
- data.tar.gz: fe961910a3e44fa7df10580100b4aecef6f2552c08a7faba278a84eaf7ae89fc
3
+ metadata.gz: bd0dba860ee235a6acfc46afb3ba851ec08303df9499f403e47624c74b9396e7
4
+ data.tar.gz: b3d9faca1f0f12fb7abee0643b94f1f494ef02e6b4f2c92846fa88e94d503904
5
5
  SHA512:
6
- metadata.gz: 31e45f89765b5feb782a8b47f9574891c9ce826b5883f840b5f0d06c8715f26c9d9caa999a8e08349f475e307b2ccd28ecb303677ae8e51caa79920d94cc91fc
7
- data.tar.gz: 6ace3072e3b7e8e044c55712879ea398dc7f009475cb058444070cd22834bb7f4fa6525ab4a66bab95417d57b1f57b9599cda02dd159be80603280c0646b6414
6
+ metadata.gz: 2cae21715d81d650c71df5d55b18fb7b09eb86bcfe1356e5ace3b526a48e9ff31be39d852ee43abe342dffed711c7ea08ad3a9342702960b04a864a674df6068
7
+ data.tar.gz: 221eb0c62f4b7158d5c95b4d7c28490a2bcabc3f4cffe64914bea62c413c0d35fc8ceab1a6b6dc81a4750e75f8d7deac522bab1d0aa97b5c02cfd7d49e9eab40
data/README.md CHANGED
@@ -42,19 +42,18 @@ export PATH=/usr/lib/llvm21/bin:$PATH
42
42
  require 'lingua'
43
43
 
44
44
  lang = Lingua.detect('Bonjour le monde')
45
+ lang.french? # => true
45
46
  lang.to_s # => 'French'
46
47
  lang.to_sym # => :french
47
48
  lang.to_iso # => 'fr'
48
- lang.to_iso6391 # => 'fr'
49
49
  lang.to_iso6393 # => 'fra'
50
- lang.inspect # => '#<Lingua::Language French>'
51
50
 
52
51
  Lingua.detect('') # => nil
53
52
  ```
54
53
 
55
54
  ### Restricting languages
56
55
 
57
- Pass a `languages` option to limit detection to a subset of languages. Values can be full names, ISO 639-1 codes, or ISO 639-3 codes, and can be mixed freely.
56
+ Pass a `languages` option to limit detection to a subset of languages. Values can be full names, ISO 639-1 codes, or ISO 639-3 codes, and can be mixed freely. Restricting the candidate set is also faster, since the detector has fewer languages to compare against.
58
57
 
59
58
  ```ruby
60
59
  # By full name
@@ -154,9 +153,11 @@ Lingua::Language['xxx'] # => nil
154
153
  | `name` | `String` | `'French'` |
155
154
  | `to_s` | `String` | `'French'` (alias for `name`) |
156
155
  | `to_sym` | `Symbol` | `:french` |
156
+ | `iso_code` | `String` | `'fr'` (alias for `to_iso6391`) |
157
157
  | `to_iso` | `String` | `'fr'` (alias for `to_iso6391`) |
158
158
  | `to_iso6391` | `String` | `'fr'` |
159
159
  | `to_iso6393` | `String` | `'fra'` |
160
+ | `french?` | `Boolean` | `true` (works with name, ISO 639-1 or ISO 639-3: `fr?`, `fra?`) |
160
161
  | `inspect` | `String` | `'#<Lingua::Language French>'` |
161
162
  | `==` | `Boolean` | Compare two languages |
162
163
  | `hash` | `Integer` | Hash value (usable as Hash key) |
@@ -194,6 +195,60 @@ Returned by `detect_multiple`.
194
195
  | `to_s` | `String` | `'French (0-22): Parlez-vous français? '` |
195
196
  | `inspect` | `String` | `'#<Lingua::Segment French (0-22) "Parlez-vous français? ">'` |
196
197
 
198
+ ### Error handling
199
+
200
+ `Lingua::UnknownLanguageError` (subclass of `ArgumentError`) is raised when an unrecognized language name or code is passed:
201
+
202
+ ```ruby
203
+ Lingua.detect('Hello', languages: %w[en zzzz])
204
+ # => Lingua::UnknownLanguageError: unknown language: "zzzz"
205
+
206
+ # Can also be rescued as ArgumentError
207
+ begin
208
+ Lingua.detect('Hello', languages: %w[zzzz])
209
+ rescue ArgumentError => e
210
+ puts e.message
211
+ end
212
+ ```
213
+
214
+ ## Optimization: selecting languages
215
+
216
+ By default, all 75 languages are compiled into the native extension (~278 MB). If you only need a subset, set the `LINGUA_LANGUAGES` environment variable before installing to reduce binary size and improve detection speed:
217
+
218
+ ```bash
219
+ LINGUA_LANGUAGES=core bundle install
220
+ ```
221
+
222
+ This compiles only the selected language models (~29 MB for the `core` bundle). You can use individual language names or predefined bundles, and you can mix both in the same build.
223
+
224
+ This also works with `gem install`, for example:
225
+
226
+ ```bash
227
+ LINGUA_LANGUAGES=french,english,german gem install lingua_rs
228
+ ```
229
+
230
+ Available bundles:
231
+
232
+ | Bundle | Languages |
233
+ | --- | --- |
234
+ | `core` | `english`, `french`, `german`, `spanish`, `italian`, `portuguese` |
235
+ | `europe_common` | `core` + `dutch`, `polish`, `russian`, `turkish` |
236
+ | `americas` | `english`, `spanish`, `portuguese`, `french` |
237
+ | `mena` | `arabic`, `turkish`, `persian`, `hebrew` |
238
+ | `south_asia` | `hindi`, `urdu`, `bengali`, `tamil`, `telugu`, `punjabi`, `marathi`, `gujarati` |
239
+ | `east_asia` | `chinese`, `japanese`, `korean`, `vietnamese`, `thai` |
240
+ | `africa_common` | `arabic`, `english`, `french`, `swahili`, `somali`, `yoruba`, `zulu` |
241
+
242
+ Examples:
243
+
244
+ ```bash
245
+ LINGUA_LANGUAGES=core bundle install
246
+ LINGUA_LANGUAGES=europe_common,greek bundle install
247
+ LINGUA_LANGUAGES=east_asia,english bundle install
248
+ ```
249
+
250
+ Language and bundle names must match the Cargo features defined by this gem (lowercase, e.g. `french`, `english`, `german`, `core`, `europe_common`).
251
+
197
252
  ## Development
198
253
 
199
254
  After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake compile` to build the native extension and `rake test` to run the tests.
@@ -11,4 +11,187 @@ crate-type = ["cdylib"]
11
11
 
12
12
  [dependencies]
13
13
  magnus = { version = "0.8.2" }
14
- lingua = "1.8.0"
14
+ lingua = { version = "1.8.0", default-features = false }
15
+
16
+ [features]
17
+ default = ["full"]
18
+ # "full" must reach every per-language feature defined below, either directly
+ # or through one of the bundles, because it is the default feature set.
+ full = [
19
+ "europe_common",
20
+ "americas",
21
+ "mena",
22
+ "south_asia",
23
+ "east_asia",
24
+ "africa_common",
25
+ "afrikaans",
26
+ "albanian",
27
+ "armenian",
28
+ "azerbaijani",
29
+ "basque",
30
+ "belarusian",
31
+ "bokmal",
32
+ "bosnian",
33
+ "bulgarian",
34
+ "catalan",
35
+ "croatian",
36
+ "czech",
37
+ "danish",
38
+ "esperanto",
39
+ "estonian",
40
+ "finnish",
41
+ "ganda",
42
+ "georgian",
43
+ "greek",
44
+ "hungarian",
45
+ "icelandic",
46
+ "indonesian",
47
+ "irish",
48
+ "kazakh",
49
+ "latin",
50
+ "latvian",
51
+ "lithuanian",
52
+ "macedonian",
53
+ "malay",
54
+ "maori",
55
+ "mongolian",
56
+ "nynorsk",
57
+ "romanian",
58
+ "serbian",
59
+ "shona",
60
+ "slovak",
61
+ "slovene",
62
+ "sotho",
63
+ "swedish",
64
+ "tagalog",
65
+ "tsonga",
66
+ "tswana",
+ "ukrainian",
67
+ "welsh",
68
+ "xhosa",
69
+ ]
70
+ core = [
71
+ "english",
72
+ "french",
73
+ "german",
74
+ "spanish",
75
+ "italian",
76
+ "portuguese",
77
+ ]
78
+ europe_common = [
79
+ "core",
80
+ "dutch",
81
+ "polish",
82
+ "russian",
83
+ "turkish",
84
+ ]
85
+ americas = [
86
+ "english",
87
+ "spanish",
88
+ "portuguese",
89
+ "french",
90
+ ]
91
+ mena = [
92
+ "arabic",
93
+ "turkish",
94
+ "persian",
95
+ "hebrew",
96
+ ]
97
+ south_asia = [
98
+ "hindi",
99
+ "urdu",
100
+ "bengali",
101
+ "tamil",
102
+ "telugu",
103
+ "punjabi",
104
+ "marathi",
105
+ "gujarati",
106
+ ]
107
+ east_asia = [
108
+ "chinese",
109
+ "japanese",
110
+ "korean",
111
+ "vietnamese",
112
+ "thai",
113
+ ]
114
+ africa_common = [
115
+ "arabic",
116
+ "english",
117
+ "french",
118
+ "swahili",
119
+ "somali",
120
+ "yoruba",
121
+ "zulu",
122
+ ]
123
+ afrikaans = ["lingua/afrikaans"]
124
+ albanian = ["lingua/albanian"]
125
+ arabic = ["lingua/arabic"]
126
+ armenian = ["lingua/armenian"]
127
+ azerbaijani = ["lingua/azerbaijani"]
128
+ basque = ["lingua/basque"]
129
+ belarusian = ["lingua/belarusian"]
130
+ bengali = ["lingua/bengali"]
131
+ bokmal = ["lingua/bokmal"]
132
+ bosnian = ["lingua/bosnian"]
133
+ bulgarian = ["lingua/bulgarian"]
134
+ catalan = ["lingua/catalan"]
135
+ chinese = ["lingua/chinese"]
136
+ croatian = ["lingua/croatian"]
137
+ czech = ["lingua/czech"]
138
+ danish = ["lingua/danish"]
139
+ dutch = ["lingua/dutch"]
140
+ english = ["lingua/english"]
141
+ esperanto = ["lingua/esperanto"]
142
+ estonian = ["lingua/estonian"]
143
+ finnish = ["lingua/finnish"]
144
+ french = ["lingua/french"]
145
+ ganda = ["lingua/ganda"]
146
+ georgian = ["lingua/georgian"]
147
+ german = ["lingua/german"]
148
+ greek = ["lingua/greek"]
149
+ gujarati = ["lingua/gujarati"]
150
+ hebrew = ["lingua/hebrew"]
151
+ hindi = ["lingua/hindi"]
152
+ hungarian = ["lingua/hungarian"]
153
+ icelandic = ["lingua/icelandic"]
154
+ indonesian = ["lingua/indonesian"]
155
+ irish = ["lingua/irish"]
156
+ italian = ["lingua/italian"]
157
+ japanese = ["lingua/japanese"]
158
+ kazakh = ["lingua/kazakh"]
159
+ korean = ["lingua/korean"]
160
+ latin = ["lingua/latin"]
161
+ latvian = ["lingua/latvian"]
162
+ lithuanian = ["lingua/lithuanian"]
163
+ macedonian = ["lingua/macedonian"]
164
+ malay = ["lingua/malay"]
165
+ maori = ["lingua/maori"]
166
+ marathi = ["lingua/marathi"]
167
+ mongolian = ["lingua/mongolian"]
168
+ nynorsk = ["lingua/nynorsk"]
169
+ persian = ["lingua/persian"]
170
+ polish = ["lingua/polish"]
171
+ portuguese = ["lingua/portuguese"]
172
+ punjabi = ["lingua/punjabi"]
173
+ romanian = ["lingua/romanian"]
174
+ russian = ["lingua/russian"]
175
+ serbian = ["lingua/serbian"]
176
+ shona = ["lingua/shona"]
177
+ slovak = ["lingua/slovak"]
178
+ slovene = ["lingua/slovene"]
179
+ somali = ["lingua/somali"]
180
+ sotho = ["lingua/sotho"]
181
+ spanish = ["lingua/spanish"]
182
+ swahili = ["lingua/swahili"]
183
+ swedish = ["lingua/swedish"]
184
+ tagalog = ["lingua/tagalog"]
185
+ tamil = ["lingua/tamil"]
186
+ telugu = ["lingua/telugu"]
187
+ thai = ["lingua/thai"]
188
+ tsonga = ["lingua/tsonga"]
189
+ tswana = ["lingua/tswana"]
190
+ turkish = ["lingua/turkish"]
191
+ ukrainian = ["lingua/ukrainian"]
192
+ urdu = ["lingua/urdu"]
193
+ vietnamese = ["lingua/vietnamese"]
194
+ welsh = ["lingua/welsh"]
195
+ xhosa = ["lingua/xhosa"]
196
+ yoruba = ["lingua/yoruba"]
197
+ zulu = ["lingua/zulu"]
@@ -3,4 +3,13 @@
3
3
  require 'mkmf'
4
4
  require 'rb_sys/mkmf'
5
5
 
6
- create_rust_makefile('lingua/lingua')
6
+ # Allow users to select specific languages to reduce binary size.
7
+ # Example: LINGUA_LANGUAGES=french,english,german bundle install
8
+ languages = ENV.fetch('LINGUA_LANGUAGES', '').split(',').map { |lang| lang.strip.downcase }
9
+
10
+ create_rust_makefile('lingua/lingua') do |r|
11
+ unless languages.empty?
12
+ r.extra_cargo_args << '--no-default-features'
13
+ r.features = languages
14
+ end
15
+ end
@@ -3,7 +3,7 @@ use magnus::{Error, RArray, RHash, RModule, Ruby, function, method, prelude::*};
3
3
 
4
4
  use crate::confidence_result::ConfidenceResult;
5
5
  use crate::segment::Segment;
6
- use crate::helpers::{fetch_option, parse_language, value_to_string};
6
+ use crate::helpers::{fetch_option, parse_language, unknown_language_error, value_to_string};
7
7
  use crate::language::WrappedLanguage;
8
8
 
9
9
  pub fn define(ruby: &Ruby, module: &RModule) -> Result<(), Error> {
@@ -25,7 +25,7 @@ pub fn compute_confidence(
25
25
  let language_str = value_to_string(language)?;
26
26
  let lang = parse_language(&language_str).ok_or_else(|| {
27
27
  Error::new(
28
- ruby.exception_arg_error(),
28
+ unknown_language_error(&ruby),
29
29
  format!("unknown language: \"{language_str}\""),
30
30
  )
31
31
  })?;
@@ -116,7 +116,7 @@ pub fn build_detector_from_options(
116
116
  for l in &raw_languages {
117
117
  let lang = parse_language(l).ok_or_else(|| {
118
118
  Error::new(
119
- ruby.exception_arg_error(),
119
+ unknown_language_error(ruby),
120
120
  format!("unknown language: \"{l}\""),
121
121
  )
122
122
  })?;
@@ -1,7 +1,23 @@
1
1
  use std::str::FromStr;
2
+ use std::cell::Cell;
2
3
 
3
4
  use lingua::{IsoCode639_1, IsoCode639_3, Language};
4
- use magnus::{Error, Ruby, Symbol, prelude::*};
5
+ use magnus::{ExceptionClass, Error, RModule, Ruby, Symbol, prelude::*};
6
+
7
+ thread_local! {
+     /// Per-thread cache of the `Lingua::UnknownLanguageError` exception class.
+     ///
+     /// Thread-local storage is per OS thread, so only the thread that ran
+     /// `define_errors` starts with a populated cache; every other thread fills
+     /// it lazily on its first call to `unknown_language_error`.
+     static UNKNOWN_LANGUAGE_ERROR: Cell<Option<ExceptionClass>> = const { Cell::new(None) };
+ }
+
+ /// Defines `UnknownLanguageError` under `module` as a subclass of
+ /// `ArgumentError` and caches the new class for the current thread.
+ ///
+ /// # Errors
+ ///
+ /// Returns the error reported by `define_error` if the class cannot be defined.
+ ///
+ /// # Panics
+ ///
+ /// Panics if no Ruby handle can be obtained via `Ruby::get()`.
+ pub fn define_errors(module: &RModule) -> Result<(), Error> {
+     let ruby = Ruby::get().expect("Ruby handle unavailable while defining Lingua errors");
+     let class = module.define_error("UnknownLanguageError", ruby.exception_arg_error())?;
+     UNKNOWN_LANGUAGE_ERROR.set(Some(class));
+     Ok(())
+ }
+
+ /// Returns the exception class to raise for an unrecognized language name or code.
+ ///
+ /// The cached class is used when this thread already has one. On a cache miss
+ /// (a thread other than the one that ran `define_errors`) the class is resolved
+ /// through the `Lingua::UnknownLanguageError` constant and cached, so every
+ /// thread raises the same class. `ArgumentError` is returned only if that
+ /// constant cannot be resolved.
+ pub fn unknown_language_error(ruby: &Ruby) -> ExceptionClass {
+     if let Some(class) = UNKNOWN_LANGUAGE_ERROR.get() {
+         return class;
+     }
+
+     let resolved = ruby
+         .class_object()
+         .const_get::<_, RModule>("Lingua")
+         .and_then(|module| module.const_get::<_, ExceptionClass>("UnknownLanguageError"));
+
+     match resolved {
+         Ok(class) => {
+             UNKNOWN_LANGUAGE_ERROR.set(Some(class));
+             class
+         }
+         // The error class has not been defined (yet): keep the historical fallback.
+         Err(_) => ruby.exception_arg_error(),
+     }
+ }
5
21
 
6
22
  pub fn parse_language(input: &str) -> Option<Language> {
7
23
  Language::from_str(input)
@@ -10,6 +10,7 @@ pub fn define(ruby: &Ruby, module: &RModule) -> Result<(), Error> {
10
10
  class.define_method("to_s", method!(WrappedLanguage::name, 0))?;
11
11
  class.define_method("to_iso6391", method!(WrappedLanguage::to_iso6391, 0))?;
12
12
  class.define_method("to_iso", method!(WrappedLanguage::to_iso6391, 0))?;
13
+ class.define_method("iso_code", method!(WrappedLanguage::to_iso6391, 0))?;
13
14
  class.define_method("to_iso6393", method!(WrappedLanguage::to_iso6393, 0))?;
14
15
  class.define_method("to_sym", method!(WrappedLanguage::to_sym, 0))?;
15
16
  class.define_method("inspect", method!(WrappedLanguage::inspect, 0))?;
@@ -45,6 +45,7 @@ fn detect_multiple(ruby: &Ruby, arguments: RArray) -> Result<RArray, Error> {
45
45
  fn init(ruby: &Ruby) -> Result<(), Error> {
46
46
  let module = ruby.define_module("Lingua")?;
47
47
 
48
+ helpers::define_errors(&module)?;
48
49
  language::define(ruby, &module)?;
49
50
  confidence_result::define(ruby, &module)?;
50
51
  segment::define(ruby, &module)?;
@@ -0,0 +1,21 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Lingua
4
+ # Represents a language returned by Lingua detection and lookup APIs.
5
+ class Language
6
+ private
7
+
8
+ def respond_to_missing?(method_name, include_private = false)
9
+ method_name.end_with?('?') || super
10
+ end
11
+
12
+ def method_missing(method_name, *args)
13
+ if method_name.end_with?('?') && args.empty?
14
+ match = Lingua::Language[method_name.name.delete_suffix('?')]
15
+ !match.nil? && self == match
16
+ else
17
+ super
18
+ end
19
+ end
20
+ end
21
+ end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Lingua
4
- VERSION = '0.3.0'
4
+ VERSION = '0.4.1'
5
5
  end
data/lib/lingua.rb CHANGED
@@ -2,6 +2,7 @@
2
2
 
3
3
  require_relative 'lingua/version'
4
4
  require_relative 'lingua/lingua'
5
+ require_relative 'lingua/language'
5
6
 
6
7
  module Lingua
7
8
  class Error < StandardError; end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: lingua_rs
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0
4
+ version: 0.4.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Sébastien Vrillaud
@@ -43,11 +43,9 @@ email:
43
43
  - kochka@gmail.com
44
44
  executables: []
45
45
  extensions:
46
- - ext/lingua/Cargo.toml
46
+ - ext/lingua/extconf.rb
47
47
  extra_rdoc_files: []
48
48
  files:
49
- - Cargo.lock
50
- - Cargo.toml
51
49
  - LICENSE.txt
52
50
  - README.md
53
51
  - Rakefile
@@ -61,6 +59,7 @@ files:
61
59
  - ext/lingua/src/lib.rs
62
60
  - ext/lingua/src/segment.rs
63
61
  - lib/lingua.rb
62
+ - lib/lingua/language.rb
64
63
  - lib/lingua/version.rb
65
64
  - sig/lingua.rbs
66
65
  homepage: https://github.com/kochka/lingua_rs