lingua_rs 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 95237de3af0469a6394db02288b9230472c763dc35cfc08768fc7a0233de6d18
4
- data.tar.gz: fe793fbe6f8fa05d408b43579738b1e386db9fc9f93a590700f4726f617f7fa2
3
+ metadata.gz: c52e545860174c156f8e8276d25762293daa3235766fc88777e5d2b5d53f50f5
4
+ data.tar.gz: 90bb85fd278795e300b768e3e5079d465f9c5b85a7c3a62f61c4e45745b5ba96
5
5
  SHA512:
6
- metadata.gz: 9a0fe59c439a1b27c7f0f95b9a3eba1a9c4dabe441ab5a07c0864727d9579b307e0e0c345ff0285b9c11854c7188c3c0726480103ff0390006ef799c3ee5c1bd
7
- data.tar.gz: 82c83d17a21affafcc79cf0999658f33c225b5a7f9d532372f00dde0834b5b663cb0a572630042d5d3e725ae05b25189770993a66eebcf257c2b4eec6b5ca5e2
6
+ metadata.gz: 82cc16c4f8b55d0345f3312675cbf6602365ea514b0f085f12cb9d4c8266601fcdf06d3195a06d74ef64a965ada7d9d79215ee44e70f6ac56e70c4f068cefdcd
7
+ data.tar.gz: eee0c6da2add0958dd836223f2a5d958e393a83f3b84b240598cf600ff0dbf73c3686e556486ad7b42e44f9781ad4905bcece9a59736e988e9532ec2d4174d4c
data/README.md CHANGED
@@ -1,5 +1,10 @@
1
1
  # LinguaRs
2
2
 
3
+ [![CI](https://github.com/kochka/lingua_rs/actions/workflows/ci.yml/badge.svg)](https://github.com/kochka/lingua_rs/actions/workflows/ci.yml)
4
+ [![Gem Version](https://badge.fury.io/rb/lingua_rs.svg)](https://badge.fury.io/rb/lingua_rs)
5
+ [![Downloads](https://img.shields.io/gem/dt/lingua_rs)](https://rubygems.org/gems/lingua_rs)
6
+ [![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](https://opensource.org/licenses/MIT)
7
+
3
8
  A Ruby gem wrapping the [Lingua](https://github.com/pemistahl/lingua-rs) Rust library for language detection.
4
9
 
5
10
  ## Installation
@@ -104,7 +109,7 @@ detector.detect('Hello world') # => #<Lingua::Language English>
104
109
 
105
110
  ```ruby
106
111
  detector.confidence('Bonjour le monde', :fr) # => 0.8217
107
- Lingua.confidence('Bonjour le monde', 'fr') # => 0.3794
112
+ Lingua.confidence('Bonjour le monde', 'fr') # => 0.8217
108
113
 
109
114
  results = detector.confidence_values('Bonjour le monde')
110
115
  results.first.language # => #<Lingua::Language French>
@@ -113,6 +118,26 @@ results.first.to_s # => "French (0.82)"
113
118
  results.sum(&:confidence) # => 1.0
114
119
  ```
115
120
 
121
+ ### Mixed-language detection
122
+
123
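+ `detect_multiple` identifies multiple languages within a single text and returns an array of `Lingua::Segment` objects. `start_index` and `end_index` are character (not byte) offsets into the text, with `end_index` exclusive. The method is available both as an instance method on `Lingua::Detector` and as a module method on `Lingua`.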
124
+
125
+ ```ruby
126
+ text = "Parlez-vous français? Ich spreche Französisch nur ein bisschen. A little bit is better than nothing."
127
+
128
+ segments = Lingua.detect_multiple(text, languages: %w[en fr de])
129
+ segments.each do |s|
130
+ puts "#{s.language} (#{s.start_index}..#{s.end_index}): #{s.text}"
131
+ end
132
+ # French (0..22): Parlez-vous français?
133
+ # German (22..64): Ich spreche Französisch nur ein bisschen.
134
+ # English (64..100): A little bit is better than nothing.
135
+
136
+ # With a persistent detector
137
+ detector = Lingua::Detector.new(languages: %w[en fr de])
138
+ detector.detect_multiple(text)
139
+ ```
140
+
116
141
  ### `Lingua::Language` methods
117
142
 
118
143
  `Lingua::Language` objects support equality (`==`) and can be used as Hash keys.
@@ -139,10 +164,28 @@ Returned by `confidence_values`.
139
164
  | `to_s` | `String` | `'French (0.82)'` |
140
165
  | `inspect` | `String` | `'#<Lingua::ConfidenceResult French (0.8217)>'` |
141
166
 
167
+ ### `Lingua::Segment` methods
168
+
169
+ Returned by `detect_multiple`.
170
+
171
+ | Method | Return type | Example |
172
+ |---|---|---|
173
+ | `language` | `Lingua::Language` | `#<Lingua::Language French>` |
174
+ | `start_index` | `Integer` | `0` |
175
+ | `end_index` | `Integer` | `22` |
176
+ | `word_count` | `Integer` | `3` |
177
+ | `text` | `String` | `'Parlez-vous français? '` |
178
+ | `to_s` | `String` | `'French (0-22): Parlez-vous français? '` |
179
+ | `inspect` | `String` | `'#<Lingua::Segment French (0-22) "Parlez-vous français? ">'` |
180
+
142
181
  ## Development
143
182
 
144
183
  After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake compile` to build the native extension and `rake test` to run the tests.
145
184
 
185
+ ## Acknowledgements
186
+
187
+ This gem is built on top of [Lingua](https://github.com/pemistahl/lingua-rs), a highly accurate natural language detection library written in Rust by [Peter M. Stahl](https://github.com/pemistahl).
188
+
146
189
  ## License
147
190
 
148
191
  The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
@@ -2,6 +2,7 @@ use lingua::{LanguageDetector, LanguageDetectorBuilder};
2
2
  use magnus::{Error, RArray, RHash, Ruby};
3
3
 
4
4
  use crate::confidence_result::ConfidenceResult;
5
+ use crate::segment::Segment;
5
6
  use crate::helpers::{fetch_option, parse_language, value_to_string};
6
7
  use crate::language::WrappedLanguage;
7
8
 
@@ -66,6 +67,32 @@ impl RubyDetector {
66
67
  pub fn confidence_values(&self, subject: String) -> Result<RArray, Error> {
67
68
  compute_confidence_values(&self.detector, subject)
68
69
  }
70
+
71
+ pub fn detect_multiple(&self, subject: String) -> Result<RArray, Error> {
72
+ compute_detect_multiple(&self.detector, &subject)
73
+ }
74
+ }
75
+
76
+ pub fn compute_detect_multiple(
77
+ detector: &LanguageDetector,
78
+ subject: &str,
79
+ ) -> Result<RArray, Error> {
80
+ let ruby = Ruby::get().unwrap();
81
+ let results = detector.detect_multiple_languages_of(subject);
82
+ let array = ruby.ary_new_capa(results.len());
83
+ for r in results {
84
+ let text = subject[r.start_index()..r.end_index()].to_string();
85
+ let start_index = subject[..r.start_index()].chars().count();
86
+ let end_index = start_index + text.chars().count();
87
+ array.push(Segment {
88
+ language: r.language(),
89
+ start_index,
90
+ end_index,
91
+ word_count: r.word_count(),
92
+ text,
93
+ })?;
94
+ }
95
+ Ok(array)
69
96
  }
70
97
 
71
98
  pub fn build_detector_from_options(
@@ -1,4 +1,5 @@
1
1
  mod confidence_result;
2
+ mod segment;
2
3
  mod detector;
3
4
  mod helpers;
4
5
  mod language;
@@ -6,7 +7,8 @@ mod language;
6
7
  use magnus::{Error, RArray, RHash, Ruby, function, method, prelude::*};
7
8
 
8
9
  use confidence_result::ConfidenceResult;
9
- use detector::{RubyDetector, build_detector_from_options, compute_confidence, compute_confidence_values};
10
+ use segment::Segment;
11
+ use detector::{RubyDetector, build_detector_from_options, compute_confidence, compute_confidence_values, compute_detect_multiple};
10
12
  use language::WrappedLanguage;
11
13
 
12
14
  fn detect(ruby: &Ruby, arguments: RArray) -> Result<Option<WrappedLanguage>, Error> {
@@ -32,6 +34,15 @@ fn confidence_values(ruby: &Ruby, arguments: RArray) -> Result<RArray, Error> {
32
34
  compute_confidence_values(&detector, subject)
33
35
  }
34
36
 
37
+ fn detect_multiple(ruby: &Ruby, arguments: RArray) -> Result<RArray, Error> {
38
+ let subject = arguments
39
+ .shift::<String>()
40
+ .map_err(|_| Error::new(ruby.exception_arg_error(), "expected a string as first argument"))?;
41
+ let options = arguments.shift::<RHash>().ok();
42
+ let detector = build_detector_from_options(ruby, options.as_ref())?;
43
+ compute_detect_multiple(&detector, &subject)
44
+ }
45
+
35
46
  #[magnus::init]
36
47
  fn init(ruby: &Ruby) -> Result<(), Error> {
37
48
  let module = ruby.define_module("Lingua")?;
@@ -57,17 +68,30 @@ fn init(ruby: &Ruby) -> Result<(), Error> {
57
68
  confidence_class.define_method("to_s", method!(ConfidenceResult::to_s, 0))?;
58
69
  confidence_class.define_method("inspect", method!(ConfidenceResult::inspect, 0))?;
59
70
 
71
+ // Lingua::Segment
72
+ let segment_class = module.define_class("Segment", ruby.class_object())?;
73
+ segment_class.undef_default_alloc_func();
74
+ segment_class.define_method("language", method!(Segment::language, 0))?;
75
+ segment_class.define_method("start_index", method!(Segment::start_index, 0))?;
76
+ segment_class.define_method("end_index", method!(Segment::end_index, 0))?;
77
+ segment_class.define_method("word_count", method!(Segment::word_count, 0))?;
78
+ segment_class.define_method("text", method!(Segment::text, 0))?;
79
+ segment_class.define_method("to_s", method!(Segment::to_s, 0))?;
80
+ segment_class.define_method("inspect", method!(Segment::inspect, 0))?;
81
+
60
82
  // Lingua::Detector
61
83
  let detector_class = module.define_class("Detector", ruby.class_object())?;
62
84
  detector_class.define_singleton_method("new", function!(RubyDetector::new, -1))?;
63
85
  detector_class.define_method("detect", method!(RubyDetector::detect, 1))?;
64
86
  detector_class.define_method("confidence", method!(RubyDetector::confidence, 2))?;
65
87
  detector_class.define_method("confidence_values", method!(RubyDetector::confidence_values, 1))?;
88
+ detector_class.define_method("detect_multiple", method!(RubyDetector::detect_multiple, 1))?;
66
89
 
67
90
  // Functional API (module methods)
68
91
  module.define_singleton_method("detect", function!(detect, -2))?;
69
92
  module.define_singleton_method("confidence", function!(confidence, 2))?;
70
93
  module.define_singleton_method("confidence_values", function!(confidence_values, -2))?;
94
+ module.define_singleton_method("detect_multiple", function!(detect_multiple, -2))?;
71
95
 
72
96
  Ok(())
73
97
  }
@@ -0,0 +1,45 @@
1
+ use lingua::Language;
2
+
3
+ use crate::language::WrappedLanguage;
4
+
5
+ #[magnus::wrap(class = "Lingua::Segment")]
6
+ pub struct Segment {
7
+ pub language: Language,
8
+ pub start_index: usize,
9
+ pub end_index: usize,
10
+ pub word_count: usize,
11
+ pub text: String,
12
+ }
13
+
14
+ impl Segment {
15
+ pub fn language(&self) -> WrappedLanguage {
16
+ WrappedLanguage(self.language)
17
+ }
18
+
19
+ pub fn start_index(&self) -> usize {
20
+ self.start_index
21
+ }
22
+
23
+ pub fn end_index(&self) -> usize {
24
+ self.end_index
25
+ }
26
+
27
+ pub fn word_count(&self) -> usize {
28
+ self.word_count
29
+ }
30
+
31
+ pub fn text(&self) -> String {
32
+ self.text.clone()
33
+ }
34
+
35
+ pub fn to_s(&self) -> String {
36
+ format!("{} ({}-{}): {}", self.language, self.start_index, self.end_index, self.text)
37
+ }
38
+
39
+ pub fn inspect(&self) -> String {
40
+ format!(
41
+ "#<Lingua::Segment {} ({}-{}) \"{}\">",
42
+ self.language, self.start_index, self.end_index, self.text
43
+ )
44
+ }
45
+ }
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Lingua
4
- VERSION = '0.1.0'
4
+ VERSION = '0.2.0'
5
5
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: lingua_rs
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Sébastien Vrillaud
@@ -59,6 +59,7 @@ files:
59
59
  - ext/lingua/src/helpers.rs
60
60
  - ext/lingua/src/language.rs
61
61
  - ext/lingua/src/lib.rs
62
+ - ext/lingua/src/segment.rs
62
63
  - lib/lingua.rb
63
64
  - lib/lingua/version.rb
64
65
  - sig/lingua.rbs
@@ -66,9 +67,10 @@ homepage: https://github.com/kochka/lingua_rs
66
67
  licenses:
67
68
  - MIT
68
69
  metadata:
69
- homepage_uri: https://github.com/kochka/lingua_rs
70
70
  source_code_uri: https://github.com/kochka/lingua_rs
71
71
  changelog_uri: https://github.com/kochka/lingua_rs/blob/main/CHANGELOG.md
72
+ documentation_uri: https://github.com/kochka/lingua_rs#readme
73
+ bug_tracker_uri: https://github.com/kochka/lingua_rs/issues
72
74
  rdoc_options: []
73
75
  require_paths:
74
76
  - lib