tantiny 0.3.3 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/src/query.rs CHANGED
@@ -1,260 +1,303 @@
1
- use std::str::FromStr;
1
+ use levenshtein_automata::{Distance, LevenshteinAutomatonBuilder};
2
+ use magnus::{Error, Module, Object, RArray, RModule, Ruby, TryConvert, Value};
2
3
  use std::ops::Bound::Included;
3
- use rutie::{methods, Object, AnyObject, Integer, Float, Array, RString};
4
- use tantivy::{Term, DateTime};
5
- use tantivy::schema::{IndexRecordOption, Facet, Type, FieldType};
6
4
  use tantivy::query::*;
5
+ use tantivy::schema::{Facet, FieldType, IndexRecordOption};
6
+ use tantivy::Term;
7
+ use time::OffsetDateTime;
7
8
 
8
- use crate::helpers::{try_unwrap_params, scaffold, TryUnwrap};
9
- use crate::index::{unwrap_index, RTantinyIndex};
9
+ use crate::index::Index;
10
10
 
11
- pub struct TantinyQuery(pub(crate) Box<dyn Query>);
11
+ #[magnus::wrap(class = "Tantiny::Query", free_immediately, size)]
12
+ pub struct Query(Box<dyn tantivy::query::Query>);
12
13
 
13
- scaffold!(RTantinyQuery, TantinyQuery, "Query");
14
-
15
- fn wrap_query(query: Box<dyn Query>) -> RTantinyQuery {
16
- klass().wrap_data(
17
- TantinyQuery(query),
18
- &*TANTINY_QUERY_WRAPPER
19
- )
20
- }
21
-
22
- pub(crate) fn unwrap_query(query: &RTantinyQuery) -> &Box<dyn Query> {
23
- &query.get_data(&*TANTINY_QUERY_WRAPPER).0
24
- }
25
-
26
- #[rustfmt::skip::macros(methods)]
27
- methods!(
28
- RTantinyQuery,
29
- _itself,
14
+ impl Query {
15
+ pub fn get_query(&self) -> &dyn tantivy::query::Query {
16
+ self.0.as_ref()
17
+ }
30
18
 
31
- fn new_all_query() -> RTantinyQuery {
32
- wrap_query(Box::new(AllQuery))
19
+ fn new_all() -> Self {
20
+ Query(Box::new(AllQuery))
33
21
  }
34
22
 
35
- fn new_empty_query() -> RTantinyQuery {
36
- wrap_query(Box::new(EmptyQuery))
23
+ fn new_empty() -> Self {
24
+ Query(Box::new(EmptyQuery))
37
25
  }
38
26
 
39
- fn new_term_query(
40
- index: RTantinyIndex,
41
- field: RString,
42
- term: RString
43
- ) -> RTantinyQuery {
44
- try_unwrap_params!(
45
- index,
46
- field: String,
47
- term: String
48
- );
49
-
50
- let schema = &unwrap_index(&index).schema;
51
- let field = schema.get_field(&field).try_unwrap();
27
+ fn new_term(index: &Index, field: String, term: String) -> Result<Self, Error> {
28
+ let ruby = unsafe { Ruby::get_unchecked() };
29
+ let field = index.schema.get_field(&field).map_err(|e| {
30
+ Error::new(
31
+ ruby.exception_runtime_error(),
32
+ format!("Field not found: {}", e),
33
+ )
34
+ })?;
52
35
  let term = Term::from_field_text(field, &term);
53
36
  let query = TermQuery::new(term, IndexRecordOption::Basic);
54
-
55
- wrap_query(Box::new(query))
37
+ Ok(Query(Box::new(query)))
56
38
  }
57
39
 
58
- fn new_fuzzy_term_query(
59
- index: RTantinyIndex,
60
- field: RString,
61
- term: RString,
62
- distance: Integer
63
- ) -> RTantinyQuery {
64
- try_unwrap_params!(
65
- index,
66
- field: String,
67
- term: String,
68
- distance: i64
69
- );
70
-
71
- let schema = &unwrap_index(&index).schema;
72
- let field = schema.get_field(&field).try_unwrap();
40
+ fn new_fuzzy_term(
41
+ index: &Index,
42
+ field: String,
43
+ term: String,
44
+ distance: i64,
45
+ ) -> Result<Self, Error> {
46
+ let ruby = unsafe { Ruby::get_unchecked() };
47
+ let field = index.schema.get_field(&field).map_err(|e| {
48
+ Error::new(
49
+ ruby.exception_runtime_error(),
50
+ format!("Field not found: {}", e),
51
+ )
52
+ })?;
73
53
  let term = Term::from_field_text(field, &term);
74
54
  let query = FuzzyTermQuery::new(term, distance as u8, true);
75
-
76
- wrap_query(Box::new(query))
55
+ Ok(Query(Box::new(query)))
77
56
  }
78
57
 
79
- fn new_phrase_query(
80
- index: RTantinyIndex,
81
- field: RString,
82
- terms: Array
83
- ) -> RTantinyQuery {
84
- try_unwrap_params!(
85
- index,
86
- field: String,
87
- terms: Vec<String>
88
- );
89
-
90
- let schema = &unwrap_index(&index).schema;
91
- let field = schema.get_field(&field).try_unwrap();
92
-
93
- let terms: Vec<Term> = terms.into_iter().map(|term| {
94
- Term::from_field_text(field, &term)
95
- }).collect();
58
+ fn new_phrase(index: &Index, field: String, terms: Vec<String>) -> Result<Self, Error> {
59
+ let ruby = unsafe { Ruby::get_unchecked() };
60
+ let field = index.schema.get_field(&field).map_err(|e| {
61
+ Error::new(
62
+ ruby.exception_runtime_error(),
63
+ format!("Field not found: {}", e),
64
+ )
65
+ })?;
66
+
67
+ let terms: Vec<Term> = terms
68
+ .into_iter()
69
+ .map(|term| Term::from_field_text(field, &term))
70
+ .collect();
96
71
  let query = PhraseQuery::new(terms);
97
-
98
- wrap_query(Box::new(query))
72
+ Ok(Query(Box::new(query)))
99
73
  }
100
74
 
101
- fn new_regex_query(
102
- index: RTantinyIndex,
103
- field: RString,
104
- regex: RString
105
- ) -> RTantinyQuery {
106
- try_unwrap_params!(
107
- index,
108
- field: String,
109
- regex: String
110
- );
111
-
112
- let schema = &unwrap_index(&index).schema;
113
- let field = schema.get_field(&field).try_unwrap();
114
- let query = RegexQuery::from_pattern(&regex, field).try_unwrap();
115
-
116
- wrap_query(Box::new(query))
75
+ fn new_regex(index: &Index, field: String, regex: String) -> Result<Self, Error> {
76
+ let ruby = unsafe { Ruby::get_unchecked() };
77
+ let field = index.schema.get_field(&field).map_err(|e| {
78
+ Error::new(
79
+ ruby.exception_runtime_error(),
80
+ format!("Field not found: {}", e),
81
+ )
82
+ })?;
83
+ let query = RegexQuery::from_pattern(&regex, field).map_err(|e| {
84
+ Error::new(
85
+ ruby.exception_runtime_error(),
86
+ format!("Invalid regex: {}", e),
87
+ )
88
+ })?;
89
+ Ok(Query(Box::new(query)))
117
90
  }
118
91
 
119
- fn new_range_query(
120
- index: RTantinyIndex,
121
- field: RString,
122
- from: AnyObject,
123
- to: AnyObject
124
- ) -> RTantinyQuery {
125
- try_unwrap_params!(index, from, to, field: String);
126
-
127
- let schema = &unwrap_index(&index).schema;
128
- let field = schema.get_field(&field).try_unwrap();
129
- let field_name = schema.get_field_name(field);
130
- let field_type = schema.get_field_entry(field).field_type();
131
-
132
- let range = match field_type {
92
+ fn new_range(index: &Index, field: String, from: Value, to: Value) -> Result<Self, Error> {
93
+ let ruby = unsafe { Ruby::get_unchecked() };
94
+ let field_obj = index.schema.get_field(&field).map_err(|e| {
95
+ Error::new(
96
+ ruby.exception_runtime_error(),
97
+ format!("Field not found: {}", e),
98
+ )
99
+ })?;
100
+ let field_name = index.schema.get_field_name(field_obj);
101
+ let field_type = index.schema.get_field_entry(field_obj).field_type();
102
+
103
+ let (left, right) = match field_type {
133
104
  FieldType::Date(_) => {
134
- let from: String = from.try_unwrap();
135
- let to: String = to.try_unwrap();
136
- let from = DateTime::from_str(&from).try_unwrap();
137
- let to = DateTime::from_str(&to).try_unwrap();
138
-
139
- Ok((
140
- Type::Date,
141
- Included(Term::from_field_date(field, &from)),
142
- Included(Term::from_field_date(field, &to))
143
- ))
144
- },
105
+ let from_str: String = String::try_convert(from)?;
106
+ let to_str: String = String::try_convert(to)?;
107
+ let from_datetime = OffsetDateTime::parse(
108
+ &from_str,
109
+ &time::format_description::well_known::Rfc3339,
110
+ )
111
+ .map_err(|e| {
112
+ Error::new(
113
+ ruby.exception_runtime_error(),
114
+ format!("Invalid date format: {}", e),
115
+ )
116
+ })?;
117
+ let to_datetime =
118
+ OffsetDateTime::parse(&to_str, &time::format_description::well_known::Rfc3339)
119
+ .map_err(|e| {
120
+ Error::new(
121
+ ruby.exception_runtime_error(),
122
+ format!("Invalid date format: {}", e),
123
+ )
124
+ })?;
125
+ let from_dt = tantivy::DateTime::from_timestamp_nanos(
126
+ from_datetime.unix_timestamp_nanos() as i64,
127
+ );
128
+ let to_dt = tantivy::DateTime::from_timestamp_nanos(
129
+ to_datetime.unix_timestamp_nanos() as i64,
130
+ );
131
+
132
+ (
133
+ Term::from_field_date(field_obj, from_dt),
134
+ Term::from_field_date(field_obj, to_dt),
135
+ )
136
+ }
145
137
  FieldType::I64(_) => {
146
- let from: i64 = from.try_unwrap();
147
- let to: i64 = to.try_unwrap();
148
-
149
- Ok((
150
- Type::I64,
151
- Included(Term::from_field_i64(field, from)),
152
- Included(Term::from_field_i64(field, to))
153
- ))
154
- },
138
+ let from_val: i64 = i64::try_convert(from)?;
139
+ let to_val: i64 = i64::try_convert(to)?;
140
+ (
141
+ Term::from_field_i64(field_obj, from_val),
142
+ Term::from_field_i64(field_obj, to_val),
143
+ )
144
+ }
155
145
  FieldType::F64(_) => {
156
- let from: f64 = from.try_unwrap();
157
- let to: f64 = to.try_unwrap();
158
-
159
- Ok((
160
- Type::F64,
161
- Included(Term::from_field_f64(field, from)),
162
- Included(Term::from_field_f64(field, to))
146
+ let from_val: f64 = f64::try_convert(from)?;
147
+ let to_val: f64 = f64::try_convert(to)?;
148
+ (
149
+ Term::from_field_f64(field_obj, from_val),
150
+ Term::from_field_f64(field_obj, to_val),
151
+ )
152
+ }
153
+ _ => {
154
+ return Err(Error::new(
155
+ ruby.exception_runtime_error(),
156
+ format!("Field '{}' is not supported by range query.", field_name),
163
157
  ))
164
- },
165
- _ => { Err(format!("Field '{}' is not supported by range query.", field_name)) }
158
+ }
166
159
  };
167
160
 
168
- let (value_type, left, right) = range.try_unwrap();
169
-
170
- let query = RangeQuery::new_term_bounds(field, value_type, &left, &right);
171
-
172
- wrap_query(Box::new(query))
161
+ let query = RangeQuery::new(Included(left), Included(right));
162
+ Ok(Query(Box::new(query)))
173
163
  }
174
164
 
175
- fn new_facet_query(
176
- index: RTantinyIndex,
177
- field: RString,
178
- path: RString
179
- ) -> RTantinyQuery {
180
- try_unwrap_params!(
181
- index,
182
- field: String,
183
- path: String
184
- );
185
-
186
- let schema = &unwrap_index(&index).schema;
187
- let field = schema.get_field(&field).try_unwrap();
165
+ fn new_facet(index: &Index, field: String, path: String) -> Result<Self, Error> {
166
+ let ruby = unsafe { Ruby::get_unchecked() };
167
+ let field = index.schema.get_field(&field).map_err(|e| {
168
+ Error::new(
169
+ ruby.exception_runtime_error(),
170
+ format!("Field not found: {}", e),
171
+ )
172
+ })?;
188
173
  let facet = Facet::from(&path);
189
174
  let term = Term::from_facet(field, &facet);
190
175
  let query = TermQuery::new(term, IndexRecordOption::Basic);
191
-
192
- wrap_query(Box::new(query))
176
+ Ok(Query(Box::new(query)))
193
177
  }
194
178
 
195
- fn disjunction(queries: Array) -> RTantinyQuery {
196
- try_unwrap_params!(queries);
197
-
179
+ fn disjunction(queries: RArray) -> Result<Self, Error> {
198
180
  let mut query_vec = Vec::new();
199
181
 
200
- for query in queries {
201
- let query: RTantinyQuery = query.try_unwrap();
202
- query_vec.push((Occur::Should, unwrap_query(&query).box_clone()));
182
+ for item in queries.into_iter() {
183
+ let query: &Query = <&Query>::try_convert(item)?;
184
+ query_vec.push((Occur::Should, query.0.box_clone()));
203
185
  }
204
186
 
205
- let disjunction_query = BooleanQuery::from(query_vec);
206
-
207
- wrap_query(Box::new(disjunction_query))
187
+ Ok(Query(Box::new(BooleanQuery::from(query_vec))))
208
188
  }
209
189
 
210
- fn conjunction(queries: Array) -> RTantinyQuery {
211
- try_unwrap_params!(queries);
212
-
190
+ fn conjunction(queries: RArray) -> Result<Self, Error> {
213
191
  let mut query_vec = Vec::new();
214
192
 
215
- for query in queries {
216
- let query: RTantinyQuery = query.try_unwrap();
217
- query_vec.push((Occur::Must, unwrap_query(&query).box_clone()));
193
+ for item in queries.into_iter() {
194
+ let query: &Query = <&Query>::try_convert(item)?;
195
+ query_vec.push((Occur::Must, query.0.box_clone()));
218
196
  }
219
197
 
220
- let conjunction_query = BooleanQuery::from(query_vec);
221
-
222
- wrap_query(Box::new(conjunction_query))
198
+ Ok(Query(Box::new(BooleanQuery::from(query_vec))))
223
199
  }
224
200
 
225
- fn negation() -> RTantinyQuery {
226
- // See: https://github.com/quickwit-oss/tantivy/issues/1153
227
- let all_query: Box<dyn Query> = Box::new(AllQuery);
201
+ fn negation(&self) -> Self {
202
+ let all_query: Box<dyn tantivy::query::Query> = Box::new(AllQuery);
228
203
  let negation_query = BooleanQuery::from(vec![
229
204
  (Occur::Must, all_query.box_clone()),
230
- (Occur::MustNot, unwrap_query(&_itself).box_clone()),
205
+ (Occur::MustNot, self.0.box_clone()),
231
206
  ]);
232
207
 
233
- wrap_query(Box::new(negation_query))
208
+ Query(Box::new(negation_query))
209
+ }
210
+
211
+ fn boost(&self, score: f64) -> Self {
212
+ let query = BoostQuery::new(self.0.box_clone(), score as f32);
213
+ Query(Box::new(query))
234
214
  }
235
215
 
236
- fn boost(score: Float) -> RTantinyQuery {
237
- try_unwrap_params!(score: f64);
216
+ fn highlight(text: String, terms: Vec<String>, fuzzy_distance: i64) -> Result<String, Error> {
217
+ use tantivy::tokenizer::{LowerCaser, SimpleTokenizer, TextAnalyzer, TokenStream};
218
+
219
+ // Create a simple tokenizer for highlighting
220
+ let mut analyzer = TextAnalyzer::builder(SimpleTokenizer::default())
221
+ .filter(LowerCaser)
222
+ .build();
238
223
 
239
- let query = BoostQuery::new(unwrap_query(&_itself).box_clone(), score as f32);
224
+ // Tokenize the input text
225
+ let mut token_stream = analyzer.token_stream(&text);
240
226
 
241
- wrap_query(Box::new(query))
227
+ // Collect all tokens with their positions
228
+ let mut tokens = Vec::new();
229
+ while token_stream.advance() {
230
+ let token = token_stream.token();
231
+ tokens.push((token.text.clone(), token.offset_from, token.offset_to));
232
+ }
233
+
234
+ // Build Levenshtein automata for each term (same algorithm as Tantivy's FuzzyTermQuery)
235
+ let lev_builder = LevenshteinAutomatonBuilder::new(fuzzy_distance as u8, true);
236
+ let automata: Vec<_> = terms
237
+ .iter()
238
+ .map(|term| lev_builder.build_dfa(term))
239
+ .collect();
240
+
241
+ // Build the highlighted text
242
+ let mut result = String::new();
243
+ let mut last_pos = 0;
244
+
245
+ for (token_text, start, end) in tokens {
246
+ // Check if this token matches any of the query terms (exact or fuzzy)
247
+ let should_highlight = terms.iter().zip(&automata).any(|(term, dfa)| {
248
+ // Exact match
249
+ if token_text.eq_ignore_ascii_case(term) {
250
+ return true;
251
+ }
252
+
253
+ // Fuzzy match using Levenshtein automaton (same as Tantivy's FuzzyTermQuery)
254
+ matches!(dfa.eval(&token_text), Distance::Exact(_))
255
+ });
256
+
257
+ // Add the text before the token
258
+ result.push_str(&text[last_pos..start]);
259
+
260
+ // Add the token, highlighted if it matches
261
+ if should_highlight {
262
+ result.push_str("<b>");
263
+ result.push_str(&text[start..end]);
264
+ result.push_str("</b>");
265
+ } else {
266
+ result.push_str(&text[start..end]);
267
+ }
268
+
269
+ last_pos = end;
270
+ }
271
+
272
+ // Add any remaining text after the last token
273
+ result.push_str(&text[last_pos..]);
274
+
275
+ Ok(result)
242
276
  }
243
- );
244
-
245
- pub(super) fn init() {
246
- klass().define(|klass| {
247
- klass.def_self("__new_all_query", new_all_query);
248
- klass.def_self("__new_empty_query", new_empty_query);
249
- klass.def_self("__new_term_query", new_term_query);
250
- klass.def_self("__new_fuzzy_term_query", new_fuzzy_term_query);
251
- klass.def_self("__new_regex_query", new_regex_query);
252
- klass.def_self("__new_range_query", new_range_query);
253
- klass.def_self("__new_phrase_query", new_phrase_query);
254
- klass.def_self("__new_facet_query", new_facet_query);
255
- klass.def_self("__disjunction", disjunction);
256
- klass.def_self("__conjunction", conjunction);
257
- klass.def("__negation", negation);
258
- klass.def("__boost", boost);
259
- });
260
- }
277
+ }
278
+
279
+ pub fn init(ruby: &Ruby, module: RModule) -> Result<(), Error> {
280
+ let class = module.define_class("Query", ruby.class_object())?;
281
+
282
+ class.define_singleton_method("__new_all_query", magnus::function!(Query::new_all, 0))?;
283
+ class.define_singleton_method("__new_empty_query", magnus::function!(Query::new_empty, 0))?;
284
+ class.define_singleton_method("__new_term_query", magnus::function!(Query::new_term, 3))?;
285
+ class.define_singleton_method(
286
+ "__new_fuzzy_term_query",
287
+ magnus::function!(Query::new_fuzzy_term, 4),
288
+ )?;
289
+ class.define_singleton_method(
290
+ "__new_phrase_query",
291
+ magnus::function!(Query::new_phrase, 3),
292
+ )?;
293
+ class.define_singleton_method("__new_regex_query", magnus::function!(Query::new_regex, 3))?;
294
+ class.define_singleton_method("__new_range_query", magnus::function!(Query::new_range, 4))?;
295
+ class.define_singleton_method("__new_facet_query", magnus::function!(Query::new_facet, 3))?;
296
+ class.define_singleton_method("__disjunction", magnus::function!(Query::disjunction, 1))?;
297
+ class.define_singleton_method("__conjunction", magnus::function!(Query::conjunction, 1))?;
298
+ class.define_method("__negation", magnus::method!(Query::negation, 0))?;
299
+ class.define_method("__boost", magnus::method!(Query::boost, 1))?;
300
+ class.define_singleton_method("__highlight", magnus::function!(Query::highlight, 3))?;
301
+
302
+ Ok(())
303
+ }
data/src/tokenizer.rs CHANGED
@@ -1,94 +1,81 @@
1
+ use magnus::{Error, Module, Object, RModule, Ruby};
2
+ use tantivy::tokenizer::{
3
+ LowerCaser, NgramTokenizer, RemoveLongFilter, SimpleTokenizer, Stemmer, TextAnalyzer,
4
+ };
1
5
 
2
- use rutie::{methods, Object, Array, RString, Integer, Boolean};
3
- use tantivy::tokenizer::{TextAnalyzer, SimpleTokenizer, RemoveLongFilter, LowerCaser, Stemmer, NgramTokenizer};
6
+ use crate::helpers::LanguageWrapper;
4
7
 
5
- use crate::helpers::{try_unwrap_params, scaffold, TryUnwrap, LanguageWrapper};
8
+ #[magnus::wrap(class = "Tantiny::Tokenizer", free_immediately, size)]
9
+ pub struct Tokenizer(TextAnalyzer);
6
10
 
7
- pub struct TantinyTokenizer(pub(crate) TextAnalyzer);
8
-
9
- scaffold!(RTantinyTokenizer, TantinyTokenizer, "Tokenizer");
10
-
11
- fn wrap_tokenizer(tokenizer: TextAnalyzer) -> RTantinyTokenizer {
12
- klass().wrap_data(
13
- TantinyTokenizer(tokenizer),
14
- &*TANTINY_TOKENIZER_WRAPPER
15
- )
16
- }
17
-
18
- pub(crate) fn unwrap_tokenizer(tokenizer: &RTantinyTokenizer) -> &TextAnalyzer {
19
- &tokenizer.get_data(&*TANTINY_TOKENIZER_WRAPPER).0
20
- }
21
-
22
- #[rustfmt::skip::macros(methods)]
23
- methods!(
24
- RTantinyTokenizer,
25
- _itself,
11
+ impl Tokenizer {
12
+ pub fn get_analyzer(&self) -> TextAnalyzer {
13
+ self.0.clone()
14
+ }
26
15
 
27
- fn new_simple_tokenizer() -> RTantinyTokenizer {
28
- let tokenizer = TextAnalyzer::from(SimpleTokenizer)
16
+ fn new_simple() -> Result<Self, Error> {
17
+ let tokenizer = TextAnalyzer::builder(SimpleTokenizer::default())
29
18
  .filter(RemoveLongFilter::limit(40))
30
- .filter(LowerCaser);
31
-
32
- wrap_tokenizer(tokenizer)
19
+ .filter(LowerCaser)
20
+ .build();
21
+ Ok(Tokenizer(tokenizer))
33
22
  }
34
23
 
35
- fn new_stemmer_tokenizer(locale_code: RString) -> RTantinyTokenizer {
36
- try_unwrap_params!(locale_code: String);
37
-
38
- let language: LanguageWrapper = locale_code.parse().try_unwrap();
39
- let tokenizer = TextAnalyzer::from(SimpleTokenizer)
24
+ fn new_stemmer(language: String) -> Result<Self, Error> {
25
+ let lang_wrapper = LanguageWrapper::try_from(language)?;
26
+ let tokenizer = TextAnalyzer::builder(SimpleTokenizer::default())
40
27
  .filter(RemoveLongFilter::limit(40))
41
28
  .filter(LowerCaser)
42
- .filter(Stemmer::new(language.0));
43
-
44
- wrap_tokenizer(tokenizer)
29
+ .filter(Stemmer::new(lang_wrapper.0))
30
+ .build();
31
+ Ok(Tokenizer(tokenizer))
45
32
  }
46
33
 
47
- fn new_ngram_tokenizer(
48
- min_gram: Integer,
49
- max_gram: Integer,
50
- prefix_only: Boolean
51
- ) -> RTantinyTokenizer {
52
- try_unwrap_params!(
53
- min_gram: i64,
54
- max_gram: i64,
55
- prefix_only: bool
56
- );
57
-
58
- let tokenizer = NgramTokenizer::new(
59
- min_gram as usize,
60
- max_gram as usize,
61
- prefix_only
62
- );
63
-
64
- wrap_tokenizer(TextAnalyzer::from(tokenizer))
34
+ fn new_ngram(min_gram: i64, max_gram: i64, prefix_only: bool) -> Result<Self, Error> {
35
+ let ruby = unsafe { Ruby::get_unchecked() };
36
+ let tokenizer = NgramTokenizer::new(min_gram as usize, max_gram as usize, prefix_only)
37
+ .map_err(|e| {
38
+ Error::new(
39
+ ruby.exception_runtime_error(),
40
+ format!("Failed to create ngram tokenizer: {}", e),
41
+ )
42
+ })?;
43
+
44
+ Ok(Tokenizer(TextAnalyzer::builder(tokenizer).build()))
65
45
  }
66
46
 
67
- fn extract_terms(text: RString) -> Array {
68
- try_unwrap_params!(text: String);
69
-
70
- let mut token_stream = unwrap_tokenizer(&_itself).token_stream(&text);
71
- let mut terms = vec![];
47
+ fn extract_terms(&self, text: String) -> Result<Vec<String>, Error> {
48
+ let mut cloned_analyzer = self.0.clone();
49
+ let mut token_stream = cloned_analyzer.token_stream(&text);
50
+ let mut terms = Vec::new();
72
51
 
73
52
  while token_stream.advance() {
74
- terms.push(token_stream.token().clone().text);
53
+ terms.push(token_stream.token().text.clone());
75
54
  }
76
55
 
77
- let mut array = Array::with_capacity(terms.len());
78
-
79
- for term in terms {
80
- array.push(RString::from(term));
81
- }
82
-
83
- array
56
+ Ok(terms)
84
57
  }
85
- );
58
+ }
86
59
 
87
- pub(super) fn init() {
88
- klass().define(|klass| {
89
- klass.def_self("__new_simple_tokenizer", new_simple_tokenizer);
90
- klass.def_self("__new_stemmer_tokenizer", new_stemmer_tokenizer);
91
- klass.def_self("__new_ngram_tokenizer", new_ngram_tokenizer);
92
- klass.def("__extract_terms", extract_terms);
93
- });
94
- }
60
+ pub fn init(ruby: &Ruby, module: RModule) -> Result<(), Error> {
61
+ let class = module.define_class("Tokenizer", ruby.class_object())?;
62
+
63
+ class.define_singleton_method(
64
+ "__new_simple_tokenizer",
65
+ magnus::function!(Tokenizer::new_simple, 0),
66
+ )?;
67
+ class.define_singleton_method(
68
+ "__new_stemmer_tokenizer",
69
+ magnus::function!(Tokenizer::new_stemmer, 1),
70
+ )?;
71
+ class.define_singleton_method(
72
+ "__new_ngram_tokenizer",
73
+ magnus::function!(Tokenizer::new_ngram, 3),
74
+ )?;
75
+ class.define_method(
76
+ "__extract_terms",
77
+ magnus::method!(Tokenizer::extract_terms, 1),
78
+ )?;
79
+
80
+ Ok(())
81
+ }