tantiny 0.3.3 → 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/src/query.rs CHANGED
@@ -1,260 +1,323 @@
1
- use std::str::FromStr;
1
+ use levenshtein_automata::{Distance, LevenshteinAutomatonBuilder};
2
+ use magnus::{Error, Module, Object, RArray, RModule, Ruby, TryConvert, Value};
2
3
  use std::ops::Bound::Included;
3
- use rutie::{methods, Object, AnyObject, Integer, Float, Array, RString};
4
- use tantivy::{Term, DateTime};
5
- use tantivy::schema::{IndexRecordOption, Facet, Type, FieldType};
6
4
  use tantivy::query::*;
5
+ use tantivy::schema::{Facet, FieldType, IndexRecordOption};
6
+ use tantivy::Term;
7
+ use time::OffsetDateTime;
7
8
 
8
- use crate::helpers::{try_unwrap_params, scaffold, TryUnwrap};
9
- use crate::index::{unwrap_index, RTantinyIndex};
9
+ use crate::index::Index;
10
+ use crate::tokenizer::Tokenizer;
10
11
 
11
- pub struct TantinyQuery(pub(crate) Box<dyn Query>);
12
+ #[magnus::wrap(class = "Tantiny::Query", free_immediately, size)]
13
+ pub struct Query(Box<dyn tantivy::query::Query>);
12
14
 
13
- scaffold!(RTantinyQuery, TantinyQuery, "Query");
14
-
15
- fn wrap_query(query: Box<dyn Query>) -> RTantinyQuery {
16
- klass().wrap_data(
17
- TantinyQuery(query),
18
- &*TANTINY_QUERY_WRAPPER
19
- )
20
- }
21
-
22
- pub(crate) fn unwrap_query(query: &RTantinyQuery) -> &Box<dyn Query> {
23
- &query.get_data(&*TANTINY_QUERY_WRAPPER).0
24
- }
25
-
26
- #[rustfmt::skip::macros(methods)]
27
- methods!(
28
- RTantinyQuery,
29
- _itself,
15
+ impl Query {
16
+ pub fn get_query(&self) -> &dyn tantivy::query::Query {
17
+ self.0.as_ref()
18
+ }
30
19
 
31
- fn new_all_query() -> RTantinyQuery {
32
- wrap_query(Box::new(AllQuery))
20
+ fn new_all() -> Self {
21
+ Query(Box::new(AllQuery))
33
22
  }
34
23
 
35
- fn new_empty_query() -> RTantinyQuery {
36
- wrap_query(Box::new(EmptyQuery))
24
+ fn new_empty() -> Self {
25
+ Query(Box::new(EmptyQuery))
37
26
  }
38
27
 
39
- fn new_term_query(
40
- index: RTantinyIndex,
41
- field: RString,
42
- term: RString
43
- ) -> RTantinyQuery {
44
- try_unwrap_params!(
45
- index,
46
- field: String,
47
- term: String
48
- );
49
-
50
- let schema = &unwrap_index(&index).schema;
51
- let field = schema.get_field(&field).try_unwrap();
28
+ fn new_term(index: &Index, field: String, term: String) -> Result<Self, Error> {
29
+ let ruby = unsafe { Ruby::get_unchecked() };
30
+ let field = index.schema.get_field(&field).map_err(|e| {
31
+ Error::new(
32
+ ruby.exception_runtime_error(),
33
+ format!("Field not found: {}", e),
34
+ )
35
+ })?;
52
36
  let term = Term::from_field_text(field, &term);
53
37
  let query = TermQuery::new(term, IndexRecordOption::Basic);
54
-
55
- wrap_query(Box::new(query))
38
+ Ok(Query(Box::new(query)))
56
39
  }
57
40
 
58
- fn new_fuzzy_term_query(
59
- index: RTantinyIndex,
60
- field: RString,
61
- term: RString,
62
- distance: Integer
63
- ) -> RTantinyQuery {
64
- try_unwrap_params!(
65
- index,
66
- field: String,
67
- term: String,
68
- distance: i64
69
- );
70
-
71
- let schema = &unwrap_index(&index).schema;
72
- let field = schema.get_field(&field).try_unwrap();
41
+ fn new_fuzzy_term(
42
+ index: &Index,
43
+ field: String,
44
+ term: String,
45
+ distance: i64,
46
+ ) -> Result<Self, Error> {
47
+ let ruby = unsafe { Ruby::get_unchecked() };
48
+ let field = index.schema.get_field(&field).map_err(|e| {
49
+ Error::new(
50
+ ruby.exception_runtime_error(),
51
+ format!("Field not found: {}", e),
52
+ )
53
+ })?;
73
54
  let term = Term::from_field_text(field, &term);
74
55
  let query = FuzzyTermQuery::new(term, distance as u8, true);
75
-
76
- wrap_query(Box::new(query))
56
+ Ok(Query(Box::new(query)))
77
57
  }
78
58
 
79
- fn new_phrase_query(
80
- index: RTantinyIndex,
81
- field: RString,
82
- terms: Array
83
- ) -> RTantinyQuery {
84
- try_unwrap_params!(
85
- index,
86
- field: String,
87
- terms: Vec<String>
88
- );
89
-
90
- let schema = &unwrap_index(&index).schema;
91
- let field = schema.get_field(&field).try_unwrap();
92
-
93
- let terms: Vec<Term> = terms.into_iter().map(|term| {
94
- Term::from_field_text(field, &term)
95
- }).collect();
59
+ fn new_phrase(index: &Index, field: String, terms: Vec<String>) -> Result<Self, Error> {
60
+ let ruby = unsafe { Ruby::get_unchecked() };
61
+ let field = index.schema.get_field(&field).map_err(|e| {
62
+ Error::new(
63
+ ruby.exception_runtime_error(),
64
+ format!("Field not found: {}", e),
65
+ )
66
+ })?;
67
+
68
+ let terms: Vec<Term> = terms
69
+ .into_iter()
70
+ .map(|term| Term::from_field_text(field, &term))
71
+ .collect();
96
72
  let query = PhraseQuery::new(terms);
97
-
98
- wrap_query(Box::new(query))
73
+ Ok(Query(Box::new(query)))
99
74
  }
100
75
 
101
- fn new_regex_query(
102
- index: RTantinyIndex,
103
- field: RString,
104
- regex: RString
105
- ) -> RTantinyQuery {
106
- try_unwrap_params!(
107
- index,
108
- field: String,
109
- regex: String
110
- );
111
-
112
- let schema = &unwrap_index(&index).schema;
113
- let field = schema.get_field(&field).try_unwrap();
114
- let query = RegexQuery::from_pattern(&regex, field).try_unwrap();
115
-
116
- wrap_query(Box::new(query))
76
+ fn new_regex(index: &Index, field: String, regex: String) -> Result<Self, Error> {
77
+ let ruby = unsafe { Ruby::get_unchecked() };
78
+ let field = index.schema.get_field(&field).map_err(|e| {
79
+ Error::new(
80
+ ruby.exception_runtime_error(),
81
+ format!("Field not found: {}", e),
82
+ )
83
+ })?;
84
+ let query = RegexQuery::from_pattern(&regex, field).map_err(|e| {
85
+ Error::new(
86
+ ruby.exception_runtime_error(),
87
+ format!("Invalid regex: {}", e),
88
+ )
89
+ })?;
90
+ Ok(Query(Box::new(query)))
117
91
  }
118
92
 
119
- fn new_range_query(
120
- index: RTantinyIndex,
121
- field: RString,
122
- from: AnyObject,
123
- to: AnyObject
124
- ) -> RTantinyQuery {
125
- try_unwrap_params!(index, from, to, field: String);
126
-
127
- let schema = &unwrap_index(&index).schema;
128
- let field = schema.get_field(&field).try_unwrap();
129
- let field_name = schema.get_field_name(field);
130
- let field_type = schema.get_field_entry(field).field_type();
131
-
132
- let range = match field_type {
93
+ fn new_range(index: &Index, field: String, from: Value, to: Value) -> Result<Self, Error> {
94
+ let ruby = unsafe { Ruby::get_unchecked() };
95
+ let field_obj = index.schema.get_field(&field).map_err(|e| {
96
+ Error::new(
97
+ ruby.exception_runtime_error(),
98
+ format!("Field not found: {}", e),
99
+ )
100
+ })?;
101
+ let field_name = index.schema.get_field_name(field_obj);
102
+ let field_type = index.schema.get_field_entry(field_obj).field_type();
103
+
104
+ let (left, right) = match field_type {
133
105
  FieldType::Date(_) => {
134
- let from: String = from.try_unwrap();
135
- let to: String = to.try_unwrap();
136
- let from = DateTime::from_str(&from).try_unwrap();
137
- let to = DateTime::from_str(&to).try_unwrap();
138
-
139
- Ok((
140
- Type::Date,
141
- Included(Term::from_field_date(field, &from)),
142
- Included(Term::from_field_date(field, &to))
143
- ))
144
- },
106
+ let from_str: String = String::try_convert(from)?;
107
+ let to_str: String = String::try_convert(to)?;
108
+ let from_datetime = OffsetDateTime::parse(
109
+ &from_str,
110
+ &time::format_description::well_known::Rfc3339,
111
+ )
112
+ .map_err(|e| {
113
+ Error::new(
114
+ ruby.exception_runtime_error(),
115
+ format!("Invalid date format: {}", e),
116
+ )
117
+ })?;
118
+ let to_datetime =
119
+ OffsetDateTime::parse(&to_str, &time::format_description::well_known::Rfc3339)
120
+ .map_err(|e| {
121
+ Error::new(
122
+ ruby.exception_runtime_error(),
123
+ format!("Invalid date format: {}", e),
124
+ )
125
+ })?;
126
+ let from_dt = tantivy::DateTime::from_timestamp_nanos(
127
+ from_datetime.unix_timestamp_nanos() as i64,
128
+ );
129
+ let to_dt = tantivy::DateTime::from_timestamp_nanos(
130
+ to_datetime.unix_timestamp_nanos() as i64,
131
+ );
132
+
133
+ (
134
+ Term::from_field_date(field_obj, from_dt),
135
+ Term::from_field_date(field_obj, to_dt),
136
+ )
137
+ }
145
138
  FieldType::I64(_) => {
146
- let from: i64 = from.try_unwrap();
147
- let to: i64 = to.try_unwrap();
148
-
149
- Ok((
150
- Type::I64,
151
- Included(Term::from_field_i64(field, from)),
152
- Included(Term::from_field_i64(field, to))
153
- ))
154
- },
139
+ let from_val: i64 = i64::try_convert(from)?;
140
+ let to_val: i64 = i64::try_convert(to)?;
141
+ (
142
+ Term::from_field_i64(field_obj, from_val),
143
+ Term::from_field_i64(field_obj, to_val),
144
+ )
145
+ }
155
146
  FieldType::F64(_) => {
156
- let from: f64 = from.try_unwrap();
157
- let to: f64 = to.try_unwrap();
158
-
159
- Ok((
160
- Type::F64,
161
- Included(Term::from_field_f64(field, from)),
162
- Included(Term::from_field_f64(field, to))
147
+ let from_val: f64 = f64::try_convert(from)?;
148
+ let to_val: f64 = f64::try_convert(to)?;
149
+ (
150
+ Term::from_field_f64(field_obj, from_val),
151
+ Term::from_field_f64(field_obj, to_val),
152
+ )
153
+ }
154
+ _ => {
155
+ return Err(Error::new(
156
+ ruby.exception_runtime_error(),
157
+ format!("Field '{}' is not supported by range query.", field_name),
163
158
  ))
164
- },
165
- _ => { Err(format!("Field '{}' is not supported by range query.", field_name)) }
159
+ }
166
160
  };
167
161
 
168
- let (value_type, left, right) = range.try_unwrap();
169
-
170
- let query = RangeQuery::new_term_bounds(field, value_type, &left, &right);
171
-
172
- wrap_query(Box::new(query))
162
+ let query = RangeQuery::new(Included(left), Included(right));
163
+ Ok(Query(Box::new(query)))
173
164
  }
174
165
 
175
- fn new_facet_query(
176
- index: RTantinyIndex,
177
- field: RString,
178
- path: RString
179
- ) -> RTantinyQuery {
180
- try_unwrap_params!(
181
- index,
182
- field: String,
183
- path: String
184
- );
185
-
186
- let schema = &unwrap_index(&index).schema;
187
- let field = schema.get_field(&field).try_unwrap();
166
+ fn new_facet(index: &Index, field: String, path: String) -> Result<Self, Error> {
167
+ let ruby = unsafe { Ruby::get_unchecked() };
168
+ let field = index.schema.get_field(&field).map_err(|e| {
169
+ Error::new(
170
+ ruby.exception_runtime_error(),
171
+ format!("Field not found: {}", e),
172
+ )
173
+ })?;
188
174
  let facet = Facet::from(&path);
189
175
  let term = Term::from_facet(field, &facet);
190
176
  let query = TermQuery::new(term, IndexRecordOption::Basic);
191
-
192
- wrap_query(Box::new(query))
177
+ Ok(Query(Box::new(query)))
193
178
  }
194
179
 
195
- fn disjunction(queries: Array) -> RTantinyQuery {
196
- try_unwrap_params!(queries);
197
-
180
+ fn disjunction(queries: RArray) -> Result<Self, Error> {
198
181
  let mut query_vec = Vec::new();
199
182
 
200
- for query in queries {
201
- let query: RTantinyQuery = query.try_unwrap();
202
- query_vec.push((Occur::Should, unwrap_query(&query).box_clone()));
183
+ for item in queries.into_iter() {
184
+ let query: &Query = <&Query>::try_convert(item)?;
185
+ query_vec.push((Occur::Should, query.0.box_clone()));
203
186
  }
204
187
 
205
- let disjunction_query = BooleanQuery::from(query_vec);
206
-
207
- wrap_query(Box::new(disjunction_query))
188
+ Ok(Query(Box::new(BooleanQuery::from(query_vec))))
208
189
  }
209
190
 
210
- fn conjunction(queries: Array) -> RTantinyQuery {
211
- try_unwrap_params!(queries);
212
-
191
+ fn conjunction(queries: RArray) -> Result<Self, Error> {
213
192
  let mut query_vec = Vec::new();
214
193
 
215
- for query in queries {
216
- let query: RTantinyQuery = query.try_unwrap();
217
- query_vec.push((Occur::Must, unwrap_query(&query).box_clone()));
194
+ for item in queries.into_iter() {
195
+ let query: &Query = <&Query>::try_convert(item)?;
196
+ query_vec.push((Occur::Must, query.0.box_clone()));
218
197
  }
219
198
 
220
- let conjunction_query = BooleanQuery::from(query_vec);
221
-
222
- wrap_query(Box::new(conjunction_query))
199
+ Ok(Query(Box::new(BooleanQuery::from(query_vec))))
223
200
  }
224
201
 
225
- fn negation() -> RTantinyQuery {
226
- // See: https://github.com/quickwit-oss/tantivy/issues/1153
227
- let all_query: Box<dyn Query> = Box::new(AllQuery);
202
+ fn negation(&self) -> Self {
203
+ let all_query: Box<dyn tantivy::query::Query> = Box::new(AllQuery);
228
204
  let negation_query = BooleanQuery::from(vec![
229
205
  (Occur::Must, all_query.box_clone()),
230
- (Occur::MustNot, unwrap_query(&_itself).box_clone()),
206
+ (Occur::MustNot, self.0.box_clone()),
231
207
  ]);
232
208
 
233
- wrap_query(Box::new(negation_query))
209
+ Query(Box::new(negation_query))
210
+ }
211
+
212
+ fn boost(&self, score: f64) -> Self {
213
+ let query = BoostQuery::new(self.0.box_clone(), score as f32);
214
+ Query(Box::new(query))
234
215
  }
235
216
 
236
- fn boost(score: Float) -> RTantinyQuery {
237
- try_unwrap_params!(score: f64);
217
+ fn highlight(
218
+ text: String,
219
+ query_string: String,
220
+ fuzzy_distance: i64,
221
+ tokenizer: &Tokenizer,
222
+ last_term_min_length_prefix_match: i64,
223
+ ) -> Result<String, Error> {
224
+ use tantivy::tokenizer::TokenStream;
225
+ let mut analyzer = tokenizer.get_analyzer();
226
+
227
+ // Tokenize the query string
228
+ let query_tokens = {
229
+ let mut query_string_token_stream = analyzer.token_stream(&query_string);
230
+ let mut tokens = Vec::new();
231
+ while query_string_token_stream.advance() {
232
+ let token = query_string_token_stream.token();
233
+ tokens.push(token.text.clone());
234
+ }
235
+ tokens
236
+ };
237
+
238
+ // Tokenize the input text
239
+ let mut input_text_token_stream = analyzer.token_stream(&text);
240
+ let mut tokens = Vec::new();
241
+ while input_text_token_stream.advance() {
242
+ let token = input_text_token_stream.token();
243
+ tokens.push((token.text.clone(), token.offset_from, token.offset_to));
244
+ }
245
+
246
+ // Build Levenshtein automata for each term (same algorithm as Tantivy's FuzzyTermQuery)
247
+ let lev_builder = LevenshteinAutomatonBuilder::new(fuzzy_distance as u8, true);
248
+ let automata: Vec<_> = query_tokens
249
+ .iter()
250
+ .map(|term| lev_builder.build_dfa(term))
251
+ .collect();
252
+
253
+ // Build the highlighted text
254
+ let mut result = String::new();
255
+ let mut last_pos = 0;
256
+
257
+ for (token_text, start, end) in tokens {
258
+ // Check if this token matches any of the query terms (exact or fuzzy)
259
+ let fuzzy_match = query_tokens.iter().zip(&automata).any(|(term, dfa)| {
260
+ // Exact match
261
+ if token_text.eq_ignore_ascii_case(term) {
262
+ return true;
263
+ }
264
+
265
+ // Fuzzy match using Levenshtein automaton (same as Tantivy's FuzzyTermQuery)
266
+ matches!(dfa.eval(&token_text), Distance::Exact(_))
267
+ });
268
+
269
+ // Check if this token is a prefix match for the last query term
270
+ let prefix_match = token_text.len() > last_term_min_length_prefix_match as usize
271
+ && query_tokens
272
+ .last()
273
+ .map(|last_token| token_text.starts_with(last_token))
274
+ .unwrap_or(false);
275
+ let should_highlight = fuzzy_match || prefix_match;
276
+
277
+ // Add the text before the token
278
+ result.push_str(&text[last_pos..start]);
279
+
280
+ // Add the token, highlighted if it matches
281
+ if should_highlight {
282
+ result.push_str("<b>");
283
+ result.push_str(&text[start..end]);
284
+ result.push_str("</b>");
285
+ } else {
286
+ result.push_str(&text[start..end]);
287
+ }
288
+
289
+ last_pos = end;
290
+ }
238
291
 
239
- let query = BoostQuery::new(unwrap_query(&_itself).box_clone(), score as f32);
292
+ // Add any remaining text after the last token
293
+ result.push_str(&text[last_pos..]);
240
294
 
241
- wrap_query(Box::new(query))
295
+ Ok(result)
242
296
  }
243
- );
244
-
245
- pub(super) fn init() {
246
- klass().define(|klass| {
247
- klass.def_self("__new_all_query", new_all_query);
248
- klass.def_self("__new_empty_query", new_empty_query);
249
- klass.def_self("__new_term_query", new_term_query);
250
- klass.def_self("__new_fuzzy_term_query", new_fuzzy_term_query);
251
- klass.def_self("__new_regex_query", new_regex_query);
252
- klass.def_self("__new_range_query", new_range_query);
253
- klass.def_self("__new_phrase_query", new_phrase_query);
254
- klass.def_self("__new_facet_query", new_facet_query);
255
- klass.def_self("__disjunction", disjunction);
256
- klass.def_self("__conjunction", conjunction);
257
- klass.def("__negation", negation);
258
- klass.def("__boost", boost);
259
- });
260
- }
297
+ }
298
+
299
+ pub fn init(ruby: &Ruby, module: RModule) -> Result<(), Error> {
300
+ let class = module.define_class("Query", ruby.class_object())?;
301
+
302
+ class.define_singleton_method("__new_all_query", magnus::function!(Query::new_all, 0))?;
303
+ class.define_singleton_method("__new_empty_query", magnus::function!(Query::new_empty, 0))?;
304
+ class.define_singleton_method("__new_term_query", magnus::function!(Query::new_term, 3))?;
305
+ class.define_singleton_method(
306
+ "__new_fuzzy_term_query",
307
+ magnus::function!(Query::new_fuzzy_term, 4),
308
+ )?;
309
+ class.define_singleton_method(
310
+ "__new_phrase_query",
311
+ magnus::function!(Query::new_phrase, 3),
312
+ )?;
313
+ class.define_singleton_method("__new_regex_query", magnus::function!(Query::new_regex, 3))?;
314
+ class.define_singleton_method("__new_range_query", magnus::function!(Query::new_range, 4))?;
315
+ class.define_singleton_method("__new_facet_query", magnus::function!(Query::new_facet, 3))?;
316
+ class.define_singleton_method("__disjunction", magnus::function!(Query::disjunction, 1))?;
317
+ class.define_singleton_method("__conjunction", magnus::function!(Query::conjunction, 1))?;
318
+ class.define_method("__negation", magnus::method!(Query::negation, 0))?;
319
+ class.define_method("__boost", magnus::method!(Query::boost, 1))?;
320
+ class.define_singleton_method("__highlight", magnus::function!(Query::highlight, 5))?;
321
+
322
+ Ok(())
323
+ }