tantiny 0.3.3 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -0
- data/Cargo.toml +9 -6
- data/README.md +118 -42
- data/bin/console +2 -3
- data/lib/tantiny/errors.rb +1 -1
- data/lib/tantiny/index.rb +29 -19
- data/lib/tantiny/query.rb +21 -16
- data/lib/tantiny/schema.rb +2 -2
- data/lib/tantiny/version.rb +1 -1
- data/lib/tantiny.rb +21 -10
- data/lib/tantiny.so +0 -0
- data/src/helpers.rs +71 -191
- data/src/index.rs +310 -197
- data/src/lib.rs +12 -9
- data/src/query.rs +246 -203
- data/src/tokenizer.rs +62 -75
- metadata +44 -43
- data/lib/.rbnext/3.0/tantiny/schema.rb +0 -53
- data/sig/tantiny/errors.rbs +0 -20
- data/sig/tantiny/helpers.rbs +0 -8
- data/sig/tantiny/index.rbs +0 -103
- data/sig/tantiny/query.rbs +0 -135
- data/sig/tantiny/schema.rbs +0 -26
- data/sig/tantiny/tokenizer.rbs +0 -25
- data/sig/tantiny/version.rbs +0 -3
- data/sig/tantiny.rbs +0 -5
data/src/query.rs
CHANGED

@@ -1,260 +1,303 @@
-use
+use levenshtein_automata::{Distance, LevenshteinAutomatonBuilder};
+use magnus::{Error, Module, Object, RArray, RModule, Ruby, TryConvert, Value};
 use std::ops::Bound::Included;
-use rutie::{methods, Object, AnyObject, Integer, Float, Array, RString};
-use tantivy::{Term, DateTime};
-use tantivy::schema::{IndexRecordOption, Facet, Type, FieldType};
 use tantivy::query::*;
+use tantivy::schema::{Facet, FieldType, IndexRecordOption};
+use tantivy::Term;
+use time::OffsetDateTime;
 
-use crate::
-use crate::index::{unwrap_index, RTantinyIndex};
+use crate::index::Index;
 
+#[magnus::wrap(class = "Tantiny::Query", free_immediately, size)]
+pub struct Query(Box<dyn tantivy::query::Query>);
 
-        TantinyQuery(query),
-        &*TANTINY_QUERY_WRAPPER
-    )
-}
-pub(crate) fn unwrap_query(query: &RTantinyQuery) -> &Box<dyn Query> {
-    &query.get_data(&*TANTINY_QUERY_WRAPPER).0
-}
-#[rustfmt::skip::macros(methods)]
-methods!(
-    RTantinyQuery,
-    _itself,
+impl Query {
+    pub fn get_query(&self) -> &dyn tantivy::query::Query {
+        self.0.as_ref()
+    }
 
-    fn
+    fn new_all() -> Self {
+        Query(Box::new(AllQuery))
     }
 
-    fn
+    fn new_empty() -> Self {
+        Query(Box::new(EmptyQuery))
     }
 
-    fn
-        field
-        term: String
-    );
-        let schema = &unwrap_index(&index).schema;
-        let field = schema.get_field(&field).try_unwrap();
+    fn new_term(index: &Index, field: String, term: String) -> Result<Self, Error> {
+        let ruby = unsafe { Ruby::get_unchecked() };
+        let field = index.schema.get_field(&field).map_err(|e| {
+            Error::new(
+                ruby.exception_runtime_error(),
+                format!("Field not found: {}", e),
+            )
+        })?;
         let term = Term::from_field_text(field, &term);
         let query = TermQuery::new(term, IndexRecordOption::Basic);
-        wrap_query(Box::new(query))
+        Ok(Query(Box::new(query)))
     }
 
-    fn
-        index:
-        field:
-        term:
-        distance:
-    ) ->
-        let schema = &unwrap_index(&index).schema;
-        let field = schema.get_field(&field).try_unwrap();
+    fn new_fuzzy_term(
+        index: &Index,
+        field: String,
+        term: String,
+        distance: i64,
+    ) -> Result<Self, Error> {
+        let ruby = unsafe { Ruby::get_unchecked() };
+        let field = index.schema.get_field(&field).map_err(|e| {
+            Error::new(
+                ruby.exception_runtime_error(),
+                format!("Field not found: {}", e),
+            )
+        })?;
         let term = Term::from_field_text(field, &term);
         let query = FuzzyTermQuery::new(term, distance as u8, true);
-        wrap_query(Box::new(query))
+        Ok(Query(Box::new(query)))
     }
 
-    fn
-        field
-        let terms: Vec<Term> = terms.into_iter().map(|term| {
-            Term::from_field_text(field, &term)
-        }).collect();
+    fn new_phrase(index: &Index, field: String, terms: Vec<String>) -> Result<Self, Error> {
+        let ruby = unsafe { Ruby::get_unchecked() };
+        let field = index.schema.get_field(&field).map_err(|e| {
+            Error::new(
+                ruby.exception_runtime_error(),
+                format!("Field not found: {}", e),
+            )
+        })?;
+
+        let terms: Vec<Term> = terms
+            .into_iter()
+            .map(|term| Term::from_field_text(field, &term))
+            .collect();
         let query = PhraseQuery::new(terms);
-        wrap_query(Box::new(query))
+        Ok(Query(Box::new(query)))
     }
 
-    fn
-        field
-        wrap_query(Box::new(query))
+    fn new_regex(index: &Index, field: String, regex: String) -> Result<Self, Error> {
+        let ruby = unsafe { Ruby::get_unchecked() };
+        let field = index.schema.get_field(&field).map_err(|e| {
+            Error::new(
+                ruby.exception_runtime_error(),
+                format!("Field not found: {}", e),
+            )
+        })?;
+        let query = RegexQuery::from_pattern(&regex, field).map_err(|e| {
+            Error::new(
+                ruby.exception_runtime_error(),
+                format!("Invalid regex: {}", e),
+            )
+        })?;
+        Ok(Query(Box::new(query)))
     }
 
-    fn
-        field
-        let
-        let
-        let
-        let range = match field_type {
+    fn new_range(index: &Index, field: String, from: Value, to: Value) -> Result<Self, Error> {
+        let ruby = unsafe { Ruby::get_unchecked() };
+        let field_obj = index.schema.get_field(&field).map_err(|e| {
+            Error::new(
+                ruby.exception_runtime_error(),
+                format!("Field not found: {}", e),
+            )
+        })?;
+        let field_name = index.schema.get_field_name(field_obj);
+        let field_type = index.schema.get_field_entry(field_obj).field_type();
+
+        let (left, right) = match field_type {
             FieldType::Date(_) => {
-                let
-                let
-                let
+                let from_str: String = String::try_convert(from)?;
+                let to_str: String = String::try_convert(to)?;
+                let from_datetime = OffsetDateTime::parse(
+                    &from_str,
+                    &time::format_description::well_known::Rfc3339,
+                )
+                .map_err(|e| {
+                    Error::new(
+                        ruby.exception_runtime_error(),
+                        format!("Invalid date format: {}", e),
+                    )
+                })?;
+                let to_datetime =
+                    OffsetDateTime::parse(&to_str, &time::format_description::well_known::Rfc3339)
+                        .map_err(|e| {
+                            Error::new(
+                                ruby.exception_runtime_error(),
+                                format!("Invalid date format: {}", e),
+                            )
+                        })?;
+                let from_dt = tantivy::DateTime::from_timestamp_nanos(
+                    from_datetime.unix_timestamp_nanos() as i64,
+                );
+                let to_dt = tantivy::DateTime::from_timestamp_nanos(
+                    to_datetime.unix_timestamp_nanos() as i64,
+                );
+
+                (
+                    Term::from_field_date(field_obj, from_dt),
+                    Term::from_field_date(field_obj, to_dt),
+                )
+            }
             FieldType::I64(_) => {
-                let
-                let
-                ))
-            },
+                let from_val: i64 = i64::try_convert(from)?;
+                let to_val: i64 = i64::try_convert(to)?;
+                (
+                    Term::from_field_i64(field_obj, from_val),
+                    Term::from_field_i64(field_obj, to_val),
+                )
+            }
             FieldType::F64(_) => {
-                let
-                let
+                let from_val: f64 = f64::try_convert(from)?;
+                let to_val: f64 = f64::try_convert(to)?;
+                (
+                    Term::from_field_f64(field_obj, from_val),
+                    Term::from_field_f64(field_obj, to_val),
+                )
+            }
+            _ => {
+                return Err(Error::new(
+                    ruby.exception_runtime_error(),
+                    format!("Field '{}' is not supported by range query.", field_name),
                 ))
-            }
-            _ => { Err(format!("Field '{}' is not supported by range query.", field_name)) }
+            }
         };
 
-        let (
-        let query = RangeQuery::new_term_bounds(field, value_type, &left, &right);
-        wrap_query(Box::new(query))
+        let query = RangeQuery::new(Included(left), Included(right));
+        Ok(Query(Box::new(query)))
     }
 
-    fn
-        field
-        path: String
-    );
-        let schema = &unwrap_index(&index).schema;
-        let field = schema.get_field(&field).try_unwrap();
+    fn new_facet(index: &Index, field: String, path: String) -> Result<Self, Error> {
+        let ruby = unsafe { Ruby::get_unchecked() };
+        let field = index.schema.get_field(&field).map_err(|e| {
+            Error::new(
+                ruby.exception_runtime_error(),
+                format!("Field not found: {}", e),
+            )
+        })?;
         let facet = Facet::from(&path);
         let term = Term::from_facet(field, &facet);
         let query = TermQuery::new(term, IndexRecordOption::Basic);
-        wrap_query(Box::new(query))
+        Ok(Query(Box::new(query)))
     }
 
-    fn disjunction(queries:
-        try_unwrap_params!(queries);
+    fn disjunction(queries: RArray) -> Result<Self, Error> {
         let mut query_vec = Vec::new();
 
-        for
-            let query:
-            query_vec.push((Occur::Should,
+        for item in queries.into_iter() {
+            let query: &Query = <&Query>::try_convert(item)?;
+            query_vec.push((Occur::Should, query.0.box_clone()));
         }
 
-        wrap_query(Box::new(disjunction_query))
+        Ok(Query(Box::new(BooleanQuery::from(query_vec))))
     }
 
-    fn conjunction(queries:
-        try_unwrap_params!(queries);
+    fn conjunction(queries: RArray) -> Result<Self, Error> {
         let mut query_vec = Vec::new();
 
-        for
-            let query:
-            query_vec.push((Occur::Must,
+        for item in queries.into_iter() {
+            let query: &Query = <&Query>::try_convert(item)?;
+            query_vec.push((Occur::Must, query.0.box_clone()));
         }
 
-        wrap_query(Box::new(conjunction_query))
+        Ok(Query(Box::new(BooleanQuery::from(query_vec))))
    }
 
-    fn negation() ->
-        let all_query: Box<dyn Query> = Box::new(AllQuery);
+    fn negation(&self) -> Self {
+        let all_query: Box<dyn tantivy::query::Query> = Box::new(AllQuery);
         let negation_query = BooleanQuery::from(vec![
             (Occur::Must, all_query.box_clone()),
-            (Occur::MustNot,
+            (Occur::MustNot, self.0.box_clone()),
         ]);
 
+        Query(Box::new(negation_query))
+    }
+
+    fn boost(&self, score: f64) -> Self {
+        let query = BoostQuery::new(self.0.box_clone(), score as f32);
+        Query(Box::new(query))
     }
 
-    fn
+    fn highlight(text: String, terms: Vec<String>, fuzzy_distance: i64) -> Result<String, Error> {
+        use tantivy::tokenizer::{LowerCaser, SimpleTokenizer, TextAnalyzer, TokenStream};
+
+        // Create a simple tokenizer for highlighting
+        let mut analyzer = TextAnalyzer::builder(SimpleTokenizer::default())
+            .filter(LowerCaser)
+            .build();
 
+        // Tokenize the input text
+        let mut token_stream = analyzer.token_stream(&text);
 
+        // Collect all tokens with their positions
+        let mut tokens = Vec::new();
+        while token_stream.advance() {
+            let token = token_stream.token();
+            tokens.push((token.text.clone(), token.offset_from, token.offset_to));
+        }
+
+        // Build Levenshtein automata for each term (same algorithm as Tantivy's FuzzyTermQuery)
+        let lev_builder = LevenshteinAutomatonBuilder::new(fuzzy_distance as u8, true);
+        let automata: Vec<_> = terms
+            .iter()
+            .map(|term| lev_builder.build_dfa(term))
+            .collect();
+
+        // Build the highlighted text
+        let mut result = String::new();
+        let mut last_pos = 0;
+
+        for (token_text, start, end) in tokens {
+            // Check if this token matches any of the query terms (exact or fuzzy)
+            let should_highlight = terms.iter().zip(&automata).any(|(term, dfa)| {
+                // Exact match
+                if token_text.eq_ignore_ascii_case(term) {
+                    return true;
+                }
+
+                // Fuzzy match using Levenshtein automaton (same as Tantivy's FuzzyTermQuery)
+                matches!(dfa.eval(&token_text), Distance::Exact(_))
+            });
+
+            // Add the text before the token
+            result.push_str(&text[last_pos..start]);
+
+            // Add the token, highlighted if it matches
+            if should_highlight {
+                result.push_str("<b>");
+                result.push_str(&text[start..end]);
+                result.push_str("</b>");
+            } else {
+                result.push_str(&text[start..end]);
+            }
+
+            last_pos = end;
+        }
+
+        // Add any remaining text after the last token
+        result.push_str(&text[last_pos..]);
+
+        Ok(result)
     }
-
-    pub
+}
+
+pub fn init(ruby: &Ruby, module: RModule) -> Result<(), Error> {
+    let class = module.define_class("Query", ruby.class_object())?;
+
+    class.define_singleton_method("__new_all_query", magnus::function!(Query::new_all, 0))?;
+    class.define_singleton_method("__new_empty_query", magnus::function!(Query::new_empty, 0))?;
+    class.define_singleton_method("__new_term_query", magnus::function!(Query::new_term, 3))?;
+    class.define_singleton_method(
+        "__new_fuzzy_term_query",
+        magnus::function!(Query::new_fuzzy_term, 4),
+    )?;
+    class.define_singleton_method(
+        "__new_phrase_query",
+        magnus::function!(Query::new_phrase, 3),
+    )?;
+    class.define_singleton_method("__new_regex_query", magnus::function!(Query::new_regex, 3))?;
+    class.define_singleton_method("__new_range_query", magnus::function!(Query::new_range, 4))?;
+    class.define_singleton_method("__new_facet_query", magnus::function!(Query::new_facet, 3))?;
+    class.define_singleton_method("__disjunction", magnus::function!(Query::disjunction, 1))?;
+    class.define_singleton_method("__conjunction", magnus::function!(Query::conjunction, 1))?;
+    class.define_method("__negation", magnus::method!(Query::negation, 0))?;
+    class.define_method("__boost", magnus::method!(Query::boost, 1))?;
+    class.define_singleton_method("__highlight", magnus::function!(Query::highlight, 3))?;
+
+    Ok(())
+}
data/src/tokenizer.rs
CHANGED

@@ -1,94 +1,81 @@
+use magnus::{Error, Module, Object, RModule, Ruby};
+use tantivy::tokenizer::{
+    LowerCaser, NgramTokenizer, RemoveLongFilter, SimpleTokenizer, Stemmer, TextAnalyzer,
+};
 
-use
-use tantivy::tokenizer::{TextAnalyzer, SimpleTokenizer, RemoveLongFilter, LowerCaser, Stemmer, NgramTokenizer};
+use crate::helpers::LanguageWrapper;
 
+#[magnus::wrap(class = "Tantiny::Tokenizer", free_immediately, size)]
+pub struct Tokenizer(TextAnalyzer);
 
-fn wrap_tokenizer(tokenizer: TextAnalyzer) -> RTantinyTokenizer {
-    klass().wrap_data(
-        TantinyTokenizer(tokenizer),
-        &*TANTINY_TOKENIZER_WRAPPER
-    )
-}
-
-pub(crate) fn unwrap_tokenizer(tokenizer: &RTantinyTokenizer) -> &TextAnalyzer {
-    &tokenizer.get_data(&*TANTINY_TOKENIZER_WRAPPER).0
-}
-
-#[rustfmt::skip::macros(methods)]
-methods!(
-    RTantinyTokenizer,
-    _itself,
+impl Tokenizer {
+    pub fn get_analyzer(&self) -> TextAnalyzer {
+        self.0.clone()
+    }
 
-    fn
-        let tokenizer = TextAnalyzer::
+    fn new_simple() -> Result<Self, Error> {
+        let tokenizer = TextAnalyzer::builder(SimpleTokenizer::default())
             .filter(RemoveLongFilter::limit(40))
-            .filter(LowerCaser)
+            .filter(LowerCaser)
+            .build();
+        Ok(Tokenizer(tokenizer))
     }
 
-    fn
-        let language: LanguageWrapper = locale_code.parse().try_unwrap();
-        let tokenizer = TextAnalyzer::from(SimpleTokenizer)
+    fn new_stemmer(language: String) -> Result<Self, Error> {
+        let lang_wrapper = LanguageWrapper::try_from(language)?;
+        let tokenizer = TextAnalyzer::builder(SimpleTokenizer::default())
             .filter(RemoveLongFilter::limit(40))
             .filter(LowerCaser)
-            .filter(Stemmer::new(
+            .filter(Stemmer::new(lang_wrapper.0))
+            .build();
+        Ok(Tokenizer(tokenizer))
     }
 
-    fn
-        max_gram
-        let tokenizer = NgramTokenizer::new(
-            min_gram as usize,
-            max_gram as usize,
-            prefix_only
-        );
-        wrap_tokenizer(TextAnalyzer::from(tokenizer))
+    fn new_ngram(min_gram: i64, max_gram: i64, prefix_only: bool) -> Result<Self, Error> {
+        let ruby = unsafe { Ruby::get_unchecked() };
+        let tokenizer = NgramTokenizer::new(min_gram as usize, max_gram as usize, prefix_only)
+            .map_err(|e| {
+                Error::new(
+                    ruby.exception_runtime_error(),
+                    format!("Failed to create ngram tokenizer: {}", e),
+                )
+            })?;
+
+        Ok(Tokenizer(TextAnalyzer::builder(tokenizer).build()))
     }
 
-    fn extract_terms(text:
-        let mut
-        let mut terms = vec![];
+    fn extract_terms(&self, text: String) -> Result<Vec<String>, Error> {
+        let mut cloned_analyzer = self.0.clone();
+        let mut token_stream = cloned_analyzer.token_stream(&text);
+        let mut terms = Vec::new();
 
         while token_stream.advance() {
-            terms.push(token_stream.token().clone()
+            terms.push(token_stream.token().text.clone());
        }
 
-        for term in terms {
-            array.push(RString::from(term));
-        }
-
-        array
+        Ok(terms)
     }
+}
 
-    pub
+pub fn init(ruby: &Ruby, module: RModule) -> Result<(), Error> {
+    let class = module.define_class("Tokenizer", ruby.class_object())?;
+
+    class.define_singleton_method(
+        "__new_simple_tokenizer",
+        magnus::function!(Tokenizer::new_simple, 0),
+    )?;
+    class.define_singleton_method(
+        "__new_stemmer_tokenizer",
+        magnus::function!(Tokenizer::new_stemmer, 1),
+    )?;
+    class.define_singleton_method(
+        "__new_ngram_tokenizer",
+        magnus::function!(Tokenizer::new_ngram, 3),
+    )?;
+    class.define_method(
+        "__extract_terms",
+        magnus::method!(Tokenizer::extract_terms, 1),
+    )?;
+
+    Ok(())
+}