tantiny 0.4.0 → 0.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +7 -0
- data/Cargo.toml +1 -1
- data/lib/tantiny/query.rb +2 -3
- data/lib/tantiny/version.rb +1 -1
- data/lib/tantiny.so +0 -0
- data/src/query.rs +35 -15
- metadata +1 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 3aeb0d870c24b42e8bf9b607e6ccb6bccff26b0f5f671a84e248d5465008d268
|
|
4
|
+
data.tar.gz: ea8fc4e425bdeeeb16d6a7db8be220b2742ad9c035b1b1a8c4c365a4ac0cd0f1
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: cf150617442ea2f5829de2a1e4e514310295775cfaad414f9438815fd9a5ab5746d161b69fcb87da1d8ee9a8fbf68b3b7a033f76f49a639e35d06633cc9dddda
|
|
7
|
+
data.tar.gz: 3a29e2db6f5ba7a71385185058b082ca2420a5dec2805bab73e947c8532ef6f67fde38a61f0d4bf2ef1ced0506e29b7ec75575b58fe7cb3dcaf7e9ef6b07ef25
|
data/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,12 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## [0.4.1](https://github.com/altertable-ai/tantiny/compare/tantiny/v0.4.0...tantiny/v0.4.1) (2025-11-03)
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
### Bug Fixes
|
|
7
|
+
|
|
8
|
+
* **highlighting:** ensure the same tokenizer is used for text & query + handle last term as prefix ([#7](https://github.com/altertable-ai/tantiny/issues/7)) ([e713de3](https://github.com/altertable-ai/tantiny/commit/e713de385fea5a8060e615c173e9097871e052fc))
|
|
9
|
+
|
|
3
10
|
## [0.4.0](https://github.com/altertable-ai/tantiny/compare/tantiny-v0.3.3...tantiny/v0.4.0) (2025-11-01)
|
|
4
11
|
|
|
5
12
|
[Resume development & transfer ownership](https://github.com/altertable-ai/tantiny/pull/1) following https://github.com/baygeldin/tantiny/pull/24
|
data/Cargo.toml
CHANGED
data/lib/tantiny/query.rb
CHANGED
|
@@ -118,9 +118,8 @@ module Tantiny
|
|
|
118
118
|
disjunction(*field_queries).boost(boost_factor)
|
|
119
119
|
end
|
|
120
120
|
|
|
121
|
-
def highlight(text, query_string, fuzzy_distance: 0, tokenizer: Tantiny::Tokenizer.new(:simple))
|
|
122
|
-
|
|
123
|
-
__highlight(text.to_s, terms, fuzzy_distance)
|
|
121
|
+
def highlight(text, query_string, fuzzy_distance: 0, tokenizer: Tantiny::Tokenizer.new(:simple), last_term_min_length_prefix_match: 3)
|
|
122
|
+
__highlight(text.to_s, query_string.to_s, fuzzy_distance.to_i, tokenizer, last_term_min_length_prefix_match.to_i)
|
|
124
123
|
end
|
|
125
124
|
|
|
126
125
|
private
|
data/lib/tantiny/version.rb
CHANGED
data/lib/tantiny.so
CHANGED
|
Binary file
|
data/src/query.rs
CHANGED
|
@@ -7,6 +7,7 @@ use tantivy::Term;
|
|
|
7
7
|
use time::OffsetDateTime;
|
|
8
8
|
|
|
9
9
|
use crate::index::Index;
|
|
10
|
+
use crate::tokenizer::Tokenizer;
|
|
10
11
|
|
|
11
12
|
#[magnus::wrap(class = "Tantiny::Query", free_immediately, size)]
|
|
12
13
|
pub struct Query(Box<dyn tantivy::query::Query>);
|
|
@@ -213,27 +214,38 @@ impl Query {
|
|
|
213
214
|
Query(Box::new(query))
|
|
214
215
|
}
|
|
215
216
|
|
|
216
|
-
fn highlight(
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
217
|
+
fn highlight(
|
|
218
|
+
text: String,
|
|
219
|
+
query_string: String,
|
|
220
|
+
fuzzy_distance: i64,
|
|
221
|
+
tokenizer: &Tokenizer,
|
|
222
|
+
last_term_min_length_prefix_match: i64,
|
|
223
|
+
) -> Result<String, Error> {
|
|
224
|
+
use tantivy::tokenizer::TokenStream;
|
|
225
|
+
let mut analyzer = tokenizer.get_analyzer();
|
|
226
|
+
|
|
227
|
+
// Tokenize the query string
|
|
228
|
+
let query_tokens = {
|
|
229
|
+
let mut query_string_token_stream = analyzer.token_stream(&query_string);
|
|
230
|
+
let mut tokens = Vec::new();
|
|
231
|
+
while query_string_token_stream.advance() {
|
|
232
|
+
let token = query_string_token_stream.token();
|
|
233
|
+
tokens.push(token.text.clone());
|
|
234
|
+
}
|
|
235
|
+
tokens
|
|
236
|
+
};
|
|
223
237
|
|
|
224
238
|
// Tokenize the input text
|
|
225
|
-
let mut
|
|
226
|
-
|
|
227
|
-
// Collect all tokens with their positions
|
|
239
|
+
let mut input_text_token_stream = analyzer.token_stream(&text);
|
|
228
240
|
let mut tokens = Vec::new();
|
|
229
|
-
while
|
|
230
|
-
let token =
|
|
241
|
+
while input_text_token_stream.advance() {
|
|
242
|
+
let token = input_text_token_stream.token();
|
|
231
243
|
tokens.push((token.text.clone(), token.offset_from, token.offset_to));
|
|
232
244
|
}
|
|
233
245
|
|
|
234
246
|
// Build Levenshtein automata for each term (same algorithm as Tantivy's FuzzyTermQuery)
|
|
235
247
|
let lev_builder = LevenshteinAutomatonBuilder::new(fuzzy_distance as u8, true);
|
|
236
|
-
let automata: Vec<_> =
|
|
248
|
+
let automata: Vec<_> = query_tokens
|
|
237
249
|
.iter()
|
|
238
250
|
.map(|term| lev_builder.build_dfa(term))
|
|
239
251
|
.collect();
|
|
@@ -244,7 +256,7 @@ impl Query {
|
|
|
244
256
|
|
|
245
257
|
for (token_text, start, end) in tokens {
|
|
246
258
|
// Check if this token matches any of the query terms (exact or fuzzy)
|
|
247
|
-
let
|
|
259
|
+
let fuzzy_match = query_tokens.iter().zip(&automata).any(|(term, dfa)| {
|
|
248
260
|
// Exact match
|
|
249
261
|
if token_text.eq_ignore_ascii_case(term) {
|
|
250
262
|
return true;
|
|
@@ -254,6 +266,14 @@ impl Query {
|
|
|
254
266
|
matches!(dfa.eval(&token_text), Distance::Exact(_))
|
|
255
267
|
});
|
|
256
268
|
|
|
269
|
+
// Check if the last query term is a prefix of this token (only for tokens longer than the configured length threshold)
|
|
270
|
+
let prefix_match = token_text.len() > last_term_min_length_prefix_match as usize
|
|
271
|
+
&& query_tokens
|
|
272
|
+
.last()
|
|
273
|
+
.map(|last_token| token_text.starts_with(last_token))
|
|
274
|
+
.unwrap_or(false);
|
|
275
|
+
let should_highlight = fuzzy_match || prefix_match;
|
|
276
|
+
|
|
257
277
|
// Add the text before the token
|
|
258
278
|
result.push_str(&text[last_pos..start]);
|
|
259
279
|
|
|
@@ -297,7 +317,7 @@ pub fn init(ruby: &Ruby, module: RModule) -> Result<(), Error> {
|
|
|
297
317
|
class.define_singleton_method("__conjunction", magnus::function!(Query::conjunction, 1))?;
|
|
298
318
|
class.define_method("__negation", magnus::method!(Query::negation, 0))?;
|
|
299
319
|
class.define_method("__boost", magnus::method!(Query::boost, 1))?;
|
|
300
|
-
class.define_singleton_method("__highlight", magnus::function!(Query::highlight,
|
|
320
|
+
class.define_singleton_method("__highlight", magnus::function!(Query::highlight, 5))?;
|
|
301
321
|
|
|
302
322
|
Ok(())
|
|
303
323
|
}
|