tantiny 0.4.0 → 0.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 32334d17636719a204b09795443ba26989c1511e515965649af0e92aa0ee5d5a
4
- data.tar.gz: 2c596b09325d57012e7987e5c8eba5eb8e9e81f93f5fa3d99859567407f10c9f
3
+ metadata.gz: c687129e0e6f9b4ed586cbdf6dac036e5e15bfc1acba54315688179430c68ba7
4
+ data.tar.gz: 2738e1e30e118035bc92382282b77f726300dd69bd1e6284f9831c3ee9349e60
5
5
  SHA512:
6
- metadata.gz: 797d85d76769bf0165f8ecc81d652890d0603806b005de9cece8a3bb6b8b0f6866b4b53fd42caee0738cc43cc9b2e383b1f08ab28f1e706c6231c908bd7334dc
7
- data.tar.gz: c683bcb69c47af11da1020cffaa40a9aad40eef358e8c87674b39a9678600f987606db0945632b0d604db4c66a7d634c980fd9446f2674a53c5d42692e4e5913
6
+ metadata.gz: cea8e693d0cc014ce22bb3ab8f419e8ce31b448d81ba41205fa1114367279826071d04036529f6e95d386f32775d3c8e4aed8bc87896dc89b085496d1e3026ca
7
+ data.tar.gz: 36ae0a91d8339ffb694824c9157180ef12d05aa3e173ebd172e65bfce952e5bd8fc89b8e056f8ee040c5e4ca83ade238e8033ad17608c213b382a6fd959b1227
data/CHANGELOG.md CHANGED
@@ -1,5 +1,19 @@
1
1
  # Changelog
2
2
 
3
+ ## [0.4.2](https://github.com/altertable-ai/tantiny/compare/tantiny/v0.4.1...tantiny/v0.4.2) (2025-11-04)
4
+
5
+
6
+ ### Bug Fixes
7
+
8
+ * **highlighting:** only highlight the matching prefix ([#9](https://github.com/altertable-ai/tantiny/issues/9)) ([8ce9bbe](https://github.com/altertable-ai/tantiny/commit/8ce9bbe56392ed055b56ee7118427e41c1140222))
9
+
10
+ ## [0.4.1](https://github.com/altertable-ai/tantiny/compare/tantiny/v0.4.0...tantiny/v0.4.1) (2025-11-03)
11
+
12
+
13
+ ### Bug Fixes
14
+
15
+ * **highlighting:** ensure the same tokenizer is used for text & query + handle last term as prefix ([#7](https://github.com/altertable-ai/tantiny/issues/7)) ([e713de3](https://github.com/altertable-ai/tantiny/commit/e713de385fea5a8060e615c173e9097871e052fc))
16
+
3
17
  ## [0.4.0](https://github.com/altertable-ai/tantiny/compare/tantiny-v0.3.3...tantiny/v0.4.0) (2025-11-01)
4
18
 
5
19
  [Resume development & transfer ownership](https://github.com/altertable-ai/tantiny/pull/1) following https://github.com/baygeldin/tantiny/pull/24
data/Cargo.toml CHANGED
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "tantiny"
3
- version = "0.4.0" # {x-release-please-version}
3
+ version = "0.4.2" # {x-release-please-version}
4
4
  edition = "2021"
5
5
  authors = ["Sylvain Utard", "Alexander Baygeldin"]
6
6
  repository = "https://github.com/altertable-ai/tantiny"
data/lib/tantiny/query.rb CHANGED
@@ -118,9 +118,8 @@ module Tantiny
118
118
  disjunction(*field_queries).boost(boost_factor)
119
119
  end
120
120
 
121
- def highlight(text, query_string, fuzzy_distance: 0, tokenizer: Tantiny::Tokenizer.new(:simple))
122
- terms = tokenizer.terms(query_string).map(&:to_s)
123
- __highlight(text.to_s, terms, fuzzy_distance)
121
+ def highlight(text, query_string, fuzzy_distance: 0, tokenizer: Tantiny::Tokenizer.new(:simple), last_term_min_length_prefix_match: 3)
122
+ __highlight(text.to_s, query_string.to_s, fuzzy_distance.to_i, tokenizer, last_term_min_length_prefix_match.to_i)
124
123
  end
125
124
 
126
125
  private
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Tantiny
4
- VERSION = "0.4.0" # {x-release-please-version}
4
+ VERSION = "0.4.2" # {x-release-please-version}
5
5
  end
data/lib/tantiny.so CHANGED
Binary file
data/src/query.rs CHANGED
@@ -7,6 +7,7 @@ use tantivy::Term;
7
7
  use time::OffsetDateTime;
8
8
 
9
9
  use crate::index::Index;
10
+ use crate::tokenizer::Tokenizer;
10
11
 
11
12
  #[magnus::wrap(class = "Tantiny::Query", free_immediately, size)]
12
13
  pub struct Query(Box<dyn tantivy::query::Query>);
@@ -213,27 +214,38 @@ impl Query {
213
214
  Query(Box::new(query))
214
215
  }
215
216
 
216
- fn highlight(text: String, terms: Vec<String>, fuzzy_distance: i64) -> Result<String, Error> {
217
- use tantivy::tokenizer::{LowerCaser, SimpleTokenizer, TextAnalyzer, TokenStream};
218
-
219
- // Create a simple tokenizer for highlighting
220
- let mut analyzer = TextAnalyzer::builder(SimpleTokenizer::default())
221
- .filter(LowerCaser)
222
- .build();
217
+ fn highlight(
218
+ text: String,
219
+ query_string: String,
220
+ fuzzy_distance: i64,
221
+ tokenizer: &Tokenizer,
222
+ last_term_min_length_prefix_match: i64,
223
+ ) -> Result<String, Error> {
224
+ use tantivy::tokenizer::TokenStream;
225
+ let mut analyzer = tokenizer.get_analyzer();
226
+
227
+ // Tokenize the query string
228
+ let query_tokens = {
229
+ let mut query_string_token_stream = analyzer.token_stream(&query_string);
230
+ let mut tokens = Vec::new();
231
+ while query_string_token_stream.advance() {
232
+ let token = query_string_token_stream.token();
233
+ tokens.push(token.text.clone());
234
+ }
235
+ tokens
236
+ };
223
237
 
224
238
  // Tokenize the input text
225
- let mut token_stream = analyzer.token_stream(&text);
226
-
227
- // Collect all tokens with their positions
239
+ let mut input_text_token_stream = analyzer.token_stream(&text);
228
240
  let mut tokens = Vec::new();
229
- while token_stream.advance() {
230
- let token = token_stream.token();
241
+ while input_text_token_stream.advance() {
242
+ let token = input_text_token_stream.token();
231
243
  tokens.push((token.text.clone(), token.offset_from, token.offset_to));
232
244
  }
233
245
 
234
246
  // Build Levenshtein automata for each term (same algorithm as Tantivy's FuzzyTermQuery)
235
247
  let lev_builder = LevenshteinAutomatonBuilder::new(fuzzy_distance as u8, true);
236
- let automata: Vec<_> = terms
248
+ let automata: Vec<_> = query_tokens
237
249
  .iter()
238
250
  .map(|term| lev_builder.build_dfa(term))
239
251
  .collect();
@@ -244,7 +256,7 @@ impl Query {
244
256
 
245
257
  for (token_text, start, end) in tokens {
246
258
  // Check if this token matches any of the query terms (exact or fuzzy)
247
- let should_highlight = terms.iter().zip(&automata).any(|(term, dfa)| {
259
+ let fuzzy_match = query_tokens.iter().zip(&automata).any(|(term, dfa)| {
248
260
  // Exact match
249
261
  if token_text.eq_ignore_ascii_case(term) {
250
262
  return true;
@@ -254,14 +266,29 @@ impl Query {
254
266
  matches!(dfa.eval(&token_text), Distance::Exact(_))
255
267
  });
256
268
 
269
+ // Check if this token starts with the last query term (i.e. the last term is a prefix of the token)
270
+ let prefix_match = token_text.len() > last_term_min_length_prefix_match as usize
271
+ && query_tokens
272
+ .last()
273
+ .map(|last_token| token_text.starts_with(last_token))
274
+ .unwrap_or(false);
275
+
257
276
  // Add the text before the token
258
277
  result.push_str(&text[last_pos..start]);
259
278
 
260
279
  // Add the token, highlighted if it matches
261
- if should_highlight {
280
+ if fuzzy_match {
262
281
  result.push_str("<b>");
263
282
  result.push_str(&text[start..end]);
264
283
  result.push_str("</b>");
284
+ } else if prefix_match {
285
+ let last_token = query_tokens
286
+ .last()
287
+ .expect("Last token is present when prefix_match");
288
+ result.push_str("<b>");
289
+ result.push_str(&text[start..start + last_token.len()]);
290
+ result.push_str("</b>");
291
+ result.push_str(&text[start + last_token.len()..end]);
265
292
  } else {
266
293
  result.push_str(&text[start..end]);
267
294
  }
@@ -297,7 +324,7 @@ pub fn init(ruby: &Ruby, module: RModule) -> Result<(), Error> {
297
324
  class.define_singleton_method("__conjunction", magnus::function!(Query::conjunction, 1))?;
298
325
  class.define_method("__negation", magnus::method!(Query::negation, 0))?;
299
326
  class.define_method("__boost", magnus::method!(Query::boost, 1))?;
300
- class.define_singleton_method("__highlight", magnus::function!(Query::highlight, 3))?;
327
+ class.define_singleton_method("__highlight", magnus::function!(Query::highlight, 5))?;
301
328
 
302
329
  Ok(())
303
330
  }
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tantiny
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.0
4
+ version: 0.4.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Sylvain Utard