tantiny 0.4.0 → 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 32334d17636719a204b09795443ba26989c1511e515965649af0e92aa0ee5d5a
4
- data.tar.gz: 2c596b09325d57012e7987e5c8eba5eb8e9e81f93f5fa3d99859567407f10c9f
3
+ metadata.gz: 3aeb0d870c24b42e8bf9b607e6ccb6bccff26b0f5f671a84e248d5465008d268
4
+ data.tar.gz: ea8fc4e425bdeeeb16d6a7db8be220b2742ad9c035b1b1a8c4c365a4ac0cd0f1
5
5
  SHA512:
6
- metadata.gz: 797d85d76769bf0165f8ecc81d652890d0603806b005de9cece8a3bb6b8b0f6866b4b53fd42caee0738cc43cc9b2e383b1f08ab28f1e706c6231c908bd7334dc
7
- data.tar.gz: c683bcb69c47af11da1020cffaa40a9aad40eef358e8c87674b39a9678600f987606db0945632b0d604db4c66a7d634c980fd9446f2674a53c5d42692e4e5913
6
+ metadata.gz: cf150617442ea2f5829de2a1e4e514310295775cfaad414f9438815fd9a5ab5746d161b69fcb87da1d8ee9a8fbf68b3b7a033f76f49a639e35d06633cc9dddda
7
+ data.tar.gz: 3a29e2db6f5ba7a71385185058b082ca2420a5dec2805bab73e947c8532ef6f67fde38a61f0d4bf2ef1ced0506e29b7ec75575b58fe7cb3dcaf7e9ef6b07ef25
data/CHANGELOG.md CHANGED
@@ -1,5 +1,12 @@
1
1
  # Changelog
2
2
 
3
+ ## [0.4.1](https://github.com/altertable-ai/tantiny/compare/tantiny/v0.4.0...tantiny/v0.4.1) (2025-11-03)
4
+
5
+
6
+ ### Bug Fixes
7
+
8
+ * **highlighting:** ensure the same tokenizer is used for text & query + handle last term as prefix ([#7](https://github.com/altertable-ai/tantiny/issues/7)) ([e713de3](https://github.com/altertable-ai/tantiny/commit/e713de385fea5a8060e615c173e9097871e052fc))
9
+
3
10
  ## [0.4.0](https://github.com/altertable-ai/tantiny/compare/tantiny-v0.3.3...tantiny/v0.4.0) (2025-11-01)
4
11
 
5
12
  [Resume development & transfer ownership](https://github.com/altertable-ai/tantiny/pull/1) following https://github.com/baygeldin/tantiny/pull/24
data/Cargo.toml CHANGED
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "tantiny"
3
- version = "0.4.0" # {x-release-please-version}
3
+ version = "0.4.1" # {x-release-please-version}
4
4
  edition = "2021"
5
5
  authors = ["Sylvain Utard", "Alexander Baygeldin"]
6
6
  repository = "https://github.com/altertable-ai/tantiny"
data/lib/tantiny/query.rb CHANGED
@@ -118,9 +118,8 @@ module Tantiny
118
118
  disjunction(*field_queries).boost(boost_factor)
119
119
  end
120
120
 
121
- def highlight(text, query_string, fuzzy_distance: 0, tokenizer: Tantiny::Tokenizer.new(:simple))
122
- terms = tokenizer.terms(query_string).map(&:to_s)
123
- __highlight(text.to_s, terms, fuzzy_distance)
121
+ def highlight(text, query_string, fuzzy_distance: 0, tokenizer: Tantiny::Tokenizer.new(:simple), last_term_min_length_prefix_match: 3)
122
+ __highlight(text.to_s, query_string.to_s, fuzzy_distance.to_i, tokenizer, last_term_min_length_prefix_match.to_i)
124
123
  end
125
124
 
126
125
  private
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Tantiny
4
- VERSION = "0.4.0" # {x-release-please-version}
4
+ VERSION = "0.4.1" # {x-release-please-version}
5
5
  end
data/lib/tantiny.so CHANGED
Binary file
data/src/query.rs CHANGED
@@ -7,6 +7,7 @@ use tantivy::Term;
7
7
  use time::OffsetDateTime;
8
8
 
9
9
  use crate::index::Index;
10
+ use crate::tokenizer::Tokenizer;
10
11
 
11
12
  #[magnus::wrap(class = "Tantiny::Query", free_immediately, size)]
12
13
  pub struct Query(Box<dyn tantivy::query::Query>);
@@ -213,27 +214,38 @@ impl Query {
213
214
  Query(Box::new(query))
214
215
  }
215
216
 
216
- fn highlight(text: String, terms: Vec<String>, fuzzy_distance: i64) -> Result<String, Error> {
217
- use tantivy::tokenizer::{LowerCaser, SimpleTokenizer, TextAnalyzer, TokenStream};
218
-
219
- // Create a simple tokenizer for highlighting
220
- let mut analyzer = TextAnalyzer::builder(SimpleTokenizer::default())
221
- .filter(LowerCaser)
222
- .build();
217
+ fn highlight(
218
+ text: String,
219
+ query_string: String,
220
+ fuzzy_distance: i64,
221
+ tokenizer: &Tokenizer,
222
+ last_term_min_length_prefix_match: i64,
223
+ ) -> Result<String, Error> {
224
+ use tantivy::tokenizer::TokenStream;
225
+ let mut analyzer = tokenizer.get_analyzer();
226
+
227
+ // Tokenize the query string
228
+ let query_tokens = {
229
+ let mut query_string_token_stream = analyzer.token_stream(&query_string);
230
+ let mut tokens = Vec::new();
231
+ while query_string_token_stream.advance() {
232
+ let token = query_string_token_stream.token();
233
+ tokens.push(token.text.clone());
234
+ }
235
+ tokens
236
+ };
223
237
 
224
238
  // Tokenize the input text
225
- let mut token_stream = analyzer.token_stream(&text);
226
-
227
- // Collect all tokens with their positions
239
+ let mut input_text_token_stream = analyzer.token_stream(&text);
228
240
  let mut tokens = Vec::new();
229
- while token_stream.advance() {
230
- let token = token_stream.token();
241
+ while input_text_token_stream.advance() {
242
+ let token = input_text_token_stream.token();
231
243
  tokens.push((token.text.clone(), token.offset_from, token.offset_to));
232
244
  }
233
245
 
234
246
  // Build Levenshtein automata for each term (same algorithm as Tantivy's FuzzyTermQuery)
235
247
  let lev_builder = LevenshteinAutomatonBuilder::new(fuzzy_distance as u8, true);
236
- let automata: Vec<_> = terms
248
+ let automata: Vec<_> = query_tokens
237
249
  .iter()
238
250
  .map(|term| lev_builder.build_dfa(term))
239
251
  .collect();
@@ -244,7 +256,7 @@ impl Query {
244
256
 
245
257
  for (token_text, start, end) in tokens {
246
258
  // Check if this token matches any of the query terms (exact or fuzzy)
247
- let should_highlight = terms.iter().zip(&automata).any(|(term, dfa)| {
259
+ let fuzzy_match = query_tokens.iter().zip(&automata).any(|(term, dfa)| {
248
260
  // Exact match
249
261
  if token_text.eq_ignore_ascii_case(term) {
250
262
  return true;
@@ -254,6 +266,14 @@ impl Query {
254
266
  matches!(dfa.eval(&token_text), Distance::Exact(_))
255
267
  });
256
268
 
269
+ // Check if this token is a prefix match for the last query term
270
+ let prefix_match = token_text.len() > last_term_min_length_prefix_match as usize
271
+ && query_tokens
272
+ .last()
273
+ .map(|last_token| token_text.starts_with(last_token))
274
+ .unwrap_or(false);
275
+ let should_highlight = fuzzy_match || prefix_match;
276
+
257
277
  // Add the text before the token
258
278
  result.push_str(&text[last_pos..start]);
259
279
 
@@ -297,7 +317,7 @@ pub fn init(ruby: &Ruby, module: RModule) -> Result<(), Error> {
297
317
  class.define_singleton_method("__conjunction", magnus::function!(Query::conjunction, 1))?;
298
318
  class.define_method("__negation", magnus::method!(Query::negation, 0))?;
299
319
  class.define_method("__boost", magnus::method!(Query::boost, 1))?;
300
- class.define_singleton_method("__highlight", magnus::function!(Query::highlight, 3))?;
320
+ class.define_singleton_method("__highlight", magnus::function!(Query::highlight, 5))?;
301
321
 
302
322
  Ok(())
303
323
  }
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tantiny
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.0
4
+ version: 0.4.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Sylvain Utard