tantiny 0.3.3 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/src/index.rs CHANGED
@@ -1,77 +1,65 @@
1
+ use magnus::{r_hash::ForEach, Error, Module, Object, RHash, RModule, Ruby, TryConvert, Value};
2
+ use std::cell::RefCell;
1
3
  use std::collections::HashMap;
2
- use std::str::FromStr;
3
- use rutie::{methods, Object, AnyObject, Integer, NilClass, Array, RString, Hash};
4
- use tantivy::{doc, Document, Term, ReloadPolicy, Index, IndexWriter, IndexReader, DateTime};
5
- use tantivy::schema::{Schema, TextOptions, TextFieldIndexing, IndexRecordOption, FacetOptions, STRING, STORED, INDEXED, FAST};
6
4
  use tantivy::collector::TopDocs;
7
5
  use tantivy::directory::MmapDirectory;
8
-
9
- use crate::helpers::{scaffold, try_unwrap_params, TryUnwrap};
10
- use crate::query::{unwrap_query, RTantinyQuery};
11
- use crate::tokenizer::{unwrap_tokenizer, RTantinyTokenizer};
12
-
13
- pub struct TantinyIndex {
14
- pub(crate) schema: Schema,
15
- pub(crate) index: Index,
16
- pub(crate) index_writer: Option<IndexWriter>,
17
- pub(crate) index_reader: IndexReader,
6
+ use tantivy::schema::{
7
+ FacetOptions, IndexRecordOption, Schema, TextFieldIndexing, TextOptions, Value as TantivyValue,
8
+ FAST, INDEXED, STORED, STRING,
9
+ };
10
+ use tantivy::{IndexReader, IndexWriter, ReloadPolicy, TantivyDocument, Term};
11
+ use time::OffsetDateTime;
12
+
13
+ use crate::helpers::hash_to_multivalue_map;
14
+ use crate::query::Query;
15
+ use crate::tokenizer::Tokenizer;
16
+
17
+ #[magnus::wrap(class = "Tantiny::Index", free_immediately, size)]
18
+ pub struct Index {
19
+ pub schema: Schema,
20
+ index: tantivy::Index,
21
+ index_writer: RefCell<Option<IndexWriter>>,
22
+ index_reader: IndexReader,
18
23
  }
19
24
 
20
- scaffold!(RTantinyIndex, TantinyIndex, "Index");
25
+ impl Index {
26
+ #[allow(clippy::too_many_arguments)]
27
+ fn new(
28
+ path: Option<String>,
29
+ default_tokenizer: &Tokenizer,
30
+ field_tokenizers: RHash,
31
+ text_fields: Vec<String>,
32
+ string_fields: Vec<String>,
33
+ integer_fields: Vec<String>,
34
+ double_fields: Vec<String>,
35
+ date_fields: Vec<String>,
36
+ facet_fields: Vec<String>,
37
+ ) -> Result<Self, Error> {
38
+ let ruby = unsafe { Ruby::get_unchecked() };
39
+ let field_tokenizers_map: HashMap<String, &Tokenizer> = {
40
+ let mut map = HashMap::new();
41
+ field_tokenizers.foreach(|key: String, value: Value| {
42
+ let tokenizer: &Tokenizer = <&Tokenizer>::try_convert(value)?;
43
+ map.insert(key, tokenizer);
44
+ Ok(ForEach::Continue)
45
+ })?;
46
+ map
47
+ };
21
48
 
22
- pub(crate) fn unwrap_index(index: &RTantinyIndex) -> &TantinyIndex {
23
- index.get_data(&*TANTINY_INDEX_WRAPPER)
24
- }
25
-
26
- pub(crate) fn unwrap_index_mut(index: &mut RTantinyIndex) -> &mut TantinyIndex {
27
- index.get_data_mut(&*TANTINY_INDEX_WRAPPER)
28
- }
29
-
30
- #[rustfmt::skip::macros(methods)]
31
- methods!(
32
- RTantinyIndex,
33
- _itself,
34
-
35
- fn new_index(
36
- path: RString,
37
- default_tokenizer: AnyObject,
38
- field_tokenizers: Hash,
39
- text_fields: Array,
40
- string_fields: Array,
41
- integer_fields: Array,
42
- double_fields: Array,
43
- date_fields: Array,
44
- facet_fields: Array
45
- ) -> RTantinyIndex {
46
- try_unwrap_params!(
47
- path: String,
48
- default_tokenizer: RTantinyTokenizer,
49
- field_tokenizers: HashMap<String, RTantinyTokenizer>,
50
- text_fields: Vec<String>,
51
- string_fields: Vec<String>,
52
- integer_fields: Vec<String>,
53
- double_fields: Vec<String>,
54
- date_fields: Vec<String>,
55
- facet_fields: Vec<String>
56
- );
57
-
58
- let index_path = MmapDirectory::open(path).try_unwrap();
59
49
  let mut schema_builder = Schema::builder();
60
50
 
61
51
  schema_builder.add_text_field("id", STRING | STORED);
62
52
 
63
53
  for field in text_fields {
64
- let tokenizer_name =
65
- if field_tokenizers.contains_key(&field) {
66
- &*field
67
- } else {
68
- "default"
69
- };
54
+ let tokenizer_name = if field_tokenizers_map.contains_key(&field) {
55
+ &field
56
+ } else {
57
+ "default"
58
+ };
70
59
  let indexing = TextFieldIndexing::default()
71
60
  .set_tokenizer(tokenizer_name)
72
61
  .set_index_option(IndexRecordOption::WithFreqsAndPositions);
73
- let options = TextOptions::default()
74
- .set_indexing_options(indexing);
62
+ let options = TextOptions::default().set_indexing_options(indexing);
75
63
  schema_builder.add_text_field(&field, options);
76
64
  }
77
65
 
@@ -92,197 +80,322 @@ methods!(
92
80
  }
93
81
 
94
82
  for field in facet_fields {
95
- let options = FacetOptions::default().set_indexed();
96
- schema_builder.add_facet_field(&field, options);
83
+ schema_builder.add_facet_field(&field, FacetOptions::default());
97
84
  }
98
85
 
99
86
  let schema = schema_builder.build();
100
- let index = Index::open_or_create(index_path, schema.clone()).try_unwrap();
87
+
88
+ // Create index based on whether path is provided
89
+ let index = match path {
90
+ Some(path_str) => {
91
+ let index_path = MmapDirectory::open(path_str).map_err(|e| {
92
+ Error::new(
93
+ ruby.exception_runtime_error(),
94
+ format!("Failed to open directory: {}", e),
95
+ )
96
+ })?;
97
+ tantivy::Index::open_or_create(index_path, schema.clone()).map_err(|e| {
98
+ Error::new(
99
+ ruby.exception_runtime_error(),
100
+ format!("Failed to create index: {}", e),
101
+ )
102
+ })?
103
+ }
104
+ None => {
105
+ // Create in-memory index
106
+ tantivy::Index::create_in_ram(schema.clone())
107
+ }
108
+ };
109
+
110
+ // Access the tokenizers field before moving index
101
111
  let tokenizers = index.tokenizers();
102
112
 
103
- tokenizers.register("default", unwrap_tokenizer(&default_tokenizer).clone());
113
+ // Register tokenizers
114
+ tokenizers.register("default", default_tokenizer.get_analyzer());
104
115
 
105
- for (field, tokenizer) in field_tokenizers {
106
- tokenizers.register(&field, unwrap_tokenizer(&tokenizer).clone())
116
+ for (field, tokenizer) in field_tokenizers_map {
117
+ tokenizers.register(&field, tokenizer.get_analyzer())
107
118
  }
108
119
 
109
- let index_writer = None;
110
-
111
120
  let index_reader = index
112
121
  .reader_builder()
113
122
  .reload_policy(ReloadPolicy::Manual)
114
123
  .try_into()
115
- .try_unwrap();
116
-
117
- klass().wrap_data(
118
- TantinyIndex { index, index_writer, index_reader, schema },
119
- &*TANTINY_INDEX_WRAPPER
120
- )
124
+ .map_err(|e| {
125
+ Error::new(
126
+ ruby.exception_runtime_error(),
127
+ format!("Failed to create reader: {}", e),
128
+ )
129
+ })?;
130
+
131
+ Ok(Index {
132
+ schema,
133
+ index,
134
+ index_writer: RefCell::new(None),
135
+ index_reader,
136
+ })
121
137
  }
122
138
 
139
+ #[allow(clippy::too_many_arguments)]
123
140
  fn add_document(
124
- id: RString,
125
- text_fields: Hash,
126
- string_fields: Hash,
127
- integer_fields: Hash,
128
- double_fields: Hash,
129
- date_fields: Hash,
130
- facet_fields: Hash
131
- ) -> NilClass {
132
- try_unwrap_params!(
133
- id: String,
134
- text_fields: HashMap<String, String>,
135
- string_fields: HashMap<String, String>,
136
- integer_fields: HashMap<String, i64>,
137
- double_fields: HashMap<String, f64>,
138
- date_fields: HashMap<String, String>,
139
- facet_fields: HashMap<String, String>
140
- );
141
-
142
- let internal = unwrap_index(&_itself);
143
- let index_writer = internal.index_writer.as_ref().try_unwrap();
144
- let schema = &internal.schema;
145
-
146
- let mut doc = Document::default();
147
-
148
- let id_field = schema.get_field("id").try_unwrap();
141
+ &self,
142
+ id: String,
143
+ text_fields: RHash,
144
+ string_fields: RHash,
145
+ integer_fields: RHash,
146
+ double_fields: RHash,
147
+ date_fields: RHash,
148
+ facet_fields: RHash,
149
+ ) -> Result<(), Error> {
150
+ let ruby = unsafe { Ruby::get_unchecked() };
151
+ let index_writer = self.index_writer.borrow();
152
+ let index_writer = index_writer.as_ref().ok_or_else(|| {
153
+ Error::new(ruby.exception_runtime_error(), "No index writer available")
154
+ })?;
155
+
156
+ let text_map: HashMap<String, Vec<String>> = hash_to_multivalue_map(text_fields)?;
157
+ let string_map: HashMap<String, Vec<String>> = hash_to_multivalue_map(string_fields)?;
158
+ let integer_map: HashMap<String, Vec<i64>> = hash_to_multivalue_map(integer_fields)?;
159
+ let double_map: HashMap<String, Vec<f64>> = hash_to_multivalue_map(double_fields)?;
160
+ let date_map: HashMap<String, Vec<String>> = hash_to_multivalue_map(date_fields)?;
161
+ let facet_map: HashMap<String, Vec<String>> = hash_to_multivalue_map(facet_fields)?;
162
+
163
+ let mut doc = TantivyDocument::default();
164
+
165
+ let id_field = self.schema.get_field("id").map_err(|e| {
166
+ Error::new(
167
+ ruby.exception_runtime_error(),
168
+ format!("Failed to get id field: {}", e),
169
+ )
170
+ })?;
149
171
  doc.add_text(id_field, &id);
150
172
 
151
- for (key, value) in text_fields.iter() {
152
- let field = schema.get_field(key).try_unwrap();
153
- doc.add_text(field, value);
173
+ for (key, values) in text_map.iter() {
174
+ let field = self.schema.get_field(key).map_err(|e| {
175
+ Error::new(
176
+ ruby.exception_runtime_error(),
177
+ format!("Failed to get field {}: {}", key, e),
178
+ )
179
+ })?;
180
+ for value in values {
181
+ doc.add_text(field, value);
182
+ }
154
183
  }
155
184
 
156
- for (key, value) in string_fields.iter() {
157
- let field = schema.get_field(key).try_unwrap();
158
- doc.add_text(field, value);
185
+ for (key, values) in string_map.iter() {
186
+ let field = self.schema.get_field(key).map_err(|e| {
187
+ Error::new(
188
+ ruby.exception_runtime_error(),
189
+ format!("Failed to get field {}: {}", key, e),
190
+ )
191
+ })?;
192
+ for value in values {
193
+ doc.add_text(field, value);
194
+ }
159
195
  }
160
196
 
161
- for (key, &value) in integer_fields.iter() {
162
- let field = schema.get_field(key).try_unwrap();
163
- doc.add_i64(field, value);
197
+ for (key, values) in integer_map.iter() {
198
+ let field = self.schema.get_field(key).map_err(|e| {
199
+ Error::new(
200
+ ruby.exception_runtime_error(),
201
+ format!("Failed to get field {}: {}", key, e),
202
+ )
203
+ })?;
204
+ for &value in values {
205
+ doc.add_i64(field, value);
206
+ }
164
207
  }
165
208
 
166
- for (key, &value) in double_fields.iter() {
167
- let field = schema.get_field(key).try_unwrap();
168
- doc.add_f64(field, value);
209
+ for (key, values) in double_map.iter() {
210
+ let field = self.schema.get_field(key).map_err(|e| {
211
+ Error::new(
212
+ ruby.exception_runtime_error(),
213
+ format!("Failed to get field {}: {}", key, e),
214
+ )
215
+ })?;
216
+ for &value in values {
217
+ doc.add_f64(field, value);
218
+ }
169
219
  }
170
220
 
171
- for (key, value) in date_fields.iter() {
172
- let field = schema.get_field(key).try_unwrap();
173
- let value = DateTime::from_str(value).try_unwrap();
174
- doc.add_date(field, &value);
221
+ for (key, values) in date_map.iter() {
222
+ let field = self.schema.get_field(key).map_err(|e| {
223
+ Error::new(
224
+ ruby.exception_runtime_error(),
225
+ format!("Failed to get field {}: {}", key, e),
226
+ )
227
+ })?;
228
+ for value in values {
229
+ let datetime =
230
+ OffsetDateTime::parse(value, &time::format_description::well_known::Rfc3339)
231
+ .map_err(|e| {
232
+ Error::new(
233
+ ruby.exception_runtime_error(),
234
+ format!("Invalid date format: {}", e),
235
+ )
236
+ })?;
237
+ doc.add_date(
238
+ field,
239
+ tantivy::DateTime::from_timestamp_nanos(datetime.unix_timestamp_nanos() as i64),
240
+ );
241
+ }
175
242
  }
176
243
 
177
- for (key, value) in facet_fields.iter() {
178
- let field = schema.get_field(key).try_unwrap();
179
- doc.add_facet(field, &value);
244
+ for (key, values) in facet_map.iter() {
245
+ let field = self.schema.get_field(key).map_err(|e| {
246
+ Error::new(
247
+ ruby.exception_runtime_error(),
248
+ format!("Failed to get field {}: {}", key, e),
249
+ )
250
+ })?;
251
+ for value in values {
252
+ doc.add_facet(field, value);
253
+ }
180
254
  }
181
255
 
182
256
  let doc_id = Term::from_field_text(id_field, &id);
183
257
  index_writer.delete_term(doc_id.clone());
184
-
185
- index_writer.add_document(doc);
186
-
187
- NilClass::new()
258
+ index_writer.add_document(doc).map_err(|e| {
259
+ Error::new(
260
+ ruby.exception_runtime_error(),
261
+ format!("Failed to add document: {}", e),
262
+ )
263
+ })?;
264
+
265
+ Ok(())
188
266
  }
189
267
 
190
- fn delete_document(id: RString) -> NilClass {
191
- try_unwrap_params!(id: String);
192
-
193
- let internal = unwrap_index(&_itself);
194
- let index_writer = internal.index_writer.as_ref().unwrap();
195
-
196
- let id_field = internal.schema.get_field("id").try_unwrap();
268
+ fn delete_document(&self, id: String) -> Result<(), Error> {
269
+ let ruby = unsafe { Ruby::get_unchecked() };
270
+ let index_writer = self.index_writer.borrow();
271
+ let index_writer = index_writer.as_ref().ok_or_else(|| {
272
+ Error::new(ruby.exception_runtime_error(), "No index writer available")
273
+ })?;
274
+
275
+ let id_field = self.schema.get_field("id").map_err(|e| {
276
+ Error::new(
277
+ ruby.exception_runtime_error(),
278
+ format!("Failed to get id field: {}", e),
279
+ )
280
+ })?;
197
281
  let doc_id = Term::from_field_text(id_field, &id);
198
282
 
199
283
  index_writer.delete_term(doc_id.clone());
200
-
201
- NilClass::new()
284
+ Ok(())
202
285
  }
203
286
 
204
- fn acquire_index_writer(
205
- overall_memory: Integer
206
- ) -> NilClass {
207
- try_unwrap_params!(overall_memory: i64);
208
-
209
- let internal = unwrap_index_mut(&mut _itself);
210
-
211
- let mut index_writer = internal.index
212
- .writer(overall_memory as usize)
213
- .try_unwrap();
214
-
215
- internal.index_writer = Some(index_writer);
216
-
217
- NilClass::new()
287
+ fn acquire_index_writer(&self, overall_memory: i64) -> Result<(), Error> {
288
+ let ruby = unsafe { Ruby::get_unchecked() };
289
+ let index_writer = self.index.writer(overall_memory as usize).map_err(|e| {
290
+ Error::new(
291
+ ruby.exception_runtime_error(),
292
+ format!("Failed to create writer: {}", e),
293
+ )
294
+ })?;
295
+
296
+ *self.index_writer.borrow_mut() = Some(index_writer);
297
+ Ok(())
218
298
  }
219
299
 
220
- fn release_index_writer() -> NilClass {
221
- let internal = unwrap_index_mut(&mut _itself);
222
-
223
- drop(internal.index_writer.as_ref().try_unwrap());
224
- internal.index_writer = None;
225
-
226
- NilClass::new()
300
+ fn release_index_writer(&self) -> Result<(), Error> {
301
+ let ruby = unsafe { Ruby::get_unchecked() };
302
+ let mut writer = self.index_writer.borrow_mut();
303
+ if writer.is_none() {
304
+ return Err(Error::new(
305
+ ruby.exception_runtime_error(),
306
+ "No index writer to release",
307
+ ));
308
+ }
309
+ *writer = None;
310
+ Ok(())
227
311
  }
228
312
 
229
- fn commit() -> NilClass {
230
- let internal = unwrap_index_mut(&mut _itself);
231
- let index_writer = internal.index_writer.as_mut().try_unwrap();
232
-
233
- index_writer.commit().try_unwrap();
234
-
235
- NilClass::new()
313
+ fn commit(&self) -> Result<(), Error> {
314
+ let ruby = unsafe { Ruby::get_unchecked() };
315
+ let mut writer_cell = self.index_writer.borrow_mut();
316
+ let index_writer = writer_cell.as_mut().ok_or_else(|| {
317
+ Error::new(ruby.exception_runtime_error(), "No index writer available")
318
+ })?;
319
+
320
+ index_writer.commit().map_err(|e| {
321
+ Error::new(
322
+ ruby.exception_runtime_error(),
323
+ format!("Failed to commit: {}", e),
324
+ )
325
+ })?;
326
+ Ok(())
236
327
  }
237
328
 
238
- fn reload() -> NilClass {
239
- unwrap_index(&_itself).index_reader.reload().try_unwrap();
240
-
241
- NilClass::new()
329
+ fn reload(&self) -> Result<(), Error> {
330
+ let ruby = unsafe { Ruby::get_unchecked() };
331
+ self.index_reader.reload().map_err(|e| {
332
+ Error::new(
333
+ ruby.exception_runtime_error(),
334
+ format!("Failed to reload: {}", e),
335
+ )
336
+ })?;
337
+ Ok(())
242
338
  }
243
339
 
244
- fn search(
245
- query: AnyObject,
246
- limit: Integer
247
- ) -> Array {
248
- try_unwrap_params!(
249
- query: RTantinyQuery,
250
- limit: i64
251
- );
252
-
253
- let internal = unwrap_index(&_itself);
254
- let id_field = internal.schema.get_field("id").try_unwrap();
255
- let searcher = internal.index_reader.searcher();
256
- let query = unwrap_query(&query);
340
+ fn search(&self, query: &Query, limit: i64) -> Result<Vec<String>, Error> {
341
+ let ruby = unsafe { Ruby::get_unchecked() };
342
+ let id_field = self.schema.get_field("id").map_err(|e| {
343
+ Error::new(
344
+ ruby.exception_runtime_error(),
345
+ format!("Failed to get id field: {}", e),
346
+ )
347
+ })?;
348
+ let searcher = self.index_reader.searcher();
257
349
 
258
350
  let top_docs = searcher
259
- .search(query, &TopDocs::with_limit(limit as usize))
260
- .try_unwrap();
351
+ .search(query.get_query(), &TopDocs::with_limit(limit as usize))
352
+ .map_err(|e| {
353
+ Error::new(
354
+ ruby.exception_runtime_error(),
355
+ format!("Search failed: {}", e),
356
+ )
357
+ })?;
261
358
 
262
- let mut array = Array::with_capacity(top_docs.len());
359
+ let mut results = Vec::with_capacity(top_docs.len());
263
360
 
264
361
  for (_score, doc_address) in top_docs {
265
- let doc = searcher.doc(doc_address).try_unwrap();
362
+ let doc: TantivyDocument = searcher.doc(doc_address).map_err(|e| {
363
+ Error::new(
364
+ ruby.exception_runtime_error(),
365
+ format!("Failed to get document: {}", e),
366
+ )
367
+ })?;
266
368
  if let Some(value) = doc.get_first(id_field) {
267
- if let Some(id) = (&*value).text() {
268
- array.push(RString::from(String::from(id)));
369
+ if let Some(id) = value.as_str() {
370
+ results.push(id.to_string());
269
371
  }
270
372
  }
271
373
  }
272
374
 
273
- array
375
+ Ok(results)
274
376
  }
275
- );
276
-
277
- pub(super) fn init() {
278
- klass().define(|klass| {
279
- klass.def_self("__new", new_index);
280
- klass.def("__add_document", add_document);
281
- klass.def("__delete_document", delete_document);
282
- klass.def("__acquire_index_writer", acquire_index_writer);
283
- klass.def("__release_index_writer", release_index_writer);
284
- klass.def("__commit", commit);
285
- klass.def("__reload", reload);
286
- klass.def("__search", search);
287
- });
288
- }
377
+ }
378
+
379
+ pub fn init(ruby: &Ruby, module: RModule) -> Result<(), Error> {
380
+ let class = module.define_class("Index", ruby.class_object())?;
381
+
382
+ class.define_singleton_method("__new", magnus::function!(Index::new, 9))?;
383
+ class.define_method("__add_document", magnus::method!(Index::add_document, 7))?;
384
+ class.define_method(
385
+ "__delete_document",
386
+ magnus::method!(Index::delete_document, 1),
387
+ )?;
388
+ class.define_method(
389
+ "__acquire_index_writer",
390
+ magnus::method!(Index::acquire_index_writer, 1),
391
+ )?;
392
+ class.define_method(
393
+ "__release_index_writer",
394
+ magnus::method!(Index::release_index_writer, 0),
395
+ )?;
396
+ class.define_method("__commit", magnus::method!(Index::commit, 0))?;
397
+ class.define_method("__reload", magnus::method!(Index::reload, 0))?;
398
+ class.define_method("__search", magnus::method!(Index::search, 2))?;
399
+
400
+ Ok(())
401
+ }
data/src/lib.rs CHANGED
@@ -1,14 +1,17 @@
1
1
  mod helpers;
2
- #[allow(improper_ctypes_definitions)]
3
2
  mod index;
4
- #[allow(improper_ctypes_definitions)]
5
3
  mod query;
6
- #[allow(improper_ctypes_definitions)]
7
4
  mod tokenizer;
8
5
 
9
- #[no_mangle]
10
- pub extern "C" fn Init_tantiny() {
11
- index::init();
12
- query::init();
13
- tokenizer::init();
14
- }
6
+ use magnus::{Error, Ruby};
7
+
8
+ #[magnus::init]
9
+ fn init(ruby: &Ruby) -> Result<(), Error> {
10
+ let module = ruby.define_module("Tantiny")?;
11
+
12
+ index::init(ruby, module)?;
13
+ query::init(ruby, module)?;
14
+ tokenizer::init(ruby, module)?;
15
+
16
+ Ok(())
17
+ }