tantiny 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/src/helpers.rs ADDED
@@ -0,0 +1,200 @@
1
+ use std::collections::HashMap;
2
+ use rutie::{AnyException, Array, Exception, RString, Hash, Integer, Float, Boolean, Module};
3
+ use tantivy::schema::{Field};
4
+ use tantivy::tokenizer::Language;
5
+
6
+ // Macro dependencies:
7
+ pub(super) use paste::paste;
8
+ pub(super) use rutie::{class, wrappable_struct, AnyObject, VerifiedObject, VM, Object, Class};
9
+
10
+ pub(crate) fn namespace() -> Module {
11
+ Module::from_existing("Tantiny")
12
+ }
13
+
14
+ pub(crate) struct LanguageWrapper(pub(crate) Language);
15
+
16
+ impl std::str::FromStr for LanguageWrapper {
17
+ type Err = String;
18
+
19
+ fn from_str(s: &str) -> Result<Self, Self::Err> {
20
+ match s {
21
+ "en" => Ok(LanguageWrapper(Language::English)),
22
+ "ar" => Ok(LanguageWrapper(Language::Arabic)),
23
+ "da" => Ok(LanguageWrapper(Language::Danish)),
24
+ "nl" => Ok(LanguageWrapper(Language::Dutch)),
25
+ "fi" => Ok(LanguageWrapper(Language::Finnish)),
26
+ "fr" => Ok(LanguageWrapper(Language::French)),
27
+ "de" => Ok(LanguageWrapper(Language::German)),
28
+ "el" => Ok(LanguageWrapper(Language::Greek)),
29
+ "hu" => Ok(LanguageWrapper(Language::Hungarian)),
30
+ "it" => Ok(LanguageWrapper(Language::Italian)),
31
+ "no" => Ok(LanguageWrapper(Language::Norwegian)),
32
+ "pt" => Ok(LanguageWrapper(Language::Portuguese)),
33
+ "ro" => Ok(LanguageWrapper(Language::Romanian)),
34
+ "ru" => Ok(LanguageWrapper(Language::Russian)),
35
+ "es" => Ok(LanguageWrapper(Language::Spanish)),
36
+ "sv" => Ok(LanguageWrapper(Language::Swedish)),
37
+ "ta" => Ok(LanguageWrapper(Language::Tamil)),
38
+ "tr" => Ok(LanguageWrapper(Language::Turkish)),
39
+ _ => Err(format!("Language '{}' is not supported.", s)),
40
+ }
41
+ }
42
+ }
43
+
44
/// Conversion of a value into `T` for use inside Ruby-callable methods.
///
/// The implementations in this module report a failed conversion by raising a Ruby
/// exception through `VM::raise_ex` instead of returning an error value.
pub(crate) trait TryUnwrap<T> {
    /// Consumes `self` and returns the converted value.
    fn try_unwrap(self) -> T;
}
47
+
48
+ macro_rules! primitive_try_unwrap_impl {
49
+ ( $ruby_type:ty, $type:ty ) => {
50
+ paste! {
51
+ impl TryUnwrap<$type> for $ruby_type {
52
+ fn try_unwrap(self) -> $type {
53
+ self.[<to_ $type:lower>]()
54
+ }
55
+ }
56
+
57
+ impl TryUnwrap<$type> for AnyObject {
58
+ fn try_unwrap(self) -> $type {
59
+ self.try_convert_to::<$ruby_type>()
60
+ .try_unwrap()
61
+ .[<to_ $type:lower>]()
62
+ }
63
+ }
64
+ }
65
+ };
66
+ }
67
+
68
+ primitive_try_unwrap_impl!(RString, String);
69
+ primitive_try_unwrap_impl!(Integer, i64);
70
+ primitive_try_unwrap_impl!(Float, f64);
71
+ primitive_try_unwrap_impl!(Boolean, bool);
72
+
73
+ impl<T> TryUnwrap<Vec<T>> for Array where
74
+ AnyObject: TryUnwrap<T>
75
+ {
76
+ fn try_unwrap(self) -> Vec<T> {
77
+ let mut vec = Vec::new();
78
+
79
+ for elem in self {
80
+ vec.push(elem.try_unwrap());
81
+ }
82
+
83
+ vec
84
+ }
85
+ }
86
+
87
+ impl<K, V> TryUnwrap<HashMap<K, V>> for Hash where
88
+ AnyObject: TryUnwrap<K> + TryUnwrap<V>,
89
+ K: Eq + std::hash::Hash
90
+ {
91
+ fn try_unwrap(self) -> HashMap<K, V> {
92
+ let mut hashmap = HashMap::new();
93
+
94
+ self.each(|key, value| {
95
+ hashmap.insert(key.try_unwrap(), value.try_unwrap());
96
+ });
97
+
98
+ hashmap
99
+ }
100
+ }
101
+
102
+ impl<T, E> TryUnwrap<T> for Result<T, E>
103
+ where
104
+ E: ToString,
105
+ {
106
+ fn try_unwrap(self) -> T {
107
+ self.map_err(|e| {
108
+ VM::raise_ex(AnyException::new(
109
+ "Tantiny::TantivyError",
110
+ Some(&e.to_string()),
111
+ ))
112
+ })
113
+ .unwrap()
114
+ }
115
+ }
116
+
117
+ impl TryUnwrap<Field> for Option<Field> {
118
+ fn try_unwrap(self) -> Field {
119
+ if let Some(value) = self {
120
+ value
121
+ } else {
122
+ VM::raise_ex(AnyException::new("Tantiny::UnknownField", None));
123
+
124
+ self.unwrap()
125
+ }
126
+ }
127
+ }
128
+
129
// Unwraps method arguments that arrive as `Result`s (an `Err` is passed to
// `VM::raise_ex`) and rebinds each one under its own name, shadowing the original.
//
// Two forms can be mixed freely in one invocation:
//
// * `name: Type` - unwrap the `Result`, then convert the value to `Type` via `TryUnwrap`;
// * `name`       - only unwrap the `Result`.
//
// The macro peels off one parameter per expansion and recurses on the remaining tokens.
macro_rules! try_unwrap_params {
    // `name: Type,` followed by the rest of the list.
    (
        $param:ident: $type:ty,
        $( $rest:tt )*
    ) => {
        let _raw = $param.map_err(|error| $crate::helpers::VM::raise_ex(error)).unwrap();
        let $param = <_ as $crate::helpers::TryUnwrap<$type>>::try_unwrap(_raw);

        try_unwrap_params!($($rest)*)
    };
    // `name,` followed by the rest of the list.
    (
        $param:ident,
        $( $rest:tt )*
    ) => {
        let $param = $param.map_err(|error| $crate::helpers::VM::raise_ex(error)).unwrap();

        try_unwrap_params!($($rest)*)
    };

    // A final parameter without a trailing comma is re-dispatched with one appended.
    ( $param:ident: $type:ty ) => {
        try_unwrap_params!($param: $type,)
    };
    ( $param:ident ) => {
        try_unwrap_params!($param,)
    };

    // Nothing left to unwrap: end of the recursion.
    () => {}
}
158
+
159
+ pub(crate) use try_unwrap_params;
160
+
161
// Generates the boilerplate that ties a Rust struct to a Ruby class nested in the
// `Tantiny` namespace:
//
// * the Rutie class type `$ruby_type`;
// * a data wrapper for `$type`, named `<$type>Wrapper`, plus a static whose name is the
//   upper-snake-cased type name followed by `_WRAPPER`;
// * `klass()`, which resolves the nested Ruby class called `$klass`;
// * `TryUnwrap<$ruby_type>` for `AnyObject` and the `VerifiedObject` check it relies on.
//
// The generated items (`klass`, the `use`) are emitted unqualified at the invocation site.
macro_rules! scaffold {
    ( $ruby_type:ident, $type:ty, $klass:literal ) => {
        $crate::helpers::class!($ruby_type);

        // A Rutie bug prevents invoking `wrappable_struct!` through its full path,
        // so it is imported into the calling module as a workaround.
        use crate::helpers::wrappable_struct;

        $crate::helpers::paste! {
            wrappable_struct!(
                $type,
                [<$type Wrapper>],
                [<$type:snake:upper _WRAPPER>]
            );
        }

        pub(crate) fn klass() -> $crate::helpers::Class {
            $crate::helpers::namespace().get_nested_class($klass)
        }

        impl $crate::helpers::TryUnwrap<$ruby_type> for $crate::helpers::AnyObject {
            fn try_unwrap(self) -> $ruby_type {
                // The failed-conversion case is handled by `TryUnwrap` for `Result`.
                <_ as $crate::helpers::TryUnwrap<$ruby_type>>::try_unwrap(
                    self.try_convert_to::<$ruby_type>(),
                )
            }
        }

        impl $crate::helpers::VerifiedObject for $ruby_type {
            // An object qualifies only if its class is exactly the scaffolded class.
            fn is_correct_type<T: $crate::helpers::Object>(object: &T) -> bool {
                object.class() == klass()
            }

            fn error_message() -> &'static str {
                concat!("Error converting to ", stringify!($ruby_type), ".")
            }
        }
    }
}
199
+
200
+ pub(crate) use scaffold;
data/src/index.rs ADDED
@@ -0,0 +1,261 @@
1
+ use std::collections::HashMap;
2
+ use std::str::FromStr;
3
+ use rutie::{methods, Object, AnyObject, Integer, NilClass, Array, RString, Hash};
4
+ use tantivy::{doc, Document, Term, ReloadPolicy, Index, IndexWriter, IndexReader, DateTime};
5
+ use tantivy::schema::{Schema, TextOptions, TextFieldIndexing, IndexRecordOption, FacetOptions, STRING, STORED, INDEXED, FAST};
6
+ use tantivy::collector::TopDocs;
7
+ use tantivy::directory::MmapDirectory;
8
+
9
+ use crate::helpers::{scaffold, try_unwrap_params, TryUnwrap};
10
+ use crate::query::{unwrap_query, RTantinyQuery};
11
+ use crate::tokenizer::{unwrap_tokenizer, RTantinyTokenizer};
12
+
13
+ pub struct TantinyIndex {
14
+ pub(crate) index_writer: IndexWriter,
15
+ pub(crate) index_reader: IndexReader,
16
+ pub(crate) schema: Schema,
17
+ }
18
+
19
+ scaffold!(RTantinyIndex, TantinyIndex, "Index");
20
+
21
+ pub(crate) fn unwrap_index(index: &RTantinyIndex) -> &TantinyIndex {
22
+ index.get_data(&*TANTINY_INDEX_WRAPPER)
23
+ }
24
+
25
+ #[rustfmt::skip::macros(methods)]
26
+ methods!(
27
+ RTantinyIndex,
28
+ _itself,
29
+
30
+ fn new_index(
31
+ path: RString,
32
+ index_size: Integer,
33
+ default_tokenizer: AnyObject,
34
+ field_tokenizers: Hash,
35
+ text_fields: Array,
36
+ string_fields: Array,
37
+ integer_fields: Array,
38
+ double_fields: Array,
39
+ date_fields: Array,
40
+ facet_fields: Array
41
+ ) -> RTantinyIndex {
42
+ try_unwrap_params!(
43
+ path: String,
44
+ index_size: i64,
45
+ default_tokenizer: RTantinyTokenizer,
46
+ field_tokenizers: HashMap<String, RTantinyTokenizer>,
47
+ text_fields: Vec<String>,
48
+ string_fields: Vec<String>,
49
+ integer_fields: Vec<String>,
50
+ double_fields: Vec<String>,
51
+ date_fields: Vec<String>,
52
+ facet_fields: Vec<String>
53
+ );
54
+
55
+ let index_path = MmapDirectory::open(path).try_unwrap();
56
+ let mut schema_builder = Schema::builder();
57
+
58
+ schema_builder.add_text_field("id", STRING | STORED);
59
+
60
+ for field in text_fields {
61
+ let tokenizer_name =
62
+ if field_tokenizers.contains_key(&field) {
63
+ &*field
64
+ } else {
65
+ "default"
66
+ };
67
+ let indexing = TextFieldIndexing::default()
68
+ .set_tokenizer(tokenizer_name)
69
+ .set_index_option(IndexRecordOption::WithFreqsAndPositions);
70
+ let options = TextOptions::default()
71
+ .set_indexing_options(indexing);
72
+ schema_builder.add_text_field(&field, options);
73
+ }
74
+
75
+ for field in string_fields {
76
+ schema_builder.add_text_field(&field, STRING);
77
+ }
78
+
79
+ for field in integer_fields {
80
+ schema_builder.add_i64_field(&field, FAST | INDEXED);
81
+ }
82
+
83
+ for field in double_fields {
84
+ schema_builder.add_f64_field(&field, FAST | INDEXED);
85
+ }
86
+
87
+ for field in date_fields {
88
+ schema_builder.add_date_field(&field, FAST | INDEXED);
89
+ }
90
+
91
+ for field in facet_fields {
92
+ let options = FacetOptions::default().set_indexed();
93
+ schema_builder.add_facet_field(&field, options);
94
+ }
95
+
96
+ let schema = schema_builder.build();
97
+ let index = Index::open_or_create(index_path, schema.clone()).try_unwrap();
98
+ let tokenizers = index.tokenizers();
99
+
100
+ tokenizers.register("default", unwrap_tokenizer(&default_tokenizer).clone());
101
+
102
+ for (field, tokenizer) in field_tokenizers {
103
+ tokenizers.register(&field, unwrap_tokenizer(&tokenizer).clone())
104
+ }
105
+
106
+ let mut index_writer = index
107
+ .writer(index_size as usize)
108
+ .try_unwrap();
109
+
110
+ let index_reader = index
111
+ .reader_builder()
112
+ .reload_policy(ReloadPolicy::Manual)
113
+ .try_into()
114
+ .try_unwrap();
115
+
116
+ klass().wrap_data(
117
+ TantinyIndex { index_writer, index_reader, schema },
118
+ &*TANTINY_INDEX_WRAPPER
119
+ )
120
+ }
121
+
122
+ fn add_document(
123
+ id: RString,
124
+ text_fields: Hash,
125
+ string_fields: Hash,
126
+ integer_fields: Hash,
127
+ double_fields: Hash,
128
+ date_fields: Hash,
129
+ facet_fields: Hash
130
+ ) -> NilClass {
131
+ try_unwrap_params!(
132
+ id: String,
133
+ text_fields: HashMap<String, String>,
134
+ string_fields: HashMap<String, String>,
135
+ integer_fields: HashMap<String, i64>,
136
+ double_fields: HashMap<String, f64>,
137
+ date_fields: HashMap<String, String>,
138
+ facet_fields: HashMap<String, String>
139
+ );
140
+
141
+
142
+ let internal = unwrap_index(&_itself);
143
+ let index_writer = &internal.index_writer;
144
+ let schema = &internal.schema;
145
+
146
+ let mut doc = Document::default();
147
+
148
+ let id_field = schema.get_field("id").try_unwrap();
149
+ doc.add_text(id_field, &id);
150
+
151
+ for (key, value) in text_fields.iter() {
152
+ let field = schema.get_field(key).try_unwrap();
153
+ doc.add_text(field, value);
154
+ }
155
+
156
+ for (key, value) in string_fields.iter() {
157
+ let field = schema.get_field(key).try_unwrap();
158
+ doc.add_text(field, value);
159
+ }
160
+
161
+ for (key, &value) in integer_fields.iter() {
162
+ let field = schema.get_field(key).try_unwrap();
163
+ doc.add_i64(field, value);
164
+ }
165
+
166
+ for (key, &value) in double_fields.iter() {
167
+ let field = schema.get_field(key).try_unwrap();
168
+ doc.add_f64(field, value);
169
+ }
170
+
171
+ for (key, value) in date_fields.iter() {
172
+ let field = schema.get_field(key).try_unwrap();
173
+ let value = DateTime::from_str(value).try_unwrap();
174
+ doc.add_date(field, &value);
175
+ }
176
+
177
+ for (key, value) in facet_fields.iter() {
178
+ let field = schema.get_field(key).try_unwrap();
179
+ doc.add_facet(field, &value);
180
+ }
181
+
182
+ let doc_id = Term::from_field_text(id_field, &id);
183
+ index_writer.delete_term(doc_id.clone());
184
+
185
+ index_writer.add_document(doc);
186
+
187
+ NilClass::new()
188
+ }
189
+
190
+ fn delete_document(id: RString) -> NilClass {
191
+ try_unwrap_params!(id: String);
192
+
193
+ let internal = unwrap_index(&_itself);
194
+ let index_writer = &internal.index_writer;
195
+
196
+ let id_field = internal.schema.get_field("id").try_unwrap();
197
+ let doc_id = Term::from_field_text(id_field, &id);
198
+
199
+ index_writer.delete_term(doc_id.clone());
200
+
201
+ NilClass::new()
202
+ }
203
+
204
+ fn commit() -> NilClass {
205
+ let internal = _itself.get_data_mut(&*TANTINY_INDEX_WRAPPER);
206
+ let index_writer = &mut internal.index_writer;
207
+
208
+ index_writer.commit().try_unwrap();
209
+
210
+ NilClass::new()
211
+ }
212
+
213
+ fn reload() -> NilClass {
214
+ unwrap_index(&_itself).index_reader.reload().try_unwrap();
215
+
216
+ NilClass::new()
217
+ }
218
+
219
+ fn search(
220
+ query: AnyObject,
221
+ limit: Integer
222
+ ) -> Array {
223
+ try_unwrap_params!(
224
+ query: RTantinyQuery,
225
+ limit: i64
226
+ );
227
+
228
+ let internal = unwrap_index(&_itself);
229
+ let id_field = internal.schema.get_field("id").try_unwrap();
230
+ let searcher = internal.index_reader.searcher();
231
+ let query = unwrap_query(&query);
232
+
233
+ let top_docs = searcher
234
+ .search(query, &TopDocs::with_limit(limit as usize))
235
+ .try_unwrap();
236
+
237
+ let mut array = Array::with_capacity(top_docs.len());
238
+
239
+ for (_score, doc_address) in top_docs {
240
+ let doc = searcher.doc(doc_address).try_unwrap();
241
+ if let Some(value) = doc.get_first(id_field) {
242
+ if let Some(id) = (&*value).text() {
243
+ array.push(RString::from(String::from(id)));
244
+ }
245
+ }
246
+ }
247
+
248
+ array
249
+ }
250
+ );
251
+
252
+ pub(super) fn init() {
253
+ klass().define(|klass| {
254
+ klass.def_self("__new", new_index);
255
+ klass.def("__add_document", add_document);
256
+ klass.def("__delete_document", delete_document);
257
+ klass.def("__commit", commit);
258
+ klass.def("__reload", reload);
259
+ klass.def("__search", search);
260
+ });
261
+ }
data/src/lib.rs ADDED
@@ -0,0 +1,15 @@
1
+ mod helpers;
2
+ #[allow(improper_ctypes_definitions)]
3
+ mod index;
4
+ #[allow(improper_ctypes_definitions)]
5
+ mod query;
6
+
7
+ #[allow(improper_ctypes_definitions)]
8
+ mod tokenizer;
9
+
10
+ #[no_mangle]
11
+ pub extern "C" fn Init_tantiny() {
12
+ index::init();
13
+ query::init();
14
+ tokenizer::init();
15
+ }