tantiny 0.2.2

Sign up to get free protection for your applications and to get access to all the features.
data/src/helpers.rs ADDED
@@ -0,0 +1,200 @@
1
+ use std::collections::HashMap;
2
+ use rutie::{AnyException, Array, Exception, RString, Hash, Integer, Float, Boolean, Module};
3
+ use tantivy::schema::{Field};
4
+ use tantivy::tokenizer::Language;
5
+
6
+ // Macro dependencies:
7
+ pub(super) use paste::paste;
8
+ pub(super) use rutie::{class, wrappable_struct, AnyObject, VerifiedObject, VM, Object, Class};
9
+
10
+ pub(crate) fn namespace() -> Module {
11
+ Module::from_existing("Tantiny")
12
+ }
13
+
14
+ pub(crate) struct LanguageWrapper(pub(crate) Language);
15
+
16
+ impl std::str::FromStr for LanguageWrapper {
17
+ type Err = String;
18
+
19
+ fn from_str(s: &str) -> Result<Self, Self::Err> {
20
+ match s {
21
+ "en" => Ok(LanguageWrapper(Language::English)),
22
+ "ar" => Ok(LanguageWrapper(Language::Arabic)),
23
+ "da" => Ok(LanguageWrapper(Language::Danish)),
24
+ "nl" => Ok(LanguageWrapper(Language::Dutch)),
25
+ "fi" => Ok(LanguageWrapper(Language::Finnish)),
26
+ "fr" => Ok(LanguageWrapper(Language::French)),
27
+ "de" => Ok(LanguageWrapper(Language::German)),
28
+ "el" => Ok(LanguageWrapper(Language::Greek)),
29
+ "hu" => Ok(LanguageWrapper(Language::Hungarian)),
30
+ "it" => Ok(LanguageWrapper(Language::Italian)),
31
+ "no" => Ok(LanguageWrapper(Language::Norwegian)),
32
+ "pt" => Ok(LanguageWrapper(Language::Portuguese)),
33
+ "ro" => Ok(LanguageWrapper(Language::Romanian)),
34
+ "ru" => Ok(LanguageWrapper(Language::Russian)),
35
+ "es" => Ok(LanguageWrapper(Language::Spanish)),
36
+ "sv" => Ok(LanguageWrapper(Language::Swedish)),
37
+ "ta" => Ok(LanguageWrapper(Language::Tamil)),
38
+ "tr" => Ok(LanguageWrapper(Language::Turkish)),
39
+ _ => Err(format!("Language '{}' is not supported.", s)),
40
+ }
41
+ }
42
+ }
43
+
44
/// Conversion of a value into a `T` through a signature that has no error
/// channel.
///
/// The method returns `T` directly rather than a `Result`, so every
/// implementation has to decide for itself what happens when the conversion
/// cannot be performed.
pub(crate) trait TryUnwrap<T> {
    /// Consumes `self` and produces the converted value.
    fn try_unwrap(self) -> T;
}
47
+
48
+ macro_rules! primitive_try_unwrap_impl {
49
+ ( $ruby_type:ty, $type:ty ) => {
50
+ paste! {
51
+ impl TryUnwrap<$type> for $ruby_type {
52
+ fn try_unwrap(self) -> $type {
53
+ self.[<to_ $type:lower>]()
54
+ }
55
+ }
56
+
57
+ impl TryUnwrap<$type> for AnyObject {
58
+ fn try_unwrap(self) -> $type {
59
+ self.try_convert_to::<$ruby_type>()
60
+ .try_unwrap()
61
+ .[<to_ $type:lower>]()
62
+ }
63
+ }
64
+ }
65
+ };
66
+ }
67
+
68
+ primitive_try_unwrap_impl!(RString, String);
69
+ primitive_try_unwrap_impl!(Integer, i64);
70
+ primitive_try_unwrap_impl!(Float, f64);
71
+ primitive_try_unwrap_impl!(Boolean, bool);
72
+
73
+ impl<T> TryUnwrap<Vec<T>> for Array where
74
+ AnyObject: TryUnwrap<T>
75
+ {
76
+ fn try_unwrap(self) -> Vec<T> {
77
+ let mut vec = Vec::new();
78
+
79
+ for elem in self {
80
+ vec.push(elem.try_unwrap());
81
+ }
82
+
83
+ vec
84
+ }
85
+ }
86
+
87
+ impl<K, V> TryUnwrap<HashMap<K, V>> for Hash where
88
+ AnyObject: TryUnwrap<K> + TryUnwrap<V>,
89
+ K: Eq + std::hash::Hash
90
+ {
91
+ fn try_unwrap(self) -> HashMap<K, V> {
92
+ let mut hashmap = HashMap::new();
93
+
94
+ self.each(|key, value| {
95
+ hashmap.insert(key.try_unwrap(), value.try_unwrap());
96
+ });
97
+
98
+ hashmap
99
+ }
100
+ }
101
+
102
+ impl<T, E> TryUnwrap<T> for Result<T, E>
103
+ where
104
+ E: ToString,
105
+ {
106
+ fn try_unwrap(self) -> T {
107
+ self.map_err(|e| {
108
+ VM::raise_ex(AnyException::new(
109
+ "Tantiny::TantivyError",
110
+ Some(&e.to_string()),
111
+ ))
112
+ })
113
+ .unwrap()
114
+ }
115
+ }
116
+
117
+ impl TryUnwrap<Field> for Option<Field> {
118
+ fn try_unwrap(self) -> Field {
119
+ if let Some(value) = self {
120
+ value
121
+ } else {
122
+ VM::raise_ex(AnyException::new("Tantiny::UnknownField", None));
123
+
124
+ self.unwrap()
125
+ }
126
+ }
127
+ }
128
+
129
// Re-binds a list of fallible parameters (as produced for the arguments of a
// function defined inside `methods!`) to their unwrapped values.
//
// Two forms are accepted per parameter, separated by commas:
//
// * `name: Type` - the error case is raised via `VM::raise_ex`; the value is
//   then converted to `Type` through `TryUnwrap<Type>` and bound to `name`.
// * `name`       - only the error case is handled; the value is bound to
//   `name` as-is.
//
// Each binding shadows the original parameter of the same name.
macro_rules! try_unwrap_params {
    // `name: Type, <rest>`
    (
        $name:ident: $target:ty,
        $( $tail:tt )*
    ) => {
        let $name = <_ as $crate::helpers::TryUnwrap<$target>>::try_unwrap(
            $name.map_err(|e| $crate::helpers::VM::raise_ex(e)).unwrap(),
        );

        try_unwrap_params!($($tail)*)
    };
    // `name, <rest>`
    (
        $name:ident,
        $( $tail:tt )*
    ) => {
        let $name = $name.map_err(|e| $crate::helpers::VM::raise_ex(e)).unwrap();

        try_unwrap_params!($($tail)*)
    };

    // Handle optional trailing commas: a final parameter written without a
    // comma is forwarded to the matching rule above with the comma added.
    ( $name:ident: $target:ty ) => {
        try_unwrap_params!($name: $target,)
    };
    ( $name:ident ) => {
        try_unwrap_params!($name,)
    };

    // Nothing left to unwrap.
    () => {}
}

pub(crate) use try_unwrap_params;
160
+
161
// Generates the glue that exposes a Rust struct as a Ruby class.
//
// Arguments:
//   $handle     - name of the Ruby-facing handle type declared via `class!`
//   $data       - the Rust struct stored inside the Ruby object
//   $class_name - name of the class nested under the `Tantiny` namespace
//
// The expansion contains:
//   * the `class!` declaration for `$handle`,
//   * a `wrappable_struct!` declaration named `<$data>Wrapper`, with a static
//     named `<$DATA>_WRAPPER` (snake-cased, upper-cased struct name),
//   * a `klass()` function resolving the nested Ruby class,
//   * `TryUnwrap<$handle>` for `AnyObject`,
//   * `VerifiedObject` for `$handle`, accepting objects whose class is
//     exactly `klass()`.
macro_rules! scaffold {
    ( $handle:ident, $data:ty, $class_name:literal ) => {
        $crate::helpers::class!($handle);

        // There is a bug in Rutie which prevents using this macro
        // by resolving it by a full path, so the only workaround is:
        use crate::helpers::wrappable_struct;

        $crate::helpers::paste! {
            wrappable_struct!(
                $data,
                [<$data Wrapper>],
                [<$data:snake:upper _WRAPPER>]
            );
        }

        pub(crate) fn klass() -> $crate::helpers::Class {
            $crate::helpers::namespace().get_nested_class($class_name)
        }

        impl $crate::helpers::TryUnwrap<$handle> for $crate::helpers::AnyObject {
            fn try_unwrap(self) -> $handle {
                // The conversion `Result` is unwrapped through the
                // `TryUnwrap` impl for `Result`.
                <_ as $crate::helpers::TryUnwrap<$handle>>::try_unwrap(
                    self.try_convert_to::<$handle>(),
                )
            }
        }

        impl $crate::helpers::VerifiedObject for $handle {
            fn is_correct_type<T: $crate::helpers::Object>(object: &T) -> bool {
                object.class() == klass()
            }

            fn error_message() -> &'static str {
                concat!("Error converting to ", stringify!($handle), ".")
            }
        }
    }
}

pub(crate) use scaffold;
data/src/index.rs ADDED
@@ -0,0 +1,261 @@
1
+ use std::collections::HashMap;
2
+ use std::str::FromStr;
3
+ use rutie::{methods, Object, AnyObject, Integer, NilClass, Array, RString, Hash};
4
+ use tantivy::{doc, Document, Term, ReloadPolicy, Index, IndexWriter, IndexReader, DateTime};
5
+ use tantivy::schema::{Schema, TextOptions, TextFieldIndexing, IndexRecordOption, FacetOptions, STRING, STORED, INDEXED, FAST};
6
+ use tantivy::collector::TopDocs;
7
+ use tantivy::directory::MmapDirectory;
8
+
9
+ use crate::helpers::{scaffold, try_unwrap_params, TryUnwrap};
10
+ use crate::query::{unwrap_query, RTantinyQuery};
11
+ use crate::tokenizer::{unwrap_tokenizer, RTantinyTokenizer};
12
+
13
+ pub struct TantinyIndex {
14
+ pub(crate) index_writer: IndexWriter,
15
+ pub(crate) index_reader: IndexReader,
16
+ pub(crate) schema: Schema,
17
+ }
18
+
19
+ scaffold!(RTantinyIndex, TantinyIndex, "Index");
20
+
21
+ pub(crate) fn unwrap_index(index: &RTantinyIndex) -> &TantinyIndex {
22
+ index.get_data(&*TANTINY_INDEX_WRAPPER)
23
+ }
24
+
25
+ #[rustfmt::skip::macros(methods)]
26
+ methods!(
27
+ RTantinyIndex,
28
+ _itself,
29
+
30
+ fn new_index(
31
+ path: RString,
32
+ index_size: Integer,
33
+ default_tokenizer: AnyObject,
34
+ field_tokenizers: Hash,
35
+ text_fields: Array,
36
+ string_fields: Array,
37
+ integer_fields: Array,
38
+ double_fields: Array,
39
+ date_fields: Array,
40
+ facet_fields: Array
41
+ ) -> RTantinyIndex {
42
+ try_unwrap_params!(
43
+ path: String,
44
+ index_size: i64,
45
+ default_tokenizer: RTantinyTokenizer,
46
+ field_tokenizers: HashMap<String, RTantinyTokenizer>,
47
+ text_fields: Vec<String>,
48
+ string_fields: Vec<String>,
49
+ integer_fields: Vec<String>,
50
+ double_fields: Vec<String>,
51
+ date_fields: Vec<String>,
52
+ facet_fields: Vec<String>
53
+ );
54
+
55
+ let index_path = MmapDirectory::open(path).try_unwrap();
56
+ let mut schema_builder = Schema::builder();
57
+
58
+ schema_builder.add_text_field("id", STRING | STORED);
59
+
60
+ for field in text_fields {
61
+ let tokenizer_name =
62
+ if field_tokenizers.contains_key(&field) {
63
+ &*field
64
+ } else {
65
+ "default"
66
+ };
67
+ let indexing = TextFieldIndexing::default()
68
+ .set_tokenizer(tokenizer_name)
69
+ .set_index_option(IndexRecordOption::WithFreqsAndPositions);
70
+ let options = TextOptions::default()
71
+ .set_indexing_options(indexing);
72
+ schema_builder.add_text_field(&field, options);
73
+ }
74
+
75
+ for field in string_fields {
76
+ schema_builder.add_text_field(&field, STRING);
77
+ }
78
+
79
+ for field in integer_fields {
80
+ schema_builder.add_i64_field(&field, FAST | INDEXED);
81
+ }
82
+
83
+ for field in double_fields {
84
+ schema_builder.add_f64_field(&field, FAST | INDEXED);
85
+ }
86
+
87
+ for field in date_fields {
88
+ schema_builder.add_date_field(&field, FAST | INDEXED);
89
+ }
90
+
91
+ for field in facet_fields {
92
+ let options = FacetOptions::default().set_indexed();
93
+ schema_builder.add_facet_field(&field, options);
94
+ }
95
+
96
+ let schema = schema_builder.build();
97
+ let index = Index::open_or_create(index_path, schema.clone()).try_unwrap();
98
+ let tokenizers = index.tokenizers();
99
+
100
+ tokenizers.register("default", unwrap_tokenizer(&default_tokenizer).clone());
101
+
102
+ for (field, tokenizer) in field_tokenizers {
103
+ tokenizers.register(&field, unwrap_tokenizer(&tokenizer).clone())
104
+ }
105
+
106
+ let mut index_writer = index
107
+ .writer(index_size as usize)
108
+ .try_unwrap();
109
+
110
+ let index_reader = index
111
+ .reader_builder()
112
+ .reload_policy(ReloadPolicy::Manual)
113
+ .try_into()
114
+ .try_unwrap();
115
+
116
+ klass().wrap_data(
117
+ TantinyIndex { index_writer, index_reader, schema },
118
+ &*TANTINY_INDEX_WRAPPER
119
+ )
120
+ }
121
+
122
+ fn add_document(
123
+ id: RString,
124
+ text_fields: Hash,
125
+ string_fields: Hash,
126
+ integer_fields: Hash,
127
+ double_fields: Hash,
128
+ date_fields: Hash,
129
+ facet_fields: Hash
130
+ ) -> NilClass {
131
+ try_unwrap_params!(
132
+ id: String,
133
+ text_fields: HashMap<String, String>,
134
+ string_fields: HashMap<String, String>,
135
+ integer_fields: HashMap<String, i64>,
136
+ double_fields: HashMap<String, f64>,
137
+ date_fields: HashMap<String, String>,
138
+ facet_fields: HashMap<String, String>
139
+ );
140
+
141
+
142
+ let internal = unwrap_index(&_itself);
143
+ let index_writer = &internal.index_writer;
144
+ let schema = &internal.schema;
145
+
146
+ let mut doc = Document::default();
147
+
148
+ let id_field = schema.get_field("id").try_unwrap();
149
+ doc.add_text(id_field, &id);
150
+
151
+ for (key, value) in text_fields.iter() {
152
+ let field = schema.get_field(key).try_unwrap();
153
+ doc.add_text(field, value);
154
+ }
155
+
156
+ for (key, value) in string_fields.iter() {
157
+ let field = schema.get_field(key).try_unwrap();
158
+ doc.add_text(field, value);
159
+ }
160
+
161
+ for (key, &value) in integer_fields.iter() {
162
+ let field = schema.get_field(key).try_unwrap();
163
+ doc.add_i64(field, value);
164
+ }
165
+
166
+ for (key, &value) in double_fields.iter() {
167
+ let field = schema.get_field(key).try_unwrap();
168
+ doc.add_f64(field, value);
169
+ }
170
+
171
+ for (key, value) in date_fields.iter() {
172
+ let field = schema.get_field(key).try_unwrap();
173
+ let value = DateTime::from_str(value).try_unwrap();
174
+ doc.add_date(field, &value);
175
+ }
176
+
177
+ for (key, value) in facet_fields.iter() {
178
+ let field = schema.get_field(key).try_unwrap();
179
+ doc.add_facet(field, &value);
180
+ }
181
+
182
+ let doc_id = Term::from_field_text(id_field, &id);
183
+ index_writer.delete_term(doc_id.clone());
184
+
185
+ index_writer.add_document(doc);
186
+
187
+ NilClass::new()
188
+ }
189
+
190
+ fn delete_document(id: RString) -> NilClass {
191
+ try_unwrap_params!(id: String);
192
+
193
+ let internal = unwrap_index(&_itself);
194
+ let index_writer = &internal.index_writer;
195
+
196
+ let id_field = internal.schema.get_field("id").try_unwrap();
197
+ let doc_id = Term::from_field_text(id_field, &id);
198
+
199
+ index_writer.delete_term(doc_id.clone());
200
+
201
+ NilClass::new()
202
+ }
203
+
204
+ fn commit() -> NilClass {
205
+ let internal = _itself.get_data_mut(&*TANTINY_INDEX_WRAPPER);
206
+ let index_writer = &mut internal.index_writer;
207
+
208
+ index_writer.commit().try_unwrap();
209
+
210
+ NilClass::new()
211
+ }
212
+
213
+ fn reload() -> NilClass {
214
+ unwrap_index(&_itself).index_reader.reload().try_unwrap();
215
+
216
+ NilClass::new()
217
+ }
218
+
219
+ fn search(
220
+ query: AnyObject,
221
+ limit: Integer
222
+ ) -> Array {
223
+ try_unwrap_params!(
224
+ query: RTantinyQuery,
225
+ limit: i64
226
+ );
227
+
228
+ let internal = unwrap_index(&_itself);
229
+ let id_field = internal.schema.get_field("id").try_unwrap();
230
+ let searcher = internal.index_reader.searcher();
231
+ let query = unwrap_query(&query);
232
+
233
+ let top_docs = searcher
234
+ .search(query, &TopDocs::with_limit(limit as usize))
235
+ .try_unwrap();
236
+
237
+ let mut array = Array::with_capacity(top_docs.len());
238
+
239
+ for (_score, doc_address) in top_docs {
240
+ let doc = searcher.doc(doc_address).try_unwrap();
241
+ if let Some(value) = doc.get_first(id_field) {
242
+ if let Some(id) = (&*value).text() {
243
+ array.push(RString::from(String::from(id)));
244
+ }
245
+ }
246
+ }
247
+
248
+ array
249
+ }
250
+ );
251
+
252
+ pub(super) fn init() {
253
+ klass().define(|klass| {
254
+ klass.def_self("__new", new_index);
255
+ klass.def("__add_document", add_document);
256
+ klass.def("__delete_document", delete_document);
257
+ klass.def("__commit", commit);
258
+ klass.def("__reload", reload);
259
+ klass.def("__search", search);
260
+ });
261
+ }
data/src/lib.rs ADDED
@@ -0,0 +1,15 @@
1
+ mod helpers;
2
+ #[allow(improper_ctypes_definitions)]
3
+ mod index;
4
+ #[allow(improper_ctypes_definitions)]
5
+ mod query;
6
+
7
+ #[allow(improper_ctypes_definitions)]
8
+ mod tokenizer;
9
+
10
+ #[no_mangle]
11
+ pub extern "C" fn Init_tantiny() {
12
+ index::init();
13
+ query::init();
14
+ tokenizer::init();
15
+ }