tantiny 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +20 -0
- data/Cargo.toml +20 -0
- data/LICENSE +21 -0
- data/README.md +309 -0
- data/bin/console +59 -0
- data/bin/setup +6 -0
- data/ext/Rakefile +5 -0
- data/lib/.rbnext/3.0/tantiny/schema.rb +53 -0
- data/lib/tantiny/errors.rb +29 -0
- data/lib/tantiny/helpers.rb +9 -0
- data/lib/tantiny/index.rb +94 -0
- data/lib/tantiny/query.rb +165 -0
- data/lib/tantiny/schema.rb +53 -0
- data/lib/tantiny/tokenizer.rb +28 -0
- data/lib/tantiny/version.rb +5 -0
- data/lib/tantiny.rb +19 -0
- data/lib/tantiny.so +0 -0
- data/sig/tantiny/errors.rbs +20 -0
- data/sig/tantiny/helpers.rbs +6 -0
- data/sig/tantiny/index.rbs +82 -0
- data/sig/tantiny/query.rbs +135 -0
- data/sig/tantiny/schema.rbs +26 -0
- data/sig/tantiny/tokenizer.rbs +25 -0
- data/sig/tantiny/version.rbs +3 -0
- data/sig/tantiny.rbs +5 -0
- data/src/helpers.rs +200 -0
- data/src/index.rs +261 -0
- data/src/lib.rs +15 -0
- data/src/query.rs +260 -0
- data/src/tokenizer.rs +94 -0
- metadata +135 -0
data/src/helpers.rs
ADDED
@@ -0,0 +1,200 @@
|
|
1
|
+
use std::collections::HashMap;
|
2
|
+
use rutie::{AnyException, Array, Exception, RString, Hash, Integer, Float, Boolean, Module};
|
3
|
+
use tantivy::schema::{Field};
|
4
|
+
use tantivy::tokenizer::Language;
|
5
|
+
|
6
|
+
// Macro dependencies:
|
7
|
+
pub(super) use paste::paste;
|
8
|
+
pub(super) use rutie::{class, wrappable_struct, AnyObject, VerifiedObject, VM, Object, Class};
|
9
|
+
|
10
|
+
pub(crate) fn namespace() -> Module {
|
11
|
+
Module::from_existing("Tantiny")
|
12
|
+
}
|
13
|
+
|
14
|
+
pub(crate) struct LanguageWrapper(pub(crate) Language);
|
15
|
+
|
16
|
+
impl std::str::FromStr for LanguageWrapper {
|
17
|
+
type Err = String;
|
18
|
+
|
19
|
+
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
20
|
+
match s {
|
21
|
+
"en" => Ok(LanguageWrapper(Language::English)),
|
22
|
+
"ar" => Ok(LanguageWrapper(Language::Arabic)),
|
23
|
+
"da" => Ok(LanguageWrapper(Language::Danish)),
|
24
|
+
"nl" => Ok(LanguageWrapper(Language::Dutch)),
|
25
|
+
"fi" => Ok(LanguageWrapper(Language::Finnish)),
|
26
|
+
"fr" => Ok(LanguageWrapper(Language::French)),
|
27
|
+
"de" => Ok(LanguageWrapper(Language::German)),
|
28
|
+
"el" => Ok(LanguageWrapper(Language::Greek)),
|
29
|
+
"hu" => Ok(LanguageWrapper(Language::Hungarian)),
|
30
|
+
"it" => Ok(LanguageWrapper(Language::Italian)),
|
31
|
+
"no" => Ok(LanguageWrapper(Language::Norwegian)),
|
32
|
+
"pt" => Ok(LanguageWrapper(Language::Portuguese)),
|
33
|
+
"ro" => Ok(LanguageWrapper(Language::Romanian)),
|
34
|
+
"ru" => Ok(LanguageWrapper(Language::Russian)),
|
35
|
+
"es" => Ok(LanguageWrapper(Language::Spanish)),
|
36
|
+
"sv" => Ok(LanguageWrapper(Language::Swedish)),
|
37
|
+
"ta" => Ok(LanguageWrapper(Language::Tamil)),
|
38
|
+
"tr" => Ok(LanguageWrapper(Language::Turkish)),
|
39
|
+
_ => Err(format!("Language '{}' is not supported.", s)),
|
40
|
+
}
|
41
|
+
}
|
42
|
+
}
|
43
|
+
|
44
|
+
/// Conversion of `Self` into a `T` that hands back the value directly rather
/// than a `Result`.
///
/// The implementations in this file signal failure by raising a Ruby
/// exception through `VM::raise_ex` instead of returning an error value.
pub(crate) trait TryUnwrap<T> {
    /// Consumes `self` and produces the converted value.
    fn try_unwrap(self) -> T;
}
|
47
|
+
|
48
|
+
macro_rules! primitive_try_unwrap_impl {
|
49
|
+
( $ruby_type:ty, $type:ty ) => {
|
50
|
+
paste! {
|
51
|
+
impl TryUnwrap<$type> for $ruby_type {
|
52
|
+
fn try_unwrap(self) -> $type {
|
53
|
+
self.[<to_ $type:lower>]()
|
54
|
+
}
|
55
|
+
}
|
56
|
+
|
57
|
+
impl TryUnwrap<$type> for AnyObject {
|
58
|
+
fn try_unwrap(self) -> $type {
|
59
|
+
self.try_convert_to::<$ruby_type>()
|
60
|
+
.try_unwrap()
|
61
|
+
.[<to_ $type:lower>]()
|
62
|
+
}
|
63
|
+
}
|
64
|
+
}
|
65
|
+
};
|
66
|
+
}
|
67
|
+
|
68
|
+
primitive_try_unwrap_impl!(RString, String);
|
69
|
+
primitive_try_unwrap_impl!(Integer, i64);
|
70
|
+
primitive_try_unwrap_impl!(Float, f64);
|
71
|
+
primitive_try_unwrap_impl!(Boolean, bool);
|
72
|
+
|
73
|
+
impl<T> TryUnwrap<Vec<T>> for Array where
|
74
|
+
AnyObject: TryUnwrap<T>
|
75
|
+
{
|
76
|
+
fn try_unwrap(self) -> Vec<T> {
|
77
|
+
let mut vec = Vec::new();
|
78
|
+
|
79
|
+
for elem in self {
|
80
|
+
vec.push(elem.try_unwrap());
|
81
|
+
}
|
82
|
+
|
83
|
+
vec
|
84
|
+
}
|
85
|
+
}
|
86
|
+
|
87
|
+
impl<K, V> TryUnwrap<HashMap<K, V>> for Hash where
|
88
|
+
AnyObject: TryUnwrap<K> + TryUnwrap<V>,
|
89
|
+
K: Eq + std::hash::Hash
|
90
|
+
{
|
91
|
+
fn try_unwrap(self) -> HashMap<K, V> {
|
92
|
+
let mut hashmap = HashMap::new();
|
93
|
+
|
94
|
+
self.each(|key, value| {
|
95
|
+
hashmap.insert(key.try_unwrap(), value.try_unwrap());
|
96
|
+
});
|
97
|
+
|
98
|
+
hashmap
|
99
|
+
}
|
100
|
+
}
|
101
|
+
|
102
|
+
impl<T, E> TryUnwrap<T> for Result<T, E>
|
103
|
+
where
|
104
|
+
E: ToString,
|
105
|
+
{
|
106
|
+
fn try_unwrap(self) -> T {
|
107
|
+
self.map_err(|e| {
|
108
|
+
VM::raise_ex(AnyException::new(
|
109
|
+
"Tantiny::TantivyError",
|
110
|
+
Some(&e.to_string()),
|
111
|
+
))
|
112
|
+
})
|
113
|
+
.unwrap()
|
114
|
+
}
|
115
|
+
}
|
116
|
+
|
117
|
+
impl TryUnwrap<Field> for Option<Field> {
|
118
|
+
fn try_unwrap(self) -> Field {
|
119
|
+
if let Some(value) = self {
|
120
|
+
value
|
121
|
+
} else {
|
122
|
+
VM::raise_ex(AnyException::new("Tantiny::UnknownField", None));
|
123
|
+
|
124
|
+
self.unwrap()
|
125
|
+
}
|
126
|
+
}
|
127
|
+
}
|
128
|
+
|
129
|
+
// Rebinds method parameters to their unwrapped Rust values.
//
// Each listed parameter is expected to be a `Result` at the call site. Its
// error is raised via `VM::raise_ex`; the `Ok` value is then either
//   * converted to `$type` through `TryUnwrap` (`name: Type` form), or
//   * kept as-is (bare `name` form),
// and bound again under the same name, shadowing the original binding.
// Entries are processed left to right; a trailing comma is optional.
macro_rules! try_unwrap_params {
    (
        $param:ident: $type:ty,
        $( $rest:tt )*
    ) => {
        let _raw = $param.map_err(|error| $crate::helpers::VM::raise_ex(error)).unwrap();
        let $param = <_ as $crate::helpers::TryUnwrap<$type>>::try_unwrap(_raw);

        try_unwrap_params!($($rest)*)
    };
    (
        $param:ident,
        $( $rest:tt )*
    ) => {
        let $param = $param.map_err(|error| $crate::helpers::VM::raise_ex(error)).unwrap();

        try_unwrap_params!($($rest)*)
    };

    // Handle optional trailing commas.
    ( $param:ident: $type:ty ) => {
        try_unwrap_params!($param: $type,)
    };
    ( $param:ident ) => {
        try_unwrap_params!($param,)
    };

    // Nothing left to unwrap.
    () => {}
}

pub(crate) use try_unwrap_params;
|
160
|
+
|
161
|
+
// Generates the boilerplate that ties a Rust struct to a Ruby class.
//
// `scaffold!(RubyType, RustType, "ClassName")` expands, in the invoking
// module, to:
//   * the Rutie class type `RubyType` (via `class!`);
//   * a `wrappable_struct!` invocation for `RustType`, naming the wrapper
//     `<RustType>Wrapper` and the static `<RUST_TYPE>_WRAPPER`;
//   * `klass()`, which looks up `ClassName` nested in the `Tantiny` module;
//   * `TryUnwrap<RubyType> for AnyObject`;
//   * `VerifiedObject for RubyType`, accepting only objects whose class is
//     exactly `klass()`.
macro_rules! scaffold {
    ( $ruby_type:ident, $type:ty, $klass:literal ) => {
        $crate::helpers::class!($ruby_type);

        // There is a bug in Rutie which prevents using this macro
        // by resolving it by a full path, so the only workaround is:
        use crate::helpers::wrappable_struct;

        $crate::helpers::paste! {
            wrappable_struct!(
                $type,
                [<$type Wrapper>],
                [<$type:snake:upper _WRAPPER>]
            );
        }

        pub(crate) fn klass() -> $crate::helpers::Class {
            let parent = $crate::helpers::namespace();

            parent.get_nested_class($klass)
        }

        impl $crate::helpers::TryUnwrap<$ruby_type> for $crate::helpers::AnyObject {
            fn try_unwrap(self) -> $ruby_type {
                // A failed conversion is raised by the `Result` impl of `TryUnwrap`.
                <_ as $crate::helpers::TryUnwrap<$ruby_type>>::try_unwrap(
                    self.try_convert_to::<$ruby_type>(),
                )
            }
        }

        impl $crate::helpers::VerifiedObject for $ruby_type {
            fn is_correct_type<T: $crate::helpers::Object>(object: &T) -> bool {
                let actual = object.class();

                actual == klass()
            }

            fn error_message() -> &'static str {
                concat!("Error converting to ", stringify!($ruby_type), ".")
            }
        }
    }
}

pub(crate) use scaffold;
|
data/src/index.rs
ADDED
@@ -0,0 +1,261 @@
|
|
1
|
+
use std::collections::HashMap;
|
2
|
+
use std::str::FromStr;
|
3
|
+
use rutie::{methods, Object, AnyObject, Integer, NilClass, Array, RString, Hash};
|
4
|
+
use tantivy::{doc, Document, Term, ReloadPolicy, Index, IndexWriter, IndexReader, DateTime};
|
5
|
+
use tantivy::schema::{Schema, TextOptions, TextFieldIndexing, IndexRecordOption, FacetOptions, STRING, STORED, INDEXED, FAST};
|
6
|
+
use tantivy::collector::TopDocs;
|
7
|
+
use tantivy::directory::MmapDirectory;
|
8
|
+
|
9
|
+
use crate::helpers::{scaffold, try_unwrap_params, TryUnwrap};
|
10
|
+
use crate::query::{unwrap_query, RTantinyQuery};
|
11
|
+
use crate::tokenizer::{unwrap_tokenizer, RTantinyTokenizer};
|
12
|
+
|
13
|
+
pub struct TantinyIndex {
|
14
|
+
pub(crate) index_writer: IndexWriter,
|
15
|
+
pub(crate) index_reader: IndexReader,
|
16
|
+
pub(crate) schema: Schema,
|
17
|
+
}
|
18
|
+
|
19
|
+
scaffold!(RTantinyIndex, TantinyIndex, "Index");
|
20
|
+
|
21
|
+
pub(crate) fn unwrap_index(index: &RTantinyIndex) -> &TantinyIndex {
|
22
|
+
index.get_data(&*TANTINY_INDEX_WRAPPER)
|
23
|
+
}
|
24
|
+
|
25
|
+
#[rustfmt::skip::macros(methods)]
|
26
|
+
methods!(
|
27
|
+
RTantinyIndex,
|
28
|
+
_itself,
|
29
|
+
|
30
|
+
fn new_index(
|
31
|
+
path: RString,
|
32
|
+
index_size: Integer,
|
33
|
+
default_tokenizer: AnyObject,
|
34
|
+
field_tokenizers: Hash,
|
35
|
+
text_fields: Array,
|
36
|
+
string_fields: Array,
|
37
|
+
integer_fields: Array,
|
38
|
+
double_fields: Array,
|
39
|
+
date_fields: Array,
|
40
|
+
facet_fields: Array
|
41
|
+
) -> RTantinyIndex {
|
42
|
+
try_unwrap_params!(
|
43
|
+
path: String,
|
44
|
+
index_size: i64,
|
45
|
+
default_tokenizer: RTantinyTokenizer,
|
46
|
+
field_tokenizers: HashMap<String, RTantinyTokenizer>,
|
47
|
+
text_fields: Vec<String>,
|
48
|
+
string_fields: Vec<String>,
|
49
|
+
integer_fields: Vec<String>,
|
50
|
+
double_fields: Vec<String>,
|
51
|
+
date_fields: Vec<String>,
|
52
|
+
facet_fields: Vec<String>
|
53
|
+
);
|
54
|
+
|
55
|
+
let index_path = MmapDirectory::open(path).try_unwrap();
|
56
|
+
let mut schema_builder = Schema::builder();
|
57
|
+
|
58
|
+
schema_builder.add_text_field("id", STRING | STORED);
|
59
|
+
|
60
|
+
for field in text_fields {
|
61
|
+
let tokenizer_name =
|
62
|
+
if field_tokenizers.contains_key(&field) {
|
63
|
+
&*field
|
64
|
+
} else {
|
65
|
+
"default"
|
66
|
+
};
|
67
|
+
let indexing = TextFieldIndexing::default()
|
68
|
+
.set_tokenizer(tokenizer_name)
|
69
|
+
.set_index_option(IndexRecordOption::WithFreqsAndPositions);
|
70
|
+
let options = TextOptions::default()
|
71
|
+
.set_indexing_options(indexing);
|
72
|
+
schema_builder.add_text_field(&field, options);
|
73
|
+
}
|
74
|
+
|
75
|
+
for field in string_fields {
|
76
|
+
schema_builder.add_text_field(&field, STRING);
|
77
|
+
}
|
78
|
+
|
79
|
+
for field in integer_fields {
|
80
|
+
schema_builder.add_i64_field(&field, FAST | INDEXED);
|
81
|
+
}
|
82
|
+
|
83
|
+
for field in double_fields {
|
84
|
+
schema_builder.add_f64_field(&field, FAST | INDEXED);
|
85
|
+
}
|
86
|
+
|
87
|
+
for field in date_fields {
|
88
|
+
schema_builder.add_date_field(&field, FAST | INDEXED);
|
89
|
+
}
|
90
|
+
|
91
|
+
for field in facet_fields {
|
92
|
+
let options = FacetOptions::default().set_indexed();
|
93
|
+
schema_builder.add_facet_field(&field, options);
|
94
|
+
}
|
95
|
+
|
96
|
+
let schema = schema_builder.build();
|
97
|
+
let index = Index::open_or_create(index_path, schema.clone()).try_unwrap();
|
98
|
+
let tokenizers = index.tokenizers();
|
99
|
+
|
100
|
+
tokenizers.register("default", unwrap_tokenizer(&default_tokenizer).clone());
|
101
|
+
|
102
|
+
for (field, tokenizer) in field_tokenizers {
|
103
|
+
tokenizers.register(&field, unwrap_tokenizer(&tokenizer).clone())
|
104
|
+
}
|
105
|
+
|
106
|
+
let mut index_writer = index
|
107
|
+
.writer(index_size as usize)
|
108
|
+
.try_unwrap();
|
109
|
+
|
110
|
+
let index_reader = index
|
111
|
+
.reader_builder()
|
112
|
+
.reload_policy(ReloadPolicy::Manual)
|
113
|
+
.try_into()
|
114
|
+
.try_unwrap();
|
115
|
+
|
116
|
+
klass().wrap_data(
|
117
|
+
TantinyIndex { index_writer, index_reader, schema },
|
118
|
+
&*TANTINY_INDEX_WRAPPER
|
119
|
+
)
|
120
|
+
}
|
121
|
+
|
122
|
+
fn add_document(
|
123
|
+
id: RString,
|
124
|
+
text_fields: Hash,
|
125
|
+
string_fields: Hash,
|
126
|
+
integer_fields: Hash,
|
127
|
+
double_fields: Hash,
|
128
|
+
date_fields: Hash,
|
129
|
+
facet_fields: Hash
|
130
|
+
) -> NilClass {
|
131
|
+
try_unwrap_params!(
|
132
|
+
id: String,
|
133
|
+
text_fields: HashMap<String, String>,
|
134
|
+
string_fields: HashMap<String, String>,
|
135
|
+
integer_fields: HashMap<String, i64>,
|
136
|
+
double_fields: HashMap<String, f64>,
|
137
|
+
date_fields: HashMap<String, String>,
|
138
|
+
facet_fields: HashMap<String, String>
|
139
|
+
);
|
140
|
+
|
141
|
+
|
142
|
+
let internal = unwrap_index(&_itself);
|
143
|
+
let index_writer = &internal.index_writer;
|
144
|
+
let schema = &internal.schema;
|
145
|
+
|
146
|
+
let mut doc = Document::default();
|
147
|
+
|
148
|
+
let id_field = schema.get_field("id").try_unwrap();
|
149
|
+
doc.add_text(id_field, &id);
|
150
|
+
|
151
|
+
for (key, value) in text_fields.iter() {
|
152
|
+
let field = schema.get_field(key).try_unwrap();
|
153
|
+
doc.add_text(field, value);
|
154
|
+
}
|
155
|
+
|
156
|
+
for (key, value) in string_fields.iter() {
|
157
|
+
let field = schema.get_field(key).try_unwrap();
|
158
|
+
doc.add_text(field, value);
|
159
|
+
}
|
160
|
+
|
161
|
+
for (key, &value) in integer_fields.iter() {
|
162
|
+
let field = schema.get_field(key).try_unwrap();
|
163
|
+
doc.add_i64(field, value);
|
164
|
+
}
|
165
|
+
|
166
|
+
for (key, &value) in double_fields.iter() {
|
167
|
+
let field = schema.get_field(key).try_unwrap();
|
168
|
+
doc.add_f64(field, value);
|
169
|
+
}
|
170
|
+
|
171
|
+
for (key, value) in date_fields.iter() {
|
172
|
+
let field = schema.get_field(key).try_unwrap();
|
173
|
+
let value = DateTime::from_str(value).try_unwrap();
|
174
|
+
doc.add_date(field, &value);
|
175
|
+
}
|
176
|
+
|
177
|
+
for (key, value) in facet_fields.iter() {
|
178
|
+
let field = schema.get_field(key).try_unwrap();
|
179
|
+
doc.add_facet(field, &value);
|
180
|
+
}
|
181
|
+
|
182
|
+
let doc_id = Term::from_field_text(id_field, &id);
|
183
|
+
index_writer.delete_term(doc_id.clone());
|
184
|
+
|
185
|
+
index_writer.add_document(doc);
|
186
|
+
|
187
|
+
NilClass::new()
|
188
|
+
}
|
189
|
+
|
190
|
+
fn delete_document(id: RString) -> NilClass {
|
191
|
+
try_unwrap_params!(id: String);
|
192
|
+
|
193
|
+
let internal = unwrap_index(&_itself);
|
194
|
+
let index_writer = &internal.index_writer;
|
195
|
+
|
196
|
+
let id_field = internal.schema.get_field("id").try_unwrap();
|
197
|
+
let doc_id = Term::from_field_text(id_field, &id);
|
198
|
+
|
199
|
+
index_writer.delete_term(doc_id.clone());
|
200
|
+
|
201
|
+
NilClass::new()
|
202
|
+
}
|
203
|
+
|
204
|
+
fn commit() -> NilClass {
|
205
|
+
let internal = _itself.get_data_mut(&*TANTINY_INDEX_WRAPPER);
|
206
|
+
let index_writer = &mut internal.index_writer;
|
207
|
+
|
208
|
+
index_writer.commit().try_unwrap();
|
209
|
+
|
210
|
+
NilClass::new()
|
211
|
+
}
|
212
|
+
|
213
|
+
fn reload() -> NilClass {
|
214
|
+
unwrap_index(&_itself).index_reader.reload().try_unwrap();
|
215
|
+
|
216
|
+
NilClass::new()
|
217
|
+
}
|
218
|
+
|
219
|
+
fn search(
|
220
|
+
query: AnyObject,
|
221
|
+
limit: Integer
|
222
|
+
) -> Array {
|
223
|
+
try_unwrap_params!(
|
224
|
+
query: RTantinyQuery,
|
225
|
+
limit: i64
|
226
|
+
);
|
227
|
+
|
228
|
+
let internal = unwrap_index(&_itself);
|
229
|
+
let id_field = internal.schema.get_field("id").try_unwrap();
|
230
|
+
let searcher = internal.index_reader.searcher();
|
231
|
+
let query = unwrap_query(&query);
|
232
|
+
|
233
|
+
let top_docs = searcher
|
234
|
+
.search(query, &TopDocs::with_limit(limit as usize))
|
235
|
+
.try_unwrap();
|
236
|
+
|
237
|
+
let mut array = Array::with_capacity(top_docs.len());
|
238
|
+
|
239
|
+
for (_score, doc_address) in top_docs {
|
240
|
+
let doc = searcher.doc(doc_address).try_unwrap();
|
241
|
+
if let Some(value) = doc.get_first(id_field) {
|
242
|
+
if let Some(id) = (&*value).text() {
|
243
|
+
array.push(RString::from(String::from(id)));
|
244
|
+
}
|
245
|
+
}
|
246
|
+
}
|
247
|
+
|
248
|
+
array
|
249
|
+
}
|
250
|
+
);
|
251
|
+
|
252
|
+
pub(super) fn init() {
|
253
|
+
klass().define(|klass| {
|
254
|
+
klass.def_self("__new", new_index);
|
255
|
+
klass.def("__add_document", add_document);
|
256
|
+
klass.def("__delete_document", delete_document);
|
257
|
+
klass.def("__commit", commit);
|
258
|
+
klass.def("__reload", reload);
|
259
|
+
klass.def("__search", search);
|
260
|
+
});
|
261
|
+
}
|
data/src/lib.rs
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
mod helpers;
|
2
|
+
#[allow(improper_ctypes_definitions)]
|
3
|
+
mod index;
|
4
|
+
#[allow(improper_ctypes_definitions)]
|
5
|
+
mod query;
|
6
|
+
|
7
|
+
#[allow(improper_ctypes_definitions)]
|
8
|
+
mod tokenizer;
|
9
|
+
|
10
|
+
#[no_mangle]
|
11
|
+
pub extern "C" fn Init_tantiny() {
|
12
|
+
index::init();
|
13
|
+
query::init();
|
14
|
+
tokenizer::init();
|
15
|
+
}
|