tantiny-in-memory 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/src/query.rs ADDED
@@ -0,0 +1,260 @@
1
+ use std::str::FromStr;
2
+ use std::ops::Bound::Included;
3
+ use rutie::{methods, Object, AnyObject, Integer, Float, Array, RString};
4
+ use tantivy::{Term, DateTime};
5
+ use tantivy::schema::{IndexRecordOption, Facet, Type, FieldType};
6
+ use tantivy::query::*;
7
+
8
+ use crate::helpers::{try_unwrap_params, scaffold, TryUnwrap};
9
+ use crate::index::{unwrap_index, RTantinyIndex};
10
+
11
+ pub struct TantinyQuery(pub(crate) Box<dyn Query>); // Newtype around a boxed Tantivy query; this is the value wrap_query stores inside Ruby query objects.
12
+
13
+ scaffold!(RTantinyQuery, TantinyQuery, "Query"); // NOTE(review): presumably generates RTantinyQuery, klass() and TANTINY_QUERY_WRAPPER used below — confirm in crate::helpers.
14
+
15
+ fn wrap_query(query: Box<dyn Query>) -> RTantinyQuery {
16
+ klass().wrap_data(
17
+ TantinyQuery(query),
18
+ &*TANTINY_QUERY_WRAPPER
19
+ )
20
+ }
21
+
22
+ pub(crate) fn unwrap_query(query: &RTantinyQuery) -> &Box<dyn Query> {
23
+ &query.get_data(&*TANTINY_QUERY_WRAPPER).0
24
+ }
25
+
26
+ #[rustfmt::skip::macros(methods)]
27
+ methods!(
28
+ RTantinyQuery,
29
+ _itself,
30
+
31
+ fn new_all_query() -> RTantinyQuery {
32
+ wrap_query(Box::new(AllQuery))
33
+ }
34
+
35
+ fn new_empty_query() -> RTantinyQuery {
36
+ wrap_query(Box::new(EmptyQuery))
37
+ }
38
+
39
+ fn new_term_query(
40
+ index: RTantinyIndex,
41
+ field: RString,
42
+ term: RString
43
+ ) -> RTantinyQuery {
44
+ try_unwrap_params!(
45
+ index,
46
+ field: String,
47
+ term: String
48
+ );
49
+
50
+ let schema = &unwrap_index(&index).schema;
51
+ let field = schema.get_field(&field).try_unwrap();
52
+ let term = Term::from_field_text(field, &term);
53
+ let query = TermQuery::new(term, IndexRecordOption::Basic);
54
+
55
+ wrap_query(Box::new(query))
56
+ }
57
+
58
+ fn new_fuzzy_term_query(
59
+ index: RTantinyIndex,
60
+ field: RString,
61
+ term: RString,
62
+ distance: Integer
63
+ ) -> RTantinyQuery {
64
+ try_unwrap_params!(
65
+ index,
66
+ field: String,
67
+ term: String,
68
+ distance: i64
69
+ );
70
+
71
+ let schema = &unwrap_index(&index).schema;
72
+ let field = schema.get_field(&field).try_unwrap();
73
+ let term = Term::from_field_text(field, &term);
74
+ let query = FuzzyTermQuery::new(term, distance as u8, true);
75
+
76
+ wrap_query(Box::new(query))
77
+ }
78
+
79
+ fn new_phrase_query(
80
+ index: RTantinyIndex,
81
+ field: RString,
82
+ terms: Array
83
+ ) -> RTantinyQuery {
84
+ try_unwrap_params!(
85
+ index,
86
+ field: String,
87
+ terms: Vec<String>
88
+ );
89
+
90
+ let schema = &unwrap_index(&index).schema;
91
+ let field = schema.get_field(&field).try_unwrap();
92
+
93
+ let terms: Vec<Term> = terms.into_iter().map(|term| {
94
+ Term::from_field_text(field, &term)
95
+ }).collect();
96
+ let query = PhraseQuery::new(terms);
97
+
98
+ wrap_query(Box::new(query))
99
+ }
100
+
101
+ fn new_regex_query(
102
+ index: RTantinyIndex,
103
+ field: RString,
104
+ regex: RString
105
+ ) -> RTantinyQuery {
106
+ try_unwrap_params!(
107
+ index,
108
+ field: String,
109
+ regex: String
110
+ );
111
+
112
+ let schema = &unwrap_index(&index).schema;
113
+ let field = schema.get_field(&field).try_unwrap();
114
+ let query = RegexQuery::from_pattern(&regex, field).try_unwrap();
115
+
116
+ wrap_query(Box::new(query))
117
+ }
118
+
119
+ fn new_range_query(
120
+ index: RTantinyIndex,
121
+ field: RString,
122
+ from: AnyObject,
123
+ to: AnyObject
124
+ ) -> RTantinyQuery {
125
+ try_unwrap_params!(index, from, to, field: String);
126
+
127
+ let schema = &unwrap_index(&index).schema;
128
+ let field = schema.get_field(&field).try_unwrap();
129
+ let field_name = schema.get_field_name(field);
130
+ let field_type = schema.get_field_entry(field).field_type();
131
+
132
+ let range = match field_type {
133
+ FieldType::Date(_) => {
134
+ let from: String = from.try_unwrap();
135
+ let to: String = to.try_unwrap();
136
+ let from = DateTime::from_str(&from).try_unwrap();
137
+ let to = DateTime::from_str(&to).try_unwrap();
138
+
139
+ Ok((
140
+ Type::Date,
141
+ Included(Term::from_field_date(field, &from)),
142
+ Included(Term::from_field_date(field, &to))
143
+ ))
144
+ },
145
+ FieldType::I64(_) => {
146
+ let from: i64 = from.try_unwrap();
147
+ let to: i64 = to.try_unwrap();
148
+
149
+ Ok((
150
+ Type::I64,
151
+ Included(Term::from_field_i64(field, from)),
152
+ Included(Term::from_field_i64(field, to))
153
+ ))
154
+ },
155
+ FieldType::F64(_) => {
156
+ let from: f64 = from.try_unwrap();
157
+ let to: f64 = to.try_unwrap();
158
+
159
+ Ok((
160
+ Type::F64,
161
+ Included(Term::from_field_f64(field, from)),
162
+ Included(Term::from_field_f64(field, to))
163
+ ))
164
+ },
165
+ _ => { Err(format!("Field '{}' is not supported by range query.", field_name)) }
166
+ };
167
+
168
+ let (value_type, left, right) = range.try_unwrap();
169
+
170
+ let query = RangeQuery::new_term_bounds(field, value_type, &left, &right);
171
+
172
+ wrap_query(Box::new(query))
173
+ }
174
+
175
+ fn new_facet_query(
176
+ index: RTantinyIndex,
177
+ field: RString,
178
+ path: RString
179
+ ) -> RTantinyQuery {
180
+ try_unwrap_params!(
181
+ index,
182
+ field: String,
183
+ path: String
184
+ );
185
+
186
+ let schema = &unwrap_index(&index).schema;
187
+ let field = schema.get_field(&field).try_unwrap();
188
+ let facet = Facet::from(&path);
189
+ let term = Term::from_facet(field, &facet);
190
+ let query = TermQuery::new(term, IndexRecordOption::Basic);
191
+
192
+ wrap_query(Box::new(query))
193
+ }
194
+
195
+ fn disjunction(queries: Array) -> RTantinyQuery {
196
+ try_unwrap_params!(queries);
197
+
198
+ let mut query_vec = Vec::new();
199
+
200
+ for query in queries {
201
+ let query: RTantinyQuery = query.try_unwrap();
202
+ query_vec.push((Occur::Should, unwrap_query(&query).box_clone()));
203
+ }
204
+
205
+ let disjunction_query = BooleanQuery::from(query_vec);
206
+
207
+ wrap_query(Box::new(disjunction_query))
208
+ }
209
+
210
+ fn conjunction(queries: Array) -> RTantinyQuery {
211
+ try_unwrap_params!(queries);
212
+
213
+ let mut query_vec = Vec::new();
214
+
215
+ for query in queries {
216
+ let query: RTantinyQuery = query.try_unwrap();
217
+ query_vec.push((Occur::Must, unwrap_query(&query).box_clone()));
218
+ }
219
+
220
+ let conjunction_query = BooleanQuery::from(query_vec);
221
+
222
+ wrap_query(Box::new(conjunction_query))
223
+ }
224
+
225
+ fn negation() -> RTantinyQuery {
226
+ // See: https://github.com/quickwit-oss/tantivy/issues/1153
227
+ let all_query: Box<dyn Query> = Box::new(AllQuery);
228
+ let negation_query = BooleanQuery::from(vec![
229
+ (Occur::Must, all_query.box_clone()),
230
+ (Occur::MustNot, unwrap_query(&_itself).box_clone()),
231
+ ]);
232
+
233
+ wrap_query(Box::new(negation_query))
234
+ }
235
+
236
+ fn boost(score: Float) -> RTantinyQuery {
237
+ try_unwrap_params!(score: f64);
238
+
239
+ let query = BoostQuery::new(unwrap_query(&_itself).box_clone(), score as f32);
240
+
241
+ wrap_query(Box::new(query))
242
+ }
243
+ );
244
+
245
+ pub(super) fn init() {
246
+ klass().define(|klass| {
247
+ klass.def_self("__new_all_query", new_all_query);
248
+ klass.def_self("__new_empty_query", new_empty_query);
249
+ klass.def_self("__new_term_query", new_term_query);
250
+ klass.def_self("__new_fuzzy_term_query", new_fuzzy_term_query);
251
+ klass.def_self("__new_regex_query", new_regex_query);
252
+ klass.def_self("__new_range_query", new_range_query);
253
+ klass.def_self("__new_phrase_query", new_phrase_query);
254
+ klass.def_self("__new_facet_query", new_facet_query);
255
+ klass.def_self("__disjunction", disjunction);
256
+ klass.def_self("__conjunction", conjunction);
257
+ klass.def("__negation", negation);
258
+ klass.def("__boost", boost);
259
+ });
260
+ }
data/src/tokenizer.rs ADDED
@@ -0,0 +1,94 @@
1
+
2
+ use rutie::{methods, Object, Array, RString, Integer, Boolean};
3
+ use tantivy::tokenizer::{TextAnalyzer, SimpleTokenizer, RemoveLongFilter, LowerCaser, Stemmer, NgramTokenizer};
4
+
5
+ use crate::helpers::{try_unwrap_params, scaffold, TryUnwrap, LanguageWrapper};
6
+
7
+ pub struct TantinyTokenizer(pub(crate) TextAnalyzer); // Newtype around a Tantivy TextAnalyzer; this is the value wrap_tokenizer stores inside Ruby tokenizer objects.
8
+
9
+ scaffold!(RTantinyTokenizer, TantinyTokenizer, "Tokenizer"); // NOTE(review): presumably generates RTantinyTokenizer, klass() and TANTINY_TOKENIZER_WRAPPER used below — confirm in crate::helpers.
10
+
11
+ fn wrap_tokenizer(tokenizer: TextAnalyzer) -> RTantinyTokenizer {
12
+ klass().wrap_data(
13
+ TantinyTokenizer(tokenizer),
14
+ &*TANTINY_TOKENIZER_WRAPPER
15
+ )
16
+ }
17
+
18
+ pub(crate) fn unwrap_tokenizer(tokenizer: &RTantinyTokenizer) -> &TextAnalyzer {
19
+ &tokenizer.get_data(&*TANTINY_TOKENIZER_WRAPPER).0
20
+ }
21
+
22
+ #[rustfmt::skip::macros(methods)]
23
+ methods!(
24
+ RTantinyTokenizer,
25
+ _itself,
26
+
27
+ fn new_simple_tokenizer() -> RTantinyTokenizer {
28
+ let tokenizer = TextAnalyzer::from(SimpleTokenizer)
29
+ .filter(RemoveLongFilter::limit(40))
30
+ .filter(LowerCaser);
31
+
32
+ wrap_tokenizer(tokenizer)
33
+ }
34
+
35
+ fn new_stemmer_tokenizer(locale_code: RString) -> RTantinyTokenizer {
36
+ try_unwrap_params!(locale_code: String);
37
+
38
+ let language: LanguageWrapper = locale_code.parse().try_unwrap();
39
+ let tokenizer = TextAnalyzer::from(SimpleTokenizer)
40
+ .filter(RemoveLongFilter::limit(40))
41
+ .filter(LowerCaser)
42
+ .filter(Stemmer::new(language.0));
43
+
44
+ wrap_tokenizer(tokenizer)
45
+ }
46
+
47
+ fn new_ngram_tokenizer(
48
+ min_gram: Integer,
49
+ max_gram: Integer,
50
+ prefix_only: Boolean
51
+ ) -> RTantinyTokenizer {
52
+ try_unwrap_params!(
53
+ min_gram: i64,
54
+ max_gram: i64,
55
+ prefix_only: bool
56
+ );
57
+
58
+ let tokenizer = NgramTokenizer::new(
59
+ min_gram as usize,
60
+ max_gram as usize,
61
+ prefix_only
62
+ );
63
+
64
+ wrap_tokenizer(TextAnalyzer::from(tokenizer))
65
+ }
66
+
67
+ fn extract_terms(text: RString) -> Array {
68
+ try_unwrap_params!(text: String);
69
+
70
+ let mut token_stream = unwrap_tokenizer(&_itself).token_stream(&text);
71
+ let mut terms = vec![];
72
+
73
+ while token_stream.advance() {
74
+ terms.push(token_stream.token().clone().text);
75
+ }
76
+
77
+ let mut array = Array::with_capacity(terms.len());
78
+
79
+ for term in terms {
80
+ array.push(RString::from(term));
81
+ }
82
+
83
+ array
84
+ }
85
+ );
86
+
87
+ pub(super) fn init() {
88
+ klass().define(|klass| {
89
+ klass.def_self("__new_simple_tokenizer", new_simple_tokenizer);
90
+ klass.def_self("__new_stemmer_tokenizer", new_stemmer_tokenizer);
91
+ klass.def_self("__new_ngram_tokenizer", new_ngram_tokenizer);
92
+ klass.def("__extract_terms", extract_terms);
93
+ });
94
+ }
metadata ADDED
@@ -0,0 +1,148 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: tantiny-in-memory
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0
5
+ platform: ruby
6
+ authors:
7
+ - Christian Toscano
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2023-09-04 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: ruby-next
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: 0.14.0
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: 0.14.0
27
+ - !ruby/object:Gem::Dependency
28
+ name: rutie
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: 0.0.4
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: 0.0.4
41
+ - !ruby/object:Gem::Dependency
42
+ name: thermite
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: rake
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '13.0'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '13.0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: concurrent-ruby
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '1.0'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '1.0'
83
+ description:
84
+ email:
85
+ - a.christian.toscano@gmail.com
86
+ executables: []
87
+ extensions:
88
+ - ext/Rakefile
89
+ extra_rdoc_files: []
90
+ files:
91
+ - CHANGELOG.md
92
+ - Cargo.toml
93
+ - LICENSE
94
+ - README.md
95
+ - bin/console
96
+ - bin/setup
97
+ - ext/Rakefile
98
+ - lib/tantiny.rb
99
+ - lib/tantiny.so
100
+ - lib/tantiny/errors.rb
101
+ - lib/tantiny/helpers.rb
102
+ - lib/tantiny/index.rb
103
+ - lib/tantiny/query.rb
104
+ - lib/tantiny/schema.rb
105
+ - lib/tantiny/tokenizer.rb
106
+ - lib/tantiny/version.rb
107
+ - sig/tantiny.rbs
108
+ - sig/tantiny/errors.rbs
109
+ - sig/tantiny/helpers.rbs
110
+ - sig/tantiny/index.rbs
111
+ - sig/tantiny/query.rbs
112
+ - sig/tantiny/schema.rbs
113
+ - sig/tantiny/tokenizer.rbs
114
+ - sig/tantiny/version.rbs
115
+ - src/helpers.rs
116
+ - src/index.rs
117
+ - src/lib.rs
118
+ - src/query.rs
119
+ - src/tokenizer.rs
120
+ homepage: https://github.com/a-chris/tantiny-in-memory
121
+ licenses:
122
+ - MIT
123
+ metadata:
124
+ bug_tracker_uri: https://github.com/a-chris/tantiny-in-memory/issues
125
+ changelog_uri: https://github.com/a-chris/tantiny-in-memory/blob/master/CHANGELOG.md
126
+ documentation_uri: https://github.com/a-chris/tantiny-in-memory/blob/master/README.md
127
+ homepage_uri: https://github.com/a-chris/tantiny-in-memory
128
+ source_code_uri: https://github.com/a-chris/tantiny-in-memory
129
+ post_install_message:
130
+ rdoc_options: []
131
+ require_paths:
132
+ - lib
133
+ required_ruby_version: !ruby/object:Gem::Requirement
134
+ requirements:
135
+ - - ">="
136
+ - !ruby/object:Gem::Version
137
+ version: '2.7'
138
+ required_rubygems_version: !ruby/object:Gem::Requirement
139
+ requirements:
140
+ - - ">="
141
+ - !ruby/object:Gem::Version
142
+ version: '0'
143
+ requirements: []
144
+ rubygems_version: 3.4.10
145
+ signing_key:
146
+ specification_version: 4
147
+ summary: Tiny full-text search for Ruby powered by Tantivy but in memory!
148
+ test_files: []