tantiny-in-memory 1.0.7 → 1.0.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +7 -0
- data/Cargo.toml +2 -2
- data/lib/tantiny/index.rb +6 -0
- data/lib/tantiny/schema.rb +18 -6
- data/lib/tantiny/version.rb +1 -1
- data/src/index.rs +49 -8
- data/src/query.rs +8 -7
- data/src/tokenizer.rs +15 -9
- metadata +2 -3
- data/lib/.rbnext/3.0/tantiny/schema.rb +0 -53
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: fd302bd9b783fbee6a64945f376814799fa98f2c53d8f474acfa477f98c4e48c
|
4
|
+
data.tar.gz: 9dd34494fc0f16a7f76e972772cd08a1c7de9770bd3601530bbb20d2088f0eb3
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f63f2b7c6984f097c3b9e2b984f1854a479a8bc195548d34428c3a01b3166fa6cbb61721cac1718f31e6974fa071aebe2befee1d030edcaa42145bf828c91b1f
|
7
|
+
data.tar.gz: 58a02e43c93bb326c7337e89c3133f266251f82b3dce29328df9dfe4a92deb8ecacd6ed7700d7686ae68a4fda7ff982e5e612208bc00ff3f53b109448c853d91
|
data/CHANGELOG.md
CHANGED
@@ -1,5 +1,12 @@
|
|
1
1
|
# Changelog
|
2
2
|
|
3
|
+
## [1.0.8](https://github.com/a-chris/tantiny-in-memory/compare/v1.0.7...v1.0.8) (2023-09-22)
|
4
|
+
|
5
|
+
|
6
|
+
### Bug Fixes
|
7
|
+
|
8
|
+
* bad typo ([dc15ff1](https://github.com/a-chris/tantiny-in-memory/commit/dc15ff112d996bdc221c3d33218de0e0bde2086f))
|
9
|
+
|
3
10
|
## [1.0.7](https://github.com/a-chris/tantiny-in-memory/compare/v1.0.6...v1.0.7) (2023-09-04)
|
4
11
|
|
5
12
|
|
data/Cargo.toml
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
[package]
|
2
2
|
name = "tantiny"
|
3
|
-
version = "1.0.
|
3
|
+
version = "1.0.8" # {x-release-please-version}
|
4
4
|
edition = "2021"
|
5
5
|
authors = ["Christian Toscano"]
|
6
6
|
repository = "https://github.com/a-chris/tantiny-in-memory"
|
@@ -10,7 +10,7 @@ crate-type = ["cdylib"]
|
|
10
10
|
|
11
11
|
[dependencies]
|
12
12
|
rutie = "0.8"
|
13
|
-
tantivy = "0.
|
13
|
+
tantivy = "0.21"
|
14
14
|
lazy_static = "1.4"
|
15
15
|
paste = "1.0"
|
16
16
|
|
data/lib/tantiny/index.rb
CHANGED
@@ -81,6 +81,12 @@ module Tantiny
|
|
81
81
|
end
|
82
82
|
end
|
83
83
|
|
84
|
+
def raw_query_search(query, limit: DEFAULT_LIMIT)
|
85
|
+
raise ArgumentError, "Query must be a string" unless query.is_a?(String)
|
86
|
+
|
87
|
+
__raw_query_search(query, limit)
|
88
|
+
end
|
89
|
+
|
84
90
|
def search(query, limit: DEFAULT_LIMIT, **smart_query_options)
|
85
91
|
unless query.is_a?(Query)
|
86
92
|
fields = schema.text_fields
|
data/lib/tantiny/schema.rb
CHANGED
@@ -32,17 +32,29 @@ module Tantiny
|
|
32
32
|
|
33
33
|
private
|
34
34
|
|
35
|
-
def id(key)
|
35
|
+
def id(key)
|
36
|
+
@id_field = key
|
37
|
+
end
|
36
38
|
|
37
|
-
def string(key)
|
39
|
+
def string(key)
|
40
|
+
@string_fields << key
|
41
|
+
end
|
38
42
|
|
39
|
-
def integer(key)
|
43
|
+
def integer(key)
|
44
|
+
@integer_fields << key
|
45
|
+
end
|
40
46
|
|
41
|
-
def double(key)
|
47
|
+
def double(key)
|
48
|
+
@double_fields << key
|
49
|
+
end
|
42
50
|
|
43
|
-
def date(key)
|
51
|
+
def date(key)
|
52
|
+
@date_fields << key
|
53
|
+
end
|
44
54
|
|
45
|
-
def facet(key)
|
55
|
+
def facet(key)
|
56
|
+
@facet_fields << key
|
57
|
+
end
|
46
58
|
|
47
59
|
def text(key, tokenizer: nil)
|
48
60
|
@field_tokenizers[key] = tokenizer if tokenizer
|
data/lib/tantiny/version.rb
CHANGED
data/src/index.rs
CHANGED
@@ -1,7 +1,9 @@
|
|
1
1
|
use std::collections::HashMap;
|
2
|
-
use std::str::FromStr;
|
3
2
|
use rutie::{methods, Object, AnyObject, Integer, NilClass, Array, RString, Hash};
|
4
|
-
use tantivy::
|
3
|
+
use tantivy::time::OffsetDateTime;
|
4
|
+
use tantivy::time::format_description::well_known::Rfc3339;
|
5
|
+
use tantivy::{doc, DateTime, Document, Term, ReloadPolicy, Index, IndexWriter, IndexReader};
|
6
|
+
use tantivy::query::QueryParser;
|
5
7
|
use tantivy::schema::{Schema, TextOptions, TextFieldIndexing, IndexRecordOption, FacetOptions, STRING, STORED, INDEXED, FAST};
|
6
8
|
use tantivy::collector::TopDocs;
|
7
9
|
|
@@ -90,7 +92,7 @@ methods!(
|
|
90
92
|
}
|
91
93
|
|
92
94
|
for field in facet_fields {
|
93
|
-
let options = FacetOptions::default()
|
95
|
+
let options = FacetOptions::default();
|
94
96
|
schema_builder.add_facet_field(&field, options);
|
95
97
|
}
|
96
98
|
|
@@ -98,10 +100,10 @@ methods!(
|
|
98
100
|
let index = Index::create_in_ram(schema.clone());
|
99
101
|
let tokenizers = index.tokenizers();
|
100
102
|
|
101
|
-
tokenizers.register("default", unwrap_tokenizer(
|
103
|
+
tokenizers.register("default", (&unwrap_tokenizer(default_tokenizer)).clone());
|
102
104
|
|
103
105
|
for (field, tokenizer) in field_tokenizers {
|
104
|
-
tokenizers.register(&field, unwrap_tokenizer(
|
106
|
+
tokenizers.register(&field, (&unwrap_tokenizer(tokenizer)).clone())
|
105
107
|
}
|
106
108
|
|
107
109
|
let index_writer = None;
|
@@ -168,8 +170,8 @@ methods!(
|
|
168
170
|
|
169
171
|
for (key, value) in date_fields.iter() {
|
170
172
|
let field = schema.get_field(key).try_unwrap();
|
171
|
-
let value = DateTime::
|
172
|
-
doc.add_date(field,
|
173
|
+
let value = DateTime::from_utc(OffsetDateTime::parse(value, &Rfc3339).unwrap());
|
174
|
+
doc.add_date(field, value);
|
173
175
|
}
|
174
176
|
|
175
177
|
for (key, value) in facet_fields.iter() {
|
@@ -239,6 +241,44 @@ methods!(
|
|
239
241
|
NilClass::new()
|
240
242
|
}
|
241
243
|
|
244
|
+
fn raw_query_search(
|
245
|
+
query_string: RString,
|
246
|
+
limit: Integer
|
247
|
+
) -> Array {
|
248
|
+
try_unwrap_params!(
|
249
|
+
query_string: String,
|
250
|
+
limit: i64
|
251
|
+
);
|
252
|
+
|
253
|
+
let internal = unwrap_index(&_itself);
|
254
|
+
let content_field = internal.schema.get_field("content").try_unwrap();
|
255
|
+
let query_parser = QueryParser::for_index(
|
256
|
+
&internal.index,
|
257
|
+
vec![content_field],
|
258
|
+
);
|
259
|
+
|
260
|
+
let query = query_parser.parse_query(&query_string).try_unwrap();
|
261
|
+
let id_field = internal.schema.get_field("id").try_unwrap();
|
262
|
+
let searcher = internal.index_reader.searcher();
|
263
|
+
|
264
|
+
let top_docs = searcher
|
265
|
+
.search(&*query, &TopDocs::with_limit(limit as usize))
|
266
|
+
.try_unwrap();
|
267
|
+
|
268
|
+
let mut array = Array::with_capacity(top_docs.len());
|
269
|
+
|
270
|
+
for (_score, doc_address) in top_docs {
|
271
|
+
let doc = searcher.doc(doc_address).try_unwrap();
|
272
|
+
if let Some(value) = doc.get_first(id_field) {
|
273
|
+
if let Some(id) = (&*value).as_text() {
|
274
|
+
array.push(RString::from(String::from(id)));
|
275
|
+
}
|
276
|
+
}
|
277
|
+
}
|
278
|
+
|
279
|
+
array
|
280
|
+
}
|
281
|
+
|
242
282
|
fn search(
|
243
283
|
query: AnyObject,
|
244
284
|
limit: Integer
|
@@ -262,7 +302,7 @@ methods!(
|
|
262
302
|
for (_score, doc_address) in top_docs {
|
263
303
|
let doc = searcher.doc(doc_address).try_unwrap();
|
264
304
|
if let Some(value) = doc.get_first(id_field) {
|
265
|
-
if let Some(id) = (&*value).
|
305
|
+
if let Some(id) = (&*value).as_text() {
|
266
306
|
array.push(RString::from(String::from(id)));
|
267
307
|
}
|
268
308
|
}
|
@@ -282,5 +322,6 @@ pub(super) fn init() {
|
|
282
322
|
klass.def("__commit", commit);
|
283
323
|
klass.def("__reload", reload);
|
284
324
|
klass.def("__search", search);
|
325
|
+
klass.def("__raw_query_search", raw_query_search);
|
285
326
|
});
|
286
327
|
}
|
data/src/query.rs
CHANGED
@@ -1,9 +1,10 @@
|
|
1
|
-
use std::str::FromStr;
|
2
1
|
use std::ops::Bound::Included;
|
3
2
|
use rutie::{methods, Object, AnyObject, Integer, Float, Array, RString};
|
4
3
|
use tantivy::{Term, DateTime};
|
5
4
|
use tantivy::schema::{IndexRecordOption, Facet, Type, FieldType};
|
6
5
|
use tantivy::query::*;
|
6
|
+
use tantivy::time::format_description::well_known::Rfc3339;
|
7
|
+
use tantivy::time::OffsetDateTime;
|
7
8
|
|
8
9
|
use crate::helpers::{try_unwrap_params, scaffold, TryUnwrap};
|
9
10
|
use crate::index::{unwrap_index, RTantinyIndex};
|
@@ -133,13 +134,13 @@ methods!(
|
|
133
134
|
FieldType::Date(_) => {
|
134
135
|
let from: String = from.try_unwrap();
|
135
136
|
let to: String = to.try_unwrap();
|
136
|
-
let from = DateTime::
|
137
|
-
let to = DateTime::
|
137
|
+
let from = DateTime::from_utc(OffsetDateTime::parse(&from, &Rfc3339).unwrap());
|
138
|
+
let to = DateTime::from_utc(OffsetDateTime::parse(&to, &Rfc3339).unwrap());
|
138
139
|
|
139
140
|
Ok((
|
140
141
|
Type::Date,
|
141
|
-
Included(Term::from_field_date(field,
|
142
|
-
Included(Term::from_field_date(field,
|
142
|
+
Included(Term::from_field_date(field, from)),
|
143
|
+
Included(Term::from_field_date(field, to))
|
143
144
|
))
|
144
145
|
},
|
145
146
|
FieldType::I64(_) => {
|
@@ -167,7 +168,7 @@ methods!(
|
|
167
168
|
|
168
169
|
let (value_type, left, right) = range.try_unwrap();
|
169
170
|
|
170
|
-
let query = RangeQuery::new_term_bounds(
|
171
|
+
let query = RangeQuery::new_term_bounds(field_name.to_string(), value_type, &left, &right);
|
171
172
|
|
172
173
|
wrap_query(Box::new(query))
|
173
174
|
}
|
@@ -257,4 +258,4 @@ pub(super) fn init() {
|
|
257
258
|
klass.def("__negation", negation);
|
258
259
|
klass.def("__boost", boost);
|
259
260
|
});
|
260
|
-
}
|
261
|
+
}
|
data/src/tokenizer.rs
CHANGED
@@ -15,8 +15,8 @@ fn wrap_tokenizer(tokenizer: TextAnalyzer) -> RTantinyTokenizer {
|
|
15
15
|
)
|
16
16
|
}
|
17
17
|
|
18
|
-
pub(crate) fn unwrap_tokenizer(tokenizer:
|
19
|
-
|
18
|
+
pub(crate) fn unwrap_tokenizer(tokenizer: RTantinyTokenizer) -> TextAnalyzer {
|
19
|
+
tokenizer.get_data(&*TANTINY_TOKENIZER_WRAPPER).0.clone()
|
20
20
|
}
|
21
21
|
|
22
22
|
#[rustfmt::skip::macros(methods)]
|
@@ -25,9 +25,10 @@ methods!(
|
|
25
25
|
_itself,
|
26
26
|
|
27
27
|
fn new_simple_tokenizer() -> RTantinyTokenizer {
|
28
|
-
let tokenizer = TextAnalyzer::
|
28
|
+
let tokenizer = TextAnalyzer::builder(SimpleTokenizer::default())
|
29
29
|
.filter(RemoveLongFilter::limit(40))
|
30
|
-
.filter(LowerCaser)
|
30
|
+
.filter(LowerCaser)
|
31
|
+
.build();
|
31
32
|
|
32
33
|
wrap_tokenizer(tokenizer)
|
33
34
|
}
|
@@ -36,10 +37,11 @@ methods!(
|
|
36
37
|
try_unwrap_params!(locale_code: String);
|
37
38
|
|
38
39
|
let language: LanguageWrapper = locale_code.parse().try_unwrap();
|
39
|
-
let tokenizer = TextAnalyzer::
|
40
|
+
let tokenizer = TextAnalyzer::builder(SimpleTokenizer::default())
|
40
41
|
.filter(RemoveLongFilter::limit(40))
|
41
42
|
.filter(LowerCaser)
|
42
|
-
.filter(Stemmer::new(language.0))
|
43
|
+
.filter(Stemmer::new(language.0))
|
44
|
+
.build();
|
43
45
|
|
44
46
|
wrap_tokenizer(tokenizer)
|
45
47
|
}
|
@@ -61,13 +63,14 @@ methods!(
|
|
61
63
|
prefix_only
|
62
64
|
);
|
63
65
|
|
64
|
-
wrap_tokenizer(TextAnalyzer::from(tokenizer))
|
66
|
+
wrap_tokenizer(TextAnalyzer::from(tokenizer.try_unwrap()))
|
65
67
|
}
|
66
68
|
|
67
69
|
fn extract_terms(text: RString) -> Array {
|
68
70
|
try_unwrap_params!(text: String);
|
69
71
|
|
70
|
-
let mut
|
72
|
+
let mut tokenizer: TextAnalyzer = unwrap_tokenizer(_itself);
|
73
|
+
let mut token_stream = tokenizer.token_stream(&text);
|
71
74
|
let mut terms = vec![];
|
72
75
|
|
73
76
|
while token_stream.advance() {
|
@@ -91,4 +94,7 @@ pub(super) fn init() {
|
|
91
94
|
klass.def_self("__new_ngram_tokenizer", new_ngram_tokenizer);
|
92
95
|
klass.def("__extract_terms", extract_terms);
|
93
96
|
});
|
94
|
-
}
|
97
|
+
}
|
98
|
+
|
99
|
+
|
100
|
+
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tantiny-in-memory
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.7
|
4
|
+
version: 1.0.8
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Christian Toscano
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-09-
|
11
|
+
date: 2023-09-22 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: ruby-next
|
@@ -95,7 +95,6 @@ files:
|
|
95
95
|
- bin/console
|
96
96
|
- bin/setup
|
97
97
|
- ext/Rakefile
|
98
|
-
- lib/.rbnext/3.0/tantiny/schema.rb
|
99
98
|
- lib/tantiny-in-memory.rb
|
100
99
|
- lib/tantiny.rb
|
101
100
|
- lib/tantiny/errors.rb
|
@@ -1,53 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
module Tantiny
|
4
|
-
class Schema
|
5
|
-
attr_reader :default_tokenizer,
|
6
|
-
:id_field,
|
7
|
-
:text_fields,
|
8
|
-
:string_fields,
|
9
|
-
:integer_fields,
|
10
|
-
:double_fields,
|
11
|
-
:date_fields,
|
12
|
-
:facet_fields,
|
13
|
-
:field_tokenizers
|
14
|
-
|
15
|
-
def initialize(tokenizer, &block)
|
16
|
-
@default_tokenizer = tokenizer
|
17
|
-
@id_field = :id
|
18
|
-
@text_fields = []
|
19
|
-
@string_fields = []
|
20
|
-
@integer_fields = []
|
21
|
-
@double_fields = []
|
22
|
-
@date_fields = []
|
23
|
-
@facet_fields = []
|
24
|
-
@field_tokenizers = {}
|
25
|
-
|
26
|
-
instance_exec(&block)
|
27
|
-
end
|
28
|
-
|
29
|
-
def tokenizer_for(field)
|
30
|
-
field_tokenizers[field] || default_tokenizer
|
31
|
-
end
|
32
|
-
|
33
|
-
private
|
34
|
-
|
35
|
-
def id(key) ; @id_field = key; end
|
36
|
-
|
37
|
-
def string(key) ; @string_fields << key; end
|
38
|
-
|
39
|
-
def integer(key) ; @integer_fields << key; end
|
40
|
-
|
41
|
-
def double(key) ; @double_fields << key; end
|
42
|
-
|
43
|
-
def date(key) ; @date_fields << key; end
|
44
|
-
|
45
|
-
def facet(key) ; @facet_fields << key; end
|
46
|
-
|
47
|
-
def text(key, tokenizer: nil)
|
48
|
-
@field_tokenizers[key] = tokenizer if tokenizer
|
49
|
-
|
50
|
-
@text_fields << key
|
51
|
-
end
|
52
|
-
end
|
53
|
-
end
|