tantiny-in-memory 1.0.6 → 1.0.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +14 -0
- data/Cargo.toml +2 -2
- data/README.md +4 -4
- data/lib/tantiny/index.rb +6 -0
- data/lib/tantiny/schema.rb +18 -6
- data/lib/tantiny/version.rb +1 -1
- data/src/index.rs +49 -8
- data/src/query.rs +8 -7
- data/src/tokenizer.rs +15 -9
- metadata +2 -3
- data/lib/.rbnext/3.0/tantiny/schema.rb +0 -53
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: fd302bd9b783fbee6a64945f376814799fa98f2c53d8f474acfa477f98c4e48c
|
4
|
+
data.tar.gz: 9dd34494fc0f16a7f76e972772cd08a1c7de9770bd3601530bbb20d2088f0eb3
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f63f2b7c6984f097c3b9e2b984f1854a479a8bc195548d34428c3a01b3166fa6cbb61721cac1718f31e6974fa071aebe2befee1d030edcaa42145bf828c91b1f
|
7
|
+
data.tar.gz: 58a02e43c93bb326c7337e89c3133f266251f82b3dce29328df9dfe4a92deb8ecacd6ed7700d7686ae68a4fda7ff982e5e612208bc00ff3f53b109448c853d91
|
data/CHANGELOG.md
CHANGED
@@ -1,5 +1,19 @@
|
|
1
1
|
# Changelog
|
2
2
|
|
3
|
+
## [1.0.8](https://github.com/a-chris/tantiny-in-memory/compare/v1.0.7...v1.0.8) (2023-09-22)
|
4
|
+
|
5
|
+
|
6
|
+
### Bug Fixes
|
7
|
+
|
8
|
+
* bad typo ([dc15ff1](https://github.com/a-chris/tantiny-in-memory/commit/dc15ff112d996bdc221c3d33218de0e0bde2086f))
|
9
|
+
|
10
|
+
## [1.0.7](https://github.com/a-chris/tantiny-in-memory/compare/v1.0.6...v1.0.7) (2023-09-04)
|
11
|
+
|
12
|
+
|
13
|
+
### Bug Fixes
|
14
|
+
|
15
|
+
* update readme ([25dd0eb](https://github.com/a-chris/tantiny-in-memory/commit/25dd0eba68c0befc0dcfe0df95d507f429e78f6f))
|
16
|
+
|
3
17
|
## [1.0.6](https://github.com/a-chris/tantiny-in-memory/compare/v1.0.5...v1.0.6) (2023-09-04)
|
4
18
|
|
5
19
|
|
data/Cargo.toml
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
[package]
|
2
2
|
name = "tantiny"
|
3
|
-
version = "1.0.
|
3
|
+
version = "1.0.8" # {x-release-please-version}
|
4
4
|
edition = "2021"
|
5
5
|
authors = ["Christian Toscano"]
|
6
6
|
repository = "https://github.com/a-chris/tantiny-in-memory"
|
@@ -10,7 +10,7 @@ crate-type = ["cdylib"]
|
|
10
10
|
|
11
11
|
[dependencies]
|
12
12
|
rutie = "0.8"
|
13
|
-
tantivy = "0.
|
13
|
+
tantivy = "0.21"
|
14
14
|
lazy_static = "1.4"
|
15
15
|
paste = "1.0"
|
16
16
|
|
data/README.md
CHANGED
@@ -14,7 +14,7 @@ Tantiny is not exactly Ruby bindings to Tantivy, but it tries to be close. The m
|
|
14
14
|
Take a look at the most basic example:
|
15
15
|
|
16
16
|
```ruby
|
17
|
-
index = Tantiny::Index.new(
|
17
|
+
index = Tantiny::Index.new(nil) { text :description }
|
18
18
|
|
19
19
|
index << { id: 1, description: "Hello World!" }
|
20
20
|
index << { id: 2, description: "What's up?" }
|
@@ -30,7 +30,7 @@ index.search("world") # 1, 3
|
|
30
30
|
Add this line to your application's Gemfile:
|
31
31
|
|
32
32
|
```ruby
|
33
|
-
gem "tantiny"
|
33
|
+
gem "tantiny-in-memory"
|
34
34
|
```
|
35
35
|
|
36
36
|
And then execute:
|
@@ -39,7 +39,7 @@ And then execute:
|
|
39
39
|
|
40
40
|
Or install it yourself as:
|
41
41
|
|
42
|
-
$ gem install tantiny
|
42
|
+
$ gem install tantiny-in-memory
|
43
43
|
|
44
44
|
You don't **have to** have Rust installed on your system since Tantiny will try to download the pre-compiled binaries hosted on GitHub releases during the installation. However, if no pre-compiled binaries were found for your system (which is a combination of platform, architecture, and Ruby version) you will need to [install Rust](https://www.rust-lang.org/tools/install) first.
|
45
45
|
|
@@ -52,7 +52,7 @@ Please, make sure to specify the minor version when declaring dependency on `tan
|
|
52
52
|
You have to specify a path to where the index would be stored and a block that defines the schema:
|
53
53
|
|
54
54
|
```ruby
|
55
|
-
Tantiny::Index.new
|
55
|
+
Tantiny::Index.new(nil) do
|
56
56
|
id :imdb_id
|
57
57
|
facet :category
|
58
58
|
string :title
|
data/lib/tantiny/index.rb
CHANGED
@@ -81,6 +81,12 @@ module Tantiny
|
|
81
81
|
end
|
82
82
|
end
|
83
83
|
|
84
|
+
def raw_query_search(query, limit: DEFAULT_LIMIT)
|
85
|
+
raise ArgumentError, "Query must be a string" unless query.is_a?(String)
|
86
|
+
|
87
|
+
__raw_query_search(query, limit)
|
88
|
+
end
|
89
|
+
|
84
90
|
def search(query, limit: DEFAULT_LIMIT, **smart_query_options)
|
85
91
|
unless query.is_a?(Query)
|
86
92
|
fields = schema.text_fields
|
data/lib/tantiny/schema.rb
CHANGED
@@ -32,17 +32,29 @@ module Tantiny
|
|
32
32
|
|
33
33
|
private
|
34
34
|
|
35
|
-
def id(key)
|
35
|
+
def id(key)
|
36
|
+
@id_field = key
|
37
|
+
end
|
36
38
|
|
37
|
-
def string(key)
|
39
|
+
def string(key)
|
40
|
+
@string_fields << key
|
41
|
+
end
|
38
42
|
|
39
|
-
def integer(key)
|
43
|
+
def integer(key)
|
44
|
+
@integer_fields << key
|
45
|
+
end
|
40
46
|
|
41
|
-
def double(key)
|
47
|
+
def double(key)
|
48
|
+
@double_fields << key
|
49
|
+
end
|
42
50
|
|
43
|
-
def date(key)
|
51
|
+
def date(key)
|
52
|
+
@date_fields << key
|
53
|
+
end
|
44
54
|
|
45
|
-
def facet(key)
|
55
|
+
def facet(key)
|
56
|
+
@facet_fields << key
|
57
|
+
end
|
46
58
|
|
47
59
|
def text(key, tokenizer: nil)
|
48
60
|
@field_tokenizers[key] = tokenizer if tokenizer
|
data/lib/tantiny/version.rb
CHANGED
data/src/index.rs
CHANGED
@@ -1,7 +1,9 @@
|
|
1
1
|
use std::collections::HashMap;
|
2
|
-
use std::str::FromStr;
|
3
2
|
use rutie::{methods, Object, AnyObject, Integer, NilClass, Array, RString, Hash};
|
4
|
-
use tantivy::
|
3
|
+
use tantivy::time::OffsetDateTime;
|
4
|
+
use tantivy::time::format_description::well_known::Rfc3339;
|
5
|
+
use tantivy::{doc, DateTime, Document, Term, ReloadPolicy, Index, IndexWriter, IndexReader};
|
6
|
+
use tantivy::query::QueryParser;
|
5
7
|
use tantivy::schema::{Schema, TextOptions, TextFieldIndexing, IndexRecordOption, FacetOptions, STRING, STORED, INDEXED, FAST};
|
6
8
|
use tantivy::collector::TopDocs;
|
7
9
|
|
@@ -90,7 +92,7 @@ methods!(
|
|
90
92
|
}
|
91
93
|
|
92
94
|
for field in facet_fields {
|
93
|
-
let options = FacetOptions::default()
|
95
|
+
let options = FacetOptions::default();
|
94
96
|
schema_builder.add_facet_field(&field, options);
|
95
97
|
}
|
96
98
|
|
@@ -98,10 +100,10 @@ methods!(
|
|
98
100
|
let index = Index::create_in_ram(schema.clone());
|
99
101
|
let tokenizers = index.tokenizers();
|
100
102
|
|
101
|
-
tokenizers.register("default", unwrap_tokenizer(
|
103
|
+
tokenizers.register("default", (&unwrap_tokenizer(default_tokenizer)).clone());
|
102
104
|
|
103
105
|
for (field, tokenizer) in field_tokenizers {
|
104
|
-
tokenizers.register(&field, unwrap_tokenizer(
|
106
|
+
tokenizers.register(&field, (&unwrap_tokenizer(tokenizer)).clone())
|
105
107
|
}
|
106
108
|
|
107
109
|
let index_writer = None;
|
@@ -168,8 +170,8 @@ methods!(
|
|
168
170
|
|
169
171
|
for (key, value) in date_fields.iter() {
|
170
172
|
let field = schema.get_field(key).try_unwrap();
|
171
|
-
let value = DateTime::
|
172
|
-
doc.add_date(field,
|
173
|
+
let value = DateTime::from_utc(OffsetDateTime::parse(value, &Rfc3339).unwrap());
|
174
|
+
doc.add_date(field, value);
|
173
175
|
}
|
174
176
|
|
175
177
|
for (key, value) in facet_fields.iter() {
|
@@ -239,6 +241,44 @@ methods!(
|
|
239
241
|
NilClass::new()
|
240
242
|
}
|
241
243
|
|
244
|
+
fn raw_query_search(
|
245
|
+
query_string: RString,
|
246
|
+
limit: Integer
|
247
|
+
) -> Array {
|
248
|
+
try_unwrap_params!(
|
249
|
+
query_string: String,
|
250
|
+
limit: i64
|
251
|
+
);
|
252
|
+
|
253
|
+
let internal = unwrap_index(&_itself);
|
254
|
+
let content_field = internal.schema.get_field("content").try_unwrap();
|
255
|
+
let query_parser = QueryParser::for_index(
|
256
|
+
&internal.index,
|
257
|
+
vec![content_field],
|
258
|
+
);
|
259
|
+
|
260
|
+
let query = query_parser.parse_query(&query_string).try_unwrap();
|
261
|
+
let id_field = internal.schema.get_field("id").try_unwrap();
|
262
|
+
let searcher = internal.index_reader.searcher();
|
263
|
+
|
264
|
+
let top_docs = searcher
|
265
|
+
.search(&*query, &TopDocs::with_limit(limit as usize))
|
266
|
+
.try_unwrap();
|
267
|
+
|
268
|
+
let mut array = Array::with_capacity(top_docs.len());
|
269
|
+
|
270
|
+
for (_score, doc_address) in top_docs {
|
271
|
+
let doc = searcher.doc(doc_address).try_unwrap();
|
272
|
+
if let Some(value) = doc.get_first(id_field) {
|
273
|
+
if let Some(id) = (&*value).as_text() {
|
274
|
+
array.push(RString::from(String::from(id)));
|
275
|
+
}
|
276
|
+
}
|
277
|
+
}
|
278
|
+
|
279
|
+
array
|
280
|
+
}
|
281
|
+
|
242
282
|
fn search(
|
243
283
|
query: AnyObject,
|
244
284
|
limit: Integer
|
@@ -262,7 +302,7 @@ methods!(
|
|
262
302
|
for (_score, doc_address) in top_docs {
|
263
303
|
let doc = searcher.doc(doc_address).try_unwrap();
|
264
304
|
if let Some(value) = doc.get_first(id_field) {
|
265
|
-
if let Some(id) = (&*value).
|
305
|
+
if let Some(id) = (&*value).as_text() {
|
266
306
|
array.push(RString::from(String::from(id)));
|
267
307
|
}
|
268
308
|
}
|
@@ -282,5 +322,6 @@ pub(super) fn init() {
|
|
282
322
|
klass.def("__commit", commit);
|
283
323
|
klass.def("__reload", reload);
|
284
324
|
klass.def("__search", search);
|
325
|
+
klass.def("__raw_query_search", raw_query_search);
|
285
326
|
});
|
286
327
|
}
|
data/src/query.rs
CHANGED
@@ -1,9 +1,10 @@
|
|
1
|
-
use std::str::FromStr;
|
2
1
|
use std::ops::Bound::Included;
|
3
2
|
use rutie::{methods, Object, AnyObject, Integer, Float, Array, RString};
|
4
3
|
use tantivy::{Term, DateTime};
|
5
4
|
use tantivy::schema::{IndexRecordOption, Facet, Type, FieldType};
|
6
5
|
use tantivy::query::*;
|
6
|
+
use tantivy::time::format_description::well_known::Rfc3339;
|
7
|
+
use tantivy::time::OffsetDateTime;
|
7
8
|
|
8
9
|
use crate::helpers::{try_unwrap_params, scaffold, TryUnwrap};
|
9
10
|
use crate::index::{unwrap_index, RTantinyIndex};
|
@@ -133,13 +134,13 @@ methods!(
|
|
133
134
|
FieldType::Date(_) => {
|
134
135
|
let from: String = from.try_unwrap();
|
135
136
|
let to: String = to.try_unwrap();
|
136
|
-
let from = DateTime::
|
137
|
-
let to = DateTime::
|
137
|
+
let from = DateTime::from_utc(OffsetDateTime::parse(&from, &Rfc3339).unwrap());
|
138
|
+
let to = DateTime::from_utc(OffsetDateTime::parse(&to, &Rfc3339).unwrap());
|
138
139
|
|
139
140
|
Ok((
|
140
141
|
Type::Date,
|
141
|
-
Included(Term::from_field_date(field,
|
142
|
-
Included(Term::from_field_date(field,
|
142
|
+
Included(Term::from_field_date(field, from)),
|
143
|
+
Included(Term::from_field_date(field, to))
|
143
144
|
))
|
144
145
|
},
|
145
146
|
FieldType::I64(_) => {
|
@@ -167,7 +168,7 @@ methods!(
|
|
167
168
|
|
168
169
|
let (value_type, left, right) = range.try_unwrap();
|
169
170
|
|
170
|
-
let query = RangeQuery::new_term_bounds(
|
171
|
+
let query = RangeQuery::new_term_bounds(field_name.to_string(), value_type, &left, &right);
|
171
172
|
|
172
173
|
wrap_query(Box::new(query))
|
173
174
|
}
|
@@ -257,4 +258,4 @@ pub(super) fn init() {
|
|
257
258
|
klass.def("__negation", negation);
|
258
259
|
klass.def("__boost", boost);
|
259
260
|
});
|
260
|
-
}
|
261
|
+
}
|
data/src/tokenizer.rs
CHANGED
@@ -15,8 +15,8 @@ fn wrap_tokenizer(tokenizer: TextAnalyzer) -> RTantinyTokenizer {
|
|
15
15
|
)
|
16
16
|
}
|
17
17
|
|
18
|
-
pub(crate) fn unwrap_tokenizer(tokenizer:
|
19
|
-
|
18
|
+
pub(crate) fn unwrap_tokenizer(tokenizer: RTantinyTokenizer) -> TextAnalyzer {
|
19
|
+
tokenizer.get_data(&*TANTINY_TOKENIZER_WRAPPER).0.clone()
|
20
20
|
}
|
21
21
|
|
22
22
|
#[rustfmt::skip::macros(methods)]
|
@@ -25,9 +25,10 @@ methods!(
|
|
25
25
|
_itself,
|
26
26
|
|
27
27
|
fn new_simple_tokenizer() -> RTantinyTokenizer {
|
28
|
-
let tokenizer = TextAnalyzer::
|
28
|
+
let tokenizer = TextAnalyzer::builder(SimpleTokenizer::default())
|
29
29
|
.filter(RemoveLongFilter::limit(40))
|
30
|
-
.filter(LowerCaser)
|
30
|
+
.filter(LowerCaser)
|
31
|
+
.build();
|
31
32
|
|
32
33
|
wrap_tokenizer(tokenizer)
|
33
34
|
}
|
@@ -36,10 +37,11 @@ methods!(
|
|
36
37
|
try_unwrap_params!(locale_code: String);
|
37
38
|
|
38
39
|
let language: LanguageWrapper = locale_code.parse().try_unwrap();
|
39
|
-
let tokenizer = TextAnalyzer::
|
40
|
+
let tokenizer = TextAnalyzer::builder(SimpleTokenizer::default())
|
40
41
|
.filter(RemoveLongFilter::limit(40))
|
41
42
|
.filter(LowerCaser)
|
42
|
-
.filter(Stemmer::new(language.0))
|
43
|
+
.filter(Stemmer::new(language.0))
|
44
|
+
.build();
|
43
45
|
|
44
46
|
wrap_tokenizer(tokenizer)
|
45
47
|
}
|
@@ -61,13 +63,14 @@ methods!(
|
|
61
63
|
prefix_only
|
62
64
|
);
|
63
65
|
|
64
|
-
wrap_tokenizer(TextAnalyzer::from(tokenizer))
|
66
|
+
wrap_tokenizer(TextAnalyzer::from(tokenizer.try_unwrap()))
|
65
67
|
}
|
66
68
|
|
67
69
|
fn extract_terms(text: RString) -> Array {
|
68
70
|
try_unwrap_params!(text: String);
|
69
71
|
|
70
|
-
let mut
|
72
|
+
let mut tokenizer: TextAnalyzer = unwrap_tokenizer(_itself);
|
73
|
+
let mut token_stream = tokenizer.token_stream(&text);
|
71
74
|
let mut terms = vec![];
|
72
75
|
|
73
76
|
while token_stream.advance() {
|
@@ -91,4 +94,7 @@ pub(super) fn init() {
|
|
91
94
|
klass.def_self("__new_ngram_tokenizer", new_ngram_tokenizer);
|
92
95
|
klass.def("__extract_terms", extract_terms);
|
93
96
|
});
|
94
|
-
}
|
97
|
+
}
|
98
|
+
|
99
|
+
|
100
|
+
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tantiny-in-memory
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.
|
4
|
+
version: 1.0.8
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Christian Toscano
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-09-
|
11
|
+
date: 2023-09-22 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: ruby-next
|
@@ -95,7 +95,6 @@ files:
|
|
95
95
|
- bin/console
|
96
96
|
- bin/setup
|
97
97
|
- ext/Rakefile
|
98
|
-
- lib/.rbnext/3.0/tantiny/schema.rb
|
99
98
|
- lib/tantiny-in-memory.rb
|
100
99
|
- lib/tantiny.rb
|
101
100
|
- lib/tantiny/errors.rb
|
@@ -1,53 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
module Tantiny
|
4
|
-
class Schema
|
5
|
-
attr_reader :default_tokenizer,
|
6
|
-
:id_field,
|
7
|
-
:text_fields,
|
8
|
-
:string_fields,
|
9
|
-
:integer_fields,
|
10
|
-
:double_fields,
|
11
|
-
:date_fields,
|
12
|
-
:facet_fields,
|
13
|
-
:field_tokenizers
|
14
|
-
|
15
|
-
def initialize(tokenizer, &block)
|
16
|
-
@default_tokenizer = tokenizer
|
17
|
-
@id_field = :id
|
18
|
-
@text_fields = []
|
19
|
-
@string_fields = []
|
20
|
-
@integer_fields = []
|
21
|
-
@double_fields = []
|
22
|
-
@date_fields = []
|
23
|
-
@facet_fields = []
|
24
|
-
@field_tokenizers = {}
|
25
|
-
|
26
|
-
instance_exec(&block)
|
27
|
-
end
|
28
|
-
|
29
|
-
def tokenizer_for(field)
|
30
|
-
field_tokenizers[field] || default_tokenizer
|
31
|
-
end
|
32
|
-
|
33
|
-
private
|
34
|
-
|
35
|
-
def id(key) ; @id_field = key; end
|
36
|
-
|
37
|
-
def string(key) ; @string_fields << key; end
|
38
|
-
|
39
|
-
def integer(key) ; @integer_fields << key; end
|
40
|
-
|
41
|
-
def double(key) ; @double_fields << key; end
|
42
|
-
|
43
|
-
def date(key) ; @date_fields << key; end
|
44
|
-
|
45
|
-
def facet(key) ; @facet_fields << key; end
|
46
|
-
|
47
|
-
def text(key, tokenizer: nil)
|
48
|
-
@field_tokenizers[key] = tokenizer if tokenizer
|
49
|
-
|
50
|
-
@text_fields << key
|
51
|
-
end
|
52
|
-
end
|
53
|
-
end
|