tantiny 0.3.3 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -0
- data/Cargo.toml +9 -6
- data/README.md +118 -42
- data/bin/console +2 -3
- data/lib/tantiny/errors.rb +1 -1
- data/lib/tantiny/index.rb +29 -19
- data/lib/tantiny/query.rb +21 -16
- data/lib/tantiny/schema.rb +2 -2
- data/lib/tantiny/version.rb +1 -1
- data/lib/tantiny.rb +21 -10
- data/lib/tantiny.so +0 -0
- data/src/helpers.rs +71 -191
- data/src/index.rs +310 -197
- data/src/lib.rs +12 -9
- data/src/query.rs +246 -203
- data/src/tokenizer.rs +62 -75
- metadata +44 -43
- data/lib/.rbnext/3.0/tantiny/schema.rb +0 -53
- data/sig/tantiny/errors.rbs +0 -20
- data/sig/tantiny/helpers.rbs +0 -8
- data/sig/tantiny/index.rbs +0 -103
- data/sig/tantiny/query.rbs +0 -135
- data/sig/tantiny/schema.rbs +0 -26
- data/sig/tantiny/tokenizer.rbs +0 -25
- data/sig/tantiny/version.rbs +0 -3
- data/sig/tantiny.rbs +0 -5
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 32334d17636719a204b09795443ba26989c1511e515965649af0e92aa0ee5d5a
|
|
4
|
+
data.tar.gz: 2c596b09325d57012e7987e5c8eba5eb8e9e81f93f5fa3d99859567407f10c9f
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 797d85d76769bf0165f8ecc81d652890d0603806b005de9cece8a3bb6b8b0f6866b4b53fd42caee0738cc43cc9b2e383b1f08ab28f1e706c6231c908bd7334dc
|
|
7
|
+
data.tar.gz: c683bcb69c47af11da1020cffaa40a9aad40eef358e8c87674b39a9678600f987606db0945632b0d604db4c66a7d634c980fd9446f2674a53c5d42692e4e5913
|
data/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,18 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## [0.4.0](https://github.com/altertable-ai/tantiny/compare/tantiny-v0.3.3...tantiny/v0.4.0) (2025-11-01)
|
|
4
|
+
|
|
5
|
+
[Resume development & transfer ownership](https://github.com/altertable-ai/tantiny/pull/1) following https://github.com/baygeldin/tantiny/pull/24
|
|
6
|
+
|
|
7
|
+
### Features
|
|
8
|
+
|
|
9
|
+
* Add simple highlighting feature ([7d4273e](https://github.com/altertable-ai/tantiny/commit/7d4273e657b658a670fc882714bb319d0b3b374c))
|
|
10
|
+
* Add in-memory indexing capabilities ([7d4273e](https://github.com/altertable-ai/tantiny/commit/7d4273e657b658a670fc882714bb319d0b3b374c))
|
|
11
|
+
|
|
12
|
+
### Bug Fixes
|
|
13
|
+
|
|
14
|
+
* Support Ruby 3.4 moving from Rutie to Magnus ([7d4273e](https://github.com/altertable-ai/tantiny/commit/7d4273e657b658a670fc882714bb319d0b3b374c))
|
|
15
|
+
|
|
3
16
|
### [0.3.3](https://github.com/baygeldin/tantiny/compare/v0.3.2...v0.3.3) (2022-04-29)
|
|
4
17
|
|
|
5
18
|
|
data/Cargo.toml
CHANGED
|
@@ -1,20 +1,23 @@
|
|
|
1
1
|
[package]
|
|
2
2
|
name = "tantiny"
|
|
3
|
-
version = "0.
|
|
3
|
+
version = "0.4.0" # {x-release-please-version}
|
|
4
4
|
edition = "2021"
|
|
5
|
-
authors = ["Alexander Baygeldin"]
|
|
6
|
-
repository = "https://github.com/
|
|
5
|
+
authors = ["Sylvain Utard", "Alexander Baygeldin"]
|
|
6
|
+
repository = "https://github.com/altertable-ai/tantiny"
|
|
7
7
|
|
|
8
8
|
[lib]
|
|
9
|
+
name = "tantiny"
|
|
9
10
|
crate-type = ["cdylib"]
|
|
10
11
|
|
|
11
12
|
[dependencies]
|
|
12
|
-
|
|
13
|
-
tantivy = "0.
|
|
13
|
+
magnus = { version = "0.8", features = ["rb-sys"] }
|
|
14
|
+
tantivy = "0.25"
|
|
14
15
|
lazy_static = "1.4"
|
|
15
16
|
paste = "1.0"
|
|
17
|
+
time = { version = "0.3", features = ["parsing", "formatting"] }
|
|
18
|
+
levenshtein_automata = "0.2.1"
|
|
16
19
|
|
|
17
20
|
[package.metadata.thermite]
|
|
18
21
|
github_releases = true
|
|
19
22
|
github_release_type = "latest"
|
|
20
|
-
git_tag_regex = "^v(\\d+\\.\\d+\\.\\d+)$"
|
|
23
|
+
git_tag_regex = "^v(\\d+\\.\\d+\\.\\d+)$"
|
data/README.md
CHANGED
|
@@ -1,21 +1,24 @@
|
|
|
1
|
-
[](https://codeclimate.com/github/baygeldin/tantiny/test_coverage)
|
|
1
|
+
[](https://github.com/altertable-ai/tantiny/actions/workflows/build.yml) [](https://rubygems.org/gems/tantiny) [](https://opensource.org/licenses/MIT)
|
|
2
|
+
|
|
3
|
+
> This is a fork of the [original Tantiny](https://github.com/baygeldin/tantiny) gem by [Alexander Baygeldin](https://github.com/baygeldin). Following https://github.com/baygeldin/tantiny/pull/24 we agreed to transfer ownership of the gem to [Altertable](https://github.com/altertable-ai) so we can keep it up to date with the latest versions of Tantivy and Ruby.
|
|
5
4
|
|
|
6
5
|
# Tantiny
|
|
7
6
|
|
|
8
|
-
Need a fast full-text search for your Ruby script, but
|
|
7
|
+
Need a fast full-text search for your Ruby script, but don't want to host/operate a full-blown search engine yet?
|
|
9
8
|
|
|
10
|
-
You're in the right place. **Tantiny** is a minimalistic full-text search library for Ruby based on [Tanti**v**y](https://github.com/quickwit-oss/tantivy) (an awesome alternative to Apache Lucene written in Rust). It's great for cases when your task at hand requires a full-text search, but configuring a full-blown distributed search engine would take more time than the task itself. And even if you already use such an engine in your project (which is highly likely, actually), it still might be easier to just use Tantiny instead because unlike Solr
|
|
9
|
+
You're in the right place. **Tantiny** is a minimalistic full-text search library for Ruby based on [Tanti**v**y](https://github.com/quickwit-oss/tantivy) (an awesome alternative to Apache Lucene written in Rust). It's great for cases when your task at hand requires a full-text search, but configuring a full-blown distributed search engine would take more time than the task itself. And even if you already use such an engine in your project (which is highly likely, actually), it still might be easier to just use Tantiny instead because unlike Solr, Elasticsearch, or any hosted search engine it doesn't need _anything_ to work (no separate server, process, API or whatever), it's purely embeddable. So, when you find yourself in a situation when using your search engine of choice would be tricky/inconvenient or would require additional setup you can always revert back to a quick and dirty solution that is nonetheless flexible and fast.
|
|
11
10
|
|
|
12
11
|
Tantiny is not exactly Ruby bindings to Tantivy, but it tries to be close. The main philosophy is to provide low-level access to Tantivy's inverted index, but with a nice Ruby-esque API, sensible defaults, and additional functionality sprinkled on top.
|
|
13
12
|
|
|
14
13
|
Take a look at the most basic example:
|
|
15
14
|
|
|
16
15
|
```ruby
|
|
16
|
+
# Persisted index
|
|
17
17
|
index = Tantiny::Index.new("/path/to/index") { text :description }
|
|
18
18
|
|
|
19
|
+
# Or in-memory (no persistence)
|
|
20
|
+
index = Tantiny::Index.new { text :description }
|
|
21
|
+
|
|
19
22
|
index << { id: 1, description: "Hello World!" }
|
|
20
23
|
index << { id: 2, description: "What's up?" }
|
|
21
24
|
index << { id: 3, description: "Goodbye World!" }
|
|
@@ -43,37 +46,51 @@ Or install it yourself as:
|
|
|
43
46
|
|
|
44
47
|
You don't **have to** have Rust installed on your system since Tantiny will try to download the pre-compiled binaries hosted on GitHub releases during the installation. However, if no pre-compiled binaries were found for your system (which is a combination of platform, architecture, and Ruby version) you will need to [install Rust](https://www.rust-lang.org/tools/install) first.
|
|
45
48
|
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
Please, make sure to specify the minor version when declaring dependency on `tantiny`. The API is a subject to change, and until it reaches `1.0.0` a bump in the minor version will most likely signify a breaking change.
|
|
49
|
+
## Defining the index schema
|
|
49
50
|
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
You have to specify a path to where the index would be stored and a block that defines the schema:
|
|
51
|
+
Whether you want to use a persisted index or an in-memory index, you need to define the schema first:
|
|
53
52
|
|
|
54
53
|
```ruby
|
|
55
|
-
Tantiny::Index.new
|
|
54
|
+
Tantiny::Index.new(path_or_memory) do
|
|
56
55
|
id :imdb_id
|
|
57
|
-
facet :category
|
|
58
56
|
string :title
|
|
59
57
|
text :description
|
|
60
58
|
integer :duration
|
|
61
59
|
double :rating
|
|
62
60
|
date :release_date
|
|
61
|
+
facet :category
|
|
63
62
|
end
|
|
64
63
|
```
|
|
65
64
|
|
|
65
|
+
## In-memory indexes
|
|
66
|
+
|
|
67
|
+
For small to medium datasets or temporary search needs (or tests!), you can create an in-memory index by omitting the path parameter:
|
|
68
|
+
|
|
69
|
+
```ruby
|
|
70
|
+
index = Tantiny::Index.new do
|
|
71
|
+
text :title
|
|
72
|
+
text :description
|
|
73
|
+
double :price
|
|
74
|
+
end
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
In-memory indexes are perfect when you don't need persistence between runs, or when you're building a search index from data that already exists in a database. They offer the same full-text search capabilities without any file I/O overhead.
|
|
78
|
+
|
|
79
|
+
## Field types
|
|
80
|
+
|
|
66
81
|
Here are the descriptions for every field type:
|
|
67
82
|
|
|
68
|
-
| Type
|
|
69
|
-
|
|
|
70
|
-
| id
|
|
71
|
-
| facet
|
|
72
|
-
| string
|
|
73
|
-
| text
|
|
74
|
-
| integer | Fields with integer values.
|
|
75
|
-
| double | Fields with float values.
|
|
76
|
-
| date
|
|
83
|
+
| Type | Description |
|
|
84
|
+
| ------- | ----------------------------------------------------------------------- |
|
|
85
|
+
| id | Specifies where documents' ids are stored (defaults to `:id`). |
|
|
86
|
+
| facet | Fields with values like `/animals/birds` (i.e. hierarchical categories). |
|
|
87
|
+
| string | Fields with text that are **not** tokenized. |
|
|
88
|
+
| text | Fields with text that are tokenized by the specified tokenizer. |
|
|
89
|
+
| integer | Fields with integer values. |
|
|
90
|
+
| double | Fields with float values. |
|
|
91
|
+
| date | Fields with either `DateTime` type or something that converts to it. |
|
|
92
|
+
|
|
93
|
+
Each field can either be a single value or an array of values.
|
|
77
94
|
|
|
78
95
|
## Managing documents
|
|
79
96
|
|
|
@@ -142,7 +159,7 @@ index.transaction do
|
|
|
142
159
|
end
|
|
143
160
|
```
|
|
144
161
|
|
|
145
|
-
Transactions group changes and [commit](https://docs.rs/tantivy/latest/tantivy/struct.IndexWriter.html#method.commit) them to the index in one go. This is
|
|
162
|
+
Transactions group changes and [commit](https://docs.rs/tantivy/latest/tantivy/struct.IndexWriter.html#method.commit) them to the index in one go. This is _dramatically_ more efficient than performing these changes one by one. In fact, all writing operations (i.e. `<<` and `delete`) are wrapped in a transaction implicitly when you call them outside of a transaction, so calling `<<` 10 times outside of a transaction is the same thing as performing 10 separate transactions.
|
|
146
163
|
|
|
147
164
|
### Concurrency and thread-safety
|
|
148
165
|
|
|
@@ -182,7 +199,7 @@ You may wonder, how exactly does it conduct the search? Well, the default behavi
|
|
|
182
199
|
index.search("a dlunk, a kib, and an olt mab", fuzzy_distance: 1)
|
|
183
200
|
```
|
|
184
201
|
|
|
185
|
-
However, you can customize it by composing your own query out of basic building blocks:
|
|
202
|
+
However, you can customize it by composing your own query out of basic building blocks:
|
|
186
203
|
|
|
187
204
|
```ruby
|
|
188
205
|
popular_movies = index.range_query(:rating, 8.0..10.0)
|
|
@@ -198,20 +215,18 @@ I know, weird taste! But pretty cool, huh? Take a look at all the available quer
|
|
|
198
215
|
|
|
199
216
|
### Supported queries
|
|
200
217
|
|
|
201
|
-
| Query
|
|
202
|
-
|
|
|
203
|
-
| all_query
|
|
204
|
-
| empty_query
|
|
205
|
-
| term_query
|
|
206
|
-
| fuzzy_term_query | Documents that contain the specified term within a Levenshtein distance.
|
|
207
|
-
| phrase_query
|
|
208
|
-
| regex_query
|
|
209
|
-
| prefix_query
|
|
210
|
-
| range_query
|
|
211
|
-
| facet_query
|
|
212
|
-
| smart_query
|
|
213
|
-
|
|
214
|
-
Take a look at the [signatures file](https://github.com/baygeldin/tantiny/blob/main/sig/tantiny/query.rbs) to see what parameters do queries accept.
|
|
218
|
+
| Query | Behavior |
|
|
219
|
+
| ---------------- | -------------------------------------------------------------------------------------- |
|
|
220
|
+
| all_query | Returns all indexed documents. |
|
|
221
|
+
| empty_query | Returns exactly nothing (used internally). |
|
|
222
|
+
| term_query | Documents that contain the specified term. |
|
|
223
|
+
| fuzzy_term_query | Documents that contain the specified term within a Levenshtein distance. |
|
|
224
|
+
| phrase_query | Documents that contain the specified sequence of terms. |
|
|
225
|
+
| regex_query | Documents that contain a term that matches the specified regex. |
|
|
226
|
+
| prefix_query | Documents that contain a term with the specified prefix. |
|
|
227
|
+
| range_query | Documents with an `integer`, `double` or `date` field within the specified range. |
|
|
228
|
+
| facet_query | Documents that belong to the specified category. |
|
|
229
|
+
| smart_query | A combination of `term_query`, `fuzzy_term_query` and `prefix_query`. |
|
|
215
230
|
|
|
216
231
|
### Searching on multiple fields
|
|
217
232
|
|
|
@@ -276,7 +291,7 @@ The `regex_query` accepts the regex pattern, but it has to be a [Rust regex](htt
|
|
|
276
291
|
|
|
277
292
|
So, we've mentioned tokenizers more than once already. What are they?
|
|
278
293
|
|
|
279
|
-
Tokenizers is what Tantivy uses to chop your text onto terms to build an inverted index. Then you can search the index by these terms. It's an important concept to understand so that you don't get confused when `index.term_query(:description, "Hello")` returns nothing because `Hello` isn't a term, but `hello` is. You have to extract the terms from the query before searching the index. Currently, only `smart_query` does that for you. Also, the only field type that is tokenized is `text`, so for `string` fields you should use the exact match (i.e. `index.term_query(:title, "Hello")`).
|
|
294
|
+
Tokenizers are what Tantivy uses to chop your text into terms to build an inverted index. Then you can search the index by these terms. It's an important concept to understand so that you don't get confused when `index.term_query(:description, "Hello")` returns nothing because `Hello` isn't a term, but `hello` is. You have to extract the terms from the query before searching the index. Currently, only `smart_query` does that for you. Also, the only field type that is tokenized is `text`, so for `string` fields you should use the exact match (i.e. `index.term_query(:title, "Hello")`).
|
|
280
295
|
|
|
281
296
|
### Specifying the tokenizer
|
|
282
297
|
|
|
@@ -320,19 +335,80 @@ Ngram tokenizer chops your text onto ngrams of specified size.
|
|
|
320
335
|
tokenizer = Tantiny::Tokenizer.new(:ngram, min: 5, max: 10, prefix_only: true)
|
|
321
336
|
tokenizer.terms("Morrowind") # ["Morro", "Morrow", "Morrowi", "Morrowin", "Morrowind"]
|
|
322
337
|
```
|
|
338
|
+
|
|
323
339
|
## Retrieving documents
|
|
324
340
|
|
|
325
341
|
You may have noticed that `search` method returns only documents ids. This is by design. The documents themselves are **not** stored in the index. Tantiny is a minimalistic library, so it tries to keep things simple. If you need to retrieve a full document, use a key-value store like Redis alongside.
|
|
326
342
|
|
|
343
|
+
## Highlighting
|
|
344
|
+
|
|
345
|
+
Tantiny supports highlighting of search results. This is useful when you want to display the search results in a more readable format.
|
|
346
|
+
|
|
347
|
+
```ruby
|
|
348
|
+
Tantiny::Query.highlight(field_text, query_string)
|
|
349
|
+
```
|
|
350
|
+
|
|
351
|
+
It supports fuzzy highlighting by specifying the fuzzy distance.
|
|
352
|
+
|
|
353
|
+
```ruby
|
|
354
|
+
Tantiny::Query.highlight(field_text, query_string, fuzzy_distance: 2)
|
|
355
|
+
```
|
|
356
|
+
|
|
357
|
+
It also supports custom tokenizers, but make sure to use the same tokenizer that was used to index the field.
|
|
358
|
+
|
|
359
|
+
```ruby
|
|
360
|
+
tokenizer = Tantiny::Tokenizer.new(:stemmer, language: :fr)
|
|
361
|
+
Tantiny::Query.highlight(field_text, query_string, tokenizer: tokenizer)
|
|
362
|
+
```
|
|
363
|
+
|
|
364
|
+
This will return the text with the terms highlighted:
|
|
365
|
+
|
|
366
|
+
```ruby
|
|
367
|
+
Tantiny::Query.highlight("hellow world. you are welcome.", "hello you")
|
|
368
|
+
# "<b>hellow</b> world. <b>you</b> are welcome."
|
|
369
|
+
```
|
|
370
|
+
|
|
371
|
+
## Examples
|
|
372
|
+
|
|
373
|
+
The [examples directory](examples/) contains practical demonstrations of Tantiny's capabilities. These examples are great starting points for understanding how to use Tantiny in real-world scenarios.
|
|
374
|
+
|
|
375
|
+
### Simple Ranking Example
|
|
376
|
+
|
|
377
|
+
[`examples/simple_ranking.rb`](examples/simple_ranking.rb)
|
|
378
|
+
|
|
379
|
+
A minimal demonstration of field-based ranking showing:
|
|
380
|
+
|
|
381
|
+
- Creating an in-memory index
|
|
382
|
+
- Using boost values to rank title matches higher than description matches
|
|
383
|
+
- Side-by-side comparison of equal weights vs boosted fields
|
|
384
|
+
|
|
385
|
+
This is perfect for understanding the core concept of ranking in just a few lines of code.
|
|
386
|
+
|
|
387
|
+
### Ecommerce Example
|
|
388
|
+
|
|
389
|
+
[`examples/ecommerce.rb`](examples/ecommerce.rb)
|
|
390
|
+
|
|
391
|
+
A comprehensive example demonstrating in-memory search for a product catalog:
|
|
392
|
+
|
|
393
|
+
- **In-memory indexing** - Perfect for small to medium datasets without persistent storage
|
|
394
|
+
- **Product search** - Indexing products with various attributes (title, description, category, price, stock)
|
|
395
|
+
- **Fuzzy search** - Handling typos and misspellings (e.g., "loptop" → "laptop")
|
|
396
|
+
- **Field-based ranking** - Boosting title matches to rank higher than description matches
|
|
397
|
+
- **Complex queries** - Combining multiple conditions with AND/OR operators
|
|
398
|
+
- **Category filtering** - Filtering products by exact category match
|
|
399
|
+
- **Price range queries** - Finding products within a specific price range
|
|
400
|
+
|
|
401
|
+
See the [examples README](examples/README.md) for more details.
|
|
402
|
+
|
|
327
403
|
## Development
|
|
328
404
|
|
|
329
405
|
After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake build` to build native extensions, and then `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
|
330
406
|
|
|
331
|
-
We use [conventional commits](https://www.conventionalcommits.org) to automatically generate the CHANGELOG, bump the semantic version, and to publish and release the gem. All you need to do is stick to the convention and [CI will take care of everything else](https://github.com/
|
|
407
|
+
We use [conventional commits](https://www.conventionalcommits.org) to automatically generate the CHANGELOG, bump the semantic version, and to publish and release the gem. All you need to do is stick to the convention and [CI will take care of everything else](https://github.com/altertable-ai/tantiny/blob/main/.github/workflows/release.yml) for you.
|
|
332
408
|
|
|
333
409
|
## Contributing
|
|
334
410
|
|
|
335
|
-
Bug reports and pull requests are welcome on GitHub at https://github.com/
|
|
411
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/altertable-ai/tantiny.
|
|
336
412
|
|
|
337
413
|
## License
|
|
338
414
|
|
data/bin/console
CHANGED
|
@@ -3,17 +3,16 @@
|
|
|
3
3
|
|
|
4
4
|
require "bundler/setup"
|
|
5
5
|
require "pry"
|
|
6
|
+
require "ostruct"
|
|
6
7
|
|
|
7
8
|
require "tantiny"
|
|
8
9
|
|
|
9
|
-
path = File.join(__dir__, "../tmp")
|
|
10
|
-
|
|
11
10
|
options = {
|
|
12
11
|
tokenizer: Tantiny::Tokenizer.new(:stemmer, language: :en),
|
|
13
12
|
exclusive_writer: true,
|
|
14
13
|
}
|
|
15
14
|
|
|
16
|
-
index = Tantiny::Index.new(
|
|
15
|
+
index = Tantiny::Index.new(nil, **options) do
|
|
17
16
|
id :imdb_id
|
|
18
17
|
facet :category
|
|
19
18
|
string :title
|
data/lib/tantiny/errors.rb
CHANGED
data/lib/tantiny/index.rb
CHANGED
|
@@ -3,17 +3,18 @@
|
|
|
3
3
|
module Tantiny
|
|
4
4
|
class Index
|
|
5
5
|
LOCKFILE = ".tantiny.lock"
|
|
6
|
-
DEFAULT_WRITER_MEMORY =
|
|
6
|
+
DEFAULT_WRITER_MEMORY = 15_000_000 # 15MB
|
|
7
7
|
DEFAULT_LIMIT = 10
|
|
8
8
|
|
|
9
|
-
def self.new(path, **options, &
|
|
10
|
-
|
|
9
|
+
def self.new(path = nil, **options, &)
|
|
10
|
+
# Only create directory if path is provided
|
|
11
|
+
FileUtils.mkdir_p(path) if path
|
|
11
12
|
|
|
12
13
|
default_tokenizer = options[:tokenizer] || Tokenizer.default
|
|
13
|
-
schema = Schema.new(default_tokenizer, &
|
|
14
|
+
schema = Schema.new(default_tokenizer, &)
|
|
14
15
|
|
|
15
16
|
object = __new(
|
|
16
|
-
path
|
|
17
|
+
path&.to_s,
|
|
17
18
|
schema.default_tokenizer,
|
|
18
19
|
schema.field_tokenizers.transform_keys(&:to_s),
|
|
19
20
|
schema.text_fields.map(&:to_s),
|
|
@@ -44,6 +45,10 @@ module Tantiny
|
|
|
44
45
|
|
|
45
46
|
attr_reader :schema
|
|
46
47
|
|
|
48
|
+
def in_memory?
|
|
49
|
+
@path.nil?
|
|
50
|
+
end
|
|
51
|
+
|
|
47
52
|
def transaction
|
|
48
53
|
if inside_transaction?
|
|
49
54
|
yield
|
|
@@ -68,12 +73,12 @@ module Tantiny
|
|
|
68
73
|
transaction do
|
|
69
74
|
__add_document(
|
|
70
75
|
resolve(document, schema.id_field).to_s,
|
|
71
|
-
slice_document(document, schema.text_fields) { |v| v.to_s },
|
|
72
|
-
slice_document(document, schema.string_fields) { |v| v.to_s },
|
|
73
|
-
slice_document(document, schema.integer_fields) { |v| v.to_i },
|
|
74
|
-
slice_document(document, schema.double_fields) { |v| v.to_f },
|
|
75
|
-
slice_document(document, schema.date_fields) { |v| Helpers.timestamp(v) },
|
|
76
|
-
slice_document(document, schema.facet_fields) { |v| v.to_s }
|
|
76
|
+
slice_document(document, schema.text_fields) { |v| v.is_a?(Array) ? v.map(&:to_s) : v.to_s },
|
|
77
|
+
slice_document(document, schema.string_fields) { |v| v.is_a?(Array) ? v.map(&:to_s) : v.to_s },
|
|
78
|
+
slice_document(document, schema.integer_fields) { |v| v.is_a?(Array) ? v.map(&:to_i) : v.to_i },
|
|
79
|
+
slice_document(document, schema.double_fields) { |v| v.is_a?(Array) ? v.map(&:to_f) : v.to_f },
|
|
80
|
+
slice_document(document, schema.date_fields) { |v| v.is_a?(Array) ? v.map { |d| Helpers.timestamp(d) } : Helpers.timestamp(v) },
|
|
81
|
+
slice_document(document, schema.facet_fields) { |v| v.is_a?(Array) ? v.map(&:to_s) : v.to_s }
|
|
77
82
|
)
|
|
78
83
|
end
|
|
79
84
|
end
|
|
@@ -103,10 +108,10 @@ module Tantiny
|
|
|
103
108
|
|
|
104
109
|
private
|
|
105
110
|
|
|
106
|
-
def slice_document(document, fields, &
|
|
111
|
+
def slice_document(document, fields, &)
|
|
107
112
|
fields.inject({}) do |hash, field|
|
|
108
113
|
hash.tap { |h| h[field.to_s] = resolve(document, field) }
|
|
109
|
-
end.compact.transform_values(&
|
|
114
|
+
end.compact.transform_values(&)
|
|
110
115
|
end
|
|
111
116
|
|
|
112
117
|
def resolve(document, field)
|
|
@@ -115,9 +120,9 @@ module Tantiny
|
|
|
115
120
|
|
|
116
121
|
def acquire_index_writer
|
|
117
122
|
__acquire_index_writer(@indexer_memory)
|
|
118
|
-
rescue
|
|
123
|
+
rescue RuntimeError => e
|
|
119
124
|
case e.message
|
|
120
|
-
when /Failed to acquire Lockfile/
|
|
125
|
+
when /Failed to acquire Lockfile/, /LockBusy/
|
|
121
126
|
raise IndexWriterBusyError.new
|
|
122
127
|
else
|
|
123
128
|
raise
|
|
@@ -154,14 +159,19 @@ module Tantiny
|
|
|
154
159
|
@exclusive_writer
|
|
155
160
|
end
|
|
156
161
|
|
|
157
|
-
def synchronize(&
|
|
158
|
-
|
|
159
|
-
|
|
162
|
+
def synchronize(&)
|
|
163
|
+
# In-memory indexes don't need file locking
|
|
164
|
+
if in_memory?
|
|
165
|
+
@transaction_semaphore.synchronize(&)
|
|
166
|
+
else
|
|
167
|
+
@transaction_semaphore.synchronize do
|
|
168
|
+
Helpers.with_lock(lockfile_path, &)
|
|
169
|
+
end
|
|
160
170
|
end
|
|
161
171
|
end
|
|
162
172
|
|
|
163
173
|
def lockfile_path
|
|
164
|
-
@lockfile_path ||= File.join(@path, LOCKFILE)
|
|
174
|
+
@lockfile_path ||= @path && File.join(@path, LOCKFILE)
|
|
165
175
|
end
|
|
166
176
|
end
|
|
167
177
|
end
|
data/lib/tantiny/query.rb
CHANGED
|
@@ -32,37 +32,37 @@ module Tantiny
|
|
|
32
32
|
__new_empty_query
|
|
33
33
|
end
|
|
34
34
|
|
|
35
|
-
def term_query(index, fields, term, **
|
|
35
|
+
def term_query(index, fields, term, **)
|
|
36
36
|
allowed_fields = text_and_strings(index)
|
|
37
|
-
construct_query(index, :term, allowed_fields, fields, [term.to_s], **
|
|
37
|
+
construct_query(index, :term, allowed_fields, fields, [term.to_s], **)
|
|
38
38
|
end
|
|
39
39
|
|
|
40
|
-
def fuzzy_term_query(index, fields, term, distance = DEFAULT_FUZZY_DISTANCE, **
|
|
40
|
+
def fuzzy_term_query(index, fields, term, distance = DEFAULT_FUZZY_DISTANCE, **)
|
|
41
41
|
params = [term.to_s, distance.to_i]
|
|
42
42
|
allowed_fields = text_and_strings(index)
|
|
43
|
-
construct_query(index, :fuzzy_term, allowed_fields, fields, params, **
|
|
43
|
+
construct_query(index, :fuzzy_term, allowed_fields, fields, params, **)
|
|
44
44
|
end
|
|
45
45
|
|
|
46
|
-
def phrase_query(index, fields, phrase, **
|
|
46
|
+
def phrase_query(index, fields, phrase, **)
|
|
47
47
|
queries = [*fields].map do |f|
|
|
48
48
|
terms = index.schema.tokenizer_for(f).terms(phrase)
|
|
49
49
|
allowed_fields = index.schema.text_fields
|
|
50
|
-
construct_query(index, :phrase, allowed_fields, f, [terms], **
|
|
50
|
+
construct_query(index, :phrase, allowed_fields, f, [terms], **)
|
|
51
51
|
end
|
|
52
52
|
|
|
53
53
|
queries.empty? ? empty_query : disjunction(*queries)
|
|
54
54
|
end
|
|
55
55
|
|
|
56
|
-
def regex_query(index, fields, regex, **
|
|
56
|
+
def regex_query(index, fields, regex, **)
|
|
57
57
|
allowed_fields = text_and_strings(index)
|
|
58
|
-
construct_query(index, :regex, allowed_fields, fields, [regex.to_s], **
|
|
58
|
+
construct_query(index, :regex, allowed_fields, fields, [regex.to_s], **)
|
|
59
59
|
end
|
|
60
60
|
|
|
61
|
-
def prefix_query(index, fields, prefix, **
|
|
62
|
-
regex_query(index, fields, Regexp.escape(prefix) + ".*", **
|
|
61
|
+
def prefix_query(index, fields, prefix, **)
|
|
62
|
+
regex_query(index, fields, Regexp.escape(prefix) + ".*", **)
|
|
63
63
|
end
|
|
64
64
|
|
|
65
|
-
def range_query(index, fields, range, **
|
|
65
|
+
def range_query(index, fields, range, **)
|
|
66
66
|
schema = index.schema
|
|
67
67
|
|
|
68
68
|
case range.first
|
|
@@ -81,19 +81,19 @@ module Tantiny
|
|
|
81
81
|
end
|
|
82
82
|
|
|
83
83
|
# @type var allowed_fields: Array[Symbol]
|
|
84
|
-
construct_query(index, :range, allowed_fields, fields, [from, to], **
|
|
84
|
+
construct_query(index, :range, allowed_fields, fields, [from, to], **)
|
|
85
85
|
end
|
|
86
86
|
|
|
87
|
-
def facet_query(index, field, path, **
|
|
87
|
+
def facet_query(index, field, path, **)
|
|
88
88
|
allowed_fields = index.schema.facet_fields
|
|
89
|
-
construct_query(index, :facet, allowed_fields, field, [path], **
|
|
89
|
+
construct_query(index, :facet, allowed_fields, field, [path], **)
|
|
90
90
|
end
|
|
91
91
|
|
|
92
92
|
def smart_query(index, fields, query_string, **options)
|
|
93
93
|
fuzzy_distance = options[:fuzzy_distance]
|
|
94
94
|
boost_factor = options.fetch(:boost, DEFAULT_BOOST)
|
|
95
95
|
|
|
96
|
-
field_queries = [*fields].
|
|
96
|
+
field_queries = [*fields].filter_map do |field|
|
|
97
97
|
terms = index.schema.tokenizer_for(field).terms(query_string)
|
|
98
98
|
|
|
99
99
|
# See: https://github.com/soutaro/steep/issues/272
|
|
@@ -113,11 +113,16 @@ module Tantiny
|
|
|
113
113
|
last_term_query = prefix_query(index, field, terms.last) | term_queries.last
|
|
114
114
|
|
|
115
115
|
conjunction(last_term_query, *term_queries[0...-1])
|
|
116
|
-
end
|
|
116
|
+
end
|
|
117
117
|
|
|
118
118
|
disjunction(*field_queries).boost(boost_factor)
|
|
119
119
|
end
|
|
120
120
|
|
|
121
|
+
def highlight(text, query_string, fuzzy_distance: 0, tokenizer: Tantiny::Tokenizer.new(:simple))
|
|
122
|
+
terms = tokenizer.terms(query_string).map(&:to_s)
|
|
123
|
+
__highlight(text.to_s, terms, fuzzy_distance)
|
|
124
|
+
end
|
|
125
|
+
|
|
121
126
|
private
|
|
122
127
|
|
|
123
128
|
# Can't use variadic argument `params` here due to:
|
data/lib/tantiny/schema.rb
CHANGED
|
@@ -12,7 +12,7 @@ module Tantiny
|
|
|
12
12
|
:facet_fields,
|
|
13
13
|
:field_tokenizers
|
|
14
14
|
|
|
15
|
-
def initialize(tokenizer, &
|
|
15
|
+
def initialize(tokenizer, &)
|
|
16
16
|
@default_tokenizer = tokenizer
|
|
17
17
|
@id_field = :id
|
|
18
18
|
@text_fields = []
|
|
@@ -23,7 +23,7 @@ module Tantiny
|
|
|
23
23
|
@facet_fields = []
|
|
24
24
|
@field_tokenizers = {}
|
|
25
25
|
|
|
26
|
-
instance_exec(&
|
|
26
|
+
instance_exec(&)
|
|
27
27
|
end
|
|
28
28
|
|
|
29
29
|
def tokenizer_for(field)
|
data/lib/tantiny/version.rb
CHANGED
data/lib/tantiny.rb
CHANGED
|
@@ -1,10 +1,6 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
-
require "
|
|
4
|
-
RubyNext::Language.setup_gem_load_path
|
|
5
|
-
|
|
6
|
-
require "rutie"
|
|
7
|
-
require "thermite/fiddle"
|
|
3
|
+
require "fiddle/import"
|
|
8
4
|
require "concurrent"
|
|
9
5
|
require "fileutils"
|
|
10
6
|
|
|
@@ -19,9 +15,24 @@ require "tantiny/index"
|
|
|
19
15
|
module Tantiny
|
|
20
16
|
project_dir = File.expand_path("../..", __FILE__)
|
|
21
17
|
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
18
|
+
# Try multiple possible locations for the library
|
|
19
|
+
lib_paths = [
|
|
20
|
+
File.join(project_dir, "target", "release", "libtantiny.dylib"),
|
|
21
|
+
File.join(project_dir, "target", "debug", "libtantiny.dylib"),
|
|
22
|
+
File.join(project_dir, "target", "release", "libtantiny.so"),
|
|
23
|
+
File.join(project_dir, "target", "debug", "libtantiny.so"),
|
|
24
|
+
File.join(project_dir, "lib", "tantiny.bundle"),
|
|
25
|
+
File.join(project_dir, "lib", "tantiny.so"),
|
|
26
|
+
File.join(project_dir, "lib", "tantiny.dylib")
|
|
27
|
+
]
|
|
28
|
+
|
|
29
|
+
lib_path = lib_paths.find { |path| File.exist?(path) }
|
|
30
|
+
|
|
31
|
+
if lib_path.nil?
|
|
32
|
+
raise LoadError, "Could not find tantiny library in any of: #{lib_paths.join(", ")}"
|
|
33
|
+
end
|
|
34
|
+
|
|
35
|
+
# Load the library using Fiddle and call the init function
|
|
36
|
+
handle = Fiddle.dlopen(lib_path)
|
|
37
|
+
Fiddle::Function.new(handle["Init_tantiny"], [], Fiddle::TYPE_VOIDP).call
|
|
27
38
|
end
|
data/lib/tantiny.so
ADDED
|
Binary file
|