mini_search 1.0.3 → 1.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.devcontainer/devcontainer.json +31 -0
- data/Gemfile.lock +22 -18
- data/README.md +116 -36
- data/bin/console +1 -0
- data/lib/mini_search/compound_tokenizer.rb +15 -0
- data/lib/mini_search/inverted_index.rb +2 -2
- data/lib/mini_search/ngram_tokenizer.rb +15 -0
- data/lib/mini_search/pipeline.rb +13 -2
- data/lib/mini_search/version.rb +1 -1
- data/lib/mini_search.rb +42 -11
- data/mini_search.gemspec +3 -2
- metadata +27 -10
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 45bd263c2e9f4056e9401efe9f26bf735cafb41cdb9fef570e7426233ff6a92d
|
4
|
+
data.tar.gz: d29e30b91edf2434cb99d219b83b1dbba26e47190cf53be01a32355992af5042
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5e1661985c5bee26475a35424fe78b81b462ea90e41984623bb13c3621626a241b3107390c69ddae177b6b6a3fdb49886e647bc4f2650ee1a9a308e62f7350a8
|
7
|
+
data.tar.gz: 36b1a2729315cec0e2b3805dea9dd05c888b9a2795b9e57e34f7a454e98c589705bcf5ad2bb5df2c022e1b3dc6d8b7388d9b5ea4bd6dd04a9412ceb4e2b1a2d9
|
@@ -0,0 +1,31 @@
|
|
1
|
+
// For format details, see https://aka.ms/devcontainer.json. For config options, see the
|
2
|
+
// README at: https://github.com/devcontainers/templates/tree/main/src/ruby
|
3
|
+
{
|
4
|
+
"name": "Ruby",
|
5
|
+
// Or use a Dockerfile or Docker Compose file. More info: https://containers.dev/guide/dockerfile
|
6
|
+
"image": "mcr.microsoft.com/devcontainers/ruby:0-3.1-bullseye",
|
7
|
+
"features": {
|
8
|
+
"ghcr.io/devcontainers/features/git:1": {
|
9
|
+
"ppa": true,
|
10
|
+
"version": "os-provided"
|
11
|
+
}
|
12
|
+
},
|
13
|
+
"customizations": {
|
14
|
+
"vscode": {
|
15
|
+
"extensions": [
|
16
|
+
// Add the IDs of extensions you want installed when the container is created.
|
17
|
+
"rebornix.Ruby"
|
18
|
+
]
|
19
|
+
}
|
20
|
+
}
|
21
|
+
// Features to add to the dev container. More info: https://containers.dev/features.
|
22
|
+
// "features": {},
|
23
|
+
// Use 'forwardPorts' to make a list of ports inside the container available locally.
|
24
|
+
// "forwardPorts": [],
|
25
|
+
// Use 'postCreateCommand' to run commands after the container is created.
|
26
|
+
// "postCreateCommand": "ruby --version",
|
27
|
+
// Configure tool-specific properties.
|
28
|
+
// "customizations": {},
|
29
|
+
// Uncomment to connect as root instead. More info: https://aka.ms/dev-containers-non-root.
|
30
|
+
// "remoteUser": "root"
|
31
|
+
}
|
data/Gemfile.lock
CHANGED
@@ -1,35 +1,39 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
mini_search (1.0.
|
4
|
+
mini_search (1.0.4)
|
5
|
+
ruby_ngrams (~> 0.0.6)
|
5
6
|
|
6
7
|
GEM
|
7
8
|
remote: https://rubygems.org/
|
8
9
|
specs:
|
9
|
-
diff-lcs (1.
|
10
|
-
rake (
|
11
|
-
rspec (3.
|
12
|
-
rspec-core (~> 3.
|
13
|
-
rspec-expectations (~> 3.
|
14
|
-
rspec-mocks (~> 3.
|
15
|
-
rspec-core (3.
|
16
|
-
rspec-support (~> 3.
|
17
|
-
rspec-expectations (3.
|
10
|
+
diff-lcs (1.5.0)
|
11
|
+
rake (12.3.3)
|
12
|
+
rspec (3.12.0)
|
13
|
+
rspec-core (~> 3.12.0)
|
14
|
+
rspec-expectations (~> 3.12.0)
|
15
|
+
rspec-mocks (~> 3.12.0)
|
16
|
+
rspec-core (3.12.1)
|
17
|
+
rspec-support (~> 3.12.0)
|
18
|
+
rspec-expectations (3.12.2)
|
18
19
|
diff-lcs (>= 1.2.0, < 2.0)
|
19
|
-
rspec-support (~> 3.
|
20
|
-
rspec-mocks (3.
|
20
|
+
rspec-support (~> 3.12.0)
|
21
|
+
rspec-mocks (3.12.5)
|
21
22
|
diff-lcs (>= 1.2.0, < 2.0)
|
22
|
-
rspec-support (~> 3.
|
23
|
-
rspec-support (3.
|
23
|
+
rspec-support (~> 3.12.0)
|
24
|
+
rspec-support (3.12.0)
|
25
|
+
ruby_cli (0.2.1)
|
26
|
+
ruby_ngrams (0.0.6)
|
27
|
+
ruby_cli (>= 0.2.0)
|
24
28
|
|
25
29
|
PLATFORMS
|
26
|
-
|
30
|
+
x86_64-linux
|
27
31
|
|
28
32
|
DEPENDENCIES
|
29
|
-
bundler (~>
|
33
|
+
bundler (~> 2.4.10)
|
30
34
|
mini_search!
|
31
|
-
rake (~>
|
35
|
+
rake (~> 12.0)
|
32
36
|
rspec (~> 3.0)
|
33
37
|
|
34
38
|
BUNDLED WITH
|
35
|
-
|
39
|
+
2.4.10
|
data/README.md
CHANGED
@@ -20,6 +20,36 @@ Or install it yourself as:
|
|
20
20
|
|
21
21
|
$ gem install mini_search
|
22
22
|
|
23
|
+
## BM25 (from wikipedia)
|
24
|
+
|
25
|
+
BM25 is a bag-of-words retrieval function that ranks a set of documents based on the query terms appearing in each document, regardless
|
26
|
+
of their proximity within the document. It is a family of scoring functions with slightly different components and parameters.
|
27
|
+
One of the most prominent instantiations of the function is as follows.
|
28
|
+
|
29
|
+
Given a query Q, containing keywords `q1....qn` the BM25 score of a document `D` is:
|
30
|
+
|
31
|
+
![BM25 Formula](formula1.svg)
|
32
|
+
|
33
|
+
where `f(qi, D)` is qi's term frequency (tf) in the document `D`, `|D|` is the length of the document `D` in words, and avgdl is the
|
34
|
+
average document length in the text collection from which documents are drawn. `k1` and `b` are free parameters, usually chosen, in absence of
|
35
|
+
an advanced optimization, as `k1 in |1.2,2.0|` and `b = 0.75`. `IDF(qi)` is the IDF (inverse document frequency) weight of the query term
|
36
|
+
`qi`. It is usually computed as:
|
37
|
+
|
38
|
+
![IDF Formula](formula2.svg)
|
39
|
+
|
40
|
+
where `N` is the total number of documents in the collection, and `n(q)` is the number of documents containing `qi`.
|
41
|
+
|
42
|
+
There are several interpretations for IDF and slight variations on its formula. In the original BM25 derivation,
|
43
|
+
the IDF component is derived from the Binary Independence Model.
|
44
|
+
|
45
|
+
The above formula for IDF has drawbacks for terms appearing in more than half of the corpus documents. These terms' IDF is negative,
|
46
|
+
so for any two almost-identical documents, one which contains the term may be ranked lower than one which does not. This is often an
|
47
|
+
undesirable behavior, so many applications adjust the IDF formula in various ways:
|
48
|
+
|
49
|
+
Each summand can be given a floor of 0, to trim out common terms;
|
50
|
+
The IDF function can be given a floor of a constant `e`, to avoid common terms being ignored at all;
|
51
|
+
The IDF function can be replaced with a similarly shaped one which is non-negative, or strictly positive to avoid terms being ignored at all.
|
52
|
+
|
23
53
|
## Inverted Index
|
24
54
|
|
25
55
|
MiniSearch implements an inverted index (basically a hashmap where terms are keys and values are documents that contain that key.
|
@@ -165,42 +195,6 @@ With this changes our index would be:
|
|
165
195
|
Much better now. We could apply other steps, like removing some words that are irrelevant for us (stop words) or
|
166
196
|
add synonyms for some words but these other changes are language-specific.
|
167
197
|
|
168
|
-
TODO
|
169
|
-
|
170
|
-
## Language support (stop words, stemmers)
|
171
|
-
|
172
|
-
TODO
|
173
|
-
|
174
|
-
## BM25 (from wikipedia)
|
175
|
-
|
176
|
-
BM25 is a bag-of-words retrieval function that ranks a set of documents based on the query terms appearing in each document, regardless
|
177
|
-
of their proximity within the document. It is a family of scoring functions with slightly different components and parameters.
|
178
|
-
One of the most prominent instantiations of the function is as follows.
|
179
|
-
|
180
|
-
Given a query Q, containing keywords `q1....qn` the BM25 score of a document `D` is:
|
181
|
-
|
182
|
-
![BM25 Formula](formula1.svg)
|
183
|
-
|
184
|
-
where `f(qi, D)` is qi's term frequency (tf) in the document `D`, `|D|` is the length of the document `D` in words, and avgdl is the
|
185
|
-
average document length in the text collection from which documents are drawn. `k1` and `b` are free parameters, usually chosen, in absence of
|
186
|
-
an advanced optimization, as `k1 in |1.2,2.0|` and `b = 0.75`. `IDF(qi)` is the IDF (inverse document frequency) weight of the query term
|
187
|
-
`qi`. It is usually computed as:
|
188
|
-
|
189
|
-
![IDF Formula](formula2.svg)
|
190
|
-
|
191
|
-
where `N` is the total number of documents in the collection, and `n(q)` is the number of documents containing `qi`.
|
192
|
-
|
193
|
-
There are several interpretations for IDF and slight variations on its formula. In the original BM25 derivation,
|
194
|
-
the IDF component is derived from the Binary Independence Model.
|
195
|
-
|
196
|
-
The above formula for IDF has drawbacks for terms appearing in more than half of the corpus documents. These terms' IDF is negative,
|
197
|
-
so for any two almost-identical documents, one which contains the term may be ranked lower than one which does not. This is often an
|
198
|
-
undesirable behavior, so many applications adjust the IDF formula in various ways:
|
199
|
-
|
200
|
-
Each summand can be given a floor of 0, to trim out common terms;
|
201
|
-
The IDF function can be given a floor of a constant `e`, to avoid common terms being ignored at all;
|
202
|
-
The IDF function can be replaced with a similarly shaped one which is non-negative, or strictly positive to avoid terms being ignored at all.
|
203
|
-
|
204
198
|
## Usage
|
205
199
|
|
206
200
|
First we create an inverted Index
|
@@ -248,6 +242,92 @@ First we create an inverted Index
|
|
248
242
|
We can see results are sorted by score, notice that the document we index can have any other
|
249
243
|
fields like name, price and etc. But only `:id` and `:indexed_field` are required
|
250
244
|
|
245
|
+
## Language support (stop words, stemmers)
|
246
|
+
|
247
|
+
Creating an index using `MiniSearch.new_index` will give an inverted_index that does not
|
248
|
+
have any language support like stop_words and synonyms. We could pass them as arguments
|
249
|
+
in `new_index` like:
|
250
|
+
|
251
|
+
```
|
252
|
+
index = MiniSearch.new_index(
|
253
|
+
stop_words: stop_words,
|
254
|
+
stemmer: stemmer,
|
255
|
+
synonyms_map: synonyms_map
|
256
|
+
)
|
257
|
+
```
|
258
|
+
|
259
|
+
Arguments:
|
260
|
+
|
261
|
+
- The stop_words is an array of words that should be removed when indexing the document.
|
262
|
+
- The stemmer is an object of type Stemmer that implements a `stem` method, which removes all but the stem of the word (example: `carrocha` -> `carr`).
|
263
|
+
- The synonyms_map is a hashmap with original terms and a list of synonyms (example: `{'calçado' => ['sapato', 'tenis', 'salto', 'chinelo']}`)
|
264
|
+
|
265
|
+
## n-gram Tokenizer
|
266
|
+
|
267
|
+
By default, creating an index using `MiniSearch.new_index` will give an inverted_index that uses a simple whitespace tokenizer.
|
268
|
+
(e.g. `"Hello World" => ["Hello", "World"]`)
|
269
|
+
|
270
|
+
You can change this behavior to use an n-gram tokenizer which will break words down into smaller pieces with a configurable
|
271
|
+
token window. You can read more about how this kind of tokenization works for [Elastic Search](https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-ngram-tokenizer.html).
|
272
|
+
(e.g. `ngrams: 2` the phrase `"Hello World" => ["He", "el", "ll", "lo", "o ", " W", "Wo", "or", "rl", "ld"]`)
|
273
|
+
or
|
274
|
+
(e.g. `ngrams: 3` the phrase `"Hello World" => ["Hel", "ell", "llo", "lo ", "o W", " Wo", "Wor", "orl", "rld"]`)
|
275
|
+
|
276
|
+
To enable this simply pass an integer for the parameter `ngrams`.
|
277
|
+
|
278
|
+
```
|
279
|
+
index = MiniSearch.new_index(
|
280
|
+
ngrams: 2,
|
281
|
+
)
|
282
|
+
```
|
283
|
+
|
284
|
+
Arguments:
|
285
|
+
|
286
|
+
- ngrams: An integer which represents the number of characters each token should contain. Common values are `2` for bigrams or `3` for trigrams.
|
287
|
+
|
288
|
+
# Stemmers
|
289
|
+
|
290
|
+
Stemmers are classes that implement the `def stem(word)` method, which receives a word and returns the stem:
|
291
|
+
|
292
|
+
Example of a NaiveEnglishStemmer:
|
293
|
+
|
294
|
+
```
|
295
|
+
module MiniSearch
|
296
|
+
module Stemmer
|
297
|
+
class NaiveEnglishStemmer
|
298
|
+
def stem(word)
|
299
|
+
# removes plural
|
300
|
+
word[0..-2] if word.end_with?('s')
|
301
|
+
end
|
302
|
+
end
|
303
|
+
end
|
304
|
+
end
|
305
|
+
```
|
306
|
+
|
307
|
+
MiniSearch comes with a Brazilian Portuguese stemmer for now.
|
308
|
+
|
309
|
+
## Configuring multiple cores using yaml
|
310
|
+
|
311
|
+
You can configure multiple cores using a YAML config file.
|
312
|
+
|
313
|
+
```yaml
|
314
|
+
cores:
|
315
|
+
- main:
|
316
|
+
lang: 'pt'
|
317
|
+
synonyms_map:
|
318
|
+
bebe: 'nene'
|
319
|
+
stop_words:
|
320
|
+
- 'de'
|
321
|
+
- 'para'
|
322
|
+
- aux:
|
323
|
+
lang: 'pt'
|
324
|
+
synonyms_map:
|
325
|
+
bebe: 'nene'
|
326
|
+
stop_words:
|
327
|
+
- 'de'
|
328
|
+
- 'para'
|
329
|
+
```
|
330
|
+
|
251
331
|
## Development
|
252
332
|
|
253
333
|
After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
data/bin/console
CHANGED
@@ -2,6 +2,7 @@
|
|
2
2
|
|
3
3
|
require "bundler/setup"
|
4
4
|
require "mini_search"
|
5
|
+
require "ruby_ngrams" # We want to be able to use this when testing!
|
5
6
|
|
6
7
|
# You can add fixtures and/or initialization code here to make experimenting
|
7
8
|
# with your gem easier. You can also use a different console, if you like.
|
@@ -0,0 +1,15 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module MiniSearch
|
4
|
+
class CompoundTokenizer
|
5
|
+
def initialize(tokenizers)
|
6
|
+
@tokenizers = tokenizers
|
7
|
+
end
|
8
|
+
|
9
|
+
def execute(string)
|
10
|
+
@tokenizers.each_with_object([]) do |tokenizer, tokens|
|
11
|
+
tokens.concat(tokenizer.execute(string))
|
12
|
+
end
|
13
|
+
end
|
14
|
+
end
|
15
|
+
end
|
@@ -154,8 +154,8 @@ module MiniSearch
|
|
154
154
|
|
155
155
|
def generate_idfs(processed_terms)
|
156
156
|
processed_terms.each_with_object({}) do |term, idfs|
|
157
|
-
if @
|
158
|
-
idfs[term] = Idf.calculate(@
|
157
|
+
if @inverted_index[term].to_a.any?
|
158
|
+
idfs[term] = Idf.calculate(@inverted_index[term].size, @documents.size)
|
159
159
|
end
|
160
160
|
end
|
161
161
|
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
require "ruby_ngrams"
|
3
|
+
|
4
|
+
module MiniSearch
|
5
|
+
class NgramTokenizer
|
6
|
+
def initialize(n)
|
7
|
+
@n = n || 2
|
8
|
+
end
|
9
|
+
|
10
|
+
def execute(string)
|
11
|
+
# In the future, we may want to consider doing a strip on tokens to remove whitespace.
|
12
|
+
string.ngrams(regex: //, n: @n).map(&:join)
|
13
|
+
end
|
14
|
+
end
|
15
|
+
end
|
data/lib/mini_search/pipeline.rb
CHANGED
@@ -5,16 +5,27 @@ module MiniSearch
|
|
5
5
|
# do when indexing a document or searching
|
6
6
|
class Pipeline
|
7
7
|
def initialize(tokenizer, filters)
|
8
|
+
@standard_tokenizer = MiniSearch::StandardWhitespaceTokenizer.new
|
8
9
|
@tokenizer = tokenizer
|
9
10
|
@filters = filters
|
10
11
|
end
|
11
12
|
|
12
13
|
def execute(string)
|
13
|
-
tokens
|
14
|
+
# Since the filter model expects tokens that are tokenized by
|
15
|
+
# the standard tokenizer, let's use that first.
|
16
|
+
tokens = @standard_tokenizer.execute(string)
|
14
17
|
|
15
|
-
|
18
|
+
# Apply filters
|
19
|
+
filters_applied = @filters.reduce(tokens) do |filtered_tokens, filter|
|
16
20
|
filter.execute(filtered_tokens)
|
17
21
|
end
|
22
|
+
|
23
|
+
# Return if our selected tokenizer is the standard tokenizer
|
24
|
+
return filters_applied if @tokenizer.is_a? MiniSearch::StandardWhitespaceTokenizer
|
25
|
+
|
26
|
+
# Execute non-standard tokenization after rejoining the tokens
|
27
|
+
# that were tokenized with the StandardWhitespaceTokenizer
|
28
|
+
@tokenizer.execute(filters_applied.join(' '))
|
18
29
|
end
|
19
30
|
end
|
20
31
|
end
|
data/lib/mini_search/version.rb
CHANGED
data/lib/mini_search.rb
CHANGED
@@ -1,6 +1,9 @@
|
|
1
|
+
require 'yaml'
|
1
2
|
require 'mini_search/version.rb'
|
2
3
|
require 'mini_search/stemmer/portuguese.rb'
|
3
4
|
require 'mini_search/standard_whitespace_tokenizer.rb'
|
5
|
+
require 'mini_search/ngram_tokenizer.rb'
|
6
|
+
require 'mini_search/compound_tokenizer.rb'
|
4
7
|
require 'mini_search/strip_filter.rb'
|
5
8
|
require 'mini_search/remove_punctuation_filter.rb'
|
6
9
|
require 'mini_search/downcase_filter.rb'
|
@@ -23,8 +26,13 @@ module MiniSearch
|
|
23
26
|
MiniSearch::InvertedIndex.new(indexing_pipeline, querying_pipeline)
|
24
27
|
end
|
25
28
|
|
26
|
-
def self.new_index(stop_words: [], synonyms_map: {}, stemmer: nil)
|
27
|
-
|
29
|
+
def self.new_index(stop_words: [], synonyms_map: {}, stemmer: nil, ngrams: nil)
|
30
|
+
tokenizer =
|
31
|
+
if ngrams
|
32
|
+
NgramTokenizer.new(ngrams)
|
33
|
+
else
|
34
|
+
StandardWhitespaceTokenizer.new
|
35
|
+
end
|
28
36
|
|
29
37
|
strip_filter = StripFilter.new
|
30
38
|
remove_punctuation_filter = RemovePunctuationFilter.new
|
@@ -34,7 +42,7 @@ module MiniSearch
|
|
34
42
|
synonyms_filter = SynonymsFilter.new(synonyms_map)
|
35
43
|
|
36
44
|
indexing_pipeline = Pipeline.new(
|
37
|
-
|
45
|
+
tokenizer,
|
38
46
|
[
|
39
47
|
strip_filter,
|
40
48
|
remove_punctuation_filter,
|
@@ -45,7 +53,7 @@ module MiniSearch
|
|
45
53
|
)
|
46
54
|
|
47
55
|
querying_pipeline = Pipeline.new(
|
48
|
-
|
56
|
+
tokenizer,
|
49
57
|
[
|
50
58
|
strip_filter,
|
51
59
|
remove_punctuation_filter,
|
@@ -59,17 +67,40 @@ module MiniSearch
|
|
59
67
|
new(indexing_pipeline, querying_pipeline)
|
60
68
|
end
|
61
69
|
|
62
|
-
def self.new_localized_index(
|
63
|
-
|
64
|
-
language_support = LANGUAGE_SUPPORTS[language_support].new(stop_words)
|
65
|
-
end
|
66
|
-
|
67
|
-
raise 'language support not found or nil' unless language_support
|
70
|
+
def self.new_localized_index(lang, synonyms_map: {}, stop_words: [], ngrams: nil)
|
71
|
+
language_support = find_language_support(lang, stop_words)
|
68
72
|
|
69
73
|
new_index(
|
70
74
|
stop_words: language_support.stop_words,
|
71
75
|
stemmer: language_support.stemmer,
|
72
|
-
synonyms_map: synonyms_map
|
76
|
+
synonyms_map: synonyms_map,
|
77
|
+
ngrams: ngrams
|
73
78
|
)
|
74
79
|
end
|
80
|
+
|
81
|
+
def self.from_config_file(file)
|
82
|
+
raise "file not found '#{file}'" unless File.exists?(file)
|
83
|
+
|
84
|
+
cores = YAML.load_file(file)['cores']
|
85
|
+
|
86
|
+
cores.map do |core|
|
87
|
+
lang = core['lang'].to_sym
|
88
|
+
|
89
|
+
new_localized_index(
|
90
|
+
lang,
|
91
|
+
stop_words: core['stop_words'],
|
92
|
+
synonyms_map: core['synonyms_map'].transform_values { |v| v.split(',') }
|
93
|
+
)
|
94
|
+
end
|
95
|
+
end
|
96
|
+
|
97
|
+
private_class_method def self.find_language_support(lang, stop_words)
|
98
|
+
if lang.is_a?(Symbol)
|
99
|
+
language_support = LANGUAGE_SUPPORTS[lang].new(stop_words)
|
100
|
+
end
|
101
|
+
|
102
|
+
raise 'language support not found or nil' unless language_support
|
103
|
+
|
104
|
+
language_support
|
105
|
+
end
|
75
106
|
end
|
data/mini_search.gemspec
CHANGED
@@ -23,7 +23,8 @@ Gem::Specification.new do |spec|
|
|
23
23
|
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
24
24
|
spec.require_paths = ["lib"]
|
25
25
|
|
26
|
-
spec.add_development_dependency "bundler", "~>
|
27
|
-
spec.add_development_dependency "rake", "~>
|
26
|
+
spec.add_development_dependency "bundler", "~> 2.4.10"
|
27
|
+
spec.add_development_dependency "rake", "~> 12.0"
|
28
28
|
spec.add_development_dependency "rspec", "~> 3.0"
|
29
|
+
spec.add_runtime_dependency "ruby_ngrams", "~> 0.0.6"
|
29
30
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: mini_search
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0
|
4
|
+
version: 1.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrew S Aguiar
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2023-05-13 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -16,28 +16,28 @@ dependencies:
|
|
16
16
|
requirements:
|
17
17
|
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version:
|
19
|
+
version: 2.4.10
|
20
20
|
type: :development
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
24
|
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version:
|
26
|
+
version: 2.4.10
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
name: rake
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
31
|
- - "~>"
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version: '
|
33
|
+
version: '12.0'
|
34
34
|
type: :development
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
38
|
- - "~>"
|
39
39
|
- !ruby/object:Gem::Version
|
40
|
-
version: '
|
40
|
+
version: '12.0'
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
42
|
name: rspec
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
@@ -52,6 +52,20 @@ dependencies:
|
|
52
52
|
- - "~>"
|
53
53
|
- !ruby/object:Gem::Version
|
54
54
|
version: '3.0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: ruby_ngrams
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - "~>"
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: 0.0.6
|
62
|
+
type: :runtime
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - "~>"
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: 0.0.6
|
55
69
|
description: In-memory naive search engine.
|
56
70
|
email:
|
57
71
|
- andrewaguiar6@gmail.com
|
@@ -59,6 +73,7 @@ executables: []
|
|
59
73
|
extensions: []
|
60
74
|
extra_rdoc_files: []
|
61
75
|
files:
|
76
|
+
- ".devcontainer/devcontainer.json"
|
62
77
|
- ".gitignore"
|
63
78
|
- ".rspec"
|
64
79
|
- ".travis.yml"
|
@@ -74,11 +89,13 @@ files:
|
|
74
89
|
- formula2.svg
|
75
90
|
- lib/mini_search.rb
|
76
91
|
- lib/mini_search/bm_25.rb
|
92
|
+
- lib/mini_search/compound_tokenizer.rb
|
77
93
|
- lib/mini_search/downcase_filter.rb
|
78
94
|
- lib/mini_search/idf.rb
|
79
95
|
- lib/mini_search/inverted_index.rb
|
80
96
|
- lib/mini_search/inverted_index_spec.rb
|
81
97
|
- lib/mini_search/language_support/portuguese.rb
|
98
|
+
- lib/mini_search/ngram_tokenizer.rb
|
82
99
|
- lib/mini_search/pipeline.rb
|
83
100
|
- lib/mini_search/remove_punctuation_filter.rb
|
84
101
|
- lib/mini_search/standard_whitespace_tokenizer.rb
|
@@ -94,7 +111,7 @@ homepage: https://www.github.com/andrewaguiar/mini_search
|
|
94
111
|
licenses:
|
95
112
|
- MIT
|
96
113
|
metadata: {}
|
97
|
-
post_install_message:
|
114
|
+
post_install_message:
|
98
115
|
rdoc_options: []
|
99
116
|
require_paths:
|
100
117
|
- lib
|
@@ -109,8 +126,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
109
126
|
- !ruby/object:Gem::Version
|
110
127
|
version: '0'
|
111
128
|
requirements: []
|
112
|
-
rubygems_version: 3.
|
113
|
-
signing_key:
|
129
|
+
rubygems_version: 3.1.6
|
130
|
+
signing_key:
|
114
131
|
specification_version: 4
|
115
132
|
summary: In-memory naive search engine.
|
116
133
|
test_files: []
|