fuzzy_tools 1.0.0 → 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +1 -0
- data/.travis.yml +9 -4
- data/Gemfile +2 -2
- data/README.md +5 -5
- data/lib/fuzzy_tools/tokenizers.rb +1 -1
- data/lib/fuzzy_tools/version.rb +1 -1
- data/lib/fuzzy_tools/weighted_document_tokens.rb +4 -3
- data/spec/enumerable_spec.rb +7 -7
- metadata +19 -33
checksums.yaml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
---
|
|
2
|
+
SHA256:
|
|
3
|
+
metadata.gz: 1df8f364e469daaf9512eea1f3dd3438670f14a016973122d18449a15c42363c
|
|
4
|
+
data.tar.gz: e91486a18601f3e7d77bd560b84206609e6a2292e225fe05015ea007a0120445
|
|
5
|
+
SHA512:
|
|
6
|
+
metadata.gz: 72d976441112c687c50317654e16c9c7a4a4b8343c48af5eeb1b0b56c66333bdf0120b3ef77816d761dfddb37f9baba3aaad6ea86122385492f8c2ac10ce7e64
|
|
7
|
+
data.tar.gz: 7d8ce7c72f932a2c7cdd41a82d5f7274f6decf6373c38f1c32c28d31687aff0bfa6c7e3e647a6765f74739e9515636601521328693343acea3e6e08c571dc0a2
|
data/.gitignore
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
pkg/*
|
data/.travis.yml
CHANGED
|
@@ -3,10 +3,15 @@ rvm:
|
|
|
3
3
|
- 1.8.7
|
|
4
4
|
- 1.9.2
|
|
5
5
|
- 1.9.3
|
|
6
|
+
- 2.0.0
|
|
7
|
+
- 2.1.0
|
|
8
|
+
- jruby-18mode
|
|
9
|
+
- jruby-19mode
|
|
6
10
|
- ruby-head
|
|
7
|
-
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
+
- rbx
|
|
12
|
+
matrix:
|
|
13
|
+
allow_failures:
|
|
14
|
+
- rvm: rbx
|
|
15
|
+
- rvm: ruby-head
|
|
11
16
|
# uncomment this line if your project needs to run something other than `rake`:
|
|
12
17
|
# script: bundle exec rspec spec
|
data/Gemfile
CHANGED
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
source "http://rubygems.org"
|
|
2
2
|
|
|
3
3
|
gem 'simple_stats'
|
|
4
|
-
gem 'nokogiri', :platforms => [:mri_18, :mri_19, :jruby
|
|
5
|
-
gem 'perftools.rb',
|
|
4
|
+
gem 'nokogiri', '~> 1.5.0', :platforms => [:mri_18, :mri_19, :jruby]
|
|
5
|
+
gem 'perftools.rb', :platforms => [:mri_18, :mri_19], :require => false
|
|
6
6
|
gem 'rake'
|
|
7
7
|
|
|
8
8
|
# Specify your gem's dependencies in fuzzy_tools.gemspec
|
data/README.md
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
# FuzzyTools [](http://travis-ci.org/brianhempel/fuzzy_tools)
|
|
1
|
+
# FuzzyTools [](http://travis-ci.org/brianhempel/fuzzy_tools) [](https://gemnasium.com/brianhempel/fuzzy_tools)
|
|
2
2
|
|
|
3
3
|
FuzzyTools is a toolset for fuzzy searches in Ruby. The default algorithm has been tuned for accuracy (and reasonable speed) on 23 different [test files](https://github.com/brianhempel/fuzzy_tools/tree/master/accuracy/test_data/query_tests) gathered from [many sources](https://github.com/brianhempel/fuzzy_tools/blob/master/accuracy/test_data/sources/SOURCES.txt).
|
|
4
4
|
|
|
@@ -120,7 +120,7 @@ FuzzyTools::TfIdfIndex.new(:source => books, :attribute => lambda { |book| book.
|
|
|
120
120
|
|
|
121
121
|
## Can it go faster?
|
|
122
122
|
|
|
123
|
-
If you need to do multiple searches on the same collection, grab a fuzzy index with `my_collection.fuzzy_index` and do finds on that. The `fuzzy_find` and `
|
|
123
|
+
If you need to do multiple searches on the same collection, grab a fuzzy index with `my_collection.fuzzy_index` and do finds on that. The `fuzzy_find`, `fuzzy_find_all`, and `fuzzy_find_all_with_scores` methods on Enumerable reindex every time they are called.
|
|
124
124
|
|
|
125
125
|
Here's a performance comparison:
|
|
126
126
|
|
|
@@ -151,7 +151,7 @@ If it's still too slow, [open an issue](https://github.com/brianhempel/fuzzy_too
|
|
|
151
151
|
|
|
152
152
|
## How does it work?
|
|
153
153
|
|
|
154
|
-
FuzzyTools downcases and then tokenizes each value using a [hybrid combination](https://github.com/brianhempel/fuzzy_tools/blob/master/lib/
|
|
154
|
+
FuzzyTools downcases and then tokenizes each value using a [hybrid combination](https://github.com/brianhempel/fuzzy_tools/blob/master/lib/fuzzy_tools/tokenizers.rb#L20-27) of words, [character bigrams](http://en.wikipedia.org/wiki/N-gram), [Soundex](http://en.wikipedia.org/wiki/Soundex), and words without vowels.
|
|
155
155
|
|
|
156
156
|
``` ruby
|
|
157
157
|
FuzzyTools::Tokenizers::HYBRID.call("Till We Have Faces")
|
|
@@ -195,7 +195,7 @@ Trust me, it works.
|
|
|
195
195
|
|
|
196
196
|
## Specifying your own tokenizer
|
|
197
197
|
|
|
198
|
-
If the default tokenizer isn't working for your data or you need more speed, you can try swapping out the tokenizers. You can use one of the various tokenizers
|
|
198
|
+
If the default tokenizer isn't working for your data or you need more speed, you can try swapping out the tokenizers. You can use one of the various tokenizers defined in [`FuzzyTools::Tokenizers`](https://github.com/brianhempel/fuzzy_tools/blob/master/lib/fuzzy_tools/tokenizers.rb), or you can write your own.
|
|
199
199
|
|
|
200
200
|
``` ruby
|
|
201
201
|
# a predefined tokenizer
|
|
@@ -233,4 +233,4 @@ The [SecondString](http://secondstring.sourceforge.net/) source code was a valua
|
|
|
233
233
|
|
|
234
234
|
## License
|
|
235
235
|
|
|
236
|
-
Authored by Brian Hempel. Public domain, no restrictions.
|
|
236
|
+
Authored by Brian Hempel. Public domain, no restrictions.
|
data/lib/fuzzy_tools/tokenizers.rb
CHANGED
|
@@ -21,7 +21,7 @@ module FuzzyTools
|
|
|
21
21
|
str = str.downcase
|
|
22
22
|
words = str.split
|
|
23
23
|
words.map { |word| FuzzyTools::Helpers.soundex(word) } +
|
|
24
|
-
FuzzyTools::Helpers.ngrams(str
|
|
24
|
+
FuzzyTools::Helpers.ngrams(str, 2) +
|
|
25
25
|
words.map { |word| word.gsub(/[aeiou]/, '') } +
|
|
26
26
|
words
|
|
27
27
|
end
|
data/lib/fuzzy_tools/version.rb
CHANGED
|
data/lib/fuzzy_tools/weighted_document_tokens.rb
CHANGED
|
@@ -41,13 +41,14 @@ module FuzzyTools
|
|
|
41
41
|
VALUE my_weights = argv[0];
|
|
42
42
|
VALUE my_tokens = argv[1];
|
|
43
43
|
VALUE other_weights = argv[2];
|
|
44
|
-
|
|
44
|
+
long i;
|
|
45
45
|
VALUE token;
|
|
46
46
|
VALUE my_weight;
|
|
47
47
|
VALUE other_weight;
|
|
48
|
+
long len = RARRAY_LEN(my_tokens);
|
|
48
49
|
|
|
49
|
-
for(i = 0; i <
|
|
50
|
-
token =
|
|
50
|
+
for (i = 0; i < len; i++) {
|
|
51
|
+
token = rb_ary_entry(my_tokens, i);
|
|
51
52
|
other_weight = rb_hash_aref(other_weights, token);
|
|
52
53
|
if (other_weight != Qnil) {
|
|
53
54
|
my_weight = rb_hash_aref(my_weights, token);
|
data/spec/enumerable_spec.rb
CHANGED
|
@@ -27,7 +27,7 @@ describe Enumerable do
|
|
|
27
27
|
before(:each) { @letter_count_tokenizer = lambda { |str| str.size.to_s } }
|
|
28
28
|
|
|
29
29
|
it "passes :tokenizer through to the index with simple query syntax" do
|
|
30
|
-
FuzzyTools::TfIdfIndex.should_receive(:new).with(:source => @books, :tokenizer => @letter_count_tokenizer)
|
|
30
|
+
FuzzyTools::TfIdfIndex.should_receive(:new).with({ :source => @books, :tokenizer => @letter_count_tokenizer })
|
|
31
31
|
begin
|
|
32
32
|
@books.fuzzy_find("the", :tokenizer => @letter_count_tokenizer)
|
|
33
33
|
rescue
|
|
@@ -35,7 +35,7 @@ describe Enumerable do
|
|
|
35
35
|
end
|
|
36
36
|
|
|
37
37
|
it "passes :tokenizer through to the index with :attribute => query syntax" do
|
|
38
|
-
FuzzyTools::TfIdfIndex.should_receive(:new).with(:source => @books, :tokenizer => @letter_count_tokenizer, :attribute => :title)
|
|
38
|
+
FuzzyTools::TfIdfIndex.should_receive(:new).with({ :source => @books, :tokenizer => @letter_count_tokenizer, :attribute => :title })
|
|
39
39
|
begin
|
|
40
40
|
@books.fuzzy_find(:title => "the", :tokenizer => @letter_count_tokenizer)
|
|
41
41
|
rescue
|
|
@@ -57,7 +57,7 @@ describe Enumerable do
|
|
|
57
57
|
before(:each) { @letter_count_tokenizer = lambda { |str| str.size.to_s } }
|
|
58
58
|
|
|
59
59
|
it "passes :tokenizer through to the index with simple query syntax" do
|
|
60
|
-
FuzzyTools::TfIdfIndex.should_receive(:new).with(:source => @books, :tokenizer => @letter_count_tokenizer)
|
|
60
|
+
FuzzyTools::TfIdfIndex.should_receive(:new).with({ :source => @books, :tokenizer => @letter_count_tokenizer })
|
|
61
61
|
begin
|
|
62
62
|
@books.fuzzy_find_all("the", :tokenizer => @letter_count_tokenizer)
|
|
63
63
|
rescue
|
|
@@ -65,7 +65,7 @@ describe Enumerable do
|
|
|
65
65
|
end
|
|
66
66
|
|
|
67
67
|
it "passes :tokenizer through to the index with :attribute => query syntax" do
|
|
68
|
-
FuzzyTools::TfIdfIndex.should_receive(:new).with(:source => @books, :tokenizer => @letter_count_tokenizer, :attribute => :title)
|
|
68
|
+
FuzzyTools::TfIdfIndex.should_receive(:new).with({ :source => @books, :tokenizer => @letter_count_tokenizer, :attribute => :title })
|
|
69
69
|
begin
|
|
70
70
|
@books.fuzzy_find_all(:title => "the", :tokenizer => @letter_count_tokenizer)
|
|
71
71
|
rescue
|
|
@@ -93,7 +93,7 @@ describe Enumerable do
|
|
|
93
93
|
before(:each) { @letter_count_tokenizer = lambda { |str| str.size.to_s } }
|
|
94
94
|
|
|
95
95
|
it "passes :tokenizer through to the index with simple query syntax" do
|
|
96
|
-
FuzzyTools::TfIdfIndex.should_receive(:new).with(:source => @books, :tokenizer => @letter_count_tokenizer)
|
|
96
|
+
FuzzyTools::TfIdfIndex.should_receive(:new).with({ :source => @books, :tokenizer => @letter_count_tokenizer })
|
|
97
97
|
begin
|
|
98
98
|
@books.fuzzy_find_all_with_scores("the", :tokenizer => @letter_count_tokenizer)
|
|
99
99
|
rescue
|
|
@@ -101,7 +101,7 @@ describe Enumerable do
|
|
|
101
101
|
end
|
|
102
102
|
|
|
103
103
|
it "passes :tokenizer through to the index with :attribute => query syntax" do
|
|
104
|
-
FuzzyTools::TfIdfIndex.should_receive(:new).with(:source => @books, :tokenizer => @letter_count_tokenizer, :attribute => :title)
|
|
104
|
+
FuzzyTools::TfIdfIndex.should_receive(:new).with({ :source => @books, :tokenizer => @letter_count_tokenizer, :attribute => :title })
|
|
105
105
|
begin
|
|
106
106
|
@books.fuzzy_find_all_with_scores(:title => "the", :tokenizer => @letter_count_tokenizer)
|
|
107
107
|
rescue
|
|
@@ -117,7 +117,7 @@ describe Enumerable do
|
|
|
117
117
|
|
|
118
118
|
it "passes options along to the index" do
|
|
119
119
|
letter_count_tokenizer = lambda { |str| str.size.to_s }
|
|
120
|
-
FuzzyTools::TfIdfIndex.should_receive(:new).with(:source => @books, :tokenizer => letter_count_tokenizer, :attribute => :title)
|
|
120
|
+
FuzzyTools::TfIdfIndex.should_receive(:new).with({ :source => @books, :tokenizer => letter_count_tokenizer, :attribute => :title })
|
|
121
121
|
@books.fuzzy_index(:attribute => :title, :tokenizer => letter_count_tokenizer)
|
|
122
122
|
end
|
|
123
123
|
end
|
metadata
CHANGED
|
@@ -1,62 +1,55 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: fuzzy_tools
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 1.0.0
|
|
5
|
-
prerelease:
|
|
4
|
+
version: 1.0.1
|
|
6
5
|
platform: ruby
|
|
7
6
|
authors:
|
|
8
7
|
- Brian Hempel
|
|
9
|
-
autorequire:
|
|
8
|
+
autorequire:
|
|
10
9
|
bindir: bin
|
|
11
10
|
cert_chain: []
|
|
12
|
-
date:
|
|
11
|
+
date: 2025-11-15 00:00:00.000000000 Z
|
|
13
12
|
dependencies:
|
|
14
13
|
- !ruby/object:Gem::Dependency
|
|
15
14
|
name: RubyInline
|
|
16
15
|
requirement: !ruby/object:Gem::Requirement
|
|
17
|
-
none: false
|
|
18
16
|
requirements:
|
|
19
|
-
- -
|
|
17
|
+
- - ">="
|
|
20
18
|
- !ruby/object:Gem::Version
|
|
21
19
|
version: '0'
|
|
22
20
|
type: :runtime
|
|
23
21
|
prerelease: false
|
|
24
22
|
version_requirements: !ruby/object:Gem::Requirement
|
|
25
|
-
none: false
|
|
26
23
|
requirements:
|
|
27
|
-
- -
|
|
24
|
+
- - ">="
|
|
28
25
|
- !ruby/object:Gem::Version
|
|
29
26
|
version: '0'
|
|
30
27
|
- !ruby/object:Gem::Dependency
|
|
31
28
|
name: bundler
|
|
32
29
|
requirement: !ruby/object:Gem::Requirement
|
|
33
|
-
none: false
|
|
34
30
|
requirements:
|
|
35
|
-
- -
|
|
31
|
+
- - ">="
|
|
36
32
|
- !ruby/object:Gem::Version
|
|
37
33
|
version: '0'
|
|
38
34
|
type: :development
|
|
39
35
|
prerelease: false
|
|
40
36
|
version_requirements: !ruby/object:Gem::Requirement
|
|
41
|
-
none: false
|
|
42
37
|
requirements:
|
|
43
|
-
- -
|
|
38
|
+
- - ">="
|
|
44
39
|
- !ruby/object:Gem::Version
|
|
45
40
|
version: '0'
|
|
46
41
|
- !ruby/object:Gem::Dependency
|
|
47
42
|
name: rspec
|
|
48
43
|
requirement: !ruby/object:Gem::Requirement
|
|
49
|
-
none: false
|
|
50
44
|
requirements:
|
|
51
|
-
- -
|
|
45
|
+
- - ">="
|
|
52
46
|
- !ruby/object:Gem::Version
|
|
53
47
|
version: '0'
|
|
54
48
|
type: :development
|
|
55
49
|
prerelease: false
|
|
56
50
|
version_requirements: !ruby/object:Gem::Requirement
|
|
57
|
-
none: false
|
|
58
51
|
requirements:
|
|
59
|
-
- -
|
|
52
|
+
- - ">="
|
|
60
53
|
- !ruby/object:Gem::Version
|
|
61
54
|
version: '0'
|
|
62
55
|
description: Easy, high quality fuzzy search in Ruby.
|
|
@@ -66,8 +59,9 @@ executables: []
|
|
|
66
59
|
extensions: []
|
|
67
60
|
extra_rdoc_files: []
|
|
68
61
|
files:
|
|
69
|
-
- .
|
|
70
|
-
- .
|
|
62
|
+
- ".gitignore"
|
|
63
|
+
- ".rspec"
|
|
64
|
+
- ".travis.yml"
|
|
71
65
|
- Gemfile
|
|
72
66
|
- README.md
|
|
73
67
|
- Rakefile
|
|
@@ -86,33 +80,25 @@ files:
|
|
|
86
80
|
- spec/tf_idf_index_spec.rb
|
|
87
81
|
homepage: https://github.com/brianhempel/fuzzy_tools
|
|
88
82
|
licenses: []
|
|
89
|
-
|
|
83
|
+
metadata: {}
|
|
84
|
+
post_install_message:
|
|
90
85
|
rdoc_options: []
|
|
91
86
|
require_paths:
|
|
92
87
|
- lib
|
|
93
88
|
required_ruby_version: !ruby/object:Gem::Requirement
|
|
94
|
-
none: false
|
|
95
89
|
requirements:
|
|
96
|
-
- -
|
|
90
|
+
- - ">="
|
|
97
91
|
- !ruby/object:Gem::Version
|
|
98
92
|
version: '0'
|
|
99
|
-
segments:
|
|
100
|
-
- 0
|
|
101
|
-
hash: -1099286336038854081
|
|
102
93
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
103
|
-
none: false
|
|
104
94
|
requirements:
|
|
105
|
-
- -
|
|
95
|
+
- - ">="
|
|
106
96
|
- !ruby/object:Gem::Version
|
|
107
97
|
version: '0'
|
|
108
|
-
segments:
|
|
109
|
-
- 0
|
|
110
|
-
hash: -1099286336038854081
|
|
111
98
|
requirements: []
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
specification_version: 3
|
|
99
|
+
rubygems_version: 3.4.10
|
|
100
|
+
signing_key:
|
|
101
|
+
specification_version: 4
|
|
116
102
|
summary: Easy, high quality fuzzy search in Ruby.
|
|
117
103
|
test_files:
|
|
118
104
|
- spec/enumerable_spec.rb
|