fuzzy_match 1.3.3 → 1.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +14 -2
- data/Gemfile +0 -15
- data/README.markdown +17 -10
- data/bin/fuzzy_match_checker +8 -3
- data/fuzzy_match.gemspec +15 -2
- data/lib/fuzzy_match.rb +22 -20
- data/lib/fuzzy_match/result.rb +5 -4
- data/lib/fuzzy_match/score/pure_ruby.rb +64 -67
- data/lib/fuzzy_match/similarity.rb +20 -18
- data/lib/fuzzy_match/version.rb +1 -1
- data/lib/fuzzy_match/wrapper.rb +30 -28
- data/test/helper.rb +5 -5
- data/test/test_cache.rb +6 -3
- data/test/test_fuzzy_match.rb +0 -5
- metadata +125 -14
- data/.document +0 -5
data/CHANGELOG
CHANGED
@@ -1,4 +1,16 @@
|
|
1
|
-
|
1
|
+
1.4.0 / 2012-09-07
|
2
|
+
|
3
|
+
* Breaking changes
|
4
|
+
|
5
|
+
* Option keys are no longer symbolized automatically - make sure you do that if there's any chance they'll be strings
|
6
|
+
* active_record_inline_schema is no longer a runtime dependency - add it to your Gemfile if you use FuzzyMatch::CachedResult
|
7
|
+
|
8
|
+
* Enhancements
|
9
|
+
|
10
|
+
* Tiny bit better #explain(needle)
|
11
|
+
* Remove dependency on ActiveSupport
|
12
|
+
|
13
|
+
1.3.3 / 2012-04-13
|
2
14
|
|
3
15
|
* Enhancements
|
4
16
|
|
@@ -7,7 +19,7 @@
|
|
7
19
|
* Test against CohortAnalysis, the replacement for CohortScope
|
8
20
|
* Fix some other random deprecations (like set_primary_key)
|
9
21
|
|
10
|
-
|
22
|
+
1.3.2 / 2012-02-24
|
11
23
|
|
12
24
|
* Enhancements
|
13
25
|
|
data/Gemfile
CHANGED
@@ -1,18 +1,3 @@
|
|
1
1
|
source :rubygems
|
2
2
|
|
3
3
|
gemspec
|
4
|
-
|
5
|
-
# bin dependencies
|
6
|
-
gem 'remote_table'
|
7
|
-
gem 'thor'
|
8
|
-
|
9
|
-
# development dependencies
|
10
|
-
gem 'minitest-reporters'
|
11
|
-
gem "minitest"
|
12
|
-
gem 'activerecord', '>=3'
|
13
|
-
gem 'mysql2'
|
14
|
-
gem 'cohort_analysis'
|
15
|
-
gem 'weighted_average'
|
16
|
-
gem 'rake'
|
17
|
-
gem 'yard'
|
18
|
-
gem 'amatch'
|
data/README.markdown
CHANGED
@@ -4,6 +4,21 @@ Find a needle in a haystack based on string similarity and regular expression ru
|
|
4
4
|
|
5
5
|
Replaces [`loose_tight_dictionary`](https://github.com/seamusabshere/loose_tight_dictionary) because that was a confusing name.
|
6
6
|
|
7
|
+
## Real-world usage
|
8
|
+
|
9
|
+
<p><a href="http://brighterplanet.com"><img src="https://s3.amazonaws.com/static.brighterplanet.com/assets/logos/flush-left/inline/green/rasterized/brighter_planet-160-transparent.png" alt="Brighter Planet logo"/></a></p>
|
10
|
+
|
11
|
+
We use `fuzzy_match` for [data science at Brighter Planet](http://brighterplanet.com/research) and in production at
|
12
|
+
|
13
|
+
* [Brighter Planet's impact estimate web service](http://impact.brighterplanet.com)
|
14
|
+
* [Brighter Planet's reference data web service](http://data.brighterplanet.com)
|
15
|
+
|
16
|
+
We often combine it with [`remote_table`](https://github.com/seamusabshere/remote_table) and [`errata`](https://github.com/seamusabshere/errata):
|
17
|
+
|
18
|
+
- download table with `remote_table`
|
19
|
+
- correct serious or repeated errors with `errata`
|
20
|
+
- `fuzzy_match` the rest
|
21
|
+
|
7
22
|
## Quickstart
|
8
23
|
|
9
24
|
>> require 'fuzzy_match'
|
@@ -114,18 +129,10 @@ In edge cases where Dice's finds that two strings are equally similar to a third
|
|
114
129
|
>> 'RITZ'.levenshtein_similar 'CATZ'
|
115
130
|
=> 0.5 # which properly shows that RATZ should win
|
116
131
|
|
117
|
-
## Production use
|
118
|
-
|
119
|
-
Over 2 years in [Brighter Planet's impact estimate API](http://impact.brighterplanet.com) and [reference data service](http://data.brighterplanet.com).
|
120
|
-
|
121
|
-
We often combine `fuzzy_match` with [`remote_table`](https://github.com/seamusabshere/remote_table) and [`errata`](https://github.com/seamusabshere/errata):
|
122
|
-
|
123
|
-
- download table with `remote_table`
|
124
|
-
- correct serious or repeated errors with `errata`
|
125
|
-
- `fuzzy_match` the rest
|
126
|
-
|
127
132
|
## Cached results
|
128
133
|
|
134
|
+
Make sure you add active\_record\_inline\_schema to your gemfile.
|
135
|
+
|
129
136
|
TODO write documentation. For now, please see how [we manually cache matches between aircraft and flight segments](https://github.com/brighterplanet/earth/blob/master/lib/earth/air/aircraft.rb).
|
130
137
|
|
131
138
|
## Glossary
|
data/bin/fuzzy_match_checker
CHANGED
@@ -1,10 +1,14 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
|
3
|
-
if File.exist?(File.join(Dir.pwd, 'fuzzy_match.
|
4
|
-
|
3
|
+
if File.exist?(File.join(Dir.pwd, 'lib', 'fuzzy_match.rb'))
|
4
|
+
$LOAD_PATH.unshift File.join(Dir.pwd, 'lib')
|
5
|
+
require File.join(Dir.pwd, 'lib', 'fuzzy_match')
|
6
|
+
else
|
7
|
+
require 'fuzzy_match'
|
5
8
|
end
|
9
|
+
require 'fuzzy_match/version'
|
6
10
|
|
7
|
-
|
11
|
+
# note: not included in gemfile but neither is bundler used here
|
8
12
|
require 'active_support/core_ext'
|
9
13
|
require 'remote_table'
|
10
14
|
require 'thor'
|
@@ -52,6 +56,7 @@ class FuzzyMatch
|
|
52
56
|
end
|
53
57
|
unless match == correct_match
|
54
58
|
puts "MISMATCH: #{needle.inspect} should match #{correct_match.inspect}"
|
59
|
+
puts fm.explain needle
|
55
60
|
exit 1
|
56
61
|
end
|
57
62
|
count += 1
|
data/fuzzy_match.gemspec
CHANGED
@@ -17,7 +17,20 @@ Gem::Specification.new do |s|
|
|
17
17
|
s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
|
18
18
|
s.require_paths = ["lib"]
|
19
19
|
|
20
|
-
s.add_runtime_dependency 'activesupport', '>=3'
|
21
20
|
s.add_runtime_dependency 'to_regexp', '>=0.0.3'
|
22
|
-
|
21
|
+
|
22
|
+
# needed if you use FuzzyMatch::CachedResult
|
23
|
+
s.add_development_dependency 'active_record_inline_schema', '>=0.4.0'
|
24
|
+
|
25
|
+
# development dependencies
|
26
|
+
s.add_development_dependency "minitest"
|
27
|
+
s.add_development_dependency 'activerecord', '>=3'
|
28
|
+
s.add_development_dependency 'mysql2'
|
29
|
+
s.add_development_dependency 'cohort_analysis'
|
30
|
+
s.add_development_dependency 'weighted_average'
|
31
|
+
s.add_development_dependency 'yard'
|
32
|
+
s.add_development_dependency 'amatch'
|
33
|
+
if RUBY_VERSION >= '1.9'
|
34
|
+
s.add_development_dependency 'minitest-reporters'
|
35
|
+
end
|
23
36
|
end
|
data/lib/fuzzy_match.rb
CHANGED
@@ -1,8 +1,3 @@
|
|
1
|
-
require 'active_support'
|
2
|
-
require 'active_support/version'
|
3
|
-
if ::ActiveSupport::VERSION::MAJOR >= 3
|
4
|
-
require 'active_support/core_ext'
|
5
|
-
end
|
6
1
|
require 'to_regexp'
|
7
2
|
|
8
3
|
require 'fuzzy_match/rule'
|
@@ -19,11 +14,11 @@ require 'fuzzy_match/score'
|
|
19
14
|
class FuzzyMatch
|
20
15
|
class << self
|
21
16
|
def engine
|
22
|
-
|
17
|
+
@engine
|
23
18
|
end
|
24
19
|
|
25
20
|
def engine=(alt_engine)
|
26
|
-
|
21
|
+
@engine = alt_engine
|
27
22
|
end
|
28
23
|
|
29
24
|
def score_class
|
@@ -47,6 +42,8 @@ class FuzzyMatch
|
|
47
42
|
:gather_last_result => false,
|
48
43
|
:find_all => false
|
49
44
|
}
|
45
|
+
|
46
|
+
self.engine = DEFAULT_ENGINE
|
50
47
|
|
51
48
|
attr_reader :haystack
|
52
49
|
attr_reader :groupings
|
@@ -71,7 +68,7 @@ class FuzzyMatch
|
|
71
68
|
# * :<tt>first_grouping_decides</tt> - force records into the first grouping they match, rather than choosing a grouping that will give them a higher score
|
72
69
|
# * :<tt>gather_last_result</tt> - enable <tt>last_result</tt>
|
73
70
|
def initialize(competitors, options_and_rules = {})
|
74
|
-
options_and_rules = options_and_rules.
|
71
|
+
options_and_rules = options_and_rules.dup
|
75
72
|
|
76
73
|
# rules
|
77
74
|
self.groupings = options_and_rules.delete(:groupings) || options_and_rules.delete(:blockings) || []
|
@@ -87,7 +84,7 @@ class FuzzyMatch
|
|
87
84
|
if deprecated = options_and_rules.delete(:must_match_blocking)
|
88
85
|
options_and_rules[:must_match_grouping] = deprecated
|
89
86
|
end
|
90
|
-
@default_options =
|
87
|
+
@default_options = DEFAULT_OPTIONS.merge(options_and_rules).freeze
|
91
88
|
|
92
89
|
# do this last
|
93
90
|
self.haystack = competitors
|
@@ -118,12 +115,12 @@ class FuzzyMatch
|
|
118
115
|
end
|
119
116
|
|
120
117
|
def find_all(needle, options = {})
|
121
|
-
options = options.
|
118
|
+
options = options.merge(:find_all => true)
|
122
119
|
find needle, options
|
123
120
|
end
|
124
121
|
|
125
122
|
def find(needle, options = {})
|
126
|
-
options =
|
123
|
+
options = default_options.merge options
|
127
124
|
|
128
125
|
gather_last_result = options[:gather_last_result]
|
129
126
|
is_find_all = options[:find_all]
|
@@ -193,7 +190,9 @@ EOS
|
|
193
190
|
if groupings.any?
|
194
191
|
joint = passed_word_requirement.select do |straw|
|
195
192
|
if first_grouping_decides
|
196
|
-
groupings.detect { |grouping| grouping.match? needle }
|
193
|
+
if first_grouping = groupings.detect { |grouping| grouping.match? needle }
|
194
|
+
first_grouping.join? needle, straw
|
195
|
+
end
|
197
196
|
else
|
198
197
|
groupings.any? { |grouping| grouping.join? needle, straw }
|
199
198
|
end
|
@@ -237,21 +236,21 @@ EOS
|
|
237
236
|
if gather_last_result
|
238
237
|
last_result.timeline << <<-EOS
|
239
238
|
Since there were identities, the competition was reduced to records that might be identical to the needle (in other words, are not certainly different)
|
240
|
-
\
|
241
|
-
\tPassed (first
|
242
|
-
\tFailed (first
|
239
|
+
\tIdentities (first 10 of #{identities.length}): #{identities[0,9].map(&:inspect).join(', ')}
|
240
|
+
\tPassed (first 10 of #{possibly_identical.length}): #{possibly_identical[0,9].map(&:render).map(&:inspect).join(', ')}
|
241
|
+
\tFailed (first 10 of #{(joint-possibly_identical).length}): #{(joint-possibly_identical)[0,9].map(&:render).map(&:inspect).join(', ')}
|
243
242
|
EOS
|
244
243
|
end
|
245
244
|
else
|
246
245
|
possibly_identical = joint.dup
|
247
246
|
end
|
248
|
-
|
247
|
+
|
249
248
|
similarities = possibly_identical.map { |straw| needle.similarity straw }.sort.reverse
|
250
249
|
|
251
250
|
if gather_last_result
|
252
|
-
|
251
|
+
last_result.timeline << <<-EOS
|
253
252
|
The competition was sorted in order of similarity to the needle.
|
254
|
-
\tSimilar (first
|
253
|
+
\tSimilar (first 10 of #{similarities.length}): #{similarities[0,9].map { |s| "#{s.wrapper2.render.inspect} (#{[s.best_score.dices_coefficient_similar, s.best_score.levenshtein_similar].map { |v| '%0.5f' % v }.join('/')})" }.join(', ')}
|
255
254
|
EOS
|
256
255
|
end
|
257
256
|
|
@@ -272,8 +271,11 @@ A winner was determined because the Dice's Coefficient similarity (#{best_simila
|
|
272
271
|
EOS
|
273
272
|
end
|
274
273
|
elsif gather_last_result
|
275
|
-
|
276
|
-
|
274
|
+
best_similarity_record = if best_similarity and best_similarity.wrapper2
|
275
|
+
best_similarity.wrapper2.record
|
276
|
+
end
|
277
|
+
last_result.timeline << <<-EOS
|
278
|
+
No winner assigned because the score of the best similarity (#{best_similarity_record.inspect}) was zero and it didn't match any words with the needle (#{needle.inspect}).
|
277
279
|
EOS
|
278
280
|
end
|
279
281
|
|
data/lib/fuzzy_match/result.rb
CHANGED
@@ -19,10 +19,6 @@ The haystack contained <%= haystack.length %> records like <%= haystack[0, 3].ma
|
|
19
19
|
<% end %>
|
20
20
|
ERB
|
21
21
|
|
22
|
-
def timeline
|
23
|
-
@timeline ||= []
|
24
|
-
end
|
25
|
-
|
26
22
|
attr_accessor :needle
|
27
23
|
attr_accessor :read
|
28
24
|
attr_accessor :haystack
|
@@ -33,6 +29,11 @@ ERB
|
|
33
29
|
attr_accessor :stop_words
|
34
30
|
attr_accessor :winner
|
35
31
|
attr_accessor :score
|
32
|
+
attr_reader :timeline
|
33
|
+
|
34
|
+
def initialize
|
35
|
+
@timeline = []
|
36
|
+
end
|
36
37
|
|
37
38
|
def explain
|
38
39
|
$stdout.puts ::ERB.new(EXPLANATION, 0, '%<').result(binding)
|
@@ -10,84 +10,81 @@ class FuzzyMatch
|
|
10
10
|
|
11
11
|
# http://stackoverflow.com/questions/653157/a-better-similarity-ranking-algorithm-for-variable-length-strings
|
12
12
|
def dices_coefficient_similar
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
pairs2.slice!(i)
|
40
|
-
break
|
13
|
+
@dices_coefficient_similar ||= begin
|
14
|
+
if str1 == str2
|
15
|
+
1.0
|
16
|
+
elsif str1.length == 1 and str2.length == 1
|
17
|
+
0.0
|
18
|
+
else
|
19
|
+
pairs1 = (0..str1.length-2).map do |i|
|
20
|
+
str1[i,2]
|
21
|
+
end.reject do |pair|
|
22
|
+
pair.include? SPACE
|
23
|
+
end
|
24
|
+
pairs2 = (0..str2.length-2).map do |i|
|
25
|
+
str2[i,2]
|
26
|
+
end.reject do |pair|
|
27
|
+
pair.include? SPACE
|
28
|
+
end
|
29
|
+
union = pairs1.size + pairs2.size
|
30
|
+
intersection = 0
|
31
|
+
pairs1.each do |p1|
|
32
|
+
0.upto(pairs2.size-1) do |i|
|
33
|
+
if p1 == pairs2[i]
|
34
|
+
intersection += 1
|
35
|
+
pairs2.slice!(i)
|
36
|
+
break
|
37
|
+
end
|
38
|
+
end
|
41
39
|
end
|
40
|
+
(2.0 * intersection) / union
|
42
41
|
end
|
43
42
|
end
|
44
|
-
@dices_coefficient_similar = (2.0 * intersection) / union
|
45
43
|
end
|
46
44
|
|
47
45
|
# extracted/adapted from the text gem version 1.0.2
|
48
46
|
# normalization added for utf-8 strings
|
49
47
|
# lib/text/levenshtein.rb
|
50
48
|
def levenshtein_similar
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
49
|
+
@levenshtein_similar ||= begin
|
50
|
+
if utf8?
|
51
|
+
unpack_rule = 'U*'
|
52
|
+
else
|
53
|
+
unpack_rule = 'C*'
|
54
|
+
end
|
55
|
+
s = str1.unpack(unpack_rule)
|
56
|
+
t = str2.unpack(unpack_rule)
|
57
|
+
n = s.length
|
58
|
+
m = t.length
|
59
|
+
|
60
|
+
if n == 0 or m == 0
|
61
|
+
0.0
|
62
|
+
else
|
63
|
+
d = (0..m).to_a
|
64
|
+
x = nil
|
65
|
+
(0...n).each do |i|
|
66
|
+
e = i+1
|
67
|
+
(0...m).each do |j|
|
68
|
+
cost = (s[i] == t[j]) ? 0 : 1
|
69
|
+
x = [
|
70
|
+
d[j+1] + 1, # insertion
|
71
|
+
e + 1, # deletion
|
72
|
+
d[j] + cost # substitution
|
73
|
+
].min
|
74
|
+
d[j] = e
|
75
|
+
e = x
|
76
|
+
end
|
77
|
+
d[m] = x
|
78
|
+
end
|
79
|
+
# normalization logic from https://github.com/flori/amatch/blob/master/ext/amatch_ext.c#L301
|
80
|
+
# if (b_len > a_len) {
|
81
|
+
# result = rb_float_new(1.0 - ((double) v[p][b_len]) / b_len);
|
82
|
+
# } else {
|
83
|
+
# result = rb_float_new(1.0 - ((double) v[p][b_len]) / a_len);
|
84
|
+
# }
|
85
|
+
1.0 - x.to_f / [n, m].max
|
81
86
|
end
|
82
|
-
d[m] = x
|
83
87
|
end
|
84
|
-
# normalization logic from https://github.com/flori/amatch/blob/master/ext/amatch_ext.c#L301
|
85
|
-
# if (b_len > a_len) {
|
86
|
-
# result = rb_float_new(1.0 - ((double) v[p][b_len]) / b_len);
|
87
|
-
# } else {
|
88
|
-
# result = rb_float_new(1.0 - ((double) v[p][b_len]) / a_len);
|
89
|
-
# }
|
90
|
-
@levenshtein_similar = 1.0 - x.to_f / [n, m].max
|
91
88
|
end
|
92
89
|
|
93
90
|
private
|
@@ -17,15 +17,21 @@ class FuzzyMatch
|
|
17
17
|
end
|
18
18
|
end
|
19
19
|
|
20
|
+
def best_score
|
21
|
+
@best_score ||= FuzzyMatch.score_class.new(best_wrapper1_variant, best_wrapper2_variant)
|
22
|
+
end
|
23
|
+
|
24
|
+
def inspect
|
25
|
+
%{#<FuzzyMatch::Similarity #{wrapper2.render.inspect}=>#{best_wrapper2_variant.inspect} versus #{wrapper1.render.inspect}=>#{best_wrapper1_variant.inspect} original_weight=#{"%0.5f" % original_weight} best_score=#{best_score.inspect}>}
|
26
|
+
end
|
27
|
+
|
20
28
|
# Weight things towards short original strings
|
21
29
|
def original_weight
|
22
30
|
@original_weight ||= (1.0 / (wrapper1.render.length * wrapper2.render.length))
|
23
31
|
end
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
end
|
28
|
-
|
32
|
+
|
33
|
+
private
|
34
|
+
|
29
35
|
def best_wrapper1_variant
|
30
36
|
best_variants[0]
|
31
37
|
end
|
@@ -35,19 +41,15 @@ class FuzzyMatch
|
|
35
41
|
end
|
36
42
|
|
37
43
|
def best_variants
|
38
|
-
@best_variants ||=
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
end
|
47
|
-
end
|
48
|
-
|
49
|
-
def inspect
|
50
|
-
%{#<FuzzyMatch::Similarity #{wrapper2.render.inspect}=>#{best_wrapper2_variant.inspect} versus #{wrapper1.render.inspect}=>#{best_wrapper1_variant.inspect} original_weight=#{"%0.5f" % original_weight} best_score=#{best_score.inspect}>}
|
44
|
+
@best_variants ||= begin
|
45
|
+
wrapper1.variants.product(wrapper2.variants).sort do |tuple1, tuple2|
|
46
|
+
wrapper1_variant1, wrapper2_variant1 = tuple1
|
47
|
+
wrapper1_variant2, wrapper2_variant2 = tuple2
|
48
|
+
score1 = FuzzyMatch.score_class.new wrapper1_variant1, wrapper2_variant1
|
49
|
+
score2 = FuzzyMatch.score_class.new wrapper1_variant2, wrapper2_variant2
|
50
|
+
score1 <=> score2
|
51
|
+
end.last
|
52
|
+
end
|
51
53
|
end
|
52
54
|
end
|
53
55
|
end
|
data/lib/fuzzy_match/version.rb
CHANGED
data/lib/fuzzy_match/wrapper.rb
CHANGED
@@ -1,6 +1,11 @@
|
|
1
1
|
class FuzzyMatch
|
2
2
|
# Wrappers are the tokens that are passed around when doing scoring and optimizing.
|
3
3
|
class Wrapper #:nodoc: all
|
4
|
+
# "Foo's" is one word
|
5
|
+
# "North-west" is just one word
|
6
|
+
# "Bolivia," is just Bolivia
|
7
|
+
WORD_BOUNDARY = %r{\W*(?:\s+|$)}
|
8
|
+
|
4
9
|
attr_reader :fuzzy_match
|
5
10
|
attr_reader :record
|
6
11
|
attr_reader :literal
|
@@ -21,36 +26,31 @@ class FuzzyMatch
|
|
21
26
|
end
|
22
27
|
|
23
28
|
def render
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
29
|
+
@render ||= begin
|
30
|
+
memo = case read
|
31
|
+
when ::Proc
|
32
|
+
read.call record
|
33
|
+
when ::Symbol
|
34
|
+
if record.respond_to?(read)
|
35
|
+
record.send read
|
36
|
+
else
|
37
|
+
record[read]
|
38
|
+
end
|
39
|
+
when ::NilClass
|
40
|
+
record
|
31
41
|
else
|
32
42
|
record[read]
|
43
|
+
end.to_s.dup
|
44
|
+
fuzzy_match.stop_words.each do |stop_word|
|
45
|
+
stop_word.apply! memo
|
33
46
|
end
|
34
|
-
|
35
|
-
|
36
|
-
else
|
37
|
-
record[read]
|
38
|
-
end.to_s.dup
|
39
|
-
fuzzy_match.stop_words.each do |stop_word|
|
40
|
-
stop_word.apply! str
|
47
|
+
memo.strip!
|
48
|
+
@render = memo.freeze
|
41
49
|
end
|
42
|
-
str.strip!
|
43
|
-
@render = str.freeze
|
44
|
-
@rendered = true
|
45
|
-
@render
|
46
50
|
end
|
47
51
|
|
48
52
|
alias :to_str :render
|
49
53
|
|
50
|
-
# "Foo's" is one word
|
51
|
-
# "North-west" is just one word
|
52
|
-
# "Bolivia," is just Bolivia
|
53
|
-
WORD_BOUNDARY = %r{\W*(?:\s+|$)}
|
54
54
|
def words
|
55
55
|
@words ||= render.downcase.split(WORD_BOUNDARY)
|
56
56
|
end
|
@@ -60,12 +60,14 @@ class FuzzyMatch
|
|
60
60
|
end
|
61
61
|
|
62
62
|
def variants
|
63
|
-
@variants ||=
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
63
|
+
@variants ||= begin
|
64
|
+
fuzzy_match.normalizers.inject([ render ]) do |memo, normalizer|
|
65
|
+
if normalizer.apply? render
|
66
|
+
memo << normalizer.apply(render)
|
67
|
+
end
|
68
|
+
memo
|
69
|
+
end.uniq
|
70
|
+
end
|
69
71
|
end
|
70
72
|
end
|
71
73
|
end
|
data/test/helper.rb
CHANGED
@@ -3,10 +3,10 @@ require 'bundler'
|
|
3
3
|
Bundler.setup
|
4
4
|
require 'minitest/spec'
|
5
5
|
require 'minitest/autorun'
|
6
|
-
require 'minitest/reporters'
|
7
|
-
MiniTest::Unit.runner = MiniTest::SuiteRunner.new
|
8
|
-
MiniTest::Unit.runner.reporters << MiniTest::Reporters::SpecReporter.new
|
9
6
|
|
10
|
-
|
11
|
-
|
7
|
+
if RUBY_VERSION >= '1.9'
|
8
|
+
require 'minitest/reporters'
|
9
|
+
MiniTest::Reporters.use! MiniTest::Reporters::SpecReporter.new
|
10
|
+
end
|
11
|
+
|
12
12
|
require 'fuzzy_match'
|
data/test/test_cache.rb
CHANGED
@@ -1,6 +1,5 @@
|
|
1
1
|
require 'helper'
|
2
2
|
|
3
|
-
require 'active_support/all'
|
4
3
|
require 'active_record'
|
5
4
|
require 'cohort_analysis'
|
6
5
|
require 'weighted_average'
|
@@ -25,6 +24,7 @@ require 'fuzzy_match/cached_result'
|
|
25
24
|
::FuzzyMatch::CachedResult.setup(true)
|
26
25
|
|
27
26
|
class Aircraft < ActiveRecord::Base
|
27
|
+
MUTEX = ::Mutex.new
|
28
28
|
self.primary_key = 'icao_code'
|
29
29
|
|
30
30
|
cache_fuzzy_match_with :flight_segments, :primary_key => :aircraft_description, :foreign_key => :aircraft_description
|
@@ -34,7 +34,9 @@ class Aircraft < ActiveRecord::Base
|
|
34
34
|
end
|
35
35
|
|
36
36
|
def self.fuzzy_match
|
37
|
-
@fuzzy_match
|
37
|
+
@fuzzy_match || MUTEX.synchronize do
|
38
|
+
@fuzzy_match||= FuzzyMatch.new(all, :read => ::Proc.new { |straw| straw.aircraft_description })
|
39
|
+
end
|
38
40
|
end
|
39
41
|
|
40
42
|
def self.create_table
|
@@ -115,7 +117,8 @@ describe FuzzyMatch::CachedResult do
|
|
115
117
|
|
116
118
|
it %{works with cohort_scope (albeit rather clumsily)} do
|
117
119
|
aircraft = Aircraft.find('B742')
|
118
|
-
FlightSegment.cohort({:aircraft_description => aircraft.flight_segments_foreign_keys}, :minimum_size => 2)
|
120
|
+
cohort = FlightSegment.cohort({:aircraft_description => aircraft.flight_segments_foreign_keys}, :minimum_size => 2)
|
121
|
+
FlightSegment.connection.select_value(cohort.project('COUNT(*)').to_sql).must_equal 2
|
119
122
|
# FlightSegment.cohort(:aircraft_description => aircraft.flight_segments_foreign_keys).must_equal []
|
120
123
|
end
|
121
124
|
|
data/test/test_fuzzy_match.rb
CHANGED
@@ -12,11 +12,6 @@ describe FuzzyMatch do
|
|
12
12
|
d.find('X').must_equal 'X'
|
13
13
|
d.find('A').must_be_nil
|
14
14
|
end
|
15
|
-
|
16
|
-
it %{does the right thing} do
|
17
|
-
d = FuzzyMatch.new [ 'Artyom Makarov', 'Karl' ], :must_match_at_least_one_word => true
|
18
|
-
puts d.explain('art')#.must_equal 'Artyom Makarov'
|
19
|
-
end
|
20
15
|
|
21
16
|
it %{not return any result if the maximum score is zero} do
|
22
17
|
FuzzyMatch.new(['a']).find('b').must_be_nil
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: fuzzy_match
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.
|
4
|
+
version: 1.4.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,18 +9,66 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-
|
12
|
+
date: 2012-09-07 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
|
-
name:
|
15
|
+
name: to_regexp
|
16
16
|
requirement: !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
20
20
|
- !ruby/object:Gem::Version
|
21
|
-
version:
|
21
|
+
version: 0.0.3
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ! '>='
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: 0.0.3
|
30
|
+
- !ruby/object:Gem::Dependency
|
31
|
+
name: active_record_inline_schema
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
33
|
+
none: false
|
34
|
+
requirements:
|
35
|
+
- - ! '>='
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: 0.4.0
|
38
|
+
type: :development
|
39
|
+
prerelease: false
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ! '>='
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: 0.4.0
|
46
|
+
- !ruby/object:Gem::Dependency
|
47
|
+
name: minitest
|
48
|
+
requirement: !ruby/object:Gem::Requirement
|
49
|
+
none: false
|
50
|
+
requirements:
|
51
|
+
- - ! '>='
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: '0'
|
54
|
+
type: :development
|
55
|
+
prerelease: false
|
56
|
+
version_requirements: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ! '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
- !ruby/object:Gem::Dependency
|
63
|
+
name: activerecord
|
64
|
+
requirement: !ruby/object:Gem::Requirement
|
65
|
+
none: false
|
66
|
+
requirements:
|
67
|
+
- - ! '>='
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
version: '3'
|
70
|
+
type: :development
|
71
|
+
prerelease: false
|
24
72
|
version_requirements: !ruby/object:Gem::Requirement
|
25
73
|
none: false
|
26
74
|
requirements:
|
@@ -28,37 +76,101 @@ dependencies:
|
|
28
76
|
- !ruby/object:Gem::Version
|
29
77
|
version: '3'
|
30
78
|
- !ruby/object:Gem::Dependency
|
31
|
-
name:
|
79
|
+
name: mysql2
|
32
80
|
requirement: !ruby/object:Gem::Requirement
|
33
81
|
none: false
|
34
82
|
requirements:
|
35
83
|
- - ! '>='
|
36
84
|
- !ruby/object:Gem::Version
|
37
|
-
version: 0
|
38
|
-
type: :
|
85
|
+
version: '0'
|
86
|
+
type: :development
|
39
87
|
prerelease: false
|
40
88
|
version_requirements: !ruby/object:Gem::Requirement
|
41
89
|
none: false
|
42
90
|
requirements:
|
43
91
|
- - ! '>='
|
44
92
|
- !ruby/object:Gem::Version
|
45
|
-
version: 0
|
93
|
+
version: '0'
|
46
94
|
- !ruby/object:Gem::Dependency
|
47
|
-
name:
|
95
|
+
name: cohort_analysis
|
48
96
|
requirement: !ruby/object:Gem::Requirement
|
49
97
|
none: false
|
50
98
|
requirements:
|
51
99
|
- - ! '>='
|
52
100
|
- !ruby/object:Gem::Version
|
53
|
-
version: 0
|
54
|
-
type: :
|
101
|
+
version: '0'
|
102
|
+
type: :development
|
55
103
|
prerelease: false
|
56
104
|
version_requirements: !ruby/object:Gem::Requirement
|
57
105
|
none: false
|
58
106
|
requirements:
|
59
107
|
- - ! '>='
|
60
108
|
- !ruby/object:Gem::Version
|
61
|
-
version: 0
|
109
|
+
version: '0'
|
110
|
+
- !ruby/object:Gem::Dependency
|
111
|
+
name: weighted_average
|
112
|
+
requirement: !ruby/object:Gem::Requirement
|
113
|
+
none: false
|
114
|
+
requirements:
|
115
|
+
- - ! '>='
|
116
|
+
- !ruby/object:Gem::Version
|
117
|
+
version: '0'
|
118
|
+
type: :development
|
119
|
+
prerelease: false
|
120
|
+
version_requirements: !ruby/object:Gem::Requirement
|
121
|
+
none: false
|
122
|
+
requirements:
|
123
|
+
- - ! '>='
|
124
|
+
- !ruby/object:Gem::Version
|
125
|
+
version: '0'
|
126
|
+
- !ruby/object:Gem::Dependency
|
127
|
+
name: yard
|
128
|
+
requirement: !ruby/object:Gem::Requirement
|
129
|
+
none: false
|
130
|
+
requirements:
|
131
|
+
- - ! '>='
|
132
|
+
- !ruby/object:Gem::Version
|
133
|
+
version: '0'
|
134
|
+
type: :development
|
135
|
+
prerelease: false
|
136
|
+
version_requirements: !ruby/object:Gem::Requirement
|
137
|
+
none: false
|
138
|
+
requirements:
|
139
|
+
- - ! '>='
|
140
|
+
- !ruby/object:Gem::Version
|
141
|
+
version: '0'
|
142
|
+
- !ruby/object:Gem::Dependency
|
143
|
+
name: amatch
|
144
|
+
requirement: !ruby/object:Gem::Requirement
|
145
|
+
none: false
|
146
|
+
requirements:
|
147
|
+
- - ! '>='
|
148
|
+
- !ruby/object:Gem::Version
|
149
|
+
version: '0'
|
150
|
+
type: :development
|
151
|
+
prerelease: false
|
152
|
+
version_requirements: !ruby/object:Gem::Requirement
|
153
|
+
none: false
|
154
|
+
requirements:
|
155
|
+
- - ! '>='
|
156
|
+
- !ruby/object:Gem::Version
|
157
|
+
version: '0'
|
158
|
+
- !ruby/object:Gem::Dependency
|
159
|
+
name: minitest-reporters
|
160
|
+
requirement: !ruby/object:Gem::Requirement
|
161
|
+
none: false
|
162
|
+
requirements:
|
163
|
+
- - ! '>='
|
164
|
+
- !ruby/object:Gem::Version
|
165
|
+
version: '0'
|
166
|
+
type: :development
|
167
|
+
prerelease: false
|
168
|
+
version_requirements: !ruby/object:Gem::Requirement
|
169
|
+
none: false
|
170
|
+
requirements:
|
171
|
+
- - ! '>='
|
172
|
+
- !ruby/object:Gem::Version
|
173
|
+
version: '0'
|
62
174
|
description: Find a needle in a haystack using string similarity and (optionally)
|
63
175
|
regexp rules. Replaces loose_tight_dictionary.
|
64
176
|
email:
|
@@ -68,7 +180,6 @@ executables:
|
|
68
180
|
extensions: []
|
69
181
|
extra_rdoc_files: []
|
70
182
|
files:
|
71
|
-
- .document
|
72
183
|
- .gitignore
|
73
184
|
- CHANGELOG
|
74
185
|
- Gemfile
|
@@ -139,7 +250,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
139
250
|
version: '0'
|
140
251
|
requirements: []
|
141
252
|
rubyforge_project: fuzzy_match
|
142
|
-
rubygems_version: 1.8.
|
253
|
+
rubygems_version: 1.8.24
|
143
254
|
signing_key:
|
144
255
|
specification_version: 3
|
145
256
|
summary: Find a needle in a haystack using string similarity and (optionally) regexp
|