fuzzy_match 1.3.3 → 1.4.0
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +14 -2
- data/Gemfile +0 -15
- data/README.markdown +17 -10
- data/bin/fuzzy_match_checker +8 -3
- data/fuzzy_match.gemspec +15 -2
- data/lib/fuzzy_match.rb +22 -20
- data/lib/fuzzy_match/result.rb +5 -4
- data/lib/fuzzy_match/score/pure_ruby.rb +64 -67
- data/lib/fuzzy_match/similarity.rb +20 -18
- data/lib/fuzzy_match/version.rb +1 -1
- data/lib/fuzzy_match/wrapper.rb +30 -28
- data/test/helper.rb +5 -5
- data/test/test_cache.rb +6 -3
- data/test/test_fuzzy_match.rb +0 -5
- metadata +125 -14
- data/.document +0 -5
data/CHANGELOG
CHANGED
@@ -1,4 +1,16 @@
|
|
1
|
-
|
1
|
+
1.4.0 / 2012-09-07
|
2
|
+
|
3
|
+
* Breaking changes
|
4
|
+
|
5
|
+
* Option keys are no longer symbolized automatically - make sure you do that if there's any chance they'll be strings
|
6
|
+
* active_record_inline_schema is no longer a runtime dependency - add it to your Gemfile if you use FuzzyMatch::CachedResult
|
7
|
+
|
8
|
+
* Enhancements
|
9
|
+
|
10
|
+
* Tiny bit better #explain(needle)
|
11
|
+
* Remove dependency on ActiveSupport
|
12
|
+
|
13
|
+
1.3.3 / 2012-04-13
|
2
14
|
|
3
15
|
* Enhancements
|
4
16
|
|
@@ -7,7 +19,7 @@
|
|
7
19
|
* Test against CohortAnalysis, the replacement for CohortScope
|
8
20
|
* Fix some other random deprecations (like set_primary_key)
|
9
21
|
|
10
|
-
|
22
|
+
1.3.2 / 2012-02-24
|
11
23
|
|
12
24
|
* Enhancements
|
13
25
|
|
data/Gemfile
CHANGED
@@ -1,18 +1,3 @@
|
|
1
1
|
source :rubygems
|
2
2
|
|
3
3
|
gemspec
|
4
|
-
|
5
|
-
# bin dependencies
|
6
|
-
gem 'remote_table'
|
7
|
-
gem 'thor'
|
8
|
-
|
9
|
-
# development dependencies
|
10
|
-
gem 'minitest-reporters'
|
11
|
-
gem "minitest"
|
12
|
-
gem 'activerecord', '>=3'
|
13
|
-
gem 'mysql2'
|
14
|
-
gem 'cohort_analysis'
|
15
|
-
gem 'weighted_average'
|
16
|
-
gem 'rake'
|
17
|
-
gem 'yard'
|
18
|
-
gem 'amatch'
|
data/README.markdown
CHANGED
@@ -4,6 +4,21 @@ Find a needle in a haystack based on string similarity and regular expression ru
|
|
4
4
|
|
5
5
|
Replaces [`loose_tight_dictionary`](https://github.com/seamusabshere/loose_tight_dictionary) because that was a confusing name.
|
6
6
|
|
7
|
+
## Real-world usage
|
8
|
+
|
9
|
+
<p><a href="http://brighterplanet.com"><img src="https://s3.amazonaws.com/static.brighterplanet.com/assets/logos/flush-left/inline/green/rasterized/brighter_planet-160-transparent.png" alt="Brighter Planet logo"/></a></p>
|
10
|
+
|
11
|
+
We use `fuzzy_match` for [data science at Brighter Planet](http://brighterplanet.com/research) and in production at
|
12
|
+
|
13
|
+
* [Brighter Planet's impact estimate web service](http://impact.brighterplanet.com)
|
14
|
+
* [Brighter Planet's reference data web service](http://data.brighterplanet.com)
|
15
|
+
|
16
|
+
We often combine it with [`remote_table`](https://github.com/seamusabshere/remote_table) and [`errata`](https://github.com/seamusabshere/errata):
|
17
|
+
|
18
|
+
- download table with `remote_table`
|
19
|
+
- correct serious or repeated errors with `errata`
|
20
|
+
- `fuzzy_match` the rest
|
21
|
+
|
7
22
|
## Quickstart
|
8
23
|
|
9
24
|
>> require 'fuzzy_match'
|
@@ -114,18 +129,10 @@ In edge cases where Dice's finds that two strings are equally similar to a third
|
|
114
129
|
>> 'RITZ'.levenshtein_similar 'CATZ'
|
115
130
|
=> 0.5 # which properly shows that RATZ should win
|
116
131
|
|
117
|
-
## Production use
|
118
|
-
|
119
|
-
Over 2 years in [Brighter Planet's impact estimate API](http://impact.brighterplanet.com) and [reference data service](http://data.brighterplanet.com).
|
120
|
-
|
121
|
-
We often combine `fuzzy_match` with [`remote_table`](https://github.com/seamusabshere/remote_table) and [`errata`](https://github.com/seamusabshere/errata):
|
122
|
-
|
123
|
-
- download table with `remote_table`
|
124
|
-
- correct serious or repeated errors with `errata`
|
125
|
-
- `fuzzy_match` the rest
|
126
|
-
|
127
132
|
## Cached results
|
128
133
|
|
134
|
+
Make sure you add active\_record\_inline\_schema to your Gemfile.
|
135
|
+
|
129
136
|
TODO write documentation. For now, please see how [we manually cache matches between aircraft and flight segments](https://github.com/brighterplanet/earth/blob/master/lib/earth/air/aircraft.rb).
|
130
137
|
|
131
138
|
## Glossary
|
data/bin/fuzzy_match_checker
CHANGED
@@ -1,10 +1,14 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
|
3
|
-
if File.exist?(File.join(Dir.pwd, 'fuzzy_match.
|
4
|
-
|
3
|
+
if File.exist?(File.join(Dir.pwd, 'lib', 'fuzzy_match.rb'))
|
4
|
+
$LOAD_PATH.unshift File.join(Dir.pwd, 'lib')
|
5
|
+
require File.join(Dir.pwd, 'lib', 'fuzzy_match')
|
6
|
+
else
|
7
|
+
require 'fuzzy_match'
|
5
8
|
end
|
9
|
+
require 'fuzzy_match/version'
|
6
10
|
|
7
|
-
|
11
|
+
# note: not included in gemfile but neither is bundler used here
|
8
12
|
require 'active_support/core_ext'
|
9
13
|
require 'remote_table'
|
10
14
|
require 'thor'
|
@@ -52,6 +56,7 @@ class FuzzyMatch
|
|
52
56
|
end
|
53
57
|
unless match == correct_match
|
54
58
|
puts "MISMATCH: #{needle.inspect} should match #{correct_match.inspect}"
|
59
|
+
puts fm.explain needle
|
55
60
|
exit 1
|
56
61
|
end
|
57
62
|
count += 1
|
data/fuzzy_match.gemspec
CHANGED
@@ -17,7 +17,20 @@ Gem::Specification.new do |s|
|
|
17
17
|
s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
|
18
18
|
s.require_paths = ["lib"]
|
19
19
|
|
20
|
-
s.add_runtime_dependency 'activesupport', '>=3'
|
21
20
|
s.add_runtime_dependency 'to_regexp', '>=0.0.3'
|
22
|
-
|
21
|
+
|
22
|
+
# needed if you use FuzzyMatch::CachedResult
|
23
|
+
s.add_development_dependency 'active_record_inline_schema', '>=0.4.0'
|
24
|
+
|
25
|
+
# development dependencies
|
26
|
+
s.add_development_dependency "minitest"
|
27
|
+
s.add_development_dependency 'activerecord', '>=3'
|
28
|
+
s.add_development_dependency 'mysql2'
|
29
|
+
s.add_development_dependency 'cohort_analysis'
|
30
|
+
s.add_development_dependency 'weighted_average'
|
31
|
+
s.add_development_dependency 'yard'
|
32
|
+
s.add_development_dependency 'amatch'
|
33
|
+
if RUBY_VERSION >= '1.9'
|
34
|
+
s.add_development_dependency 'minitest-reporters'
|
35
|
+
end
|
23
36
|
end
|
data/lib/fuzzy_match.rb
CHANGED
@@ -1,8 +1,3 @@
|
|
1
|
-
require 'active_support'
|
2
|
-
require 'active_support/version'
|
3
|
-
if ::ActiveSupport::VERSION::MAJOR >= 3
|
4
|
-
require 'active_support/core_ext'
|
5
|
-
end
|
6
1
|
require 'to_regexp'
|
7
2
|
|
8
3
|
require 'fuzzy_match/rule'
|
@@ -19,11 +14,11 @@ require 'fuzzy_match/score'
|
|
19
14
|
class FuzzyMatch
|
20
15
|
class << self
|
21
16
|
def engine
|
22
|
-
|
17
|
+
@engine
|
23
18
|
end
|
24
19
|
|
25
20
|
def engine=(alt_engine)
|
26
|
-
|
21
|
+
@engine = alt_engine
|
27
22
|
end
|
28
23
|
|
29
24
|
def score_class
|
@@ -47,6 +42,8 @@ class FuzzyMatch
|
|
47
42
|
:gather_last_result => false,
|
48
43
|
:find_all => false
|
49
44
|
}
|
45
|
+
|
46
|
+
self.engine = DEFAULT_ENGINE
|
50
47
|
|
51
48
|
attr_reader :haystack
|
52
49
|
attr_reader :groupings
|
@@ -71,7 +68,7 @@ class FuzzyMatch
|
|
71
68
|
# * :<tt>first_grouping_decides</tt> - force records into the first grouping they match, rather than choosing a grouping that will give them a higher score
|
72
69
|
# * :<tt>gather_last_result</tt> - enable <tt>last_result</tt>
|
73
70
|
def initialize(competitors, options_and_rules = {})
|
74
|
-
options_and_rules = options_and_rules.
|
71
|
+
options_and_rules = options_and_rules.dup
|
75
72
|
|
76
73
|
# rules
|
77
74
|
self.groupings = options_and_rules.delete(:groupings) || options_and_rules.delete(:blockings) || []
|
@@ -87,7 +84,7 @@ class FuzzyMatch
|
|
87
84
|
if deprecated = options_and_rules.delete(:must_match_blocking)
|
88
85
|
options_and_rules[:must_match_grouping] = deprecated
|
89
86
|
end
|
90
|
-
@default_options =
|
87
|
+
@default_options = DEFAULT_OPTIONS.merge(options_and_rules).freeze
|
91
88
|
|
92
89
|
# do this last
|
93
90
|
self.haystack = competitors
|
@@ -118,12 +115,12 @@ class FuzzyMatch
|
|
118
115
|
end
|
119
116
|
|
120
117
|
def find_all(needle, options = {})
|
121
|
-
options = options.
|
118
|
+
options = options.merge(:find_all => true)
|
122
119
|
find needle, options
|
123
120
|
end
|
124
121
|
|
125
122
|
def find(needle, options = {})
|
126
|
-
options =
|
123
|
+
options = default_options.merge options
|
127
124
|
|
128
125
|
gather_last_result = options[:gather_last_result]
|
129
126
|
is_find_all = options[:find_all]
|
@@ -193,7 +190,9 @@ EOS
|
|
193
190
|
if groupings.any?
|
194
191
|
joint = passed_word_requirement.select do |straw|
|
195
192
|
if first_grouping_decides
|
196
|
-
groupings.detect { |grouping| grouping.match? needle }
|
193
|
+
if first_grouping = groupings.detect { |grouping| grouping.match? needle }
|
194
|
+
first_grouping.join? needle, straw
|
195
|
+
end
|
197
196
|
else
|
198
197
|
groupings.any? { |grouping| grouping.join? needle, straw }
|
199
198
|
end
|
@@ -237,21 +236,21 @@ EOS
|
|
237
236
|
if gather_last_result
|
238
237
|
last_result.timeline << <<-EOS
|
239
238
|
Since there were identities, the competition was reduced to records that might be identical to the needle (in other words, are not certainly different)
|
240
|
-
\
|
241
|
-
\tPassed (first
|
242
|
-
\tFailed (first
|
239
|
+
\tIdentities (first 10 of #{identities.length}): #{identities[0,9].map(&:inspect).join(', ')}
|
240
|
+
\tPassed (first 10 of #{possibly_identical.length}): #{possibly_identical[0,9].map(&:render).map(&:inspect).join(', ')}
|
241
|
+
\tFailed (first 10 of #{(joint-possibly_identical).length}): #{(joint-possibly_identical)[0,9].map(&:render).map(&:inspect).join(', ')}
|
243
242
|
EOS
|
244
243
|
end
|
245
244
|
else
|
246
245
|
possibly_identical = joint.dup
|
247
246
|
end
|
248
|
-
|
247
|
+
|
249
248
|
similarities = possibly_identical.map { |straw| needle.similarity straw }.sort.reverse
|
250
249
|
|
251
250
|
if gather_last_result
|
252
|
-
|
251
|
+
last_result.timeline << <<-EOS
|
253
252
|
The competition was sorted in order of similarity to the needle.
|
254
|
-
\tSimilar (first
|
253
|
+
\tSimilar (first 10 of #{similarities.length}): #{similarities[0,9].map { |s| "#{s.wrapper2.render.inspect} (#{[s.best_score.dices_coefficient_similar, s.best_score.levenshtein_similar].map { |v| '%0.5f' % v }.join('/')})" }.join(', ')}
|
255
254
|
EOS
|
256
255
|
end
|
257
256
|
|
@@ -272,8 +271,11 @@ A winner was determined because the Dice's Coefficient similarity (#{best_simila
|
|
272
271
|
EOS
|
273
272
|
end
|
274
273
|
elsif gather_last_result
|
275
|
-
|
276
|
-
|
274
|
+
best_similarity_record = if best_similarity and best_similarity.wrapper2
|
275
|
+
best_similarity.wrapper2.record
|
276
|
+
end
|
277
|
+
last_result.timeline << <<-EOS
|
278
|
+
No winner assigned because the score of the best similarity (#{best_similarity_record.inspect}) was zero and it didn't match any words with the needle (#{needle.inspect}).
|
277
279
|
EOS
|
278
280
|
end
|
279
281
|
|
data/lib/fuzzy_match/result.rb
CHANGED
@@ -19,10 +19,6 @@ The haystack contained <%= haystack.length %> records like <%= haystack[0, 3].ma
|
|
19
19
|
<% end %>
|
20
20
|
ERB
|
21
21
|
|
22
|
-
def timeline
|
23
|
-
@timeline ||= []
|
24
|
-
end
|
25
|
-
|
26
22
|
attr_accessor :needle
|
27
23
|
attr_accessor :read
|
28
24
|
attr_accessor :haystack
|
@@ -33,6 +29,11 @@ ERB
|
|
33
29
|
attr_accessor :stop_words
|
34
30
|
attr_accessor :winner
|
35
31
|
attr_accessor :score
|
32
|
+
attr_reader :timeline
|
33
|
+
|
34
|
+
def initialize
|
35
|
+
@timeline = []
|
36
|
+
end
|
36
37
|
|
37
38
|
def explain
|
38
39
|
$stdout.puts ::ERB.new(EXPLANATION, 0, '%<').result(binding)
|
@@ -10,84 +10,81 @@ class FuzzyMatch
|
|
10
10
|
|
11
11
|
# http://stackoverflow.com/questions/653157/a-better-similarity-ranking-algorithm-for-variable-length-strings
|
12
12
|
def dices_coefficient_similar
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
pairs2.slice!(i)
|
40
|
-
break
|
13
|
+
@dices_coefficient_similar ||= begin
|
14
|
+
if str1 == str2
|
15
|
+
1.0
|
16
|
+
elsif str1.length == 1 and str2.length == 1
|
17
|
+
0.0
|
18
|
+
else
|
19
|
+
pairs1 = (0..str1.length-2).map do |i|
|
20
|
+
str1[i,2]
|
21
|
+
end.reject do |pair|
|
22
|
+
pair.include? SPACE
|
23
|
+
end
|
24
|
+
pairs2 = (0..str2.length-2).map do |i|
|
25
|
+
str2[i,2]
|
26
|
+
end.reject do |pair|
|
27
|
+
pair.include? SPACE
|
28
|
+
end
|
29
|
+
union = pairs1.size + pairs2.size
|
30
|
+
intersection = 0
|
31
|
+
pairs1.each do |p1|
|
32
|
+
0.upto(pairs2.size-1) do |i|
|
33
|
+
if p1 == pairs2[i]
|
34
|
+
intersection += 1
|
35
|
+
pairs2.slice!(i)
|
36
|
+
break
|
37
|
+
end
|
38
|
+
end
|
41
39
|
end
|
40
|
+
(2.0 * intersection) / union
|
42
41
|
end
|
43
42
|
end
|
44
|
-
@dices_coefficient_similar = (2.0 * intersection) / union
|
45
43
|
end
|
46
44
|
|
47
45
|
# extracted/adapted from the text gem version 1.0.2
|
48
46
|
# normalization added for utf-8 strings
|
49
47
|
# lib/text/levenshtein.rb
|
50
48
|
def levenshtein_similar
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
49
|
+
@levenshtein_similar ||= begin
|
50
|
+
if utf8?
|
51
|
+
unpack_rule = 'U*'
|
52
|
+
else
|
53
|
+
unpack_rule = 'C*'
|
54
|
+
end
|
55
|
+
s = str1.unpack(unpack_rule)
|
56
|
+
t = str2.unpack(unpack_rule)
|
57
|
+
n = s.length
|
58
|
+
m = t.length
|
59
|
+
|
60
|
+
if n == 0 or m == 0
|
61
|
+
0.0
|
62
|
+
else
|
63
|
+
d = (0..m).to_a
|
64
|
+
x = nil
|
65
|
+
(0...n).each do |i|
|
66
|
+
e = i+1
|
67
|
+
(0...m).each do |j|
|
68
|
+
cost = (s[i] == t[j]) ? 0 : 1
|
69
|
+
x = [
|
70
|
+
d[j+1] + 1, # insertion
|
71
|
+
e + 1, # deletion
|
72
|
+
d[j] + cost # substitution
|
73
|
+
].min
|
74
|
+
d[j] = e
|
75
|
+
e = x
|
76
|
+
end
|
77
|
+
d[m] = x
|
78
|
+
end
|
79
|
+
# normalization logic from https://github.com/flori/amatch/blob/master/ext/amatch_ext.c#L301
|
80
|
+
# if (b_len > a_len) {
|
81
|
+
# result = rb_float_new(1.0 - ((double) v[p][b_len]) / b_len);
|
82
|
+
# } else {
|
83
|
+
# result = rb_float_new(1.0 - ((double) v[p][b_len]) / a_len);
|
84
|
+
# }
|
85
|
+
1.0 - x.to_f / [n, m].max
|
81
86
|
end
|
82
|
-
d[m] = x
|
83
87
|
end
|
84
|
-
# normalization logic from https://github.com/flori/amatch/blob/master/ext/amatch_ext.c#L301
|
85
|
-
# if (b_len > a_len) {
|
86
|
-
# result = rb_float_new(1.0 - ((double) v[p][b_len]) / b_len);
|
87
|
-
# } else {
|
88
|
-
# result = rb_float_new(1.0 - ((double) v[p][b_len]) / a_len);
|
89
|
-
# }
|
90
|
-
@levenshtein_similar = 1.0 - x.to_f / [n, m].max
|
91
88
|
end
|
92
89
|
|
93
90
|
private
|
@@ -17,15 +17,21 @@ class FuzzyMatch
|
|
17
17
|
end
|
18
18
|
end
|
19
19
|
|
20
|
+
def best_score
|
21
|
+
@best_score ||= FuzzyMatch.score_class.new(best_wrapper1_variant, best_wrapper2_variant)
|
22
|
+
end
|
23
|
+
|
24
|
+
def inspect
|
25
|
+
%{#<FuzzyMatch::Similarity #{wrapper2.render.inspect}=>#{best_wrapper2_variant.inspect} versus #{wrapper1.render.inspect}=>#{best_wrapper1_variant.inspect} original_weight=#{"%0.5f" % original_weight} best_score=#{best_score.inspect}>}
|
26
|
+
end
|
27
|
+
|
20
28
|
# Weight things towards short original strings
|
21
29
|
def original_weight
|
22
30
|
@original_weight ||= (1.0 / (wrapper1.render.length * wrapper2.render.length))
|
23
31
|
end
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
end
|
28
|
-
|
32
|
+
|
33
|
+
private
|
34
|
+
|
29
35
|
def best_wrapper1_variant
|
30
36
|
best_variants[0]
|
31
37
|
end
|
@@ -35,19 +41,15 @@ class FuzzyMatch
|
|
35
41
|
end
|
36
42
|
|
37
43
|
def best_variants
|
38
|
-
@best_variants ||=
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
end
|
47
|
-
end
|
48
|
-
|
49
|
-
def inspect
|
50
|
-
%{#<FuzzyMatch::Similarity #{wrapper2.render.inspect}=>#{best_wrapper2_variant.inspect} versus #{wrapper1.render.inspect}=>#{best_wrapper1_variant.inspect} original_weight=#{"%0.5f" % original_weight} best_score=#{best_score.inspect}>}
|
44
|
+
@best_variants ||= begin
|
45
|
+
wrapper1.variants.product(wrapper2.variants).sort do |tuple1, tuple2|
|
46
|
+
wrapper1_variant1, wrapper2_variant1 = tuple1
|
47
|
+
wrapper1_variant2, wrapper2_variant2 = tuple2
|
48
|
+
score1 = FuzzyMatch.score_class.new wrapper1_variant1, wrapper2_variant1
|
49
|
+
score2 = FuzzyMatch.score_class.new wrapper1_variant2, wrapper2_variant2
|
50
|
+
score1 <=> score2
|
51
|
+
end.last
|
52
|
+
end
|
51
53
|
end
|
52
54
|
end
|
53
55
|
end
|
data/lib/fuzzy_match/version.rb
CHANGED
data/lib/fuzzy_match/wrapper.rb
CHANGED
@@ -1,6 +1,11 @@
|
|
1
1
|
class FuzzyMatch
|
2
2
|
# Wrappers are the tokens that are passed around when doing scoring and optimizing.
|
3
3
|
class Wrapper #:nodoc: all
|
4
|
+
# "Foo's" is one word
|
5
|
+
# "North-west" is just one word
|
6
|
+
# "Bolivia," is just Bolivia
|
7
|
+
WORD_BOUNDARY = %r{\W*(?:\s+|$)}
|
8
|
+
|
4
9
|
attr_reader :fuzzy_match
|
5
10
|
attr_reader :record
|
6
11
|
attr_reader :literal
|
@@ -21,36 +26,31 @@ class FuzzyMatch
|
|
21
26
|
end
|
22
27
|
|
23
28
|
def render
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
29
|
+
@render ||= begin
|
30
|
+
memo = case read
|
31
|
+
when ::Proc
|
32
|
+
read.call record
|
33
|
+
when ::Symbol
|
34
|
+
if record.respond_to?(read)
|
35
|
+
record.send read
|
36
|
+
else
|
37
|
+
record[read]
|
38
|
+
end
|
39
|
+
when ::NilClass
|
40
|
+
record
|
31
41
|
else
|
32
42
|
record[read]
|
43
|
+
end.to_s.dup
|
44
|
+
fuzzy_match.stop_words.each do |stop_word|
|
45
|
+
stop_word.apply! memo
|
33
46
|
end
|
34
|
-
|
35
|
-
|
36
|
-
else
|
37
|
-
record[read]
|
38
|
-
end.to_s.dup
|
39
|
-
fuzzy_match.stop_words.each do |stop_word|
|
40
|
-
stop_word.apply! str
|
47
|
+
memo.strip!
|
48
|
+
@render = memo.freeze
|
41
49
|
end
|
42
|
-
str.strip!
|
43
|
-
@render = str.freeze
|
44
|
-
@rendered = true
|
45
|
-
@render
|
46
50
|
end
|
47
51
|
|
48
52
|
alias :to_str :render
|
49
53
|
|
50
|
-
# "Foo's" is one word
|
51
|
-
# "North-west" is just one word
|
52
|
-
# "Bolivia," is just Bolivia
|
53
|
-
WORD_BOUNDARY = %r{\W*(?:\s+|$)}
|
54
54
|
def words
|
55
55
|
@words ||= render.downcase.split(WORD_BOUNDARY)
|
56
56
|
end
|
@@ -60,12 +60,14 @@ class FuzzyMatch
|
|
60
60
|
end
|
61
61
|
|
62
62
|
def variants
|
63
|
-
@variants ||=
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
63
|
+
@variants ||= begin
|
64
|
+
fuzzy_match.normalizers.inject([ render ]) do |memo, normalizer|
|
65
|
+
if normalizer.apply? render
|
66
|
+
memo << normalizer.apply(render)
|
67
|
+
end
|
68
|
+
memo
|
69
|
+
end.uniq
|
70
|
+
end
|
69
71
|
end
|
70
72
|
end
|
71
73
|
end
|
data/test/helper.rb
CHANGED
@@ -3,10 +3,10 @@ require 'bundler'
|
|
3
3
|
Bundler.setup
|
4
4
|
require 'minitest/spec'
|
5
5
|
require 'minitest/autorun'
|
6
|
-
require 'minitest/reporters'
|
7
|
-
MiniTest::Unit.runner = MiniTest::SuiteRunner.new
|
8
|
-
MiniTest::Unit.runner.reporters << MiniTest::Reporters::SpecReporter.new
|
9
6
|
|
10
|
-
|
11
|
-
|
7
|
+
if RUBY_VERSION >= '1.9'
|
8
|
+
require 'minitest/reporters'
|
9
|
+
MiniTest::Reporters.use! MiniTest::Reporters::SpecReporter.new
|
10
|
+
end
|
11
|
+
|
12
12
|
require 'fuzzy_match'
|
data/test/test_cache.rb
CHANGED
@@ -1,6 +1,5 @@
|
|
1
1
|
require 'helper'
|
2
2
|
|
3
|
-
require 'active_support/all'
|
4
3
|
require 'active_record'
|
5
4
|
require 'cohort_analysis'
|
6
5
|
require 'weighted_average'
|
@@ -25,6 +24,7 @@ require 'fuzzy_match/cached_result'
|
|
25
24
|
::FuzzyMatch::CachedResult.setup(true)
|
26
25
|
|
27
26
|
class Aircraft < ActiveRecord::Base
|
27
|
+
MUTEX = ::Mutex.new
|
28
28
|
self.primary_key = 'icao_code'
|
29
29
|
|
30
30
|
cache_fuzzy_match_with :flight_segments, :primary_key => :aircraft_description, :foreign_key => :aircraft_description
|
@@ -34,7 +34,9 @@ class Aircraft < ActiveRecord::Base
|
|
34
34
|
end
|
35
35
|
|
36
36
|
def self.fuzzy_match
|
37
|
-
@fuzzy_match
|
37
|
+
@fuzzy_match || MUTEX.synchronize do
|
38
|
+
@fuzzy_match||= FuzzyMatch.new(all, :read => ::Proc.new { |straw| straw.aircraft_description })
|
39
|
+
end
|
38
40
|
end
|
39
41
|
|
40
42
|
def self.create_table
|
@@ -115,7 +117,8 @@ describe FuzzyMatch::CachedResult do
|
|
115
117
|
|
116
118
|
it %{works with cohort_scope (albeit rather clumsily)} do
|
117
119
|
aircraft = Aircraft.find('B742')
|
118
|
-
FlightSegment.cohort({:aircraft_description => aircraft.flight_segments_foreign_keys}, :minimum_size => 2)
|
120
|
+
cohort = FlightSegment.cohort({:aircraft_description => aircraft.flight_segments_foreign_keys}, :minimum_size => 2)
|
121
|
+
FlightSegment.connection.select_value(cohort.project('COUNT(*)').to_sql).must_equal 2
|
119
122
|
# FlightSegment.cohort(:aircraft_description => aircraft.flight_segments_foreign_keys).must_equal []
|
120
123
|
end
|
121
124
|
|
data/test/test_fuzzy_match.rb
CHANGED
@@ -12,11 +12,6 @@ describe FuzzyMatch do
|
|
12
12
|
d.find('X').must_equal 'X'
|
13
13
|
d.find('A').must_be_nil
|
14
14
|
end
|
15
|
-
|
16
|
-
it %{does the right thing} do
|
17
|
-
d = FuzzyMatch.new [ 'Artyom Makarov', 'Karl' ], :must_match_at_least_one_word => true
|
18
|
-
puts d.explain('art')#.must_equal 'Artyom Makarov'
|
19
|
-
end
|
20
15
|
|
21
16
|
it %{not return any result if the maximum score is zero} do
|
22
17
|
FuzzyMatch.new(['a']).find('b').must_be_nil
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: fuzzy_match
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.3.3
|
4
|
+
version: 1.4.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,18 +9,66 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-
|
12
|
+
date: 2012-09-07 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
|
-
name:
|
15
|
+
name: to_regexp
|
16
16
|
requirement: !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
20
20
|
- !ruby/object:Gem::Version
|
21
|
-
version:
|
21
|
+
version: 0.0.3
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ! '>='
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: 0.0.3
|
30
|
+
- !ruby/object:Gem::Dependency
|
31
|
+
name: active_record_inline_schema
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
33
|
+
none: false
|
34
|
+
requirements:
|
35
|
+
- - ! '>='
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: 0.4.0
|
38
|
+
type: :development
|
39
|
+
prerelease: false
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ! '>='
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: 0.4.0
|
46
|
+
- !ruby/object:Gem::Dependency
|
47
|
+
name: minitest
|
48
|
+
requirement: !ruby/object:Gem::Requirement
|
49
|
+
none: false
|
50
|
+
requirements:
|
51
|
+
- - ! '>='
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: '0'
|
54
|
+
type: :development
|
55
|
+
prerelease: false
|
56
|
+
version_requirements: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ! '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
- !ruby/object:Gem::Dependency
|
63
|
+
name: activerecord
|
64
|
+
requirement: !ruby/object:Gem::Requirement
|
65
|
+
none: false
|
66
|
+
requirements:
|
67
|
+
- - ! '>='
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
version: '3'
|
70
|
+
type: :development
|
71
|
+
prerelease: false
|
24
72
|
version_requirements: !ruby/object:Gem::Requirement
|
25
73
|
none: false
|
26
74
|
requirements:
|
@@ -28,37 +76,101 @@ dependencies:
|
|
28
76
|
- !ruby/object:Gem::Version
|
29
77
|
version: '3'
|
30
78
|
- !ruby/object:Gem::Dependency
|
31
|
-
name:
|
79
|
+
name: mysql2
|
32
80
|
requirement: !ruby/object:Gem::Requirement
|
33
81
|
none: false
|
34
82
|
requirements:
|
35
83
|
- - ! '>='
|
36
84
|
- !ruby/object:Gem::Version
|
37
|
-
version: 0
|
38
|
-
type: :
|
85
|
+
version: '0'
|
86
|
+
type: :development
|
39
87
|
prerelease: false
|
40
88
|
version_requirements: !ruby/object:Gem::Requirement
|
41
89
|
none: false
|
42
90
|
requirements:
|
43
91
|
- - ! '>='
|
44
92
|
- !ruby/object:Gem::Version
|
45
|
-
version: 0
|
93
|
+
version: '0'
|
46
94
|
- !ruby/object:Gem::Dependency
|
47
|
-
name:
|
95
|
+
name: cohort_analysis
|
48
96
|
requirement: !ruby/object:Gem::Requirement
|
49
97
|
none: false
|
50
98
|
requirements:
|
51
99
|
- - ! '>='
|
52
100
|
- !ruby/object:Gem::Version
|
53
|
-
version: 0
|
54
|
-
type: :
|
101
|
+
version: '0'
|
102
|
+
type: :development
|
55
103
|
prerelease: false
|
56
104
|
version_requirements: !ruby/object:Gem::Requirement
|
57
105
|
none: false
|
58
106
|
requirements:
|
59
107
|
- - ! '>='
|
60
108
|
- !ruby/object:Gem::Version
|
61
|
-
version: 0
|
109
|
+
version: '0'
|
110
|
+
- !ruby/object:Gem::Dependency
|
111
|
+
name: weighted_average
|
112
|
+
requirement: !ruby/object:Gem::Requirement
|
113
|
+
none: false
|
114
|
+
requirements:
|
115
|
+
- - ! '>='
|
116
|
+
- !ruby/object:Gem::Version
|
117
|
+
version: '0'
|
118
|
+
type: :development
|
119
|
+
prerelease: false
|
120
|
+
version_requirements: !ruby/object:Gem::Requirement
|
121
|
+
none: false
|
122
|
+
requirements:
|
123
|
+
- - ! '>='
|
124
|
+
- !ruby/object:Gem::Version
|
125
|
+
version: '0'
|
126
|
+
- !ruby/object:Gem::Dependency
|
127
|
+
name: yard
|
128
|
+
requirement: !ruby/object:Gem::Requirement
|
129
|
+
none: false
|
130
|
+
requirements:
|
131
|
+
- - ! '>='
|
132
|
+
- !ruby/object:Gem::Version
|
133
|
+
version: '0'
|
134
|
+
type: :development
|
135
|
+
prerelease: false
|
136
|
+
version_requirements: !ruby/object:Gem::Requirement
|
137
|
+
none: false
|
138
|
+
requirements:
|
139
|
+
- - ! '>='
|
140
|
+
- !ruby/object:Gem::Version
|
141
|
+
version: '0'
|
142
|
+
- !ruby/object:Gem::Dependency
|
143
|
+
name: amatch
|
144
|
+
requirement: !ruby/object:Gem::Requirement
|
145
|
+
none: false
|
146
|
+
requirements:
|
147
|
+
- - ! '>='
|
148
|
+
- !ruby/object:Gem::Version
|
149
|
+
version: '0'
|
150
|
+
type: :development
|
151
|
+
prerelease: false
|
152
|
+
version_requirements: !ruby/object:Gem::Requirement
|
153
|
+
none: false
|
154
|
+
requirements:
|
155
|
+
- - ! '>='
|
156
|
+
- !ruby/object:Gem::Version
|
157
|
+
version: '0'
|
158
|
+
- !ruby/object:Gem::Dependency
|
159
|
+
name: minitest-reporters
|
160
|
+
requirement: !ruby/object:Gem::Requirement
|
161
|
+
none: false
|
162
|
+
requirements:
|
163
|
+
- - ! '>='
|
164
|
+
- !ruby/object:Gem::Version
|
165
|
+
version: '0'
|
166
|
+
type: :development
|
167
|
+
prerelease: false
|
168
|
+
version_requirements: !ruby/object:Gem::Requirement
|
169
|
+
none: false
|
170
|
+
requirements:
|
171
|
+
- - ! '>='
|
172
|
+
- !ruby/object:Gem::Version
|
173
|
+
version: '0'
|
62
174
|
description: Find a needle in a haystack using string similarity and (optionally)
|
63
175
|
regexp rules. Replaces loose_tight_dictionary.
|
64
176
|
email:
|
@@ -68,7 +180,6 @@ executables:
|
|
68
180
|
extensions: []
|
69
181
|
extra_rdoc_files: []
|
70
182
|
files:
|
71
|
-
- .document
|
72
183
|
- .gitignore
|
73
184
|
- CHANGELOG
|
74
185
|
- Gemfile
|
@@ -139,7 +250,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
139
250
|
version: '0'
|
140
251
|
requirements: []
|
141
252
|
rubyforge_project: fuzzy_match
|
142
|
-
rubygems_version: 1.8.
|
253
|
+
rubygems_version: 1.8.24
|
143
254
|
signing_key:
|
144
255
|
specification_version: 3
|
145
256
|
summary: Find a needle in a haystack using string similarity and (optionally) regexp
|