fuzzy_match 1.1.1 → 1.2.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,3 +1,3 @@
1
1
  class FuzzyMatch
2
- VERSION = '1.1.1'
2
+ VERSION = '1.2.1'
3
3
  end
@@ -58,9 +58,9 @@ class FuzzyMatch
58
58
  end
59
59
 
60
60
  def variants
61
- @variants ||= fuzzy_match.tighteners.inject([ render ]) do |memo, tightener|
62
- if tightener.apply? render
63
- memo.push tightener.apply(render)
61
+ @variants ||= fuzzy_match.normalizers.inject([ render ]) do |memo, normalizer|
62
+ if normalizer.apply? render
63
+ memo.push normalizer.apply(render)
64
64
  end
65
65
  memo
66
66
  end.uniq
data/lib/fuzzy_match.rb CHANGED
@@ -5,18 +5,21 @@ if ::ActiveSupport::VERSION::MAJOR >= 3
5
5
  end
6
6
  require 'to_regexp'
7
7
 
8
+ require 'fuzzy_match/normalizer'
9
+ require 'fuzzy_match/stop_word'
10
+ require 'fuzzy_match/blocking'
11
+ require 'fuzzy_match/identity'
12
+ require 'fuzzy_match/result'
13
+ require 'fuzzy_match/wrapper'
14
+ require 'fuzzy_match/similarity'
15
+ require 'fuzzy_match/score'
16
+
17
+ if defined?(::ActiveRecord)
18
+ require 'fuzzy_match/cached_result'
19
+ end
20
+
8
21
  # See the README for more information.
9
22
  class FuzzyMatch
10
- autoload :Tightener, 'fuzzy_match/tightener'
11
- autoload :StopWord, 'fuzzy_match/stop_word'
12
- autoload :Blocking, 'fuzzy_match/blocking'
13
- autoload :Identity, 'fuzzy_match/identity'
14
- autoload :Result, 'fuzzy_match/result'
15
- autoload :Wrapper, 'fuzzy_match/wrapper'
16
- autoload :Similarity, 'fuzzy_match/similarity'
17
- autoload :Score, 'fuzzy_match/score'
18
- autoload :CachedResult, 'fuzzy_match/cached_result'
19
-
20
23
  DEFAULT_OPTIONS = {
21
24
  :first_blocking_decides => false,
22
25
  :must_match_blocking => false,
@@ -28,33 +31,32 @@ class FuzzyMatch
28
31
  attr_reader :haystack
29
32
  attr_reader :blockings
30
33
  attr_reader :identities
31
- attr_reader :tighteners
34
+ attr_reader :normalizers
32
35
  attr_reader :stop_words
33
36
  attr_reader :read
34
37
  attr_reader :default_options
35
38
 
36
39
  # haystack - a bunch of records that will compete to see who best matches the needle
37
40
  #
38
- # rules (can only be specified at initialization or by using a setter)
39
- # * tighteners: regexps (see readme)
40
- # * identities: regexps
41
- # * blockings: regexps
42
- # * stop_words: regexps
43
- # * read: how to interpret each entry in the 'haystack', either a Proc or a symbol
41
+ # Rules (can only be specified at initialization or by using a setter)
42
+ # * :<tt>normalizers</tt> - regexps (see README)
43
+ # * :<tt>identities</tt> - regexps
44
+ # * :<tt>blockings</tt> - regexps
45
+ # * :<tt>stop_words</tt> - regexps
44
46
  #
45
- # options (can be specified at initialization or when calling #find)
46
- # * first_blocking_decides
47
- # * must_match_blocking
48
- # * must_match_at_least_one_word
49
- # * gather_last_result
50
- # * find_all
47
+ # Options (can be specified at initialization or when calling #find)
48
+ # * :<tt>read</tt> - how to interpret each record in the 'haystack', either a Proc or a symbol
49
+ # * :<tt>must_match_blocking</tt> - don't return a match unless the needle fits into one of the blockings you specified
50
+ # * :<tt>must_match_at_least_one_word</tt> - don't return a match unless the needle shares at least one word with the match
51
+ # * :<tt>first_blocking_decides</tt> - force records into the first blocking they match, rather than choosing a blocking that will give them a higher score
52
+ # * :<tt>gather_last_result</tt> - enable <tt>last_result</tt>
51
53
  def initialize(competitors, options_and_rules = {})
52
54
  options_and_rules = options_and_rules.symbolize_keys
53
55
 
54
56
  # rules
55
57
  self.blockings = options_and_rules.delete(:blockings) || []
56
58
  self.identities = options_and_rules.delete(:identities) || []
57
- self.tighteners = options_and_rules.delete(:tighteners) || []
59
+ self.normalizers = options_and_rules.delete(:normalizers) || options_and_rules.delete(:tighteners) || []
58
60
  self.stop_words = options_and_rules.delete(:stop_words) || []
59
61
  @read = options_and_rules.delete(:read) || options_and_rules.delete(:haystack_reader)
60
62
 
@@ -73,8 +75,8 @@ class FuzzyMatch
73
75
  @identities = ary.map { |regexp_or_str| Identity.new regexp_or_str }
74
76
  end
75
77
 
76
- def tighteners=(ary)
77
- @tighteners = ary.map { |regexp_or_str| Tightener.new regexp_or_str }
78
+ def normalizers=(ary)
79
+ @normalizers = ary.map { |regexp_or_str| Normalizer.new regexp_or_str }
78
80
  end
79
81
 
80
82
  def stop_words=(ary)
@@ -95,8 +97,6 @@ class FuzzyMatch
95
97
  end
96
98
 
97
99
  def find(needle, options = {})
98
- raise ::RuntimeError, "[fuzzy_match] Dictionary has already been freed, can't perform more finds" if freed?
99
-
100
100
  options = options.symbolize_keys.reverse_merge default_options
101
101
 
102
102
  gather_last_result = options[:gather_last_result]
@@ -106,7 +106,6 @@ class FuzzyMatch
106
106
  must_match_at_least_one_word = options[:must_match_at_least_one_word]
107
107
 
108
108
  if gather_last_result
109
- free_last_result
110
109
  @last_result = Result.new
111
110
  last_result.read = read
112
111
  last_result.haystack = haystack
@@ -118,7 +117,7 @@ EOS
118
117
  end
119
118
 
120
119
  if gather_last_result
121
- last_result.tighteners = tighteners
120
+ last_result.normalizers = normalizers
122
121
  last_result.identities = identities
123
122
  last_result.blockings = blockings
124
123
  last_result.stop_words = stop_words
@@ -263,21 +262,7 @@ EOS
263
262
  last_result.explain
264
263
  end
265
264
 
266
- def freed?
267
- @freed == true
268
- end
269
-
265
+ # DEPRECATED - doesn't do anything
270
266
  def free
271
- free_last_result
272
- @haystack.try :clear
273
- @haystack = nil
274
- ensure
275
- @freed = true
276
- end
277
-
278
- private
279
-
280
- def free_last_result
281
- @last_result = nil
282
267
  end
283
268
  end
@@ -20,4 +20,9 @@ class TestBlocking < Test::Unit::TestCase
20
20
  b = FuzzyMatch::Blocking.new %r{apple}
21
21
  assert_equal nil, b.join?('orange', 'orange')
22
22
  end
23
+
24
+ def test_004_accepts_case_insensitivity
25
+ b = FuzzyMatch::Blocking.new %r{apple}i
26
+ assert_equal true, b.match?('2 Apples')
27
+ end
23
28
  end
@@ -6,12 +6,12 @@ class TestFuzzyMatch < Test::Unit::TestCase
6
6
  d = FuzzyMatch.new %w{ RATZ CATZ }
7
7
  assert_equal 'RATZ', d.find('RITZ')
8
8
  assert_equal 'RATZ', d.find('RíTZ')
9
-
9
+
10
10
  d = FuzzyMatch.new [ 'X' ]
11
11
  assert_equal 'X', d.find('X')
12
12
  assert_equal nil, d.find('A')
13
13
  end
14
-
14
+
15
15
  def test_002_dont_gather_last_result_by_default
16
16
  d = FuzzyMatch.new %w{ NISSAN HONDA }
17
17
  d.find('MISSAM')
@@ -19,88 +19,86 @@ class TestFuzzyMatch < Test::Unit::TestCase
19
19
  d.last_result
20
20
  end
21
21
  end
22
-
22
+
23
23
  def test_003_last_result
24
24
  d = FuzzyMatch.new %w{ NISSAN HONDA }
25
25
  d.find 'MISSAM', :gather_last_result => true
26
26
  assert_equal 0.6, d.last_result.score
27
27
  assert_equal 'NISSAN', d.last_result.winner
28
28
  end
29
-
30
- def test_004_false_positive_without_tightener
29
+
30
+ def test_005_correct_with_normalizer
31
31
  d = FuzzyMatch.new ['BOEING 737-100/200', 'BOEING 737-900']
32
- assert_equal 'BOEING 737-900', d.find('BOEING 737100 number 900')
33
- end
34
-
35
- def test_005_correct_with_tightener
36
- tighteners = [
32
+ assert_equal 'BOEING 737-900', d.find('BOEING 737100 number 900') # false positive without normalizer
33
+
34
+ normalizers = [
37
35
  %r{(7\d)(7|0)-?(\d{1,3})} # tighten 737-100/200 => 737100, which will cause it to win over 737-900
38
36
  ]
39
- d = FuzzyMatch.new ['BOEING 737-100/200', 'BOEING 737-900'], :tighteners => tighteners
37
+ d = FuzzyMatch.new ['BOEING 737-100/200', 'BOEING 737-900'], :normalizers => normalizers
40
38
  assert_equal 'BOEING 737-100/200', d.find('BOEING 737100 number 900')
41
39
  end
42
-
40
+
43
41
  def test_008_false_positive_without_identity
44
42
  d = FuzzyMatch.new %w{ foo bar }
45
43
  assert_equal 'bar', d.find('baz')
46
44
  end
47
-
45
+
48
46
  def test_008_identify_false_positive
49
47
  d = FuzzyMatch.new %w{ foo bar }, :identities => [ /ba(.)/ ]
50
48
  assert_equal nil, d.find('baz')
51
49
  end
52
-
50
+
53
51
  # TODO this is not very helpful
54
52
  def test_009_blocking
55
53
  d = FuzzyMatch.new [ 'X' ], :blockings => [ /X/, /Y/ ]
56
54
  assert_equal 'X', d.find('X')
57
55
  assert_equal nil, d.find('A')
58
56
  end
59
-
57
+
60
58
  # TODO this is not very helpful
61
59
  def test_0095_must_match_blocking
62
60
  d = FuzzyMatch.new [ 'X' ], :blockings => [ /X/, /Y/ ], :must_match_blocking => true
63
61
  assert_equal 'X', d.find('X')
64
62
  assert_equal nil, d.find('A')
65
-
63
+
66
64
  d = FuzzyMatch.new [ 'X' ], :blockings => [ /X/, /Y/ ]
67
65
  assert_equal 'X', d.find('X', :must_match_blocking => true)
68
66
  assert_equal nil, d.find('A', :must_match_blocking => true)
69
67
  end
70
-
71
- def test_011_free
72
- d = FuzzyMatch.new %w{ NISSAN HONDA }
73
- d.free
74
- assert_raises(::RuntimeError, /free/) do
75
- d.find('foobar')
68
+
69
+ def test_011_free_does_nothing
70
+ d = FuzzyMatch.new %w{ A B }
71
+ assert_nothing_raised do
72
+ d.free
73
+ d.find 'A'
76
74
  end
77
75
  end
78
-
76
+
79
77
  def test_012_find_all
80
78
  d = FuzzyMatch.new [ 'X', 'X22', 'Y', 'Y4' ], :blockings => [ /X/, /Y/ ], :must_match_blocking => true
81
79
  assert_equal ['X', 'X22' ], d.find_all('X')
82
80
  assert_equal [], d.find_all('A')
83
81
  end
84
-
82
+
85
83
  def test_013_first_blocking_decides
86
84
  d = FuzzyMatch.new [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], :blockings => [ /(boeing \d{3})/i, /boeing/i ]
87
85
  assert_equal [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], d.find_all('Boeing 747')
88
-
86
+
89
87
  d = FuzzyMatch.new [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], :blockings => [ /(boeing \d{3})/i, /boeing/i ], :first_blocking_decides => true
90
88
  assert_equal [ 'Boeing 747', 'Boeing 747SR' ], d.find_all('Boeing 747')
91
-
89
+
92
90
  # first_blocking_decides refers to the needle
93
91
  d = FuzzyMatch.new [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], :blockings => [ /(boeing \d{3})/i, /boeing/i ], :first_blocking_decides => true
94
92
  assert_equal ["Boeing ER6", "Boeing 747", "Boeing 747SR"], d.find_all('Boeing ER6')
95
-
93
+
96
94
  d = FuzzyMatch.new [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], :blockings => [ /(boeing \d{3})/i, /boeing (7|E)/i, /boeing/i ], :first_blocking_decides => true
97
95
  assert_equal [ 'Boeing ER6' ], d.find_all('Boeing ER6')
98
-
96
+
99
97
  # or equivalently with an identity
100
98
  d = FuzzyMatch.new [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], :blockings => [ /(boeing \d{3})/i, /boeing/i ], :first_blocking_decides => true, :identities => [ /boeing (7|E)/i ]
101
99
  assert_equal [ 'Boeing ER6' ], d.find_all('Boeing ER6')
102
100
  end
103
-
101
+
104
102
  MyStruct = Struct.new(:one, :two)
105
103
  def test_014_symbol_read_sends_method
106
104
  ab = MyStruct.new('a', 'b')
@@ -115,7 +113,7 @@ class TestFuzzyMatch < Test::Unit::TestCase
115
113
  assert_equal ba, by_first.find('b')
116
114
  assert_equal ba, by_last.find('a')
117
115
  end
118
-
116
+
119
117
  def test_015_symbol_read_reads_array
120
118
  ab = ['a', 'b']
121
119
  ba = ['b', 'a']
@@ -127,7 +125,7 @@ class TestFuzzyMatch < Test::Unit::TestCase
127
125
  assert_equal ba, by_first.find('b')
128
126
  assert_equal ba, by_last.find('a')
129
127
  end
130
-
128
+
131
129
  def test_016_symbol_read_reads_hash
132
130
  ab = { :one => 'a', :two => 'b' }
133
131
  ba = { :one => 'b', :two => 'a' }
@@ -139,7 +137,7 @@ class TestFuzzyMatch < Test::Unit::TestCase
139
137
  assert_equal ba, by_first.find('b')
140
138
  assert_equal ba, by_last.find('a')
141
139
  end
142
-
140
+
143
141
  def test_017_understands_haystack_reader_option
144
142
  ab = ['a', 'b']
145
143
  ba = ['b', 'a']
@@ -148,31 +146,31 @@ class TestFuzzyMatch < Test::Unit::TestCase
148
146
  assert_equal ab, by_first.find('a')
149
147
  assert_equal ba, by_first.find('b')
150
148
  end
151
-
149
+
152
150
  def test_018_no_result_if_best_score_is_zero
153
151
  assert_equal nil, FuzzyMatch.new(['a']).find('b')
154
152
  end
155
-
153
+
156
154
  def test_019_must_match_at_least_one_word
157
155
  d = FuzzyMatch.new %w{ RATZ CATZ }, :must_match_at_least_one_word => true
158
156
  assert_equal nil, d.find('RITZ')
159
-
157
+
160
158
  d = FuzzyMatch.new ["Foo's Bar"], :must_match_at_least_one_word => true
161
159
  assert_equal nil, d.find("Jacob's")
162
160
  assert_equal "Foo's Bar", d.find("Foo's")
163
161
  end
164
-
162
+
165
163
  def test_020_stop_words
166
164
  d = FuzzyMatch.new [ 'A HOTEL', 'B HTL' ]
167
165
  assert_equal 'B HTL', d.find('A HTL', :must_match_at_least_one_word => true)
168
-
166
+
169
167
  d = FuzzyMatch.new [ 'A HOTEL', 'B HTL' ], :must_match_at_least_one_word => true
170
168
  assert_equal 'B HTL', d.find('A HTL')
171
-
169
+
172
170
  d = FuzzyMatch.new [ 'A HOTEL', 'B HTL' ], :must_match_at_least_one_word => true, :stop_words => [ %r{HO?TE?L} ]
173
171
  assert_equal 'A HOTEL', d.find('A HTL')
174
172
  end
175
-
173
+
176
174
  def test_021_explain_prints_to_stdout
177
175
  require 'stringio'
178
176
  capture = StringIO.new
@@ -187,15 +185,15 @@ class TestFuzzyMatch < Test::Unit::TestCase
187
185
  capture.rewind
188
186
  assert capture.read.include?('CATZ')
189
187
  end
190
-
188
+
191
189
  def test_022_compare_words_with_words
192
190
  d = FuzzyMatch.new [ 'PENINSULA HOTELS' ], :must_match_at_least_one_word => true
193
191
  assert_equal nil, d.find('DOLCE LA HULPE BXL FI')
194
192
  end
195
-
193
+
196
194
  def test_023_must_match_at_least_one_word_is_case_insensitive
197
195
  d = FuzzyMatch.new [ 'A', 'B' ]
198
196
  assert_equal 'A', d.find('a', :must_match_at_least_one_word => true)
199
197
  end
200
-
198
+
201
199
  end
@@ -30,4 +30,9 @@ class TestIdentity < Test::Unit::TestCase
30
30
  i = FuzzyMatch::Identity.new '/\A\\\?\/(.*)etc\/mysql\$$/'
31
31
  assert_equal %r{\A\\?/(.*)etc/mysql\$$}, i.regexp
32
32
  end
33
+
34
+ def test_007_accepts_case_insensitivity
35
+ i = FuzzyMatch::Identity.new %r{(A)[ ]*(\d)}i
36
+ assert_equal true, i.identical?('A1', 'a 1foobar')
37
+ end
33
38
  end
@@ -1,8 +1,8 @@
1
1
  require 'helper'
2
2
 
3
- class TestTightener < Test::Unit::TestCase
3
+ class TestNormalizer < Test::Unit::TestCase
4
4
  def test_001_apply
5
- t = FuzzyMatch::Tightener.new %r{(Ford )[ ]*(F)[\- ]*(\d\d\d)}i
5
+ t = FuzzyMatch::Normalizer.new %r{(Ford )[ ]*(F)[\- ]*(\d\d\d)}i
6
6
  assert_equal 'Ford F350', t.apply('Ford F-350')
7
7
  assert_equal 'Ford F150', t.apply('Ford F150')
8
8
  assert_equal 'Ford F350', t.apply('Ford F 350')
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: fuzzy_match
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.1
4
+ version: 1.2.1
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-01-17 00:00:00.000000000Z
12
+ date: 2012-01-18 00:00:00.000000000Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: shoulda
16
- requirement: &2153863620 !ruby/object:Gem::Requirement
16
+ requirement: &2177380220 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: '0'
22
22
  type: :development
23
23
  prerelease: false
24
- version_requirements: *2153863620
24
+ version_requirements: *2177380220
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: remote_table
27
- requirement: &2153862820 !ruby/object:Gem::Requirement
27
+ requirement: &2177379700 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ! '>='
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: '0'
33
33
  type: :development
34
34
  prerelease: false
35
- version_requirements: *2153862820
35
+ version_requirements: *2177379700
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: activerecord
38
- requirement: &2153861940 !ruby/object:Gem::Requirement
38
+ requirement: &2177379100 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ! '>='
@@ -43,10 +43,10 @@ dependencies:
43
43
  version: '3'
44
44
  type: :development
45
45
  prerelease: false
46
- version_requirements: *2153861940
46
+ version_requirements: *2177379100
47
47
  - !ruby/object:Gem::Dependency
48
48
  name: mysql
49
- requirement: &2153861380 !ruby/object:Gem::Requirement
49
+ requirement: &2177378440 !ruby/object:Gem::Requirement
50
50
  none: false
51
51
  requirements:
52
52
  - - ! '>='
@@ -54,10 +54,10 @@ dependencies:
54
54
  version: '0'
55
55
  type: :development
56
56
  prerelease: false
57
- version_requirements: *2153861380
57
+ version_requirements: *2177378440
58
58
  - !ruby/object:Gem::Dependency
59
59
  name: cohort_scope
60
- requirement: &2153860800 !ruby/object:Gem::Requirement
60
+ requirement: &2177377600 !ruby/object:Gem::Requirement
61
61
  none: false
62
62
  requirements:
63
63
  - - ! '>='
@@ -65,10 +65,10 @@ dependencies:
65
65
  version: '0'
66
66
  type: :development
67
67
  prerelease: false
68
- version_requirements: *2153860800
68
+ version_requirements: *2177377600
69
69
  - !ruby/object:Gem::Dependency
70
70
  name: weighted_average
71
- requirement: &2153860020 !ruby/object:Gem::Requirement
71
+ requirement: &2177377020 !ruby/object:Gem::Requirement
72
72
  none: false
73
73
  requirements:
74
74
  - - ! '>='
@@ -76,10 +76,10 @@ dependencies:
76
76
  version: '0'
77
77
  type: :development
78
78
  prerelease: false
79
- version_requirements: *2153860020
79
+ version_requirements: *2177377020
80
80
  - !ruby/object:Gem::Dependency
81
81
  name: rake
82
- requirement: &2153858540 !ruby/object:Gem::Requirement
82
+ requirement: &2177376420 !ruby/object:Gem::Requirement
83
83
  none: false
84
84
  requirements:
85
85
  - - ! '>='
@@ -87,10 +87,10 @@ dependencies:
87
87
  version: '0'
88
88
  type: :development
89
89
  prerelease: false
90
- version_requirements: *2153858540
90
+ version_requirements: *2177376420
91
91
  - !ruby/object:Gem::Dependency
92
92
  name: activesupport
93
- requirement: &2153857380 !ruby/object:Gem::Requirement
93
+ requirement: &2177375240 !ruby/object:Gem::Requirement
94
94
  none: false
95
95
  requirements:
96
96
  - - ! '>='
@@ -98,10 +98,10 @@ dependencies:
98
98
  version: '3'
99
99
  type: :runtime
100
100
  prerelease: false
101
- version_requirements: *2153857380
101
+ version_requirements: *2177375240
102
102
  - !ruby/object:Gem::Dependency
103
103
  name: to_regexp
104
- requirement: &2153856360 !ruby/object:Gem::Requirement
104
+ requirement: &2177374500 !ruby/object:Gem::Requirement
105
105
  none: false
106
106
  requirements:
107
107
  - - ! '>='
@@ -109,7 +109,7 @@ dependencies:
109
109
  version: 0.0.3
110
110
  type: :runtime
111
111
  prerelease: false
112
- version_requirements: *2153856360
112
+ version_requirements: *2177374500
113
113
  description: Find a needle in a haystack using string similarity and (optionally)
114
114
  regexp rules. Replaces loose_tight_dictionary.
115
115
  email:
@@ -122,7 +122,7 @@ files:
122
122
  - .gitignore
123
123
  - Gemfile
124
124
  - LICENSE
125
- - README.rdoc
125
+ - README.markdown
126
126
  - Rakefile
127
127
  - THANKS-WILLIAM-JAMES.rb
128
128
  - benchmark/before-with-free.txt
@@ -137,10 +137,10 @@ files:
137
137
  - examples/bts_aircraft/blockings.csv
138
138
  - examples/bts_aircraft/identities.csv
139
139
  - examples/bts_aircraft/negatives.csv
140
+ - examples/bts_aircraft/normalizers.csv
140
141
  - examples/bts_aircraft/number_260.csv
141
142
  - examples/bts_aircraft/positives.csv
142
143
  - examples/bts_aircraft/test_bts_aircraft.rb
143
- - examples/bts_aircraft/tighteners.csv
144
144
  - examples/first_name_matching.rb
145
145
  - examples/icao-bts.xls
146
146
  - fuzzy_match.gemspec
@@ -148,11 +148,11 @@ files:
148
148
  - lib/fuzzy_match/blocking.rb
149
149
  - lib/fuzzy_match/cached_result.rb
150
150
  - lib/fuzzy_match/identity.rb
151
+ - lib/fuzzy_match/normalizer.rb
151
152
  - lib/fuzzy_match/result.rb
152
153
  - lib/fuzzy_match/score.rb
153
154
  - lib/fuzzy_match/similarity.rb
154
155
  - lib/fuzzy_match/stop_word.rb
155
- - lib/fuzzy_match/tightener.rb
156
156
  - lib/fuzzy_match/version.rb
157
157
  - lib/fuzzy_match/wrapper.rb
158
158
  - test/helper.rb
@@ -161,7 +161,7 @@ files:
161
161
  - test/test_fuzzy_match.rb
162
162
  - test/test_fuzzy_match_convoluted.rb.disabled
163
163
  - test/test_identity.rb
164
- - test/test_tightening.rb
164
+ - test/test_normalizer.rb
165
165
  homepage: https://github.com/seamusabshere/fuzzy_match
166
166
  licenses: []
167
167
  post_install_message:
@@ -194,4 +194,5 @@ test_files:
194
194
  - test/test_fuzzy_match.rb
195
195
  - test/test_fuzzy_match_convoluted.rb.disabled
196
196
  - test/test_identity.rb
197
- - test/test_tightening.rb
197
+ - test/test_normalizer.rb
198
+ has_rdoc:
data/README.rdoc DELETED
@@ -1,94 +0,0 @@
1
- = fuzzy_match
2
-
3
- Find a needle in a haystack based on string similarity (using the Pair Distance algorithm and Levenshtein distance) and regular expressions.
4
-
5
- Replaces {loose_tight_dictionary}[https://github.com/seamusabshere/loose_tight_dictionary] because that was a confusing name.
6
-
7
- == Quickstart
8
-
9
- >> require 'fuzzy_match'
10
- => true
11
- >> FuzzyMatch.new(%w{seamus andy ben}).find('Shamus')
12
- => "seamus"
13
-
14
- == String similarity matching
15
-
16
- Uses {Dice's Coefficient}[http://en.wikipedia.org/wiki/Dice's_coefficient] algorithm (aka Pair Distance).
17
-
18
- If that judges two strings to be equally similar to a third string, then Levenshtein distance is used. For example, pair distance considers "RATZ" and "CATZ" to be equally similar to "RITZ" so we invoke Levenshtein.
19
-
20
- >> require 'amatch'
21
- => true
22
- >> 'RITZ'.pair_distance_similar 'RATZ'
23
- => 0.3333333333333333
24
- >> 'RITZ'.pair_distance_similar 'CATZ' # <-- pair distance can't tell the difference, so we fall back to levenshtein...
25
- => 0.3333333333333333
26
- >> 'RITZ'.levenshtein_similar 'RATZ'
27
- => 0.75
28
- >> 'RITZ'.levenshtein_similar 'CATZ' # <-- which properly shows that RATZ should win
29
- => 0.5
30
-
31
- == Production use
32
-
33
- Over 2 years in {Brighter Planet's environmental impact API}[http://impact.brighterplanet.com] and {reference data service}[http://data.brighterplanet.com].
34
-
35
- == Haystacks and how to read them
36
-
37
- The (admittedly imperfect) metaphor is "look for a needle in a haystack"
38
-
39
- * needle - the search term
40
- * haystack - the records you are searching (<b>your result will be an object from here</b>)
41
-
42
- So, what if your needle is a string like <tt>youruguay</tt> and your haystack is full of <tt>Country</tt> objects like <tt><Country name:"Uruguay"></tt>?
43
-
44
- >> FuzzyMatch.new(countries, :read => :name).find('youruguay')
45
- => <Country name:"Uruguay">
46
-
47
- == Regular expressions
48
-
49
- You can improve the default matchings with regular expressions.
50
-
51
- * Emphasize important words using <b>blockings</b> and <b>tighteners</b>
52
- * Filter out stop words with <b>tighteners</b>
53
- * Prevent impossible matches with <b>blockings</b> and <b>identities</b>
54
- * Ignore words with <b>stop words</b>
55
-
56
- === Blockings
57
-
58
- Setting a blocking of <tt>/Airbus/</tt> ensures that strings containing "Airbus" will only be scored against other strings containing "Airbus". A better blocking in this case would probably be <tt>/airbus/i</tt>.
59
-
60
- === Tighteners
61
-
62
- Adding a tightener like <tt>/(boeing).*(7\d\d)/i</tt> will cause "BOEING COMPANY 747" and "boeing747" to be scored as if they were "BOEING 747" and "boeing 747", respectively. See also "Case sensitivity" below.
63
-
64
- === Identities
65
-
66
- Adding an identity like <tt>/(F)\-?(\d50)/</tt> ensures that "Ford F-150" and "Ford F-250" never match.
67
-
68
- === Stop words
69
-
70
- Adding a stop word like <tt>THE</tt> ensures that it is not taken into account when comparing "THE CAT", "THE DAT", and "THE CATT"
71
-
72
- == Case sensitivity
73
-
74
- Scoring is case-insensitive. Everything is downcased before scoring. This is a change from previous versions. Your regexps may still be case-sensitive, though.
75
-
76
- == Examples
77
-
78
- Check out the tests.
79
-
80
- == Speed (and who to thank for the algorithms)
81
-
82
- If you add the amatch[http://flori.github.com/amatch/] gem to your Gemfile, it will use that, which is much faster (but {segfaults have been seen in the wild}[https://github.com/flori/amatch/issues/3]). Thanks {Flori}[https://github.com/flori]!
83
-
84
- Otherwise, pure ruby versions of the string similarity algorithms derived from the {answer to a StackOverflow question}[http://stackoverflow.com/questions/653157/a-better-similarity-ranking-algorithm-for-variable-length-strings] and {the text gem}[https://github.com/threedaymonk/text/blob/master/lib/text/levenshtein.rb] are used. Thanks {marzagao}[http://stackoverflow.com/users/10997/marzagao] and {threedaymonk}[https://github.com/threedaymonk]!
85
-
86
- == Authors
87
-
88
- * Seamus Abshere <seamus@abshere.net>
89
- * Ian Hough <ijhough@gmail.com>
90
- * Andy Rossmeissl <andy@rossmeissl.net>
91
-
92
- == Copyright
93
-
94
- Copyright 2011 Brighter Planet, Inc.