fuzzy_match 1.5.0 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +8 -8
- data/.rspec +2 -0
- data/CHANGELOG +14 -0
- data/Gemfile +8 -0
- data/README.markdown +58 -38
- data/Rakefile +0 -9
- data/bin/fuzzy_match +106 -0
- data/fuzzy_match.gemspec +4 -4
- data/groupings-screenshot.png +0 -0
- data/highlevel.graffle +0 -0
- data/highlevel.png +0 -0
- data/lib/fuzzy_match/record.rb +58 -0
- data/lib/fuzzy_match/result.rb +11 -8
- data/lib/fuzzy_match/rule/grouping.rb +70 -12
- data/lib/fuzzy_match/rule/identity.rb +3 -3
- data/lib/fuzzy_match/rule.rb +1 -1
- data/lib/fuzzy_match/score/amatch.rb +0 -4
- data/lib/fuzzy_match/score/pure_ruby.rb +2 -8
- data/lib/fuzzy_match/score.rb +4 -0
- data/lib/fuzzy_match/similarity.rb +10 -32
- data/lib/fuzzy_match/version.rb +1 -1
- data/lib/fuzzy_match.rb +78 -94
- data/{test/test_amatch.rb → spec/amatch_spec.rb} +1 -2
- data/{test/test_cache.rb → spec/cache_spec.rb} +7 -7
- data/spec/foo.rb +9 -0
- data/spec/fuzzy_match_spec.rb +354 -0
- data/spec/grouping_spec.rb +60 -0
- data/spec/identity_spec.rb +29 -0
- data/{test/test_wrapper.rb → spec/record_spec.rb} +3 -7
- data/spec/spec_helper.rb +21 -0
- metadata +56 -50
- data/bin/fuzzy_match_checker +0 -71
- data/examples/bts_aircraft/5-2-A.htm +0 -10305
- data/examples/bts_aircraft/5-2-B.htm +0 -9576
- data/examples/bts_aircraft/5-2-D.htm +0 -7094
- data/examples/bts_aircraft/5-2-E.htm +0 -2349
- data/examples/bts_aircraft/5-2-G.htm +0 -2922
- data/examples/bts_aircraft/groupings.csv +0 -1
- data/examples/bts_aircraft/identities.csv +0 -1
- data/examples/bts_aircraft/negatives.csv +0 -1
- data/examples/bts_aircraft/normalizers.csv +0 -1
- data/examples/bts_aircraft/number_260.csv +0 -334
- data/examples/bts_aircraft/positives.csv +0 -1
- data/examples/bts_aircraft/test_bts_aircraft.rb +0 -116
- data/examples/first_name_matching.rb +0 -15
- data/examples/icao-bts.xls +0 -0
- data/lib/fuzzy_match/rule/normalizer.rb +0 -20
- data/lib/fuzzy_match/rule/stop_word.rb +0 -11
- data/lib/fuzzy_match/wrapper.rb +0 -73
- data/test/helper.rb +0 -12
- data/test/test_fuzzy_match.rb +0 -304
- data/test/test_fuzzy_match_convoluted.rb.disabled +0 -268
- data/test/test_grouping.rb +0 -28
- data/test/test_identity.rb +0 -34
- data/test/test_normalizer.rb +0 -10
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: fuzzy_match
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.5.0
|
4
|
+
version: 2.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Seamus Abshere
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-
|
11
|
+
date: 2013-05-22 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: active_record_inline_schema
|
@@ -25,7 +25,7 @@ dependencies:
|
|
25
25
|
- !ruby/object:Gem::Version
|
26
26
|
version: 0.4.0
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
|
-
name:
|
28
|
+
name: pry
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
31
|
- - ! '>='
|
@@ -39,21 +39,21 @@ dependencies:
|
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: '0'
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
|
-
name:
|
42
|
+
name: rspec-core
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
44
44
|
requirements:
|
45
45
|
- - ! '>='
|
46
46
|
- !ruby/object:Gem::Version
|
47
|
-
version: '
|
47
|
+
version: '0'
|
48
48
|
type: :development
|
49
49
|
prerelease: false
|
50
50
|
version_requirements: !ruby/object:Gem::Requirement
|
51
51
|
requirements:
|
52
52
|
- - ! '>='
|
53
53
|
- !ruby/object:Gem::Version
|
54
|
-
version: '
|
54
|
+
version: '0'
|
55
55
|
- !ruby/object:Gem::Dependency
|
56
|
-
name:
|
56
|
+
name: rspec-expectations
|
57
57
|
requirement: !ruby/object:Gem::Requirement
|
58
58
|
requirements:
|
59
59
|
- - ! '>='
|
@@ -67,7 +67,7 @@ dependencies:
|
|
67
67
|
- !ruby/object:Gem::Version
|
68
68
|
version: '0'
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
|
-
name:
|
70
|
+
name: rspec-mocks
|
71
71
|
requirement: !ruby/object:Gem::Requirement
|
72
72
|
requirements:
|
73
73
|
- - ! '>='
|
@@ -81,7 +81,21 @@ dependencies:
|
|
81
81
|
- !ruby/object:Gem::Version
|
82
82
|
version: '0'
|
83
83
|
- !ruby/object:Gem::Dependency
|
84
|
-
name:
|
84
|
+
name: activerecord
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - ! '>='
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '3'
|
90
|
+
type: :development
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - ! '>='
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '3'
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: mysql2
|
85
99
|
requirement: !ruby/object:Gem::Requirement
|
86
100
|
requirements:
|
87
101
|
- - ! '>='
|
@@ -95,7 +109,7 @@ dependencies:
|
|
95
109
|
- !ruby/object:Gem::Version
|
96
110
|
version: '0'
|
97
111
|
- !ruby/object:Gem::Dependency
|
98
|
-
name:
|
112
|
+
name: cohort_analysis
|
99
113
|
requirement: !ruby/object:Gem::Requirement
|
100
114
|
requirements:
|
101
115
|
- - ! '>='
|
@@ -109,7 +123,7 @@ dependencies:
|
|
109
123
|
- !ruby/object:Gem::Version
|
110
124
|
version: '0'
|
111
125
|
- !ruby/object:Gem::Dependency
|
112
|
-
name:
|
126
|
+
name: weighted_average
|
113
127
|
requirement: !ruby/object:Gem::Requirement
|
114
128
|
requirements:
|
115
129
|
- - ! '>='
|
@@ -123,7 +137,21 @@ dependencies:
|
|
123
137
|
- !ruby/object:Gem::Version
|
124
138
|
version: '0'
|
125
139
|
- !ruby/object:Gem::Dependency
|
126
|
-
name:
|
140
|
+
name: yard
|
141
|
+
requirement: !ruby/object:Gem::Requirement
|
142
|
+
requirements:
|
143
|
+
- - ! '>='
|
144
|
+
- !ruby/object:Gem::Version
|
145
|
+
version: '0'
|
146
|
+
type: :development
|
147
|
+
prerelease: false
|
148
|
+
version_requirements: !ruby/object:Gem::Requirement
|
149
|
+
requirements:
|
150
|
+
- - ! '>='
|
151
|
+
- !ruby/object:Gem::Version
|
152
|
+
version: '0'
|
153
|
+
- !ruby/object:Gem::Dependency
|
154
|
+
name: amatch
|
127
155
|
requirement: !ruby/object:Gem::Requirement
|
128
156
|
requirements:
|
129
157
|
- - ! '>='
|
@@ -141,11 +169,12 @@ description: Find a needle in a haystack using string similarity and (optionally
|
|
141
169
|
email:
|
142
170
|
- seamus@abshere.net
|
143
171
|
executables:
|
144
|
-
- fuzzy_match_checker
|
172
|
+
- fuzzy_match
|
145
173
|
extensions: []
|
146
174
|
extra_rdoc_files: []
|
147
175
|
files:
|
148
176
|
- .gitignore
|
177
|
+
- .rspec
|
149
178
|
- CHANGELOG
|
150
179
|
- Gemfile
|
151
180
|
- LICENSE
|
@@ -156,45 +185,31 @@ files:
|
|
156
185
|
- benchmark/before-without-last-result.txt
|
157
186
|
- benchmark/before.txt
|
158
187
|
- benchmark/memory.rb
|
159
|
-
- bin/fuzzy_match_checker
|
160
|
-
- examples/bts_aircraft/5-2-A.htm
|
161
|
-
- examples/bts_aircraft/5-2-B.htm
|
162
|
-
- examples/bts_aircraft/5-2-D.htm
|
163
|
-
- examples/bts_aircraft/5-2-E.htm
|
164
|
-
- examples/bts_aircraft/5-2-G.htm
|
165
|
-
- examples/bts_aircraft/groupings.csv
|
166
|
-
- examples/bts_aircraft/identities.csv
|
167
|
-
- examples/bts_aircraft/negatives.csv
|
168
|
-
- examples/bts_aircraft/normalizers.csv
|
169
|
-
- examples/bts_aircraft/number_260.csv
|
170
|
-
- examples/bts_aircraft/positives.csv
|
171
|
-
- examples/bts_aircraft/test_bts_aircraft.rb
|
172
|
-
- examples/first_name_matching.rb
|
173
|
-
- examples/icao-bts.xls
|
188
|
+
- bin/fuzzy_match
|
174
189
|
- fuzzy_match.gemspec
|
190
|
+
- groupings-screenshot.png
|
191
|
+
- highlevel.graffle
|
192
|
+
- highlevel.png
|
175
193
|
- lib/fuzzy_match.rb
|
176
194
|
- lib/fuzzy_match/cached_result.rb
|
195
|
+
- lib/fuzzy_match/record.rb
|
177
196
|
- lib/fuzzy_match/result.rb
|
178
197
|
- lib/fuzzy_match/rule.rb
|
179
198
|
- lib/fuzzy_match/rule/grouping.rb
|
180
199
|
- lib/fuzzy_match/rule/identity.rb
|
181
|
-
- lib/fuzzy_match/rule/normalizer.rb
|
182
|
-
- lib/fuzzy_match/rule/stop_word.rb
|
183
200
|
- lib/fuzzy_match/score.rb
|
184
201
|
- lib/fuzzy_match/score/amatch.rb
|
185
202
|
- lib/fuzzy_match/score/pure_ruby.rb
|
186
203
|
- lib/fuzzy_match/similarity.rb
|
187
204
|
- lib/fuzzy_match/version.rb
|
188
|
-
-
|
189
|
-
-
|
190
|
-
-
|
191
|
-
-
|
192
|
-
-
|
193
|
-
-
|
194
|
-
-
|
195
|
-
-
|
196
|
-
- test/test_normalizer.rb
|
197
|
-
- test/test_wrapper.rb
|
205
|
+
- spec/amatch_spec.rb
|
206
|
+
- spec/cache_spec.rb
|
207
|
+
- spec/foo.rb
|
208
|
+
- spec/fuzzy_match_spec.rb
|
209
|
+
- spec/grouping_spec.rb
|
210
|
+
- spec/identity_spec.rb
|
211
|
+
- spec/record_spec.rb
|
212
|
+
- spec/spec_helper.rb
|
198
213
|
homepage: https://github.com/seamusabshere/fuzzy_match
|
199
214
|
licenses: []
|
200
215
|
metadata: {}
|
@@ -219,14 +234,5 @@ signing_key:
|
|
219
234
|
specification_version: 4
|
220
235
|
summary: Find a needle in a haystack using string similarity and (optionally) regexp
|
221
236
|
rules. Replaces loose_tight_dictionary.
|
222
|
-
test_files:
|
223
|
-
- test/helper.rb
|
224
|
-
- test/test_amatch.rb
|
225
|
-
- test/test_cache.rb
|
226
|
-
- test/test_fuzzy_match.rb
|
227
|
-
- test/test_fuzzy_match_convoluted.rb.disabled
|
228
|
-
- test/test_grouping.rb
|
229
|
-
- test/test_identity.rb
|
230
|
-
- test/test_normalizer.rb
|
231
|
-
- test/test_wrapper.rb
|
237
|
+
test_files: []
|
232
238
|
has_rdoc:
|
data/bin/fuzzy_match_checker
DELETED
@@ -1,71 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
|
3
|
-
if File.exist?(File.join(Dir.pwd, 'lib', 'fuzzy_match.rb'))
|
4
|
-
$LOAD_PATH.unshift File.join(Dir.pwd, 'lib')
|
5
|
-
require File.join(Dir.pwd, 'lib', 'fuzzy_match')
|
6
|
-
else
|
7
|
-
require 'fuzzy_match'
|
8
|
-
end
|
9
|
-
require 'fuzzy_match/version'
|
10
|
-
|
11
|
-
# note: not included in gemfile but neither is bundler used here
|
12
|
-
require 'active_support/core_ext'
|
13
|
-
require 'remote_table'
|
14
|
-
require 'thor'
|
15
|
-
|
16
|
-
class FuzzyMatch
|
17
|
-
class Checker < ::Thor
|
18
|
-
# for example: https://docs.google.com/spreadsheet/pub?key=0AkCJNpm9Ks6JdHZURUI2S2xOa3ZFVzlZb205VVhpQnc&single=true&gid=0&output=csv
|
19
|
-
desc :check, "Check a spreadsheet containing columns with these headers: haystack, needles, correct_matches, groupings, stop_words, identities, normalizers, find_options (listing an option like must_match_grouping makes it true)"
|
20
|
-
method_option :show_success, :default => false, :type => :boolean, :desc => "Whether to print successful matches as you go"
|
21
|
-
method_option :downcase, :default => false, :type => :boolean, :desc => "Whether to downcase everything (except regexes, where you have to do /foo/i)"
|
22
|
-
def check(url)
|
23
|
-
puts "Checking matches using fuzzy_match version #{FuzzyMatch::VERSION}..."
|
24
|
-
|
25
|
-
t = RemoteTable.new(url, :headers => :first_row)
|
26
|
-
if (violators = %w{needle grouping correct_match stop_word identity normalizer find_option} & t.rows.first.keys).any?
|
27
|
-
raise ArgumentError, "Make sure you pluralize your right row headers (violators: #{violators.map(&:inspect).join(', ')}"
|
28
|
-
end
|
29
|
-
haystack = t.rows.map { |row| row['haystack'] }.select(&:present?)
|
30
|
-
haystack.map!(&:downcase) if options.downcase
|
31
|
-
find_options = t.rows.map { |row| row['find_options'] }
|
32
|
-
fm = FuzzyMatch.new(
|
33
|
-
haystack,
|
34
|
-
:groupings => t.rows.map { |row| row['groupings'] }.select(&:present?),
|
35
|
-
:identities => t.rows.map { |row| row['identities'] }.select(&:present?),
|
36
|
-
:stop_words => t.rows.map { |row| row['stop_words'] }.select(&:present?),
|
37
|
-
:normalizers => t.rows.map { |row| row['normalizers'] }.select(&:present?),
|
38
|
-
:must_match_grouping => find_options.include?('must_match_grouping'),
|
39
|
-
:must_match_at_least_one_word => find_options.include?('must_match_at_least_one_word'),
|
40
|
-
:first_grouping_decides => find_options.include?('first_grouping_decides')
|
41
|
-
)
|
42
|
-
|
43
|
-
count = 0
|
44
|
-
t.each do |row|
|
45
|
-
needle = row['needles']
|
46
|
-
correct_match = row['correct_matches']
|
47
|
-
next unless needle.present?
|
48
|
-
if options.downcase
|
49
|
-
needle.to_s.downcase!
|
50
|
-
correct_match.to_s.downcase!
|
51
|
-
end
|
52
|
-
correct_match = nil if correct_match.blank?
|
53
|
-
match = fm.find needle
|
54
|
-
if options.show_success? or match != correct_match
|
55
|
-
puts " #{needle.inspect} => #{match.inspect}"
|
56
|
-
end
|
57
|
-
unless match == correct_match
|
58
|
-
puts "MISMATCH: #{needle.inspect} should match #{correct_match.inspect}"
|
59
|
-
puts fm.explain needle
|
60
|
-
exit 1
|
61
|
-
end
|
62
|
-
count += 1
|
63
|
-
end
|
64
|
-
|
65
|
-
puts "Correctly matched #{count} needles."
|
66
|
-
exit 0
|
67
|
-
end
|
68
|
-
end
|
69
|
-
end
|
70
|
-
|
71
|
-
FuzzyMatch::Checker.start
|