fuzzy_match 1.5.0 → 2.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +8 -8
- data/.rspec +2 -0
- data/CHANGELOG +14 -0
- data/Gemfile +8 -0
- data/README.markdown +58 -38
- data/Rakefile +0 -9
- data/bin/fuzzy_match +106 -0
- data/fuzzy_match.gemspec +4 -4
- data/groupings-screenshot.png +0 -0
- data/highlevel.graffle +0 -0
- data/highlevel.png +0 -0
- data/lib/fuzzy_match/record.rb +58 -0
- data/lib/fuzzy_match/result.rb +11 -8
- data/lib/fuzzy_match/rule/grouping.rb +70 -12
- data/lib/fuzzy_match/rule/identity.rb +3 -3
- data/lib/fuzzy_match/rule.rb +1 -1
- data/lib/fuzzy_match/score/amatch.rb +0 -4
- data/lib/fuzzy_match/score/pure_ruby.rb +2 -8
- data/lib/fuzzy_match/score.rb +4 -0
- data/lib/fuzzy_match/similarity.rb +10 -32
- data/lib/fuzzy_match/version.rb +1 -1
- data/lib/fuzzy_match.rb +78 -94
- data/{test/test_amatch.rb → spec/amatch_spec.rb} +1 -2
- data/{test/test_cache.rb → spec/cache_spec.rb} +7 -7
- data/spec/foo.rb +9 -0
- data/spec/fuzzy_match_spec.rb +354 -0
- data/spec/grouping_spec.rb +60 -0
- data/spec/identity_spec.rb +29 -0
- data/{test/test_wrapper.rb → spec/record_spec.rb} +3 -7
- data/spec/spec_helper.rb +21 -0
- metadata +56 -50
- data/bin/fuzzy_match_checker +0 -71
- data/examples/bts_aircraft/5-2-A.htm +0 -10305
- data/examples/bts_aircraft/5-2-B.htm +0 -9576
- data/examples/bts_aircraft/5-2-D.htm +0 -7094
- data/examples/bts_aircraft/5-2-E.htm +0 -2349
- data/examples/bts_aircraft/5-2-G.htm +0 -2922
- data/examples/bts_aircraft/groupings.csv +0 -1
- data/examples/bts_aircraft/identities.csv +0 -1
- data/examples/bts_aircraft/negatives.csv +0 -1
- data/examples/bts_aircraft/normalizers.csv +0 -1
- data/examples/bts_aircraft/number_260.csv +0 -334
- data/examples/bts_aircraft/positives.csv +0 -1
- data/examples/bts_aircraft/test_bts_aircraft.rb +0 -116
- data/examples/first_name_matching.rb +0 -15
- data/examples/icao-bts.xls +0 -0
- data/lib/fuzzy_match/rule/normalizer.rb +0 -20
- data/lib/fuzzy_match/rule/stop_word.rb +0 -11
- data/lib/fuzzy_match/wrapper.rb +0 -73
- data/test/helper.rb +0 -12
- data/test/test_fuzzy_match.rb +0 -304
- data/test/test_fuzzy_match_convoluted.rb.disabled +0 -268
- data/test/test_grouping.rb +0 -28
- data/test/test_identity.rb +0 -34
- data/test/test_normalizer.rb +0 -10
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: fuzzy_match
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.5.0
|
4
|
+
version: 2.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Seamus Abshere
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-
|
11
|
+
date: 2013-05-22 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: active_record_inline_schema
|
@@ -25,7 +25,7 @@ dependencies:
|
|
25
25
|
- !ruby/object:Gem::Version
|
26
26
|
version: 0.4.0
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
|
-
name:
|
28
|
+
name: pry
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
31
|
- - ! '>='
|
@@ -39,21 +39,21 @@ dependencies:
|
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: '0'
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
|
-
name:
|
42
|
+
name: rspec-core
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
44
44
|
requirements:
|
45
45
|
- - ! '>='
|
46
46
|
- !ruby/object:Gem::Version
|
47
|
-
version: '
|
47
|
+
version: '0'
|
48
48
|
type: :development
|
49
49
|
prerelease: false
|
50
50
|
version_requirements: !ruby/object:Gem::Requirement
|
51
51
|
requirements:
|
52
52
|
- - ! '>='
|
53
53
|
- !ruby/object:Gem::Version
|
54
|
-
version: '
|
54
|
+
version: '0'
|
55
55
|
- !ruby/object:Gem::Dependency
|
56
|
-
name:
|
56
|
+
name: rspec-expectations
|
57
57
|
requirement: !ruby/object:Gem::Requirement
|
58
58
|
requirements:
|
59
59
|
- - ! '>='
|
@@ -67,7 +67,7 @@ dependencies:
|
|
67
67
|
- !ruby/object:Gem::Version
|
68
68
|
version: '0'
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
|
-
name:
|
70
|
+
name: rspec-mocks
|
71
71
|
requirement: !ruby/object:Gem::Requirement
|
72
72
|
requirements:
|
73
73
|
- - ! '>='
|
@@ -81,7 +81,21 @@ dependencies:
|
|
81
81
|
- !ruby/object:Gem::Version
|
82
82
|
version: '0'
|
83
83
|
- !ruby/object:Gem::Dependency
|
84
|
-
name:
|
84
|
+
name: activerecord
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - ! '>='
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '3'
|
90
|
+
type: :development
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - ! '>='
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '3'
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: mysql2
|
85
99
|
requirement: !ruby/object:Gem::Requirement
|
86
100
|
requirements:
|
87
101
|
- - ! '>='
|
@@ -95,7 +109,7 @@ dependencies:
|
|
95
109
|
- !ruby/object:Gem::Version
|
96
110
|
version: '0'
|
97
111
|
- !ruby/object:Gem::Dependency
|
98
|
-
name:
|
112
|
+
name: cohort_analysis
|
99
113
|
requirement: !ruby/object:Gem::Requirement
|
100
114
|
requirements:
|
101
115
|
- - ! '>='
|
@@ -109,7 +123,7 @@ dependencies:
|
|
109
123
|
- !ruby/object:Gem::Version
|
110
124
|
version: '0'
|
111
125
|
- !ruby/object:Gem::Dependency
|
112
|
-
name:
|
126
|
+
name: weighted_average
|
113
127
|
requirement: !ruby/object:Gem::Requirement
|
114
128
|
requirements:
|
115
129
|
- - ! '>='
|
@@ -123,7 +137,21 @@ dependencies:
|
|
123
137
|
- !ruby/object:Gem::Version
|
124
138
|
version: '0'
|
125
139
|
- !ruby/object:Gem::Dependency
|
126
|
-
name:
|
140
|
+
name: yard
|
141
|
+
requirement: !ruby/object:Gem::Requirement
|
142
|
+
requirements:
|
143
|
+
- - ! '>='
|
144
|
+
- !ruby/object:Gem::Version
|
145
|
+
version: '0'
|
146
|
+
type: :development
|
147
|
+
prerelease: false
|
148
|
+
version_requirements: !ruby/object:Gem::Requirement
|
149
|
+
requirements:
|
150
|
+
- - ! '>='
|
151
|
+
- !ruby/object:Gem::Version
|
152
|
+
version: '0'
|
153
|
+
- !ruby/object:Gem::Dependency
|
154
|
+
name: amatch
|
127
155
|
requirement: !ruby/object:Gem::Requirement
|
128
156
|
requirements:
|
129
157
|
- - ! '>='
|
@@ -141,11 +169,12 @@ description: Find a needle in a haystack using string similarity and (optionally
|
|
141
169
|
email:
|
142
170
|
- seamus@abshere.net
|
143
171
|
executables:
|
144
|
-
- fuzzy_match_checker
|
172
|
+
- fuzzy_match
|
145
173
|
extensions: []
|
146
174
|
extra_rdoc_files: []
|
147
175
|
files:
|
148
176
|
- .gitignore
|
177
|
+
- .rspec
|
149
178
|
- CHANGELOG
|
150
179
|
- Gemfile
|
151
180
|
- LICENSE
|
@@ -156,45 +185,31 @@ files:
|
|
156
185
|
- benchmark/before-without-last-result.txt
|
157
186
|
- benchmark/before.txt
|
158
187
|
- benchmark/memory.rb
|
159
|
-
- bin/fuzzy_match_checker
|
160
|
-
- examples/bts_aircraft/5-2-A.htm
|
161
|
-
- examples/bts_aircraft/5-2-B.htm
|
162
|
-
- examples/bts_aircraft/5-2-D.htm
|
163
|
-
- examples/bts_aircraft/5-2-E.htm
|
164
|
-
- examples/bts_aircraft/5-2-G.htm
|
165
|
-
- examples/bts_aircraft/groupings.csv
|
166
|
-
- examples/bts_aircraft/identities.csv
|
167
|
-
- examples/bts_aircraft/negatives.csv
|
168
|
-
- examples/bts_aircraft/normalizers.csv
|
169
|
-
- examples/bts_aircraft/number_260.csv
|
170
|
-
- examples/bts_aircraft/positives.csv
|
171
|
-
- examples/bts_aircraft/test_bts_aircraft.rb
|
172
|
-
- examples/first_name_matching.rb
|
173
|
-
- examples/icao-bts.xls
|
188
|
+
- bin/fuzzy_match
|
174
189
|
- fuzzy_match.gemspec
|
190
|
+
- groupings-screenshot.png
|
191
|
+
- highlevel.graffle
|
192
|
+
- highlevel.png
|
175
193
|
- lib/fuzzy_match.rb
|
176
194
|
- lib/fuzzy_match/cached_result.rb
|
195
|
+
- lib/fuzzy_match/record.rb
|
177
196
|
- lib/fuzzy_match/result.rb
|
178
197
|
- lib/fuzzy_match/rule.rb
|
179
198
|
- lib/fuzzy_match/rule/grouping.rb
|
180
199
|
- lib/fuzzy_match/rule/identity.rb
|
181
|
-
- lib/fuzzy_match/rule/normalizer.rb
|
182
|
-
- lib/fuzzy_match/rule/stop_word.rb
|
183
200
|
- lib/fuzzy_match/score.rb
|
184
201
|
- lib/fuzzy_match/score/amatch.rb
|
185
202
|
- lib/fuzzy_match/score/pure_ruby.rb
|
186
203
|
- lib/fuzzy_match/similarity.rb
|
187
204
|
- lib/fuzzy_match/version.rb
|
188
|
-
- lib/fuzzy_match/wrapper.rb
|
189
|
-
- test/helper.rb
|
190
|
-
- test/test_amatch.rb
|
191
|
-
- test/test_cache.rb
|
192
|
-
- test/test_fuzzy_match.rb
|
193
|
-
- test/test_fuzzy_match_convoluted.rb.disabled
|
194
|
-
- test/test_grouping.rb
|
195
|
-
- test/test_identity.rb
|
196
|
-
- test/test_normalizer.rb
|
197
|
-
- test/test_wrapper.rb
|
205
|
+
- spec/amatch_spec.rb
|
206
|
+
- spec/cache_spec.rb
|
207
|
+
- spec/foo.rb
|
208
|
+
- spec/fuzzy_match_spec.rb
|
209
|
+
- spec/grouping_spec.rb
|
210
|
+
- spec/identity_spec.rb
|
211
|
+
- spec/record_spec.rb
|
212
|
+
- spec/spec_helper.rb
|
198
213
|
homepage: https://github.com/seamusabshere/fuzzy_match
|
199
214
|
licenses: []
|
200
215
|
metadata: {}
|
@@ -219,14 +234,5 @@ signing_key:
|
|
219
234
|
specification_version: 4
|
220
235
|
summary: Find a needle in a haystack using string similarity and (optionally) regexp
|
221
236
|
rules. Replaces loose_tight_dictionary.
|
222
|
-
test_files:
|
223
|
-
- test/helper.rb
|
224
|
-
- test/test_amatch.rb
|
225
|
-
- test/test_cache.rb
|
226
|
-
- test/test_fuzzy_match.rb
|
227
|
-
- test/test_fuzzy_match_convoluted.rb.disabled
|
228
|
-
- test/test_grouping.rb
|
229
|
-
- test/test_identity.rb
|
230
|
-
- test/test_normalizer.rb
|
231
|
-
- test/test_wrapper.rb
|
237
|
+
test_files: []
|
232
238
|
has_rdoc:
|
data/bin/fuzzy_match_checker
DELETED
@@ -1,71 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
|
3
|
-
if File.exist?(File.join(Dir.pwd, 'lib', 'fuzzy_match.rb'))
|
4
|
-
$LOAD_PATH.unshift File.join(Dir.pwd, 'lib')
|
5
|
-
require File.join(Dir.pwd, 'lib', 'fuzzy_match')
|
6
|
-
else
|
7
|
-
require 'fuzzy_match'
|
8
|
-
end
|
9
|
-
require 'fuzzy_match/version'
|
10
|
-
|
11
|
-
# note: not included in gemfile but neither is bundler used here
|
12
|
-
require 'active_support/core_ext'
|
13
|
-
require 'remote_table'
|
14
|
-
require 'thor'
|
15
|
-
|
16
|
-
class FuzzyMatch
|
17
|
-
class Checker < ::Thor
|
18
|
-
# for example: https://docs.google.com/spreadsheet/pub?key=0AkCJNpm9Ks6JdHZURUI2S2xOa3ZFVzlZb205VVhpQnc&single=true&gid=0&output=csv
|
19
|
-
desc :check, "Check a spreadsheet containing columns with these headers: haystack, needles, correct_matches, groupings, stop_words, identities, normalizers, find_options (listing an option like must_match_grouping makes it true)"
|
20
|
-
method_option :show_success, :default => false, :type => :boolean, :desc => "Whether to print successful matches as you go"
|
21
|
-
method_option :downcase, :default => false, :type => :boolean, :desc => "Whether to downcase everything (except regexes, where you have to do /foo/i)"
|
22
|
-
def check(url)
|
23
|
-
puts "Checking matches using fuzzy_match version #{FuzzyMatch::VERSION}..."
|
24
|
-
|
25
|
-
t = RemoteTable.new(url, :headers => :first_row)
|
26
|
-
if (violators = %w{needle grouping correct_match stop_word identity normalizer find_option} & t.rows.first.keys).any?
|
27
|
-
raise ArgumentError, "Make sure you pluralize your right row headers (violators: #{violators.map(&:inspect).join(', ')}"
|
28
|
-
end
|
29
|
-
haystack = t.rows.map { |row| row['haystack'] }.select(&:present?)
|
30
|
-
haystack.map!(&:downcase) if options.downcase
|
31
|
-
find_options = t.rows.map { |row| row['find_options'] }
|
32
|
-
fm = FuzzyMatch.new(
|
33
|
-
haystack,
|
34
|
-
:groupings => t.rows.map { |row| row['groupings'] }.select(&:present?),
|
35
|
-
:identities => t.rows.map { |row| row['identities'] }.select(&:present?),
|
36
|
-
:stop_words => t.rows.map { |row| row['stop_words'] }.select(&:present?),
|
37
|
-
:normalizers => t.rows.map { |row| row['normalizers'] }.select(&:present?),
|
38
|
-
:must_match_grouping => find_options.include?('must_match_grouping'),
|
39
|
-
:must_match_at_least_one_word => find_options.include?('must_match_at_least_one_word'),
|
40
|
-
:first_grouping_decides => find_options.include?('first_grouping_decides')
|
41
|
-
)
|
42
|
-
|
43
|
-
count = 0
|
44
|
-
t.each do |row|
|
45
|
-
needle = row['needles']
|
46
|
-
correct_match = row['correct_matches']
|
47
|
-
next unless needle.present?
|
48
|
-
if options.downcase
|
49
|
-
needle.to_s.downcase!
|
50
|
-
correct_match.to_s.downcase!
|
51
|
-
end
|
52
|
-
correct_match = nil if correct_match.blank?
|
53
|
-
match = fm.find needle
|
54
|
-
if options.show_success? or match != correct_match
|
55
|
-
puts " #{needle.inspect} => #{match.inspect}"
|
56
|
-
end
|
57
|
-
unless match == correct_match
|
58
|
-
puts "MISMATCH: #{needle.inspect} should match #{correct_match.inspect}"
|
59
|
-
puts fm.explain needle
|
60
|
-
exit 1
|
61
|
-
end
|
62
|
-
count += 1
|
63
|
-
end
|
64
|
-
|
65
|
-
puts "Correctly matched #{count} needles."
|
66
|
-
exit 0
|
67
|
-
end
|
68
|
-
end
|
69
|
-
end
|
70
|
-
|
71
|
-
FuzzyMatch::Checker.start
|