fuzzy_match 1.3.2 → 1.3.3
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +21 -0
- data/Gemfile +5 -1
- data/README.markdown +6 -4
- data/bin/fuzzy_match_checker +66 -0
- data/fuzzy_match.gemspec +2 -3
- data/lib/fuzzy_match/cached_result.rb +17 -18
- data/lib/fuzzy_match/version.rb +1 -1
- data/lib/fuzzy_match.rb +0 -4
- data/test/test_amatch.rb +10 -8
- data/test/test_cache.rb +7 -9
- data/test/test_fuzzy_match.rb +6 -1
- metadata +37 -9
- data/History.txt +0 -13
data/CHANGELOG
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
== 1.3.3 / 2012-04-13
|
2
|
+
|
3
|
+
* Enhancements
|
4
|
+
|
5
|
+
* Now you must require 'fuzzy_match/cached_result' if you want to use it.
|
6
|
+
* Use active_record_inline_schema to create the FuzzyMatch::CachedResult table
|
7
|
+
* Test against CohortAnalysis, the replacement for CohortScope
|
8
|
+
* Fix some other random deprecations (like set_primary_key)
|
9
|
+
|
10
|
+
== 1.3.2 / 2012-02-24
|
11
|
+
|
12
|
+
* Enhancements
|
13
|
+
|
14
|
+
* Start keeping a changelog!
|
15
|
+
* renamed blockings to groupings
|
16
|
+
* cleaned up tests
|
17
|
+
|
18
|
+
* Bug fixes
|
19
|
+
|
20
|
+
* better handling for one-letter similarities like 'X foo' vs 'X bar' which couldn't be detected by pair distance
|
21
|
+
* take deprecated option :tighteners as :normalizers
|
data/Gemfile
CHANGED
@@ -2,12 +2,16 @@ source :rubygems
|
|
2
2
|
|
3
3
|
gemspec
|
4
4
|
|
5
|
+
# bin dependencies
|
6
|
+
gem 'remote_table'
|
7
|
+
gem 'thor'
|
8
|
+
|
5
9
|
# development dependencies
|
6
10
|
gem 'minitest-reporters'
|
7
11
|
gem "minitest"
|
8
12
|
gem 'activerecord', '>=3'
|
9
13
|
gem 'mysql2'
|
10
|
-
gem 'cohort_scope'
|
14
|
+
gem 'cohort_analysis'
|
11
15
|
gem 'weighted_average'
|
12
16
|
gem 'rake'
|
13
17
|
gem 'yard'
|
data/README.markdown
CHANGED
@@ -8,9 +8,7 @@ Replaces [`loose_tight_dictionary`](https://github.com/seamusabshere/loose_tight
|
|
8
8
|
|
9
9
|
>> require 'fuzzy_match'
|
10
10
|
=> true
|
11
|
-
>>
|
12
|
-
=> #<FuzzyMatch: [...]>
|
13
|
-
>> matcher.find('Shamus')
|
11
|
+
>> FuzzyMatch.new(['seamus', 'andy', 'ben']).find('Shamus')
|
14
12
|
=> "seamus"
|
15
13
|
|
16
14
|
See also the blog post [Fuzzy match in Ruby](http://numbers.brighterplanet.com/2012/01/18/fuzzy-match-in-ruby/).
|
@@ -118,7 +116,7 @@ In edge cases where Dice's finds that two strings are equally similar to a third
|
|
118
116
|
|
119
117
|
## Production use
|
120
118
|
|
121
|
-
Over 2 years in [Brighter Planet's
|
119
|
+
Over 2 years in [Brighter Planet's impact estimate API](http://impact.brighterplanet.com) and [reference data service](http://data.brighterplanet.com).
|
122
120
|
|
123
121
|
We often combine `fuzzy_match` with [`remote_table`](https://github.com/seamusabshere/remote_table) and [`errata`](https://github.com/seamusabshere/errata):
|
124
122
|
|
@@ -126,6 +124,10 @@ We often combine `fuzzy_match` with [`remote_table`](https://github.com/seamusab
|
|
126
124
|
- correct serious or repeated errors with `errata`
|
127
125
|
- `fuzzy_match` the rest
|
128
126
|
|
127
|
+
## Cached results
|
128
|
+
|
129
|
+
TODO write documentation. For now, please see how [we manually cache matches between aircraft and flight segments](https://github.com/brighterplanet/earth/blob/master/lib/earth/air/aircraft.rb).
|
130
|
+
|
129
131
|
## Glossary
|
130
132
|
|
131
133
|
The admittedly imperfect metaphor is "look for a needle in a haystack"
|
@@ -0,0 +1,66 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
if File.exist?(File.join(Dir.pwd, 'fuzzy_match.gemspec'))
|
4
|
+
require 'bundler/setup'
|
5
|
+
end
|
6
|
+
|
7
|
+
require 'fuzzy_match'
|
8
|
+
require 'active_support/core_ext'
|
9
|
+
require 'remote_table'
|
10
|
+
require 'thor'
|
11
|
+
|
12
|
+
class FuzzyMatch
|
13
|
+
class Checker < ::Thor
|
14
|
+
# for example: https://docs.google.com/spreadsheet/pub?key=0AkCJNpm9Ks6JdHZURUI2S2xOa3ZFVzlZb205VVhpQnc&single=true&gid=0&output=csv
|
15
|
+
desc :check, "Check a spreadsheet containing columns with these headers: haystack, needles, correct_matches, groupings, stop_words, identities, normalizers, find_options (listing an option like must_match_grouping makes it true)"
|
16
|
+
method_option :show_success, :default => false, :type => :boolean, :desc => "Whether to print successful matches as you go"
|
17
|
+
method_option :downcase, :default => false, :type => :boolean, :desc => "Whether to downcase everything (except regexes, where you have to do /foo/i)"
|
18
|
+
def check(url)
|
19
|
+
puts "Checking matches using fuzzy_match version #{FuzzyMatch::VERSION}..."
|
20
|
+
|
21
|
+
t = RemoteTable.new(url, :headers => :first_row)
|
22
|
+
if (violators = %w{needle grouping correct_match stop_word identity normalizer find_option} & t.rows.first.keys).any?
|
23
|
+
raise ArgumentError, "Make sure you pluralize your right row headers (violators: #{violators.map(&:inspect).join(', ')}"
|
24
|
+
end
|
25
|
+
haystack = t.rows.map { |row| row['haystack'] }.select(&:present?)
|
26
|
+
haystack.map!(&:downcase) if options.downcase
|
27
|
+
find_options = t.rows.map { |row| row['find_options'] }
|
28
|
+
fm = FuzzyMatch.new(
|
29
|
+
haystack,
|
30
|
+
:groupings => t.rows.map { |row| row['groupings'] }.select(&:present?),
|
31
|
+
:identities => t.rows.map { |row| row['identities'] }.select(&:present?),
|
32
|
+
:stop_words => t.rows.map { |row| row['stop_words'] }.select(&:present?),
|
33
|
+
:normalizers => t.rows.map { |row| row['normalizers'] }.select(&:present?),
|
34
|
+
:must_match_grouping => find_options.include?('must_match_grouping'),
|
35
|
+
:must_match_at_least_one_word => find_options.include?('must_match_at_least_one_word'),
|
36
|
+
:first_grouping_decides => find_options.include?('first_grouping_decides')
|
37
|
+
)
|
38
|
+
|
39
|
+
count = 0
|
40
|
+
t.each do |row|
|
41
|
+
needle = row['needles']
|
42
|
+
correct_match = row['correct_matches']
|
43
|
+
next unless needle.present?
|
44
|
+
if options.downcase
|
45
|
+
needle.to_s.downcase!
|
46
|
+
correct_match.to_s.downcase!
|
47
|
+
end
|
48
|
+
correct_match = nil if correct_match.blank?
|
49
|
+
match = fm.find needle
|
50
|
+
if options.show_success? or match != correct_match
|
51
|
+
puts " #{needle.inspect} => #{match.inspect}"
|
52
|
+
end
|
53
|
+
unless match == correct_match
|
54
|
+
puts "MISMATCH: #{needle.inspect} should match #{correct_match.inspect}"
|
55
|
+
exit 1
|
56
|
+
end
|
57
|
+
count += 1
|
58
|
+
end
|
59
|
+
|
60
|
+
puts "Correctly matched #{count} needles."
|
61
|
+
exit 0
|
62
|
+
end
|
63
|
+
end
|
64
|
+
end
|
65
|
+
|
66
|
+
FuzzyMatch::Checker.start
|
data/fuzzy_match.gemspec
CHANGED
@@ -1,11 +1,9 @@
|
|
1
1
|
# -*- encoding: utf-8 -*-
|
2
|
-
|
3
|
-
require "fuzzy_match/version"
|
2
|
+
require File.expand_path("../lib/fuzzy_match/version", __FILE__)
|
4
3
|
|
5
4
|
Gem::Specification.new do |s|
|
6
5
|
s.name = "fuzzy_match"
|
7
6
|
s.version = FuzzyMatch::VERSION
|
8
|
-
s.platform = Gem::Platform::RUBY
|
9
7
|
s.authors = ["Seamus Abshere"]
|
10
8
|
s.email = ["seamus@abshere.net"]
|
11
9
|
s.homepage = "https://github.com/seamusabshere/fuzzy_match"
|
@@ -21,4 +19,5 @@ Gem::Specification.new do |s|
|
|
21
19
|
|
22
20
|
s.add_runtime_dependency 'activesupport', '>=3'
|
23
21
|
s.add_runtime_dependency 'to_regexp', '>=0.0.3'
|
22
|
+
s.add_runtime_dependency 'active_record_inline_schema', '>=0.4.0'
|
24
23
|
end
|
@@ -1,3 +1,5 @@
|
|
1
|
+
require 'active_record_inline_schema'
|
2
|
+
|
1
3
|
class FuzzyMatch
|
2
4
|
class CachedResult < ::ActiveRecord::Base
|
3
5
|
if ::ActiveRecord::VERSION::STRING >= '3.2'
|
@@ -5,26 +7,23 @@ class FuzzyMatch
|
|
5
7
|
else
|
6
8
|
set_table_name :fuzzy_match_cached_results
|
7
9
|
end
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
end
|
16
|
-
connection.add_index :fuzzy_match_cached_results, [:a_class, :b_class, :a], :name => 'aba'
|
17
|
-
connection.add_index :fuzzy_match_cached_results, [:a_class, :b_class, :b], :name => 'abb'
|
18
|
-
connection.add_index :fuzzy_match_cached_results, [:a_class, :b_class, :a, :b], :name => 'abab'
|
19
|
-
reset_column_information
|
20
|
-
end
|
21
|
-
|
22
|
-
def self.setup(from_scratch = false)
|
23
|
-
if from_scratch or not table_exists?
|
24
|
-
connection.drop_table :fuzzy_match_cached_results rescue nil
|
25
|
-
create_table rescue nil
|
10
|
+
|
11
|
+
class << self
|
12
|
+
def setup(from_scratch = false)
|
13
|
+
if from_scratch
|
14
|
+
connection.drop_table :fuzzy_match_cached_results rescue nil
|
15
|
+
end
|
16
|
+
auto_upgrade!
|
26
17
|
end
|
27
18
|
end
|
19
|
+
|
20
|
+
col :a_class
|
21
|
+
col :a
|
22
|
+
col :b_class
|
23
|
+
col :b
|
24
|
+
add_index [:a_class, :b_class, :a], :name => 'aba'
|
25
|
+
add_index [:a_class, :b_class, :b], :name => 'abb'
|
26
|
+
add_index [:a_class, :b_class, :a, :b], :name => 'abab'
|
28
27
|
|
29
28
|
module ActiveRecordBaseExtension
|
30
29
|
# required options:
|
data/lib/fuzzy_match/version.rb
CHANGED
data/lib/fuzzy_match.rb
CHANGED
@@ -15,10 +15,6 @@ require 'fuzzy_match/wrapper'
|
|
15
15
|
require 'fuzzy_match/similarity'
|
16
16
|
require 'fuzzy_match/score'
|
17
17
|
|
18
|
-
if defined?(::ActiveRecord)
|
19
|
-
require 'fuzzy_match/cached_result'
|
20
|
-
end
|
21
|
-
|
22
18
|
# See the README for more information.
|
23
19
|
class FuzzyMatch
|
24
20
|
class << self
|
data/test/test_amatch.rb
CHANGED
@@ -3,14 +3,16 @@ unless RUBY_PLATFORM == 'java'
|
|
3
3
|
require 'test_fuzzy_match'
|
4
4
|
require 'amatch'
|
5
5
|
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
6
|
+
describe FuzzyMatch do
|
7
|
+
describe %{when using the :amatch string similarity engine} do
|
8
|
+
before do
|
9
|
+
$testing_amatch = true
|
10
|
+
FuzzyMatch.engine = :amatch
|
11
|
+
end
|
12
|
+
after do
|
13
|
+
$testing_amatch = false
|
14
|
+
FuzzyMatch.engine = nil
|
15
|
+
end
|
14
16
|
end
|
15
17
|
end
|
16
18
|
end
|
data/test/test_cache.rb
CHANGED
@@ -2,7 +2,7 @@ require 'helper'
|
|
2
2
|
|
3
3
|
require 'active_support/all'
|
4
4
|
require 'active_record'
|
5
|
-
require 'cohort_scope'
|
5
|
+
require 'cohort_analysis'
|
6
6
|
require 'weighted_average'
|
7
7
|
|
8
8
|
ActiveRecord::Base.establish_connection(
|
@@ -25,7 +25,7 @@ require 'fuzzy_match/cached_result'
|
|
25
25
|
::FuzzyMatch::CachedResult.setup(true)
|
26
26
|
|
27
27
|
class Aircraft < ActiveRecord::Base
|
28
|
-
|
28
|
+
self.primary_key = 'icao_code'
|
29
29
|
|
30
30
|
cache_fuzzy_match_with :flight_segments, :primary_key => :aircraft_description, :foreign_key => :aircraft_description
|
31
31
|
|
@@ -52,13 +52,10 @@ CREATE TABLE `aircraft` (
|
|
52
52
|
end
|
53
53
|
|
54
54
|
class FlightSegment < ActiveRecord::Base
|
55
|
-
|
55
|
+
self.primary_key = 'row_hash'
|
56
56
|
|
57
57
|
cache_fuzzy_match_with :aircraft, :primary_key => :aircraft_description, :foreign_key => :aircraft_description
|
58
58
|
|
59
|
-
extend CohortScope
|
60
|
-
self.minimum_cohort_size = 1
|
61
|
-
|
62
59
|
def self.create_table
|
63
60
|
connection.drop_table(:flight_segments) rescue nil
|
64
61
|
connection.execute %{
|
@@ -96,11 +93,11 @@ fs.passengers = 100
|
|
96
93
|
fs.seats = 5
|
97
94
|
fs.save!
|
98
95
|
|
99
|
-
FlightSegment.
|
96
|
+
FlightSegment.all.each do |fs|
|
100
97
|
fs.cache_aircraft!
|
101
98
|
end
|
102
99
|
|
103
|
-
|
100
|
+
describe FuzzyMatch::CachedResult do
|
104
101
|
it %{joins aircraft to flight segments} do
|
105
102
|
aircraft = Aircraft.find('B742')
|
106
103
|
aircraft.flight_segments.count.must_equal 2
|
@@ -118,7 +115,8 @@ class TestCache < MiniTest::Spec
|
|
118
115
|
|
119
116
|
it %{works with cohort_scope (albeit rather clumsily)} do
|
120
117
|
aircraft = Aircraft.find('B742')
|
121
|
-
FlightSegment.
|
118
|
+
FlightSegment.cohort({:aircraft_description => aircraft.flight_segments_foreign_keys}, :minimum_size => 2).count.must_equal 2
|
119
|
+
# FlightSegment.cohort(:aircraft_description => aircraft.flight_segments_foreign_keys).must_equal []
|
122
120
|
end
|
123
121
|
|
124
122
|
# def test_006_you_can_get_aircraft_from_flight_segments
|
data/test/test_fuzzy_match.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
# -*- encoding: utf-8 -*-
|
2
2
|
require 'helper'
|
3
3
|
|
4
|
-
|
4
|
+
describe FuzzyMatch do
|
5
5
|
describe '#find' do
|
6
6
|
it %{identifies the best match based on string similarity} do
|
7
7
|
d = FuzzyMatch.new %w{ RATZ CATZ }
|
@@ -12,6 +12,11 @@ class TestFuzzyMatch < MiniTest::Spec
|
|
12
12
|
d.find('X').must_equal 'X'
|
13
13
|
d.find('A').must_be_nil
|
14
14
|
end
|
15
|
+
|
16
|
+
it %{does the right thing} do
|
17
|
+
d = FuzzyMatch.new [ 'Artyom Makarov', 'Karl' ], :must_match_at_least_one_word => true
|
18
|
+
puts d.explain('art')#.must_equal 'Artyom Makarov'
|
19
|
+
end
|
15
20
|
|
16
21
|
it %{not return any result if the maximum score is zero} do
|
17
22
|
FuzzyMatch.new(['a']).find('b').must_be_nil
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: fuzzy_match
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.3.2
|
4
|
+
version: 1.3.3
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-
|
12
|
+
date: 2012-04-13 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: activesupport
|
16
|
-
requirement:
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,10 +21,15 @@ dependencies:
|
|
21
21
|
version: '3'
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements:
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ! '>='
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '3'
|
25
30
|
- !ruby/object:Gem::Dependency
|
26
31
|
name: to_regexp
|
27
|
-
requirement:
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
28
33
|
none: false
|
29
34
|
requirements:
|
30
35
|
- - ! '>='
|
@@ -32,19 +37,41 @@ dependencies:
|
|
32
37
|
version: 0.0.3
|
33
38
|
type: :runtime
|
34
39
|
prerelease: false
|
35
|
-
version_requirements:
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ! '>='
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: 0.0.3
|
46
|
+
- !ruby/object:Gem::Dependency
|
47
|
+
name: active_record_inline_schema
|
48
|
+
requirement: !ruby/object:Gem::Requirement
|
49
|
+
none: false
|
50
|
+
requirements:
|
51
|
+
- - ! '>='
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: 0.4.0
|
54
|
+
type: :runtime
|
55
|
+
prerelease: false
|
56
|
+
version_requirements: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ! '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: 0.4.0
|
36
62
|
description: Find a needle in a haystack using string similarity and (optionally)
|
37
63
|
regexp rules. Replaces loose_tight_dictionary.
|
38
64
|
email:
|
39
65
|
- seamus@abshere.net
|
40
|
-
executables:
|
66
|
+
executables:
|
67
|
+
- fuzzy_match_checker
|
41
68
|
extensions: []
|
42
69
|
extra_rdoc_files: []
|
43
70
|
files:
|
44
71
|
- .document
|
45
72
|
- .gitignore
|
73
|
+
- CHANGELOG
|
46
74
|
- Gemfile
|
47
|
-
- History.txt
|
48
75
|
- LICENSE
|
49
76
|
- README.markdown
|
50
77
|
- Rakefile
|
@@ -53,6 +80,7 @@ files:
|
|
53
80
|
- benchmark/before-without-last-result.txt
|
54
81
|
- benchmark/before.txt
|
55
82
|
- benchmark/memory.rb
|
83
|
+
- bin/fuzzy_match_checker
|
56
84
|
- examples/bts_aircraft/5-2-A.htm
|
57
85
|
- examples/bts_aircraft/5-2-B.htm
|
58
86
|
- examples/bts_aircraft/5-2-D.htm
|
@@ -111,7 +139,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
111
139
|
version: '0'
|
112
140
|
requirements: []
|
113
141
|
rubyforge_project: fuzzy_match
|
114
|
-
rubygems_version: 1.8.
|
142
|
+
rubygems_version: 1.8.21
|
115
143
|
signing_key:
|
116
144
|
specification_version: 3
|
117
145
|
summary: Find a needle in a haystack using string similarity and (optionally) regexp
|
data/History.txt
DELETED
@@ -1,13 +0,0 @@
|
|
1
|
-
== 1.3.2 / 2012-02-24
|
2
|
-
|
3
|
-
* Start keeping a changelog!
|
4
|
-
|
5
|
-
* Enhancements
|
6
|
-
|
7
|
-
* renamed blockings to groupings
|
8
|
-
* cleaned up tests
|
9
|
-
|
10
|
-
* Bug fixes
|
11
|
-
|
12
|
-
* better handling for one-letter similarities like 'X foo' vs 'X bar' which couldn't be detected by pair distance
|
13
|
-
* take deprecated option :tighteners as :normalizers
|