fuzzy_match 1.3.2 → 1.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +21 -0
- data/Gemfile +5 -1
- data/README.markdown +6 -4
- data/bin/fuzzy_match_checker +66 -0
- data/fuzzy_match.gemspec +2 -3
- data/lib/fuzzy_match/cached_result.rb +17 -18
- data/lib/fuzzy_match/version.rb +1 -1
- data/lib/fuzzy_match.rb +0 -4
- data/test/test_amatch.rb +10 -8
- data/test/test_cache.rb +7 -9
- data/test/test_fuzzy_match.rb +6 -1
- metadata +37 -9
- data/History.txt +0 -13
data/CHANGELOG
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
== 1.3.3 / 2012-04-13
|
2
|
+
|
3
|
+
* Enhancements
|
4
|
+
|
5
|
+
* Now you must require 'fuzzy_match/cached_result' if you want to use it.
|
6
|
+
* Use active_record_inline_schema to create the FuzzyMatch::CachedResult table
|
7
|
+
* Test against CohortAnalysis, the replacement for CohortScope
|
8
|
+
* Fix some other random deprecations (like set_primary_key)
|
9
|
+
|
10
|
+
== 1.3.2 / 2012-02-24
|
11
|
+
|
12
|
+
* Enhancements
|
13
|
+
|
14
|
+
* Start keeping a changelog!
|
15
|
+
* renamed blockings to groupings
|
16
|
+
* cleaned up tests
|
17
|
+
|
18
|
+
* Bug fixes
|
19
|
+
|
20
|
+
* better handling for one-letter similarities like 'X foo' vs 'X bar' which couldn't be detected by pair distance
|
21
|
+
* take deprecated option :tighteners as :normalizers
|
data/Gemfile
CHANGED
@@ -2,12 +2,16 @@ source :rubygems
|
|
2
2
|
|
3
3
|
gemspec
|
4
4
|
|
5
|
+
# bin dependencies
|
6
|
+
gem 'remote_table'
|
7
|
+
gem 'thor'
|
8
|
+
|
5
9
|
# development dependencies
|
6
10
|
gem 'minitest-reporters'
|
7
11
|
gem "minitest"
|
8
12
|
gem 'activerecord', '>=3'
|
9
13
|
gem 'mysql2'
|
10
|
-
gem 'cohort_scope'
|
14
|
+
gem 'cohort_analysis'
|
11
15
|
gem 'weighted_average'
|
12
16
|
gem 'rake'
|
13
17
|
gem 'yard'
|
data/README.markdown
CHANGED
@@ -8,9 +8,7 @@ Replaces [`loose_tight_dictionary`](https://github.com/seamusabshere/loose_tight
|
|
8
8
|
|
9
9
|
>> require 'fuzzy_match'
|
10
10
|
=> true
|
11
|
-
>> matcher = FuzzyMatch.new(['seamus', 'andy', 'ben'])
|
12
|
-
=> #<FuzzyMatch: [...]>
|
13
|
-
>> matcher.find('Shamus')
|
11
|
+
>> FuzzyMatch.new(['seamus', 'andy', 'ben']).find('Shamus')
|
14
12
|
=> "seamus"
|
15
13
|
|
16
14
|
See also the blog post [Fuzzy match in Ruby](http://numbers.brighterplanet.com/2012/01/18/fuzzy-match-in-ruby/).
|
@@ -118,7 +116,7 @@ In edge cases where Dice's finds that two strings are equally similar to a third
|
|
118
116
|
|
119
117
|
## Production use
|
120
118
|
|
121
|
-
Over 2 years in [Brighter Planet's
|
119
|
+
Over 2 years in [Brighter Planet's impact estimate API](http://impact.brighterplanet.com) and [reference data service](http://data.brighterplanet.com).
|
122
120
|
|
123
121
|
We often combine `fuzzy_match` with [`remote_table`](https://github.com/seamusabshere/remote_table) and [`errata`](https://github.com/seamusabshere/errata):
|
124
122
|
|
@@ -126,6 +124,10 @@ We often combine `fuzzy_match` with [`remote_table`](https://github.com/seamusab
|
|
126
124
|
- correct serious or repeated errors with `errata`
|
127
125
|
- `fuzzy_match` the rest
|
128
126
|
|
127
|
+
## Cached results
|
128
|
+
|
129
|
+
TODO write documentation. For now, please see how [we manually cache matches between aircraft and flight segments](https://github.com/brighterplanet/earth/blob/master/lib/earth/air/aircraft.rb).
|
130
|
+
|
129
131
|
## Glossary
|
130
132
|
|
131
133
|
The admittedly imperfect metaphor is "look for a needle in a haystack"
|
@@ -0,0 +1,66 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
if File.exist?(File.join(Dir.pwd, 'fuzzy_match.gemspec'))
|
4
|
+
require 'bundler/setup'
|
5
|
+
end
|
6
|
+
|
7
|
+
require 'fuzzy_match'
|
8
|
+
require 'active_support/core_ext'
|
9
|
+
require 'remote_table'
|
10
|
+
require 'thor'
|
11
|
+
|
12
|
+
class FuzzyMatch
|
13
|
+
class Checker < ::Thor
|
14
|
+
# for example: https://docs.google.com/spreadsheet/pub?key=0AkCJNpm9Ks6JdHZURUI2S2xOa3ZFVzlZb205VVhpQnc&single=true&gid=0&output=csv
|
15
|
+
desc :check, "Check a spreadsheet containing columns with these headers: haystack, needles, correct_matches, groupings, stop_words, identities, normalizers, find_options (listing an option like must_match_grouping makes it true)"
|
16
|
+
method_option :show_success, :default => false, :type => :boolean, :desc => "Whether to print successful matches as you go"
|
17
|
+
method_option :downcase, :default => false, :type => :boolean, :desc => "Whether to downcase everything (except regexes, where you have to do /foo/i)"
|
18
|
+
def check(url)
|
19
|
+
puts "Checking matches using fuzzy_match version #{FuzzyMatch::VERSION}..."
|
20
|
+
|
21
|
+
t = RemoteTable.new(url, :headers => :first_row)
|
22
|
+
if (violators = %w{needle grouping correct_match stop_word identity normalizer find_option} & t.rows.first.keys).any?
|
23
|
+
raise ArgumentError, "Make sure you pluralize your right row headers (violators: #{violators.map(&:inspect).join(', ')}"
|
24
|
+
end
|
25
|
+
haystack = t.rows.map { |row| row['haystack'] }.select(&:present?)
|
26
|
+
haystack.map!(&:downcase) if options.downcase
|
27
|
+
find_options = t.rows.map { |row| row['find_options'] }
|
28
|
+
fm = FuzzyMatch.new(
|
29
|
+
haystack,
|
30
|
+
:groupings => t.rows.map { |row| row['groupings'] }.select(&:present?),
|
31
|
+
:identities => t.rows.map { |row| row['identities'] }.select(&:present?),
|
32
|
+
:stop_words => t.rows.map { |row| row['stop_words'] }.select(&:present?),
|
33
|
+
:normalizers => t.rows.map { |row| row['normalizers'] }.select(&:present?),
|
34
|
+
:must_match_grouping => find_options.include?('must_match_grouping'),
|
35
|
+
:must_match_at_least_one_word => find_options.include?('must_match_at_least_one_word'),
|
36
|
+
:first_grouping_decides => find_options.include?('first_grouping_decides')
|
37
|
+
)
|
38
|
+
|
39
|
+
count = 0
|
40
|
+
t.each do |row|
|
41
|
+
needle = row['needles']
|
42
|
+
correct_match = row['correct_matches']
|
43
|
+
next unless needle.present?
|
44
|
+
if options.downcase
|
45
|
+
needle.to_s.downcase!
|
46
|
+
correct_match.to_s.downcase!
|
47
|
+
end
|
48
|
+
correct_match = nil if correct_match.blank?
|
49
|
+
match = fm.find needle
|
50
|
+
if options.show_success? or match != correct_match
|
51
|
+
puts " #{needle.inspect} => #{match.inspect}"
|
52
|
+
end
|
53
|
+
unless match == correct_match
|
54
|
+
puts "MISMATCH: #{needle.inspect} should match #{correct_match.inspect}"
|
55
|
+
exit 1
|
56
|
+
end
|
57
|
+
count += 1
|
58
|
+
end
|
59
|
+
|
60
|
+
puts "Correctly matched #{count} needles."
|
61
|
+
exit 0
|
62
|
+
end
|
63
|
+
end
|
64
|
+
end
|
65
|
+
|
66
|
+
FuzzyMatch::Checker.start
|
data/fuzzy_match.gemspec
CHANGED
@@ -1,11 +1,9 @@
|
|
1
1
|
# -*- encoding: utf-8 -*-
|
2
|
-
|
3
|
-
require "fuzzy_match/version"
|
2
|
+
require File.expand_path("../lib/fuzzy_match/version", __FILE__)
|
4
3
|
|
5
4
|
Gem::Specification.new do |s|
|
6
5
|
s.name = "fuzzy_match"
|
7
6
|
s.version = FuzzyMatch::VERSION
|
8
|
-
s.platform = Gem::Platform::RUBY
|
9
7
|
s.authors = ["Seamus Abshere"]
|
10
8
|
s.email = ["seamus@abshere.net"]
|
11
9
|
s.homepage = "https://github.com/seamusabshere/fuzzy_match"
|
@@ -21,4 +19,5 @@ Gem::Specification.new do |s|
|
|
21
19
|
|
22
20
|
s.add_runtime_dependency 'activesupport', '>=3'
|
23
21
|
s.add_runtime_dependency 'to_regexp', '>=0.0.3'
|
22
|
+
s.add_runtime_dependency 'active_record_inline_schema', '>=0.4.0'
|
24
23
|
end
|
@@ -1,3 +1,5 @@
|
|
1
|
+
require 'active_record_inline_schema'
|
2
|
+
|
1
3
|
class FuzzyMatch
|
2
4
|
class CachedResult < ::ActiveRecord::Base
|
3
5
|
if ::ActiveRecord::VERSION::STRING >= '3.2'
|
@@ -5,26 +7,23 @@ class FuzzyMatch
|
|
5
7
|
else
|
6
8
|
set_table_name :fuzzy_match_cached_results
|
7
9
|
end
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
end
|
16
|
-
connection.add_index :fuzzy_match_cached_results, [:a_class, :b_class, :a], :name => 'aba'
|
17
|
-
connection.add_index :fuzzy_match_cached_results, [:a_class, :b_class, :b], :name => 'abb'
|
18
|
-
connection.add_index :fuzzy_match_cached_results, [:a_class, :b_class, :a, :b], :name => 'abab'
|
19
|
-
reset_column_information
|
20
|
-
end
|
21
|
-
|
22
|
-
def self.setup(from_scratch = false)
|
23
|
-
if from_scratch or not table_exists?
|
24
|
-
connection.drop_table :fuzzy_match_cached_results rescue nil
|
25
|
-
create_table rescue nil
|
10
|
+
|
11
|
+
class << self
|
12
|
+
def setup(from_scratch = false)
|
13
|
+
if from_scratch
|
14
|
+
connection.drop_table :fuzzy_match_cached_results rescue nil
|
15
|
+
end
|
16
|
+
auto_upgrade!
|
26
17
|
end
|
27
18
|
end
|
19
|
+
|
20
|
+
col :a_class
|
21
|
+
col :a
|
22
|
+
col :b_class
|
23
|
+
col :b
|
24
|
+
add_index [:a_class, :b_class, :a], :name => 'aba'
|
25
|
+
add_index [:a_class, :b_class, :b], :name => 'abb'
|
26
|
+
add_index [:a_class, :b_class, :a, :b], :name => 'abab'
|
28
27
|
|
29
28
|
module ActiveRecordBaseExtension
|
30
29
|
# required options:
|
data/lib/fuzzy_match/version.rb
CHANGED
data/lib/fuzzy_match.rb
CHANGED
@@ -15,10 +15,6 @@ require 'fuzzy_match/wrapper'
|
|
15
15
|
require 'fuzzy_match/similarity'
|
16
16
|
require 'fuzzy_match/score'
|
17
17
|
|
18
|
-
if defined?(::ActiveRecord)
|
19
|
-
require 'fuzzy_match/cached_result'
|
20
|
-
end
|
21
|
-
|
22
18
|
# See the README for more information.
|
23
19
|
class FuzzyMatch
|
24
20
|
class << self
|
data/test/test_amatch.rb
CHANGED
@@ -3,14 +3,16 @@ unless RUBY_PLATFORM == 'java'
|
|
3
3
|
require 'test_fuzzy_match'
|
4
4
|
require 'amatch'
|
5
5
|
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
6
|
+
describe FuzzyMatch do
|
7
|
+
describe %{when using the :amatch string similarity engine} do
|
8
|
+
before do
|
9
|
+
$testing_amatch = true
|
10
|
+
FuzzyMatch.engine = :amatch
|
11
|
+
end
|
12
|
+
after do
|
13
|
+
$testing_amatch = false
|
14
|
+
FuzzyMatch.engine = nil
|
15
|
+
end
|
14
16
|
end
|
15
17
|
end
|
16
18
|
end
|
data/test/test_cache.rb
CHANGED
@@ -2,7 +2,7 @@ require 'helper'
|
|
2
2
|
|
3
3
|
require 'active_support/all'
|
4
4
|
require 'active_record'
|
5
|
-
require 'cohort_scope'
|
5
|
+
require 'cohort_analysis'
|
6
6
|
require 'weighted_average'
|
7
7
|
|
8
8
|
ActiveRecord::Base.establish_connection(
|
@@ -25,7 +25,7 @@ require 'fuzzy_match/cached_result'
|
|
25
25
|
::FuzzyMatch::CachedResult.setup(true)
|
26
26
|
|
27
27
|
class Aircraft < ActiveRecord::Base
|
28
|
-
|
28
|
+
self.primary_key = 'icao_code'
|
29
29
|
|
30
30
|
cache_fuzzy_match_with :flight_segments, :primary_key => :aircraft_description, :foreign_key => :aircraft_description
|
31
31
|
|
@@ -52,13 +52,10 @@ CREATE TABLE `aircraft` (
|
|
52
52
|
end
|
53
53
|
|
54
54
|
class FlightSegment < ActiveRecord::Base
|
55
|
-
|
55
|
+
self.primary_key = 'row_hash'
|
56
56
|
|
57
57
|
cache_fuzzy_match_with :aircraft, :primary_key => :aircraft_description, :foreign_key => :aircraft_description
|
58
58
|
|
59
|
-
extend CohortScope
|
60
|
-
self.minimum_cohort_size = 1
|
61
|
-
|
62
59
|
def self.create_table
|
63
60
|
connection.drop_table(:flight_segments) rescue nil
|
64
61
|
connection.execute %{
|
@@ -96,11 +93,11 @@ fs.passengers = 100
|
|
96
93
|
fs.seats = 5
|
97
94
|
fs.save!
|
98
95
|
|
99
|
-
FlightSegment.
|
96
|
+
FlightSegment.all.each do |fs|
|
100
97
|
fs.cache_aircraft!
|
101
98
|
end
|
102
99
|
|
103
|
-
|
100
|
+
describe FuzzyMatch::CachedResult do
|
104
101
|
it %{joins aircraft to flight segments} do
|
105
102
|
aircraft = Aircraft.find('B742')
|
106
103
|
aircraft.flight_segments.count.must_equal 2
|
@@ -118,7 +115,8 @@ class TestCache < MiniTest::Spec
|
|
118
115
|
|
119
116
|
it %{works with cohort_scope (albeit rather clumsily)} do
|
120
117
|
aircraft = Aircraft.find('B742')
|
121
|
-
FlightSegment.
|
118
|
+
FlightSegment.cohort({:aircraft_description => aircraft.flight_segments_foreign_keys}, :minimum_size => 2).count.must_equal 2
|
119
|
+
# FlightSegment.cohort(:aircraft_description => aircraft.flight_segments_foreign_keys).must_equal []
|
122
120
|
end
|
123
121
|
|
124
122
|
# def test_006_you_can_get_aircraft_from_flight_segments
|
data/test/test_fuzzy_match.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
# -*- encoding: utf-8 -*-
|
2
2
|
require 'helper'
|
3
3
|
|
4
|
-
|
4
|
+
describe FuzzyMatch do
|
5
5
|
describe '#find' do
|
6
6
|
it %{identifies the best match based on string similarity} do
|
7
7
|
d = FuzzyMatch.new %w{ RATZ CATZ }
|
@@ -12,6 +12,11 @@ class TestFuzzyMatch < MiniTest::Spec
|
|
12
12
|
d.find('X').must_equal 'X'
|
13
13
|
d.find('A').must_be_nil
|
14
14
|
end
|
15
|
+
|
16
|
+
it %{does the right thing} do
|
17
|
+
d = FuzzyMatch.new [ 'Artyom Makarov', 'Karl' ], :must_match_at_least_one_word => true
|
18
|
+
puts d.explain('art')#.must_equal 'Artyom Makarov'
|
19
|
+
end
|
15
20
|
|
16
21
|
it %{not return any result if the maximum score is zero} do
|
17
22
|
FuzzyMatch.new(['a']).find('b').must_be_nil
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: fuzzy_match
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.3.2
|
4
|
+
version: 1.3.3
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-
|
12
|
+
date: 2012-04-13 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: activesupport
|
16
|
-
requirement:
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,10 +21,15 @@ dependencies:
|
|
21
21
|
version: '3'
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements:
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ! '>='
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '3'
|
25
30
|
- !ruby/object:Gem::Dependency
|
26
31
|
name: to_regexp
|
27
|
-
requirement:
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
28
33
|
none: false
|
29
34
|
requirements:
|
30
35
|
- - ! '>='
|
@@ -32,19 +37,41 @@ dependencies:
|
|
32
37
|
version: 0.0.3
|
33
38
|
type: :runtime
|
34
39
|
prerelease: false
|
35
|
-
version_requirements:
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ! '>='
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: 0.0.3
|
46
|
+
- !ruby/object:Gem::Dependency
|
47
|
+
name: active_record_inline_schema
|
48
|
+
requirement: !ruby/object:Gem::Requirement
|
49
|
+
none: false
|
50
|
+
requirements:
|
51
|
+
- - ! '>='
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: 0.4.0
|
54
|
+
type: :runtime
|
55
|
+
prerelease: false
|
56
|
+
version_requirements: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ! '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: 0.4.0
|
36
62
|
description: Find a needle in a haystack using string similarity and (optionally)
|
37
63
|
regexp rules. Replaces loose_tight_dictionary.
|
38
64
|
email:
|
39
65
|
- seamus@abshere.net
|
40
|
-
executables:
|
66
|
+
executables:
|
67
|
+
- fuzzy_match_checker
|
41
68
|
extensions: []
|
42
69
|
extra_rdoc_files: []
|
43
70
|
files:
|
44
71
|
- .document
|
45
72
|
- .gitignore
|
73
|
+
- CHANGELOG
|
46
74
|
- Gemfile
|
47
|
-
- History.txt
|
48
75
|
- LICENSE
|
49
76
|
- README.markdown
|
50
77
|
- Rakefile
|
@@ -53,6 +80,7 @@ files:
|
|
53
80
|
- benchmark/before-without-last-result.txt
|
54
81
|
- benchmark/before.txt
|
55
82
|
- benchmark/memory.rb
|
83
|
+
- bin/fuzzy_match_checker
|
56
84
|
- examples/bts_aircraft/5-2-A.htm
|
57
85
|
- examples/bts_aircraft/5-2-B.htm
|
58
86
|
- examples/bts_aircraft/5-2-D.htm
|
@@ -111,7 +139,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
111
139
|
version: '0'
|
112
140
|
requirements: []
|
113
141
|
rubyforge_project: fuzzy_match
|
114
|
-
rubygems_version: 1.8.
|
142
|
+
rubygems_version: 1.8.21
|
115
143
|
signing_key:
|
116
144
|
specification_version: 3
|
117
145
|
summary: Find a needle in a haystack using string similarity and (optionally) regexp
|
data/History.txt
DELETED
@@ -1,13 +0,0 @@
|
|
1
|
-
== 1.3.2 / 2012-02-24
|
2
|
-
|
3
|
-
* Start keeping a changelog!
|
4
|
-
|
5
|
-
* Enhancements
|
6
|
-
|
7
|
-
* renamed blockings to groupings
|
8
|
-
* cleaned up tests
|
9
|
-
|
10
|
-
* Bug fixes
|
11
|
-
|
12
|
-
* better handling for one-letter similiarities like 'X foo' vs 'X bar' which couldn't be detected by pair distance
|
13
|
-
* take deprecated option :tighteners as :normalizers
|