matching 0.14.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,26 @@
1
+ require 'redis'
2
+
3
module Matching
  # Index of attribute values backed by Redis sets. Each "attr:val" key
  # holds the ids that were stored under that attribute/value pair.
  class RedisIndex

    # Opens a Redis client with its default settings, selects database
    # +db_num+ and flushes it.
    # NOTE: flushdb removes every key in the selected database.
    def initialize(db_num=8)
      @redis = Redis.new
      @redis.select(db_num)
      @redis.flushdb
    end

    # Add a value to the index for a given attribute and object id.
    # Nil values are not indexed.
    def put(attr, val, id)
      return if val.nil?

      @redis.sadd(index_key(attr, val), id)
    end

    # Return an array of object ids (converted to integers) for a given
    # attribute and value, or nil when nothing is stored under that pair.
    def get(attr, val)
      members = @redis.smembers(index_key(attr, val))
      return nil unless members.any?

      members.map { |member| member.to_i }
    end

    private

    # Builds the Redis key under which ids for attr/val are kept.
    def index_key(attr, val)
      "#{attr}:#{val}"
    end

  end
end
@@ -0,0 +1,78 @@
1
+ require 'text/levenshtein'
2
+ require 'date'
3
+
4
+ # Adds fuzzy methods to standard classes for
5
+ # comparing two instances on a rules-based scale
6
+ # between 0.0 and 1.0.
7
+
8
class Date
  # Calculates a score between 0.0 and 1.0 for all dates within :days_scale
  # of each other. Identical dates score 1.0; the score falls off linearly
  # with the number of days between the two dates and is 0.0 once they are
  # :days_scale or more days apart.
  #
  # other_date - the Date to compare against.
  # opts       - :days_scale => any real number (default 30).
  #
  # Raises ArgumentError when :days_scale is not a real number.
  def similarity_to(other_date, opts={})
    days_scale = opts[:days_scale] || 30
    # Checking the class against Fixnum breaks on Rubies that no longer
    # define that constant and also rejected Floats; accept any real Numeric.
    unless days_scale.is_a?(Numeric) && days_scale.real?
      raise ArgumentError, 'days_scale must be numeric'
    end
    days_scale = days_scale.to_f

    # Date - Date yields the (possibly fractional) number of days between them.
    delta = (self - other_date).to_f.abs
    delta < days_scale ? (days_scale - delta) / days_scale : 0.0
  end
end
20
+
21
class String

  # Returns a similarity score between 0.0 and 1.0 for this string and
  # other_string.
  #
  # opts - :comparison => :name compares token by token (see
  #        name_similarity_to); anything else uses plain Levenshtein edit
  #        distance (see raw_similarity_to).
  def similarity_to(other_string, opts={})
    case opts[:comparison]
    when :name
      name_similarity_to(other_string)
    else
      raw_similarity_to(other_string)
    end
  end

  # Given a string, return zero or more tokens parsed with the following rules:
  # 1. Turn commas into spaces
  # 2. Split on spaces
  # 3. Strip periods
  # 4. Discard any tokens that are a single character long
  def tokenize
    tr(',', ' ').delete('.').split(' ').reject { |token| token.size == 1 }
  end

  # Given two names, return a floating-point evaluation
  # of similarity in the range 0.0 - 1.0.
  #
  # Every token of this string is compared with every token of
  # other_string; the summed similarities are divided by the average token
  # count of the two names and capped at 1.0.
  def name_similarity_to(other_string)
    return 0.0 if other_string.nil? || empty? || other_string.empty?
    return 1.0 if self == other_string

    l_tokens = tokenize
    r_tokens = other_string.tokenize

    # A name made up only of single-character tokens tokenizes to nothing.
    # Without this guard two such names would compute 0.0 / 0.0 (NaN).
    return 0.0 if l_tokens.empty? || r_tokens.empty?

    total_sim = 0.0
    l_tokens.each do |l|
      r_tokens.each do |r|
        total_sim += l.raw_similarity_to(r)
      end
    end

    avg_tokens = (l_tokens.size + r_tokens.size) / 2.0
    [total_sim / avg_tokens, 1.0].min
  end

  # Returns a floating point value of the similarity
  # between this string and other, ignoring case.
  # Uses 'text' gem, http://rubyforge.org/projects/text
  #
  # The score is 1.0 for an edit distance of zero, 0.0 when the distance
  # exceeds the average length of the two strings, and otherwise
  # (avg_len - distance) / avg_len. A nil other scores 0.0, in line with
  # name_similarity_to.
  def raw_similarity_to(other)
    return 0.0 if other.nil?

    delta = Text::Levenshtein::distance(self.downcase, other.downcase)
    return 0.0 unless delta
    return 1.0 if delta == 0

    avg_len = (size + other.size) / 2.0
    return 0.0 if delta > avg_len
    (avg_len - delta.to_f) / avg_len
  end
end
78
+
data/matching.gemspec ADDED
@@ -0,0 +1,71 @@
1
+ # Generated by jeweler
2
+ # DO NOT EDIT THIS FILE DIRECTLY
3
+ # Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
4
+ # -*- encoding: utf-8 -*-
5
+
6
Gem::Specification.new do |s|
  # Identity
  s.name    = "matching"
  s.version = "0.14.1"

  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=

  # Metadata
  s.authors          = ["Barry Ezell"]
  s.date             = "2012-02-09"
  s.description      = ""
  s.email            = "barrye@gmail.com"
  s.extra_rdoc_files = ["README.md"]

  # Packaged files
  s.files = [
    ".document",
    ".rspec",
    "Gemfile",
    "Gemfile.lock",
    "README.md",
    "Rakefile",
    "VERSION",
    "lib/matching.rb",
    "lib/matching/active_relation_store.rb",
    "lib/matching/array_store.rb",
    "lib/matching/attribute_pair.rb",
    "lib/matching/deduplicator.rb",
    "lib/matching/hash_index.rb",
    "lib/matching/match.rb",
    "lib/matching/matcher.rb",
    "lib/matching/redis_index.rb",
    "lib/matching/similarity.rb",
    "matching.gemspec",
    "spec/db/database.yml",
    "spec/integration/bank_rec_spec.rb",
    "spec/lib/ar_spec.rb",
    "spec/lib/deduplicator_spec.rb",
    "spec/lib/matcher_spec.rb",
    "spec/lib/redis_spec.rb",
    "spec/lib/similarity_spec.rb",
    "spec/samples/agent_recs.csv",
    "spec/spec_helper.rb"
  ]

  s.homepage         = "http://github.com/btedev/matching"
  s.licenses         = ["MIT license"]
  s.require_paths    = ["lib"]
  s.rubygems_version = "1.8.7"
  s.summary          = "Dataset matching engine"

  # Dependencies. When the specification supports specification_version and
  # RubyGems is at least 1.2.0, runtime and development dependencies are
  # declared separately; otherwise all three go through add_dependency.
  versioned_spec = s.respond_to?(:specification_version)
  s.specification_version = 3 if versioned_spec

  if versioned_spec && Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0')
    s.add_runtime_dependency("text", [">= 0"])
    s.add_development_dependency("bundler", [">= 0"])
    s.add_development_dependency("jeweler", [">= 0"])
  else
    %w[text bundler jeweler].each do |gem_name|
      s.add_dependency(gem_name, [">= 0"])
    end
  end
end
71
+
@@ -0,0 +1,5 @@
1
+ development:
2
+ adapter: sqlite3
3
+ database: spec/db/development.sqlite3
4
+ pool: 5
5
+ timeout: 5000
@@ -0,0 +1,50 @@
1
+ require File.expand_path("../../spec_helper", __FILE__)
2
+ include Matching
3
+
4
+ # Note: do not use a Struct in place of the class because matcher.rb relies
5
+ # on object_id to determine object inclusion in exception arrays. Two
6
+ # instances of a Struct with the same values compare as equal (==, eql?, hash).
7
# Plain value holder for one transaction row used by this spec.
class Transaction
  attr_accessor :date
  attr_accessor :desc
  attr_accessor :amount

  # Stores the three fields as given; no conversion is applied.
  def initialize(date, desc, amount)
    @date   = date
    @desc   = desc
    @amount = amount
  end
end
13
+
14
describe "Bank reconciliation" do

  # Ledger side: two identical Basecamp entries plus one Github entry.
  let(:ledger_txns) do
    rows = [
      [Date.new(2012,1,1), 'Basecamp', '25.0'],
      [Date.new(2012,1,1), 'Basecamp', '25.0'],
      [Date.new(2012,1,2), 'Github',   '25.0']
    ]
    rows.map { |date, desc, amount| Transaction.new(date, desc, amount) }
  end

  # Bank side: one Basecamp entry (different description) and one Github
  # entry dated a day later than its ledger counterpart.
  let(:bank_txns) do
    rows = [
      [Date.new(2012,1,1), 'Basecamp (37 signals)', '25.0'],
      [Date.new(2012,1,3), 'Github',                '25.0']
    ]
    rows.map { |date, desc, amount| Transaction.new(date, desc, amount) }
  end

  let(:matcher) do
    options = {
      :left_store  => ArrayStore.new(ledger_txns),
      :right_store => ArrayStore.new(bank_txns),
      :min_score   => 1.0
    }
    Matching::Matcher.new(options)
  end

  it "should rec" do
    matcher.define do
      join :amount, :amount, 1.0
      compare :date, :date, 0.2, :fuzzy => true
    end
    matcher.match

    # Two ledger entries find a bank entry, one ledger entry is left over,
    # and no bank entry stays unmatched.
    matcher.left_matches.should have(2).items
    matcher.left_exceptions.should have(1).items
    matcher.right_exceptions.should have(0).items
  end

end
@@ -0,0 +1,182 @@
1
+ # Tests ActiveRecord as the data store
2
+
3
+ require File.expand_path("../../spec_helper", __FILE__)
4
+ require File.expand_path("../../../lib/matching/active_relation_store", __FILE__)
5
+ include Matching
6
+
7
# Helpers shared by the ActiveRecord-backed specs: a throwaway database
# built from spec/db/database.yml, a bare Txn model, and fixture builders.
module ArSpecHelper

  # Model with no custom behavior; db_connect creates the txns table it
  # is expected to map to.
  class Txn < ActiveRecord::Base
  end

  # The 'development' section of spec/db/database.yml, loaded once and memoized.
  def config
    @config ||= YAML.load_file(File.expand_path(File.dirname(__FILE__) + '/../db/database.yml'))['development']
  end

  # Deletes any existing database file, connects ActiveRecord using
  # +config+, and creates an empty txns table.
  def db_connect
    db_file = config['database']
    # File.exist? is the supported spelling; File.exists? is gone from
    # recent Ruby versions.
    File.delete(db_file) if File.exist?(db_file)
    ActiveRecord::Base.establish_connection config
    sql = "create table txns(id integer primary key, company text, esn text, mdn text, date date);"
    ActiveRecord::Base.connection.execute(sql)
  end

  # Rebuilds the database and creates three 'ACME' rows (@left_a..@left_c)
  # and four 'Cinco' rows (@right_a..@right_d) for the specs to match.
  def create_ar_test_data
    db_connect

    @left_a = Txn.create(:company => 'ACME', :esn => "11111111111", :mdn => "7275551111", :date => Date.new(2010,6,1))
    @left_b = Txn.create(:company => 'ACME', :esn => "22222222222", :mdn => "8135554444", :date => Date.new(2010,6,1))
    @left_c = Txn.create(:company => 'ACME', :esn => "33333333333", :mdn => "7275551111", :date => Date.new(2010,6,15))

    @right_a = Txn.create(:company => 'Cinco', :esn => "11111111111", :mdn => "2015559999", :date => Date.new(2010,6,1))
    @right_b = Txn.create(:company => 'Cinco', :esn => "11111111111", :mdn => "7275551111", :date => Date.new(2010,6,1))
    @right_c = Txn.create(:company => 'Cinco', :esn => "22222222222", :mdn => "8135554444", :date => Date.new(2010,6,2))
    @right_d = Txn.create(:company => 'Cinco', :esn => "44444444444", :mdn => "7275551111", :date => Date.new(2010,6,14))
  end

  # Builds a Matcher using ActiveRecord for the data store: ACME rows on
  # the left, Cinco rows on the right. Passes :redis_db => 8 when
  # use_redis is true and nil otherwise. Returns the new Matcher.
  def create_ar_matcher(use_redis = false)
    create_ar_test_data

    Matcher.new(
      :left_store => ActiveRelationStore.new(Txn, "company = 'ACME'"),
      :right_store => ActiveRelationStore.new(Txn, "company = 'Cinco'"),
      :redis_db => (use_redis ? 8 : nil)
    )
  end
end
50
+
51
describe ActiveRelationStore do
  include ArSpecHelper

  # Every example starts from a freshly rebuilt database with seven rows.
  before(:each) { create_ar_test_data }

  context " unfiltered" do
    let(:store) do
      ActiveRelationStore.new(Txn)
    end

    it "should enumerate left AR objects with id" do
      cnt = 0
      expect { store.each { |_record, _record_id| cnt += 1 } }.to change { cnt }.from(0).to(7)

      # Capture only the first pair the store yields.
      first_record = nil
      first_id = nil
      store.each do |record, record_id|
        first_record = record
        first_id = record_id
        break
      end

      first_id.should == 1
      first_record.should == @left_a
    end

    it "should retrieve objects by their id through the find method" do
      found = store.find(2)
      found.should == @left_b
    end
  end

  context " filtered" do
    let(:store) do
      ActiveRelationStore.new(Txn, "company = 'Cinco'")
    end

    it "should have a where clause" do
      store.where_clause.should == "company = 'Cinco'"
    end

    it "should enumerate left AR objects from query with where clause" do
      cnt = 0
      expect { store.each { |_record, _record_id| cnt += 1 } }.to change { cnt }.from(0).to(4)
    end
  end
end
93
+
94
describe Matcher do
  include ArSpecHelper

  context "with hash index and ActiveRecord store" do

    before(:each) { @matcher = create_ar_matcher }

    # Defines a single join on esn.
    let(:esn_matcher) do
      @matcher.define do
        join :esn, :esn, 1.0
      end
    end

    # Defines joins on both mdn and esn, each weighted 1.0.
    let(:ptn_esn_matcher) do
      @matcher.define do
        join :mdn, :mdn, 1.0
        join :esn, :esn, 1.0
      end
    end

    it "requires at least one join pair to be defined" do
      expect { @matcher.index_right_objects }.to raise_error
    end

    context "using ptn and esn matcher" do

      before(:each) do
        ptn_esn_matcher
        @matcher.index_right_objects
      end

      it "indexes right records on join attributes" do
        @matcher.right_index.get(:esn, "11111111111").should_not be_nil
        @matcher.right_index.get(:esn, "11111111111").size.should == 2
        @matcher.right_index.get(:mdn, "8135554444").size.should == 1
      end

      it "finds potential matches for left_objects from right_objects based on join criteria" do
        candidates = @matcher.find_potential_matches(@left_a)
        candidates.should have(3).items
        candidates.should include(@right_a)
      end

      it "finds scored matches by applying rules after finding potential matches" do
        scored = @matcher.find_matches(@left_a)
        scored.should have(3).items

        # Raise the matching threshold and search again.
        @matcher.min_score = 2.0
        scored = @matcher.find_matches(@left_a)
        scored.should have(1).items

        # Each entry is a [right_object, score] pair, not a bare right_object.
        scored[0].should == [@right_b, 2.0]
      end
    end

    it "should reconcile test data based on single attribute pair" do
      esn_matcher
      @matcher.match

      @matcher.right_matches.size.should == 2
      @matcher.left_matches.size.should == 2

      [@left_a, @left_b].each do |left|
        @matcher.left_matches.should include(left)
      end
      @matcher.left_matches[@left_b].right_obj.should == @right_c
    end

    it "should reconcile test data based on two attribute pairs" do
      ptn_esn_matcher
      @matcher.match

      @matcher.right_matches.size.should == 3
      @matcher.left_matches.size.should == 3
      @matcher.left_matches[@left_c].right_obj.should == @right_d
    end

    it "should fail to match records below the min_score threshold" do
      ptn_esn_matcher
      @matcher.min_score = 2.0
      @matcher.match

      @matcher.right_matches.size.should == 2
      @matcher.left_matches.size.should == 2
      @matcher.left_matches[@left_c].should be_nil
    end

  end #hash index and ActiveRecord tests

end
@@ -0,0 +1,221 @@
1
+ # Tests main functionality using array data stores and hash indexing.
2
+ # See ar_spec.rb for tests of ActiveRecord as the data store
3
+ # See redis_spec.rb for tests of Redis for indexing.
4
+
5
+ require 'rspec'
6
+ require 'date'
7
+ require File.expand_path(File.dirname(__FILE__) + '/../../lib/matching.rb')
8
+ include Matching
9
+
10
# Fixture types for the Deduplicator specs.
module DedupeSpecHelper
  # Record type used as test data. The specs only ever supply the first
  # four members, so :nilly is always nil and serves the nil-value cases.
  CellTxn = Struct.new(
    :id,
    :mid,
    :esn,
    :act_date,
    :nilly
  )
end
13
+
14
# Exercises Deduplicator against in-memory ArrayStores of CellTxn records.
describe Deduplicator do
  include DedupeSpecHelper

  # Four records: ids 0/1 share a mid, ids 1/2/3 share an esn, and
  # ids 1/3 share an act_date.
  let(:array_store) do
    c1 = CellTxn.new(0, "7275554444", "11111111111", Date.new(2011,1,1))
    c2 = CellTxn.new(1, "7275554444", "22222222222", Date.new(2011,1,2))
    c3 = CellTxn.new(2, "8135552222", "22222222222", Date.new(2011,1,3))
    c4 = CellTxn.new(3, "8135552222", "22222222222", Date.new(2011,1,2))
    ArrayStore.new([c1,c2,c3,c4])
  end

  before(:each) do
    @deduper = Deduplicator.new(array_store)
  end

  subject { @deduper }
  specify { subject.index.should_not be_nil }

  describe :store do
    context "when not empty" do
      specify { subject.store.should_not be_nil }
    end

    context "when empty" do
      specify { expect { Deduplicator.new }.to raise_error }
    end
  end

  describe "match criteria" do
    it "adds match definitions to criteria array" do
      subject.match_attrs([:mid])
      subject.criteria.should == [[:mid]]
    end

    it "should convert single items into arrays when adding criteria" do
      subject.match_attrs(:mid)
      subject.criteria.should == [[:mid]]
    end

    it "has a flattened, unique array combining any and all criteria" do
      subject.match_attrs([:mid, :esn])
      subject.match_attrs([:date, :mid])
      ua = subject.unique_attrs
      ua.should have(3).items
      ua.should include(:mid)
      ua.should include(:esn)
      ua.should include(:date)
    end

    it "calls any and all via a block" do
      subject.define do
        match_attrs [:mid, :esn]
        match_attrs [:date, :esn]
      end

      subject.criteria.should == [[:mid, :esn], [:date, :esn]]
    end
  end

  it "indexes store values" do
    subject.define { match_attrs [:mid] }
    subject.create_index
    subject.index.get(:mid, "7275554444").should have(2).items
    subject.index.get(:mid, "8135552222").should have(2).items
    subject.index.get(:mid, "2055558888").should be_nil
  end

  describe "deduplicate" do
    context "single criteria arrays" do

      it "should deduplicate an ArrayStore on a single match criterion (1 of 3)" do
        subject.define do
          match_attrs :mid
        end

        subject.deduplicate
        subject.groups.count.should == 2
        subject.groups[0].count.should == 2
        subject.groups[1].count.should == 2
      end

      it "should deduplicate an ArrayStore on a single match criterion (2 of 3)" do
        subject.define do
          match_attrs :esn
        end

        subject.deduplicate
        subject.groups.count.should == 2
        subject.groups[0].count.should == 1
        subject.groups[1].count.should == 3
      end

      it "should deduplicate an ArrayStore on a single match criterion (3 of 3)" do
        subject.define do
          match_attrs :act_date
        end

        subject.deduplicate
        subject.groups.count.should == 3
      end

      it "should group with only nil values" do
        subject.define do
          match_attrs :nilly
        end

        subject.deduplicate
        subject.groups.count.should == 1
      end

      it "should group with some nil values" do
        subject.define do
          match_attrs [:mid, :nilly]
        end

        subject.deduplicate
        subject.groups.count.should == 2
      end

      it "should deduplicate an ArrayStore on multiple criteria" do
        subject.define do
          match_attrs [:esn, :act_date]
        end

        subject.deduplicate
        subject.groups.count.should == 3
      end
    end #single criteria arrays

    context "multiple criteria arrays" do

      # Same four records as array_store plus a fifth that shares values
      # with both c1 and c2.
      let(:larger_array_store) do
        c1 = CellTxn.new(0, "7275554444", "11111111111", Date.new(2011,1,1))
        c2 = CellTxn.new(1, "7275554444", "22222222222", Date.new(2011,1,2))
        c3 = CellTxn.new(2, "8135552222", "22222222222", Date.new(2011,1,3))
        c4 = CellTxn.new(3, "8135552222", "22222222222", Date.new(2011,1,2))
        c5 = CellTxn.new(4, "7275554444", "11111111111", Date.new(2011,1,2)) #hybrid of c1 and c2
        ArrayStore.new([c1,c2,c3,c4,c5])
      end

      it "should join groups that are joined by different match criteria" do
        # Local is deliberately not called `subject`, which would shadow
        # the RSpec helper of the same name.
        deduper = Deduplicator.new(larger_array_store)
        deduper.define do
          match_attrs [:mid, :esn] #joins 0 and 4
          match_attrs [:mid, :act_date] #joins 1 and 4
        end

        deduper.deduplicate
        deduper.groups.count.should == 2 # expect [0,1,4],[2,3]
        two_group = deduper.groups.find { |grp| grp.size == 2 }
        two_group.should include(2,3)
        three_group = deduper.groups.find { |grp| grp.size == 3 }
        three_group.should include(0,1,4)
      end

      it "should return results with objects, group index, and item index" do
        deduper = Deduplicator.new(larger_array_store)
        deduper.define do
          match_attrs [:mid, :esn] #joins 0 and 4
          match_attrs [:mid, :act_date] #joins 1 and 4
        end

        deduper.deduplicate
        group_sum, item_sum = 0, 0
        deduper.each_with_groups do |obj, grp_idx, item_idx|
          group_sum += grp_idx
          item_sum += item_idx
        end

        # Expected (grp, item) pairs:
        #grp: 0, item: 0
        #grp: 0, item: 1
        #grp: 0, item: 2
        #grp: 1, item: 0
        #grp: 1, item: 1

        group_sum.should == 2
        item_sum.should == 4
      end
    end #multiple criteria arrays
  end #deduplication

  context "integration tests" do

    it "should deduplicate on a common key" do
      csv_path = File.join(File.dirname(__FILE__),'/../samples/agent_recs.csv')

      # Block form guarantees the file handle is closed; only the first
      # 200 lines are read.
      lines = File.open(csv_path, 'r') { |file| file.each_line.first(200) }

      txns = lines.map do |line|
        parts = line.split ','
        CellTxn.new(parts[0],parts[1],parts[2],parts[3])
      end

      deduper = Deduplicator.new(ArrayStore.new(txns))
      deduper.define do
        match_attrs :act_date
      end

      deduper.deduplicate

      dates = txns.map { |txn| txn.act_date }

      # One group per distinct act_date value.
      deduper.groups.size.should == dates.uniq.count
    end
  end #integration
end