matching 0.14.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.document +5 -0
- data/.rspec +2 -0
- data/Gemfile +22 -0
- data/Gemfile.lock +60 -0
- data/README.md +319 -0
- data/Rakefile +47 -0
- data/VERSION +1 -0
- data/lib/matching.rb +11 -0
- data/lib/matching/active_relation_store.rb +30 -0
- data/lib/matching/array_store.rb +23 -0
- data/lib/matching/attribute_pair.rb +17 -0
- data/lib/matching/deduplicator.rb +133 -0
- data/lib/matching/hash_index.rb +25 -0
- data/lib/matching/match.rb +14 -0
- data/lib/matching/matcher.rb +266 -0
- data/lib/matching/redis_index.rb +26 -0
- data/lib/matching/similarity.rb +78 -0
- data/matching.gemspec +71 -0
- data/spec/db/database.yml +5 -0
- data/spec/integration/bank_rec_spec.rb +50 -0
- data/spec/lib/ar_spec.rb +182 -0
- data/spec/lib/deduplicator_spec.rb +221 -0
- data/spec/lib/matcher_spec.rb +297 -0
- data/spec/lib/redis_spec.rb +105 -0
- data/spec/lib/similarity_spec.rb +88 -0
- data/spec/samples/agent_recs.csv +2024 -0
- data/spec/spec_helper.rb +70 -0
- metadata +109 -0
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
require 'redis'
|
|
2
|
+
|
|
3
|
+
module Matching
  # Attribute/value index stored in Redis sets. Every "<attr>:<val>" key
  # holds the set of object ids that were registered for that pair.
  class RedisIndex

    # Opens a Redis client with its default settings, selects database
    # +db_num+ and flushes that database so the index starts out empty.
    def initialize(db_num=8)
      @redis = Redis.new
      @redis.select(db_num)
      @redis.flushdb
    end

    # Registers +id+ under the key built from +attr+ and +val+.
    # Nil values are not indexed; nothing is written for them.
    def put(attr, val, id)
      return if val.nil?

      @redis.sadd("#{attr}:#{val}", id)
    end

    # Looks up the ids registered for +attr+ and +val+.
    # Returns them converted to integers, or nil when the set has no members.
    def get(attr, val)
      members = @redis.smembers("#{attr}:#{val}")
      return nil unless members.any?

      members.map(&:to_i)
    end

  end
end
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
require 'text/levenshtein'
|
|
2
|
+
require 'date'
|
|
3
|
+
|
|
4
|
+
# Adds fuzzy methods to standard classes for
|
|
5
|
+
# comparing two instances on a rules-based scale
|
|
6
|
+
# between 0.0 and 1.0.
|
|
7
|
+
|
|
8
|
+
class Date
  # Calculates a score between 0.0 and 1.0 for all dates within :days_scale
  # of each other.
  #
  # other_date - the Date to compare with.
  # opts       - options hash:
  #              :days_scale - number of days over which the score decays
  #                            linearly from 1.0 to 0.0 (default 30). Any
  #                            real number is accepted.
  #
  # Returns 1.0 for identical dates, 0.0 for dates :days_scale or more days
  # apart, and a proportional value in between.
  # Raises ArgumentError when :days_scale is not a real number.
  def similarity_to(other_date, opts={})
    days_scale = opts[:days_scale] || 30
    # Fixnum no longer exists on current Rubies (referencing it raises
    # NameError), and the message promises "numeric", so accept any real
    # Numeric instead of one exact integer class.
    unless days_scale.is_a?(Numeric) && days_scale.real?
      raise ArgumentError, 'days_scale must be numeric'
    end
    days_scale = days_scale.to_f

    # Date subtraction yields a Rational number of days.
    delta = (self - other_date).to_f.abs
    delta < days_scale ? (days_scale - delta) / days_scale : 0.0
  end
end
|
|
20
|
+
|
|
21
|
+
class String

  # Scores the similarity between this string and +other_string+ on a
  # 0.0 - 1.0 scale.
  #
  # opts - options hash:
  #        :comparison - :name selects token-based name comparison; any
  #                      other value (or none) uses plain Levenshtein
  #                      edit distance via raw_similarity_to.
  def similarity_to(other_string, opts={})
    case opts[:comparison]
    when :name
      name_similarity_to(other_string)
    else
      raw_similarity_to(other_string)
    end
  end

  #Given a string, return one or more tokens parsed with the following rules:
  # 1. Turn commas into spaces
  # 2. Split on spaces
  # 3. Strip periods
  # 4. Discard any tokens with single letters
  #
  # Returns an Array of Strings (possibly empty).
  def tokenize
    tr(',', ' ').delete('.').split(' ').reject { |token| token.size == 1 }
  end

  # Given two names, return a floating-point evaluation
  # of similarity in the range 0.0 - 1.0
  #
  # Every token of this string is compared with every token of
  # +other_string+; the summed similarity is divided by the average token
  # count of both sides and capped at 1.0. Returns 0.0 when either side is
  # nil, empty, or yields no tokens.
  def name_similarity_to(other_string)
    return 0.0 if other_string.nil? || empty? || other_string.empty?
    return 1.0 if self == other_string

    l_tokens = tokenize
    r_tokens = other_string.tokenize
    # With no tokens on a side there is nothing to compare. When both sides
    # were empty the division below used to be 0.0 / 0.0, i.e. NaN.
    return 0.0 if l_tokens.empty? || r_tokens.empty?

    total_sim = l_tokens.product(r_tokens).inject(0.0) do |sum, (l, r)|
      sum + l.raw_similarity_to(r)
    end

    avg_tokens = (l_tokens.size + r_tokens.size).to_f / 2.0
    [total_sim / avg_tokens, 1.0].min
  end

  # Returns a floating point value of the similarity
  # between this string and other.
  # Uses the 'text' gem (Text::Levenshtein).
  #
  # The comparison is case-insensitive. The edit distance is scaled by the
  # average length of the two strings; distances larger than that average
  # score 0.0. A nil +other+ scores 0.0, matching name_similarity_to.
  def raw_similarity_to(other)
    return 0.0 if other.nil?

    delta = Text::Levenshtein.distance(downcase, other.downcase)
    return 0.0 unless delta
    return 1.0 if delta == 0

    avg_len = (size + other.size).to_f / 2.0
    return 0.0 if delta > avg_len

    (avg_len - delta.to_f) / avg_len
  end
end
|
|
78
|
+
|
data/matching.gemspec
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
# Generated by jeweler
|
|
2
|
+
# DO NOT EDIT THIS FILE DIRECTLY
|
|
3
|
+
# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
|
|
4
|
+
# -*- encoding: utf-8 -*-
|
|
5
|
+
|
|
6
|
+
Gem::Specification.new do |s|
  # Identity and release metadata
  s.name    = 'matching'
  s.version = "0.14.1"

  if s.respond_to? :required_rubygems_version=
    s.required_rubygems_version = Gem::Requirement.new(">= 0")
  end

  s.authors          = ['Barry Ezell']
  s.date             = '2012-02-09'
  s.description      = ''
  s.email            = 'barrye@gmail.com'
  s.extra_rdoc_files = ["README.md"]

  # Packaged files, in the order the generator emitted them
  s.files = %w[
    .document
    .rspec
    Gemfile
    Gemfile.lock
    README.md
    Rakefile
    VERSION
    lib/matching.rb
    lib/matching/active_relation_store.rb
    lib/matching/array_store.rb
    lib/matching/attribute_pair.rb
    lib/matching/deduplicator.rb
    lib/matching/hash_index.rb
    lib/matching/match.rb
    lib/matching/matcher.rb
    lib/matching/redis_index.rb
    lib/matching/similarity.rb
    matching.gemspec
    spec/db/database.yml
    spec/integration/bank_rec_spec.rb
    spec/lib/ar_spec.rb
    spec/lib/deduplicator_spec.rb
    spec/lib/matcher_spec.rb
    spec/lib/redis_spec.rb
    spec/lib/similarity_spec.rb
    spec/samples/agent_recs.csv
    spec/spec_helper.rb
  ]

  s.homepage         = 'http://github.com/btedev/matching'
  s.licenses         = ['MIT license']
  s.require_paths    = ['lib']
  s.rubygems_version = '1.8.7'
  s.summary          = 'Dataset matching engine'

  # Typed (runtime/development) dependencies are only declared when the
  # specification supports specification_version and RubyGems is >= 1.2.0;
  # otherwise every dependency is added untyped.
  typed_dependencies = false
  if s.respond_to? :specification_version
    s.specification_version = 3
    typed_dependencies = Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0')
  end

  if typed_dependencies
    s.add_runtime_dependency('text', [">= 0"])
    %w[bundler jeweler].each do |dev_gem|
      s.add_development_dependency(dev_gem, [">= 0"])
    end
  else
    %w[text bundler jeweler].each do |any_gem|
      s.add_dependency(any_gem, [">= 0"])
    end
  end
end
|
|
71
|
+
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
require File.expand_path("../../spec_helper", __FILE__)
|
|
2
|
+
include Matching
|
|
3
|
+
|
|
4
|
+
# Note: do not use a Struct in place of the class because matcher.rb relies
|
|
5
|
+
# on object_id to determine object inclusion in exception arrays. Two
|
|
6
|
+
# instances of a Struct with the same values compare as equal (==, eql?, hash).
|
|
7
|
+
# Plain value holder for one ledger or bank line used by this spec.
class Transaction
  attr_accessor :date, :desc, :amount

  # Stores the three arguments unchanged in the matching attributes.
  def initialize(date, desc, amount)
    @date   = date
    @desc   = desc
    @amount = amount
  end
end
|
|
13
|
+
|
|
14
|
+
describe "Bank reconciliation" do

  # Three ledger entries; the first two are identical Basecamp charges.
  let(:ledger_txns) do
    [
      [Date.new(2012,1,1), 'Basecamp', '25.0'],
      [Date.new(2012,1,1), 'Basecamp', '25.0'],
      [Date.new(2012,1,2), 'Github',   '25.0']
    ].map { |fields| Transaction.new(*fields) }
  end

  # Two bank entries for the same amount as the ledger entries.
  let(:bank_txns) do
    [
      [Date.new(2012,1,1), 'Basecamp (37 signals)', '25.0'],
      [Date.new(2012,1,3), 'Github',                '25.0']
    ].map { |fields| Transaction.new(*fields) }
  end

  # Ledger on the left, bank on the right, minimum score of 1.0.
  let(:matcher) do
    left  = ArrayStore.new(ledger_txns)
    right = ArrayStore.new(bank_txns)
    Matching::Matcher.new(:left_store => left, :right_store => right, :min_score => 1.0)
  end

  it "should rec" do
    # Join on amount, then score candidates by fuzzy date comparison.
    matcher.define do
      join :amount, :amount, 1.0
      compare :date, :date, 0.2, :fuzzy => true
    end

    matcher.match

    matcher.left_matches.should have(2).items
    matcher.left_exceptions.should have(1).items
    matcher.right_exceptions.should have(0).items
  end

end
|
data/spec/lib/ar_spec.rb
ADDED
|
@@ -0,0 +1,182 @@
|
|
|
1
|
+
# Tests ActiveRecord as the data store
|
|
2
|
+
|
|
3
|
+
require File.expand_path("../../spec_helper", __FILE__)
|
|
4
|
+
require File.expand_path("../../../lib/matching/active_relation_store", __FILE__)
|
|
5
|
+
include Matching
|
|
6
|
+
|
|
7
|
+
# Helpers shared by the ActiveRecord-backed specs in this file: database
# setup, fixture rows and a ready-made Matcher.
module ArSpecHelper

  # Model mapped onto the txns table created by db_connect.
  class Txn < ActiveRecord::Base
  end

  # Returns the 'development' section of spec/db/database.yml (memoized).
  def config
    @config ||= YAML.load_file(File.expand_path(File.dirname(__FILE__) + '/../db/database.yml'))['development']
  end

  # Removes any database file left over from a previous run, connects
  # ActiveRecord with the settings from config and creates the txns table.
  def db_connect
    db_file = config['database']
    # File.exists? has been removed from current Rubies; File.exist? is the
    # long-standing equivalent.
    File.delete(db_file) if File.exist?(db_file)
    ActiveRecord::Base.establish_connection config
    sql = "create table txns(id integer primary key, company text, esn text, mdn text, date date);"
    ActiveRecord::Base.connection.execute(sql)
  end

  # Rebuilds the database and creates the fixture rows: three 'ACME' rows
  # (@left_a..@left_c) followed by four 'Cinco' rows (@right_a..@right_d).
  def create_ar_test_data
    db_connect

    @left_a = Txn.create(:company => 'ACME', :esn => "11111111111", :mdn => "7275551111", :date => Date.new(2010,6,1))
    @left_b = Txn.create(:company => 'ACME', :esn => "22222222222", :mdn => "8135554444", :date => Date.new(2010,6,1))
    @left_c = Txn.create(:company => 'ACME', :esn => "33333333333", :mdn => "7275551111", :date => Date.new(2010,6,15))

    @right_a = Txn.create(:company => 'Cinco', :esn => "11111111111", :mdn => "2015559999", :date => Date.new(2010,6,1))
    @right_b = Txn.create(:company => 'Cinco', :esn => "11111111111", :mdn => "7275551111", :date => Date.new(2010,6,1))
    @right_c = Txn.create(:company => 'Cinco', :esn => "22222222222", :mdn => "8135554444", :date => Date.new(2010,6,2))
    @right_d = Txn.create(:company => 'Cinco', :esn => "44444444444", :mdn => "7275551111", :date => Date.new(2010,6,14))
  end

  # Creates the fixture data and returns a Matcher that uses ActiveRecord
  # for the data store: 'ACME' rows on the left, 'Cinco' rows on the right.
  # When use_redis is true, :redis_db => 8 is passed to the Matcher;
  # otherwise :redis_db is nil.
  def create_ar_matcher(use_redis = false)
    create_ar_test_data

    Matcher.new(
      :left_store  => ActiveRelationStore.new(Txn, "company = 'ACME'"),
      :right_store => ActiveRelationStore.new(Txn, "company = 'Cinco'"),
      :redis_db    => (use_redis ? 8 : nil)
    )
  end
end
|
|
50
|
+
|
|
51
|
+
describe ActiveRelationStore do
  include ArSpecHelper

  # Every example starts from a freshly created set of fixture rows.
  before(:each) { create_ar_test_data }

  context " unfiltered" do
    let(:store) do
      ActiveRelationStore.new(Txn)
    end

    it "should enumerate left AR objects with id" do
      visited = 0
      expect { store.each { |_record, _key| visited += 1 } }.to change { visited }.from(0).to(7)

      # Capture only the first yielded pair, then stop iterating.
      first_obj = nil
      first_id  = nil
      store.each do |record, key|
        first_obj = record
        first_id  = key
        break
      end

      first_id.should == 1
      first_obj.should == @left_a
    end

    it "should retrieve objects by their id through the find method" do
      found = store.find(2)
      found.should == @left_b
    end
  end

  context " filtered" do
    let(:store) do
      ActiveRelationStore.new(Txn, "company = 'Cinco'")
    end

    it "should have a where clause" do
      store.where_clause.should == "company = 'Cinco'"
    end

    it "should enumerate left AR objects from query with where clause" do
      visited = 0
      expect { store.each { |_record, _key| visited += 1 } }.to change { visited }.from(0).to(4)
    end
  end
end
|
|
93
|
+
|
|
94
|
+
describe Matcher do
  include ArSpecHelper

  context "with hash index and ActiveRecord store" do

    before(:each) { @matcher = create_ar_matcher }

    # Defines a single join on esn.
    let(:esn_matcher) { @matcher.define { join :esn, :esn, 1.0 } }

    # Defines joins on both mdn and esn, each with a weight of 1.0.
    let(:ptn_esn_matcher) do
      @matcher.define do
        join :mdn, :mdn, 1.0
        join :esn, :esn, 1.0
      end
    end

    it "requires at least one join pair to be defined" do
      expect { @matcher.index_right_objects }.to raise_error
    end

    context "using ptn and esn matcher" do

      before(:each) do
        ptn_esn_matcher
        @matcher.index_right_objects
      end

      it "indexes right records on join attributes" do
        @matcher.right_index.get(:esn, "11111111111").should_not be_nil
        @matcher.right_index.get(:esn, "11111111111").size.should == 2
        @matcher.right_index.get(:mdn, "8135554444").size.should == 1
      end

      it "finds potential matches for left_objects from right_objects based on join criteria" do
        candidates = @matcher.find_potential_matches(@left_a)
        candidates.should have(3).items
        candidates.should include(@right_a)
      end

      it "finds scored matches by applying rules after finding potential matches" do
        scored = @matcher.find_matches(@left_a)
        scored.should have(3).items

        # Raise the matching threshold and score again.
        @matcher.min_score = 2.0
        scored = @matcher.find_matches(@left_a)
        scored.should have(1).items

        # Each entry is an [object, score] pair, not a bare right object.
        scored[0].should == [@right_b, 2.0]
      end
    end

    it "should reconcile test data based on single attribute pair" do
      esn_matcher
      @matcher.match

      @matcher.right_matches.size.should == 2
      @matcher.left_matches.size.should == 2

      @matcher.left_matches.should include(@left_a)
      @matcher.left_matches.should include(@left_b)
      @matcher.left_matches[@left_b].right_obj.should == @right_c
    end

    it "should reconcile test data based on two attribute pairs" do
      ptn_esn_matcher
      @matcher.match

      @matcher.right_matches.size.should == 3
      @matcher.left_matches.size.should == 3
      @matcher.left_matches[@left_c].right_obj.should == @right_d
    end

    it "should fail to match records below the min_score threshold" do
      ptn_esn_matcher
      @matcher.min_score = 2.0
      @matcher.match

      @matcher.right_matches.size.should == 2
      @matcher.left_matches.size.should == 2
      @matcher.left_matches[@left_c].should be_nil
    end

  end # hash index and ActiveRecord tests

end
|
|
@@ -0,0 +1,221 @@
|
|
|
1
|
+
# Tests main functionality using array data stores and hash indexing.
|
|
2
|
+
# See ar_spec.rb for tests of ActiveRecord as the data store
|
|
3
|
+
# See redis_spec.rb for tests of Redis for indexing.
|
|
4
|
+
|
|
5
|
+
require 'rspec'
|
|
6
|
+
require 'date'
|
|
7
|
+
require File.expand_path(File.dirname(__FILE__) + '/../../lib/matching.rb')
|
|
8
|
+
include Matching
|
|
9
|
+
|
|
10
|
+
# Fixture types for the Deduplicator specs.
module DedupeSpecHelper
  # Record with the fields id, mid, esn, act_date and nilly; the specs
  # never assign nilly, so it stays nil.
  CellTxn = Struct.new(*[:id, :mid, :esn, :act_date, :nilly])
end
|
|
13
|
+
|
|
14
|
+
describe Deduplicator do
  include DedupeSpecHelper

  # Four records: ids 0 and 1 share a mid, ids 1-3 share an esn, and
  # ids 1 and 3 share an act_date. The nilly field is left unset.
  let(:array_store) do
    c1 = CellTxn.new(0, "7275554444", "11111111111", Date.new(2011,1,1))
    c2 = CellTxn.new(1, "7275554444", "22222222222", Date.new(2011,1,2))
    c3 = CellTxn.new(2, "8135552222", "22222222222", Date.new(2011,1,3))
    c4 = CellTxn.new(3, "8135552222", "22222222222", Date.new(2011,1,2))
    ArrayStore.new([c1,c2,c3,c4])
  end

  before(:each) do
    @deduper = Deduplicator.new(array_store)
  end

  subject { @deduper }
  specify { subject.index.should_not be_nil }

  describe :store do
    context "when not empty" do
      specify { subject.store.should_not be_nil }
    end

    context "when empty" do
      specify { expect { Deduplicator.new }.to raise_error }
    end
  end

  describe "match criteria" do
    it "adds match definitions to criteria array" do
      subject.match_attrs([:mid])
      subject.criteria.should == [[:mid]]
    end

    it "should convert single items into arrays when adding criteria" do
      subject.match_attrs(:mid)
      subject.criteria.should == [[:mid]]
    end

    it "has a flattened, unique array combining any and all criteria" do
      subject.match_attrs([:mid, :esn])
      subject.match_attrs([:date, :mid])
      ua = subject.unique_attrs
      ua.should have(3).items
      ua.should include(:mid)
      ua.should include(:esn)
      ua.should include(:date)
    end

    it "calls any and all via a block" do
      subject.define do
        match_attrs [:mid, :esn]
        match_attrs [:date, :esn]
      end

      subject.criteria.should == [[:mid, :esn], [:date, :esn]]
    end
  end

  it "indexes store values" do
    subject.define { match_attrs [:mid] }
    subject.create_index
    subject.index.get(:mid, "7275554444").should have(2).items
    subject.index.get(:mid, "8135552222").should have(2).items
    subject.index.get(:mid, "2055558888").should be_nil
  end

  describe "deduplicate" do
    context "single criteria arrays" do

      it "should deduplicate an ArrayStore on a single match criterion (1 of 3)" do
        subject.define do
          match_attrs :mid
        end

        subject.deduplicate
        subject.groups.count.should == 2
        subject.groups[0].count.should == 2
        subject.groups[1].count.should == 2
      end

      it "should deduplicate an ArrayStore on a single match criterion (2 of 3)" do
        subject.define do
          match_attrs :esn
        end

        subject.deduplicate
        subject.groups.count.should == 2
        subject.groups[0].count.should == 1
        subject.groups[1].count.should == 3
      end

      it "should deduplicate an ArrayStore on a single match criterion (3 of 3)" do
        subject.define do
          match_attrs :act_date
        end

        subject.deduplicate
        subject.groups.count.should == 3
      end

      it "should group with only nil values" do
        subject.define do
          match_attrs :nilly
        end

        subject.deduplicate
        subject.groups.count.should == 1
      end

      it "should group with some nil values" do
        subject.define do
          match_attrs [:mid, :nilly]
        end

        subject.deduplicate
        subject.groups.count.should == 2
      end

      it "should deduplicate an ArrayStore on multiple criteria" do
        subject.define do
          match_attrs [:esn, :act_date]
        end

        subject.deduplicate
        subject.groups.count.should == 3
      end
    end #single criteria arrays

    context "multiple criteria arrays" do

      # Same four records as array_store plus a fifth that shares mid and
      # esn with id 0 and mid and act_date with id 1.
      let(:larger_array_store) do
        c1 = CellTxn.new(0, "7275554444", "11111111111", Date.new(2011,1,1))
        c2 = CellTxn.new(1, "7275554444", "22222222222", Date.new(2011,1,2))
        c3 = CellTxn.new(2, "8135552222", "22222222222", Date.new(2011,1,3))
        c4 = CellTxn.new(3, "8135552222", "22222222222", Date.new(2011,1,2))
        c5 = CellTxn.new(4, "7275554444", "11111111111", Date.new(2011,1,2)) #hybrid of c1 and c2
        ArrayStore.new([c1,c2,c3,c4,c5])
      end

      it "should join groups that are joined by different match criteria" do
        # Named deduper (not subject) so RSpec's subject helper is not shadowed.
        deduper = Deduplicator.new(larger_array_store)
        deduper.define do
          match_attrs [:mid, :esn] #joins 0 and 4
          match_attrs [:mid, :act_date] #joins 1 and 4
        end

        deduper.deduplicate
        deduper.groups.count.should == 2 # expect [0,1,4],[2,3]
        two_group = deduper.groups.find { |grp| grp.size == 2 }
        two_group.should include(2,3)
        three_group = deduper.groups.find { |grp| grp.size == 3 }
        three_group.should include(0,1,4)
      end

      it "should return results with objects, group index, and item index" do
        deduper = Deduplicator.new(larger_array_store)
        deduper.define do
          match_attrs [:mid, :esn] #joins 0 and 4
          match_attrs [:mid, :act_date] #joins 1 and 4
        end

        deduper.deduplicate
        group_sum, item_sum = 0, 0
        deduper.each_with_groups do |_obj, grp_idx, item_idx|
          group_sum += grp_idx
          item_sum += item_idx
        end

        # Expected (group, item) index pairs:
        #grp: 0, item: 0
        #grp: 0, item: 1
        #grp: 0, item: 2
        #grp: 1, item: 0
        #grp: 1, item: 1

        group_sum.should == 2
        item_sum.should == 4
      end
    end #multiple criteria arrays
  end #deduplication

  context "integration tests" do

    it "should deduplicate on a common key" do
      sample_path = File.join(File.dirname(__FILE__),'/../samples/agent_recs.csv')

      # Only the first 200 sample lines are used. File.foreach closes the
      # file when iteration stops; the former File.open(...).each left the
      # handle open.
      txns = File.foreach(sample_path).first(200).map do |line|
        parts = line.split ','
        CellTxn.new(parts[0],parts[1],parts[2],parts[3])
      end

      deduper = Deduplicator.new(ArrayStore.new(txns))
      deduper.define do
        match_attrs :act_date
      end

      deduper.deduplicate

      dates = txns.map { |txn| txn.act_date }

      deduper.groups.size.should == dates.uniq.count
    end
  end #integration
end
|