matching 0.14.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.14.1
data/lib/matching.rb ADDED
@@ -0,0 +1,11 @@
1
# Library components, listed in the order they are loaded.
files = %w[attribute_pair array_store similarity hash_index match matcher deduplicator]

# Require every component by absolute path, resolved from this file's directory.
files.each do |component|
  component_path = File.dirname(__FILE__) + "/matching/#{component}.rb"
  require File.expand_path(component_path)
end
@@ -0,0 +1,30 @@
1
+ require 'active_record'
2
+
3
module Matching

  # Adapter that lets Matcher read records through a model object,
  # optionally narrowed by a where clause.
  class ActiveRelationStore

    # model        - object answering where(clause) and find(id)
    # where_clause - passed verbatim to model.where; nil when omitted
    def initialize(model, where_clause = nil)
      @where_clause = where_clause
      @model = model
    end

    # The model records are read from.
    def model
      @model
    end

    # The condition handed to where, or nil.
    def where_clause
      @where_clause
    end

    # Walks the selected records batch by batch, handing each record and
    # its id to the block.
    def each(&blk)
      scope = @model.where(@where_clause)
      scope.find_in_batches do |batch|
        batch.each { |record| blk.call(record, record.id) }
      end
    end

    # Looks a single record up by the id reported from #each.
    def find(id)
      @model.find(id)
    end

  end
end
@@ -0,0 +1,23 @@
1
module Matching

  # Adapter that lets Matcher read records from an in-memory array, using
  # each element's position as its id.
  class ArrayStore

    def initialize(arr)
      @arr = arr
    end

    # The wrapped array.
    def arr
      @arr
    end

    # Hands every element, followed by its index, to the block.
    def each(&blk)
      @arr.each_with_index(&blk)
    end

    # Fetches the element stored at the given index position.
    def find(idx)
      @arr[idx]
    end

  end
end
@@ -0,0 +1,17 @@
1
module Matching

  # Describes one comparison between an attribute of the "left" class and
  # an attribute of the "right" class, together with the weight the
  # comparison contributes and whether it is a fuzzy comparison.
  class AttributePair
    attr_reader :left_attr, :right_attr, :weight, :is_fuzzy

    # Raises RuntimeError unless weight is greater than zero.
    def initialize(left_attr, right_attr, weight, is_fuzzy = false)
      raise "Weight must be > 0.0" unless weight > 0.0

      @left_attr, @right_attr = left_attr, right_attr
      @weight, @is_fuzzy = weight, is_fuzzy
    end
  end
end
@@ -0,0 +1,133 @@
1
module Matching

  # Partitions the records of a store into groups of duplicates.
  #
  # Every call to #match_attrs adds one criterion: a list of attributes
  # whose non-nil values must all be equal for two records to count as
  # duplicates under that criterion. Records linked through any criterion,
  # directly or through a chain of other records, end up in the same group.
  # Records whose attributes are nil for a criterion, and that no other
  # criterion placed in a group, are collected in one trailing "nil group".
  class Deduplicator

    attr_accessor :store, :index, :criteria
    attr_accessor :groups  # array of arrays of duplicate record ids in form [[1,5],[2,3,4],[6]]
    attr_accessor :grouped # hash of every id present in @groups, in form { id => index_of_groups_object }

    # store - object responding to each (yielding object and id) and find(id)
    # opts  - :redis_db with an integer value >= 1 selects RedisIndex as the
    #         backing index; anything else selects HashIndex
    #
    # Raises RuntimeError when store is nil.
    def initialize(store, opts = {})
      raise 'Store parameter required' unless store
      opts ||= {}
      @store = store

      @criteria = []

      # Create an index using either a hash or Redis as the backing store
      if opts[:redis_db] && opts[:redis_db].to_i >= 1
        @index = RedisIndex.new(opts[:redis_db])
      else
        @index = HashIndex.new
      end
    end

    # Adds one criterion. Accepts a single attribute name or an array of them.
    def match_attrs(attrs)
      @criteria << [*attrs] # converts to array if not already, doesn't affect arrays
    end

    # All attribute names used by any criterion, without repeats.
    def unique_attrs
      @criteria.flatten.uniq
    end

    # Evaluates the block in the context of this deduplicator so that
    # match_attrs can be called without a receiver.
    def define(&block)
      instance_eval(&block)
    end

    # Builds @groups and @grouped from the store and the defined criteria.
    #
    # Returns @groups. Raises RuntimeError (from #create_index) when no
    # criterion has been defined.
    def deduplicate
      @groups = []   # Array of arrays containing ids of grouped objects
      @nil_group = [] # Ids whose attributes were all nil for some criterion
      @grouped = {}  # Hash of each object's id to the index of @groups in which it's found

      # Index all records in the store to speed search
      create_index

      @store.each do |obj, id|
        puts "On #{id}" if id.is_a?(Integer) && id > 0 && id % 100 == 0

        # Shortcut the process if there is only one criterion and this
        # object is already grouped (it cannot match a second time)
        next if @criteria.size == 1 && @grouped.key?(id)

        @criteria.each do |attrs|
          matched_ids = matching_ids(obj, id, attrs)

          if matched_ids
            assign_to_group(matched_ids)
          else
            @nil_group << id
          end
        end
      end

      finalize_groups
      @groups
    end

    # Stores every unique match attribute of every record in @index.
    #
    # Raises RuntimeError when no criterion has been defined.
    def create_index
      raise 'Deduplicator requires at least one match attribute be defined' unless @criteria.any?

      attrs = unique_attrs
      @store.each do |obj, id|
        attrs.each { |ma| @index.put(ma, obj.send(ma), id) }
      end
    end

    # Yields each object in the store along with its group's index and its
    # index within the group. For example...
    # group_idx | idx | name
    #    0      |  0  | Fred Smith
    #    0      |  1  | Fred Smith
    #    1      |  0  | Jane Green
    #    2      |  0  | Linda Smythe
    #    2      |  1  | Linda Smythe
    #
    # Returns an Enumerator when called without a block.
    def each_with_groups
      return enum_for(:each_with_groups) unless block_given?

      @groups.each_with_index do |arr, grp_idx|
        arr.each_with_index do |obj_id, obj_idx|
          yield(@store.find(obj_id), grp_idx, obj_idx)
        end
      end
    end

    private

    # Returns the ids of all records matching obj on every non-nil attribute
    # of one criterion, or nil when all of those attributes are nil.
    def matching_ids(obj, id, attrs)
      ids = nil

      attrs.each do |match_attr|
        val = obj.send(match_attr)
        next if val.nil?

        found = @index.get(match_attr, val) || []
        ids = ids ? ids & found : found
      end

      return nil if ids.nil?

      # Always hand back a fresh array: groups grow in place, and that must
      # never mutate the id list owned by the index.
      ids.empty? ? [id] : ids.dup
    end

    # Puts all matched ids into one group, merging every existing group that
    # any of them already belongs to.
    def assign_to_group(matched_ids)
      slots = matched_ids.map { |mid| @grouped[mid] }.compact.uniq.sort

      if slots.empty?
        @groups << matched_ids
        target = @groups.size - 1
        matched_ids.each { |mid| @grouped[mid] = target }
        return
      end

      target = slots.first
      mega_group = @groups[target]

      # Merged slots are cleared rather than deleted so that the indexes held
      # in @grouped stay valid for the rest of the pass.
      slots.drop(1).each do |slot|
        @groups[slot].each do |mid|
          mega_group << mid
          @grouped[mid] = target
        end
        @groups[slot] = nil
      end

      matched_ids.each do |mid|
        next if @grouped.key?(mid)
        mega_group << mid
        @grouped[mid] = target
      end
    end

    # Drops the slots emptied by merges, renumbers @grouped accordingly and
    # appends the nil group (ids that never joined a real group), if any.
    def finalize_groups
      @groups.compact!

      @grouped = {}
      @groups.each_with_index do |group, grp_idx|
        group.each { |gid| @grouped[gid] = grp_idx }
      end

      @nil_group = @nil_group.uniq.reject { |nid| @grouped.key?(nid) }
      return if @nil_group.empty?

      @groups << @nil_group
      nil_idx = @groups.size - 1
      @nil_group.each { |nid| @grouped[nid] = nil_idx }
    end

  end # class
end # module
@@ -0,0 +1,25 @@
1
module Matching

  # In-memory index mapping attribute -> value -> array of object ids.
  class HashIndex

    attr_reader :hashes

    def initialize
      # holds one nested hash per attribute
      @hashes = {}
    end

    # Records id under the given attribute and value. Nil values are not
    # indexed, in which case nil is returned.
    def put(attr, val, id)
      return if val.nil?

      per_value = (@hashes[attr] ||= {})
      (per_value[val] ||= []) << id
    end

    # Returns the array of ids recorded for the attribute and value, or nil
    # when nothing has been recorded for them.
    def get(attr, val)
      per_value = @hashes[attr]
      return nil unless per_value

      per_value[val]
    end

  end
end
@@ -0,0 +1,14 @@
1
module Matching

  # A left object and a right object paired by the matcher, with the score
  # that was awarded to the pairing.
  class Match
    attr_accessor :score

    def initialize(left_obj, right_obj, score)
      @left_obj, @right_obj, @score = left_obj, right_obj, score
    end

    # The object taken from the left side.
    def left_obj
      @left_obj
    end

    # The object taken from the right side.
    def right_obj
      @right_obj
    end
  end
end
@@ -0,0 +1,266 @@
1
module Matching

  # Pairs objects from a "left" store with objects from a "right" store.
  #
  # Candidates are found through join pairs (attributes that must be equal),
  # scored with join pairs, compare pairs and custom functions, and each
  # right object is awarded to the best-scoring left object.
  class Matcher
    attr_accessor :min_score
    attr_reader :left_store, :right_store
    attr_reader :join_pairs, :compare_pairs, :custom_functions, :filter_functions
    attr_reader :left_matches, :right_matches
    attr_reader :right_index

    # Builds a matcher from opts (see #initialize) and evaluates the block,
    # if given, in the new matcher's context. Returns the matcher.
    def self.define(opts = nil, &block)
      m = new(opts || {})
      m.define(&block) if block
      m
    end

    # opts - :left_store, :right_store: stores responding to each and find
    #        :min_score: lowest score accepted as a match (default 1.0)
    #        :redis_db: integer >= 1 selects RedisIndex for the right index;
    #                   anything else selects HashIndex
    def initialize(opts = {})
      opts ||= {}
      @left_store = opts[:left_store]
      @right_store = opts[:right_store]
      @min_score = opts[:min_score] || 1.0

      @join_pairs = []
      @compare_pairs = []
      @custom_functions = []
      @filter_functions = []
      @right_matches = {} # hash keyed on right_class records, used during main rec loop
      @left_matches = {}  # hash keyed on left_class records, created after main rec loop from reverse of @right_matches
      @left_losers = []   # array of left objects that were matched to right records then unmatched, requiring re-match attempt

      # Create @right_index using either a hash or Redis as the backing store
      if opts[:redis_db] && opts[:redis_db].to_i >= 1
        @right_index = RedisIndex.new(opts[:redis_db])
      else
        @right_index = HashIndex.new
      end
    end

    # Compare left and right arguments and return similarity as a floating point
    # value where 0.0 represents no similarity and 1.0 represents equality.
    #
    # opts - :fuzzy => true delegates to left.similarity_to(right, opts)
    #
    # Raises ArgumentError when the values are of different classes, or when
    # a fuzzy comparison is requested for a type without similarity_to.
    def compare_values(left, right, opts = {})
      return 0.0 unless left && right

      raise ArgumentError, "Cannot compare values of dissimilar type - left = #{left}, right = #{right}" unless left.class == right.class

      if opts[:fuzzy]
        raise ArgumentError, "Cannot calculate fuzzy comparison for type #{left.class}" unless left.respond_to?(:similarity_to)
        left.similarity_to(right, opts)
      else
        (left == right ? 1.0 : 0.0)
      end
    end

    # Evaluates the block in the context of this matcher so that join,
    # compare, custom and filter can be called without a receiver.
    def define(&block)
      instance_eval(&block)
    end

    # One or more join attributes are required for a match between two records
    # to occur. Attributes must be equal.
    def join(left_attr, right_attr, weight)
      @join_pairs << AttributePair.new(left_attr, right_attr, weight)
    end

    # For records matched via join attributes, comparisons may be applied to
    # adjust the score.
    def compare(left_attr, right_attr, weight, is_fuzzy = false)
      @compare_pairs << AttributePair.new(left_attr, right_attr, weight, is_fuzzy)
    end

    # Custom functions may adjust the score beyond the simple comparisons
    # performed via @compare_pairs. Each is called with (left_obj, right_obj)
    # and its result is added to the score.
    def custom(lmbda)
      @custom_functions << lmbda
    end

    # Filter lambdas are called with (left_obj, right_obj). A pair keeps its
    # score only when every filter returns a truthy value; a false or nil
    # result resets the score to 0.
    def filter(lmbda)
      @filter_functions << lmbda
    end

    # Given join, compare, and custom rules, return the floating point
    # matching score of two objects.
    def score_pair(left_obj, right_obj)
      score = 0

      @join_pairs.each do |pair|
        score += pair.weight * compare_values(left_obj.send(pair.left_attr), right_obj.send(pair.right_attr))
      end

      @compare_pairs.each do |pair|
        # compare_values expects an options hash, not the bare flag
        score += pair.weight * compare_values(left_obj.send(pair.left_attr), right_obj.send(pair.right_attr), fuzzy: pair.is_fuzzy)
      end

      @custom_functions.each do |lmbda|
        score += lmbda.call(left_obj, right_obj)
      end

      @filter_functions.each do |lmbda|
        score = 0 unless lmbda.call(left_obj, right_obj)
      end

      score
    end

    # Perform matching. Yields each left object before it is evaluated when
    # a block is given.
    #
    # Raises ArgumentError unless both stores are set, and RuntimeError
    # (from #index_right_objects) when no join pair is defined.
    def match
      unless @left_store && @right_store
        raise ArgumentError, "Matcher requires left_store and right_store attributes"
      end

      # Cached exception lists describe an earlier state; drop them.
      @left_exceptions = nil
      @right_exceptions = nil

      # Index right objects to speed search
      index_right_objects

      # Evaluate each left record for matches.
      # If more than one match is found, the best-possible match
      # will be awarded the match unless another object is already
      # matched to it. Conflicts are resolved in a separate method.
      @left_store.each do |left_obj|
        yield left_obj if block_given?

        # Results are pre-sorted with the best matches first
        ranked_matches = find_matches(left_obj)

        # Attempt to pair the left_object with one of the
        # ranked right matches
        pair_matches(left_obj, ranked_matches)
      end

      # Give displaced left objects another chance to find a match
      evaluate_left_losers

      # Populate left_matches as the mirror of right_matches
      @right_matches.each { |_right_obj, match| @left_matches[match.left_obj] = match }
    end

    # Indexes attributes from right objects in @right_index (either hash or Redis, see
    # initialize). For each join_pair, store the attribute's values in the form:
    #   attr:val -> [array_of_ids]
    def index_right_objects
      # Require at least one join pair else would execute in quadratic time
      raise 'Matcher requires at least one join pair to be defined' unless @join_pairs.any?

      @right_store.each do |right_obj, id|
        @join_pairs.each { |jp| @right_index.put(jp.right_attr, right_obj.send(jp.right_attr), id) }
      end
    end

    # Return the scored matches for the left_object argument that reach
    # @min_score, best score first.
    # Results are in an ordered array of form [[right_obj_a, score_a], [right_obj_b, score_b], ...]
    def find_matches(left_obj)
      potential_matches = find_potential_matches(left_obj)
      ranked_pairs = []

      potential_matches.each do |right_obj|
        score = score_pair(left_obj, right_obj)
        ranked_pairs << [right_obj, score] if score >= @min_score
      end

      ranked_pairs.sort! { |a, b| a[1] <=> b[1] }
      ranked_pairs.reverse
    end

    # Return an array of right_objects that match the left_object by
    # join criteria. This is equivalent to an index lookup. No scoring
    # is done by this method.
    def find_potential_matches(left_obj)
      right_ids = []

      @join_pairs.each do |jp|
        left_val = left_obj.send(jp.left_attr)
        next if left_val.nil? || left_val == ''

        matches = @right_index.get(jp.right_attr, left_val)
        right_ids |= matches if matches
      end

      # right_ids holds right object ids; retrieve the objects themselves.
      right_ids.map { |r_id| @right_store.find(r_id) }
    end

    # Evaluate and possibly create a Match object to join the
    # left_object to one of the right_objects from the
    # ranked_matches array
    def pair_matches(left_obj, ranked_matches)
      ranked_matches.each do |right_obj, score|
        current = @right_matches[right_obj]

        # The right object is taken by an equal or better fit; keep looking.
        next if current && !(score > current.score)

        # The displaced left object must be re-evaluated later.
        @left_losers << current.left_obj if current

        @right_matches[right_obj] = Match.new(left_obj, right_obj, score)
        break
      end
    end

    # Attempt to find matches while any left losers remain
    def evaluate_left_losers
      until @left_losers.empty?
        # Work on the current batch; pair_matches may queue new losers
        working_losers = @left_losers
        @left_losers = []

        working_losers.each do |left_obj|
          pair_matches(left_obj, find_matches(left_obj))
        end
      end
    end

    # Returns array of non-matched left objects
    def left_exceptions
      @left_exceptions ||= exceptions(:left)
    end

    # Returns array of non-matched right objects
    def right_exceptions
      @right_exceptions ||= exceptions(:right)
    end

    # Returns the objects of the given side (:left, otherwise right) that
    # have no entry in that side's matches hash.
    def exceptions(side)
      if side == :left
        store, matches = @left_store, @left_matches
      else
        store, matches = @right_store, @right_matches
      end

      arr = []
      store.each do |obj|
        arr << obj unless matches[obj]
      end
      arr
    end

    # Returns all Match objects, one per matched left object.
    def matches
      @left_matches.values
    end
  end # class
end # module