slide_rule 0.2.0 → 0.2.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +4 -0
- data/README.md +44 -5
- data/lib/slide_rule/distance_calculator.rb +72 -12
- data/lib/slide_rule/distance_calculators/day_of_month.rb +1 -0
- data/lib/slide_rule/distance_calculators/day_of_year.rb +1 -0
- data/lib/slide_rule/version.rb +1 -1
- data/spec/slide_rule/distance_calculator_spec.rb +87 -8
- data/spec/slide_rule/distance_calculators/day_of_month_spec.rb +4 -0
- data/spec/slide_rule/distance_calculators/day_of_year_spec.rb +4 -0
- data/spec/slide_rule/distance_calculators/levenshtein_spec.rb +10 -2
- data/spec/spec_helper.rb +1 -0
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f3035bfd6643001929a84db47e8de72911901eb6
|
4
|
+
data.tar.gz: 0a2f570fb3371d5814a3247aea005676525e69aa
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 282666dafc39dad85c6850c460e7bf2babe7ad3b38b54469ae0df5b91a9802400f3ed50626507712cc7a5afa1173add644ec229704133fe5b60858fa986b2d12
|
7
|
+
data.tar.gz: 8c167275bb751444636e25fd1c197566e3dde22f88d30a30f8d9a670a786f897252991429ade2e37efa766efbe5c9dfd3982c73d896780df61d6f869e075b92d
|
data/.gitignore
CHANGED
data/README.md
CHANGED
@@ -19,25 +19,26 @@ _Note: weights are assumed to be equal if not provided_
|
|
19
19
|
|
20
20
|
#API
|
21
21
|
|
22
|
-
##Describe the field calculators
|
22
|
+
##Describe the field distance calculators
|
23
23
|
|
24
24
|
Each field to be considered in the distance calculation should be described
|
25
25
|
with a calculation method and an optional weight
|
26
26
|
|
27
27
|
Valid calculators:
|
28
28
|
|
29
|
-
*
|
30
|
-
*
|
29
|
+
* day_of_year
|
30
|
+
* day_of_month
|
31
|
+
* levenshtein
|
31
32
|
|
32
33
|
```ruby
|
33
34
|
distance_rules = {
|
34
35
|
:description => {
|
35
36
|
:weight => 0.80,
|
36
|
-
:
|
37
|
+
:calculator => :levenshtein,
|
37
38
|
},
|
38
39
|
:date => {
|
39
40
|
:weight => 0.90,
|
40
|
-
:
|
41
|
+
:calculator => :day_of_month,
|
41
42
|
},
|
42
43
|
}
|
43
44
|
```
|
@@ -81,3 +82,41 @@ matcher.closest_match(candidate, [example, example2], 0.2)
|
|
81
82
|
=> example
|
82
83
|
|
83
84
|
```
|
85
|
+
|
86
|
+
## Custom Field Distance Calculators
|
87
|
+
|
88
|
+
To define a custom field distance calculator, define a class with a `calculate(value1, value2)` method.
|
89
|
+
|
90
|
+
Requirements:
|
91
|
+
* Class must be stateless
|
92
|
+
* `calculate` should return a float from `0.0` (perfect match) to `1.0` (no match)
|
93
|
+
* Calculation should not be order-dependent (i.e. `calculate(a, b) == calculate(b, a)`)
|
94
|
+
|
95
|
+
```ruby
|
96
|
+
class StringLengthCalculator
|
97
|
+
def calculate(l1, l2)
|
98
|
+
diff = (l1 - l2).abs.to_f
|
99
|
+
return diff / [l1, l2].max
|
100
|
+
end
|
101
|
+
end
|
102
|
+
|
103
|
+
matcher = ::SlideRule::DistanceCalculator.new(
|
104
|
+
:length => {
|
105
|
+
:weight => 1.0,
|
106
|
+
:calculator => StringLengthCalculator
|
107
|
+
}
|
108
|
+
)
|
109
|
+
|
110
|
+
# Find the string with the closest length
|
111
|
+
matcher.closest_match("Howdy Doody Time!", ["Felix the cat", "Mighty Mouse"], 0.5)
|
112
|
+
# => { :item=>"Felix the cat", :distance=>0.23529411764705882 }
|
113
|
+
```
|
114
|
+
|
115
|
+
See the [distance_calculators](https://github.com/mattnichols/slide_rule/tree/master/lib/slide_rule/distance_calculators) directory in source for more examples.
|
116
|
+
|
117
|
+
|
118
|
+
# To Do
|
119
|
+
|
120
|
+
* Add more field distance calculators
|
121
|
+
|
122
|
+
|
@@ -1,7 +1,9 @@
|
|
1
1
|
module SlideRule
|
2
2
|
class DistanceCalculator
|
3
|
+
attr_accessor :rules
|
4
|
+
|
3
5
|
def initialize(rules)
|
4
|
-
@rules =
|
6
|
+
@rules = prepare_rules(rules)
|
5
7
|
end
|
6
8
|
|
7
9
|
# TODO: Figure this out. Very inefficient!
|
@@ -20,8 +22,15 @@ module SlideRule
|
|
20
22
|
end
|
21
23
|
end
|
22
24
|
|
23
|
-
def closest_match(obj, array, threshold)
|
24
|
-
matches(obj, array, threshold).
|
25
|
+
def closest_match(obj, array, threshold = 1.0)
|
26
|
+
matches(obj, array, threshold).sort_by { |match| match[:distance] }.first
|
27
|
+
end
|
28
|
+
|
29
|
+
def closest_matching_item(obj, array, threshold = 1.0)
|
30
|
+
match = closest_match(obj, array, threshold)
|
31
|
+
return nil if match.nil?
|
32
|
+
|
33
|
+
match[:item]
|
25
34
|
end
|
26
35
|
|
27
36
|
def is_match?(obj_1, obj_2, threshold)
|
@@ -32,7 +41,7 @@ module SlideRule
|
|
32
41
|
def matches(obj, array, threshold)
|
33
42
|
array.map do |item|
|
34
43
|
distance = calculate_distance(obj, item)
|
35
|
-
next nil unless distance
|
44
|
+
next nil unless distance <= threshold
|
36
45
|
{
|
37
46
|
item: item,
|
38
47
|
distance: distance
|
@@ -48,23 +57,40 @@ module SlideRule
|
|
48
57
|
# {
|
49
58
|
# :attribute_name => {
|
50
59
|
# :weight => 0.90,
|
51
|
-
# :
|
60
|
+
# :calculator => :distance_calculator,
|
52
61
|
# }
|
53
62
|
# }
|
54
63
|
def calculate_distance(i1, i2)
|
55
|
-
|
64
|
+
calculate_weighted_distances(i1, i2).reduce(0.0) do |distance, obj|
|
65
|
+
distance + (obj[:distance] * obj[:weight])
|
66
|
+
end
|
67
|
+
end
|
68
|
+
|
69
|
+
private
|
70
|
+
|
71
|
+
def calculate_weighted_distances(i1, i2)
|
72
|
+
distances = @rules.map do |attribute, rule|
|
56
73
|
val1 = i1.send(attribute)
|
57
74
|
val2 = i2.send(attribute)
|
58
|
-
|
59
|
-
|
60
|
-
|
75
|
+
distance = rule[:calculator].calculate(val1, val2)
|
76
|
+
next { distance: distance.to_f, weight: rule[:weight] } unless distance.nil?
|
77
|
+
|
78
|
+
nil
|
79
|
+
end
|
80
|
+
normalize_weights_array(distances) if distances.compact!
|
81
|
+
|
82
|
+
distances
|
61
83
|
end
|
62
84
|
|
63
85
|
def get_calculator(calculator)
|
64
86
|
return calculator.new if calculator.is_a?(Class)
|
65
87
|
|
66
88
|
klass_name = "#{calculator.to_s.split('_').collect(&:capitalize).join}"
|
67
|
-
klass =
|
89
|
+
klass = begin
|
90
|
+
::SlideRule::DistanceCalculators.const_get(klass_name)
|
91
|
+
rescue(::NameError)
|
92
|
+
nil
|
93
|
+
end
|
68
94
|
|
69
95
|
fail ArgumentError, "Unable to find calculator #{klass_name}" if klass.nil?
|
70
96
|
|
@@ -73,12 +99,46 @@ module SlideRule
|
|
73
99
|
|
74
100
|
# Ensures all weights add up to 1.0
|
75
101
|
#
|
76
|
-
def normalize_weights(
|
77
|
-
rules = rules_hash.dup
|
102
|
+
def normalize_weights(rules)
|
78
103
|
weight_total = rules.map { |_attr, rule| rule[:weight] }.reduce(0.0, &:+)
|
79
104
|
rules.each do |_attr, rule|
|
80
105
|
rule[:weight] = rule[:weight] / weight_total
|
81
106
|
end
|
82
107
|
end
|
108
|
+
|
109
|
+
# Ensures all weights add up to 1.0 in array of hashes
|
110
|
+
#
|
111
|
+
def normalize_weights_array(rules)
|
112
|
+
weight_total = rules.map { |rule| rule[:weight] }.reduce(0.0, &:+)
|
113
|
+
rules.each do |rule|
|
114
|
+
rule[:weight] = rule[:weight] / weight_total
|
115
|
+
end
|
116
|
+
end
|
117
|
+
|
118
|
+
# Prepares a duplicate of given rules hash with normalized weights and calculator instances
|
119
|
+
#
|
120
|
+
def prepare_rules(rules)
|
121
|
+
prepared_rules = rules.each_with_object({}) do |(attribute, rule), copy|
|
122
|
+
rule = copy[attribute] = safe_dup(rule)
|
123
|
+
|
124
|
+
if rule[:type]
|
125
|
+
puts 'Rule key `:type` is deprecated. Use `:calculator` instead.'
|
126
|
+
rule[:calculator] = rule[:type]
|
127
|
+
end
|
128
|
+
|
129
|
+
rule[:calculator] = get_calculator(rule[:calculator])
|
130
|
+
|
131
|
+
copy
|
132
|
+
end
|
133
|
+
prepared_rules = normalize_weights(prepared_rules)
|
134
|
+
|
135
|
+
prepared_rules
|
136
|
+
end
|
137
|
+
|
138
|
+
def safe_dup(obj)
|
139
|
+
obj.dup
|
140
|
+
rescue
|
141
|
+
obj
|
142
|
+
end
|
83
143
|
end
|
84
144
|
end
|
data/lib/slide_rule/version.rb
CHANGED
@@ -19,6 +19,12 @@ describe ::SlideRule::DistanceCalculator do
|
|
19
19
|
end
|
20
20
|
end
|
21
21
|
|
22
|
+
class NilCalc
|
23
|
+
def calculate(_first, _second)
|
24
|
+
nil
|
25
|
+
end
|
26
|
+
end
|
27
|
+
|
22
28
|
let(:examples) do
|
23
29
|
[
|
24
30
|
::ExampleTransaction.new(amount: 25.00, date: '2015-02-05', description: 'Audible.com'),
|
@@ -36,22 +42,32 @@ describe ::SlideRule::DistanceCalculator do
|
|
36
42
|
::SlideRule::DistanceCalculator.new(
|
37
43
|
description: {
|
38
44
|
weight: 0.80,
|
39
|
-
|
45
|
+
calculator: :levenshtein
|
40
46
|
},
|
41
47
|
date: {
|
42
48
|
weight: 0.90,
|
43
|
-
|
49
|
+
calculator: :day_of_month
|
44
50
|
}
|
45
51
|
)
|
46
52
|
end
|
47
53
|
|
48
|
-
it 'finds
|
54
|
+
it 'finds closest' do
|
49
55
|
example = ExampleTransaction.new(description: 'Wells Fargo Dealer SVC', date: '2015-06-17')
|
50
56
|
expect(calculator.closest_match(example, examples, 0.2)[:item]).to eq(examples[3])
|
51
57
|
|
52
58
|
example = ExampleTransaction.new(description: 'Audible.com', date: '2015-06-05')
|
53
59
|
expect(calculator.closest_match(example, examples, 0.2)[:item]).to eq(examples[0])
|
54
60
|
end
|
61
|
+
|
62
|
+
it 'with default threshold' do
|
63
|
+
example = ExampleTransaction.new(description: 'Audible.com', date: '2015-06-05')
|
64
|
+
expect(calculator.closest_match(example, examples)[:item]).to eq(examples[0])
|
65
|
+
end
|
66
|
+
|
67
|
+
it 'finds closest matching item' do
|
68
|
+
example = ExampleTransaction.new(description: 'Audible.com', date: '2015-06-05')
|
69
|
+
expect(calculator.closest_matching_item(example, examples)).to eq(examples[0])
|
70
|
+
end
|
55
71
|
end
|
56
72
|
|
57
73
|
describe '#is_match?' do
|
@@ -89,11 +105,11 @@ describe ::SlideRule::DistanceCalculator do
|
|
89
105
|
calculator = ::SlideRule::DistanceCalculator.new(
|
90
106
|
description: {
|
91
107
|
weight: 1.00,
|
92
|
-
|
108
|
+
calculator: :levenshtein
|
93
109
|
},
|
94
110
|
date: {
|
95
111
|
weight: 0.50,
|
96
|
-
|
112
|
+
calculator: :day_of_month
|
97
113
|
}
|
98
114
|
)
|
99
115
|
example = ::ExampleTransaction.new(amount: 25.00, date: '2015-02-05', description: 'Audible.com')
|
@@ -105,11 +121,11 @@ describe ::SlideRule::DistanceCalculator do
|
|
105
121
|
calculator = ::SlideRule::DistanceCalculator.new(
|
106
122
|
description: {
|
107
123
|
weight: 0.50,
|
108
|
-
|
124
|
+
calculator: :levenshtein
|
109
125
|
},
|
110
126
|
date: {
|
111
127
|
weight: 0.50,
|
112
|
-
|
128
|
+
calculator: :day_of_month
|
113
129
|
}
|
114
130
|
)
|
115
131
|
example = ::ExampleTransaction.new(amount: 25.00, date: '2015-02-05', description: 'Audible.com')
|
@@ -125,6 +141,23 @@ describe ::SlideRule::DistanceCalculator do
|
|
125
141
|
distance = calculator.calculate_distance(example, candidate)
|
126
142
|
expect(distance.round(4)).to eq(((3.0 * 0.5 / 15) + (4.0 * 0.5 / 11)).round(4))
|
127
143
|
end
|
144
|
+
|
145
|
+
it 'should renormalize on nil' do
|
146
|
+
calculator = ::SlideRule::DistanceCalculator.new(
|
147
|
+
description: {
|
148
|
+
weight: 0.50,
|
149
|
+
calculator: :levenshtein
|
150
|
+
},
|
151
|
+
date: {
|
152
|
+
weight: 0.50,
|
153
|
+
calculator: NilCalc
|
154
|
+
}
|
155
|
+
)
|
156
|
+
example1 = ::ExampleTransaction.new(amount: 25.00, date: '2015-02-05', description: 'Audible.com')
|
157
|
+
example2 = ::ExampleTransaction.new(amount: 25.00, date: '2015-06-08', description: 'Audible Inc')
|
158
|
+
|
159
|
+
expect(calculator.calculate_distance(example1, example2).round(4)).to eq((4.0 / 11).round(4))
|
160
|
+
end
|
128
161
|
end
|
129
162
|
|
130
163
|
context 'uses custom calculator' do
|
@@ -132,7 +165,7 @@ describe ::SlideRule::DistanceCalculator do
|
|
132
165
|
calculator = ::SlideRule::DistanceCalculator.new(
|
133
166
|
description: {
|
134
167
|
weight: 1.00,
|
135
|
-
|
168
|
+
calculator: CustomCalc
|
136
169
|
}
|
137
170
|
)
|
138
171
|
example = ::ExampleTransaction.new
|
@@ -142,5 +175,51 @@ describe ::SlideRule::DistanceCalculator do
|
|
142
175
|
expect(distance).to eq(0.9)
|
143
176
|
end
|
144
177
|
end
|
178
|
+
|
179
|
+
describe '#initialize' do
|
180
|
+
context 'validates rules on initialize' do
|
181
|
+
it 'should allow :type' do
|
182
|
+
::SlideRule::DistanceCalculator.new(
|
183
|
+
description: {
|
184
|
+
weight: 1.00,
|
185
|
+
type: CustomCalc
|
186
|
+
}
|
187
|
+
)
|
188
|
+
end
|
189
|
+
|
190
|
+
it 'should not modify input rule hash' do
|
191
|
+
rules = {
|
192
|
+
description: {
|
193
|
+
weight: 1.0,
|
194
|
+
calculator: CustomCalc
|
195
|
+
},
|
196
|
+
name: {
|
197
|
+
weight: 1.0,
|
198
|
+
type: CustomCalc
|
199
|
+
}
|
200
|
+
}
|
201
|
+
::SlideRule::DistanceCalculator.new(rules)
|
202
|
+
# Run a second time: if the first call had written a calculator instance into `rules`, this call would raise an ArgumentError.
|
203
|
+
::SlideRule::DistanceCalculator.new(rules)
|
204
|
+
|
205
|
+
# :type should still be in original hash
|
206
|
+
expect(rules[:name].key?(:calculator)).to eq(false)
|
207
|
+
|
208
|
+
# :weight should not be normalized in original hash
|
209
|
+
expect(rules[:name][:weight]).to eq(1.0)
|
210
|
+
end
|
211
|
+
|
212
|
+
it 'should raise error if not valid calculator' do
|
213
|
+
expect do
|
214
|
+
::SlideRule::DistanceCalculator.new(
|
215
|
+
description: {
|
216
|
+
weight: 1.00,
|
217
|
+
calculator: :some_junk
|
218
|
+
}
|
219
|
+
)
|
220
|
+
end.to raise_error(::ArgumentError, 'Unable to find calculator SomeJunk')
|
221
|
+
end
|
222
|
+
end
|
223
|
+
end
|
145
224
|
end
|
146
225
|
end
|
@@ -6,6 +6,10 @@ describe ::SlideRule::DistanceCalculators::DayOfMonth do
|
|
6
6
|
expect(described_class.new.calculate('2012-03-19', '2014-08-19')).to eq(0.0)
|
7
7
|
end
|
8
8
|
|
9
|
+
it 'should accept epoch date' do
|
10
|
+
expect(described_class.new.calculate(1_444_262_400, 1_444_262_400)).to eq(0.0)
|
11
|
+
end
|
12
|
+
|
9
13
|
it 'should calculate when date is in the same month' do
|
10
14
|
expect(described_class.new.calculate('2012-03-19', '2014-08-22')).to eq(3.0 / 15)
|
11
15
|
expect(described_class.new.calculate('2012-03-19', '2014-08-09')).to eq(10.0 / 15)
|
@@ -5,6 +5,10 @@ describe ::SlideRule::DistanceCalculators::DayOfYear do
|
|
5
5
|
it 'should return a 0 distance' do
|
6
6
|
expect(described_class.new.calculate('2015-10-8', '2015-10-8')).to eq(0.0)
|
7
7
|
end
|
8
|
+
|
9
|
+
it 'should accept epoch date' do
|
10
|
+
expect(described_class.new.calculate(1_444_262_400, 1_444_262_400)).to eq(0.0)
|
11
|
+
end
|
8
12
|
end
|
9
13
|
|
10
14
|
context 'when dates are more than a year apart' do
|
@@ -1,11 +1,19 @@
|
|
1
1
|
require 'spec_helper'
|
2
2
|
|
3
3
|
describe ::SlideRule::DistanceCalculators::Levenshtein do
|
4
|
+
let(:subject) { described_class.new }
|
5
|
+
|
4
6
|
it 'should calculate perfect match' do
|
5
|
-
expect(
|
7
|
+
expect(subject.calculate('this is a test', 'this is a test')).to eq(0.0)
|
6
8
|
end
|
7
9
|
|
8
10
|
it 'should calculate distance as distance divided by length of longest string' do
|
9
|
-
expect(
|
11
|
+
expect(subject.calculate('this is a test', 'this is a test!').round(4)).to eq((1.0 / 15).round(4))
|
12
|
+
end
|
13
|
+
|
14
|
+
it 'should handle nils' do
|
15
|
+
expect(subject.calculate(nil, nil)).to eq(0.0)
|
16
|
+
expect(subject.calculate(nil, 'goodbye')).to eq(1.0)
|
17
|
+
expect(subject.calculate('hello', nil)).to eq(1.0)
|
10
18
|
end
|
11
19
|
end
|
data/spec/spec_helper.rb
CHANGED