slide_rule 0.2.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +4 -0
- data/README.md +44 -5
- data/lib/slide_rule/distance_calculator.rb +72 -12
- data/lib/slide_rule/distance_calculators/day_of_month.rb +1 -0
- data/lib/slide_rule/distance_calculators/day_of_year.rb +1 -0
- data/lib/slide_rule/version.rb +1 -1
- data/spec/slide_rule/distance_calculator_spec.rb +87 -8
- data/spec/slide_rule/distance_calculators/day_of_month_spec.rb +4 -0
- data/spec/slide_rule/distance_calculators/day_of_year_spec.rb +4 -0
- data/spec/slide_rule/distance_calculators/levenshtein_spec.rb +10 -2
- data/spec/spec_helper.rb +1 -0
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f3035bfd6643001929a84db47e8de72911901eb6
|
4
|
+
data.tar.gz: 0a2f570fb3371d5814a3247aea005676525e69aa
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 282666dafc39dad85c6850c460e7bf2babe7ad3b38b54469ae0df5b91a9802400f3ed50626507712cc7a5afa1173add644ec229704133fe5b60858fa986b2d12
|
7
|
+
data.tar.gz: 8c167275bb751444636e25fd1c197566e3dde22f88d30a30f8d9a670a786f897252991429ade2e37efa766efbe5c9dfd3982c73d896780df61d6f869e075b92d
|
data/.gitignore
CHANGED
data/README.md
CHANGED
@@ -19,25 +19,26 @@ _Note: weights are assumed to be equal if not provided_
|
|
19
19
|
|
20
20
|
# API
|
21
21
|
|
22
|
-
##Describe the field calculators
|
22
|
+
## Describe the field distance calculators
|
23
23
|
|
24
24
|
Each field to be considered in the distance calculation should be described
|
25
25
|
with a calculation method and an optional weight
|
26
26
|
|
27
27
|
Valid calculators:
|
28
28
|
|
29
|
-
*
|
30
|
-
*
|
29
|
+
* day_of_year
|
30
|
+
* day_of_month
|
31
|
+
* levenshtein
|
31
32
|
|
32
33
|
```ruby
|
33
34
|
distance_rules = {
|
34
35
|
:description => {
|
35
36
|
:weight => 0.80,
|
36
|
-
:
|
37
|
+
:calculator => :levenshtein,
|
37
38
|
},
|
38
39
|
:date => {
|
39
40
|
:weight => 0.90,
|
40
|
-
:
|
41
|
+
:calculator => :day_of_month,
|
41
42
|
},
|
42
43
|
}
|
43
44
|
```
|
@@ -81,3 +82,41 @@ matcher.closest_match(candidate, [example, example2], 0.2)
|
|
81
82
|
=> example
|
82
83
|
|
83
84
|
```
|
85
|
+
|
86
|
+
## Custom Field Distance Calculators
|
87
|
+
|
88
|
+
To define a custom field distance calculator, define a class with a `calculate(value1, value2)` method.
|
89
|
+
|
90
|
+
Requirements:
|
91
|
+
* Class must be stateless
|
92
|
+
* `calculate` must return a float from `0.0` (perfect match) to `1.0` (no match)
|
93
|
+
* Calculation must be symmetric, i.e. independent of argument order: `calculate(a, b) == calculate(b, a)`
|
94
|
+
|
95
|
+
```ruby
|
96
|
+
class StringLengthCalculator
|
97
|
+
def calculate(l1, l2)
|
98
|
+
diff = (l1 - l2).abs.to_f
|
99
|
+
return diff / [l1, l2].max
|
100
|
+
end
|
101
|
+
end
|
102
|
+
|
103
|
+
matcher = ::SlideRule::DistanceCalculator.new(
|
104
|
+
:length => {
|
105
|
+
:weight => 1.0,
|
106
|
+
:calculator => StringLengthCalculator
|
107
|
+
}
|
108
|
+
)
|
109
|
+
|
110
|
+
# Find the string with the closest length
|
111
|
+
matcher.closest_match("Howdy Doody Time!", ["Felix the cat", "Mighty Mouse"], 0.5)
|
112
|
+
# => { :item=>"Felix the cat", :distance=>0.23529411764705882 }
|
113
|
+
```
|
114
|
+
|
115
|
+
See the [distance_calculators](https://github.com/mattnichols/slide_rule/tree/master/lib/slide_rule/distance_calculators) directory in source for more examples.
|
116
|
+
|
117
|
+
|
118
|
+
# To Do
|
119
|
+
|
120
|
+
* Add more field distance calculators
|
121
|
+
|
122
|
+
|
@@ -1,7 +1,9 @@
|
|
1
1
|
module SlideRule
|
2
2
|
class DistanceCalculator
|
3
|
+
attr_accessor :rules
|
4
|
+
|
3
5
|
def initialize(rules)
|
4
|
-
@rules =
|
6
|
+
@rules = prepare_rules(rules)
|
5
7
|
end
|
6
8
|
|
7
9
|
# TODO: Figure this out. Very inefficient!
|
@@ -20,8 +22,15 @@ module SlideRule
|
|
20
22
|
end
|
21
23
|
end
|
22
24
|
|
23
|
-
def closest_match(obj, array, threshold)
|
24
|
-
matches(obj, array, threshold).
|
25
|
+
def closest_match(obj, array, threshold = 1.0)
|
26
|
+
matches(obj, array, threshold).sort_by { |match| match[:distance] }.first
|
27
|
+
end
|
28
|
+
|
29
|
+
def closest_matching_item(obj, array, threshold = 1.0)
|
30
|
+
match = closest_match(obj, array, threshold)
|
31
|
+
return nil if match.nil?
|
32
|
+
|
33
|
+
match[:item]
|
25
34
|
end
|
26
35
|
|
27
36
|
def is_match?(obj_1, obj_2, threshold)
|
@@ -32,7 +41,7 @@ module SlideRule
|
|
32
41
|
def matches(obj, array, threshold)
|
33
42
|
array.map do |item|
|
34
43
|
distance = calculate_distance(obj, item)
|
35
|
-
next nil unless distance
|
44
|
+
next nil unless distance <= threshold
|
36
45
|
{
|
37
46
|
item: item,
|
38
47
|
distance: distance
|
@@ -48,23 +57,40 @@ module SlideRule
|
|
48
57
|
# {
|
49
58
|
# :attribute_name => {
|
50
59
|
# :weight => 0.90,
|
51
|
-
# :
|
60
|
+
# :calculator => :distance_calculator,
|
52
61
|
# }
|
53
62
|
# }
|
54
63
|
def calculate_distance(i1, i2)
|
55
|
-
|
64
|
+
calculate_weighted_distances(i1, i2).reduce(0.0) do |distance, obj|
|
65
|
+
distance + (obj[:distance] * obj[:weight])
|
66
|
+
end
|
67
|
+
end
|
68
|
+
|
69
|
+
private
|
70
|
+
|
71
|
+
def calculate_weighted_distances(i1, i2)
|
72
|
+
distances = @rules.map do |attribute, rule|
|
56
73
|
val1 = i1.send(attribute)
|
57
74
|
val2 = i2.send(attribute)
|
58
|
-
|
59
|
-
|
60
|
-
|
75
|
+
distance = rule[:calculator].calculate(val1, val2)
|
76
|
+
next { distance: distance.to_f, weight: rule[:weight] } unless distance.nil?
|
77
|
+
|
78
|
+
nil
|
79
|
+
end
|
80
|
+
normalize_weights_array(distances) if distances.compact!
|
81
|
+
|
82
|
+
distances
|
61
83
|
end
|
62
84
|
|
63
85
|
def get_calculator(calculator)
|
64
86
|
return calculator.new if calculator.is_a?(Class)
|
65
87
|
|
66
88
|
klass_name = "#{calculator.to_s.split('_').collect(&:capitalize).join}"
|
67
|
-
klass =
|
89
|
+
klass = begin
|
90
|
+
::SlideRule::DistanceCalculators.const_get(klass_name)
|
91
|
+
rescue(::NameError)
|
92
|
+
nil
|
93
|
+
end
|
68
94
|
|
69
95
|
fail ArgumentError, "Unable to find calculator #{klass_name}" if klass.nil?
|
70
96
|
|
@@ -73,12 +99,46 @@ module SlideRule
|
|
73
99
|
|
74
100
|
# Ensures all weights add up to 1.0
|
75
101
|
#
|
76
|
-
def normalize_weights(
|
77
|
-
rules = rules_hash.dup
|
102
|
+
def normalize_weights(rules)
|
78
103
|
weight_total = rules.map { |_attr, rule| rule[:weight] }.reduce(0.0, &:+)
|
79
104
|
rules.each do |_attr, rule|
|
80
105
|
rule[:weight] = rule[:weight] / weight_total
|
81
106
|
end
|
82
107
|
end
|
108
|
+
|
109
|
+
# Ensures all weights add up to 1.0 in array of hashes
|
110
|
+
#
|
111
|
+
def normalize_weights_array(rules)
|
112
|
+
weight_total = rules.map { |rule| rule[:weight] }.reduce(0.0, &:+)
|
113
|
+
rules.each do |rule|
|
114
|
+
rule[:weight] = rule[:weight] / weight_total
|
115
|
+
end
|
116
|
+
end
|
117
|
+
|
118
|
+
# Prepares a duplicate of given rules hash with normalized weights and calculator instances
|
119
|
+
#
|
120
|
+
def prepare_rules(rules)
|
121
|
+
prepared_rules = rules.each_with_object({}) do |(attribute, rule), copy|
|
122
|
+
rule = copy[attribute] = safe_dup(rule)
|
123
|
+
|
124
|
+
if rule[:type]
|
125
|
+
puts 'Rule key `:type` is deprecated. Use `:calculator` instead.'
|
126
|
+
rule[:calculator] = rule[:type]
|
127
|
+
end
|
128
|
+
|
129
|
+
rule[:calculator] = get_calculator(rule[:calculator])
|
130
|
+
|
131
|
+
copy
|
132
|
+
end
|
133
|
+
prepared_rules = normalize_weights(prepared_rules)
|
134
|
+
|
135
|
+
prepared_rules
|
136
|
+
end
|
137
|
+
|
138
|
+
def safe_dup(obj)
|
139
|
+
obj.dup
|
140
|
+
rescue
|
141
|
+
obj
|
142
|
+
end
|
83
143
|
end
|
84
144
|
end
|
data/lib/slide_rule/version.rb
CHANGED
@@ -19,6 +19,12 @@ describe ::SlideRule::DistanceCalculator do
|
|
19
19
|
end
|
20
20
|
end
|
21
21
|
|
22
|
+
class NilCalc
|
23
|
+
def calculate(_first, _second)
|
24
|
+
nil
|
25
|
+
end
|
26
|
+
end
|
27
|
+
|
22
28
|
let(:examples) do
|
23
29
|
[
|
24
30
|
::ExampleTransaction.new(amount: 25.00, date: '2015-02-05', description: 'Audible.com'),
|
@@ -36,22 +42,32 @@ describe ::SlideRule::DistanceCalculator do
|
|
36
42
|
::SlideRule::DistanceCalculator.new(
|
37
43
|
description: {
|
38
44
|
weight: 0.80,
|
39
|
-
|
45
|
+
calculator: :levenshtein
|
40
46
|
},
|
41
47
|
date: {
|
42
48
|
weight: 0.90,
|
43
|
-
|
49
|
+
calculator: :day_of_month
|
44
50
|
}
|
45
51
|
)
|
46
52
|
end
|
47
53
|
|
48
|
-
it 'finds
|
54
|
+
it 'finds closest' do
|
49
55
|
example = ExampleTransaction.new(description: 'Wells Fargo Dealer SVC', date: '2015-06-17')
|
50
56
|
expect(calculator.closest_match(example, examples, 0.2)[:item]).to eq(examples[3])
|
51
57
|
|
52
58
|
example = ExampleTransaction.new(description: 'Audible.com', date: '2015-06-05')
|
53
59
|
expect(calculator.closest_match(example, examples, 0.2)[:item]).to eq(examples[0])
|
54
60
|
end
|
61
|
+
|
62
|
+
it 'with default threshold' do
|
63
|
+
example = ExampleTransaction.new(description: 'Audible.com', date: '2015-06-05')
|
64
|
+
expect(calculator.closest_match(example, examples)[:item]).to eq(examples[0])
|
65
|
+
end
|
66
|
+
|
67
|
+
it 'finds closest matching item' do
|
68
|
+
example = ExampleTransaction.new(description: 'Audible.com', date: '2015-06-05')
|
69
|
+
expect(calculator.closest_matching_item(example, examples)).to eq(examples[0])
|
70
|
+
end
|
55
71
|
end
|
56
72
|
|
57
73
|
describe '#is_match?' do
|
@@ -89,11 +105,11 @@ describe ::SlideRule::DistanceCalculator do
|
|
89
105
|
calculator = ::SlideRule::DistanceCalculator.new(
|
90
106
|
description: {
|
91
107
|
weight: 1.00,
|
92
|
-
|
108
|
+
calculator: :levenshtein
|
93
109
|
},
|
94
110
|
date: {
|
95
111
|
weight: 0.50,
|
96
|
-
|
112
|
+
calculator: :day_of_month
|
97
113
|
}
|
98
114
|
)
|
99
115
|
example = ::ExampleTransaction.new(amount: 25.00, date: '2015-02-05', description: 'Audible.com')
|
@@ -105,11 +121,11 @@ describe ::SlideRule::DistanceCalculator do
|
|
105
121
|
calculator = ::SlideRule::DistanceCalculator.new(
|
106
122
|
description: {
|
107
123
|
weight: 0.50,
|
108
|
-
|
124
|
+
calculator: :levenshtein
|
109
125
|
},
|
110
126
|
date: {
|
111
127
|
weight: 0.50,
|
112
|
-
|
128
|
+
calculator: :day_of_month
|
113
129
|
}
|
114
130
|
)
|
115
131
|
example = ::ExampleTransaction.new(amount: 25.00, date: '2015-02-05', description: 'Audible.com')
|
@@ -125,6 +141,23 @@ describe ::SlideRule::DistanceCalculator do
|
|
125
141
|
distance = calculator.calculate_distance(example, candidate)
|
126
142
|
expect(distance.round(4)).to eq(((3.0 * 0.5 / 15) + (4.0 * 0.5 / 11)).round(4))
|
127
143
|
end
|
144
|
+
|
145
|
+
it 'should renormalize on nil' do
|
146
|
+
calculator = ::SlideRule::DistanceCalculator.new(
|
147
|
+
description: {
|
148
|
+
weight: 0.50,
|
149
|
+
calculator: :levenshtein
|
150
|
+
},
|
151
|
+
date: {
|
152
|
+
weight: 0.50,
|
153
|
+
calculator: NilCalc
|
154
|
+
}
|
155
|
+
)
|
156
|
+
example1 = ::ExampleTransaction.new(amount: 25.00, date: '2015-02-05', description: 'Audible.com')
|
157
|
+
example2 = ::ExampleTransaction.new(amount: 25.00, date: '2015-06-08', description: 'Audible Inc')
|
158
|
+
|
159
|
+
expect(calculator.calculate_distance(example1, example2).round(4)).to eq((4.0 / 11).round(4))
|
160
|
+
end
|
128
161
|
end
|
129
162
|
|
130
163
|
context 'uses custom calculator' do
|
@@ -132,7 +165,7 @@ describe ::SlideRule::DistanceCalculator do
|
|
132
165
|
calculator = ::SlideRule::DistanceCalculator.new(
|
133
166
|
description: {
|
134
167
|
weight: 1.00,
|
135
|
-
|
168
|
+
calculator: CustomCalc
|
136
169
|
}
|
137
170
|
)
|
138
171
|
example = ::ExampleTransaction.new
|
@@ -142,5 +175,51 @@ describe ::SlideRule::DistanceCalculator do
|
|
142
175
|
expect(distance).to eq(0.9)
|
143
176
|
end
|
144
177
|
end
|
178
|
+
|
179
|
+
describe '#initialize' do
|
180
|
+
context 'validates rules on initialize' do
|
181
|
+
it 'should allow :type' do
|
182
|
+
::SlideRule::DistanceCalculator.new(
|
183
|
+
description: {
|
184
|
+
weight: 1.00,
|
185
|
+
type: CustomCalc
|
186
|
+
}
|
187
|
+
)
|
188
|
+
end
|
189
|
+
|
190
|
+
it 'should not modify input rule hash' do
|
191
|
+
rules = {
|
192
|
+
description: {
|
193
|
+
weight: 1.0,
|
194
|
+
calculator: CustomCalc
|
195
|
+
},
|
196
|
+
name: {
|
197
|
+
weight: 1.0,
|
198
|
+
type: CustomCalc
|
199
|
+
}
|
200
|
+
}
|
201
|
+
::SlideRule::DistanceCalculator.new(rules)
|
202
|
+
# Run a second time to ensure that no calculator instance was written into the input rules. If the first call had mutated the hash, this call would raise an error.
|
203
|
+
::SlideRule::DistanceCalculator.new(rules)
|
204
|
+
|
205
|
+
# :type should still be in original hash
|
206
|
+
expect(rules[:name].key?(:calculator)).to eq(false)
|
207
|
+
|
208
|
+
# :weight should not be normalized in original hash
|
209
|
+
expect(rules[:name][:weight]).to eq(1.0)
|
210
|
+
end
|
211
|
+
|
212
|
+
it 'should raise error if not valid calculator' do
|
213
|
+
expect do
|
214
|
+
::SlideRule::DistanceCalculator.new(
|
215
|
+
description: {
|
216
|
+
weight: 1.00,
|
217
|
+
calculator: :some_junk
|
218
|
+
}
|
219
|
+
)
|
220
|
+
end.to raise_error(::ArgumentError, 'Unable to find calculator SomeJunk')
|
221
|
+
end
|
222
|
+
end
|
223
|
+
end
|
145
224
|
end
|
146
225
|
end
|
@@ -6,6 +6,10 @@ describe ::SlideRule::DistanceCalculators::DayOfMonth do
|
|
6
6
|
expect(described_class.new.calculate('2012-03-19', '2014-08-19')).to eq(0.0)
|
7
7
|
end
|
8
8
|
|
9
|
+
it 'should accept epoch date' do
|
10
|
+
expect(described_class.new.calculate(1_444_262_400, 1_444_262_400)).to eq(0.0)
|
11
|
+
end
|
12
|
+
|
9
13
|
it 'should calculate when date is in the same month' do
|
10
14
|
expect(described_class.new.calculate('2012-03-19', '2014-08-22')).to eq(3.0 / 15)
|
11
15
|
expect(described_class.new.calculate('2012-03-19', '2014-08-09')).to eq(10.0 / 15)
|
@@ -5,6 +5,10 @@ describe ::SlideRule::DistanceCalculators::DayOfYear do
|
|
5
5
|
it 'should return a 0 distance' do
|
6
6
|
expect(described_class.new.calculate('2015-10-8', '2015-10-8')).to eq(0.0)
|
7
7
|
end
|
8
|
+
|
9
|
+
it 'should accept epoch date' do
|
10
|
+
expect(described_class.new.calculate(1_444_262_400, 1_444_262_400)).to eq(0.0)
|
11
|
+
end
|
8
12
|
end
|
9
13
|
|
10
14
|
context 'when dates are more than a year apart' do
|
@@ -1,11 +1,19 @@
|
|
1
1
|
require 'spec_helper'
|
2
2
|
|
3
3
|
describe ::SlideRule::DistanceCalculators::Levenshtein do
|
4
|
+
let(:subject) { described_class.new }
|
5
|
+
|
4
6
|
it 'should calculate perfect match' do
|
5
|
-
expect(
|
7
|
+
expect(subject.calculate('this is a test', 'this is a test')).to eq(0.0)
|
6
8
|
end
|
7
9
|
|
8
10
|
it 'should calculate distance as distance divided by length of longest string' do
|
9
|
-
expect(
|
11
|
+
expect(subject.calculate('this is a test', 'this is a test!').round(4)).to eq((1.0 / 15).round(4))
|
12
|
+
end
|
13
|
+
|
14
|
+
it 'should handle nils' do
|
15
|
+
expect(subject.calculate(nil, nil)).to eq(0.0)
|
16
|
+
expect(subject.calculate(nil, 'goodbye')).to eq(1.0)
|
17
|
+
expect(subject.calculate('hello', nil)).to eq(1.0)
|
10
18
|
end
|
11
19
|
end
|
data/spec/spec_helper.rb
CHANGED