record_linkage 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/Gemfile +12 -0
- data/README.md +2 -0
- data/lib/record_linkage/object_comparer.rb +122 -0
- data/lib/record_linkage/version.rb +3 -0
- data/record_linkage.gemspec +46 -0
- metadata +142 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 287d2406379da16f1c2c6b31f15f041d8dd702db
|
4
|
+
data.tar.gz: c52cec061500568bae597814daf2da9931778aa6
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 3cae10fbf8077fe9c8e8b2f360aab56bb33d180145478ef26d274b721e65dcd5671c8b8ceca8144ec13bef7b619f308802452e20f105cd0a648146f3d53b4bf8
|
7
|
+
data.tar.gz: 18be3cc9528c859746f68ecc3bd461efea5e28df5b52e86eb684f5cc89e344b1b6464fa1586af7ad8c9cb6f004100476716a4831b0efbc78672246fc9da2d6f4
|
data/Gemfile
ADDED
data/README.md
ADDED
@@ -0,0 +1,122 @@
|
|
1
|
+
require 'fuzzystringmatch'
|
2
|
+
|
3
|
+
module RecordLinkage
|
4
|
+
# Can be used to create an object with certain rules which can then
|
5
|
+
# be used to compare objects to each other
|
6
|
+
class ObjectComparer
|
7
|
+
# Matcher objects represent how to handle individual rules
|
8
|
+
# to compare one field to another
|
9
|
+
class Matcher
|
10
|
+
# A module to hold the various matcher logic which can be declared
|
11
|
+
# with a String or a Symbol
|
12
|
+
module Matchers
|
13
|
+
JAROW = ::FuzzyStringMatch::JaroWinkler.create(:native)
|
14
|
+
|
15
|
+
# Scores two values by Jaro-Winkler similarity (via JAROW), ignoring case
# and surrounding whitespace.
#
# value1, value2 - values to compare; each is converted with `to_s`, so
#                  nil, Symbols and numbers are accepted as well as Strings.
# options        - Hash; `:threshold` is the score that must be exceeded for
#                  the pair to count as a match (0.0 when not supplied).
#
# Returns the similarity score when it is above the threshold, otherwise
# 0.0. Values shorter than three characters after stripping always score 0.0.
def self.fuzzy_string_matcher(value1, value2, options = {})
  # Normalise once so the length guard and the comparison see the same text.
  string1 = value1.to_s.strip.downcase
  string2 = value2.to_s.strip.downcase
  return 0.0 if string1.size < 3 || string2.size < 3

  score = JAROW.getDistance(string1, string2)
  # A missing threshold must not raise on the Float/nil comparison.
  score > (options[:threshold] || 0.0) ? score : 0.0
end
|
23
|
+
|
24
|
+
# Case-insensitive equality check on the stripped string form of two values.
#
# value1, value2 - values to compare; each is converted with `to_s`.
# _options       - accepted for signature parity with the other matchers; unused.
#
# Returns 1.0 when the normalised values are equal and non-empty, else 0.0.
def self.exact_string_matcher(value1, value2, _options = {})
  left, right = [value1, value2].map { |raw| raw.to_s.strip.downcase }
  return 0.0 if left.empty?

  left == right ? 1.0 : 0.0
end
|
29
|
+
|
30
|
+
# Collection form of the fuzzy string matcher: passes both collections to
# `array_matcher` with `:fuzzy_string_matcher` named as the per-pair scorer.
#
# Returns whatever `array_matcher` returns.
def self.array_fuzzy_string_matcher(array1, array2, options = {})
  pair_scorer = :fuzzy_string_matcher
  array_matcher(pair_scorer, array1, array2, options)
end
|
33
|
+
|
34
|
+
# Collection form of the exact string matcher: passes both collections to
# `array_matcher` with `:exact_string_matcher` named as the per-pair scorer.
#
# Returns whatever `array_matcher` returns.
def self.array_exact_string_matcher(array1, array2, options = {})
  pair_scorer = :exact_string_matcher
  array_matcher(pair_scorer, array1, array2, options)
end
|
37
|
+
|
38
|
+
# Scores every pairing of an element from array1 with an element from
# array2 and adds the scores together.
#
# single_matcher - name of the matcher method (on this receiver) used to
#                  score one pair; invoked as
#                  `single_matcher(value1, value2, options)`.
# array1, array2 - collections of values; nil is treated as empty and a
#                  lone value as a one-element collection.
# options        - Hash forwarded unchanged to every single_matcher call.
#
# Returns the sum of all pair scores (0 when there are no pairs).
def self.array_matcher(single_matcher, array1, array2, options = {})
  candidates = Array(array2)
  scores = Array(array1).flat_map do |value1|
    candidates.map { |value2| send(single_matcher, value1, value2, options) }
  end

  # Array#sum only exists from Ruby 2.4 on; fall back to inject on the
  # older interpreters this gem still declares support for.
  scores.respond_to?(:sum) ? scores.sum : scores.inject(0, :+)
end
|
45
|
+
end
|
46
|
+
|
47
|
+
attr_reader :property1, :property2
|
48
|
+
|
49
|
+
# Builds a matcher rule.
#
# property1  - message sent to the first object to obtain its value.
# property2  - message sent to the second object to obtain its value.
# definition - matcher definition, resolved to a callable by
#              `match_block_from_definition`.
# options    - Hash of per-rule settings; `score_objects` reads
#              `:threshold` and `:weight` from it.
def initialize(property1, property2, definition, options = {})
  @property1, @property2 = property1, property2
  @options = options
  @block = self.class.match_block_from_definition(definition)
end
|
55
|
+
|
56
|
+
# Scores one pair of objects under this rule.
#
# object1, object2  - objects to compare; @property1 is sent to object1
#                     and @property2 to object2 to fetch the values.
# default_threshold - threshold used when the rule's options have none.
# default_weight    - weight used when the rule's options have none.
#
# Returns the matcher's score for the two values multiplied by the weight.
def score_objects(object1, object2, default_threshold, default_weight)
  left = object1.send(@property1)
  right = object2.send(@property2)

  effective_threshold = @options[:threshold] || default_threshold
  multiplier = @options[:weight] || default_weight

  raw_score = @block.call(left, right, threshold: effective_threshold)
  raw_score * multiplier
end
|
65
|
+
|
66
|
+
# Resolves a matcher definition into something that responds to `call`.
#
# definition - a String or Symbol naming a method on Matchers without its
#              `_matcher` suffix (e.g. :exact_string), or a Proc, which is
#              returned as-is.
#
# Returns a Method object or the given Proc.
# Raises ArgumentError when the named matcher does not exist or the
# definition is neither a String, a Symbol nor a Proc.
def self.match_block_from_definition(definition)
  case definition
  when String, Symbol
    matcher_name = "#{definition}_matcher"
    unless Matchers.respond_to?(matcher_name)
      fail ArgumentError, "Matcher `#{definition}` is not defined"
    end

    Matchers.method(matcher_name)
  when Proc then definition
  else
    # Report the offending argument itself so the failure is an
    # ArgumentError rather than a NameError from an undefined local.
    fail ArgumentError, "Invalid matcher definition: #{definition.inspect}"
  end
end
|
79
|
+
end
|
80
|
+
|
81
|
+
# Object given to `initialize` block to allow API
|
82
|
+
# for configuring matchers / values
|
83
|
+
# Holds the comparison rules and the fallback threshold / weight.
class Config
  # Fallback values; both are nil until assigned.
  attr_accessor :default_threshold, :default_weight

  # Registers a rule comparing `property1` of the first object with
  # `property2` of the second one.
  #
  # definition - matcher definition handed to Matcher.new.
  # options    - Hash of per-rule settings handed to Matcher.new.
  #
  # Returns the list of registered matchers.
  def add_matcher(property1, property2, definition, options = {})
    matchers << Matcher.new(property1, property2, definition, options)
  end

  # Returns the registered matchers, an empty Array until one is added.
  # This is the only reader: a generated attr_reader would just be
  # overridden by this method.
  def matchers
    @matchers ||= []
  end
end
|
95
|
+
|
96
|
+
# Creates a comparer and, when a block is given, yields its Config so the
# caller can register matchers and set defaults. The block is optional;
# without one the comparer can still be set up afterwards through `config`.
def initialize
  yield config if block_given?
end
|
99
|
+
|
100
|
+
# Returns this comparer's Config, creating it on first use and handing
# back the same instance afterwards.
def config
  @config = Config.new unless @config
  @config
end
|
103
|
+
|
104
|
+
# Runs every configured matcher against the two objects.
#
# Returns a Hash keyed by `[property1, property2]` of each matcher whose
# value is that matcher's score for the pair, computed with this
# comparer's default threshold and weight as fallbacks.
def classify_hash(object1, object2)
  scores = {}
  config.matchers.each do |matcher|
    key = [matcher.property1, matcher.property2]
    scores[key] = matcher.score_objects(object1, object2, default_threshold, default_weight)
  end
  scores
end
|
113
|
+
|
114
|
+
# Returns the threshold set on the config, or 0.0 when none has been set.
def default_threshold
  configured = config.default_threshold
  return 0.0 unless configured

  configured
end
|
117
|
+
|
118
|
+
# Returns the weight set on the config, or 1.0 when none has been set.
def default_weight
  configured = config.default_weight
  return 1.0 unless configured

  configured
end
|
121
|
+
end
|
122
|
+
end
|
@@ -0,0 +1,46 @@
|
|
1
|
+
# Gem specification for record_linkage.

# Put the gem's own lib directory on the load path so the version file can
# be required before the gem is installed.
lib = File.expand_path('../lib/', __FILE__)
$LOAD_PATH.unshift lib unless $LOAD_PATH.include?(lib)

require 'record_linkage/version'

Gem::Specification.new do |s|
  s.name     = 'record_linkage'
  s.version  = RecordLinkage::VERSION
  s.required_ruby_version = '>= 1.9.3'

  s.authors  = 'Brian Underwood'
  s.email    = 'public@brian-underwood.codes'
  s.homepage = 'https://github.com/cheerfulstoic/record_linkage'
  s.summary = <<SUMMARY
A library to do record linkage
SUMMARY

  s.license = 'MIT'

  s.description = <<DESCRIPTION
A library to do record linkage

Comparing objects to determine which are the same
DESCRIPTION

  s.require_path = 'lib'
  s.files = Dir.glob('{bin,lib,config}/**/*') +
            %w(README.md Gemfile record_linkage.gemspec)
  # `has_rdoc=` is intentionally not set: RubyGems ignores it.
  s.extra_rdoc_files = %w( README.md )
  s.rdoc_options = [
    '--quiet',
    # --title takes a value; without one it consumes the next option.
    '--title', 'record_linkage',
    '--line-numbers',
    # The main page must be a file the gem actually ships.
    '--main', 'README.md',
    '--inline-source']

  s.add_dependency('fuzzy-string-match')

  s.add_development_dependency('pry')
  s.add_development_dependency('simplecov')
  s.add_development_dependency('guard')
  s.add_development_dependency('guard-rubocop')
  s.add_development_dependency('rubocop')
end
|
metadata
ADDED
@@ -0,0 +1,142 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: record_linkage
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Brian Underwood
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2015-07-08 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: fuzzy-string-match
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ">="
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ">="
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: pry
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: simplecov
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ">="
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: guard
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - ">="
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: guard-rubocop
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - ">="
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0'
|
76
|
+
type: :development
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - ">="
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: rubocop
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - ">="
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '0'
|
90
|
+
type: :development
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - ">="
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '0'
|
97
|
+
description: |
|
98
|
+
A library to do record linkage
|
99
|
+
|
100
|
+
Comparing objects to determine which are the same
|
101
|
+
email: public@brian-underwood.codes
|
102
|
+
executables: []
|
103
|
+
extensions: []
|
104
|
+
extra_rdoc_files:
|
105
|
+
- README.md
|
106
|
+
files:
|
107
|
+
- Gemfile
|
108
|
+
- README.md
|
109
|
+
- lib/record_linkage/object_comparer.rb
|
110
|
+
- lib/record_linkage/version.rb
|
111
|
+
- record_linkage.gemspec
|
112
|
+
homepage: https://github.com/cheerfulstoic/record_linkage
|
113
|
+
licenses:
|
114
|
+
- MIT
|
115
|
+
metadata: {}
|
116
|
+
post_install_message:
|
117
|
+
rdoc_options:
|
118
|
+
- "--quiet"
|
119
|
+
- "--title"
|
120
|
+
- "--line-numbers"
|
121
|
+
- "--main"
|
122
|
+
- README.rdoc
|
123
|
+
- "--inline-source"
|
124
|
+
require_paths:
|
125
|
+
- lib
|
126
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
127
|
+
requirements:
|
128
|
+
- - ">="
|
129
|
+
- !ruby/object:Gem::Version
|
130
|
+
version: 1.9.3
|
131
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
132
|
+
requirements:
|
133
|
+
- - ">="
|
134
|
+
- !ruby/object:Gem::Version
|
135
|
+
version: '0'
|
136
|
+
requirements: []
|
137
|
+
rubyforge_project:
|
138
|
+
rubygems_version: 2.4.5
|
139
|
+
signing_key:
|
140
|
+
specification_version: 4
|
141
|
+
summary: A library to do record linkage
|
142
|
+
test_files: []
|