record_linkage 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/Gemfile +12 -0
- data/README.md +2 -0
- data/lib/record_linkage/object_comparer.rb +122 -0
- data/lib/record_linkage/version.rb +3 -0
- data/record_linkage.gemspec +46 -0
- metadata +142 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 287d2406379da16f1c2c6b31f15f041d8dd702db
|
4
|
+
data.tar.gz: c52cec061500568bae597814daf2da9931778aa6
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 3cae10fbf8077fe9c8e8b2f360aab56bb33d180145478ef26d274b721e65dcd5671c8b8ceca8144ec13bef7b619f308802452e20f105cd0a648146f3d53b4bf8
|
7
|
+
data.tar.gz: 18be3cc9528c859746f68ecc3bd461efea5e28df5b52e86eb684f5cc89e344b1b6464fa1586af7ad8c9cb6f004100476716a4831b0efbc78672246fc9da2d6f4
|
data/Gemfile
ADDED
data/README.md
ADDED
@@ -0,0 +1,122 @@
|
|
1
|
+
require 'fuzzystringmatch'
|
2
|
+
|
3
|
+
module RecordLinkage
|
4
|
+
# Can be used to create an object with certain rules which can then
|
5
|
+
# be used to compare objects to each other
|
6
|
+
class ObjectComparer
|
7
|
+
# Matcher objects represent how to handle individual rules
|
8
|
+
# to compare one field to another
|
9
|
+
class Matcher
|
10
|
+
# A module to hold the various matcher logic which can be declared
|
11
|
+
# with a String or a Symbol
|
12
|
+
module Matchers
|
13
|
+
JAROW = ::FuzzyStringMatch::JaroWinkler.create(:native)
|
14
|
+
|
15
|
+
# Scores the similarity of two values using JAROW.getDistance.
#
# Both values are coerced with `to_s`, stripped and downcased before they
# are compared, so nil, Symbols and numbers are accepted and the compared
# strings are the same ones the length guard looks at.
#
# @param value1 [#to_s] first value to compare
# @param value2 [#to_s] second value to compare
# @param options [Hash] `:threshold` is the score that must be exceeded for
#   the pair to count as a match; a missing or nil threshold is treated as 0.0
# @return [Numeric] the score when it is greater than the threshold,
#   otherwise 0.0 (also when either stripped value is shorter than 3
#   characters)
def self.fuzzy_string_matcher(value1, value2, options = {})
  string1 = value1.to_s.strip.downcase
  string2 = value2.to_s.strip.downcase
  # Very short strings are never scored.
  return 0.0 if string1.size < 3 || string2.size < 3

  # Comparing a score with nil raises, so default the threshold explicitly.
  threshold = options[:threshold] || 0.0
  score = JAROW.getDistance(string1, string2)
  score > threshold ? score : 0.0
end
|
23
|
+
|
24
|
+
# Case- and whitespace-insensitive equality check.
#
# Both values are converted with `to_s`, stripped and downcased. Returns 1.0
# when the normalized strings are equal and non-empty, otherwise 0.0.
# The options argument is accepted but not used.
def self.exact_string_matcher(value1, value2, _options = {})
  left, right = [value1, value2].map { |value| value.to_s.strip.downcase }
  # Two blank values never count as a match.
  return 0.0 if left.empty?

  left == right ? 1.0 : 0.0
end
|
29
|
+
|
30
|
+
# Compares every element of array1 with every element of array2 using
# `fuzzy_string_matcher`.
#
# @return [Numeric] the sum of the scores of all pairs
def self.array_fuzzy_string_matcher(array1, array2, options = {})
  array_matcher(:fuzzy_string_matcher, array1, array2, options)
end

# Compares every element of array1 with every element of array2 using
# `exact_string_matcher`.
#
# @return [Numeric] the sum of the scores of all pairs
def self.array_exact_string_matcher(array1, array2, options = {})
  array_matcher(:exact_string_matcher, array1, array2, options)
end

# Applies a single-value matcher to every (value1, value2) pair drawn from
# the two collections and adds up the resulting scores.
#
# @param single_matcher [Symbol] name of the matcher method to `send`
# @param array1 [Array, Object, nil] left-hand values; nil is treated as an
#   empty list and a single value as a one-element list
# @param array2 [Array, Object, nil] right-hand values, coerced the same way
# @param options [Hash] passed through unchanged to the single matcher
# @return [Numeric] the summed scores; 0 when there are no pairs
def self.array_matcher(single_matcher, array1, array2, options = {})
  right_values = Array(array2)
  scores = Array(array1).flat_map do |value1|
    right_values.map { |value2| send(single_matcher, value1, value2, options) }
  end

  # Array#sum is missing on Rubies older than 2.4; fall back to inject there.
  scores.respond_to?(:sum) ? scores.sum : scores.inject(0, :+)
end
|
45
|
+
end
|
46
|
+
|
47
|
+
attr_reader :property1, :property2
|
48
|
+
|
49
|
+
# Builds a matcher that compares `property1` of one object with `property2`
# of another.
#
# The definition is resolved once, here, through
# `match_block_from_definition`; the options hash is stored as given.
def initialize(property1, property2, definition, options = {})
  @property1, @property2 = property1, property2
  @block = self.class.match_block_from_definition(definition)
  @options = options
end
|
55
|
+
|
56
|
+
# Scores one pair of objects with this matcher.
#
# Reads `@property1` from object1 and `@property2` from object2, calls the
# matcher block with both values and a `threshold:` option, and multiplies
# the result by the weight. The matcher's own `:threshold` / `:weight`
# options win over the defaults passed in.
def score_objects(object1, object2, default_threshold, default_weight)
  left_value = object1.send(@property1)
  right_value = object2.send(@property2)
  limit = @options[:threshold] || default_threshold
  factor = @options[:weight] || default_weight

  raw_score = @block.call(left_value, right_value, threshold: limit)
  raw_score * factor
end
|
65
|
+
|
66
|
+
# Resolves a matcher definition into a callable.
#
# @param definition [String, Symbol, Proc, Method] a String or Symbol names
#   a `Matchers` method without its `_matcher` suffix (e.g. `:exact_string`);
#   a Proc or Method is used as-is
# @return [Method, Proc] the callable used to compare two values
# @raise [ArgumentError] when the named matcher does not exist or the
#   definition is of an unsupported type
def self.match_block_from_definition(definition)
  case definition
  when String, Symbol
    matcher_name = "#{definition}_matcher"
    unless Matchers.respond_to?(matcher_name)
      fail ArgumentError, "Matcher `#{definition}` is not defined"
    end

    Matchers.method(matcher_name)
  when Proc, Method then definition
  else
    # Interpolate `definition`: no local named `matcher` exists in this
    # method, so referencing one would raise NameError instead.
    fail ArgumentError, "Invalid matcher definition: #{definition.inspect}"
  end
end
|
79
|
+
end
|
80
|
+
|
81
|
+
# Object given to `initialize` block to allow API
|
82
|
+
# for configuring matchers / values
|
83
|
+
# Holds the settings of a comparer: optional default threshold / weight and
# the list of matchers added so far.
class Config
  # Both are nil until assigned.
  attr_accessor :default_threshold, :default_weight

  # Creates a Matcher from the arguments and appends it to `matchers`.
  #
  # @param property1 [Object] passed to Matcher.new as the first property
  # @param property2 [Object] passed to Matcher.new as the second property
  # @param definition [Object] matcher definition, passed to Matcher.new
  # @param options [Hash] passed to Matcher.new
  # @return [Array] the list of matchers, including the new one
  def add_matcher(property1, property2, definition, options = {})
    matchers << Matcher.new(property1, property2, definition, options)
  end

  # @return [Array] the matchers added so far; created empty on first access
  def matchers
    @matchers ||= []
  end
end
|
95
|
+
|
96
|
+
# Yields the comparer's config object to the given block so the caller can
# set defaults and add matchers.
def initialize
  yield(config)
end
|
99
|
+
|
100
|
+
# Returns this comparer's Config, creating it on first use and reusing the
# same instance afterwards.
def config
  @config = Config.new unless @config
  @config
end
|
103
|
+
|
104
|
+
# Scores the two objects with every configured matcher.
#
# Returns a Hash whose keys are `[property1, property2]` pairs taken from
# each matcher and whose values are that matcher's `score_objects` result,
# computed with this comparer's default threshold and weight.
def classify_hash(object1, object2)
  scores = {}
  config.matchers.each do |matcher|
    key = [matcher.property1, matcher.property2]
    scores[key] = matcher.score_objects(object1, object2,
                                        default_threshold, default_weight)
  end
  scores
end
|
113
|
+
|
114
|
+
# Threshold handed to matchers that do not set their own: the configured
# `default_threshold`, or 0.0 when none is configured.
def default_threshold
  configured = config.default_threshold
  configured || 0.0
end
|
117
|
+
|
118
|
+
# Weight handed to matchers that do not set their own: the configured
# `default_weight`, or 1.0 when none is configured.
def default_weight
  configured = config.default_weight
  configured || 1.0
end
|
121
|
+
end
|
122
|
+
end
|
@@ -0,0 +1,46 @@
|
|
1
|
+
# Gem specification for record_linkage.

# Make lib/ loadable so the version constant can be required below.
lib = File.expand_path('../lib/', __FILE__)
$LOAD_PATH.unshift lib unless $LOAD_PATH.include?(lib)

require 'record_linkage/version'

Gem::Specification.new do |s|
  s.name = 'record_linkage'
  s.version = RecordLinkage::VERSION
  s.required_ruby_version = '>= 1.9.3'

  s.authors = 'Brian Underwood'
  s.email = 'public@brian-underwood.codes'
  s.homepage = 'https://github.com/cheerfulstoic/record_linkage'
  s.summary = <<SUMMARY
A library to do record linkage
SUMMARY

  s.license = 'MIT'

  s.description = <<DESCRIPTION
A library to do record linkage

Comparing objects to determine which are the same
DESCRIPTION

  s.require_path = 'lib'
  s.files = Dir.glob('{bin,lib,config}/**/*') +
            %w(README.md Gemfile record_linkage.gemspec)
  s.extra_rdoc_files = %w( README.md )
  # `--title` and `--main` each consume the argument that follows them, so
  # both are given an explicit value; `--main` names the README that is
  # actually shipped in `files` / `extra_rdoc_files`.
  s.rdoc_options = [
    '--quiet',
    '--title', 'record_linkage',
    '--line-numbers',
    '--main', 'README.md',
    '--inline-source']

  s.add_dependency('fuzzy-string-match')

  s.add_development_dependency('pry')
  s.add_development_dependency('simplecov')
  s.add_development_dependency('guard')
  s.add_development_dependency('guard-rubocop')
  s.add_development_dependency('rubocop')
end
|
metadata
ADDED
@@ -0,0 +1,142 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: record_linkage
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Brian Underwood
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2015-07-08 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: fuzzy-string-match
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ">="
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ">="
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: pry
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: simplecov
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ">="
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: guard
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - ">="
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: guard-rubocop
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - ">="
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0'
|
76
|
+
type: :development
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - ">="
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: rubocop
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - ">="
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '0'
|
90
|
+
type: :development
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - ">="
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '0'
|
97
|
+
description: |
|
98
|
+
A library to do record linkage
|
99
|
+
|
100
|
+
Comparing objects to determine which are the same
|
101
|
+
email: public@brian-underwood.codes
|
102
|
+
executables: []
|
103
|
+
extensions: []
|
104
|
+
extra_rdoc_files:
|
105
|
+
- README.md
|
106
|
+
files:
|
107
|
+
- Gemfile
|
108
|
+
- README.md
|
109
|
+
- lib/record_linkage/object_comparer.rb
|
110
|
+
- lib/record_linkage/version.rb
|
111
|
+
- record_linkage.gemspec
|
112
|
+
homepage: https://github.com/cheerfulstoic/record_linkage
|
113
|
+
licenses:
|
114
|
+
- MIT
|
115
|
+
metadata: {}
|
116
|
+
post_install_message:
|
117
|
+
rdoc_options:
|
118
|
+
- "--quiet"
|
119
|
+
- "--title"
|
120
|
+
- "--line-numbers"
|
121
|
+
- "--main"
|
122
|
+
- README.rdoc
|
123
|
+
- "--inline-source"
|
124
|
+
require_paths:
|
125
|
+
- lib
|
126
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
127
|
+
requirements:
|
128
|
+
- - ">="
|
129
|
+
- !ruby/object:Gem::Version
|
130
|
+
version: 1.9.3
|
131
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
132
|
+
requirements:
|
133
|
+
- - ">="
|
134
|
+
- !ruby/object:Gem::Version
|
135
|
+
version: '0'
|
136
|
+
requirements: []
|
137
|
+
rubyforge_project:
|
138
|
+
rubygems_version: 2.4.5
|
139
|
+
signing_key:
|
140
|
+
specification_version: 4
|
141
|
+
summary: A library to do record linkage
|
142
|
+
test_files: []
|