record_linkage 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 287d2406379da16f1c2c6b31f15f041d8dd702db
4
+ data.tar.gz: c52cec061500568bae597814daf2da9931778aa6
5
+ SHA512:
6
+ metadata.gz: 3cae10fbf8077fe9c8e8b2f360aab56bb33d180145478ef26d274b721e65dcd5671c8b8ceca8144ec13bef7b619f308802452e20f105cd0a648146f3d53b4bf8
7
+ data.tar.gz: 18be3cc9528c859746f68ecc3bd461efea5e28df5b52e86eb684f5cc89e344b1b6464fa1586af7ad8c9cb6f004100476716a4831b0efbc78672246fc9da2d6f4
data/Gemfile ADDED
@@ -0,0 +1,12 @@
1
# Gems are resolved from the public RubyGems index.
source 'https://rubygems.org'

# Bundler directive that pulls in the dependencies declared in the
# project's gemspec.
gemspec

# Gems declared only for the 'test' group. Entries marked
# `require: false` are installed but not auto-required by Bundler.
group 'test' do
  gem 'coveralls', require: false
  gem 'simplecov-html', require: false
  gem 'rake'
  gem 'rspec'
  gem 'rspec-its'
  gem 'guard-rspec', require: false
end
@@ -0,0 +1,2 @@
1
+
2
+ A library for doing record linkage, as in Master Data Management.
@@ -0,0 +1,122 @@
1
+ require 'fuzzystringmatch'
2
+
3
+ module RecordLinkage
4
+ # Can be used to create an object with certain rules which can then
5
+ # be used to compare objects to each other
6
+ class ObjectComparer
7
+ # Matcher objects represent how to handle individual rules
8
+ # to compare one field to another
9
+ class Matcher
10
+ # A module to hold the various matcher logic which can be declared
11
+ # with a String or a Symbol
12
+ module Matchers
13
+ JAROW = ::FuzzyStringMatch::JaroWinkler.create(:native)
14
+
15
# Scores two values by Jaro-Winkler similarity, ignoring case and
# surrounding whitespace.
#
# Both values are coerced with `to_s`, so nil and non-String values are
# accepted. When either normalized value is shorter than 3 characters
# the score is 0.0. Otherwise the similarity computed by JAROW is
# returned if it is strictly greater than options[:threshold] (treated
# as 0.0 when absent or nil), and 0.0 if it is not.
def self.fuzzy_string_matcher(value1, value2, options = {})
  # Normalize once so the length guard and the comparison see the same text
  string1 = value1.to_s.strip.downcase
  string2 = value2.to_s.strip.downcase
  return 0.0 if string1.size < 3 || string2.size < 3

  # A missing threshold must not make the comparison below raise
  threshold = options[:threshold] || 0.0
  score = JAROW.getDistance(string1, string2)
  score > threshold ? score : 0.0
end
23
+
24
# Compares two values as strings, ignoring case and surrounding
# whitespace. Returns 1.0 when the normalized strings are non-empty and
# equal, otherwise 0.0. The options argument is accepted but not used.
def self.exact_string_matcher(value1, value2, _options = {})
  left, right = [value1, value2].map { |raw| raw.to_s.strip.downcase }

  # Two blank values are never treated as a match
  return 0.0 if left.size < 1

  left == right ? 1.0 : 0.0
end
29
+
30
# Sums fuzzy_string_matcher scores over every pairing of a value from
# array1 with a value from array2.
def self.array_fuzzy_string_matcher(array1, array2, options = {})
  array_matcher(:fuzzy_string_matcher, array1, array2, options)
end

# Sums exact_string_matcher scores over every pairing of a value from
# array1 with a value from array2.
def self.array_exact_string_matcher(array1, array2, options = {})
  array_matcher(:exact_string_matcher, array1, array2, options)
end

# Calls the matcher method named by single_matcher (via `send`) for every
# pairing of a value from array1 with a value from array2, passing
# options through, and returns the sum of the scores.
#
# Both collections go through Array(), so nil counts as empty and a
# single value counts as a one-element list. With no pairings the
# result is 0.
def self.array_matcher(single_matcher, array1, array2, options = {})
  candidates = Array(array2)
  scores = Array(array1).flat_map do |value1|
    candidates.map { |value2| send(single_matcher, value1, value2, options) }
  end

  # inject instead of Array#sum, which is only available from Ruby 2.4
  scores.inject(0, :+)
end
45
+ end
46
+
47
+ attr_reader :property1, :property2
48
+
49
# Sets up a rule that compares `property1` of one object with
# `property2` of another.
#
# `definition` is resolved to a callable through
# match_block_from_definition; `options` is stored as given and is read
# for :threshold and :weight when scoring.
def initialize(property1, property2, definition, options = {})
  @property1, @property2 = property1, property2
  @options = options
  @block = self.class.match_block_from_definition(definition)
end
55
+
56
# Scores a pair of objects under this rule.
#
# Reads the configured property from each object via `send`, calls the
# match block with the two values and a `threshold:` option, and
# multiplies the result by the weight. The rule's own :threshold and
# :weight options take precedence over the defaults passed in.
def score_objects(object1, object2, default_threshold, default_weight)
  left = object1.send(@property1)
  right = object2.send(@property2)

  effective_threshold = @options[:threshold] || default_threshold
  effective_weight = @options[:weight] || default_weight

  raw_score = @block.call(left, right, threshold: effective_threshold)
  raw_score * effective_weight
end
65
+
66
# Resolves a matcher definition into a callable.
#
# A String or Symbol names a method on Matchers without its `_matcher`
# suffix (e.g. :exact_string); the corresponding Method object is
# returned. A Proc is returned unchanged. A name Matchers does not
# respond to, or any other kind of definition, fails with ArgumentError.
def self.match_block_from_definition(definition)
  case definition
  when String, Symbol
    matcher_name = "#{definition}_matcher"
    unless Matchers.respond_to?(matcher_name)
      fail ArgumentError, "Matcher `#{definition}` is not defined"
    end

    Matchers.method(matcher_name)
  when Proc then definition
  else
    # Report the offending definition itself in the message
    fail ArgumentError, "Invalid matcher definition: #{definition.inspect}"
  end
end
79
+ end
80
+
81
# Object given to the `initialize` block to allow an API
# for configuring matchers / values.
#
# `default_threshold` and `default_weight` are plain accessors and are
# nil until assigned. `matchers` is the list built up by add_matcher.
class Config
  attr_accessor :default_threshold, :default_weight

  # Builds a Matcher from the arguments and appends it to `matchers`.
  # Returns the matchers list.
  def add_matcher(property1, property2, definition, options = {})
    matchers << Matcher.new(property1, property2, definition, options)
  end

  # The matchers added so far; created empty on first access.
  def matchers
    @matchers ||= []
  end
end
95
+
96
# Creates a comparer. When a block is given it is yielded the Config
# object so matchers and defaults can be declared up front; without a
# block the comparer starts with an untouched Config, which stays
# reachable through `config`.
def initialize
  yield config if block_given?
end
99
+
100
# Returns this comparer's Config, creating it on first use and
# returning the same instance afterwards.
def config
  return @config if @config

  @config = Config.new
end
103
+
104
# Scores object1 against object2 with every configured matcher.
#
# Returns a Hash keyed by each matcher's [property1, property2] pair,
# whose values are the results of that matcher's score_objects called
# with the comparer's default threshold and weight.
def classify_hash(object1, object2)
  scores = {}

  config.matchers.each do |matcher|
    pair = [matcher.property1, matcher.property2]
    scores[pair] = matcher.score_objects(object1, object2,
                                         default_threshold, default_weight)
  end

  scores
end
113
+
114
# Threshold passed to matchers that do not set their own: the configured
# default_threshold, or 0.0 when none has been set.
def default_threshold
  configured = config.default_threshold
  return configured if configured

  0.0
end
117
+
118
# Weight passed to matchers that do not set their own: the configured
# default_weight, or 1.0 when none has been set.
def default_weight
  configured = config.default_weight
  return configured if configured

  1.0
end
121
+ end
122
+ end
@@ -0,0 +1,3 @@
1
module RecordLinkage
  # Version of the gem; frozen so the shared String cannot be mutated
  # in place.
  VERSION = '0.0.1'.freeze
end
@@ -0,0 +1,46 @@
1
+ lib = File.expand_path('../lib/', __FILE__)
2
+ $LOAD_PATH.unshift lib unless $LOAD_PATH.include?(lib)
3
+
4
+ require 'record_linkage/version'
5
+
6
# Gem specification for record_linkage. The version comes from
# RecordLinkage::VERSION, which must already be loaded.
Gem::Specification.new do |s|
  s.name = 'record_linkage'
  s.version = RecordLinkage::VERSION
  s.required_ruby_version = '>= 1.9.3'

  s.authors = 'Brian Underwood'
  s.email = 'public@brian-underwood.codes'
  s.homepage = 'https://github.com/cheerfulstoic/record_linkage'
  s.summary = <<SUMMARY
A library to do record linkage
SUMMARY

  s.license = 'MIT'

  s.description = <<DESCRIPTION
A library to do record linkage

Comparing objects to determine which are the same
DESCRIPTION

  s.require_path = 'lib'
  s.files = Dir.glob('{bin,lib,config}/**/*') +
            %w(README.md Gemfile record_linkage.gemspec)
  # `has_rdoc=` is deprecated and ignored by RubyGems, so it is not set.
  s.extra_rdoc_files = %w( README.md )
  # `--title` and `--main` each take a value; `--main` names the README
  # that is shipped in extra_rdoc_files.
  s.rdoc_options = [
    '--quiet',
    '--title', 'record_linkage',
    '--line-numbers',
    '--main', 'README.md',
    '--inline-source']

  s.add_dependency('fuzzy-string-match')

  s.add_development_dependency('pry')
  s.add_development_dependency('simplecov')
  s.add_development_dependency('guard')
  s.add_development_dependency('guard-rubocop')
  s.add_development_dependency('rubocop')
end
metadata ADDED
@@ -0,0 +1,142 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: record_linkage
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Brian Underwood
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2015-07-08 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: fuzzy-string-match
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: pry
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: simplecov
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: guard
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: guard-rubocop
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: rubocop
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
97
+ description: |
98
+ A library to do record linkage
99
+
100
+ Comparing objects to determine which are the same
101
+ email: public@brian-underwood.codes
102
+ executables: []
103
+ extensions: []
104
+ extra_rdoc_files:
105
+ - README.md
106
+ files:
107
+ - Gemfile
108
+ - README.md
109
+ - lib/record_linkage/object_comparer.rb
110
+ - lib/record_linkage/version.rb
111
+ - record_linkage.gemspec
112
+ homepage: https://github.com/cheerfulstoic/record_linkage
113
+ licenses:
114
+ - MIT
115
+ metadata: {}
116
+ post_install_message:
117
+ rdoc_options:
118
+ - "--quiet"
119
+ - "--title"
120
+ - "--line-numbers"
121
+ - "--main"
122
+ - README.rdoc
123
+ - "--inline-source"
124
+ require_paths:
125
+ - lib
126
+ required_ruby_version: !ruby/object:Gem::Requirement
127
+ requirements:
128
+ - - ">="
129
+ - !ruby/object:Gem::Version
130
+ version: 1.9.3
131
+ required_rubygems_version: !ruby/object:Gem::Requirement
132
+ requirements:
133
+ - - ">="
134
+ - !ruby/object:Gem::Version
135
+ version: '0'
136
+ requirements: []
137
+ rubyforge_project:
138
+ rubygems_version: 2.4.5
139
+ signing_key:
140
+ specification_version: 4
141
+ summary: A library to do record linkage
142
+ test_files: []