record_linkage 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 287d2406379da16f1c2c6b31f15f041d8dd702db
4
+ data.tar.gz: c52cec061500568bae597814daf2da9931778aa6
5
+ SHA512:
6
+ metadata.gz: 3cae10fbf8077fe9c8e8b2f360aab56bb33d180145478ef26d274b721e65dcd5671c8b8ceca8144ec13bef7b619f308802452e20f105cd0a648146f3d53b4bf8
7
+ data.tar.gz: 18be3cc9528c859746f68ecc3bd461efea5e28df5b52e86eb684f5cc89e344b1b6464fa1586af7ad8c9cb6f004100476716a4831b0efbc78672246fc9da2d6f4
data/Gemfile ADDED
@@ -0,0 +1,12 @@
1
# Gems are resolved from the public RubyGems index.
source 'https://rubygems.org'

# Pull in the dependencies declared in the gem's own gemspec.
gemspec

# Gems that are only needed when running the test suite.
group 'test' do
  # Coverage reporters are loaded explicitly by the test setup.
  %w(coveralls simplecov-html).each { |name| gem name, require: false }

  %w(rake rspec rspec-its).each { |name| gem name }

  gem 'guard-rspec', require: false
end
@@ -0,0 +1,2 @@
1
+
2
+ A library for performing record linkage, for example as part of Master Data Management (MDM).
@@ -0,0 +1,122 @@
1
+ require 'fuzzystringmatch'
2
+
3
+ module RecordLinkage
4
+ # Can be used to create an object with certain rules which can then
5
+ # be used to compare objects to each other
6
+ class ObjectComparer
7
+ # Matcher objects represent how to handle individual rules
8
+ # to compare one field to another
9
+ class Matcher
10
+ # A module to hold the various matcher logic which can be declared
11
+ # with a String or a Symbol
12
+ module Matchers
13
+ JAROW = ::FuzzyStringMatch::JaroWinkler.create(:native)
14
+
15
# Scores two values by their Jaro-Winkler similarity.
#
# Both values are coerced with `to_s`, stripped and downcased before they
# are compared, so nil, Symbols, numbers etc. are accepted as input.
#
# @param value1 [#to_s] first value to compare
# @param value2 [#to_s] second value to compare
# @param options [Hash] `:threshold` is the score that must be exceeded
#   for the match to count; a missing or nil threshold is treated as 0.0
# @return [Float] the score reported by JAROW when it is above the
#   threshold, otherwise 0.0 (also 0.0 when either stripped value is
#   shorter than three characters)
def self.fuzzy_string_matcher(value1, value2, options = {})
  text1 = value1.to_s.strip
  text2 = value2.to_s.strip
  # Very short strings are never scored.
  return 0.0 if text1.size < 3 || text2.size < 3

  threshold = options[:threshold] || 0.0
  score = JAROW.getDistance(text1.downcase, text2.downcase)
  score > threshold ? score : 0.0
end
23
+
24
# Case-insensitive, whitespace-trimmed equality check.
#
# @param value1 [#to_s] first value to compare
# @param value2 [#to_s] second value to compare
# @param _options [Hash] accepted for a uniform matcher signature; unused
# @return [Float] 1.0 when the normalized values are equal and non-empty,
#   0.0 otherwise
def self.exact_string_matcher(value1, value2, _options = {})
  left, right = [value1, value2].map { |value| value.to_s.strip.downcase }

  # Blank values never count as a match, even against each other.
  return 0.0 if left.size < 1

  left == right ? 1.0 : 0.0
end
29
+
30
# Applies `fuzzy_string_matcher` to every pairing of elements from the two
# collections by delegating to `array_matcher`.
#
# @param array1 [Array] values from the first object
# @param array2 [Array] values from the second object
# @param options [Hash] passed through unchanged to the matcher
# @return whatever `array_matcher` returns for the given inputs
def self.array_fuzzy_string_matcher(array1, array2, options = {})
  single_matcher = :fuzzy_string_matcher
  array_matcher(single_matcher, array1, array2, options)
end
33
+
34
# Applies `exact_string_matcher` to every pairing of elements from the two
# collections by delegating to `array_matcher`.
#
# @param array1 [Array] values from the first object
# @param array2 [Array] values from the second object
# @param options [Hash] passed through unchanged to the matcher
# @return whatever `array_matcher` returns for the given inputs
def self.array_exact_string_matcher(array1, array2, options = {})
  single_matcher = :exact_string_matcher
  array_matcher(single_matcher, array1, array2, options)
end
37
+
38
# Runs a single-value matcher over every pairing of elements from two
# collections and adds up the resulting scores.
#
# @param single_matcher [Symbol, String] name of the matcher method to
#   invoke on this object for each pair
# @param array1 [Array, Object, nil] values from the first object; nil is
#   treated as empty and a lone value as a one-element collection
# @param array2 [Array, Object, nil] values from the second object
# @param options [Hash] passed through unchanged to the matcher
# @return [Numeric] sum of all pairwise scores (0 when there are no pairs)
def self.array_matcher(single_matcher, array1, array2, options = {})
  left_values = Array(array1)
  right_values = Array(array2)

  scores = left_values.map do |value1|
    right_values.map do |value2|
      send(single_matcher, value1, value2, options)
    end
  end.flatten

  # Array#sum only exists on Ruby 2.4+; fall back to a plain fold on older
  # interpreters so the method works there as well.
  scores.respond_to?(:sum) ? scores.sum : scores.reduce(0, :+)
end
45
+ end
46
+
47
+ attr_reader :property1, :property2
48
+
49
# Builds a matcher that compares one property of a first object with one
# property of a second object.
#
# @param property1 [Symbol, String] method name read from the first object
# @param property2 [Symbol, String] method name read from the second object
# @param definition [String, Symbol, Proc] resolved to a callable via
#   `match_block_from_definition`
# @param options [Hash] per-matcher settings; `:threshold` and `:weight`
#   are read when scoring
def initialize(property1, property2, definition, options = {})
  @property1, @property2 = property1, property2
  @options = options
  @block = self.class.match_block_from_definition(definition)
end
55
+
56
# Scores a pair of objects with this matcher.
#
# The configured properties are read from each object, handed to the
# matcher block together with the effective threshold, and the block's
# result is multiplied by the effective weight.
#
# @param object1 [Object] object responding to the first property
# @param object2 [Object] object responding to the second property
# @param default_threshold [Numeric] used when no `:threshold` option is set
# @param default_weight [Numeric] used when no `:weight` option is set
# @return [Numeric] weighted score
def score_objects(object1, object2, default_threshold, default_weight)
  left = object1.send(@property1)
  right = object2.send(@property2)

  # Per-matcher options take precedence over the comparer-wide defaults.
  effective_threshold = @options[:threshold] || default_threshold
  effective_weight = @options[:weight] || default_weight

  raw_score = @block.call(left, right, threshold: effective_threshold)
  raw_score * effective_weight
end
65
+
66
# Resolves a matcher definition into a callable object.
#
# @param definition [String, Symbol, Proc] either the name of a matcher
#   (looked up as `<definition>_matcher` on Matchers) or a Proc that is
#   used as-is
# @return [Method, Proc] callable taking two values and an options hash
# @raise [ArgumentError] when the named matcher is not defined on Matchers,
#   or when the definition is not a String, Symbol or Proc
def self.match_block_from_definition(definition)
  case definition
  when Proc then definition
  when String, Symbol
    matcher_name = "#{definition}_matcher"
    unless Matchers.respond_to?(matcher_name)
      raise ArgumentError, "Matcher `#{definition}` is not defined"
    end

    Matchers.method(matcher_name)
  else
    # Report the offending definition itself in the error message.
    raise ArgumentError, "Invalid matcher definition: #{definition.inspect}"
  end
end
79
+ end
80
+
81
+ # Object given to `initialize` block to allow API
82
+ # for configuring matchers / values
83
# Holds the comparer-wide defaults and the list of configured matchers.
class Config
  # Fallback threshold / weight for matchers that do not set their own.
  attr_accessor :default_threshold, :default_weight

  # Registers a new matcher comparing `property1` of the first object
  # with `property2` of the second object.
  #
  # @param property1 [Symbol, String] property read from the first object
  # @param property2 [Symbol, String] property read from the second object
  # @param definition [String, Symbol, Proc] matcher name or custom block
  # @param options [Hash] per-matcher options passed to Matcher.new
  # @return [Array] the list of matchers including the new one
  def add_matcher(property1, property2, definition, options = {})
    matchers << Matcher.new(property1, property2, definition, options)
  end

  # @return [Array] matchers added so far (lazily initialized to empty)
  def matchers
    @matchers ||= []
  end
end
95
+
96
# Creates a comparer and, when a block is given, yields its configuration
# object so matchers and defaults can be set up.
#
# The block is optional: without one the comparer starts with whatever
# `config` provides and can still be configured afterwards through it.
#
# @yieldparam config the object returned by `config`
def initialize
  yield config if block_given?
end
99
+
100
# Returns this comparer's configuration, creating it on first access.
#
# @return [Config] the same instance on every call
def config
  @config || (@config = Config.new)
end
103
+
104
# Scores two objects with every configured matcher.
#
# @param object1 [Object] first object to compare
# @param object2 [Object] second object to compare
# @return [Hash] maps `[property1, property2]` of each matcher to the
#   score that matcher produced for the two objects
def classify_hash(object1, object2)
  scored_pairs = config.matchers.map do |matcher|
    key = [matcher.property1, matcher.property2]
    score = matcher.score_objects(object1, object2,
                                  default_threshold, default_weight)
    [key, score]
  end

  Hash[scored_pairs]
end
113
+
114
# Threshold applied to matchers that do not define their own.
#
# @return [Numeric] the configured default threshold, or 0.0 when unset
def default_threshold
  configured = config.default_threshold
  configured ? configured : 0.0
end
117
+
118
# Weight applied to matchers that do not define their own.
#
# @return [Numeric] the configured default weight, or 1.0 when unset
def default_weight
  configured = config.default_weight
  configured ? configured : 1.0
end
121
+ end
122
+ end
@@ -0,0 +1,3 @@
1
module RecordLinkage
  # Version of the gem; frozen so it cannot be mutated at runtime.
  VERSION = '0.0.1'.freeze
end
@@ -0,0 +1,46 @@
1
+ lib = File.expand_path('../lib/', __FILE__)
2
+ $LOAD_PATH.unshift lib unless $LOAD_PATH.include?(lib)
3
+
4
+ require 'record_linkage/version'
5
+
6
# Gem specification for record_linkage.
Gem::Specification.new do |s|
  s.name     = 'record_linkage'
  s.version  = RecordLinkage::VERSION
  s.required_ruby_version = '>= 1.9.3'

  s.authors  = 'Brian Underwood'
  s.email    = 'public@brian-underwood.codes'
  s.homepage = 'https://github.com/cheerfulstoic/record_linkage'
  s.summary  = <<SUMMARY
A library to do record linkage
SUMMARY

  s.license = 'MIT'

  s.description = <<DESCRIPTION
A library to do record linkage

Comparing objects to determine which are the same
DESCRIPTION

  s.require_path = 'lib'
  s.files = Dir.glob('{bin,lib,config}/**/*') +
            %w(README.md Gemfile record_linkage.gemspec)
  s.extra_rdoc_files = %w( README.md )
  # `--title` and `--main` each take a value; the main page must be a file
  # that is actually packaged (README.md is listed in extra_rdoc_files).
  s.rdoc_options = [
    '--quiet',
    '--title', 'record_linkage Documentation',
    '--line-numbers',
    '--main', 'README.md',
    '--inline-source'
  ]

  s.add_dependency('fuzzy-string-match')

  s.add_development_dependency('pry')
  s.add_development_dependency('simplecov')
  s.add_development_dependency('guard')
  s.add_development_dependency('guard-rubocop')
  s.add_development_dependency('rubocop')
end
metadata ADDED
@@ -0,0 +1,142 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: record_linkage
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Brian Underwood
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2015-07-08 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: fuzzy-string-match
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: pry
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: simplecov
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: guard
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: guard-rubocop
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: rubocop
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
97
+ description: |
98
+ A library to do record linkage
99
+
100
+ Comparing objects to determine which are the same
101
+ email: public@brian-underwood.codes
102
+ executables: []
103
+ extensions: []
104
+ extra_rdoc_files:
105
+ - README.md
106
+ files:
107
+ - Gemfile
108
+ - README.md
109
+ - lib/record_linkage/object_comparer.rb
110
+ - lib/record_linkage/version.rb
111
+ - record_linkage.gemspec
112
+ homepage: https://github.com/cheerfulstoic/record_linkage
113
+ licenses:
114
+ - MIT
115
+ metadata: {}
116
+ post_install_message:
117
+ rdoc_options:
118
+ - "--quiet"
119
+ - "--title"
120
+ - "--line-numbers"
121
+ - "--main"
122
+ - README.rdoc
123
+ - "--inline-source"
124
+ require_paths:
125
+ - lib
126
+ required_ruby_version: !ruby/object:Gem::Requirement
127
+ requirements:
128
+ - - ">="
129
+ - !ruby/object:Gem::Version
130
+ version: 1.9.3
131
+ required_rubygems_version: !ruby/object:Gem::Requirement
132
+ requirements:
133
+ - - ">="
134
+ - !ruby/object:Gem::Version
135
+ version: '0'
136
+ requirements: []
137
+ rubyforge_project:
138
+ rubygems_version: 2.4.5
139
+ signing_key:
140
+ specification_version: 4
141
+ summary: A library to do record linkage
142
+ test_files: []