cijef-fullname-matcher 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +18 -0
- data/Gemfile +3 -0
- data/README.md +16 -0
- data/fullname-matcher.gemspec +17 -0
- data/lib/fullname.rb +4 -0
- data/lib/fullname/equivalence.rb +1264 -0
- data/lib/fullname/matcher.rb +15 -0
- data/lib/fullname/matcher/core.rb +208 -0
- data/lib/fullname/matcher/version.rb +5 -0
- metadata +66 -0
@@ -0,0 +1,15 @@
|
|
1
|
+
require 'fullname/matcher/core'
|
2
|
+
require 'fullname/matcher/version'
|
3
|
+
|
4
|
+
module Fullname
  module Matcher
    # Builds a Core matcher for +table+ and returns it.
    #
    # table   - object the searches run against; handed to Core.new unchanged.
    # mapping - column-name overrides; handed to Core.new unchanged.
    # options - matcher options; handed to Core.new unchanged.
    # blk     - optional block; when supplied it is called with the new
    #           matcher before the matcher is returned. The block's own
    #           return value is ignored.
    #
    # Returns the Core instance.
    def self.create(table, mapping = {}, options = {}, &blk)
      Core.new(table, mapping, options).tap do |matcher|
        blk.call(matcher) if blk
      end
    end
  end
end
|
15
|
+
|
@@ -0,0 +1,208 @@
|
|
1
|
+
# this class gets people, based on name match, from a table that stores firstname/middlename/lastname/suffix separately.
|
2
|
+
#
|
3
|
+
# try a series of searches (exact match first, then different variations) until success
|
4
|
+
# a lot of the logic/examples (eg. middlename handling, suffix handling, when to use abbreviation, when to use 'regexp') is commented inline
|
5
|
+
#
|
6
|
+
# public methods:
|
7
|
+
# . new(table, mapping={}, options = {})
|
8
|
+
# when constructing a name matcher using xxx = Fullname::Matcher.create(...) (or Fullname::Matcher::Core.new(...)), the first arg is the table where the search is executed in;
|
9
|
+
# default column mapping is {:first => 'first', :middle => 'middle', :last => 'last', :suffix => 'suffix'};
|
10
|
+
# - if the actual mapping is different, it should be provided as the second arg of new()
|
11
|
+
# - options:
|
12
|
+
# :skip_match_middle_name default is false
|
13
|
+
# . set_condition(c)
|
14
|
+
# if there's other condition (like "data_import_key = 'yyyy.mm.dd'") in search criteria, set it this way
|
15
|
+
#
|
16
|
+
# . get_matches
|
17
|
+
# if the name is one string, use get_matches(orig_name)
|
18
|
+
# if the name is in pieces, use get_matches(firstname, middlename, lastname, suffix)
|
19
|
+
# return ALL matches of the first successful search or [] if all searches fail
|
20
|
+
# . match_fullname
|
21
|
+
# alias of get_matches
|
22
|
+
# . names_match?(n1, style1, n2, style2)
|
23
|
+
# return true if two names (n1 and n2) are same; false otherwise
|
24
|
+
#
|
25
|
+
require 'fullname/parser'
|
26
|
+
require 'fullname/equivalence'
|
27
|
+
|
28
|
+
module Fullname
  module Matcher
    # Finds people by name in a table that stores first name, middle name,
    # last name and suffix in separate columns.
    #
    # The table object must respond to +where+ taking an array of the form
    # [sql_fragment, *bind_values]; the result must respond to +to_a+. Every
    # record returned must respond to the column names held in the mapping.
    class Core

      # Column names used for each name part unless overridden in #initialize.
      DEFAULT_MAPPING = { :first => 'first', :middle => 'middle', :last => 'last', :suffix => 'suffix' }.freeze

      DEFAULT_OPTIONS = {
        # when true, get_matches returns every first/last(/suffix) match
        # without looking at the middle name at all
        :skip_match_middle_name         => false,
        # when true, an exact middle-name hit does not end the search early and
        # records whose middle name is blank are accepted by the regexp step
        :null_middle_name_match_allowed => false,
        # when true, the suffix column is never read (use this when the suffix
        # is not provided or the table has no suffix column)
        :skip_match_suffix              => false
      }.freeze

      # Raised by get_matches when called with an unsupported argument count.
      class Error < StandardError; end

      # Effective options: DEFAULT_OPTIONS merged with the constructor argument.
      attr_accessor :options

      # table   - object the searches are run against (see class comment).
      # mapping - overrides for DEFAULT_MAPPING.
      # options - overrides for DEFAULT_OPTIONS.
      def initialize(table, mapping = {}, options = {})
        @table     = table
        @mapping   = DEFAULT_MAPPING.merge(mapping)
        @condition = nil
        @options   = DEFAULT_OPTIONS.merge(options)
      end

      # Sets an extra search condition. The string is wrapped in parentheses
      # and concatenated verbatim into the SQL fragment handed to +where+, so
      # it must be trusted SQL: it is NOT escaped or parameterized here.
      def set_condition(c)
        @condition = c
      end

      # Searches the table for a name, trying progressively looser rules.
      #
      # Call forms:
      #   get_matches(fullname)                              # parsed by Fullname::Parser
      #   get_matches(first, middle, last, suffix)
      #   get_matches(first, middle, last, suffix, options)  # options: {:recursive => bool}
      #
      # Search order:
      #   1. records matching last name, first name (or its initial) and suffix
      #   2. of those, records whose middle name and suffix are equal to the
      #      given ones (dots and case ignored)
      #   3. of those, records whose middle name matches by abbreviation
      #   4. of those, records whose middle name is NULL
      #   5. if nothing was found and :recursive is not false, the same search
      #      for every equivalent first name from Fullname::Equivalence
      #
      # Returns an Array of records ([] when the first or last name is blank or
      # nothing matches). Records found through several equivalent first names
      # are returned once.
      #
      # Raises Error when the argument count is not 1, 4 or 5.
      def get_matches(*args)
        match_options = {}
        case args.size
        when 1
          name = ::Fullname::Parser.parse_fullname(args[0])
        when 4, 5
          name = { :first => args[0], :middle => args[1], :last => args[2], :suffix => args[3] }
          match_options = args[4] if args.size == 5
        else
          raise Error, 'illegal arguments length of get_matches, must be the length of 1,4,5'
        end
        recursive = match_options.fetch(:recursive, true)

        # A blank first name would otherwise crash on name[:first][0] below.
        return [] if blank_value?(name[:first]) || blank_value?(name[:last])

        match_list = match_first_last_and_suffix(name)

        unless match_list.empty?
          # skip validating middlename if @options[:skip_match_middle_name] == true;
          # records with NULL and NON-NULL middle names are all returned
          return match_list if @options[:skip_match_middle_name]

          # 1. exact match. The suffix column is only read when suffix matching
          #    is enabled, so tables without that column keep working.
          exact_match_list = match_list.select do |r|
            compare_without_dot(r.send(@mapping[:middle]), name[:middle]) &&
              (@options[:skip_match_suffix] || compare_without_dot(r.send(@mapping[:suffix]), name[:suffix]))
          end
          return exact_match_list if !exact_match_list.empty? && @options[:null_middle_name_match_allowed] == false

          # 2.2 fuzzy match: assume all matches since name[:middle] is NULL
          return match_list unless name[:middle]

          # 2. name[:middle] is not NULL: regexp (abbreviation) match
          m_re = build_middlename_regexp(name[:middle])
          match_list_with_middlename = match_list.select do |r|
            r_middle_name = r.send(@mapping[:middle])
            (r_middle_name && r_middle_name =~ m_re) ||
              (blank_value?(r_middle_name) && @options[:null_middle_name_match_allowed])
          end
          return match_list_with_middlename unless match_list_with_middlename.empty?

          # 2.1 fuzzy match: if middlename in DB is NULL, it matches
          match_list_with_middlename = match_list.select { |r| r.send(@mapping[:middle]).nil? }
          return match_list_with_middlename unless match_list_with_middlename.empty?

          # clear match list if the middlename doesn't match
          match_list = []
        end

        # if nothing matches, try to search with equivalents of the first name
        if match_list.empty? && recursive
          firstname_array = ::Fullname::Equivalence.get_name_equivalence(name[:first])
          if firstname_array
            firstname_array.each do |n|
              # |= keeps a record only once even when several equivalents
              # (e.g. "Bob" and "Bobby" via the initial "B") find it
              match_list |= get_matches(n, name[:middle], name[:last], name[:suffix], { :recursive => false })
            end
          end
        end

        match_list
      end

      alias_method :match_fullname, :get_matches

      # Returns true if two names (n1 and n2) are the same, false otherwise.
      #
      # style = :short means the object exposes first/middle/last;
      # any other style means firstname/middlename/lastname.
      # Both objects must respond to +suffix+ unless :skip_match_suffix is set.
      #
      # Rules: first and last names are mandatory; last names must be equal
      # ignoring case and surrounding whitespace; suffixes must agree when both
      # are present; first names must abbr-match; middle names must abbr-match
      # unless one of them is nil.
      def names_match?(n1, style1, n2, style2)
        f1, m1, l1 = name_parts(n1, style1)
        f2, m2, l2 = name_parts(n2, style2)

        # first/last name have to be provided
        return false if l1.nil? || l2.nil? || f1.nil? || f2.nil?
        return false if l1.downcase.strip != l2.downcase.strip

        unless @options[:skip_match_suffix]
          s1 = n1.suffix
          s2 = n2.suffix
          return false if s1 && s2 && !compare_without_dot(s1, s2)
        end

        return false unless abbr_match?(f1, f2)
        m1.nil? || m2.nil? || abbr_match?(m1, m2)
      end

      # 2 strings are 'abbr-match'ed if the regexp built from str1 by
      # build_middlename_regexp matches str2, i.e. roughly when
      #  . they are the same, or
      #  . a word of one is just the initial of the corresponding word of the other
      #  ex: 'abc edf' abbr-matches 'a. e', 'abc e' or 'abc edf', but not 'abd e'
      #
      # Returns true or false.
      def abbr_match?(str1, str2)
        !!(build_middlename_regexp(str1) =~ str2)
      end

      private

      # Returns [first, middle, last] of +n+ using the accessor naming +style+.
      def name_parts(n, style)
        if style == :short
          [n.first, n.middle, n.last]
        else
          [n.firstname, n.middlename, n.lastname]
        end
      end

      # True for nil, '' and whitespace-only values (stdlib-only stand-in for
      # ActiveSupport's blank?, which this gem does not declare a dependency on).
      def blank_value?(value)
        value.to_s.strip.empty?
      end

      # Fetches the records matching the last name and first name, then narrows
      # them by suffix unless :skip_match_suffix is set.
      #
      # name - Hash with :first, :last and :suffix (first/last must be non-blank).
      #
      # Returns an Array of records.
      def match_first_last_and_suffix(name)
        conditions = []
        binds      = []
        # caller-supplied raw SQL, see set_condition
        conditions << '(' + @condition + ')' if @condition
        conditions << "#{@mapping[:last]} = ?"
        binds << name[:last]

        initial = name[:first][0].chr
        firstname_filter = nil
        if name[:first] =~ /^[a-z]\.?$/i
          # first name is an abbreviation: fetch all first names for this last
          # name, then filter with a Ruby regexp
          firstname_filter = %r{^#{initial}.*}i
        else
          # otherwise search directly in database, because search with regexp
          # in DB won't use the index
          conditions << "(#{@mapping[:first]} IN (?, ?, ?))"
          binds.push(name[:first], initial, initial + '.')
        end

        matched_list = @table.where([conditions.join(' AND ')] + binds).to_a
        if firstname_filter
          matched_list = matched_list.reject { |r| r.send(@mapping[:first]) !~ firstname_filter }
        end

        return matched_list if @options[:skip_match_suffix]

        # no suffix given (nil, '' or whitespace): every record qualifies
        return matched_list if blank_value?(name[:suffix])

        # exactly match suffix
        matched_list_with_suffix = matched_list.select do |r|
          compare_without_dot(r.send(@mapping[:suffix]), name[:suffix])
        end
        return matched_list_with_suffix unless matched_list_with_suffix.empty?

        # fuzzy match suffix ( NULL in DB matches NON-NULL given suffix )
        matched_list.select { |r| blank_value?(r.send(@mapping[:suffix])) }
      end

      # True when both values are equal after converting to String, removing
      # dots, downcasing and stripping surrounding whitespace (nil equals '').
      def compare_without_dot(str1, str2)
        normalized = [str1, str2].map { |s| s.to_s.delete('.').downcase.strip }
        normalized.first == normalized.last
      end

      # Builds a case-insensitive regexp that matches +middlename+ itself and
      # its abbreviated / expanded spellings. Examples of names that match
      # each other:
      #   Z M      | Z. M.      | Z.M.       | ZM (for single letters)
      #   Z Miller | Zoellner M | Zoellner Miller | K.Taylor
      #
      # middlename - String; words are separated by dots and/or spaces.
      #
      # Returns a Regexp anchored with ^ and $.
      def build_middlename_regexp(middlename)
        # split leaves a leading '' when the name starts with a separator
        # (" J", ".J"); drop it so m[0] below is never nil
        middle_arr = middlename.split(/[. ]+/).reject(&:empty?)
        tmp_reg = []

        if middle_arr.size > 1
          # leading words reduced to initials, last word spelled out
          leading = middle_arr[0..-2].map { |m| Regexp.escape(m[0].chr) + '[. ]+' }.join
          tmp_reg << leading + Regexp.escape(middle_arr.last) + '[.]?'
        end

        # every word: a single letter matches any word starting with it; a full
        # word matches itself, its initial, or its initial followed by a dot
        tmp_reg << middle_arr.map do |m|
          if m.size == 1
            Regexp.escape(m) + '\S*'
          else
            Regexp.escape(m[0].chr) + '(' + Regexp.escape(m[1..-1]) + '|[.])?'
          end
        end.join('[. ]+')

        Regexp.new("^(#{tmp_reg.join('|')})$", Regexp::IGNORECASE)
      end

    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,66 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: cijef-fullname-matcher
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 2.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- yegang
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2019-02-27 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: fullname-parser
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ">="
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: 1.0.3
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ">="
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: 1.0.3
|
27
|
+
description: Provide fullname, search in database with proper conditions
|
28
|
+
email:
|
29
|
+
- yegang.avvo@gmail.com
|
30
|
+
executables: []
|
31
|
+
extensions: []
|
32
|
+
extra_rdoc_files: []
|
33
|
+
files:
|
34
|
+
- ".gitignore"
|
35
|
+
- Gemfile
|
36
|
+
- README.md
|
37
|
+
- fullname-matcher.gemspec
|
38
|
+
- lib/fullname.rb
|
39
|
+
- lib/fullname/equivalence.rb
|
40
|
+
- lib/fullname/matcher.rb
|
41
|
+
- lib/fullname/matcher/core.rb
|
42
|
+
- lib/fullname/matcher/version.rb
|
43
|
+
homepage: https://github.com/yegang90/cijef-fullname-matcher
|
44
|
+
licenses: []
|
45
|
+
metadata: {}
|
46
|
+
post_install_message:
|
47
|
+
rdoc_options: []
|
48
|
+
require_paths:
|
49
|
+
- lib
|
50
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
56
|
+
requirements:
|
57
|
+
- - ">="
|
58
|
+
- !ruby/object:Gem::Version
|
59
|
+
version: '0'
|
60
|
+
requirements: []
|
61
|
+
rubyforge_project:
|
62
|
+
rubygems_version: 2.5.2
|
63
|
+
signing_key:
|
64
|
+
specification_version: 4
|
65
|
+
summary: Match fullname in database
|
66
|
+
test_files: []
|