icu_name 0.0.3 → 0.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/icu_name/name.rb +230 -0
- data/lib/icu_name/util.rb +19 -0
- data/lib/icu_name/version.rb +1 -1
- data/lib/icu_name.rb +2 -230
- data/spec/{icu_name_spec.rb → name_spec.rb} +44 -31
- data/spec/util_spec.rb +37 -0
- metadata +7 -4
@@ -0,0 +1,230 @@
|
|
1
|
+
module ICU
  # Canonicalises a person's name into first and last parts and compares it
  # with other names, tolerating initials, common nicknames, hyphenation and
  # Mc/Mac spelling differences.
  #
  # Only the characters a-z, A-Z, hyphen, apostrophe, full stop and whitespace
  # survive cleaning; anything else (including accented letters) is removed.
  class Name
    attr_reader :first, :last

    # Construct from one or two strings or any objects that have a to_s method.
    #
    # With one argument the string is split into first and last names, either
    # "Last, First" (when it contains a comma) or "First Last". With two
    # arguments they are taken as the first and last names respectively.
    def initialize(name1='', name2='')
      # Work on private copies because cleaning edits these strings in place.
      @name1 = name1.to_s.dup
      @name2 = name2.to_s.dup
      canonicalize
    end

    # Return a complete name, first name first, no comma.
    def name
      [@first, @last].reject { |part| part.empty? }.join(' ')
    end

    # Return a reversed complete name, first name last after a comma.
    def rname
      [@last, @first].reject { |part| part.empty? }.join(', ')
    end

    # Convert object to a string (the same as rname).
    def to_s
      rname
    end

    # Match another name to this object, returning true or false.
    # Accepts the same arguments as the constructor. Neither this object nor
    # the arguments are modified.
    def match(name1='', name2='')
      other = Name.new(name1, name2)
      match_first(first, other.first) && match_last(last, other.last)
    end

    private

    # Canonicalise the first and last names.
    def canonicalize
      first, last = partition
      @first = finish_first(first)
      @last  = finish_last(last)
    end

    # Split the input into cleaned first and last parts, returned as
    # [first, last].
    def partition
      if @name2.length == 0
        # Only one input so we must split first and last.
        parts = @name1.split(/,/)
        if parts.size > 1
          # "Last, First ..." form: everything after the first comma is first name.
          last  = clean(parts.shift || '')
          first = clean(parts.join(' '))
        else
          # "First ... Last" form: the final word is the last name.
          parts = clean(@name1).split(/ /)
          last  = parts.pop || ''
          first = parts.join(' ')
        end
      else
        # Two inputs, so we are given first and last.
        first = clean(@name1)
        last  = clean(@name2)
      end
      [first, last]
    end

    # Clean up characters in any name and return a new string in which every
    # word (and every hyphen-separated part of a word) is capitalised.
    #
    # The argument itself is edited in place by the character-level
    # substitutions; finish_last later inspects @name2 in that state.
    def clean(name)
      name.gsub!(/`/, "'")
      name.gsub!(/[^-a-zA-Z.'\s]/, '')
      name.gsub!(/\./, ' ')
      name.gsub!(/\s*-\s*/, '-')
      name.gsub!(/'+/, "'")
      name.strip.downcase.split(/\s+/).map do |word|
        word.sub!(/^-+/, '')
        word.sub!(/-+$/, '')
        # Non-destructive capitalize: capitalize! returns nil when the part is
        # unchanged (e.g. it starts with an apostrophe), which would silently
        # drop that part from the name.
        word.split(/-/).map { |part| part.capitalize }.join('-')
      end.join(' ')
    end

    # Apply final touches to finish canonicalising a first name:
    # a capital letter at the end of a word gets a full stop (J -> J.).
    def finish_first(names)
      names.gsub(/([A-Z])\b/, '\1.')
    end

    # Apply final touches to finish canonicalising a last name
    # (O'Reilly, McManus, MacManus, "O Reilly" -> "O'Reilly").
    def finish_last(names)
      names = names.gsub(/\b([A-Z])'([a-z])/) { "#{$1}'#{$2.upcase}" }
      names = names.gsub(/\bMc([a-z])/) { "Mc#{$1.upcase}" }
      names = names.gsub(/\bMac([a-z])/) do
        # Capture the letter first: the match below overwrites $1.
        letter = $1
        # Keep the letter in lower case only if the supplied last name already
        # had it in lower case after "Mac" (e.g. Macieja), otherwise upcase it.
        keep_case = @name2.match("[mM][aA][cC]#{letter}")
        'Mac' + (keep_case ? letter : letter.upcase)
      end
      names.gsub(/\bO ([A-Z])/) { "O'#{$1}" }
    end

    # Match a complete first name.
    def match_first(first1, first2)
      # Is this one a walk in the park?
      return true if first1 == first2

      # No easy ride. Begin by splitting into individual first names.
      first1 = split_first(first1)
      first2 = split_first(first2)

      # Get the long list and the short list.
      long, short = first1.size >= first2.size ? [first1, first2] : [first2, first1]

      # The short one must be a "subset" of the long one (in order).
      # An extra condition must also be satisfied: at least one match must be
      # either in first position or a full (non-initial) match.
      extra = false
      long.each_with_index do |lword, i|
        score = match_first_name(lword, short.first)
        if score >= 0
          short.shift
          extra = true if i == 0 || score == 0
        end
        break if short.empty?
      end

      # There's a match if the following is true.
      short.empty? && extra
    end

    # Match a complete last name, ignoring case, Mac/Mc spelling and hyphens.
    # Works on copies so the callers' strings are left untouched.
    def match_last(last1, last2)
      return true if last1 == last2
      normalised = [last1, last2].map do |last|
        last.downcase.             # MacDonaugh and Macdonaugh
          gsub(/\bmac/, 'mc').     # MacDonaugh and McDonaugh
          tr('-', ' ')             # Lowry-O'Reilly and Lowry O'Reilly
      end
      normalised[0] == normalised[1]
    end

    # Split a complete first name for matching, returning a new array and
    # leaving the argument untouched.
    def split_first(first)
      parts = first.tr('-', ' ').split(/ /)  # J. K. and J.-K.
      parts.empty? ? [''] : parts            # in case input was empty string
    end

    # Match individual first names or initials.
    # -1 = no match
    #  0 = full match
    #  1 = match involving 1 initial
    #  2 = match involving 2 initials
    def match_first_name(first1, first2)
      initials = 0
      initials+= 1 if first1.match(/^[A-Z]\.?$/)
      initials+= 1 if first2.match(/^[A-Z]\.?$/)
      return initials if first1 == first2
      return 0 if initials == 0 && match_nick_name(first1, first2)
      return -1 unless initials > 0
      return initials if first1[0] == first2[0]
      -1
    end

    # Match two first names that might be equivalent nicknames.
    def match_nick_name(nick1, nick2)
      compile_nick_names unless @@nc
      code1 = @@nc[nick1]
      return false unless code1
      code1 == @@nc[nick2]
    end

    # Compile the nick names code hash when matching nick names is first attempted.
    # Every name on the same data line receives the same integer code.
    def compile_nick_names
      @@nc = Hash.new
      code = 1
      @@nl.each do |nicks|
        nicks.each do |n|
          # A name listed twice is a data error: raise (not throw, which is
          # for catch/throw control flow).
          raise "duplicate name #{n}" if @@nc[n]
          @@nc[n] = code
        end
        code+= 1
      end
    end

    # An array of data for matching nicknames and also a few common misspellings.
    # Each line is one group of equivalent names.
    @@nc = nil
    @@nl = <<-EOF.split(/\n/).reject{|x| x.length == 0 }.map{|x| x.split(' ')}
      Abdul Abul
      Alexander Alex
      Anandagopal Ananda
      Anne Ann
      Anthony Tony
      Benjamin Ben
      Catherine Cathy Cath
      Daniel Danial Danny Dan
      David Dave
      Deborah Debbie
      Des Desmond
      Eamonn Eamon
      Edward Eddie Ed
      Eric Erick Erik
      Frederick Frederic Fred
      Gerald Gerry
      Gerhard Gerard Ger
      James Jim
      Joanna Joan Joanne
      John Johnny
      Jonathan Jon
      Kenneth Ken Kenny
      Michael Mike Mick Micky
      Nicholas Nick Nicolas
      Nicola Nickie Nicky
      Patrick Pat Paddy
      Peter Pete
      Philippe Philip Phillippe Phillip
      Rick Ricky
      Robert Bob Bobby
      Samual Sam Samuel
      Stefanie Stef
      Stephen Steven Steve
      Terence Terry
      Thomas Tom Tommy
      William Will Willy Willie Bill
    EOF
  end
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
module ICU
  # Helpers for detecting and normalising string encodings.
  class Util
    # Decide if a string is valid UTF-8 or not, returning true or false.
    # The bytes of a copy are reinterpreted as UTF-8 and checked for validity;
    # the argument is not modified.
    def self.is_utf8(str)
      str.dup.force_encoding("UTF-8").valid_encoding?
    end

    # Try to convert any string to UTF-8, returning a new string.
    #
    # Bytes that already form valid UTF-8 are simply relabelled. Otherwise a
    # string tagged as binary (ASCII-8BIT) or UTF-8 is reinterpreted as
    # Windows-1252 before being transcoded; any other encoding is transcoded
    # from whatever it is tagged as.
    def self.to_utf8(str)
      copy = str.dup
      return copy.force_encoding("UTF-8") if is_utf8(str)

      untagged = [Encoding::ASCII_8BIT, Encoding::UTF_8].include?(copy.encoding)
      copy.force_encoding("Windows-1252") if untagged
      copy.encode("UTF-8")
    end
  end
end
|
data/lib/icu_name/version.rb
CHANGED
data/lib/icu_name.rb
CHANGED
@@ -1,230 +1,2 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
attr_reader :first, :last
|
4
|
-
|
5
|
-
# Construct from one or two strings or any objects that have a to_s method.
|
6
|
-
def initialize(name1='', name2='')
|
7
|
-
@name1 = name1.to_s
|
8
|
-
@name2 = name2.to_s
|
9
|
-
canonicalize
|
10
|
-
end
|
11
|
-
|
12
|
-
# Return a complete name, first name first, no comma.
|
13
|
-
def name
|
14
|
-
name = ''
|
15
|
-
name << @first
|
16
|
-
name << ' ' if @first.length > 0 && @last.length > 0
|
17
|
-
name << @last
|
18
|
-
name
|
19
|
-
end
|
20
|
-
|
21
|
-
# Return a reversed complete name, first name last after a comma.
|
22
|
-
def rname
|
23
|
-
name = ''
|
24
|
-
name << @last
|
25
|
-
name << ', ' if @first.length > 0 && @last.length > 0
|
26
|
-
name << @first
|
27
|
-
name
|
28
|
-
end
|
29
|
-
|
30
|
-
# Convert object to a string.
|
31
|
-
def to_s
|
32
|
-
rname
|
33
|
-
end
|
34
|
-
|
35
|
-
# Match another name to this object, returning true or false.
|
36
|
-
def match(name1='', name2='')
|
37
|
-
other = Name.new(name1, name2)
|
38
|
-
match_first(first, other.first) && match_last(last, other.last)
|
39
|
-
end
|
40
|
-
|
41
|
-
private
|
42
|
-
|
43
|
-
# Canonicalise the first and last names.
|
44
|
-
def canonicalize
|
45
|
-
first, last = partition
|
46
|
-
@first = finish_first(first)
|
47
|
-
@last = finish_last(last)
|
48
|
-
end
|
49
|
-
|
50
|
-
# Split one complete name into first and last parts.
|
51
|
-
def partition
|
52
|
-
if @name2.length == 0
|
53
|
-
# Only one imput so we must split first and last.
|
54
|
-
parts = @name1.split(/,/)
|
55
|
-
if parts.size > 1
|
56
|
-
last = clean(parts.shift || '')
|
57
|
-
first = clean(parts.join(' '))
|
58
|
-
else
|
59
|
-
parts = clean(@name1).split(/ /)
|
60
|
-
last = parts.pop || ''
|
61
|
-
first = parts.join(' ')
|
62
|
-
end
|
63
|
-
else
|
64
|
-
# Two inputs, so we are given first and last.
|
65
|
-
first = clean(@name1)
|
66
|
-
last = clean(@name2)
|
67
|
-
end
|
68
|
-
[first, last]
|
69
|
-
end
|
70
|
-
|
71
|
-
# Clean up characters in any name.
|
72
|
-
def clean(name)
|
73
|
-
name.gsub!(/`/, "'")
|
74
|
-
name.gsub!(/[^-a-zA-Z.'\s]/, '')
|
75
|
-
name.gsub!(/\./, ' ')
|
76
|
-
name.gsub!(/\s*-\s*/, '-')
|
77
|
-
name.gsub!(/'+/, "'")
|
78
|
-
name.strip.downcase.split(/\s+/).map do |n|
|
79
|
-
n.sub!(/^-+/, '')
|
80
|
-
n.sub!(/-+$/, '')
|
81
|
-
n.split(/-/).map do |p|
|
82
|
-
p.capitalize!
|
83
|
-
end.join('-')
|
84
|
-
end.join(' ')
|
85
|
-
end
|
86
|
-
|
87
|
-
# Apply final touches to finish canonicalising a first name.
|
88
|
-
def finish_first(names)
|
89
|
-
names.gsub(/([A-Z])\b/, '\1.')
|
90
|
-
end
|
91
|
-
|
92
|
-
# Apply final touches to finish canonicalising a last name.
|
93
|
-
def finish_last(names)
|
94
|
-
names.gsub!(/\b([A-Z])'([a-z])/) { |m| $1 << "'" << $2.upcase}
|
95
|
-
names.gsub!(/\bMc([a-z])/) { |m| 'Mc' << $1.upcase}
|
96
|
-
names.gsub!(/\bMac([a-z])/) do |m|
|
97
|
-
letter = $1
|
98
|
-
'Mac'.concat(@name2.match("[mM][aA][cC]#{letter}") ? letter : letter.upcase)
|
99
|
-
end
|
100
|
-
names.gsub!(/\bO ([A-Z])/) { |m| "O'" << $1 }
|
101
|
-
names
|
102
|
-
end
|
103
|
-
|
104
|
-
# Match a complete first name.
|
105
|
-
def match_first(first1, first2)
|
106
|
-
# Is this one a walk in the park?
|
107
|
-
return true if first1 == first2
|
108
|
-
|
109
|
-
# No easy ride. Begin by splitting into individual first names.
|
110
|
-
first1 = split_first(first1)
|
111
|
-
first2 = split_first(first2)
|
112
|
-
|
113
|
-
# Get the long list and the short list.
|
114
|
-
long, short = first1.size >= first2.size ? [first1, first2] : [first2, first1]
|
115
|
-
|
116
|
-
# The short one must be a "subset" of the long one.
|
117
|
-
# An extra condition must also be satisfied.
|
118
|
-
extra = false
|
119
|
-
(0..long.size-1).each do |i|
|
120
|
-
lword = long.shift
|
121
|
-
score = match_first_name(lword, short.first)
|
122
|
-
if score >= 0
|
123
|
-
short.shift
|
124
|
-
extra = true if i == 0 || score == 0
|
125
|
-
end
|
126
|
-
break if short.empty? || long.empty?
|
127
|
-
end
|
128
|
-
|
129
|
-
# There's a match if the following is true.
|
130
|
-
short.empty? && extra
|
131
|
-
end
|
132
|
-
|
133
|
-
# Match a complete last name.
|
134
|
-
def match_last(last1, last2)
|
135
|
-
return true if last1 == last2
|
136
|
-
[last1, last2].each do |last|
|
137
|
-
last.downcase! # MacDonaugh and Macdonaugh
|
138
|
-
last.gsub!(/\bmac/, 'mc') # MacDonaugh and McDonaugh
|
139
|
-
last.tr!('-', ' ') # Lowry-O'Reilly and Lowry O'Reilly
|
140
|
-
end
|
141
|
-
last1 == last2
|
142
|
-
end
|
143
|
-
|
144
|
-
# Split a complete first name for matching.
|
145
|
-
def split_first(first)
|
146
|
-
first.tr!('-', ' ') # J. K. and J.-K.
|
147
|
-
first = first.split(/ /) # split on spaces
|
148
|
-
first = [''] if first.size == 0 # in case input was empty string
|
149
|
-
first
|
150
|
-
end
|
151
|
-
|
152
|
-
# Match individual first names or initials.
|
153
|
-
# -1 = no match
|
154
|
-
# 0 = full match
|
155
|
-
# 1 = match involving 1 initial
|
156
|
-
# 2 = match involving 2 initials
|
157
|
-
def match_first_name(first1, first2)
|
158
|
-
initials = 0
|
159
|
-
initials+= 1 if first1.match(/^[A-Z]\.?$/)
|
160
|
-
initials+= 1 if first2.match(/^[A-Z]\.?$/)
|
161
|
-
return initials if first1 == first2
|
162
|
-
return 0 if initials == 0 && match_nick_name(first1, first2)
|
163
|
-
return -1 unless initials > 0
|
164
|
-
return initials if first1[0] == first2[0]
|
165
|
-
-1
|
166
|
-
end
|
167
|
-
|
168
|
-
# Match two first names that might be equivalent nicknames.
|
169
|
-
def match_nick_name(nick1, nick2)
|
170
|
-
compile_nick_names unless @@nc
|
171
|
-
code1 = @@nc[nick1]
|
172
|
-
return false unless code1
|
173
|
-
code1 == @@nc[nick2]
|
174
|
-
end
|
175
|
-
|
176
|
-
# Compile the nick names code hash when matching nick names is first attempted.
|
177
|
-
def compile_nick_names
|
178
|
-
@@nc = Hash.new
|
179
|
-
code = 1
|
180
|
-
@@nl.each do |nicks|
|
181
|
-
nicks.each do |n|
|
182
|
-
throw "duplicate name #{n}" if @@nc[n]
|
183
|
-
@@nc[n] = code
|
184
|
-
end
|
185
|
-
code+= 1
|
186
|
-
end
|
187
|
-
end
|
188
|
-
|
189
|
-
# A array of data for matching nicknames and also a few common misspellings.
|
190
|
-
@@nc = nil
|
191
|
-
@@nl = <<EOF.split(/\n/).reject{|x| x.length == 0 }.map{|x| x.split(' ')}
|
192
|
-
Abdul Abul
|
193
|
-
Alexander Alex
|
194
|
-
Anandagopal Ananda
|
195
|
-
Anne Ann
|
196
|
-
Anthony Tony
|
197
|
-
Benjamin Ben
|
198
|
-
Catherine Cathy Cath
|
199
|
-
Daniel Danial Danny Dan
|
200
|
-
David Dave
|
201
|
-
Deborah Debbie
|
202
|
-
Des Desmond
|
203
|
-
Eamonn Eamon
|
204
|
-
Edward Eddie Ed
|
205
|
-
Eric Erick Erik
|
206
|
-
Frederick Frederic Fred
|
207
|
-
Gerald Gerry
|
208
|
-
Gerhard Gerard Ger
|
209
|
-
James Jim
|
210
|
-
Joanna Joan Joanne
|
211
|
-
John Johnny
|
212
|
-
Jonathan Jon
|
213
|
-
Kenneth Ken Kenny
|
214
|
-
Michael Mike Mick Micky
|
215
|
-
Nicholas Nick Nicolas
|
216
|
-
Nicola Nickie Nicky
|
217
|
-
Patrick Pat Paddy
|
218
|
-
Peter Pete
|
219
|
-
Philippe Philip Phillippe Phillip
|
220
|
-
Rick Ricky
|
221
|
-
Robert Bob Bobby
|
222
|
-
Samual Sam Samuel
|
223
|
-
Stefanie Stef
|
224
|
-
Stephen Steven Steve
|
225
|
-
Terence Terry
|
226
|
-
Thomas Tom Tommy
|
227
|
-
William Will Willy Willie Bill
|
228
|
-
EOF
|
229
|
-
end
|
230
|
-
end
|
1
|
+
require 'icu_name/name.rb'
|
2
|
+
require 'icu_name/util.rb'
|
@@ -1,3 +1,4 @@
|
|
1
|
+
# encoding: UTF-8
|
1
2
|
require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
|
2
3
|
|
3
4
|
module ICU
|
@@ -6,68 +7,68 @@ module ICU
|
|
6
7
|
before(:each) do
|
7
8
|
@simple = Name.new('mark j l', 'orr')
|
8
9
|
end
|
9
|
-
|
10
|
+
|
10
11
|
it "#first returns the first name(s)" do
|
11
12
|
@simple.first.should == 'Mark J. L.'
|
12
13
|
end
|
13
|
-
|
14
|
+
|
14
15
|
it "#last returns the last name(s)" do
|
15
16
|
@simple.last.should == 'Orr'
|
16
17
|
end
|
17
|
-
|
18
|
+
|
18
19
|
it "#name returns the full name with first name(s) first" do
|
19
20
|
@simple.name.should == 'Mark J. L. Orr'
|
20
21
|
end
|
21
|
-
|
22
|
+
|
22
23
|
it "#rname returns the full name with last name(s) first" do
|
23
24
|
@simple.rname.should == 'Orr, Mark J. L.'
|
24
25
|
end
|
25
|
-
|
26
|
+
|
26
27
|
it "#to_s is the same as rname" do
|
27
28
|
@simple.to_s.should == 'Orr, Mark J. L.'
|
28
29
|
end
|
29
|
-
|
30
|
+
|
30
31
|
it "#match returns true if and only if two names match" do
|
31
32
|
@simple.match('mark j l orr').should be_true
|
32
33
|
@simple.match('malcolm g l orr').should be_false
|
33
34
|
end
|
34
35
|
end
|
35
|
-
|
36
|
+
|
36
37
|
context "rdoc expample" do
|
37
38
|
before(:each) do
|
38
39
|
@robert = Name.new(' robert j ', ' FISCHER ')
|
39
40
|
@bobby = Name.new(' bobby fischer ')
|
40
41
|
end
|
41
|
-
|
42
|
+
|
42
43
|
it "should get Robert" do
|
43
44
|
@robert.name.should == 'Robert J. Fischer'
|
44
45
|
end
|
45
|
-
|
46
|
+
|
46
47
|
it "should get Bobby" do
|
47
48
|
@bobby.last.should == 'Fischer'
|
48
49
|
@bobby.first.should == 'Bobby'
|
49
50
|
end
|
50
|
-
|
51
|
+
|
51
52
|
it "should match Robert and Bobby" do
|
52
53
|
@robert.match(@bobby).should be_true
|
53
54
|
@robert.match('R. J.', 'Fischer').should be_true
|
54
55
|
@bobby.match('R. J.', 'Fischer').should be_false
|
55
56
|
end
|
56
|
-
|
57
|
+
|
57
58
|
it "should canconicalise last names" do
|
58
59
|
Name.new('John', 'O Reilly').last.should == "O'Reilly"
|
59
60
|
Name.new('dave', 'mcmanus').last.should == "McManus"
|
60
61
|
Name.new('pete', 'MACMANUS').last.should == "MacManus"
|
61
62
|
end
|
62
63
|
end
|
63
|
-
|
64
|
+
|
64
65
|
context "names that are already canonical" do
|
65
66
|
it "should not be altered" do
|
66
67
|
Name.new('Mark J. L.', 'Orr').name.should == 'Mark J. L. Orr'
|
67
68
|
Name.new('Anna-Marie J.-K.', 'Liviu-Dieter').name.should == 'Anna-Marie J.-K. Liviu-Dieter'
|
68
69
|
end
|
69
70
|
end
|
70
|
-
|
71
|
+
|
71
72
|
context "last names beginning with a single letter followed by a quote" do
|
72
73
|
it "should be handled correctly" do
|
73
74
|
Name.new('una', "O'boyle").name.should == "Una O'Boyle"
|
@@ -76,7 +77,7 @@ module ICU
|
|
76
77
|
Name.new('cormac', "o brien").name.should == "Cormac O'Brien"
|
77
78
|
end
|
78
79
|
end
|
79
|
-
|
80
|
+
|
80
81
|
context "last beginning with Mc" do
|
81
82
|
it "should be handled correctly" do
|
82
83
|
Name.new('shane', "mccabe").name.should == "Shane McCabe"
|
@@ -85,7 +86,7 @@ module ICU
|
|
85
86
|
Name.new('bartlomiej', "macieja").name.should == "Bartlomiej Macieja"
|
86
87
|
end
|
87
88
|
end
|
88
|
-
|
89
|
+
|
89
90
|
context "doubled barrelled names or initials" do
|
90
91
|
it "should be handled correctly" do
|
91
92
|
Name.new('anna-marie', 'den-otter').name.should == 'Anna-Marie Den-Otter'
|
@@ -95,26 +96,26 @@ module ICU
|
|
95
96
|
Name.new('hannah', "lowry - o reilly").name.should == "Hannah Lowry-O'Reilly"
|
96
97
|
end
|
97
98
|
end
|
98
|
-
|
99
|
+
|
99
100
|
context "extraneous white space" do
|
100
101
|
it "should be handled correctly" do
|
101
102
|
Name.new(' mark j l ', " \t\r\n orr \n").name.should == 'Mark J. L. Orr'
|
102
103
|
end
|
103
104
|
end
|
104
|
-
|
105
|
+
|
105
106
|
context "extraneous full stops" do
|
106
107
|
it "should be handled correctly" do
|
107
108
|
Name.new('. mark j..l', 'orr.').name.should == 'Mark J. L. Orr'
|
108
109
|
end
|
109
110
|
end
|
110
|
-
|
111
|
+
|
111
112
|
context "construction from a single string" do
|
112
113
|
before(:each) do
|
113
114
|
@mark1 = Name.new('ORR, mark j l')
|
114
115
|
@mark2 = Name.new('MARK J L ORR')
|
115
116
|
@oreil = Name.new("O'Reilly, j-k")
|
116
117
|
end
|
117
|
-
|
118
|
+
|
118
119
|
it "should be possible in simple cases" do
|
119
120
|
@mark1.first.should == 'Mark J. L.'
|
120
121
|
@mark1.last.should == 'Orr'
|
@@ -123,13 +124,13 @@ module ICU
|
|
123
124
|
@oreil.name.should == "J.-K. O'Reilly"
|
124
125
|
end
|
125
126
|
end
|
126
|
-
|
127
|
+
|
127
128
|
context "construction from an instance" do
|
128
129
|
it "should be possible" do
|
129
130
|
Name.new(Name.new('ORR, mark j l')).name.should == 'Mark J. L. Orr'
|
130
131
|
end
|
131
132
|
end
|
132
|
-
|
133
|
+
|
133
134
|
context "constuction corner cases" do
|
134
135
|
it "should be handled correctly" do
|
135
136
|
Name.new('Orr').name.should == 'Orr'
|
@@ -140,13 +141,13 @@ module ICU
|
|
140
141
|
Name.new.rname.should == ''
|
141
142
|
end
|
142
143
|
end
|
143
|
-
|
144
|
+
|
144
145
|
context "inputs to matching" do
|
145
146
|
before(:all) do
|
146
147
|
@mark = Name.new('Mark', 'Orr')
|
147
148
|
@kram = Name.new('Mark', 'Orr')
|
148
149
|
end
|
149
|
-
|
150
|
+
|
150
151
|
it "should be flexible" do
|
151
152
|
@mark.match('Mark', 'Orr').should be_true
|
152
153
|
@mark.match('Mark Orr').should be_true
|
@@ -159,12 +160,12 @@ module ICU
|
|
159
160
|
it "should match when first names are the same" do
|
160
161
|
Name.new('Mark', 'Orr').match('Mark', 'Orr').should be_true
|
161
162
|
end
|
162
|
-
|
163
|
+
|
163
164
|
it "should be flexible with regards to hyphens in double barrelled names" do
|
164
165
|
Name.new('J.-K.', 'Rowling').match('J. K.', 'Rowling').should be_true
|
165
166
|
Name.new('Joanne-K.', 'Rowling').match('Joanne K.', 'Rowling').should be_true
|
166
167
|
end
|
167
|
-
|
168
|
+
|
168
169
|
it "should match initials" do
|
169
170
|
Name.new('M. J. L.', 'Orr').match('Mark John Legard', 'Orr').should be_true
|
170
171
|
Name.new('M.', 'Orr').match('Mark', 'Orr').should be_true
|
@@ -172,37 +173,49 @@ module ICU
|
|
172
173
|
Name.new('M.', 'Orr').match('M. J.', 'Orr').should be_true
|
173
174
|
Name.new('M. J. L.', 'Orr').match('M. G.', 'Orr').should be_false
|
174
175
|
end
|
175
|
-
|
176
|
+
|
176
177
|
it "should not match on full names not in first position or without an exact match" do
|
177
178
|
Name.new('J. M.', 'Orr').match('John', 'Orr').should be_true
|
178
179
|
Name.new('M. J.', 'Orr').match('John', 'Orr').should be_false
|
179
180
|
Name.new('M. John', 'Orr').match('John', 'Orr').should be_true
|
180
181
|
end
|
181
|
-
|
182
|
+
|
182
183
|
it "should handle common nicknames" do
|
183
184
|
Name.new('William', 'Orr').match('Bill', 'Orr').should be_true
|
184
185
|
Name.new('David', 'Orr').match('Dave', 'Orr').should be_true
|
185
186
|
Name.new('Mick', 'Orr').match('Mike', 'Orr').should be_true
|
186
187
|
end
|
187
|
-
|
188
|
+
|
188
189
|
it "should not mix up nick names" do
|
189
190
|
Name.new('David', 'Orr').match('Bill', 'Orr').should be_false
|
190
191
|
end
|
191
192
|
end
|
192
|
-
|
193
|
+
|
193
194
|
context "last name matches" do
|
194
195
|
it "should be flexible with regards to hyphens in double barrelled names" do
|
195
196
|
Name.new('Johanna', "Lowry-O'Reilly").match('Johanna', "Lowry O'Reilly").should be_true
|
196
197
|
end
|
197
|
-
|
198
|
+
|
198
199
|
it "should be case insensitive in matches involving Macsomething and MacSomething" do
|
199
200
|
Name.new('Alan', 'MacDonagh').match('Alan', 'Macdonagh').should be_true
|
200
201
|
end
|
201
|
-
|
202
|
+
|
202
203
|
it "should cater for the common mispelling of names beginning with Mc or Mac" do
|
203
204
|
Name.new('Alan', 'McDonagh').match('Alan', 'MacDonagh').should be_true
|
204
205
|
Name.new('Darko', 'Polimac').match('Darko', 'Polimc').should be_false
|
205
206
|
end
|
206
207
|
end
|
208
|
+
|
209
|
+
context "accented characters" do
|
210
|
+
before(:each) do
|
211
|
+
@first = 'Gearóidín'
|
212
|
+
@last = 'Uí Laighléis'
|
213
|
+
end
|
214
|
+
|
215
|
+
it "should not yet deal with UTF-8" do
|
216
|
+
name = Name.new(@first, @last)
|
217
|
+
name.first.should_not == @first
|
218
|
+
end
|
219
|
+
end
|
207
220
|
end
|
208
221
|
end
|
data/spec/util_spec.rb
ADDED
@@ -0,0 +1,37 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
|
3
|
+
|
4
|
+
module ICU
|
5
|
+
describe Util do
|
6
|
+
context "#is_utf8" do
|
7
|
+
it "should recognise US-ASCII as a special case of UTF-8" do
|
8
|
+
Util.is_utf8("Resume".encode("US-ASCII")).should be_true
|
9
|
+
end
|
10
|
+
|
11
|
+
it "should recognise UTF-8" do
|
12
|
+
Util.is_utf8("Résumé").should be_true
|
13
|
+
Util.is_utf8("δog").should be_true
|
14
|
+
end
|
15
|
+
|
16
|
+
it "should recognize other encodings as not being UTF-8" do
|
17
|
+
Util.is_utf8("Résumé".encode("ISO-8859-1")).should be_false
|
18
|
+
Util.is_utf8("€50".encode("Windows-1252")).should be_false
|
19
|
+
Util.is_utf8("ひらがな".encode("Shift_JIS")).should be_false
|
20
|
+
Util.is_utf8("\xa3").should be_false
|
21
|
+
end
|
22
|
+
end
|
23
|
+
|
24
|
+
context "#to_utf8" do
|
25
|
+
it "should convert to UTF-8" do
|
26
|
+
Util.to_utf8("Resume").should == "Resume"
|
27
|
+
Util.to_utf8("Resume".force_encoding("US-ASCII")).encoding.name.should == "UTF-8"
|
28
|
+
Util.to_utf8("Résumé".encode("ISO-8859-1")).should == "Résumé"
|
29
|
+
Util.to_utf8("Résumé".encode("Windows-1252")).should == "Résumé"
|
30
|
+
Util.to_utf8("€50".encode("Windows-1252")).should == "€50"
|
31
|
+
Util.to_utf8("\xa350".force_encoding("ASCII-8BIT")).should == "£50"
|
32
|
+
Util.to_utf8("\xa350").should == "£50"
|
33
|
+
Util.to_utf8("ひらがな".encode("Shift_JIS")).should == "ひらがな"
|
34
|
+
end
|
35
|
+
end
|
36
|
+
end
|
37
|
+
end
|
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 0
|
8
|
-
-
|
9
|
-
version: 0.0.
|
8
|
+
- 4
|
9
|
+
version: 0.0.4
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Mark Orr
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2011-01-
|
17
|
+
date: 2011-01-21 00:00:00 +00:00
|
18
18
|
default_executable:
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
@@ -55,10 +55,13 @@ extra_rdoc_files:
|
|
55
55
|
- LICENCE
|
56
56
|
- README.rdoc
|
57
57
|
files:
|
58
|
+
- lib/icu_name/name.rb
|
59
|
+
- lib/icu_name/util.rb
|
58
60
|
- lib/icu_name/version.rb
|
59
61
|
- lib/icu_name.rb
|
60
|
-
- spec/
|
62
|
+
- spec/name_spec.rb
|
61
63
|
- spec/spec_helper.rb
|
64
|
+
- spec/util_spec.rb
|
62
65
|
- LICENCE
|
63
66
|
- README.rdoc
|
64
67
|
has_rdoc: true
|