icu_name 0.0.3 → 0.0.4
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/icu_name/name.rb +230 -0
- data/lib/icu_name/util.rb +19 -0
- data/lib/icu_name/version.rb +1 -1
- data/lib/icu_name.rb +2 -230
- data/spec/{icu_name_spec.rb → name_spec.rb} +44 -31
- data/spec/util_spec.rb +37 -0
- metadata +7 -4
@@ -0,0 +1,230 @@
|
|
1
|
+
module ICU
|
2
|
+
class Name
|
3
|
+
attr_reader :first, :last
|
4
|
+
|
5
|
+
# Construct from one or two strings or any objects that have a to_s method.
|
6
|
+
def initialize(name1='', name2='')
|
7
|
+
@name1 = name1.to_s.dup
|
8
|
+
@name2 = name2.to_s.dup
|
9
|
+
canonicalize
|
10
|
+
end
|
11
|
+
|
12
|
+
# Return a complete name, first name first, no comma.
|
13
|
+
def name
|
14
|
+
name = ''
|
15
|
+
name << @first
|
16
|
+
name << ' ' if @first.length > 0 && @last.length > 0
|
17
|
+
name << @last
|
18
|
+
name
|
19
|
+
end
|
20
|
+
|
21
|
+
# Return a reversed complete name, first name last after a comma.
|
22
|
+
def rname
|
23
|
+
name = ''
|
24
|
+
name << @last
|
25
|
+
name << ', ' if @first.length > 0 && @last.length > 0
|
26
|
+
name << @first
|
27
|
+
name
|
28
|
+
end
|
29
|
+
|
30
|
+
# Convert object to a string.
|
31
|
+
def to_s
|
32
|
+
rname
|
33
|
+
end
|
34
|
+
|
35
|
+
# Match another name to this object, returning true or false.
|
36
|
+
def match(name1='', name2='')
|
37
|
+
other = Name.new(name1, name2)
|
38
|
+
match_first(first, other.first) && match_last(last, other.last)
|
39
|
+
end
|
40
|
+
|
41
|
+
private
|
42
|
+
|
43
|
+
# Canonicalise the first and last names.
|
44
|
+
def canonicalize
|
45
|
+
first, last = partition
|
46
|
+
@first = finish_first(first)
|
47
|
+
@last = finish_last(last)
|
48
|
+
end
|
49
|
+
|
50
|
+
# Split one complete name into first and last parts.
|
51
|
+
def partition
|
52
|
+
if @name2.length == 0
|
53
|
+
# Only one input so we must split first and last.
|
54
|
+
parts = @name1.split(/,/)
|
55
|
+
if parts.size > 1
|
56
|
+
last = clean(parts.shift || '')
|
57
|
+
first = clean(parts.join(' '))
|
58
|
+
else
|
59
|
+
parts = clean(@name1).split(/ /)
|
60
|
+
last = parts.pop || ''
|
61
|
+
first = parts.join(' ')
|
62
|
+
end
|
63
|
+
else
|
64
|
+
# Two inputs, so we are given first and last.
|
65
|
+
first = clean(@name1)
|
66
|
+
last = clean(@name2)
|
67
|
+
end
|
68
|
+
[first, last]
|
69
|
+
end
|
70
|
+
|
71
|
+
# Clean up characters in any name.
|
72
|
+
def clean(name)
|
73
|
+
name.gsub!(/`/, "'")
|
74
|
+
name.gsub!(/[^-a-zA-Z.'\s]/, '')
|
75
|
+
name.gsub!(/\./, ' ')
|
76
|
+
name.gsub!(/\s*-\s*/, '-')
|
77
|
+
name.gsub!(/'+/, "'")
|
78
|
+
name.strip.downcase.split(/\s+/).map do |n|
|
79
|
+
n.sub!(/^-+/, '')
|
80
|
+
n.sub!(/-+$/, '')
|
81
|
+
n.split(/-/).map do |p|
|
82
|
+
p.capitalize!
|
83
|
+
end.join('-')
|
84
|
+
end.join(' ')
|
85
|
+
end
|
86
|
+
|
87
|
+
# Apply final touches to finish canonicalising a first name.
|
88
|
+
def finish_first(names)
|
89
|
+
names.gsub(/([A-Z])\b/, '\1.')
|
90
|
+
end
|
91
|
+
|
92
|
+
# Apply final touches to finish canonicalising a last name.
|
93
|
+
def finish_last(names)
|
94
|
+
names.gsub!(/\b([A-Z])'([a-z])/) { |m| $1 << "'" << $2.upcase}
|
95
|
+
names.gsub!(/\bMc([a-z])/) { |m| 'Mc' << $1.upcase}
|
96
|
+
names.gsub!(/\bMac([a-z])/) do |m|
|
97
|
+
letter = $1
|
98
|
+
'Mac'.concat(@name2.match("[mM][aA][cC]#{letter}") ? letter : letter.upcase)
|
99
|
+
end
|
100
|
+
names.gsub!(/\bO ([A-Z])/) { |m| "O'" << $1 }
|
101
|
+
names
|
102
|
+
end
|
103
|
+
|
104
|
+
# Match a complete first name.
|
105
|
+
def match_first(first1, first2)
|
106
|
+
# Is this one a walk in the park?
|
107
|
+
return true if first1 == first2
|
108
|
+
|
109
|
+
# No easy ride. Begin by splitting into individual first names.
|
110
|
+
first1 = split_first(first1)
|
111
|
+
first2 = split_first(first2)
|
112
|
+
|
113
|
+
# Get the long list and the short list.
|
114
|
+
long, short = first1.size >= first2.size ? [first1, first2] : [first2, first1]
|
115
|
+
|
116
|
+
# The short one must be a "subset" of the long one.
|
117
|
+
# An extra condition must also be satisfied.
|
118
|
+
extra = false
|
119
|
+
(0..long.size-1).each do |i|
|
120
|
+
lword = long.shift
|
121
|
+
score = match_first_name(lword, short.first)
|
122
|
+
if score >= 0
|
123
|
+
short.shift
|
124
|
+
extra = true if i == 0 || score == 0
|
125
|
+
end
|
126
|
+
break if short.empty? || long.empty?
|
127
|
+
end
|
128
|
+
|
129
|
+
# There's a match if the following is true.
|
130
|
+
short.empty? && extra
|
131
|
+
end
|
132
|
+
|
133
|
+
# Match a complete last name.
|
134
|
+
def match_last(last1, last2)
|
135
|
+
return true if last1 == last2
|
136
|
+
[last1, last2].each do |last|
|
137
|
+
last.downcase! # MacDonaugh and Macdonaugh
|
138
|
+
last.gsub!(/\bmac/, 'mc') # MacDonaugh and McDonaugh
|
139
|
+
last.tr!('-', ' ') # Lowry-O'Reilly and Lowry O'Reilly
|
140
|
+
end
|
141
|
+
last1 == last2
|
142
|
+
end
|
143
|
+
|
144
|
+
# Split a complete first name for matching.
|
145
|
+
def split_first(first)
|
146
|
+
first.tr!('-', ' ') # J. K. and J.-K.
|
147
|
+
first = first.split(/ /) # split on spaces
|
148
|
+
first = [''] if first.size == 0 # in case input was empty string
|
149
|
+
first
|
150
|
+
end
|
151
|
+
|
152
|
+
# Match individual first names or initials.
|
153
|
+
# -1 = no match
|
154
|
+
# 0 = full match
|
155
|
+
# 1 = match involving 1 initial
|
156
|
+
# 2 = match involving 2 initials
|
157
|
+
def match_first_name(first1, first2)
|
158
|
+
initials = 0
|
159
|
+
initials+= 1 if first1.match(/^[A-Z]\.?$/)
|
160
|
+
initials+= 1 if first2.match(/^[A-Z]\.?$/)
|
161
|
+
return initials if first1 == first2
|
162
|
+
return 0 if initials == 0 && match_nick_name(first1, first2)
|
163
|
+
return -1 unless initials > 0
|
164
|
+
return initials if first1[0] == first2[0]
|
165
|
+
-1
|
166
|
+
end
|
167
|
+
|
168
|
+
# Match two first names that might be equivalent nicknames.
|
169
|
+
def match_nick_name(nick1, nick2)
|
170
|
+
compile_nick_names unless @@nc
|
171
|
+
code1 = @@nc[nick1]
|
172
|
+
return false unless code1
|
173
|
+
code1 == @@nc[nick2]
|
174
|
+
end
|
175
|
+
|
176
|
+
# Compile the nick names code hash when matching nick names is first attempted.
|
177
|
+
def compile_nick_names
|
178
|
+
@@nc = Hash.new
|
179
|
+
code = 1
|
180
|
+
@@nl.each do |nicks|
|
181
|
+
nicks.each do |n|
|
182
|
+
throw "duplicate name #{n}" if @@nc[n]
|
183
|
+
@@nc[n] = code
|
184
|
+
end
|
185
|
+
code+= 1
|
186
|
+
end
|
187
|
+
end
|
188
|
+
|
189
|
+
# An array of data for matching nicknames and also a few common misspellings.
|
190
|
+
@@nc = nil
|
191
|
+
@@nl = <<EOF.split(/\n/).reject{|x| x.length == 0 }.map{|x| x.split(' ')}
|
192
|
+
Abdul Abul
|
193
|
+
Alexander Alex
|
194
|
+
Anandagopal Ananda
|
195
|
+
Anne Ann
|
196
|
+
Anthony Tony
|
197
|
+
Benjamin Ben
|
198
|
+
Catherine Cathy Cath
|
199
|
+
Daniel Danial Danny Dan
|
200
|
+
David Dave
|
201
|
+
Deborah Debbie
|
202
|
+
Des Desmond
|
203
|
+
Eamonn Eamon
|
204
|
+
Edward Eddie Ed
|
205
|
+
Eric Erick Erik
|
206
|
+
Frederick Frederic Fred
|
207
|
+
Gerald Gerry
|
208
|
+
Gerhard Gerard Ger
|
209
|
+
James Jim
|
210
|
+
Joanna Joan Joanne
|
211
|
+
John Johnny
|
212
|
+
Jonathan Jon
|
213
|
+
Kenneth Ken Kenny
|
214
|
+
Michael Mike Mick Micky
|
215
|
+
Nicholas Nick Nicolas
|
216
|
+
Nicola Nickie Nicky
|
217
|
+
Patrick Pat Paddy
|
218
|
+
Peter Pete
|
219
|
+
Philippe Philip Phillippe Phillip
|
220
|
+
Rick Ricky
|
221
|
+
Robert Bob Bobby
|
222
|
+
Samual Sam Samuel
|
223
|
+
Stefanie Stef
|
224
|
+
Stephen Steven Steve
|
225
|
+
Terence Terry
|
226
|
+
Thomas Tom Tommy
|
227
|
+
William Will Willy Willie Bill
|
228
|
+
EOF
|
229
|
+
end
|
230
|
+
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
module ICU
|
2
|
+
class Util
|
3
|
+
# Decide if a string is valid UTF-8 or not, returning true or false.
|
4
|
+
def self.is_utf8(str)
|
5
|
+
dup = str.dup
|
6
|
+
dup.force_encoding("UTF-8")
|
7
|
+
dup.valid_encoding?
|
8
|
+
end
|
9
|
+
|
10
|
+
# Try to convert any string to UTF-8.
|
11
|
+
def self.to_utf8(str)
|
12
|
+
utf8 = is_utf8(str)
|
13
|
+
dup = str.dup
|
14
|
+
return dup.force_encoding("UTF-8") if utf8
|
15
|
+
dup.force_encoding("Windows-1252") if dup.encoding.name.match(/^(ASCII-8BIT|UTF-8)$/)
|
16
|
+
dup.encode("UTF-8")
|
17
|
+
end
|
18
|
+
end
|
19
|
+
end
|
data/lib/icu_name/version.rb
CHANGED
data/lib/icu_name.rb
CHANGED
@@ -1,230 +1,2 @@
|
|
1
|
-
module ICU
|
2
|
-
class Name
|
3
|
-
attr_reader :first, :last
|
4
|
-
|
5
|
-
# Construct from one or two strings or any objects that have a to_s method.
|
6
|
-
def initialize(name1='', name2='')
|
7
|
-
@name1 = name1.to_s
|
8
|
-
@name2 = name2.to_s
|
9
|
-
canonicalize
|
10
|
-
end
|
11
|
-
|
12
|
-
# Return a complete name, first name first, no comma.
|
13
|
-
def name
|
14
|
-
name = ''
|
15
|
-
name << @first
|
16
|
-
name << ' ' if @first.length > 0 && @last.length > 0
|
17
|
-
name << @last
|
18
|
-
name
|
19
|
-
end
|
20
|
-
|
21
|
-
# Return a reversed complete name, first name last after a comma.
|
22
|
-
def rname
|
23
|
-
name = ''
|
24
|
-
name << @last
|
25
|
-
name << ', ' if @first.length > 0 && @last.length > 0
|
26
|
-
name << @first
|
27
|
-
name
|
28
|
-
end
|
29
|
-
|
30
|
-
# Convert object to a string.
|
31
|
-
def to_s
|
32
|
-
rname
|
33
|
-
end
|
34
|
-
|
35
|
-
# Match another name to this object, returning true or false.
|
36
|
-
def match(name1='', name2='')
|
37
|
-
other = Name.new(name1, name2)
|
38
|
-
match_first(first, other.first) && match_last(last, other.last)
|
39
|
-
end
|
40
|
-
|
41
|
-
private
|
42
|
-
|
43
|
-
# Canonicalise the first and last names.
|
44
|
-
def canonicalize
|
45
|
-
first, last = partition
|
46
|
-
@first = finish_first(first)
|
47
|
-
@last = finish_last(last)
|
48
|
-
end
|
49
|
-
|
50
|
-
# Split one complete name into first and last parts.
|
51
|
-
def partition
|
52
|
-
if @name2.length == 0
|
53
|
-
# Only one imput so we must split first and last.
|
54
|
-
parts = @name1.split(/,/)
|
55
|
-
if parts.size > 1
|
56
|
-
last = clean(parts.shift || '')
|
57
|
-
first = clean(parts.join(' '))
|
58
|
-
else
|
59
|
-
parts = clean(@name1).split(/ /)
|
60
|
-
last = parts.pop || ''
|
61
|
-
first = parts.join(' ')
|
62
|
-
end
|
63
|
-
else
|
64
|
-
# Two inputs, so we are given first and last.
|
65
|
-
first = clean(@name1)
|
66
|
-
last = clean(@name2)
|
67
|
-
end
|
68
|
-
[first, last]
|
69
|
-
end
|
70
|
-
|
71
|
-
# Clean up characters in any name.
|
72
|
-
def clean(name)
|
73
|
-
name.gsub!(/`/, "'")
|
74
|
-
name.gsub!(/[^-a-zA-Z.'\s]/, '')
|
75
|
-
name.gsub!(/\./, ' ')
|
76
|
-
name.gsub!(/\s*-\s*/, '-')
|
77
|
-
name.gsub!(/'+/, "'")
|
78
|
-
name.strip.downcase.split(/\s+/).map do |n|
|
79
|
-
n.sub!(/^-+/, '')
|
80
|
-
n.sub!(/-+$/, '')
|
81
|
-
n.split(/-/).map do |p|
|
82
|
-
p.capitalize!
|
83
|
-
end.join('-')
|
84
|
-
end.join(' ')
|
85
|
-
end
|
86
|
-
|
87
|
-
# Apply final touches to finish canonicalising a first name.
|
88
|
-
def finish_first(names)
|
89
|
-
names.gsub(/([A-Z])\b/, '\1.')
|
90
|
-
end
|
91
|
-
|
92
|
-
# Apply final touches to finish canonicalising a last name.
|
93
|
-
def finish_last(names)
|
94
|
-
names.gsub!(/\b([A-Z])'([a-z])/) { |m| $1 << "'" << $2.upcase}
|
95
|
-
names.gsub!(/\bMc([a-z])/) { |m| 'Mc' << $1.upcase}
|
96
|
-
names.gsub!(/\bMac([a-z])/) do |m|
|
97
|
-
letter = $1
|
98
|
-
'Mac'.concat(@name2.match("[mM][aA][cC]#{letter}") ? letter : letter.upcase)
|
99
|
-
end
|
100
|
-
names.gsub!(/\bO ([A-Z])/) { |m| "O'" << $1 }
|
101
|
-
names
|
102
|
-
end
|
103
|
-
|
104
|
-
# Match a complete first name.
|
105
|
-
def match_first(first1, first2)
|
106
|
-
# Is this one a walk in the park?
|
107
|
-
return true if first1 == first2
|
108
|
-
|
109
|
-
# No easy ride. Begin by splitting into individual first names.
|
110
|
-
first1 = split_first(first1)
|
111
|
-
first2 = split_first(first2)
|
112
|
-
|
113
|
-
# Get the long list and the short list.
|
114
|
-
long, short = first1.size >= first2.size ? [first1, first2] : [first2, first1]
|
115
|
-
|
116
|
-
# The short one must be a "subset" of the long one.
|
117
|
-
# An extra condition must also be satisfied.
|
118
|
-
extra = false
|
119
|
-
(0..long.size-1).each do |i|
|
120
|
-
lword = long.shift
|
121
|
-
score = match_first_name(lword, short.first)
|
122
|
-
if score >= 0
|
123
|
-
short.shift
|
124
|
-
extra = true if i == 0 || score == 0
|
125
|
-
end
|
126
|
-
break if short.empty? || long.empty?
|
127
|
-
end
|
128
|
-
|
129
|
-
# There's a match if the following is true.
|
130
|
-
short.empty? && extra
|
131
|
-
end
|
132
|
-
|
133
|
-
# Match a complete last name.
|
134
|
-
def match_last(last1, last2)
|
135
|
-
return true if last1 == last2
|
136
|
-
[last1, last2].each do |last|
|
137
|
-
last.downcase! # MacDonaugh and Macdonaugh
|
138
|
-
last.gsub!(/\bmac/, 'mc') # MacDonaugh and McDonaugh
|
139
|
-
last.tr!('-', ' ') # Lowry-O'Reilly and Lowry O'Reilly
|
140
|
-
end
|
141
|
-
last1 == last2
|
142
|
-
end
|
143
|
-
|
144
|
-
# Split a complete first name for matching.
|
145
|
-
def split_first(first)
|
146
|
-
first.tr!('-', ' ') # J. K. and J.-K.
|
147
|
-
first = first.split(/ /) # split on spaces
|
148
|
-
first = [''] if first.size == 0 # in case input was empty string
|
149
|
-
first
|
150
|
-
end
|
151
|
-
|
152
|
-
# Match individual first names or initials.
|
153
|
-
# -1 = no match
|
154
|
-
# 0 = full match
|
155
|
-
# 1 = match involving 1 initial
|
156
|
-
# 2 = match involving 2 initials
|
157
|
-
def match_first_name(first1, first2)
|
158
|
-
initials = 0
|
159
|
-
initials+= 1 if first1.match(/^[A-Z]\.?$/)
|
160
|
-
initials+= 1 if first2.match(/^[A-Z]\.?$/)
|
161
|
-
return initials if first1 == first2
|
162
|
-
return 0 if initials == 0 && match_nick_name(first1, first2)
|
163
|
-
return -1 unless initials > 0
|
164
|
-
return initials if first1[0] == first2[0]
|
165
|
-
-1
|
166
|
-
end
|
167
|
-
|
168
|
-
# Match two first names that might be equivalent nicknames.
|
169
|
-
def match_nick_name(nick1, nick2)
|
170
|
-
compile_nick_names unless @@nc
|
171
|
-
code1 = @@nc[nick1]
|
172
|
-
return false unless code1
|
173
|
-
code1 == @@nc[nick2]
|
174
|
-
end
|
175
|
-
|
176
|
-
# Compile the nick names code hash when matching nick names is first attempted.
|
177
|
-
def compile_nick_names
|
178
|
-
@@nc = Hash.new
|
179
|
-
code = 1
|
180
|
-
@@nl.each do |nicks|
|
181
|
-
nicks.each do |n|
|
182
|
-
throw "duplicate name #{n}" if @@nc[n]
|
183
|
-
@@nc[n] = code
|
184
|
-
end
|
185
|
-
code+= 1
|
186
|
-
end
|
187
|
-
end
|
188
|
-
|
189
|
-
# A array of data for matching nicknames and also a few common misspellings.
|
190
|
-
@@nc = nil
|
191
|
-
@@nl = <<EOF.split(/\n/).reject{|x| x.length == 0 }.map{|x| x.split(' ')}
|
192
|
-
Abdul Abul
|
193
|
-
Alexander Alex
|
194
|
-
Anandagopal Ananda
|
195
|
-
Anne Ann
|
196
|
-
Anthony Tony
|
197
|
-
Benjamin Ben
|
198
|
-
Catherine Cathy Cath
|
199
|
-
Daniel Danial Danny Dan
|
200
|
-
David Dave
|
201
|
-
Deborah Debbie
|
202
|
-
Des Desmond
|
203
|
-
Eamonn Eamon
|
204
|
-
Edward Eddie Ed
|
205
|
-
Eric Erick Erik
|
206
|
-
Frederick Frederic Fred
|
207
|
-
Gerald Gerry
|
208
|
-
Gerhard Gerard Ger
|
209
|
-
James Jim
|
210
|
-
Joanna Joan Joanne
|
211
|
-
John Johnny
|
212
|
-
Jonathan Jon
|
213
|
-
Kenneth Ken Kenny
|
214
|
-
Michael Mike Mick Micky
|
215
|
-
Nicholas Nick Nicolas
|
216
|
-
Nicola Nickie Nicky
|
217
|
-
Patrick Pat Paddy
|
218
|
-
Peter Pete
|
219
|
-
Philippe Philip Phillippe Phillip
|
220
|
-
Rick Ricky
|
221
|
-
Robert Bob Bobby
|
222
|
-
Samual Sam Samuel
|
223
|
-
Stefanie Stef
|
224
|
-
Stephen Steven Steve
|
225
|
-
Terence Terry
|
226
|
-
Thomas Tom Tommy
|
227
|
-
William Will Willy Willie Bill
|
228
|
-
EOF
|
229
|
-
end
|
230
|
-
end
|
1
|
+
require 'icu_name/name.rb'
|
2
|
+
require 'icu_name/util.rb'
|
@@ -1,3 +1,4 @@
|
|
1
|
+
# encoding: UTF-8
|
1
2
|
require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
|
2
3
|
|
3
4
|
module ICU
|
@@ -6,68 +7,68 @@ module ICU
|
|
6
7
|
before(:each) do
|
7
8
|
@simple = Name.new('mark j l', 'orr')
|
8
9
|
end
|
9
|
-
|
10
|
+
|
10
11
|
it "#first returns the first name(s)" do
|
11
12
|
@simple.first.should == 'Mark J. L.'
|
12
13
|
end
|
13
|
-
|
14
|
+
|
14
15
|
it "#last returns the last name(s)" do
|
15
16
|
@simple.last.should == 'Orr'
|
16
17
|
end
|
17
|
-
|
18
|
+
|
18
19
|
it "#name returns the full name with first name(s) first" do
|
19
20
|
@simple.name.should == 'Mark J. L. Orr'
|
20
21
|
end
|
21
|
-
|
22
|
+
|
22
23
|
it "#rname returns the full name with last name(s) first" do
|
23
24
|
@simple.rname.should == 'Orr, Mark J. L.'
|
24
25
|
end
|
25
|
-
|
26
|
+
|
26
27
|
it "#to_s is the same as rname" do
|
27
28
|
@simple.to_s.should == 'Orr, Mark J. L.'
|
28
29
|
end
|
29
|
-
|
30
|
+
|
30
31
|
it "#match returns true if and only if two names match" do
|
31
32
|
@simple.match('mark j l orr').should be_true
|
32
33
|
@simple.match('malcolm g l orr').should be_false
|
33
34
|
end
|
34
35
|
end
|
35
|
-
|
36
|
+
|
36
37
|
context "rdoc expample" do
|
37
38
|
before(:each) do
|
38
39
|
@robert = Name.new(' robert j ', ' FISCHER ')
|
39
40
|
@bobby = Name.new(' bobby fischer ')
|
40
41
|
end
|
41
|
-
|
42
|
+
|
42
43
|
it "should get Robert" do
|
43
44
|
@robert.name.should == 'Robert J. Fischer'
|
44
45
|
end
|
45
|
-
|
46
|
+
|
46
47
|
it "should get Bobby" do
|
47
48
|
@bobby.last.should == 'Fischer'
|
48
49
|
@bobby.first.should == 'Bobby'
|
49
50
|
end
|
50
|
-
|
51
|
+
|
51
52
|
it "should match Robert and Bobby" do
|
52
53
|
@robert.match(@bobby).should be_true
|
53
54
|
@robert.match('R. J.', 'Fischer').should be_true
|
54
55
|
@bobby.match('R. J.', 'Fischer').should be_false
|
55
56
|
end
|
56
|
-
|
57
|
+
|
57
58
|
it "should canconicalise last names" do
|
58
59
|
Name.new('John', 'O Reilly').last.should == "O'Reilly"
|
59
60
|
Name.new('dave', 'mcmanus').last.should == "McManus"
|
60
61
|
Name.new('pete', 'MACMANUS').last.should == "MacManus"
|
61
62
|
end
|
62
63
|
end
|
63
|
-
|
64
|
+
|
64
65
|
context "names that are already canonical" do
|
65
66
|
it "should not be altered" do
|
66
67
|
Name.new('Mark J. L.', 'Orr').name.should == 'Mark J. L. Orr'
|
67
68
|
Name.new('Anna-Marie J.-K.', 'Liviu-Dieter').name.should == 'Anna-Marie J.-K. Liviu-Dieter'
|
68
69
|
end
|
69
70
|
end
|
70
|
-
|
71
|
+
|
71
72
|
context "last names beginning with a single letter followed by a quote" do
|
72
73
|
it "should be handled correctly" do
|
73
74
|
Name.new('una', "O'boyle").name.should == "Una O'Boyle"
|
@@ -76,7 +77,7 @@ module ICU
|
|
76
77
|
Name.new('cormac', "o brien").name.should == "Cormac O'Brien"
|
77
78
|
end
|
78
79
|
end
|
79
|
-
|
80
|
+
|
80
81
|
context "last beginning with Mc" do
|
81
82
|
it "should be handled correctly" do
|
82
83
|
Name.new('shane', "mccabe").name.should == "Shane McCabe"
|
@@ -85,7 +86,7 @@ module ICU
|
|
85
86
|
Name.new('bartlomiej', "macieja").name.should == "Bartlomiej Macieja"
|
86
87
|
end
|
87
88
|
end
|
88
|
-
|
89
|
+
|
89
90
|
context "doubled barrelled names or initials" do
|
90
91
|
it "should be handled correctly" do
|
91
92
|
Name.new('anna-marie', 'den-otter').name.should == 'Anna-Marie Den-Otter'
|
@@ -95,26 +96,26 @@ module ICU
|
|
95
96
|
Name.new('hannah', "lowry - o reilly").name.should == "Hannah Lowry-O'Reilly"
|
96
97
|
end
|
97
98
|
end
|
98
|
-
|
99
|
+
|
99
100
|
context "extraneous white space" do
|
100
101
|
it "should be handled correctly" do
|
101
102
|
Name.new(' mark j l ', " \t\r\n orr \n").name.should == 'Mark J. L. Orr'
|
102
103
|
end
|
103
104
|
end
|
104
|
-
|
105
|
+
|
105
106
|
context "extraneous full stops" do
|
106
107
|
it "should be handled correctly" do
|
107
108
|
Name.new('. mark j..l', 'orr.').name.should == 'Mark J. L. Orr'
|
108
109
|
end
|
109
110
|
end
|
110
|
-
|
111
|
+
|
111
112
|
context "construction from a single string" do
|
112
113
|
before(:each) do
|
113
114
|
@mark1 = Name.new('ORR, mark j l')
|
114
115
|
@mark2 = Name.new('MARK J L ORR')
|
115
116
|
@oreil = Name.new("O'Reilly, j-k")
|
116
117
|
end
|
117
|
-
|
118
|
+
|
118
119
|
it "should be possible in simple cases" do
|
119
120
|
@mark1.first.should == 'Mark J. L.'
|
120
121
|
@mark1.last.should == 'Orr'
|
@@ -123,13 +124,13 @@ module ICU
|
|
123
124
|
@oreil.name.should == "J.-K. O'Reilly"
|
124
125
|
end
|
125
126
|
end
|
126
|
-
|
127
|
+
|
127
128
|
context "construction from an instance" do
|
128
129
|
it "should be possible" do
|
129
130
|
Name.new(Name.new('ORR, mark j l')).name.should == 'Mark J. L. Orr'
|
130
131
|
end
|
131
132
|
end
|
132
|
-
|
133
|
+
|
133
134
|
context "constuction corner cases" do
|
134
135
|
it "should be handled correctly" do
|
135
136
|
Name.new('Orr').name.should == 'Orr'
|
@@ -140,13 +141,13 @@ module ICU
|
|
140
141
|
Name.new.rname.should == ''
|
141
142
|
end
|
142
143
|
end
|
143
|
-
|
144
|
+
|
144
145
|
context "inputs to matching" do
|
145
146
|
before(:all) do
|
146
147
|
@mark = Name.new('Mark', 'Orr')
|
147
148
|
@kram = Name.new('Mark', 'Orr')
|
148
149
|
end
|
149
|
-
|
150
|
+
|
150
151
|
it "should be flexible" do
|
151
152
|
@mark.match('Mark', 'Orr').should be_true
|
152
153
|
@mark.match('Mark Orr').should be_true
|
@@ -159,12 +160,12 @@ module ICU
|
|
159
160
|
it "should match when first names are the same" do
|
160
161
|
Name.new('Mark', 'Orr').match('Mark', 'Orr').should be_true
|
161
162
|
end
|
162
|
-
|
163
|
+
|
163
164
|
it "should be flexible with regards to hyphens in double barrelled names" do
|
164
165
|
Name.new('J.-K.', 'Rowling').match('J. K.', 'Rowling').should be_true
|
165
166
|
Name.new('Joanne-K.', 'Rowling').match('Joanne K.', 'Rowling').should be_true
|
166
167
|
end
|
167
|
-
|
168
|
+
|
168
169
|
it "should match initials" do
|
169
170
|
Name.new('M. J. L.', 'Orr').match('Mark John Legard', 'Orr').should be_true
|
170
171
|
Name.new('M.', 'Orr').match('Mark', 'Orr').should be_true
|
@@ -172,37 +173,49 @@ module ICU
|
|
172
173
|
Name.new('M.', 'Orr').match('M. J.', 'Orr').should be_true
|
173
174
|
Name.new('M. J. L.', 'Orr').match('M. G.', 'Orr').should be_false
|
174
175
|
end
|
175
|
-
|
176
|
+
|
176
177
|
it "should not match on full names not in first position or without an exact match" do
|
177
178
|
Name.new('J. M.', 'Orr').match('John', 'Orr').should be_true
|
178
179
|
Name.new('M. J.', 'Orr').match('John', 'Orr').should be_false
|
179
180
|
Name.new('M. John', 'Orr').match('John', 'Orr').should be_true
|
180
181
|
end
|
181
|
-
|
182
|
+
|
182
183
|
it "should handle common nicknames" do
|
183
184
|
Name.new('William', 'Orr').match('Bill', 'Orr').should be_true
|
184
185
|
Name.new('David', 'Orr').match('Dave', 'Orr').should be_true
|
185
186
|
Name.new('Mick', 'Orr').match('Mike', 'Orr').should be_true
|
186
187
|
end
|
187
|
-
|
188
|
+
|
188
189
|
it "should not mix up nick names" do
|
189
190
|
Name.new('David', 'Orr').match('Bill', 'Orr').should be_false
|
190
191
|
end
|
191
192
|
end
|
192
|
-
|
193
|
+
|
193
194
|
context "last name matches" do
|
194
195
|
it "should be flexible with regards to hyphens in double barrelled names" do
|
195
196
|
Name.new('Johanna', "Lowry-O'Reilly").match('Johanna', "Lowry O'Reilly").should be_true
|
196
197
|
end
|
197
|
-
|
198
|
+
|
198
199
|
it "should be case insensitive in matches involving Macsomething and MacSomething" do
|
199
200
|
Name.new('Alan', 'MacDonagh').match('Alan', 'Macdonagh').should be_true
|
200
201
|
end
|
201
|
-
|
202
|
+
|
202
203
|
it "should cater for the common mispelling of names beginning with Mc or Mac" do
|
203
204
|
Name.new('Alan', 'McDonagh').match('Alan', 'MacDonagh').should be_true
|
204
205
|
Name.new('Darko', 'Polimac').match('Darko', 'Polimc').should be_false
|
205
206
|
end
|
206
207
|
end
|
208
|
+
|
209
|
+
context "accented characters" do
|
210
|
+
before(:each) do
|
211
|
+
@first = 'Gearóidín'
|
212
|
+
@last = 'Uí Laighléis'
|
213
|
+
end
|
214
|
+
|
215
|
+
it "should not yet deal with UTF-8" do
|
216
|
+
name = Name.new(@first, @last)
|
217
|
+
name.first.should_not == @first
|
218
|
+
end
|
219
|
+
end
|
207
220
|
end
|
208
221
|
end
|
data/spec/util_spec.rb
ADDED
@@ -0,0 +1,37 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
|
3
|
+
|
4
|
+
module ICU
|
5
|
+
describe Util do
|
6
|
+
context "#is_utf8" do
|
7
|
+
it "should recognise US-ASCII as a special case of UTF-8" do
|
8
|
+
Util.is_utf8("Resume".encode("US-ASCII")).should be_true
|
9
|
+
end
|
10
|
+
|
11
|
+
it "should recognise UTF-8" do
|
12
|
+
Util.is_utf8("Résumé").should be_true
|
13
|
+
Util.is_utf8("δog").should be_true
|
14
|
+
end
|
15
|
+
|
16
|
+
it "should recognize other encodings as not being UTF-8" do
|
17
|
+
Util.is_utf8("Résumé".encode("ISO-8859-1")).should be_false
|
18
|
+
Util.is_utf8("€50".encode("Windows-1252")).should be_false
|
19
|
+
Util.is_utf8("ひらがな".encode("Shift_JIS")).should be_false
|
20
|
+
Util.is_utf8("\xa3").should be_false
|
21
|
+
end
|
22
|
+
end
|
23
|
+
|
24
|
+
context "#to_utf8" do
|
25
|
+
it "should convert to UTF-8" do
|
26
|
+
Util.to_utf8("Resume").should == "Resume"
|
27
|
+
Util.to_utf8("Resume".force_encoding("US-ASCII")).encoding.name.should == "UTF-8"
|
28
|
+
Util.to_utf8("Résumé".encode("ISO-8859-1")).should == "Résumé"
|
29
|
+
Util.to_utf8("Résumé".encode("Windows-1252")).should == "Résumé"
|
30
|
+
Util.to_utf8("€50".encode("Windows-1252")).should == "€50"
|
31
|
+
Util.to_utf8("\xa350".force_encoding("ASCII-8BIT")).should == "£50"
|
32
|
+
Util.to_utf8("\xa350").should == "£50"
|
33
|
+
Util.to_utf8("ひらがな".encode("Shift_JIS")).should == "ひらがな"
|
34
|
+
end
|
35
|
+
end
|
36
|
+
end
|
37
|
+
end
|
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 0
|
8
|
-
- 3
|
9
|
-
version: 0.0.3
|
8
|
+
- 4
|
9
|
+
version: 0.0.4
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Mark Orr
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2011-01-
|
17
|
+
date: 2011-01-21 00:00:00 +00:00
|
18
18
|
default_executable:
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
@@ -55,10 +55,13 @@ extra_rdoc_files:
|
|
55
55
|
- LICENCE
|
56
56
|
- README.rdoc
|
57
57
|
files:
|
58
|
+
- lib/icu_name/name.rb
|
59
|
+
- lib/icu_name/util.rb
|
58
60
|
- lib/icu_name/version.rb
|
59
61
|
- lib/icu_name.rb
|
60
|
-
- spec/icu_name_spec.rb
|
62
|
+
- spec/name_spec.rb
|
61
63
|
- spec/spec_helper.rb
|
64
|
+
- spec/util_spec.rb
|
62
65
|
- LICENCE
|
63
66
|
- README.rdoc
|
64
67
|
has_rdoc: true
|