icu_name 0.0.6 → 0.0.7
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +12 -4
- data/lib/icu_name/name.rb +24 -5
- data/lib/icu_name/version.rb +1 -1
- data/spec/name_spec.rb +28 -6
- metadata +3 -3
data/README.rdoc
CHANGED
@@ -26,6 +26,10 @@ Capitalisation, white space and punctuation will all be automatically corrected:
|
|
26
26
|
robert.name # => 'Robert J. Fischer'
|
27
27
|
robert.rname # => 'Fischer, Robert J.' (reversed name)
|
28
28
|
|
29
|
+
The input text, without any changes apart from white-space cleanup, is returned by the _original_ method:
|
30
|
+
|
31
|
+
robert.original # => 'robert j FISHER'
|
32
|
+
|
29
33
|
To avoid ambiguity when either the first or second names consist of multiple words, it is better to
|
30
34
|
supply the two separately, if known. However, the full name can be supplied alone to the constructor
|
31
35
|
and a guess will be made as to the first and last names.
|
@@ -61,8 +65,8 @@ Some of the ways last names are canonicalised are illustrated below:
|
|
61
65
|
== Characters and Encoding
|
62
66
|
|
63
67
|
The class can only cope with Western European letter characters, including the accented ones in Latin-1.
|
64
|
-
It's various accessors (_first_, _last_, _name_, _rname_, _to_s_) always return strings
|
65
|
-
no matter what the input encoding.
|
68
|
+
Its various accessors (_first_, _last_, _name_, _rname_, _to_s_, _original_) always return strings
|
69
|
+
encoded in UTF-8, no matter what the input encoding.
|
66
70
|
|
67
71
|
eric = ICU::Name.new('éric', 'PRIÉ')
|
68
72
|
eric.rname # => "Prié, Éric"
|
@@ -71,11 +75,13 @@ no matter what the input encoding.
|
|
71
75
|
eric = ICU::Name.new('éric'.encode("ISO-8859-1"), 'PRIÉ'.force_encoding("ASCII-8BIT"))
|
72
76
|
eric.rname # => "Prié, Éric"
|
73
77
|
eric.rname.encoding.name # => "UTF-8"
|
78
|
+
eric.original # => "éric PRIÉ"
|
79
|
+
eric.original.encoding.name # => "UTF-8"
|
74
80
|
|
75
81
|
Currently, all characters outside the Latin-1 range are removed as if they weren't there.
|
76
82
|
|
77
|
-
ICU::Name.new('Józef Żabiński').name # "Józef Abiski"
|
78
|
-
ICU::Name.new('Bǔ Xiángzhì').name # "B. Xiángzhì"
|
83
|
+
ICU::Name.new('Józef Żabiński').name # => "Józef Abiski"
|
84
|
+
ICU::Name.new('Bǔ Xiángzhì').name # => "B. Xiángzhì"
|
79
85
|
|
80
86
|
Accented Latin-1 characters can be transliterated into their ascii counterparts by setting the
|
81
87
|
_ascii_ option to a true value.
|
@@ -86,6 +92,8 @@ This works with all the other accessors and also with the constructor:
|
|
86
92
|
|
87
93
|
eric_ascii = ICU::Name.new('éric', 'PRIÉ', :ascii => true)
|
88
94
|
eric_ascii.name # => "Eric Prie"
|
95
|
+
jozef_ascii = ICU::Name.new('Józef', 'Żabiński', :ascii => true)
|
96
|
+
jozef_ascii.name # => "Jozef Zabinski"
|
89
97
|
|
90
98
|
The option also relaxes the need for accented characters to match exactly:
|
91
99
|
|
data/lib/icu_name/name.rb
CHANGED
@@ -9,11 +9,18 @@ module ICU
|
|
9
9
|
def initialize(name1='', name2='', opt={})
|
10
10
|
@name1 = Util.to_utf8(name1.to_s)
|
11
11
|
@name2 = Util.to_utf8(name2.to_s)
|
12
|
-
|
12
|
+
originalize
|
13
13
|
if opt[:ascii]
|
14
|
-
@
|
15
|
-
@
|
14
|
+
@name1 = ActiveSupport::Inflector.transliterate(@name1)
|
15
|
+
@name2 = ActiveSupport::Inflector.transliterate(@name2)
|
16
16
|
end
|
17
|
+
canonicalize
|
18
|
+
end
|
19
|
+
|
20
|
+
# Original text getter.
|
21
|
+
def original(opts={})
|
22
|
+
return ActiveSupport::Inflector.transliterate(@original) if opts[:ascii]
|
23
|
+
@original
|
17
24
|
end
|
18
25
|
|
19
26
|
# First name getter.
|
@@ -60,6 +67,13 @@ module ICU
|
|
60
67
|
# :stopdoc:
|
61
68
|
private
|
62
69
|
|
70
|
+
# Save the original inputs without any cleanup other than whitespace.
|
71
|
+
def originalize
|
72
|
+
@original = "#{@name1} #{@name2}"
|
73
|
+
@original.strip!
|
74
|
+
@original.gsub!(/\s+/, ' ')
|
75
|
+
end
|
76
|
+
|
63
77
|
# Canonicalise the first and last names.
|
64
78
|
def canonicalize
|
65
79
|
first, last = partition
|
@@ -70,7 +84,7 @@ module ICU
|
|
70
84
|
# Split one complete name into first and last parts.
|
71
85
|
def partition
|
72
86
|
if @name2.length == 0
|
73
|
-
# Only one
|
87
|
+
# Only one input so we must split it into first and last.
|
74
88
|
parts = @name1.split(/,/)
|
75
89
|
if parts.size > 1
|
76
90
|
last = clean(parts.shift || '')
|
@@ -78,7 +92,7 @@ module ICU
|
|
78
92
|
else
|
79
93
|
parts = clean(@name1).split(/ /)
|
80
94
|
last = parts.pop || ''
|
81
|
-
last = "#{parts.pop}'#{last}" if parts.size > 1 && parts.last
|
95
|
+
last = "#{parts.pop}'#{last}" if parts.size > 1 && parts.last.match(/^O$/i) && !last.match(/^O'/i) # "O", "Reilly" => "O'Reilly"
|
82
96
|
first = parts.join(' ')
|
83
97
|
end
|
84
98
|
else
|
@@ -114,6 +128,11 @@ module ICU
|
|
114
128
|
def finish_last(names)
|
115
129
|
names.gsub!(/\b([A-Z\u{c0}-\u{de}]')([a-z\u{e0}-\u{ff}])/) { |m| $1 << $2.mb_chars.upcase.to_s }
|
116
130
|
names.gsub!(/\b(Mc)([a-z\u{e0}-\u{ff}])/) { |m| $1 << $2.mb_chars.upcase.to_s }
|
131
|
+
names.gsub!(/\bMac([a-z\u{e0}-\u{ff}])/) do |m|
|
132
|
+
letter = $1 # capitalize after "Mac" only if the original clearly indicates it
|
133
|
+
upper = letter.mb_chars.upcase.to_s
|
134
|
+
'Mac'.concat(@original.match(/\bMac#{upper}/) ? upper : letter)
|
135
|
+
end
|
117
136
|
names.gsub!(/\bO ([A-Z\u{c0}-\u{de}])/) { |m| "O'" << $1 }
|
118
137
|
names
|
119
138
|
end
|
data/lib/icu_name/version.rb
CHANGED
data/spec/name_spec.rb
CHANGED
@@ -5,7 +5,7 @@ module ICU
|
|
5
5
|
describe Name do
|
6
6
|
context "public methods" do
|
7
7
|
before(:each) do
|
8
|
-
@simple = Name.new('mark j l', '
|
8
|
+
@simple = Name.new('mark j l', 'ORR')
|
9
9
|
end
|
10
10
|
|
11
11
|
it "#first returns the first name(s)" do
|
@@ -28,6 +28,10 @@ module ICU
|
|
28
28
|
@simple.to_s.should == 'Orr, Mark J. L.'
|
29
29
|
end
|
30
30
|
|
31
|
+
it "#original returns the original data" do
|
32
|
+
@simple.original.should == 'mark j l ORR'
|
33
|
+
end
|
34
|
+
|
31
35
|
it "#match returns true if and only if two names match" do
|
32
36
|
@simple.match('mark j l orr').should be_true
|
33
37
|
@simple.match('malcolm g l orr').should be_false
|
@@ -62,18 +66,25 @@ module ICU
|
|
62
66
|
end
|
63
67
|
|
64
68
|
it "characters and encoding" do
|
65
|
-
josef =
|
69
|
+
josef = Name.new('Józef', 'Żabiński')
|
66
70
|
josef.name.should == "Józef Abiski"
|
67
|
-
|
71
|
+
josef.original.should == "Józef Żabiński"
|
72
|
+
josef.original(:ascii => true).should == "Jozef Zabinski"
|
73
|
+
josef = Name.new('Józef', 'Żabiński', :ascii => true)
|
74
|
+
josef.name.should == "Jozef Zabinski"
|
75
|
+
bu = Name.new('Bǔ Xiángzhì')
|
68
76
|
bu.name.should == "B. Xiángzhì"
|
69
|
-
eric =
|
77
|
+
eric = Name.new('éric', 'PRIÉ')
|
70
78
|
eric.rname.should == "Prié, Éric"
|
71
79
|
eric.rname.encoding.name.should == "UTF-8"
|
72
|
-
eric =
|
80
|
+
eric = Name.new('éric'.encode("ISO-8859-1"), 'PRIÉ'.force_encoding("ASCII-8BIT"))
|
73
81
|
eric.rname.should == "Prié, Éric"
|
74
82
|
eric.rname.encoding.name.should == "UTF-8"
|
83
|
+
eric.original.should == "éric PRIÉ"
|
84
|
+
eric.original(:ascii => true).should == "eric PRIE"
|
85
|
+
eric.original.encoding.name.should == "UTF-8"
|
75
86
|
eric.name(:ascii => true).should == "Eric Prie"
|
76
|
-
eric_ascii =
|
87
|
+
eric_ascii = Name.new('éric', 'PRIÉ', :ascii => true)
|
77
88
|
eric_ascii.name.should == "Eric Prie"
|
78
89
|
eric.match('Éric', 'Prié').should be_true
|
79
90
|
eric.match('Eric', 'Prie').should be_false
|
@@ -104,9 +115,12 @@ module ICU
|
|
104
115
|
it "should be handled correctly" do
|
105
116
|
Name.new('shane', "mccabe").name.should == "Shane McCabe"
|
106
117
|
Name.new('shawn', "macdonagh").name.should == "Shawn Macdonagh"
|
118
|
+
Name.new('Colin', "MacNab").name.should == "Colin MacNab"
|
119
|
+
Name.new('colin', "macnab").name.should == "Colin Macnab"
|
107
120
|
Name.new('bartlomiej', "macieja").name.should == "Bartlomiej Macieja"
|
108
121
|
Name.new('türko', "mcözgür").name.should == "Türko McÖzgür"
|
109
122
|
Name.new('TÜRKO', "MACÖZGÜR").name.should == "Türko Macözgür"
|
123
|
+
Name.new('Türko', "MacÖzgür").name.should == "Türko MacÖzgür"
|
110
124
|
end
|
111
125
|
end
|
112
126
|
|
@@ -171,6 +185,14 @@ module ICU
|
|
171
185
|
end
|
172
186
|
end
|
173
187
|
|
188
|
+
context "the original input" do
|
189
|
+
it "should be the original text unaltered except for white space" do
|
190
|
+
Name.new(' Mark j l ', ' ORR ').original.should == 'Mark j l ORR'
|
191
|
+
Name.new('Józef', 'Żabiński').original.should == 'Józef Żabiński'
|
192
|
+
Name.new('Ui Laigleis,Gearoidin').original.should == 'Ui Laigleis,Gearoidin'
|
193
|
+
end
|
194
|
+
end
|
195
|
+
|
174
196
|
context "encoding" do
|
175
197
|
before(:each) do
|
176
198
|
@first = 'Gearóidín'
|
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 0
|
8
|
-
-
|
9
|
-
version: 0.0.
|
8
|
+
- 7
|
9
|
+
version: 0.0.7
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Mark Orr
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2011-01-
|
17
|
+
date: 2011-01-24 00:00:00 +00:00
|
18
18
|
default_executable:
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|