icu_name 0.0.6 → 0.0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +12 -4
- data/lib/icu_name/name.rb +24 -5
- data/lib/icu_name/version.rb +1 -1
- data/spec/name_spec.rb +28 -6
- metadata +3 -3
data/README.rdoc
CHANGED
@@ -26,6 +26,10 @@ Capitalisation, white space and punctuation will all be automatically corrected:
|
|
26
26
|
robert.name # => 'Robert J. Fischer'
|
27
27
|
robert.rname # => 'Fischer, Robert J.' (reversed name)
|
28
28
|
|
29
|
+
The input text, without any changes apart from white-space cleanup, is returned by the _original_ method:
|
30
|
+
|
31
|
+
robert.original # => 'robert j FISCHER'
|
32
|
+
|
29
33
|
To avoid ambiguity when either the first or second names consist of multiple words, it is better to
|
30
34
|
supply the two separately, if known. However, the full name can be supplied alone to the constructor
|
31
35
|
and a guess will be made as to the first and last names.
|
@@ -61,8 +65,8 @@ Some of the ways last names are canonicalised are illustrated below:
|
|
61
65
|
== Characters and Encoding
|
62
66
|
|
63
67
|
The class can only cope with Western European letter characters, including the accented ones in Latin-1.
|
64
|
-
It's various accessors (_first_, _last_, _name_, _rname_, _to_s_) always return strings
|
65
|
-
no matter what the input encoding.
|
68
|
+
Its various accessors (_first_, _last_, _name_, _rname_, _to_s_, _original_) always return strings
|
69
|
+
encoded in UTF-8, no matter what the input encoding.
|
66
70
|
|
67
71
|
eric = ICU::Name.new('éric', 'PRIÉ')
|
68
72
|
eric.rname # => "Prié, Éric"
|
@@ -71,11 +75,13 @@ no matter what the input encoding.
|
|
71
75
|
eric = ICU::Name.new('éric'.encode("ISO-8859-1"), 'PRIÉ'.force_encoding("ASCII-8BIT"))
|
72
76
|
eric.rname # => "Prié, Éric"
|
73
77
|
eric.rname.encoding.name # => "UTF-8"
|
78
|
+
eric.original # => "éric PRIÉ"
|
79
|
+
eric.original.encoding.name # => "UTF-8"
|
74
80
|
|
75
81
|
Currently, all characters outside the Latin-1 range are removed as if they weren't there.
|
76
82
|
|
77
|
-
ICU::Name.new('Józef Żabiński').name # "Józef Abiski"
|
78
|
-
ICU::Name.new('Bǔ Xiángzhì').name # "B. Xiángzhì"
|
83
|
+
ICU::Name.new('Józef Żabiński').name # => "Józef Abiski"
|
84
|
+
ICU::Name.new('Bǔ Xiángzhì').name # => "B. Xiángzhì"
|
79
85
|
|
80
86
|
Accented Latin-1 characters can be transliterated into their ascii counterparts by setting the
|
81
87
|
_ascii_ option to a true value.
|
@@ -86,6 +92,8 @@ This works with all the other accessors and also with the constructor:
|
|
86
92
|
|
87
93
|
eric_ascii = ICU::Name.new('éric', 'PRIÉ', :ascii => true)
|
88
94
|
eric_ascii.name # => "Eric Prie"
|
95
|
+
jozef_ascii = ICU::Name.new('Józef', 'Żabiński', :ascii => true)
|
96
|
+
jozef_ascii.name # => "Jozef Zabinski"
|
89
97
|
|
90
98
|
The option also relaxes the need for accented characters to match exactly:
|
91
99
|
|
data/lib/icu_name/name.rb
CHANGED
@@ -9,11 +9,18 @@ module ICU
|
|
9
9
|
def initialize(name1='', name2='', opt={})
|
10
10
|
@name1 = Util.to_utf8(name1.to_s)
|
11
11
|
@name2 = Util.to_utf8(name2.to_s)
|
12
|
-
|
12
|
+
originalize
|
13
13
|
if opt[:ascii]
|
14
|
-
@
|
15
|
-
@
|
14
|
+
@name1 = ActiveSupport::Inflector.transliterate(@name1)
|
15
|
+
@name2 = ActiveSupport::Inflector.transliterate(@name2)
|
16
16
|
end
|
17
|
+
canonicalize
|
18
|
+
end
|
19
|
+
|
20
|
+
# Original text getter.
|
21
|
+
def original(opts={})
|
22
|
+
return ActiveSupport::Inflector.transliterate(@original) if opts[:ascii]
|
23
|
+
@original
|
17
24
|
end
|
18
25
|
|
19
26
|
# First name getter.
|
@@ -60,6 +67,13 @@ module ICU
|
|
60
67
|
# :stopdoc:
|
61
68
|
private
|
62
69
|
|
70
|
+
# Save the original inputs without any cleanup other than whitespace.
|
71
|
+
def originalize
|
72
|
+
@original = "#{@name1} #{@name2}"
|
73
|
+
@original.strip!
|
74
|
+
@original.gsub!(/\s+/, ' ')
|
75
|
+
end
|
76
|
+
|
63
77
|
# Canonicalise the first and last names.
|
64
78
|
def canonicalize
|
65
79
|
first, last = partition
|
@@ -70,7 +84,7 @@ module ICU
|
|
70
84
|
# Split one complete name into first and last parts.
|
71
85
|
def partition
|
72
86
|
if @name2.length == 0
|
73
|
-
# Only one
|
87
|
+
# Only one input so we must split it into first and last.
|
74
88
|
parts = @name1.split(/,/)
|
75
89
|
if parts.size > 1
|
76
90
|
last = clean(parts.shift || '')
|
@@ -78,7 +92,7 @@ module ICU
|
|
78
92
|
else
|
79
93
|
parts = clean(@name1).split(/ /)
|
80
94
|
last = parts.pop || ''
|
81
|
-
last = "#{parts.pop}'#{last}" if parts.size > 1 && parts.last
|
95
|
+
last = "#{parts.pop}'#{last}" if parts.size > 1 && parts.last.match(/^O$/i) && !last.match(/^O'/i) # "O", "Reilly" => "O'Reilly"
|
82
96
|
first = parts.join(' ')
|
83
97
|
end
|
84
98
|
else
|
@@ -114,6 +128,11 @@ module ICU
|
|
114
128
|
def finish_last(names)
|
115
129
|
names.gsub!(/\b([A-Z\u{c0}-\u{de}]')([a-z\u{e0}-\u{ff}])/) { |m| $1 << $2.mb_chars.upcase.to_s }
|
116
130
|
names.gsub!(/\b(Mc)([a-z\u{e0}-\u{ff}])/) { |m| $1 << $2.mb_chars.upcase.to_s }
|
131
|
+
names.gsub!(/\bMac([a-z\u{e0}-\u{ff}])/) do |m|
|
132
|
+
letter = $1 # capitalize after "Mac" only if the original clearly indicates it
|
133
|
+
upper = letter.mb_chars.upcase.to_s
|
134
|
+
'Mac'.concat(@original.match(/\bMac#{upper}/) ? upper : letter)
|
135
|
+
end
|
117
136
|
names.gsub!(/\bO ([A-Z\u{c0}-\u{de}])/) { |m| "O'" << $1 }
|
118
137
|
names
|
119
138
|
end
|
data/lib/icu_name/version.rb
CHANGED
data/spec/name_spec.rb
CHANGED
@@ -5,7 +5,7 @@ module ICU
|
|
5
5
|
describe Name do
|
6
6
|
context "public methods" do
|
7
7
|
before(:each) do
|
8
|
-
@simple = Name.new('mark j l', '
|
8
|
+
@simple = Name.new('mark j l', 'ORR')
|
9
9
|
end
|
10
10
|
|
11
11
|
it "#first returns the first name(s)" do
|
@@ -28,6 +28,10 @@ module ICU
|
|
28
28
|
@simple.to_s.should == 'Orr, Mark J. L.'
|
29
29
|
end
|
30
30
|
|
31
|
+
it "#original returns the original data" do
|
32
|
+
@simple.original.should == 'mark j l ORR'
|
33
|
+
end
|
34
|
+
|
31
35
|
it "#match returns true if and only if two names match" do
|
32
36
|
@simple.match('mark j l orr').should be_true
|
33
37
|
@simple.match('malcolm g l orr').should be_false
|
@@ -62,18 +66,25 @@ module ICU
|
|
62
66
|
end
|
63
67
|
|
64
68
|
it "characters and encoding" do
|
65
|
-
josef =
|
69
|
+
josef = Name.new('Józef', 'Żabiński')
|
66
70
|
josef.name.should == "Józef Abiski"
|
67
|
-
|
71
|
+
josef.original.should == "Józef Żabiński"
|
72
|
+
josef.original(:ascii => true).should == "Jozef Zabinski"
|
73
|
+
josef = Name.new('Józef', 'Żabiński', :ascii => true)
|
74
|
+
josef.name.should == "Jozef Zabinski"
|
75
|
+
bu = Name.new('Bǔ Xiángzhì')
|
68
76
|
bu.name.should == "B. Xiángzhì"
|
69
|
-
eric =
|
77
|
+
eric = Name.new('éric', 'PRIÉ')
|
70
78
|
eric.rname.should == "Prié, Éric"
|
71
79
|
eric.rname.encoding.name.should == "UTF-8"
|
72
|
-
eric =
|
80
|
+
eric = Name.new('éric'.encode("ISO-8859-1"), 'PRIÉ'.force_encoding("ASCII-8BIT"))
|
73
81
|
eric.rname.should == "Prié, Éric"
|
74
82
|
eric.rname.encoding.name.should == "UTF-8"
|
83
|
+
eric.original.should == "éric PRIÉ"
|
84
|
+
eric.original(:ascii => true).should == "eric PRIE"
|
85
|
+
eric.original.encoding.name.should == "UTF-8"
|
75
86
|
eric.name(:ascii => true).should == "Eric Prie"
|
76
|
-
eric_ascii =
|
87
|
+
eric_ascii = Name.new('éric', 'PRIÉ', :ascii => true)
|
77
88
|
eric_ascii.name.should == "Eric Prie"
|
78
89
|
eric.match('Éric', 'Prié').should be_true
|
79
90
|
eric.match('Eric', 'Prie').should be_false
|
@@ -104,9 +115,12 @@ module ICU
|
|
104
115
|
it "should be handled correctly" do
|
105
116
|
Name.new('shane', "mccabe").name.should == "Shane McCabe"
|
106
117
|
Name.new('shawn', "macdonagh").name.should == "Shawn Macdonagh"
|
118
|
+
Name.new('Colin', "MacNab").name.should == "Colin MacNab"
|
119
|
+
Name.new('colin', "macnab").name.should == "Colin Macnab"
|
107
120
|
Name.new('bartlomiej', "macieja").name.should == "Bartlomiej Macieja"
|
108
121
|
Name.new('türko', "mcözgür").name.should == "Türko McÖzgür"
|
109
122
|
Name.new('TÜRKO', "MACÖZGÜR").name.should == "Türko Macözgür"
|
123
|
+
Name.new('Türko', "MacÖzgür").name.should == "Türko MacÖzgür"
|
110
124
|
end
|
111
125
|
end
|
112
126
|
|
@@ -171,6 +185,14 @@ module ICU
|
|
171
185
|
end
|
172
186
|
end
|
173
187
|
|
188
|
+
context "the original input" do
|
189
|
+
it "should be the original text unaltered except for white space" do
|
190
|
+
Name.new(' Mark j l ', ' ORR ').original.should == 'Mark j l ORR'
|
191
|
+
Name.new('Józef', 'Żabiński').original.should == 'Józef Żabiński'
|
192
|
+
Name.new('Ui Laigleis,Gearoidin').original.should == 'Ui Laigleis,Gearoidin'
|
193
|
+
end
|
194
|
+
end
|
195
|
+
|
174
196
|
context "encoding" do
|
175
197
|
before(:each) do
|
176
198
|
@first = 'Gearóidín'
|
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 0
|
8
|
-
-
|
9
|
-
version: 0.0.
|
8
|
+
- 7
|
9
|
+
version: 0.0.7
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Mark Orr
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2011-01-
|
17
|
+
date: 2011-01-24 00:00:00 +00:00
|
18
18
|
default_executable:
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|