icu_name 0.0.6 → 0.0.7

Sign up to get free protection for your applications and to get access to all the features.
data/README.rdoc CHANGED
@@ -26,6 +26,10 @@ Capitalisation, white space and punctuation will all be automatically corrected:
26
26
  robert.name # => 'Robert J. Fischer'
27
27
  robert.rname # => 'Fischer, Robert J.' (reversed name)
28
28
 
29
+ The input text, without any changes apart from white-space cleanup, is returned by the _original_ method:
30
+
31
+ robert.original # => 'robert j FISCHER'
32
+
29
33
  To avoid ambiguity when either the first or second names consist of multiple words, it is better to
30
34
  supply the two separately, if known. However, the full name can be supplied alone to the constructor
31
35
  and a guess will be made as to the first and last names.
@@ -61,8 +65,8 @@ Some of the ways last names are canonicalised are illustrated below:
61
65
  == Characters and Encoding
62
66
 
63
67
  The class can only cope with Western European letter characters, including the accented ones in Latin-1.
64
- Its various accessors (_first_, _last_, _name_, _rname_, _to_s_) always return strings encoded in UTF-8,
65
- no matter what the input encoding.
68
+ Its various accessors (_first_, _last_, _name_, _rname_, _to_s_, _original_) always return strings
69
+ encoded in UTF-8, no matter what the input encoding.
66
70
 
67
71
  eric = ICU::Name.new('éric', 'PRIÉ')
68
72
  eric.rname # => "Prié, Éric"
@@ -71,11 +75,13 @@ no matter what the input encoding.
71
75
  eric = ICU::Name.new('éric'.encode("ISO-8859-1"), 'PRIÉ'.force_encoding("ASCII-8BIT"))
72
76
  eric.rname # => "Prié, Éric"
73
77
  eric.rname.encoding.name # => "UTF-8"
78
+ eric.original # => "éric PRIÉ"
79
+ eric.original.encoding.name # => "UTF-8"
74
80
 
75
81
  Currently, all characters outside the Latin-1 range are removed as if they weren't there.
76
82
 
77
- ICU::Name.new('Józef Żabiński').name # "Józef Abiski"
78
- ICU::Name.new('Bǔ Xiángzhì').name # "B. Xiángzhì"
83
+ ICU::Name.new('Józef Żabiński').name # => "Józef Abiski"
84
+ ICU::Name.new('Bǔ Xiángzhì').name # => "B. Xiángzhì"
79
85
 
80
86
  Accented Latin-1 characters can be transliterated into their ASCII counterparts by setting the
81
87
  _ascii_ option to a true value.
@@ -86,6 +92,8 @@ This works with all the other accessors and also with the constructor:
86
92
 
87
93
  eric_ascii = ICU::Name.new('éric', 'PRIÉ', :ascii => true)
88
94
  eric_ascii.name # => "Eric Prie"
95
+ jozef_ascii = ICU::Name.new('Józef', 'Żabiński', :ascii => true)
96
+ jozef_ascii.name # => "Jozef Zabinski"
89
97
 
90
98
  The option also relaxes the need for accented characters to match exactly:
91
99
 
data/lib/icu_name/name.rb CHANGED
@@ -9,11 +9,18 @@ module ICU
9
9
  def initialize(name1='', name2='', opt={})
10
10
  @name1 = Util.to_utf8(name1.to_s)
11
11
  @name2 = Util.to_utf8(name2.to_s)
12
- canonicalize
12
+ originalize
13
13
  if opt[:ascii]
14
- @first = ActiveSupport::Inflector.transliterate(@first)
15
- @last = ActiveSupport::Inflector.transliterate(@last)
14
+ @name1 = ActiveSupport::Inflector.transliterate(@name1)
15
+ @name2 = ActiveSupport::Inflector.transliterate(@name2)
16
16
  end
17
+ canonicalize
18
+ end
19
+
20
+ # Original text getter.
21
+ def original(opts={})
22
+ return ActiveSupport::Inflector.transliterate(@original) if opts[:ascii]
23
+ @original
17
24
  end
18
25
 
19
26
  # First name getter.
@@ -60,6 +67,13 @@ module ICU
60
67
  # :stopdoc:
61
68
  private
62
69
 
70
+ # Save the original inputs without any cleanup other than whitespace.
71
+ def originalize
72
+ @original = "#{@name1} #{@name2}"
73
+ @original.strip!
74
+ @original.gsub!(/\s+/, ' ')
75
+ end
76
+
63
77
  # Canonicalise the first and last names.
64
78
  def canonicalize
65
79
  first, last = partition
@@ -70,7 +84,7 @@ module ICU
70
84
  # Split one complete name into first and last parts.
71
85
  def partition
72
86
  if @name2.length == 0
73
- # Only one imput so we must split first and last.
87
+ # Only one input so we must split it into first and last.
74
88
  parts = @name1.split(/,/)
75
89
  if parts.size > 1
76
90
  last = clean(parts.shift || '')
@@ -78,7 +92,7 @@ module ICU
78
92
  else
79
93
  parts = clean(@name1).split(/ /)
80
94
  last = parts.pop || ''
81
- last = "#{parts.pop}'#{last}" if parts.size > 1 && parts.last == "O" && !last.match(/^O'/)
95
+ last = "#{parts.pop}'#{last}" if parts.size > 1 && parts.last.match(/^O$/i) && !last.match(/^O'/i) # "O", "Reilly" => "O'Reilly"
82
96
  first = parts.join(' ')
83
97
  end
84
98
  else
@@ -114,6 +128,11 @@ module ICU
114
128
  def finish_last(names)
115
129
  names.gsub!(/\b([A-Z\u{c0}-\u{de}]')([a-z\u{e0}-\u{ff}])/) { |m| $1 << $2.mb_chars.upcase.to_s }
116
130
  names.gsub!(/\b(Mc)([a-z\u{e0}-\u{ff}])/) { |m| $1 << $2.mb_chars.upcase.to_s }
131
+ names.gsub!(/\bMac([a-z\u{e0}-\u{ff}])/) do |m|
132
+ letter = $1 # capitalize after "Mac" only if the original clearly indicates it
133
+ upper = letter.mb_chars.upcase.to_s
134
+ 'Mac'.concat(@original.match(/\bMac#{upper}/) ? upper : letter)
135
+ end
117
136
  names.gsub!(/\bO ([A-Z\u{c0}-\u{de}])/) { |m| "O'" << $1 }
118
137
  names
119
138
  end
@@ -2,6 +2,6 @@
2
2
 
3
3
  module ICU
4
4
  class Name
5
- VERSION = "0.0.6"
5
+ VERSION = "0.0.7"
6
6
  end
7
7
  end
data/spec/name_spec.rb CHANGED
@@ -5,7 +5,7 @@ module ICU
5
5
  describe Name do
6
6
  context "public methods" do
7
7
  before(:each) do
8
- @simple = Name.new('mark j l', 'orr')
8
+ @simple = Name.new('mark j l', 'ORR')
9
9
  end
10
10
 
11
11
  it "#first returns the first name(s)" do
@@ -28,6 +28,10 @@ module ICU
28
28
  @simple.to_s.should == 'Orr, Mark J. L.'
29
29
  end
30
30
 
31
+ it "#original returns the original data" do
32
+ @simple.original.should == 'mark j l ORR'
33
+ end
34
+
31
35
  it "#match returns true if and only if two names match" do
32
36
  @simple.match('mark j l orr').should be_true
33
37
  @simple.match('malcolm g l orr').should be_false
@@ -62,18 +66,25 @@ module ICU
62
66
  end
63
67
 
64
68
  it "characters and encoding" do
65
- josef = ICU::Name.new('Józef', 'Żabiński')
69
+ josef = Name.new('Józef', 'Żabiński')
66
70
  josef.name.should == "Józef Abiski"
67
- bu = ICU::Name.new('Bǔ Xiángzhì')
71
+ josef.original.should == "Józef Żabiński"
72
+ josef.original(:ascii => true).should == "Jozef Zabinski"
73
+ josef = Name.new('Józef', 'Żabiński', :ascii => true)
74
+ josef.name.should == "Jozef Zabinski"
75
+ bu = Name.new('Bǔ Xiángzhì')
68
76
  bu.name.should == "B. Xiángzhì"
69
- eric = ICU::Name.new('éric', 'PRIÉ')
77
+ eric = Name.new('éric', 'PRIÉ')
70
78
  eric.rname.should == "Prié, Éric"
71
79
  eric.rname.encoding.name.should == "UTF-8"
72
- eric = ICU::Name.new('éric'.encode("ISO-8859-1"), 'PRIÉ'.force_encoding("ASCII-8BIT"))
80
+ eric = Name.new('éric'.encode("ISO-8859-1"), 'PRIÉ'.force_encoding("ASCII-8BIT"))
73
81
  eric.rname.should == "Prié, Éric"
74
82
  eric.rname.encoding.name.should == "UTF-8"
83
+ eric.original.should == "éric PRIÉ"
84
+ eric.original(:ascii => true).should == "eric PRIE"
85
+ eric.original.encoding.name.should == "UTF-8"
75
86
  eric.name(:ascii => true).should == "Eric Prie"
76
- eric_ascii = ICU::Name.new('éric', 'PRIÉ', :ascii => true)
87
+ eric_ascii = Name.new('éric', 'PRIÉ', :ascii => true)
77
88
  eric_ascii.name.should == "Eric Prie"
78
89
  eric.match('Éric', 'Prié').should be_true
79
90
  eric.match('Eric', 'Prie').should be_false
@@ -104,9 +115,12 @@ module ICU
104
115
  it "should be handled correctly" do
105
116
  Name.new('shane', "mccabe").name.should == "Shane McCabe"
106
117
  Name.new('shawn', "macdonagh").name.should == "Shawn Macdonagh"
118
+ Name.new('Colin', "MacNab").name.should == "Colin MacNab"
119
+ Name.new('colin', "macnab").name.should == "Colin Macnab"
107
120
  Name.new('bartlomiej', "macieja").name.should == "Bartlomiej Macieja"
108
121
  Name.new('türko', "mcözgür").name.should == "Türko McÖzgür"
109
122
  Name.new('TÜRKO', "MACÖZGÜR").name.should == "Türko Macözgür"
123
+ Name.new('Türko', "MacÖzgür").name.should == "Türko MacÖzgür"
110
124
  end
111
125
  end
112
126
 
@@ -171,6 +185,14 @@ module ICU
171
185
  end
172
186
  end
173
187
 
188
+ context "the original input" do
189
+ it "should be the original text unaltered except for white space" do
190
+ Name.new(' Mark j l ', ' ORR ').original.should == 'Mark j l ORR'
191
+ Name.new('Józef', 'Żabiński').original.should == 'Józef Żabiński'
192
+ Name.new('Ui Laigleis,Gearoidin').original.should == 'Ui Laigleis,Gearoidin'
193
+ end
194
+ end
195
+
174
196
  context "encoding" do
175
197
  before(:each) do
176
198
  @first = 'Gearóidín'
metadata CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
5
5
  segments:
6
6
  - 0
7
7
  - 0
8
- - 6
9
- version: 0.0.6
8
+ - 7
9
+ version: 0.0.7
10
10
  platform: ruby
11
11
  authors:
12
12
  - Mark Orr
@@ -14,7 +14,7 @@ autorequire:
14
14
  bindir: bin
15
15
  cert_chain: []
16
16
 
17
- date: 2011-01-23 00:00:00 +00:00
17
+ date: 2011-01-24 00:00:00 +00:00
18
18
  default_executable:
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency