RubyGems - icu_name - Versions diffs - 0.0.4 → 0.0.5 - Mend

icu_name 0.0.4 → 0.0.5

Files changed (5) hide show

data/README.rdoc CHANGED Viewed

@@ -8,6 +8,8 @@ For ruby 1.9.2 and above.
   gem install icu_name
+It depends on active_support and i18n.
 == Names
 This class exists for two main purposes:
@@ -55,8 +57,42 @@ Some of the ways last names are canonicalised are illustrated below:
   ICU::Name.new('John', 'O Reilly').last         # => "O'Reilly"
   ICU::Name.new('dave', 'mcmanus').last          # => "McManus"
-  ICU::Name.new('pete', 'MACMANUS').last         # => "MacManus"
+== Characters and Encoding
+The class can only cope with Western European letter characters, including the accented ones in Latin-1.
+Its various accessors (_first_, _last_, _name_, _rname_, _to_s_) always return strings encoded in UTF-8,
+no matter what the input encoding.
+  eric = ICU::Name.new('éric', 'PRIÉ')
+  eric.rname                                     # => "Prié, Éric"
+  eric.rname.encoding.name                       # => "UTF-8"
+  eric = ICU::Name.new('éric'.encode("ISO-8859-1"), 'PRIÉ'.force_encoding("ASCII-8BIT"))
+  eric.rname                                     # => "Prié, Éric"
+  eric.rname.encoding.name                       # => "UTF-8"
+Currently, all characters outside the Latin-1 range are removed as if they weren't there.
+  ICU::Name.new('Józef Żabiński').name           # => "Józef Abiski"
+  ICU::Name.new('Bǔ Xiángzhì').name              # => "B. Xiángzhì"
+Accented Latin-1 characters can be transliterated into their ASCII counterparts by setting the
+_ascii_ option to a true value.
+  eric.name(:ascii => true)                      # => "Eric Prie"
+This works with all the other accessors and also with the constructor:
+  eric_ascii = ICU::Name.new('éric', 'PRIÉ', :ascii => true)
+  eric_ascii.name                                # => "Eric Prie"
+The option also relaxes the need for accented characters to match exactly:
+  eric.match('Éric', 'Prié')                     # => true
+  eric.match('Eric', 'Prie')                     # => false
+  eric.match('Eric', 'Prie', :ascii => true)     # => true
 == Author
 Mark Orr, rating officer for the Irish Chess Union (ICU[http://icu.ie]).

data/lib/icu_name/name.rb CHANGED Viewed

@@ -1,41 +1,60 @@
+require 'active_support'
+require 'active_support/inflector/transliterate'
+require 'active_support/core_ext/string/multibyte'
 module ICU
   class Name
-    attr_reader :first, :last
     # Construct from one or two strings or any objects that have a to_s method.
-    def initialize(name1='', name2='')
-      @name1 = name1.to_s.dup
-      @name2 = name2.to_s.dup
+    def initialize(name1='', name2='', opt={})
+      @name1 = Util.to_utf8(name1.to_s)
+      @name2 = Util.to_utf8(name2.to_s)
       canonicalize
+      if opt[:ascii]
+        @first = ActiveSupport::Inflector.transliterate(@first)
+        @last  = ActiveSupport::Inflector.transliterate(@last)
+      end
+    end
+    # First name getter.
+    def first(opt={})
+      return ActiveSupport::Inflector.transliterate(@first) if opt[:ascii]
+      @first
+    end
+    # Last name getter.
+    def last(opt={})
+      return ActiveSupport::Inflector.transliterate(@last) if opt[:ascii]
+      @last
     end
     # Return a complete name, first name first, no comma.
-    def name
+    def name(opts={})
       name = ''
-      name << @first
+      name << first(opts)
       name << ' ' if @first.length > 0 && @last.length > 0
-      name << @last
+      name << last(opts)
       name
     end
     # Return a reversed complete name, first name last after a comma.
-    def rname
+    def rname(opts={})
       name = ''
-      name << @last
+      name << last(opts)
       name << ', ' if @first.length > 0 && @last.length > 0
-      name << @first
+      name << first(opts)
       name
     end
     # Convert object to a string.
-    def to_s
-      rname
+    def to_s(opts={})
+      rname(opts)
     end
     # Match another name to this object, returning true or false.
-    def match(name1='', name2='')
-      other = Name.new(name1, name2)
-      match_first(first, other.first) && match_last(last, other.last)
+    def match(name1='', name2='', opts={})
+      other = Name.new(name1, name2, opts)
+      match_first(first(opts), other.first) && match_last(last(opts), other.last)
     end
     private
@@ -58,6 +77,7 @@ module ICU
         else
           parts = clean(@name1).split(/ /)
           last  = parts.pop || ''
+          last  = "#{parts.pop}'#{last}" if parts.size > 1 && parts.last == "O" && !last.match(/^O'/)
           first = parts.join(' ')
         end
       else
@@ -68,36 +88,32 @@ module ICU
       [first, last]
     end
-    # Clean up characters in any name.
+    # Clean up characters in any name keeping only letters (including accented), hyphens, and single quotes.
     def clean(name)
       name.gsub!(/`/, "'")
-      name.gsub!(/[^-a-zA-Z.'\s]/, '')
+      name.gsub!(/[^-a-zA-Z\u{c0}-\u{d6}\u{d8}-\u{f6}\u{f8}-\u{ff}.'\s]/, '')
       name.gsub!(/\./, ' ')
       name.gsub!(/\s*-\s*/, '-')
       name.gsub!(/'+/, "'")
-      name.strip.downcase.split(/\s+/).map do |n|
+      name.strip.mb_chars.downcase.split(/\s+/).map do |n|
         n.sub!(/^-+/, '')
         n.sub!(/-+$/, '')
         n.split(/-/).map do |p|
           p.capitalize!
         end.join('-')
-      end.join(' ')
+      end.join(' ').to_s
     end
-    # Apply final touches to finish canonicalising a first name.
+    # Apply final touches to finish canonicalising a first name mb_chars object, returning a normal string.
     def finish_first(names)
-      names.gsub(/([A-Z])\b/, '\1.')
+      names.gsub(/([A-Z\u{c0}-\u{de}])\b/, '\1.')
     end
-    # Apply final touches to finish canonicalising a last name.
+    # Apply final touches to finish canonicalising a last name mb_chars object, returning a normal string.
     def finish_last(names)
-      names.gsub!(/\b([A-Z])'([a-z])/) { |m| $1 << "'" << $2.upcase}
-      names.gsub!(/\bMc([a-z])/) { |m| 'Mc' << $1.upcase}
-      names.gsub!(/\bMac([a-z])/) do |m|
-        letter = $1
-        'Mac'.concat(@name2.match("[mM][aA][cC]#{letter}") ? letter : letter.upcase)
-      end
-      names.gsub!(/\bO ([A-Z])/) { |m| "O'" << $1 }
+      names.gsub!(/\b([A-Z\u{c0}-\u{de}]')([a-z\u{e0}-\u{ff}])/) { |m| $1 << $2.mb_chars.upcase.to_s }
+      names.gsub!(/\b(Mc)([a-z\u{e0}-\u{ff}])/) { |m| $1 << $2.mb_chars.upcase.to_s }
+      names.gsub!(/\bO ([A-Z\u{c0}-\u{de}])/) { |m| "O'" << $1 }
       names
     end
@@ -134,9 +150,9 @@ module ICU
     def match_last(last1, last2)
       return true if last1 == last2
       [last1, last2].each do |last|
-        last.downcase!             # MacDonaugh and Macdonaugh
-        last.gsub!(/\bmac/, 'mc')  # MacDonaugh and McDonaugh
-        last.tr!('-', ' ')         # Lowry-O'Reilly and Lowry O'Reilly
+        last.downcase!            # case insensitive
+        last.gsub!(/\bmac/, 'mc') # MacDonaugh and McDonaugh
+        last.tr!('-', ' ')        # Lowry-O'Reilly and Lowry O'Reilly
       end
       last1 == last2
     end
@@ -156,8 +172,8 @@ module ICU
     #  2 = match involving 2 initials
     def match_first_name(first1, first2)
       initials = 0
-      initials+= 1 if first1.match(/^[A-Z]\.?$/)
-      initials+= 1 if first2.match(/^[A-Z]\.?$/)
+      initials+= 1 if first1.match(/^[A-Z\u{c0}-\u{de}]\.?$/)
+      initials+= 1 if first2.match(/^[A-Z\u{c0}-\u{de}]\.?$/)
       return initials if first1 == first2
       return 0 if initials == 0 && match_nick_name(first1, first2)
       return -1 unless initials > 0

data/lib/icu_name/version.rb CHANGED Viewed

@@ -2,6 +2,6 @@
 module ICU
   class Name
-    VERSION = "0.0.4"
+    VERSION = "0.0.5"
   end
 end

data/spec/name_spec.rb CHANGED Viewed

@@ -58,7 +58,26 @@ module ICU
       it "should canconicalise last names" do
         Name.new('John', 'O Reilly').last.should == "O'Reilly"
         Name.new('dave', 'mcmanus').last.should == "McManus"
-        Name.new('pete', 'MACMANUS').last.should == "MacManus"
+        Name.new('pete', 'MACMANUS').last.should == "Macmanus"
+      end
+      it "characters and encoding" do
+        josef = ICU::Name.new('Józef', 'Żabiński')
+        josef.name.should == "Józef Abiski"
+        bu = ICU::Name.new('Bǔ Xiángzhì')
+        bu.name.should == "B. Xiángzhì"
+        eric = ICU::Name.new('éric', 'PRIÉ')
+        eric.rname.should == "Prié, Éric"
+        eric.rname.encoding.name.should == "UTF-8"
+        eric = ICU::Name.new('éric'.encode("ISO-8859-1"), 'PRIÉ'.force_encoding("ASCII-8BIT"))
+        eric.rname.should == "Prié, Éric"
+        eric.rname.encoding.name.should == "UTF-8"
+        eric.name(:ascii => true).should == "Eric Prie"
+        eric_ascii = ICU::Name.new('éric', 'PRIÉ', :ascii => true)
+        eric_ascii.name.should == "Eric Prie"
+        eric.match('Éric', 'Prié').should be_true
+        eric.match('Eric', 'Prie').should be_false
+        eric.match('Eric', 'Prie', :ascii => true).should be_true
       end
     end
@@ -66,24 +85,35 @@ module ICU
       it "should not be altered" do
         Name.new('Mark J. L.', 'Orr').name.should == 'Mark J. L. Orr'
         Name.new('Anna-Marie J.-K.', 'Liviu-Dieter').name.should == 'Anna-Marie J.-K. Liviu-Dieter'
+        Name.new('Èric Cantona').name.should == 'Èric Cantona'
       end
     end
-    context "last names beginning with a single letter followed by a quote" do
+    context "last names involving a quote" do
       it "should be handled correctly" do
         Name.new('una', "O'boyle").name.should == "Una O'Boyle"
         Name.new('jonathan', 'd`arcy').name.should == "Jonathan D'Arcy"
         Name.new('erwin e', "L'AMI").name.should == "Erwin E. L'Ami"
         Name.new('cormac', "o brien").name.should == "Cormac O'Brien"
+        Name.new('türko', "o özgür").name.should == "Türko O'Özgür"
+        Name.new('türko', "l`özgür").name.should == "Türko L'Özgür"
       end
     end
-    context "last beginning with Mc" do
+    context "last beginning with Mc or Mac" do
       it "should be handled correctly" do
         Name.new('shane', "mccabe").name.should == "Shane McCabe"
-        Name.new('shawn', "macDonagh").name.should == "Shawn MacDonagh"
         Name.new('shawn', "macdonagh").name.should == "Shawn Macdonagh"
         Name.new('bartlomiej', "macieja").name.should == "Bartlomiej Macieja"
+        Name.new('türko', "mcözgür").name.should == "Türko McÖzgür"
+        Name.new('TÜRKO', "MACÖZGÜR").name.should == "Türko Macözgür"
+      end
+    end
+    context "first name initials" do
+      it "should be handled correctly" do
+        Name.new('m j l', 'Orr').first.should == 'M. J. L.'
+        Name.new('Ö. é m', 'Panno').first.should == "Ö. É. M."
       end
     end
@@ -94,6 +124,22 @@ module ICU
         Name.new("mark j. - l", 'ORR').name.should == 'Mark J.-L. Orr'
         Name.new('JOHANNA', "lowry-o'REILLY").name.should == "Johanna Lowry-O'Reilly"
         Name.new('hannah', "lowry - o reilly").name.should == "Hannah Lowry-O'Reilly"
+        Name.new('hannah', "lowry - o reilly").name.should == "Hannah Lowry-O'Reilly"
+        Name.new('ètienne', "gèrard - mcözgür").name.should == "Ètienne Gèrard-McÖzgür"
+      end
+    end
+    context "accented characters and capitalisation" do
+      it "should downcase upper case accented characters where appropriate" do
+        name = Name.new('GEARÓIDÍN', 'UÍ LAIGHLÉIS')
+        name.first.should == 'Gearóidín'
+        name.last.should == 'Uí Laighléis'
+      end
+      it "should upcase upper case accented characters where appropriate" do
+        name = Name.new('èric özgür')
+        name.first.should == 'Èric'
+        name.last.should == 'Özgür'
       end
     end
@@ -110,18 +156,12 @@ module ICU
     end
     context "construction from a single string" do
-      before(:each) do
-        @mark1 = Name.new('ORR, mark j l')
-        @mark2 = Name.new('MARK J L ORR')
-        @oreil = Name.new("O'Reilly, j-k")
-      end
       it "should be possible in simple cases" do
-        @mark1.first.should == 'Mark J. L.'
-        @mark1.last.should == 'Orr'
-        @mark2.first.should == 'Mark J. L.'
-        @mark2.last.should == 'Orr'
-        @oreil.name.should == "J.-K. O'Reilly"
+        Name.new('ORR, mark j l').rname.should == 'Orr, Mark J. L.'
+        Name.new('MARK J L ORR').rname.should == 'Orr, Mark J. L.'
+        Name.new("j-k O'Reilly").rname.should == "O'Reilly, J.-K."
+        Name.new("j-k O Reilly").rname.should == "O'Reilly, J.-K."
+        Name.new('ètienne o o özgür').name.should == "Ètienne O. O'Özgür"
       end
     end
@@ -131,10 +171,94 @@ module ICU
       end
     end
+    context "encoding" do
+      before(:each) do
+        @first = 'Gearóidín'
+        @last  = 'Uí Laighléis'
+      end
+      it "should handle UTF-8" do
+        name = Name.new(@first, @last)
+        name.first.should == @first
+        name.last.should == @last
+        name.first.encoding.name.should == "UTF-8"
+        name.last.encoding.name.should == "UTF-8"
+      end
+      it "should handle ISO-8859-1" do
+        name = Name.new(@first.encode("ISO-8859-1"), @last.encode("ISO-8859-1"))
+        name.first.should == @first
+        name.last.should == @last
+        name.first.encoding.name.should == "UTF-8"
+        name.last.encoding.name.should == "UTF-8"
+      end
+      it "should handle Windows-1252" do
+        name = Name.new(@first.encode("Windows-1252"), @last.encode("Windows-1252"))
+        name.first.should == @first
+        name.last.should == @last
+        name.first.encoding.name.should == "UTF-8"
+        name.last.encoding.name.should == "UTF-8"
+      end
+      it "should handle ASCII-8BIT" do
+        name = Name.new(@first.dup.force_encoding('ASCII-8BIT'), @last.dup.force_encoding('ASCII-8BIT'))
+        name.first.should == @first
+        name.last.should == @last
+        name.first.encoding.name.should == "UTF-8"
+        name.last.encoding.name.should == "UTF-8"
+      end
+      it "should handle US-ASCII" do
+        @first = 'Gearoidin'
+        @last  = 'Ui Laighleis'
+        name = Name.new(@first.encode("US-ASCII"), @last.encode("US-ASCII"))
+        name.first.should == @first
+        name.last.should == @last
+        name.first.encoding.name.should == "UTF-8"
+        name.last.encoding.name.should == "UTF-8"
+      end
+    end
+    context "transliteration" do
+      before(:all) do
+        @opt = { :ascii => true }
+      end
+      it "should be a no-op for names that already ASCII" do
+        name = Name.new('Mark J. L.', 'Orr')
+        name.first(@opt).should == 'Mark J. L.'
+        name.last(@opt).should == 'Orr'
+        name.name(@opt).should == 'Mark J. L. Orr'
+        name.rname(@opt).should == 'Orr, Mark J. L.'
+        name.to_s(@opt).should == 'Orr, Mark J. L.'
+      end
+      it "should remove the accents from accented characters" do
+        name = Name.new('Gearóidín', 'Uí Laighléis')
+        name.first(@opt).should == 'Gearoidin'
+        name.last(@opt).should == 'Ui Laighleis'
+        name.name(@opt).should == 'Gearoidin Ui Laighleis'
+        name.rname(@opt).should == 'Ui Laighleis, Gearoidin'
+        name.to_s(@opt).should == 'Ui Laighleis, Gearoidin'
+        name = Name.new('èric PRIÉ')
+        name.first(@opt).should == 'Eric'
+        name.last(@opt).should == 'Prie'
+      end
+      it "should work for the constructor as well as accessors" do
+        name = Name.new('Gearóidín', 'Uí Laighléis', @opt)
+        name.first.should == 'Gearoidin'
+        name.last.should == 'Ui Laighleis'
+      end
+    end
     context "constuction corner cases" do
       it "should be handled correctly" do
         Name.new('Orr').name.should == 'Orr'
         Name.new('Orr').rname.should == 'Orr'
+        Name.new('Uí Laighléis').rname.should == 'Laighléis, Uí'
+        Name.new('', 'Uí Laighléis', :ascii => true).last.should == 'Ui Laighleis'
         Name.new('').name.should == ''
         Name.new('').rname.should == ''
         Name.new.name.should == ''
@@ -164,6 +288,7 @@ module ICU
       it "should be flexible with regards to hyphens in double barrelled names" do
         Name.new('J.-K.', 'Rowling').match('J. K.', 'Rowling').should be_true
         Name.new('Joanne-K.', 'Rowling').match('Joanne K.', 'Rowling').should be_true
+        Name.new('Èric-K.', 'Cantona').match('Èric K.', 'Cantona').should be_true
       end
       it "should match initials" do
@@ -172,6 +297,8 @@ module ICU
         Name.new('M. J. L.', 'Orr').match('Mark', 'Orr').should be_true
         Name.new('M.', 'Orr').match('M. J.', 'Orr').should be_true
         Name.new('M. J. L.', 'Orr').match('M. G.', 'Orr').should be_false
+        Name.new('È', 'Cantona').match('Èric K.', 'Cantona').should be_true
+        Name.new('E. K.', 'Cantona').match('Èric K.', 'Cantona').should be_false
       end
       it "should not match on full names not in first position or without an exact match" do
@@ -206,15 +333,20 @@ module ICU
       end
     end
-    context "accented characters" do
-      before(:each) do
-        @first = 'Gearóidín'
-        @last  = 'Uí Laighléis'
+    context "matches involving accented characters" do
+      it "should work for identical names" do
+        Name.new('Gearóidín', 'Uí Laighléis').match('Gearóidín', 'Uí Laighléis').should be_true
+        Name.new('Gearóidín', 'Uí Laighléis').match('Gearoidin', 'Ui Laighleis').should be_false
       end
-      it "should not yet deal with UTF-8" do
-        name = Name.new(@first, @last)
-        name.first.should_not == @first
+      it "should work for first name initials" do
+        Name.new('Èric-K.', 'Cantona').match('È. K.', 'Cantona').should be_true
+        Name.new('Èric-K.', 'Cantona').match('E. K.', 'Cantona').should be_false
+      end
+      it "the matching of accented characters can be relaxed" do
+        Name.new('Gearóidín', 'Uí Laighléis').match('Gearoidin', 'Ui Laíghleis', :ascii => true).should be_true
+        Name.new('Èric-K.', 'Cantona').match('E. K.', 'Cantona', :ascii => true).should be_true
       end
     end
   end

metadata CHANGED Viewed

@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
   segments:
   - 0
   - 0
-  - 4
-  version: 0.0.4
+  - 5
+  version: 0.0.5
 platform: ruby
 authors:
 - Mark Orr
@@ -14,13 +14,43 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2011-01-21 00:00:00 +00:00
+date: 2011-01-23 00:00:00 +00:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
-  name: bundler
+  name: activesupport
   prerelease: false
   requirement: &id001 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        segments:
+        - 3
+        - 0
+        - 3
+        version: 3.0.3
+  type: :runtime
+  version_requirements: *id001
+- !ruby/object:Gem::Dependency
+  name: i18n
+  prerelease: false
+  requirement: &id002 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        segments:
+        - 0
+        - 5
+        - 0
+        version: 0.5.0
+  type: :runtime
+  version_requirements: *id002
+- !ruby/object:Gem::Dependency
+  name: bundler
+  prerelease: false
+  requirement: &id003 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ">="
@@ -31,11 +61,11 @@ dependencies:
         - 7
         version: 1.0.7
   type: :development
-  version_requirements: *id001
+  version_requirements: *id003
 - !ruby/object:Gem::Dependency
   name: rspec
   prerelease: false
-  requirement: &id002 !ruby/object:Gem::Requirement
+  requirement: &id004 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ">="
@@ -44,7 +74,7 @@ dependencies:
         - 0
         version: "0"
   type: :development
-  version_requirements: *id002
+  version_requirements: *id004
 description: Canonicalises and matches person names with Latin1 characters and first and last names
 email: mark.j.l.orr@googlemail.com
 executables: []