RubyGems - fullname-parser - Versions diffs - 1.0.5 → 1.1.0 - Mend

fullname-parser 1.0.5 → 1.1.0

Files changed (4) hide show

checksums.yaml +7 -0
data/lib/fullname/parser.rb +161 -78
data/lib/fullname/parser/version.rb +1 -1
metadata +7 -10

checksums.yaml ADDED Viewed

@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: 8701ad30b034d522d4116053672296dad64fe2b8
+  data.tar.gz: 00b0315fbefe4988a2aadde0c14a0f128baf7271
+SHA512:
+  metadata.gz: f3498c76c53588b73363fa76652fd1d268282e8742116931793330eab1cc9becbbc8b5f65fe20647be02e0b5755d7c729e6cf4f309b436eda1d82adba56d5e74
+  data.tar.gz: 4472a7dc0e50d9129c0ab96b7b0006b588f1f5db1c96dccf79f21d34870ad09f2a498c1481311e94ce69e9c75c13a36019b69a632c34017f1d840f6dc250973d

data/lib/fullname/parser.rb CHANGED Viewed

@@ -101,91 +101,174 @@ module Fullname
       '9th' => 'IX',
     } unless const_defined?(:CONVERSION)
-    def parse_fullname(name)
-      first_name  = nil
-      middle_name = nil
-      last_name   = nil
-      prefix      = nil
-      suffix      = nil
-      # replace "’" with "'"
-      name = name.gsub(/’/, "'")
-      # remove parenthesized text, including the parentheses themselves
-      # ex. 'Susan M. (Scully) Schultz'  =>  'Susan M. Schultz'
-      #     'Jay (Jung) Heum Kim'        =>  'Jay Heum Kim'
-      name = name.gsub(/\(.*?\)/, ' ').gsub(/\(|\)/, '')
-      # remove quoted strings
-      # Darin "Derry" Ronald Anderson    => 'Darin Ronald Anderson'
-      # Nancy M. "Shelli" Egger          => 'Nancy M. Egger'
-      # Nicole 'nikki' Adame             => 'Nicole Adame'
-      name = name.gsub(/".*?"/, ' ').gsub(/'.*?'/i, ' ')
-      # remove curly brackets
-      # Henry C.{Harry} Wilson           => 'Henry C. Wilson'
-      # Cellestine {Steen} Armstrong     => 'Cellestine Armstrong'
-      name = name.gsub(/\{.*?\}/, ' ')
-      # remove stray tokens that contain no letters (e.g. a lone ".")
-      # ex. "William . D. 'Bill' Beard"  =>  "William D. 'Bill' Beard"
-      # any whitespace-delimited run of non-letter characters collapses to one space
-      name = name.gsub(/\s+[^a-zA-Z]+\s+/, ' ')
-      # Why we use substitute(sub) comma to whitespace, not global substitute(gsub).
-      # the reason is the substitution applies for suffix splitting, not for replacing
-      # bad data. As we want, convert "Marvene A Gordon, JD" to "Marvene A Gordon JD",
-      # so that the suffix will get into the split array.
-      # and, standardize suffix as '2nd' => 'II', '3rd' => 'III'
-      nameSplit   = name.gsub(',', ' ').strip.split(/\s+/).map{ |n| CONVERSION[n.downcase] || n }
-      return { :last=>name } if nameSplit.length <= 1
-      suffix_arr  = []
-      while (nameSplit.length > 1)
-        if IGNORABLE_SUFFIXES.include?(nameSplit.last.downcase)
-          suffix_arr.unshift([nameSplit.pop, false])
-        elsif SUFFIX_LIST.include?(nameSplit.last.downcase) || GLOBAL_SUFFIX_LIST.include?(nameSplit.last.downcase)
-          suffix_arr.unshift([nameSplit.pop, true])
-        else
-          break
+    class Error < StandardError; end
+    class Identifier
+      attr_reader :name, :original_name, :prefix, :firstname, :middlename, :lastname, :suffix
+      def initialize(name)
+        @original_name = name.dup
+        @name = name.dup
+        @prefix_list = []
+        @suffix_list = []
+        sanitize!
+        flip_parts!
+        breakup!
+      end
+      private
+      def sanitize!
+        # replace "’" with "'"
+        name.gsub!(/’/, "'")
+        # remove parenthesized text, including the parentheses themselves
+        # ex. 'Susan M. (Scully) Schultz'  =>  'Susan M. Schultz'
+        #     'Jay (Jung) Heum Kim'        =>  'Jay Heum Kim'
+        name.gsub!(/\(.*?\)/, ' ')
+        name.gsub!(/\(|\)/, '')
+        # remove quoted strings
+        # Darin "Derry" Ronald Anderson    => 'Darin Ronald Anderson'
+        # Nancy M. "Shelli" Egger          => 'Nancy M. Egger'
+        # Nicole 'nikki' Adame             => 'Nicole Adame'
+        name.gsub!(/".*?"/, ' ')
+        name.gsub!(/'.*?'/i, ' ')
+        # remove curly brackets
+        # Henry C.{Harry} Wilson           => 'Henry C. Wilson'
+        # Cellestine {Steen} Armstrong     => 'Cellestine Armstrong'
+        name.gsub!(/\{.*?\}/, ' ')
+        # remove stray tokens that contain no letters (e.g. a lone ".")
+        # ex. "William . D. 'Bill' Beard"  =>  "William D. 'Bill' Beard"
+        # any whitespace-delimited run of non-letter characters collapses to one space
+        name.gsub!(/\s+[^a-zA-Z]+\s+/, ' ')
+        # Commas are deliberately left in place here: extract_suffix and flip_parts!
+        # rely on them later to split off trailing suffixes and to reorder
+        # comma-separated parts (e.g. the ", JD" in "Marvene A Gordon, JD" is examined
+        # by extract_suffix rather than being blanked out at this point).
+        # Here we only standardize suffixes ('2nd' => 'II', '3rd' => 'III'), matching whole words case-insensitively.
+        CONVERSION.each_pair do |finder, replacer|
+          name.gsub!(Regexp.new("\\b#{Regexp.escape(finder)}\\b", true), replacer)
         end
       end
-      # Loop around until we run into a name that is not contained in the PREFIX_LIST
-      # ex(FL): 'Lt Col Marvene A Gordon', 'The Honorable Dexter F George'
-      prefix_arr      = []
-      while (nameSplit.length > 1)
-        if IGNORABLE_PREFIXS.include?(nameSplit.first.downcase)
-          nameSplit.shift
-        elsif PREFIX_LIST.include?(nameSplit.first.downcase)
-          prefix_arr.push(nameSplit.shift)
-        else
-          break
+      def extract_suffix(str)
+        list = []
+        loop do
+          m = /(.*)[, ](.+)/.match(str)
+          break unless m
+          remaining = m[1]
+          last_part = m[2].strip
+          last_part_downcase = last_part.downcase
+          if IGNORABLE_SUFFIXES.include?(last_part_downcase)
+            list.unshift([last_part, false])
+          elsif SUFFIX_LIST.include?(last_part_downcase) || GLOBAL_SUFFIX_LIST.include?(last_part_downcase)
+            list.unshift([last_part, true])
+          else
+            break
+          end
+          str = remaining.gsub(/[, ]+$/, '').strip
         end
+        [str, list]
       end
-      prefix = prefix_arr.join(' ') if prefix_arr.size > 0
-      # Loop around until we run into a name that is not contained in the LAST_NAME_EXTENSIONS
-      last_name_arr  = []
-      last_name_arr.push(nameSplit.pop)
-      last_name_arr.push(nameSplit.pop) while nameSplit.length > 1 && LAST_NAME_EXTENSIONS.include?(nameSplit.last.downcase)
-      last_name = last_name_arr.reverse.join(' ') if last_name_arr.size > 0
-      first_name  = nameSplit.shift     if nameSplit.length >= 1
-      middle_name = nameSplit.join(' ') if nameSplit.length > 0
-      if first_name.nil? && prefix
-        first_name = prefix
-        prefix     = nil
+      def extract_prefix(str)
+        list = []
+        loop do
+          m = /(.+?)[, ](.+)/.match(str)
+          break unless m
+          remining = m[2]
+          first_part = m[1]
+          first_part_downcase = first_part.downcase
+          if IGNORABLE_PREFIXS.include?(first_part_downcase)
+            # ignorable prefix: drop it without recording it and keep scanning
+          elsif PREFIX_LIST.include?(first_part_downcase)
+            list.push(first_part)
+          else
+            break
+          end
+          str = remining.gsub(/^[, ]+/, '').strip
+        end
+        [str, list]
       end
-      if first_name.nil? && suffix_arr.any? && SUFFIX_CAN_BE_LASTNAME.include?(suffix_arr.first.first.downcase)
-        first_name = last_name
-        last_name = suffix_arr.shift.first
+      def extract_suffix_before_flipping_parts
+        remaining, list = extract_suffix(name)
+        @name = remaining
+        @suffix_list += list
       end
-      if last_name =~ /^[A-Z]\.?$/i && suffix_arr.any? && SUFFIX_CAN_BE_LASTNAME.include?(suffix_arr.first.first.downcase)
-        middle_name = [middle_name, last_name].compact.join(' ')
-        last_name = suffix_arr.shift.first
+      def flip_parts!
+        extract_suffix_before_flipping_parts
+        parts = name.split(/,/)
+        case parts.size
+        when 1
+        when 2
+          remining, list = extract_suffix(parts[0])
+          @name = [parts[1], remining].join(' ').strip.gsub(/ +/, ' ')
+          @suffix_list += list
+        when 3
+          remining, list = extract_suffix(parts[0..1].join(' '))
+          @name = [parts[2], remining].join(' ').strip.gsub(/ +/, ' ')
+          @suffix_list += list
+        else
+          fail Error.new("name [ #{name} ] has >2 commas, don't know how to parse")
+        end
+        extract_prefix_after_flipping_parts
       end
-      suffix_arr.delete_if{|a, b| !b}
-      suffix = suffix_arr.size == 0 ? nil : suffix_arr.first.first # only return first suffix
-      return { :last => last_name, :middle => middle_name, :first => first_name, :prefix => prefix, :suffix => suffix }
+      def extract_prefix_after_flipping_parts
+        remaining, list = extract_prefix(name)
+        @name = remaining
+        @prefix_list += list
+      end
+      def breakup!
+        parts = name.split(/[, ]+/)
+        # process prefix
+        @prefix = @prefix_list.join(' ') if @prefix_list.any?
+        # process lastname
+        # Loop around until we run into a name that is not contained in the LAST_NAME_EXTENSIONS
+        last_name_arr  = []
+        last_name_arr.push(parts.pop)
+        last_name_arr.push(parts.pop) while parts.length > 1 && LAST_NAME_EXTENSIONS.include?(parts.last.downcase)
+        @lastname = last_name_arr.reverse.join(' ')
+        # process firstname and middlename
+        @firstname  = parts.shift if parts.length >= 1
+        @middlename = parts.join(' ') if parts.length > 0
+        if firstname.nil? && prefix
+          @firstname = prefix
+          @prefix    = nil
+        end
+        # move lastname to firstname, move first suffix to lastname
+        if firstname.nil? && @suffix_list.any? && SUFFIX_CAN_BE_LASTNAME.include?(@suffix_list.first.first.downcase)
+          @firstname = lastname
+          @lastname = @suffix_list.shift.first
+        end
+        # move lastname to middlename, move first suffix to lastname
+        if lastname =~ /^[A-Z]\.?$/i && @suffix_list.any? && SUFFIX_CAN_BE_LASTNAME.include?(@suffix_list.first.first.downcase)
+          @middlename = [middlename, lastname].compact.join(' ')
+          @lastname = @suffix_list.shift.first
+        end
+        # process suffix
+        @suffix_list.delete_if { |_, ignore_able| !ignore_able }
+        @suffix = @suffix_list.any? ? @suffix_list.first.first : nil
+      end
+    end
+    def parse_fullname(name)
+      i = Identifier.new(name)
+      return {
+        prefix: i.prefix,
+        first: i.firstname,
+        middle: i.middlename,
+        last: i.lastname,
+        suffix: i.suffix
+      }
     end # << parse_fullname
     extend self
   end

data/lib/fullname/parser/version.rb CHANGED Viewed

@@ -1,6 +1,6 @@
 module Fullname
   module Parser
-    VERSION = '1.0.5'
+    VERSION = '1.1.0'
   end
 end

metadata CHANGED Viewed

@@ -1,15 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: fullname-parser
 version: !ruby/object:Gem::Version
-  version: 1.0.5
-  prerelease:
+  version: 1.1.0
 platform: ruby
 authors:
 - xiaohui
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-06-24 00:00:00.000000000 Z
+date: 2016-02-16 00:00:00.000000000 Z
 dependencies: []
 description: For parsing people's fullname into pieces(prefix/first/middle/last/suffix)
 email:
@@ -25,27 +24,25 @@ files:
 - lib/fullname/parser/version.rb
 homepage: http://github.com/xiaohui-zhangxh/fullname-parser
 licenses: []
+metadata: {}
 post_install_message:
 rdoc_options: []
 require_paths:
 - lib
 required_ruby_version: !ruby/object:Gem::Requirement
-  none: false
   requirements:
-  - - ! '>='
+  - - '>='
     - !ruby/object:Gem::Version
       version: '0'
 required_rubygems_version: !ruby/object:Gem::Requirement
-  none: false
   requirements:
-  - - ! '>='
+  - - '>='
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 1.8.25
+rubygems_version: 2.4.8
 signing_key:
-specification_version: 3
+specification_version: 4
 summary: Split fullname into pieces(prefix/first/middle/last/suffix)
 test_files: []
-has_rdoc: