RubyGems - fullname-parser - Versions diffs - 1.0.5 → 1.1.0 - Mend

fullname-parser 1.0.5 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (4)

checksums.yaml +7 -0
data/lib/fullname/parser.rb +161 -78
data/lib/fullname/parser/version.rb +1 -1
metadata +7 -10

checksums.yaml ADDED Viewed

@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: 8701ad30b034d522d4116053672296dad64fe2b8
+  data.tar.gz: 00b0315fbefe4988a2aadde0c14a0f128baf7271
+SHA512:
+  metadata.gz: f3498c76c53588b73363fa76652fd1d268282e8742116931793330eab1cc9becbbc8b5f65fe20647be02e0b5755d7c729e6cf4f309b436eda1d82adba56d5e74
+  data.tar.gz: 4472a7dc0e50d9129c0ab96b7b0006b588f1f5db1c96dccf79f21d34870ad09f2a498c1481311e94ce69e9c75c13a36019b69a632c34017f1d840f6dc250973d

data/lib/fullname/parser.rb CHANGED Viewed

@@ -101,91 +101,174 @@ module Fullname
       '9th' => 'IX',
     } unless const_defined?(:CONVERSION)
-    def parse_fullname(name)
-      first_name  = nil
-      middle_name = nil
-      last_name   = nil
-      prefix      = nil
-      suffix      = nil
-      # replace "’" to "'"
-      name = name.gsub(/’/, "'")
-      # remove strings which contain and include in parentheses
-      # ex. 'Susan M. (Scully) Schultz'  =>  'Susan M. Schultz'
-      #     'Jay (Jung) Heum Kim'        =>  'Jay Heum Kim'
-      name = name.gsub(/\(.*?\)/, ' ').gsub(/\(|\)/, '')
-      # remove quoted strings
-      # Darin "Derry" Ronald Anderson    => 'Darin Ronald Anderson'
-      # Nancy M. "Shelli" Egger          => 'Nancy M. Egger'
-      # Nicole 'nikki' Adame             => 'Nicole Adame'
-      name = name.gsub(/".*?"/, ' ').gsub(/'.*?'/i, ' ')
-      # remove curly brackets
-      # Henry C.{Harry} Wilson           => 'Henry C. Wilson'
-      # Cellestine {Steen} Armstrong     => 'Cellestine Armstrong'
-      name = name.gsub(/\{.*?\}/, ' ')
-      # remove exceptional names
-      # ex. "William . D. 'Bill' Beard"  =>  "William D. 'Bill' Beard"
-      # also this regexp can remove
-      name = name.gsub(/\s+[^a-zA-Z]+\s+/, ' ')
-      # Why we use substitute(sub) comma to whitespace, not global substitute(gsub).
-      # the reason is the substitution applies for suffix splitting, not for replacing
-      # bad data. As we want, convert "Marvene A Gordon, JD" to "Marvene A Gordon JD",
-      # so that the suffix will get into the split array.
-      # and, standardize suffix as '2nd' => 'II', '3rd' => 'III'
-      nameSplit   = name.gsub(',', ' ').strip.split(/\s+/).map{ |n| CONVERSION[n.downcase] || n }
-      return { :last=>name } if nameSplit.length <= 1
-      suffix_arr  = []
-      while (nameSplit.length > 1)
-        if IGNORABLE_SUFFIXES.include?(nameSplit.last.downcase)
-          suffix_arr.unshift([nameSplit.pop, false])
-        elsif SUFFIX_LIST.include?(nameSplit.last.downcase) || GLOBAL_SUFFIX_LIST.include?(nameSplit.last.downcase)
-          suffix_arr.unshift([nameSplit.pop, true])
-        else
-          break
+    class Error < StandardError; end
+    class Identifier
+      attr_reader :name, :original_name, :prefix, :firstname, :middlename, :lastname, :suffix
+      def initialize(name) # parses +name+ immediately; flip_parts! may raise Error
+        @original_name = name.dup # untouched copy of the input
+        @name = name.dup # working copy, modified by the steps below
+        @prefix_list = []
+        @suffix_list = []
+        sanitize!
+        flip_parts!
+        breakup!
+      end
+      private
+      def sanitize!
+        # replace the typographic apostrophe "’" with a plain "'"
+        name.gsub!(/’/, "'")
+        # remove parenthesized text together with its parentheses, then any stray '(' or ')'
+        # ex. 'Susan M. (Scully) Schultz'  =>  'Susan M. Schultz'
+        #     'Jay (Jung) Heum Kim'        =>  'Jay Heum Kim'
+        name.gsub!(/\(.*?\)/, ' ')
+        name.gsub!(/\(|\)/, '')
+        # remove text in double quotes, then text in single quotes (quotes included)
+        # Darin "Derry" Ronald Anderson    => 'Darin Ronald Anderson'
+        # Nancy M. "Shelli" Egger          => 'Nancy M. Egger'
+        # Nicole 'nikki' Adame             => 'Nicole Adame'
+        name.gsub!(/".*?"/, ' ')
+        name.gsub!(/'.*?'/i, ' ')
+        # remove text in curly brackets, brackets included
+        # Henry C.{Harry} Wilson           => 'Henry C. Wilson'
+        # Cellestine {Steen} Armstrong     => 'Cellestine Armstrong'
+        name.gsub!(/\{.*?\}/, ' ')
+        # remove runs of non-letter characters that are surrounded by whitespace
+        # ex. "William . D. 'Bill' Beard"  =>  "William D. 'Bill' Beard"
+        # the same pattern also drops e.g. a standalone number between two words
+        name.gsub!(/\s+[^a-zA-Z]+\s+/, ' ')
+        # Standardize suffixes through CONVERSION, e.g. '9th' => 'IX'.
+        # Each key is matched as a whole word (\b...\b), case-insensitively,
+        # and every occurrence in the name is replaced.
+        # No comma-to-space substitution happens here; extract_suffix and
+        # flip_parts! split on commas later.
+        CONVERSION.each_pair do |finder, replacer|
+          name.gsub!(Regexp.new("\\b#{Regexp.escape(finder)}\\b", true), replacer)
         end
       end
-      # Loop around until we run into a name that is not contained in the PREFIX_LIST
-      # ex(FL): 'Lt Col Marvene A Gordon', 'The Honorable Dexter F George'
-      prefix_arr      = []
-      while (nameSplit.length > 1)
-        if IGNORABLE_PREFIXS.include?(nameSplit.first.downcase)
-          nameSplit.shift
-        elsif PREFIX_LIST.include?(nameSplit.first.downcase)
-          prefix_arr.push(nameSplit.shift)
-        else
-          break
+      def extract_suffix(str) # returns [rest_of_str, [[suffix_word, keep_flag], ...]]
+        list = []
+        loop do
+          m = /(.*)[, ](.+)/.match(str) # greedy (.*): split at the last comma/space that has text after it
+          break unless m
+          remaining = m[1]
+          last_part = m[2].strip
+          last_part_downcase = last_part.downcase
+          if IGNORABLE_SUFFIXES.include?(last_part_downcase)
+            list.unshift([last_part, false]) # false: dropped by breakup! before a suffix is chosen
+          elsif SUFFIX_LIST.include?(last_part_downcase) || GLOBAL_SUFFIX_LIST.include?(last_part_downcase)
+            list.unshift([last_part, true]) # true: eligible to be reported as the suffix
+          else
+            break # last word is not a known suffix; stop peeling
+          end
+          str = remaining.gsub(/[, ]+$/, '').strip # drop trailing separators before the next pass
         end
+        [str, list]
       end
-      prefix = prefix_arr.join(' ') if prefix_arr.size > 0
-      # Loop around until we run into a name that is not contained in the LAST_NAME_EXTENSIONS
-      last_name_arr  = []
-      last_name_arr.push(nameSplit.pop)
-      last_name_arr.push(nameSplit.pop) while nameSplit.length > 1 && LAST_NAME_EXTENSIONS.include?(nameSplit.last.downcase)
-      last_name = last_name_arr.reverse.join(' ') if last_name_arr.size > 0
-      first_name  = nameSplit.shift     if nameSplit.length >= 1
-      middle_name = nameSplit.join(' ') if nameSplit.length > 0
-      if first_name.nil? && prefix
-        first_name = prefix
-        prefix     = nil
+      def extract_prefix(str) # returns [rest_of_str, [prefix_word, ...]]
+        list = []
+        loop do
+          m = /(.+?)[, ](.+)/.match(str) # lazy (.+?): split at the earliest comma/space after at least one character
+          break unless m
+          remining = m[2]
+          first_part = m[1]
+          first_part_downcase = first_part.downcase
+          if IGNORABLE_PREFIXS.include?(first_part_downcase)
+            # ignorable prefix word: not collected, scanning continues
+          elsif PREFIX_LIST.include?(first_part_downcase)
+            list.push(first_part)
+          else
+            break # first word is not a known prefix; stop peeling
+          end
+          str = remining.gsub(/^[, ]+/, '').strip # drop leading separators before the next pass
+        end
+        [str, list]
       end
-      if first_name.nil? && suffix_arr.any? && SUFFIX_CAN_BE_LASTNAME.include?(suffix_arr.first.first.downcase)
-        first_name = last_name
-        last_name = suffix_arr.shift.first
+      def extract_suffix_before_flipping_parts # peels trailing suffix words off the whole name
+        remaining, list = extract_suffix(name)
+        @name = remaining
+        @suffix_list += list
       end
-      if last_name =~ /^[A-Z]\.?$/i && suffix_arr.any? && SUFFIX_CAN_BE_LASTNAME.include?(suffix_arr.first.first.downcase)
-        middle_name = [middle_name, last_name].compact.join(' ')
-        last_name = suffix_arr.shift.first
+      def flip_parts! # reorders comma-separated input; raises Error for more than three comma-separated parts
+        extract_suffix_before_flipping_parts
+        parts = name.split(/,/)
+        case parts.size
+        when 1 # no comma: nothing to reorder
+        when 2 # "A, B" => "B A", after peeling suffixes off the end of A
+          remining, list = extract_suffix(parts[0])
+          @name = [parts[1], remining].join(' ').strip.gsub(/ +/, ' ')
+          @suffix_list += list
+        when 3 # "A, B, C" => "C A B", after peeling suffixes off the end of "A B"
+          remining, list = extract_suffix(parts[0..1].join(' '))
+          @name = [parts[2], remining].join(' ').strip.gsub(/ +/, ' ')
+          @suffix_list += list
+        else # NOTE(review): also reached when split yields no parts (e.g. an empty name) - confirm this is intended
+          fail Error.new("name [ #{name} ] has >2 commas, don't know how to parse")
+        end
+        extract_prefix_after_flipping_parts
       end
-      suffix_arr.delete_if{|a, b| !b}
-      suffix = suffix_arr.size == 0 ? nil : suffix_arr.first.first # only return first suffix
-      return { :last => last_name, :middle => middle_name, :first => first_name, :prefix => prefix, :suffix => suffix }
+      def extract_prefix_after_flipping_parts # peels leading prefix words off the reordered name
+        remaining, list = extract_prefix(name)
+        @name = remaining
+        @prefix_list += list
+      end
+      def breakup! # assigns prefix/firstname/middlename/lastname/suffix from the cleaned name
+        parts = name.split(/[, ]+/) # words, split on runs of commas and spaces
+        # process prefix: join the collected prefix words with single spaces
+        @prefix = @prefix_list.join(' ') if @prefix_list.any?
+        # process lastname: the final word, plus any preceding words found in
+        # LAST_NAME_EXTENSIONS (the loop always leaves at least one word for the first name)
+        last_name_arr  = []
+        last_name_arr.push(parts.pop)
+        last_name_arr.push(parts.pop) while parts.length > 1 && LAST_NAME_EXTENSIONS.include?(parts.last.downcase)
+        @lastname = last_name_arr.reverse.join(' ')
+        # process firstname (first remaining word) and middlename (all words after it)
+        @firstname  = parts.shift if parts.length >= 1
+        @middlename = parts.join(' ') if parts.length > 0
+        if firstname.nil? && prefix # only a prefix and a last name: report the prefix as the first name
+          @firstname = prefix
+          @prefix    = nil
+        end
+        # still no first name and the first suffix can be a last name: move lastname to firstname, move first suffix to lastname
+        if firstname.nil? && @suffix_list.any? && SUFFIX_CAN_BE_LASTNAME.include?(@suffix_list.first.first.downcase)
+          @firstname = lastname
+          @lastname = @suffix_list.shift.first
+        end
+        # last name is a single letter (optional period) and the first suffix can be a last name: move lastname to middlename, move first suffix to lastname
+        if lastname =~ /^[A-Z]\.?$/i && @suffix_list.any? && SUFFIX_CAN_BE_LASTNAME.include?(@suffix_list.first.first.downcase)
+          @middlename = [middlename, lastname].compact.join(' ')
+          @lastname = @suffix_list.shift.first
+        end
+        # process suffix: drop entries whose flag is false (the block variable holds the keep flag), then report only the first one left
+        @suffix_list.delete_if { |_, ignore_able| !ignore_able }
+        @suffix = @suffix_list.any? ? @suffix_list.first.first : nil
+      end
+    end
+    def parse_fullname(name) # => Hash with :prefix, :first, :middle, :last, :suffix; Identifier.new may raise Error
+      i = Identifier.new(name)
+      return {
+        prefix: i.prefix,
+        first: i.firstname,
+        middle: i.middlename,
+        last: i.lastname,
+        suffix: i.suffix
+      }
     end # << parse_fullname
     extend self
   end

data/lib/fullname/parser/version.rb CHANGED Viewed

@@ -1,6 +1,6 @@
 module Fullname
   module Parser
-    VERSION = '1.0.5'
+    VERSION = '1.1.0'
   end
 end

metadata CHANGED Viewed

@@ -1,15 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: fullname-parser
 version: !ruby/object:Gem::Version
-  version: 1.0.5
-  prerelease:
+  version: 1.1.0
 platform: ruby
 authors:
 - xiaohui
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-06-24 00:00:00.000000000 Z
+date: 2016-02-16 00:00:00.000000000 Z
 dependencies: []
 description: For parsing people's fullname into pieces(prefix/first/middle/last/suffix)
 email:
@@ -25,27 +24,25 @@ files:
 - lib/fullname/parser/version.rb
 homepage: http://github.com/xiaohui-zhangxh/fullname-parser
 licenses: []
+metadata: {}
 post_install_message:
 rdoc_options: []
 require_paths:
 - lib
 required_ruby_version: !ruby/object:Gem::Requirement
-  none: false
   requirements:
-  - - ! '>='
+  - - '>='
     - !ruby/object:Gem::Version
       version: '0'
 required_rubygems_version: !ruby/object:Gem::Requirement
-  none: false
   requirements:
-  - - ! '>='
+  - - '>='
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 1.8.25
+rubygems_version: 2.4.8
 signing_key:
-specification_version: 3
+specification_version: 4
 summary: Split fullname into pieces(prefix/first/middle/last/suffix)
 test_files: []
-has_rdoc: