fullname-parser 1.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +18 -0
- data/README.md +18 -0
- data/fullname-parser.gemspec +17 -0
- data/lib/fullname/parser.rb +185 -0
- data/lib/fullname/parser/version.rb +6 -0
- metadata +51 -0
    
        data/.gitignore
    ADDED
    
    
    
        data/README.md
    ADDED
    
    | @@ -0,0 +1,18 @@ | |
| 1 | 
            +
            fullname_parser
         | 
| 2 | 
            +
            ===============
         | 
| 3 | 
            +
             | 
| 4 | 
            +
            There are two ways to call `parse_fullname`: directly on the `Fullname::Parser` module, or as a plain method after including the module:
         | 
| 5 | 
            +
             | 
| 6 | 
            +
            require 'fullname/parser'
         | 
| 7 | 
            +
            Fullname::Parser.parse_fullname("Xiaohui Zhang")
         | 
| 8 | 
            +
             | 
| 9 | 
            +
            => {:last=>"Zhang", :middle=>nil, :first=>"Xiaohui", :prefix=>nil, :suffix=>nil}
         | 
| 10 | 
            +
             | 
| 11 | 
            +
            or
         | 
| 12 | 
            +
             | 
| 13 | 
            +
            require 'fullname/parser'
         | 
| 14 | 
            +
            include Fullname::Parser
         | 
| 15 | 
            +
            parse_fullname("Xiaohui Zhang")
         | 
| 16 | 
            +
             | 
| 17 | 
            +
            => {:last=>"Zhang", :middle=>nil, :first=>"Xiaohui", :prefix=>nil, :suffix=>nil}
         | 
| 18 | 
            +
             | 
| @@ -0,0 +1,17 @@ | |
| 1 | 
            +
            $:.unshift File.expand_path("../lib", __FILE__)
         | 
| 2 | 
            +
            require 'fullname/parser/version'
         | 
| 3 | 
            +
             | 
| 4 | 
            +
            # Gem specification for fullname-parser.
            #
            # The packaged file list is taken from the files tracked by git, so the
            # gem must be built from inside a git checkout.
            Gem::Specification.new do |s|
              s.name        = "fullname-parser"
              s.version     = Fullname::Parser::VERSION
              s.platform    = Gem::Platform::RUBY
              s.authors     = ['xiaohui']
              s.email       = ['xiaohui@zhangxh.net']
              s.homepage    = 'http://github.com/xiaohui-zhangxh/fullname-parser'
              s.summary     = "Split fullname into pieces(prefix/first/middle/last/suffix)"
              s.description = "For parsing people's fullname into pieces(prefix/first/middle/last/suffix)"

              # Ask git for the tracked files once and reuse the list for both
              # attributes instead of shelling out twice.
              tracked_files  = `git ls-files`.split("\n")
              s.files        = tracked_files
              # Every tracked file under bin/ is an executable; keep the path relative to bin/.
              s.executables  = tracked_files.map { |f| f[/^bin\/(.*)/, 1] }.compact
              s.require_path = 'lib'
            end
         | 
| @@ -0,0 +1,185 @@ | |
| 1 | 
            +
            # encoding: utf-8
         | 
| 2 | 
            +
            require File.expand_path('../parser/version', __FILE__)
         | 
| 3 | 
            +
             | 
| 4 | 
            +
            module Fullname
              # Splits a person's full name into prefix / first / middle / last / suffix.
              #
              # The module extends itself, so it can be used either as
              # Fullname::Parser.parse_fullname(...) or mixed into another class
              # with `include Fullname::Parser`.
              #
              # Every word list below is only assigned when the constant is not
              # already defined, so a list defined before this file is loaded is
              # kept as-is.
              module Parser

                # When "II", "III" or even "IV" appears in the middle-name/suffix slot it
                # can safely be assumed to be a suffix (John Smith has a son named
                # John Smith II, who has a son named John Smith III, etc.). However, nobody
                # (except a king) puts "I" after their name to indicate that they are
                # "the first"; if anything they use "Sr.". A lone "I" in that slot is
                # therefore assumed to be a middle initial, so 'i' is left out of this list.
                #
                # Almost nobody reaches beyond 'v' either (again, except a king), so
                # generations later than 'v' are not recognised.
                GENERATION_LIST = [
                  #'i',
                  'ii', 'iii', 'iv', 'v',
                  # 'vi', 'vii', 'viii', 'ix', 'x', 'xi', 'xii', 'xiii', 'xiv', 'xv', 'xvi', 'xvii', 'xviii', 'xix', 'xx',
                ] unless const_defined?(:GENERATION_LIST)

                # Generational suffixes plus junior/senior.
                GLOBAL_SUFFIX_LIST = GENERATION_LIST + [
                  'jr', 'jr.',
                  'sr', 'sr.',
                ] unless const_defined?(:GLOBAL_SUFFIX_LIST)

                # Trailing tokens that are stripped from the name and may be reported as :suffix.
                SUFFIX_LIST = [
                  'b.a.',
                  'capt.', 'col.', 'cfa', 'c.f.a', 'c.f.a.', 'cpa', 'c.p.a', 'c.p.a.',
                  'edd', 'ed.d',
                  'mph',
                  'pc', 'p.c.', 'psyd', 'psyd.', 'psy.d', 'phd', 'phd.', 'ph.d', 'ph.d.',
                  'r.s.m.',
                  'usn',
                ] unless const_defined?(:SUFFIX_LIST)

                # Trailing tokens that are stripped from the name but never reported as :suffix.
                IGNORABLE_SUFFIXES = [
                  'do', 'd.o.', 'd.o', 'dds', 'd.d.s.',
                  'esq', 'esq.',
                  'md', 'm.d.', 'm.d',
                  'mr.', 'ms.', 'mrs.',
                  'jd', 'jd.', 'j.d.',
                  'retd', 'ret.', 'retd.',
                  'usmc',
                ] unless const_defined?(:IGNORABLE_SUFFIXES)

                # Suffix tokens that are handed back as the last name when stripping
                # them leaves no first name, or leaves only an initial as the last name.
                SUFFIX_CAN_BE_LASTNAME = [
                  'do',
                ] unless const_defined?(:SUFFIX_CAN_BE_LASTNAME)

                # Leading tokens that are collected into :prefix.
                PREFIX_LIST = [
                  'asst.',
                  'attorney', 'atty.',
                  'bg', 'brig', 'gen',
                  'colonel', 'cardinal', 'capt', 'capt.', 'captain', 'cdr', 'col' , 'col.', 'congressman', 'cpt',
                  'dir.', 'dr', 'dr.',
                  'exec.',
                  'general', 'gen', 'gen.',
                  'honorable', 'hon', 'hon.',
                  'judge', 'justice', 'chiefjustice',
                  'lieutenant', 'lcdr', 'lt', 'lt.', 'ltc', 'ltcol.', 'ltcol', 'ltjg',
                  'mr', 'mr.', 'ms', 'ms.', 'mrs', 'mrs.', 'maj', 'maj.', 'major', 'miss',
                  'president', 'prof', 'prof.', 'professor',
                  'reverend', 'rev', 'rev.',
                  'sheriff',
                  'sr', 'sr.'
                ] unless const_defined?(:PREFIX_LIST)

                # Leading tokens that are dropped without being reported as :prefix.
                IGNORABLE_PREFIXS = [
                  'the',
                ] unless const_defined?(:IGNORABLE_PREFIXS)

                # These will be considered part of the last name.
                LAST_NAME_EXTENSIONS = [
                  'bar', 'ben',
                  'da', 'dal', 'dan', 'de', 'del', 'den', 'der', 'des', 'dela', 'della', 'di', 'do', 'du',
                  'el',
                  'la', 'le', 'lo',
                  'mac', 'mc',
                  'san',
                  'st', 'st.', 'sta', 'sta.',
                  'van','von', 'ver', 'vanden', 'vander'
                ] unless const_defined?(:LAST_NAME_EXTENSIONS)

                # Ordinal spellings that are rewritten to roman numerals before suffix matching.
                CONVERSION = {
                  '1st' => 'I',
                  '2nd' => 'II',
                  '3rd' => 'III',
                  '4th' => 'IV',
                  '5th' => 'V',
                  '6th' => 'VI',
                  '7th' => 'VII',
                  '8th' => 'VIII',
                  '9th' => 'IX',
                } unless const_defined?(:CONVERSION)

                # Parse a full name into its components.
                #
                # @param name [String] the raw full name, e.g. "Lt Col Marvene A Gordon, JD"
                # @return [Hash] always has the keys :last, :middle, :first, :prefix and
                #   :suffix. Components that are absent are nil, except that :last is an
                #   empty string when the cleaned-up input contains no token at all.
                #   Only the first reportable suffix is returned.
                def parse_fullname(name)
                  # Normalise the typographic apostrophe (U+2019) to a plain "'".
                  name = name.gsub("\u2019", "'")

                  # Remove parenthesised text, then any unbalanced parenthesis left over.
                  # ex. 'Susan M. (Scully) Schultz'  =>  'Susan M. Schultz'
                  #     'Jay (Jung) Heum Kim'        =>  'Jay Heum Kim'
                  name = name.gsub(/\(.*?\)/, ' ').delete('()')

                  # Remove quoted nicknames.
                  # ex. Darin "Derry" Ronald Anderson  =>  'Darin Ronald Anderson'
                  #     Nancy M. "Shelli" Egger        =>  'Nancy M. Egger'
                  #     Nicole 'nikki' Adame           =>  'Nicole Adame'
                  # A single quote only delimits a nickname when it is not glued to a
                  # letter or digit on its outer side; otherwise two apostrophes inside
                  # names ("Sean O'Brien D'Angelo") would be taken for a quoted span.
                  name = name.gsub(/".*?"/, ' ').gsub(/(?<![[:alnum:]])'.*?'(?![[:alnum:]])/, ' ')

                  # Remove text in curly brackets.
                  # ex. Henry C.{Harry} Wilson        =>  'Henry C. Wilson'
                  #     Cellestine {Steen} Armstrong  =>  'Cellestine Armstrong'
                  name = name.gsub(/\{.*?\}/, ' ')

                  # Drop whitespace-delimited tokens that contain no ASCII letter at all
                  # (this also collapses the runs of blanks left by the removals above).
                  # ex. "William . D. 'Bill' Beard"  =>  "William D. 'Bill' Beard"
                  name = name.gsub(/\s+[^a-zA-Z]+\s+/, ' ')

                  # Commas become whitespace so that a suffix written as
                  # "Marvene A Gordon, JD" ends up as a token of its own. Ordinals are
                  # standardised on the way: '2nd' => 'II', '3rd' => 'III', ...
                  name_split = name.tr(',', ' ').strip.split(/\s+/).map { |n| CONVERSION[n.downcase] || n }

                  # Zero or one token: there is nothing to split. Return the cleaned
                  # token (not the raw string) in the same shape as every other result.
                  if name_split.length <= 1
                    return { :last => name_split.first.to_s, :middle => nil, :first => nil, :prefix => nil, :suffix => nil }
                  end

                  # Peel suffixes off the end, always leaving at least one token.
                  # Each entry is [token, reportable]; ignorable suffixes are remembered
                  # too because they may be promoted to the last name further down.
                  suffix_arr = []
                  while name_split.length > 1
                    candidate = name_split.last.downcase
                    if IGNORABLE_SUFFIXES.include?(candidate)
                      suffix_arr.unshift([name_split.pop, false])
                    elsif SUFFIX_LIST.include?(candidate) || GLOBAL_SUFFIX_LIST.include?(candidate)
                      suffix_arr.unshift([name_split.pop, true])
                    else
                      break
                    end
                  end

                  # Peel prefixes off the front until a token is not in PREFIX_LIST,
                  # again leaving at least one token.
                  # ex(FL): 'Lt Col Marvene A Gordon', 'The Honorable Dexter F George'
                  prefix_arr = []
                  while name_split.length > 1
                    candidate = name_split.first.downcase
                    if IGNORABLE_PREFIXS.include?(candidate)
                      name_split.shift
                    elsif PREFIX_LIST.include?(candidate)
                      prefix_arr.push(name_split.shift)
                    else
                      break
                    end
                  end
                  prefix = prefix_arr.empty? ? nil : prefix_arr.join(' ')

                  # The final token is the last name; preceding particles listed in
                  # LAST_NAME_EXTENSIONS ("van", "de la", ...) are attached to it as long
                  # as at least one token remains for the first name.
                  last_name_arr = [name_split.pop]
                  while name_split.length > 1 && LAST_NAME_EXTENSIONS.include?(name_split.last.downcase)
                    last_name_arr.unshift(name_split.pop)
                  end
                  last_name = last_name_arr.join(' ')

                  # Whatever is left: first token is the first name, the rest the middle name.
                  first_name  = name_split.shift
                  middle_name = name_split.empty? ? nil : name_split.join(' ')

                  # Only a prefix and a last name were found: report the prefix as the first name.
                  if first_name.nil? && prefix
                    first_name = prefix
                    prefix     = nil
                  end

                  # "Jane Do": the stripped "suffix" was really the last name.
                  if first_name.nil? && suffix_arr.any? && SUFFIX_CAN_BE_LASTNAME.include?(suffix_arr.first.first.downcase)
                    first_name = last_name
                    last_name  = suffix_arr.shift.first
                  end

                  # "John A. Do": the last name is just an initial, so it is a middle
                  # initial and the stripped "suffix" is the last name.
                  if last_name =~ /\A[A-Z]\.?\z/i && suffix_arr.any? && SUFFIX_CAN_BE_LASTNAME.include?(suffix_arr.first.first.downcase)
                    middle_name = [middle_name, last_name].compact.join(' ')
                    last_name   = suffix_arr.shift.first
                  end

                  # Only return the first suffix that is flagged as reportable.
                  reportable = suffix_arr.find { |_token, keep| keep }
                  suffix     = reportable && reportable.first

                  { :last => last_name, :middle => middle_name, :first => first_name, :prefix => prefix, :suffix => suffix }
                end # << parse_fullname

                extend self
              end
            end
         | 
    
        metadata
    ADDED
    
    | @@ -0,0 +1,51 @@ | |
| 1 | 
            +
            --- !ruby/object:Gem::Specification
         | 
| 2 | 
            +
            name: fullname-parser
         | 
| 3 | 
            +
            version: !ruby/object:Gem::Version
         | 
| 4 | 
            +
              version: 1.0.3
         | 
| 5 | 
            +
              prerelease: 
         | 
| 6 | 
            +
            platform: ruby
         | 
| 7 | 
            +
            authors:
         | 
| 8 | 
            +
            - xiaohui
         | 
| 9 | 
            +
            autorequire: 
         | 
| 10 | 
            +
            bindir: bin
         | 
| 11 | 
            +
            cert_chain: []
         | 
| 12 | 
            +
            date: 2013-05-15 00:00:00.000000000 Z
         | 
| 13 | 
            +
            dependencies: []
         | 
| 14 | 
            +
            description: For parsing people's fullname into pieces(prefix/first/middle/last/suffix)
         | 
| 15 | 
            +
            email:
         | 
| 16 | 
            +
            - xiaohui@zhangxh.net
         | 
| 17 | 
            +
            executables: []
         | 
| 18 | 
            +
            extensions: []
         | 
| 19 | 
            +
            extra_rdoc_files: []
         | 
| 20 | 
            +
            files:
         | 
| 21 | 
            +
            - .gitignore
         | 
| 22 | 
            +
            - README.md
         | 
| 23 | 
            +
            - fullname-parser.gemspec
         | 
| 24 | 
            +
            - lib/fullname/parser.rb
         | 
| 25 | 
            +
            - lib/fullname/parser/version.rb
         | 
| 26 | 
            +
            homepage: http://github.com/xiaohui-zhangxh/fullname-parser
         | 
| 27 | 
            +
            licenses: []
         | 
| 28 | 
            +
            post_install_message: 
         | 
| 29 | 
            +
            rdoc_options: []
         | 
| 30 | 
            +
            require_paths:
         | 
| 31 | 
            +
            - lib
         | 
| 32 | 
            +
            required_ruby_version: !ruby/object:Gem::Requirement
         | 
| 33 | 
            +
              none: false
         | 
| 34 | 
            +
              requirements:
         | 
| 35 | 
            +
              - - ! '>='
         | 
| 36 | 
            +
                - !ruby/object:Gem::Version
         | 
| 37 | 
            +
                  version: '0'
         | 
| 38 | 
            +
            required_rubygems_version: !ruby/object:Gem::Requirement
         | 
| 39 | 
            +
              none: false
         | 
| 40 | 
            +
              requirements:
         | 
| 41 | 
            +
              - - ! '>='
         | 
| 42 | 
            +
                - !ruby/object:Gem::Version
         | 
| 43 | 
            +
                  version: '0'
         | 
| 44 | 
            +
            requirements: []
         | 
| 45 | 
            +
            rubyforge_project: 
         | 
| 46 | 
            +
            rubygems_version: 1.8.25
         | 
| 47 | 
            +
            signing_key: 
         | 
| 48 | 
            +
            specification_version: 3
         | 
| 49 | 
            +
            summary: Split fullname into pieces(prefix/first/middle/last/suffix)
         | 
| 50 | 
            +
            test_files: []
         | 
| 51 | 
            +
            has_rdoc: 
         |