RubyGems - name_parser - Versions diffs - 0.0.5 - Mend

name_parser 0.0.5

Files changed (15) hide show

data/.gitignore +5 -0
data/.rspec +1 -0
data/.rvmrc +48 -0
data/Gemfile +4 -0
data/README.md +40 -0
data/Rakefile +8 -0
data/lib/name_parser/parser.rb +74 -0
data/lib/name_parser/patterns.rb +29 -0
data/lib/name_parser/version.rb +3 -0
data/lib/name_parser.rb +9 -0
data/name_parser.gemspec +23 -0
data/spec/name_parser/parser_spec.rb +359 -0
data/spec/name_parser_spec.rb +25 -0
data/spec/spec_helper.rb +6 -0
metadata +89 -0

data/.gitignore ADDED Viewed

@@ -0,0 +1,5 @@
+*.gem
+.bundle
+Gemfile.lock
+pkg/*
+.project

data/.rspec ADDED Viewed

	@@ -0,0 +1 @@
1	+ --colour

data/.rvmrc ADDED Viewed

@@ -0,0 +1,48 @@
+#!/usr/bin/env bash
+# This is an RVM Project .rvmrc file, used to automatically load the ruby
+# development environment upon cd'ing into the directory
+# First we specify our desired <ruby>[@<gemset>], the @gemset name is optional,
+# Only full ruby name is supported here, for short names use:
+#     echo "rvm use 1.9.2" > .rvmrc
+environment_id="ruby-1.9.2-p318@name_parser"
+# Uncomment the following lines if you want to verify rvm version per project
+# rvmrc_rvm_version="1.10.3" # 1.10.1 seems as a safe start
+# eval "$(echo ${rvm_version}.${rvmrc_rvm_version} | awk -F. '{print "[[ "$1*65536+$2*256+$3" -ge "$4*65536+$5*256+$6" ]]"}' )" || {
+#   echo "This .rvmrc file requires at least RVM ${rvmrc_rvm_version}, aborting loading."
+#   return 1
+# }
+# First we attempt to load the desired environment directly from the environment
+# file. This is very fast and efficient compared to running through the entire
+# CLI and selector. If you want feedback on which environment was used then
+# insert the word 'use' after --create as this triggers verbose mode.
+# Fast path: when the environments directory exists and the environment file
+# for $environment_id is non-empty, source that file directly.
+if [[ -d "${rvm_path:-$HOME/.rvm}/environments"
+  && -s "${rvm_path:-$HOME/.rvm}/environments/$environment_id" ]]
+then
+  \. "${rvm_path:-$HOME/.rvm}/environments/$environment_id"
+  # Source the after_use hook only if it exists and is non-empty; the trailing
+  # "|| true" keeps this statement from reporting failure when it does not.
+  [[ -s "${rvm_path:-$HOME/.rvm}/hooks/after_use" ]] &&
+    \. "${rvm_path:-$HOME/.rvm}/hooks/after_use" || true
+else
+  # If the environment file has not yet been created, use the RVM CLI to select.
+  # On failure, print a message and return 1 to the shell that sourced this file.
+  rvm --create  "$environment_id" || {
+    echo "Failed to create RVM environment '${environment_id}'."
+    return 1
+  }
+fi
+# If you use bundler, this might be useful to you:
+# if [[ -s Gemfile ]] && {
+#   ! builtin command -v bundle >/dev/null ||
+#   builtin command -v bundle | grep $rvm_path/bin/bundle >/dev/null
+# }
+# then
+#   printf "%b" "The rubygem 'bundler' is not installed. Installing it now.\n"
+#   gem install bundler
+# fi
+# if [[ -s Gemfile ]] && builtin command -v bundle >/dev/null
+# then
+#   bundle install | grep -vE '^Using|Your bundle is complete'
+# fi

data/Gemfile ADDED Viewed

@@ -0,0 +1,4 @@
+# Fetch gems over HTTPS so downloads cannot be tampered with in transit.
+source "https://rubygems.org"
+# Specify your gem's dependencies in name_parser.gemspec
+gemspec

data/README.md ADDED Viewed

@@ -0,0 +1,40 @@
+NameParser
+=========
+Does what it says. Based on Matthew Ericson's people gem: https://github.com/mericson/people which, in turn, is loosely based on
+the Lingua-EN-NameParser Perl module.
+To set up a development environment, clone the repo and run `bundle` to install all of the dependencies.
+Usage
+-----
+```ruby
+require "name_parser"
+include NameParser
+name = "Captain Arthur Two Sheds Jackson Jr."
+parser = Parser.new(name)
+parser.first  # => "Arthur"
+parser.middle # => "Two Sheds"
+parser.last   # => "Jackson"
+parser.title  # => "Captain"
+parser.suffix # => "Jr."
+```
+or using the mixin
+```ruby
+require "name_parser"
+include NameParser
+name = "Captain Arthur Two Sheds Jackson Jr."
+parser = name_parser(name) # => NameParser::Parser
+parser.first # => "Arthur"
+# ...
+```

data/Rakefile ADDED Viewed

@@ -0,0 +1,8 @@
+#!/usr/bin/env rake
+require "bundler/gem_tasks"
+require 'rspec/core/rake_task'
+RSpec::Core::RakeTask.new(:spec)
+task :test => :spec
+task :default => :spec

data/lib/name_parser/parser.rb ADDED Viewed

@@ -0,0 +1,74 @@
+module NameParser
+  class Parser
+    include Patterns
+    attr_reader :first, :middle, :last, :title, :suffix
+    def initialize(name)
+      @name = name.dup
+      run
+    end
+    protected
+      def run
+        remove_non_name_characters
+        remove_extra_spaces
+        clean_trailing_suffixes
+        reverse_last_and_first_names
+        remove_commas
+        parse_title
+        parse_suffix
+        parse_name
+      end
+      def remove_non_name_characters
+        @name.gsub!(/[^A-Za-z0-9\-\'\.&\/ \,]/, '')
+      end
+      def remove_extra_spaces
+        @name.gsub!(/\s+/, ' ')
+        @name.strip!
+      end
+      def clean_trailing_suffixes
+        @name.gsub!(Regexp.new("(.+), (%s)$" % SUFFIX_PATTERN, true), "\\1 \\2")
+      end
+      def reverse_last_and_first_names
+        @name.gsub!(/;/, '')
+        @name.gsub!(/(.+),(.+)/, "\\2 ;\\1")
+        @name.strip!
+      end
+      def remove_commas
+        @name.gsub!(/,/, '')
+      end
+      def parse_title
+        if match = @name.match(Regexp.new("^(%s) (.+)" % TITLE_PATTERN, true))
+          @name = match[-1]
+          @title = match[1].strip
+        end
+      end
+      def parse_suffix
+        if match = @name.match(Regexp.new("(.+) (%s)$" % SUFFIX_PATTERN, true))
+          @name = match[1].strip
+          @suffix = match[2]
+        end
+      end
+      def parse_name
+        case
+          when match = @name.match(Regexp.new('^%s%s$' % [ NAME_PATTERN, LAST_NAME_PATTERN ], true))
+            @first, @last = match.captures
+          when match = @name.match(Regexp.new('^%s%s%s%s$' % [ NAME_PATTERN, NAME_PATTERN, NAME_PATTERN, LAST_NAME_PATTERN ], true))
+            @first, *middles, @last = match.captures[0..3]
+            @middle = middles.join(' ')
+          when match = @name.match(Regexp.new('^%s%s%s$' % [ NAME_PATTERN, NAME_PATTERN, LAST_NAME_PATTERN ], true))
+            @first, @middle, @last = match.captures
+        end
+      end
+  end
+end

data/lib/name_parser/patterns.rb ADDED Viewed

@@ -0,0 +1,29 @@
+module NameParser
+  module Patterns
+    NAME_PATTERN = "([\\w\\-\\']+)[\.{1,}\\s|\\s]+"
+    LAST_NAME_PATTERN = "\;?([\\w\\-\\']+|(Mc|Mac|Des|Dell[ae]|Del|De La|De Los|Da|Di|Du|La|Le\
+    |Lo|St\\.|Den|Von|Van|Von Der|Van De[nr])?\\s+([\\w]+))"
+    SUFFIX_PATTERN = "Jn?r\.?,? Esq\.?|Sn?r\.?,? Esq\.?|I{1,3},? Esq\.?|Jn?r\.?,? M\.?D\.?|Sn?r\.?,? M\.?D\.?|\
+    I{1,3},? M\.?D\.?|Sn?r\.?|Jn?r\.?|Esq(\.|uire)?|Esquire.|Attorney at Law.|Attorney-at-Law.|Ph\.?d\.?|C\.?P\.?A\.?|\
+    XI{1,3}|X|IV|VI{1,3}|V|IX|I{1,3}\.?|M\.?D\.?|D.?M\.?D\.?"
+    STANDARD = "M(ister|aster|issus|iss|r\\.?|rs\\.?|s\\.?|mme\\.?|essr\\.?)"
+    ROYALTY = "Sir|Lord|Lady|Madam(e)?|Dame|Duke|Duchess|King|Queen|Prince|Princess"
+    MEDICINE = "D(r\\.?|octor)|Sister|Matron"
+    LEGAL = "Judge|Justice|Att(\\.|orney) Gen(\\.|eral)"
+    POLICE = "Det(\\.|ective) Insp(\\.|ector)|Det(\\.|ective)|Insp(\\.|ector)|Chief|Constable|Officer"
+    MILITARY = "Brig(adier)?|Capt(\\.?|ain)|C(dr\\.?|ommander|ommodore)|Col(\\.?|onel)|\
+    Gen(\\.?|eral)|Field Marshall|Fl(\\.?|ight) Off(\\.?|icer)|Fl(t\\.?|ight) L(t\\.?|ieutenant)|\
+    P(te\\.?|rivate)|S(gt\\.?|argent)|Air (Commander|Commodore| Marshall)|L(t\\.?|ieutenant) (Col(\\.?|onel)|\
+    Gen(\\.?|eral)|C(Cdr\\.?|ommander))|L(t\\.?|eut\\.?|ieutenant|eutenant)|Maj(\\.?|or) Gen(\\.?|eral)|Maj(\\.?|or)"
+    RELIGIOUS = "Rabbi|Brother|Father|Chaplain|Pastor|(Archb|B)ishop|Cardinal|Pope|\
+    Mother( Superior)?|(Most|Mt\\.|Very|V.) Re(v\\.?|vd\\.?|ver[e|a]nd)|Re(v\\.?|vd\\.?|er[e|a]nd)"
+    POLITICIAN = "Mayor|Sen(\\.|ator)?|Rep(\\.|resentative)?|Ald(\\.|erman)?|Pres(\\.|ident)?|\
+    Ambassador|Assembly(woman|man)|Chair(woman|man)|Commissioner|Congress(woman|man)|Council(wo)man|\
+    Counselor|Delegate|(Lieutentant )Governor|Postmaster( General)"
+    EDUCATOR = "Dean|President|Ass(\\.|oc\\.|ociate|t\\.|istant) Prof(\\.|essor)|Prof(\\.|essor)"
+    TITLE_PATTERN = [ STANDARD, ROYALTY, MEDICINE, LEGAL, POLICE, MILITARY, RELIGIOUS, POLITICIAN, EDUCATOR ].join("|")
+ end
+end

data/lib/name_parser/version.rb ADDED Viewed

@@ -0,0 +1,3 @@
+module NameParser
+  VERSION = "0.0.5"
+end

data/lib/name_parser.rb ADDED Viewed

@@ -0,0 +1,9 @@
+module NameParser
+  autoload :Version, 'name_parser/version'
+  autoload :Patterns,'name_parser/patterns'
+  autoload :Parser,  'name_parser/parser'
+  def name_parser(name)
+    Parser.new(name)
+  end
+end

data/name_parser.gemspec ADDED Viewed

@@ -0,0 +1,23 @@
+# -*- encoding: utf-8 -*-
+$:.push File.expand_path("../lib", __FILE__)
+require "name_parser/version"
+Gem::Specification.new do |s|
+  s.name        = "name_parser"
+  s.version     = NameParser::VERSION
+  s.authors     = ["Chris Pallotta", "Scott Pullen", "Tom Leonard"]
+  s.email       = ["ChristopherF_Pallotta@dfci.harvard.edu", "ScottT_Pullen@dfci.harvard.edu", "Thomas_Leonard@dfci.harvard.edu"]
+  s.homepage    = ""
+  s.summary     = %q{Parses strings.}
+  s.description = %q{Parses particular kinds of strings. For now, it only handles parsing people names.}
+  s.rubyforge_project = "name_parser"
+  s.files         = `git ls-files`.split("\n")
+  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
+  s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
+  s.require_paths = ["lib"]
+  s.add_development_dependency 'rspec'
+  s.add_development_dependency 'debugger'
+end

data/spec/name_parser/parser_spec.rb ADDED Viewed

@@ -0,0 +1,359 @@
+require 'spec_helper'
+include NameParser
+Parser.send(:public, *Parser.protected_instance_methods)
+describe Parser do
+  let(:name) { 'Horatio Xavier Hornblower' }
+  let!(:parser) { Parser.new(name) }
+  [:name, :first, :middle, :last, :title, :suffix ].each do |attr|
+    describe "#{attr} attribute" do
+      it 'is read only' do
+        parser.methods.should_not include(":#{attr}=".to_sym)
+      end
+    end
+  end
+  describe 'name attribute' do
+    it 'is set on initialize' do
+      get_name.should == name
+    end
+  end
+  describe '#remove_non_name_characters' do
+    it 'only allows alpha-numerics, dashes, backslashes, apostrophes and ampersands' do
+      set_name("aZ1/&'`!@$#%^*()_+=[]{}|\:;""")
+      parser.remove_non_name_characters
+      get_name.should == "aZ1/&'"
+    end
+  end
+  describe '#remove_extra_spaces' do
+    it 'removes leading spaces, tabs and line breaks' do
+      set_name(" \t\nFoo")
+      parser.remove_extra_spaces
+      get_name.should == 'Foo'
+    end
+    it 'removes trailing spaces, tabs and line breaks' do
+      set_name("Foo \t\n")
+      parser.remove_extra_spaces
+      get_name.should == 'Foo'
+    end
+    it 'replaces repeating spaces, tabs and line breaks with a single space' do
+      set_name("  Foo  \t\nBar  ")
+      parser.remove_extra_spaces
+      get_name.should == 'Foo Bar'
+    end
+  end
+  describe '#clean_trailing_suffixes' do
+    it 'removes trailing suffixes' do
+      set_name('Biggie Smalls, Junior, Esquire, Phd., VII')
+      parser.clean_trailing_suffixes
+      get_name.should == 'Biggie Smalls, Junior, Esquire, Phd. VII'
+    end
+  end
+  describe '#reverse_last_and_first_names' do
+    it 'reorders last and first names if comma is present' do
+      set_name('Smith, Johnny')
+      parser.reverse_last_and_first_names
+      get_name.should == 'Johnny ;Smith'
+    end
+  end
+  describe '#remove_commas' do
+    it 'removes all commas' do
+      set_name('Hounddog ;Taylor,')
+      parser.remove_commas
+      get_name.should == 'Hounddog ;Taylor'
+    end
+  end
+  describe '#parse_title' do
+    context 'when a title is found' do
+      before { set_name('Colonel Henry Potter') }
+      it 'sets title attribute' do
+        parser.parse_title
+        parser.title.should == 'Colonel'
+      end
+      it 'removes the title from name' do
+        parser.parse_title
+        get_name.should == 'Henry Potter'
+      end
+    end
+    context 'when a title is not found' do
+      it 'returns nil' do
+        set_name('Frank Burns')
+        parser.parse_title
+        parser.title.should be_nil
+      end
+    end
+  end
+  describe '#parse_suffix' do
+    context 'when a suffix is found' do
+      before { set_name('Bubba Watson Jr.') }
+      it 'returns the suffix' do
+        parser.parse_suffix
+        parser.suffix.should == 'Jr.'
+      end
+      it 'removes the suffix from name' do
+        parser.parse_suffix
+        get_name.should == 'Bubba Watson'
+      end
+    end
+    context 'when a suffix is not found' do
+       it 'returns nil' do
+         set_name('Bubba Watson')
+         parser.parse_suffix
+         parser.suffix.should be_nil
+       end
+    end
+  end
+  describe '#parse_name' do
+    context 'when first initial and last name' do
+      before do
+        set_name('J Tolkien')
+        parser.parse_name
+      end
+      it 'returns first initial' do
+        parser.first.should == 'J'
+      end
+      it 'returns nil middle name' do
+        parser.middle.should be_nil
+      end
+      it 'returns last name' do
+        parser.last.should == 'Tolkien'
+      end
+    end
+    context 'when first initial, middle initial and last name' do
+      before do
+        set_name('J R Tolkien')
+        parser.parse_name
+      end
+      it 'returns first initial' do
+        parser.first.should == 'J'
+      end
+      it 'returns middle initial' do
+        parser.middle.should == 'R'
+      end
+      it 'returns last name' do
+        parser.last.should == 'Tolkien'
+      end
+    end
+    context 'when first initial dot middle initial dot last name' do
+      before do
+        set_name('J. R. Tolkien')
+        parser.parse_name
+      end
+      it 'returns first initial' do
+        parser.first.should == 'J'
+      end
+      it 'returns middle initial' do
+        parser.middle.should == 'R'
+      end
+      it 'returns last name' do
+        parser.last.should == 'Tolkien'
+      end
+    end
+    context 'when first initial, two middle initials and last name' do
+      before do
+        set_name('J R R Tolkien')
+         parser.parse_name
+      end
+      it 'returns first initial' do
+        parser.first.should == 'J'
+      end
+      it 'returns both middle initials' do
+        parser.middle.should == 'R R'
+      end
+      it 'returns last name' do
+        parser.last.should == 'Tolkien'
+      end
+    end
+    context 'when first initial, middle name and last name' do
+      before do
+        set_name('J Ronald Tolkien')
+        parser.parse_name
+      end
+      it 'returns first initial' do
+        parser.first.should == 'J'
+      end
+      it 'returns middle name' do
+        parser.middle.should == 'Ronald'
+      end
+      it 'returns last name' do
+        parser.last.should == 'Tolkien'
+      end
+    end
+    context 'when first name, middle initial and last name' do
+      before do
+        set_name('John R Tolkien')
+        parser.parse_name
+      end
+      it 'returns first name' do
+        parser.first.should == 'John'
+      end
+      it 'returns middle initial' do
+        parser.middle.should == 'R'
+      end
+      it 'returns last name' do
+        parser.last.should == 'Tolkien'
+      end
+    end
+    context 'when first name, two middle initials and last name' do
+      before do
+        set_name('John R R Tolkien')
+        parser.parse_name
+      end
+      it 'returns first name' do
+        parser.first.should == 'John'
+      end
+      it 'returns middle name' do
+        parser.middle.should == 'R R'
+      end
+      it 'returns last name' do
+        parser.last.should == 'Tolkien'
+      end
+    end
+    context 'when first name, two middle initials with dots and last name' do
+      before do
+        set_name('John R. R. Tolkien')
+        parser.parse_name
+      end
+      it 'returns first name' do
+        parser.first.should == 'John'
+      end
+      it 'returns middle name' do
+        parser.middle.should == 'R R'
+      end
+      it 'returns last name' do
+        parser.last.should == 'Tolkien'
+      end
+    end
+    context 'when first name and last name' do
+      before do
+        set_name('John Tolkien')
+        parser.parse_name
+      end
+      it 'returns first name' do
+        parser.first.should == 'John'
+      end
+      it 'returns nil middle name' do
+        parser.middle.should be_nil
+      end
+      it 'returns last name' do
+        parser.last.should == 'Tolkien'
+      end
+    end
+    context 'when first name, middle name and last name' do
+      before do
+        set_name('John Ronald Tolkien')
+        parser.parse_name
+      end
+      it 'returns first name' do
+        parser.first.should == 'John'
+      end
+      it 'returns  middle name' do
+        parser.middle.should == 'Ronald'
+      end
+      it 'returns last name' do
+        parser.last.should == 'Tolkien'
+      end
+    end
+    context 'when last name is hyphenated' do
+      it 'returns last name' do
+        set_name('John R. Tolkien-Smith')
+        parser.parse_name
+        parser.last.should == 'Tolkien-Smith'
+      end
+    end
+    context 'when last name is preceded by a semicolon' do
+      it 'returns last name' do
+        set_name('J R R ;Tolkien')
+        parser.parse_name
+        parser.last.should == 'Tolkien'
+      end
+    end
+  end
+  def set_name(name)
+    parser.instance_variable_set(:@first, nil)
+    parser.instance_variable_set(:@middle, nil)
+    parser.instance_variable_set(:@last, nil)
+    parser.instance_variable_set(:@title, nil)
+    parser.instance_variable_set(:@suffix, nil)
+    parser.instance_variable_set(:@name, name)
+  end
+  def get_name
+    parser.instance_variable_get(:@name)
+  end
+end

data/spec/name_parser_spec.rb ADDED Viewed

@@ -0,0 +1,25 @@
+require 'spec_helper'
+class TestClass
+  include NameParser
+end
+describe NameParser do
+  let!(:name) { "Adams Jr., Mr. John Quincy" }
+  let!(:test_class) { TestClass.new }
+  describe '#name_parser' do
+    it 'returns a new NameParser::Parser object' do
+      test_class.name_parser(name).class.should == NameParser::Parser
+    end
+    it 'should run the parser' do
+      parser = test_class.name_parser(name)
+      parser.title.should == 'Mr.'
+      parser.first.should == 'John'
+      parser.middle.should == 'Quincy'
+      parser.last.should == 'Adams'
+      parser.suffix.should == 'Jr.'
+    end
+  end
+end

data/spec/spec_helper.rb ADDED Viewed

@@ -0,0 +1,6 @@
+require 'rubygems'
+require 'rspec'
+require 'debugger'
+$:.push File.expand_path("../lib", __FILE__)
+require 'name_parser'

metadata ADDED Viewed

@@ -0,0 +1,89 @@
+--- !ruby/object:Gem::Specification
+name: name_parser
+version: !ruby/object:Gem::Version
+  version: 0.0.5
+  prerelease:
+platform: ruby
+authors:
+- Chris Pallotta
+- Scott Pullen
+- Tom Leonard
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2013-02-07 00:00:00.000000000Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: rspec
+  requirement: &2152901460 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: *2152901460
+- !ruby/object:Gem::Dependency
+  name: debugger
+  requirement: &2152901040 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: *2152901040
+description: Parses particular kinds of strings. For now, it only handles parsing
+  people names.
+email:
+- ChristopherF_Pallotta@dfci.harvard.edu
+- ScottT_Pullen@dfci.harvard.edu
+- Thomas_Leonard@dfci.harvard.edu
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- .gitignore
+- .rspec
+- .rvmrc
+- Gemfile
+- README.md
+- Rakefile
+- lib/name_parser.rb
+- lib/name_parser/parser.rb
+- lib/name_parser/patterns.rb
+- lib/name_parser/version.rb
+- name_parser.gemspec
+- spec/name_parser/parser_spec.rb
+- spec/name_parser_spec.rb
+- spec/spec_helper.rb
+homepage: ''
+licenses: []
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project: name_parser
+rubygems_version: 1.8.17
+signing_key:
+specification_version: 3
+summary: Parses strings.
+test_files:
+- spec/name_parser/parser_spec.rb
+- spec/name_parser_spec.rb
+- spec/spec_helper.rb