namae 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,175 @@
1
+ # -*- racc -*-
2
+
3
# Racc grammar for personal names. The actions build Name objects from the
# tokens produced by the tokenizer (#next_token) defined in the inner section.
class Namae::Parser

# Terminal symbols; these are the token types returned by #next_token.
token COMMA UWORD LWORD PWORD NICK AND APPELLATION TITLE

# Declares that the grammar is expected to have zero shift/reduce conflicts.
expect 0

rule

  # A list of names separated by AND tokens. Empty input yields an empty
  # Array; every further name is appended to the Array built so far.
  names :  { result = [] }
        | name { result = [val[0]] }
        | names AND name { result = val[0] << val[2] }

  # A single name. A lone word is stored as the given name; a word that
  # follows an honorific is stored as the family name instead.
  name : word { result = Name.new(:given => val[0]) }
       | display_order
       | honorific word { result = val[0].merge(:family => val[1]) }
       | honorific display_order { result = val[1].merge(val[0]) }
       | sort_order

  # An appellation or title, wrapped in a Name so it can be merged.
  honorific : APPELLATION { result = Name.new(:appellation => val[0]) }
            | TITLE { result = Name.new(:title => val[0]) }

  # Names written given-name first, with optional nickname and particle
  # between the given and family parts.
  display_order : u_words word
       {
         result = Name.new(:given => val[0], :family => val[1])
       }
     | u_words NICK last
       {
         result = Name.new(:given => val[0], :nick => val[1], :family => val[2])
       }
     | u_words NICK von last
       {
         result = Name.new(:given => val[0], :nick => val[1],
           :particle => val[2], :family => val[3])
       }
     | u_words von last
       {
         result = Name.new(:given => val[0], :particle => val[1],
           :family => val[2])
       }
     | von last
       {
         result = Name.new(:particle => val[0], :family => val[1])
       }

  # Names written family-name first, followed by a COMMA. The `first` rule
  # yields a two-element Array: index 0 is stored as :suffix, index 1 as :given.
  sort_order : last COMMA first
       {
         result = Name.new(:family => val[0], :suffix => val[2][0],
           :given => val[2][1])
       }
     | von last COMMA first
       {
         result = Name.new(:particle => val[0], :family => val[1],
           :suffix => val[3][0], :given => val[3][1])
       }
     | u_words von last COMMA first
       {
         # The leading u_words and the von part are joined into the particle.
         result = Name.new(:particle => val[0,2].join(' '), :family => val[2],
           :suffix => val[4][0], :given => val[4][1])
       }
     ;

  # A particle: starts with an LWORD, may continue with more LWORDs, and may
  # contain u_words as long as it ends in an LWORD. Parts are space-joined.
  von : LWORD
      | von LWORD { result = val.join(' ') }
      | von u_words LWORD { result = val.join(' ') }

  # The family-name part.
  last : LWORD | u_words

  # The part after the COMMA in sort order. With one segment the result is
  # [nil, words]; with two comma-separated segments it is [first, second].
  first : opt_words { result = [nil,val[0]] }
        | opt_words COMMA opt_words { result = [val[0],val[2]] }

  # One or more u_word tokens, space-joined.
  u_words : u_word
          | u_words u_word { result = val.join(' ') }

  u_word : UWORD | PWORD

  # One or more words of any kind, space-joined.
  words : word
        | words word { result = val.join(' ') }

  opt_words : /* empty */ | words

  word : LWORD | UWORD | PWORD
84
+
85
+ ---- header
86
+ require 'singleton'
87
+ require 'strscan'
88
+
89
+ ---- inner
90
+
91
+ include Singleton
92
+
93
+ attr_reader :options
94
+
95
# Creates the parser with an empty scanner and the default configuration.
#
# The options hash holds the debug flag, the comma string and the patterns
# used to recognize name separators, titles and appellations.
def initialize
  defaults = {
    :debug => false,
    :comma => ',',
    :separator => /\s*(\band\b|\&)\s*/i,
    :title => /\s*\b(sir|lord|(prof|dr|md|ph\.?d)\.?)(\s+|$)/i,
    :appellation => /\s*\b((mrs?|ms|fr|hr)\.?|miss|herr|frau)(\s+|$)/i
  }

  @input = StringScanner.new('')
  @options = defaults
end
104
+
105
# Tells whether debug output is enabled, either through the :debug option
# or through the DEBUG environment variable.
#
# @return [Object] the :debug option if truthy, otherwise ENV['DEBUG']
def debug?
  enabled = options[:debug]
  enabled || ENV['DEBUG']
end
108
+
109
# @return [Object] the configured :separator option, which the tokenizer
#   scans for to produce AND tokens
def separator
  @options[:separator]
end
112
+
113
# @return [Object] the configured :comma option (',' by default)
def comma
  @options[:comma]
end
116
+
117
# @return [Object] the configured :title option, which the tokenizer scans
#   for to produce TITLE tokens
def title
  @options[:title]
end
120
+
121
# @return [Object] the configured :appellation option, which the tokenizer
#   scans for to produce APPELLATION tokens
def appellation
  @options[:appellation]
end
124
+
125
# Lenient variant of #parse!: any StandardError raised while parsing is
# swallowed and an empty list is returned instead. The error message is
# printed as a warning when debugging is enabled.
#
# @param input [String] the name or names to be parsed
# @return [Array] the result of #parse!, or [] if parsing failed
def parse(input)
  begin
    parse!(input)
  rescue StandardError => error
    warn error.message if debug?
    []
  end
end
131
+
132
# Loads the stripped string into the scanner and runs the generated parser.
#
# @param string [String] the name or names to be parsed
# @return [Object] whatever do_parse returns
def parse!(string)
  @yydebug = debug?

  source = string.strip
  input.string = source

  do_parse
end
137
+
138
+ private
139
+
140
# Scans the next token from the input for the generated parser.
#
# The patterns are tried in a fixed order: name separator, comma, plain
# whitespace (skipped), title, appellation, upper-case word, lower-case
# word, brace-protected word and finally a quoted nickname.
#
# The configured comma is treated as a literal string: it is escaped before
# being used both as the COMMA delimiter and as the character that ends a
# word. Previously the COMMA token was hard-coded to ',' so a custom comma
# option could never be tokenized.
#
# @return [Array, nil] a [type, value] pair, or nil at the end of input
# @raise [ArgumentError] if the remaining input matches no token pattern
def next_token
  return nil if input.nil? || input.eos?

  delimiter = Regexp.escape(comma.to_s)

  # An empty delimiter would match everywhere; keep the historic ',' then.
  comma_pattern = delimiter.empty? ? /\s*,\s*/ : /\s*#{delimiter}\s*/

  case
  when input.scan(separator)
    [:AND, nil]
  when input.scan(comma_pattern)
    [:COMMA, nil]
  when input.scan(/\s+/)
    # Whitespace carries no meaning on its own; move on to the next token.
    next_token
  when input.scan(title)
    [:TITLE, input.matched.strip]
  when input.scan(appellation)
    [:APPELLATION, input.matched.strip]
  when input.scan(/((\\\w+)?\{[^\}]*\})*[[:upper:]][^\s#{delimiter}]*/)
    [:UWORD, input.matched]
  when input.scan(/((\\\w+)?\{[^\}]*\})*[[:lower:]][^\s#{delimiter}]*/)
    [:LWORD, input.matched]
  when input.scan(/(\\\w+)?\{[^\}]*\}[^\s#{delimiter}]*/)
    [:PWORD, input.matched]
  when input.scan(/('[^'\n]+')|("[^"\n]+")/)
    # Drop the surrounding quotes.
    [:NICK, input.matched[1...-1]]
  else
    raise ArgumentError,
      "Failed to parse name #{input.string.inspect}: unmatched data at offset #{input.pos}"
  end
end
167
+
168
# Error hook: reports the offending value together with the parser stack.
#
# @param tid [Object] the token id (unused)
# @param value [Object] the value of the unexpected token
# @param stack [Object] the parser stack at the time of the error
# @raise [ArgumentError] always
def on_error(tid, value, stack)
  message = "Failed to parse name: unexpected '#{value}' at #{stack.inspect}"
  raise ArgumentError, message
end
172
+
173
+ attr_reader :input
174
+
175
+ # -*- racc -*-
@@ -0,0 +1,47 @@
1
+
2
+ # Namae is a parser for human names. It recognizes personal names of
3
+ # various cultural backgrounds and tries to split them into their
4
+ # component parts (e.g., given and family names, honorifics etc.).
5
+ #
6
+ # The main use case of Namae is to use the {Namae.parse .parse} or
7
+ # {Namae.parse! .parse!} method to parse a string of names and return
8
+ # a list of {Namae::Name Name} objects.
9
+ #
10
+ # @example Name parsing
11
+ # Namae.parse('Yukihiro "Matz" Matsumoto')
12
+ # #=> [#<Name family="Matsumoto" given="Yukihiro" nick="Matz">]
13
+ #
14
+ # Namae.parse('Torvalds, Linus and Cox, Alan')
15
+ # #=> [#<Name family="Torvalds" given="Linus">, #<Name family="Cox" given="Alan">]
16
+ #
17
module Namae

  # Parses the passed-in string and returns a list of names. Delegates to
  # the parser's lenient #parse, which returns an empty list for input that
  # cannot be parsed instead of raising an error.
  #
  # @see parse!
  #
  # @param names [String] the name or names to be parsed
  # @return [Array] the list of parsed names
  def parse(names)
    Parser.instance.parse(names)
  end

  # Parses the passed-in string and returns a list of names. Delegates to
  # the parser's strict #parse!.
  #
  # @param names [String] the name or names to be parsed
  # @return [Array] the list of parsed names
  #
  # @raise [ArgumentError] if the string cannot be parsed.
  def parse!(names)
    Parser.instance.parse!(names)
  end

  # @return [Hash] the parser's current configuration.
  def options
    Parser.instance.options
  end

  # Expose the methods above as Namae.parse, Namae.parse! and Namae.options.
  module_function :parse, :parse!, :options

end
@@ -0,0 +1,10 @@
1
module Namae
  # The gem's version, as individual components and as a dotted string.
  module Version
    MAJOR, MINOR, PATCH, BUILD = 0, 1, 0, nil

    # Components that are nil (e.g. BUILD) are left out of the string.
    STRING = [MAJOR, MINOR, PATCH, BUILD].reject(&:nil?).join('.').freeze
  end
end
@@ -0,0 +1,80 @@
1
+ # Generated by jeweler
2
+ # DO NOT EDIT THIS FILE DIRECTLY
3
+ # Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
4
+ # -*- encoding: utf-8 -*-
5
+
6
# Gem specification for namae 0.1.0: metadata, packaged files and the
# tools needed for development.
Gem::Specification.new do |s|
  s.name = "namae"
  s.version = "0.1.0"

  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
  s.authors = ["Sylvester Keil", "Dan Collis-Puro"]
  s.date = "2012-06-04"
  s.description = " Namae is a parser for human names. It recognizes personal names of various cultural backgrounds and tries to split them into their component parts (e.g., given and family names, honorifics etc.). "
  s.email = ["sylvester@keil.or.at", "dan@collispuro.com"]
  s.extra_rdoc_files = [
    "LICENSE",
    "README.md"
  ]
  # Explicit list of every file shipped in the gem.
  s.files = [
    ".autotest",
    ".document",
    ".rspec",
    ".simplecov",
    ".travis.yml",
    ".yardopts",
    "Gemfile",
    "LICENSE",
    "README.md",
    "Rakefile",
    "cucumber.yml",
    "features/bibtex.feature",
    "features/examples.feature",
    "features/step_definitions/namae_steps.rb",
    "features/support/env.rb",
    "lib/namae.rb",
    "lib/namae/name.rb",
    "lib/namae/parser.rb",
    "lib/namae/parser.y",
    "lib/namae/utility.rb",
    "lib/namae/version.rb",
    "namae.gemspec",
    "spec/namae/name_spec.rb",
    "spec/namae/parser_spec.rb",
    "spec/namae/utility_spec.rb",
    "spec/spec_helper.rb"
  ]
  s.homepage = "https://github.com/berkmancenter/namae"
  s.licenses = ["AGPL"]
  s.require_paths = ["lib"]
  s.rubygems_version = "1.8.10"
  s.summary = "Namae parses personal names and splits them into their component parts."

  # The same six dependencies are declared in all three branches below. They
  # are registered as development dependencies only when the specification
  # supports versioning and RubyGems is at least 1.2.0; otherwise they fall
  # back to plain dependencies (presumably because older RubyGems lack
  # add_development_dependency -- confirm against the RubyGems changelog).
  if s.respond_to? :specification_version then
    s.specification_version = 3

    if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
      s.add_development_dependency(%q<racc>, ["~> 1.4.8"])
      s.add_development_dependency(%q<rdoc>, ["~> 3.12"])
      s.add_development_dependency(%q<bundler>, ["~> 1.1"])
      s.add_development_dependency(%q<simplecov>, [">= 0"])
      s.add_development_dependency(%q<ZenTest>, ["~> 4.8.0"])
      s.add_development_dependency(%q<jeweler>, ["~> 1.8.3"])
    else
      s.add_dependency(%q<racc>, ["~> 1.4.8"])
      s.add_dependency(%q<rdoc>, ["~> 3.12"])
      s.add_dependency(%q<bundler>, ["~> 1.1"])
      s.add_dependency(%q<simplecov>, [">= 0"])
      s.add_dependency(%q<ZenTest>, ["~> 4.8.0"])
      s.add_dependency(%q<jeweler>, ["~> 1.8.3"])
    end
  else
    s.add_dependency(%q<racc>, ["~> 1.4.8"])
    s.add_dependency(%q<rdoc>, ["~> 3.12"])
    s.add_dependency(%q<bundler>, ["~> 1.1"])
    s.add_dependency(%q<simplecov>, [">= 0"])
    s.add_dependency(%q<ZenTest>, ["~> 4.8.0"])
    s.add_dependency(%q<jeweler>, ["~> 1.8.3"])
  end
end
80
+
@@ -0,0 +1,65 @@
1
# Specs for Namae::Name: construction, attribute access, initials, merging
# and inspection.
module Namae
  describe 'Name' do

    describe '.new' do

      it 'returns an empty name by default' do
        Name.new.should be_empty
      end

      it 'sets all passed-in attributes' do
        Name.new(:given => 'Foo').given.should == 'Foo'
      end

      it 'ignores unknown attributes' do
        Name.new(:foo => 'bar').should be_empty
      end

    end

    describe '#values_at' do
      it 'returns an array with the given values' do
        Name.new(:family => 'foo').values_at(:family).should == ['foo']
      end

      # Replaces a verbatim duplicate of the example above: several keys are
      # requested in a different order than they were passed to .new.
      it 'returns the values in the order the keys are given' do
        Name.new(:family => 'foo', :given => 'bar').values_at(:given, :family).should == ['bar', 'foo']
      end
    end

    describe '#initials' do
      it "returns the name's initials" do
        Name.new(:family => 'Poe', :given => 'Edgar A.').initials.should == 'E.A.P.'
      end

      it "returns the name's initials but leaves the family name expanded" do
        Name.new(:family => 'Poe', :given => 'Edgar A.').initials(:expand => true).should == 'E.A. Poe'
      end
    end

    describe '#merge' do
      it 'merges the attributes in the given hash into the name' do
        Name.new.merge(:family => 'foo').family.should == 'foo'
      end

      it 'merges the attributes in the given name into the name' do
        Name.new.merge(Name.new(:family => 'foo')).family.should == 'foo'
      end

      it 'ignores unknown attributes' do
        Name.new.merge(:foo => 'bar').should be_empty
      end

      it 'ignores nil values' do
        Name.new(:family => 'foo').merge(:family => nil).family.should == 'foo'
      end
    end

    describe '#inspect' do
      it 'returns the name as a string' do
        Name.new(:given => 'Ichiro').inspect.should == '#<Name given="Ichiro">'
      end
    end

  end
end
@@ -0,0 +1,107 @@
1
# Specs for Namae::Parser: the singleton contract, the tokenizer
# (#next_token) and strict parsing (#parse!).
module Namae
  describe 'Parser' do

    it 'does not respond to .new' do
      Parser.should_not respond_to(:new)
    end

    describe '.instance' do
      let(:parser) { Parser.instance }

      it 'returns the parser' do
        parser.should be_a(Parser)
      end

      describe '#next_token' do
        describe 'when the input is empty' do
          # The parser is a shared singleton, so clear whatever input an
          # earlier example left in the scanner instead of relying on order.
          before { parser.send(:input).string = '' }
          it 'returns nil' do
            parser.send(:next_token).should be_nil
          end
        end

        describe 'when the next input is " and "' do
          before { parser.send(:input).string = ' and ' }
          it 'returns an AND token' do
            parser.send(:next_token).should == [:AND, nil]
          end
        end

        describe 'when the next input is " & "' do
          before { parser.send(:input).string = ' & ' }
          it 'returns an AND token' do
            parser.send(:next_token).should == [:AND, nil]
          end
        end

        describe 'when the next input is " , "' do
          before { parser.send(:input).string = ' , ' }
          it 'returns a COMMA token' do
            parser.send(:next_token).should == [:COMMA, nil]
          end
        end

        describe 'when the next input is " \'foo bar\' "' do
          before { parser.send(:input).string = " 'foo bar' " }
          it 'returns a NICK token' do
            parser.send(:next_token).should == [:NICK, 'foo bar']
          end
        end

        %w{Mr. Mr Mrs. Ms Herr Frau Miss}.each do |appellation|
          describe "the next token is #{appellation.inspect}" do
            before { parser.send(:input).string = appellation }
            it 'returns an APPELLATION token' do
              parser.send(:next_token).should == [:APPELLATION, appellation]
            end
          end
        end

      end

      describe '#parse!' do
        it 'returns an empty list by default' do
          parser.parse!('').should be_empty
        end

        it 'returns a list of names' do
          parser.parse!('foo')[0].should be_a(Name)
        end

        describe 'when parsing a single name' do

          it 'treats "Ichiro" as a given name' do
            parser.parse!('Ichiro')[0].given.should == 'Ichiro'
          end

          it 'treats "Lord Byron" as a title and family name' do
            parser.parse!('Lord Byron')[0].values_at(:family, :title).should == ['Byron', 'Lord']
          end

          it 'parses given and family part name in "Ichiro Suzuki"' do
            parser.parse!('Ichiro Suzuki')[0].values_at(:given, :family).should == %w{Ichiro Suzuki}
          end

          it 'parses given, nick and family part name in "Yukihiro \'Matz\' Matsumoto"' do
            parser.parse!("Yukihiro 'Matz' Matsumoto")[0].values_at(:given, :family, :nick).should == %w{Yukihiro Matsumoto Matz}
          end

          it 'parses given, nick and family part name in \'Yukihiro "Matz" Matsumoto\'' do
            parser.parse!('Yukihiro "Matz" Matsumoto')[0].values_at(:given, :family, :nick).should == %w{Yukihiro Matsumoto Matz}
          end

          it 'parses given and family name in "Poe, Edgar A."' do
            parser.parse!('Poe, Edgar A.')[0].values_at(:given, :family).should == ['Edgar A.', 'Poe']
          end

          %w{Mr. Mr Mrs. Ms Herr Frau Miss}.each do |appellation|
            it "recognizes #{appellation.inspect} as an appellation" do
              parser.parse!([appellation, 'Edgar A. Poe'].join(' '))[0].appellation.should == appellation
            end
          end

        end
      end

    end
  end
end