namae 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.autotest +21 -0
- data/.document +6 -0
- data/.rspec +2 -0
- data/.simplecov +2 -0
- data/.travis.yml +11 -0
- data/.yardopts +3 -0
- data/Gemfile +25 -0
- data/LICENSE +661 -0
- data/README.md +133 -0
- data/Rakefile +62 -0
- data/cucumber.yml +1 -0
- data/features/bibtex.feature +78 -0
- data/features/examples.feature +24 -0
- data/features/step_definitions/namae_steps.rb +22 -0
- data/features/support/env.rb +19 -0
- data/lib/namae.rb +5 -0
- data/lib/namae/name.rb +119 -0
- data/lib/namae/parser.rb +470 -0
- data/lib/namae/parser.y +175 -0
- data/lib/namae/utility.rb +47 -0
- data/lib/namae/version.rb +10 -0
- data/namae.gemspec +80 -0
- data/spec/namae/name_spec.rb +65 -0
- data/spec/namae/parser_spec.rb +107 -0
- data/spec/namae/utility_spec.rb +21 -0
- data/spec/spec_helper.rb +19 -0
- metadata +147 -0
data/lib/namae/parser.y
ADDED
@@ -0,0 +1,175 @@
|
|
1
|
+
# -*- racc -*-
|
2
|
+
|
3
|
+
class Namae::Parser
|
4
|
+
|
5
|
+
token COMMA UWORD LWORD PWORD NICK AND APPELLATION TITLE
|
6
|
+
|
7
|
+
expect 0
|
8
|
+
|
9
|
+
rule
|
10
|
+
|
11
|
+
names : { result = [] }
|
12
|
+
| name { result = [val[0]] }
|
13
|
+
| names AND name { result = val[0] << val[2] }
|
14
|
+
|
15
|
+
name : word { result = Name.new(:given => val[0]) }
|
16
|
+
| display_order
|
17
|
+
| honorific word { result = val[0].merge(:family => val[1]) }
|
18
|
+
| honorific display_order { result = val[1].merge(val[0]) }
|
19
|
+
| sort_order
|
20
|
+
|
21
|
+
honorific : APPELLATION { result = Name.new(:appellation => val[0]) }
|
22
|
+
| TITLE { result = Name.new(:title => val[0]) }
|
23
|
+
|
24
|
+
display_order : u_words word
|
25
|
+
{
|
26
|
+
result = Name.new(:given => val[0], :family => val[1])
|
27
|
+
}
|
28
|
+
| u_words NICK last
|
29
|
+
{
|
30
|
+
result = Name.new(:given => val[0], :nick => val[1], :family => val[2])
|
31
|
+
}
|
32
|
+
| u_words NICK von last
|
33
|
+
{
|
34
|
+
result = Name.new(:given => val[0], :nick => val[1],
|
35
|
+
:particle => val[2], :family => val[3])
|
36
|
+
}
|
37
|
+
| u_words von last
|
38
|
+
{
|
39
|
+
result = Name.new(:given => val[0], :particle => val[1],
|
40
|
+
:family => val[2])
|
41
|
+
}
|
42
|
+
| von last
|
43
|
+
{
|
44
|
+
result = Name.new(:particle => val[0], :family => val[1])
|
45
|
+
}
|
46
|
+
|
47
|
+
sort_order : last COMMA first
|
48
|
+
{
|
49
|
+
result = Name.new(:family => val[0], :suffix => val[2][0],
|
50
|
+
:given => val[2][1])
|
51
|
+
}
|
52
|
+
| von last COMMA first
|
53
|
+
{
|
54
|
+
result = Name.new(:particle => val[0], :family => val[1],
|
55
|
+
:suffix => val[3][0], :given => val[3][1])
|
56
|
+
}
|
57
|
+
| u_words von last COMMA first
|
58
|
+
{
|
59
|
+
result = Name.new(:particle => val[0,2].join(' '), :family => val[2],
|
60
|
+
:suffix => val[4][0], :given => val[4][1])
|
61
|
+
}
|
62
|
+
;
|
63
|
+
|
64
|
+
von : LWORD
|
65
|
+
| von LWORD { result = val.join(' ') }
|
66
|
+
| von u_words LWORD { result = val.join(' ') }
|
67
|
+
|
68
|
+
last : LWORD | u_words
|
69
|
+
|
70
|
+
first : opt_words { result = [nil,val[0]] }
|
71
|
+
| opt_words COMMA opt_words { result = [val[0],val[2]] }
|
72
|
+
|
73
|
+
u_words : u_word
|
74
|
+
| u_words u_word { result = val.join(' ') }
|
75
|
+
|
76
|
+
u_word : UWORD | PWORD
|
77
|
+
|
78
|
+
words : word
|
79
|
+
| words word { result = val.join(' ') }
|
80
|
+
|
81
|
+
opt_words : /* empty */ | words
|
82
|
+
|
83
|
+
word : LWORD | UWORD | PWORD
|
84
|
+
|
85
|
+
---- header
|
86
|
+
require 'singleton'
|
87
|
+
require 'strscan'
|
88
|
+
|
89
|
+
---- inner
|
90
|
+
|
91
|
+
include Singleton
|
92
|
+
|
93
|
+
attr_reader :options
|
94
|
+
|
95
|
+
# Sets up the parser with an empty scanner and the default options.
#
# The options hold the debug flag, the comma character, and the patterns
# used to recognize name separators, titles and appellations.
def initialize
  @input = StringScanner.new('')

  @options = {
    :debug => false,
    :comma => ',',
    :separator => /\s*(\band\b|\&)\s*/i,
    :title => /\s*\b(sir|lord|(prof|dr|md|ph\.?d)\.?)(\s+|$)/i,
    :appellation => /\s*\b((mrs?|ms|fr|hr)\.?|miss|herr|frau)(\s+|$)/i
  }
end
|
104
|
+
|
105
|
+
# @return the :debug option if it is set, otherwise the value of the
#   DEBUG environment variable (nil when that is not set either)
def debug?
  return options[:debug] if options[:debug]

  ENV['DEBUG']
end

# @return the :separator option (the pattern scanned for the AND token)
def separator
  options[:separator]
end

# @return the :comma option (interpolated into the word patterns)
def comma
  options[:comma]
end

# @return the :title option (the pattern scanned for the TITLE token)
def title
  options[:title]
end

# @return the :appellation option (the pattern scanned for the
#   APPELLATION token)
def appellation
  options[:appellation]
end
|
124
|
+
|
125
|
+
# Parses the passed-in string like #parse!, but never raises a
# StandardError: on failure the error message is printed as a warning
# (in debug mode only) and an empty list is returned instead.
#
# @param input [String] the name or names to be parsed
# @return [Array] the list of parsed names, or [] if parsing failed
def parse(input)
  begin
    parse!(input)
  rescue => e
    warn e.message if debug?
    []
  end
end
|
131
|
+
|
132
|
+
# Loads the stripped string into the scanner and runs the parser over it.
# Errors raised while scanning or parsing are not rescued here.
#
# @param string [String] the name or names to be parsed
# @return the result of do_parse
def parse!(string)
  @yydebug = debug?

  stripped = string.strip
  input.string = stripped

  do_parse
end
|
137
|
+
|
138
|
+
private
|
139
|
+
|
140
|
+
# Returns the next token from the scanner as a [type, value] pair, or nil
# once the input is exhausted (or missing).
#
# Alternatives are tried in order: name separator, comma, plain whitespace
# (skipped), title, appellation, upper-case word, lower-case word,
# brace-delimited word (PWORD) and quoted nickname.
#
# The comma token is built from the configured comma option, so it always
# agrees with the character the word patterns treat as a word boundary.
#
# @return [Array, nil] the token pair, or nil at the end of input
# @raise [ArgumentError] if nothing matches at the current position
def next_token
  mark = comma.to_s
  mark = ',' if mark.empty? # an empty delimiter would match everywhere

  # Escaped so that a custom comma is never misread as regexp syntax.
  delimiter = Regexp.escape(mark)

  case
  when input.nil?, input.eos?
    nil
  when input.scan(separator)
    [:AND, nil]
  when input.scan(/\s*#{delimiter}\s*/)
    [:COMMA, nil]
  when input.scan(/\s+/)
    # Whitespace carries no meaning of its own; move on to the next token.
    next_token
  when input.scan(title)
    [:TITLE, input.matched.strip]
  when input.scan(appellation)
    [:APPELLATION, input.matched.strip]
  when input.scan(/((\\\w+)?\{[^\}]*\})*[[:upper:]][^\s#{delimiter}]*/)
    [:UWORD, input.matched]
  when input.scan(/((\\\w+)?\{[^\}]*\})*[[:lower:]][^\s#{delimiter}]*/)
    [:LWORD, input.matched]
  when input.scan(/(\\\w+)?\{[^\}]*\}[^\s#{delimiter}]*/)
    [:PWORD, input.matched]
  when input.scan(/('[^'\n]+')|("[^"\n]+")/)
    # Drop the surrounding quotes.
    [:NICK, input.matched[1...-1]]
  else
    raise ArgumentError,
      "Failed to parse name #{input.string.inspect}: unmatched data at offset #{input.pos}"
  end
end
|
167
|
+
|
168
|
+
# Error hook: reports the unexpected value together with the current
# parser stack.
#
# @param tid the id of the offending token (not used in the message)
# @param value the value of the offending token
# @param stack the parser stack at the time of the error
# @raise [ArgumentError] always
def on_error(tid, value, stack)
  message = "Failed to parse name: unexpected '#{value}' at #{stack.inspect}"
  raise ArgumentError, message
end
|
172
|
+
|
173
|
+
attr_reader :input
|
174
|
+
|
175
|
+
# -*- racc -*-
|
@@ -0,0 +1,47 @@
|
|
1
|
+
|
2
|
+
# Namae is a parser for human names. It recognizes personal names of
|
3
|
+
# various cultural backgrounds and tries to split them into their
|
4
|
+
# component parts (e.g., given and family names, honorifics etc.).
|
5
|
+
#
|
6
|
+
# The main use case of Namae is to use the {Namae.parse .parse} or
|
7
|
+
# {Namae.parse! .parse!} method to parse a string of names and return
|
8
|
+
# a list of {Namae::Name Name} objects.
|
9
|
+
#
|
10
|
+
# @example Name parsing
|
11
|
+
# Namae.parse('Yukihiro "Matz" Matsumoto')
|
12
|
+
# #=> [#<Name family="Matsumoto" given="Yukihiro" nick="Matz">]
|
13
|
+
#
|
14
|
+
# Namae.parse('Torvalds, Linus and Cox, Alan')
|
15
|
+
# #=> [#<Name family="Torvalds" given="Linus">, #<Name family="Cox" given="Alan">]
|
16
|
+
#
|
17
|
+
module Namae

  # Parses the passed-in string and returns a list of names. Works like
  # {parse!}, except that bad input yields an empty list rather than an
  # error.
  #
  # @see parse!
  #
  # @param names [String] the name or names to be parsed
  # @return [Array] the list of parsed names
  def parse(names)
    parser = Parser.instance
    parser.parse(names)
  end

  # Parses the passed-in string and returns a list of names.
  #
  # @param names [String] the name or names to be parsed
  # @return [Array] the list of parsed names
  #
  # @raise [ArgumentError] if the string cannot be parsed.
  def parse!(names)
    parser = Parser.instance
    parser.parse!(names)
  end

  # @return [Hash] the parser's current configuration.
  def options
    parser = Parser.instance
    parser.options
  end

  # Expose the helpers above as module functions (Namae.parse etc.).
  module_function :parse, :parse!, :options

end
|
data/namae.gemspec
ADDED
@@ -0,0 +1,80 @@
|
|
1
|
+
# Generated by jeweler
|
2
|
+
# DO NOT EDIT THIS FILE DIRECTLY
|
3
|
+
# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
|
4
|
+
# -*- encoding: utf-8 -*-
|
5
|
+
|
6
|
+
Gem::Specification.new do |s|
|
7
|
+
s.name = "namae"
|
8
|
+
s.version = "0.1.0"
|
9
|
+
|
10
|
+
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
|
+
s.authors = ["Sylvester Keil", "Dan Collis-Puro"]
|
12
|
+
s.date = "2012-06-04"
|
13
|
+
s.description = " Namae is a parser for human names. It recognizes personal names of various cultural backgrounds and tries to split them into their component parts (e.g., given and family names, honorifics etc.). "
|
14
|
+
s.email = ["sylvester@keil.or.at", "dan@collispuro.com"]
|
15
|
+
s.extra_rdoc_files = [
|
16
|
+
"LICENSE",
|
17
|
+
"README.md"
|
18
|
+
]
|
19
|
+
s.files = [
|
20
|
+
".autotest",
|
21
|
+
".document",
|
22
|
+
".rspec",
|
23
|
+
".simplecov",
|
24
|
+
".travis.yml",
|
25
|
+
".yardopts",
|
26
|
+
"Gemfile",
|
27
|
+
"LICENSE",
|
28
|
+
"README.md",
|
29
|
+
"Rakefile",
|
30
|
+
"cucumber.yml",
|
31
|
+
"features/bibtex.feature",
|
32
|
+
"features/examples.feature",
|
33
|
+
"features/step_definitions/namae_steps.rb",
|
34
|
+
"features/support/env.rb",
|
35
|
+
"lib/namae.rb",
|
36
|
+
"lib/namae/name.rb",
|
37
|
+
"lib/namae/parser.rb",
|
38
|
+
"lib/namae/parser.y",
|
39
|
+
"lib/namae/utility.rb",
|
40
|
+
"lib/namae/version.rb",
|
41
|
+
"namae.gemspec",
|
42
|
+
"spec/namae/name_spec.rb",
|
43
|
+
"spec/namae/parser_spec.rb",
|
44
|
+
"spec/namae/utility_spec.rb",
|
45
|
+
"spec/spec_helper.rb"
|
46
|
+
]
|
47
|
+
s.homepage = "https://github.com/berkmancenter/namae"
|
48
|
+
s.licenses = ["AGPL"]
|
49
|
+
s.require_paths = ["lib"]
|
50
|
+
s.rubygems_version = "1.8.10"
|
51
|
+
s.summary = "Namae parses personal names and splits them into their component parts."
|
52
|
+
|
53
|
+
if s.respond_to? :specification_version then
|
54
|
+
s.specification_version = 3
|
55
|
+
|
56
|
+
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
57
|
+
s.add_development_dependency(%q<racc>, ["~> 1.4.8"])
|
58
|
+
s.add_development_dependency(%q<rdoc>, ["~> 3.12"])
|
59
|
+
s.add_development_dependency(%q<bundler>, ["~> 1.1"])
|
60
|
+
s.add_development_dependency(%q<simplecov>, [">= 0"])
|
61
|
+
s.add_development_dependency(%q<ZenTest>, ["~> 4.8.0"])
|
62
|
+
s.add_development_dependency(%q<jeweler>, ["~> 1.8.3"])
|
63
|
+
else
|
64
|
+
s.add_dependency(%q<racc>, ["~> 1.4.8"])
|
65
|
+
s.add_dependency(%q<rdoc>, ["~> 3.12"])
|
66
|
+
s.add_dependency(%q<bundler>, ["~> 1.1"])
|
67
|
+
s.add_dependency(%q<simplecov>, [">= 0"])
|
68
|
+
s.add_dependency(%q<ZenTest>, ["~> 4.8.0"])
|
69
|
+
s.add_dependency(%q<jeweler>, ["~> 1.8.3"])
|
70
|
+
end
|
71
|
+
else
|
72
|
+
s.add_dependency(%q<racc>, ["~> 1.4.8"])
|
73
|
+
s.add_dependency(%q<rdoc>, ["~> 3.12"])
|
74
|
+
s.add_dependency(%q<bundler>, ["~> 1.1"])
|
75
|
+
s.add_dependency(%q<simplecov>, [">= 0"])
|
76
|
+
s.add_dependency(%q<ZenTest>, ["~> 4.8.0"])
|
77
|
+
s.add_dependency(%q<jeweler>, ["~> 1.8.3"])
|
78
|
+
end
|
79
|
+
end
|
80
|
+
|
@@ -0,0 +1,65 @@
|
|
1
|
+
# Specs for Namae::Name: construction, attribute lookup, initials,
# merging and string representation.
module Namae
  describe 'Name' do

    describe '.new' do

      it 'returns an empty name by default' do
        Name.new.should be_empty
      end

      it 'sets all passed-in attributes' do
        Name.new(:given => 'Foo').given.should == 'Foo'
      end

      it 'ignores unknown attributes' do
        Name.new(:foo => 'bar').should be_empty
      end

    end

    describe '#values_at' do
      it 'returns an array with the given values' do
        Name.new(:family => 'foo').values_at(:family).should == ['foo']
      end

      # Replaces a verbatim copy of the example above: several attributes
      # are returned in the order in which they were requested.
      it 'returns the values in the order of the given attributes' do
        Name.new(:family => 'foo', :given => 'bar').values_at(:given, :family).should == ['bar', 'foo']
      end
    end

    describe '#initials' do
      it "returns the name's initials" do
        Name.new(:family => 'Poe', :given => 'Edgar A.').initials.should == 'E.A.P.'
      end

      it "returns the name's initials but leaves the family name expanded" do
        Name.new(:family => 'Poe', :given => 'Edgar A.').initials(:expand => true).should == 'E.A. Poe'
      end
    end

    describe '#merge' do
      it 'merges the attributes in the given hash into the name' do
        Name.new.merge(:family => 'foo').family.should == 'foo'
      end

      it 'merges the attributes in the given name into the name' do
        Name.new.merge(Name.new(:family => 'foo')).family.should == 'foo'
      end

      it 'ignores unknown attributes' do
        Name.new.merge(:foo => 'bar').should be_empty
      end

      it 'ignores nil values' do
        Name.new(:family => 'foo').merge(:family => nil).family.should == 'foo'
      end
    end

    describe '#inspect' do
      it 'returns the name as a string' do
        Name.new(:given => 'Ichiro').inspect.should == '#<Name given="Ichiro">'
      end
    end

  end
end
|
@@ -0,0 +1,107 @@
|
|
1
|
+
# Specs for Namae::Parser: singleton access, tokenizing (#next_token)
# and parsing of single names (#parse!).
module Namae
  describe 'Parser' do

    it 'does not respond to .new' do
      Parser.should_not respond_to(:new)
    end

    describe '.instance' do
      let(:parser) { Parser.instance }

      it 'returns the parser' do
        parser.should be_a(Parser)
      end

      describe '#next_token' do
        describe 'when the input is empty' do
          # The parser is a shared singleton, so reset its input explicitly
          # instead of relying on what an earlier example left behind.
          before { parser.send(:input).string = '' }
          it 'returns nil' do
            parser.send(:next_token).should be_nil
          end
        end

        describe 'when the next input is " and "' do
          before { parser.send(:input).string = ' and ' }
          it 'returns an AND token' do
            parser.send(:next_token).should == [:AND, nil]
          end
        end

        describe 'when the next input is " & "' do
          before { parser.send(:input).string = ' & ' }
          it 'returns an AND token' do
            parser.send(:next_token).should == [:AND, nil]
          end
        end

        describe 'when the next input is " , "' do
          before { parser.send(:input).string = ' , ' }
          it 'returns a COMMA token' do
            parser.send(:next_token).should == [:COMMA, nil]
          end
        end

        describe 'when the next input is " \'foo bar\' "' do
          before { parser.send(:input).string = " 'foo bar' " }
          it 'returns a NICK token' do
            parser.send(:next_token).should == [:NICK, 'foo bar']
          end
        end

        %w{Mr. Mr Mrs. Ms Herr Frau Miss}.each do |appellation|
          describe "the next token is #{appellation.inspect}" do
            before { parser.send(:input).string = appellation }
            it 'returns an APPELLATION token' do
              parser.send(:next_token).should == [:APPELLATION, appellation]
            end
          end
        end

      end

      describe '#parse!' do
        it 'returns an empty list by default' do
          parser.parse!('').should be_empty
        end

        it 'returns a list of names' do
          parser.parse!('foo')[0].should be_a(Name)
        end

        describe 'when parsing a single name' do

          it 'treats "Ichiro" as a given name' do
            parser.parse!('Ichiro')[0].given.should == 'Ichiro'
          end

          it 'treats "Lord Byron" as a title and family name' do
            parser.parse!('Lord Byron')[0].values_at(:family, :title).should == ['Byron', 'Lord']
          end

          it 'parses given and family part name in "Ichiro Suzuki"' do
            parser.parse!('Ichiro Suzuki')[0].values_at(:given, :family).should == %w{Ichiro Suzuki}
          end

          it 'parses given, nick and family part name in "Yukihiro \'Matz\' Matsumoto"' do
            parser.parse!("Yukihiro 'Matz' Matsumoto")[0].values_at(:given, :family, :nick).should == %w{Yukihiro Matsumoto Matz}
          end

          it 'parses given, nick and family part name in \'Yukihiro "Matz" Matsumoto\'' do
            parser.parse!('Yukihiro "Matz" Matsumoto')[0].values_at(:given, :family, :nick).should == %w{Yukihiro Matsumoto Matz}
          end

          it 'parses given and family name in "Poe, Edgar A."' do
            parser.parse!('Poe, Edgar A.')[0].values_at(:given, :family).should == ['Edgar A.', 'Poe']
          end

          %w{Mr. Mr Mrs. Ms Herr Frau Miss}.each do |appellation|
            it "recognizes #{appellation.inspect} as an appellation" do
              parser.parse!([appellation, 'Edgar A. Poe'].join(' '))[0].appellation.should == appellation
            end
          end

        end
      end

    end
  end
end
|