nomener 0.2.8 → 0.2.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/nomener.rb +6 -6
- data/lib/nomener/base.rb +34 -0
- data/lib/nomener/cleaner.rb +111 -0
- data/lib/nomener/compounders.rb +3 -3
- data/lib/nomener/name.rb +41 -46
- data/lib/nomener/parser.rb +80 -142
- data/lib/nomener/suffixes.rb +29 -12
- data/lib/nomener/titles.rb +68 -46
- data/lib/nomener/version.rb +2 -1
- data/spec/nomener/nomener_helper_spec.rb +5 -5
- data/spec/nomener/nomener_parser_spec.rb +0 -26
- data/spec/nomener/nomener_suffixes_spec.rb +14 -0
- data/spec/nomener/nomener_titles_spec.rb +233 -0
- metadata +4 -5
- data/lib/nomener/helper.rb +0 -41
- data/spec/nomener/titles_spec.rb +0 -227
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA1:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 1a5d4f2bd5060c45197d9b93c147ecf4975b66a4
|
|
4
|
+
data.tar.gz: 82d2f6e476c9fab3b3c18e1203fb7fc57cb7fe5d
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: a84f031b213de553071f89fbfc16d0efab68e2b69471e32c2c92aa35598c80f2e2a1520a0ecbd0b88c4262c605f77c879b57a085ee33578bfa548483b1ccee54
|
|
7
|
+
data.tar.gz: 0783c6281f125eb7808772518e7bc3d0a26335625b3bb7d3340c59e4eb5f4c0ebcc1355864660d6550ffcbc66183133f2ccce0756e3c4322e27b9b829644b87e
|
data/lib/nomener.rb
CHANGED
|
@@ -1,12 +1,13 @@
|
|
|
1
1
|
#-- encoding: UTF-8
|
|
2
|
-
require
|
|
3
|
-
require
|
|
4
|
-
require "nomener/parser"
|
|
2
|
+
require 'nomener/version'
|
|
3
|
+
require 'nomener/name'
|
|
5
4
|
|
|
5
|
+
# Base module for our names
|
|
6
6
|
module Nomener
|
|
7
|
+
# Error class raised for the few times we aren't able to parse a name
|
|
8
|
+
class ParseError < StandardError
|
|
9
|
+
end
|
|
7
10
|
|
|
8
|
-
class ParseError < StandardError; end
|
|
9
|
-
|
|
10
11
|
# Public: Convenience method to parse a name
|
|
11
12
|
#
|
|
12
13
|
# name - a string of a name to parse
|
|
@@ -15,5 +16,4 @@ module Nomener
|
|
|
15
16
|
def self.parse(name)
|
|
16
17
|
Name.new(name).parse
|
|
17
18
|
end
|
|
18
|
-
|
|
19
19
|
end
|
data/lib/nomener/base.rb
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
module Nomener
  # Module of constants and helper methods that are mixed into the other
  # Nomener modules and classes.
  module Base
    # Matches a single literal period. Kept as a named constant because
    # other code may reference it.
    PERIOD = /\./

    # Internal: a softer clean we keep re-using.
    # Turns periods into spaces, collapses runs of spaces and trims the ends.
    #
    # str - the string to dust off; nil (or any non-String) is coerced with
    #       to_s so a missing name part yields '' instead of raising
    #
    # Returns a new cleaned String. The argument itself is not modified.
    def dustoff(str)
      str.to_s.gsub(PERIOD, ' ').squeeze(' ').strip
    end

    # Internal: clean out a given string with a given pattern.
    # Modifies the given string in place: every match is removed from it.
    #
    # str     - the string to gut
    # pattern - the regex to cut with
    #
    # Returns a String of the removed matches, each stripped and joined by
    # a single space ('' when nothing matched).
    def gut!(str = '', pattern = / /)
      found = []
      str.gsub!(pattern) do |match|
        found << match.strip
        '' # replace the match with nothing
      end
      found.join(' ')
    end
  end
end
|
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
#-- encoding: UTF-8
|
|
2
|
+
|
|
3
|
+
# For Ruby 1.9.3, 2.0.0
|
|
4
|
+
rv = RUBY_VERSION.split('.')[(0..1)].join('')
|
|
5
|
+
require 'string-scrub' if rv >= '19' && rv < '21'
|
|
6
|
+
|
|
7
|
+
module Nomener
  # Module with helper functions to clean strings
  #
  # Currently exposes
  #   .reformat
  #   .cleanup!
  #
  module Cleaner
    # regex for stuff at the end we want to get out
    TRAILER_TRASH = /[,|\s]+$/

    # regex for name characters we aren't going to use
    # NOTE(review): the group-like tokens sit inside a character class, so
    # they are read as literal characters; left untouched to keep behavior.
    DIRTY_STUFF = /[^,'\-(?:\p{Alpha}(?<\.))\p{Alpha}\p{Blank}]/

    # Character-class body of what is always allowed in a name once quotes
    # have been reduced: letters, dash, ampersand, slash, space, period,
    # comma, straight quotes and parentheses.
    ALLOWED_CHARS = '\p{Alpha}\-&/ .,\'"()'.freeze

    # Internal: Clean up a given string. Quotes from http://en.wikipedia.org/wiki/Quotation_mark
    # Needs to be fixed up for matching and non-english quotes
    #
    # name   - the string to clean
    # double - the double quotes to replace others with "" by default
    # single - the single quotes to replace others with '' by default
    #
    # Returns a new string with quotes normalised, every character outside
    # the allowed set replaced by a space, spaces squeezed and ends trimmed.
    def self.reformat(name, double = '""', single = "''")
      # remove illegal byte sequences, translate fullwidth quotes down
      nomen = name.to_s.scrub.tr("\uFF02\uFF07", "\u0022\u0027")

      nomen = replace_doubles(nomen, double)
      replace_singles(nomen, single)
        .gsub(allowable(double, single), ' ') # the Regexp object, not the text "@@allowable"
        .squeeze(' ')
        .strip
    end

    # Internal: Clean up a string where there are numerous consecutive and
    # trailing non-name characters.
    # Modifies given string in place.
    #
    # args - strings to clean up; anything that is not a String is skipped
    #
    # Returns nothing meaningful
    def self.cleanup!(*args)
      args.each do |dirty|
        next unless dirty.is_a?(String)

        dirty.gsub! DIRTY_STUFF, ' '
        dirty.squeeze! ' '
        # remove any trailing commas or whitespace
        dirty.gsub! TRAILER_TRASH, ''
        dirty.strip!
      end
    end

    # Internal: Build (and remember) the regex matching every character
    # that is NOT allowed for the given quote pair.
    #
    # double - string with the left and right double quotes
    # single - string with the left and right single quotes
    #
    # Returns a Regexp
    def self.allowable(double, single)
      quotes = [double[0], double[1], single[0], single[1]].compact.join
      # keyed by the quote characters so custom quotes never reuse a stale regex
      @allowable ||= {}
      @allowable[quotes] ||=
        Regexp.new("[^#{ALLOWED_CHARS}#{Regexp.escape(quotes)}]")
    end
    private_class_method :allowable

    # Internal: Replace various double quote characters with given char
    #
    # str    - the string to find replacements in
    # double - string with two characters, the left and right quotes
    #
    # Returns the string with the quotes replaced
    def self.replace_doubles(str, double)
      left, right = quotes_from double, '"'

      # replace left and right double quotes
      str.tr("\u0022\u00AB\u201C\u201E\u2036\u300E\u301D\u301F\uFE43", left)
        .tr("\u0022\u00BB\u201D\u201F\u2033\u300F\u301E\uFE44", right)
    end
    private_class_method :replace_doubles

    # Internal: Replace various single quote characters with given chars
    #
    # str    - the string to find replacements in
    # single - string with two characters, the left and right quotes
    #
    # Returns the string with the quotes replaced
    def self.replace_singles(str, single)
      left, right = quotes_from single, "'"

      # replace left and right single quotes
      # (U+FF63 is the closing partner of U+FF62 and belongs to the right set)
      str.tr("\u0027\u2018\u201A\u2035\u2039\u300C\uFE41\uFF62", left)
        .tr("\u0027\u2019\u201B\u2032\u203A\u300D\uFE42\uFF63", right)
    end
    private_class_method :replace_singles

    # Internal: Get the quotes from a string
    #
    # str      - the string of two characters for the left and right quotes
    # fallback - the quote used when str is empty
    #
    # Returns an array of the [left, right] quotes; the right quote
    # defaults to the left one when only one character was given
    def self.quotes_from(str = '""', fallback = '"')
      str = str.to_s
      left = str[0] || fallback
      [left, str[1] || left]
    end
    private_class_method :quotes_from
  end
end
|
data/lib/nomener/compounders.rb
CHANGED
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
#-- encoding: UTF-8
|
|
2
2
|
module Nomener
|
|
3
|
+
# Module for assist with finding family name compounds and prefixes
|
|
3
4
|
module Compounders
|
|
4
5
|
# Many of these are from http://en.wikipedia.org/wiki/List_of_family_name_affixes
|
|
5
|
-
|
|
6
6
|
# Internal: Regex last name prefixes.
|
|
7
|
-
COMPOUNDS = %r
|
|
7
|
+
COMPOUNDS = %r/(?<part>(?:
|
|
8
8
|
Ab
|
|
9
9
|
| Ap
|
|
10
10
|
| Abu
|
|
@@ -74,6 +74,6 @@ module Nomener
|
|
|
74
74
|
| Zu
|
|
75
75
|
| (?-i:y)
|
|
76
76
|
| 't
|
|
77
|
-
)\b\p{Blank}?\g<part>*)
|
|
77
|
+
)\b\p{Blank}?\g<part>*)*/xi
|
|
78
78
|
end
|
|
79
79
|
end
|
data/lib/nomener/name.rb
CHANGED
|
@@ -1,23 +1,22 @@
|
|
|
1
1
|
#-- encoding: UTF-8
|
|
2
|
-
require
|
|
2
|
+
require 'nomener/parser'
|
|
3
3
|
|
|
4
4
|
module Nomener
|
|
5
|
+
# name class for general purposes
|
|
5
6
|
class Name < Struct.new :title, :first, :middle, :nick, :last, :suffix
|
|
6
|
-
|
|
7
7
|
# we don't want to change what we were instantiated with
|
|
8
8
|
attr_reader :original
|
|
9
9
|
|
|
10
10
|
# Public: Create an instance!
|
|
11
11
|
def initialize(nomen = '')
|
|
12
|
-
@original =
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
parse
|
|
16
|
-
end
|
|
12
|
+
return @original = '' unless nomen.is_a?(String)
|
|
13
|
+
@original = Cleaner.reformat nomen
|
|
14
|
+
parse
|
|
17
15
|
end
|
|
18
16
|
|
|
19
17
|
# Public: Break down a string into parts of a persons name
|
|
20
|
-
# As of 0.2.5 parse no longer needs to be called after initialization,
|
|
18
|
+
# As of 0.2.5 parse no longer needs to be called after initialization,
|
|
19
|
+
# it's done automatically. Recalling it doesn't hurt though.
|
|
21
20
|
#
|
|
22
21
|
# name - A string of name to parse
|
|
23
22
|
#
|
|
@@ -32,12 +31,11 @@ module Nomener
|
|
|
32
31
|
#
|
|
33
32
|
# Returns a string of the full name in a proper (western) case
|
|
34
33
|
def properlike
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
"#{t} #{f} #{n} #{m} #{l} #{suffix}".strip.gsub(/\p{Blank}+/, ' ')
|
|
34
|
+
[capit(title), capit(first),
|
|
35
|
+
(nick.to_s.empty? ? '' : "\"#{nick}\""),
|
|
36
|
+
capit(middle), capit(last),
|
|
37
|
+
suffix
|
|
38
|
+
].join(' ').strip.squeeze ' '
|
|
41
39
|
end
|
|
42
40
|
|
|
43
41
|
# Internal: try to capitalize last names with Mac and Mc and D' and such
|
|
@@ -46,19 +44,15 @@ module Nomener
|
|
|
46
44
|
#
|
|
47
45
|
# Returns a string of the capitalized name
|
|
48
46
|
def capit(last)
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
fix = last.dup
|
|
47
|
+
fix = last.to_s.dup
|
|
52
48
|
|
|
53
49
|
# if there are multiple last names separated by a dash
|
|
54
|
-
fix = fix.split(
|
|
55
|
-
|
|
56
|
-
}.join "-"
|
|
50
|
+
fix = fix.split('-')
|
|
51
|
+
.map { |outer| outer.split(' ').map(&:capitalize).join ' ' }.join '-'
|
|
57
52
|
|
|
58
53
|
# anything beginning with Mac and not ending in [aciozj], except for a few
|
|
59
54
|
fix.sub!(/Mac(?!
|
|
60
|
-
hin|
|
|
61
|
-
hlen|
|
|
55
|
+
hin|hlen|
|
|
62
56
|
har|
|
|
63
57
|
kle|
|
|
64
58
|
klin|
|
|
@@ -67,15 +61,15 @@ module Nomener
|
|
|
67
61
|
evicius| # Lithuanian
|
|
68
62
|
iulis| # Lithuanian
|
|
69
63
|
ias # Lithuanian
|
|
70
|
-
)([\p{Alpha}]{2,}[^aAcCiIoOzZjJ])\b/x) {
|
|
64
|
+
)([\p{Alpha}]{2,}[^aAcCiIoOzZjJ])\b/x) { "Mac#{$1.capitalize}" }
|
|
71
65
|
|
|
72
|
-
fix.sub!
|
|
66
|
+
fix.sub!(/\bMacmurdo\b/, 'MacMurdo') # fix MacMurdo
|
|
73
67
|
|
|
74
68
|
# anything beginning with Mc, Mcdonald == McDonald
|
|
75
|
-
fix.sub!(/Mc(\p{Alpha}{2,})/) { |s| "Mc#{
|
|
69
|
+
fix.sub!(/Mc(\p{Alpha}{2,})/) { |s| "Mc#{s[2..-1].capitalize}" }
|
|
76
70
|
|
|
77
71
|
# names like D'Angelo or Van 't Hooft, no cap 't
|
|
78
|
-
fix.gsub!(/('\p{Alpha})(?=\p{Alpha})/) { |s| "'#{
|
|
72
|
+
fix.gsub!(/('\p{Alpha})(?=\p{Alpha})/) { |s| "'#{s[(1..-1)].capitalize}" }
|
|
79
73
|
|
|
80
74
|
fix
|
|
81
75
|
end
|
|
@@ -84,7 +78,10 @@ module Nomener
|
|
|
84
78
|
#
|
|
85
79
|
# Returns a nicely formatted string
|
|
86
80
|
def inspect
|
|
87
|
-
"#<Nomener::Name #{
|
|
81
|
+
"#<Nomener::Name #{
|
|
82
|
+
each_pair.map { |k, v| [k, v.inspect].join('=') unless v.to_s.empty? }
|
|
83
|
+
.compact
|
|
84
|
+
.join(' ') }>"
|
|
88
85
|
end
|
|
89
86
|
|
|
90
87
|
# Public: an alias for the last name
|
|
@@ -93,7 +90,7 @@ module Nomener
|
|
|
93
90
|
def surname
|
|
94
91
|
last
|
|
95
92
|
end
|
|
96
|
-
|
|
93
|
+
alias_method :family, :surname
|
|
97
94
|
|
|
98
95
|
# Public: Return the first name
|
|
99
96
|
#
|
|
@@ -104,7 +101,7 @@ module Nomener
|
|
|
104
101
|
|
|
105
102
|
# Public: Make the name a string.
|
|
106
103
|
#
|
|
107
|
-
# format - a string using
|
|
104
|
+
# format - a string using symbols for the format of the name to return
|
|
108
105
|
# defaults to "%f %l"
|
|
109
106
|
# %f -> first name
|
|
110
107
|
# %l -> last/surname/family name
|
|
@@ -118,16 +115,15 @@ module Nomener
|
|
|
118
115
|
# defaults to true
|
|
119
116
|
#
|
|
120
117
|
# Returns the name as a string
|
|
121
|
-
def name(format =
|
|
118
|
+
def name(format = '%f %l', _propercase = true)
|
|
122
119
|
nomen = to_h
|
|
123
|
-
nomen[:nick] = (nick.nil? || nick.empty?) ?
|
|
124
|
-
|
|
125
|
-
format.gsub
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
format.
|
|
130
|
-
(format % nomen).strip.gsub /\p{Blank}+/, " "
|
|
120
|
+
nomen[:nick] = (nick.nil? || nick.empty?) ? '' : "\"#{nick}\""
|
|
121
|
+
|
|
122
|
+
format = format.gsub(/%[flmnst]/,
|
|
123
|
+
'%f' => '%{first}', '%l' => '%{last}', '%m' => '%{middle}',
|
|
124
|
+
'%n' => '%{nick}', '%s' => '%{suffix}', '%t' => '%{title}'
|
|
125
|
+
)
|
|
126
|
+
(format % nomen).strip.squeeze ' '
|
|
131
127
|
end
|
|
132
128
|
|
|
133
129
|
# Public: Shortcut for name format
|
|
@@ -135,15 +131,15 @@ module Nomener
|
|
|
135
131
|
#
|
|
136
132
|
# Returns the full name
|
|
137
133
|
def full
|
|
138
|
-
name
|
|
134
|
+
name '%f %m %l'
|
|
139
135
|
end
|
|
140
|
-
|
|
136
|
+
alias_method :fullname, :full
|
|
141
137
|
|
|
142
138
|
# Public: See name
|
|
143
139
|
#
|
|
144
140
|
# Returns the name as a string
|
|
145
141
|
def to_s
|
|
146
|
-
name
|
|
142
|
+
name '%f %l'
|
|
147
143
|
end
|
|
148
144
|
|
|
149
145
|
# Internal: merge another Nomener::Name to this one
|
|
@@ -152,16 +148,15 @@ module Nomener
|
|
|
152
148
|
#
|
|
153
149
|
# Returns nothing
|
|
154
150
|
def merge(other)
|
|
155
|
-
return self unless other.
|
|
156
|
-
each_pair { |k,
|
|
151
|
+
return self unless other.is_a?(Hash)
|
|
152
|
+
each_pair { |k, _| self[k] = other[k] }
|
|
157
153
|
end
|
|
158
154
|
|
|
159
155
|
# Public: return self as a hash. For ruby 1.9.3
|
|
160
156
|
#
|
|
161
157
|
# Returns a hash of the name parts
|
|
162
158
|
def to_h
|
|
163
|
-
Hash[
|
|
159
|
+
Hash[each_pair.to_a]
|
|
164
160
|
end unless method_defined?(:to_h)
|
|
165
|
-
|
|
166
161
|
end
|
|
167
|
-
end
|
|
162
|
+
end
|
data/lib/nomener/parser.rb
CHANGED
|
@@ -1,21 +1,23 @@
|
|
|
1
1
|
#-- encoding: UTF-8
|
|
2
|
-
require
|
|
3
|
-
require
|
|
4
|
-
require
|
|
5
|
-
require
|
|
6
|
-
require
|
|
2
|
+
require 'nomener/base'
|
|
3
|
+
require 'nomener/compounders'
|
|
4
|
+
require 'nomener/cleaner'
|
|
5
|
+
require 'nomener/name'
|
|
6
|
+
require 'nomener/suffixes'
|
|
7
|
+
require 'nomener/titles'
|
|
7
8
|
|
|
8
9
|
module Nomener
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
10
|
+
# Class containing the blades for carving a string into a name
|
|
11
|
+
#
|
|
12
|
+
# The two significant methods are:
|
|
13
|
+
# parse returning a hash or nil
|
|
14
|
+
# parse! returning a hash or raising an exception
|
|
15
|
+
module Parser
|
|
16
|
+
extend Nomener::Base
|
|
17
|
+
extend Nomener::Cleaner
|
|
18
|
+
extend Nomener::Titles
|
|
16
19
|
|
|
17
|
-
|
|
18
|
-
DIRTY_STUFF = /[^,'(?:\p{Alpha}(?<\.))\p{Alpha}\p{Blank}]{2,}/
|
|
20
|
+
include Nomener::Compounders
|
|
19
21
|
|
|
20
22
|
# regex for boundaries we'll use to find leftover nickname boundaries
|
|
21
23
|
NICKNAME_LEFTOVER = /["'\(\)]{2}/
|
|
@@ -32,26 +34,22 @@ module Nomener
|
|
|
32
34
|
# regex for matching last names in a "last, first" pattern
|
|
33
35
|
LASTCOMFIRST_MATCHER = /\A(?<fam>#{COMPOUNDS}\b[\p{Alpha}\-\'\p{Blank}]+),/i
|
|
34
36
|
|
|
35
|
-
# period. probably not much performance help.
|
|
36
|
-
PERIOD = /\./
|
|
37
|
-
|
|
38
37
|
# Public: parse a string into name parts
|
|
39
38
|
#
|
|
40
39
|
# name - a string to get the name from
|
|
41
|
-
# format -
|
|
40
|
+
# format - hash of options to parse name
|
|
41
|
+
# default {:order => :fl, :spacelimit => 0}
|
|
42
42
|
# :order - format the name. defaults to "last first" of the available
|
|
43
43
|
# :fl - presumes the name is in the format of "first last"
|
|
44
44
|
# :lf - presumes the name is in the format of "last first"
|
|
45
45
|
# :lcf - presumes the name is in the format of "last, first"
|
|
46
46
|
# :spacelimit - the number of spaces to consider in the first name
|
|
47
47
|
#
|
|
48
|
-
# Returns a Nomener::Name
|
|
49
|
-
def self.parse(name, format = {:
|
|
50
|
-
|
|
51
|
-
self.parse!(name, format)
|
|
48
|
+
# Returns a Nomener::Name of a parsed name of the string or nil
|
|
49
|
+
def self.parse(name, format = { order: :auto, spacelimit: 1 })
|
|
50
|
+
self.parse!(name, format)
|
|
52
51
|
rescue
|
|
53
52
|
nil
|
|
54
|
-
end
|
|
55
53
|
end
|
|
56
54
|
|
|
57
55
|
# Public: parse a string into name parts
|
|
@@ -61,80 +59,60 @@ module Nomener
|
|
|
61
59
|
#
|
|
62
60
|
# Returns a hash of name parts or nil
|
|
63
61
|
# Raises ArgumentError if 'name' is not a string or is empty
|
|
64
|
-
def self.parse!(name, format = {:
|
|
65
|
-
raise ArgumentError,
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
#
|
|
71
|
-
newname
|
|
72
|
-
name
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
newname[:title] = parse_title! name
|
|
80
|
-
name = dustoff name
|
|
81
|
-
|
|
82
|
-
newname[:last] = name # possibly mononyms
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
case name
|
|
86
|
-
when /,/ # if there's a comma, it may be a useful hint
|
|
87
|
-
clues = name.split(",").each { |i| i.strip! }
|
|
88
|
-
|
|
89
|
-
raise ParseError, "Could not decipher commas in \"#{name}\"" if clues.length > 2
|
|
90
|
-
|
|
91
|
-
# convention is last, first when there's a comma
|
|
92
|
-
newname[:last], newname[:first] = clues
|
|
93
|
-
|
|
94
|
-
# check the last by comparing a re-ordering of the name
|
|
95
|
-
# Mies van der Rohe, Ludwig
|
|
96
|
-
# Snepscheut, Jan L. A. van de
|
|
97
|
-
unless newname[:first].nil? || newname[:first].split(" ").length == 1
|
|
98
|
-
check = parse_last!("#{newname[:first]} #{newname[:last]}", :fl)
|
|
99
|
-
|
|
100
|
-
# let's trust the full name
|
|
101
|
-
if check != newname[:last]
|
|
102
|
-
newname[:first] = "#{newname[:first]} #{newname[:last]}".sub(check, "").strip
|
|
103
|
-
newname[:last] = check
|
|
104
|
-
end
|
|
105
|
-
end
|
|
62
|
+
def self.parse!(name, format = { order: :auto, spacelimit: 0 })
|
|
63
|
+
raise ArgumentError,
|
|
64
|
+
'Name to parse not provided' if name.to_s.empty?
|
|
65
|
+
|
|
66
|
+
name = Cleaner.reformat name
|
|
67
|
+
|
|
68
|
+
# we want the hash in this order as it helps with parsing out pieces
|
|
69
|
+
newname = { first: '', middle: '', last: '' }
|
|
70
|
+
newname[:nick] = parse_nick!(name) # grab any identified nickname
|
|
71
|
+
newname[:suffix] = Suffixes.parse_suffix!(name) # grab any suffix'
|
|
72
|
+
newname[:title] = Titles.parse_title!(name)
|
|
73
|
+
|
|
74
|
+
# stop here if we know we'll be confused
|
|
75
|
+
raise ParseError,
|
|
76
|
+
"Could not decipher commas in \"#{name}\"" if name.count(',') > 1
|
|
106
77
|
|
|
78
|
+
newname[:last] = dustoff name # possibly mononyms
|
|
79
|
+
|
|
80
|
+
if name.count(',') > 0
|
|
81
|
+
newname[:last], newname[:first] = splitcomma(name)
|
|
107
82
|
# titles which are part of the first name...
|
|
108
|
-
newname[:title] = parse_title!(newname[:first]) if newname[:title].empty?
|
|
109
|
-
|
|
110
|
-
when / / # no comma, check for space on first then last
|
|
83
|
+
newname[:title] = Titles.parse_title!(newname[:first]) if newname[:title].empty?
|
|
84
|
+
else
|
|
111
85
|
newname[:last] = parse_last!(name, format[:order])
|
|
112
86
|
newname[:first], newname[:middle] = parse_first!(name, format[:spacelimit])
|
|
113
87
|
end
|
|
114
88
|
|
|
115
|
-
cleanup! newname[:last], newname[:first], newname[:middle]
|
|
89
|
+
Cleaner.cleanup! newname[:last], newname[:first], newname[:middle]
|
|
90
|
+
newname[:first] = dustoff newname[:first]
|
|
116
91
|
|
|
117
92
|
newname
|
|
118
93
|
end
|
|
119
94
|
|
|
120
|
-
# Internal
|
|
121
|
-
# Modifies given string in place.
|
|
95
|
+
# Internal split on the comma to get the first and last names
|
|
122
96
|
#
|
|
123
|
-
#
|
|
97
|
+
# str - the name
|
|
124
98
|
#
|
|
125
|
-
# Returns
|
|
126
|
-
def self.
|
|
127
|
-
|
|
128
|
-
end
|
|
99
|
+
# Returns an array of the last and first names found
|
|
100
|
+
def self.splitcomma(str)
|
|
101
|
+
last, first = str.split(',').each(&:strip!)
|
|
129
102
|
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
103
|
+
# check the last by comparing a re-ordering of the name
|
|
104
|
+
# Mies van der Rohe, Ludwig
|
|
105
|
+
# Snepscheut, Jan L. A. van de
|
|
106
|
+
unless first.to_s.count(' ') == 0
|
|
107
|
+
check = parse_last!("#{first} #{last}", :fl)
|
|
108
|
+
|
|
109
|
+
# trust the full name and remove the parsed last
|
|
110
|
+
if check != last
|
|
111
|
+
first = "#{first} #{last}".sub(check, '').strip
|
|
112
|
+
last = check
|
|
113
|
+
end
|
|
114
|
+
end
|
|
115
|
+
[last, first]
|
|
138
116
|
end
|
|
139
117
|
|
|
140
118
|
# Internal: parse nickname out of string. presuming it's in quotes
|
|
@@ -144,7 +122,12 @@ module Nomener
|
|
|
144
122
|
#
|
|
145
123
|
# Returns string of the nickname found or an empty string
|
|
146
124
|
def self.parse_nick!(nm)
|
|
147
|
-
|
|
125
|
+
return '' if nm.to_s.empty?
|
|
126
|
+
|
|
127
|
+
nick = dustoff gut!(nm, NICKNAME)
|
|
128
|
+
nm.sub! NICKNAME_LEFTOVER, ''
|
|
129
|
+
Cleaner.cleanup! nm
|
|
130
|
+
nick
|
|
148
131
|
end
|
|
149
132
|
|
|
150
133
|
# Internal: parse last name from string
|
|
@@ -155,25 +138,25 @@ module Nomener
|
|
|
155
138
|
#
|
|
156
139
|
# Returns string of the last name found or an empty string
|
|
157
140
|
def self.parse_last!(nm, format = :fl)
|
|
158
|
-
last =
|
|
141
|
+
last = ''
|
|
159
142
|
|
|
160
|
-
format = :fl if
|
|
161
|
-
format = :lcf if
|
|
143
|
+
format = :fl if format == :auto
|
|
144
|
+
format = :lcf if format == :auto && nm.index(',')
|
|
162
145
|
|
|
163
146
|
# these constants should have the named match :fam
|
|
164
147
|
nomen = case format
|
|
165
148
|
when :fl
|
|
166
|
-
nm.match
|
|
149
|
+
nm.match FIRSTLAST_MATCHER
|
|
167
150
|
when :lf
|
|
168
|
-
nm.match
|
|
151
|
+
nm.match LASTFIRST_MATCHER
|
|
169
152
|
when :lcf
|
|
170
|
-
nm.match
|
|
153
|
+
nm.match LASTCOMFIRST_MATCHER
|
|
171
154
|
end
|
|
172
155
|
|
|
173
156
|
unless nomen.nil? || nomen[:fam].nil?
|
|
174
157
|
last = nomen[:fam].strip
|
|
175
|
-
nm.sub!(last,
|
|
176
|
-
nm.sub!(
|
|
158
|
+
nm.sub!(last, '')
|
|
159
|
+
nm.sub!(',', '')
|
|
177
160
|
end
|
|
178
161
|
|
|
179
162
|
last
|
|
@@ -187,56 +170,11 @@ module Nomener
|
|
|
187
170
|
#
|
|
188
171
|
# Returns an array containing the first name and middle name if any
|
|
189
172
|
def self.parse_first!(nm, namecount = 0)
|
|
190
|
-
nm.tr!
|
|
191
|
-
nm.squeeze!
|
|
192
|
-
first, middle = nm.split
|
|
193
|
-
|
|
194
|
-
[first || "", middle || ""]
|
|
195
|
-
end
|
|
173
|
+
nm.tr! '.', ' '
|
|
174
|
+
nm.squeeze! ' '
|
|
175
|
+
first, middle = nm.split ' ', namecount
|
|
196
176
|
|
|
197
|
-
|
|
198
|
-
# Internal: Clean up a string where there are numerous consecutive and trailing non-name characters.
|
|
199
|
-
# Modifies given string in place.
|
|
200
|
-
#
|
|
201
|
-
# args - strings to clean up
|
|
202
|
-
#
|
|
203
|
-
# Returns nothing
|
|
204
|
-
def self.cleanup!(*args)
|
|
205
|
-
args.each do |dirty|
|
|
206
|
-
next if(dirty.nil? || !dirty.kind_of?(String))
|
|
207
|
-
|
|
208
|
-
dirty.gsub! DIRTY_STUFF, ""
|
|
209
|
-
dirty.squeeze! " "
|
|
210
|
-
# remove any trailing commas or whitespace
|
|
211
|
-
dirty.gsub! TRAILER_TRASH, ""
|
|
212
|
-
dirty.strip!
|
|
213
|
-
end
|
|
214
|
-
end
|
|
215
|
-
|
|
216
|
-
# Internal: a softer clean we keep re-using
|
|
217
|
-
#
|
|
218
|
-
# str - the string to dust off
|
|
219
|
-
#
|
|
220
|
-
# Returns the nice clean
|
|
221
|
-
def self.dustoff(str)
|
|
222
|
-
str = str.gsub PERIOD, " "
|
|
223
|
-
str = str.squeeze " "
|
|
224
|
-
str = str.strip
|
|
225
|
-
end
|
|
226
|
-
|
|
227
|
-
# Internal: clean out a given string with a given pattern
|
|
228
|
-
# Modfies the given string
|
|
229
|
-
# str - the string to gut
|
|
230
|
-
# pattern - the regext to cut with
|
|
231
|
-
#
|
|
232
|
-
# Returns the gutted pattern
|
|
233
|
-
def self.gut!(str = "", pattern = / /)
|
|
234
|
-
found = []
|
|
235
|
-
str.gsub! pattern do |pat|
|
|
236
|
-
found << pat.strip
|
|
237
|
-
""
|
|
238
|
-
end
|
|
239
|
-
found.join " "
|
|
177
|
+
[first || '', middle || '']
|
|
240
178
|
end
|
|
241
179
|
end
|
|
242
180
|
end
|