name-tamer 0.1.9 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +1 -1
- data/Gemfile.lock +1 -1
- data/lib/name-tamer.rb +10 -4
- data/lib/name-tamer/version.rb +1 -1
- data/lib/string_extras.rb +40 -43
- data/spec/name_tamer_spec.rb +3 -6
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 3532b7472b3daecb0bb11863268531c229771639
|
4
|
+
data.tar.gz: 0096dd16106d480f6c5e1e043dbf54896f787599
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4ee6d017e93b54acd10791f44a2920c46fe76faaafcb0171ab59c582f7d07c34036bc64610c5298ae363a33fa26fbedd1711800771acc29620beb6967adea10e
|
7
|
+
data.tar.gz: 23741e994c62fc8c746f826e3124824a29e3a5ecff22930a1ec350db0363b5b6d20d67bc5104d546f25c9301850a8944e936cb5e41c6320524d766303a2be69c
|
data/.rubocop.yml
CHANGED
data/Gemfile.lock
CHANGED
data/lib/name-tamer.rb
CHANGED
@@ -23,11 +23,12 @@ class NameTamer
|
|
23
23
|
|
24
24
|
def tidy_name
|
25
25
|
unless @tidy_name
|
26
|
-
@tidy_name = name.dup
|
26
|
+
@tidy_name = name.dup # Start with the name we've received
|
27
27
|
|
28
|
-
|
29
|
-
|
30
|
-
|
28
|
+
ensure_safe # Invalid byte sequence in UTF-8, for example
|
29
|
+
tidy_spacing # " John Smith " -> "John Smith"
|
30
|
+
fix_encoding_errors # "Ren\u00c3\u00a9 Descartes" -> "Ren\u00e9 Descartes"
|
31
|
+
consolidate_initials # "I. B. M." -> "I.B.M."
|
31
32
|
end
|
32
33
|
|
33
34
|
@tidy_name
|
@@ -111,6 +112,10 @@ class NameTamer
|
|
111
112
|
# Tidy up the name we've received
|
112
113
|
#--------------------------------------------------------
|
113
114
|
|
115
|
+
def ensure_safe
|
116
|
+
@tidy_name.ensure_safe
|
117
|
+
end
|
118
|
+
|
114
119
|
def tidy_spacing
|
115
120
|
@tidy_name
|
116
121
|
.space_after_comma!
|
@@ -295,6 +300,7 @@ class NameTamer
|
|
295
300
|
@contact_type = ct
|
296
301
|
end
|
297
302
|
|
303
|
+
@tidy_name = nil
|
298
304
|
@nice_name = nil
|
299
305
|
@simple_name = nil
|
300
306
|
@slug = nil
|
data/lib/name-tamer/version.rb
CHANGED
data/lib/string_extras.rb
CHANGED
@@ -2,83 +2,74 @@
|
|
2
2
|
class String
|
3
3
|
# Strip illegal characters out completely
|
4
4
|
def strip_unwanted!(filter)
|
5
|
-
|
6
|
-
self # Allows chaining
|
5
|
+
substitute!(filter, '')
|
7
6
|
end
|
8
7
|
|
9
8
|
def strip_or_self!
|
10
|
-
|
11
|
-
self # Allows chaining
|
9
|
+
strip! || self
|
12
10
|
end
|
13
11
|
|
14
12
|
# Change any whitespace into our separator character
|
15
13
|
def whitespace_to!(separator)
|
16
|
-
|
17
|
-
self # Allows chaining
|
14
|
+
substitute!(/[[:space:]]+/, separator)
|
18
15
|
end
|
19
16
|
|
20
17
|
# Ensure commas have exactly one space after them
|
21
18
|
def space_after_comma!
|
22
|
-
|
23
|
-
self # Allows chaining
|
19
|
+
substitute!(/,[[:space:]]*/, ', ')
|
24
20
|
end
|
25
21
|
|
26
22
|
# Change some characters embedded in words to our separator character
|
27
23
|
# e.g. example.com -> example-com
|
28
24
|
def invalid_chars_to!(separator)
|
29
|
-
|
30
|
-
self # Allows chaining
|
25
|
+
substitute!(/(?<![[:space:]])[\.\/](?![[:space:]])/, separator)
|
31
26
|
end
|
32
27
|
|
33
28
|
# Make sure separators are not where they shouldn't be
|
34
29
|
def fix_separators!(separator)
|
35
|
-
|
36
|
-
r = Regexp.escape(separator)
|
37
|
-
# No more than one of the separator in a row.
|
38
|
-
self.gsub!(/#{r}{2,}/, separator)
|
39
|
-
# Remove leading/trailing separator.
|
40
|
-
self.gsub!(/^#{r}|#{r}$/i, '')
|
41
|
-
end
|
30
|
+
return self if separator.nil? || separator.empty?
|
42
31
|
|
43
|
-
|
32
|
+
r = Regexp.escape(separator)
|
33
|
+
|
34
|
+
# No more than one of the separator in a row.
|
35
|
+
substitute!(/#{r}{2,}/, separator)
|
36
|
+
|
37
|
+
# Remove leading/trailing separator.
|
38
|
+
substitute!(/^#{r}|#{r}$/i, '')
|
44
39
|
end
|
45
40
|
|
46
41
|
# Any characters that resemble latin characters might usefully be
|
47
42
|
# transliterated into ones that are easy to type on an anglophone
|
48
43
|
# keyboard.
|
49
44
|
def approximate_latin_chars!
|
50
|
-
|
51
|
-
self # Allows chaining
|
45
|
+
gsub!(/[^\x00-\x7f]/u) { |char| APPROXIMATIONS[char] || char } || self
|
52
46
|
end
|
53
47
|
|
54
48
|
# Strings that were wrongly encoded with single-byte encodings sometimes have
|
55
49
|
# tell-tale substrings that we can put back into the correct UTF-8 character
|
56
50
|
def fix_encoding_errors!
|
57
|
-
|
58
|
-
self # Allows chaining
|
51
|
+
gsub!(BAD_ENCODING_PATTERNS) { |substring| BAD_ENCODING[substring] || substring } || self
|
59
52
|
end
|
60
53
|
|
61
54
|
def upcase_first_letter!
|
62
|
-
|
63
|
-
self # Allows chaining
|
55
|
+
gsub!(/\b\w/) { |first| first.upcase } || self
|
64
56
|
end
|
65
57
|
|
66
58
|
def downcase_after_apostrophe!
|
67
|
-
|
68
|
-
self # Allows chaining
|
59
|
+
gsub!(/\'\w\b/) { |c| c.downcase } || self # Lowercase 's
|
69
60
|
end
|
70
61
|
|
71
62
|
# Our list of terminal characters that indicate a non-celtic name used
|
72
63
|
# to include o but we removed it because of MacMurdo.
|
73
64
|
def fix_mac!
|
74
65
|
if self =~ /\bMac[A-Za-z]{2,}[^acizj]\b/ || self =~ /\bMc/
|
75
|
-
|
66
|
+
gsub!(/\b(Ma?c)([A-Za-z]+)/) { |_| Regexp.last_match[1] + Regexp.last_match[2].capitalize }
|
76
67
|
|
77
68
|
# Fix Mac exceptions
|
78
69
|
%w(
|
79
70
|
MacEdo MacEvicius MacHado MacHar MacHin MacHlin MacIas MacIulis MacKie
|
80
71
|
MacKle MacKlin MacKmin MacKmurdo MacQuarie MacLise MacKenzie
|
81
|
-
).each { |mac_name|
|
72
|
+
).each { |mac_name| substitute!(/\b#{mac_name}/, mac_name.capitalize) }
|
82
73
|
end
|
83
74
|
|
84
75
|
self # Allows chaining
|
@@ -88,7 +79,7 @@ class String
|
|
88
79
|
def fix_ff!
|
89
80
|
%w(
|
90
81
|
Fforbes Fforde Ffinch Ffrench Ffoulkes
|
91
|
-
).each { |ff_name|
|
82
|
+
).each { |ff_name| substitute!(ff_name, ff_name.downcase) }
|
92
83
|
|
93
84
|
self # Allows chaining
|
94
85
|
end
|
@@ -98,13 +89,13 @@ class String
|
|
98
89
|
# Fixes for name modifiers followed by an apostrophe, e.g. d'Artagnan, Commedia dell'Arte
|
99
90
|
def fix_name_modifiers!
|
100
91
|
NAME_MODIFIERS.each do |modifier|
|
101
|
-
|
92
|
+
gsub!(/((?:[[:space:]]|^)#{modifier})([[:space:]]+|-)/) do |_|
|
102
93
|
"#{Regexp.last_match[1].rstrip.downcase}#{Regexp.last_match[2].tr(ASCII_SPACE, NONBREAKING_SPACE)}"
|
103
94
|
end
|
104
95
|
end
|
105
96
|
|
106
97
|
%w(Dell D).each do |modifier|
|
107
|
-
|
98
|
+
gsub!(/(.#{modifier}')(\w)/) { |_| "#{Regexp.last_match[1].rstrip.downcase}#{Regexp.last_match[2]}" }
|
108
99
|
end
|
109
100
|
|
110
101
|
self # Allows chaining
|
@@ -113,16 +104,14 @@ class String
|
|
113
104
|
# Upcase words with no vowels, e.g JPR Williams
|
114
105
|
# Except Ng
|
115
106
|
def upcase_initials!
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
self # Allows chaining
|
107
|
+
gsub!(/\b([bcdfghjklmnpqrstvwxz]+)\b/i) { |_| Regexp.last_match[1].upcase }
|
108
|
+
gsub!(/\b(NG)\b/i) { |_| Regexp.last_match[1].capitalize } || self # http://en.wikipedia.org/wiki/Ng
|
120
109
|
end
|
121
110
|
|
122
111
|
# Fix known last names that have spaces (not hyphens!)
|
123
112
|
def nbsp_in_compound_name!
|
124
113
|
COMPOUND_NAMES.each do |compound_name|
|
125
|
-
|
114
|
+
substitute!(compound_name, compound_name.tr(ASCII_SPACE, NONBREAKING_SPACE))
|
126
115
|
end
|
127
116
|
|
128
117
|
self # Allows chaining
|
@@ -130,25 +119,33 @@ class String
|
|
130
119
|
|
131
120
|
def nbsp_in_name_modifier!
|
132
121
|
NAME_MODIFIERS.each do |modifier|
|
133
|
-
|
122
|
+
gsub!(/([[:space:]]#{modifier})([[:space:]])/i) { |_| "#{Regexp.last_match[1]}#{NONBREAKING_SPACE}" }
|
134
123
|
end
|
135
124
|
|
136
125
|
self # Allows chaining
|
137
126
|
end
|
138
127
|
|
139
128
|
def remove_periods_from_initials!
|
140
|
-
|
141
|
-
self # Allows chaining
|
129
|
+
gsub!(/\b([a-z])\./i) { |_| Regexp.last_match[1] } || self
|
142
130
|
end
|
143
131
|
|
144
132
|
def remove_spaces_from_initials!
|
145
|
-
|
146
|
-
|
133
|
+
gsub!(/\b([a-z])(\.)* \b(?![a-z0-9'\u00C0-\u00FF]{2,})/i) do |_|
|
134
|
+
"#{Regexp.last_match[1]}#{Regexp.last_match[2]}"
|
135
|
+
end || self
|
147
136
|
end
|
148
137
|
|
149
138
|
def ensure_space_after_initials!
|
150
|
-
|
151
|
-
|
139
|
+
gsub!(/\b([a-z]\.)(?=[a-z0-9]{2,})/i) { |_| "#{Regexp.last_match[1]} " } || self
|
140
|
+
end
|
141
|
+
|
142
|
+
def ensure_safe
|
143
|
+
return if valid_encoding?
|
144
|
+
encode!('UTF-8', invalid: :replace, undef: :replace, replace: '')
|
145
|
+
end
|
146
|
+
|
147
|
+
def substitute!(pattern, replacement)
|
148
|
+
gsub!(pattern, replacement) || self
|
152
149
|
end
|
153
150
|
|
154
151
|
NONBREAKING_SPACE = "\u00a0"
|
data/spec/name_tamer_spec.rb
CHANGED
@@ -183,12 +183,9 @@ describe NameTamer do
|
|
183
183
|
sn: 'Scout Loyalty Optimizer',
|
184
184
|
s: 'scout-loyalty-optimizer'
|
185
185
|
},
|
186
|
-
{ n: 'René Descartes',
|
187
|
-
|
188
|
-
|
189
|
-
sn: 'René Descartes',
|
190
|
-
s: 'rene-descartes'
|
191
|
-
}
|
186
|
+
{ n: 'René Descartes', t: :person, nn: 'René Descartes', sn: 'René Descartes', s: 'rene-descartes' },
|
187
|
+
{ n: 'Pablo M Sánchez', t: :person, nn: 'Pablo M Sánchez', sn: 'Pablo Sánchez', s: 'pablo-sanchez' },
|
188
|
+
{ n: "\xc3\x28", t: :person, nn: '()', sn: '()', s: '_' } # Invalid byte sequence in UTF-8
|
192
189
|
]
|
193
190
|
end
|
194
191
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: name-tamer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.9
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Xenapto
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-
|
11
|
+
date: 2014-07-04 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|