name-tamer 0.1.9 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.rubocop.yml +1 -1
- data/Gemfile.lock +1 -1
- data/lib/name-tamer.rb +10 -4
- data/lib/name-tamer/version.rb +1 -1
- data/lib/string_extras.rb +40 -43
- data/spec/name_tamer_spec.rb +3 -6
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 3532b7472b3daecb0bb11863268531c229771639
|
4
|
+
data.tar.gz: 0096dd16106d480f6c5e1e043dbf54896f787599
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4ee6d017e93b54acd10791f44a2920c46fe76faaafcb0171ab59c582f7d07c34036bc64610c5298ae363a33fa26fbedd1711800771acc29620beb6967adea10e
|
7
|
+
data.tar.gz: 23741e994c62fc8c746f826e3124824a29e3a5ecff22930a1ec350db0363b5b6d20d67bc5104d546f25c9301850a8944e936cb5e41c6320524d766303a2be69c
|
data/.rubocop.yml
CHANGED
data/Gemfile.lock
CHANGED
data/lib/name-tamer.rb
CHANGED
@@ -23,11 +23,12 @@ class NameTamer
|
|
23
23
|
|
24
24
|
def tidy_name
|
25
25
|
unless @tidy_name
|
26
|
-
@tidy_name = name.dup
|
26
|
+
@tidy_name = name.dup # Start with the name we've received
|
27
27
|
|
28
|
-
|
29
|
-
|
30
|
-
|
28
|
+
ensure_safe # Invalid byte sequence in UTF-8, for example
|
29
|
+
tidy_spacing # " John Smith " -> "John Smith"
|
30
|
+
fix_encoding_errors # "Ren\u00c3\u00a9 Descartes" -> "Ren\u00e9 Descartes"
|
31
|
+
consolidate_initials # "I. B. M." -> "I.B.M."
|
31
32
|
end
|
32
33
|
|
33
34
|
@tidy_name
|
@@ -111,6 +112,10 @@ class NameTamer
|
|
111
112
|
# Tidy up the name we've received
|
112
113
|
#--------------------------------------------------------
|
113
114
|
|
115
|
+
def ensure_safe
|
116
|
+
@tidy_name.ensure_safe
|
117
|
+
end
|
118
|
+
|
114
119
|
def tidy_spacing
|
115
120
|
@tidy_name
|
116
121
|
.space_after_comma!
|
@@ -295,6 +300,7 @@ class NameTamer
|
|
295
300
|
@contact_type = ct
|
296
301
|
end
|
297
302
|
|
303
|
+
@tidy_name = nil
|
298
304
|
@nice_name = nil
|
299
305
|
@simple_name = nil
|
300
306
|
@slug = nil
|
data/lib/name-tamer/version.rb
CHANGED
data/lib/string_extras.rb
CHANGED
@@ -2,83 +2,74 @@
|
|
2
2
|
class String
|
3
3
|
# Strip illegal characters out completely
|
4
4
|
def strip_unwanted!(filter)
|
5
|
-
|
6
|
-
self # Allows chaining
|
5
|
+
substitute!(filter, '')
|
7
6
|
end
|
8
7
|
|
9
8
|
def strip_or_self!
|
10
|
-
|
11
|
-
self # Allows chaining
|
9
|
+
strip! || self
|
12
10
|
end
|
13
11
|
|
14
12
|
# Change any whitespace into our separator character
|
15
13
|
def whitespace_to!(separator)
|
16
|
-
|
17
|
-
self # Allows chaining
|
14
|
+
substitute!(/[[:space:]]+/, separator)
|
18
15
|
end
|
19
16
|
|
20
17
|
# Ensure commas have exactly one space after them
|
21
18
|
def space_after_comma!
|
22
|
-
|
23
|
-
self # Allows chaining
|
19
|
+
substitute!(/,[[:space:]]*/, ', ')
|
24
20
|
end
|
25
21
|
|
26
22
|
# Change some characters embedded in words to our separator character
|
27
23
|
# e.g. example.com -> example-com
|
28
24
|
def invalid_chars_to!(separator)
|
29
|
-
|
30
|
-
self # Allows chaining
|
25
|
+
substitute!(/(?<![[:space:]])[\.\/](?![[:space:]])/, separator)
|
31
26
|
end
|
32
27
|
|
33
28
|
# Make sure separators are not where they shouldn't be
|
34
29
|
def fix_separators!(separator)
|
35
|
-
|
36
|
-
r = Regexp.escape(separator)
|
37
|
-
# No more than one of the separator in a row.
|
38
|
-
self.gsub!(/#{r}{2,}/, separator)
|
39
|
-
# Remove leading/trailing separator.
|
40
|
-
self.gsub!(/^#{r}|#{r}$/i, '')
|
41
|
-
end
|
30
|
+
return self if separator.nil? || separator.empty?
|
42
31
|
|
43
|
-
|
32
|
+
r = Regexp.escape(separator)
|
33
|
+
|
34
|
+
# No more than one of the separator in a row.
|
35
|
+
substitute!(/#{r}{2,}/, separator)
|
36
|
+
|
37
|
+
# Remove leading/trailing separator.
|
38
|
+
substitute!(/^#{r}|#{r}$/i, '')
|
44
39
|
end
|
45
40
|
|
46
41
|
# Any characters that resemble latin characters might usefully be
|
47
42
|
# transliterated into ones that are easy to type on an anglophone
|
48
43
|
# keyboard.
|
49
44
|
def approximate_latin_chars!
|
50
|
-
|
51
|
-
self # Allows chaining
|
45
|
+
gsub!(/[^\x00-\x7f]/u) { |char| APPROXIMATIONS[char] || char } || self
|
52
46
|
end
|
53
47
|
|
54
48
|
# Strings that were wrongly encoded with single-byte encodings sometimes have
|
55
49
|
# tell-tale substrings that we can put back into the correct UTF-8 character
|
56
50
|
def fix_encoding_errors!
|
57
|
-
|
58
|
-
self # Allows chaining
|
51
|
+
gsub!(BAD_ENCODING_PATTERNS) { |substring| BAD_ENCODING[substring] || substring } || self
|
59
52
|
end
|
60
53
|
|
61
54
|
def upcase_first_letter!
|
62
|
-
|
63
|
-
self # Allows chaining
|
55
|
+
gsub!(/\b\w/) { |first| first.upcase } || self
|
64
56
|
end
|
65
57
|
|
66
58
|
def downcase_after_apostrophe!
|
67
|
-
|
68
|
-
self # Allows chaining
|
59
|
+
gsub!(/\'\w\b/) { |c| c.downcase } || self # Lowercase 's
|
69
60
|
end
|
70
61
|
|
71
62
|
# Our list of terminal characters that indicate a non-celtic name used
|
72
63
|
# to include o but we removed it because of MacMurdo.
|
73
64
|
def fix_mac!
|
74
65
|
if self =~ /\bMac[A-Za-z]{2,}[^acizj]\b/ || self =~ /\bMc/
|
75
|
-
|
66
|
+
gsub!(/\b(Ma?c)([A-Za-z]+)/) { |_| Regexp.last_match[1] + Regexp.last_match[2].capitalize }
|
76
67
|
|
77
68
|
# Fix Mac exceptions
|
78
69
|
%w(
|
79
70
|
MacEdo MacEvicius MacHado MacHar MacHin MacHlin MacIas MacIulis MacKie
|
80
71
|
MacKle MacKlin MacKmin MacKmurdo MacQuarie MacLise MacKenzie
|
81
|
-
).each { |mac_name|
|
72
|
+
).each { |mac_name| substitute!(/\b#{mac_name}/, mac_name.capitalize) }
|
82
73
|
end
|
83
74
|
|
84
75
|
self # Allows chaining
|
@@ -88,7 +79,7 @@ class String
|
|
88
79
|
def fix_ff!
|
89
80
|
%w(
|
90
81
|
Fforbes Fforde Ffinch Ffrench Ffoulkes
|
91
|
-
).each { |ff_name|
|
82
|
+
).each { |ff_name| substitute!(ff_name, ff_name.downcase) }
|
92
83
|
|
93
84
|
self # Allows chaining
|
94
85
|
end
|
@@ -98,13 +89,13 @@ class String
|
|
98
89
|
# Fixes for name modifiers followed by an apostrophe, e.g. d'Artagnan, Commedia dell'Arte
|
99
90
|
def fix_name_modifiers!
|
100
91
|
NAME_MODIFIERS.each do |modifier|
|
101
|
-
|
92
|
+
gsub!(/((?:[[:space:]]|^)#{modifier})([[:space:]]+|-)/) do |_|
|
102
93
|
"#{Regexp.last_match[1].rstrip.downcase}#{Regexp.last_match[2].tr(ASCII_SPACE, NONBREAKING_SPACE)}"
|
103
94
|
end
|
104
95
|
end
|
105
96
|
|
106
97
|
%w(Dell D).each do |modifier|
|
107
|
-
|
98
|
+
gsub!(/(.#{modifier}')(\w)/) { |_| "#{Regexp.last_match[1].rstrip.downcase}#{Regexp.last_match[2]}" }
|
108
99
|
end
|
109
100
|
|
110
101
|
self # Allows chaining
|
@@ -113,16 +104,14 @@ class String
|
|
113
104
|
# Upcase words with no vowels, e.g JPR Williams
|
114
105
|
# Except Ng
|
115
106
|
def upcase_initials!
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
self # Allows chaining
|
107
|
+
gsub!(/\b([bcdfghjklmnpqrstvwxz]+)\b/i) { |_| Regexp.last_match[1].upcase }
|
108
|
+
gsub!(/\b(NG)\b/i) { |_| Regexp.last_match[1].capitalize } || self # http://en.wikipedia.org/wiki/Ng
|
120
109
|
end
|
121
110
|
|
122
111
|
# Fix known last names that have spaces (not hyphens!)
|
123
112
|
def nbsp_in_compound_name!
|
124
113
|
COMPOUND_NAMES.each do |compound_name|
|
125
|
-
|
114
|
+
substitute!(compound_name, compound_name.tr(ASCII_SPACE, NONBREAKING_SPACE))
|
126
115
|
end
|
127
116
|
|
128
117
|
self # Allows chaining
|
@@ -130,25 +119,33 @@ class String
|
|
130
119
|
|
131
120
|
def nbsp_in_name_modifier!
|
132
121
|
NAME_MODIFIERS.each do |modifier|
|
133
|
-
|
122
|
+
gsub!(/([[:space:]]#{modifier})([[:space:]])/i) { |_| "#{Regexp.last_match[1]}#{NONBREAKING_SPACE}" }
|
134
123
|
end
|
135
124
|
|
136
125
|
self # Allows chaining
|
137
126
|
end
|
138
127
|
|
139
128
|
def remove_periods_from_initials!
|
140
|
-
|
141
|
-
self # Allows chaining
|
129
|
+
gsub!(/\b([a-z])\./i) { |_| Regexp.last_match[1] } || self
|
142
130
|
end
|
143
131
|
|
144
132
|
def remove_spaces_from_initials!
|
145
|
-
|
146
|
-
|
133
|
+
gsub!(/\b([a-z])(\.)* \b(?![a-z0-9'\u00C0-\u00FF]{2,})/i) do |_|
|
134
|
+
"#{Regexp.last_match[1]}#{Regexp.last_match[2]}"
|
135
|
+
end || self
|
147
136
|
end
|
148
137
|
|
149
138
|
def ensure_space_after_initials!
|
150
|
-
|
151
|
-
|
139
|
+
gsub!(/\b([a-z]\.)(?=[a-z0-9]{2,})/i) { |_| "#{Regexp.last_match[1]} " } || self
|
140
|
+
end
|
141
|
+
|
142
|
+
def ensure_safe
|
143
|
+
return if valid_encoding?
|
144
|
+
encode!('UTF-8', invalid: :replace, undef: :replace, replace: '')
|
145
|
+
end
|
146
|
+
|
147
|
+
def substitute!(pattern, replacement)
|
148
|
+
gsub!(pattern, replacement) || self
|
152
149
|
end
|
153
150
|
|
154
151
|
NONBREAKING_SPACE = "\u00a0"
|
data/spec/name_tamer_spec.rb
CHANGED
@@ -183,12 +183,9 @@ describe NameTamer do
|
|
183
183
|
sn: 'Scout Loyalty Optimizer',
|
184
184
|
s: 'scout-loyalty-optimizer'
|
185
185
|
},
|
186
|
-
{ n: 'René Descartes',
|
187
|
-
|
188
|
-
|
189
|
-
sn: 'René Descartes',
|
190
|
-
s: 'rene-descartes'
|
191
|
-
}
|
186
|
+
{ n: 'René Descartes', t: :person, nn: 'René Descartes', sn: 'René Descartes', s: 'rene-descartes' },
|
187
|
+
{ n: 'Pablo M Sánchez', t: :person, nn: 'Pablo M Sánchez', sn: 'Pablo Sánchez', s: 'pablo-sanchez' },
|
188
|
+
{ n: "\xc3\x28", t: :person, nn: '()', sn: '()', s: '_' } # Invalid byte sequence in UTF-8
|
192
189
|
]
|
193
190
|
end
|
194
191
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: name-tamer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.9
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Xenapto
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-
|
11
|
+
date: 2014-07-04 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|