name-tamer 0.1.9 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: b556d5a36fcb89c56e435fd67a0f159987b7f8b9
4
- data.tar.gz: 7ac8e5b948e6edb607f367d4d8303bb3c97c0d9d
3
+ metadata.gz: 3532b7472b3daecb0bb11863268531c229771639
4
+ data.tar.gz: 0096dd16106d480f6c5e1e043dbf54896f787599
5
5
  SHA512:
6
- metadata.gz: 215966db363f5630a1b53671c95792057deb29306a07e1cf93c1772700be4b3fc3e4f9cab40a1ba6d39f58813eddead3226d1faa2583aa21e3d73f41eb2c1403
7
- data.tar.gz: 4d2bcd7d0f9b8556c548235c4d12ae0c92643399444a2ce70c5e9b8ef984577d54915827a86f4b243d3434775a1ddb37ccd1d0055b8645c56ab770bdba558613
6
+ metadata.gz: 4ee6d017e93b54acd10791f44a2920c46fe76faaafcb0171ab59c582f7d07c34036bc64610c5298ae363a33fa26fbedd1711800771acc29620beb6967adea10e
7
+ data.tar.gz: 23741e994c62fc8c746f826e3124824a29e3a5ecff22930a1ec350db0363b5b6d20d67bc5104d546f25c9301850a8944e936cb5e41c6320524d766303a2be69c
data/.rubocop.yml CHANGED
@@ -7,4 +7,4 @@ CyclomaticComplexity:
7
7
  ClassLength:
8
8
  Description: 'Avoid classes longer than 100 lines of code.'
9
9
  CountComments: false # count full line comments?
10
- Max: 316
10
+ Max: 321
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- name-tamer (0.1.8)
4
+ name-tamer (0.1.9)
5
5
 
6
6
  GEM
7
7
  remote: https://rubygems.org/
data/lib/name-tamer.rb CHANGED
@@ -23,11 +23,12 @@ class NameTamer
23
23
 
24
24
  def tidy_name
25
25
  unless @tidy_name
26
- @tidy_name = name.dup # Start with the name we've received
26
+ @tidy_name = name.dup # Start with the name we've received
27
27
 
28
- tidy_spacing # " John Smith " -> "John Smith"
29
- fix_encoding_errors # "Ren\u00c3\u00a9 Descartes" -> "Ren\u00e9 Descartes"
30
- consolidate_initials # "I. B. M." -> "I.B.M."
28
+ ensure_safe # Invalid byte sequence in UTF-8, for example
29
+ tidy_spacing # " John Smith " -> "John Smith"
30
+ fix_encoding_errors # "Ren\u00c3\u00a9 Descartes" -> "Ren\u00e9 Descartes"
31
+ consolidate_initials # "I. B. M." -> "I.B.M."
31
32
  end
32
33
 
33
34
  @tidy_name
@@ -111,6 +112,10 @@ class NameTamer
111
112
  # Tidy up the name we've received
112
113
  #--------------------------------------------------------
113
114
 
115
+ def ensure_safe
116
+ @tidy_name.ensure_safe
117
+ end
118
+
114
119
  def tidy_spacing
115
120
  @tidy_name
116
121
  .space_after_comma!
@@ -295,6 +300,7 @@ class NameTamer
295
300
  @contact_type = ct
296
301
  end
297
302
 
303
+ @tidy_name = nil
298
304
  @nice_name = nil
299
305
  @simple_name = nil
300
306
  @slug = nil
@@ -1,3 +1,3 @@
1
1
  class NameTamer
2
- VERSION = '0.1.9'
2
+ VERSION = '0.2.0'
3
3
  end
data/lib/string_extras.rb CHANGED
@@ -2,83 +2,74 @@
2
2
  class String
3
3
  # Strip illegal characters out completely
4
4
  def strip_unwanted!(filter)
5
- self.gsub!(filter, '')
6
- self # Allows chaining
5
+ substitute!(filter, '')
7
6
  end
8
7
 
9
8
  def strip_or_self!
10
- self.strip!
11
- self # Allows chaining
9
+ strip! || self
12
10
  end
13
11
 
14
12
  # Change any whitespace into our separator character
15
13
  def whitespace_to!(separator)
16
- self.gsub!(/[[:space:]]+/, separator)
17
- self # Allows chaining
14
+ substitute!(/[[:space:]]+/, separator)
18
15
  end
19
16
 
20
17
  # Ensure commas have exactly one space after them
21
18
  def space_after_comma!
22
- self.gsub!(/,[[:space:]]*/, ', ')
23
- self # Allows chaining
19
+ substitute!(/,[[:space:]]*/, ', ')
24
20
  end
25
21
 
26
22
  # Change some characters embedded in words to our separator character
27
23
  # e.g. example.com -> example-com
28
24
  def invalid_chars_to!(separator)
29
- self.gsub!(/(?<![[:space:]])[\.\/](?![[:space:]])/, separator)
30
- self # Allows chaining
25
+ substitute!(/(?<![[:space:]])[\.\/](?![[:space:]])/, separator)
31
26
  end
32
27
 
33
28
  # Make sure separators are not where they shouldn't be
34
29
  def fix_separators!(separator)
35
- unless separator.nil? || separator.empty?
36
- r = Regexp.escape(separator)
37
- # No more than one of the separator in a row.
38
- self.gsub!(/#{r}{2,}/, separator)
39
- # Remove leading/trailing separator.
40
- self.gsub!(/^#{r}|#{r}$/i, '')
41
- end
30
+ return self if separator.nil? || separator.empty?
42
31
 
43
- self # Allows chaining
32
+ r = Regexp.escape(separator)
33
+
34
+ # No more than one of the separator in a row.
35
+ substitute!(/#{r}{2,}/, separator)
36
+
37
+ # Remove leading/trailing separator.
38
+ substitute!(/^#{r}|#{r}$/i, '')
44
39
  end
45
40
 
46
41
  # Any characters that resemble latin characters might usefully be
47
42
  # transliterated into ones that are easy to type on an anglophone
48
43
  # keyboard.
49
44
  def approximate_latin_chars!
50
- self.gsub!(/[^\x00-\x7f]/u) { |char| APPROXIMATIONS[char] || char }
51
- self # Allows chaining
45
+ gsub!(/[^\x00-\x7f]/u) { |char| APPROXIMATIONS[char] || char } || self
52
46
  end
53
47
 
54
48
  # Strings that were wrongly encoded with single-byte encodings sometimes have
55
49
  # tell-tale substrings that we can put back into the correct UTF-8 character
56
50
  def fix_encoding_errors!
57
- self.gsub!(BAD_ENCODING_PATTERNS) { |substring| BAD_ENCODING[substring] || substring }
58
- self # Allows chaining
51
+ gsub!(BAD_ENCODING_PATTERNS) { |substring| BAD_ENCODING[substring] || substring } || self
59
52
  end
60
53
 
61
54
  def upcase_first_letter!
62
- self.gsub!(/\b\w/) { |first| first.upcase }
63
- self # Allows chaining
55
+ gsub!(/\b\w/) { |first| first.upcase } || self
64
56
  end
65
57
 
66
58
  def downcase_after_apostrophe!
67
- self.gsub!(/\'\w\b/) { |c| c.downcase } # Lowercase 's
68
- self # Allows chaining
59
+ gsub!(/\'\w\b/) { |c| c.downcase } || self # Lowercase 's
69
60
  end
70
61
 
71
62
  # Our list of terminal characters that indicate a non-celtic name used
72
63
  # to include o but we removed it because of MacMurdo.
73
64
  def fix_mac!
74
65
  if self =~ /\bMac[A-Za-z]{2,}[^acizj]\b/ || self =~ /\bMc/
75
- self.gsub!(/\b(Ma?c)([A-Za-z]+)/) { |_| Regexp.last_match[1] + Regexp.last_match[2].capitalize }
66
+ gsub!(/\b(Ma?c)([A-Za-z]+)/) { |_| Regexp.last_match[1] + Regexp.last_match[2].capitalize }
76
67
 
77
68
  # Fix Mac exceptions
78
69
  %w(
79
70
  MacEdo MacEvicius MacHado MacHar MacHin MacHlin MacIas MacIulis MacKie
80
71
  MacKle MacKlin MacKmin MacKmurdo MacQuarie MacLise MacKenzie
81
- ).each { |mac_name| self.gsub!(/\b#{mac_name}/, mac_name.capitalize) }
72
+ ).each { |mac_name| substitute!(/\b#{mac_name}/, mac_name.capitalize) }
82
73
  end
83
74
 
84
75
  self # Allows chaining
@@ -88,7 +79,7 @@ class String
88
79
  def fix_ff!
89
80
  %w(
90
81
  Fforbes Fforde Ffinch Ffrench Ffoulkes
91
- ).each { |ff_name| self.gsub!(ff_name, ff_name.downcase) }
82
+ ).each { |ff_name| substitute!(ff_name, ff_name.downcase) }
92
83
 
93
84
  self # Allows chaining
94
85
  end
@@ -98,13 +89,13 @@ class String
98
89
  # Fixes for name modifiers followed by an apostrophe, e.g. d'Artagnan, Commedia dell'Arte
99
90
  def fix_name_modifiers!
100
91
  NAME_MODIFIERS.each do |modifier|
101
- self.gsub!(/((?:[[:space:]]|^)#{modifier})([[:space:]]+|-)/) do |_|
92
+ gsub!(/((?:[[:space:]]|^)#{modifier})([[:space:]]+|-)/) do |_|
102
93
  "#{Regexp.last_match[1].rstrip.downcase}#{Regexp.last_match[2].tr(ASCII_SPACE, NONBREAKING_SPACE)}"
103
94
  end
104
95
  end
105
96
 
106
97
  %w(Dell D).each do |modifier|
107
- self.gsub!(/(.#{modifier}')(\w)/) { |_| "#{Regexp.last_match[1].rstrip.downcase}#{Regexp.last_match[2]}" }
98
+ gsub!(/(.#{modifier}')(\w)/) { |_| "#{Regexp.last_match[1].rstrip.downcase}#{Regexp.last_match[2]}" }
108
99
  end
109
100
 
110
101
  self # Allows chaining
@@ -113,16 +104,14 @@ class String
113
104
  # Upcase words with no vowels, e.g JPR Williams
114
105
  # Except Ng
115
106
  def upcase_initials!
116
- self.gsub!(/\b([bcdfghjklmnpqrstvwxz]+)\b/i) { |_| Regexp.last_match[1].upcase }
117
- self.gsub!(/\b(NG)\b/i) { |_| Regexp.last_match[1].capitalize } # http://en.wikipedia.org/wiki/Ng
118
-
119
- self # Allows chaining
107
+ gsub!(/\b([bcdfghjklmnpqrstvwxz]+)\b/i) { |_| Regexp.last_match[1].upcase }
108
+ gsub!(/\b(NG)\b/i) { |_| Regexp.last_match[1].capitalize } || self # http://en.wikipedia.org/wiki/Ng
120
109
  end
121
110
 
122
111
  # Fix known last names that have spaces (not hyphens!)
123
112
  def nbsp_in_compound_name!
124
113
  COMPOUND_NAMES.each do |compound_name|
125
- self.gsub!(compound_name, compound_name.tr(ASCII_SPACE, NONBREAKING_SPACE))
114
+ substitute!(compound_name, compound_name.tr(ASCII_SPACE, NONBREAKING_SPACE))
126
115
  end
127
116
 
128
117
  self # Allows chaining
@@ -130,25 +119,33 @@ class String
130
119
 
131
120
  def nbsp_in_name_modifier!
132
121
  NAME_MODIFIERS.each do |modifier|
133
- self.gsub!(/([[:space:]]#{modifier})([[:space:]])/i) { |_| "#{Regexp.last_match[1]}#{NONBREAKING_SPACE}" }
122
+ gsub!(/([[:space:]]#{modifier})([[:space:]])/i) { |_| "#{Regexp.last_match[1]}#{NONBREAKING_SPACE}" }
134
123
  end
135
124
 
136
125
  self # Allows chaining
137
126
  end
138
127
 
139
128
  def remove_periods_from_initials!
140
- self.gsub!(/\b([a-z])\./i) { |_| Regexp.last_match[1] }
141
- self # Allows chaining
129
+ gsub!(/\b([a-z])\./i) { |_| Regexp.last_match[1] } || self
142
130
  end
143
131
 
144
132
  def remove_spaces_from_initials!
145
- self.gsub!(/\b([a-z])(\.)* \b(?![a-z0-9']{2,})/i) { |_| "#{Regexp.last_match[1]}#{Regexp.last_match[2]}" }
146
- self # Allows chaining
133
+ gsub!(/\b([a-z])(\.)* \b(?![a-z0-9'\u00C0-\u00FF]{2,})/i) do |_|
134
+ "#{Regexp.last_match[1]}#{Regexp.last_match[2]}"
135
+ end || self
147
136
  end
148
137
 
149
138
  def ensure_space_after_initials!
150
- self.gsub!(/\b([a-z]\.)(?=[a-z0-9]{2,})/i) { |_| "#{Regexp.last_match[1]} " }
151
- self # Allows chaining
139
+ gsub!(/\b([a-z]\.)(?=[a-z0-9]{2,})/i) { |_| "#{Regexp.last_match[1]} " } || self
140
+ end
141
+
142
+ def ensure_safe
143
+ return if valid_encoding?
144
+ encode!('UTF-8', invalid: :replace, undef: :replace, replace: '')
145
+ end
146
+
147
+ def substitute!(pattern, replacement)
148
+ gsub!(pattern, replacement) || self
152
149
  end
153
150
 
154
151
  NONBREAKING_SPACE = "\u00a0"
@@ -183,12 +183,9 @@ describe NameTamer do
183
183
  sn: 'Scout Loyalty Optimizer',
184
184
  s: 'scout-loyalty-optimizer'
185
185
  },
186
- { n: 'René Descartes',
187
- t: :person,
188
- nn: 'René Descartes',
189
- sn: 'René Descartes',
190
- s: 'rene-descartes'
191
- }
186
+ { n: 'René Descartes', t: :person, nn: 'René Descartes', sn: 'René Descartes', s: 'rene-descartes' },
187
+ { n: 'Pablo M Sánchez', t: :person, nn: 'Pablo M Sánchez', sn: 'Pablo Sánchez', s: 'pablo-sanchez' },
188
+ { n: "\xc3\x28", t: :person, nn: '()', sn: '()', s: '_' } # Invalid byte sequence in UTF-8
192
189
  ]
193
190
  end
194
191
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: name-tamer
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.9
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Xenapto
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-06-30 00:00:00.000000000 Z
11
+ date: 2014-07-04 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler