babosa 1.0.4 → 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (56) hide show
  1. checksums.yaml +4 -4
  2. checksums.yaml.gz.sig +0 -0
  3. data.tar.gz.sig +0 -0
  4. data/Changelog.md +12 -0
  5. data/README.md +81 -119
  6. data/Rakefile +9 -8
  7. data/lib/babosa.rb +2 -4
  8. data/lib/babosa/identifier.rb +104 -129
  9. data/lib/babosa/transliterator/base.rb +57 -56
  10. data/lib/babosa/transliterator/bulgarian.rb +3 -2
  11. data/lib/babosa/transliterator/cyrillic.rb +5 -5
  12. data/lib/babosa/transliterator/danish.rb +3 -3
  13. data/lib/babosa/transliterator/german.rb +3 -2
  14. data/lib/babosa/transliterator/greek.rb +4 -3
  15. data/lib/babosa/transliterator/hindi.rb +3 -2
  16. data/lib/babosa/transliterator/latin.rb +5 -5
  17. data/lib/babosa/transliterator/macedonian.rb +3 -2
  18. data/lib/babosa/transliterator/norwegian.rb +3 -3
  19. data/lib/babosa/transliterator/romanian.rb +3 -2
  20. data/lib/babosa/transliterator/russian.rb +3 -2
  21. data/lib/babosa/transliterator/serbian.rb +29 -27
  22. data/lib/babosa/transliterator/spanish.rb +2 -2
  23. data/lib/babosa/transliterator/swedish.rb +3 -3
  24. data/lib/babosa/transliterator/turkish.rb +8 -8
  25. data/lib/babosa/transliterator/ukrainian.rb +5 -4
  26. data/lib/babosa/transliterator/vietnamese.rb +4 -3
  27. data/lib/babosa/version.rb +3 -1
  28. data/spec/{babosa_spec.rb → identifier_spec.rb} +13 -14
  29. data/spec/spec_helper.rb +6 -6
  30. data/spec/transliterators/base_spec.rb +5 -6
  31. data/spec/transliterators/bulgarian_spec.rb +4 -5
  32. data/spec/transliterators/danish_spec.rb +5 -6
  33. data/spec/transliterators/german_spec.rb +4 -5
  34. data/spec/transliterators/greek_spec.rb +7 -7
  35. data/spec/transliterators/hindi_spec.rb +7 -7
  36. data/spec/transliterators/latin_spec.rb +3 -4
  37. data/spec/transliterators/macedonian_spec.rb +3 -4
  38. data/spec/transliterators/norwegian_spec.rb +4 -4
  39. data/spec/transliterators/polish_spec.rb +3 -5
  40. data/spec/transliterators/romanian_spec.rb +5 -6
  41. data/spec/transliterators/russian_spec.rb +3 -4
  42. data/spec/transliterators/serbian_spec.rb +6 -7
  43. data/spec/transliterators/spanish_spec.rb +4 -5
  44. data/spec/transliterators/swedish_spec.rb +7 -7
  45. data/spec/transliterators/turkish_spec.rb +24 -24
  46. data/spec/transliterators/ukrainian_spec.rb +74 -75
  47. data/spec/transliterators/vietnamese_spec.rb +10 -10
  48. metadata +44 -38
  49. metadata.gz.sig +2 -0
  50. data/lib/babosa/utf8/active_support_proxy.rb +0 -38
  51. data/lib/babosa/utf8/dumb_proxy.rb +0 -49
  52. data/lib/babosa/utf8/java_proxy.rb +0 -22
  53. data/lib/babosa/utf8/mappings.rb +0 -193
  54. data/lib/babosa/utf8/proxy.rb +0 -125
  55. data/lib/babosa/utf8/unicode_proxy.rb +0 -23
  56. data/spec/utf8_proxy_spec.rb +0 -52
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 7878565d1bfb436b7110d42e81dff1eb589f86e10e0919ded4b2de695784fae3
4
- data.tar.gz: f6f3e7cc2b4876a940ec66fa1895f9cd3390526c62cebc888110494b72d77fe5
3
+ metadata.gz: 84fa74468828d44925314cc1bcf6d297aa3a94ebfd993bca491686c92c936667
4
+ data.tar.gz: 618a52e9d52878fe8bc5933879115d32dd31b212a9cd2178e78bb7320d186c43
5
5
  SHA512:
6
- metadata.gz: 6ea0ff964d688d9ca29710da13207c12d49509c13d72f8198a33d38141f7f6ad39b5792f5937a94a44b3976dcd68b24233c11b3418dc9d100a170caa799404ed
7
- data.tar.gz: 1b37fd01a1907f244e112171da5dd942105181b860e6dc05ede3e1798bfdf1e50e8543b69757109dc9bd7dfddecb6ba2fbec2b59f38843223052c559f085490e
6
+ metadata.gz: d8d86867859c5af0f8f80631b5d7aefe14cf87d61ec5908fa08d68fa38cc2e287c77c2c59c41377eb41e6524ba723ed511d1d09e5837259aad4a1ced7162c25a
7
+ data.tar.gz: 005dbd796a469420ddc905e2fcf4261703f3777ab29000102039e52b907d51d51c3ab69f11779bfc3e76ed19c24fad997733d008c7d08f50dc1446fa230b7e3d
checksums.yaml.gz.sig ADDED
Binary file
data.tar.gz.sig ADDED
Binary file
data/Changelog.md CHANGED
@@ -1,5 +1,17 @@
1
1
  # Babosa Changelog
2
2
 
3
+ ## 2.0.0
4
+
5
+ This release contains no important changes. I had a week off from work and
6
+ decided to refactor the code. However there are some small breaking changes so
7
+ I have released it as 2.0.0.
8
+
9
+ * Refactor internals for simplicity
10
+ * Use built-in Ruby UTF-8 support in place of other gems.
11
+ * Drop support for Ruby < 2.5.0.
12
+ * `Babosa::Identifier#word_chars` no longer removes dashes
13
+ * `Babosa::Identifier#to_ruby_method` default argument `allow_bangs` is now a keyword argument
14
+
3
15
  ## 1.0.4
4
16
 
5
17
  * Fix nil being cast to frozen string (https://github.com/norman/babosa/pull/52)
data/README.md CHANGED
@@ -1,7 +1,6 @@
1
1
  # Babosa
2
2
 
3
- [![Build Status](https://travis-ci.org/norman/babosa.png?branch=master)](https://travis-ci.org/norman/babosa)
4
-
3
+ [![Build Status](https://github.com/norman/babosa/actions/workflows/main.yml/badge.svg)](https://github.com/norman/babosa/actions)
5
4
 
6
5
  Babosa is a library for creating human-friendly identifiers, aka "slugs". It can
7
6
  also be useful for normalizing and sanitizing data.
@@ -15,12 +14,16 @@ FriendlyId.
15
14
 
16
15
  ### Transliterate UTF-8 characters to ASCII
17
16
 
18
- "Gölcük, Turkey".to_slug.transliterate.to_s #=> "Golcuk, Turkey"
17
+ ```ruby
18
+ "Gölcük, Turkey".to_slug.transliterate.to_s #=> "Golcuk, Turkey"
19
+ ```
19
20
 
20
21
  ### Locale sensitive transliteration, with support for many languages
21
22
 
22
- "Jürgen Müller".to_slug.transliterate.to_s #=> "Jurgen Muller"
23
- "Jürgen Müller".to_slug.transliterate(:german).to_s #=> "Juergen Mueller"
23
+ ```ruby
24
+ "Jürgen Müller".to_slug.transliterate.to_s #=> "Jurgen Muller"
25
+ "Jürgen Müller".to_slug.transliterate(:german).to_s #=> "Juergen Mueller"
26
+ ```
24
27
 
25
28
  Currently supported languages include:
26
29
 
@@ -28,6 +31,7 @@ Currently supported languages include:
28
31
  * Danish
29
32
  * German
30
33
  * Greek
34
+ * Hindi
31
35
  * Macedonian
32
36
  * Norwegian
33
37
  * Romanian
@@ -35,124 +39,125 @@ Currently supported languages include:
35
39
  * Serbian
36
40
  * Spanish
37
41
  * Swedish
42
+ * Turkish
38
43
  * Ukrainian
44
+ * Vietnamese
45
+
46
+ Additionally there are generic transliterators for transliterating from the
47
+ Cyrillic alphabet and Latin alphabet with diacritics. The Latin transliterator
48
+ can be used, for example, with Czech. There is also a transliterator named
49
+ "Hindi" which may be sufficient for other Indic languages using Devanagari, but
50
+ I do not know enough to say whether the transliterations would make sense.
39
51
 
40
52
  I'll gladly accept contributions from fluent speakers to support more languages.
41
53
 
42
54
  ### Strip non-ASCII characters
43
55
 
44
- "Gölcük, Turkey".to_slug.to_ascii.to_s #=> "Glck, Turkey"
56
+ ```ruby
57
+ "Gölcük, Turkey".to_slug.to_ascii.to_s #=> "Glck, Turkey"
58
+ ```
45
59
 
46
60
  ### Truncate by characters
47
61
 
48
- "üüü".to_slug.truncate(2).to_s #=> "üü"
62
+ ```ruby
63
+ "üüü".to_slug.truncate(2).to_s #=> "üü"
64
+ ```
49
65
 
50
66
  ### Truncate by bytes
51
67
 
52
68
  This can be useful to ensure the generated slug will fit in a database column
53
69
  whose length is limited by bytes rather than UTF-8 characters.
54
70
 
55
- "üüü".to_slug.truncate_bytes(2).to_s #=> "ü"
71
+ ```ruby
72
+ "üüü".to_slug.truncate_bytes(2).to_s #=> "ü"
73
+ ```
56
74
 
57
75
  ### Remove punctuation chars
58
76
 
59
- "this is, um, **really** cool, huh?".to_slug.word_chars.to_s #=> "this is um really cool huh"
77
+ ```ruby
78
+ "this is, um, **really** cool, huh?".to_slug.word_chars.to_s #=> "this is um really cool huh"
79
+ ```
60
80
 
61
81
  ### All-in-one
62
82
 
63
- "Gölcük, Turkey".to_slug.normalize.to_s #=> "golcuk-turkey"
83
+ ```ruby
84
+ "Gölcük, Turkey".to_slug.normalize.to_s #=> "golcuk-turkey"
85
+ ```
64
86
 
65
87
  ### Other stuff
66
88
 
67
- #### Using Babosa With FriendlyId 4
68
-
69
- require "babosa"
70
-
71
- class Person < ActiveRecord::Base
72
- friendly_id :name, use: :slugged
73
-
74
- def normalize_friendly_id(input)
75
- input.to_s.to_slug.normalize(transliterations: :russian).to_s
76
- end
77
- end
78
-
79
- #### Pedantic UTF-8 support
89
+ #### Using Babosa With FriendlyId 4+
80
90
 
81
- Babosa goes out of its way to handle [nasty Unicode issues you might never think
82
- you would have](https://github.com/norman/enc/blob/master/equivalence.rb) by
83
- checking, sanitizing and normalizing your string input.
91
+ ```ruby
92
+ require "babosa"
84
93
 
85
- It will automatically use whatever Unicode library you have loaded before
86
- Babosa, or fall back to a simple built-in library. Supported
87
- Unicode libraries include:
94
+ class Person < ActiveRecord::Base
95
+ friendly_id :name, use: :slugged
88
96
 
89
- * Java (only on JRuby of course)
90
- * Active Support
91
- * [Unicode](https://github.com/blackwinter/unicode)
92
- * Built-in
97
+ def normalize_friendly_id(input)
98
+ input.to_s.to_slug.normalize(transliterations: :russian).to_s
99
+ end
100
+ end
101
+ ```
93
102
 
94
- This built-in module is much faster than Active Support but much slower than
95
- Java or Unicode. It can only do **very** naive Unicode composition to ensure
96
- that, for example, "é" will always be composed to a single codepoint rather than
97
- an "e" and a "´" - making it safe to use as a hash key.
103
+ #### UTF-8 support
98
104
 
99
- But seriously - save yourself the headache and install a real Unicode library.
100
- If you are using Babosa with a language that uses the Cyrillic alphabet, Babosa
101
- requires either Unicode, Active Support or Java.
105
+ Babosa normalizes all input strings [to NFC](https://en.wikipedia.org/wiki/Unicode_equivalence#Normal_forms).
102
106
 
103
107
  #### Ruby Method Names
104
108
 
105
- Babosa can also generate strings for Ruby method names. (Yes, Ruby 1.9 can use
109
+ Babosa can generate strings for Ruby method names. (Yes, Ruby 1.9+ can use
106
110
  UTF-8 chars in method names, but you may not want to):
107
111
 
108
112
 
109
- "this is a method".to_slug.to_ruby_method! #=> this_is_a_method
110
- "über cool stuff!".to_slug.to_ruby_method! #=> uber_cool_stuff!
113
+ ```ruby
114
+ "this is a method".to_slug.to_ruby_method! #=> this_is_a_method
115
+ "über cool stuff!".to_slug.to_ruby_method! #=> uber_cool_stuff!
111
116
 
112
- # You can also disallow trailing punctuation chars
113
- "über cool stuff!".to_slug.to_ruby_method(false) #=> uber_cool_stuff
117
+ # You can also disallow trailing punctuation chars
118
+ "über cool stuff!".to_slug.to_ruby_method(allow_bangs: false) #=> uber_cool_stuff
119
+ ```
114
120
 
115
121
  #### Easy to Extend
116
122
 
117
123
  You can add custom transliterators for your language with very little code. For
118
124
  example here's the transliterator for German:
119
125
 
120
- # encoding: utf-8
121
- module Babosa
122
- module Transliterator
123
- class German < Latin
124
- APPROXIMATIONS = {
125
- "ä" => "ae",
126
- "ö" => "oe",
127
- "ü" => "ue",
128
- "Ä" => "Ae",
129
- "Ö" => "Oe",
130
- "Ü" => "Ue"
131
- }
132
- end
133
- end
126
+ ```ruby
127
+ module Babosa
128
+ module Transliterator
129
+ class German < Latin
130
+ APPROXIMATIONS = {
131
+ "ä" => "ae",
132
+ "ö" => "oe",
133
+ "ü" => "ue",
134
+ "Ä" => "Ae",
135
+ "Ö" => "Oe",
136
+ "Ü" => "Ue"
137
+ }
134
138
  end
139
+ end
140
+ end
141
+ ```
135
142
 
136
143
  And a spec (you can use this as a template):
137
144
 
138
- # encoding: utf-8
139
- require File.expand_path("../../spec_helper", __FILE__)
145
+ ```ruby
146
+ require "spec_helper"
140
147
 
141
- describe Babosa::Transliterator::German do
148
+ describe Babosa::Transliterator::German do
149
+ let(:t) { described_class.instance }
150
+ it_behaves_like "a latin transliterator"
142
151
 
143
- let(:t) { described_class.instance }
144
- it_behaves_like "a latin transliterator"
145
-
146
- it "should transliterate Eszett" do
147
- t.transliterate("ß").should eql("ss")
148
- end
149
-
150
- it "should transliterate vowels with umlauts" do
151
- t.transliterate("üöä").should eql("ueoeae")
152
- end
153
-
154
- end
152
+ it "should transliterate Eszett" do
153
+ t.transliterate("ß").should eql("ss")
154
+ end
155
155
 
156
+ it "should transliterate vowels with umlauts" do
157
+ t.transliterate("üöä").should eql("ueoeae")
158
+ end
159
+ end
160
+ ```
156
161
 
157
162
  ### Rails 3.x and higher
158
163
 
@@ -167,46 +172,6 @@ and
167
172
  [parameterize](http://api.rubyonrails.org/classes/ActiveSupport/Inflector.html#method-i-parameterize)
168
173
  to see if they suit your needs.
169
174
 
170
- ### Babosa vs. Stringex
171
-
172
- Babosa provides much of the functionality provided by the
173
- [Stringex](https://github.com/rsl/stringex) gem, but in the subjective opinion
174
- of the author, is for most use cases a better choice.
175
-
176
- #### Fewer Features
177
-
178
- Stringex offers functionality for storing slugs in an Active Record model, like
179
- a simple version of [FriendlyId](http://github.com/norman/friendly_id), in
180
- addition to string processing. Babosa only does string processing.
181
-
182
- #### Less Aggressive Unicode Transliteration
183
-
184
- Stringex uses an agressive Unicode to ASCII mapping which outputs gibberish for
185
- almost anything but Western European langages and Mandarin Chinese. Babosa
186
- supports only languages for which fluent speakers have provided
187
- transliterations, to ensure that the output makes sense to users.
188
-
189
- #### Unicode Support
190
-
191
- Stringex does no Unicode normalization or validation before transliterating
192
- strings, so if you pass in strings with encoding errors or with different
193
- Unicode normalizations, you'll get unpredictable results.
194
-
195
- #### No Locale Assumptions
196
-
197
- Babosa avoids making assumptions about locales like Stringex does, so it doesn't
198
- offer transliterations like this out of the box:
199
-
200
- "$12 worth of Ruby power".to_url => "12-dollars-worth-of-ruby-power"
201
-
202
- This is because the symbol "$" is used in many Latin American countries for the
203
- peso. Stringex does this in many places, for example, transliterating all Han
204
- characters into Pinyin, effectively treating Japanese text as if it were
205
- Mandarin Chinese.
206
-
207
-
208
- ### More info
209
-
210
175
  Please see the [API docs](http://rubydoc.info/github/norman/babosa/master/frames) and source code for
211
176
  more info.
212
177
 
@@ -218,9 +183,6 @@ Babosa can be installed via Rubygems:
218
183
 
219
184
  You can get the source code from its [Github repository](http://github.com/norman/babosa).
220
185
 
221
- Babosa is tested to be compatible with Ruby 2.x, JRuby 1.7+, and
222
- Rubinius 2.x It's probably compatible with other Rubies as well.
223
-
224
186
  ## Reporting bugs
225
187
 
226
188
  Please use Babosa's [Github issue
@@ -229,7 +191,7 @@ tracker](http://github.com/norman/babosa/issues).
229
191
 
230
192
  ## Misc
231
193
 
232
- "Babosa" means slug in Spanish.
194
+ "Babosa" means "slug" in Spanish.
233
195
 
234
196
  ## Author
235
197
 
@@ -258,7 +220,7 @@ Many thanks to the following people for their help:
258
220
 
259
221
  ## Copyright
260
222
 
261
- Copyright (c) 2010-2013 Norman Clarke
223
+ Copyright (c) 2010-2020 Norman Clarke
262
224
 
263
225
  Permission is hereby granted, free of charge, to any person obtaining a copy of
264
226
  this software and associated documentation files (the "Software"), to deal in
data/Rakefile CHANGED
@@ -1,10 +1,13 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require "rubygems"
2
4
  require "rake/testtask"
3
5
  require "rake/clean"
4
6
  require "rubygems/package_task"
7
+ require "rubocop/rake_task"
5
8
 
6
- task :default => :spec
7
- task :test => :spec
9
+ task default: [:rubocop, :spec]
10
+ task test: :spec
8
11
 
9
12
  CLEAN << "pkg" << "doc" << "coverage" << ".yardoc"
10
13
 
@@ -14,6 +17,7 @@ begin
14
17
  t.options = ["--output-dir=doc"]
15
18
  end
16
19
  rescue LoadError
20
+ puts "Yard not present"
17
21
  end
18
22
 
19
23
  begin
@@ -23,12 +27,9 @@ begin
23
27
  Rake::Task["spec"].execute
24
28
  end
25
29
  rescue LoadError
30
+ puts "SimpleCov not present"
26
31
  end
27
32
 
28
- gemspec = File.expand_path("../babosa.gemspec", __FILE__)
29
- if File.exist? gemspec
30
- Gem::PackageTask.new(eval(File.read(gemspec))) { |pkg| }
31
- end
32
-
33
- require 'rspec/core/rake_task'
33
+ require "rspec/core/rake_task"
34
34
  RSpec::Core::RakeTask.new(:spec)
35
+ RuboCop::RakeTask.new
data/lib/babosa.rb CHANGED
@@ -1,7 +1,6 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Babosa
2
- def self.jruby15?
3
- JRUBY_VERSION >= "1.5" rescue false
4
- end
5
4
  end
6
5
 
7
6
  class String
@@ -12,5 +11,4 @@ class String
12
11
  end
13
12
 
14
13
  require "babosa/transliterator/base"
15
- require "babosa/utf8/proxy"
16
14
  require "babosa/identifier"
@@ -1,16 +1,6 @@
1
- # encoding: utf-8
2
- module Babosa
3
-
4
- # Codepoints for characters that will be deleted by +#word_chars!+.
5
- STRIPPABLE = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 14, 15, 16, 17, 18, 19,
6
- 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 33, 34, 35, 36, 37, 38, 39,
7
- 40, 41, 42, 43, 44, 45, 46, 47, 58, 59, 60, 61, 62, 63, 64, 91, 92, 93, 94,
8
- 96, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136,
9
- 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151,
10
- 152, 153, 154, 155, 156, 157, 158, 159, 161, 162, 163, 164, 165, 166, 167,
11
- 168, 169, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 182, 183, 184,
12
- 185, 187, 188, 189, 190, 191, 215, 247, 8203, 8204, 8205, 8239, 65279]
1
+ # frozen_string_literal: true
13
2
 
3
+ module Babosa
14
4
  # This class provides some string-manipulation methods specific to slugs.
15
5
  #
16
6
  # Note that this class includes many "bang methods" such as {#clean!} and
@@ -20,7 +10,7 @@ module Babosa
20
10
  # it is generated dynamically.
21
11
  #
22
12
  # All of the bang methods return an instance of String, while the bangless
23
- # versions return an instance of Babosa::Identifier, so that calls to methods
13
+ # versions return an instance of {Babosa::Identifier}, so that calls to methods
24
14
  # specific to this class can be chained:
25
15
  #
26
16
  # string = Identifier.new("hello world")
@@ -29,71 +19,47 @@ module Babosa
29
19
  #
30
20
  # @see http://www.utf8-chartable.de/unicode-utf8-table.pl?utf8=dec Unicode character table
31
21
  class Identifier
32
-
33
22
  Error = Class.new(StandardError)
34
23
 
35
24
  attr_reader :wrapped_string
36
25
  alias to_s wrapped_string
37
26
 
38
- @@utf8_proxy = if Babosa.jruby15?
39
- UTF8::JavaProxy
40
- elsif defined? Unicode::VERSION
41
- UTF8::UnicodeProxy
42
- elsif defined? ActiveSupport
43
- UTF8::ActiveSupportProxy
44
- else
45
- UTF8::DumbProxy
46
- end
47
-
48
- # Return the proxy used for UTF-8 support.
49
- # @see Babosa::UTF8::Proxy
50
- def self.utf8_proxy
51
- @@utf8_proxy
52
- end
53
-
54
- # Set a proxy object used for UTF-8 support.
55
- # @see Babosa::UTF8::Proxy
56
- def self.utf8_proxy=(obj)
57
- @@utf8_proxy = obj
58
- end
59
-
60
27
  def method_missing(symbol, *args, &block)
61
28
  @wrapped_string.__send__(symbol, *args, &block)
62
29
  end
63
30
 
31
+ def respond_to_missing?(name, include_all)
32
+ @wrapped_string.respond_to?(name, include_all)
33
+ end
34
+
64
35
  # @param string [#to_s] The string to use as the basis of the Identifier.
65
36
  def initialize(string)
66
- @wrapped_string = string.to_s
37
+ @wrapped_string = string.to_s.dup
67
38
  tidy_bytes!
68
39
  normalize_utf8!
69
40
  end
70
41
 
71
- def ==(value)
72
- @wrapped_string.to_s == value.to_s
42
+ def ==(other)
43
+ to_s == other.to_s
73
44
  end
74
45
 
75
- def eql?(value)
76
- @wrapped_string == value
46
+ def eql?(other)
47
+ self == other
77
48
  end
78
49
 
79
- def empty?
80
- # included to make this class :respond_to? :empty for compatibility with Active Support's
81
- # #blank?
82
- @wrapped_string.empty?
83
- end
84
-
85
- # Approximate an ASCII string. This works only for Western strings using
86
- # characters that are Roman-alphabet characters + diacritics. Non-letter
87
- # characters are left unmodified.
50
+ # Approximate an ASCII string. This works only for strings using characters
51
+ # that are Roman-alphabet characters + diacritics. Non-letter characters
52
+ # are left unmodified.
88
53
  #
89
- # string = Identifier.new "Łódź
54
+ # string = Identifier.new "Łódź, Poland"
90
55
  # string.transliterate # => "Lodz, Poland"
91
56
  # string = Identifier.new "日本"
92
57
  # string.transliterate # => "日本"
93
58
  #
94
- # You can pass any key(s) from +Characters.approximations+ as arguments. This allows
95
- # for contextual approximations. Various languages are supported, you can see which ones
96
- # by looking at the source of {Babosa::Transliterator::Base}.
59
+ # You can pass the names of any transliterator class as arguments. This
60
+ # allows for contextual approximations. Various languages are supported,
61
+ # you can see which ones by looking at the source of
62
+ # {Babosa::Transliterator::Base}.
97
63
  #
98
64
  # string = Identifier.new "Jürgen Müller"
99
65
  # string.transliterate # => "Jurgen Muller"
@@ -111,7 +77,7 @@ module Babosa
111
77
  # to remove non-ASCII characters such as "¡" and "¿", use {#to_ascii!}:
112
78
  #
113
79
  # string.transliterate!(:spanish) # => "¡Feliz anio!"
114
- # string.transliterate! # => "¡Feliz anio!"
80
+ # string.to_ascii! # => "Feliz anio!"
115
81
  #
116
82
  # @param *args <Symbol>
117
83
  # @return String
@@ -122,40 +88,50 @@ module Babosa
122
88
  transliterator = Transliterator.get(kind).instance
123
89
  @wrapped_string = transliterator.transliterate(@wrapped_string)
124
90
  end
125
- @wrapped_string
91
+ to_s
126
92
  end
127
93
 
128
94
  # Converts dashes to spaces, removes leading and trailing spaces, and
129
95
  # replaces multiple whitespace characters with a single space.
96
+ #
130
97
  # @return String
131
98
  def clean!
132
- @wrapped_string = @wrapped_string.gsub("-", " ").squeeze(" ").strip
99
+ gsub!(/[- ]+/, " ")
100
+ strip!
101
+ to_s
133
102
  end
134
103
 
135
104
  # Remove any non-word characters. For this library's purposes, this means
136
- # anything other than letters, numbers, spaces, newlines and linefeeds.
105
+ # anything other than letters, numbers, spaces, underscores, dashes,
106
+ # newlines, and linefeeds.
107
+ #
137
108
  # @return String
138
109
  def word_chars!
139
- @wrapped_string = (unpack("U*") - Babosa::STRIPPABLE).pack("U*")
110
+ # `[^\p{letter}]` = Any character that is not a Unicode letter
111
+ # `&&` = intersect with the following character class
112
+ # `[^ \d_\-\n\r]` = Anything other than space, digit, underscore, dash, newline or carriage return
113
+ gsub!(/[[^\p{letter}]&&[^ \d_\-\n\r]]/, "")
114
+ to_s
140
115
  end
141
116
 
142
117
  # Normalize the string for use as a URL slug. Note that in this context,
143
118
  # +normalize+ means, strip, remove non-letters/numbers, downcasing,
144
119
  # truncating to 255 bytes and converting whitespace to dashes.
145
- # @param Options
120
+ #
121
+ # @param options [Hash]
146
122
  # @return String
147
- def normalize!(options = nil)
148
- options = default_normalize_options.merge(options || {})
123
+ def normalize!(options = {})
124
+ options = default_normalize_options.merge(options)
149
125
 
150
- if translit_option = options[:transliterate]
151
- if translit_option != true
152
- transliterate!(*translit_option)
153
- else
126
+ if options[:transliterate]
127
+ option = options[:transliterate]
128
+ if option == true
154
129
  transliterate!(*options[:transliterations])
130
+ else
131
+ transliterate!(*option)
155
132
  end
156
133
  end
157
134
  to_ascii! if options[:to_ascii]
158
- clean!
159
135
  word_chars!
160
136
  clean!
161
137
  downcase!
@@ -164,105 +140,103 @@ module Babosa
164
140
  end
165
141
 
166
142
  # Normalize a string so that it can safely be used as a Ruby method name.
167
- def to_ruby_method!(allow_bangs = true)
168
- leader, trailer = @wrapped_string.strip.scan(/\A(.+)(.)\z/).flatten
169
- leader = leader.to_s.dup
170
- trailer = trailer.to_s.dup
171
- if allow_bangs
172
- trailer.downcase!
173
- trailer.gsub!(/[^a-z0-9!=\\?]/, '')
174
- else
175
- trailer.downcase!
176
- trailer.gsub!(/[^a-z0-9]/, '')
177
- end
178
- id = leader.to_identifier
179
- id.transliterate!
180
- id.to_ascii!
181
- id.clean!
182
- id.word_chars!
183
- id.clean!
184
- @wrapped_string = id.to_s + trailer
185
- if @wrapped_string == ""
186
- raise Error, "Input generates impossible Ruby method name"
187
- end
143
+ #
144
+ # @param allow_bangs [Boolean]
145
+ # @return String
146
+ def to_ruby_method!(allow_bangs: true)
147
+ last_char = self[-1]
148
+ transliterate!
149
+ to_ascii!
150
+ word_chars!
151
+ strip_leading_digits!
152
+ clean!
153
+ @wrapped_string += last_char if allow_bangs && ["!", "?"].include?(last_char)
154
+ raise Error, "Input generates impossible Ruby method name" if self == ""
155
+
188
156
  with_separators!("_")
189
157
  end
190
158
 
191
159
  # Delete any non-ascii characters.
160
+ #
192
161
  # @return String
193
162
  def to_ascii!
194
- @wrapped_string = @wrapped_string.gsub(/[^\x00-\x7f]/u, '')
163
+ gsub!(/[^\x00-\x7f]/u, "")
164
+ to_s
195
165
  end
196
166
 
197
167
  # Truncate the string to +max+ characters.
168
+ #
198
169
  # @example
199
170
  # "üéøá".to_identifier.truncate(3) #=> "üéø"
171
+ #
172
+ # @param max [Integer] The maximum number of characters.
200
173
  # @return String
201
174
  def truncate!(max)
202
- @wrapped_string = unpack("U*")[0...max].pack("U*")
175
+ @wrapped_string = slice(0, max)
203
176
  end
204
177
 
205
178
  # Truncate the string to +max+ bytes. This can be useful for ensuring that
206
179
  # a UTF-8 string will always fit into a database column with a certain max
207
180
  # byte length. The resulting string may be less than +max+ if the string must
208
181
  # be truncated at a multibyte character boundary.
182
+ #
209
183
  # @example
210
184
  # "üéøá".to_identifier.truncate_bytes(3) #=> "ü"
185
+ #
186
+ # @param max [Integer] The maximum number of bytes.
211
187
  # @return String
212
188
  def truncate_bytes!(max)
213
- return @wrapped_string if @wrapped_string.bytesize <= max
214
- curr = 0
215
- new = []
216
- unpack("U*").each do |char|
217
- break if curr > max
218
- char = [char].pack("U")
219
- curr += char.bytesize
220
- if curr <= max
221
- new << char
222
- end
223
- end
224
- @wrapped_string = new.join
189
+ truncate!(max)
190
+ chop! until bytesize <= max
225
191
  end
226
192
 
227
193
  # Replaces whitespace with dashes ("-").
194
+ #
195
+ # @param char [String] the separator character to use.
228
196
  # @return String
229
197
  def with_separators!(char = "-")
230
- @wrapped_string = @wrapped_string.gsub(/\s/u, char)
231
- end
232
-
233
- # Perform UTF-8 sensitive upcasing.
234
- # @return String
235
- def upcase!
236
- @wrapped_string = @@utf8_proxy.upcase(@wrapped_string)
198
+ gsub!(/\s/u, char)
199
+ to_s
237
200
  end
238
201
 
239
- # Perform UTF-8 sensitive downcasing.
202
+ # Perform Unicode composition on the wrapped string.
203
+ #
240
204
  # @return String
241
- def downcase!
242
- @wrapped_string = @@utf8_proxy.downcase(@wrapped_string)
205
+ def normalize_utf8!
206
+ unicode_normalize!(:nfc)
207
+ to_s
243
208
  end
244
209
 
245
- # Perform Unicode composition on the wrapped string.
210
+ # Strip any leading digits.
211
+ #
246
212
  # @return String
247
- def normalize_utf8!
248
- @wrapped_string = @@utf8_proxy.normalize_utf8(@wrapped_string)
213
+ def strip_leading_digits!
214
+ gsub!(/^\d+/, "")
215
+ to_s
249
216
  end
250
217
 
251
218
  # Attempt to convert characters encoded using CP1252 and ISO-8859-1 to
252
219
  # UTF-8.
253
220
  # @return String
254
221
  def tidy_bytes!
255
- @wrapped_string = @@utf8_proxy.tidy_bytes(@wrapped_string)
222
+ scrub! do |bad|
223
+ bad.encode(Encoding::UTF_8, Encoding::Windows_1252, invalid: :replace, undef: :replace)
224
+ end
225
+ to_s
256
226
  end
257
227
 
258
- %w[transliterate clean downcase word_chars normalize normalize_utf8
259
- tidy_bytes to_ascii to_ruby_method truncate truncate_bytes upcase
260
- with_separators].each do |method|
261
- class_eval(<<-EOM, __FILE__, __LINE__ + 1)
228
+ %w[clean downcase normalize normalize_utf8 strip_leading_digits
229
+ tidy_bytes to_ascii transliterate truncate truncate_bytes upcase
230
+ with_separators word_chars].each do |method|
231
+ class_eval(<<-METHOD, __FILE__, __LINE__ + 1)
262
232
  def #{method}(*args)
263
- send_to_new_instance(:#{method}!, *args)
233
+ with_new_instance { |id| id.send(:#{method}!, *args) }
264
234
  end
265
- EOM
235
+ METHOD
236
+ end
237
+
238
+ def to_ruby_method(allow_bangs: true)
239
+ with_new_instance { |id| id.to_ruby_method!(allow_bangs: allow_bangs) }
266
240
  end
267
241
 
268
242
  def to_identifier
@@ -271,7 +245,7 @@ module Babosa
271
245
 
272
246
  # The default options for {#normalize!}. Override to set your own defaults.
273
247
  def default_normalize_options
274
- {:transliterate => true, :max_length => 255, :separator => "-"}
248
+ {transliterate: :latin, max_length: 255, separator: "-"}
275
249
  end
276
250
 
277
251
  alias approximate_ascii transliterate
@@ -282,12 +256,13 @@ module Babosa
282
256
 
283
257
  private
284
258
 
285
- # Used as the basis of the bangless methods.
286
- def send_to_new_instance(*args)
287
- id = Identifier.allocate
288
- id.instance_variable_set :@wrapped_string, to_s
289
- id.send(*args)
290
- id
259
+ # Used as the basis of the non-mutating (bangless) methods.
260
+ def with_new_instance
261
+ Identifier.allocate.tap do |id|
262
+ id.instance_variable_set :@wrapped_string, to_s
263
+
264
+ yield id
265
+ end
291
266
  end
292
267
  end
293
268
  end