babosa 1.0.4 → 2.0.0.beta

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. checksums.yaml +4 -4
  2. data/Changelog.md +12 -0
  3. data/README.md +80 -117
  4. data/Rakefile +9 -8
  5. data/lib/babosa.rb +2 -4
  6. data/lib/babosa/identifier.rb +82 -121
  7. data/lib/babosa/transliterator/base.rb +57 -56
  8. data/lib/babosa/transliterator/bulgarian.rb +3 -2
  9. data/lib/babosa/transliterator/cyrillic.rb +5 -5
  10. data/lib/babosa/transliterator/danish.rb +3 -3
  11. data/lib/babosa/transliterator/german.rb +3 -2
  12. data/lib/babosa/transliterator/greek.rb +4 -3
  13. data/lib/babosa/transliterator/hindi.rb +3 -2
  14. data/lib/babosa/transliterator/latin.rb +5 -5
  15. data/lib/babosa/transliterator/macedonian.rb +3 -2
  16. data/lib/babosa/transliterator/norwegian.rb +3 -3
  17. data/lib/babosa/transliterator/romanian.rb +3 -2
  18. data/lib/babosa/transliterator/russian.rb +3 -2
  19. data/lib/babosa/transliterator/serbian.rb +29 -27
  20. data/lib/babosa/transliterator/spanish.rb +2 -2
  21. data/lib/babosa/transliterator/swedish.rb +3 -3
  22. data/lib/babosa/transliterator/turkish.rb +8 -8
  23. data/lib/babosa/transliterator/ukrainian.rb +5 -4
  24. data/lib/babosa/transliterator/vietnamese.rb +4 -3
  25. data/lib/babosa/version.rb +3 -1
  26. data/spec/{babosa_spec.rb → identifier_spec.rb} +9 -10
  27. data/spec/spec_helper.rb +6 -6
  28. data/spec/transliterators/base_spec.rb +5 -6
  29. data/spec/transliterators/bulgarian_spec.rb +4 -5
  30. data/spec/transliterators/danish_spec.rb +5 -6
  31. data/spec/transliterators/german_spec.rb +4 -5
  32. data/spec/transliterators/greek_spec.rb +7 -7
  33. data/spec/transliterators/hindi_spec.rb +7 -7
  34. data/spec/transliterators/latin_spec.rb +3 -4
  35. data/spec/transliterators/macedonian_spec.rb +3 -4
  36. data/spec/transliterators/norwegian_spec.rb +4 -4
  37. data/spec/transliterators/polish_spec.rb +3 -5
  38. data/spec/transliterators/romanian_spec.rb +5 -6
  39. data/spec/transliterators/russian_spec.rb +3 -4
  40. data/spec/transliterators/serbian_spec.rb +6 -7
  41. data/spec/transliterators/spanish_spec.rb +4 -5
  42. data/spec/transliterators/swedish_spec.rb +7 -7
  43. data/spec/transliterators/turkish_spec.rb +24 -24
  44. data/spec/transliterators/ukrainian_spec.rb +74 -75
  45. data/spec/transliterators/vietnamese_spec.rb +10 -10
  46. metadata +17 -38
  47. data/lib/babosa/utf8/active_support_proxy.rb +0 -38
  48. data/lib/babosa/utf8/dumb_proxy.rb +0 -49
  49. data/lib/babosa/utf8/java_proxy.rb +0 -22
  50. data/lib/babosa/utf8/mappings.rb +0 -193
  51. data/lib/babosa/utf8/proxy.rb +0 -125
  52. data/lib/babosa/utf8/unicode_proxy.rb +0 -23
  53. data/spec/utf8_proxy_spec.rb +0 -52
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 7878565d1bfb436b7110d42e81dff1eb589f86e10e0919ded4b2de695784fae3
4
- data.tar.gz: f6f3e7cc2b4876a940ec66fa1895f9cd3390526c62cebc888110494b72d77fe5
3
+ metadata.gz: ef1346a05f1b3a1104af8a095b7829eaa853427927335c7905d41d5ab47b2e9c
4
+ data.tar.gz: 54e543a250c7eff0c9613bea3f22d2e79317707f60cbb364e6e68979237c4c53
5
5
  SHA512:
6
- metadata.gz: 6ea0ff964d688d9ca29710da13207c12d49509c13d72f8198a33d38141f7f6ad39b5792f5937a94a44b3976dcd68b24233c11b3418dc9d100a170caa799404ed
7
- data.tar.gz: 1b37fd01a1907f244e112171da5dd942105181b860e6dc05ede3e1798bfdf1e50e8543b69757109dc9bd7dfddecb6ba2fbec2b59f38843223052c559f085490e
6
+ metadata.gz: d4f6732579088cda9d4514b4d1dddd32f2cd933f2818ed4c74bf0f00168ccc3b20e6ce220bc1c473a0453d29ef50de256689ed17e1f809b42dcb8956377c0e76
7
+ data.tar.gz: 797887db1d626a92b28249883f2dcc55e3b92d9f423aa23052ba74cc45856a15ace2b6c823c983b6e6baa0907cfc3d467b9a52e05cd9de9d9fcf37c779bd0647
@@ -1,5 +1,17 @@
1
1
  # Babosa Changelog
2
2
 
3
+ ## 2.0.0
4
+
5
+ This release contains no important changes. I had a week off from work and
6
+ decided to refactor the code. However there are some small breaking changes so
7
+ I have released it as 2.0.0.
8
+
9
+ * Refactor internals for simplicity
10
+ * Use built-in Ruby UTF-8 support in place of other gems.
11
+ * Drop support for Ruby < 2.5.0.
12
+ * `Babosa::Identifier#word_chars` no longer removes dashes
13
+ * `Babosa::Identifier#to_ruby_method` default argument `allow_bangs` is now a keyword argument
14
+
3
15
  ## 1.0.4
4
16
 
5
17
  * Fix nil being cast to frozen string (https://github.com/norman/babosa/pull/52)
data/README.md CHANGED
@@ -15,12 +15,16 @@ FriendlyId.
15
15
 
16
16
  ### Transliterate UTF-8 characters to ASCII
17
17
 
18
- "Gölcük, Turkey".to_slug.transliterate.to_s #=> "Golcuk, Turkey"
18
+ ```ruby
19
+ "Gölcük, Turkey".to_slug.transliterate.to_s #=> "Golcuk, Turkey"
20
+ ```
19
21
 
20
22
  ### Locale sensitive transliteration, with support for many languages
21
23
 
22
- "Jürgen Müller".to_slug.transliterate.to_s #=> "Jurgen Muller"
23
- "Jürgen Müller".to_slug.transliterate(:german).to_s #=> "Juergen Mueller"
24
+ ```ruby
25
+ "Jürgen Müller".to_slug.transliterate.to_s #=> "Jurgen Muller"
26
+ "Jürgen Müller".to_slug.transliterate(:german).to_s #=> "Juergen Mueller"
27
+ ```
24
28
 
25
29
  Currently supported languages include:
26
30
 
@@ -28,6 +32,7 @@ Currently supported languages include:
28
32
  * Danish
29
33
  * German
30
34
  * Greek
35
+ * Hindi
31
36
  * Macedonian
32
37
  * Norwegian
33
38
  * Romanian
@@ -35,124 +40,125 @@ Currently supported languages include:
35
40
  * Serbian
36
41
  * Spanish
37
42
  * Swedish
43
+ * Turkish
38
44
  * Ukrainian
45
+ * Vietnamese
46
+
47
+ Additionally there are generic transliterators for transliterating from the
48
+ Cyrillic alphabet and Latin alphabet with diacritics. The Latin transliterator
49
+ can be used, for example, with Czech. There is also a transliterator named
50
+ "Hindi" which may be sufficient for other Indic languages using Devanagari, but
51
+ I do not know enough to say whether the transliterations would make sense.
39
52
 
40
53
  I'll gladly accept contributions from fluent speakers to support more languages.
41
54
 
42
55
  ### Strip non-ASCII characters
43
56
 
44
- "Gölcük, Turkey".to_slug.to_ascii.to_s #=> "Glck, Turkey"
57
+ ```ruby
58
+ "Gölcük, Turkey".to_slug.to_ascii.to_s #=> "Glck, Turkey"
59
+ ```
45
60
 
46
61
  ### Truncate by characters
47
62
 
48
- "üüü".to_slug.truncate(2).to_s #=> "üü"
63
+ ```ruby
64
+ "üüü".to_slug.truncate(2).to_s #=> "üü"
65
+ ```
49
66
 
50
67
  ### Truncate by bytes
51
68
 
52
69
  This can be useful to ensure the generated slug will fit in a database column
53
70
  whose length is limited by bytes rather than UTF-8 characters.
54
71
 
55
- "üüü".to_slug.truncate_bytes(2).to_s #=> "ü"
72
+ ```ruby
73
+ "üüü".to_slug.truncate_bytes(2).to_s #=> "ü"
74
+ ```
56
75
 
57
76
  ### Remove punctuation chars
58
77
 
59
- "this is, um, **really** cool, huh?".to_slug.word_chars.to_s #=> "this is um really cool huh"
78
+ ```ruby
79
+ "this is, um, **really** cool, huh?".to_slug.word_chars.to_s #=> "this is um really cool huh"
80
+ ```
60
81
 
61
82
  ### All-in-one
62
83
 
63
- "Gölcük, Turkey".to_slug.normalize.to_s #=> "golcuk-turkey"
84
+ ```ruby
85
+ "Gölcük, Turkey".to_slug.normalize.to_s #=> "golcuk-turkey"
86
+ ```
64
87
 
65
88
  ### Other stuff
66
89
 
67
- #### Using Babosa With FriendlyId 4
68
-
69
- require "babosa"
90
+ #### Using Babosa With FriendlyId 4+
70
91
 
71
- class Person < ActiveRecord::Base
72
- friendly_id :name, use: :slugged
73
-
74
- def normalize_friendly_id(input)
75
- input.to_s.to_slug.normalize(transliterations: :russian).to_s
76
- end
77
- end
92
+ ```ruby
93
+ require "babosa"
78
94
 
79
- #### Pedantic UTF-8 support
95
+ class Person < ActiveRecord::Base
96
+ friendly_id :name, use: :slugged
80
97
 
81
- Babosa goes out of its way to handle [nasty Unicode issues you might never think
82
- you would have](https://github.com/norman/enc/blob/master/equivalence.rb) by
83
- checking, sanitizing and normalizing your string input.
98
+ def normalize_friendly_id(input)
99
+ input.to_s.to_slug.normalize(transliterations: :russian).to_s
100
+ end
101
+ end
102
+ ```
84
103
 
85
- It will automatically use whatever Unicode library you have loaded before
86
- Babosa, or fall back to a simple built-in library. Supported
87
- Unicode libraries include:
104
+ #### UTF-8 support
88
105
 
89
- * Java (only on JRuby of course)
90
- * Active Support
91
- * [Unicode](https://github.com/blackwinter/unicode)
92
- * Built-in
93
-
94
- This built-in module is much faster than Active Support but much slower than
95
- Java or Unicode. It can only do **very** naive Unicode composition to ensure
96
- that, for example, "é" will always be composed to a single codepoint rather than
97
- an "e" and a "´" - making it safe to use as a hash key.
98
-
99
- But seriously - save yourself the headache and install a real Unicode library.
100
- If you are using Babosa with a language that uses the Cyrillic alphabet, Babosa
101
- requires either Unicode, Active Support or Java.
106
+ Babosa normalizes all input strings [to NFC](https://en.wikipedia.org/wiki/Unicode_equivalence#Normal_forms).
102
107
 
103
108
  #### Ruby Method Names
104
109
 
105
- Babosa can also generate strings for Ruby method names. (Yes, Ruby 1.9 can use
110
+ Babosa can generate strings for Ruby method names. (Yes, Ruby 1.9+ can use
106
111
  UTF-8 chars in method names, but you may not want to):
107
112
 
108
113
 
109
- "this is a method".to_slug.to_ruby_method! #=> this_is_a_method
110
- "über cool stuff!".to_slug.to_ruby_method! #=> uber_cool_stuff!
114
+ ```ruby
115
+ "this is a method".to_slug.to_ruby_method! #=> this_is_a_method
116
+ "über cool stuff!".to_slug.to_ruby_method! #=> uber_cool_stuff!
111
117
 
112
- # You can also disallow trailing punctuation chars
113
- "über cool stuff!".to_slug.to_ruby_method(false) #=> uber_cool_stuff
118
+ # You can also disallow trailing punctuation chars
119
+ "über cool stuff!".to_slug.to_ruby_method(allow_bangs: false) #=> uber_cool_stuff
120
+ ```
114
121
 
115
122
  #### Easy to Extend
116
123
 
117
124
  You can add custom transliterators for your language with very little code. For
118
125
  example here's the transliterator for German:
119
126
 
120
- # encoding: utf-8
121
- module Babosa
122
- module Transliterator
123
- class German < Latin
124
- APPROXIMATIONS = {
125
- "ä" => "ae",
126
- "ö" => "oe",
127
- "ü" => "ue",
128
- "Ä" => "Ae",
129
- "Ö" => "Oe",
130
- "Ü" => "Ue"
131
- }
132
- end
133
- end
127
+ ```ruby
128
+ module Babosa
129
+ module Transliterator
130
+ class German < Latin
131
+ APPROXIMATIONS = {
132
+ "ä" => "ae",
133
+ "ö" => "oe",
134
+ "ü" => "ue",
135
+ "Ä" => "Ae",
136
+ "Ö" => "Oe",
137
+ "Ü" => "Ue"
138
+ }
134
139
  end
140
+ end
141
+ end
142
+ ```
135
143
 
136
144
  And a spec (you can use this as a template):
137
145
 
138
- # encoding: utf-8
139
- require File.expand_path("../../spec_helper", __FILE__)
140
-
141
- describe Babosa::Transliterator::German do
146
+ ```ruby
147
+ require "spec_helper"
142
148
 
143
- let(:t) { described_class.instance }
144
- it_behaves_like "a latin transliterator"
149
+ describe Babosa::Transliterator::German do
150
+ let(:t) { described_class.instance }
151
+ it_behaves_like "a latin transliterator"
145
152
 
146
- it "should transliterate Eszett" do
147
- t.transliterate("ß").should eql("ss")
148
- end
149
-
150
- it "should transliterate vowels with umlauts" do
151
- t.transliterate("üöä").should eql("ueoeae")
152
- end
153
-
154
- end
153
+ it "should transliterate Eszett" do
154
+ t.transliterate("ß").should eql("ss")
155
+ end
155
156
 
157
+ it "should transliterate vowels with umlauts" do
158
+ t.transliterate("üöä").should eql("ueoeae")
159
+ end
160
+ end
161
+ ```
156
162
 
157
163
  ### Rails 3.x and higher
158
164
 
@@ -167,46 +173,6 @@ and
167
173
  [parameterize](http://api.rubyonrails.org/classes/ActiveSupport/Inflector.html#method-i-parameterize)
168
174
  to see if they suit your needs.
169
175
 
170
- ### Babosa vs. Stringex
171
-
172
- Babosa provides much of the functionality provided by the
173
- [Stringex](https://github.com/rsl/stringex) gem, but in the subjective opinion
174
- of the author, is for most use cases a better choice.
175
-
176
- #### Fewer Features
177
-
178
- Stringex offers functionality for storing slugs in an Active Record model, like
179
- a simple version of [FriendlyId](http://github.com/norman/friendly_id), in
180
- addition to string processing. Babosa only does string processing.
181
-
182
- #### Less Aggressive Unicode Transliteration
183
-
184
- Stringex uses an agressive Unicode to ASCII mapping which outputs gibberish for
185
- almost anything but Western European langages and Mandarin Chinese. Babosa
186
- supports only languages for which fluent speakers have provided
187
- transliterations, to ensure that the output makes sense to users.
188
-
189
- #### Unicode Support
190
-
191
- Stringex does no Unicode normalization or validation before transliterating
192
- strings, so if you pass in strings with encoding errors or with different
193
- Unicode normalizations, you'll get unpredictable results.
194
-
195
- #### No Locale Assumptions
196
-
197
- Babosa avoids making assumptions about locales like Stringex does, so it doesn't
198
- offer transliterations like this out of the box:
199
-
200
- "$12 worth of Ruby power".to_url => "12-dollars-worth-of-ruby-power"
201
-
202
- This is because the symbol "$" is used in many Latin American countries for the
203
- peso. Stringex does this in many places, for example, transliterating all Han
204
- characters into Pinyin, effectively treating Japanese text as if it were
205
- Mandarin Chinese.
206
-
207
-
208
- ### More info
209
-
210
176
  Please see the [API docs](http://rubydoc.info/github/norman/babosa/master/frames) and source code for
211
177
  more info.
212
178
 
@@ -218,9 +184,6 @@ Babosa can be installed via Rubygems:
218
184
 
219
185
  You can get the source code from its [Github repository](http://github.com/norman/babosa).
220
186
 
221
- Babosa is tested to be compatible with Ruby 2.x, JRuby 1.7+, and
222
- Rubinius 2.x It's probably compatible with other Rubies as well.
223
-
224
187
  ## Reporting bugs
225
188
 
226
189
  Please use Babosa's [Github issue
@@ -229,7 +192,7 @@ tracker](http://github.com/norman/babosa/issues).
229
192
 
230
193
  ## Misc
231
194
 
232
- "Babosa" means slug in Spanish.
195
+ "Babosa" means "slug" in Spanish.
233
196
 
234
197
  ## Author
235
198
 
@@ -258,7 +221,7 @@ Many thanks to the following people for their help:
258
221
 
259
222
  ## Copyright
260
223
 
261
- Copyright (c) 2010-2013 Norman Clarke
224
+ Copyright (c) 2010-2020 Norman Clarke
262
225
 
263
226
  Permission is hereby granted, free of charge, to any person obtaining a copy of
264
227
  this software and associated documentation files (the "Software"), to deal in
data/Rakefile CHANGED
@@ -1,10 +1,13 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require "rubygems"
2
4
  require "rake/testtask"
3
5
  require "rake/clean"
4
6
  require "rubygems/package_task"
7
+ require "rubocop/rake_task"
5
8
 
6
- task :default => :spec
7
- task :test => :spec
9
+ task default: [:rubocop, :spec]
10
+ task test: :spec
8
11
 
9
12
  CLEAN << "pkg" << "doc" << "coverage" << ".yardoc"
10
13
 
@@ -14,6 +17,7 @@ begin
14
17
  t.options = ["--output-dir=doc"]
15
18
  end
16
19
  rescue LoadError
20
+ puts "Yard not present"
17
21
  end
18
22
 
19
23
  begin
@@ -23,12 +27,9 @@ begin
23
27
  Rake::Task["spec"].execute
24
28
  end
25
29
  rescue LoadError
30
+ puts "SimpleCov not present"
26
31
  end
27
32
 
28
- gemspec = File.expand_path("../babosa.gemspec", __FILE__)
29
- if File.exist? gemspec
30
- Gem::PackageTask.new(eval(File.read(gemspec))) { |pkg| }
31
- end
32
-
33
- require 'rspec/core/rake_task'
33
+ require "rspec/core/rake_task"
34
34
  RSpec::Core::RakeTask.new(:spec)
35
+ RuboCop::RakeTask.new
@@ -1,7 +1,6 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Babosa
2
- def self.jruby15?
3
- JRUBY_VERSION >= "1.5" rescue false
4
- end
5
4
  end
6
5
 
7
6
  class String
@@ -12,5 +11,4 @@ class String
12
11
  end
13
12
 
14
13
  require "babosa/transliterator/base"
15
- require "babosa/utf8/proxy"
16
14
  require "babosa/identifier"
@@ -1,16 +1,6 @@
1
- # encoding: utf-8
2
- module Babosa
3
-
4
- # Codepoints for characters that will be deleted by +#word_chars!+.
5
- STRIPPABLE = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 14, 15, 16, 17, 18, 19,
6
- 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 33, 34, 35, 36, 37, 38, 39,
7
- 40, 41, 42, 43, 44, 45, 46, 47, 58, 59, 60, 61, 62, 63, 64, 91, 92, 93, 94,
8
- 96, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136,
9
- 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151,
10
- 152, 153, 154, 155, 156, 157, 158, 159, 161, 162, 163, 164, 165, 166, 167,
11
- 168, 169, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 182, 183, 184,
12
- 185, 187, 188, 189, 190, 191, 215, 247, 8203, 8204, 8205, 8239, 65279]
1
+ # frozen_string_literal: true
13
2
 
3
+ module Babosa
14
4
  # This class provides some string-manipulation methods specific to slugs.
15
5
  #
16
6
  # Note that this class includes many "bang methods" such as {#clean!} and
@@ -20,7 +10,7 @@ module Babosa
20
10
  # it is generated dynamically.
21
11
  #
22
12
  # All of the bang methods return an instance of String, while the bangless
23
- # versions return an instance of Babosa::Identifier, so that calls to methods
13
+ # versions return an instance of {Babosa::Identifier}, so that calls to methods
24
14
  # specific to this class can be chained:
25
15
  #
26
16
  # string = Identifier.new("hello world")
@@ -29,71 +19,47 @@ module Babosa
29
19
  #
30
20
  # @see http://www.utf8-chartable.de/unicode-utf8-table.pl?utf8=dec Unicode character table
31
21
  class Identifier
32
-
33
22
  Error = Class.new(StandardError)
34
23
 
35
24
  attr_reader :wrapped_string
36
25
  alias to_s wrapped_string
37
26
 
38
- @@utf8_proxy = if Babosa.jruby15?
39
- UTF8::JavaProxy
40
- elsif defined? Unicode::VERSION
41
- UTF8::UnicodeProxy
42
- elsif defined? ActiveSupport
43
- UTF8::ActiveSupportProxy
44
- else
45
- UTF8::DumbProxy
46
- end
47
-
48
- # Return the proxy used for UTF-8 support.
49
- # @see Babosa::UTF8::Proxy
50
- def self.utf8_proxy
51
- @@utf8_proxy
52
- end
53
-
54
- # Set a proxy object used for UTF-8 support.
55
- # @see Babosa::UTF8::Proxy
56
- def self.utf8_proxy=(obj)
57
- @@utf8_proxy = obj
58
- end
59
-
60
27
  def method_missing(symbol, *args, &block)
61
28
  @wrapped_string.__send__(symbol, *args, &block)
62
29
  end
63
30
 
31
+ def respond_to_missing?(name, include_all)
32
+ @wrapped_string.respond_to?(name, include_all)
33
+ end
34
+
64
35
  # @param string [#to_s] The string to use as the basis of the Identifier.
65
36
  def initialize(string)
66
- @wrapped_string = string.to_s
37
+ @wrapped_string = string.to_s.dup
67
38
  tidy_bytes!
68
39
  normalize_utf8!
69
40
  end
70
41
 
71
- def ==(value)
72
- @wrapped_string.to_s == value.to_s
73
- end
74
-
75
- def eql?(value)
76
- @wrapped_string == value
42
+ def ==(other)
43
+ to_s == other.to_s
77
44
  end
78
45
 
79
- def empty?
80
- # included to make this class :respond_to? :empty for compatibility with Active Support's
81
- # #blank?
82
- @wrapped_string.empty?
46
+ def eql?(other)
47
+ self == other
83
48
  end
84
49
 
85
- # Approximate an ASCII string. This works only for Western strings using
86
- # characters that are Roman-alphabet characters + diacritics. Non-letter
87
- # characters are left unmodified.
50
+ # Approximate an ASCII string. This works only for strings using characters
51
+ # that are Roman-alphabet characters + diacritics. Non-letter characters
52
+ # are left unmodified.
88
53
  #
89
- # string = Identifier.new "Łódź
54
+ # string = Identifier.new "Łódź, Poland"
90
55
  # string.transliterate # => "Lodz, Poland"
91
56
  # string = Identifier.new "日本"
92
57
  # string.transliterate # => "日本"
93
58
  #
94
- # You can pass any key(s) from +Characters.approximations+ as arguments. This allows
95
- # for contextual approximations. Various languages are supported, you can see which ones
96
- # by looking at the source of {Babosa::Transliterator::Base}.
59
+ # You can pass the names of any transliterator class as arguments. This
60
+ # allows for contextual approximations. Various languages are supported,
61
+ # you can see which ones by looking at the source of
62
+ # {Babosa::Transliterator::Base}.
97
63
  #
98
64
  # string = Identifier.new "Jürgen Müller"
99
65
  # string.transliterate # => "Jurgen Muller"
@@ -111,7 +77,7 @@ module Babosa
111
77
  # to remove non-ASCII characters such as "¡" and "¿", use {#to_ascii!}:
112
78
  #
113
79
  # string.transliterate!(:spanish) # => "¡Feliz anio!"
114
- # string.transliterate! # => "¡Feliz anio!"
80
+ # string.to_ascii! # => "Feliz anio!"
115
81
  #
116
82
  # @param *args <Symbol>
117
83
  # @return String
@@ -122,40 +88,50 @@ module Babosa
122
88
  transliterator = Transliterator.get(kind).instance
123
89
  @wrapped_string = transliterator.transliterate(@wrapped_string)
124
90
  end
125
- @wrapped_string
91
+ to_s
126
92
  end
127
93
 
128
94
  # Converts dashes to spaces, removes leading and trailing spaces, and
129
95
  # replaces multiple whitespace characters with a single space.
96
+ #
130
97
  # @return String
131
98
  def clean!
132
- @wrapped_string = @wrapped_string.gsub("-", " ").squeeze(" ").strip
99
+ gsub!(/[- ]+/, " ")
100
+ strip!
101
+ to_s
133
102
  end
134
103
 
135
104
  # Remove any non-word characters. For this library's purposes, this means
136
- # anything other than letters, numbers, spaces, newlines and linefeeds.
105
+ # anything other than letters, numbers, spaces, underscores, dashes,
106
+ # newlines, and linefeeds.
107
+ #
137
108
  # @return String
138
109
  def word_chars!
139
- @wrapped_string = (unpack("U*") - Babosa::STRIPPABLE).pack("U*")
110
+ # `^\p{letter}` = Any character that is not a Unicode letter
111
+ # `&&` = intersect with the following character class
112
+ # `[^ _\-\n\r]` = Anything other than space, underscore, dash, newline or carriage return
113
+ gsub!(/[[^\p{letter}]&&[^ _\-\n\r]]/, "")
114
+ to_s
140
115
  end
141
116
 
142
117
  # Normalize the string for use as a URL slug. Note that in this context,
143
118
  # +normalize+ means, strip, remove non-letters/numbers, downcasing,
144
119
  # truncating to 255 bytes and converting whitespace to dashes.
145
- # @param Options
120
+ #
121
+ # @param options [Hash]
146
122
  # @return String
147
- def normalize!(options = nil)
148
- options = default_normalize_options.merge(options || {})
123
+ def normalize!(options = {})
124
+ options = default_normalize_options.merge(options)
149
125
 
150
- if translit_option = options[:transliterate]
151
- if translit_option != true
152
- transliterate!(*translit_option)
126
+ if options[:transliterate]
127
+ option = options[:transliterate]
128
+ if option != true
129
+ transliterate!(*option)
153
130
  else
154
131
  transliterate!(*options[:transliterations])
155
132
  end
156
133
  end
157
134
  to_ascii! if options[:to_ascii]
158
- clean!
159
135
  word_chars!
160
136
  clean!
161
137
  downcase!
@@ -164,105 +140,90 @@ module Babosa
164
140
  end
165
141
 
166
142
  # Normalize a string so that it can safely be used as a Ruby method name.
167
- def to_ruby_method!(allow_bangs = true)
168
- leader, trailer = @wrapped_string.strip.scan(/\A(.+)(.)\z/).flatten
169
- leader = leader.to_s.dup
170
- trailer = trailer.to_s.dup
171
- if allow_bangs
172
- trailer.downcase!
173
- trailer.gsub!(/[^a-z0-9!=\\?]/, '')
174
- else
175
- trailer.downcase!
176
- trailer.gsub!(/[^a-z0-9]/, '')
177
- end
178
- id = leader.to_identifier
179
- id.transliterate!
180
- id.to_ascii!
181
- id.clean!
182
- id.word_chars!
183
- id.clean!
184
- @wrapped_string = id.to_s + trailer
185
- if @wrapped_string == ""
186
- raise Error, "Input generates impossible Ruby method name"
187
- end
143
+ #
144
+ # @param allow_bangs [Boolean]
145
+ # @return String
146
+ def to_ruby_method!(allow_bangs: true)
147
+ last_char = self[-1]
148
+ transliterate!
149
+ to_ascii!
150
+ word_chars!
151
+ clean!
152
+ @wrapped_string += last_char if allow_bangs && ["!", "?"].include?(last_char)
153
+ raise Error, "Input generates impossible Ruby method name" if self == ""
154
+
188
155
  with_separators!("_")
189
156
  end
190
157
 
191
158
  # Delete any non-ascii characters.
159
+ #
192
160
  # @return String
193
161
  def to_ascii!
194
- @wrapped_string = @wrapped_string.gsub(/[^\x00-\x7f]/u, '')
162
+ gsub!(/[^\x00-\x7f]/u, "")
163
+ to_s
195
164
  end
196
165
 
197
166
  # Truncate the string to +max+ characters.
167
+ #
198
168
  # @example
199
169
  # "üéøá".to_identifier.truncate(3) #=> "üéø"
170
+ #
171
+ # @param max [Integer] The maximum number of characters.
200
172
  # @return String
201
173
  def truncate!(max)
202
- @wrapped_string = unpack("U*")[0...max].pack("U*")
174
+ @wrapped_string = slice(0, max)
203
175
  end
204
176
 
205
177
  # Truncate the string to +max+ bytes. This can be useful for ensuring that
206
178
  # a UTF-8 string will always fit into a database column with a certain max
207
179
  # byte length. The resulting string may be less than +max+ if the string must
208
180
  # be truncated at a multibyte character boundary.
181
+ #
209
182
  # @example
210
183
  # "üéøá".to_identifier.truncate_bytes(3) #=> "ü"
184
+ #
185
+ # @param max [Integer] The maximum number of bytes.
211
186
  # @return String
212
187
  def truncate_bytes!(max)
213
- return @wrapped_string if @wrapped_string.bytesize <= max
214
- curr = 0
215
- new = []
216
- unpack("U*").each do |char|
217
- break if curr > max
218
- char = [char].pack("U")
219
- curr += char.bytesize
220
- if curr <= max
221
- new << char
222
- end
223
- end
224
- @wrapped_string = new.join
188
+ truncate!(max)
189
+ chop! until bytesize <= max
225
190
  end
226
191
 
227
192
  # Replaces whitespace with dashes ("-").
193
+ #
194
+ # @param char [String] the separator character to use.
228
195
  # @return String
229
196
  def with_separators!(char = "-")
230
- @wrapped_string = @wrapped_string.gsub(/\s/u, char)
231
- end
232
-
233
- # Perform UTF-8 sensitive upcasing.
234
- # @return String
235
- def upcase!
236
- @wrapped_string = @@utf8_proxy.upcase(@wrapped_string)
237
- end
238
-
239
- # Perform UTF-8 sensitive downcasing.
240
- # @return String
241
- def downcase!
242
- @wrapped_string = @@utf8_proxy.downcase(@wrapped_string)
197
+ gsub!(/\s/u, char)
198
+ to_s
243
199
  end
244
200
 
245
201
  # Perform Unicode composition on the wrapped string.
202
+ #
246
203
  # @return String
247
204
  def normalize_utf8!
248
- @wrapped_string = @@utf8_proxy.normalize_utf8(@wrapped_string)
205
+ unicode_normalize!(:nfc)
206
+ to_s
249
207
  end
250
208
 
251
209
  # Attempt to convert characters encoded using CP1252 and ISO-8859-1 to
252
210
  # UTF-8.
253
211
  # @return String
254
212
  def tidy_bytes!
255
- @wrapped_string = @@utf8_proxy.tidy_bytes(@wrapped_string)
213
+ scrub! do |bad|
214
+ bad.encode(Encoding::UTF_8, Encoding::Windows_1252, invalid: :replace, undef: :replace)
215
+ end
216
+ to_s
256
217
  end
257
218
 
258
219
  %w[transliterate clean downcase word_chars normalize normalize_utf8
259
- tidy_bytes to_ascii to_ruby_method truncate truncate_bytes upcase
260
- with_separators].each do |method|
261
- class_eval(<<-EOM, __FILE__, __LINE__ + 1)
220
+ tidy_bytes to_ascii to_ruby_method truncate truncate_bytes upcase
221
+ with_separators].each do |method|
222
+ class_eval(<<-METHOD, __FILE__, __LINE__ + 1)
262
223
  def #{method}(*args)
263
224
  send_to_new_instance(:#{method}!, *args)
264
225
  end
265
- EOM
226
+ METHOD
266
227
  end
267
228
 
268
229
  def to_identifier
@@ -271,7 +232,7 @@ module Babosa
271
232
 
272
233
  # The default options for {#normalize!}. Override to set your own defaults.
273
234
  def default_normalize_options
274
- {:transliterate => true, :max_length => 255, :separator => "-"}
235
+ {transliterate: :latin, max_length: 255, separator: "-"}
275
236
  end
276
237
 
277
238
  alias approximate_ascii transliterate