babosa 1.0.4 → 2.0.0.beta

Sign up to get free protection for your applications and to get access to all the features.
Files changed (53) hide show
  1. checksums.yaml +4 -4
  2. data/Changelog.md +12 -0
  3. data/README.md +80 -117
  4. data/Rakefile +9 -8
  5. data/lib/babosa.rb +2 -4
  6. data/lib/babosa/identifier.rb +82 -121
  7. data/lib/babosa/transliterator/base.rb +57 -56
  8. data/lib/babosa/transliterator/bulgarian.rb +3 -2
  9. data/lib/babosa/transliterator/cyrillic.rb +5 -5
  10. data/lib/babosa/transliterator/danish.rb +3 -3
  11. data/lib/babosa/transliterator/german.rb +3 -2
  12. data/lib/babosa/transliterator/greek.rb +4 -3
  13. data/lib/babosa/transliterator/hindi.rb +3 -2
  14. data/lib/babosa/transliterator/latin.rb +5 -5
  15. data/lib/babosa/transliterator/macedonian.rb +3 -2
  16. data/lib/babosa/transliterator/norwegian.rb +3 -3
  17. data/lib/babosa/transliterator/romanian.rb +3 -2
  18. data/lib/babosa/transliterator/russian.rb +3 -2
  19. data/lib/babosa/transliterator/serbian.rb +29 -27
  20. data/lib/babosa/transliterator/spanish.rb +2 -2
  21. data/lib/babosa/transliterator/swedish.rb +3 -3
  22. data/lib/babosa/transliterator/turkish.rb +8 -8
  23. data/lib/babosa/transliterator/ukrainian.rb +5 -4
  24. data/lib/babosa/transliterator/vietnamese.rb +4 -3
  25. data/lib/babosa/version.rb +3 -1
  26. data/spec/{babosa_spec.rb → identifier_spec.rb} +9 -10
  27. data/spec/spec_helper.rb +6 -6
  28. data/spec/transliterators/base_spec.rb +5 -6
  29. data/spec/transliterators/bulgarian_spec.rb +4 -5
  30. data/spec/transliterators/danish_spec.rb +5 -6
  31. data/spec/transliterators/german_spec.rb +4 -5
  32. data/spec/transliterators/greek_spec.rb +7 -7
  33. data/spec/transliterators/hindi_spec.rb +7 -7
  34. data/spec/transliterators/latin_spec.rb +3 -4
  35. data/spec/transliterators/macedonian_spec.rb +3 -4
  36. data/spec/transliterators/norwegian_spec.rb +4 -4
  37. data/spec/transliterators/polish_spec.rb +3 -5
  38. data/spec/transliterators/romanian_spec.rb +5 -6
  39. data/spec/transliterators/russian_spec.rb +3 -4
  40. data/spec/transliterators/serbian_spec.rb +6 -7
  41. data/spec/transliterators/spanish_spec.rb +4 -5
  42. data/spec/transliterators/swedish_spec.rb +7 -7
  43. data/spec/transliterators/turkish_spec.rb +24 -24
  44. data/spec/transliterators/ukrainian_spec.rb +74 -75
  45. data/spec/transliterators/vietnamese_spec.rb +10 -10
  46. metadata +17 -38
  47. data/lib/babosa/utf8/active_support_proxy.rb +0 -38
  48. data/lib/babosa/utf8/dumb_proxy.rb +0 -49
  49. data/lib/babosa/utf8/java_proxy.rb +0 -22
  50. data/lib/babosa/utf8/mappings.rb +0 -193
  51. data/lib/babosa/utf8/proxy.rb +0 -125
  52. data/lib/babosa/utf8/unicode_proxy.rb +0 -23
  53. data/spec/utf8_proxy_spec.rb +0 -52
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 7878565d1bfb436b7110d42e81dff1eb589f86e10e0919ded4b2de695784fae3
4
- data.tar.gz: f6f3e7cc2b4876a940ec66fa1895f9cd3390526c62cebc888110494b72d77fe5
3
+ metadata.gz: ef1346a05f1b3a1104af8a095b7829eaa853427927335c7905d41d5ab47b2e9c
4
+ data.tar.gz: 54e543a250c7eff0c9613bea3f22d2e79317707f60cbb364e6e68979237c4c53
5
5
  SHA512:
6
- metadata.gz: 6ea0ff964d688d9ca29710da13207c12d49509c13d72f8198a33d38141f7f6ad39b5792f5937a94a44b3976dcd68b24233c11b3418dc9d100a170caa799404ed
7
- data.tar.gz: 1b37fd01a1907f244e112171da5dd942105181b860e6dc05ede3e1798bfdf1e50e8543b69757109dc9bd7dfddecb6ba2fbec2b59f38843223052c559f085490e
6
+ metadata.gz: d4f6732579088cda9d4514b4d1dddd32f2cd933f2818ed4c74bf0f00168ccc3b20e6ce220bc1c473a0453d29ef50de256689ed17e1f809b42dcb8956377c0e76
7
+ data.tar.gz: 797887db1d626a92b28249883f2dcc55e3b92d9f423aa23052ba74cc45856a15ace2b6c823c983b6e6baa0907cfc3d467b9a52e05cd9de9d9fcf37c779bd0647
@@ -1,5 +1,17 @@
1
1
  # Babosa Changelog
2
2
 
3
+ ## 2.0.0
4
+
5
+ This release contains no important changes. I had a week off from work and
6
+ decided to refactor the code. However there are some small breaking changes so
7
+ I have released it as 2.0.0.
8
+
9
+ * Refactor internals for simplicity
10
+ * Use built-in Ruby UTF-8 support in place of other gems.
11
+ * Drop support for Ruby < 2.5.0.
12
+ * `Babosa::Identifier#word_chars` no longer removes dashes
13
+ * `Babosa::Identifier#to_ruby_method` default argument `allow_bangs` is now a keyword argument
14
+
3
15
  ## 1.0.4
4
16
 
5
17
  * Fix nil being cast to frozen string (https://github.com/norman/babosa/pull/52)
data/README.md CHANGED
@@ -15,12 +15,16 @@ FriendlyId.
15
15
 
16
16
  ### Transliterate UTF-8 characters to ASCII
17
17
 
18
- "Gölcük, Turkey".to_slug.transliterate.to_s #=> "Golcuk, Turkey"
18
+ ```ruby
19
+ "Gölcük, Turkey".to_slug.transliterate.to_s #=> "Golcuk, Turkey"
20
+ ```
19
21
 
20
22
  ### Locale sensitive transliteration, with support for many languages
21
23
 
22
- "Jürgen Müller".to_slug.transliterate.to_s #=> "Jurgen Muller"
23
- "Jürgen Müller".to_slug.transliterate(:german).to_s #=> "Juergen Mueller"
24
+ ```ruby
25
+ "Jürgen Müller".to_slug.transliterate.to_s #=> "Jurgen Muller"
26
+ "Jürgen Müller".to_slug.transliterate(:german).to_s #=> "Juergen Mueller"
27
+ ```
24
28
 
25
29
  Currently supported languages include:
26
30
 
@@ -28,6 +32,7 @@ Currently supported languages include:
28
32
  * Danish
29
33
  * German
30
34
  * Greek
35
+ * Hindi
31
36
  * Macedonian
32
37
  * Norwegian
33
38
  * Romanian
@@ -35,124 +40,125 @@ Currently supported languages include:
35
40
  * Serbian
36
41
  * Spanish
37
42
  * Swedish
43
+ * Turkish
38
44
  * Ukrainian
45
+ * Vietnamese
46
+
47
+ Additionally there are generic transliterators for transliterating from the
48
+ Cyrillic alphabet and Latin alphabet with diacritics. The Latin transliterator
49
+ can be used, for example, with Czech. There is also a transliterator named
50
+ "Hindi" which may be sufficient for other Indic languages using Devanagari, but
51
+ I do not know enough to say whether the transliterations would make sense.
39
52
 
40
53
  I'll gladly accept contributions from fluent speakers to support more languages.
41
54
 
42
55
  ### Strip non-ASCII characters
43
56
 
44
- "Gölcük, Turkey".to_slug.to_ascii.to_s #=> "Glck, Turkey"
57
+ ```ruby
58
+ "Gölcük, Turkey".to_slug.to_ascii.to_s #=> "Glck, Turkey"
59
+ ```
45
60
 
46
61
  ### Truncate by characters
47
62
 
48
- "üüü".to_slug.truncate(2).to_s #=> "üü"
63
+ ```ruby
64
+ "üüü".to_slug.truncate(2).to_s #=> "üü"
65
+ ```
49
66
 
50
67
  ### Truncate by bytes
51
68
 
52
69
  This can be useful to ensure the generated slug will fit in a database column
53
70
  whose length is limited by bytes rather than UTF-8 characters.
54
71
 
55
- "üüü".to_slug.truncate_bytes(2).to_s #=> "ü"
72
+ ```ruby
73
+ "üüü".to_slug.truncate_bytes(2).to_s #=> "ü"
74
+ ```
56
75
 
57
76
  ### Remove punctuation chars
58
77
 
59
- "this is, um, **really** cool, huh?".to_slug.word_chars.to_s #=> "this is um really cool huh"
78
+ ```ruby
79
+ "this is, um, **really** cool, huh?".to_slug.word_chars.to_s #=> "this is um really cool huh"
80
+ ```
60
81
 
61
82
  ### All-in-one
62
83
 
63
- "Gölcük, Turkey".to_slug.normalize.to_s #=> "golcuk-turkey"
84
+ ```ruby
85
+ "Gölcük, Turkey".to_slug.normalize.to_s #=> "golcuk-turkey"
86
+ ```
64
87
 
65
88
  ### Other stuff
66
89
 
67
- #### Using Babosa With FriendlyId 4
68
-
69
- require "babosa"
90
+ #### Using Babosa With FriendlyId 4+
70
91
 
71
- class Person < ActiveRecord::Base
72
- friendly_id :name, use: :slugged
73
-
74
- def normalize_friendly_id(input)
75
- input.to_s.to_slug.normalize(transliterations: :russian).to_s
76
- end
77
- end
92
+ ```ruby
93
+ require "babosa"
78
94
 
79
- #### Pedantic UTF-8 support
95
+ class Person < ActiveRecord::Base
96
+ friendly_id :name, use: :slugged
80
97
 
81
- Babosa goes out of its way to handle [nasty Unicode issues you might never think
82
- you would have](https://github.com/norman/enc/blob/master/equivalence.rb) by
83
- checking, sanitizing and normalizing your string input.
98
+ def normalize_friendly_id(input)
99
+ input.to_s.to_slug.normalize(transliterations: :russian).to_s
100
+ end
101
+ end
102
+ ```
84
103
 
85
- It will automatically use whatever Unicode library you have loaded before
86
- Babosa, or fall back to a simple built-in library. Supported
87
- Unicode libraries include:
104
+ #### UTF-8 support
88
105
 
89
- * Java (only on JRuby of course)
90
- * Active Support
91
- * [Unicode](https://github.com/blackwinter/unicode)
92
- * Built-in
93
-
94
- This built-in module is much faster than Active Support but much slower than
95
- Java or Unicode. It can only do **very** naive Unicode composition to ensure
96
- that, for example, "é" will always be composed to a single codepoint rather than
97
- an "e" and a "´" - making it safe to use as a hash key.
98
-
99
- But seriously - save yourself the headache and install a real Unicode library.
100
- If you are using Babosa with a language that uses the Cyrillic alphabet, Babosa
101
- requires either Unicode, Active Support or Java.
106
+ Babosa normalizes all input strings [to NFC](https://en.wikipedia.org/wiki/Unicode_equivalence#Normal_forms).
102
107
 
103
108
  #### Ruby Method Names
104
109
 
105
- Babosa can also generate strings for Ruby method names. (Yes, Ruby 1.9 can use
110
+ Babosa can generate strings for Ruby method names. (Yes, Ruby 1.9+ can use
106
111
  UTF-8 chars in method names, but you may not want to):
107
112
 
108
113
 
109
- "this is a method".to_slug.to_ruby_method! #=> this_is_a_method
110
- "über cool stuff!".to_slug.to_ruby_method! #=> uber_cool_stuff!
114
+ ```ruby
115
+ "this is a method".to_slug.to_ruby_method! #=> this_is_a_method
116
+ "über cool stuff!".to_slug.to_ruby_method! #=> uber_cool_stuff!
111
117
 
112
- # You can also disallow trailing punctuation chars
113
- "über cool stuff!".to_slug.to_ruby_method(false) #=> uber_cool_stuff
118
+ # You can also disallow trailing punctuation chars
119
+ "über cool stuff!".to_slug.to_ruby_method(allow_bangs: false) #=> uber_cool_stuff
120
+ ```
114
121
 
115
122
  #### Easy to Extend
116
123
 
117
124
  You can add custom transliterators for your language with very little code. For
118
125
  example here's the transliterator for German:
119
126
 
120
- # encoding: utf-8
121
- module Babosa
122
- module Transliterator
123
- class German < Latin
124
- APPROXIMATIONS = {
125
- "ä" => "ae",
126
- "ö" => "oe",
127
- "ü" => "ue",
128
- "Ä" => "Ae",
129
- "Ö" => "Oe",
130
- "Ü" => "Ue"
131
- }
132
- end
133
- end
127
+ ```ruby
128
+ module Babosa
129
+ module Transliterator
130
+ class German < Latin
131
+ APPROXIMATIONS = {
132
+ "ä" => "ae",
133
+ "ö" => "oe",
134
+ "ü" => "ue",
135
+ "Ä" => "Ae",
136
+ "Ö" => "Oe",
137
+ "Ü" => "Ue"
138
+ }
134
139
  end
140
+ end
141
+ end
142
+ ```
135
143
 
136
144
  And a spec (you can use this as a template):
137
145
 
138
- # encoding: utf-8
139
- require File.expand_path("../../spec_helper", __FILE__)
140
-
141
- describe Babosa::Transliterator::German do
146
+ ```ruby
147
+ require "spec_helper"
142
148
 
143
- let(:t) { described_class.instance }
144
- it_behaves_like "a latin transliterator"
149
+ describe Babosa::Transliterator::German do
150
+ let(:t) { described_class.instance }
151
+ it_behaves_like "a latin transliterator"
145
152
 
146
- it "should transliterate Eszett" do
147
- t.transliterate("ß").should eql("ss")
148
- end
149
-
150
- it "should transliterate vowels with umlauts" do
151
- t.transliterate("üöä").should eql("ueoeae")
152
- end
153
-
154
- end
153
+ it "should transliterate Eszett" do
154
+ t.transliterate("ß").should eql("ss")
155
+ end
155
156
 
157
+ it "should transliterate vowels with umlauts" do
158
+ t.transliterate("üöä").should eql("ueoeae")
159
+ end
160
+ end
161
+ ```
156
162
 
157
163
  ### Rails 3.x and higher
158
164
 
@@ -167,46 +173,6 @@ and
167
173
  [parameterize](http://api.rubyonrails.org/classes/ActiveSupport/Inflector.html#method-i-parameterize)
168
174
  to see if they suit your needs.
169
175
 
170
- ### Babosa vs. Stringex
171
-
172
- Babosa provides much of the functionality provided by the
173
- [Stringex](https://github.com/rsl/stringex) gem, but in the subjective opinion
174
- of the author, is for most use cases a better choice.
175
-
176
- #### Fewer Features
177
-
178
- Stringex offers functionality for storing slugs in an Active Record model, like
179
- a simple version of [FriendlyId](http://github.com/norman/friendly_id), in
180
- addition to string processing. Babosa only does string processing.
181
-
182
- #### Less Aggressive Unicode Transliteration
183
-
184
- Stringex uses an aggressive Unicode to ASCII mapping which outputs gibberish for
185
- almost anything but Western European languages and Mandarin Chinese. Babosa
186
- supports only languages for which fluent speakers have provided
187
- transliterations, to ensure that the output makes sense to users.
188
-
189
- #### Unicode Support
190
-
191
- Stringex does no Unicode normalization or validation before transliterating
192
- strings, so if you pass in strings with encoding errors or with different
193
- Unicode normalizations, you'll get unpredictable results.
194
-
195
- #### No Locale Assumptions
196
-
197
- Babosa avoids making assumptions about locales like Stringex does, so it doesn't
198
- offer transliterations like this out of the box:
199
-
200
- "$12 worth of Ruby power".to_url => "12-dollars-worth-of-ruby-power"
201
-
202
- This is because the symbol "$" is used in many Latin American countries for the
203
- peso. Stringex does this in many places, for example, transliterating all Han
204
- characters into Pinyin, effectively treating Japanese text as if it were
205
- Mandarin Chinese.
206
-
207
-
208
- ### More info
209
-
210
176
  Please see the [API docs](http://rubydoc.info/github/norman/babosa/master/frames) and source code for
211
177
  more info.
212
178
 
@@ -218,9 +184,6 @@ Babosa can be installed via Rubygems:
218
184
 
219
185
  You can get the source code from its [Github repository](http://github.com/norman/babosa).
220
186
 
221
- Babosa is tested to be compatible with Ruby 2.x, JRuby 1.7+, and
222
- Rubinius 2.x. It's probably compatible with other Rubies as well.
223
-
224
187
  ## Reporting bugs
225
188
 
226
189
  Please use Babosa's [Github issue
@@ -229,7 +192,7 @@ tracker](http://github.com/norman/babosa/issues).
229
192
 
230
193
  ## Misc
231
194
 
232
- "Babosa" means slug in Spanish.
195
+ "Babosa" means "slug" in Spanish.
233
196
 
234
197
  ## Author
235
198
 
@@ -258,7 +221,7 @@ Many thanks to the following people for their help:
258
221
 
259
222
  ## Copyright
260
223
 
261
- Copyright (c) 2010-2013 Norman Clarke
224
+ Copyright (c) 2010-2020 Norman Clarke
262
225
 
263
226
  Permission is hereby granted, free of charge, to any person obtaining a copy of
264
227
  this software and associated documentation files (the "Software"), to deal in
data/Rakefile CHANGED
@@ -1,10 +1,13 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require "rubygems"
2
4
  require "rake/testtask"
3
5
  require "rake/clean"
4
6
  require "rubygems/package_task"
7
+ require "rubocop/rake_task"
5
8
 
6
- task :default => :spec
7
- task :test => :spec
9
+ task default: [:rubocop, :spec]
10
+ task test: :spec
8
11
 
9
12
  CLEAN << "pkg" << "doc" << "coverage" << ".yardoc"
10
13
 
@@ -14,6 +17,7 @@ begin
14
17
  t.options = ["--output-dir=doc"]
15
18
  end
16
19
  rescue LoadError
20
+ puts "Yard not present"
17
21
  end
18
22
 
19
23
  begin
@@ -23,12 +27,9 @@ begin
23
27
  Rake::Task["spec"].execute
24
28
  end
25
29
  rescue LoadError
30
+ puts "SimpleCov not present"
26
31
  end
27
32
 
28
- gemspec = File.expand_path("../babosa.gemspec", __FILE__)
29
- if File.exist? gemspec
30
- Gem::PackageTask.new(eval(File.read(gemspec))) { |pkg| }
31
- end
32
-
33
- require 'rspec/core/rake_task'
33
+ require "rspec/core/rake_task"
34
34
  RSpec::Core::RakeTask.new(:spec)
35
+ RuboCop::RakeTask.new
@@ -1,7 +1,6 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Babosa
2
- def self.jruby15?
3
- JRUBY_VERSION >= "1.5" rescue false
4
- end
5
4
  end
6
5
 
7
6
  class String
@@ -12,5 +11,4 @@ class String
12
11
  end
13
12
 
14
13
  require "babosa/transliterator/base"
15
- require "babosa/utf8/proxy"
16
14
  require "babosa/identifier"
@@ -1,16 +1,6 @@
1
- # encoding: utf-8
2
- module Babosa
3
-
4
- # Codepoints for characters that will be deleted by +#word_chars!+.
5
- STRIPPABLE = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 14, 15, 16, 17, 18, 19,
6
- 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 33, 34, 35, 36, 37, 38, 39,
7
- 40, 41, 42, 43, 44, 45, 46, 47, 58, 59, 60, 61, 62, 63, 64, 91, 92, 93, 94,
8
- 96, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136,
9
- 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151,
10
- 152, 153, 154, 155, 156, 157, 158, 159, 161, 162, 163, 164, 165, 166, 167,
11
- 168, 169, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 182, 183, 184,
12
- 185, 187, 188, 189, 190, 191, 215, 247, 8203, 8204, 8205, 8239, 65279]
1
+ # frozen_string_literal: true
13
2
 
3
+ module Babosa
14
4
  # This class provides some string-manipulation methods specific to slugs.
15
5
  #
16
6
  # Note that this class includes many "bang methods" such as {#clean!} and
@@ -20,7 +10,7 @@ module Babosa
20
10
  # it is generated dynamically.
21
11
  #
22
12
  # All of the bang methods return an instance of String, while the bangless
23
- # versions return an instance of Babosa::Identifier, so that calls to methods
13
+ # versions return an instance of {Babosa::Identifier}, so that calls to methods
24
14
  # specific to this class can be chained:
25
15
  #
26
16
  # string = Identifier.new("hello world")
@@ -29,71 +19,47 @@ module Babosa
29
19
  #
30
20
  # @see http://www.utf8-chartable.de/unicode-utf8-table.pl?utf8=dec Unicode character table
31
21
  class Identifier
32
-
33
22
  Error = Class.new(StandardError)
34
23
 
35
24
  attr_reader :wrapped_string
36
25
  alias to_s wrapped_string
37
26
 
38
- @@utf8_proxy = if Babosa.jruby15?
39
- UTF8::JavaProxy
40
- elsif defined? Unicode::VERSION
41
- UTF8::UnicodeProxy
42
- elsif defined? ActiveSupport
43
- UTF8::ActiveSupportProxy
44
- else
45
- UTF8::DumbProxy
46
- end
47
-
48
- # Return the proxy used for UTF-8 support.
49
- # @see Babosa::UTF8::Proxy
50
- def self.utf8_proxy
51
- @@utf8_proxy
52
- end
53
-
54
- # Set a proxy object used for UTF-8 support.
55
- # @see Babosa::UTF8::Proxy
56
- def self.utf8_proxy=(obj)
57
- @@utf8_proxy = obj
58
- end
59
-
60
27
  def method_missing(symbol, *args, &block)
61
28
  @wrapped_string.__send__(symbol, *args, &block)
62
29
  end
63
30
 
31
+ def respond_to_missing?(name, include_all)
32
+ @wrapped_string.respond_to?(name, include_all)
33
+ end
34
+
64
35
  # @param string [#to_s] The string to use as the basis of the Identifier.
65
36
  def initialize(string)
66
- @wrapped_string = string.to_s
37
+ @wrapped_string = string.to_s.dup
67
38
  tidy_bytes!
68
39
  normalize_utf8!
69
40
  end
70
41
 
71
- def ==(value)
72
- @wrapped_string.to_s == value.to_s
73
- end
74
-
75
- def eql?(value)
76
- @wrapped_string == value
42
+ def ==(other)
43
+ to_s == other.to_s
77
44
  end
78
45
 
79
- def empty?
80
- # included to make this class :respond_to? :empty for compatibility with Active Support's
81
- # #blank?
82
- @wrapped_string.empty?
46
+ def eql?(other)
47
+ self == other
83
48
  end
84
49
 
85
- # Approximate an ASCII string. This works only for Western strings using
86
- # characters that are Roman-alphabet characters + diacritics. Non-letter
87
- # characters are left unmodified.
50
+ # Approximate an ASCII string. This works only for strings using characters
51
+ # that are Roman-alphabet characters + diacritics. Non-letter characters
52
+ # are left unmodified.
88
53
  #
89
- # string = Identifier.new "Łódź
54
+ # string = Identifier.new "Łódź, Poland"
90
55
  # string.transliterate # => "Lodz, Poland"
91
56
  # string = Identifier.new "日本"
92
57
  # string.transliterate # => "日本"
93
58
  #
94
- # You can pass any key(s) from +Characters.approximations+ as arguments. This allows
95
- # for contextual approximations. Various languages are supported, you can see which ones
96
- # by looking at the source of {Babosa::Transliterator::Base}.
59
+ # You can pass the names of any transliterator class as arguments. This
60
+ # allows for contextual approximations. Various languages are supported,
61
+ # you can see which ones by looking at the source of
62
+ # {Babosa::Transliterator::Base}.
97
63
  #
98
64
  # string = Identifier.new "Jürgen Müller"
99
65
  # string.transliterate # => "Jurgen Muller"
@@ -111,7 +77,7 @@ module Babosa
111
77
  # to remove non-ASCII characters such as "¡" and "¿", use {#to_ascii!}:
112
78
  #
113
79
  # string.transliterate!(:spanish) # => "¡Feliz anio!"
114
- # string.transliterate! # => "¡Feliz anio!"
80
+ # string.to_ascii! # => "Feliz anio!"
115
81
  #
116
82
  # @param *args <Symbol>
117
83
  # @return String
@@ -122,40 +88,50 @@ module Babosa
122
88
  transliterator = Transliterator.get(kind).instance
123
89
  @wrapped_string = transliterator.transliterate(@wrapped_string)
124
90
  end
125
- @wrapped_string
91
+ to_s
126
92
  end
127
93
 
128
94
  # Converts dashes to spaces, removes leading and trailing spaces, and
129
95
  # replaces multiple whitespace characters with a single space.
96
+ #
130
97
  # @return String
131
98
  def clean!
132
- @wrapped_string = @wrapped_string.gsub("-", " ").squeeze(" ").strip
99
+ gsub!(/[- ]+/, " ")
100
+ strip!
101
+ to_s
133
102
  end
134
103
 
135
104
  # Remove any non-word characters. For this library's purposes, this means
136
- # anything other than letters, numbers, spaces, newlines and linefeeds.
105
+ # anything other than letters, numbers, spaces, underscores, dashes,
106
+ # newlines, and linefeeds.
107
+ #
137
108
  # @return String
138
109
  def word_chars!
139
- @wrapped_string = (unpack("U*") - Babosa::STRIPPABLE).pack("U*")
110
+ # `[^\p{letter}]` = Any character that is not a Unicode letter
111
+ # `&&` = intersect with the following character class
112
+ # `[^ _\-\n\r]` = Anything other than space, underscore, dash, newline or carriage return
113
+ gsub!(/[[^\p{letter}]&&[^ _\-\n\r]]/, "")
114
+ to_s
140
115
  end
141
116
 
142
117
  # Normalize the string for use as a URL slug. Note that in this context,
143
118
  # +normalize+ means, strip, remove non-letters/numbers, downcasing,
144
119
  # truncating to 255 bytes and converting whitespace to dashes.
145
- # @param Options
120
+ #
121
+ # @param options [Hash]
146
122
  # @return String
147
- def normalize!(options = nil)
148
- options = default_normalize_options.merge(options || {})
123
+ def normalize!(options = {})
124
+ options = default_normalize_options.merge(options)
149
125
 
150
- if translit_option = options[:transliterate]
151
- if translit_option != true
152
- transliterate!(*translit_option)
126
+ if options[:transliterate]
127
+ option = options[:transliterate]
128
+ if option != true
129
+ transliterate!(*option)
153
130
  else
154
131
  transliterate!(*options[:transliterations])
155
132
  end
156
133
  end
157
134
  to_ascii! if options[:to_ascii]
158
- clean!
159
135
  word_chars!
160
136
  clean!
161
137
  downcase!
@@ -164,105 +140,90 @@ module Babosa
164
140
  end
165
141
 
166
142
  # Normalize a string so that it can safely be used as a Ruby method name.
167
- def to_ruby_method!(allow_bangs = true)
168
- leader, trailer = @wrapped_string.strip.scan(/\A(.+)(.)\z/).flatten
169
- leader = leader.to_s.dup
170
- trailer = trailer.to_s.dup
171
- if allow_bangs
172
- trailer.downcase!
173
- trailer.gsub!(/[^a-z0-9!=\\?]/, '')
174
- else
175
- trailer.downcase!
176
- trailer.gsub!(/[^a-z0-9]/, '')
177
- end
178
- id = leader.to_identifier
179
- id.transliterate!
180
- id.to_ascii!
181
- id.clean!
182
- id.word_chars!
183
- id.clean!
184
- @wrapped_string = id.to_s + trailer
185
- if @wrapped_string == ""
186
- raise Error, "Input generates impossible Ruby method name"
187
- end
143
+ #
144
+ # @param allow_bangs [Boolean]
145
+ # @return String
146
+ def to_ruby_method!(allow_bangs: true)
147
+ last_char = self[-1]
148
+ transliterate!
149
+ to_ascii!
150
+ word_chars!
151
+ clean!
152
+ @wrapped_string += last_char if allow_bangs && ["!", "?"].include?(last_char)
153
+ raise Error, "Input generates impossible Ruby method name" if self == ""
154
+
188
155
  with_separators!("_")
189
156
  end
190
157
 
191
158
  # Delete any non-ascii characters.
159
+ #
192
160
  # @return String
193
161
  def to_ascii!
194
- @wrapped_string = @wrapped_string.gsub(/[^\x00-\x7f]/u, '')
162
+ gsub!(/[^\x00-\x7f]/u, "")
163
+ to_s
195
164
  end
196
165
 
197
166
  # Truncate the string to +max+ characters.
167
+ #
198
168
  # @example
199
169
  # "üéøá".to_identifier.truncate(3) #=> "üéø"
170
+ #
171
+ # @param max [Integer] The maximum number of characters.
200
172
  # @return String
201
173
  def truncate!(max)
202
- @wrapped_string = unpack("U*")[0...max].pack("U*")
174
+ @wrapped_string = slice(0, max)
203
175
  end
204
176
 
205
177
  # Truncate the string to +max+ bytes. This can be useful for ensuring that
206
178
  # a UTF-8 string will always fit into a database column with a certain max
207
179
  # byte length. The resulting string may be less than +max+ if the string must
208
180
  # be truncated at a multibyte character boundary.
181
+ #
209
182
  # @example
210
183
  # "üéøá".to_identifier.truncate_bytes(3) #=> "ü"
184
+ #
185
+ # @param max [Integer] The maximum number of bytes.
211
186
  # @return String
212
187
  def truncate_bytes!(max)
213
- return @wrapped_string if @wrapped_string.bytesize <= max
214
- curr = 0
215
- new = []
216
- unpack("U*").each do |char|
217
- break if curr > max
218
- char = [char].pack("U")
219
- curr += char.bytesize
220
- if curr <= max
221
- new << char
222
- end
223
- end
224
- @wrapped_string = new.join
188
+ truncate!(max)
189
+ chop! until bytesize <= max
225
190
  end
226
191
 
227
192
  # Replaces whitespace with dashes ("-").
193
+ #
194
+ # @param char [String] the separator character to use.
228
195
  # @return String
229
196
  def with_separators!(char = "-")
230
- @wrapped_string = @wrapped_string.gsub(/\s/u, char)
231
- end
232
-
233
- # Perform UTF-8 sensitive upcasing.
234
- # @return String
235
- def upcase!
236
- @wrapped_string = @@utf8_proxy.upcase(@wrapped_string)
237
- end
238
-
239
- # Perform UTF-8 sensitive downcasing.
240
- # @return String
241
- def downcase!
242
- @wrapped_string = @@utf8_proxy.downcase(@wrapped_string)
197
+ gsub!(/\s/u, char)
198
+ to_s
243
199
  end
244
200
 
245
201
  # Perform Unicode composition on the wrapped string.
202
+ #
246
203
  # @return String
247
204
  def normalize_utf8!
248
- @wrapped_string = @@utf8_proxy.normalize_utf8(@wrapped_string)
205
+ unicode_normalize!(:nfc)
206
+ to_s
249
207
  end
250
208
 
251
209
  # Attempt to convert characters encoded using CP1252 and ISO-8859-1 to
252
210
  # UTF-8.
253
211
  # @return String
254
212
  def tidy_bytes!
255
- @wrapped_string = @@utf8_proxy.tidy_bytes(@wrapped_string)
213
+ scrub! do |bad|
214
+ bad.encode(Encoding::UTF_8, Encoding::Windows_1252, invalid: :replace, undef: :replace)
215
+ end
216
+ to_s
256
217
  end
257
218
 
258
219
  %w[transliterate clean downcase word_chars normalize normalize_utf8
259
- tidy_bytes to_ascii to_ruby_method truncate truncate_bytes upcase
260
- with_separators].each do |method|
261
- class_eval(<<-EOM, __FILE__, __LINE__ + 1)
220
+ tidy_bytes to_ascii to_ruby_method truncate truncate_bytes upcase
221
+ with_separators].each do |method|
222
+ class_eval(<<-METHOD, __FILE__, __LINE__ + 1)
262
223
  def #{method}(*args)
263
224
  send_to_new_instance(:#{method}!, *args)
264
225
  end
265
- EOM
226
+ METHOD
266
227
  end
267
228
 
268
229
  def to_identifier
@@ -271,7 +232,7 @@ module Babosa
271
232
 
272
233
  # The default options for {#normalize!}. Override to set your own defaults.
273
234
  def default_normalize_options
274
- {:transliterate => true, :max_length => 255, :separator => "-"}
235
+ {transliterate: :latin, max_length: 255, separator: "-"}
275
236
  end
276
237
 
277
238
  alias approximate_ascii transliterate