babosa 1.0.4 → 2.0.0.beta
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Changelog.md +12 -0
- data/README.md +80 -117
- data/Rakefile +9 -8
- data/lib/babosa.rb +2 -4
- data/lib/babosa/identifier.rb +82 -121
- data/lib/babosa/transliterator/base.rb +57 -56
- data/lib/babosa/transliterator/bulgarian.rb +3 -2
- data/lib/babosa/transliterator/cyrillic.rb +5 -5
- data/lib/babosa/transliterator/danish.rb +3 -3
- data/lib/babosa/transliterator/german.rb +3 -2
- data/lib/babosa/transliterator/greek.rb +4 -3
- data/lib/babosa/transliterator/hindi.rb +3 -2
- data/lib/babosa/transliterator/latin.rb +5 -5
- data/lib/babosa/transliterator/macedonian.rb +3 -2
- data/lib/babosa/transliterator/norwegian.rb +3 -3
- data/lib/babosa/transliterator/romanian.rb +3 -2
- data/lib/babosa/transliterator/russian.rb +3 -2
- data/lib/babosa/transliterator/serbian.rb +29 -27
- data/lib/babosa/transliterator/spanish.rb +2 -2
- data/lib/babosa/transliterator/swedish.rb +3 -3
- data/lib/babosa/transliterator/turkish.rb +8 -8
- data/lib/babosa/transliterator/ukrainian.rb +5 -4
- data/lib/babosa/transliterator/vietnamese.rb +4 -3
- data/lib/babosa/version.rb +3 -1
- data/spec/{babosa_spec.rb → identifier_spec.rb} +9 -10
- data/spec/spec_helper.rb +6 -6
- data/spec/transliterators/base_spec.rb +5 -6
- data/spec/transliterators/bulgarian_spec.rb +4 -5
- data/spec/transliterators/danish_spec.rb +5 -6
- data/spec/transliterators/german_spec.rb +4 -5
- data/spec/transliterators/greek_spec.rb +7 -7
- data/spec/transliterators/hindi_spec.rb +7 -7
- data/spec/transliterators/latin_spec.rb +3 -4
- data/spec/transliterators/macedonian_spec.rb +3 -4
- data/spec/transliterators/norwegian_spec.rb +4 -4
- data/spec/transliterators/polish_spec.rb +3 -5
- data/spec/transliterators/romanian_spec.rb +5 -6
- data/spec/transliterators/russian_spec.rb +3 -4
- data/spec/transliterators/serbian_spec.rb +6 -7
- data/spec/transliterators/spanish_spec.rb +4 -5
- data/spec/transliterators/swedish_spec.rb +7 -7
- data/spec/transliterators/turkish_spec.rb +24 -24
- data/spec/transliterators/ukrainian_spec.rb +74 -75
- data/spec/transliterators/vietnamese_spec.rb +10 -10
- metadata +17 -38
- data/lib/babosa/utf8/active_support_proxy.rb +0 -38
- data/lib/babosa/utf8/dumb_proxy.rb +0 -49
- data/lib/babosa/utf8/java_proxy.rb +0 -22
- data/lib/babosa/utf8/mappings.rb +0 -193
- data/lib/babosa/utf8/proxy.rb +0 -125
- data/lib/babosa/utf8/unicode_proxy.rb +0 -23
- data/spec/utf8_proxy_spec.rb +0 -52
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: ef1346a05f1b3a1104af8a095b7829eaa853427927335c7905d41d5ab47b2e9c
|
4
|
+
data.tar.gz: 54e543a250c7eff0c9613bea3f22d2e79317707f60cbb364e6e68979237c4c53
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d4f6732579088cda9d4514b4d1dddd32f2cd933f2818ed4c74bf0f00168ccc3b20e6ce220bc1c473a0453d29ef50de256689ed17e1f809b42dcb8956377c0e76
|
7
|
+
data.tar.gz: 797887db1d626a92b28249883f2dcc55e3b92d9f423aa23052ba74cc45856a15ace2b6c823c983b6e6baa0907cfc3d467b9a52e05cd9de9d9fcf37c779bd0647
|
data/Changelog.md
CHANGED
@@ -1,5 +1,17 @@
|
|
1
1
|
# Babosa Changelog
|
2
2
|
|
3
|
+
## 2.0.0
|
4
|
+
|
5
|
+
This release contains no important changes. I had a week off from work and
|
6
|
+
decided to refactor the code. However there are some small breaking changes so
|
7
|
+
I have released it as 2.0.0.
|
8
|
+
|
9
|
+
* Refactor internals for simplicity
|
10
|
+
* Use built-in Ruby UTF-8 support in place of other gems.
|
11
|
+
* Drop support for Ruby < 2.5.0.
|
12
|
+
* `Babosa::Identifier#word_chars` no longer removes dashes
|
13
|
+
* `Babosa::Identifier#to_ruby_method` default argument `allow_bangs` is now a keyword argument
|
14
|
+
|
3
15
|
## 1.0.4
|
4
16
|
|
5
17
|
* Fix nil being cast to frozen string (https://github.com/norman/babosa/pull/52)
|
data/README.md
CHANGED
@@ -15,12 +15,16 @@ FriendlyId.
|
|
15
15
|
|
16
16
|
### Transliterate UTF-8 characters to ASCII
|
17
17
|
|
18
|
-
|
18
|
+
```ruby
|
19
|
+
"Gölcük, Turkey".to_slug.transliterate.to_s #=> "Golcuk, Turkey"
|
20
|
+
```
|
19
21
|
|
20
22
|
### Locale sensitive transliteration, with support for many languages
|
21
23
|
|
22
|
-
|
23
|
-
|
24
|
+
```ruby
|
25
|
+
"Jürgen Müller".to_slug.transliterate.to_s #=> "Jurgen Muller"
|
26
|
+
"Jürgen Müller".to_slug.transliterate(:german).to_s #=> "Juergen Mueller"
|
27
|
+
```
|
24
28
|
|
25
29
|
Currently supported languages include:
|
26
30
|
|
@@ -28,6 +32,7 @@ Currently supported languages include:
|
|
28
32
|
* Danish
|
29
33
|
* German
|
30
34
|
* Greek
|
35
|
+
* Hindi
|
31
36
|
* Macedonian
|
32
37
|
* Norwegian
|
33
38
|
* Romanian
|
@@ -35,124 +40,125 @@ Currently supported languages include:
|
|
35
40
|
* Serbian
|
36
41
|
* Spanish
|
37
42
|
* Swedish
|
43
|
+
* Turkish
|
38
44
|
* Ukrainian
|
45
|
+
* Vietnamese
|
46
|
+
|
47
|
+
Additionally there are generic transliterators for transliterating from the
|
48
|
+
Cyrillic alphabet and Latin alphabet with diacritics. The Latin transliterator
|
49
|
+
can be used, for example, with Czech. There is also a transliterator named
|
50
|
+
"Hindi" which may be sufficient for other Indic languages using Devanagari, but
|
51
|
+
I do not know enough to say whether the transliterations would make sense.
|
39
52
|
|
40
53
|
I'll gladly accept contributions from fluent speakers to support more languages.
|
41
54
|
|
42
55
|
### Strip non-ASCII characters
|
43
56
|
|
44
|
-
|
57
|
+
```ruby
|
58
|
+
"Gölcük, Turkey".to_slug.to_ascii.to_s #=> "Glck, Turkey"
|
59
|
+
```
|
45
60
|
|
46
61
|
### Truncate by characters
|
47
62
|
|
48
|
-
|
63
|
+
```ruby
|
64
|
+
"üüü".to_slug.truncate(2).to_s #=> "üü"
|
65
|
+
```
|
49
66
|
|
50
67
|
### Truncate by bytes
|
51
68
|
|
52
69
|
This can be useful to ensure the generated slug will fit in a database column
|
53
70
|
whose length is limited by bytes rather than UTF-8 characters.
|
54
71
|
|
55
|
-
|
72
|
+
```ruby
|
73
|
+
"üüü".to_slug.truncate_bytes(2).to_s #=> "ü"
|
74
|
+
```
|
56
75
|
|
57
76
|
### Remove punctuation chars
|
58
77
|
|
59
|
-
|
78
|
+
```ruby
|
79
|
+
"this is, um, **really** cool, huh?".to_slug.word_chars.to_s #=> "this is um really cool huh"
|
80
|
+
```
|
60
81
|
|
61
82
|
### All-in-one
|
62
83
|
|
63
|
-
|
84
|
+
```ruby
|
85
|
+
"Gölcük, Turkey".to_slug.normalize.to_s #=> "golcuk-turkey"
|
86
|
+
```
|
64
87
|
|
65
88
|
### Other stuff
|
66
89
|
|
67
|
-
#### Using Babosa With FriendlyId 4
|
68
|
-
|
69
|
-
require "babosa"
|
90
|
+
#### Using Babosa With FriendlyId 4+
|
70
91
|
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
def normalize_friendly_id(input)
|
75
|
-
input.to_s.to_slug.normalize(transliterations: :russian).to_s
|
76
|
-
end
|
77
|
-
end
|
92
|
+
```ruby
|
93
|
+
require "babosa"
|
78
94
|
|
79
|
-
|
95
|
+
class Person < ActiveRecord::Base
|
96
|
+
friendly_id :name, use: :slugged
|
80
97
|
|
81
|
-
|
82
|
-
|
83
|
-
|
98
|
+
def normalize_friendly_id(input)
|
99
|
+
input.to_s.to_slug.normalize(transliterations: :russian).to_s
|
100
|
+
end
|
101
|
+
end
|
102
|
+
```
|
84
103
|
|
85
|
-
|
86
|
-
Babosa, or fall back to a simple built-in library. Supported
|
87
|
-
Unicode libraries include:
|
104
|
+
#### UTF-8 support
|
88
105
|
|
89
|
-
|
90
|
-
* Active Support
|
91
|
-
* [Unicode](https://github.com/blackwinter/unicode)
|
92
|
-
* Built-in
|
93
|
-
|
94
|
-
This built-in module is much faster than Active Support but much slower than
|
95
|
-
Java or Unicode. It can only do **very** naive Unicode composition to ensure
|
96
|
-
that, for example, "é" will always be composed to a single codepoint rather than
|
97
|
-
an "e" and a "´" - making it safe to use as a hash key.
|
98
|
-
|
99
|
-
But seriously - save yourself the headache and install a real Unicode library.
|
100
|
-
If you are using Babosa with a language that uses the Cyrillic alphabet, Babosa
|
101
|
-
requires either Unicode, Active Support or Java.
|
106
|
+
Babosa normalizes all input strings [to NFC](https://en.wikipedia.org/wiki/Unicode_equivalence#Normal_forms).
|
102
107
|
|
103
108
|
#### Ruby Method Names
|
104
109
|
|
105
|
-
Babosa can
|
110
|
+
Babosa can generate strings for Ruby method names. (Yes, Ruby 1.9+ can use
|
106
111
|
UTF-8 chars in method names, but you may not want to):
|
107
112
|
|
108
113
|
|
109
|
-
|
110
|
-
|
114
|
+
```ruby
|
115
|
+
"this is a method".to_slug.to_ruby_method! #=> this_is_a_method
|
116
|
+
"über cool stuff!".to_slug.to_ruby_method! #=> uber_cool_stuff!
|
111
117
|
|
112
|
-
|
113
|
-
|
118
|
+
# You can also disallow trailing punctuation chars
|
119
|
+
"über cool stuff!".to_slug.to_ruby_method(allow_bangs: false) #=> uber_cool_stuff
|
120
|
+
```
|
114
121
|
|
115
122
|
#### Easy to Extend
|
116
123
|
|
117
124
|
You can add custom transliterators for your language with very little code. For
|
118
125
|
example here's the transliterator for German:
|
119
126
|
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
end
|
133
|
-
end
|
127
|
+
```ruby
|
128
|
+
module Babosa
|
129
|
+
module Transliterator
|
130
|
+
class German < Latin
|
131
|
+
APPROXIMATIONS = {
|
132
|
+
"ä" => "ae",
|
133
|
+
"ö" => "oe",
|
134
|
+
"ü" => "ue",
|
135
|
+
"Ä" => "Ae",
|
136
|
+
"Ö" => "Oe",
|
137
|
+
"Ü" => "Ue"
|
138
|
+
}
|
134
139
|
end
|
140
|
+
end
|
141
|
+
end
|
142
|
+
```
|
135
143
|
|
136
144
|
And a spec (you can use this as a template):
|
137
145
|
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
describe Babosa::Transliterator::German do
|
146
|
+
```ruby
|
147
|
+
require "spec_helper"
|
142
148
|
|
143
|
-
|
144
|
-
|
149
|
+
describe Babosa::Transliterator::German do
|
150
|
+
let(:t) { described_class.instance }
|
151
|
+
it_behaves_like "a latin transliterator"
|
145
152
|
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
it "should transliterate vowels with umlauts" do
|
151
|
-
t.transliterate("üöä").should eql("ueoeae")
|
152
|
-
end
|
153
|
-
|
154
|
-
end
|
153
|
+
it "should transliterate Eszett" do
|
154
|
+
t.transliterate("ß").should eql("ss")
|
155
|
+
end
|
155
156
|
|
157
|
+
it "should transliterate vowels with umlauts" do
|
158
|
+
t.transliterate("üöä").should eql("ueoeae")
|
159
|
+
end
|
160
|
+
end
|
161
|
+
```
|
156
162
|
|
157
163
|
### Rails 3.x and higher
|
158
164
|
|
@@ -167,46 +173,6 @@ and
|
|
167
173
|
[parameterize](http://api.rubyonrails.org/classes/ActiveSupport/Inflector.html#method-i-parameterize)
|
168
174
|
to see if they suit your needs.
|
169
175
|
|
170
|
-
### Babosa vs. Stringex
|
171
|
-
|
172
|
-
Babosa provides much of the functionality provided by the
|
173
|
-
[Stringex](https://github.com/rsl/stringex) gem, but in the subjective opinion
|
174
|
-
of the author, is for most use cases a better choice.
|
175
|
-
|
176
|
-
#### Fewer Features
|
177
|
-
|
178
|
-
Stringex offers functionality for storing slugs in an Active Record model, like
|
179
|
-
a simple version of [FriendlyId](http://github.com/norman/friendly_id), in
|
180
|
-
addition to string processing. Babosa only does string processing.
|
181
|
-
|
182
|
-
#### Less Aggressive Unicode Transliteration
|
183
|
-
|
184
|
-
Stringex uses an agressive Unicode to ASCII mapping which outputs gibberish for
|
185
|
-
almost anything but Western European langages and Mandarin Chinese. Babosa
|
186
|
-
supports only languages for which fluent speakers have provided
|
187
|
-
transliterations, to ensure that the output makes sense to users.
|
188
|
-
|
189
|
-
#### Unicode Support
|
190
|
-
|
191
|
-
Stringex does no Unicode normalization or validation before transliterating
|
192
|
-
strings, so if you pass in strings with encoding errors or with different
|
193
|
-
Unicode normalizations, you'll get unpredictable results.
|
194
|
-
|
195
|
-
#### No Locale Assumptions
|
196
|
-
|
197
|
-
Babosa avoids making assumptions about locales like Stringex does, so it doesn't
|
198
|
-
offer transliterations like this out of the box:
|
199
|
-
|
200
|
-
"$12 worth of Ruby power".to_url => "12-dollars-worth-of-ruby-power"
|
201
|
-
|
202
|
-
This is because the symbol "$" is used in many Latin American countries for the
|
203
|
-
peso. Stringex does this in many places, for example, transliterating all Han
|
204
|
-
characters into Pinyin, effectively treating Japanese text as if it were
|
205
|
-
Mandarin Chinese.
|
206
|
-
|
207
|
-
|
208
|
-
### More info
|
209
|
-
|
210
176
|
Please see the [API docs](http://rubydoc.info/github/norman/babosa/master/frames) and source code for
|
211
177
|
more info.
|
212
178
|
|
@@ -218,9 +184,6 @@ Babosa can be installed via Rubygems:
|
|
218
184
|
|
219
185
|
You can get the source code from its [Github repository](http://github.com/norman/babosa).
|
220
186
|
|
221
|
-
Babosa is tested to be compatible with Ruby 2.x, JRuby 1.7+, and
|
222
|
-
Rubinius 2.x It's probably compatible with other Rubies as well.
|
223
|
-
|
224
187
|
## Reporting bugs
|
225
188
|
|
226
189
|
Please use Babosa's [Github issue
|
@@ -229,7 +192,7 @@ tracker](http://github.com/norman/babosa/issues).
|
|
229
192
|
|
230
193
|
## Misc
|
231
194
|
|
232
|
-
"Babosa" means slug in Spanish.
|
195
|
+
"Babosa" means "slug" in Spanish.
|
233
196
|
|
234
197
|
## Author
|
235
198
|
|
@@ -258,7 +221,7 @@ Many thanks to the following people for their help:
|
|
258
221
|
|
259
222
|
## Copyright
|
260
223
|
|
261
|
-
Copyright (c) 2010-
|
224
|
+
Copyright (c) 2010-2020 Norman Clarke
|
262
225
|
|
263
226
|
Permission is hereby granted, free of charge, to any person obtaining a copy of
|
264
227
|
this software and associated documentation files (the "Software"), to deal in
|
data/Rakefile
CHANGED
@@ -1,10 +1,13 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require "rubygems"
|
2
4
|
require "rake/testtask"
|
3
5
|
require "rake/clean"
|
4
6
|
require "rubygems/package_task"
|
7
|
+
require "rubocop/rake_task"
|
5
8
|
|
6
|
-
task :
|
7
|
-
task :
|
9
|
+
task default: [:rubocop, :spec]
|
10
|
+
task test: :spec
|
8
11
|
|
9
12
|
CLEAN << "pkg" << "doc" << "coverage" << ".yardoc"
|
10
13
|
|
@@ -14,6 +17,7 @@ begin
|
|
14
17
|
t.options = ["--output-dir=doc"]
|
15
18
|
end
|
16
19
|
rescue LoadError
|
20
|
+
puts "Yard not present"
|
17
21
|
end
|
18
22
|
|
19
23
|
begin
|
@@ -23,12 +27,9 @@ begin
|
|
23
27
|
Rake::Task["spec"].execute
|
24
28
|
end
|
25
29
|
rescue LoadError
|
30
|
+
puts "SimpleCov not present"
|
26
31
|
end
|
27
32
|
|
28
|
-
|
29
|
-
if File.exist? gemspec
|
30
|
-
Gem::PackageTask.new(eval(File.read(gemspec))) { |pkg| }
|
31
|
-
end
|
32
|
-
|
33
|
-
require 'rspec/core/rake_task'
|
33
|
+
require "rspec/core/rake_task"
|
34
34
|
RSpec::Core::RakeTask.new(:spec)
|
35
|
+
RuboCop::RakeTask.new
|
data/lib/babosa.rb
CHANGED
@@ -1,7 +1,6 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module Babosa
|
2
|
-
def self.jruby15?
|
3
|
-
JRUBY_VERSION >= "1.5" rescue false
|
4
|
-
end
|
5
4
|
end
|
6
5
|
|
7
6
|
class String
|
@@ -12,5 +11,4 @@ class String
|
|
12
11
|
end
|
13
12
|
|
14
13
|
require "babosa/transliterator/base"
|
15
|
-
require "babosa/utf8/proxy"
|
16
14
|
require "babosa/identifier"
|
data/lib/babosa/identifier.rb
CHANGED
@@ -1,16 +1,6 @@
|
|
1
|
-
#
|
2
|
-
module Babosa
|
3
|
-
|
4
|
-
# Codepoints for characters that will be deleted by +#word_chars!+.
|
5
|
-
STRIPPABLE = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 14, 15, 16, 17, 18, 19,
|
6
|
-
20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 33, 34, 35, 36, 37, 38, 39,
|
7
|
-
40, 41, 42, 43, 44, 45, 46, 47, 58, 59, 60, 61, 62, 63, 64, 91, 92, 93, 94,
|
8
|
-
96, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136,
|
9
|
-
137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151,
|
10
|
-
152, 153, 154, 155, 156, 157, 158, 159, 161, 162, 163, 164, 165, 166, 167,
|
11
|
-
168, 169, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 182, 183, 184,
|
12
|
-
185, 187, 188, 189, 190, 191, 215, 247, 8203, 8204, 8205, 8239, 65279]
|
1
|
+
# frozen_string_literal: true
|
13
2
|
|
3
|
+
module Babosa
|
14
4
|
# This class provides some string-manipulation methods specific to slugs.
|
15
5
|
#
|
16
6
|
# Note that this class includes many "bang methods" such as {#clean!} and
|
@@ -20,7 +10,7 @@ module Babosa
|
|
20
10
|
# it is generated dynamically.
|
21
11
|
#
|
22
12
|
# All of the bang methods return an instance of String, while the bangless
|
23
|
-
# versions return an instance of Babosa::Identifier, so that calls to methods
|
13
|
+
# versions return an instance of {Babosa::Identifier}, so that calls to methods
|
24
14
|
# specific to this class can be chained:
|
25
15
|
#
|
26
16
|
# string = Identifier.new("hello world")
|
@@ -29,71 +19,47 @@ module Babosa
|
|
29
19
|
#
|
30
20
|
# @see http://www.utf8-chartable.de/unicode-utf8-table.pl?utf8=dec Unicode character table
|
31
21
|
class Identifier
|
32
|
-
|
33
22
|
Error = Class.new(StandardError)
|
34
23
|
|
35
24
|
attr_reader :wrapped_string
|
36
25
|
alias to_s wrapped_string
|
37
26
|
|
38
|
-
@@utf8_proxy = if Babosa.jruby15?
|
39
|
-
UTF8::JavaProxy
|
40
|
-
elsif defined? Unicode::VERSION
|
41
|
-
UTF8::UnicodeProxy
|
42
|
-
elsif defined? ActiveSupport
|
43
|
-
UTF8::ActiveSupportProxy
|
44
|
-
else
|
45
|
-
UTF8::DumbProxy
|
46
|
-
end
|
47
|
-
|
48
|
-
# Return the proxy used for UTF-8 support.
|
49
|
-
# @see Babosa::UTF8::Proxy
|
50
|
-
def self.utf8_proxy
|
51
|
-
@@utf8_proxy
|
52
|
-
end
|
53
|
-
|
54
|
-
# Set a proxy object used for UTF-8 support.
|
55
|
-
# @see Babosa::UTF8::Proxy
|
56
|
-
def self.utf8_proxy=(obj)
|
57
|
-
@@utf8_proxy = obj
|
58
|
-
end
|
59
|
-
|
60
27
|
def method_missing(symbol, *args, &block)
|
61
28
|
@wrapped_string.__send__(symbol, *args, &block)
|
62
29
|
end
|
63
30
|
|
31
|
+
def respond_to_missing?(name, include_all)
|
32
|
+
@wrapped_string.respond_to?(name, include_all)
|
33
|
+
end
|
34
|
+
|
64
35
|
# @param string [#to_s] The string to use as the basis of the Identifier.
|
65
36
|
def initialize(string)
|
66
|
-
@wrapped_string = string.to_s
|
37
|
+
@wrapped_string = string.to_s.dup
|
67
38
|
tidy_bytes!
|
68
39
|
normalize_utf8!
|
69
40
|
end
|
70
41
|
|
71
|
-
def ==(
|
72
|
-
|
73
|
-
end
|
74
|
-
|
75
|
-
def eql?(value)
|
76
|
-
@wrapped_string == value
|
42
|
+
def ==(other)
|
43
|
+
to_s == other.to_s
|
77
44
|
end
|
78
45
|
|
79
|
-
def
|
80
|
-
|
81
|
-
# #blank?
|
82
|
-
@wrapped_string.empty?
|
46
|
+
def eql?(other)
|
47
|
+
self == other
|
83
48
|
end
|
84
49
|
|
85
|
-
# Approximate an ASCII string. This works only for
|
86
|
-
#
|
87
|
-
#
|
50
|
+
# Approximate an ASCII string. This works only for strings using characters
|
51
|
+
# that are Roman-alphabet characters + diacritics. Non-letter characters
|
52
|
+
# are left unmodified.
|
88
53
|
#
|
89
|
-
# string = Identifier.new "
|
54
|
+
# string = Identifier.new "Łódź, Poland"
|
90
55
|
# string.transliterate # => "Lodz, Poland"
|
91
56
|
# string = Identifier.new "日本"
|
92
57
|
# string.transliterate # => "日本"
|
93
58
|
#
|
94
|
-
# You can pass any
|
95
|
-
# for contextual approximations. Various languages are supported,
|
96
|
-
# by looking at the source of
|
59
|
+
# You can pass the names of any transliterator class as arguments. This
|
60
|
+
# allows for contextual approximations. Various languages are supported,
|
61
|
+
# you can see which ones by looking at the source of
|
62
|
+
# {Babosa::Transliterator::Base}.
|
97
63
|
#
|
98
64
|
# string = Identifier.new "Jürgen Müller"
|
99
65
|
# string.transliterate # => "Jurgen Muller"
|
@@ -111,7 +77,7 @@ module Babosa
|
|
111
77
|
# to remove non-ASCII characters such as "¡" and "¿", use {#to_ascii!}:
|
112
78
|
#
|
113
79
|
# string.transliterate!(:spanish) # => "¡Feliz anio!"
|
114
|
-
# string.
|
80
|
+
# string.to_ascii! # => "Feliz anio!"
|
115
81
|
#
|
116
82
|
# @param *args <Symbol>
|
117
83
|
# @return String
|
@@ -122,40 +88,50 @@ module Babosa
|
|
122
88
|
transliterator = Transliterator.get(kind).instance
|
123
89
|
@wrapped_string = transliterator.transliterate(@wrapped_string)
|
124
90
|
end
|
125
|
-
|
91
|
+
to_s
|
126
92
|
end
|
127
93
|
|
128
94
|
# Converts dashes to spaces, removes leading and trailing spaces, and
|
129
95
|
# replaces multiple whitespace characters with a single space.
|
96
|
+
#
|
130
97
|
# @return String
|
131
98
|
def clean!
|
132
|
-
|
99
|
+
gsub!(/[- ]+/, " ")
|
100
|
+
strip!
|
101
|
+
to_s
|
133
102
|
end
|
134
103
|
|
135
104
|
# Remove any non-word characters. For this library's purposes, this means
|
136
|
-
# anything other than letters, numbers, spaces,
|
105
|
+
# anything other than letters, numbers, spaces, underscores, dashes,
|
106
|
+
# newlines, and carriage returns.
|
107
|
+
#
|
137
108
|
# @return String
|
138
109
|
def word_chars!
|
139
|
-
|
110
|
+
# `^\p{letter}` = Any character that is not a Unicode letter
|
111
|
+
# `&&` = intersect with the following character class
|
112
|
+
# `[^ _\-\n\r]` = Anything other than space, underscore, dash, newline or carriage return
|
113
|
+
gsub!(/[[^\p{letter}]&&[^ _\-\n\r]]/, "")
|
114
|
+
to_s
|
140
115
|
end
|
141
116
|
|
142
117
|
# Normalize the string for use as a URL slug. Note that in this context,
|
143
118
|
# +normalize+ means stripping, removing non-letters/numbers, downcasing,
|
144
119
|
# truncating to 255 bytes and converting whitespace to dashes.
|
145
|
-
#
|
120
|
+
#
|
121
|
+
# @param options [Hash]
|
146
122
|
# @return String
|
147
|
-
def normalize!(options =
|
148
|
-
options = default_normalize_options.merge(options
|
123
|
+
def normalize!(options = {})
|
124
|
+
options = default_normalize_options.merge(options)
|
149
125
|
|
150
|
-
if
|
151
|
-
|
152
|
-
|
126
|
+
if options[:transliterate]
|
127
|
+
option = options[:transliterate]
|
128
|
+
if option != true
|
129
|
+
transliterate!(*option)
|
153
130
|
else
|
154
131
|
transliterate!(*options[:transliterations])
|
155
132
|
end
|
156
133
|
end
|
157
134
|
to_ascii! if options[:to_ascii]
|
158
|
-
clean!
|
159
135
|
word_chars!
|
160
136
|
clean!
|
161
137
|
downcase!
|
@@ -164,105 +140,90 @@ module Babosa
|
|
164
140
|
end
|
165
141
|
|
166
142
|
# Normalize a string so that it can safely be used as a Ruby method name.
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
id.transliterate!
|
180
|
-
id.to_ascii!
|
181
|
-
id.clean!
|
182
|
-
id.word_chars!
|
183
|
-
id.clean!
|
184
|
-
@wrapped_string = id.to_s + trailer
|
185
|
-
if @wrapped_string == ""
|
186
|
-
raise Error, "Input generates impossible Ruby method name"
|
187
|
-
end
|
143
|
+
#
|
144
|
+
# @param allow_bangs [Boolean]
|
145
|
+
# @return String
|
146
|
+
def to_ruby_method!(allow_bangs: true)
|
147
|
+
last_char = self[-1]
|
148
|
+
transliterate!
|
149
|
+
to_ascii!
|
150
|
+
word_chars!
|
151
|
+
clean!
|
152
|
+
@wrapped_string += last_char if allow_bangs && ["!", "?"].include?(last_char)
|
153
|
+
raise Error, "Input generates impossible Ruby method name" if self == ""
|
154
|
+
|
188
155
|
with_separators!("_")
|
189
156
|
end
|
190
157
|
|
191
158
|
# Delete any non-ascii characters.
|
159
|
+
#
|
192
160
|
# @return String
|
193
161
|
def to_ascii!
|
194
|
-
|
162
|
+
gsub!(/[^\x00-\x7f]/u, "")
|
163
|
+
to_s
|
195
164
|
end
|
196
165
|
|
197
166
|
# Truncate the string to +max+ characters.
|
167
|
+
#
|
198
168
|
# @example
|
199
169
|
# "üéøá".to_identifier.truncate(3) #=> "üéø"
|
170
|
+
#
|
171
|
+
# @param max [Integer] The maximum number of characters.
|
200
172
|
# @return String
|
201
173
|
def truncate!(max)
|
202
|
-
@wrapped_string =
|
174
|
+
@wrapped_string = slice(0, max)
|
203
175
|
end
|
204
176
|
|
205
177
|
# Truncate the string to +max+ bytes. This can be useful for ensuring that
|
206
178
|
# a UTF-8 string will always fit into a database column with a certain max
|
207
179
|
# byte length. The resulting string may be less than +max+ if the string must
|
208
180
|
# be truncated at a multibyte character boundary.
|
181
|
+
#
|
209
182
|
# @example
|
210
183
|
# "üéøá".to_identifier.truncate_bytes(3) #=> "ü"
|
184
|
+
#
|
185
|
+
# @param max [Integer] The maximum number of bytes.
|
211
186
|
# @return String
|
212
187
|
def truncate_bytes!(max)
|
213
|
-
|
214
|
-
|
215
|
-
new = []
|
216
|
-
unpack("U*").each do |char|
|
217
|
-
break if curr > max
|
218
|
-
char = [char].pack("U")
|
219
|
-
curr += char.bytesize
|
220
|
-
if curr <= max
|
221
|
-
new << char
|
222
|
-
end
|
223
|
-
end
|
224
|
-
@wrapped_string = new.join
|
188
|
+
truncate!(max)
|
189
|
+
chop! until bytesize <= max
|
225
190
|
end
|
226
191
|
|
227
192
|
# Replaces whitespace with dashes ("-").
|
193
|
+
#
|
194
|
+
# @param char [String] the separator character to use.
|
228
195
|
# @return String
|
229
196
|
def with_separators!(char = "-")
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
# Perform UTF-8 sensitive upcasing.
|
234
|
-
# @return String
|
235
|
-
def upcase!
|
236
|
-
@wrapped_string = @@utf8_proxy.upcase(@wrapped_string)
|
237
|
-
end
|
238
|
-
|
239
|
-
# Perform UTF-8 sensitive downcasing.
|
240
|
-
# @return String
|
241
|
-
def downcase!
|
242
|
-
@wrapped_string = @@utf8_proxy.downcase(@wrapped_string)
|
197
|
+
gsub!(/\s/u, char)
|
198
|
+
to_s
|
243
199
|
end
|
244
200
|
|
245
201
|
# Perform Unicode composition on the wrapped string.
|
202
|
+
#
|
246
203
|
# @return String
|
247
204
|
def normalize_utf8!
|
248
|
-
|
205
|
+
unicode_normalize!(:nfc)
|
206
|
+
to_s
|
249
207
|
end
|
250
208
|
|
251
209
|
# Attempt to convert characters encoded using CP1252 and ISO-8859-1 to
|
252
210
|
# UTF-8.
|
253
211
|
# @return String
|
254
212
|
def tidy_bytes!
|
255
|
-
|
213
|
+
scrub! do |bad|
|
214
|
+
bad.encode(Encoding::UTF_8, Encoding::Windows_1252, invalid: :replace, undef: :replace)
|
215
|
+
end
|
216
|
+
to_s
|
256
217
|
end
|
257
218
|
|
258
219
|
%w[transliterate clean downcase word_chars normalize normalize_utf8
|
259
|
-
|
260
|
-
|
261
|
-
class_eval(<<-
|
220
|
+
tidy_bytes to_ascii to_ruby_method truncate truncate_bytes upcase
|
221
|
+
with_separators].each do |method|
|
222
|
+
class_eval(<<-METHOD, __FILE__, __LINE__ + 1)
|
262
223
|
def #{method}(*args)
|
263
224
|
send_to_new_instance(:#{method}!, *args)
|
264
225
|
end
|
265
|
-
|
226
|
+
METHOD
|
266
227
|
end
|
267
228
|
|
268
229
|
def to_identifier
|
@@ -271,7 +232,7 @@ module Babosa
|
|
271
232
|
|
272
233
|
# The default options for {#normalize!}. Override to set your own defaults.
|
273
234
|
def default_normalize_options
|
274
|
-
{:
|
235
|
+
{transliterate: :latin, max_length: 255, separator: "-"}
|
275
236
|
end
|
276
237
|
|
277
238
|
alias approximate_ascii transliterate
|