babosa 0.3.11 → 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Changelog.md +93 -17
- data/lib/babosa.rb +0 -17
- data/lib/babosa/identifier.rb +18 -16
- data/lib/babosa/transliterator/base.rb +16 -3
- data/lib/babosa/transliterator/ukrainian.rb +19 -0
- data/lib/babosa/utf8/active_support_proxy.rb +5 -11
- data/lib/babosa/utf8/dumb_proxy.rb +23 -16
- data/lib/babosa/utf8/java_proxy.rb +1 -1
- data/lib/babosa/utf8/proxy.rb +46 -39
- data/lib/babosa/utf8/unicode_proxy.rb +3 -1
- data/lib/babosa/version.rb +1 -1
- data/spec/babosa_spec.rb +45 -36
- data/spec/spec_helper.rb +8 -14
- data/spec/transliterators/base_spec.rb +3 -3
- data/spec/transliterators/bulgarian_spec.rb +1 -1
- data/spec/transliterators/danish_spec.rb +1 -1
- data/spec/transliterators/german_spec.rb +2 -2
- data/spec/transliterators/greek_spec.rb +1 -1
- data/spec/transliterators/latin_spec.rb +9 -0
- data/spec/transliterators/norwegian_spec.rb +1 -1
- data/spec/transliterators/polish_spec.rb +14 -0
- data/spec/transliterators/romanian_spec.rb +1 -1
- data/spec/transliterators/serbian_spec.rb +1 -1
- data/spec/transliterators/spanish_spec.rb +1 -1
- data/spec/transliterators/swedish_spec.rb +1 -1
- data/spec/transliterators/ukrainian_spec.rb +80 -1
- data/spec/transliterators/vietnamese_spec.rb +1 -1
- data/spec/utf8_proxy_spec.rb +10 -18
- metadata +42 -29
- data/init.rb +0 -3
- data/lib/babosa/candidates.rb +0 -45
- data/lib/babosa/generator.rb +0 -24
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 5f3dfc2a054ed3f64981c0f18a149a9647ef0183
|
4
|
+
data.tar.gz: 3478c2c422839e82866d828cc77ba3bcf79a5117
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f7880005ce37bddd4b9780a3b31489eebdfbbb4e8f9b7d6eacd54cc3e33c36b412ec2fd4b5cc4aea87ea64288240002c9006a909e3ba2692161e775f5e43836f
|
7
|
+
data.tar.gz: c0e7fd6edeb02401dfb34493e5e072de62fdbe3c0333d252ebac5b359830ef966ededc8879077ed00e54a75432db72ec0852c3f4c4edf7e4a2b40c550c3e6245
|
data/Changelog.md
CHANGED
@@ -1,19 +1,95 @@
|
|
1
1
|
# Babosa Changelog
|
2
2
|
|
3
|
-
|
4
|
-
|
5
|
-
*
|
6
|
-
*
|
7
|
-
*
|
8
|
-
*
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
*
|
13
|
-
*
|
14
|
-
*
|
15
|
-
*
|
16
|
-
*
|
17
|
-
*
|
18
|
-
*
|
19
|
-
*
|
3
|
+
## 1.0.1
|
4
|
+
|
5
|
+
* Fix error with tidy_bytes on Rubinius.
|
6
|
+
* Simplify Active Support UTF8 proxy.
|
7
|
+
* Fix `allow_bangs` argument to to_ruby_method being silently ignored.
|
8
|
+
* Raise error when generating an impossible Ruby method name.
|
9
|
+
|
10
|
+
## 1.0.0
|
11
|
+
|
12
|
+
* Adopt semantic versioning.
|
13
|
+
* When using Active Support, require 3.2 or greater.
|
14
|
+
* Require Ruby 2.0 or greater.
|
15
|
+
* Fix Ruby warnings.
|
16
|
+
* Improve support for Ukrainian.
|
17
|
+
* Support some additional punctuation characters used by Chinese and others.
|
18
|
+
* Add Polish spec.
|
19
|
+
* Use native Unicode normalization on Ruby 2.2 in UTF8::DumbProxy.
|
20
|
+
* Invoke Ruby-native upcase/downcase in UTF8::DumbProxy.
|
21
|
+
* Proxy `tidy_bytes` method to Active Support when possible.
|
22
|
+
* Remove SlugString constant.
|
23
|
+
|
24
|
+
## 0.3.11
|
25
|
+
|
26
|
+
* Add support for Vietnamese.
|
27
|
+
|
28
|
+
## 0.3.10
|
29
|
+
|
30
|
+
* Fix Macedonian "S/S". Don't `include JRuby` unnecessarily.
|
31
|
+
|
32
|
+
## 0.3.9
|
33
|
+
|
34
|
+
* Add missing Greek vowels with diaeresis.
|
35
|
+
|
36
|
+
## 0.3.8
|
37
|
+
|
38
|
+
* Correct and improve Macedonian support.
|
39
|
+
|
40
|
+
## 0.3.7
|
41
|
+
|
42
|
+
* Fix compatibility with Ruby 1.8.7.
|
43
|
+
* Add Swedish support.
|
44
|
+
|
45
|
+
## 0.3.6
|
46
|
+
|
47
|
+
* Allow multiple transliterators.
|
48
|
+
* Add Greek support.
|
49
|
+
|
50
|
+
## 0.3.5
|
51
|
+
|
52
|
+
* Don't strip underscores from identifiers.
|
53
|
+
|
54
|
+
## 0.3.4
|
55
|
+
|
56
|
+
* Add Romanian support.
|
57
|
+
|
58
|
+
## 0.3.3
|
59
|
+
|
60
|
+
* Add Norwegian support.
|
61
|
+
|
62
|
+
## 0.3.2
|
63
|
+
|
64
|
+
* Improve Macedonian support.
|
65
|
+
|
66
|
+
## 0.3.1
|
67
|
+
|
68
|
+
* Small fixes to Cyrillic.
|
69
|
+
|
70
|
+
## 0.3.0
|
71
|
+
|
72
|
+
* Cyrillic support.
|
73
|
+
* Improve support for various Unicode spaces and dashes.
|
74
|
+
|
75
|
+
## 0.2.2
|
76
|
+
|
77
|
+
* Fix for "smart" quote handling.
|
78
|
+
|
79
|
+
## 0.2.1
|
80
|
+
|
81
|
+
* Implement #empty? for compatibility with Active Support's #blank?.
|
82
|
+
|
83
|
+
## 0.2.0
|
84
|
+
|
85
|
+
* Add support for Danish.
|
86
|
+
* Add method to generate Ruby identifiers.
|
87
|
+
* Improve performance.
|
88
|
+
|
89
|
+
## 0.1.1
|
90
|
+
|
91
|
+
* Add support for Serbian.
|
92
|
+
|
93
|
+
## 0.1.0
|
94
|
+
|
95
|
+
* Initial extraction from FriendlyId.
|
data/lib/babosa.rb
CHANGED
@@ -9,23 +9,6 @@ class String
|
|
9
9
|
Babosa::Identifier.new self
|
10
10
|
end
|
11
11
|
alias to_slug to_identifier
|
12
|
-
|
13
|
-
# Compatibility with 1.8.6
|
14
|
-
if !public_method_defined? :bytesize
|
15
|
-
def bytesize
|
16
|
-
unpack("C*").length
|
17
|
-
end
|
18
|
-
end
|
19
|
-
|
20
|
-
# Define unless Active Support has already added this method.
|
21
|
-
if !public_method_defined? :classify
|
22
|
-
# Convert from underscores to class name. E.g.:
|
23
|
-
# hello_world => HelloWorld
|
24
|
-
def classify
|
25
|
-
split("_").map {|a| a.gsub(/\b('?[a-z])/) { $1.upcase }}.join
|
26
|
-
end
|
27
|
-
end
|
28
|
-
|
29
12
|
end
|
30
13
|
|
31
14
|
require "babosa/transliterator/base"
|
data/lib/babosa/identifier.rb
CHANGED
@@ -30,6 +30,8 @@ module Babosa
|
|
30
30
|
# @see http://www.utf8-chartable.de/unicode-utf8-table.pl?utf8=dec Unicode character table
|
31
31
|
class Identifier
|
32
32
|
|
33
|
+
Error = Class.new(StandardError)
|
34
|
+
|
33
35
|
attr_reader :wrapped_string
|
34
36
|
alias to_s wrapped_string
|
35
37
|
|
@@ -44,13 +46,13 @@ module Babosa
|
|
44
46
|
end
|
45
47
|
|
46
48
|
# Return the proxy used for UTF-8 support.
|
47
|
-
# @see Babosa::UTF8::
|
49
|
+
# @see Babosa::UTF8::Proxy
|
48
50
|
def self.utf8_proxy
|
49
51
|
@@utf8_proxy
|
50
52
|
end
|
51
53
|
|
52
54
|
# Set a proxy object used for UTF-8 support.
|
53
|
-
# @see Babosa::UTF8::
|
55
|
+
# @see Babosa::UTF8::Proxy
|
54
56
|
def self.utf8_proxy=(obj)
|
55
57
|
@@utf8_proxy = obj
|
56
58
|
end
|
@@ -100,16 +102,17 @@ module Babosa
|
|
100
102
|
# string.transliterate # => "¡Feliz ano!"
|
101
103
|
# string.transliterate :spanish # => "¡Feliz anio!"
|
102
104
|
#
|
103
|
-
#
|
105
|
+
# The approximations are an array, which you can modify if you choose:
|
104
106
|
#
|
105
107
|
# # Make Spanish use "nh" rather than "nn"
|
106
|
-
# Babosa::
|
108
|
+
# Babosa::Transliterator::Spanish::APPROXIMATIONS["ñ"] = "nh"
|
107
109
|
#
|
108
110
|
# Notice that this method does not simply convert to ASCII; if you want
|
109
111
|
# to remove non-ASCII characters such as "¡" and "¿", use {#to_ascii!}:
|
110
112
|
#
|
111
113
|
# string.transliterate!(:spanish) # => "¡Feliz anio!"
|
112
114
|
# string.transliterate! # => "¡Feliz anio!"
|
115
|
+
#
|
113
116
|
# @param *args <Symbol>
|
114
117
|
# @return String
|
115
118
|
def transliterate!(*kinds)
|
@@ -142,13 +145,8 @@ module Babosa
|
|
142
145
|
# @param Options
|
143
146
|
# @return String
|
144
147
|
def normalize!(options = nil)
|
145
|
-
|
146
|
-
|
147
|
-
warn "#normalize! now takes a hash of options rather than a boolean"
|
148
|
-
options = default_normalize_options.merge(:to_ascii => true)
|
149
|
-
else
|
150
|
-
options = default_normalize_options.merge(options || {})
|
151
|
-
end
|
148
|
+
options = default_normalize_options.merge(options || {})
|
149
|
+
|
152
150
|
if translit_option = options[:transliterate]
|
153
151
|
if translit_option != true
|
154
152
|
transliterate!(*translit_option)
|
@@ -168,10 +166,14 @@ module Babosa
|
|
168
166
|
# Normalize a string so that it can safely be used as a Ruby method name.
|
169
167
|
def to_ruby_method!(allow_bangs = true)
|
170
168
|
leader, trailer = @wrapped_string.strip.scan(/\A(.+)(.)\z/).flatten
|
169
|
+
leader = leader.to_s
|
170
|
+
trailer = trailer.to_s
|
171
171
|
if allow_bangs
|
172
|
-
trailer.downcase
|
172
|
+
trailer.downcase!
|
173
|
+
trailer.gsub!(/[^a-z0-9!=\\?]/, '')
|
173
174
|
else
|
174
|
-
trailer.downcase
|
175
|
+
trailer.downcase!
|
176
|
+
trailer.gsub!(/[^a-z0-9]/, '')
|
175
177
|
end
|
176
178
|
id = leader.to_identifier
|
177
179
|
id.transliterate!
|
@@ -180,6 +182,9 @@ module Babosa
|
|
180
182
|
id.word_chars!
|
181
183
|
id.clean!
|
182
184
|
@wrapped_string = id.to_s + trailer
|
185
|
+
if @wrapped_string == ""
|
186
|
+
raise Error, "Input generates impossible Ruby method name"
|
187
|
+
end
|
183
188
|
with_separators!("_")
|
184
189
|
end
|
185
190
|
|
@@ -285,7 +290,4 @@ module Babosa
|
|
285
290
|
id
|
286
291
|
end
|
287
292
|
end
|
288
|
-
|
289
|
-
# Identifier is aliased as SlugString to support older versions of FriendlyId.
|
290
|
-
SlugString = Identifier
|
291
293
|
end
|
@@ -23,11 +23,11 @@ module Babosa
|
|
23
23
|
autoload :Vietnamese, "babosa/transliterator/vietnamese"
|
24
24
|
|
25
25
|
def self.get(symbol)
|
26
|
-
|
26
|
+
class_name = symbol.to_s.split("_").map {|a| a.gsub(/\b('?[a-z])/) { $1.upcase }}.join
|
27
|
+
const_get(class_name)
|
27
28
|
end
|
28
29
|
|
29
30
|
class Base
|
30
|
-
|
31
31
|
include Singleton
|
32
32
|
|
33
33
|
APPROXIMATIONS = {
|
@@ -39,7 +39,6 @@ module Babosa
|
|
39
39
|
"–" => "-",
|
40
40
|
"—" => "-",
|
41
41
|
"―" => "-",
|
42
|
-
"―" => "-",
|
43
42
|
"‘" => "'",
|
44
43
|
"‛" => "'",
|
45
44
|
"“" => '"',
|
@@ -47,6 +46,19 @@ module Babosa
|
|
47
46
|
"„" => '"',
|
48
47
|
"‟" => '"',
|
49
48
|
'’' => "'",
|
49
|
+
',' => ",",
|
50
|
+
'。' => ".",
|
51
|
+
'!' => "!",
|
52
|
+
'?' => '?',
|
53
|
+
'、' => ',',
|
54
|
+
'(' => '(',
|
55
|
+
')' => ')',
|
56
|
+
'【' => '[',
|
57
|
+
'】' => ']',
|
58
|
+
';' => ';',
|
59
|
+
':' => ':',
|
60
|
+
'《' => '<',
|
61
|
+
'》' => '>',
|
50
62
|
# various kinds of space characters
|
51
63
|
"\xc2\xa0" => " ",
|
52
64
|
"\xe2\x80\x80" => " ",
|
@@ -87,6 +99,7 @@ module Babosa
|
|
87
99
|
@approximations[codepoint]
|
88
100
|
end
|
89
101
|
|
102
|
+
# Transliterates a string.
|
90
103
|
def transliterate(string)
|
91
104
|
string.unpack("U*").map {|char| self[char] || char}.flatten.pack("U*")
|
92
105
|
end
|
@@ -3,8 +3,27 @@ module Babosa
|
|
3
3
|
module Transliterator
|
4
4
|
class Ukrainian < Cyrillic
|
5
5
|
APPROXIMATIONS = {
|
6
|
+
"Г" => "H",
|
7
|
+
"г" => "h",
|
8
|
+
"Ґ" => "G",
|
9
|
+
"ґ" => "g",
|
10
|
+
"є" => "ie",
|
6
11
|
"И" => "Y",
|
7
12
|
"и" => "y",
|
13
|
+
"І" => "I",
|
14
|
+
"і" => "i",
|
15
|
+
"ї" => "i",
|
16
|
+
"Й" => "Y",
|
17
|
+
"й" => "i",
|
18
|
+
"Х" => "Kh",
|
19
|
+
"х" => "kh",
|
20
|
+
"Ц" => "Ts",
|
21
|
+
"ц" => 'ts',
|
22
|
+
"Щ" => "Shch",
|
23
|
+
"щ" => "shch",
|
24
|
+
"ю" => "iu",
|
25
|
+
"я" => "ia",
|
26
|
+
"'" => ""
|
8
27
|
}
|
9
28
|
end
|
10
29
|
end
|
@@ -1,19 +1,13 @@
|
|
1
|
+
require 'active_support/multibyte/unicode'
|
2
|
+
|
1
3
|
module Babosa
|
2
4
|
module UTF8
|
3
5
|
# A UTF-8 proxy using Active Support's multibyte support.
|
4
6
|
module ActiveSupportProxy
|
5
|
-
extend
|
6
|
-
extend self
|
7
|
-
def downcase(string)
|
8
|
-
ActiveSupport::Multibyte::Chars.new(string).downcase.to_s
|
9
|
-
end
|
10
|
-
|
11
|
-
def upcase(string)
|
12
|
-
ActiveSupport::Multibyte::Chars.new(string).upcase.to_s
|
13
|
-
end
|
7
|
+
extend ActiveSupport::Multibyte::Unicode
|
14
8
|
|
15
|
-
def normalize_utf8(string)
|
16
|
-
|
9
|
+
def self.normalize_utf8(string)
|
10
|
+
normalize(string, :c)
|
17
11
|
end
|
18
12
|
end
|
19
13
|
end
|
@@ -10,32 +10,39 @@ module Babosa
|
|
10
10
|
# or ActiveSupport should be used instead because they support the full
|
11
11
|
# UTF-8 character range.
|
12
12
|
module DumbProxy
|
13
|
-
extend
|
13
|
+
extend Proxy
|
14
14
|
extend self
|
15
15
|
|
16
16
|
def downcase(string)
|
17
|
-
string.unpack("U*").map {|char| Mappings::DOWNCASE[char] or char}.flatten.pack("U*")
|
17
|
+
string.downcase.unpack("U*").map {|char| Mappings::DOWNCASE[char] or char}.flatten.pack("U*")
|
18
18
|
end
|
19
19
|
|
20
20
|
def upcase(string)
|
21
|
-
string.unpack("U*").map {|char| Mappings::UPCASE[char] or char}.flatten.pack("U*")
|
21
|
+
string.upcase.unpack("U*").map {|char| Mappings::UPCASE[char] or char}.flatten.pack("U*")
|
22
22
|
end
|
23
23
|
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
24
|
+
if ''.respond_to?(:unicode_normalize)
|
25
|
+
def normalize_utf8(string)
|
26
|
+
string.unicode_normalize
|
27
|
+
end
|
28
|
+
else
|
29
|
+
# On Ruby 2.2, this uses the native Unicode normalize method. On all
|
30
|
+
# other Rubies, it does a very naive Unicode normalization, which should
|
31
|
+
# work for this library's purposes (i.e., Roman-based codepoints, up to
|
32
|
+
# U+017E). Do not reuse this as a general solution! Use a real
|
33
|
+
# library like Unicode or ActiveSupport instead.
|
34
|
+
def normalize_utf8(string)
|
35
|
+
codepoints = string.unpack("U*")
|
36
|
+
new = []
|
37
|
+
until codepoints.empty? do
|
38
|
+
if Mappings::COMPOSITION[codepoints[0..1]]
|
39
|
+
new << Mappings::COMPOSITION[codepoints.slice!(0,2)]
|
40
|
+
else
|
41
|
+
new << codepoints.shift
|
42
|
+
end
|
36
43
|
end
|
44
|
+
new.compact.flatten.pack("U*")
|
37
45
|
end
|
38
|
-
new.compact.flatten.pack("U*")
|
39
46
|
end
|
40
47
|
end
|
41
48
|
end
|
data/lib/babosa/utf8/proxy.rb
CHANGED
@@ -8,7 +8,7 @@ module Babosa
|
|
8
8
|
|
9
9
|
# A UTF-8 proxy for Babosa can be any object which responds to the methods in this module.
|
10
10
|
# The following proxies are provided by Babosa: {ActiveSupportProxy}, {DumbProxy}, {JavaProxy}, and {UnicodeProxy}.
|
11
|
-
module
|
11
|
+
module Proxy
|
12
12
|
CP1252 = {
|
13
13
|
128 => [226, 130, 172],
|
14
14
|
129 => nil,
|
@@ -62,50 +62,57 @@ module Babosa
|
|
62
62
|
raise NotImplementedError
|
63
63
|
end
|
64
64
|
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
65
|
+
if ''.respond_to?(:scrub) && !defined?(Rubinius)
|
66
|
+
# Attempt to replace invalid UTF-8 bytes with valid ones. This method
|
67
|
+
# naively assumes if you have invalid UTF8 bytes, they are either Windows
|
68
|
+
# CP-1252 or ISO8859-1. In practice this isn't a bad assumption, but may not
|
69
|
+
# always work.
|
70
|
+
def tidy_bytes(string)
|
71
|
+
string.scrub do |bad|
|
72
|
+
tidy_byte(*bad.bytes).flatten.compact.pack('C*').unpack('U*').pack('U*')
|
73
|
+
end
|
74
|
+
end
|
75
|
+
else
|
76
|
+
def tidy_bytes(string)
|
77
|
+
bytes = string.unpack("C*")
|
78
|
+
conts_expected = 0
|
79
|
+
last_lead = 0
|
73
80
|
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
is_restricted = byte > 244
|
81
|
+
bytes.each_index do |i|
|
82
|
+
byte = bytes[i]
|
83
|
+
is_cont = byte > 127 && byte < 192
|
84
|
+
is_lead = byte > 191 && byte < 245
|
85
|
+
is_unused = byte > 240
|
86
|
+
is_restricted = byte > 244
|
81
87
|
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
88
|
+
# Impossible or highly unlikely byte? Clean it.
|
89
|
+
if is_unused || is_restricted
|
90
|
+
bytes[i] = tidy_byte(byte)
|
91
|
+
elsif is_cont
|
92
|
+
# Not expecting continuation byte? Clean up. Otherwise, now expect one less.
|
93
|
+
conts_expected == 0 ? bytes[i] = tidy_byte(byte) : conts_expected -= 1
|
94
|
+
else
|
95
|
+
if conts_expected > 0
|
96
|
+
# Expected continuation, but got ASCII or leading? Clean backwards up to
|
97
|
+
# the leading byte.
|
98
|
+
(1..(i - last_lead)).each {|j| bytes[i - j] = tidy_byte(bytes[i - j])}
|
99
|
+
conts_expected = 0
|
100
|
+
end
|
101
|
+
if is_lead
|
102
|
+
# Final byte is leading? Clean it.
|
103
|
+
if i == bytes.length - 1
|
104
|
+
bytes[i] = tidy_byte(bytes.last)
|
105
|
+
else
|
106
|
+
# Valid leading byte? Expect continuations determined by position of
|
107
|
+
# first zero bit, with max of 3.
|
108
|
+
conts_expected = byte < 224 ? 1 : byte < 240 ? 2 : 3
|
109
|
+
last_lead = i
|
110
|
+
end
|
104
111
|
end
|
105
112
|
end
|
106
113
|
end
|
114
|
+
bytes.empty? ? "" : bytes.flatten.compact.pack("C*").unpack("U*").pack("U*")
|
107
115
|
end
|
108
|
-
bytes.empty? ? "" : bytes.flatten.compact.pack("C*").unpack("U*").pack("U*")
|
109
116
|
end
|
110
117
|
|
111
118
|
private
|