babosa 0.3.11 → 1.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Changelog.md +93 -17
- data/lib/babosa.rb +0 -17
- data/lib/babosa/identifier.rb +18 -16
- data/lib/babosa/transliterator/base.rb +16 -3
- data/lib/babosa/transliterator/ukrainian.rb +19 -0
- data/lib/babosa/utf8/active_support_proxy.rb +5 -11
- data/lib/babosa/utf8/dumb_proxy.rb +23 -16
- data/lib/babosa/utf8/java_proxy.rb +1 -1
- data/lib/babosa/utf8/proxy.rb +46 -39
- data/lib/babosa/utf8/unicode_proxy.rb +3 -1
- data/lib/babosa/version.rb +1 -1
- data/spec/babosa_spec.rb +45 -36
- data/spec/spec_helper.rb +8 -14
- data/spec/transliterators/base_spec.rb +3 -3
- data/spec/transliterators/bulgarian_spec.rb +1 -1
- data/spec/transliterators/danish_spec.rb +1 -1
- data/spec/transliterators/german_spec.rb +2 -2
- data/spec/transliterators/greek_spec.rb +1 -1
- data/spec/transliterators/latin_spec.rb +9 -0
- data/spec/transliterators/norwegian_spec.rb +1 -1
- data/spec/transliterators/polish_spec.rb +14 -0
- data/spec/transliterators/romanian_spec.rb +1 -1
- data/spec/transliterators/serbian_spec.rb +1 -1
- data/spec/transliterators/spanish_spec.rb +1 -1
- data/spec/transliterators/swedish_spec.rb +1 -1
- data/spec/transliterators/ukrainian_spec.rb +80 -1
- data/spec/transliterators/vietnamese_spec.rb +1 -1
- data/spec/utf8_proxy_spec.rb +10 -18
- metadata +42 -29
- data/init.rb +0 -3
- data/lib/babosa/candidates.rb +0 -45
- data/lib/babosa/generator.rb +0 -24
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 5f3dfc2a054ed3f64981c0f18a149a9647ef0183
|
4
|
+
data.tar.gz: 3478c2c422839e82866d828cc77ba3bcf79a5117
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f7880005ce37bddd4b9780a3b31489eebdfbbb4e8f9b7d6eacd54cc3e33c36b412ec2fd4b5cc4aea87ea64288240002c9006a909e3ba2692161e775f5e43836f
|
7
|
+
data.tar.gz: c0e7fd6edeb02401dfb34493e5e072de62fdbe3c0333d252ebac5b359830ef966ededc8879077ed00e54a75432db72ec0852c3f4c4edf7e4a2b40c550c3e6245
|
data/Changelog.md
CHANGED
@@ -1,19 +1,95 @@
|
|
1
1
|
# Babosa Changelog
|
2
2
|
|
3
|
-
|
4
|
-
|
5
|
-
*
|
6
|
-
*
|
7
|
-
*
|
8
|
-
*
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
*
|
13
|
-
*
|
14
|
-
*
|
15
|
-
*
|
16
|
-
*
|
17
|
-
*
|
18
|
-
*
|
19
|
-
*
|
3
|
+
## 1.0.1
|
4
|
+
|
5
|
+
* Fix error with tidy_bytes on Rubinius.
|
6
|
+
* Simplify Active Support UTF8 proxy.
|
7
|
+
* Fix `allow_bangs` argument to to_ruby_method being silently ignored.
|
8
|
+
* Raise error when generating an impossible Ruby method name.
|
9
|
+
|
10
|
+
## 1.0.0
|
11
|
+
|
12
|
+
* Adopt semantic versioning.
|
13
|
+
* When using Active Support, require 3.2 or greater.
|
14
|
+
* Require Ruby 2.0 or greater.
|
15
|
+
* Fix Ruby warnings.
|
16
|
+
* Improve support for Ukrainian.
|
17
|
+
* Support some additional punctuation characters used by Chinese and others.
|
18
|
+
* Add Polish spec.
|
19
|
+
* Use native Unicode normalization on Ruby 2.2 in UTF8::DumbProxy.
|
20
|
+
* Invoke Ruby-native upcase/downcase in UTF8::DumbProxy.
|
21
|
+
* Proxy `tidy_bytes` method to Active Support when possible.
|
22
|
+
* Remove SlugString constant.
|
23
|
+
|
24
|
+
## 0.3.11
|
25
|
+
|
26
|
+
* Add support for Vietnamese.
|
27
|
+
|
28
|
+
## 0.3.10
|
29
|
+
|
30
|
+
* Fix Macedonian "S/S". Don't `include JRuby` unnecessarily.
|
31
|
+
|
32
|
+
## 0.3.9
|
33
|
+
|
34
|
+
* Add missing Greek vowels with diaeresis.
|
35
|
+
|
36
|
+
## 0.3.8
|
37
|
+
|
38
|
+
* Correct and improve Macedonian support.
|
39
|
+
|
40
|
+
## 0.3.7
|
41
|
+
|
42
|
+
* Fix compatibility with Ruby 1.8.7.
|
43
|
+
* Add Swedish support.
|
44
|
+
|
45
|
+
## 0.3.6
|
46
|
+
|
47
|
+
* Allow multiple transliterators.
|
48
|
+
* Add Greek support.
|
49
|
+
|
50
|
+
## 0.3.5
|
51
|
+
|
52
|
+
* Don't strip underscores from identifiers.
|
53
|
+
|
54
|
+
## 0.3.4
|
55
|
+
|
56
|
+
* Add Romanian support.
|
57
|
+
|
58
|
+
## 0.3.3
|
59
|
+
|
60
|
+
* Add Norwegian support.
|
61
|
+
|
62
|
+
## 0.3.2
|
63
|
+
|
64
|
+
* Improve Macedonian support.
|
65
|
+
|
66
|
+
## 0.3.1
|
67
|
+
|
68
|
+
* Small fixes to Cyrillic.
|
69
|
+
|
70
|
+
## 0.3.0
|
71
|
+
|
72
|
+
* Cyrillic support.
|
73
|
+
* Improve support for various Unicode spaces and dashes.
|
74
|
+
|
75
|
+
## 0.2.2
|
76
|
+
|
77
|
+
* Fix for "smart" quote handling.
|
78
|
+
|
79
|
+
## 0.2.1
|
80
|
+
|
81
|
+
* Implement #empty? for compatibility with Active Support's #blank?.
|
82
|
+
|
83
|
+
## 0.2.0
|
84
|
+
|
85
|
+
* Add support for Danish.
|
86
|
+
* Add method to generate Ruby identifiers.
|
87
|
+
* Improve performance.
|
88
|
+
|
89
|
+
## 0.1.1
|
90
|
+
|
91
|
+
* Add support for Serbian.
|
92
|
+
|
93
|
+
## 0.1.0
|
94
|
+
|
95
|
+
* Initial extraction from FriendlyId.
|
data/lib/babosa.rb
CHANGED
@@ -9,23 +9,6 @@ class String
|
|
9
9
|
Babosa::Identifier.new self
|
10
10
|
end
|
11
11
|
alias to_slug to_identifier
|
12
|
-
|
13
|
-
# Compatibility with 1.8.6
|
14
|
-
if !public_method_defined? :bytesize
|
15
|
-
def bytesize
|
16
|
-
unpack("C*").length
|
17
|
-
end
|
18
|
-
end
|
19
|
-
|
20
|
-
# Define unless Active Support has already added this method.
|
21
|
-
if !public_method_defined? :classify
|
22
|
-
# Convert from underscores to class name. E.g.:
|
23
|
-
# hello_world => HelloWorld
|
24
|
-
def classify
|
25
|
-
split("_").map {|a| a.gsub(/\b('?[a-z])/) { $1.upcase }}.join
|
26
|
-
end
|
27
|
-
end
|
28
|
-
|
29
12
|
end
|
30
13
|
|
31
14
|
require "babosa/transliterator/base"
|
data/lib/babosa/identifier.rb
CHANGED
@@ -30,6 +30,8 @@ module Babosa
|
|
30
30
|
# @see http://www.utf8-chartable.de/unicode-utf8-table.pl?utf8=dec Unicode character table
|
31
31
|
class Identifier
|
32
32
|
|
33
|
+
Error = Class.new(StandardError)
|
34
|
+
|
33
35
|
attr_reader :wrapped_string
|
34
36
|
alias to_s wrapped_string
|
35
37
|
|
@@ -44,13 +46,13 @@ module Babosa
|
|
44
46
|
end
|
45
47
|
|
46
48
|
# Return the proxy used for UTF-8 support.
|
47
|
-
# @see Babosa::UTF8::
|
49
|
+
# @see Babosa::UTF8::Proxy
|
48
50
|
def self.utf8_proxy
|
49
51
|
@@utf8_proxy
|
50
52
|
end
|
51
53
|
|
52
54
|
# Set a proxy object used for UTF-8 support.
|
53
|
-
# @see Babosa::UTF8::
|
55
|
+
# @see Babosa::UTF8::Proxy
|
54
56
|
def self.utf8_proxy=(obj)
|
55
57
|
@@utf8_proxy = obj
|
56
58
|
end
|
@@ -100,16 +102,17 @@ module Babosa
|
|
100
102
|
# string.transliterate # => "¡Feliz ano!"
|
101
103
|
# string.transliterate :spanish # => "¡Feliz anio!"
|
102
104
|
#
|
103
|
-
#
|
105
|
+
# The approximations are an array, which you can modify if you choose:
|
104
106
|
#
|
105
107
|
# # Make Spanish use "nh" rather than "nn"
|
106
|
-
# Babosa::
|
108
|
+
# Babosa::Transliterator::Spanish::APPROXIMATIONS["ñ"] = "nh"
|
107
109
|
#
|
108
110
|
# Notice that this method does not simply convert to ASCII; if you want
|
109
111
|
# to remove non-ASCII characters such as "¡" and "¿", use {#to_ascii!}:
|
110
112
|
#
|
111
113
|
# string.transliterate!(:spanish) # => "¡Feliz anio!"
|
112
114
|
# string.transliterate! # => "¡Feliz anio!"
|
115
|
+
#
|
113
116
|
# @param *kinds <Symbol>
|
114
117
|
# @return String
|
115
118
|
def transliterate!(*kinds)
|
@@ -142,13 +145,8 @@ module Babosa
|
|
142
145
|
# @param Options
|
143
146
|
# @return String
|
144
147
|
def normalize!(options = nil)
|
145
|
-
|
146
|
-
|
147
|
-
warn "#normalize! now takes a hash of options rather than a boolean"
|
148
|
-
options = default_normalize_options.merge(:to_ascii => true)
|
149
|
-
else
|
150
|
-
options = default_normalize_options.merge(options || {})
|
151
|
-
end
|
148
|
+
options = default_normalize_options.merge(options || {})
|
149
|
+
|
152
150
|
if translit_option = options[:transliterate]
|
153
151
|
if translit_option != true
|
154
152
|
transliterate!(*translit_option)
|
@@ -168,10 +166,14 @@ module Babosa
|
|
168
166
|
# Normalize a string so that it can safely be used as a Ruby method name.
|
169
167
|
def to_ruby_method!(allow_bangs = true)
|
170
168
|
leader, trailer = @wrapped_string.strip.scan(/\A(.+)(.)\z/).flatten
|
169
|
+
leader = leader.to_s
|
170
|
+
trailer = trailer.to_s
|
171
171
|
if allow_bangs
|
172
|
-
trailer.downcase
|
172
|
+
trailer.downcase!
|
173
|
+
trailer.gsub!(/[^a-z0-9!=\\?]/, '')
|
173
174
|
else
|
174
|
-
trailer.downcase
|
175
|
+
trailer.downcase!
|
176
|
+
trailer.gsub!(/[^a-z0-9]/, '')
|
175
177
|
end
|
176
178
|
id = leader.to_identifier
|
177
179
|
id.transliterate!
|
@@ -180,6 +182,9 @@ module Babosa
|
|
180
182
|
id.word_chars!
|
181
183
|
id.clean!
|
182
184
|
@wrapped_string = id.to_s + trailer
|
185
|
+
if @wrapped_string == ""
|
186
|
+
raise Error, "Input generates impossible Ruby method name"
|
187
|
+
end
|
183
188
|
with_separators!("_")
|
184
189
|
end
|
185
190
|
|
@@ -285,7 +290,4 @@ module Babosa
|
|
285
290
|
id
|
286
291
|
end
|
287
292
|
end
|
288
|
-
|
289
|
-
# Identifier is aliased as SlugString to support older versions of FriendlyId.
|
290
|
-
SlugString = Identifier
|
291
293
|
end
|
@@ -23,11 +23,11 @@ module Babosa
|
|
23
23
|
autoload :Vietnamese, "babosa/transliterator/vietnamese"
|
24
24
|
|
25
25
|
def self.get(symbol)
|
26
|
-
|
26
|
+
class_name = symbol.to_s.split("_").map {|a| a.gsub(/\b('?[a-z])/) { $1.upcase }}.join
|
27
|
+
const_get(class_name)
|
27
28
|
end
|
28
29
|
|
29
30
|
class Base
|
30
|
-
|
31
31
|
include Singleton
|
32
32
|
|
33
33
|
APPROXIMATIONS = {
|
@@ -39,7 +39,6 @@ module Babosa
|
|
39
39
|
"–" => "-",
|
40
40
|
"—" => "-",
|
41
41
|
"―" => "-",
|
42
|
-
"―" => "-",
|
43
42
|
"‘" => "'",
|
44
43
|
"‛" => "'",
|
45
44
|
"“" => '"',
|
@@ -47,6 +46,19 @@ module Babosa
|
|
47
46
|
"„" => '"',
|
48
47
|
"‟" => '"',
|
49
48
|
'’' => "'",
|
49
|
+
',' => ",",
|
50
|
+
'。' => ".",
|
51
|
+
'!' => "!",
|
52
|
+
'?' => '?',
|
53
|
+
'、' => ',',
|
54
|
+
'(' => '(',
|
55
|
+
')' => ')',
|
56
|
+
'【' => '[',
|
57
|
+
'】' => ']',
|
58
|
+
';' => ';',
|
59
|
+
':' => ':',
|
60
|
+
'《' => '<',
|
61
|
+
'》' => '>',
|
50
62
|
# various kinds of space characters
|
51
63
|
"\xc2\xa0" => " ",
|
52
64
|
"\xe2\x80\x80" => " ",
|
@@ -87,6 +99,7 @@ module Babosa
|
|
87
99
|
@approximations[codepoint]
|
88
100
|
end
|
89
101
|
|
102
|
+
# Transliterates a string.
|
90
103
|
def transliterate(string)
|
91
104
|
string.unpack("U*").map {|char| self[char] || char}.flatten.pack("U*")
|
92
105
|
end
|
@@ -3,8 +3,27 @@ module Babosa
|
|
3
3
|
module Transliterator
|
4
4
|
class Ukrainian < Cyrillic
|
5
5
|
APPROXIMATIONS = {
|
6
|
+
"Г" => "H",
|
7
|
+
"г" => "h",
|
8
|
+
"Ґ" => "G",
|
9
|
+
"ґ" => "g",
|
10
|
+
"є" => "ie",
|
6
11
|
"И" => "Y",
|
7
12
|
"и" => "y",
|
13
|
+
"І" => "I",
|
14
|
+
"і" => "i",
|
15
|
+
"ї" => "i",
|
16
|
+
"Й" => "Y",
|
17
|
+
"й" => "i",
|
18
|
+
"Х" => "Kh",
|
19
|
+
"х" => "kh",
|
20
|
+
"Ц" => "Ts",
|
21
|
+
"ц" => 'ts',
|
22
|
+
"Щ" => "Shch",
|
23
|
+
"щ" => "shch",
|
24
|
+
"ю" => "iu",
|
25
|
+
"я" => "ia",
|
26
|
+
"'" => ""
|
8
27
|
}
|
9
28
|
end
|
10
29
|
end
|
@@ -1,19 +1,13 @@
|
|
1
|
+
require 'active_support/multibyte/unicode'
|
2
|
+
|
1
3
|
module Babosa
|
2
4
|
module UTF8
|
3
5
|
# A UTF-8 proxy using Active Support's multibyte support.
|
4
6
|
module ActiveSupportProxy
|
5
|
-
extend
|
6
|
-
extend self
|
7
|
-
def downcase(string)
|
8
|
-
ActiveSupport::Multibyte::Chars.new(string).downcase.to_s
|
9
|
-
end
|
10
|
-
|
11
|
-
def upcase(string)
|
12
|
-
ActiveSupport::Multibyte::Chars.new(string).upcase.to_s
|
13
|
-
end
|
7
|
+
extend ActiveSupport::Multibyte::Unicode
|
14
8
|
|
15
|
-
def normalize_utf8(string)
|
16
|
-
|
9
|
+
def self.normalize_utf8(string)
|
10
|
+
normalize(string, :c)
|
17
11
|
end
|
18
12
|
end
|
19
13
|
end
|
@@ -10,32 +10,39 @@ module Babosa
|
|
10
10
|
# or ActiveSupport should be used instead because they support the full
|
11
11
|
# UTF-8 character range.
|
12
12
|
module DumbProxy
|
13
|
-
extend
|
13
|
+
extend Proxy
|
14
14
|
extend self
|
15
15
|
|
16
16
|
def downcase(string)
|
17
|
-
string.unpack("U*").map {|char| Mappings::DOWNCASE[char] or char}.flatten.pack("U*")
|
17
|
+
string.downcase.unpack("U*").map {|char| Mappings::DOWNCASE[char] or char}.flatten.pack("U*")
|
18
18
|
end
|
19
19
|
|
20
20
|
def upcase(string)
|
21
|
-
string.unpack("U*").map {|char| Mappings::UPCASE[char] or char}.flatten.pack("U*")
|
21
|
+
string.upcase.unpack("U*").map {|char| Mappings::UPCASE[char] or char}.flatten.pack("U*")
|
22
22
|
end
|
23
23
|
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
24
|
+
if ''.respond_to?(:unicode_normalize)
|
25
|
+
def normalize_utf8(string)
|
26
|
+
string.unicode_normalize
|
27
|
+
end
|
28
|
+
else
|
29
|
+
# On Ruby 2.2, this uses the native Unicode normalize method. On all
|
30
|
+
# other Rubies, it does a very naive Unicode normalization, which should
|
31
|
+
# work for this library's purposes (i.e., Roman-based codepoints, up to
|
32
|
+
# U+017E). Do not reuse this as a general solution! Use a real
|
33
|
+
# library like Unicode or ActiveSupport instead.
|
34
|
+
def normalize_utf8(string)
|
35
|
+
codepoints = string.unpack("U*")
|
36
|
+
new = []
|
37
|
+
until codepoints.empty? do
|
38
|
+
if Mappings::COMPOSITION[codepoints[0..1]]
|
39
|
+
new << Mappings::COMPOSITION[codepoints.slice!(0,2)]
|
40
|
+
else
|
41
|
+
new << codepoints.shift
|
42
|
+
end
|
36
43
|
end
|
44
|
+
new.compact.flatten.pack("U*")
|
37
45
|
end
|
38
|
-
new.compact.flatten.pack("U*")
|
39
46
|
end
|
40
47
|
end
|
41
48
|
end
|
data/lib/babosa/utf8/proxy.rb
CHANGED
@@ -8,7 +8,7 @@ module Babosa
|
|
8
8
|
|
9
9
|
# A UTF-8 proxy for Babosa can be any object which responds to the methods in this module.
|
10
10
|
# The following proxies are provided by Babosa: {ActiveSupportProxy}, {DumbProxy}, {JavaProxy}, and {UnicodeProxy}.
|
11
|
-
module
|
11
|
+
module Proxy
|
12
12
|
CP1252 = {
|
13
13
|
128 => [226, 130, 172],
|
14
14
|
129 => nil,
|
@@ -62,50 +62,57 @@ module Babosa
|
|
62
62
|
raise NotImplementedError
|
63
63
|
end
|
64
64
|
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
65
|
+
if ''.respond_to?(:scrub) && !defined?(Rubinius)
|
66
|
+
# Attempt to replace invalid UTF-8 bytes with valid ones. This method
|
67
|
+
# naively assumes if you have invalid UTF8 bytes, they are either Windows
|
68
|
+
# CP-1252 or ISO8859-1. In practice this isn't a bad assumption, but may not
|
69
|
+
# always work.
|
70
|
+
def tidy_bytes(string)
|
71
|
+
string.scrub do |bad|
|
72
|
+
tidy_byte(*bad.bytes).flatten.compact.pack('C*').unpack('U*').pack('U*')
|
73
|
+
end
|
74
|
+
end
|
75
|
+
else
|
76
|
+
def tidy_bytes(string)
|
77
|
+
bytes = string.unpack("C*")
|
78
|
+
conts_expected = 0
|
79
|
+
last_lead = 0
|
73
80
|
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
is_restricted = byte > 244
|
81
|
+
bytes.each_index do |i|
|
82
|
+
byte = bytes[i]
|
83
|
+
is_cont = byte > 127 && byte < 192
|
84
|
+
is_lead = byte > 191 && byte < 245
|
85
|
+
is_unused = byte > 240
|
86
|
+
is_restricted = byte > 244
|
81
87
|
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
88
|
+
# Impossible or highly unlikely byte? Clean it.
|
89
|
+
if is_unused || is_restricted
|
90
|
+
bytes[i] = tidy_byte(byte)
|
91
|
+
elsif is_cont
|
92
|
+
# Not expecting continuation byte? Clean up. Otherwise, now expect one less.
|
93
|
+
conts_expected == 0 ? bytes[i] = tidy_byte(byte) : conts_expected -= 1
|
94
|
+
else
|
95
|
+
if conts_expected > 0
|
96
|
+
# Expected continuation, but got ASCII or leading? Clean backwards up to
|
97
|
+
# the leading byte.
|
98
|
+
(1..(i - last_lead)).each {|j| bytes[i - j] = tidy_byte(bytes[i - j])}
|
99
|
+
conts_expected = 0
|
100
|
+
end
|
101
|
+
if is_lead
|
102
|
+
# Final byte is leading? Clean it.
|
103
|
+
if i == bytes.length - 1
|
104
|
+
bytes[i] = tidy_byte(bytes.last)
|
105
|
+
else
|
106
|
+
# Valid leading byte? Expect continuations determined by position of
|
107
|
+
# first zero bit, with max of 3.
|
108
|
+
conts_expected = byte < 224 ? 1 : byte < 240 ? 2 : 3
|
109
|
+
last_lead = i
|
110
|
+
end
|
104
111
|
end
|
105
112
|
end
|
106
113
|
end
|
114
|
+
bytes.empty? ? "" : bytes.flatten.compact.pack("C*").unpack("U*").pack("U*")
|
107
115
|
end
|
108
|
-
bytes.empty? ? "" : bytes.flatten.compact.pack("C*").unpack("U*").pack("U*")
|
109
116
|
end
|
110
117
|
|
111
118
|
private
|