babosa 0.3.11 → 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 2ee04fad8c458a32dea08b5f6483d817d359dab4
4
- data.tar.gz: 22385c9ae0e279fc6531a6ad6e9851ec3d4f4e81
3
+ metadata.gz: 5f3dfc2a054ed3f64981c0f18a149a9647ef0183
4
+ data.tar.gz: 3478c2c422839e82866d828cc77ba3bcf79a5117
5
5
  SHA512:
6
- metadata.gz: ad5f5a7e2bbfd63ab2e0a89878b54dc52a96f6929f68dbb94070d86e7b0515d7f3fb0e6e30d4e3152320a07fe7684833f23135569e7200e4aad0f9930bb3b261
7
- data.tar.gz: 95e010b9b9c5138af14454258f332a0e32e735354ad49a65ed4f22514110e159909e4a4e714a06443a02f4d9f64edfe00ac2cf48f10036488beb2f4a4abde977
6
+ metadata.gz: f7880005ce37bddd4b9780a3b31489eebdfbbb4e8f9b7d6eacd54cc3e33c36b412ec2fd4b5cc4aea87ea64288240002c9006a909e3ba2692161e775f5e43836f
7
+ data.tar.gz: c0e7fd6edeb02401dfb34493e5e072de62fdbe3c0333d252ebac5b359830ef966ededc8879077ed00e54a75432db72ec0852c3f4c4edf7e4a2b40c550c3e6245
@@ -1,19 +1,95 @@
1
1
  # Babosa Changelog
2
2
 
3
- * 0.3.11 - Added support for Vietnamese
4
- * 0.3.10 - Fixed Macedonian "S/S". Don't `include JRuby` unnecessarily.
5
- * 0.3.9 - Added missing Greek vowels with diaeresis.
6
- * 0.3.8 - Correct and improve Macedonian support.
7
- * 0.3.7 - Fix compatibility with Ruby 1.8.7. Add Swedish support.
8
- * 0.3.6 - Allow multiple transliterators. Add Greek support.
9
- * 0.3.5 - Don't strip underscores from identifiers.
10
- * 0.3.4 - Add Romanian support.
11
- * 0.3.3 - Add Norwegian support.
12
- * 0.3.2 - Improve Macedonian support.
13
- * 0.3.1 - Small fixes to Cyrillic.
14
- * 0.3.0 - Cyrillic support. Improve support for various Unicode spaces and dashes.
15
- * 0.2.2 - Fix for "smart" quote handling.
16
- * 0.2.1 - Implement #empty? for compatibility with Active Support's #blank?.
17
- * 0.2.0 - Added support for Danish. Added method to generate Ruby identifiers. Improved performance.
18
- * 0.1.1 - Added support for Serbian.
19
- * 0.1.0 - Initial extraction from FriendlyId.
3
+ ## 1.0.1
4
+
5
+ * Fix error with tidy_bytes on Rubinius.
6
+ * Simplify Active Support UTF8 proxy.
7
+ * Fix `allow_bangs` argument to to_ruby_method being silently ignored.
8
+ * Raise error when generating an impossible Ruby method name.
9
+
10
+ ## 1.0.0
11
+
12
+ * Adopt semantic versioning.
13
+ * When using Active Support, require 3.2 or greater.
14
+ * Require Ruby 2.0 or greater.
15
+ * Fix Ruby warnings.
16
+ * Improve support for Ukrainian.
17
+ * Support some additional punctuation characters used by Chinese and others.
18
+ * Add Polish spec.
19
+ * Use native Unicode normalization on Ruby 2.2 in UTF8::DumbProxy.
20
+ * Invoke Ruby-native upcase/downcase in UTF8::DumbProxy.
21
+ * Proxy `tidy_bytes` method to Active Support when possible.
22
+ * Remove SlugString constant.
23
+
24
+ ## 0.3.11
25
+
26
+ * Add support for Vietnamese.
27
+
28
+ ## 0.3.10
29
+
30
+ * Fix Macedonian "S/S". Don't `include JRuby` unnecessarily.
31
+
32
+ ## 0.3.9
33
+
34
+ * Add missing Greek vowels with diaeresis.
35
+
36
+ ## 0.3.8
37
+
38
+ * Correct and improve Macedonian support.
39
+
40
+ ## 0.3.7
41
+
42
+ * Fix compatibility with Ruby 1.8.7.
43
+ * Add Swedish support.
44
+
45
+ ## 0.3.6
46
+
47
+ * Allow multiple transliterators.
48
+ * Add Greek support.
49
+
50
+ ## 0.3.5
51
+
52
+ * Don't strip underscores from identifiers.
53
+
54
+ ## 0.3.4
55
+
56
+ * Add Romanian support.
57
+
58
+ ## 0.3.3
59
+
60
+ * Add Norwegian support.
61
+
62
+ ## 0.3.2
63
+
64
+ * Improve Macedonian support.
65
+
66
+ ## 0.3.1
67
+
68
+ * Small fixes to Cyrillic.
69
+
70
+ ## 0.3.0
71
+
72
+ * Cyrillic support.
73
+ * Improve support for various Unicode spaces and dashes.
74
+
75
+ ## 0.2.2
76
+
77
+ * Fix for "smart" quote handling.
78
+
79
+ ## 0.2.1
80
+
81
+ * Implement #empty? for compatibility with Active Support's #blank?.
82
+
83
+ ## 0.2.0
84
+
85
+ * Add support for Danish.
86
+ * Add method to generate Ruby identifiers.
87
+ * Improve performance.
88
+
89
+ ## 0.1.1
90
+
91
+ * Add support for Serbian.
92
+
93
+ ## 0.1.0
94
+
95
+ * Initial extraction from FriendlyId.
@@ -9,23 +9,6 @@ class String
9
9
  Babosa::Identifier.new self
10
10
  end
11
11
  alias to_slug to_identifier
12
-
13
- # Compatibility with 1.8.6
14
- if !public_method_defined? :bytesize
15
- def bytesize
16
- unpack("C*").length
17
- end
18
- end
19
-
20
- # Define unless Active Support has already added this method.
21
- if !public_method_defined? :classify
22
- # Convert from underscores to class name. E.g.:
23
- # hello_world => HelloWorld
24
- def classify
25
- split("_").map {|a| a.gsub(/\b('?[a-z])/) { $1.upcase }}.join
26
- end
27
- end
28
-
29
12
  end
30
13
 
31
14
  require "babosa/transliterator/base"
@@ -30,6 +30,8 @@ module Babosa
30
30
  # @see http://www.utf8-chartable.de/unicode-utf8-table.pl?utf8=dec Unicode character table
31
31
  class Identifier
32
32
 
33
+ Error = Class.new(StandardError)
34
+
33
35
  attr_reader :wrapped_string
34
36
  alias to_s wrapped_string
35
37
 
@@ -44,13 +46,13 @@ module Babosa
44
46
  end
45
47
 
46
48
  # Return the proxy used for UTF-8 support.
47
- # @see Babosa::UTF8::UTF8Proxy
49
+ # @see Babosa::UTF8::Proxy
48
50
  def self.utf8_proxy
49
51
  @@utf8_proxy
50
52
  end
51
53
 
52
54
  # Set a proxy object used for UTF-8 support.
53
- # @see Babosa::UTF8::UTF8Proxy
55
+ # @see Babosa::UTF8::Proxy
54
56
  def self.utf8_proxy=(obj)
55
57
  @@utf8_proxy = obj
56
58
  end
@@ -100,16 +102,17 @@ module Babosa
100
102
  # string.transliterate # => "¡Feliz ano!"
101
103
  # string.transliterate :spanish # => "¡Feliz anio!"
102
104
  #
103
- # You can modify the built-in approximations, or add your own:
105
+ # The approximations are an array, which you can modify if you choose:
104
106
  #
105
107
  # # Make Spanish use "nh" rather than "nn"
106
- # Babosa::Characters.add_approximations(:spanish, "ñ" => "nh")
108
+ # Babosa::Transliterator::Spanish::APPROXIMATIONS["ñ"] = "nh"
107
109
  #
108
110
  # Notice that this method does not simply convert to ASCII; if you want
109
111
  # to remove non-ASCII characters such as "¡" and "¿", use {#to_ascii!}:
110
112
  #
111
113
  # string.transliterate!(:spanish) # => "¡Feliz anio!"
112
114
  # string.transliterate! # => "¡Feliz anio!"
115
+ #
113
116
  # @param *args <Symbol>
114
117
  # @return String
115
118
  def transliterate!(*kinds)
@@ -142,13 +145,8 @@ module Babosa
142
145
  # @param Options
143
146
  # @return String
144
147
  def normalize!(options = nil)
145
- # Handle deprecated usage
146
- if options == true
147
- warn "#normalize! now takes a hash of options rather than a boolean"
148
- options = default_normalize_options.merge(:to_ascii => true)
149
- else
150
- options = default_normalize_options.merge(options || {})
151
- end
148
+ options = default_normalize_options.merge(options || {})
149
+
152
150
  if translit_option = options[:transliterate]
153
151
  if translit_option != true
154
152
  transliterate!(*translit_option)
@@ -168,10 +166,14 @@ module Babosa
168
166
  # Normalize a string so that it can safely be used as a Ruby method name.
169
167
  def to_ruby_method!(allow_bangs = true)
170
168
  leader, trailer = @wrapped_string.strip.scan(/\A(.+)(.)\z/).flatten
169
+ leader = leader.to_s
170
+ trailer = trailer.to_s
171
171
  if allow_bangs
172
- trailer.downcase.gsub!(/[^a-z0-9!=\\\\?]/, '')
172
+ trailer.downcase!
173
+ trailer.gsub!(/[^a-z0-9!=\\?]/, '')
173
174
  else
174
- trailer.downcase.gsub!(/[^a-z0-9]/, '')
175
+ trailer.downcase!
176
+ trailer.gsub!(/[^a-z0-9]/, '')
175
177
  end
176
178
  id = leader.to_identifier
177
179
  id.transliterate!
@@ -180,6 +182,9 @@ module Babosa
180
182
  id.word_chars!
181
183
  id.clean!
182
184
  @wrapped_string = id.to_s + trailer
185
+ if @wrapped_string == ""
186
+ raise Error, "Input generates impossible Ruby method name"
187
+ end
183
188
  with_separators!("_")
184
189
  end
185
190
 
@@ -285,7 +290,4 @@ module Babosa
285
290
  id
286
291
  end
287
292
  end
288
-
289
- # Identifier is aliased as SlugString to support older versions of FriendlyId.
290
- SlugString = Identifier
291
293
  end
@@ -23,11 +23,11 @@ module Babosa
23
23
  autoload :Vietnamese, "babosa/transliterator/vietnamese"
24
24
 
25
25
  def self.get(symbol)
26
- const_get(symbol.to_s.classify)
26
+ class_name = symbol.to_s.split("_").map {|a| a.gsub(/\b('?[a-z])/) { $1.upcase }}.join
27
+ const_get(class_name)
27
28
  end
28
29
 
29
30
  class Base
30
-
31
31
  include Singleton
32
32
 
33
33
  APPROXIMATIONS = {
@@ -39,7 +39,6 @@ module Babosa
39
39
  "–" => "-",
40
40
  "—" => "-",
41
41
  "―" => "-",
42
- "―" => "-",
43
42
  "‘" => "'",
44
43
  "‛" => "'",
45
44
  "“" => '"',
@@ -47,6 +46,19 @@ module Babosa
47
46
  "„" => '"',
48
47
  "‟" => '"',
49
48
  '’' => "'",
49
+ ',' => ",",
50
+ '。' => ".",
51
+ '!' => "!",
52
+ '?' => '?',
53
+ '、' => ',',
54
+ '(' => '(',
55
+ ')' => ')',
56
+ '【' => '[',
57
+ '】' => ']',
58
+ ';' => ';',
59
+ ':' => ':',
60
+ '《' => '<',
61
+ '》' => '>',
50
62
  # various kinds of space characters
51
63
  "\xc2\xa0" => " ",
52
64
  "\xe2\x80\x80" => " ",
@@ -87,6 +99,7 @@ module Babosa
87
99
  @approximations[codepoint]
88
100
  end
89
101
 
102
+ # Transliterates a string.
90
103
  def transliterate(string)
91
104
  string.unpack("U*").map {|char| self[char] || char}.flatten.pack("U*")
92
105
  end
@@ -3,8 +3,27 @@ module Babosa
3
3
  module Transliterator
4
4
  class Ukrainian < Cyrillic
5
5
  APPROXIMATIONS = {
6
+ "Г" => "H",
7
+ "г" => "h",
8
+ "Ґ" => "G",
9
+ "ґ" => "g",
10
+ "є" => "ie",
6
11
  "И" => "Y",
7
12
  "и" => "y",
13
+ "І" => "I",
14
+ "і" => "i",
15
+ "ї" => "i",
16
+ "Й" => "Y",
17
+ "й" => "i",
18
+ "Х" => "Kh",
19
+ "х" => "kh",
20
+ "Ц" => "Ts",
21
+ "ц" => 'ts',
22
+ "Щ" => "Shch",
23
+ "щ" => "shch",
24
+ "ю" => "iu",
25
+ "я" => "ia",
26
+ "'" => ""
8
27
  }
9
28
  end
10
29
  end
@@ -1,19 +1,13 @@
1
+ require 'active_support/multibyte/unicode'
2
+
1
3
  module Babosa
2
4
  module UTF8
3
5
  # A UTF-8 proxy using Active Support's multibyte support.
4
6
  module ActiveSupportProxy
5
- extend UTF8Proxy
6
- extend self
7
- def downcase(string)
8
- ActiveSupport::Multibyte::Chars.new(string).downcase.to_s
9
- end
10
-
11
- def upcase(string)
12
- ActiveSupport::Multibyte::Chars.new(string).upcase.to_s
13
- end
7
+ extend ActiveSupport::Multibyte::Unicode
14
8
 
15
- def normalize_utf8(string)
16
- ActiveSupport::Multibyte::Chars.new(string).normalize(:c).to_s
9
+ def self.normalize_utf8(string)
10
+ normalize(string, :c)
17
11
  end
18
12
  end
19
13
  end
@@ -10,32 +10,39 @@ module Babosa
10
10
  # or ActiveSupport should be used instead because they support the full
11
11
  # UTF-8 character range.
12
12
  module DumbProxy
13
- extend UTF8Proxy
13
+ extend Proxy
14
14
  extend self
15
15
 
16
16
  def downcase(string)
17
- string.unpack("U*").map {|char| Mappings::DOWNCASE[char] or char}.flatten.pack("U*")
17
+ string.downcase.unpack("U*").map {|char| Mappings::DOWNCASE[char] or char}.flatten.pack("U*")
18
18
  end
19
19
 
20
20
  def upcase(string)
21
- string.unpack("U*").map {|char| Mappings::UPCASE[char] or char}.flatten.pack("U*")
21
+ string.upcase.unpack("U*").map {|char| Mappings::UPCASE[char] or char}.flatten.pack("U*")
22
22
  end
23
23
 
24
- # This does a very naive Unicode normalization, which should work for
25
- # this library's purposes (i.e., Roman-based codepoints, up to U+017E).
26
- # Do not reuse this as a general solution! Use a real library like
27
- # Unicode or ActiveSupport instead.
28
- def normalize_utf8(string)
29
- codepoints = string.unpack("U*")
30
- new = []
31
- until codepoints.empty? do
32
- if Mappings::COMPOSITION[codepoints[0..1]]
33
- new << Mappings::COMPOSITION[codepoints.slice!(0,2)]
34
- else
35
- new << codepoints.shift
24
+ if ''.respond_to?(:unicode_normalize)
25
+ def normalize_utf8(string)
26
+ string.unicode_normalize
27
+ end
28
+ else
29
+ # On Ruby 2.2, this uses the native Unicode normalize method. On all
30
+ # other Rubies, it does a very naive Unicode normalization, which should
31
+ # work for this library's purposes (i.e., Roman-based codepoints, up to
32
+ # U+017E). Do not reuse this as a general solution! Use a real
33
+ # library like Unicode or ActiveSupport instead.
34
+ def normalize_utf8(string)
35
+ codepoints = string.unpack("U*")
36
+ new = []
37
+ until codepoints.empty? do
38
+ if Mappings::COMPOSITION[codepoints[0..1]]
39
+ new << Mappings::COMPOSITION[codepoints.slice!(0,2)]
40
+ else
41
+ new << codepoints.shift
42
+ end
36
43
  end
44
+ new.compact.flatten.pack("U*")
37
45
  end
38
- new.compact.flatten.pack("U*")
39
46
  end
40
47
  end
41
48
  end
@@ -2,7 +2,7 @@ module Babosa
2
2
  module UTF8
3
3
  # A UTF-8 proxy module using Java's built-in Unicode support. Requires JRuby 1.5+.
4
4
  module JavaProxy
5
- extend UTF8Proxy
5
+ extend Proxy
6
6
  extend self
7
7
  java_import java.text.Normalizer
8
8
 
@@ -8,7 +8,7 @@ module Babosa
8
8
 
9
9
  # A UTF-8 proxy for Babosa can be any object which responds to the methods in this module.
10
10
  # The following proxies are provided by Babosa: {ActiveSupportProxy}, {DumbProxy}, {JavaProxy}, and {UnicodeProxy}.
11
- module UTF8Proxy
11
+ module Proxy
12
12
  CP1252 = {
13
13
  128 => [226, 130, 172],
14
14
  129 => nil,
@@ -62,50 +62,57 @@ module Babosa
62
62
  raise NotImplementedError
63
63
  end
64
64
 
65
- # Attempt to replace invalid UTF-8 bytes with valid ones. This method
66
- # naively assumes if you have invalid UTF8 bytes, they are either Windows
67
- # CP-1252 or ISO8859-1. In practice this isn't a bad assumption, but may not
68
- # always work.
69
- def tidy_bytes(string)
70
- bytes = string.unpack("C*")
71
- conts_expected = 0
72
- last_lead = 0
65
+ if ''.respond_to?(:scrub) && !defined?(Rubinius)
66
+ # Attempt to replace invalid UTF-8 bytes with valid ones. This method
67
+ # naively assumes if you have invalid UTF8 bytes, they are either Windows
68
+ # CP-1252 or ISO8859-1. In practice this isn't a bad assumption, but may not
69
+ # always work.
70
+ def tidy_bytes(string)
71
+ string.scrub do |bad|
72
+ tidy_byte(*bad.bytes).flatten.compact.pack('C*').unpack('U*').pack('U*')
73
+ end
74
+ end
75
+ else
76
+ def tidy_bytes(string)
77
+ bytes = string.unpack("C*")
78
+ conts_expected = 0
79
+ last_lead = 0
73
80
 
74
- bytes.each_index do |i|
75
- byte = bytes[i]
76
- is_ascii = byte < 128
77
- is_cont = byte > 127 && byte < 192
78
- is_lead = byte > 191 && byte < 245
79
- is_unused = byte > 240
80
- is_restricted = byte > 244
81
+ bytes.each_index do |i|
82
+ byte = bytes[i]
83
+ is_cont = byte > 127 && byte < 192
84
+ is_lead = byte > 191 && byte < 245
85
+ is_unused = byte > 240
86
+ is_restricted = byte > 244
81
87
 
82
- # Impossible or highly unlikely byte? Clean it.
83
- if is_unused || is_restricted
84
- bytes[i] = tidy_byte(byte)
85
- elsif is_cont
86
- # Not expecting continuation byte? Clean up. Otherwise, now expect one less.
87
- conts_expected == 0 ? bytes[i] = tidy_byte(byte) : conts_expected -= 1
88
- else
89
- if conts_expected > 0
90
- # Expected continuation, but got ASCII or leading? Clean backwards up to
91
- # the leading byte.
92
- (1..(i - last_lead)).each {|j| bytes[i - j] = tidy_byte(bytes[i - j])}
93
- conts_expected = 0
94
- end
95
- if is_lead
96
- # Final byte is leading? Clean it.
97
- if i == bytes.length - 1
98
- bytes[i] = tidy_byte(bytes.last)
99
- else
100
- # Valid leading byte? Expect continuations determined by position of
101
- # first zero bit, with max of 3.
102
- conts_expected = byte < 224 ? 1 : byte < 240 ? 2 : 3
103
- last_lead = i
88
+ # Impossible or highly unlikely byte? Clean it.
89
+ if is_unused || is_restricted
90
+ bytes[i] = tidy_byte(byte)
91
+ elsif is_cont
92
+ # Not expecting continuation byte? Clean up. Otherwise, now expect one less.
93
+ conts_expected == 0 ? bytes[i] = tidy_byte(byte) : conts_expected -= 1
94
+ else
95
+ if conts_expected > 0
96
+ # Expected continuation, but got ASCII or leading? Clean backwards up to
97
+ # the leading byte.
98
+ (1..(i - last_lead)).each {|j| bytes[i - j] = tidy_byte(bytes[i - j])}
99
+ conts_expected = 0
100
+ end
101
+ if is_lead
102
+ # Final byte is leading? Clean it.
103
+ if i == bytes.length - 1
104
+ bytes[i] = tidy_byte(bytes.last)
105
+ else
106
+ # Valid leading byte? Expect continuations determined by position of
107
+ # first zero bit, with max of 3.
108
+ conts_expected = byte < 224 ? 1 : byte < 240 ? 2 : 3
109
+ last_lead = i
110
+ end
104
111
  end
105
112
  end
106
113
  end
114
+ bytes.empty? ? "" : bytes.flatten.compact.pack("C*").unpack("U*").pack("U*")
107
115
  end
108
- bytes.empty? ? "" : bytes.flatten.compact.pack("C*").unpack("U*").pack("U*")
109
116
  end
110
117
 
111
118
  private