babosa 0.3.11 → 1.0.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 2ee04fad8c458a32dea08b5f6483d817d359dab4
4
- data.tar.gz: 22385c9ae0e279fc6531a6ad6e9851ec3d4f4e81
3
+ metadata.gz: 5f3dfc2a054ed3f64981c0f18a149a9647ef0183
4
+ data.tar.gz: 3478c2c422839e82866d828cc77ba3bcf79a5117
5
5
  SHA512:
6
- metadata.gz: ad5f5a7e2bbfd63ab2e0a89878b54dc52a96f6929f68dbb94070d86e7b0515d7f3fb0e6e30d4e3152320a07fe7684833f23135569e7200e4aad0f9930bb3b261
7
- data.tar.gz: 95e010b9b9c5138af14454258f332a0e32e735354ad49a65ed4f22514110e159909e4a4e714a06443a02f4d9f64edfe00ac2cf48f10036488beb2f4a4abde977
6
+ metadata.gz: f7880005ce37bddd4b9780a3b31489eebdfbbb4e8f9b7d6eacd54cc3e33c36b412ec2fd4b5cc4aea87ea64288240002c9006a909e3ba2692161e775f5e43836f
7
+ data.tar.gz: c0e7fd6edeb02401dfb34493e5e072de62fdbe3c0333d252ebac5b359830ef966ededc8879077ed00e54a75432db72ec0852c3f4c4edf7e4a2b40c550c3e6245
@@ -1,19 +1,95 @@
1
1
  # Babosa Changelog
2
2
 
3
- * 0.3.11 - Added support for Vietnamese
4
- * 0.3.10 - Fixed Macedonian "S/S". Don't `include JRuby` unnecessarily.
5
- * 0.3.9 - Added missing Greek vowels with diaeresis.
6
- * 0.3.8 - Correct and improve Macedonian support.
7
- * 0.3.7 - Fix compatibility with Ruby 1.8.7. Add Swedish support.
8
- * 0.3.6 - Allow multiple transliterators. Add Greek support.
9
- * 0.3.5 - Don't strip underscores from identifiers.
10
- * 0.3.4 - Add Romanian support.
11
- * 0.3.3 - Add Norwegian support.
12
- * 0.3.2 - Improve Macedonian support.
13
- * 0.3.1 - Small fixes to Cyrillic.
14
- * 0.3.0 - Cyrillic support. Improve support for various Unicode spaces and dashes.
15
- * 0.2.2 - Fix for "smart" quote handling.
16
- * 0.2.1 - Implement #empty? for compatiblity with Active Support's #blank?.
17
- * 0.2.0 - Added support for Danish. Added method to generate Ruby identifiers. Improved performance.
18
- * 0.1.1 - Added support for Serbian.
19
- * 0.1.0 - Initial extraction from FriendlyId.
3
+ ## 1.0.1
4
+
5
+ * Fix error with tidy_bytes on Rubinius.
6
+ * Simplify Active Support UTF8 proxy.
7
+ * Fix `allow_bangs` argument to to_ruby_method being silently ignored.
8
+ * Raise error when generating an impossible Ruby method name.
9
+
10
+ ## 1.0.0
11
+
12
+ * Adopt semantic versioning.
13
+ * When using Active Support, require 3.2 or greater.
14
+ * Require Ruby 2.0 or greater.
15
+ * Fix Ruby warnings.
16
+ * Improve support for Ukrainian.
17
+ * Support some additional punctuation characters used by Chinese and others.
18
+ * Add Polish spec.
19
+ * Use native Unicode normalization on Ruby 2.2 in UTF8::DumbProxy.
20
+ * Invoke Ruby-native upcase/downcase in UTF8::DumbProxy.
21
+ * Proxy `tidy_bytes` method to Active Support when possible.
22
+ * Remove SlugString constant.
23
+
24
+ ## 0.3.11
25
+
26
+ * Add support for Vietnamese.
27
+
28
+ ## 0.3.10
29
+
30
+ * Fix Macedonian "S/S". Don't `include JRuby` unnecessarily.
31
+
32
+ ## 0.3.9
33
+
34
+ * Add missing Greek vowels with diaeresis.
35
+
36
+ ## 0.3.8
37
+
38
+ * Correct and improve Macedonian support.
39
+
40
+ ## 0.3.7
41
+
42
+ * Fix compatibility with Ruby 1.8.7.
43
+ * Add Swedish support.
44
+
45
+ ## 0.3.6
46
+
47
+ * Allow multiple transliterators.
48
+ * Add Greek support.
49
+
50
+ ## 0.3.5
51
+
52
+ * Don't strip underscores from identifiers.
53
+
54
+ ## 0.3.4
55
+
56
+ * Add Romanian support.
57
+
58
+ ## 0.3.3
59
+
60
+ * Add Norwegian support.
61
+
62
+ ## 0.3.2
63
+
64
+ * Improve Macedonian support.
65
+
66
+ ## 0.3.1
67
+
68
+ * Small fixes to Cyrillic.
69
+
70
+ ## 0.3.0
71
+
72
+ * Cyrillic support.
73
+ * Improve support for various Unicode spaces and dashes.
74
+
75
+ ## 0.2.2
76
+
77
+ * Fix for "smart" quote handling.
78
+
79
+ ## 0.2.1
80
+
81
+ * Implement #empty? for compatibility with Active Support's #blank?.
82
+
83
+ ## 0.2.0
84
+
85
+ * Add support for Danish.
86
+ * Add method to generate Ruby identifiers.
87
+ * Improve performance.
88
+
89
+ ## 0.1.1
90
+
91
+ * Add support for Serbian.
92
+
93
+ ## 0.1.0
94
+
95
+ * Initial extraction from FriendlyId.
@@ -9,23 +9,6 @@ class String
9
9
  Babosa::Identifier.new self
10
10
  end
11
11
  alias to_slug to_identifier
12
-
13
- # Compatibility with 1.8.6
14
- if !public_method_defined? :bytesize
15
- def bytesize
16
- unpack("C*").length
17
- end
18
- end
19
-
20
- # Define unless Active Support has already added this method.
21
- if !public_method_defined? :classify
22
- # Convert from underscores to class name. E.g.:
23
- # hello_world => HelloWorld
24
- def classify
25
- split("_").map {|a| a.gsub(/\b('?[a-z])/) { $1.upcase }}.join
26
- end
27
- end
28
-
29
12
  end
30
13
 
31
14
  require "babosa/transliterator/base"
@@ -30,6 +30,8 @@ module Babosa
30
30
  # @see http://www.utf8-chartable.de/unicode-utf8-table.pl?utf8=dec Unicode character table
31
31
  class Identifier
32
32
 
33
+ Error = Class.new(StandardError)
34
+
33
35
  attr_reader :wrapped_string
34
36
  alias to_s wrapped_string
35
37
 
@@ -44,13 +46,13 @@ module Babosa
44
46
  end
45
47
 
46
48
  # Return the proxy used for UTF-8 support.
47
- # @see Babosa::UTF8::UTF8Proxy
49
+ # @see Babosa::UTF8::Proxy
48
50
  def self.utf8_proxy
49
51
  @@utf8_proxy
50
52
  end
51
53
 
52
54
  # Set a proxy object used for UTF-8 support.
53
- # @see Babosa::UTF8::UTF8Proxy
55
+ # @see Babosa::UTF8::Proxy
54
56
  def self.utf8_proxy=(obj)
55
57
  @@utf8_proxy = obj
56
58
  end
@@ -100,16 +102,17 @@ module Babosa
100
102
  # string.transliterate # => "¡Feliz ano!"
101
103
  # string.transliterate :spanish # => "¡Feliz anio!"
102
104
  #
103
- # You can modify the built-in approximations, or add your own:
105
+ # The approximations are an array, which you can modify if you choose:
104
106
  #
105
107
  # # Make Spanish use "nh" rather than "nn"
106
- # Babosa::Characters.add_approximations(:spanish, "ñ" => "nh")
108
+ # Babosa::Transliterator::Spanish::APPROXIMATIONS["ñ"] = "nh"
107
109
  #
108
110
  # Notice that this method does not simply convert to ASCII; if you want
109
111
  # to remove non-ASCII characters such as "¡" and "¿", use {#to_ascii!}:
110
112
  #
111
113
  # string.transliterate!(:spanish) # => "¡Feliz anio!"
112
114
  # string.transliterate! # => "¡Feliz anio!"
115
+ #
113
116
  # @param *args <Symbol>
114
117
  # @return String
115
118
  def transliterate!(*kinds)
@@ -142,13 +145,8 @@ module Babosa
142
145
  # @param Options
143
146
  # @return String
144
147
  def normalize!(options = nil)
145
- # Handle deprecated usage
146
- if options == true
147
- warn "#normalize! now takes a hash of options rather than a boolean"
148
- options = default_normalize_options.merge(:to_ascii => true)
149
- else
150
- options = default_normalize_options.merge(options || {})
151
- end
148
+ options = default_normalize_options.merge(options || {})
149
+
152
150
  if translit_option = options[:transliterate]
153
151
  if translit_option != true
154
152
  transliterate!(*translit_option)
@@ -168,10 +166,14 @@ module Babosa
168
166
  # Normalize a string so that it can safely be used as a Ruby method name.
169
167
  def to_ruby_method!(allow_bangs = true)
170
168
  leader, trailer = @wrapped_string.strip.scan(/\A(.+)(.)\z/).flatten
169
+ leader = leader.to_s
170
+ trailer = trailer.to_s
171
171
  if allow_bangs
172
- trailer.downcase.gsub!(/[^a-z0-9!=\\\\?]/, '')
172
+ trailer.downcase!
173
+ trailer.gsub!(/[^a-z0-9!=\\?]/, '')
173
174
  else
174
- trailer.downcase.gsub!(/[^a-z0-9]/, '')
175
+ trailer.downcase!
176
+ trailer.gsub!(/[^a-z0-9]/, '')
175
177
  end
176
178
  id = leader.to_identifier
177
179
  id.transliterate!
@@ -180,6 +182,9 @@ module Babosa
180
182
  id.word_chars!
181
183
  id.clean!
182
184
  @wrapped_string = id.to_s + trailer
185
+ if @wrapped_string == ""
186
+ raise Error, "Input generates impossible Ruby method name"
187
+ end
183
188
  with_separators!("_")
184
189
  end
185
190
 
@@ -285,7 +290,4 @@ module Babosa
285
290
  id
286
291
  end
287
292
  end
288
-
289
- # Identifier is aliased as SlugString to support older versions of FriendlyId.
290
- SlugString = Identifier
291
293
  end
@@ -23,11 +23,11 @@ module Babosa
23
23
  autoload :Vietnamese, "babosa/transliterator/vietnamese"
24
24
 
25
25
  def self.get(symbol)
26
- const_get(symbol.to_s.classify)
26
+ class_name = symbol.to_s.split("_").map {|a| a.gsub(/\b('?[a-z])/) { $1.upcase }}.join
27
+ const_get(class_name)
27
28
  end
28
29
 
29
30
  class Base
30
-
31
31
  include Singleton
32
32
 
33
33
  APPROXIMATIONS = {
@@ -39,7 +39,6 @@ module Babosa
39
39
  "–" => "-",
40
40
  "—" => "-",
41
41
  "―" => "-",
42
- "―" => "-",
43
42
  "‘" => "'",
44
43
  "‛" => "'",
45
44
  "“" => '"',
@@ -47,6 +46,19 @@ module Babosa
47
46
  "„" => '"',
48
47
  "‟" => '"',
49
48
  '’' => "'",
49
+ ',' => ",",
50
+ '。' => ".",
51
+ '!' => "!",
52
+ '?' => '?',
53
+ '、' => ',',
54
+ '(' => '(',
55
+ ')' => ')',
56
+ '【' => '[',
57
+ '】' => ']',
58
+ ';' => ';',
59
+ ':' => ':',
60
+ '《' => '<',
61
+ '》' => '>',
50
62
  # various kinds of space characters
51
63
  "\xc2\xa0" => " ",
52
64
  "\xe2\x80\x80" => " ",
@@ -87,6 +99,7 @@ module Babosa
87
99
  @approximations[codepoint]
88
100
  end
89
101
 
102
+ # Transliterates a string.
90
103
  def transliterate(string)
91
104
  string.unpack("U*").map {|char| self[char] || char}.flatten.pack("U*")
92
105
  end
@@ -3,8 +3,27 @@ module Babosa
3
3
  module Transliterator
4
4
  class Ukrainian < Cyrillic
5
5
  APPROXIMATIONS = {
6
+ "Г" => "H",
7
+ "г" => "h",
8
+ "Ґ" => "G",
9
+ "ґ" => "g",
10
+ "є" => "ie",
6
11
  "И" => "Y",
7
12
  "и" => "y",
13
+ "І" => "I",
14
+ "і" => "i",
15
+ "ї" => "i",
16
+ "Й" => "Y",
17
+ "й" => "i",
18
+ "Х" => "Kh",
19
+ "х" => "kh",
20
+ "Ц" => "Ts",
21
+ "ц" => 'ts',
22
+ "Щ" => "Shch",
23
+ "щ" => "shch",
24
+ "ю" => "iu",
25
+ "я" => "ia",
26
+ "'" => ""
8
27
  }
9
28
  end
10
29
  end
@@ -1,19 +1,13 @@
1
+ require 'active_support/multibyte/unicode'
2
+
1
3
  module Babosa
2
4
  module UTF8
3
5
  # A UTF-8 proxy using Active Support's multibyte support.
4
6
  module ActiveSupportProxy
5
- extend UTF8Proxy
6
- extend self
7
- def downcase(string)
8
- ActiveSupport::Multibyte::Chars.new(string).downcase.to_s
9
- end
10
-
11
- def upcase(string)
12
- ActiveSupport::Multibyte::Chars.new(string).upcase.to_s
13
- end
7
+ extend ActiveSupport::Multibyte::Unicode
14
8
 
15
- def normalize_utf8(string)
16
- ActiveSupport::Multibyte::Chars.new(string).normalize(:c).to_s
9
+ def self.normalize_utf8(string)
10
+ normalize(string, :c)
17
11
  end
18
12
  end
19
13
  end
@@ -10,32 +10,39 @@ module Babosa
10
10
  # or ActiveSupport should be used instead because they support the full
11
11
  # UTF-8 character range.
12
12
  module DumbProxy
13
- extend UTF8Proxy
13
+ extend Proxy
14
14
  extend self
15
15
 
16
16
  def downcase(string)
17
- string.unpack("U*").map {|char| Mappings::DOWNCASE[char] or char}.flatten.pack("U*")
17
+ string.downcase.unpack("U*").map {|char| Mappings::DOWNCASE[char] or char}.flatten.pack("U*")
18
18
  end
19
19
 
20
20
  def upcase(string)
21
- string.unpack("U*").map {|char| Mappings::UPCASE[char] or char}.flatten.pack("U*")
21
+ string.upcase.unpack("U*").map {|char| Mappings::UPCASE[char] or char}.flatten.pack("U*")
22
22
  end
23
23
 
24
- # This does a very naive Unicode normalization, which should work for
25
- # this library's purposes (i.e., Roman-based codepoints, up to U+017E).
26
- # Do not use reuse this as a general solution! Use a real library like
27
- # Unicode or ActiveSupport instead.
28
- def normalize_utf8(string)
29
- codepoints = string.unpack("U*")
30
- new = []
31
- until codepoints.empty? do
32
- if Mappings::COMPOSITION[codepoints[0..1]]
33
- new << Mappings::COMPOSITION[codepoints.slice!(0,2)]
34
- else
35
- new << codepoints.shift
24
+ if ''.respond_to?(:unicode_normalize)
25
+ def normalize_utf8(string)
26
+ string.unicode_normalize
27
+ end
28
+ else
29
+ # On Ruby 2.2, this uses the native Unicode normalize method. On all
30
+ # other Rubies, it does a very naive Unicode normalization, which should
31
+ # work for this library's purposes (i.e., Roman-based codepoints, up to
32
+ # U+017E). Do not reuse this as a general solution! Use a real
33
+ # library like Unicode or ActiveSupport instead.
34
+ def normalize_utf8(string)
35
+ codepoints = string.unpack("U*")
36
+ new = []
37
+ until codepoints.empty? do
38
+ if Mappings::COMPOSITION[codepoints[0..1]]
39
+ new << Mappings::COMPOSITION[codepoints.slice!(0,2)]
40
+ else
41
+ new << codepoints.shift
42
+ end
36
43
  end
44
+ new.compact.flatten.pack("U*")
37
45
  end
38
- new.compact.flatten.pack("U*")
39
46
  end
40
47
  end
41
48
  end
@@ -2,7 +2,7 @@ module Babosa
2
2
  module UTF8
3
3
  # A UTF-8 proxy module using Java's built-in Unicode support. Requires JRuby 1.5+.
4
4
  module JavaProxy
5
- extend UTF8Proxy
5
+ extend Proxy
6
6
  extend self
7
7
  java_import java.text.Normalizer
8
8
 
@@ -8,7 +8,7 @@ module Babosa
8
8
 
9
9
  # A UTF-8 proxy for Babosa can be any object which responds to the methods in this module.
10
10
  # The following proxies are provided by Babosa: {ActiveSupportProxy}, {DumbProxy}, {JavaProxy}, and {UnicodeProxy}.
11
- module UTF8Proxy
11
+ module Proxy
12
12
  CP1252 = {
13
13
  128 => [226, 130, 172],
14
14
  129 => nil,
@@ -62,50 +62,57 @@ module Babosa
62
62
  raise NotImplementedError
63
63
  end
64
64
 
65
- # Attempt to replace invalid UTF-8 bytes with valid ones. This method
66
- # naively assumes if you have invalid UTF8 bytes, they are either Windows
67
- # CP-1252 or ISO8859-1. In practice this isn't a bad assumption, but may not
68
- # always work.
69
- def tidy_bytes(string)
70
- bytes = string.unpack("C*")
71
- conts_expected = 0
72
- last_lead = 0
65
+ if ''.respond_to?(:scrub) && !defined?(Rubinius)
66
+ # Attempt to replace invalid UTF-8 bytes with valid ones. This method
67
+ # naively assumes if you have invalid UTF8 bytes, they are either Windows
68
+ # CP-1252 or ISO8859-1. In practice this isn't a bad assumption, but may not
69
+ # always work.
70
+ def tidy_bytes(string)
71
+ string.scrub do |bad|
72
+ tidy_byte(*bad.bytes).flatten.compact.pack('C*').unpack('U*').pack('U*')
73
+ end
74
+ end
75
+ else
76
+ def tidy_bytes(string)
77
+ bytes = string.unpack("C*")
78
+ conts_expected = 0
79
+ last_lead = 0
73
80
 
74
- bytes.each_index do |i|
75
- byte = bytes[i]
76
- is_ascii = byte < 128
77
- is_cont = byte > 127 && byte < 192
78
- is_lead = byte > 191 && byte < 245
79
- is_unused = byte > 240
80
- is_restricted = byte > 244
81
+ bytes.each_index do |i|
82
+ byte = bytes[i]
83
+ is_cont = byte > 127 && byte < 192
84
+ is_lead = byte > 191 && byte < 245
85
+ is_unused = byte > 240
86
+ is_restricted = byte > 244
81
87
 
82
- # Impossible or highly unlikely byte? Clean it.
83
- if is_unused || is_restricted
84
- bytes[i] = tidy_byte(byte)
85
- elsif is_cont
86
- # Not expecting contination byte? Clean up. Otherwise, now expect one less.
87
- conts_expected == 0 ? bytes[i] = tidy_byte(byte) : conts_expected -= 1
88
- else
89
- if conts_expected > 0
90
- # Expected continuation, but got ASCII or leading? Clean backwards up to
91
- # the leading byte.
92
- (1..(i - last_lead)).each {|j| bytes[i - j] = tidy_byte(bytes[i - j])}
93
- conts_expected = 0
94
- end
95
- if is_lead
96
- # Final byte is leading? Clean it.
97
- if i == bytes.length - 1
98
- bytes[i] = tidy_byte(bytes.last)
99
- else
100
- # Valid leading byte? Expect continuations determined by position of
101
- # first zero bit, with max of 3.
102
- conts_expected = byte < 224 ? 1 : byte < 240 ? 2 : 3
103
- last_lead = i
88
+ # Impossible or highly unlikely byte? Clean it.
89
+ if is_unused || is_restricted
90
+ bytes[i] = tidy_byte(byte)
91
+ elsif is_cont
92
+ # Not expecting continuation byte? Clean up. Otherwise, now expect one less.
93
+ conts_expected == 0 ? bytes[i] = tidy_byte(byte) : conts_expected -= 1
94
+ else
95
+ if conts_expected > 0
96
+ # Expected continuation, but got ASCII or leading? Clean backwards up to
97
+ # the leading byte.
98
+ (1..(i - last_lead)).each {|j| bytes[i - j] = tidy_byte(bytes[i - j])}
99
+ conts_expected = 0
100
+ end
101
+ if is_lead
102
+ # Final byte is leading? Clean it.
103
+ if i == bytes.length - 1
104
+ bytes[i] = tidy_byte(bytes.last)
105
+ else
106
+ # Valid leading byte? Expect continuations determined by position of
107
+ # first zero bit, with max of 3.
108
+ conts_expected = byte < 224 ? 1 : byte < 240 ? 2 : 3
109
+ last_lead = i
110
+ end
104
111
  end
105
112
  end
106
113
  end
114
+ bytes.empty? ? "" : bytes.flatten.compact.pack("C*").unpack("U*").pack("U*")
107
115
  end
108
- bytes.empty? ? "" : bytes.flatten.compact.pack("C*").unpack("U*").pack("U*")
109
116
  end
110
117
 
111
118
  private