RubyGems - babosa - Versions diffs - 0.3.11 → 1.0.1 - Mend

babosa 0.3.11 → 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (33)

checksums.yaml +4 -4
data/Changelog.md +93 -17
data/lib/babosa.rb +0 -17
data/lib/babosa/identifier.rb +18 -16
data/lib/babosa/transliterator/base.rb +16 -3
data/lib/babosa/transliterator/ukrainian.rb +19 -0
data/lib/babosa/utf8/active_support_proxy.rb +5 -11
data/lib/babosa/utf8/dumb_proxy.rb +23 -16
data/lib/babosa/utf8/java_proxy.rb +1 -1
data/lib/babosa/utf8/proxy.rb +46 -39
data/lib/babosa/utf8/unicode_proxy.rb +3 -1
data/lib/babosa/version.rb +1 -1
data/spec/babosa_spec.rb +45 -36
data/spec/spec_helper.rb +8 -14
data/spec/transliterators/base_spec.rb +3 -3
data/spec/transliterators/bulgarian_spec.rb +1 -1
data/spec/transliterators/danish_spec.rb +1 -1
data/spec/transliterators/german_spec.rb +2 -2
data/spec/transliterators/greek_spec.rb +1 -1
data/spec/transliterators/latin_spec.rb +9 -0
data/spec/transliterators/norwegian_spec.rb +1 -1
data/spec/transliterators/polish_spec.rb +14 -0
data/spec/transliterators/romanian_spec.rb +1 -1
data/spec/transliterators/serbian_spec.rb +1 -1
data/spec/transliterators/spanish_spec.rb +1 -1
data/spec/transliterators/swedish_spec.rb +1 -1
data/spec/transliterators/ukrainian_spec.rb +80 -1
data/spec/transliterators/vietnamese_spec.rb +1 -1
data/spec/utf8_proxy_spec.rb +10 -18
metadata +42 -29
data/init.rb +0 -3
data/lib/babosa/candidates.rb +0 -45
data/lib/babosa/generator.rb +0 -24

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 2ee04fad8c458a32dea08b5f6483d817d359dab4
-  data.tar.gz: 22385c9ae0e279fc6531a6ad6e9851ec3d4f4e81
+  metadata.gz: 5f3dfc2a054ed3f64981c0f18a149a9647ef0183
+  data.tar.gz: 3478c2c422839e82866d828cc77ba3bcf79a5117
 SHA512:
-  metadata.gz: ad5f5a7e2bbfd63ab2e0a89878b54dc52a96f6929f68dbb94070d86e7b0515d7f3fb0e6e30d4e3152320a07fe7684833f23135569e7200e4aad0f9930bb3b261
-  data.tar.gz: 95e010b9b9c5138af14454258f332a0e32e735354ad49a65ed4f22514110e159909e4a4e714a06443a02f4d9f64edfe00ac2cf48f10036488beb2f4a4abde977
+  metadata.gz: f7880005ce37bddd4b9780a3b31489eebdfbbb4e8f9b7d6eacd54cc3e33c36b412ec2fd4b5cc4aea87ea64288240002c9006a909e3ba2692161e775f5e43836f
+  data.tar.gz: c0e7fd6edeb02401dfb34493e5e072de62fdbe3c0333d252ebac5b359830ef966ededc8879077ed00e54a75432db72ec0852c3f4c4edf7e4a2b40c550c3e6245

data/Changelog.md CHANGED

@@ -1,19 +1,95 @@
 # Babosa Changelog
-* 0.3.11 - Added support for Vietnamese
-* 0.3.10 - Fixed Macedonian "S/S". Don't `include JRuby` unnecessarily.
-* 0.3.9 - Added missing Greek vowels with diaeresis.
-* 0.3.8 - Correct and improve Macedonian support.
-* 0.3.7 - Fix compatibility with Ruby 1.8.7. Add Swedish support.
-* 0.3.6 - Allow multiple transliterators. Add Greek support.
-* 0.3.5 - Don't strip underscores from identifiers.
-* 0.3.4 - Add Romanian support.
-* 0.3.3 - Add Norwegian support.
-* 0.3.2 - Improve Macedonian support.
-* 0.3.1 - Small fixes to Cyrillic.
-* 0.3.0 - Cyrillic support. Improve support for various Unicode spaces and dashes.
-* 0.2.2 - Fix for "smart" quote handling.
-* 0.2.1 - Implement #empty? for compatiblity with Active Support's #blank?.
-* 0.2.0 - Added support for Danish. Added method to generate Ruby identifiers. Improved performance.
-* 0.1.1 - Added support for Serbian.
-* 0.1.0 - Initial extraction from FriendlyId.
+## 1.0.1
+* Fix error with tidy_bytes on Rubinius.
+* Simplify Active Support UTF8 proxy.
+* Fix `allow_bangs` argument to to_ruby_method being silently ignored.
+* Raise error when generating an impossible Ruby method name.
+## 1.0.0
+* Adopt semantic versioning.
+* When using Active Support, require 3.2 or greater.
+* Require Ruby 2.0 or greater.
+* Fix Ruby warnings.
+* Improve support for Ukrainian.
+* Support some additional punctuation characters used by Chinese and others.
+* Add Polish spec.
+* Use native Unicode normalization on Ruby 2.2 in UTF8::DumbProxy.
+* Invoke Ruby-native upcase/downcase in UTF8::DumbProxy.
+* Proxy `tidy_bytes` method to Active Support when possible.
+* Remove SlugString constant.
+## 0.3.11
+*  Add support for Vietnamese.
+## 0.3.10
+*  Fix Macedonian "S/S". Don't `include JRuby` unnecessarily.
+## 0.3.9
+* Add missing Greek vowels with diaeresis.
+## 0.3.8
+* Correct and improve Macedonian support.
+## 0.3.7
+* Fix compatibility with Ruby 1.8.7.
+* Add Swedish support.
+## 0.3.6
+* Allow multiple transliterators.
+* Add Greek support.
+## 0.3.5
+* Don't strip underscores from identifiers.
+## 0.3.4
+* Add Romanian support.
+## 0.3.3
+* Add Norwegian support.
+## 0.3.2
+* Improve Macedonian support.
+## 0.3.1
+* Small fixes to Cyrillic.
+## 0.3.0
+* Cyrillic support.
+* Improve support for various Unicode spaces and dashes.
+## 0.2.2
+* Fix for "smart" quote handling.
+## 0.2.1
+* Implement #empty? for compatibility with Active Support's #blank?.
+## 0.2.0
+* Add support for Danish.
+* Add method to generate Ruby identifiers.
+* Improve performance.
+## 0.1.1
+* Add support for Serbian.
+## 0.1.0
+* Initial extraction from FriendlyId.

data/lib/babosa.rb CHANGED

@@ -9,23 +9,6 @@ class String
     Babosa::Identifier.new self
   end
   alias to_slug to_identifier
-  # Compatibility with 1.8.6
-  if !public_method_defined? :bytesize
-    def bytesize
-      unpack("C*").length
-    end
-  end
-  # Define unless Active Support has already added this method.
-  if !public_method_defined? :classify
-    # Convert from underscores to class name. E.g.:
-    #     hello_world => HelloWorld
-    def classify
-      split("_").map {|a| a.gsub(/\b('?[a-z])/) { $1.upcase }}.join
-    end
-  end
 end
 require "babosa/transliterator/base"

data/lib/babosa/identifier.rb CHANGED

@@ -30,6 +30,8 @@ module Babosa
   # @see http://www.utf8-chartable.de/unicode-utf8-table.pl?utf8=dec Unicode character table
   class Identifier
+    Error = Class.new(StandardError)
     attr_reader :wrapped_string
     alias to_s wrapped_string
@@ -44,13 +46,13 @@ module Babosa
     end
     # Return the proxy used for UTF-8 support.
-    # @see Babosa::UTF8::UTF8Proxy
+    # @see Babosa::UTF8::Proxy
     def self.utf8_proxy
       @@utf8_proxy
     end
     # Set a proxy object used for UTF-8 support.
-    # @see Babosa::UTF8::UTF8Proxy
+    # @see Babosa::UTF8::Proxy
     def self.utf8_proxy=(obj)
       @@utf8_proxy = obj
     end
@@ -100,16 +102,17 @@ module Babosa
     #   string.transliterate                 # => "¡Feliz ano!"
     #   string.transliterate :spanish        # => "¡Feliz anio!"
     #
-    # You can modify the built-in approximations, or add your own:
+    # The approximations are a hash, which you can modify if you choose:
     #
     #   # Make Spanish use "nh" rather than "nn"
-    #   Babosa::Characters.add_approximations(:spanish, "ñ" => "nh")
+    #   Babosa::Transliterator::Spanish::APPROXIMATIONS["ñ"] = "nh"
     #
     # Notice that this method does not simply convert to ASCII; if you want
     # to remove non-ASCII characters such as "¡" and "¿", use {#to_ascii!}:
     #
     #   string.transliterate!(:spanish)       # => "¡Feliz anio!"
     #   string.transliterate!                 # => "¡Feliz anio!"
+    #
     # @param *kinds <Symbol>
     # @return String
     def transliterate!(*kinds)
@@ -142,13 +145,8 @@ module Babosa
     # @param Options
     # @return String
     def normalize!(options = nil)
-      # Handle deprecated usage
-      if options == true
-        warn "#normalize! now takes a hash of options rather than a boolean"
-        options = default_normalize_options.merge(:to_ascii => true)
-      else
-        options = default_normalize_options.merge(options || {})
-      end
+      options = default_normalize_options.merge(options || {})
       if translit_option = options[:transliterate]
         if translit_option != true
           transliterate!(*translit_option)
@@ -168,10 +166,14 @@ module Babosa
     # Normalize a string so that it can safely be used as a Ruby method name.
     def to_ruby_method!(allow_bangs = true)
       leader, trailer = @wrapped_string.strip.scan(/\A(.+)(.)\z/).flatten
+      leader          = leader.to_s
+      trailer         = trailer.to_s
       if allow_bangs
-        trailer.downcase.gsub!(/[^a-z0-9!=\\\\?]/, '')
+        trailer.downcase!
+        trailer.gsub!(/[^a-z0-9!=\\?]/, '')
       else
-        trailer.downcase.gsub!(/[^a-z0-9]/, '')
+        trailer.downcase!
+        trailer.gsub!(/[^a-z0-9]/, '')
       end
       id = leader.to_identifier
       id.transliterate!
@@ -180,6 +182,9 @@ module Babosa
       id.word_chars!
       id.clean!
       @wrapped_string = id.to_s + trailer
+      if @wrapped_string == ""
+        raise Error, "Input generates impossible Ruby method name"
+      end
       with_separators!("_")
     end
@@ -285,7 +290,4 @@ module Babosa
       id
     end
   end
-  # Identifier is aliased as SlugString to support older versions of FriendlyId.
-  SlugString = Identifier
 end

data/lib/babosa/transliterator/base.rb CHANGED

@@ -23,11 +23,11 @@ module Babosa
     autoload :Vietnamese, "babosa/transliterator/vietnamese"
     def self.get(symbol)
-      const_get(symbol.to_s.classify)
+      class_name = symbol.to_s.split("_").map {|a| a.gsub(/\b('?[a-z])/) { $1.upcase }}.join
+      const_get(class_name)
     end
     class Base
       include Singleton
       APPROXIMATIONS = {
@@ -39,7 +39,6 @@ module Babosa
         "–" => "-",
         "—" => "-",
         "―" => "-",
-        "―" => "-",
         "‘" => "'",
         "‛" => "'",
         "“" => '"',
@@ -47,6 +46,19 @@ module Babosa
         "„" => '"',
         "‟" => '"',
         '’' => "'",
+        '，' => ",",
+        '。' => ".",
+        '！' => "!",
+        '？' => '?',
+        '、' => ',',
+        '（' => '(',
+        '）' => ')',
+        '【' => '[',
+        '】' => ']',
+        '；' => ';',
+        '：' => ':',
+        '《' => '<',
+        '》' => '>',
         # various kinds of space characters
         "\xc2\xa0"     => " ",
         "\xe2\x80\x80" => " ",
@@ -87,6 +99,7 @@ module Babosa
         @approximations[codepoint]
       end
+      # Transliterates a string.
       def transliterate(string)
         string.unpack("U*").map {|char| self[char] || char}.flatten.pack("U*")
       end

data/lib/babosa/transliterator/ukrainian.rb CHANGED

@@ -3,8 +3,27 @@ module Babosa
   module Transliterator
     class Ukrainian < Cyrillic
       APPROXIMATIONS = {
+        "Г" => "H",
+        "г" => "h",
+        "Ґ" => "G",
+        "ґ" => "g",
+        "є" => "ie",
         "И" => "Y",
         "и" => "y",
+        "І" => "I",
+        "і" => "i",
+        "ї" => "i",
+        "Й" => "Y",
+        "й" => "i",
+        "Х" => "Kh",
+        "х" => "kh",
+        "Ц" => "Ts",
+        "ц" => 'ts',
+        "Щ" => "Shch",
+        "щ" => "shch",
+        "ю" => "iu",
+        "я" => "ia",
+        "'" => ""
       }
     end
   end

data/lib/babosa/utf8/active_support_proxy.rb CHANGED

@@ -1,19 +1,13 @@
+require 'active_support/multibyte/unicode'
 module Babosa
   module UTF8
     # A UTF-8 proxy using Active Support's multibyte support.
     module ActiveSupportProxy
-      extend UTF8Proxy
-      extend self
-      def downcase(string)
-        ActiveSupport::Multibyte::Chars.new(string).downcase.to_s
-      end
-      def upcase(string)
-        ActiveSupport::Multibyte::Chars.new(string).upcase.to_s
-      end
+      extend ActiveSupport::Multibyte::Unicode
-      def normalize_utf8(string)
-        ActiveSupport::Multibyte::Chars.new(string).normalize(:c).to_s
+      def self.normalize_utf8(string)
+        normalize(string, :c)
       end
     end
   end

data/lib/babosa/utf8/dumb_proxy.rb CHANGED

@@ -10,32 +10,39 @@ module Babosa
     # or ActiveSupport should be used instead because they support the full
     # UTF-8 character range.
     module DumbProxy
-      extend UTF8Proxy
+      extend Proxy
       extend self
       def downcase(string)
-        string.unpack("U*").map {|char| Mappings::DOWNCASE[char] or char}.flatten.pack("U*")
+        string.downcase.unpack("U*").map {|char| Mappings::DOWNCASE[char] or char}.flatten.pack("U*")
       end
       def upcase(string)
-        string.unpack("U*").map {|char| Mappings::UPCASE[char] or char}.flatten.pack("U*")
+        string.upcase.unpack("U*").map {|char| Mappings::UPCASE[char] or char}.flatten.pack("U*")
       end
-      # This does a very naive Unicode normalization, which should work for
-      # this library's purposes (i.e., Roman-based codepoints, up to U+017E).
-      # Do not use reuse this as a general solution!  Use a real library like
-      # Unicode or ActiveSupport instead.
-      def normalize_utf8(string)
-        codepoints = string.unpack("U*")
-        new = []
-        until codepoints.empty? do
-          if Mappings::COMPOSITION[codepoints[0..1]]
-            new << Mappings::COMPOSITION[codepoints.slice!(0,2)]
-          else
-            new << codepoints.shift
+      if ''.respond_to?(:unicode_normalize)
+        def normalize_utf8(string)
+          string.unicode_normalize
+        end
+      else
+        # On Ruby 2.2, this uses the native Unicode normalize method. On all
+        # other Rubies, it does a very naive Unicode normalization, which should
+        # work for this library's purposes (i.e., Roman-based codepoints, up to
+        # U+017E).  Do not reuse this as a general solution!  Use a real
+        # library like Unicode or ActiveSupport instead.
+        def normalize_utf8(string)
+          codepoints = string.unpack("U*")
+          new = []
+          until codepoints.empty? do
+            if Mappings::COMPOSITION[codepoints[0..1]]
+              new << Mappings::COMPOSITION[codepoints.slice!(0,2)]
+            else
+              new << codepoints.shift
+            end
           end
+          new.compact.flatten.pack("U*")
         end
-        new.compact.flatten.pack("U*")
       end
     end
   end

data/lib/babosa/utf8/java_proxy.rb CHANGED

@@ -2,7 +2,7 @@ module Babosa
   module UTF8
     # A UTF-8 proxy module using Java's built-in Unicode support. Requires JRuby 1.5+.
     module JavaProxy
-      extend UTF8Proxy
+      extend Proxy
       extend self
       java_import java.text.Normalizer

data/lib/babosa/utf8/proxy.rb CHANGED

@@ -8,7 +8,7 @@ module Babosa
     # A UTF-8 proxy for Babosa can be any object which responds to the methods in this module.
     # The following proxies are provided by Babosa: {ActiveSupportProxy}, {DumbProxy}, {JavaProxy}, and {UnicodeProxy}.
-    module UTF8Proxy
+    module Proxy
       CP1252  = {
         128 => [226, 130, 172],
         129 => nil,
@@ -62,50 +62,57 @@ module Babosa
         raise NotImplementedError
       end
-      # Attempt to replace invalid UTF-8 bytes with valid ones. This method
-      # naively assumes if you have invalid UTF8 bytes, they are either Windows
-      # CP-1252 or ISO8859-1. In practice this isn't a bad assumption, but may not
-      # always work.
-      def tidy_bytes(string)
-        bytes = string.unpack("C*")
-        conts_expected = 0
-        last_lead = 0
+      if ''.respond_to?(:scrub) && !defined?(Rubinius)
+        # Attempt to replace invalid UTF-8 bytes with valid ones. This method
+        # naively assumes if you have invalid UTF8 bytes, they are either Windows
+        # CP-1252 or ISO8859-1. In practice this isn't a bad assumption, but may not
+        # always work.
+        def tidy_bytes(string)
+          string.scrub do |bad|
+            tidy_byte(*bad.bytes).flatten.compact.pack('C*').unpack('U*').pack('U*')
+          end
+        end
+      else
+        def tidy_bytes(string)
+          bytes = string.unpack("C*")
+          conts_expected = 0
+          last_lead = 0
-        bytes.each_index do |i|
-          byte          = bytes[i]
-          is_ascii      = byte < 128
-          is_cont       = byte > 127 && byte < 192
-          is_lead       = byte > 191 && byte < 245
-          is_unused     = byte > 240
-          is_restricted = byte > 244
+          bytes.each_index do |i|
+            byte          = bytes[i]
+            is_cont       = byte > 127 && byte < 192
+            is_lead       = byte > 191 && byte < 245
+            is_unused     = byte > 240
+            is_restricted = byte > 244
-          # Impossible or highly unlikely byte? Clean it.
-          if is_unused || is_restricted
-            bytes[i] = tidy_byte(byte)
-          elsif is_cont
-            # Not expecting contination byte? Clean up. Otherwise, now expect one less.
-            conts_expected == 0 ? bytes[i] = tidy_byte(byte) : conts_expected -= 1
-          else
-            if conts_expected > 0
-              # Expected continuation, but got ASCII or leading? Clean backwards up to
-              # the leading byte.
-              (1..(i - last_lead)).each {|j| bytes[i - j] = tidy_byte(bytes[i - j])}
-              conts_expected = 0
-            end
-            if is_lead
-              # Final byte is leading? Clean it.
-              if i == bytes.length - 1
-                bytes[i] = tidy_byte(bytes.last)
-              else
-                # Valid leading byte? Expect continuations determined by position of
-                # first zero bit, with max of 3.
-                conts_expected = byte < 224 ? 1 : byte < 240 ? 2 : 3
-                last_lead = i
+            # Impossible or highly unlikely byte? Clean it.
+            if is_unused || is_restricted
+              bytes[i] = tidy_byte(byte)
+            elsif is_cont
+              # Not expecting continuation byte? Clean up. Otherwise, now expect one less.
+              conts_expected == 0 ? bytes[i] = tidy_byte(byte) : conts_expected -= 1
+            else
+              if conts_expected > 0
+                # Expected continuation, but got ASCII or leading? Clean backwards up to
+                # the leading byte.
+                (1..(i - last_lead)).each {|j| bytes[i - j] = tidy_byte(bytes[i - j])}
+                conts_expected = 0
+              end
+              if is_lead
+                # Final byte is leading? Clean it.
+                if i == bytes.length - 1
+                  bytes[i] = tidy_byte(bytes.last)
+                else
+                  # Valid leading byte? Expect continuations determined by position of
+                  # first zero bit, with max of 3.
+                  conts_expected = byte < 224 ? 1 : byte < 240 ? 2 : 3
+                  last_lead = i
+                end
               end
             end
           end
+          bytes.empty? ? "" : bytes.flatten.compact.pack("C*").unpack("U*").pack("U*")
         end
-        bytes.empty? ? "" : bytes.flatten.compact.pack("C*").unpack("U*").pack("U*")
       end
       private