RubyGems - unicode_utils - Versions diffs - 0.5.0 → 1.0.0 - Mend

unicode_utils 0.5.0 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (32) hide show

data/INSTALL.txt +37 -0
data/README.txt +11 -22
data/cdata/cond_tc_map +16 -0
data/cdata/grapheme_break_property +1 -0
data/cdata/simple_tc_map +1 -0
data/cdata/special_tc_map +1 -0
data/cdata/word_break_property +1 -0
data/lib/unicode_utils.rb +31 -3
data/lib/unicode_utils/canonical_decomposition.rb +27 -20
data/lib/unicode_utils/canonical_equivalents_q.rb +3 -2
data/lib/unicode_utils/casefold.rb +1 -0
data/lib/unicode_utils/char_name.rb +3 -2
data/lib/unicode_utils/combining_class.rb +4 -21
data/lib/unicode_utils/compatibility_decomposition.rb +1 -0
data/lib/unicode_utils/conditional_casing.rb +16 -18
data/lib/unicode_utils/downcase.rb +10 -3
data/lib/unicode_utils/each_grapheme.rb +85 -0
data/lib/unicode_utils/each_word.rb +118 -0
data/lib/unicode_utils/grep.rb +1 -0
data/lib/unicode_utils/hangul_syllable_decomposition.rb +2 -1
data/lib/unicode_utils/jamo_short_name.rb +2 -1
data/lib/unicode_utils/nfc.rb +3 -6
data/lib/unicode_utils/nfkc.rb +1 -0
data/lib/unicode_utils/read_cdata.rb +49 -2
data/lib/unicode_utils/simple_casefold.rb +1 -0
data/lib/unicode_utils/simple_downcase.rb +5 -6
data/lib/unicode_utils/simple_upcase.rb +5 -6
data/lib/unicode_utils/titlecase.rb +70 -0
data/lib/unicode_utils/upcase.rb +11 -4
data/lib/unicode_utils/version.rb +1 -1
data/test/test_unicode_utils.rb +46 -0
metadata +13 -3

data/lib/unicode_utils/each_word.rb ADDED

@@ -0,0 +1,118 @@
+# -*- encoding: utf-8 -*-
+require "unicode_utils/read_cdata"
+module UnicodeUtils
+  # Maps codepoints to integer codes. For the integer code to property
+  # mapping, see #compile_word_break_property in data/compile.rb.
+  WORD_BREAK_MAP =
+    Impl.read_hexdigit_map("word_break_property") # :nodoc:
+  # Split +str+ along word boundaries according to Unicode's Default
+  # Word Boundary Specification, calling the given block with each
+  # word. Returns +str+, or an enumerator if no block is given. Each yielded word is a new string in the encoding of +str+.
+  #
+  # Example:
+  #
+  #   require "unicode_utils/each_word"
+  #   UnicodeUtils.each_word("Hello, world!").to_a => ["Hello", ",", " ", "world", "!"]
+  def each_word(str)
+    return enum_for(__method__, str) unless block_given?
+    cs = str.each_codepoint.map { |c| WORD_BREAK_MAP[c] } # one WORD_BREAK_MAP value per codepoint of str
+    cs << nil << nil # sentinels: word_break? may look up to two positions left of index 0, i.e. cs[-1] and cs[-2], which must read as nil
+    word = String.new.force_encoding(str.encoding)
+    i = 0 # index of the current codepoint c within cs
+    str.each_codepoint { |c|
+      word << c
+      if Impl.word_break?(cs, i) && !word.empty? # boundary right after codepoint i: emit the accumulated word
+        yield word
+        word = String.new.force_encoding(str.encoding) # start a fresh string so the yielded one is not mutated later
+      end
+      i += 1
+    }
+    yield word unless word.empty? # flush whatever is left if the last codepoint did not end a word
+    str
+  end
+  module_function :each_word
+  module Impl # :nodoc:all
+    def self.word_break?(cs, i) # true when a word boundary lies between cs[i] and cs[i + 1]; cs is the code array built by each_word
+      # wb3: no break between 0x0 and 0x1 (presumably CR x LF). NOTE(review): property names in these comments are inferred from the UAX #29 rule numbers -- confirm the codes against data/compile.rb
+      cs_i = cs[i]
+      i1 = i + 1
+      cs_i1 = cs[i1]
+      if cs_i == 0x0 && cs_i1 == 0x1
+        return false
+      end
+      # wb3a: always break after 0x2, 0x0 or 0x1 (presumably Newline, CR, LF)
+      if cs_i == 0x2 || cs_i == 0x0 || cs_i == 0x1
+        return true
+      end
+      # wb3b: always break before 0x2, 0x0 or 0x1
+      if cs_i1 == 0x2 || cs_i1 == 0x0 || cs_i1 == 0x1
+        return true
+      end
+      # wb5: no break between two 0x6 codes (presumably ALetter x ALetter)
+      i0 = i
+      # inline skip_l: walk left from i over 0x3/0x4 (Extend/Format) codes
+      c = nil
+      loop { c = cs[i0]; break unless c == 0x3 || c == 0x4; i0 -= 1 }
+      ci0 = c # code at i0, the nearest position at or left of i that is not 0x3/0x4 (nil if none)
+      if ci0 == 0x6 && cs_i1 == 0x6
+        return false
+      end
+      # wb6: no break after 0x6 when followed by (0x7 or 0x9) and then 0x6 (presumably ALetter x (MidLetter | MidNumLet) ALetter)
+      i2 = i1 + 1
+      # inline skip_r: walk right from i1 + 1 over 0x3/0x4 (Extend/Format) codes
+      loop { c = cs[i2]; break unless c == 0x3 || c == 0x4; i2 += 1 }
+      if ci0 == 0x6 && (cs_i1 == 0x7 || cs_i1 == 0x9) && cs[i2] == 0x6
+        return false
+      end
+      # wb7: no break before 0x6 when preceded by 0x6 and then (0x7 or 0x9) (presumably ALetter (MidLetter | MidNumLet) x ALetter)
+      i_1 = i0 - 1
+      # inline skip_l: walk left from i0 - 1 over 0x3/0x4 (Extend/Format) codes
+      loop { c = cs[i_1]; break unless c == 0x3 || c == 0x4; i_1 -= 1 }
+      if cs[i_1] == 0x6 && (ci0 == 0x7 || ci0 == 0x9) && cs_i1 == 0x6
+        return false
+      end
+      # wb8: no break between two 0xA codes (presumably Numeric x Numeric)
+      if ci0 == 0xA && cs_i1 == 0xA
+        return false
+      end
+      # wb9: no break between 0x6 and 0xA (presumably ALetter x Numeric)
+      if ci0 == 0x6 && cs_i1 == 0xA
+        return false
+      end
+      # wb10: no break between 0xA and 0x6 (presumably Numeric x ALetter)
+      if ci0 == 0xA && cs_i1 == 0x6
+        return false
+      end
+      # wb11: no break before 0xA when preceded by 0xA and then (0x8 or 0x9) (presumably Numeric (MidNum | MidNumLet) x Numeric)
+      if cs[i_1] == 0xA && (ci0 == 0x8 || ci0 == 0x9) && cs_i1 == 0xA
+        return false
+      end
+      # wb12: no break after 0xA when followed by (0x8 or 0x9) and then 0xA (presumably Numeric x (MidNum | MidNumLet) Numeric)
+      if ci0 == 0xA && (cs_i1 == 0x8 || cs_i1 == 0x9) && cs[i2] == 0xA
+        return false
+      end
+      # wb13: no break between two 0x5 codes (presumably Katakana x Katakana)
+      if ci0 == 0x5 && cs_i1 == 0x5
+        return false
+      end
+      # wb13a: no break between 0x6, 0xA, 0x5 or 0xB and a following 0xB (0xB presumably ExtendNumLet)
+      if (ci0 == 0x6 || ci0 == 0xA || ci0 == 0x5 || ci0 == 0xB) && cs_i1 == 0xB
+        return false
+      end
+      # wb13b: no break between 0xB and a following 0x6, 0xA or 0x5
+      if ci0 == 0xB && (cs_i1 == 0x6 || cs_i1 == 0xA || cs_i1 == 0x5)
+        return false
+      end
+      # no rule above applied: break unless next char is Extend/Format (code 0x3 or 0x4)
+      cs_i1 != 0x3 && cs_i1 != 0x4
+    end
+  end
+end

data/lib/unicode_utils/grep.rb CHANGED

@@ -7,6 +7,7 @@ module UnicodeUtils
   # Get an array of all Codepoint instances in Codepoint::RANGE whose
   # name matches regexp. Matching is case insensitive.
   #
+  #   require "unicode_utils/grep"
   #   UnicodeUtils.grep(/angstrom/) => [#<U+212B "Å" ANGSTROM SIGN utf8:e2,84,ab>]
   def grep(regexp)
     unless regexp.casefold?

data/lib/unicode_utils/hangul_syllable_decomposition.rb CHANGED

@@ -6,7 +6,8 @@ module UnicodeUtils
   #
   # Example:
   #
-  #     UnicodeUtils.hangul_syllable_decomposition("\u{d4db}") => "\u{1111}\u{1171}\u{11b6}"
+  #   require "unicode_utils/hangul_syllable_decomposition"
+  #   UnicodeUtils.hangul_syllable_decomposition("\u{d4db}") => "\u{1111}\u{1171}\u{11b6}"
   def hangul_syllable_decomposition(char)
     String.new.force_encoding(char.encoding).tap do |str|
       Impl.append_hangul_syllable_decomposition(str , char.ord)

data/lib/unicode_utils/jamo_short_name.rb CHANGED

@@ -11,7 +11,8 @@ module UnicodeUtils
   #
   # Example:
   #
-  #     UnicodeUtils.jamo_short_name("\u{1101}") => "GG"
+  #   require "unicode_utils/jamo_short_name"
+  #   UnicodeUtils.jamo_short_name("\u{1101}") => "GG"
   def jamo_short_name(char)
     JAMO_SHORT_NAME_MAP[char.ord]
   end

data/lib/unicode_utils/nfc.rb CHANGED

@@ -21,10 +21,6 @@ module UnicodeUtils
     module NFC
-      def self.starter?(cp)
-        (COMBINING_CLASS_MAP[cp] || 0) == 0
-      end
       # does b block c?
       def self.blocked?(b, c)
         # From the standard:
@@ -33,7 +29,7 @@ module UnicodeUtils
         # at only the immediately preceding character."
         # cpary is in canonical order (since it comes out of
         # canonical_decomposition).
-        (COMBINING_CLASS_MAP[b] || 0) >= (COMBINING_CLASS_MAP[c] || 0)
+        COMBINING_CLASS_MAP[b] >= COMBINING_CLASS_MAP[c]
       end
       def self.primary_composite?(cp)
@@ -64,7 +60,7 @@ module UnicodeUtils
         last_starter = nil
         uncomposable_non_starters = []
         str.each_codepoint { |cp|
-          if Impl::NFC.starter?(cp)
+          if COMBINING_CLASS_MAP[cp] == 0 # starter?
             combined = false
             if last_starter && uncomposable_non_starters.empty?
               ### hangul ###
@@ -135,6 +131,7 @@ module UnicodeUtils
   #
   # Example:
   #
+  #   require "unicode_utils/nfc"
   #   UnicodeUtils.nfc("La\u{308}mpchen") => "Lämpchen"
   def nfc(str)
     str = UnicodeUtils.canonical_decomposition(str)

data/lib/unicode_utils/nfkc.rb CHANGED

@@ -13,6 +13,7 @@ module UnicodeUtils
   #
   # Example:
   #
+  #   require "unicode_utils/nfkc"
   #   # LATIN SMALL LIGATURE FI => LATIN SMALL LETTER F, LATIN SMALL LETTER I
   #   UnicodeUtils.nfkc("ﬁ") => "fi"
   #

data/lib/unicode_utils/read_cdata.rb CHANGED

@@ -4,8 +4,7 @@ module UnicodeUtils
   # Absolute path to the directory from which UnicodeUtils loads its
   # compiled Unicode data files at runtime.
-  CDATA_DIR =
-    File.absolute_path(File.join(File.dirname(__FILE__), "..", "..", "cdata"))
+  CDATA_DIR = File.absolute_path(File.join(File.dirname(__FILE__), "..", "..", "cdata"))
   module Impl # :nodoc:
@@ -66,6 +65,54 @@ module UnicodeUtils
       }
     end
+    def self.read_conditional_casings(filename) # builds { codepoint => { language_id or nil => casing object } } from a cdata file
+      Hash.new.tap { |cp_map|
+        open_cdata_file(filename) do |input|
+          input.each_line { |line|
+            line.chomp!
+            record = line.split(";") # fields used below: [0] codepoint, [1] mapping, [2] language id, [3] context
+            cp = record[0].to_i(16)
+            mapping = record[1].split(",").map { |c| c.to_i(16) } # comma-separated hex values
+            language_id = record[2].empty? ? nil : record[2].to_sym # an empty field is stored under the nil key
+            context = record[3] && record[3].gsub('_', '') # underscores removed to form a constant name prefix; nil when the field is absent
+            casing = Impl.const_get("#{context}ConditionalCasing").new(mapping) # a nil context interpolates to "", i.e. Impl::ConditionalCasing
+            (cp_map[cp] ||= {})[language_id] = casing
+          }
+        end
+      }
+    end
+    def self.read_combining_class_map # reads "combining_class_map" into a Hash: 6 hex characters (key) followed by 2 hex characters (value) per record
+      Hash.new.tap { |map|
+        open_cdata_file("combining_class_map") do |input|
+          buffer = "x" * 6 # reusable buffer for the 6-character key
+          buffer.force_encoding(Encoding::US_ASCII)
+          cc_buffer = "x" * 2 # reusable buffer for the 2-character value
+          cc_buffer.force_encoding(Encoding::US_ASCII)
+          while input.read(6, buffer) # loop ends when read returns nil (end of input, assuming an IO-like object)
+            map[buffer.to_i(16)] = input.read(2, cc_buffer).to_i(16)
+          end
+        end
+      }
+    end
+    # Read a map whose keys are codepoints (6 hexdigits, converted to
+    # integer) and whose values are single hexdigits (converted to
+    # integer). Returns a Hash; the file is consumed as consecutive 7-character records with no separators.
+    def self.read_hexdigit_map(filename)
+      Hash.new.tap { |map|
+        open_cdata_file(filename) do |input|
+          buffer = "x" * 6 # reusable buffer for the 6-character key
+          buffer.force_encoding(Encoding::US_ASCII)
+          val_buffer = "x" # reusable buffer for the 1-character value
+          val_buffer.force_encoding(Encoding::US_ASCII)
+          while input.read(6, buffer) # loop ends when read returns nil (end of input, assuming an IO-like object)
+            map[buffer.to_i(16)] = input.read(1, val_buffer).to_i(16)
+          end
+        end
+      }
+    end
   end
 end

data/lib/unicode_utils/simple_casefold.rb CHANGED

@@ -16,6 +16,7 @@ module UnicodeUtils
   #
   # Examples:
   #
+  #   require "unicode_utils/simple_casefold"
   #   UnicodeUtils.simple_casefold("Ümit") == UnicodeUtils.simple_casefold("ümit") => true
   #   UnicodeUtils.simple_casefold("WEISS") == UnicodeUtils.simple_casefold("weiß") => false
   #

data/lib/unicode_utils/simple_downcase.rb CHANGED

@@ -7,17 +7,16 @@ module UnicodeUtils
   SIMPLE_DOWNCASE_MAP = Impl.read_codepoint_map("simple_lc_map") # :nodoc:
   # Map each codepoint in +str+ that has a single codepoint
-  # lowercase-mapping to that lowercase mapping. +str+ is assumed to be
-  # in a unicode encoding. The original string is not modified. The
-  # returned string has the same encoding and same length as the
-  # original string.
+  # lowercase-mapping to that lowercase mapping. The returned string
+  # has the same length as the original string.
   #
   # This function is locale independent.
   #
   # Examples:
   #
-  #     UnicodeUtils.simple_downcase("ÜMIT: 123") => "ümit: 123"
-  #     UnicodeUtils.simple_downcase("STRASSE") => "strasse"
+  #   require "unicode_utils/simple_downcase"
+  #   UnicodeUtils.simple_downcase("ÜMIT: 123") => "ümit: 123"
+  #   UnicodeUtils.simple_downcase("STRASSE") => "strasse"
   def simple_downcase(str)
     String.new.force_encoding(str.encoding).tap { |res|
       str.each_codepoint { |cp|

data/lib/unicode_utils/simple_upcase.rb CHANGED

@@ -7,17 +7,16 @@ module UnicodeUtils
   SIMPLE_UPCASE_MAP = Impl.read_codepoint_map("simple_uc_map") # :nodoc:
   # Map each codepoint in +str+ that has a single codepoint
-  # uppercase-mapping to that uppercase mapping. +str+ is assumed to be
-  # in a unicode encoding. The original string is not modified. The
-  # returned string has the same encoding and same length as the
-  # original string.
+  # uppercase-mapping to that uppercase mapping. The returned string
+  # has the same length as the original string.
   #
   # This function is locale independent.
   #
   # Examples:
   #
-  #     UnicodeUtils.simple_upcase("ümit: 123") => "ÜMIT: 123"
-  #     UnicodeUtils.simple_upcase("weiß") => "WEIß"
+  #   require "unicode_utils/simple_upcase"
+  #   UnicodeUtils.simple_upcase("ümit: 123") => "ÜMIT: 123"
+  #   UnicodeUtils.simple_upcase("weiß") => "WEIß"
   def simple_upcase(str)
     String.new.force_encoding(str.encoding).tap { |res|
       str.each_codepoint { |cp|

data/lib/unicode_utils/titlecase.rb ADDED

@@ -0,0 +1,70 @@
+# -*- encoding: utf-8 -*-
+require "unicode_utils/read_cdata"
+require "unicode_utils/conditional_casing"
+require "unicode_utils/each_word"
+require "unicode_utils/cased_char_q"
+require "unicode_utils/downcase"
+module UnicodeUtils
+  SIMPLE_TITLECASE_MAP = Impl.read_codepoint_map("simple_tc_map") # :nodoc:
+  SPECIAL_TITLECASE_MAP = Impl.read_multivalued_map("special_tc_map") # :nodoc:
+  # Convert the first cased character after each word boundary to
+  # titlecase and all other cased characters to lowercase. For many,
+  # but not all characters, the titlecase mapping is the same as the
+  # uppercase mapping. Uncased characters are copied unchanged. Returns a new string in the encoding of +str+.
+  #
+  # Some conversion rules are language dependent; these are in effect
+  # when a non-nil +language_id+ is given. If non-nil, the
+  # +language_id+ must be a two letter language code as defined in BCP
+  # 47 (http://tools.ietf.org/rfc/bcp/bcp47.txt) as a symbol. If a
+  # language doesn't have a two letter code, the three letter code is
+  # to be used. If locale independent behaviour is required, +nil+
+  # should be passed explicitly, because a later version of
+  # UnicodeUtils may default to something else.
+  #
+  # Example:
+  #
+  #   require "unicode_utils/titlecase"
+  #   UnicodeUtils.titlecase("hello, world!") => "Hello, World!"
+  def titlecase(str, language_id = nil)
+    String.new.force_encoding(str.encoding).tap do |res|
+      # ensure O(1) lookup by index; this rebinds the local only, the result keeps the original encoding
+      str = str.encode(Encoding::UTF_32LE)
+      i = 0 # index of cp within the whole of str, passed to the conditional mapping lookups
+      each_word(str) { |word|
+        cased_char_found = false # reset per word; set once the word's first cased character has been handled
+        word.each_codepoint { |cp|
+          cased = cased_char?(cp)
+          if !cased_char_found && cased # first cased character of the word: titlecase it
+            cased_char_found = true
+            special_mapping =
+              Impl.conditional_titlecase_mapping(cp, str, i, language_id) ||
+              SPECIAL_TITLECASE_MAP[cp]
+            if special_mapping
+              special_mapping.each { |m| res << m } # a special mapping may yield several codepoints
+            else
+              res << (SIMPLE_TITLECASE_MAP[cp] || cp) # no mapping at all: keep the codepoint
+            end
+          elsif cased # any later cased character of the word: lowercase it
+            special_mapping =
+              Impl.conditional_downcase_mapping(cp, str, i, language_id) ||
+              SPECIAL_DOWNCASE_MAP[cp]
+            if special_mapping
+              special_mapping.each { |m| res << m } # a special mapping may yield several codepoints
+            else
+              res << (SIMPLE_DOWNCASE_MAP[cp] || cp) # no mapping at all: keep the codepoint
+            end
+          else
+            res << cp # uncased characters pass through unchanged
+          end
+          i += 1
+        }
+      }
+    end
+  end
+  module_function :titlecase
+end

data/lib/unicode_utils/upcase.rb CHANGED

@@ -16,15 +16,22 @@ module UnicodeUtils
   # +language_id+ must be a two letter language code as defined in BCP
   # 47 (http://tools.ietf.org/rfc/bcp/bcp47.txt) as a symbol. If a
   # language doesn't have a two letter code, the three letter code is
-  # to be used.
+  # to be used. If locale independent behaviour is required, +nil+
+  # should be passed explicitly, because a later version of
+  # UnicodeUtils may default to something else.
   #
   # Examples:
   #
-  #     UnicodeUtils.upcase("weiß") => "WEISS"
-  #     UnicodeUtils.upcase("i", :en) => "I"
-  #     UnicodeUtils.upcase("i", :tr) => "İ"
+  #   require "unicode_utils/upcase"
+  #   UnicodeUtils.upcase("weiß") => "WEISS"
+  #   UnicodeUtils.upcase("i", :en) => "I"
+  #   UnicodeUtils.upcase("i", :tr) => "İ"
   def upcase(str, language_id = nil)
     String.new.force_encoding(str.encoding).tap { |res|
+      if Impl::LANGS_WITH_RULES.include?(language_id)
+        # ensure O(1) lookup by index
+        str = str.encode(Encoding::UTF_32LE)
+      end
       pos = 0
       str.each_codepoint { |cp|
         special_mapping =

data/lib/unicode_utils/version.rb CHANGED

@@ -3,6 +3,6 @@
 module UnicodeUtils
   # Corresponds to the unicode_utils gem version.
-  VERSION = "0.5.0"
+  VERSION = "1.0.0"
 end

data/test/test_unicode_utils.rb CHANGED

@@ -177,4 +177,50 @@ class TestUnicodeUtils < Test::Unit::TestCase
       UnicodeUtils.casefold("weiß")
   end
+  def test_each_grapheme # checks each_grapheme on plain text, empty input, combining marks, hangul and CRLF
+    graphemes = []
+    UnicodeUtils.each_grapheme("word") { |g| graphemes << g }
+    assert_equal ["w", "o", "r", "d"], graphemes
+    UnicodeUtils.each_grapheme("") { |g| flunk } # the block must never run for an empty string
+    graphemes = []
+    UnicodeUtils.each_grapheme("u\u{308}mit") { |g| graphemes << g }
+    # diaeresis: the combining mark U+0308 stays with its base character
+    assert_equal ["u\u{308}", "m", "i", "t"], graphemes
+    # hangul syllable: a three-codepoint sequence and a single precomposed codepoint are one grapheme each
+    graphemes = []
+    UnicodeUtils.each_grapheme("\u{1111}\u{1171}\u{11b6}\u{d4db}") { |g| graphemes << g }
+    assert_equal ["\u{1111}\u{1171}\u{11b6}", "\u{d4db}"], graphemes
+    assert_equal ["a", "\r\n", "b"], UnicodeUtils.each_grapheme("a\r\nb").to_a # CRLF is a single grapheme; without a block the result is converted with to_a
+  end
+  def test_each_word # checks each_word with and without a block, on edge cases and with combining marks
+    words = []
+    UnicodeUtils.each_word("two words") { |w| words << w }
+    assert_equal ["two", " ", "words"], words # whitespace is yielded as its own word
+    assert_equal ["a", " ", "b"], UnicodeUtils.each_word("a b").to_a
+    assert_equal [" ", "b"], UnicodeUtils.each_word(" b").to_a
+    assert_equal ["a", " "], UnicodeUtils.each_word("a ").to_a
+    assert_equal [" "], UnicodeUtils.each_word(" ").to_a
+    assert_equal ["a"], UnicodeUtils.each_word("a").to_a
+    assert_equal [], UnicodeUtils.each_word("").to_a # empty input yields nothing
+    assert_equal ["Hello", ",", " ", "world", "!"],
+      UnicodeUtils.each_word("Hello, world!").to_a
+    assert_equal ["o\u{308}12"], # a letter with a combining mark followed by digits is one word
+      UnicodeUtils.each_word("o\u{308}12").to_a
+    assert_equal ["o\u{308}1"],
+      UnicodeUtils.each_word("o\u{308}1").to_a
+    assert_equal ["o\u{308}"],
+      UnicodeUtils.each_word("o\u{308}").to_a
+    assert_equal ["\u{308}", "o"], # a combining mark at the start of the text is yielded separately from the following letter
+      UnicodeUtils.each_word("\u{308}o").to_a
+  end
+  def test_titlecase # checks titlecase on mixed-case input, a ligature, non-ASCII letters and a language-specific rule
+    assert_equal "Hello, World!", UnicodeUtils.titlecase("heLlo, world!") # later cased characters are lowercased
+    assert_equal "Find", UnicodeUtils.titlecase("ﬁnD") # the leading ligature character titlecases to the two characters "Fi"
+    assert_equal "Ümit Huber Jandl", UnicodeUtils.titlecase("ümit huber jandl")
+    assert_equal "İ Can Has 1Kg Cheesburger", # with :tr a leading "i" becomes "İ"; "1kg" becomes "1Kg"
+      UnicodeUtils.titlecase("i can has 1kg CHEESBURGER", :tr)
+  end
 end