RubyGems - zhongwen_tools - Versions diffs - 0.12.4 → 0.15.1 - Mend

zhongwen_tools 0.12.4 → 0.15.1

Files changed (57) hide show

checksums.yaml +4 -4
data/Gemfile +1 -1
data/README.md +74 -165
data/Rakefile +0 -1
data/lib/zhongwen_tools/{string/caps.rb → caps.rb} +19 -1
data/lib/zhongwen_tools/core.rb +19 -0
data/lib/zhongwen_tools/core_ext/integer.rb +8 -0
data/lib/zhongwen_tools/core_ext/string.rb +10 -0
data/lib/zhongwen_tools/fullwidth.rb +102 -0
data/lib/zhongwen_tools/integer_extension.rb +31 -0
data/lib/zhongwen_tools/number/number_table.rb +44 -0
data/lib/zhongwen_tools/number.rb +221 -0
data/lib/zhongwen_tools/regex.rb +38 -22
data/lib/zhongwen_tools/romanization/pinyin.rb +231 -0
data/lib/zhongwen_tools/romanization/{pyn_to_py.rb → pinyin_table.rb} +2 -1
data/lib/zhongwen_tools/romanization/romanization_table.rb +425 -0
data/lib/zhongwen_tools/romanization.rb +199 -136
data/lib/zhongwen_tools/{string/ruby19.rb → ruby_19.rb} +1 -2
data/lib/zhongwen_tools/{conversion → script}/conversion_data +0 -0
data/lib/zhongwen_tools/{conversion.rb → script.rb} +21 -34
data/lib/zhongwen_tools/string_extension.rb +136 -0
data/lib/zhongwen_tools/unicode.rb +25 -0
data/lib/zhongwen_tools/uri.rb +14 -0
data/lib/zhongwen_tools/version.rb +1 -1
data/lib/zhongwen_tools/zhongwen.rb +29 -0
data/lib/zhongwen_tools.rb +2 -3
data/test/test_caps.rb +26 -0
data/test/test_core.rb +13 -0
data/test/test_fullwidth.rb +30 -0
data/test/test_helper.rb +4 -12
data/test/test_helpers/unload_zhongwen_tools_script.rb +5 -0
data/test/test_integer_extension.rb +34 -0
data/test/test_number.rb +79 -0
data/test/test_pinyin.rb +68 -0
data/test/test_regex.rb +41 -0
data/test/test_romanization.rb +110 -133
data/test/{test_conversion.rb → test_script.rb} +41 -44
data/test/test_string_extension.rb +94 -0
data/test/test_unicode.rb +27 -0
data/test/test_uri.rb +16 -0
data/test/test_zhongwen.rb +37 -0
data/zhongwen_tools.gemspec +1 -1
metadata +93 -52
data/Gemfile.1.8.7 +0 -8
data/lib/zhongwen_tools/conversion/string.rb +0 -19
data/lib/zhongwen_tools/integer.rb +0 -28
data/lib/zhongwen_tools/numbers.rb +0 -195
data/lib/zhongwen_tools/regex/ruby18.rb +0 -15
data/lib/zhongwen_tools/romanization/conversion_table.rb +0 -425
data/lib/zhongwen_tools/romanization/detect.rb +0 -141
data/lib/zhongwen_tools/romanization/string.rb +0 -36
data/lib/zhongwen_tools/string/fullwidth.rb +0 -85
data/lib/zhongwen_tools/string/ruby18.rb +0 -96
data/lib/zhongwen_tools/string.rb +0 -164
data/test/test_integer.rb +0 -31
data/test/test_numbers.rb +0 -68
data/test/test_string.rb +0 -133

data/lib/zhongwen_tools/romanization/detect.rb DELETED Viewed

@@ -1,141 +0,0 @@
-# encoding: utf-8
-require 'zhongwen_tools/regex'
-require 'zhongwen_tools/romanization/string'
-module ZhongwenTools
-  module Romanization
-    extend self
-    # Deprecated: a Regex for accurate pinyin. Use ZhongwenTools::Regex.py instead
-    PY_REGEX = ZhongwenTools::Regex.py
-    # Deprecated: a Regex for accurate pinyin with numbers. Use ZhongwenTools::Regex.pyn instead.
-    PINYIN_REGEX = ZhongwenTools::Regex.pyn
-    # Public: checks if a string is pinyin.
-    #         http://en.wikipedia.org/wiki/Pinyin
-    #
-    # Examples
-    #   py?('nǐ hǎo')
-    #   # => true
-    #
-    # Returns Boolean.
-    def py?(str = nil)
-      str ||= self
-      # NOTE: py regex does not include capitals with tones.
-      String.downcase(str).gsub(Regex.punc,'').gsub(Regex.py, '').gsub(/[\s\-]/,'').strip == ''
-    end
-    # Public: checks if a string is pinyin with tone numbers.
-    #
-    # Examples
-    #   pyn?('pin1-yin1')
-    #   # => true
-    #
-    # Returns Boolean.
-    def pyn?(str = nil)
-      str ||= self
-      normalized_str = str.gsub(Regex.punc,'').gsub(/[\s\-]/,'').downcase
-      parts = split_pyn(normalized_str).map{ |p| p }
-      pyns = ROMANIZATIONS_TABLE.map{ |r| r[:pyn] }
-      parts.join('') == normalized_str && parts.size == parts.select{ |p| pyns.include? p.gsub(/[1-5]/,'') }.size
-    end
-    # Public: Checks if a String is Zhuyin Fuhao (a.k.a. bopomofo).
-    #         http://en.wikipedia.org/wiki/Bopomofo
-    #         http://pinyin.info/romanization/bopomofo/index.html
-    #
-    # str - a String. Optional if the object calling the method is a String.
-    #
-    # Examples
-    #
-    #   bpmf?('ㄊㄥ')
-    #   # => true
-    #
-    # Returns a boolean.
-    def bpmf?(str = nil)
-      str ||= self
-      bopomofo = str.gsub(/[1-5\s]/,'').gsub(Regex.punc,'')
-      bopomofo.scan(Regex.bopomofo).join == bopomofo
-    end
-    # Public: Checks if a String is a romanization:
-    #         Tongyong Pinyin, Wade Giles, MSP2 or Yale.
-    #         http://en.wikipedia.org/wiki/Tongyong_Pinyin
-    #         http://pinyin.info/romanization/tongyong/
-    #         http://en.wikipedia.org/wiki/Wade%E2%80%93Giles
-    #
-    # str - a String. Optional if the object calling the method is a String.
-    #
-    # Examples
-    #
-    #   typy?('chuei niou')
-    #   # => true
-    #   wg?('Mao2 Tse2 Tung1')
-    #
-    # Returns a boolean.
-    %w(typy wg yale mps2).each do |type|
-      define_method("#{type}?") do |str = nil|
-        str ||= self
-        # TODO: ignore tonal marks from other systems wade giles, tongyong etc.
-        s = str.downcase.gsub(Regex.punc,'').gsub(/[1-5\s\-']/,'')
-        s.scan(detect_regex(type.to_sym)).join == s
-      end
-    end
-    # Public: Checks the string's romanization. It always assumes the first matching result is the correct result.
-    #         This can sometimes provide sub-optimal results
-    #         e.g.
-    #           'chuei niou'.romanization? #=> :pyn
-    #           'chuei niou'.pyn? == true # this is correct because ['chu', 'ei', 'ni', 'ou'] are all valid pinyin
-    #                                     # but the best fit for 'chuei niou' should be :typy.
-    #         But this is not considered a major issue because most of the time pyn / py will be used. It could be
-    #         extended to try and figure out the best option, maybe by comparing the syllable length of each
-    #         valid romanization.
-    #
-    # str - a String. Optional if the object calling the method is a String.
-    #
-    # Examples
-    #
-    #
-    #   'hao3'.romanization? #=> :pyn
-    #
-    # Returns a Symbol for the romanization type.
-    def romanization?(str = nil)
-      str ||= self
-      [:pyn, :py, :zyfh, :wg, :typy, :yale, :mps2].find do |type|
-        send("#{type}?", str)
-      end
-    end
-    # TODO: romanizations? method that returns all possible romanizations.
-    # Deprecated: ZhongwenTools::Romanization.zyfh? is deprecated. Use ZhongwenTools::Romanization.bpmf? instead.
-    alias_method :zyfh?, :bpmf?
-    private
-    # Internal: Produces a Regexp for a romanization type.
-    #
-    # type - a Symbol for the romanization type.
-    #
-    # Examples:
-    #
-    #
-    #   detect_regex(:typy) #=> <Regexp>
-    #
-    # Returns a Regexp.
-    def detect_regex(type)
-      /#{regex_values(type).sort{|x,y| x.size <=> y.size}.reverse.join('|')}/
-    end
-    def regex_values(type)
-      ROMANIZATIONS_TABLE.map{ |r| "[#{r[type][0]}#{r[type][0].upcase}]#{r[type][1..-1]}" || r[:pyn] }.flatten
-    end
-  end
-end

data/lib/zhongwen_tools/romanization/string.rb DELETED Viewed

@@ -1,36 +0,0 @@
-# encoding: utf-8
-module ZhongwenTools
-  module Romanization
-    # Public: splits pinyin number strings.
-    #
-    # str - a String to be split
-    #
-    # Examples
-    #
-    #
-    #   split_pyn('zhong1guo2')
-    #   # => ['zhong1', 'guo2']
-    #
-    # Returns an Array of Strings.
-    def split_pyn(str = nil)
-      str ||= self
-      # FIXME: ignore punctuation
-      str.scan(/(#{Regex.pyn})/).map{ |arr| arr[0].strip.gsub('-','') }.flatten
-    end
-    def split_zyfh(str = nil)
-      str ||= self
-      str.scan(/([#{Regex.bopomofo}]*)/).map{ |arr| arr[0].strip.gsub('-','') }.flatten - ['']
-    end
-    %w(typy wg yale mps2).each do |type|
-      define_method("split_#{type}") do |str = nil|
-        str ||= self
-        # TODO: ignore tonal marks from other systems wade giles, tongyong etc.
-        str.scan(/(#{detect_regex(type.to_sym)}*)/).map{ |arr| arr[0].strip.gsub('-','') }.flatten - ['']
-      end
-    end
-  end
-end

data/lib/zhongwen_tools/string/fullwidth.rb DELETED Viewed

@@ -1,85 +0,0 @@
-# encoding: utf-8
-module ZhongwenTools
-  FW_HW ={
-    "０" => "0",
-    "１" => "1",
-    "２" => "2",
-    "３" => "3",
-    "４" => "4",
-    "５" => "5",
-    "６" => "6",
-    "７" => "7",
-    "８" => "8",
-    "９" => "9",
-    "Ａ" => "A",
-    "Ｂ" => "B",
-    "Ｃ" => "C",
-    "Ｄ" => "D",
-    "Ｅ" => "E",
-    "Ｆ" => "F",
-    "Ｇ" => "G",
-    "Ｈ" => "H",
-    "Ｉ" => "I",
-    "Ｊ" => "J",
-    "Ｋ" => "K",
-    "Ｌ" => "L",
-    "Ｍ" => "M",
-    "Ｎ" => "N",
-    "Ｏ" => "O",
-    "Ｐ" => "P",
-    "Ｑ" => "Q",
-    "Ｒ" => "R",
-    "Ｓ" => "S",
-    "Ｔ" => "T",
-    "Ｕ" => "U",
-    "Ｖ" => "V",
-    "Ｗ" => "W",
-    "Ｘ" => "X",
-    "Ｙ" => "Y",
-    "Ｚ" => "Z",
-    "ａ" => "a",
-    "ｂ" => "b",
-    "ｃ" => "c",
-    "ｄ" => "d",
-    "ｅ" => "e",
-    "ｆ" => "f",
-    "ｇ" => "g",
-    "ｈ" => "h",
-    "ｉ" => "i",
-    "ｊ" => "j",
-    "ｋ" => "k",
-    "ｌ" => "l",
-    "ｍ" => "m",
-    "ｎ" => "n",
-    "ｏ" => "o",
-    "ｐ" => "p",
-    "ｑ" => "q",
-    "ｒ" => "r",
-    "ｓ" => "s",
-    "ｔ" => "t",
-    "ｕ" => "u",
-    "ｖ" => "v",
-    "ｗ" => "w",
-    "ｘ" => "x",
-    "ｙ" => "y",
-    "ｚ" => "z",
-    "％" => '%',
-    "．" => '.',
-    '：' => ':',
-    "＃" => '#',
-    "＄" => "$",
-    "＆" => "&",
-    "＋" => "+",
-    "－" => "-",
-    "／" => "/",
-    "＼" => '\\',
-    '＝' => '=',
-    "；" => ";",
-    "＜" => "<",
-    "＞" => ">",
-    "？" => "?",
-    "。" => ".",
-    "！" => "!",
-    '，' => ','
-  }
-end

data/lib/zhongwen_tools/string/ruby18.rb DELETED Viewed

@@ -1,96 +0,0 @@
-# encoding: utf-8
-class String
-  define_method(:chars) do
-    self.scan(/./mu).to_a
-  end
-  def size
-    self.chars.size
-  end
-  def reverse(str = nil)
-    self.chars.reverse.join
-  end
-  def gsub_with_hash(pattern, hash)
-    gsub(pattern) do |m|
-      hash[m]
-    end
-  end
-end
-module ZhongwenTools
-  module String
-    # TODO: replace deprecated constant UNICODE_REGEX.
-  end
-  def to_utf8(encoding = nil, encodings = nil)
-    # FIXME: should substitute out known bad actors like space
-    encodings = ['utf-8', 'GB18030', 'BIG5', 'GBK', 'GB2312'] if encodings.nil?
-    encodings = encoding + encodings unless encoding.nil?
-    raise 'Unable to Convert' if encodings.size == 0
-    begin
-      text = Iconv.conv('utf-8', encodings[0], self)
-    rescue
-      text = self.to_utf8(nil, encodings[1..-1])
-    end
-    text
-  end
-  def convert_regex(regex)
-    str = regex.to_s
-    regex.to_s.scan(/u[0-9A-Z]{4}/).each{|cp| str = str.sub('\\' + cp,cp.from_codepoint)}
-    /#{str}/
-  end
-  def has_zh?(str = nil)
-    str ||= self
-    regex = {
-      :zh => self.convert_regex(UNICODE_REGEX[:zh]),
-      :punc => self.convert_regex(UNICODE_REGEX[:punc])
-    }
-    # str.scan(/#{regex[:zh]}|#{regex[:punc]}|\s/).join == str
-    !self.fullwidth?(str) && (!str[regex[:zh]].nil? || !str[regex[:punc]].nil?)
-  end
-  def zh?(str = nil)
-    str ||= self
-    regex = {
-      :zh => self.convert_regex(UNICODE_REGEX[:zh]),
-      :punc => self.convert_regex(UNICODE_REGEX[:punc])
-    }
-    !str.fullwidth? && (str.scan(/(#{regex[:zh]}+|#{regex[:punc]}+|\s+)/).join == str)
-  end
-  def has_zh_punctuation?(str = nil)
-    str ||= self
-    regex = {
-      :zh => self.convert_regex(UNICODE_REGEX[:zh]),
-      :punc => self.convert_regex(UNICODE_REGEX[:punc])
-    }
-    !str[regex[:punc]].nil?
-  end
-  def strip_zh_punctuation(str = nil)
-    str ||= self
-    str.gsub(self.convert_regex(UNICODE_REGEX[:punc]), '')
-  end
-  def to_halfwidth(str = nil)
-    str ||= self
-    matches = str.scan(/([０-９Ａ-Ｚａ-ｚ％．：＃＄＆＋－／＼＝；＜＞])/u).uniq.flatten
-    matches.each do |match|
-      replacement = FW_HW[match]
-      str = str.gsub(match, replacement)
-    end
-    str
-  end
-end

data/lib/zhongwen_tools/string.rb DELETED Viewed

@@ -1,164 +0,0 @@
-# encoding: utf-8
-#$:.unshift File.join(File.dirname(__FILE__),'..','lib','zhongwen_tools', 'string')
-require 'uri'
-require 'zhongwen_tools/regex'
-require 'zhongwen_tools/string/fullwidth'
-require 'zhongwen_tools/string/caps'
-class String
-  alias_method :_downcase, :downcase
-  alias_method :_upcase, :upcase
-  alias_method :gsub_with_hash, :gsub
-  def downcase
-    self._downcase.gsub(/(#{ZhongwenTools::UNICODE_CAPS.keys.join('|')})/){
-      ZhongwenTools::UNICODE_CAPS[$1]
-    }
-  end
-  def upcase
-    self._upcase.gsub(/(#{ZhongwenTools::UNICODE_CAPS.values.join('|')})/){
-      ZhongwenTools::UNICODE_CAPS.find{|k,v| v == $1}[0]
-    }
-  end
-  def capitalize
-    # sub only substitutes the first occurrence.
-    c = self.chars[0]
-    self.sub(c, c.upcase) unless c.nil?
-  end
-  def scan_utf8(regex)
-    scan(regex)
-  end
-end
-module ZhongwenTools
-  module String
-    extend self
-    # Deprecated: a Hash of unicode Regexes. Use ZhongwenTools::Regex.zh instead
-    UNICODE_REGEX = {
-      :zh => Regex.zh,
-      :punc => Regex.zh_punc
-    }
-    def to_utf8(str = nil)
-      (str || self).force_encoding('utf-8')
-      #TODO: better conversion methods can be extracted from categories service
-    end
-    def has_zh?(str = nil)
-      str ||= self
-      !str[/(#{Regex.zh}|#{Regex.zh_punc})/].nil?
-    end
-    def zh?(str = nil)
-      str ||= self
-      str.scan(/(#{Regex.zh}+|#{Regex.zh_punc}+|\s+)/).join == str
-    end
-    def downcase(str = nil)
-      str ||= self
-      str.downcase
-    end
-    def upcase(str = nil)
-      str ||= self
-      str.upcase
-    end
-    def capitalize(str = nil)
-      str ||= self
-      str.capitalize
-    end
-    def has_zh_punctuation?(str = nil)
-      str ||= self
-      !str[Regex.zh_punc].nil?
-    end
-    def strip_zh_punctuation(str = nil)
-      str ||= self
-      str.gsub(Regex.zh_punc, '')
-    end
-    def size(str = nil)
-      str ||= self
-      str.chars.size
-    end
-    def chars(str = nil)
-      (str || self).scan(/./mu).to_a
-    end
-    def reverse(str = nil)
-      str ||= self
-      str.chars.reverse.join
-    end
-    def uri_encode(str = nil)
-      str ||= self
-      URI.encode str
-    end
-    def uri_escape(str = nil)
-      str ||= self
-      URI.escape(str, Regexp.new("[^#{URI::PATTERN::UNRESERVED}]"))
-    end
-    def ascii?(str = nil)
-      str ||= self
-      str.chars.size == str.bytes.to_a.size
-    end
-    def multibyte?(str = nil)
-      !(str || self).ascii?
-    end
-    def halfwidth?(str = nil)
-      str ||= self
-      str[Regex.fullwidth].nil?
-    end
-    def fullwidth?(str = nil)
-      str ||= self
-      !self.halfwidth?(str) && self.to_halfwidth(str) != str
-    end
-    def to_halfwidth(str = nil)
-      str ||= self
-      str.gsub(/(#{Regex.fullwidth})/){  ZhongwenTools::FW_HW[$1] }
-    end
-    def to_codepoint(str = nil)
-      str ||= self
-      #chars = (self.class.to_s == 'String')? self.chars : self.chars(str)
-      codepoints = str.chars.map{|c| "\\u%04x" % c.unpack("U")[0]}
-      codepoints.join
-    end
-    def from_codepoint(str = nil)
-      str ||= self
-      [str.sub(/\\?u/,'').hex].pack("U")
-    end
-  end
-end
-if RUBY_VERSION < '1.9'
-  require File.expand_path("../string/ruby18", __FILE__)
-elsif RUBY_VERSION < '2.0'
-  require File.expand_path("../string/ruby19", __FILE__)
-end

data/test/test_integer.rb DELETED Viewed

@@ -1,31 +0,0 @@
-#encoding: utf-8
-$:.unshift File.join(File.dirname(__FILE__),'..','lib')
-require './test/test_helper'
-require 'zhongwen_tools/integer'
-class Integer
-  include ZhongwenTools::Integer
-end
-class TestInteger < Minitest::Test
-  def test_zh
-    assert_equal 122.to_zh, '一百二十二'
-    assert_equal 12.to_zh, '十二'
-    assert_equal 12000.to_zht, '一萬二千'
-    assert_equal 12000.to_zhs, '一万二千'
-    refute 12000.to_zh == 12000.to_zht
-    assert_equal '十二', ZhongwenTools::Integer.to_zhs(12)
-    assert_equal '一萬二千', ZhongwenTools::Integer.to_zht(12000)
-    assert_equal '一万二千', ZhongwenTools::Integer.to_zhs(12000)
-    refute  ZhongwenTools::Integer.to_zhs(12000) == ZhongwenTools::Integer.to_zht(12000)
-  end
-  def test_pinyin
-    assert_equal 12.to_pyn, 'shi2-er4'
-    assert_equal 'shi2-er4', ZhongwenTools::Integer.to_pyn(12)
-  end
-end

data/test/test_numbers.rb DELETED Viewed

@@ -1,68 +0,0 @@
-#encoding: utf-8
-$:.unshift File.join(File.dirname(__FILE__),'..','lib')
-require './test/test_helper'
-require 'zhongwen_tools/string'
-require 'zhongwen_tools/numbers'
-class TestNumbers < Minitest::Test
-  include ZhongwenTools::Numbers
-  def test_convert_to_numbers
-    #skip
-    # FIXME: the conversion function under test needs improvement.
-    @numbers.each do |num|
-      number = zh_number_to_number num[:zh]
-      assert_equal num[:en], number
-    end
-  end
-  def test_class_methods
-    i = rand @numbers.length
-    number = @numbers[i]
-    assert_equal number[:en], ZhongwenTools::Numbers.zh_number_to_number(number[:zh])
-  end
-  def test_convert_to_traditional_number
-    zhs = @numbers[0][:zh]
-    zht = number_to_zht :zht, zhs
-    assert_equal '一萬兩千七', zht
-  end
-  def test_convert_to_simplified_from_number
-    num = @numbers[0][:en]
-    zhs = number_to_zhs :num, num
-    assert_equal '一万二千七', zhs
-  end
-  def test_convert_number_to_pyn
-    num = '一百三十六'
-    pyn = self.number_to_pyn num
-    assert_equal 'yi1-bai2-san1-shi2-liu4', pyn
-    num = '一千五百四十二'
-    pyn = self.number_to_pyn num
-    assert_equal 'yi1-qian2-wu3-bai2-si4-shi2-er4', pyn
-  end
-  def test_is_number
-    @numbers.map{ |n| n[:zh]}.each do |zh|
-     assert self.number? zh
-    end
-    assert self.number? '一'
-  end
-  def setup
-    @numbers = [
-      {:zh =>'一万两千七', :en => 12_007},
-      {:zh => '三千六十三', :en => 3_063},
-      {:zh => '一百五十', :en => 150 },
-      {:zh => '三千亿', :en => 300_000_000_000},
-      {:zh => '一九六六', :en => 1966},
-      {:zh => '二零零八', :en => 2008},
-    ]
-  end
-end