RubyGems - zhongwen_tools - Versions diffs - 0.0.6 - Mend

zhongwen_tools 0.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19) hide show

checksums.yaml +7 -0
data/.travis.yml +12 -0
data/Gemfile +7 -0
data/Gemfile.1.8.7 +3 -0
data/README.md +128 -0
data/Rakefile +10 -0
data/lib/zhongwen_tools/numbers.rb +185 -0
data/lib/zhongwen_tools/string/fullwidth.rb +81 -0
data/lib/zhongwen_tools/string/ruby18.rb +71 -0
data/lib/zhongwen_tools/string/ruby19.rb +6 -0
data/lib/zhongwen_tools/string.rb +164 -0
data/lib/zhongwen_tools.rb +8 -0
data/test/test_conversion.rb +0 -0
data/test/test_helper.rb +14 -0
data/test/test_numbers.rb +53 -0
data/test/test_romanization.rb +0 -0
data/test/test_string.rb +123 -0
data/zhongwen_tools.gemspec +27 -0
metadata +123 -0

checksums.yaml ADDED Viewed

@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: b1e19e456d7cf778c9a749a75284044981086a02
+  data.tar.gz: 103ae6d8d26029b2854bdd09e02a10bff64d5df1
+SHA512:
+  metadata.gz: dff5a94d7af2e65b6f6a63ae8a5593312eef78df4fe3b8fe9c1280bf05874db12d4230c266f6de63de655b1d07db06fa430d49584daf120d65cabc33fd9cd94a
+  data.tar.gz: 0078f0cb0ca8724c34403c04472c063ea53836b261047d968c4a78eb18eba2985356004d3dabf6314e3b930635e4a1c1058f154f45fea1f14750e903991d21b3

data/.travis.yml ADDED Viewed

@@ -0,0 +1,12 @@
+language: ruby
+rvm:
+  - 1.9.2
+  - 1.9.3
+  - 2.0.0
+  - 2.1.0
+  - ruby-head
+matrix:
+  include:
+    - rvm: 1.8.7
+      gemfile: Gemfile.1.8.7

data/Gemfile ADDED Viewed

@@ -0,0 +1,7 @@
+source "https://rubygems.org"
+# Specify your gem's dependencies in zhongwen_tools.gemspec
+gemspec
+group :test do
+  gem 'pry'
+end

data/Gemfile.1.8.7 ADDED Viewed

@@ -0,0 +1,3 @@
+source "https://rubygems.org"
+# Specify your gem's dependencies in zhongwen_tools.gemspec
+gemspec

data/README.md ADDED Viewed

@@ -0,0 +1,128 @@
+# Zhongwen Tools: tools and methods for dealing with Chinese.
+[![Build
+Status](https://travis-ci.org/stevendaniels/zhongwen_tools.png?branch=master)](https://travis-ci.org/stevendaniels/zhongwen_tools) [![Dependency Status](https://gemnasium.com/stevendaniels/zhongwen_tools.png)](https://gemnasium.com/stevendaniels/zhongwen_tools) [![Code Climate](https://codeclimate.com/github/stevendaniels/zhongwen_tools.png)](https://codeclimate.com/github/stevendaniels/zhongwen_tools) [![Coverage Status](https://coveralls.io/repos/stevendaniels/zhongwen_tools/badge.png)](https://coveralls.io/r/stevendaniels/zhongwen_tools)
+## INSTALLATION
+Install as a gem
+    $ [sudo] gem install zhongwen_tools
+## Usage
+Add the ZhongwenTools component you need to your classes as a module.
+    class String
+      include ZhongwenTools::Romanization
+    end
+    str = "ni3 hao3"  #pinyin with numbers
+    str.to_pinyin     #=> "nǐ hǎo"
+    str.to_zhuyinfuhao  #=>
+    mzd = "Mao Tse-tung"
+    mzd.to_pinyin   #=> Mao Zedong
+Or you can require the components you want
+    require 'zhongwen_tools/numbers'
+    ZhongwenTools::Numbers.to_pinyin '一百二十' #=> 'yi1-bai2-er4-shi2'
+ZhongwenTools includes the following modules:
+1. ZhongwenTools::String => some useful string functions and functions for identifying Chinese scripts and romanizations.
+2. ZhongwenTools::Numbers => functions for identifying and converting numbers.
+3. ZhongwenTools::Integer => some useful integer functions for Chinese:
+   e.g. 12.to_pinyin 12.to_zht
+4. ZhongwenTools::Romanization => functions for converting between Chinese romanization systems
+5. ZhongwenTools::Conversion => functions for converting between Chinese scripts.
+6. ZhongwenTools::ToneSandhi => functions for identifying and dealing with tone sandhi. (Wiki URL)
+7. [TODO] ZhongwenTools::Segmentation => functions for segmenting Chinese. Can provide different methods for segmenting.
+8. ZhongwenTools::Tagging => functions for tagging Chinese POS, NER, etc.
+### ZhongwenTools::String: useful string functions for the Chinese language
+    ZhongwenTools::String.ascii? 'hello'    #=> true #non-multibyte strings
+    ZhongwenTools::String.multibyte? '中文'  #=> true #multibyte strings
+    ZhongwenTools::String.halfwidth?
+    ZhongwenTools::String.fullwidth?
+    ZhongwenTools::String.to_halfwidth
+    ZhongwenTools::String.uri_encode  #=> just because I'm lazy
+    ZhongwenTools::Unicode.to_codepoint
+    ZhongwenTools::Unicode.to_unicode --> converts from unicode codepoint.
+    ZhongwenTools::String.downcase --> does pinyin/ lowercase
+    ZhongwenTools::String.upcase --> does pinyin uppercase
+    ZhongwenTools::String.capitalize ---> does pinyin / fullwidth capitalization
+    ZhongwenTools::String.has_zh? '1月'     #=> true
+    ZhongwenTools::String.is_zh? '1月'      #=> false can't be mixed.
+    ZhongwenTools::String.is_zhs? '中国'    #=> true
+    ZhongwenTools::String.is_zht? '中国'    #=> false
+#### ruby 1.8 safe methods
+    ZhongwenTools::String.chars '中文' #=> ['中','文']
+    ZhongwenTools::String.size '中文'  #=> 2
+    ZhongwenTools::String.reverse '中文' #=> '文中'
+    ZhongwenTools::Unicode.to_utf8 '\x{D6D0}\x{CEC4}' => '中文'
+### Numbers
+Functions for converting to and from Chinese numbers.
+### Integers
+### Romanization
+ZhongwenTools::Chinese has tools for converting between Chinese language romanization systems and
+scripts.
+    class String
+      include ZhongwenTools::Romanization
+    end
+    str = "ni3 hao3"
+    romanization_system = "pyn" #pyn|wg|yale|bpmf|zhyfh|wade-giles|bopomofo
+    str.to_pinyin romanization_system
+    #=> "nǐ hǎo"
+    str.to_py romanization_system
+    #=> "nǐ hǎo"
+    str.to_pyn
+    #=> "ni3 hao3"
+    str.to_wg
+    str.to_bpmf
+    str.to_yale
+    str.to_typy
+    str.to_msp3
+    str.to_tone_sandhi   #=> converts pinyin into its spoken tones.
+    #=> "ni2 hao3"
+    str.tone_sandhi?     #=> checks if the word has tone sandhi
+    #=> true
+    str.romanization?
+### Conversion
+Functions for converting between scripts (e.g. traditional Chinese to
+simplified Chinese) and between Chinese and romanization systems (e.g.
+Chinese to pinyin).
+ZhongwenTools::Conversion.to_zhs
+ZhongwenTools::Conversion.to_zht
+ZhongwenTools::Conversion.to_zhtw
+ZhongwenTools::Conversion.to_zhhk
+ZhongwenTools::Conversion.to_zhmc
+ZhongwenTools::Conversion.to_zhsg
+ZhongwenTools::Conversion.to_zhprc
+### Tone Sandhi
+Some functions for predicting / converting to tone sandhi
+## Plugins
+Zhongwen Tools tries to avoid having many dependencies. Functionality
+that requires an external dependency is packaged as a separate gem.
+## TODO
+1. A trad/simp script converter
+2. A character -> pinyin converter
+3. A language detector

data/Rakefile ADDED Viewed

@@ -0,0 +1,10 @@
+require 'bundler/gem_tasks'
+require 'rake/testtask'
+Bundler.require :test
+Rake::TestTask.new do |t|
+  t.libs << 'test'
+end
+desc "Run tests"
+task :default => :test

data/lib/zhongwen_tools/numbers.rb ADDED Viewed

@@ -0,0 +1,185 @@
+#encoding: utf-8
+module ZhongwenTools
+  module Numbers
+    NUMBER_MULTIPLES = '拾十百佰千仟仟万萬亿億'
+    NUMBERS_TABLE = [
+      { :zh_s => '零', :zh_t => '零', :num => 0, :pyn => 'ling2'},
+      { :zh_s => '〇', :zh_t => '〇', :num => 0, :pyn => 'ling2'},
+      { :zh_s => '一', :zh_t => '一', :num => 1, :pyn => 'yi1'},
+      { :zh_s => '壹', :zh_t => '壹', :num => 1, :pyn => 'yi1'},
+      { :zh_s => '幺', :zh_t => '幺', :num => 1, :pyn => 'yao1'},
+      { :zh_s => '二', :zh_t => '二', :num => 2, :pyn => 'er4'},
+      { :zh_s => '两', :zh_t => '兩', :num => 2, :pyn => 'liang3'},
+      { :zh_s => '贰', :zh_t => '貳', :num => 2, :pyn => 'er4'},
+      { :zh_s => '三', :zh_t => '三', :num => 3, :pyn => 'san1'},
+      { :zh_s => '弎', :zh_t => '弎', :num => 3, :pyn => 'san1'},
+      { :zh_s => '叁', :zh_t => '參', :num => 3, :pyn => 'san1'},
+      { :zh_s => '四', :zh_t => '四', :num => 4, :pyn => 'si4'},
+      { :zh_s => '䦉', :zh_t => '䦉', :num => 4, :pyn => 'si4'},
+      { :zh_s => '肆', :zh_t => '肆', :num => 4, :pyn => 'si4'},
+      { :zh_s => '五', :zh_t => '五', :num => 5, :pyn => 'wu3'},
+      { :zh_s => '伍', :zh_t => '伍', :num => 5, :pyn => 'wu3'},
+      { :zh_s => '六', :zh_t => '六', :num => 6, :pyn => 'liu4'},
+      { :zh_s => '陆', :zh_t => '陸', :num => 6, :pyn => 'liu4'},
+      { :zh_s => '七', :zh_t => '七', :num => 7, :pyn => 'qi1'},
+      { :zh_s => '柒', :zh_t => '柒', :num => 7, :pyn => 'qi1'},
+      { :zh_s => '八', :zh_t => '八', :num => 8, :pyn => 'ba1'},
+      { :zh_s => '捌', :zh_t => '捌', :num => 8, :pyn => 'ba1'},
+      { :zh_s => '九', :zh_t => '九', :num => 9, :pyn => 'jiu3'},
+      { :zh_s => '玖', :zh_t => '玖', :num => 9, :pyn => 'jiu3'},
+      { :zh_s => '十', :zh_t => '十', :num => 10, :pyn => 'shi2'},
+      { :zh_s => '拾', :zh_t => '拾', :num => 10, :pyn => 'shi2'},
+      { :zh_s => '廿', :zh_t => '廿', :num => 20, :pyn => ' nian4'},
+      { :zh_s => '百', :zh_t => '百', :num => 100, :pyn => 'bai2'},
+      { :zh_s => '佰', :zh_t => '佰', :num => 100, :pyn => 'bai2'},
+      { :zh_s => '千', :zh_t => '千', :num => 1000, :pyn => 'qian2'},
+      { :zh_s => '仟', :zh_t => '仟', :num => 1000, :pyn => 'qian2'},
+      { :zh_s => '万', :zh_t => '萬', :num => 10000, :pyn => 'wan4'},
+      { :zh_s => '亿', :zh_t => '億', :num => 100000000, :pyn => 'yi4'},
+    ]
+    def is_number? word
+      #垓	秭	穰	溝	澗	正	載 --> beyond 100,000,000!
+      "#{word}".gsub(/([\d]|[一二三四五六七八九十百千萬万億亿]){2,}/,'') == ''
+    end
+    def convert_date(zh)
+      #if it's a year, or an oddly formatted number
+      zh_numbers = ZhongwenTools::String.chars zh
+      numbers = [];
+      i = 0
+      while( i < zh_numbers.length)
+        curr_number = zh_numbers[i]
+        #x[:num] == curr_number.to_i is a kludge; any string will == 0
+        num = convert(curr_number)[:num]
+        numbers << num
+        i += 1
+      end
+      return numbers
+    end
+    def convert(number)
+      NUMBERS_TABLE.find{|x|  x[:zh_s] == number || x[:zh_t] == number  || x[:num].to_s == number}
+    end
+    def convert_numbers(numbers)
+      number = 0
+      length = numbers.length
+      skipped = false
+      length.times do |i|
+        unless skipped == i
+          curr_num = numbers[i] || 0
+          if (i+2) <= length
+            number, i = convert_current_number(numbers, number, curr_num, i)
+            skipped = i + 1
+          else
+            number = adjust_number(number, curr_num)
+          end
+        end
+      end
+      number
+    end
+    def convert_current_number numbers, number, curr_num, i
+      next_number = numbers[i + 1]
+      if is_number_multiplier? next_number
+        number += next_number * curr_num
+      end
+      [number, i]
+    end
+    def adjust_number(number, curr_num)
+      is_number_multiplier?(curr_num) ? number * curr_num : number + curr_num
+    end
+    def convert_chinese_numbers_to_numbers(zh_number)
+      zh_number = zh_number.to_s
+      numbers = convert_date(zh_number)
+      #if it's a year, or an oddly formatted number
+      return numbers.join('').to_i if zh_number[/[#{NUMBER_MULTIPLES}]/u].nil?
+      convert_numbers numbers
+    end
+    def is_number_multiplier?(number)
+      [10,100,1000,10000,100000000].include? number
+    end
+    #these should also be able to convert numbers to chinese numbers
+    def convert_number_to_simplified type, number
+      convert_number_to :zh_s, type.to_sym, number
+    end
+    def convert_number_to_traditional type, number
+      convert_number_to :zh_t, type.to_sym, number
+    end
+    def convert_number_to_pyn number, type = 'zh_s'
+      convert_number_to :pyn, type.to_sym, number, '-'
+    end
+    def check_wan(wan, i)
+      wan ||= 0
+      wan += 1 if (i + 1) % 5 == 0
+    end
+    def convert_from_zh number, to
+      converted_number = number.chars.map do |digit|
+        convert(digit).fetch(to){ digit }
+      end
+    end
+    def convert_from_num number, to
+      #TODO: this will fail for numbers over 1 billion. grr.
+      str = number.to_s
+      len = str.length
+      converted_number = []
+      len.times do |i|
+        wan = check_wan(wan, i)
+        num = str[(len - 1 - i),1].to_i
+        if i == 0
+          replacement = NUMBERS_TABLE.find{|x| x[:num] == num}.fetch(to){0}
+          converted_number << replacement unless num == 0
+        else
+          replacement = (NUMBERS_TABLE.find{|x| x[:num] == (10**(i))} || NUMBERS_TABLE.find{|x| x[:num] == (10**(i) / 10000)} || NUMBERS_TABLE.find{|x| x[:num] == (10**(i) / 10000**2)} )[to]
+          converted_number << replacement
+          #checks the wan level and ...
+          if (num == 1 && (10**(i) / 10000 ** wan) != 10) || num != 1
+            replacement = NUMBERS_TABLE.find{|x| x[:num] == num}[to]
+            converted_number << replacement
+            #elsif num != 1
+            #replacement = NUMBERS_TABLE.find{|x| x[:num] == num}[to]
+            #converted_number << replacement
+          end
+        end
+      end
+      converted_number.reverse!
+    end
+    def convert_number_to(to, from, number, separator = '')
+      return number unless [:zh_t, :zh_s, :num, :pyn].include? to
+      if from == :num
+        converted_number = convert_from_num(number, to)
+      else
+        converted_number = convert_from_zh number, to
+      end
+      #liang rules are tough...
+      converted_number.join(separator).gsub(/零[#{NUMBER_MULTIPLES}]/u,'')#.gsub(/二([百佰千仟仟万萬亿億])/){"#{NUMBERS_TABLE.find{|x|x[:pyn] == 'liang3'}[to]}#{$1}"}
+    end
+  end
+end

data/lib/zhongwen_tools/string/fullwidth.rb ADDED Viewed

@@ -0,0 +1,81 @@
+# encoding: utf-8
+module ZhongwenTools
+  FW_HW ={
+    "０" => "0",
+    "１" => "1",
+    "２" => "2",
+    "３" => "3",
+    "４" => "4",
+    "５" => "5",
+    "６" => "6",
+    "７" => "7",
+    "８" => "8",
+    "９" => "9",
+    "Ａ" => "A",
+    "Ｂ" => "B",
+    "Ｃ" => "C",
+    "Ｄ" => "D",
+    "Ｅ" => "E",
+    "Ｆ" => "F",
+    "Ｇ" => "G",
+    "Ｈ" => "H",
+    "Ｉ" => "I",
+    "Ｊ" => "J",
+    "Ｋ" => "K",
+    "Ｌ" => "L",
+    "Ｍ" => "M",
+    "Ｎ" => "N",
+    "Ｏ" => "O",
+    "Ｐ" => "P",
+    "Ｑ" => "Q",
+    "Ｒ" => "R",
+    "Ｓ" => "S",
+    "Ｔ" => "T",
+    "Ｕ" => "U",
+    "Ｖ" => "V",
+    "Ｗ" => "W",
+    "Ｘ" => "X",
+    "Ｙ" => "Y",
+    "Ｚ" => "Z",
+    "ａ" => "a",
+    "ｂ" => "b",
+    "ｃ" => "c",
+    "ｄ" => "d",
+    "ｅ" => "e",
+    "ｆ" => "f",
+    "ｇ" => "g",
+    "ｈ" => "h",
+    "ｉ" => "i",
+    "ｊ" => "j",
+    "ｋ" => "k",
+    "ｌ" => "l",
+    "ｍ" => "m",
+    "ｎ" => "n",
+    "ｏ" => "o",
+    "ｐ" => "p",
+    "ｑ" => "q",
+    "ｒ" => "r",
+    "ｓ" => "s",
+    "ｔ" => "t",
+    "ｕ" => "u",
+    "ｖ" => "v",
+    "ｗ" => "w",
+    "ｘ" => "x",
+    "ｙ" => "y",
+    "ｚ" => "z",
+    "％" => '%',
+    "．" => '.',
+    '：' => ':',
+    "＃" => '#',
+    "＄" => "$",
+    "＆" => "&",
+    "＋" => "+",
+    "－" => "-",
+    "／" => "/",
+    "＼" => '\\',
+    '＝' => '=',
+    "；" => ";",
+    "＜" => "<",
+    "＞" => ">"
+  }
+end

data/lib/zhongwen_tools/string/ruby18.rb ADDED Viewed

@@ -0,0 +1,71 @@
+#encoding: utf-8
+class String
+  define_method(:chars) do
+    self.scan(/./mu).to_a
+  end
+  def size
+    self.chars.size
+  end
+  def reverse(str = nil)
+    self.chars.reverse.join
+  end
+end
+module ZhongwenTools
+  module String
+    def to_utf8(encoding = nil, encodings = nil)
+      #should substitute out known bad actors like space
+      encodings = ['utf-8', 'GB18030', 'BIG5', 'GBK', 'GB2312'] if encodings.nil?
+      encodings = encoding + encodings unless encoding.nil?
+      raise 'Unable to Convert' if encodings.size == 0
+      begin
+        text = Iconv.conv('utf-8', encodings[0], self)
+      rescue
+        text = self.to_utf8(nil, encodings[1..-1])
+      end
+      text
+    end
+    def convert_regex(regex)
+      str = regex.to_s
+      regex.to_s.scan(/u[0-9A-Z]{4}/).each{|cp| str = str.sub('\\' + cp,cp.from_codepoint)}
+      /#{str}/
+    end
+    def has_zh?(str = nil)
+      str ||= self
+      regex = {
+        :zh => self.convert_regex(UNICODE_REGEX[:zh]),
+        :punc => self.convert_regex(UNICODE_REGEX[:punc])
+      }
+      #str.scan(/#{regex[:zh]}|#{regex[:punc]}|\s/).join == str
+      !self.fullwidth?(str) && (!str[regex[:zh]].nil? || !str[regex[:punc]].nil?)
+    end
+    def zh?(str = nil)
+      str ||= self
+      regex = {
+        :zh => self.convert_regex(UNICODE_REGEX[:zh]),
+        :punc => self.convert_regex(UNICODE_REGEX[:punc])
+      }
+      !str.fullwidth? && (str.scan(/(#{regex[:zh]}+|#{regex[:punc]}+|\s+)/).join == str)
+    end
+    def has_zh_punctuation?(str = nil)
+      str ||= self
+      regex = {
+        :zh => self.convert_regex(UNICODE_REGEX[:zh]),
+        :punc => self.convert_regex(UNICODE_REGEX[:punc])
+      }
+      !str[regex[:punc]].nil?
+    end
+  end
+end

data/lib/zhongwen_tools/string/ruby19.rb ADDED Viewed

@@ -0,0 +1,6 @@
+#encoding: utf-8
+class String
+  define_method(:chars) do
+    self.scan(/./mu).to_a
+  end
+end

data/lib/zhongwen_tools/string.rb ADDED Viewed

@@ -0,0 +1,164 @@
+# encoding: utf-8
+#$:.unshift File.join(File.dirname(__FILE__),'..','lib','zhongwen_tools', 'string')
+require 'uri'
+require './lib/zhongwen_tools/string/fullwidth'
+module ZhongwenTools
+  module String
+    UNICODE_REGEX = {
+      :zh => /[\u2E80-\u2E99]|[\u2E9B-\u2EF3]|[\u2F00-\u2FD5]|[\u3005|\u3007]|[\u3021-\u3029]|[\u3038-\u303B]|[\u3400-\u4DB5]|[\u4E00-\u9FCC]|[\uF900-\uFA6D]|[\uFA70-\uFAD9]/,
+      :punc => /[\u0021-\u0023]|[\u0025-\u002A]|[\u002C-\u002F]|[\u003A\u003B\u003F\u0040]|[\u005B-\u005D\u005F\u007B\u007D\u00A1\u00A7\u00AB\u00B6\u00B7\u00BB\u00BF\u037E\u0387]|[\u055A-\u055F\u0589\u058A\u05BE\u05C0\u05C3\u05C6\u05F3\u05F4\u0609\u060A\u060C\u060D\u061B\u061E\u061F]|[\u066A-\u066D]|[\u06D4]|[\u0700-\u070D]|[\u07F7-\u07F9]|[\u0830-\u083E]|[\u085E\u0964\u0965\u0970\u0AF0\u0DF4\u0E4F\u0E5A\u0E5B]|[\u0F04-\u0F12]|[\u0F14]|[\u0F3A-\u0F3D]|[\u0F85]|[\u0FD0-\u0FD4]|[\u0FD9\u0FDA]|[\u104A-\u104F]|[\u10FB]|[\u1360-\u1368]|[\u1400\u166D\u166E\u169B\u169C]|[\u16EB-\u16ED]|[\u1735\u1736]|[\u17D4-\u17D6]|[\u17D8-\u17DA]|[\u1800-\u180A\u1944\u1945\u1A1E\u1A1F]|[\u1AA0-\u1AA6]|[\u1AA8-\u1AAD]|[\u1B5A-\u1B60]|[\u1BFC-\u1BFF]|[\u1C3B-\u1C3F]|[\u1C7E\u1C7F]|[\u1CC0-\u1CC7]|[\u1CD3]|[\u2010-\u2027]|[\u2030-\u2043]|[\u2045-\u2051]|[\u2053-\u205E]|[\u207D\u207E\u208D\u208E\u2329\u232A]|[\u2768-\u2775\u27C5\u27C6]|[\u27E6-\u27EF]|[\u2983-\u2998]|[\u29D8-\u29DB\u29FC\u29FD]|[\u2CF9-\u2CFC]|[\u2CFE\u2CFF\u2D70]|[\u2E00-\u2E2E]|[\u2E30-\u2E3B]|[\u3001-\u3003]|[\u3008-\u3011]|[\u3014-\u301F]|[\u3030\u303D\u30A0\u30FB\uA4FE\uA4FF]|[\uA60D-\uA60F]|[\uA673\uA67E]|[\uA6F2-\uA6F7]|[\uA874-\uA877]|[\uA8CE\uA8CF]|[\uA8F8-\uA8FA]|[\uA92E\uA92F\uA95F]|[\uA9C1-\uA9CD]|[\uA9DE\uA9DF]|[\uAA5C-\uAA5F]|[\uAADE\uAADF\uAAF0\uAAF1\uABEB\uFD3E\uFD3F]|[\uFE10-\uFE19]|[\uFE30-\uFE52]|[\uFE54-\uFE61]|[\uFE63\uFE68\uFE6A\uFE6B]|[\uFF01-\uFF03]|[\uFF05-\uFF0A]|[\uFF0C-\uFF0F]|[\uFF1A\uFF1B\uFF1F\uFF20]|[\uFF3B-\uFF3D]|[\uFF3F\uFF5B\uFF5D]|[\uFF5F-\uFF65]/
+    }
+    def to_utf8(str = nil)
+      (str || self).force_encoding('utf-8')
+      #TODO: better conversion functions available in categorize
+    end
+    def has_zh?(str = nil)
+      str ||= self
+      !str[/(#{UNICODE_REGEX[:zh]}|#{UNICODE_REGEX[:punc]})/].nil?
+    end
+    def zh?(str = nil)
+      str ||= self
+      str.scan(/(#{UNICODE_REGEX[:zh]}+|#{UNICODE_REGEX[:punc]}+|\s+)/).join == str
+    end
+    def has_zh_punctuation?(str = nil)
+      str ||= self
+      !str[UNICODE_REGEX[:punc]].nil?
+    end
+    def size(str = nil)
+      str ||= self
+      str.chars.size
+    end
+    def chars(str = nil)
+      (str || self).scan(/./mu).to_a
+    end
+    def reverse(str = nil)
+      str ||= self
+      str.chars.reverse.join
+    end
+    def uri_encode(str = nil)
+      str ||= self
+      URI.encode str
+    end
+    def uri_escape(str = nil)
+      str ||= self
+      URI.escape(str, Regexp.new("[^#{URI::PATTERN::UNRESERVED}]"))
+    end
+    def ascii?(str = nil)
+      str ||= self
+      str.chars.size == str.bytes.to_a.size
+    end
+    def multibyte?(str = nil)
+      !(str || self).ascii?
+    end
+    def halfwidth?(str = nil)
+      str ||= self
+      str[/[０-９Ａ-Ｚａ-ｚ％．：＃＄＆＋－／＼＝；＜＞]/].nil?
+    end
+    def fullwidth?(str = nil)
+      str ||= self
+      !self.halfwidth?(str) && self.to_halfwidth(str) != str
+    end
+    def to_halfwidth(str = nil)
+      str ||= self
+      matches = str.scan(/([０-９Ａ-Ｚａ-ｚ％．：＃＄＆＋－／＼＝；＜＞])/u).uniq.flatten
+      matches.each do |match|
+        replacement = FW_HW[match]
+        str = str.gsub(match, replacement) #unless str.nil?
+      end
+      str
+    end
+    def to_codepoint(str = nil)
+      str ||= self
+      #chars = (self.class.to_s == 'String')? self.chars : self.chars(str)
+      codepoints = str.chars.map{|c| "\\u%04x" % c.unpack("U")[0]}
+      codepoints.join
+    end
+    def from_codepoint(str = nil)
+      str ||= self
+      [str.sub(/\\?u/,'').hex].pack("U")
+    end
+    class Basement #:nodoc:
+      include ZhongwenTools::String
+    end
+    def self.chars(*args)
+      Basement.new.chars(*args)
+    end
+    def self.size(*args)
+      Basement.new.size(*args)
+    end
+    def self.reverse(*args)
+      Basement.new.reverse(*args)
+    end
+    def self.to_utf8(*args)
+      Basement.new.to_utf8(*args)
+    end
+    def self.uri_encode(*args)
+      Basement.new.uri_encode(*args)
+    end
+    def self.uri_escape(*args)
+      Basement.new.uri_escape(*args)
+    end
+    def self.ascii?(*args)
+      Basement.new.ascii?(*args)
+    end
+    def self.multibyte?(*args)
+      Basement.new.multibyte?(*args)
+    end
+    def self.halfwidth?(*args)
+      Basement.new.halfwidth?(*args)
+    end
+    def self.fullwidth?(*args)
+      Basement.new.fullwidth?(*args)
+    end
+    def self.to_halfwidth(*args)
+      Basement.new.to_halfwidth(*args)
+    end
+    def self.has_zh?(*args)
+      Basement.new.has_zh?(*args)
+    end
+    def self.has_zh_punctuation?(*args)
+      Basement.new.has_zh_punctuation?(*args)
+    end
+    def self.zh?(*args)
+      Basement.new.zh?(*args)
+    end
+    def self.to_codepoint(*args)
+      Basement.new.to_codepoint(*args)
+    end
+    def self.from_codepoint(*args)
+      Basement.new.from_codepoint(*args)
+    end
+  end
+end
+if RUBY_VERSION < '1.9'
+  require './lib/zhongwen_tools/string/ruby18'
+elsif RUBY_VERSION < '2.0'
+  require './lib/zhongwen_tools/string/ruby19'
+end

data/lib/zhongwen_tools.rb ADDED Viewed

@@ -0,0 +1,8 @@
+# encoding: utf-8
+require File.expand_path("../zhongwen_tools/string", __FILE__)
+require File.expand_path("../zhongwen_tools/numbers", __FILE__)
+#require File.expand_path("../zhongwen_tools/romanization", __FILE__)
+#require File.expand_path("../zhongwen_tools/conversion", __FILE__)
+module ZhongwenTools
+end

data/test/test_conversion.rb ADDED Viewed

File without changes

data/test/test_helper.rb ADDED Viewed

@@ -0,0 +1,14 @@
+begin
+  require 'coveralls'
+  Coveralls.wear!
+rescue LoadError
+  puts 'Coverage disabled.'
+end
+begin
+  require 'pry'
+rescue LoadError
+  puts 'Pry disabled'
+end
+require 'test/unit'

data/test/test_numbers.rb ADDED Viewed

@@ -0,0 +1,53 @@
+#encoding: utf-8
+$:.unshift File.join(File.dirname(__FILE__),'..','lib')
+require './test/test_helper'
+require 'zhongwen_tools/string'
+require 'zhongwen_tools/numbers'
+class TestCJKTools < Test::Unit::TestCase
+  include ZhongwenTools::Numbers
+  def test_convert_to_numbers
+    #skip
+    #your function sucks dick man
+      @numbers.each do |num|
+        number = convert_chinese_numbers_to_numbers num[:zh]
+        binding.pry if num[:en] != number
+        assert_equal num[:en], number
+      end
+  end
+  def test_convert_to_traditional_number
+    zhs = @numbers[0][:zh]
+    zht = convert_number_to_traditional :zh_s, zhs
+    assert_equal '一萬兩千七', zht
+  end
+  def test_convert_to_simplified_from_number
+    #skip
+   num = @numbers[0][:en]
+   zht = convert_number_to_traditional :num, num
+#adds garbage!!
+    assert_equal '一萬二千七', zht
+  end
+  def test_convert_number_to_pyn
+    num = '一百三十六'
+    pyn = self.convert_number_to_pyn num
+    assert_equal 'yi1-bai2-san1-shi2-liu4', pyn
+  end
+  def setup
+    @numbers = [
+      {:zh =>'一万两千七', :en => 12007},
+      {:zh => '三千六十三', :en => 3063},
+      {:zh => '一百五十', :en => 150 },
+      {:zh => '三千亿', :en => 300000000000},
+      {:zh => '一九六六', :en => 1966},
+      {:zh => '二零零八', :en => 2008},
+    ]
+  end
+end

data/test/test_romanization.rb ADDED Viewed

File without changes

data/test/test_string.rb ADDED Viewed

@@ -0,0 +1,123 @@
+#encoding: utf-8
+$:.unshift File.join(File.dirname(__FILE__),'..','lib')
+require './test/test_helper'
+require 'zhongwen_tools/string'
+class String
+  include ZhongwenTools::String
+end
+if RUBY_VERSION < '1.9'
+  class Test::Unit::TestCase
+    def refute(statement, message = '')
+      assert !statement, message
+    end
+  end
+end
+class TestString < Test::Unit::TestCase
+  def test_size
+    assert_equal 2, @str.size
+    assert_equal 2, ZhongwenTools::String.size(@str)
+  end
+  def test_chars
+    assert_equal %w(中 文), @str.chars
+    assert_equal %w(中 文), ZhongwenTools::String.chars(@str)
+  end
+  def test_reverse
+    assert_equal '文中', '中文'.reverse
+    assert_equal '文中', ZhongwenTools::String.reverse('中文')
+  end
+  def test_ascii
+    refute @str.ascii?
+    assert 'zhongwen'.ascii?
+    assert @str.multibyte?
+    refute ZhongwenTools::String.ascii? @str
+    assert ZhongwenTools::String.ascii? 'zhongwen'
+    assert ZhongwenTools::String.multibyte? @str
+  end
+  def test_halfwidth
+    str = 'hellｏ'
+    refute str.halfwidth?
+    assert_equal str.to_halfwidth, 'hello'
+    assert str.to_halfwidth.halfwidth?
+    refute ZhongwenTools::String.halfwidth? str
+    assert_equal ZhongwenTools::String.to_halfwidth(str), 'hello'
+    assert ZhongwenTools::String.halfwidth?(ZhongwenTools::String.to_halfwidth(str))
+  end
+  def test_fullwidth
+    str = 'hellｏ'
+    assert str.fullwidth?
+    refute @str.fullwidth?
+    assert  ZhongwenTools::String.fullwidth? str
+  end
+  def test_uri_encode
+    url = 'http://www.3000hanzi.com/chinese-to-english/definition/好'
+    assert_equal URI.encode('好'), '好'.uri_encode
+    assert_equal "http://www.3000hanzi.com/chinese-to-english/definition/#{URI.encode '好'}", ZhongwenTools::String.uri_encode(url)
+    assert_equal "http://www.3000hanzi.com/chinese-to-english/definition/#{URI.encode '好'}", url.uri_encode
+  end
+  def test_uri_escape
+    url = 'http://www.3000hanzi.com/chinese-to-english/definition/好'
+    regex = Regexp.new("[^#{URI::PATTERN::UNRESERVED}]")
+    assert_equal URI.escape(url, regex), ZhongwenTools::String.uri_escape(url)
+    assert_equal URI.escape(url, regex), url.uri_escape
+  end
+  def test_has_zh
+    assert @str.has_zh?
+    refute @hw.has_zh?
+    refute @fw.has_zh?
+    assert ZhongwenTools::String.has_zh? @str
+    refute ZhongwenTools::String.has_zh? @hw
+    refute ZhongwenTools::String.has_zh? @fw
+  end
+  def test_is_zh
+    assert @str.zh?
+    assert @zh_punc.zh?
+    assert ZhongwenTools::String.zh? @str
+    assert ZhongwenTools::String.zh? @zh_punc
+  end
+  def test_codepoint
+    assert_equal "\\u4e2d\\u6587", @str.to_codepoint
+    assert_equal '羊', 'u7f8a'.from_codepoint
+    assert_equal '羊', '\\u7f8a'.from_codepoint
+    assert_equal "\\u4e2d\\u6587", ZhongwenTools::String.to_codepoint(@str)
+    assert_equal '羊', ZhongwenTools::String.from_codepoint('u7f8a')
+    assert_equal '羊', ZhongwenTools::String.from_codepoint('\\u7f8a')
+  end
+  def test_punctuation
+    assert ZhongwenTools::String.has_zh_punctuation?(@zh_punc)
+    assert @zh_punc.has_zh_punctuation?
+  end
+  def setup
+    @str = '中文'
+    @fw = 'ｈｅｌｌｏ'
+    @hw = 'hello'
+    @zh_punc = '不错吧！'
+  end
+end

data/zhongwen_tools.gemspec ADDED Viewed

@@ -0,0 +1,27 @@
+# -*- encoding: utf-8 -*-
+$:.push File.expand_path("../lib", __FILE__)
+Gem::Specification.new do |s|
+  s.name        = "zhongwen_tools"
+  s.license     = "MIT"
+  s.version     = "0.0.6"
+  s.authors     = ["Steven Daniels"]
+  s.email       = ["steven@tastymantou.com"]
+  s.homepage    = "https://github.com/stevendaniels/zhongwen_tools"
+  s.summary     = %q{Zhongwen Tools provide romanization conversions and helper methods for Chinese.}
+  s.description = %q{Chinese tools for romanization conversions and other helpful string functions for Chinese.}
+  s.rubyforge_project = "zhongwen_tools"
+  s.files         = `git ls-files`.split("\n")
+  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
+  s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
+  s.require_paths = ["lib"]
+  s.add_development_dependency('rake', "~> 10.1")
+  if RUBY_VERSION >= '1.9'
+    s.add_development_dependency('simplecov', "~> 0.7")
+    s.add_development_dependency('simplecov-gem-adapter', "~> 1.0.1")
+    s.add_development_dependency('coveralls', "~> 0.7.0")
+  end
+end

metadata ADDED Viewed

@@ -0,0 +1,123 @@
+--- !ruby/object:Gem::Specification
+name: zhongwen_tools
+version: !ruby/object:Gem::Version
+  version: 0.0.6
+platform: ruby
+authors:
+- Steven Daniels
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2014-01-18 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: rake
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '10.1'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '10.1'
+- !ruby/object:Gem::Dependency
+  name: simplecov
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '0.7'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '0.7'
+- !ruby/object:Gem::Dependency
+  name: simplecov-gem-adapter
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 1.0.1
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 1.0.1
+- !ruby/object:Gem::Dependency
+  name: coveralls
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 0.7.0
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 0.7.0
+description: Chinese tools for romanization conversions and other helpful string functions
+  for Chinese.
+email:
+- steven@tastymantou.com
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- .travis.yml
+- Gemfile
+- Gemfile.1.8.7
+- README.md
+- Rakefile
+- lib/zhongwen_tools.rb
+- lib/zhongwen_tools/numbers.rb
+- lib/zhongwen_tools/string.rb
+- lib/zhongwen_tools/string/fullwidth.rb
+- lib/zhongwen_tools/string/ruby18.rb
+- lib/zhongwen_tools/string/ruby19.rb
+- test/test_conversion.rb
+- test/test_helper.rb
+- test/test_numbers.rb
+- test/test_romanization.rb
+- test/test_string.rb
+- zhongwen_tools.gemspec
+homepage: https://github.com/stevendaniels/zhongwen_tools
+licenses:
+- MIT
+metadata: {}
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project: zhongwen_tools
+rubygems_version: 2.0.3
+signing_key:
+specification_version: 4
+summary: Zhongwen Tools provide romanization conversions and helper methods for Chinese.
+test_files:
+- test/test_conversion.rb
+- test/test_helper.rb
+- test/test_numbers.rb
+- test/test_romanization.rb
+- test/test_string.rb