zhongwen_tools 0.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: b1e19e456d7cf778c9a749a75284044981086a02
4
+ data.tar.gz: 103ae6d8d26029b2854bdd09e02a10bff64d5df1
5
+ SHA512:
6
+ metadata.gz: dff5a94d7af2e65b6f6a63ae8a5593312eef78df4fe3b8fe9c1280bf05874db12d4230c266f6de63de655b1d07db06fa430d49584daf120d65cabc33fd9cd94a
7
+ data.tar.gz: 0078f0cb0ca8724c34403c04472c063ea53836b261047d968c4a78eb18eba2985356004d3dabf6314e3b930635e4a1c1058f154f45fea1f14750e903991d21b3
data/.travis.yml ADDED
@@ -0,0 +1,12 @@
1
+ language: ruby
2
+ rvm:
3
+ - 1.9.2
4
+ - 1.9.3
5
+ - 2.0.0
6
+ - 2.1.0
7
+ - ruby-head
8
+
9
+ matrix:
10
+ include:
11
+ - rvm: 1.8.7
12
+ gemfile: Gemfile.1.8.7
data/Gemfile ADDED
@@ -0,0 +1,7 @@
1
+ source "https://rubygems.org"
2
+ # Specify your gem's dependencies in zhongwen_tools.gemspec
3
+ gemspec
4
+
5
+ group :test do
6
+ gem 'pry'
7
+ end
data/Gemfile.1.8.7 ADDED
@@ -0,0 +1,3 @@
1
+ source "https://rubygems.org"
2
+ # Specify your gem's dependencies in zhongwen_tools.gemspec
3
+ gemspec
data/README.md ADDED
@@ -0,0 +1,128 @@
1
+ # Zhongwen Tools: tools and methods for dealing with Chinese.
2
+ [![Build
3
+ Status](https://travis-ci.org/stevendaniels/zhongwen_tools.png?branch=master)](https://travis-ci.org/stevendaniels/zhongwen_tools) [![Dependency Status](https://gemnasium.com/stevendaniels/zhongwen_tools.png)](https://gemnasium.com/stevendaniels/zhongwen_tools) [![Code Climate](https://codeclimate.com/github/stevendaniels/zhongwen_tools.png)](https://codeclimate.com/github/stevendaniels/zhongwen_tools) [![Coverage Status](https://coveralls.io/repos/stevendaniels/zhongwen_tools/badge.png)](https://coveralls.io/r/stevendaniels/zhongwen_tools)
4
+ ## INSTALLATION
5
+
6
+ Install as a gem
7
+
8
+ $ [sudo] gem install zhongwen_tools
9
+
10
+ ## Usage
11
+
12
+ Add the ZhongwenTools component you need to your classes as a module.
13
+
14
+ class String
15
+ include ZhongwenTools::Romanization
16
+ end
17
+
18
+ str = "ni3 hao3" #pinyin with numbers
19
+ str.to_pinyin #=> "nǐ hǎo"
20
+ str.to_zhuyinfuhao #=>
21
+
22
+ mzd = "Mao Tse-tung"
23
+ mzd.to_pinyin #=> Mao Zedong
24
+
25
+ Or you can require the components you want
26
+ require 'zhongwen_tools/numbers'
27
+ ZhongwenTools::Numbers.to_pinyin '一百二十' #=> 'yi1-bai2-er4-shi2'
28
+
29
+ ZhongwenTools includes the following modules:
30
+
31
+ 1. ZhongwenTools::String => some useful string functions and functions for identifying Chinese scripts and romanizations.
32
+ 2. ZhongwenTools::Numbers => functions for identifying and converting numbers.
33
+ 3. ZhongwenTools::Integer => some useful integer functions for Chinese:
34
+ e.g. 12.to_pinyin 12.to_zht
35
+ 4. ZhongwenTools::Romanization => functions for converting between Chinese romanization systems
36
+ 5. ZhongwenTools::Conversion => functions for converting between Chinese scripts.
37
+ 6. ZhongwenTools::ToneSandhi => functions for identifying and dealing with tone sandhi. (Wiki URL)
38
+ 7. [TODO] ZhongwenTools::Segmentation => functions for segmenting Chinese. Can provide different methods for converting
39
+ 8. ZhongwenTools::Tagging => functions for tagging Chinese POS, NER, etc.
40
+
41
+
42
+ ### ZhongwenTools::String: useful string functions for the Chinese language
43
+ ZhongwenTools::String.ascii? 'hello' #=> true #non-multibyte strings
44
+ ZhongwenTools::String.multibyte? '中文' #=> true #multibyte strings
45
+ ZhongwenTools::String.halfwidth?
46
+ ZhongwenTools::String.fullwidth?
47
+ ZhongwenTools::String.to_halfwidth
48
+ ZhongwenTools::String.uri_encode #=> just because I'm lazy
49
+ ZhongwenTools::Unicode.to_codepoint
50
+ ZhongwenTools::Unicode.to_unicode --> converts from unicode codepoint.
51
+ ZhongwenTools::String.downcase --> does pinyin/ lowercase
52
+ ZhongwenTools::String.upcase --> does pinyin uppercase
53
+ ZhongwenTools::String.capitalize ---> does pinyin / fullwidth capitalization
54
+
55
+ ZhongwenTools::String.has_zh? '1月' #=> true
56
+ ZhongwenTools::String.zh? '1月' #=> false; Chinese can't be mixed with other characters.
57
+ ZhongwenTools::String.is_zhs? '中国' #=> true
58
+ ZhongwenTools::String.is_zht? '中国' #=> false
59
+
60
+ #### ruby 1.8 safe methods
61
+ ZhongwenTools::String.chars '中文' #=> ['中','文']
62
+ ZhongwenTools::String.size '中文' #=> 2
63
+ ZhongwenTools::String.reverse '中文' #=> '文中'
64
+ ZhongwenTools::Unicode.to_utf8 '\x{D6D0}\x{CEC4}' #=> '中文'
65
+
66
+
67
+ ### Numbers
68
+ Functions for converting to and from Chinese numbers.
69
+
70
+ ### Integers
71
+
72
+ ### Romanization
73
+ ZhongwenTools::Chinese has tools for converting between Chinese language romanization systems and
74
+ scripts.
75
+
76
+ class String
77
+ include ZhongwenTools::Romanization
78
+ end
79
+
80
+
81
+ str = "ni3 hao3"
82
+ romanization_system = "pyn" #pyn|wg|yale|bpmf|zhyfh|wade-giles|bopomofo
83
+
84
+ str.to_pinyin romanization_system
85
+ #=> "nǐ hǎo"
86
+
87
+ str.to_py romanization_system
88
+ #=> "nǐ hǎo"
89
+
90
+ str.to_pyn
91
+ #=> "ni3 hao3"
92
+
93
+ str.to_wg
94
+ str.to_bpmf
95
+ str.to_yale
96
+ str.to_typy
97
+ str.to_msp3
98
+ str.to_tone_sandhi #=> converts pinyin into its spoken tones.
99
+ #=> "ni2 hao3"
100
+ str.tone_sandhi? #=> checks if the word has tone sandhi
101
+ #=> true
102
+ str.romanization?
103
+
104
+ ### Conversion
105
+ Functions for converting between scripts (e.g. traditional Chinese to
106
+ simplified Chinese) and between chinese and romanization systems (e.g.
107
+ Chinese to pinyin).
108
+
109
+ ZhongwenTools::Conversion.to_zhs
110
+ ZhongwenTools::Conversion.to_zht
111
+ ZhongwenTools::Conversion.to_zhtw
112
+ ZhongwenTools::Conversion.to_zhhk
113
+ ZhongwenTools::Conversion.to_zhmc
114
+ ZhongwenTools::Conversion.to_zhsg
115
+ ZhongwenTools::Conversion.to_zhprc
116
+
117
+
118
+ ### Tone Sandhi
119
+ Some functions for predicting / converting to tone sandhi
120
+
121
+ ## Plugins
122
+ Zhongwen Tools tries to avoid having many dependencies. Functionality
123
+ that requires an external dependency is packaged as a separate gem.
124
+
125
+ ## TODO
126
+ 1. A trad/simp script converter
127
+ 2. A character -> pinyin converter
128
+ 3. A language detector
data/Rakefile ADDED
@@ -0,0 +1,10 @@
1
+ require 'bundler/gem_tasks'
2
+ require 'rake/testtask'
3
+ Bundler.require :test
4
+
5
+ Rake::TestTask.new do |t|
6
+ t.libs << 'test'
7
+ end
8
+
9
+ desc "Run tests"
10
+ task :default => :test
@@ -0,0 +1,185 @@
1
+ #encoding: utf-8
2
+ module ZhongwenTools
3
+ module Numbers
4
+
5
+ NUMBER_MULTIPLES = '拾十百佰千仟万萬亿億' # multiplier characters (10, 100, 1000, 10**4, 10**8); interpolated into character classes, so each one is listed once
6
+
7
+ NUMBERS_TABLE = [
8
+ { :zh_s => '零', :zh_t => '零', :num => 0, :pyn => 'ling2'},
9
+ { :zh_s => '〇', :zh_t => '〇', :num => 0, :pyn => 'ling2'},
10
+ { :zh_s => '一', :zh_t => '一', :num => 1, :pyn => 'yi1'},
11
+ { :zh_s => '壹', :zh_t => '壹', :num => 1, :pyn => 'yi1'},
12
+ { :zh_s => '幺', :zh_t => '幺', :num => 1, :pyn => 'yao1'},
13
+ { :zh_s => '二', :zh_t => '二', :num => 2, :pyn => 'er4'},
14
+ { :zh_s => '两', :zh_t => '兩', :num => 2, :pyn => 'liang3'},
15
+ { :zh_s => '贰', :zh_t => '貳', :num => 2, :pyn => 'er4'},
16
+ { :zh_s => '三', :zh_t => '三', :num => 3, :pyn => 'san1'},
17
+ { :zh_s => '弎', :zh_t => '弎', :num => 3, :pyn => 'san1'},
18
+ { :zh_s => '叁', :zh_t => '參', :num => 3, :pyn => 'san1'},
19
+ { :zh_s => '四', :zh_t => '四', :num => 4, :pyn => 'si4'},
20
+ { :zh_s => '䦉', :zh_t => '䦉', :num => 4, :pyn => 'si4'},
21
+ { :zh_s => '肆', :zh_t => '肆', :num => 4, :pyn => 'si4'},
22
+ { :zh_s => '五', :zh_t => '五', :num => 5, :pyn => 'wu3'},
23
+ { :zh_s => '伍', :zh_t => '伍', :num => 5, :pyn => 'wu3'},
24
+ { :zh_s => '六', :zh_t => '六', :num => 6, :pyn => 'liu4'},
25
+ { :zh_s => '陆', :zh_t => '陸', :num => 6, :pyn => 'liu4'},
26
+ { :zh_s => '七', :zh_t => '七', :num => 7, :pyn => 'qi1'},
27
+ { :zh_s => '柒', :zh_t => '柒', :num => 7, :pyn => 'qi1'},
28
+ { :zh_s => '八', :zh_t => '八', :num => 8, :pyn => 'ba1'},
29
+ { :zh_s => '捌', :zh_t => '捌', :num => 8, :pyn => 'ba1'},
30
+ { :zh_s => '九', :zh_t => '九', :num => 9, :pyn => 'jiu3'},
31
+ { :zh_s => '玖', :zh_t => '玖', :num => 9, :pyn => 'jiu3'},
32
+ { :zh_s => '十', :zh_t => '十', :num => 10, :pyn => 'shi2'},
33
+ { :zh_s => '拾', :zh_t => '拾', :num => 10, :pyn => 'shi2'},
34
+ { :zh_s => '廿', :zh_t => '廿', :num => 20, :pyn => ' nian4'},
35
+ { :zh_s => '百', :zh_t => '百', :num => 100, :pyn => 'bai2'},
36
+ { :zh_s => '佰', :zh_t => '佰', :num => 100, :pyn => 'bai2'},
37
+ { :zh_s => '千', :zh_t => '千', :num => 1000, :pyn => 'qian2'},
38
+ { :zh_s => '仟', :zh_t => '仟', :num => 1000, :pyn => 'qian2'},
39
+ { :zh_s => '万', :zh_t => '萬', :num => 10000, :pyn => 'wan4'},
40
+ { :zh_s => '亿', :zh_t => '億', :num => 100000000, :pyn => 'yi4'},
41
+ ]
42
+
43
+ def is_number? word # True when word (stringified) is empty or consists only of runs of two or more digits / common Chinese numerals.
44
+ # Larger units (垓 秭 穰 溝 澗 正 載, all beyond 100,000,000) are not recognised.
45
+ "#{word}".gsub(/([\d]|[一二三四五六七八九十百千萬万億亿]){2,}/,'') == '' # a numeral standing alone is not removed, so it yields false
46
+ end
47
+
48
+ def convert_date(zh) # Maps every character of zh to its numeric value and returns the values as an Array.
49
+ # Used for years and other digit-by-digit numbers, e.g. '一九六六' => [1, 9, 6, 6].
50
+ zh_numbers = ZhongwenTools::String.chars zh
51
+ numbers = [];
52
+ i = 0
53
+
54
+ while( i < zh_numbers.length)
55
+ curr_number = zh_numbers[i]
56
+
57
+ # NOTE(review): convert returns nil for a character missing from NUMBERS_TABLE, so [:num] would raise here - confirm callers only pass numerals.
58
+ num = convert(curr_number)[:num]
59
+ numbers << num
60
+ i += 1
61
+ end
62
+
63
+ return numbers
64
+ end
65
+
66
+ def convert(number)
67
+ NUMBERS_TABLE.find{|x| x[:zh_s] == number || x[:zh_t] == number || x[:num].to_s == number}
68
+ end
69
+
70
+ def convert_numbers(numbers)
71
+ number = 0
72
+ length = numbers.length
73
+ skipped = false
74
+
75
+ length.times do |i|
76
+ unless skipped == i
77
+ curr_num = numbers[i] || 0
78
+ if (i+2) <= length
79
+ number, i = convert_current_number(numbers, number, curr_num, i)
80
+ skipped = i + 1
81
+ else
82
+ number = adjust_number(number, curr_num)
83
+ end
84
+ end
85
+ end
86
+
87
+ number
88
+ end
89
+
90
+ def convert_current_number numbers, number, curr_num, i # Adds curr_num * the following value to number when that value is a multiplier; returns [number, i] with i unchanged.
91
+ next_number = numbers[i + 1]
92
+ if is_number_multiplier? next_number
93
+ number += next_number * curr_num
94
+ end
95
+
96
+ [number, i]
97
+ end
98
+ def adjust_number(number, curr_num) # Multiplies number by curr_num when curr_num is a multiplier value, otherwise adds curr_num to it.
99
+ is_number_multiplier?(curr_num) ? number * curr_num : number + curr_num
100
+ end
101
+
102
+ def convert_chinese_numbers_to_numbers(zh_number) # Converts a Chinese numeral (anything responding to to_s) into an Integer.
103
+ zh_number = zh_number.to_s
104
+ numbers = convert_date(zh_number)
105
+
106
+ # No multiplier character present: read the value digit by digit (a year such as 一九六六 => 1966).
107
+ return numbers.join('').to_i if zh_number[/[#{NUMBER_MULTIPLES}]/u].nil?
108
+
109
+ convert_numbers numbers
110
+ end
111
+
112
+ def is_number_multiplier?(number) # True when number is one of the multiplier values 10, 100, 1000, 10**4 or 10**8.
113
+ [10,100,1000,10000,100000000].include? number
114
+ end
115
+
116
+ #these should also be able to convert numbers to chinese numbers
117
+ def convert_number_to_simplified type, number # Converts number to simplified numerals; type names the input format (:num for an Integer, anything else is read as a Chinese numeral string).
118
+ convert_number_to :zh_s, type.to_sym, number
119
+ end
120
+ def convert_number_to_traditional type, number # Converts number to traditional numerals; type names the input format (:num for an Integer, anything else is read as a Chinese numeral string).
121
+ convert_number_to :zh_t, type.to_sym, number
122
+ end
123
+
124
+ def convert_number_to_pyn number, type = 'zh_s' # Converts number to numbered pinyin joined with '-'; note the argument order is (number, type), unlike the two helpers above.
125
+ convert_number_to :pyn, type.to_sym, number, '-'
126
+ end
127
+
128
+
129
+ def check_wan(wan, i)
130
+ wan ||= 0
131
+ wan += 1 if (i + 1) % 5 == 0
132
+ end
133
+
134
+ def convert_from_zh number, to
135
+ converted_number = number.chars.map do |digit|
136
+ convert(digit).fetch(to){ digit }
137
+ end
138
+ end
139
+
140
+ def convert_from_num number, to
141
+ #TODO: this will fail for numbers over 1 billion. grr.
142
+ str = number.to_s
143
+ len = str.length
144
+ converted_number = []
145
+
146
+ len.times do |i|
147
+ wan = check_wan(wan, i)
148
+ num = str[(len - 1 - i),1].to_i
149
+
150
+ if i == 0
151
+ replacement = NUMBERS_TABLE.find{|x| x[:num] == num}.fetch(to){0}
152
+
153
+ converted_number << replacement unless num == 0
154
+ else
155
+ replacement = (NUMBERS_TABLE.find{|x| x[:num] == (10**(i))} || NUMBERS_TABLE.find{|x| x[:num] == (10**(i) / 10000)} || NUMBERS_TABLE.find{|x| x[:num] == (10**(i) / 10000**2)} )[to]
156
+ converted_number << replacement
157
+
158
+ #checks the wan level and ...
159
+ if (num == 1 && (10**(i) / 10000 ** wan) != 10) || num != 1
160
+ replacement = NUMBERS_TABLE.find{|x| x[:num] == num}[to]
161
+ converted_number << replacement
162
+ #elsif num != 1
163
+ #replacement = NUMBERS_TABLE.find{|x| x[:num] == num}[to]
164
+ #converted_number << replacement
165
+ end
166
+ end
167
+ end
168
+
169
+ converted_number.reverse!
170
+ end
171
+
172
+ def convert_number_to(to, from, number, separator = '') # Converts number from format `from` to format `to` and joins the pieces with separator.
173
+ return number unless [:zh_t, :zh_s, :num, :pyn].include? to # unknown target format: hand the input back untouched
174
+
175
+ if from == :num
176
+ converted_number = convert_from_num(number, to)
177
+ else
178
+ converted_number = convert_from_zh number, to
179
+ end
180
+
181
+ # Drops every 零 that is directly followed by a multiplier character; the 两 (liang) substitution at the end of the line is disabled.
182
+ converted_number.join(separator).gsub(/零[#{NUMBER_MULTIPLES}]/u,'')#.gsub(/二([百佰千仟仟万萬亿億])/){"#{NUMBERS_TABLE.find{|x|x[:pyn] == 'liang3'}[to]}#{$1}"}
183
+ end
184
+ end
185
+ end
@@ -0,0 +1,81 @@
1
+ # encoding: utf-8
2
+ module ZhongwenTools
3
+ FW_HW ={
4
+ "0" => "0",
5
+ "1" => "1",
6
+ "2" => "2",
7
+ "3" => "3",
8
+ "4" => "4",
9
+ "5" => "5",
10
+ "6" => "6",
11
+ "7" => "7",
12
+ "8" => "8",
13
+ "9" => "9",
14
+ "A" => "A",
15
+ "B" => "B",
16
+ "C" => "C",
17
+ "D" => "D",
18
+ "E" => "E",
19
+ "F" => "F",
20
+ "G" => "G",
21
+ "H" => "H",
22
+ "I" => "I",
23
+ "J" => "J",
24
+ "K" => "K",
25
+ "L" => "L",
26
+ "M" => "M",
27
+ "N" => "N",
28
+ "O" => "O",
29
+ "P" => "P",
30
+ "Q" => "Q",
31
+ "R" => "R",
32
+ "S" => "S",
33
+ "T" => "T",
34
+ "U" => "U",
35
+ "V" => "V",
36
+ "W" => "W",
37
+ "X" => "X",
38
+ "Y" => "Y",
39
+ "Z" => "Z",
40
+ "a" => "a",
41
+ "b" => "b",
42
+ "c" => "c",
43
+ "d" => "d",
44
+ "e" => "e",
45
+ "f" => "f",
46
+ "g" => "g",
47
+ "h" => "h",
48
+ "i" => "i",
49
+ "j" => "j",
50
+ "k" => "k",
51
+ "l" => "l",
52
+ "m" => "m",
53
+ "n" => "n",
54
+ "o" => "o",
55
+ "p" => "p",
56
+ "q" => "q",
57
+ "r" => "r",
58
+ "s" => "s",
59
+ "t" => "t",
60
+ "u" => "u",
61
+ "v" => "v",
62
+ "w" => "w",
63
+ "x" => "x",
64
+ "y" => "y",
65
+ "z" => "z",
66
+ "%" => '%',
67
+ "." => '.',
68
+ ':' => ':',
69
+ "#" => '#',
70
+ "$" => "$",
71
+ "&" => "&",
72
+ "+" => "+",
73
+ "-" => "-",
74
+ "/" => "/",
75
+ "\" => '\\',
76
+ '=' => '=',
77
+ ";" => ";",
78
+ "<" => "<",
79
+ ">" => ">"
80
+ }
81
+ end
@@ -0,0 +1,71 @@
1
+ #encoding: utf-8
2
+
3
+ class String # Reopens String so chars, size and reverse work per character (via a /./mu scan) in the Ruby 1.8 build.
4
+ define_method(:chars) do
5
+ self.scan(/./mu).to_a
6
+ end
7
+
8
+ def size
9
+ self.chars.size
10
+ end
11
+
12
+ def reverse(str = nil) # str is accepted but never used
13
+ self.chars.reverse.join
14
+ end
15
+ end
16
+
17
+ module ZhongwenTools
18
+ module String
19
+ def to_utf8(encoding = nil, encodings = nil)
20
+ #should substitute out known bad actors like space
21
+ encodings = ['utf-8', 'GB18030', 'BIG5', 'GBK', 'GB2312'] if encodings.nil?
22
+ encodings = encoding + encodings unless encoding.nil?
23
+ raise 'Unable to Convert' if encodings.size == 0
24
+
25
+ begin
26
+ text = Iconv.conv('utf-8', encodings[0], self)
27
+ rescue
28
+ text = self.to_utf8(nil, encodings[1..-1])
29
+ end
30
+ text
31
+ end
32
+
33
+ def convert_regex(regex) # Rebuilds regex with each \uXXXX escape (four uppercase hex digits) replaced by the character it names.
34
+ str = regex.to_s
35
+ regex.to_s.scan(/u[0-9A-Z]{4}/).each{|cp| str = str.sub('\\' + cp,cp.from_codepoint)}
36
+ /#{str}/
37
+ end
38
+
39
+ def has_zh?(str = nil) # True when str (or self) is not fullwidth text and contains a character matching UNICODE_REGEX[:zh] or [:punc].
40
+ str ||= self
41
+
42
+ regex = {
43
+ :zh => self.convert_regex(UNICODE_REGEX[:zh]),
44
+ :punc => self.convert_regex(UNICODE_REGEX[:punc])
45
+ }
46
+ #str.scan(/#{regex[:zh]}|#{regex[:punc]}|\s/).join == str
47
+ !self.fullwidth?(str) && (!str[regex[:zh]].nil? || !str[regex[:punc]].nil?)
48
+ end
49
+
50
+ def zh?(str = nil)
51
+ str ||= self
52
+
53
+ regex = {
54
+ :zh => self.convert_regex(UNICODE_REGEX[:zh]),
55
+ :punc => self.convert_regex(UNICODE_REGEX[:punc])
56
+ }
57
+
58
+ !str.fullwidth? && (str.scan(/(#{regex[:zh]}+|#{regex[:punc]}+|\s+)/).join == str)
59
+ end
60
+
61
+ def has_zh_punctuation?(str = nil) # True when str (or self) contains a character matching UNICODE_REGEX[:punc].
62
+ str ||= self
63
+ regex = {
64
+ :zh => self.convert_regex(UNICODE_REGEX[:zh]), # built but not used below
65
+ :punc => self.convert_regex(UNICODE_REGEX[:punc])
66
+ }
67
+
68
+ !str[regex[:punc]].nil?
69
+ end
70
+ end
71
+ end
@@ -0,0 +1,6 @@
1
+ #encoding: utf-8
2
+ class String # Reopens String so chars returns an Array of characters (via a /./mu scan) in the Ruby 1.9 build.
3
+ define_method(:chars) do
4
+ self.scan(/./mu).to_a
5
+ end
6
+ end
@@ -0,0 +1,164 @@
1
+ # encoding: utf-8
2
+ #$:.unshift File.join(File.dirname(__FILE__),'..','lib','zhongwen_tools', 'string')
3
+ require 'uri'
4
+ require './lib/zhongwen_tools/string/fullwidth'
5
+
6
+ module ZhongwenTools
7
+ module String
8
+ UNICODE_REGEX = {
9
+ :zh => /[\u2E80-\u2E99]|[\u2E9B-\u2EF3]|[\u2F00-\u2FD5]|[\u3005|\u3007]|[\u3021-\u3029]|[\u3038-\u303B]|[\u3400-\u4DB5]|[\u4E00-\u9FCC]|[\uF900-\uFA6D]|[\uFA70-\uFAD9]/,
10
+ :punc => /[\u0021-\u0023]|[\u0025-\u002A]|[\u002C-\u002F]|[\u003A\u003B\u003F\u0040]|[\u005B-\u005D\u005F\u007B\u007D\u00A1\u00A7\u00AB\u00B6\u00B7\u00BB\u00BF\u037E\u0387]|[\u055A-\u055F\u0589\u058A\u05BE\u05C0\u05C3\u05C6\u05F3\u05F4\u0609\u060A\u060C\u060D\u061B\u061E\u061F]|[\u066A-\u066D]|[\u06D4]|[\u0700-\u070D]|[\u07F7-\u07F9]|[\u0830-\u083E]|[\u085E\u0964\u0965\u0970\u0AF0\u0DF4\u0E4F\u0E5A\u0E5B]|[\u0F04-\u0F12]|[\u0F14]|[\u0F3A-\u0F3D]|[\u0F85]|[\u0FD0-\u0FD4]|[\u0FD9\u0FDA]|[\u104A-\u104F]|[\u10FB]|[\u1360-\u1368]|[\u1400\u166D\u166E\u169B\u169C]|[\u16EB-\u16ED]|[\u1735\u1736]|[\u17D4-\u17D6]|[\u17D8-\u17DA]|[\u1800-\u180A\u1944\u1945\u1A1E\u1A1F]|[\u1AA0-\u1AA6]|[\u1AA8-\u1AAD]|[\u1B5A-\u1B60]|[\u1BFC-\u1BFF]|[\u1C3B-\u1C3F]|[\u1C7E\u1C7F]|[\u1CC0-\u1CC7]|[\u1CD3]|[\u2010-\u2027]|[\u2030-\u2043]|[\u2045-\u2051]|[\u2053-\u205E]|[\u207D\u207E\u208D\u208E\u2329\u232A]|[\u2768-\u2775\u27C5\u27C6]|[\u27E6-\u27EF]|[\u2983-\u2998]|[\u29D8-\u29DB\u29FC\u29FD]|[\u2CF9-\u2CFC]|[\u2CFE\u2CFF\u2D70]|[\u2E00-\u2E2E]|[\u2E30-\u2E3B]|[\u3001-\u3003]|[\u3008-\u3011]|[\u3014-\u301F]|[\u3030\u303D\u30A0\u30FB\uA4FE\uA4FF]|[\uA60D-\uA60F]|[\uA673\uA67E]|[\uA6F2-\uA6F7]|[\uA874-\uA877]|[\uA8CE\uA8CF]|[\uA8F8-\uA8FA]|[\uA92E\uA92F\uA95F]|[\uA9C1-\uA9CD]|[\uA9DE\uA9DF]|[\uAA5C-\uAA5F]|[\uAADE\uAADF\uAAF0\uAAF1\uABEB\uFD3E\uFD3F]|[\uFE10-\uFE19]|[\uFE30-\uFE52]|[\uFE54-\uFE61]|[\uFE63\uFE68\uFE6A\uFE6B]|[\uFF01-\uFF03]|[\uFF05-\uFF0A]|[\uFF0C-\uFF0F]|[\uFF1A\uFF1B\uFF1F\uFF20]|[\uFF3B-\uFF3D]|[\uFF3F\uFF5B\uFF5D]|[\uFF5F-\uFF65]/
11
+ }
12
+
13
+ def to_utf8(str = nil) # Tags str (or self) as UTF-8 with force_encoding and returns it; the bytes are relabelled in place, not transcoded.
14
+ (str || self).force_encoding('utf-8')
15
+ #TODO: better conversion functions available in categorize
16
+ end
17
+
18
+ def has_zh?(str = nil) # True when str (or self) contains at least one character matching UNICODE_REGEX[:zh] or UNICODE_REGEX[:punc].
19
+ str ||= self
20
+
21
+ !str[/(#{UNICODE_REGEX[:zh]}|#{UNICODE_REGEX[:punc]})/].nil?
22
+ end
23
+
24
+ def zh?(str = nil) # True when str (or self) consists only of Chinese characters, punctuation and whitespace.
25
+ str ||= self
26
+
27
+ str.scan(/(#{UNICODE_REGEX[:zh]}+|#{UNICODE_REGEX[:punc]}+|\s+)/).join == str # any other character is lost by the scan, so the join no longer equals str
28
+ end
29
+
30
+ def has_zh_punctuation?(str = nil) # True when str (or self) contains a character matching UNICODE_REGEX[:punc].
31
+ str ||= self
32
+
33
+ !str[UNICODE_REGEX[:punc]].nil?
34
+ end
35
+
36
+ def size(str = nil) # Number of characters in str (or self), counted through chars.
37
+ str ||= self
38
+ str.chars.size
39
+ end
40
+
41
+ def chars(str = nil) # Array of the characters of str (or self); the /m flag lets . match newlines as well.
42
+ (str || self).scan(/./mu).to_a
43
+ end
44
+
45
+ def reverse(str = nil) # str (or self) with its characters in reverse order.
46
+ str ||= self
47
+ str.chars.reverse.join
48
+ end
49
+
50
+ def uri_encode(str = nil) # Passes str (or self) through URI.encode and returns the result.
51
+ str ||= self
52
+ URI.encode str # NOTE(review): URI.encode was removed in Ruby 3.0 - confirm the supported Ruby versions
53
+ end
54
+
55
+ def uri_escape(str = nil) # Escapes every character of str (or self) that is outside URI::PATTERN::UNRESERVED.
56
+ str ||= self
57
+
58
+ URI.escape(str, Regexp.new("[^#{URI::PATTERN::UNRESERVED}]")) # NOTE(review): URI.escape was removed in Ruby 3.0 - confirm the supported Ruby versions
59
+ end
60
+
61
+ def ascii?(str = nil) # True when str (or self) has as many characters as bytes, i.e. holds no multibyte character.
62
+ str ||= self
63
+ str.chars.size == str.bytes.to_a.size
64
+ end
65
+
66
+ def multibyte?(str = nil)
67
+ !(str || self).ascii?
68
+ end
69
+
70
+ def halfwidth?(str = nil)
71
+ str ||= self
72
+ str[/[0-9A-Za-z%.:#$&+-/\=;<>]/].nil?
73
+ end
74
+
75
+ def fullwidth?(str = nil) # True when str (or self) fails halfwidth? and to_halfwidth actually changes it.
76
+ str ||= self
77
+ !self.halfwidth?(str) && self.to_halfwidth(str) != str
78
+ end
79
+
80
+ def to_halfwidth(str = nil)
81
+ str ||= self
82
+ matches = str.scan(/([0-9A-Za-z%.:#$&+-/\=;<>])/u).uniq.flatten
83
+
84
+ matches.each do |match|
85
+ replacement = FW_HW[match]
86
+ str = str.gsub(match, replacement) #unless str.nil?
87
+ end
88
+
89
+ str
90
+ end
91
+
92
+ def to_codepoint(str = nil) # Renders each character of str (or self) as a literal \uXXXX escape (lowercase hex) and joins them.
93
+ str ||= self
94
+ #chars = (self.class.to_s == 'String')? self.chars : self.chars(str)
95
+ codepoints = str.chars.map{|c| "\\u%04x" % c.unpack("U")[0]}
96
+
97
+ codepoints.join
98
+ end
99
+
100
+ def from_codepoint(str = nil) # Turns one codepoint written as 'u7f8a' or '\u7f8a' into the character it names.
101
+ str ||= self
102
+
103
+ [str.sub(/\\?u/,'').hex].pack("U") # only the first u / \u prefix is stripped before the hex conversion
104
+ end
105
+
106
+ class Basement #:nodoc: bare host object that mixes in the module so the self.* methods below can call its instance methods
107
+ include ZhongwenTools::String
108
+ end
109
+ def self.chars(*args)
110
+ Basement.new.chars(*args)
111
+ end
112
+ def self.size(*args)
113
+ Basement.new.size(*args)
114
+ end
115
+ def self.reverse(*args)
116
+ Basement.new.reverse(*args)
117
+ end
118
+ def self.to_utf8(*args)
119
+ Basement.new.to_utf8(*args)
120
+ end
121
+ def self.uri_encode(*args)
122
+ Basement.new.uri_encode(*args)
123
+ end
124
+ def self.uri_escape(*args)
125
+ Basement.new.uri_escape(*args)
126
+ end
127
+ def self.ascii?(*args)
128
+ Basement.new.ascii?(*args)
129
+ end
130
+ def self.multibyte?(*args)
131
+ Basement.new.multibyte?(*args)
132
+ end
133
+ def self.halfwidth?(*args)
134
+ Basement.new.halfwidth?(*args)
135
+ end
136
+ def self.fullwidth?(*args)
137
+ Basement.new.fullwidth?(*args)
138
+ end
139
+ def self.to_halfwidth(*args)
140
+ Basement.new.to_halfwidth(*args)
141
+ end
142
+ def self.has_zh?(*args)
143
+ Basement.new.has_zh?(*args)
144
+ end
145
+ def self.has_zh_punctuation?(*args)
146
+ Basement.new.has_zh_punctuation?(*args)
147
+ end
148
+ def self.zh?(*args)
149
+ Basement.new.zh?(*args)
150
+ end
151
+ def self.to_codepoint(*args)
152
+ Basement.new.to_codepoint(*args)
153
+ end
154
+ def self.from_codepoint(*args)
155
+ Basement.new.from_codepoint(*args)
156
+ end
157
+ end
158
+ end
159
+
160
+ if RUBY_VERSION < '1.9'
161
+ require './lib/zhongwen_tools/string/ruby18'
162
+ elsif RUBY_VERSION < '2.0'
163
+ require './lib/zhongwen_tools/string/ruby19'
164
+ end
@@ -0,0 +1,8 @@
1
+ # encoding: utf-8
2
+ require File.expand_path("../zhongwen_tools/string", __FILE__)
3
+ require File.expand_path("../zhongwen_tools/numbers", __FILE__)
4
+ #require File.expand_path("../zhongwen_tools/romanization", __FILE__)
5
+ #require File.expand_path("../zhongwen_tools/conversion", __FILE__)
6
+
7
+ module ZhongwenTools
8
+ end
File without changes
@@ -0,0 +1,14 @@
1
+ begin
2
+ require 'coveralls'
3
+ Coveralls.wear!
4
+ rescue LoadError
5
+ puts 'Coverage disabled.'
6
+ end
7
+
8
+ begin
9
+ require 'pry'
10
+ rescue LoadError
11
+ puts 'Pry disabled'
12
+ end
13
+
14
+ require 'test/unit'
@@ -0,0 +1,53 @@
1
+ #encoding: utf-8
2
+ $:.unshift File.join(File.dirname(__FILE__),'..','lib')
3
+
4
+ require './test/test_helper'
5
+ require 'zhongwen_tools/string'
6
+ require 'zhongwen_tools/numbers'
7
+
8
+ class TestCJKTools < Test::Unit::TestCase
9
+ include ZhongwenTools::Numbers
10
+ def test_convert_to_numbers
11
+ #skip
12
+ #your function sucks dick man
13
+ @numbers.each do |num|
14
+ number = convert_chinese_numbers_to_numbers num[:zh]
15
+ binding.pry if num[:en] != number
16
+ assert_equal num[:en], number
17
+ end
18
+ end
19
+
20
+ def test_convert_to_traditional_number
21
+ zhs = @numbers[0][:zh]
22
+ zht = convert_number_to_traditional :zh_s, zhs
23
+
24
+ assert_equal '一萬兩千七', zht
25
+ end
26
+
27
+ def test_convert_to_simplified_from_number
28
+ #skip
29
+ num = @numbers[0][:en]
30
+ zht = convert_number_to_traditional :num, num
31
+
32
+ #adds garbage!!
33
+ assert_equal '一萬二千七', zht
34
+ end
35
+
36
+ def test_convert_number_to_pyn
37
+ num = '一百三十六'
38
+ pyn = self.convert_number_to_pyn num
39
+
40
+ assert_equal 'yi1-bai2-san1-shi2-liu4', pyn
41
+ end
42
+
43
+ def setup
44
+ @numbers = [
45
+ {:zh =>'一万两千七', :en => 12007},
46
+ {:zh => '三千六十三', :en => 3063},
47
+ {:zh => '一百五十', :en => 150 },
48
+ {:zh => '三千亿', :en => 300000000000},
49
+ {:zh => '一九六六', :en => 1966},
50
+ {:zh => '二零零八', :en => 2008},
51
+ ]
52
+ end
53
+ end
File without changes
@@ -0,0 +1,123 @@
1
+ #encoding: utf-8
2
+ $:.unshift File.join(File.dirname(__FILE__),'..','lib')
3
+ require './test/test_helper'
4
+ require 'zhongwen_tools/string'
5
+
6
+ class String
7
+ include ZhongwenTools::String
8
+ end
9
+
10
+ if RUBY_VERSION < '1.9'
11
+ class Test::Unit::TestCase
12
+ def refute(statement, message = '')
13
+ assert !statement, message
14
+ end
15
+ end
16
+ end
17
+
18
+ class TestString < Test::Unit::TestCase
19
+
20
+ def test_size
21
+ assert_equal 2, @str.size
22
+ assert_equal 2, ZhongwenTools::String.size(@str)
23
+ end
24
+
25
+ def test_chars
26
+ assert_equal %w(中 文), @str.chars
27
+
28
+ assert_equal %w(中 文), ZhongwenTools::String.chars(@str)
29
+ end
30
+
31
+ def test_reverse
32
+ assert_equal '文中', '中文'.reverse
33
+
34
+ assert_equal '文中', ZhongwenTools::String.reverse('中文')
35
+ end
36
+
37
+ def test_ascii
38
+ refute @str.ascii?
39
+ assert 'zhongwen'.ascii?
40
+ assert @str.multibyte?
41
+
42
+ refute ZhongwenTools::String.ascii? @str
43
+ assert ZhongwenTools::String.ascii? 'zhongwen'
44
+ assert ZhongwenTools::String.multibyte? @str
45
+ end
46
+
47
+ def test_halfwidth
48
+ str = 'hello'
49
+ refute str.halfwidth?
50
+ assert_equal str.to_halfwidth, 'hello'
51
+ assert str.to_halfwidth.halfwidth?
52
+
53
+ refute ZhongwenTools::String.halfwidth? str
54
+ assert_equal ZhongwenTools::String.to_halfwidth(str), 'hello'
55
+ assert ZhongwenTools::String.halfwidth?(ZhongwenTools::String.to_halfwidth(str))
56
+ end
57
+
58
+ def test_fullwidth
59
+ str = 'hello'
60
+ assert str.fullwidth?
61
+ refute @str.fullwidth?
62
+
63
+ assert ZhongwenTools::String.fullwidth? str
64
+ end
65
+
66
+ def test_uri_encode
67
+ url = 'http://www.3000hanzi.com/chinese-to-english/definition/好'
68
+ assert_equal URI.encode('好'), '好'.uri_encode
69
+
70
+ assert_equal "http://www.3000hanzi.com/chinese-to-english/definition/#{URI.encode '好'}", ZhongwenTools::String.uri_encode(url)
71
+ assert_equal "http://www.3000hanzi.com/chinese-to-english/definition/#{URI.encode '好'}", url.uri_encode
72
+ end
73
+
74
+ def test_uri_escape
75
+ url = 'http://www.3000hanzi.com/chinese-to-english/definition/好'
76
+ regex = Regexp.new("[^#{URI::PATTERN::UNRESERVED}]")
77
+
78
+ assert_equal URI.escape(url, regex), ZhongwenTools::String.uri_escape(url)
79
+ assert_equal URI.escape(url, regex), url.uri_escape
80
+ end
81
+
82
+ def test_has_zh
83
+ assert @str.has_zh?
84
+ refute @hw.has_zh?
85
+ refute @fw.has_zh?
86
+
87
+ assert ZhongwenTools::String.has_zh? @str
88
+ refute ZhongwenTools::String.has_zh? @hw
89
+ refute ZhongwenTools::String.has_zh? @fw
90
+ end
91
+
92
+ def test_is_zh
93
+ assert @str.zh?
94
+ assert @zh_punc.zh?
95
+
96
+ assert ZhongwenTools::String.zh? @str
97
+ assert ZhongwenTools::String.zh? @zh_punc
98
+ end
99
+
100
+ def test_codepoint
101
+ assert_equal "\\u4e2d\\u6587", @str.to_codepoint
102
+ assert_equal '羊', 'u7f8a'.from_codepoint
103
+ assert_equal '羊', '\\u7f8a'.from_codepoint
104
+
105
+ assert_equal "\\u4e2d\\u6587", ZhongwenTools::String.to_codepoint(@str)
106
+ assert_equal '羊', ZhongwenTools::String.from_codepoint('u7f8a')
107
+ assert_equal '羊', ZhongwenTools::String.from_codepoint('\\u7f8a')
108
+ end
109
+
110
+ def test_punctuation
111
+ assert ZhongwenTools::String.has_zh_punctuation?(@zh_punc)
112
+
113
+ assert @zh_punc.has_zh_punctuation?
114
+ end
115
+
116
+ def setup
117
+ @str = '中文'
118
+ @fw = 'hello'
119
+ @hw = 'hello'
120
+ @zh_punc = '不错吧!'
121
+ end
122
+
123
+ end
@@ -0,0 +1,27 @@
1
+ # -*- encoding: utf-8 -*-
2
+ $:.push File.expand_path("../lib", __FILE__)
3
+
4
+ Gem::Specification.new do |s|
5
+ s.name = "zhongwen_tools"
6
+ s.license = "MIT"
7
+ s.version = "0.0.6"
8
+ s.authors = ["Steven Daniels"]
9
+ s.email = ["steven@tastymantou.com"]
10
+ s.homepage = "https://github.com/stevendaniels/zhongwen_tools"
11
+ s.summary = %q{Zhongwen Tools provide romanization conversions and helper methods for Chinese.}
12
+ s.description = %q{Chinese tools for romanization conversions and other helpful string functions for Chinese.}
13
+
14
+ s.rubyforge_project = "zhongwen_tools"
15
+
16
+ s.files = `git ls-files`.split("\n")
17
+ s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
18
+ s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
19
+ s.require_paths = ["lib"]
20
+
21
+ s.add_development_dependency('rake', "~> 10.1")
22
+ if RUBY_VERSION >= '1.9'
23
+ s.add_development_dependency('simplecov', "~> 0.7")
24
+ s.add_development_dependency('simplecov-gem-adapter', "~> 1.0.1")
25
+ s.add_development_dependency('coveralls', "~> 0.7.0")
26
+ end
27
+ end
metadata ADDED
@@ -0,0 +1,123 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: zhongwen_tools
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.6
5
+ platform: ruby
6
+ authors:
7
+ - Steven Daniels
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-01-18 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: rake
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ~>
18
+ - !ruby/object:Gem::Version
19
+ version: '10.1'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ~>
25
+ - !ruby/object:Gem::Version
26
+ version: '10.1'
27
+ - !ruby/object:Gem::Dependency
28
+ name: simplecov
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ~>
32
+ - !ruby/object:Gem::Version
33
+ version: '0.7'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ~>
39
+ - !ruby/object:Gem::Version
40
+ version: '0.7'
41
+ - !ruby/object:Gem::Dependency
42
+ name: simplecov-gem-adapter
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ~>
46
+ - !ruby/object:Gem::Version
47
+ version: 1.0.1
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ~>
53
+ - !ruby/object:Gem::Version
54
+ version: 1.0.1
55
+ - !ruby/object:Gem::Dependency
56
+ name: coveralls
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ~>
60
+ - !ruby/object:Gem::Version
61
+ version: 0.7.0
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ~>
67
+ - !ruby/object:Gem::Version
68
+ version: 0.7.0
69
+ description: Chinese tools for romanization conversions and other helpful string functions
70
+ for Chinese.
71
+ email:
72
+ - steven@tastymantou.com
73
+ executables: []
74
+ extensions: []
75
+ extra_rdoc_files: []
76
+ files:
77
+ - .travis.yml
78
+ - Gemfile
79
+ - Gemfile.1.8.7
80
+ - README.md
81
+ - Rakefile
82
+ - lib/zhongwen_tools.rb
83
+ - lib/zhongwen_tools/numbers.rb
84
+ - lib/zhongwen_tools/string.rb
85
+ - lib/zhongwen_tools/string/fullwidth.rb
86
+ - lib/zhongwen_tools/string/ruby18.rb
87
+ - lib/zhongwen_tools/string/ruby19.rb
88
+ - test/test_conversion.rb
89
+ - test/test_helper.rb
90
+ - test/test_numbers.rb
91
+ - test/test_romanization.rb
92
+ - test/test_string.rb
93
+ - zhongwen_tools.gemspec
94
+ homepage: https://github.com/stevendaniels/zhongwen_tools
95
+ licenses:
96
+ - MIT
97
+ metadata: {}
98
+ post_install_message:
99
+ rdoc_options: []
100
+ require_paths:
101
+ - lib
102
+ required_ruby_version: !ruby/object:Gem::Requirement
103
+ requirements:
104
+ - - '>='
105
+ - !ruby/object:Gem::Version
106
+ version: '0'
107
+ required_rubygems_version: !ruby/object:Gem::Requirement
108
+ requirements:
109
+ - - '>='
110
+ - !ruby/object:Gem::Version
111
+ version: '0'
112
+ requirements: []
113
+ rubyforge_project: zhongwen_tools
114
+ rubygems_version: 2.0.3
115
+ signing_key:
116
+ specification_version: 4
117
+ summary: Zhongwen Tools provide romanization conversions and helper methods for Chinese.
118
+ test_files:
119
+ - test/test_conversion.rb
120
+ - test/test_helper.rb
121
+ - test/test_numbers.rb
122
+ - test/test_romanization.rb
123
+ - test/test_string.rb