zhongwen_tools 0.0.6

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: b1e19e456d7cf778c9a749a75284044981086a02
4
+ data.tar.gz: 103ae6d8d26029b2854bdd09e02a10bff64d5df1
5
+ SHA512:
6
+ metadata.gz: dff5a94d7af2e65b6f6a63ae8a5593312eef78df4fe3b8fe9c1280bf05874db12d4230c266f6de63de655b1d07db06fa430d49584daf120d65cabc33fd9cd94a
7
+ data.tar.gz: 0078f0cb0ca8724c34403c04472c063ea53836b261047d968c4a78eb18eba2985356004d3dabf6314e3b930635e4a1c1058f154f45fea1f14750e903991d21b3
data/.travis.yml ADDED
@@ -0,0 +1,12 @@
1
+ language: ruby
2
+ rvm:
3
+ - 1.9.2
4
+ - 1.9.3
5
+ - 2.0.0
6
+ - 2.1.0
7
+ - ruby-head
8
+
9
+ matrix:
10
+ include:
11
+ - rvm: 1.8.7
12
+ gemfile: Gemfile.1.8.7
data/Gemfile ADDED
@@ -0,0 +1,7 @@
1
+ source "https://rubygems.org"
2
+ # Specify your gem's dependencies in zhongwen_tools.gemspec
3
+ gemspec
4
+
5
+ group :test do
6
+ gem 'pry'
7
+ end
data/Gemfile.1.8.7 ADDED
@@ -0,0 +1,3 @@
1
+ source "https://rubygems.org"
2
+ # Specify your gem's dependencies in zhongwen_tools.gemspec
3
+ gemspec
data/README.md ADDED
@@ -0,0 +1,128 @@
1
+ # Zhongwen Tools: tools and methods for dealing with Chinese.
2
+ [![Build
3
+ Status](https://travis-ci.org/stevendaniels/zhongwen_tools.png?branch=master)](https://travis-ci.org/stevendaniels/zhongwen_tools) [![Dependency Status](https://gemnasium.com/stevendaniels/zhongwen_tools.png)](https://gemnasium.com/stevendaniels/zhongwen_tools) [![Code Climate](https://codeclimate.com/github/stevendaniels/zhongwen_tools.png)](https://codeclimate.com/github/stevendaniels/zhongwen_tools) [![Coverage Status](https://coveralls.io/repos/stevendaniels/zhongwen_tools/badge.png)](https://coveralls.io/r/stevendaniels/zhongwen_tools)
4
+ ## Installation
5
+
6
+ Install as a gem
7
+
8
+ $ [sudo] gem install zhongwen_tools
9
+
10
+ ## Usage
11
+
12
+ Add the ZhongwenTools component you need to your classes as a module.
13
+
14
+ class String
15
+ include ZhongwenTools::Romanization
16
+ end
17
+
18
+ str = "ni3 hao3" #pinyin with numbers
19
+ str.to_pinyin #=> "nǐ hǎo"
20
+ str.to_zhuyinfuhao #=>
21
+
22
+ mzd = "Mao Tse-tung"
23
+ mzd.to_pinyin #=> Mao Zedong
24
+
25
+ Or you can require the components you want
26
+ require 'zhongwen_tools/numbers'
27
+ ZhongwenTools::Numbers.to_pinyin '一百二十' #=> 'yi1-bai2-er4-shi2'
28
+
29
+ ZhongwenTools includes the following modules:
30
+
31
+ 1. ZhongwenTools::String => some useful string functions and functions for identifying Chinese scripts and romanizations.
32
+ 2. ZhongwenTools::Numbers => functions for identifying and converting numbers.
33
+ 3. ZhongwenTools::Integer => some useful integer functions for Chinese:
34
+ e.g. 12.to_pinyin 12.to_zht
35
+ 4. ZhongwenTools::Romanization => functions for converting between Chinese romanization systems
36
+ 5. ZhongwenTools::Conversion => functions for converting between Chinese scripts.
37
+ 6. ZhongwenTools::ToneSandhi => functions for identifying and dealing with tone sandhi. (Wiki URL)
38
+ 7. [TODO] ZhongwenTools::Segmentation => functions for segmenting Chinese. Can provide different methods for converting
39
+ 8. ZhongwenTools::Tagging => functions for tagging Chinese POS, NER, etc.
40
+
41
+
42
+ ### ZhongwenTools::String: useful string functions for the Chinese language
43
+ ZhongwenTools::String.ascii? 'hello' #=> true #non-multibyte strings
44
+ ZhongwenTools::String.multibyte? '中文' #=> true #multibyte strings
45
+ ZhongwenTools::String.halfwidth?
46
+ ZhongwenTools::String.fullwidth?
47
+ ZhongwenTools::String.to_halfwidth
48
+ ZhongwenTools::String.uri_encode #=> just because I'm lazy
49
+ ZhongwenTools::Unicode.to_codepoint
50
+ ZhongwenTools::Unicode.to_unicode --> converts from unicode codepoint.
51
+ ZhongwenTools::String.downcase --> does pinyin/ lowercase
52
+ ZhongwenTools::String.upcase --> does pinyin uppercase
53
+ ZhongwenTools::String.capitalize ---> does pinyin / fullwidth capitalization
54
+
55
+ ZhongwenTools::String.has_zh? '1月' #=> true
56
+ ZhongwenTools::String.is_zh? '1月' #=> false can't be mixed.
57
+ ZhongwenTools::String.is_zhs? '中国' #=> true
58
+ ZhongwenTools::String.is_zht? '中国' #=> false
59
+
60
+ #### ruby 1.8 safe methods
61
+ ZhongwenTools::String.chars '中文' #=> ['中','文']
62
+ ZhongwenTools::String.size '中文' #=> 2
63
+ ZhongwenTools::String.reverse '中文' #=> '文中'
64
+ ZhongwenTools::Unicode.to_utf8 '\x{D6D0}\x{CEC4}' => '中文'
65
+
66
+
67
+ ### Numbers
68
+ Functions for converting to and from Chinese numbers.
69
+
70
+ ### Integers
71
+
72
+ ### Romanization
73
+ ZhongwenTools::Chinese has tools for converting between Chinese language romanization systems and
74
+ scripts.
75
+
76
+ class String
77
+ include ZhongwenTools::Romanization
78
+ end
79
+
80
+
81
+ str = "ni3 hao3"
82
+ romanization_system = "pyn" #pyn|wg|yale|bpmf|zhyfh|wade-giles|bopomofo
83
+
84
+ str.to_pinyin romanization_system
85
+ #=> "nǐ hǎo"
86
+
87
+ str.to_py romanization_system
88
+ #=> "nǐ hǎo"
89
+
90
+ str.to_pyn
91
+ #=> "ni3 hao3"
92
+
93
+ str.to_wg
94
+ str.to_bpmf
95
+ str.to_yale
96
+ str.to_typy
97
+ str.to_msp3
98
+ str.to_tone_sandhi #=> converts pinyin into its spoken tones.
99
+ #=> "ni2 hao3"
100
+ str.tone_sandhi? #=> checks if the word has tone sandhi
101
+ #=> true
102
+ str.romanization?
103
+
104
+ ### Conversion
105
+ Functions for converting between scripts (e.g. traditional Chinese to
106
+ simplified Chinese) and between chinese and romanization systems (e.g.
107
+ Chinese to pinyin).
108
+
109
+ ZhongwenTools::Conversion.to_zhs
110
+ ZhongwenTools::Conversion.to_zht
111
+ ZhongwenTools::Conversion.to_zhtw
112
+ ZhongwenTools::Conversion.to_zhhk
113
+ ZhongwenTools::Conversion.to_zhmc
114
+ ZhongwenTools::Conversion.to_zhsg
115
+ ZhongwenTools::Conversion.to_zhprc
116
+
117
+
118
+ ### Tone Sandhi
119
+ Some functions for predicting / converting to tone sandhi
120
+
121
+ ## Plugins
122
+ Zhongwen Tools tries to avoid having many dependencies. Functionality
123
+ that requires an external dependency is packaged as a separate gem.
124
+
125
+ ## TODO
126
+ 1. A trad/simp script converter
127
+ 2. A character -> pinyin converter
128
+ 3. A language detector
data/Rakefile ADDED
@@ -0,0 +1,10 @@
1
+ require 'bundler/gem_tasks'
2
+ require 'rake/testtask'
3
+ Bundler.require :test
4
+
5
+ Rake::TestTask.new do |t|
6
+ t.libs << 'test'
7
+ end
8
+
9
+ desc "Run tests"
10
+ task :default => :test
@@ -0,0 +1,185 @@
1
+ #encoding: utf-8
2
+ module ZhongwenTools
3
+ module Numbers
4
+
5
+ NUMBER_MULTIPLES = '拾十百佰千仟仟万萬亿億'
6
+
7
+ NUMBERS_TABLE = [
8
+ { :zh_s => '零', :zh_t => '零', :num => 0, :pyn => 'ling2'},
9
+ { :zh_s => '〇', :zh_t => '〇', :num => 0, :pyn => 'ling2'},
10
+ { :zh_s => '一', :zh_t => '一', :num => 1, :pyn => 'yi1'},
11
+ { :zh_s => '壹', :zh_t => '壹', :num => 1, :pyn => 'yi1'},
12
+ { :zh_s => '幺', :zh_t => '幺', :num => 1, :pyn => 'yao1'},
13
+ { :zh_s => '二', :zh_t => '二', :num => 2, :pyn => 'er4'},
14
+ { :zh_s => '两', :zh_t => '兩', :num => 2, :pyn => 'liang3'},
15
+ { :zh_s => '贰', :zh_t => '貳', :num => 2, :pyn => 'er4'},
16
+ { :zh_s => '三', :zh_t => '三', :num => 3, :pyn => 'san1'},
17
+ { :zh_s => '弎', :zh_t => '弎', :num => 3, :pyn => 'san1'},
18
+ { :zh_s => '叁', :zh_t => '參', :num => 3, :pyn => 'san1'},
19
+ { :zh_s => '四', :zh_t => '四', :num => 4, :pyn => 'si4'},
20
+ { :zh_s => '䦉', :zh_t => '䦉', :num => 4, :pyn => 'si4'},
21
+ { :zh_s => '肆', :zh_t => '肆', :num => 4, :pyn => 'si4'},
22
+ { :zh_s => '五', :zh_t => '五', :num => 5, :pyn => 'wu3'},
23
+ { :zh_s => '伍', :zh_t => '伍', :num => 5, :pyn => 'wu3'},
24
+ { :zh_s => '六', :zh_t => '六', :num => 6, :pyn => 'liu4'},
25
+ { :zh_s => '陆', :zh_t => '陸', :num => 6, :pyn => 'liu4'},
26
+ { :zh_s => '七', :zh_t => '七', :num => 7, :pyn => 'qi1'},
27
+ { :zh_s => '柒', :zh_t => '柒', :num => 7, :pyn => 'qi1'},
28
+ { :zh_s => '八', :zh_t => '八', :num => 8, :pyn => 'ba1'},
29
+ { :zh_s => '捌', :zh_t => '捌', :num => 8, :pyn => 'ba1'},
30
+ { :zh_s => '九', :zh_t => '九', :num => 9, :pyn => 'jiu3'},
31
+ { :zh_s => '玖', :zh_t => '玖', :num => 9, :pyn => 'jiu3'},
32
+ { :zh_s => '十', :zh_t => '十', :num => 10, :pyn => 'shi2'},
33
+ { :zh_s => '拾', :zh_t => '拾', :num => 10, :pyn => 'shi2'},
34
+ { :zh_s => '廿', :zh_t => '廿', :num => 20, :pyn => ' nian4'},
35
+ { :zh_s => '百', :zh_t => '百', :num => 100, :pyn => 'bai2'},
36
+ { :zh_s => '佰', :zh_t => '佰', :num => 100, :pyn => 'bai2'},
37
+ { :zh_s => '千', :zh_t => '千', :num => 1000, :pyn => 'qian2'},
38
+ { :zh_s => '仟', :zh_t => '仟', :num => 1000, :pyn => 'qian2'},
39
+ { :zh_s => '万', :zh_t => '萬', :num => 10000, :pyn => 'wan4'},
40
+ { :zh_s => '亿', :zh_t => '億', :num => 100000000, :pyn => 'yi4'},
41
+ ]
42
+
43
# Reports whether +word+ is made up solely of runs (two or more characters
# each) of ASCII digits and/or the Chinese numerals listed in the pattern.
# Larger units (垓 秭 穰 溝 澗 正 載, i.e. beyond 100,000,000) are not recognised.
#
# word - any object; it is converted with #to_s before matching.
#
# Returns true when nothing is left after the numeric runs are removed.
def is_number?(word)
  leftover = word.to_s.gsub(/([\d]|[一二三四五六七八九十百千萬万億亿]){2,}/, '')
  leftover.empty?
end
47
+
48
+ def convert_date(zh)
49
+ #if it's a year, or an oddly formatted number
50
+ zh_numbers = ZhongwenTools::String.chars zh
51
+ numbers = [];
52
+ i = 0
53
+
54
+ while( i < zh_numbers.length)
55
+ curr_number = zh_numbers[i]
56
+
57
+ #x[:num] == curr_number.to_i is a kludge; any string will == 0
58
+ num = convert(curr_number)[:num]
59
+ numbers << num
60
+ i += 1
61
+ end
62
+
63
+ return numbers
64
+ end
65
+
66
+ def convert(number)
67
+ NUMBERS_TABLE.find{|x| x[:zh_s] == number || x[:zh_t] == number || x[:num].to_s == number}
68
+ end
69
+
70
+ def convert_numbers(numbers)
71
+ number = 0
72
+ length = numbers.length
73
+ skipped = false
74
+
75
+ length.times do |i|
76
+ unless skipped == i
77
+ curr_num = numbers[i] || 0
78
+ if (i+2) <= length
79
+ number, i = convert_current_number(numbers, number, curr_num, i)
80
+ skipped = i + 1
81
+ else
82
+ number = adjust_number(number, curr_num)
83
+ end
84
+ end
85
+ end
86
+
87
+ number
88
+ end
89
+
90
+ def convert_current_number numbers, number, curr_num, i
91
+ next_number = numbers[i + 1]
92
+ if is_number_multiplier? next_number
93
+ number += next_number * curr_num
94
+ end
95
+
96
+ [number, i]
97
+ end
98
+ def adjust_number(number, curr_num)
99
+ is_number_multiplier?(curr_num) ? number * curr_num : number + curr_num
100
+ end
101
+
102
+ def convert_chinese_numbers_to_numbers(zh_number)
103
+ zh_number = zh_number.to_s
104
+ numbers = convert_date(zh_number)
105
+
106
+ #if it's a year, or an oddly formatted number
107
+ return numbers.join('').to_i if zh_number[/[#{NUMBER_MULTIPLES}]/u].nil?
108
+
109
+ convert_numbers numbers
110
+ end
111
+
112
# Reports whether +number+ is one of the multiplier values
# 10, 100, 1000, 10000 or 100000000.
#
# number - the value to test.
#
# Returns true or false.
def is_number_multiplier?(number)
  multipliers = [10, 100, 1000, 10000, 100000000]
  multipliers.include?(number)
end
115
+
116
+ #these should also be able to convert numbers to chinese numbers
117
+ def convert_number_to_simplified type, number
118
+ convert_number_to :zh_s, type.to_sym, number
119
+ end
120
+ def convert_number_to_traditional type, number
121
+ convert_number_to :zh_t, type.to_sym, number
122
+ end
123
+
124
+ def convert_number_to_pyn number, type = 'zh_s'
125
+ convert_number_to :pyn, type.to_sym, number, '-'
126
+ end
127
+
128
+
129
# Updates a running counter that is bumped on every fifth digit position.
#
# wan - the current count, or nil when no count exists yet (treated as 0).
# i   - zero-based digit position.
#
# Returns the count as an Integer, incremented by one when (i + 1) is a
# multiple of five.
def check_wan(wan, i)
  wan ||= 0
  wan += 1 if (i + 1) % 5 == 0
  # A trailing modifier-`if` evaluates to nil when its condition is false,
  # so the counter has to be returned explicitly.
  wan
end
133
+
134
# Converts a number string character by character.
#
# number - String whose characters are looked up one at a time via #convert.
# to     - key of the wanted representation in the entry #convert returns.
#
# Returns an Array with one element per character. A character that has no
# entry, or whose entry lacks the +to+ key, is passed through unchanged.
def convert_from_zh(number, to)
  number.chars.map do |digit|
    entry = convert(digit)
    # #convert yields nil for unknown characters; keep those as they are
    # instead of calling #fetch on nil.
    entry ? entry.fetch(to) { digit } : digit
  end
end
139
+
140
+ def convert_from_num number, to
141
+ #TODO: this will fail for numbers over 1 billion. grr.
142
+ str = number.to_s
143
+ len = str.length
144
+ converted_number = []
145
+
146
+ len.times do |i|
147
+ wan = check_wan(wan, i)
148
+ num = str[(len - 1 - i),1].to_i
149
+
150
+ if i == 0
151
+ replacement = NUMBERS_TABLE.find{|x| x[:num] == num}.fetch(to){0}
152
+
153
+ converted_number << replacement unless num == 0
154
+ else
155
+ replacement = (NUMBERS_TABLE.find{|x| x[:num] == (10**(i))} || NUMBERS_TABLE.find{|x| x[:num] == (10**(i) / 10000)} || NUMBERS_TABLE.find{|x| x[:num] == (10**(i) / 10000**2)} )[to]
156
+ converted_number << replacement
157
+
158
+ #checks the wan level and ...
159
+ if (num == 1 && (10**(i) / 10000 ** wan) != 10) || num != 1
160
+ replacement = NUMBERS_TABLE.find{|x| x[:num] == num}[to]
161
+ converted_number << replacement
162
+ #elsif num != 1
163
+ #replacement = NUMBERS_TABLE.find{|x| x[:num] == num}[to]
164
+ #converted_number << replacement
165
+ end
166
+ end
167
+ end
168
+
169
+ converted_number.reverse!
170
+ end
171
+
172
+ def convert_number_to(to, from, number, separator = '')
173
+ return number unless [:zh_t, :zh_s, :num, :pyn].include? to
174
+
175
+ if from == :num
176
+ converted_number = convert_from_num(number, to)
177
+ else
178
+ converted_number = convert_from_zh number, to
179
+ end
180
+
181
+ #liang rules are tough...
182
+ converted_number.join(separator).gsub(/零[#{NUMBER_MULTIPLES}]/u,'')#.gsub(/二([百佰千仟仟万萬亿億])/){"#{NUMBERS_TABLE.find{|x|x[:pyn] == 'liang3'}[to]}#{$1}"}
183
+ end
184
+ end
185
+ end
@@ -0,0 +1,81 @@
1
+ # encoding: utf-8
2
module ZhongwenTools
  # Lookup table from fullwidth characters to their halfwidth (ASCII)
  # equivalents. Every key is the fullwidth form of its value, i.e. the
  # value's code point plus 0xFEE0.
  FW_HW = {
    "０" => "0", "１" => "1", "２" => "2", "３" => "3", "４" => "4",
    "５" => "5", "６" => "6", "７" => "7", "８" => "8", "９" => "9",
    "Ａ" => "A", "Ｂ" => "B", "Ｃ" => "C", "Ｄ" => "D", "Ｅ" => "E", "Ｆ" => "F",
    "Ｇ" => "G", "Ｈ" => "H", "Ｉ" => "I", "Ｊ" => "J", "Ｋ" => "K", "Ｌ" => "L",
    "Ｍ" => "M", "Ｎ" => "N", "Ｏ" => "O", "Ｐ" => "P", "Ｑ" => "Q", "Ｒ" => "R",
    "Ｓ" => "S", "Ｔ" => "T", "Ｕ" => "U", "Ｖ" => "V", "Ｗ" => "W", "Ｘ" => "X",
    "Ｙ" => "Y", "Ｚ" => "Z",
    "ａ" => "a", "ｂ" => "b", "ｃ" => "c", "ｄ" => "d", "ｅ" => "e", "ｆ" => "f",
    "ｇ" => "g", "ｈ" => "h", "ｉ" => "i", "ｊ" => "j", "ｋ" => "k", "ｌ" => "l",
    "ｍ" => "m", "ｎ" => "n", "ｏ" => "o", "ｐ" => "p", "ｑ" => "q", "ｒ" => "r",
    "ｓ" => "s", "ｔ" => "t", "ｕ" => "u", "ｖ" => "v", "ｗ" => "w", "ｘ" => "x",
    "ｙ" => "y", "ｚ" => "z",
    "％" => '%',
    "．" => '.',
    "：" => ':',
    "＃" => '#',
    "＄" => '$',
    "＆" => '&',
    "＋" => '+',
    "－" => '-',
    "／" => '/',
    "＼" => '\\',
    "＝" => '=',
    "；" => ';',
    "＜" => '<',
    "＞" => '>'
  }
end
@@ -0,0 +1,71 @@
1
+ #encoding: utf-8
2
+
3
+ class String
4
+ define_method(:chars) do
5
+ self.scan(/./mu).to_a
6
+ end
7
+
8
+ def size
9
+ self.chars.size
10
+ end
11
+
12
+ def reverse(str = nil)
13
+ self.chars.reverse.join
14
+ end
15
+ end
16
+
17
+ module ZhongwenTools
18
+ module String
19
# Converts the receiver to UTF-8 by trying candidate source encodings in
# order until Iconv accepts one.
#
# encoding  - optional encoding name to try first.
# encodings - optional Array of encoding names to try; defaults to
#             utf-8, GB18030, BIG5, GBK, GB2312.
#
# Returns the converted text.
# Raises RuntimeError ('Unable to Convert') once every candidate has failed.
def to_utf8(encoding = nil, encodings = nil)
  # Iconv is referenced below but nothing in this file loads it.
  require 'iconv' unless defined?(Iconv)

  #should substitute out known bad actors like space
  encodings = ['utf-8', 'GB18030', 'BIG5', 'GBK', 'GB2312'] if encodings.nil?
  # Wrap the single name in an Array: String + Array raises TypeError.
  encodings = [encoding] + encodings unless encoding.nil?
  raise 'Unable to Convert' if encodings.empty?

  begin
    Iconv.conv('utf-8', encodings[0], self)
  rescue
    # The first candidate failed; retry with the remaining ones.
    to_utf8(nil, encodings[1..-1])
  end
end
32
+
33
+ def convert_regex(regex)
34
+ str = regex.to_s
35
+ regex.to_s.scan(/u[0-9A-Z]{4}/).each{|cp| str = str.sub('\\' + cp,cp.from_codepoint)}
36
+ /#{str}/
37
+ end
38
+
39
+ def has_zh?(str = nil)
40
+ str ||= self
41
+
42
+ regex = {
43
+ :zh => self.convert_regex(UNICODE_REGEX[:zh]),
44
+ :punc => self.convert_regex(UNICODE_REGEX[:punc])
45
+ }
46
+ #str.scan(/#{regex[:zh]}|#{regex[:punc]}|\s/).join == str
47
+ !self.fullwidth?(str) && (!str[regex[:zh]].nil? || !str[regex[:punc]].nil?)
48
+ end
49
+
50
# Reports whether the text consists only of characters matched by the :zh
# and :punc patterns of UNICODE_REGEX, plus whitespace, and is not fullwidth.
#
# str - the String to inspect; defaults to the receiver.
#
# Returns true or false.
def zh?(str = nil)
  str ||= self

  regex = {
    :zh   => convert_regex(UNICODE_REGEX[:zh]),
    :punc => convert_regex(UNICODE_REGEX[:punc])
  }

  # Call fullwidth? on self with str as the argument (as has_zh? does) so the
  # check does not depend on core String having mixed in this module.
  !fullwidth?(str) && (str.scan(/(#{regex[:zh]}+|#{regex[:punc]}+|\s+)/).join == str)
end
60
+
61
+ def has_zh_punctuation?(str = nil)
62
+ str ||= self
63
+ regex = {
64
+ :zh => self.convert_regex(UNICODE_REGEX[:zh]),
65
+ :punc => self.convert_regex(UNICODE_REGEX[:punc])
66
+ }
67
+
68
+ !str[regex[:punc]].nil?
69
+ end
70
+ end
71
+ end
@@ -0,0 +1,6 @@
1
+ #encoding: utf-8
2
+ class String
3
+ define_method(:chars) do
4
+ self.scan(/./mu).to_a
5
+ end
6
+ end
@@ -0,0 +1,164 @@
1
+ # encoding: utf-8
2
+ #$:.unshift File.join(File.dirname(__FILE__),'..','lib','zhongwen_tools', 'string')
3
require 'uri'
# Resolve the path against this file rather than the current working
# directory, so the library also loads when it is not run from the project root.
require File.expand_path('../string/fullwidth', __FILE__)
5
+
6
+ module ZhongwenTools
7
+ module String
8
+ UNICODE_REGEX = {
9
+ :zh => /[\u2E80-\u2E99]|[\u2E9B-\u2EF3]|[\u2F00-\u2FD5]|[\u3005|\u3007]|[\u3021-\u3029]|[\u3038-\u303B]|[\u3400-\u4DB5]|[\u4E00-\u9FCC]|[\uF900-\uFA6D]|[\uFA70-\uFAD9]/,
10
+ :punc => /[\u0021-\u0023]|[\u0025-\u002A]|[\u002C-\u002F]|[\u003A\u003B\u003F\u0040]|[\u005B-\u005D\u005F\u007B\u007D\u00A1\u00A7\u00AB\u00B6\u00B7\u00BB\u00BF\u037E\u0387]|[\u055A-\u055F\u0589\u058A\u05BE\u05C0\u05C3\u05C6\u05F3\u05F4\u0609\u060A\u060C\u060D\u061B\u061E\u061F]|[\u066A-\u066D]|[\u06D4]|[\u0700-\u070D]|[\u07F7-\u07F9]|[\u0830-\u083E]|[\u085E\u0964\u0965\u0970\u0AF0\u0DF4\u0E4F\u0E5A\u0E5B]|[\u0F04-\u0F12]|[\u0F14]|[\u0F3A-\u0F3D]|[\u0F85]|[\u0FD0-\u0FD4]|[\u0FD9\u0FDA]|[\u104A-\u104F]|[\u10FB]|[\u1360-\u1368]|[\u1400\u166D\u166E\u169B\u169C]|[\u16EB-\u16ED]|[\u1735\u1736]|[\u17D4-\u17D6]|[\u17D8-\u17DA]|[\u1800-\u180A\u1944\u1945\u1A1E\u1A1F]|[\u1AA0-\u1AA6]|[\u1AA8-\u1AAD]|[\u1B5A-\u1B60]|[\u1BFC-\u1BFF]|[\u1C3B-\u1C3F]|[\u1C7E\u1C7F]|[\u1CC0-\u1CC7]|[\u1CD3]|[\u2010-\u2027]|[\u2030-\u2043]|[\u2045-\u2051]|[\u2053-\u205E]|[\u207D\u207E\u208D\u208E\u2329\u232A]|[\u2768-\u2775\u27C5\u27C6]|[\u27E6-\u27EF]|[\u2983-\u2998]|[\u29D8-\u29DB\u29FC\u29FD]|[\u2CF9-\u2CFC]|[\u2CFE\u2CFF\u2D70]|[\u2E00-\u2E2E]|[\u2E30-\u2E3B]|[\u3001-\u3003]|[\u3008-\u3011]|[\u3014-\u301F]|[\u3030\u303D\u30A0\u30FB\uA4FE\uA4FF]|[\uA60D-\uA60F]|[\uA673\uA67E]|[\uA6F2-\uA6F7]|[\uA874-\uA877]|[\uA8CE\uA8CF]|[\uA8F8-\uA8FA]|[\uA92E\uA92F\uA95F]|[\uA9C1-\uA9CD]|[\uA9DE\uA9DF]|[\uAA5C-\uAA5F]|[\uAADE\uAADF\uAAF0\uAAF1\uABEB\uFD3E\uFD3F]|[\uFE10-\uFE19]|[\uFE30-\uFE52]|[\uFE54-\uFE61]|[\uFE63\uFE68\uFE6A\uFE6B]|[\uFF01-\uFF03]|[\uFF05-\uFF0A]|[\uFF0C-\uFF0F]|[\uFF1A\uFF1B\uFF1F\uFF20]|[\uFF3B-\uFF3D]|[\uFF3F\uFF5B\uFF5D]|[\uFF5F-\uFF65]/
11
+ }
12
+
13
+ def to_utf8(str = nil)
14
+ (str || self).force_encoding('utf-8')
15
+ #TODO: better conversion functions available in categorize
16
+ end
17
+
18
+ def has_zh?(str = nil)
19
+ str ||= self
20
+
21
+ !str[/(#{UNICODE_REGEX[:zh]}|#{UNICODE_REGEX[:punc]})/].nil?
22
+ end
23
+
24
+ def zh?(str = nil)
25
+ str ||= self
26
+
27
+ str.scan(/(#{UNICODE_REGEX[:zh]}+|#{UNICODE_REGEX[:punc]}+|\s+)/).join == str
28
+ end
29
+
30
+ def has_zh_punctuation?(str = nil)
31
+ str ||= self
32
+
33
+ !str[UNICODE_REGEX[:punc]].nil?
34
+ end
35
+
36
+ def size(str = nil)
37
+ str ||= self
38
+ str.chars.size
39
+ end
40
+
41
+ def chars(str = nil)
42
+ (str || self).scan(/./mu).to_a
43
+ end
44
+
45
+ def reverse(str = nil)
46
+ str ||= self
47
+ str.chars.reverse.join
48
+ end
49
+
50
+ def uri_encode(str = nil)
51
+ str ||= self
52
+ URI.encode str
53
+ end
54
+
55
+ def uri_escape(str = nil)
56
+ str ||= self
57
+
58
+ URI.escape(str, Regexp.new("[^#{URI::PATTERN::UNRESERVED}]"))
59
+ end
60
+
61
# Reports whether the text has exactly as many bytes as characters.
#
# str - the String to inspect; defaults to the receiver.
#
# Returns true or false.
def ascii?(str = nil)
  text = str || self
  char_count = text.chars.size
  byte_count = text.bytes.to_a.size
  char_count == byte_count
end
65
+
66
# Reports whether the text is not ASCII according to #ascii?.
#
# str - the String to inspect; defaults to the receiver.
#
# Returns true or false.
def multibyte?(str = nil)
  # Delegate to ascii? on self, which itself falls back to the receiver when
  # str is nil. Calling str.ascii? would need core String to include this module.
  !ascii?(str)
end
69
+
70
# Reports whether the text contains no fullwidth digits, Latin letters or
# fullwidth forms of % . : # $ & + - / \ = ; < >.
#
# str - the String to inspect; defaults to the receiver.
#
# Returns true when none of those fullwidth characters occur.
def halfwidth?(str = nil)
  str ||= self
  # The class must list the fullwidth code points; with ASCII characters the
  # result is inverted and the unescaped "/" ends the regexp literal early.
  # The /u flag matches the sibling pattern in to_halfwidth.
  str[/[０-９Ａ-Ｚａ-ｚ％．：＃＄＆＋－／＼＝；＜＞]/u].nil?
end
74
+
75
+ def fullwidth?(str = nil)
76
+ str ||= self
77
+ !self.halfwidth?(str) && self.to_halfwidth(str) != str
78
+ end
79
+
80
# Replaces fullwidth digits, Latin letters and the fullwidth forms of
# % . : # $ & + - / \ = ; < > with the halfwidth value found in FW_HW.
#
# str - the String to convert; defaults to the receiver.
#
# Returns a new String. A matched character that has no FW_HW entry is left
# unchanged.
def to_halfwidth(str = nil)
  str ||= self
  # The pattern lists the fullwidth code points. Substitution happens in one
  # pass; fetch with a default avoids handing nil to gsub.
  str.gsub(/[０-９Ａ-Ｚａ-ｚ％．：＃＄＆＋－／＼＝；＜＞]/u) do |fullwidth|
    FW_HW.fetch(fullwidth, fullwidth)
  end
end
91
+
92
# Renders every character of the text as a "\uXXXX" escape, using at least
# four lowercase hex digits per character.
#
# str - the String to convert; defaults to the receiver.
#
# Returns the concatenated escapes as a String.
def to_codepoint(str = nil)
  text = str || self
  escapes = text.chars.map do |char|
    format("\\u%04x", char.unpack("U").first)
  end
  escapes.join
end
99
+
100
# Turns a single escape such as "u7f8a" or "\u7f8a" into the character it
# names.
#
# str - the escape String; defaults to the receiver.
#
# Returns a one-character UTF-8 String.
def from_codepoint(str = nil)
  # Drop the first "u" (with an optional leading backslash), then read the
  # remainder as hexadecimal.
  hex_digits = (str || self).sub(/\\?u/, '')
  [hex_digits.hex].pack("U")
end
105
+
106
+ class Basement #:nodoc:
107
+ include ZhongwenTools::String
108
+ end
109
+ def self.chars(*args)
110
+ Basement.new.chars(*args)
111
+ end
112
+ def self.size(*args)
113
+ Basement.new.size(*args)
114
+ end
115
+ def self.reverse(*args)
116
+ Basement.new.reverse(*args)
117
+ end
118
+ def self.to_utf8(*args)
119
+ Basement.new.to_utf8(*args)
120
+ end
121
+ def self.uri_encode(*args)
122
+ Basement.new.uri_encode(*args)
123
+ end
124
+ def self.uri_escape(*args)
125
+ Basement.new.uri_escape(*args)
126
+ end
127
+ def self.ascii?(*args)
128
+ Basement.new.ascii?(*args)
129
+ end
130
+ def self.multibyte?(*args)
131
+ Basement.new.multibyte?(*args)
132
+ end
133
+ def self.halfwidth?(*args)
134
+ Basement.new.halfwidth?(*args)
135
+ end
136
+ def self.fullwidth?(*args)
137
+ Basement.new.fullwidth?(*args)
138
+ end
139
+ def self.to_halfwidth(*args)
140
+ Basement.new.to_halfwidth(*args)
141
+ end
142
+ def self.has_zh?(*args)
143
+ Basement.new.has_zh?(*args)
144
+ end
145
+ def self.has_zh_punctuation?(*args)
146
+ Basement.new.has_zh_punctuation?(*args)
147
+ end
148
+ def self.zh?(*args)
149
+ Basement.new.zh?(*args)
150
+ end
151
+ def self.to_codepoint(*args)
152
+ Basement.new.to_codepoint(*args)
153
+ end
154
+ def self.from_codepoint(*args)
155
+ Basement.new.from_codepoint(*args)
156
+ end
157
+ end
158
+ end
159
+
160
# Load the String patches for older interpreters. Paths are resolved against
# this file rather than the current working directory, so the library also
# loads when it is not run from the project root.
if RUBY_VERSION < '1.9'
  require File.expand_path('../string/ruby18', __FILE__)
elsif RUBY_VERSION < '2.0'
  require File.expand_path('../string/ruby19', __FILE__)
end
@@ -0,0 +1,8 @@
1
+ # encoding: utf-8
2
+ require File.expand_path("../zhongwen_tools/string", __FILE__)
3
+ require File.expand_path("../zhongwen_tools/numbers", __FILE__)
4
+ #require File.expand_path("../zhongwen_tools/romanization", __FILE__)
5
+ #require File.expand_path("../zhongwen_tools/conversion", __FILE__)
6
+
7
+ module ZhongwenTools
8
+ end
File without changes
@@ -0,0 +1,14 @@
1
+ begin
2
+ require 'coveralls'
3
+ Coveralls.wear!
4
+ rescue LoadError
5
+ puts 'Coverage disabled.'
6
+ end
7
+
8
+ begin
9
+ require 'pry'
10
+ rescue LoadError
11
+ puts 'Pry disabled'
12
+ end
13
+
14
+ require 'test/unit'
@@ -0,0 +1,53 @@
1
+ #encoding: utf-8
2
+ $:.unshift File.join(File.dirname(__FILE__),'..','lib')
3
+
4
+ require './test/test_helper'
5
+ require 'zhongwen_tools/string'
6
+ require 'zhongwen_tools/numbers'
7
+
8
+ class TestCJKTools < Test::Unit::TestCase
9
+ include ZhongwenTools::Numbers
10
# Checks that every Chinese numeral fixture in @numbers converts to its
# expected Integer.
def test_convert_to_numbers
  @numbers.each do |num|
    number = convert_chinese_numbers_to_numbers num[:zh]
    # No debugger hook here: binding.pry is undefined when pry is not
    # installed and blocks an unattended run when it is.
    assert_equal num[:en], number, "converting #{num[:zh]}"
  end
end
19
+
20
+ def test_convert_to_traditional_number
21
+ zhs = @numbers[0][:zh]
22
+ zht = convert_number_to_traditional :zh_s, zhs
23
+
24
+ assert_equal '一萬兩千七', zht
25
+ end
26
+
27
+ def test_convert_to_simplified_from_number
28
+ #skip
29
+ num = @numbers[0][:en]
30
+ zht = convert_number_to_traditional :num, num
31
+
32
+ #adds garbage!!
33
+ assert_equal '一萬二千七', zht
34
+ end
35
+
36
+ def test_convert_number_to_pyn
37
+ num = '一百三十六'
38
+ pyn = self.convert_number_to_pyn num
39
+
40
+ assert_equal 'yi1-bai2-san1-shi2-liu4', pyn
41
+ end
42
+
43
+ def setup
44
+ @numbers = [
45
+ {:zh =>'一万两千七', :en => 12007},
46
+ {:zh => '三千六十三', :en => 3063},
47
+ {:zh => '一百五十', :en => 150 },
48
+ {:zh => '三千亿', :en => 300000000000},
49
+ {:zh => '一九六六', :en => 1966},
50
+ {:zh => '二零零八', :en => 2008},
51
+ ]
52
+ end
53
+ end
File without changes
@@ -0,0 +1,123 @@
1
+ #encoding: utf-8
2
+ $:.unshift File.join(File.dirname(__FILE__),'..','lib')
3
+ require './test/test_helper'
4
+ require 'zhongwen_tools/string'
5
+
6
+ class String
7
+ include ZhongwenTools::String
8
+ end
9
+
10
+ if RUBY_VERSION < '1.9'
11
+ class Test::Unit::TestCase
12
+ def refute(statement, message = '')
13
+ assert !statement, message
14
+ end
15
+ end
16
+ end
17
+
18
+ class TestString < Test::Unit::TestCase
19
+
20
+ def test_size
21
+ assert_equal 2, @str.size
22
+ assert_equal 2, ZhongwenTools::String.size(@str)
23
+ end
24
+
25
+ def test_chars
26
+ assert_equal %w(中 文), @str.chars
27
+
28
+ assert_equal %w(中 文), ZhongwenTools::String.chars(@str)
29
+ end
30
+
31
+ def test_reverse
32
+ assert_equal '文中', '中文'.reverse
33
+
34
+ assert_equal '文中', ZhongwenTools::String.reverse('中文')
35
+ end
36
+
37
+ def test_ascii
38
+ refute @str.ascii?
39
+ assert 'zhongwen'.ascii?
40
+ assert @str.multibyte?
41
+
42
+ refute ZhongwenTools::String.ascii? @str
43
+ assert ZhongwenTools::String.ascii? 'zhongwen'
44
+ assert ZhongwenTools::String.multibyte? @str
45
+ end
46
+
47
+ def test_halfwidth
48
+ str = 'hello'
49
+ refute str.halfwidth?
50
+ assert_equal str.to_halfwidth, 'hello'
51
+ assert str.to_halfwidth.halfwidth?
52
+
53
+ refute ZhongwenTools::String.halfwidth? str
54
+ assert_equal ZhongwenTools::String.to_halfwidth(str), 'hello'
55
+ assert ZhongwenTools::String.halfwidth?(ZhongwenTools::String.to_halfwidth(str))
56
+ end
57
+
58
+ def test_fullwidth
59
+ str = 'hello'
60
+ assert str.fullwidth?
61
+ refute @str.fullwidth?
62
+
63
+ assert ZhongwenTools::String.fullwidth? str
64
+ end
65
+
66
+ def test_uri_encode
67
+ url = 'http://www.3000hanzi.com/chinese-to-english/definition/好'
68
+ assert_equal URI.encode('好'), '好'.uri_encode
69
+
70
+ assert_equal "http://www.3000hanzi.com/chinese-to-english/definition/#{URI.encode '好'}", ZhongwenTools::String.uri_encode(url)
71
+ assert_equal "http://www.3000hanzi.com/chinese-to-english/definition/#{URI.encode '好'}", url.uri_encode
72
+ end
73
+
74
+ def test_uri_escape
75
+ url = 'http://www.3000hanzi.com/chinese-to-english/definition/好'
76
+ regex = Regexp.new("[^#{URI::PATTERN::UNRESERVED}]")
77
+
78
+ assert_equal URI.escape(url, regex), ZhongwenTools::String.uri_escape(url)
79
+ assert_equal URI.escape(url, regex), url.uri_escape
80
+ end
81
+
82
+ def test_has_zh
83
+ assert @str.has_zh?
84
+ refute @hw.has_zh?
85
+ refute @fw.has_zh?
86
+
87
+ assert ZhongwenTools::String.has_zh? @str
88
+ refute ZhongwenTools::String.has_zh? @hw
89
+ refute ZhongwenTools::String.has_zh? @fw
90
+ end
91
+
92
+ def test_is_zh
93
+ assert @str.zh?
94
+ assert @zh_punc.zh?
95
+
96
+ assert ZhongwenTools::String.zh? @str
97
+ assert ZhongwenTools::String.zh? @zh_punc
98
+ end
99
+
100
+ def test_codepoint
101
+ assert_equal "\\u4e2d\\u6587", @str.to_codepoint
102
+ assert_equal '羊', 'u7f8a'.from_codepoint
103
+ assert_equal '羊', '\\u7f8a'.from_codepoint
104
+
105
+ assert_equal "\\u4e2d\\u6587", ZhongwenTools::String.to_codepoint(@str)
106
+ assert_equal '羊', ZhongwenTools::String.from_codepoint('u7f8a')
107
+ assert_equal '羊', ZhongwenTools::String.from_codepoint('\\u7f8a')
108
+ end
109
+
110
+ def test_punctuation
111
+ assert ZhongwenTools::String.has_zh_punctuation?(@zh_punc)
112
+
113
+ assert @zh_punc.has_zh_punctuation?
114
+ end
115
+
116
+ def setup
117
+ @str = '中文'
118
+ @fw = 'hello'
119
+ @hw = 'hello'
120
+ @zh_punc = '不错吧!'
121
+ end
122
+
123
+ end
@@ -0,0 +1,27 @@
1
+ # -*- encoding: utf-8 -*-
2
+ $:.push File.expand_path("../lib", __FILE__)
3
+
4
+ Gem::Specification.new do |s|
5
+ s.name = "zhongwen_tools"
6
+ s.license = "MIT"
7
+ s.version = "0.0.6"
8
+ s.authors = ["Steven Daniels"]
9
+ s.email = ["steven@tastymantou.com"]
10
+ s.homepage = "https://github.com/stevendaniels/zhongwen_tools"
11
+ s.summary = %q{Zhongwen Tools provide romanization conversions and helper methods for Chinese.}
12
+ s.description = %q{Chinese tools for romanization conversions and other helpful string functions for Chinese.}
13
+
14
+ s.rubyforge_project = "zhongwen_tools"
15
+
16
+ s.files = `git ls-files`.split("\n")
17
+ s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
18
+ s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
19
+ s.require_paths = ["lib"]
20
+
21
+ s.add_development_dependency('rake', "~> 10.1")
22
+ if RUBY_VERSION >= '1.9'
23
+ s.add_development_dependency('simplecov', "~> 0.7")
24
+ s.add_development_dependency('simplecov-gem-adapter', "~> 1.0.1")
25
+ s.add_development_dependency('coveralls', "~> 0.7.0")
26
+ end
27
+ end
metadata ADDED
@@ -0,0 +1,123 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: zhongwen_tools
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.6
5
+ platform: ruby
6
+ authors:
7
+ - Steven Daniels
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-01-18 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: rake
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ~>
18
+ - !ruby/object:Gem::Version
19
+ version: '10.1'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ~>
25
+ - !ruby/object:Gem::Version
26
+ version: '10.1'
27
+ - !ruby/object:Gem::Dependency
28
+ name: simplecov
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ~>
32
+ - !ruby/object:Gem::Version
33
+ version: '0.7'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ~>
39
+ - !ruby/object:Gem::Version
40
+ version: '0.7'
41
+ - !ruby/object:Gem::Dependency
42
+ name: simplecov-gem-adapter
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ~>
46
+ - !ruby/object:Gem::Version
47
+ version: 1.0.1
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ~>
53
+ - !ruby/object:Gem::Version
54
+ version: 1.0.1
55
+ - !ruby/object:Gem::Dependency
56
+ name: coveralls
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ~>
60
+ - !ruby/object:Gem::Version
61
+ version: 0.7.0
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ~>
67
+ - !ruby/object:Gem::Version
68
+ version: 0.7.0
69
+ description: Chinese tools for romanization conversions and other helpful string functions
70
+ for Chinese.
71
+ email:
72
+ - steven@tastymantou.com
73
+ executables: []
74
+ extensions: []
75
+ extra_rdoc_files: []
76
+ files:
77
+ - .travis.yml
78
+ - Gemfile
79
+ - Gemfile.1.8.7
80
+ - README.md
81
+ - Rakefile
82
+ - lib/zhongwen_tools.rb
83
+ - lib/zhongwen_tools/numbers.rb
84
+ - lib/zhongwen_tools/string.rb
85
+ - lib/zhongwen_tools/string/fullwidth.rb
86
+ - lib/zhongwen_tools/string/ruby18.rb
87
+ - lib/zhongwen_tools/string/ruby19.rb
88
+ - test/test_conversion.rb
89
+ - test/test_helper.rb
90
+ - test/test_numbers.rb
91
+ - test/test_romanization.rb
92
+ - test/test_string.rb
93
+ - zhongwen_tools.gemspec
94
+ homepage: https://github.com/stevendaniels/zhongwen_tools
95
+ licenses:
96
+ - MIT
97
+ metadata: {}
98
+ post_install_message:
99
+ rdoc_options: []
100
+ require_paths:
101
+ - lib
102
+ required_ruby_version: !ruby/object:Gem::Requirement
103
+ requirements:
104
+ - - '>='
105
+ - !ruby/object:Gem::Version
106
+ version: '0'
107
+ required_rubygems_version: !ruby/object:Gem::Requirement
108
+ requirements:
109
+ - - '>='
110
+ - !ruby/object:Gem::Version
111
+ version: '0'
112
+ requirements: []
113
+ rubyforge_project: zhongwen_tools
114
+ rubygems_version: 2.0.3
115
+ signing_key:
116
+ specification_version: 4
117
+ summary: Zhongwen Tools provide romanization conversions and helper methods for Chinese.
118
+ test_files:
119
+ - test/test_conversion.rb
120
+ - test/test_helper.rb
121
+ - test/test_numbers.rb
122
+ - test/test_romanization.rb
123
+ - test/test_string.rb