tsukiko 0.1.0 → 0.2.1
Sign up to get free protection for your applications and to get access to all the features.
- data/data/tw_cn.data +0 -0
- data/lib/tsukiko.rb +56 -15
- metadata +3 -2
data/data/tw_cn.data
ADDED
Binary file
|
data/lib/tsukiko.rb
CHANGED
@@ -3,39 +3,73 @@
|
|
3
3
|
#encoding:utf-8
|
4
4
|
class Tsukiko
|
5
5
|
# Loads the Marshal-serialized lookup tables used by the converters.
#
# Files read, all relative to "<dic_path>/../data":
#   words.data  -> @words  (used by convert_words / convert_words_tw)
#   cn_tw.data  -> @han    (used by convert_han)
#   bigram.data -> @bigram (used by use_bigram)
#   tw_cn.data  -> @trhan  (used by convert_han_tw)
#
# @param dic_path [String] directory whose sibling "data" directory holds
#   the table files; defaults to the directory of this source file.
# @raise [SystemCallError] (e.g. Errno::ENOENT) when a table file is missing.
def initialize(dic_path=File.dirname(__FILE__))
  # File.binread opens, reads and closes in one call; the previous
  # File.open(...).read left every handle open until garbage collection.
  # NOTE(review): Marshal.load is only safe on trusted files -- these are
  # expected to be the data files shipped with the gem.
  @words  = Marshal.load(File.binread(dic_path + "/../data/words.data"))
  @han    = Marshal.load(File.binread(dic_path + "/../data/cn_tw.data"))
  @bigram = Marshal.load(File.binread(dic_path + "/../data/bigram.data"))
  @trhan  = Marshal.load(File.binread(dic_path + "/../data/tw_cn.data"))
end
|
13
|
-
|
11
|
+
# Replaces every occurrence of each @words key in +str+ with its mapped
# value wrapped in the markers " #%" and "%# ".
#
# convert_han skips whitespace-delimited tokens that start with "#%", and
# convert strips the markers again afterwards.
#
# @param str [String] text to rewrite; it is modified IN PLACE.
# @return [String] the same (mutated) string, also stored in @result.
def convert_words(str)
  @words.each do |cn, tw|
    # Block form inserts the replacement literally; a plain replacement
    # string would have backslash sequences such as "\\0" interpreted.
    str.gsub!(cn) { " #%" + tw + "%# " }
  end
  @result = str
end
|
17
|
+
# Reverse of convert_words: replaces every occurrence of each @words VALUE
# in +str+ with its key, padded with one space on each side.
#
# The table is inverted with Hash#invert, so when several keys share the
# same value the last one wins.
#
# @param str [String] text to rewrite; it is modified IN PLACE.
# @return [String] the same (mutated) string, also stored in @result.
def convert_words_tw(str)
  @words.invert.each do |tw, cn|
    # Block form inserts the replacement literally; a plain replacement
    # string would have backslash sequences such as "\\0" interpreted.
    str.gsub!(tw) { " " + cn + " " }
  end
  @result = str
end
|
24
|
+
# Maps each character of +str+ in the range U+4E00..U+9FA5 through the
# @trhan table (loaded from tw_cn.data).
#
# A character is kept unchanged when the table yields 0 (the sentinel the
# original code tested for) or has no entry at all.
# NOTE(review): table values are assumed to be replacement strings --
# confirm against the generator of tw_cn.data.
#
# Earlier behaviour fixed here:
# * a run occurring twice made the second whole-string gsub! find nothing,
#   so the method returned nil;
# * input without any matching character returned a stale @result from a
#   previous call;
# * a character missing from the table raised TypeError.
#
# @param str [String] text to rewrite; it is modified IN PLACE.
# @return [String] the same (mutated) string, also stored in @result.
def convert_han_tw(str)
  # gsub! returns nil when nothing matched, so its return value is ignored.
  str.gsub!(/[\u4e00-\u9fa5]/) do |ch|
    mapped = @trhan[ch]
    (mapped.nil? || mapped == 0) ? ch : mapped
  end
  @result = str
end
|
19
47
|
# call after convert_words
|
20
48
|
# Converts +str+ character by character using the @han table. Intended to
# run after convert_words.
#
# The string is processed one whitespace-delimited token at a time. Tokens
# starting with "#%" (the marker convert_words puts around phrases it has
# already converted) are left untouched. For every other token each
# character is looked up in @han:
# * no entry           -> character kept;
# * entry equal to 1   -> first element of the entry;
# * anything else      -> result of use_bigram(partially_converted_token, i).
#
# NOTE(review): the `== 1` test is kept exactly as written. If @han values
# are strings/arrays of candidates this branch never fires and
# `.length == 1` was probably intended -- confirm against the data format
# before changing it.
#
# Earlier behaviour fixed here:
# * a token occurring twice made the second whole-string gsub! find
#   nothing, so the method returned nil;
# * when every token was skipped (or +str+ was blank) a stale @result from
#   a previous call was returned;
# * the substring gsub! could rewrite text inside "#%"-protected tokens.
#
# @param str [String] text to rewrite; it is modified IN PLACE.
# @return [String] the same (mutated) string, also stored in @result.
def convert_han(str)
  # gsub! returns nil when nothing matched, so its return value is ignored.
  str.gsub!(/\S+/) do |token|
    next token if token.start_with?("#%")

    converted = token.clone
    token.each_char.with_index do |ch, i|
      entry = @han[ch]
      if entry == 1
        converted[i] = entry[0]
      elsif entry.nil?
        converted[i] = ch
      else
        # 1:n mapping: let the bigram model choose using the context
        # converted so far.
        converted[i] = use_bigram(converted, i)
      end
    end
    converted
  end
  @result = str
end
|
41
75
|
# when there is a 1:n conversion
|
@@ -66,10 +100,17 @@ class Tsukiko
|
|
66
100
|
# puts @bigram[str]
|
67
101
|
return @bigram[str]
|
68
102
|
end
|
69
|
-
|
103
|
+
# convert Simplified Chinese into Traditional Chinese
|
70
104
|
# Converts Simplified Chinese text to Traditional Chinese.
#
# Pipeline: convert_words wraps dictionary phrases in " #%" ... "%# "
# markers, convert_han converts the remaining characters, and finally the
# markers are stripped.
#
# @param str [String] text to convert; the helpers modify it IN PLACE.
# @return [String] the converted text, also stored in @result.
def convert(str)
  # convert_han can hand back nil (its last in-place gsub! found nothing);
  # the helpers mutate +str+ itself, so +str+ already holds the converted
  # text and is a safe fallback instead of crashing on nil.gsub!.
  @result = convert_han(convert_words(str)) || str
  @result.gsub!(" #%", "")
  @result.gsub!("%# ", "")
  @result
end
|
111
|
+
# Reverse conversion: convert_words_tw replaces @words values with their
# keys, convert_han_tw maps characters through @trhan, and the padding
# spaces added by convert_words_tw are removed.
# (Presumably Traditional -> Simplified Chinese, judging by the tw_cn.data
# table name -- confirm.)
#
# NOTE(review): ALL spaces are removed from the result, including spaces
# that were present in the caller's input.
#
# @param str [String] text to convert; the helpers modify it IN PLACE.
# @return [String] the converted text, also stored in @result.
def convert_tw(str)
  # convert_han_tw can hand back nil (its last in-place gsub! found
  # nothing); the helpers mutate +str+ itself, so +str+ is a safe fallback
  # instead of crashing on nil.gsub!.
  @result = convert_han_tw(convert_words_tw(str)) || str
  @result.gsub!(" ", "")
  @result
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tsukiko
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1
|
4
|
+
version: 0.2.1
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-12-
|
12
|
+
date: 2013-12-25 00:00:00.000000000 Z
|
13
13
|
dependencies: []
|
14
14
|
description: a tool to convert simplified Chinsese into tradtional Chinese
|
15
15
|
email:
|
@@ -22,6 +22,7 @@ files:
|
|
22
22
|
- data/bigram.data
|
23
23
|
- data/cn_tw.data
|
24
24
|
- data/words.data
|
25
|
+
- data/tw_cn.data
|
25
26
|
homepage: ''
|
26
27
|
licenses: []
|
27
28
|
post_install_message:
|