tsukiko 0.1.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/data/tw_cn.data +0 -0
- data/lib/tsukiko.rb +56 -15
- metadata +3 -2
data/data/tw_cn.data
ADDED
|
Binary file
|
data/lib/tsukiko.rb
CHANGED
|
@@ -3,39 +3,73 @@
|
|
|
3
3
|
#encoding:utf-8
|
|
4
4
|
class Tsukiko
|
|
5
5
|
def initialize(dic_path=File.dirname(__FILE__))
|
|
6
|
-
@words=Hash.new
|
|
7
|
-
@han=Hash.new
|
|
8
|
-
@bigram=Hash.new
|
|
9
6
|
@words=Marshal.load(File.open(dic_path+"/../data/words.data","rb").read)
|
|
10
7
|
@han=Marshal.load(File.open(dic_path+"/../data/cn_tw.data","rb").read)
|
|
11
8
|
@bigram=Marshal.load(File.open(dic_path+"/../data/bigram.data","rb").read)
|
|
9
|
+
@trhan=Marshal.load(File.open(dic_path+"/../data/tw_cn.data","rb").read)
|
|
12
10
|
end
|
|
13
|
-
|
|
11
|
+
def convert_words(str)
|
|
14
12
|
@words.each{|cn,tw|
|
|
15
|
-
str.gsub!(cn," "+tw+" ")
|
|
13
|
+
str.gsub!(cn," #%"+tw+"%# ")
|
|
16
14
|
}
|
|
17
15
|
@result=str
|
|
18
16
|
end
|
|
17
|
+
def convert_words_tw(str)
|
|
18
|
+
words=@words.invert
|
|
19
|
+
words.each{|tw,cn|
|
|
20
|
+
str.gsub!(tw," "+cn+" ")
|
|
21
|
+
}
|
|
22
|
+
@result=str
|
|
23
|
+
end
|
|
24
|
+
def convert_han_tw(str)
|
|
25
|
+
# puts str
|
|
26
|
+
tmpstr=str.clone
|
|
27
|
+
while tmpstr.sub!(/[\u4e00-\u9fa5]+/,"")
|
|
28
|
+
# matches=str.match(/(\S+)/)
|
|
29
|
+
tmp=$&.chomp
|
|
30
|
+
# puts tmp
|
|
31
|
+
# puts @trhan["卜"]
|
|
32
|
+
tmpp=tmp.clone
|
|
33
|
+
for i in 0..tmp.length-1
|
|
34
|
+
# puts @trhan[tmp[i]]
|
|
35
|
+
if @trhan[tmp[i]]==0
|
|
36
|
+
tmpp[i]=tmp[i]
|
|
37
|
+
next
|
|
38
|
+
else
|
|
39
|
+
tmpp[i]=@trhan[tmp[i]]
|
|
40
|
+
#convert each character
|
|
41
|
+
end
|
|
42
|
+
end
|
|
43
|
+
@result=str.gsub!(tmp,tmpp)
|
|
44
|
+
end
|
|
45
|
+
@result
|
|
46
|
+
end
|
|
19
47
|
# call after convert_words
|
|
20
48
|
def convert_han(str)
|
|
21
49
|
tmpstr=str.clone
|
|
22
50
|
while tmpstr.sub!(/\S+/,"")
|
|
23
51
|
# matches=str.match(/(\S+)/)
|
|
24
52
|
tmp=$&.chomp
|
|
53
|
+
if tmp[0..1]=="#%"
|
|
54
|
+
next
|
|
55
|
+
else
|
|
25
56
|
# puts tmp
|
|
26
|
-
|
|
27
|
-
|
|
57
|
+
tmpp=tmp.clone
|
|
58
|
+
for i in 0..tmp.length-1
|
|
28
59
|
#convert each character
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
60
|
+
if @han[tmp[i]]==1
|
|
61
|
+
tmpp[i]=@han[tmp[i]][0]
|
|
62
|
+
elsif @han[tmp[i]]==nil
|
|
63
|
+
|
|
64
|
+
# break
|
|
32
65
|
tmpp[i]=tmp[i]
|
|
33
|
-
|
|
34
|
-
|
|
66
|
+
else
|
|
67
|
+
tmpp[i]=use_bigram(tmpp,i)
|
|
68
|
+
end
|
|
35
69
|
end
|
|
36
|
-
end
|
|
37
70
|
@result=str.gsub!(tmp,tmpp)
|
|
38
|
-
|
|
71
|
+
end
|
|
72
|
+
end
|
|
39
73
|
@result
|
|
40
74
|
end
|
|
41
75
|
# when there is a 1:n conversion
|
|
@@ -66,10 +100,17 @@ class Tsukiko
|
|
|
66
100
|
# puts @bigram[str]
|
|
67
101
|
return @bigram[str]
|
|
68
102
|
end
|
|
69
|
-
|
|
103
|
+
# convert simplified Chinese into traditional Chinese
|
|
70
104
|
def convert(str)
|
|
71
105
|
@result=convert_han(convert_words(str))
|
|
72
106
|
# @result=convert_words(str)
|
|
107
|
+
@result.gsub!(" #%","")
|
|
108
|
+
@result.gsub!("%# ","")
|
|
109
|
+
@result
|
|
110
|
+
end
|
|
111
|
+
def convert_tw(str)
|
|
112
|
+
@result=convert_han_tw(convert_words_tw(str))
|
|
113
|
+
# @result=convert_words(str)
|
|
73
114
|
@result.gsub!(" ","")
|
|
74
115
|
@result
|
|
75
116
|
end
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: tsukiko
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.1
|
|
4
|
+
version: 0.2.1
|
|
5
5
|
prerelease:
|
|
6
6
|
platform: ruby
|
|
7
7
|
authors:
|
|
@@ -9,7 +9,7 @@ authors:
|
|
|
9
9
|
autorequire:
|
|
10
10
|
bindir: bin
|
|
11
11
|
cert_chain: []
|
|
12
|
-
date: 2013-12-
|
|
12
|
+
date: 2013-12-25 00:00:00.000000000 Z
|
|
13
13
|
dependencies: []
|
|
14
14
|
description: a tool to convert simplified Chinsese into tradtional Chinese
|
|
15
15
|
email:
|
|
@@ -22,6 +22,7 @@ files:
|
|
|
22
22
|
- data/bigram.data
|
|
23
23
|
- data/cn_tw.data
|
|
24
24
|
- data/words.data
|
|
25
|
+
- data/tw_cn.data
|
|
25
26
|
homepage: ''
|
|
26
27
|
licenses: []
|
|
27
28
|
post_install_message:
|