tsukiko 0.1.0 → 0.2.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (3) hide show
  1. data/data/tw_cn.data +0 -0
  2. data/lib/tsukiko.rb +56 -15
  3. metadata +3 -2
data/data/tw_cn.data ADDED
Binary file
data/lib/tsukiko.rb CHANGED
@@ -3,39 +3,73 @@
3
3
  #encoding:utf-8
4
4
  class Tsukiko
5
5
  def initialize(dic_path=File.dirname(__FILE__))
6
- @words=Hash.new
7
- @han=Hash.new
8
- @bigram=Hash.new
9
6
  @words=Marshal.load(File.open(dic_path+"/../data/words.data","rb").read)
10
7
  @han=Marshal.load(File.open(dic_path+"/../data/cn_tw.data","rb").read)
11
8
  @bigram=Marshal.load(File.open(dic_path+"/../data/bigram.data","rb").read)
9
+ @trhan=Marshal.load(File.open(dic_path+"/../data/tw_cn.data","rb").read)
12
10
  end
13
- def convert_words(str)
11
+ def convert_words(str)
14
12
  @words.each{|cn,tw|
15
- str.gsub!(cn," "+tw+" ")
13
+ str.gsub!(cn," #%"+tw+"%# ")
16
14
  }
17
15
  @result=str
18
16
  end
17
+ def convert_words_tw(str)
18
+ words=@words.invert
19
+ words.each{|tw,cn|
20
+ str.gsub!(tw," "+cn+" ")
21
+ }
22
+ @result=str
23
+ end
24
+ def convert_han_tw(str)
25
+ # puts str
26
+ tmpstr=str.clone
27
+ while tmpstr.sub!(/[\u4e00-\u9fa5]+/,"")
28
+ # matches=str.match(/(\S+)/)
29
+ tmp=$&.chomp
30
+ # puts tmp
31
+ # puts @trhan["卜"]
32
+ tmpp=tmp.clone
33
+ for i in 0..tmp.length-1
34
+ # puts @trhan[tmp[i]]
35
+ if @trhan[tmp[i]]==0
36
+ tmpp[i]=tmp[i]
37
+ next
38
+ else
39
+ tmpp[i]=@trhan[tmp[i]]
40
+ #convert each character
41
+ end
42
+ end
43
+ @result=str.gsub!(tmp,tmpp)
44
+ end
45
+ @result
46
+ end
19
47
  # call after convert_words
20
48
  def convert_han(str)
21
49
  tmpstr=str.clone
22
50
  while tmpstr.sub!(/\S+/,"")
23
51
  # matches=str.match(/(\S+)/)
24
52
  tmp=$&.chomp
53
+ if tmp[0..1]=="#%"
54
+ next
55
+ else
25
56
  # puts tmp
26
- tmpp=tmp.clone
27
- for i in 0..tmp.length-1
57
+ tmpp=tmp.clone
58
+ for i in 0..tmp.length-1
28
59
  #convert each character
29
- if @han[tmp[i]]==1
30
- tmpp[i]=@han[tmp[i]][0]
31
- elsif @han[tmp[i]]==nil
60
+ if @han[tmp[i]]==1
61
+ tmpp[i]=@han[tmp[i]][0]
62
+ elsif @han[tmp[i]]==nil
63
+
64
+ # break
32
65
  tmpp[i]=tmp[i]
33
- else
34
- tmpp[i]=use_bigram(tmpp,i)
66
+ else
67
+ tmpp[i]=use_bigram(tmpp,i)
68
+ end
35
69
  end
36
- end
37
70
  @result=str.gsub!(tmp,tmpp)
38
- end
71
+ end
72
+ end
39
73
  @result
40
74
  end
41
75
  # when there is a 1:n conversion
@@ -66,10 +100,17 @@ class Tsukiko
66
100
  # puts @bigram[str]
67
101
  return @bigram[str]
68
102
  end
69
-
103
+ # convert simplified Chinese into traditional Chinese
70
104
  def convert(str)
71
105
  @result=convert_han(convert_words(str))
72
106
  # @result=convert_words(str)
107
+ @result.gsub!(" #%","")
108
+ @result.gsub!("%# ","")
109
+ @result
110
+ end
111
+ def convert_tw(str)
112
+ @result=convert_han_tw(convert_words_tw(str))
113
+ # @result=convert_words(str)
73
114
  @result.gsub!(" ","")
74
115
  @result
75
116
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tsukiko
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.1
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-12-24 00:00:00.000000000 Z
12
+ date: 2013-12-25 00:00:00.000000000 Z
13
13
  dependencies: []
14
14
  description: a tool to convert simplified Chinsese into tradtional Chinese
15
15
  email:
@@ -22,6 +22,7 @@ files:
22
22
  - data/bigram.data
23
23
  - data/cn_tw.data
24
24
  - data/words.data
25
+ - data/tw_cn.data
25
26
  homepage: ''
26
27
  licenses: []
27
28
  post_install_message: