tsukiko 0.1.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. data/data/tw_cn.data +0 -0
  2. data/lib/tsukiko.rb +56 -15
  3. metadata +3 -2
data/data/tw_cn.data ADDED
Binary file
data/lib/tsukiko.rb CHANGED
@@ -3,39 +3,73 @@
3
3
  #encoding:utf-8
4
4
  class Tsukiko
5
5
  def initialize(dic_path=File.dirname(__FILE__))
6
- @words=Hash.new
7
- @han=Hash.new
8
- @bigram=Hash.new
9
6
  @words=Marshal.load(File.open(dic_path+"/../data/words.data","rb").read)
10
7
  @han=Marshal.load(File.open(dic_path+"/../data/cn_tw.data","rb").read)
11
8
  @bigram=Marshal.load(File.open(dic_path+"/../data/bigram.data","rb").read)
9
+ @trhan=Marshal.load(File.open(dic_path+"/../data/tw_cn.data","rb").read)
12
10
  end
13
- def convert_words(str)
11
+ def convert_words(str)
14
12
  @words.each{|cn,tw|
15
- str.gsub!(cn," "+tw+" ")
13
+ str.gsub!(cn," #%"+tw+"%# ")
16
14
  }
17
15
  @result=str
18
16
  end
17
+ def convert_words_tw(str)
18
+ words=@words.invert
19
+ words.each{|tw,cn|
20
+ str.gsub!(tw," "+cn+" ")
21
+ }
22
+ @result=str
23
+ end
24
+ def convert_han_tw(str)
25
+ # puts str
26
+ tmpstr=str.clone
27
+ while tmpstr.sub!(/[\u4e00-\u9fa5]+/,"")
28
+ # matches=str.match(/(\S+)/)
29
+ tmp=$&.chomp
30
+ # puts tmp
31
+ # puts @trhan["卜"]
32
+ tmpp=tmp.clone
33
+ for i in 0..tmp.length-1
34
+ # puts @trhan[tmp[i]]
35
+ if @trhan[tmp[i]]==0
36
+ tmpp[i]=tmp[i]
37
+ next
38
+ else
39
+ tmpp[i]=@trhan[tmp[i]]
40
+ #convert each character
41
+ end
42
+ end
43
+ @result=str.gsub!(tmp,tmpp)
44
+ end
45
+ @result
46
+ end
19
47
  # call after convert_words
20
48
  def convert_han(str)
21
49
  tmpstr=str.clone
22
50
  while tmpstr.sub!(/\S+/,"")
23
51
  # matches=str.match(/(\S+)/)
24
52
  tmp=$&.chomp
53
+ if tmp[0..1]=="#%"
54
+ next
55
+ else
25
56
  # puts tmp
26
- tmpp=tmp.clone
27
- for i in 0..tmp.length-1
57
+ tmpp=tmp.clone
58
+ for i in 0..tmp.length-1
28
59
  #convert each character
29
- if @han[tmp[i]]==1
30
- tmpp[i]=@han[tmp[i]][0]
31
- elsif @han[tmp[i]]==nil
60
+ if @han[tmp[i]]==1
61
+ tmpp[i]=@han[tmp[i]][0]
62
+ elsif @han[tmp[i]]==nil
63
+
64
+ # break
32
65
  tmpp[i]=tmp[i]
33
- else
34
- tmpp[i]=use_bigram(tmpp,i)
66
+ else
67
+ tmpp[i]=use_bigram(tmpp,i)
68
+ end
35
69
  end
36
- end
37
70
  @result=str.gsub!(tmp,tmpp)
38
- end
71
+ end
72
+ end
39
73
  @result
40
74
  end
41
75
  # when there is a 1:n conversion
@@ -66,10 +100,17 @@ class Tsukiko
66
100
  # puts @bigram[str]
67
101
  return @bigram[str]
68
102
  end
69
-
103
+ # convert simplified Chinese into traditional Chinese
70
104
  def convert(str)
71
105
  @result=convert_han(convert_words(str))
72
106
  # @result=convert_words(str)
107
+ @result.gsub!(" #%","")
108
+ @result.gsub!("%# ","")
109
+ @result
110
+ end
111
+ def convert_tw(str)
112
+ @result=convert_han_tw(convert_words_tw(str))
113
+ # @result=convert_words(str)
73
114
  @result.gsub!(" ","")
74
115
  @result
75
116
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tsukiko
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.1
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-12-24 00:00:00.000000000 Z
12
+ date: 2013-12-25 00:00:00.000000000 Z
13
13
  dependencies: []
14
14
  description: a tool to convert simplified Chinsese into tradtional Chinese
15
15
  email:
@@ -22,6 +22,7 @@ files:
22
22
  - data/bigram.data
23
23
  - data/cn_tw.data
24
24
  - data/words.data
25
+ - data/tw_cn.data
25
26
  homepage: ''
26
27
  licenses: []
27
28
  post_install_message: