jis2euc 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +3 -0
- data/Gemfile +4 -0
- data/Rakefile +2 -0
- data/jis2euc.gemspec +21 -0
- data/lib/jis2euc/jis8ctl.rb +205 -0
- data/lib/jis2euc/libjis2euc.rb +271 -0
- data/lib/jis2euc/version.rb +3 -0
- data/lib/jis2euc.rb +15 -0
- data/lib/test.rb +55 -0
- metadata +76 -0
data/.gitignore
ADDED
data/Gemfile
ADDED
data/Rakefile
ADDED
data/jis2euc.gemspec
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
$:.push File.expand_path("../lib", __FILE__)
|
3
|
+
require "jis2euc/version"
|
4
|
+
|
5
|
+
# Packaging metadata for the jis2euc gem.
# The file lists are read from the git index, so this spec has to be
# evaluated from inside the project's git checkout.
Gem::Specification.new do |spec|
  spec.name        = "jis2euc"
  spec.version     = Jis2euc::VERSION
  spec.platform    = Gem::Platform::RUBY
  spec.authors     = ["pierr.chen"]
  spec.email       = ["pierr.chen@gmail.com"]
  spec.homepage    = ""
  spec.summary     = %q{convert jis encoding to EUC-JP}
  spec.description = %q{ARIB standard combines serveral JIS encoding for texts ,this gem convert them to EUC-JP }

  spec.rubyforge_project = "jis2euc"

  # Everything tracked by git ships with the gem.
  spec.files         = `git ls-files`.split("\n")
  spec.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  # Executables are listed by base name only.
  spec.executables   = `git ls-files -- bin/*`.split("\n").map { |path| File.basename(path) }
  spec.require_paths = ["lib"]
end
|
@@ -0,0 +1,205 @@
|
|
1
|
+
#!/usr/bin/ruby -w
|
2
|
+
|
3
|
+
=begin
|
4
|
+
|
5
|
+
Two area : GL , GR
|
6
|
+
For each area it could be ONE of the following 4 value ,that could be G0 ,G1, G2 and G3.
|
7
|
+
GL was init to G0
|
8
|
+
GR was init to G2
|
9
|
+
For G0..G3 ,it could be assigned to different character set.
|
10
|
+
G0 was init to KANJI_1
|
11
|
+
G1 was init to EISU
|
12
|
+
G2 was init to HIRAGANA
|
13
|
+
G3 was init to KATAKANA
|
14
|
+
|
15
|
+
How to decide the character set we are using is :
|
16
|
+
1. which area we are using?
|
17
|
+
2. what's the value of that area? Say Gx
|
18
|
+
3. which character set was assigned to Gx
|
19
|
+
|
20
|
+
Q:what is single shift?
|
21
|
+
A: We have to restore the previous code set after finishing processing the following character.
|
22
|
+
|
23
|
+
Q:How to determine current area?
|
24
|
+
A: if current_char > 0x80
|
25
|
+
"GR"
|
26
|
+
else
|
27
|
+
"GL"
|
28
|
+
=end
|
29
|
+
|
30
|
+
#To understand how ctl code works
|
31
|
+
|
32
|
+
# Tell whether byte +b+ is a control code rather than a graphic character.
#
# Treated as control codes:
#   * 0x00..0x20  (C0 area, plus SP)
#   * 0x7f        (DEL)
#   * 0x80..0x9f  (C1 area)
#   * 0xff
#
# The C1 area starts at 0x80 itself; the previous test used "> 0x80", which
# let 0x80 fall through and be decoded as a graphic character.
#
# b - Integer byte value (0..255).
#
# Returns true or false.
def is_ctl_code?(b)
  b <= 0x20 || b == 0x7f || (b >= 0x80 && b < 0xa0) || b == 0xff
end
|
37
|
+
|
38
|
+
=begin
|
39
|
+
Table 7-1 Invocation of code elements
|
40
|
+
{{0x0f,}, CODE_SET_G0, CODE_AREA_GL, LOCKING_SHIFT}, //LS0 set GL to G0
|
41
|
+
{{0x0e,}, CODE_SET_G1, CODE_AREA_GL, LOCKING_SHIFT}, //LS1 set GL to G1
|
42
|
+
{{0x1b,0x6e,}, CODE_SET_G2, CODE_AREA_GL, LOCKING_SHIFT}, //LS2 set GL to G2
|
43
|
+
{{0x1b,0x6f,}, CODE_SET_G3, CODE_AREA_GL, LOCKING_SHIFT}, //LS3 set GL to G3
|
44
|
+
{{0x1b,0x7e,}, CODE_SET_G1, CODE_AREA_GR, LOCKING_SHIFT}, //LS1R set GR to G1
|
45
|
+
{{0x1b,0x7d,}, CODE_SET_G2, CODE_AREA_GR, LOCKING_SHIFT}, //LS2R set GR to G2
|
46
|
+
{{0x1b,0x7c,}, CODE_SET_G3, CODE_AREA_GR, LOCKING_SHIFT}, //LS3R set GR to G3
|
47
|
+
{{0x19,}, CODE_SET_G2, CODE_AREA_GL, SINGLE_SHIFT }, //SS2 set GL to G2 ,single shift
|
48
|
+
{{0x1d,}, CODE_SET_G3, CODE_AREA_GL, SINGLE_SHIFT } //SS3 set GL to G3 ,single shift
|
49
|
+
=end
|
50
|
+
|
51
|
+
# Invocations introduced by ESC: second byte => action text.
# The action text is data, not just a label: Jis8charset#do extracts the
# "AREA=>SET" pair from it and looks for the word "single".
$esc_invok = {
  0x6e => "LS2 : GL=>G2 , locking shift",
  0x6f => "LS3 : GL=>G3 , locking shift",
  0x7e => "LS1R : GR=>G1 , locking shift",
  0x7d => "LS2R : GR=>G2 , locking shift",
  0x7c => "LS3R : GR=>G3 , locking shift",
  0x7b => "Confused... "
}

# Single-byte invocations (no ESC prefix): byte => action text.
$other_invok = {
  0x0f => "LS0 : GL=>G0, locking shift",
  0x0e => "LS1 : GL=>G1, locking shift",
  0x19 => "SS2 : GL=>G2, single shift",
  0x1d => "SS3 : GL=>G3, single shift",
  #0x20=>"SP",  #means space " "
  #0x0d=>"APR", #means "\n"
  #0x89=>"MSZ", #specify the character size is Middle
}
#?

#Table 7-2 B24
#All start with ESC (0x1B)
# Bytes that may follow ESC to start a designation sequence.
$set_designation = [0x24, 0x28, 0x29, 0x2a, 0x2b]

# Designation prefix (hex bytes joined by ",") => action text.
# Jis8charset#do_b takes the target slot from the text after "to ".
$designation_action = {
  "0x24"      => "set 2 bytes Graphic set to G0",
  "0x24,0x29" => "set 2 bytes Graphic set to G1",
  "0x24,0x2a" => "set 2 bytes Graphic set to G2",
  "0x24,0x2b" => "set 2 bytes Graphic set to G3",
  "0x28"      => "set 1 bytes Graphic set to G0",
  "0x29"      => "set 1 bytes Graphic set to G1",
  "0x2a"      => "set 1 bytes Graphic set to G2",
  "0x2b"      => "set 1 bytes Graphic set to G3",
}

# The escape byte that introduces multi-byte control sequences.
ESC = 0x1b

# Final byte of a designation sequence => internal character set name.
$encoding_mapping = {
  "0x39" => "kanji1",
  "0x3a" => "kanji2",
  "0x4a" => "eisu",
  "0x30" => "hira",
  "0x31" => "kata",
  "0x3b" => "tusika",
}
|
93
|
+
|
94
|
+
#
|
95
|
+
#
|
96
|
+
#
|
97
|
+
#
|
98
|
+
|
99
|
+
|
100
|
+
|
101
|
+
# Shift-state tracker used while decoding ARIB 8-bit character codes.
#
# Two code areas ("GL" and "GR") each point at one of four graphic-set slots
# ("G0".."G3"), and each slot is designated to a character set name
# ("kanji1", "eisu", "hira", "kata", ...). A single shift re-points GL for
# one character only; #restore undoes it.
class Jis8charset

  #TODO : make it singleton
  def initialize
    reset
  end

  # Put every area and slot back to its initial assignment:
  # GL=>G0, GR=>G2; G0=kanji1, G1=eisu, G2=hira, G3=kata.
  def reset
    @areas = {"GL" => "G0", "GR" => "G2"}
    @sets  = {"G0" => "kanji1", "G1" => "eisu", "G2" => "hira", "G3" => "kata"}
    @last_gl = "G0"
    @single_shift = false
  end

  # Debug hook; output is disabled (uncomment the puts to trace).
  def verbose s
    #puts s
  end

  # Look up what is currently invoked into +area+ ("GL" or "GR").
  #
  # Returns three values: the character set name, the number of bytes one
  # character of that set occupies, and whether a single shift is pending.
  def get_current_charset area
    charset = @sets[@areas[area]]
    # "kanji2" is a two-byte set just like "kanji1" and "tusika".
    bytes = ["kanji1", "kanji2", "tusika"].include?(charset) ? 2 : 1

    return charset, bytes, @single_shift
  end

  # Point +area+ at slot +graphic_set+.
  #
  # With +single_shift+ true (only valid for "GL") the current GL slot is
  # remembered so #restore can bring it back after one character.
  #
  # Raises ArgumentError for an unknown area, or for a single shift on an
  # area other than "GL".
  def set_area area, graphic_set, single_shift = false
    raise ArgumentError, "unknow #{area} , illeale key" if !@areas.key?(area)
    if single_shift
      raise ArgumentError if !area.eql?("GL")
      # If a single shift is already pending, @last_gl already holds the
      # locked slot; overwriting it with the temporary one would make
      # #restore "restore" the wrong slot.
      @last_gl = @areas["GL"] unless @single_shift
      @single_shift = true
    end
    @areas[area] = graphic_set
  end

  # Designate +character_set+ to slot +graphic_set+ ("G0".."G3").
  #
  # Raises ArgumentError for an unknown slot.
  def set_graphicset graphic_set, character_set
    raise ArgumentError, "unknow #{graphic_set} , illeale key" if !@sets.key?(graphic_set)
    verbose "set #{graphic_set} to #{character_set}"
    @sets[graphic_set] = character_set
  end

  # Dump the area and slot tables to stdout (debugging aid).
  def show
    p @areas
    p @sets
  end

  # Undo a pending single shift: GL goes back to the slot it had before.
  def restore
    #only to restore @areas["GL"]
    @areas["GL"] = @last_gl
    @single_shift = false
  end

  # Apply an invocation described by +action+ text such as
  # "SS2 : GL=>G2, single shift". The "AREA=>SLOT" pair is extracted from
  # the text; the word "single" selects a single shift.
  #
  # Raises ArgumentError when the text carries no "AREA=>SLOT" pair
  # (previously this surfaced as a NoMethodError on nil).
  def do(action)
    matches = /.*(\w{2})=>(\w{2}).*/.match(action)
    raise ArgumentError, "unknow invocation #{action.inspect}" if matches.nil?
    set_area(matches[1], matches[2], action.include?("single"))
  end

  # Apply a designation: +action+ is text ending in "to Gx" and +para+ is
  # the final byte as a "0x.." string, looked up in $encoding_mapping.
  #
  # Raises ArgumentError for an unknown +para+ or an unknown target slot.
  def do_b action, para
    verbose "action #{action}"
    raise ArgumentError, "unknow encoding format #{para}" unless $encoding_mapping.key?(para)
    charset = $encoding_mapping[para]
    verbose "-> Action : #{action} using [#{charset}]"
    # Go through set_graphicset so the slot name is validated.
    set_graphicset(action.to_s[/to\s(.*)/, 1], charset)
  end

end
|
177
|
+
|
178
|
+
# Manual smoke test for Jis8charset: prints the state before and after a few
# area/slot changes so the output can be inspected by eye.
def test_jis8charset
  state = Jis8charset.new

  %w[GL GR].each { |area| puts state.get_current_charset(area) }

  state.show
  state.set_area("GL", "G1")
  state.set_graphicset("G0", "eisu")
  state.set_graphicset("G1", "kanji")
  state.show

  # Apply the LS2 invocation through its action text.
  action = $esc_invok[0x6e]
  puts action
  state.do(action)

  charset, width, pending = state.get_current_charset("GL")
  puts charset
  puts width
  puts pending
end
|
204
|
+
|
205
|
+
#test_jis8charset
|
@@ -0,0 +1,271 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# encoding: utf-8
|
3
|
+
|
4
|
+
require 'iconv'
|
5
|
+
#require 'jis8ctl.rb'
|
6
|
+
|
7
|
+
require File.join(File.dirname(__FILE__),'jis8ctl')
|
8
|
+
|
9
|
+
=begin
|
10
|
+
Dir.foreach("sample") do |f|
|
11
|
+
puts "->#{f}";
|
12
|
+
f.each_byte {|b| puts b}
|
13
|
+
|
14
|
+
end
|
15
|
+
=end
|
16
|
+
|
17
|
+
|
18
|
+
|
19
|
+
|
20
|
+
#puts "中文"
|
21
|
+
#puts "Résumés"
|
22
|
+
#puts "講座は4つの分野で"
|
23
|
+
|
24
|
+
if RUBY_VERSION < "1.9"
|
25
|
+
#no use in ruby 1.9
|
26
|
+
$KCODE = "UTF8"
|
27
|
+
end
|
28
|
+
|
29
|
+
#An internal test case
|
30
|
+
# Manual experiment: prints how the running Ruby sees a few UTF-8 literals,
# then round-trips a Japanese sample UTF-8 -> EUC-JP -> UTF-8 through Iconv.
def test_1
  samples = ["Hello", "中文", "Résumés", "講座は4つの分野で"]

  samples.each do |text|
    puts "->#{text}"
    puts RUBY_VERSION
    if RUBY_VERSION > "1.9"
      puts text.encoding.name
    else
      puts %w|->scan(/./u):|
      p text.scan(/./u)
      puts "->size = #{text.size}"
      puts "->length = #{text.length}"
      puts "->each_char :"
      text.each_char { |ch| print ch, "," }
      print "\n"
      puts "->each_byte :"
      text.each_byte { |byte| print byte, "," }
      print "\n"
    end
  end

  #convert EUC-JP to UTF-8 so that Terminal can display it
  utf8_jp = "講座は4つの分野で" #the source file is UTF-8, so this literal is too
  puts utf8_jp.size
  utf8_jp.each_byte { |byte| print byte, "," }
  print "\n"

  euc_jp = Iconv.conv("EUCJP", "UTF8", utf8_jp)
  puts "ouput euc_jp coding :"
  puts euc_jp #a UTF-8 terminal shows garbage for raw EUC-JP
  puts euc_jp.size
  euc_jp.each_byte { |byte| print byte, "," }
  print "\n"

  puts "convert back to utf-8"
  utf8_jp_again = Iconv.conv("UTF8", "EUCJP", euc_jp)
  puts utf8_jp_again
end
|
75
|
+
|
76
|
+
# Alphanumeric ("eisu") set: 7-bit code => two-byte EUC-JP value of the
# corresponding fullwidth character. 0x0000 marks a position with no mapping.
#
# Fix: "_" (0x5F) now maps to 0xA1B2 (fullwidth low line). It used to map to
# 0xA1A1, which is the ideographic space, so underscores decoded as blanks.
$eisu_to_euc = [
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  #       !       "       #       $       %       &       '       (       )       *       +       ,       -       .       /
  0xA1A0, 0xA1AA, 0xA1ED, 0xA1F4, 0xA1F0, 0xA1F3, 0xA1F5, 0xA1EC, 0xA1CA, 0xA1CB, 0xA1F6, 0xA1DC, 0xA1A4, 0xA1BD, 0xA1A5, 0xA1BF,
  #0      1       2       3       4       5       6       7       8       9       :       ;       <       =       >       ?
  0xA3B0, 0xA3B1, 0xA3B2, 0xA3B3, 0xA3B4, 0xA3B5, 0xA3B6, 0xA3B7, 0xA3B8, 0xA3B9, 0xA1A7, 0xA1A8, 0xA1E3, 0xA1E1, 0xA1E4, 0xA1A9,
  #@      A       B       C       D       E       F       G       H       I       J       K       L       M       N       O
  0xA1F7, 0xA3C1, 0xA3C2, 0xA3C3, 0xA3C4, 0xA3C5, 0xA3C6, 0xA3C7, 0xA3C8, 0xA3C9, 0xA3CA, 0xA3CB, 0xA3CC, 0xA3CD, 0xA3CE, 0xA3CF,
  #P      Q       R       S       T       U       V       W       X       Y       Z       [       \       ]       ^       _
  0xA3D0, 0xA3D1, 0xA3D2, 0xA3D3, 0xA3D4, 0xA3D5, 0xA3D6, 0xA3D7, 0xA3D8, 0xA3D9, 0xA3DA, 0xA1CE, 0xA1EF, 0xA1CF, 0xA1B0, 0xA1B2,
  #`      a       b       c       d       e       f       g       h       i       j       k       l       m       n       o
  0xA1AE, 0xA3E1, 0xA3E2, 0xA3E3, 0xA3E4, 0xA3E5, 0xA3E6, 0xA3E7, 0xA3E8, 0xA3E9, 0xA3EA, 0xA3EB, 0xA3EC, 0xA3ED, 0xA3EE, 0xA3EF,
  #p      q       r       s       t       u       v       w       x       y       z       {       |       }       ~
  0xA3F0, 0xA3F1, 0xA3F2, 0xA3F3, 0xA3F4, 0xA3F5, 0xA3F6, 0xA3F7, 0xA3F8, 0xA3F9, 0xA3FA, 0xA1D0, 0xA1C3, 0xA1D1, 0xA1B1, 0x0000
]
|
92
|
+
|
93
|
+
|
94
|
+
|
95
|
+
# Hiragana set: 7-bit code => two-byte EUC-JP value (0x0000 = no mapping).
# Built from its pieces instead of a 128-entry literal:
#   0x00..0x1f  unmapped
#   0x20        0xA1A0
#   0x21..0x73  the contiguous EUC-JP run 0xA4A1..0xA4F3
#   0x74..0x76  unmapped
#   0x77..0x7e  eight symbols from EUC-JP row 0xA1
#   0x7f        unmapped
$hiragana_to_euc =
  Array.new(0x20, 0x0000) +
  [0xA1A0] +
  (0xA4A1..0xA4F3).to_a +
  [0x0000, 0x0000, 0x0000] +
  [0xA1B5, 0xA1B6, 0xA1BC, 0xA1A3, 0xA1D6, 0xA1D7, 0xA1A2, 0xA1A6] +
  [0x0000]
|
106
|
+
|
107
|
+
# Katakana set: 7-bit code => two-byte EUC-JP value (0x0000 = no mapping).
# Built from its pieces instead of a 128-entry literal:
#   0x00..0x1f  unmapped
#   0x20        0xA1A0
#   0x21..0x76  the contiguous EUC-JP run 0xA5A1..0xA5F6
#   0x77..0x7e  eight symbols from EUC-JP row 0xA1
#   0x7f        unmapped
$katakana_to_euc =
  Array.new(0x20, 0x0000) +
  [0xA1A0] +
  (0xA5A1..0xA5F6).to_a +
  [0xA1B3, 0xA1B4, 0xA1BC, 0xA1A3, 0xA1D6, 0xA1D7, 0xA1A2, 0xA1A6] +
  [0x0000]
|
118
|
+
|
119
|
+
|
120
|
+
$debug = false
|
121
|
+
# Print +s+ to stdout, but only while the global $debug flag is set.
def verbose(s)
  return unless $debug
  puts s
end
|
124
|
+
|
125
|
+
$eucjp_to_utf8 = Iconv.new("UTF8//TRANSLIT//IGNORE","EUCJP")
|
126
|
+
|
127
|
+
|
128
|
+
# Convert one character of character set +from+ into its two EUC-JP bytes.
#
# from - character set name: "eisu", "hira", "kata" (single byte, looked up
#        in the matching global table), or "kanji1" / "tusika" (two bytes).
# a    - first (or only) byte of the character; the high bit may be set when
#        the set is invoked through GR.
# b    - second byte for two-byte sets (default 0x00).
#
# Returns a two-byte String.
# Raises ArgumentError for an unknown character set name.
def to_euc(from, a, b = 0x00)
  # Lead bytes (7-bit form) of rows that Iconv's EUC-JP table cannot handle.
  special = [0x2d, 0x7a, 0x7b, 0x7c, 0x7e, 0x7f]
  v = case from
      when "eisu" then $eisu_to_euc[a & 0x7F]
      when "hira" then $hiragana_to_euc[a & 0x7F]
      when "kata" then $katakana_to_euc[a & 0x7F]
      when "kanji1"
        #Hack. Iconv can not handle 0x2Dxx, which is legal in JIS-X-0213.
        # Compare the 7-bit form so the hack also applies when kanji is
        # invoked through GR (lead byte 0xAD etc.); such characters are
        # replaced with an ideographic space.
        if special.include?(a & 0x7F)
          0xa1a1
        else
          ((a | 0x80) << 8) + (b | 0x80)
        end
      when "tusika"
        #these are characters in rows 91-93; a desktop may not show them correctly
        ((a | 0x80) << 8) + (b | 0x80)
      else
        raise ArgumentError, "unknow encoding #{from}"
      end
  s = ""
  s << ((v & 0xFF00) >> 8).chr << (v & 0x00FF).chr
end
|
142
|
+
|
143
|
+
|
144
|
+
|
145
|
+
#To display the sisu
|
146
|
+
|
147
|
+
# Manual check: convert codes 0x23..0x7e of the three single-byte sets plus
# one kanji to EUC-JP and print the result as UTF-8.
def test_display_eisu
  s_euc = ""
  %w[eisu hira kata].each do |charset|
    0x23.upto(0x7e) { |code| s_euc << to_euc(charset, code) }
  end
  #Kanji1
  s_euc << to_euc("kanji1", 0x5f, 0x37)
  puts eucjp_to_utf8(s_euc)
end
|
162
|
+
|
163
|
+
$jis8 = Jis8charset.new
|
164
|
+
# Decode an ARIB 8-bit coded byte string into EUC-JP bytes.
#
# The shared $jis8 shift state is reset first, so every call is decoded
# independently. Control codes update the shift state (invocations and
# designations), SP and 0x0d become " " and "\n", and all other control
# codes are dropped. Graphic bytes are converted through to_euc using the
# character set currently invoked into GL or GR; characters of the "tusika"
# set are replaced by "~~".
#
# s - String of raw ARIB-coded bytes.
#
# Returns the converted String.
# Raises ArgumentError for an unknown or truncated designation sequence.
def arib_jis_to_euc s
  #convert string to array , so that we can index it freely
  src = []
  s.each_byte { |c| src << c }
  out = ""
  #each message is independent of each other
  $jis8.reset

  if $debug
    puts "convert following string to euc :"
    src.each { |c| print "#{c.to_s(16)} " }
    puts "\n"
  end

  until src.empty?
    c1 = src.shift
    ctl = is_ctl_code?(c1)
    if $debug
      print "c1 : #{c1.to_s(16)} "
      if ctl then puts "->Is CTL" else print "->Char" end
    end

    if ctl
      #may need to get c2, c3 to determine what to do.
      if c1 == ESC
        verbose "ESC .."
        c2 = src.shift
        if $esc_invok.key?(c2)
          action = $esc_invok[c2]
          verbose action
          $jis8.do(action)
        elsif $set_designation.include?(c2)
          # Look at c2 plus up to two following bytes, as hex text. Bytes
          # that are missing (truncated input) are simply left out, so a
          # truncated sequence ends in the ArgumentError below instead of
          # failing on nil.
          seq = [c2, src[0], src[1]].compact.map { |byte| byte.to_s(16) }.join(',')
          if seq =~ /\A(24,29,|24,2a,|24,2b,)([0-9a-f]{1,})/
            cmd, para = $1, $2
          elsif seq =~ /\A(24,|28,|29,|2a,|2b,)([0-9a-f]{1,})/
            cmd, para = $1, $2
          else
            raise ArgumentError, "unknow ctrl command"
          end
          cmd_bytes = cmd.split(",")
          cmd_key = cmd_bytes.map { |t| "0x" + t }.join(",")
          verbose "-> CTL #{cmd_key} Type #{para}"
          $jis8.do_b($designation_action[cmd_key], "0x" + para)
          # c2 is already consumed; drop the rest of the prefix and the
          # final byte.
          cmd_bytes.size.times { src.shift }
        end
      elsif $other_invok.key?(c1)
        action = $other_invok[c1]
        verbose action
        $jis8.do(action)
      elsif c1 == 0x20
        out << " "
      elsif c1 == 0x0d
        out << "\n"
      elsif c1 == 0x89
        verbose "ingor"
      else
        verbose "ignor"
      end
    else
      #depending on the charset currently in use we consume one or two bytes
      area = (c1 > 0x80) ? "GR" : "GL"
      charset, bytes, single_shift = $jis8.get_current_charset(area)
      verbose "-> #{charset.ljust(10)} , #{bytes}"
      if charset.eql?("tusika")
        #ignore tusika characters: skip the second byte, emit a placeholder
        src.shift
        out << "~~"
      elsif bytes == 1
        out << to_euc(charset, c1)
      elsif bytes == 2
        c2 = src.shift
        if c2.nil?
          #It is an error. Just ignore it
          out << "??"
          puts "Error: No more bytes for 2 bytes character."
        else
          out << to_euc(charset, c1, c2)
        end
        #used to locate errors when iconv fails; only convert while
        #debugging, otherwise the whole output would be re-converted (and
        #could raise) after every two-byte character
        verbose eucjp_to_utf8(out) if $debug
      end
      $jis8.restore if single_shift
    end
  end

  out
end
|
264
|
+
|
265
|
+
# Run +euc+ through the shared converter held in $eucjp_to_utf8 and return
# whatever its #iconv method returns.
def eucjp_to_utf8(euc)
  converter = $eucjp_to_utf8
  converter.iconv(euc)
end
|
268
|
+
|
269
|
+
|
270
|
+
#test_display_eisu
|
271
|
+
|
data/lib/jis2euc.rb
ADDED
data/lib/test.rb
ADDED
@@ -0,0 +1,55 @@
|
|
1
|
+
#include this to test local latest change before publishing the gem
|
2
|
+
require './lib/jis2euc'
|
3
|
+
|
4
|
+
#include this to use the installed gem.
|
5
|
+
#require 'jis2euc'
|
6
|
+
require 'test/unit'
|
7
|
+
|
8
|
+
#THIS IS MUST
|
9
|
+
if RUBY_VERSION < "1.9"
|
10
|
+
#no use in ruby 1.9
|
11
|
+
$KCODE = "UTF8"
|
12
|
+
end
|
13
|
+
|
14
|
+
# Print-only smoke tests for the Jis2euc entry points: each sample is
# converted and written to stdout for inspection; nothing is asserted.
class TC_JIS2EUC < Test::Unit::TestCase

  # Every sample is a string of space-separated hex byte values, which is
  # the format puts_jis parses.
  def setup
    @raw1 = "32 2d 46 6c cb fb 1b 7c b5 d0 cb fc 1b 7d c8 a4 a6 45 41 45 7d ce 3e 2e 3d 2e c7 e 33 37 f 2d 52 e2 ce 35 77 4e 25 f2 41 66 a4 c7 3f 4a e0 32 61 39 73 ca 1b 7c ec f9 b9 1b 7d ac a2 eb fa b3 ce 1b 7c ec f9 b9 1b 7d cb 3b 32 32 43 b9 eb e 35 30 f 42 65 ce 43 4b bf c1 ac 4d 27 3e 70 f2 49 70 34 6f cb 42 4e 4e 4f ce 4a 49 f2 3e 68 ea 31 5b a8 eb 3b 51 f2 44 49 c3 bf fa "
    @raw2 = "1b 24 3b 7a 56 1b 24 39 34 6f 21 21 4c 34 39 29 4b 3c fb 35 35 4e 76 ac 3f 25 ea ca b9 37 4a 3f 27 fc "
    @raw3 = "1b 24 2a 3b fa d7 1b 7c ab f3 cb f3 b0 1b 2a 30 1b 7d ce e 44 41 49 f 30 42 35 48 46 7c 21 2a "
    @raw4 = "48 56 41 48 46 62 4d 46 e 32 89 2d 8a 1b 24 3b "
    # Same bytes as before, now as hex text like the other samples. It used
    # to be a raw byte string, which puts_jis mis-parsed as hex tokens, so
    # this sample was never really decoded.
    @raw5 = "e 51 56 43 f 21 21 5f 37 3a 6a 89 20 8a 4e 34 3f 4d ce 1b 7c c9 "
  end

  def test_arib_jis_to_euc_2
    [@raw1, @raw2, @raw3, @raw4, @raw5].each { |raw| puts_jis(raw) }
  end

  # Turn space-separated hex text into bytes, convert to EUC-JP and print
  # the result as UTF-8.
  def puts_jis raw
    s = raw.split.map { |t| t.hex }.inject("") { |acc, c| acc << c.chr }
    euc_s = Jis2euc.jis2euc(s)
    puts Jis2euc.euc2utf8(euc_s)
  end

  def test_euc_to_utf8
    #Refer to : http://www.rikai.com/library/kanjitables/kanji_codes.euc.shtml
    #s = [0xa3,0xb0,0xa5,0xc0,0x8f,0xe6,0xc0,0x8f,0xe6,0xd0].inject("") {|s ,b| s << b.chr}
    s = [0xa1, 0xa1, 0xdf, 0xb7, 0xba, 0xea].inject("") { |acc, b| acc << b.chr }
    puts Jis2euc.euc2utf8(s)
  end

end
|
52
|
+
|
53
|
+
#test_iconv_euc_to_utf8
|
54
|
+
#test_display_eisu
|
55
|
+
#test_arib_jis_to_euc_2
|
metadata
ADDED
@@ -0,0 +1,76 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: jis2euc
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
hash: 29
|
5
|
+
prerelease: false
|
6
|
+
segments:
|
7
|
+
- 0
|
8
|
+
- 0
|
9
|
+
- 1
|
10
|
+
version: 0.0.1
|
11
|
+
platform: ruby
|
12
|
+
authors:
|
13
|
+
- pierr.chen
|
14
|
+
autorequire:
|
15
|
+
bindir: bin
|
16
|
+
cert_chain: []
|
17
|
+
|
18
|
+
date: 2011-07-05 00:00:00 +08:00
|
19
|
+
default_executable:
|
20
|
+
dependencies: []
|
21
|
+
|
22
|
+
description: "ARIB standard combines serveral JIS encoding for texts ,this gem convert them to EUC-JP "
|
23
|
+
email:
|
24
|
+
- pierr.chen@gmail.com
|
25
|
+
executables: []
|
26
|
+
|
27
|
+
extensions: []
|
28
|
+
|
29
|
+
extra_rdoc_files: []
|
30
|
+
|
31
|
+
files:
|
32
|
+
- .gitignore
|
33
|
+
- Gemfile
|
34
|
+
- Rakefile
|
35
|
+
- jis2euc.gemspec
|
36
|
+
- lib/jis2euc.rb
|
37
|
+
- lib/jis2euc/jis8ctl.rb
|
38
|
+
- lib/jis2euc/libjis2euc.rb
|
39
|
+
- lib/jis2euc/version.rb
|
40
|
+
- lib/test.rb
|
41
|
+
has_rdoc: true
|
42
|
+
homepage: ""
|
43
|
+
licenses: []
|
44
|
+
|
45
|
+
post_install_message:
|
46
|
+
rdoc_options: []
|
47
|
+
|
48
|
+
require_paths:
|
49
|
+
- lib
|
50
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
51
|
+
none: false
|
52
|
+
requirements:
|
53
|
+
- - ">="
|
54
|
+
- !ruby/object:Gem::Version
|
55
|
+
hash: 3
|
56
|
+
segments:
|
57
|
+
- 0
|
58
|
+
version: "0"
|
59
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
60
|
+
none: false
|
61
|
+
requirements:
|
62
|
+
- - ">="
|
63
|
+
- !ruby/object:Gem::Version
|
64
|
+
hash: 3
|
65
|
+
segments:
|
66
|
+
- 0
|
67
|
+
version: "0"
|
68
|
+
requirements: []
|
69
|
+
|
70
|
+
rubyforge_project: jis2euc
|
71
|
+
rubygems_version: 1.3.7
|
72
|
+
signing_key:
|
73
|
+
specification_version: 3
|
74
|
+
summary: convert jis encoding to EUC-JP
|
75
|
+
test_files: []
|
76
|
+
|