docdiff 0.6.5 → 0.6.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +7 -7
- data/Guardfile +4 -4
- data/Makefile +1 -1
- data/Rakefile +6 -6
- data/bin/docdiff +1 -1
- data/devutil/Rakefile +12 -5
- data/devutil/char_by_charclass.rb +43 -20
- data/devutil/charclass_by_char.rb +40 -19
- data/devutil/jis0208.rb +263 -231
- data/devutil/jis0208_test.rb +196 -0
- data/doc/news.md +8 -0
- data/docdiff.gemspec +12 -10
- data/lib/doc_diff.rb +59 -60
- data/lib/docdiff/charstring.rb +225 -241
- data/lib/docdiff/cli.rb +285 -250
- data/lib/docdiff/diff/contours.rb +1 -1
- data/lib/docdiff/diff/editscript.rb +1 -1
- data/lib/docdiff/diff/rcsdiff.rb +1 -1
- data/lib/docdiff/diff/shortestpath.rb +1 -1
- data/lib/docdiff/diff/speculative.rb +1 -1
- data/lib/docdiff/diff/subsequence.rb +1 -1
- data/lib/docdiff/diff/unidiff.rb +1 -1
- data/lib/docdiff/diff.rb +1 -1
- data/lib/docdiff/difference.rb +71 -70
- data/lib/docdiff/document.rb +129 -109
- data/lib/docdiff/encoding/en_ascii.rb +64 -58
- data/lib/docdiff/encoding/ja_eucjp.rb +250 -235
- data/lib/docdiff/encoding/ja_sjis.rb +240 -226
- data/lib/docdiff/encoding/ja_utf8.rb +6952 -6939
- data/lib/docdiff/version.rb +1 -1
- data/lib/docdiff/view.rb +522 -438
- data/lib/docdiff.rb +2 -2
- data/test/charstring_test.rb +475 -351
- data/test/cli_test.rb +103 -101
- data/test/diff_test.rb +15 -16
- data/test/difference_test.rb +40 -31
- data/test/docdiff_test.rb +162 -136
- data/test/document_test.rb +280 -175
- data/test/test_helper.rb +2 -1
- data/test/view_test.rb +636 -497
- metadata +8 -8
- data/devutil/testjis0208.rb +0 -38
data/devutil/jis0208.rb
CHANGED
|
@@ -1,343 +1,375 @@
|
|
|
1
1
|
#!/usr/bin/ruby
|
|
2
|
+
# frozen_string_literal: true
|
|
3
|
+
|
|
2
4
|
# Extracts multibyte characters from JIS0208.TXT.
|
|
3
5
|
# (ftp://ftp.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS/JIS0208.TXT)
|
|
4
6
|
# 2003-03-03 .. 20xx-xx-xx, Hisashi MORITA. Use freely at your own risk.
|
|
5
7
|
# Usage: jis0208.rb <Ku> <Ten> <Codeset> #=> \xXX...
|
|
6
8
|
# Example: jis0208.rb 1 1 utf-8 #=> \xe3\x80\x80
|
|
7
9
|
|
|
10
|
+
Encoding.default_external = "UTF-8"
|
|
11
|
+
|
|
8
12
|
class JIS0208
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
13
|
+
# Convert UTF-16 to UTF-8
|
|
14
|
+
def utf16_to_utf8(utf16)
|
|
15
|
+
utf16value =
|
|
16
|
+
utf16.unpack("C*")[0] * 256 + utf16.unpack("C*")[1]
|
|
17
|
+
utf8 =
|
|
18
|
+
if utf16value < 0x7f
|
|
19
|
+
# 1-byte UTF-8
|
|
20
|
+
[utf16value].pack("C*")
|
|
21
|
+
elsif utf16value < 0x800
|
|
22
|
+
# 2-byte UTF-8
|
|
23
|
+
[
|
|
24
|
+
(0xC0 | (utf16value / 64)),
|
|
25
|
+
(0x80 | (utf16value % 64)),
|
|
26
|
+
].pack("C*")
|
|
27
|
+
else
|
|
28
|
+
# 3-byte UTF-8
|
|
29
|
+
[
|
|
30
|
+
(0xE0 | ((utf16value / 64) / 64)),
|
|
31
|
+
(0x80 | ((utf16value / 64) % 64)),
|
|
32
|
+
(0x80 | (utf16value % 64)),
|
|
33
|
+
].pack("C*")
|
|
34
|
+
end
|
|
35
|
+
utf8
|
|
21
36
|
end
|
|
22
|
-
|
|
37
|
+
|
|
38
|
+
def initialize
|
|
23
39
|
@lines = File.readlines("JIS0208.TXT")
|
|
24
|
-
@lines = @lines.grep(/^[^\#]/)
|
|
25
|
-
@lines = @lines.
|
|
26
|
-
@char_db = @lines.collect
|
|
27
|
-
sjis, jis, utf16 = line.split.
|
|
28
|
-
string.sub(
|
|
29
|
-
|
|
40
|
+
@lines = @lines.grep(/^[^\#]/) # remove comments
|
|
41
|
+
@lines = @lines.map { |l| l.sub(/\s+\#[^\#]+$/, "") } # remove unicode names
|
|
42
|
+
@char_db = @lines.collect do |line|
|
|
43
|
+
sjis, jis, utf16 = line.split.map do |string|
|
|
44
|
+
[string.sub("0x", "")].pack("H*") # "0xXXXX" to 8-bit byte string
|
|
45
|
+
end
|
|
30
46
|
jis_byte_pair = jis.unpack("C*")
|
|
31
47
|
# jis + 0x8080 => euc
|
|
32
|
-
euc
|
|
48
|
+
euc = jis_byte_pair.map { |byte| (byte + 0x80) }.pack("C*")
|
|
33
49
|
# jis - 0x2020 => ku, ten
|
|
34
|
-
ku, ten = jis_byte_pair.
|
|
35
|
-
utf8
|
|
36
|
-
{
|
|
37
|
-
|
|
50
|
+
ku, ten = jis_byte_pair.map { |byte| (byte - 0x20) }
|
|
51
|
+
utf8 = utf16_to_utf8(utf16)
|
|
52
|
+
{
|
|
53
|
+
s: sjis,
|
|
54
|
+
j: jis,
|
|
55
|
+
u16: utf16,
|
|
56
|
+
e: euc,
|
|
57
|
+
u8: utf8,
|
|
58
|
+
ku: ku,
|
|
59
|
+
ten: ten,
|
|
60
|
+
}
|
|
61
|
+
end
|
|
38
62
|
@characters = {}
|
|
39
|
-
@char_db.each
|
|
63
|
+
@char_db.each do |char|
|
|
40
64
|
if @characters[char[:ku]].nil?
|
|
41
65
|
@characters[char[:ku]] = {}
|
|
42
66
|
end
|
|
43
67
|
if @characters[char[:ku]][char[:ten]].nil?
|
|
44
68
|
@characters[char[:ku]][char[:ten]] = {
|
|
45
|
-
:
|
|
46
|
-
:
|
|
47
|
-
:
|
|
48
|
-
:
|
|
49
|
-
:
|
|
69
|
+
s: char[:s],
|
|
70
|
+
j: char[:j],
|
|
71
|
+
u16: char[:u16],
|
|
72
|
+
e: char[:e],
|
|
73
|
+
u8: char[:u8],
|
|
50
74
|
}
|
|
51
75
|
end
|
|
52
|
-
|
|
76
|
+
end
|
|
53
77
|
end
|
|
54
78
|
attr_reader :char_db
|
|
55
79
|
attr_reader :characters
|
|
56
|
-
def char(ku, ten, codeset)
|
|
57
|
-
case
|
|
58
|
-
when /^[Ee]/ =~ codeset then codeset = :e
|
|
59
|
-
when /^[Ss]/ =~ codeset then codeset = :s
|
|
60
|
-
when /^[Jj]/ =~ codeset then codeset = :j
|
|
61
|
-
when /^[Uu].*16$/ =~ codeset then codeset = :u16
|
|
62
|
-
when /^[Uu].*8$/ =~ codeset then codeset = :u8
|
|
63
|
-
else
|
|
64
|
-
raise "invalid codeset name (#{codeset})\n"
|
|
65
|
-
end
|
|
66
|
-
characters[ku][ten][codeset].unpack('C*').collect{|byte|
|
|
67
|
-
sprintf("\\x%x",byte)
|
|
68
|
-
}.join
|
|
69
|
-
end
|
|
70
80
|
|
|
81
|
+
def char(ku, ten, codeset_name)
|
|
82
|
+
codeset =
|
|
83
|
+
case codeset_name
|
|
84
|
+
when /^[Ee]/ then :e
|
|
85
|
+
when /^[Ss]/ then :s
|
|
86
|
+
when /^[Jj]/ then :j
|
|
87
|
+
when /^[Uu].*16$/ then :u16
|
|
88
|
+
when /^[Uu].*8$/ then :u8
|
|
89
|
+
else
|
|
90
|
+
raise "invalid codeset name (#{codeset_name})"
|
|
91
|
+
end
|
|
71
92
|
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
93
|
+
characters[ku][ten][codeset].unpack("C*").map do |byte|
|
|
94
|
+
format("\\x%x", byte)
|
|
95
|
+
end.join
|
|
96
|
+
end
|
|
75
97
|
|
|
76
|
-
|
|
77
|
-
def euc_ja_alnum()
|
|
78
|
-
j = JIS0208.new
|
|
98
|
+
def euc_ja_alnum
|
|
79
99
|
r = []
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
100
|
+
[3].each { |ku| (16..25).to_a.each { |ten| r << char(ku, ten, "e") } }
|
|
101
|
+
[3].each { |ku| (33..58).to_a.each { |ten| r << char(ku, ten, "e") } }
|
|
102
|
+
[3].each { |ku| (65..90).to_a.each { |ten| r << char(ku, ten, "e") } }
|
|
83
103
|
r
|
|
84
104
|
end
|
|
85
|
-
|
|
86
|
-
|
|
105
|
+
|
|
106
|
+
def euc_ja_blank
|
|
87
107
|
r = []
|
|
88
|
-
|
|
108
|
+
[1].each { |ku| [1].each { |ten| r << char(ku, ten, "e") } }
|
|
89
109
|
r
|
|
90
110
|
end
|
|
91
|
-
|
|
92
|
-
|
|
111
|
+
|
|
112
|
+
def euc_ja_print
|
|
113
|
+
euc_ja_graph + euc_ja_blank
|
|
93
114
|
end
|
|
94
|
-
|
|
95
|
-
|
|
115
|
+
|
|
116
|
+
def euc_ja_graph
|
|
117
|
+
euc_ja_alnum + euc_ja_punct
|
|
96
118
|
end
|
|
97
|
-
|
|
98
|
-
|
|
119
|
+
|
|
120
|
+
def euc_ja_punct
|
|
99
121
|
r = []
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
122
|
+
[1].each { |ku| (2..94).to_a.each { |ten| r << char(ku, ten, "e") } }
|
|
123
|
+
[2].each { |ku| (1..14).to_a.each { |ten| r << char(ku, ten, "e") } }
|
|
124
|
+
[2].each { |ku| (26..33).to_a.each { |ten| r << char(ku, ten, "e") } }
|
|
125
|
+
[2].each { |ku| (42..48).to_a.each { |ten| r << char(ku, ten, "e") } }
|
|
126
|
+
[2].each { |ku| (60..74).to_a.each { |ten| r << char(ku, ten, "e") } }
|
|
127
|
+
[2].each { |ku| (82..89).to_a.each { |ten| r << char(ku, ten, "e") } }
|
|
128
|
+
[2].each { |ku| [94].each { |ten| r << char(ku, ten, "e") } }
|
|
129
|
+
[6].each { |ku| (1..24).to_a.each { |ten| r << char(ku, ten, "e") } }
|
|
130
|
+
[6].each { |ku| (33..56).to_a.each { |ten| r << char(ku, ten, "e") } }
|
|
131
|
+
[7].each { |ku| (1..33).to_a.each { |ten| r << char(ku, ten, "e") } }
|
|
132
|
+
[7].each { |ku| (49..81).to_a.each { |ten| r << char(ku, ten, "e") } }
|
|
133
|
+
[8].each { |ku| (1..32).to_a.each { |ten| r << char(ku, ten, "e") } }
|
|
112
134
|
r
|
|
113
135
|
end
|
|
114
|
-
|
|
115
|
-
|
|
136
|
+
|
|
137
|
+
def euc_ja_space
|
|
116
138
|
r = []
|
|
117
|
-
|
|
139
|
+
[1].each { |ku| [1].each { |ten| r << char(ku, ten, "e") } }
|
|
118
140
|
r
|
|
119
141
|
end
|
|
120
|
-
|
|
121
|
-
|
|
142
|
+
|
|
143
|
+
def euc_hiragana
|
|
122
144
|
r = []
|
|
123
|
-
|
|
145
|
+
[4].each { |ku| (1..83).to_a.each { |ten| r << char(ku, ten, "e") } }
|
|
124
146
|
r
|
|
125
147
|
end
|
|
126
|
-
|
|
127
|
-
|
|
148
|
+
|
|
149
|
+
def euc_katakana
|
|
128
150
|
r = []
|
|
129
|
-
|
|
151
|
+
[5].each { |ku| (1..86).to_a.each { |ten| r << char(ku, ten, "e") } }
|
|
130
152
|
r
|
|
131
153
|
end
|
|
132
|
-
|
|
133
|
-
|
|
154
|
+
|
|
155
|
+
def euc_kanji
|
|
134
156
|
r = []
|
|
135
|
-
(16..46).to_a.each{|ku| r << "#{
|
|
136
|
-
|
|
137
|
-
(48..83).to_a.each{|ku|r << "#{
|
|
138
|
-
|
|
157
|
+
(16..46).to_a.each { |ku| r << "#{char(ku, 1, "e")}-#{char(ku, 94, "e")}" }
|
|
158
|
+
[47].each { |ku| r << "#{char(ku, 1, "e")}-#{char(ku, 51, "e")}" }
|
|
159
|
+
(48..83).to_a.each { |ku| r << "#{char(ku, 1, "e")}-#{char(ku, 94, "e")}" }
|
|
160
|
+
[84].each { |ku| r << "#{char(ku, 1, "e")}-#{char(ku, 6, "e")}" }
|
|
139
161
|
r
|
|
140
162
|
end
|
|
141
163
|
|
|
142
|
-
|
|
143
|
-
def sjis_ja_alnum()
|
|
144
|
-
j = JIS0208.new
|
|
164
|
+
def sjis_ja_alnum
|
|
145
165
|
r = []
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
166
|
+
[3].each { |ku| (16..25).to_a.each { |ten| r << char(ku, ten, "s") } }
|
|
167
|
+
[3].each { |ku| (33..58).to_a.each { |ten| r << char(ku, ten, "s") } }
|
|
168
|
+
[3].each { |ku| (65..90).to_a.each { |ten| r << char(ku, ten, "s") } }
|
|
149
169
|
r
|
|
150
170
|
end
|
|
151
|
-
|
|
152
|
-
|
|
171
|
+
|
|
172
|
+
def sjis_ja_blank
|
|
153
173
|
r = []
|
|
154
|
-
|
|
174
|
+
[1].each { |ku| [1].each { |ten| r << char(ku, ten, "s") } }
|
|
155
175
|
r
|
|
156
176
|
end
|
|
157
|
-
|
|
158
|
-
|
|
177
|
+
|
|
178
|
+
def sjis_ja_print
|
|
179
|
+
sjis_ja_graph + sjis_ja_blank
|
|
159
180
|
end
|
|
160
|
-
|
|
161
|
-
|
|
181
|
+
|
|
182
|
+
def sjis_ja_graph
|
|
183
|
+
sjis_ja_alnum + sjis_ja_punct
|
|
162
184
|
end
|
|
163
|
-
|
|
164
|
-
|
|
185
|
+
|
|
186
|
+
def sjis_ja_punct
|
|
165
187
|
r = []
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
#
|
|
179
|
-
#
|
|
180
|
-
#
|
|
181
|
-
#
|
|
188
|
+
[1].each { |ku| (2..94).to_a.each { |ten| r << char(ku, ten, "s") } }
|
|
189
|
+
[2].each { |ku| (1..14).to_a.each { |ten| r << char(ku, ten, "s") } }
|
|
190
|
+
[2].each { |ku| (26..33).to_a.each { |ten| r << char(ku, ten, "s") } }
|
|
191
|
+
[2].each { |ku| (42..48).to_a.each { |ten| r << char(ku, ten, "s") } }
|
|
192
|
+
[2].each { |ku| (60..74).to_a.each { |ten| r << char(ku, ten, "s") } }
|
|
193
|
+
[2].each { |ku| (82..89).to_a.each { |ten| r << char(ku, ten, "s") } }
|
|
194
|
+
[2].each { |ku| [94].each { |ten| r << char(ku, ten, "s") } }
|
|
195
|
+
[6].each { |ku| (1..24).to_a.each { |ten| r << char(ku, ten, "s") } }
|
|
196
|
+
[6].each { |ku| (33..56).to_a.each { |ten| r << char(ku, ten, "s") } }
|
|
197
|
+
[7].each { |ku| (1..33).to_a.each { |ten| r << char(ku, ten, "s") } }
|
|
198
|
+
[7].each { |ku| (49..81).to_a.each { |ten| r << char(ku, ten, "s") } }
|
|
199
|
+
[8].each { |ku| (1..32).to_a.each { |ten| r << char(ku, ten, "s") } }
|
|
200
|
+
# [13].each { |ku| (1..30).to_a.each { |ten| r << char(ku, ten, "s") } } # cp932
|
|
201
|
+
# [13].each { |ku| (32..54).to_a.each { |ten| r << char(ku, ten, "s") } } # cp932
|
|
202
|
+
# [13].each { |ku| (63..92).to_a.each { |ten| r << char(ku, ten, "s") } } # cp932
|
|
203
|
+
# [92].each { |ku| (81..94).to_a.each { |ten| r << char(ku, ten, "s") } } # cp932
|
|
182
204
|
r
|
|
183
205
|
end
|
|
184
|
-
|
|
185
|
-
|
|
206
|
+
|
|
207
|
+
def sjis_ja_space
|
|
186
208
|
r = []
|
|
187
|
-
|
|
209
|
+
[1].each { |ku| [1].each { |ten| r << char(ku, ten, "s") } }
|
|
188
210
|
r
|
|
189
211
|
end
|
|
190
|
-
|
|
191
|
-
|
|
212
|
+
|
|
213
|
+
def sjis_hiragana
|
|
192
214
|
r = []
|
|
193
|
-
|
|
215
|
+
[4].each { |ku| (1..83).to_a.each { |ten| r << char(ku, ten, "s") } }
|
|
194
216
|
r
|
|
195
217
|
end
|
|
196
|
-
|
|
197
|
-
|
|
218
|
+
|
|
219
|
+
def sjis_katakana
|
|
198
220
|
r = []
|
|
199
|
-
|
|
221
|
+
[5].each { |ku| (1..86).to_a.each { |ten| r << char(ku, ten, "s") } }
|
|
200
222
|
r
|
|
201
223
|
end
|
|
202
|
-
|
|
203
|
-
|
|
224
|
+
|
|
225
|
+
def sjis_kanji
|
|
204
226
|
r = []
|
|
205
|
-
(16..46).to_a.each{|ku|r << "#{
|
|
206
|
-
|
|
207
|
-
(48..83).to_a.each{|ku|r << "#{
|
|
208
|
-
|
|
209
|
-
(89..91).to_a.each{|ku|r << "#{
|
|
210
|
-
|
|
227
|
+
(16..46).to_a.each { |ku| r << "#{char(ku, 1, "s")}-#{char(ku, 94, "s")}" }
|
|
228
|
+
[47].each { |ku| r << "#{char(ku, 1, "s")}-#{char(ku, 51, "s")}" }
|
|
229
|
+
(48..83).to_a.each { |ku| r << "#{char(ku, 1, "s")}-#{char(ku, 94, "s")}" }
|
|
230
|
+
[84].each { |ku| r << "#{char(ku, 1, "s")}-#{char(ku, 6, "s")}" }
|
|
231
|
+
# (89..91).to_a.each { |ku| r << "#{char(ku, 1, "s")}-#{char(ku, 94, "s")}" } # cp932 # FIXME
|
|
232
|
+
# [92].each { |ku| r << "#{char(ku, 1, "s")}-#{char(ku, 78, "s")}" } # cp932 # FIXME
|
|
211
233
|
r
|
|
212
234
|
end
|
|
213
235
|
|
|
214
|
-
|
|
215
|
-
def utf8_ja_alnum()
|
|
216
|
-
j = JIS0208.new
|
|
236
|
+
def utf8_ja_alnum
|
|
217
237
|
r = []
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
238
|
+
[3].each { |ku| (16..25).to_a.each { |ten| r << char(ku, ten, "u8") } }
|
|
239
|
+
[3].each { |ku| (33..58).to_a.each { |ten| r << char(ku, ten, "u8") } }
|
|
240
|
+
[3].each { |ku| (65..90).to_a.each { |ten| r << char(ku, ten, "u8") } }
|
|
221
241
|
r
|
|
222
242
|
end
|
|
223
|
-
|
|
224
|
-
|
|
243
|
+
|
|
244
|
+
def utf8_ja_blank
|
|
225
245
|
r = []
|
|
226
|
-
|
|
246
|
+
[1].each { |ku| [1].each { |ten| r << char(ku, ten, "u8") } }
|
|
227
247
|
r
|
|
228
248
|
end
|
|
229
|
-
|
|
230
|
-
|
|
249
|
+
|
|
250
|
+
def utf8_ja_print
|
|
251
|
+
utf8_ja_graph + utf8_ja_blank
|
|
231
252
|
end
|
|
232
|
-
|
|
233
|
-
|
|
253
|
+
|
|
254
|
+
def utf8_ja_graph
|
|
255
|
+
utf8_ja_alnum + utf8_ja_punct
|
|
234
256
|
end
|
|
235
|
-
|
|
236
|
-
|
|
257
|
+
|
|
258
|
+
def utf8_ja_punct
|
|
237
259
|
r = []
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
260
|
+
[1].each { |ku| (2..94).to_a.each { |ten| r << char(ku, ten, "u8") } }
|
|
261
|
+
[2].each { |ku| (1..14).to_a.each { |ten| r << char(ku, ten, "u8") } }
|
|
262
|
+
[2].each { |ku| (26..33).to_a.each { |ten| r << char(ku, ten, "u8") } }
|
|
263
|
+
[2].each { |ku| (42..48).to_a.each { |ten| r << char(ku, ten, "u8") } }
|
|
264
|
+
[2].each { |ku| (60..74).to_a.each { |ten| r << char(ku, ten, "u8") } }
|
|
265
|
+
[2].each { |ku| (82..89).to_a.each { |ten| r << char(ku, ten, "u8") } }
|
|
266
|
+
[2].each { |ku| [94].each { |ten| r << char(ku, ten, "u8") } }
|
|
267
|
+
[6].each { |ku| (1..24).to_a.each { |ten| r << char(ku, ten, "u8") } }
|
|
268
|
+
[6].each { |ku| (33..56).to_a.each { |ten| r << char(ku, ten, "u8") } }
|
|
269
|
+
[7].each { |ku| (1..33).to_a.each { |ten| r << char(ku, ten, "u8") } }
|
|
270
|
+
[7].each { |ku| (49..81).to_a.each { |ten| r << char(ku, ten, "u8") } }
|
|
271
|
+
[8].each { |ku| (1..32).to_a.each { |ten| r << char(ku, ten, "u8") } }
|
|
250
272
|
r
|
|
251
273
|
end
|
|
252
|
-
|
|
253
|
-
|
|
274
|
+
|
|
275
|
+
def utf8_ja_space
|
|
254
276
|
r = []
|
|
255
|
-
|
|
277
|
+
[1].each { |ku| [1].each { |ten| r << char(ku, ten, "u8") } }
|
|
256
278
|
r
|
|
257
279
|
end
|
|
258
|
-
|
|
259
|
-
|
|
280
|
+
|
|
281
|
+
def utf8_hiragana
|
|
260
282
|
r = []
|
|
261
|
-
|
|
283
|
+
[4].each { |ku| (1..83).to_a.each { |ten| r << char(ku, ten, "utf-8") } }
|
|
262
284
|
r
|
|
263
285
|
end
|
|
264
|
-
|
|
265
|
-
|
|
286
|
+
|
|
287
|
+
def utf8_katakana
|
|
266
288
|
r = []
|
|
267
|
-
|
|
289
|
+
[5].each { |ku| (1..86).to_a.each { |ten| r << char(ku, ten, "utf-8") } }
|
|
268
290
|
r
|
|
269
291
|
end
|
|
270
|
-
|
|
271
|
-
|
|
292
|
+
|
|
293
|
+
def utf8_kanji
|
|
272
294
|
r = []
|
|
273
|
-
(16..46).to_a.each{|ku|(1..94).to_a.each{|ten|r <<
|
|
274
|
-
|
|
275
|
-
(48..83).to_a.each{|ku|(1..94).to_a.each{|ten|r <<
|
|
276
|
-
|
|
295
|
+
(16..46).to_a.each { |ku| (1..94).to_a.each { |ten| r << char(ku, ten, "utf-8") } }
|
|
296
|
+
[47].each { |ku| (1..51).to_a.each { |ten| r << char(ku, ten, "utf-8") } }
|
|
297
|
+
(48..83).to_a.each { |ku| (1..94).to_a.each { |ten| r << char(ku, ten, "utf-8") } }
|
|
298
|
+
[84].each { |ku| (1..6).to_a.each { |ten| r << char(ku, ten, "utf-8") } }
|
|
277
299
|
r
|
|
278
300
|
end
|
|
301
|
+
end
|
|
279
302
|
|
|
303
|
+
if __FILE__ == $PROGRAM_NAME
|
|
280
304
|
jis0208 = JIS0208.new
|
|
305
|
+
|
|
281
306
|
if ARGV.size == 3
|
|
282
|
-
ku
|
|
307
|
+
ku = ARGV[0].to_i
|
|
308
|
+
ten = ARGV[1].to_i
|
|
309
|
+
codeset = ARGV[2]
|
|
283
310
|
puts jis0208.char(ku, ten, codeset)
|
|
284
311
|
exit(0)
|
|
285
312
|
elsif ARGV.size == 2
|
|
286
|
-
codeset
|
|
313
|
+
codeset = ARGV[0]
|
|
314
|
+
charclass = ARGV[1]
|
|
287
315
|
else
|
|
288
|
-
puts
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
316
|
+
puts <<~EOS
|
|
317
|
+
Usage: jis0208.rb (<Ku> <Ten> <Codeset> | <Codeset> <CharClass>)
|
|
318
|
+
Supported codeset: EUC-JP, Shift_JIS, UTF-8
|
|
319
|
+
Supported charclass: blank, space, alnum, punct, print, graph, hiragana, katakana, kanji
|
|
320
|
+
Example 1: jis0208.rb 16 1 utf-8 #=> \xe4\xba\x9c
|
|
321
|
+
Example 2: jis0208.rb euc-jp punct #=> \xa1\xa2 ... \xa8\xc0
|
|
322
|
+
EOS
|
|
293
323
|
exit(0)
|
|
294
324
|
end
|
|
295
325
|
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
326
|
+
output =
|
|
327
|
+
case codeset
|
|
328
|
+
when /^e/i # euc-jp
|
|
329
|
+
case charclass
|
|
330
|
+
when /^space/i then jis0208.euc_ja_space
|
|
331
|
+
when /^blank/i then jis0208.euc_ja_blank
|
|
332
|
+
when /^alnum/i then jis0208.euc_ja_alnum
|
|
333
|
+
when /^punct/i then jis0208.euc_ja_punct
|
|
334
|
+
when /^print/i then jis0208.euc_ja_print
|
|
335
|
+
when /^graph/i then jis0208.euc_ja_graph
|
|
336
|
+
when /^hira/i then jis0208.euc_hiragana
|
|
337
|
+
when /^kata/i then jis0208.euc_katakana
|
|
338
|
+
when /^kanji/i then jis0208.euc_kanji
|
|
339
|
+
else
|
|
340
|
+
raise "invalid charclass (#{charclass})"
|
|
341
|
+
end
|
|
342
|
+
when /^s/i # sjis
|
|
343
|
+
case charclass
|
|
344
|
+
when /^space/i then jis0208.sjis_ja_space
|
|
345
|
+
when /^blank/i then jis0208.sjis_ja_blank
|
|
346
|
+
when /^alnum/i then jis0208.sjis_ja_alnum
|
|
347
|
+
when /^punct/i then jis0208.sjis_ja_punct
|
|
348
|
+
when /^print/i then jis0208.sjis_ja_print
|
|
349
|
+
when /^graph/i then jis0208.sjis_ja_graph
|
|
350
|
+
when /^hira/i then jis0208.sjis_hiragana
|
|
351
|
+
when /^kata/i then jis0208.sjis_katakana
|
|
352
|
+
when /^kanji/i then jis0208.sjis_kanji
|
|
353
|
+
else
|
|
354
|
+
raise "invalid charclass (#{charclass})"
|
|
355
|
+
end
|
|
356
|
+
when /^u/i # utf-8
|
|
357
|
+
case charclass
|
|
358
|
+
when /^space/i then jis0208.utf8_ja_space
|
|
359
|
+
when /^blank/i then jis0208.utf8_ja_blank
|
|
360
|
+
when /^alnum/i then jis0208.utf8_ja_alnum
|
|
361
|
+
when /^punct/i then jis0208.utf8_ja_punct
|
|
362
|
+
when /^print/i then jis0208.utf8_ja_print
|
|
363
|
+
when /^graph/i then jis0208.utf8_ja_graph
|
|
364
|
+
when /^hira/i then jis0208.utf8_hiragana
|
|
365
|
+
when /^kata/i then jis0208.utf8_katakana
|
|
366
|
+
when /^kanji/i then jis0208.utf8_kanji
|
|
367
|
+
else
|
|
368
|
+
raise "invalid charclass (#{charclass})"
|
|
369
|
+
end
|
|
336
370
|
else
|
|
337
|
-
raise "invalid charclass (#{charclass})
|
|
371
|
+
raise "invalid codeset (#{codeset}) or charclass (#{charclass})"
|
|
338
372
|
end
|
|
339
|
-
else
|
|
340
|
-
raise "invalid codeset (#{codeset}) or charclass (#{charclass}).\n"
|
|
341
|
-
end
|
|
342
373
|
|
|
374
|
+
puts output
|
|
343
375
|
end
|