docdiff 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +6 -0
- data/.travis.yml +7 -0
- data/Gemfile +17 -0
- data/Guardfile +8 -0
- data/Makefile +108 -0
- data/Rakefile +17 -0
- data/bin/docdiff +179 -0
- data/devutil/JIS0208.TXT +6952 -0
- data/devutil/char_by_charclass.rb +23 -0
- data/devutil/charclass_by_char.rb +21 -0
- data/devutil/jis0208.rb +343 -0
- data/devutil/testjis0208.rb +38 -0
- data/docdiff.conf.example +22 -0
- data/docdiff.gemspec +23 -0
- data/docdiffwebui.cgi +176 -0
- data/docdiffwebui.html +123 -0
- data/img/docdiff-screenshot-format-html-digest-firefox.png +0 -0
- data/img/docdiff-screenshot-format-html-firefox.png +0 -0
- data/img/docdiff-screenshot-format-tty-cmdexe-en.png +0 -0
- data/img/docdiff-screenshot-format-tty-cmdexe-ja.png +0 -0
- data/img/docdiff-screenshot-format-tty-rxvtunicode-en.png +0 -0
- data/img/docdiff-screenshot-format-tty-rxvtunicode-ja.png +0 -0
- data/img/docdiff-screenshot-format-tty-xterm-en.png +0 -0
- data/img/docdiff-screenshot-format-tty-xterm-ja.png +0 -0
- data/img/docdiff-screenshot-resolution-linewordchar-xterm.png +0 -0
- data/index.html +181 -0
- data/langfilter.rb +14 -0
- data/lib/doc_diff.rb +170 -0
- data/lib/docdiff.rb +7 -0
- data/lib/docdiff/charstring.rb +579 -0
- data/lib/docdiff/diff.rb +217 -0
- data/lib/docdiff/diff/contours.rb +382 -0
- data/lib/docdiff/diff/editscript.rb +148 -0
- data/lib/docdiff/diff/rcsdiff.rb +107 -0
- data/lib/docdiff/diff/shortestpath.rb +93 -0
- data/lib/docdiff/diff/speculative.rb +40 -0
- data/lib/docdiff/diff/subsequence.rb +39 -0
- data/lib/docdiff/diff/unidiff.rb +124 -0
- data/lib/docdiff/difference.rb +92 -0
- data/lib/docdiff/document.rb +127 -0
- data/lib/docdiff/encoding/en_ascii.rb +97 -0
- data/lib/docdiff/encoding/ja_eucjp.rb +269 -0
- data/lib/docdiff/encoding/ja_sjis.rb +260 -0
- data/lib/docdiff/encoding/ja_utf8.rb +6974 -0
- data/lib/docdiff/version.rb +3 -0
- data/lib/docdiff/view.rb +476 -0
- data/lib/viewdiff.rb +375 -0
- data/readme.html +713 -0
- data/sample/01.en.ascii.cr +1 -0
- data/sample/01.en.ascii.crlf +2 -0
- data/sample/01.en.ascii.lf +2 -0
- data/sample/01.ja.eucjp.lf +2 -0
- data/sample/01.ja.sjis.cr +1 -0
- data/sample/01.ja.sjis.crlf +2 -0
- data/sample/01.ja.utf8.crlf +2 -0
- data/sample/02.en.ascii.cr +1 -0
- data/sample/02.en.ascii.crlf +2 -0
- data/sample/02.en.ascii.lf +2 -0
- data/sample/02.ja.eucjp.lf +2 -0
- data/sample/02.ja.sjis.cr +1 -0
- data/sample/02.ja.sjis.crlf +2 -0
- data/sample/02.ja.utf8.crlf +2 -0
- data/sample/humpty_dumpty01.ascii.lf +4 -0
- data/sample/humpty_dumpty02.ascii.lf +4 -0
- data/test/charstring_test.rb +1008 -0
- data/test/diff_test.rb +36 -0
- data/test/difference_test.rb +64 -0
- data/test/docdiff_test.rb +193 -0
- data/test/document_test.rb +626 -0
- data/test/test_helper.rb +7 -0
- data/test/view_test.rb +570 -0
- data/test/viewdiff_test.rb +908 -0
- metadata +129 -0
@@ -0,0 +1,23 @@
|
|
1
|
+
#!/usr/bin/ruby
|
2
|
+
# test character classes on ASCII characters.
|
3
|
+
# 2003-03-10 Hisashi MORITA
|
4
|
+
|
5
|
+
# POSIX bracket classes to probe, in the order they are reported.
charclasses = ["[:cntrl:]",
               "[:space:]", "[:blank:]",
               "[:digit:]",
               "[:alpha:]", "[:alnum:]",
               "[:punct:]",
               "[:lower:]", "[:upper:]",
               "[:print:]", "[:graph:]",
               "[:xdigit:]"]
chars = (0x00 .. 0xff).to_a

# Returns the byte values from +chars+ whose one-byte string matches the
# POSIX bracket class +charclass+ (e.g. "[:digit:]").
def member_chars_of(charclass, chars)
  pattern = Regexp.new("[#{charclass}]")  # compiled once per class, not once per byte
  # [char].pack replaces the former char.to_a.pack: Object#to_a no longer
  # exists, so calling to_a on an Integer raises NoMethodError.
  chars.select{|char| pattern =~ [char].pack("C*")}
end

# One report entry per class: the members as \xXX escapes, then (on a second,
# tab-indented line) the same members as they appear in an inspected string.
charclasses.each{|charclass|
  member_chars = member_chars_of(charclass, chars)
  escaped = member_chars.collect{|char| sprintf("\\x%02x", char)}.join
  literal = member_chars.collect{|char| [char].pack('C*').inspect[1..-2]}.join
  puts "#{charclass}\t#{escaped}\n\t\t(#{literal})"
}
|
@@ -0,0 +1,21 @@
|
|
1
|
+
#!/usr/bin/ruby
|
2
|
+
# test character classes on ASCII characters.
|
3
|
+
# 2003-03-10 Hisashi MORITA
|
4
|
+
|
5
|
+
# POSIX bracket classes to probe, in the order they are reported.
charclasses = ["[:cntrl:]",
               "[:space:]", "[:blank:]",
               "[:digit:]",
               "[:alpha:]", "[:alnum:]",
               "[:punct:]",
               "[:lower:]", "[:upper:]",
               "[:print:]", "[:graph:]",
               "[:xdigit:]"]

# Returns the entries of +charclasses+ whose POSIX bracket class matches the
# single byte +char+ (an Integer), preserving the order of +charclasses+.
def charclasses_of(char, charclasses)
  # [char].pack replaces the former char.to_a.pack: Object#to_a no longer
  # exists, so calling to_a on an Integer raises NoMethodError.
  byte = [char].pack("C*")
  charclasses.select{|charclass| Regexp.new("[#{charclass}]") =~ byte}
end

# One line per byte: \xXX escape, the inspected one-byte string, then the
# comma-separated classes the byte belongs to.
(0x00 .. 0xff).each{|char|
  attribute = charclasses_of(char, charclasses)
  puts "#{sprintf("\\x%02x", char)} (#{[char].pack('C*').inspect})\t#{attribute.join(', ')}"
}
|
data/devutil/jis0208.rb
ADDED
@@ -0,0 +1,343 @@
|
|
1
|
+
#!/usr/bin/ruby
|
2
|
+
# Extracts multibyte characters from JIS0208.TXT.
|
3
|
+
# (ftp://ftp.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS/JIS0208.TXT)
|
4
|
+
# 2003-03-03 .. 20xx-xx-xx, Hisashi MORITA. Use freely at your own risk.
|
5
|
+
# Usage: jis0208.rb <Ku> <Ten> <Codeset> #=> \xXX...
|
6
|
+
# Example: jis0208.rb 1 1 utf-8 #=> \xe3\x80\x80
|
7
|
+
|
8
|
+
# Lookup table of the characters listed in the JIS0208.TXT mapping file.
#
# The file is read from the current working directory.  Each data line is
# expected to hold three "0xXXXX" columns (Shift_JIS, JIS, Unicode) optionally
# followed by a "# name" comment.  For every line the EUC, UTF-8 and
# ku/ten values are derived from those columns.
class JIS0208
  # Convert one UTF-16 code unit to UTF-8 (without BOM).
  #
  # utf16:: 2-byte string, high byte first.
  # Returns the UTF-8 encoding as a byte string (1 to 3 bytes).
  # Only a single 16-bit unit is handled; surrogate pairs are not combined.
  def utf16_to_utf8(utf16)
    high, low = utf16.unpack("C*")
    utf16value = high * 256 + low
    if utf16value < 0x80 # 1-byte utf-8 (U+0000..U+007F, 0x7f included)
      [utf16value].pack("C*")
    elsif utf16value < 0x800 # 2-byte utf-8
      [(0xC0 | (utf16value / 64)),
       (0x80 | (utf16value % 64))].pack("C*")
    else # 3-byte utf-8
      [(0xE0 | ((utf16value / 64) / 64)),
       (0x80 | ((utf16value / 64) % 64)),
       (0x80 | (utf16value % 64))].pack("C*")
    end
  end

  # Parse JIS0208.TXT and build both views of the data:
  # @char_db (flat list, file order) and @characters (ku => ten => encodings).
  def initialize()
    @lines = File.readlines("JIS0208.TXT")
    @lines = @lines.grep(/^[^\#]/) # remove comments
    @lines = @lines.collect{|l| l.sub(/\s+\#[^\#]+$/,'')} # remove unicode names
    @lines = @lines.reject{|l| l.strip.empty?} # blank lines carry no columns
    @char_db = @lines.collect {|line|
      sjis, jis, utf16 = line.split.collect{|string|
        # "0xXXXX" to 8-bit byte string.  The hex digits are wrapped in an
        # Array literal because String#to_a is not available any more.
        [string.sub(/0x/, '')].pack("H*")
      }
      jis_byte_pair = jis.unpack("C*")
      # jis + 0x8080 => euc
      euc = jis_byte_pair.collect {|byte| (byte + 0x80)}.pack("C*")
      # jis - 0x2020 => ku, ten
      ku, ten = jis_byte_pair.collect {|byte| (byte - 0x20)}
      utf8 = utf16_to_utf8(utf16)
      {:s=>sjis, :j=>jis, :u16=>utf16, :e=>euc, :u8=>utf8, :ku=>ku, :ten=>ten}
    }
    @characters = {}
    @char_db.each{|char|
      row = (@characters[char[:ku]] ||= {})
      # The first definition of a ku/ten cell wins; later duplicates are ignored.
      row[char[:ten]] ||= {
        :s=>char[:s],
        :j=>char[:j],
        :u16=>char[:u16],
        :e=>char[:e],
        :u8=>char[:u8]
      }
    }
  end

  # Flat list of per-character hashes
  # (:s, :j, :u16, :e, :u8 byte strings plus :ku and :ten integers).
  attr_reader :char_db
  # Nested hash: characters[ku][ten] => {:s, :j, :u16, :e, :u8}.
  attr_reader :characters

  # Return the bytes of the character at +ku+/+ten+ in +codeset+ as a string
  # of "\xNN" escapes (e.g. "\xe3\x80\x80").
  #
  # codeset:: name starting with E (EUC), S (Shift_JIS) or J (JIS), or
  #           starting with U and ending in 16 or 8 (UTF-16 / UTF-8).
  # Raises RuntimeError for an unknown codeset name or an undefined ku/ten.
  def char(ku, ten, codeset)
    key = case codeset
          when /^[Ee]/ then :e
          when /^[Ss]/ then :s
          when /^[Jj]/ then :j
          when /^[Uu].*16$/ then :u16
          when /^[Uu].*8$/ then :u8
          else
            raise "invalid codeset name (#{codeset})\n"
          end
    row = characters[ku]
    entry = row && row[ten]
    # Fail with a readable message instead of NoMethodError on nil.
    raise "no such character (ku=#{ku}, ten=#{ten})\n" if entry.nil?
    entry[key].unpack('C*').collect{|byte|
      sprintf("\\x%x",byte)
    }.join
  end

end
|
73
|
+
|
74
|
+
# Command-line driver.
#
#   jis0208.rb <Ku> <Ten> <Codeset>     prints one character as \xNN escapes
#   jis0208.rb <Codeset> <CharClass>    prints every member of a character
#                                       class, one entry per line
#
# The ku/ten layout of each class is kept in tables, and the JIS0208 table is
# loaded once per run, after the arguments have been validated, so the usage
# text and argument errors do not depend on JIS0208.TXT being present.
if __FILE__ == $0

  # Character-class cells as [ku, ten-range] pairs, in output order.
  JA_ALNUM = [[3, 16..25], [3, 33..58], [3, 65..90]]
  JA_BLANK = [[1, 1..1]]
  JA_PUNCT = [[1, 2..94],
              [2, 1..14], [2, 26..33], [2, 42..48],
              [2, 60..74], [2, 82..89], [2, 94..94],
              [6, 1..24], [6, 33..56],
              [7, 1..33], [7, 49..81],
              [8, 1..32]]
  # Commented out in the original for Shift_JIS punct (marked cp932):
  #   [13, 1..30], [13, 32..54], [13, 63..92], [92, 81..94]
  JA_GRAPH = JA_ALNUM + JA_PUNCT
  JA_PRINT = JA_GRAPH + JA_BLANK

  # Accepted charclass prefixes (matched case-insensitively) and their cells.
  # "kanji" has no cell list because it is printed by dedicated helpers.
  CHARCLASS_PREFIXES = %w(space blank alnum punct print graph hira kata kanji)
  CHARCLASS_CELLS = {
    "space" => JA_BLANK,
    "blank" => JA_BLANK,
    "alnum" => JA_ALNUM,
    "punct" => JA_PUNCT,
    "print" => JA_PRINT,
    "graph" => JA_GRAPH,
    "hira"  => [[4, 1..83]],
    "kata"  => [[5, 1..86]]
  }

  # Kanji rows as [ku-range, last ten of those rows].
  KANJI_ROWS = [[16..46, 94], [47..47, 51], [48..83, 94], [84..84, 6]]
  # Extra rows listed only for Shift_JIS (marked "cp932" in the original).
  # NOTE(review): whether JIS0208.TXT defines ku 89..92 cannot be seen from
  # this file; if it does not, the Shift_JIS kanji listing fails -- confirm.
  CP932_KANJI_ROWS = [[89..91, 94], [92..92, 78]]

  # Escaped form of every character in +cells+, in table order.
  def chars_in_cells(jis0208, cells, codeset)
    result = []
    cells.each{|ku, tens|
      tens.each{|ten| result << jis0208.char(ku, ten, codeset)}
    }
    result
  end

  # One "first-last" entry per kanji row (used for EUC-JP and Shift_JIS).
  def kanji_row_ranges(jis0208, rows, codeset)
    result = []
    rows.each{|kus, last_ten|
      kus.each{|ku|
        result << "#{jis0208.char(ku, 1, codeset)}-#{jis0208.char(ku, last_ten, codeset)}"
      }
    }
    result
  end

  # Every kanji listed individually (used for UTF-8).
  def kanji_chars(jis0208, rows, codeset)
    result = []
    rows.each{|kus, last_ten|
      kus.each{|ku|
        (1..last_ten).each{|ten| result << jis0208.char(ku, ten, codeset)}
      }
    }
    result
  end

  if ARGV.size == 3
    ku, ten, codeset = ARGV[0].to_i, ARGV[1].to_i, ARGV[2].to_s
    puts JIS0208.new.char(ku, ten, codeset)
    exit(0)
  elsif ARGV.size == 2
    codeset, charclass = ARGV[0].to_s, ARGV[1].to_s
  else
    puts "Usage: jis0208.rb (<Ku> <Ten> <Codeset> | <Codeset> <CharClass>)"
    puts "Supported codeset: EUC-JP, Shift_JIS, UTF-8"
    puts "Supported charclass: blank, space, alnum, punct, print, graph, hiragana, katakana, kanji"
    puts "Example 1: jis0208.rb 16 1 utf-8"
    puts "Example 2: jis0208.rb euc-jp punct"
    exit(0)
  end

  # The codeset is checked before the charclass, so a bad codeset is the
  # error reported when both arguments are invalid.
  codeset_key = case codeset
                when /^e/i then "e"  # euc-jp
                when /^s/i then "s"  # sjis
                when /^u/i then "u8" # utf-8
                else
                  raise "invalid codeset (#{codeset}) or charclass (#{charclass}).\n"
                end
  charclass_key = CHARCLASS_PREFIXES.find{|prefix| /^#{prefix}/i =~ charclass}
  raise "invalid charclass (#{charclass}).\n" if charclass_key.nil?

  jis0208 = JIS0208.new
  if charclass_key == "kanji"
    case codeset_key
    when "u8" then puts kanji_chars(jis0208, KANJI_ROWS, codeset_key)
    when "s"  then puts kanji_row_ranges(jis0208, KANJI_ROWS + CP932_KANJI_ROWS, codeset_key)
    else           puts kanji_row_ranges(jis0208, KANJI_ROWS, codeset_key)
    end
  else
    puts chars_in_cells(jis0208, CHARCLASS_CELLS[charclass_key], codeset_key)
  end

end
|
@@ -0,0 +1,38 @@
|
|
1
|
+
require 'test/unit'
|
2
|
+
require 'jis0208'
|
3
|
+
require 'nkf'
|
4
|
+
require 'iconv'
|
5
|
+
|
6
|
+
# Test case for the JIS0208 helper class.
#
# Every test that used to live here is obsolete and is kept below only as
# commented-out reference, so +setup+ is the only method the class defines.
class TC_JIS0208 < Test::Unit::TestCase
  # No fixture is needed.
  def setup; end

  # ---- obsolete ----
  # NOTE(review): "表" is JIS X 0208 ku 41, ten 29 (the "4129" below); its
  # UTF-8 form would be e8 a1 a8, so the leading 0xe1 in these fixtures
  # looks like a typo -- confirm before reviving any of these tests.
  #
  # def test_string_to_array()
  #   jis0208 = JIS0208.new
  #   expected = [0xe1, 0xa1, 0xa8] # "表"(4129) UTF-8 as array
  #   # assert_equal(expected, Iconv.iconv("UTF-8", "EUC-JP", "表").to_s)
  #   assert_equal(expected, jis0208.string_to_array("\xe1\xa1\xa8"))
  # end
  # def test_array_to_string()
  #   jis0208 = JIS0208.new
  #   expected = "\xe1\xa1\xa8" # "表"(4129) in UTF-8 string
  #   # assert_equal(expected, Iconv.iconv("UTF-8", "EUC-JP", "表").to_s)
  #   assert_equal(expected, jis0208.array_to_string([0xe1, 0xa1, 0xa8]))
  # end

  # ---- obsolete ----
  # (teardown was inside this commented-out section and stays disabled)
  #
  # def test_to_value_array()
  #   expected = [0xe1, 0xa1, 0xa8] # "表"(4129) UTF-8 as array
  #   assert_equal(expected, "\xe1\xa1\xa8".to_value_array)
  # end
  # def test_to_binary_string()
  #   expected = "\xe1\xa1\xa8" # "表"(4129) in UTF-8 string
  #   assert_equal(expected, [0xe1, 0xa1, 0xa8].to_binary_string)
  # end
  # def teardown()
  #   #
  # end
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
## DocDiff configuration file
|
2
|
+
## Uncomment and modify the following lines as you like.
|
3
|
+
#
|
4
|
+
# resolution = word
|
5
|
+
# encoding = UTF-8
|
6
|
+
# eol = LF
|
7
|
+
# format = html
|
8
|
+
# digest = off
|
9
|
+
# cache = off # not implemented yet
|
10
|
+
# verbose = no # not implemented yet
|
11
|
+
#
|
12
|
+
## user-defined tags (not well-supported yet)
|
13
|
+
# tag_common_start = '<=>'
|
14
|
+
# tag_common_end = '</=>'
|
15
|
+
# tag_del_start = '<->'
|
16
|
+
# tag_del_end = '</->'
|
17
|
+
# tag_add_start = '<+>'
|
18
|
+
# tag_add_end = '</+>'
|
19
|
+
# tag_change_before_start = '<!->'
|
20
|
+
# tag_change_before_end = '</!->'
|
21
|
+
# tag_change_after_start = '<!+>'
|
22
|
+
# tag_change_after_end = '</!+>'
|