docdiff 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +6 -0
- data/.travis.yml +7 -0
- data/Gemfile +17 -0
- data/Guardfile +8 -0
- data/Makefile +108 -0
- data/Rakefile +17 -0
- data/bin/docdiff +179 -0
- data/devutil/JIS0208.TXT +6952 -0
- data/devutil/char_by_charclass.rb +23 -0
- data/devutil/charclass_by_char.rb +21 -0
- data/devutil/jis0208.rb +343 -0
- data/devutil/testjis0208.rb +38 -0
- data/docdiff.conf.example +22 -0
- data/docdiff.gemspec +23 -0
- data/docdiffwebui.cgi +176 -0
- data/docdiffwebui.html +123 -0
- data/img/docdiff-screenshot-format-html-digest-firefox.png +0 -0
- data/img/docdiff-screenshot-format-html-firefox.png +0 -0
- data/img/docdiff-screenshot-format-tty-cmdexe-en.png +0 -0
- data/img/docdiff-screenshot-format-tty-cmdexe-ja.png +0 -0
- data/img/docdiff-screenshot-format-tty-rxvtunicode-en.png +0 -0
- data/img/docdiff-screenshot-format-tty-rxvtunicode-ja.png +0 -0
- data/img/docdiff-screenshot-format-tty-xterm-en.png +0 -0
- data/img/docdiff-screenshot-format-tty-xterm-ja.png +0 -0
- data/img/docdiff-screenshot-resolution-linewordchar-xterm.png +0 -0
- data/index.html +181 -0
- data/langfilter.rb +14 -0
- data/lib/doc_diff.rb +170 -0
- data/lib/docdiff.rb +7 -0
- data/lib/docdiff/charstring.rb +579 -0
- data/lib/docdiff/diff.rb +217 -0
- data/lib/docdiff/diff/contours.rb +382 -0
- data/lib/docdiff/diff/editscript.rb +148 -0
- data/lib/docdiff/diff/rcsdiff.rb +107 -0
- data/lib/docdiff/diff/shortestpath.rb +93 -0
- data/lib/docdiff/diff/speculative.rb +40 -0
- data/lib/docdiff/diff/subsequence.rb +39 -0
- data/lib/docdiff/diff/unidiff.rb +124 -0
- data/lib/docdiff/difference.rb +92 -0
- data/lib/docdiff/document.rb +127 -0
- data/lib/docdiff/encoding/en_ascii.rb +97 -0
- data/lib/docdiff/encoding/ja_eucjp.rb +269 -0
- data/lib/docdiff/encoding/ja_sjis.rb +260 -0
- data/lib/docdiff/encoding/ja_utf8.rb +6974 -0
- data/lib/docdiff/version.rb +3 -0
- data/lib/docdiff/view.rb +476 -0
- data/lib/viewdiff.rb +375 -0
- data/readme.html +713 -0
- data/sample/01.en.ascii.cr +1 -0
- data/sample/01.en.ascii.crlf +2 -0
- data/sample/01.en.ascii.lf +2 -0
- data/sample/01.ja.eucjp.lf +2 -0
- data/sample/01.ja.sjis.cr +1 -0
- data/sample/01.ja.sjis.crlf +2 -0
- data/sample/01.ja.utf8.crlf +2 -0
- data/sample/02.en.ascii.cr +1 -0
- data/sample/02.en.ascii.crlf +2 -0
- data/sample/02.en.ascii.lf +2 -0
- data/sample/02.ja.eucjp.lf +2 -0
- data/sample/02.ja.sjis.cr +1 -0
- data/sample/02.ja.sjis.crlf +2 -0
- data/sample/02.ja.utf8.crlf +2 -0
- data/sample/humpty_dumpty01.ascii.lf +4 -0
- data/sample/humpty_dumpty02.ascii.lf +4 -0
- data/test/charstring_test.rb +1008 -0
- data/test/diff_test.rb +36 -0
- data/test/difference_test.rb +64 -0
- data/test/docdiff_test.rb +193 -0
- data/test/document_test.rb +626 -0
- data/test/test_helper.rb +7 -0
- data/test/view_test.rb +570 -0
- data/test/viewdiff_test.rb +908 -0
- metadata +129 -0
@@ -0,0 +1,23 @@
|
|
1
|
+
#!/usr/bin/ruby
# For each POSIX bracket character class, list every byte value
# (0x00..0xff, i.e. ASCII plus the high half) that the class matches.
# 2003-03-10 Hisashi MORITA

charclasses = ["[:cntrl:]",
               "[:space:]", "[:blank:]",
               "[:digit:]",
               "[:alpha:]", "[:alnum:]",
               "[:punct:]",
               "[:lower:]", "[:upper:]",
               "[:print:]", "[:graph:]",
               "[:xdigit:]"]
chars = (0x00 .. 0xff).to_a

charclasses.each{|charclass|
  # Compile the pattern once per class rather than once per byte.
  pattern = Regexp.new("[#{charclass}]")
  # [char].pack replaces char.to_a.pack: Object#to_a was removed in
  # Ruby 1.9, so calling to_a on an Integer raises NoMethodError.
  member_chars = chars.select{|char| pattern =~ [char].pack("C*")}
  # First output line: the members as \xNN escapes.
  hex_codes = member_chars.collect{|char| sprintf("\\x%02x", char)}.join
  # Second output line: the same members as Ruby would inspect them,
  # with the surrounding double quotes stripped.
  literals = member_chars.collect{|char| [char].pack('C*').inspect[1..-2]}.join
  puts "#{charclass}\t#{hex_codes}\n\t\t(#{literals})"
}
|
@@ -0,0 +1,21 @@
|
|
1
|
+
#!/usr/bin/ruby
# For each byte value 0x00..0xff, list the POSIX bracket character
# classes that the byte belongs to.
# 2003-03-10 Hisashi MORITA

charclasses = ["[:cntrl:]",
               "[:space:]", "[:blank:]",
               "[:digit:]",
               "[:alpha:]", "[:alnum:]",
               "[:punct:]",
               "[:lower:]", "[:upper:]",
               "[:print:]", "[:graph:]",
               "[:xdigit:]"]
# Compile each class pattern once up front instead of once per byte.
patterns = charclasses.collect{|charclass|
  [charclass, Regexp.new("[#{charclass}]")]
}
(0x00 .. 0xff).each{|char|
  # [char].pack replaces char.to_a.pack: Object#to_a was removed in
  # Ruby 1.9, so calling to_a on an Integer raises NoMethodError.
  byte = [char].pack("C*")
  attribute = []
  patterns.each{|charclass, pattern|
    attribute.push charclass if pattern =~ byte
  }
  puts "#{sprintf("\\x%02x", char)} (#{byte.inspect})\t#{attribute.join(', ')}"
}
|
data/devutil/jis0208.rb
ADDED
@@ -0,0 +1,343 @@
|
|
1
|
+
#!/usr/bin/ruby
|
2
|
+
# Extracts multibyte characters from JIS0208.TXT.
|
3
|
+
# (ftp://ftp.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS/JIS0208.TXT)
|
4
|
+
# 2003-03-03 .. 20xx-xx-xx, Hisashi MORITA. Use freely at your own risk.
|
5
|
+
# Usage: jis0208.rb <Ku> <Ten> <Codeset> #=> \xXX...
|
6
|
+
# Example: jis0208.rb 1 1 utf-8 #=> \xe3\x80\x80
|
7
|
+
|
8
|
+
# Lookup table of JIS X 0208 characters, built from the mapping file
# JIS0208.TXT, which is read from the current working directory.
class JIS0208
  # Flat list with one hash per mapping line
  # (keys :s, :j, :u16, :e, :u8, :ku, :ten).
  attr_reader :char_db
  # Nested hash: characters[ku][ten] => {:s, :j, :u16, :e, :u8} byte strings.
  attr_reader :characters

  # Convert one UTF-16 code unit to UTF-8N.
  #
  # utf16:: byte string whose first two bytes are the code unit,
  #         high byte first. Further bytes are ignored, so surrogate
  #         pairs are not supported.
  # Returns the 1- to 3-byte UTF-8 encoding as a byte string.
  def utf16_to_utf8(utf16)
    high, low = utf16.unpack("C*")
    utf16value = high * 256 + low
    if utf16value < 0x80 # 1-byte utf-8 (was "< 0x7f", which gave U+007F an overlong form)
      [utf16value].pack("C*")
    elsif utf16value < 0x800 # 2-byte utf-8
      [(0xC0 | (utf16value / 64)),
       (0x80 | (utf16value % 64))].pack("C*")
    else # 3-byte utf-8
      [(0xE0 | ((utf16value / 64) / 64)),
       (0x80 | ((utf16value / 64) % 64)),
       (0x80 | (utf16value % 64))].pack("C*")
    end
  end

  # Parse JIS0208.TXT and build both @char_db and @characters.
  # Each data line is expected to hold three "0xXXXX" columns
  # (Shift_JIS, JIS, Unicode) optionally followed by "# name".
  # Raises the usual File errors when the mapping file cannot be read.
  def initialize()
    @lines = File.readlines("JIS0208.TXT")
    @lines = @lines.grep(/^[^\#]/) # remove comments
    @lines = @lines.reject{|l| l.strip.empty?} # blank lines carry no mapping
    @lines = @lines.collect{|l| l.sub(/\s+\#[^\#]+$/,'')} # remove unicode names
    @char_db = @lines.collect {|line|
      sjis, jis, utf16 = line.split.collect{|string|
        # "0xXXXX" to 8-bit byte string. The hex text is wrapped in an
        # array literal because String#to_a no longer exists (Ruby 1.9+).
        [string.sub(/0x/, '')].pack("H*")
      }
      jis_byte_pair = jis.unpack("C*")
      # jis + 0x8080 => euc
      euc = jis_byte_pair.collect {|byte| (byte + 0x80)}.pack("C*")
      # jis - 0x2020 => ku, ten
      ku, ten = jis_byte_pair.collect {|byte| (byte - 0x20)}
      utf8 = utf16_to_utf8(utf16)
      {:s=>sjis, :j=>jis, :u16=>utf16, :e=>euc, :u8=>utf8, :ku=>ku, :ten=>ten}
    }
    @characters = {}
    @char_db.each{|char|
      row = (@characters[char[:ku]] ||= {})
      # The first mapping seen for a cell wins; later duplicates are ignored.
      row[char[:ten]] ||= {
        :s=>char[:s],
        :j=>char[:j],
        :u16=>char[:u16],
        :e=>char[:e],
        :u8=>char[:u8]
      }
    }
  end

  # Return the bytes of the character at (ku, ten) in the requested
  # codeset, written as a string of "\xNN" escapes.
  #
  # ku, ten:: row and cell numbers as derived from the JIS column.
  # codeset:: name matched by prefix: E... (EUC), S... (Shift_JIS),
  #           J... (JIS), U...16 (UTF-16), U...8 (UTF-8).
  # Raises RuntimeError for an unknown codeset name or when the table
  # has no entry for (ku, ten).
  def char(ku, ten, codeset)
    key = case codeset
          when /^[Ee]/     then :e
          when /^[Ss]/     then :s
          when /^[Jj]/     then :j
          when /^[Uu].*16$/ then :u16
          when /^[Uu].*8$/  then :u8
          else
            raise "invalid codeset name (#{codeset})\n"
          end
    row = characters[ku]
    cell = row && row[ten]
    # Fail with a readable message instead of NoMethodError on nil.
    raise "no such character (ku=#{ku}, ten=#{ten})\n" if cell.nil?
    cell[key].unpack('C*').collect{|byte|
      # Always two hex digits, so bytes below 0x10 (possible in UTF-16)
      # are not emitted as an ambiguous one-digit escape.
      sprintf("\\x%02x",byte)
    }.join
  end
end
|
73
|
+
|
74
|
+
if __FILE__ == $0

  # Command-line front end. Prints either one character
  # (<Ku> <Ten> <Codeset>) or a whole character class
  # (<Codeset> <CharClass>) as "\xNN" escapes.

  # Character classes as lists of [kus, tens] pairs. Each member of a
  # pair is a single number or a range, and the pairs are expanded in
  # the order they are written.
  JA_BLANK_CELLS = [[1, 1]]
  JA_ALNUM_CELLS = [[3, 16..25], [3, 33..58], [3, 65..90]]
  JA_PUNCT_CELLS = [[1, 2..94],
                    [2, 1..14], [2, 26..33], [2, 42..48],
                    [2, 60..74], [2, 82..89], [2, 94],
                    [6, 1..24], [6, 33..56],
                    [7, 1..33], [7, 49..81],
                    [8, 1..32]]
  # CP932 punctuation that the original script left disabled for
  # Shift_JIS: [13, 1..30], [13, 32..54], [13, 63..92], [92, 81..94]
  HIRAGANA_CELLS = [[4, 1..83]]
  KATAKANA_CELLS = [[5, 1..86]]
  KANJI_CELLS    = [[16..46, 1..94], [47, 1..51], [48..83, 1..94], [84, 1..6]]
  # Kanji rows as [kus, last_ten]; each row is printed as one
  # "first-last" range running from ten 1 to last_ten.
  KANJI_ROWS     = [[16..46, 94], [47, 51], [48..83, 94], [84, 6]]
  # Extra rows marked "cp932" in the Shift_JIS list.
  # NOTE(review): these rows are presumably absent from the standard
  # JIS0208.TXT, in which case sjis_kanji raises -- confirm.
  CP932_KANJI_ROWS = [[89..91, 94], [92, 78]]

  # The parsed mapping table, loaded on first use and then shared, so
  # the file is read once instead of once per helper call.
  def jis0208_table()
    @jis0208_table ||= JIS0208.new
  end

  # Escaped byte strings, one per (ku, ten) cell described by +cells+.
  # Array() is used to expand single numbers and ranges alike, because
  # Integer#to_a no longer exists (Object#to_a was removed in Ruby 1.9).
  def jis0208_cells(cells, codeset)
    table = jis0208_table
    escaped = []
    cells.each{|kus, tens|
      Array(kus).each{|ku|
        Array(tens).each{|ten| escaped << table.char(ku, ten, codeset)}
      }
    }
    escaped
  end

  # One "first-last" range string per ku described by +rows+.
  def jis0208_rows(rows, codeset)
    table = jis0208_table
    ranges = []
    rows.each{|kus, last_ten|
      Array(kus).each{|ku|
        ranges << "#{table.char(ku, 1, codeset)}-#{table.char(ku, last_ten, codeset)}"
      }
    }
    ranges
  end

  # euc-jp
  def euc_ja_alnum()
    jis0208_cells(JA_ALNUM_CELLS, "e")
  end
  def euc_ja_blank()
    jis0208_cells(JA_BLANK_CELLS, "e")
  end
  def euc_ja_print()
    euc_ja_graph() + euc_ja_blank()
  end
  def euc_ja_graph()
    euc_ja_alnum() + euc_ja_punct()
  end
  def euc_ja_punct()
    jis0208_cells(JA_PUNCT_CELLS, "e")
  end
  def euc_ja_space()
    jis0208_cells(JA_BLANK_CELLS, "e")
  end
  def euc_hiragana()
    jis0208_cells(HIRAGANA_CELLS, "e")
  end
  def euc_katakana()
    jis0208_cells(KATAKANA_CELLS, "e")
  end
  def euc_kanji()
    jis0208_rows(KANJI_ROWS, "e")
  end

  # sjis (cp932)
  def sjis_ja_alnum()
    jis0208_cells(JA_ALNUM_CELLS, "s")
  end
  def sjis_ja_blank()
    jis0208_cells(JA_BLANK_CELLS, "s")
  end
  def sjis_ja_print()
    sjis_ja_graph() + sjis_ja_blank()
  end
  def sjis_ja_graph()
    sjis_ja_alnum() + sjis_ja_punct()
  end
  def sjis_ja_punct()
    jis0208_cells(JA_PUNCT_CELLS, "s")
  end
  def sjis_ja_space()
    jis0208_cells(JA_BLANK_CELLS, "s")
  end
  def sjis_hiragana()
    jis0208_cells(HIRAGANA_CELLS, "s")
  end
  def sjis_katakana()
    jis0208_cells(KATAKANA_CELLS, "s")
  end
  def sjis_kanji()
    jis0208_rows(KANJI_ROWS + CP932_KANJI_ROWS, "s")
  end

  # utf8 (kanji are listed cell by cell rather than as ranges)
  def utf8_ja_alnum()
    jis0208_cells(JA_ALNUM_CELLS, "u8")
  end
  def utf8_ja_blank()
    jis0208_cells(JA_BLANK_CELLS, "u8")
  end
  def utf8_ja_print()
    utf8_ja_graph() + utf8_ja_blank()
  end
  def utf8_ja_graph()
    utf8_ja_alnum() + utf8_ja_punct()
  end
  def utf8_ja_punct()
    jis0208_cells(JA_PUNCT_CELLS, "u8")
  end
  def utf8_ja_space()
    jis0208_cells(JA_BLANK_CELLS, "u8")
  end
  def utf8_hiragana()
    jis0208_cells(HIRAGANA_CELLS, "u8")
  end
  def utf8_katakana()
    jis0208_cells(KATAKANA_CELLS, "u8")
  end
  def utf8_kanji()
    jis0208_cells(KANJI_CELLS, "u8")
  end

  # Arguments are checked before the table is touched, so the usage
  # text is available even when JIS0208.TXT is missing.
  if ARGV.size == 3
    ku, ten, codeset = ARGV[0].to_i, ARGV[1].to_i, ARGV[2].to_s
    puts jis0208_table.char(ku, ten, codeset)
    exit(0)
  elsif ARGV.size == 2
    codeset, charclass = ARGV[0].to_s, ARGV[1].to_s
  else
    puts "Usage: jis0208.rb (<Ku> <Ten> <Codeset> | <Codeset> <CharClass>)"
    puts "Supported codeset: EUC-JP, Shift_JIS, UTF-8"
    puts "Supported charclass: blank, space, alnum, punct, print, graph, hiragana, katakana, kanji"
    puts "Example 1: jis0208.rb 16 1 utf-8"
    puts "Example 2: jis0208.rb euc-jp punct"
    exit(0)
  end

  # The codeset is validated first, as before, so an unknown codeset is
  # reported even when the charclass is also unknown.
  helper_prefix = case codeset
                  when /^e/i then "euc"  # euc-jp
                  when /^s/i then "sjis" # sjis
                  when /^u/i then "utf8" # utf-8
                  else
                    raise "invalid codeset (#{codeset}) or charclass (#{charclass}).\n"
                  end
  helper_suffix = case charclass
                  when /^space/i then "ja_space"
                  when /^blank/i then "ja_blank"
                  when /^alnum/i then "ja_alnum"
                  when /^punct/i then "ja_punct"
                  when /^print/i then "ja_print"
                  when /^graph/i then "ja_graph"
                  when /^hira/i  then "hiragana"
                  when /^kata/i  then "katakana"
                  when /^kanji/i then "kanji"
                  else
                    raise "invalid charclass (#{charclass}).\n"
                  end
  # Both name parts come from the fixed tables above, never straight
  # from ARGV, so send can only reach the helpers defined in this file.
  puts send("#{helper_prefix}_#{helper_suffix}")

end
|
@@ -0,0 +1,38 @@
|
|
1
|
+
require 'test/unit'
|
2
|
+
require 'jis0208'
|
3
|
+
require 'nkf'
|
4
|
+
require 'iconv'
|
5
|
+
|
6
|
+
class TC_JIS0208 < Test::Unit::TestCase
|
7
|
+
def setup()
|
8
|
+
#
|
9
|
+
end
|
10
|
+
|
11
|
+
=begin obsolete
|
12
|
+
def test_string_to_array()
|
13
|
+
jis0208 = JIS0208.new
|
14
|
+
expected = [0xe1, 0xa1, 0xa8] # "表"(4129) UTF-8 as array
|
15
|
+
# assert_equal(expected, Iconv.iconv("UTF-8", "EUC-JP", "表").to_s)
|
16
|
+
assert_equal(expected, jis0208.string_to_array("\xe1\xa1\xa8"))
|
17
|
+
end
|
18
|
+
def test_array_to_string()
|
19
|
+
jis0208 = JIS0208.new
|
20
|
+
expected = "\xe1\xa1\xa8" # "表"(4129) in UTF-8 string
|
21
|
+
# assert_equal(expected, Iconv.iconv("UTF-8", "EUC-JP", "表").to_s)
|
22
|
+
assert_equal(expected, jis0208.array_to_string([0xe1, 0xa1, 0xa8]))
|
23
|
+
end
|
24
|
+
=end
|
25
|
+
=begin obsolete
|
26
|
+
def test_to_value_array()
|
27
|
+
expected = [0xe1, 0xa1, 0xa8] # "表"(4129) UTF-8 as array
|
28
|
+
assert_equal(expected, "\xe1\xa1\xa8".to_value_array)
|
29
|
+
end
|
30
|
+
def test_to_binary_string()
|
31
|
+
expected = "\xe1\xa1\xa8" # "表"(4129) in UTF-8 string
|
32
|
+
assert_equal(expected, [0xe1, 0xa1, 0xa8].to_binary_string)
|
33
|
+
end
|
34
|
+
def teardown()
|
35
|
+
#
|
36
|
+
end
|
37
|
+
=end
|
38
|
+
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
## DocDiff configuration file
|
2
|
+
## Uncomment and modify the following lines as you like.
|
3
|
+
#
|
4
|
+
# resolution = word
|
5
|
+
# encoding = UTF-8
|
6
|
+
# eol = LF
|
7
|
+
# format = html
|
8
|
+
# digest = off
|
9
|
+
# cache = off # not implemented yet
|
10
|
+
# verbose = no # not implemented yet
|
11
|
+
#
|
12
|
+
## user-defined tags (not well-supported yet)
|
13
|
+
# tag_common_start = '<=>'
|
14
|
+
# tag_common_end = '</=>'
|
15
|
+
# tag_del_start = '<->'
|
16
|
+
# tag_del_end = '</->'
|
17
|
+
# tag_add_start = '<+>'
|
18
|
+
# tag_add_end = '</+>'
|
19
|
+
# tag_change_before_start = '<!->'
|
20
|
+
# tag_change_before_end = '</!->'
|
21
|
+
# tag_change_after_start = '<!+>'
|
22
|
+
# tag_change_after_end = '</!+>'
|