docdiff 0.6.5 → 0.6.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. checksums.yaml +4 -4
  2. data/Gemfile +7 -7
  3. data/Guardfile +4 -4
  4. data/Makefile +1 -1
  5. data/Rakefile +6 -6
  6. data/bin/docdiff +1 -1
  7. data/devutil/Rakefile +12 -5
  8. data/devutil/char_by_charclass.rb +43 -20
  9. data/devutil/charclass_by_char.rb +40 -19
  10. data/devutil/jis0208.rb +263 -231
  11. data/devutil/jis0208_test.rb +196 -0
  12. data/doc/news.md +8 -0
  13. data/docdiff.gemspec +12 -10
  14. data/lib/doc_diff.rb +59 -60
  15. data/lib/docdiff/charstring.rb +225 -241
  16. data/lib/docdiff/cli.rb +285 -250
  17. data/lib/docdiff/diff/contours.rb +1 -1
  18. data/lib/docdiff/diff/editscript.rb +1 -1
  19. data/lib/docdiff/diff/rcsdiff.rb +1 -1
  20. data/lib/docdiff/diff/shortestpath.rb +1 -1
  21. data/lib/docdiff/diff/speculative.rb +1 -1
  22. data/lib/docdiff/diff/subsequence.rb +1 -1
  23. data/lib/docdiff/diff/unidiff.rb +1 -1
  24. data/lib/docdiff/diff.rb +1 -1
  25. data/lib/docdiff/difference.rb +71 -70
  26. data/lib/docdiff/document.rb +129 -109
  27. data/lib/docdiff/encoding/en_ascii.rb +64 -58
  28. data/lib/docdiff/encoding/ja_eucjp.rb +250 -235
  29. data/lib/docdiff/encoding/ja_sjis.rb +240 -226
  30. data/lib/docdiff/encoding/ja_utf8.rb +6952 -6939
  31. data/lib/docdiff/version.rb +1 -1
  32. data/lib/docdiff/view.rb +522 -438
  33. data/lib/docdiff.rb +2 -2
  34. data/test/charstring_test.rb +475 -351
  35. data/test/cli_test.rb +103 -101
  36. data/test/diff_test.rb +15 -16
  37. data/test/difference_test.rb +40 -31
  38. data/test/docdiff_test.rb +162 -136
  39. data/test/document_test.rb +280 -175
  40. data/test/test_helper.rb +2 -1
  41. data/test/view_test.rb +636 -497
  42. metadata +8 -8
  43. data/devutil/testjis0208.rb +0 -38
data/devutil/jis0208.rb CHANGED
@@ -1,343 +1,375 @@
1
1
  #!/usr/bin/ruby
2
+ # frozen_string_literal: true
3
+
2
4
  # Extracts multibyte characters from JIS0208.TXT.
3
5
  # (ftp://ftp.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS/JIS0208.TXT)
4
6
  # 2003-03-03 .. 20xx-xx-xx, Hisashi MORITA. Use freely at your own risk.
5
7
  # Usage: jis0208.rb <Ku> <Ten> <Codeset> #=> \xXX...
6
8
  # Example: jis0208.rb 1 1 utf-8 #=> \xe3\x80\x80
7
9
 
10
+ Encoding.default_external = "UTF-8"
11
+
8
12
  class JIS0208
9
- def utf16_to_utf8(utf16) # Convert UTF-16 to UTF-8N
10
- utf16value = (utf16.unpack("C*")[0] * 256 + utf16.unpack("C*")[1])
11
- if utf16value < 0x7f # 1-byte utf-8
12
- utf8 = utf16value.to_a.pack("C*")
13
- elsif utf16value < 0x800 # 2-byte utf-8
14
- utf8 = [(0xC0 | (utf16value / 64)),
15
- (0x80 | (utf16value % 64))].pack("C*")
16
- else # 3-byte utf-8
17
- utf8 = [(0xE0 | ((utf16value / 64) / 64)),
18
- (0x80 | ((utf16value / 64) % 64)),
19
- (0x80 | (utf16value % 64))].pack("C*")
20
- end
13
+ # Convert UTF-16 to UTF-8
14
+ def utf16_to_utf8(utf16)
15
+ utf16value =
16
+ utf16.unpack("C*")[0] * 256 + utf16.unpack("C*")[1]
17
+ utf8 =
18
+ if utf16value < 0x7f
19
+ # 1-byte UTF-8
20
+ [utf16value].pack("C*")
21
+ elsif utf16value < 0x800
22
+ # 2-byte UTF-8
23
+ [
24
+ (0xC0 | (utf16value / 64)),
25
+ (0x80 | (utf16value % 64)),
26
+ ].pack("C*")
27
+ else
28
+ # 3-byte UTF-8
29
+ [
30
+ (0xE0 | ((utf16value / 64) / 64)),
31
+ (0x80 | ((utf16value / 64) % 64)),
32
+ (0x80 | (utf16value % 64)),
33
+ ].pack("C*")
34
+ end
35
+ utf8
21
36
  end
22
- def initialize()
37
+
38
+ def initialize
23
39
  @lines = File.readlines("JIS0208.TXT")
24
- @lines = @lines.grep(/^[^\#]/) # remove comments
25
- @lines = @lines.collect{|l| l.sub(/\s+\#[^\#]+$/,'')} # remove unicode names
26
- @char_db = @lines.collect {|line|
27
- sjis, jis, utf16 = line.split.collect{|string|
28
- string.sub(/0x/, '').to_a.pack("H*") # "0xXXXX" to 8-bit byte string
29
- }
40
+ @lines = @lines.grep(/^[^\#]/) # remove comments
41
+ @lines = @lines.map { |l| l.sub(/\s+\#[^\#]+$/, "") } # remove unicode names
42
+ @char_db = @lines.collect do |line|
43
+ sjis, jis, utf16 = line.split.map do |string|
44
+ [string.sub("0x", "")].pack("H*") # "0xXXXX" to 8-bit byte string
45
+ end
30
46
  jis_byte_pair = jis.unpack("C*")
31
47
  # jis + 0x8080 => euc
32
- euc = jis_byte_pair.collect {|byte| (byte + 0x80)}.pack("C*")
48
+ euc = jis_byte_pair.map { |byte| (byte + 0x80) }.pack("C*")
33
49
  # jis - 0x2020 => ku, ten
34
- ku, ten = jis_byte_pair.collect {|byte| (byte - 0x20)}
35
- utf8 = utf16_to_utf8(utf16)
36
- {:s=>sjis, :j=>jis, :u16=>utf16, :e=>euc, :u8=>utf8, :ku=>ku, :ten=>ten}
37
- }
50
+ ku, ten = jis_byte_pair.map { |byte| (byte - 0x20) }
51
+ utf8 = utf16_to_utf8(utf16)
52
+ {
53
+ s: sjis,
54
+ j: jis,
55
+ u16: utf16,
56
+ e: euc,
57
+ u8: utf8,
58
+ ku: ku,
59
+ ten: ten,
60
+ }
61
+ end
38
62
  @characters = {}
39
- @char_db.each{|char|
63
+ @char_db.each do |char|
40
64
  if @characters[char[:ku]].nil?
41
65
  @characters[char[:ku]] = {}
42
66
  end
43
67
  if @characters[char[:ku]][char[:ten]].nil?
44
68
  @characters[char[:ku]][char[:ten]] = {
45
- :s=>char[:s],
46
- :j=>char[:j],
47
- :u16=>char[:u16],
48
- :e=>char[:e],
49
- :u8=>char[:u8]
69
+ s: char[:s],
70
+ j: char[:j],
71
+ u16: char[:u16],
72
+ e: char[:e],
73
+ u8: char[:u8],
50
74
  }
51
75
  end
52
- }
76
+ end
53
77
  end
54
78
  attr_reader :char_db
55
79
  attr_reader :characters
56
- def char(ku, ten, codeset)
57
- case
58
- when /^[Ee]/ =~ codeset then codeset = :e
59
- when /^[Ss]/ =~ codeset then codeset = :s
60
- when /^[Jj]/ =~ codeset then codeset = :j
61
- when /^[Uu].*16$/ =~ codeset then codeset = :u16
62
- when /^[Uu].*8$/ =~ codeset then codeset = :u8
63
- else
64
- raise "invalid codeset name (#{codeset})\n"
65
- end
66
- characters[ku][ten][codeset].unpack('C*').collect{|byte|
67
- sprintf("\\x%x",byte)
68
- }.join
69
- end
70
80
 
81
+ def char(ku, ten, codeset_name)
82
+ codeset =
83
+ case codeset_name
84
+ when /^[Ee]/ then :e
85
+ when /^[Ss]/ then :s
86
+ when /^[Jj]/ then :j
87
+ when /^[Uu].*16$/ then :u16
88
+ when /^[Uu].*8$/ then :u8
89
+ else
90
+ raise "invalid codeset name (#{codeset_name})"
91
+ end
71
92
 
72
- end
73
-
74
- if __FILE__ == $0
93
+ characters[ku][ten][codeset].unpack("C*").map do |byte|
94
+ format("\\x%x", byte)
95
+ end.join
96
+ end
75
97
 
76
- # euc-jp
77
- def euc_ja_alnum()
78
- j = JIS0208.new
98
+ def euc_ja_alnum
79
99
  r = []
80
- (3).to_a.each{|ku|(16..25).to_a.each{|ten|r << j.char(ku,ten,"e")}}
81
- (3).to_a.each{|ku|(33..58).to_a.each{|ten|r << j.char(ku,ten,"e")}}
82
- (3).to_a.each{|ku|(65..90).to_a.each{|ten|r << j.char(ku,ten,"e")}}
100
+ [3].each { |ku| (16..25).to_a.each { |ten| r << char(ku, ten, "e") } }
101
+ [3].each { |ku| (33..58).to_a.each { |ten| r << char(ku, ten, "e") } }
102
+ [3].each { |ku| (65..90).to_a.each { |ten| r << char(ku, ten, "e") } }
83
103
  r
84
104
  end
85
- def euc_ja_blank()
86
- j = JIS0208.new
105
+
106
+ def euc_ja_blank
87
107
  r = []
88
- (1).to_a.each{|ku|(1).to_a.each{|ten|r << j.char(ku,ten,"e")}}
108
+ [1].each { |ku| [1].each { |ten| r << char(ku, ten, "e") } }
89
109
  r
90
110
  end
91
- def euc_ja_print()
92
- euc_ja_graph() + euc_ja_blank()
111
+
112
+ def euc_ja_print
113
+ euc_ja_graph + euc_ja_blank
93
114
  end
94
- def euc_ja_graph()
95
- euc_ja_alnum() + euc_ja_punct()
115
+
116
+ def euc_ja_graph
117
+ euc_ja_alnum + euc_ja_punct
96
118
  end
97
- def euc_ja_punct()
98
- j = JIS0208.new
119
+
120
+ def euc_ja_punct
99
121
  r = []
100
- (1).to_a.each{|ku|( 2..94).to_a.each{|ten|r << j.char(ku,ten,"e")}}
101
- (2).to_a.each{|ku|( 1..14).to_a.each{|ten|r << j.char(ku,ten,"e")}}
102
- (2).to_a.each{|ku|(26..33).to_a.each{|ten|r << j.char(ku,ten,"e")}}
103
- (2).to_a.each{|ku|(42..48).to_a.each{|ten|r << j.char(ku,ten,"e")}}
104
- (2).to_a.each{|ku|(60..74).to_a.each{|ten|r << j.char(ku,ten,"e")}}
105
- (2).to_a.each{|ku|(82..89).to_a.each{|ten|r << j.char(ku,ten,"e")}}
106
- (2).to_a.each{|ku|(94 ).to_a.each{|ten|r << j.char(ku,ten,"e")}}
107
- (6).to_a.each{|ku|( 1..24).to_a.each{|ten|r << j.char(ku,ten,"e")}}
108
- (6).to_a.each{|ku|(33..56).to_a.each{|ten|r << j.char(ku,ten,"e")}}
109
- (7).to_a.each{|ku|( 1..33).to_a.each{|ten|r << j.char(ku,ten,"e")}}
110
- (7).to_a.each{|ku|(49..81).to_a.each{|ten|r << j.char(ku,ten,"e")}}
111
- (8).to_a.each{|ku|( 1..32).to_a.each{|ten|r << j.char(ku,ten,"e")}}
122
+ [1].each { |ku| (2..94).to_a.each { |ten| r << char(ku, ten, "e") } }
123
+ [2].each { |ku| (1..14).to_a.each { |ten| r << char(ku, ten, "e") } }
124
+ [2].each { |ku| (26..33).to_a.each { |ten| r << char(ku, ten, "e") } }
125
+ [2].each { |ku| (42..48).to_a.each { |ten| r << char(ku, ten, "e") } }
126
+ [2].each { |ku| (60..74).to_a.each { |ten| r << char(ku, ten, "e") } }
127
+ [2].each { |ku| (82..89).to_a.each { |ten| r << char(ku, ten, "e") } }
128
+ [2].each { |ku| [94].each { |ten| r << char(ku, ten, "e") } }
129
+ [6].each { |ku| (1..24).to_a.each { |ten| r << char(ku, ten, "e") } }
130
+ [6].each { |ku| (33..56).to_a.each { |ten| r << char(ku, ten, "e") } }
131
+ [7].each { |ku| (1..33).to_a.each { |ten| r << char(ku, ten, "e") } }
132
+ [7].each { |ku| (49..81).to_a.each { |ten| r << char(ku, ten, "e") } }
133
+ [8].each { |ku| (1..32).to_a.each { |ten| r << char(ku, ten, "e") } }
112
134
  r
113
135
  end
114
- def euc_ja_space()
115
- j = JIS0208.new
136
+
137
+ def euc_ja_space
116
138
  r = []
117
- (1).to_a.each{|ku|(1).to_a.each{|ten|r << j.char(ku,ten,"e")}}
139
+ [1].each { |ku| [1].each { |ten| r << char(ku, ten, "e") } }
118
140
  r
119
141
  end
120
- def euc_hiragana()
121
- j = JIS0208.new
142
+
143
+ def euc_hiragana
122
144
  r = []
123
- (4).to_a.each{|ku|(1..83).to_a.each{|ten|r << j.char(ku,ten,"e")}}
145
+ [4].each { |ku| (1..83).to_a.each { |ten| r << char(ku, ten, "e") } }
124
146
  r
125
147
  end
126
- def euc_katakana()
127
- j = JIS0208.new
148
+
149
+ def euc_katakana
128
150
  r = []
129
- (5).to_a.each{|ku|(1..86).to_a.each{|ten|r << j.char(ku,ten,"e")}}
151
+ [5].each { |ku| (1..86).to_a.each { |ten| r << char(ku, ten, "e") } }
130
152
  r
131
153
  end
132
- def euc_kanji()
133
- j = JIS0208.new
154
+
155
+ def euc_kanji
134
156
  r = []
135
- (16..46).to_a.each{|ku| r << "#{j.char(ku,1,'e')}-#{j.char(ku,94,'e')}"}
136
- (47).to_a.each{|ku|r << "#{j.char(ku,1,'e')}-#{j.char(ku,51,'e')}"}
137
- (48..83).to_a.each{|ku|r << "#{j.char(ku,1,'e')}-#{j.char(ku,94,'e')}"}
138
- (84).to_a.each{|ku|r << "#{j.char(ku,1,'e')}-#{j.char(ku,6,'e')}"}
157
+ (16..46).to_a.each { |ku| r << "#{char(ku, 1, "e")}-#{char(ku, 94, "e")}" }
158
+ [47].each { |ku| r << "#{char(ku, 1, "e")}-#{char(ku, 51, "e")}" }
159
+ (48..83).to_a.each { |ku| r << "#{char(ku, 1, "e")}-#{char(ku, 94, "e")}" }
160
+ [84].each { |ku| r << "#{char(ku, 1, "e")}-#{char(ku, 6, "e")}" }
139
161
  r
140
162
  end
141
163
 
142
- # sjis (cp932)
143
- def sjis_ja_alnum()
144
- j = JIS0208.new
164
+ def sjis_ja_alnum
145
165
  r = []
146
- (3).to_a.each{|ku|(16..25).to_a.each{|ten|r << j.char(ku,ten,"s")}}
147
- (3).to_a.each{|ku|(33..58).to_a.each{|ten|r << j.char(ku,ten,"s")}}
148
- (3).to_a.each{|ku|(65..90).to_a.each{|ten|r << j.char(ku,ten,"s")}}
166
+ [3].each { |ku| (16..25).to_a.each { |ten| r << char(ku, ten, "s") } }
167
+ [3].each { |ku| (33..58).to_a.each { |ten| r << char(ku, ten, "s") } }
168
+ [3].each { |ku| (65..90).to_a.each { |ten| r << char(ku, ten, "s") } }
149
169
  r
150
170
  end
151
- def sjis_ja_blank()
152
- j = JIS0208.new
171
+
172
+ def sjis_ja_blank
153
173
  r = []
154
- (1).to_a.each{|ku|(1).to_a.each{|ten|r << j.char(ku,ten,"s")}}
174
+ [1].each { |ku| [1].each { |ten| r << char(ku, ten, "s") } }
155
175
  r
156
176
  end
157
- def sjis_ja_print()
158
- sjis_ja_graph() + sjis_ja_blank()
177
+
178
+ def sjis_ja_print
179
+ sjis_ja_graph + sjis_ja_blank
159
180
  end
160
- def sjis_ja_graph()
161
- sjis_ja_alnum() + sjis_ja_punct()
181
+
182
+ def sjis_ja_graph
183
+ sjis_ja_alnum + sjis_ja_punct
162
184
  end
163
- def sjis_ja_punct()
164
- j = JIS0208.new
185
+
186
+ def sjis_ja_punct
165
187
  r = []
166
- (1).to_a.each{|ku|(2..94).to_a.each{|ten|r << j.char(ku,ten,"s")}}
167
- (2).to_a.each{|ku|(1..14).to_a.each{|ten|r << j.char(ku,ten,"s")}}
168
- (2).to_a.each{|ku|(26..33).to_a.each{|ten|r << j.char(ku,ten,"s")}}
169
- (2).to_a.each{|ku|(42..48).to_a.each{|ten|r << j.char(ku,ten,"s")}}
170
- (2).to_a.each{|ku|(60..74).to_a.each{|ten|r << j.char(ku,ten,"s")}}
171
- (2).to_a.each{|ku|(82..89).to_a.each{|ten|r << j.char(ku,ten,"s")}}
172
- (2).to_a.each{|ku|(94).to_a.each{|ten|r << j.char(ku,ten,"s")}}
173
- (6).to_a.each{|ku|(1..24).to_a.each{|ten|r << j.char(ku,ten,"s")}}
174
- (6).to_a.each{|ku|(33..56).to_a.each{|ten|r << j.char(ku,ten,"s")}}
175
- (7).to_a.each{|ku|(1..33).to_a.each{|ten|r << j.char(ku,ten,"s")}}
176
- (7).to_a.each{|ku|(49..81).to_a.each{|ten|r << j.char(ku,ten,"s")}}
177
- (8).to_a.each{|ku|(1..32).to_a.each{|ten|r << j.char(ku,ten,"s")}}
178
- #(13).to_a.each{|ku|(1..30).to_a.each{|ten|r << j.char(ku,ten,"s")}}#cp932
179
- #(13).to_a.each{|ku|(32..54).to_a.each{|ten|r << j.char(ku,ten,"s")}}#cp932
180
- #(13).to_a.each{|ku|(63..92).to_a.each{|ten|r << j.char(ku,ten,"s")}}#cp932
181
- #(92).to_a.each{|ku|(81..94).to_a.each{|ten|r << j.char(ku,ten,"s")}}#cp932
188
+ [1].each { |ku| (2..94).to_a.each { |ten| r << char(ku, ten, "s") } }
189
+ [2].each { |ku| (1..14).to_a.each { |ten| r << char(ku, ten, "s") } }
190
+ [2].each { |ku| (26..33).to_a.each { |ten| r << char(ku, ten, "s") } }
191
+ [2].each { |ku| (42..48).to_a.each { |ten| r << char(ku, ten, "s") } }
192
+ [2].each { |ku| (60..74).to_a.each { |ten| r << char(ku, ten, "s") } }
193
+ [2].each { |ku| (82..89).to_a.each { |ten| r << char(ku, ten, "s") } }
194
+ [2].each { |ku| [94].each { |ten| r << char(ku, ten, "s") } }
195
+ [6].each { |ku| (1..24).to_a.each { |ten| r << char(ku, ten, "s") } }
196
+ [6].each { |ku| (33..56).to_a.each { |ten| r << char(ku, ten, "s") } }
197
+ [7].each { |ku| (1..33).to_a.each { |ten| r << char(ku, ten, "s") } }
198
+ [7].each { |ku| (49..81).to_a.each { |ten| r << char(ku, ten, "s") } }
199
+ [8].each { |ku| (1..32).to_a.each { |ten| r << char(ku, ten, "s") } }
200
+ # [13].each { |ku| (1..30).to_a.each { |ten| r << char(ku, ten, "s") } } # cp932
201
+ # [13].each { |ku| (32..54).to_a.each { |ten| r << char(ku, ten, "s") } } # cp932
202
+ # [13].each { |ku| (63..92).to_a.each { |ten| r << char(ku, ten, "s") } } # cp932
203
+ # [92].each { |ku| (81..94).to_a.each { |ten| r << char(ku, ten, "s") } } # cp932
182
204
  r
183
205
  end
184
- def sjis_ja_space()
185
- j = JIS0208.new
206
+
207
+ def sjis_ja_space
186
208
  r = []
187
- (1).to_a.each{|ku|(1).to_a.each{|ten|r << j.char(ku,ten,"s")}}
209
+ [1].each { |ku| [1].each { |ten| r << char(ku, ten, "s") } }
188
210
  r
189
211
  end
190
- def sjis_hiragana()
191
- j = JIS0208.new
212
+
213
+ def sjis_hiragana
192
214
  r = []
193
- (4).to_a.each{|ku|(1..83).to_a.each{|ten|r << j.char(ku,ten,"s")}}
215
+ [4].each { |ku| (1..83).to_a.each { |ten| r << char(ku, ten, "s") } }
194
216
  r
195
217
  end
196
- def sjis_katakana()
197
- j = JIS0208.new
218
+
219
+ def sjis_katakana
198
220
  r = []
199
- (5).to_a.each{|ku|(1..86).to_a.each{|ten|r << j.char(ku,ten,"s")}}
221
+ [5].each { |ku| (1..86).to_a.each { |ten| r << char(ku, ten, "s") } }
200
222
  r
201
223
  end
202
- def sjis_kanji()
203
- j = JIS0208.new
224
+
225
+ def sjis_kanji
204
226
  r = []
205
- (16..46).to_a.each{|ku|r << "#{j.char(ku,1,'s')}-#{j.char(ku,94,'s')}"}
206
- (47).to_a.each{|ku|r << "#{j.char(ku,1,'s')}-#{j.char(ku,51,'s')}"}
207
- (48..83).to_a.each{|ku|r << "#{j.char(ku,1,'s')}-#{j.char(ku,94,'s')}"}
208
- (84).to_a.each{|ku|r << "#{j.char(ku,1,'s')}-#{j.char(ku,6,'s')}"}
209
- (89..91).to_a.each{|ku|r << "#{j.char(ku,1,'s')}-#{j.char(ku,94,'s')}"}#cp932
210
- (92).to_a.each{|ku|r << "#{j.char(ku,1,'s')}-#{j.char(ku,78,'s')}"}#cp932
227
+ (16..46).to_a.each { |ku| r << "#{char(ku, 1, "s")}-#{char(ku, 94, "s")}" }
228
+ [47].each { |ku| r << "#{char(ku, 1, "s")}-#{char(ku, 51, "s")}" }
229
+ (48..83).to_a.each { |ku| r << "#{char(ku, 1, "s")}-#{char(ku, 94, "s")}" }
230
+ [84].each { |ku| r << "#{char(ku, 1, "s")}-#{char(ku, 6, "s")}" }
231
+ # (89..91).to_a.each { |ku| r << "#{char(ku, 1, "s")}-#{char(ku, 94, "s")}" } # cp932 # FIXME
232
+ # [92].each { |ku| r << "#{char(ku, 1, "s")}-#{char(ku, 78, "s")}" } # cp932 # FIXME
211
233
  r
212
234
  end
213
235
 
214
- # utf8
215
- def utf8_ja_alnum()
216
- j = JIS0208.new
236
+ def utf8_ja_alnum
217
237
  r = []
218
- (3).to_a.each{|ku|(16..25).to_a.each{|ten|r << j.char(ku,ten,"u8")}}
219
- (3).to_a.each{|ku|(33..58).to_a.each{|ten|r << j.char(ku,ten,"u8")}}
220
- (3).to_a.each{|ku|(65..90).to_a.each{|ten|r << j.char(ku,ten,"u8")}}
238
+ [3].each { |ku| (16..25).to_a.each { |ten| r << char(ku, ten, "u8") } }
239
+ [3].each { |ku| (33..58).to_a.each { |ten| r << char(ku, ten, "u8") } }
240
+ [3].each { |ku| (65..90).to_a.each { |ten| r << char(ku, ten, "u8") } }
221
241
  r
222
242
  end
223
- def utf8_ja_blank()
224
- j = JIS0208.new
243
+
244
+ def utf8_ja_blank
225
245
  r = []
226
- (1).to_a.each{|ku|(1).to_a.each{|ten|r << j.char(ku,ten,"u8")}}
246
+ [1].each { |ku| [1].each { |ten| r << char(ku, ten, "u8") } }
227
247
  r
228
248
  end
229
- def utf8_ja_print()
230
- utf8_ja_graph() + utf8_ja_blank()
249
+
250
+ def utf8_ja_print
251
+ utf8_ja_graph + utf8_ja_blank
231
252
  end
232
- def utf8_ja_graph()
233
- utf8_ja_alnum() + utf8_ja_punct()
253
+
254
+ def utf8_ja_graph
255
+ utf8_ja_alnum + utf8_ja_punct
234
256
  end
235
- def utf8_ja_punct()
236
- j = JIS0208.new
257
+
258
+ def utf8_ja_punct
237
259
  r = []
238
- (1).to_a.each{|ku|( 2..94).to_a.each{|ten|r << j.char(ku,ten,"u8")}}
239
- (2).to_a.each{|ku|( 1..14).to_a.each{|ten|r << j.char(ku,ten,"u8")}}
240
- (2).to_a.each{|ku|(26..33).to_a.each{|ten|r << j.char(ku,ten,"u8")}}
241
- (2).to_a.each{|ku|(42..48).to_a.each{|ten|r << j.char(ku,ten,"u8")}}
242
- (2).to_a.each{|ku|(60..74).to_a.each{|ten|r << j.char(ku,ten,"u8")}}
243
- (2).to_a.each{|ku|(82..89).to_a.each{|ten|r << j.char(ku,ten,"u8")}}
244
- (2).to_a.each{|ku|(94 ).to_a.each{|ten|r << j.char(ku,ten,"u8")}}
245
- (6).to_a.each{|ku|( 1..24).to_a.each{|ten|r << j.char(ku,ten,"u8")}}
246
- (6).to_a.each{|ku|(33..56).to_a.each{|ten|r << j.char(ku,ten,"u8")}}
247
- (7).to_a.each{|ku|( 1..33).to_a.each{|ten|r << j.char(ku,ten,"u8")}}
248
- (7).to_a.each{|ku|(49..81).to_a.each{|ten|r << j.char(ku,ten,"u8")}}
249
- (8).to_a.each{|ku|( 1..32).to_a.each{|ten|r << j.char(ku,ten,"u8")}}
260
+ [1].each { |ku| (2..94).to_a.each { |ten| r << char(ku, ten, "u8") } }
261
+ [2].each { |ku| (1..14).to_a.each { |ten| r << char(ku, ten, "u8") } }
262
+ [2].each { |ku| (26..33).to_a.each { |ten| r << char(ku, ten, "u8") } }
263
+ [2].each { |ku| (42..48).to_a.each { |ten| r << char(ku, ten, "u8") } }
264
+ [2].each { |ku| (60..74).to_a.each { |ten| r << char(ku, ten, "u8") } }
265
+ [2].each { |ku| (82..89).to_a.each { |ten| r << char(ku, ten, "u8") } }
266
+ [2].each { |ku| [94].each { |ten| r << char(ku, ten, "u8") } }
267
+ [6].each { |ku| (1..24).to_a.each { |ten| r << char(ku, ten, "u8") } }
268
+ [6].each { |ku| (33..56).to_a.each { |ten| r << char(ku, ten, "u8") } }
269
+ [7].each { |ku| (1..33).to_a.each { |ten| r << char(ku, ten, "u8") } }
270
+ [7].each { |ku| (49..81).to_a.each { |ten| r << char(ku, ten, "u8") } }
271
+ [8].each { |ku| (1..32).to_a.each { |ten| r << char(ku, ten, "u8") } }
250
272
  r
251
273
  end
252
- def utf8_ja_space()
253
- j = JIS0208.new
274
+
275
+ def utf8_ja_space
254
276
  r = []
255
- (1).to_a.each{|ku|(1).to_a.each{|ten|r << j.char(ku,ten,"u8")}}
277
+ [1].each { |ku| [1].each { |ten| r << char(ku, ten, "u8") } }
256
278
  r
257
279
  end
258
- def utf8_hiragana()
259
- j = JIS0208.new
280
+
281
+ def utf8_hiragana
260
282
  r = []
261
- (4).to_a.each{|ku|(1..83).to_a.each{|ten|r << j.char(ku,ten,"utf-8")}}
283
+ [4].each { |ku| (1..83).to_a.each { |ten| r << char(ku, ten, "utf-8") } }
262
284
  r
263
285
  end
264
- def utf8_katakana()
265
- j = JIS0208.new
286
+
287
+ def utf8_katakana
266
288
  r = []
267
- (5).to_a.each{|ku|(1..86).to_a.each{|ten|r << j.char(ku,ten,"utf-8")}}
289
+ [5].each { |ku| (1..86).to_a.each { |ten| r << char(ku, ten, "utf-8") } }
268
290
  r
269
291
  end
270
- def utf8_kanji()
271
- j = JIS0208.new
292
+
293
+ def utf8_kanji
272
294
  r = []
273
- (16..46).to_a.each{|ku|(1..94).to_a.each{|ten|r << j.char(ku,ten,"utf-8")}}
274
- (47).to_a.each{|ku|(1..51).to_a.each{|ten|r << j.char(ku,ten,"utf-8")}}
275
- (48..83).to_a.each{|ku|(1..94).to_a.each{|ten|r << j.char(ku,ten,"utf-8")}}
276
- (84).to_a.each{|ku|(1..6).to_a.each{|ten|r << j.char(ku,ten,"utf-8")}}
295
+ (16..46).to_a.each { |ku| (1..94).to_a.each { |ten| r << char(ku, ten, "utf-8") } }
296
+ [47].each { |ku| (1..51).to_a.each { |ten| r << char(ku, ten, "utf-8") } }
297
+ (48..83).to_a.each { |ku| (1..94).to_a.each { |ten| r << char(ku, ten, "utf-8") } }
298
+ [84].each { |ku| (1..6).to_a.each { |ten| r << char(ku, ten, "utf-8") } }
277
299
  r
278
300
  end
301
+ end
279
302
 
303
+ if __FILE__ == $PROGRAM_NAME
280
304
  jis0208 = JIS0208.new
305
+
281
306
  if ARGV.size == 3
282
- ku, ten, codeset = ARGV[0].to_i, ARGV[1].to_i, ARGV[2].to_s
307
+ ku = ARGV[0].to_i
308
+ ten = ARGV[1].to_i
309
+ codeset = ARGV[2]
283
310
  puts jis0208.char(ku, ten, codeset)
284
311
  exit(0)
285
312
  elsif ARGV.size == 2
286
- codeset, charclass = ARGV[0].to_s, ARGV[1].to_s
313
+ codeset = ARGV[0]
314
+ charclass = ARGV[1]
287
315
  else
288
- puts "Usage: jis0208.rb (<Ku> <Ten> <Codeset> | <Codeset> <CharClass>)"
289
- puts "Supported codeset: EUC-JP, Shift_JIS, UTF-8"
290
- puts "Supported charclass: blank, space, alnum, punct, print, graph, hiragana, katakana, kanji"
291
- puts "Example 1: jis0208.rb 16 1 utf-8"
292
- puts "Example 2: jis0208.rb euc-jp punct"
316
+ puts <<~EOS
317
+ Usage: jis0208.rb (<Ku> <Ten> <Codeset> | <Codeset> <CharClass>)
318
+ Supported codeset: EUC-JP, Shift_JIS, UTF-8
319
+ Supported charclass: blank, space, alnum, punct, print, graph, hiragana, katakana, kanji
320
+ Example 1: jis0208.rb 16 1 utf-8 #=> \xe4\xba\x9c
321
+ Example 2: jis0208.rb euc-jp punct #=> \xa1\xa2 ... \xa8\xc0
322
+ EOS
293
323
  exit(0)
294
324
  end
295
325
 
296
- case
297
- when (/^e/i.match codeset) # euc-jp
298
- case
299
- when (/^space/i.match charclass) then puts euc_ja_space()
300
- when (/^blank/i.match charclass) then puts euc_ja_blank()
301
- when (/^alnum/i.match charclass) then puts euc_ja_alnum()
302
- when (/^punct/i.match charclass) then puts euc_ja_punct()
303
- when (/^print/i.match charclass) then puts euc_ja_print()
304
- when (/^graph/i.match charclass) then puts euc_ja_graph()
305
- when (/^hira/i.match charclass) then puts euc_hiragana()
306
- when (/^kata/i.match charclass) then puts euc_katakana()
307
- when (/^kanji/i.match charclass) then puts euc_kanji()
308
- else
309
- raise "invalid charclass (#{charclass}).\n"
310
- end
311
- when (/^s/i.match codeset) # sjis
312
- case
313
- when (/^space/i.match charclass) then puts sjis_ja_space()
314
- when (/^blank/i.match charclass) then puts sjis_ja_blank()
315
- when (/^alnum/i.match charclass) then puts sjis_ja_alnum()
316
- when (/^punct/i.match charclass) then puts sjis_ja_punct()
317
- when (/^print/i.match charclass) then puts sjis_ja_print()
318
- when (/^graph/i.match charclass) then puts sjis_ja_graph()
319
- when (/^hira/i.match charclass) then puts sjis_hiragana()
320
- when (/^kata/i.match charclass) then puts sjis_katakana()
321
- when (/^kanji/i.match charclass) then puts sjis_kanji()
322
- else
323
- raise "invalid charclass (#{charclass}).\n"
324
- end
325
- when (/^u/i.match codeset) # utf-8
326
- case
327
- when (/^space/i.match charclass) then puts utf8_ja_space()
328
- when (/^blank/i.match charclass) then puts utf8_ja_blank()
329
- when (/^alnum/i.match charclass) then puts utf8_ja_alnum()
330
- when (/^punct/i.match charclass) then puts utf8_ja_punct()
331
- when (/^print/i.match charclass) then puts utf8_ja_print()
332
- when (/^graph/i.match charclass) then puts utf8_ja_graph()
333
- when (/^hira/i.match charclass) then puts utf8_hiragana()
334
- when (/^kata/i.match charclass) then puts utf8_katakana()
335
- when (/^kanji/i.match charclass) then puts utf8_kanji()
326
+ output =
327
+ case codeset
328
+ when /^e/i # euc-jp
329
+ case charclass
330
+ when /^space/i then jis0208.euc_ja_space
331
+ when /^blank/i then jis0208.euc_ja_blank
332
+ when /^alnum/i then jis0208.euc_ja_alnum
333
+ when /^punct/i then jis0208.euc_ja_punct
334
+ when /^print/i then jis0208.euc_ja_print
335
+ when /^graph/i then jis0208.euc_ja_graph
336
+ when /^hira/i then jis0208.euc_hiragana
337
+ when /^kata/i then jis0208.euc_katakana
338
+ when /^kanji/i then jis0208.euc_kanji
339
+ else
340
+ raise "invalid charclass (#{charclass})"
341
+ end
342
+ when /^s/i # sjis
343
+ case charclass
344
+ when /^space/i then jis0208.sjis_ja_space
345
+ when /^blank/i then jis0208.sjis_ja_blank
346
+ when /^alnum/i then jis0208.sjis_ja_alnum
347
+ when /^punct/i then jis0208.sjis_ja_punct
348
+ when /^print/i then jis0208.sjis_ja_print
349
+ when /^graph/i then jis0208.sjis_ja_graph
350
+ when /^hira/i then jis0208.sjis_hiragana
351
+ when /^kata/i then jis0208.sjis_katakana
352
+ when /^kanji/i then jis0208.sjis_kanji
353
+ else
354
+ raise "invalid charclass (#{charclass})"
355
+ end
356
+ when /^u/i # utf-8
357
+ case charclass
358
+ when /^space/i then jis0208.utf8_ja_space
359
+ when /^blank/i then jis0208.utf8_ja_blank
360
+ when /^alnum/i then jis0208.utf8_ja_alnum
361
+ when /^punct/i then jis0208.utf8_ja_punct
362
+ when /^print/i then jis0208.utf8_ja_print
363
+ when /^graph/i then jis0208.utf8_ja_graph
364
+ when /^hira/i then jis0208.utf8_hiragana
365
+ when /^kata/i then jis0208.utf8_katakana
366
+ when /^kanji/i then jis0208.utf8_kanji
367
+ else
368
+ raise "invalid charclass (#{charclass})"
369
+ end
336
370
  else
337
- raise "invalid charclass (#{charclass}).\n"
371
+ raise "invalid codeset (#{codeset}) or charclass (#{charclass})"
338
372
  end
339
- else
340
- raise "invalid codeset (#{codeset}) or charclass (#{charclass}).\n"
341
- end
342
373
 
374
+ puts output
343
375
  end