docdiff 0.6.5 → 0.6.6

This diff shows the changes between two publicly available versions of the package, each of which has been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
Files changed (43)
  1. checksums.yaml +4 -4
  2. data/Gemfile +7 -7
  3. data/Guardfile +4 -4
  4. data/Makefile +1 -1
  5. data/Rakefile +6 -6
  6. data/bin/docdiff +1 -1
  7. data/devutil/Rakefile +12 -5
  8. data/devutil/char_by_charclass.rb +43 -20
  9. data/devutil/charclass_by_char.rb +40 -19
  10. data/devutil/jis0208.rb +263 -231
  11. data/devutil/jis0208_test.rb +196 -0
  12. data/doc/news.md +8 -0
  13. data/docdiff.gemspec +12 -10
  14. data/lib/doc_diff.rb +59 -60
  15. data/lib/docdiff/charstring.rb +225 -241
  16. data/lib/docdiff/cli.rb +285 -250
  17. data/lib/docdiff/diff/contours.rb +1 -1
  18. data/lib/docdiff/diff/editscript.rb +1 -1
  19. data/lib/docdiff/diff/rcsdiff.rb +1 -1
  20. data/lib/docdiff/diff/shortestpath.rb +1 -1
  21. data/lib/docdiff/diff/speculative.rb +1 -1
  22. data/lib/docdiff/diff/subsequence.rb +1 -1
  23. data/lib/docdiff/diff/unidiff.rb +1 -1
  24. data/lib/docdiff/diff.rb +1 -1
  25. data/lib/docdiff/difference.rb +71 -70
  26. data/lib/docdiff/document.rb +129 -109
  27. data/lib/docdiff/encoding/en_ascii.rb +64 -58
  28. data/lib/docdiff/encoding/ja_eucjp.rb +250 -235
  29. data/lib/docdiff/encoding/ja_sjis.rb +240 -226
  30. data/lib/docdiff/encoding/ja_utf8.rb +6952 -6939
  31. data/lib/docdiff/version.rb +1 -1
  32. data/lib/docdiff/view.rb +522 -438
  33. data/lib/docdiff.rb +2 -2
  34. data/test/charstring_test.rb +475 -351
  35. data/test/cli_test.rb +103 -101
  36. data/test/diff_test.rb +15 -16
  37. data/test/difference_test.rb +40 -31
  38. data/test/docdiff_test.rb +162 -136
  39. data/test/document_test.rb +280 -175
  40. data/test/test_helper.rb +2 -1
  41. data/test/view_test.rb +636 -497
  42. metadata +8 -8
  43. data/devutil/testjis0208.rb +0 -38
@@ -1,244 +1,259 @@
1
1
  # Japanese EUC-JP encoding module for CharString
2
2
  # 2003- Hisashi MORITA
3
3
 
4
- # frozen_string_literal: false
4
+ # frozen_string_literal: true
5
5
 
6
6
  class DocDiff
7
- module CharString
8
- module EUC_JP
7
+ module CharString
8
+ module EUCJP
9
+ ENCODING = "EUC-JP"
9
10
 
10
- Encoding = "EUC-JP"
11
+ # character table based on:
12
+ # ftp://ftp.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS/JIS0208.TXT
11
13
 
12
- # character table based on:
13
- # ftp://ftp.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS/JIS0208.TXT
14
+ CNTRL =
15
+ "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09" \
16
+ "\x0a\x0b\x0c\x0d\x0e\x0f\x10\x11\x12\x13" \
17
+ "\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d" \
18
+ "\x1e\x1f\x7f"
19
+ SPACE =
20
+ "\x09\x0a\x0b\x0c\x0d\x20"
21
+ BLANK =
22
+ "\x09\x20"
23
+ DIGIT =
24
+ "\x30\x31\x32\x33\x34\x35\x36\x37\x38\x39"
25
+ UPPER =
26
+ "\x41\x42\x43\x44\x45\x46\x47\x48\x49\x4a" \
27
+ "\x4b\x4c\x4d\x4e\x4f\x50\x51\x52\x53\x54" \
28
+ "\x55\x56\x57\x58\x59\x5a"
29
+ LOWER =
30
+ "\x61\x62\x63\x64\x65\x66\x67\x68\x69\x6a" \
31
+ "\x6b\x6c\x6d\x6e\x6f\x70\x71\x72\x73\x74" \
32
+ "\x75\x76\x77\x78\x79\x7a"
33
+ ALPHA = UPPER + LOWER
34
+ ALNUM = DIGIT + ALPHA
35
+ PUNCT =
36
+ Regexp.quote(
37
+ "\x21\x22\x23\x24\x25\x26\x27\x28\x29\x2a" \
38
+ "\x2b\x2c\x2d\x2e\x2f\x3a\x3b\x3c\x3d\x3e" \
39
+ "\x3f\x40\x5b\x5c\x5d\x5e\x5f\x60\x7b\x7c" \
40
+ "\x7d\x7e",
41
+ )
42
+ GRAPH = DIGIT + UPPER + LOWER + PUNCT
43
+ PRINT = "\x20" + GRAPH
44
+ XDIGIT =
45
+ DIGIT +
46
+ "\x41\x42\x43\x44\x45\x46\x61\x62\x63\x64" \
47
+ "\x65\x66"
48
+ JA_SPACE =
49
+ "\xa1\xa1"
50
+ JA_BLANK =
51
+ "\xa1\xa1"
52
+ JA_ALNUM =
53
+ "\xa3\xb0\xa3\xb1\xa3\xb2\xa3\xb3\xa3\xb4" \
54
+ "\xa3\xb5\xa3\xb6\xa3\xb7\xa3\xb8\xa3\xb9" \
55
+ "\xa3\xc1\xa3\xc2\xa3\xc3\xa3\xc4\xa3\xc5" \
56
+ "\xa3\xc6\xa3\xc7\xa3\xc8\xa3\xc9\xa3\xca" \
57
+ "\xa3\xcb\xa3\xcc\xa3\xcd\xa3\xce\xa3\xcf" \
58
+ "\xa3\xd0\xa3\xd1\xa3\xd2\xa3\xd3\xa3\xd4" \
59
+ "\xa3\xd5\xa3\xd6\xa3\xd7\xa3\xd8\xa3\xd9" \
60
+ "\xa3\xda\xa3\xe1\xa3\xe2\xa3\xe3\xa3\xe4" \
61
+ "\xa3\xe5\xa3\xe6\xa3\xe7\xa3\xe8\xa3\xe9" \
62
+ "\xa3\xea\xa3\xeb\xa3\xec\xa3\xed\xa3\xee" \
63
+ "\xa3\xef\xa3\xf0\xa3\xf1\xa3\xf2\xa3\xf3" \
64
+ "\xa3\xf4\xa3\xf5\xa3\xf6\xa3\xf7\xa3\xf8" \
65
+ "\xa3\xf9\xa3\xfa"
66
+ JA_PUNCT =
67
+ "\xa1\xa2\xa1\xa3\xa1\xa4\xa1\xa5\xa1\xa6" \
68
+ "\xa1\xa7\xa1\xa8\xa1\xa9\xa1\xaa\xa1\xab" \
69
+ "\xa1\xac\xa1\xad\xa1\xae\xa1\xaf\xa1\xb0" \
70
+ "\xa1\xb1\xa1\xb2\xa1\xb3\xa1\xb4\xa1\xb5" \
71
+ "\xa1\xb6\xa1\xb7\xa1\xb8\xa1\xb9\xa1\xba" \
72
+ "\xa1\xbb\xa1\xbc\xa1\xbd\xa1\xbe\xa1\xbf" \
73
+ "\xa1\xc0\xa1\xc1\xa1\xc2\xa1\xc3\xa1\xc4" \
74
+ "\xa1\xc5\xa1\xc6\xa1\xc7\xa1\xc8\xa1\xc9" \
75
+ "\xa1\xca\xa1\xcb\xa1\xcc\xa1\xcd\xa1\xce" \
76
+ "\xa1\xcf\xa1\xd0\xa1\xd1\xa1\xd2\xa1\xd3" \
77
+ "\xa1\xd4\xa1\xd5\xa1\xd6\xa1\xd7\xa1\xd8" \
78
+ "\xa1\xd9\xa1\xda\xa1\xdb\xa1\xdc\xa1\xdd" \
79
+ "\xa1\xde\xa1\xdf\xa1\xe0\xa1\xe1\xa1\xe2" \
80
+ "\xa1\xe3\xa1\xe4\xa1\xe5\xa1\xe6\xa1\xe7" \
81
+ "\xa1\xe8\xa1\xe9\xa1\xea\xa1\xeb\xa1\xec" \
82
+ "\xa1\xed\xa1\xee\xa1\xef\xa1\xf0\xa1\xf1" \
83
+ "\xa1\xf2\xa1\xf3\xa1\xf4\xa1\xf5\xa1\xf6" \
84
+ "\xa1\xf7\xa1\xf8\xa1\xf9\xa1\xfa\xa1\xfb" \
85
+ "\xa1\xfc\xa1\xfd\xa1\xfe\xa2\xa1\xa2\xa2" \
86
+ "\xa2\xa3\xa2\xa4\xa2\xa5\xa2\xa6\xa2\xa7" \
87
+ "\xa2\xa8\xa2\xa9\xa2\xaa\xa2\xab\xa2\xac" \
88
+ "\xa2\xad\xa2\xae\xa2\xba\xa2\xbb\xa2\xbc" \
89
+ "\xa2\xbd\xa2\xbe\xa2\xbf\xa2\xc0\xa2\xc1" \
90
+ "\xa2\xca\xa2\xcb\xa2\xcc\xa2\xcd\xa2\xce" \
91
+ "\xa2\xcf\xa2\xd0\xa2\xdc\xa2\xdd\xa2\xde" \
92
+ "\xa2\xdf\xa2\xe0\xa2\xe1\xa2\xe2\xa2\xe3" \
93
+ "\xa2\xe4\xa2\xe5\xa2\xe6\xa2\xe7\xa2\xe8" \
94
+ "\xa2\xe9\xa2\xea\xa2\xf2\xa2\xf3\xa2\xf4" \
95
+ "\xa2\xf5\xa2\xf6\xa2\xf7\xa2\xf8\xa2\xf9" \
96
+ "\xa2\xfe\xa6\xa1\xa6\xa2\xa6\xa3\xa6\xa4" \
97
+ "\xa6\xa5\xa6\xa6\xa6\xa7\xa6\xa8\xa6\xa9" \
98
+ "\xa6\xaa\xa6\xab\xa6\xac\xa6\xad\xa6\xae" \
99
+ "\xa6\xaf\xa6\xb0\xa6\xb1\xa6\xb2\xa6\xb3" \
100
+ "\xa6\xb4\xa6\xb5\xa6\xb6\xa6\xb7\xa6\xb8" \
101
+ "\xa6\xc1\xa6\xc2\xa6\xc3\xa6\xc4\xa6\xc5" \
102
+ "\xa6\xc6\xa6\xc7\xa6\xc8\xa6\xc9\xa6\xca" \
103
+ "\xa6\xcb\xa6\xcc\xa6\xcd\xa6\xce\xa6\xcf" \
104
+ "\xa6\xd0\xa6\xd1\xa6\xd2\xa6\xd3\xa6\xd4" \
105
+ "\xa6\xd5\xa6\xd6\xa6\xd7\xa6\xd8\xa7\xa1" \
106
+ "\xa7\xa2\xa7\xa3\xa7\xa4\xa7\xa5\xa7\xa6" \
107
+ "\xa7\xa7\xa7\xa8\xa7\xa9\xa7\xaa\xa7\xab" \
108
+ "\xa7\xac\xa7\xad\xa7\xae\xa7\xaf\xa7\xb0" \
109
+ "\xa7\xb1\xa7\xb2\xa7\xb3\xa7\xb4\xa7\xb5" \
110
+ "\xa7\xb6\xa7\xb7\xa7\xb8\xa7\xb9\xa7\xba" \
111
+ "\xa7\xbb\xa7\xbc\xa7\xbd\xa7\xbe\xa7\xbf" \
112
+ "\xa7\xc0\xa7\xc1\xa7\xd1\xa7\xd2\xa7\xd3" \
113
+ "\xa7\xd4\xa7\xd5\xa7\xd6\xa7\xd7\xa7\xd8" \
114
+ "\xa7\xd9\xa7\xda\xa7\xdb\xa7\xdc\xa7\xdd" \
115
+ "\xa7\xde\xa7\xdf\xa7\xe0\xa7\xe1\xa7\xe2" \
116
+ "\xa7\xe3\xa7\xe4\xa7\xe5\xa7\xe6\xa7\xe7" \
117
+ "\xa7\xe8\xa7\xe9\xa7\xea\xa7\xeb\xa7\xec" \
118
+ "\xa7\xed\xa7\xee\xa7\xef\xa7\xf0\xa7\xf1" \
119
+ "\xa8\xa1\xa8\xa2\xa8\xa3\xa8\xa4\xa8\xa5" \
120
+ "\xa8\xa6\xa8\xa7\xa8\xa8\xa8\xa9\xa8\xaa" \
121
+ "\xa8\xab\xa8\xac\xa8\xad\xa8\xae\xa8\xaf" \
122
+ "\xa8\xb0\xa8\xb1\xa8\xb2\xa8\xb3\xa8\xb4" \
123
+ "\xa8\xb5\xa8\xb6\xa8\xb7\xa8\xb8\xa8\xb9" \
124
+ "\xa8\xba\xa8\xbb\xa8\xbc\xa8\xbd\xa8\xbe" \
125
+ "\xa8\xbf\xa8\xc0"
126
+ HIRA =
127
+ "\xa4\xa1\xa4\xa2\xa4\xa3\xa4\xa4\xa4\xa5" \
128
+ "\xa4\xa6\xa4\xa7\xa4\xa8\xa4\xa9\xa4\xaa" \
129
+ "\xa4\xab\xa4\xac\xa4\xad\xa4\xae\xa4\xaf" \
130
+ "\xa4\xb0\xa4\xb1\xa4\xb2\xa4\xb3\xa4\xb4" \
131
+ "\xa4\xb5\xa4\xb6\xa4\xb7\xa4\xb8\xa4\xb9" \
132
+ "\xa4\xba\xa4\xbb\xa4\xbc\xa4\xbd\xa4\xbe" \
133
+ "\xa4\xbf\xa4\xc0\xa4\xc1\xa4\xc2\xa4\xc3" \
134
+ "\xa4\xc4\xa4\xc5\xa4\xc6\xa4\xc7\xa4\xc8" \
135
+ "\xa4\xc9\xa4\xca\xa4\xcb\xa4\xcc\xa4\xcd" \
136
+ "\xa4\xce\xa4\xcf\xa4\xd0\xa4\xd1\xa4\xd2" \
137
+ "\xa4\xd3\xa4\xd4\xa4\xd5\xa4\xd6\xa4\xd7" \
138
+ "\xa4\xd8\xa4\xd9\xa4\xda\xa4\xdb\xa4\xdc" \
139
+ "\xa4\xdd\xa4\xde\xa4\xdf\xa4\xe0\xa4\xe1" \
140
+ "\xa4\xe2\xa4\xe3\xa4\xe4\xa4\xe5\xa4\xe6" \
141
+ "\xa4\xe7\xa4\xe8\xa4\xe9\xa4\xea\xa4\xeb" \
142
+ "\xa4\xec\xa4\xed\xa4\xee\xa4\xef\xa4\xf0" \
143
+ "\xa4\xf1\xa4\xf2\xa4\xf3" # [\xa4\xa1-\xa4\xf3]
144
+ HIRA_EX =
145
+ HIRA +
146
+ "\xa1\xbc" \
147
+ "\xa1\xb3\xa1\xb4" # add onbiki and kanagaeshi(hira)
148
+ KATA =
149
+ "\xa5\xa1\xa5\xa2\xa5\xa3\xa5\xa4\xa5\xa5" \
150
+ "\xa5\xa6\xa5\xa7\xa5\xa8\xa5\xa9\xa5\xaa" \
151
+ "\xa5\xab\xa5\xac\xa5\xad\xa5\xae\xa5\xaf" \
152
+ "\xa5\xb0\xa5\xb1\xa5\xb2\xa5\xb3\xa5\xb4" \
153
+ "\xa5\xb5\xa5\xb6\xa5\xb7\xa5\xb8\xa5\xb9" \
154
+ "\xa5\xba\xa5\xbb\xa5\xbc\xa5\xbd\xa5\xbe" \
155
+ "\xa5\xbf\xa5\xc0\xa5\xc1\xa5\xc2\xa5\xc3" \
156
+ "\xa5\xc4\xa5\xc5\xa5\xc6\xa5\xc7\xa5\xc8" \
157
+ "\xa5\xc9\xa5\xca\xa5\xcb\xa5\xcc\xa5\xcd" \
158
+ "\xa5\xce\xa5\xcf\xa5\xd0\xa5\xd1\xa5\xd2" \
159
+ "\xa5\xd3\xa5\xd4\xa5\xd5\xa5\xd6\xa5\xd7" \
160
+ "\xa5\xd8\xa5\xd9\xa5\xda\xa5\xdb\xa5\xdc" \
161
+ "\xa5\xdd\xa5\xde\xa5\xdf\xa5\xe0\xa5\xe1" \
162
+ "\xa5\xe2\xa5\xe3\xa5\xe4\xa5\xe5\xa5\xe6" \
163
+ "\xa5\xe7\xa5\xe8\xa5\xe9\xa5\xea\xa5\xeb" \
164
+ "\xa5\xec\xa5\xed\xa5\xee\xa5\xef\xa5\xf0" \
165
+ "\xa5\xf1\xa5\xf2\xa5\xf3\xa5\xf4\xa5\xf5" \
166
+ "\xa5\xf6" # [\xa5\xa1-\xa5\xf6]
167
+ KATA_EX =
168
+ KATA +
169
+ "\xa1\xbc" \
170
+ "\xa1\xb5\xa1\xb6" # add onbiki and kanagaeshi(kata)
171
+ KANJI =
172
+ "\xb0\xa1-\xb0\xfe" \
173
+ "\xb1\xa1-\xb1\xfe" \
174
+ "\xb2\xa1-\xb2\xfe" \
175
+ "\xb3\xa1-\xb3\xfe" \
176
+ "\xb4\xa1-\xb4\xfe" \
177
+ "\xb5\xa1-\xb5\xfe" \
178
+ "\xb6\xa1-\xb6\xfe" \
179
+ "\xb7\xa1-\xb7\xfe" \
180
+ "\xb8\xa1-\xb8\xfe" \
181
+ "\xb9\xa1-\xb9\xfe" \
182
+ "\xba\xa1-\xba\xfe" \
183
+ "\xbb\xa1-\xbb\xfe" \
184
+ "\xbc\xa1-\xbc\xfe" \
185
+ "\xbd\xa1-\xbd\xfe" \
186
+ "\xbe\xa1-\xbe\xfe" \
187
+ "\xbf\xa1-\xbf\xfe" \
188
+ "\xc0\xa1-\xc0\xfe" \
189
+ "\xc1\xa1-\xc1\xfe" \
190
+ "\xc2\xa1-\xc2\xfe" \
191
+ "\xc3\xa1-\xc3\xfe" \
192
+ "\xc4\xa1-\xc4\xfe" \
193
+ "\xc5\xa1-\xc5\xfe" \
194
+ "\xc6\xa1-\xc6\xfe" \
195
+ "\xc7\xa1-\xc7\xfe" \
196
+ "\xc8\xa1-\xc8\xfe" \
197
+ "\xc9\xa1-\xc9\xfe" \
198
+ "\xca\xa1-\xca\xfe" \
199
+ "\xcb\xa1-\xcb\xfe" \
200
+ "\xcc\xa1-\xcc\xfe" \
201
+ "\xcd\xa1-\xcd\xfe" \
202
+ "\xce\xa1-\xce\xfe" \
203
+ "\xcf\xa1-\xcf\xd3" \
204
+ "\xd0\xa1-\xd0\xfe" \
205
+ "\xd1\xa1-\xd1\xfe" \
206
+ "\xd2\xa1-\xd2\xfe" \
207
+ "\xd3\xa1-\xd3\xfe" \
208
+ "\xd4\xa1-\xd4\xfe" \
209
+ "\xd5\xa1-\xd5\xfe" \
210
+ "\xd6\xa1-\xd6\xfe" \
211
+ "\xd7\xa1-\xd7\xfe" \
212
+ "\xd8\xa1-\xd8\xfe" \
213
+ "\xd9\xa1-\xd9\xfe" \
214
+ "\xda\xa1-\xda\xfe" \
215
+ "\xdb\xa1-\xdb\xfe" \
216
+ "\xdc\xa1-\xdc\xfe" \
217
+ "\xdd\xa1-\xdd\xfe" \
218
+ "\xde\xa1-\xde\xfe" \
219
+ "\xdf\xa1-\xdf\xfe" \
220
+ "\xe0\xa1-\xe0\xfe" \
221
+ "\xe1\xa1-\xe1\xfe" \
222
+ "\xe2\xa1-\xe2\xfe" \
223
+ "\xe3\xa1-\xe3\xfe" \
224
+ "\xe4\xa1-\xe4\xfe" \
225
+ "\xe5\xa1-\xe5\xfe" \
226
+ "\xe6\xa1-\xe6\xfe" \
227
+ "\xe7\xa1-\xe7\xfe" \
228
+ "\xe8\xa1-\xe8\xfe" \
229
+ "\xe9\xa1-\xe9\xfe" \
230
+ "\xea\xa1-\xea\xfe" \
231
+ "\xeb\xa1-\xeb\xfe" \
232
+ "\xec\xa1-\xec\xfe" \
233
+ "\xed\xa1-\xed\xfe" \
234
+ "\xee\xa1-\xee\xfe" \
235
+ "\xef\xa1-\xef\xfe" \
236
+ "\xf0\xa1-\xf0\xfe" \
237
+ "\xf1\xa1-\xf1\xfe" \
238
+ "\xf2\xa1-\xf2\xfe" \
239
+ "\xf3\xa1-\xf3\xfe" \
240
+ "\xf4\xa1-\xf4\xa6"
241
+ KANJI_EX = KANJI + "\xa1\xb9" # + noma
242
+ JA_GRAPH = JA_ALNUM + JA_PUNCT + HIRA + KATA + KANJI
243
+ JA_PRINT = JA_GRAPH + JA_BLANK
14
244
 
15
- CNTRL = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09" \
16
- "\x0a\x0b\x0c\x0d\x0e\x0f\x10\x11\x12\x13" \
17
- "\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d" \
18
- "\x1e\x1f\x7f"
19
- SPACE = "\x09\x0a\x0b\x0c\x0d\x20"
20
- BLANK = "\x09\x20"
21
- DIGIT = "\x30\x31\x32\x33\x34\x35\x36\x37\x38\x39"
22
- UPPER = "\x41\x42\x43\x44\x45\x46\x47\x48\x49\x4a" \
23
- "\x4b\x4c\x4d\x4e\x4f\x50\x51\x52\x53\x54" \
24
- "\x55\x56\x57\x58\x59\x5a"
25
- LOWER = "\x61\x62\x63\x64\x65\x66\x67\x68\x69\x6a" \
26
- "\x6b\x6c\x6d\x6e\x6f\x70\x71\x72\x73\x74" \
27
- "\x75\x76\x77\x78\x79\x7a"
28
- ALPHA = UPPER + LOWER
29
- ALNUM = DIGIT + ALPHA
30
- PUNCT = "\x21\x22\x23\x24\x25\x26\x27\x28\x29\x2a" \
31
- "\x2b\x2c\x2d\x2e\x2f\x3a\x3b\x3c\x3d\x3e" \
32
- "\x3f\x40\x5b\x5c\x5d\x5e\x5f\x60\x7b\x7c" \
33
- "\x7d\x7e"
34
- GRAPH = DIGIT + UPPER + LOWER + PUNCT
35
- PRINT = "\x20" + GRAPH
36
- XDIGIT = DIGIT +
37
- "\x41\x42\x43\x44\x45\x46\x61\x62\x63\x64" \
38
- "\x65\x66"
39
- JA_SPACE = "\xa1\xa1"
40
- JA_BLANK = "\xa1\xa1"
41
- JA_ALNUM = "\xa3\xb0\xa3\xb1\xa3\xb2\xa3\xb3\xa3\xb4" \
42
- "\xa3\xb5\xa3\xb6\xa3\xb7\xa3\xb8\xa3\xb9" \
43
- "\xa3\xc1\xa3\xc2\xa3\xc3\xa3\xc4\xa3\xc5" \
44
- "\xa3\xc6\xa3\xc7\xa3\xc8\xa3\xc9\xa3\xca" \
45
- "\xa3\xcb\xa3\xcc\xa3\xcd\xa3\xce\xa3\xcf" \
46
- "\xa3\xd0\xa3\xd1\xa3\xd2\xa3\xd3\xa3\xd4" \
47
- "\xa3\xd5\xa3\xd6\xa3\xd7\xa3\xd8\xa3\xd9" \
48
- "\xa3\xda\xa3\xe1\xa3\xe2\xa3\xe3\xa3\xe4" \
49
- "\xa3\xe5\xa3\xe6\xa3\xe7\xa3\xe8\xa3\xe9" \
50
- "\xa3\xea\xa3\xeb\xa3\xec\xa3\xed\xa3\xee" \
51
- "\xa3\xef\xa3\xf0\xa3\xf1\xa3\xf2\xa3\xf3" \
52
- "\xa3\xf4\xa3\xf5\xa3\xf6\xa3\xf7\xa3\xf8" \
53
- "\xa3\xf9\xa3\xfa"
54
- JA_PUNCT = "\xa1\xa2\xa1\xa3\xa1\xa4\xa1\xa5\xa1\xa6" \
55
- "\xa1\xa7\xa1\xa8\xa1\xa9\xa1\xaa\xa1\xab" \
56
- "\xa1\xac\xa1\xad\xa1\xae\xa1\xaf\xa1\xb0" \
57
- "\xa1\xb1\xa1\xb2\xa1\xb3\xa1\xb4\xa1\xb5" \
58
- "\xa1\xb6\xa1\xb7\xa1\xb8\xa1\xb9\xa1\xba" \
59
- "\xa1\xbb\xa1\xbc\xa1\xbd\xa1\xbe\xa1\xbf" \
60
- "\xa1\xc0\xa1\xc1\xa1\xc2\xa1\xc3\xa1\xc4" \
61
- "\xa1\xc5\xa1\xc6\xa1\xc7\xa1\xc8\xa1\xc9" \
62
- "\xa1\xca\xa1\xcb\xa1\xcc\xa1\xcd\xa1\xce" \
63
- "\xa1\xcf\xa1\xd0\xa1\xd1\xa1\xd2\xa1\xd3" \
64
- "\xa1\xd4\xa1\xd5\xa1\xd6\xa1\xd7\xa1\xd8" \
65
- "\xa1\xd9\xa1\xda\xa1\xdb\xa1\xdc\xa1\xdd" \
66
- "\xa1\xde\xa1\xdf\xa1\xe0\xa1\xe1\xa1\xe2" \
67
- "\xa1\xe3\xa1\xe4\xa1\xe5\xa1\xe6\xa1\xe7" \
68
- "\xa1\xe8\xa1\xe9\xa1\xea\xa1\xeb\xa1\xec" \
69
- "\xa1\xed\xa1\xee\xa1\xef\xa1\xf0\xa1\xf1" \
70
- "\xa1\xf2\xa1\xf3\xa1\xf4\xa1\xf5\xa1\xf6" \
71
- "\xa1\xf7\xa1\xf8\xa1\xf9\xa1\xfa\xa1\xfb" \
72
- "\xa1\xfc\xa1\xfd\xa1\xfe\xa2\xa1\xa2\xa2" \
73
- "\xa2\xa3\xa2\xa4\xa2\xa5\xa2\xa6\xa2\xa7" \
74
- "\xa2\xa8\xa2\xa9\xa2\xaa\xa2\xab\xa2\xac" \
75
- "\xa2\xad\xa2\xae\xa2\xba\xa2\xbb\xa2\xbc" \
76
- "\xa2\xbd\xa2\xbe\xa2\xbf\xa2\xc0\xa2\xc1" \
77
- "\xa2\xca\xa2\xcb\xa2\xcc\xa2\xcd\xa2\xce" \
78
- "\xa2\xcf\xa2\xd0\xa2\xdc\xa2\xdd\xa2\xde" \
79
- "\xa2\xdf\xa2\xe0\xa2\xe1\xa2\xe2\xa2\xe3" \
80
- "\xa2\xe4\xa2\xe5\xa2\xe6\xa2\xe7\xa2\xe8" \
81
- "\xa2\xe9\xa2\xea\xa2\xf2\xa2\xf3\xa2\xf4" \
82
- "\xa2\xf5\xa2\xf6\xa2\xf7\xa2\xf8\xa2\xf9" \
83
- "\xa2\xfe\xa6\xa1\xa6\xa2\xa6\xa3\xa6\xa4" \
84
- "\xa6\xa5\xa6\xa6\xa6\xa7\xa6\xa8\xa6\xa9" \
85
- "\xa6\xaa\xa6\xab\xa6\xac\xa6\xad\xa6\xae" \
86
- "\xa6\xaf\xa6\xb0\xa6\xb1\xa6\xb2\xa6\xb3" \
87
- "\xa6\xb4\xa6\xb5\xa6\xb6\xa6\xb7\xa6\xb8" \
88
- "\xa6\xc1\xa6\xc2\xa6\xc3\xa6\xc4\xa6\xc5" \
89
- "\xa6\xc6\xa6\xc7\xa6\xc8\xa6\xc9\xa6\xca" \
90
- "\xa6\xcb\xa6\xcc\xa6\xcd\xa6\xce\xa6\xcf" \
91
- "\xa6\xd0\xa6\xd1\xa6\xd2\xa6\xd3\xa6\xd4" \
92
- "\xa6\xd5\xa6\xd6\xa6\xd7\xa6\xd8\xa7\xa1" \
93
- "\xa7\xa2\xa7\xa3\xa7\xa4\xa7\xa5\xa7\xa6" \
94
- "\xa7\xa7\xa7\xa8\xa7\xa9\xa7\xaa\xa7\xab" \
95
- "\xa7\xac\xa7\xad\xa7\xae\xa7\xaf\xa7\xb0" \
96
- "\xa7\xb1\xa7\xb2\xa7\xb3\xa7\xb4\xa7\xb5" \
97
- "\xa7\xb6\xa7\xb7\xa7\xb8\xa7\xb9\xa7\xba" \
98
- "\xa7\xbb\xa7\xbc\xa7\xbd\xa7\xbe\xa7\xbf" \
99
- "\xa7\xc0\xa7\xc1\xa7\xd1\xa7\xd2\xa7\xd3" \
100
- "\xa7\xd4\xa7\xd5\xa7\xd6\xa7\xd7\xa7\xd8" \
101
- "\xa7\xd9\xa7\xda\xa7\xdb\xa7\xdc\xa7\xdd" \
102
- "\xa7\xde\xa7\xdf\xa7\xe0\xa7\xe1\xa7\xe2" \
103
- "\xa7\xe3\xa7\xe4\xa7\xe5\xa7\xe6\xa7\xe7" \
104
- "\xa7\xe8\xa7\xe9\xa7\xea\xa7\xeb\xa7\xec" \
105
- "\xa7\xed\xa7\xee\xa7\xef\xa7\xf0\xa7\xf1" \
106
- "\xa8\xa1\xa8\xa2\xa8\xa3\xa8\xa4\xa8\xa5" \
107
- "\xa8\xa6\xa8\xa7\xa8\xa8\xa8\xa9\xa8\xaa" \
108
- "\xa8\xab\xa8\xac\xa8\xad\xa8\xae\xa8\xaf" \
109
- "\xa8\xb0\xa8\xb1\xa8\xb2\xa8\xb3\xa8\xb4" \
110
- "\xa8\xb5\xa8\xb6\xa8\xb7\xa8\xb8\xa8\xb9" \
111
- "\xa8\xba\xa8\xbb\xa8\xbc\xa8\xbd\xa8\xbe" \
112
- "\xa8\xbf\xa8\xc0"
113
- HIRA = "\xa4\xa1\xa4\xa2\xa4\xa3\xa4\xa4\xa4\xa5" \
114
- "\xa4\xa6\xa4\xa7\xa4\xa8\xa4\xa9\xa4\xaa" \
115
- "\xa4\xab\xa4\xac\xa4\xad\xa4\xae\xa4\xaf" \
116
- "\xa4\xb0\xa4\xb1\xa4\xb2\xa4\xb3\xa4\xb4" \
117
- "\xa4\xb5\xa4\xb6\xa4\xb7\xa4\xb8\xa4\xb9" \
118
- "\xa4\xba\xa4\xbb\xa4\xbc\xa4\xbd\xa4\xbe" \
119
- "\xa4\xbf\xa4\xc0\xa4\xc1\xa4\xc2\xa4\xc3" \
120
- "\xa4\xc4\xa4\xc5\xa4\xc6\xa4\xc7\xa4\xc8" \
121
- "\xa4\xc9\xa4\xca\xa4\xcb\xa4\xcc\xa4\xcd" \
122
- "\xa4\xce\xa4\xcf\xa4\xd0\xa4\xd1\xa4\xd2" \
123
- "\xa4\xd3\xa4\xd4\xa4\xd5\xa4\xd6\xa4\xd7" \
124
- "\xa4\xd8\xa4\xd9\xa4\xda\xa4\xdb\xa4\xdc" \
125
- "\xa4\xdd\xa4\xde\xa4\xdf\xa4\xe0\xa4\xe1" \
126
- "\xa4\xe2\xa4\xe3\xa4\xe4\xa4\xe5\xa4\xe6" \
127
- "\xa4\xe7\xa4\xe8\xa4\xe9\xa4\xea\xa4\xeb" \
128
- "\xa4\xec\xa4\xed\xa4\xee\xa4\xef\xa4\xf0" \
129
- "\xa4\xf1\xa4\xf2\xa4\xf3" # [\xa4\xa1-\xa4\xf3]
130
- HIRA_EX = HIRA +
131
- "\xa1\xbc" +
132
- "\xa1\xb3\xa1\xb4" # add onbiki and kanagaeshi(hira)
133
- KATA = "\xa5\xa1\xa5\xa2\xa5\xa3\xa5\xa4\xa5\xa5" \
134
- "\xa5\xa6\xa5\xa7\xa5\xa8\xa5\xa9\xa5\xaa" \
135
- "\xa5\xab\xa5\xac\xa5\xad\xa5\xae\xa5\xaf" \
136
- "\xa5\xb0\xa5\xb1\xa5\xb2\xa5\xb3\xa5\xb4" \
137
- "\xa5\xb5\xa5\xb6\xa5\xb7\xa5\xb8\xa5\xb9" \
138
- "\xa5\xba\xa5\xbb\xa5\xbc\xa5\xbd\xa5\xbe" \
139
- "\xa5\xbf\xa5\xc0\xa5\xc1\xa5\xc2\xa5\xc3" \
140
- "\xa5\xc4\xa5\xc5\xa5\xc6\xa5\xc7\xa5\xc8" \
141
- "\xa5\xc9\xa5\xca\xa5\xcb\xa5\xcc\xa5\xcd" \
142
- "\xa5\xce\xa5\xcf\xa5\xd0\xa5\xd1\xa5\xd2" \
143
- "\xa5\xd3\xa5\xd4\xa5\xd5\xa5\xd6\xa5\xd7" \
144
- "\xa5\xd8\xa5\xd9\xa5\xda\xa5\xdb\xa5\xdc" \
145
- "\xa5\xdd\xa5\xde\xa5\xdf\xa5\xe0\xa5\xe1" \
146
- "\xa5\xe2\xa5\xe3\xa5\xe4\xa5\xe5\xa5\xe6" \
147
- "\xa5\xe7\xa5\xe8\xa5\xe9\xa5\xea\xa5\xeb" \
148
- "\xa5\xec\xa5\xed\xa5\xee\xa5\xef\xa5\xf0" \
149
- "\xa5\xf1\xa5\xf2\xa5\xf3\xa5\xf4\xa5\xf5" \
150
- "\xa5\xf6" # [\xa5\xa1-\xa5\xf6]
151
- KATA_EX = KATA +
152
- "\xa1\xbc" +
153
- "\xa1\xb5\xa1\xb6" # add onbiki and kanagaeshi(kata)
154
- KANJI = "\xb0\xa1-\xb0\xfe" \
155
- "\xb1\xa1-\xb1\xfe" \
156
- "\xb2\xa1-\xb2\xfe" \
157
- "\xb3\xa1-\xb3\xfe" \
158
- "\xb4\xa1-\xb4\xfe" \
159
- "\xb5\xa1-\xb5\xfe" \
160
- "\xb6\xa1-\xb6\xfe" \
161
- "\xb7\xa1-\xb7\xfe" \
162
- "\xb8\xa1-\xb8\xfe" \
163
- "\xb9\xa1-\xb9\xfe" \
164
- "\xba\xa1-\xba\xfe" \
165
- "\xbb\xa1-\xbb\xfe" \
166
- "\xbc\xa1-\xbc\xfe" \
167
- "\xbd\xa1-\xbd\xfe" \
168
- "\xbe\xa1-\xbe\xfe" \
169
- "\xbf\xa1-\xbf\xfe" \
170
- "\xc0\xa1-\xc0\xfe" \
171
- "\xc1\xa1-\xc1\xfe" \
172
- "\xc2\xa1-\xc2\xfe" \
173
- "\xc3\xa1-\xc3\xfe" \
174
- "\xc4\xa1-\xc4\xfe" \
175
- "\xc5\xa1-\xc5\xfe" \
176
- "\xc6\xa1-\xc6\xfe" \
177
- "\xc7\xa1-\xc7\xfe" \
178
- "\xc8\xa1-\xc8\xfe" \
179
- "\xc9\xa1-\xc9\xfe" \
180
- "\xca\xa1-\xca\xfe" \
181
- "\xcb\xa1-\xcb\xfe" \
182
- "\xcc\xa1-\xcc\xfe" \
183
- "\xcd\xa1-\xcd\xfe" \
184
- "\xce\xa1-\xce\xfe" \
185
- "\xcf\xa1-\xcf\xd3" \
186
- "\xd0\xa1-\xd0\xfe" \
187
- "\xd1\xa1-\xd1\xfe" \
188
- "\xd2\xa1-\xd2\xfe" \
189
- "\xd3\xa1-\xd3\xfe" \
190
- "\xd4\xa1-\xd4\xfe" \
191
- "\xd5\xa1-\xd5\xfe" \
192
- "\xd6\xa1-\xd6\xfe" \
193
- "\xd7\xa1-\xd7\xfe" \
194
- "\xd8\xa1-\xd8\xfe" \
195
- "\xd9\xa1-\xd9\xfe" \
196
- "\xda\xa1-\xda\xfe" \
197
- "\xdb\xa1-\xdb\xfe" \
198
- "\xdc\xa1-\xdc\xfe" \
199
- "\xdd\xa1-\xdd\xfe" \
200
- "\xde\xa1-\xde\xfe" \
201
- "\xdf\xa1-\xdf\xfe" \
202
- "\xe0\xa1-\xe0\xfe" \
203
- "\xe1\xa1-\xe1\xfe" \
204
- "\xe2\xa1-\xe2\xfe" \
205
- "\xe3\xa1-\xe3\xfe" \
206
- "\xe4\xa1-\xe4\xfe" \
207
- "\xe5\xa1-\xe5\xfe" \
208
- "\xe6\xa1-\xe6\xfe" \
209
- "\xe7\xa1-\xe7\xfe" \
210
- "\xe8\xa1-\xe8\xfe" \
211
- "\xe9\xa1-\xe9\xfe" \
212
- "\xea\xa1-\xea\xfe" \
213
- "\xeb\xa1-\xeb\xfe" \
214
- "\xec\xa1-\xec\xfe" \
215
- "\xed\xa1-\xed\xfe" \
216
- "\xee\xa1-\xee\xfe" \
217
- "\xef\xa1-\xef\xfe" \
218
- "\xf0\xa1-\xf0\xfe" \
219
- "\xf1\xa1-\xf1\xfe" \
220
- "\xf2\xa1-\xf2\xfe" \
221
- "\xf3\xa1-\xf3\xfe" \
222
- "\xf4\xa1-\xf4\xa6"
223
- KANJI_EX = KANJI + "\xa1\xb9" # + noma
224
- JA_GRAPH = JA_ALNUM + JA_PUNCT + HIRA + KATA + KANJI
225
- JA_PRINT = JA_GRAPH + JA_BLANK
245
+ WORD_REGEXP_SRC = [
246
+ "(?:[#{GRAPH}]+[#{BLANK}]?)",
247
+ "|(?:[#{SPACE}]+)",
248
+ "|(?:[#{KANJI_EX}]+[#{HIRA}]+)",
249
+ "|(?:[#{KATA_EX}]+[#{HIRA}]+)",
250
+ "|(?:[#{KANJI_EX}]+)",
251
+ "|(?:[#{KATA_EX}]+)",
252
+ "|(?:[#{HIRA_EX}]+)",
253
+ "|(?:.+?)",
254
+ ].join
226
255
 
227
- PUNCT.replace(Regexp.quote(PUNCT)) # kludge to avoid warning "character class has `[' without escape"
228
- PRINT.replace(Regexp.quote(PRINT)) # kludge to avoid warning "character class has `[' without escape"
229
- GRAPH.replace(Regexp.quote(GRAPH)) # kludge to avoid warning "character class has `[' without escape"
230
-
231
- WORD_REGEXP_SRC = ["(?:[#{GRAPH}]+[#{BLANK}]?)",
232
- "|(?:[#{SPACE}]+)",
233
- "|(?:[#{KANJI_EX}]+[#{HIRA}]+)",
234
- "|(?:[#{KATA_EX}]+[#{HIRA}]+)",
235
- "|(?:[#{KANJI_EX}]+)",
236
- "|(?:[#{KATA_EX}]+)",
237
- "|(?:[#{HIRA_EX}]+)",
238
- "|(?:.+?)"].join
239
-
240
- CharString.register_encoding(self)
241
-
242
- end # module EUCJP
243
- end # module CharString
244
- end # class DocDiff
256
+ CharString.register_encoding(self)
257
+ end
258
+ end
259
+ end