docdiff 0.6.5 → 0.6.6
This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +7 -7
- data/Guardfile +4 -4
- data/Makefile +1 -1
- data/Rakefile +6 -6
- data/bin/docdiff +1 -1
- data/devutil/Rakefile +12 -5
- data/devutil/char_by_charclass.rb +43 -20
- data/devutil/charclass_by_char.rb +40 -19
- data/devutil/jis0208.rb +263 -231
- data/devutil/jis0208_test.rb +196 -0
- data/doc/news.md +8 -0
- data/docdiff.gemspec +12 -10
- data/lib/doc_diff.rb +59 -60
- data/lib/docdiff/charstring.rb +225 -241
- data/lib/docdiff/cli.rb +285 -250
- data/lib/docdiff/diff/contours.rb +1 -1
- data/lib/docdiff/diff/editscript.rb +1 -1
- data/lib/docdiff/diff/rcsdiff.rb +1 -1
- data/lib/docdiff/diff/shortestpath.rb +1 -1
- data/lib/docdiff/diff/speculative.rb +1 -1
- data/lib/docdiff/diff/subsequence.rb +1 -1
- data/lib/docdiff/diff/unidiff.rb +1 -1
- data/lib/docdiff/diff.rb +1 -1
- data/lib/docdiff/difference.rb +71 -70
- data/lib/docdiff/document.rb +129 -109
- data/lib/docdiff/encoding/en_ascii.rb +64 -58
- data/lib/docdiff/encoding/ja_eucjp.rb +250 -235
- data/lib/docdiff/encoding/ja_sjis.rb +240 -226
- data/lib/docdiff/encoding/ja_utf8.rb +6952 -6939
- data/lib/docdiff/version.rb +1 -1
- data/lib/docdiff/view.rb +522 -438
- data/lib/docdiff.rb +2 -2
- data/test/charstring_test.rb +475 -351
- data/test/cli_test.rb +103 -101
- data/test/diff_test.rb +15 -16
- data/test/difference_test.rb +40 -31
- data/test/docdiff_test.rb +162 -136
- data/test/document_test.rb +280 -175
- data/test/test_helper.rb +2 -1
- data/test/view_test.rb +636 -497
- metadata +8 -8
- data/devutil/testjis0208.rb +0 -38
|
@@ -1,244 +1,259 @@
|
|
|
1
1
|
# Japanese EUC-JP encoding module for CharString
|
|
2
2
|
# 2003- Hisashi MORITA
|
|
3
3
|
|
|
4
|
-
# frozen_string_literal:
|
|
4
|
+
# frozen_string_literal: true
|
|
5
5
|
|
|
6
6
|
class DocDiff
|
|
7
|
-
module CharString
|
|
8
|
-
|
|
7
|
+
module CharString
|
|
8
|
+
module EUCJP
|
|
9
|
+
ENCODING = "EUC-JP"
|
|
9
10
|
|
|
10
|
-
|
|
11
|
+
# character table based on:
|
|
12
|
+
# ftp://ftp.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS/JIS0208.TXT
|
|
11
13
|
|
|
12
|
-
|
|
13
|
-
|
|
14
|
+
CNTRL =
|
|
15
|
+
"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09" \
|
|
16
|
+
"\x0a\x0b\x0c\x0d\x0e\x0f\x10\x11\x12\x13" \
|
|
17
|
+
"\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d" \
|
|
18
|
+
"\x1e\x1f\x7f"
|
|
19
|
+
SPACE =
|
|
20
|
+
"\x09\x0a\x0b\x0c\x0d\x20"
|
|
21
|
+
BLANK =
|
|
22
|
+
"\x09\x20"
|
|
23
|
+
DIGIT =
|
|
24
|
+
"\x30\x31\x32\x33\x34\x35\x36\x37\x38\x39"
|
|
25
|
+
UPPER =
|
|
26
|
+
"\x41\x42\x43\x44\x45\x46\x47\x48\x49\x4a" \
|
|
27
|
+
"\x4b\x4c\x4d\x4e\x4f\x50\x51\x52\x53\x54" \
|
|
28
|
+
"\x55\x56\x57\x58\x59\x5a"
|
|
29
|
+
LOWER =
|
|
30
|
+
"\x61\x62\x63\x64\x65\x66\x67\x68\x69\x6a" \
|
|
31
|
+
"\x6b\x6c\x6d\x6e\x6f\x70\x71\x72\x73\x74" \
|
|
32
|
+
"\x75\x76\x77\x78\x79\x7a"
|
|
33
|
+
ALPHA = UPPER + LOWER
|
|
34
|
+
ALNUM = DIGIT + ALPHA
|
|
35
|
+
PUNCT =
|
|
36
|
+
Regexp.quote(
|
|
37
|
+
"\x21\x22\x23\x24\x25\x26\x27\x28\x29\x2a" \
|
|
38
|
+
"\x2b\x2c\x2d\x2e\x2f\x3a\x3b\x3c\x3d\x3e" \
|
|
39
|
+
"\x3f\x40\x5b\x5c\x5d\x5e\x5f\x60\x7b\x7c" \
|
|
40
|
+
"\x7d\x7e",
|
|
41
|
+
)
|
|
42
|
+
GRAPH = DIGIT + UPPER + LOWER + PUNCT
|
|
43
|
+
PRINT = "\x20" + GRAPH
|
|
44
|
+
XDIGIT =
|
|
45
|
+
DIGIT +
|
|
46
|
+
"\x41\x42\x43\x44\x45\x46\x61\x62\x63\x64" \
|
|
47
|
+
"\x65\x66"
|
|
48
|
+
JA_SPACE =
|
|
49
|
+
"\xa1\xa1"
|
|
50
|
+
JA_BLANK =
|
|
51
|
+
"\xa1\xa1"
|
|
52
|
+
JA_ALNUM =
|
|
53
|
+
"\xa3\xb0\xa3\xb1\xa3\xb2\xa3\xb3\xa3\xb4" \
|
|
54
|
+
"\xa3\xb5\xa3\xb6\xa3\xb7\xa3\xb8\xa3\xb9" \
|
|
55
|
+
"\xa3\xc1\xa3\xc2\xa3\xc3\xa3\xc4\xa3\xc5" \
|
|
56
|
+
"\xa3\xc6\xa3\xc7\xa3\xc8\xa3\xc9\xa3\xca" \
|
|
57
|
+
"\xa3\xcb\xa3\xcc\xa3\xcd\xa3\xce\xa3\xcf" \
|
|
58
|
+
"\xa3\xd0\xa3\xd1\xa3\xd2\xa3\xd3\xa3\xd4" \
|
|
59
|
+
"\xa3\xd5\xa3\xd6\xa3\xd7\xa3\xd8\xa3\xd9" \
|
|
60
|
+
"\xa3\xda\xa3\xe1\xa3\xe2\xa3\xe3\xa3\xe4" \
|
|
61
|
+
"\xa3\xe5\xa3\xe6\xa3\xe7\xa3\xe8\xa3\xe9" \
|
|
62
|
+
"\xa3\xea\xa3\xeb\xa3\xec\xa3\xed\xa3\xee" \
|
|
63
|
+
"\xa3\xef\xa3\xf0\xa3\xf1\xa3\xf2\xa3\xf3" \
|
|
64
|
+
"\xa3\xf4\xa3\xf5\xa3\xf6\xa3\xf7\xa3\xf8" \
|
|
65
|
+
"\xa3\xf9\xa3\xfa"
|
|
66
|
+
JA_PUNCT =
|
|
67
|
+
"\xa1\xa2\xa1\xa3\xa1\xa4\xa1\xa5\xa1\xa6" \
|
|
68
|
+
"\xa1\xa7\xa1\xa8\xa1\xa9\xa1\xaa\xa1\xab" \
|
|
69
|
+
"\xa1\xac\xa1\xad\xa1\xae\xa1\xaf\xa1\xb0" \
|
|
70
|
+
"\xa1\xb1\xa1\xb2\xa1\xb3\xa1\xb4\xa1\xb5" \
|
|
71
|
+
"\xa1\xb6\xa1\xb7\xa1\xb8\xa1\xb9\xa1\xba" \
|
|
72
|
+
"\xa1\xbb\xa1\xbc\xa1\xbd\xa1\xbe\xa1\xbf" \
|
|
73
|
+
"\xa1\xc0\xa1\xc1\xa1\xc2\xa1\xc3\xa1\xc4" \
|
|
74
|
+
"\xa1\xc5\xa1\xc6\xa1\xc7\xa1\xc8\xa1\xc9" \
|
|
75
|
+
"\xa1\xca\xa1\xcb\xa1\xcc\xa1\xcd\xa1\xce" \
|
|
76
|
+
"\xa1\xcf\xa1\xd0\xa1\xd1\xa1\xd2\xa1\xd3" \
|
|
77
|
+
"\xa1\xd4\xa1\xd5\xa1\xd6\xa1\xd7\xa1\xd8" \
|
|
78
|
+
"\xa1\xd9\xa1\xda\xa1\xdb\xa1\xdc\xa1\xdd" \
|
|
79
|
+
"\xa1\xde\xa1\xdf\xa1\xe0\xa1\xe1\xa1\xe2" \
|
|
80
|
+
"\xa1\xe3\xa1\xe4\xa1\xe5\xa1\xe6\xa1\xe7" \
|
|
81
|
+
"\xa1\xe8\xa1\xe9\xa1\xea\xa1\xeb\xa1\xec" \
|
|
82
|
+
"\xa1\xed\xa1\xee\xa1\xef\xa1\xf0\xa1\xf1" \
|
|
83
|
+
"\xa1\xf2\xa1\xf3\xa1\xf4\xa1\xf5\xa1\xf6" \
|
|
84
|
+
"\xa1\xf7\xa1\xf8\xa1\xf9\xa1\xfa\xa1\xfb" \
|
|
85
|
+
"\xa1\xfc\xa1\xfd\xa1\xfe\xa2\xa1\xa2\xa2" \
|
|
86
|
+
"\xa2\xa3\xa2\xa4\xa2\xa5\xa2\xa6\xa2\xa7" \
|
|
87
|
+
"\xa2\xa8\xa2\xa9\xa2\xaa\xa2\xab\xa2\xac" \
|
|
88
|
+
"\xa2\xad\xa2\xae\xa2\xba\xa2\xbb\xa2\xbc" \
|
|
89
|
+
"\xa2\xbd\xa2\xbe\xa2\xbf\xa2\xc0\xa2\xc1" \
|
|
90
|
+
"\xa2\xca\xa2\xcb\xa2\xcc\xa2\xcd\xa2\xce" \
|
|
91
|
+
"\xa2\xcf\xa2\xd0\xa2\xdc\xa2\xdd\xa2\xde" \
|
|
92
|
+
"\xa2\xdf\xa2\xe0\xa2\xe1\xa2\xe2\xa2\xe3" \
|
|
93
|
+
"\xa2\xe4\xa2\xe5\xa2\xe6\xa2\xe7\xa2\xe8" \
|
|
94
|
+
"\xa2\xe9\xa2\xea\xa2\xf2\xa2\xf3\xa2\xf4" \
|
|
95
|
+
"\xa2\xf5\xa2\xf6\xa2\xf7\xa2\xf8\xa2\xf9" \
|
|
96
|
+
"\xa2\xfe\xa6\xa1\xa6\xa2\xa6\xa3\xa6\xa4" \
|
|
97
|
+
"\xa6\xa5\xa6\xa6\xa6\xa7\xa6\xa8\xa6\xa9" \
|
|
98
|
+
"\xa6\xaa\xa6\xab\xa6\xac\xa6\xad\xa6\xae" \
|
|
99
|
+
"\xa6\xaf\xa6\xb0\xa6\xb1\xa6\xb2\xa6\xb3" \
|
|
100
|
+
"\xa6\xb4\xa6\xb5\xa6\xb6\xa6\xb7\xa6\xb8" \
|
|
101
|
+
"\xa6\xc1\xa6\xc2\xa6\xc3\xa6\xc4\xa6\xc5" \
|
|
102
|
+
"\xa6\xc6\xa6\xc7\xa6\xc8\xa6\xc9\xa6\xca" \
|
|
103
|
+
"\xa6\xcb\xa6\xcc\xa6\xcd\xa6\xce\xa6\xcf" \
|
|
104
|
+
"\xa6\xd0\xa6\xd1\xa6\xd2\xa6\xd3\xa6\xd4" \
|
|
105
|
+
"\xa6\xd5\xa6\xd6\xa6\xd7\xa6\xd8\xa7\xa1" \
|
|
106
|
+
"\xa7\xa2\xa7\xa3\xa7\xa4\xa7\xa5\xa7\xa6" \
|
|
107
|
+
"\xa7\xa7\xa7\xa8\xa7\xa9\xa7\xaa\xa7\xab" \
|
|
108
|
+
"\xa7\xac\xa7\xad\xa7\xae\xa7\xaf\xa7\xb0" \
|
|
109
|
+
"\xa7\xb1\xa7\xb2\xa7\xb3\xa7\xb4\xa7\xb5" \
|
|
110
|
+
"\xa7\xb6\xa7\xb7\xa7\xb8\xa7\xb9\xa7\xba" \
|
|
111
|
+
"\xa7\xbb\xa7\xbc\xa7\xbd\xa7\xbe\xa7\xbf" \
|
|
112
|
+
"\xa7\xc0\xa7\xc1\xa7\xd1\xa7\xd2\xa7\xd3" \
|
|
113
|
+
"\xa7\xd4\xa7\xd5\xa7\xd6\xa7\xd7\xa7\xd8" \
|
|
114
|
+
"\xa7\xd9\xa7\xda\xa7\xdb\xa7\xdc\xa7\xdd" \
|
|
115
|
+
"\xa7\xde\xa7\xdf\xa7\xe0\xa7\xe1\xa7\xe2" \
|
|
116
|
+
"\xa7\xe3\xa7\xe4\xa7\xe5\xa7\xe6\xa7\xe7" \
|
|
117
|
+
"\xa7\xe8\xa7\xe9\xa7\xea\xa7\xeb\xa7\xec" \
|
|
118
|
+
"\xa7\xed\xa7\xee\xa7\xef\xa7\xf0\xa7\xf1" \
|
|
119
|
+
"\xa8\xa1\xa8\xa2\xa8\xa3\xa8\xa4\xa8\xa5" \
|
|
120
|
+
"\xa8\xa6\xa8\xa7\xa8\xa8\xa8\xa9\xa8\xaa" \
|
|
121
|
+
"\xa8\xab\xa8\xac\xa8\xad\xa8\xae\xa8\xaf" \
|
|
122
|
+
"\xa8\xb0\xa8\xb1\xa8\xb2\xa8\xb3\xa8\xb4" \
|
|
123
|
+
"\xa8\xb5\xa8\xb6\xa8\xb7\xa8\xb8\xa8\xb9" \
|
|
124
|
+
"\xa8\xba\xa8\xbb\xa8\xbc\xa8\xbd\xa8\xbe" \
|
|
125
|
+
"\xa8\xbf\xa8\xc0"
|
|
126
|
+
HIRA =
|
|
127
|
+
"\xa4\xa1\xa4\xa2\xa4\xa3\xa4\xa4\xa4\xa5" \
|
|
128
|
+
"\xa4\xa6\xa4\xa7\xa4\xa8\xa4\xa9\xa4\xaa" \
|
|
129
|
+
"\xa4\xab\xa4\xac\xa4\xad\xa4\xae\xa4\xaf" \
|
|
130
|
+
"\xa4\xb0\xa4\xb1\xa4\xb2\xa4\xb3\xa4\xb4" \
|
|
131
|
+
"\xa4\xb5\xa4\xb6\xa4\xb7\xa4\xb8\xa4\xb9" \
|
|
132
|
+
"\xa4\xba\xa4\xbb\xa4\xbc\xa4\xbd\xa4\xbe" \
|
|
133
|
+
"\xa4\xbf\xa4\xc0\xa4\xc1\xa4\xc2\xa4\xc3" \
|
|
134
|
+
"\xa4\xc4\xa4\xc5\xa4\xc6\xa4\xc7\xa4\xc8" \
|
|
135
|
+
"\xa4\xc9\xa4\xca\xa4\xcb\xa4\xcc\xa4\xcd" \
|
|
136
|
+
"\xa4\xce\xa4\xcf\xa4\xd0\xa4\xd1\xa4\xd2" \
|
|
137
|
+
"\xa4\xd3\xa4\xd4\xa4\xd5\xa4\xd6\xa4\xd7" \
|
|
138
|
+
"\xa4\xd8\xa4\xd9\xa4\xda\xa4\xdb\xa4\xdc" \
|
|
139
|
+
"\xa4\xdd\xa4\xde\xa4\xdf\xa4\xe0\xa4\xe1" \
|
|
140
|
+
"\xa4\xe2\xa4\xe3\xa4\xe4\xa4\xe5\xa4\xe6" \
|
|
141
|
+
"\xa4\xe7\xa4\xe8\xa4\xe9\xa4\xea\xa4\xeb" \
|
|
142
|
+
"\xa4\xec\xa4\xed\xa4\xee\xa4\xef\xa4\xf0" \
|
|
143
|
+
"\xa4\xf1\xa4\xf2\xa4\xf3" # [\xa4\xa1-\xa4\xf3]
|
|
144
|
+
HIRA_EX =
|
|
145
|
+
HIRA +
|
|
146
|
+
"\xa1\xbc" \
|
|
147
|
+
"\xa1\xb3\xa1\xb4" # add onbiki and kanagaeshi(hira)
|
|
148
|
+
KATA =
|
|
149
|
+
"\xa5\xa1\xa5\xa2\xa5\xa3\xa5\xa4\xa5\xa5" \
|
|
150
|
+
"\xa5\xa6\xa5\xa7\xa5\xa8\xa5\xa9\xa5\xaa" \
|
|
151
|
+
"\xa5\xab\xa5\xac\xa5\xad\xa5\xae\xa5\xaf" \
|
|
152
|
+
"\xa5\xb0\xa5\xb1\xa5\xb2\xa5\xb3\xa5\xb4" \
|
|
153
|
+
"\xa5\xb5\xa5\xb6\xa5\xb7\xa5\xb8\xa5\xb9" \
|
|
154
|
+
"\xa5\xba\xa5\xbb\xa5\xbc\xa5\xbd\xa5\xbe" \
|
|
155
|
+
"\xa5\xbf\xa5\xc0\xa5\xc1\xa5\xc2\xa5\xc3" \
|
|
156
|
+
"\xa5\xc4\xa5\xc5\xa5\xc6\xa5\xc7\xa5\xc8" \
|
|
157
|
+
"\xa5\xc9\xa5\xca\xa5\xcb\xa5\xcc\xa5\xcd" \
|
|
158
|
+
"\xa5\xce\xa5\xcf\xa5\xd0\xa5\xd1\xa5\xd2" \
|
|
159
|
+
"\xa5\xd3\xa5\xd4\xa5\xd5\xa5\xd6\xa5\xd7" \
|
|
160
|
+
"\xa5\xd8\xa5\xd9\xa5\xda\xa5\xdb\xa5\xdc" \
|
|
161
|
+
"\xa5\xdd\xa5\xde\xa5\xdf\xa5\xe0\xa5\xe1" \
|
|
162
|
+
"\xa5\xe2\xa5\xe3\xa5\xe4\xa5\xe5\xa5\xe6" \
|
|
163
|
+
"\xa5\xe7\xa5\xe8\xa5\xe9\xa5\xea\xa5\xeb" \
|
|
164
|
+
"\xa5\xec\xa5\xed\xa5\xee\xa5\xef\xa5\xf0" \
|
|
165
|
+
"\xa5\xf1\xa5\xf2\xa5\xf3\xa5\xf4\xa5\xf5" \
|
|
166
|
+
"\xa5\xf6" # [\xa5\xa1-\xa5\xf6]
|
|
167
|
+
KATA_EX =
|
|
168
|
+
KATA +
|
|
169
|
+
"\xa1\xbc" \
|
|
170
|
+
"\xa1\xb5\xa1\xb6" # add onbiki and kanagaeshi(kata)
|
|
171
|
+
KANJI =
|
|
172
|
+
"\xb0\xa1-\xb0\xfe" \
|
|
173
|
+
"\xb1\xa1-\xb1\xfe" \
|
|
174
|
+
"\xb2\xa1-\xb2\xfe" \
|
|
175
|
+
"\xb3\xa1-\xb3\xfe" \
|
|
176
|
+
"\xb4\xa1-\xb4\xfe" \
|
|
177
|
+
"\xb5\xa1-\xb5\xfe" \
|
|
178
|
+
"\xb6\xa1-\xb6\xfe" \
|
|
179
|
+
"\xb7\xa1-\xb7\xfe" \
|
|
180
|
+
"\xb8\xa1-\xb8\xfe" \
|
|
181
|
+
"\xb9\xa1-\xb9\xfe" \
|
|
182
|
+
"\xba\xa1-\xba\xfe" \
|
|
183
|
+
"\xbb\xa1-\xbb\xfe" \
|
|
184
|
+
"\xbc\xa1-\xbc\xfe" \
|
|
185
|
+
"\xbd\xa1-\xbd\xfe" \
|
|
186
|
+
"\xbe\xa1-\xbe\xfe" \
|
|
187
|
+
"\xbf\xa1-\xbf\xfe" \
|
|
188
|
+
"\xc0\xa1-\xc0\xfe" \
|
|
189
|
+
"\xc1\xa1-\xc1\xfe" \
|
|
190
|
+
"\xc2\xa1-\xc2\xfe" \
|
|
191
|
+
"\xc3\xa1-\xc3\xfe" \
|
|
192
|
+
"\xc4\xa1-\xc4\xfe" \
|
|
193
|
+
"\xc5\xa1-\xc5\xfe" \
|
|
194
|
+
"\xc6\xa1-\xc6\xfe" \
|
|
195
|
+
"\xc7\xa1-\xc7\xfe" \
|
|
196
|
+
"\xc8\xa1-\xc8\xfe" \
|
|
197
|
+
"\xc9\xa1-\xc9\xfe" \
|
|
198
|
+
"\xca\xa1-\xca\xfe" \
|
|
199
|
+
"\xcb\xa1-\xcb\xfe" \
|
|
200
|
+
"\xcc\xa1-\xcc\xfe" \
|
|
201
|
+
"\xcd\xa1-\xcd\xfe" \
|
|
202
|
+
"\xce\xa1-\xce\xfe" \
|
|
203
|
+
"\xcf\xa1-\xcf\xd3" \
|
|
204
|
+
"\xd0\xa1-\xd0\xfe" \
|
|
205
|
+
"\xd1\xa1-\xd1\xfe" \
|
|
206
|
+
"\xd2\xa1-\xd2\xfe" \
|
|
207
|
+
"\xd3\xa1-\xd3\xfe" \
|
|
208
|
+
"\xd4\xa1-\xd4\xfe" \
|
|
209
|
+
"\xd5\xa1-\xd5\xfe" \
|
|
210
|
+
"\xd6\xa1-\xd6\xfe" \
|
|
211
|
+
"\xd7\xa1-\xd7\xfe" \
|
|
212
|
+
"\xd8\xa1-\xd8\xfe" \
|
|
213
|
+
"\xd9\xa1-\xd9\xfe" \
|
|
214
|
+
"\xda\xa1-\xda\xfe" \
|
|
215
|
+
"\xdb\xa1-\xdb\xfe" \
|
|
216
|
+
"\xdc\xa1-\xdc\xfe" \
|
|
217
|
+
"\xdd\xa1-\xdd\xfe" \
|
|
218
|
+
"\xde\xa1-\xde\xfe" \
|
|
219
|
+
"\xdf\xa1-\xdf\xfe" \
|
|
220
|
+
"\xe0\xa1-\xe0\xfe" \
|
|
221
|
+
"\xe1\xa1-\xe1\xfe" \
|
|
222
|
+
"\xe2\xa1-\xe2\xfe" \
|
|
223
|
+
"\xe3\xa1-\xe3\xfe" \
|
|
224
|
+
"\xe4\xa1-\xe4\xfe" \
|
|
225
|
+
"\xe5\xa1-\xe5\xfe" \
|
|
226
|
+
"\xe6\xa1-\xe6\xfe" \
|
|
227
|
+
"\xe7\xa1-\xe7\xfe" \
|
|
228
|
+
"\xe8\xa1-\xe8\xfe" \
|
|
229
|
+
"\xe9\xa1-\xe9\xfe" \
|
|
230
|
+
"\xea\xa1-\xea\xfe" \
|
|
231
|
+
"\xeb\xa1-\xeb\xfe" \
|
|
232
|
+
"\xec\xa1-\xec\xfe" \
|
|
233
|
+
"\xed\xa1-\xed\xfe" \
|
|
234
|
+
"\xee\xa1-\xee\xfe" \
|
|
235
|
+
"\xef\xa1-\xef\xfe" \
|
|
236
|
+
"\xf0\xa1-\xf0\xfe" \
|
|
237
|
+
"\xf1\xa1-\xf1\xfe" \
|
|
238
|
+
"\xf2\xa1-\xf2\xfe" \
|
|
239
|
+
"\xf3\xa1-\xf3\xfe" \
|
|
240
|
+
"\xf4\xa1-\xf4\xa6"
|
|
241
|
+
KANJI_EX = KANJI + "\xa1\xb9" # + noma
|
|
242
|
+
JA_GRAPH = JA_ALNUM + JA_PUNCT + HIRA + KATA + KANJI
|
|
243
|
+
JA_PRINT = JA_GRAPH + JA_BLANK
|
|
14
244
|
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
LOWER = "\x61\x62\x63\x64\x65\x66\x67\x68\x69\x6a" \
|
|
26
|
-
"\x6b\x6c\x6d\x6e\x6f\x70\x71\x72\x73\x74" \
|
|
27
|
-
"\x75\x76\x77\x78\x79\x7a"
|
|
28
|
-
ALPHA = UPPER + LOWER
|
|
29
|
-
ALNUM = DIGIT + ALPHA
|
|
30
|
-
PUNCT = "\x21\x22\x23\x24\x25\x26\x27\x28\x29\x2a" \
|
|
31
|
-
"\x2b\x2c\x2d\x2e\x2f\x3a\x3b\x3c\x3d\x3e" \
|
|
32
|
-
"\x3f\x40\x5b\x5c\x5d\x5e\x5f\x60\x7b\x7c" \
|
|
33
|
-
"\x7d\x7e"
|
|
34
|
-
GRAPH = DIGIT + UPPER + LOWER + PUNCT
|
|
35
|
-
PRINT = "\x20" + GRAPH
|
|
36
|
-
XDIGIT = DIGIT +
|
|
37
|
-
"\x41\x42\x43\x44\x45\x46\x61\x62\x63\x64" \
|
|
38
|
-
"\x65\x66"
|
|
39
|
-
JA_SPACE = "\xa1\xa1"
|
|
40
|
-
JA_BLANK = "\xa1\xa1"
|
|
41
|
-
JA_ALNUM = "\xa3\xb0\xa3\xb1\xa3\xb2\xa3\xb3\xa3\xb4" \
|
|
42
|
-
"\xa3\xb5\xa3\xb6\xa3\xb7\xa3\xb8\xa3\xb9" \
|
|
43
|
-
"\xa3\xc1\xa3\xc2\xa3\xc3\xa3\xc4\xa3\xc5" \
|
|
44
|
-
"\xa3\xc6\xa3\xc7\xa3\xc8\xa3\xc9\xa3\xca" \
|
|
45
|
-
"\xa3\xcb\xa3\xcc\xa3\xcd\xa3\xce\xa3\xcf" \
|
|
46
|
-
"\xa3\xd0\xa3\xd1\xa3\xd2\xa3\xd3\xa3\xd4" \
|
|
47
|
-
"\xa3\xd5\xa3\xd6\xa3\xd7\xa3\xd8\xa3\xd9" \
|
|
48
|
-
"\xa3\xda\xa3\xe1\xa3\xe2\xa3\xe3\xa3\xe4" \
|
|
49
|
-
"\xa3\xe5\xa3\xe6\xa3\xe7\xa3\xe8\xa3\xe9" \
|
|
50
|
-
"\xa3\xea\xa3\xeb\xa3\xec\xa3\xed\xa3\xee" \
|
|
51
|
-
"\xa3\xef\xa3\xf0\xa3\xf1\xa3\xf2\xa3\xf3" \
|
|
52
|
-
"\xa3\xf4\xa3\xf5\xa3\xf6\xa3\xf7\xa3\xf8" \
|
|
53
|
-
"\xa3\xf9\xa3\xfa"
|
|
54
|
-
JA_PUNCT = "\xa1\xa2\xa1\xa3\xa1\xa4\xa1\xa5\xa1\xa6" \
|
|
55
|
-
"\xa1\xa7\xa1\xa8\xa1\xa9\xa1\xaa\xa1\xab" \
|
|
56
|
-
"\xa1\xac\xa1\xad\xa1\xae\xa1\xaf\xa1\xb0" \
|
|
57
|
-
"\xa1\xb1\xa1\xb2\xa1\xb3\xa1\xb4\xa1\xb5" \
|
|
58
|
-
"\xa1\xb6\xa1\xb7\xa1\xb8\xa1\xb9\xa1\xba" \
|
|
59
|
-
"\xa1\xbb\xa1\xbc\xa1\xbd\xa1\xbe\xa1\xbf" \
|
|
60
|
-
"\xa1\xc0\xa1\xc1\xa1\xc2\xa1\xc3\xa1\xc4" \
|
|
61
|
-
"\xa1\xc5\xa1\xc6\xa1\xc7\xa1\xc8\xa1\xc9" \
|
|
62
|
-
"\xa1\xca\xa1\xcb\xa1\xcc\xa1\xcd\xa1\xce" \
|
|
63
|
-
"\xa1\xcf\xa1\xd0\xa1\xd1\xa1\xd2\xa1\xd3" \
|
|
64
|
-
"\xa1\xd4\xa1\xd5\xa1\xd6\xa1\xd7\xa1\xd8" \
|
|
65
|
-
"\xa1\xd9\xa1\xda\xa1\xdb\xa1\xdc\xa1\xdd" \
|
|
66
|
-
"\xa1\xde\xa1\xdf\xa1\xe0\xa1\xe1\xa1\xe2" \
|
|
67
|
-
"\xa1\xe3\xa1\xe4\xa1\xe5\xa1\xe6\xa1\xe7" \
|
|
68
|
-
"\xa1\xe8\xa1\xe9\xa1\xea\xa1\xeb\xa1\xec" \
|
|
69
|
-
"\xa1\xed\xa1\xee\xa1\xef\xa1\xf0\xa1\xf1" \
|
|
70
|
-
"\xa1\xf2\xa1\xf3\xa1\xf4\xa1\xf5\xa1\xf6" \
|
|
71
|
-
"\xa1\xf7\xa1\xf8\xa1\xf9\xa1\xfa\xa1\xfb" \
|
|
72
|
-
"\xa1\xfc\xa1\xfd\xa1\xfe\xa2\xa1\xa2\xa2" \
|
|
73
|
-
"\xa2\xa3\xa2\xa4\xa2\xa5\xa2\xa6\xa2\xa7" \
|
|
74
|
-
"\xa2\xa8\xa2\xa9\xa2\xaa\xa2\xab\xa2\xac" \
|
|
75
|
-
"\xa2\xad\xa2\xae\xa2\xba\xa2\xbb\xa2\xbc" \
|
|
76
|
-
"\xa2\xbd\xa2\xbe\xa2\xbf\xa2\xc0\xa2\xc1" \
|
|
77
|
-
"\xa2\xca\xa2\xcb\xa2\xcc\xa2\xcd\xa2\xce" \
|
|
78
|
-
"\xa2\xcf\xa2\xd0\xa2\xdc\xa2\xdd\xa2\xde" \
|
|
79
|
-
"\xa2\xdf\xa2\xe0\xa2\xe1\xa2\xe2\xa2\xe3" \
|
|
80
|
-
"\xa2\xe4\xa2\xe5\xa2\xe6\xa2\xe7\xa2\xe8" \
|
|
81
|
-
"\xa2\xe9\xa2\xea\xa2\xf2\xa2\xf3\xa2\xf4" \
|
|
82
|
-
"\xa2\xf5\xa2\xf6\xa2\xf7\xa2\xf8\xa2\xf9" \
|
|
83
|
-
"\xa2\xfe\xa6\xa1\xa6\xa2\xa6\xa3\xa6\xa4" \
|
|
84
|
-
"\xa6\xa5\xa6\xa6\xa6\xa7\xa6\xa8\xa6\xa9" \
|
|
85
|
-
"\xa6\xaa\xa6\xab\xa6\xac\xa6\xad\xa6\xae" \
|
|
86
|
-
"\xa6\xaf\xa6\xb0\xa6\xb1\xa6\xb2\xa6\xb3" \
|
|
87
|
-
"\xa6\xb4\xa6\xb5\xa6\xb6\xa6\xb7\xa6\xb8" \
|
|
88
|
-
"\xa6\xc1\xa6\xc2\xa6\xc3\xa6\xc4\xa6\xc5" \
|
|
89
|
-
"\xa6\xc6\xa6\xc7\xa6\xc8\xa6\xc9\xa6\xca" \
|
|
90
|
-
"\xa6\xcb\xa6\xcc\xa6\xcd\xa6\xce\xa6\xcf" \
|
|
91
|
-
"\xa6\xd0\xa6\xd1\xa6\xd2\xa6\xd3\xa6\xd4" \
|
|
92
|
-
"\xa6\xd5\xa6\xd6\xa6\xd7\xa6\xd8\xa7\xa1" \
|
|
93
|
-
"\xa7\xa2\xa7\xa3\xa7\xa4\xa7\xa5\xa7\xa6" \
|
|
94
|
-
"\xa7\xa7\xa7\xa8\xa7\xa9\xa7\xaa\xa7\xab" \
|
|
95
|
-
"\xa7\xac\xa7\xad\xa7\xae\xa7\xaf\xa7\xb0" \
|
|
96
|
-
"\xa7\xb1\xa7\xb2\xa7\xb3\xa7\xb4\xa7\xb5" \
|
|
97
|
-
"\xa7\xb6\xa7\xb7\xa7\xb8\xa7\xb9\xa7\xba" \
|
|
98
|
-
"\xa7\xbb\xa7\xbc\xa7\xbd\xa7\xbe\xa7\xbf" \
|
|
99
|
-
"\xa7\xc0\xa7\xc1\xa7\xd1\xa7\xd2\xa7\xd3" \
|
|
100
|
-
"\xa7\xd4\xa7\xd5\xa7\xd6\xa7\xd7\xa7\xd8" \
|
|
101
|
-
"\xa7\xd9\xa7\xda\xa7\xdb\xa7\xdc\xa7\xdd" \
|
|
102
|
-
"\xa7\xde\xa7\xdf\xa7\xe0\xa7\xe1\xa7\xe2" \
|
|
103
|
-
"\xa7\xe3\xa7\xe4\xa7\xe5\xa7\xe6\xa7\xe7" \
|
|
104
|
-
"\xa7\xe8\xa7\xe9\xa7\xea\xa7\xeb\xa7\xec" \
|
|
105
|
-
"\xa7\xed\xa7\xee\xa7\xef\xa7\xf0\xa7\xf1" \
|
|
106
|
-
"\xa8\xa1\xa8\xa2\xa8\xa3\xa8\xa4\xa8\xa5" \
|
|
107
|
-
"\xa8\xa6\xa8\xa7\xa8\xa8\xa8\xa9\xa8\xaa" \
|
|
108
|
-
"\xa8\xab\xa8\xac\xa8\xad\xa8\xae\xa8\xaf" \
|
|
109
|
-
"\xa8\xb0\xa8\xb1\xa8\xb2\xa8\xb3\xa8\xb4" \
|
|
110
|
-
"\xa8\xb5\xa8\xb6\xa8\xb7\xa8\xb8\xa8\xb9" \
|
|
111
|
-
"\xa8\xba\xa8\xbb\xa8\xbc\xa8\xbd\xa8\xbe" \
|
|
112
|
-
"\xa8\xbf\xa8\xc0"
|
|
113
|
-
HIRA = "\xa4\xa1\xa4\xa2\xa4\xa3\xa4\xa4\xa4\xa5" \
|
|
114
|
-
"\xa4\xa6\xa4\xa7\xa4\xa8\xa4\xa9\xa4\xaa" \
|
|
115
|
-
"\xa4\xab\xa4\xac\xa4\xad\xa4\xae\xa4\xaf" \
|
|
116
|
-
"\xa4\xb0\xa4\xb1\xa4\xb2\xa4\xb3\xa4\xb4" \
|
|
117
|
-
"\xa4\xb5\xa4\xb6\xa4\xb7\xa4\xb8\xa4\xb9" \
|
|
118
|
-
"\xa4\xba\xa4\xbb\xa4\xbc\xa4\xbd\xa4\xbe" \
|
|
119
|
-
"\xa4\xbf\xa4\xc0\xa4\xc1\xa4\xc2\xa4\xc3" \
|
|
120
|
-
"\xa4\xc4\xa4\xc5\xa4\xc6\xa4\xc7\xa4\xc8" \
|
|
121
|
-
"\xa4\xc9\xa4\xca\xa4\xcb\xa4\xcc\xa4\xcd" \
|
|
122
|
-
"\xa4\xce\xa4\xcf\xa4\xd0\xa4\xd1\xa4\xd2" \
|
|
123
|
-
"\xa4\xd3\xa4\xd4\xa4\xd5\xa4\xd6\xa4\xd7" \
|
|
124
|
-
"\xa4\xd8\xa4\xd9\xa4\xda\xa4\xdb\xa4\xdc" \
|
|
125
|
-
"\xa4\xdd\xa4\xde\xa4\xdf\xa4\xe0\xa4\xe1" \
|
|
126
|
-
"\xa4\xe2\xa4\xe3\xa4\xe4\xa4\xe5\xa4\xe6" \
|
|
127
|
-
"\xa4\xe7\xa4\xe8\xa4\xe9\xa4\xea\xa4\xeb" \
|
|
128
|
-
"\xa4\xec\xa4\xed\xa4\xee\xa4\xef\xa4\xf0" \
|
|
129
|
-
"\xa4\xf1\xa4\xf2\xa4\xf3" # [\xa4\xa1-\xa4\xf3]
|
|
130
|
-
HIRA_EX = HIRA +
|
|
131
|
-
"\xa1\xbc" +
|
|
132
|
-
"\xa1\xb3\xa1\xb4" # add onbiki and kanagaeshi(hira)
|
|
133
|
-
KATA = "\xa5\xa1\xa5\xa2\xa5\xa3\xa5\xa4\xa5\xa5" \
|
|
134
|
-
"\xa5\xa6\xa5\xa7\xa5\xa8\xa5\xa9\xa5\xaa" \
|
|
135
|
-
"\xa5\xab\xa5\xac\xa5\xad\xa5\xae\xa5\xaf" \
|
|
136
|
-
"\xa5\xb0\xa5\xb1\xa5\xb2\xa5\xb3\xa5\xb4" \
|
|
137
|
-
"\xa5\xb5\xa5\xb6\xa5\xb7\xa5\xb8\xa5\xb9" \
|
|
138
|
-
"\xa5\xba\xa5\xbb\xa5\xbc\xa5\xbd\xa5\xbe" \
|
|
139
|
-
"\xa5\xbf\xa5\xc0\xa5\xc1\xa5\xc2\xa5\xc3" \
|
|
140
|
-
"\xa5\xc4\xa5\xc5\xa5\xc6\xa5\xc7\xa5\xc8" \
|
|
141
|
-
"\xa5\xc9\xa5\xca\xa5\xcb\xa5\xcc\xa5\xcd" \
|
|
142
|
-
"\xa5\xce\xa5\xcf\xa5\xd0\xa5\xd1\xa5\xd2" \
|
|
143
|
-
"\xa5\xd3\xa5\xd4\xa5\xd5\xa5\xd6\xa5\xd7" \
|
|
144
|
-
"\xa5\xd8\xa5\xd9\xa5\xda\xa5\xdb\xa5\xdc" \
|
|
145
|
-
"\xa5\xdd\xa5\xde\xa5\xdf\xa5\xe0\xa5\xe1" \
|
|
146
|
-
"\xa5\xe2\xa5\xe3\xa5\xe4\xa5\xe5\xa5\xe6" \
|
|
147
|
-
"\xa5\xe7\xa5\xe8\xa5\xe9\xa5\xea\xa5\xeb" \
|
|
148
|
-
"\xa5\xec\xa5\xed\xa5\xee\xa5\xef\xa5\xf0" \
|
|
149
|
-
"\xa5\xf1\xa5\xf2\xa5\xf3\xa5\xf4\xa5\xf5" \
|
|
150
|
-
"\xa5\xf6" # [\xa5\xa1-\xa5\xf6]
|
|
151
|
-
KATA_EX = KATA +
|
|
152
|
-
"\xa1\xbc" +
|
|
153
|
-
"\xa1\xb5\xa1\xb6" # add onbiki and kanagaeshi(kata)
|
|
154
|
-
KANJI = "\xb0\xa1-\xb0\xfe" \
|
|
155
|
-
"\xb1\xa1-\xb1\xfe" \
|
|
156
|
-
"\xb2\xa1-\xb2\xfe" \
|
|
157
|
-
"\xb3\xa1-\xb3\xfe" \
|
|
158
|
-
"\xb4\xa1-\xb4\xfe" \
|
|
159
|
-
"\xb5\xa1-\xb5\xfe" \
|
|
160
|
-
"\xb6\xa1-\xb6\xfe" \
|
|
161
|
-
"\xb7\xa1-\xb7\xfe" \
|
|
162
|
-
"\xb8\xa1-\xb8\xfe" \
|
|
163
|
-
"\xb9\xa1-\xb9\xfe" \
|
|
164
|
-
"\xba\xa1-\xba\xfe" \
|
|
165
|
-
"\xbb\xa1-\xbb\xfe" \
|
|
166
|
-
"\xbc\xa1-\xbc\xfe" \
|
|
167
|
-
"\xbd\xa1-\xbd\xfe" \
|
|
168
|
-
"\xbe\xa1-\xbe\xfe" \
|
|
169
|
-
"\xbf\xa1-\xbf\xfe" \
|
|
170
|
-
"\xc0\xa1-\xc0\xfe" \
|
|
171
|
-
"\xc1\xa1-\xc1\xfe" \
|
|
172
|
-
"\xc2\xa1-\xc2\xfe" \
|
|
173
|
-
"\xc3\xa1-\xc3\xfe" \
|
|
174
|
-
"\xc4\xa1-\xc4\xfe" \
|
|
175
|
-
"\xc5\xa1-\xc5\xfe" \
|
|
176
|
-
"\xc6\xa1-\xc6\xfe" \
|
|
177
|
-
"\xc7\xa1-\xc7\xfe" \
|
|
178
|
-
"\xc8\xa1-\xc8\xfe" \
|
|
179
|
-
"\xc9\xa1-\xc9\xfe" \
|
|
180
|
-
"\xca\xa1-\xca\xfe" \
|
|
181
|
-
"\xcb\xa1-\xcb\xfe" \
|
|
182
|
-
"\xcc\xa1-\xcc\xfe" \
|
|
183
|
-
"\xcd\xa1-\xcd\xfe" \
|
|
184
|
-
"\xce\xa1-\xce\xfe" \
|
|
185
|
-
"\xcf\xa1-\xcf\xd3" \
|
|
186
|
-
"\xd0\xa1-\xd0\xfe" \
|
|
187
|
-
"\xd1\xa1-\xd1\xfe" \
|
|
188
|
-
"\xd2\xa1-\xd2\xfe" \
|
|
189
|
-
"\xd3\xa1-\xd3\xfe" \
|
|
190
|
-
"\xd4\xa1-\xd4\xfe" \
|
|
191
|
-
"\xd5\xa1-\xd5\xfe" \
|
|
192
|
-
"\xd6\xa1-\xd6\xfe" \
|
|
193
|
-
"\xd7\xa1-\xd7\xfe" \
|
|
194
|
-
"\xd8\xa1-\xd8\xfe" \
|
|
195
|
-
"\xd9\xa1-\xd9\xfe" \
|
|
196
|
-
"\xda\xa1-\xda\xfe" \
|
|
197
|
-
"\xdb\xa1-\xdb\xfe" \
|
|
198
|
-
"\xdc\xa1-\xdc\xfe" \
|
|
199
|
-
"\xdd\xa1-\xdd\xfe" \
|
|
200
|
-
"\xde\xa1-\xde\xfe" \
|
|
201
|
-
"\xdf\xa1-\xdf\xfe" \
|
|
202
|
-
"\xe0\xa1-\xe0\xfe" \
|
|
203
|
-
"\xe1\xa1-\xe1\xfe" \
|
|
204
|
-
"\xe2\xa1-\xe2\xfe" \
|
|
205
|
-
"\xe3\xa1-\xe3\xfe" \
|
|
206
|
-
"\xe4\xa1-\xe4\xfe" \
|
|
207
|
-
"\xe5\xa1-\xe5\xfe" \
|
|
208
|
-
"\xe6\xa1-\xe6\xfe" \
|
|
209
|
-
"\xe7\xa1-\xe7\xfe" \
|
|
210
|
-
"\xe8\xa1-\xe8\xfe" \
|
|
211
|
-
"\xe9\xa1-\xe9\xfe" \
|
|
212
|
-
"\xea\xa1-\xea\xfe" \
|
|
213
|
-
"\xeb\xa1-\xeb\xfe" \
|
|
214
|
-
"\xec\xa1-\xec\xfe" \
|
|
215
|
-
"\xed\xa1-\xed\xfe" \
|
|
216
|
-
"\xee\xa1-\xee\xfe" \
|
|
217
|
-
"\xef\xa1-\xef\xfe" \
|
|
218
|
-
"\xf0\xa1-\xf0\xfe" \
|
|
219
|
-
"\xf1\xa1-\xf1\xfe" \
|
|
220
|
-
"\xf2\xa1-\xf2\xfe" \
|
|
221
|
-
"\xf3\xa1-\xf3\xfe" \
|
|
222
|
-
"\xf4\xa1-\xf4\xa6"
|
|
223
|
-
KANJI_EX = KANJI + "\xa1\xb9" # + noma
|
|
224
|
-
JA_GRAPH = JA_ALNUM + JA_PUNCT + HIRA + KATA + KANJI
|
|
225
|
-
JA_PRINT = JA_GRAPH + JA_BLANK
|
|
245
|
+
WORD_REGEXP_SRC = [
|
|
246
|
+
"(?:[#{GRAPH}]+[#{BLANK}]?)",
|
|
247
|
+
"|(?:[#{SPACE}]+)",
|
|
248
|
+
"|(?:[#{KANJI_EX}]+[#{HIRA}]+)",
|
|
249
|
+
"|(?:[#{KATA_EX}]+[#{HIRA}]+)",
|
|
250
|
+
"|(?:[#{KANJI_EX}]+)",
|
|
251
|
+
"|(?:[#{KATA_EX}]+)",
|
|
252
|
+
"|(?:[#{HIRA_EX}]+)",
|
|
253
|
+
"|(?:.+?)",
|
|
254
|
+
].join
|
|
226
255
|
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
WORD_REGEXP_SRC = ["(?:[#{GRAPH}]+[#{BLANK}]?)",
|
|
232
|
-
"|(?:[#{SPACE}]+)",
|
|
233
|
-
"|(?:[#{KANJI_EX}]+[#{HIRA}]+)",
|
|
234
|
-
"|(?:[#{KATA_EX}]+[#{HIRA}]+)",
|
|
235
|
-
"|(?:[#{KANJI_EX}]+)",
|
|
236
|
-
"|(?:[#{KATA_EX}]+)",
|
|
237
|
-
"|(?:[#{HIRA_EX}]+)",
|
|
238
|
-
"|(?:.+?)"].join
|
|
239
|
-
|
|
240
|
-
CharString.register_encoding(self)
|
|
241
|
-
|
|
242
|
-
end # module EUCJP
|
|
243
|
-
end # module CharString
|
|
244
|
-
end # class DocDiff
|
|
256
|
+
CharString.register_encoding(self)
|
|
257
|
+
end
|
|
258
|
+
end
|
|
259
|
+
end
|