docdiff 0.6.5 → 0.6.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +7 -7
- data/Guardfile +4 -4
- data/Makefile +1 -1
- data/Rakefile +6 -6
- data/bin/docdiff +1 -1
- data/devutil/Rakefile +12 -5
- data/devutil/char_by_charclass.rb +43 -20
- data/devutil/charclass_by_char.rb +40 -19
- data/devutil/jis0208.rb +263 -231
- data/devutil/jis0208_test.rb +196 -0
- data/doc/news.md +8 -0
- data/docdiff.gemspec +12 -10
- data/lib/doc_diff.rb +59 -60
- data/lib/docdiff/charstring.rb +225 -241
- data/lib/docdiff/cli.rb +285 -250
- data/lib/docdiff/diff/contours.rb +1 -1
- data/lib/docdiff/diff/editscript.rb +1 -1
- data/lib/docdiff/diff/rcsdiff.rb +1 -1
- data/lib/docdiff/diff/shortestpath.rb +1 -1
- data/lib/docdiff/diff/speculative.rb +1 -1
- data/lib/docdiff/diff/subsequence.rb +1 -1
- data/lib/docdiff/diff/unidiff.rb +1 -1
- data/lib/docdiff/diff.rb +1 -1
- data/lib/docdiff/difference.rb +71 -70
- data/lib/docdiff/document.rb +129 -109
- data/lib/docdiff/encoding/en_ascii.rb +64 -58
- data/lib/docdiff/encoding/ja_eucjp.rb +250 -235
- data/lib/docdiff/encoding/ja_sjis.rb +240 -226
- data/lib/docdiff/encoding/ja_utf8.rb +6952 -6939
- data/lib/docdiff/version.rb +1 -1
- data/lib/docdiff/view.rb +522 -438
- data/lib/docdiff.rb +2 -2
- data/test/charstring_test.rb +475 -351
- data/test/cli_test.rb +103 -101
- data/test/diff_test.rb +15 -16
- data/test/difference_test.rb +40 -31
- data/test/docdiff_test.rb +162 -136
- data/test/document_test.rb +280 -175
- data/test/test_helper.rb +2 -1
- data/test/view_test.rb +636 -497
- metadata +8 -8
- data/devutil/testjis0208.rb +0 -38
|
@@ -1,235 +1,249 @@
|
|
|
1
1
|
# Japanese Shift_JIS encoding module for CharString
|
|
2
2
|
# 2003- Hisashi MORITA
|
|
3
3
|
|
|
4
|
-
# frozen_string_literal:
|
|
4
|
+
# frozen_string_literal: true
|
|
5
5
|
|
|
6
6
|
class DocDiff
|
|
7
|
-
module CharString
|
|
8
|
-
|
|
7
|
+
module CharString
|
|
8
|
+
module ShiftJIS
|
|
9
|
+
ENCODING = "Shift_JIS"
|
|
9
10
|
|
|
10
|
-
|
|
11
|
+
# character table based on:
|
|
12
|
+
# ftp://ftp.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS/JIS0208.TXT
|
|
11
13
|
|
|
12
|
-
|
|
13
|
-
|
|
14
|
+
CNTRL =
|
|
15
|
+
"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09" \
|
|
16
|
+
"\x0a\x0b\x0c\x0d\x0e\x0f\x10\x11\x12\x13" \
|
|
17
|
+
"\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d" \
|
|
18
|
+
"\x1e\x1f\x7f"
|
|
19
|
+
SPACE =
|
|
20
|
+
"\x09\x0a\x0b\x0c\x0d\x20"
|
|
21
|
+
BLANK =
|
|
22
|
+
"\x09\x20"
|
|
23
|
+
DIGIT =
|
|
24
|
+
"\x30\x31\x32\x33\x34\x35\x36\x37\x38\x39"
|
|
25
|
+
UPPER =
|
|
26
|
+
"\x41\x42\x43\x44\x45\x46\x47\x48\x49\x4a" \
|
|
27
|
+
"\x4b\x4c\x4d\x4e\x4f\x50\x51\x52\x53\x54" \
|
|
28
|
+
"\x55\x56\x57\x58\x59\x5a"
|
|
29
|
+
LOWER =
|
|
30
|
+
"\x61\x62\x63\x64\x65\x66\x67\x68\x69\x6a" \
|
|
31
|
+
"\x6b\x6c\x6d\x6e\x6f\x70\x71\x72\x73\x74" \
|
|
32
|
+
"\x75\x76\x77\x78\x79\x7a"
|
|
33
|
+
ALPHA = UPPER + LOWER
|
|
34
|
+
ALNUM = DIGIT + ALPHA
|
|
35
|
+
PUNCT =
|
|
36
|
+
Regexp.quote(
|
|
37
|
+
"\x21\x22\x23\x24\x25\x26\x27\x28\x29\x2a" \
|
|
38
|
+
"\x2b\x2c\x2d\x2e\x2f\x3a\x3b\x3c\x3d\x3e" \
|
|
39
|
+
"\x3f\x40\x5b\x5c\x5d\x5e\x5f\x60\x7b\x7c" \
|
|
40
|
+
"\x7d\x7e",
|
|
41
|
+
)
|
|
42
|
+
GRAPH = DIGIT + UPPER + LOWER + PUNCT
|
|
43
|
+
PRINT = "\x20" + GRAPH
|
|
44
|
+
XDIGIT =
|
|
45
|
+
DIGIT +
|
|
46
|
+
"\x41\x42\x43\x44\x45\x46\x61\x62\x63\x64" \
|
|
47
|
+
"\x65\x66"
|
|
48
|
+
JA_SPACE =
|
|
49
|
+
"\x81\x40"
|
|
50
|
+
JA_BLANK =
|
|
51
|
+
"\x81\x40"
|
|
52
|
+
JA_ALNUM =
|
|
53
|
+
"\x82\x4f\x82\x50\x82\x51\x82\x52\x82\x53" \
|
|
54
|
+
"\x82\x54\x82\x55\x82\x56\x82\x57\x82\x58" \
|
|
55
|
+
"\x82\x60\x82\x61\x82\x62\x82\x63\x82\x64" \
|
|
56
|
+
"\x82\x65\x82\x66\x82\x67\x82\x68\x82\x69" \
|
|
57
|
+
"\x82\x6a\x82\x6b\x82\x6c\x82\x6d\x82\x6e" \
|
|
58
|
+
"\x82\x6f\x82\x70\x82\x71\x82\x72\x82\x73" \
|
|
59
|
+
"\x82\x74\x82\x75\x82\x76\x82\x77\x82\x78" \
|
|
60
|
+
"\x82\x79\x82\x81\x82\x82\x82\x83\x82\x84" \
|
|
61
|
+
"\x82\x85\x82\x86\x82\x87\x82\x88\x82\x89" \
|
|
62
|
+
"\x82\x8a\x82\x8b\x82\x8c\x82\x8d\x82\x8e" \
|
|
63
|
+
"\x82\x8f\x82\x90\x82\x91\x82\x92\x82\x93" \
|
|
64
|
+
"\x82\x94\x82\x95\x82\x96\x82\x97\x82\x98" \
|
|
65
|
+
"\x82\x99\x82\x9a"
|
|
66
|
+
JA_PUNCT =
|
|
67
|
+
"\x81\x41\x81\x42\x81\x43\x81\x44\x81\x45\x81\x46" \
|
|
68
|
+
"\x81\x47\x81\x48\x81\x49\x81\x4a\x81\x4b\x81\x4c" \
|
|
69
|
+
"\x81\x4d\x81\x4e\x81\x4f\x81\x50\x81\x51\x81\x52" \
|
|
70
|
+
"\x81\x53\x81\x54\x81\x55\x81\x56\x81\x57\x81\x58" \
|
|
71
|
+
"\x81\x59\x81\x5a\x81\x5b\x81\x5c\x81\x5d\x81\x5e" \
|
|
72
|
+
"\x81\x5f\x81\x60\x81\x61\x81\x62\x81\x63\x81\x64" \
|
|
73
|
+
"\x81\x65\x81\x66\x81\x67\x81\x68\x81\x69\x81\x6a" \
|
|
74
|
+
"\x81\x6b\x81\x6c\x81\x6d\x81\x6e\x81\x6f\x81\x70" \
|
|
75
|
+
"\x81\x71\x81\x72\x81\x73\x81\x74\x81\x75\x81\x76" \
|
|
76
|
+
"\x81\x77\x81\x78\x81\x79\x81\x7a\x81\x7b\x81\x7c" \
|
|
77
|
+
"\x81\x7d\x81\x7e\x81\x80\x81\x81\x81\x82\x81\x83" \
|
|
78
|
+
"\x81\x84\x81\x85\x81\x86\x81\x87\x81\x88\x81\x89" \
|
|
79
|
+
"\x81\x8a\x81\x8b\x81\x8c\x81\x8d\x81\x8e\x81\x8f" \
|
|
80
|
+
"\x81\x90\x81\x91\x81\x92\x81\x93\x81\x94\x81\x95" \
|
|
81
|
+
"\x81\x96\x81\x97\x81\x98\x81\x99\x81\x9a\x81\x9b" \
|
|
82
|
+
"\x81\x9c\x81\x9d\x81\x9e\x81\x9f\x81\xa0\x81\xa1" \
|
|
83
|
+
"\x81\xa2\x81\xa3\x81\xa4\x81\xa5\x81\xa6\x81\xa7" \
|
|
84
|
+
"\x81\xa8\x81\xa9\x81\xaa\x81\xab\x81\xac\x81\xb8" \
|
|
85
|
+
"\x81\xb9\x81\xba\x81\xbb\x81\xbc\x81\xbd\x81\xbe" \
|
|
86
|
+
"\x81\xbf\x81\xc8\x81\xc9\x81\xca\x81\xcb\x81\xcc" \
|
|
87
|
+
"\x81\xcd\x81\xce\x81\xda\x81\xdb\x81\xdc\x81\xdd" \
|
|
88
|
+
"\x81\xde\x81\xdf\x81\xe0\x81\xe1\x81\xe2\x81\xe3" \
|
|
89
|
+
"\x81\xe4\x81\xe5\x81\xe6\x81\xe7\x81\xe8\x81\xf0" \
|
|
90
|
+
"\x81\xf1\x81\xf2\x81\xf3\x81\xf4\x81\xf5\x81\xf6" \
|
|
91
|
+
"\x81\xf7\x81\xfc\x83\x9f\x83\xa0\x83\xa1\x83\xa2" \
|
|
92
|
+
"\x83\xa3\x83\xa4\x83\xa5\x83\xa6\x83\xa7\x83\xa8" \
|
|
93
|
+
"\x83\xa9\x83\xaa\x83\xab\x83\xac\x83\xad\x83\xae" \
|
|
94
|
+
"\x83\xaf\x83\xb0\x83\xb1\x83\xb2\x83\xb3\x83\xb4" \
|
|
95
|
+
"\x83\xb5\x83\xb6\x83\xbf\x83\xc0\x83\xc1\x83\xc2" \
|
|
96
|
+
"\x83\xc3\x83\xc4\x83\xc5\x83\xc6\x83\xc7\x83\xc8" \
|
|
97
|
+
"\x83\xc9\x83\xca\x83\xcb\x83\xcc\x83\xcd\x83\xce" \
|
|
98
|
+
"\x83\xcf\x83\xd0\x83\xd1\x83\xd2\x83\xd3\x83\xd4" \
|
|
99
|
+
"\x83\xd5\x83\xd6\x84\x40\x84\x41\x84\x42\x84\x43" \
|
|
100
|
+
"\x84\x44\x84\x45\x84\x46\x84\x47\x84\x48\x84\x49" \
|
|
101
|
+
"\x84\x4a\x84\x4b\x84\x4c\x84\x4d\x84\x4e\x84\x4f" \
|
|
102
|
+
"\x84\x50\x84\x51\x84\x52\x84\x53\x84\x54\x84\x55" \
|
|
103
|
+
"\x84\x56\x84\x57\x84\x58\x84\x59\x84\x5a\x84\x5b" \
|
|
104
|
+
"\x84\x5c\x84\x5d\x84\x5e\x84\x5f\x84\x60\x84\x70" \
|
|
105
|
+
"\x84\x71\x84\x72\x84\x73\x84\x74\x84\x75\x84\x76" \
|
|
106
|
+
"\x84\x77\x84\x78\x84\x79\x84\x7a\x84\x7b\x84\x7c" \
|
|
107
|
+
"\x84\x7d\x84\x7e\x84\x80\x84\x81\x84\x82\x84\x83" \
|
|
108
|
+
"\x84\x84\x84\x85\x84\x86\x84\x87\x84\x88\x84\x89" \
|
|
109
|
+
"\x84\x8a\x84\x8b\x84\x8c\x84\x8d\x84\x8e\x84\x8f" \
|
|
110
|
+
"\x84\x90\x84\x91\x84\x9f\x84\xa0\x84\xa1\x84\xa2" \
|
|
111
|
+
"\x84\xa3\x84\xa4\x84\xa5\x84\xa6\x84\xa7\x84\xa8" \
|
|
112
|
+
"\x84\xa9\x84\xaa\x84\xab\x84\xac\x84\xad\x84\xae" \
|
|
113
|
+
"\x84\xaf\x84\xb0\x84\xb1\x84\xb2\x84\xb3\x84\xb4" \
|
|
114
|
+
"\x84\xb5\x84\xb6\x84\xb7\x84\xb8\x84\xb9\x84\xba" \
|
|
115
|
+
"\x84\xbb\x84\xbc\x84\xbd\x84\xbe"
|
|
116
|
+
HIRA =
|
|
117
|
+
"\x82\x9f\x82\xa0\x82\xa1\x82\xa2\x82\xa3" \
|
|
118
|
+
"\x82\xa4\x82\xa5\x82\xa6\x82\xa7\x82\xa8" \
|
|
119
|
+
"\x82\xa9\x82\xaa\x82\xab\x82\xac\x82\xad" \
|
|
120
|
+
"\x82\xae\x82\xaf\x82\xb0\x82\xb1\x82\xb2" \
|
|
121
|
+
"\x82\xb3\x82\xb4\x82\xb5\x82\xb6\x82\xb7" \
|
|
122
|
+
"\x82\xb8\x82\xb9\x82\xba\x82\xbb\x82\xbc" \
|
|
123
|
+
"\x82\xbd\x82\xbe\x82\xbf\x82\xc0\x82\xc1" \
|
|
124
|
+
"\x82\xc2\x82\xc3\x82\xc4\x82\xc5\x82\xc6" \
|
|
125
|
+
"\x82\xc7\x82\xc8\x82\xc9\x82\xca\x82\xcb" \
|
|
126
|
+
"\x82\xcc\x82\xcd\x82\xce\x82\xcf\x82\xd0" \
|
|
127
|
+
"\x82\xd1\x82\xd2\x82\xd3\x82\xd4\x82\xd5" \
|
|
128
|
+
"\x82\xd6\x82\xd7\x82\xd8\x82\xd9\x82\xda" \
|
|
129
|
+
"\x82\xdb\x82\xdc\x82\xdd\x82\xde\x82\xdf" \
|
|
130
|
+
"\x82\xe0\x82\xe1\x82\xe2\x82\xe3\x82\xe4" \
|
|
131
|
+
"\x82\xe5\x82\xe6\x82\xe7\x82\xe8\x82\xe9" \
|
|
132
|
+
"\x82\xea\x82\xeb\x82\xec\x82\xed\x82\xee" \
|
|
133
|
+
"\x82\xef\x82\xf0\x82\xf1" # [\x82\xf1-\x82\xf1]
|
|
134
|
+
HIRA_EX =
|
|
135
|
+
HIRA +
|
|
136
|
+
"\x81\x5b" \
|
|
137
|
+
"\x81\x52\x81\x53" # add onbiki and kanagaeshi(hira)
|
|
138
|
+
KATA =
|
|
139
|
+
"\x83\x40\x83\x41\x83\x42\x83\x43\x83\x44" \
|
|
140
|
+
"\x83\x45\x83\x46\x83\x47\x83\x48\x83\x49" \
|
|
141
|
+
"\x83\x4a\x83\x4b\x83\x4c\x83\x4d\x83\x4e" \
|
|
142
|
+
"\x83\x4f\x83\x50\x83\x51\x83\x52\x83\x53" \
|
|
143
|
+
"\x83\x54\x83\x55\x83\x56\x83\x57\x83\x58" \
|
|
144
|
+
"\x83\x59\x83\x5a\x83\x5b\x83\x5c\x83\x5d" \
|
|
145
|
+
"\x83\x5e\x83\x5f\x83\x60\x83\x61\x83\x62" \
|
|
146
|
+
"\x83\x63\x83\x64\x83\x65\x83\x66\x83\x67" \
|
|
147
|
+
"\x83\x68\x83\x69\x83\x6a\x83\x6b\x83\x6c" \
|
|
148
|
+
"\x83\x6d\x83\x6e\x83\x6f\x83\x70\x83\x71" \
|
|
149
|
+
"\x83\x72\x83\x73\x83\x74\x83\x75\x83\x76" \
|
|
150
|
+
"\x83\x77\x83\x78\x83\x79\x83\x7a\x83\x7b" \
|
|
151
|
+
"\x83\x7c\x83\x7d\x83\x7e\x83\x80\x83\x81" \
|
|
152
|
+
"\x83\x82\x83\x83\x83\x84\x83\x85\x83\x86" \
|
|
153
|
+
"\x83\x87\x83\x88\x83\x89\x83\x8a\x83\x8b" \
|
|
154
|
+
"\x83\x8c\x83\x8d\x83\x8e\x83\x8f\x83\x90" \
|
|
155
|
+
"\x83\x91\x83\x92\x83\x93\x83\x94\x83\x95" \
|
|
156
|
+
"\x83\x96" # [\x83\x40-\x83\x7e\x83\x80-\x83\x96] # Note that \x83\x7f is excluded.
|
|
157
|
+
KATA_EX =
|
|
158
|
+
KATA +
|
|
159
|
+
"\x81\x5b" \
|
|
160
|
+
"\x81\x54\x81\x55" # add onbiki and kanagaeshi(kata)
|
|
161
|
+
KANJI =
|
|
162
|
+
"\x88\x9f-\x88\xfc" \
|
|
163
|
+
"\x89\x40-\x89\x9e" \
|
|
164
|
+
"\x89\x9f-\x89\xfc" \
|
|
165
|
+
"\x8a\x40-\x8a\x9e" \
|
|
166
|
+
"\x8a\x9f-\x8a\xfc" \
|
|
167
|
+
"\x8b\x40-\x8b\x9e" \
|
|
168
|
+
"\x8b\x9f-\x8b\xfc" \
|
|
169
|
+
"\x8c\x40-\x8c\x9e" \
|
|
170
|
+
"\x8c\x9f-\x8c\xfc" \
|
|
171
|
+
"\x8d\x40-\x8d\x9e" \
|
|
172
|
+
"\x8d\x9f-\x8d\xfc" \
|
|
173
|
+
"\x8e\x40-\x8e\x9e" \
|
|
174
|
+
"\x8e\x9f-\x8e\xfc" \
|
|
175
|
+
"\x8f\x40-\x8f\x9e" \
|
|
176
|
+
"\x8f\x9f-\x8f\xfc" \
|
|
177
|
+
"\x90\x40-\x90\x9e" \
|
|
178
|
+
"\x90\x9f-\x90\xfc" \
|
|
179
|
+
"\x91\x40-\x91\x9e" \
|
|
180
|
+
"\x91\x9f-\x91\xfc" \
|
|
181
|
+
"\x92\x40-\x92\x9e" \
|
|
182
|
+
"\x92\x9f-\x92\xfc" \
|
|
183
|
+
"\x93\x40-\x93\x9e" \
|
|
184
|
+
"\x93\x9f-\x93\xfc" \
|
|
185
|
+
"\x94\x40-\x94\x9e" \
|
|
186
|
+
"\x94\x9f-\x94\xfc" \
|
|
187
|
+
"\x95\x40-\x95\x9e" \
|
|
188
|
+
"\x95\x9f-\x95\xfc" \
|
|
189
|
+
"\x96\x40-\x96\x9e" \
|
|
190
|
+
"\x96\x9f-\x96\xfc" \
|
|
191
|
+
"\x97\x40-\x97\x9e" \
|
|
192
|
+
"\x97\x9f-\x97\xfc" \
|
|
193
|
+
"\x98\x40-\x98\x72" \
|
|
194
|
+
"\x98\x9f-\x98\xfc" \
|
|
195
|
+
"\x99\x40-\x99\x9e" \
|
|
196
|
+
"\x99\x9f-\x99\xfc" \
|
|
197
|
+
"\x9a\x40-\x9a\x9e" \
|
|
198
|
+
"\x9a\x9f-\x9a\xfc" \
|
|
199
|
+
"\x9b\x40-\x9b\x9e" \
|
|
200
|
+
"\x9b\x9f-\x9b\xfc" \
|
|
201
|
+
"\x9c\x40-\x9c\x9e" \
|
|
202
|
+
"\x9c\x9f-\x9c\xfc" \
|
|
203
|
+
"\x9d\x40-\x9d\x9e" \
|
|
204
|
+
"\x9d\x9f-\x9d\xfc" \
|
|
205
|
+
"\x9e\x40-\x9e\x9e" \
|
|
206
|
+
"\x9e\x9f-\x9e\xfc" \
|
|
207
|
+
"\x9f\x40-\x9f\x9e" \
|
|
208
|
+
"\x9f\x9f-\x9f\xfc" \
|
|
209
|
+
"\xe0\x40-\xe0\x9e" \
|
|
210
|
+
"\xe0\x9f-\xe0\xfc" \
|
|
211
|
+
"\xe1\x40-\xe1\x9e" \
|
|
212
|
+
"\xe1\x9f-\xe1\xfc" \
|
|
213
|
+
"\xe2\x40-\xe2\x9e" \
|
|
214
|
+
"\xe2\x9f-\xe2\xfc" \
|
|
215
|
+
"\xe3\x40-\xe3\x9e" \
|
|
216
|
+
"\xe3\x9f-\xe3\xfc" \
|
|
217
|
+
"\xe4\x40-\xe4\x9e" \
|
|
218
|
+
"\xe4\x9f-\xe4\xfc" \
|
|
219
|
+
"\xe5\x40-\xe5\x9e" \
|
|
220
|
+
"\xe5\x9f-\xe5\xfc" \
|
|
221
|
+
"\xe6\x40-\xe6\x9e" \
|
|
222
|
+
"\xe6\x9f-\xe6\xfc" \
|
|
223
|
+
"\xe7\x40-\xe7\x9e" \
|
|
224
|
+
"\xe7\x9f-\xe7\xfc" \
|
|
225
|
+
"\xe8\x40-\xe8\x9e" \
|
|
226
|
+
"\xe8\x9f-\xe8\xfc" \
|
|
227
|
+
"\xe9\x40-\xe9\x9e" \
|
|
228
|
+
"\xe9\x9f-\xe9\xfc" \
|
|
229
|
+
"\xea\x40-\xea\x9e" \
|
|
230
|
+
"\xea\x9f-\xea\xa4"
|
|
231
|
+
KANJI_EX = KANJI + "\x81\x58" # + noma
|
|
232
|
+
JA_GRAPH = JA_ALNUM + JA_PUNCT + HIRA + KATA + KANJI
|
|
233
|
+
JA_PRINT = JA_GRAPH + JA_BLANK
|
|
14
234
|
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
LOWER = "\x61\x62\x63\x64\x65\x66\x67\x68\x69\x6a" \
|
|
26
|
-
"\x6b\x6c\x6d\x6e\x6f\x70\x71\x72\x73\x74" \
|
|
27
|
-
"\x75\x76\x77\x78\x79\x7a"
|
|
28
|
-
ALPHA = UPPER + LOWER
|
|
29
|
-
ALNUM = DIGIT + ALPHA
|
|
30
|
-
PUNCT = "\x21\x22\x23\x24\x25\x26\x27\x28\x29\x2a" \
|
|
31
|
-
"\x2b\x2c\x2d\x2e\x2f\x3a\x3b\x3c\x3d\x3e" \
|
|
32
|
-
"\x3f\x40\x5b\x5c\x5d\x5e\x5f\x60\x7b\x7c" \
|
|
33
|
-
"\x7d\x7e"
|
|
34
|
-
GRAPH = DIGIT + UPPER + LOWER + PUNCT
|
|
35
|
-
PRINT = "\x20" + GRAPH
|
|
36
|
-
XDIGIT = DIGIT +
|
|
37
|
-
"\x41\x42\x43\x44\x45\x46\x61\x62\x63\x64" \
|
|
38
|
-
"\x65\x66"
|
|
39
|
-
JA_SPACE = "\x81\x40"
|
|
40
|
-
JA_BLANK = "\x81\x40"
|
|
41
|
-
JA_ALNUM = "\x82\x4f\x82\x50\x82\x51\x82\x52\x82\x53" \
|
|
42
|
-
"\x82\x54\x82\x55\x82\x56\x82\x57\x82\x58" \
|
|
43
|
-
"\x82\x60\x82\x61\x82\x62\x82\x63\x82\x64" \
|
|
44
|
-
"\x82\x65\x82\x66\x82\x67\x82\x68\x82\x69" \
|
|
45
|
-
"\x82\x6a\x82\x6b\x82\x6c\x82\x6d\x82\x6e" \
|
|
46
|
-
"\x82\x6f\x82\x70\x82\x71\x82\x72\x82\x73" \
|
|
47
|
-
"\x82\x74\x82\x75\x82\x76\x82\x77\x82\x78" \
|
|
48
|
-
"\x82\x79\x82\x81\x82\x82\x82\x83\x82\x84" \
|
|
49
|
-
"\x82\x85\x82\x86\x82\x87\x82\x88\x82\x89" \
|
|
50
|
-
"\x82\x8a\x82\x8b\x82\x8c\x82\x8d\x82\x8e" \
|
|
51
|
-
"\x82\x8f\x82\x90\x82\x91\x82\x92\x82\x93" \
|
|
52
|
-
"\x82\x94\x82\x95\x82\x96\x82\x97\x82\x98" \
|
|
53
|
-
"\x82\x99\x82\x9a"
|
|
54
|
-
JA_PUNCT = "\x81\x41\x81\x42\x81\x43\x81\x44\x81\x45\x81\x46" \
|
|
55
|
-
"\x81\x47\x81\x48\x81\x49\x81\x4a\x81\x4b\x81\x4c" \
|
|
56
|
-
"\x81\x4d\x81\x4e\x81\x4f\x81\x50\x81\x51\x81\x52" \
|
|
57
|
-
"\x81\x53\x81\x54\x81\x55\x81\x56\x81\x57\x81\x58" \
|
|
58
|
-
"\x81\x59\x81\x5a\x81\x5b\x81\x5c\x81\x5d\x81\x5e" \
|
|
59
|
-
"\x81\x5f\x81\x60\x81\x61\x81\x62\x81\x63\x81\x64" \
|
|
60
|
-
"\x81\x65\x81\x66\x81\x67\x81\x68\x81\x69\x81\x6a" \
|
|
61
|
-
"\x81\x6b\x81\x6c\x81\x6d\x81\x6e\x81\x6f\x81\x70" \
|
|
62
|
-
"\x81\x71\x81\x72\x81\x73\x81\x74\x81\x75\x81\x76" \
|
|
63
|
-
"\x81\x77\x81\x78\x81\x79\x81\x7a\x81\x7b\x81\x7c" \
|
|
64
|
-
"\x81\x7d\x81\x7e\x81\x80\x81\x81\x81\x82\x81\x83" \
|
|
65
|
-
"\x81\x84\x81\x85\x81\x86\x81\x87\x81\x88\x81\x89" \
|
|
66
|
-
"\x81\x8a\x81\x8b\x81\x8c\x81\x8d\x81\x8e\x81\x8f" \
|
|
67
|
-
"\x81\x90\x81\x91\x81\x92\x81\x93\x81\x94\x81\x95" \
|
|
68
|
-
"\x81\x96\x81\x97\x81\x98\x81\x99\x81\x9a\x81\x9b" \
|
|
69
|
-
"\x81\x9c\x81\x9d\x81\x9e\x81\x9f\x81\xa0\x81\xa1" \
|
|
70
|
-
"\x81\xa2\x81\xa3\x81\xa4\x81\xa5\x81\xa6\x81\xa7" \
|
|
71
|
-
"\x81\xa8\x81\xa9\x81\xaa\x81\xab\x81\xac\x81\xb8" \
|
|
72
|
-
"\x81\xb9\x81\xba\x81\xbb\x81\xbc\x81\xbd\x81\xbe" \
|
|
73
|
-
"\x81\xbf\x81\xc8\x81\xc9\x81\xca\x81\xcb\x81\xcc" \
|
|
74
|
-
"\x81\xcd\x81\xce\x81\xda\x81\xdb\x81\xdc\x81\xdd" \
|
|
75
|
-
"\x81\xde\x81\xdf\x81\xe0\x81\xe1\x81\xe2\x81\xe3" \
|
|
76
|
-
"\x81\xe4\x81\xe5\x81\xe6\x81\xe7\x81\xe8\x81\xf0" \
|
|
77
|
-
"\x81\xf1\x81\xf2\x81\xf3\x81\xf4\x81\xf5\x81\xf6" \
|
|
78
|
-
"\x81\xf7\x81\xfc\x83\x9f\x83\xa0\x83\xa1\x83\xa2" \
|
|
79
|
-
"\x83\xa3\x83\xa4\x83\xa5\x83\xa6\x83\xa7\x83\xa8" \
|
|
80
|
-
"\x83\xa9\x83\xaa\x83\xab\x83\xac\x83\xad\x83\xae" \
|
|
81
|
-
"\x83\xaf\x83\xb0\x83\xb1\x83\xb2\x83\xb3\x83\xb4" \
|
|
82
|
-
"\x83\xb5\x83\xb6\x83\xbf\x83\xc0\x83\xc1\x83\xc2" \
|
|
83
|
-
"\x83\xc3\x83\xc4\x83\xc5\x83\xc6\x83\xc7\x83\xc8" \
|
|
84
|
-
"\x83\xc9\x83\xca\x83\xcb\x83\xcc\x83\xcd\x83\xce" \
|
|
85
|
-
"\x83\xcf\x83\xd0\x83\xd1\x83\xd2\x83\xd3\x83\xd4" \
|
|
86
|
-
"\x83\xd5\x83\xd6\x84\x40\x84\x41\x84\x42\x84\x43" \
|
|
87
|
-
"\x84\x44\x84\x45\x84\x46\x84\x47\x84\x48\x84\x49" \
|
|
88
|
-
"\x84\x4a\x84\x4b\x84\x4c\x84\x4d\x84\x4e\x84\x4f" \
|
|
89
|
-
"\x84\x50\x84\x51\x84\x52\x84\x53\x84\x54\x84\x55" \
|
|
90
|
-
"\x84\x56\x84\x57\x84\x58\x84\x59\x84\x5a\x84\x5b" \
|
|
91
|
-
"\x84\x5c\x84\x5d\x84\x5e\x84\x5f\x84\x60\x84\x70" \
|
|
92
|
-
"\x84\x71\x84\x72\x84\x73\x84\x74\x84\x75\x84\x76" \
|
|
93
|
-
"\x84\x77\x84\x78\x84\x79\x84\x7a\x84\x7b\x84\x7c" \
|
|
94
|
-
"\x84\x7d\x84\x7e\x84\x80\x84\x81\x84\x82\x84\x83" \
|
|
95
|
-
"\x84\x84\x84\x85\x84\x86\x84\x87\x84\x88\x84\x89" \
|
|
96
|
-
"\x84\x8a\x84\x8b\x84\x8c\x84\x8d\x84\x8e\x84\x8f" \
|
|
97
|
-
"\x84\x90\x84\x91\x84\x9f\x84\xa0\x84\xa1\x84\xa2" \
|
|
98
|
-
"\x84\xa3\x84\xa4\x84\xa5\x84\xa6\x84\xa7\x84\xa8" \
|
|
99
|
-
"\x84\xa9\x84\xaa\x84\xab\x84\xac\x84\xad\x84\xae" \
|
|
100
|
-
"\x84\xaf\x84\xb0\x84\xb1\x84\xb2\x84\xb3\x84\xb4" \
|
|
101
|
-
"\x84\xb5\x84\xb6\x84\xb7\x84\xb8\x84\xb9\x84\xba" \
|
|
102
|
-
"\x84\xbb\x84\xbc\x84\xbd\x84\xbe"
|
|
103
|
-
HIRA = "\x82\x9f\x82\xa0\x82\xa1\x82\xa2\x82\xa3" \
|
|
104
|
-
"\x82\xa4\x82\xa5\x82\xa6\x82\xa7\x82\xa8" \
|
|
105
|
-
"\x82\xa9\x82\xaa\x82\xab\x82\xac\x82\xad" \
|
|
106
|
-
"\x82\xae\x82\xaf\x82\xb0\x82\xb1\x82\xb2" \
|
|
107
|
-
"\x82\xb3\x82\xb4\x82\xb5\x82\xb6\x82\xb7" \
|
|
108
|
-
"\x82\xb8\x82\xb9\x82\xba\x82\xbb\x82\xbc" \
|
|
109
|
-
"\x82\xbd\x82\xbe\x82\xbf\x82\xc0\x82\xc1" \
|
|
110
|
-
"\x82\xc2\x82\xc3\x82\xc4\x82\xc5\x82\xc6" \
|
|
111
|
-
"\x82\xc7\x82\xc8\x82\xc9\x82\xca\x82\xcb" \
|
|
112
|
-
"\x82\xcc\x82\xcd\x82\xce\x82\xcf\x82\xd0" \
|
|
113
|
-
"\x82\xd1\x82\xd2\x82\xd3\x82\xd4\x82\xd5" \
|
|
114
|
-
"\x82\xd6\x82\xd7\x82\xd8\x82\xd9\x82\xda" \
|
|
115
|
-
"\x82\xdb\x82\xdc\x82\xdd\x82\xde\x82\xdf" \
|
|
116
|
-
"\x82\xe0\x82\xe1\x82\xe2\x82\xe3\x82\xe4" \
|
|
117
|
-
"\x82\xe5\x82\xe6\x82\xe7\x82\xe8\x82\xe9" \
|
|
118
|
-
"\x82\xea\x82\xeb\x82\xec\x82\xed\x82\xee" \
|
|
119
|
-
"\x82\xef\x82\xf0\x82\xf1" # [\x82\xf1-\x82\xf1]
|
|
120
|
-
HIRA_EX = HIRA +
|
|
121
|
-
"\x81\x5b" +
|
|
122
|
-
"\x81\x52\x81\x53" # add onbiki and kanagaeshi(hira)
|
|
123
|
-
KATA = "\x83\x40\x83\x41\x83\x42\x83\x43\x83\x44" \
|
|
124
|
-
"\x83\x45\x83\x46\x83\x47\x83\x48\x83\x49" \
|
|
125
|
-
"\x83\x4a\x83\x4b\x83\x4c\x83\x4d\x83\x4e" \
|
|
126
|
-
"\x83\x4f\x83\x50\x83\x51\x83\x52\x83\x53" \
|
|
127
|
-
"\x83\x54\x83\x55\x83\x56\x83\x57\x83\x58" \
|
|
128
|
-
"\x83\x59\x83\x5a\x83\x5b\x83\x5c\x83\x5d" \
|
|
129
|
-
"\x83\x5e\x83\x5f\x83\x60\x83\x61\x83\x62" \
|
|
130
|
-
"\x83\x63\x83\x64\x83\x65\x83\x66\x83\x67" \
|
|
131
|
-
"\x83\x68\x83\x69\x83\x6a\x83\x6b\x83\x6c" \
|
|
132
|
-
"\x83\x6d\x83\x6e\x83\x6f\x83\x70\x83\x71" \
|
|
133
|
-
"\x83\x72\x83\x73\x83\x74\x83\x75\x83\x76" \
|
|
134
|
-
"\x83\x77\x83\x78\x83\x79\x83\x7a\x83\x7b" \
|
|
135
|
-
"\x83\x7c\x83\x7d\x83\x7e\x83\x80\x83\x81" \
|
|
136
|
-
"\x83\x82\x83\x83\x83\x84\x83\x85\x83\x86" \
|
|
137
|
-
"\x83\x87\x83\x88\x83\x89\x83\x8a\x83\x8b" \
|
|
138
|
-
"\x83\x8c\x83\x8d\x83\x8e\x83\x8f\x83\x90" \
|
|
139
|
-
"\x83\x91\x83\x92\x83\x93\x83\x94\x83\x95" \
|
|
140
|
-
"\x83\x96" # [\x83\x40-\x83\x7e\x83\x80-\x83\x96]
|
|
141
|
-
# Note that \x83\x7f is excluded.
|
|
142
|
-
KATA_EX = KATA +
|
|
143
|
-
"\x81\x5b" +
|
|
144
|
-
"\x81\x54\x81\x55" # add onbiki and kanagaeshi(kata)
|
|
145
|
-
KANJI = "\x88\x9f-\x88\xfc" \
|
|
146
|
-
"\x89\x40-\x89\x9e" \
|
|
147
|
-
"\x89\x9f-\x89\xfc" \
|
|
148
|
-
"\x8a\x40-\x8a\x9e" \
|
|
149
|
-
"\x8a\x9f-\x8a\xfc" \
|
|
150
|
-
"\x8b\x40-\x8b\x9e" \
|
|
151
|
-
"\x8b\x9f-\x8b\xfc" \
|
|
152
|
-
"\x8c\x40-\x8c\x9e" \
|
|
153
|
-
"\x8c\x9f-\x8c\xfc" \
|
|
154
|
-
"\x8d\x40-\x8d\x9e" \
|
|
155
|
-
"\x8d\x9f-\x8d\xfc" \
|
|
156
|
-
"\x8e\x40-\x8e\x9e" \
|
|
157
|
-
"\x8e\x9f-\x8e\xfc" \
|
|
158
|
-
"\x8f\x40-\x8f\x9e" \
|
|
159
|
-
"\x8f\x9f-\x8f\xfc" \
|
|
160
|
-
"\x90\x40-\x90\x9e" \
|
|
161
|
-
"\x90\x9f-\x90\xfc" \
|
|
162
|
-
"\x91\x40-\x91\x9e" \
|
|
163
|
-
"\x91\x9f-\x91\xfc" \
|
|
164
|
-
"\x92\x40-\x92\x9e" \
|
|
165
|
-
"\x92\x9f-\x92\xfc" \
|
|
166
|
-
"\x93\x40-\x93\x9e" \
|
|
167
|
-
"\x93\x9f-\x93\xfc" \
|
|
168
|
-
"\x94\x40-\x94\x9e" \
|
|
169
|
-
"\x94\x9f-\x94\xfc" \
|
|
170
|
-
"\x95\x40-\x95\x9e" \
|
|
171
|
-
"\x95\x9f-\x95\xfc" \
|
|
172
|
-
"\x96\x40-\x96\x9e" \
|
|
173
|
-
"\x96\x9f-\x96\xfc" \
|
|
174
|
-
"\x97\x40-\x97\x9e" \
|
|
175
|
-
"\x97\x9f-\x97\xfc" \
|
|
176
|
-
"\x98\x40-\x98\x72" \
|
|
177
|
-
"\x98\x9f-\x98\xfc" \
|
|
178
|
-
"\x99\x40-\x99\x9e" \
|
|
179
|
-
"\x99\x9f-\x99\xfc" \
|
|
180
|
-
"\x9a\x40-\x9a\x9e" \
|
|
181
|
-
"\x9a\x9f-\x9a\xfc" \
|
|
182
|
-
"\x9b\x40-\x9b\x9e" \
|
|
183
|
-
"\x9b\x9f-\x9b\xfc" \
|
|
184
|
-
"\x9c\x40-\x9c\x9e" \
|
|
185
|
-
"\x9c\x9f-\x9c\xfc" \
|
|
186
|
-
"\x9d\x40-\x9d\x9e" \
|
|
187
|
-
"\x9d\x9f-\x9d\xfc" \
|
|
188
|
-
"\x9e\x40-\x9e\x9e" \
|
|
189
|
-
"\x9e\x9f-\x9e\xfc" \
|
|
190
|
-
"\x9f\x40-\x9f\x9e" \
|
|
191
|
-
"\x9f\x9f-\x9f\xfc" \
|
|
192
|
-
"\xe0\x40-\xe0\x9e" \
|
|
193
|
-
"\xe0\x9f-\xe0\xfc" \
|
|
194
|
-
"\xe1\x40-\xe1\x9e" \
|
|
195
|
-
"\xe1\x9f-\xe1\xfc" \
|
|
196
|
-
"\xe2\x40-\xe2\x9e" \
|
|
197
|
-
"\xe2\x9f-\xe2\xfc" \
|
|
198
|
-
"\xe3\x40-\xe3\x9e" \
|
|
199
|
-
"\xe3\x9f-\xe3\xfc" \
|
|
200
|
-
"\xe4\x40-\xe4\x9e" \
|
|
201
|
-
"\xe4\x9f-\xe4\xfc" \
|
|
202
|
-
"\xe5\x40-\xe5\x9e" \
|
|
203
|
-
"\xe5\x9f-\xe5\xfc" \
|
|
204
|
-
"\xe6\x40-\xe6\x9e" \
|
|
205
|
-
"\xe6\x9f-\xe6\xfc" \
|
|
206
|
-
"\xe7\x40-\xe7\x9e" \
|
|
207
|
-
"\xe7\x9f-\xe7\xfc" \
|
|
208
|
-
"\xe8\x40-\xe8\x9e" \
|
|
209
|
-
"\xe8\x9f-\xe8\xfc" \
|
|
210
|
-
"\xe9\x40-\xe9\x9e" \
|
|
211
|
-
"\xe9\x9f-\xe9\xfc" \
|
|
212
|
-
"\xea\x40-\xea\x9e" \
|
|
213
|
-
"\xea\x9f-\xea\xa4"
|
|
214
|
-
KANJI_EX = KANJI + "\x81\x58" # + noma
|
|
215
|
-
JA_GRAPH = JA_ALNUM + JA_PUNCT + HIRA + KATA + KANJI
|
|
216
|
-
JA_PRINT = JA_GRAPH + JA_BLANK
|
|
235
|
+
WORD_REGEXP_SRC = [
|
|
236
|
+
"(?:[#{GRAPH}]+[#{BLANK}]?)",
|
|
237
|
+
"|(?:[#{SPACE}]+)",
|
|
238
|
+
"|(?:[#{KANJI_EX}]+[#{HIRA}]+)",
|
|
239
|
+
"|(?:[#{KATA_EX}]+[#{HIRA}]+)",
|
|
240
|
+
"|(?:[#{KANJI_EX}]+)",
|
|
241
|
+
"|(?:[#{KATA_EX}]+)",
|
|
242
|
+
"|(?:[#{HIRA_EX}]+)",
|
|
243
|
+
"|(?:.+?)",
|
|
244
|
+
].join
|
|
217
245
|
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
WORD_REGEXP_SRC = ["(?:[#{GRAPH}]+[#{BLANK}]?)",
|
|
223
|
-
"|(?:[#{SPACE}]+)",
|
|
224
|
-
"|(?:[#{KANJI_EX}]+[#{HIRA}]+)",
|
|
225
|
-
"|(?:[#{KATA_EX}]+[#{HIRA}]+)",
|
|
226
|
-
"|(?:[#{KANJI_EX}]+)",
|
|
227
|
-
"|(?:[#{KATA_EX}]+)",
|
|
228
|
-
"|(?:[#{HIRA_EX}]+)",
|
|
229
|
-
"|(?:.+?)"].join
|
|
230
|
-
|
|
231
|
-
CharString.register_encoding(self)
|
|
232
|
-
|
|
233
|
-
end # module SJIS
|
|
234
|
-
end # module CharString
|
|
235
|
-
end # class DocDiff
|
|
246
|
+
CharString.register_encoding(self)
|
|
247
|
+
end
|
|
248
|
+
end
|
|
249
|
+
end
|