docdiff 0.6.4 → 0.6.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +7 -7
- data/Guardfile +4 -4
- data/Makefile +6 -7
- data/README.md +1 -0
- data/README_ja.md +1 -0
- data/Rakefile +6 -6
- data/bin/docdiff +2 -209
- data/devutil/Rakefile +12 -5
- data/devutil/char_by_charclass.rb +43 -20
- data/devutil/charclass_by_char.rb +40 -19
- data/devutil/jis0208.rb +263 -231
- data/devutil/jis0208_test.rb +196 -0
- data/doc/news.md +17 -0
- data/docdiff.gemspec +13 -10
- data/lib/doc_diff.rb +63 -98
- data/lib/docdiff/charstring.rb +225 -241
- data/lib/docdiff/cli.rb +316 -0
- data/lib/docdiff/diff/contours.rb +1 -1
- data/lib/docdiff/diff/editscript.rb +1 -1
- data/lib/docdiff/diff/rcsdiff.rb +1 -1
- data/lib/docdiff/diff/shortestpath.rb +1 -1
- data/lib/docdiff/diff/speculative.rb +1 -1
- data/lib/docdiff/diff/subsequence.rb +1 -1
- data/lib/docdiff/diff/unidiff.rb +1 -1
- data/lib/docdiff/diff.rb +1 -1
- data/lib/docdiff/difference.rb +71 -70
- data/lib/docdiff/document.rb +129 -109
- data/lib/docdiff/encoding/en_ascii.rb +64 -58
- data/lib/docdiff/encoding/ja_eucjp.rb +250 -235
- data/lib/docdiff/encoding/ja_sjis.rb +240 -226
- data/lib/docdiff/encoding/ja_utf8.rb +6952 -6939
- data/lib/docdiff/version.rb +1 -1
- data/lib/docdiff/view.rb +523 -427
- data/lib/docdiff.rb +2 -2
- data/test/charstring_test.rb +475 -351
- data/test/cli_test.rb +314 -0
- data/test/diff_test.rb +15 -16
- data/test/difference_test.rb +40 -31
- data/test/docdiff_test.rb +162 -159
- data/test/document_test.rb +280 -175
- data/test/fixture/format_wdiff.conf +1 -0
- data/test/fixture/simple.conf +9 -0
- data/test/test_helper.rb +2 -1
- data/test/view_test.rb +636 -497
- metadata +27 -9
- data/devutil/testjis0208.rb +0 -38
data/lib/docdiff/document.rb
CHANGED
|
@@ -1,129 +1,149 @@
|
|
|
1
1
|
# Document class, a part of DocDiff
|
|
2
2
|
# 2004-01-14.. Hisashi MORITA
|
|
3
3
|
|
|
4
|
-
require
|
|
4
|
+
require "docdiff/charstring"
|
|
5
5
|
|
|
6
|
-
class EncodingDetectionFailure <
|
|
6
|
+
class EncodingDetectionFailure < StandardError
|
|
7
7
|
end
|
|
8
|
-
|
|
8
|
+
|
|
9
|
+
class EOLDetectionFailure < StandardError
|
|
9
10
|
end
|
|
10
11
|
|
|
11
12
|
class DocDiff
|
|
12
|
-
class Document
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
raise EncodingDetectionFailure, "encoding not specified, and auto detection failed."
|
|
13
|
+
class Document
|
|
14
|
+
def initialize(str, enc = nil, e = nil)
|
|
15
|
+
@body = str
|
|
16
|
+
@body.extend(CharString)
|
|
17
|
+
if enc
|
|
18
|
+
@body.encoding = enc
|
|
19
|
+
elsif !@body.encoding
|
|
20
|
+
guessed_encoding = CharString.guess_encoding(str)
|
|
21
|
+
if guessed_encoding == "UNKNOWN"
|
|
22
|
+
raise EncodingDetectionFailure, "encoding not specified, and auto detection failed."
|
|
23
23
|
# @body.encoding = 'ASCII' # default to ASCII <= BAD!
|
|
24
|
-
|
|
25
|
-
|
|
24
|
+
else
|
|
25
|
+
@body.encoding = guessed_encoding
|
|
26
|
+
end
|
|
26
27
|
end
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
@body.eol = e
|
|
30
|
-
else
|
|
31
|
-
guessed_eol = CharString.guess_eol(str)
|
|
32
|
-
if guessed_eol == "UNKNOWN"
|
|
33
|
-
raise EOLDetectionFailure, "eol not specified, and auto detection failed."
|
|
34
|
-
# @body.eol = 'LF' # default to LF
|
|
28
|
+
if e
|
|
29
|
+
@body.eol = e
|
|
35
30
|
else
|
|
36
|
-
|
|
31
|
+
guessed_eol = CharString.guess_eol(str)
|
|
32
|
+
if guessed_eol == "UNKNOWN"
|
|
33
|
+
raise EOLDetectionFailure, "eol not specified, and auto detection failed."
|
|
34
|
+
# @body.eol = 'LF' # default to LF
|
|
35
|
+
else
|
|
36
|
+
@body.eol = guessed_eol
|
|
37
|
+
end
|
|
37
38
|
end
|
|
38
39
|
end
|
|
39
|
-
end
|
|
40
|
-
def encoding()
|
|
41
|
-
@body.encoding
|
|
42
|
-
end
|
|
43
|
-
def encoding=(cs)
|
|
44
|
-
@body.encoding = cs
|
|
45
|
-
end
|
|
46
|
-
def eol()
|
|
47
|
-
@body.eol
|
|
48
|
-
end
|
|
49
|
-
def eol=(eolstr)
|
|
50
|
-
@body.eol = eolstr
|
|
51
|
-
end
|
|
52
40
|
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
def split_to_word()
|
|
57
|
-
@body.split_to_word
|
|
58
|
-
end
|
|
59
|
-
def split_to_char()
|
|
60
|
-
@body.split_to_char
|
|
61
|
-
end
|
|
62
|
-
def split_to_byte()
|
|
63
|
-
@body.split_to_byte
|
|
64
|
-
end
|
|
41
|
+
def encoding
|
|
42
|
+
@body.encoding
|
|
43
|
+
end
|
|
65
44
|
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
def count_blank_line()
|
|
70
|
-
@body.count_blank_line
|
|
71
|
-
end
|
|
72
|
-
def count_empty_line()
|
|
73
|
-
@body.count_empty_line
|
|
74
|
-
end
|
|
75
|
-
def count_graph_line()
|
|
76
|
-
@body.count_graph_line
|
|
77
|
-
end
|
|
45
|
+
def encoding=(cs)
|
|
46
|
+
@body.encoding = cs
|
|
47
|
+
end
|
|
78
48
|
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
def count_latin_word()
|
|
83
|
-
@body.count_latin_word
|
|
84
|
-
end
|
|
85
|
-
def count_ja_word()
|
|
86
|
-
@body.count_ja_word
|
|
87
|
-
end
|
|
88
|
-
def count_valid_word()
|
|
89
|
-
@body.count_valid_word
|
|
90
|
-
end
|
|
91
|
-
def count_latin_valid_word()
|
|
92
|
-
@body.count_latin_valid_word
|
|
93
|
-
end
|
|
94
|
-
def count_ja_valid_word()
|
|
95
|
-
@body.count_ja_valid_word
|
|
96
|
-
end
|
|
49
|
+
def eol
|
|
50
|
+
@body.eol
|
|
51
|
+
end
|
|
97
52
|
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
def count_blank_char()
|
|
102
|
-
@body.count_blank_char
|
|
103
|
-
end
|
|
104
|
-
def count_graph_char()
|
|
105
|
-
@body.count_graph_char
|
|
106
|
-
end
|
|
107
|
-
def count_latin_blank_char()
|
|
108
|
-
@body.count_latin_blank_char
|
|
109
|
-
end
|
|
110
|
-
def count_latin_graph_char()
|
|
111
|
-
@body.count_latin_graph_char
|
|
112
|
-
end
|
|
113
|
-
def count_ja_blank_char()
|
|
114
|
-
@body.count_ja_blank_char
|
|
115
|
-
end
|
|
116
|
-
def count_ja_graph_char()
|
|
117
|
-
@body.count_ja_graph_char
|
|
118
|
-
end
|
|
53
|
+
def eol=(eolstr)
|
|
54
|
+
@body.eol = eolstr
|
|
55
|
+
end
|
|
119
56
|
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
57
|
+
def split_to_line
|
|
58
|
+
@body.split_to_line
|
|
59
|
+
end
|
|
123
60
|
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
61
|
+
def split_to_word
|
|
62
|
+
@body.split_to_word
|
|
63
|
+
end
|
|
64
|
+
|
|
65
|
+
def split_to_char
|
|
66
|
+
@body.split_to_char
|
|
67
|
+
end
|
|
127
68
|
|
|
128
|
-
|
|
129
|
-
|
|
69
|
+
def split_to_byte
|
|
70
|
+
@body.split_to_byte
|
|
71
|
+
end
|
|
72
|
+
|
|
73
|
+
def count_line
|
|
74
|
+
@body.count_line
|
|
75
|
+
end
|
|
76
|
+
|
|
77
|
+
def count_blank_line
|
|
78
|
+
@body.count_blank_line
|
|
79
|
+
end
|
|
80
|
+
|
|
81
|
+
def count_empty_line
|
|
82
|
+
@body.count_empty_line
|
|
83
|
+
end
|
|
84
|
+
|
|
85
|
+
def count_graph_line
|
|
86
|
+
@body.count_graph_line
|
|
87
|
+
end
|
|
88
|
+
|
|
89
|
+
def count_word
|
|
90
|
+
@body.count_word
|
|
91
|
+
end
|
|
92
|
+
|
|
93
|
+
def count_latin_word
|
|
94
|
+
@body.count_latin_word
|
|
95
|
+
end
|
|
96
|
+
|
|
97
|
+
def count_ja_word
|
|
98
|
+
@body.count_ja_word
|
|
99
|
+
end
|
|
100
|
+
|
|
101
|
+
def count_valid_word
|
|
102
|
+
@body.count_valid_word
|
|
103
|
+
end
|
|
104
|
+
|
|
105
|
+
def count_latin_valid_word
|
|
106
|
+
@body.count_latin_valid_word
|
|
107
|
+
end
|
|
108
|
+
|
|
109
|
+
def count_ja_valid_word
|
|
110
|
+
@body.count_ja_valid_word
|
|
111
|
+
end
|
|
112
|
+
|
|
113
|
+
def count_char
|
|
114
|
+
@body.count_char
|
|
115
|
+
end
|
|
116
|
+
|
|
117
|
+
def count_blank_char
|
|
118
|
+
@body.count_blank_char
|
|
119
|
+
end
|
|
120
|
+
|
|
121
|
+
def count_graph_char
|
|
122
|
+
@body.count_graph_char
|
|
123
|
+
end
|
|
124
|
+
|
|
125
|
+
def count_latin_blank_char
|
|
126
|
+
@body.count_latin_blank_char
|
|
127
|
+
end
|
|
128
|
+
|
|
129
|
+
def count_latin_graph_char
|
|
130
|
+
@body.count_latin_graph_char
|
|
131
|
+
end
|
|
132
|
+
|
|
133
|
+
def count_ja_blank_char
|
|
134
|
+
@body.count_ja_blank_char
|
|
135
|
+
end
|
|
136
|
+
|
|
137
|
+
def count_ja_graph_char
|
|
138
|
+
@body.count_ja_graph_char
|
|
139
|
+
end
|
|
140
|
+
|
|
141
|
+
def count_byte
|
|
142
|
+
@body.count_byte
|
|
143
|
+
end
|
|
144
|
+
|
|
145
|
+
def eol_char
|
|
146
|
+
@body.eol_char
|
|
147
|
+
end
|
|
148
|
+
end
|
|
149
|
+
end
|
data/lib/docdiff/encoding/en_ascii.rb
CHANGED
|
@@ -1,72 +1,78 @@
|
|
|
1
1
|
# English ASCII encoding module for CharString
|
|
2
2
|
# 2003- Hisashi MORITA
|
|
3
3
|
|
|
4
|
-
# frozen_string_literal:
|
|
4
|
+
# frozen_string_literal: true
|
|
5
5
|
|
|
6
6
|
class DocDiff
|
|
7
|
-
module CharString
|
|
8
|
-
|
|
7
|
+
module CharString
|
|
8
|
+
module ASCII
|
|
9
|
+
ENCODING = "US-ASCII"
|
|
9
10
|
|
|
10
|
-
|
|
11
|
+
CNTRL =
|
|
12
|
+
"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09" \
|
|
13
|
+
"\x0a\x0b\x0c\x0d\x0e\x0f\x10\x11\x12\x13" \
|
|
14
|
+
"\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d" \
|
|
15
|
+
"\x1e\x1f\x7f"
|
|
16
|
+
SPACE =
|
|
17
|
+
"\x09\x0a\x0b\x0c\x0d\x20"
|
|
18
|
+
BLANK =
|
|
19
|
+
"\x09\x20"
|
|
20
|
+
DIGIT =
|
|
21
|
+
"\x30\x31\x32\x33\x34\x35\x36\x37\x38\x39"
|
|
22
|
+
UPPER =
|
|
23
|
+
"\x41\x42\x43\x44\x45\x46\x47\x48\x49\x4a" \
|
|
24
|
+
"\x4b\x4c\x4d\x4e\x4f\x50\x51\x52\x53\x54" \
|
|
25
|
+
"\x55\x56\x57\x58\x59\x5a"
|
|
26
|
+
LOWER =
|
|
27
|
+
"\x61\x62\x63\x64\x65\x66\x67\x68\x69\x6a" \
|
|
28
|
+
"\x6b\x6c\x6d\x6e\x6f\x70\x71\x72\x73\x74" \
|
|
29
|
+
"\x75\x76\x77\x78\x79\x7a"
|
|
30
|
+
ALPHA = UPPER + LOWER
|
|
31
|
+
ALNUM = DIGIT + ALPHA
|
|
32
|
+
PUNCT =
|
|
33
|
+
Regexp.quote(
|
|
34
|
+
"\x21\x22\x23\x24\x25\x26\x27\x28\x29\x2a" \
|
|
35
|
+
"\x2b\x2c\x2d\x2e\x2f\x3a\x3b\x3c\x3d\x3e" \
|
|
36
|
+
"\x3f\x40\x5b\x5c\x5d\x5e\x5f\x60\x7b\x7c" \
|
|
37
|
+
"\x7d\x7e",
|
|
38
|
+
)
|
|
39
|
+
GRAPH = DIGIT + UPPER + LOWER + PUNCT
|
|
40
|
+
PRINT = "\x20" + GRAPH
|
|
41
|
+
XDIGIT =
|
|
42
|
+
DIGIT +
|
|
43
|
+
"\x41\x42\x43\x44\x45\x46\x61\x62\x63\x64" \
|
|
44
|
+
"\x65\x66"
|
|
11
45
|
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
"\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d" \
|
|
15
|
-
"\x1e\x1f\x7f"
|
|
16
|
-
SPACE = "\x09\x0a\x0b\x0c\x0d\x20"
|
|
17
|
-
BLANK = "\x09\x20"
|
|
18
|
-
DIGIT = "\x30\x31\x32\x33\x34\x35\x36\x37\x38\x39"
|
|
19
|
-
UPPER = "\x41\x42\x43\x44\x45\x46\x47\x48\x49\x4a" \
|
|
20
|
-
"\x4b\x4c\x4d\x4e\x4f\x50\x51\x52\x53\x54" \
|
|
21
|
-
"\x55\x56\x57\x58\x59\x5a"
|
|
22
|
-
LOWER = "\x61\x62\x63\x64\x65\x66\x67\x68\x69\x6a" \
|
|
23
|
-
"\x6b\x6c\x6d\x6e\x6f\x70\x71\x72\x73\x74" \
|
|
24
|
-
"\x75\x76\x77\x78\x79\x7a"
|
|
25
|
-
ALPHA = UPPER + LOWER
|
|
26
|
-
ALNUM = DIGIT + ALPHA
|
|
27
|
-
PUNCT = "\x21\x22\x23\x24\x25\x26\x27\x28\x29\x2a" \
|
|
28
|
-
"\x2b\x2c\x2d\x2e\x2f\x3a\x3b\x3c\x3d\x3e" \
|
|
29
|
-
"\x3f\x40\x5b\x5c\x5d\x5e\x5f\x60\x7b\x7c" \
|
|
30
|
-
"\x7d\x7e"
|
|
31
|
-
GRAPH = DIGIT + UPPER + LOWER + PUNCT
|
|
32
|
-
PRINT = "\x20" + GRAPH
|
|
33
|
-
XDIGIT = DIGIT +
|
|
34
|
-
"\x41\x42\x43\x44\x45\x46\x61\x62\x63\x64" \
|
|
35
|
-
"\x65\x66"
|
|
46
|
+
JA_BLANK = "" # kludge...
|
|
47
|
+
JA_GRAPH = "" # kludge...
|
|
36
48
|
|
|
37
|
-
|
|
38
|
-
|
|
49
|
+
WORD_REGEXP_SRC = [
|
|
50
|
+
"(?:[#{GRAPH}]+[#{BLANK}]?)",
|
|
51
|
+
"|(?:[#{SPACE}]+)",
|
|
52
|
+
"|(?:.+?)",
|
|
53
|
+
].join
|
|
39
54
|
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
55
|
+
# override default method, as ASCII has no Japanese in it
|
|
56
|
+
def count_ja_graph_char
|
|
57
|
+
0
|
|
58
|
+
end
|
|
43
59
|
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
60
|
+
# override default method, as ASCII has no Japanese in it
|
|
61
|
+
def count_ja_blank_char
|
|
62
|
+
0
|
|
63
|
+
end
|
|
47
64
|
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
# override default method, as ASCII has no Japanese in it
|
|
54
|
-
def count_ja_blank_char()
|
|
55
|
-
0
|
|
56
|
-
end
|
|
65
|
+
# override default method, as ASCII has no Japanese in it
|
|
66
|
+
def count_ja_word
|
|
67
|
+
0
|
|
68
|
+
end
|
|
57
69
|
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
70
|
+
# override default method, as ASCII has no Japanese in it
|
|
71
|
+
def count_ja_valid_word
|
|
72
|
+
0
|
|
73
|
+
end
|
|
62
74
|
|
|
63
|
-
|
|
64
|
-
def count_ja_valid_word()
|
|
65
|
-
0
|
|
75
|
+
CharString.register_encoding(self)
|
|
66
76
|
end
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
end # module ASCII
|
|
71
|
-
end # module CharString
|
|
72
|
-
end # class DocDiff
|
|
77
|
+
end
|
|
78
|
+
end
|