docdiff 0.6.5 → 0.6.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +7 -7
- data/Guardfile +4 -4
- data/Makefile +1 -1
- data/Rakefile +6 -6
- data/bin/docdiff +1 -1
- data/devutil/Rakefile +12 -5
- data/devutil/char_by_charclass.rb +43 -20
- data/devutil/charclass_by_char.rb +40 -19
- data/devutil/jis0208.rb +263 -231
- data/devutil/jis0208_test.rb +196 -0
- data/doc/news.md +8 -0
- data/docdiff.gemspec +12 -10
- data/lib/doc_diff.rb +59 -60
- data/lib/docdiff/charstring.rb +225 -241
- data/lib/docdiff/cli.rb +285 -250
- data/lib/docdiff/diff/contours.rb +1 -1
- data/lib/docdiff/diff/editscript.rb +1 -1
- data/lib/docdiff/diff/rcsdiff.rb +1 -1
- data/lib/docdiff/diff/shortestpath.rb +1 -1
- data/lib/docdiff/diff/speculative.rb +1 -1
- data/lib/docdiff/diff/subsequence.rb +1 -1
- data/lib/docdiff/diff/unidiff.rb +1 -1
- data/lib/docdiff/diff.rb +1 -1
- data/lib/docdiff/difference.rb +71 -70
- data/lib/docdiff/document.rb +129 -109
- data/lib/docdiff/encoding/en_ascii.rb +64 -58
- data/lib/docdiff/encoding/ja_eucjp.rb +250 -235
- data/lib/docdiff/encoding/ja_sjis.rb +240 -226
- data/lib/docdiff/encoding/ja_utf8.rb +6952 -6939
- data/lib/docdiff/version.rb +1 -1
- data/lib/docdiff/view.rb +522 -438
- data/lib/docdiff.rb +2 -2
- data/test/charstring_test.rb +475 -351
- data/test/cli_test.rb +103 -101
- data/test/diff_test.rb +15 -16
- data/test/difference_test.rb +40 -31
- data/test/docdiff_test.rb +162 -136
- data/test/document_test.rb +280 -175
- data/test/test_helper.rb +2 -1
- data/test/view_test.rb +636 -497
- metadata +8 -8
- data/devutil/testjis0208.rb +0 -38
data/lib/docdiff/charstring.rb
CHANGED
|
@@ -4,300 +4,284 @@
|
|
|
4
4
|
# 2003- Hisashi MORITA
|
|
5
5
|
|
|
6
6
|
class DocDiff
|
|
7
|
-
module CharString
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
=
|
|
14
|
-
#
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
super
|
|
18
|
-
end
|
|
7
|
+
module CharString
|
|
8
|
+
Encodings = {}
|
|
9
|
+
EOLChars = {} # End-of-line characters, such as CR, LF, CRLF.
|
|
10
|
+
|
|
11
|
+
def initialize(string)
|
|
12
|
+
# unnecessary
|
|
13
|
+
# @encoding = CharString.guess_encoding(string)
|
|
14
|
+
# @eol = CharString.guess_eol(string)
|
|
15
|
+
super
|
|
16
|
+
end
|
|
19
17
|
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
#
|
|
23
|
-
#
|
|
24
|
-
#
|
|
25
|
-
#
|
|
26
|
-
#
|
|
27
|
-
#
|
|
28
|
-
|
|
18
|
+
def eol
|
|
19
|
+
@eol
|
|
20
|
+
# if @eol
|
|
21
|
+
# @eol
|
|
22
|
+
# else
|
|
23
|
+
# @eol = CharString.guess_eol(self)
|
|
24
|
+
# # raise "eol is not set.\n"
|
|
25
|
+
# end
|
|
26
|
+
end
|
|
29
27
|
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
28
|
+
def eol=(e)
|
|
29
|
+
@eol = e
|
|
30
|
+
extend(EOLChars[@eol])
|
|
31
|
+
end
|
|
34
32
|
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
#
|
|
41
|
-
#
|
|
33
|
+
def eol_char
|
|
34
|
+
if @eol_char
|
|
35
|
+
@eol_char
|
|
36
|
+
else
|
|
37
|
+
nil
|
|
38
|
+
# extend EOLChars[eol]
|
|
39
|
+
# eol_char
|
|
40
|
+
end
|
|
42
41
|
end
|
|
43
|
-
end
|
|
44
42
|
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
43
|
+
def debug
|
|
44
|
+
if @encoding.nil?
|
|
45
|
+
raise "@encoding is nil."
|
|
46
|
+
elsif Encodings[@encoding].nil?
|
|
47
|
+
raise "Encodings[@encoding(=#{@encoding})] is nil."
|
|
48
|
+
elsif Encodings[@encoding].class != Module
|
|
49
|
+
raise "Encodings[@encoding].class(=#{Encodings[@encoding].class}) is not a module."
|
|
50
|
+
elsif @eol.nil?
|
|
51
|
+
raise "@eol is nil."
|
|
52
|
+
elsif EOLChars[@eol].nil?
|
|
53
|
+
raise "EOLChars[@eol(=#{@eol})] is nil."
|
|
54
|
+
else
|
|
55
|
+
# should I do some alert?
|
|
56
|
+
end
|
|
57
|
+
|
|
58
|
+
[
|
|
59
|
+
"id: #{id}, class: #{self.class}, self: #{self}, ",
|
|
60
|
+
"module: #{Encodings[@encoding]}, #{EOLChars[@eol]}",
|
|
61
|
+
].join
|
|
62
|
+
end
|
|
63
63
|
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
64
|
+
class << self
|
|
65
|
+
def register_encoding(mod)
|
|
66
|
+
Encodings[mod::ENCODING] = mod
|
|
67
|
+
end
|
|
68
|
+
|
|
69
|
+
def register_eol(mod)
|
|
70
|
+
EOLChars[mod::EOL] = mod
|
|
71
|
+
end
|
|
72
|
+
|
|
73
|
+
def guess_eol(string)
|
|
74
|
+
# returns 'CR', 'LF', 'CRLF', 'UNKNOWN'(binary),
|
|
75
|
+
# 'NONE'(1-line), or nil
|
|
76
|
+
return if string.nil? #=> nil (argument missing)
|
|
77
|
+
|
|
78
|
+
bin_string = string.dup.force_encoding("ASCII-8BIT")
|
|
79
|
+
eol_counts = {
|
|
80
|
+
"CR" => bin_string.scan(/(\r)(?!\n)/o).size,
|
|
81
|
+
"LF" => bin_string.scan(/(?:\A|[^\r])(\n)/o).size,
|
|
82
|
+
"CRLF" => bin_string.scan(/(\r\n)/o).size,
|
|
83
|
+
}
|
|
84
|
+
eol_counts.delete_if { |_eol, count| count == 0 } # Remove missing EOL
|
|
85
|
+
eols = eol_counts.keys
|
|
86
|
+
eol_variety = eols.size # numbers of flavors found
|
|
87
|
+
if eol_variety == 1 # Only one type of EOL found
|
|
88
|
+
eols[0] #=> 'CR', 'LF', or 'CRLF'
|
|
89
|
+
elsif eol_variety == 0 # No EOL found
|
|
90
|
+
"NONE" #=> 'NONE' (might be 1-line file)
|
|
91
|
+
else # Multiple types of EOL found
|
|
92
|
+
"UNKNOWN" #=> 'UNKNOWN' (might be binary data)
|
|
93
|
+
end
|
|
94
|
+
end
|
|
95
|
+
end
|
|
67
96
|
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
end
|
|
97
|
+
# Note that some languages (like Japanese) do not have 'word' or 'phrase',
|
|
98
|
+
# thus some of the following methods are not 'linguistically correct'.
|
|
71
99
|
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
# 'NONE'(1-line), or nil
|
|
75
|
-
return nil if string == nil #=> nil (argument missing)
|
|
76
|
-
bin_string = string.dup.force_encoding("ASCII-8BIT")
|
|
77
|
-
eol_counts = {'CR' => bin_string.scan(/(\r)(?!\n)/o).size,
|
|
78
|
-
'LF' => bin_string.scan(/(?:\A|[^\r])(\n)/o).size,
|
|
79
|
-
'CRLF' => bin_string.scan(/(\r\n)/o).size}
|
|
80
|
-
eol_counts.delete_if{|eol, count| count == 0} # Remove missing EOL
|
|
81
|
-
eols = eol_counts.keys
|
|
82
|
-
eol_variety = eols.size # numbers of flavors found
|
|
83
|
-
if eol_variety == 1 # Only one type of EOL found
|
|
84
|
-
return eols[0] #=> 'CR', 'LF', or 'CRLF'
|
|
85
|
-
elsif eol_variety == 0 # No EOL found
|
|
86
|
-
return 'NONE' #=> 'NONE' (might be 1-line file)
|
|
87
|
-
else # Multiple types of EOL found
|
|
88
|
-
return 'UNKNOWN' #=> 'UNKNOWN' (might be binary data)
|
|
100
|
+
def count_byte
|
|
101
|
+
split_to_byte.size
|
|
89
102
|
end
|
|
90
|
-
end
|
|
91
103
|
|
|
92
|
-
#
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
def count_byte()
|
|
96
|
-
split_to_byte().size
|
|
97
|
-
end
|
|
104
|
+
def count_char # eol = 1 char
|
|
105
|
+
split_to_char.size
|
|
106
|
+
end
|
|
98
107
|
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
108
|
+
def count_graph_char
|
|
109
|
+
count_latin_graph_char + count_ja_graph_char
|
|
110
|
+
end
|
|
102
111
|
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
112
|
+
def count_blank_char
|
|
113
|
+
count_latin_blank_char + count_ja_blank_char
|
|
114
|
+
end
|
|
106
115
|
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
116
|
+
def count_word
|
|
117
|
+
split_to_word.size
|
|
118
|
+
end
|
|
110
119
|
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
120
|
+
def count_valid_word
|
|
121
|
+
count_latin_valid_word + count_ja_valid_word
|
|
122
|
+
end
|
|
114
123
|
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
124
|
+
def count_line # this is common to all encodings.
|
|
125
|
+
split_to_line.size
|
|
126
|
+
end
|
|
118
127
|
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
128
|
+
def count_empty_line
|
|
129
|
+
split_to_line.count { |line| /^(?:#{eol_char})|^$/m.match(line) }
|
|
130
|
+
end
|
|
122
131
|
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
end
|
|
132
|
+
# for Ruby-1.9
|
|
133
|
+
def encoding
|
|
134
|
+
String.new(self).encoding.to_s
|
|
135
|
+
end
|
|
128
136
|
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
end
|
|
137
|
+
def encoding=(cs)
|
|
138
|
+
force_encoding(cs) if self
|
|
139
|
+
end
|
|
133
140
|
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
141
|
+
class << self
|
|
142
|
+
def guess_encoding(string)
|
|
143
|
+
if string
|
|
144
|
+
string.encoding.to_s
|
|
145
|
+
end
|
|
146
|
+
end
|
|
147
|
+
end
|
|
137
148
|
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
string.encoding.to_s
|
|
141
|
-
else
|
|
142
|
-
nil
|
|
149
|
+
def split_to_byte
|
|
150
|
+
encode("ASCII-8BIT").scan(/./nm)
|
|
143
151
|
end
|
|
144
|
-
end
|
|
145
152
|
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
153
|
+
def split_to_char
|
|
154
|
+
re =
|
|
155
|
+
if eol_char # sometimes string has no end-of-line char
|
|
156
|
+
Regexp.new("(?:#{eol_char})|(?:.)", Regexp::MULTILINE)
|
|
157
|
+
else # it seems that no EOL module was extended...
|
|
158
|
+
Regexp.new("(?:.)", Regexp::MULTILINE)
|
|
159
|
+
end
|
|
160
|
+
encode("UTF-8").scan(re).map { |e| e.encode(encoding) }
|
|
161
|
+
end
|
|
149
162
|
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
encode(
|
|
153
|
-
Regexp::MULTILINE)
|
|
154
|
-
).map{|e| e.encode(self.encoding)}
|
|
155
|
-
else # it seems that no EOL module was extended...
|
|
156
|
-
encode('UTF-8').scan(Regexp.new("(?:.)",
|
|
157
|
-
Regexp::MULTILINE)
|
|
158
|
-
).map{|e| e.encode(self.encoding)}
|
|
163
|
+
def count_latin_graph_char
|
|
164
|
+
re = Regexp.new("[#{Encodings["UTF-8"]::GRAPH}]", Regexp::MULTILINE)
|
|
165
|
+
encode("UTF-8").scan(re).size
|
|
159
166
|
end
|
|
160
|
-
end
|
|
161
167
|
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
end
|
|
168
|
+
def count_ja_graph_char
|
|
169
|
+
re = Regexp.new("[#{Encodings["UTF-8"]::JA_GRAPH}]", Regexp::MULTILINE)
|
|
170
|
+
encode("UTF-8").scan(re).size
|
|
171
|
+
end
|
|
167
172
|
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
end
|
|
173
|
+
def count_latin_blank_char
|
|
174
|
+
re = Regexp.new("[#{Encodings["UTF-8"]::BLANK}]", Regexp::MULTILINE)
|
|
175
|
+
encode("UTF-8").scan(re).size
|
|
176
|
+
end
|
|
173
177
|
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
end
|
|
178
|
+
def count_ja_blank_char
|
|
179
|
+
re = Regexp.new("[#{Encodings["UTF-8"]::JA_BLANK}]", Regexp::MULTILINE)
|
|
180
|
+
encode("UTF-8").scan(re).size
|
|
181
|
+
end
|
|
179
182
|
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
end
|
|
183
|
+
def split_to_word
|
|
184
|
+
re = Regexp.new(Encodings["UTF-8"]::WORD_REGEXP_SRC, Regexp::MULTILINE)
|
|
185
|
+
encode("UTF-8").scan(re).map { |e| e.encode(encoding) }
|
|
186
|
+
end
|
|
185
187
|
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
end
|
|
188
|
+
def count_latin_word
|
|
189
|
+
re = Regexp.new("[#{Encodings["UTF-8"]::PRINT}]", Regexp::MULTILINE)
|
|
190
|
+
split_to_word.count { |word| re.match(word.encode("UTF-8")) }
|
|
191
|
+
end
|
|
191
192
|
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
word
|
|
195
|
-
|
|
196
|
-
}.compact.size
|
|
197
|
-
end
|
|
193
|
+
def count_ja_word
|
|
194
|
+
re = Regexp.new("[#{Encodings["UTF-8"]::JA_PRINT}]", Regexp::MULTILINE)
|
|
195
|
+
split_to_word.count { |word| re.match(word.encode("UTF-8")) }
|
|
196
|
+
end
|
|
198
197
|
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
word
|
|
202
|
-
|
|
203
|
-
}.compact.size
|
|
204
|
-
end
|
|
198
|
+
def count_latin_valid_word
|
|
199
|
+
re = Regexp.new("[#{Encodings["UTF-8"]::ALNUM}]", Regexp::MULTILINE)
|
|
200
|
+
split_to_word.count { |word| re.match(word.encode("UTF-8")) }
|
|
201
|
+
end
|
|
205
202
|
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
word
|
|
209
|
-
|
|
210
|
-
}.compact.size
|
|
211
|
-
end
|
|
203
|
+
def count_ja_valid_word
|
|
204
|
+
re = Regexp.new("[#{Encodings["UTF-8"]::JA_GRAPH}]", Regexp::MULTILINE)
|
|
205
|
+
split_to_word.count { |word| re.match(word.encode("UTF-8")) }
|
|
206
|
+
end
|
|
212
207
|
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
208
|
+
def split_to_line
|
|
209
|
+
raise <<~EOS.chomp unless EOLChars[eol]
|
|
210
|
+
EOLChars[eol] is #{EOLChars[eol].inspect}: eol not specified or auto-detection failed.
|
|
211
|
+
EOS
|
|
212
|
+
|
|
213
|
+
re =
|
|
214
|
+
if defined? eol_char
|
|
215
|
+
Regexp.new(".*?#{eol_char}|.+", Regexp::MULTILINE)
|
|
216
|
+
else
|
|
217
|
+
Regexp.new(".+", Regexp::MULTILINE)
|
|
218
|
+
end
|
|
219
|
+
encode("UTF-8").scan(re).map { |e| e.encode(encoding) }
|
|
220
|
+
end
|
|
219
221
|
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
encode(
|
|
224
|
-
Regexp::MULTILINE)
|
|
225
|
-
).map{|e| e.encode(self.encoding)}
|
|
226
|
-
else
|
|
227
|
-
encode('UTF-8').scan(Regexp.new(".+",
|
|
228
|
-
Regexp::MULTILINE)
|
|
229
|
-
).map{|e| e.encode(self.encoding)}
|
|
222
|
+
def count_graph_line
|
|
223
|
+
graph = (Encodings["UTF-8"]::GRAPH + Encodings["UTF-8"]::JA_GRAPH).chars.uniq.join
|
|
224
|
+
re = Regexp.new("[#{Regexp.quote(graph)}]", Regexp::MULTILINE)
|
|
225
|
+
split_to_line.count { |line| re.match(line.encode("UTF-8")) }
|
|
230
226
|
end
|
|
231
|
-
end
|
|
232
227
|
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
line if re_graph.match line.encode('UTF-8')
|
|
239
|
-
}.compact.size
|
|
240
|
-
end
|
|
228
|
+
def count_blank_line
|
|
229
|
+
blank = (Encodings["UTF-8"]::BLANK + Encodings["UTF-8"]::JA_BLANK).chars.uniq.join
|
|
230
|
+
re = Regexp.new("^[#{blank}]+(?:#{eol_char})?", Regexp::MULTILINE)
|
|
231
|
+
split_to_line.count { |line| re.match(line.encode("UTF-8")) }
|
|
232
|
+
end
|
|
241
233
|
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
234
|
+
# load encoding modules
|
|
235
|
+
require "docdiff/encoding/en_ascii"
|
|
236
|
+
require "docdiff/encoding/ja_eucjp"
|
|
237
|
+
require "docdiff/encoding/ja_sjis"
|
|
238
|
+
require "docdiff/encoding/ja_utf8"
|
|
239
|
+
alias_method :to_bytes, :split_to_byte
|
|
240
|
+
alias_method :to_chars, :split_to_char
|
|
241
|
+
alias_method :to_words, :split_to_word
|
|
242
|
+
alias_method :to_lines, :split_to_line
|
|
249
243
|
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
require 'docdiff/encoding/ja_eucjp'
|
|
253
|
-
require 'docdiff/encoding/ja_sjis'
|
|
254
|
-
require 'docdiff/encoding/ja_utf8'
|
|
255
|
-
alias to_bytes split_to_byte
|
|
256
|
-
alias to_chars split_to_char
|
|
257
|
-
alias to_words split_to_word
|
|
258
|
-
alias to_lines split_to_line
|
|
244
|
+
module CR
|
|
245
|
+
EOL = "CR"
|
|
259
246
|
|
|
260
|
-
|
|
261
|
-
|
|
247
|
+
def eol_char
|
|
248
|
+
"\r"
|
|
249
|
+
end
|
|
262
250
|
|
|
263
|
-
|
|
264
|
-
"\r"
|
|
251
|
+
CharString.register_eol(self)
|
|
265
252
|
end
|
|
266
253
|
|
|
267
|
-
|
|
268
|
-
|
|
254
|
+
module LF
|
|
255
|
+
EOL = "LF"
|
|
269
256
|
|
|
270
|
-
|
|
271
|
-
|
|
257
|
+
def eol_char
|
|
258
|
+
"\n"
|
|
259
|
+
end
|
|
272
260
|
|
|
273
|
-
|
|
274
|
-
"\n"
|
|
261
|
+
CharString.register_eol(self)
|
|
275
262
|
end
|
|
276
263
|
|
|
277
|
-
|
|
278
|
-
|
|
264
|
+
module CRLF
|
|
265
|
+
EOL = "CRLF"
|
|
279
266
|
|
|
280
|
-
|
|
281
|
-
|
|
267
|
+
def eol_char
|
|
268
|
+
"\r\n"
|
|
269
|
+
end
|
|
282
270
|
|
|
283
|
-
|
|
284
|
-
"\r\n"
|
|
271
|
+
CharString.register_eol(self)
|
|
285
272
|
end
|
|
286
273
|
|
|
287
|
-
|
|
288
|
-
|
|
274
|
+
module NoEOL
|
|
275
|
+
EOL = "NONE"
|
|
289
276
|
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
nil
|
|
294
|
-
end
|
|
277
|
+
def eol_char
|
|
278
|
+
nil
|
|
279
|
+
end
|
|
295
280
|
|
|
296
|
-
|
|
281
|
+
CharString.register_eol(self)
|
|
282
|
+
end
|
|
297
283
|
end
|
|
298
|
-
|
|
299
|
-
end # module CharString
|
|
300
|
-
end # class DocDiff
|
|
284
|
+
end
|
|
301
285
|
|
|
302
286
|
# class String
|
|
303
287
|
# include CharString
|