docdiff 0.6.5 → 0.6.6

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. checksums.yaml +4 -4
  2. data/Gemfile +7 -7
  3. data/Guardfile +4 -4
  4. data/Makefile +1 -1
  5. data/Rakefile +6 -6
  6. data/bin/docdiff +1 -1
  7. data/devutil/Rakefile +12 -5
  8. data/devutil/char_by_charclass.rb +43 -20
  9. data/devutil/charclass_by_char.rb +40 -19
  10. data/devutil/jis0208.rb +263 -231
  11. data/devutil/jis0208_test.rb +196 -0
  12. data/doc/news.md +8 -0
  13. data/docdiff.gemspec +12 -10
  14. data/lib/doc_diff.rb +59 -60
  15. data/lib/docdiff/charstring.rb +225 -241
  16. data/lib/docdiff/cli.rb +285 -250
  17. data/lib/docdiff/diff/contours.rb +1 -1
  18. data/lib/docdiff/diff/editscript.rb +1 -1
  19. data/lib/docdiff/diff/rcsdiff.rb +1 -1
  20. data/lib/docdiff/diff/shortestpath.rb +1 -1
  21. data/lib/docdiff/diff/speculative.rb +1 -1
  22. data/lib/docdiff/diff/subsequence.rb +1 -1
  23. data/lib/docdiff/diff/unidiff.rb +1 -1
  24. data/lib/docdiff/diff.rb +1 -1
  25. data/lib/docdiff/difference.rb +71 -70
  26. data/lib/docdiff/document.rb +129 -109
  27. data/lib/docdiff/encoding/en_ascii.rb +64 -58
  28. data/lib/docdiff/encoding/ja_eucjp.rb +250 -235
  29. data/lib/docdiff/encoding/ja_sjis.rb +240 -226
  30. data/lib/docdiff/encoding/ja_utf8.rb +6952 -6939
  31. data/lib/docdiff/version.rb +1 -1
  32. data/lib/docdiff/view.rb +522 -438
  33. data/lib/docdiff.rb +2 -2
  34. data/test/charstring_test.rb +475 -351
  35. data/test/cli_test.rb +103 -101
  36. data/test/diff_test.rb +15 -16
  37. data/test/difference_test.rb +40 -31
  38. data/test/docdiff_test.rb +162 -136
  39. data/test/document_test.rb +280 -175
  40. data/test/test_helper.rb +2 -1
  41. data/test/view_test.rb +636 -497
  42. metadata +8 -8
  43. data/devutil/testjis0208.rb +0 -38
@@ -4,300 +4,284 @@
4
4
  # 2003- Hisashi MORITA
5
5
 
6
6
  class DocDiff
7
- module CharString
8
-
9
- Encodings = {}
10
- EOLChars = {} # End-of-line characters, such as CR, LF, CRLF.
11
-
12
- def initialize(string)
13
- =begin unnecessary
14
- # @encoding = CharString.guess_encoding(string)
15
- # @eol = CharString.guess_eol(string)
16
- =end unnecessary
17
- super
18
- end
7
+ module CharString
8
+ Encodings = {}
9
+ EOLChars = {} # End-of-line characters, such as CR, LF, CRLF.
10
+
11
+ def initialize(string)
12
+ # unnecessary
13
+ # @encoding = CharString.guess_encoding(string)
14
+ # @eol = CharString.guess_eol(string)
15
+ super
16
+ end
19
17
 
20
- def eol()
21
- @eol
22
- # if @eol
23
- # @eol
24
- # else
25
- # @eol = CharString.guess_eol(self)
26
- # # raise "eol is not set.\n"
27
- # end
28
- end
18
+ def eol
19
+ @eol
20
+ # if @eol
21
+ # @eol
22
+ # else
23
+ # @eol = CharString.guess_eol(self)
24
+ # # raise "eol is not set.\n"
25
+ # end
26
+ end
29
27
 
30
- def eol=(e)
31
- @eol = e
32
- extend EOLChars[@eol]
33
- end
28
+ def eol=(e)
29
+ @eol = e
30
+ extend(EOLChars[@eol])
31
+ end
34
32
 
35
- def eol_char()
36
- if @eol_char
37
- @eol_char
38
- else
39
- nil
40
- # extend EOLChars[eol]
41
- # eol_char
33
+ def eol_char
34
+ if @eol_char
35
+ @eol_char
36
+ else
37
+ nil
38
+ # extend EOLChars[eol]
39
+ # eol_char
40
+ end
42
41
  end
43
- end
44
42
 
45
- def debug()
46
- case
47
- when @encoding == nil
48
- raise "@encoding is nil."
49
- when Encodings[@encoding] == nil
50
- raise "Encodings[@encoding(=#{@encoding})] is nil."
51
- when Encodings[@encoding].class != Module
52
- raise "Encodings[@encoding].class(=#{Encodings[@encoding].class}) is not a module."
53
- when @eol == nil
54
- raise "@eol is nil."
55
- when EOLChars[@eol] == nil
56
- raise "EOLChars[@eol(=#{@eol})] is nil."
57
- else
58
- # should I do some alert?
59
- end
60
- ["id: #{self.id}, class: #{self.class}, self: #{self}, ",
61
- "module: #{Encodings[@encoding]}, #{EOLChars[@eol]}"].join
62
- end
43
+ def debug
44
+ if @encoding.nil?
45
+ raise "@encoding is nil."
46
+ elsif Encodings[@encoding].nil?
47
+ raise "Encodings[@encoding(=#{@encoding})] is nil."
48
+ elsif Encodings[@encoding].class != Module
49
+ raise "Encodings[@encoding].class(=#{Encodings[@encoding].class}) is not a module."
50
+ elsif @eol.nil?
51
+ raise "@eol is nil."
52
+ elsif EOLChars[@eol].nil?
53
+ raise "EOLChars[@eol(=#{@eol})] is nil."
54
+ else
55
+ # should I do some alert?
56
+ end
57
+
58
+ [
59
+ "id: #{id}, class: #{self.class}, self: #{self}, ",
60
+ "module: #{Encodings[@encoding]}, #{EOLChars[@eol]}",
61
+ ].join
62
+ end
63
63
 
64
- def CharString.register_encoding(mod)
65
- Encodings[mod::Encoding] = mod
66
- end
64
+ class << self
65
+ def register_encoding(mod)
66
+ Encodings[mod::ENCODING] = mod
67
+ end
68
+
69
+ def register_eol(mod)
70
+ EOLChars[mod::EOL] = mod
71
+ end
72
+
73
+ def guess_eol(string)
74
+ # returns 'CR', 'LF', 'CRLF', 'UNKNOWN'(binary),
75
+ # 'NONE'(1-line), or nil
76
+ return if string.nil? #=> nil (argument missing)
77
+
78
+ bin_string = string.dup.force_encoding("ASCII-8BIT")
79
+ eol_counts = {
80
+ "CR" => bin_string.scan(/(\r)(?!\n)/o).size,
81
+ "LF" => bin_string.scan(/(?:\A|[^\r])(\n)/o).size,
82
+ "CRLF" => bin_string.scan(/(\r\n)/o).size,
83
+ }
84
+ eol_counts.delete_if { |_eol, count| count == 0 } # Remove missing EOL
85
+ eols = eol_counts.keys
86
+ eol_variety = eols.size # numbers of flavors found
87
+ if eol_variety == 1 # Only one type of EOL found
88
+ eols[0] #=> 'CR', 'LF', or 'CRLF'
89
+ elsif eol_variety == 0 # No EOL found
90
+ "NONE" #=> 'NONE' (might be 1-line file)
91
+ else # Multiple types of EOL found
92
+ "UNKNOWN" #=> 'UNKNOWN' (might be binary data)
93
+ end
94
+ end
95
+ end
67
96
 
68
- def CharString.register_eol(mod)
69
- EOLChars[mod::EOL] = mod
70
- end
97
+ # Note that some languages (like Japanese) do not have 'word' or 'phrase',
98
+ # thus some of the following methods are not 'linguistically correct'.
71
99
 
72
- def CharString.guess_eol(string)
73
- # returns 'CR', 'LF', 'CRLF', 'UNKNOWN'(binary),
74
- # 'NONE'(1-line), or nil
75
- return nil if string == nil #=> nil (argument missing)
76
- bin_string = string.dup.force_encoding("ASCII-8BIT")
77
- eol_counts = {'CR' => bin_string.scan(/(\r)(?!\n)/o).size,
78
- 'LF' => bin_string.scan(/(?:\A|[^\r])(\n)/o).size,
79
- 'CRLF' => bin_string.scan(/(\r\n)/o).size}
80
- eol_counts.delete_if{|eol, count| count == 0} # Remove missing EOL
81
- eols = eol_counts.keys
82
- eol_variety = eols.size # numbers of flavors found
83
- if eol_variety == 1 # Only one type of EOL found
84
- return eols[0] #=> 'CR', 'LF', or 'CRLF'
85
- elsif eol_variety == 0 # No EOL found
86
- return 'NONE' #=> 'NONE' (might be 1-line file)
87
- else # Multiple types of EOL found
88
- return 'UNKNOWN' #=> 'UNKNOWN' (might be binary data)
100
+ def count_byte
101
+ split_to_byte.size
89
102
  end
90
- end
91
103
 
92
- # Note that some languages (like Japanese) do not have 'word' or 'phrase',
93
- # thus some of the following methods are not 'linguistically correct'.
94
-
95
- def count_byte()
96
- split_to_byte().size
97
- end
104
+ def count_char # eol = 1 char
105
+ split_to_char.size
106
+ end
98
107
 
99
- def count_char() # eol = 1 char
100
- split_to_char().size
101
- end
108
+ def count_graph_char
109
+ count_latin_graph_char + count_ja_graph_char
110
+ end
102
111
 
103
- def count_graph_char()
104
- count_latin_graph_char() + count_ja_graph_char()
105
- end
112
+ def count_blank_char
113
+ count_latin_blank_char + count_ja_blank_char
114
+ end
106
115
 
107
- def count_blank_char()
108
- count_latin_blank_char() + count_ja_blank_char()
109
- end
116
+ def count_word
117
+ split_to_word.size
118
+ end
110
119
 
111
- def count_word()
112
- split_to_word().size
113
- end
120
+ def count_valid_word
121
+ count_latin_valid_word + count_ja_valid_word
122
+ end
114
123
 
115
- def count_valid_word()
116
- count_latin_valid_word() + count_ja_valid_word()
117
- end
124
+ def count_line # this is common to all encodings.
125
+ split_to_line.size
126
+ end
118
127
 
119
- def count_line() # this is common to all encodings.
120
- split_to_line.size
121
- end
128
+ def count_empty_line
129
+ split_to_line.count { |line| /^(?:#{eol_char})|^$/m.match(line) }
130
+ end
122
131
 
123
- def count_empty_line()
124
- split_to_line.collect{|line|
125
- line if /^(?:#{eol_char})|^$/m.match line
126
- }.compact.size
127
- end
132
+ # for Ruby-1.9
133
+ def encoding
134
+ String.new(self).encoding.to_s
135
+ end
128
136
 
129
- # for Ruby-1.9
130
- def encoding()
131
- String.new(self).encoding.to_s
132
- end
137
+ def encoding=(cs)
138
+ force_encoding(cs) if self
139
+ end
133
140
 
134
- def encoding=(cs)
135
- force_encoding(cs) if self
136
- end
141
+ class << self
142
+ def guess_encoding(string)
143
+ if string
144
+ string.encoding.to_s
145
+ end
146
+ end
147
+ end
137
148
 
138
- def CharString.guess_encoding(string)
139
- if string
140
- string.encoding.to_s
141
- else
142
- nil
149
+ def split_to_byte
150
+ encode("ASCII-8BIT").scan(/./nm)
143
151
  end
144
- end
145
152
 
146
- def split_to_byte()
147
- encode("ASCII-8BIT").scan(/./nm)
148
- end
153
+ def split_to_char
154
+ re =
155
+ if eol_char # sometimes string has no end-of-line char
156
+ Regexp.new("(?:#{eol_char})|(?:.)", Regexp::MULTILINE)
157
+ else # it seems that no EOL module was extended...
158
+ Regexp.new("(?:.)", Regexp::MULTILINE)
159
+ end
160
+ encode("UTF-8").scan(re).map { |e| e.encode(encoding) }
161
+ end
149
162
 
150
- def split_to_char()
151
- if eol_char # sometimes string has no end-of-line char
152
- encode('UTF-8').scan(Regexp.new("(?:#{eol_char})|(?:.)",
153
- Regexp::MULTILINE)
154
- ).map{|e| e.encode(self.encoding)}
155
- else # it seems that no EOL module was extended...
156
- encode('UTF-8').scan(Regexp.new("(?:.)",
157
- Regexp::MULTILINE)
158
- ).map{|e| e.encode(self.encoding)}
163
+ def count_latin_graph_char
164
+ re = Regexp.new("[#{Encodings["UTF-8"]::GRAPH}]", Regexp::MULTILINE)
165
+ encode("UTF-8").scan(re).size
159
166
  end
160
- end
161
167
 
162
- def count_latin_graph_char()
163
- encode('UTF-8').scan(Regexp.new("[#{Encodings['UTF-8']::GRAPH}]",
164
- Regexp::MULTILINE)
165
- ).size
166
- end
168
+ def count_ja_graph_char
169
+ re = Regexp.new("[#{Encodings["UTF-8"]::JA_GRAPH}]", Regexp::MULTILINE)
170
+ encode("UTF-8").scan(re).size
171
+ end
167
172
 
168
- def count_ja_graph_char()
169
- encode('UTF-8').scan(Regexp.new("[#{Encodings['UTF-8']::JA_GRAPH}]",
170
- Regexp::MULTILINE)
171
- ).size
172
- end
173
+ def count_latin_blank_char
174
+ re = Regexp.new("[#{Encodings["UTF-8"]::BLANK}]", Regexp::MULTILINE)
175
+ encode("UTF-8").scan(re).size
176
+ end
173
177
 
174
- def count_latin_blank_char()
175
- encode('UTF-8').scan(Regexp.new("[#{Encodings['UTF-8']::BLANK}]",
176
- Regexp::MULTILINE)
177
- ).size
178
- end
178
+ def count_ja_blank_char
179
+ re = Regexp.new("[#{Encodings["UTF-8"]::JA_BLANK}]", Regexp::MULTILINE)
180
+ encode("UTF-8").scan(re).size
181
+ end
179
182
 
180
- def count_ja_blank_char()
181
- encode('UTF-8').scan(Regexp.new("[#{Encodings['UTF-8']::JA_BLANK}]",
182
- Regexp::MULTILINE)
183
- ).size
184
- end
183
+ def split_to_word
184
+ re = Regexp.new(Encodings["UTF-8"]::WORD_REGEXP_SRC, Regexp::MULTILINE)
185
+ encode("UTF-8").scan(re).map { |e| e.encode(encoding) }
186
+ end
185
187
 
186
- def split_to_word()
187
- encode('UTF-8').scan(Regexp.new(Encodings['UTF-8']::WORD_REGEXP_SRC,
188
- Regexp::MULTILINE)
189
- ).map{|e| e.encode(self.encoding)}
190
- end
188
+ def count_latin_word
189
+ re = Regexp.new("[#{Encodings["UTF-8"]::PRINT}]", Regexp::MULTILINE)
190
+ split_to_word.count { |word| re.match(word.encode("UTF-8")) }
191
+ end
191
192
 
192
- def count_latin_word()
193
- split_to_word.collect{|word|
194
- word if Regexp.new("[#{Encodings['UTF-8']::PRINT}]",
195
- Regexp::MULTILINE).match word.encode('UTF-8')
196
- }.compact.size
197
- end
193
+ def count_ja_word
194
+ re = Regexp.new("[#{Encodings["UTF-8"]::JA_PRINT}]", Regexp::MULTILINE)
195
+ split_to_word.count { |word| re.match(word.encode("UTF-8")) }
196
+ end
198
197
 
199
- def count_ja_word()
200
- split_to_word.collect{|word|
201
- word if Regexp.new("[#{Encodings['UTF-8']::JA_PRINT}]",
202
- Regexp::MULTILINE).match word.encode('UTF-8')
203
- }.compact.size
204
- end
198
+ def count_latin_valid_word
199
+ re = Regexp.new("[#{Encodings["UTF-8"]::ALNUM}]", Regexp::MULTILINE)
200
+ split_to_word.count { |word| re.match(word.encode("UTF-8")) }
201
+ end
205
202
 
206
- def count_latin_valid_word()
207
- split_to_word.collect{|word|
208
- word if Regexp.new("[#{Encodings['UTF-8']::ALNUM}]",
209
- Regexp::MULTILINE).match word.encode('UTF-8')
210
- }.compact.size
211
- end
203
+ def count_ja_valid_word
204
+ re = Regexp.new("[#{Encodings["UTF-8"]::JA_GRAPH}]", Regexp::MULTILINE)
205
+ split_to_word.count { |word| re.match(word.encode("UTF-8")) }
206
+ end
212
207
 
213
- def count_ja_valid_word()
214
- split_to_word.collect{|word|
215
- word if Regexp.new("[#{Encodings['UTF-8']::JA_GRAPH}]",
216
- Regexp::MULTILINE).match word.encode('UTF-8')
217
- }.compact.size
218
- end
208
+ def split_to_line
209
+ raise <<~EOS.chomp unless EOLChars[eol]
210
+ EOLChars[eol] is #{EOLChars[eol].inspect}: eol not specified or auto-detection failed.
211
+ EOS
212
+
213
+ re =
214
+ if defined? eol_char
215
+ Regexp.new(".*?#{eol_char}|.+", Regexp::MULTILINE)
216
+ else
217
+ Regexp.new(".+", Regexp::MULTILINE)
218
+ end
219
+ encode("UTF-8").scan(re).map { |e| e.encode(encoding) }
220
+ end
219
221
 
220
- def split_to_line()
221
- raise "EOLChars[eol] is #{EOLChars[eol].inspect}: eol not specified or auto-detection failed." unless EOLChars[eol]
222
- if defined? eol_char
223
- encode('UTF-8').scan(Regexp.new(".*?#{eol_char}|.+",
224
- Regexp::MULTILINE)
225
- ).map{|e| e.encode(self.encoding)}
226
- else
227
- encode('UTF-8').scan(Regexp.new(".+",
228
- Regexp::MULTILINE)
229
- ).map{|e| e.encode(self.encoding)}
222
+ def count_graph_line
223
+ graph = (Encodings["UTF-8"]::GRAPH + Encodings["UTF-8"]::JA_GRAPH).chars.uniq.join
224
+ re = Regexp.new("[#{Regexp.quote(graph)}]", Regexp::MULTILINE)
225
+ split_to_line.count { |line| re.match(line.encode("UTF-8")) }
230
226
  end
231
- end
232
227
 
233
- def count_graph_line()
234
- graph = (Encodings['UTF-8']::GRAPH +
235
- Encodings['UTF-8']::JA_GRAPH).chars.uniq.join
236
- re_graph = Regexp.new("[#{Regexp.quote(graph)}]", Regexp::MULTILINE)
237
- split_to_line.collect{|line|
238
- line if re_graph.match line.encode('UTF-8')
239
- }.compact.size
240
- end
228
+ def count_blank_line
229
+ blank = (Encodings["UTF-8"]::BLANK + Encodings["UTF-8"]::JA_BLANK).chars.uniq.join
230
+ re = Regexp.new("^[#{blank}]+(?:#{eol_char})?", Regexp::MULTILINE)
231
+ split_to_line.count { |line| re.match(line.encode("UTF-8")) }
232
+ end
241
233
 
242
- def count_blank_line()
243
- split_to_line.collect{|line|
244
- line if Regexp.new("^[#{Encodings['UTF-8']::BLANK}" +
245
- "#{Encodings['UTF-8']::JA_BLANK}]+(?:#{eol_char})?",
246
- Regexp::MULTILINE).match line.encode('UTF-8')
247
- }.compact.size
248
- end
234
+ # load encoding modules
235
+ require "docdiff/encoding/en_ascii"
236
+ require "docdiff/encoding/ja_eucjp"
237
+ require "docdiff/encoding/ja_sjis"
238
+ require "docdiff/encoding/ja_utf8"
239
+ alias_method :to_bytes, :split_to_byte
240
+ alias_method :to_chars, :split_to_char
241
+ alias_method :to_words, :split_to_word
242
+ alias_method :to_lines, :split_to_line
249
243
 
250
- # load encoding modules
251
- require 'docdiff/encoding/en_ascii'
252
- require 'docdiff/encoding/ja_eucjp'
253
- require 'docdiff/encoding/ja_sjis'
254
- require 'docdiff/encoding/ja_utf8'
255
- alias to_bytes split_to_byte
256
- alias to_chars split_to_char
257
- alias to_words split_to_word
258
- alias to_lines split_to_line
244
+ module CR
245
+ EOL = "CR"
259
246
 
260
- module CR
261
- EOL = 'CR'
247
+ def eol_char
248
+ "\r"
249
+ end
262
250
 
263
- def eol_char()
264
- "\r"
251
+ CharString.register_eol(self)
265
252
  end
266
253
 
267
- CharString.register_eol(self)
268
- end
254
+ module LF
255
+ EOL = "LF"
269
256
 
270
- module LF
271
- EOL = 'LF'
257
+ def eol_char
258
+ "\n"
259
+ end
272
260
 
273
- def eol_char()
274
- "\n"
261
+ CharString.register_eol(self)
275
262
  end
276
263
 
277
- CharString.register_eol(self)
278
- end
264
+ module CRLF
265
+ EOL = "CRLF"
279
266
 
280
- module CRLF
281
- EOL = 'CRLF'
267
+ def eol_char
268
+ "\r\n"
269
+ end
282
270
 
283
- def eol_char()
284
- "\r\n"
271
+ CharString.register_eol(self)
285
272
  end
286
273
 
287
- CharString.register_eol(self)
288
- end
274
+ module NoEOL
275
+ EOL = "NONE"
289
276
 
290
- module NoEOL
291
- EOL = 'NONE'
292
- def eol_char()
293
- nil
294
- end
277
+ def eol_char
278
+ nil
279
+ end
295
280
 
296
- CharString.register_eol(self)
281
+ CharString.register_eol(self)
282
+ end
297
283
  end
298
-
299
- end # module CharString
300
- end # class DocDiff
284
+ end
301
285
 
302
286
  # class String
303
287
  # include CharString