docdiff 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73) hide show
  1. data/.gitignore +6 -0
  2. data/.travis.yml +7 -0
  3. data/Gemfile +17 -0
  4. data/Guardfile +8 -0
  5. data/Makefile +108 -0
  6. data/Rakefile +17 -0
  7. data/bin/docdiff +179 -0
  8. data/devutil/JIS0208.TXT +6952 -0
  9. data/devutil/char_by_charclass.rb +23 -0
  10. data/devutil/charclass_by_char.rb +21 -0
  11. data/devutil/jis0208.rb +343 -0
  12. data/devutil/testjis0208.rb +38 -0
  13. data/docdiff.conf.example +22 -0
  14. data/docdiff.gemspec +23 -0
  15. data/docdiffwebui.cgi +176 -0
  16. data/docdiffwebui.html +123 -0
  17. data/img/docdiff-screenshot-format-html-digest-firefox.png +0 -0
  18. data/img/docdiff-screenshot-format-html-firefox.png +0 -0
  19. data/img/docdiff-screenshot-format-tty-cmdexe-en.png +0 -0
  20. data/img/docdiff-screenshot-format-tty-cmdexe-ja.png +0 -0
  21. data/img/docdiff-screenshot-format-tty-rxvtunicode-en.png +0 -0
  22. data/img/docdiff-screenshot-format-tty-rxvtunicode-ja.png +0 -0
  23. data/img/docdiff-screenshot-format-tty-xterm-en.png +0 -0
  24. data/img/docdiff-screenshot-format-tty-xterm-ja.png +0 -0
  25. data/img/docdiff-screenshot-resolution-linewordchar-xterm.png +0 -0
  26. data/index.html +181 -0
  27. data/langfilter.rb +14 -0
  28. data/lib/doc_diff.rb +170 -0
  29. data/lib/docdiff.rb +7 -0
  30. data/lib/docdiff/charstring.rb +579 -0
  31. data/lib/docdiff/diff.rb +217 -0
  32. data/lib/docdiff/diff/contours.rb +382 -0
  33. data/lib/docdiff/diff/editscript.rb +148 -0
  34. data/lib/docdiff/diff/rcsdiff.rb +107 -0
  35. data/lib/docdiff/diff/shortestpath.rb +93 -0
  36. data/lib/docdiff/diff/speculative.rb +40 -0
  37. data/lib/docdiff/diff/subsequence.rb +39 -0
  38. data/lib/docdiff/diff/unidiff.rb +124 -0
  39. data/lib/docdiff/difference.rb +92 -0
  40. data/lib/docdiff/document.rb +127 -0
  41. data/lib/docdiff/encoding/en_ascii.rb +97 -0
  42. data/lib/docdiff/encoding/ja_eucjp.rb +269 -0
  43. data/lib/docdiff/encoding/ja_sjis.rb +260 -0
  44. data/lib/docdiff/encoding/ja_utf8.rb +6974 -0
  45. data/lib/docdiff/version.rb +3 -0
  46. data/lib/docdiff/view.rb +476 -0
  47. data/lib/viewdiff.rb +375 -0
  48. data/readme.html +713 -0
  49. data/sample/01.en.ascii.cr +1 -0
  50. data/sample/01.en.ascii.crlf +2 -0
  51. data/sample/01.en.ascii.lf +2 -0
  52. data/sample/01.ja.eucjp.lf +2 -0
  53. data/sample/01.ja.sjis.cr +1 -0
  54. data/sample/01.ja.sjis.crlf +2 -0
  55. data/sample/01.ja.utf8.crlf +2 -0
  56. data/sample/02.en.ascii.cr +1 -0
  57. data/sample/02.en.ascii.crlf +2 -0
  58. data/sample/02.en.ascii.lf +2 -0
  59. data/sample/02.ja.eucjp.lf +2 -0
  60. data/sample/02.ja.sjis.cr +1 -0
  61. data/sample/02.ja.sjis.crlf +2 -0
  62. data/sample/02.ja.utf8.crlf +2 -0
  63. data/sample/humpty_dumpty01.ascii.lf +4 -0
  64. data/sample/humpty_dumpty02.ascii.lf +4 -0
  65. data/test/charstring_test.rb +1008 -0
  66. data/test/diff_test.rb +36 -0
  67. data/test/difference_test.rb +64 -0
  68. data/test/docdiff_test.rb +193 -0
  69. data/test/document_test.rb +626 -0
  70. data/test/test_helper.rb +7 -0
  71. data/test/view_test.rb +570 -0
  72. data/test/viewdiff_test.rb +908 -0
  73. metadata +129 -0
@@ -0,0 +1,148 @@
1
+ require 'docdiff/diff/rcsdiff'
2
+ require 'docdiff/diff/unidiff'
3
+
4
class Diff
  # An edit script: an ordered list of chunks describing how sequence A is
  # turned into sequence B.
  #
  # Every entry yielded by #each is a triple [mark, del, add]:
  #   :del_elt / :del_num   -- del holds the deleted elements / their count
  #   :add_elt / :add_num   -- add holds the added elements / their count
  #   :common_X_Y           -- del is the A side, add is the B side; X and Y
  #                            are "elt" (an Array of elements) or "num"
  #                            (only the count is known)
  class EditScript
    def initialize
      @chunk_common = nil
      @chunk_add = []
      @chunk_del = []
      @list = []
      @list << @chunk_del
      @list << @chunk_add

      @cs = Subsequence.new
      @count_a = 0
      @count_b = 0
      @additions = 0
      @deletions = 0
    end

    # count_a / count_b: number of elements of A / B covered so far.
    # additions / deletions: number of added / deleted elements so far.
    attr_reader :count_a, :additions
    attr_reader :count_b, :deletions

    # Returns the Subsequence that collects every common run passed to #common.
    def commonsubsequence
      return @cs
    end

    # Records a deletion from A.
    # seq_or_len is either the Array of deleted elements or just their count.
    def del(seq_or_len)
      open_change_chunks unless @chunk_del
      if Array === seq_or_len
        len = seq_or_len.length
        mark = :del_elt
      else
        len = seq_or_len
        mark = :del_num
      end
      # Consecutive deletions of the same kind are merged into one entry.
      if !@chunk_del.empty? && @chunk_del.last[0] == mark
        @chunk_del.last[1] += seq_or_len
      else
        @chunk_del << [mark, seq_or_len, nil]
      end
      @count_a += len
      @deletions += len
    end

    # Records an addition to B.
    # seq_or_len is either the Array of added elements or just their count.
    def add(seq_or_len)
      open_change_chunks unless @chunk_add
      if Array === seq_or_len
        len = seq_or_len.length
        mark = :add_elt
      else
        len = seq_or_len
        mark = :add_num
      end
      # Consecutive additions of the same kind are merged into one entry.
      if !@chunk_add.empty? && @chunk_add.last[0] == mark
        @chunk_add.last[2] += seq_or_len
      else
        @chunk_add << [mark, nil, seq_or_len]
      end
      @count_b += len
      @additions += len
    end

    # Records a run common to A and B. Each side may be given as the Array of
    # elements or as a count; both sides must describe the same length.
    # Raises ArgumentError when the two lengths differ.
    def common(seq_or_len_a, seq_or_len_b=seq_or_len_a)
      unless @chunk_common
        # Close the pending change: drop the provisional del/add chunks from
        # the list and put back only the ones that actually got entries.
        @list.pop
        @list.pop
        @list << @chunk_del unless @chunk_del.empty?
        @list << @chunk_add unless @chunk_add.empty?
        @chunk_add = nil
        @chunk_del = nil
        @chunk_common = []
        @list << @chunk_common
      end

      len_a = Array === seq_or_len_a ? seq_or_len_a.length : seq_or_len_a
      len_b = Array === seq_or_len_b ? seq_or_len_b.length : seq_or_len_b
      raise ArgumentError.new("length not equal: #{len_a} != #{len_b}") if len_a != len_b
      len = len_a

      mark = ((Array === seq_or_len_a) ?
              (Array === seq_or_len_b ? :common_elt_elt : :common_elt_num) :
              (Array === seq_or_len_b ? :common_num_elt : :common_num_num))

      if !@chunk_common.empty? && @chunk_common.last[0] == mark
        @chunk_common.last[1] += seq_or_len_a
        @chunk_common.last[2] += seq_or_len_b
      else
        @chunk_common << [mark, seq_or_len_a, seq_or_len_b]
      end

      @cs.add @count_a, @count_b, len
      @count_a += len
      @count_b += len
    end

    # Yields every [mark, del, add] entry in script order.
    def each
      @list.each {|chunk|
        chunk.each {|mark_del_add|
          yield mark_del_add
        }
      }
    end

    # Applies the script to src (sequence A) and returns the resulting Array.
    # Elements of src located after the last scripted position are copied as is.
    # Raises ArgumentError when an addition is known only by its count.
    def apply(src)
      l = 0 # index of the next unread element of src
      dst = []
      each {|mark, del, add|
        case mark
        when :add_elt
          dst.concat add
        when :add_num
          raise ArgumentError.new("additionnal lines are not known.")
        when :common_elt_elt
          dst.concat add
          l += del.length
        when :common_elt_num
          # del carries the common elements themselves; add is only a count.
          dst.concat del
          l += del.length
        when :common_num_elt
          # del is only a count; add carries the common elements.
          dst.concat add
          l += del
        when :common_num_num
          dst.concat src[l, del]
          l += del
        when :del_elt
          l += del.length
        when :del_num
          l += del
        end
      }
      dst.concat src[l..-1]
      return dst
    end

    private

    # Starts a fresh pair of del/add chunks after a common run.
    def open_change_chunks
      @chunk_add = []
      @chunk_del = []
      @chunk_common = nil
      @list << @chunk_del
      @list << @chunk_add
    end
  end
end
@@ -0,0 +1,107 @@
1
class Diff
  # Splits a and b into lines and returns their difference as an RCS-format
  # diff string (see EditScript#rcsdiff).
  def Diff.rcsdiff(a, b)
    al = []
    a.each_line {|l| al << l}
    bl = []
    b.each_line {|l| bl << l}
    return Diff.new(al, bl).ses.rcsdiff
  end

  class EditScript
    # Builds an EditScript from RCS diff text.
    #
    # "d<N> <M>" deletes M source lines starting at line N.
    # "a<N> <M>" adds the following M lines after source line N (this is the
    # form #rcsdiff emits: "a#{l - 1}", where l is the next unread source line).
    # Untouched source lines lying before either command are recorded as
    # common, so additions keep their position.
    def EditScript.parse_rcsdiff(input)
      ses = EditScript.new
      l = 1 # next source line (1-based) not yet covered by the script
      scan_rcsdiff(input) {|mark, beg, len, lines|
        if mark == :del
          ses.common beg - l if l < beg
          ses.del len
          l = beg + len
        else
          # Skip over source lines l..beg before inserting after line beg.
          if l <= beg
            ses.common beg + 1 - l
            l = beg + 1
          end
          ses.add lines
        end
      }
      return ses
    end

    # Tokenizes RCS diff text. Yields (:del, beg, len, nil) for every delete
    # command and (:add, beg, len, lines) for every add command once all of
    # its len lines have been read.
    # Raises InvalidRCSDiffFormat on a line that is not a valid command.
    def EditScript.scan_rcsdiff(input)
      state = :command
      beg = len = nil
      adds = nil
      input.each_line("\n") {|line|
        case state
        when :command
          case line
          when /\Aa(\d+)\s+(\d+)/
            beg = $1.to_i
            len = $2.to_i
            adds = []
            state = :add
          when /\Ad(\d+)\s+(\d+)/
            beg = $1.to_i
            len = $2.to_i
            yield :del, beg, len, nil
            state = :command
          else
            raise InvalidRCSDiffFormat.new(line)
          end
        when :add
          adds << line
          if adds.length == len
            yield :add, beg, len, adds
            adds = nil
            state = :command
          end
        else
          raise StandardError.new("unknown state")
        end
      }
    end

    # Appends this script in RCS diff format to out and returns out.
    # Raises ArgumentError when added lines are known only by count, when an
    # added element is not a single line, or when anything follows an added
    # line that lacks its trailing newline.
    def rcsdiff(out='')
      state = :lines
      l = 1 # next source line (1-based)
      each {|mark, del, add|
        case mark
        when :add_elt
          out << "a#{l - 1} #{add.length}\n"
          add.each {|line|
            case state
            when :lines
              case line
              when /\A.*\n\z/
                # complete line: nothing to track
              when /\A.*\z/
                # a line without newline must be the very last one
                state = :after_last_line
              else
                raise ArgumentError.new("additional element is not line")
              end
            when :after_last_line
              raise ArgumentError.new("additional elements after last incomplete line")
            end
            out << line
          }
        when :add_num
          raise ArgumentError.new("additionnal lines are not known.")
        when :common_elt_elt
          l += del.length
        when :common_elt_num
          l += add
        when :common_num_elt
          l += del
        when :common_num_num
          l += del
        when :del_elt
          del = del.length
          out << "d#{l} #{del}\n"
          l += del
        when :del_num
          out << "d#{l} #{del}\n"
          l += del
        end
      }
      return out
    end

    # Raised by scan_rcsdiff for a line that is neither an "a" nor a "d" command.
    class InvalidRCSDiffFormat < StandardError
    end
  end
end
@@ -0,0 +1,93 @@
1
+ =begin
2
+ Diff::ShortestPath uses the algorithm described in following paper.
3
+
4
+ [Wu1990] Sun Wu, Udi Manber, Gene Myers and Webb Miller,
5
+ An O(NP) Sequence Comparison Algorithm,
6
+ Information Processing Letters 35, 1990, 317-323
7
+ =end
8
+
9
class Diff
  # Computes a longest common subsequence of two sequences with the
  # furthest-point search referenced in this file's header comment.
  class ShortestPath
    # a, b: sequences responding to #length and #[].
    # The shorter one is always used as @a; @exchanged remembers a swap so
    # that results can be reported in the caller's original orientation.
    def initialize(a, b)
      if a.length > b.length
        @a = b
        @b = a
        @exchanged = true
      else
        @a = a
        @b = b
        @exchanged = false
      end
      @m = @a.length
      @n = @b.length
    end

    # Adds every common run as (index_in_a, index_in_b, length) to lcs, in
    # increasing order, and returns lcs. Indices refer to the a and b given
    # to the constructor.
    def lcs(lcs=Subsequence.new)
      d = @n - @m
      # fp[fp_base+k] is the furthest y reached on diagonal k (k = y - x).
      # Indices may be negative and then address the array from its end.
      fp = Array.new(@n+1+@m+1+1, -1)
      fp_base = -(@m+1)
      # path[fp_base+k] is a linked list [x, y, run_length, previous] of the
      # common runs on the way to that furthest point.
      path = Array.new(fp.length)
      band = -1
      loop do
        # Each round widens the range of examined diagonals by one on both
        # sides of 0..d; diagonal d itself is always handled last.
        band += 1
        (-band).upto(d-1) {|k| extend_diagonal(fp, path, fp_base, k)}
        (d+band).downto(d+1) {|k| extend_diagonal(fp, path, fp_base, k)}
        extend_diagonal(fp, path, fp_base, d)
        break if fp[fp_base+d] == @n
      end

      shortest_path = path[fp_base+d]
      list = []
      while shortest_path
        x, y, l, shortest_path = shortest_path
        list << [x - l, y - l, l]
      end
      # Undo the swap done in initialize.
      list.each {|xyl| xyl[0], xyl[1] = xyl[1], xyl[0]} if @exchanged
      # The linked list runs from the end to the start, hence reverse_each.
      list.reverse_each {|xyl| lcs.add(*xyl)}
      return lcs
    end

    # Follows diagonal k from row y while elements match; returns the y reached.
    def snake(k, y)
      x = y - k
      while x < @m && y < @n && @a[x] == @b[y]
        x += 1
        y += 1
      end
      return y
    end

    private

    # Advances diagonal k: starts from the better of its two neighbouring
    # diagonals, slides along matching elements, then stores the furthest
    # point in fp and the run list in path.
    def extend_diagonal(fp, path, fp_base, k)
      from_lower = fp[fp_base+k-1] + 1
      from_upper = fp[fp_base+k+1]
      if from_lower < from_upper
        start = from_upper
        previous = path[fp_base+k+1]
      else
        start = from_lower
        previous = path[fp_base+k-1]
      end
      y = fp[fp_base+k] = snake(k, start)
      # Only a non-empty slide contributes a common run.
      path[fp_base+k] = start < y ? [y - k, y, y - start, previous] : previous
    end
  end
end
@@ -0,0 +1,40 @@
1
+ require 'docdiff/diff/shortestpath'
2
+ require 'docdiff/diff/contours'
3
+ require 'thread'
4
+
5
class Diff
  # Runs ShortestPath and Contours concurrently on the same input and returns
  # the LCS of whichever finishes first.
  class Speculative
    def initialize(a, b)
      @a = a
      @b = b
    end

    # Returns the result of ShortestPath#lcs or Contours#lcs for (@a, @b).
    # An exception raised inside a worker is re-raised here by Thread#join.
    def lcs
      # Try speculative execution.
      result = nil

      # Serializes the "publish result and kill the competitor" step.
      # A Mutex is used in place of Thread.exclusive, which is not available
      # on current Ruby versions.
      finish = Mutex.new
      tg = ThreadGroup.new

      # Since ShortestPath is faster than Contours if two sequences are very
      # similar, try it first.
      tg.add(Thread.new {
        found = ShortestPath.new(@a, @b).lcs
        finish.synchronize {
          result = found
          tg.list.each {|t| t.kill if t != Thread.current}
        }
      })

      # Start Contours unless ShortestPath has already ended within its first
      # quantum (ThreadGroup#list is then empty).
      tg.add(Thread.new {
        found = Contours.new(@a, @b).lcs
        finish.synchronize {
          result = found
          tg.list.each {|t| t.kill if t != Thread.current}
        }
      }) unless tg.list.empty?

      tg.list.each {|t| t.join}

      return result
    end
  end
end
@@ -0,0 +1,39 @@
1
class Diff
  # A common subsequence stored as an ordered list of runs [i, j, len]:
  # len consecutive elements starting at index i of one sequence and at
  # index j of the other.
  class Subsequence
    # #each yields the runs, so the usual Enumerable helpers (to_a, map, ...)
    # are available on top of it.
    include Enumerable

    def initialize
      @list = []
    end

    # Appends the run (i, j, len). A run that directly continues the previous
    # one on both sides is merged into it.
    # Raises ArgumentError when len is not positive or when the run starts
    # before the end of the previous run on either side.
    def add(i, j, len=1)
      raise ArgumentError.new("non-positive length: #{len}") if len <= 0

      if @list.empty?
        @list << [i, j, len]
        return
      end

      i0, j0, len0 = @list.last

      if i0 + len0 == i && j0 + len0 == j
        @list.last[2] += len
        return
      end

      if i0 + len0 > i || j0 + len0 > j
        raise ArgumentError.new("additional common sequence overlapped.")
      end

      @list << [i, j, len]
    end

    # Yields every run as (i, j, len) in insertion order.
    def each(&block)
      @list.each(&block)
    end

    # Total number of elements covered by all runs.
    def length
      @list.inject(0) {|total, run| total + run[2]}
    end
  end
end
@@ -0,0 +1,124 @@
1
class Diff
  # Splits a and b into lines and returns their difference as a unified-diff
  # string. algorithm is passed through to Diff#ses.
  def Diff.unidiff(a, b, algorithm=nil)
    al = []
    a.each_line {|l| al << l}
    bl = []
    b.each_line {|l| bl << l}
    return Diff.new(al, bl).ses(algorithm).unidiff
  end

  class EditScript
    # Builds a "@@ -l1,ll1 +l2,ll2 @@" header line.
    # l1 / l2: first line of the hunk in the old / new file.
    # ll1 / ll2: number of old / new lines in the hunk. A count of 1 is
    # omitted, and the start line is written as 0 when the count is 0.
    def unidiff_hunk_header(l1, ll1, l2, ll2)
      l1 = 0 if ll1 == 0
      l2 = 0 if ll2 == 0
      result = "@@ -#{l1}"
      result << ",#{ll1}" if ll1 != 1
      result << " +#{l2}"
      result << ",#{ll2}" if ll2 != 1
      result << " @@\n"
    end

    # Appends this script as unified-diff hunks to out and returns out.
    # context_lines is the number of unchanged lines kept around each change.
    # Every entry of the script must carry the actual lines; an entry known
    # only by its count raises ArgumentError.
    # A last line without trailing newline gets "\n\\n" appended.
    def unidiff(out='', context_lines=3)
      l1 = l2 = 1           # next line number in the old / new file
      hunk = []             # pending output lines of the current hunk
      hunk_l1 = hunk_l2 = 1 # old / new line number of hunk[0]
      hunk_tail = 0         # number of trailing context lines in hunk
      each {|mark, del, add|
        case mark
        when :add_elt
          add.each {|line| hunk << '+' + line}
          hunk[-1] += "\n\\n" if /\n\z/ !~ hunk[-1]
          l2 += add.length
          hunk_tail = 0
        when :add_num
          raise ArgumentError.new("additionnal lines are not known.")
        when :common_elt_elt
          if hunk_tail + add.length <= context_lines * 2
            # Short enough to serve as context joining two changes.
            add.each {|line| hunk << ' ' + line}
            hunk[-1] += "\n\\n" if /\n\z/ !~ hunk[-1]
            l1 += add.length
            l2 += add.length
            hunk_tail += add.length
          else
            i = 0
            # hunk_tail == hunk.length means the hunk holds context only,
            # i.e. there is no change to flush yet.
            if hunk_tail != hunk.length
              # Complete the trailing context, then emit the hunk.
              while hunk_tail < context_lines
                hunk << ' ' + add[i]
                l1 += 1
                l2 += 1
                hunk_tail += 1
                i += 1
              end
              hunk[-1] += "\n\\n" if /\n\z/ !~ hunk[-1]

              # Each length is measured against the start line of its own file.
              out << unidiff_hunk_header(hunk_l1, l1 - hunk_l1, hunk_l2, l2 - hunk_l2)
              h = hunk.length - (hunk_tail - context_lines)
              (0...h).each {|j| out << hunk[j]}
              hunk[0, h] = []
            end

            l1 += add.length - i
            l2 += add.length - i

            # Keep the last context_lines lines as leading context of the
            # next hunk.
            hunk_l1 = l1 - context_lines
            hunk_l2 = l2 - context_lines
            hunk = add[-context_lines..-1].collect {|line| ' ' + line}
            hunk[-1] += "\n\\n" if /\n\z/ !~ hunk[-1]
            hunk_tail = context_lines
          end
        when :common_elt_num
          raise ArgumentError.new("deleted lines are not known.")
        when :common_num_elt
          raise ArgumentError.new("additional lines are not known.")
        when :common_num_num
          raise ArgumentError.new("deleted and additional lines are not known.")
        when :del_elt
          # A context-only hunk longer than context_lines is trimmed from the
          # front before the first change is appended.
          if hunk_tail == hunk.length && context_lines < hunk_tail
            i = hunk_tail - context_lines
            hunk[0, i] = []
            hunk_l1 += i
            hunk_l2 += i
          end
          del.each {|line| hunk << '-' + line}
          hunk[-1] += "\n\\n" if /\n\z/ !~ hunk[-1]
          l1 += del.length
          hunk_tail = 0
        when :del_num
          raise ArgumentError.new("deleted lines are not known.")
        end
      }
      # Flush the last hunk if it contains a change.
      if hunk_tail != hunk.length
        if context_lines < hunk_tail
          # Drop surplus trailing context.
          i = hunk_tail - context_lines
          hunk[-i..-1] = []
          l1 -= i
          l2 -= i
        end
        out << unidiff_hunk_header(hunk_l1, l1 - hunk_l1, hunk_l2, l2 - hunk_l2)
        hunk.each {|line| out << line}
      end
      return out
    end
  end
end