dimapa 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/Gemfile +13 -0
- data/LICENSE +23 -0
- data/README.md +78 -0
- data/Rakefile +22 -0
- data/lib/diff_methods.rb +132 -0
- data/lib/dimapa.rb +1522 -0
- data/lib/patch_obj.rb +54 -0
- data/scripts/speedtest.rb +13 -0
- data/scripts/speedtest/speedtest1.txt +230 -0
- data/scripts/speedtest/speedtest2.txt +188 -0
- data/test/helper.rb +1 -0
- data/test/test_dimapa.rb +1196 -0
- metadata +98 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 5e87b2a963101ab22b8fb368d0956670c8f22a09691da2931ab28574f4d700fd
|
4
|
+
data.tar.gz: c94b5fa761a33875cd96db4afadc9aabfc87f0e3a3da249f6a59ce9659685f5d
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: de8c3501e51d0fecfb0697e6120b630b6b3c46ee1e0f52ee9343c07db4e3e2a1667bb690407557726c0563ff712000d0ed2319aeda0eb5c3cf990bb98eb7a1bd
|
7
|
+
data.tar.gz: 6f638d15129dc34dda5d766796dda129340b981e06886c277ec2583044dbbdf34ddd70df51ce600d26ac4797b2cc4d2a9eeaee31377fa9e8b02360f1c650ea2f
|
data/Gemfile
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
Copyright (c) 2011, Jorge Kalmbach <kalmbach.at.gmail.com>
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any
|
4
|
+
person obtaining a copy of this software and associated
|
5
|
+
documentation files (the "Software"), to deal in the
|
6
|
+
Software without restriction, including without limitation
|
7
|
+
the rights to use, copy, modify, merge, publish,
|
8
|
+
distribute, sublicense, and/or sell copies of the
|
9
|
+
Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice
|
13
|
+
shall be included in all copies or substantial portions of
|
14
|
+
the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
|
17
|
+
KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
|
18
|
+
WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
|
19
|
+
PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS
|
20
|
+
OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
21
|
+
OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
|
22
|
+
OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
23
|
+
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,78 @@
|
|
1
|
+
# DiMaPa (Diff Match and Patch)
|
2
|
+
A modern Ruby implementation of Google's [Diff Match and Patch][google]
|
3
|
+
libraries.
|
4
|
+
|
5
|
+
> The Diff Match and Patch libraries offer robust algorithms to perform the
|
6
|
+
> operations required for synchronizing plain text.
|
7
|
+
|
8
|
+
## Usage
|
9
|
+
```ruby
|
10
|
+
require 'dimapa'
|
11
|
+
|
12
|
+
dmp = DiMaPa.new # or DiffMatchPatch
|
13
|
+
|
14
|
+
diff = dmp.diff_main("This is a sentence.", "This is also a sentence.")
|
15
|
+
#=> [[:equal, "This is a"], [:insert, "lso a"], [:equal, " sentence."]]
|
16
|
+
|
17
|
+
dmp.diff_cleanup_semantic(diff)
|
18
|
+
#=> nil
|
19
|
+
|
20
|
+
# diff is modified in place
|
21
|
+
diff
|
22
|
+
#=> [[:equal, "This is "], [:insert, "also "], [:equal, "a sentence."]]
|
23
|
+
|
24
|
+
patch = dmp.patch_make(diff)
|
25
|
+
#=> [#<PatchObj:0x00005608e6ac9500 @diffs=
|
26
|
+
# [[:equal, "This is "], [:insert, "also "], [:equal, "a senten"]],
|
27
|
+
# @length1=16,
|
28
|
+
# @length2=21,
|
29
|
+
# @start1=0,
|
30
|
+
# @start2=0>]
|
31
|
+
|
32
|
+
dmp.patch_to_text(patch)
|
33
|
+
#=> "@@ -1,16 +1,21 @@\n This is \n+also \n a senten\n"
|
34
|
+
|
35
|
+
dmp.patch_apply(patch, "This is a sentence.")
|
36
|
+
#=> ["This is also a sentence.", [true]]
|
37
|
+
```
|
38
|
+
|
39
|
+
## Installation
|
40
|
+
```sh
|
41
|
+
# RubyGem
|
42
|
+
gem install dimapa
|
43
|
+
|
44
|
+
# From source
|
45
|
+
bundle install
|
46
|
+
bundle exec rake install
|
47
|
+
```
|
48
|
+
|
49
|
+
## Benchmarks
|
50
|
+
|
51
|
+
This project includes [speedtests](scripts/) mirroring those in the official
|
52
|
+
project. Performance is on par with those reported for [Lua and Python][speedtest]
|
53
|
+
albeit run on a faster machine.
|
54
|
+
|
55
|
+
```
|
56
|
+
$ rake speedtest
|
57
|
+
|
58
|
+
user system total real
|
59
|
+
diff(t2,t1) 13.658214 0.003937 13.662151 ( 13.662453)
|
60
|
+
diff(t1,t2) 14.074079 0.000001 14.074080 ( 14.074350)
|
61
|
+
```
|
62
|
+
|
63
|
+
## Tests and Linting
|
64
|
+
|
65
|
+
```sh
|
66
|
+
bundle exec rake
|
67
|
+
```
|
68
|
+
|
69
|
+
### Fork of [kalmbach/diff_match_patch][kalmbach] by way of [DavidMikeSimon/diff_match_patch][davidmikesimon]
|
70
|
+
Copyright (c) 2011, Jorge Kalmbach <kalmbach.at.gmail.com>
|
71
|
+
|
72
|
+
Work was inspired by the [reima/diff_match_patch-ruby][reima] module.
|
73
|
+
|
74
|
+
[speedtest]: https://docs.google.com/spreadsheets/d/1zpZccuBpjMZTvL1nGDMKJc7rWL_m_drF4XKOJvB27Kc/edit#gid=0
|
75
|
+
[kalmbach]: https://github.com/kalmbach/diff_match_patch
|
76
|
+
[davidmikesimon]: https://github.com/DavidMikeSimon/diff_match_patch
|
77
|
+
[reima]: https://github.com/reima/diff_match_patch-ruby
|
78
|
+
[google]: https://github.com/google/diff-match-patch
|
data/Rakefile
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
require "rake/testtask"
|
2
|
+
require "standard/rake"
|
3
|
+
require "bundler/gem_tasks"
|
4
|
+
|
5
|
+
Rake::TestTask.new do |t|
|
6
|
+
t.libs << "test"
|
7
|
+
end
|
8
|
+
|
9
|
+
desc "Run benchmarking speedtest"
|
10
|
+
task :speedtest do
|
11
|
+
ruby "scripts/speedtest.rb"
|
12
|
+
end
|
13
|
+
|
14
|
+
desc "Start REPL"
|
15
|
+
task :console do
|
16
|
+
require "pry"
|
17
|
+
require "dimapa"
|
18
|
+
Pry.start
|
19
|
+
end
|
20
|
+
|
21
|
+
desc "Run tests and linter"
|
22
|
+
task default: [:standard, :test]
|
data/lib/diff_methods.rb
ADDED
@@ -0,0 +1,132 @@
|
|
1
|
+
module DiffMethods
  # Mixin holding the diff entry point (diff_main) and the dispatch logic that
  # picks a diff strategy.
  #
  # The including class must provide the helpers that are called here but not
  # defined in this module: diff_common_prefix, diff_common_suffix,
  # diff_cleanup_merge, diff_half_match, diff_line_mode and diff_bisect.

  # Number of seconds added to Time.now when no timeout applies.
  FIXNUM_MAX = 2**(0.size * 8 - 2) - 1

  # Number of seconds to map a diff before giving up (0 or less for infinity).
  attr_accessor :diff_timeout

  def initialize
    @diff_timeout = 1
  end

  # Find the differences between two texts. Simplifies the problem by
  # stripping any common prefix or suffix off the texts before editing.
  #
  # text1      - old String.
  # text2      - new String.
  # checklines - passed through to diff_compute; when true, texts longer than
  #              100 characters are diffed in line mode first.
  # deadline   - Time by which the diff must be complete; computed from
  #              diff_timeout when nil.
  #
  # Returns an Array of [operation, text] pairs where operation is :equal,
  # :insert or :delete.
  # Raises ArgumentError if either text is nil.
  def diff_main(text1, text2, checklines = true, deadline = nil)
    # Reject a nil on either side; checking only "both nil" would let a single
    # nil through and fail later with an unrelated NoMethodError.
    raise ArgumentError, "Null inputs. (diff_main)" if text1.nil? || text2.nil?

    # Set a deadline by which time the diff must be complete.
    deadline ||= diff_new_deadline

    # Check for equality (speedup).
    return (text1.empty? ? [] : [[:equal, text1]]) if text1 == text2

    diff_main_compute_diff(text1, text2, checklines, deadline)
  end

  # Diff two texts that are known to differ: trim the shared prefix and
  # suffix, diff the middle, then put the shared parts back as equalities.
  def diff_main_compute_diff(text1, text2, checklines, deadline)
    # Trim off common prefix and suffix (speedup).
    common_prefix, text1, text2 = diff_trim_common_prefix(text1, text2)
    common_suffix, text1, text2 = diff_trim_common_suffix(text1, text2)

    # Compute the diff on the middle block.
    diffs = diff_compute(text1, text2, checklines, deadline)

    # Restore the prefix and suffix (nil means there was none).
    diffs.unshift([:equal, common_prefix]) unless common_prefix.nil?
    diffs.push([:equal, common_suffix]) unless common_suffix.nil?
    diff_cleanup_merge(diffs)

    diffs
  end

  private :diff_main_compute_diff

  # Calculate a new deadline using the diff_timeout configuration value.
  # A timeout of zero or less means "no limit", matching the check made by
  # diff_half_match; a negative value therefore no longer yields a deadline
  # that has already passed.
  def diff_new_deadline
    Time.now + (diff_timeout <= 0 ? FIXNUM_MAX : diff_timeout)
  end

  private :diff_new_deadline

  # Trim off the common prefix.
  #
  # Returns [common_prefix, rest_of_text1, rest_of_text2]; common_prefix is
  # nil when the texts share no prefix.
  def diff_trim_common_prefix(text1, text2)
    common_length = diff_common_prefix(text1, text2)
    return [nil, text1, text2] if common_length.zero?

    [text1[0...common_length], text1[common_length..-1], text2[common_length..-1]]
  end

  private :diff_trim_common_prefix

  # Trim off the common suffix.
  #
  # Returns [common_suffix, rest_of_text1, rest_of_text2]; common_suffix is
  # nil when the texts share no suffix.
  def diff_trim_common_suffix(text1, text2)
    common_length = diff_common_suffix(text1, text2)
    return [nil, text1, text2] if common_length.zero?

    [text1[-common_length..-1], text1[0...-common_length], text2[0...-common_length]]
  end

  private :diff_trim_common_suffix

  # Find the differences between two texts. Assumes that the texts do not
  # have any common prefix or suffix.
  #
  # Strategies are tried in order: trivial cases, half-match split, line mode
  # (only when checklines is set and both texts exceed 100 characters), and
  # finally bisection.
  def diff_compute(text1, text2, checklines, deadline)
    diffs = diff_compute_common_cases(text1, text2)
    return diffs if diffs

    diffs = diff_compute_half_match(text1, text2, checklines, deadline)
    return diffs if diffs

    if checklines && text1.length > 100 && text2.length > 100
      diff_line_mode(text1, text2, deadline)
    else
      diff_bisect(text1, text2, deadline)
    end
  end

  # Split the problem around a half-match, if diff_half_match finds one, and
  # diff both halves separately.
  #
  # Returns the combined diff, or nil when there is no half-match.
  def diff_compute_half_match(text1, text2, checklines, deadline)
    hm = diff_half_match(text1, text2)
    return unless hm

    # A half-match was found, sort out the return data.
    text1_a, text1_b, text2_a, text2_b, mid_common = hm

    # Send both pairs off for separate processing.
    diffs_a = diff_main(text1_a, text2_a, checklines, deadline)
    diffs_b = diff_main(text1_b, text2_b, checklines, deadline)

    # Merge the results.
    diffs_a + [[:equal, mid_common]] + diffs_b
  end

  private :diff_compute_half_match

  # Handle the cases that need no real diffing.
  #
  # Returns a diff Array, or nil when none of the shortcuts applies.
  def diff_compute_common_cases(text1, text2)
    # Just add some text (speedup).
    return [[:insert, text2]] if text1.empty?

    # Just delete some text (speedup).
    return [[:delete, text1]] if text2.empty?

    # Choose long/short with an explicit comparison so that equal-length
    # inputs do not depend on the ordering a sort happens to produce.
    text1_longer = text1.length > text2.length
    long, short = text1_longer ? [text1, text2] : [text2, text1]

    # Shorter text is inside the longer text (speedup).
    if (i = long.index(short))
      op = text1_longer ? :delete : :insert
      [[op, long[0...i]], [:equal, short], [op, long[(i + short.length)..-1]]]

    # Single character string.
    elsif short.length == 1
      # After the previous speedup, the character can't be an equality.
      [[:delete, text1], [:insert, text2]]
    end
  end

  private :diff_compute_common_cases
end
|
data/lib/dimapa.rb
ADDED
@@ -0,0 +1,1522 @@
|
|
1
|
+
require "diff_methods"
|
2
|
+
require "patch_obj"
|
3
|
+
|
4
|
+
# Class containing the diff, match and patch methods.
|
5
|
+
# Also contains the behaviour settings.
|
6
|
+
class DiMaPa
|
7
|
+
include DiffMethods
|
8
|
+
|
9
|
+
attr_accessor :diff_edit_cost
|
10
|
+
attr_accessor :match_threshold
|
11
|
+
attr_accessor :match_distance
|
12
|
+
attr_accessor :patch_delete_threshold
|
13
|
+
attr_accessor :patch_margin
|
14
|
+
attr_reader :match_max_bits
|
15
|
+
|
16
|
+
def initialize
  # Set up the default behaviour settings. Each value can be changed
  # afterwards through the accessor of the same name (match_max_bits is
  # read-only).

  # The number of bits in an int. Ruby integers have no fixed maximum, so
  # patch splitting can be disabled by using 0; 32 avoids long patches in
  # certain pathological cases. Multiple short patches (using native ints)
  # are much faster than long ones.
  @match_max_bits = 32

  # Chunk size for context length.
  @patch_margin = 4

  # When deleting a large block of text (over ~64 characters), how close does
  # the contents have to match the expected contents (0.0 = perfection,
  # 1.0 = very loose). Note that match_threshold controls how closely the
  # end points of a delete need to match.
  @patch_delete_threshold = 0.5

  # How far to search for a match (0 = exact location, 1000+ = broad match).
  # A match this many characters away from the expected location will add
  # 1.0 to the score (0.0 is a perfect match).
  @match_distance = 1000

  # At what point is no match declared (0.0 = perfection, 1.0 = very loose).
  @match_threshold = 0.5

  # Cost of an empty edit operation in terms of edit characters.
  @diff_edit_cost = 4

  # Let the next initialize in the ancestor chain set its own defaults.
  super()
end
|
43
|
+
|
44
|
+
# Do a quick line-level diff on both strings, then rediff the parts for
|
45
|
+
# greater accuracy.
|
46
|
+
# This speedup can produce non-minimal diffs.
|
47
|
+
def diff_line_mode(text1, text2, deadline)
  # text1, text2 - Strings to compare.
  # deadline     - Time passed on to every nested diff_main call.
  #
  # Returns the diff Array after the character-level refinement pass.

  # Diff whole lines first: every distinct line is stood in for by one character.
  encoded1, encoded2, line_array = diff_lines_to_chars(text1, text2)
  diffs = diff_main(encoded1, encoded2, false, deadline)

  # Expand the placeholders back into real lines, then drop freak matches
  # (e.g. blank lines).
  diff_chars_to_lines(diffs, line_array)
  diff_cleanup_semantic(diffs)

  # A trailing dummy equality makes the loop below flush the final run of edits.
  diffs.push([:equal, ""])

  cursor = 0
  deletes = 0
  inserts = 0
  removed = ""
  added = ""

  while cursor < diffs.length
    op, data = diffs[cursor]
    case op
    when :insert
      inserts += 1
      added += data
    when :delete
      deletes += 1
      removed += data
    when :equal
      # A run containing both deletions and insertions is a replacement:
      # re-diff it character by character and splice the result in its place.
      if deletes >= 1 && inserts >= 1
        run = deletes + inserts
        refined = diff_main(removed, added, false, deadline)
        cursor -= run
        diffs[cursor, run] = refined
        cursor += refined.length
      end
      inserts = 0
      deletes = 0
      removed = ""
      added = ""
    end
    cursor += 1
  end

  diffs.pop # Remove the dummy entry at the end.
  diffs
end
|
97
|
+
|
98
|
+
# Find the 'middle snake' of a diff, split the problem in two
|
99
|
+
# and return the recursively constructed diff.
|
100
|
+
# See Myers 1986 paper: An O(ND) Difference Algorithm and Its Variations.
|
101
|
+
def diff_bisect(text1, text2, deadline)
  # text1, text2 - Strings to compare.
  # deadline     - Time after which the search gives up; nil disables the check.
  #
  # Returns the result of diff_bisect_split at the point where the forward
  # and reverse paths overlap, or a plain delete + insert pair when the
  # deadline passes or no overlap is found.
  len1 = text1.length
  len2 = text2.length
  max_d = (len1 + len2 + 1) / 2
  origin = max_d
  slots = 2 * max_d

  # Furthest x reached per diagonal, for the forward and the reverse walk.
  forward = Array.new(slots, -1)
  reverse = Array.new(slots, -1)
  forward[origin + 1] = 0
  reverse[origin + 1] = 0

  delta = len1 - len2
  # With an odd delta the overlap is checked while walking the front path,
  # otherwise while walking the reverse path.
  check_on_forward = delta.odd?

  # Offsets that narrow the k loops once a path has run off the grid.
  fwd_skip_low = 0
  fwd_skip_high = 0
  rev_skip_low = 0
  rev_skip_high = 0

  max_d.times do |d|
    # Bail out if deadline is reached.
    break if deadline && Time.now >= deadline

    # Walk the front path one step.
    (fwd_skip_low - d).step(d - fwd_skip_high, 2) do |k1|
      slot = origin + k1
      take_upper = k1 == -d || (k1 != d && forward[slot - 1] < forward[slot + 1])
      x1 = take_upper ? forward[slot + 1] : forward[slot - 1] + 1
      y1 = x1 - k1

      # Follow the diagonal while the characters match.
      while x1 < len1 && y1 < len2 && text1[x1] == text2[y1]
        x1 += 1
        y1 += 1
      end
      forward[slot] = x1

      if x1 > len1
        # Ran off the right of the graph.
        fwd_skip_high += 2
      elsif y1 > len2
        # Ran off the bottom of the graph.
        fwd_skip_low += 2
      elsif check_on_forward
        mirror = origin + delta - k1
        if mirror.between?(0, slots - 1) && reverse[mirror] != -1
          # Mirror x2 onto top-left coordinate system.
          x2 = len1 - reverse[mirror]
          # Overlap detected.
          return diff_bisect_split(text1, text2, x1, y1, deadline) if x1 >= x2
        end
      end
    end

    # Walk the reverse path one step.
    (rev_skip_low - d).step(d - rev_skip_high, 2) do |k2|
      slot = origin + k2
      take_upper = k2 == -d || (k2 != d && reverse[slot - 1] < reverse[slot + 1])
      x2 = take_upper ? reverse[slot + 1] : reverse[slot - 1] + 1
      y2 = x2 - k2

      # Same diagonal walk, counted from the ends of both texts.
      while x2 < len1 && y2 < len2 && text1[-x2 - 1] == text2[-y2 - 1]
        x2 += 1
        y2 += 1
      end
      reverse[slot] = x2

      if x2 > len1
        # Ran off the left of the graph.
        rev_skip_high += 2
      elsif y2 > len2
        # Ran off the top of the graph.
        rev_skip_low += 2
      elsif !check_on_forward
        mirror = origin + delta - k2
        if mirror.between?(0, slots - 1) && forward[mirror] != -1
          x1 = forward[mirror]
          y1 = origin + x1 - mirror
          # Overlap detected (x2 mirrored onto the top-left coordinate system).
          return diff_bisect_split(text1, text2, x1, y1, deadline) if x1 >= len1 - x2
        end
      end
    end
  end

  # Diff took too long and hit the deadline or
  # number of diffs equals number of characters, no commonality at all.
  [[:delete, text1], [:insert, text2]]
end
|
204
|
+
|
205
|
+
# Given the location of the 'middle snake', split the diff in two parts
|
206
|
+
# and recurse.
|
207
|
+
def diff_bisect_split(text1, text2, x, y, deadline)
  # x, y - split offsets into text1 and text2 respectively.
  #
  # Returns the diff of the two leading parts followed by the diff of the two
  # trailing parts; both are computed with checklines disabled.
  head = diff_main(text1[0...x], text2[0...y], false, deadline)
  tail = diff_main(text1[x..-1], text2[y..-1], false, deadline)
  head + tail
end
|
219
|
+
|
220
|
+
# Split two texts into an array of strings. Reduce the texts to a string
|
221
|
+
# of hashes where each Unicode character represents one line.
|
222
|
+
def diff_lines_to_chars(text1, text2)
  # Returns [encoded_text1, encoded_text2, line_array]. Each encoded text has
  # one UTF-8 character per line of the input, and the character's code point
  # is the index of that line in line_array.
  #
  # NOTE(review): Integer#chr raises RangeError for code points that are not
  # valid in UTF-8 (e.g. the surrogate range starting at 0xD800), so inputs
  # with that many distinct lines look unsupported - confirm.
  line_array = [""] # e.g. line_array[4] == "Hello\n"
  line_hash = {}    # e.g. line_hash["Hello\n"] == 4

  encoded = [text1, text2].map do |text|
    # Append in place: building the result with += would copy the whole
    # string again for every line.
    text.each_line.each_with_object("".dup) do |line, chars|
      index = line_hash[line]
      unless index
        # First sighting of this line: give it the next free index.
        index = line_array.length
        line_hash[line] = index
        line_array.push(line)
      end
      chars << index.chr(Encoding::UTF_8)
    end
  end

  encoded.push(line_array)
end
|
242
|
+
|
243
|
+
# Rehydrate the text in a diff from a string of line hashes to real lines of text.
|
244
|
+
def diff_chars_to_lines(diffs, line_array)
  # Replaces the text of every diff entry in place: each character is looked
  # up in line_array by its code point and the lines are joined together.
  # Returns diffs.
  diffs.each do |entry|
    lines = entry[1].each_char.map { |placeholder| line_array[placeholder.ord] }
    entry[1] = lines.join
  end
end
|
249
|
+
|
250
|
+
# Determine the common prefix of two strings.
|
251
|
+
def diff_common_prefix(text1, text2)
  # Returns the number of characters common to the start of both strings.

  # Quick check for common null cases.
  return 0 if text1.empty? || text2.empty? || text1[0] != text2[0]

  # Binary search over the candidate length.
  # Performance analysis: http://neil.fraser.name/news/2007/10/09/
  floor = 0
  ceiling = [text1.length, text2.length].min
  probe = ceiling
  matched = 0 # Everything before this index is already known to be equal.

  until floor >= probe
    if text1[matched...probe] == text2[matched...probe]
      floor = matched = probe
    else
      ceiling = probe
    end
    probe = floor + (ceiling - floor) / 2
  end

  probe
end
|
274
|
+
|
275
|
+
# Determine the common suffix of two strings.
|
276
|
+
def diff_common_suffix(text1, text2)
  # Returns the number of characters common to the end of both strings.

  # Quick check for common null cases.
  return 0 if text1.empty? || text2.empty? || text1[-1] != text2[-1]

  # Binary search over the candidate length.
  # Performance analysis: http://neil.fraser.name/news/2007/10/09/
  floor = 0
  ceiling = [text1.length, text2.length].min
  probe = ceiling
  matched = 0 # The last `matched` characters are already known to be equal.

  until floor >= probe
    if text1[-probe..(-matched - 1)] == text2[-probe..(-matched - 1)]
      floor = matched = probe
    else
      ceiling = probe
    end
    probe = floor + (ceiling - floor) / 2
  end

  probe
end
|
299
|
+
|
300
|
+
# Determine if the suffix of one string is the prefix of another.
|
301
|
+
def diff_common_overlap(text1, text2)
  # Returns the number of characters at the end of text1 that are also the
  # start of text2 (0 when there is no such overlap).

  # Cache the text lengths to prevent multiple calls.
  text1_length = text1.length
  text2_length = text2.length

  # Eliminate the null case.
  return 0 if text1_length.zero? || text2_length.zero?

  # Truncate the longer string.
  if text1_length > text2_length
    text1 = text1[-text2_length..-1]
  else
    text2 = text2[0...text1_length]
  end
  text_length = [text1_length, text2_length].min

  # Quick check for the whole case.
  return text_length if text1 == text2

  # Start by looking for a single character match
  # and increase length until no match is found.
  # Performance analysis: http://neil.fraser.name/news/2010/11/04/
  best = 0
  length = 1
  loop do
    pattern = text1[(text_length - length)..-1]
    found = text2.index(pattern)

    return best if found.nil?

    length += found
    # Compare the suffix against a prefix of the same size. An inclusive
    # 0..length range is one character too long and could never match.
    if found.zero? || text1[(text_length - length)..-1] == text2[0...length]
      best = length
      length += 1
    end
  end
end
|
338
|
+
|
339
|
+
# Does a substring of shorttext exist within longtext such that the
|
340
|
+
# substring is at least half the length of longtext?
|
341
|
+
def diff_half_match_i(longtext, shorttext, i)
  # i - index in longtext where the quarter-length seed starts.
  #
  # Returns [longtext_before, longtext_after, shorttext_before,
  # shorttext_after, common_middle] when the common middle is at least half
  # the length of longtext, otherwise nil.
  seed = longtext[i, longtext.length / 4]
  best = [nil, nil, nil, nil, ""]

  j = shorttext.index(seed, 0)
  while j
    # Grow the seed match in both directions.
    grow_right = diff_common_prefix(longtext[i..-1], shorttext[j..-1])
    grow_left = diff_common_suffix(longtext[0...i], shorttext[0...j])

    if best.last.length < grow_left + grow_right
      best = [
        longtext[0...(i - grow_left)],
        longtext[(i + grow_right)..-1],
        shorttext[0...(j - grow_left)],
        shorttext[(j + grow_right)..-1],
        shorttext[(j - grow_left)...j] + shorttext[j...(j + grow_right)]
      ]
    end

    j = shorttext.index(seed, j + 1)
  end

  best if best.last.length * 2 >= longtext.length
end
|
361
|
+
|
362
|
+
# Do the two texts share a substring which is at least half the length of the
|
363
|
+
# longer text?
|
364
|
+
# This speedup can produce non-minimal diffs.
|
365
|
+
def diff_half_match(text1, text2)
  # Returns [text1_before, text1_after, text2_before, text2_after,
  # common_middle], or nil when no half-match is found or the speedup is
  # disabled.

  # Don't risk returning a non-optimal diff if we have unlimited time
  return nil if diff_timeout <= 0

  # Decide once which text is the long one and reuse that decision when the
  # result is unpacked. Sorting by length gives no guaranteed order for texts
  # of equal length, which could swap the two halves of the result.
  text1_longer = text1.length > text2.length
  longtext, shorttext = text1_longer ? [text1, text2] : [text2, text1]

  # Pointless.
  return nil if longtext.length < 4 || shorttext.length * 2 < longtext.length

  # First check if the second quarter is the seed for a half-match.
  hm1 = diff_half_match_i(longtext, shorttext, (longtext.length + 3) / 4)
  # Check again based on the third quarter.
  hm2 = diff_half_match_i(longtext, shorttext, (longtext.length + 1) / 2)

  return nil if hm1.nil? && hm2.nil?

  hm = if hm2.nil?
    hm1
  elsif hm1.nil?
    hm2
  else
    # Both matched. Select the longest.
    hm1[4].length > hm2[4].length ? hm1 : hm2
  end

  # A half-match was found, sort out the return data.
  long_a, long_b, short_a, short_b, mid_common = hm
  if text1_longer
    [long_a, long_b, short_a, short_b, mid_common]
  else
    [short_a, short_b, long_a, long_b, mid_common]
  end
end
|
397
|
+
|
398
|
+
# Reduce the number of edits by eliminating semantically trivial equalities.
|
399
|
+
def diff_cleanup_semantic(diffs)
  # diffs - Array of [operation, text] pairs; modified in place.
  #
  # Returns nil.
  changes = false
  equalities = []      # Stack of indices where equalities are found.
  last_equality = nil  # Text of the equality on top of the stack, if any.
  pointer = 0          # Index of current position.
  # Number of characters that changed prior to the equality.
  length_insertions1 = 0
  length_deletions1 = 0
  # Number of characters that changed after the equality.
  length_insertions2 = 0
  length_deletions2 = 0

  while pointer < diffs.length
    op, text = diffs[pointer]
    if op == :equal # Equality found.
      equalities.push(pointer)
      length_insertions1 = length_insertions2
      length_deletions1 = length_deletions2
      length_insertions2 = 0
      length_deletions2 = 0
      last_equality = text
    else # An insertion or deletion.
      if op == :insert
        length_insertions2 += text.length
      else
        length_deletions2 += text.length
      end

      # Eliminate an equality that is no longer than the edits on both sides.
      if last_equality &&
          last_equality.length <= [length_insertions1, length_deletions1].max &&
          last_equality.length <= [length_insertions2, length_deletions2].max
        # Duplicate record.
        diffs[equalities.last, 0] = [[:delete, last_equality]]

        # Change second copy to insert.
        diffs[equalities.last + 1][0] = :insert

        # Throw away the equality we just deleted and the previous one
        # (it needs to be reevaluated).
        equalities.pop(2)
        pointer = equalities.last || -1

        # Reset the counters.
        length_insertions1 = 0
        length_deletions1 = 0
        length_insertions2 = 0
        length_deletions2 = 0
        last_equality = nil

        changes = true
      end
    end
    pointer += 1
  end

  # Normalize the diff.
  diff_cleanup_merge(diffs) if changes
  diff_cleanup_semantic_lossless(diffs)

  # Find any overlaps between deletions and insertions.
  # e.g: <del>abcxxx</del><ins>xxxdef</ins>
  #   -> <del>abc</del>xxx<ins>def</ins>
  # e.g: <del>xxxabc</del><ins>defxxx</ins>
  #   -> <ins>def</ins>xxx<del>abc</del>
  # Only extract an overlap if it is as big as the edit ahead or behind it.
  pointer = 1
  while pointer < diffs.length
    if diffs[pointer - 1][0] == :delete && diffs[pointer][0] == :insert
      deletion = diffs[pointer - 1][1]
      insertion = diffs[pointer][1]
      overlap_length1 = diff_common_overlap(deletion, insertion)
      overlap_length2 = diff_common_overlap(insertion, deletion)

      if overlap_length1 >= overlap_length2
        if overlap_length1 >= deletion.length / 2.0 ||
            overlap_length1 >= insertion.length / 2.0
          # Overlap found. Insert an equality and trim the surrounding edits.
          # The kept part is sliced by explicit length: a 0...-overlap range
          # would turn into 0...0 and drop the whole text when overlap is 0.
          diffs[pointer, 0] = [[:equal, insertion[0...overlap_length1]]]
          diffs[pointer - 1][0] = :delete
          diffs[pointer - 1][1] = deletion[0...(deletion.length - overlap_length1)]
          diffs[pointer + 1][0] = :insert
          diffs[pointer + 1][1] = insertion[overlap_length1..-1]
          pointer += 1
        end
      elsif overlap_length2 >= deletion.length / 2.0 || overlap_length2 >= insertion.length / 2.0
        # Reverse overlap found. Insert an equality and swap and trim the
        # surrounding edits.
        diffs[pointer, 0] = [[:equal, deletion[0...overlap_length2]]]
        diffs[pointer - 1][0] = :insert
        diffs[pointer - 1][1] = insertion[0...(insertion.length - overlap_length2)]
        diffs[pointer + 1][0] = :delete
        diffs[pointer + 1][1] = deletion[overlap_length2..-1]
        pointer += 1
      end
      pointer += 1
    end
    pointer += 1
  end

  nil
end
|
495
|
+
|
496
|
+
# Given two strings, compute a score representing whether the
|
497
|
+
# internal boundary falls on logical boundaries.
|
498
|
+
# Scores range from 5 (best) to 0 (worst).
|
499
|
+
def diff_cleanup_semantic_score(one, two)
  # one - text before the boundary.
  # two - text after the boundary.
  #
  # Returns 5 when either side is empty, otherwise an Integer from 0 to 4
  # that grows with how natural the boundary is.

  # Edges are the best.
  return 5 if one.empty? || two.empty?

  # Each port of this function behaves slightly differently due to
  # subtle differences in each language's definition of things like
  # 'whitespace'. Since this function's purpose is largely cosmetic,
  # the choice has been made to use each language's native features
  # rather than force total conformity.
  last_char = one[-1]
  first_char = two[0]
  touches = lambda do |pattern|
    last_char.match?(pattern) || first_char.match?(pattern)
  end

  # Each level is only considered when the previous one matched.
  return 0 unless touches.call(/[^a-zA-Z0-9]/) # One point for non-alphanumeric.
  return 1 unless touches.call(/\s/)           # Two points for whitespace.
  return 2 unless touches.call(/[\r\n]/)       # Three points for line breaks.

  # Four points for blank lines. The blank line has to sit at the boundary
  # itself, so anchor to the string with \z and \A; in Ruby ^ and $ match at
  # every line break and would also accept a blank line elsewhere in the text.
  blank_line = one.match?(/\n\r?\n\z/) || two.match?(/\A\r?\n\r?\n/)
  blank_line ? 4 : 3
end
|
537
|
+
|
538
|
+
# Look for single edits surrounded on both sides by equalities
# which can be shifted sideways to align the edit to a word boundary.
# e.g: The c<ins>at c</ins>ame. -> The <ins>cat </ins>came.
#
# diffs - Array of [operation, text] pairs; modified in place.
def diff_cleanup_semantic_lossless(diffs)
  pointer = 1
  # Intentionally ignore the first and last element (don't need checking).
  while pointer < diffs.length - 1
    if diffs[pointer - 1][0] == :equal && diffs[pointer + 1][0] == :equal
      # This is a single edit surrounded by equalities.
      equality1 = diffs[pointer - 1][1]
      edit = diffs[pointer][1]
      equality2 = diffs[pointer + 1][1]

      # First, shift the edit as far left as possible.
      common_offset = diff_common_suffix(equality1, edit)
      if common_offset != 0
        common_string = edit[-common_offset..-1]
        equality1 = equality1[0...-common_offset]
        edit = common_string + edit[0...-common_offset]
        equality2 = common_string + equality2
      end

      # Second, step character by character right, looking for the best fit.
      best_equality1 = equality1
      best_edit = edit
      best_equality2 = equality2
      best_score = diff_cleanup_semantic_score(equality1, edit) +
        diff_cleanup_semantic_score(edit, equality2)
      # The emptiness guards matter: with an empty edit and an empty
      # trailing equality both [0] lookups are nil, nil == nil is true, and
      # the loop body would then fail trying to append nil to a String.
      while !edit.empty? && !equality2.empty? && edit[0] == equality2[0]
        equality1 += edit[0]
        edit = edit[1..-1] + equality2[0]
        equality2 = equality2[1..-1]
        score = diff_cleanup_semantic_score(equality1, edit) +
          diff_cleanup_semantic_score(edit, equality2)
        # The >= encourages trailing rather than leading whitespace on edits.
        if score >= best_score
          best_score = score
          best_equality1 = equality1
          best_edit = edit
          best_equality2 = equality2
        end
      end

      if diffs[pointer - 1][1] != best_equality1
        # We have an improvement, save it back to the diff.
        if best_equality1.empty?
          # The leading equality was consumed entirely; drop it and keep
          # pointer on the edit, which has moved one slot to the left.
          diffs[pointer - 1, 1] = []
          pointer -= 1
        else
          diffs[pointer - 1][1] = best_equality1
        end

        diffs[pointer][1] = best_edit

        if best_equality2.empty?
          diffs[pointer + 1, 1] = []
          pointer -= 1
        else
          diffs[pointer + 1][1] = best_equality2
        end
      end
    end

    pointer += 1
  end
end
|
604
|
+
|
605
|
+
# Reduce the number of edits by eliminating operationally trivial equalities.
#
# An equality shorter than diff_edit_cost that is surrounded by edits is
# converted into a delete + insert pair, so the neighbouring edits can be
# merged by diff_cleanup_merge.
#
# diffs - Array of [operation, text] pairs; modified in place.
def diff_cleanup_efficiency(diffs)
  changes = false
  equalities = [] # Stack of indices where equalities are found.
  last_equality = "" # Text of the equality on top of the stack ("" if none).
  pointer = 0 # Index of current position.
  pre_ins = false # Is there an insertion operation before the last equality.
  pre_del = false # Is there a deletion operation before the last equality.
  post_ins = false # Is there an insertion operation after the last equality.
  post_del = false # Is there a deletion operation after the last equality.

  while pointer < diffs.length
    if diffs[pointer][0] == :equal # Equality found.
      if diffs[pointer][1].length < diff_edit_cost && (post_ins || post_del)
        # Candidate found.
        equalities.push(pointer)
        pre_ins = post_ins
        pre_del = post_del
        last_equality = diffs[pointer][1]
      else
        # Not a candidate, and can never become one.
        equalities.clear
        last_equality = ""
      end
      post_ins = false
      post_del = false
    else # An insertion or deletion.
      if diffs[pointer][0] == :delete
        post_del = true
      else
        post_ins = true
      end

      # Five types to be split:
      # <ins>A</ins><del>B</del>XY<ins>C</ins><del>D</del>
      # <ins>A</ins>X<ins>C</ins><del>D</del>
      # <ins>A</ins><del>B</del>X<ins>C</ins>
      # <del>A</del>X<ins>C</ins><del>D</del>
      # <ins>A</ins><del>B</del>X<del>C</del>

      # Float division, so that an odd edit cost is halved exactly
      # instead of being rounded down.
      if !last_equality.empty? &&
          ((pre_ins && pre_del && post_ins && post_del) ||
          ((last_equality.length < diff_edit_cost / 2.0) &&
          [pre_ins, pre_del, post_ins, post_del].count(true) == 3))
        # Duplicate record.
        diffs[equalities.last, 0] = [[:delete, last_equality]]
        # Change second copy to insert.
        diffs[equalities.last + 1][0] = :insert
        equalities.pop # Throw away the equality we just deleted
        last_equality = ""
        if pre_ins && pre_del
          # No changes made which could affect previous entry, keep going.
          post_ins = true
          post_del = true
          equalities.clear
        else
          equalities.pop # Throw away the previous equality (no-op if none).
          # Always back up: to the previous candidate if there is one,
          # otherwise to the very start (-1 becomes 0 after the increment
          # below). Skipping the rescan when the stack is empty would leave
          # the pre/post flags out of date for the edits just created.
          pointer = equalities.last || -1
          post_ins = false
          post_del = false
        end
        changes = true
      end
    end
    pointer += 1
  end

  diff_cleanup_merge(diffs) if changes
end
|
678
|
+
|
679
|
+
# Reorder and merge like edit sections. Merge equalities.
# Any edit section can move as long as it doesn't cross an equality.
#
# A run of consecutive inserts/deletes is collapsed into at most one delete
# followed by at most one insert. Text shared by both sides of the run is
# factored out into the neighbouring equalities. A side that ends up empty
# is dropped rather than stored as an empty record.
#
# diffs - Array of [operation, text] pairs; modified in place.
def diff_cleanup_merge(diffs)
  diffs.push([:equal, ""]) # Add a dummy entry at the end.
  pointer = 0
  count_delete = 0
  count_insert = 0
  text_delete = ""
  text_insert = ""

  while pointer < diffs.length
    case diffs[pointer][0]
    when :insert
      count_insert += 1
      text_insert += diffs[pointer][1]
      pointer += 1
    when :delete
      count_delete += 1
      text_delete += diffs[pointer][1]
      pointer += 1
    when :equal
      # Upon reaching an equality, check for prior redundancies.
      run_length = count_delete + count_insert
      if run_length > 1
        if count_delete != 0 && count_insert != 0
          # Factor out any common prefix.
          common_length = diff_common_prefix(text_insert, text_delete)
          if common_length != 0
            run_start = pointer - run_length
            if run_start > 0 && diffs[run_start - 1][0] == :equal
              diffs[run_start - 1][1] += text_insert[0...common_length]
            else
              # No equality precedes the run, so create one at the front;
              # everything shifts right by one.
              diffs.unshift([:equal, text_insert[0...common_length]])
              pointer += 1
            end
            text_insert = text_insert[common_length..-1]
            text_delete = text_delete[common_length..-1]
          end
          # Factor out any common suffix.
          common_length = diff_common_suffix(text_insert, text_delete)
          if common_length != 0
            diffs[pointer][1] = text_insert[-common_length..-1] + diffs[pointer][1]
            text_insert = text_insert[0...-common_length]
            text_delete = text_delete[0...-common_length]
          end
        end

        # Delete the offending records and add the merged ones, skipping
        # any side whose text was entirely factored out above.
        merged = []
        merged.push([:delete, text_delete]) unless text_delete.empty?
        merged.push([:insert, text_insert]) unless text_insert.empty?
        pointer -= run_length
        diffs[pointer, run_length] = merged
        pointer += merged.length
        # When nothing is left of the run, pointer now sits on the current
        # equality again; leave it there so the next pass can merge it with
        # the equality that now directly precedes it.
        pointer += 1 unless merged.empty?
      elsif pointer != 0 && diffs[pointer - 1][0] == :equal
        # Merge this equality with the previous one.
        diffs[pointer - 1][1] += diffs[pointer][1]
        diffs[pointer, 1] = []
      else
        pointer += 1
      end
      count_insert = 0
      count_delete = 0
      text_delete = ""
      text_insert = ""
    end
  end

  if diffs.last[1].empty?
    diffs.pop # Remove the dummy entry at the end.
  end

  # Second pass: look for single edits surrounded on both sides by equalities
  # which can be shifted sideways to eliminate an equality.
  # e.g: A<ins>BA</ins>C -> <ins>AB</ins>AC
  changes = false
  pointer = 1

  # Intentionally ignore the first and last element (don't need checking).
  while pointer < diffs.length - 1
    if diffs[pointer - 1][0] == :equal && diffs[pointer + 1][0] == :equal
      # This is a single edit surrounded by equalities.
      if diffs[pointer][1][-diffs[pointer - 1][1].length..-1] == diffs[pointer - 1][1]
        # Shift the edit over the previous equality.
        diffs[pointer][1] = diffs[pointer - 1][1] + diffs[pointer][1][0...-diffs[pointer - 1][1].length]
        diffs[pointer + 1][1] = diffs[pointer - 1][1] + diffs[pointer + 1][1]
        diffs[pointer - 1, 1] = []
        changes = true
      elsif diffs[pointer][1][0...diffs[pointer + 1][1].length] == diffs[pointer + 1][1]
        # Shift the edit over the next equality.
        diffs[pointer - 1][1] += diffs[pointer + 1][1]
        diffs[pointer][1] = diffs[pointer][1][diffs[pointer + 1][1].length..-1] +
          diffs[pointer + 1][1]
        diffs[pointer + 1, 1] = []
        changes = true
      end
    end
    pointer += 1
  end

  # If shifts were made, the diff needs reordering and another shift sweep.
  diff_cleanup_merge(diffs) if changes
end
|
787
|
+
|
788
|
+
# Translate a character offset in text1 into the matching offset in text2.
# e.g. 'The cat' vs 'The big cat', 1->1, 5->8
#
# diffs - Array of [operation, text] pairs describing text1 -> text2.
# loc   - Integer offset within text1.
#
# Returns the Integer offset within text2.
def diff_x_index(diffs, loc)
  source_pos = 0
  target_pos = 0
  prev_source_pos = 0
  prev_target_pos = 0
  overshoot = nil # Index of the first diff that reaches past loc, if any.

  diffs.each_with_index do |diff, idx|
    source_pos += diff[1].length unless diff[0] == :insert
    target_pos += diff[1].length unless diff[0] == :delete
    if source_pos > loc
      overshoot = idx
      break
    end
    # Remember the positions at the end of the last diff that did not
    # reach past loc.
    prev_source_pos = source_pos
    prev_target_pos = target_pos
  end

  # The location was deleted.
  return prev_target_pos if overshoot && diffs[overshoot][0] == :delete

  # Add the remaining character length.
  prev_target_pos + (loc - prev_source_pos)
end
|
819
|
+
|
820
|
+
# Convert a diff array into a pretty HTML report.
#
# The text of every diff is HTML-escaped (&, < and >) and each newline is
# rendered as a pilcrow followed by <br>. Insertions are wrapped in <ins>,
# deletions in <del> and equalities in <span>.
#
# diffs - Array of [operation, text] pairs.
#
# Returns the HTML String.
def diff_pretty_html(diffs)
  # Double-quoted "\n" is a real newline; a single-quoted '\n' would only
  # match a literal backslash followed by the letter n.
  escapes = {"&" => "&amp;", "<" => "&lt;", ">" => "&gt;", "\n" => "&para;<br>"}

  diffs.map { |op, data|
    text = data.gsub(/[&<>\n]/, escapes)
    case op
    when :insert
      "<ins style=\"background:#e6ffe6;\">#{text}</ins>"
    when :delete
      "<del style=\"background:#ffe6e6;\">#{text}</del>"
    when :equal
      "<span>#{text}</span>"
    end
  }.join
end
|
834
|
+
|
835
|
+
# Rebuild the source text: the concatenation of every equality and deletion,
# with insertions left out.
#
# diffs - Array of [operation, text] pairs.
#
# Returns the source String.
def diff_text1(diffs)
  kept = diffs.reject { |op, _data| op == :insert }
  kept.map { |_op, data| data }.join
end
|
845
|
+
|
846
|
+
# Rebuild the destination text: the concatenation of every equality and
# insertion, with deletions left out.
#
# diffs - Array of [operation, text] pairs.
#
# Returns the destination String.
def diff_text2(diffs)
  kept = diffs.reject { |op, _data| op == :delete }
  kept.map { |_op, data| data }.join
end
|
856
|
+
|
857
|
+
# Compute the Levenshtein distance; the number of inserted, deleted or
# substituted characters.
#
# Within each run of edits between two equalities, a deletion paired with an
# insertion counts as one substitution, so the run costs the larger of its
# inserted and deleted character counts.
#
# diffs - Array of [operation, text] pairs.
#
# Returns the Integer distance.
def diff_levenshtein(diffs)
  total = 0
  pending_insertions = 0
  pending_deletions = 0

  diffs.each do |op, data|
    if op == :insert
      pending_insertions += data.length
    elsif op == :delete
      pending_deletions += data.length
    elsif op == :equal
      # An equality closes the current run of edits.
      total += [pending_insertions, pending_deletions].max
      pending_insertions = 0
      pending_deletions = 0
    end
  end

  # Account for a run of edits that is not followed by an equality.
  total + [pending_insertions, pending_deletions].max
end
|
880
|
+
|
881
|
+
# Crush the diff into an encoded string which describes the operations
# required to transform text1 into text2.
# E.g. =3\t-2\t+ing -> Keep 3 chars, delete 2 chars, insert 'ing'.
# Operations are tab-separated. Inserted text is escaped using %xx notation,
# except that spaces are left unescaped.
#
# diffs - Array of [operation, text] pairs.
#
# Returns the delta String.
def diff_to_delta(diffs)
  tokens = diffs.map do |op, data|
    if op == :insert
      "+" + PatchObj::PATCH_PARSER.escape(data, /[^0-9A-Za-z_.;!~*'(),\/?:@&=+$\#-]/)
    elsif op == :delete
      "-" + data.length.to_s
    elsif op == :equal
      "=" + data.length.to_s
    end
  end

  # Undo the escaping of spaces on the joined string.
  tokens.join("\t").gsub("%20", " ")
end
|
897
|
+
|
898
|
+
# Given the original text1, and an encoded string which describes the
# operations required to transform text1 into text2, compute the full diff.
#
# text1 - The source String the delta applies to.
# delta - Tab-separated delta String (see diff_to_delta).
#
# Returns an Array of [operation, text] pairs.
# Raises ArgumentError if a token has an unknown operation, a count that is
# not a number or is negative, or if the delta does not consume exactly
# text1.length characters.
def diff_from_delta(text1, delta)
  # Deltas should be composed of a subset of ascii chars, Unicode not required.
  # The converted string is discarded; the call only serves to raise an
  # encoding error when delta is not representable as ASCII.
  delta.encode("ascii")
  diffs = []
  pointer = 0 # Cursor in text1
  delta.split("\t").each do |token|
    # Each token begins with a one character parameter which specifies the
    # operation of this token (delete, insert, equality).
    param = token[1..-1]
    case token[0]
    when "+"
      diffs.push([:insert, PatchObj::PATCH_PARSER.unescape(param.force_encoding(Encoding::UTF_8))])
    when "-", "="
      n = begin
        Integer(param)
      rescue ArgumentError
        raise ArgumentError, "Invalid number in diff_fromDelta: #{param.inspect}"
      end
      # Checked outside the rescue above so that the message is not
      # rewritten as an "Invalid number" error.
      if n < 0
        raise ArgumentError, "Negative number in diff_fromDelta: #{param.inspect}"
      end

      text = text1[pointer...(pointer + n)]
      pointer += n
      if token[0] == "="
        diffs.push([:equal, text])
      else
        diffs.push([:delete, text])
      end
    else
      raise ArgumentError.new(
        "Invalid diff operation in diff_fromDelta: #{token.inspect}"
      )
    end
  end

  if pointer != text1.length
    raise ArgumentError.new("Delta length (#{pointer}) does not equal " \
      "source text length #{text1.length}")
  end
  diffs
end
|
941
|
+
|
942
|
+
# Locate the best instance of 'pattern' in 'text' near 'loc'.
#
# text    - The String to search.
# pattern - The String to search for.
# loc     - Integer location to search around; limited to 0..text.length.
#
# Returns the Integer index of the best match, or -1 when text is empty and
# differs from pattern. Anything that is not an exact hit at loc is handed
# to match_bitap.
# Raises ArgumentError if text or pattern is nil.
def match_main(text, pattern, loc)
  # Check for null inputs.
  raise ArgumentError.new("Null input. (match_main)") if text.nil? || pattern.nil?

  capped = [loc, text.length].min
  loc = [0, capped].max

  # Shortcut (potentially not guaranteed by the algorithm)
  return 0 if text == pattern
  # Nothing to match
  return -1 if text.empty?
  # Perfect match at the perfect spot! (Includes case of null pattern)
  return loc if text[loc, pattern.length] == pattern

  # Do a fuzzy compare.
  match_bitap(text, pattern, loc)
end
|
964
|
+
|
965
|
+
# Locate the best instance of 'pattern' in 'text' near 'loc' using the
# Bitap algorithm.
#
# text    - The String to search.
# pattern - The String to search for.
# loc     - Integer location to search around.
#
# Returns the Integer index of the best match, or -1 if none was found.
# Raises ArgumentError if pattern is longer than match_max_bits.
def match_bitap(text, pattern, loc)
  # raise, not throw: throw is the catch/throw control-flow primitive and
  # would surface as an UncaughtThrowError instead of this ArgumentError.
  raise ArgumentError, "Pattern too long" if pattern.length > match_max_bits

  # Initialise the alphabet.
  s = match_alphabet(pattern)

  # Compute and return the score for a match with e errors and x location.
  # Lower is better.
  match_bitap_score = ->(e, x) do
    accuracy = e.to_f / pattern.length
    proximity = (loc - x).abs
    if match_distance == 0
      # Dodge divide by zero error.
      proximity == 0 ? accuracy : 1.0
    else
      accuracy + (proximity.to_f / match_distance)
    end
  end

  # Highest score beyond which we give up.
  score_threshold = match_threshold
  # Is there a nearby exact match? (speedup)
  best_loc = text.index(pattern, loc)
  if best_loc
    score_threshold = [match_bitap_score[0, best_loc], score_threshold].min
    # What about in the other direction? (speedup)
    best_loc = text.rindex(pattern, loc + pattern.length)
    if best_loc
      score_threshold = [match_bitap_score[0, best_loc], score_threshold].min
    end
  end

  # Initialise the bit arrays.
  match_mask = 1 << (pattern.length - 1)
  best_loc = -1

  bin_max = pattern.length + text.length
  # Bit array of the previous error level; nil until the first pass is done.
  last_rd = nil
  pattern.length.times do |d|
    # Scan for the best match; each iteration allows for one more error.
    # Run a binary search to determine how far from 'loc' we can stray at this
    # error level.
    bin_min = 0
    bin_mid = bin_max
    while bin_min < bin_mid
      if match_bitap_score[d, loc + bin_mid] <= score_threshold
        bin_min = bin_mid
      else
        bin_max = bin_mid
      end
      bin_mid = (bin_max - bin_min) / 2 + bin_min
    end

    # Use the result from this iteration as the maximum for the next.
    bin_max = bin_mid
    start = [1, loc - bin_mid + 1].max
    finish = [loc + bin_mid, text.length].min + pattern.length

    rd = Array.new(finish + 2, 0)
    rd[finish + 1] = (1 << d) - 1
    finish.downto(start) do |j|
      # Positions past the end of text, and characters that do not occur in
      # the pattern, have no mask and match nothing.
      char_match = s[text[j - 1]] || 0
      rd[j] = if d == 0 # First pass: exact match.
        ((rd[j + 1] << 1) | 1) & char_match
      else # Subsequent passes: fuzzy match.
        ((rd[j + 1] << 1) | 1) & char_match |
          (((last_rd[j + 1] | last_rd[j]) << 1) | 1) | last_rd[j + 1]
      end
      if (rd[j] & match_mask).nonzero?
        score = match_bitap_score[d, j - 1]
        # This match will almost certainly be better than any existing match.
        # But check anyway.
        if score <= score_threshold
          # Told you so.
          score_threshold = score
          best_loc = j - 1
          if best_loc > loc
            # When passing loc, don't exceed our current distance from loc.
            # NOTE: downto captured its lower bound when the loop began, so
            # this reassignment does not shorten the current scan.
            start = [1, 2 * loc - best_loc].max
          else
            # Already passed loc, downhill from here on in.
            break
          end
        end
      end
    end

    # No hope for a (better) match at greater error levels.
    break if match_bitap_score[d + 1, loc] > score_threshold

    last_rd = rd
  end

  best_loc
end
|
1064
|
+
|
1065
|
+
# Initialise the alphabet for the Bitap algorithm.
#
# pattern - The String to build masks for.
#
# Returns a Hash mapping each character of pattern to an Integer bitmask
# with one bit set per occurrence; the first character of the pattern maps
# to the highest bit and the last character to bit 0.
def match_alphabet(pattern)
  top_bit = pattern.length - 1
  pattern.each_char.with_index.each_with_object({}) do |(char, position), masks|
    masks[char] = (masks[char] || 0) | (1 << (top_bit - position))
  end
end
|
1074
|
+
|
1075
|
+
# Parse a textual representation of patches and return a list of patch
# objects.
#
# textline - String holding zero or more patches, each introduced by an
#            "@@ -start,length +start,length @@" header line.
#
# Returns an Array of PatchObj (empty for an empty string).
# Raises ArgumentError on a malformed header or an unknown line prefix.
def patch_from_text(textline)
  return [] if textline.empty?

  patches = []
  lines = textline.split("\n")
  cursor = 0
  patch_header = /^@@ -(\d+),?(\d*) \+(\d+),?(\d*) @@$/

  while cursor < lines.length
    header = lines[cursor].match(patch_header)
    raise ArgumentError.new("Invalid patch string: #{lines[cursor]}") if header.nil?

    patch = PatchObj.new
    patches.push(patch)

    # Header positions are decremented to get stored positions, except when
    # the length is explicitly "0". A missing length means 1.
    patch.start1 = header[1].to_i
    if header[2] == "0"
      patch.length1 = 0
    else
      patch.start1 -= 1
      patch.length1 = header[2].empty? ? 1 : header[2].to_i
    end

    patch.start2 = header[3].to_i
    if header[4] == "0"
      patch.length2 = 0
    else
      patch.start2 -= 1
      patch.length2 = header[4].empty? ? 1 : header[4].to_i
    end
    cursor += 1

    # Consume the body lines of this patch.
    while cursor < lines.length
      current = lines[cursor]
      if current.empty?
        # Blank line? Whatever.
        cursor += 1
        next
      end

      sign = current[0]
      line = PatchObj::PATCH_PARSER.unescape(current[1..-1].force_encoding(Encoding::UTF_8))

      # Start of next patch.
      break if sign == "@"

      if sign == "-"
        # Deletion.
        patch.diffs.push([:delete, line])
      elsif sign == "+"
        # Insertion.
        patch.diffs.push([:insert, line])
      elsif sign == " "
        # Minor equality
        patch.diffs.push([:equal, line])
      else
        # Unknown line prefix.
        raise ArgumentError.new("Invalid patch mode \"#{sign}\" in: #{line}")
      end
      cursor += 1
    end
  end

  patches
end
|
1147
|
+
|
1148
|
+
# Take a list of patches and return a textual representation.
#
# patches - Array of patch objects.
#
# Returns the String produced by joining the elements with Array#join.
def patch_to_text(patches)
  serialized = patches.join
  serialized
end
|
1152
|
+
|
1153
|
+
# Increase the context until it is unique,
# but don't let the pattern expand beyond match_max_bits.
#
# patch - Patch object; its diffs, start1/start2 and length1/length2 are
#         updated in place.
# text  - The String the patch positions refer to. Nothing is done when it
#         is empty.
def patch_add_context(patch, text)
  return if text.empty?

  pattern = text[patch.start2, patch.length1]
  padding = 0

  # Look for the first and last matches of pattern in text. If two different
  # matches are found, increase the pattern length.
  until text.index(pattern) == text.rindex(pattern) ||
      pattern.length >= match_max_bits - 2 * patch_margin
    padding += patch_margin
    window_start = [0, patch.start2 - padding].max
    window_end = patch.start2 + patch.length1 + padding
    pattern = text[window_start...window_end]
  end

  # Add one chunk for good luck.
  padding += patch_margin

  # Add the prefix.
  prefix_start = [0, patch.start2 - padding].max
  prefix = text[prefix_start...patch.start2]
  patch.diffs.unshift([:equal, prefix]) unless prefix.to_s.empty?

  # Add the suffix.
  suffix = text[patch.start2 + patch.length1, padding]
  patch.diffs.push([:equal, suffix]) unless suffix.to_s.empty?

  # Roll back the start points.
  lead = prefix.length
  patch.start1 -= lead
  patch.start2 -= lead

  # Extend the lengths.
  grown_by = lead + suffix.length
  patch.length1 += grown_by
  patch.length2 += grown_by
end
|
1187
|
+
|
1188
|
+
# Compute a list of patches to turn text1 into text2.
# Use diffs if provided, otherwise compute it ourselves.
# There are four ways to call this function, depending on what data is
# available to the caller:
# Method 1:
#   a = text1, b = text2
# Method 2:
#   a = diffs
# Method 3 (optimal):
#   a = text1, b = diffs
# Method 4 (deprecated, use method 3):
#   a = text1, b = text2, c = diffs
#
# Returns an Array of patch objects (empty when the diff is empty).
# Raises ArgumentError for any other combination of arguments.
def patch_make(*args)
  first, second, third = args
  text1 = nil
  diffs = nil

  case args.length
  when 1
    if first.is_a?(Array)
      # Method 2: compute text1 from diffs.
      diffs = first
      text1 = diff_text1(diffs)
    end
  when 2
    if first.is_a?(String) && second.is_a?(String)
      # Method 1: compute diffs from text1 and text2.
      text1 = first
      diffs = diff_main(text1, second, true)
      if diffs.length > 2
        diff_cleanup_semantic(diffs)
        diff_cleanup_efficiency(diffs)
      end
    elsif first.is_a?(String) && second.is_a?(Array)
      # Method 3: text1 and diffs are both supplied.
      text1 = first
      diffs = second
    end
  when 3
    if first.is_a?(String) && second.is_a?(String) && third.is_a?(Array)
      # Method 4: text1, text2, diffs. text2 is not used.
      text1 = first
      diffs = third
    end
  end

  # diffs is still nil only when no call format matched.
  raise ArgumentError.new("Unknown call format to patch_make.") if diffs.nil?

  return [] if diffs.empty? # Get rid of the null case.

  patches = []
  patch = PatchObj.new
  char_count1 = 0 # Number of characters into the text1 string.
  char_count2 = 0 # Number of characters into the text2 string.
  prepatch_text = text1 # Recreate the patches to determine context info.
  postpatch_text = text1
  last_index = diffs.length - 1

  diffs.each_with_index do |diff, index|
    diff_type, diff_text = diff

    if patch.diffs.empty? && diff_type != :equal
      # A new patch starts here.
      patch.start1 = char_count1
      patch.start2 = char_count2
    end

    if diff_type == :insert
      patch.diffs.push(diff)
      patch.length2 += diff_text.length
      postpatch_text = postpatch_text[0...char_count2] + diff_text +
        postpatch_text[char_count2..-1]
    elsif diff_type == :delete
      patch.length1 += diff_text.length
      patch.diffs.push(diff)
      postpatch_text = postpatch_text[0...char_count2] +
        postpatch_text[(char_count2 + diff_text.length)..-1]
    elsif diff_type == :equal
      if diff_text.length <= 2 * patch_margin &&
          !patch.diffs.empty? && index != last_index
        # Small equality inside a patch.
        patch.diffs.push(diff)
        patch.length1 += diff_text.length
        patch.length2 += diff_text.length
      elsif diff_text.length >= 2 * patch_margin && !patch.diffs.empty?
        # Time for a new patch.
        patch_add_context(patch, prepatch_text)
        patches.push(patch)
        patch = PatchObj.new
        # Unlike Unidiff, our patch lists have a rolling context.
        # http://code.google.com/p/google-diff-match-patch/wiki/Unidiff
        # Update prepatch text & pos to reflect the application of the
        # just completed patch.
        prepatch_text = postpatch_text
        char_count1 = char_count2
      end
    end

    # Update the current character count.
    char_count1 += diff_text.length unless diff_type == :insert
    char_count2 += diff_text.length unless diff_type == :delete
  end

  # Pick up the leftover patch if not empty.
  unless patch.diffs.empty?
    patch_add_context(patch, prepatch_text)
    patches.push(patch)
  end

  patches
end
|
1298
|
+
|
1299
|
+
# Merge a set of patches onto the text. Return a patched text, as well
# as a list of true/false values indicating which patches were applied.
#
# patches - Array of patch objects. They are deep-copied first, so the
#           caller's objects are not modified.
# text    - The String to patch.
#
# Returns a two-element Array: [patched_text, results], where results holds
# one boolean per patch after splitting by patch_split_max.
def patch_apply(patches, text)
  return [text, []] if patches.empty?

  # Deep copy the patches so that no changes are made to originals.
  patches = Marshal.load(Marshal.dump(patches))

  null_padding = patch_add_padding(patches)
  text = null_padding + text + null_padding
  patch_split_max(patches)

  # delta keeps track of the offset between the expected and actual location
  # of the previous patch. If there are patches expected at positions 10 and
  # 20, but the first patch was found at 12, delta is 2 and the second patch
  # has an effective expected position of 22.
  delta = 0
  results = []
  patches.each_with_index do |patch, x|
    expected_loc = patch.start2 + delta
    text1 = diff_text1(patch.diffs)
    end_loc = -1
    if text1.length > match_max_bits
      # patch_split_max will only provide an oversized pattern in the case of
      # a monster delete.
      start_loc = match_main(text, text1[0, match_max_bits], expected_loc)
      if start_loc != -1
        end_loc = match_main(text, text1[(text1.length - match_max_bits)..-1],
          expected_loc + text1.length - match_max_bits)
        if end_loc == -1 || start_loc >= end_loc
          # Can't find valid trailing context. Drop this patch.
          start_loc = -1
        end
      end
    else
      start_loc = match_main(text, text1, expected_loc)
    end
    if start_loc == -1
      # No match found. :(
      results[x] = false
      # Subtract the delta for this failed patch from subsequent patches.
      delta -= patch.length2 - patch.length1
    else
      # Found a match. :)
      results[x] = true
      delta = start_loc - expected_loc
      text2 = if end_loc == -1
        text[start_loc, text1.length]
      else
        # end_loc + match_max_bits is an END INDEX, not a length, so this
        # must be a range slice; text[start, length] would over-read by
        # start_loc characters.
        text[start_loc...(end_loc + match_max_bits)]
      end

      if text1 == text2
        # Perfect match, just shove the replacement text in.
        text = text[0, start_loc] + diff_text2(patch.diffs) + text[(start_loc + text1.length)..-1]
      else
        # Imperfect match.
        # Run a diff to get a framework of equivalent indices.
        diffs = diff_main(text1, text2, false)
        if text1.length > match_max_bits &&
            diff_levenshtein(diffs).to_f / text1.length > patch_delete_threshold
          # The end points match, but the content is unacceptably bad.
          results[x] = false
        else
          diff_cleanup_semantic_lossless(diffs)
          index1 = 0
          patch.diffs.each do |op, data|
            if op != :equal
              # Map the position in the expected text onto the actual text.
              index2 = diff_x_index(diffs, index1)
            end
            if op == :insert # Insertion
              text = text[0, start_loc + index2] + data + text[(start_loc + index2)..-1]
            elsif op == :delete # Deletion
              text = text[0, start_loc + index2] +
                text[(start_loc + diff_x_index(diffs, index1 + data.length))..-1]
            end
            if op != :delete
              index1 += data.length
            end
          end
        end
      end
    end
  end

  # Strip the padding off.
  text = text[null_padding.length...-null_padding.length]
  [text, results]
end
|
1384
|
+
|
1385
|
+
# Add some padding on text start and end so that edges can match
|
1386
|
+
# something. Intended to be called only from within patch_apply.
|
1387
|
+
# Pads the outer edges of a patch list so that changes touching the very
# start or end of the text still have equality context to match against.
#
# The padding is a string made of the characters with code points
# 1..patch_margin. Every patch is shifted forward by the padding length,
# then the first patch's leading equality and the last patch's trailing
# equality are created or grown until they span a full margin.
#
# The patch objects (and their diff arrays) are modified in place.
#
# @param patches [Array] patch objects responding to diffs, start1,
#   start2, length1 and length2 (readers and writers)
# @return [String] the padding string that was generated
def patch_add_padding(patches)
  padding_length = patch_margin
  null_padding = (1..padding_length).map { |x| x.chr(Encoding::UTF_8) }.join

  # With no patches there is nothing to shift or grow; without this guard
  # patches.first would be nil and the edge handling below would raise.
  return null_padding if patches.empty?

  # Bump all the patches forward.
  patches.each do |patch|
    patch.start1 += padding_length
    patch.start2 += padding_length
  end

  # Add some padding on start of first diff.
  patch = patches.first
  diffs = patch.diffs
  if diffs.empty? || diffs.first[0] != :equal
    # Add nullPadding equality.
    diffs.unshift([:equal, null_padding])
    patch.start1 -= padding_length # Should be 0.
    patch.start2 -= padding_length # Should be 0.
    patch.length1 += padding_length
    patch.length2 += padding_length
  elsif padding_length > diffs.first[1].length
    # Grow first equality: prepend only the tail of the padding, i.e. the
    # part that sits immediately before the existing context.
    existing = diffs.first[1]
    extra_length = padding_length - existing.length
    diffs.first[1] = null_padding[existing.length..-1] + existing
    patch.start1 -= extra_length
    patch.start2 -= extra_length
    patch.length1 += extra_length
    patch.length2 += extra_length
  end

  # Add some padding on end of last diff.
  patch = patches.last
  diffs = patch.diffs
  if diffs.empty? || diffs.last[0] != :equal
    # Add nullPadding equality.
    diffs.push([:equal, null_padding])
    patch.length1 += padding_length
    patch.length2 += padding_length
  elsif padding_length > diffs.last[1].length
    # Grow last equality: append only the head of the padding, i.e. the
    # part that sits immediately after the existing context.
    extra_length = padding_length - diffs.last[1].length
    diffs.last[1] += null_padding[0, extra_length]
    patch.length1 += extra_length
    patch.length2 += extra_length
  end

  null_padding
end
|
1435
|
+
|
1436
|
+
# Look through the patches and break up any which are longer than the
|
1437
|
+
# maximum limit of the match algorithm.
|
1438
|
+
# Splits every patch whose source span (length1) exceeds match_max_bits
# into a run of smaller patches, replacing the oversized patch in place.
#
# Each smaller patch starts with up to patch_margin characters of context
# taken from the end of the previous piece's result text, and ends with up
# to patch_margin characters of context taken from the source text that
# follows it. Pieces that would contain only equalities are dropped.
#
# The patches array and the diffs of the oversized patches are modified.
#
# @param patches [Array] patch objects responding to diffs, start1,
#   start2, length1 and length2
# @return [nil]
def patch_split_max(patches)
  patch_size = match_max_bits
  # With a limit of 0 the consuming loop below can never make progress and
  # would spin forever, so leave the patches untouched.
  # NOTE(review): the upstream Python port treats 0 as "no limit" and also
  # returns early here - confirm that is the intended meaning in this gem.
  return if patch_size.zero?

  x = 0
  while x < patches.length
    if patches[x].length1 > patch_size
      big_patch = patches[x]
      # Remove the big old patch; its replacements are inserted after x.
      patches[x, 1] = []
      x -= 1
      start1 = big_patch.start1
      start2 = big_patch.start2
      pre_context = ""
      until big_patch.diffs.empty?
        # Create one of several smaller patches.
        patch = PatchObj.new
        empty = true
        patch.start1 = start1 - pre_context.length
        patch.start2 = start2 - pre_context.length
        unless pre_context.empty?
          patch.length1 = patch.length2 = pre_context.length
          patch.diffs.push([:equal, pre_context])
        end

        while !big_patch.diffs.empty? && patch.length1 < patch_size - patch_margin
          diff = big_patch.diffs.first
          op = diff[0]
          data = diff[1]
          if op == :insert
            # Insertions are harmless: they do not consume source text.
            patch.length2 += data.length
            start2 += data.length
            patch.diffs.push(big_patch.diffs.shift)
            empty = false
          elsif op == :delete && patch.diffs.length == 1 &&
                patch.diffs.first[0] == :equal && data.length > 2 * patch_size
            # This is a large deletion. Let it pass in one chunk.
            patch.length1 += data.length
            start1 += data.length
            empty = false
            patch.diffs.push(big_patch.diffs.shift)
          else
            # Deletion or equality. Only take as much as we can stomach.
            diff_text = data[0, patch_size - patch.length1 - patch_margin]
            patch.length1 += diff_text.length
            start1 += diff_text.length
            if op == :equal
              patch.length2 += diff_text.length
              start2 += diff_text.length
            else
              empty = false
            end
            patch.diffs.push([op, diff_text])
            if diff_text == data
              big_patch.diffs.shift
            else
              # Leave the unconsumed remainder at the head of the queue.
              big_patch.diffs.first[1] = data[diff_text.length..-1]
            end
          end
        end

        # Compute the head context for the next patch. String#[] returns nil
        # when the text is shorter than the margin; in that case the whole
        # (short) text is the context rather than nothing at all.
        result_text = diff_text2(patch.diffs)
        pre_context = result_text[-patch_margin..-1] || result_text

        # Append the end context for this patch.
        post_context = diff_text1(big_patch.diffs)[0...patch_margin] || ""
        unless post_context.empty?
          patch.length1 += post_context.length
          patch.length2 += post_context.length
          if !patch.diffs.empty? && patch.diffs.last[0] == :equal
            patch.diffs.last[1] += post_context
          else
            patch.diffs.push([:equal, post_context])
          end
        end

        unless empty
          x += 1
          patches[x, 0] = [patch]
        end
      end
    end
    x += 1
  end
end
|
1520
|
+
end
|
1521
|
+
|
1522
|
+
# Make the DiMaPa constant reachable under the name DiffMatchPatch as well.
# NOTE(review): presumably kept for callers that use the upstream library's
# name - confirm before removing.
DiffMatchPatch = DiMaPa
|