dimapa 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/Gemfile +13 -0
- data/LICENSE +23 -0
- data/README.md +78 -0
- data/Rakefile +22 -0
- data/lib/diff_methods.rb +132 -0
- data/lib/dimapa.rb +1522 -0
- data/lib/patch_obj.rb +54 -0
- data/scripts/speedtest.rb +13 -0
- data/scripts/speedtest/speedtest1.txt +230 -0
- data/scripts/speedtest/speedtest2.txt +188 -0
- data/test/helper.rb +1 -0
- data/test/test_dimapa.rb +1196 -0
- metadata +98 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 5e87b2a963101ab22b8fb368d0956670c8f22a09691da2931ab28574f4d700fd
|
4
|
+
data.tar.gz: c94b5fa761a33875cd96db4afadc9aabfc87f0e3a3da249f6a59ce9659685f5d
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: de8c3501e51d0fecfb0697e6120b630b6b3c46ee1e0f52ee9343c07db4e3e2a1667bb690407557726c0563ff712000d0ed2319aeda0eb5c3cf990bb98eb7a1bd
|
7
|
+
data.tar.gz: 6f638d15129dc34dda5d766796dda129340b981e06886c277ec2583044dbbdf34ddd70df51ce600d26ac4797b2cc4d2a9eeaee31377fa9e8b02360f1c650ea2f
|
data/Gemfile
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
Copyright (c) 2011, Jorge Kalmbach <kalmbach.at.gmail.com>
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any
|
4
|
+
person obtaining a copy of this software and associated
|
5
|
+
documentation files (the "Software"), to deal in the
|
6
|
+
Software without restriction, including without limitation
|
7
|
+
the rights to use, copy, modify, merge, publish,
|
8
|
+
distribute, sublicense, and/or sell copies of the
|
9
|
+
Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice
|
13
|
+
shall be included in all copies or substantial portions of
|
14
|
+
the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
|
17
|
+
KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
|
18
|
+
WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
|
19
|
+
PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS
|
20
|
+
OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
21
|
+
OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
|
22
|
+
OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
23
|
+
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,78 @@
|
|
1
|
+
# DiMaPa (Diff Match and Patch)
|
2
|
+
A modern Ruby implementation of Google's [Diff Match and Patch][google]
|
3
|
+
libraries.
|
4
|
+
|
5
|
+
> The Diff Match and Patch libraries offer robust algorithms to perform the
|
6
|
+
> operations required for synchronizing plain text.
|
7
|
+
|
8
|
+
## Usage
|
9
|
+
```ruby
|
10
|
+
require 'dimapa'
|
11
|
+
|
12
|
+
dmp = DiMaPa.new # or DiffMatchPatch
|
13
|
+
|
14
|
+
diff = dmp.diff_main("This is a sentence.", "This is also a sentence.")
|
15
|
+
#=> [[:equal, "This is a"], [:insert, "lso a"], [:equal, " sentence."]]
|
16
|
+
|
17
|
+
dmp.diff_cleanup_semantic(diff)
|
18
|
+
#=> nil
|
19
|
+
|
20
|
+
# diff is modified in place
|
21
|
+
diff
|
22
|
+
#=> [[:equal, "This is "], [:insert, "also "], [:equal, "a sentence."]]
|
23
|
+
|
24
|
+
patch = dmp.patch_make(diff)
|
25
|
+
#=> [#<PatchObj:0x00005608e6ac9500 @diffs=
|
26
|
+
# [[:equal, "This is "], [:insert, "also "], [:equal, "a senten"]],
|
27
|
+
# @length1=16,
|
28
|
+
# @length2=21,
|
29
|
+
# @start1=0,
|
30
|
+
# @start2=0>]
|
31
|
+
|
32
|
+
dmp.patch_to_text(patch)
|
33
|
+
#=> "@@ -1,16 +1,21 @@\n This is \n+also \n a senten\n"
|
34
|
+
|
35
|
+
dmp.patch_apply(patch, "This is a sentence.")
|
36
|
+
#=> ["This is also a sentence.", [true]]
|
37
|
+
```
|
38
|
+
|
39
|
+
## Installation
|
40
|
+
```sh
|
41
|
+
# RubyGem
|
42
|
+
gem install dimapa
|
43
|
+
|
44
|
+
# From source
|
45
|
+
bundle install
|
46
|
+
bundle exec rake install
|
47
|
+
```
|
48
|
+
|
49
|
+
## Benchmarks
|
50
|
+
|
51
|
+
This project includes [speedtests](scripts/) mirroring those in the official
|
52
|
+
project. Performance is on par with those reported for [Lua and Python][speedtest]
|
53
|
+
albeit run on a faster machine.
|
54
|
+
|
55
|
+
```
|
56
|
+
$ rake speedtest
|
57
|
+
|
58
|
+
user system total real
|
59
|
+
diff(t2,t1) 13.658214 0.003937 13.662151 ( 13.662453)
|
60
|
+
diff(t1,t2) 14.074079 0.000001 14.074080 ( 14.074350)
|
61
|
+
```
|
62
|
+
|
63
|
+
## Tests and Linting
|
64
|
+
|
65
|
+
```sh
|
66
|
+
bundle exec rake
|
67
|
+
```
|
68
|
+
|
69
|
+
### Fork of [kalmbach/diff_match_patch][kalmbach] by way of [DavidMikeSimon/diff_match_patch][davidmikesimon]
|
70
|
+
Copyright (c) 2011, Jorge Kalmbach <kalmbach.at.gmail.com>
|
71
|
+
|
72
|
+
Work was inspired by the [reima/diff_match_patch-ruby][reima] module.
|
73
|
+
|
74
|
+
[speedtest]: https://docs.google.com/spreadsheets/d/1zpZccuBpjMZTvL1nGDMKJc7rWL_m_drF4XKOJvB27Kc/edit#gid=0
|
75
|
+
[kalmbach]: https://github.com/kalmbach/diff_match_patch
|
76
|
+
[davidmikesimon]: https://github.com/DavidMikeSimon/diff_match_patch
|
77
|
+
[reima]: https://github.com/reima/diff_match_patch-ruby
|
78
|
+
[google]: https://github.com/google/diff-match-patch
|
data/Rakefile
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
require "rake/testtask"
|
2
|
+
require "standard/rake"
|
3
|
+
require "bundler/gem_tasks"
|
4
|
+
|
5
|
+
Rake::TestTask.new do |t|
|
6
|
+
t.libs << "test"
|
7
|
+
end
|
8
|
+
|
9
|
+
desc "Run benchmarking speedtest"
|
10
|
+
task :speedtest do
|
11
|
+
ruby "scripts/speedtest.rb"
|
12
|
+
end
|
13
|
+
|
14
|
+
desc "Start REPL"
|
15
|
+
task :console do
|
16
|
+
require "pry"
|
17
|
+
require "dimapa"
|
18
|
+
Pry.start
|
19
|
+
end
|
20
|
+
|
21
|
+
desc "Run tests and linter"
|
22
|
+
task default: [:standard, :test]
|
data/lib/diff_methods.rb
ADDED
@@ -0,0 +1,132 @@
|
|
1
|
+
# Top-level diff entry points, meant to be mixed into the diff/match/patch
# class.
#
# The including class is expected to provide diff_common_prefix,
# diff_common_suffix, diff_half_match, diff_line_mode, diff_bisect and
# diff_cleanup_merge, all of which are called from here.
module DiffMethods
  # Offset in seconds used to build a deadline that effectively never expires.
  FIXNUM_MAX = 2**(0.size * 8 - 2) - 1

  # Number of seconds to map a diff before giving up (0 or less for infinity).
  attr_accessor :diff_timeout

  def initialize
    @diff_timeout = 1
  end

  # Find the differences between two texts. Identical texts are answered
  # directly; everything else goes through prefix/suffix trimming and
  # diff_compute.
  #
  # text1      - old String.
  # text2      - new String.
  # checklines - when true, diff_compute may run a line-level pass first for
  #              texts longer than 100 characters.
  # deadline   - Time by which the diff should finish; defaults to a fresh
  #              deadline derived from diff_timeout.
  #
  # Returns an Array of [operation, text] pairs.
  # Raises ArgumentError if either text is nil.
  def diff_main(text1, text2, checklines = true, deadline = nil)
    # Reject a single nil as well as two: a lone nil would otherwise surface
    # later as a NoMethodError deep inside the diff.
    raise ArgumentError, "Null inputs. (diff_main)" if text1.nil? || text2.nil?

    deadline ||= diff_new_deadline

    # Check for equality (speedup).
    if text1 == text2
      return text1.empty? ? [] : [[:equal, text1]]
    end

    diff_main_compute_diff(text1, text2, checklines, deadline)
  end

  # Strip the common prefix and suffix, diff the remaining middle block, then
  # put the trimmed parts back as equalities and normalize the result.
  def diff_main_compute_diff(text1, text2, checklines, deadline)
    common_prefix, text1, text2 = diff_trim_common_prefix(text1, text2)
    common_suffix, text1, text2 = diff_trim_common_suffix(text1, text2)

    diffs = diff_compute(text1, text2, checklines, deadline)

    diffs.unshift([:equal, common_prefix]) unless common_prefix.nil?
    diffs.push([:equal, common_suffix]) unless common_suffix.nil?
    diff_cleanup_merge(diffs)

    diffs
  end

  # Calculate a new deadline from the diff_timeout setting. Any non-positive
  # timeout means "no limit", matching the check in diff_half_match.
  def diff_new_deadline
    Time.now + (diff_timeout.positive? ? diff_timeout : FIXNUM_MAX)
  end

  # Split off the common prefix.
  #
  # Returns [prefix_or_nil, rest_of_text1, rest_of_text2]; the prefix is nil
  # when the texts share no leading characters.
  def diff_trim_common_prefix(text1, text2)
    common_length = diff_common_prefix(text1, text2)
    return [nil, text1, text2] if common_length.zero?

    [text1[0...common_length], text1[common_length..-1], text2[common_length..-1]]
  end

  # Split off the common suffix.
  #
  # Returns [suffix_or_nil, rest_of_text1, rest_of_text2]; the suffix is nil
  # when the texts share no trailing characters.
  def diff_trim_common_suffix(text1, text2)
    common_length = diff_common_suffix(text1, text2)
    return [nil, text1, text2] if common_length.zero?

    [text1[-common_length..-1], text1[0...-common_length], text2[0...-common_length]]
  end

  # Find the differences between two texts. Assumes that the texts do not
  # have any common prefix or suffix. Tries the cheap strategies first and
  # falls back to line mode or bisection.
  def diff_compute(text1, text2, checklines, deadline)
    diffs = diff_compute_common_cases(text1, text2) ||
      diff_compute_half_match(text1, text2, checklines, deadline)
    return diffs if diffs

    if checklines && text1.length > 100 && text2.length > 100
      diff_line_mode(text1, text2, deadline)
    else
      diff_bisect(text1, text2, deadline)
    end
  end

  # When the texts share a large common middle, diff the parts on either side
  # separately and join the results around that middle.
  #
  # Returns the combined diffs, or nil when no half-match exists.
  def diff_compute_half_match(text1, text2, checklines, deadline)
    half_match = diff_half_match(text1, text2)
    return unless half_match

    text1_a, text1_b, text2_a, text2_b, mid_common = half_match

    # Send both pairs off for separate processing.
    diffs_a = diff_main(text1_a, text2_a, checklines, deadline)
    diffs_b = diff_main(text1_b, text2_b, checklines, deadline)

    diffs_a + [[:equal, mid_common]] + diffs_b
  end

  # Shortcuts for pure insertions, pure deletions, containment and
  # single-character texts.
  #
  # Returns the diffs, or nil when none of the shortcuts applies.
  def diff_compute_common_cases(text1, text2)
    # Just add some text (speedup).
    return [[:insert, text2]] if text1.empty?

    # Just delete some text (speedup).
    return [[:delete, text1]] if text2.empty?

    text1_longer = text1.length > text2.length
    short, long = text1_longer ? [text2, text1] : [text1, text2]

    if (i = long.index(short))
      # Shorter text is inside the longer text (speedup).
      op = text1_longer ? :delete : :insert
      [[op, long[0...i]], [:equal, short], [op, long[(i + short.length)..-1]]]
    elsif short.length == 1
      # Single character string. After the previous speedup, the character
      # can't be an equality.
      [[:delete, text1], [:insert, text2]]
    end
  end

  private :diff_main_compute_diff, :diff_new_deadline,
    :diff_trim_common_prefix, :diff_trim_common_suffix,
    :diff_compute_half_match, :diff_compute_common_cases
end
|
data/lib/dimapa.rb
ADDED
@@ -0,0 +1,1522 @@
|
|
1
|
+
require "diff_methods"
|
2
|
+
require "patch_obj"
|
3
|
+
|
4
|
+
# Class containing the diff, match and patch methods.
|
5
|
+
# Also contains the behaviour settings.
|
6
|
+
class DiMaPa
|
7
|
+
include DiffMethods
|
8
|
+
|
9
|
+
attr_accessor :diff_edit_cost
|
10
|
+
attr_accessor :match_threshold
|
11
|
+
attr_accessor :match_distance
|
12
|
+
attr_accessor :patch_delete_threshold
|
13
|
+
attr_accessor :patch_margin
|
14
|
+
attr_reader :match_max_bits
|
15
|
+
|
16
|
+
def initialize
  # Set up the default behaviour settings. Every value except
  # match_max_bits has a public writer and can be changed afterwards.

  # Cost of an empty edit operation in terms of edit characters.
  @diff_edit_cost = 4

  # At what point is no match declared (0.0 = perfection, 1.0 = very loose).
  @match_threshold = 0.5

  # How far to search for a match (0 = exact location, 1000+ = broad match).
  # A match this many characters away from the expected location adds 1.0 to
  # the score (0.0 is a perfect match).
  @match_distance = 1000

  # When deleting a large block of text (over ~64 characters), how closely
  # the contents have to match the expected contents (0.0 = perfection,
  # 1.0 = very loose). match_threshold controls how closely the end points
  # of a delete need to match.
  @patch_delete_threshold = 0.5

  # Chunk size for context length.
  @patch_margin = 4

  # The number of bits in an int. The note inherited from the reference
  # ports says 0 disables patch splitting and 32 avoids long patches in
  # pathological cases; the code that reads this value is outside this
  # method, so confirm there before relying on it.
  @match_max_bits = 32

  # Let the DiffMethods mixin set its own defaults (diff_timeout).
  super
end
|
43
|
+
|
44
|
+
# Run a coarse line-by-line diff first, then re-diff every replaced region
# character by character for accuracy.
# This speedup can produce non-minimal diffs.
#
# Returns the resulting Array of [operation, text] pairs.
def diff_line_mode(text1, text2, deadline)
  # Encode each distinct line as one character and diff the encodings.
  encoded1, encoded2, lines = diff_lines_to_chars(text1, text2)
  diffs = diff_main(encoded1, encoded2, false, deadline)

  # Decode back to real text, then eliminate freak matches (e.g. blank lines).
  diff_chars_to_lines(diffs, lines)
  diff_cleanup_semantic(diffs)

  # A trailing dummy equality guarantees the last run of edits is flushed.
  diffs.push([:equal, ""])

  index = 0
  deletes = 0
  inserts = 0
  deleted = ""
  inserted = ""

  while index < diffs.length
    op, text = diffs[index]
    case op
    when :insert
      inserts += 1
      inserted += text
    when :delete
      deletes += 1
      deleted += text
    when :equal
      # An equality closes the current run; a run containing both deletions
      # and insertions is replaced by its character-level diff.
      if deletes >= 1 && inserts >= 1
        run = deletes + inserts
        sub_diffs = diff_main(deleted, inserted, false, deadline)
        index -= run
        diffs[index, run] = sub_diffs
        index += sub_diffs.length
      end
      inserts = 0
      deletes = 0
      deleted = ""
      inserted = ""
    end
    index += 1
  end

  diffs.pop # Remove the dummy entry at the end.
  diffs
end
|
97
|
+
|
98
|
+
# Find the 'middle snake' of a diff, split the problem in two
# and return the recursively constructed diff.
# See Myers 1986 paper: An O(ND) Difference Algorithm and Its Variations.
#
# Falls back to a plain delete + insert when the deadline passes or the two
# search fronts never meet.
def diff_bisect(text1, text2, deadline)
  len1 = text1.length
  len2 = text2.length
  max_d = (len1 + len2 + 1) / 2
  origin = max_d
  width = 2 * max_d

  # Furthest x reached on each diagonal, for the forward and reverse walks.
  forward = Array.new(width, -1)
  backward = Array.new(width, -1)
  forward[origin + 1] = 0
  backward[origin + 1] = 0

  delta = len1 - len2
  # If the total number of characters is odd, then the front path will
  # collide with the reverse path.
  odd_delta = delta.odd?

  # Offsets for start and end of the k loops; they keep the walks from
  # mapping space beyond the grid.
  fwd_start = 0
  fwd_end = 0
  bwd_start = 0
  bwd_end = 0

  max_d.times do |d|
    # Bail out if deadline is reached.
    break if deadline && Time.now >= deadline

    # Walk the front path one step.
    (fwd_start - d).step(d - fwd_end, 2) do |k|
      slot = origin + k
      x = if k == -d || (k != d && forward[slot - 1] < forward[slot + 1])
        forward[slot + 1]
      else
        forward[slot - 1] + 1
      end
      y = x - k

      while x < len1 && y < len2 && text1[x] == text2[y]
        x += 1
        y += 1
      end
      forward[slot] = x

      if x > len1
        # Ran off the right of the graph.
        fwd_end += 2
      elsif y > len2
        # Ran off the bottom of the graph.
        fwd_start += 2
      elsif odd_delta
        mirror = origin + delta - k
        if (0...width).cover?(mirror) && backward[mirror] != -1
          # Mirror the reverse x onto the top-left coordinate system; an
          # overlap means the fronts have met.
          return diff_bisect_split(text1, text2, x, y, deadline) if x >= len1 - backward[mirror]
        end
      end
    end

    # Walk the reverse path one step.
    (bwd_start - d).step(d - bwd_end, 2) do |k|
      slot = origin + k
      x = if k == -d || (k != d && backward[slot - 1] < backward[slot + 1])
        backward[slot + 1]
      else
        backward[slot - 1] + 1
      end
      y = x - k

      while x < len1 && y < len2 && text1[-x - 1] == text2[-y - 1]
        x += 1
        y += 1
      end
      backward[slot] = x

      if x > len1
        # Ran off the left of the graph.
        bwd_end += 2
      elsif y > len2
        # Ran off the top of the graph.
        bwd_start += 2
      elsif !odd_delta
        mirror = origin + delta - k
        if (0...width).cover?(mirror) && forward[mirror] != -1
          front_x = forward[mirror]
          front_y = origin + front_x - mirror
          # Mirror the reverse x onto the top-left coordinate system; an
          # overlap means the fronts have met.
          return diff_bisect_split(text1, text2, front_x, front_y, deadline) if front_x >= len1 - x
        end
      end
    end
  end

  # Diff took too long and hit the deadline or
  # number of diffs equals number of characters, no commonality at all.
  [[:delete, text1], [:insert, text2]]
end
|
204
|
+
|
205
|
+
# Given the location of the 'middle snake' (x in text1, y in text2), diff the
# parts before and after that point separately and join the results.
def diff_bisect_split(text1, text2, x, y, deadline)
  # Compute both diffs serially.
  head = diff_main(text1[0...x], text2[0...y], false, deadline)
  tail = diff_main(text1[x..-1], text2[y..-1], false, deadline)

  head + tail
end
|
219
|
+
|
220
|
+
# Split two texts into lines and reduce each text to a string in which every
# character stands for one distinct line.
#
# Returns [encoded_text1, encoded_text2, line_array], where the code point of
# each encoded character is the index of its line in line_array. Index 0 is
# reserved for the empty string, so real lines start at 1.
#
# NOTE(review): Integer#chr raises RangeError for code points that are not
# valid in UTF-8 (the surrogate range 0xD800..0xDFFF), so inputs with tens of
# thousands of distinct lines will raise here. Left unchanged because
# diff_chars_to_lines relies on code point == index.
def diff_lines_to_chars(text1, text2)
  line_array = [""] # e.g. line_array[4] == "Hello\n"
  line_hash = {} # e.g. line_hash["Hello\n"] == 4

  encoded = [text1, text2].map do |text|
    # Append in place: rebuilding the string with += for every line copies
    # the whole prefix each time.
    text.each_line.each_with_object(+"") do |line, chars|
      index = line_hash[line]
      unless index
        index = line_array.length
        line_hash[line] = index
        line_array.push(line)
      end
      chars << index.chr(Encoding::UTF_8)
    end
  end

  encoded.push(line_array)
end
|
242
|
+
|
243
|
+
# Rehydrate the text in a diff from a string of line hashes to real lines of
# text: each character's code point is looked up in line_array and the lines
# are joined. The diffs are modified in place.
#
# Returns the same diffs Array.
def diff_chars_to_lines(diffs, line_array)
  diffs.each do |diff|
    diff[1] = diff[1].codepoints.map { |code| line_array[code] }.join
  end
end
|
249
|
+
|
250
|
+
# Determine the common prefix of two strings.
#
# Returns the number of characters shared at the start of both strings.
def diff_common_prefix(text1, text2)
  # Quick check for common null cases.
  return 0 if text1.empty? || text2.empty?
  return 0 unless text1[0] == text2[0]

  # Binary search over the prefix length.
  # Performance analysis: http://neil.fraser.name/news/2007/10/09/
  low = 0
  high = [text1.length, text2.length].min
  mid = high
  verified = 0 # Everything before this index is already known to match.

  until mid <= low
    if text1[verified...mid] == text2[verified...mid]
      low = mid
      verified = mid
    else
      high = mid
    end
    mid = low + (high - low) / 2
  end

  mid
end
|
274
|
+
|
275
|
+
# Determine the common suffix of two strings.
#
# Returns the number of characters shared at the end of both strings.
def diff_common_suffix(text1, text2)
  # Quick check for common null cases.
  return 0 if text1.empty? || text2.empty?
  return 0 unless text1[-1] == text2[-1]

  # Binary search over the suffix length.
  # Performance analysis: http://neil.fraser.name/news/2007/10/09/
  low = 0
  high = [text1.length, text2.length].min
  mid = high
  verified = 0 # This many trailing characters are already known to match.

  until mid <= low
    window = -mid..(-verified - 1)
    if text1[window] == text2[window]
      low = mid
      verified = mid
    else
      high = mid
    end
    mid = low + (high - low) / 2
  end

  mid
end
|
299
|
+
|
300
|
+
# Determine if the suffix of one string is the prefix of another.
#
# Returns the number of characters at the end of text1 that also start
# text2, or 0 when there is no such overlap.
def diff_common_overlap(text1, text2)
  # Cache the text lengths to prevent multiple calls.
  text1_length = text1.length
  text2_length = text2.length

  # Eliminate the null case.
  return 0 if text1_length.zero? || text2_length.zero?

  # Truncate the longer string; only its near end can take part.
  if text1_length > text2_length
    text1 = text1[-text2_length..-1]
  else
    text2 = text2[0...text1_length]
  end
  text_length = [text1_length, text2_length].min

  # Quick check for the whole case.
  return text_length if text1 == text2

  # Start by looking for a single character match
  # and increase length until no match is found.
  # Performance analysis: http://neil.fraser.name/news/2010/11/04/
  best = 0
  length = 1
  loop do
    pattern = text1[(text_length - length)..-1]
    found = text2.index(pattern)
    return best if found.nil?

    length += found
    # Compare exactly `length` characters on both sides. An inclusive
    # 0..length range would take one character too many from text2 and the
    # comparison could never succeed.
    if found.zero? || text1[(text_length - length)..-1] == text2[0, length]
      best = length
      length += 1
    end
  end
end
|
338
|
+
|
339
|
+
# Does a substring of shorttext exist within longtext such that the
# substring is at least half the length of longtext?
#
# A quarter-length seed taken from longtext at index i is located in
# shorttext; every occurrence is extended in both directions and the longest
# common stretch wins.
#
# Returns [longtext_before, longtext_after, shorttext_before,
# shorttext_after, common_middle], or nil when the best common stretch is
# shorter than half of longtext.
def diff_half_match_i(longtext, shorttext, i)
  seed = longtext[i, longtext.length / 4]
  best_common = ""
  best_long_a = nil
  best_long_b = nil
  best_short_a = nil
  best_short_b = nil

  j = shorttext.index(seed, 0)
  while j
    prefix_len = diff_common_prefix(longtext[i..-1], shorttext[j..-1])
    suffix_len = diff_common_suffix(longtext[0...i], shorttext[0...j])

    if best_common.length < suffix_len + prefix_len
      best_common = shorttext[(j - suffix_len)...(j + prefix_len)]
      best_long_a = longtext[0...(i - suffix_len)]
      best_long_b = longtext[(i + prefix_len)..-1]
      best_short_a = shorttext[0...(j - suffix_len)]
      best_short_b = shorttext[(j + prefix_len)..-1]
    end

    j = shorttext.index(seed, j + 1)
  end

  return unless best_common.length * 2 >= longtext.length

  [best_long_a, best_long_b, best_short_a, best_short_b, best_common]
end
|
361
|
+
|
362
|
+
# Do the two texts share a substring which is at least half the length of the
# longer text?
# This speedup can produce non-minimal diffs.
#
# Returns [text1_before, text1_after, text2_before, text2_after,
# common_middle], or nil when there is no such substring or the timeout is
# unlimited.
def diff_half_match(text1, text2)
  # Don't risk returning a non-optimal diff if we have unlimited time
  return nil if diff_timeout <= 0

  # Choose long/short with an explicit comparison. The same flag drives the
  # final unpacking, so the two can never disagree for equal-length texts
  # (sort_by gives no ordering guarantee for equal keys).
  text1_longer = text1.length > text2.length
  longtext, shorttext = text1_longer ? [text1, text2] : [text2, text1]
  return nil if longtext.length < 4 || shorttext.length * 2 < longtext.length # Pointless.

  # First check if the second quarter is the seed for a half-match.
  hm1 = diff_half_match_i(longtext, shorttext, (longtext.length + 3) / 4)
  # Check again based on the third quarter.
  hm2 = diff_half_match_i(longtext, shorttext, (longtext.length + 1) / 2)

  hm = if hm1 && hm2
    # Both matched. Select the longest.
    (hm1[4].length > hm2[4].length) ? hm1 : hm2
  else
    hm1 || hm2
  end
  return nil unless hm

  # A half-match was found, sort out the return data.
  long_a, long_b, short_a, short_b, mid_common = hm
  if text1_longer
    [long_a, long_b, short_a, short_b, mid_common]
  else
    [short_a, short_b, long_a, long_b, mid_common]
  end
end
|
397
|
+
|
398
|
+
# Reduce the number of edits by eliminating semantically trivial equalities.
#
# First pass: an equality no longer than the edits on both sides of it is
# turned into a delete + insert pair. Second pass: overlaps between an
# adjacent deletion and insertion are pulled out as equalities.
#
# The diffs Array is modified in place. Returns nil.
def diff_cleanup_semantic(diffs)
  changes = false
  equalities = [] # Stack of indices where equalities are found.
  last_equality = nil # Text of the equality on top of the stack.
  pointer = 0 # Index of current position.
  # Number of characters that changed prior to the equality.
  insertions_before = 0
  deletions_before = 0
  # Number of characters that changed after the equality.
  insertions_after = 0
  deletions_after = 0

  while pointer < diffs.length
    op, text = diffs[pointer]
    if op == :equal # Equality found.
      equalities.push(pointer)
      insertions_before = insertions_after
      deletions_before = deletions_after
      insertions_after = 0
      deletions_after = 0
      last_equality = text
    else # An insertion or deletion.
      if op == :insert
        insertions_after += text.length
      else
        deletions_after += text.length
      end

      if last_equality &&
          last_equality.length <= [insertions_before, deletions_before].max &&
          last_equality.length <= [insertions_after, deletions_after].max
        at = equalities.last
        # Duplicate record, then change the second copy to an insert.
        diffs[at, 0] = [[:delete, last_equality]]
        diffs[at + 1][0] = :insert

        # Throw away the equality we just deleted and the previous one
        # (it needs to be reevaluated), then resume from the one before.
        equalities.pop(2)
        pointer = equalities.last || -1

        # Reset the counters.
        insertions_before = 0
        deletions_before = 0
        insertions_after = 0
        deletions_after = 0
        last_equality = nil

        changes = true
      end
    end
    pointer += 1
  end

  # Normalize the diff.
  diff_cleanup_merge(diffs) if changes
  diff_cleanup_semantic_lossless(diffs)

  # Find any overlaps between deletions and insertions.
  # e.g: <del>abcxxx</del><ins>xxxdef</ins>
  #   -> <del>abc</del>xxx<ins>def</ins>
  # e.g: <del>xxxabc</del><ins>defxxx</ins>
  #   -> <ins>def</ins>xxx<del>abc</del>
  # Only extract an overlap if it is as big as the edit ahead or behind it.
  pointer = 1
  while pointer < diffs.length
    previous = diffs[pointer - 1]
    current = diffs[pointer]
    if previous[0] == :delete && current[0] == :insert
      deletion = previous[1]
      insertion = current[1]
      overlap_length1 = diff_common_overlap(deletion, insertion)
      overlap_length2 = diff_common_overlap(insertion, deletion)

      # Leading parts are taken with an explicit length: a range ending in
      # -overlap collapses to 0...0 when the overlap is 0 and would drop
      # the text.
      if overlap_length1 >= overlap_length2
        if overlap_length1 >= deletion.length / 2.0 ||
            overlap_length1 >= insertion.length / 2.0
          # Overlap found. Insert an equality and trim the surrounding edits.
          diffs[pointer, 0] = [[:equal, insertion[0...overlap_length1]]]
          previous[1] = deletion[0, deletion.length - overlap_length1]
          current[1] = insertion[overlap_length1..-1]
          pointer += 1
        end
      elsif overlap_length2 >= deletion.length / 2.0 ||
          overlap_length2 >= insertion.length / 2.0
        # Reverse overlap found. Insert an equality and swap and trim the
        # surrounding edits.
        diffs[pointer, 0] = [[:equal, deletion[0...overlap_length2]]]
        previous[0] = :insert
        previous[1] = insertion[0, insertion.length - overlap_length2]
        current[0] = :delete
        current[1] = deletion[overlap_length2..-1]
        pointer += 1
      end
      pointer += 1
    end
    pointer += 1
  end

  nil
end
|
495
|
+
|
496
|
+
# Given two strings, compute a score representing whether the
# internal boundary falls on logical boundaries.
# Scores range from 5 (best) to 0 (worst):
#   5 - one side is empty (an edge)
#   4 - `one` ends with a blank line or `two` starts with one
#   3 - the boundary touches a line break
#   2 - the boundary touches whitespace
#   1 - the boundary touches a non-alphanumeric character
#   0 - alphanumeric characters on both sides
#
# Each port of this function behaves slightly differently due to subtle
# differences in each language's definition of things like 'whitespace'.
def diff_cleanup_semantic_score(one, two)
  # Edges are the best.
  return 5 if one.empty? || two.empty?

  tail = one[-1]
  head = two[0]

  non_word_character = /[^a-zA-Z0-9]/
  whitespace = /\s/
  linebreak = /[\r\n]/
  # \A and \z anchor to the whole string. Ruby's ^ and $ match at every
  # line, which would also score blank lines in the middle of the text.
  line_end = /\n\r?\n\z/
  line_start = /\A\r?\n\r?\n/

  return 0 unless tail.match?(non_word_character) || head.match?(non_word_character)
  return 1 unless tail.match?(whitespace) || head.match?(whitespace)
  return 2 unless tail.match?(linebreak) || head.match?(linebreak)
  return 3 unless one.match?(line_end) || two.match?(line_start)

  4
end
|
537
|
+
|
538
|
+
# Look for single edits surrounded on both sides by equalities which can be
# shifted sideways to align the edit to a word boundary.
# e.g: The c<ins>at c</ins>ame. -> The <ins>cat </ins>came.
#
# diffs - Array of [op, text] pairs; modified in place.
#
# Returns nil.
def diff_cleanup_semantic_lossless(diffs)
  pointer = 1
  # Intentionally ignore the first and last element (don't need checking).
  while pointer < diffs.length - 1
    if diffs[pointer - 1][0] == :equal && diffs[pointer + 1][0] == :equal
      # This is a single edit surrounded by equalities.
      equality1 = diffs[pointer - 1][1]
      edit = diffs[pointer][1]
      equality2 = diffs[pointer + 1][1]

      # First, shift the edit as far left as possible.
      common_offset = diff_common_suffix(equality1, edit)
      if common_offset != 0
        common_string = edit[-common_offset..-1]
        equality1 = equality1[0...-common_offset]
        edit = common_string + edit[0...-common_offset]
        equality2 = common_string + equality2
      end

      # Second, step character by character right, looking for the best fit.
      best_equality1 = equality1
      best_edit = edit
      best_equality2 = equality2
      best_score = diff_cleanup_semantic_score(equality1, edit) +
        diff_cleanup_semantic_score(edit, equality2)
      # The emptiness guards matter: ""[0] is nil on both sides, and
      # nil == nil would otherwise enter the loop and fail on `+= nil`.
      while !edit.empty? && !equality2.empty? && edit[0] == equality2[0]
        equality1 += edit[0]
        edit = edit[1..-1] + equality2[0]
        equality2 = equality2[1..-1]
        score = diff_cleanup_semantic_score(equality1, edit) +
          diff_cleanup_semantic_score(edit, equality2)
        # The >= encourages trailing rather than leading whitespace on edits.
        if score >= best_score
          best_score = score
          best_equality1 = equality1
          best_edit = edit
          best_equality2 = equality2
        end
      end

      if diffs[pointer - 1][1] != best_equality1
        # We have an improvement, save it back to the diff.
        if best_equality1.empty?
          diffs[pointer - 1, 1] = []
          pointer -= 1
        else
          diffs[pointer - 1][1] = best_equality1
        end

        diffs[pointer][1] = best_edit

        if best_equality2.empty?
          diffs[pointer + 1, 1] = []
          pointer -= 1
        else
          diffs[pointer + 1][1] = best_equality2
        end
      end
    end

    pointer += 1
  end
end
|
604
|
+
|
605
|
+
# Reduce the number of edits by eliminating operationally trivial equalities.
#
# A short equality sandwiched between edits is turned into a delete plus an
# insert of the same text when that yields fewer edit operations overall.
#
# diffs - Array of [op, text] pairs; modified in place.
#
# Returns whatever diff_cleanup_merge returns when something changed,
# otherwise nil.
def diff_cleanup_efficiency(diffs)
  changes = false
  equalities = [] # Stack of indices where candidate equalities are found.
  last_equality = "" # Text of diffs[equalities.last], or "" when none.
  pointer = 0 # Index of current position.
  pre_ins = false # Is there an insertion operation before the last equality.
  pre_del = false # Is there a deletion operation before the last equality.
  post_ins = false # Is there an insertion operation after the last equality.
  post_del = false # Is there a deletion operation after the last equality.

  while pointer < diffs.length
    if diffs[pointer][0] == :equal # Equality found.
      if diffs[pointer][1].length < diff_edit_cost && (post_ins || post_del)
        # Candidate found.
        equalities.push(pointer)
        pre_ins = post_ins
        pre_del = post_del
        last_equality = diffs[pointer][1]
      else
        # Not a candidate, and can never become one.
        equalities.clear
        last_equality = ""
      end
      post_ins = false
      post_del = false
    else # An insertion or deletion.
      if diffs[pointer][0] == :delete
        post_del = true
      else
        post_ins = true
      end

      # Five types to be split:
      # <ins>A</ins><del>B</del>XY<ins>C</ins><del>D</del>
      # <ins>A</ins>X<ins>C</ins><del>D</del>
      # <ins>A</ins><del>B</del>X<ins>C</ins>
      # <del>A</del>X<ins>C</ins><del>D</del>
      # <ins>A</ins><del>B</del>X<del>C</del>
      if !last_equality.empty? &&
          ((pre_ins && pre_del && post_ins && post_del) ||
          ((last_equality.length < diff_edit_cost / 2) &&
          [pre_ins, pre_del, post_ins, post_del].count(true) == 3))
        # Duplicate record.
        diffs[equalities.last, 0] = [[:delete, last_equality]]
        # Change second copy to insert.
        diffs[equalities.last + 1][0] = :insert
        equalities.pop # Throw away the equality we just deleted.
        last_equality = ""
        if pre_ins && pre_del
          # No changes made which could affect previous entry, keep going.
          post_ins = true
          post_del = true
          equalities.clear
        else
          equalities.pop # Throw away the previous equality (if any).
          # Rescan from the previous candidate, or from the very start when
          # none is left, so the flags see the newly created edits.
          pointer = equalities.last || -1
          post_ins = false
          post_del = false
        end
        changes = true
      end
    end
    pointer += 1
  end

  diff_cleanup_merge(diffs) if changes
end
|
678
|
+
|
679
|
+
# Reorder and merge like edit sections, and merge adjacent equalities.
# Any edit section can move as long as it doesn't cross an equality.
#
# diffs - Array of [op, text] pairs; modified in place.
#
# Returns nil, or the result of the recursive call when a second sweep ran.
def diff_cleanup_merge(diffs)
  diffs.push([:equal, ""]) # Sentinel so the trailing edit run is flushed.
  pointer = 0
  deletes = 0
  inserts = 0
  deleted = ""
  inserted = ""

  while pointer < diffs.length
    case diffs[pointer][0]
    when :insert
      inserts += 1
      inserted += diffs[pointer][1]
      pointer += 1
    when :delete
      deletes += 1
      deleted += diffs[pointer][1]
      pointer += 1
    when :equal
      run_length = deletes + inserts
      # Upon reaching an equality, check for prior redundancies.
      if run_length > 1
        if deletes != 0 && inserts != 0
          # Factor out any common prefix.
          shared = diff_common_prefix(inserted, deleted)
          if shared != 0
            before_run = pointer - run_length - 1
            if before_run >= 0 && diffs[before_run][0] == :equal
              diffs[before_run][1] += inserted[0...shared]
            else
              diffs.unshift([:equal, inserted[0...shared]])
              pointer += 1
            end
            inserted = inserted[shared..-1]
            deleted = deleted[shared..-1]
          end
          # Factor out any common suffix.
          shared = diff_common_suffix(inserted, deleted)
          if shared != 0
            diffs[pointer][1] = inserted[-shared..-1] + diffs[pointer][1]
            inserted = inserted[0...-shared]
            deleted = deleted[0...-shared]
          end
        end

        # Delete the offending records and add the merged ones.
        merged = []
        merged.push([:delete, deleted]) unless deletes.zero?
        merged.push([:insert, inserted]) unless inserts.zero?
        diffs[pointer - run_length, run_length] = merged
        pointer = pointer - run_length + merged.length + 1
      elsif pointer != 0 && diffs[pointer - 1][0] == :equal
        # Merge this equality with the previous one.
        diffs[pointer - 1][1] += diffs[pointer][1]
        diffs[pointer, 1] = []
      else
        pointer += 1
      end
      inserts = 0
      deletes = 0
      deleted = ""
      inserted = ""
    end
  end

  # Remove the sentinel entry at the end if it is still empty.
  diffs.pop if diffs.last[1].empty?

  # Second pass: look for single edits surrounded on both sides by
  # equalities which can be shifted sideways to eliminate an equality.
  # e.g: A<ins>BA</ins>C -> <ins>AB</ins>AC
  changes = false
  pointer = 1

  # Intentionally ignore the first and last element (don't need checking).
  while pointer < diffs.length - 1
    if diffs[pointer - 1][0] == :equal && diffs[pointer + 1][0] == :equal
      # This is a single edit surrounded by equalities.
      before = diffs[pointer - 1][1]
      edit = diffs[pointer][1]
      after = diffs[pointer + 1][1]
      if edit[-before.length..-1] == before
        # Shift the edit over the previous equality.
        diffs[pointer][1] = before + edit[0...-before.length]
        diffs[pointer + 1][1] = before + after
        diffs[pointer - 1, 1] = []
        changes = true
      elsif edit[0...after.length] == after
        # Shift the edit over the next equality.
        diffs[pointer - 1][1] = before + after
        diffs[pointer][1] = edit[after.length..-1] + after
        diffs[pointer + 1, 1] = []
        changes = true
      end
    end
    pointer += 1
  end

  # If shifts were made, the diff needs reordering and another shift sweep.
  diff_cleanup_merge(diffs) if changes
end
|
787
|
+
|
788
|
+
# loc is a location in text1; compute and return the equivalent location in
# text2. e.g. 'The cat' vs 'The big cat': 1->1, 5->9.
#
# diffs - Array of [op, text] pairs describing text1 -> text2.
# loc   - Integer character offset into text1.
#
# Returns the Integer offset into text2. When loc falls inside deleted text
# the offset of the start of that deletion is returned.
def diff_x_index(diffs, loc)
  chars1 = 0
  chars2 = 0
  last_chars1 = 0
  last_chars2 = 0
  covering_op = nil # Operation of the diff that contains loc, if any.

  diffs.each do |op, data|
    chars1 += data.length if op != :insert # Equality or deletion.
    chars2 += data.length if op != :delete # Equality or insertion.
    if chars1 > loc
      # Overshot the location: this diff contains it.
      covering_op = op
      break
    end
    last_chars1 = chars1
    last_chars2 = chars2
  end

  # The location was deleted.
  return last_chars2 if covering_op == :delete

  # Add the remaining character length.
  last_chars2 + (loc - last_chars1)
end
|
819
|
+
|
820
|
+
# Convert a diff array into a pretty HTML report.
#
# diffs - Array of [op, text] pairs.
#
# Returns a String of HTML: insertions wrapped in <ins>, deletions in <del>,
# equalities in <span>. Text is HTML-escaped and every newline is rendered
# as a pilcrow followed by <br>.
def diff_pretty_html(diffs)
  # Replaced in a single pass so the "&" of an emitted entity is never
  # escaped a second time. The newline key is a real "\n" character; a
  # single-quoted '\n' would be a backslash followed by "n".
  escapes = {
    "&" => "&amp;",
    "<" => "&lt;",
    ">" => "&gt;",
    "\n" => "&para;<br>"
  }

  diffs.map { |op, data|
    text = data.gsub(/[&<>\n]/, escapes)
    case op
    when :insert
      "<ins style=\"background:#e6ffe6;\">#{text}</ins>"
    when :delete
      "<del style=\"background:#ffe6e6;\">#{text}</del>"
    when :equal
      "<span>#{text}</span>"
    end
  }.join
end
|
834
|
+
|
835
|
+
# Rebuild the source text from a diff: the concatenation of every equality
# and deletion, skipping insertions.
#
# diffs - Array of [op, text] pairs.
#
# Returns the source String.
def diff_text1(diffs)
  kept = diffs.reject { |op, _data| op == :insert }
  kept.map { |_op, data| data }.join
end
|
845
|
+
|
846
|
+
# Rebuild the destination text from a diff: the concatenation of every
# equality and insertion, skipping deletions.
#
# diffs - Array of [op, text] pairs.
#
# Returns the destination String.
def diff_text2(diffs)
  kept = diffs.reject { |op, _data| op == :delete }
  kept.map { |_op, data| data }.join
end
|
856
|
+
|
857
|
+
# Compute the Levenshtein distance of a diff: the number of inserted,
# deleted or substituted characters.
#
# diffs - Array of [op, text] pairs.
#
# Returns the Integer distance.
def diff_levenshtein(diffs)
  total = 0
  pending_ins = 0
  pending_del = 0

  diffs.each do |op, data|
    if op == :insert
      pending_ins += data.length
    elsif op == :delete
      pending_del += data.length
    elsif op == :equal
      # A deletion and an insertion is one substitution, so an edit run
      # costs the larger of its two character counts.
      total += (pending_ins > pending_del) ? pending_ins : pending_del
      pending_ins = 0
      pending_del = 0
    end
  end

  # Account for an edit run that is not followed by an equality.
  total + ((pending_ins > pending_del) ? pending_ins : pending_del)
end
|
880
|
+
|
881
|
+
# Crush the diff into an encoded string which describes the operations
# required to transform text1 into text2.
# E.g. =3\t-2\t+ing -> Keep 3 chars, delete 2 chars, insert 'ing'.
# Operations are tab-separated. Inserted text is escaped using %xx
# notation, except that spaces are left as literal spaces.
#
# diffs - Array of [op, text] pairs.
#
# Returns the delta String.
def diff_to_delta(diffs)
  tokens = diffs.map do |op, data|
    case op
    when :insert
      escaped = PatchObj::PATCH_PARSER.escape(data, /[^0-9A-Za-z_.;!~*'(),\/?:@&=+$\#-]/)
      "+#{escaped}"
    when :delete
      "-#{data.length}"
    when :equal
      "=#{data.length}"
    end
  end

  tokens.join("\t").gsub("%20", " ")
end
|
897
|
+
|
898
|
+
# Given the original text1, and an encoded string which describes the
# operations required to transform text1 into text2, compute the full diff.
#
# text1 - the source String.
# delta - the delta String (tab-separated "=n", "-n" and "+text" tokens).
#
# Returns an Array of [op, text] pairs.
# Raises ArgumentError for an unknown operation, for a count that is not a
# non-negative integer, or when the delta does not consume exactly
# text1.length characters.
def diff_from_delta(text1, delta)
  # Deltas should be composed of a subset of ascii chars, Unicode not
  # required. The converted copy is discarded; the call is made only
  # because String#encode raises when the delta has non-ASCII characters.
  delta.encode("ascii")
  diffs = []
  pointer = 0 # Cursor in text1
  delta.split("\t").each do |token|
    # Each token begins with a one character parameter which specifies the
    # operation of this token (delete, insert, equality).
    param = token[1..-1]
    case token[0]
    when "+"
      diffs.push([:insert, PatchObj::PATCH_PARSER.unescape(param.force_encoding(Encoding::UTF_8))])
    when "-", "="
      begin
        n = Integer(param)
        # A negative count is reported like any other malformed number
        # instead of escaping as an unrelated RuntimeError.
        raise ArgumentError if n < 0
      rescue ArgumentError
        raise ArgumentError.new(
          "Invalid number in diff_fromDelta: #{param.inspect}"
        )
      end
      text = text1[pointer...(pointer + n)]
      pointer += n
      diffs.push([(token[0] == "=") ? :equal : :delete, text])
    else
      raise ArgumentError.new(
        "Invalid diff operation in diff_fromDelta: #{token.inspect}"
      )
    end
  end

  if pointer != text1.length
    raise ArgumentError.new("Delta length (#{pointer}) does not equal " \
      "source text length #{text1.length}")
  end
  diffs
end
|
941
|
+
|
942
|
+
# Locate the best instance of 'pattern' in 'text' near 'loc'.
#
# text    - the String to search.
# pattern - the String to search for.
# loc     - Integer location to search around; clamped to 0..text.length.
#
# Returns the Integer index of the best match, or -1 when there is none.
# Raises ArgumentError when any argument is nil.
def match_main(text, pattern, loc)
  # Check for null inputs. loc is included so that a nil location is
  # reported here rather than as a comparison failure further down.
  if [text, pattern, loc].any?(&:nil?)
    raise ArgumentError.new("Null input. (match_main)")
  end

  loc = [0, [loc, text.length].min].max

  # Shortcut (potentially not guaranteed by the algorithm).
  return 0 if text == pattern
  # Nothing to match.
  return -1 if text.empty?
  # Perfect match at the perfect spot! (Includes case of null pattern)
  return loc if text[loc, pattern.length] == pattern

  # Do a fuzzy compare.
  match_bitap(text, pattern, loc)
end
|
964
|
+
|
965
|
+
# Locate the best instance of 'pattern' in 'text' near 'loc' using the
# Bitap algorithm.
#
# text    - the String to search.
# pattern - the String to search for; at most match_max_bits characters.
# loc     - Integer location to search around.
#
# Returns the Integer index of the best match, or -1 when there is none.
# Raises ArgumentError when the pattern is longer than match_max_bits.
def match_bitap(text, pattern, loc)
  if pattern.length > match_max_bits
    # raise, not throw: throw is for catch/throw control flow and would
    # surface as an UncaughtThrowError wrapping the intended error.
    raise ArgumentError.new("Pattern too long")
  end

  # Initialise the alphabet.
  s = match_alphabet(pattern)

  # Compute and return the score for a match with e errors and x location.
  match_bitap_score = ->(e, x) do
    accuracy = e.to_f / pattern.length
    proximity = (loc - x).abs
    if match_distance == 0
      # Dodge divide by zero error.
      (proximity == 0) ? accuracy : 1.0
    else
      accuracy + (proximity.to_f / match_distance)
    end
  end

  # Highest score beyond which we give up.
  score_threshold = match_threshold
  # Is there a nearby exact match? (speedup)
  best_loc = text.index(pattern, loc)
  if best_loc
    score_threshold = [match_bitap_score[0, best_loc], score_threshold].min
    # What about in the other direction? (speedup)
    best_loc = text.rindex(pattern, loc + pattern.length)
    if best_loc
      score_threshold = [match_bitap_score[0, best_loc], score_threshold].min
    end
  end

  # Initialise the bit arrays.
  match_mask = 1 << (pattern.length - 1)
  best_loc = -1

  bin_max = pattern.length + text.length
  last_rd = nil # Bit array of the previous (d - 1) error level.
  pattern.length.times do |d|
    # Scan for the best match; each iteration allows for one more error.
    # Run a binary search to determine how far from 'loc' we can stray at
    # this error level.
    bin_min = 0
    bin_mid = bin_max
    while bin_min < bin_mid
      if match_bitap_score[d, loc + bin_mid] <= score_threshold
        bin_min = bin_mid
      else
        bin_max = bin_mid
      end
      bin_mid = (bin_max - bin_min) / 2 + bin_min
    end

    # Use the result from this iteration as the maximum for the next.
    bin_max = bin_mid
    start = [1, loc - bin_mid + 1].max
    finish = [loc + bin_mid, text.length].min + pattern.length

    rd = Array.new(finish + 2, 0)
    rd[finish + 1] = (1 << d) - 1
    finish.downto(start) do |j|
      # Characters past the end of text, or absent from the pattern,
      # contribute an empty mask.
      char_match = s[text[j - 1]] || 0
      rd[j] = if d == 0 # First pass: exact match.
        ((rd[j + 1] << 1) | 1) & char_match
      else # Subsequent passes: fuzzy match.
        ((rd[j + 1] << 1) | 1) & char_match |
          (((last_rd[j + 1] | last_rd[j]) << 1) | 1) | last_rd[j + 1]
      end
      if (rd[j] & match_mask).nonzero?
        score = match_bitap_score[d, j - 1]
        # This match will almost certainly be better than any existing
        # match. But check anyway.
        if score <= score_threshold
          # Told you so.
          score_threshold = score
          best_loc = j - 1
          if best_loc > loc
            # When passing loc, don't exceed our current distance from loc.
            start = [1, 2 * loc - best_loc].max
          else
            # Already passed loc, downhill from here on in.
            break
          end
        end
      end
    end

    # No hope for a (better) match at greater error levels.
    break if match_bitap_score[d + 1, loc] > score_threshold

    last_rd = rd
  end

  best_loc
end
|
1064
|
+
|
1065
|
+
# Initialise the alphabet for the Bitap algorithm.
#
# pattern - the String whose characters are indexed.
#
# Returns a Hash mapping each character of the pattern to an Integer bitmask
# with one bit set per occurrence; the first character of the pattern owns
# the highest bit and the last character owns bit 0.
def match_alphabet(pattern)
  top_bit = pattern.length - 1
  pattern.each_char.with_index.each_with_object({}) do |(char, index), masks|
    masks[char] = (masks[char] || 0) | (1 << (top_bit - index))
  end
end
|
1074
|
+
|
1075
|
+
# Parse a textual representation of patches and return a list of patch
# objects.
#
# textline - String of one or more patches, each a "@@ -a,b +c,d @@" header
#            followed by lines prefixed with " ", "-" or "+".
#
# Returns an Array of PatchObj.
# Raises ArgumentError on a malformed header or an unknown line prefix.
def patch_from_text(textline)
  return [] if textline.empty?

  patches = []
  lines = textline.split("\n")
  cursor = 0
  patch_header = /^@@ -(\d+),?(\d*) \+(\d+),?(\d*) @@$/
  while cursor < lines.length
    header = lines[cursor].match(patch_header)
    raise ArgumentError.new("Invalid patch string: #{lines[cursor]}") if header.nil?

    patch = PatchObj.new
    patches.push(patch)

    # Header starts are 1-based unless the length is literally "0"; a
    # missing length means a single character.
    patch.start1 = header[1].to_i
    patch.start1 -= 1 unless header[2] == "0"
    patch.length1 = header[2].empty? ? 1 : header[2].to_i

    patch.start2 = header[3].to_i
    patch.start2 -= 1 unless header[4] == "0"
    patch.length2 = header[4].empty? ? 1 : header[4].to_i

    cursor += 1

    while cursor < lines.length
      current = lines[cursor]
      if current.empty?
        # Blank line? Whatever.
        cursor += 1
        next
      end

      sign = current[0]
      line = PatchObj::PATCH_PARSER.unescape(current[1..-1].force_encoding(Encoding::UTF_8))

      if sign == "-"
        # Deletion.
        patch.diffs.push([:delete, line])
      elsif sign == "+"
        # Insertion.
        patch.diffs.push([:insert, line])
      elsif sign == " "
        # Minor equality.
        patch.diffs.push([:equal, line])
      elsif sign == "@"
        # Start of next patch; leave it for the outer loop.
        break
      else
        raise ArgumentError.new("Invalid patch mode \"#{sign}\" in: #{line}")
      end
      cursor += 1
    end
  end

  patches
end
|
1147
|
+
|
1148
|
+
# Take a list of patches and return a textual representation: the string
# forms of all patches concatenated with no separator.
#
# patches - Array of patch objects.
#
# Returns a String.
def patch_to_text(patches)
  patches.join("")
end
|
1152
|
+
|
1153
|
+
# Increase the context of a patch until it is unique, but don't let the
# pattern expand beyond match_max_bits.
#
# patch - the patch object to extend; its diffs, start1/start2 and
#         length1/length2 are updated in place.
# text  - the String the patch positions refer to.
#
# Returns nil for empty text, otherwise the updated patch.length2.
def patch_add_context(patch, text)
  return if text.empty?

  margin = patch_margin
  size_limit = match_max_bits - 2 * margin
  pattern = text[patch.start2, patch.length1]
  padding = 0

  # Look for the first and last matches of pattern in text. If two
  # different matches are found, increase the pattern length.
  loop do
    break if text.index(pattern) == text.rindex(pattern)
    break unless pattern.length < size_limit

    padding += margin
    from = [0, patch.start2 - padding].max
    upto = patch.start2 + patch.length1 + padding
    pattern = text[from...upto]
  end

  # Add one chunk for good luck.
  padding += margin

  # Add the prefix.
  prefix = text[[0, patch.start2 - padding].max...patch.start2]
  patch.diffs.unshift([:equal, prefix]) unless prefix.to_s.empty?

  # Add the suffix (computed before the start points are rolled back).
  suffix = text[patch.start2 + patch.length1, padding]
  patch.diffs.push([:equal, suffix]) unless suffix.to_s.empty?

  # Roll back the start points.
  patch.start1 -= prefix.length
  patch.start2 -= prefix.length

  # Extend the lengths.
  grown = prefix.length + suffix.length
  patch.length1 += grown
  patch.length2 += grown
end
|
1187
|
+
|
1188
|
+
# Compute a list of patches to turn text1 into text2.
|
1189
|
+
# Use diffs if provided, otherwise compute it ourselves.
|
1190
|
+
# There are four ways to call this function, depending on what data is
|
1191
|
+
# available to the caller:
|
1192
|
+
# Method 1:
|
1193
|
+
# a = text1, b = text2
|
1194
|
+
# Method 2:
|
1195
|
+
# a = diffs
|
1196
|
+
# Method 3 (optimal):
|
1197
|
+
# a = text1, b = diffs
|
1198
|
+
# Method 4 (deprecated, use method 3):
|
1199
|
+
# a = text1, b = text2, c = diffs
|
1200
|
+
def patch_make(*args)
|
1201
|
+
text1 = nil
|
1202
|
+
diffs = nil
|
1203
|
+
if args.length == 2 && args[0].is_a?(String) && args[1].is_a?(String)
|
1204
|
+
# Compute diffs from text1 and text2.
|
1205
|
+
text1 = args[0]
|
1206
|
+
text2 = args[1]
|
1207
|
+
diffs = diff_main(text1, text2, true)
|
1208
|
+
if diffs.length > 2
|
1209
|
+
diff_cleanup_semantic(diffs)
|
1210
|
+
diff_cleanup_efficiency(diffs)
|
1211
|
+
end
|
1212
|
+
elsif args.length == 1 && args[0].is_a?(Array)
|
1213
|
+
# Compute text1 from diffs.
|
1214
|
+
diffs = args[0]
|
1215
|
+
text1 = diff_text1(diffs)
|
1216
|
+
elsif args.length == 2 && args[0].is_a?(String) && args[1].is_a?(Array)
|
1217
|
+
text1 = args[0]
|
1218
|
+
diffs = args[1]
|
1219
|
+
elsif args.length == 3 && args[0].is_a?(String) && args[1].is_a?(String) &&
|
1220
|
+
args[2].is_a?(Array)
|
1221
|
+
# Method 4: text1, text2, diffs
|
1222
|
+
# text2 is not used.
|
1223
|
+
text1 = args[0]
|
1224
|
+
# text2 = args[1]
|
1225
|
+
diffs = args[2]
|
1226
|
+
else
|
1227
|
+
raise ArgumentError.new("Unknown call format to patch_make.")
|
1228
|
+
end
|
1229
|
+
|
1230
|
+
return [] if diffs.empty? # Get rid of the null case.
|
1231
|
+
|
1232
|
+
patches = []
|
1233
|
+
patch = PatchObj.new
|
1234
|
+
char_count1 = 0 # Number of characters into the text1 string.
|
1235
|
+
char_count2 = 0 # Number of characters into the text2 string.
|
1236
|
+
prepatch_text = text1 # Recreate the patches to determine context info.
|
1237
|
+
postpatch_text = text1
|
1238
|
+
|
1239
|
+
diffs.each_with_index do |diff, x|
|
1240
|
+
diff_type, diff_text = diffs[x]
|
1241
|
+
if patch.diffs.empty? && diff_type != :equal
|
1242
|
+
# A new patch starts here.
|
1243
|
+
patch.start1 = char_count1
|
1244
|
+
patch.start2 = char_count2
|
1245
|
+
end
|
1246
|
+
|
1247
|
+
case diff_type
|
1248
|
+
when :insert
|
1249
|
+
patch.diffs.push(diff)
|
1250
|
+
patch.length2 += diff_text.length
|
1251
|
+
postpatch_text = postpatch_text[0...char_count2] + diff_text +
|
1252
|
+
postpatch_text[char_count2..-1]
|
1253
|
+
when :delete
|
1254
|
+
patch.length1 += diff_text.length
|
1255
|
+
patch.diffs.push(diff)
|
1256
|
+
postpatch_text = postpatch_text[0...char_count2] +
|
1257
|
+
postpatch_text[(char_count2 + diff_text.length)..-1]
|
1258
|
+
when :equal
|
1259
|
+
if diff_text.length <= 2 * patch_margin &&
|
1260
|
+
!patch.diffs.empty? && diffs.length != x + 1
|
1261
|
+
# Small equality inside a patch.
|
1262
|
+
patch.diffs.push(diff)
|
1263
|
+
patch.length1 += diff_text.length
|
1264
|
+
patch.length2 += diff_text.length
|
1265
|
+
elsif diff_text.length >= 2 * patch_margin
|
1266
|
+
# Time for a new patch.
|
1267
|
+
unless patch.diffs.empty?
|
1268
|
+
patch_add_context(patch, prepatch_text)
|
1269
|
+
patches.push(patch)
|
1270
|
+
patch = PatchObj.new
|
1271
|
+
# Unlike Unidiff, our patch lists have a rolling context.
|
1272
|
+
# http://code.google.com/p/google-diff-match-patch/wiki/Unidiff
|
1273
|
+
# Update prepatch text & pos to reflect the application of the
|
1274
|
+
# just completed patch.
|
1275
|
+
prepatch_text = postpatch_text
|
1276
|
+
char_count1 = char_count2
|
1277
|
+
end
|
1278
|
+
end
|
1279
|
+
end
|
1280
|
+
|
1281
|
+
# Update the current character count.
|
1282
|
+
if diff_type != :insert
|
1283
|
+
char_count1 += diff_text.length
|
1284
|
+
end
|
1285
|
+
if diff_type != :delete
|
1286
|
+
char_count2 += diff_text.length
|
1287
|
+
end
|
1288
|
+
end
|
1289
|
+
|
1290
|
+
# Pick up the leftover patch if not empty.
|
1291
|
+
unless patch.diffs.empty?
|
1292
|
+
patch_add_context(patch, prepatch_text)
|
1293
|
+
patches.push(patch)
|
1294
|
+
end
|
1295
|
+
|
1296
|
+
patches
|
1297
|
+
end
|
1298
|
+
|
1299
|
+
# Merge a set of patches onto the text.
#
# patches - Array of patch objects; they are deep-copied, so the caller's
#           objects are left untouched.
# text    - the String to patch.
#
# Returns a two-element Array: the patched String and an Array of
# true/false values indicating which patches were applied.
def patch_apply(patches, text)
  return [text, []] if patches.empty?

  # Deep copy the patches so that no changes are made to originals.
  patches = Marshal.load(Marshal.dump(patches))

  null_padding = patch_add_padding(patches)
  text = null_padding + text + null_padding
  patch_split_max(patches)

  # delta keeps track of the offset between the expected and actual location
  # of the previous patch. If there are patches expected at positions 10 and
  # 20, but the first patch was found at 12, delta is 2 and the second patch
  # has an effective expected position of 22.
  delta = 0
  results = []
  patches.each_with_index do |patch, x|
    expected_loc = patch.start2 + delta
    text1 = diff_text1(patch.diffs)
    end_loc = -1
    if text1.length > match_max_bits
      # patch_split_max will only provide an oversized pattern in the case
      # of a monster delete.
      start_loc = match_main(text, text1[0, match_max_bits], expected_loc)
      if start_loc != -1
        end_loc = match_main(text, text1[(text1.length - match_max_bits)..-1],
          expected_loc + text1.length - match_max_bits)
        if end_loc == -1 || start_loc >= end_loc
          # Can't find valid trailing context. Drop this patch.
          start_loc = -1
        end
      end
    else
      start_loc = match_main(text, text1, expected_loc)
    end

    if start_loc == -1
      # No match found. :(
      results[x] = false
      # Subtract the delta for this failed patch from subsequent patches.
      delta -= patch.length2 - patch.length1
    else
      # Found a match. :)
      results[x] = true
      delta = start_loc - expected_loc
      # end_loc is an absolute index, so the oversized case is sliced with
      # a range; String#[start, length] would read it as a length and
      # overshoot by start_loc characters.
      text2 = if end_loc == -1
        text[start_loc, text1.length]
      else
        text[start_loc...(end_loc + match_max_bits)]
      end

      if text1 == text2
        # Perfect match, just shove the replacement text in.
        text = text[0, start_loc] + diff_text2(patch.diffs) + text[(start_loc + text1.length)..-1]
      else
        # Imperfect match.
        # Run a diff to get a framework of equivalent indices.
        diffs = diff_main(text1, text2, false)
        if text1.length > match_max_bits &&
            diff_levenshtein(diffs).to_f / text1.length > patch_delete_threshold
          # The end points match, but the content is unacceptably bad.
          results[x] = false
        else
          diff_cleanup_semantic_lossless(diffs)
          index1 = 0
          patch.diffs.each do |op, data|
            # index2 is only needed (and only set) for real edits.
            if op != :equal
              index2 = diff_x_index(diffs, index1)
            end
            if op == :insert # Insertion
              text = text[0, start_loc + index2] + data + text[(start_loc + index2)..-1]
            elsif op == :delete # Deletion
              text = text[0, start_loc + index2] +
                text[(start_loc + diff_x_index(diffs, index1 + data.length))..-1]
            end
            if op != :delete
              index1 += data.length
            end
          end
        end
      end
    end
  end

  # Strip the padding off.
  text = text[null_padding.length...-null_padding.length]
  [text, results]
end
|
1384
|
+
|
1385
|
+
# Add some padding on text start and end so that edges can match something.
# Intended to be called only from within patch_apply.
#
# patches - non-empty Array of patch objects; every patch is shifted forward
#           and the first and last are extended in place.
#
# Returns the padding String: patch_margin characters with code points
# 1..patch_margin.
def patch_add_padding(patches)
  pad = patch_margin
  null_padding = Array.new(pad) { |i| (i + 1).chr(Encoding::UTF_8) }.join

  # Bump all the patches forward.
  patches.each do |each_patch|
    each_patch.start1 += pad
    each_patch.start2 += pad
  end

  # Add some padding on start of first diff.
  head = patches.first
  head_diffs = head.diffs
  if head_diffs.empty? || head_diffs.first[0] != :equal
    # Add null_padding equality.
    head_diffs.unshift([:equal, null_padding])
    head.start1 -= pad # Should be 0.
    head.start2 -= pad # Should be 0.
    head.length1 += pad
    head.length2 += pad
  elsif pad > head_diffs.first[1].length
    # Grow first equality with the tail end of the padding.
    have = head_diffs.first[1].length
    missing = pad - have
    head_diffs.first[1] = null_padding[have..-1] + head_diffs.first[1]
    head.start1 -= missing
    head.start2 -= missing
    head.length1 += missing
    head.length2 += missing
  end

  # Add some padding on end of last diff.
  tail = patches.last
  tail_diffs = tail.diffs
  if tail_diffs.empty? || tail_diffs.last[0] != :equal
    # Add null_padding equality.
    tail_diffs.push([:equal, null_padding])
    tail.length1 += pad
    tail.length2 += pad
  elsif pad > tail_diffs.last[1].length
    # Grow last equality with the leading part of the padding.
    missing = pad - tail_diffs.last[1].length
    tail_diffs.last[1] += null_padding[0, missing]
    tail.length1 += missing
    tail.length2 += missing
  end

  null_padding
end
|
1435
|
+
|
1436
|
+
# Look through the patches and break up any which are longer than the
# maximum limit of the match algorithm.
#
# Each patch whose length1 exceeds match_max_bits is removed from the array
# and replaced, at the same position, by a run of smaller patches. Every
# smaller patch begins with up to patch_margin characters of context taken
# from the end of the previous piece's resulting text, and ends with up to
# patch_margin characters of context taken from the text that follows.
#
# patches - Array of patch objects; modified in place. Oversized patches have
#           their diffs consumed while being split.
#
# Returns nil.
#
# NOTE(review): if match_max_bits <= patch_margin the inner loop appears
# unable to consume anything from an oversized patch, so this would not
# terminate - confirm that configuration is never used.
def patch_split_max(patches)
  patch_size = match_max_bits

  x = 0
  while x < patches.length
    if patches[x].length1 > patch_size
      big_patch = patches[x]
      # Remove the big old patch
      patches[x, 1] = []
      x -= 1
      start1 = big_patch.start1
      start2 = big_patch.start2
      pre_context = ""
      until big_patch.diffs.empty?
        # Create one of several smaller patches.
        patch = PatchObj.new
        empty = true
        patch.start1 = start1 - pre_context.length
        patch.start2 = start2 - pre_context.length
        unless pre_context.empty?
          patch.length1 = patch.length2 = pre_context.length
          patch.diffs.push([:equal, pre_context])
        end

        while !big_patch.diffs.empty? && patch.length1 < patch_size - patch_margin
          diff = big_patch.diffs.first
          if diff[0] == :insert
            # Insertions are harmless.
            patch.length2 += diff[1].length
            start2 += diff[1].length
            patch.diffs.push(big_patch.diffs.shift)
            empty = false
          elsif diff[0] == :delete && patch.diffs.length == 1 &&
              patch.diffs.first[0] == :equal && diff[1].length > 2 * patch_size
            # This is a large deletion. Let it pass in one chunk.
            patch.length1 += diff[1].length
            start1 += diff[1].length
            empty = false
            patch.diffs.push(big_patch.diffs.shift)
          else
            # Deletion or equality. Only take as much as we can stomach.
            diff_text = diff[1][0, patch_size - patch.length1 - patch_margin]
            patch.length1 += diff_text.length
            start1 += diff_text.length
            if diff[0] == :equal
              patch.length2 += diff_text.length
              start2 += diff_text.length
            else
              empty = false
            end
            patch.diffs.push([diff[0], diff_text])
            if diff_text == big_patch.diffs.first[1]
              big_patch.diffs.shift
            else
              # Leave the unconsumed remainder at the head of the big patch.
              big_patch.diffs.first[1] = big_patch.diffs.first[1][diff_text.length..-1]
            end
          end
        end

        # Compute the head context for the next patch: the last patch_margin
        # characters of this piece's resulting text. A negative-start range
        # on a String shorter than patch_margin yields nil, which would throw
        # away a short but valid context, so keep the whole text in that case.
        text2 = diff_text2(patch.diffs)
        pre_context = text2.length > patch_margin ? text2[-patch_margin..-1] : text2

        # Append the end context for this patch.
        post_context = diff_text1(big_patch.diffs)[0...patch_margin] || ""
        unless post_context.empty?
          patch.length1 += post_context.length
          patch.length2 += post_context.length
          if !patch.diffs.empty? && patch.diffs.last[0] == :equal
            patch.diffs.last[1] += post_context
          else
            patch.diffs.push([:equal, post_context])
          end
        end
        # Pieces made of context only carry no change and are dropped.
        unless empty
          x += 1
          patches[x, 0] = [patch]
        end
      end
    end
    x += 1
  end
end
|
1520
|
+
end
|
1521
|
+
|
1522
|
+
# Alias: makes DiMaPa reachable under the constant name DiffMatchPatch too.
DiffMatchPatch = DiMaPa
|