dimapa 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 5e87b2a963101ab22b8fb368d0956670c8f22a09691da2931ab28574f4d700fd
4
+ data.tar.gz: c94b5fa761a33875cd96db4afadc9aabfc87f0e3a3da249f6a59ce9659685f5d
5
+ SHA512:
6
+ metadata.gz: de8c3501e51d0fecfb0697e6120b630b6b3c46ee1e0f52ee9343c07db4e3e2a1667bb690407557726c0563ff712000d0ed2319aeda0eb5c3cf990bb98eb7a1bd
7
+ data.tar.gz: 6f638d15129dc34dda5d766796dda129340b981e06886c277ec2583044dbbdf34ddd70df51ce600d26ac4797b2cc4d2a9eeaee31377fa9e8b02360f1c650ea2f
data/Gemfile ADDED
@@ -0,0 +1,13 @@
1
+ # frozen_string_literal: true
2
+
3
+ source "https://rubygems.org"
4
+
5
+ git_source(:github) { |repo_name| "https://github.com/#{repo_name}" }
6
+
7
+ gemspec
8
+
9
+ group :development, :test do
10
+ gem "pry"
11
+ gem "pry-doc"
12
+ gem "standard"
13
+ end
data/LICENSE ADDED
@@ -0,0 +1,23 @@
1
+ Copyright (c) 2011, Jorge Kalmbach <kalmbach.at.gmail.com>
2
+
3
+ Permission is hereby granted, free of charge, to any
4
+ person obtaining a copy of this software and associated
5
+ documentation files (the "Software"), to deal in the
6
+ Software without restriction, including without limitation
7
+ the rights to use, copy, modify, merge, publish,
8
+ distribute, sublicense, and/or sell copies of the
9
+ Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice
13
+ shall be included in all copies or substantial portions of
14
+ the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
17
+ KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
18
+ WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
19
+ PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS
20
+ OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
21
+ OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
22
+ OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
23
+ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,78 @@
1
+ # DiMaPa (Diff Match and Patch)
2
+ A modern Ruby implementation of Google's [Diff Match and Patch][google]
3
+ libraries.
4
+
5
+ > The Diff Match and Patch libraries offer robust algorithms to perform the
6
+ > operations required for synchronizing plain text.
7
+
8
+ ## Usage
9
+ ```ruby
10
+ require 'dimapa'
11
+
12
+ dmp = DiMaPa.new # or DiffMatchPatch
13
+
14
+ diff = dmp.diff_main("This is a sentence.", "This is also a sentence.")
15
+ #=> [[:equal, "This is a"], [:insert, "lso a"], [:equal, " sentence."]]
16
+
17
+ dmp.diff_cleanup_semantic(diff)
18
+ #=> nil
19
+
20
+ # diff is modified in place
21
+ diff
22
+ #=> [[:equal, "This is "], [:insert, "also "], [:equal, "a sentence."]]
23
+
24
+ patch = dmp.patch_make(diff)
25
+ #=> [#<PatchObj:0x00005608e6ac9500 @diffs=
26
+ # [[:equal, "This is "], [:insert, "also "], [:equal, "a senten"]],
27
+ # @length1=16,
28
+ # @length2=21,
29
+ # @start1=0,
30
+ # @start2=0>]
31
+
32
+ dmp.patch_to_text(patch)
33
+ #=> "@@ -1,16 +1,21 @@\n This is \n+also \n a senten\n"
34
+
35
+ dmp.patch_apply(patch, "This is a sentence.")
36
+ #=> ["This is also a sentence.", [true]]
37
+ ```
38
+
39
+ ## Installation
40
+ ```sh
41
+ # RubyGem
42
+ gem install dimapa
43
+
44
+ # From source
45
+ bundle install
46
+ bundle exec rake install
47
+ ```
48
+
49
+ ## Benchmarks
50
+
51
+ This project includes [speedtests](scripts/) mirroring those in the official
52
+ project. Performance is on par with those reported for [Lua and Python][speedtest]
53
+ albeit run on a faster machine.
54
+
55
+ ```
56
+ $ rake speedtest
57
+
58
+ user system total real
59
+ diff(t2,t1) 13.658214 0.003937 13.662151 ( 13.662453)
60
+ diff(t1,t2) 14.074079 0.000001 14.074080 ( 14.074350)
61
+ ```
62
+
63
+ ## Tests and Linting
64
+
65
+ ```sh
66
+ bundle exec rake
67
+ ```
68
+
69
+ ### Fork of [kalmbach/diff_match_patch][kalmbach] by way of [DavidMikeSimon/diff_match_patch][davidmikesimon]
70
+ Copyright (c) 2011, Jorge Kalmbach <kalmbach.at.gmail.com>
71
+
72
+ Work was inspired by the [reima/diff_match_patch-ruby][reima] module.
73
+
74
+ [speedtest]: https://docs.google.com/spreadsheets/d/1zpZccuBpjMZTvL1nGDMKJc7rWL_m_drF4XKOJvB27Kc/edit#gid=0
75
+ [kalmbach]: https://github.com/kalmbach/diff_match_patch
76
+ [davidmikesimon]: https://github.com/DavidMikeSimon/diff_match_patch
77
+ [reima]: https://github.com/reima/diff_match_patch-ruby
78
+ [google]: https://github.com/google/diff-match-patch
@@ -0,0 +1,22 @@
1
+ require "rake/testtask"
2
+ require "standard/rake"
3
+ require "bundler/gem_tasks"
4
+
5
+ Rake::TestTask.new do |t|
6
+ t.libs << "test"
7
+ end
8
+
9
+ desc "Run benchmarking speedtest"
10
+ task :speedtest do
11
+ ruby "scripts/speedtest.rb"
12
+ end
13
+
14
+ desc "Start REPL"
15
+ task :console do
16
+ require "pry"
17
+ require "dimapa"
18
+ Pry.start
19
+ end
20
+
21
+ desc "Run tests and linter"
22
+ task default: [:standard, :test]
@@ -0,0 +1,132 @@
1
# Diff entry points meant to be mixed into the diff/match/patch class.
#
# The methods below call +diff_common_prefix+, +diff_common_suffix+,
# +diff_cleanup_merge+, +diff_half_match+, +diff_line_mode+ and
# +diff_bisect+, none of which are defined in this module; the including
# class has to provide them.
module DiffMethods
  # Number of seconds added to the current time when the timeout is
  # disabled. Derived from the byte size of a native Integer (0.size).
  FIXNUM_MAX = 2**(0.size * 8 - 2) - 1

  # Number of seconds to map a diff before giving up (0 for infinity).
  attr_accessor :diff_timeout

  # Installs the default timeout of one second.
  def initialize
    @diff_timeout = 1
  end

  # Find the differences between two texts. Simplifies the problem by
  # stripping any common prefix or suffix off the texts before editing.
  #
  # text1, text2 - the old and the new String.
  # checklines   - passed through to diff_compute, which only uses line mode
  #                when this is true and both texts exceed 100 characters.
  # deadline     - Time by which the diff should be finished; computed from
  #                diff_timeout when nil.
  #
  # Returns an Array of [operation, text] pairs where operation is one of
  # :equal, :insert or :delete.
  # Raises ArgumentError when either text is nil.
  def diff_main(text1, text2, checklines = true, deadline = nil)
    # Set a deadline by which time the diff must be complete.
    deadline ||= diff_new_deadline

    # Reject a nil on either side; checking only "both nil" would let a
    # single nil through and fail later with an unrelated NoMethodError.
    raise ArgumentError, "Null inputs. (diff_main)" if text1.nil? || text2.nil?

    # Check for equality (speedup).
    return (text1.empty? ? [] : [[:equal, text1]]) if text1 == text2

    diff_main_compute_diff(text1, text2, checklines, deadline)
  end

  # Strips the shared prefix and suffix, diffs what is left and puts the
  # stripped parts back as :equal entries.
  def diff_main_compute_diff(text1, text2, checklines, deadline)
    # Trim off common prefix and suffix (speedup).
    common_prefix, text1, text2 = diff_trim_common_prefix(text1, text2)
    common_suffix, text1, text2 = diff_trim_common_suffix(text1, text2)

    # Compute the diff on the middle block.
    diffs = diff_compute(text1, text2, checklines, deadline)

    # Restore the prefix and suffix.
    diffs.unshift([:equal, common_prefix]) unless common_prefix.nil?
    diffs.push([:equal, common_suffix]) unless common_suffix.nil?
    diff_cleanup_merge(diffs)

    diffs
  end

  private :diff_main_compute_diff

  # Calculate a new deadline using the diff_timeout configuration value.
  # A timeout of zero means "no limit" and is mapped to FIXNUM_MAX seconds.
  def diff_new_deadline
    Time.now + (diff_timeout.zero? ? FIXNUM_MAX : diff_timeout)
  end

  private :diff_new_deadline

  # Trim off the common prefix.
  #
  # Returns [common_prefix, text1, text2]; common_prefix is nil and the texts
  # are returned untouched when they share no prefix.
  def diff_trim_common_prefix(text1, text2)
    if (common_length = diff_common_prefix(text1, text2)).nonzero?
      common_prefix = text1[0...common_length]
      text1 = text1[common_length..-1]
      text2 = text2[common_length..-1]
    end

    [common_prefix, text1, text2]
  end

  private :diff_trim_common_prefix

  # Trim off the common suffix.
  #
  # Returns [common_suffix, text1, text2]; common_suffix is nil and the texts
  # are returned untouched when they share no suffix.
  def diff_trim_common_suffix(text1, text2)
    if (common_length = diff_common_suffix(text1, text2)).nonzero?
      common_suffix = text1[-common_length..-1]
      text1 = text1[0...-common_length]
      text2 = text2[0...-common_length]
    end

    [common_suffix, text1, text2]
  end

  private :diff_trim_common_suffix

  # Find the differences between two texts. Assumes that the texts do not
  # have any common prefix or suffix.
  #
  # Tries the cheap special cases first, then a half-match split, then line
  # mode for long inputs, and finally falls back to diff_bisect.
  def diff_compute(text1, text2, checklines, deadline)
    if (diffs = diff_compute_common_cases(text1, text2))
      diffs

    elsif (diffs = diff_compute_half_match(text1, text2, checklines, deadline))
      diffs

    elsif checklines && text1.length > 100 && text2.length > 100
      diff_line_mode(text1, text2, deadline)

    else
      diff_bisect(text1, text2, deadline)
    end
  end

  # Splits the problem around a half-match, when diff_half_match finds one,
  # and diffs both sides separately. Returns nil when there is no half-match.
  def diff_compute_half_match(text1, text2, checklines, deadline)
    if (hm = diff_half_match(text1, text2))
      # A half-match was found, sort out the return data.
      text1_a, text1_b, text2_a, text2_b, mid_common = hm

      # Send both pairs off for separate processing.
      diffs_a = diff_main(text1_a, text2_a, checklines, deadline)
      diffs_b = diff_main(text1_b, text2_b, checklines, deadline)

      # Merge the results.
      diffs_a + [[:equal, mid_common]] + diffs_b
    end
  end

  private :diff_compute_half_match

  # Handles the trivial cases: one side empty, one text contained in the
  # other, or a single-character text. Returns nil when none of them applies.
  def diff_compute_common_cases(text1, text2)
    # Just add some text (speedup).
    return [[:insert, text2]] if text1.empty?

    # Just delete some text (speedup).
    return [[:delete, text1]] if text2.empty?

    short, long = [text1, text2].sort_by(&:length)

    # Shorter text is inside the longer text (speedup).
    if (i = long.index(short))
      op = text1.length > text2.length ? :delete : :insert
      [[op, long[0...i]], [:equal, short], [op, long[(i + short.length)..-1]]]

    # Single character string.
    elsif short.length == 1
      # After the previous speedup, the character can't be an equality.
      [[:delete, text1], [:insert, text2]]
    end
  end

  private :diff_compute_common_cases
end
@@ -0,0 +1,1522 @@
1
+ require "diff_methods"
2
+ require "patch_obj"
3
+
4
+ # Class containing the diff, match and patch methods.
5
+ # Also contains the behaviour settings.
6
+ class DiMaPa
7
+ include DiffMethods
8
+
9
+ attr_accessor :diff_edit_cost
10
+ attr_accessor :match_threshold
11
+ attr_accessor :match_distance
12
+ attr_accessor :patch_delete_threshold
13
+ attr_accessor :patch_margin
14
+ attr_reader :match_max_bits
15
+
16
def initialize
  # Run the inherited initializer up front; it sets its own defaults and the
  # assignments below do not depend on it.
  super

  # --- Diff settings ---
  # Cost of an empty edit operation, expressed in edit characters.
  @diff_edit_cost = 4

  # --- Match settings ---
  # Score at which no match is declared (0.0 = perfection, 1.0 = very loose).
  @match_threshold = 0.5
  # Search radius (0 = exact location, 1000+ = broad match). A match this
  # many characters away from the expected location adds 1.0 to the score
  # (0.0 is a perfect match).
  @match_distance = 1000
  # Number of bits in an int. Set to 0 to disable patch splitting; 32 keeps
  # patches short in pathological cases, and several short patches are much
  # faster than one long one.
  @match_max_bits = 32

  # --- Patch settings ---
  # For deletions of large blocks (over ~64 characters): how closely the
  # contents must match the expected contents (0.0 = perfection,
  # 1.0 = very loose). match_threshold governs how closely the end points
  # of a delete need to match.
  @patch_delete_threshold = 0.5
  # Chunk size for context length.
  @patch_margin = 4
end
43
+
44
# Line-mode diff: diff the texts with every line collapsed to a single
# character, expand the result back to real lines, then re-diff each
# delete/insert run character by character.
# This speedup can produce non-minimal diffs.
def diff_line_mode(text1, text2, deadline)
  # Collapse each distinct line to one character and diff those strings.
  encoded1, encoded2, lines = diff_lines_to_chars(text1, text2)
  diffs = diff_main(encoded1, encoded2, false, deadline)

  # Expand back to the original text and drop freak matches
  # (e.g. blank lines).
  diff_chars_to_lines(diffs, lines)
  diff_cleanup_semantic(diffs)

  # A trailing dummy equality makes sure the final run gets flushed.
  diffs.push([:equal, ""])

  index = 0
  deletes = 0
  inserts = 0
  deleted_text = ""
  inserted_text = ""

  while index < diffs.length
    op, text = diffs[index]

    case op
    when :insert
      inserts += 1
      inserted_text += text
    when :delete
      deletes += 1
      deleted_text += text
    when :equal
      # A run holding both deletions and insertions is replaced by a
      # character-level diff of the accumulated texts.
      if deletes >= 1 && inserts >= 1
        run_length = deletes + inserts
        refined = diff_main(deleted_text, inserted_text, false, deadline)
        index -= run_length
        diffs[index, run_length] = refined
        index += refined.length
      end

      inserts = 0
      deletes = 0
      deleted_text = ""
      inserted_text = ""
    end

    index += 1
  end

  # Drop the dummy entry again.
  diffs.pop
  diffs
end
97
+
98
# Locate the 'middle snake' of a diff, split the problem in two at that
# point and return the recursively constructed diff.
# See Myers 1986 paper: An O(ND) Difference Algorithm and Its Variations.
#
# Returns a single delete plus a single insert when the deadline passes or
# the two search fronts never overlap.
def diff_bisect(text1, text2, deadline)
  len1 = text1.length
  len2 = text2.length
  max_d = (len1 + len2 + 1) / 2
  v_offset = max_d
  v_length = 2 * max_d

  # Furthest x reached on each diagonal, for the front and reverse searches.
  front_v = Array.new(v_length, -1)
  back_v = Array.new(v_length, -1)
  front_v[v_offset + 1] = 0
  back_v[v_offset + 1] = 0

  delta = len1 - len2
  # With an odd total number of characters the front path is the one that
  # collides with the reverse path.
  front = delta.odd?

  # Offsets for start and end of the k loops; they keep the search from
  # mapping space beyond the grid.
  k1start = 0
  k1end = 0
  k2start = 0
  k2end = 0

  (0...max_d).each do |d|
    # Bail out if deadline is reached.
    break if deadline && Time.now >= deadline

    # Advance the front path by one step.
    (k1start - d).step(d - k1end, 2) do |k1|
      k1_offset = v_offset + k1
      from_above = k1 == -d || (k1 != d && front_v[k1_offset - 1] < front_v[k1_offset + 1])
      x1 = from_above ? front_v[k1_offset + 1] : front_v[k1_offset - 1] + 1
      y1 = x1 - k1

      # Follow the diagonal while the characters agree.
      while x1 < len1 && y1 < len2 && text1[x1] == text2[y1]
        x1 += 1
        y1 += 1
      end
      front_v[k1_offset] = x1

      if x1 > len1
        # Ran off the right of the graph.
        k1end += 2
      elsif y1 > len2
        # Ran off the bottom of the graph.
        k1start += 2
      elsif front
        k2_offset = v_offset + delta - k1
        if k2_offset >= 0 && k2_offset < v_length && back_v[k2_offset] != -1
          # Mirror x2 onto the top-left coordinate system.
          mirrored_x2 = len1 - back_v[k2_offset]
          # Overlap detected.
          return diff_bisect_split(text1, text2, x1, y1, deadline) if x1 >= mirrored_x2
        end
      end
    end

    # Advance the reverse path by one step.
    (k2start - d).step(d - k2end, 2) do |k2|
      k2_offset = v_offset + k2
      from_above = k2 == -d || (k2 != d && back_v[k2_offset - 1] < back_v[k2_offset + 1])
      x2 = from_above ? back_v[k2_offset + 1] : back_v[k2_offset - 1] + 1
      y2 = x2 - k2

      # Same diagonal walk, counted from the end of both texts.
      while x2 < len1 && y2 < len2 && text1[-x2 - 1] == text2[-y2 - 1]
        x2 += 1
        y2 += 1
      end
      back_v[k2_offset] = x2

      if x2 > len1
        # Ran off the left of the graph.
        k2end += 2
      elsif y2 > len2
        # Ran off the top of the graph.
        k2start += 2
      elsif !front
        k1_offset = v_offset + delta - k2
        if k1_offset >= 0 && k1_offset < v_length && front_v[k1_offset] != -1
          x1 = front_v[k1_offset]
          y1 = v_offset + x1 - k1_offset
          # Mirror x2 onto the top-left coordinate system.
          mirrored_x2 = len1 - x2
          # Overlap detected.
          return diff_bisect_split(text1, text2, x1, y1, deadline) if x1 >= mirrored_x2
        end
      end
    end
  end

  # Diff took too long and hit the deadline or
  # number of diffs equals number of characters, no commonality at all.
  [[:delete, text1], [:insert, text2]]
end
204
+
205
# Cut both texts at the 'middle snake' position (x in text1, y in text2),
# diff the two halves one after the other and concatenate the results.
def diff_bisect_split(text1, text2, x, y, deadline)
  head_diffs = diff_main(text1[0...x], text2[0...y], false, deadline)
  tail_diffs = diff_main(text1[x..-1], text2[y..-1], false, deadline)

  head_diffs + tail_diffs
end
219
+
220
# Encode both texts line by line: every distinct line is assigned an index
# and each text becomes a string with one UTF-8 character (the index as a
# codepoint) per line.
#
# Returns [encoded_text1, encoded_text2, line_array] where line_array maps
# an index back to its line.
def diff_lines_to_chars(text1, text2)
  line_array = [""] # e.g. line_array[4] == "Hello\n"
  line_hash = {} # e.g. line_hash["Hello\n"] == 4

  encode = lambda do |text|
    text.each_line.each_with_object(+"") do |line, encoded|
      index = line_hash[line]
      unless index
        # First time this line is seen: register it under the next index.
        index = line_array.length
        line_hash[line] = index
        line_array.push(line)
      end
      encoded << index.chr(Encoding::UTF_8)
    end
  end

  [encode.call(text1), encode.call(text2), line_array]
end
242
+
243
# Expand a diff whose texts are strings of line indices back into real
# lines of text. Each entry of diffs is rewritten in place; the diffs array
# itself is returned.
def diff_chars_to_lines(diffs, line_array)
  diffs.each do |entry|
    entry[1] = entry[1].codepoints.map { |index| line_array[index] }.join
  end
end
249
+
250
# Number of characters the two strings share at their start.
def diff_common_prefix(text1, text2)
  # Nothing in common when either string is empty or the first characters
  # already differ.
  return 0 unless text1[0] && text1[0] == text2[0]

  # Binary search over the candidate length.
  # Performance analysis: http://neil.fraser.name/news/2007/10/09/
  low = 0
  high = [text1.length, text2.length].min
  mid = high
  confirmed = 0 # Everything before this index is known to match.

  until low >= mid
    span = mid - confirmed
    if text1[confirmed, span] == text2[confirmed, span]
      low = mid
      confirmed = low
    else
      high = mid
    end
    mid = low + (high - low) / 2
  end

  mid
end
274
+
275
# Number of characters the two strings share at their end.
def diff_common_suffix(text1, text2)
  # Nothing in common when either string is empty or the last characters
  # already differ.
  return 0 unless text1[-1] && text1[-1] == text2[-1]

  # Binary search over the candidate length.
  # Performance analysis: http://neil.fraser.name/news/2007/10/09/
  low = 0
  high = [text1.length, text2.length].min
  mid = high
  confirmed = 0 # This many trailing characters are known to match.

  until low >= mid
    span = mid - confirmed
    if text1[-mid, span] == text2[-mid, span]
      low = mid
      confirmed = low
    else
      high = mid
    end
    mid = low + (high - low) / 2
  end

  mid
end
299
+
300
# Determine if the suffix of one string is the prefix of another.
#
# Returns the number of characters at the end of text1 that also appear at
# the start of text2 (0 when there is no such overlap).
def diff_common_overlap(text1, text2)
  # Cache the text lengths to prevent multiple calls.
  text1_length = text1.length
  text2_length = text2.length

  # Eliminate the null case.
  return 0 if text1_length.zero? || text2_length.zero?

  # Truncate the longer string.
  if text1_length > text2_length
    text1 = text1[-text2_length..-1]
  else
    text2 = text2[0...text1_length]
  end
  text_length = [text1_length, text2_length].min

  # Quick check for the whole case.
  return text_length if text1 == text2

  # Start by looking for a single character match
  # and increase length until no match is found.
  # Performance analysis: http://neil.fraser.name/news/2010/11/04/
  best = 0
  length = 1
  loop do
    pattern = text1[(text_length - length)..-1]
    found = text2.index(pattern)

    return best if found.nil?

    length += found
    # Compare the last `length` characters of text1 with the first `length`
    # characters of text2. The range has to be exclusive: an inclusive
    # 0..length is one character too long and can never match.
    if found.zero? || text1[(text_length - length)..-1] == text2[0...length]
      best = length
      length += 1
    end
  end
end
338
+
339
# Look for a substring shared by longtext and shorttext that is at least
# half as long as longtext, seeding the search with the quarter-length slice
# of longtext that starts at index i.
#
# Returns [longtext_before, longtext_after, shorttext_before,
# shorttext_after, common] or nil when the best candidate is too short.
def diff_half_match_i(longtext, shorttext, i)
  seed = longtext[i, longtext.length / 4]
  best_common = ""
  best_parts = [nil, nil, nil, nil]

  j = shorttext.index(seed)
  while j
    # Grow the seed match in both directions.
    prefix_length = diff_common_prefix(longtext[i..-1], shorttext[j..-1])
    suffix_length = diff_common_suffix(longtext[0...i], shorttext[0...j])

    if best_common.length < suffix_length + prefix_length
      best_common = shorttext[(j - suffix_length)...(j + prefix_length)]
      best_parts = [
        longtext[0...(i - suffix_length)],
        longtext[(i + prefix_length)..-1],
        shorttext[0...(j - suffix_length)],
        shorttext[(j + prefix_length)..-1]
      ]
    end

    j = shorttext.index(seed, j + 1)
  end

  return nil unless best_common.length * 2 >= longtext.length

  best_parts + [best_common]
end
361
+
362
# Do the two texts share a substring which is at least half the length of the
# longer text?
# This speedup can produce non-minimal diffs.
#
# Returns [text1_before, text1_after, text2_before, text2_after, common] or
# nil when no half-match exists, when the inputs are too short for one to be
# worthwhile, or when diff_timeout is zero or negative.
def diff_half_match(text1, text2)
  # Don't risk returning a non-optimal diff if we have unlimited time
  return nil if diff_timeout <= 0

  # Decide once which text is the long one and reuse that decision when
  # mapping the result back. On equal lengths text2 is treated as the long
  # text; sorting by length leaves that order unspecified.
  text1_is_longer = text1.length > text2.length
  longtext, shorttext = text1_is_longer ? [text1, text2] : [text2, text1]

  # Pointless.
  return nil if longtext.length < 4 || shorttext.length * 2 < longtext.length

  # First check if the second quarter is the seed for a half-match.
  hm1 = diff_half_match_i(longtext, shorttext, (longtext.length + 3) / 4)
  # Check again based on the third quarter.
  hm2 = diff_half_match_i(longtext, shorttext, (longtext.length + 1) / 2)

  return nil unless hm1 || hm2

  hm = if hm1 && hm2
    # Both matched. Select the longest.
    hm1[4].length > hm2[4].length ? hm1 : hm2
  else
    hm1 || hm2
  end

  # A half-match was found, sort out the return data.
  long_a, long_b, short_a, short_b, mid_common = hm
  if text1_is_longer
    [long_a, long_b, short_a, short_b, mid_common]
  else
    [short_a, short_b, long_a, long_b, mid_common]
  end
end
397
+
398
# Reduce the number of edits by eliminating semantically trivial equalities.
#
# diffs is modified in place. Returns nil.
def diff_cleanup_semantic(diffs)
  changes = false
  equalities = [] # Stack of indices where equalities are found.
  last_equality = nil # Text of the equality at diffs[equalities.last].
  pointer = 0 # Index of current position.
  # Number of characters that changed prior to the equality.
  length_insertions1 = 0
  length_deletions1 = 0
  # Number of characters that changed after the equality.
  length_insertions2 = 0
  length_deletions2 = 0

  while pointer < diffs.length
    if diffs[pointer][0] == :equal # Equality found.
      equalities.push(pointer)
      length_insertions1 = length_insertions2
      length_deletions1 = length_deletions2
      length_insertions2 = 0
      length_deletions2 = 0
      last_equality = diffs[pointer][1]
    else # An insertion or deletion.
      if diffs[pointer][0] == :insert
        length_insertions2 += diffs[pointer][1].length
      else
        length_deletions2 += diffs[pointer][1].length
      end

      # Eliminate an equality that is no longer than the edits on both
      # sides of it.
      if last_equality &&
          last_equality.length <= [length_insertions1, length_deletions1].max &&
          last_equality.length <= [length_insertions2, length_deletions2].max
        # Duplicate record.
        diffs[equalities.last, 0] = [[:delete, last_equality]]

        # Change second copy to insert.
        diffs[equalities.last + 1][0] = :insert

        # Throw away the equality we just deleted.
        equalities.pop
        # Throw away the previous equality (it needs to be reevaluated).
        equalities.pop
        pointer = equalities.last || -1

        # Reset the counters.
        length_insertions1 = 0
        length_deletions1 = 0
        length_insertions2 = 0
        length_deletions2 = 0
        last_equality = nil

        changes = true
      end
    end
    pointer += 1
  end

  # Normalize the diff.
  diff_cleanup_merge(diffs) if changes
  diff_cleanup_semantic_lossless(diffs)

  # Find any overlaps between deletions and insertions.
  # e.g: <del>abcxxx</del><ins>xxxdef</ins>
  #   -> <del>abc</del>xxx<ins>def</ins>
  # e.g: <del>xxxabc</del><ins>defxxx</ins>
  #   -> <ins>def</ins>xxx<del>abc</del>
  # Only extract an overlap if it is as big as the edit ahead or behind it.
  pointer = 1
  while pointer < diffs.length
    if diffs[pointer - 1][0] == :delete && diffs[pointer][0] == :insert
      deletion = diffs[pointer - 1][1]
      insertion = diffs[pointer][1]
      overlap_length1 = diff_common_overlap(deletion, insertion)
      overlap_length2 = diff_common_overlap(insertion, deletion)
      if overlap_length1 >= overlap_length2
        if overlap_length1 >= deletion.length / 2.0 ||
            overlap_length1 >= insertion.length / 2.0
          # Overlap found. Insert an equality and trim the surrounding edits.
          diffs[pointer, 0] = [[:equal, insertion[0...overlap_length1]]]
          diffs[pointer - 1][0] = :delete
          # Slice by explicit end index: a negative end of -0 is just 0 and
          # would empty the deletion when the overlap length is zero.
          diffs[pointer - 1][1] = deletion[0...(deletion.length - overlap_length1)]
          diffs[pointer + 1][0] = :insert
          diffs[pointer + 1][1] = insertion[overlap_length1..-1]
          pointer += 1
        end
      elsif overlap_length2 >= deletion.length / 2.0 || overlap_length2 >= insertion.length / 2.0
        # Reverse overlap found. Insert an equality and swap and trim the
        # surrounding edits.
        diffs[pointer, 0] = [[:equal, deletion[0...overlap_length2]]]
        diffs[pointer - 1][0] = :insert
        diffs[pointer - 1][1] = insertion[0...(insertion.length - overlap_length2)]
        diffs[pointer + 1][0] = :delete
        diffs[pointer + 1][1] = deletion[overlap_length2..-1]
        pointer += 1
      end
      pointer += 1
    end
    pointer += 1
  end

  nil
end
495
+
496
# Given two strings, compute a score representing whether the
# internal boundary falls on logical boundaries.
# Scores range from 5 (best) to 0 (worst).
#
# one - the text to the left of the boundary.
# two - the text to the right of the boundary.
#
# Each level requires the previous one: 1 for a non-alphanumeric character
# at the boundary, 2 for whitespace, 3 for a line break, 4 when `one` ends
# with or `two` starts with a blank line, 5 when either side is empty.
def diff_cleanup_semantic_score(one, two)
  # Edges are the best.
  return 5 if one.empty? || two.empty?

  # Patterns for the characters next to the boundary.
  non_word_character = /[^a-zA-Z0-9]/
  whitespace = /\s/
  linebreak = /[\r\n]/
  # Blank-line patterns are anchored to the very end / very start of the
  # string. Ruby's `$` and `^` also match at inner line ends and starts,
  # which would count blank lines that are not next to the boundary.
  line_end = /\n\r?\n\z/
  line_start = /\A\r?\n\r?\n/

  # Each port of this function behaves slightly differently due to
  # subtle differences in each language's definition of things like
  # 'whitespace'. Since this function's purpose is largely cosmetic,
  # the choice has been made to use each language's native features
  # rather than force total conformity.
  at_boundary = lambda do |pattern|
    one[-1].match?(pattern) || two[0].match?(pattern)
  end

  return 0 unless at_boundary.call(non_word_character)
  return 1 unless at_boundary.call(whitespace)
  return 2 unless at_boundary.call(linebreak)
  return 3 unless one.match?(line_end) || two.match?(line_start)

  4
end
537
+
538
# Look for single edits surrounded on both sides by equalities
# which can be shifted sideways to align the edit to a word boundary.
# e.g: The c<ins>at c</ins>ame. -> The <ins>cat </ins>came.
#
# diffs is modified in place; equalities that become empty are removed.
def diff_cleanup_semantic_lossless(diffs)
  pointer = 1
  # Intentionally ignore the first and last element (don't need checking).
  while pointer < diffs.length - 1
    if diffs[pointer - 1][0] == :equal && diffs[pointer + 1][0] == :equal
      # This is a single edit surrounded by equalities.
      equality1 = diffs[pointer - 1][1]
      edit = diffs[pointer][1]
      equality2 = diffs[pointer + 1][1]

      # First, shift the edit as far left as possible.
      common_offset = diff_common_suffix(equality1, edit)
      if common_offset != 0
        common_string = edit[-common_offset..-1]
        equality1 = equality1[0...-common_offset]
        edit = common_string + edit[0...-common_offset]
        equality2 = common_string + equality2
      end

      # Second, step character by character right, looking for the best fit.
      best_equality1 = equality1
      best_edit = edit
      best_equality2 = equality2
      best_score = diff_cleanup_semantic_score(equality1, edit) +
        diff_cleanup_semantic_score(edit, equality2)
      # Both strings must be non-empty: comparing the first characters of two
      # empty strings is nil == nil, which would enter the loop and fail on
      # the concatenation below.
      while !edit.empty? && !equality2.empty? && edit[0] == equality2[0]
        equality1 += edit[0]
        edit = edit[1..-1] + equality2[0]
        equality2 = equality2[1..-1]
        score = diff_cleanup_semantic_score(equality1, edit) +
          diff_cleanup_semantic_score(edit, equality2)
        # The >= encourages trailing rather than leading whitespace on edits.
        if score >= best_score
          best_score = score
          best_equality1 = equality1
          best_edit = edit
          best_equality2 = equality2
        end
      end

      if diffs[pointer - 1][1] != best_equality1
        # We have an improvement, save it back to the diff.
        if best_equality1.empty?
          # The leading equality vanished; the edit moves down one slot.
          diffs[pointer - 1, 1] = []
          pointer -= 1
        else
          diffs[pointer - 1][1] = best_equality1
        end

        diffs[pointer][1] = best_edit

        if best_equality2.empty?
          diffs[pointer + 1, 1] = []
          pointer -= 1
        else
          diffs[pointer + 1][1] = best_equality2
        end
      end
    end

    pointer += 1
  end
end
604
+
605
# Reduce the number of edits by eliminating operationally trivial equalities.
#
# An equality shorter than diff_edit_cost that is surrounded by edits is
# turned into a delete plus an insert. diffs is modified in place and
# diff_cleanup_merge is run on it when anything changed.
def diff_cleanup_efficiency(diffs)
  changes = false
  equalities = [] # Stack of indices where equalities are found.
  last_equality = "" # Text of the equality at diffs[equalities.last].
  pointer = 0 # Index of current position.
  pre_ins = false # Is there an insertion operation before the last equality.
  pre_del = false # Is there a deletion operation before the last equality.
  post_ins = false # Is there an insertion operation after the last equality.
  post_del = false # Is there a deletion operation after the last equality.

  while pointer < diffs.length
    if diffs[pointer][0] == :equal # Equality found.
      if diffs[pointer][1].length < diff_edit_cost && (post_ins || post_del)
        # Candidate found.
        equalities.push(pointer)
        pre_ins = post_ins
        pre_del = post_del
        last_equality = diffs[pointer][1]
      else
        # Not a candidate, and can never become one.
        equalities.clear
        last_equality = ""
      end
      post_ins = false
      post_del = false
    else # An insertion or deletion.
      if diffs[pointer][0] == :delete
        post_del = true
      else
        post_ins = true
      end

      # Five types to be split:
      # <ins>A</ins><del>B</del>XY<ins>C</ins><del>D</del>
      # <ins>A</ins>X<ins>C</ins><del>D</del>
      # <ins>A</ins><del>B</del>X<ins>C</ins>
      # <del>A</del>X<ins>C</ins><del>D</del>
      # <ins>A</ins><del>B</del>X<del>C</del>

      if !last_equality.empty? &&
          ((pre_ins && pre_del && post_ins && post_del) ||
          ((last_equality.length < diff_edit_cost / 2) &&
          [pre_ins, pre_del, post_ins, post_del].count(true) == 3))
        # Duplicate record.
        diffs[equalities.last, 0] = [[:delete, last_equality]]
        # Change second copy to insert.
        diffs[equalities.last + 1][0] = :insert
        equalities.pop # Throw away the equality we just deleted
        last_equality = ""
        if pre_ins && pre_del
          # No changes made which could affect previous entry, keep going.
          post_ins = true
          post_del = true
          equalities.clear
        else
          # Throw away the previous equality (a no-op on an empty stack).
          equalities.pop
          # Rewind to the previous candidate, or to the very start when none
          # is left, so the flags are rebuilt from the edits that precede the
          # next equality.
          pointer = equalities.last || -1
          post_ins = false
          post_del = false
        end
        changes = true
      end
    end
    pointer += 1
  end

  if changes
    diff_cleanup_merge(diffs)
  end
end
678
+
679
# Reorder and merge like edit sections. Merge equalities.
# Any edit section can move as long as it doesn't cross an equality.
#
# diffs - Array of [op, text] pairs; rewritten in place.
def diff_cleanup_merge(diffs)
  diffs.push([:equal, ""]) # Sentinel so the final run of edits is flushed.
  idx = 0
  del_count = 0
  ins_count = 0
  deleted = ""
  inserted = ""

  while idx < diffs.length
    op, chunk = diffs[idx]
    case op
    when :insert
      ins_count += 1
      inserted += chunk
      idx += 1
    when :delete
      del_count += 1
      deleted += chunk
      idx += 1
    when :equal
      # Upon reaching an equality, check for prior redundancies.
      run = del_count + ins_count
      if run > 1
        if del_count != 0 && ins_count != 0
          # Factor out any common prefixes.
          shared = diff_common_prefix(inserted, deleted)
          unless shared == 0
            before = idx - run - 1
            if before >= 0 && diffs[before][0] == :equal
              diffs[before][1] += inserted[0...shared]
            else
              diffs.unshift([:equal, inserted[0...shared]])
              idx += 1
            end
            inserted = inserted[shared..-1]
            deleted = deleted[shared..-1]
          end
          # Factor out any common suffixes.
          shared = diff_common_suffix(inserted, deleted)
          unless shared == 0
            diffs[idx][1] = inserted[-shared..-1] + diffs[idx][1]
            inserted = inserted[0...-shared]
            deleted = deleted[0...-shared]
          end
        end

        # Delete the offending records and add the merged ones.
        merged = []
        merged << [:delete, deleted] unless del_count.zero?
        merged << [:insert, inserted] unless ins_count.zero?
        diffs[idx - run, run] = merged
        idx = idx - run + merged.length + 1
      elsif idx != 0 && diffs[idx - 1][0] == :equal
        # Merge this equality with the previous one.
        diffs[idx - 1][1] += chunk
        diffs.delete_at(idx)
      else
        idx += 1
      end
      del_count = 0
      ins_count = 0
      deleted = ""
      inserted = ""
    end
  end

  diffs.pop if diffs.last[1].empty? # Remove the sentinel again.

  # Second pass: look for single edits surrounded on both sides by equalities
  # which can be shifted sideways to eliminate an equality.
  # e.g: A<ins>BA</ins>C -> <ins>AB</ins>AC
  shifted = false
  idx = 1

  # Intentionally ignore the first and last element (don't need checking).
  while idx < diffs.length - 1
    prev_diff = diffs[idx - 1]
    edit = diffs[idx]
    next_diff = diffs[idx + 1]
    if prev_diff[0] == :equal && next_diff[0] == :equal
      # This is a single edit surrounded by equalities.
      prev_text = prev_diff[1]
      next_text = next_diff[1]
      body = edit[1]
      if body[-prev_text.length..-1] == prev_text
        # Shift the edit over the previous equality.
        edit[1] = prev_text + body[0...-prev_text.length]
        next_diff[1] = prev_text + next_text
        diffs.delete_at(idx - 1)
        shifted = true
      elsif body[0...next_text.length] == next_text
        # Shift the edit over the next equality.
        prev_diff[1] += next_text
        edit[1] = body[next_text.length..-1] + next_text
        diffs.delete_at(idx + 1)
        shifted = true
      end
    end
    idx += 1
  end

  # If shifts were made, the diff needs reordering and another shift sweep.
  diff_cleanup_merge(diffs) if shifted
end
787
+
788
# loc is a location in text1, compute and return the equivalent location
# in text2. e.g. 'The cat' vs 'The big cat', 1->1, 5->8
#
# diffs - Array of [op, text] pairs.
# loc   - Integer offset into the source text.
def diff_x_index(diffs, loc)
  source_pos = 0
  target_pos = 0
  prev_source_pos = 0
  prev_target_pos = 0
  overshooting_op = nil # Operation of the diff that carried us past loc.

  diffs.each do |op, text|
    source_pos += text.length unless op == :insert
    target_pos += text.length unless op == :delete
    if source_pos > loc
      overshooting_op = op
      break
    end
    prev_source_pos = source_pos
    prev_target_pos = target_pos
  end

  # The location was deleted: it maps to where the deletion started.
  return prev_target_pos if overshooting_op == :delete

  # Otherwise add the remaining distance into the current diff.
  prev_target_pos + (loc - prev_source_pos)
end
819
+
820
# Convert a diff array into a pretty HTML report.
#
# diffs - Array of [op, text] pairs.
#
# Returns a String in which "&", "<" and ">" are escaped, every newline is
# shown as "&para;<br>", and each diff is wrapped in <ins>, <del> or <span>.
def diff_pretty_html(diffs)
  diffs.map { |op, data|
    # The newline pattern must be double-quoted: '\n' in single quotes is a
    # backslash followed by "n" and would never match a line break.
    text = data.gsub("&", "&amp;").gsub("<", "&lt;").gsub(">", "&gt;").gsub("\n", "&para;<br>")
    case op
    when :insert
      "<ins style=\"background:#e6ffe6;\">#{text}</ins>"
    when :delete
      "<del style=\"background:#ffe6e6;\">#{text}</del>"
    when :equal
      "<span>#{text}</span>"
    end
  }.join
end
834
+
835
# Compute and return the source text (all equalities and deletions).
#
# diffs - Array of [op, text] pairs.
def diff_text1(diffs)
  kept = diffs.reject { |op, _data| op == :insert }
  kept.map { |_op, data| data }.join
end
845
+
846
# Compute and return the destination text (all equalities and insertions).
#
# diffs - Array of [op, text] pairs.
def diff_text2(diffs)
  kept = diffs.reject { |op, _data| op == :delete }
  kept.map { |_op, data| data }.join
end
856
+
857
# Compute the Levenshtein distance; the number of inserted, deleted or
# substituted characters.
#
# diffs - Array of [op, text] pairs.
def diff_levenshtein(diffs)
  distance = 0
  inserted = 0
  deleted = 0

  diffs.each do |op, data|
    if op == :insert
      inserted += data.length
    elsif op == :delete
      deleted += data.length
    elsif op == :equal
      # A deletion and an insertion is one substitution, so a run of edits
      # costs as much as the longer of its two sides.
      distance += (inserted > deleted) ? inserted : deleted
      inserted = 0
      deleted = 0
    end
  end

  # Account for a run of edits that is not closed by an equality.
  distance + [inserted, deleted].max
end
880
+
881
# Crush the diff into an encoded string which describes the operations
# required to transform text1 into text2.
# E.g. =3\t-2\t+ing -> Keep 3 chars, delete 2 chars, insert 'ing'.
# Operations are tab-separated. Inserted text is escaped using %xx notation.
#
# diffs - Array of [op, text] pairs.
def diff_to_delta(diffs)
  tokens = diffs.map do |op, data|
    if op == :insert
      "+" + PatchObj::PATCH_PARSER.escape(data, /[^0-9A-Za-z_.;!~*'(),\/?:@&=+$\#-]/)
    elsif op == :delete
      "-#{data.length}"
    elsif op == :equal
      "=#{data.length}"
    end
  end

  # Spaces are kept readable instead of being left as %20.
  tokens.join("\t").gsub("%20", " ")
end
897
+
898
# Given the original text1, and an encoded string which describes the
# operations required to transform text1 into text2, compute the full diff.
#
# text1 - The source String the delta was computed against.
# delta - Tab-separated operations: "=N" keeps N characters, "-N" deletes N
#         characters, "+text" inserts the %xx-unescaped text.
#
# Returns an Array of [op, text] pairs.
# Raises ArgumentError for an unknown operation, a count that is not a
# non-negative decimal integer, or a delta that does not consume exactly
# text1.length characters.
def diff_from_delta(text1, delta)
  # Deltas should be composed of a subset of ascii chars, Unicode not required.
  # The converted copy is discarded; the call only serves to raise when delta
  # contains characters that cannot be converted to ASCII.
  delta.encode("ascii")
  diffs = []
  pointer = 0 # Cursor in text1
  delta.split("\t").each do |token|
    # Each token begins with a one character parameter which specifies the
    # operation of this token (delete, insert, equality).
    param = token[1..-1]
    case token[0]
    when "+"
      diffs.push([:insert, PatchObj::PATCH_PARSER.unescape(param.force_encoding(Encoding::UTF_8))])
    when "-", "="
      n = begin
        # Base 10 is forced so that "010" means ten rather than octal eight.
        Integer(param, 10)
      rescue ArgumentError
        raise ArgumentError.new(
          "Invalid number in diff_fromDelta: #{param.inspect}"
        )
      end
      # Raised outside the begin block so it is reported as-is.
      if n < 0
        raise ArgumentError.new(
          "Negative number in diff_fromDelta: #{param.inspect}"
        )
      end

      text = text1[pointer...(pointer + n)]
      pointer += n
      diffs.push([(token[0] == "=") ? :equal : :delete, text])
    else
      raise ArgumentError.new(
        "Invalid diff operation in diff_fromDelta: #{token.inspect}"
      )
    end
  end

  if pointer != text1.length
    raise ArgumentError.new("Delta length (#{pointer}) does not equal " \
      "source text length #{text1.length}")
  end
  diffs
end
941
+
942
# Locate the best instance of 'pattern' in 'text' near 'loc'.
#
# Returns the match index, or -1 when text is empty and differs from pattern.
# Anything that is not an exact hit at loc is delegated to match_bitap.
# Raises ArgumentError when text or pattern is nil.
def match_main(text, pattern, loc)
  # Check for null inputs.
  raise ArgumentError.new("Null input. (match_main)") if text.nil? || pattern.nil?

  # Keep loc inside the text.
  loc = text.length if loc > text.length
  loc = 0 if loc < 0

  # Shortcut (potentially not guaranteed by the algorithm)
  return 0 if text == pattern
  # Nothing to match
  return -1 if text.empty?
  # Perfect match at the perfect spot! (Includes case of null pattern)
  return loc if text[loc, pattern.length] == pattern

  # Do a fuzzy compare.
  match_bitap(text, pattern, loc)
end
964
+
965
# Locate the best instance of 'pattern' in 'text' near 'loc' using the
# Bitap algorithm.
#
# text    - String to search.
# pattern - String to look for; at most match_max_bits characters.
# loc     - Integer index around which to search.
#
# Returns the index of the best match, or -1 when no match scores within
# match_threshold.
# Raises ArgumentError when pattern is longer than match_max_bits.
def match_bitap(text, pattern, loc)
  if pattern.length > match_max_bits
    # `raise`, not `throw`: throw is the non-local jump paired with `catch`
    # and would surface as an UncaughtThrowError here.
    raise ArgumentError.new("Pattern too long")
  end

  # Initialise the alphabet.
  s = match_alphabet(pattern)

  # Compute and return the score for a match with e errors and x location.
  # Lower is better.
  match_bitap_score = ->(e, x) do
    accuracy = e.to_f / pattern.length
    proximity = (loc - x).abs
    if match_distance == 0
      # Dodge divide by zero error.
      (proximity == 0) ? accuracy : 1.0
    else
      accuracy + (proximity.to_f / match_distance)
    end
  end

  # Highest score beyond which we give up.
  score_threshold = match_threshold
  # Is there a nearby exact match? (speedup)
  best_loc = text.index(pattern, loc)
  if best_loc
    score_threshold = [match_bitap_score[0, best_loc], score_threshold].min
    # What about in the other direction? (speedup)
    best_loc = text.rindex(pattern, loc + pattern.length)
    if best_loc
      score_threshold = [match_bitap_score[0, best_loc], score_threshold].min
    end
  end

  # Initialise the bit arrays.
  match_mask = 1 << (pattern.length - 1)
  best_loc = -1

  bin_max = pattern.length + text.length
  # Bit array of the previous error level; only read once d > 0.
  last_rd = nil
  pattern.length.times do |d|
    # Scan for the best match; each iteration allows for one more error.
    # Run a binary search to determine how far from 'loc' we can stray at this
    # error level.
    bin_min = 0
    bin_mid = bin_max
    while bin_min < bin_mid
      if match_bitap_score[d, loc + bin_mid] <= score_threshold
        bin_min = bin_mid
      else
        bin_max = bin_mid
      end
      bin_mid = (bin_max - bin_min) / 2 + bin_min
    end

    # Use the result from this iteration as the maximum for the next.
    bin_max = bin_mid
    start = [1, loc - bin_mid + 1].max
    finish = [loc + bin_mid, text.length].min + pattern.length

    rd = Array.new(finish + 2, 0)
    rd[finish + 1] = (1 << d) - 1
    finish.downto(start) do |j|
      # Positions past the end of text, and characters that are not in the
      # pattern, contribute an empty mask.
      char_match = s[text[j - 1]] || 0
      rd[j] = if d == 0 # First pass: exact match.
        ((rd[j + 1] << 1) | 1) & char_match
      else # Subsequent passes: fuzzy match.
        ((rd[j + 1] << 1) | 1) & char_match |
          (((last_rd[j + 1] | last_rd[j]) << 1) | 1) | last_rd[j + 1]
      end
      if (rd[j] & match_mask).nonzero?
        score = match_bitap_score[d, j - 1]
        # This match will almost certainly be better than any existing match.
        # But check anyway.
        if score <= score_threshold
          # Told you so.
          score_threshold = score
          best_loc = j - 1
          if best_loc > loc
            # When passing loc, don't exceed our current distance from loc.
            # NOTE: downto has already captured its bound, so this does not
            # shorten the scan that is currently running.
            start = [1, 2 * loc - best_loc].max
          else
            # Already passed loc, downhill from here on in.
            break
          end
        end
      end
    end

    # No hope for a (better) match at greater error levels.
    if match_bitap_score[d + 1, loc] > score_threshold
      break
    end
    last_rd = rd
  end

  best_loc
end
1064
+
1065
# Initialise the alphabet for the Bitap algorithm.
#
# Returns a Hash mapping each character of pattern to an Integer bitmask with
# one bit per occurrence; the first character of the pattern owns the highest
# bit. Characters that do not occur in the pattern have no entry.
def match_alphabet(pattern)
  top_bit = pattern.length - 1
  pattern.chars.each_with_index.each_with_object({}) do |(char, offset), masks|
    masks[char] = (masks[char] || 0) | (1 << (top_bit - offset))
  end
end
1074
+
1075
# Parse a textual representation of patches and return a list of patch
# objects.
#
# textline - String of one or more patches, each an "@@ -a,b +c,d @@" header
#            followed by lines starting with "-", "+" or " ".
#
# Raises ArgumentError on a malformed header or an unknown line prefix.
def patch_from_text(textline)
  return [] if textline.empty?

  header = /^@@ -(\d+),?(\d*) \+(\d+),?(\d*) @@$/
  operations = {"-" => :delete, "+" => :insert, " " => :equal}
  lines = textline.split("\n")
  patches = []
  cursor = 0

  while cursor < lines.length
    coords = lines[cursor].match(header)
    raise ArgumentError.new("Invalid patch string: #{lines[cursor]}") if coords.nil?

    patch = PatchObj.new
    patches.push(patch)

    # Header positions are 1-based unless the length is the literal "0";
    # a missing length means 1.
    patch.start1 = coords[1].to_i
    case coords[2]
    when ""
      patch.start1 -= 1
      patch.length1 = 1
    when "0"
      patch.length1 = 0
    else
      patch.start1 -= 1
      patch.length1 = coords[2].to_i
    end

    patch.start2 = coords[3].to_i
    case coords[4]
    when ""
      patch.start2 -= 1
      patch.length2 = 1
    when "0"
      patch.length2 = 0
    else
      patch.start2 -= 1
      patch.length2 = coords[4].to_i
    end
    cursor += 1

    while cursor < lines.length
      raw = lines[cursor]
      if raw.empty?
        # Blank line? Whatever.
        cursor += 1
        next
      end

      sign = raw[0]
      line = PatchObj::PATCH_PARSER.unescape(raw[1..-1].force_encoding(Encoding::UTF_8))

      # Start of next patch: leave the line for the outer loop.
      break if sign == "@"

      op = operations[sign]
      raise ArgumentError.new("Invalid patch mode \"#{sign}\" in: #{line}") if op.nil?

      patch.diffs.push([op, line])
      cursor += 1
    end
  end

  patches
end
1147
+
1148
# Take a list of patches and return a textual representation
#
# The result is the concatenation of each patch's string form.
def patch_to_text(patches)
  patches.map(&:to_s).join
end
1152
+
1153
# Increase the context until it is unique,
# but don't let the pattern expand beyond match_max_bits
#
# patch - Patch object; its diffs, start1/start2 and length1/length2 are
#         updated in place.
# text  - String the patch applies to.
def patch_add_context(patch, text)
  return if text.empty?

  pattern = text[patch.start2, patch.length1]
  padding = 0
  longest_pattern = match_max_bits - 2 * patch_margin

  # Look for the first and last matches of pattern in text. If two different
  # matches are found, increase the pattern length.
  until text.index(pattern) == text.rindex(pattern) || pattern.length >= longest_pattern
    padding += patch_margin
    from = [0, patch.start2 - padding].max
    upto = patch.start2 + patch.length1 + padding
    pattern = text[from...upto]
  end

  # Add one chunk for good luck.
  padding += patch_margin

  # Add the prefix.
  prefix_from = [0, patch.start2 - padding].max
  prefix = text[prefix_from...patch.start2]
  patch.diffs.unshift([:equal, prefix]) unless prefix.to_s.empty?

  # Add the suffix.
  suffix = text[patch.start2 + patch.length1, padding]
  patch.diffs.push([:equal, suffix]) unless suffix.to_s.empty?

  # Roll back the start points.
  patch.start1 -= prefix.length
  patch.start2 -= prefix.length

  # Extend the lengths.
  patch.length1 += prefix.length + suffix.length
  patch.length2 += prefix.length + suffix.length
end
1187
+
1188
# Compute a list of patches to turn text1 into text2.
# Use diffs if provided, otherwise compute it ourselves.
# There are four ways to call this function, depending on what data is
# available to the caller:
# Method 1:
# a = text1, b = text2
# Method 2:
# a = diffs
# Method 3 (optimal):
# a = text1, b = diffs
# Method 4 (deprecated, use method 3):
# a = text1, b = text2, c = diffs
#
# Raises ArgumentError for any other combination of arguments.
def patch_make(*args)
  first, second, third = args

  if args.length == 2 && first.is_a?(String) && second.is_a?(String)
    # Method 1: compute diffs from text1 and text2.
    text1 = first
    diffs = diff_main(text1, second, true)
    if diffs.length > 2
      diff_cleanup_semantic(diffs)
      diff_cleanup_efficiency(diffs)
    end
  elsif args.length == 1 && first.is_a?(Array)
    # Method 2: compute text1 from diffs.
    diffs = first
    text1 = diff_text1(diffs)
  elsif args.length == 2 && first.is_a?(String) && second.is_a?(Array)
    # Method 3: text1, diffs
    text1 = first
    diffs = second
  elsif args.length == 3 && first.is_a?(String) && second.is_a?(String) && third.is_a?(Array)
    # Method 4: text1, text2, diffs (text2 is not used).
    text1 = first
    diffs = third
  else
    raise ArgumentError.new("Unknown call format to patch_make.")
  end

  return [] if diffs.empty? # Get rid of the null case.

  patches = []
  patch = PatchObj.new
  pos1 = 0 # Number of characters into the text1 string.
  pos2 = 0 # Number of characters into the text2 string.
  before = text1 # Recreate the patches to determine context info.
  after = text1
  last_index = diffs.length - 1

  diffs.each_with_index do |diff, i|
    op, chunk = diff
    size = chunk.length

    if patch.diffs.empty? && op != :equal
      # A new patch starts here.
      patch.start1 = pos1
      patch.start2 = pos2
    end

    if op == :insert
      patch.diffs.push(diff)
      patch.length2 += size
      after = after[0...pos2] + chunk + after[pos2..-1]
    elsif op == :delete
      patch.length1 += size
      patch.diffs.push(diff)
      after = after[0...pos2] + after[(pos2 + size)..-1]
    elsif op == :equal
      wide = 2 * patch_margin
      if size <= wide && !patch.diffs.empty? && i != last_index
        # Small equality inside a patch.
        patch.diffs.push(diff)
        patch.length1 += size
        patch.length2 += size
      elsif size >= wide && !patch.diffs.empty?
        # Time for a new patch.
        patch_add_context(patch, before)
        patches.push(patch)
        patch = PatchObj.new
        # Unlike Unidiff, our patch lists have a rolling context.
        # http://code.google.com/p/google-diff-match-patch/wiki/Unidiff
        # Update prepatch text & pos to reflect the application of the
        # just completed patch.
        before = after
        pos1 = pos2
      end
    end

    # Update the current character count.
    pos1 += size unless op == :insert
    pos2 += size unless op == :delete
  end

  # Pick up the leftover patch if not empty.
  unless patch.diffs.empty?
    patch_add_context(patch, before)
    patches.push(patch)
  end

  patches
end
1298
+
1299
# Merge a set of patches onto the text. Return a patched text, as well
# as a list of true/false values indicating which patches were applied.
#
# patches - Array of patch objects. They are deep-copied first, so the
#           caller's objects are not modified.
# text    - String to patch.
#
# Returns [new_text, results]; results holds one boolean per patch in the
# list produced after padding and splitting.
def patch_apply(patches, text)
  return [text, []] if patches.empty?

  # Deep copy the patches so that no changes are made to originals.
  patches = Marshal.load(Marshal.dump(patches))

  null_padding = patch_add_padding(patches)
  text = null_padding + text + null_padding
  patch_split_max(patches)

  # delta keeps track of the offset between the expected and actual location
  # of the previous patch. If there are patches expected at positions 10 and
  # 20, but the first patch was found at 12, delta is 2 and the second patch
  # has an effective expected position of 22.
  delta = 0
  results = []
  patches.each_with_index do |patch, x|
    expected_loc = patch.start2 + delta
    text1 = diff_text1(patch.diffs)
    end_loc = -1
    if text1.length > match_max_bits
      # patch_split_max will only provide an oversized pattern in the case of
      # a monster delete, so match its head and its tail separately.
      start_loc = match_main(text, text1[0, match_max_bits], expected_loc)
      if start_loc != -1
        end_loc = match_main(text, text1[(text1.length - match_max_bits)..-1],
          expected_loc + text1.length - match_max_bits)
        if end_loc == -1 || start_loc >= end_loc
          # Can't find valid trailing context. Drop this patch.
          start_loc = -1
        end
      end
    else
      start_loc = match_main(text, text1, expected_loc)
    end

    if start_loc == -1
      # No match found. :(
      results[x] = false
      # Subtract the delta for this failed patch from subsequent patches.
      delta -= patch.length2 - patch.length1
    else
      # Found a match. :)
      results[x] = true
      delta = start_loc - expected_loc
      text2 = if end_loc == -1
        text[start_loc, text1.length]
      else
        # end_loc + match_max_bits is an end index, not a length, so the
        # range form is required here.
        text[start_loc...(end_loc + match_max_bits)]
      end

      if text1 == text2
        # Perfect match, just shove the replacement text in.
        text = text[0, start_loc] + diff_text2(patch.diffs) + text[(start_loc + text1.length)..-1]
      else
        # Imperfect match.
        # Run a diff to get a framework of equivalent indices.
        diffs = diff_main(text1, text2, false)
        if text1.length > match_max_bits &&
            diff_levenshtein(diffs).to_f / text1.length > patch_delete_threshold
          # The end points match, but the content is unacceptably bad.
          results[x] = false
        else
          diff_cleanup_semantic_lossless(diffs)
          index1 = 0
          patch.diffs.each do |op, data|
            if op != :equal
              index2 = diff_x_index(diffs, index1)
            end
            if op == :insert # Insertion
              text = text[0, start_loc + index2] + data + text[(start_loc + index2)..-1]
            elsif op == :delete # Deletion
              text = text[0, start_loc + index2] +
                text[(start_loc + diff_x_index(diffs, index1 + data.length))..-1]
            end
            if op != :delete
              index1 += data.length
            end
          end
        end
      end
    end
  end

  # Strip the padding off.
  text = text[null_padding.length...-null_padding.length]
  [text, results]
end
1384
+
1385
# Add some padding on text start and end so that edges can match
# something. Intended to be called only from within patch_apply.
#
# patches - Non-empty Array of patch objects; modified in place.
#
# Returns the padding String (characters with codes 1..patch_margin).
def patch_add_padding(patches)
  pad_len = patch_margin
  padding = (1..pad_len).map { |code| code.chr(Encoding::UTF_8) }.join

  # Bump all the patches forward.
  patches.each do |each_patch|
    each_patch.start1 += pad_len
    each_patch.start2 += pad_len
  end

  # Add some padding on start of first diff.
  first_patch = patches.first
  head = first_patch.diffs
  if head.empty? || head.first[0] != :equal
    # Add the padding as an equality of its own.
    head.unshift([:equal, padding])
    first_patch.start1 -= pad_len # Should be 0.
    first_patch.start2 -= pad_len # Should be 0.
    first_patch.length1 += pad_len
    first_patch.length2 += pad_len
  elsif pad_len > head.first[1].length
    # Grow first equality up to the padding length.
    present = head.first[1].length
    missing = pad_len - present
    head.first[1] = padding[present..-1] + head.first[1]
    first_patch.start1 -= missing
    first_patch.start2 -= missing
    first_patch.length1 += missing
    first_patch.length2 += missing
  end

  # Add some padding on end of last diff.
  last_patch = patches.last
  tail = last_patch.diffs
  if tail.empty? || tail.last[0] != :equal
    # Add the padding as an equality of its own.
    tail.push([:equal, padding])
    last_patch.length1 += pad_len
    last_patch.length2 += pad_len
  elsif pad_len > tail.last[1].length
    # Grow last equality up to the padding length.
    missing = pad_len - tail.last[1].length
    tail.last[1] += padding[0, missing]
    last_patch.length1 += missing
    last_patch.length2 += missing
  end

  padding
end
1435
+
1436
# Look through the patches and break up any which are longer than the
# maximum limit of the match algorithm.
#
# patches - Array of patch objects; oversized entries are replaced in place
#           by several smaller patches that overlap by up to patch_margin
#           characters of context.
def patch_split_max(patches)
  patch_size = match_max_bits

  x = 0
  while x < patches.length
    if patches[x].length1 > patch_size
      big_patch = patches[x]
      # Remove the big old patch
      patches[x, 1] = []
      x -= 1
      start1 = big_patch.start1
      start2 = big_patch.start2
      pre_context = ""
      until big_patch.diffs.empty?
        # Create one of several smaller patches.
        patch = PatchObj.new
        empty = true
        patch.start1 = start1 - pre_context.length
        patch.start2 = start2 - pre_context.length
        unless pre_context.empty?
          patch.length1 = patch.length2 = pre_context.length
          patch.diffs.push([:equal, pre_context])
        end

        while !big_patch.diffs.empty? && patch.length1 < patch_size - patch_margin
          diff = big_patch.diffs.first
          if diff[0] == :insert
            # Insertions are harmless.
            patch.length2 += diff[1].length
            start2 += diff[1].length
            patch.diffs.push(big_patch.diffs.shift)
            empty = false
          elsif diff[0] == :delete && patch.diffs.length == 1 &&
              patch.diffs.first[0] == :equal && diff[1].length > 2 * patch_size
            # This is a large deletion. Let it pass in one chunk.
            patch.length1 += diff[1].length
            start1 += diff[1].length
            empty = false
            patch.diffs.push(big_patch.diffs.shift)
          else
            # Deletion or equality. Only take as much as we can stomach.
            diff_text = diff[1][0, patch_size - patch.length1 - patch_margin]
            patch.length1 += diff_text.length
            start1 += diff_text.length
            if diff[0] == :equal
              patch.length2 += diff_text.length
              start2 += diff_text.length
            else
              empty = false
            end
            patch.diffs.push([diff[0], diff_text])
            if diff_text == big_patch.diffs.first[1]
              big_patch.diffs.shift
            else
              big_patch.diffs.first[1] = big_patch.diffs.first[1][diff_text.length..-1]
            end
          end
        end

        # Compute the head context for the next patch: the last patch_margin
        # characters of what this patch produces, or all of it when shorter.
        # (A plain [-patch_margin..-1] slice is nil for a shorter string and
        # would throw that context away.)
        produced = diff_text2(patch.diffs)
        pre_context = produced[[produced.length - patch_margin, 0].max..-1]

        # Append the end context for this patch.
        post_context = diff_text1(big_patch.diffs)[0...patch_margin]
        unless post_context.empty?
          patch.length1 += post_context.length
          patch.length2 += post_context.length
          if !patch.diffs.empty? && patch.diffs.last[0] == :equal
            patch.diffs.last[1] += post_context
          else
            patch.diffs.push([:equal, post_context])
          end
        end
        unless empty
          x += 1
          patches[x, 0] = [patch]
        end
      end
    end
    x += 1
  end
end
1520
+ end
1521
+
1522
+ DiffMatchPatch = DiMaPa # Also expose DiMaPa under the DiffMatchPatch constant.