diff_match_patch 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/LICENSE ADDED
@@ -0,0 +1,23 @@
1
+ Copyright (c) 2011, Jorge Kalmbach <kalmbach.at.gmail.com>
2
+
3
+ Permission is hereby granted, free of charge, to any
4
+ person obtaining a copy of this software and associated
5
+ documentation files (the "Software"), to deal in the
6
+ Software without restriction, including without limitation
7
+ the rights to use, copy, modify, merge, publish,
8
+ distribute, sublicense, and/or sell copies of the
9
+ Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice
13
+ shall be included in all copies or substantial portions of
14
+ the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
17
+ KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
18
+ WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
19
+ PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS
20
+ OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
21
+ OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
22
+ OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
23
+ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,11 @@
1
+ # DiffMatchPatch
2
+
3
+ A Ruby implementation of the Google diff-match-patch library.
4
+ http://code.google.com/p/google-diff-match-patch/
5
+
6
+ The Diff Match and Patch libraries offer robust algorithms to perform the operations required for synchronizing plain text.
7
+
8
+ This work was inspired by the diff_match_patch-ruby module.
9
+ (https://github.com/reima/diff_match_patch-ruby)
10
+
11
+ Copyright (c) 2011, Jorge Kalmbach <kalmbach.at.gmail.com>
@@ -0,0 +1,8 @@
1
# Rake build script: defines the test task and makes it the default.
require 'rake/testtask'

# Register the stock :test task with the test/ directory on the load path.
Rake::TestTask.new { |test_task| test_task.libs << 'test' }

desc "Run tests"
task default: :test
@@ -0,0 +1,1626 @@
1
+ require 'patch_obj'
2
+
3
+ # Class containing the diff, match and patch methods.
4
+ # Also contains the behaviour settings.
5
+ class DiffMatchPatch
6
+ attr_accessor :diff_timeout
7
+ attr_accessor :diff_editCost
8
+ attr_accessor :match_threshold
9
+ attr_accessor :match_distance
10
+ attr_accessor :patch_deleteThreshold
11
+ attr_accessor :patch_margin
12
+ attr_reader :match_maxBits
13
+
14
def initialize
  # Sets every tuning knob to its default value. All of them except
  # match_maxBits are exposed through attr_accessor and may be reassigned
  # on the instance afterwards.

  # --- Diff settings -------------------------------------------------------
  # Seconds a diff may run before giving up (0 means no time limit).
  @diff_timeout = 1
  # Cost of an empty edit operation, expressed in edit characters.
  @diff_editCost = 4

  # --- Match settings ------------------------------------------------------
  # Score above which no match is declared (0.0 = perfection, 1.0 = very loose).
  @match_threshold = 0.5
  # Search radius (0 = exact location, 1000+ = broad match). A match this
  # many characters from the expected location adds 1.0 to the score.
  @match_distance = 1000
  # Number of bits assumed for an int. Set to 0 to disable patch splitting;
  # 32 keeps patches short, which avoids pathological long-patch cases.
  @match_maxBits = 32

  # --- Patch settings ------------------------------------------------------
  # When deleting a large block of text (over ~64 characters), how closely
  # the contents must match the expected contents (0.0 = perfection,
  # 1.0 = very loose). match_threshold governs the end points of the delete.
  @patch_deleteThreshold = 0.5
  # Chunk size for context length.
  @patch_margin = 4
end
42
+
43
+
44
+ # Find the differences between two texts. Simplifies the problem by
45
+ # stripping any common prefix or suffix off the texts before diffing.
46
def diff_main(text1, text2, checklines=true, deadline=nil)
  # Public entry point for diffing. Peels off the shared prefix and suffix,
  # diffs what is left with diff_compute, then re-attaches the shared parts.
  #
  # text1, text2 - old and new strings; neither may be nil.
  # checklines   - when true (or nil) a faster line-level pass may be used.
  # deadline     - Time by which to finish; derived from diff_timeout if nil.
  #
  # Returns an array of [operation, text] pairs, where operation is one of
  # :equal, :insert or :delete. Raises ArgumentError for nil input.

  # Derive a deadline from the configured timeout unless one was supplied.
  deadline = Time.now + diff_timeout if deadline.nil? && diff_timeout > 0

  raise ArgumentError, 'Null inputs. (diff_main)' if text1.nil? || text2.nil?

  # Identical inputs need no further work.
  if text1 == text2
    return text1.empty? ? [] : [[:equal, text1]]
  end

  checklines = true if checklines.nil?

  # Strip the shared prefix.
  prefix_size = diff_commonPrefix(text1, text2)
  unless prefix_size.zero?
    common_prefix = text1[0...prefix_size]
    text1 = text1[prefix_size..-1]
    text2 = text2[prefix_size..-1]
  end

  # Strip the shared suffix.
  suffix_size = diff_commonSuffix(text1, text2)
  unless suffix_size.zero?
    common_suffix = text1[-suffix_size..-1]
    text1 = text1[0...-suffix_size]
    text2 = text2[0...-suffix_size]
  end

  # Diff the remaining middle section.
  diffs = diff_compute(text1, text2, checklines, deadline)

  # Put the shared parts back and normalise the result.
  diffs.unshift([:equal, common_prefix]) if common_prefix
  diffs.push([:equal, common_suffix]) if common_suffix
  diff_cleanupMerge(diffs)

  diffs
end
91
+
92
+ # Find the differences between two texts. Assumes that the texts do not
93
+ # have any common prefix or suffix.
94
def diff_compute(text1, text2, checklines, deadline)
  # Diffs two texts that are assumed to share no prefix or suffix. Tries a
  # series of shortcuts before falling back to line mode or bisection.
  # Returns an array of [operation, text] pairs.

  # Pure insertion / pure deletion.
  return [[:insert, text2]] if text1.empty?
  return [[:delete, text1]] if text2.empty?

  shorter, longer = [text1, text2].sort_by(&:length)
  found_at = longer.index(shorter)
  if found_at
    # The shorter text sits inside the longer one: everything around it is
    # one kind of edit, whose direction depends on which input is longer.
    edit_op = text1.length > text2.length ? :delete : :insert
    return [[edit_op, longer[0...found_at]],
            [:equal, shorter],
            [edit_op, longer[(found_at + shorter.length)..-1]]]
  end

  # A single character that was not found above cannot be an equality.
  return [[:delete, text1], [:insert, text2]] if shorter.length == 1

  # See whether the problem can be split around a large common substring.
  half_match = diff_halfMatch(text1, text2)
  if half_match
    text1_a, text1_b, text2_a, text2_b, mid_common = half_match
    # Diff each side independently and glue them around the common middle.
    left = diff_main(text1_a, text2_a, checklines, deadline)
    right = diff_main(text1_b, text2_b, checklines, deadline)
    return left + [[:equal, mid_common]] + right
  end

  if checklines && text1.length > 100 && text2.length > 100
    diff_lineMode(text1, text2, deadline)
  else
    diff_bisect(text1, text2, deadline)
  end
end
143
+
144
+ # Do a quick line-level diff on both strings, then rediff the parts for
145
+ # greater accuracy.
146
+ # This speedup can produce non-minimal diffs.
147
def diff_lineMode(text1, text2, deadline)
  # Two-stage diff: a coarse pass in which each line is one character,
  # followed by a character-level re-diff of every delete/insert run.
  # Returns an array of [operation, text] pairs (may be non-minimal).

  # Stage 1: line-level diff on the encoded texts.
  text1, text2, line_array = diff_linesToChars(text1, text2)
  diffs = diff_main(text1, text2, false, deadline)

  # Decode back to real lines and drop freak matches such as blank lines.
  diff_charsToLines(diffs, line_array)
  diff_cleanupSemantic(diffs)

  # Stage 2: re-diff replacement blocks character by character.
  # A trailing dummy equality flushes the final run.
  diffs.push([:equal, ''])
  deletes = 0
  inserts = 0
  deleted_text = ''
  inserted_text = ''
  pointer = 0

  while pointer < diffs.length
    op, data = diffs[pointer]
    if op == :insert
      inserts += 1
      inserted_text += data
    elsif op == :delete
      deletes += 1
      deleted_text += data
    elsif op == :equal
      if deletes >= 1 && inserts >= 1
        # Swap the coarse run for its fine-grained diff.
        refined = diff_main(deleted_text, inserted_text, false, deadline)
        run_start = pointer - deletes - inserts
        diffs[run_start, deletes + inserts] = refined
        pointer = run_start + refined.length
      end
      inserts = 0
      deletes = 0
      deleted_text = ''
      inserted_text = ''
    end
    pointer += 1
  end

  diffs.pop # Drop the trailing dummy equality.
  diffs
end
197
+
198
+ # Find the 'middle snake' of a diff, split the problem in two
199
+ # and return the recursively constructed diff.
200
+ # See Myers 1986 paper: An O(ND) Difference Algorithm and Its Variations.
201
def diff_bisect(text1, text2, deadline)
  # Walks a forward path from the start of both texts and a reverse path
  # from the end, one edit step (d) at a time. When the two paths overlap
  # the texts are split at that point via diff_bisectSplit.
  # If the deadline passes, or the paths never overlap, the result is a
  # plain delete-all / insert-all pair.
  len1 = text1.length
  len2 = text2.length
  max_d = (len1 + len2 + 1) / 2
  v_offset = max_d
  v_length = 2 * max_d
  # forward[k + v_offset] / backward[k + v_offset] hold the furthest x
  # reached on diagonal k; -1 marks "not visited".
  forward = Array.new(v_length, -1)
  backward = Array.new(v_length, -1)
  forward[v_offset + 1] = 0
  backward[v_offset + 1] = 0
  delta = len1 - len2

  # With an odd delta the overlap is detected while walking the front path,
  # otherwise while walking the reverse path.
  front = delta.odd?

  # Offsets that trim the k range so paths do not map space beyond the grid.
  k1start = 0
  k1end = 0
  k2start = 0
  k2end = 0

  (0...max_d).each do |d|
    # Give up once the deadline has passed.
    break if deadline && Time.now >= deadline

    # Advance the front path by one step. The step limit is fixed when the
    # iteration starts, even though k1end may change inside the block.
    (k1start - d).step(d - k1end, 2) do |k1|
      k1_offset = v_offset + k1
      x1 = if k1 == -d || (k1 != d && forward[k1_offset - 1] < forward[k1_offset + 1])
             forward[k1_offset + 1]
           else
             forward[k1_offset - 1] + 1
           end

      y1 = x1 - k1
      # Follow the diagonal while characters match.
      while x1 < len1 && y1 < len2 && text1[x1] == text2[y1]
        x1 += 1
        y1 += 1
      end

      forward[k1_offset] = x1
      if x1 > len1
        # Ran off the right of the graph.
        k1end += 2
      elsif y1 > len2
        # Ran off the bottom of the graph.
        k1start += 2
      elsif front
        k2_offset = v_offset + delta - k1
        if k2_offset >= 0 && k2_offset < v_length && backward[k2_offset] != -1
          # Mirror the reverse x onto the top-left coordinate system.
          x2 = len1 - backward[k2_offset]
          # Overlap detected.
          return diff_bisectSplit(text1, text2, x1, y1, deadline) if x1 >= x2
        end
      end
    end

    # Advance the reverse path by one step.
    (k2start - d).step(d - k2end, 2) do |k2|
      k2_offset = v_offset + k2
      x2 = if k2 == -d || (k2 != d && backward[k2_offset - 1] < backward[k2_offset + 1])
             backward[k2_offset + 1]
           else
             backward[k2_offset - 1] + 1
           end

      y2 = x2 - k2
      # Follow the diagonal backwards from the ends of both texts.
      while x2 < len1 && y2 < len2 && text1[-x2 - 1] == text2[-y2 - 1]
        x2 += 1
        y2 += 1
      end

      backward[k2_offset] = x2
      if x2 > len1
        # Ran off the left of the graph.
        k2end += 2
      elsif y2 > len2
        # Ran off the top of the graph.
        k2start += 2
      elsif !front
        k1_offset = v_offset + delta - k2
        if k1_offset >= 0 && k1_offset < v_length && forward[k1_offset] != -1
          x1 = forward[k1_offset]
          y1 = v_offset + x1 - k1_offset
          # Mirror the reverse x onto the top-left coordinate system.
          mirrored_x2 = len1 - x2
          # Overlap detected.
          return diff_bisectSplit(text1, text2, x1, y1, deadline) if x1 >= mirrored_x2
        end
      end
    end
  end

  # Either the deadline was hit or the paths never overlapped.
  [[:delete, text1], [:insert, text2]]
end
304
+
305
+ # Given the location of the 'middle snake', split the diff in two parts
306
+ # and recurse.
307
def diff_bisectSplit(text1, text2, x, y, deadline)
  # Cuts text1 at x and text2 at y, diffs the two leading pieces and the two
  # trailing pieces separately (line mode off) and returns the
  # concatenated result.
  head = diff_main(text1[0...x], text2[0...y], false, deadline)
  tail = diff_main(text1[x..-1], text2[y..-1], false, deadline)

  head + tail
end
319
+
320
+ # Split two texts into an array of strings. Reduce the texts to a string
321
+ # of hashes where each Unicode character represents one line.
322
def diff_linesToChars(text1, text2)
  # Encodes each text as a string in which every character stands for one
  # line, using a line table shared by both texts.
  #
  # Returns [chars1, chars2, line_array], where the code point of each
  # character in chars1/chars2 is an index into line_array. Index 0 is a
  # reserved empty entry so that no line is encoded as "\0".
  #
  # Code points 0xD800..0xDFFF (UTF-16 surrogates) cannot be encoded in
  # UTF-8, so those indices are filled with unused empty placeholders and
  # skipped. Inputs needing an index above 0x10FFFF still raise RangeError
  # from Integer#chr.
  line_array = [''] # e.g. line_array[4] == "Hello\n"
  line_hash = {}    # e.g. line_hash["Hello\n"] == 4

  encoded = [text1, text2].map do |text|
    # Append in place; rebuilding the string for every line is quadratic.
    chars = ''.dup.force_encoding(Encoding::UTF_8)
    text.each_line do |line|
      code = line_hash[line]
      unless code
        # Jump over the surrogate block before handing out the next index.
        line_array.concat(Array.new(0x800, '')) if line_array.length == 0xD800
        code = line_array.length
        line_hash[line] = code
        line_array.push(line)
      end
      chars << code.chr(Encoding::UTF_8)
    end
    chars
  end

  encoded.push(line_array)
end
342
+
343
+ # Rehydrate the text in a diff from a string of line hashes to real lines of text.
344
def diff_charsToLines(diffs, line_array)
  # Replaces, in place, the text of every diff entry with the lines its
  # characters stand for: each character's code point is looked up in
  # line_array. Returns diffs.
  diffs.each do |entry|
    decoded_lines = entry[1].each_char.map { |encoded| line_array[encoded.ord] }
    entry[1] = decoded_lines.join
  end
end
349
+
350
+ # Determine the common prefix of two strings.
351
def diff_commonPrefix(text1, text2)
  # Returns the number of leading characters text1 and text2 share.

  # Nothing in common if either text is empty or the first characters differ.
  return 0 if text1.empty? || text2.empty? || text1[0] != text2[0]

  # Binary search on the prefix length. Only the not-yet-verified slice
  # (from `verified` up to the candidate length) is compared each round.
  low = 0
  high = [text1.length, text2.length].min
  candidate = high
  verified = 0

  while low < candidate
    if text1[verified...candidate] == text2[verified...candidate]
      low = candidate
      verified = low
    else
      high = candidate
    end
    candidate = low + (high - low) / 2
  end

  candidate
end
374
+
375
+ # Determine the common suffix of two strings.
376
def diff_commonSuffix(text1, text2)
  # Returns the number of trailing characters text1 and text2 share.

  # Nothing in common if either text is empty or the last characters differ.
  return 0 if text1.empty? || text2.empty? || text1[-1] != text2[-1]

  # Binary search on the suffix length. Only the not-yet-verified slice
  # (between the candidate length and `verified`, counted from the end) is
  # compared each round.
  low = 0
  high = [text1.length, text2.length].min
  candidate = high
  verified = 0

  while low < candidate
    if text1[-candidate..(-verified - 1)] == text2[-candidate..(-verified - 1)]
      low = candidate
      verified = low
    else
      high = candidate
    end
    candidate = low + (high - low) / 2
  end

  candidate
end
399
+
400
+ # Determine if the suffix of one string is the prefix of another.
401
def diff_commonOverlap(text1, text2)
  # Returns the length of the longest suffix of text1 that is also a prefix
  # of text2 (0 when there is none).
  len1 = text1.length
  len2 = text2.length

  # Eliminate the null case.
  return 0 if len1.zero? || len2.zero?

  # Only the last/first min(len1, len2) characters can take part.
  if len1 > len2
    text1 = text1[-len2..-1]
  else
    text2 = text2[0...len1]
  end
  text_length = [len1, len2].min

  # Quick check for the whole case.
  return text_length if text1 == text2

  # Start with a one-character suffix and grow it until it is no longer
  # found in text2.
  # Performance analysis: http://neil.fraser.name/news/2010/11/04/
  best = 0
  length = 1
  loop do
    pattern = text1[(text_length - length)..-1]
    found = text2.index(pattern)

    return best if found.nil?

    length += found
    # Compare against exactly `length` leading characters of text2. An
    # inclusive range would take one character too many and never match.
    if found.zero? || text1[(text_length - length)..-1] == text2[0...length]
      best = length
      length += 1
    end
  end
end
438
+
439
+ # Does a substring of shorttext exist within longtext such that the
440
+ # substring is at least half the length of longtext?
441
def diff_halfMatchI(longtext, shorttext, i)
  # Uses the quarter-length slice of longtext starting at i as a seed,
  # extends every occurrence of that seed in shorttext in both directions,
  # and keeps the longest common substring found.
  #
  # Returns [longtext_a, longtext_b, shorttext_a, shorttext_b, common] when
  # the common part is at least half of longtext, otherwise nil.
  seed = longtext[i, longtext.length / 4]
  best_common = ''
  best_longtext_a = best_longtext_b = nil
  best_shorttext_a = best_shorttext_b = nil

  j = shorttext.index(seed)
  while j
    prefix_length = diff_commonPrefix(longtext[i..-1], shorttext[j..-1])
    suffix_length = diff_commonSuffix(longtext[0...i], shorttext[0...j])
    if best_common.length < suffix_length + prefix_length
      # The match runs from suffix_length before j to prefix_length after it.
      best_common = shorttext[(j - suffix_length)...(j + prefix_length)]
      best_longtext_a = longtext[0...(i - suffix_length)]
      best_longtext_b = longtext[(i + prefix_length)..-1]
      best_shorttext_a = shorttext[0...(j - suffix_length)]
      best_shorttext_b = shorttext[(j + prefix_length)..-1]
    end
    j = shorttext.index(seed, j + 1)
  end

  return nil if best_common.length * 2 < longtext.length

  [best_longtext_a, best_longtext_b, best_shorttext_a, best_shorttext_b, best_common]
end
461
+
462
+ # Do the two texts share a substring which is at least half the length of the
463
+ # longer text?
464
+ # This speedup can produce non-minimal diffs.
465
def diff_halfMatch(text1, text2)
  # Looks for a substring shared by both texts that is at least half as long
  # as the longer text.
  #
  # Returns [text1_a, text1_b, text2_a, text2_b, mid_common], or nil when no
  # such substring exists or when there is no time limit (this shortcut can
  # yield a non-minimal diff).
  return nil if diff_timeout <= 0

  # Choose long/short explicitly. For equal lengths text2 counts as the
  # longer one, matching the un-swapping below. Sorting by length would
  # depend on sort_by keeping input order for ties, which is not guaranteed.
  text1_longer = text1.length > text2.length
  longtext, shorttext = text1_longer ? [text1, text2] : [text2, text1]

  # Pointless: too short, or the short text cannot cover half the long one.
  return nil if longtext.length < 4 || shorttext.length * 2 < longtext.length

  # Seed from the second quarter, then from the third quarter.
  hm1 = diff_halfMatchI(longtext, shorttext, (longtext.length + 3) / 4)
  hm2 = diff_halfMatchI(longtext, shorttext, (longtext.length + 1) / 2)

  return nil if hm1.nil? && hm2.nil?

  hm = if hm2.nil?
         hm1
       elsif hm1.nil?
         hm2
       else
         # Both matched; keep the longer common part.
         hm1[4].length > hm2[4].length ? hm1 : hm2
       end

  # hm is ordered (long_a, long_b, short_a, short_b, common); map it back
  # onto text1/text2.
  if text1_longer
    text1_a, text1_b, text2_a, text2_b, mid_common = hm
  else
    text2_a, text2_b, text1_a, text1_b, mid_common = hm
  end

  [text1_a, text1_b, text2_a, text2_b, mid_common]
end
497
+
498
+ # Reduce the number of edits by eliminating semantically trivial equalities.
499
def diff_cleanupSemantic(diffs)
  # Rewrites diffs in place in two passes:
  #   1. an equality no longer than the edits on both of its sides is turned
  #      into a delete + insert pair;
  #   2. a delete followed by an insert that overlap are split so that the
  #      overlap becomes an equality.
  changes = false
  equalities = []     # Stack of indices where equalities are found.
  last_equality = nil # Text of the equality on top of the stack.
  pointer = 0         # Index of current position.
  # Number of characters that changed prior to the equality.
  length_insertions1 = 0
  length_deletions1 = 0
  # Number of characters that changed after the equality.
  length_insertions2 = 0
  length_deletions2 = 0

  while pointer < diffs.length
    op, text = diffs[pointer]
    if op == :equal
      equalities.push(pointer)
      length_insertions1 = length_insertions2
      length_deletions1 = length_deletions2
      length_insertions2 = 0
      length_deletions2 = 0
      last_equality = text
    else
      if op == :insert
        length_insertions2 += text.length
      else
        length_deletions2 += text.length
      end

      if last_equality &&
         last_equality.length <= [length_insertions1, length_deletions1].max &&
         last_equality.length <= [length_insertions2, length_deletions2].max
        # Duplicate the record as a deletion and turn the original
        # equality into an insertion.
        diffs[equalities.last, 0] = [[:delete, last_equality]]
        diffs[equalities.last + 1][0] = :insert

        # Throw away the equality we just replaced, and the previous one
        # (it needs to be reevaluated).
        equalities.pop
        equalities.pop
        pointer = equalities.last || -1

        # Reset the counters.
        length_insertions1 = 0
        length_deletions1 = 0
        length_insertions2 = 0
        length_deletions2 = 0
        last_equality = nil

        changes = true
      end
    end
    pointer += 1
  end

  # Normalize the diff.
  diff_cleanupMerge(diffs) if changes
  diff_cleanupSemanticLossless(diffs)

  # Find any overlaps between deletions and insertions.
  # e.g: <del>abcxxx</del><ins>xxxdef</ins>
  #   -> <del>abc</del>xxx<ins>def</ins>
  # e.g: <del>xxxabc</del><ins>defxxx</ins>
  #   -> <ins>def</ins>xxx<del>abc</del>
  # Only extract an overlap if it is as big as the edit ahead or behind it.
  pointer = 1
  while pointer < diffs.length
    if diffs[pointer - 1][0] == :delete && diffs[pointer][0] == :insert
      deletion = diffs[pointer - 1][1]
      insertion = diffs[pointer][1]
      overlap_length1 = diff_commonOverlap(deletion, insertion)
      overlap_length2 = diff_commonOverlap(insertion, deletion)
      if overlap_length1 >= overlap_length2
        if overlap_length1 >= deletion.length / 2.0 ||
           overlap_length1 >= insertion.length / 2.0
          # Overlap found. Insert an equality and trim the surrounding edits.
          # Trim by explicit length: a range ending in -0 would yield an
          # empty string and drop the deletion when the overlap is zero.
          diffs[pointer, 0] = [[:equal, insertion[0...overlap_length1]]]
          diffs[pointer - 1][0] = :delete
          diffs[pointer - 1][1] = deletion[0, deletion.length - overlap_length1]
          diffs[pointer + 1][0] = :insert
          diffs[pointer + 1][1] = insertion[overlap_length1..-1]
          pointer += 1
        end
      elsif overlap_length2 >= deletion.length / 2.0 ||
            overlap_length2 >= insertion.length / 2.0
        # Reverse overlap found. Insert an equality and swap and trim the
        # surrounding edits.
        diffs[pointer, 0] = [[:equal, deletion[0...overlap_length2]]]
        diffs[pointer - 1][0] = :insert
        diffs[pointer - 1][1] = insertion[0, insertion.length - overlap_length2]
        diffs[pointer + 1][0] = :delete
        diffs[pointer + 1][1] = deletion[overlap_length2..-1]
        pointer += 1
      end
      pointer += 1
    end
    pointer += 1
  end
end
600
+
601
+ # Given two strings, compute a score representing whether the
602
+ # internal boundary falls on logical boundaries.
603
+ # Scores range from 5 (best) to 0 (worst).
604
def diff_cleanupSemanticScore(one, two)
  # Scores the boundary between the end of `one` and the start of `two`.
  # Each level requires the previous one:
  #   5 - either string is empty (an edge)
  #   4 - `one` ends with, or `two` starts with, a blank line
  #   3 - a line break touches the boundary
  #   2 - whitespace touches the boundary
  #   1 - a non-alphanumeric character touches the boundary
  #   0 - none of the above
  return 5 if one.empty? || two.empty?

  non_word_character = /[^a-zA-Z0-9]/
  whitespace = /\s/
  linebreak = /[\r\n]/
  # \A and \z anchor to the whole string. Ruby's ^ and $ match at every
  # line, which would also count blank lines in the middle of the text.
  blank_line_end = /\n\r?\n\z/
  blank_line_start = /\A\r?\n\r?\n/

  tail = one[-1]
  head = two[0]

  return 0 unless tail =~ non_word_character || head =~ non_word_character
  return 1 unless tail =~ whitespace || head =~ whitespace
  return 2 unless tail =~ linebreak || head =~ linebreak
  return 3 unless one =~ blank_line_end || two =~ blank_line_start

  4
end
642
+
643
+ # Look for single edits surrounded on both sides by equalities
644
+ # which can be shifted sideways to align the edit to a word boundary.
645
+ # e.g: The c<ins>at c</ins>ame. -> The <ins>cat </ins>came.
646
def diff_cleanupSemanticLossless(diffs)
  # For every edit that has an equality on both sides, slides the edit left
  # and right over matching characters and keeps the position with the
  # highest diff_cleanupSemanticScore. Equalities emptied by the shift are
  # removed. Modifies diffs in place.
  pointer = 1
  # Intentionally ignore the first and last element (don't need checking).
  while pointer < diffs.length - 1
    if diffs[pointer - 1][0] == :equal && diffs[pointer + 1][0] == :equal
      # This is a single edit surrounded by equalities.
      equality1 = diffs[pointer - 1][1]
      edit = diffs[pointer][1]
      equality2 = diffs[pointer + 1][1]

      # First, shift the edit as far left as possible.
      common_offset = diff_commonSuffix(equality1, edit)
      if common_offset != 0
        common_string = edit[-common_offset..-1]
        equality1 = equality1[0...-common_offset]
        edit = common_string + edit[0...-common_offset]
        equality2 = common_string + equality2
      end

      # Second, step character by character right, looking for the best fit.
      best_equality1 = equality1
      best_edit = edit
      best_equality2 = equality2
      best_score = diff_cleanupSemanticScore(equality1, edit) +
                   diff_cleanupSemanticScore(edit, equality2)
      # Stop once either string runs out. Without the emptiness checks two
      # empty strings would compare nil == nil and then fail on `+= nil`.
      while !edit.empty? && !equality2.empty? && edit[0] == equality2[0]
        equality1 += edit[0]
        edit = edit[1..-1] + equality2[0]
        equality2 = equality2[1..-1]
        score = diff_cleanupSemanticScore(equality1, edit) +
                diff_cleanupSemanticScore(edit, equality2)
        # The >= encourages trailing rather than leading whitespace on edits.
        if score >= best_score
          best_score = score
          best_equality1 = equality1
          best_edit = edit
          best_equality2 = equality2
        end
      end

      if diffs[pointer - 1][1] != best_equality1
        # We have an improvement, save it back to the diff.
        if best_equality1.empty?
          diffs[pointer - 1, 1] = []
          pointer -= 1
        else
          diffs[pointer - 1][1] = best_equality1
        end

        diffs[pointer][1] = best_edit

        if best_equality2.empty?
          diffs[pointer + 1, 1] = []
          pointer -= 1
        else
          diffs[pointer + 1][1] = best_equality2
        end
      end
    end

    pointer += 1
  end
end
709
+
710
+ # Reduce the number of edits by eliminating operationally trivial equalities.
711
def diff_cleanupEfficiency(diffs)
  # Turns short equalities (shorter than diff_editCost) that are hemmed in
  # by edits into a delete + insert pair. Modifies diffs in place and
  # re-normalises with diff_cleanupMerge when anything changed.
  changes = false
  equalities = []    # Stack of indices where candidate equalities are found.
  last_equality = '' # Text of the equality on top of the stack ('' if none).
  pointer = 0        # Index of current position.
  pre_ins = false    # Is there an insertion operation before the last equality.
  pre_del = false    # Is there a deletion operation before the last equality.
  post_ins = false   # Is there an insertion operation after the last equality.
  post_del = false   # Is there a deletion operation after the last equality.

  while pointer < diffs.length
    op, text = diffs[pointer]
    if op == :equal
      if text.length < diff_editCost && (post_ins || post_del)
        # Candidate found.
        equalities.push(pointer)
        pre_ins = post_ins
        pre_del = post_del
        last_equality = text
      else
        # Not a candidate, and can never become one.
        equalities.clear
        last_equality = ''
      end
      post_ins = false
      post_del = false
    else
      if op == :delete
        post_del = true
      else
        post_ins = true
      end

      # Five types to be split:
      # <ins>A</ins><del>B</del>XY<ins>C</ins><del>D</del>
      # <ins>A</ins>X<ins>C</ins><del>D</del>
      # <ins>A</ins><del>B</del>X<ins>C</ins>
      # <del>A</del>X<ins>C</ins><del>D</del>
      # <ins>A</ins><del>B</del>X<del>C</del>
      if !last_equality.empty? &&
         ((pre_ins && pre_del && post_ins && post_del) ||
          ((last_equality.length < diff_editCost / 2) &&
           [pre_ins, pre_del, post_ins, post_del].count(true) == 3))
        # Duplicate the record as a deletion and turn the original
        # equality into an insertion.
        diffs[equalities.last, 0] = [[:delete, last_equality]]
        diffs[equalities.last + 1][0] = :insert
        equalities.pop # Throw away the equality we just replaced.
        last_equality = ''
        if pre_ins && pre_del
          # No changes made which could affect previous entry, keep going.
          post_ins = true
          post_del = true
          equalities.clear
        else
          # Re-evaluate from the previous candidate, or from the very start
          # when none is left. Restarting is required so the post flags
          # reflect the edits that were just merged.
          equalities.pop # Throw away the previous equality (no-op if empty).
          pointer = equalities.last || -1
          post_ins = false
          post_del = false
        end
        changes = true
      end
    end
    pointer += 1
  end

  diff_cleanupMerge(diffs) if changes
end
783
+
784
+ # Reorder and merge like edit sections. Merge equalities.
785
+ # Any edit section can move as long as it doesn't cross an equality.
786
def diff_cleanupMerge(diffs)
  # Normalises diffs in place:
  #   * each run of edits between two equalities becomes at most one delete
  #     followed by one insert, with any common prefix/suffix moved into the
  #     neighbouring equalities; a side left empty by that is dropped;
  #   * adjacent equalities are merged;
  #   * a single edit between two equalities is shifted sideways when that
  #     lets an equality be absorbed, and the whole process is repeated.
  diffs.push([:equal, '']) # Add a dummy entry at the end.
  pointer = 0
  count_delete = 0
  count_insert = 0
  text_delete = ''
  text_insert = ''

  while pointer < diffs.length
    case diffs[pointer][0]
    when :insert
      count_insert += 1
      text_insert += diffs[pointer][1]
      pointer += 1
    when :delete
      count_delete += 1
      text_delete += diffs[pointer][1]
      pointer += 1
    when :equal
      # Upon reaching an equality, check for prior redundancies.
      run_length = count_delete + count_insert
      if run_length > 1
        run_start = pointer - run_length
        if count_delete != 0 && count_insert != 0
          # Factor out any common prefixes.
          common_length = diff_commonPrefix(text_insert, text_delete)
          if common_length != 0
            if run_start > 0 && diffs[run_start - 1][0] == :equal
              diffs[run_start - 1][1] += text_insert[0...common_length]
            else
              # No equality in front of the run: create one at the start,
              # which shifts every index by one.
              diffs.unshift([:equal, text_insert[0...common_length]])
              pointer += 1
              run_start += 1
            end
            text_insert = text_insert[common_length..-1]
            text_delete = text_delete[common_length..-1]
          end
          # Factor out any common suffixes.
          common_length = diff_commonSuffix(text_insert, text_delete)
          if common_length != 0
            diffs[pointer][1] = text_insert[-common_length..-1] + diffs[pointer][1]
            text_insert = text_insert[0...-common_length]
            text_delete = text_delete[0...-common_length]
          end
        end

        # Replace the run with the merged edits, leaving out a side whose
        # text is empty.
        merged = []
        merged << [:delete, text_delete] unless text_delete.empty?
        merged << [:insert, text_insert] unless text_insert.empty?
        diffs[run_start, run_length] = merged
        # If the run vanished, look at this equality again so it can be
        # merged with the one before it; otherwise step past it.
        pointer = merged.empty? ? run_start : run_start + merged.length + 1
      elsif pointer != 0 && diffs[pointer - 1][0] == :equal
        # Merge this equality with the previous one.
        diffs[pointer - 1][1] += diffs[pointer][1]
        diffs[pointer, 1] = []
      else
        pointer += 1
      end
      count_insert = 0
      count_delete = 0
      text_delete = ''
      text_insert = ''
    end
  end

  # Remove the dummy entry at the end (unless text was merged into it).
  diffs.pop if diffs.last[1].empty?

  # Second pass: look for single edits surrounded on both sides by equalities
  # which can be shifted sideways to eliminate an equality.
  # e.g: A<ins>BA</ins>C -> <ins>AB</ins>AC
  changes = false
  pointer = 1

  # Intentionally ignore the first and last element (don't need checking).
  while pointer < diffs.length - 1
    if diffs[pointer - 1][0] == :equal && diffs[pointer + 1][0] == :equal
      # This is a single edit surrounded by equalities.
      prev_text = diffs[pointer - 1][1]
      edit_text = diffs[pointer][1]
      next_text = diffs[pointer + 1][1]
      if edit_text[-prev_text.length..-1] == prev_text
        # Shift the edit over the previous equality.
        diffs[pointer][1] = prev_text + edit_text[0...-prev_text.length]
        diffs[pointer + 1][1] = prev_text + next_text
        diffs[pointer - 1, 1] = []
        changes = true
      elsif edit_text[0...next_text.length] == next_text
        # Shift the edit over the next equality.
        diffs[pointer - 1][1] += next_text
        diffs[pointer][1] = edit_text[next_text.length..-1] + next_text
        diffs[pointer + 1, 1] = []
        changes = true
      end
    end
    pointer += 1
  end

  # If shifts were made, the diff needs reordering and another shift sweep.
  diff_cleanupMerge(diffs) if changes
end
895
+
896
# loc is a location in text1; compute and return the equivalent location
# in text2. e.g. 'The cat' vs 'The big cat', 1->1, 5->8
#
# diffs - Array of [operation, text] pairs (:equal, :insert, :delete).
# loc   - Integer offset into text1.
#
# Returns the Integer offset into text2. When loc falls inside deleted text
# the result is the text2 offset at which that deletion starts.
def diff_xIndex(diffs, loc)
  chars1 = 0
  chars2 = 0
  last_chars1 = 0
  last_chars2 = 0

  # Index of the first diff that reaches past loc. This is nil when no diff
  # does (loc at or beyond the end of text1, or diffs is empty).
  x = diffs.index do |diff|
    chars1 += diff[1].length if diff[0] != :insert
    chars2 += diff[1].length if diff[0] != :delete
    if chars1 > loc
      true
    else
      last_chars1 = chars1
      last_chars2 = chars2
      false
    end
  end

  # Guard against nil before indexing: diffs[nil] would raise TypeError.
  if x && diffs[x][0] == :delete
    # The location was deleted.
    last_chars2
  else
    # Add the remaining character length.
    last_chars2 + (loc - last_chars1)
  end
end
927
+
928
# Convert a diff array into a pretty HTML report.
#
# diffs - Array of [operation, text] pairs (:equal, :insert, :delete).
#
# Returns a String of HTML: insertions wrapped in <ins>, deletions in <del>
# and equalities in <span>. '&', '<' and '>' are escaped and each newline is
# rendered as '&para;<br>'.
def diff_prettyHtml(diffs)
  diffs.map do |op, data|
    # '&' must be escaped first so the entities produced below are not
    # escaped a second time. "\n" is double-quoted on purpose: a
    # single-quoted '\n' is a backslash followed by 'n', not a newline.
    text = data.gsub('&', '&amp;').gsub('<', '&lt;').gsub('>', '&gt;').gsub("\n", '&para;<br>')
    case op
    when :insert
      "<ins style=\"background:#e6ffe6;\">#{text}</ins>"
    when :delete
      "<del style=\"background:#ffe6e6;\">#{text}</del>"
    when :equal
      "<span>#{text}</span>"
    end
  end.join
end
942
+
943
# Rebuild the source text from a diff: the concatenation of every equality
# and deletion, with insertions left out.
def diff_text1(diffs)
  kept = diffs.reject { |op, _data| op == :insert }
  kept.map { |_op, data| data }.join
end
953
+
954
# Rebuild the destination text from a diff: the concatenation of every
# equality and insertion, with deletions left out.
def diff_text2(diffs)
  kept = diffs.reject { |op, _data| op == :delete }
  kept.map { |_op, data| data }.join
end
964
+
965
# Compute the Levenshtein distance of a diff: the number of inserted,
# deleted or substituted characters.
#
# Within each run of edits between two equalities, a deletion paired with an
# insertion counts as one substitution, so the run contributes the larger of
# its inserted and deleted character counts.
def diff_levenshtein(diffs)
  total = 0
  pending = { :insert => 0, :delete => 0 }

  diffs.each do |op, data|
    if op == :equal
      # An equality closes the current run of edits.
      total += pending.values.max
      pending = { :insert => 0, :delete => 0 }
    elsif pending.key?(op)
      pending[op] += data.length
    end
  end

  # Account for a trailing run that no equality closed.
  total + pending.values.max
end
988
+
989
# Crush the diff into an encoded string which describes the operations
# required to transform text1 into text2.
# E.g. =3\t-2\t+ing -> Keep 3 chars, delete 2 chars, insert 'ing'.
# Operations are tab-separated. Inserted text is escaped using %xx notation.
#
# diffs - Array of [operation, text] pairs (:equal, :insert, :delete).
#
# Returns the delta String.
def diff_toDelta(diffs)
  diffs.map do |op, data|
    case op
    when :insert
      # Percent-encode, byte by byte, every character outside the safe set.
      # This is done locally because URI.encode/URI.escape no longer exists
      # in Ruby 3.0+. The space is part of the safe set, so spaces stay
      # readable in the delta.
      escaped = data.gsub(/[^ 0-9A-Za-z_.;!~*'(),\/?:@&=+$\#-]/) do |char|
        char.bytes.map { |byte| '%%%02X' % byte }.join
      end
      '+' + escaped
    when :delete
      '-' + data.length.to_s
    when :equal
      '=' + data.length.to_s
    end
  end.join("\t")
end
1005
+
1006
# Given the original text1, and an encoded string which describes the
# operations required to transform text1 into text2, compute the full diff.
#
# text1 - the source String the delta was computed against.
# delta - tab-separated String of tokens: '=N' (keep N chars), '-N' (delete
#         N chars) and '+text' (insert percent-encoded text).
#
# Returns an Array of [operation, text] pairs.
# Raises ArgumentError on an unknown token, a malformed or negative count,
# or when the delta does not consume exactly text1.length characters.
def diff_fromDelta(text1, delta)
  # Deltas should be composed of a subset of ascii chars, Unicode not required.
  # The converted copy is discarded: the call is here only because it raises
  # when delta contains a character that has no ASCII representation.
  delta.encode('ascii')
  diffs = []
  pointer = 0 # Cursor in text1
  delta.split("\t").each do |token|
    # Each token begins with a one character parameter which specifies the
    # operation of this token (delete, insert, equality).
    param = token[1..-1]
    case token[0]
    when '+'
      # Decode %xx escapes on raw bytes, then reinterpret the result as UTF-8.
      # This is done locally because URI.decode/URI.unescape no longer exists
      # in Ruby 3.0+.
      decoded = param.dup.force_encoding(Encoding::BINARY).gsub(/%(\h{2})/) do
        Regexp.last_match(1).hex.chr
      end
      diffs.push([:insert, decoded.force_encoding(Encoding::UTF_8)])
    when '-', '='
      begin
        n = Integer(param)
      rescue ArgumentError
        raise ArgumentError.new(
          "Invalid number in diff_fromDelta: #{param.inspect}")
      end
      if n < 0
        raise ArgumentError.new(
          "Negative number in diff_fromDelta: #{param.inspect}")
      end
      text = text1[pointer...(pointer + n)]
      pointer += n
      if token[0] == '='
        diffs.push([:equal, text])
      else
        diffs.push([:delete, text])
      end
    else
      raise ArgumentError.new(
        "Invalid diff operation in diff_fromDelta: #{token.inspect}")
    end
  end

  if pointer != text1.length
    raise ArgumentError.new("Delta length (#{pointer}) does not equal " +
      "source text length #{text1.length}")
  end
  diffs
end
1047
+
1048
# Locate the best instance of 'pattern' in 'text' near 'loc'.
#
# Returns the Integer index of the match, or -1 when text is empty and
# differs from pattern. Anything that is not an exact hit at the clamped
# location is delegated to match_bitap.
# Raises ArgumentError when text or pattern is nil.
def match_main(text, pattern, loc)
  raise ArgumentError.new("Null input. (match_main)") if text.nil? || pattern.nil?

  # Keep loc inside 0..text.length.
  capped = [loc, text.length].min
  loc = [0, capped].max

  # Shortcut (potentially not guaranteed by the algorithm).
  return 0 if text == pattern
  # Nothing to match.
  return -1 if text.empty?
  # Perfect match at the perfect spot! (Includes case of null pattern)
  return loc if text[loc, pattern.length] == pattern

  # Do a fuzzy compare.
  match_bitap(text, pattern, loc)
end
1070
+
1071
# Locate the best instance of 'pattern' in 'text' near 'loc' using the
# Bitap algorithm.
#
# text    - String to search.
# pattern - String to look for; at most match_maxBits characters long.
# loc     - Integer location around which to search.
#
# Returns the Integer index of the best match, or -1 when no candidate
# scores within the threshold.
# Raises ArgumentError when the pattern is longer than match_maxBits.
def match_bitap(text, pattern, loc)
  if pattern.length > match_maxBits
    # raise, not throw: throw is the catch/throw control-flow primitive and
    # would surface as an uncaught-throw error instead of this exception.
    raise ArgumentError.new("Pattern too long")
  end

  # Initialise the alphabet.
  s = match_alphabet(pattern)

  # Compute and return the score for a match with e errors and x location.
  # Lower is better.
  match_bitapScore = lambda do |e, x|
    accuracy = e.to_f / pattern.length
    proximity = (loc - x).abs
    if match_distance == 0
      # Dodge divide by zero error.
      proximity == 0 ? accuracy : 1.0
    else
      accuracy + (proximity.to_f / match_distance)
    end
  end

  # Highest score beyond which we give up.
  score_threshold = match_threshold
  # Is there a nearby exact match? (speedup)
  best_loc = text.index(pattern, loc)
  if best_loc
    score_threshold = [match_bitapScore[0, best_loc], score_threshold].min
    # What about in the other direction? (speedup)
    best_loc = text.rindex(pattern, loc + pattern.length)
    if best_loc
      score_threshold = [match_bitapScore[0, best_loc], score_threshold].min
    end
  end

  # Initialise the bit arrays.
  match_mask = 1 << (pattern.length - 1)
  best_loc = -1

  bin_max = pattern.length + text.length
  # Bit array of the previous error level; unset during the first pass.
  last_rd = nil
  pattern.length.times do |d|
    # Scan for the best match; each iteration allows for one more error.
    # Run a binary search to determine how far from 'loc' we can stray at this
    # error level.
    bin_min = 0
    bin_mid = bin_max
    while bin_min < bin_mid
      if match_bitapScore[d, loc + bin_mid] <= score_threshold
        bin_min = bin_mid
      else
        bin_max = bin_mid
      end
      bin_mid = (bin_max - bin_min) / 2 + bin_min
    end

    # Use the result from this iteration as the maximum for the next.
    bin_max = bin_mid
    start = [1, loc - bin_mid + 1].max
    finish = [loc + bin_mid, text.length].min + pattern.length

    rd = Array.new(finish + 2, 0)
    rd[finish + 1] = (1 << d) - 1
    finish.downto(start) do |j|
      # Characters outside the pattern (or past the end of text) match nothing.
      char_match = s[text[j - 1]] || 0
      if d == 0 # First pass: exact match.
        rd[j] = ((rd[j + 1] << 1) | 1) & char_match
      else # Subsequent passes: fuzzy match.
        rd[j] = ((rd[j + 1] << 1) | 1) & char_match |
          (((last_rd[j + 1] | last_rd[j]) << 1) | 1) | last_rd[j + 1]
      end
      if (rd[j] & match_mask).nonzero?
        score = match_bitapScore[d, j - 1]
        # This match will almost certainly be better than any existing match.
        # But check anyway.
        if score <= score_threshold
          # Told you so.
          score_threshold = score
          best_loc = j - 1
          if best_loc > loc
            # When passing loc, don't exceed our current distance from loc.
            # (downto captured its lower bound when the loop began, so this
            # reassignment does not shorten the scan already in progress.)
            start = [1, 2 * loc - best_loc].max
          else
            # Already passed loc, downhill from here on in.
            break
          end
        end
      end
    end

    # No hope for a (better) match at greater error levels.
    if match_bitapScore[d + 1, loc] > score_threshold
      break
    end
    last_rd = rd
  end

  best_loc
end
1170
+
1171
# Initialise the alphabet for the Bitap algorithm.
#
# Returns a Hash mapping each character of pattern to an Integer bitmask
# with one bit set per occurrence; the first character of the pattern owns
# the highest bit and the last character owns bit 0.
def match_alphabet(pattern)
  top_bit = pattern.length - 1
  pattern.each_char.with_index.each_with_object({}) do |(char, pos), masks|
    masks[char] = (masks[char] || 0) | (1 << (top_bit - pos))
  end
end
1180
+
1181
# Parse a textual representation of patches and return a list of patch
# objects.
#
# textline - String holding zero or more patches. Each patch is a header
#            line of the form "@@ -start1,length1 +start2,length2 @@"
#            followed by body lines whose first character is ' ' (equality),
#            '-' (deletion) or '+' (insertion); the rest of the line is
#            percent-encoded text.
#
# Returns an Array of PatchObj (empty when textline is empty).
# Raises ArgumentError on a malformed header or an unknown line prefix.
def patch_fromText(textline)
  return [] if textline.empty?

  patches = []
  text = textline.split("\n")
  text_pointer = 0
  patch_header = /^@@ -(\d+),?(\d*) \+(\d+),?(\d*) @@$/
  while text_pointer < text.length
    m = text[text_pointer].match(patch_header)
    if m.nil?
      raise ArgumentError.new("Invalid patch string: #{text[text_pointer]}")
    end
    patch = PatchObj.new
    patches.push(patch)

    # Header starts are decremented by one except when the length is the
    # literal '0'. An omitted length means a length of 1.
    patch.start1 = m[1].to_i
    if m[2].empty?
      patch.start1 -= 1
      patch.length1 = 1
    elsif m[2] == '0'
      patch.length1 = 0
    else
      patch.start1 -= 1
      patch.length1 = m[2].to_i
    end

    patch.start2 = m[3].to_i
    if m[4].empty?
      patch.start2 -= 1
      patch.length2 = 1
    elsif m[4] == '0'
      patch.length2 = 0
    else
      patch.start2 -= 1
      patch.length2 = m[4].to_i
    end
    text_pointer += 1

    while text_pointer < text.length
      if text[text_pointer].empty?
        # Blank line? Whatever.
        text_pointer += 1
        next
      end

      sign = text[text_pointer][0]
      # Decode %xx escapes on raw bytes, then reinterpret the result as UTF-8.
      # This is done locally because URI.decode/URI.unescape no longer exists
      # in Ruby 3.0+.
      raw = text[text_pointer][1..-1].dup.force_encoding(Encoding::BINARY)
      line = raw.gsub(/%(\h{2})/) { Regexp.last_match(1).hex.chr }
      line.force_encoding(Encoding::UTF_8)

      case sign
      when '-'
        # Deletion.
        patch.diffs.push([:delete, line])
      when '+'
        # Insertion.
        patch.diffs.push([:insert, line])
      when ' '
        # Minor equality
        patch.diffs.push([:equal, line])
      when '@'
        # Start of next patch.
        break
      else
        # Unknown line prefix.
        raise ArgumentError.new("Invalid patch mode \"#{sign}\" in: #{line}")
      end
      text_pointer += 1
    end
  end

  patches
end
1253
+
1254
# Take a list of patches and return a textual representation: the string
# form of every patch, concatenated in order with no separator.
def patch_toText(patches)
  patches.map(&:to_s).join
end
1258
+
1259
# Increase the context until it is unique,
# but don't let the pattern expand beyond match_maxBits
#
# patch - patch object whose diffs, start1/start2 and length1/length2 are
#         updated in place.
# text  - String the patch applies to. Nothing is done when it is empty.
#
# Surrounding text is added to the patch as leading and trailing equalities
# and the start points and lengths are adjusted to cover it.
def patch_addContext(patch, text)
  return if text.empty?
  pattern = text[patch.start2, patch.length1]
  padding = 0

  # Look for the first and last matches of pattern in text. If two different
  # matches are found, increase the pattern length.
  while text.index(pattern) != text.rindex(pattern) &&
        pattern.length < match_maxBits - 2 * patch_margin
    padding += patch_margin
    pattern = text[[0, patch.start2 - padding].max...(patch.start2 + patch.length1 + padding)]
  end

  # Add one chunk for good luck.
  padding += patch_margin

  # String#[] returns nil when the slice starts beyond the end of text;
  # treat that as "no context available" so the arithmetic below never
  # calls .length on nil.

  # Add the prefix.
  prefix = text[[0, patch.start2 - padding].max...patch.start2] || ''
  patch.diffs.unshift([:equal, prefix]) unless prefix.empty?

  # Add the suffix.
  suffix = text[patch.start2 + patch.length1, padding] || ''
  patch.diffs.push([:equal, suffix]) unless suffix.empty?

  # Roll back the start points.
  patch.start1 -= prefix.length
  patch.start2 -= prefix.length

  # Extend the lengths.
  patch.length1 += prefix.length + suffix.length
  patch.length2 += prefix.length + suffix.length
end
1293
+
1294
# Compute a list of patches to turn text1 into text2.
# Use diffs if provided, otherwise compute it ourselves.
# There are four ways to call this function, depending on what data is
# available to the caller:
# Method 1:
# a = text1, b = text2
# Method 2:
# a = diffs
# Method 3 (optimal):
# a = text1, b = diffs
# Method 4 (deprecated, use method 3):
# a = text1, b = text2, c = diffs
#
# Returns an Array of PatchObj; empty when the diff list is empty.
# Raises ArgumentError when the arguments match none of the four call forms.
def patch_make(*args)
  text1 = nil
  diffs = nil
  if args.length == 2 && args[0].is_a?(String) && args[1].is_a?(String)
    # Method 1: compute diffs from text1 and text2.
    text1 = args[0]
    text2 = args[1]
    diffs = diff_main(text1, text2, true)
    # The cleanup passes are only run when the diff has more than two entries.
    if diffs.length > 2
      diff_cleanupSemantic(diffs)
      diff_cleanupEfficiency(diffs)
    end
  elsif args.length == 1 && args[0].is_a?(Array)
    # Method 2: compute text1 from diffs.
    diffs = args[0]
    text1 = diff_text1(diffs)
  elsif args.length == 2 && args[0].is_a?(String) && args[1].is_a?(Array)
    # Method 3: text1 and diffs are both supplied.
    text1 = args[0]
    diffs = args[1]
  elsif args.length == 3 && args[0].is_a?(String) && args[1].is_a?(String) &&
        args[2].is_a?(Array)
    # Method 4: text1, text2, diffs
    # text2 is not used.
    text1 = args[0]
    text2 = args[1]
    diffs = args[2]
  else
    raise ArgumentError.new('Unknown call format to patch_make.')
  end

  return [] if diffs.empty? # Get rid of the null case.

  patches = []
  patch = PatchObj.new
  char_count1 = 0 # Number of characters into the text1 string.
  char_count2 = 0 # Number of characters into the text2 string.
  prepatch_text = text1 # Recreate the patches to determine context info.
  postpatch_text = text1

  diffs.each_with_index do |diff, x|
    diff_type, diff_text = diffs[x]
    if patch.diffs.empty? && diff_type != :equal
      # A new patch starts here.
      patch.start1 = char_count1
      patch.start2 = char_count2
    end

    case diff_type
    when :insert
      # The pair object itself is pushed, so the patch shares it with diffs.
      patch.diffs.push(diff)
      patch.length2 += diff_text.length
      # Mirror the insertion in the running post-patch text.
      postpatch_text = postpatch_text[0...char_count2] + diff_text +
        postpatch_text[char_count2..-1]
    when :delete
      patch.length1 += diff_text.length
      patch.diffs.push(diff)
      # Mirror the deletion in the running post-patch text.
      postpatch_text = postpatch_text[0...char_count2] +
        postpatch_text[(char_count2 + diff_text.length)..-1]
    when :equal
      if diff_text.length <= 2 * patch_margin &&
         !patch.diffs.empty? && diffs.length != x + 1
        # Small equality inside a patch.
        patch.diffs.push(diff)
        patch.length1 += diff_text.length
        patch.length2 += diff_text.length
      elsif diff_text.length >= 2 * patch_margin
        # NOTE(review): because this is an elsif, an equality of exactly
        # 2 * patch_margin characters inside an open patch is absorbed by the
        # branch above and does not close the patch -- confirm this is the
        # intended behaviour.
        # Time for a new patch.
        unless patch.diffs.empty?
          patch_addContext(patch, prepatch_text)
          patches.push(patch)
          patch = PatchObj.new
          # Unlike Unidiff, our patch lists have a rolling context.
          # http://code.google.com/p/google-diff-match-patch/wiki/Unidiff
          # Update prepatch text & pos to reflect the application of the
          # just completed patch.
          prepatch_text = postpatch_text
          char_count1 = char_count2
        end
      end
    end

    # Update the current character count.
    if diff_type != :insert
      char_count1 += diff_text.length
    end
    if diff_type != :delete
      char_count2 += diff_text.length
    end
  end

  # Pick up the leftover patch if not empty.
  unless patch.diffs.empty?
    patch_addContext(patch, prepatch_text)
    patches.push(patch)
  end

  patches
end
1404
+
1405
# Merge a set of patches onto the text. Return a patched text, as well
# as a list of true/false values indicating which patches were applied.
#
# patches - Array of patch objects. They are deep-copied before use, so the
#           caller's objects are not modified.
# text    - String to patch.
#
# Returns a two-element Array: [patched_text, results], where results holds
# one boolean per patch that was processed after padding and splitting.
def patch_apply(patches, text)
  return [text, []] if patches.empty?

  # Deep copy the patches so that no changes are made to originals.
  patches = Marshal.load(Marshal.dump(patches))

  null_padding = patch_addPadding(patches)
  text = null_padding + text + null_padding
  patch_splitMax(patches)

  # delta keeps track of the offset between the expected and actual location
  # of the previous patch. If there are patches expected at positions 10 and
  # 20, but the first patch was found at 12, delta is 2 and the second patch
  # has an effective expected position of 22.
  delta = 0
  results = []
  patches.each_with_index do |patch, x|
    expected_loc = patch.start2 + delta
    text1 = diff_text1(patch.diffs)
    end_loc = -1
    if text1.length > match_maxBits
      # patch_splitMax will only provide an oversized pattern in the case of
      # a monster delete. Match its head and its tail separately.
      start_loc = match_main(text, text1[0, match_maxBits], expected_loc)
      if start_loc != -1
        end_loc = match_main(text, text1[(text1.length - match_maxBits)..-1],
                             expected_loc + text1.length - match_maxBits)
        if end_loc == -1 || start_loc >= end_loc
          # Can't find valid trailing context. Drop this patch.
          start_loc = -1
        end
      end
    else
      start_loc = match_main(text, text1, expected_loc)
    end

    if start_loc == -1
      # No match found. :(
      results[x] = false
      # Subtract the delta for this failed patch from subsequent patches.
      delta -= patch.length2 - patch.length1
    else
      # Found a match. :)
      results[x] = true
      delta = start_loc - expected_loc
      # end_loc is an absolute index into text, so the matched region must be
      # taken with a range; text[start, n] would treat the end as a length.
      text2 = if end_loc == -1
                text[start_loc, text1.length]
              else
                text[start_loc...(end_loc + match_maxBits)]
              end

      if text1 == text2
        # Perfect match, just shove the replacement text in.
        text = text[0, start_loc] + diff_text2(patch.diffs) + text[(start_loc + text1.length)..-1]
      else
        # Imperfect match.
        # Run a diff to get a framework of equivalent indices.
        diffs = diff_main(text1, text2, false)
        if text1.length > match_maxBits &&
           diff_levenshtein(diffs).to_f / text1.length > patch_deleteThreshold
          # The end points match, but the content is unacceptably bad.
          results[x] = false
        else
          diff_cleanupSemanticLossless(diffs)
          index1 = 0
          patch.diffs.each do |op, data|
            # Map the position in the expected text onto the text actually found.
            if op != :equal
              index2 = diff_xIndex(diffs, index1)
            end
            if op == :insert # Insertion
              text = text[0, start_loc + index2] + data + text[(start_loc + index2)..-1]
            elsif op == :delete # Deletion
              text = text[0, start_loc + index2] +
                text[(start_loc + diff_xIndex(diffs, index1 + data.length))..-1]
            end
            if op != :delete
              index1 += data.length
            end
          end
        end
      end
    end
  end

  # Strip the padding off.
  text = text[null_padding.length...-null_padding.length]
  [text, results]
end
1490
+
1491
# Add some padding on text start and end so that edges can match
# something. Intended to be called only from within patch_apply.
#
# Every patch is shifted forward by patch_margin, then the first patch gains
# (or has its leading equality grown to) a full padding prefix and the last
# patch gains (or has its trailing equality grown to) a full padding suffix.
#
# Returns the padding String: the characters with code points
# 1..patch_margin.
def patch_addPadding(patches)
  pad_len = patch_margin
  pad = (1..pad_len).map { |code| code.chr(Encoding::UTF_8) }.join

  # Bump all the patches forward.
  patches.each do |shifted|
    shifted.start1 += pad_len
    shifted.start2 += pad_len
  end

  # Add some padding on start of first diff.
  head = patches.first
  lead = head.diffs.first
  if lead.nil? || lead[0] != :equal
    # No leading equality: prepend the whole padding as one.
    head.diffs.unshift([:equal, pad])
    head.start1 -= pad_len # Should be 0.
    head.start2 -= pad_len # Should be 0.
    head.length1 += pad_len
    head.length2 += pad_len
  elsif lead[1].length < pad_len
    # Leading equality is too short: top it up with the tail of the padding.
    shortfall = pad_len - lead[1].length
    lead[1] = pad[lead[1].length..-1] + lead[1]
    head.start1 -= shortfall
    head.start2 -= shortfall
    head.length1 += shortfall
    head.length2 += shortfall
  end

  # Add some padding on end of last diff.
  tail = patches.last
  trail = tail.diffs.last
  if trail.nil? || trail[0] != :equal
    # No trailing equality: append the whole padding as one.
    tail.diffs.push([:equal, pad])
    tail.length1 += pad_len
    tail.length2 += pad_len
  elsif trail[1].length < pad_len
    # Trailing equality is too short: top it up with the head of the padding.
    shortfall = pad_len - trail[1].length
    trail[1] += pad[0, shortfall]
    tail.length1 += shortfall
    tail.length2 += shortfall
  end

  pad
end
1541
+
1542
# Look through the patches and break up any which are longer than the
# maximum limit of the match algorithm.
#
# patches - Array of patch objects, modified in place: each patch whose
#           length1 exceeds match_maxBits is removed and replaced, at the
#           same position, by the smaller patches built from it.
#
# The return value is not meaningful (the method ends with a while loop).
def patch_splitMax(patches)
  patch_size = match_maxBits

  x = 0
  while x < patches.length
    if patches[x].length1 > patch_size
      big_patch = patches[x]
      # Remove the big old patch
      patches[x, 1] = []
      # Step back so that the first replacement lands in the freed slot.
      x -= 1
      start1 = big_patch.start1
      start2 = big_patch.start2
      pre_context = ''
      while !big_patch.diffs.empty?
        # Create one of several smaller patches.
        patch = PatchObj.new
        # Stays true until the new patch receives an insertion or a deletion.
        empty = true
        patch.start1 = start1 - pre_context.length
        patch.start2 = start2 - pre_context.length
        unless pre_context.empty?
          patch.length1 = patch.length2 = pre_context.length
          patch.diffs.push([:equal, pre_context])
        end

        # Move diffs from big_patch into patch until the size budget is used.
        while !big_patch.diffs.empty? && patch.length1 < patch_size - patch_margin
          diff = big_patch.diffs.first
          if diff[0] == :insert
            # Insertions are harmless.
            patch.length2 += diff[1].length
            start2 += diff[1].length
            patch.diffs.push(big_patch.diffs.shift)
            empty = false
          elsif diff[0] == :delete && patch.diffs.length == 1 &&
                patch.diffs.first[0] == :equal && diff[1].length > 2 * patch_size
            # This is a large deletion. Let it pass in one chunk.
            patch.length1 += diff[1].length
            start1 += diff[1].length
            empty = false
            patch.diffs.push(big_patch.diffs.shift)
          else
            # Deletion or equality. Only take as much as we can stomach.
            diff_text = diff[1][0, patch_size - patch.length1 - patch_margin]
            patch.length1 += diff_text.length
            start1 += diff_text.length
            if diff[0] == :equal
              patch.length2 += diff_text.length
              start2 += diff_text.length
            else
              empty = false
            end
            patch.diffs.push([diff[0], diff_text])
            # Drop the source diff when it was consumed whole; otherwise keep
            # only the part that was not taken.
            if diff_text == big_patch.diffs.first[1]
              big_patch.diffs.shift
            else
              big_patch.diffs.first[1] = big_patch.diffs.first[1][diff_text.length..-1]
            end
          end
        end

        # Compute the head context for the next patch.
        # NOTE(review): str[-n..-1] is nil when str is shorter than n, so a
        # patch whose text2 is shorter than patch_margin yields no head
        # context at all -- confirm that is intended.
        pre_context = diff_text2(patch.diffs)[-patch_margin..-1] || ''

        # Append the end context for this patch.
        post_context = diff_text1(big_patch.diffs)[0...patch_margin] || ''
        unless post_context.empty?
          patch.length1 += post_context.length
          patch.length2 += post_context.length
          if !patch.diffs.empty? && patch.diffs.last[0] == :equal
            patch.diffs.last[1] += post_context
          else
            patch.diffs.push([:equal, post_context])
          end
        end
        # Patches made only of equalities are discarded.
        if !empty
          x += 1
          patches[x, 0] = [patch]
        end
      end
    end
    x += 1
  end
end
1626
+ end