dimapa 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 5e87b2a963101ab22b8fb368d0956670c8f22a09691da2931ab28574f4d700fd
4
+ data.tar.gz: c94b5fa761a33875cd96db4afadc9aabfc87f0e3a3da249f6a59ce9659685f5d
5
+ SHA512:
6
+ metadata.gz: de8c3501e51d0fecfb0697e6120b630b6b3c46ee1e0f52ee9343c07db4e3e2a1667bb690407557726c0563ff712000d0ed2319aeda0eb5c3cf990bb98eb7a1bd
7
+ data.tar.gz: 6f638d15129dc34dda5d766796dda129340b981e06886c277ec2583044dbbdf34ddd70df51ce600d26ac4797b2cc4d2a9eeaee31377fa9e8b02360f1c650ea2f
data/Gemfile ADDED
@@ -0,0 +1,13 @@
1
# frozen_string_literal: true

# Dependency manifest. Runtime dependencies are taken from the gemspec; the
# development/test group only adds tooling.
source "https://rubygems.org"

# Resolve `github: "owner/repo"` shorthands over HTTPS.
git_source(:github) do |repo_name|
  "https://github.com/#{repo_name}"
end

gemspec

group :development, :test do
  %w[pry pry-doc standard].each { |tool| gem tool }
end
data/LICENSE ADDED
@@ -0,0 +1,23 @@
1
+ Copyright (c) 2011, Jorge Kalmbach <kalmbach.at.gmail.com>
2
+
3
+ Permission is hereby granted, free of charge, to any
4
+ person obtaining a copy of this software and associated
5
+ documentation files (the "Software"), to deal in the
6
+ Software without restriction, including without limitation
7
+ the rights to use, copy, modify, merge, publish,
8
+ distribute, sublicense, and/or sell copies of the
9
+ Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice
13
+ shall be included in all copies or substantial portions of
14
+ the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
17
+ KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
18
+ WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
19
+ PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS
20
+ OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
21
+ OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
22
+ OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
23
+ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,78 @@
1
+ # DiMaPa (Diff Match and Patch)
2
+ A modern Ruby implementation of Google's [Diff Match and Patch][google]
3
+ libraries.
4
+
5
+ > The Diff Match and Patch libraries offer robust algorithms to perform the
6
+ > operations required for synchronizing plain text.
7
+
8
+ ## Usage
9
+ ```ruby
10
+ require 'dimapa'
11
+
12
+ dmp = DiMaPa.new # or DiffMatchPatch
13
+
14
+ diff = dmp.diff_main("This is a sentence.", "This is also a sentence.")
15
+ #=> [[:equal, "This is a"], [:insert, "lso a"], [:equal, " sentence."]]
16
+
17
+ dmp.diff_cleanup_semantic(diff)
18
+ #=> nil
19
+
20
+ # diff is modified in place
21
+ diff
22
+ #=> [[:equal, "This is "], [:insert, "also "], [:equal, "a sentence."]]
23
+
24
+ patch = dmp.patch_make(diff)
25
+ #=> [#<PatchObj:0x00005608e6ac9500 @diffs=
26
+ # [[:equal, "This is "], [:insert, "also "], [:equal, "a senten"]],
27
+ # @length1=16,
28
+ # @length2=21,
29
+ # @start1=0,
30
+ # @start2=0>]
31
+
32
+ dmp.patch_to_text(patch)
33
+ #=> "@@ -1,16 +1,21 @@\n This is \n+also \n a senten\n"
34
+
35
+ dmp.patch_apply(patch, "This is a sentence.")
36
+ #=> ["This is also a sentence.", [true]]
37
+ ```
38
+
39
+ ## Installation
40
+ ```sh
41
+ # RubyGem
42
+ gem install dimapa
43
+
44
+ # From source
45
+ bundle install
46
+ bundle exec rake install
47
+ ```
48
+
49
+ ## Benchmarks
50
+
51
+ This project includes [speedtests](scripts/) mirroring those in the official
52
+ project. Performance is on par with those reported for [Lua and Python][speedtest]
53
+ albeit run on a faster machine.
54
+
55
+ ```
56
+ $ rake speedtest
57
+
58
+ user system total real
59
+ diff(t2,t1) 13.658214 0.003937 13.662151 ( 13.662453)
60
+ diff(t1,t2) 14.074079 0.000001 14.074080 ( 14.074350)
61
+ ```
62
+
63
+ ## Tests and Linting
64
+
65
+ ```sh
66
+ bundle exec rake
67
+ ```
68
+
69
+ ### Fork of [kalmbach/diff_match_patch][kalmbach] by way of [DavidMikeSimon/diff_match_patch][davidmikesimon]
70
+ Copyright (c) 2011, Jorge Kalmbach <kalmbach.at.gmail.com>
71
+
72
+ Work was inspired by the [reima/diff_match_patch-ruby][reima] module.
73
+
74
+ [speedtest]: https://docs.google.com/spreadsheets/d/1zpZccuBpjMZTvL1nGDMKJc7rWL_m_drF4XKOJvB27Kc/edit#gid=0
75
+ [kalmbach]: https://github.com/kalmbach/diff_match_patch
76
+ [davidmikesimon]: https://github.com/DavidMikeSimon/diff_match_patch
77
+ [reima]: https://github.com/reima/diff_match_patch-ruby
78
+ [google]: https://github.com/google/diff-match-patch
@@ -0,0 +1,22 @@
1
+ require "rake/testtask"
2
+ require "standard/rake"
3
+ require "bundler/gem_tasks"
4
+
5
# Test task with the test directory added to the load path.
Rake::TestTask.new { |test_task| test_task.libs.push("test") }

desc "Run benchmarking speedtest"
task(:speedtest) { ruby "scripts/speedtest.rb" }

desc "Start REPL"
task :console do
  # Required inside the task so the other tasks do not load pry or the library.
  %w[pry dimapa].each { |feature| require feature }
  Pry.start
end

desc "Run tests and linter"
task default: %i[standard test]
@@ -0,0 +1,132 @@
1
# Entry points of the diff algorithm: validates the inputs, strips common
# affixes, picks the cheapest applicable strategy and reassembles the result.
#
# The including class must provide diff_common_prefix, diff_common_suffix,
# diff_cleanup_merge, diff_half_match, diff_line_mode and diff_bisect; they
# are called from here but are not defined in this module.
module DiffMethods
  # Largest Fixnum-sized value, derived from the native Integer size; used as
  # the "never expires" offset when no timeout is configured.
  FIXNUM_MAX = 2**(0.size * 8 - 2) - 1

  # Number of seconds to map a diff before giving up (0 or less for infinity).
  attr_accessor :diff_timeout

  def initialize
    @diff_timeout = 1
  end

  # Find the differences between two texts. Simplifies the problem by
  # stripping any common prefix or suffix off the texts before diffing.
  #
  # text1, text2 - the old and the new String.
  # checklines   - when true, a faster but possibly non-minimal line-level
  #                pass may be run first.
  # deadline     - Time by which the diff should be complete; computed from
  #                diff_timeout when omitted.
  #
  # Returns an Array of [operation, text] pairs where operation is :equal,
  # :insert or :delete.
  # Raises ArgumentError when either text is nil.
  def diff_main(text1, text2, checklines = true, deadline = nil)
    deadline ||= diff_new_deadline

    # Both inputs are required; a single nil used to slip through this check
    # and fail later with a NoMethodError.
    raise ArgumentError, "Null inputs. (diff_main)" if text1.nil? || text2.nil?

    # Check for equality (speedup).
    return (text1.empty? ? [] : [[:equal, text1]]) if text1 == text2

    diff_main_compute_diff(text1, text2, checklines, deadline)
  end

  # Find the differences between two texts. Assumes that the texts do not
  # have any common prefix or suffix. Strategies are tried from cheapest to
  # most expensive and the first one that produces a result wins.
  def diff_compute(text1, text2, checklines, deadline)
    trivial = diff_compute_common_cases(text1, text2)
    return trivial if trivial

    halved = diff_compute_half_match(text1, text2, checklines, deadline)
    return halved if halved

    if checklines && text1.length > 100 && text2.length > 100
      diff_line_mode(text1, text2, deadline)
    else
      diff_bisect(text1, text2, deadline)
    end
  end

  private

  # Diff the middle of the texts after trimming the shared prefix and suffix,
  # then put the trimmed parts back as equalities.
  def diff_main_compute_diff(text1, text2, checklines, deadline)
    # Trim off common prefix and suffix (speedup).
    common_prefix, text1, text2 = diff_trim_common_prefix(text1, text2)
    common_suffix, text1, text2 = diff_trim_common_suffix(text1, text2)

    # Compute the diff on the middle block.
    diffs = diff_compute(text1, text2, checklines, deadline)

    # Restore the prefix and suffix.
    diffs.unshift([:equal, common_prefix]) unless common_prefix.nil?
    diffs.push([:equal, common_suffix]) unless common_suffix.nil?
    diff_cleanup_merge(diffs)

    diffs
  end

  # Calculate a new deadline from the diff_timeout setting. Any non-positive
  # timeout means "no limit", matching the check made in diff_half_match.
  def diff_new_deadline
    Time.now + (diff_timeout <= 0 ? FIXNUM_MAX : diff_timeout)
  end

  # Trim off the common prefix.
  # Returns [prefix_or_nil, remaining_text1, remaining_text2].
  def diff_trim_common_prefix(text1, text2)
    common_length = diff_common_prefix(text1, text2)
    return [nil, text1, text2] if common_length.zero?

    [text1[0...common_length], text1[common_length..-1], text2[common_length..-1]]
  end

  # Trim off the common suffix.
  # Returns [suffix_or_nil, remaining_text1, remaining_text2].
  def diff_trim_common_suffix(text1, text2)
    common_length = diff_common_suffix(text1, text2)
    return [nil, text1, text2] if common_length.zero?

    [text1[-common_length..-1], text1[0...-common_length], text2[0...-common_length]]
  end

  # Split the texts around a shared middle section, when diff_half_match finds
  # one, and diff the two halves separately. Returns nil when there is no
  # half-match.
  def diff_compute_half_match(text1, text2, checklines, deadline)
    half_match = diff_half_match(text1, text2)
    return unless half_match

    text1_a, text1_b, text2_a, text2_b, mid_common = half_match

    # Send both pairs off for separate processing.
    diffs_a = diff_main(text1_a, text2_a, checklines, deadline)
    diffs_b = diff_main(text1_b, text2_b, checklines, deadline)

    # Merge the results.
    diffs_a + [[:equal, mid_common]] + diffs_b
  end

  # Handle the cases that need no real diffing: pure insertion, pure deletion,
  # one text contained in the other, and a single-character text.
  # Returns nil when none of them applies.
  def diff_compute_common_cases(text1, text2)
    # Just add some text (speedup).
    return [[:insert, text2]] if text1.empty?

    # Just delete some text (speedup).
    return [[:delete, text1]] if text2.empty?

    # Chosen explicitly instead of sorting by length, whose order for
    # equal-length texts is not guaranteed.
    if text1.length > text2.length
      long, short, op = text1, text2, :delete
    else
      long, short, op = text2, text1, :insert
    end

    # Shorter text is inside the longer text (speedup).
    if (i = long.index(short))
      [[op, long[0...i]], [:equal, short], [op, long[(i + short.length)..-1]]]

    # Single character string.
    elsif short.length == 1
      # After the previous speedup, the character can't be an equality.
      [[:delete, text1], [:insert, text2]]
    end
  end
end
@@ -0,0 +1,1522 @@
1
+ require "diff_methods"
2
+ require "patch_obj"
3
+
4
+ # Class containing the diff, match and patch methods.
5
+ # Also contains the behaviour settings.
6
+ class DiMaPa
7
+ include DiffMethods
8
+
9
# Tunable behaviour settings (defaults are assigned in #initialize).
attr_accessor :diff_edit_cost, :match_threshold, :match_distance,
  :patch_delete_threshold, :patch_margin
# Read-only: number of bits used for matching.
attr_reader :match_max_bits
15
+
16
# Set every behaviour setting to its default, then hand over to the parent
# initializer. Override the defaults afterwards through the accessors.
def initialize
  # Diff: cost of an empty edit operation, measured in edit characters.
  @diff_edit_cost = 4

  # Match: score above which no match is declared
  # (0.0 = perfection, 1.0 = very loose).
  @match_threshold = 0.5

  # Match: how far to search (0 = exact location, 1000+ = broad match).
  # A match this many characters from the expected location adds 1.0 to the
  # score (0.0 being a perfect match).
  @match_distance = 1000

  # Patch: when deleting a large block of text (over ~64 characters), how
  # closely the contents must match the expected contents
  # (0.0 = perfection, 1.0 = very loose). The end points of a delete are
  # governed by match_threshold instead.
  @patch_delete_threshold = 0.5

  # Patch: chunk size for context length.
  @patch_margin = 4

  # The number of bits in an int. Per the note carried over from the Python
  # implementation, 0 disables patch splitting, while 32 avoids long patches
  # in certain pathological cases; multiple short patches (using native
  # ints) are much faster than long ones.
  @match_max_bits = 32

  super
end
43
+
44
# Line-mode speedup: diff the texts line by line first, then re-diff each
# replaced region character by character for greater accuracy.
# The result may be a non-minimal diff.
def diff_line_mode(text1, text2, deadline)
  # Encode every distinct line as one character and diff the encodings.
  chars1, chars2, line_array = diff_lines_to_chars(text1, text2)
  diffs = diff_main(chars1, chars2, false, deadline)

  # Expand the encoded diff back into real lines.
  diff_chars_to_lines(diffs, line_array)
  # Eliminate freak matches (e.g. blank lines)
  diff_cleanup_semantic(diffs)

  # A trailing dummy equality makes the loop flush the last run of edits.
  diffs.push([:equal, ""])
  pointer = 0
  deletes = 0
  inserts = 0
  deleted_text = ""
  inserted_text = ""

  while pointer < diffs.length
    op, text = diffs[pointer]
    case op
    when :insert
      inserts += 1
      inserted_text += text
    when :delete
      deletes += 1
      deleted_text += text
    when :equal
      # An equality closes the current run; a run holding both deletions and
      # insertions is replaced by its character-level diff.
      if deletes >= 1 && inserts >= 1
        run_length = deletes + inserts
        refined = diff_main(deleted_text, inserted_text, false, deadline)
        pointer -= run_length
        diffs[pointer, run_length] = refined
        pointer += refined.length
      end
      inserts = 0
      deletes = 0
      deleted_text = ""
      inserted_text = ""
    end
    pointer += 1
  end

  diffs.pop # Remove the dummy entry at the end.
  diffs
end
97
+
98
# Locate the 'middle snake' of a diff, split the problem in two at that point
# and return the recursively constructed diff.
# See Myers 1986 paper: An O(ND) Difference Algorithm and Its Variations.
#
# Returns a plain delete-plus-insert pair when the deadline passes or when
# the search finishes without the two paths overlapping.
def diff_bisect(text1, text2, deadline)
  # Cache the text lengths to prevent multiple calls.
  len1 = text1.length
  len2 = text2.length
  max_d = (len1 + len2 + 1) / 2
  origin = max_d
  span = 2 * max_d
  forward = Array.new(span, -1)
  backward = Array.new(span, -1)
  forward[origin + 1] = 0
  backward[origin + 1] = 0
  delta = len1 - len2

  # If the total number of characters is odd, then the front path will
  # collide with the reverse path.
  front = delta.odd?

  # Offsets for the start and end of each k loop; they keep the walk from
  # mapping space beyond the grid.
  k1start = 0
  k1end = 0
  k2start = 0
  k2end = 0

  max_d.times do |d|
    # Bail out if deadline is reached.
    break if deadline && Time.now >= deadline

    # Walk the front path one step.
    (k1start - d).step(d - k1end, 2) do |k1|
      slot1 = origin + k1
      x1 = if k1 == -d || (k1 != d && forward[slot1 - 1] < forward[slot1 + 1])
        forward[slot1 + 1]
      else
        forward[slot1 - 1] + 1
      end
      y1 = x1 - k1

      # Follow the diagonal while the characters agree.
      while x1 < len1 && y1 < len2 && text1[x1] == text2[y1]
        x1 += 1
        y1 += 1
      end
      forward[slot1] = x1

      if x1 > len1
        # Ran off the right of the graph.
        k1end += 2
      elsif y1 > len2
        # Ran off the bottom of the graph.
        k1start += 2
      elsif front
        slot2 = origin + delta - k1
        if slot2.between?(0, span - 1) && backward[slot2] != -1
          # Mirror x2 onto top-left coordinate system.
          mirrored_x2 = len1 - backward[slot2]
          # Overlap detected.
          return diff_bisect_split(text1, text2, x1, y1, deadline) if x1 >= mirrored_x2
        end
      end
    end

    # Walk the reverse path one step.
    (k2start - d).step(d - k2end, 2) do |k2|
      slot2 = origin + k2
      x2 = if k2 == -d || (k2 != d && backward[slot2 - 1] < backward[slot2 + 1])
        backward[slot2 + 1]
      else
        backward[slot2 - 1] + 1
      end
      y2 = x2 - k2

      # Follow the diagonal from the end of both texts.
      while x2 < len1 && y2 < len2 && text1[-x2 - 1] == text2[-y2 - 1]
        x2 += 1
        y2 += 1
      end
      backward[slot2] = x2

      if x2 > len1
        # Ran off the left of the graph.
        k2end += 2
      elsif y2 > len2
        # Ran off the top of the graph.
        k2start += 2
      elsif !front
        slot1 = origin + delta - k2
        if slot1.between?(0, span - 1) && forward[slot1] != -1
          x1 = forward[slot1]
          y1 = origin + x1 - slot1
          # Mirror x2 onto top-left coordinate system.
          mirrored_x2 = len1 - x2
          # Overlap detected.
          return diff_bisect_split(text1, text2, x1, y1, deadline) if x1 >= mirrored_x2
        end
      end
    end
  end

  # Diff took too long and hit the deadline or
  # number of diffs equals number of characters, no commonality at all.
  [[:delete, text1], [:insert, text2]]
end
204
+
205
# Cut both texts at the 'middle snake' position (x in text1, y in text2),
# diff the two halves one after the other and concatenate the results.
def diff_bisect_split(text1, text2, x, y, deadline)
  head_diffs = diff_main(text1[0...x], text2[0...y], false, deadline)
  tail_diffs = diff_main(text1[x..-1], text2[y..-1], false, deadline)

  head_diffs + tail_diffs
end
219
+
220
# Split two texts into an array of lines and reduce each text to a string in
# which every character stands for one line (the character's code point is
# the line's index in the returned line array).
#
# Returns [encoded_text1, encoded_text2, line_array]. line_array[0] is always
# "" so that no line is encoded as the NUL character.
#
# The code points 0xD800..0xDFFF are surrogates and cannot be encoded in
# UTF-8, so when that many distinct lines are seen the line array is padded
# with unused placeholder entries and numbering continues at 0xE000.
def diff_lines_to_chars(text1, text2)
  line_array = [""] # e.g. line_array[4] == "Hello\n"
  line_hash = {} # e.g. line_hash["Hello\n"] == 4

  encoded = [text1, text2].map do |text|
    # Appending in place keeps the encoding linear in the number of lines.
    text.each_line.each_with_object("".dup) do |line, chars|
      index = line_hash[line]
      if index.nil?
        # Jump over the surrogate range; Integer#chr raises RangeError there.
        line_array.concat(Array.new(0x800, "")) if line_array.length == 0xD800
        index = line_array.length
        line_hash[line] = index
        line_array.push(line)
      end
      chars << index.chr(Encoding::UTF_8)
    end
  end

  encoded.push(line_array)
end
242
+
243
# Expand a line-encoded diff in place: each character of every diff text is
# replaced by the line stored at that code point in line_array.
def diff_chars_to_lines(diffs, line_array)
  diffs.each do |diff|
    encoded = diff[1]
    diff[1] = encoded.each_char.map { |char| line_array[char.ord] }.join
  end
end
249
+
250
# Number of characters shared at the start of both strings.
def diff_common_prefix(text1, text2)
  # Quick check for common null cases.
  return 0 if text1.empty? || text2.empty? || text1[0] != text2[0]

  # Binary search over the prefix length; only the part that has not been
  # confirmed yet is compared on each round.
  # Performance analysis: http://neil.fraser.name/news/2007/10/09/
  low = 0
  high = [text1.length, text2.length].min
  mid = high
  confirmed = 0

  until low >= mid
    window = confirmed...mid
    if text1[window] == text2[window]
      low = mid
      confirmed = low
    else
      high = mid
    end
    mid = low + (high - low) / 2
  end

  mid
end
274
+
275
# Number of characters shared at the end of both strings.
def diff_common_suffix(text1, text2)
  # Quick check for common null cases.
  return 0 if text1.empty? || text2.empty? || text1[-1] != text2[-1]

  # Binary search over the suffix length; only the part that has not been
  # confirmed yet is compared on each round.
  # Performance analysis: http://neil.fraser.name/news/2007/10/09/
  low = 0
  high = [text1.length, text2.length].min
  mid = high
  confirmed = 0

  until low >= mid
    window = -mid..(-confirmed - 1)
    if text1[window] == text2[window]
      low = mid
      confirmed = low
    else
      high = mid
    end
    mid = low + (high - low) / 2
  end

  mid
end
299
+
300
# Determine if the suffix of one string is the prefix of another.
#
# Returns the number of characters at the end of text1 that also start text2
# (0 when there is no such overlap).
def diff_common_overlap(text1, text2)
  # Eliminate the null case.
  return 0 if text1.empty? || text2.empty?

  # Only the last/first text_length characters can take part in an overlap,
  # so truncate the longer string.
  text_length = [text1.length, text2.length].min
  text1 = text1[-text_length..-1]
  text2 = text2[0...text_length]

  # Quick check for the whole case.
  return text_length if text1 == text2

  # Start by looking for a single character match
  # and increase length until no match is found.
  # Performance analysis: http://neil.fraser.name/news/2010/11/04/
  best = 0
  length = 1
  loop do
    pattern = text1[(text_length - length)..-1]
    found = text2.index(pattern)
    return best if found.nil?

    length += found
    # Compare against exactly `length` characters of text2. The previous
    # inclusive range took one character too many, so this shortcut never
    # matched and every hit needed an extra search pass.
    if found.zero? || text1[(text_length - length)..-1] == text2[0...length]
      best = length
      length += 1
    end
  end
end
338
+
339
# Does a substring of shorttext exist within longtext such that the
# substring is at least half the length of longtext?
#
# The quarter-length slice of longtext starting at i is used as the seed:
# every occurrence of it in shorttext is grown in both directions and the
# longest common run is kept.
#
# Returns [longtext_before, longtext_after, shorttext_before,
# shorttext_after, common_middle], or nil when the best run is shorter than
# half of longtext.
def diff_half_match_i(longtext, shorttext, i)
  seed = longtext[i, longtext.length / 4]
  best_common = ""
  long_before = long_after = short_before = short_after = nil

  j = shorttext.index(seed, 0)
  while j
    prefix_length = diff_common_prefix(longtext[i..-1], shorttext[j..-1])
    suffix_length = diff_common_suffix(longtext[0...i], shorttext[0...j])

    if best_common.length < suffix_length + prefix_length
      run_start = j - suffix_length
      run_end = j + prefix_length
      best_common = shorttext[run_start...j] + shorttext[j...run_end]
      long_before = longtext[0...(i - suffix_length)]
      long_after = longtext[(i + prefix_length)..-1]
      short_before = shorttext[0...run_start]
      short_after = shorttext[run_end..-1]
    end

    j = shorttext.index(seed, j + 1)
  end

  return unless best_common.length * 2 >= longtext.length

  [long_before, long_after, short_before, short_after, best_common]
end
361
+
362
# Do the two texts share a substring which is at least half the length of the
# longer text?
# This speedup can produce non-minimal diffs.
#
# Returns [text1_before, text1_after, text2_before, text2_after,
# common_middle], or nil when there is no such substring or when no timeout
# is configured.
def diff_half_match(text1, text2)
  # Don't risk returning a non-optimal diff if we have unlimited time
  return nil if diff_timeout <= 0

  # Decide the roles once and explicitly. Sorting by length does not
  # guarantee an order for equal-length texts, yet the unpacking below
  # depends on which text played which role.
  text1_is_longer = text1.length > text2.length
  longtext, shorttext = text1_is_longer ? [text1, text2] : [text2, text1]

  # Pointless.
  return nil if longtext.length < 4 || shorttext.length * 2 < longtext.length

  # First check if the second quarter is the seed for a half-match.
  hm1 = diff_half_match_i(longtext, shorttext, (longtext.length + 3) / 4)
  # Check again based on the third quarter.
  hm2 = diff_half_match_i(longtext, shorttext, (longtext.length + 1) / 2)

  half_match = if hm1 && hm2
    # Both matched. Select the longest (ties go to the later seed).
    hm1[4].length > hm2[4].length ? hm1 : hm2
  else
    hm1 || hm2
  end
  return nil unless half_match

  # A half-match was found, sort out the return data.
  long_a, long_b, short_a, short_b, mid_common = half_match
  if text1_is_longer
    [long_a, long_b, short_a, short_b, mid_common]
  else
    [short_a, short_b, long_a, long_b, mid_common]
  end
end
397
+
398
# Reduce the number of edits by eliminating semantically trivial equalities.
# The diff array is modified in place; the method returns nil.
def diff_cleanup_semantic(diffs)
  changes = false
  equalities = [] # Stack of indices where equalities are found.
  last_equality = nil # Text of the equality at diffs[equalities.last].
  pointer = 0 # Index of current position.
  # Number of characters that changed prior to the equality.
  inserted_before = 0
  deleted_before = 0
  # Number of characters that changed after the equality.
  inserted_after = 0
  deleted_after = 0

  while pointer < diffs.length
    op, text = diffs[pointer]

    if op == :equal
      # Equality found: what came after the previous one is now "before".
      equalities.push(pointer)
      inserted_before = inserted_after
      deleted_before = deleted_after
      inserted_after = 0
      deleted_after = 0
      last_equality = text
    else
      # An insertion or deletion.
      if op == :insert
        inserted_after += text.length
      else
        deleted_after += text.length
      end

      # Drop the equality when it is no longer than the edits on both sides.
      if last_equality &&
          last_equality.length <= [inserted_before, deleted_before].max &&
          last_equality.length <= [inserted_after, deleted_after].max
        # Duplicate record.
        diffs[equalities.last, 0] = [[:delete, last_equality]]

        # Change second copy to insert.
        diffs[equalities.last + 1][0] = :insert

        # Throw away the equality we just deleted, and the previous one as
        # well (it needs to be reevaluated).
        equalities.pop(2)
        pointer = equalities.last || -1

        # Reset the counters.
        inserted_before = 0
        deleted_before = 0
        inserted_after = 0
        deleted_after = 0
        last_equality = nil

        changes = true
      end
    end

    pointer += 1
  end

  # Normalize the diff.
  diff_cleanup_merge(diffs) if changes
  diff_cleanup_semantic_lossless(diffs)

  # Find any overlaps between deletions and insertions.
  # e.g: <del>abcxxx</del><ins>xxxdef</ins>
  #   -> <del>abc</del>xxx<ins>def</ins>
  # e.g: <del>xxxabc</del><ins>defxxx</ins>
  #   -> <ins>def</ins>xxx<del>abc</del>
  # Only extract an overlap if it is as big as the edit ahead or behind it.
  pointer = 1
  while pointer < diffs.length
    if diffs[pointer - 1][0] == :delete && diffs[pointer][0] == :insert
      deletion = diffs[pointer - 1][1]
      insertion = diffs[pointer][1]
      tail_overlap = diff_common_overlap(deletion, insertion)
      head_overlap = diff_common_overlap(insertion, deletion)

      if tail_overlap >= head_overlap
        if tail_overlap >= deletion.length / 2.0 ||
            tail_overlap >= insertion.length / 2.0
          # Overlap found. Insert an equality and trim the surrounding edits.
          diffs[pointer, 0] = [[:equal, insertion[0...tail_overlap]]]
          diffs[pointer - 1].replace([:delete, deletion[0...-tail_overlap]])
          diffs[pointer + 1].replace([:insert, insertion[tail_overlap..-1]])
          pointer += 1
        end
      elsif head_overlap >= deletion.length / 2.0 || head_overlap >= insertion.length / 2.0
        # Reverse overlap found. Insert an equality and swap and trim the
        # surrounding edits.
        diffs[pointer, 0] = [[:equal, deletion[0...head_overlap]]]
        diffs[pointer - 1].replace([:insert, insertion[0...-head_overlap]])
        diffs[pointer + 1].replace([:delete, deletion[head_overlap..-1]])
        pointer += 1
      end
      pointer += 1
    end
    pointer += 1
  end
end
495
+
496
# Given two strings, compute a score representing whether the
# internal boundary falls on logical boundaries.
# Scores range from 5 (best) to 0 (worst).
#
# one - the text before the boundary.
# two - the text after the boundary.
#
# Each port of this function behaves slightly differently due to subtle
# differences in each language's definition of things like 'whitespace'.
# Since this function's purpose is largely cosmetic, the choice has been made
# to use each language's native features rather than force total conformity.
def diff_cleanup_semantic_score(one, two)
  # Edges are the best.
  return 5 if one.empty? || two.empty?

  before = one[-1]
  after = two[0]

  # One point for non-alphanumeric.
  non_word_character = /[^a-zA-Z0-9]/
  return 0 unless before.match?(non_word_character) || after.match?(non_word_character)

  # Two points for whitespace.
  whitespace = /\s/
  return 1 unless before.match?(whitespace) || after.match?(whitespace)

  # Three points for line breaks.
  linebreak = /[\r\n]/
  return 2 unless before.match?(linebreak) || after.match?(linebreak)

  # Four points for blank lines. The patterns are anchored to the string
  # boundaries (\z and \A): Ruby's $ and ^ match at every line, which let a
  # blank line in the middle of either text count as being at the boundary.
  line_end = /\n\r?\n\z/
  line_start = /\A\r?\n\r?\n/
  return 3 unless one.match?(line_end) || two.match?(line_start)

  4
end
537
+
538
# Look for single edits surrounded on both sides by equalities
# which can be shifted sideways to align the edit to a word boundary.
# e.g: The c<ins>at c</ins>ame. -> The <ins>cat </ins>came.
#
# The diff array is modified in place; the method returns nil.
def diff_cleanup_semantic_lossless(diffs)
  pointer = 1
  # Intentionally ignore the first and last element (don't need checking).
  while pointer < diffs.length - 1
    if diffs[pointer - 1][0] == :equal && diffs[pointer + 1][0] == :equal
      # This is a single edit surrounded by equalities.
      equality1 = diffs[pointer - 1][1]
      edit = diffs[pointer][1]
      equality2 = diffs[pointer + 1][1]

      # First, shift the edit as far left as possible.
      common_offset = diff_common_suffix(equality1, edit)
      if common_offset != 0
        common_string = edit[-common_offset..-1]
        equality1 = equality1[0...-common_offset]
        edit = common_string + edit[0...-common_offset]
        equality2 = common_string + equality2
      end

      # Second, step character by character right, looking for the best fit.
      best_equality1 = equality1
      best_edit = edit
      best_equality2 = equality2
      best_score = diff_cleanup_semantic_score(equality1, edit) +
        diff_cleanup_semantic_score(edit, equality2)

      # Both strings must be non-empty: String#[] returns nil past the end,
      # so without the guard two empty strings compare equal (nil == nil)
      # and the concatenation below raises a TypeError.
      while !edit.empty? && !equality2.empty? && edit[0] == equality2[0]
        equality1 += edit[0]
        edit = edit[1..-1] + equality2[0]
        equality2 = equality2[1..-1]
        score = diff_cleanup_semantic_score(equality1, edit) +
          diff_cleanup_semantic_score(edit, equality2)
        # The >= encourages trailing rather than leading whitespace on edits.
        if score >= best_score
          best_score = score
          best_equality1 = equality1
          best_edit = edit
          best_equality2 = equality2
        end
      end

      if diffs[pointer - 1][1] != best_equality1
        # We have an improvement, save it back to the diff.
        if best_equality1.empty?
          # The leading equality was used up: drop it and follow the edit,
          # which has moved one slot to the left.
          diffs[pointer - 1, 1] = []
          pointer -= 1
        else
          diffs[pointer - 1][1] = best_equality1
        end

        diffs[pointer][1] = best_edit

        if best_equality2.empty?
          # The trailing equality was used up: drop it and step back.
          diffs[pointer + 1, 1] = []
          pointer -= 1
        else
          diffs[pointer + 1][1] = best_equality2
        end
      end
    end

    pointer += 1
  end
end
604
+
605
# Reduce the number of edits by eliminating operationally trivial equalities,
# i.e. equalities shorter than diff_edit_cost that sit between edits.
# The diff array is modified in place.
def diff_cleanup_efficiency(diffs)
  changes = false
  equalities = [] # Stack of indices where equalities are found.
  last_equality = "" # Text of the equality at diffs[equalities.last].
  pointer = 0 # Index of current position.
  pre_ins = false # Is there an insertion operation before the last equality.
  pre_del = false # Is there a deletion operation before the last equality.
  post_ins = false # Is there an insertion operation after the last equality.
  post_del = false # Is there a deletion operation after the last equality.

  while pointer < diffs.length
    op, text = diffs[pointer]

    if op == :equal # Equality found.
      if text.length < diff_edit_cost && (post_ins || post_del)
        # Candidate found.
        equalities.push(pointer)
        pre_ins = post_ins
        pre_del = post_del
        last_equality = text
      else
        # Not a candidate, and can never become one.
        equalities.clear
        last_equality = ""
      end
      post_ins = false
      post_del = false
    else # An insertion or deletion.
      if op == :delete
        post_del = true
      else
        post_ins = true
      end

      # Five types to be split:
      # <ins>A</ins><del>B</del>XY<ins>C</ins><del>D</del>
      # <ins>A</ins>X<ins>C</ins><del>D</del>
      # <ins>A</ins><del>B</del>X<ins>C</ins>
      # <del>A</del>X<ins>C</ins><del>D</del>
      # <ins>A</ins><del>B</del>X<del>C</del>
      #
      # "Shorter than half the edit cost" is tested as length * 2 < cost so
      # that odd costs are not truncated by integer division.
      if !last_equality.empty? &&
          ((pre_ins && pre_del && post_ins && post_del) ||
          ((last_equality.length * 2 < diff_edit_cost) &&
          [pre_ins, pre_del, post_ins, post_del].count(true) == 3))
        # Duplicate record.
        diffs[equalities.last, 0] = [[:delete, last_equality]]
        # Change second copy to insert.
        diffs[equalities.last + 1][0] = :insert
        equalities.pop # Throw away the equality we just deleted
        last_equality = ""
        if pre_ins && pre_del
          # No changes made which could affect previous entry, keep going.
          post_ins = true
          post_del = true
          equalities.clear
        else
          unless equalities.empty?
            equalities.pop # Throw away the previous equality.
            pointer = equalities.last || -1
          end
          post_ins = false
          post_del = false
        end
        changes = true
      end
    end
    pointer += 1
  end

  diff_cleanup_merge(diffs) if changes
end
678
+
679
+ # Reorder and merge like edit sections. Merge equalities.
680
+ # Any edit section can move as long as it doesn't cross an equality.
681
+ def diff_cleanup_merge(diffs)
682
+ diffs.push([:equal, ""]) # Add a dummy entry at the end.
683
+ pointer = 0
684
+ count_delete = 0
685
+ count_insert = 0
686
+ text_delete = ""
687
+ text_insert = ""
688
+
689
+ while pointer < diffs.length
690
+ case diffs[pointer][0]
691
+ when :insert
692
+ count_insert += 1
693
+ text_insert += diffs[pointer][1]
694
+ pointer += 1
695
+ when :delete
696
+ count_delete += 1
697
+ text_delete += diffs[pointer][1]
698
+ pointer += 1
699
+ when :equal
700
+ # Upon reaching an equality, check for prior redundancies.
701
+ if count_delete + count_insert > 1
702
+ if count_delete != 0 && count_insert != 0
703
+ # Factor out any common prefixies.
704
+ common_length = diff_common_prefix(text_insert, text_delete)
705
+ if common_length != 0
706
+ if (pointer - count_delete - count_insert) > 0 &&
707
+ diffs[pointer - count_delete - count_insert - 1][0] == :equal
708
+ diffs[pointer - count_delete - count_insert - 1][1] +=
709
+ text_insert[0...common_length]
710
+ else
711
+ diffs.unshift([:equal, text_insert[0...common_length]])
712
+ pointer += 1
713
+ end
714
+ text_insert = text_insert[common_length..-1]
715
+ text_delete = text_delete[common_length..-1]
716
+ end
717
+ # Factor out any common suffixies.
718
+ common_length = diff_common_suffix(text_insert, text_delete)
719
+ if common_length != 0
720
+ diffs[pointer][1] = text_insert[-common_length..-1] + diffs[pointer][1]
721
+ text_insert = text_insert[0...-common_length]
722
+ text_delete = text_delete[0...-common_length]
723
+ end
724
+ end
725
+
726
+ # Delete the offending records and add the merged ones.
727
+ diffs[pointer - count_delete - count_insert, count_delete + count_insert] = if count_delete.zero?
728
+ [[:insert, text_insert]]
729
+ elsif count_insert.zero?
730
+ [[:delete, text_delete]]
731
+ else
732
+ [[:delete, text_delete], [:insert, text_insert]]
733
+ end
734
+ pointer = pointer - count_delete - count_insert +
735
+ (count_delete.zero? ? 0 : 1) + (count_insert.zero? ? 0 : 1) + 1
736
+ elsif pointer != 0 && diffs[pointer - 1][0] == :equal
737
+ # Merge this equality with the previous one.
738
+ diffs[pointer - 1][1] += diffs[pointer][1]
739
+ diffs[pointer, 1] = []
740
+ else
741
+ pointer += 1
742
+ end
743
+ count_insert = 0
744
+ count_delete = 0
745
+ text_delete = ""
746
+ text_insert = ""
747
+ end
748
+ end
749
+
750
+ if diffs.last[1].empty?
751
+ diffs.pop # Remove the dummy entry at the end.
752
+ end
753
+
754
+ # Second pass: look for single edits surrounded on both sides by equalities
755
+ # which can be shifted sideways to eliminate an equality.
756
+ # e.g: A<ins>BA</ins>C -> <ins>AB</ins>AC
757
+ changes = false
758
+ pointer = 1
759
+
760
+ # Intentionally ignore the first and last element (don't need checking).
761
+ while pointer < diffs.length - 1
762
+ if diffs[pointer - 1][0] == :equal && diffs[pointer + 1][0] == :equal
763
+ # This is a single edit surrounded by equalities.
764
+ if diffs[pointer][1][-diffs[pointer - 1][1].length..-1] == diffs[pointer - 1][1]
765
+ # Shift the edit over the previous equality.
766
+ diffs[pointer][1] = diffs[pointer - 1][1] + diffs[pointer][1][0...-diffs[pointer - 1][1].length]
767
+ diffs[pointer + 1][1] = diffs[pointer - 1][1] + diffs[pointer + 1][1]
768
+ diffs[pointer - 1, 1] = []
769
+ changes = true
770
+ elsif diffs[pointer][1][0...diffs[pointer + 1][1].length] == diffs[pointer + 1][1]
771
+ # Shift the edit over the next equality.
772
+ diffs[pointer - 1][1] += diffs[pointer + 1][1]
773
+ diffs[pointer][1] = diffs[pointer][1][diffs[pointer + 1][1].length..-1] +
774
+ diffs[pointer + 1][1]
775
+ diffs[pointer + 1, 1] = []
776
+ changes = true
777
+ end
778
+ end
779
+ pointer += 1
780
+ end
781
+
782
+ # If shifts were made, the diff needs reordering and another shift sweep.
783
+ if changes
784
+ diff_cleanup_merge(diffs)
785
+ end
786
+ end
787
+
788
# loc is a location in text1, compute and return the equivalent location
# in text2. e.g. 'The cat' vs 'The big cat', 1->1, 5->8
#
# Walks the diffs while tracking how many characters of text1 and text2 have
# been consumed, and stops at the first diff that reaches past loc.
def diff_x_index(diffs, loc)
  consumed1 = 0
  consumed2 = 0
  prev1 = 0 # text1 offset at the start of the diff that covers loc
  prev2 = 0 # text2 offset at the start of the diff that covers loc
  covering_op = nil

  diffs.each do |op, data|
    consumed1 += data.length unless op == :insert
    consumed2 += data.length unless op == :delete
    if consumed1 > loc
      covering_op = op
      break
    end
    prev1 = consumed1
    prev2 = consumed2
  end

  # The location was deleted: map it to the start of the deletion.
  return prev2 if covering_op == :delete

  # Otherwise add the remaining offset inside the covering diff.
  prev2 + (loc - prev1)
end
819
+
820
# Convert a diff array into a pretty HTML report.
#
# Each [op, text] pair becomes one element: <ins> for :insert, <del> for
# :delete and <span> for :equal. The text is HTML-escaped (&, <, >) and every
# newline is rendered as "&para;<br>". Returns the concatenated HTML string.
def diff_pretty_html(diffs)
  diffs.map { |op, data|
    # The newline pattern must be double-quoted: '\n' in single quotes is a
    # literal backslash followed by "n" and would never match a line break.
    text = data.gsub("&", "&amp;").gsub("<", "&lt;").gsub(">", "&gt;").gsub("\n", "&para;<br>")
    case op
    when :insert
      "<ins style=\"background:#e6ffe6;\">#{text}</ins>"
    when :delete
      "<del style=\"background:#ffe6e6;\">#{text}</del>"
    when :equal
      "<span>#{text}</span>"
    end
  }.join
end
834
+
835
# Compute and return the source text (all equalities and deletions).
# Insertions are skipped because they are not present in the source.
def diff_text1(diffs)
  diffs.reject { |op, _data| op == :insert }.map { |_op, data| data }.join
end
845
+
846
# Compute and return the destination text (all equalities and insertions).
# Deletions are skipped because they are not present in the destination.
def diff_text2(diffs)
  diffs.reject { |op, _data| op == :delete }.map { |_op, data| data }.join
end
856
+
857
# Compute the Levenshtein distance; the number of inserted, deleted or
# substituted characters.
#
# Insert and delete lengths are accumulated per run of edits; at each
# equality (and at the end) the larger of the two is added to the total,
# because a deletion paired with an insertion counts as one substitution.
def diff_levenshtein(diffs)
  distance = 0
  pending = {insert: 0, delete: 0}

  diffs.each do |op, data|
    if op == :equal
      distance += pending.values.max
      pending[:insert] = 0
      pending[:delete] = 0
    elsif pending.key?(op)
      pending[op] += data.length
    end
  end

  distance + pending.values.max
end
880
+
881
# Crush the diff into an encoded string which describes the operations
# required to transform text1 into text2.
# E.g. =3\t-2\t+ing -> Keep 3 chars, delete 2 chars, insert 'ing'.
# Operations are tab-separated. Inserted text is escaped using %xx notation
# (via PatchObj::PATCH_PARSER), after which "%20" is turned back into a
# plain space across the whole delta.
def diff_to_delta(diffs)
  tokens = diffs.map do |op, data|
    case op
    when :insert
      "+" + PatchObj::PATCH_PARSER.escape(data, /[^0-9A-Za-z_.;!~*'(),\/?:@&=+$\#-]/)
    when :delete
      "-#{data.length}"
    when :equal
      "=#{data.length}"
    end
  end

  tokens.join("\t").gsub("%20", " ")
end
897
+
898
# Given the original text1, and an encoded string which describes the
# operations required to transform text1 into text2, compute the full diff.
#
# Returns an array of [op, text] pairs. Raises ArgumentError when a token has
# an unknown operation, a non-numeric or negative count, or when the counts
# do not add up to the length of text1.
def diff_from_delta(text1, delta)
  # Deltas should be composed of a subset of ascii chars, Unicode not required.
  # The converted string is discarded; the call only serves to raise when the
  # delta cannot be represented in ASCII.
  delta.encode("ascii")
  diffs = []
  pointer = 0 # Cursor in text1
  delta.split("\t").each do |token|
    # Each token begins with a one character parameter which specifies the
    # operation of this token (delete, insert, equality).
    param = token[1..-1]
    case token[0]
    when "+"
      diffs.push([:insert, PatchObj::PATCH_PARSER.unescape(param.force_encoding(Encoding::UTF_8))])
    when "-", "="
      n = begin
        Integer(param)
      rescue ArgumentError
        raise ArgumentError.new(
          "Invalid number in diff_fromDelta: #{param.inspect}"
        )
      end
      # A bare `raise` here would surface as a message-less RuntimeError and
      # slip past callers rescuing ArgumentError, so raise it explicitly.
      if n < 0
        raise ArgumentError.new(
          "Negative number in diff_fromDelta: #{param.inspect}"
        )
      end
      text = text1[pointer...(pointer + n)]
      pointer += n
      if token[0] == "="
        diffs.push([:equal, text])
      else
        diffs.push([:delete, text])
      end
    else
      raise ArgumentError.new(
        "Invalid diff operation in diff_fromDelta: #{token.inspect}"
      )
    end
  end

  if pointer != text1.length
    raise ArgumentError.new("Delta length (#{pointer}) does not equal " \
      "source text length #{text1.length}")
  end
  diffs
end
941
+
942
# Locate the best instance of 'pattern' in 'text' near 'loc'.
#
# Returns the match index, or -1 when text is empty. Raises ArgumentError
# when text or pattern is nil. Anything not settled by the shortcuts below is
# delegated to match_bitap.
def match_main(text, pattern, loc)
  # Check for null inputs.
  raise ArgumentError.new("Null input. (match_main)") if text.nil? || pattern.nil?

  # Clamp loc into 0..text.length.
  loc = [0, [loc, text.length].min].max

  # Shortcut (potentially not guaranteed by the algorithm).
  return 0 if text == pattern
  # Nothing to match.
  return -1 if text.empty?
  # Perfect match at the perfect spot! (Includes case of null pattern)
  return loc if text[loc, pattern.length] == pattern

  # Do a fuzzy compare.
  match_bitap(text, pattern, loc)
end
964
+
965
# Locate the best instance of 'pattern' in 'text' near 'loc' using the
# Bitap algorithm.
#
# Returns the index of the best match, or -1 when no match scores within
# match_threshold. Raises ArgumentError when the pattern is longer than
# match_max_bits.
def match_bitap(text, pattern, loc)
  if pattern.length > match_max_bits
    # `raise`, not `throw`: throw is catch/throw control flow and would
    # surface as UncaughtThrowError instead of the intended error.
    raise ArgumentError, "Pattern too long"
  end

  # Initialise the alphabet.
  s = match_alphabet(pattern)

  # Compute and return the score for a match with e errors and x location.
  match_bitap_score = ->(e, x) do
    accuracy = e.to_f / pattern.length
    proximity = (loc - x).abs
    if match_distance == 0
      # Dodge divide by zero error.
      proximity == 0 ? accuracy : 1.0
    else
      accuracy + (proximity.to_f / match_distance)
    end
  end

  # Highest score beyond which we give up.
  score_threshold = match_threshold
  # Is there a nearby exact match? (speedup)
  best_loc = text.index(pattern, loc)
  if best_loc
    score_threshold = [match_bitap_score[0, best_loc], score_threshold].min
    # What about in the other direction? (speedup)
    best_loc = text.rindex(pattern, loc + pattern.length)
    if best_loc
      score_threshold = [match_bitap_score[0, best_loc], score_threshold].min
    end
  end

  # Initialise the bit arrays.
  match_mask = 1 << (pattern.length - 1)
  best_loc = -1

  bin_max = pattern.length + text.length
  last_rd = nil # Bit array of the previous error level; set after each pass.
  pattern.length.times do |d|
    # Scan for the best match; each iteration allows for one more error.
    # Run a binary search to determine how far from 'loc' we can stray at this
    # error level.
    bin_min = 0
    bin_mid = bin_max
    while bin_min < bin_mid
      if match_bitap_score[d, loc + bin_mid] <= score_threshold
        bin_min = bin_mid
      else
        bin_max = bin_mid
      end
      bin_mid = (bin_max - bin_min) / 2 + bin_min
    end

    # Use the result from this iteration as the maximum for the next.
    bin_max = bin_mid
    start = [1, loc - bin_mid + 1].max
    finish = [loc + bin_mid, text.length].min + pattern.length

    rd = Array.new(finish + 2, 0)
    rd[finish + 1] = (1 << d) - 1
    finish.downto(start) do |j|
      # Characters past the end of text (nil) or absent from the pattern
      # contribute an empty mask.
      char_match = s[text[j - 1]] || 0
      rd[j] = if d == 0 # First pass: exact match.
        ((rd[j + 1] << 1) | 1) & char_match
      else # Subsequent passes: fuzzy match.
        ((rd[j + 1] << 1) | 1) & char_match |
          (((last_rd[j + 1] | last_rd[j]) << 1) | 1) | last_rd[j + 1]
      end
      if (rd[j] & match_mask).nonzero?
        score = match_bitap_score[d, j - 1]
        # This match will almost certainly be better than any existing match.
        # But check anyway.
        if score <= score_threshold
          score_threshold = score
          best_loc = j - 1
          if best_loc > loc
            # When passing loc, don't exceed our current distance from loc.
            # (downto has already fixed its bounds, so this assignment does
            # not shorten the current scan.)
            start = [1, 2 * loc - best_loc].max
          else
            # Already passed loc, downhill from here on in.
            break
          end
        end
      end
    end

    # No hope for a (better) match at greater error levels.
    if match_bitap_score[d + 1, loc] > score_threshold
      break
    end
    last_rd = rd
  end

  best_loc
end
1064
+
1065
# Initialise the alphabet for the Bitap algorithm.
#
# Returns a hash mapping each character of pattern to a bitmask in which bit
# (pattern.length - i - 1) is set for every index i where the character
# occurs.
def match_alphabet(pattern)
  top_bit = pattern.length - 1
  masks = {}
  pattern.each_char.with_index do |char, idx|
    masks[char] = (masks[char] || 0) | (1 << (top_bit - idx))
  end
  masks
end
1074
+
1075
# Parse a textual representation of patches and return a list of patch
# objects.
#
# Each patch starts with a "@@ -start1,length1 +start2,length2 @@" header,
# followed by body lines whose first character is "-" (delete), "+" (insert)
# or " " (equality); the rest of the line is unescaped through
# PatchObj::PATCH_PARSER. Raises ArgumentError on a malformed header or an
# unknown line prefix.
def patch_from_text(textline)
  return [] if textline.empty?

  header = /^@@ -(\d+),?(\d*) \+(\d+),?(\d*) @@$/
  ops_by_sign = {"-" => :delete, "+" => :insert, " " => :equal}
  lines = textline.split("\n")
  parsed = []
  idx = 0

  while idx < lines.length
    match = header.match(lines[idx])
    raise ArgumentError.new("Invalid patch string: #{lines[idx]}") if match.nil?

    patch = PatchObj.new
    parsed.push(patch)

    # A missing length means 1; a length of "0" leaves the start index as
    # written; in every other case the start is shifted down by one.
    patch.start1 = match[1].to_i
    case match[2]
    when ""
      patch.start1 -= 1
      patch.length1 = 1
    when "0"
      patch.length1 = 0
    else
      patch.start1 -= 1
      patch.length1 = match[2].to_i
    end

    patch.start2 = match[3].to_i
    case match[4]
    when ""
      patch.start2 -= 1
      patch.length2 = 1
    when "0"
      patch.length2 = 0
    else
      patch.start2 -= 1
      patch.length2 = match[4].to_i
    end
    idx += 1

    while idx < lines.length
      raw = lines[idx]
      if raw.empty?
        # Blank line? Whatever.
        idx += 1
        next
      end

      sign = raw[0]
      line = PatchObj::PATCH_PARSER.unescape(raw[1..-1].force_encoding(Encoding::UTF_8))

      # "@" marks the start of the next patch; leave it for the outer loop.
      break if sign == "@"

      op = ops_by_sign[sign]
      raise ArgumentError.new("Invalid patch mode \"#{sign}\" in: #{line}") if op.nil?

      patch.diffs.push([op, line])
      idx += 1
    end
  end

  parsed
end
1147
+
1148
# Take a list of patches and return a textual representation: the string
# form of every patch, concatenated in order.
def patch_to_text(patches)
  patches.map(&:to_s).join
end
1152
+
1153
# Increase the context until it is unique,
# but don't let the pattern expand beyond match_max_bits.
#
# patch is modified in place: equalities taken from text are added before
# and after its diffs, and its start/length fields are adjusted to cover
# them. Does nothing when text is empty.
def patch_add_context(patch, text)
  return if text.empty?
  pattern = text[patch.start2, patch.length1]
  padding = 0

  # Look for the first and last matches of pattern in text. If two different
  # matches are found, increase the pattern length.
  while text.index(pattern) != text.rindex(pattern) &&
      pattern.length < match_max_bits - 2 * patch_margin
    padding += patch_margin
    pattern = text[[0, patch.start2 - padding].max...(patch.start2 + patch.length1 + padding)]
  end

  # Add one chunk for good luck.
  padding += patch_margin

  # String#[] returns nil when the start offset lies past the end of text
  # (e.g. a patch that claims to run beyond the text). Normalise to "" so
  # the length arithmetic below cannot fail on nil.

  # Add the prefix.
  prefix = text[[0, patch.start2 - padding].max...patch.start2].to_s
  patch.diffs.unshift([:equal, prefix]) unless prefix.empty?

  # Add the suffix.
  suffix = text[patch.start2 + patch.length1, padding].to_s
  patch.diffs.push([:equal, suffix]) unless suffix.empty?

  # Roll back the start points.
  patch.start1 -= prefix.length
  patch.start2 -= prefix.length

  # Extend the lengths.
  patch.length1 += prefix.length + suffix.length
  patch.length2 += prefix.length + suffix.length
end
1187
+
1188
# Compute a list of patches to turn text1 into text2.
# Use diffs if provided, otherwise compute it ourselves.
# There are four ways to call this function, depending on what data is
# available to the caller:
# Method 1:
# a = text1, b = text2
# Method 2:
# a = diffs
# Method 3 (optimal):
# a = text1, b = diffs
# Method 4 (deprecated, use method 3):
# a = text1, b = text2, c = diffs
#
# Returns an array of patch objects ([] when there are no diffs). Raises
# ArgumentError for any other argument combination.
def patch_make(*args)
  first, second, third = args
  if args.length == 2 && first.is_a?(String) && second.is_a?(String)
    # Method 1: compute diffs from text1 and text2.
    text1 = first
    diffs = diff_main(first, second, true)
    if diffs.length > 2
      diff_cleanup_semantic(diffs)
      diff_cleanup_efficiency(diffs)
    end
  elsif args.length == 1 && first.is_a?(Array)
    # Method 2: compute text1 from diffs.
    diffs = first
    text1 = diff_text1(diffs)
  elsif args.length == 2 && first.is_a?(String) && second.is_a?(Array)
    # Method 3: text1, diffs.
    text1 = first
    diffs = second
  elsif args.length == 3 && first.is_a?(String) && second.is_a?(String) &&
      third.is_a?(Array)
    # Method 4: text1, text2, diffs. text2 is not used.
    text1 = first
    diffs = third
  else
    raise ArgumentError.new("Unknown call format to patch_make.")
  end

  return [] if diffs.empty? # Get rid of the null case.

  patches = []
  patch = PatchObj.new
  pos1 = 0 # Number of characters into the text1 string.
  pos2 = 0 # Number of characters into the text2 string.
  prepatch_text = text1 # Recreate the patches to determine context info.
  postpatch_text = text1
  final_index = diffs.length - 1

  diffs.each_with_index do |diff, x|
    op, chunk = diff
    if patch.diffs.empty? && op != :equal
      # A new patch starts here.
      patch.start1 = pos1
      patch.start2 = pos2
    end

    if op == :insert
      patch.diffs.push(diff)
      patch.length2 += chunk.length
      postpatch_text = postpatch_text[0...pos2] + chunk + postpatch_text[pos2..-1]
    elsif op == :delete
      patch.length1 += chunk.length
      patch.diffs.push(diff)
      postpatch_text = postpatch_text[0...pos2] + postpatch_text[(pos2 + chunk.length)..-1]
    elsif op == :equal
      # NOTE(review): the upstream diff-match-patch reference appears to run
      # the "new patch" check as a separate `if`, so an equality of exactly
      # 2 * patch_margin is both appended and closes the patch there. Here it
      # is only appended. Confirm whether this difference is intended.
      if chunk.length <= 2 * patch_margin && !patch.diffs.empty? && x != final_index
        # Small equality inside a patch.
        patch.diffs.push(diff)
        patch.length1 += chunk.length
        patch.length2 += chunk.length
      elsif chunk.length >= 2 * patch_margin
        # Time for a new patch.
        unless patch.diffs.empty?
          patch_add_context(patch, prepatch_text)
          patches.push(patch)
          patch = PatchObj.new
          # Unlike Unidiff, our patch lists have a rolling context.
          # http://code.google.com/p/google-diff-match-patch/wiki/Unidiff
          # Update prepatch text & pos to reflect the application of the
          # just completed patch.
          prepatch_text = postpatch_text
          pos1 = pos2
        end
      end
    end

    # Update the current character count.
    pos1 += chunk.length unless op == :insert
    pos2 += chunk.length unless op == :delete
  end

  # Pick up the leftover patch if not empty.
  unless patch.diffs.empty?
    patch_add_context(patch, prepatch_text)
    patches.push(patch)
  end

  patches
end
1298
+
1299
# Merge a set of patches onto the text. Return a patched text, as well
# as a list of true/false values indicating which patches were applied.
#
# The patches passed in are not modified: a deep copy is padded and split
# before being matched against the text.
def patch_apply(patches, text)
  return [text, []] if patches.empty?

  # Deep copy the patches so that no changes are made to originals.
  patches = Marshal.load(Marshal.dump(patches))

  null_padding = patch_add_padding(patches)
  text = null_padding + text + null_padding
  patch_split_max(patches)

  # delta keeps track of the offset between the expected and actual location
  # of the previous patch. If there are patches expected at positions 10 and
  # 20, but the first patch was found at 12, delta is 2 and the second patch
  # has an effective expected position of 22.
  delta = 0
  results = []
  patches.each_with_index do |patch, x|
    expected_loc = patch.start2 + delta
    text1 = diff_text1(patch.diffs)
    end_loc = -1
    if text1.length > match_max_bits
      # patch_splitMax will only provide an oversized pattern in the case of
      # a monster delete.
      start_loc = match_main(text, text1[0, match_max_bits], expected_loc)
      if start_loc != -1
        end_loc = match_main(text, text1[(text1.length - match_max_bits)..-1],
          expected_loc + text1.length - match_max_bits)
        if end_loc == -1 || start_loc >= end_loc
          # Can't find valid trailing context. Drop this patch.
          start_loc = -1
        end
      end
    else
      start_loc = match_main(text, text1, expected_loc)
    end
    if start_loc == -1
      # No match found. :(
      results[x] = false
      # Subtract the delta for this failed patch from subsequent patches.
      delta -= patch.length2 - patch.length1
    else
      # Found a match. :)
      results[x] = true
      delta = start_loc - expected_loc
      text2 = if end_loc == -1
        text[start_loc, text1.length]
      else
        # end_loc + match_max_bits is an end *index*, so take a range; the
        # two-argument form of String#[] would treat it as a length and
        # overshoot by start_loc characters.
        text[start_loc...(end_loc + match_max_bits)]
      end

      if text1 == text2
        # Perfect match, just shove the replacement text in.
        text = text[0, start_loc] + diff_text2(patch.diffs) + text[(start_loc + text1.length)..-1]
      else
        # Imperfect match.
        # Run a diff to get a framework of equivalent indices.
        diffs = diff_main(text1, text2, false)
        if text1.length > match_max_bits &&
            diff_levenshtein(diffs).to_f / text1.length > patch_delete_threshold
          # The end points match, but the content is unacceptably bad.
          results[x] = false
        else
          diff_cleanup_semantic_lossless(diffs)
          index1 = 0
          patch.diffs.each do |op, data|
            if op != :equal
              index2 = diff_x_index(diffs, index1)
            end
            if op == :insert # Insertion
              text = text[0, start_loc + index2] + data + text[(start_loc + index2)..-1]
            elsif op == :delete # Deletion
              text = text[0, start_loc + index2] +
                text[(start_loc + diff_x_index(diffs, index1 + data.length))..-1]
            end
            if op != :delete
              index1 += data.length
            end
          end
        end
      end
    end
  end

  # Strip the padding off.
  text = text[null_padding.length...-null_padding.length]
  [text, results]
end
1384
+
1385
# Add some padding on text start and end so that edges can match
# something. Intended to be called only from within patch_apply.
#
# The padding is the characters with code points 1..patch_margin. Every
# patch is shifted forward by that length, the first patch gets (or has
# completed) a leading padding equality and the last patch a trailing one.
# Returns the padding string.
def patch_add_padding(patches)
  pad_len = patch_margin
  null_padding = 1.upto(pad_len).map { |code| code.chr(Encoding::UTF_8) }.join

  # Bump all the patches forward.
  patches.each do |patch|
    patch.start1 += pad_len
    patch.start2 += pad_len
  end

  # Add some padding on start of first diff.
  head = patches.first
  head_diffs = head.diffs
  if head_diffs.empty? || head_diffs.first[0] != :equal
    # Add nullPadding equality.
    head_diffs.unshift([:equal, null_padding])
    head.start1 -= pad_len # Should be 0.
    head.start2 -= pad_len # Should be 0.
    head.length1 += pad_len
    head.length2 += pad_len
  elsif pad_len > head_diffs.first[1].length
    # Grow first equality with the tail of the padding.
    leading = head_diffs.first
    missing = pad_len - leading[1].length
    leading[1] = null_padding[leading[1].length..-1] + leading[1]
    head.start1 -= missing
    head.start2 -= missing
    head.length1 += missing
    head.length2 += missing
  end

  # Add some padding on end of last diff.
  tail = patches.last
  tail_diffs = tail.diffs
  if tail_diffs.empty? || tail_diffs.last[0] != :equal
    # Add nullPadding equality.
    tail_diffs.push([:equal, null_padding])
    tail.length1 += pad_len
    tail.length2 += pad_len
  elsif pad_len > tail_diffs.last[1].length
    # Grow last equality with the head of the padding.
    trailing = tail_diffs.last
    missing = pad_len - trailing[1].length
    trailing[1] += null_padding[0, missing]
    tail.length1 += missing
    tail.length2 += missing
  end

  null_padding
end
1435
+
1436
# Look through the patches and break up any which are longer than the
# maximum limit of the match algorithm.
#
# patches is modified in place: every patch whose length1 exceeds
# match_max_bits is removed and replaced by a sequence of smaller patches
# that carry patch_margin characters of context between them.
def patch_split_max(patches)
  patch_size = match_max_bits

  x = 0
  while x < patches.length
    if patches[x].length1 > patch_size
      big_patch = patches[x]
      # Remove the big old patch
      patches[x, 1] = []
      x -= 1
      start1 = big_patch.start1
      start2 = big_patch.start2
      pre_context = ""
      until big_patch.diffs.empty?
        # Create one of several smaller patches.
        patch = PatchObj.new
        empty = true # Stays true while the new patch holds only equalities.
        patch.start1 = start1 - pre_context.length
        patch.start2 = start2 - pre_context.length
        unless pre_context.empty?
          patch.length1 = patch.length2 = pre_context.length
          patch.diffs.push([:equal, pre_context])
        end

        while !big_patch.diffs.empty? && patch.length1 < patch_size - patch_margin
          diff = big_patch.diffs.first
          if diff[0] == :insert
            # Insertions are harmless.
            patch.length2 += diff[1].length
            start2 += diff[1].length
            patch.diffs.push(big_patch.diffs.shift)
            empty = false
          elsif diff[0] == :delete && patch.diffs.length == 1 &&
              patch.diffs.first[0] == :equal && diff[1].length > 2 * patch_size
            # This is a large deletion. Let it pass in one chunk.
            patch.length1 += diff[1].length
            start1 += diff[1].length
            empty = false
            patch.diffs.push(big_patch.diffs.shift)
          else
            # Deletion or equality. Only take as much as we can stomach.
            diff_text = diff[1][0, patch_size - patch.length1 - patch_margin]
            patch.length1 += diff_text.length
            start1 += diff_text.length
            if diff[0] == :equal
              patch.length2 += diff_text.length
              start2 += diff_text.length
            else
              empty = false
            end
            patch.diffs.push([diff[0], diff_text])
            if diff_text == big_patch.diffs.first[1]
              big_patch.diffs.shift
            else
              big_patch.diffs.first[1] = big_patch.diffs.first[1][diff_text.length..-1]
            end
          end
        end

        # Compute the head context for the next patch: the last patch_margin
        # characters of this patch's text2, or all of it when it is shorter.
        # (A plain [-patch_margin..-1] slice returns nil for a short string,
        # which would silently drop the context.)
        built_text2 = diff_text2(patch.diffs)
        pre_context = if built_text2.length > patch_margin
          built_text2[-patch_margin..-1]
        else
          built_text2
        end

        # Append the end context for this patch.
        post_context = diff_text1(big_patch.diffs)[0...patch_margin] || ""
        unless post_context.empty?
          patch.length1 += post_context.length
          patch.length2 += post_context.length
          if !patch.diffs.empty? && patch.diffs.last[0] == :equal
            patch.diffs.last[1] += post_context
          else
            patch.diffs.push([:equal, post_context])
          end
        end
        unless empty
          x += 1
          patches[x, 0] = [patch]
        end
      end
    end
    x += 1
  end
end
1520
+ end
1521
+
1522
+ DiffMatchPatch = DiMaPa