fast_html_diff 0.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +15 -0
- data/.gitignore +17 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +20 -0
- data/README.md +58 -0
- data/Rakefile +1 -0
- data/fast_html_diff.gemspec +24 -0
- data/lib/fast_html_diff.rb +443 -0
- data/lib/fast_html_diff/version.rb +3 -0
- metadata +94 -0
checksums.yaml
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
---
|
2
|
+
!binary "U0hBMQ==":
|
3
|
+
metadata.gz: !binary |-
|
4
|
+
MGI5NjhiMjI5ODFlNjJhM2NhODJhNjAxMDRlMWM2Y2ExNGJmYzlkNg==
|
5
|
+
data.tar.gz: !binary |-
|
6
|
+
OTgzNjEwMWQ5NTJiNGMxNzljODljYzJmYmY5ZGEzMjg1NDA5ZjA0OA==
|
7
|
+
!binary "U0hBNTEy":
|
8
|
+
metadata.gz: !binary |-
|
9
|
+
ZTAxYmU5MzEwMGY4ZTYwYmYwYjVhNTIxM2UzM2RjMTg2MDZkZWZjYzYwNjQ1
|
10
|
+
NjMzOGY1YThjMzY1M2E4MjU4ZTZhODBhMmM3MmRlYzA0ZmQ2MDdkZWI5ZjE0
|
11
|
+
NzI0NGVmNjMwMjYxMGM5ZmE0MDdiMWU3ZTAwNmRjZWMyNGIxMWQ=
|
12
|
+
data.tar.gz: !binary |-
|
13
|
+
ZWYyOWMwMDZkN2JiMjZjMWExNDA3OWNlMGI4ZDZiNDg3OTMxN2YwZjc0NDFk
|
14
|
+
OGE0OGZkYTU0ZTk4MjkwOGJlYmZlZmRiYjZiMmE2MzhjODU3ZjZlOWZkMmIy
|
15
|
+
ODU1NGQ1ZGY3MjU1ZDZhNDMzOTNmNGI5YzNkZjVjOTg0ZDIyMTc=
|
data/.gitignore
ADDED
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
BSD License for FastHtmlDiff (http://github.com/kmewhort/fast_html_diff)
|
2
|
+
|
3
|
+
Copyright (c) 2013, Kent Mewhort
|
4
|
+
All rights reserved.
|
5
|
+
|
6
|
+
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the
|
7
|
+
following conditions are met:
|
8
|
+
|
9
|
+
Redistributions of source code must retain the above copyright notice, this list of conditions and the following
|
10
|
+
disclaimer.
|
11
|
+
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following
|
12
|
+
disclaimer in the documentation and/or other materials provided with the distribution.
|
13
|
+
|
14
|
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
|
15
|
+
INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
16
|
+
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
17
|
+
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
18
|
+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
19
|
+
WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
|
20
|
+
THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
data/README.md
ADDED
@@ -0,0 +1,58 @@
|
|
1
|
+
# FastHtmlDiff
|
2
|
+
|
3
|
+
This gem performs a diff on two input HTML files (outputting the result in HTML as well). It's built for speed, using
|
4
|
+
tried-and-true UNIX diff as the LCS algorithm. The implementation works directly on the DOM to ensure the output
|
5
|
+
always remains valid.
|
6
|
+
|
7
|
+
## Installation
|
8
|
+
|
9
|
+
Add this line to your application's Gemfile:
|
10
|
+
|
11
|
+
gem 'fast_html_diff'
|
12
|
+
|
13
|
+
And then execute:
|
14
|
+
|
15
|
+
$ bundle
|
16
|
+
|
17
|
+
Or install it yourself as:
|
18
|
+
|
19
|
+
$ gem install fast_html_diff
|
20
|
+
|
21
|
+
## Usage
|
22
|
+
|
23
|
+
Basic usage:
|
24
|
+
|
25
|
+
result_html_str = FastHtmlDiff::DiffBuilder.new(string_a,string_b).build
|
26
|
+
|
27
|
+
With options (see below for details):
|
28
|
+
|
29
|
+
result_html_str = FastHtmlDiff::DiffBuilder.new(string_a,string_b,
|
30
|
+
simplify_html: true, try_hard: true).build
|
31
|
+
|
32
|
+
## Options
|
33
|
+
|
34
|
+
* **ignore_punctuation:** boolean [default: true]
|
35
|
+
* **case_insensitive:** boolean [default: true]
|
36
|
+
* **tokenizer_regexp:** regexp [default: %r{([^A-Za-z0-9]+)}]. Make sure to include the outer parentheses. This option overrides any "ignore_punctuation" setting.
|
37
|
+
* **diff_cmd:** str [default: 'diff']. May be useful if you only have diff available through cygwin or a Windows port.
|
38
|
+
* **try_hard:** boolean [default: false]. Try hard to find smaller-length matches (at a bit of a performance cost).
|
39
|
+
* **simplify_html:** boolean [default: false]. Strips HTML to only the permitted tags, giving better output format where the structure of the two inputs differ greatly.
|
40
|
+
* **simplified_html_tags:** array of strings [default: %w(html body p strong em ul ol li)]
|
41
|
+
|
42
|
+
## Styling
|
43
|
+
|
44
|
+
Insertions are wrapped in **<ins>**; deletions are wrapped in **<del>**. Add the following CSS for much nicer-looking output:
|
45
|
+
|
46
|
+
ins {
|
47
|
+
text-decoration: none;
|
48
|
+
background-color: #a3ffad;
|
49
|
+
}
|
50
|
+
del {
|
51
|
+
color: #ff5d5a;
|
52
|
+
background-color: #b4ecff;
|
53
|
+
}
|
54
|
+
|
55
|
+
## License
|
56
|
+
|
57
|
+
(c) 2013, Kent Mewhort, licensed under BSD. See LICENSE.txt for details.
|
58
|
+
|
data/Rakefile
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require "bundler/gem_tasks"
|
data/fast_html_diff.gemspec
ADDED
@@ -0,0 +1,24 @@
|
|
1
|
+
# coding: utf-8
# Gem specification for fast_html_diff.

# Put this checkout's lib/ directory on the load path so the version file
# below resolves without the gem being installed.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'fast_html_diff/version'

Gem::Specification.new do |spec|
  spec.name = "fast_html_diff"
  spec.version = FastHtmlDiff::VERSION
  spec.authors = ["Kent Mewhort"]
  spec.email = ["kent@openissues.ca"]
  spec.description = %q{Performs a diff on two HTML inputs, outputting the result as HTML.}
  spec.summary = %q{Performs a diff on two HTML inputs, outputting the result as HTML.}
  spec.homepage = "https://github.com/kmewhort/fast_html_diff"
  spec.license = "BSD"

  # The packaged file list is whatever git tracks, so building the gem
  # shells out to `git ls-files`; $/ is the input record separator.
  spec.files = `git ls-files`.split($/)
  # Anything under bin/ is exposed as an executable, by base name.
  spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  spec.add_development_dependency "bundler", "~> 1.3"
  spec.add_development_dependency "rake"
  # nokogiri is the only runtime dependency declared here.
  spec.add_runtime_dependency "nokogiri"
end
|
data/lib/fast_html_diff.rb
ADDED
@@ -0,0 +1,443 @@
|
|
1
|
+
require "fast_html_diff/version"
require 'nokogiri'
require 'tempfile'
|
3
|
+
|
4
|
+
module FastHtmlDiff
|
5
|
+
class DiffBuilder
|
6
|
+
# Sets up a diff between two HTML strings.
#
# html_str_a - the "before" HTML, as a string
# html_str_b - the "after" HTML, as a string
# config     - optional hash of overrides merged over default_config
def initialize(html_str_a, html_str_b, config = {})
  @a, @b = html_str_a, html_str_b
  @config = default_config.merge(config)

  # Look at the caller's hash rather than the merged one: the merged config
  # always carries a default tokenizer, and only an explicitly supplied
  # regexp should take precedence over the ignore_punctuation switch.
  if config[:tokenizer_regexp].nil?
    @config[:tokenizer_regexp] =
      @config[:ignore_punctuation] ? %r{([^A-Za-z0-9]+)} : %r{(\s+)}
  end

  # working state filled in by build
  @word_list = {}
  @insertions = []
  @deletions = []
  @split_nodes = {}
  @insertion_nodes = {}
end
|
25
|
+
|
26
|
+
# Runs the whole pipeline: parse both inputs, optionally simplify them,
# index their words, diff the word lists, then mark the changes in
# document a. Returns document a serialized with to_html.
def build
  # parse (a first, then b), replacing the raw strings with documents
  @a, @b = [@a, @b].map { |html| Nokogiri::HTML(html) }

  [@a, @b].each { |doc| simplify_html(doc) } if @config[:simplify_html]

  # tokenize and index
  { a: @a, b: @b }.each { |doc_name, doc| index_document(doc, doc_name) }

  # find the insertions and deletions
  diff_words

  # update doc a with tags for the insertions and deletions
  update_dom
  @a.to_html
end
|
44
|
+
|
45
|
+
private
|
46
|
+
|
47
|
+
# index the words in the document
|
48
|
+
# Builds @word_list[doc_name]: one entry per word found in the text nodes
# returned by doc.xpath('//text()').
#
# Each entry records the text node, the (optionally downcased) word used
# for comparison, the word's start/end character offsets within the node,
# and the separator characters seen since the previous word.
def index_document(doc, doc_name)
  words = @word_list[doc_name] = []
  tokenizer = @config[:tokenizer_regexp]
  fold_case = @config[:case_insensitive]

  # separator text waiting to be attached to the next word; deliberately
  # kept across text nodes
  pending_separator = ""
  doc.xpath('//text()').each do |text_node|
    offset = 0
    word_token = true
    text_node.content.split(tokenizer).each_with_index do |token, token_index|
      # The tokenizer's capture group makes split return words and
      # separators alternately. The first two tokens are inspected to see
      # which kind comes first; after that the kind simply alternates.
      word_token =
        if token_index < 2
          !token.empty? && !token.match(tokenizer)
        else
          !word_token
        end

      if word_token
        words << {
          node: text_node,
          index_word: (fold_case ? token.downcase : token),
          start_pos: offset,
          end_pos: offset + token.length,
          preceding_chars: pending_separator
        }
        pending_separator = ""
      elsif !token.empty?
        pending_separator = token
      end
      offset += token.length
    end
  end
end
|
80
|
+
|
81
|
+
# Finds the word-level insertions and deletions that turn word list :a
# into word list :b.
#
# The index words of each list are written, one per line, to a temp file
# and the external diff command (@config[:diff_cmd]) is run over the two
# files as a quick, natively-run LCS algorithm. The unified output is then
# mapped back onto word indexes:
#
# * @insertions gets one hash per run of added words, with :a_position
#   (index of the last :a word consumed before the run, so the run goes
#   in before the current :a word), :b_start / :b_end (inclusive word
#   range in :b), :prev_operation and, once known, :next_operation.
# * @deletions gets one hash per run of removed words, with :a_start /
#   :a_end (inclusive word range in :a), :prev_operation and, once known,
#   :next_operation.
#
# Operations are :none, :match, :insertion or :deletion.
#
# Requires the stdlib 'tempfile' library to be loaded.
def diff_words
  file_a = file_b = nil
  diff_result = nil
  begin
    file_a = Tempfile.new('fast_html_diff_a')
    file_a.write @word_list[:a].map { |w| w[:index_word] }.join("\n") + "\n"
    file_a.close

    file_b = Tempfile.new('fast_html_diff_b')
    file_b.write @word_list[:b].map { |w| w[:index_word] }.join("\n") + "\n"
    file_b.close

    # a huge context size keeps every word in the output, so line order in
    # the diff mirrors word order in the lists
    diff_args = "-U 100000" + (@config[:try_hard] ? ' -d' : '')
    diff_result = `#{@config[:diff_cmd]} #{diff_args} #{file_a.path} #{file_b.path}`
  ensure
    # either handle is still nil if Tempfile.new itself raised; close!
    # closes and unlinks in one step
    [file_a, file_b].each { |file| file.close! if file }
  end

  # remap output back to the indexed word list
  doca_i = 0
  docb_i = 0
  prev_operation = :none
  in_hunk = false
  diff_result.each_line do |line|
    # The "---"/"+++" file headers only occur before the first hunk
    # header. Recognising them by position rather than by prefix keeps a
    # deleted word such as "--x" (printed as "---x") from being dropped.
    if line.start_with?('@@')
      in_hunk = true
      next
    end
    next unless in_hunk
    next if line.start_with?('\\') # "\ No newline at end of file" marker

    case line[0]
    when '+'
      if prev_operation == :insertion
        @insertions.last[:b_end] = docb_i
      else
        if prev_operation == :deletion
          @deletions.last[:next_operation] = :insertion
        end

        @insertions << {
          a_position: doca_i - 1, # insert before the current word
          b_start: docb_i,
          b_end: docb_i,
          prev_operation: prev_operation
        }
        prev_operation = :insertion
      end
      docb_i += 1
    when '-'
      if prev_operation == :deletion
        @deletions.last[:a_end] = doca_i
      else
        if prev_operation == :insertion
          # the operation following that insertion is this deletion
          @insertions.last[:next_operation] = :deletion
        end

        @deletions << {
          a_start: doca_i,
          a_end: doca_i,
          prev_operation: prev_operation
        }
        prev_operation = :deletion
      end
      doca_i += 1
    else
      if prev_operation == :insertion
        @insertions.last[:next_operation] = :match
      elsif prev_operation == :deletion
        @deletions.last[:next_operation] = :match
      end

      prev_operation = :match
      doca_i += 1
      docb_i += 1
    end
    # The counters are intentionally not clamped to the last index: once
    # every :a word is consumed doca_i equals the list length, so a
    # trailing insertion gets a_position == length - 1 (the last word).
  end
end
|
159
|
+
|
160
|
+
# mark insertions and deletions in doc a
|
161
|
+
# Applies the recorded @insertions and @deletions to document a: inserted
# content is copied over from document b with its text wrapped in <ins>,
# and deleted text in document a is wrapped in <del>.
def update_dom
  # prepare the nodes to insert before making any modifications
  @insertions.map! do |insertion|
    prepare_insertion(insertion)
  end

  # perform the insertions
  @insertions.each do |insertion|
    # if the insertion point's parent is the same type as the cca, merge the children
    # together; otherwise, insert the cca wholesale

    # TODO: handle case where a_position is -1 (insertion before start of document)

    # add whole nodes as-is and wrap partial nodes in a span
    additional_node = nil
    # An insertion "touches" a node boundary when the neighbouring word in
    # b is missing or lives in a different text node.
    # NOTE(review): when b_start is 0 the index b_start-1 is -1, which in
    # Ruby addresses the LAST word rather than returning nil - confirm
    # that is the intended behaviour.
    touches_node_start = @word_list[:b][insertion[:b_start]-1].nil? ||
        (@word_list[:b][insertion[:b_start]-1][:node] != @word_list[:b][insertion[:b_start]][:node])
    touches_node_end = @word_list[:b][insertion[:b_end]+1].nil? ||
        (@word_list[:b][insertion[:b_end]+1][:node] != @word_list[:b][insertion[:b_end]][:node])
    if touches_node_start && touches_node_end
      additional_node = insertion[:new_nodes]

      # bump the end char past whitespace/punctuation
      unless @word_list[:b][insertion[:b_end]+1].nil?
        insertion[:insertion_char_index] += @word_list[:b][insertion[:b_end]+1][:preceding_chars].length
      end
    else
      # partial node: move the cloned content into a neutral <span>
      additional_node = Nokogiri::XML::Node.new('span', @a)
      if insertion[:new_nodes].children.length > 0
        insertion[:new_nodes].children.each {|c| additional_node.add_child(c) }
      else
        additional_node.add_child(insertion[:new_nodes])
      end
    end
    # remember the node so modify_each_node_between skips it later
    @insertion_nodes[additional_node] = true

    # insertions need to wrap around the text nodes
    # NOTE(review): the <ins> wrapper is re-attached with add_child, i.e.
    # presumably as the parent's last child - verify ordering when the
    # parent has several children.
    additional_node.search('text()').each do |text_node|
      parent = text_node.parent
      wrapper = Nokogiri::XML::Node.new('ins', @a)
      wrapper.add_child(text_node)
      parent.add_child(wrapper)
    end

    # split the insertion point node (if necessary) and insert the new nodes
    # (start and end offsets are equal, so the block replaces a zero-width
    # fragment with the prepared node)
    modify_each_node_between(insertion[:insertion_point_node],
      insertion[:insertion_char_index], insertion[:insertion_char_index]) do |n|
      additional_node
    end
  end

  @deletions.each do |deletion|
    # first and last deleted word give the node/offset bounds of the run
    start_node = @word_list[:a][deletion[:a_start]][:node]
    start_char = @word_list[:a][deletion[:a_start]][:start_pos]
    end_node = @word_list[:a][deletion[:a_end]][:node]
    end_char = @word_list[:a][deletion[:a_end]][:end_pos]

    # wrap deletions in del tags just above each text node (so as to preserve
    # the original formatting)
    prev_node = cur_node = nil
    for word_i in deletion[:a_start]..deletion[:a_end]
      cur_node = @word_list[:a][word_i][:node]
      # handle each distinct text node once, however many words it holds
      if cur_node != prev_node
        # nodes strictly between the first and last are wrapped in full
        first = (cur_node == start_node) ? start_char : 0
        last = (cur_node == end_node) ? end_char : cur_node.content.length
        modify_each_node_between(cur_node, first, last) do |n|
          wrapper = Nokogiri::XML::Node.new('del', @a)
          wrapper.add_child(n)
          wrapper
        end
      end
      prev_node = cur_node
    end
  end
end
|
236
|
+
|
237
|
+
# build the exact DOM tree for an insertion
|
238
|
+
# Builds the DOM fragment for one insertion without touching document a.
#
# insertion - a hash produced by diff_words (:a_position, :b_start,
#             :b_end, :prev_operation, ...)
#
# Returns a new hash: the given insertion merged with :new_nodes (a
# trimmed copy of the part of document b covering the inserted words),
# :insertion_point_node (the text node in a holding word :a_position) and
# :insertion_char_index (that word's end offset within the node).
def prepare_insertion(insertion)
  start_node = @word_list[:b][insertion[:b_start]][:node]
  start_char = @word_list[:b][insertion[:b_start]][:start_pos]
  end_node = @word_list[:b][insertion[:b_end]][:node]
  end_char = @word_list[:b][insertion[:b_end]][:end_pos]

  # find the closest common ancestor of the start and end, and clone this portion
  # NOTE(review): relies on dup copying the whole subtree - confirm
  # against the Nokogiri version in use.
  cca = (start_node.ancestors & end_node.ancestors).first
  cca_clone = cca.dup

  # find the start node in the clone by retracing the path
  # (the path is the list of child indexes from the cca down to the node)
  path_to_cca = []
  target_node = start_node
  until target_node == cca
    path_to_cca.unshift target_node.parent.children.index(target_node)
    target_node = target_node.parent
  end
  start_node = cca_clone
  path_to_cca.each {|i| start_node = start_node.children[i]}

  # find the end node in the clone by retracing the path
  path_to_cca = []
  target_node = end_node
  until target_node == cca
    path_to_cca.unshift target_node.parent.children.index(target_node)
    target_node = target_node.parent
  end
  end_node = cca_clone
  path_to_cca.each {|i| end_node = end_node.children[i]}

  # from here on start_node and end_node refer to nodes inside the clone

  # trim away NODES up the tree that fall to the left of the start
  # or to the right of the end
  left_node = start_node
  while left_node != cca_clone
    siblings = left_node.parent.children
    self_index = siblings.index(left_node)
    unless self_index == 0
      left_of_self = siblings.slice(0..(self_index-1))
      left_of_self.each {|n| n.remove} unless left_of_self.nil?
    end
    left_node = left_node.parent
  end

  right_node = end_node
  while right_node != cca_clone
    siblings = right_node.parent.children
    self_index = siblings.index(right_node)
    right_of_self = siblings.slice((self_index+1)..-1)
    right_of_self.each {|n| n.remove} unless right_of_self.nil?
    right_node = right_node.parent
  end

  # trim away the TEXT that falls to the left of the start or to the right of
  # the end; also include the preceding characters to the insertion
  # (the end is cut first so that start_char is still a valid offset when
  # both positions fall in the same text node)
  end_node.content = end_node.content[0..(end_char-1)]
  start_node.content = start_node.content[start_char..-1]

  # unless there's a deletion immediately before, include the preceding chars in the insertion
  unless (insertion[:prev_operation] == :deletion) || (insertion[:b_start] <= 0)
    start_node.content = @word_list[:b][insertion[:b_start]][:preceding_chars] + start_node.content
  end
  #unless (insertion[:next_operation] == :deletion) || (insertion[:b_end] >= @word_list[:b].length-1)
  #  end_node.content += @word_list[:b][insertion[:b_end]+1][:preceding_chars]
  #end

  # NOTE(review): an a_position of -1 indexes the last word of a (Ruby
  # negative index) - see the TODO in update_dom.
  insertion_data = {
    new_nodes: cca_clone,
    insertion_point_node: @word_list[:a][insertion[:a_position]][:node],
    insertion_char_index: @word_list[:a][insertion[:a_position]][:end_pos]
  }
  insertion.merge insertion_data
end
|
310
|
+
|
311
|
+
# splits nodes (if necessary) between the specified character positions
|
312
|
+
# and runs the block for each node between the start and end
|
313
|
+
# Splits a text node (if necessary) at start_char and end_char and yields
# every resulting fragment that lies between the two positions.
#
# node       - an original text node from document a; it is the key under
#              which the fragments it has been split into are remembered
#              in @split_nodes
# start_char - character offset where the affected range begins
# end_char   - character offset where the affected range ends
#
# Offsets are counted over the contents of the node's fragments in order,
# ignoring nodes registered in @insertion_nodes.
#
# The block receives each fragment inside the range and returns the node
# that should stand in its place (the fragment itself, a wrapper around
# it, or a different node).
#
# Returns the updated fragment list, which is also stored in
# @split_nodes[node].
def modify_each_node_between(node, start_char, end_char)
  # start from the fragments left by earlier calls, if any
  prev_node_set = nil
  if @split_nodes[node].nil?
    prev_node_set = [node]
  else
    prev_node_set = @split_nodes[node]
  end

  # skip over inserted nodes, as they're not included in the character
  # counts (and there's no further operations on them)
  prev_node_set.delete_if {|n| @insertion_nodes[n] }

  new_node_set = []           # every fragment, in order, after splitting
  inside_nodes = []           # fragments that fall within the range
  insertion_queue = Hash.new  # detached fragment => node to attach it after
  cur_char = 0                # offset at which the current fragment starts
  start_trimmed = false
  end_trimmed = false
  prev_node_set.each do |n|
    cur_node = n
    new_node_set << cur_node
    node_end_char = cur_char + cur_node.content.length

    # split node at the start_char
    unless start_trimmed
      if start_char > node_end_char
        # range starts in a later fragment
        cur_char = node_end_char
        next
      else
        if start_char == cur_char
          start_trimmed = true
        else # start_char between cur_char and node_end_char
          after_node = cur_node.dup
          cur_node.content = after_node.content[0..(start_char-cur_char-1)]
          after_node.content = after_node.content[(start_char-cur_char)..-1]
          insertion_queue[after_node] = cur_node # don't actually add_next_sibling yet, as Nokogiri will merge them
          start_trimmed = true

          # continue with the part after the split point
          cur_char += cur_node.content.length
          cur_node = after_node
          new_node_set << cur_node
        end
      end
    end

    # split node at the end_char
    unless end_trimmed || !start_trimmed
      inside_nodes << cur_node
      if end_char > node_end_char
        # range continues into the next fragment
        cur_char = node_end_char
        next
      elsif end_char == node_end_char
        end_trimmed = true
        cur_char = node_end_char
        next
      else # end_char < node_end_char
        after_node = cur_node.dup
        if (end_char-cur_char) > 0
          cur_node.content = after_node.content[0..(end_char-cur_char-1)]
          after_node.content = after_node.content[(end_char-cur_char)..-1]
        else
          # zero-width range: the inside fragment is empty and the copy
          # keeps the whole text
          cur_node.content = ""
        end
        insertion_queue[after_node] = cur_node
        end_trimmed = true

        new_node_set << after_node
      end
    end
    cur_char = node_end_char
  end
  # second pass: attach the detached fragments and let the block replace
  # the fragments inside the range
  new_node_set.map! do |node_in_set|
    insert_after = insertion_queue[node_in_set]
    if inside_nodes.include?(node_in_set) && block_given?
      modified_node = nil
      if !insert_after.nil?
        # fragment is not in the tree yet: attach the block's result
        modified_node = yield node_in_set
        insert_after.add_next_sibling(modified_node)
      else
        # fragment is already in the tree: note its position before the
        # block runs
        node_parent = node_in_set.parent
        node_position = node_parent.children.index(node_in_set)
        modified_node = yield node_in_set

        # if the actual node has changed, need to rehook to parent (assume the original has been removed)
        if modified_node != node_in_set
          if node_parent.children.length > node_position
            node_parent.children[node_position].add_previous_sibling(modified_node)
          else
            node_parent.add_child(modified_node)
          end
        end
      end

      # also need to update the insertion queue if a node referenced by
      # another has changed
      if modified_node != node_in_set
        insertion_queue.each do |floating_node, target_node|
          if target_node == node_in_set
            insertion_queue[floating_node] = modified_node
          end
        end
      end
      modified_node
    else
      # outside the range (or no block given): just attach if pending
      insert_after.add_next_sibling(node_in_set) unless insert_after.nil?
      node_in_set
    end
  end

  @split_nodes[node] = new_node_set
end
|
424
|
+
|
425
|
+
# Strips the document down to the tags listed in
# @config[:simplified_html_tags]: every other element is replaced by its
# own children, so its content stays in place.
def simplify_html(html)
  permitted_selector = @config[:simplified_html_tags].join(',')
  every_element = html.css('*')
  permitted = html.css(permitted_selector)

  disallowed = every_element - permitted
  disallowed.each { |element| element.replace(element.children) }
end
|
430
|
+
|
431
|
+
# Returns a fresh hash holding the default value of every supported option.
def default_config
  defaults = {}

  # word comparison
  defaults[:ignore_punctuation] = true
  defaults[:case_insensitive] = true
  defaults[:tokenizer_regexp] = /([^A-Za-z0-9]+)/ # overrides any ignore_punctuation setting

  # external diff invocation
  defaults[:diff_cmd] = 'diff'
  defaults[:try_hard] = false

  # optional HTML simplification
  defaults[:simplify_html] = false
  defaults[:simplified_html_tags] = %w[html body p strong em ul ol li]

  defaults
end
|
442
|
+
end
|
443
|
+
end
|
metadata
ADDED
@@ -0,0 +1,94 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: fast_html_diff
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: '0.8'
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Kent Mewhort
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2013-06-30 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: bundler
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ~>
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '1.3'
|
20
|
+
type: :development
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ~>
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '1.3'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: rake
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ! '>='
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ! '>='
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: nokogiri
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ! '>='
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :runtime
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ! '>='
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
description: Performs a diff on two HTML inputs, outputting the result as HTML.
|
56
|
+
email:
|
57
|
+
- kent@openissues.ca
|
58
|
+
executables: []
|
59
|
+
extensions: []
|
60
|
+
extra_rdoc_files: []
|
61
|
+
files:
|
62
|
+
- .gitignore
|
63
|
+
- Gemfile
|
64
|
+
- LICENSE.txt
|
65
|
+
- README.md
|
66
|
+
- Rakefile
|
67
|
+
- fast_html_diff.gemspec
|
68
|
+
- lib/fast_html_diff.rb
|
69
|
+
- lib/fast_html_diff/version.rb
|
70
|
+
homepage: https://github.com/kmewhort/fast_html_diff
|
71
|
+
licenses:
|
72
|
+
- BSD
|
73
|
+
metadata: {}
|
74
|
+
post_install_message:
|
75
|
+
rdoc_options: []
|
76
|
+
require_paths:
|
77
|
+
- lib
|
78
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - ! '>='
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0'
|
83
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
84
|
+
requirements:
|
85
|
+
- - ! '>='
|
86
|
+
- !ruby/object:Gem::Version
|
87
|
+
version: '0'
|
88
|
+
requirements: []
|
89
|
+
rubyforge_project:
|
90
|
+
rubygems_version: 2.0.3
|
91
|
+
signing_key:
|
92
|
+
specification_version: 4
|
93
|
+
summary: Performs a diff on two HTML inputs, outputting the result as HTML.
|
94
|
+
test_files: []
|