htmldiff 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +20 -0
- data/README +15 -0
- data/Rakefile +57 -0
- data/TODO +0 -0
- data/lib/htmldiff.rb +316 -0
- data/spec/htmldiff_spec.rb +32 -0
- data/spec/spec_helper.rb +2 -0
- metadata +63 -0
data/LICENSE
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2008 Nathan Herald
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
class Stuff
|
2
|
+
|
3
|
+
class << self
|
4
|
+
include HTMLDiff
|
5
|
+
end
|
6
|
+
|
7
|
+
# or, equivalently, replace the `class << self` block above with: extend HTMLDiff
|
8
|
+
|
9
|
+
end
|
10
|
+
|
11
|
+
Stuff.diff('a word is here', 'a nother word is there')
|
12
|
+
|
13
|
+
# => "a<ins class=\"diffins\"> nother</ins> word is <del class=\"diffmod\">here</del><ins class=\"diffmod\">there</ins>"
|
14
|
+
|
15
|
+
Check out the specs for more examples.
|
data/Rakefile
ADDED
@@ -0,0 +1,57 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'rake/gempackagetask'
|
3
|
+
require 'rubygems/specification'
|
4
|
+
require 'date'
|
5
|
+
require 'spec/rake/spectask'
|
6
|
+
|
7
|
+
# Gem identity and descriptive metadata shared by the specification and the
# tasks defined below.
GEM         = "htmldiff"
GEM_VERSION = "0.0.1"
AUTHOR      = "Nathan Herald"
EMAIL       = "nathan@myobie.com"
HOMEPAGE    = "http://github.com/myobie/htmldiff"
SUMMARY     = "HTML diffs of text (borrowed from a wiki software I no longer remember)"

# The gem specification; used by the packaging task and by :make_spec.
gemspec = Gem::Specification.new do |gem|
  gem.name     = GEM
  gem.version  = GEM_VERSION
  gem.platform = Gem::Platform::RUBY

  gem.has_rdoc         = true
  gem.extra_rdoc_files = ["README", "LICENSE", 'TODO']

  gem.summary     = SUMMARY
  gem.description = gem.summary

  gem.author   = AUTHOR
  gem.email    = EMAIL
  gem.homepage = HOMEPAGE

  # Uncomment this to add a dependency
  # gem.add_dependency "foo"

  gem.require_path = 'lib'
  gem.autorequire  = GEM
  gem.files        = %w(LICENSE README Rakefile TODO) + Dir.glob("{lib,spec}/**/*")
end

# Running plain `rake` runs the specs.
task :default => :spec

desc "Run specs"
Spec::Rake::SpecTask.new do |runner|
  runner.spec_files = FileList['spec/**/*_spec.rb']
  runner.spec_opts  = %w(-fs --color)
end

# Defines the :package task (and friends) that builds the gem under pkg/.
Rake::GemPackageTask.new(gemspec) { |pkg| pkg.gem_spec = gemspec }

desc "install the gem locally"
task :install => [:package] do
  sh %{sudo gem install pkg/#{GEM}-#{GEM_VERSION}}
end

desc "create a gemspec file"
task :make_spec do
  # Serialise the specification above as Ruby source into <gem name>.gemspec.
  File.open("#{GEM}.gemspec", "w") { |file| file.puts gemspec.to_ruby }
end
|
data/TODO
ADDED
File without changes
|
data/lib/htmldiff.rb
ADDED
@@ -0,0 +1,316 @@
|
|
1
|
+
# HTMLDiff renders a word-level diff of two texts (which may contain HTML tags)
# as a single HTML string: words only present in the new version are wrapped in
# <ins>, words only present in the old version in <del>, and unchanged words
# are copied through untouched.
#
# Mix the module in (+include+ or +extend+) and call #diff.
module HTMLDiff

  # A run of +size+ consecutive words that is identical in both versions,
  # starting at index +start_in_old+ of the old word list and at index
  # +start_in_new+ of the new word list.
  Match = Struct.new(:start_in_old, :start_in_new, :size)
  class Match
    # Index just past the last matched word in the old word list.
    def end_in_old
      start_in_old + size
    end

    # Index just past the last matched word in the new word list.
    def end_in_new
      start_in_new + size
    end
  end

  # One step of the edit script. +action+ is one of DiffBuilder::VALID_METHODS;
  # the other members are half-open index ranges [start, end) into the old and
  # new word lists.
  Operation = Struct.new(:action, :start_in_old, :end_in_old, :start_in_new, :end_in_new)

  # Splits both versions into words, finds the blocks they have in common and
  # emits the HTML for everything in between.
  class DiffBuilder

    # The only actions #perform_operation is willing to dispatch to.
    VALID_METHODS = [:replace, :insert, :delete, :equal].freeze

    # old_version / new_version: a String, an Array of single-character
    # strings, or nil (treated as empty text). The inputs are never modified.
    def initialize(old_version, new_version)
      @old_version, @new_version = old_version, new_version
      @content = []
    end

    # Runs the whole diff and returns the resulting HTML as one String.
    def build
      # Start from a clean slate so that calling #build again returns the same
      # string instead of appending a second copy of the diff.
      @content = []
      split_inputs_to_words
      index_new_words
      operations.each { |op| perform_operation(op) }
      @content.join
    end

    # Tokenizes both versions into @old_words and @new_words.
    def split_inputs_to_words
      @old_words = convert_html_to_list_of_words(@old_version)
      @new_words = convert_html_to_list_of_words(@new_version)
    end

    # Builds @word_indices: word => ascending list of its positions in @new_words.
    def index_new_words
      @word_indices = Hash.new { |h, word| h[word] = [] }
      @new_words.each_with_index { |word, i| @word_indices[word] << i }
    end

    # Turns the matching blocks into an ordered list of Operation structs that
    # covers both word lists from start to end. Requires #split_inputs_to_words
    # and #index_new_words to have run.
    def operations
      position_in_old = position_in_new = 0
      result = []

      matches = matching_blocks
      # An empty match at the very end makes the loop below also emit the
      # operation for whatever follows the last real match.
      matches << Match.new(@old_words.length, @new_words.length, 0)

      matches.each do |match|
        old_has_gap = (position_in_old != match.start_in_old)
        new_has_gap = (position_in_new != match.start_in_new)

        # Words skipped on both sides were replaced, words skipped only in the
        # new version were inserted, words skipped only in the old version
        # were deleted. No gap at all means there is nothing to report.
        action =
          if old_has_gap && new_has_gap
            :replace
          elsif new_has_gap
            :insert
          elsif old_has_gap
            :delete
          end

        if action
          result << Operation.new(action,
                                  position_in_old, match.start_in_old,
                                  position_in_new, match.start_in_new)
        end

        if match.size != 0
          result << Operation.new(:equal,
                                  match.start_in_old, match.end_in_old,
                                  match.start_in_new, match.end_in_new)
        end

        position_in_old = match.end_in_old
        position_in_new = match.end_in_new
      end

      result
    end

    # Returns all Match blocks for the complete word lists, in document order.
    def matching_blocks
      blocks = []
      recursively_find_matching_blocks(0, @old_words.size, 0, @new_words.size, blocks)
      blocks
    end

    # Finds the best match inside the given window, then recurses into the
    # windows to its left and to its right. Matches are appended to
    # +matching_blocks+ in ascending position order.
    def recursively_find_matching_blocks(start_in_old, end_in_old, start_in_new, end_in_new, matching_blocks)
      match = find_match(start_in_old, end_in_old, start_in_new, end_in_new)
      return unless match

      if start_in_old < match.start_in_old && start_in_new < match.start_in_new
        recursively_find_matching_blocks(
          start_in_old, match.start_in_old, start_in_new, match.start_in_new, matching_blocks)
      end

      matching_blocks << match

      if match.end_in_old < end_in_old && match.end_in_new < end_in_new
        recursively_find_matching_blocks(
          match.end_in_old, end_in_old, match.end_in_new, end_in_new, matching_blocks)
      end
    end

    # Returns the longest run of identical consecutive words inside
    # old[start_in_old...end_in_old] and new[start_in_new...end_in_new] as a
    # Match, or nil when the two windows share no word. Of several equally long
    # runs the first one encountered wins.
    #
    # #add_matching_words_left and #add_matching_words_right are not called
    # from here.
    def find_match(start_in_old, end_in_old, start_in_new, end_in_new)
      best_match_in_old = start_in_old
      best_match_in_new = start_in_new
      best_match_size = 0

      # match_length_at[j]: length of the common run that ends at the previous
      # old word and at new word j. A plain default of 0 is used so that
      # lookups do not insert keys.
      match_length_at = Hash.new(0)

      start_in_old.upto(end_in_old - 1) do |index_in_old|
        new_match_length_at = Hash.new(0)

        @word_indices[@old_words[index_in_old]].each do |index_in_new|
          next if index_in_new < start_in_new
          # Positions are stored in ascending order, so nothing further fits.
          break if index_in_new >= end_in_new

          new_match_length = match_length_at[index_in_new - 1] + 1
          new_match_length_at[index_in_new] = new_match_length

          if new_match_length > best_match_size
            best_match_in_old = index_in_old - new_match_length + 1
            best_match_in_new = index_in_new - new_match_length + 1
            best_match_size = new_match_length
          end
        end

        match_length_at = new_match_length_at
      end

      return nil if best_match_size == 0
      Match.new(best_match_in_old, best_match_in_new, best_match_size)
    end

    # Extends a match to the left while the preceding words are equal and the
    # window start has not been reached. Returns [match_in_old, match_in_new,
    # match_size].
    def add_matching_words_left(match_in_old, match_in_new, match_size, start_in_old, start_in_new)
      while match_in_old > start_in_old &&
            match_in_new > start_in_new &&
            @old_words[match_in_old - 1] == @new_words[match_in_new - 1]
        match_in_old -= 1
        match_in_new -= 1
        match_size += 1
      end
      [match_in_old, match_in_new, match_size]
    end

    # Extends a match to the right while the following words are equal and the
    # window end has not been reached. Returns [match_in_old, match_in_new,
    # match_size].
    def add_matching_words_right(match_in_old, match_in_new, match_size, end_in_old, end_in_new)
      while match_in_old + match_size < end_in_old &&
            match_in_new + match_size < end_in_new &&
            @old_words[match_in_old + match_size] == @new_words[match_in_new + match_size]
        match_size += 1
      end
      [match_in_old, match_in_new, match_size]
    end

    # Dispatches +operation+ to the method named by its action.
    # Raises ArgumentError for any action outside VALID_METHODS, so that #send
    # can never reach an unrelated method.
    def perform_operation(operation)
      unless VALID_METHODS.include?(operation.action)
        raise ArgumentError, "Unknown operation #{operation.action.inspect}"
      end
      @operation = operation
      send(operation.action, operation)
    end

    # A replacement is rendered as the deletion followed by the insertion,
    # both carrying the 'diffmod' CSS class.
    def replace(operation)
      delete(operation, 'diffmod')
      insert(operation, 'diffmod')
    end

    # Emits the new words of +operation+ wrapped in <ins>.
    def insert(operation, tagclass = 'diffins')
      insert_tag('ins', tagclass, @new_words[operation.start_in_new...operation.end_in_new])
    end

    # Emits the old words of +operation+ wrapped in <del>.
    def delete(operation, tagclass = 'diffdel')
      insert_tag('del', tagclass, @old_words[operation.start_in_old...operation.end_in_old])
    end

    # No tags to insert: copies the matching words from the new version.
    def equal(operation)
      @content.concat(@new_words[operation.start_in_new...operation.end_in_new])
    end

    # Truthy when +item+ consists of a single <...> tag, optionally surrounded
    # by whitespace. Note that the pattern accepts closing tags as well.
    def opening_tag?(item)
      item =~ %r!^\s*<[^>]+>\s*$!
    end

    # Truthy when +item+ consists of a single </...> closing tag, optionally
    # surrounded by whitespace.
    def closing_tag?(item)
      item =~ %r!^\s*</[^>]+>\s*$!
    end

    # Truthy when +item+ is an HTML tag of either kind.
    def tag?(item)
      opening_tag?(item) || closing_tag?(item)
    end

    # Removes the leading words of +words+ for which the block returns a truthy
    # value and returns them (possibly an empty Array). +words+ is modified in
    # place and keeps everything from the first rejected word onwards.
    def extract_consecutive_words(words, &condition)
      stop = words.index { |word| !condition.call(word) } || words.length
      words.slice!(0...stop)
    end

    # Encloses +words+ in the given tag (ins or del) and appends the result to
    # @content, with a twist: HTML tags among the words are emitted as they are
    # and only the text between them is wrapped, so the generated ins/del
    # elements never contain tags. This handles cases like
    #   old: '<p>a</p>'
    #   new: '<p>ab</p><p>c</p>'
    #   diff result: '<p><del>a</del><ins>ab</ins></p><p><ins>c</ins></p>'
    #   (class attributes omitted)
    # This still doesn't guarantee valid HTML (hint: think about diffing a text
    # that itself contains ins or del tags).
    #
    # The +words+ array passed in is left unchanged.
    def insert_tag(tagname, cssclass, words)
      remaining = words.dup
      until remaining.empty?
        non_tags = extract_consecutive_words(remaining) { |word| !tag?(word) }
        @content << wrap_text(non_tags.join, tagname, cssclass) unless non_tags.empty?
        @content.concat(extract_consecutive_words(remaining) { |word| tag?(word) })
      end
    end

    # Returns +text+ enclosed in <tagname class="cssclass">...</tagname>.
    def wrap_text(text, tagname, cssclass)
      %(<#{tagname} class="#{cssclass}">#{text}</#{tagname}>)
    end

    # Returns the characters of a String as an Array, an empty Array for nil,
    # and any other object (expected to be an Array of characters) unchanged.
    def explode(sequence)
      return [] if sequence.nil?
      sequence.is_a?(String) ? sequence.split(//) : sequence
    end

    def end_of_tag?(char)
      char == '>'
    end

    def start_of_tag?(char)
      char == '<'
    end

    def whitespace?(char)
      char =~ /\s/
    end

    # Splits +x+ into "words": complete tags (from '<' to the next '>'), runs
    # of whitespace, and runs of everything else. Joining the returned words
    # reproduces the input when +use_brackets+ is false.
    #
    # x::            String, Array of characters, or nil.
    # use_brackets:: when true, tags are delimited with '[' and ']' in the
    #                returned words instead of '<' and '>'.
    #
    # Every word is a newly built String, so neither +x+ nor its elements are
    # modified.
    def convert_html_to_list_of_words(x, use_brackets = false)
      tag_open, tag_close = use_brackets ? ['[', ']'] : ['<', '>']
      mode = :char
      current_word = ''.dup
      words = []

      explode(x).each do |char|
        case mode
        when :tag
          if end_of_tag?(char)
            current_word << tag_close
            words << current_word
            current_word = ''.dup
            # Whatever follows a tag is classified by the :char branch.
            mode = :char
          else
            current_word << char
          end
        when :char
          if start_of_tag?(char)
            words << current_word unless current_word.empty?
            current_word = tag_open.dup
            mode = :tag
          elsif whitespace?(char)
            words << current_word unless current_word.empty?
            # Copy the character: the word is extended with << below and must
            # not alias (or depend on the mutability of) the caller's string.
            current_word = char.dup
            mode = :whitespace
          else
            current_word << char
          end
        when :whitespace
          if start_of_tag?(char)
            words << current_word unless current_word.empty?
            current_word = tag_open.dup
            mode = :tag
          elsif whitespace?(char)
            current_word << char
          else
            words << current_word unless current_word.empty?
            current_word = char.dup
            mode = :char
          end
        else
          raise "Unknown mode #{mode.inspect}"
        end
      end

      words << current_word unless current_word.empty?
      words
    end

  end # of class DiffBuilder

  # Returns the HTML diff that turns text +a+ (old) into text +b+ (new).
  def diff(a, b)
    DiffBuilder.new(a, b).build
  end

end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/spec_helper'
|
2
|
+
require 'htmldiff'
|
3
|
+
|
4
|
+
# Minimal host object: extending HTMLDiff exposes #diff as TestDiff.diff for
# the examples below.
class TestDiff
  extend HTMLDiff
end

describe "htmldiff" do

  # A word added in the middle plus a changed last word.
  it "should diff text" do
    expected = "a<ins class=\"diffins\"> nother</ins> word is <del class=\"diffmod\">here</del><ins class=\"diffmod\">there</ins>"
    TestDiff.diff('a word is here', 'a nother word is there').should == expected
  end

  # The inserted word and the whitespace after it share one <ins>.
  it "should insert a letter and a space" do
    expected = "a <ins class=\"diffins\">b </ins>c"
    TestDiff.diff('a c', 'a b c').should == expected
  end

  # The removed word and the whitespace after it share one <del>.
  it "should remove a letter and a space" do
    expected = "a <del class=\"diffdel\">b </del>c"
    TestDiff.diff('a b c', 'a c').should == expected
  end

  # A replaced word yields a <del>/<ins> pair with the diffmod class.
  it "should change a letter" do
    expected = "a <del class=\"diffmod\">b</del><ins class=\"diffmod\">d</ins> c"
    TestDiff.diff('a b c', 'a d c').should == expected
  end

end
|
data/spec/spec_helper.rb
ADDED
metadata
ADDED
@@ -0,0 +1,63 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: htmldiff
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Nathan Herald
|
8
|
+
autorequire: htmldiff
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2008-11-21 00:00:00 +01:00
|
13
|
+
default_executable:
|
14
|
+
dependencies: []
|
15
|
+
|
16
|
+
description: HTML diffs of text (borrowed from a wiki software I no longer remember)
|
17
|
+
email: nathan@myobie.com
|
18
|
+
executables: []
|
19
|
+
|
20
|
+
extensions: []
|
21
|
+
|
22
|
+
extra_rdoc_files:
|
23
|
+
- README
|
24
|
+
- LICENSE
|
25
|
+
- TODO
|
26
|
+
files:
|
27
|
+
- LICENSE
|
28
|
+
- README
|
29
|
+
- Rakefile
|
30
|
+
- TODO
|
31
|
+
- lib/htmldiff.rb
|
32
|
+
- spec/htmldiff_spec.rb
|
33
|
+
- spec/spec_helper.rb
|
34
|
+
has_rdoc: true
|
35
|
+
homepage: http://github.com/myobie/htmldiff
|
36
|
+
licenses: []
|
37
|
+
|
38
|
+
post_install_message:
|
39
|
+
rdoc_options: []
|
40
|
+
|
41
|
+
require_paths:
|
42
|
+
- lib
|
43
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ">="
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: "0"
|
48
|
+
version:
|
49
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
50
|
+
requirements:
|
51
|
+
- - ">="
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: "0"
|
54
|
+
version:
|
55
|
+
requirements: []
|
56
|
+
|
57
|
+
rubyforge_project:
|
58
|
+
rubygems_version: 1.3.5
|
59
|
+
signing_key:
|
60
|
+
specification_version: 2
|
61
|
+
summary: HTML diffs of text (borrowed from a wiki software I no longer remember)
|
62
|
+
test_files: []
|
63
|
+
|