edouard-htmldiff 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2008 Nathan Herald
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README ADDED
@@ -0,0 +1,15 @@
1
+ class Stuff
2
+
3
+ class << self
4
+ include HTMLDiff
5
+ end
6
+
7
+ # or, equivalently, drop the `class << self` block and write `extend HTMLDiff` in the class body
8
+
9
+ end
10
+
11
+ Stuff.diff('a word is here', 'a nother word is there')
12
+
13
+ # => "a<ins class=\"diffins\"> nother</ins> word is <del class=\"diffmod\">here</del><ins class=\"diffmod\">there</ins>"
14
+
15
+ Check out the (admittedly crappy) specs in spec/htmldiff_spec.rb for more examples.
data/Rakefile ADDED
@@ -0,0 +1,57 @@
1
+ require 'rubygems'
2
+ require 'rake/gempackagetask'
3
+ require 'rubygems/specification'
4
+ require 'date'
5
+ require 'spec/rake/spectask'
6
+
7
# Gem identity shared by the specification and the tasks defined below.
GEM         = "htmldiff"
GEM_VERSION = "0.0.1"
AUTHOR      = "Nathan Herald"
EMAIL       = "nathan@myobie.com"
HOMEPAGE    = "http://github.com/myobie/htmldiff"
SUMMARY     = "HTML diffs of text (borrowed from a wiki software I no longer remember)"

# The gem specification; also handed to the packaging task and dumped by :make_spec.
spec = Gem::Specification.new do |gem|
  gem.name     = GEM
  gem.version  = GEM_VERSION
  gem.platform = Gem::Platform::RUBY

  gem.has_rdoc         = true
  gem.extra_rdoc_files = ["README", "LICENSE", 'TODO']

  gem.summary     = SUMMARY
  gem.description = gem.summary
  gem.author      = AUTHOR
  gem.email       = EMAIL
  gem.homepage    = HOMEPAGE

  # Uncomment this to add a dependency
  # gem.add_dependency "foo"

  gem.require_path = 'lib'
  gem.autorequire  = GEM
  gem.files        = ['LICENSE', 'README', 'Rakefile', 'TODO'] + Dir.glob("{lib,spec}/**/*")
end

# Running plain `rake` runs the specs.
task :default => :spec

desc "Run specs"
Spec::Rake::SpecTask.new do |runner|
  runner.spec_files = FileList['spec/**/*_spec.rb']
  runner.spec_opts  = ['-fs', '--color']
end

# Defines the packaging tasks (including :package, which :install depends on).
Rake::GemPackageTask.new(spec) do |package|
  package.gem_spec = spec
end

desc "install the gem locally"
task :install => [:package] do
  sh "sudo gem install pkg/#{GEM}-#{GEM_VERSION}"
end

desc "create a gemspec file"
task :make_spec do
  File.open("#{GEM}.gemspec", "w") { |gemspec| gemspec.puts spec.to_ruby }
end
data/TODO ADDED
File without changes
data/lib/htmldiff.rb ADDED
@@ -0,0 +1,340 @@
1
# Word-level, HTML-aware diffing.
#
# Mix this module into an object (+include+ / +extend+) and call +diff+ to get
# the new text with insertions wrapped in <ins> and deletions wrapped in <del>.
module HTMLDiff

  # A run of +size+ identical words that starts at +start_in_old+ in the old
  # word list and at +start_in_new+ in the new word list.
  Match = Struct.new(:start_in_old, :start_in_new, :size) do
    # Index one past the last matched word in the old word list.
    def end_in_old
      start_in_old + size
    end

    # Index one past the last matched word in the new word list.
    def end_in_new
      start_in_new + size
    end
  end

  # One step of the edit script. +action+ is one of DiffBuilder::VALID_METHODS;
  # the remaining members are half-open word ranges in the old and new lists.
  Operation = Struct.new(:action, :start_in_old, :end_in_old, :start_in_new, :end_in_new)

  # Builds the diff of two versions of a text.
  #
  # In single mode (+dual+ false) #build returns one HTML string. In dual mode
  # it returns <tt>[old_html, new_html]</tt>: deletions are marked only in the
  # first string and insertions only in the second.
  class DiffBuilder

    # Actions that #perform_operation is allowed to dispatch to.
    VALID_METHODS = [:replace, :insert, :delete, :equal].freeze

    # A whole-word tag that is not a closing tag (first char after '<' is not '/').
    OPENING_TAG = %r{\A\s*<[^/>][^>]*>\s*\z}
    # A whole-word closing tag such as "</p>".
    CLOSING_TAG = %r{\A\s*</[^>]+>\s*\z}

    # old_version, new_version:: Strings (split into characters) or already
    #                            exploded sequences of one-character strings;
    #                            +nil+ is treated as empty.
    # dual:: when true, #build returns separate old/new renderings.
    def initialize(old_version, new_version, dual = false)
      @old_version = old_version
      @new_version = new_version
      @dual = dual
      reset_content
    end

    # Runs the diff and returns the rendered HTML (a String, or a two-element
    # Array of Strings in dual mode). Safe to call more than once: the output
    # buffers are cleared first, so repeated calls return the same result.
    def build
      reset_content
      split_inputs_to_words
      index_new_words
      operations.each { |op| perform_operation(op) }
      @dual ? [@old_content.join, @new_content.join] : @content.join
    end

    # Tokenizes both versions into @old_words / @new_words.
    def split_inputs_to_words
      @old_words = convert_html_to_list_of_words(explode(@old_version))
      @new_words = convert_html_to_list_of_words(explode(@new_version))
    end

    # Maps every word of the new version to the ascending list of positions
    # at which it occurs; #find_match relies on that ordering.
    def index_new_words
      @word_indices = Hash.new { |h, word| h[word] = [] }
      @new_words.each_with_index { |word, i| @word_indices[word] << i }
    end

    # Turns the matching blocks into an ordered list of Operation structs
    # (:equal for matches; :insert, :delete or :replace for the gaps between).
    def operations
      position_in_old = position_in_new = 0
      ops = []

      matches = matching_blocks
      # An empty match at the very end makes the loop below also emit the
      # operation for whatever unmatched tail remains in either version.
      matches << Match.new(@old_words.length, @new_words.length, 0)

      matches.each do |match|
        old_is_aligned = (position_in_old == match.start_in_old)
        new_is_aligned = (position_in_new == match.start_in_new)

        gap_action =
          if old_is_aligned && new_is_aligned
            nil # nothing lies between the previous match and this one
          elsif old_is_aligned
            :insert
          elsif new_is_aligned
            :delete
          else
            :replace
          end

        if gap_action
          ops << Operation.new(gap_action,
                               position_in_old, match.start_in_old,
                               position_in_new, match.start_in_new)
        end

        unless match.size.zero?
          ops << Operation.new(:equal,
                               match.start_in_old, match.end_in_old,
                               match.start_in_new, match.end_in_new)
        end

        position_in_old = match.end_in_old
        position_in_new = match.end_in_new
      end

      ops
    end

    # Returns all matching blocks between the two word lists, in order.
    def matching_blocks
      blocks = []
      recursively_find_matching_blocks(0, @old_words.size, 0, @new_words.size, blocks)
      blocks
    end

    # Finds the longest match inside the given window, then recurses into the
    # windows to its left and right. Matches are appended to +matching_blocks+
    # in left-to-right order.
    def recursively_find_matching_blocks(start_in_old, end_in_old, start_in_new, end_in_new, matching_blocks)
      match = find_match(start_in_old, end_in_old, start_in_new, end_in_new)
      return unless match

      if start_in_old < match.start_in_old && start_in_new < match.start_in_new
        recursively_find_matching_blocks(
          start_in_old, match.start_in_old, start_in_new, match.start_in_new, matching_blocks)
      end

      matching_blocks << match

      if match.end_in_old < end_in_old && match.end_in_new < end_in_new
        recursively_find_matching_blocks(
          match.end_in_old, end_in_old, match.end_in_new, end_in_new, matching_blocks)
      end
    end

    # Returns the longest run of identical consecutive words within
    # old[start_in_old...end_in_old] and new[start_in_new...end_in_new] as a
    # Match, or nil when the two windows share no word. The first longest run
    # encountered (scanning the old window left to right) wins ties.
    def find_match(start_in_old, end_in_old, start_in_new, end_in_new)
      best_match_in_old = start_in_old
      best_match_in_new = start_in_new
      best_match_size = 0

      # match_length_at[j] is the length of the common run that ends at the
      # previously visited old word and at position j of the new words.
      match_length_at = Hash.new(0)

      start_in_old.upto(end_in_old - 1) do |index_in_old|
        new_match_length_at = Hash.new(0)

        @word_indices[@old_words[index_in_old]].each do |index_in_new|
          next if index_in_new < start_in_new
          break if index_in_new >= end_in_new # positions are ascending

          new_match_length = match_length_at[index_in_new - 1] + 1
          new_match_length_at[index_in_new] = new_match_length

          if new_match_length > best_match_size
            best_match_in_old = index_in_old - new_match_length + 1
            best_match_in_new = index_in_new - new_match_length + 1
            best_match_size = new_match_length
          end
        end

        match_length_at = new_match_length_at
      end

      # NOTE: #add_matching_words_left / #add_matching_words_right are not
      # applied here; the original code had those calls commented out.
      return nil if best_match_size.zero?
      Match.new(best_match_in_old, best_match_in_new, best_match_size)
    end

    # Extends a match to the left while the preceding words are equal and the
    # window start is not crossed. Returns [match_in_old, match_in_new, match_size].
    def add_matching_words_left(match_in_old, match_in_new, match_size, start_in_old, start_in_new)
      while match_in_old > start_in_old &&
            match_in_new > start_in_new &&
            @old_words[match_in_old - 1] == @new_words[match_in_new - 1]
        match_in_old -= 1
        match_in_new -= 1
        match_size += 1
      end
      [match_in_old, match_in_new, match_size]
    end

    # Extends a match to the right while the following words are equal and the
    # window end is not crossed. Returns [match_in_old, match_in_new, match_size].
    def add_matching_words_right(match_in_old, match_in_new, match_size, end_in_old, end_in_new)
      while match_in_old + match_size < end_in_old &&
            match_in_new + match_size < end_in_new &&
            @old_words[match_in_old + match_size] == @new_words[match_in_new + match_size]
        match_size += 1
      end
      [match_in_old, match_in_new, match_size]
    end

    # Dispatches +operation+ to the method named by its action.
    # Raises ArgumentError for any action outside VALID_METHODS, so a stray
    # Operation can never invoke an arbitrary method through +send+.
    def perform_operation(operation)
      unless VALID_METHODS.include?(operation.action)
        raise ArgumentError, "Unknown diff action #{operation.action.inspect}"
      end
      send(operation.action, operation)
    end

    # A replacement is rendered as a deletion followed by an insertion, both
    # carrying the 'diffmod' class.
    def replace(operation)
      delete(operation, 'diffmod')
      insert(operation, 'diffmod')
    end

    # Renders the new words of +operation+ inside <ins> tags.
    def insert(operation, tagclass = 'diffins')
      insert_tag('ins', tagclass,
                 @new_words[operation.start_in_new...operation.end_in_new],
                 @dual ? @new_content : @content)
    end

    # Renders the old words of +operation+ inside <del> tags.
    def delete(operation, tagclass = 'diffdel')
      insert_tag('del', tagclass,
                 @old_words[operation.start_in_old...operation.end_in_old],
                 @dual ? @old_content : @content)
    end

    # Copies unchanged words to the output without any markup.
    def equal(operation)
      unchanged_new = @new_words[operation.start_in_new...operation.end_in_new]
      if @dual
        @old_content.concat(@old_words[operation.start_in_old...operation.end_in_old])
        @new_content.concat(unchanged_new)
      else
        @content.concat(unchanged_new)
      end
    end

    # Truthy when +item+ is a single tag that is not a closing tag.
    def opening_tag?(item)
      item =~ OPENING_TAG
    end

    # Truthy when +item+ is a single closing tag such as "</p>".
    def closing_tag?(item)
      item =~ CLOSING_TAG
    end

    # Truthy when +item+ is a single opening or closing tag.
    def tag?(item)
      opening_tag?(item) || closing_tag?(item)
    end

    # Destructively removes and returns the leading words of +words+ for which
    # the block is truthy (all of them when the block never returns false).
    def extract_consecutive_words(words, &condition)
      boundary = words.index { |word| !condition.call(word) } || words.length
      words.slice!(0, boundary)
    end

    # Returns a copy of +opening_tag+ with a difftype="<type>" attribute added
    # just before the closing '>' (or before '/>' for self-closing tags).
    def add_special_attribute(opening_tag, type)
      opening_tag.sub(%r{(\s*/)?>}) { %( difftype="#{type}"#{Regexp.last_match(1)}>) }
    end

    # Encloses +words+ in the given tag (ins or del) and appends the result to
    # +content+, with a twist: if the words contain tags, several ins/del
    # elements are created so that none of them encloses a tag. This handles
    # cases like
    #   old: '<p>a</p>'
    #   new: '<p>ab</p><p>c</p>'
    #   diff result: '<p>a<ins>b</ins></p><p><ins>c</ins></p>'
    # Opening tags found among the words get a difftype attribute; closing
    # tags are copied verbatim. This still doesn't guarantee valid HTML (hint:
    # think about diffing a text that itself contains ins or del tags).
    #
    # +words+ is consumed (emptied) in the process.
    def insert_tag(tagname, cssclass, words, content)
      until words.empty?
        non_tags = extract_consecutive_words(words) { |word| !tag?(word) }
        content << wrap_text(non_tags.join, tagname, cssclass) unless non_tags.empty?

        while !words.empty? && tag?(words.first)
          tag = words.shift
          content << (opening_tag?(tag) ? add_special_attribute(tag, cssclass) : tag)
        end
      end
    end

    # Wraps +text+ in <tagname class="cssclass">...</tagname>.
    def wrap_text(text, tagname, cssclass)
      %(<#{tagname} class="#{cssclass}">#{text}</#{tagname}>)
    end

    # Splits a String into one-character strings. Any other sequence is
    # returned as-is, and nil is treated as an empty sequence.
    def explode(sequence)
      return [] if sequence.nil?
      sequence.is_a?(String) ? sequence.split(//) : sequence
    end

    def end_of_tag?(char)
      char == '>'
    end

    def start_of_tag?(char)
      char == '<'
    end

    def whitespace?(char)
      char =~ /\s/
    end

    # Tokenizes +x+ (a String or a sequence of characters) into "words": whole
    # tags, runs of whitespace, and runs of other characters. With
    # +use_brackets+ the tag delimiters are emitted as '[' and ']'.
    #
    # Characters are copied before being appended to, so strings supplied by
    # the caller are never modified.
    def convert_html_to_list_of_words(x, use_brackets = false)
      tag_open  = use_brackets ? '[' : '<'
      tag_close = use_brackets ? ']' : '>'

      mode = :char
      current_word = ''.dup
      words = []

      explode(x).each do |char|
        case mode
        when :tag
          if end_of_tag?(char)
            current_word << tag_close
            words << current_word
            current_word = ''.dup
            mode = :char
          else
            current_word << char
          end
        when :char
          if start_of_tag?(char)
            words << current_word unless current_word.empty?
            current_word = tag_open.dup
            mode = :tag
          elsif whitespace?(char)
            words << current_word unless current_word.empty?
            current_word = char.dup
            mode = :whitespace
          else
            current_word << char
          end
        when :whitespace
          if start_of_tag?(char)
            words << current_word unless current_word.empty?
            current_word = tag_open.dup
            mode = :tag
          elsif whitespace?(char)
            current_word << char
          else
            words << current_word unless current_word.empty?
            current_word = char.dup
            mode = :char
          end
        else
          raise "Unknown mode #{mode.inspect}"
        end
      end

      words << current_word unless current_word.empty?
      words
    end

    private

    # Clears the output buffers used by the current mode.
    def reset_content
      if @dual
        @new_content = []
        @old_content = []
      else
        @content = []
      end
    end

  end # of class DiffBuilder

  # Diffs +a+ (old) against +b+ (new).
  # Returns an HTML String, or [old_html, new_html] when +dual+ is true.
  def diff(a, b, dual = false)
    DiffBuilder.new(a, b, dual).build
  end
end
@@ -0,0 +1,32 @@
1
+ require File.dirname(__FILE__) + '/spec_helper'
2
+ require 'htmldiff'
3
+
4
# Host class that exposes HTMLDiff#diff as a class-level method for the examples.
class TestDiff
  extend HTMLDiff
end

describe "htmldiff" do

  it "should diff text" do
    expected = "a<ins class=\"diffins\"> nother</ins> word is <del class=\"diffmod\">here</del><ins class=\"diffmod\">there</ins>"
    TestDiff.diff('a word is here', 'a nother word is there').should == expected
  end

  it "should insert a letter and a space" do
    expected = "a <ins class=\"diffins\">b </ins>c"
    TestDiff.diff('a c', 'a b c').should == expected
  end

  it "should remove a letter and a space" do
    expected = "a <del class=\"diffdel\">b </del>c"
    TestDiff.diff('a b c', 'a c').should == expected
  end

  it "should change a letter" do
    expected = "a <del class=\"diffmod\">b</del><ins class=\"diffmod\">d</ins> c"
    TestDiff.diff('a b c', 'a d c').should == expected
  end

end
@@ -0,0 +1,2 @@
1
+ $TESTING=true
2
+ $:.push File.join(File.dirname(__FILE__), '..', 'lib')
metadata ADDED
@@ -0,0 +1,72 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: edouard-htmldiff
3
+ version: !ruby/object:Gem::Version
4
+ prerelease: false
5
+ segments:
6
+ - 0
7
+ - 0
8
+ - 2
9
+ version: 0.0.2
10
+ platform: ruby
11
+ authors:
12
+ - Nathan Herald
13
+ autorequire: htmldiff
14
+ bindir: bin
15
+ cert_chain: []
16
+
17
+ date: 2010-12-23 00:00:00 +01:00
18
+ default_executable:
19
+ dependencies: []
20
+
21
+ description:
22
+ email: nathan@myobie.com
23
+ executables: []
24
+
25
+ extensions: []
26
+
27
+ extra_rdoc_files:
28
+ - README
29
+ - LICENSE
30
+ - TODO
31
+ files:
32
+ - LICENSE
33
+ - README
34
+ - Rakefile
35
+ - TODO
36
+ - lib/htmldiff.rb
37
+ - spec/htmldiff_spec.rb
38
+ - spec/spec_helper.rb
39
+ has_rdoc: true
40
+ homepage: http://github.com/edouard/htmldiff
41
+ licenses: []
42
+
43
+ post_install_message:
44
+ rdoc_options: []
45
+
46
+ require_paths:
47
+ - lib
48
+ required_ruby_version: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ">="
52
+ - !ruby/object:Gem::Version
53
+ segments:
54
+ - 0
55
+ version: "0"
56
+ required_rubygems_version: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ segments:
62
+ - 0
63
+ version: "0"
64
+ requirements: []
65
+
66
+ rubyforge_project:
67
+ rubygems_version: 1.3.7
68
+ signing_key:
69
+ specification_version: 3
70
+ summary: HTML diffs of text
71
+ test_files: []
72
+