lorax 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data.tar.gz.sig ADDED
Binary file
data/CHANGELOG.rdoc ADDED
@@ -0,0 +1,6 @@
1
+ = Changelog
2
+
3
+ == 0.1.0 (2010-03-09)
4
+
5
+ * Happy Birthday!
6
+ * Diffs and generates patches, and for trivial cases applies patches correctly.
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2009 Mike Dalessio
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/Manifest.txt ADDED
@@ -0,0 +1,37 @@
1
+ CHANGELOG.rdoc
2
+ LICENSE
3
+ Manifest.txt
4
+ README.rdoc
5
+ Rakefile
6
+ TODO
7
+ bin/lorax
8
+ lib/lorax.rb
9
+ lib/lorax/delta.rb
10
+ lib/lorax/delta/delete_delta.rb
11
+ lib/lorax/delta/insert_delta.rb
12
+ lib/lorax/delta/modify_delta.rb
13
+ lib/lorax/delta_set.rb
14
+ lib/lorax/delta_set_generator.rb
15
+ lib/lorax/fast_matcher.rb
16
+ lib/lorax/match.rb
17
+ lib/lorax/match_set.rb
18
+ lib/lorax/signature.rb
19
+ spec/fast_matcher_spec.rb
20
+ spec/files/Michael-Dalessio-200909.html
21
+ spec/files/Michael-Dalessio-201001.html
22
+ spec/files/slashdot-1.html
23
+ spec/files/slashdot-2.html
24
+ spec/files/slashdot-3.html
25
+ spec/files/slashdot-4.html
26
+ spec/integration/lorax_spec.rb
27
+ spec/match_spec.rb
28
+ spec/spec.opts
29
+ spec/spec_helper.rb
30
+ spec/unit/delta/delete_delta_spec.rb
31
+ spec/unit/delta/insert_delta_spec.rb
32
+ spec/unit/delta/modify_delta_spec.rb
33
+ spec/unit/delta_set_generator_spec.rb
34
+ spec/unit/delta_set_spec.rb
35
+ spec/unit/lorax_spec.rb
36
+ spec/unit/match_set_spec.rb
37
+ spec/unit/signature_spec.rb
data/README.rdoc ADDED
@@ -0,0 +1,70 @@
1
+ = Lorax
2
+
3
+ * http://github.com/flavorjones/lorax
4
+
5
+ == Description
6
+
7
+ The Lorax is a full diff and patch library for XML/HTML documents, based on Nokogiri.
8
+
9
+ It can tell you whether two XML/HTML documents are identical, or if
10
+ they're not, tell you what's different. In trivial cases, it can even
11
+ apply the patch.
12
+
13
+ It's based loosely on Gregory Cobena's master's thesis paper, which
14
+ generates deltas in less than O(n * log n) time, accepting some
15
+ tradeoffs in the size of the delta set. You can find his paper at
16
+ http://gregory.cobena.free.fr/www/Publications/thesis.html.
17
+
18
+ == Features / Problems
19
+
20
+ * Detect differences between documents, or tell whether two documents are the same.
21
+ * Generate patches for the differences between documents.
22
+ * Apply patches for trivial cases.
23
+ * More work needs to be done to make sure patches apply cleanly.
24
+
25
+ == Synopsis
26
+
27
+ Imagine you have two Nokogiri::XML::Documents. You can tell if they're identical:
28
+
29
+ Lorax::Signature.new(doc1.root).signature == Lorax::Signature.new(doc2.root).signature
30
+
31
+ You can generate a delta set (currently opaque (sorry kids)):
32
+
33
+ delta_set = Lorax.diff(doc1, doc2)
34
+
35
+ and apply the delta set as a patch to the original document:
36
+
37
+ new_doc = delta_set.apply(doc1)
38
+
39
+ == Requirements
40
+
41
+ * Nokogiri 1.4.0 or higher
42
+
43
+ == Install
44
+
45
+ * sudo gem install lorax
46
+
47
+ == License
48
+
49
+ (The MIT License)
50
+
51
+ Copyright (c) 2010 Mike Dalessio
52
+
53
+ Permission is hereby granted, free of charge, to any person obtaining
54
+ a copy of this software and associated documentation files (the
55
+ 'Software'), to deal in the Software without restriction, including
56
+ without limitation the rights to use, copy, modify, merge, publish,
57
+ distribute, sublicense, and/or sell copies of the Software, and to
58
+ permit persons to whom the Software is furnished to do so, subject to
59
+ the following conditions:
60
+
61
+ The above copyright notice and this permission notice shall be
62
+ included in all copies or substantial portions of the Software.
63
+
64
+ THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
65
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
66
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
67
+ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
68
+ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
69
+ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
70
+ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/Rakefile ADDED
@@ -0,0 +1,50 @@
# -*- ruby -*-
#
# Rake build file for the lorax gem. The gem itself is declared through
# Hoe.spec; the one hand-written task, :fix_css, appends a handful of CSS
# overrides to doc/rdoc.css.

require 'rubygems'
gem 'hoe', '>= 2.5.0'
require 'hoe'

Hoe.plugin :git

Hoe.spec 'lorax' do
  developer "Mike Dalessio", "mike.dalessio@gmail.com"

  self.extra_rdoc_files = FileList["*.rdoc"]
  self.history_file     = "CHANGELOG.rdoc"
  self.readme_file      = "README.rdoc"

  # Runtime dependency first, then the development-only ones.
  extra_deps     << ["nokogiri", ">= 1.4.0"]
  extra_dev_deps << ["rspec", ">= 1.2.9"] << ["rr", ">= 0.10.4"]
end

# Both documentation tasks get :fix_css as a prerequisite.
[:redocs, :docs].each { |doc_task| task doc_task => :fix_css }

# Appends the overrides below to the RDoc stylesheet. The file is opened in
# append mode, so whatever is already in doc/rdoc.css is kept.
task :fix_css do
  css_overrides = <<-EOT
.method-description pre {
margin : 1em 0 ;
}

.method-description ul {
padding : .5em 0 .5em 2em ;
}

.method-description p {
margin-top : .5em ;
}

#main ul, div#documentation ul {
list-style-type : disc ! IMPORTANT ;
list-style-position : inside ! IMPORTANT ;
}

h2 + ul {
margin-top : 1em;
}
  EOT
  puts "* fixing css"
  File.open("doc/rdoc.css", "a") { |stylesheet| stylesheet.write css_overrides }
end

# vim: syntax=ruby
data/TODO ADDED
@@ -0,0 +1,40 @@
1
+ # -*-org-*-
2
+ Diffaroo TODO
3
+
4
+ * gem
5
+ *** gemspec
6
+ *** license
7
+ *** gemcutter
8
+ * docs
9
+ *** rdocs
10
+ *** readme
11
+ *** class description notes
12
+ - Signature: calculate and persist signatures and weights for nodes in a single document
13
+ - Match: represents a match between two nodes
14
+ - MatchSet: composed of Signatures and Matches.
15
+ - Matcher: an algorithm that operates on a MatchSet statelessly to generate matches.
16
+ - Generator: generates a DeltaSet from a MatchSet
17
+ - Delta: an atomic change step
18
+ - DeltaSet: an ordered set of Deltas
19
+ - Apply: f(doc1, DeltaSet) -> doc2
20
+ *** algorithmic notes
21
+ ***** ignoring ID
22
+ - too many web sites fuck that up
23
+ - libxml2 allows duplicate ids
24
+ - algorithm would ignore changed content
25
+ ***** indexes (ascendant lookahead) needs to be implemented?
26
+ ***** if we do "phase 3" in weight-order, and recursively match parents, can't we avoid the "propagate to parent" step of phase 4?
27
+ * core
28
+ *** write integration test for MODIFY delta
29
+ *** write integration test for DELETE delta
30
+ *** write integration test for MODIFY delta with move
31
+ *** change API to specify HTML or XML. or should we make user pass in Nokogirified docs?
32
+ *** pick a hashing algorithm
33
+ - ruby hash / md5 / sha1
34
+ - benchmark? collision rate?
35
+ * additional
36
+ *** build an rspec matcher for xml
37
+ *** build a test/unit assertion for xml
38
+ *** try to make the code independent of the tree we're diffing
39
+ think about diffing any tree, e.g. AST, YAML
40
+ *** benchmark suite so we can try different algorithms
data/bin/lorax ADDED
@@ -0,0 +1,15 @@
#! /usr/bin/env ruby
#
# Command-line driver for Lorax.
#
# Usage: lorax ORIGINAL MODIFIED
#
# Diffs the two files and writes two artifacts into the current directory:
#   foo.yml  - the descriptors of every generated delta, as YAML
#   foo.html - ORIGINAL parsed as HTML with the delta set applied to it

# Object#to_yaml only exists once the YAML library has been loaded; nothing
# else in this script is guaranteed to load it.
require "yaml"
require "lorax"

# Fail with a readable message instead of a TypeError from File.read(nil).
abort "usage: #{File.basename($0)} ORIGINAL MODIFIED" if ARGV.length < 2

original_path = ARGV[0]
modified_path = ARGV[1]

delta_set = Lorax.diff(File.read(original_path), File.read(modified_path))
summary = delta_set.deltas.map do |d|
  d.descriptor
end

File.open("foo.yml", "w+") { |f| f.puts summary.to_yaml }
File.open("foo.html", "w+") do |f|
  # Re-read the original so the patch is applied to a fresh document.
  doc = Nokogiri::HTML File.read(original_path)
  delta_set.apply!(doc)
  f.puts doc.to_html
end
data/lib/lorax.rb ADDED
@@ -0,0 +1,35 @@
1
+ require 'nokogiri'
2
+
# Top-level namespace: gem version and the load-time Nokogiri version guard.
module Lorax
  VERSION = "0.1.0"
  REQUIRED_NOKOGIRI_VERSION = "1.4.0"

  # Compare as versions, not as strings: a plain String comparison is
  # lexicographic, so "1.10.0" would sort before "1.4.0" and a perfectly
  # adequate Nokogiri would be rejected. Gem::Version comes from RubyGems,
  # which is how this gem and Nokogiri are installed and loaded.
  unless Gem::Version.new(Nokogiri::VERSION) >= Gem::Version.new(REQUIRED_NOKOGIRI_VERSION)
    raise LoadError, "lorax requires Nokogiri version #{REQUIRED_NOKOGIRI_VERSION} or higher"
  end
end
8
+
9
+ require "lorax/signature"
10
+ require "lorax/match"
11
+ require "lorax/match_set"
12
+ require "lorax/fast_matcher"
13
+
14
+ require "lorax/delta"
15
+ require "lorax/delta_set_generator"
16
+ require "lorax/delta_set"
17
+
# Public entry points of the library.
module Lorax
  class << self
    # Computes the delta set that describes how to get from the first document
    # to the second.
    #
    # Each argument may already be a Nokogiri::XML::Document; anything else is
    # handed to documentize for parsing.
    #
    # Returns whatever FastMatcher#match's result yields from to_delta_set.
    def diff(string_or_io_or_nokogiridoc_1, string_or_io_or_nokogiridoc_2)
      before = documentize(string_or_io_or_nokogiridoc_1)
      after  = documentize(string_or_io_or_nokogiridoc_2)

      Lorax::FastMatcher.new(before, after).match.to_delta_set
    end

    # Internal helper for diff (it is nevertheless publicly callable).
    #
    # Returns the argument untouched when it is a Nokogiri::XML::Document,
    # otherwise passes it to the top-level Nokogiri() parsing method.
    def documentize(string_or_io_or_nokogiridoc)
      return string_or_io_or_nokogiridoc if string_or_io_or_nokogiridoc.is_a?(Nokogiri::XML::Document)

      Nokogiri(string_or_io_or_nokogiridoc)
    end
  end
end
@@ -0,0 +1,28 @@
module Lorax
  # Base class for a single change step. Subclasses supply apply! and
  # descriptor; this class contributes inspect and a shared insertion helper.
  class Delta
    # Raised by subclasses when an XPath lookup in the target document fails.
    class NodeNotFoundError < RuntimeError; end

    # Subclasses must override this; the base version always raises
    # NotImplementedError with the receiver's class name as the message.
    def apply!(document)
      raise NotImplementedError, self.class.to_s
    end

    # Debug representation: class name, object id in hex, and the descriptor.
    def inspect
      format("#<%s:0x%x %s>", self.class.name, object_id, descriptor.inspect)
    end

    private

    # Inserts a copy of +node+ among the children of +parent+.
    #
    # When +parent+ has no children, or +position+ is at or beyond the end of
    # the child list, the copy is appended. Otherwise it is placed immediately
    # before the child currently sitting at +position+.
    def insert_node(node, parent, position)
      siblings = parent.children
      copy = node.dup
      return parent << copy if siblings.empty? || position >= siblings.length

      siblings[position].add_previous_sibling(copy)
    end
  end
end
25
+
26
+ require "lorax/delta/insert_delta"
27
+ require "lorax/delta/modify_delta"
28
+ require "lorax/delta/delete_delta"
@@ -0,0 +1,19 @@
module Lorax
  # Delta that removes a node from a document. The node to remove is located
  # in the target document through the XPath of +node+.
  class DeleteDelta < Delta
    attr_accessor :node

    # node:: the node (from the original document) that should be deleted.
    def initialize(node)
      @node = node
    end

    # Unlinks the node found at +node.path+ in +document+.
    #
    # Raises NodeNotFoundError (with the XPath as message) when nothing is
    # found at that path.
    def apply!(document)
      target = document.at_xpath(node.path)
      # This class has no +xpath+ accessor; the path has to come from the node,
      # otherwise the failure surfaces as a NameError instead.
      raise NodeNotFoundError, node.path unless target
      target.unlink
    end

    # Summary of this delta: the :delete tag plus the node's XPath and its
    # serialized content.
    def descriptor
      [:delete, {:xpath => node.path, :content => node.to_s}]
    end
  end
end
@@ -0,0 +1,22 @@
module Lorax
  # Delta that inserts a copy of +node+ under the parent found at +xpath+,
  # at child index +position+.
  class InsertDelta < Delta
    attr_accessor :node, :xpath, :position

    # node::     the node to insert (a copy is inserted, not the node itself)
    # xpath::    XPath of the parent that receives the node
    # position:: index among the parent's children at which to insert
    def initialize(node, xpath, position)
      @node = node
      @xpath = xpath
      @position = position
    end

    # Looks up the parent in +document+ and inserts a copy of +node+ there.
    #
    # Raises NodeNotFoundError (with the XPath as message) when the parent
    # cannot be found.
    def apply!(document)
      # TODO: patch nokogiri to make inserting node copies efficient
      parent = document.at_xpath(xpath)
      raise NodeNotFoundError, xpath unless parent
      # Delta#insert_node duplicates the node it is handed, so passing the
      # node directly avoids copying the whole subtree a second time.
      insert_node(node, parent, position)
    end

    # Summary of this delta: the :insert tag plus the parent XPath, the
    # position, and the serialized node.
    def descriptor
      [:insert, {:xpath => xpath, :position => position, :content => node.to_s}]
    end
  end
end
@@ -0,0 +1,51 @@
module Lorax
  # Delta that turns +node1+ into +node2+: it rewrites the text content or the
  # attributes, and moves the node when the two XPaths differ.
  class ModifyDelta < Delta
    attr_accessor :node1, :node2

    # node1:: the node as it is before the change
    # node2:: the node as it should be after the change
    def initialize(node1, node2)
      @node1, @node2 = node1, node2
    end

    # Applies the modification to the node found at +node1.path+ in +doc+.
    #
    # Raises NodeNotFoundError when that node, or (for a move) the parent
    # addressed by +node2.parent.path+, cannot be found in +doc+.
    def apply!(doc)
      target = doc.at_xpath(node1.path)
      raise NodeNotFoundError, node1.path unless target

      if character_data?(target)
        target.content = node2.content
      else
        replace_attributes(target)
      end

      relocate(doc, target) if node1.path != node2.path
    end

    # Summary of this delta. Text and CDATA nodes are described by XPath and
    # content; every other node by XPath, name, and attribute pairs.
    def descriptor
      details =
        if character_data?(node1)
          {:old => {:xpath => node1.path, :content => node1.to_s},
           :new => {:xpath => node2.path, :content => node2.to_s}}
        else
          {:old => element_summary(node1), :new => element_summary(node2)}
        end
      [:modify, details]
    end

    private

    # True for text nodes and CDATA sections.
    def character_data?(node)
      node.text? || node.type == Nokogiri::XML::Node::CDATA_SECTION_NODE
    end

    # Replaces the attributes of +target+ with those of node2, but only when
    # the two attribute sets differ.
    def replace_attributes(target)
      current = attributes_hash(target)
      wanted  = attributes_hash(node2)
      return if current == wanted

      current.each_key { |name| target.remove_attribute(name) }
      wanted.each_pair { |name, value| target[name] = value }
    end

    # Moves +target+ to the spot node2 occupies under its own parent: the
    # node is unlinked and a copy is inserted via insert_node.
    def relocate(doc, target)
      position = node2.parent.children.index(node2)
      new_parent = doc.at_xpath(node2.parent.path)
      raise NodeNotFoundError, node2.parent.path unless new_parent

      target.unlink
      insert_node(target, new_parent, position)
    end

    # Descriptor fragment for a non-text node.
    def element_summary(node)
      {:xpath => node.path, :name => node.name, :attributes => node.attributes.map { |k, v| [k, v.value] }}
    end

    # Plain name => value Hash built from the node's attributes.
    def attributes_hash(node)
      Hash[node.attributes.map { |name, attr| [name, attr.value] }]
    end
  end
end
@@ -0,0 +1,24 @@
module Lorax
  # An ordered collection of deltas that can be applied to a document.
  class DeltaSet
    attr_accessor :deltas

    # Starts out with no deltas.
    def initialize
      @deltas = []
    end

    # Appends +delta+ to the end of the list and returns the list.
    def add(delta)
      @deltas.push(delta)
    end

    # Non-destructive variant: applies every delta to a duplicate of
    # +document+ and returns that duplicate.
    def apply(document)
      apply!(document.dup)
    end

    # Applies every delta, in insertion order, directly to +document+ and
    # returns +document+.
    def apply!(document)
      deltas.each { |delta| delta.apply!(document) }
      document
    end
  end
end
@@ -0,0 +1,36 @@
module Lorax
  # Builds a DeltaSet from a match set by walking both matched trees.
  module DeltaSetGenerator
    class << self
      # Returns a new DeltaSet for +match_set+. The tree rooted at
      # +match_set.signature2.root+ is walked first (inserts and modifies),
      # then the tree rooted at +match_set.signature1.root+ (deletes).
      def generate_delta_set(match_set)
        DeltaSet.new.tap do |delta_set|
          generate_inserts_and_moves_recursively(delta_set, match_set, match_set.signature2.root)
          generate_deletes_recursively(delta_set, match_set, match_set.signature1.root)
        end
      end

      # Traversal helper for generate_delta_set (publicly callable).
      #
      # An unmatched +node+ becomes an InsertDelta and its subtree is not
      # descended into. A perfectly matched node is skipped together with its
      # subtree. Any other match yields a ModifyDelta when the monograms of
      # the paired nodes differ, and then the children are visited.
      def generate_inserts_and_moves_recursively(delta_set, match_set, node)
        match = match_set.match(node)
        unless match
          # TODO: demeter violation
          return delta_set.add(InsertDelta.new(node, node.parent.path, node.parent.children.index(node)))
        end
        return if match.perfect?

        old_node = match.pair.first
        new_node = match.pair.last
        if match_set.signature1.monogram(old_node) != match_set.signature2.monogram(new_node)
          delta_set.add ModifyDelta.new(old_node, new_node)
        end
        node.children.each do |child|
          generate_inserts_and_moves_recursively(delta_set, match_set, child)
        end
      end

      # Traversal helper for generate_delta_set (publicly callable).
      #
      # An unmatched +node+ becomes a DeleteDelta. A perfectly matched node is
      # skipped together with its subtree; otherwise the children are visited.
      def generate_deletes_recursively(delta_set, match_set, node)
        match = match_set.match(node)
        return delta_set.add(DeleteDelta.new(node)) unless match
        return if match.perfect?

        node.children.each do |child|
          generate_deletes_recursively(delta_set, match_set, child)
        end
      end
    end
  end
end