sitediff 0.0.1 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,13 +1,18 @@
+require 'sitediff/exception'
+require 'sitediff/sanitize'
+require 'pathname'
 require 'yaml'
 
 class SiteDiff
   class Config
+    DEFAULT_FILENAME = 'sitediff.yaml'
 
     # keys allowed in configuration files
-    CONF_KEYS = Sanitize::TOOLS.values.flatten(1) +
+    CONF_KEYS = Sanitizer::TOOLS.values.flatten(1) +
      %w[paths before after before_url after_url includes]
 
-    class InvalidConfig < Exception; end
+    class InvalidConfig < SiteDiffException; end
+    class ConfigNotFound < SiteDiffException; end
 
     # Takes a Hash and normalizes it to the following form by merging globals
     # into before and after. A normalized config Hash looks like this:
@@ -27,7 +32,7 @@ class SiteDiff
     #   selector: body
     #
     def self.normalize(conf)
-      tools = Sanitize::TOOLS
+      tools = Sanitizer::TOOLS
 
       # merge globals
      %w[before after].each do |pos|
@@ -67,7 +72,7 @@ class SiteDiff
          next
        end
        result[pos] = first[pos].merge!(second[pos]) do |key, a, b|
-          if Sanitize::TOOLS[:array].include? key # rule 2a
+          if Sanitizer::TOOLS[:array].include? key # rule 2a
            result[pos][key] = (a || []) + (b|| [])
          else
            result[pos][key] = a || b # rule 2b
@@ -77,9 +82,39 @@ class SiteDiff
      result
    end
 
-    def initialize(files)
+    # Search for a config file. If found, change to the containing directory,
+    # and return an array of config files found.
+    def self.search
+      subdirs = %w[. sitediff]
+      root_indicators = %w[.git .svn]
+
+      Pathname.pwd.ascend do |dir|
+        subdirs.each do |subdir|
+          d = dir + subdir + DEFAULT_FILENAME
+          if d.exist?
+            Dir.chdir(dir.+(subdir).to_s)
+            return [DEFAULT_FILENAME]
+          end
+        end
+
+        root_indicators.each { |r| return [] if dir.+(r).exist? }
+      end
+
+      return []
+    end
+
+    def initialize(files, opts = {})
      @config = {'paths' => [], 'before' => {}, 'after' => {} }
+
+      files = Config.search if files.empty? && opts[:search]
+      files = [DEFAULT_FILENAME] if files.empty? &&
+                                    File.exists?(DEFAULT_FILENAME)
+      raise ConfigNotFound, "No configuration file found." if files.empty?
+
      files.each do |file|
+        raise InvalidConfig,
+          "Missing config file %s." % File.expand_path(file) \
+          unless File.exist?(file)
        @config = Config::merge(@config, Config::load_conf(file))
      end
    end
@@ -99,8 +134,11 @@ class SiteDiff
    end
 
    # Checks if the configuration is usable for diff-ing.
-    def validate
-      raise InvalidConfig, "Undefined 'before' base URL." unless before['url']
+    def validate(opts = {})
+      opts = { :need_before => true }.merge(opts)
+
+      raise InvalidConfig, "Undefined 'before' base URL." if \
+        opts[:need_before] && !before['url']
      raise InvalidConfig, "Undefined 'after' base URL." unless after['url']
      raise InvalidConfig, "Undefined 'paths'." unless (paths and !paths.empty?)
    end
@@ -114,7 +152,7 @@ class SiteDiff
 
    # reads a YAML file and raises an InvalidConfig if the file is not valid.
    def self.load_raw_yaml(file)
-      SiteDiff::log "Reading config file: #{file}"
+      SiteDiff::log "Reading config file: #{Pathname.new(file).expand_path}"
      conf = YAML.load_file(file) || {}
      unless conf.is_a? Hash
        raise InvalidConfig, "Invalid configuration file: '#{file}'"
@@ -149,6 +187,5 @@ class SiteDiff
      end
      conf
    end
-
  end
 end
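
Taken together, these changes add configuration discovery (Config.search walks up from the working directory, also checking a sitediff subdirectory, and stops at a .git or .svn root), an options hash on Config#initialize, and a :need_before switch on validate. A minimal usage sketch, not part of the package itself (argument values are illustrative):

    require 'sitediff/config'

    # An empty file list plus :search => true triggers Config.search;
    # ConfigNotFound is raised if no sitediff.yaml can be located.
    config = SiteDiff::Config.new([], :search => true)

    # Skip the 'before' URL requirement, e.g. when the 'before' side
    # will be served from a cache rather than a live site.
    config.validate(:need_before => false)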
@@ -0,0 +1,122 @@
+require 'sitediff/cache'
+require 'sitediff/config'
+require 'sitediff/crawler'
+require 'sitediff/rules'
+require 'pathname'
+require 'typhoeus'
+require 'yaml'
+
+class SiteDiff
+class Config
+class Creator
+  def initialize(*urls, &block)
+    @after = urls.pop
+    @before = urls.pop # May be nil
+  end
+
+  def roots
+    @roots = begin
+      r = { :after => @after }
+      r[:before] = @before if @before
+      r
+    end
+  end
+
+  # Build a config structure, return it
+  def create(opts, &block)
+    @config = {}
+    @callback = block
+
+    # Handle options
+    @dir = Pathname.new(opts[:directory])
+    @depth = opts[:depth]
+    @rules = Rules.new(@config, opts[:rules_disabled]) if opts[:rules]
+
+    # Create the dir. Must go before cache initialization!
+    @dir.mkpath unless @dir.directory?
+
+    # Setup instance vars
+    @paths = Hash.new { |h,k| h[k] = Set.new }
+    @cache = Cache.new(:file => @dir.+(Cache::DEFAULT_FILENAME).to_s,
+                       :create => true)
+    @cache.write_tags << :before << :after
+
+    build_config
+    write_config
+  end
+
+  def build_config
+    %w[before after].each do |tag|
+      next unless u = roots[tag.to_sym]
+      @config[tag] = {'url' => u}
+    end
+
+    crawl(@depth)
+    @cache.close
+    @rules.add_config if @rules
+
+    @config['paths'] = @paths.values.reduce(&:|).to_a.sort
+  end
+
+  def crawl(depth = nil)
+    hydra = Typhoeus::Hydra.new(max_concurrency: 10)
+    roots.each do |tag, u|
+      Crawler.new(hydra, u, depth) do |info|
+        crawled_path(tag, info)
+      end
+    end
+    hydra.run
+  end
+
+  # Deduplicate paths with slashes at the end
+  def canonicalize(tag, path)
+    def altered_paths(path)
+      yield path + '/'
+      yield path.sub(%r[/$], '')
+    end
+
+    return path.empty? ? '/' : path
+  end
+
+  def crawled_path(tag, info)
+    path, dup = canonicalize(tag, info.relative)
+    return if dup
+
+    res = info.read_result
+
+    @callback[tag, info]
+    @paths[tag] << path
+    @cache.set(tag, path, res)
+
+    # If single-site, cache after as before!
+    @cache.set(:before, path, res) unless roots[:before]
+
+    @rules.handle_page(tag, res.content, info.document) if @rules && !res.error
+  end
+
+  # Create a gitignore if we seem to be in git
+  def make_gitignore(dir)
+    # Check if we're in git
+    return unless dir.realpath.to_enum(:ascend).any? { |d| d.+('.git').exist? }
+
+    dir.+('.gitignore').open('w') do |f|
+      f.puts <<-EOF.gsub(/^\s+/, '')
+        output
+        cache.db
+        cache.db.db
+      EOF
+    end
+  end
+
+  def config_file
+    @dir + Config::DEFAULT_FILENAME
+  end
+
+  # Turn a config structure into a config file
+  def write_config
+    make_gitignore(@dir)
+    config_file.open('w') { |f| f.puts @config.to_yaml }
+  end
+end
+end
+end
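
The new SiteDiff::Config::Creator crawls one or two base URLs, caches every page it fetches, and writes a starter sitediff.yaml (plus a .gitignore when run inside a git checkout). A hedged sketch of how it might be driven; the require path is inferred from the class name, and the URLs, directory, and depth are illustrative:

    require 'sitediff/config/creator'

    # With two URLs the first becomes 'before' and the second 'after';
    # with a single URL only 'after' is crawled and its results are
    # cached under :before as well.
    creator = SiteDiff::Config::Creator.new('http://old.example.com',
                                            'http://new.example.com')
    creator.create(:directory => 'sitediff', # created if missing
                   :depth     => 2,          # passed through to Crawler
                   :rules     => false) do |tag, info|
      puts "crawled #{tag}: #{info.relative}" # called once per fetched page
    end
    puts "config written to #{creator.config_file}"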
@@ -0,0 +1,95 @@
+require 'sitediff'
+require 'sitediff/uriwrapper'
+require 'addressable/uri'
+require 'nokogiri'
+require 'ostruct'
+require 'set'
+
+class SiteDiff
+class Crawler
+  class Info < OpenStruct; end
+
+  DEFAULT_DEPTH = 3
+
+  # Create a crawler with a base URL
+  def initialize(hydra, base, depth = DEFAULT_DEPTH, &block)
+    @hydra = hydra
+    @base_uri = Addressable::URI.parse(base)
+    @base = base
+    @found = Set.new
+    @callback = block
+
+    add_uri('', depth)
+  end
+
+  # Handle a newly found relative URI
+  def add_uri(rel, depth)
+    return if @found.include? rel
+    @found << rel
+
+    wrapper = UriWrapper.new(@base + rel)
+    wrapper.queue(@hydra) do |res|
+      fetched_uri(rel, depth, res)
+    end
+  end
+
+  # Handle the fetch of a URI
+  def fetched_uri(rel, depth, res)
+    return unless res.content # Ignore errors
+    return unless depth >= 0
+
+    base = Addressable::URI.parse(@base + rel)
+    doc = Nokogiri::HTML(res.content)
+
+    # Call the callback
+    info = Info.new(
+      :relative => rel,
+      :uri => base,
+      :read_result => res,
+      :document => doc,
+    )
+    @callback[info]
+
+    # Find links
+    links = find_links(doc)
+    uris = links.map { |l| resolve_link(base, l) }.compact
+    uris = filter_links(uris)
+
+    # Make them relative
+    rels = uris.map { |u| relativize_link(u) }
+
+    # Queue them in turn
+    rels.each do |r|
+      next if @found.include? r
+      add_uri(r, depth - 1)
+    end
+  end
+
+  # Resolve a potentially-relative link. Return nil on error.
+  def resolve_link(base, rel)
+    begin
+      return base + rel
+    rescue Addressable::URI::InvalidURIError
+      SiteDiff.log "skipped invalid URL: '#{rel}'", :warn
+      return nil
+    end
+  end
+
+  # Make a link relative to @base_uri
+  def relativize_link(uri)
+    uri.path.slice(@base_uri.path.length, uri.path.length)
+  end
+
+  # Return a list of string links found on a page.
+  def find_links(doc)
+    return doc.xpath('//a[@href]').map { |e| e['href'] }
+  end
+
+  # Filter out links we don't want. Links passed in are absolute URIs.
+  def filter_links(uris)
+    uris.find_all do |u|
+      u.host == @base_uri.host && u.path.start_with?(@base_uri.path)
+    end
+  end
+end
+end
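
SiteDiff::Crawler queues itself on a shared Typhoeus hydra and yields an Info struct (relative path, absolute URI, UriWrapper read result, parsed Nokogiri document) for every same-host page it finds, up to the given depth. A small standalone sketch; the require path is inferred, and the URL and depth are illustrative:

    require 'sitediff/crawler'
    require 'typhoeus'

    hydra = Typhoeus::Hydra.new(max_concurrency: 10)
    SiteDiff::Crawler.new(hydra, 'http://example.com', 2) do |info|
      puts "found #{info.relative}"   # info.document holds the parsed page
    end
    hydra.run                         # nothing is fetched until the hydra runs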
@@ -1,3 +1,4 @@
+require 'sitediff'
 require 'diffy'
 require 'erb'
 require 'rainbow'
@@ -19,7 +20,7 @@ class SiteDiff
      to_s(*args)
    end
 
-    def generate_html_report(results, before, after)
+    def generate_html_report(results, before, after, cache)
      erb_path = File.join(SiteDiff::FILES_DIR, 'html_report.html.erb')
      report_html = ERB.new(File.read(erb_path)).result(binding)
      return report_html
@@ -0,0 +1,3 @@
+class SiteDiff
+  class SiteDiffException < Exception; end
+end
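
SiteDiffException gives the library a common base class for its errors, so callers can rescue everything sitediff-specific (including the new InvalidConfig and ConfigNotFound) in one place. Note that it inherits from Exception rather than StandardError, so it must be rescued explicitly; a brief illustration with a hypothetical call site:

    begin
      config = SiteDiff::Config.new(ARGV)
    rescue SiteDiff::SiteDiffException => e
      abort e.message
    end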
@@ -0,0 +1,55 @@
+require 'sitediff/uriwrapper'
+require 'typhoeus'
+
+class SiteDiff
+class Fetch
+  # Cache is a cache object, see sitediff/cache
+  # Paths is a list of sub-paths
+  # Tags is a hash of tag names => base URLs.
+  def initialize(cache, paths, tags)
+    @cache = cache
+    @paths = paths
+    @tags = tags
+  end
+
+  # Fetch all the paths, once per tag.
+  # When a path has been fetched for every tag, block will be called with the
+  # path, and a hash of tag => UriWrapper::ReadResult objects.
+  def run(&block)
+    @callback = block
+    @hydra = Typhoeus::Hydra.new(max_concurrency: 3)
+    @paths.each { |path| queue_path(path) }
+    @hydra.run
+  end
+
+  private
+  # Queue a path for fetching
+  def queue_path(path)
+    results = {}
+
+    @tags.each do |tag, base|
+      if res = @cache.get(tag, path)
+        results[tag] = res
+        process_results(path, results)
+      elsif !base
+        # We only have the cache, but this item isn't cached!
+        results[tag] = UriWrapper::ReadResult.error("Not cached")
+        process_results(path, results)
+      else
+        uri = UriWrapper.new(base + path)
+        uri.queue(@hydra) do |res|
+          @cache.set(tag, path, res)
+          results[tag] = res
+          process_results(path, results)
+        end
+      end
+    end
+  end
+
+  # Process fetch results
+  def process_results(path, results)
+    return unless results.size == @tags.size
+    @callback[path, results]
+  end
+end
+end
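
SiteDiff::Fetch ties the cache, the path list, and the per-tag base URLs together: each path is first looked up in the cache and only queued on Typhoeus when missing, and the block runs once results for every tag have arrived. A sketch assuming a cache object from sitediff/cache, whose full API is not part of this diff; the require path, URLs, and paths are illustrative:

    require 'sitediff/cache'
    require 'sitediff/fetch'

    cache = SiteDiff::Cache.new(:file => SiteDiff::Cache::DEFAULT_FILENAME,
                                :create => true)
    fetcher = SiteDiff::Fetch.new(cache,
                                  ['/', '/about'],                     # paths
                                  :before => 'http://old.example.com', # tag => base URL
                                  :after  => 'http://new.example.com')
    fetcher.run do |path, results|
      # results maps :before/:after to UriWrapper::ReadResult objects;
      # whether they come from or get written to the cache depends on the
      # cache's read/write tags (see the Creator changes above).
      status = results.map { |tag, res| "#{tag}=#{res.error || 'ok'}" }.join(' ')
      puts "#{path}: #{status}"
    end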
@@ -11,14 +11,22 @@
 <body>
 <div class="sitediff">
   <div class="legend">
-    <strong>before</strong> (base url): <a href="<%=before%>"><%=before%></a> |
-    <strong>after </strong> (base url): <a href="<%=after%>" ><%=after %></a>
+    <%
+      tags = %w[before after]
+      tags.each do |tag| %>
+      <% if tags.first != tag %> | <% end %>
+      <% notes = ['base url']
+         notes << 'cached' if cache.read_tags.include?(tag.to_sym) %>
+      <strong><%= tag %></strong> (<%= notes.join(', ') %>):
+      <a href="<%= eval(tag) %>"><%= eval(tag) %></a>
+    <% end %>
   </div>
 <table class="results">
 
 <colgroup>
   <col class="before-col">
   <col class="after-col">
+  <col class="both-col">
   <col class="path-col">
   <col class="diff-stat-col">
 </colgroup>
@@ -27,6 +35,7 @@
 <tr>
   <th> Before </th>
   <th> After </th>
+  <th> Both </th>
   <th> Path </th>
   <th> Status </th>
 </tr>
@@ -34,8 +43,15 @@
 
 <% results.each do |result| %>
 <tr class="<%= result.status_text %>">
-  <td class="before"><a href="<%= result.url(before) %>">[before]</a></td>
-  <td class="after"><a href="<%= result.url(after) %>">[after]</a></td>
+  <td class="before">
+    <a href="<%= result.url(:before, before, cache) %>">[before]</a>
+  </td>
+  <td class="after">
+    <a href="<%= result.url(:after, after, cache) %>">[after]</a>
+  </td>
+  <td class="both">
+    <a href="/sidebyside<%= result.path %>">[both]</a>
+  </td>
  <td class="path"><%= result.path %></td>
  <td class="status"><%= result.link %></td>
 </tr>