sitediff 0.0.1 → 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,13 +1,18 @@
1
+ require 'sitediff/exception'
2
+ require 'sitediff/sanitize'
3
+ require 'pathname'
1
4
  require 'yaml'
2
5
 
3
6
  class SiteDiff
4
7
  class Config
8
+ DEFAULT_FILENAME = 'sitediff.yaml'
5
9
 
6
10
  # keys allowed in configuration files
7
- CONF_KEYS = Sanitize::TOOLS.values.flatten(1) +
11
+ CONF_KEYS = Sanitizer::TOOLS.values.flatten(1) +
8
12
  %w[paths before after before_url after_url includes]
9
13
 
10
- class InvalidConfig < Exception; end
14
+ class InvalidConfig < SiteDiffException; end
15
+ class ConfigNotFound < SiteDiffException; end
11
16
 
12
17
  # Takes a Hash and normalizes it to the following form by merging globals
13
18
  # into before and after. A normalized config Hash looks like this:
@@ -27,7 +32,7 @@ class SiteDiff
27
32
  # selector: body
28
33
  #
29
34
  def self.normalize(conf)
30
- tools = Sanitize::TOOLS
35
+ tools = Sanitizer::TOOLS
31
36
 
32
37
  # merge globals
33
38
  %w[before after].each do |pos|
@@ -67,7 +72,7 @@ class SiteDiff
67
72
  next
68
73
  end
69
74
  result[pos] = first[pos].merge!(second[pos]) do |key, a, b|
70
- if Sanitize::TOOLS[:array].include? key # rule 2a
75
+ if Sanitizer::TOOLS[:array].include? key # rule 2a
71
76
  result[pos][key] = (a || []) + (b|| [])
72
77
  else
73
78
  result[pos][key] = a || b # rule 2b
@@ -77,9 +82,39 @@ class SiteDiff
77
82
  result
78
83
  end
79
84
 
80
- def initialize(files)
85
+ # Search for a config file. If found, change to the containing directory,
86
+ # and return an array of config files found.
87
def self.search
  # Walk from the current working directory up through each parent.
  Pathname.pwd.ascend do |candidate|
    # The config may sit directly in the directory or in its "sitediff" subdir.
    %w[. sitediff].each do |sub|
      holder = candidate + sub
      next unless (holder + DEFAULT_FILENAME).exist?

      # Found one: make its directory the working directory and report the
      # file by its bare name, which is now valid relative to the new cwd.
      Dir.chdir(holder.to_s)
      return [DEFAULT_FILENAME]
    end

    # A .git or .svn entry marks the top of the project; stop climbing there.
    return [] if %w[.git .svn].any? { |marker| (candidate + marker).exist? }
  end

  # Reached the filesystem root without finding anything.
  []
end
105
+
106
def initialize(files, opts = {})
  # Build the configuration from a list of config files.
  #
  # files - Array of config file paths; may be empty.
  # opts  - Hash of options. :search => true asks Config.search to look for
  #         a config file when none was given (this may change the working
  #         directory to the one containing the file).
  #
  # Raises ConfigNotFound when no file was given and none could be located,
  # and InvalidConfig when a listed file does not exist.

  # Start from an empty config; every file is merged into it below.
  @config = {'paths' => [], 'before' => {}, 'after' => {} }

  files = Config.search if files.empty? && opts[:search]
  # Fall back to the default file in the current directory.
  # File.exist? is used because File.exists? was deprecated and then removed
  # in Ruby 3.2.
  files = [DEFAULT_FILENAME] if files.empty? && File.exist?(DEFAULT_FILENAME)
  raise ConfigNotFound, "No configuration file found." if files.empty?

  files.each do |file|
    unless File.exist?(file)
      raise InvalidConfig, "Missing config file %s." % File.expand_path(file)
    end
    @config = Config::merge(@config, Config::load_conf(file))
  end
end
@@ -99,8 +134,11 @@ class SiteDiff
99
134
  end
100
135
 
101
136
  # Checks if the configuration is usable for diff-ing.
102
- def validate
103
- raise InvalidConfig, "Undefined 'before' base URL." unless before['url']
137
def validate(opts = {})
  # Raises InvalidConfig unless the configuration has what a diff needs:
  # an 'after' URL, a non-empty list of paths and, unless the caller passes
  # :need_before => false, a 'before' URL.
  settings = { :need_before => true }.merge(opts)

  if settings[:need_before] && !before['url']
    raise InvalidConfig, "Undefined 'before' base URL."
  end
  raise InvalidConfig, "Undefined 'after' base URL." unless after['url']
  raise InvalidConfig, "Undefined 'paths'." if !paths || paths.empty?
end
@@ -114,7 +152,7 @@ class SiteDiff
114
152
 
115
153
  # reads a YAML file and raises an InvalidConfig if the file is not valid.
116
154
  def self.load_raw_yaml(file)
117
- SiteDiff::log "Reading config file: #{file}"
155
+ SiteDiff::log "Reading config file: #{Pathname.new(file).expand_path}"
118
156
  conf = YAML.load_file(file) || {}
119
157
  unless conf.is_a? Hash
120
158
  raise InvalidConfig, "Invalid configuration file: '#{file}'"
@@ -149,6 +187,5 @@ class SiteDiff
149
187
  end
150
188
  conf
151
189
  end
152
-
153
190
  end
154
191
  end
@@ -0,0 +1,122 @@
1
+ require 'sitediff/cache'
2
+ require 'sitediff/config'
3
+ require 'sitediff/crawler'
4
+ require 'sitediff/rules'
5
+ require 'pathname'
6
+ require 'typhoeus'
7
+ require 'yaml'
8
+
9
class SiteDiff
  class Config
    # Builds a sitediff configuration by crawling one or two sites, caching
    # every page fetched along the way, and writing the resulting config file
    # (plus a .gitignore when inside a git checkout) into a target directory.
    class Creator
      # urls - one or two root URLs. The last one is the 'after' site; the
      #        one preceding it, if any, is the 'before' site.
      # The block parameter is accepted but not used here; the progress
      # callback is the block given to #create.
      def initialize(*urls, &block)
        @after = urls.pop
        @before = urls.pop # May be nil
      end

      # Hash of tag => root URL. :after is always present; :before only when
      # a 'before' URL was given. Memoized, since the URLs are fixed at
      # construction time.
      def roots
        @roots ||= begin
          r = { :after => @after }
          r[:before] = @before if @before
          r
        end
      end

      # Build a config structure by crawling, then write it to the config
      # file inside opts[:directory].
      #
      # opts  - Hash with :directory (required), :depth, :rules and
      #         :rules_disabled.
      # block - optional; called with (tag, info) for every crawled page.
      def create(opts, &block)
        @config = {}
        @callback = block

        # Handle options
        @dir = Pathname.new(opts[:directory])
        @depth = opts[:depth]
        @rules = Rules.new(@config, opts[:rules_disabled]) if opts[:rules]

        # Create the dir. Must go before cache initialization!
        @dir.mkpath unless @dir.directory?

        # Setup instance vars
        @paths = Hash.new { |h, k| h[k] = Set.new }
        @cache = Cache.new(:file => @dir.+(Cache::DEFAULT_FILENAME).to_s,
                           :create => true)
        @cache.write_tags << :before << :after

        build_config
        write_config
      end

      # Fill @config: root URLs per tag, rules (if enabled) and the sorted
      # union of all paths found while crawling.
      def build_config
        %w[before after].each do |tag|
          next unless u = roots[tag.to_sym]
          @config[tag] = {'url' => u}
        end

        crawl(@depth)
        @cache.close
        @rules.add_config if @rules

        # nil.to_a is [], so this also copes with a crawl that found nothing.
        @config['paths'] = @paths.values.reduce(&:|).to_a.sort
      end

      # Crawl every root concurrently on one hydra, feeding each fetched page
      # to #crawled_path.
      def crawl(depth = nil)
        # Passing nil through would override the crawler's own default and
        # fail on its `depth >= 0` check, so fall back explicitly.
        depth ||= Crawler::DEFAULT_DEPTH

        hydra = Typhoeus::Hydra.new(max_concurrency: 10)
        roots.each do |tag, u|
          Crawler.new(hydra, u, depth) do |info|
            crawled_path(tag, info)
          end
        end
        hydra.run
      end

      # Normalize a crawled relative path: the empty path (the site root)
      # becomes '/'; anything else is returned unchanged. `tag` is accepted
      # but not used.
      # NOTE: paths differing only by a trailing slash are NOT merged here.
      def canonicalize(tag, path)
        path.empty? ? '/' : path
      end

      # Record one crawled page: notify the callback, remember its path,
      # cache the response and let the rules inspect it.
      def crawled_path(tag, info)
        path = canonicalize(tag, info.relative)
        res = info.read_result

        # #create may be called without a block.
        @callback.call(tag, info) if @callback
        @paths[tag] << path
        @cache.set(tag, path, res)

        # If single-site, cache after as before!
        @cache.set(:before, path, res) unless roots[:before]

        @rules.handle_page(tag, res.content, info.document) if @rules && !res.error
      end

      # Create a gitignore if we seem to be in git
      def make_gitignore(dir)
        # Check if we're in git
        return unless dir.realpath.to_enum(:ascend).any? { |d| d.+('.git').exist? }

        dir.+('.gitignore').open('w') do |f|
          # The gsub strips the heredoc's leading indentation from each line.
          f.puts <<-EOF.gsub(/^\s+/, '')
            output
            cache.db
            cache.db.db
          EOF
        end
      end

      # Pathname of the config file inside the target directory.
      def config_file
        @dir + Config::DEFAULT_FILENAME
      end

      # Turn a config structure into a config file
      def write_config
        make_gitignore(@dir)
        config_file.open('w') { |f| f.puts @config.to_yaml }
      end
    end
  end
end
@@ -0,0 +1,95 @@
1
+ require 'sitediff'
2
+ require 'sitediff/uriwrapper'
3
+ require 'addressable/uri'
4
+ require 'nokogiri'
5
+ require 'ostruct'
6
+ require 'set'
7
+
8
class SiteDiff
  # Crawls a site starting at a base URL. Every successfully fetched page is
  # reported to the block given to #initialize; links on the same host and
  # under the base path are followed until the depth budget runs out.
  class Crawler
    # Value handed to the callback: relative, uri, read_result and document.
    class Info < OpenStruct; end

    DEFAULT_DEPTH = 3

    # Create a crawler with a base URL
    #
    # hydra - object that wrapper requests are queued on.
    # base  - base URL string; relative paths are appended to it.
    # depth - how many levels of links to follow below the base page.
    #         nil is treated as DEFAULT_DEPTH.
    # block - called with an Info for every page fetched.
    def initialize(hydra, base, depth = DEFAULT_DEPTH, &block)
      @hydra = hydra
      @base_uri = Addressable::URI.parse(base)
      @base = base
      @found = Set.new
      @callback = block

      # An explicit nil would otherwise fail on the `depth >= 0` check later.
      add_uri('', depth || DEFAULT_DEPTH)
    end

    # Handle a newly found relative URI: queue it for fetching unless it has
    # been seen before.
    def add_uri(rel, depth)
      return if @found.include? rel
      @found << rel

      wrapper = UriWrapper.new(@base + rel)
      wrapper.queue(@hydra) do |res|
        fetched_uri(rel, depth, res)
      end
    end

    # Handle the fetch of a URI
    def fetched_uri(rel, depth, res)
      return unless res.content # Ignore errors
      return unless depth >= 0

      base = Addressable::URI.parse(@base + rel)
      doc = Nokogiri::HTML(res.content)

      # Call the callback
      info = Info.new(
        :relative => rel,
        :uri => base,
        :read_result => res,
        :document => doc,
      )
      @callback[info]

      # Links from this page would be crawled at depth - 1. Below zero they
      # would be fetched only to be thrown away, and marking them as found
      # would hide them from a later, shallower discovery, so stop here.
      return unless depth >= 1

      # Find links
      links = find_links(doc)
      uris = links.map { |l| resolve_link(base, l) }.compact
      uris = filter_links(uris)

      # Make them relative and queue them in turn; add_uri skips any that
      # were already found.
      uris.map { |u| relativize_link(u) }.each do |r|
        add_uri(r, depth - 1)
      end
    end

    # Resolve a potentially-relative link. Return nil on error.
    def resolve_link(base, rel)
      base + rel
    rescue Addressable::URI::InvalidURIError
      SiteDiff.log "skipped invalid URL: '#{rel}'", :warn
      nil
    end

    # Make a link relative to @base_uri by dropping the base path prefix.
    def relativize_link(uri)
      uri.path.slice(@base_uri.path.length, uri.path.length)
    end

    # Return a list of string links found on a page.
    def find_links(doc)
      doc.xpath('//a[@href]').map { |e| e['href'] }
    end

    # Filter out links we don't want. Links passed in are absolute URIs.
    # Keeps only those on the base host whose path starts with the base path.
    def filter_links(uris)
      uris.select do |u|
        u.host == @base_uri.host && u.path.start_with?(@base_uri.path)
      end
    end
  end
end
@@ -1,3 +1,4 @@
1
+ require 'sitediff'
1
2
  require 'diffy'
2
3
  require 'erb'
3
4
  require 'rainbow'
@@ -19,7 +20,7 @@ class SiteDiff
19
20
  to_s(*args)
20
21
  end
21
22
 
22
- def generate_html_report(results, before, after)
23
+ def generate_html_report(results, before, after, cache)
23
24
  erb_path = File.join(SiteDiff::FILES_DIR, 'html_report.html.erb')
24
25
  report_html = ERB.new(File.read(erb_path)).result(binding)
25
26
  return report_html
@@ -0,0 +1,3 @@
1
class SiteDiff
  # Base class for the errors SiteDiff raises. It derives from StandardError
  # rather than Exception so that a plain `rescue` (which only handles
  # StandardError) catches it.
  class SiteDiffException < StandardError; end
end
@@ -0,0 +1,55 @@
1
+ require 'sitediff/uriwrapper'
2
+ require 'typhoeus'
3
+
4
class SiteDiff
  # Fetches a set of paths once for every tag, consulting the cache before
  # going to the network, and reports each path once all tags have a result.
  class Fetch
    # cache - cache object, see sitediff/cache
    # paths - list of sub-paths
    # tags  - hash of tag names => base URLs
    def initialize(cache, paths, tags)
      @cache, @paths, @tags = cache, paths, tags
    end

    # Fetch all the paths, once per tag.
    # As soon as a path has a result for every tag, the block is called with
    # the path and a hash of tag => UriWrapper::ReadResult objects.
    def run(&block)
      @callback = block
      @hydra = Typhoeus::Hydra.new(max_concurrency: 3)
      @paths.each { |path| queue_path(path) }
      @hydra.run
    end

    private

    # Queue a path for fetching
    def queue_path(path)
      collected = {}

      # Store one tag's outcome, then report the path if that completed it.
      record = lambda do |tag, outcome|
        collected[tag] = outcome
        process_results(path, collected)
      end

      @tags.each do |tag, base|
        cached = @cache.get(tag, path)

        if cached
          record.call(tag, cached)
        elsif base
          UriWrapper.new(base + path).queue(@hydra) do |fetched|
            @cache.set(tag, path, fetched)
            record.call(tag, fetched)
          end
        else
          # We only have the cache, but this item isn't cached!
          record.call(tag, UriWrapper::ReadResult.error("Not cached"))
        end
      end
    end

    # Invoke the callback once every tag has contributed a result.
    def process_results(path, results)
      @callback.call(path, results) if results.size == @tags.size
    end
  end
end
@@ -11,14 +11,22 @@
11
11
  <body>
12
12
  <div class="sitediff">
13
13
  <div class="legend">
14
- <strong>before</strong> (base url): <a href="<%=before%>"><%=before%></a> |
15
- <strong>after </strong> (base url): <a href="<%=after%>" ><%=after %></a>
14
+ <%
15
+ tags = %w[before after]
16
+ tags.each do |tag| %>
17
+ <% if tags.first != tag %> | <% end %>
18
+ <% notes = ['base url']
19
+ notes << 'cached' if cache.read_tags.include?(tag.to_sym) %>
20
+ <strong><%= tag %></strong> (<%= notes.join(', ') %>):
21
+ <a href="<%= eval(tag) %>"><%= eval(tag) %></a>
22
+ <% end %>
16
23
  </div>
17
24
  <table class="results">
18
25
 
19
26
  <colgroup>
20
27
  <col class="before-col">
21
28
  <col class="after-col">
29
+ <col class="both-col">
22
30
  <col class="path-col">
23
31
  <col class="diff-stat-col">
24
32
  </colgroup>
@@ -27,6 +35,7 @@
27
35
  <tr>
28
36
  <th> Before </th>
29
37
  <th> After </th>
38
+ <th> Both </th>
30
39
  <th> Path </th>
31
40
  <th> Status </th>
32
41
  </tr>
@@ -34,8 +43,15 @@
34
43
 
35
44
  <% results.each do |result| %>
36
45
  <tr class="<%= result.status_text %>">
37
- <td class="before"><a href="<%= result.url(before) %>">[before]</a></td>
38
- <td class="after"><a href="<%= result.url(after) %>">[after]</a></td>
46
+ <td class="before">
47
+ <a href="<%= result.url(:before, before, cache) %>">[before]</a>
48
+ </td>
49
+ <td class="after">
50
+ <a href="<%= result.url(:after, after, cache) %>">[after]</a>
51
+ </td>
52
+ <td class="both">
53
+ <a href="/sidebyside<%= result.path %>">[both]</a>
54
+ </td>
39
55
  <td class="path"><%= result.path %></td>
40
56
  <td class="status"><%= result.link %></td>
41
57
  </tr>