sitediff 0.0.1 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,114 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'sitediff/cache'
4
+ require 'sitediff/config'
5
+ require 'sitediff/crawler'
6
+ require 'pathname'
7
+ require 'typhoeus'
8
+ require 'yaml'
9
+
10
+ class SiteDiff
11
+ class Config
12
+ ##
13
+ # SiteDiff Config Creator Object.
14
+ class Creator
15
##
# Creates a Creator object.
#
# The last URL given becomes the "after" root; the URL preceding it, when
# one was given, becomes the "before" root.
#
# @param debug
#   Debug flag, stored as-is.
# @param [Array] urls
#   One or two root URLs.
def initialize(debug, *urls)
  @debug = debug
  @config = nil
  # Reversed so the trailing URL is assigned first; @before stays nil when
  # only one URL was supplied.
  @after, @before = urls.reverse
end
23
+
24
##
# Determine if we're dealing with one or two URLs.
#
# @return [Hash]
#   Always has an 'after' entry; has a 'before' entry only when a "before"
#   URL is set.
def roots
  @roots = { 'after' => @after }.tap do |found|
    found['before'] = @before if @before
  end
end
31
+
32
##
# Build a config structure, write it out, and return the result of
# write_config.
#
# @param options
#   Options; options[:directory] is the directory used for the config and
#   the cache.
# @param block
#   Stored as the callback.
def create(options, &block)
  @callback = block
  @config = {}
  @dir = Pathname.new(options[:directory])

  # Paths are collected per key; each key gets its own Set on first access.
  @paths = Hash.new { |hash, key| hash[key] = Set.new }
  @cache = Cache.new(create: true, directory: @dir.to_s)
  %i[before after].each { |tag| @cache.write_tags << tag }

  build_config(options)
  write_config
end
47
+
48
##
# Build and populate the config object which is being created.
#
# Adds a { 'url' => ... } entry for each of "before" and "after" that has a
# root URL, then copies every allowed settings key out of the options.
#
# @param options
#   Hash-like options; keys are stringified before lookup.
def build_config(options)
  options = Config.stringify_keys(options)

  # "before" is written ahead of "after" so the keys keep that order.
  sites = roots
  %w[before after].each do |tag|
    @config[tag] = { 'url' => sites[tag] } if sites[tag]
  end

  # Only keys listed in Config::ALLOWED_SETTINGS_KEYS are copied.
  settings = @config['settings'] = {}
  Config::ALLOWED_SETTINGS_KEYS.each { |key| settings[key] = options[key] }
end
69
+
70
##
# Create a gitignore if we seem to be in git.
#
# "In git" means the given directory or one of its ancestors contains an
# entry named ".git". When that is not the case nothing is written.
#
# @param [Pathname] dir
#   Directory in which the .gitignore file is written.
def make_gitignore(dir)
  # Check if we're in git
  in_git = dir.realpath.to_enum(:ascend).any? { |d| d.+('.git').exist? }
  return unless in_git

  dir.+('.gitignore').open('w') do |f|
    # Strip only leading spaces/tabs. Matching \s here would also consume the
    # newline of the empty separator line and drop it from the output.
    f.puts <<-GITIGNORE.gsub(/^[ \t]+/, '')
      # Directories.
      diffs
      snapshot

      # Files.
      settings.yaml
      paths.txt
      failures.txt
    GITIGNORE
  end
end
91
+
92
##
# Returns the config directory.
#
# @return
#   The value of @dir (assigned in #create from options[:directory]).
def directory
  @dir
end
97
+
98
##
# Returns the path of the config file: the default config filename inside
# the config directory.
def config_file
  @dir.join(Config::DEFAULT_FILENAME)
end
103
+
104
##
# Writes the built config into the config file as YAML, after giving
# make_gitignore a chance to create a .gitignore in the config directory.
# TODO: Exclude default params before writing.
def write_config
  make_gitignore(@dir)
  cleaned = Config.remove_defaults(@config)
  config_file.open('w') do |file|
    file.puts(cleaned.to_yaml)
  end
end
112
+ end
113
+ end
114
+ end
@@ -0,0 +1,75 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'pathname'
4
+ require 'sitediff/config'
5
+
6
+ class SiteDiff
7
+ class Config
8
+ ##
9
+ # Preset helper.
10
+ class Preset
11
##
# Directory in which presets live: "presets" inside the parent of this
# file's directory.
#
# TODO: Move this outside "lib".
DIRECTORY = Pathname.new(File.dirname(__dir__)).join('presets').freeze
16
+
17
##
# Reads preset rules, loading each preset at most once while its loaded
# value is non-nil.
#
# @param [String] name
#   Preset name.
#
# @return [Hash]
#   A hash containing the preset's rules.
def self.read(name)
  @cache ||= {}
  return @cache[name] unless @cache[name].nil?

  # Called with the exception flag set so a missing preset aborts the load.
  exist?(name, true)
  @cache[name] = Config.load_conf(file(name))
end
36
+
37
##
# Get the names of all presets.
#
# Names are the basenames (without ".yaml") of the "*.yaml" files in
# DIRECTORY. The list is computed on the first call and reused afterwards.
#
# @return [Array]
#   All presets.
def self.all
  @all ||= Dir.glob(DIRECTORY + '*.yaml').map do |path|
    File.basename(path, '.yaml')
  end
end
54
+
55
##
# Checks whether a preset exists.
#
# @param name
#   Preset name.
# @param exception
#   When truthy, a missing preset raises instead of returning false.
#
# @return [Boolean]
#   Whether the preset's file exists.
#
# @raise [Config::InvalidConfig]
#   If the preset is missing and +exception+ is truthy.
def self.exist?(name, exception = false)
  return true if File.exist?(file(name))

  raise Config::InvalidConfig, "Preset not found: #{name}" if exception

  false
end
67
+
68
##
# Returns the path to a preset file: "<name>.yaml" inside DIRECTORY.
def self.file(name)
  DIRECTORY.join("#{name}.yaml")
end
73
+ end
74
+ end
75
+ end
@@ -0,0 +1,131 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'sitediff'
4
+ require 'sitediff/uriwrapper'
5
+ require 'addressable/uri'
6
+ require 'nokogiri'
7
+ require 'ostruct'
8
+ require 'set'
9
+
10
+ class SiteDiff
11
+ # SiteDiff Crawler.
12
+ class Crawler
13
# Open-ended record type; fetched_uri builds one per fetched page and hands
# it to the crawl callback.
class Info < OpenStruct
end

# Depth used when the caller does not pass one.
DEFAULT_DEPTH = 3
16
+
17
# Create a crawler with a base URL and immediately queue the base URL itself
# (relative path '') at the given depth.
#
# hydra, interval, whitelist, blacklist, curl_opts and debug are stored as
# given; the block is stored as the callback that fetched_uri invokes for
# each fetched page.
def initialize(hydra, base,
               interval,
               whitelist,
               blacklist,
               depth = DEFAULT_DEPTH,
               curl_opts = UriWrapper::DEFAULT_CURL_OPTS,
               debug = true,
               &block)
  # Fetching machinery.
  @hydra = hydra
  @curl_opts = curl_opts
  @interval = interval
  @debug = debug
  @callback = block

  # Crawl scope: the base is kept both raw and parsed.
  @base = base
  @base_uri = Addressable::URI.parse(base)
  @whitelist = whitelist
  @blacklist = blacklist

  # Relative paths seen so far.
  @found = Set.new

  add_uri('', depth)
end
39
+
40
# Handle a newly found relative URI: record it and queue a fetch for it,
# unless it has been seen before.
def add_uri(rel, depth)
  # Set#add? returns nil when rel is already recorded.
  return unless @found.add?(rel)

  request = UriWrapper.new(@base + rel, @curl_opts, @debug)
  request.queue(@hydra) { |res| fetched_uri(rel, depth, res) }
end
51
+
52
# Handle the fetch of a URI.
#
# Logs and stops on an errored or content-less response. Otherwise parses the
# page, reports it to the callback (after the configured delay) and, while
# depth allows, queues every acceptable link found on the page.
def fetched_uri(rel, depth, res)
  if res.error
    SiteDiff.log(res.error, :error)
    return
  end
  unless res.content
    SiteDiff.log('Response is missing content. Treating as an error.', :error)
    return
  end

  page_uri = Addressable::URI.parse(@base + rel)
  document = Nokogiri::HTML(res.content)
  info = Info.new(relative: rel, uri: page_uri, read_result: res,
                  document: document)

  # Insert delay to limit fetching rate
  unless @interval == 0
    SiteDiff.log("Waiting #{@interval} milliseconds.", :info)
    sleep(@interval / 1000.0)
  end
  @callback.call(info)

  return if depth < 1

  # Links are resolved against this page, filtered, then made relative.
  absolute = find_links(document).map { |href| resolve_link(page_uri, href) }
  filter_links(absolute.compact)
    .map { |uri| relativize_link(uri) }
    .each { |found_rel| add_uri(found_rel, depth - 1) unless @found.include?(found_rel) }
end
96
+
97
# Resolve a potentially-relative link against base. An invalid link is
# logged as a warning and yields nil.
def resolve_link(base, rel)
  begin
    base + rel
  rescue Addressable::URI::InvalidURIError
    SiteDiff.log("skipped invalid URL: '#{rel}' (at #{base})", :warning)
    nil
  end
end
104
+
105
# Make a link relative to @base_uri by dropping as many leading characters
# from its path as the base path is long.
def relativize_link(uri)
  prefix_length = @base_uri.path.length
  uri.path[prefix_length..-1]
end
109
+
110
# Return a list of string links found on a page: the href value of every
# anchor element that has one.
def find_links(doc)
  anchors = doc.xpath('//a[@href]')
  anchors.map { |anchor| anchor['href'] }
end
114
+
115
# Filter out links we don't want. Links passed in are absolute URIs.
#
# A link is kept when it has the base host, its path starts with the base
# path, and it is either whitelisted or not blacklisted. Links dropped by
# the blacklist are logged.
def filter_links(uris)
  uris.select do |uri|
    under_base = uri.host == @base_uri.host &&
                 uri.path.start_with?(@base_uri.path)
    next false unless under_base

    # A missing list never matches.
    whitelisted = @whitelist.match(uri.path) unless @whitelist.nil?
    blacklisted = @blacklist.match(uri.path) unless @blacklist.nil?

    SiteDiff.log "Ignoring blacklisted URL #{uri.path}", :info if blacklisted && !whitelisted

    whitelisted || !blacklisted
  end
end
130
+ end
131
+ end
@@ -1,37 +1,82 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'sitediff'
1
4
  require 'diffy'
2
5
  require 'erb'
3
6
  require 'rainbow'
7
+ require 'digest'
4
8
 
5
9
  class SiteDiff
10
+ # SiteDiff Diff Object.
6
11
  module Diff
7
12
  module_function
8
13
 
14
##
# Generates HTML diff.
#
# @return
#   The diff rendered as HTML, or nil when the diff is empty.
def html_diffy(before_html, after_html)
  diff = Diffy::Diff.new(before_html, after_html)
  # An empty diff has no first element.
  return nil unless diff.first

  diff.to_s(:html)
end
21
+
22
##
# Generates a description about encoding: a text blurb naming the charset
# when an encoding is given, a binary blurb otherwise.
def encoding_blurb(encoding)
  return 'Binary content returned' unless encoding

  "Text content returned - charset #{encoding}"
end
31
+
32
##
# Computes diff of binary files using MD5 hashes.
#
# When either side has an encoding, the diff compares the two encoding
# blurbs instead. Identical content with no encodings yields nil.
def binary_diffy(before, after, before_encoding, after_encoding)
  if before_encoding || after_encoding
    blurbs = [before_encoding, after_encoding].map { |enc| encoding_blurb(enc) }
    return Diffy::Diff.new(*blurbs).to_s(:html)
  end
  return nil if before == after

  summaries = [before, after].map do |content|
    "Binary content returned md5: #{Digest::MD5.hexdigest(content)}"
  end
  Diffy::Diff.new(*summaries).to_s(:html)
end
14
47
 
48
##
# Generates diff for CLI output, with 3 lines of context and with color
# when Rainbow is enabled.
def terminal_diffy(before_html, after_html)
  format_args = Rainbow.enabled ? [:color] : []
  diff = Diffy::Diff.new(before_html, after_html, context: 3)
  diff.to_s(*format_args)
end
21
56
 
22
- def generate_html_report(results, before, after)
23
- erb_path = File.join(SiteDiff::FILES_DIR, 'html_report.html.erb')
24
- report_html = ERB.new(File.read(erb_path)).result(binding)
25
- return report_html
57
##
# Generates an HTML report by rendering the 'report.html.erb' template from
# SiteDiff::FILES_DIR with this method's binding, so the template can see
# every parameter.
# TODO: Generate the report in SiteDiff::Report instead.
def generate_html(results, before, after, cache, relative = false)
  erb_path = File.join(SiteDiff::FILES_DIR, 'report.html.erb')
  template = File.read(erb_path)
  ERB.new(template).result(binding)
end
27
64
 
28
- def generate_diff_output(result)
65
##
# Generates diff output for a single result by rendering the 'diff.html.erb'
# template from SiteDiff::FILES_DIR with this method's binding, so the
# template can see both parameters.
def generate_diff_output(result, relative = false)
  erb_path = File.join(SiteDiff::FILES_DIR, 'diff.html.erb')
  template = File.read(erb_path)
  ERB.new(template).result(binding)
end
32
71
 
33
- def css
34
- File.read(File.join(SiteDiff::FILES_DIR, 'sitediff.css'))
72
##
# Set configuration for Diffy.
#
# Normalizes Diffy's default :diff options to an array and, when the config
# asks for whitespace to be ignored, makes sure '-w' is present exactly once.
#
# @param config
#   Object responding to +ignore_whitespace+.
#
# @return [Array]
#   The options stored back into Diffy::Diff.default_options[:diff].
def diff_config(config)
  # Array() wraps a single option and turns a missing one into an empty list.
  diff_options = Array(Diffy::Diff.default_options[:diff])

  # ignore_whitespace option. Appended only when absent, so calling this
  # more than once does not pile up duplicate flags.
  if config.ignore_whitespace && !diff_options.include?('-w')
    diff_options += ['-w']
  end

  Diffy::Diff.default_options[:diff] = diff_options
end
36
81
  end
37
82
  end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
class SiteDiff
  # SiteDiff's own exception class; a RuntimeError subclass with no added
  # behavior.
  class SiteDiffException < RuntimeError
  end
end
@@ -0,0 +1,76 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'sitediff/uriwrapper'
4
+ require 'typhoeus'
5
+
6
+ class SiteDiff
7
+ # SiteDiff Data Fetcher.
8
+ # TODO: Rename this to Fetcher.
9
+ class Fetch
10
# Cache is a cache object, see sitediff/cache
# Paths is a list of sub-paths
# Tags is a hash of tag names => base URLs.
#
# interval, concurrency and debug are stored as given; curl_opts falls back
# to UriWrapper::DEFAULT_CURL_OPTS when nil or false.
def initialize(cache,
               paths,
               interval,
               concurrency = 3,
               curl_opts = nil,
               debug = true,
               **tags)
  # What to fetch.
  @paths = paths
  @tags = tags

  # How to fetch it.
  @cache = cache
  @concurrency = concurrency
  @interval = interval
  @curl_opts = curl_opts || UriWrapper::DEFAULT_CURL_OPTS
  @debug = debug
end
28
+
29
# Fetch all the paths, once per tag.
# When a path has been fetched for every tag, block will be called with the
# path, and a hash of tag => UriWrapper::ReadResult objects.
#
# Returns whatever the hydra's run returns.
def run(&block)
  @callback = block
  @hydra = Typhoeus::Hydra.new(max_concurrency: @concurrency)
  # Queue everything first; the hydra is only run afterwards.
  @paths.each do |path|
    queue_path(path)
  end
  @hydra.run
end
38
+
39
+ private
40
+
41
# Queue a path for fetching.
#
# For each tag the result comes from the cache when present, is an error
# result when there is no base URL to fetch from, and is otherwise fetched
# through the hydra and stored in the cache. process_results is called
# after every tag's result is recorded.
def queue_path(path)
  results = {}

  @tags.each do |tag, base|
    cached = @cache.get(tag, path)
    if cached
      results[tag] = cached
      process_results(path, results)
      next
    end

    unless base
      # We only have the cache, but this item isn't cached!
      results[tag] = UriWrapper::ReadResult.error('Not cached')
      process_results(path, results)
      next
    end

    request = UriWrapper.new(base + path, @curl_opts, @debug)
    request.queue(@hydra) do |fetched|
      # Insert delay to limit fetching rate
      unless @interval == 0
        SiteDiff.log("Waiting #{@interval} milliseconds.", :info)
        sleep(@interval / 1000.0)
      end
      @cache.set(tag, path, fetched)
      results[tag] = fetched
      process_results(path, results)
    end
  end
end
68
+
69
# Process fetch results: invoke the callback with the path and its results,
# but only once there is a result for every tag.
def process_results(path, results)
  return if results.size != @tags.size

  @callback.call(path, results)
end
75
+ end
76
+ end