sitediff 0.0.1 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,114 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'sitediff/cache'
4
+ require 'sitediff/config'
5
+ require 'sitediff/crawler'
6
+ require 'pathname'
7
+ require 'typhoeus'
8
+ require 'yaml'
9
+
10
+ class SiteDiff
11
+ class Config
12
+ ##
13
+ # SiteDiff Config Creator Object.
14
+ class Creator
15
+ ##
16
+ # Creates a Creator object.
17
+ def initialize(debug, *urls)
18
+ @config = nil
19
+ @after = urls.pop
20
+ @before = urls.pop # May be nil
21
+ @debug = debug
22
+ end
23
+
24
+ ##
25
+ # Returns the root URLs keyed by tag; 'before' is included only if given.
26
+ def roots
27
+ @roots = { 'after' => @after }
28
+ @roots['before'] = @before if @before
29
+ @roots
30
+ end
31
+
32
+ ##
33
+ # Build a config structure, return it.
34
+ def create(options, &block)
35
+ @config = {}
36
+ @callback = block
37
+ @dir = Pathname.new(options[:directory])
38
+
39
+ # Setup instance vars
40
+ @paths = Hash.new { |h, k| h[k] = Set.new }
41
+ @cache = Cache.new(directory: @dir.to_s, create: true)
42
+ @cache.write_tags << :before << :after
43
+
44
+ build_config options
45
+ write_config
46
+ end
47
+
48
+ ##
49
+ # Build and populate the config object which is being created.
50
+ #
51
+ # @param [Hash] options
52
+ # Options for the new config, keyed by setting name.
53
+ def build_config(options)
54
+ options = Config.stringify_keys options
55
+
56
+ # Build config for "before" and "after".
57
+ %w[before after].each do |tag|
58
+ next unless (url = roots[tag])
59
+
60
+ @config[tag] = { 'url' => url }
61
+ end
62
+
63
+ # Build other settings.
64
+ @config['settings'] = {}
65
+ Config::ALLOWED_SETTINGS_KEYS.each do |key|
66
+ @config['settings'][key] = options[key]
67
+ end
68
+ end
69
+
70
+ ##
71
+ # Create a gitignore if we seem to be in git.
72
+ def make_gitignore(dir)
73
+ # Check if we're in git
74
+ unless dir.realpath.to_enum(:ascend).any? { |d| d.+('.git').exist? }
75
+ return
76
+ end
77
+
78
+ dir.+('.gitignore').open('w') do |f|
79
+ f.puts <<-GITIGNORE.gsub(/^\s+/, '')
80
+ # Directories.
81
+ diffs
82
+ snapshot
83
+
84
+ # Files.
85
+ settings.yaml
86
+ paths.txt
87
+ failures.txt
88
+ GITIGNORE
89
+ end
90
+ end
91
+
92
+ ##
93
+ # Returns the name of the config directory.
94
+ def directory
95
+ @dir
96
+ end
97
+
98
+ ##
99
+ # Returns the name of the config file.
100
+ def config_file
101
+ @dir + Config::DEFAULT_FILENAME
102
+ end
103
+
104
+ ##
105
+ # Writes the built config into the config file.
106
+ # TODO: Exclude default params before writing.
107
+ def write_config
108
+ make_gitignore(@dir)
109
+ data = Config.remove_defaults(@config)
110
+ config_file.open('w') { |f| f.puts data.to_yaml }
111
+ end
112
+ end
113
+ end
114
+ end
@@ -0,0 +1,75 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'pathname'
4
+ require 'sitediff/config'
5
+
6
+ class SiteDiff
7
+ class Config
8
+ ##
9
+ # Preset helper.
10
+ class Preset
11
+ ##
12
+ # Directory in which presets live.
13
+ #
14
+ # TODO: Move this outside "lib".
15
+ DIRECTORY = (Pathname.new(__dir__).dirname + 'presets').freeze
16
+
17
+ ##
18
+ # Reads preset rules.
19
+ #
20
+ # @param [String] name
21
+ # Name of the preset to read.
22
+ #
23
+ # @return [Hash]
24
+ # A hash containing the preset's rules.
25
+ def self.read(name)
26
+ @cache = {} if @cache.nil?
27
+
28
+ # Load and cache preset config.
29
+ if @cache[name].nil?
30
+ exist? name, true
31
+ @cache[name] = Config.load_conf file(name)
32
+ end
33
+
34
+ @cache[name]
35
+ end
36
+
37
+ ##
38
+ # Get the names of all available presets.
39
+ #
40
+ # @return [Array]
41
+ # All presets.
42
+ def self.all
43
+ # Load and cache preset names.
44
+ if @all.nil?
45
+ @all = []
46
+ pattern = DIRECTORY + '*.yaml'
47
+ Dir.glob(pattern) do |file|
48
+ @all << File.basename(file, '.yaml')
49
+ end
50
+ end
51
+
52
+ @all
53
+ end
54
+
55
+ ##
56
+ # Checks whether a preset exists.
57
+ def self.exist?(name, exception = false)
58
+ result = File.exist? file(name)
59
+
60
+ # Raise an exception, if required.
61
+ if exception && !result
62
+ raise Config::InvalidConfig, "Preset not found: #{name}"
63
+ end
64
+
65
+ result
66
+ end
67
+
68
+ ##
69
+ # Returns the path to a preset file.
70
+ def self.file(name)
71
+ DIRECTORY + "#{name}.yaml"
72
+ end
73
+ end
74
+ end
75
+ end
@@ -0,0 +1,131 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'sitediff'
4
+ require 'sitediff/uriwrapper'
5
+ require 'addressable/uri'
6
+ require 'nokogiri'
7
+ require 'ostruct'
8
+ require 'set'
9
+
10
+ class SiteDiff
11
+ # SiteDiff Crawler.
12
+ class Crawler
13
+ class Info < OpenStruct; end
14
+
15
+ DEFAULT_DEPTH = 3
16
+
17
+ # Create a crawler with a base URL
18
+ def initialize(hydra, base,
19
+ interval,
20
+ whitelist,
21
+ blacklist,
22
+ depth = DEFAULT_DEPTH,
23
+ curl_opts = UriWrapper::DEFAULT_CURL_OPTS,
24
+ debug = true,
25
+ &block)
26
+ @hydra = hydra
27
+ @base_uri = Addressable::URI.parse(base)
28
+ @base = base
29
+ @interval = interval
30
+ @whitelist = whitelist
31
+ @blacklist = blacklist
32
+ @found = Set.new
33
+ @callback = block
34
+ @curl_opts = curl_opts
35
+ @debug = debug
36
+
37
+ add_uri('', depth)
38
+ end
39
+
40
+ # Handle a newly found relative URI
41
+ def add_uri(rel, depth)
42
+ return if @found.include? rel
43
+
44
+ @found << rel
45
+
46
+ wrapper = UriWrapper.new(@base + rel, @curl_opts, @debug)
47
+ wrapper.queue(@hydra) do |res|
48
+ fetched_uri(rel, depth, res)
49
+ end
50
+ end
51
+
52
+ # Handle the fetch of a URI
53
+ def fetched_uri(rel, depth, res)
54
+ if res.error
55
+ SiteDiff.log(res.error, :error)
56
+ return
57
+ elsif !res.content
58
+ SiteDiff.log('Response is missing content. Treating as an error.', :error)
59
+ return
60
+ end
61
+
62
+ base = Addressable::URI.parse(@base + rel)
63
+ doc = Nokogiri::HTML(res.content)
64
+
65
+ # Call the callback
66
+ info = Info.new(
67
+ relative: rel,
68
+ uri: base,
69
+ read_result: res,
70
+ document: doc
71
+ )
72
+ # Insert delay to limit fetching rate
73
+ if @interval != 0
74
+ SiteDiff.log("Waiting #{@interval} milliseconds.", :info)
75
+ sleep(@interval / 1000.0)
76
+ end
77
+ @callback[info]
78
+
79
+ return unless depth >= 1
80
+
81
+ # Find links
82
+ links = find_links(doc)
83
+ uris = links.map { |l| resolve_link(base, l) }.compact
84
+ uris = filter_links(uris)
85
+
86
+ # Make them relative
87
+ rels = uris.map { |u| relativize_link(u) }
88
+
89
+ # Queue them in turn
90
+ rels.each do |r|
91
+ next if @found.include? r
92
+
93
+ add_uri(r, depth - 1)
94
+ end
95
+ end
96
+
97
+ # Resolve a potentially-relative link. Return nil on error.
98
+ def resolve_link(base, rel)
99
+ base + rel
100
+ rescue Addressable::URI::InvalidURIError
101
+ SiteDiff.log "skipped invalid URL: '#{rel}' (at #{base})", :warning
102
+ nil
103
+ end
104
+
105
+ # Make a link relative to @base_uri
106
+ def relativize_link(uri)
107
+ uri.path.slice(@base_uri.path.length, uri.path.length)
108
+ end
109
+
110
+ # Return a list of string links found on a page.
111
+ def find_links(doc)
112
+ doc.xpath('//a[@href]').map { |e| e['href'] }
113
+ end
114
+
115
+ # Filter out links we don't want. Links passed in are absolute URIs.
116
+ def filter_links(uris)
117
+ uris.find_all do |u|
118
+ is_sub_uri = (u.host == @base_uri.host) &&
119
+ u.path.start_with?(@base_uri.path)
120
+ next unless is_sub_uri
121
+
122
+ is_whitelisted = @whitelist.nil? ? false : @whitelist.match(u.path)
123
+ is_blacklisted = @blacklist.nil? ? false : @blacklist.match(u.path)
124
+ if is_blacklisted && !is_whitelisted
125
+ SiteDiff.log "Ignoring blacklisted URL #{u.path}", :info
126
+ end
127
+ is_whitelisted || !is_blacklisted
128
+ end
129
+ end
130
+ end
131
+ end
@@ -1,37 +1,82 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'sitediff'
1
4
  require 'diffy'
2
5
  require 'erb'
3
6
  require 'rainbow'
7
+ require 'digest'
4
8
 
5
9
  class SiteDiff
10
+ # SiteDiff Diff Object.
6
11
  module Diff
7
12
  module_function
8
13
 
14
+ ##
15
+ # Generates HTML diff.
9
16
  def html_diffy(before_html, after_html)
10
17
  diff = Diffy::Diff.new(before_html, after_html)
11
- diff.first ? # Is it non-empty?
12
- diff.to_s(:html) : nil
18
+ # If the diff is non-empty, convert it to string.
19
+ diff.first ? diff.to_s(:html) : nil
20
+ end
21
+
22
+ ##
23
+ # Generates a description about encoding.
24
+ def encoding_blurb(encoding)
25
+ if encoding
26
+ "Text content returned - charset #{encoding}"
27
+ else
28
+ 'Binary content returned'
29
+ end
30
+ end
31
+
32
+ ##
33
+ # Computes diff of binary files using MD5 hashes.
34
+ def binary_diffy(before, after, before_encoding, after_encoding)
35
+ if before_encoding || after_encoding
36
+ Diffy::Diff.new(encoding_blurb(before_encoding),
37
+ encoding_blurb(after_encoding)).to_s(:html)
38
+ elsif before == after
39
+ nil
40
+ else
41
+ md5_before = Digest::MD5.hexdigest(before)
42
+ md5_after = Digest::MD5.hexdigest(after)
43
+ Diffy::Diff.new("Binary content returned md5: #{md5_before}",
44
+ "Binary content returned md5: #{md5_after}").to_s(:html)
45
+ end
13
46
  end
14
47
 
48
+ ##
49
+ # Generates diff for CLI output.
15
50
  def terminal_diffy(before_html, after_html)
16
51
  args = []
17
52
  args << :color if Rainbow.enabled
18
- return Diffy::Diff.new(before_html, after_html, :context => 3).
19
- to_s(*args)
53
+ Diffy::Diff.new(before_html, after_html, context: 3)
54
+ .to_s(*args)
20
55
  end
21
56
 
22
- def generate_html_report(results, before, after)
23
- erb_path = File.join(SiteDiff::FILES_DIR, 'html_report.html.erb')
24
- report_html = ERB.new(File.read(erb_path)).result(binding)
25
- return report_html
57
+ ##
58
+ # Generates an HTML report.
59
+ # TODO: Generate the report in SiteDiff::Report instead.
60
+ def generate_html(results, before, after, cache, relative = false)
61
+ erb_path = File.join(SiteDiff::FILES_DIR, 'report.html.erb')
62
+ ERB.new(File.read(erb_path)).result(binding)
26
63
  end
27
64
 
28
- def generate_diff_output(result)
65
+ ##
66
+ # Generates diff output for a single result.
67
+ def generate_diff_output(result, relative = false)
29
68
  erb_path = File.join(SiteDiff::FILES_DIR, 'diff.html.erb')
30
- return ERB.new(File.read(erb_path)).result(binding)
69
+ ERB.new(File.read(erb_path)).result(binding)
31
70
  end
32
71
 
33
- def css
34
- File.read(File.join(SiteDiff::FILES_DIR, 'sitediff.css'))
72
+ ##
73
+ # Set configuration for Diffy.
74
+ def diff_config(config)
75
+ diff_options = Diffy::Diff.default_options[:diff]
76
+ diff_options = [diff_options] unless diff_options.is_a?(Array)
77
+ # ignore_whitespace option
78
+ diff_options.push('-w').uniq if config.ignore_whitespace
79
+ Diffy::Diff.default_options[:diff] = diff_options
35
80
  end
36
81
  end
37
82
  end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
+ class SiteDiff
4
+ class SiteDiffException < RuntimeError; end
5
+ end
@@ -0,0 +1,76 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'sitediff/uriwrapper'
4
+ require 'typhoeus'
5
+
6
+ class SiteDiff
7
+ # SiteDiff Data Fetcher.
8
+ # TODO: Rename this to Fetcher.
9
+ class Fetch
10
+ # Cache is a cache object, see sitediff/cache
11
+ # Paths is a list of sub-paths
12
+ # Tags is a hash of tag names => base URLs.
13
+ def initialize(cache,
14
+ paths,
15
+ interval,
16
+ concurrency = 3,
17
+ curl_opts = nil,
18
+ debug = true,
19
+ **tags)
20
+ @cache = cache
21
+ @interval = interval
22
+ @paths = paths
23
+ @tags = tags
24
+ @curl_opts = curl_opts || UriWrapper::DEFAULT_CURL_OPTS
25
+ @concurrency = concurrency
26
+ @debug = debug
27
+ end
28
+
29
+ # Fetch all the paths, once per tag.
30
+ # When a path has been fetched for every tag, block will be called with the
31
+ # path, and a hash of tag => UriWrapper::ReadResult objects.
32
+ def run(&block)
33
+ @callback = block
34
+ @hydra = Typhoeus::Hydra.new(max_concurrency: @concurrency)
35
+ @paths.each { |path| queue_path(path) }
36
+ @hydra.run
37
+ end
38
+
39
+ private
40
+
41
+ # Queue a path for fetching
42
+ def queue_path(path)
43
+ results = {}
44
+
45
+ @tags.each do |tag, base|
46
+ if (res = @cache.get(tag, path))
47
+ results[tag] = res
48
+ process_results(path, results)
49
+ elsif !base
50
+ # We only have the cache, but this item isn't cached!
51
+ results[tag] = UriWrapper::ReadResult.error('Not cached')
52
+ process_results(path, results)
53
+ else
54
+ uri = UriWrapper.new(base + path, @curl_opts, @debug)
55
+ uri.queue(@hydra) do |resl|
56
+ # Insert delay to limit fetching rate
57
+ if @interval != 0
58
+ SiteDiff.log("Waiting #{@interval} milliseconds.", :info)
59
+ sleep(@interval / 1000.0)
60
+ end
61
+ @cache.set(tag, path, resl)
62
+ results[tag] = resl
63
+ process_results(path, results)
64
+ end
65
+ end
66
+ end
67
+ end
68
+
69
+ # Process fetch results
70
+ def process_results(path, results)
71
+ return unless results.size == @tags.size
72
+
73
+ @callback[path, results]
74
+ end
75
+ end
76
+ end