sitediff 0.0.2 → 1.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only, and reflects the changes between package versions as they appear in their respective public registries.
@@ -1,122 +1,116 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'sitediff/cache'
2
4
  require 'sitediff/config'
3
5
  require 'sitediff/crawler'
4
- require 'sitediff/rules'
5
6
  require 'pathname'
6
7
  require 'typhoeus'
7
8
  require 'yaml'
8
9
 
9
10
  class SiteDiff
10
- class Config
11
- class Creator
12
- def initialize(*urls, &block)
13
- @after = urls.pop
14
- @before = urls.pop # May be nil
15
- end
16
-
17
- def roots
18
- @roots = begin
19
- r = { :after => @after }
20
- r[:before] = @before if @before
21
- r
22
- end
23
- end
24
-
25
- # Build a config structure, return it
26
- def create(opts, &block)
27
- @config = {}
28
- @callback = block
29
-
30
- # Handle options
31
- @dir = Pathname.new(opts[:directory])
32
- @depth = opts[:depth]
33
- @rules = Rules.new(@config, opts[:rules_disabled]) if opts[:rules]
34
-
35
- # Create the dir. Must go before cache initialization!
36
- @dir.mkpath unless @dir.directory?
11
+ class Config
12
+ ##
13
+ # SiteDiff Config Creator Object.
14
+ class Creator
15
+ ##
16
+ # Creates a Creator object.
17
+ def initialize(debug, before, after)
18
+ @config = nil
19
+ @before = before
20
+ @after = after
21
+ @debug = debug
22
+ end
37
23
 
38
- # Setup instance vars
39
- @paths = Hash.new { |h,k| h[k] = Set.new }
40
- @cache = Cache.new(:file => @dir.+(Cache::DEFAULT_FILENAME).to_s,
41
- :create => true)
42
- @cache.write_tags << :before << :after
24
+ ##
25
+ # Determine if we're dealing with one or two URLs.
26
+ def roots
27
+ @roots = { 'after' => @after }
28
+ @roots['before'] = @before || @after
29
+ @roots
30
+ end
43
31
 
44
- build_config
45
- write_config
46
- end
32
+ ##
33
+ # Build a config structure, return it.
34
+ def create(options)
35
+ @config = {}
47
36
 
48
- def build_config
49
- %w[before after].each do |tag|
50
- next unless u = roots[tag.to_sym]
51
- @config[tag] = {'url' => u}
52
- end
37
+ # @callback = block
53
38
 
54
- crawl(@depth)
55
- @cache.close
56
- @rules.add_config if @rules
39
+ @dir = Pathname.new(options[:directory])
57
40
 
58
- @config['paths'] = @paths.values.reduce(&:|).to_a.sort
59
- end
41
+ # Setup instance vars
42
+ @paths = Hash.new { |h, k| h[k] = Set.new }
43
+ @cache = Cache.new(directory: @dir.to_s, create: true)
44
+ @cache.write_tags << :before << :after
60
45
 
61
- def crawl(depth = nil)
62
- hydra = Typhoeus::Hydra.new(max_concurrency: 10)
63
- roots.each do |tag, u|
64
- Crawler.new(hydra, u, depth) do |info|
65
- crawled_path(tag, info)
46
+ build_config options
47
+ write_config
66
48
  end
67
- end
68
- hydra.run
69
- end
70
-
71
- # Deduplicate paths with slashes at the end
72
- def canonicalize(tag, path)
73
- def altered_paths(path)
74
- yield path + '/'
75
- yield path.sub(%r[/$], '')
76
- end
77
-
78
- return path.empty? ? '/' : path
79
- end
80
-
81
- def crawled_path(tag, info)
82
- path, dup = canonicalize(tag, info.relative)
83
- return if dup
84
49
 
85
- res = info.read_result
50
+ ##
51
+ # Build and populate the config object which is being created.
52
+ #
53
+ # @param [String] options
54
+ # One or more options.
55
+ def build_config(options)
56
+ options = Config.stringify_keys options
57
+
58
+ # Build config for "before" and "after".
59
+ %w[before after].each do |tag|
60
+ next unless (url = roots[tag])
61
+
62
+ @config[tag] = { 'url' => url }
63
+ end
64
+
65
+ # Build other settings.
66
+ @config['settings'] = {}
67
+ Config::ALLOWED_SETTINGS_KEYS.each do |key|
68
+ @config['settings'][key] = options[key]
69
+ end
70
+ end
86
71
 
87
- @callback[tag, info]
88
- @paths[tag] << path
89
- @cache.set(tag, path, res)
72
+ ##
73
+ # Create a gitignore if we seem to be in git.
74
+ def make_gitignore(dir)
75
+ # Check if we're in git
76
+ unless dir.realpath.to_enum(:ascend).any? { |d| d.+('.git').exist? }
77
+ return
78
+ end
79
+
80
+ dir.+('.gitignore').open('w') do |f|
81
+ f.puts <<-GITIGNORE.gsub(/^\s+/, '')
82
+ # Directories.
83
+ diffs
84
+ snapshot
85
+
86
+ # Files.
87
+ settings.yaml
88
+ paths.txt
89
+ failures.txt
90
+ GITIGNORE
91
+ end
92
+ end
90
93
 
91
- # If single-site, cache after as before!
92
- @cache.set(:before, path, res) unless roots[:before]
94
+ ##
95
+ # Returns the name of the config directory.
96
+ def directory
97
+ @dir
98
+ end
93
99
 
94
- @rules.handle_page(tag, res.content, info.document) if @rules && !res.error
95
- end
100
+ ##
101
+ # Returns the name of the config file.
102
+ def config_file
103
+ @dir + Config::DEFAULT_FILENAME
104
+ end
96
105
 
97
- # Create a gitignore if we seem to be in git
98
- def make_gitignore(dir)
99
- # Check if we're in git
100
- return unless dir.realpath.to_enum(:ascend).any? { |d| d.+('.git').exist? }
101
-
102
- dir.+('.gitignore').open('w') do |f|
103
- f.puts <<-EOF.gsub(/^\s+/, '')
104
- output
105
- cache.db
106
- cache.db.db
107
- EOF
106
+ ##
107
+ # Writes the built config into the config file.
108
+ # TODO: Exclude default params before writing.
109
+ def write_config
110
+ make_gitignore(@dir)
111
+ data = Config.remove_defaults(@config)
112
+ config_file.open('w') { |f| f.puts data.to_yaml }
113
+ end
108
114
  end
109
115
  end
110
-
111
- def config_file
112
- @dir + Config::DEFAULT_FILENAME
113
- end
114
-
115
- # Turn a config structure into a config file
116
- def write_config
117
- make_gitignore(@dir)
118
- config_file.open('w') { |f| f.puts @config.to_yaml }
119
- end
120
- end
121
- end
122
116
  end
@@ -0,0 +1,75 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'pathname'
4
+ require 'sitediff/config'
5
+
6
+ class SiteDiff
7
+ class Config
8
+ ##
9
+ # Preset helper.
10
+ class Preset
11
+ ##
12
+ # Directory in which presets live.
13
+ #
14
+ # TODO: Move this outside "lib".
15
+ DIRECTORY = (Pathname.new(__dir__).dirname + 'presets').freeze
16
+
17
+ ##
18
+ # Reads preset rules.
19
+ #
20
+ # @param [String] preset
21
+ # Presets
22
+ #
23
+ # @return [Hash]
24
+ # A hash containing the preset's rules.
25
+ def self.read(name)
26
+ @cache = {} if @cache.nil?
27
+
28
+ # Load and cache preset config.
29
+ if @cache[name].nil?
30
+ exist? name, true
31
+ @cache[name] = Config.load_conf file(name)
32
+ end
33
+
34
+ @cache[name]
35
+ end
36
+
37
+ ##
38
+ # Get all possible rules.
39
+ #
40
+ # @return [Array]
41
+ # All presets.
42
+ def self.all
43
+ # Load and cache preset names.
44
+ if @all.nil?
45
+ @all = []
46
+ pattern = DIRECTORY + '*.yaml'
47
+ Dir.glob(pattern) do |file|
48
+ @all << File.basename(file, '.yaml')
49
+ end
50
+ end
51
+
52
+ @all
53
+ end
54
+
55
+ ##
56
+ # Checks whether a preset exists.
57
+ def self.exist?(name, exception = false)
58
+ result = File.exist? file(name)
59
+
60
+ # Raise an exception, if required.
61
+ if exception && !result
62
+ raise Config::InvalidConfig, "Preset not found: #{name}"
63
+ end
64
+
65
+ result
66
+ end
67
+
68
+ ##
69
+ # Returns the path to a preset file.
70
+ def self.file(name)
71
+ DIRECTORY + "#{name}.yaml"
72
+ end
73
+ end
74
+ end
75
+ end
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'sitediff'
2
4
  require 'sitediff/uriwrapper'
3
5
  require 'addressable/uri'
@@ -6,90 +8,124 @@ require 'ostruct'
6
8
  require 'set'
7
9
 
8
10
  class SiteDiff
9
- class Crawler
10
- class Info < OpenStruct; end
11
-
12
- DEFAULT_DEPTH = 3
11
+ # SiteDiff Crawler.
12
+ class Crawler
13
+ class Info < OpenStruct; end
14
+
15
+ DEFAULT_DEPTH = 3
16
+
17
+ # Create a crawler with a base URL
18
+ def initialize(hydra, base,
19
+ interval,
20
+ include_regex,
21
+ exclude_regex,
22
+ depth = DEFAULT_DEPTH,
23
+ curl_opts = UriWrapper::DEFAULT_CURL_OPTS,
24
+ debug = true,
25
+ &block)
26
+ @hydra = hydra
27
+ @base_uri = Addressable::URI.parse(base)
28
+ @base = base
29
+ @interval = interval
30
+ @include_regex = include_regex
31
+ @exclude_regex = exclude_regex
32
+ @found = Set.new
33
+ @callback = block
34
+ @curl_opts = curl_opts
35
+ @debug = debug
36
+
37
+ add_uri('', depth)
38
+ end
13
39
 
14
- # Create a crawler with a base URL
15
- def initialize(hydra, base, depth = DEFAULT_DEPTH, &block)
16
- @hydra = hydra
17
- @base_uri = Addressable::URI.parse(base)
18
- @base = base
19
- @found = Set.new
20
- @callback = block
40
+ # Handle a newly found relative URI
41
+ def add_uri(rel, depth)
42
+ return if @found.include? rel
21
43
 
22
- add_uri('', depth)
23
- end
44
+ @found << rel
24
45
 
25
- # Handle a newly found relative URI
26
- def add_uri(rel, depth)
27
- return if @found.include? rel
28
- @found << rel
29
-
30
- wrapper = UriWrapper.new(@base + rel)
31
- wrapper.queue(@hydra) do |res|
32
- fetched_uri(rel, depth, res)
46
+ wrapper = UriWrapper.new(@base + rel, @curl_opts, @debug)
47
+ wrapper.queue(@hydra) do |res|
48
+ fetched_uri(rel, depth, res)
49
+ end
33
50
  end
34
- end
35
51
 
36
- # Handle the fetch of a URI
37
- def fetched_uri(rel, depth, res)
38
- return unless res.content # Ignore errors
39
- return unless depth >= 0
40
-
41
- base = Addressable::URI.parse(@base + rel)
42
- doc = Nokogiri::HTML(res.content)
43
-
44
- # Call the callback
45
- info = Info.new(
46
- :relative => rel,
47
- :uri => base,
48
- :read_result => res,
49
- :document => doc,
50
- )
51
- @callback[info]
52
-
53
- # Find links
54
- links = find_links(doc)
55
- uris = links.map { |l| resolve_link(base, l) }.compact
56
- uris = filter_links(uris)
57
-
58
- # Make them relative
59
- rels = uris.map { |u| relativize_link(u) }
60
-
61
- # Queue them in turn
62
- rels.each do |r|
63
- next if @found.include? r
64
- add_uri(r, depth - 1)
52
+ # Handle the fetch of a URI
53
+ def fetched_uri(rel, depth, res)
54
+ if res.error
55
+ SiteDiff.log(res.error, :error)
56
+ return
57
+ elsif !res.content
58
+ SiteDiff.log('Response is missing content. Treating as an error.', :error)
59
+ return
60
+ end
61
+
62
+ base = Addressable::URI.parse(@base + rel)
63
+ doc = Nokogiri::HTML(res.content)
64
+
65
+ # Call the callback
66
+ info = Info.new(
67
+ relative: rel,
68
+ uri: base,
69
+ read_result: res,
70
+ document: doc
71
+ )
72
+ # Insert delay to limit fetching rate
73
+ if @interval != 0
74
+ SiteDiff.log("Waiting #{@interval} milliseconds.", :info)
75
+ sleep(@interval / 1000.0)
76
+ end
77
+ @callback[info]
78
+
79
+ return unless depth >= 1
80
+
81
+ # Find links
82
+ links = find_links(doc)
83
+ uris = links.map { |l| resolve_link(base, l) }.compact
84
+ uris = filter_links(uris)
85
+
86
+ # Make them relative
87
+ rels = uris.map { |u| relativize_link(u) }
88
+
89
+ # Queue them in turn
90
+ rels.each do |r|
91
+ next if @found.include? r
92
+
93
+ add_uri(r, depth - 1)
94
+ end
65
95
  end
66
- end
67
96
 
68
- # Resolve a potentially-relative link. Return nil on error.
69
- def resolve_link(base, rel)
70
- begin
71
- return base + rel
97
+ # Resolve a potentially-relative link. Return nil on error.
98
+ def resolve_link(base, rel)
99
+ base + rel
72
100
  rescue Addressable::URI::InvalidURIError
73
- SiteDiff.log "skipped invalid URL: '#{rel}'", :warn
74
- return nil
101
+ SiteDiff.log "skipped invalid URL: '#{rel}' (at #{base})", :warning
102
+ nil
75
103
  end
76
- end
77
104
 
78
- # Make a link relative to @base_uri
79
- def relativize_link(uri)
80
- uri.path.slice(@base_uri.path.length, uri.path.length)
81
- end
105
+ # Make a link relative to @base_uri
106
+ def relativize_link(uri)
107
+ uri.path.slice(@base_uri.path.length, uri.path.length)
108
+ end
82
109
 
83
- # Return a list of string links found on a page.
84
- def find_links(doc)
85
- return doc.xpath('//a[@href]').map { |e| e['href'] }
86
- end
110
+ # Return a list of string links found on a page.
111
+ def find_links(doc)
112
+ doc.xpath('//a[@href]').map { |e| e['href'] }
113
+ end
87
114
 
88
- # Filter out links we don't want. Links passed in are absolute URIs.
89
- def filter_links(uris)
90
- uris.find_all do |u|
91
- u.host == @base_uri.host && u.path.start_with?(@base_uri.path)
115
+ # Filter out links we don't want. Links passed in are absolute URIs.
116
+ def filter_links(uris)
117
+ uris.find_all do |u|
118
+ is_sub_uri = (u.host == @base_uri.host) &&
119
+ u.path.start_with?(@base_uri.path)
120
+ next unless is_sub_uri
121
+
122
+ is_included = @include_regex.nil? ? false : @include_regex.match(u.path)
123
+ is_excluded = @exclude_regex.nil? ? false : @exclude_regex.match(u.path)
124
+ if is_excluded && !is_included
125
+ SiteDiff.log "Ignoring excluded URL #{u.path}", :info
126
+ end
127
+ is_included || !is_excluded
128
+ end
92
129
  end
93
130
  end
94
131
  end
95
- end