sitediff 0.0.2 → 1.1.1

@@ -1,122 +1,116 @@
+ # frozen_string_literal: true
+
  require 'sitediff/cache'
  require 'sitediff/config'
  require 'sitediff/crawler'
- require 'sitediff/rules'
  require 'pathname'
  require 'typhoeus'
  require 'yaml'

  class SiteDiff
- class Config
- class Creator
- def initialize(*urls, &block)
- @after = urls.pop
- @before = urls.pop # May be nil
- end
-
- def roots
- @roots = begin
- r = { :after => @after }
- r[:before] = @before if @before
- r
- end
- end
-
- # Build a config structure, return it
- def create(opts, &block)
- @config = {}
- @callback = block
-
- # Handle options
- @dir = Pathname.new(opts[:directory])
- @depth = opts[:depth]
- @rules = Rules.new(@config, opts[:rules_disabled]) if opts[:rules]
-
- # Create the dir. Must go before cache initialization!
- @dir.mkpath unless @dir.directory?
+ class Config
+ ##
+ # SiteDiff Config Creator Object.
+ class Creator
+ ##
+ # Creates a Creator object.
+ def initialize(debug, before, after)
+ @config = nil
+ @before = before
+ @after = after
+ @debug = debug
+ end

- # Setup instance vars
- @paths = Hash.new { |h,k| h[k] = Set.new }
- @cache = Cache.new(:file => @dir.+(Cache::DEFAULT_FILENAME).to_s,
- :create => true)
- @cache.write_tags << :before << :after
+ ##
+ # Determine if we're dealing with one or two URLs.
+ def roots
+ @roots = { 'after' => @after }
+ @roots['before'] = @before || @after
+ @roots
+ end

- build_config
- write_config
- end
+ ##
+ # Build a config structure, return it.
+ def create(options)
+ @config = {}

- def build_config
- %w[before after].each do |tag|
- next unless u = roots[tag.to_sym]
- @config[tag] = {'url' => u}
- end
+ # @callback = block

- crawl(@depth)
- @cache.close
- @rules.add_config if @rules
+ @dir = Pathname.new(options[:directory])

- @config['paths'] = @paths.values.reduce(&:|).to_a.sort
- end
+ # Setup instance vars
+ @paths = Hash.new { |h, k| h[k] = Set.new }
+ @cache = Cache.new(directory: @dir.to_s, create: true)
+ @cache.write_tags << :before << :after

- def crawl(depth = nil)
- hydra = Typhoeus::Hydra.new(max_concurrency: 10)
- roots.each do |tag, u|
- Crawler.new(hydra, u, depth) do |info|
- crawled_path(tag, info)
+ build_config options
+ write_config
  end
- end
- hydra.run
- end
-
- # Deduplicate paths with slashes at the end
- def canonicalize(tag, path)
- def altered_paths(path)
- yield path + '/'
- yield path.sub(%r[/$], '')
- end
-
- return path.empty? ? '/' : path
- end
-
- def crawled_path(tag, info)
- path, dup = canonicalize(tag, info.relative)
- return if dup

- res = info.read_result
+ ##
+ # Build and populate the config object which is being created.
+ #
+ # @param [String] options
+ # One or more options.
+ def build_config(options)
+ options = Config.stringify_keys options
+
+ # Build config for "before" and "after".
+ %w[before after].each do |tag|
+ next unless (url = roots[tag])
+
+ @config[tag] = { 'url' => url }
+ end
+
+ # Build other settings.
+ @config['settings'] = {}
+ Config::ALLOWED_SETTINGS_KEYS.each do |key|
+ @config['settings'][key] = options[key]
+ end
+ end

- @callback[tag, info]
- @paths[tag] << path
- @cache.set(tag, path, res)
+ ##
+ # Create a gitignore if we seem to be in git.
+ def make_gitignore(dir)
+ # Check if we're in git
+ unless dir.realpath.to_enum(:ascend).any? { |d| d.+('.git').exist? }
+ return
+ end
+
+ dir.+('.gitignore').open('w') do |f|
+ f.puts <<-GITIGNORE.gsub(/^\s+/, '')
+ # Directories.
+ diffs
+ snapshot
+
+ # Files.
+ settings.yaml
+ paths.txt
+ failures.txt
+ GITIGNORE
+ end
+ end

- # If single-site, cache after as before!
- @cache.set(:before, path, res) unless roots[:before]
+ ##
+ # Returns the name of the config directory.
+ def directory
+ @dir
+ end

- @rules.handle_page(tag, res.content, info.document) if @rules && !res.error
- end
+ ##
+ # Returns the name of the config file.
+ def config_file
+ @dir + Config::DEFAULT_FILENAME
+ end

- # Create a gitignore if we seem to be in git
- def make_gitignore(dir)
- # Check if we're in git
- return unless dir.realpath.to_enum(:ascend).any? { |d| d.+('.git').exist? }
-
- dir.+('.gitignore').open('w') do |f|
- f.puts <<-EOF.gsub(/^\s+/, '')
- output
- cache.db
- cache.db.db
- EOF
+ ##
+ # Writes the built config into the config file.
+ # TODO: Exclude default params before writing.
+ def write_config
+ make_gitignore(@dir)
+ data = Config.remove_defaults(@config)
+ config_file.open('w') { |f| f.puts data.to_yaml }
+ end
  end
  end
-
- def config_file
- @dir + Config::DEFAULT_FILENAME
- end
-
- # Turn a config structure into a config file
- def write_config
- make_gitignore(@dir)
- config_file.open('w') { |f| f.puts @config.to_yaml }
- end
- end
- end
  end
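
Taken together, this hunk turns Config::Creator from a crawl-and-cache driver into a plain configuration builder: the constructor is now `initialize(debug, before, after)`, `roots` falls back to the "after" URL when no "before" URL is given, and `create(options)` only reads `:directory` plus whatever keys `Config::ALLOWED_SETTINGS_KEYS` permits. A minimal usage sketch under those assumptions (the URLs and extra settings keys are placeholders, and the require path is inferred from the class nesting):

    require 'sitediff/config/creator' # path assumed from SiteDiff::Config::Creator

    # debug flag, "before" URL (may be nil for single-site runs), "after" URL.
    creator = SiteDiff::Config::Creator.new(false, 'https://old.example.com', 'https://new.example.com')

    # :directory is read directly by create(); other keys are copied into the
    # 'settings' section only if Config::ALLOWED_SETTINGS_KEYS includes them.
    creator.create(directory: 'sitediff-out', depth: 3, interval: 0)

    puts creator.directory   # Pathname of the output directory
    puts creator.config_file # Pathname of the generated YAML config
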
@@ -0,0 +1,75 @@
+ # frozen_string_literal: true
+
+ require 'pathname'
+ require 'sitediff/config'
+
+ class SiteDiff
+ class Config
+ ##
+ # Preset helper.
+ class Preset
+ ##
+ # Directory in which presets live.
+ #
+ # TODO: Move this outside "lib".
+ DIRECTORY = (Pathname.new(__dir__).dirname + 'presets').freeze
+
+ ##
+ # Reads preset rules.
+ #
+ # @param [String] preset
+ # Presets
+ #
+ # @return [Hash]
+ # A hash containing the preset's rules.
+ def self.read(name)
+ @cache = {} if @cache.nil?
+
+ # Load and cache preset config.
+ if @cache[name].nil?
+ exist? name, true
+ @cache[name] = Config.load_conf file(name)
+ end
+
+ @cache[name]
+ end
+
+ ##
+ # Get all possible rules.
+ #
+ # @return [Array]
+ # All presets.
+ def self.all
+ # Load and cache preset names.
+ if @all.nil?
+ @all = []
+ pattern = DIRECTORY + '*.yaml'
+ Dir.glob(pattern) do |file|
+ @all << File.basename(file, '.yaml')
+ end
+ end
+
+ @all
+ end
+
+ ##
+ # Checks whether a preset exists.
+ def self.exist?(name, exception = false)
+ result = File.exist? file(name)
+
+ # Raise an exception, if required.
+ if exception && !result
+ raise Config::InvalidConfig, "Preset not found: #{name}"
+ end
+
+ result
+ end
+
+ ##
+ # Returns the path to a preset file.
+ def self.file(name)
+ DIRECTORY + "#{name}.yaml"
+ end
+ end
+ end
+ end
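
The new Preset helper is a small class-level API over the YAML files shipped in the gem's presets directory: `all` lists the available preset names, `exist?` checks one (optionally raising `Config::InvalidConfig`), and `read` loads and memoizes a preset's rules via `Config.load_conf`. A rough sketch of how it might be called; the preset name below is hypothetical, and the real names are whatever `*.yaml` files the gem bundles:

    require 'sitediff/config/preset' # path assumed from SiteDiff::Config::Preset

    # Enumerate every bundled preset (basenames of presets/*.yaml).
    SiteDiff::Config::Preset.all.each { |name| puts name }

    # 'drupal' is only an example name; exist? returns false for unknown presets
    # and raises Config::InvalidConfig when its second argument is true.
    if SiteDiff::Config::Preset.exist?('drupal')
      rules = SiteDiff::Config::Preset.read('drupal') # cached after the first read
      puts rules.inspect
    end
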
@@ -1,3 +1,5 @@
+ # frozen_string_literal: true
+
  require 'sitediff'
  require 'sitediff/uriwrapper'
  require 'addressable/uri'
@@ -6,90 +8,124 @@ require 'ostruct'
  require 'set'

  class SiteDiff
- class Crawler
- class Info < OpenStruct; end
-
- DEFAULT_DEPTH = 3
+ # SiteDiff Crawler.
+ class Crawler
+ class Info < OpenStruct; end
+
+ DEFAULT_DEPTH = 3
+
+ # Create a crawler with a base URL
+ def initialize(hydra, base,
+ interval,
+ include_regex,
+ exclude_regex,
+ depth = DEFAULT_DEPTH,
+ curl_opts = UriWrapper::DEFAULT_CURL_OPTS,
+ debug = true,
+ &block)
+ @hydra = hydra
+ @base_uri = Addressable::URI.parse(base)
+ @base = base
+ @interval = interval
+ @include_regex = include_regex
+ @exclude_regex = exclude_regex
+ @found = Set.new
+ @callback = block
+ @curl_opts = curl_opts
+ @debug = debug
+
+ add_uri('', depth)
+ end

- # Create a crawler with a base URL
- def initialize(hydra, base, depth = DEFAULT_DEPTH, &block)
- @hydra = hydra
- @base_uri = Addressable::URI.parse(base)
- @base = base
- @found = Set.new
- @callback = block
+ # Handle a newly found relative URI
+ def add_uri(rel, depth)
+ return if @found.include? rel

- add_uri('', depth)
- end
+ @found << rel

- # Handle a newly found relative URI
- def add_uri(rel, depth)
- return if @found.include? rel
- @found << rel
-
- wrapper = UriWrapper.new(@base + rel)
- wrapper.queue(@hydra) do |res|
- fetched_uri(rel, depth, res)
+ wrapper = UriWrapper.new(@base + rel, @curl_opts, @debug)
+ wrapper.queue(@hydra) do |res|
+ fetched_uri(rel, depth, res)
+ end
  end
- end

- # Handle the fetch of a URI
- def fetched_uri(rel, depth, res)
- return unless res.content # Ignore errors
- return unless depth >= 0
-
- base = Addressable::URI.parse(@base + rel)
- doc = Nokogiri::HTML(res.content)
-
- # Call the callback
- info = Info.new(
- :relative => rel,
- :uri => base,
- :read_result => res,
- :document => doc,
- )
- @callback[info]
-
- # Find links
- links = find_links(doc)
- uris = links.map { |l| resolve_link(base, l) }.compact
- uris = filter_links(uris)
-
- # Make them relative
- rels = uris.map { |u| relativize_link(u) }
-
- # Queue them in turn
- rels.each do |r|
- next if @found.include? r
- add_uri(r, depth - 1)
+ # Handle the fetch of a URI
+ def fetched_uri(rel, depth, res)
+ if res.error
+ SiteDiff.log(res.error, :error)
+ return
+ elsif !res.content
+ SiteDiff.log('Response is missing content. Treating as an error.', :error)
+ return
+ end
+
+ base = Addressable::URI.parse(@base + rel)
+ doc = Nokogiri::HTML(res.content)
+
+ # Call the callback
+ info = Info.new(
+ relative: rel,
+ uri: base,
+ read_result: res,
+ document: doc
+ )
+ # Insert delay to limit fetching rate
+ if @interval != 0
+ SiteDiff.log("Waiting #{@interval} milliseconds.", :info)
+ sleep(@interval / 1000.0)
+ end
+ @callback[info]
+
+ return unless depth >= 1
+
+ # Find links
+ links = find_links(doc)
+ uris = links.map { |l| resolve_link(base, l) }.compact
+ uris = filter_links(uris)
+
+ # Make them relative
+ rels = uris.map { |u| relativize_link(u) }
+
+ # Queue them in turn
+ rels.each do |r|
+ next if @found.include? r
+
+ add_uri(r, depth - 1)
+ end
  end
- end

- # Resolve a potentially-relative link. Return nil on error.
- def resolve_link(base, rel)
- begin
- return base + rel
+ # Resolve a potentially-relative link. Return nil on error.
+ def resolve_link(base, rel)
+ base + rel
  rescue Addressable::URI::InvalidURIError
- SiteDiff.log "skipped invalid URL: '#{rel}'", :warn
- return nil
+ SiteDiff.log "skipped invalid URL: '#{rel}' (at #{base})", :warning
+ nil
  end
- end

- # Make a link relative to @base_uri
- def relativize_link(uri)
- uri.path.slice(@base_uri.path.length, uri.path.length)
- end
+ # Make a link relative to @base_uri
+ def relativize_link(uri)
+ uri.path.slice(@base_uri.path.length, uri.path.length)
+ end

- # Return a list of string links found on a page.
- def find_links(doc)
- return doc.xpath('//a[@href]').map { |e| e['href'] }
- end
+ # Return a list of string links found on a page.
+ def find_links(doc)
+ doc.xpath('//a[@href]').map { |e| e['href'] }
+ end

- # Filter out links we don't want. Links passed in are absolute URIs.
- def filter_links(uris)
- uris.find_all do |u|
- u.host == @base_uri.host && u.path.start_with?(@base_uri.path)
+ # Filter out links we don't want. Links passed in are absolute URIs.
+ def filter_links(uris)
+ uris.find_all do |u|
+ is_sub_uri = (u.host == @base_uri.host) &&
+ u.path.start_with?(@base_uri.path)
+ next unless is_sub_uri
+
+ is_included = @include_regex.nil? ? false : @include_regex.match(u.path)
+ is_excluded = @exclude_regex.nil? ? false : @exclude_regex.match(u.path)
+ if is_excluded && !is_included
+ SiteDiff.log "Ignoring excluded URL #{u.path}", :info
+ end
+ is_included || !is_excluded
+ end
  end
  end
  end
- end
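
The crawler's constructor grows from `(hydra, base, depth, &block)` to the longer positional list shown above: a politeness interval in milliseconds, include and exclude regexes for filtering discovered paths, then the optional depth, curl options, and debug flag. Requests are queued as soon as the object is built and performed when the hydra runs. A hedged sketch of driving it directly, with a placeholder URL and regexes:

    require 'typhoeus'
    require 'sitediff/crawler'

    hydra = Typhoeus::Hydra.new(max_concurrency: 3)

    # hydra, base URL, interval (ms), include regex, exclude regex, depth;
    # curl_opts and debug keep their defaults here.
    SiteDiff::Crawler.new(hydra, 'https://example.com', 0, nil, %r{/admin}, 2) do |info|
      # info is an Info (OpenStruct) with :relative, :uri, :read_result and :document.
      puts "Fetched #{info.relative}"
    end

    hydra.run # Typhoeus performs the queued fetches only when the hydra runs.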