sitediff 0.0.3 → 0.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'sitediff/exception'
2
4
  require 'sitediff/sanitize'
3
5
  require 'pathname'
@@ -9,7 +11,7 @@ class SiteDiff
9
11
 
10
12
  # keys allowed in configuration files
11
13
  CONF_KEYS = Sanitizer::TOOLS.values.flatten(1) +
12
- %w[paths before after before_url after_url includes]
14
+ %w[paths before after before_url after_url includes curl_opts]
13
15
 
14
16
  class InvalidConfig < SiteDiffException; end
15
17
  class ConfigNotFound < SiteDiffException; end
@@ -41,13 +43,14 @@ class SiteDiff
41
43
  conf[pos][key] ||= []
42
44
  conf[pos][key] += conf[key] if conf[key]
43
45
  end
44
- tools[:scalar].each {|key| conf[pos][key] ||= conf[key]}
46
+ tools[:scalar].each { |key| conf[pos][key] ||= conf[key] }
45
47
  conf[pos]['url'] ||= conf[pos + '_url']
48
+ conf[pos]['curl_opts'] = conf['curl_opts']
46
49
  end
47
50
  # normalize paths
48
- conf['paths'] = Config::normalize_paths(conf['paths'])
51
+ conf['paths'] = Config.normalize_paths(conf['paths'])
49
52
 
50
- conf.select {|k,v| %w[before after paths].include? k}
53
+ conf.select { |k, _v| %w[before after paths curl_opts].include? k }
51
54
  end
52
55
 
53
56
  # Merges two normalized Hashes according to the following rules:
@@ -72,56 +75,33 @@ class SiteDiff
72
75
  next
73
76
  end
74
77
  result[pos] = first[pos].merge!(second[pos]) do |key, a, b|
75
- if Sanitizer::TOOLS[:array].include? key # rule 2a
76
- result[pos][key] = (a || []) + (b|| [])
77
- else
78
- result[pos][key] = a || b # rule 2b
79
- end
78
+ result[pos][key] = if Sanitizer::TOOLS[:array].include? key # rule 2a
79
+ (a || []) + (b || [])
80
+ else
81
+ a || b # rule 2b
82
+ end
80
83
  end
81
84
  end
82
85
  result
83
86
  end
84
87
 
85
- # Search for a config file. If found, change to the containing directory,
86
- # and return an array of config files found.
87
- def self.search
88
- subdirs = %w[. sitediff]
89
- root_indicators = %w[.git .svn]
90
-
91
- Pathname.pwd.ascend do |dir|
92
- subdirs.each do |subdir|
93
- d = dir + subdir + DEFAULT_FILENAME
94
- if d.exist?
95
- Dir.chdir(dir.+(subdir).to_s)
96
- return [DEFAULT_FILENAME]
97
- end
98
- end
99
-
100
- root_indicators.each { |r| return [] if dir.+(r).exist? }
101
- end
102
-
103
- return []
104
- end
105
-
106
- def initialize(files, opts = {})
107
- @config = {'paths' => [], 'before' => {}, 'after' => {} }
108
-
109
- files = Config.search if files.empty? && opts[:search]
110
- files = [DEFAULT_FILENAME] if files.empty? &&
111
- File.exists?(DEFAULT_FILENAME)
112
- raise ConfigNotFound, "No configuration file found." if files.empty?
88
+ def initialize(files, dir)
89
+ @config = { 'paths' => [], 'before' => {}, 'after' => {} }
113
90
 
91
+ files = [File.join(dir, DEFAULT_FILENAME)] if files.empty?
114
92
  files.each do |file|
115
- raise InvalidConfig,
116
- "Missing config file %s." % File.expand_path(file) \
117
- unless File.exist?(file)
118
- @config = Config::merge(@config, Config::load_conf(file))
93
+ unless File.exist?(file)
94
+ raise InvalidConfig,
95
+ format('Missing config file %s.', File.expand_path(file))
96
+ end
97
+ @config = Config.merge(@config, Config.load_conf(file))
119
98
  end
120
99
  end
121
100
 
122
101
  def before
123
102
  @config['before']
124
103
  end
104
+
125
105
  def after
126
106
  @config['after']
127
107
  end
@@ -129,61 +109,57 @@ class SiteDiff
129
109
  def paths
130
110
  @config['paths']
131
111
  end
112
+
132
113
  def paths=(paths)
133
- @config['paths'] = Config::normalize_paths(paths)
114
+ @config['paths'] = Config.normalize_paths(paths)
134
115
  end
135
116
 
136
117
  # Checks if the configuration is usable for diff-ing.
137
118
  def validate(opts = {})
138
- opts = { :need_before => true }.merge(opts)
119
+ opts = { need_before: true }.merge(opts)
139
120
 
140
121
  raise InvalidConfig, "Undefined 'before' base URL." if \
141
122
  opts[:need_before] && !before['url']
142
123
  raise InvalidConfig, "Undefined 'after' base URL." unless after['url']
143
- raise InvalidConfig, "Undefined 'paths'." unless (paths and !paths.empty?)
124
+ raise InvalidConfig, "Undefined 'paths'." unless paths && !paths.empty?
144
125
  end
145
126
 
146
127
  private
147
128
 
148
129
  def self.normalize_paths(paths)
149
130
  paths ||= []
150
- return paths.map { |p| (p[0] == '/' ? p : "/#{p}").chomp }
131
+ paths.map { |p| (p[0] == '/' ? p : "/#{p}").chomp }
151
132
  end
152
133
 
153
134
  # reads a YAML file and raises an InvalidConfig if the file is not valid.
154
135
  def self.load_raw_yaml(file)
155
- SiteDiff::log "Reading config file: #{Pathname.new(file).expand_path}"
136
+ SiteDiff.log "Reading config file: #{Pathname.new(file).expand_path}"
156
137
  conf = YAML.load_file(file) || {}
157
- unless conf.is_a? Hash
158
- raise InvalidConfig, "Invalid configuration file: '#{file}'"
159
- end
160
- conf.each do |k,v|
161
- unless CONF_KEYS.include? k
162
- raise InvalidConfig, "Unknown configuration key (#{file}): '#{k}'"
163
- end
138
+ raise InvalidConfig, "Invalid configuration file: '#{file}'" unless conf.is_a? Hash
139
+
140
+ conf.each_key do |k, _v|
141
+ raise InvalidConfig, "Unknown configuration key (#{file}): '#{k}'" unless CONF_KEYS.include? k
164
142
  end
165
143
  conf
166
144
  end
167
145
 
168
146
  # loads a single YAML configuration file, merges all its 'included' files
169
147
  # and returns a normalized Hash.
170
- def self.load_conf(file, visited=[])
148
+ def self.load_conf(file, visited = [])
171
149
  # don't get fooled by a/../a/ or symlinks
172
150
  file = File.realpath(file)
173
- if visited.include? file
174
- raise InvalidConfig, "Circular dependency: #{file}"
175
- end
151
+ raise InvalidConfig, "Circular dependency: #{file}" if visited.include? file
176
152
 
177
153
  conf = load_raw_yaml(file) # not normalized yet
178
154
  visited << file
179
155
 
180
156
  # normalize and merge includes
181
157
  includes = conf['includes'] || []
182
- conf = Config::normalize(conf)
158
+ conf = Config.normalize(conf)
183
159
  includes.each do |dep|
184
160
  # include paths are relative to the including file.
185
161
  dep = File.join(File.dirname(file), dep)
186
- conf = Config::merge(conf, load_conf(dep, visited))
162
+ conf = Config.merge(conf, load_conf(dep, visited))
187
163
  end
188
164
  conf
189
165
  end
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'sitediff/cache'
2
4
  require 'sitediff/config'
3
5
  require 'sitediff/crawler'
@@ -7,116 +9,119 @@ require 'typhoeus'
7
9
  require 'yaml'
8
10
 
9
11
  class SiteDiff
10
- class Config
11
- class Creator
12
- def initialize(*urls, &block)
13
- @after = urls.pop
14
- @before = urls.pop # May be nil
15
- end
12
+ class Config
13
+ class Creator
14
+ def initialize(concurrency, curl_opts, *urls)
15
+ @concurrency = concurrency
16
+ @after = urls.pop
17
+ @before = urls.pop # May be nil
18
+ @curl_opts = curl_opts
19
+ end
16
20
 
17
- def roots
18
- @roots = begin
19
- r = { :after => @after }
20
- r[:before] = @before if @before
21
- r
22
- end
23
- end
21
+ def roots
22
+ @roots = begin
23
+ r = { after: @after }
24
+ r[:before] = @before if @before
25
+ r
26
+ end
27
+ end
24
28
 
25
- # Build a config structure, return it
26
- def create(opts, &block)
27
- @config = {}
28
- @callback = block
29
+ # Build a config structure, return it
30
+ def create(opts, &block)
31
+ @config = {}
32
+ @callback = block
29
33
 
30
- # Handle options
31
- @dir = Pathname.new(opts[:directory])
32
- @depth = opts[:depth]
33
- @rules = Rules.new(@config, opts[:rules_disabled]) if opts[:rules]
34
+ # Handle options
35
+ @dir = Pathname.new(opts[:directory])
36
+ @depth = opts[:depth]
37
+ @rules = Rules.new(@config, opts[:rules_disabled]) if opts[:rules]
34
38
 
35
- # Create the dir. Must go before cache initialization!
36
- @dir.mkpath unless @dir.directory?
39
+ # Create the dir. Must go before cache initialization!
40
+ @dir.mkpath unless @dir.directory?
37
41
 
38
- # Setup instance vars
39
- @paths = Hash.new { |h,k| h[k] = Set.new }
40
- @cache = Cache.new(:file => @dir.+(Cache::DEFAULT_FILENAME).to_s,
41
- :create => true)
42
- @cache.write_tags << :before << :after
42
+ # Setup instance vars
43
+ @paths = Hash.new { |h, k| h[k] = Set.new }
44
+ @cache = Cache.new(dir: @dir.to_s, create: true)
45
+ @cache.write_tags << :before << :after
43
46
 
44
- build_config
45
- write_config
46
- end
47
+ build_config
48
+ write_config
49
+ end
47
50
 
48
- def build_config
49
- %w[before after].each do |tag|
50
- next unless u = roots[tag.to_sym]
51
- @config[tag] = {'url' => u}
52
- end
51
+ def build_config
52
+ %w[before after].each do |tag|
53
+ next unless (u = roots[tag.to_sym])
53
54
 
54
- crawl(@depth)
55
- @cache.close
56
- @rules.add_config if @rules
55
+ @config[tag] = { 'url' => u }
56
+ end
57
57
 
58
- @config['paths'] = @paths.values.reduce(&:|).to_a.sort
59
- end
58
+ crawl(@depth)
59
+ @rules&.add_config
60
60
 
61
- def crawl(depth = nil)
62
- hydra = Typhoeus::Hydra.new(max_concurrency: 10)
63
- roots.each do |tag, u|
64
- Crawler.new(hydra, u, depth) do |info|
65
- crawled_path(tag, info)
61
+ @config['paths'] = @paths.values.reduce(&:|).to_a.sort
66
62
  end
67
- end
68
- hydra.run
69
- end
70
63
 
71
- # Deduplicate paths with slashes at the end
72
- def canonicalize(tag, path)
73
- def altered_paths(path)
74
- yield path + '/'
75
- yield path.sub(%r[/$], '')
76
- end
64
+ def crawl(depth = nil)
65
+ hydra = Typhoeus::Hydra.new(max_concurrency: @concurrency)
66
+ roots.each do |tag, u|
67
+ Crawler.new(hydra, u, depth, @curl_opts) do |info|
68
+ crawled_path(tag, info)
69
+ end
70
+ end
71
+ hydra.run
72
+ end
77
73
 
78
- return path.empty? ? '/' : path
79
- end
74
+ # Deduplicate paths with slashes at the end
75
+ def canonicalize(_tag, path)
76
+ def altered_paths(path)
77
+ yield path + '/'
78
+ yield path.sub(%r{/$}, '')
79
+ end
80
80
 
81
- def crawled_path(tag, info)
82
- path, dup = canonicalize(tag, info.relative)
83
- return if dup
81
+ path.empty? ? '/' : path
82
+ end
84
83
 
85
- res = info.read_result
84
+ def crawled_path(tag, info)
85
+ path, dup = canonicalize(tag, info.relative)
86
+ return if dup
86
87
 
87
- @callback[tag, info]
88
- @paths[tag] << path
89
- @cache.set(tag, path, res)
88
+ res = info.read_result
90
89
 
91
- # If single-site, cache after as before!
92
- @cache.set(:before, path, res) unless roots[:before]
90
+ @callback[tag, info]
91
+ @paths[tag] << path
92
+ @cache.set(tag, path, res)
93
93
 
94
- @rules.handle_page(tag, res.content, info.document) if @rules && !res.error
95
- end
94
+ # If single-site, cache after as before!
95
+ @cache.set(:before, path, res) unless roots[:before]
96
96
 
97
- # Create a gitignore if we seem to be in git
98
- def make_gitignore(dir)
99
- # Check if we're in git
100
- return unless dir.realpath.to_enum(:ascend).any? { |d| d.+('.git').exist? }
101
-
102
- dir.+('.gitignore').open('w') do |f|
103
- f.puts <<-EOF.gsub(/^\s+/, '')
104
- output
105
- cache.db
106
- cache.db.db
107
- EOF
108
- end
109
- end
97
+ # This is used to populate the list of rules we guess are
98
+ # applicable to the current site.
99
+ @rules.handle_page(tag, res.content, info.document) if @rules && !res.error
100
+ end
110
101
 
111
- def config_file
112
- @dir + Config::DEFAULT_FILENAME
113
- end
102
+ # Create a gitignore if we seem to be in git
103
+ def make_gitignore(dir)
104
+ # Check if we're in git
105
+ return unless dir.realpath.to_enum(:ascend).any? { |d| d.+('.git').exist? }
106
+
107
+ dir.+('.gitignore').open('w') do |f|
108
+ f.puts <<-GITIGNORE.gsub(/^\s+/, '')
109
+ output
110
+ cache.db
111
+ cache.db.db
112
+ GITIGNORE
113
+ end
114
+ end
115
+
116
+ def config_file
117
+ @dir + Config::DEFAULT_FILENAME
118
+ end
114
119
 
115
- # Turn a config structure into a config file
116
- def write_config
117
- make_gitignore(@dir)
118
- config_file.open('w') { |f| f.puts @config.to_yaml }
120
+ # Turn a config structure into a config file
121
+ def write_config
122
+ make_gitignore(@dir)
123
+ config_file.open('w') { |f| f.puts @config.to_yaml }
124
+ end
125
+ end
119
126
  end
120
127
  end
121
- end
122
- end
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'sitediff'
2
4
  require 'sitediff/uriwrapper'
3
5
  require 'addressable/uri'
@@ -6,90 +8,99 @@ require 'ostruct'
6
8
  require 'set'
7
9
 
8
10
  class SiteDiff
9
- class Crawler
10
- class Info < OpenStruct; end
11
-
12
- DEFAULT_DEPTH = 3
11
+ class Crawler
12
+ class Info < OpenStruct; end
13
+
14
+ DEFAULT_DEPTH = 3
15
+
16
+ # Create a crawler with a base URL
17
+ def initialize(hydra, base, depth = DEFAULT_DEPTH,
18
+ curl_opts = UriWrapper::DEFAULT_CURL_OPTS, &block)
19
+ @hydra = hydra
20
+ @base_uri = Addressable::URI.parse(base)
21
+ @base = base
22
+ @found = Set.new
23
+ @callback = block
24
+ @curl_opts = curl_opts
25
+
26
+ add_uri('', depth)
27
+ end
13
28
 
14
- # Create a crawler with a base URL
15
- def initialize(hydra, base, depth = DEFAULT_DEPTH, &block)
16
- @hydra = hydra
17
- @base_uri = Addressable::URI.parse(base)
18
- @base = base
19
- @found = Set.new
20
- @callback = block
29
+ # Handle a newly found relative URI
30
+ def add_uri(rel, depth)
31
+ return if @found.include? rel
21
32
 
22
- add_uri('', depth)
23
- end
33
+ @found << rel
24
34
 
25
- # Handle a newly found relative URI
26
- def add_uri(rel, depth)
27
- return if @found.include? rel
28
- @found << rel
29
-
30
- wrapper = UriWrapper.new(@base + rel)
31
- wrapper.queue(@hydra) do |res|
32
- fetched_uri(rel, depth, res)
35
+ wrapper = UriWrapper.new(@base + rel, @curl_opts)
36
+ wrapper.queue(@hydra) do |res|
37
+ fetched_uri(rel, depth, res)
38
+ end
33
39
  end
34
- end
35
40
 
36
- # Handle the fetch of a URI
37
- def fetched_uri(rel, depth, res)
38
- return unless res.content # Ignore errors
39
- return unless depth >= 0
40
-
41
- base = Addressable::URI.parse(@base + rel)
42
- doc = Nokogiri::HTML(res.content)
43
-
44
- # Call the callback
45
- info = Info.new(
46
- :relative => rel,
47
- :uri => base,
48
- :read_result => res,
49
- :document => doc,
50
- )
51
- @callback[info]
52
-
53
- # Find links
54
- links = find_links(doc)
55
- uris = links.map { |l| resolve_link(base, l) }.compact
56
- uris = filter_links(uris)
57
-
58
- # Make them relative
59
- rels = uris.map { |u| relativize_link(u) }
60
-
61
- # Queue them in turn
62
- rels.each do |r|
63
- next if @found.include? r
64
- add_uri(r, depth - 1)
41
+ # Handle the fetch of a URI
42
+ def fetched_uri(rel, depth, res)
43
+ if res.error
44
+ SiteDiff.log(res.error, :error)
45
+ return
46
+ elsif !res.content
47
+ SiteDiff.log('Response is missing content. Treating as an error.', :error)
48
+ return
49
+ end
50
+
51
+ base = Addressable::URI.parse(@base + rel)
52
+ doc = Nokogiri::HTML(res.content)
53
+
54
+ # Call the callback
55
+ info = Info.new(
56
+ relative: rel,
57
+ uri: base,
58
+ read_result: res,
59
+ document: doc
60
+ )
61
+ @callback[info]
62
+
63
+ return unless depth >= 1
64
+
65
+ # Find links
66
+ links = find_links(doc)
67
+ uris = links.map { |l| resolve_link(base, l) }.compact
68
+ uris = filter_links(uris)
69
+
70
+ # Make them relative
71
+ rels = uris.map { |u| relativize_link(u) }
72
+
73
+ # Queue them in turn
74
+ rels.each do |r|
75
+ next if @found.include? r
76
+
77
+ add_uri(r, depth - 1)
78
+ end
65
79
  end
66
- end
67
80
 
68
- # Resolve a potentially-relative link. Return nil on error.
69
- def resolve_link(base, rel)
70
- begin
71
- return base + rel
81
+ # Resolve a potentially-relative link. Return nil on error.
82
+ def resolve_link(base, rel)
83
+ base + rel
72
84
  rescue Addressable::URI::InvalidURIError
73
- SiteDiff.log "skipped invalid URL: '#{rel}'", :warn
74
- return nil
85
+ SiteDiff.log "skipped invalid URL: '#{rel}' (at #{base})", :warn
86
+ nil
75
87
  end
76
- end
77
88
 
78
- # Make a link relative to @base_uri
79
- def relativize_link(uri)
80
- uri.path.slice(@base_uri.path.length, uri.path.length)
81
- end
89
+ # Make a link relative to @base_uri
90
+ def relativize_link(uri)
91
+ uri.path.slice(@base_uri.path.length, uri.path.length)
92
+ end
82
93
 
83
- # Return a list of string links found on a page.
84
- def find_links(doc)
85
- return doc.xpath('//a[@href]').map { |e| e['href'] }
86
- end
94
+ # Return a list of string links found on a page.
95
+ def find_links(doc)
96
+ doc.xpath('//a[@href]').map { |e| e['href'] }
97
+ end
87
98
 
88
- # Filter out links we don't want. Links passed in are absolute URIs.
89
- def filter_links(uris)
90
- uris.find_all do |u|
91
- u.host == @base_uri.host && u.path.start_with?(@base_uri.path)
99
+ # Filter out links we don't want. Links passed in are absolute URIs.
100
+ def filter_links(uris)
101
+ uris.find_all do |u|
102
+ u.host == @base_uri.host && u.path.start_with?(@base_uri.path)
103
+ end
92
104
  end
93
105
  end
94
106
  end
95
- end