sitediff 0.0.3 → 0.0.5

--- 0.0.3/lib/sitediff/config.rb
+++ 0.0.5/lib/sitediff/config.rb
@@ -1,3 +1,5 @@
+# frozen_string_literal: true
+
 require 'sitediff/exception'
 require 'sitediff/sanitize'
 require 'pathname'
@@ -9,7 +11,7 @@ class SiteDiff
 
     # keys allowed in configuration files
     CONF_KEYS = Sanitizer::TOOLS.values.flatten(1) +
-                %w[paths before after before_url after_url includes]
+                %w[paths before after before_url after_url includes curl_opts]
 
     class InvalidConfig < SiteDiffException; end
     class ConfigNotFound < SiteDiffException; end
@@ -41,13 +43,14 @@ class SiteDiff
           conf[pos][key] ||= []
           conf[pos][key] += conf[key] if conf[key]
         end
-        tools[:scalar].each {|key| conf[pos][key] ||= conf[key]}
+        tools[:scalar].each { |key| conf[pos][key] ||= conf[key] }
         conf[pos]['url'] ||= conf[pos + '_url']
+        conf[pos]['curl_opts'] = conf['curl_opts']
       end
       # normalize paths
-      conf['paths'] = Config::normalize_paths(conf['paths'])
+      conf['paths'] = Config.normalize_paths(conf['paths'])
 
-      conf.select {|k,v| %w[before after paths].include? k}
+      conf.select { |k, _v| %w[before after paths curl_opts].include? k }
     end
 
     # Merges two normalized Hashes according to the following rules:
@@ -72,56 +75,33 @@ class SiteDiff
           next
         end
         result[pos] = first[pos].merge!(second[pos]) do |key, a, b|
-          if Sanitizer::TOOLS[:array].include? key # rule 2a
-            result[pos][key] = (a || []) + (b|| [])
-          else
-            result[pos][key] = a || b # rule 2b
-          end
+          result[pos][key] = if Sanitizer::TOOLS[:array].include? key # rule 2a
+                               (a || []) + (b || [])
+                             else
+                               a || b # rule 2b
+                             end
         end
       end
       result
     end
 
-    # Search for a config file. If found, change to the containing directory,
-    # and return an array of config files found.
-    def self.search
-      subdirs = %w[. sitediff]
-      root_indicators = %w[.git .svn]
-
-      Pathname.pwd.ascend do |dir|
-        subdirs.each do |subdir|
-          d = dir + subdir + DEFAULT_FILENAME
-          if d.exist?
-            Dir.chdir(dir.+(subdir).to_s)
-            return [DEFAULT_FILENAME]
-          end
-        end
-
-        root_indicators.each { |r| return [] if dir.+(r).exist? }
-      end
-
-      return []
-    end
-
-    def initialize(files, opts = {})
-      @config = {'paths' => [], 'before' => {}, 'after' => {} }
-
-      files = Config.search if files.empty? && opts[:search]
-      files = [DEFAULT_FILENAME] if files.empty? &&
-                                    File.exists?(DEFAULT_FILENAME)
-      raise ConfigNotFound, "No configuration file found." if files.empty?
+    def initialize(files, dir)
+      @config = { 'paths' => [], 'before' => {}, 'after' => {} }
 
+      files = [File.join(dir, DEFAULT_FILENAME)] if files.empty?
       files.each do |file|
-        raise InvalidConfig,
-              "Missing config file %s." % File.expand_path(file) \
-              unless File.exist?(file)
-        @config = Config::merge(@config, Config::load_conf(file))
+        unless File.exist?(file)
+          raise InvalidConfig,
+                format('Missing config file %s.', File.expand_path(file))
+        end
+        @config = Config.merge(@config, Config.load_conf(file))
       end
     end
 
     def before
       @config['before']
     end
+
     def after
       @config['after']
     end
@@ -129,61 +109,57 @@ class SiteDiff
     def paths
       @config['paths']
     end
+
     def paths=(paths)
-      @config['paths'] = Config::normalize_paths(paths)
+      @config['paths'] = Config.normalize_paths(paths)
     end
 
     # Checks if the configuration is usable for diff-ing.
     def validate(opts = {})
-      opts = { :need_before => true }.merge(opts)
+      opts = { need_before: true }.merge(opts)
 
       raise InvalidConfig, "Undefined 'before' base URL." if \
        opts[:need_before] && !before['url']
      raise InvalidConfig, "Undefined 'after' base URL." unless after['url']
-      raise InvalidConfig, "Undefined 'paths'." unless (paths and !paths.empty?)
+      raise InvalidConfig, "Undefined 'paths'." unless paths && !paths.empty?
     end
 
     private
 
     def self.normalize_paths(paths)
       paths ||= []
-      return paths.map { |p| (p[0] == '/' ? p : "/#{p}").chomp }
+      paths.map { |p| (p[0] == '/' ? p : "/#{p}").chomp }
     end
 
     # reads a YAML file and raises an InvalidConfig if the file is not valid.
     def self.load_raw_yaml(file)
-      SiteDiff::log "Reading config file: #{Pathname.new(file).expand_path}"
+      SiteDiff.log "Reading config file: #{Pathname.new(file).expand_path}"
       conf = YAML.load_file(file) || {}
-      unless conf.is_a? Hash
-        raise InvalidConfig, "Invalid configuration file: '#{file}'"
-      end
-      conf.each do |k,v|
-        unless CONF_KEYS.include? k
-          raise InvalidConfig, "Unknown configuration key (#{file}): '#{k}'"
-        end
+      raise InvalidConfig, "Invalid configuration file: '#{file}'" unless conf.is_a? Hash
+
+      conf.each_key do |k, _v|
+        raise InvalidConfig, "Unknown configuration key (#{file}): '#{k}'" unless CONF_KEYS.include? k
       end
       conf
     end
 
     # loads a single YAML configuration file, merges all its 'included' files
     # and returns a normalized Hash.
-    def self.load_conf(file, visited=[])
+    def self.load_conf(file, visited = [])
       # don't get fooled by a/../a/ or symlinks
       file = File.realpath(file)
-      if visited.include? file
-        raise InvalidConfig, "Circular dependency: #{file}"
-      end
+      raise InvalidConfig, "Circular dependency: #{file}" if visited.include? file
 
       conf = load_raw_yaml(file) # not normalized yet
      visited << file
 
      # normalize and merge includes
      includes = conf['includes'] || []
-      conf = Config::normalize(conf)
+      conf = Config.normalize(conf)
      includes.each do |dep|
        # include paths are relative to the including file.
        dep = File.join(File.dirname(file), dep)
-        conf = Config::merge(conf, load_conf(dep, visited))
+        conf = Config.merge(conf, load_conf(dep, visited))
      end
      conf
    end
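
The `Config` changes above amount to three things: `curl_opts` becomes an accepted top-level key that `normalize` mirrors into both the `before` and `after` sections, the cwd-walking `Config.search` lookup is removed, and the constructor now takes an explicit directory instead of an options hash. A minimal sketch of the new entry point (the directory name is a placeholder, and it assumes the gem's default `sitediff.yaml` filename):

```ruby
require 'sitediff/config'

# 0.0.5: with no explicit files, the config is read from
# <dir>/sitediff.yaml rather than searched for up the directory tree.
config = SiteDiff::Config.new([], 'sitediff')

# 'after' must define a URL; 'before' may be skipped when diffing a
# single site against its cached copy.
config.validate(need_before: false)
```
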
--- 0.0.3/lib/sitediff/config/creator.rb
+++ 0.0.5/lib/sitediff/config/creator.rb
@@ -1,3 +1,5 @@
+# frozen_string_literal: true
+
 require 'sitediff/cache'
 require 'sitediff/config'
 require 'sitediff/crawler'
@@ -7,116 +9,119 @@ require 'typhoeus'
 require 'yaml'
 
 class SiteDiff
-class Config
-class Creator
-  def initialize(*urls, &block)
-    @after = urls.pop
-    @before = urls.pop # May be nil
-  end
+  class Config
+    class Creator
+      def initialize(concurrency, curl_opts, *urls)
+        @concurrency = concurrency
+        @after = urls.pop
+        @before = urls.pop # May be nil
+        @curl_opts = curl_opts
+      end
 
-  def roots
-    @roots = begin
-      r = { :after => @after }
-      r[:before] = @before if @before
-      r
-    end
-  end
+      def roots
+        @roots = begin
+          r = { after: @after }
+          r[:before] = @before if @before
+          r
+        end
+      end
 
-  # Build a config structure, return it
-  def create(opts, &block)
-    @config = {}
-    @callback = block
+      # Build a config structure, return it
+      def create(opts, &block)
+        @config = {}
+        @callback = block
 
-    # Handle options
-    @dir = Pathname.new(opts[:directory])
-    @depth = opts[:depth]
-    @rules = Rules.new(@config, opts[:rules_disabled]) if opts[:rules]
+        # Handle options
+        @dir = Pathname.new(opts[:directory])
+        @depth = opts[:depth]
+        @rules = Rules.new(@config, opts[:rules_disabled]) if opts[:rules]
 
-    # Create the dir. Must go before cache initialization!
-    @dir.mkpath unless @dir.directory?
+        # Create the dir. Must go before cache initialization!
+        @dir.mkpath unless @dir.directory?
 
-    # Setup instance vars
-    @paths = Hash.new { |h,k| h[k] = Set.new }
-    @cache = Cache.new(:file => @dir.+(Cache::DEFAULT_FILENAME).to_s,
-                       :create => true)
-    @cache.write_tags << :before << :after
+        # Setup instance vars
+        @paths = Hash.new { |h, k| h[k] = Set.new }
+        @cache = Cache.new(dir: @dir.to_s, create: true)
+        @cache.write_tags << :before << :after
 
-    build_config
-    write_config
-  end
+        build_config
+        write_config
+      end
 
-  def build_config
-    %w[before after].each do |tag|
-      next unless u = roots[tag.to_sym]
-      @config[tag] = {'url' => u}
-    end
+      def build_config
+        %w[before after].each do |tag|
+          next unless (u = roots[tag.to_sym])
 
-    crawl(@depth)
-    @cache.close
-    @rules.add_config if @rules
+          @config[tag] = { 'url' => u }
+        end
 
-    @config['paths'] = @paths.values.reduce(&:|).to_a.sort
-  end
+        crawl(@depth)
+        @rules&.add_config
 
-  def crawl(depth = nil)
-    hydra = Typhoeus::Hydra.new(max_concurrency: 10)
-    roots.each do |tag, u|
-      Crawler.new(hydra, u, depth) do |info|
-        crawled_path(tag, info)
+        @config['paths'] = @paths.values.reduce(&:|).to_a.sort
       end
-    end
-    hydra.run
-  end
 
-  # Deduplicate paths with slashes at the end
-  def canonicalize(tag, path)
-    def altered_paths(path)
-      yield path + '/'
-      yield path.sub(%r[/$], '')
-    end
+      def crawl(depth = nil)
+        hydra = Typhoeus::Hydra.new(max_concurrency: @concurrency)
+        roots.each do |tag, u|
+          Crawler.new(hydra, u, depth, @curl_opts) do |info|
+            crawled_path(tag, info)
+          end
+        end
+        hydra.run
+      end
 
-    return path.empty? ? '/' : path
-  end
+      # Deduplicate paths with slashes at the end
+      def canonicalize(_tag, path)
+        def altered_paths(path)
+          yield path + '/'
+          yield path.sub(%r{/$}, '')
+        end
 
-  def crawled_path(tag, info)
-    path, dup = canonicalize(tag, info.relative)
-    return if dup
+        path.empty? ? '/' : path
+      end
 
-    res = info.read_result
+      def crawled_path(tag, info)
+        path, dup = canonicalize(tag, info.relative)
+        return if dup
 
-    @callback[tag, info]
-    @paths[tag] << path
-    @cache.set(tag, path, res)
+        res = info.read_result
 
-    # If single-site, cache after as before!
-    @cache.set(:before, path, res) unless roots[:before]
+        @callback[tag, info]
+        @paths[tag] << path
+        @cache.set(tag, path, res)
 
-    @rules.handle_page(tag, res.content, info.document) if @rules && !res.error
-  end
+        # If single-site, cache after as before!
+        @cache.set(:before, path, res) unless roots[:before]
 
-  # Create a gitignore if we seem to be in git
-  def make_gitignore(dir)
-    # Check if we're in git
-    return unless dir.realpath.to_enum(:ascend).any? { |d| d.+('.git').exist? }
-
-    dir.+('.gitignore').open('w') do |f|
-      f.puts <<-EOF.gsub(/^\s+/, '')
-      output
-      cache.db
-      cache.db.db
-      EOF
-    end
-  end
+        # This is used to populate the list of rules we guess are
+        # applicable to the current site.
+        @rules.handle_page(tag, res.content, info.document) if @rules && !res.error
+      end
 
-  def config_file
-    @dir + Config::DEFAULT_FILENAME
-  end
+      # Create a gitignore if we seem to be in git
+      def make_gitignore(dir)
+        # Check if we're in git
+        return unless dir.realpath.to_enum(:ascend).any? { |d| d.+('.git').exist? }
+
+        dir.+('.gitignore').open('w') do |f|
+          f.puts <<-GITIGNORE.gsub(/^\s+/, '')
+            output
+            cache.db
+            cache.db.db
+          GITIGNORE
+        end
+      end
+
+      def config_file
+        @dir + Config::DEFAULT_FILENAME
+      end
 
-  # Turn a config structure into a config file
-  def write_config
-    make_gitignore(@dir)
-    config_file.open('w') { |f| f.puts @config.to_yaml }
+      # Turn a config structure into a config file
+      def write_config
+        make_gitignore(@dir)
+        config_file.open('w') { |f| f.puts @config.to_yaml }
+      end
+    end
   end
 end
-end
-end
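
`Creator` now receives the crawl concurrency and the curl options explicitly and threads them through to `Typhoeus::Hydra` and each `Crawler`; the hard-coded `max_concurrency: 10` and the explicit `@cache.close` are gone, and the cache is opened by directory (`Cache.new(dir: ...)`) rather than by file path. A sketch of the new calling convention (the URLs, option values, and output format here are placeholders, not from the diff):

```ruby
require 'sitediff/config/creator'

# concurrency feeds Typhoeus::Hydra; curl_opts is handed to every fetch.
creator = SiteDiff::Config::Creator.new(
  3,                          # max concurrent requests
  { followlocation: true },   # curl_opts
  'http://before.example.com',
  'http://after.example.com'
)

# The block runs once per crawled page, per side.
creator.create(directory: 'sitediff', depth: 2, rules: false) do |tag, info|
  puts format('%-6s %s', tag, info.relative)
end
```
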
--- 0.0.3/lib/sitediff/crawler.rb
+++ 0.0.5/lib/sitediff/crawler.rb
@@ -1,3 +1,5 @@
+# frozen_string_literal: true
+
 require 'sitediff'
 require 'sitediff/uriwrapper'
 require 'addressable/uri'
@@ -6,90 +8,99 @@ require 'ostruct'
 require 'set'
 
 class SiteDiff
-class Crawler
-  class Info < OpenStruct; end
-
-  DEFAULT_DEPTH = 3
+  class Crawler
+    class Info < OpenStruct; end
+
+    DEFAULT_DEPTH = 3
+
+    # Create a crawler with a base URL
+    def initialize(hydra, base, depth = DEFAULT_DEPTH,
+                   curl_opts = UriWrapper::DEFAULT_CURL_OPTS, &block)
+      @hydra = hydra
+      @base_uri = Addressable::URI.parse(base)
+      @base = base
+      @found = Set.new
+      @callback = block
+      @curl_opts = curl_opts
+
+      add_uri('', depth)
+    end
 
-  # Create a crawler with a base URL
-  def initialize(hydra, base, depth = DEFAULT_DEPTH, &block)
-    @hydra = hydra
-    @base_uri = Addressable::URI.parse(base)
-    @base = base
-    @found = Set.new
-    @callback = block
+    # Handle a newly found relative URI
+    def add_uri(rel, depth)
+      return if @found.include? rel
 
-    add_uri('', depth)
-  end
+      @found << rel
 
-  # Handle a newly found relative URI
-  def add_uri(rel, depth)
-    return if @found.include? rel
-    @found << rel
-
-    wrapper = UriWrapper.new(@base + rel)
-    wrapper.queue(@hydra) do |res|
-      fetched_uri(rel, depth, res)
+      wrapper = UriWrapper.new(@base + rel, @curl_opts)
+      wrapper.queue(@hydra) do |res|
+        fetched_uri(rel, depth, res)
+      end
     end
-  end
 
-  # Handle the fetch of a URI
-  def fetched_uri(rel, depth, res)
-    return unless res.content # Ignore errors
-    return unless depth >= 0
-
-    base = Addressable::URI.parse(@base + rel)
-    doc = Nokogiri::HTML(res.content)
-
-    # Call the callback
-    info = Info.new(
-      :relative => rel,
-      :uri => base,
-      :read_result => res,
-      :document => doc,
-    )
-    @callback[info]
-
-    # Find links
-    links = find_links(doc)
-    uris = links.map { |l| resolve_link(base, l) }.compact
-    uris = filter_links(uris)
-
-    # Make them relative
-    rels = uris.map { |u| relativize_link(u) }
-
-    # Queue them in turn
-    rels.each do |r|
-      next if @found.include? r
-      add_uri(r, depth - 1)
+    # Handle the fetch of a URI
+    def fetched_uri(rel, depth, res)
+      if res.error
+        SiteDiff.log(res.error, :error)
+        return
+      elsif !res.content
+        SiteDiff.log('Response is missing content. Treating as an error.', :error)
+        return
+      end
+
+      base = Addressable::URI.parse(@base + rel)
+      doc = Nokogiri::HTML(res.content)
+
+      # Call the callback
+      info = Info.new(
+        relative: rel,
+        uri: base,
+        read_result: res,
+        document: doc
+      )
+      @callback[info]
+
+      return unless depth >= 1
+
+      # Find links
+      links = find_links(doc)
+      uris = links.map { |l| resolve_link(base, l) }.compact
+      uris = filter_links(uris)
+
+      # Make them relative
+      rels = uris.map { |u| relativize_link(u) }
+
+      # Queue them in turn
+      rels.each do |r|
+        next if @found.include? r
+
+        add_uri(r, depth - 1)
+      end
     end
-  end
 
-  # Resolve a potentially-relative link. Return nil on error.
-  def resolve_link(base, rel)
-    begin
-      return base + rel
+    # Resolve a potentially-relative link. Return nil on error.
+    def resolve_link(base, rel)
+      base + rel
     rescue Addressable::URI::InvalidURIError
-      SiteDiff.log "skipped invalid URL: '#{rel}'", :warn
-      return nil
+      SiteDiff.log "skipped invalid URL: '#{rel}' (at #{base})", :warn
+      nil
     end
-  end
 
-  # Make a link relative to @base_uri
-  def relativize_link(uri)
-    uri.path.slice(@base_uri.path.length, uri.path.length)
-  end
+    # Make a link relative to @base_uri
+    def relativize_link(uri)
+      uri.path.slice(@base_uri.path.length, uri.path.length)
+    end
 
-  # Return a list of string links found on a page.
-  def find_links(doc)
-    return doc.xpath('//a[@href]').map { |e| e['href'] }
-  end
+    # Return a list of string links found on a page.
+    def find_links(doc)
+      doc.xpath('//a[@href]').map { |e| e['href'] }
+    end
 
-  # Filter out links we don't want. Links passed in are absolute URIs.
-  def filter_links(uris)
-    uris.find_all do |u|
-      u.host == @base_uri.host && u.path.start_with?(@base_uri.path)
+    # Filter out links we don't want. Links passed in are absolute URIs.
+    def filter_links(uris)
+      uris.find_all do |u|
+        u.host == @base_uri.host && u.path.start_with?(@base_uri.path)
+      end
     end
   end
 end
-end
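
Beyond the `curl_opts` plumbing, two behavioral changes in `Crawler` stand out: fetch errors are now logged through `SiteDiff.log` instead of being silently dropped, and the depth check moved below the callback and became `depth >= 1`, so a page at the depth limit is still reported but its links are no longer queued (0.0.3 fetched one level further and discarded the results). A sketch of driving the crawler directly (the host and depth are placeholders):

```ruby
require 'typhoeus'
require 'sitediff/crawler'

hydra = Typhoeus::Hydra.new(max_concurrency: 3)

# The block fires for every fetched page; '' is the root path.
SiteDiff::Crawler.new(hydra, 'http://example.com', 2) do |info|
  puts info.relative
end

hydra.run # crawl to depth 2, following same-host links only
```
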