recluse 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,292 @@
1
+ require 'recluse/hashtree'
2
+ require 'recluse/link'
3
+ require 'recluse/result'
4
+ require 'recluse/info'
5
+ require 'addressable/uri'
6
+ require 'mechanize'
7
+ require 'colorize'
8
+ require 'user_config'
9
+ require 'ruby-progressbar'
10
+
11
module Recluse
  ##
  # Error to throw if there's something non-standard with the profile configuration.
  class ProfileError < RuntimeError
  end

  ##
  # A profile is an atomic unit of rules for link checking.
  class Profile
    ##
    # Identifier of the profile. Make sure that it is filename friendly. Required.
    attr_accessor :name

    ##
    # Array of URLs to start spidering. Required.
    attr_accessor :roots

    ##
    # Used in the user-agent to identify who is running the crawler. This is so that if there's a problem with your spidering, you will be contacted and not the author of Recluse. Required.
    attr_accessor :email

    ##
    # Array of URL patterns handed, together with the whitelist, to <tt>Link#run?</tt> to decide whether a link is processed. Optional. Defaults to empty array.
    attr_accessor :blacklist

    ##
    # Array of exceptions to the blacklist. Optional. Defaults to empty array.
    attr_accessor :whitelist

    ##
    # Don't check external URLs. Optional. Defaults to +false+.
    attr_accessor :internal_only

    ##
    # HTTP and HTTPS schemed URLs are treated as equal. Optional. Defaults to +false+.
    attr_accessor :scheme_squash

    ##
    # +HashTree+ representation of results.
    attr_accessor :results

    ##
    # When enabled, will follow redirects and report only the status code for the page that is landed upon. When disabled, will report the redirect status code. Defaults to +false+.
    attr_accessor :redirect

    ##
    # Create a profile.
    #
    # Raises ProfileError when +roots+ is +nil+ or empty.
    def initialize(
      name,
      roots,
      email,
      blacklist: [],
      whitelist: [],
      internal_only: false,
      scheme_squash: false,
      redirect: false
    )
      raise ProfileError, 'Profile needs roots for starting point' if roots.nil? || roots.empty?
      @name = name
      @email = email
      @roots = roots
      @blacklist = blacklist
      @whitelist = whitelist
      @internal_only = internal_only
      @scheme_squash = scheme_squash
      @redirect = redirect
      @results = HashTree.new do |url1, url2|
        # Make url1 the longer of the two so only one direction needs testing.
        url1, url2 = url2, url1 if url2.length > url1.length
        # Detect if URL exists already, but just has a slash at end
        (url1 == url2 || (url1.length == (url2.length + 1) && url1[-1] == '/' && url2[-1] != '/' && url1[0...-1] == url2))
      end
    end

    ##
    # Create a +Mechanize+ agent.
    def create_agent
      Mechanize.new do |a|
        # NOTE(review): this pins the agent to TLS 1.0; confirm it is still wanted, since hosts that refuse TLS 1.0 cannot be reached with it.
        a.ssl_version = 'TLSv1'
        # Certificate verification is switched off.
        a.verify_mode = OpenSSL::SSL::VERIFY_NONE
        a.max_history = nil
        a.follow_meta_refresh = true
        a.keep_alive = false
        # Redirects are only followed when the profile asks for it.
        a.redirect_ok = @redirect
        a.user_agent = "Mozilla/5.0 (compatible; recluse/#{Recluse::VERSION}; +#{Recluse::URL}) #{@email}"
      end
    end

    ##
    # Starting from the roots, goes through each runnable link and records the referrer, the status code, and any errors.
    # Results are saved in <tt>@results</tt>.
    #
    # When +quiet+ is true nothing is printed. Raises ProfileError when there are no roots.
    def status(quiet: false)
      queue = @roots.map { |url| Link.new(url, :root) }
      addrroot = @roots.map { |url| Addressable::URI.parse url }
      raise ProfileError, 'No roots to start from' if queue.empty?
      agent = create_agent
      until queue.empty?
        element = queue.shift
        next unless element.run?(@blacklist, @whitelist)
        internal = element.internal?(addrroot)
        next if @internal_only && !internal
        # The referrer is always recorded; the URL itself is only fetched the first time it is seen.
        seen = @results.child?(element.absolute)
        @results.add element.absolute, element.parent
        next if seen
        if @scheme_squash
          alt = alternate_scheme(element)
          if @results.child?(alt)
            # Reuse the result already stored for the other scheme instead of fetching again.
            @results.set_child_value element.absolute, @results.get_child_value(alt)
            next
          end
        end
        result = Result.new 'idk', false
        begin
          page = agent.get element.absolute
          result.code = page.code
          # After a redirect, judge internal/external by the page that was actually landed upon.
          internal = Link.new(page.uri.to_s, element.parent).internal?(addrroot) if @redirect
          queue.concat(links_on(page, element.absolute)) if internal && html_page?(page)
        rescue Mechanize::ResponseCodeError => e
          result.code = e.response_code
        rescue => e
          result.error = e
        end
        @results.set_child_value element.absolute, result
        next if quiet
        puts "[#{@name.colorize(mode: :bold)}][#{result.code.colorize(color: result.color, mode: :bold)}][#{(internal ? 'internal' : 'external').colorize(mode: :bold)}] #{element.absolute}"
        puts "\a^ #{'Error'.colorize(mode: :bold, color: :red)}: #{result.error}" if result.error
      end
    end

    ##
    # Find links matching glob patterns, starting from the roots. Overrides (but does not overwrite) +internal_only+ behavior to +true+.
    #
    # When +quiet+ is true no progress bar is created and nothing is logged. Raises ProfileError when there are no roots.
    def find(glob, quiet: false)
      queue = @roots.map { |url| Link.new(url, :root) }
      addrroot = @roots.map { |url| Addressable::URI.parse url }
      raise ProfileError, 'No roots to start from' if queue.empty?
      progress = ProgressBar.create(total: nil, format: '|%B|') unless quiet
      agent = create_agent
      until queue.empty?
        element = queue.shift
        # A match is recorded before any filtering, so matching links are reported even when they are not crawled.
        if element.match? glob
          @results.add element.absolute, element.parent
          progress.log "[#{@name.colorize(mode: :bold)}][#{'found'.colorize(color: :green, mode: :bold)}] #{element.parent} => #{element.absolute}" unless quiet
        end
        next unless element.run?(@blacklist, @whitelist)
        next unless element.internal?(addrroot)
        next if @results.parent?(element.absolute)
        next if @scheme_squash && @results.parent?(alternate_scheme(element))
        @results.add_parent element.absolute
        result = Result.new 'idk', false
        begin
          page = agent.get element.absolute
          result.code = page.code
          # A redirect that lands outside the roots is not crawled further.
          next if @redirect && !Link.new(page.uri.to_s, element.parent).internal?(addrroot)
          queue.concat(links_on(page, element.absolute)) if html_page?(page)
        rescue Mechanize::ResponseCodeError => e
          result.code = e.response_code
        rescue => e
          result.error = e
        end
        next if quiet
        progress.increment
        next unless result.error
        progress.log "[#{@name.colorize(mode: :bold)}][#{result.code.colorize(color: result.color, mode: :bold)}] #{element.absolute}"
        progress.log "\a^ #{'Error'.colorize(mode: :bold, color: :red)}: #{result.error}"
      end
    end

    ##
    # Asserts existence of CSS selectors.
    #
    # For every crawled HTML page a hash of selector => boolean is stored in <tt>@results</tt>.
    # When +quiet+ is true nothing is printed. Raises ProfileError when there are no roots.
    def assert(selectors, quiet: false)
      queue = @roots.map { |url| Link.new(url, :root) }
      addrroot = @roots.map { |url| Addressable::URI.parse url }
      raise ProfileError, 'No roots to start from' if queue.empty?
      agent = create_agent
      until queue.empty?
        element = queue.shift
        internal = element.internal?(addrroot)
        next unless element.run?(@blacklist, @whitelist) && internal && !@results.child?(element.absolute)
        next if @scheme_squash && @results.child?(alternate_scheme(element))
        @results.add_child element.absolute
        existence = nil
        result = Result.new 'idk', false
        begin
          page = agent.get element.absolute
          result.code = page.code
          # A redirect that lands outside the roots is neither tested nor crawled.
          next if @redirect && !Link.new(page.uri.to_s, element.parent).internal?(addrroot)
          if html_page?(page)
            existence = {}
            selectors.each do |selector|
              existence[selector] = !page.css(selector).empty?
            end
            @results.set_child_value element.absolute, existence
            queue.concat(links_on(page, element.absolute))
          end
        rescue Mechanize::ResponseCodeError => e
          result.code = e.response_code
        rescue => e
          result.error = e
        end
        next if quiet
        if result.error
          puts "[#{@name.colorize(mode: :bold)}][#{result.code.colorize(color: result.color, mode: :bold)}] #{element.absolute}"
          puts "\a^ #{'Error'.colorize(mode: :bold, color: :red)}: #{result.error}"
        elsif !existence.nil?
          existence.each do |selector, exists|
            puts "[#{@name.colorize(mode: :bold)}][#{selector.colorize(mode: :bold)}][#{exists.to_s.colorize(color: (exists ? :green : :red), mode: :bold)}] #{element.absolute}"
          end
        end
      end
    end

    ##
    # Saves profile to <tt>~/.recluse/NAME.yaml</tt>.
    def save
      uconf = UserConfig.new '.recluse'
      options = uconf["#{@name}.yaml"]
      {
        'name' => @name,
        'roots' => @roots,
        'email' => @email,
        'blacklist' => @blacklist,
        'whitelist' => @whitelist,
        'internal_only' => @internal_only,
        'scheme_squash' => @scheme_squash,
        'redirect' => @redirect
      }.each { |key, value| options[key] = value }
      options.save
    end

    ##
    # Test if profiles share the same configuration options.
    #
    # Every instance variable except <tt>@results</tt> is compared; objects of another class are never equal.
    def ==(other)
      return false if other.class != self.class
      instance_variables.all? do |ivar|
        ivar == :@results || instance_variable_get(ivar) == other.instance_variable_get(ivar)
      end
    end

    ##
    # Loads profile by name.
    #
    # Raises ProfileError when no file exists for +profile+, or (through +new+) when the stored profile has no roots.
    def self.load(profile)
      uconf = UserConfig.new '.recluse'
      fname = "#{profile}.yaml"
      raise ProfileError, "Profile '#{profile}' doesn't exist" unless uconf.exist?(fname)
      options = uconf[fname]
      # A stored value only counts when its key is present and not nil.
      stored = ->(key, default) { options.key?(key) && !options[key].nil? ? options[key] : default }
      opts = {}
      [:blacklist, :whitelist, :internal_only, :scheme_squash, :redirect].each do |opt|
        key = opt.to_s
        # Absent options are left out so the defaults of +new+ apply.
        opts[opt] = options[key] if options.key?(key) && !options[key].nil?
      end
      Profile.new(profile, stored.call('roots', []), stored.call('email', ''), **opts)
    end

    private

    ##
    # The link's URL as a string with +http+ and +https+ swapped.
    def alternate_scheme(link)
      # Work on a copy so the scheme toggle cannot leak into the link's own address object
      # (Link#address is defined elsewhere; whether it already returns a copy is not visible here).
      alt = link.address.dup
      alt.scheme = alt.scheme == 'http' ? 'https' : 'http'
      alt.to_s
    end

    ##
    # Whether the fetched document is an HTML page whose links and selectors can be read.
    def html_page?(page)
      # Excluding only File and Image by exact class let other non-HTML documents through to
      # page.links / page.css, where the resulting NoMethodError was recorded as a crawl error.
      page.is_a?(Mechanize::Page)
    end

    ##
    # Builds a +Link+ for every link on +page+, with +parent+ as the referrer.
    def links_on(page, parent)
      page.links.map { |link| Link.new(link.uri.to_s, parent) }
    end
  end
end
@@ -0,0 +1,42 @@
1
module Recluse
  ##
  # Very simple result container.
  class Result
    ##
    # HTTP status code.
    attr_accessor :code

    ##
    # Access error message.
    attr_accessor :error

    ##
    # Create a result from a status +code+ and an +error+ value; both are stored as given.
    def initialize(code, error)
      @code = code
      @error = error
    end

    ##
    # Returns the HTTP status code as a string, so that +inspect+ yields a String even when the code is stored as an Integer.
    def inspect
      @code.to_s
    end

    ##
    # Color based on code.
    #
    # The code is converted with +to_i+ and classified by its hundreds digit:
    # 2xx is green, 3xx yellow, 4xx and 5xx red, anything else blue.
    def color
      case @code.to_i / 100
      when 2 then :green
      when 3 then :yellow
      when 4, 5 then :red
      else :blue
      end
    end
  end
end
@@ -0,0 +1,91 @@
1
module Recluse
  ##
  # Error to throw if there's something non-standard with the status code.
  class StatusCodeError < RuntimeError
  end

  ##
  # An HTTP status code.
  class StatusCode
    ##
    # The status code. Either a number, a string with x's to represent wildcards, or 'idk'.
    attr_reader :code

    ##
    # Whether or not this is an exact numerical code.
    attr_reader :exact

    ##
    # Create a status code from a String, an Integer, or another StatusCode.
    #
    # Raises StatusCodeError when +code+ does not pass +valid_code?+.
    def initialize(code)
      raise StatusCodeError, "Invalid status code: #{code}" unless StatusCode.valid_code?(code)
      case code
      when String
        if code =~ /\A\d{3}\z/ # whole number
          @code = code.to_i
          @exact = true
        else # wildcards or idk
          @code = code.downcase
          # 'idk' contains no wildcard, so it is treated as exact.
          @exact = @code == 'idk'
        end
      when Recluse::StatusCode
        @code = code.code
        @exact = code.exact
      when Integer
        @code = code
        @exact = true
      end
    end

    ##
    # Output the status code to a string.
    def to_s
      @code.to_s
    end

    ##
    # Whether or not this is an exact numerical code.
    def exact?
      @exact
    end

    ##
    # Is this code equal to another?
    #
    # +other+ may be anything accepted by +new+; StatusCodeError is raised otherwise.
    # Two exact codes are compared by value; otherwise the codes are compared
    # character by character with x's acting as wildcards.
    def equal?(other)
      comparable = StatusCode.new other
      return @code == comparable.code if exact? && comparable.exact?
      self_s = to_s
      comparable_s = comparable.to_s
      (0...3).all? do |i|
        StatusCode.equal_digit?(self_s[i], comparable_s[i])
      end
    end

    ##
    # Is the passed code valid?
    #
    # Strings must consist of exactly three digits/x's or be 'idk' (case-insensitive);
    # unless the first character is an x, the leading digit must be 1 to 5.
    # Integers must lie in 100...600. StatusCode instances are always valid; anything else is not.
    def self.valid_code?(code)
      case code
      when String
        code = code.downcase
        # \A and \z anchor the whole string; ^ and $ only anchor a line, which let
        # multi-line input such as "200\nanything" pass validation.
        return false unless code =~ /\A([\dx]{3}|idk)\z/
        return true if (code == 'idk') || (code[0] == 'x')
        (1..5).cover?(code[0].to_i)
      when Integer
        (100...600).cover?(code)
      when Recluse::StatusCode
        true
      else
        false
      end
    end

    class << self
      ##
      # Digital comparison. x's are wildcards.
      def equal_digit?(a, b)
        ((a == b) || (a == 'x') || (b == 'x'))
      end
    end
  end
end
@@ -0,0 +1,34 @@
1
# coding: utf-8
# Gem specification for recluse. The version and homepage are read from
# recluse/info, so lib/ is put on the load path before that file is required.
lib_dir = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)
require 'recluse/info'

Gem::Specification.new do |spec|
  spec.name = 'recluse'
  spec.version = Recluse::VERSION
  spec.authors = ['James Anthony Bruno']
  spec.email = ['j.bruno.che@gmail.com']

  spec.summary = 'Friendly, neighborhood web crawler for quality assurance.'
  spec.homepage = Recluse::URL
  spec.license = 'MIT'

  # Package every git-tracked file except those under test/, spec/ or features/.
  tracked = `git ls-files -z`.split("\x0")
  spec.files = tracked.reject { |path| path.match(%r{^(test|spec|features)/}) }
  spec.bindir = 'exe'
  # Every packaged file under exe/ becomes an executable, listed by base name.
  spec.executables = spec.files.select { |path| path =~ %r{^exe/} }.map { |path| File.basename(path) }
  spec.require_paths = ['lib']

  # Runtime dependencies, in declaration order.
  {
    'thor' => '~> 0.19.1',
    'mechanize' => '~> 2.7.5',
    'nokogiri' => '~> 1.7.0.1',
    'addressable' => '~> 2.4.0',
    'colorize' => '~> 0.7.7',
    'user_config' => '~> 0.0.4',
    'ruby-progressbar' => '~> 1.8.1'
  }.each { |gem_name, requirement| spec.add_runtime_dependency gem_name, requirement }

  # Development-only dependencies, in declaration order.
  {
    'bundler' => '~> 1.12',
    'rake' => '~> 10.0',
    'rubocop' => '~> 0.47.1',
    'minitest' => '~> 5.10.1',
    'minitest-reporters' => '~> 1.1.14'
  }.each { |gem_name, requirement| spec.add_development_dependency gem_name, requirement }
end