recluse 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +12 -0
- data/.rubocop.yml +2 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +21 -0
- data/README.md +235 -0
- data/Rakefile +48 -0
- data/exe/recluse +5 -0
- data/lib/recluse.rb +7 -0
- data/lib/recluse/cli/blacklist.rb +59 -0
- data/lib/recluse/cli/main.rb +287 -0
- data/lib/recluse/cli/profile.rb +117 -0
- data/lib/recluse/cli/roots.rb +59 -0
- data/lib/recluse/cli/whitelist.rb +59 -0
- data/lib/recluse/hashtree.rb +172 -0
- data/lib/recluse/info.rb +9 -0
- data/lib/recluse/link.rb +89 -0
- data/lib/recluse/profile.rb +292 -0
- data/lib/recluse/result.rb +42 -0
- data/lib/recluse/statuscode.rb +91 -0
- data/recluse.gemspec +34 -0
- metadata +233 -0
@@ -0,0 +1,292 @@
|
|
1
|
+
require 'recluse/hashtree'
|
2
|
+
require 'recluse/link'
|
3
|
+
require 'recluse/result'
|
4
|
+
require 'recluse/info'
|
5
|
+
require 'addressable/uri'
|
6
|
+
require 'mechanize'
|
7
|
+
require 'colorize'
|
8
|
+
require 'user_config'
|
9
|
+
require 'ruby-progressbar'
|
10
|
+
|
11
|
+
module Recluse
  ##
  # Error to throw if there's something non-standard with the profile configuration.
  class ProfileError < RuntimeError
  end

  ##
  # A profile is an atomic unit of rules for link checking.
  class Profile
    ##
    # Identifier of the profile. Make sure that it is filename friendly. Required.
    attr_accessor :name

    ##
    # Array of URLs to start spidering. Required.
    attr_accessor :roots

    ##
    # Used in the user-agent to identify who is running the crawler. This is so that if there's a problem with your spidering, you will be contacted and not the author of Recluse. Required.
    attr_accessor :email

    ##
    # Array of blacklisted URL patterns; handed to <tt>Link#run?</tt> together with the whitelist. Optional. Defaults to empty array.
    attr_accessor :blacklist

    ##
    # Array of exceptions to the blacklist. Optional. Defaults to empty array.
    attr_accessor :whitelist

    ##
    # Don't check external URLs. Optional. Defaults to +false+.
    attr_accessor :internal_only

    ##
    # HTTP and HTTPS schemed URLs are treated as equal. Optional. Defaults to +false+.
    attr_accessor :scheme_squash

    ##
    # +HashTree+ representation of results.
    attr_accessor :results

    ##
    # When enabled, will follow redirects and report only the status code for the page that is landed upon. When disabled, will report the redirect status code. Defaults to +false+.
    attr_accessor :redirect

    ##
    # Whether two URLs should be treated as the same entry in the results:
    # they are either identical, or differ only by a single trailing slash.
    def self.same_url?(url1, url2)
      longer, shorter = url1.length >= url2.length ? [url1, url2] : [url2, url1]
      return true if longer == shorter
      longer.length == shorter.length + 1 &&
        longer[-1] == '/' &&
        shorter[-1] != '/' &&
        longer[0...-1] == shorter
    end

    ##
    # Create a profile.
    #
    # Raises ProfileError when +roots+ is +nil+ or empty.
    def initialize(
      name,
      roots,
      email,
      blacklist: [],
      whitelist: [],
      internal_only: false,
      scheme_squash: false,
      redirect: false
    )
      raise ProfileError, 'Profile needs roots for starting point' if roots.nil? || roots.empty?
      @name = name
      @email = email
      @roots = roots
      @blacklist = blacklist
      @whitelist = whitelist
      @internal_only = internal_only
      @scheme_squash = scheme_squash
      @redirect = redirect
      # The block tells the HashTree when two keys count as the same URL.
      @results = HashTree.new { |url1, url2| Profile.same_url?(url1, url2) }
    end

    ##
    # Create a +Mechanize+ agent.
    def create_agent
      Mechanize.new do |a|
        # No SSL/TLS version is pinned: the previous hard-coded 'TLSv1' is
        # refused by servers that only speak newer protocol versions, so the
        # negotiation is left to OpenSSL.
        # NOTE(review): certificate verification is disabled, presumably so
        # that links on hosts with broken certificates can still be checked.
        a.verify_mode = OpenSSL::SSL::VERIFY_NONE
        a.max_history = nil
        a.follow_meta_refresh = true
        a.keep_alive = false
        a.redirect_ok = @redirect
        a.user_agent = "Mozilla/5.0 (compatible; recluse/#{Recluse::VERSION}; +#{Recluse::URL}) #{@email}"
      end
    end

    ##
    # Starting from the roots, goes through each runnable link and records the referrer, the status code, and any errors.
    # Results are saved in <tt>@results</tt>.
    #
    # Prints one line per checked link (plus an error line when the request
    # failed) unless +quiet+ is set. Raises ProfileError when there are no roots.
    def status(quiet: false)
      queue, addrroot = crawl_roots
      agent = create_agent
      until queue.empty?
        element = queue.shift
        next unless element.run?(@blacklist, @whitelist)
        internal = element.internal?(addrroot)
        next if @internal_only && !internal
        # The referrer is always recorded; the request itself is only made the
        # first time a URL is seen.
        seen = @results.child?(element.absolute)
        @results.add element.absolute, element.parent
        next if seen
        if @scheme_squash
          alt = alternate_scheme_url(element)
          if @results.child?(alt)
            # Reuse the value already stored for the other scheme.
            @results.set_child_value element.absolute, @results.get_child_value(alt)
            next
          end
        end
        result = Result.new 'idk', false
        begin
          page = agent.get element.absolute
          result.code = page.code
          # After following redirects, internal/external is judged on the landing URL.
          internal = landed_internal?(page, element, addrroot) if @redirect
          queue.concat(links_on(page, element.absolute)) if internal && spiderable?(page)
        rescue Mechanize::ResponseCodeError => response_error
          result.code = response_error.response_code
        rescue => e
          result.error = e
        end
        @results.set_child_value element.absolute, result
        next if quiet
        puts "[#{@name.colorize(mode: :bold)}][#{result.code.colorize(color: result.color, mode: :bold)}][#{(internal ? 'internal' : 'external').colorize(mode: :bold)}] #{element.absolute}"
        puts error_line(result) if result.error
      end
    end

    ##
    # Find links matching glob patterns, starting from the roots. Overrides (but does not overwrite) +internal_only+ behavior to +true+.
    #
    # Matches are stored in <tt>@results</tt> and logged through a progress bar
    # unless +quiet+ is set. Raises ProfileError when there are no roots.
    def find(glob, quiet: false)
      queue, addrroot = crawl_roots
      progress = ProgressBar.create(total: nil, format: '|%B|') unless quiet
      agent = create_agent
      until queue.empty?
        element = queue.shift
        if element.match?(glob)
          @results.add element.absolute, element.parent
          progress.log "[#{@name.colorize(mode: :bold)}][#{'found'.colorize(color: :green, mode: :bold)}] #{element.parent} => #{element.absolute}" unless quiet
        end
        next unless element.run?(@blacklist, @whitelist)
        next unless element.internal?(addrroot)
        next if @results.parent?(element.absolute)
        next if @scheme_squash && @results.parent?(alternate_scheme_url(element))
        @results.add_parent element.absolute
        result = Result.new 'idk', false
        begin
          page = agent.get element.absolute
          result.code = page.code
          # A redirect that lands outside the roots is dropped without reporting.
          next if @redirect && !landed_internal?(page, element, addrroot)
          queue.concat(links_on(page, element.absolute)) if spiderable?(page)
        rescue Mechanize::ResponseCodeError => response_error
          result.code = response_error.response_code
        rescue => e
          result.error = e
        end
        next if quiet
        progress.increment
        next unless result.error
        progress.log code_line(result, element.absolute)
        progress.log error_line(result)
      end
    end

    ##
    # Asserts existence of CSS selectors.
    #
    # For every internal page reached from the roots, stores a hash of
    # selector => true/false in <tt>@results</tt> and prints it unless +quiet+
    # is set. Raises ProfileError when there are no roots.
    def assert(selectors, quiet: false)
      queue, addrroot = crawl_roots
      agent = create_agent
      until queue.empty?
        element = queue.shift
        internal = element.internal?(addrroot)
        next unless element.run?(@blacklist, @whitelist) && internal && !@results.child?(element.absolute)
        next if @scheme_squash && @results.child?(alternate_scheme_url(element))
        @results.add_child element.absolute
        existence = nil
        result = Result.new 'idk', false
        begin
          page = agent.get element.absolute
          result.code = page.code
          # A redirect that lands outside the roots is dropped without reporting.
          next if @redirect && !landed_internal?(page, element, addrroot)
          if spiderable?(page)
            existence = selectors.each_with_object({}) do |selector, found|
              found[selector] = !page.css(selector).empty?
            end
            @results.set_child_value element.absolute, existence
            queue.concat(links_on(page, element.absolute))
          end
        rescue Mechanize::ResponseCodeError => response_error
          result.code = response_error.response_code
        rescue => e
          result.error = e
        end
        next if quiet
        if result.error
          puts code_line(result, element.absolute)
          puts error_line(result)
        elsif existence
          existence.each do |selector, exists|
            puts "[#{@name.colorize(mode: :bold)}][#{selector.colorize(mode: :bold)}][#{exists.to_s.colorize(color: (exists ? :green : :red), mode: :bold)}] #{element.absolute}"
          end
        end
      end
    end

    ##
    # Saves profile to <tt>~/.recluse/NAME.yaml</tt>.
    def save
      uconf = UserConfig.new '.recluse'
      options = uconf["#{@name}.yaml"]
      {
        'name' => @name,
        'roots' => @roots,
        'email' => @email,
        'blacklist' => @blacklist,
        'whitelist' => @whitelist,
        'internal_only' => @internal_only,
        'scheme_squash' => @scheme_squash,
        'redirect' => @redirect
      }.each { |key, value| options[key] = value }
      options.save
    end

    ##
    # Test if profiles share the same configuration options.
    # The accumulated <tt>@results</tt> are not part of the comparison.
    def ==(other)
      return false unless other.class == self.class
      instance_variables.all? do |ivar|
        ivar == :@results || instance_variable_get(ivar) == other.instance_variable_get(ivar)
      end
    end

    ##
    # Loads profile by name.
    #
    # Raises ProfileError when no saved profile with that name exists. Options
    # that are missing or +nil+ in the saved file fall back to the defaults of
    # Profile.new.
    def self.load(profile)
      uconf = UserConfig.new '.recluse'
      fname = "#{profile}.yaml"
      raise ProfileError, "Profile '#{profile}' doesn't exist" unless uconf.exist?(fname)
      options = uconf[fname]
      present = ->(key) { options.key?(key) && !options[key].nil? }
      opts = {}
      [:blacklist, :whitelist, :internal_only, :scheme_squash, :redirect].each do |option|
        key = option.to_s
        opts[option] = options[key] if present.call(key)
      end
      Profile.new(
        profile,
        (present.call('roots') ? options['roots'] : []),
        (present.call('email') ? options['email'] : ''),
        **opts
      )
    end

    private

    ##
    # Builds the initial crawl state: the queue of root links and the parsed
    # root addresses. Raises ProfileError when there are no roots.
    def crawl_roots
      raise ProfileError, 'No roots to start from' if @roots.empty?
      [
        @roots.map { |url| Link.new(url, :root) },
        @roots.map { |url| Addressable::URI.parse(url) }
      ]
    end

    ##
    # URL of +link+ with the scheme flipped (http becomes https, anything else
    # becomes http). Works on a copy so the link's own address is never changed.
    def alternate_scheme_url(link)
      alt = link.address.dup
      alt.scheme = alt.scheme == 'http' ? 'https' : 'http'
      alt.to_s
    end

    ##
    # Whether the URL the agent actually landed on is internal to the roots.
    def landed_internal?(page, element, addrroot)
      Link.new(page.uri.to_s, element.parent).internal?(addrroot)
    end

    ##
    # Whether links should be collected from +page+. The check is on the exact
    # class, not +is_a?+.
    # NOTE(review): presumably because Mechanize::Page inherits from
    # Mechanize::File, so an +is_a?+ check would exclude HTML pages too.
    def spiderable?(page)
      page.class != Mechanize::File && page.class != Mechanize::Image
    end

    ##
    # Wraps every link on +page+ in a Link whose parent is +referrer+.
    def links_on(page, referrer)
      page.links.map { |link| Link.new(link.uri.to_s, referrer) }
    end

    ##
    # Output line showing the profile name, the colored status code and the URL.
    def code_line(result, url)
      "[#{@name.colorize(mode: :bold)}][#{result.code.colorize(color: result.color, mode: :bold)}] #{url}"
    end

    ##
    # Output line describing the error stored in +result+.
    def error_line(result)
      "\a^ #{'Error'.colorize(mode: :bold, color: :red)}: #{result.error}"
    end
  end
end
|
@@ -0,0 +1,42 @@
|
|
1
|
+
module Recluse
  ##
  # Very simple result container.
  class Result
    ##
    # HTTP status code.
    attr_accessor :code

    ##
    # Access error message.
    attr_accessor :error

    ##
    # Create a result.
    #
    # +code+ is the status code and +error+ the failure that occurred
    # (callers in this gem pass +false+ when there is none).
    def initialize(code, error)
      @code = code
      @error = error
    end

    ##
    # Returns the HTTP status code as a string, so +inspect+ always yields a
    # String even when the code was stored as a number.
    def inspect
      @code.to_s
    end

    ##
    # Color based on code: green for 2xx, yellow for 3xx, red for 4xx and 5xx,
    # blue for anything else (including codes that are not numeric).
    def color
      # Integer division keeps only the hundreds digit; non-numeric codes become 0.
      case @code.to_i / 100
      when 2 then :green
      when 3 then :yellow
      when 4, 5 then :red
      else :blue
      end
    end
  end
end
|
@@ -0,0 +1,91 @@
|
|
1
|
+
module Recluse
  ##
  # Error to throw if there's something non-standard with the status code.
  class StatusCodeError < RuntimeError
  end

  ##
  # An HTTP status code.
  class StatusCode
    ##
    # The status code. Either a number, a string with x's to represent wildcards, or 'idk'.
    attr_reader :code

    ##
    # Whether or not this is an exact numerical code.
    attr_reader :exact

    ##
    # Create a status code.
    #
    # Accepts an Integer, a String (three digits, digits mixed with +x+
    # wildcards, or 'idk', in any letter case) or another StatusCode.
    # Raises StatusCodeError for anything that fails StatusCode.valid_code?.
    def initialize(code)
      raise StatusCodeError, "Invalid status code: #{code}" unless StatusCode.valid_code?(code)
      case code
      when StatusCode
        @code = code.code
        @exact = code.exact
      when Integer
        @code = code
        @exact = true
      when String
        if code =~ /\A\d{3}\z/ # whole number
          @code = code.to_i
          @exact = true
        else # wildcards or idk
          @code = code.downcase
          # 'idk' is compared literally; only wildcard patterns are inexact.
          @exact = @code == 'idk'
        end
      end
    end

    ##
    # Output the status code to a string.
    def to_s
      @code.to_s
    end

    ##
    # Whether or not this is an exact numerical code.
    def exact?
      @exact
    end

    ##
    # Is this code equal to another?
    #
    # +other+ may be anything StatusCode.new accepts. Two exact codes are
    # compared directly; otherwise the codes are compared digit by digit with
    # x's acting as wildcards. Note that this replaces the identity check
    # normally provided by <tt>Object#equal?</tt>.
    def equal?(other)
      comparable = StatusCode.new other
      return @code == comparable.code if exact? && comparable.exact?
      mine = to_s
      theirs = comparable.to_s
      (0...3).all? { |i| StatusCode.equal_digit?(mine[i], theirs[i]) }
    end

    ##
    # Is the passed code valid?
    #
    # Strings must consist, in their entirety, of three digits/x's or 'idk';
    # unless the first character is a wildcard it must be between 1 and 5.
    # Integers must be in the range 100..599.
    def self.valid_code?(code)
      case code
      when String
        normalized = code.downcase
        # \A and \z anchor the whole string; ^ and $ would accept any input
        # that merely contains one valid line.
        return false unless normalized =~ /\A([\dx]{3}|idk)\z/
        return true if normalized == 'idk' || normalized[0] == 'x'
        normalized[0].to_i.between?(1, 5)
      when Integer
        code.between?(100, 599)
      when StatusCode
        true
      else
        false
      end
    end

    class << self
      ##
      # Digital comparison. x's are wildcards.
      def equal_digit?(a, b)
        a == b || a == 'x' || b == 'x'
      end
    end
  end
end
|
data/recluse.gemspec
ADDED
@@ -0,0 +1,34 @@
|
|
1
|
+
# coding: utf-8
# Gem specification for recluse. The version and homepage are read from
# lib/recluse/info, so lib/ is put on the load path first.
lib_dir = File.expand_path('lib', __dir__)
$LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)
require 'recluse/info'

Gem::Specification.new do |spec|
  spec.name          = 'recluse'
  spec.version       = Recluse::VERSION
  spec.authors       = ['James Anthony Bruno']
  spec.email         = ['j.bruno.che@gmail.com']

  spec.summary       = 'Friendly, neighborhood web crawler for quality assurance.'
  spec.homepage      = Recluse::URL
  spec.license       = 'MIT'

  # Package everything tracked by git except the test directories.
  tracked = `git ls-files -z`.split("\x0")
  spec.files         = tracked.reject { |path| path =~ %r{^(test|spec|features)/} }
  spec.bindir        = 'exe'
  spec.executables   = spec.files.grep(%r{^exe/}) { |path| File.basename(path) }
  spec.require_paths = ['lib']

  # Runtime dependencies, declared in this order.
  [
    ['thor', '~> 0.19.1'],
    ['mechanize', '~> 2.7.5'],
    ['nokogiri', '~> 1.7.0.1'],
    ['addressable', '~> 2.4.0'],
    ['colorize', '~> 0.7.7'],
    ['user_config', '~> 0.0.4'],
    ['ruby-progressbar', '~> 1.8.1']
  ].each { |gem_name, requirement| spec.add_runtime_dependency gem_name, requirement }

  # Development-only dependencies.
  [
    ['bundler', '~> 1.12'],
    ['rake', '~> 10.0'],
    ['rubocop', '~> 0.47.1'],
    ['minitest', '~> 5.10.1'],
    ['minitest-reporters', '~> 1.1.14']
  ].each { |gem_name, requirement| spec.add_development_dependency gem_name, requirement }
end
|