recluse 1.0.2 → 2.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +37 -0
- data/lib/recluse/cli/main.rb +13 -12
- data/lib/recluse/info.rb +1 -1
- data/lib/recluse/link.rb +6 -0
- data/lib/recluse/profile.rb +32 -154
- data/lib/recluse/queue.rb +74 -0
- data/lib/recluse/response.rb +39 -0
- data/lib/recluse/result.rb +1 -1
- data/lib/recluse/tasks/assert.rb +59 -0
- data/lib/recluse/tasks/find.rb +55 -0
- data/lib/recluse/tasks/list.rb +42 -0
- data/lib/recluse/tasks/status.rb +55 -0
- data/lib/recluse/tasks/task.rb +46 -0
- metadata +9 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 4bc973f210fae9eda00a82ccc421c957098182b3
|
4
|
+
data.tar.gz: 27b8c662b5e24f3b48e8d29277ef82d027d128a7
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 528286fe546e0862abd045b13abef5722a102aa3e9a660d93b08206683cf01a8a61593de6bfd40e93bbeff765a82890ebf593a8b70d97125d683ce7bead4c38f
|
7
|
+
data.tar.gz: cb4fcf8872a03d57635756616813603556a9099498f8376dc1dfc9e8e7363fbe71017af2c95b61801f6b15691746c1d4fc909c2de9537e8021be113c5e3873e4
|
data/README.md
CHANGED
@@ -228,6 +228,43 @@ List the YAML info of the profile.
|
|
228
228
|
|
229
229
|
Bug reports and pull requests are welcome on GitHub.
|
230
230
|
|
231
|
+
## Extending
|
232
|
+
|
233
|
+
Recluse is modular so you can add tasks if you want. Below is an example of adding your own task to Recluse.
|
234
|
+
|
235
|
+
```ruby
|
236
|
+
require 'recluse'
|
237
|
+
|
238
|
+
module MyModule
|
239
|
+
##
|
240
|
+
# Create a task object
|
241
|
+
class MyTask < Recluse::Tasks::Task
|
242
|
+
##
|
243
|
+
# First argument must be the profile. The rest are hash arguments specific for the task.
|
244
|
+
def initialize(profile, option1: false, option2: true, results: nil)
|
245
|
+
# Sets up the queue from the profile and the queue-specific options; can also prepopulate results.
|
246
|
+
super(profile, queue_options: { redirect: profile.redirect }, results: results)
|
247
|
+
@queue.run_if do |link|
|
248
|
+
# Run a link if this function returns true.
|
249
|
+
# Link is a Recluse::Link object.
|
250
|
+
end
|
251
|
+
@queue.on_complete do |link, response|
|
252
|
+
# Run this function after the page has either successfully been retrieved, or failed to be retrieved.
|
253
|
+
# Link is a Recluse::Link object.
|
254
|
+
# Response is a Recluse::Response object.
|
255
|
+
end
|
256
|
+
end
|
257
|
+
end
|
258
|
+
end
|
259
|
+
|
260
|
+
# Add your task to the task list under the key 'my_task'.
|
261
|
+
Recluse::Tasks.add_task(:my_task, MyModule::MyTask)
|
262
|
+
|
263
|
+
# You can now access 'my_task' like you would the default Recluse tasks.
|
264
|
+
my_profile = Recluse::Profile.load('my_profile')
|
265
|
+
my_profile.test(:my_task, option1: true, option2: true)
|
266
|
+
results = my_profile.results[:my_task]
|
267
|
+
```
|
231
268
|
|
232
269
|
## License
|
233
270
|
|
data/lib/recluse/cli/main.rb
CHANGED
@@ -24,7 +24,7 @@ module Recluse
|
|
24
24
|
parent_count = 0
|
25
25
|
case group_by
|
26
26
|
when :page
|
27
|
-
report = profile.results.parents
|
27
|
+
report = profile.results[:find].parents
|
28
28
|
CSV.open(csv_path, 'w+') do |csv|
|
29
29
|
csv << ['Page', 'Matching URLs']
|
30
30
|
report.each do |parent, children|
|
@@ -35,7 +35,7 @@ module Recluse
|
|
35
35
|
end
|
36
36
|
end
|
37
37
|
when :none
|
38
|
-
report = profile.results.parents
|
38
|
+
report = profile.results[:find].parents
|
39
39
|
CSV.open(csv_path, 'w+') do |csv|
|
40
40
|
csv << ['Matching URL', 'Page']
|
41
41
|
report.each do |parent, children|
|
@@ -47,7 +47,7 @@ module Recluse
|
|
47
47
|
end
|
48
48
|
end
|
49
49
|
when :url
|
50
|
-
report = profile.results.children
|
50
|
+
report = profile.results[:find].children
|
51
51
|
CSV.open(csv_path, 'w+') do |csv|
|
52
52
|
csv << ['Matching URL', 'Pages']
|
53
53
|
parents = Set.new
|
@@ -61,7 +61,7 @@ module Recluse
|
|
61
61
|
parent_count = parents.length
|
62
62
|
end
|
63
63
|
end
|
64
|
-
total = profile.results.parents.keys.length
|
64
|
+
total = profile.results[:find].parents.keys.length
|
65
65
|
puts "Total pages:\t#{total}"
|
66
66
|
puts "Matched URLs:\t#{child_count}"
|
67
67
|
puts "Pages with matches:\t#{parent_count}\t#{perc parent_count, total}%"
|
@@ -87,7 +87,7 @@ module Recluse
|
|
87
87
|
valid_status = proc do |code|
|
88
88
|
(includes.any? { |include_code| include_code.equal?(code) }) && (excludes.none? { |exclude_code| exclude_code.equal?(code) })
|
89
89
|
end
|
90
|
-
report = profile.results.children
|
90
|
+
report = profile.results[:status].children
|
91
91
|
CSV.open(csv_path, 'w+') do |csv|
|
92
92
|
csv << ['Status code', 'URL', page_label, 'With error']
|
93
93
|
report.each do |child, info|
|
@@ -118,7 +118,7 @@ module Recluse
|
|
118
118
|
|
119
119
|
def assert_save(profile, csv_path, report_vals)
|
120
120
|
puts 'Saving report...'
|
121
|
-
report = profile.results.children
|
121
|
+
report = profile.results[:assert].children
|
122
122
|
counts = {}
|
123
123
|
CSV.open(csv_path, 'w+') do |csv|
|
124
124
|
csv << ['Selector', 'Exists', 'On page']
|
@@ -182,8 +182,8 @@ module Recluse
|
|
182
182
|
Signal.trap sig, &ending
|
183
183
|
end
|
184
184
|
(0...profile_queue.length).each do |i|
|
185
|
-
profile.results = profile_queue[i - 1].results unless i.zero?
|
186
|
-
profile.status
|
185
|
+
profile.results[:status] = profile_queue[i - 1].results[:status] unless i.zero?
|
186
|
+
profile.test :status
|
187
187
|
profile = profile_queue[i + 1] if i + 1 < profile_queue.length
|
188
188
|
end
|
189
189
|
%w(INT TERM).each do |sig|
|
@@ -219,8 +219,8 @@ module Recluse
|
|
219
219
|
Signal.trap sig, &ending
|
220
220
|
end
|
221
221
|
(0...profile_queue.length).each do |i|
|
222
|
-
profile.results = profile_queue[i - 1].results unless i.zero?
|
223
|
-
profile.find options['globs']
|
222
|
+
profile.results[:find] = profile_queue[i - 1].results[:find] unless i.zero?
|
223
|
+
profile.test(:find, globs: options['globs'])
|
224
224
|
profile = profile_queue[i + 1] if i + 1 < profile_queue.length
|
225
225
|
end
|
226
226
|
%w(INT TERM).each do |sig|
|
@@ -265,9 +265,10 @@ module Recluse
|
|
265
265
|
%w(INT TERM).each do |sig|
|
266
266
|
Signal.trap sig, &ending
|
267
267
|
end
|
268
|
+
|
268
269
|
(0...profile_queue.length).each do |i|
|
269
|
-
profile.results = profile_queue[i - 1].results unless i.zero?
|
270
|
-
profile.assert options['exists']
|
270
|
+
profile.results[:assert] = profile_queue[i - 1].results[:assert] unless i.zero?
|
271
|
+
profile.test(:assert, selectors: options['exists'])
|
271
272
|
profile = profile_queue[i + 1] if i + 1 < profile_queue.length
|
272
273
|
end
|
273
274
|
%w(INT TERM).each do |sig|
|
data/lib/recluse/info.rb
CHANGED
data/lib/recluse/link.rb
CHANGED
data/lib/recluse/profile.rb
CHANGED
@@ -2,6 +2,7 @@ require 'recluse/hashtree'
|
|
2
2
|
require 'recluse/link'
|
3
3
|
require 'recluse/result'
|
4
4
|
require 'recluse/info'
|
5
|
+
require 'recluse/tasks/list'
|
5
6
|
require 'addressable/uri'
|
6
7
|
require 'mechanize'
|
7
8
|
require 'colorize'
|
@@ -45,14 +46,18 @@ module Recluse
|
|
45
46
|
# HTTP and HTTPS schemed URLs are treated as equal. Optional. Defaults to +false+.
|
46
47
|
attr_accessor :scheme_squash
|
47
48
|
|
48
|
-
##
|
49
|
-
# +HashTree+ representation of results.
|
50
|
-
attr_accessor :results
|
51
|
-
|
52
49
|
##
|
53
50
|
# When enabled, will follow redirects and report only the status code for the page that is landed upon. When disabled, will report the redirect status code. Defaults to +false+.
|
54
51
|
attr_accessor :redirect
|
55
52
|
|
53
|
+
##
|
54
|
+
# Hash of the tasks that have been run, keyed by task key.
|
55
|
+
attr_accessor :tasks
|
56
|
+
|
57
|
+
##
|
58
|
+
# Hash of resulting +HashTree+s.
|
59
|
+
attr_accessor :results
|
60
|
+
|
56
61
|
##
|
57
62
|
# Create a profile.
|
58
63
|
def initialize(
|
@@ -68,17 +73,20 @@ module Recluse
|
|
68
73
|
raise ProfileError, 'Profile needs roots for starting point' if roots.empty?
|
69
74
|
@name = name
|
70
75
|
@email = email
|
71
|
-
@roots = roots
|
76
|
+
@roots = roots.map do |root|
|
77
|
+
if root.class == Link
|
78
|
+
root
|
79
|
+
else
|
80
|
+
Link.new(root, :root)
|
81
|
+
end
|
82
|
+
end
|
72
83
|
@blacklist = blacklist
|
73
84
|
@whitelist = whitelist
|
74
85
|
@internal_only = internal_only
|
75
86
|
@scheme_squash = scheme_squash
|
76
87
|
@redirect = redirect
|
77
|
-
@
|
78
|
-
|
79
|
-
# Detect if URL exists already, but just has a slash at end
|
80
|
-
(url1 == url2 || (url1.length == (url2.length + 1) && url1[-1] == '/' && url2[-1] != '/' && url1[0...-1] == url2))
|
81
|
-
end
|
88
|
+
@tasks = {}
|
89
|
+
@results = {}
|
82
90
|
end
|
83
91
|
|
84
92
|
##
|
@@ -96,150 +104,18 @@ module Recluse
|
|
96
104
|
end
|
97
105
|
|
98
106
|
##
|
99
|
-
#
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
while queue.length >= 1
|
107
|
-
element = queue.shift
|
108
|
-
next unless element.run?(@blacklist, @whitelist)
|
109
|
-
internal = element.internal?(addrroot)
|
110
|
-
next if @internal_only && !internal
|
111
|
-
if @results.child?(element.absolute)
|
112
|
-
@results.add element.absolute, element.parent
|
113
|
-
next
|
114
|
-
end
|
115
|
-
@results.add element.absolute, element.parent
|
116
|
-
if @scheme_squash
|
117
|
-
alt = element.address
|
118
|
-
alt.scheme = alt.scheme == 'http' ? 'https' : 'http'
|
119
|
-
if @results.child?(alt.to_s)
|
120
|
-
@results.set_child_value element.absolute, @results.get_child_value(alt.to_s)
|
121
|
-
next
|
122
|
-
end
|
123
|
-
end
|
124
|
-
result = Result.new 'idk', false
|
125
|
-
begin
|
126
|
-
page = agent.get element.absolute
|
127
|
-
result.code = page.code
|
128
|
-
if @redirect
|
129
|
-
result_link = Link.new(page.uri.to_s, element.parent)
|
130
|
-
internal = result_link.internal?(addrroot)
|
131
|
-
end
|
132
|
-
queue += page.links.map { |link| Link.new(link.uri.to_s, element.absolute) } if internal && (page.class != Mechanize::File) && (page.class != Mechanize::Image)
|
133
|
-
rescue Mechanize::ResponseCodeError => code
|
134
|
-
result.code = code.response_code
|
135
|
-
rescue => e
|
136
|
-
result.error = e
|
137
|
-
end
|
138
|
-
@results.set_child_value element.absolute, result
|
139
|
-
unless quiet
|
140
|
-
puts "[#{@name.colorize(mode: :bold)}][#{result.code.colorize(color: result.color, mode: :bold)}][#{(internal ? 'internal' : 'external').colorize(mode: :bold)}] #{element.absolute}"
|
141
|
-
puts "\a^ #{'Error'.colorize(mode: :bold, color: :red)}: #{result.error}" unless result.error == false
|
142
|
-
end
|
143
|
-
end
|
144
|
-
end
|
145
|
-
|
146
|
-
##
|
147
|
-
# Find links matching glob patterns, starting from the roots. Overrides (but does not overwrite) +internal_only+ behavior to +true+.
|
148
|
-
def find(glob, quiet: false)
|
149
|
-
queue = @roots.map { |url| Link.new(url, :root) }
|
150
|
-
addrroot = @roots.map { |url| Addressable::URI.parse url }
|
151
|
-
raise ProfileError, 'No roots to start from' if queue.empty?
|
152
|
-
progress = ProgressBar.create(total: nil, format: '|%B|') unless quiet
|
153
|
-
agent = create_agent
|
154
|
-
while queue.length >= 1
|
155
|
-
element = queue.shift
|
156
|
-
match = element.match? glob
|
157
|
-
if match
|
158
|
-
@results.add element.absolute, element.parent
|
159
|
-
progress.log "[#{@name.colorize(mode: :bold)}][#{'found'.colorize(color: :green, mode: :bold)}] #{element.parent} => #{element.absolute}" unless quiet
|
160
|
-
end
|
161
|
-
next unless element.run?(@blacklist, @whitelist)
|
162
|
-
internal = element.internal?(addrroot)
|
163
|
-
next unless internal
|
164
|
-
next if @results.parent?(element.absolute)
|
165
|
-
if @scheme_squash
|
166
|
-
alt = element.address
|
167
|
-
alt.scheme = alt.scheme == 'http' ? 'https' : 'http'
|
168
|
-
next if @results.parent?(alt.to_s)
|
169
|
-
end
|
170
|
-
@results.add_parent element.absolute
|
171
|
-
result = Result.new 'idk', false
|
172
|
-
begin
|
173
|
-
page = agent.get element.absolute
|
174
|
-
result.code = page.code
|
175
|
-
if @redirect
|
176
|
-
result_link = Link.new(page.uri.to_s, element.parent)
|
177
|
-
next unless result_link.internal?(addrroot)
|
178
|
-
end
|
179
|
-
queue += page.links.map { |link| Link.new(link.uri.to_s, element.absolute) } unless (page.class == Mechanize::File) || (page.class == Mechanize::Image)
|
180
|
-
rescue Mechanize::ResponseCodeError => code
|
181
|
-
result.code = code.response_code
|
182
|
-
rescue => e
|
183
|
-
result.error = e
|
184
|
-
end
|
185
|
-
progress.increment unless quiet
|
186
|
-
unless quiet || (result.error == false)
|
187
|
-
progress.log "[#{@name.colorize(mode: :bold)}][#{result.code.colorize(color: result.color, mode: :bold)}] #{element.absolute}"
|
188
|
-
progress.log "\a^ #{'Error'.colorize(mode: :bold, color: :red)}: #{result.error}"
|
189
|
-
end
|
190
|
-
end
|
191
|
-
end
|
192
|
-
|
193
|
-
##
|
194
|
-
# Asserts existence of CSS selectors.
|
195
|
-
def assert(selectors, quiet: false)
|
196
|
-
queue = @roots.map { |url| Link.new(url, :root) }
|
197
|
-
addrroot = @roots.map { |url| Addressable::URI.parse url }
|
198
|
-
raise ProfileError, 'No roots to start from' if queue.empty?
|
199
|
-
agent = create_agent
|
200
|
-
while queue.length >= 1
|
201
|
-
element = queue.shift
|
202
|
-
internal = element.internal?(addrroot)
|
203
|
-
next unless element.run?(@blacklist, @whitelist) && internal && !@results.child?(element.absolute)
|
204
|
-
if @scheme_squash
|
205
|
-
alt = element.address
|
206
|
-
alt.scheme = alt.scheme == 'http' ? 'https' : 'http'
|
207
|
-
next if @results.child?(alt.to_s)
|
208
|
-
end
|
209
|
-
@results.add_child element.absolute
|
210
|
-
existence = nil
|
211
|
-
result = Result.new 'idk', false
|
212
|
-
begin
|
213
|
-
page = agent.get element.absolute
|
214
|
-
result.code = page.code
|
215
|
-
if @redirect
|
216
|
-
result_link = Link.new(page.uri.to_s, element.parent)
|
217
|
-
next unless result_link.internal?(addrroot)
|
218
|
-
end
|
219
|
-
unless (page.class == Mechanize::File) || (page.class == Mechanize::Image)
|
220
|
-
existence = {}
|
221
|
-
selectors.each do |selector|
|
222
|
-
existence[selector] = !page.css(selector).empty?
|
223
|
-
end
|
224
|
-
@results.set_child_value element.absolute, existence
|
225
|
-
queue += page.links.map { |link| Link.new(link.uri.to_s, element.absolute) }
|
226
|
-
end
|
227
|
-
rescue Mechanize::ResponseCodeError => code
|
228
|
-
result.code = code.response_code
|
229
|
-
rescue => e
|
230
|
-
result.error = e
|
231
|
-
end
|
232
|
-
unless quiet
|
233
|
-
if result.error != false
|
234
|
-
puts "[#{@name.colorize(mode: :bold)}][#{result.code.colorize(color: result.color, mode: :bold)}] #{element.absolute}"
|
235
|
-
puts "\a^ #{'Error'.colorize(mode: :bold, color: :red)}: #{result.error}"
|
236
|
-
elsif !existence.nil?
|
237
|
-
existence.each do |selector, exists|
|
238
|
-
puts "[#{@name.colorize(mode: :bold)}][#{selector.colorize(mode: :bold)}][#{exists.to_s.colorize(color: (exists ? :green : :red), mode: :bold)}] #{element.absolute}"
|
239
|
-
end
|
240
|
-
end
|
107
|
+
# Runs the task registered under +key+ with the given options and returns its results.
|
108
|
+
def test(key, options = {})
|
109
|
+
unless @results.key?(key) && @results[key].class == Recluse::HashTree
|
110
|
+
@results[key] = Recluse::HashTree.new do |url1, url2|
|
111
|
+
url1, url2 = url2, url1 if url2.length > url1.length
|
112
|
+
# Detect if URL exists already, but just has a slash at end
|
113
|
+
(url1 == url2 || (url1.length == (url2.length + 1) && url1[-1] == '/' && url2[-1] != '/' && url1[0...-1] == url2))
|
241
114
|
end
|
242
115
|
end
|
116
|
+
@tasks[key] = Recluse::Tasks.get(key).new(self, options.merge(results: @results[key]))
|
117
|
+
@tasks[key].run
|
118
|
+
@results[key]
|
243
119
|
end
|
244
120
|
|
245
121
|
##
|
@@ -249,7 +125,7 @@ module Recluse
|
|
249
125
|
fname = "#{@name}.yaml"
|
250
126
|
options = uconf[fname]
|
251
127
|
options['name'] = @name
|
252
|
-
options['roots'] = @roots
|
128
|
+
options['roots'] = @roots.map(&:to_s)
|
253
129
|
options['email'] = @email
|
254
130
|
options['blacklist'] = @blacklist
|
255
131
|
options['whitelist'] = @whitelist
|
@@ -264,7 +140,9 @@ module Recluse
|
|
264
140
|
def ==(other)
|
265
141
|
return false if other.class != self.class
|
266
142
|
instance_variables.all? do |ivar|
|
267
|
-
ivar == '@results'.to_sym
|
143
|
+
next true if ivar == '@results'.to_sym
|
144
|
+
next true if ivar == '@roots' && instance_variable_get(ivar).map(&:to_s) == other.instance_variable_get(ivar).map(&:to_s)
|
145
|
+
instance_variable_get(ivar) == other.instance_variable_get(ivar)
|
268
146
|
end
|
269
147
|
end
|
270
148
|
|
@@ -0,0 +1,74 @@
|
|
1
|
+
require 'mechanize'
|
2
|
+
require 'recluse/response'
|
3
|
+
|
4
|
+
module Recluse
|
5
|
+
##
|
6
|
+
# Queue of links to retrieve. Each link is filtered by +run_if+ and reported through +on_complete+.
|
7
|
+
class Queue
|
8
|
+
##
|
9
|
+
# Create an empty queue
|
10
|
+
def initialize(email, redirect: false)
|
11
|
+
@links = []
|
12
|
+
@run_if = proc { true }
|
13
|
+
@on_complete = proc { |link, response| }
|
14
|
+
@redirect = redirect
|
15
|
+
@email = email
|
16
|
+
@agent = Mechanize.new do |a|
|
17
|
+
a.ssl_version = 'TLSv1'
|
18
|
+
a.verify_mode = OpenSSL::SSL::VERIFY_NONE
|
19
|
+
a.max_history = nil
|
20
|
+
a.follow_meta_refresh = true
|
21
|
+
a.keep_alive = false
|
22
|
+
a.redirect_ok = @redirect
|
23
|
+
a.user_agent = "Mozilla/5.0 (compatible; recluse/#{Recluse::VERSION}; +#{Recluse::URL}) #{@email}"
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
##
|
28
|
+
# Add to queue.
|
29
|
+
def add(link)
|
30
|
+
@links += [*link]
|
31
|
+
end
|
32
|
+
|
33
|
+
##
|
34
|
+
# If the test is true, run the link. Procedure takes the link as input.
|
35
|
+
def run_if(&block)
|
36
|
+
@run_if = block
|
37
|
+
end
|
38
|
+
|
39
|
+
##
|
40
|
+
# Run when a link has been checked. Procedure takes the link and response as inputs.
|
41
|
+
def on_complete(&block)
|
42
|
+
@on_complete = block
|
43
|
+
end
|
44
|
+
|
45
|
+
##
|
46
|
+
# Run a link
|
47
|
+
def run_link(link)
|
48
|
+
response = Response.new
|
49
|
+
return nil unless @run_if.call(link)
|
50
|
+
begin
|
51
|
+
response.page = @agent.get link.absolute
|
52
|
+
response.code = response.page.code
|
53
|
+
response.success = true
|
54
|
+
rescue Mechanize::ResponseCodeError => code
|
55
|
+
response.code = code.response_code
|
56
|
+
response.success = false
|
57
|
+
rescue => error
|
58
|
+
response.errors = error
|
59
|
+
response.success = false
|
60
|
+
end
|
61
|
+
@on_complete.call link, response
|
62
|
+
response
|
63
|
+
end
|
64
|
+
|
65
|
+
##
|
66
|
+
# Run queue
|
67
|
+
def run
|
68
|
+
until @links.empty?
|
69
|
+
link = @links.shift
|
70
|
+
run_link link
|
71
|
+
end
|
72
|
+
end
|
73
|
+
end
|
74
|
+
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
require 'mechanize'
|
2
|
+
require 'recluse/statuscode'
|
3
|
+
|
4
|
+
module Recluse
|
5
|
+
##
|
6
|
+
# Response wrapper.
|
7
|
+
class Response
|
8
|
+
##
|
9
|
+
# +Mechanize::Page+ of the response page. Might be +nil+.
|
10
|
+
attr_accessor :page
|
11
|
+
|
12
|
+
##
|
13
|
+
# +StatusCode+ of the response.
|
14
|
+
attr_reader :code
|
15
|
+
|
16
|
+
##
|
17
|
+
# Error raised while retrieving the page, if any. +false+ when there was no error.
|
18
|
+
attr_accessor :errors
|
19
|
+
|
20
|
+
##
|
21
|
+
# Whether the page was successfully accessed or not.
|
22
|
+
attr_accessor :success
|
23
|
+
|
24
|
+
##
|
25
|
+
# Create new response.
|
26
|
+
def initialize(page: nil, errors: false, code: StatusCode.new('idk'), success: false)
|
27
|
+
@page = page
|
28
|
+
@code = code
|
29
|
+
@errors = errors
|
30
|
+
@success = success
|
31
|
+
end
|
32
|
+
|
33
|
+
##
|
34
|
+
# Set a new status code.
|
35
|
+
def code=(new_code)
|
36
|
+
@code = StatusCode.new new_code
|
37
|
+
end
|
38
|
+
end
|
39
|
+
end
|
data/lib/recluse/result.rb
CHANGED
@@ -0,0 +1,59 @@
|
|
1
|
+
require 'recluse/tasks/task'
|
2
|
+
require 'recluse/link'
|
3
|
+
require 'recluse/result'
|
4
|
+
require 'addressable/uri'
|
5
|
+
require 'colorize'
|
6
|
+
|
7
|
+
module Recluse
|
8
|
+
module Tasks
|
9
|
+
##
|
10
|
+
# Asserts existence of CSS selectors.
|
11
|
+
class Assert < Task
|
12
|
+
##
|
13
|
+
# Create new assertion task.
|
14
|
+
def initialize(profile, selectors: [], quiet: false, results: nil)
|
15
|
+
super(profile, queue_options: { redirect: profile.redirect }, results: results)
|
16
|
+
addr_roots = profile.roots.map { |root| Addressable::URI.parse(root.url) }
|
17
|
+
@queue.run_if do |link|
|
18
|
+
internal = link.internal?(addr_roots)
|
19
|
+
next false unless link.run?(profile.blacklist, profile.whitelist) && internal && !@results.child?(link.absolute)
|
20
|
+
if profile.scheme_squash
|
21
|
+
alt = link.address
|
22
|
+
alt.scheme = alt.scheme == 'http' ? 'https' : 'http'
|
23
|
+
next false if @results.child?(alt.to_s)
|
24
|
+
end
|
25
|
+
@results.add_child link.absolute
|
26
|
+
true
|
27
|
+
end
|
28
|
+
@queue.on_complete do |link, response|
|
29
|
+
existence = nil
|
30
|
+
result = Recluse::Result.new response.code.to_s, response.errors
|
31
|
+
if response.success
|
32
|
+
if profile.redirect
|
33
|
+
result_link = Link.new(response.page.uri.to_s, link.parent)
|
34
|
+
next unless result_link.internal?(addr_roots)
|
35
|
+
end
|
36
|
+
unless (response.page.class == Mechanize::File) || (response.page.class == Mechanize::Image)
|
37
|
+
existence = {}
|
38
|
+
selectors.each do |selector|
|
39
|
+
existence[selector] = !response.page.css(selector).empty?
|
40
|
+
end
|
41
|
+
@results.set_child_value link.absolute, existence
|
42
|
+
@queue.add(response.page.links.map { |new_link| Link.new(new_link.uri.to_s, link.absolute) })
|
43
|
+
end
|
44
|
+
end
|
45
|
+
unless quiet
|
46
|
+
if result.error != false
|
47
|
+
puts "[#{profile.name.colorize(mode: :bold)}][#{result.code.colorize(color: result.color, mode: :bold)}] #{link.absolute}"
|
48
|
+
puts "\a^ #{'Error'.colorize(mode: :bold, color: :red)}: #{result.error}"
|
49
|
+
elsif !existence.nil?
|
50
|
+
existence.each do |selector, exists|
|
51
|
+
puts "[#{profile.name.colorize(mode: :bold)}][#{selector.colorize(mode: :bold)}][#{exists.to_s.colorize(color: (exists ? :green : :red), mode: :bold)}] #{link.absolute}"
|
52
|
+
end
|
53
|
+
end
|
54
|
+
end
|
55
|
+
end
|
56
|
+
end
|
57
|
+
end
|
58
|
+
end
|
59
|
+
end
|
@@ -0,0 +1,55 @@
|
|
1
|
+
require 'recluse/tasks/task'
|
2
|
+
require 'recluse/link'
|
3
|
+
require 'recluse/result'
|
4
|
+
require 'addressable/uri'
|
5
|
+
require 'colorize'
|
6
|
+
require 'ruby-progressbar'
|
7
|
+
|
8
|
+
module Recluse
|
9
|
+
module Tasks
|
10
|
+
##
|
11
|
+
# Find links matching glob patterns, starting from the roots. Overrides (but does not overwrite) +internal_only+ behavior to +true+.
|
12
|
+
class Find < Task
|
13
|
+
##
|
14
|
+
# Create new find task.
|
15
|
+
def initialize(profile, globs: [], quiet: false, results: nil)
|
16
|
+
super(profile, queue_options: { redirect: profile.redirect }, results: results)
|
17
|
+
addr_roots = profile.roots.map { |root| Addressable::URI.parse(root.url) }
|
18
|
+
progress = ProgressBar.create(total: nil, format: '|%B|') unless quiet
|
19
|
+
@queue.run_if do |link|
|
20
|
+
match = link.match? globs
|
21
|
+
if match
|
22
|
+
@results.add link.absolute, link.parent
|
23
|
+
progress.log "[#{profile.name.colorize(mode: :bold)}][#{'found'.colorize(color: :green, mode: :bold)}] #{link.parent} => #{link.absolute}" unless quiet
|
24
|
+
end
|
25
|
+
next false unless link.run?(profile.blacklist, profile.whitelist)
|
26
|
+
internal = link.internal?(addr_roots)
|
27
|
+
next false unless internal
|
28
|
+
next false if @results.parent?(link.absolute)
|
29
|
+
if profile.scheme_squash
|
30
|
+
alt = link.address
|
31
|
+
alt.scheme = alt.scheme == 'http' ? 'https' : 'http'
|
32
|
+
next false if @results.parent?(alt.to_s)
|
33
|
+
end
|
34
|
+
@results.add_parent link.absolute
|
35
|
+
true
|
36
|
+
end
|
37
|
+
@queue.on_complete do |link, response|
|
38
|
+
result = Recluse::Result.new response.code.to_s, response.errors
|
39
|
+
if response.success
|
40
|
+
if profile.redirect
|
41
|
+
result_link = Recluse::Link.new(response.page.uri.to_s, link.parent)
|
42
|
+
next unless result_link.internal?(addr_roots)
|
43
|
+
end
|
44
|
+
@queue.add(response.page.links.map { |new_link| Link.new(new_link.uri.to_s, link.absolute) }) unless (response.page.class == Mechanize::File) || (response.page.class == Mechanize::Image)
|
45
|
+
end
|
46
|
+
progress.increment unless quiet
|
47
|
+
unless quiet || (result.error == false)
|
48
|
+
progress.log "[#{profile.name.colorize(mode: :bold)}][#{result.code.colorize(color: result.color, mode: :bold)}] #{link.absolute}"
|
49
|
+
progress.log "\a^ #{'Error'.colorize(mode: :bold, color: :red)}: #{result.error}"
|
50
|
+
end
|
51
|
+
end
|
52
|
+
end
|
53
|
+
end
|
54
|
+
end
|
55
|
+
end
|
@@ -0,0 +1,42 @@
|
|
1
|
+
require 'recluse/tasks/status'
|
2
|
+
require 'recluse/tasks/assert'
|
3
|
+
require 'recluse/tasks/find'
|
4
|
+
|
5
|
+
module Recluse
|
6
|
+
##
|
7
|
+
# Tasks are tests for Recluse.
|
8
|
+
module Tasks
|
9
|
+
##
|
10
|
+
# Hash of available tasks.
|
11
|
+
@@list = {
|
12
|
+
status: Recluse::Tasks::Status,
|
13
|
+
assert: Recluse::Tasks::Assert,
|
14
|
+
find: Recluse::Tasks::Find
|
15
|
+
}
|
16
|
+
class << self
|
17
|
+
##
|
18
|
+
# Add task to the list.
|
19
|
+
def add_task(key, task_class)
|
20
|
+
list[key] = task_class
|
21
|
+
end
|
22
|
+
|
23
|
+
##
|
24
|
+
# Hash of available tasks.
|
25
|
+
def list
|
26
|
+
@@list
|
27
|
+
end
|
28
|
+
|
29
|
+
##
|
30
|
+
# Get task by key name.
|
31
|
+
def get(key)
|
32
|
+
@@list[key]
|
33
|
+
end
|
34
|
+
|
35
|
+
##
|
36
|
+
# Do something for each task.
|
37
|
+
def each(&block)
|
38
|
+
@@list.each(&block)
|
39
|
+
end
|
40
|
+
end
|
41
|
+
end
|
42
|
+
end
|
@@ -0,0 +1,55 @@
|
|
1
|
+
require 'recluse/tasks/task'
|
2
|
+
require 'recluse/link'
|
3
|
+
require 'recluse/result'
|
4
|
+
require 'addressable/uri'
|
5
|
+
require 'colorize'
|
6
|
+
|
7
|
+
module Recluse
|
8
|
+
module Tasks
|
9
|
+
##
|
10
|
+
# Starting from the roots, goes through each runnable link and records the referrer, the status code, and any errors.
|
11
|
+
class Status < Task
|
12
|
+
##
|
13
|
+
# Create new status task.
|
14
|
+
def initialize(profile, quiet: false, results: nil)
|
15
|
+
super(profile, queue_options: { redirect: profile.redirect }, results: results)
|
16
|
+
addr_roots = profile.roots.map { |root| Addressable::URI.parse(root.url) }
|
17
|
+
@queue.run_if do |link|
|
18
|
+
next false unless link.run?(profile.blacklist, profile.whitelist)
|
19
|
+
internal = link.internal?(addr_roots)
|
20
|
+
next false if profile.internal_only && !internal
|
21
|
+
if @results.child?(link.absolute)
|
22
|
+
@results.add link.absolute, link.parent
|
23
|
+
next false
|
24
|
+
end
|
25
|
+
@results.add link.absolute, link.parent
|
26
|
+
if profile.scheme_squash
|
27
|
+
alt = link.address
|
28
|
+
alt.scheme = alt.scheme == 'http' ? 'https' : 'http'
|
29
|
+
if @results.child?(alt.to_s)
|
30
|
+
@results.set_child_value link.absolute, @results.get_child_value(alt.to_s)
|
31
|
+
next false
|
32
|
+
end
|
33
|
+
end
|
34
|
+
true
|
35
|
+
end
|
36
|
+
@queue.on_complete do |link, response|
|
37
|
+
result = Recluse::Result.new response.code.to_s, response.errors
|
38
|
+
if response.success
|
39
|
+
internal = link.internal? addr_roots
|
40
|
+
if profile.redirect
|
41
|
+
result_link = Recluse::Link.new response.page.uri.to_s, link.parent
|
42
|
+
internal = result_link.internal? addr_roots
|
43
|
+
end
|
44
|
+
queue.add(response.page.links.map { |new_link| Link.new(new_link.uri.to_s, link.absolute) }) if internal && (response.page.class != Mechanize::File) && (response.page.class != Mechanize::Image)
|
45
|
+
end
|
46
|
+
@results.set_child_value link.absolute, result
|
47
|
+
unless quiet
|
48
|
+
puts "[#{profile.name.colorize(mode: :bold)}][#{result.code.colorize(color: result.color, mode: :bold)}][#{(internal ? 'internal' : 'external').colorize(mode: :bold)}] #{link.absolute}"
|
49
|
+
puts "\a^ #{'Error'.colorize(mode: :bold, color: :red)}: #{result.error}" unless result.error == false
|
50
|
+
end
|
51
|
+
end
|
52
|
+
end
|
53
|
+
end
|
54
|
+
end
|
55
|
+
end
|
@@ -0,0 +1,46 @@
|
|
1
|
+
require 'recluse/queue'
|
2
|
+
require 'recluse/hashtree'
|
3
|
+
|
4
|
+
module Recluse
|
5
|
+
module Tasks
|
6
|
+
##
|
7
|
+
# Task interface. Runs the queue with customized behavior.
|
8
|
+
class Task
|
9
|
+
##
|
10
|
+
# +HashTree+ representation of results.
|
11
|
+
attr_reader :results
|
12
|
+
|
13
|
+
##
|
14
|
+
# +Queue+ of links to check.
|
15
|
+
attr_accessor :queue
|
16
|
+
|
17
|
+
##
|
18
|
+
# Create new task.
|
19
|
+
def initialize(profile, queue_options: {}, results: nil)
|
20
|
+
@queue = Recluse::Queue.new(profile.email, queue_options)
|
21
|
+
if results.nil?
|
22
|
+
@results = Recluse::HashTree.new do |url1, url2|
|
23
|
+
url1, url2 = url2, url1 if url2.length > url1.length
|
24
|
+
# Detect if URL exists already, but just has a slash at end
|
25
|
+
(url1 == url2 || (url1.length == (url2.length + 1) && url1[-1] == '/' && url2[-1] != '/' && url1[0...-1] == url2))
|
26
|
+
end
|
27
|
+
else
|
28
|
+
@results = results
|
29
|
+
end
|
30
|
+
@queue.add profile.roots
|
31
|
+
end
|
32
|
+
|
33
|
+
##
|
34
|
+
# Add link (or links) to the queue.
|
35
|
+
def add(link)
|
36
|
+
@queue.add link
|
37
|
+
end
|
38
|
+
|
39
|
+
##
|
40
|
+
# Run the queue.
|
41
|
+
def run
|
42
|
+
@queue.run
|
43
|
+
end
|
44
|
+
end
|
45
|
+
end
|
46
|
+
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: recluse
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 2.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- James Anthony Bruno
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-03-
|
11
|
+
date: 2017-03-28 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: thor
|
@@ -203,8 +203,15 @@ files:
|
|
203
203
|
- lib/recluse/info.rb
|
204
204
|
- lib/recluse/link.rb
|
205
205
|
- lib/recluse/profile.rb
|
206
|
+
- lib/recluse/queue.rb
|
207
|
+
- lib/recluse/response.rb
|
206
208
|
- lib/recluse/result.rb
|
207
209
|
- lib/recluse/statuscode.rb
|
210
|
+
- lib/recluse/tasks/assert.rb
|
211
|
+
- lib/recluse/tasks/find.rb
|
212
|
+
- lib/recluse/tasks/list.rb
|
213
|
+
- lib/recluse/tasks/status.rb
|
214
|
+
- lib/recluse/tasks/task.rb
|
208
215
|
- recluse.gemspec
|
209
216
|
homepage: https://github.com/czycha/recluse
|
210
217
|
licenses:
|