recluse 1.0.2 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +37 -0
- data/lib/recluse/cli/main.rb +13 -12
- data/lib/recluse/info.rb +1 -1
- data/lib/recluse/link.rb +6 -0
- data/lib/recluse/profile.rb +32 -154
- data/lib/recluse/queue.rb +74 -0
- data/lib/recluse/response.rb +39 -0
- data/lib/recluse/result.rb +1 -1
- data/lib/recluse/tasks/assert.rb +59 -0
- data/lib/recluse/tasks/find.rb +55 -0
- data/lib/recluse/tasks/list.rb +42 -0
- data/lib/recluse/tasks/status.rb +55 -0
- data/lib/recluse/tasks/task.rb +46 -0
- metadata +9 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 4bc973f210fae9eda00a82ccc421c957098182b3
|
4
|
+
data.tar.gz: 27b8c662b5e24f3b48e8d29277ef82d027d128a7
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 528286fe546e0862abd045b13abef5722a102aa3e9a660d93b08206683cf01a8a61593de6bfd40e93bbeff765a82890ebf593a8b70d97125d683ce7bead4c38f
|
7
|
+
data.tar.gz: cb4fcf8872a03d57635756616813603556a9099498f8376dc1dfc9e8e7363fbe71017af2c95b61801f6b15691746c1d4fc909c2de9537e8021be113c5e3873e4
|
data/README.md
CHANGED
@@ -228,6 +228,43 @@ List the YAML info of the profile.
|
|
228
228
|
|
229
229
|
Bug reports and pull requests are welcome on GitHub.
|
230
230
|
|
231
|
+
## Extending
|
232
|
+
|
233
|
+
Recluse is modular so you can add tasks if you want. Below is an example of adding your own task to Recluse.
|
234
|
+
|
235
|
+
```ruby
|
236
|
+
require 'recluse'
|
237
|
+
|
238
|
+
module MyModule
|
239
|
+
##
|
240
|
+
# Create a task object
|
241
|
+
class MyTask < Recluse::Tasks::Task
|
242
|
+
##
|
243
|
+
# First argument must be the profile. The rest are hash arguments specific for the task.
|
244
|
+
def initialize(profile, option1: false, option2: true, results: nil)
|
245
|
+
# Sets up everything based on the profile, queue-specific options, and can also prepopulate results.
|
246
|
+
super(profile, queue_options: { redirect: profile.redirect }, results: results)
|
247
|
+
@queue.run_if do |link|
|
248
|
+
# Run a link if this function returns true.
|
249
|
+
# Link is a Recluse::Link object.
|
250
|
+
end
|
251
|
+
@queue.on_complete do |link, response|
|
252
|
+
# Run this function after the page has either successfully been retrieved, or failed to be retrieved.
|
253
|
+
# Link is a Recluse::Link object.
|
254
|
+
# Response is a Recluse::Response object.
|
255
|
+
end
|
256
|
+
end
|
257
|
+
end
|
258
|
+
end
|
259
|
+
|
260
|
+
# Add your task to the task list under the key 'my_task'.
|
261
|
+
Recluse::Tasks.add_task(:my_task, MyModule::MyTask)
|
262
|
+
|
263
|
+
# You can now access 'my_task' like you would the default Recluse tasks.
|
264
|
+
my_profile = Recluse::Profile.load('my_profile')
|
265
|
+
my_profile.test(:my_task, option1: true, option2: true)
|
266
|
+
results = my_profile.results[:my_task]
|
267
|
+
```
|
231
268
|
|
232
269
|
## License
|
233
270
|
|
data/lib/recluse/cli/main.rb
CHANGED
@@ -24,7 +24,7 @@ module Recluse
|
|
24
24
|
parent_count = 0
|
25
25
|
case group_by
|
26
26
|
when :page
|
27
|
-
report = profile.results.parents
|
27
|
+
report = profile.results[:find].parents
|
28
28
|
CSV.open(csv_path, 'w+') do |csv|
|
29
29
|
csv << ['Page', 'Matching URLs']
|
30
30
|
report.each do |parent, children|
|
@@ -35,7 +35,7 @@ module Recluse
|
|
35
35
|
end
|
36
36
|
end
|
37
37
|
when :none
|
38
|
-
report = profile.results.parents
|
38
|
+
report = profile.results[:find].parents
|
39
39
|
CSV.open(csv_path, 'w+') do |csv|
|
40
40
|
csv << ['Matching URL', 'Page']
|
41
41
|
report.each do |parent, children|
|
@@ -47,7 +47,7 @@ module Recluse
|
|
47
47
|
end
|
48
48
|
end
|
49
49
|
when :url
|
50
|
-
report = profile.results.children
|
50
|
+
report = profile.results[:find].children
|
51
51
|
CSV.open(csv_path, 'w+') do |csv|
|
52
52
|
csv << ['Matching URL', 'Pages']
|
53
53
|
parents = Set.new
|
@@ -61,7 +61,7 @@ module Recluse
|
|
61
61
|
parent_count = parents.length
|
62
62
|
end
|
63
63
|
end
|
64
|
-
total = profile.results.parents.keys.length
|
64
|
+
total = profile.results[:find].parents.keys.length
|
65
65
|
puts "Total pages:\t#{total}"
|
66
66
|
puts "Matched URLs:\t#{child_count}"
|
67
67
|
puts "Pages with matches:\t#{parent_count}\t#{perc parent_count, total}%"
|
@@ -87,7 +87,7 @@ module Recluse
|
|
87
87
|
valid_status = proc do |code|
|
88
88
|
(includes.any? { |include_code| include_code.equal?(code) }) && (excludes.none? { |exclude_code| exclude_code.equal?(code) })
|
89
89
|
end
|
90
|
-
report = profile.results.children
|
90
|
+
report = profile.results[:status].children
|
91
91
|
CSV.open(csv_path, 'w+') do |csv|
|
92
92
|
csv << ['Status code', 'URL', page_label, 'With error']
|
93
93
|
report.each do |child, info|
|
@@ -118,7 +118,7 @@ module Recluse
|
|
118
118
|
|
119
119
|
def assert_save(profile, csv_path, report_vals)
|
120
120
|
puts 'Saving report...'
|
121
|
-
report = profile.results.children
|
121
|
+
report = profile.results[:assert].children
|
122
122
|
counts = {}
|
123
123
|
CSV.open(csv_path, 'w+') do |csv|
|
124
124
|
csv << ['Selector', 'Exists', 'On page']
|
@@ -182,8 +182,8 @@ module Recluse
|
|
182
182
|
Signal.trap sig, &ending
|
183
183
|
end
|
184
184
|
(0...profile_queue.length).each do |i|
|
185
|
-
profile.results = profile_queue[i - 1].results unless i.zero?
|
186
|
-
profile.status
|
185
|
+
profile.results[:status] = profile_queue[i - 1].results[:status] unless i.zero?
|
186
|
+
profile.test :status
|
187
187
|
profile = profile_queue[i + 1] if i + 1 < profile_queue.length
|
188
188
|
end
|
189
189
|
%w(INT TERM).each do |sig|
|
@@ -219,8 +219,8 @@ module Recluse
|
|
219
219
|
Signal.trap sig, &ending
|
220
220
|
end
|
221
221
|
(0...profile_queue.length).each do |i|
|
222
|
-
profile.results = profile_queue[i - 1].results unless i.zero?
|
223
|
-
profile.find options['globs']
|
222
|
+
profile.results[:find] = profile_queue[i - 1].results[:find] unless i.zero?
|
223
|
+
profile.test(:find, globs: options['globs'])
|
224
224
|
profile = profile_queue[i + 1] if i + 1 < profile_queue.length
|
225
225
|
end
|
226
226
|
%w(INT TERM).each do |sig|
|
@@ -265,9 +265,10 @@ module Recluse
|
|
265
265
|
%w(INT TERM).each do |sig|
|
266
266
|
Signal.trap sig, &ending
|
267
267
|
end
|
268
|
+
|
268
269
|
(0...profile_queue.length).each do |i|
|
269
|
-
profile.results = profile_queue[i - 1].results unless i.zero?
|
270
|
-
profile.assert options['exists']
|
270
|
+
profile.results[:assert] = profile_queue[i - 1].results[:assert] unless i.zero?
|
271
|
+
profile.test(:assert, selectors: options['exists'])
|
271
272
|
profile = profile_queue[i + 1] if i + 1 < profile_queue.length
|
272
273
|
end
|
273
274
|
%w(INT TERM).each do |sig|
|
data/lib/recluse/info.rb
CHANGED
data/lib/recluse/link.rb
CHANGED
data/lib/recluse/profile.rb
CHANGED
@@ -2,6 +2,7 @@ require 'recluse/hashtree'
|
|
2
2
|
require 'recluse/link'
|
3
3
|
require 'recluse/result'
|
4
4
|
require 'recluse/info'
|
5
|
+
require 'recluse/tasks/list'
|
5
6
|
require 'addressable/uri'
|
6
7
|
require 'mechanize'
|
7
8
|
require 'colorize'
|
@@ -45,14 +46,18 @@ module Recluse
|
|
45
46
|
# HTTP and HTTPS schemed URLs are treated as equal. Optional. Defaults to +false+.
|
46
47
|
attr_accessor :scheme_squash
|
47
48
|
|
48
|
-
##
|
49
|
-
# +HashTree+ representation of results.
|
50
|
-
attr_accessor :results
|
51
|
-
|
52
49
|
##
|
53
50
|
# When enabled, will follow redirects and report only the status code for the page that is landed upon. When disabled, will report the redirect status code. Defaults to +false+.
|
54
51
|
attr_accessor :redirect
|
55
52
|
|
53
|
+
##
|
54
|
+
# The list of run tests.
|
55
|
+
attr_accessor :tasks
|
56
|
+
|
57
|
+
##
|
58
|
+
# Hash of resulting +HashTree+s.
|
59
|
+
attr_accessor :results
|
60
|
+
|
56
61
|
##
|
57
62
|
# Create a profile.
|
58
63
|
def initialize(
|
@@ -68,17 +73,20 @@ module Recluse
|
|
68
73
|
raise ProfileError, 'Profile needs roots for starting point' if roots.empty?
|
69
74
|
@name = name
|
70
75
|
@email = email
|
71
|
-
@roots = roots
|
76
|
+
@roots = roots.map do |root|
|
77
|
+
if root.class == Link
|
78
|
+
root
|
79
|
+
else
|
80
|
+
Link.new(root, :root)
|
81
|
+
end
|
82
|
+
end
|
72
83
|
@blacklist = blacklist
|
73
84
|
@whitelist = whitelist
|
74
85
|
@internal_only = internal_only
|
75
86
|
@scheme_squash = scheme_squash
|
76
87
|
@redirect = redirect
|
77
|
-
@
|
78
|
-
|
79
|
-
# Detect if URL exists already, but just has a slash at end
|
80
|
-
(url1 == url2 || (url1.length == (url2.length + 1) && url1[-1] == '/' && url2[-1] != '/' && url1[0...-1] == url2))
|
81
|
-
end
|
88
|
+
@tasks = {}
|
89
|
+
@results = {}
|
82
90
|
end
|
83
91
|
|
84
92
|
##
|
@@ -96,150 +104,18 @@ module Recluse
|
|
96
104
|
end
|
97
105
|
|
98
106
|
##
|
99
|
-
#
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
while queue.length >= 1
|
107
|
-
element = queue.shift
|
108
|
-
next unless element.run?(@blacklist, @whitelist)
|
109
|
-
internal = element.internal?(addrroot)
|
110
|
-
next if @internal_only && !internal
|
111
|
-
if @results.child?(element.absolute)
|
112
|
-
@results.add element.absolute, element.parent
|
113
|
-
next
|
114
|
-
end
|
115
|
-
@results.add element.absolute, element.parent
|
116
|
-
if @scheme_squash
|
117
|
-
alt = element.address
|
118
|
-
alt.scheme = alt.scheme == 'http' ? 'https' : 'http'
|
119
|
-
if @results.child?(alt.to_s)
|
120
|
-
@results.set_child_value element.absolute, @results.get_child_value(alt.to_s)
|
121
|
-
next
|
122
|
-
end
|
123
|
-
end
|
124
|
-
result = Result.new 'idk', false
|
125
|
-
begin
|
126
|
-
page = agent.get element.absolute
|
127
|
-
result.code = page.code
|
128
|
-
if @redirect
|
129
|
-
result_link = Link.new(page.uri.to_s, element.parent)
|
130
|
-
internal = result_link.internal?(addrroot)
|
131
|
-
end
|
132
|
-
queue += page.links.map { |link| Link.new(link.uri.to_s, element.absolute) } if internal && (page.class != Mechanize::File) && (page.class != Mechanize::Image)
|
133
|
-
rescue Mechanize::ResponseCodeError => code
|
134
|
-
result.code = code.response_code
|
135
|
-
rescue => e
|
136
|
-
result.error = e
|
137
|
-
end
|
138
|
-
@results.set_child_value element.absolute, result
|
139
|
-
unless quiet
|
140
|
-
puts "[#{@name.colorize(mode: :bold)}][#{result.code.colorize(color: result.color, mode: :bold)}][#{(internal ? 'internal' : 'external').colorize(mode: :bold)}] #{element.absolute}"
|
141
|
-
puts "\a^ #{'Error'.colorize(mode: :bold, color: :red)}: #{result.error}" unless result.error == false
|
142
|
-
end
|
143
|
-
end
|
144
|
-
end
|
145
|
-
|
146
|
-
##
|
147
|
-
# Find links matching glob patterns, starting from the roots. Overrides (but does not overwrite) +internal_only+ behavior to +true+.
|
148
|
-
def find(glob, quiet: false)
|
149
|
-
queue = @roots.map { |url| Link.new(url, :root) }
|
150
|
-
addrroot = @roots.map { |url| Addressable::URI.parse url }
|
151
|
-
raise ProfileError, 'No roots to start from' if queue.empty?
|
152
|
-
progress = ProgressBar.create(total: nil, format: '|%B|') unless quiet
|
153
|
-
agent = create_agent
|
154
|
-
while queue.length >= 1
|
155
|
-
element = queue.shift
|
156
|
-
match = element.match? glob
|
157
|
-
if match
|
158
|
-
@results.add element.absolute, element.parent
|
159
|
-
progress.log "[#{@name.colorize(mode: :bold)}][#{'found'.colorize(color: :green, mode: :bold)}] #{element.parent} => #{element.absolute}" unless quiet
|
160
|
-
end
|
161
|
-
next unless element.run?(@blacklist, @whitelist)
|
162
|
-
internal = element.internal?(addrroot)
|
163
|
-
next unless internal
|
164
|
-
next if @results.parent?(element.absolute)
|
165
|
-
if @scheme_squash
|
166
|
-
alt = element.address
|
167
|
-
alt.scheme = alt.scheme == 'http' ? 'https' : 'http'
|
168
|
-
next if @results.parent?(alt.to_s)
|
169
|
-
end
|
170
|
-
@results.add_parent element.absolute
|
171
|
-
result = Result.new 'idk', false
|
172
|
-
begin
|
173
|
-
page = agent.get element.absolute
|
174
|
-
result.code = page.code
|
175
|
-
if @redirect
|
176
|
-
result_link = Link.new(page.uri.to_s, element.parent)
|
177
|
-
next unless result_link.internal?(addrroot)
|
178
|
-
end
|
179
|
-
queue += page.links.map { |link| Link.new(link.uri.to_s, element.absolute) } unless (page.class == Mechanize::File) || (page.class == Mechanize::Image)
|
180
|
-
rescue Mechanize::ResponseCodeError => code
|
181
|
-
result.code = code.response_code
|
182
|
-
rescue => e
|
183
|
-
result.error = e
|
184
|
-
end
|
185
|
-
progress.increment unless quiet
|
186
|
-
unless quiet || (result.error == false)
|
187
|
-
progress.log "[#{@name.colorize(mode: :bold)}][#{result.code.colorize(color: result.color, mode: :bold)}] #{element.absolute}"
|
188
|
-
progress.log "\a^ #{'Error'.colorize(mode: :bold, color: :red)}: #{result.error}"
|
189
|
-
end
|
190
|
-
end
|
191
|
-
end
|
192
|
-
|
193
|
-
##
|
194
|
-
# Asserts existence of CSS selectors.
|
195
|
-
def assert(selectors, quiet: false)
|
196
|
-
queue = @roots.map { |url| Link.new(url, :root) }
|
197
|
-
addrroot = @roots.map { |url| Addressable::URI.parse url }
|
198
|
-
raise ProfileError, 'No roots to start from' if queue.empty?
|
199
|
-
agent = create_agent
|
200
|
-
while queue.length >= 1
|
201
|
-
element = queue.shift
|
202
|
-
internal = element.internal?(addrroot)
|
203
|
-
next unless element.run?(@blacklist, @whitelist) && internal && !@results.child?(element.absolute)
|
204
|
-
if @scheme_squash
|
205
|
-
alt = element.address
|
206
|
-
alt.scheme = alt.scheme == 'http' ? 'https' : 'http'
|
207
|
-
next if @results.child?(alt.to_s)
|
208
|
-
end
|
209
|
-
@results.add_child element.absolute
|
210
|
-
existence = nil
|
211
|
-
result = Result.new 'idk', false
|
212
|
-
begin
|
213
|
-
page = agent.get element.absolute
|
214
|
-
result.code = page.code
|
215
|
-
if @redirect
|
216
|
-
result_link = Link.new(page.uri.to_s, element.parent)
|
217
|
-
next unless result_link.internal?(addrroot)
|
218
|
-
end
|
219
|
-
unless (page.class == Mechanize::File) || (page.class == Mechanize::Image)
|
220
|
-
existence = {}
|
221
|
-
selectors.each do |selector|
|
222
|
-
existence[selector] = !page.css(selector).empty?
|
223
|
-
end
|
224
|
-
@results.set_child_value element.absolute, existence
|
225
|
-
queue += page.links.map { |link| Link.new(link.uri.to_s, element.absolute) }
|
226
|
-
end
|
227
|
-
rescue Mechanize::ResponseCodeError => code
|
228
|
-
result.code = code.response_code
|
229
|
-
rescue => e
|
230
|
-
result.error = e
|
231
|
-
end
|
232
|
-
unless quiet
|
233
|
-
if result.error != false
|
234
|
-
puts "[#{@name.colorize(mode: :bold)}][#{result.code.colorize(color: result.color, mode: :bold)}] #{element.absolute}"
|
235
|
-
puts "\a^ #{'Error'.colorize(mode: :bold, color: :red)}: #{result.error}"
|
236
|
-
elsif !existence.nil?
|
237
|
-
existence.each do |selector, exists|
|
238
|
-
puts "[#{@name.colorize(mode: :bold)}][#{selector.colorize(mode: :bold)}][#{exists.to_s.colorize(color: (exists ? :green : :red), mode: :bold)}] #{element.absolute}"
|
239
|
-
end
|
240
|
-
end
|
107
|
+
# Runs test.
|
108
|
+
def test(key, options = {})
|
109
|
+
unless @results.key?(key) && @results[key].class == Recluse::HashTree
|
110
|
+
@results[key] = Recluse::HashTree.new do |url1, url2|
|
111
|
+
url1, url2 = url2, url1 if url2.length > url1.length
|
112
|
+
# Detect if URL exists already, but just has a slash at end
|
113
|
+
(url1 == url2 || (url1.length == (url2.length + 1) && url1[-1] == '/' && url2[-1] != '/' && url1[0...-1] == url2))
|
241
114
|
end
|
242
115
|
end
|
116
|
+
@tasks[key] = Recluse::Tasks.get(key).new(self, options.merge(results: @results[key]))
|
117
|
+
@tasks[key].run
|
118
|
+
@results[key]
|
243
119
|
end
|
244
120
|
|
245
121
|
##
|
@@ -249,7 +125,7 @@ module Recluse
|
|
249
125
|
fname = "#{@name}.yaml"
|
250
126
|
options = uconf[fname]
|
251
127
|
options['name'] = @name
|
252
|
-
options['roots'] = @roots
|
128
|
+
options['roots'] = @roots.map(&:to_s)
|
253
129
|
options['email'] = @email
|
254
130
|
options['blacklist'] = @blacklist
|
255
131
|
options['whitelist'] = @whitelist
|
@@ -264,7 +140,9 @@ module Recluse
|
|
264
140
|
def ==(other)
|
265
141
|
return false if other.class != self.class
|
266
142
|
instance_variables.all? do |ivar|
|
267
|
-
ivar == '@results'.to_sym
|
143
|
+
next true if ivar == '@results'.to_sym
|
144
|
+
next true if ivar == '@roots' && instance_variable_get(ivar).map(&:to_s) == other.instance_variable_get(ivar).map(&:to_s)
|
145
|
+
instance_variable_get(ivar) == other.instance_variable_get(ivar)
|
268
146
|
end
|
269
147
|
end
|
270
148
|
|
@@ -0,0 +1,74 @@
|
|
1
|
+
require 'mechanize'
|
2
|
+
require 'recluse/response'
|
3
|
+
|
4
|
+
module Recluse
|
5
|
+
##
|
6
|
+
# Link checker
|
7
|
+
class Queue
|
8
|
+
##
|
9
|
+
# Create an empty queue
|
10
|
+
def initialize(email, redirect: false)
|
11
|
+
@links = []
|
12
|
+
@run_if = proc { true }
|
13
|
+
@on_complete = proc { |link, response| }
|
14
|
+
@redirect = redirect
|
15
|
+
@email = email
|
16
|
+
@agent = Mechanize.new do |a|
|
17
|
+
a.ssl_version = 'TLSv1'
|
18
|
+
a.verify_mode = OpenSSL::SSL::VERIFY_NONE
|
19
|
+
a.max_history = nil
|
20
|
+
a.follow_meta_refresh = true
|
21
|
+
a.keep_alive = false
|
22
|
+
a.redirect_ok = @redirect
|
23
|
+
a.user_agent = "Mozilla/5.0 (compatible; recluse/#{Recluse::VERSION}; +#{Recluse::URL}) #{@email}"
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
##
|
28
|
+
# Add to queue.
|
29
|
+
def add(link)
|
30
|
+
@links += [*link]
|
31
|
+
end
|
32
|
+
|
33
|
+
##
|
34
|
+
# If the test is true, run the link. Procedure takes the link as input.
|
35
|
+
def run_if(&block)
|
36
|
+
@run_if = block
|
37
|
+
end
|
38
|
+
|
39
|
+
##
|
40
|
+
# Run when a link has been checked. Procedure takes the link and response as inputs.
|
41
|
+
def on_complete(&block)
|
42
|
+
@on_complete = block
|
43
|
+
end
|
44
|
+
|
45
|
+
##
|
46
|
+
# Run a link
|
47
|
+
def run_link(link)
|
48
|
+
response = Response.new
|
49
|
+
return nil unless @run_if.call(link)
|
50
|
+
begin
|
51
|
+
response.page = @agent.get link.absolute
|
52
|
+
response.code = response.page.code
|
53
|
+
response.success = true
|
54
|
+
rescue Mechanize::ResponseCodeError => code
|
55
|
+
response.code = code.response_code
|
56
|
+
response.success = false
|
57
|
+
rescue => error
|
58
|
+
response.errors = error
|
59
|
+
response.success = false
|
60
|
+
end
|
61
|
+
@on_complete.call link, response
|
62
|
+
response
|
63
|
+
end
|
64
|
+
|
65
|
+
##
|
66
|
+
# Run queue
|
67
|
+
def run
|
68
|
+
until @links.empty?
|
69
|
+
link = @links.shift
|
70
|
+
run_link link
|
71
|
+
end
|
72
|
+
end
|
73
|
+
end
|
74
|
+
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
require 'mechanize'
|
2
|
+
require 'recluse/statuscode'
|
3
|
+
|
4
|
+
module Recluse
|
5
|
+
##
|
6
|
+
# Response wrapper.
|
7
|
+
class Response
|
8
|
+
##
|
9
|
+
# +Mechanize::Page+ of the response page. Might be +nil+.
|
10
|
+
attr_accessor :page
|
11
|
+
|
12
|
+
##
|
13
|
+
# +StatusCode+ of the response.
|
14
|
+
attr_reader :code
|
15
|
+
|
16
|
+
##
|
17
|
+
# Error string if any.
|
18
|
+
attr_accessor :errors
|
19
|
+
|
20
|
+
##
|
21
|
+
# Whether the page was successfully accessed or not.
|
22
|
+
attr_accessor :success
|
23
|
+
|
24
|
+
##
|
25
|
+
# Create new response.
|
26
|
+
def initialize(page: nil, errors: false, code: StatusCode.new('idk'), success: false)
|
27
|
+
@page = page
|
28
|
+
@code = code
|
29
|
+
@errors = errors
|
30
|
+
@success = success
|
31
|
+
end
|
32
|
+
|
33
|
+
##
|
34
|
+
# Set a new status code.
|
35
|
+
def code=(new_code)
|
36
|
+
@code = StatusCode.new new_code
|
37
|
+
end
|
38
|
+
end
|
39
|
+
end
|
data/lib/recluse/result.rb
CHANGED
@@ -0,0 +1,59 @@
|
|
1
|
+
require 'recluse/tasks/task'
|
2
|
+
require 'recluse/link'
|
3
|
+
require 'recluse/result'
|
4
|
+
require 'addressable/uri'
|
5
|
+
require 'colorize'
|
6
|
+
|
7
|
+
module Recluse
|
8
|
+
module Tasks
|
9
|
+
##
|
10
|
+
# Asserts existence of CSS selectors.
|
11
|
+
class Assert < Task
|
12
|
+
##
|
13
|
+
# Create new assertion task.
|
14
|
+
def initialize(profile, selectors: [], quiet: false, results: nil)
|
15
|
+
super(profile, queue_options: { redirect: profile.redirect }, results: results)
|
16
|
+
addr_roots = profile.roots.map { |root| Addressable::URI.parse(root.url) }
|
17
|
+
@queue.run_if do |link|
|
18
|
+
internal = link.internal?(addr_roots)
|
19
|
+
next false unless link.run?(profile.blacklist, profile.whitelist) && internal && !@results.child?(link.absolute)
|
20
|
+
if profile.scheme_squash
|
21
|
+
alt = link.address
|
22
|
+
alt.scheme = alt.scheme == 'http' ? 'https' : 'http'
|
23
|
+
next false if @results.child?(alt.to_s)
|
24
|
+
end
|
25
|
+
@results.add_child link.absolute
|
26
|
+
true
|
27
|
+
end
|
28
|
+
@queue.on_complete do |link, response|
|
29
|
+
existence = nil
|
30
|
+
result = Recluse::Result.new response.code.to_s, response.errors
|
31
|
+
if response.success
|
32
|
+
if profile.redirect
|
33
|
+
result_link = Link.new(response.page.uri.to_s, link.parent)
|
34
|
+
next unless result_link.internal?(addr_roots)
|
35
|
+
end
|
36
|
+
unless (response.page.class == Mechanize::File) || (response.page.class == Mechanize::Image)
|
37
|
+
existence = {}
|
38
|
+
selectors.each do |selector|
|
39
|
+
existence[selector] = !response.page.css(selector).empty?
|
40
|
+
end
|
41
|
+
@results.set_child_value link.absolute, existence
|
42
|
+
@queue.add(response.page.links.map { |new_link| Link.new(new_link.uri.to_s, link.absolute) })
|
43
|
+
end
|
44
|
+
end
|
45
|
+
unless quiet
|
46
|
+
if result.error != false
|
47
|
+
puts "[#{profile.name.colorize(mode: :bold)}][#{result.code.colorize(color: result.color, mode: :bold)}] #{link.absolute}"
|
48
|
+
puts "\a^ #{'Error'.colorize(mode: :bold, color: :red)}: #{result.error}"
|
49
|
+
elsif !existence.nil?
|
50
|
+
existence.each do |selector, exists|
|
51
|
+
puts "[#{profile.name.colorize(mode: :bold)}][#{selector.colorize(mode: :bold)}][#{exists.to_s.colorize(color: (exists ? :green : :red), mode: :bold)}] #{link.absolute}"
|
52
|
+
end
|
53
|
+
end
|
54
|
+
end
|
55
|
+
end
|
56
|
+
end
|
57
|
+
end
|
58
|
+
end
|
59
|
+
end
|
@@ -0,0 +1,55 @@
|
|
1
|
+
require 'recluse/tasks/task'
|
2
|
+
require 'recluse/link'
|
3
|
+
require 'recluse/result'
|
4
|
+
require 'addressable/uri'
|
5
|
+
require 'colorize'
|
6
|
+
require 'ruby-progressbar'
|
7
|
+
|
8
|
+
module Recluse
|
9
|
+
module Tasks
|
10
|
+
##
|
11
|
+
# Find links matching glob patterns, starting from the roots. Overrides (but does not overwrite) +internal_only+ behavior to +true+.
|
12
|
+
class Find < Task
|
13
|
+
##
|
14
|
+
# Create new find task.
|
15
|
+
def initialize(profile, globs: [], quiet: false, results: nil)
|
16
|
+
super(profile, queue_options: { redirect: profile.redirect }, results: results)
|
17
|
+
addr_roots = profile.roots.map { |root| Addressable::URI.parse(root.url) }
|
18
|
+
progress = ProgressBar.create(total: nil, format: '|%B|') unless quiet
|
19
|
+
@queue.run_if do |link|
|
20
|
+
match = link.match? globs
|
21
|
+
if match
|
22
|
+
@results.add link.absolute, link.parent
|
23
|
+
progress.log "[#{profile.name.colorize(mode: :bold)}][#{'found'.colorize(color: :green, mode: :bold)}] #{link.parent} => #{link.absolute}" unless quiet
|
24
|
+
end
|
25
|
+
next false unless link.run?(profile.blacklist, profile.whitelist)
|
26
|
+
internal = link.internal?(addr_roots)
|
27
|
+
next false unless internal
|
28
|
+
next false if @results.parent?(link.absolute)
|
29
|
+
if profile.scheme_squash
|
30
|
+
alt = link.address
|
31
|
+
alt.scheme = alt.scheme == 'http' ? 'https' : 'http'
|
32
|
+
next false if @results.parent?(alt.to_s)
|
33
|
+
end
|
34
|
+
@results.add_parent link.absolute
|
35
|
+
true
|
36
|
+
end
|
37
|
+
@queue.on_complete do |link, response|
|
38
|
+
result = Recluse::Result.new response.code.to_s, response.errors
|
39
|
+
if response.success
|
40
|
+
if profile.redirect
|
41
|
+
result_link = Recluse::Link.new(response.page.uri.to_s, link.parent)
|
42
|
+
next unless result_link.internal?(addr_roots)
|
43
|
+
end
|
44
|
+
@queue.add(response.page.links.map { |new_link| Link.new(new_link.uri.to_s, link.absolute) }) unless (response.page.class == Mechanize::File) || (response.page.class == Mechanize::Image)
|
45
|
+
end
|
46
|
+
progress.increment unless quiet
|
47
|
+
unless quiet || (result.error == false)
|
48
|
+
progress.log "[#{profile.name.colorize(mode: :bold)}][#{result.code.colorize(color: result.color, mode: :bold)}] #{link.absolute}"
|
49
|
+
progress.log "\a^ #{'Error'.colorize(mode: :bold, color: :red)}: #{result.error}"
|
50
|
+
end
|
51
|
+
end
|
52
|
+
end
|
53
|
+
end
|
54
|
+
end
|
55
|
+
end
|
@@ -0,0 +1,42 @@
|
|
1
|
+
require 'recluse/tasks/status'
|
2
|
+
require 'recluse/tasks/assert'
|
3
|
+
require 'recluse/tasks/find'
|
4
|
+
|
5
|
+
module Recluse
|
6
|
+
##
|
7
|
+
# Tasks are tests for Recluse.
|
8
|
+
module Tasks
|
9
|
+
##
|
10
|
+
# Hash of available tasks.
|
11
|
+
@@list = {
|
12
|
+
status: Recluse::Tasks::Status,
|
13
|
+
assert: Recluse::Tasks::Assert,
|
14
|
+
find: Recluse::Tasks::Find
|
15
|
+
}
|
16
|
+
class << self
|
17
|
+
##
|
18
|
+
# Add task to the list.
|
19
|
+
def add_task(key, task_class)
|
20
|
+
list[key] = task_class
|
21
|
+
end
|
22
|
+
|
23
|
+
##
|
24
|
+
# Hash of available tasks.
|
25
|
+
def list
|
26
|
+
@@list
|
27
|
+
end
|
28
|
+
|
29
|
+
##
|
30
|
+
# Get task by key name.
|
31
|
+
def get(key)
|
32
|
+
@@list[key]
|
33
|
+
end
|
34
|
+
|
35
|
+
##
|
36
|
+
# Do something for each task.
|
37
|
+
def each(&block)
|
38
|
+
@@list.each(&block)
|
39
|
+
end
|
40
|
+
end
|
41
|
+
end
|
42
|
+
end
|
@@ -0,0 +1,55 @@
|
|
1
|
+
require 'recluse/tasks/task'
|
2
|
+
require 'recluse/link'
|
3
|
+
require 'recluse/result'
|
4
|
+
require 'addressable/uri'
|
5
|
+
require 'colorize'
|
6
|
+
|
7
|
+
module Recluse
|
8
|
+
module Tasks
|
9
|
+
##
|
10
|
+
# Starting from the roots, goes through each runnable link and records the referrer, the status code, and any errors.
|
11
|
+
class Status < Task
|
12
|
+
##
|
13
|
+
# Create new status task.
|
14
|
+
def initialize(profile, quiet: false, results: nil)
|
15
|
+
super(profile, queue_options: { redirect: profile.redirect }, results: results)
|
16
|
+
addr_roots = profile.roots.map { |root| Addressable::URI.parse(root.url) }
|
17
|
+
@queue.run_if do |link|
|
18
|
+
next false unless link.run?(profile.blacklist, profile.whitelist)
|
19
|
+
internal = link.internal?(addr_roots)
|
20
|
+
next false if profile.internal_only && !internal
|
21
|
+
if @results.child?(link.absolute)
|
22
|
+
@results.add link.absolute, link.parent
|
23
|
+
next false
|
24
|
+
end
|
25
|
+
@results.add link.absolute, link.parent
|
26
|
+
if profile.scheme_squash
|
27
|
+
alt = link.address
|
28
|
+
alt.scheme = alt.scheme == 'http' ? 'https' : 'http'
|
29
|
+
if @results.child?(alt.to_s)
|
30
|
+
@results.set_child_value link.absolute, @results.get_child_value(alt.to_s)
|
31
|
+
next false
|
32
|
+
end
|
33
|
+
end
|
34
|
+
true
|
35
|
+
end
|
36
|
+
@queue.on_complete do |link, response|
|
37
|
+
result = Recluse::Result.new response.code.to_s, response.errors
|
38
|
+
if response.success
|
39
|
+
internal = link.internal? addr_roots
|
40
|
+
if profile.redirect
|
41
|
+
result_link = Recluse::Link.new response.page.uri.to_s, link.parent
|
42
|
+
internal = result_link.internal? addr_roots
|
43
|
+
end
|
44
|
+
queue.add(response.page.links.map { |new_link| Link.new(new_link.uri.to_s, link.absolute) }) if internal && (response.page.class != Mechanize::File) && (response.page.class != Mechanize::Image)
|
45
|
+
end
|
46
|
+
@results.set_child_value link.absolute, result
|
47
|
+
unless quiet
|
48
|
+
puts "[#{profile.name.colorize(mode: :bold)}][#{result.code.colorize(color: result.color, mode: :bold)}][#{(internal ? 'internal' : 'external').colorize(mode: :bold)}] #{link.absolute}"
|
49
|
+
puts "\a^ #{'Error'.colorize(mode: :bold, color: :red)}: #{result.error}" unless result.error == false
|
50
|
+
end
|
51
|
+
end
|
52
|
+
end
|
53
|
+
end
|
54
|
+
end
|
55
|
+
end
|
@@ -0,0 +1,46 @@
|
|
1
|
+
require 'recluse/queue'
|
2
|
+
require 'recluse/hashtree'
|
3
|
+
|
4
|
+
module Recluse
|
5
|
+
module Tasks
|
6
|
+
##
|
7
|
+
# Task interface. Runs the queue with customized behavior.
|
8
|
+
class Task
|
9
|
+
##
|
10
|
+
# +HashTree+ representation of results.
|
11
|
+
attr_reader :results
|
12
|
+
|
13
|
+
##
|
14
|
+
# +Queue+ of links to check.
|
15
|
+
attr_accessor :queue
|
16
|
+
|
17
|
+
##
|
18
|
+
# Create new task.
|
19
|
+
def initialize(profile, queue_options: {}, results: nil)
|
20
|
+
@queue = Recluse::Queue.new(profile.email, queue_options)
|
21
|
+
if results.nil?
|
22
|
+
@results = Recluse::HashTree.new do |url1, url2|
|
23
|
+
url1, url2 = url2, url1 if url2.length > url1.length
|
24
|
+
# Detect if URL exists already, but just has a slash at end
|
25
|
+
(url1 == url2 || (url1.length == (url2.length + 1) && url1[-1] == '/' && url2[-1] != '/' && url1[0...-1] == url2))
|
26
|
+
end
|
27
|
+
else
|
28
|
+
@results = results
|
29
|
+
end
|
30
|
+
@queue.add profile.roots
|
31
|
+
end
|
32
|
+
|
33
|
+
##
|
34
|
+
# Add link (or links) to the queue.
|
35
|
+
def add(link)
|
36
|
+
@queue.add link
|
37
|
+
end
|
38
|
+
|
39
|
+
##
|
40
|
+
# Run the queue.
|
41
|
+
def run
|
42
|
+
@queue.run
|
43
|
+
end
|
44
|
+
end
|
45
|
+
end
|
46
|
+
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: recluse
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.2
|
4
|
+
version: 2.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- James Anthony Bruno
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-03-
|
11
|
+
date: 2017-03-28 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: thor
|
@@ -203,8 +203,15 @@ files:
|
|
203
203
|
- lib/recluse/info.rb
|
204
204
|
- lib/recluse/link.rb
|
205
205
|
- lib/recluse/profile.rb
|
206
|
+
- lib/recluse/queue.rb
|
207
|
+
- lib/recluse/response.rb
|
206
208
|
- lib/recluse/result.rb
|
207
209
|
- lib/recluse/statuscode.rb
|
210
|
+
- lib/recluse/tasks/assert.rb
|
211
|
+
- lib/recluse/tasks/find.rb
|
212
|
+
- lib/recluse/tasks/list.rb
|
213
|
+
- lib/recluse/tasks/status.rb
|
214
|
+
- lib/recluse/tasks/task.rb
|
208
215
|
- recluse.gemspec
|
209
216
|
homepage: https://github.com/czycha/recluse
|
210
217
|
licenses:
|