recluse 1.0.2 → 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 396ec13b808c57e16b103fcc9dd55c228fdef9b5
4
- data.tar.gz: 89b5a9aa4d946fb4d3653a7422407b63fa8263fc
3
+ metadata.gz: 4bc973f210fae9eda00a82ccc421c957098182b3
4
+ data.tar.gz: 27b8c662b5e24f3b48e8d29277ef82d027d128a7
5
5
  SHA512:
6
- metadata.gz: b43648a734c3330cf4f536e0572e02069922574a275ec83c3fc7f2b60019274741d361ce61acc469b26222211c27dc630ec975ae7f59bf8bf313c906edd1a079
7
- data.tar.gz: 04f1f680c9dc0f6fad64f97ae410ca1233a28a52276059d652449c3eac4baa53da1068fe487c5d88f03ed791e4b8cb0011782c53051c6f182990a8869f2139d5
6
+ metadata.gz: 528286fe546e0862abd045b13abef5722a102aa3e9a660d93b08206683cf01a8a61593de6bfd40e93bbeff765a82890ebf593a8b70d97125d683ce7bead4c38f
7
+ data.tar.gz: cb4fcf8872a03d57635756616813603556a9099498f8376dc1dfc9e8e7363fbe71017af2c95b61801f6b15691746c1d4fc909c2de9537e8021be113c5e3873e4
data/README.md CHANGED
@@ -228,6 +228,43 @@ List the YAML info of the profile.
228
228
 
229
229
  Bug reports and pull requests are welcome on GitHub.
230
230
 
231
+ ## Extending
232
+
233
+ Recluse is modular so you can add tasks if you want. Below is an example of adding your own task to Recluse.
234
+
235
+ ```ruby
236
+ require 'recluse'
237
+
238
+ module MyModule
239
+ ##
240
+ # Create a task object
241
+ class MyTask < Recluse::Tasks::Task
242
+ ##
243
+ # First argument must be the profile. The rest are hash arguments specific for the task.
244
+ def initialize(profile, option1: false, option2: true, results: nil)
245
+ # Sets up everything based on the profile, queue-specific options, and can also prepopulate results.
246
+ super(profile, queue_options: { redirect: profile.redirect }, results: results)
247
+ @queue.run_if do |link|
248
+ # Run a link if this function returns true.
249
+ # Link is a Recluse::Link object.
250
+ end
251
+ @queue.on_complete do |link, response|
252
+ # Run this function after the page has either successfully been retrieved, or failed to be retrieved.
253
+ # Link is a Recluse::Link object.
254
+ # Response is a Recluse::Response object.
255
+ end
256
+ end
257
+ end
258
+ end
259
+
260
+ # Add your task to the task list under the key 'my_task'.
261
+ Recluse::Tasks.add_task(:my_task, MyModule::MyTask)
262
+
263
+ # You can now access 'my_task' like you would the default Recluse tasks.
264
+ my_profile = Recluse::Profile.load('my_profile')
265
+ my_profile.test(:my_task, option1: true, option2: true)
266
+ results = my_profile.results[:my_task]
267
+ ```
231
268
 
232
269
  ## License
233
270
 
@@ -24,7 +24,7 @@ module Recluse
24
24
  parent_count = 0
25
25
  case group_by
26
26
  when :page
27
- report = profile.results.parents
27
+ report = profile.results[:find].parents
28
28
  CSV.open(csv_path, 'w+') do |csv|
29
29
  csv << ['Page', 'Matching URLs']
30
30
  report.each do |parent, children|
@@ -35,7 +35,7 @@ module Recluse
35
35
  end
36
36
  end
37
37
  when :none
38
- report = profile.results.parents
38
+ report = profile.results[:find].parents
39
39
  CSV.open(csv_path, 'w+') do |csv|
40
40
  csv << ['Matching URL', 'Page']
41
41
  report.each do |parent, children|
@@ -47,7 +47,7 @@ module Recluse
47
47
  end
48
48
  end
49
49
  when :url
50
- report = profile.results.children
50
+ report = profile.results[:find].children
51
51
  CSV.open(csv_path, 'w+') do |csv|
52
52
  csv << ['Matching URL', 'Pages']
53
53
  parents = Set.new
@@ -61,7 +61,7 @@ module Recluse
61
61
  parent_count = parents.length
62
62
  end
63
63
  end
64
- total = profile.results.parents.keys.length
64
+ total = profile.results[:find].parents.keys.length
65
65
  puts "Total pages:\t#{total}"
66
66
  puts "Matched URLs:\t#{child_count}"
67
67
  puts "Pages with matches:\t#{parent_count}\t#{perc parent_count, total}%"
@@ -87,7 +87,7 @@ module Recluse
87
87
  valid_status = proc do |code|
88
88
  (includes.any? { |include_code| include_code.equal?(code) }) && (excludes.none? { |exclude_code| exclude_code.equal?(code) })
89
89
  end
90
- report = profile.results.children
90
+ report = profile.results[:status].children
91
91
  CSV.open(csv_path, 'w+') do |csv|
92
92
  csv << ['Status code', 'URL', page_label, 'With error']
93
93
  report.each do |child, info|
@@ -118,7 +118,7 @@ module Recluse
118
118
 
119
119
  def assert_save(profile, csv_path, report_vals)
120
120
  puts 'Saving report...'
121
- report = profile.results.children
121
+ report = profile.results[:assert].children
122
122
  counts = {}
123
123
  CSV.open(csv_path, 'w+') do |csv|
124
124
  csv << ['Selector', 'Exists', 'On page']
@@ -182,8 +182,8 @@ module Recluse
182
182
  Signal.trap sig, &ending
183
183
  end
184
184
  (0...profile_queue.length).each do |i|
185
- profile.results = profile_queue[i - 1].results unless i.zero?
186
- profile.status
185
+ profile.results[:status] = profile_queue[i - 1].results[:status] unless i.zero?
186
+ profile.test :status
187
187
  profile = profile_queue[i + 1] if i + 1 < profile_queue.length
188
188
  end
189
189
  %w(INT TERM).each do |sig|
@@ -219,8 +219,8 @@ module Recluse
219
219
  Signal.trap sig, &ending
220
220
  end
221
221
  (0...profile_queue.length).each do |i|
222
- profile.results = profile_queue[i - 1].results unless i.zero?
223
- profile.find options['globs']
222
+ profile.results[:find] = profile_queue[i - 1].results[:find] unless i.zero?
223
+ profile.test(:find, globs: options['globs'])
224
224
  profile = profile_queue[i + 1] if i + 1 < profile_queue.length
225
225
  end
226
226
  %w(INT TERM).each do |sig|
@@ -265,9 +265,10 @@ module Recluse
265
265
  %w(INT TERM).each do |sig|
266
266
  Signal.trap sig, &ending
267
267
  end
268
+
268
269
  (0...profile_queue.length).each do |i|
269
- profile.results = profile_queue[i - 1].results unless i.zero?
270
- profile.assert options['exists']
270
+ profile.results[:assert] = profile_queue[i - 1].results[:assert] unless i.zero?
271
+ profile.test(:assert, selectors: options['exists'])
271
272
  profile = profile_queue[i + 1] if i + 1 < profile_queue.length
272
273
  end
273
274
  %w(INT TERM).each do |sig|
@@ -1,7 +1,7 @@
1
1
  module Recluse
2
2
  ##
3
3
  # Version of the gem.
4
- VERSION = '1.0.2'.freeze
4
+ VERSION = '2.0.0'.freeze
5
5
 
6
6
  ##
7
7
  # Homepage of the gem (also used in user-agent).
@@ -42,6 +42,12 @@ module Recluse
42
42
  @absolute
43
43
  end
44
44
 
45
+ ##
46
+ # Inspection string. Same as +to_s+.
47
+ def inspect
48
+ to_s
49
+ end
50
+
45
51
  ##
46
52
  # Is the link internal compared to +Addressable::URI+ roots?
47
53
  def internal?(addrroots, scheme_squash: false)
@@ -2,6 +2,7 @@ require 'recluse/hashtree'
2
2
  require 'recluse/link'
3
3
  require 'recluse/result'
4
4
  require 'recluse/info'
5
+ require 'recluse/tasks/list'
5
6
  require 'addressable/uri'
6
7
  require 'mechanize'
7
8
  require 'colorize'
@@ -45,14 +46,18 @@ module Recluse
45
46
  # HTTP and HTTPS schemed URLs are treated as equal. Optional. Defaults to +false+.
46
47
  attr_accessor :scheme_squash
47
48
 
48
- ##
49
- # +HashTree+ representation of results.
50
- attr_accessor :results
51
-
52
49
  ##
53
50
  # When enabled, will follow redirects and report only the status code for the page that is landed upon. When disabled, will report the redirect status code. Defaults to +false+.
54
51
  attr_accessor :redirect
55
52
 
53
+ ##
54
+ # Hash of the tasks that have been run, keyed by task key.
55
+ attr_accessor :tasks
56
+
57
+ ##
58
+ # Hash of resulting +HashTree+s.
59
+ attr_accessor :results
60
+
56
61
  ##
57
62
  # Create a profile.
58
63
  def initialize(
@@ -68,17 +73,20 @@ module Recluse
68
73
  raise ProfileError, 'Profile needs roots for starting point' if roots.empty?
69
74
  @name = name
70
75
  @email = email
71
- @roots = roots
76
+ @roots = roots.map do |root|
77
+ if root.class == Link
78
+ root
79
+ else
80
+ Link.new(root, :root)
81
+ end
82
+ end
72
83
  @blacklist = blacklist
73
84
  @whitelist = whitelist
74
85
  @internal_only = internal_only
75
86
  @scheme_squash = scheme_squash
76
87
  @redirect = redirect
77
- @results = HashTree.new do |url1, url2|
78
- url1, url2 = url2, url1 if url2.length > url1.length
79
- # Detect if URL exists already, but just has a slash at end
80
- (url1 == url2 || (url1.length == (url2.length + 1) && url1[-1] == '/' && url2[-1] != '/' && url1[0...-1] == url2))
81
- end
88
+ @tasks = {}
89
+ @results = {}
82
90
  end
83
91
 
84
92
  ##
@@ -96,150 +104,18 @@ module Recluse
96
104
  end
97
105
 
98
106
  ##
99
- # Starting from the roots, goes through each runnable link and records the referrer, the status code, and any errors.
100
- # Results are saved in <tt>@results</tt>.
101
- def status(quiet: false)
102
- queue = @roots.map { |url| Link.new(url, :root) }
103
- addrroot = @roots.map { |url| Addressable::URI.parse url }
104
- raise ProfileError, 'No roots to start from' if queue.empty?
105
- agent = create_agent
106
- while queue.length >= 1
107
- element = queue.shift
108
- next unless element.run?(@blacklist, @whitelist)
109
- internal = element.internal?(addrroot)
110
- next if @internal_only && !internal
111
- if @results.child?(element.absolute)
112
- @results.add element.absolute, element.parent
113
- next
114
- end
115
- @results.add element.absolute, element.parent
116
- if @scheme_squash
117
- alt = element.address
118
- alt.scheme = alt.scheme == 'http' ? 'https' : 'http'
119
- if @results.child?(alt.to_s)
120
- @results.set_child_value element.absolute, @results.get_child_value(alt.to_s)
121
- next
122
- end
123
- end
124
- result = Result.new 'idk', false
125
- begin
126
- page = agent.get element.absolute
127
- result.code = page.code
128
- if @redirect
129
- result_link = Link.new(page.uri.to_s, element.parent)
130
- internal = result_link.internal?(addrroot)
131
- end
132
- queue += page.links.map { |link| Link.new(link.uri.to_s, element.absolute) } if internal && (page.class != Mechanize::File) && (page.class != Mechanize::Image)
133
- rescue Mechanize::ResponseCodeError => code
134
- result.code = code.response_code
135
- rescue => e
136
- result.error = e
137
- end
138
- @results.set_child_value element.absolute, result
139
- unless quiet
140
- puts "[#{@name.colorize(mode: :bold)}][#{result.code.colorize(color: result.color, mode: :bold)}][#{(internal ? 'internal' : 'external').colorize(mode: :bold)}] #{element.absolute}"
141
- puts "\a^ #{'Error'.colorize(mode: :bold, color: :red)}: #{result.error}" unless result.error == false
142
- end
143
- end
144
- end
145
-
146
- ##
147
- # Find links matching glob patterns, starting from the roots. Overrides (but does not overwrite) +internal_only+ behavior to +true+.
148
- def find(glob, quiet: false)
149
- queue = @roots.map { |url| Link.new(url, :root) }
150
- addrroot = @roots.map { |url| Addressable::URI.parse url }
151
- raise ProfileError, 'No roots to start from' if queue.empty?
152
- progress = ProgressBar.create(total: nil, format: '|%B|') unless quiet
153
- agent = create_agent
154
- while queue.length >= 1
155
- element = queue.shift
156
- match = element.match? glob
157
- if match
158
- @results.add element.absolute, element.parent
159
- progress.log "[#{@name.colorize(mode: :bold)}][#{'found'.colorize(color: :green, mode: :bold)}] #{element.parent} => #{element.absolute}" unless quiet
160
- end
161
- next unless element.run?(@blacklist, @whitelist)
162
- internal = element.internal?(addrroot)
163
- next unless internal
164
- next if @results.parent?(element.absolute)
165
- if @scheme_squash
166
- alt = element.address
167
- alt.scheme = alt.scheme == 'http' ? 'https' : 'http'
168
- next if @results.parent?(alt.to_s)
169
- end
170
- @results.add_parent element.absolute
171
- result = Result.new 'idk', false
172
- begin
173
- page = agent.get element.absolute
174
- result.code = page.code
175
- if @redirect
176
- result_link = Link.new(page.uri.to_s, element.parent)
177
- next unless result_link.internal?(addrroot)
178
- end
179
- queue += page.links.map { |link| Link.new(link.uri.to_s, element.absolute) } unless (page.class == Mechanize::File) || (page.class == Mechanize::Image)
180
- rescue Mechanize::ResponseCodeError => code
181
- result.code = code.response_code
182
- rescue => e
183
- result.error = e
184
- end
185
- progress.increment unless quiet
186
- unless quiet || (result.error == false)
187
- progress.log "[#{@name.colorize(mode: :bold)}][#{result.code.colorize(color: result.color, mode: :bold)}] #{element.absolute}"
188
- progress.log "\a^ #{'Error'.colorize(mode: :bold, color: :red)}: #{result.error}"
189
- end
190
- end
191
- end
192
-
193
- ##
194
- # Asserts existence of CSS selectors.
195
- def assert(selectors, quiet: false)
196
- queue = @roots.map { |url| Link.new(url, :root) }
197
- addrroot = @roots.map { |url| Addressable::URI.parse url }
198
- raise ProfileError, 'No roots to start from' if queue.empty?
199
- agent = create_agent
200
- while queue.length >= 1
201
- element = queue.shift
202
- internal = element.internal?(addrroot)
203
- next unless element.run?(@blacklist, @whitelist) && internal && !@results.child?(element.absolute)
204
- if @scheme_squash
205
- alt = element.address
206
- alt.scheme = alt.scheme == 'http' ? 'https' : 'http'
207
- next if @results.child?(alt.to_s)
208
- end
209
- @results.add_child element.absolute
210
- existence = nil
211
- result = Result.new 'idk', false
212
- begin
213
- page = agent.get element.absolute
214
- result.code = page.code
215
- if @redirect
216
- result_link = Link.new(page.uri.to_s, element.parent)
217
- next unless result_link.internal?(addrroot)
218
- end
219
- unless (page.class == Mechanize::File) || (page.class == Mechanize::Image)
220
- existence = {}
221
- selectors.each do |selector|
222
- existence[selector] = !page.css(selector).empty?
223
- end
224
- @results.set_child_value element.absolute, existence
225
- queue += page.links.map { |link| Link.new(link.uri.to_s, element.absolute) }
226
- end
227
- rescue Mechanize::ResponseCodeError => code
228
- result.code = code.response_code
229
- rescue => e
230
- result.error = e
231
- end
232
- unless quiet
233
- if result.error != false
234
- puts "[#{@name.colorize(mode: :bold)}][#{result.code.colorize(color: result.color, mode: :bold)}] #{element.absolute}"
235
- puts "\a^ #{'Error'.colorize(mode: :bold, color: :red)}: #{result.error}"
236
- elsif !existence.nil?
237
- existence.each do |selector, exists|
238
- puts "[#{@name.colorize(mode: :bold)}][#{selector.colorize(mode: :bold)}][#{exists.to_s.colorize(color: (exists ? :green : :red), mode: :bold)}] #{element.absolute}"
239
- end
240
- end
107
+ # Runs the task registered under +key+ with the given options and returns its resulting +HashTree+.
108
+ def test(key, options = {})
109
+ unless @results.key?(key) && @results[key].class == Recluse::HashTree
110
+ @results[key] = Recluse::HashTree.new do |url1, url2|
111
+ url1, url2 = url2, url1 if url2.length > url1.length
112
+ # Detect if URL exists already, but just has a slash at end
113
+ (url1 == url2 || (url1.length == (url2.length + 1) && url1[-1] == '/' && url2[-1] != '/' && url1[0...-1] == url2))
241
114
  end
242
115
  end
116
+ @tasks[key] = Recluse::Tasks.get(key).new(self, options.merge(results: @results[key]))
117
+ @tasks[key].run
118
+ @results[key]
243
119
  end
244
120
 
245
121
  ##
@@ -249,7 +125,7 @@ module Recluse
249
125
  fname = "#{@name}.yaml"
250
126
  options = uconf[fname]
251
127
  options['name'] = @name
252
- options['roots'] = @roots
128
+ options['roots'] = @roots.map(&:to_s)
253
129
  options['email'] = @email
254
130
  options['blacklist'] = @blacklist
255
131
  options['whitelist'] = @whitelist
@@ -264,7 +140,9 @@ module Recluse
264
140
  def ==(other)
265
141
  return false if other.class != self.class
266
142
  instance_variables.all? do |ivar|
267
- ivar == '@results'.to_sym || instance_variable_get(ivar) == other.instance_variable_get(ivar)
143
+ next true if ivar == '@results'.to_sym
144
+ next true if ivar == '@roots' && instance_variable_get(ivar).map(&:to_s) == other.instance_variable_get(ivar).map(&:to_s)
145
+ instance_variable_get(ivar) == other.instance_variable_get(ivar)
268
146
  end
269
147
  end
270
148
 
@@ -0,0 +1,74 @@
1
+ require 'mechanize'
2
+ require 'recluse/response'
3
+
4
+ module Recluse
5
+ ##
6
+ # Queue of links to check. Fetches each queued link in turn.
7
+ class Queue
8
+ ##
9
+ # Create an empty queue
10
+ def initialize(email, redirect: false)
11
+ @links = []
12
+ @run_if = proc { true }
13
+ @on_complete = proc { |link, response| }
14
+ @redirect = redirect
15
+ @email = email
16
+ @agent = Mechanize.new do |a|
17
+ a.ssl_version = 'TLSv1'
18
+ a.verify_mode = OpenSSL::SSL::VERIFY_NONE
19
+ a.max_history = nil
20
+ a.follow_meta_refresh = true
21
+ a.keep_alive = false
22
+ a.redirect_ok = @redirect
23
+ a.user_agent = "Mozilla/5.0 (compatible; recluse/#{Recluse::VERSION}; +#{Recluse::URL}) #{@email}"
24
+ end
25
+ end
26
+
27
+ ##
28
+ # Add to queue.
29
+ def add(link)
30
+ @links += [*link]
31
+ end
32
+
33
+ ##
34
+ # If the test is true, run the link. Procedure takes the link as input.
35
+ def run_if(&block)
36
+ @run_if = block
37
+ end
38
+
39
+ ##
40
+ # Run when a link has been checked. Procedure takes the link and response as inputs.
41
+ def on_complete(&block)
42
+ @on_complete = block
43
+ end
44
+
45
+ ##
46
+ # Run a link
47
+ def run_link(link)
48
+ response = Response.new
49
+ return nil unless @run_if.call(link)
50
+ begin
51
+ response.page = @agent.get link.absolute
52
+ response.code = response.page.code
53
+ response.success = true
54
+ rescue Mechanize::ResponseCodeError => code
55
+ response.code = code.response_code
56
+ response.success = false
57
+ rescue => error
58
+ response.errors = error
59
+ response.success = false
60
+ end
61
+ @on_complete.call link, response
62
+ response
63
+ end
64
+
65
+ ##
66
+ # Run queue
67
+ def run
68
+ until @links.empty?
69
+ link = @links.shift
70
+ run_link link
71
+ end
72
+ end
73
+ end
74
+ end
@@ -0,0 +1,39 @@
1
+ require 'mechanize'
2
+ require 'recluse/statuscode'
3
+
4
+ module Recluse
5
+ ##
6
+ # Response wrapper.
7
+ class Response
8
+ ##
9
+ # +Mechanize::Page+ of the response page. Might be +nil+.
10
+ attr_accessor :page
11
+
12
+ ##
13
+ # +StatusCode+ of the response.
14
+ attr_reader :code
15
+
16
+ ##
17
+ # Error raised while retrieving the page, if any. +false+ when there was no error.
18
+ attr_accessor :errors
19
+
20
+ ##
21
+ # Whether the page was successfully accessed or not.
22
+ attr_accessor :success
23
+
24
+ ##
25
+ # Create new response.
26
+ def initialize(page: nil, errors: false, code: StatusCode.new('idk'), success: false)
27
+ @page = page
28
+ @code = code
29
+ @errors = errors
30
+ @success = success
31
+ end
32
+
33
+ ##
34
+ # Set a new status code.
35
+ def code=(new_code)
36
+ @code = StatusCode.new new_code
37
+ end
38
+ end
39
+ end
@@ -20,7 +20,7 @@ module Recluse
20
20
  ##
21
21
  # Returns the HTTP status code.
22
22
  def inspect
23
- @code
23
+ @code.to_s
24
24
  end
25
25
 
26
26
  ##
@@ -0,0 +1,59 @@
1
+ require 'recluse/tasks/task'
2
+ require 'recluse/link'
3
+ require 'recluse/result'
4
+ require 'addressable/uri'
5
+ require 'colorize'
6
+
7
+ module Recluse
8
+ module Tasks
9
+ ##
10
+ # Asserts existence of CSS selectors.
11
+ class Assert < Task
12
+ ##
13
+ # Create new assertion task.
14
+ def initialize(profile, selectors: [], quiet: false, results: nil)
15
+ super(profile, queue_options: { redirect: profile.redirect }, results: results)
16
+ addr_roots = profile.roots.map { |root| Addressable::URI.parse(root.url) }
17
+ @queue.run_if do |link|
18
+ internal = link.internal?(addr_roots)
19
+ next false unless link.run?(profile.blacklist, profile.whitelist) && internal && !@results.child?(link.absolute)
20
+ if profile.scheme_squash
21
+ alt = link.address
22
+ alt.scheme = alt.scheme == 'http' ? 'https' : 'http'
23
+ next false if @results.child?(alt.to_s)
24
+ end
25
+ @results.add_child link.absolute
26
+ true
27
+ end
28
+ @queue.on_complete do |link, response|
29
+ existence = nil
30
+ result = Recluse::Result.new response.code.to_s, response.errors
31
+ if response.success
32
+ if profile.redirect
33
+ result_link = Link.new(response.page.uri.to_s, link.parent)
34
+ next unless result_link.internal?(addr_roots)
35
+ end
36
+ unless (response.page.class == Mechanize::File) || (response.page.class == Mechanize::Image)
37
+ existence = {}
38
+ selectors.each do |selector|
39
+ existence[selector] = !response.page.css(selector).empty?
40
+ end
41
+ @results.set_child_value link.absolute, existence
42
+ @queue.add(response.page.links.map { |new_link| Link.new(new_link.uri.to_s, link.absolute) })
43
+ end
44
+ end
45
+ unless quiet
46
+ if result.error != false
47
+ puts "[#{profile.name.colorize(mode: :bold)}][#{result.code.colorize(color: result.color, mode: :bold)}] #{link.absolute}"
48
+ puts "\a^ #{'Error'.colorize(mode: :bold, color: :red)}: #{result.error}"
49
+ elsif !existence.nil?
50
+ existence.each do |selector, exists|
51
+ puts "[#{profile.name.colorize(mode: :bold)}][#{selector.colorize(mode: :bold)}][#{exists.to_s.colorize(color: (exists ? :green : :red), mode: :bold)}] #{link.absolute}"
52
+ end
53
+ end
54
+ end
55
+ end
56
+ end
57
+ end
58
+ end
59
+ end
@@ -0,0 +1,55 @@
1
+ require 'recluse/tasks/task'
2
+ require 'recluse/link'
3
+ require 'recluse/result'
4
+ require 'addressable/uri'
5
+ require 'colorize'
6
+ require 'ruby-progressbar'
7
+
8
+ module Recluse
9
+ module Tasks
10
+ ##
11
+ # Find links matching glob patterns, starting from the roots. Overrides (but does not overwrite) +internal_only+ behavior to +true+.
12
+ class Find < Task
13
+ ##
14
+ # Create new find task.
15
+ def initialize(profile, globs: [], quiet: false, results: nil)
16
+ super(profile, queue_options: { redirect: profile.redirect }, results: results)
17
+ addr_roots = profile.roots.map { |root| Addressable::URI.parse(root.url) }
18
+ progress = ProgressBar.create(total: nil, format: '|%B|') unless quiet
19
+ @queue.run_if do |link|
20
+ match = link.match? globs
21
+ if match
22
+ @results.add link.absolute, link.parent
23
+ progress.log "[#{profile.name.colorize(mode: :bold)}][#{'found'.colorize(color: :green, mode: :bold)}] #{link.parent} => #{link.absolute}" unless quiet
24
+ end
25
+ next false unless link.run?(profile.blacklist, profile.whitelist)
26
+ internal = link.internal?(addr_roots)
27
+ next false unless internal
28
+ next false if @results.parent?(link.absolute)
29
+ if profile.scheme_squash
30
+ alt = link.address
31
+ alt.scheme = alt.scheme == 'http' ? 'https' : 'http'
32
+ next false if @results.parent?(alt.to_s)
33
+ end
34
+ @results.add_parent link.absolute
35
+ true
36
+ end
37
+ @queue.on_complete do |link, response|
38
+ result = Recluse::Result.new response.code.to_s, response.errors
39
+ if response.success
40
+ if profile.redirect
41
+ result_link = Recluse::Link.new(response.page.uri.to_s, link.parent)
42
+ next unless result_link.internal?(addr_roots)
43
+ end
44
+ @queue.add(response.page.links.map { |new_link| Link.new(new_link.uri.to_s, link.absolute) }) unless (response.page.class == Mechanize::File) || (response.page.class == Mechanize::Image)
45
+ end
46
+ progress.increment unless quiet
47
+ unless quiet || (result.error == false)
48
+ progress.log "[#{profile.name.colorize(mode: :bold)}][#{result.code.colorize(color: result.color, mode: :bold)}] #{link.absolute}"
49
+ progress.log "\a^ #{'Error'.colorize(mode: :bold, color: :red)}: #{result.error}"
50
+ end
51
+ end
52
+ end
53
+ end
54
+ end
55
+ end
@@ -0,0 +1,42 @@
1
+ require 'recluse/tasks/status'
2
+ require 'recluse/tasks/assert'
3
+ require 'recluse/tasks/find'
4
+
5
+ module Recluse
6
+ ##
7
+ # Tasks are tests for Recluse.
8
+ module Tasks
9
+ ##
10
+ # Hash of available tasks.
11
+ @@list = {
12
+ status: Recluse::Tasks::Status,
13
+ assert: Recluse::Tasks::Assert,
14
+ find: Recluse::Tasks::Find
15
+ }
16
+ class << self
17
+ ##
18
+ # Add task to the list.
19
+ def add_task(key, task_class)
20
+ list[key] = task_class
21
+ end
22
+
23
+ ##
24
+ # Hash of available tasks.
25
+ def list
26
+ @@list
27
+ end
28
+
29
+ ##
30
+ # Get task by key name.
31
+ def get(key)
32
+ @@list[key]
33
+ end
34
+
35
+ ##
36
+ # Do something for each task.
37
+ def each(&block)
38
+ @@list.each(&block)
39
+ end
40
+ end
41
+ end
42
+ end
@@ -0,0 +1,55 @@
1
+ require 'recluse/tasks/task'
2
+ require 'recluse/link'
3
+ require 'recluse/result'
4
+ require 'addressable/uri'
5
+ require 'colorize'
6
+
7
+ module Recluse
8
+ module Tasks
9
+ ##
10
+ # Starting from the roots, goes through each runnable link and records the referrer, the status code, and any errors.
11
+ class Status < Task
12
+ ##
13
+ # Create new status task.
14
+ def initialize(profile, quiet: false, results: nil)
15
+ super(profile, queue_options: { redirect: profile.redirect }, results: results)
16
+ addr_roots = profile.roots.map { |root| Addressable::URI.parse(root.url) }
17
+ @queue.run_if do |link|
18
+ next false unless link.run?(profile.blacklist, profile.whitelist)
19
+ internal = link.internal?(addr_roots)
20
+ next false if profile.internal_only && !internal
21
+ if @results.child?(link.absolute)
22
+ @results.add link.absolute, link.parent
23
+ next false
24
+ end
25
+ @results.add link.absolute, link.parent
26
+ if profile.scheme_squash
27
+ alt = link.address
28
+ alt.scheme = alt.scheme == 'http' ? 'https' : 'http'
29
+ if @results.child?(alt.to_s)
30
+ @results.set_child_value link.absolute, @results.get_child_value(alt.to_s)
31
+ next false
32
+ end
33
+ end
34
+ true
35
+ end
36
+ @queue.on_complete do |link, response|
37
+ result = Recluse::Result.new response.code.to_s, response.errors
38
+ if response.success
39
+ internal = link.internal? addr_roots
40
+ if profile.redirect
41
+ result_link = Recluse::Link.new response.page.uri.to_s, link.parent
42
+ internal = result_link.internal? addr_roots
43
+ end
44
+ queue.add(response.page.links.map { |new_link| Link.new(new_link.uri.to_s, link.absolute) }) if internal && (response.page.class != Mechanize::File) && (response.page.class != Mechanize::Image)
45
+ end
46
+ @results.set_child_value link.absolute, result
47
+ unless quiet
48
+ puts "[#{profile.name.colorize(mode: :bold)}][#{result.code.colorize(color: result.color, mode: :bold)}][#{(internal ? 'internal' : 'external').colorize(mode: :bold)}] #{link.absolute}"
49
+ puts "\a^ #{'Error'.colorize(mode: :bold, color: :red)}: #{result.error}" unless result.error == false
50
+ end
51
+ end
52
+ end
53
+ end
54
+ end
55
+ end
@@ -0,0 +1,46 @@
1
+ require 'recluse/queue'
2
+ require 'recluse/hashtree'
3
+
4
+ module Recluse
5
+ module Tasks
6
+ ##
7
+ # Task interface. Runs the queue with customized behavior.
8
+ class Task
9
+ ##
10
+ # +HashTree+ representation of results.
11
+ attr_reader :results
12
+
13
+ ##
14
+ # +Queue+ of links to check.
15
+ attr_accessor :queue
16
+
17
+ ##
18
+ # Create new task.
19
+ def initialize(profile, queue_options: {}, results: nil)
20
+ @queue = Recluse::Queue.new(profile.email, queue_options)
21
+ if results.nil?
22
+ @results = Recluse::HashTree.new do |url1, url2|
23
+ url1, url2 = url2, url1 if url2.length > url1.length
24
+ # Detect if URL exists already, but just has a slash at end
25
+ (url1 == url2 || (url1.length == (url2.length + 1) && url1[-1] == '/' && url2[-1] != '/' && url1[0...-1] == url2))
26
+ end
27
+ else
28
+ @results = results
29
+ end
30
+ @queue.add profile.roots
31
+ end
32
+
33
+ ##
34
+ # Add link (or links) to the queue.
35
+ def add(link)
36
+ @queue.add link
37
+ end
38
+
39
+ ##
40
+ # Run the queue.
41
+ def run
42
+ @queue.run
43
+ end
44
+ end
45
+ end
46
+ end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: recluse
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.2
4
+ version: 2.0.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - James Anthony Bruno
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2017-03-16 00:00:00.000000000 Z
11
+ date: 2017-03-28 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: thor
@@ -203,8 +203,15 @@ files:
203
203
  - lib/recluse/info.rb
204
204
  - lib/recluse/link.rb
205
205
  - lib/recluse/profile.rb
206
+ - lib/recluse/queue.rb
207
+ - lib/recluse/response.rb
206
208
  - lib/recluse/result.rb
207
209
  - lib/recluse/statuscode.rb
210
+ - lib/recluse/tasks/assert.rb
211
+ - lib/recluse/tasks/find.rb
212
+ - lib/recluse/tasks/list.rb
213
+ - lib/recluse/tasks/status.rb
214
+ - lib/recluse/tasks/task.rb
208
215
  - recluse.gemspec
209
216
  homepage: https://github.com/czycha/recluse
210
217
  licenses: