recluse 1.0.2 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 396ec13b808c57e16b103fcc9dd55c228fdef9b5
4
- data.tar.gz: 89b5a9aa4d946fb4d3653a7422407b63fa8263fc
3
+ metadata.gz: 4bc973f210fae9eda00a82ccc421c957098182b3
4
+ data.tar.gz: 27b8c662b5e24f3b48e8d29277ef82d027d128a7
5
5
  SHA512:
6
- metadata.gz: b43648a734c3330cf4f536e0572e02069922574a275ec83c3fc7f2b60019274741d361ce61acc469b26222211c27dc630ec975ae7f59bf8bf313c906edd1a079
7
- data.tar.gz: 04f1f680c9dc0f6fad64f97ae410ca1233a28a52276059d652449c3eac4baa53da1068fe487c5d88f03ed791e4b8cb0011782c53051c6f182990a8869f2139d5
6
+ metadata.gz: 528286fe546e0862abd045b13abef5722a102aa3e9a660d93b08206683cf01a8a61593de6bfd40e93bbeff765a82890ebf593a8b70d97125d683ce7bead4c38f
7
+ data.tar.gz: cb4fcf8872a03d57635756616813603556a9099498f8376dc1dfc9e8e7363fbe71017af2c95b61801f6b15691746c1d4fc909c2de9537e8021be113c5e3873e4
data/README.md CHANGED
@@ -228,6 +228,43 @@ List the YAML info of the profile.
228
228
 
229
229
  Bug reports and pull requests are welcome on GitHub.
230
230
 
231
+ ## Extending
232
+
233
+ Recluse is modular so you can add tasks if you want. Below is an example of adding your own task to Recluse.
234
+
235
+ ```ruby
236
+ require 'recluse'
237
+
238
+ module MyModule
239
+ ##
240
+ # Create a task object
241
+ class MyTask < Recluse::Tasks::Task
242
+ ##
243
+ # First argument must be the profile. The rest are hash arguments specific for the task.
244
+ def initialize(profile, option1: false, option2: true, results: nil)
245
+ # Sets up everything based on the profile, queue-specific options, and can also prepopulate results.
246
+ super(profile, queue_options: { redirect: profile.redirect }, results: results)
247
+ @queue.run_if do |link|
248
+ # Run a link if this function returns true.
249
+ # Link is a Recluse::Link object.
250
+ end
251
+ @queue.on_complete do |link, response|
252
+ # Run this function after the page has either successfully been retrieved, or failed to be retrieved.
253
+ # Link is a Recluse::Link object.
254
+ # Response is a Recluse::Response object.
255
+ end
256
+ end
257
+ end
258
+ end
259
+
260
+ # Add your task to the task list under the key 'my_task'.
261
+ Recluse::Tasks.add_task(:my_task, MyModule::MyTask)
262
+
263
+ # You can now access 'my_task' like you would the default Recluse tasks.
264
+ my_profile = Recluse::Profile.load('my_profile')
265
+ my_profile.test(:my_task, option1: true, option2: true)
266
+ results = my_profile.results[:my_task]
267
+ ```
231
268
 
232
269
  ## License
233
270
 
@@ -24,7 +24,7 @@ module Recluse
24
24
  parent_count = 0
25
25
  case group_by
26
26
  when :page
27
- report = profile.results.parents
27
+ report = profile.results[:find].parents
28
28
  CSV.open(csv_path, 'w+') do |csv|
29
29
  csv << ['Page', 'Matching URLs']
30
30
  report.each do |parent, children|
@@ -35,7 +35,7 @@ module Recluse
35
35
  end
36
36
  end
37
37
  when :none
38
- report = profile.results.parents
38
+ report = profile.results[:find].parents
39
39
  CSV.open(csv_path, 'w+') do |csv|
40
40
  csv << ['Matching URL', 'Page']
41
41
  report.each do |parent, children|
@@ -47,7 +47,7 @@ module Recluse
47
47
  end
48
48
  end
49
49
  when :url
50
- report = profile.results.children
50
+ report = profile.results[:find].children
51
51
  CSV.open(csv_path, 'w+') do |csv|
52
52
  csv << ['Matching URL', 'Pages']
53
53
  parents = Set.new
@@ -61,7 +61,7 @@ module Recluse
61
61
  parent_count = parents.length
62
62
  end
63
63
  end
64
- total = profile.results.parents.keys.length
64
+ total = profile.results[:find].parents.keys.length
65
65
  puts "Total pages:\t#{total}"
66
66
  puts "Matched URLs:\t#{child_count}"
67
67
  puts "Pages with matches:\t#{parent_count}\t#{perc parent_count, total}%"
@@ -87,7 +87,7 @@ module Recluse
87
87
  valid_status = proc do |code|
88
88
  (includes.any? { |include_code| include_code.equal?(code) }) && (excludes.none? { |exclude_code| exclude_code.equal?(code) })
89
89
  end
90
- report = profile.results.children
90
+ report = profile.results[:status].children
91
91
  CSV.open(csv_path, 'w+') do |csv|
92
92
  csv << ['Status code', 'URL', page_label, 'With error']
93
93
  report.each do |child, info|
@@ -118,7 +118,7 @@ module Recluse
118
118
 
119
119
  def assert_save(profile, csv_path, report_vals)
120
120
  puts 'Saving report...'
121
- report = profile.results.children
121
+ report = profile.results[:assert].children
122
122
  counts = {}
123
123
  CSV.open(csv_path, 'w+') do |csv|
124
124
  csv << ['Selector', 'Exists', 'On page']
@@ -182,8 +182,8 @@ module Recluse
182
182
  Signal.trap sig, &ending
183
183
  end
184
184
  (0...profile_queue.length).each do |i|
185
- profile.results = profile_queue[i - 1].results unless i.zero?
186
- profile.status
185
+ profile.results[:status] = profile_queue[i - 1].results[:status] unless i.zero?
186
+ profile.test :status
187
187
  profile = profile_queue[i + 1] if i + 1 < profile_queue.length
188
188
  end
189
189
  %w(INT TERM).each do |sig|
@@ -219,8 +219,8 @@ module Recluse
219
219
  Signal.trap sig, &ending
220
220
  end
221
221
  (0...profile_queue.length).each do |i|
222
- profile.results = profile_queue[i - 1].results unless i.zero?
223
- profile.find options['globs']
222
+ profile.results[:find] = profile_queue[i - 1].results[:find] unless i.zero?
223
+ profile.test(:find, globs: options['globs'])
224
224
  profile = profile_queue[i + 1] if i + 1 < profile_queue.length
225
225
  end
226
226
  %w(INT TERM).each do |sig|
@@ -265,9 +265,10 @@ module Recluse
265
265
  %w(INT TERM).each do |sig|
266
266
  Signal.trap sig, &ending
267
267
  end
268
+
268
269
  (0...profile_queue.length).each do |i|
269
- profile.results = profile_queue[i - 1].results unless i.zero?
270
- profile.assert options['exists']
270
+ profile.results[:assert] = profile_queue[i - 1].results[:assert] unless i.zero?
271
+ profile.test(:assert, selectors: options['exists'])
271
272
  profile = profile_queue[i + 1] if i + 1 < profile_queue.length
272
273
  end
273
274
  %w(INT TERM).each do |sig|
@@ -1,7 +1,7 @@
1
1
  module Recluse
2
2
  ##
3
3
  # Version of the gem.
4
- VERSION = '1.0.2'.freeze
4
+ VERSION = '2.0.0'.freeze
5
5
 
6
6
  ##
7
7
  # Homepage of the gem (also used in user-agent).
@@ -42,6 +42,12 @@ module Recluse
42
42
  @absolute
43
43
  end
44
44
 
45
+ ##
46
+ # Inspection
47
+ def inspect
48
+ to_s
49
+ end
50
+
45
51
  ##
46
52
  # Is the link internal compared to +Addressable::URI+ roots?
47
53
  def internal?(addrroots, scheme_squash: false)
@@ -2,6 +2,7 @@ require 'recluse/hashtree'
2
2
  require 'recluse/link'
3
3
  require 'recluse/result'
4
4
  require 'recluse/info'
5
+ require 'recluse/tasks/list'
5
6
  require 'addressable/uri'
6
7
  require 'mechanize'
7
8
  require 'colorize'
@@ -45,14 +46,18 @@ module Recluse
45
46
  # HTTP and HTTPS schemed URLs are treated as equal. Optional. Defaults to +false+.
46
47
  attr_accessor :scheme_squash
47
48
 
48
- ##
49
- # +HashTree+ representation of results.
50
- attr_accessor :results
51
-
52
49
  ##
53
50
  # When enabled, will follow redirects and report only the status code for the page that is landed upon. When disabled, will report the redirect status code. Defaults to +false+.
54
51
  attr_accessor :redirect
55
52
 
53
+ ##
54
+ # Hash of the task objects created by +test+, keyed by task name.
55
+ attr_accessor :tasks
56
+
57
+ ##
58
+ # Hash of resulting +HashTree+s.
59
+ attr_accessor :results
60
+
56
61
  ##
57
62
  # Create a profile.
58
63
  def initialize(
@@ -68,17 +73,20 @@ module Recluse
68
73
  raise ProfileError, 'Profile needs roots for starting point' if roots.empty?
69
74
  @name = name
70
75
  @email = email
71
- @roots = roots
76
+ @roots = roots.map do |root|
77
+ if root.class == Link
78
+ root
79
+ else
80
+ Link.new(root, :root)
81
+ end
82
+ end
72
83
  @blacklist = blacklist
73
84
  @whitelist = whitelist
74
85
  @internal_only = internal_only
75
86
  @scheme_squash = scheme_squash
76
87
  @redirect = redirect
77
- @results = HashTree.new do |url1, url2|
78
- url1, url2 = url2, url1 if url2.length > url1.length
79
- # Detect if URL exists already, but just has a slash at end
80
- (url1 == url2 || (url1.length == (url2.length + 1) && url1[-1] == '/' && url2[-1] != '/' && url1[0...-1] == url2))
81
- end
88
+ @tasks = {}
89
+ @results = {}
82
90
  end
83
91
 
84
92
  ##
@@ -96,150 +104,18 @@ module Recluse
96
104
  end
97
105
 
98
106
  ##
99
- # Starting from the roots, goes through each runnable link and records the referrer, the status code, and any errors.
100
- # Results are saved in <tt>@results</tt>.
101
- def status(quiet: false)
102
- queue = @roots.map { |url| Link.new(url, :root) }
103
- addrroot = @roots.map { |url| Addressable::URI.parse url }
104
- raise ProfileError, 'No roots to start from' if queue.empty?
105
- agent = create_agent
106
- while queue.length >= 1
107
- element = queue.shift
108
- next unless element.run?(@blacklist, @whitelist)
109
- internal = element.internal?(addrroot)
110
- next if @internal_only && !internal
111
- if @results.child?(element.absolute)
112
- @results.add element.absolute, element.parent
113
- next
114
- end
115
- @results.add element.absolute, element.parent
116
- if @scheme_squash
117
- alt = element.address
118
- alt.scheme = alt.scheme == 'http' ? 'https' : 'http'
119
- if @results.child?(alt.to_s)
120
- @results.set_child_value element.absolute, @results.get_child_value(alt.to_s)
121
- next
122
- end
123
- end
124
- result = Result.new 'idk', false
125
- begin
126
- page = agent.get element.absolute
127
- result.code = page.code
128
- if @redirect
129
- result_link = Link.new(page.uri.to_s, element.parent)
130
- internal = result_link.internal?(addrroot)
131
- end
132
- queue += page.links.map { |link| Link.new(link.uri.to_s, element.absolute) } if internal && (page.class != Mechanize::File) && (page.class != Mechanize::Image)
133
- rescue Mechanize::ResponseCodeError => code
134
- result.code = code.response_code
135
- rescue => e
136
- result.error = e
137
- end
138
- @results.set_child_value element.absolute, result
139
- unless quiet
140
- puts "[#{@name.colorize(mode: :bold)}][#{result.code.colorize(color: result.color, mode: :bold)}][#{(internal ? 'internal' : 'external').colorize(mode: :bold)}] #{element.absolute}"
141
- puts "\a^ #{'Error'.colorize(mode: :bold, color: :red)}: #{result.error}" unless result.error == false
142
- end
143
- end
144
- end
145
-
146
- ##
147
- # Find links matching glob patterns, starting from the roots. Overrides (but does not overwrite) +internal_only+ behavior to +true+.
148
- def find(glob, quiet: false)
149
- queue = @roots.map { |url| Link.new(url, :root) }
150
- addrroot = @roots.map { |url| Addressable::URI.parse url }
151
- raise ProfileError, 'No roots to start from' if queue.empty?
152
- progress = ProgressBar.create(total: nil, format: '|%B|') unless quiet
153
- agent = create_agent
154
- while queue.length >= 1
155
- element = queue.shift
156
- match = element.match? glob
157
- if match
158
- @results.add element.absolute, element.parent
159
- progress.log "[#{@name.colorize(mode: :bold)}][#{'found'.colorize(color: :green, mode: :bold)}] #{element.parent} => #{element.absolute}" unless quiet
160
- end
161
- next unless element.run?(@blacklist, @whitelist)
162
- internal = element.internal?(addrroot)
163
- next unless internal
164
- next if @results.parent?(element.absolute)
165
- if @scheme_squash
166
- alt = element.address
167
- alt.scheme = alt.scheme == 'http' ? 'https' : 'http'
168
- next if @results.parent?(alt.to_s)
169
- end
170
- @results.add_parent element.absolute
171
- result = Result.new 'idk', false
172
- begin
173
- page = agent.get element.absolute
174
- result.code = page.code
175
- if @redirect
176
- result_link = Link.new(page.uri.to_s, element.parent)
177
- next unless result_link.internal?(addrroot)
178
- end
179
- queue += page.links.map { |link| Link.new(link.uri.to_s, element.absolute) } unless (page.class == Mechanize::File) || (page.class == Mechanize::Image)
180
- rescue Mechanize::ResponseCodeError => code
181
- result.code = code.response_code
182
- rescue => e
183
- result.error = e
184
- end
185
- progress.increment unless quiet
186
- unless quiet || (result.error == false)
187
- progress.log "[#{@name.colorize(mode: :bold)}][#{result.code.colorize(color: result.color, mode: :bold)}] #{element.absolute}"
188
- progress.log "\a^ #{'Error'.colorize(mode: :bold, color: :red)}: #{result.error}"
189
- end
190
- end
191
- end
192
-
193
- ##
194
- # Asserts existence of CSS selectors.
195
- def assert(selectors, quiet: false)
196
- queue = @roots.map { |url| Link.new(url, :root) }
197
- addrroot = @roots.map { |url| Addressable::URI.parse url }
198
- raise ProfileError, 'No roots to start from' if queue.empty?
199
- agent = create_agent
200
- while queue.length >= 1
201
- element = queue.shift
202
- internal = element.internal?(addrroot)
203
- next unless element.run?(@blacklist, @whitelist) && internal && !@results.child?(element.absolute)
204
- if @scheme_squash
205
- alt = element.address
206
- alt.scheme = alt.scheme == 'http' ? 'https' : 'http'
207
- next if @results.child?(alt.to_s)
208
- end
209
- @results.add_child element.absolute
210
- existence = nil
211
- result = Result.new 'idk', false
212
- begin
213
- page = agent.get element.absolute
214
- result.code = page.code
215
- if @redirect
216
- result_link = Link.new(page.uri.to_s, element.parent)
217
- next unless result_link.internal?(addrroot)
218
- end
219
- unless (page.class == Mechanize::File) || (page.class == Mechanize::Image)
220
- existence = {}
221
- selectors.each do |selector|
222
- existence[selector] = !page.css(selector).empty?
223
- end
224
- @results.set_child_value element.absolute, existence
225
- queue += page.links.map { |link| Link.new(link.uri.to_s, element.absolute) }
226
- end
227
- rescue Mechanize::ResponseCodeError => code
228
- result.code = code.response_code
229
- rescue => e
230
- result.error = e
231
- end
232
- unless quiet
233
- if result.error != false
234
- puts "[#{@name.colorize(mode: :bold)}][#{result.code.colorize(color: result.color, mode: :bold)}] #{element.absolute}"
235
- puts "\a^ #{'Error'.colorize(mode: :bold, color: :red)}: #{result.error}"
236
- elsif !existence.nil?
237
- existence.each do |selector, exists|
238
- puts "[#{@name.colorize(mode: :bold)}][#{selector.colorize(mode: :bold)}][#{exists.to_s.colorize(color: (exists ? :green : :red), mode: :bold)}] #{element.absolute}"
239
- end
240
- end
107
+ # Runs the task registered under +key+ with the given options and returns its results.
108
+ def test(key, options = {})
109
+ unless @results.key?(key) && @results[key].class == Recluse::HashTree
110
+ @results[key] = Recluse::HashTree.new do |url1, url2|
111
+ url1, url2 = url2, url1 if url2.length > url1.length
112
+ # Detect if URL exists already, but just has a slash at end
113
+ (url1 == url2 || (url1.length == (url2.length + 1) && url1[-1] == '/' && url2[-1] != '/' && url1[0...-1] == url2))
241
114
  end
242
115
  end
116
+ @tasks[key] = Recluse::Tasks.get(key).new(self, options.merge(results: @results[key]))
117
+ @tasks[key].run
118
+ @results[key]
243
119
  end
244
120
 
245
121
  ##
@@ -249,7 +125,7 @@ module Recluse
249
125
  fname = "#{@name}.yaml"
250
126
  options = uconf[fname]
251
127
  options['name'] = @name
252
- options['roots'] = @roots
128
+ options['roots'] = @roots.map(&:to_s)
253
129
  options['email'] = @email
254
130
  options['blacklist'] = @blacklist
255
131
  options['whitelist'] = @whitelist
@@ -264,7 +140,9 @@ module Recluse
264
140
  def ==(other)
265
141
  return false if other.class != self.class
266
142
  instance_variables.all? do |ivar|
267
- ivar == '@results'.to_sym || instance_variable_get(ivar) == other.instance_variable_get(ivar)
143
+ next true if ivar == '@results'.to_sym
144
+ next true if ivar == '@roots' && instance_variable_get(ivar).map(&:to_s) == other.instance_variable_get(ivar).map(&:to_s)
145
+ instance_variable_get(ivar) == other.instance_variable_get(ivar)
268
146
  end
269
147
  end
270
148
 
@@ -0,0 +1,74 @@
1
+ require 'mechanize'
2
+ require 'recluse/response'
3
+
4
+ module Recluse
5
+ ##
6
+ # Queue of links to fetch. Hooks decide which links run and what happens after each one completes.
7
+ class Queue
8
+ ##
9
+ # Create an empty queue
10
+ def initialize(email, redirect: false)
11
+ @links = []
12
+ @run_if = proc { true }
13
+ @on_complete = proc { |link, response| }
14
+ @redirect = redirect
15
+ @email = email
16
+ @agent = Mechanize.new do |a|
17
+ a.ssl_version = 'TLSv1'
18
+ a.verify_mode = OpenSSL::SSL::VERIFY_NONE
19
+ a.max_history = nil
20
+ a.follow_meta_refresh = true
21
+ a.keep_alive = false
22
+ a.redirect_ok = @redirect
23
+ a.user_agent = "Mozilla/5.0 (compatible; recluse/#{Recluse::VERSION}; +#{Recluse::URL}) #{@email}"
24
+ end
25
+ end
26
+
27
+ ##
28
+ # Add to queue.
29
+ def add(link)
30
+ @links += [*link]
31
+ end
32
+
33
+ ##
34
+ # If the test is true, run the link. Procedure takes the link as input.
35
+ def run_if(&block)
36
+ @run_if = block
37
+ end
38
+
39
+ ##
40
+ # Run when a link has been checked. Procedure takes the link and response as inputs.
41
+ def on_complete(&block)
42
+ @on_complete = block
43
+ end
44
+
45
+ ##
46
+ # Run a link
47
+ def run_link(link)
48
+ response = Response.new
49
+ return nil unless @run_if.call(link)
50
+ begin
51
+ response.page = @agent.get link.absolute
52
+ response.code = response.page.code
53
+ response.success = true
54
+ rescue Mechanize::ResponseCodeError => code
55
+ response.code = code.response_code
56
+ response.success = false
57
+ rescue => error
58
+ response.errors = error
59
+ response.success = false
60
+ end
61
+ @on_complete.call link, response
62
+ response
63
+ end
64
+
65
+ ##
66
+ # Run queue
67
+ def run
68
+ until @links.empty?
69
+ link = @links.shift
70
+ run_link link
71
+ end
72
+ end
73
+ end
74
+ end
@@ -0,0 +1,39 @@
1
+ require 'mechanize'
2
+ require 'recluse/statuscode'
3
+
4
+ module Recluse
5
+ ##
6
+ # Response wrapper.
7
+ class Response
8
+ ##
9
+ # +Mechanize::Page+ of the response page. Might be +nil+.
10
+ attr_accessor :page
11
+
12
+ ##
13
+ # +StatusCode+ of the response.
14
+ attr_reader :code
15
+
16
+ ##
17
+ # Error raised while fetching the page, or +false+ if there was none.
18
+ attr_accessor :errors
19
+
20
+ ##
21
+ # Whether the page was successfully accessed or not.
22
+ attr_accessor :success
23
+
24
+ ##
25
+ # Create new response.
26
+ def initialize(page: nil, errors: false, code: StatusCode.new('idk'), success: false)
27
+ @page = page
28
+ @code = code
29
+ @errors = errors
30
+ @success = success
31
+ end
32
+
33
+ ##
34
+ # Set a new status code.
35
+ def code=(new_code)
36
+ @code = StatusCode.new new_code
37
+ end
38
+ end
39
+ end
@@ -20,7 +20,7 @@ module Recluse
20
20
  ##
21
21
  # Returns the HTTP status code.
22
22
  def inspect
23
- @code
23
+ @code.to_s
24
24
  end
25
25
 
26
26
  ##
@@ -0,0 +1,59 @@
1
+ require 'recluse/tasks/task'
2
+ require 'recluse/link'
3
+ require 'recluse/result'
4
+ require 'addressable/uri'
5
+ require 'colorize'
6
+
7
+ module Recluse
8
+ module Tasks
9
+ ##
10
+ # Asserts existence of CSS selectors.
11
+ class Assert < Task
12
+ ##
13
+ # Create new assertion task.
14
+ def initialize(profile, selectors: [], quiet: false, results: nil)
15
+ super(profile, queue_options: { redirect: profile.redirect }, results: results)
16
+ addr_roots = profile.roots.map { |root| Addressable::URI.parse(root.url) }
17
+ @queue.run_if do |link|
18
+ internal = link.internal?(addr_roots)
19
+ next false unless link.run?(profile.blacklist, profile.whitelist) && internal && !@results.child?(link.absolute)
20
+ if profile.scheme_squash
21
+ alt = link.address
22
+ alt.scheme = alt.scheme == 'http' ? 'https' : 'http'
23
+ next false if @results.child?(alt.to_s)
24
+ end
25
+ @results.add_child link.absolute
26
+ true
27
+ end
28
+ @queue.on_complete do |link, response|
29
+ existence = nil
30
+ result = Recluse::Result.new response.code.to_s, response.errors
31
+ if response.success
32
+ if profile.redirect
33
+ result_link = Link.new(response.page.uri.to_s, link.parent)
34
+ next unless result_link.internal?(addr_roots)
35
+ end
36
+ unless (response.page.class == Mechanize::File) || (response.page.class == Mechanize::Image)
37
+ existence = {}
38
+ selectors.each do |selector|
39
+ existence[selector] = !response.page.css(selector).empty?
40
+ end
41
+ @results.set_child_value link.absolute, existence
42
+ @queue.add(response.page.links.map { |new_link| Link.new(new_link.uri.to_s, link.absolute) })
43
+ end
44
+ end
45
+ unless quiet
46
+ if result.error != false
47
+ puts "[#{profile.name.colorize(mode: :bold)}][#{result.code.colorize(color: result.color, mode: :bold)}] #{link.absolute}"
48
+ puts "\a^ #{'Error'.colorize(mode: :bold, color: :red)}: #{result.error}"
49
+ elsif !existence.nil?
50
+ existence.each do |selector, exists|
51
+ puts "[#{profile.name.colorize(mode: :bold)}][#{selector.colorize(mode: :bold)}][#{exists.to_s.colorize(color: (exists ? :green : :red), mode: :bold)}] #{link.absolute}"
52
+ end
53
+ end
54
+ end
55
+ end
56
+ end
57
+ end
58
+ end
59
+ end
@@ -0,0 +1,55 @@
1
+ require 'recluse/tasks/task'
2
+ require 'recluse/link'
3
+ require 'recluse/result'
4
+ require 'addressable/uri'
5
+ require 'colorize'
6
+ require 'ruby-progressbar'
7
+
8
+ module Recluse
9
+ module Tasks
10
+ ##
11
+ # Find links matching glob patterns, starting from the roots. Overrides (but does not overwrite) +internal_only+ behavior to +true+.
12
+ class Find < Task
13
+ ##
14
+ # Create new find task.
15
+ def initialize(profile, globs: [], quiet: false, results: nil)
16
+ super(profile, queue_options: { redirect: profile.redirect }, results: results)
17
+ addr_roots = profile.roots.map { |root| Addressable::URI.parse(root.url) }
18
+ progress = ProgressBar.create(total: nil, format: '|%B|') unless quiet
19
+ @queue.run_if do |link|
20
+ match = link.match? globs
21
+ if match
22
+ @results.add link.absolute, link.parent
23
+ progress.log "[#{profile.name.colorize(mode: :bold)}][#{'found'.colorize(color: :green, mode: :bold)}] #{link.parent} => #{link.absolute}" unless quiet
24
+ end
25
+ next false unless link.run?(profile.blacklist, profile.whitelist)
26
+ internal = link.internal?(addr_roots)
27
+ next false unless internal
28
+ next false if @results.parent?(link.absolute)
29
+ if profile.scheme_squash
30
+ alt = link.address
31
+ alt.scheme = alt.scheme == 'http' ? 'https' : 'http'
32
+ next false if @results.parent?(alt.to_s)
33
+ end
34
+ @results.add_parent link.absolute
35
+ true
36
+ end
37
+ @queue.on_complete do |link, response|
38
+ result = Recluse::Result.new response.code.to_s, response.errors
39
+ if response.success
40
+ if profile.redirect
41
+ result_link = Recluse::Link.new(response.page.uri.to_s, link.parent)
42
+ next unless result_link.internal?(addr_roots)
43
+ end
44
+ @queue.add(response.page.links.map { |new_link| Link.new(new_link.uri.to_s, link.absolute) }) unless (response.page.class == Mechanize::File) || (response.page.class == Mechanize::Image)
45
+ end
46
+ progress.increment unless quiet
47
+ unless quiet || (result.error == false)
48
+ progress.log "[#{profile.name.colorize(mode: :bold)}][#{result.code.colorize(color: result.color, mode: :bold)}] #{link.absolute}"
49
+ progress.log "\a^ #{'Error'.colorize(mode: :bold, color: :red)}: #{result.error}"
50
+ end
51
+ end
52
+ end
53
+ end
54
+ end
55
+ end
@@ -0,0 +1,42 @@
1
+ require 'recluse/tasks/status'
2
+ require 'recluse/tasks/assert'
3
+ require 'recluse/tasks/find'
4
+
5
+ module Recluse
6
+ ##
7
+ # Tasks are tests for Recluse.
8
+ module Tasks
9
+ ##
10
+ # Hash of available tasks.
11
+ @@list = {
12
+ status: Recluse::Tasks::Status,
13
+ assert: Recluse::Tasks::Assert,
14
+ find: Recluse::Tasks::Find
15
+ }
16
+ class << self
17
+ ##
18
+ # Add task to the list.
19
+ def add_task(key, task_class)
20
+ list[key] = task_class
21
+ end
22
+
23
+ ##
24
+ # Hash of available tasks.
25
+ def list
26
+ @@list
27
+ end
28
+
29
+ ##
30
+ # Get task by key name.
31
+ def get(key)
32
+ @@list[key]
33
+ end
34
+
35
+ ##
36
+ # Do something for each task.
37
+ def each(&block)
38
+ @@list.each(&block)
39
+ end
40
+ end
41
+ end
42
+ end
@@ -0,0 +1,55 @@
1
+ require 'recluse/tasks/task'
2
+ require 'recluse/link'
3
+ require 'recluse/result'
4
+ require 'addressable/uri'
5
+ require 'colorize'
6
+
7
+ module Recluse
8
+ module Tasks
9
+ ##
10
+ # Starting from the roots, goes through each runnable link and records the referrer, the status code, and any errors.
11
+ class Status < Task
12
+ ##
13
+ # Create new status task.
14
+ def initialize(profile, quiet: false, results: nil)
15
+ super(profile, queue_options: { redirect: profile.redirect }, results: results)
16
+ addr_roots = profile.roots.map { |root| Addressable::URI.parse(root.url) }
17
+ @queue.run_if do |link|
18
+ next false unless link.run?(profile.blacklist, profile.whitelist)
19
+ internal = link.internal?(addr_roots)
20
+ next false if profile.internal_only && !internal
21
+ if @results.child?(link.absolute)
22
+ @results.add link.absolute, link.parent
23
+ next false
24
+ end
25
+ @results.add link.absolute, link.parent
26
+ if profile.scheme_squash
27
+ alt = link.address
28
+ alt.scheme = alt.scheme == 'http' ? 'https' : 'http'
29
+ if @results.child?(alt.to_s)
30
+ @results.set_child_value link.absolute, @results.get_child_value(alt.to_s)
31
+ next false
32
+ end
33
+ end
34
+ true
35
+ end
36
+ @queue.on_complete do |link, response|
37
+ result = Recluse::Result.new response.code.to_s, response.errors
38
+ if response.success
39
+ internal = link.internal? addr_roots
40
+ if profile.redirect
41
+ result_link = Recluse::Link.new response.page.uri.to_s, link.parent
42
+ internal = result_link.internal? addr_roots
43
+ end
44
+ queue.add(response.page.links.map { |new_link| Link.new(new_link.uri.to_s, link.absolute) }) if internal && (response.page.class != Mechanize::File) && (response.page.class != Mechanize::Image)
45
+ end
46
+ @results.set_child_value link.absolute, result
47
+ unless quiet
48
+ puts "[#{profile.name.colorize(mode: :bold)}][#{result.code.colorize(color: result.color, mode: :bold)}][#{(internal ? 'internal' : 'external').colorize(mode: :bold)}] #{link.absolute}"
49
+ puts "\a^ #{'Error'.colorize(mode: :bold, color: :red)}: #{result.error}" unless result.error == false
50
+ end
51
+ end
52
+ end
53
+ end
54
+ end
55
+ end
@@ -0,0 +1,46 @@
1
+ require 'recluse/queue'
2
+ require 'recluse/hashtree'
3
+
4
+ module Recluse
5
+ module Tasks
6
+ ##
7
+ # Task interface. Runs the queue with customized behavior.
8
+ class Task
9
+ ##
10
+ # +HashTree+ representation of results.
11
+ attr_reader :results
12
+
13
+ ##
14
+ # +Queue+ of links to check.
15
+ attr_accessor :queue
16
+
17
+ ##
18
+ # Create new task.
19
+ def initialize(profile, queue_options: {}, results: nil)
20
+ @queue = Recluse::Queue.new(profile.email, queue_options)
21
+ if results.nil?
22
+ @results = Recluse::HashTree.new do |url1, url2|
23
+ url1, url2 = url2, url1 if url2.length > url1.length
24
+ # Detect if URL exists already, but just has a slash at end
25
+ (url1 == url2 || (url1.length == (url2.length + 1) && url1[-1] == '/' && url2[-1] != '/' && url1[0...-1] == url2))
26
+ end
27
+ else
28
+ @results = results
29
+ end
30
+ @queue.add profile.roots
31
+ end
32
+
33
+ ##
34
+ # Add link (or links) to the queue.
35
+ def add(link)
36
+ @queue.add link
37
+ end
38
+
39
+ ##
40
+ # Run the queue.
41
+ def run
42
+ @queue.run
43
+ end
44
+ end
45
+ end
46
+ end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: recluse
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.2
4
+ version: 2.0.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - James Anthony Bruno
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2017-03-16 00:00:00.000000000 Z
11
+ date: 2017-03-28 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: thor
@@ -203,8 +203,15 @@ files:
203
203
  - lib/recluse/info.rb
204
204
  - lib/recluse/link.rb
205
205
  - lib/recluse/profile.rb
206
+ - lib/recluse/queue.rb
207
+ - lib/recluse/response.rb
206
208
  - lib/recluse/result.rb
207
209
  - lib/recluse/statuscode.rb
210
+ - lib/recluse/tasks/assert.rb
211
+ - lib/recluse/tasks/find.rb
212
+ - lib/recluse/tasks/list.rb
213
+ - lib/recluse/tasks/status.rb
214
+ - lib/recluse/tasks/task.rb
208
215
  - recluse.gemspec
209
216
  homepage: https://github.com/czycha/recluse
210
217
  licenses: