arachni 0.2.4 → 0.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (79)
  1. data/CHANGELOG.md +33 -0
  2. data/README.md +2 -4
  3. data/Rakefile +15 -4
  4. data/bin/arachni +0 -0
  5. data/bin/arachni_web +0 -0
  6. data/bin/arachni_web_autostart +0 -0
  7. data/bin/arachni_xmlrpc +0 -0
  8. data/bin/arachni_xmlrpcd +0 -0
  9. data/bin/arachni_xmlrpcd_monitor +0 -0
  10. data/lib/arachni.rb +1 -1
  11. data/lib/framework.rb +36 -6
  12. data/lib/http.rb +12 -5
  13. data/lib/module/auditor.rb +482 -59
  14. data/lib/module/base.rb +17 -0
  15. data/lib/module/manager.rb +26 -2
  16. data/lib/module/trainer.rb +1 -12
  17. data/lib/module/utilities.rb +12 -0
  18. data/lib/parser/auditable.rb +8 -3
  19. data/lib/parser/elements.rb +11 -0
  20. data/lib/parser/page.rb +3 -1
  21. data/lib/parser/parser.rb +130 -18
  22. data/lib/rpc/xml/server/dispatcher.rb +21 -0
  23. data/lib/spider.rb +141 -82
  24. data/lib/ui/cli/cli.rb +2 -3
  25. data/lib/ui/web/addon_manager.rb +273 -0
  26. data/lib/ui/web/addons/autodeploy.rb +172 -0
  27. data/lib/ui/web/addons/autodeploy/lib/manager.rb +291 -0
  28. data/lib/ui/web/addons/autodeploy/views/index.erb +124 -0
  29. data/lib/ui/web/addons/sample.rb +78 -0
  30. data/lib/ui/web/addons/sample/views/index.erb +4 -0
  31. data/lib/ui/web/addons/scheduler.rb +139 -0
  32. data/lib/ui/web/addons/scheduler/views/index.erb +131 -0
  33. data/lib/ui/web/addons/scheduler/views/options.erb +93 -0
  34. data/lib/ui/web/dispatcher_manager.rb +80 -13
  35. data/lib/ui/web/instance_manager.rb +87 -0
  36. data/lib/ui/web/scheduler.rb +166 -0
  37. data/lib/ui/web/server.rb +142 -202
  38. data/lib/ui/web/server/public/js/jquery-ui-timepicker.js +985 -0
  39. data/lib/ui/web/server/public/plugins/sample/style.css +0 -0
  40. data/lib/ui/web/server/public/style.css +42 -0
  41. data/lib/ui/web/server/views/addon.erb +15 -0
  42. data/lib/ui/web/server/views/addons.erb +46 -0
  43. data/lib/ui/web/server/views/dispatchers.erb +1 -1
  44. data/lib/ui/web/server/views/instance.erb +9 -11
  45. data/lib/ui/web/server/views/layout.erb +14 -1
  46. data/lib/ui/web/server/views/welcome.erb +7 -6
  47. data/lib/ui/web/utilities.rb +134 -0
  48. data/modules/audit/code_injection_timing.rb +6 -2
  49. data/modules/audit/code_injection_timing/payloads.txt +2 -2
  50. data/modules/audit/os_cmd_injection_timing.rb +7 -3
  51. data/modules/audit/os_cmd_injection_timing/payloads.txt +1 -1
  52. data/modules/audit/sqli_blind_rdiff.rb +18 -233
  53. data/modules/audit/sqli_blind_rdiff/payloads.txt +5 -0
  54. data/modules/audit/sqli_blind_timing.rb +9 -2
  55. data/path_extractors/anchors.rb +1 -1
  56. data/path_extractors/forms.rb +1 -1
  57. data/path_extractors/frames.rb +1 -1
  58. data/path_extractors/generic.rb +1 -1
  59. data/path_extractors/links.rb +1 -1
  60. data/path_extractors/meta_refresh.rb +1 -1
  61. data/path_extractors/scripts.rb +1 -1
  62. data/path_extractors/sitemap.rb +1 -1
  63. data/plugins/proxy/server.rb +3 -2
  64. data/plugins/waf_detector.rb +0 -3
  65. metadata +37 -34
  66. data/lib/anemone/cookie_store.rb +0 -35
  67. data/lib/anemone/core.rb +0 -371
  68. data/lib/anemone/exceptions.rb +0 -5
  69. data/lib/anemone/http.rb +0 -144
  70. data/lib/anemone/page.rb +0 -338
  71. data/lib/anemone/page_store.rb +0 -160
  72. data/lib/anemone/storage.rb +0 -34
  73. data/lib/anemone/storage/base.rb +0 -75
  74. data/lib/anemone/storage/exceptions.rb +0 -15
  75. data/lib/anemone/storage/mongodb.rb +0 -89
  76. data/lib/anemone/storage/pstore.rb +0 -50
  77. data/lib/anemone/storage/redis.rb +0 -90
  78. data/lib/anemone/storage/tokyo_cabinet.rb +0 -57
  79. data/lib/anemone/tentacle.rb +0 -40
metadata CHANGED
@@ -2,7 +2,7 @@
2
2
  name: arachni
3
3
  version: !ruby/object:Gem::Version
4
4
  prerelease:
5
- version: 0.2.4
5
+ version: "0.3"
6
6
  platform: ruby
7
7
  authors:
8
8
  - Tasos Laskos
@@ -10,7 +10,7 @@ autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
12
 
13
- date: 2011-07-01 00:00:00 +03:00
13
+ date: 2011-07-26 00:00:00 +03:00
14
14
  default_executable:
15
15
  dependencies:
16
16
  - !ruby/object:Gem::Dependency
@@ -47,102 +47,102 @@ dependencies:
47
47
  type: :runtime
48
48
  version_requirements: *id003
49
49
  - !ruby/object:Gem::Dependency
50
- name: robots
50
+ name: sys-proctable
51
51
  prerelease: false
52
52
  requirement: &id004 !ruby/object:Gem::Requirement
53
53
  none: false
54
54
  requirements:
55
55
  - - ~>
56
56
  - !ruby/object:Gem::Version
57
- version: 0.10.0
57
+ version: 0.8.1
58
58
  type: :runtime
59
59
  version_requirements: *id004
60
60
  - !ruby/object:Gem::Dependency
61
- name: sys-proctable
61
+ name: terminal-table
62
62
  prerelease: false
63
63
  requirement: &id005 !ruby/object:Gem::Requirement
64
64
  none: false
65
65
  requirements:
66
66
  - - ~>
67
67
  - !ruby/object:Gem::Version
68
- version: 0.8.1
68
+ version: 1.4.2
69
69
  type: :runtime
70
70
  version_requirements: *id005
71
71
  - !ruby/object:Gem::Dependency
72
- name: terminal-table
72
+ name: sinatra
73
73
  prerelease: false
74
74
  requirement: &id006 !ruby/object:Gem::Requirement
75
75
  none: false
76
76
  requirements:
77
77
  - - ~>
78
78
  - !ruby/object:Gem::Version
79
- version: 1.4.2
79
+ version: 1.2.1
80
80
  type: :runtime
81
81
  version_requirements: *id006
82
82
  - !ruby/object:Gem::Dependency
83
- name: sinatra
83
+ name: datamapper
84
84
  prerelease: false
85
85
  requirement: &id007 !ruby/object:Gem::Requirement
86
86
  none: false
87
87
  requirements:
88
88
  - - ~>
89
89
  - !ruby/object:Gem::Version
90
- version: 1.2.1
90
+ version: 1.0.2
91
91
  type: :runtime
92
92
  version_requirements: *id007
93
93
  - !ruby/object:Gem::Dependency
94
- name: datamapper
94
+ name: rack_csrf
95
95
  prerelease: false
96
96
  requirement: &id008 !ruby/object:Gem::Requirement
97
97
  none: false
98
98
  requirements:
99
99
  - - ~>
100
100
  - !ruby/object:Gem::Version
101
- version: 1.0.2
101
+ version: 2.1.0
102
102
  type: :runtime
103
103
  version_requirements: *id008
104
104
  - !ruby/object:Gem::Dependency
105
- name: rack_csrf
105
+ name: rack-flash
106
106
  prerelease: false
107
107
  requirement: &id009 !ruby/object:Gem::Requirement
108
108
  none: false
109
109
  requirements:
110
110
  - - ~>
111
111
  - !ruby/object:Gem::Version
112
- version: 2.1.0
112
+ version: 0.1.1
113
113
  type: :runtime
114
114
  version_requirements: *id009
115
115
  - !ruby/object:Gem::Dependency
116
- name: rack-flash
116
+ name: json
117
117
  prerelease: false
118
118
  requirement: &id010 !ruby/object:Gem::Requirement
119
119
  none: false
120
120
  requirements:
121
121
  - - ~>
122
122
  - !ruby/object:Gem::Version
123
- version: 0.1.1
123
+ version: 1.4.6
124
124
  type: :runtime
125
125
  version_requirements: *id010
126
126
  - !ruby/object:Gem::Dependency
127
- name: json
127
+ name: dm-sqlite-adapter
128
128
  prerelease: false
129
129
  requirement: &id011 !ruby/object:Gem::Requirement
130
130
  none: false
131
131
  requirements:
132
132
  - - ~>
133
133
  - !ruby/object:Gem::Version
134
- version: 1.4.6
134
+ version: 1.0.2
135
135
  type: :runtime
136
136
  version_requirements: *id011
137
137
  - !ruby/object:Gem::Dependency
138
- name: dm-sqlite-adapter
138
+ name: net-ssh
139
139
  prerelease: false
140
140
  requirement: &id012 !ruby/object:Gem::Requirement
141
141
  none: false
142
142
  requirements:
143
143
  - - ~>
144
144
  - !ruby/object:Gem::Version
145
- version: 1.0.2
145
+ version: 2.1.4
146
146
  type: :runtime
147
147
  version_requirements: *id012
148
148
  description: " Arachni is a feature-full, modular, high-performance Ruby framework aimed towards\n helping penetration testers and administrators evaluate the security of web applications.\n\n Arachni is smart, it trains itself by learning from the HTTP responses it receives during the audit process.\n Unlike other scanners, Arachni takes into account the dynamic nature of web applications and can detect changes caused while travelling\n through the paths of a web application's cyclomatic complexity.\n This way attack/input vectors that would otherwise be undetectable by non-humans are seamlessly handled by Arachni.\n\n Finally, Arachni yields great performance due to its asynchronous HTTP model (courtesy of Typhoeus).\n Thus, you'll only be limited by the responsiveness of the server under audit and your available bandwidth.\n\n Note: Despite the fact that Arachni is mostly targeted towards web application security,\n it can easily be used for general purpose scraping, data-mining, etc with the addition of custom modules.\n"
@@ -190,20 +190,6 @@ files:
190
190
  - lib/mixins/observable.rb
191
191
  - lib/parser.rb
192
192
  - lib/component_options.rb
193
- - lib/anemone/storage.rb
194
- - lib/anemone/page.rb
195
- - lib/anemone/cookie_store.rb
196
- - lib/anemone/page_store.rb
197
- - lib/anemone/tentacle.rb
198
- - lib/anemone/storage/pstore.rb
199
- - lib/anemone/storage/mongodb.rb
200
- - lib/anemone/storage/redis.rb
201
- - lib/anemone/storage/exceptions.rb
202
- - lib/anemone/storage/base.rb
203
- - lib/anemone/storage/tokyo_cabinet.rb
204
- - lib/anemone/http.rb
205
- - lib/anemone/core.rb
206
- - lib/anemone/exceptions.rb
207
193
  - lib/report.rb
208
194
  - lib/options.rb
209
195
  - lib/issue.rb
@@ -262,6 +248,7 @@ files:
262
248
  - lib/ui/web/server/public/sidebar-bottom.jpg
263
249
  - lib/ui/web/server/public/nav-selected-left.jpg
264
250
  - lib/ui/web/server/public/js/jquery-1.4.4.min.js
251
+ - lib/ui/web/server/public/js/jquery-ui-timepicker.js
265
252
  - lib/ui/web/server/public/js/jquery-ui-1.8.9.custom.min.js
266
253
  - lib/ui/web/server/public/spider.png
267
254
  - lib/ui/web/server/public/banner.png
@@ -272,6 +259,7 @@ files:
272
259
  - lib/ui/web/server/public/icons/error.png
273
260
  - lib/ui/web/server/public/icons/ok.png
274
261
  - lib/ui/web/server/public/sidebar-h4.jpg
262
+ - lib/ui/web/server/public/plugins/sample/style.css
275
263
  - lib/ui/web/server/public/footer.jpg
276
264
  - lib/ui/web/server/tmp/placeholder
277
265
  - lib/ui/web/server/db/placeholder
@@ -281,6 +269,7 @@ files:
281
269
  - lib/ui/web/server/views/flash.erb
282
270
  - lib/ui/web/server/views/report_formats.erb
283
271
  - lib/ui/web/server/views/modules.erb
272
+ - lib/ui/web/server/views/addons.erb
284
273
  - lib/ui/web/server/views/plugins.erb
285
274
  - lib/ui/web/server/views/layout.erb
286
275
  - lib/ui/web/server/views/output_results.erb
@@ -290,13 +279,26 @@ files:
290
279
  - lib/ui/web/server/views/home.erb
291
280
  - lib/ui/web/server/views/welcome.erb
292
281
  - lib/ui/web/server/views/reports.erb
282
+ - lib/ui/web/server/views/addon.erb
293
283
  - lib/ui/web/server/views/dispatchers_edit.erb
294
284
  - lib/ui/web/server/views/settings.erb
285
+ - lib/ui/web/scheduler.rb
286
+ - lib/ui/web/utilities.rb
287
+ - lib/ui/web/addons/autodeploy.rb
288
+ - lib/ui/web/addons/scheduler.rb
289
+ - lib/ui/web/addons/scheduler/views/options.erb
290
+ - lib/ui/web/addons/scheduler/views/index.erb
291
+ - lib/ui/web/addons/sample/views/index.erb
292
+ - lib/ui/web/addons/autodeploy/lib/manager.rb
293
+ - lib/ui/web/addons/autodeploy/views/index.erb
294
+ - lib/ui/web/addons/sample.rb
295
295
  - lib/ui/web/report_manager.rb
296
296
  - lib/ui/web/server.rb
297
297
  - lib/ui/web/log.rb
298
298
  - lib/ui/web/dispatcher_manager.rb
299
299
  - lib/ui/web/output_stream.rb
300
+ - lib/ui/web/instance_manager.rb
301
+ - lib/ui/web/addon_manager.rb
300
302
  - lib/parser/auditable.rb
301
303
  - lib/parser/parser.rb
302
304
  - lib/parser/page.rb
@@ -364,6 +366,7 @@ files:
364
366
  - modules/audit/os_cmd_injection_timing.rb
365
367
  - modules/audit/ldapi.rb
366
368
  - modules/audit/ldapi/errors.txt
369
+ - modules/audit/sqli_blind_rdiff/payloads.txt
367
370
  - modules/audit/code_injection_timing.rb
368
371
  - modules/audit/os_cmd_injection_timing/payloads.txt
369
372
  - modules/audit/sqli/regexp_ids.txt
data/lib/anemone/cookie_store.rb DELETED
@@ -1,35 +0,0 @@
1
- require 'delegate'
2
- require 'webrick/cookie'
3
-
4
- class WEBrick::Cookie
5
- def expired?
6
- !!expires && expires < Time.now
7
- end
8
- end
9
-
10
- module Anemone
11
- class CookieStore < DelegateClass(Hash)
12
-
13
- def initialize(cookies = nil)
14
- @cookies = {}
15
- cookies.each { |name, value| @cookies[name] = WEBrick::Cookie.new(name, value) } if cookies
16
- super(@cookies)
17
- end
18
-
19
- def merge!(set_cookie_str)
20
- begin
21
- cookie_hash = WEBrick::Cookie.parse_set_cookies(set_cookie_str).inject({}) do |hash, cookie|
22
- hash[cookie.name] = cookie if !!cookie
23
- hash
24
- end
25
- @cookies.merge! cookie_hash
26
- rescue
27
- end
28
- end
29
-
30
- def to_s
31
- @cookies.values.reject { |cookie| cookie.expired? }.map { |cookie| "#{cookie.name}=#{cookie.value}" }.join(';')
32
- end
33
-
34
- end
35
- end
data/lib/anemone/core.rb DELETED
@@ -1,371 +0,0 @@
1
- =begin
2
- Arachni
3
- Copyright (c) 2010-2011 Tasos "Zapotek" Laskos <tasos.laskos@gmail.com>
4
-
5
- This is free software; you can copy and distribute and modify
6
- this program under the term of the GPL v2.0 License
7
- (See LICENSE file for details)
8
-
9
- =end
10
-
11
- require 'thread'
12
- require 'robots'
13
-
14
- opts = Arachni::Options.instance
15
- require opts.dir['lib'] + 'anemone/tentacle'
16
- require opts.dir['lib'] + 'anemone/page'
17
- require opts.dir['lib'] + 'anemone/exceptions'
18
- require opts.dir['lib'] + 'anemone/page_store'
19
- require opts.dir['lib'] + 'anemone/storage'
20
- require opts.dir['lib'] + 'anemone/storage/base'
21
-
22
- module Anemone
23
-
24
- VERSION = '0.5.0';
25
-
26
- #
27
- # Convenience method to start a crawl
28
- #
29
- def Anemone.crawl(urls, options = {}, &block)
30
- Core.crawl(urls, options, &block)
31
- end
32
-
33
-
34
- #
35
- # Overides Anemone's Core class method skip_link?( link )
36
- # to support regexp matching to the whole url and enforce redundancy checks.
37
- # <br/>
38
- # Messages were also added to inform the user in case of redundant URLs.
39
- #
40
- # @author: Tasos "Zapotek" Laskos
41
- # <tasos.laskos@gmail.com>
42
- # <zapotek@segfault.gr>
43
- # @version: 0.1
44
- #
45
- class Core
46
-
47
- include Arachni::UI::Output
48
-
49
- # PageStore storing all Page objects encountered during the crawl
50
- attr_reader :pages
51
- # Hash of options for the crawl
52
- attr_reader :opts
53
-
54
- DEFAULT_OPTS = {
55
- # run 4 Tentacle threads to fetch pages
56
- :threads => 4,
57
- # disable verbose output
58
- :verbose => false,
59
- # don't throw away the page response body after scanning it for links
60
- :discard_page_bodies => false,
61
- # identify self as Anemone/VERSION
62
- :user_agent => "Anemone/#{Anemone::VERSION}",
63
- # no delay between requests
64
- :delay => 0,
65
- # don't obey the robots exclusion protocol
66
- :obey_robots_txt => false,
67
- # by default, don't limit the depth of the crawl
68
- :depth_limit => false,
69
- # number of times HTTP redirects will be followed
70
- :redirect_limit => 5,
71
- # storage engine defaults to Hash in +process_options+ if none specified
72
- :storage => nil,
73
- # Hash of cookie name => value to send with HTTP requests
74
- :cookies => nil,
75
- # accept cookies from the server and send them back?
76
- :accept_cookies => false,
77
- # skip any link with a query string? e.g. http://foo.com/?u=user
78
- :skip_query_strings => false
79
- }
80
-
81
- # Create setter methods for all options to be called from the crawl block
82
- DEFAULT_OPTS.keys.each do |key|
83
- define_method "#{key}=" do |value|
84
- @opts[key.to_sym] = value
85
- end
86
- end
87
-
88
- #
89
- # Initialize the crawl with starting *urls* (single URL or Array of URLs)
90
- # and optional *block*
91
- #
92
- def initialize(urls, opts = {})
93
- @urls = [urls].flatten.map{ |url| url.is_a?(URI) ? url : URI(url) }
94
- @urls.each{ |url| url.path = '/' if url.path.empty? }
95
-
96
- @tentacles = []
97
- @on_every_page_blocks = []
98
- @on_pages_like_blocks = Hash.new { |hash,key| hash[key] = [] }
99
- @skip_link_patterns = []
100
- @after_crawl_blocks = []
101
- @opts = opts
102
-
103
- yield self if block_given?
104
- end
105
-
106
- #
107
- # Convenience method to start a new crawl
108
- #
109
- def self.crawl(urls, opts = {})
110
- self.new(urls, opts) do |core|
111
- yield core if block_given?
112
- core.run
113
- end
114
- end
115
-
116
- #
117
- # Add a block to be executed on the PageStore after the crawl
118
- # is finished
119
- #
120
- def after_crawl(&block)
121
- @after_crawl_blocks << block
122
- self
123
- end
124
-
125
- #
126
- # Add one ore more Regex patterns for URLs which should not be
127
- # followed
128
- #
129
- def skip_links_like(*patterns)
130
- @skip_link_patterns.concat [patterns].flatten.compact
131
- self
132
- end
133
-
134
- #
135
- # Add a block to be executed on every Page as they are encountered
136
- # during the crawl
137
- #
138
- def on_every_page(&block)
139
- @on_every_page_blocks << block
140
- self
141
- end
142
-
143
- #
144
- # Add a block to be executed on Page objects with a URL matching
145
- # one or more patterns
146
- #
147
- def on_pages_like(*patterns, &block)
148
- if patterns
149
- patterns.each do |pattern|
150
- @on_pages_like_blocks[pattern] << block
151
- end
152
- end
153
- self
154
- end
155
-
156
- #
157
- # Specify a block which will select which links to follow on each page.
158
- # The block should return an Array of URI objects.
159
- #
160
- def focus_crawl(&block)
161
- @focus_crawl_block = block
162
- self
163
- end
164
-
165
- #
166
- # Perform the crawl
167
- #
168
- def run
169
- process_options
170
-
171
- @urls.delete_if { |url| !visit_link?(url) }
172
- return if @urls.empty?
173
-
174
- link_queue = Queue.new
175
- page_queue = Queue.new
176
-
177
- @opts[:threads].times do
178
- @tentacles << Thread.new { Tentacle.new(link_queue, page_queue, @opts).run }
179
- end
180
-
181
- @urls.each{ |url| link_queue.enq(url) }
182
-
183
- loop do
184
- page = page_queue.deq
185
- @pages.touch_key page.url
186
- puts "#{page.url} Queue: #{link_queue.size}" if @opts[:verbose]
187
- do_page_blocks page
188
- page.discard_doc! if @opts[:discard_page_bodies]
189
-
190
- links = links_to_follow page
191
- links.each do |link|
192
- link_queue << [link, page.url.dup, page.depth + 1]
193
- end
194
- @pages.touch_keys links
195
-
196
- @pages[page.url] = page
197
-
198
- # if we are done with the crawl, tell the threads to end
199
- if link_queue.empty? and page_queue.empty?
200
- until link_queue.num_waiting == @tentacles.size
201
- Thread.pass
202
- end
203
- if page_queue.empty?
204
- @tentacles.size.times { link_queue << :END }
205
- break
206
- end
207
- end
208
- end
209
-
210
- @tentacles.each { |thread| thread.join }
211
- do_after_crawl_blocks
212
- self
213
- end
214
-
215
- private
216
-
217
- def process_options
218
- @opts = DEFAULT_OPTS.merge @opts
219
- @opts[:threads] = 1 if @opts[:delay] > 0
220
- storage = Anemone::Storage::Base.new(@opts[:storage] || Anemone::Storage.Hash)
221
- @pages = PageStore.new(storage)
222
- @robots = Robots.new(@opts[:user_agent]) if @opts[:obey_robots_txt]
223
-
224
- freeze_options
225
- end
226
-
227
- #
228
- # Freeze the opts Hash so that no options can be modified
229
- # once the crawl begins
230
- #
231
- def freeze_options
232
- @opts.freeze
233
- @opts.each_key { |key| @opts[key].freeze }
234
- @opts[:cookies].each_key { |key| @opts[:cookies][key].freeze } rescue nil
235
- end
236
-
237
- #
238
- # Execute the after_crawl blocks
239
- #
240
- def do_after_crawl_blocks
241
- @after_crawl_blocks.each { |block| block.call(@pages) }
242
- end
243
-
244
- #
245
- # Execute the on_every_page blocks for *page*
246
- #
247
- # Modified it to fix a bug in Anemone when given more than one<br/>
248
- # regular expression for "@on_pages_like_blocks".
249
- #
250
- def do_page_blocks(page)
251
- @on_every_page_blocks.each do |block|
252
- block.call(page)
253
- end
254
-
255
- @on_pages_like_blocks.each do |patterns, blocks|
256
- if matches_pattern?( page.url.to_s, patterns )
257
- blocks.each { |block| block.call(page) }
258
- end
259
- end
260
- end
261
-
262
- #
263
- # Return an Array of links to follow from the given page.
264
- # Based on whether or not the link has already been crawled,
265
- # and the block given to focus_crawl()
266
- #
267
- def links_to_follow(page)
268
- links = @focus_crawl_block ? @focus_crawl_block.call(page) : page.links
269
- links.select { |link| visit_link?(link, page) }.map { |link| link.dup }
270
- end
271
-
272
- #
273
- # Returns +true+ if *link* has not been visited already,
274
- # and is not excluded by a skip_link pattern...
275
- # and is not excluded by robots.txt...
276
- # and is not deeper than the depth limit
277
- # Returns +false+ otherwise.
278
- #
279
- def visit_link?(link, from_page = nil)
280
- !@pages.has_page?(link) &&
281
- !skip_link?(link) &&
282
- !skip_query_string?(link) &&
283
- allowed(link) &&
284
- !too_deep?(from_page)
285
- end
286
-
287
- #
288
- # Returns +true+ if we are obeying robots.txt and the link
289
- # is granted access in it. Always returns +true+ when we are
290
- # not obeying robots.txt.
291
- #
292
- def allowed(link)
293
- @opts[:obey_robots_txt] ? @robots.allowed?(link) : true
294
- end
295
-
296
- #
297
- # Returns +true+ if we are over the page depth limit.
298
- # This only works when coming from a page and with the +depth_limit+ option set.
299
- # When neither is the case, will always return +false+.
300
- def too_deep?(from_page)
301
- if from_page && @opts[:depth_limit]
302
- from_page.depth >= @opts[:depth_limit]
303
- else
304
- false
305
- end
306
- end
307
-
308
- #
309
- # Returns +true+ if *link* should not be visited because
310
- # it has a query string and +skip_query_strings+ is true.
311
- #
312
- def skip_query_string?(link)
313
- @opts[:skip_query_strings] && link.query
314
- end
315
-
316
- #
317
- # Returns +true+ if *link* should not be visited because
318
- # its URL matches a skip_link pattern or the reundancy countdown has reached
319
- # zero.
320
- #
321
- def skip_link?( link )
322
-
323
- url = link.to_s
324
- skip = false
325
- @opts['redundant'].each_with_index {
326
- |redundant, i|
327
-
328
- if( url =~ redundant['regexp'] )
329
-
330
- if( @opts['redundant'][i]['count'] == 0 )
331
- print_verbose( 'Discarding redundant page: \'' + url + '\'' )
332
- return true
333
- end
334
-
335
- print_info( 'Matched redundancy rule: ' +
336
- redundant['regexp'].to_s + ' for page \'' +
337
- url + '\'' )
338
-
339
- print_info( 'Count-down: ' +
340
- @opts['redundant'][i]['count'].to_s )
341
-
342
- @opts['redundant'][i]['count'] -= 1
343
- end
344
- }
345
-
346
- @skip_link_patterns.any? { |pattern| url =~ pattern }
347
-
348
- end
349
-
350
- #
351
- # Decides whether or not a url matches any of the regular expressions
352
- # in "patterns".
353
- #
354
- # @param [String] url
355
- # @param [Array] patterns array of regular expressions
356
- #
357
- # @return [Bool]
358
- #
359
- def matches_pattern?( url, patterns )
360
-
361
- patterns.each {
362
- |pattern|
363
- return true if url =~ pattern
364
- }
365
-
366
- return false
367
- end
368
-
369
- end
370
-
371
- end