arachni 0.2.4 → 0.3
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG.md +33 -0
- data/README.md +2 -4
- data/Rakefile +15 -4
- data/bin/arachni +0 -0
- data/bin/arachni_web +0 -0
- data/bin/arachni_web_autostart +0 -0
- data/bin/arachni_xmlrpc +0 -0
- data/bin/arachni_xmlrpcd +0 -0
- data/bin/arachni_xmlrpcd_monitor +0 -0
- data/lib/arachni.rb +1 -1
- data/lib/framework.rb +36 -6
- data/lib/http.rb +12 -5
- data/lib/module/auditor.rb +482 -59
- data/lib/module/base.rb +17 -0
- data/lib/module/manager.rb +26 -2
- data/lib/module/trainer.rb +1 -12
- data/lib/module/utilities.rb +12 -0
- data/lib/parser/auditable.rb +8 -3
- data/lib/parser/elements.rb +11 -0
- data/lib/parser/page.rb +3 -1
- data/lib/parser/parser.rb +130 -18
- data/lib/rpc/xml/server/dispatcher.rb +21 -0
- data/lib/spider.rb +141 -82
- data/lib/ui/cli/cli.rb +2 -3
- data/lib/ui/web/addon_manager.rb +273 -0
- data/lib/ui/web/addons/autodeploy.rb +172 -0
- data/lib/ui/web/addons/autodeploy/lib/manager.rb +291 -0
- data/lib/ui/web/addons/autodeploy/views/index.erb +124 -0
- data/lib/ui/web/addons/sample.rb +78 -0
- data/lib/ui/web/addons/sample/views/index.erb +4 -0
- data/lib/ui/web/addons/scheduler.rb +139 -0
- data/lib/ui/web/addons/scheduler/views/index.erb +131 -0
- data/lib/ui/web/addons/scheduler/views/options.erb +93 -0
- data/lib/ui/web/dispatcher_manager.rb +80 -13
- data/lib/ui/web/instance_manager.rb +87 -0
- data/lib/ui/web/scheduler.rb +166 -0
- data/lib/ui/web/server.rb +142 -202
- data/lib/ui/web/server/public/js/jquery-ui-timepicker.js +985 -0
- data/lib/ui/web/server/public/plugins/sample/style.css +0 -0
- data/lib/ui/web/server/public/style.css +42 -0
- data/lib/ui/web/server/views/addon.erb +15 -0
- data/lib/ui/web/server/views/addons.erb +46 -0
- data/lib/ui/web/server/views/dispatchers.erb +1 -1
- data/lib/ui/web/server/views/instance.erb +9 -11
- data/lib/ui/web/server/views/layout.erb +14 -1
- data/lib/ui/web/server/views/welcome.erb +7 -6
- data/lib/ui/web/utilities.rb +134 -0
- data/modules/audit/code_injection_timing.rb +6 -2
- data/modules/audit/code_injection_timing/payloads.txt +2 -2
- data/modules/audit/os_cmd_injection_timing.rb +7 -3
- data/modules/audit/os_cmd_injection_timing/payloads.txt +1 -1
- data/modules/audit/sqli_blind_rdiff.rb +18 -233
- data/modules/audit/sqli_blind_rdiff/payloads.txt +5 -0
- data/modules/audit/sqli_blind_timing.rb +9 -2
- data/path_extractors/anchors.rb +1 -1
- data/path_extractors/forms.rb +1 -1
- data/path_extractors/frames.rb +1 -1
- data/path_extractors/generic.rb +1 -1
- data/path_extractors/links.rb +1 -1
- data/path_extractors/meta_refresh.rb +1 -1
- data/path_extractors/scripts.rb +1 -1
- data/path_extractors/sitemap.rb +1 -1
- data/plugins/proxy/server.rb +3 -2
- data/plugins/waf_detector.rb +0 -3
- metadata +37 -34
- data/lib/anemone/cookie_store.rb +0 -35
- data/lib/anemone/core.rb +0 -371
- data/lib/anemone/exceptions.rb +0 -5
- data/lib/anemone/http.rb +0 -144
- data/lib/anemone/page.rb +0 -338
- data/lib/anemone/page_store.rb +0 -160
- data/lib/anemone/storage.rb +0 -34
- data/lib/anemone/storage/base.rb +0 -75
- data/lib/anemone/storage/exceptions.rb +0 -15
- data/lib/anemone/storage/mongodb.rb +0 -89
- data/lib/anemone/storage/pstore.rb +0 -50
- data/lib/anemone/storage/redis.rb +0 -90
- data/lib/anemone/storage/tokyo_cabinet.rb +0 -57
- data/lib/anemone/tentacle.rb +0 -40
metadata
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
name: arachni
|
3
3
|
version: !ruby/object:Gem::Version
|
4
4
|
prerelease:
|
5
|
-
version: 0.2.4
|
5
|
+
version: "0.3"
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
8
8
|
- Tasos Laskos
|
@@ -10,7 +10,7 @@ autorequire:
|
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
12
|
|
13
|
-
date: 2011-07-
|
13
|
+
date: 2011-07-26 00:00:00 +03:00
|
14
14
|
default_executable:
|
15
15
|
dependencies:
|
16
16
|
- !ruby/object:Gem::Dependency
|
@@ -47,102 +47,102 @@ dependencies:
|
|
47
47
|
type: :runtime
|
48
48
|
version_requirements: *id003
|
49
49
|
- !ruby/object:Gem::Dependency
|
50
|
-
name:
|
50
|
+
name: sys-proctable
|
51
51
|
prerelease: false
|
52
52
|
requirement: &id004 !ruby/object:Gem::Requirement
|
53
53
|
none: false
|
54
54
|
requirements:
|
55
55
|
- - ~>
|
56
56
|
- !ruby/object:Gem::Version
|
57
|
-
version: 0.
|
57
|
+
version: 0.8.1
|
58
58
|
type: :runtime
|
59
59
|
version_requirements: *id004
|
60
60
|
- !ruby/object:Gem::Dependency
|
61
|
-
name:
|
61
|
+
name: terminal-table
|
62
62
|
prerelease: false
|
63
63
|
requirement: &id005 !ruby/object:Gem::Requirement
|
64
64
|
none: false
|
65
65
|
requirements:
|
66
66
|
- - ~>
|
67
67
|
- !ruby/object:Gem::Version
|
68
|
-
version:
|
68
|
+
version: 1.4.2
|
69
69
|
type: :runtime
|
70
70
|
version_requirements: *id005
|
71
71
|
- !ruby/object:Gem::Dependency
|
72
|
-
name:
|
72
|
+
name: sinatra
|
73
73
|
prerelease: false
|
74
74
|
requirement: &id006 !ruby/object:Gem::Requirement
|
75
75
|
none: false
|
76
76
|
requirements:
|
77
77
|
- - ~>
|
78
78
|
- !ruby/object:Gem::Version
|
79
|
-
version: 1.
|
79
|
+
version: 1.2.1
|
80
80
|
type: :runtime
|
81
81
|
version_requirements: *id006
|
82
82
|
- !ruby/object:Gem::Dependency
|
83
|
-
name:
|
83
|
+
name: datamapper
|
84
84
|
prerelease: false
|
85
85
|
requirement: &id007 !ruby/object:Gem::Requirement
|
86
86
|
none: false
|
87
87
|
requirements:
|
88
88
|
- - ~>
|
89
89
|
- !ruby/object:Gem::Version
|
90
|
-
version: 1.2
|
90
|
+
version: 1.0.2
|
91
91
|
type: :runtime
|
92
92
|
version_requirements: *id007
|
93
93
|
- !ruby/object:Gem::Dependency
|
94
|
-
name:
|
94
|
+
name: rack_csrf
|
95
95
|
prerelease: false
|
96
96
|
requirement: &id008 !ruby/object:Gem::Requirement
|
97
97
|
none: false
|
98
98
|
requirements:
|
99
99
|
- - ~>
|
100
100
|
- !ruby/object:Gem::Version
|
101
|
-
version: 1.0
|
101
|
+
version: 2.1.0
|
102
102
|
type: :runtime
|
103
103
|
version_requirements: *id008
|
104
104
|
- !ruby/object:Gem::Dependency
|
105
|
-
name:
|
105
|
+
name: rack-flash
|
106
106
|
prerelease: false
|
107
107
|
requirement: &id009 !ruby/object:Gem::Requirement
|
108
108
|
none: false
|
109
109
|
requirements:
|
110
110
|
- - ~>
|
111
111
|
- !ruby/object:Gem::Version
|
112
|
-
version:
|
112
|
+
version: 0.1.1
|
113
113
|
type: :runtime
|
114
114
|
version_requirements: *id009
|
115
115
|
- !ruby/object:Gem::Dependency
|
116
|
-
name:
|
116
|
+
name: json
|
117
117
|
prerelease: false
|
118
118
|
requirement: &id010 !ruby/object:Gem::Requirement
|
119
119
|
none: false
|
120
120
|
requirements:
|
121
121
|
- - ~>
|
122
122
|
- !ruby/object:Gem::Version
|
123
|
-
version:
|
123
|
+
version: 1.4.6
|
124
124
|
type: :runtime
|
125
125
|
version_requirements: *id010
|
126
126
|
- !ruby/object:Gem::Dependency
|
127
|
-
name:
|
127
|
+
name: dm-sqlite-adapter
|
128
128
|
prerelease: false
|
129
129
|
requirement: &id011 !ruby/object:Gem::Requirement
|
130
130
|
none: false
|
131
131
|
requirements:
|
132
132
|
- - ~>
|
133
133
|
- !ruby/object:Gem::Version
|
134
|
-
version: 1.
|
134
|
+
version: 1.0.2
|
135
135
|
type: :runtime
|
136
136
|
version_requirements: *id011
|
137
137
|
- !ruby/object:Gem::Dependency
|
138
|
-
name:
|
138
|
+
name: net-ssh
|
139
139
|
prerelease: false
|
140
140
|
requirement: &id012 !ruby/object:Gem::Requirement
|
141
141
|
none: false
|
142
142
|
requirements:
|
143
143
|
- - ~>
|
144
144
|
- !ruby/object:Gem::Version
|
145
|
-
version: 1.
|
145
|
+
version: 2.1.4
|
146
146
|
type: :runtime
|
147
147
|
version_requirements: *id012
|
148
148
|
description: " Arachni is a feature-full, modular, high-performance Ruby framework aimed towards\n helping penetration testers and administrators evaluate the security of web applications.\n\n Arachni is smart, it trains itself by learning from the HTTP responses it receives during the audit process.\n Unlike other scanners, Arachni takes into account the dynamic nature of web applications and can detect changes caused while travelling\n through the paths of a web application's cyclomatic complexity.\n This way attack/input vectors that would otherwise be undetectable by non-humans are seamlessly handled by Arachni.\n\n Finally, Arachni yields great performance due to its asynchronous HTTP model (courtesy of Typhoeus).\n Thus, you'll only be limited by the responsiveness of the server under audit and your available bandwidth.\n\n Note: Despite the fact that Arachni is mostly targeted towards web application security,\n it can easily be used for general purpose scraping, data-mining, etc with the addition of custom modules.\n"
|
@@ -190,20 +190,6 @@ files:
|
|
190
190
|
- lib/mixins/observable.rb
|
191
191
|
- lib/parser.rb
|
192
192
|
- lib/component_options.rb
|
193
|
-
- lib/anemone/storage.rb
|
194
|
-
- lib/anemone/page.rb
|
195
|
-
- lib/anemone/cookie_store.rb
|
196
|
-
- lib/anemone/page_store.rb
|
197
|
-
- lib/anemone/tentacle.rb
|
198
|
-
- lib/anemone/storage/pstore.rb
|
199
|
-
- lib/anemone/storage/mongodb.rb
|
200
|
-
- lib/anemone/storage/redis.rb
|
201
|
-
- lib/anemone/storage/exceptions.rb
|
202
|
-
- lib/anemone/storage/base.rb
|
203
|
-
- lib/anemone/storage/tokyo_cabinet.rb
|
204
|
-
- lib/anemone/http.rb
|
205
|
-
- lib/anemone/core.rb
|
206
|
-
- lib/anemone/exceptions.rb
|
207
193
|
- lib/report.rb
|
208
194
|
- lib/options.rb
|
209
195
|
- lib/issue.rb
|
@@ -262,6 +248,7 @@ files:
|
|
262
248
|
- lib/ui/web/server/public/sidebar-bottom.jpg
|
263
249
|
- lib/ui/web/server/public/nav-selected-left.jpg
|
264
250
|
- lib/ui/web/server/public/js/jquery-1.4.4.min.js
|
251
|
+
- lib/ui/web/server/public/js/jquery-ui-timepicker.js
|
265
252
|
- lib/ui/web/server/public/js/jquery-ui-1.8.9.custom.min.js
|
266
253
|
- lib/ui/web/server/public/spider.png
|
267
254
|
- lib/ui/web/server/public/banner.png
|
@@ -272,6 +259,7 @@ files:
|
|
272
259
|
- lib/ui/web/server/public/icons/error.png
|
273
260
|
- lib/ui/web/server/public/icons/ok.png
|
274
261
|
- lib/ui/web/server/public/sidebar-h4.jpg
|
262
|
+
- lib/ui/web/server/public/plugins/sample/style.css
|
275
263
|
- lib/ui/web/server/public/footer.jpg
|
276
264
|
- lib/ui/web/server/tmp/placeholder
|
277
265
|
- lib/ui/web/server/db/placeholder
|
@@ -281,6 +269,7 @@ files:
|
|
281
269
|
- lib/ui/web/server/views/flash.erb
|
282
270
|
- lib/ui/web/server/views/report_formats.erb
|
283
271
|
- lib/ui/web/server/views/modules.erb
|
272
|
+
- lib/ui/web/server/views/addons.erb
|
284
273
|
- lib/ui/web/server/views/plugins.erb
|
285
274
|
- lib/ui/web/server/views/layout.erb
|
286
275
|
- lib/ui/web/server/views/output_results.erb
|
@@ -290,13 +279,26 @@ files:
|
|
290
279
|
- lib/ui/web/server/views/home.erb
|
291
280
|
- lib/ui/web/server/views/welcome.erb
|
292
281
|
- lib/ui/web/server/views/reports.erb
|
282
|
+
- lib/ui/web/server/views/addon.erb
|
293
283
|
- lib/ui/web/server/views/dispatchers_edit.erb
|
294
284
|
- lib/ui/web/server/views/settings.erb
|
285
|
+
- lib/ui/web/scheduler.rb
|
286
|
+
- lib/ui/web/utilities.rb
|
287
|
+
- lib/ui/web/addons/autodeploy.rb
|
288
|
+
- lib/ui/web/addons/scheduler.rb
|
289
|
+
- lib/ui/web/addons/scheduler/views/options.erb
|
290
|
+
- lib/ui/web/addons/scheduler/views/index.erb
|
291
|
+
- lib/ui/web/addons/sample/views/index.erb
|
292
|
+
- lib/ui/web/addons/autodeploy/lib/manager.rb
|
293
|
+
- lib/ui/web/addons/autodeploy/views/index.erb
|
294
|
+
- lib/ui/web/addons/sample.rb
|
295
295
|
- lib/ui/web/report_manager.rb
|
296
296
|
- lib/ui/web/server.rb
|
297
297
|
- lib/ui/web/log.rb
|
298
298
|
- lib/ui/web/dispatcher_manager.rb
|
299
299
|
- lib/ui/web/output_stream.rb
|
300
|
+
- lib/ui/web/instance_manager.rb
|
301
|
+
- lib/ui/web/addon_manager.rb
|
300
302
|
- lib/parser/auditable.rb
|
301
303
|
- lib/parser/parser.rb
|
302
304
|
- lib/parser/page.rb
|
@@ -364,6 +366,7 @@ files:
|
|
364
366
|
- modules/audit/os_cmd_injection_timing.rb
|
365
367
|
- modules/audit/ldapi.rb
|
366
368
|
- modules/audit/ldapi/errors.txt
|
369
|
+
- modules/audit/sqli_blind_rdiff/payloads.txt
|
367
370
|
- modules/audit/code_injection_timing.rb
|
368
371
|
- modules/audit/os_cmd_injection_timing/payloads.txt
|
369
372
|
- modules/audit/sqli/regexp_ids.txt
|
data/lib/anemone/cookie_store.rb
DELETED
@@ -1,35 +0,0 @@
|
|
1
|
-
require 'delegate'
|
2
|
-
require 'webrick/cookie'
|
3
|
-
|
4
|
-
class WEBrick::Cookie
|
5
|
-
def expired?
|
6
|
-
!!expires && expires < Time.now
|
7
|
-
end
|
8
|
-
end
|
9
|
-
|
10
|
-
module Anemone
|
11
|
-
class CookieStore < DelegateClass(Hash)
|
12
|
-
|
13
|
-
def initialize(cookies = nil)
|
14
|
-
@cookies = {}
|
15
|
-
cookies.each { |name, value| @cookies[name] = WEBrick::Cookie.new(name, value) } if cookies
|
16
|
-
super(@cookies)
|
17
|
-
end
|
18
|
-
|
19
|
-
def merge!(set_cookie_str)
|
20
|
-
begin
|
21
|
-
cookie_hash = WEBrick::Cookie.parse_set_cookies(set_cookie_str).inject({}) do |hash, cookie|
|
22
|
-
hash[cookie.name] = cookie if !!cookie
|
23
|
-
hash
|
24
|
-
end
|
25
|
-
@cookies.merge! cookie_hash
|
26
|
-
rescue
|
27
|
-
end
|
28
|
-
end
|
29
|
-
|
30
|
-
def to_s
|
31
|
-
@cookies.values.reject { |cookie| cookie.expired? }.map { |cookie| "#{cookie.name}=#{cookie.value}" }.join(';')
|
32
|
-
end
|
33
|
-
|
34
|
-
end
|
35
|
-
end
|
data/lib/anemone/core.rb
DELETED
@@ -1,371 +0,0 @@
|
|
1
|
-
=begin
|
2
|
-
Arachni
|
3
|
-
Copyright (c) 2010-2011 Tasos "Zapotek" Laskos <tasos.laskos@gmail.com>
|
4
|
-
|
5
|
-
This is free software; you can copy and distribute and modify
|
6
|
-
this program under the term of the GPL v2.0 License
|
7
|
-
(See LICENSE file for details)
|
8
|
-
|
9
|
-
=end
|
10
|
-
|
11
|
-
require 'thread'
|
12
|
-
require 'robots'
|
13
|
-
|
14
|
-
opts = Arachni::Options.instance
|
15
|
-
require opts.dir['lib'] + 'anemone/tentacle'
|
16
|
-
require opts.dir['lib'] + 'anemone/page'
|
17
|
-
require opts.dir['lib'] + 'anemone/exceptions'
|
18
|
-
require opts.dir['lib'] + 'anemone/page_store'
|
19
|
-
require opts.dir['lib'] + 'anemone/storage'
|
20
|
-
require opts.dir['lib'] + 'anemone/storage/base'
|
21
|
-
|
22
|
-
module Anemone
|
23
|
-
|
24
|
-
VERSION = '0.5.0';
|
25
|
-
|
26
|
-
#
|
27
|
-
# Convenience method to start a crawl
|
28
|
-
#
|
29
|
-
def Anemone.crawl(urls, options = {}, &block)
|
30
|
-
Core.crawl(urls, options, &block)
|
31
|
-
end
|
32
|
-
|
33
|
-
|
34
|
-
#
|
35
|
-
# Overrides Anemone's Core class method skip_link?( link )
|
36
|
-
# to support regexp matching to the whole url and enforce redundancy checks.
|
37
|
-
# <br/>
|
38
|
-
# Messages were also added to inform the user in case of redundant URLs.
|
39
|
-
#
|
40
|
-
# @author: Tasos "Zapotek" Laskos
|
41
|
-
# <tasos.laskos@gmail.com>
|
42
|
-
# <zapotek@segfault.gr>
|
43
|
-
# @version: 0.1
|
44
|
-
#
|
45
|
-
class Core
|
46
|
-
|
47
|
-
include Arachni::UI::Output
|
48
|
-
|
49
|
-
# PageStore storing all Page objects encountered during the crawl
|
50
|
-
attr_reader :pages
|
51
|
-
# Hash of options for the crawl
|
52
|
-
attr_reader :opts
|
53
|
-
|
54
|
-
DEFAULT_OPTS = {
|
55
|
-
# run 4 Tentacle threads to fetch pages
|
56
|
-
:threads => 4,
|
57
|
-
# disable verbose output
|
58
|
-
:verbose => false,
|
59
|
-
# don't throw away the page response body after scanning it for links
|
60
|
-
:discard_page_bodies => false,
|
61
|
-
# identify self as Anemone/VERSION
|
62
|
-
:user_agent => "Anemone/#{Anemone::VERSION}",
|
63
|
-
# no delay between requests
|
64
|
-
:delay => 0,
|
65
|
-
# don't obey the robots exclusion protocol
|
66
|
-
:obey_robots_txt => false,
|
67
|
-
# by default, don't limit the depth of the crawl
|
68
|
-
:depth_limit => false,
|
69
|
-
# number of times HTTP redirects will be followed
|
70
|
-
:redirect_limit => 5,
|
71
|
-
# storage engine defaults to Hash in +process_options+ if none specified
|
72
|
-
:storage => nil,
|
73
|
-
# Hash of cookie name => value to send with HTTP requests
|
74
|
-
:cookies => nil,
|
75
|
-
# accept cookies from the server and send them back?
|
76
|
-
:accept_cookies => false,
|
77
|
-
# skip any link with a query string? e.g. http://foo.com/?u=user
|
78
|
-
:skip_query_strings => false
|
79
|
-
}
|
80
|
-
|
81
|
-
# Create setter methods for all options to be called from the crawl block
|
82
|
-
DEFAULT_OPTS.keys.each do |key|
|
83
|
-
define_method "#{key}=" do |value|
|
84
|
-
@opts[key.to_sym] = value
|
85
|
-
end
|
86
|
-
end
|
87
|
-
|
88
|
-
#
|
89
|
-
# Initialize the crawl with starting *urls* (single URL or Array of URLs)
|
90
|
-
# and optional *block*
|
91
|
-
#
|
92
|
-
def initialize(urls, opts = {})
|
93
|
-
@urls = [urls].flatten.map{ |url| url.is_a?(URI) ? url : URI(url) }
|
94
|
-
@urls.each{ |url| url.path = '/' if url.path.empty? }
|
95
|
-
|
96
|
-
@tentacles = []
|
97
|
-
@on_every_page_blocks = []
|
98
|
-
@on_pages_like_blocks = Hash.new { |hash,key| hash[key] = [] }
|
99
|
-
@skip_link_patterns = []
|
100
|
-
@after_crawl_blocks = []
|
101
|
-
@opts = opts
|
102
|
-
|
103
|
-
yield self if block_given?
|
104
|
-
end
|
105
|
-
|
106
|
-
#
|
107
|
-
# Convenience method to start a new crawl
|
108
|
-
#
|
109
|
-
def self.crawl(urls, opts = {})
|
110
|
-
self.new(urls, opts) do |core|
|
111
|
-
yield core if block_given?
|
112
|
-
core.run
|
113
|
-
end
|
114
|
-
end
|
115
|
-
|
116
|
-
#
|
117
|
-
# Add a block to be executed on the PageStore after the crawl
|
118
|
-
# is finished
|
119
|
-
#
|
120
|
-
def after_crawl(&block)
|
121
|
-
@after_crawl_blocks << block
|
122
|
-
self
|
123
|
-
end
|
124
|
-
|
125
|
-
#
|
126
|
-
# Add one or more Regex patterns for URLs which should not be
|
127
|
-
# followed
|
128
|
-
#
|
129
|
-
def skip_links_like(*patterns)
|
130
|
-
@skip_link_patterns.concat [patterns].flatten.compact
|
131
|
-
self
|
132
|
-
end
|
133
|
-
|
134
|
-
#
|
135
|
-
# Add a block to be executed on every Page as they are encountered
|
136
|
-
# during the crawl
|
137
|
-
#
|
138
|
-
def on_every_page(&block)
|
139
|
-
@on_every_page_blocks << block
|
140
|
-
self
|
141
|
-
end
|
142
|
-
|
143
|
-
#
|
144
|
-
# Add a block to be executed on Page objects with a URL matching
|
145
|
-
# one or more patterns
|
146
|
-
#
|
147
|
-
def on_pages_like(*patterns, &block)
|
148
|
-
if patterns
|
149
|
-
patterns.each do |pattern|
|
150
|
-
@on_pages_like_blocks[pattern] << block
|
151
|
-
end
|
152
|
-
end
|
153
|
-
self
|
154
|
-
end
|
155
|
-
|
156
|
-
#
|
157
|
-
# Specify a block which will select which links to follow on each page.
|
158
|
-
# The block should return an Array of URI objects.
|
159
|
-
#
|
160
|
-
def focus_crawl(&block)
|
161
|
-
@focus_crawl_block = block
|
162
|
-
self
|
163
|
-
end
|
164
|
-
|
165
|
-
#
|
166
|
-
# Perform the crawl
|
167
|
-
#
|
168
|
-
def run
|
169
|
-
process_options
|
170
|
-
|
171
|
-
@urls.delete_if { |url| !visit_link?(url) }
|
172
|
-
return if @urls.empty?
|
173
|
-
|
174
|
-
link_queue = Queue.new
|
175
|
-
page_queue = Queue.new
|
176
|
-
|
177
|
-
@opts[:threads].times do
|
178
|
-
@tentacles << Thread.new { Tentacle.new(link_queue, page_queue, @opts).run }
|
179
|
-
end
|
180
|
-
|
181
|
-
@urls.each{ |url| link_queue.enq(url) }
|
182
|
-
|
183
|
-
loop do
|
184
|
-
page = page_queue.deq
|
185
|
-
@pages.touch_key page.url
|
186
|
-
puts "#{page.url} Queue: #{link_queue.size}" if @opts[:verbose]
|
187
|
-
do_page_blocks page
|
188
|
-
page.discard_doc! if @opts[:discard_page_bodies]
|
189
|
-
|
190
|
-
links = links_to_follow page
|
191
|
-
links.each do |link|
|
192
|
-
link_queue << [link, page.url.dup, page.depth + 1]
|
193
|
-
end
|
194
|
-
@pages.touch_keys links
|
195
|
-
|
196
|
-
@pages[page.url] = page
|
197
|
-
|
198
|
-
# if we are done with the crawl, tell the threads to end
|
199
|
-
if link_queue.empty? and page_queue.empty?
|
200
|
-
until link_queue.num_waiting == @tentacles.size
|
201
|
-
Thread.pass
|
202
|
-
end
|
203
|
-
if page_queue.empty?
|
204
|
-
@tentacles.size.times { link_queue << :END }
|
205
|
-
break
|
206
|
-
end
|
207
|
-
end
|
208
|
-
end
|
209
|
-
|
210
|
-
@tentacles.each { |thread| thread.join }
|
211
|
-
do_after_crawl_blocks
|
212
|
-
self
|
213
|
-
end
|
214
|
-
|
215
|
-
private
|
216
|
-
|
217
|
-
def process_options
|
218
|
-
@opts = DEFAULT_OPTS.merge @opts
|
219
|
-
@opts[:threads] = 1 if @opts[:delay] > 0
|
220
|
-
storage = Anemone::Storage::Base.new(@opts[:storage] || Anemone::Storage.Hash)
|
221
|
-
@pages = PageStore.new(storage)
|
222
|
-
@robots = Robots.new(@opts[:user_agent]) if @opts[:obey_robots_txt]
|
223
|
-
|
224
|
-
freeze_options
|
225
|
-
end
|
226
|
-
|
227
|
-
#
|
228
|
-
# Freeze the opts Hash so that no options can be modified
|
229
|
-
# once the crawl begins
|
230
|
-
#
|
231
|
-
def freeze_options
|
232
|
-
@opts.freeze
|
233
|
-
@opts.each_key { |key| @opts[key].freeze }
|
234
|
-
@opts[:cookies].each_key { |key| @opts[:cookies][key].freeze } rescue nil
|
235
|
-
end
|
236
|
-
|
237
|
-
#
|
238
|
-
# Execute the after_crawl blocks
|
239
|
-
#
|
240
|
-
def do_after_crawl_blocks
|
241
|
-
@after_crawl_blocks.each { |block| block.call(@pages) }
|
242
|
-
end
|
243
|
-
|
244
|
-
#
|
245
|
-
# Execute the on_every_page blocks for *page*
|
246
|
-
#
|
247
|
-
# Modified it to fix a bug in Anemone when given more than one<br/>
|
248
|
-
# regular expression for "@on_pages_like_blocks".
|
249
|
-
#
|
250
|
-
def do_page_blocks(page)
|
251
|
-
@on_every_page_blocks.each do |block|
|
252
|
-
block.call(page)
|
253
|
-
end
|
254
|
-
|
255
|
-
@on_pages_like_blocks.each do |patterns, blocks|
|
256
|
-
if matches_pattern?( page.url.to_s, patterns )
|
257
|
-
blocks.each { |block| block.call(page) }
|
258
|
-
end
|
259
|
-
end
|
260
|
-
end
|
261
|
-
|
262
|
-
#
|
263
|
-
# Return an Array of links to follow from the given page.
|
264
|
-
# Based on whether or not the link has already been crawled,
|
265
|
-
# and the block given to focus_crawl()
|
266
|
-
#
|
267
|
-
def links_to_follow(page)
|
268
|
-
links = @focus_crawl_block ? @focus_crawl_block.call(page) : page.links
|
269
|
-
links.select { |link| visit_link?(link, page) }.map { |link| link.dup }
|
270
|
-
end
|
271
|
-
|
272
|
-
#
|
273
|
-
# Returns +true+ if *link* has not been visited already,
|
274
|
-
# and is not excluded by a skip_link pattern...
|
275
|
-
# and is not excluded by robots.txt...
|
276
|
-
# and is not deeper than the depth limit
|
277
|
-
# Returns +false+ otherwise.
|
278
|
-
#
|
279
|
-
def visit_link?(link, from_page = nil)
|
280
|
-
!@pages.has_page?(link) &&
|
281
|
-
!skip_link?(link) &&
|
282
|
-
!skip_query_string?(link) &&
|
283
|
-
allowed(link) &&
|
284
|
-
!too_deep?(from_page)
|
285
|
-
end
|
286
|
-
|
287
|
-
#
|
288
|
-
# Returns +true+ if we are obeying robots.txt and the link
|
289
|
-
# is granted access in it. Always returns +true+ when we are
|
290
|
-
# not obeying robots.txt.
|
291
|
-
#
|
292
|
-
def allowed(link)
|
293
|
-
@opts[:obey_robots_txt] ? @robots.allowed?(link) : true
|
294
|
-
end
|
295
|
-
|
296
|
-
#
|
297
|
-
# Returns +true+ if we are over the page depth limit.
|
298
|
-
# This only works when coming from a page and with the +depth_limit+ option set.
|
299
|
-
# When neither is the case, will always return +false+.
|
300
|
-
def too_deep?(from_page)
|
301
|
-
if from_page && @opts[:depth_limit]
|
302
|
-
from_page.depth >= @opts[:depth_limit]
|
303
|
-
else
|
304
|
-
false
|
305
|
-
end
|
306
|
-
end
|
307
|
-
|
308
|
-
#
|
309
|
-
# Returns +true+ if *link* should not be visited because
|
310
|
-
# it has a query string and +skip_query_strings+ is true.
|
311
|
-
#
|
312
|
-
def skip_query_string?(link)
|
313
|
-
@opts[:skip_query_strings] && link.query
|
314
|
-
end
|
315
|
-
|
316
|
-
#
|
317
|
-
# Returns +true+ if *link* should not be visited because
|
318
|
-
# its URL matches a skip_link pattern or the redundancy countdown has reached
|
319
|
-
# zero.
|
320
|
-
#
|
321
|
-
def skip_link?( link )
|
322
|
-
|
323
|
-
url = link.to_s
|
324
|
-
skip = false
|
325
|
-
@opts['redundant'].each_with_index {
|
326
|
-
|redundant, i|
|
327
|
-
|
328
|
-
if( url =~ redundant['regexp'] )
|
329
|
-
|
330
|
-
if( @opts['redundant'][i]['count'] == 0 )
|
331
|
-
print_verbose( 'Discarding redundant page: \'' + url + '\'' )
|
332
|
-
return true
|
333
|
-
end
|
334
|
-
|
335
|
-
print_info( 'Matched redundancy rule: ' +
|
336
|
-
redundant['regexp'].to_s + ' for page \'' +
|
337
|
-
url + '\'' )
|
338
|
-
|
339
|
-
print_info( 'Count-down: ' +
|
340
|
-
@opts['redundant'][i]['count'].to_s )
|
341
|
-
|
342
|
-
@opts['redundant'][i]['count'] -= 1
|
343
|
-
end
|
344
|
-
}
|
345
|
-
|
346
|
-
@skip_link_patterns.any? { |pattern| url =~ pattern }
|
347
|
-
|
348
|
-
end
|
349
|
-
|
350
|
-
#
|
351
|
-
# Decides whether or not a url matches any of the regular expressions
|
352
|
-
# in "patterns".
|
353
|
-
#
|
354
|
-
# @param [String] url
|
355
|
-
# @param [Array] patterns array of regular expressions
|
356
|
-
#
|
357
|
-
# @return [Bool]
|
358
|
-
#
|
359
|
-
def matches_pattern?( url, patterns )
|
360
|
-
|
361
|
-
patterns.each {
|
362
|
-
|pattern|
|
363
|
-
return true if url =~ pattern
|
364
|
-
}
|
365
|
-
|
366
|
-
return false
|
367
|
-
end
|
368
|
-
|
369
|
-
end
|
370
|
-
|
371
|
-
end
|