spidr 0.2.1 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data.tar.gz.sig +0 -0
- data/History.rdoc +191 -0
- data/Manifest.txt +10 -34
- data/{README.txt → README.rdoc} +3 -1
- data/Rakefile +6 -4
- data/lib/spidr/agent.rb +137 -97
- data/lib/spidr/auth_credential.rb +25 -0
- data/lib/spidr/auth_store.rb +157 -0
- data/lib/spidr/cookie_jar.rb +166 -0
- data/lib/spidr/filters.rb +2 -0
- data/lib/spidr/page.rb +75 -11
- data/lib/spidr/sanitizers.rb +59 -0
- data/lib/spidr/session_cache.rb +119 -0
- data/lib/spidr/version.rb +1 -1
- data/spec/agent_spec.rb +2 -2
- data/spec/helpers/history.rb +34 -0
- data/spec/helpers/wsoc.rb +83 -0
- data/spec/page_examples.rb +5 -1
- data/spec/page_spec.rb +30 -0
- data/spec/sanitizers_spec.rb +67 -0
- data/tasks/yard.rb +1 -1
- metadata +24 -40
- metadata.gz.sig +0 -0
- data/History.txt +0 -167
- data/spec/helpers/course.rb +0 -95
- data/static/course/absolute/index.html +0 -10
- data/static/course/absolute/next.html +0 -9
- data/static/course/absolute/start.html +0 -19
- data/static/course/empty/index.html +0 -10
- data/static/course/empty/start.html +0 -23
- data/static/course/fail.html +0 -14
- data/static/course/frames/frame.html +0 -15
- data/static/course/frames/frame_next.html +0 -9
- data/static/course/frames/iframe.html +0 -15
- data/static/course/frames/iframe_next.html +0 -9
- data/static/course/frames/index.html +0 -10
- data/static/course/frames/start.html +0 -15
- data/static/course/index.html +0 -10
- data/static/course/javascript/index.html +0 -10
- data/static/course/javascript/start.html +0 -19
- data/static/course/loop/index.html +0 -10
- data/static/course/loop/next.html +0 -13
- data/static/course/loop/start.html +0 -19
- data/static/course/relative/current_directory.html +0 -9
- data/static/course/relative/index.html +0 -10
- data/static/course/relative/normal.html +0 -9
- data/static/course/relative/same_directory.html +0 -9
- data/static/course/relative/start.html +0 -27
- data/static/course/remote/index.html +0 -10
- data/static/course/remote/next.html +0 -9
- data/static/course/remote/start.html +0 -27
- data/static/course/scripts/course.js +0 -29
- data/static/course/scripts/jquery-1.2.6.min.js +0 -32
- data/static/course/specs.json +0 -1
- data/static/course/start.html +0 -27
- data/tasks/course.rb +0 -63
data.tar.gz.sig
CHANGED
|
Binary file
|
data/History.rdoc
ADDED
|
@@ -0,0 +1,191 @@
|
|
|
1
|
+
=== 0.2.2 / 2010-01-06
|
|
2
|
+
|
|
3
|
+
* Require Web Spider Obstacle Course (WSOC) >= 0.1.1.
|
|
4
|
+
* Integrated the new WSOC into the specs.
|
|
5
|
+
* Removed the built-in Web Spider Obstacle Course.
|
|
6
|
+
* Added {Spidr::Page#content_types}.
|
|
7
|
+
* Added {Spidr::Page#cookie}.
|
|
8
|
+
* Added {Spidr::Page#cookies}.
|
|
9
|
+
* Added {Spidr::Page#cookie_params}.
|
|
10
|
+
* Added {Spidr::Sanitizers}.
|
|
11
|
+
* Added {Spidr::SessionCache}.
|
|
12
|
+
* Added {Spidr::CookieJar} (thanks Nick Plante).
|
|
13
|
+
* Added {Spidr::AuthStore} (thanks Nick Plante).
|
|
14
|
+
* Added {Spidr::Agent#post_page} (thanks Nick Plante).
|
|
15
|
+
* Renamed Spidr::Agent#get_session to {Spidr::SessionCache#[]}.
|
|
16
|
+
* Renamed Spidr::Agent#kill_session to {Spidr::SessionCache#kill!}.
|
|
17
|
+
|
|
18
|
+
=== 0.2.1 / 2009-11-25
|
|
19
|
+
|
|
20
|
+
* Added {Spidr::Events#every_ok_page}.
|
|
21
|
+
* Added {Spidr::Events#every_redirect_page}.
|
|
22
|
+
* Added {Spidr::Events#every_timedout_page}.
|
|
23
|
+
* Added {Spidr::Events#every_bad_request_page}.
|
|
24
|
+
* Added {Spidr::Events#every_unauthorized_page}.
|
|
25
|
+
* Added {Spidr::Events#every_forbidden_page}.
|
|
26
|
+
* Added {Spidr::Events#every_missing_page}.
|
|
27
|
+
* Added {Spidr::Events#every_internal_server_error_page}.
|
|
28
|
+
* Added {Spidr::Events#every_txt_page}.
|
|
29
|
+
* Added {Spidr::Events#every_html_page}.
|
|
30
|
+
* Added {Spidr::Events#every_xml_page}.
|
|
31
|
+
* Added {Spidr::Events#every_xsl_page}.
|
|
32
|
+
* Added {Spidr::Events#every_doc}.
|
|
33
|
+
* Added {Spidr::Events#every_html_doc}.
|
|
34
|
+
* Added {Spidr::Events#every_xml_doc}.
|
|
35
|
+
* Added {Spidr::Events#every_xsl_doc}.
|
|
36
|
+
* Added {Spidr::Events#every_rss_doc}.
|
|
37
|
+
* Added {Spidr::Events#every_atom_doc}.
|
|
38
|
+
* Added {Spidr::Events#every_javascript_page}.
|
|
39
|
+
* Added {Spidr::Events#every_css_page}.
|
|
40
|
+
* Added {Spidr::Events#every_rss_page}.
|
|
41
|
+
* Added {Spidr::Events#every_atom_page}.
|
|
42
|
+
* Added {Spidr::Events#every_ms_word_page}.
|
|
43
|
+
* Added {Spidr::Events#every_pdf_page}.
|
|
44
|
+
* Added {Spidr::Events#every_zip_page}.
|
|
45
|
+
* Fixed a bug where {Spidr::Agent#delay} was not being used to delay
|
|
46
|
+
requesting pages.
|
|
47
|
+
* Spider +link+ and +script+ tags in HTML pages (thanks Nick Plante).
|
|
48
|
+
|
|
49
|
+
=== 0.2.0 / 2009-10-10
|
|
50
|
+
|
|
51
|
+
* Added {URI.expand_path}.
|
|
52
|
+
* Added {Spidr::Page#search}.
|
|
53
|
+
* Added {Spidr::Page#at}.
|
|
54
|
+
* Added {Spidr::Page#title}.
|
|
55
|
+
* Added {Spidr::Agent#failures=}.
|
|
56
|
+
* Added an HTTP session cache to {Spidr::Agent}, per suggestion of falter.
|
|
57
|
+
* Added Spidr::Agent#get_session.
|
|
58
|
+
* Added Spidr::Agent#kill_session.
|
|
59
|
+
* Added {Spidr.proxy=}.
|
|
60
|
+
* Added {Spidr.disable_proxy!}.
|
|
61
|
+
* Aliased Spidr::Page#txt? to {Spidr::Page#plain_text?}.
|
|
62
|
+
* Aliased Spidr::Page#ok? to {Spidr::Page#is_ok?}.
|
|
63
|
+
* Aliased Spidr::Page#redirect? to {Spidr::Page#is_redirect?}.
|
|
64
|
+
* Aliased Spidr::Page#unauthorized? to {Spidr::Page#is_unauthorized?}.
|
|
65
|
+
* Aliased Spidr::Page#forbidden? to {Spidr::Page#is_forbidden?}.
|
|
66
|
+
* Aliased Spidr::Page#missing? to {Spidr::Page#is_missing?}.
|
|
67
|
+
* Split URL filtering code out of {Spidr::Agent} and into
|
|
68
|
+
{Spidr::Filters}.
|
|
69
|
+
* Split URL / Page event code out of {Spidr::Agent} and into
|
|
70
|
+
{Spidr::Events}.
|
|
71
|
+
* Split pause! / continue! / skip_link! / skip_page! methods out of
|
|
72
|
+
{Spidr::Agent} and into {Spidr::Actions}.
|
|
73
|
+
* Fixed a bug in {Spidr::Page#code}, where it was not returning an Integer.
|
|
74
|
+
* Make sure {Spidr::Page#doc} returns Nokogiri::XML::Document objects for
|
|
75
|
+
RSS/RDF/Atom pages as well.
|
|
76
|
+
* Fixed the handling of the Location header in {Spidr::Page#links}
|
|
77
|
+
(thanks falter).
|
|
78
|
+
* Fixed a bug in {Spidr::Page#to_absolute} where trailing '/' characters on
|
|
79
|
+
URI paths were not being preserved (thanks falter).
|
|
80
|
+
* Fixed a bug where the URI query was not being sent with the request
|
|
81
|
+
in {Spidr::Agent#get_page} (thanks Damian Steer).
|
|
82
|
+
* Fixed a bug where SSL sessions were not being properly set up
|
|
83
|
+
(thanks falter).
|
|
84
|
+
* Switched {Spidr::Agent#history} to be a Set, to improve search-time
|
|
85
|
+
of the history (thanks falter).
|
|
86
|
+
* Switched {Spidr::Agent#failures} to a Set.
|
|
87
|
+
* Allow a block to be passed to {Spidr::Agent#run}, which will receive all
|
|
88
|
+
pages visited.
|
|
89
|
+
* Allow Spidr::Agent#start_at and Spidr::Agent#continue! to pass blocks
|
|
90
|
+
to {Spidr::Agent#run}.
|
|
91
|
+
* Made {Spidr::Agent#visit_page} public.
|
|
92
|
+
* Moved to YARD based documentation.
|
|
93
|
+
|
|
94
|
+
=== 0.1.9 / 2009-06-13
|
|
95
|
+
|
|
96
|
+
* Upgraded to Hoe 2.0.0.
|
|
97
|
+
* Use Hoe.spec instead of Hoe.new.
|
|
98
|
+
* Use the Hoe signing task for signed gems.
|
|
99
|
+
* Added the Spidr::Agent#schemes and Spidr::Agent#schemes= methods.
|
|
100
|
+
* Added a warning message if 'net/https' cannot be loaded.
|
|
101
|
+
* Allow the list of acceptable URL schemes to be passed into
|
|
102
|
+
{Spidr::Agent#initialize}.
|
|
103
|
+
* Allow history and queue information to be passed into
|
|
104
|
+
{Spidr::Agent#initialize}.
|
|
105
|
+
* {Spidr::Agent#start_at} no longer clears the history or the queue.
|
|
106
|
+
* Fixed a bug in the sanitization of semi-escaped URLs.
|
|
107
|
+
* Fixed a bug where https URLs would be followed even if 'net/https'
|
|
108
|
+
could not be loaded.
|
|
109
|
+
* Removed Spidr::Agent::SCHEMES.
|
|
110
|
+
|
|
111
|
+
=== 0.1.8 / 2009-05-27
|
|
112
|
+
|
|
113
|
+
* Added the Spidr::Agent#pause! and Spidr::Agent#continue! methods.
|
|
114
|
+
* Added the Spidr::Agent#running? and Spidr::Agent#paused? methods.
|
|
115
|
+
* Added an alias for pending_urls to the queue methods.
|
|
116
|
+
* Added {Spidr::Agent#queue} to provide read access to the queue.
|
|
117
|
+
* Added {Spidr::Agent#queue=} and {Spidr::Agent#history=} for setting the
|
|
118
|
+
queue and history.
|
|
119
|
+
* Added {Spidr::Agent#to_hash} which returns a Hash of the agent's queue and
|
|
120
|
+
history.
|
|
121
|
+
* Made {Spidr::Agent#enqueue} and {Spidr::Agent#queued?} public.
|
|
122
|
+
* Added more specs.
|
|
123
|
+
|
|
124
|
+
=== 0.1.7 / 2009-04-24
|
|
125
|
+
|
|
126
|
+
* Added Spidr::Agent#all_headers.
|
|
127
|
+
* Fixed a bug where Page#headers was always +nil+.
|
|
128
|
+
* {Spidr::Agent} will now follow the Location header in HTTP 300,
|
|
129
|
+
301, 302, 303 and 307 Redirects.
|
|
130
|
+
* {Spidr::Agent} will now follow iframe and frame tags.
|
|
131
|
+
|
|
132
|
+
=== 0.1.6 / 2009-04-14
|
|
133
|
+
|
|
134
|
+
* Added {Spidr::Agent#failures}, a list of URLs which could not be visited.
|
|
135
|
+
* Added {Spidr::Agent#failed?}.
|
|
136
|
+
* Added Spidr::Agent#every_failed_url.
|
|
137
|
+
* Added {Spidr::Agent#clear}, which clears the history and failures URL
|
|
138
|
+
lists.
|
|
139
|
+
* Improved fault tolerance in {Spidr::Agent#get_page}.
|
|
140
|
+
* If a Network or HTTP error is encountered, the URL will be added to
|
|
141
|
+
the failures list and the next URL will be visited.
|
|
142
|
+
* Fixed a typo in Spidr::Agent#ignore_exts_like.
|
|
143
|
+
* Updated the Web Spider Obstacle Course with links that always fail to be
|
|
144
|
+
visited.
|
|
145
|
+
|
|
146
|
+
=== 0.1.5 / 2009-03-22
|
|
147
|
+
|
|
148
|
+
* Catch malformed URIs in {Spidr::Page#to_absolute} and return +nil+.
|
|
149
|
+
* Filter out +nil+ URIs in {Spidr::Page#urls}.
|
|
150
|
+
|
|
151
|
+
=== 0.1.4 / 2009-01-15
|
|
152
|
+
|
|
153
|
+
* Use Nokogiri for HTML and XML parsing.
|
|
154
|
+
|
|
155
|
+
=== 0.1.3 / 2009-01-10
|
|
156
|
+
|
|
157
|
+
* Added the :host option to {Spidr::Agent#initialize}.
|
|
158
|
+
* Added the Web Spider Obstacle Course files to the Manifest.
|
|
159
|
+
* Aliased {Spidr::Agent#visited_urls} to {Spidr::Agent#history}.
|
|
160
|
+
|
|
161
|
+
=== 0.1.2 / 2008-11-06
|
|
162
|
+
|
|
163
|
+
* Fixed a bug in {Spidr::Page#to_absolute} where URLs with no path were not
|
|
164
|
+
receiving a default path of <tt>/</tt>.
|
|
165
|
+
* Fixed a bug in {Spidr::Page#to_absolute} where URL paths were not being
|
|
166
|
+
expanded, in order to remove <tt>..</tt> and <tt>.</tt> directories.
|
|
167
|
+
* Fixed a bug where absolute URLs could have a blank path, thus causing
|
|
168
|
+
{Spidr::Agent#get_page} to crash when it performed the HTTP request.
|
|
169
|
+
* Added RSpec spec tests.
|
|
170
|
+
* Created a Web-Spider Obstacle Course
|
|
171
|
+
(http://spidr.rubyforge.org/course/start.html) which is used in the spec
|
|
172
|
+
tests.
|
|
173
|
+
|
|
174
|
+
=== 0.1.1 / 2008-10-04
|
|
175
|
+
|
|
176
|
+
* Added a reader method for the response instance variable in Page.
|
|
177
|
+
* Fixed a bug in {Spidr::Page#method_missing}.
|
|
178
|
+
|
|
179
|
+
=== 0.1.0 / 2008-05-23
|
|
180
|
+
|
|
181
|
+
* Initial release.
|
|
182
|
+
* Black-list or white-list URLs based upon:
|
|
183
|
+
* Host name
|
|
184
|
+
* Port number
|
|
185
|
+
* Full link
|
|
186
|
+
* URL extension
|
|
187
|
+
* Provides call-backs for:
|
|
188
|
+
* Every visited Page.
|
|
189
|
+
* Every visited URL.
|
|
190
|
+
* Every visited URL that matches a specified pattern.
|
|
191
|
+
|
data/Manifest.txt
CHANGED
|
@@ -1,11 +1,12 @@
|
|
|
1
|
-
History.txt
|
|
1
|
+
History.rdoc
|
|
2
2
|
Manifest.txt
|
|
3
|
-
README.txt
|
|
3
|
+
README.rdoc
|
|
4
4
|
Rakefile
|
|
5
5
|
lib/spidr.rb
|
|
6
6
|
lib/spidr/extensions.rb
|
|
7
7
|
lib/spidr/extensions/uri.rb
|
|
8
8
|
lib/spidr/page.rb
|
|
9
|
+
lib/spidr/sanitizers.rb
|
|
9
10
|
lib/spidr/rules.rb
|
|
10
11
|
lib/spidr/filters.rb
|
|
11
12
|
lib/spidr/events.rb
|
|
@@ -16,50 +17,25 @@ lib/spidr/actions/exceptions/paused.rb
|
|
|
16
17
|
lib/spidr/actions/exceptions/skip_link.rb
|
|
17
18
|
lib/spidr/actions/exceptions/skip_page.rb
|
|
18
19
|
lib/spidr/actions/actions.rb
|
|
20
|
+
lib/spidr/session_cache.rb
|
|
21
|
+
lib/spidr/cookie_jar.rb
|
|
22
|
+
lib/spidr/auth_credential.rb
|
|
23
|
+
lib/spidr/auth_store.rb
|
|
19
24
|
lib/spidr/agent.rb
|
|
20
25
|
lib/spidr/spidr.rb
|
|
21
26
|
lib/spidr/version.rb
|
|
22
27
|
tasks/spec.rb
|
|
23
28
|
tasks/yard.rb
|
|
24
|
-
tasks/course.rb
|
|
25
29
|
spec/spec_helper.rb
|
|
26
|
-
spec/helpers/course.rb
|
|
30
|
+
spec/helpers/history.rb
|
|
31
|
+
spec/helpers/wsoc.rb
|
|
27
32
|
spec/helpers/page.rb
|
|
28
33
|
spec/extensions/uri_spec.rb
|
|
29
34
|
spec/page_examples.rb
|
|
30
35
|
spec/page_spec.rb
|
|
31
36
|
spec/rules_spec.rb
|
|
37
|
+
spec/sanitizers_spec.rb
|
|
32
38
|
spec/filters_spec.rb
|
|
33
39
|
spec/actions_spec.rb
|
|
34
40
|
spec/agent_spec.rb
|
|
35
41
|
spec/spidr_spec.rb
|
|
36
|
-
static/course/index.html
|
|
37
|
-
static/course/start.html
|
|
38
|
-
static/course/fail.html
|
|
39
|
-
static/course/scripts/jquery-1.2.6.min.js
|
|
40
|
-
static/course/scripts/course.js
|
|
41
|
-
static/course/empty/index.html
|
|
42
|
-
static/course/empty/start.html
|
|
43
|
-
static/course/javascript/index.html
|
|
44
|
-
static/course/javascript/start.html
|
|
45
|
-
static/course/loop/index.html
|
|
46
|
-
static/course/loop/start.html
|
|
47
|
-
static/course/loop/next.html
|
|
48
|
-
static/course/relative/index.html
|
|
49
|
-
static/course/relative/start.html
|
|
50
|
-
static/course/relative/normal.html
|
|
51
|
-
static/course/relative/current_directory.html
|
|
52
|
-
static/course/relative/same_directory.html
|
|
53
|
-
static/course/absolute/index.html
|
|
54
|
-
static/course/absolute/start.html
|
|
55
|
-
static/course/absolute/next.html
|
|
56
|
-
static/course/remote/index.html
|
|
57
|
-
static/course/remote/start.html
|
|
58
|
-
static/course/remote/next.html
|
|
59
|
-
static/course/frames/index.html
|
|
60
|
-
static/course/frames/start.html
|
|
61
|
-
static/course/frames/iframe.html
|
|
62
|
-
static/course/frames/iframe_next.html
|
|
63
|
-
static/course/frames/frame.html
|
|
64
|
-
static/course/frames/frame_next.html
|
|
65
|
-
static/course/specs.json
|
data/{README.txt → README.rdoc}
RENAMED
|
@@ -18,7 +18,9 @@ and easy to use.
|
|
|
18
18
|
* a tags.
|
|
19
19
|
* iframe tags.
|
|
20
20
|
* frame tags.
|
|
21
|
+
* Cookie protected links.
|
|
21
22
|
* HTTP 300, 301, 302, 303 and 307 Redirects.
|
|
23
|
+
* HTTP Basic Auth protected links.
|
|
22
24
|
* Black-list or white-list URLs based upon:
|
|
23
25
|
* URL scheme.
|
|
24
26
|
* Host name
|
|
@@ -156,7 +158,7 @@ and easy to use.
|
|
|
156
158
|
|
|
157
159
|
The MIT License
|
|
158
160
|
|
|
159
|
-
Copyright (c) 2008-
|
|
161
|
+
Copyright (c) 2008-2010 Hal Brodigan
|
|
160
162
|
|
|
161
163
|
Permission is hereby granted, free of charge, to any person obtaining
|
|
162
164
|
a copy of this software and associated documentation files (the
|
data/Rakefile
CHANGED
|
@@ -5,20 +5,22 @@ require 'hoe'
|
|
|
5
5
|
require 'hoe/signing'
|
|
6
6
|
require './tasks/spec.rb'
|
|
7
7
|
require './tasks/yard.rb'
|
|
8
|
-
require './tasks/course.rb'
|
|
9
|
-
require './lib/spidr/version.rb'
|
|
10
8
|
|
|
11
9
|
Hoe.spec('spidr') do
|
|
12
|
-
self.rubyforge_name = 'spidr'
|
|
13
10
|
self.developer('Postmodern', 'postmodern.mod3@gmail.com')
|
|
11
|
+
|
|
12
|
+
self.readme_file = 'README.rdoc'
|
|
13
|
+
self.history_file = 'History.rdoc'
|
|
14
14
|
self.remote_rdoc_dir = 'docs'
|
|
15
|
+
|
|
15
16
|
self.extra_deps = [
|
|
16
17
|
['nokogiri', '>=1.2.0']
|
|
17
18
|
]
|
|
18
19
|
|
|
19
20
|
self.extra_dev_deps = [
|
|
20
21
|
['rspec', '>=1.2.8'],
|
|
21
|
-
['yard', '>=0.4.0']
|
|
22
|
+
['yard', '>=0.4.0'],
|
|
23
|
+
['wsoc', '>=0.1.1']
|
|
22
24
|
]
|
|
23
25
|
|
|
24
26
|
self.spec_extras = {:has_rdoc => 'yard'}
|
data/lib/spidr/agent.rb
CHANGED
|
@@ -1,7 +1,11 @@
|
|
|
1
|
+
require 'spidr/sanitizers'
|
|
1
2
|
require 'spidr/filters'
|
|
2
3
|
require 'spidr/events'
|
|
3
4
|
require 'spidr/actions'
|
|
4
5
|
require 'spidr/page'
|
|
6
|
+
require 'spidr/session_cache'
|
|
7
|
+
require 'spidr/cookie_jar'
|
|
8
|
+
require 'spidr/auth_store'
|
|
5
9
|
require 'spidr/spidr'
|
|
6
10
|
|
|
7
11
|
require 'net/http'
|
|
@@ -10,16 +14,17 @@ require 'set'
|
|
|
10
14
|
module Spidr
|
|
11
15
|
class Agent
|
|
12
16
|
|
|
17
|
+
include Sanitizers
|
|
13
18
|
include Filters
|
|
14
19
|
include Events
|
|
15
20
|
include Actions
|
|
16
21
|
|
|
17
|
-
# Proxy to use
|
|
18
|
-
attr_accessor :proxy
|
|
19
|
-
|
|
20
22
|
# User-Agent to use
|
|
21
23
|
attr_accessor :user_agent
|
|
22
24
|
|
|
25
|
+
# HTTP Authentication credentials
|
|
26
|
+
attr_accessor :authorized
|
|
27
|
+
|
|
23
28
|
# Referer to use
|
|
24
29
|
attr_accessor :referer
|
|
25
30
|
|
|
@@ -35,6 +40,9 @@ module Spidr
|
|
|
35
40
|
# Queue of URLs to visit
|
|
36
41
|
attr_reader :queue
|
|
37
42
|
|
|
43
|
+
# Cached cookies
|
|
44
|
+
attr_reader :cookies
|
|
45
|
+
|
|
38
46
|
#
|
|
39
47
|
# Creates a new Agent object.
|
|
40
48
|
#
|
|
@@ -79,18 +87,19 @@ module Spidr
|
|
|
79
87
|
# The newly created agent.
|
|
80
88
|
#
|
|
81
89
|
def initialize(options={},&block)
|
|
82
|
-
@proxy = (options[:proxy] || Spidr.proxy)
|
|
83
90
|
@user_agent = (options[:user_agent] || Spidr.user_agent)
|
|
84
91
|
@referer = options[:referer]
|
|
85
92
|
|
|
93
|
+
@sessions = SessionCache.new(options[:proxy] || Spidr.proxy)
|
|
94
|
+
@cookies = CookieJar.new
|
|
95
|
+
@authorized = AuthStore.new
|
|
96
|
+
|
|
86
97
|
@running = false
|
|
87
98
|
@delay = (options[:delay] || 0)
|
|
88
99
|
@history = Set[]
|
|
89
100
|
@failures = Set[]
|
|
90
101
|
@queue = []
|
|
91
102
|
|
|
92
|
-
@sessions = {}
|
|
93
|
-
|
|
94
103
|
super(options)
|
|
95
104
|
|
|
96
105
|
block.call(self) if block
|
|
@@ -222,14 +231,6 @@ module Spidr
|
|
|
222
231
|
|
|
223
232
|
@running = false
|
|
224
233
|
|
|
225
|
-
@sessions.each_value do |sess|
|
|
226
|
-
begin
|
|
227
|
-
sess.finish
|
|
228
|
-
rescue IOError
|
|
229
|
-
nil
|
|
230
|
-
end
|
|
231
|
-
end
|
|
232
|
-
|
|
233
234
|
@sessions.clear
|
|
234
235
|
return self
|
|
235
236
|
end
|
|
@@ -244,6 +245,37 @@ module Spidr
|
|
|
244
245
|
@running == true
|
|
245
246
|
end
|
|
246
247
|
|
|
248
|
+
#
|
|
249
|
+
# The proxy information the agent uses.
|
|
250
|
+
#
|
|
251
|
+
# @return [Hash]
|
|
252
|
+
# The proxy information.
|
|
253
|
+
#
|
|
254
|
+
# @see SessionCache#proxy
|
|
255
|
+
#
|
|
256
|
+
# @since 0.2.2
|
|
257
|
+
#
|
|
258
|
+
def proxy
|
|
259
|
+
@sessions.proxy
|
|
260
|
+
end
|
|
261
|
+
|
|
262
|
+
#
|
|
263
|
+
# Sets the proxy information that the agent uses.
|
|
264
|
+
#
|
|
265
|
+
# @param [Hash] new_proxy
|
|
266
|
+
# The new proxy information.
|
|
267
|
+
#
|
|
268
|
+
# @return [Hash]
|
|
269
|
+
# The new proxy information.
|
|
270
|
+
#
|
|
271
|
+
# @see SessionCache#proxy=
|
|
272
|
+
#
|
|
273
|
+
# @since 0.2.2
|
|
274
|
+
#
|
|
275
|
+
def proxy=(new_proxy)
|
|
276
|
+
@sessions.proxy = new_proxy
|
|
277
|
+
end
|
|
278
|
+
|
|
247
279
|
#
|
|
248
280
|
# Sets the history of URLs that were previously visited.
|
|
249
281
|
#
|
|
@@ -400,10 +432,11 @@ module Spidr
|
|
|
400
432
|
# Specifies whether the URL was enqueued, or ignored.
|
|
401
433
|
#
|
|
402
434
|
def enqueue(url)
|
|
403
|
-
|
|
404
|
-
url = URI(link) unless url.kind_of?(URI)
|
|
435
|
+
url = sanitize_url(url)
|
|
405
436
|
|
|
406
437
|
if (!(queued?(url)) && visit?(url))
|
|
438
|
+
link = url.to_s
|
|
439
|
+
|
|
407
440
|
begin
|
|
408
441
|
@every_url_blocks.each { |block| block.call(url) }
|
|
409
442
|
|
|
@@ -443,37 +476,51 @@ module Spidr
|
|
|
443
476
|
# The page for the response, or +nil+ if the request failed.
|
|
444
477
|
#
|
|
445
478
|
def get_page(url,&block)
|
|
446
|
-
url = URI(url.to_s)
|
|
479
|
+
url = URI(url.to_s)
|
|
447
480
|
|
|
448
|
-
|
|
449
|
-
|
|
481
|
+
prepare_request(url) do |session,path,headers|
|
|
482
|
+
new_page = Page.new(url,session.get(path,headers))
|
|
450
483
|
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
else
|
|
454
|
-
path = '/'
|
|
455
|
-
end
|
|
484
|
+
# save any new cookies
|
|
485
|
+
@cookies.from_page(new_page)
|
|
456
486
|
|
|
457
|
-
|
|
458
|
-
|
|
487
|
+
block.call(new_page) if block
|
|
488
|
+
return new_page
|
|
489
|
+
end
|
|
490
|
+
end
|
|
459
491
|
|
|
460
|
-
|
|
461
|
-
|
|
492
|
+
#
|
|
493
|
+
# Posts supplied form data and creates a new Page object from a given URL.
|
|
494
|
+
#
|
|
495
|
+
# @param [URI::HTTP] url
|
|
496
|
+
# The URL to request.
|
|
497
|
+
#
|
|
498
|
+
# @param [String] post_data
|
|
499
|
+
# Form option data.
|
|
500
|
+
#
|
|
501
|
+
# @yield [page]
|
|
502
|
+
# If a block is given, it will be passed the page that represents the
|
|
503
|
+
# response.
|
|
504
|
+
#
|
|
505
|
+
# @yieldparam [Page] page
|
|
506
|
+
# The page for the response.
|
|
507
|
+
#
|
|
508
|
+
# @return [Page, nil]
|
|
509
|
+
# The page for the response, or +nil+ if the request failed.
|
|
510
|
+
#
|
|
511
|
+
# @since 0.2.2
|
|
512
|
+
#
|
|
513
|
+
def post_page(url,post_data='',&block)
|
|
514
|
+
url = URI(url.to_s)
|
|
462
515
|
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
headers['User-Agent'] = @user_agent if @user_agent
|
|
466
|
-
headers['Referer'] = @referer if @referer
|
|
516
|
+
prepare_request(url) do |session,path,headers|
|
|
517
|
+
new_page = Page.new(url,session.post(path,post_data,headers))
|
|
467
518
|
|
|
468
|
-
|
|
519
|
+
# save any new cookies
|
|
520
|
+
@cookies.from_page(new_page)
|
|
469
521
|
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
end
|
|
473
|
-
rescue SystemCallError, Timeout::Error, Net::HTTPBadResponse, IOError
|
|
474
|
-
failed(url)
|
|
475
|
-
kill_session(url.scheme,host,port)
|
|
476
|
-
return nil
|
|
522
|
+
block.call(new_page) if block
|
|
523
|
+
return new_page
|
|
477
524
|
end
|
|
478
525
|
end
|
|
479
526
|
|
|
@@ -529,73 +576,66 @@ module Spidr
|
|
|
529
576
|
protected
|
|
530
577
|
|
|
531
578
|
#
|
|
532
|
-
#
|
|
533
|
-
# and
|
|
534
|
-
#
|
|
535
|
-
# @param [String] scheme
|
|
536
|
-
# The scheme of the URL, which will be requested later.
|
|
537
|
-
#
|
|
538
|
-
# @param [String] host
|
|
539
|
-
# The host that the session is needed with.
|
|
579
|
+
# Normalizes the request path and grabs a session to handle page
|
|
580
|
+
# get and post requests.
|
|
540
581
|
#
|
|
541
|
-
# @param [
|
|
542
|
-
# The
|
|
582
|
+
# @param [URI::HTTP] url
|
|
583
|
+
# The URL to request.
|
|
543
584
|
#
|
|
544
|
-
# @yield [
|
|
545
|
-
#
|
|
585
|
+
# @yield [session, path, headers]
|
|
586
|
+
# A block whose purpose is to make a page request.
|
|
546
587
|
#
|
|
547
588
|
# @yieldparam [Net::HTTP] session
|
|
548
|
-
#
|
|
549
|
-
#
|
|
550
|
-
def get_session(scheme,host,port,&block)
|
|
551
|
-
key = [scheme,host,port]
|
|
552
|
-
|
|
553
|
-
unless @sessions[key]
|
|
554
|
-
session = Net::HTTP::Proxy(
|
|
555
|
-
@proxy[:host],
|
|
556
|
-
@proxy[:port],
|
|
557
|
-
@proxy[:user],
|
|
558
|
-
@proxy[:password]
|
|
559
|
-
).new(host,port)
|
|
560
|
-
|
|
561
|
-
if scheme == 'https'
|
|
562
|
-
session.use_ssl = true
|
|
563
|
-
session.verify_mode = OpenSSL::SSL::VERIFY_NONE
|
|
564
|
-
end
|
|
565
|
-
|
|
566
|
-
@sessions[key] = session
|
|
567
|
-
end
|
|
568
|
-
|
|
569
|
-
session = @sessions[key]
|
|
570
|
-
block.call(session) if block
|
|
571
|
-
return session
|
|
572
|
-
end
|
|
573
|
-
|
|
574
|
-
#
|
|
575
|
-
# Destroys an HTTP session for the given scheme, host and port.
|
|
589
|
+
# An HTTP session object.
|
|
576
590
|
#
|
|
577
|
-
# @
|
|
578
|
-
#
|
|
591
|
+
# @yieldparam [String] path
|
|
592
|
+
# The normalized request path, including the URL query if present.
|
|
579
593
|
#
|
|
580
|
-
# @
|
|
581
|
-
#
|
|
594
|
+
# @yieldparam [Hash] headers
|
|
595
|
+
# A Hash of request header options.
|
|
582
596
|
#
|
|
583
|
-
# @
|
|
584
|
-
# The port that the session was connected to.
|
|
597
|
+
# @since 0.2.2
|
|
585
598
|
#
|
|
586
|
-
def
|
|
587
|
-
|
|
588
|
-
|
|
599
|
+
def prepare_request(url,&block)
|
|
600
|
+
host = url.host
|
|
601
|
+
port = url.port
|
|
589
602
|
|
|
590
|
-
|
|
591
|
-
|
|
592
|
-
|
|
593
|
-
|
|
603
|
+
unless url.path.empty?
|
|
604
|
+
path = url.path
|
|
605
|
+
else
|
|
606
|
+
path = '/'
|
|
594
607
|
end
|
|
595
608
|
|
|
596
|
-
|
|
597
|
-
|
|
598
|
-
|
|
609
|
+
# append the URL query to the path
|
|
610
|
+
path += "?#{url.query}" if url.query
|
|
611
|
+
|
|
612
|
+
begin
|
|
613
|
+
sleep(@delay) if @delay > 0
|
|
614
|
+
|
|
615
|
+
headers = {}
|
|
616
|
+
headers['User-Agent'] = @user_agent if @user_agent
|
|
617
|
+
headers['Referer'] = @referer if @referer
|
|
618
|
+
|
|
619
|
+
if (authorization = @authorized.for_url(url))
|
|
620
|
+
headers['Authorization'] = "Basic #{authorization}"
|
|
621
|
+
end
|
|
622
|
+
|
|
623
|
+
if (header_cookies = @cookies.for_host(url.host))
|
|
624
|
+
headers['Cookie'] = header_cookies
|
|
625
|
+
end
|
|
626
|
+
|
|
627
|
+
block.call(@sessions[url],path,headers)
|
|
628
|
+
rescue SystemCallError,
|
|
629
|
+
Timeout::Error,
|
|
630
|
+
SocketError,
|
|
631
|
+
Net::HTTPBadResponse,
|
|
632
|
+
IOError
|
|
633
|
+
|
|
634
|
+
@sessions.kill!(url)
|
|
635
|
+
|
|
636
|
+
failed(url)
|
|
637
|
+
return nil
|
|
638
|
+
end
|
|
599
639
|
end
|
|
600
640
|
|
|
601
641
|
#
|
|
@@ -633,8 +673,8 @@ module Spidr
|
|
|
633
673
|
# The URL to add to the failures list.
|
|
634
674
|
#
|
|
635
675
|
def failed(url)
|
|
636
|
-
@every_failed_url_blocks.each { |block| block.call(url) }
|
|
637
676
|
@failures << url
|
|
677
|
+
@every_failed_url_blocks.each { |block| block.call(url) }
|
|
638
678
|
return true
|
|
639
679
|
end
|
|
640
680
|
|