spidr 0.4.1 → 0.5.0
- checksums.yaml +7 -0
- data/ChangeLog.md +69 -54
- data/Gemfile +9 -5
- data/LICENSE.txt +1 -1
- data/README.md +34 -26
- data/Rakefile +4 -15
- data/gemspec.yml +3 -2
- data/lib/spidr/agent.rb +101 -44
- data/lib/spidr/{actions → agent}/actions.rb +32 -12
- data/lib/spidr/{events.rb → agent/events.rb} +4 -8
- data/lib/spidr/{filters.rb → agent/filters.rb} +14 -16
- data/lib/spidr/{sanitizers.rb → agent/sanitizers.rb} +5 -7
- data/lib/spidr/auth_store.rb +2 -2
- data/lib/spidr/cookie_jar.rb +2 -2
- data/lib/spidr/extensions/uri.rb +28 -16
- data/lib/spidr/page.rb +7 -11
- data/lib/spidr/{body.rb → page/body.rb} +1 -1
- data/lib/spidr/{headers.rb → page/headers.rb} +1 -1
- data/lib/spidr/{links.rb → page/links.rb} +43 -7
- data/lib/spidr/session_cache.rb +2 -2
- data/lib/spidr/spidr.rb +32 -5
- data/lib/spidr/version.rb +1 -1
- data/spec/agent/actions_spec.rb +60 -0
- data/spec/agent/filters_spec.rb +62 -0
- data/spec/agent/sanitizers_spec.rb +62 -0
- data/spec/agent_spec.rb +13 -13
- data/spec/auth_store_spec.rb +17 -17
- data/spec/cookie_jar_spec.rb +26 -26
- data/spec/extensions/uri_spec.rb +19 -9
- data/spec/helpers/history.rb +5 -5
- data/spec/helpers/wsoc.rb +2 -2
- data/spec/page_examples.rb +4 -4
- data/spec/page_spec.rb +28 -25
- data/spec/rules_spec.rb +14 -14
- data/spec/session_cache.rb +7 -7
- data/spec/spidr_spec.rb +10 -10
- metadata +37 -51
- data/lib/spidr/actions.rb +0 -2
- data/lib/spidr/actions/exceptions.rb +0 -4
- data/lib/spidr/actions/exceptions/action.rb +0 -9
- data/lib/spidr/actions/exceptions/paused.rb +0 -11
- data/lib/spidr/actions/exceptions/skip_link.rb +0 -12
- data/lib/spidr/actions/exceptions/skip_page.rb +0 -12
- data/spec/actions_spec.rb +0 -59
- data/spec/filters_spec.rb +0 -61
- data/spec/sanitizers_spec.rb +0 -61
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: 31e83cba8fd67a2527641b404f82773d60b5fb97
+  data.tar.gz: cbd735b652d209cd49a6990eedf3de6f7a22e385
+SHA512:
+  metadata.gz: d33742df9e9a4ec8090d4934de3562036e149195b3567ac1143c4637012876d86a18618e9f89251506ed8aa1d9c85cc18ed324774d4da29038e975827698f265
+  data.tar.gz: 24b08172be0184f7c68fbc63b31eaac55b0c55d70b35b8983fbbb1a3ce871e157b0bbf7d598625ef37ec3fe420c7372bc5fdaf7dd4b7131eac6e6e23e465e475
data/ChangeLog.md
CHANGED
@@ -1,21 +1,36 @@
+### 0.5.0 / 2016-01-03
+
+* Added support for respecting `robots.txt` files.
+
+      Spidr.site('http://reddit.com/', robots: true)
+
+* Added {Spidr.robots=} and {Spidr.robots?}.
+* Added {Spidr::Page#each_mailto} and {Spidr::Page#mailtos}.
+* Fixed a bug in {Spidr::Agent.host} that limited spidering to only `http://`
+  URIs.
+* Rescue `Zlib::Error` to catch `Zlib::DataError` and `Zlib::BufError`
+  exceptions caused by web servers that use incompatible gzip compression.
+* Fixed a bug in {URI.expand_path} where `/../foo` was being expanded to `foo`
+  instead of `/foo`.
+
 ### 0.4.1 / 2011-12-08
 
 * Catch `OpenSSL::SSL::SSLError` exceptions when initiated HTTPS Sessions.
 
 ### 0.4.0 / 2011-08-07
 
-* Added {Spidr::Headers#content_charset}.
-* Pass the Page `url` and `content_charset` to Nokogiri in {Spidr::Body#doc}.
+* Added `Spidr::Headers#content_charset`.
+* Pass the Page `url` and `content_charset` to Nokogiri in `Spidr::Body#doc`.
   This ensures that Nokogiri will preserve the body encoding.
-* Made {Spidr::Headers#is_content_type?} public.
-* Allow {Spidr::Headers#is_content_type?} to match the full Content-Type
+* Made `Spidr::Headers#is_content_type?` public.
+* Allow `Spidr::Headers#is_content_type?` to match the full Content-Type
   or the sub-type.
 
 ### 0.3.2 / 2011-06-20
 
-* Added separate intitialize methods for {Spidr::Actions}, {Spidr::Events},
-  {Spidr::Filters} and {Spidr::Sanitizers}.
-* Aliased {Spidr::Events#urls_like} to {Spidr::Events#every_url_like}.
+* Added separate intitialize methods for `Spidr::Actions`, `Spidr::Events`,
+  `Spidr::Filters` and `Spidr::Sanitizers`.
+* Aliased `Spidr::Events#urls_like` to `Spidr::Events#every_url_like`.
 * Reduce usage of `self.included` and `module_eval`.
 * Reduce usage of nested-blocks.
 * Reduce usage of `return`.
@@ -28,21 +43,21 @@
 
 * Switched from Jeweler to [Ore](http://github.com/ruby-ore/ore).
 * Split all header related methods out of {Spidr::Page} and into
-  {Spidr::Headers}.
+  `Spidr::Headers`.
 * Split all body related methods out of {Spidr::Page} and into
-  {Spidr::Body}.
+  `Spidr::Body`.
 * Split all link related methods out of {Spidr::Page} and into
-  {Spidr::Links}.
-* Added {Spidr::Headers#directory?}.
-* Added {Spidr::Headers#json?}.
-* Added {Spidr::Links#each_url}.
-* Added {Spidr::Links#each_link}.
-* Added {Spidr::Links#each_redirect}.
-* Added {Spidr::Links#each_meta_redirect}.
-* Aliased {Spidr::Headers#raw_cookie} to {Spidr::Headers#cookie}.
-* Aliased {Spidr::Body#to_s} to {Spidr::Body#body}.
-* Also check for `application/xml` in {Spidr::Headers#xml?}.
-* Catch all exceptions when merging URIs in {Spidr::Links#to_absolute}.
+  `Spidr::Links`.
+* Added `Spidr::Headers#directory?`.
+* Added `Spidr::Headers#json?`.
+* Added `Spidr::Links#each_url`.
+* Added `Spidr::Links#each_link`.
+* Added `Spidr::Links#each_redirect`.
+* Added `Spidr::Links#each_meta_redirect`.
+* Aliased `Spidr::Headers#raw_cookie` to `Spidr::Headers#cookie`.
+* Aliased `Spidr::Body#to_s` to `Spidr::Body#body`.
+* Also check for `application/xml` in `Spidr::Headers#xml?`.
+* Catch all exceptions when merging URIs in `Spidr::Links#to_absolute`.
 * Always prepend a `/` to all FTP URI paths. Fixes a Ruby 1.8 specific
   bug, where it expects an absolute path for all FTP URIs.
 * Refactored {URI.expand_path}.
@@ -73,10 +88,10 @@
 
 ### 0.2.4 / 2010-05-05
 
-* Added {Spidr::Filters#visit_urls}.
-* Added {Spidr::Filters#visit_urls_like}.
-* Added {Spidr::Filters#ignore_urls}.
-* Added {Spidr::Filters#ignore_urls_like}.
+* Added `Spidr::Filters#visit_urls`.
+* Added `Spidr::Filters#visit_urls_like`.
+* Added `Spidr::Filters#ignore_urls`.
+* Added `Spidr::Filters#ignore_urls_like`.
 * Added `Spidr::Page#is_content_type?`.
 * Default `Spidr::Page#body` to an empty String.
 * Default `Spidr::Page#content_type` to an empty String.
@@ -89,7 +104,7 @@
 
 * Migrated to Jeweler, for the packaging and releasing RubyGems.
 * Switched to MarkDown formatted YARD documentation.
-* Added {Spidr::Events#every_link}.
+* Added `Spidr::Events#every_link`.
 * Added {Spidr::SessionCache#active?}.
 * Added specs for {Spidr::SessionCache}.
 
@@ -102,7 +117,7 @@
 
 * Added `Spidr::Page#cookie`.
 * Added `Spidr::Page#cookies`.
 * Added `Spidr::Page#cookie_params`.
-* Added {Spidr::Sanitizers}.
+* Added `Spidr::Sanitizers`.
 * Added {Spidr::SessionCache}.
 * Added {Spidr::CookieJar} (thanks Nick Plante).
 * Added {Spidr::AuthStore} (thanks Nick Plante).
@@ -112,31 +127,31 @@
 
 ### 0.2.1 / 2009-11-25
 
-* Added {Spidr::Events#every_ok_page}.
-* Added {Spidr::Events#every_redirect_page}.
-* Added {Spidr::Events#every_timedout_page}.
-* Added {Spidr::Events#every_bad_request_page}.
-* Added {Spidr::Events#every_unauthorized_page}.
-* Added {Spidr::Events#every_forbidden_page}.
-* Added {Spidr::Events#every_missing_page}.
-* Added {Spidr::Events#every_internal_server_error_page}.
-* Added {Spidr::Events#every_txt_page}.
-* Added {Spidr::Events#every_html_page}.
-* Added {Spidr::Events#every_xml_page}.
-* Added {Spidr::Events#every_xsl_page}.
-* Added {Spidr::Events#every_doc}.
-* Added {Spidr::Events#every_html_doc}.
-* Added {Spidr::Events#every_xml_doc}.
-* Added {Spidr::Events#every_xsl_doc}.
-* Added {Spidr::Events#every_rss_doc}.
-* Added {Spidr::Events#every_atom_doc}.
-* Added {Spidr::Events#every_javascript_page}.
-* Added {Spidr::Events#every_css_page}.
-* Added {Spidr::Events#every_rss_page}.
-* Added {Spidr::Events#every_atom_page}.
-* Added {Spidr::Events#every_ms_word_page}.
-* Added {Spidr::Events#every_pdf_page}.
-* Added {Spidr::Events#every_zip_page}.
+* Added `Spidr::Events#every_ok_page`.
+* Added `Spidr::Events#every_redirect_page`.
+* Added `Spidr::Events#every_timedout_page`.
+* Added `Spidr::Events#every_bad_request_page`.
+* Added `Spidr::Events#every_unauthorized_page`.
+* Added `Spidr::Events#every_forbidden_page`.
+* Added `Spidr::Events#every_missing_page`.
+* Added `Spidr::Events#every_internal_server_error_page`.
+* Added `Spidr::Events#every_txt_page`.
+* Added `Spidr::Events#every_html_page`.
+* Added `Spidr::Events#every_xml_page`.
+* Added `Spidr::Events#every_xsl_page`.
+* Added `Spidr::Events#every_doc`.
+* Added `Spidr::Events#every_html_doc`.
+* Added `Spidr::Events#every_xml_doc`.
+* Added `Spidr::Events#every_xsl_doc`.
+* Added `Spidr::Events#every_rss_doc`.
+* Added `Spidr::Events#every_atom_doc`.
+* Added `Spidr::Events#every_javascript_page`.
+* Added `Spidr::Events#every_css_page`.
+* Added `Spidr::Events#every_rss_page`.
+* Added `Spidr::Events#every_atom_page`.
+* Added `Spidr::Events#every_ms_word_page`.
+* Added `Spidr::Events#every_pdf_page`.
+* Added `Spidr::Events#every_zip_page`.
 * Fixed a bug where {Spidr::Agent#delay} was not being used to delay
   requesting pages.
 * Spider `link` and `script` tags in HTML pages (thanks Nick Plante).
@@ -160,11 +175,11 @@
 
 * Aliased `Spidr::Page#forbidden?` to `Spidr::Page#is_forbidden?`.
 * Aliased `Spidr::Page#missing?` to `Spidr::Page#is_missing?`.
 * Split URL filtering code out of {Spidr::Agent} and into
-  {Spidr::Filters}.
+  `Spidr::Filters`.
 * Split URL / Page event code out of {Spidr::Agent} and into
-  {Spidr::Events}.
+  `Spidr::Events`.
 * Split pause! / continue! / skip_link! / skip_page! methods out of
-  {Spidr::Agent} and into {Spidr::Actions}.
+  {Spidr::Agent} and into `Spidr::Actions`.
 * Fixed a bug in `Spidr::Page#code`, where it was not returning an Integer.
 * Make sure `Spidr::Page#doc` returns `Nokogiri::XML::Document` objects for
   RSS/RDF/Atom pages as well.
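A minimal sketch of the new `robots.txt` support described above, combining the global toggle with the per-call option; it assumes the optional `robots` gem is installed, since `Agent#initialize` raises an `ArgumentError` otherwise (see the `agent.rb` diff below):

    require 'spidr'

    # Enable robots.txt enforcement for all new agents, so Spidr.robots?
    # returns true; equivalent to passing `robots: true` on each call.
    Spidr.robots = true

    Spidr.site('http://reddit.com/') do |spider|
      # URLs disallowed by the site's robots.txt are never enqueued.
      spider.every_url { |url| puts url }
    end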
data/Gemfile
CHANGED
@@ -6,12 +6,16 @@ end
 
 gemspec
 
+
+gem 'robots', group: :robots
+
 group :development do
-  gem 'rake'
+  gem 'rake'
+  gem 'rubygems-tasks', '~> 0.2'
 
-  gem '
-  gem 'rspec',
-  gem 'wsoc', '~> 0.1.3'
+  gem 'wsoc',  '~> 0.1.3'
+  gem 'rspec', '~> 3.0'
 
-  gem 'kramdown',
+  gem 'kramdown', '~> 0.12'
+  gem 'yard',     '~> 0.8'
 end
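`robots` is declared in its own group rather than as a runtime dependency, and `agent.rb` (below) only requires it inside a `begin`/`rescue LoadError` guard. A downstream application that wants `robots.txt` enforcement therefore adds the gem itself; a hypothetical consumer Gemfile (the `~> 0.5` constraint is illustrative):

    source 'https://rubygems.org'

    gem 'spidr', '~> 0.5'
    gem 'robots' # optional: only needed when passing `robots: true`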
data/LICENSE.txt
CHANGED
data/README.md
CHANGED
@@ -1,8 +1,8 @@
 # Spidr
 
-* [Homepage](
-* [Source](
-* [Issues](
+* [Homepage](https://github.com/postmodern/spidr#readme)
+* [Source](https://github.com/postmodern/spidr)
+* [Issues](https://github.com/postmodern/spidr/issues)
 * [Mailing List](http://groups.google.com/group/spidr)
 * [IRC](http://webchat.freenode.net/?channels=spidr&uio=d4)
 
@@ -15,9 +15,9 @@ and easy to use.
 ## Features
 
 * Follows:
-  * a tags.
-  * iframe tags.
-  * frame tags.
+  * `a` tags.
+  * `iframe` tags.
+  * `frame` tags.
 * Cookie protected links.
 * HTTP 300, 301, 302, 303 and 307 Redirects.
 * Meta-Refresh Redirects.
@@ -51,36 +51,40 @@ Start spidering from a URL:
 
 Spider a host:
 
-    Spidr.host('
+    Spidr.host('solnic.eu')
 
 Spider a site:
 
-    Spidr.site('http://rubyflow.com/')
+    Spidr.site('http://www.rubyflow.com/')
 
 Spider multiple hosts:
 
     Spidr.start_at(
       'http://company.com/',
-      :hosts => [
+      hosts: [
         'company.com',
-        /host\d\.company\.com/
+        /host[\d]+\.company\.com/
       ]
     )
 
 Do not spider certain links:
 
-    Spidr.site('http://
+    Spidr.site('http://company.com/', ignore_links: [%{^/blog/}])
 
 Do not spider links on certain ports:
 
+    Spidr.site('http://company.com/', ignore_ports: [8000, 8010, 8080])
+
+Do not spider links blacklisted in robots.txt:
+
     Spidr.site(
-      'http://
-      :ignore_ports => [8000, 8010, 8080]
+      'http://company.com/',
+      robots: true
     )
 
 Print out visited URLs:
 
-    Spidr.site('http://rubyinside.com/') do |spider|
+    Spidr.site('http://www.rubyinside.com/') do |spider|
       spider.every_url { |url| puts url }
     end
 
@@ -96,7 +100,7 @@ Build a URL map of a site:
 
 Print out the URLs that could not be requested:
 
-    Spidr.site('http://
+    Spidr.site('http://company.com/') do |spider|
       spider.every_failed_url { |url| puts url }
     end
 
@@ -118,22 +122,22 @@ Finds all pages which have broken links:
 
 Search HTML and XML pages:
 
-    Spidr.site('http://company.
+    Spidr.site('http://company.com/') do |spider|
       spider.every_page do |page|
-        puts "
+        puts ">>> #{page.url}"
 
         page.search('//meta').each do |meta|
           name = (meta.attributes['name'] || meta.attributes['http-equiv'])
           value = meta.attributes['content']
 
-          puts "
+          puts "  #{name} = #{value}"
         end
       end
     end
 
 Print out the titles from every page:
 
-    Spidr.site('
+    Spidr.site('https://www.ruby-lang.org/') do |spider|
       spider.every_html_page do |page|
         puts page.title
       end
@@ -143,7 +147,7 @@ Find what kinds of web servers a host is using, by accessing the headers:
 
     servers = Set[]
 
-    Spidr.host('
+    Spidr.host('company.com') do |spider|
       spider.all_headers do |headers|
         servers << headers['server']
       end
@@ -151,7 +155,7 @@ Find what kinds of web servers a host is using, by accessing the headers:
 
 Pause the spider on a forbidden page:
 
-    spider = Spidr.host('
+    spider = Spidr.host('company.com') do |spider|
       spider.every_forbidden_page do |page|
         spider.pause!
       end
@@ -159,7 +163,7 @@ Pause the spider on a forbidden page:
 
 Skip the processing of a page:
 
-    Spidr.host('
+    Spidr.host('company.com') do |spider|
       spider.every_missing_page do |page|
         spider.skip_page!
       end
@@ -167,7 +171,7 @@ Skip the processing of a page:
 
 Skip the processing of links:
 
-    Spidr.host('
+    Spidr.host('company.com') do |spider|
      spider.every_url do |url|
        if url.path.split('/').find { |dir| dir.to_i > 1000 }
          spider.skip_link!
@@ -177,14 +181,18 @@ Skip the processing of links:
 
 ## Requirements
 
-* [
+* [ruby] >= 1.9.1
+* [nokogiri] ~> 1.3
 
 ## Install
 
-    $
+    $ gem install spidr
 
 ## License
 
-Copyright (c) 2008-2011 Hal Brodigan
+Copyright (c) 2008-2016 Hal Brodigan
 
 See {file:LICENSE.txt} for license information.
+
+[ruby]: https://www.ruby-lang.org/
+[nokogiri]: http://www.nokogiri.org/
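One 0.5.0 addition the README does not show yet is the `mailto:` helpers, {Spidr::Page#each_mailto} and {Spidr::Page#mailtos}, from the ChangeLog above. Based on those entries, collecting addresses would presumably look like:

    require 'spidr'
    require 'set'

    emails = Set[]

    Spidr.site('http://company.com/') do |spider|
      spider.every_html_page do |page|
        # each_mailto yields each address found in mailto: links on the page.
        page.each_mailto { |email| emails << email }
      end
    end

    puts emails.to_a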
data/Rakefile
CHANGED
@@ -1,25 +1,14 @@
 require 'rubygems'
 
 begin
-  require 'bundler'
+  require 'bundler/setup'
 rescue LoadError => e
-  STDERR.puts e.message
-  STDERR.puts "Run `gem install bundler` to install Bundler."
-  exit e.status_code
-end
-
-begin
-  Bundler.setup(:development)
-rescue Bundler::BundlerError => e
-  STDERR.puts e.message
-  STDERR.puts "Run `bundle install` to install missing gems"
-  exit e.status_code
+  abort e.message
 end
 
 require 'rake'
-
-require 'ore/tasks'
-Ore::Tasks.new
+require 'rubygems/tasks'
+Gem::Tasks.new
 
 require 'rspec/core/rake_task'
 RSpec::Core::RakeTask.new
data/gemspec.yml
CHANGED
@@ -8,12 +8,13 @@ description:
 license: MIT
 authors: Postmodern
 email: postmodern.mod3@gmail.com
-homepage:
+homepage: https://github.com/postmodern/spidr#readme
 has_yard: true
 
+required_ruby_version: ">= 1.9.1"
+
 dependencies:
   nokogiri: ~> 1.3
 
 development_dependencies:
   bundler: ~> 1.0
-  yard: ~> 0.7
data/lib/spidr/agent.rb
CHANGED
@@ -1,7 +1,7 @@
-require 'spidr/sanitizers'
-require 'spidr/filters'
-require 'spidr/events'
-require 'spidr/actions'
+require 'spidr/agent/sanitizers'
+require 'spidr/agent/filters'
+require 'spidr/agent/events'
+require 'spidr/agent/actions'
 require 'spidr/page'
 require 'spidr/session_cache'
 require 'spidr/cookie_jar'
@@ -12,48 +12,72 @@ require 'openssl'
 require 'net/http'
 require 'set'
 
+begin
+  require 'robots'
+rescue LoadError
+end
+
 module Spidr
   class Agent
 
-    include Sanitizers
-    include Filters
-    include Events
-    include Actions
-
     # HTTP Host Header to use
+    #
+    # @return [String]
     attr_accessor :host_header
 
     # HTTP Host Headers to use for specific hosts
+    #
+    # @return [Hash{String,Regexp => String}]
     attr_reader :host_headers
 
     # User-Agent to use
+    #
+    # @return [String]
     attr_accessor :user_agent
 
     # HTTP Authentication credentials
+    #
+    # @return [AuthStore]
     attr_accessor :authorized
 
     # Referer to use
+    #
+    # @return [String]
     attr_accessor :referer
 
     # Delay in between fetching pages
+    #
+    # @return [Integer]
     attr_accessor :delay
 
     # History containing visited URLs
+    #
+    # @return [Set<URI::HTTP>]
     attr_reader :history
 
     # List of unreachable URLs
+    #
+    # @return [Set<URI::HTTP>]
     attr_reader :failures
 
     # Queue of URLs to visit
+    #
+    # @return [Array<URI::HTTP>]
     attr_reader :queue
 
     # Cached cookies
+    #
+    # @return [CookieJar]
     attr_reader :cookies
 
     # Maximum depth
+    #
+    # @return [Integer]
     attr_reader :max_depth
 
     # The visited URLs and their depth within a site
+    #
+    # @return [Hash{URI::HTTP => Integer}]
     attr_reader :levels
 
     #
@@ -101,6 +125,9 @@ module Spidr
     # @option options [Integer] :max_depth
     #   The maximum link depth to follow.
     #
+    # @option options [Boolean] :robots (Spidr.robots?)
+    #   Specifies whether `robots.txt` should be honored.
+    #
     # @yield [agent]
     #   If a block is given, it will be passed the newly created agent
     #   for further configuration.
@@ -108,8 +135,13 @@ module Spidr
     # @yieldparam [Agent] agent
     #   The newly created agent.
     #
+    # @see #initialize_sanitizers
+    # @see #initialize_filters
+    # @see #initialize_actions
+    # @see #initialize_events
+    #
     def initialize(options={})
-      @host_header
+      @host_header  = options[:host_header]
       @host_headers = {}
 
       if options[:host_headers]
@@ -117,21 +149,29 @@ module Spidr
       end
 
       @user_agent = options.fetch(:user_agent,Spidr.user_agent)
-      @referer
+      @referer    = options[:referer]
 
-      @sessions
-      @cookies
+      @sessions   = SessionCache.new(options.fetch(:proxy,Spidr.proxy))
+      @cookies    = CookieJar.new
       @authorized = AuthStore.new
 
-      @running
-      @delay
-      @history
+      @running  = false
+      @delay    = options.fetch(:delay,0)
+      @history  = Set[]
       @failures = Set[]
-      @queue
+      @queue    = []
 
-      @levels
+      @levels    = Hash.new(0)
      @max_depth = options[:max_depth]
 
+      if options.fetch(:robots,Spidr.robots?)
+        unless Object.const_defined?(:Robots)
+          raise(ArgumentError,":robots option given but unable to require 'robots' gem")
+        end
+
+        @robots = Robots.new(@user_agent)
+      end
+
       initialize_sanitizers(options)
       initialize_filters(options)
       initialize_actions(options)
@@ -156,6 +196,9 @@ module Spidr
     # @yieldparam [Agent] agent
     #   The newly created agent.
     #
+    # @see #initialize
+    # @see #start_at
+    #
     def self.start_at(url,options={},&block)
       agent = new(options,&block)
       agent.start_at(url)
@@ -177,17 +220,19 @@ module Spidr
     # @yieldparam [Agent] agent
     #   The newly created agent.
     #
+    # @see #initialize
+    #
     def self.site(url,options={},&block)
       url = URI(url.to_s) unless url.kind_of?(URI)
 
-      agent = new(options.merge(:host => url.host),&block)
+      agent = new(options.merge(host: url.host),&block)
       agent.start_at(url)
     end
 
     #
     # Creates a new agent and spiders the given host.
     #
-    # @param [String]
+    # @param [String] name
     #   The host-name to spider.
     #
     # @param [Hash] options
@@ -200,8 +245,11 @@ module Spidr
     # @yieldparam [Agent] agent
     #   The newly created agent.
     #
+    # @see #initialize
+    #
     def self.host(name,options={},&block)
-
+      agent = new(options.merge(host: name),&block)
+      agent.start_at(URI::HTTP.build(host: name, path: '/'))
     end
 
     #
@@ -315,11 +363,9 @@ module Spidr
       @history.clear
 
       new_history.each do |url|
-        @history << unless url.kind_of?(URI)
-                      URI(url.to_s)
-                    else
-                      url
-                    end
+        url = URI(url.to_s) unless url.kind_of?(URI)
+
+        @history << url
       end
 
       return @history
@@ -362,10 +408,23 @@ module Spidr
       return @history.include?(url)
     end
 
+    #
+    # Determines whether a URL is allowed by the robot policy.
+    #
+    # @param [URI::HTTP, String] url
+    #   The URL to check.
+    #
+    # @return [Boolean]
+    #   Specifies whether a URL is allowed by the robot policy.
+    #
+    def robot_allowed?(url)
+      @robots ? @robots.allowed?(url) : true
+    end
+
     #
     # Sets the list of failed URLs.
     #
-    # @param [#each]
+    # @param [#each] new_failures
     #   The new list of failed URLs.
     #
     # @return [Array<URI::HTTP>]
@@ -378,11 +437,9 @@ module Spidr
       @failures.clear
 
       new_failures.each do |url|
-        @failures << unless url.kind_of?(URI)
-                       URI(url.to_s)
-                     else
-                       url
-                     end
+        url = URI(url.to_s) unless url.kind_of?(URI)
+
+        @failures << url
       end
 
       return @failures
@@ -408,7 +465,7 @@ module Spidr
     #
     # Sets the queue of URLs to visit.
     #
-    # @param [#each]
+    # @param [#each] new_queue
     #   The new list of URLs to visit.
     #
     # @return [Array<URI::HTTP>]
@@ -421,11 +478,9 @@ module Spidr
       @queue.clear
 
       new_queue.each do |url|
-        @queue << unless url.kind_of?(URI)
-                    URI(url.to_s)
-                  else
-                    url
-                  end
+        url = URI(url.to_s) unless url.kind_of?(URI)
+
+        @queue << url
       end
 
       return @queue
@@ -542,7 +597,7 @@ module Spidr
     # @since 0.2.2
     #
     def post_page(url,post_data='')
-      url = URI(url.to_s)
+      url = URI(url.to_s) unless url.kind_of?(URI)
 
       prepare_request(url) do |session,path,headers|
         new_page = Page.new(url,session.post(path,post_data,headers))
@@ -616,7 +671,7 @@ module Spidr
     #   the `queue` of the agent.
     #
     def to_hash
-      {:history => @history, :queue => @queue}
+      {history: @history, queue: @queue}
     end
 
     protected
@@ -666,9 +721,9 @@ module Spidr
         end
       end
 
-      headers['Host']
+      headers['Host'] ||= @host_header if @host_header
       headers['User-Agent'] = @user_agent if @user_agent
-      headers['Referer']
+      headers['Referer'] = @referer if @referer
 
       if (authorization = @authorized.for_url(url))
         headers['Authorization'] = "Basic #{authorization}"
@@ -687,7 +742,8 @@ module Spidr
            SocketError,
           IOError,
           OpenSSL::SSL::SSLError,
-           Net::HTTPBadResponse
+           Net::HTTPBadResponse,
+           Zlib::Error
 
       @sessions.kill!(url)
 
@@ -722,7 +778,8 @@ module Spidr
       visit_port?(url.port) &&
      visit_link?(url.to_s) &&
      visit_url?(url) &&
-      visit_ext?(url.path)
+      visit_ext?(url.path) &&
+      robot_allowed?(url.to_s)
     end
 
     #