spidr 0.4.1 → 0.5.0
This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
- checksums.yaml +7 -0
- data/ChangeLog.md +69 -54
- data/Gemfile +9 -5
- data/LICENSE.txt +1 -1
- data/README.md +34 -26
- data/Rakefile +4 -15
- data/gemspec.yml +3 -2
- data/lib/spidr/agent.rb +101 -44
- data/lib/spidr/{actions → agent}/actions.rb +32 -12
- data/lib/spidr/{events.rb → agent/events.rb} +4 -8
- data/lib/spidr/{filters.rb → agent/filters.rb} +14 -16
- data/lib/spidr/{sanitizers.rb → agent/sanitizers.rb} +5 -7
- data/lib/spidr/auth_store.rb +2 -2
- data/lib/spidr/cookie_jar.rb +2 -2
- data/lib/spidr/extensions/uri.rb +28 -16
- data/lib/spidr/page.rb +7 -11
- data/lib/spidr/{body.rb → page/body.rb} +1 -1
- data/lib/spidr/{headers.rb → page/headers.rb} +1 -1
- data/lib/spidr/{links.rb → page/links.rb} +43 -7
- data/lib/spidr/session_cache.rb +2 -2
- data/lib/spidr/spidr.rb +32 -5
- data/lib/spidr/version.rb +1 -1
- data/spec/agent/actions_spec.rb +60 -0
- data/spec/agent/filters_spec.rb +62 -0
- data/spec/agent/sanitizers_spec.rb +62 -0
- data/spec/agent_spec.rb +13 -13
- data/spec/auth_store_spec.rb +17 -17
- data/spec/cookie_jar_spec.rb +26 -26
- data/spec/extensions/uri_spec.rb +19 -9
- data/spec/helpers/history.rb +5 -5
- data/spec/helpers/wsoc.rb +2 -2
- data/spec/page_examples.rb +4 -4
- data/spec/page_spec.rb +28 -25
- data/spec/rules_spec.rb +14 -14
- data/spec/session_cache.rb +7 -7
- data/spec/spidr_spec.rb +10 -10
- metadata +37 -51
- data/lib/spidr/actions.rb +0 -2
- data/lib/spidr/actions/exceptions.rb +0 -4
- data/lib/spidr/actions/exceptions/action.rb +0 -9
- data/lib/spidr/actions/exceptions/paused.rb +0 -11
- data/lib/spidr/actions/exceptions/skip_link.rb +0 -12
- data/lib/spidr/actions/exceptions/skip_page.rb +0 -12
- data/spec/actions_spec.rb +0 -59
- data/spec/filters_spec.rb +0 -61
- data/spec/sanitizers_spec.rb +0 -61
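Most of the churn in this release is structural: the `Actions`, `Events`, `Filters`, and `Sanitizers` mixin files move under `lib/spidr/agent/`, and the `body`, `headers`, and `links` page modules move under `lib/spidr/page/`. A plain `require 'spidr'` is unaffected, but code that loaded these files directly would track the renames above, roughly:

    # Pre-0.5.0 paths (removed in this release):
    require 'spidr/actions'
    require 'spidr/filters'

    # 0.5.0 paths, per the renames listed above:
    require 'spidr/agent/actions'
    require 'spidr/agent/filters'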
checksums.yaml
ADDED

@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: 31e83cba8fd67a2527641b404f82773d60b5fb97
+  data.tar.gz: cbd735b652d209cd49a6990eedf3de6f7a22e385
+SHA512:
+  metadata.gz: d33742df9e9a4ec8090d4934de3562036e149195b3567ac1143c4637012876d86a18618e9f89251506ed8aa1d9c85cc18ed324774d4da29038e975827698f265
+  data.tar.gz: 24b08172be0184f7c68fbc63b31eaac55b0c55d70b35b8983fbbb1a3ce871e157b0bbf7d598625ef37ec3fe420c7372bc5fdaf7dd4b7131eac6e6e23e465e475
data/ChangeLog.md
CHANGED

@@ -1,21 +1,36 @@
+### 0.5.0 / 2016-01-03
+
+* Added support for respecting `robots.txt` files.
+
+      Spidr.site('http://reddit.com/', robots: true)
+
+* Added {Spidr.robots=} and {Spidr.robots?}.
+* Added {Spidr::Page#each_mailto} and {Spidr::Page#mailtos}.
+* Fixed a bug in {Spidr::Agent.host} that limited spidering to only `http://`
+  URIs.
+* Rescue `Zlib::Error` to catch `Zlib::DataError` and `Zlib::BufError`
+  exceptions caused by web servers that use incompatible gzip compression.
+* Fixed a bug in {URI.expand_path} where `/../foo` was being expanded to `foo`
+  instead of `/foo`.
+
 ### 0.4.1 / 2011-12-08
 
 * Catch `OpenSSL::SSL::SSLError` exceptions when initiated HTTPS Sessions.
 
 ### 0.4.0 / 2011-08-07
 
-* Added {Spidr::Headers#content_charset}.
-* Pass the Page `url` and `content_charset` to Nokogiri in {Spidr::Body#doc}.
+* Added `Spidr::Headers#content_charset`.
+* Pass the Page `url` and `content_charset` to Nokogiri in `Spidr::Body#doc`.
   This ensures that Nokogiri will preserve the body encoding.
-* Made {Spidr::Headers#is_content_type?} public.
-* Allow {Spidr::Headers#is_content_type?} to match the full Content-Type
+* Made `Spidr::Headers#is_content_type?` public.
+* Allow `Spidr::Headers#is_content_type?` to match the full Content-Type
   or the sub-type.
 
 ### 0.3.2 / 2011-06-20
 
-* Added separate intitialize methods for {Spidr::Actions}, {Spidr::Events},
-  {Spidr::Filters} and {Spidr::Sanitizers}.
-* Aliased {Spidr::Events#urls_like} to {Spidr::Events#every_url_like}.
+* Added separate intitialize methods for `Spidr::Actions`, `Spidr::Events`,
+  `Spidr::Filters` and `Spidr::Sanitizers`.
+* Aliased `Spidr::Events#urls_like` to `Spidr::Events#every_url_like`.
 * Reduce usage of `self.included` and `module_eval`.
 * Reduce usage of nested-blocks.
 * Reduce usage of `return`.

@@ -28,21 +43,21 @@
 
 * Switched from Jeweler to [Ore](http://github.com/ruby-ore/ore).
 * Split all header related methods out of {Spidr::Page} and into
-  {Spidr::Headers}.
+  `Spidr::Headers`.
 * Split all body related methods out of {Spidr::Page} and into
-  {Spidr::Body}.
+  `Spidr::Body`.
 * Split all link related methods out of {Spidr::Page} and into
-  {Spidr::Links}.
-* Added {Spidr::Headers#directory?}.
-* Added {Spidr::Headers#json?}.
-* Added {Spidr::Links#each_url}.
-* Added {Spidr::Links#each_link}.
-* Added {Spidr::Links#each_redirect}.
-* Added {Spidr::Links#each_meta_redirect}.
-* Aliased {Spidr::Headers#raw_cookie} to {Spidr::Headers#cookie}.
-* Aliased {Spidr::Body#to_s} to {Spidr::Body#body}.
-* Also check for `application/xml` in {Spidr::Headers#xml?}.
-* Catch all exceptions when merging URIs in {Spidr::Links#to_absolute}.
+  `Spidr::Links`.
+* Added `Spidr::Headers#directory?`.
+* Added `Spidr::Headers#json?`.
+* Added `Spidr::Links#each_url`.
+* Added `Spidr::Links#each_link`.
+* Added `Spidr::Links#each_redirect`.
+* Added `Spidr::Links#each_meta_redirect`.
+* Aliased `Spidr::Headers#raw_cookie` to `Spidr::Headers#cookie`.
+* Aliased `Spidr::Body#to_s` to `Spidr::Body#body`.
+* Also check for `application/xml` in `Spidr::Headers#xml?`.
+* Catch all exceptions when merging URIs in `Spidr::Links#to_absolute`.
 * Always prepend a `/` to all FTP URI paths. Fixes a Ruby 1.8 specific
   bug, where it expects an absolute path for all FTP URIs.
 * Refactored {URI.expand_path}.

@@ -73,10 +88,10 @@
 
 ### 0.2.4 / 2010-05-05
 
-* Added {Spidr::Filters#visit_urls}.
-* Added {Spidr::Filters#visit_urls_like}.
-* Added {Spidr::Filters#ignore_urls}.
-* Added {Spidr::Filters#ignore_urls_like}.
+* Added `Spidr::Filters#visit_urls`.
+* Added `Spidr::Filters#visit_urls_like`.
+* Added `Spidr::Filters#ignore_urls`.
+* Added `Spidr::Filters#ignore_urls_like`.
 * Added `Spidr::Page#is_content_type?`.
 * Default `Spidr::Page#body` to an empty String.
 * Default `Spidr::Page#content_type` to an empty String.

@@ -89,7 +104,7 @@
 
 * Migrated to Jeweler, for the packaging and releasing RubyGems.
 * Switched to MarkDown formatted YARD documentation.
-* Added {Spidr::Events#every_link}.
+* Added `Spidr::Events#every_link`.
 * Added {Spidr::SessionCache#active?}.
 * Added specs for {Spidr::SessionCache}.
 

@@ -102,7 +117,7 @@
 * Added `Spidr::Page#cookie`.
 * Added `Spidr::Page#cookies`.
 * Added `Spidr::Page#cookie_params`.
-* Added {Spidr::Sanitizers}.
+* Added `Spidr::Sanitizers`.
 * Added {Spidr::SessionCache}.
 * Added {Spidr::CookieJar} (thanks Nick Plante).
 * Added {Spidr::AuthStore} (thanks Nick Plante).

@@ -112,31 +127,31 @@
 
 ### 0.2.1 / 2009-11-25
 
-* Added {Spidr::Events#every_ok_page}.
-* Added {Spidr::Events#every_redirect_page}.
-* Added {Spidr::Events#every_timedout_page}.
-* Added {Spidr::Events#every_bad_request_page}.
-* Added {Spidr::Events#every_unauthorized_page}.
-* Added {Spidr::Events#every_forbidden_page}.
-* Added {Spidr::Events#every_missing_page}.
-* Added {Spidr::Events#every_internal_server_error_page}.
-* Added {Spidr::Events#every_txt_page}.
-* Added {Spidr::Events#every_html_page}.
-* Added {Spidr::Events#every_xml_page}.
-* Added {Spidr::Events#every_xsl_page}.
-* Added {Spidr::Events#every_doc}.
-* Added {Spidr::Events#every_html_doc}.
-* Added {Spidr::Events#every_xml_doc}.
-* Added {Spidr::Events#every_xsl_doc}.
-* Added {Spidr::Events#every_rss_doc}.
-* Added {Spidr::Events#every_atom_doc}.
-* Added {Spidr::Events#every_javascript_page}.
-* Added {Spidr::Events#every_css_page}.
-* Added {Spidr::Events#every_rss_page}.
-* Added {Spidr::Events#every_atom_page}.
-* Added {Spidr::Events#every_ms_word_page}.
-* Added {Spidr::Events#every_pdf_page}.
-* Added {Spidr::Events#every_zip_page}.
+* Added `Spidr::Events#every_ok_page`.
+* Added `Spidr::Events#every_redirect_page`.
+* Added `Spidr::Events#every_timedout_page`.
+* Added `Spidr::Events#every_bad_request_page`.
+* Added `Spidr::Events#every_unauthorized_page`.
+* Added `Spidr::Events#every_forbidden_page`.
+* Added `Spidr::Events#every_missing_page`.
+* Added `Spidr::Events#every_internal_server_error_page`.
+* Added `Spidr::Events#every_txt_page`.
+* Added `Spidr::Events#every_html_page`.
+* Added `Spidr::Events#every_xml_page`.
+* Added `Spidr::Events#every_xsl_page`.
+* Added `Spidr::Events#every_doc`.
+* Added `Spidr::Events#every_html_doc`.
+* Added `Spidr::Events#every_xml_doc`.
+* Added `Spidr::Events#every_xsl_doc`.
+* Added `Spidr::Events#every_rss_doc`.
+* Added `Spidr::Events#every_atom_doc`.
+* Added `Spidr::Events#every_javascript_page`.
+* Added `Spidr::Events#every_css_page`.
+* Added `Spidr::Events#every_rss_page`.
+* Added `Spidr::Events#every_atom_page`.
+* Added `Spidr::Events#every_ms_word_page`.
+* Added `Spidr::Events#every_pdf_page`.
+* Added `Spidr::Events#every_zip_page`.
 * Fixed a bug where {Spidr::Agent#delay} was not being used to delay
   requesting pages.
 * Spider `link` and `script` tags in HTML pages (thanks Nick Plante).

@@ -160,11 +175,11 @@
 * Aliased `Spidr::Page#forbidden?` to `Spidr::Page#is_forbidden?`.
 * Aliased `Spidr::Page#missing?` to `Spidr::Page#is_missing?`.
 * Split URL filtering code out of {Spidr::Agent} and into
-  {Spidr::Filters}.
+  `Spidr::Filters`.
 * Split URL / Page event code out of {Spidr::Agent} and into
-  {Spidr::Events}.
+  `Spidr::Events`.
 * Split pause! / continue! / skip_link! / skip_page! methods out of
-  {Spidr::Agent} and into {Spidr::Actions}.
+  {Spidr::Agent} and into `Spidr::Actions`.
 * Fixed a bug in `Spidr::Page#code`, where it was not returning an Integer.
 * Make sure `Spidr::Page#doc` returns `Nokogiri::XML::Document` objects for
   RSS/RDF/Atom pages as well.
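Taken together, the 0.5.0 entries add a small new public surface. A minimal sketch of the additions in use (the URL is a placeholder; `robots: true` assumes the optional `robots` gem is installed):

    require 'spidr'

    # New global default in 0.5.0; the per-call `robots: true`
    # option shown in the ChangeLog does the same for one spider.
    Spidr.robots = true

    emails = []

    Spidr.site('http://example.com/') do |spider|
      spider.every_page do |page|
        # Page#each_mailto and Page#mailtos are new in 0.5.0.
        page.each_mailto { |mailto| emails << mailto }
      end
    end

    puts emails.uniq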
data/Gemfile
CHANGED

@@ -6,12 +6,16 @@ end
 
 gemspec
 
+
+gem 'robots', group: :robots
+
 group :development do
-  gem 'rake'
+  gem 'rake'
+  gem 'rubygems-tasks', '~> 0.2'
 
-  gem '
-  gem 'rspec',
-  gem 'wsoc', '~> 0.1.3'
+  gem 'wsoc', '~> 0.1.3'
+  gem 'rspec', '~> 3.0'
 
-  gem 'kramdown',
+  gem 'kramdown', '~> 0.12'
+  gem 'yard', '~> 0.8'
 end
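Note that `robots` goes into its own Bundler group instead of the gemspec, keeping it an optional dependency. The library pairs this with the guarded `require 'robots'` visible in the `lib/spidr/agent.rb` hunks below; a condensed sketch of the pattern (the `robots_checker` helper is illustrative, not part of the gem):

    # Soft-require: a missing gem is not an error by itself.
    begin
      require 'robots'
    rescue LoadError
    end

    # Fail loudly only when robots.txt support is actually requested.
    def robots_checker(user_agent)
      unless Object.const_defined?(:Robots)
        raise(ArgumentError,
              "robots.txt support requested but the 'robots' gem is not installed")
      end

      Robots.new(user_agent)
    end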
data/LICENSE.txt
CHANGED
data/README.md
CHANGED

@@ -1,8 +1,8 @@
 # Spidr
 
-* [Homepage](
-* [Source](
-* [Issues](
+* [Homepage](https://github.com/postmodern/spidr#readme)
+* [Source](https://github.com/postmodern/spidr)
+* [Issues](https://github.com/postmodern/spidr/issues)
 * [Mailing List](http://groups.google.com/group/spidr)
 * [IRC](http://webchat.freenode.net/?channels=spidr&uio=d4)
 

@@ -15,9 +15,9 @@ and easy to use.
 ## Features
 
 * Follows:
-  * a tags.
-  * iframe tags.
-  * frame tags.
+  * `a` tags.
+  * `iframe` tags.
+  * `frame` tags.
 * Cookie protected links.
 * HTTP 300, 301, 302, 303 and 307 Redirects.
 * Meta-Refresh Redirects.

@@ -51,36 +51,40 @@ Start spidering from a URL:
 
 Spider a host:
 
-    Spidr.host('
+    Spidr.host('solnic.eu')
 
 Spider a site:
 
-    Spidr.site('http://rubyflow.com/')
+    Spidr.site('http://www.rubyflow.com/')
 
 Spider multiple hosts:
 
     Spidr.start_at(
       'http://company.com/',
-      :
+      hosts: [
        'company.com',
-       /host\d
+       /host[\d]+\.company\.com/
      ]
    )
 
 Do not spider certain links:
 
-    Spidr.site('http://
+    Spidr.site('http://company.com/', ignore_links: [%{^/blog/}])
 
 Do not spider links on certain ports:
 
+    Spidr.site('http://company.com/', ignore_ports: [8000, 8010, 8080])
+
+Do not spider links blacklisted in robots.txt:
+
     Spidr.site(
-      'http://
-      :
+      'http://company.com/',
+      robots: true
    )
 
 Print out visited URLs:
 
-    Spidr.site('http://rubyinside.
+    Spidr.site('http://www.rubyinside.com/') do |spider|
      spider.every_url { |url| puts url }
    end
 

@@ -96,7 +100,7 @@ Build a URL map of a site:
 
 Print out the URLs that could not be requested:
 
-    Spidr.site('http://
+    Spidr.site('http://company.com/') do |spider|
      spider.every_failed_url { |url| puts url }
    end
 

@@ -118,22 +122,22 @@ Finds all pages which have broken links:
 
 Search HTML and XML pages:
 
-    Spidr.site('http://company.
+    Spidr.site('http://company.com/') do |spider|
      spider.every_page do |page|
-        puts "
+        puts ">>> #{page.url}"
 
        page.search('//meta').each do |meta|
          name = (meta.attributes['name'] || meta.attributes['http-equiv'])
          value = meta.attributes['content']
 
-          puts "
+          puts "  #{name} = #{value}"
        end
      end
    end
 
 Print out the titles from every page:
 
-    Spidr.site('
+    Spidr.site('https://www.ruby-lang.org/') do |spider|
      spider.every_html_page do |page|
        puts page.title
      end

@@ -143,7 +147,7 @@ Find what kinds of web servers a host is using, by accessing the headers:
 
     servers = Set[]
 
-    Spidr.host('
+    Spidr.host('company.com') do |spider|
      spider.all_headers do |headers|
        servers << headers['server']
      end

@@ -151,7 +155,7 @@ Find what kinds of web servers a host is using, by accessing the headers:
 
 Pause the spider on a forbidden page:
 
-    spider = Spidr.host('
+    spider = Spidr.host('company.com') do |spider|
      spider.every_forbidden_page do |page|
        spider.pause!
      end

@@ -159,7 +163,7 @@ Pause the spider on a forbidden page:
 
 Skip the processing of a page:
 
-    Spidr.host('
+    Spidr.host('company.com') do |spider|
      spider.every_missing_page do |page|
        spider.skip_page!
      end

@@ -167,7 +171,7 @@ Skip the processing of a page:
 
 Skip the processing of links:
 
-    Spidr.host('
+    Spidr.host('company.com') do |spider|
      spider.every_url do |url|
        if url.path.split('/').find { |dir| dir.to_i > 1000 }
          spider.skip_link!

@@ -177,14 +181,18 @@ Skip the processing of links:
 
 ## Requirements
 
-* [
+* [ruby] >= 1.9.1
+* [nokogiri] ~> 1.3
 
 ## Install
 
-    $
+    $ gem install spidr
 
 ## License
 
-Copyright (c) 2008-
+Copyright (c) 2008-2016 Hal Brodigan
 
 See {file:LICENSE.txt} for license information.
+
+[ruby]: https://www.ruby-lang.org/
+[nokogiri]: http://www.nokogiri.org/
data/Rakefile
CHANGED

@@ -1,25 +1,14 @@
 require 'rubygems'
 
 begin
-  require 'bundler'
+  require 'bundler/setup'
 rescue LoadError => e
-  STDERR.puts e.message
-  STDERR.puts "Run `gem install bundler` to install Bundler."
-  exit e.status_code
-end
-
-begin
-  Bundler.setup(:development)
-rescue Bundler::BundlerError => e
-  STDERR.puts e.message
-  STDERR.puts "Run `bundle install` to install missing gems"
-  exit e.status_code
+  abort e.message
 end
 
 require 'rake'
-
-
-Ore::Tasks.new
+require 'rubygems/tasks'
+Gem::Tasks.new
 
 require 'rspec/core/rake_task'
 RSpec::Core::RakeTask.new
data/gemspec.yml
CHANGED

@@ -8,12 +8,13 @@ description:
 license: MIT
 authors: Postmodern
 email: postmodern.mod3@gmail.com
-homepage:
+homepage: https://github.com/postmodern/spidr#readme
 has_yard: true
 
+required_ruby_version: ">= 1.9.1"
+
 dependencies:
   nokogiri: ~> 1.3
 
 development_dependencies:
   bundler: ~> 1.0
-  yard: ~> 0.7
data/lib/spidr/agent.rb
CHANGED

@@ -1,7 +1,7 @@
-require 'spidr/sanitizers'
-require 'spidr/filters'
-require 'spidr/events'
-require 'spidr/actions'
+require 'spidr/agent/sanitizers'
+require 'spidr/agent/filters'
+require 'spidr/agent/events'
+require 'spidr/agent/actions'
 require 'spidr/page'
 require 'spidr/session_cache'
 require 'spidr/cookie_jar'

@@ -12,48 +12,72 @@ require 'openssl'
 require 'net/http'
 require 'set'
 
+begin
+  require 'robots'
+rescue LoadError
+end
+
 module Spidr
   class Agent
 
-    include Sanitizers
-    include Filters
-    include Events
-    include Actions
-
     # HTTP Host Header to use
+    #
+    # @return [String]
     attr_accessor :host_header
 
     # HTTP Host Headers to use for specific hosts
+    #
+    # @return [Hash{String,Regexp => String}]
     attr_reader :host_headers
 
     # User-Agent to use
+    #
+    # @return [String]
     attr_accessor :user_agent
 
     # HTTP Authentication credentials
+    #
+    # @return [AuthStore]
     attr_accessor :authorized
 
     # Referer to use
+    #
+    # @return [String]
     attr_accessor :referer
 
     # Delay in between fetching pages
+    #
+    # @return [Integer]
     attr_accessor :delay
 
     # History containing visited URLs
+    #
+    # @return [Set<URI::HTTP>]
     attr_reader :history
 
     # List of unreachable URLs
+    #
+    # @return [Set<URI::HTTP>]
     attr_reader :failures
 
     # Queue of URLs to visit
+    #
+    # @return [Array<URI::HTTP>]
     attr_reader :queue
 
     # Cached cookies
+    #
+    # @return [CookieJar]
     attr_reader :cookies
 
     # Maximum depth
+    #
+    # @return [Integer]
     attr_reader :max_depth
 
     # The visited URLs and their depth within a site
+    #
+    # @return [Hash{URI::HTTP => Integer}]
     attr_reader :levels
 
     #

@@ -101,6 +125,9 @@ module Spidr
     # @option options [Integer] :max_depth
     #   The maximum link depth to follow.
     #
+    # @option options [Boolean] :robots (Spidr.robots?)
+    #   Specifies whether `robots.txt` should be honored.
+    #
     # @yield [agent]
     #   If a block is given, it will be passed the newly created agent
     #   for further configuration.

@@ -108,8 +135,13 @@ module Spidr
     # @yieldparam [Agent] agent
     #   The newly created agent.
     #
+    # @see #initialize_sanitizers
+    # @see #initialize_filters
+    # @see #initialize_actions
+    # @see #initialize_events
+    #
     def initialize(options={})
-      @host_header
+      @host_header  = options[:host_header]
       @host_headers = {}
 
       if options[:host_headers]

@@ -117,21 +149,29 @@ module Spidr
       end
 
       @user_agent = options.fetch(:user_agent,Spidr.user_agent)
-      @referer
+      @referer    = options[:referer]
 
-      @sessions
-      @cookies
+      @sessions   = SessionCache.new(options.fetch(:proxy,Spidr.proxy))
+      @cookies    = CookieJar.new
       @authorized = AuthStore.new
 
-      @running
-      @delay
-      @history
+      @running  = false
+      @delay    = options.fetch(:delay,0)
+      @history  = Set[]
       @failures = Set[]
-      @queue
+      @queue    = []
 
-      @levels
+      @levels    = Hash.new(0)
       @max_depth = options[:max_depth]
 
+      if options.fetch(:robots,Spidr.robots?)
+        unless Object.const_defined?(:Robots)
+          raise(ArgumentError,":robots option given but unable to require 'robots' gem")
+        end
+
+        @robots = Robots.new(@user_agent)
+      end
+
       initialize_sanitizers(options)
       initialize_filters(options)
       initialize_actions(options)

@@ -156,6 +196,9 @@ module Spidr
     # @yieldparam [Agent] agent
     #   The newly created agent.
     #
+    # @see #initialize
+    # @see #start_at
+    #
     def self.start_at(url,options={},&block)
       agent = new(options,&block)
       agent.start_at(url)

@@ -177,17 +220,19 @@ module Spidr
     # @yieldparam [Agent] agent
     #   The newly created agent.
     #
+    # @see #initialize
+    #
     def self.site(url,options={},&block)
       url = URI(url.to_s) unless url.kind_of?(URI)
 
-      agent = new(options.merge(:host => url.host),&block)
+      agent = new(options.merge(host: url.host),&block)
       agent.start_at(url)
     end
 
     #
     # Creates a new agent and spiders the given host.
     #
-    # @param [String]
+    # @param [String] name
     #   The host-name to spider.
     #
     # @param [Hash] options

@@ -200,8 +245,11 @@ module Spidr
     # @yieldparam [Agent] agent
     #   The newly created agent.
     #
+    # @see #initialize
+    #
     def self.host(name,options={},&block)
-
+      agent = new(options.merge(host: name),&block)
+      agent.start_at(URI::HTTP.build(host: name, path: '/'))
     end
 
     #

@@ -315,11 +363,9 @@ module Spidr
       @history.clear
 
       new_history.each do |url|
-        @history << unless url.kind_of?(URI)
-                      URI(url.to_s)
-                    else
-                      url
-                    end
+        url = URI(url.to_s) unless url.kind_of?(URI)
+
+        @history << url
       end
 
       return @history

@@ -362,10 +408,23 @@ module Spidr
       return @history.include?(url)
     end
 
+    #
+    # Determines whether a URL is allowed by the robot policy.
+    #
+    # @param [URI::HTTP, String] url
+    #   The URL to check.
+    #
+    # @return [Boolean]
+    #   Specifies whether a URL is allowed by the robot policy.
+    #
+    def robot_allowed?(url)
+      @robots ? @robots.allowed?(url) : true
+    end
+
     #
     # Sets the list of failed URLs.
     #
-    # @param [#each]
+    # @param [#each] new_failures
     #   The new list of failed URLs.
     #
     # @return [Array<URI::HTTP>]

@@ -378,11 +437,9 @@ module Spidr
       @failures.clear
 
       new_failures.each do |url|
-        @failures << unless url.kind_of?(URI)
-                       URI(url.to_s)
-                     else
-                       url
-                     end
+        url = URI(url.to_s) unless url.kind_of?(URI)
+
+        @failures << url
       end
 
       return @failures

@@ -408,7 +465,7 @@ module Spidr
     #
     # Sets the queue of URLs to visit.
     #
-    # @param [#each]
+    # @param [#each] new_queue
     #   The new list of URLs to visit.
     #
     # @return [Array<URI::HTTP>]

@@ -421,11 +478,9 @@ module Spidr
       @queue.clear
 
       new_queue.each do |url|
-        @queue << unless url.kind_of?(URI)
-                    URI(url.to_s)
-                  else
-                    url
-                  end
+        url = URI(url.to_s) unless url.kind_of?(URI)
+
+        @queue << url
       end
 
       return @queue

@@ -542,7 +597,7 @@ module Spidr
     # @since 0.2.2
     #
     def post_page(url,post_data='')
-      url = URI(url.to_s)
+      url = URI(url.to_s) unless url.kind_of?(URI)
 
       prepare_request(url) do |session,path,headers|
         new_page = Page.new(url,session.post(path,post_data,headers))

@@ -616,7 +671,7 @@ module Spidr
     # the `queue` of the agent.
     #
     def to_hash
-      {:history => @history, :queue => @queue}
+      {history: @history, queue: @queue}
     end
 
     protected

@@ -666,9 +721,9 @@ module Spidr
         end
       end
 
-      headers['Host']
+      headers['Host'] ||= @host_header if @host_header
       headers['User-Agent'] = @user_agent if @user_agent
-      headers['Referer']
+      headers['Referer'] = @referer if @referer
 
       if (authorization = @authorized.for_url(url))
         headers['Authorization'] = "Basic #{authorization}"

@@ -687,7 +742,8 @@ module Spidr
              SocketError,
             IOError,
             OpenSSL::SSL::SSLError,
-             Net::HTTPBadResponse
+             Net::HTTPBadResponse,
+             Zlib::Error
 
        @sessions.kill!(url)
 

@@ -722,7 +778,8 @@ module Spidr
        visit_port?(url.port) &&
        visit_link?(url.to_s) &&
        visit_url?(url) &&
-       visit_ext?(url.path)
+       visit_ext?(url.path) &&
+       robot_allowed?(url.to_s)
     end
 
     #