spidr 0.4.1 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. checksums.yaml +7 -0
  2. data/ChangeLog.md +69 -54
  3. data/Gemfile +9 -5
  4. data/LICENSE.txt +1 -1
  5. data/README.md +34 -26
  6. data/Rakefile +4 -15
  7. data/gemspec.yml +3 -2
  8. data/lib/spidr/agent.rb +101 -44
  9. data/lib/spidr/{actions → agent}/actions.rb +32 -12
  10. data/lib/spidr/{events.rb → agent/events.rb} +4 -8
  11. data/lib/spidr/{filters.rb → agent/filters.rb} +14 -16
  12. data/lib/spidr/{sanitizers.rb → agent/sanitizers.rb} +5 -7
  13. data/lib/spidr/auth_store.rb +2 -2
  14. data/lib/spidr/cookie_jar.rb +2 -2
  15. data/lib/spidr/extensions/uri.rb +28 -16
  16. data/lib/spidr/page.rb +7 -11
  17. data/lib/spidr/{body.rb → page/body.rb} +1 -1
  18. data/lib/spidr/{headers.rb → page/headers.rb} +1 -1
  19. data/lib/spidr/{links.rb → page/links.rb} +43 -7
  20. data/lib/spidr/session_cache.rb +2 -2
  21. data/lib/spidr/spidr.rb +32 -5
  22. data/lib/spidr/version.rb +1 -1
  23. data/spec/agent/actions_spec.rb +60 -0
  24. data/spec/agent/filters_spec.rb +62 -0
  25. data/spec/agent/sanitizers_spec.rb +62 -0
  26. data/spec/agent_spec.rb +13 -13
  27. data/spec/auth_store_spec.rb +17 -17
  28. data/spec/cookie_jar_spec.rb +26 -26
  29. data/spec/extensions/uri_spec.rb +19 -9
  30. data/spec/helpers/history.rb +5 -5
  31. data/spec/helpers/wsoc.rb +2 -2
  32. data/spec/page_examples.rb +4 -4
  33. data/spec/page_spec.rb +28 -25
  34. data/spec/rules_spec.rb +14 -14
  35. data/spec/session_cache.rb +7 -7
  36. data/spec/spidr_spec.rb +10 -10
  37. metadata +37 -51
  38. data/lib/spidr/actions.rb +0 -2
  39. data/lib/spidr/actions/exceptions.rb +0 -4
  40. data/lib/spidr/actions/exceptions/action.rb +0 -9
  41. data/lib/spidr/actions/exceptions/paused.rb +0 -11
  42. data/lib/spidr/actions/exceptions/skip_link.rb +0 -12
  43. data/lib/spidr/actions/exceptions/skip_page.rb +0 -12
  44. data/spec/actions_spec.rb +0 -59
  45. data/spec/filters_spec.rb +0 -61
  46. data/spec/sanitizers_spec.rb +0 -61
checksums.yaml ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA1:
+   metadata.gz: 31e83cba8fd67a2527641b404f82773d60b5fb97
+   data.tar.gz: cbd735b652d209cd49a6990eedf3de6f7a22e385
+ SHA512:
+   metadata.gz: d33742df9e9a4ec8090d4934de3562036e149195b3567ac1143c4637012876d86a18618e9f89251506ed8aa1d9c85cc18ed324774d4da29038e975827698f265
+   data.tar.gz: 24b08172be0184f7c68fbc63b31eaac55b0c55d70b35b8983fbbb1a3ce871e157b0bbf7d598625ef37ec3fe420c7372bc5fdaf7dd4b7131eac6e6e23e465e475
data/ChangeLog.md CHANGED
@@ -1,21 +1,36 @@
+ ### 0.5.0 / 2016-01-03
+
+ * Added support for respecting `robots.txt` files.
+
+       Spidr.site('http://reddit.com/', robots: true)
+
+ * Added {Spidr.robots=} and {Spidr.robots?}.
+ * Added {Spidr::Page#each_mailto} and {Spidr::Page#mailtos}.
+ * Fixed a bug in {Spidr::Agent.host} that limited spidering to only `http://`
+   URIs.
+ * Rescue `Zlib::Error` to catch `Zlib::DataError` and `Zlib::BufError`
+   exceptions caused by web servers that use incompatible gzip compression.
+ * Fixed a bug in {URI.expand_path} where `/../foo` was being expanded to `foo`
+   instead of `/foo`.
+
  ### 0.4.1 / 2011-12-08

  * Catch `OpenSSL::SSL::SSLError` exceptions when initiated HTTPS Sessions.

  ### 0.4.0 / 2011-08-07

- * Added {Spidr::Headers#content_charset}.
- * Pass the Page `url` and `content_charset` to Nokogiri in {Spidr::Body#doc}.
+ * Added `Spidr::Headers#content_charset`.
+ * Pass the Page `url` and `content_charset` to Nokogiri in `Spidr::Body#doc`.
    This ensures that Nokogiri will preserve the body encoding.
- * Made {Spidr::Headers#is_content_type?} public.
- * Allow {Spidr::Headers#is_content_type?} to match the full Content-Type
+ * Made `Spidr::Headers#is_content_type?` public.
+ * Allow `Spidr::Headers#is_content_type?` to match the full Content-Type
    or the sub-type.

  ### 0.3.2 / 2011-06-20

- * Added separate intitialize methods for {Spidr::Actions}, {Spidr::Events},
-   {Spidr::Filters} and {Spidr::Sanitizers}.
- * Aliased {Spidr::Events#urls_like} to {Spidr::Events#every_url_like}.
+ * Added separate intitialize methods for `Spidr::Actions`, `Spidr::Events`,
+   `Spidr::Filters` and `Spidr::Sanitizers`.
+ * Aliased `Spidr::Events#urls_like` to `Spidr::Events#every_url_like`.
  * Reduce usage of `self.included` and `module_eval`.
  * Reduce usage of nested-blocks.
  * Reduce usage of `return`.
@@ -28,21 +43,21 @@

  * Switched from Jeweler to [Ore](http://github.com/ruby-ore/ore).
  * Split all header related methods out of {Spidr::Page} and into
-   {Spidr::Headers}.
+   `Spidr::Headers`.
  * Split all body related methods out of {Spidr::Page} and into
-   {Spidr::Body}.
+   `Spidr::Body`.
  * Split all link related methods out of {Spidr::Page} and into
-   {Spidr::Links}.
- * Added {Spidr::Headers#directory?}.
- * Added {Spidr::Headers#json?}.
- * Added {Spidr::Links#each_url}.
- * Added {Spidr::Links#each_link}.
- * Added {Spidr::Links#each_redirect}.
- * Added {Spidr::Links#each_meta_redirect}.
- * Aliased {Spidr::Headers#raw_cookie} to {Spidr::Headers#cookie}.
- * Aliased {Spidr::Body#to_s} to {Spidr::Body#body}.
- * Also check for `application/xml` in {Spidr::Headers#xml?}.
- * Catch all exceptions when merging URIs in {Spidr::Links#to_absolute}.
+   `Spidr::Links`.
+ * Added `Spidr::Headers#directory?`.
+ * Added `Spidr::Headers#json?`.
+ * Added `Spidr::Links#each_url`.
+ * Added `Spidr::Links#each_link`.
+ * Added `Spidr::Links#each_redirect`.
+ * Added `Spidr::Links#each_meta_redirect`.
+ * Aliased `Spidr::Headers#raw_cookie` to `Spidr::Headers#cookie`.
+ * Aliased `Spidr::Body#to_s` to `Spidr::Body#body`.
+ * Also check for `application/xml` in `Spidr::Headers#xml?`.
+ * Catch all exceptions when merging URIs in `Spidr::Links#to_absolute`.
  * Always prepend a `/` to all FTP URI paths. Fixes a Ruby 1.8 specific
    bug, where it expects an absolute path for all FTP URIs.
  * Refactored {URI.expand_path}.
@@ -73,10 +88,10 @@

  ### 0.2.4 / 2010-05-05

- * Added {Spidr::Filters#visit_urls}.
- * Added {Spidr::Filters#visit_urls_like}.
- * Added {Spidr::Filters#ignore_urls}.
- * Added {Spidr::Filters#ignore_urls_like}.
+ * Added `Spidr::Filters#visit_urls`.
+ * Added `Spidr::Filters#visit_urls_like`.
+ * Added `Spidr::Filters#ignore_urls`.
+ * Added `Spidr::Filters#ignore_urls_like`.
  * Added `Spidr::Page#is_content_type?`.
  * Default `Spidr::Page#body` to an empty String.
  * Default `Spidr::Page#content_type` to an empty String.
@@ -89,7 +104,7 @@

  * Migrated to Jeweler, for the packaging and releasing RubyGems.
  * Switched to MarkDown formatted YARD documentation.
- * Added {Spidr::Events#every_link}.
+ * Added `Spidr::Events#every_link`.
  * Added {Spidr::SessionCache#active?}.
  * Added specs for {Spidr::SessionCache}.

@@ -102,7 +117,7 @@
  * Added `Spidr::Page#cookie`.
  * Added `Spidr::Page#cookies`.
  * Added `Spidr::Page#cookie_params`.
- * Added {Spidr::Sanitizers}.
+ * Added `Spidr::Sanitizers`.
  * Added {Spidr::SessionCache}.
  * Added {Spidr::CookieJar} (thanks Nick Plante).
  * Added {Spidr::AuthStore} (thanks Nick Plante).
@@ -112,31 +127,31 @@

  ### 0.2.1 / 2009-11-25

- * Added {Spidr::Events#every_ok_page}.
- * Added {Spidr::Events#every_redirect_page}.
- * Added {Spidr::Events#every_timedout_page}.
- * Added {Spidr::Events#every_bad_request_page}.
- * Added {Spidr::Events#every_unauthorized_page}.
- * Added {Spidr::Events#every_forbidden_page}.
- * Added {Spidr::Events#every_missing_page}.
- * Added {Spidr::Events#every_internal_server_error_page}.
- * Added {Spidr::Events#every_txt_page}.
- * Added {Spidr::Events#every_html_page}.
- * Added {Spidr::Events#every_xml_page}.
- * Added {Spidr::Events#every_xsl_page}.
- * Added {Spidr::Events#every_doc}.
- * Added {Spidr::Events#every_html_doc}.
- * Added {Spidr::Events#every_xml_doc}.
- * Added {Spidr::Events#every_xsl_doc}.
- * Added {Spidr::Events#every_rss_doc}.
- * Added {Spidr::Events#every_atom_doc}.
- * Added {Spidr::Events#every_javascript_page}.
- * Added {Spidr::Events#every_css_page}.
- * Added {Spidr::Events#every_rss_page}.
- * Added {Spidr::Events#every_atom_page}.
- * Added {Spidr::Events#every_ms_word_page}.
- * Added {Spidr::Events#every_pdf_page}.
- * Added {Spidr::Events#every_zip_page}.
+ * Added `Spidr::Events#every_ok_page`.
+ * Added `Spidr::Events#every_redirect_page`.
+ * Added `Spidr::Events#every_timedout_page`.
+ * Added `Spidr::Events#every_bad_request_page`.
+ * Added `Spidr::Events#every_unauthorized_page`.
+ * Added `Spidr::Events#every_forbidden_page`.
+ * Added `Spidr::Events#every_missing_page`.
+ * Added `Spidr::Events#every_internal_server_error_page`.
+ * Added `Spidr::Events#every_txt_page`.
+ * Added `Spidr::Events#every_html_page`.
+ * Added `Spidr::Events#every_xml_page`.
+ * Added `Spidr::Events#every_xsl_page`.
+ * Added `Spidr::Events#every_doc`.
+ * Added `Spidr::Events#every_html_doc`.
+ * Added `Spidr::Events#every_xml_doc`.
+ * Added `Spidr::Events#every_xsl_doc`.
+ * Added `Spidr::Events#every_rss_doc`.
+ * Added `Spidr::Events#every_atom_doc`.
+ * Added `Spidr::Events#every_javascript_page`.
+ * Added `Spidr::Events#every_css_page`.
+ * Added `Spidr::Events#every_rss_page`.
+ * Added `Spidr::Events#every_atom_page`.
+ * Added `Spidr::Events#every_ms_word_page`.
+ * Added `Spidr::Events#every_pdf_page`.
+ * Added `Spidr::Events#every_zip_page`.
  * Fixed a bug where {Spidr::Agent#delay} was not being used to delay
    requesting pages.
  * Spider `link` and `script` tags in HTML pages (thanks Nick Plante).
@@ -160,11 +175,11 @@
  * Aliased `Spidr::Page#forbidden?` to `Spidr::Page#is_forbidden?`.
  * Aliased `Spidr::Page#missing?` to `Spidr::Page#is_missing?`.
  * Split URL filtering code out of {Spidr::Agent} and into
-   {Spidr::Filters}.
+   `Spidr::Filters`.
  * Split URL / Page event code out of {Spidr::Agent} and into
-   {Spidr::Events}.
+   `Spidr::Events`.
  * Split pause! / continue! / skip_link! / skip_page! methods out of
-   {Spidr::Agent} and into {Spidr::Actions}.
+   {Spidr::Agent} and into `Spidr::Actions`.
  * Fixed a bug in `Spidr::Page#code`, where it was not returning an Integer.
  * Make sure `Spidr::Page#doc` returns `Nokogiri::XML::Document` objects for
    RSS/RDF/Atom pages as well.
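
For reference, a minimal sketch of how the headline 0.5.0 additions combine, assuming the APIs behave exactly as the changelog entries above describe (`Spidr.robots=` as the global toggle behind the per-agent `robots:` option, `Spidr::Page#mailtos` returning the page's `mailto:` addresses; the target URL is a placeholder):

    require 'spidr'

    # Global opt-in added in 0.5.0; the agent reads it via
    # options.fetch(:robots,Spidr.robots?), so this is equivalent to
    # passing robots: true to each individual spider.
    Spidr.robots = true

    Spidr.site('http://www.rubyflow.com/') do |spider|
      spider.every_html_page do |page|
        # mailtos (also added in 0.5.0) collects the page's mailto: targets
        page.mailtos.each { |email| puts email }
      end
    end

    # The URI.expand_path fix noted above:
    URI.expand_path('/../foo')  # => "/foo" (previously "foo")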
data/Gemfile CHANGED
@@ -6,12 +6,16 @@ end

  gemspec

+
+ gem 'robots', group: :robots
+
  group :development do
-   gem 'rake', '~> 0.8'
+   gem 'rake'
+   gem 'rubygems-tasks', '~> 0.2'

-   gem 'ore-tasks', '~> 0.4'
-   gem 'rspec', '~> 2.4'
-   gem 'wsoc', '~> 0.1.3'
+   gem 'wsoc',  '~> 0.1.3'
+   gem 'rspec', '~> 3.0'

-   gem 'kramdown', '~> 0.12'
+   gem 'kramdown', '~> 0.12'
+   gem 'yard',     '~> 0.8'
  end
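
Note that `robots` is added in its own `:robots` group rather than to the gemspec, keeping it an optional dependency; `data/lib/spidr/agent.rb` below requires it inside a `begin`/`rescue LoadError` for the same reason. A hedged sketch of how a downstream application's Gemfile might opt in:

    # Gemfile of an application that wants robots.txt support.
    # spidr 0.5.0 does not declare 'robots' as a hard dependency,
    # so it must be added explicitly.
    source 'https://rubygems.org'

    gem 'spidr', '~> 0.5'
    gem 'robots' # enables the robots: true option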
data/LICENSE.txt CHANGED
@@ -1,4 +1,4 @@
- Copyright (c) 2008-2011 Hal Brodigan
+ Copyright (c) 2008-2016 Hal Brodigan

  Permission is hereby granted, free of charge, to any person obtaining
  a copy of this software and associated documentation files (the
data/README.md CHANGED
@@ -1,8 +1,8 @@
  # Spidr

- * [Homepage](http://spidr.rubyforge.org/)
- * [Source](http://github.com/postmodern/spidr)
- * [Issues](http://github.com/postmodern/spidr/issues)
+ * [Homepage](https://github.com/postmodern/spidr#readme)
+ * [Source](https://github.com/postmodern/spidr)
+ * [Issues](https://github.com/postmodern/spidr/issues)
  * [Mailing List](http://groups.google.com/group/spidr)
  * [IRC](http://webchat.freenode.net/?channels=spidr&uio=d4)

@@ -15,9 +15,9 @@ and easy to use.
  ## Features

  * Follows:
-   * a tags.
-   * iframe tags.
-   * frame tags.
+   * `a` tags.
+   * `iframe` tags.
+   * `frame` tags.
    * Cookie protected links.
    * HTTP 300, 301, 302, 303 and 307 Redirects.
    * Meta-Refresh Redirects.
@@ -51,36 +51,40 @@ Start spidering from a URL:

  Spider a host:

-     Spidr.host('coderrr.wordpress.com')
+     Spidr.host('solnic.eu')

  Spider a site:

-     Spidr.site('http://rubyflow.com/')
+     Spidr.site('http://www.rubyflow.com/')

  Spider multiple hosts:

      Spidr.start_at(
        'http://company.com/',
-       :hosts => [
+       hosts: [
          'company.com',
-         /host\d\.company\.com/
+         /host[\d]+\.company\.com/
        ]
      )

  Do not spider certain links:

-     Spidr.site('http://matasano.com/', :ignore_links => [/log/])
+     Spidr.site('http://company.com/', ignore_links: [%{^/blog/}])

  Do not spider links on certain ports:

+     Spidr.site('http://company.com/', ignore_ports: [8000, 8010, 8080])
+
+ Do not spider links blacklisted in robots.txt:
+
      Spidr.site(
-       'http://sketchy.content.com/',
-       :ignore_ports => [8000, 8010, 8080]
+       'http://company.com/',
+       robots: true
      )

  Print out visited URLs:

-     Spidr.site('http://rubyinside.org/') do |spider|
+     Spidr.site('http://www.rubyinside.com/') do |spider|
        spider.every_url { |url| puts url }
      end

@@ -96,7 +100,7 @@ Build a URL map of a site:

  Print out the URLs that could not be requested:

-     Spidr.site('http://sketchy.content.com/') do |spider|
+     Spidr.site('http://company.com/') do |spider|
        spider.every_failed_url { |url| puts url }
      end

@@ -118,22 +122,22 @@ Finds all pages which have broken links:

  Search HTML and XML pages:

-     Spidr.site('http://company.withablog.com/') do |spider|
+     Spidr.site('http://company.com/') do |spider|
        spider.every_page do |page|
-         puts "[-] #{page.url}"
+         puts ">>> #{page.url}"

          page.search('//meta').each do |meta|
            name = (meta.attributes['name'] || meta.attributes['http-equiv'])
            value = meta.attributes['content']

-           puts " #{name} = #{value}"
+           puts "  #{name} = #{value}"
          end
        end
      end

  Print out the titles from every page:

-     Spidr.site('http://www.rubypulse.com/') do |spider|
+     Spidr.site('https://www.ruby-lang.org/') do |spider|
        spider.every_html_page do |page|
          puts page.title
        end
@@ -143,7 +147,7 @@ Find what kinds of web servers a host is using, by accessing the headers:

      servers = Set[]

-     Spidr.host('generic.company.com') do |spider|
+     Spidr.host('company.com') do |spider|
        spider.all_headers do |headers|
          servers << headers['server']
        end
@@ -151,7 +155,7 @@ Find what kinds of web servers a host is using, by accessing the headers:

  Pause the spider on a forbidden page:

-     spider = Spidr.host('overnight.startup.com') do |spider|
+     spider = Spidr.host('company.com') do |spider|
        spider.every_forbidden_page do |page|
          spider.pause!
        end
@@ -159,7 +163,7 @@ Pause the spider on a forbidden page:

  Skip the processing of a page:

-     Spidr.host('sketchy.content.com') do |spider|
+     Spidr.host('company.com') do |spider|
        spider.every_missing_page do |page|
          spider.skip_page!
        end
@@ -167,7 +171,7 @@ Skip the processing of a page:

  Skip the processing of links:

-     Spidr.host('sketchy.content.com') do |spider|
+     Spidr.host('company.com') do |spider|
        spider.every_url do |url|
          if url.path.split('/').find { |dir| dir.to_i > 1000 }
            spider.skip_link!
@@ -177,14 +181,18 @@ Skip the processing of links:

  ## Requirements

- * [nokogiri](http://nokogiri.rubyforge.org/) ~> 1.3
+ * [ruby] >= 1.9.1
+ * [nokogiri] ~> 1.3

  ## Install

-     $ sudo gem install spidr
+     $ gem install spidr

  ## License

- Copyright (c) 2008-2011 Hal Brodigan
+ Copyright (c) 2008-2016 Hal Brodigan

  See {file:LICENSE.txt} for license information.
+
+ [ruby]: https://www.ruby-lang.org/
+ [nokogiri]: http://www.nokogiri.org/
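
The README diff stops short of the new mailto helpers from 0.5.0; a hedged example in the README's own style, assuming `Spidr::Page#each_mailto` yields each `mailto:` address found on a page:

    Spidr.site('http://company.com/') do |spider|
      spider.every_html_page do |page|
        # each_mailto was added to Spidr::Page in 0.5.0
        page.each_mailto do |email|
          puts email
        end
      end
    end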
data/Rakefile CHANGED
@@ -1,25 +1,14 @@
  require 'rubygems'

  begin
-   require 'bundler'
+   require 'bundler/setup'
  rescue LoadError => e
-   STDERR.puts e.message
-   STDERR.puts "Run `gem install bundler` to install Bundler."
-   exit e.status_code
- end
-
- begin
-   Bundler.setup(:development)
- rescue Bundler::BundlerError => e
-   STDERR.puts e.message
-   STDERR.puts "Run `bundle install` to install missing gems"
-   exit e.status_code
+   abort e.message
  end

  require 'rake'
-
- require 'ore/tasks'
- Ore::Tasks.new
+ require 'rubygems/tasks'
+ Gem::Tasks.new

  require 'rspec/core/rake_task'
  RSpec::Core::RakeTask.new
data/gemspec.yml CHANGED
@@ -8,12 +8,13 @@ description:
  license: MIT
  authors: Postmodern
  email: postmodern.mod3@gmail.com
- homepage: http://github.com/postmodern/spidr
+ homepage: https://github.com/postmodern/spidr#readme
  has_yard: true

+ required_ruby_version: ">= 1.9.1"
+
  dependencies:
    nokogiri: ~> 1.3

  development_dependencies:
    bundler: ~> 1.0
-   yard: ~> 0.7
data/lib/spidr/agent.rb CHANGED
@@ -1,7 +1,7 @@
- require 'spidr/sanitizers'
- require 'spidr/filters'
- require 'spidr/events'
- require 'spidr/actions'
+ require 'spidr/agent/sanitizers'
+ require 'spidr/agent/filters'
+ require 'spidr/agent/events'
+ require 'spidr/agent/actions'
  require 'spidr/page'
  require 'spidr/session_cache'
  require 'spidr/cookie_jar'
@@ -12,48 +12,72 @@ require 'openssl'
  require 'net/http'
  require 'set'

+ begin
+   require 'robots'
+ rescue LoadError
+ end
+
  module Spidr
    class Agent

-     include Sanitizers
-     include Filters
-     include Events
-     include Actions
-
      # HTTP Host Header to use
+     #
+     # @return [String]
      attr_accessor :host_header

      # HTTP Host Headers to use for specific hosts
+     #
+     # @return [Hash{String,Regexp => String}]
      attr_reader :host_headers

      # User-Agent to use
+     #
+     # @return [String]
      attr_accessor :user_agent

      # HTTP Authentication credentials
+     #
+     # @return [AuthStore]
      attr_accessor :authorized

      # Referer to use
+     #
+     # @return [String]
      attr_accessor :referer

      # Delay in between fetching pages
+     #
+     # @return [Integer]
      attr_accessor :delay

      # History containing visited URLs
+     #
+     # @return [Set<URI::HTTP>]
      attr_reader :history

      # List of unreachable URLs
+     #
+     # @return [Set<URI::HTTP>]
      attr_reader :failures

      # Queue of URLs to visit
+     #
+     # @return [Array<URI::HTTP>]
      attr_reader :queue

      # Cached cookies
+     #
+     # @return [CookieJar]
      attr_reader :cookies

      # Maximum depth
+     #
+     # @return [Integer]
      attr_reader :max_depth

      # The visited URLs and their depth within a site
+     #
+     # @return [Hash{URI::HTTP => Integer}]
      attr_reader :levels

      #
@@ -101,6 +125,9 @@ module Spidr
      # @option options [Integer] :max_depth
      #   The maximum link depth to follow.
      #
+     # @option options [Boolean] :robots (Spidr.robots?)
+     #   Specifies whether `robots.txt` should be honored.
+     #
      # @yield [agent]
      #   If a block is given, it will be passed the newly created agent
      #   for further configuration.
@@ -108,8 +135,13 @@ module Spidr
      # @yieldparam [Agent] agent
      #   The newly created agent.
      #
+     # @see #initialize_sanitizers
+     # @see #initialize_filters
+     # @see #initialize_actions
+     # @see #initialize_events
+     #
      def initialize(options={})
-       @host_header = options[:host_header]
+       @host_header  = options[:host_header]
        @host_headers = {}

        if options[:host_headers]
@@ -117,21 +149,29 @@ module Spidr
        end

        @user_agent = options.fetch(:user_agent,Spidr.user_agent)
-       @referer = options[:referer]
+       @referer    = options[:referer]

-       @sessions = SessionCache.new(options.fetch(:proxy,Spidr.proxy))
-       @cookies = CookieJar.new
+       @sessions   = SessionCache.new(options.fetch(:proxy,Spidr.proxy))
+       @cookies    = CookieJar.new
        @authorized = AuthStore.new

-       @running = false
-       @delay = options.fetch(:delay,0)
-       @history = Set[]
+       @running  = false
+       @delay    = options.fetch(:delay,0)
+       @history  = Set[]
        @failures = Set[]
-       @queue = []
+       @queue    = []

-       @levels = Hash.new(0)
+       @levels    = Hash.new(0)
        @max_depth = options[:max_depth]

+       if options.fetch(:robots,Spidr.robots?)
+         unless Object.const_defined?(:Robots)
+           raise(ArgumentError,":robots option given but unable to require 'robots' gem")
+         end
+
+         @robots = Robots.new(@user_agent)
+       end
+
        initialize_sanitizers(options)
        initialize_filters(options)
        initialize_actions(options)
@@ -156,6 +196,9 @@ module Spidr
      # @yieldparam [Agent] agent
      #   The newly created agent.
      #
+     # @see #initialize
+     # @see #start_at
+     #
      def self.start_at(url,options={},&block)
        agent = new(options,&block)
        agent.start_at(url)
@@ -177,17 +220,19 @@ module Spidr
      # @yieldparam [Agent] agent
      #   The newly created agent.
      #
+     # @see #initialize
+     #
      def self.site(url,options={},&block)
        url = URI(url.to_s) unless url.kind_of?(URI)

-       agent = new(options.merge(:host => url.host),&block)
+       agent = new(options.merge(host: url.host),&block)
        agent.start_at(url)
      end

      #
      # Creates a new agent and spiders the given host.
      #
-     # @param [String]
+     # @param [String] name
      #   The host-name to spider.
      #
      # @param [Hash] options
@@ -200,8 +245,11 @@ module Spidr
      # @yieldparam [Agent] agent
      #   The newly created agent.
      #
+     # @see #initialize
+     #
      def self.host(name,options={},&block)
-       site(URI::HTTP.build(:host => name, :path => '/'),options,&block)
+       agent = new(options.merge(host: name),&block)
+       agent.start_at(URI::HTTP.build(host: name, path: '/'))
      end

      #
@@ -315,11 +363,9 @@ module Spidr
        @history.clear

        new_history.each do |url|
-         @history << unless url.kind_of?(URI)
-                       URI(url.to_s)
-                     else
-                       url
-                     end
+         url = URI(url.to_s) unless url.kind_of?(URI)
+
+         @history << url
        end

        return @history
@@ -362,10 +408,23 @@ module Spidr
        return @history.include?(url)
      end

+     #
+     # Determines whether a URL is allowed by the robot policy.
+     #
+     # @param [URI::HTTP, String] url
+     #   The URL to check.
+     #
+     # @return [Boolean]
+     #   Specifies whether a URL is allowed by the robot policy.
+     #
+     def robot_allowed?(url)
+       @robots ? @robots.allowed?(url) : true
+     end
+
      #
      # Sets the list of failed URLs.
      #
-     # @param [#each]
+     # @param [#each] new_failures
      #   The new list of failed URLs.
      #
      # @return [Array<URI::HTTP>]
@@ -378,11 +437,9 @@ module Spidr
        @failures.clear

        new_failures.each do |url|
-         @failures << unless url.kind_of?(URI)
-                        URI(url.to_s)
-                      else
-                        url
-                      end
+         url = URI(url.to_s) unless url.kind_of?(URI)
+
+         @failures << url
        end

        return @failures
@@ -408,7 +465,7 @@ module Spidr
      #
      # Sets the queue of URLs to visit.
      #
-     # @param [#each]
+     # @param [#each] new_queue
      #   The new list of URLs to visit.
      #
      # @return [Array<URI::HTTP>]
@@ -421,11 +478,9 @@ module Spidr
        @queue.clear

        new_queue.each do |url|
-         @queue << unless url.kind_of?(URI)
-                     URI(url.to_s)
-                   else
-                     url
-                   end
+         url = URI(url.to_s) unless url.kind_of?(URI)
+
+         @queue << url
        end

        return @queue
@@ -542,7 +597,7 @@ module Spidr
      # @since 0.2.2
      #
      def post_page(url,post_data='')
-       url = URI(url.to_s)
+       url = URI(url.to_s) unless url.kind_of?(URI)

        prepare_request(url) do |session,path,headers|
          new_page = Page.new(url,session.post(path,post_data,headers))
@@ -616,7 +671,7 @@ module Spidr
      # the `queue` of the agent.
      #
      def to_hash
-       {:history => @history, :queue => @queue}
+       {history: @history, queue: @queue}
      end

      protected
@@ -666,9 +721,9 @@ module Spidr
          end
        end

-       headers['Host'] ||= @host_header if @host_header
+       headers['Host']     ||= @host_header if @host_header
        headers['User-Agent'] = @user_agent if @user_agent
-       headers['Referer'] = @referer if @referer
+       headers['Referer']    = @referer if @referer

        if (authorization = @authorized.for_url(url))
          headers['Authorization'] = "Basic #{authorization}"
@@ -687,7 +742,8 @@ module Spidr
               SocketError,
               IOError,
               OpenSSL::SSL::SSLError,
-              Net::HTTPBadResponse
+              Net::HTTPBadResponse,
+              Zlib::Error

        @sessions.kill!(url)

@@ -722,7 +778,8 @@ module Spidr
        visit_port?(url.port) &&
        visit_link?(url.to_s) &&
        visit_url?(url) &&
-       visit_ext?(url.path)
+       visit_ext?(url.path) &&
+       robot_allowed?(url.to_s)
      end

      #
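
Taken together, the `agent.rb` changes wire `robots.txt` support in end to end: `initialize` builds a `Robots` object only when the option is enabled, `robot_allowed?` returns `true` whenever `@robots` is unset, and `visit?` consults it after every other filter. A minimal sketch of the resulting behavior (the target host is a placeholder):

    require 'spidr'

    # Raises ArgumentError if the optional 'robots' gem cannot be loaded.
    agent = Spidr::Agent.new(robots: true)

    # URLs disallowed by the host's robots.txt now fail visit? and are
    # skipped, exactly like URLs rejected by the other filters.
    agent.start_at('http://company.com/')

    # Without robots: true, robot_allowed? always returns true and
    # spidering behaves as in 0.4.x.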