spidr 0.5.0 → 0.6.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.travis.yml +14 -0
- data/ChangeLog.md +20 -2
- data/Gemfile +2 -2
- data/README.md +4 -2
- data/Rakefile +1 -0
- data/gemspec.yml +1 -1
- data/lib/spidr/agent.rb +145 -85
- data/lib/spidr/agent/filters.rb +1 -9
- data/lib/spidr/agent/robots.rb +36 -0
- data/lib/spidr/page.rb +76 -28
- data/lib/spidr/page/{headers.rb → content_types.rb} +2 -147
- data/lib/spidr/page/cookies.rb +60 -0
- data/lib/spidr/page/{links.rb → html.rb} +47 -23
- data/lib/spidr/page/status_codes.rb +112 -0
- data/lib/spidr/proxy.rb +56 -0
- data/lib/spidr/session_cache.rb +60 -24
- data/lib/spidr/settings.rb +3 -0
- data/lib/spidr/settings/proxy.rb +61 -0
- data/lib/spidr/settings/timeouts.rb +33 -0
- data/lib/spidr/settings/user_agent.rb +14 -0
- data/lib/spidr/spidr.rb +15 -79
- data/lib/spidr/version.rb +1 -1
- data/spec/agent/actions_spec.rb +158 -32
- data/spec/agent/filters_spec.rb +46 -29
- data/spec/agent/sanitizers_spec.rb +25 -31
- data/spec/agent_spec.rb +772 -50
- data/spec/example_app.rb +27 -0
- data/spec/example_page.rb +33 -0
- data/spec/page/content_types_spec.rb +150 -0
- data/spec/page/cookies_spec.rb +58 -0
- data/spec/page/html_spec.rb +524 -0
- data/spec/page/status_codes_spec.rb +87 -0
- data/spec/page_spec.rb +114 -78
- data/spec/proxy_spec.rb +45 -0
- data/spec/session_cache.rb +103 -2
- data/spec/settings/proxy_examples.rb +82 -0
- data/spec/settings/timeouts_examples.rb +93 -0
- data/spec/settings/user_agent_examples.rb +25 -0
- data/spec/spidr_spec.rb +6 -29
- data/spidr.gemspec +38 -109
- metadata +35 -31
- data/lib/spidr/page/body.rb +0 -98
- data/spec/helpers/history.rb +0 -34
- data/spec/helpers/page.rb +0 -8
- data/spec/helpers/wsoc.rb +0 -83
- data/spec/page_examples.rb +0 -21
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 114364609c8da8613e22e9f18777cd9c79e1ac8a
|
4
|
+
data.tar.gz: edb694a4a695217a2adf2cd44ffeafb679e8292c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 50064bb0227d7dc0b3ff4bf55ec72b74635c89d6a7bfe24c6948c73ff439b74dbe6f2c72276586389340ed050a30b4814faa0d2584d490badc337c97393bf4f3
|
7
|
+
data.tar.gz: 46326b368267b66d647ea09712ec499dc5dbed82fb28b65d5573832d661328a768e140fcec72a75e68516dae6401ded9558978b13e7c6158249fd14737d1c93f
|
data/.travis.yml
ADDED
data/ChangeLog.md
CHANGED
@@ -1,3 +1,21 @@
|
|
1
|
+
### 0.6.0 / 2016-08-04
|
2
|
+
|
3
|
+
* Added {Spidr::Proxy}.
|
4
|
+
* Added more options to {Spidr::Agent#initialize}:
|
5
|
+
* `:default_headers`: specifies the default headers to set in all requests
|
6
|
+
(@maccman).
|
7
|
+
* `:limit`: specifies the maximum number of links to visit.
|
8
|
+
* `:open_timeout`, `:read_timeout`, `:ssl_timeout`, `:continue_timeout`,
|
9
|
+
and `:keep_alive_timeout`: sets `Net::HTTP` timeouts.
|
10
|
+
* Allow {Spidr::Settings::Proxy#proxy= Spidr.proxy=} to accept `nil`.
|
11
|
+
* Use `Net::HTTPResponse#get_fields` in {Spidr::Page} to correctly return
|
12
|
+
multiple values for repeated headers.
|
13
|
+
* Fixed a bug in {Spidr::Page#method_missing} where method names were not being
|
14
|
+
correctly converted to header names.
|
15
|
+
* Fixed a bug in {Spidr::Page#cookie_params} where `Set-Cookie` flags were not
|
16
|
+
being filtered out.
|
17
|
+
* Rewrote the specs to use webmock and increased spec coverage.
|
18
|
+
|
1
19
|
### 0.5.0 / 2016-01-03
|
2
20
|
|
3
21
|
* Added support for respecting `robots.txt` files.
|
@@ -166,8 +184,8 @@
|
|
166
184
|
* Added an HTTP session cache to {Spidr::Agent}, per suggestion of falter.
|
167
185
|
* Added `Spidr::Agent#get_session`.
|
168
186
|
* Added `Spidr::Agent#kill_session`.
|
169
|
-
* Added {Spidr.proxy=}.
|
170
|
-
* Added {Spidr.disable_proxy!}.
|
187
|
+
* Added {Spidr::Settings::Proxy#proxy= Spidr.proxy=}.
|
188
|
+
* Added {Spidr::Settings::Proxy#disable_proxy! Spidr.disable_proxy!}.
|
171
189
|
* Aliased `Spidr::Page#txt?` to `Spidr::Page#plain_text?`.
|
172
190
|
* Aliased `Spidr::Page#ok?` to `Spidr::Page#is_ok?`.
|
173
191
|
* Aliased `Spidr::Page#redirect?` to `Spidr::Page#is_redirect?`.
|
data/Gemfile
CHANGED
@@ -6,15 +6,15 @@ end
|
|
6
6
|
|
7
7
|
gemspec
|
8
8
|
|
9
|
-
|
10
9
|
gem 'robots', group: :robots
|
11
10
|
|
12
11
|
group :development do
|
13
12
|
gem 'rake'
|
14
13
|
gem 'rubygems-tasks', '~> 0.2'
|
15
14
|
|
16
|
-
gem 'wsoc', '~> 0.1.3'
|
17
15
|
gem 'rspec', '~> 3.0'
|
16
|
+
gem 'webmock', '~> 2.0'
|
17
|
+
gem 'sinatra', '~> 1.0'
|
18
18
|
|
19
19
|
gem 'kramdown', '~> 0.12'
|
20
20
|
gem 'yard', '~> 0.8'
|
data/README.md
CHANGED
@@ -5,6 +5,7 @@
|
|
5
5
|
* [Issues](https://github.com/postmodern/spidr/issues)
|
6
6
|
* [Mailing List](http://groups.google.com/group/spidr)
|
7
7
|
* [IRC](http://webchat.freenode.net/?channels=spidr&uio=d4)
|
8
|
+
* [![Build Status](https://travis-ci.org/postmodern/spidr.svg)](https://travis-ci.org/postmodern/spidr)
|
8
9
|
|
9
10
|
## Description
|
10
11
|
|
@@ -28,7 +29,8 @@ and easy to use.
|
|
28
29
|
* Port number
|
29
30
|
* Full link
|
30
31
|
* URL extension
|
31
|
-
*
|
32
|
+
* Optional `/robots.txt` support.
|
33
|
+
* Provides callbacks for:
|
32
34
|
* Every visited Page.
|
33
35
|
* Every visited URL.
|
34
36
|
* Every visited URL that matches a specified pattern.
|
@@ -181,7 +183,7 @@ Skip the processing of links:
|
|
181
183
|
|
182
184
|
## Requirements
|
183
185
|
|
184
|
-
* [ruby] >=
|
186
|
+
* [ruby] >= 2.0.0
|
185
187
|
* [nokogiri] ~> 1.3
|
186
188
|
|
187
189
|
## Install
|
data/Rakefile
CHANGED
data/gemspec.yml
CHANGED
data/lib/spidr/agent.rb
CHANGED
@@ -1,7 +1,9 @@
|
|
1
|
+
require 'spidr/settings/user_agent'
|
1
2
|
require 'spidr/agent/sanitizers'
|
2
3
|
require 'spidr/agent/filters'
|
3
4
|
require 'spidr/agent/events'
|
4
5
|
require 'spidr/agent/actions'
|
6
|
+
require 'spidr/agent/robots'
|
5
7
|
require 'spidr/page'
|
6
8
|
require 'spidr/session_cache'
|
7
9
|
require 'spidr/cookie_jar'
|
@@ -12,14 +14,11 @@ require 'openssl'
|
|
12
14
|
require 'net/http'
|
13
15
|
require 'set'
|
14
16
|
|
15
|
-
begin
|
16
|
-
require 'robots'
|
17
|
-
rescue LoadError
|
18
|
-
end
|
19
|
-
|
20
17
|
module Spidr
|
21
18
|
class Agent
|
22
19
|
|
20
|
+
include Settings::UserAgent
|
21
|
+
|
23
22
|
# HTTP Host Header to use
|
24
23
|
#
|
25
24
|
# @return [String]
|
@@ -30,10 +29,12 @@ module Spidr
|
|
30
29
|
# @return [Hash{String,Regexp => String}]
|
31
30
|
attr_reader :host_headers
|
32
31
|
|
33
|
-
#
|
32
|
+
# HTTP Headers to use for every request
|
34
33
|
#
|
35
|
-
# @return [String]
|
36
|
-
|
34
|
+
# @return [Hash{String => String}]
|
35
|
+
#
|
36
|
+
# @since 0.6.0
|
37
|
+
attr_reader :default_headers
|
37
38
|
|
38
39
|
# HTTP Authentication credentials
|
39
40
|
#
|
@@ -65,11 +66,23 @@ module Spidr
|
|
65
66
|
# @return [Array<URI::HTTP>]
|
66
67
|
attr_reader :queue
|
67
68
|
|
69
|
+
# The session cache
|
70
|
+
#
|
71
|
+
# @return [SessionCache]
|
72
|
+
#
|
73
|
+
# @since 0.6.0
|
74
|
+
attr_reader :sessions
|
75
|
+
|
68
76
|
# Cached cookies
|
69
77
|
#
|
70
78
|
# @return [CookieJar]
|
71
79
|
attr_reader :cookies
|
72
|
-
|
80
|
+
|
81
|
+
# Maximum number of pages to visit.
|
82
|
+
#
|
83
|
+
# @return [Integer]
|
84
|
+
attr_reader :limit
|
85
|
+
|
73
86
|
# Maximum depth
|
74
87
|
#
|
75
88
|
# @return [Integer]
|
@@ -86,6 +99,21 @@ module Spidr
|
|
86
99
|
# @param [Hash] options
|
87
100
|
# Additional options
|
88
101
|
#
|
102
|
+
# @option options [Integer] :open_timeout (Spidr.open_timeout)
|
103
|
+
# Optional open timeout.
|
104
|
+
#
|
105
|
+
# @option options [Integer] :read_timeout (Spidr.read_timeout)
|
106
|
+
# Optional read timeout.
|
107
|
+
#
|
108
|
+
# @option options [Integer] :ssl_timeout (Spidr.ssl_timeout)
|
109
|
+
# Optional ssl timeout.
|
110
|
+
#
|
111
|
+
# @option options [Integer] :continue_timeout (Spidr.continue_timeout)
|
112
|
+
# Optional continue timeout.
|
113
|
+
#
|
114
|
+
# @option options [Integer] :keep_alive_timeout (Spidr.keep_alive_timeout)
|
115
|
+
# Optional keep_alive timeout.
|
116
|
+
#
|
89
117
|
# @option options [Hash] :proxy (Spidr.proxy)
|
90
118
|
# The proxy information to use.
|
91
119
|
#
|
@@ -101,6 +129,9 @@ module Spidr
|
|
101
129
|
# @option :proxy [String] :password
|
102
130
|
# The password to authenticate with.
|
103
131
|
#
|
132
|
+
# @option options [Hash{String => String}] :default_headers
|
133
|
+
# Default headers to set for every request.
|
134
|
+
#
|
104
135
|
# @option options [String] :host_header
|
105
136
|
# The HTTP Host header to use with each request.
|
106
137
|
#
|
@@ -122,6 +153,9 @@ module Spidr
|
|
122
153
|
# @option options [Set, Array] :history
|
123
154
|
# The initial list of visited URLs.
|
124
155
|
#
|
156
|
+
# @option options [Integer] :limit
|
157
|
+
# The maximum number of pages to visit.
|
158
|
+
#
|
125
159
|
# @option options [Integer] :max_depth
|
126
160
|
# The maximum link depth to follow.
|
127
161
|
#
|
@@ -148,10 +182,16 @@ module Spidr
|
|
148
182
|
@host_headers.merge!(options[:host_headers])
|
149
183
|
end
|
150
184
|
|
185
|
+
@default_headers = {}
|
186
|
+
|
187
|
+
if options[:default_headers]
|
188
|
+
@default_headers.merge!(options[:default_headers])
|
189
|
+
end
|
190
|
+
|
151
191
|
@user_agent = options.fetch(:user_agent,Spidr.user_agent)
|
152
192
|
@referer = options[:referer]
|
153
193
|
|
154
|
-
@sessions = SessionCache.new(options
|
194
|
+
@sessions = SessionCache.new(options)
|
155
195
|
@cookies = CookieJar.new
|
156
196
|
@authorized = AuthStore.new
|
157
197
|
|
@@ -161,15 +201,16 @@ module Spidr
|
|
161
201
|
@failures = Set[]
|
162
202
|
@queue = []
|
163
203
|
|
204
|
+
@limit = options[:limit]
|
164
205
|
@levels = Hash.new(0)
|
165
206
|
@max_depth = options[:max_depth]
|
166
207
|
|
167
|
-
if options
|
168
|
-
|
169
|
-
|
170
|
-
end
|
208
|
+
if options[:queue]
|
209
|
+
self.queue = options[:queue]
|
210
|
+
end
|
171
211
|
|
172
|
-
|
212
|
+
if options[:history]
|
213
|
+
self.history = options[:history]
|
173
214
|
end
|
174
215
|
|
175
216
|
initialize_sanitizers(options)
|
@@ -177,6 +218,10 @@ module Spidr
|
|
177
218
|
initialize_actions(options)
|
178
219
|
initialize_events(options)
|
179
220
|
|
221
|
+
if options.fetch(:robots,Spidr.robots?)
|
222
|
+
initialize_robots
|
223
|
+
end
|
224
|
+
|
180
225
|
yield self if block_given?
|
181
226
|
end
|
182
227
|
|
@@ -252,6 +297,37 @@ module Spidr
|
|
252
297
|
agent.start_at(URI::HTTP.build(host: name, path: '/'))
|
253
298
|
end
|
254
299
|
|
300
|
+
#
|
301
|
+
# The proxy information the agent uses.
|
302
|
+
#
|
303
|
+
# @return [Proxy]
|
304
|
+
# The proxy information.
|
305
|
+
#
|
306
|
+
# @see SessionCache#proxy
|
307
|
+
#
|
308
|
+
# @since 0.2.2
|
309
|
+
#
|
310
|
+
def proxy
|
311
|
+
@sessions.proxy
|
312
|
+
end
|
313
|
+
|
314
|
+
#
|
315
|
+
# Sets the proxy information that the agent uses.
|
316
|
+
#
|
317
|
+
# @param [Proxy] new_proxy
|
318
|
+
# The new proxy information.
|
319
|
+
#
|
320
|
+
# @return [Proxy]
|
321
|
+
# The new proxy information.
|
322
|
+
#
|
323
|
+
# @see SessionCache#proxy=
|
324
|
+
#
|
325
|
+
# @since 0.2.2
|
326
|
+
#
|
327
|
+
def proxy=(new_proxy)
|
328
|
+
@sessions.proxy = new_proxy
|
329
|
+
end
|
330
|
+
|
255
331
|
#
|
256
332
|
# Clears the history of the agent.
|
257
333
|
#
|
@@ -292,7 +368,7 @@ module Spidr
|
|
292
368
|
def run(&block)
|
293
369
|
@running = true
|
294
370
|
|
295
|
-
until (@queue.empty? || paused?)
|
371
|
+
until (@queue.empty? || paused? || limit_reached?)
|
296
372
|
begin
|
297
373
|
visit_page(dequeue,&block)
|
298
374
|
rescue Actions::Paused
|
@@ -316,37 +392,6 @@ module Spidr
|
|
316
392
|
@running == true
|
317
393
|
end
|
318
394
|
|
319
|
-
#
|
320
|
-
# The proxy information the agent uses.
|
321
|
-
#
|
322
|
-
# @return [Hash]
|
323
|
-
# The proxy information.
|
324
|
-
#
|
325
|
-
# @see SessionCache#proxy
|
326
|
-
#
|
327
|
-
# @since 0.2.2
|
328
|
-
#
|
329
|
-
def proxy
|
330
|
-
@sessions.proxy
|
331
|
-
end
|
332
|
-
|
333
|
-
#
|
334
|
-
# Sets the proxy information that the agent uses.
|
335
|
-
#
|
336
|
-
# @param [Hash] new_proxy
|
337
|
-
# The new proxy information.
|
338
|
-
#
|
339
|
-
# @return [Hash]
|
340
|
-
# The new proxy information.
|
341
|
-
#
|
342
|
-
# @see SessionCache#proxy=
|
343
|
-
#
|
344
|
-
# @since 0.2.2
|
345
|
-
#
|
346
|
-
def proxy=(new_proxy)
|
347
|
-
@sessions.proxy = new_proxy
|
348
|
-
end
|
349
|
-
|
350
395
|
#
|
351
396
|
# Sets the history of URLs that were previously visited.
|
352
397
|
#
|
@@ -408,19 +453,6 @@ module Spidr
|
|
408
453
|
return @history.include?(url)
|
409
454
|
end
|
410
455
|
|
411
|
-
#
|
412
|
-
# Determines whether a URL is allowed by the robot policy.
|
413
|
-
#
|
414
|
-
# @param [URI::HTTP, String] url
|
415
|
-
# The URL to check.
|
416
|
-
#
|
417
|
-
# @return [Boolean]
|
418
|
-
# Specifies whether a URL is allowed by the robot policy.
|
419
|
-
#
|
420
|
-
def robot_allowed?(url)
|
421
|
-
@robots ? @robots.allowed?(url) : true
|
422
|
-
end
|
423
|
-
|
424
456
|
#
|
425
457
|
# Sets the list of failed URLs.
|
426
458
|
#
|
@@ -536,7 +568,7 @@ module Spidr
|
|
536
568
|
return false
|
537
569
|
rescue Actions::Action
|
538
570
|
end
|
539
|
-
|
571
|
+
|
540
572
|
@queue << url
|
541
573
|
@levels[url] = level
|
542
574
|
return true
|
@@ -544,7 +576,7 @@ module Spidr
|
|
544
576
|
|
545
577
|
return false
|
546
578
|
end
|
547
|
-
|
579
|
+
|
548
580
|
#
|
549
581
|
# Requests and creates a new Page object from a given URL.
|
550
582
|
#
|
@@ -676,6 +708,45 @@ module Spidr
|
|
676
708
|
|
677
709
|
protected
|
678
710
|
|
711
|
+
#
|
712
|
+
# Prepares request headers for the given URL.
|
713
|
+
#
|
714
|
+
# @param [URI::HTTP] url
|
715
|
+
# The URL to prepare the request headers for.
|
716
|
+
#
|
717
|
+
# @return [Hash{String => String}]
|
718
|
+
# The prepared headers.
|
719
|
+
#
|
720
|
+
# @since 0.6.0
|
721
|
+
#
|
722
|
+
def prepare_request_headers(url)
|
723
|
+
# set any additional HTTP headers
|
724
|
+
headers = @default_headers.dup
|
725
|
+
|
726
|
+
unless @host_headers.empty?
|
727
|
+
@host_headers.each do |name,header|
|
728
|
+
if host.match(name)
|
729
|
+
headers['Host'] = header
|
730
|
+
break
|
731
|
+
end
|
732
|
+
end
|
733
|
+
end
|
734
|
+
|
735
|
+
headers['Host'] ||= @host_header if @host_header
|
736
|
+
headers['User-Agent'] = @user_agent if @user_agent
|
737
|
+
headers['Referer'] = @referer if @referer
|
738
|
+
|
739
|
+
if (authorization = @authorized.for_url(url))
|
740
|
+
headers['Authorization'] = "Basic #{authorization}"
|
741
|
+
end
|
742
|
+
|
743
|
+
if (header_cookies = @cookies.for_host(url.host))
|
744
|
+
headers['Cookie'] = header_cookies
|
745
|
+
end
|
746
|
+
|
747
|
+
return headers
|
748
|
+
end
|
749
|
+
|
679
750
|
#
|
680
751
|
# Normalizes the request path and grabs a session to handle page
|
681
752
|
# get and post requests.
|
@@ -709,29 +780,7 @@ module Spidr
|
|
709
780
|
# append the URL query to the path
|
710
781
|
path += "?#{url.query}" if url.query
|
711
782
|
|
712
|
-
|
713
|
-
headers = {}
|
714
|
-
|
715
|
-
unless @host_headers.empty?
|
716
|
-
@host_headers.each do |name,header|
|
717
|
-
if host.match(name)
|
718
|
-
headers['Host'] = header
|
719
|
-
break
|
720
|
-
end
|
721
|
-
end
|
722
|
-
end
|
723
|
-
|
724
|
-
headers['Host'] ||= @host_header if @host_header
|
725
|
-
headers['User-Agent'] = @user_agent if @user_agent
|
726
|
-
headers['Referer'] = @referer if @referer
|
727
|
-
|
728
|
-
if (authorization = @authorized.for_url(url))
|
729
|
-
headers['Authorization'] = "Basic #{authorization}"
|
730
|
-
end
|
731
|
-
|
732
|
-
if (header_cookies = @cookies.for_host(url.host))
|
733
|
-
headers['Cookie'] = header_cookies
|
734
|
-
end
|
783
|
+
headers = prepare_request_headers(url)
|
735
784
|
|
736
785
|
begin
|
737
786
|
sleep(@delay) if @delay > 0
|
@@ -762,6 +811,17 @@ module Spidr
|
|
762
811
|
@queue.shift
|
763
812
|
end
|
764
813
|
|
814
|
+
#
|
815
|
+
# Determines if the maximum limit has been reached.
|
816
|
+
#
|
817
|
+
# @return [Boolean]
|
818
|
+
#
|
819
|
+
# @since 0.6.0
|
820
|
+
#
|
821
|
+
def limit_reached?
|
822
|
+
@limit && @history.length >= @limit
|
823
|
+
end
|
824
|
+
|
765
825
|
#
|
766
826
|
# Determines if a given URL should be visited.
|
767
827
|
#
|