spidr 0.5.0 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.travis.yml +14 -0
- data/ChangeLog.md +20 -2
- data/Gemfile +2 -2
- data/README.md +4 -2
- data/Rakefile +1 -0
- data/gemspec.yml +1 -1
- data/lib/spidr/agent.rb +145 -85
- data/lib/spidr/agent/filters.rb +1 -9
- data/lib/spidr/agent/robots.rb +36 -0
- data/lib/spidr/page.rb +76 -28
- data/lib/spidr/page/{headers.rb → content_types.rb} +2 -147
- data/lib/spidr/page/cookies.rb +60 -0
- data/lib/spidr/page/{links.rb → html.rb} +47 -23
- data/lib/spidr/page/status_codes.rb +112 -0
- data/lib/spidr/proxy.rb +56 -0
- data/lib/spidr/session_cache.rb +60 -24
- data/lib/spidr/settings.rb +3 -0
- data/lib/spidr/settings/proxy.rb +61 -0
- data/lib/spidr/settings/timeouts.rb +33 -0
- data/lib/spidr/settings/user_agent.rb +14 -0
- data/lib/spidr/spidr.rb +15 -79
- data/lib/spidr/version.rb +1 -1
- data/spec/agent/actions_spec.rb +158 -32
- data/spec/agent/filters_spec.rb +46 -29
- data/spec/agent/sanitizers_spec.rb +25 -31
- data/spec/agent_spec.rb +772 -50
- data/spec/example_app.rb +27 -0
- data/spec/example_page.rb +33 -0
- data/spec/page/content_types_spec.rb +150 -0
- data/spec/page/cookies_spec.rb +58 -0
- data/spec/page/html_spec.rb +524 -0
- data/spec/page/status_codes_spec.rb +87 -0
- data/spec/page_spec.rb +114 -78
- data/spec/proxy_spec.rb +45 -0
- data/spec/session_cache.rb +103 -2
- data/spec/settings/proxy_examples.rb +82 -0
- data/spec/settings/timeouts_examples.rb +93 -0
- data/spec/settings/user_agent_examples.rb +25 -0
- data/spec/spidr_spec.rb +6 -29
- data/spidr.gemspec +38 -109
- metadata +35 -31
- data/lib/spidr/page/body.rb +0 -98
- data/spec/helpers/history.rb +0 -34
- data/spec/helpers/page.rb +0 -8
- data/spec/helpers/wsoc.rb +0 -83
- data/spec/page_examples.rb +0 -21
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 114364609c8da8613e22e9f18777cd9c79e1ac8a
|
4
|
+
data.tar.gz: edb694a4a695217a2adf2cd44ffeafb679e8292c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 50064bb0227d7dc0b3ff4bf55ec72b74635c89d6a7bfe24c6948c73ff439b74dbe6f2c72276586389340ed050a30b4814faa0d2584d490badc337c97393bf4f3
|
7
|
+
data.tar.gz: 46326b368267b66d647ea09712ec499dc5dbed82fb28b65d5573832d661328a768e140fcec72a75e68516dae6401ded9558978b13e7c6158249fd14737d1c93f
|
data/.travis.yml
ADDED
data/ChangeLog.md
CHANGED
@@ -1,3 +1,21 @@
|
|
1
|
+
### 0.6.0 / 2016-08-04
|
2
|
+
|
3
|
+
* Added {Spidr::Proxy}.
|
4
|
+
* Added more options to {Spidr::Agent#initialize}:
|
5
|
+
* `:default_headers`: specifies the default headers to set in all requests
|
6
|
+
(@maccman).
|
7
|
+
* `:limit`: specifies the maximum number of links to visit.
|
8
|
+
* `:open_timeout`, `:read_timeout`, `:ssl_timeout`, `:continue_timeout`,
|
9
|
+
and `:keep_alive_timeout`: sets `Net::HTTP` timeouts.
|
10
|
+
* Allow {Spidr::Settings::Proxy#proxy= Spidr.proxy=} to accept `nil`.
|
11
|
+
* Use `Net::HTTPResponse#get_fields` in {Spidr::Page} to correctly return
|
12
|
+
multiple values for repeated headers.
|
13
|
+
* Fixed a bug in {Spidr::Page#method_missing} where method names were not being
|
14
|
+
correctly converted to header names.
|
15
|
+
* Fixed a bug in {Spidr::Page#cookie_params} where `Set-Cookie` flags were not
|
16
|
+
being filtered out.
|
17
|
+
* Rewrote the specs to use webmock and increased spec coverage.
|
18
|
+
|
1
19
|
### 0.5.0 / 2016-01-03
|
2
20
|
|
3
21
|
* Added support for respecting `robots.txt` files.
|
@@ -166,8 +184,8 @@
|
|
166
184
|
* Added a HTTP session cache to {Spidr::Agent}, per suggestion of falter.
|
167
185
|
* Added `Spidr::Agent#get_session`.
|
168
186
|
* Added `Spidr::Agent#kill_session`.
|
169
|
-
* Added {Spidr.proxy=}.
|
170
|
-
* Added {Spidr.disable_proxy!}.
|
187
|
+
* Added {Spidr::Settings::Proxy#proxy= Spidr.proxy=}.
|
188
|
+
* Added {Spidr::Settings::Proxy#disable_proxy! Spidr.disable_proxy!}.
|
171
189
|
* Aliased `Spidr::Page#txt?` to `Spidr::Page#plain_text?`.
|
172
190
|
* Aliased `Spidr::Page#ok?` to `Spidr::Page#is_ok?`.
|
173
191
|
* Aliased `Spidr::Page#redirect?` to `Spidr::Page#is_redirect?`.
|
data/Gemfile
CHANGED
@@ -6,15 +6,15 @@ end
|
|
6
6
|
|
7
7
|
gemspec
|
8
8
|
|
9
|
-
|
10
9
|
gem 'robots', group: :robots
|
11
10
|
|
12
11
|
group :development do
|
13
12
|
gem 'rake'
|
14
13
|
gem 'rubygems-tasks', '~> 0.2'
|
15
14
|
|
16
|
-
gem 'wsoc', '~> 0.1.3'
|
17
15
|
gem 'rspec', '~> 3.0'
|
16
|
+
gem 'webmock', '~> 2.0'
|
17
|
+
gem 'sinatra', '~> 1.0'
|
18
18
|
|
19
19
|
gem 'kramdown', '~> 0.12'
|
20
20
|
gem 'yard', '~> 0.8'
|
data/README.md
CHANGED
@@ -5,6 +5,7 @@
|
|
5
5
|
* [Issues](https://github.com/postmodern/spidr/issues)
|
6
6
|
* [Mailing List](http://groups.google.com/group/spidr)
|
7
7
|
* [IRC](http://webchat.freenode.net/?channels=spidr&uio=d4)
|
8
|
+
* [Build Status](https://travis-ci.org/postmodern/spidr)
|
8
9
|
|
9
10
|
## Description
|
10
11
|
|
@@ -28,7 +29,8 @@ and easy to use.
|
|
28
29
|
* Port number
|
29
30
|
* Full link
|
30
31
|
* URL extension
|
31
|
-
*
|
32
|
+
* Optional `/robots.txt` support.
|
33
|
+
* Provides callbacks for:
|
32
34
|
* Every visited Page.
|
33
35
|
* Every visited URL.
|
34
36
|
* Every visited URL that matches a specified pattern.
|
@@ -181,7 +183,7 @@ Skip the processing of links:
|
|
181
183
|
|
182
184
|
## Requirements
|
183
185
|
|
184
|
-
* [ruby] >=
|
186
|
+
* [ruby] >= 2.0.0
|
185
187
|
* [nokogiri] ~> 1.3
|
186
188
|
|
187
189
|
## Install
|
data/Rakefile
CHANGED
data/gemspec.yml
CHANGED
data/lib/spidr/agent.rb
CHANGED
@@ -1,7 +1,9 @@
|
|
1
|
+
require 'spidr/settings/user_agent'
|
1
2
|
require 'spidr/agent/sanitizers'
|
2
3
|
require 'spidr/agent/filters'
|
3
4
|
require 'spidr/agent/events'
|
4
5
|
require 'spidr/agent/actions'
|
6
|
+
require 'spidr/agent/robots'
|
5
7
|
require 'spidr/page'
|
6
8
|
require 'spidr/session_cache'
|
7
9
|
require 'spidr/cookie_jar'
|
@@ -12,14 +14,11 @@ require 'openssl'
|
|
12
14
|
require 'net/http'
|
13
15
|
require 'set'
|
14
16
|
|
15
|
-
begin
|
16
|
-
require 'robots'
|
17
|
-
rescue LoadError
|
18
|
-
end
|
19
|
-
|
20
17
|
module Spidr
|
21
18
|
class Agent
|
22
19
|
|
20
|
+
include Settings::UserAgent
|
21
|
+
|
23
22
|
# HTTP Host Header to use
|
24
23
|
#
|
25
24
|
# @return [String]
|
@@ -30,10 +29,12 @@ module Spidr
|
|
30
29
|
# @return [Hash{String,Regexp => String}]
|
31
30
|
attr_reader :host_headers
|
32
31
|
|
33
|
-
#
|
32
|
+
# HTTP Headers to use for every request
|
34
33
|
#
|
35
|
-
# @return [String]
|
36
|
-
|
34
|
+
# @return [Hash{String => String}]
|
35
|
+
#
|
36
|
+
# @since 0.6.0
|
37
|
+
attr_reader :default_headers
|
37
38
|
|
38
39
|
# HTTP Authentication credentials
|
39
40
|
#
|
@@ -65,11 +66,23 @@ module Spidr
|
|
65
66
|
# @return [Array<URI::HTTP>]
|
66
67
|
attr_reader :queue
|
67
68
|
|
69
|
+
# The session cache
|
70
|
+
#
|
71
|
+
# @return [SessionCache]
|
72
|
+
#
|
73
|
+
# @since 0.6.0
|
74
|
+
attr_reader :sessions
|
75
|
+
|
68
76
|
# Cached cookies
|
69
77
|
#
|
70
78
|
# @return [CookieJar]
|
71
79
|
attr_reader :cookies
|
72
|
-
|
80
|
+
|
81
|
+
# Maximum number of pages to visit.
|
82
|
+
#
|
83
|
+
# @return [Integer]
|
84
|
+
attr_reader :limit
|
85
|
+
|
73
86
|
# Maximum depth
|
74
87
|
#
|
75
88
|
# @return [Integer]
|
@@ -86,6 +99,21 @@ module Spidr
|
|
86
99
|
# @param [Hash] options
|
87
100
|
# Additional options
|
88
101
|
#
|
102
|
+
# @option options [Integer] :open_timeout (Spidr.open_timeout)
|
103
|
+
# Optional open timeout.
|
104
|
+
#
|
105
|
+
# @option options [Integer] :read_timeout (Spidr.read_timeout)
|
106
|
+
# Optional read timeout.
|
107
|
+
#
|
108
|
+
# @option options [Integer] :ssl_timeout (Spidr.ssl_timeout)
|
109
|
+
# Optional ssl timeout.
|
110
|
+
#
|
111
|
+
# @option options [Integer] :continue_timeout (Spidr.continue_timeout)
|
112
|
+
# Optional continue timeout.
|
113
|
+
#
|
114
|
+
# @option options [Integer] :keep_alive_timeout (Spidr.keep_alive_timeout)
|
115
|
+
# Optional keep_alive timeout.
|
116
|
+
#
|
89
117
|
# @option options [Hash] :proxy (Spidr.proxy)
|
90
118
|
# The proxy information to use.
|
91
119
|
#
|
@@ -101,6 +129,9 @@ module Spidr
|
|
101
129
|
# @option :proxy [String] :password
|
102
130
|
# The password to authenticate with.
|
103
131
|
#
|
132
|
+
# @option options [Hash{String => String}] :default_headers
|
133
|
+
# Default headers to set for every request.
|
134
|
+
#
|
104
135
|
# @option options [String] :host_header
|
105
136
|
# The HTTP Host header to use with each request.
|
106
137
|
#
|
@@ -122,6 +153,9 @@ module Spidr
|
|
122
153
|
# @option options [Set, Array] :history
|
123
154
|
# The initial list of visited URLs.
|
124
155
|
#
|
156
|
+
# @option options [Integer] :limit
|
157
|
+
# The maximum number of pages to visit.
|
158
|
+
#
|
125
159
|
# @option options [Integer] :max_depth
|
126
160
|
# The maximum link depth to follow.
|
127
161
|
#
|
@@ -148,10 +182,16 @@ module Spidr
|
|
148
182
|
@host_headers.merge!(options[:host_headers])
|
149
183
|
end
|
150
184
|
|
185
|
+
@default_headers = {}
|
186
|
+
|
187
|
+
if options[:default_headers]
|
188
|
+
@default_headers.merge!(options[:default_headers])
|
189
|
+
end
|
190
|
+
|
151
191
|
@user_agent = options.fetch(:user_agent,Spidr.user_agent)
|
152
192
|
@referer = options[:referer]
|
153
193
|
|
154
|
-
@sessions = SessionCache.new(options
|
194
|
+
@sessions = SessionCache.new(options)
|
155
195
|
@cookies = CookieJar.new
|
156
196
|
@authorized = AuthStore.new
|
157
197
|
|
@@ -161,15 +201,16 @@ module Spidr
|
|
161
201
|
@failures = Set[]
|
162
202
|
@queue = []
|
163
203
|
|
204
|
+
@limit = options[:limit]
|
164
205
|
@levels = Hash.new(0)
|
165
206
|
@max_depth = options[:max_depth]
|
166
207
|
|
167
|
-
if options
|
168
|
-
|
169
|
-
|
170
|
-
end
|
208
|
+
if options[:queue]
|
209
|
+
self.queue = options[:queue]
|
210
|
+
end
|
171
211
|
|
172
|
-
|
212
|
+
if options[:history]
|
213
|
+
self.history = options[:history]
|
173
214
|
end
|
174
215
|
|
175
216
|
initialize_sanitizers(options)
|
@@ -177,6 +218,10 @@ module Spidr
|
|
177
218
|
initialize_actions(options)
|
178
219
|
initialize_events(options)
|
179
220
|
|
221
|
+
if options.fetch(:robots,Spidr.robots?)
|
222
|
+
initialize_robots
|
223
|
+
end
|
224
|
+
|
180
225
|
yield self if block_given?
|
181
226
|
end
|
182
227
|
|
@@ -252,6 +297,37 @@ module Spidr
|
|
252
297
|
agent.start_at(URI::HTTP.build(host: name, path: '/'))
|
253
298
|
end
|
254
299
|
|
300
|
+
#
|
301
|
+
# The proxy information the agent uses.
|
302
|
+
#
|
303
|
+
# @return [Proxy]
|
304
|
+
# The proxy information.
|
305
|
+
#
|
306
|
+
# @see SessionCache#proxy
|
307
|
+
#
|
308
|
+
# @since 0.2.2
|
309
|
+
#
|
310
|
+
def proxy
|
311
|
+
@sessions.proxy
|
312
|
+
end
|
313
|
+
|
314
|
+
#
|
315
|
+
# Sets the proxy information that the agent uses.
|
316
|
+
#
|
317
|
+
# @param [Proxy] new_proxy
|
318
|
+
# The new proxy information.
|
319
|
+
#
|
320
|
+
# @return [Proxy]
|
321
|
+
# The new proxy information.
|
322
|
+
#
|
323
|
+
# @see SessionCache#proxy=
|
324
|
+
#
|
325
|
+
# @since 0.2.2
|
326
|
+
#
|
327
|
+
def proxy=(new_proxy)
|
328
|
+
@sessions.proxy = new_proxy
|
329
|
+
end
|
330
|
+
|
255
331
|
#
|
256
332
|
# Clears the history of the agent.
|
257
333
|
#
|
@@ -292,7 +368,7 @@ module Spidr
|
|
292
368
|
def run(&block)
|
293
369
|
@running = true
|
294
370
|
|
295
|
-
until (@queue.empty? || paused?)
|
371
|
+
until (@queue.empty? || paused? || limit_reached?)
|
296
372
|
begin
|
297
373
|
visit_page(dequeue,&block)
|
298
374
|
rescue Actions::Paused
|
@@ -316,37 +392,6 @@ module Spidr
|
|
316
392
|
@running == true
|
317
393
|
end
|
318
394
|
|
319
|
-
#
|
320
|
-
# The proxy information the agent uses.
|
321
|
-
#
|
322
|
-
# @return [Hash]
|
323
|
-
# The proxy information.
|
324
|
-
#
|
325
|
-
# @see SessionCache#proxy
|
326
|
-
#
|
327
|
-
# @since 0.2.2
|
328
|
-
#
|
329
|
-
def proxy
|
330
|
-
@sessions.proxy
|
331
|
-
end
|
332
|
-
|
333
|
-
#
|
334
|
-
# Sets the proxy information that the agent uses.
|
335
|
-
#
|
336
|
-
# @param [Hash] new_proxy
|
337
|
-
# The new proxy information.
|
338
|
-
#
|
339
|
-
# @return [Hash]
|
340
|
-
# The new proxy information.
|
341
|
-
#
|
342
|
-
# @see SessionCache#proxy=
|
343
|
-
#
|
344
|
-
# @since 0.2.2
|
345
|
-
#
|
346
|
-
def proxy=(new_proxy)
|
347
|
-
@sessions.proxy = new_proxy
|
348
|
-
end
|
349
|
-
|
350
395
|
#
|
351
396
|
# Sets the history of URLs that were previously visited.
|
352
397
|
#
|
@@ -408,19 +453,6 @@ module Spidr
|
|
408
453
|
return @history.include?(url)
|
409
454
|
end
|
410
455
|
|
411
|
-
#
|
412
|
-
# Determines whether a URL is allowed by the robot policy.
|
413
|
-
#
|
414
|
-
# @param [URI::HTTP, String] url
|
415
|
-
# The URL to check.
|
416
|
-
#
|
417
|
-
# @return [Boolean]
|
418
|
-
# Specifies whether a URL is allowed by the robot policy.
|
419
|
-
#
|
420
|
-
def robot_allowed?(url)
|
421
|
-
@robots ? @robots.allowed?(url) : true
|
422
|
-
end
|
423
|
-
|
424
456
|
#
|
425
457
|
# Sets the list of failed URLs.
|
426
458
|
#
|
@@ -536,7 +568,7 @@ module Spidr
|
|
536
568
|
return false
|
537
569
|
rescue Actions::Action
|
538
570
|
end
|
539
|
-
|
571
|
+
|
540
572
|
@queue << url
|
541
573
|
@levels[url] = level
|
542
574
|
return true
|
@@ -544,7 +576,7 @@ module Spidr
|
|
544
576
|
|
545
577
|
return false
|
546
578
|
end
|
547
|
-
|
579
|
+
|
548
580
|
#
|
549
581
|
# Requests and creates a new Page object from a given URL.
|
550
582
|
#
|
@@ -676,6 +708,45 @@ module Spidr
|
|
676
708
|
|
677
709
|
protected
|
678
710
|
|
711
|
+
#
|
712
|
+
# Prepares request headers for the given URL.
|
713
|
+
#
|
714
|
+
# @param [URI::HTTP] url
|
715
|
+
# The URL to prepare the request headers for.
|
716
|
+
#
|
717
|
+
# @return [Hash{String => String}]
|
718
|
+
# The prepared headers.
|
719
|
+
#
|
720
|
+
# @since 0.6.0
|
721
|
+
#
|
722
|
+
def prepare_request_headers(url)
|
723
|
+
# set any additional HTTP headers
|
724
|
+
headers = @default_headers.dup
|
725
|
+
|
726
|
+
unless @host_headers.empty?
|
727
|
+
@host_headers.each do |name,header|
|
728
|
+
if host.match(name)
|
729
|
+
headers['Host'] = header
|
730
|
+
break
|
731
|
+
end
|
732
|
+
end
|
733
|
+
end
|
734
|
+
|
735
|
+
headers['Host'] ||= @host_header if @host_header
|
736
|
+
headers['User-Agent'] = @user_agent if @user_agent
|
737
|
+
headers['Referer'] = @referer if @referer
|
738
|
+
|
739
|
+
if (authorization = @authorized.for_url(url))
|
740
|
+
headers['Authorization'] = "Basic #{authorization}"
|
741
|
+
end
|
742
|
+
|
743
|
+
if (header_cookies = @cookies.for_host(url.host))
|
744
|
+
headers['Cookie'] = header_cookies
|
745
|
+
end
|
746
|
+
|
747
|
+
return headers
|
748
|
+
end
|
749
|
+
|
679
750
|
#
|
680
751
|
# Normalizes the request path and grabs a session to handle page
|
681
752
|
# get and post requests.
|
@@ -709,29 +780,7 @@ module Spidr
|
|
709
780
|
# append the URL query to the path
|
710
781
|
path += "?#{url.query}" if url.query
|
711
782
|
|
712
|
-
|
713
|
-
headers = {}
|
714
|
-
|
715
|
-
unless @host_headers.empty?
|
716
|
-
@host_headers.each do |name,header|
|
717
|
-
if host.match(name)
|
718
|
-
headers['Host'] = header
|
719
|
-
break
|
720
|
-
end
|
721
|
-
end
|
722
|
-
end
|
723
|
-
|
724
|
-
headers['Host'] ||= @host_header if @host_header
|
725
|
-
headers['User-Agent'] = @user_agent if @user_agent
|
726
|
-
headers['Referer'] = @referer if @referer
|
727
|
-
|
728
|
-
if (authorization = @authorized.for_url(url))
|
729
|
-
headers['Authorization'] = "Basic #{authorization}"
|
730
|
-
end
|
731
|
-
|
732
|
-
if (header_cookies = @cookies.for_host(url.host))
|
733
|
-
headers['Cookie'] = header_cookies
|
734
|
-
end
|
783
|
+
headers = prepare_request_headers(url)
|
735
784
|
|
736
785
|
begin
|
737
786
|
sleep(@delay) if @delay > 0
|
@@ -762,6 +811,17 @@ module Spidr
|
|
762
811
|
@queue.shift
|
763
812
|
end
|
764
813
|
|
814
|
+
#
|
815
|
+
# Determines if the maximum limit has been reached.
|
816
|
+
#
|
817
|
+
# @return [Boolean]
|
818
|
+
#
|
819
|
+
# @since 0.6.0
|
820
|
+
#
|
821
|
+
def limit_reached?
|
822
|
+
@limit && @history.length >= @limit
|
823
|
+
end
|
824
|
+
|
765
825
|
#
|
766
826
|
# Determines if a given URL should be visited.
|
767
827
|
#
|