spidr 0.5.0 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. checksums.yaml +4 -4
  2. data/.travis.yml +14 -0
  3. data/ChangeLog.md +20 -2
  4. data/Gemfile +2 -2
  5. data/README.md +4 -2
  6. data/Rakefile +1 -0
  7. data/gemspec.yml +1 -1
  8. data/lib/spidr/agent.rb +145 -85
  9. data/lib/spidr/agent/filters.rb +1 -9
  10. data/lib/spidr/agent/robots.rb +36 -0
  11. data/lib/spidr/page.rb +76 -28
  12. data/lib/spidr/page/{headers.rb → content_types.rb} +2 -147
  13. data/lib/spidr/page/cookies.rb +60 -0
  14. data/lib/spidr/page/{links.rb → html.rb} +47 -23
  15. data/lib/spidr/page/status_codes.rb +112 -0
  16. data/lib/spidr/proxy.rb +56 -0
  17. data/lib/spidr/session_cache.rb +60 -24
  18. data/lib/spidr/settings.rb +3 -0
  19. data/lib/spidr/settings/proxy.rb +61 -0
  20. data/lib/spidr/settings/timeouts.rb +33 -0
  21. data/lib/spidr/settings/user_agent.rb +14 -0
  22. data/lib/spidr/spidr.rb +15 -79
  23. data/lib/spidr/version.rb +1 -1
  24. data/spec/agent/actions_spec.rb +158 -32
  25. data/spec/agent/filters_spec.rb +46 -29
  26. data/spec/agent/sanitizers_spec.rb +25 -31
  27. data/spec/agent_spec.rb +772 -50
  28. data/spec/example_app.rb +27 -0
  29. data/spec/example_page.rb +33 -0
  30. data/spec/page/content_types_spec.rb +150 -0
  31. data/spec/page/cookies_spec.rb +58 -0
  32. data/spec/page/html_spec.rb +524 -0
  33. data/spec/page/status_codes_spec.rb +87 -0
  34. data/spec/page_spec.rb +114 -78
  35. data/spec/proxy_spec.rb +45 -0
  36. data/spec/session_cache.rb +103 -2
  37. data/spec/settings/proxy_examples.rb +82 -0
  38. data/spec/settings/timeouts_examples.rb +93 -0
  39. data/spec/settings/user_agent_examples.rb +25 -0
  40. data/spec/spidr_spec.rb +6 -29
  41. data/spidr.gemspec +38 -109
  42. metadata +35 -31
  43. data/lib/spidr/page/body.rb +0 -98
  44. data/spec/helpers/history.rb +0 -34
  45. data/spec/helpers/page.rb +0 -8
  46. data/spec/helpers/wsoc.rb +0 -83
  47. data/spec/page_examples.rb +0 -21
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 31e83cba8fd67a2527641b404f82773d60b5fb97
4
- data.tar.gz: cbd735b652d209cd49a6990eedf3de6f7a22e385
3
+ metadata.gz: 114364609c8da8613e22e9f18777cd9c79e1ac8a
4
+ data.tar.gz: edb694a4a695217a2adf2cd44ffeafb679e8292c
5
5
  SHA512:
6
- metadata.gz: d33742df9e9a4ec8090d4934de3562036e149195b3567ac1143c4637012876d86a18618e9f89251506ed8aa1d9c85cc18ed324774d4da29038e975827698f265
7
- data.tar.gz: 24b08172be0184f7c68fbc63b31eaac55b0c55d70b35b8983fbbb1a3ce871e157b0bbf7d598625ef37ec3fe420c7372bc5fdaf7dd4b7131eac6e6e23e465e475
6
+ metadata.gz: 50064bb0227d7dc0b3ff4bf55ec72b74635c89d6a7bfe24c6948c73ff439b74dbe6f2c72276586389340ed050a30b4814faa0d2584d490badc337c97393bf4f3
7
+ data.tar.gz: 46326b368267b66d647ea09712ec499dc5dbed82fb28b65d5573832d661328a768e140fcec72a75e68516dae6401ded9558978b13e7c6158249fd14737d1c93f
@@ -0,0 +1,14 @@
1
+ ---
2
+ language: ruby
3
+ rvm:
4
+ - 2.0.0
5
+ - 2.1.9
6
+ - 2.2.4
7
+ - 2.3.1
8
+ - jruby
9
+ - rbx
10
+ matrix:
11
+ allow_failures:
12
+ - rvm: jruby
13
+ - rvm: rbx
14
+ script: rake spec
@@ -1,3 +1,21 @@
1
+ ### 0.6.0 / 2016-08-04
2
+
3
+ * Added {Spidr::Proxy}.
4
+ * Added more options to {Spidr::Agent#initialize}:
5
+ * `:default_headers`: specifies the default headers to set in all requests
6
+ (@maccman).
7
+ * `:limit`: specifies the maximum number of links to visit.
8
+ * `:open_timeout`, `:read_timeout`, `:ssl_timeout`, `:continue_timeout`,
9
+ and `:keep_alive_timeout`: sets `Net::HTTP` timeouts.
10
+ * Allow {Spidr::Settings::Proxy#proxy= Spidr.proxy=} to accept `nil`.
11
+ * Use `Net::HTTPResponse#get_fields` in {Spidr::Page} to correctly return
12
+ multiple values for repeated headers.
13
+ * Fixed a bug in {Spidr::Page#method_missing} where method names were not being
14
+ correctly converted to header names.
15
+ * Fixed a bug in {Spidr::Page#cookie_params} where `Set-Cookie` flags were not
16
+ being filtered out.
17
+ * Rewrote the specs to use webmock and increased spec coverage.
18
+
1
19
  ### 0.5.0 / 2016-01-03
2
20
 
3
21
  * Added support for respecting `robots.txt` files.
@@ -166,8 +184,8 @@
166
184
  * Added a HTTP session cache to {Spidr::Agent}, per suggestion of falter.
167
185
  * Added `Spidr::Agent#get_session`.
168
186
  * Added `Spidr::Agent#kill_session`.
169
- * Added {Spidr.proxy=}.
170
- * Added {Spidr.disable_proxy!}.
187
+ * Added {Spidr::Settings::Proxy#proxy= Spidr.proxy=}.
188
+ * Added {Spidr::Settings::Proxy#disable_proxy! Spidr.disable_proxy!}.
171
189
  * Aliased `Spidr::Page#txt?` to `Spidr::Page#plain_text?`.
172
190
  * Aliased `Spidr::Page#ok?` to `Spidr::Page#is_ok?`.
173
191
  * Aliased `Spidr::Page#redirect?` to `Spidr::Page#is_redirect?`.
data/Gemfile CHANGED
@@ -6,15 +6,15 @@ end
6
6
 
7
7
  gemspec
8
8
 
9
-
10
9
  gem 'robots', group: :robots
11
10
 
12
11
  group :development do
13
12
  gem 'rake'
14
13
  gem 'rubygems-tasks', '~> 0.2'
15
14
 
16
- gem 'wsoc', '~> 0.1.3'
17
15
  gem 'rspec', '~> 3.0'
16
+ gem 'webmock', '~> 2.0'
17
+ gem 'sinatra', '~> 1.0'
18
18
 
19
19
  gem 'kramdown', '~> 0.12'
20
20
  gem 'yard', '~> 0.8'
data/README.md CHANGED
@@ -5,6 +5,7 @@
5
5
  * [Issues](https://github.com/postmodern/spidr/issues)
6
6
  * [Mailing List](http://groups.google.com/group/spidr)
7
7
  * [IRC](http://webchat.freenode.net/?channels=spidr&uio=d4)
8
+ * [![Build Status](https://travis-ci.org/postmodern/spidr.svg)](https://travis-ci.org/postmodern/spidr)
8
9
 
9
10
  ## Description
10
11
 
@@ -28,7 +29,8 @@ and easy to use.
28
29
  * Port number
29
30
  * Full link
30
31
  * URL extension
31
- * Provides call-backs for:
32
+ * Optional `/robots.txt` support.
33
+ * Provides callbacks for:
32
34
  * Every visited Page.
33
35
  * Every visited URL.
34
36
  * Every visited URL that matches a specified pattern.
@@ -181,7 +183,7 @@ Skip the processing of links:
181
183
 
182
184
  ## Requirements
183
185
 
184
- * [ruby] >= 1.9.1
186
+ * [ruby] >= 2.0.0
185
187
  * [nokogiri] ~> 1.3
186
188
 
187
189
  ## Install
data/Rakefile CHANGED
@@ -16,3 +16,4 @@ task :default => :spec
16
16
 
17
17
  require 'yard'
18
18
  YARD::Rake::YardocTask.new
19
+ task :doc => :yard
@@ -11,7 +11,7 @@ email: postmodern.mod3@gmail.com
11
11
  homepage: https://github.com/postmodern/spidr#readme
12
12
  has_yard: true
13
13
 
14
- required_ruby_version: ">= 1.9.1"
14
+ required_ruby_version: ">= 2.0.0"
15
15
 
16
16
  dependencies:
17
17
  nokogiri: ~> 1.3
@@ -1,7 +1,9 @@
1
+ require 'spidr/settings/user_agent'
1
2
  require 'spidr/agent/sanitizers'
2
3
  require 'spidr/agent/filters'
3
4
  require 'spidr/agent/events'
4
5
  require 'spidr/agent/actions'
6
+ require 'spidr/agent/robots'
5
7
  require 'spidr/page'
6
8
  require 'spidr/session_cache'
7
9
  require 'spidr/cookie_jar'
@@ -12,14 +14,11 @@ require 'openssl'
12
14
  require 'net/http'
13
15
  require 'set'
14
16
 
15
- begin
16
- require 'robots'
17
- rescue LoadError
18
- end
19
-
20
17
  module Spidr
21
18
  class Agent
22
19
 
20
+ include Settings::UserAgent
21
+
23
22
  # HTTP Host Header to use
24
23
  #
25
24
  # @return [String]
@@ -30,10 +29,12 @@ module Spidr
30
29
  # @return [Hash{String,Regexp => String}]
31
30
  attr_reader :host_headers
32
31
 
33
- # User-Agent to use
32
+ # HTTP Headers to use for every request
34
33
  #
35
- # @return [String]
36
- attr_accessor :user_agent
34
+ # @return [Hash{String => String}]
35
+ #
36
+ # @since 0.6.0
37
+ attr_reader :default_headers
37
38
 
38
39
  # HTTP Authentication credentials
39
40
  #
@@ -65,11 +66,23 @@ module Spidr
65
66
  # @return [Array<URI::HTTP>]
66
67
  attr_reader :queue
67
68
 
69
+ # The session cache
70
+ #
71
+ # @return [SessionCache]
72
+ #
73
+ # @since 0.6.0
74
+ attr_reader :sessions
75
+
68
76
  # Cached cookies
69
77
  #
70
78
  # @return [CookieJar]
71
79
  attr_reader :cookies
72
-
80
+
81
+ # Maximum number of pages to visit.
82
+ #
83
+ # @return [Integer]
84
+ attr_reader :limit
85
+
73
86
  # Maximum depth
74
87
  #
75
88
  # @return [Integer]
@@ -86,6 +99,21 @@ module Spidr
86
99
  # @param [Hash] options
87
100
  # Additional options
88
101
  #
102
+ # @option options [Integer] :open_timeout (Spidr.open_timeout)
103
+ # Optional open timeout.
104
+ #
105
+ # @option options [Integer] :read_timeout (Spidr.read_timeout)
106
+ # Optional read timeout.
107
+ #
108
+ # @option options [Integer] :ssl_timeout (Spidr.ssl_timeout)
109
+ # Optional ssl timeout.
110
+ #
111
+ # @option options [Integer] :continue_timeout (Spidr.continue_timeout)
112
+ # Optional continue timeout.
113
+ #
114
+ # @option options [Integer] :keep_alive_timeout (Spidr.keep_alive_timeout)
115
+ # Optional keep_alive timeout.
116
+ #
89
117
  # @option options [Hash] :proxy (Spidr.proxy)
90
118
  # The proxy information to use.
91
119
  #
@@ -101,6 +129,9 @@ module Spidr
101
129
  # @option :proxy [String] :password
102
130
  # The password to authenticate with.
103
131
  #
132
+ # @option options [Hash{String => String}] :default_headers
133
+ # Default headers to set for every request.
134
+ #
104
135
  # @option options [String] :host_header
105
136
  # The HTTP Host header to use with each request.
106
137
  #
@@ -122,6 +153,9 @@ module Spidr
122
153
  # @option options [Set, Array] :history
123
154
  # The initial list of visited URLs.
124
155
  #
156
+ # @option options [Integer] :limit
157
+ # The maximum number of pages to visit.
158
+ #
125
159
  # @option options [Integer] :max_depth
126
160
  # The maximum link depth to follow.
127
161
  #
@@ -148,10 +182,16 @@ module Spidr
148
182
  @host_headers.merge!(options[:host_headers])
149
183
  end
150
184
 
185
+ @default_headers = {}
186
+
187
+ if options[:default_headers]
188
+ @default_headers.merge!(options[:default_headers])
189
+ end
190
+
151
191
  @user_agent = options.fetch(:user_agent,Spidr.user_agent)
152
192
  @referer = options[:referer]
153
193
 
154
- @sessions = SessionCache.new(options.fetch(:proxy,Spidr.proxy))
194
+ @sessions = SessionCache.new(options)
155
195
  @cookies = CookieJar.new
156
196
  @authorized = AuthStore.new
157
197
 
@@ -161,15 +201,16 @@ module Spidr
161
201
  @failures = Set[]
162
202
  @queue = []
163
203
 
204
+ @limit = options[:limit]
164
205
  @levels = Hash.new(0)
165
206
  @max_depth = options[:max_depth]
166
207
 
167
- if options.fetch(:robots,Spidr.robots?)
168
- unless Object.const_defined?(:Robots)
169
- raise(ArgumentError,":robots option given but unable to require 'robots' gem")
170
- end
208
+ if options[:queue]
209
+ self.queue = options[:queue]
210
+ end
171
211
 
172
- @robots = Robots.new(@user_agent)
212
+ if options[:history]
213
+ self.history = options[:history]
173
214
  end
174
215
 
175
216
  initialize_sanitizers(options)
@@ -177,6 +218,10 @@ module Spidr
177
218
  initialize_actions(options)
178
219
  initialize_events(options)
179
220
 
221
+ if options.fetch(:robots,Spidr.robots?)
222
+ initialize_robots
223
+ end
224
+
180
225
  yield self if block_given?
181
226
  end
182
227
 
@@ -252,6 +297,37 @@ module Spidr
252
297
  agent.start_at(URI::HTTP.build(host: name, path: '/'))
253
298
  end
254
299
 
300
+ #
301
+ # The proxy information the agent uses.
302
+ #
303
+ # @return [Proxy]
304
+ # The proxy information.
305
+ #
306
+ # @see SessionCache#proxy
307
+ #
308
+ # @since 0.2.2
309
+ #
310
+ def proxy
311
+ @sessions.proxy
312
+ end
313
+
314
+ #
315
+ # Sets the proxy information that the agent uses.
316
+ #
317
+ # @param [Proxy] new_proxy
318
+ # The new proxy information.
319
+ #
320
+ # @return [Hash]
321
+ # The new proxy information.
322
+ #
323
+ # @see SessionCache#proxy=
324
+ #
325
+ # @since 0.2.2
326
+ #
327
+ def proxy=(new_proxy)
328
+ @sessions.proxy = new_proxy
329
+ end
330
+
255
331
  #
256
332
  # Clears the history of the agent.
257
333
  #
@@ -292,7 +368,7 @@ module Spidr
292
368
  def run(&block)
293
369
  @running = true
294
370
 
295
- until (@queue.empty? || paused?)
371
+ until (@queue.empty? || paused? || limit_reached?)
296
372
  begin
297
373
  visit_page(dequeue,&block)
298
374
  rescue Actions::Paused
@@ -316,37 +392,6 @@ module Spidr
316
392
  @running == true
317
393
  end
318
394
 
319
- #
320
- # The proxy information the agent uses.
321
- #
322
- # @return [Hash]
323
- # The proxy information.
324
- #
325
- # @see SessionCache#proxy
326
- #
327
- # @since 0.2.2
328
- #
329
- def proxy
330
- @sessions.proxy
331
- end
332
-
333
- #
334
- # Sets the proxy information that the agent uses.
335
- #
336
- # @param [Hash] new_proxy
337
- # The new proxy information.
338
- #
339
- # @return [Hash]
340
- # The new proxy information.
341
- #
342
- # @see SessionCache#proxy=
343
- #
344
- # @since 0.2.2
345
- #
346
- def proxy=(new_proxy)
347
- @sessions.proxy = new_proxy
348
- end
349
-
350
395
  #
351
396
  # Sets the history of URLs that were previously visited.
352
397
  #
@@ -408,19 +453,6 @@ module Spidr
408
453
  return @history.include?(url)
409
454
  end
410
455
 
411
- #
412
- # Determines whether a URL is allowed by the robot policy.
413
- #
414
- # @param [URI::HTTP, String] url
415
- # The URL to check.
416
- #
417
- # @return [Boolean]
418
- # Specifies whether a URL is allowed by the robot policy.
419
- #
420
- def robot_allowed?(url)
421
- @robots ? @robots.allowed?(url) : true
422
- end
423
-
424
456
  #
425
457
  # Sets the list of failed URLs.
426
458
  #
@@ -536,7 +568,7 @@ module Spidr
536
568
  return false
537
569
  rescue Actions::Action
538
570
  end
539
-
571
+
540
572
  @queue << url
541
573
  @levels[url] = level
542
574
  return true
@@ -544,7 +576,7 @@ module Spidr
544
576
 
545
577
  return false
546
578
  end
547
-
579
+
548
580
  #
549
581
  # Requests and creates a new Page object from a given URL.
550
582
  #
@@ -676,6 +708,45 @@ module Spidr
676
708
 
677
709
  protected
678
710
 
711
+ #
712
+ # Prepares request headers for the given URL.
713
+ #
714
+ # @param [URI::HTTP] url
715
+ # The URL to prepare the request headers for.
716
+ #
717
+ # @return [Hash{String => String}]
718
+ # The prepared headers.
719
+ #
720
+ # @since 0.6.0
721
+ #
722
+ def prepare_request_headers(url)
723
+ # set any additional HTTP headers
724
+ headers = @default_headers.dup
725
+
726
+ unless @host_headers.empty?
727
+ @host_headers.each do |name,header|
728
+ if host.match(name)
729
+ headers['Host'] = header
730
+ break
731
+ end
732
+ end
733
+ end
734
+
735
+ headers['Host'] ||= @host_header if @host_header
736
+ headers['User-Agent'] = @user_agent if @user_agent
737
+ headers['Referer'] = @referer if @referer
738
+
739
+ if (authorization = @authorized.for_url(url))
740
+ headers['Authorization'] = "Basic #{authorization}"
741
+ end
742
+
743
+ if (header_cookies = @cookies.for_host(url.host))
744
+ headers['Cookie'] = header_cookies
745
+ end
746
+
747
+ return headers
748
+ end
749
+
679
750
  #
680
751
  # Normalizes the request path and grabs a session to handle page
681
752
  # get and post requests.
@@ -709,29 +780,7 @@ module Spidr
709
780
  # append the URL query to the path
710
781
  path += "?#{url.query}" if url.query
711
782
 
712
- # set any additional HTTP headers
713
- headers = {}
714
-
715
- unless @host_headers.empty?
716
- @host_headers.each do |name,header|
717
- if host.match(name)
718
- headers['Host'] = header
719
- break
720
- end
721
- end
722
- end
723
-
724
- headers['Host'] ||= @host_header if @host_header
725
- headers['User-Agent'] = @user_agent if @user_agent
726
- headers['Referer'] = @referer if @referer
727
-
728
- if (authorization = @authorized.for_url(url))
729
- headers['Authorization'] = "Basic #{authorization}"
730
- end
731
-
732
- if (header_cookies = @cookies.for_host(url.host))
733
- headers['Cookie'] = header_cookies
734
- end
783
+ headers = prepare_request_headers(url)
735
784
 
736
785
  begin
737
786
  sleep(@delay) if @delay > 0
@@ -762,6 +811,17 @@ module Spidr
762
811
  @queue.shift
763
812
  end
764
813
 
814
+ #
815
+ # Determines if the maximum limit has been reached.
816
+ #
817
+ # @return [Boolean]
818
+ #
819
+ # @since 0.6.0
820
+ #
821
+ def limit_reached?
822
+ @limit && @history.length >= @limit
823
+ end
824
+
765
825
  #
766
826
  # Determines if a given URL should be visited.
767
827
  #