spidr 0.5.0 → 0.6.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (47)
  1. checksums.yaml +4 -4
  2. data/.travis.yml +14 -0
  3. data/ChangeLog.md +20 -2
  4. data/Gemfile +2 -2
  5. data/README.md +4 -2
  6. data/Rakefile +1 -0
  7. data/gemspec.yml +1 -1
  8. data/lib/spidr/agent.rb +145 -85
  9. data/lib/spidr/agent/filters.rb +1 -9
  10. data/lib/spidr/agent/robots.rb +36 -0
  11. data/lib/spidr/page.rb +76 -28
  12. data/lib/spidr/page/{headers.rb → content_types.rb} +2 -147
  13. data/lib/spidr/page/cookies.rb +60 -0
  14. data/lib/spidr/page/{links.rb → html.rb} +47 -23
  15. data/lib/spidr/page/status_codes.rb +112 -0
  16. data/lib/spidr/proxy.rb +56 -0
  17. data/lib/spidr/session_cache.rb +60 -24
  18. data/lib/spidr/settings.rb +3 -0
  19. data/lib/spidr/settings/proxy.rb +61 -0
  20. data/lib/spidr/settings/timeouts.rb +33 -0
  21. data/lib/spidr/settings/user_agent.rb +14 -0
  22. data/lib/spidr/spidr.rb +15 -79
  23. data/lib/spidr/version.rb +1 -1
  24. data/spec/agent/actions_spec.rb +158 -32
  25. data/spec/agent/filters_spec.rb +46 -29
  26. data/spec/agent/sanitizers_spec.rb +25 -31
  27. data/spec/agent_spec.rb +772 -50
  28. data/spec/example_app.rb +27 -0
  29. data/spec/example_page.rb +33 -0
  30. data/spec/page/content_types_spec.rb +150 -0
  31. data/spec/page/cookies_spec.rb +58 -0
  32. data/spec/page/html_spec.rb +524 -0
  33. data/spec/page/status_codes_spec.rb +87 -0
  34. data/spec/page_spec.rb +114 -78
  35. data/spec/proxy_spec.rb +45 -0
  36. data/spec/session_cache.rb +103 -2
  37. data/spec/settings/proxy_examples.rb +82 -0
  38. data/spec/settings/timeouts_examples.rb +93 -0
  39. data/spec/settings/user_agent_examples.rb +25 -0
  40. data/spec/spidr_spec.rb +6 -29
  41. data/spidr.gemspec +38 -109
  42. metadata +35 -31
  43. data/lib/spidr/page/body.rb +0 -98
  44. data/spec/helpers/history.rb +0 -34
  45. data/spec/helpers/page.rb +0 -8
  46. data/spec/helpers/wsoc.rb +0 -83
  47. data/spec/page_examples.rb +0 -21
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 31e83cba8fd67a2527641b404f82773d60b5fb97
4
- data.tar.gz: cbd735b652d209cd49a6990eedf3de6f7a22e385
3
+ metadata.gz: 114364609c8da8613e22e9f18777cd9c79e1ac8a
4
+ data.tar.gz: edb694a4a695217a2adf2cd44ffeafb679e8292c
5
5
  SHA512:
6
- metadata.gz: d33742df9e9a4ec8090d4934de3562036e149195b3567ac1143c4637012876d86a18618e9f89251506ed8aa1d9c85cc18ed324774d4da29038e975827698f265
7
- data.tar.gz: 24b08172be0184f7c68fbc63b31eaac55b0c55d70b35b8983fbbb1a3ce871e157b0bbf7d598625ef37ec3fe420c7372bc5fdaf7dd4b7131eac6e6e23e465e475
6
+ metadata.gz: 50064bb0227d7dc0b3ff4bf55ec72b74635c89d6a7bfe24c6948c73ff439b74dbe6f2c72276586389340ed050a30b4814faa0d2584d490badc337c97393bf4f3
7
+ data.tar.gz: 46326b368267b66d647ea09712ec499dc5dbed82fb28b65d5573832d661328a768e140fcec72a75e68516dae6401ded9558978b13e7c6158249fd14737d1c93f
@@ -0,0 +1,14 @@
1
+ ---
2
+ language: ruby
3
+ rvm:
4
+ - 2.0.0
5
+ - 2.1.9
6
+ - 2.2.4
7
+ - 2.3.1
8
+ - jruby
9
+ - rbx
10
+ matrix:
11
+ allow_failures:
12
+ - rvm: jruby
13
+ - rvm: rbx
14
+ script: rake spec
@@ -1,3 +1,21 @@
1
+ ### 0.6.0 / 2016-08-04
2
+
3
+ * Added {Spidr::Proxy}.
4
+ * Added more options to {Spidr::Agent#initialize}:
5
+ * `:default_headers`: specifies the default headers to set in all requests
6
+ (@maccman).
7
+ * `:limit`: specify the maximum number of links to visit.
8
+ * `:open_timeout`, `:read_timeout`, `:ssl_timeout`, `:continue_timeout`,
9
+ and `:keep_alive_timeout`: sets `Net::HTTP` timeouts.
10
+ * Allow {Spidr::Settings::Proxy#proxy= Spidr.proxy=} to accept `nil`.
11
+ * Use `Net::HTTPResponse#get_fields` in {Spidr::Page} to correctly return
12
+ multiple values for repeated headers.
13
+ * Fixed a bug in {Spidr::Page#method_missing} where method names were not being
14
+ correctly converted to header names.
15
+ * Fixed a bug in {Spidr::Page#cookie_params} where `Set-Cookie` flags were not
16
+ being filtered out.
17
+ * Rewrote the specs to use webmock and increased spec coverage.
18
+
1
19
  ### 0.5.0 / 2016-01-03
2
20
 
3
21
  * Added support for respecting `robots.txt` files.
@@ -166,8 +184,8 @@
166
184
  * Added a HTTP session cache to {Spidr::Agent}, per suggestion of falter.
167
185
  * Added `Spidr::Agent#get_session`.
168
186
  * Added `Spidr::Agent#kill_session`.
169
- * Added {Spidr.proxy=}.
170
- * Added {Spidr.disable_proxy!}.
187
+ * Added {Spidr::Settings::Proxy#proxy= Spidr.proxy=}.
188
+ * Added {Spidr::Settings::Proxy#disable_proxy! Spidr.disable_proxy!}.
171
189
  * Aliased `Spidr::Page#txt?` to `Spidr::Page#plain_text?`.
172
190
  * Aliased `Spidr::Page#ok?` to `Spidr::Page#is_ok?`.
173
191
  * Aliased `Spidr::Page#redirect?` to `Spidr::Page#is_redirect?`.
data/Gemfile CHANGED
@@ -6,15 +6,15 @@ end
6
6
 
7
7
  gemspec
8
8
 
9
-
10
9
  gem 'robots', group: :robots
11
10
 
12
11
  group :development do
13
12
  gem 'rake'
14
13
  gem 'rubygems-tasks', '~> 0.2'
15
14
 
16
- gem 'wsoc', '~> 0.1.3'
17
15
  gem 'rspec', '~> 3.0'
16
+ gem 'webmock', '~> 2.0'
17
+ gem 'sinatra', '~> 1.0'
18
18
 
19
19
  gem 'kramdown', '~> 0.12'
20
20
  gem 'yard', '~> 0.8'
data/README.md CHANGED
@@ -5,6 +5,7 @@
5
5
  * [Issues](https://github.com/postmodern/spidr/issues)
6
6
  * [Mailing List](http://groups.google.com/group/spidr)
7
7
  * [IRC](http://webchat.freenode.net/?channels=spidr&uio=d4)
8
+ * [![Build Status](https://travis-ci.org/postmodern/spidr.svg)](https://travis-ci.org/postmodern/spidr)
8
9
 
9
10
  ## Description
10
11
 
@@ -28,7 +29,8 @@ and easy to use.
28
29
  * Port number
29
30
  * Full link
30
31
  * URL extension
31
- * Provides call-backs for:
32
+ * Optional `/robots.txt` support.
33
+ * Provides callbacks for:
32
34
  * Every visited Page.
33
35
  * Every visited URL.
34
36
  * Every visited URL that matches a specified pattern.
@@ -181,7 +183,7 @@ Skip the processing of links:
181
183
 
182
184
  ## Requirements
183
185
 
184
- * [ruby] >= 1.9.1
186
+ * [ruby] >= 2.0.0
185
187
  * [nokogiri] ~> 1.3
186
188
 
187
189
  ## Install
data/Rakefile CHANGED
@@ -16,3 +16,4 @@ task :default => :spec
16
16
 
17
17
  require 'yard'
18
18
  YARD::Rake::YardocTask.new
19
+ task :doc => :yard
@@ -11,7 +11,7 @@ email: postmodern.mod3@gmail.com
11
11
  homepage: https://github.com/postmodern/spidr#readme
12
12
  has_yard: true
13
13
 
14
- required_ruby_version: ">= 1.9.1"
14
+ required_ruby_version: ">= 2.0.0"
15
15
 
16
16
  dependencies:
17
17
  nokogiri: ~> 1.3
@@ -1,7 +1,9 @@
1
+ require 'spidr/settings/user_agent'
1
2
  require 'spidr/agent/sanitizers'
2
3
  require 'spidr/agent/filters'
3
4
  require 'spidr/agent/events'
4
5
  require 'spidr/agent/actions'
6
+ require 'spidr/agent/robots'
5
7
  require 'spidr/page'
6
8
  require 'spidr/session_cache'
7
9
  require 'spidr/cookie_jar'
@@ -12,14 +14,11 @@ require 'openssl'
12
14
  require 'net/http'
13
15
  require 'set'
14
16
 
15
- begin
16
- require 'robots'
17
- rescue LoadError
18
- end
19
-
20
17
  module Spidr
21
18
  class Agent
22
19
 
20
+ include Settings::UserAgent
21
+
23
22
  # HTTP Host Header to use
24
23
  #
25
24
  # @return [String]
@@ -30,10 +29,12 @@ module Spidr
30
29
  # @return [Hash{String,Regexp => String}]
31
30
  attr_reader :host_headers
32
31
 
33
- # User-Agent to use
32
+ # HTTP Headers to use for every request
34
33
  #
35
- # @return [String]
36
- attr_accessor :user_agent
34
+ # @return [Hash{String => String}]
35
+ #
36
+ # @since 0.6.0
37
+ attr_reader :default_headers
37
38
 
38
39
  # HTTP Authentication credentials
39
40
  #
@@ -65,11 +66,23 @@ module Spidr
65
66
  # @return [Array<URI::HTTP>]
66
67
  attr_reader :queue
67
68
 
69
+ # The session cache
70
+ #
71
+ # @return [SessionCache]
72
+ #
73
+ # @since 0.6.0
74
+ attr_reader :sessions
75
+
68
76
  # Cached cookies
69
77
  #
70
78
  # @return [CookieJar]
71
79
  attr_reader :cookies
72
-
80
+
81
+ # Maximum number of pages to visit.
82
+ #
83
+ # @return [Integer]
84
+ attr_reader :limit
85
+
73
86
  # Maximum depth
74
87
  #
75
88
  # @return [Integer]
@@ -86,6 +99,21 @@ module Spidr
86
99
  # @param [Hash] options
87
100
  # Additional options
88
101
  #
102
+ # @option options [Integer] :open_timeout (Spidr.open_timeout)
103
+ # Optional open timeout.
104
+ #
105
+ # @option options [Integer] :read_timeout (Spidr.read_timeout)
106
+ # Optional read timeout.
107
+ #
108
+ # @option options [Integer] :ssl_timeout (Spidr.ssl_timeout)
109
+ # Optional ssl timeout.
110
+ #
111
+ # @option options [Integer] :continue_timeout (Spidr.continue_timeout)
112
+ # Optional continue timeout.
113
+ #
114
+ # @option options [Integer] :keep_alive_timeout (Spidr.keep_alive_timeout)
115
+ # Optional keep_alive timeout.
116
+ #
89
117
  # @option options [Hash] :proxy (Spidr.proxy)
90
118
  # The proxy information to use.
91
119
  #
@@ -101,6 +129,9 @@ module Spidr
101
129
  # @option :proxy [String] :password
102
130
  # The password to authenticate with.
103
131
  #
132
+ # @option options [Hash{String => String}] :default_headers
133
+ # Default headers to set for every request.
134
+ #
104
135
  # @option options [String] :host_header
105
136
  # The HTTP Host header to use with each request.
106
137
  #
@@ -122,6 +153,9 @@ module Spidr
122
153
  # @option options [Set, Array] :history
123
154
  # The initial list of visited URLs.
124
155
  #
156
+ # @option options [Integer] :limit
157
+ # The maximum number of pages to visit.
158
+ #
125
159
  # @option options [Integer] :max_depth
126
160
  # The maximum link depth to follow.
127
161
  #
@@ -148,10 +182,16 @@ module Spidr
148
182
  @host_headers.merge!(options[:host_headers])
149
183
  end
150
184
 
185
+ @default_headers = {}
186
+
187
+ if options[:default_headers]
188
+ @default_headers.merge!(options[:default_headers])
189
+ end
190
+
151
191
  @user_agent = options.fetch(:user_agent,Spidr.user_agent)
152
192
  @referer = options[:referer]
153
193
 
154
- @sessions = SessionCache.new(options.fetch(:proxy,Spidr.proxy))
194
+ @sessions = SessionCache.new(options)
155
195
  @cookies = CookieJar.new
156
196
  @authorized = AuthStore.new
157
197
 
@@ -161,15 +201,16 @@ module Spidr
161
201
  @failures = Set[]
162
202
  @queue = []
163
203
 
204
+ @limit = options[:limit]
164
205
  @levels = Hash.new(0)
165
206
  @max_depth = options[:max_depth]
166
207
 
167
- if options.fetch(:robots,Spidr.robots?)
168
- unless Object.const_defined?(:Robots)
169
- raise(ArgumentError,":robots option given but unable to require 'robots' gem")
170
- end
208
+ if options[:queue]
209
+ self.queue = options[:queue]
210
+ end
171
211
 
172
- @robots = Robots.new(@user_agent)
212
+ if options[:history]
213
+ self.history = options[:history]
173
214
  end
174
215
 
175
216
  initialize_sanitizers(options)
@@ -177,6 +218,10 @@ module Spidr
177
218
  initialize_actions(options)
178
219
  initialize_events(options)
179
220
 
221
+ if options.fetch(:robots,Spidr.robots?)
222
+ initialize_robots
223
+ end
224
+
180
225
  yield self if block_given?
181
226
  end
182
227
 
@@ -252,6 +297,37 @@ module Spidr
252
297
  agent.start_at(URI::HTTP.build(host: name, path: '/'))
253
298
  end
254
299
 
300
+ #
301
+ # The proxy information the agent uses.
302
+ #
303
+ # @return [Proxy]
304
+ # The proxy information.
305
+ #
306
+ # @see SessionCache#proxy
307
+ #
308
+ # @since 0.2.2
309
+ #
310
+ def proxy
311
+ @sessions.proxy
312
+ end
313
+
314
+ #
315
+ # Sets the proxy information that the agent uses.
316
+ #
317
+ # @param [Proxy] new_proxy
318
+ # The new proxy information.
319
+ #
320
+ # @return [Proxy]
321
+ # The new proxy information.
322
+ #
323
+ # @see SessionCache#proxy=
324
+ #
325
+ # @since 0.2.2
326
+ #
327
+ def proxy=(new_proxy)
328
+ @sessions.proxy = new_proxy
329
+ end
330
+
255
331
  #
256
332
  # Clears the history of the agent.
257
333
  #
@@ -292,7 +368,7 @@ module Spidr
292
368
  def run(&block)
293
369
  @running = true
294
370
 
295
- until (@queue.empty? || paused?)
371
+ until (@queue.empty? || paused? || limit_reached?)
296
372
  begin
297
373
  visit_page(dequeue,&block)
298
374
  rescue Actions::Paused
@@ -316,37 +392,6 @@ module Spidr
316
392
  @running == true
317
393
  end
318
394
 
319
- #
320
- # The proxy information the agent uses.
321
- #
322
- # @return [Hash]
323
- # The proxy information.
324
- #
325
- # @see SessionCache#proxy
326
- #
327
- # @since 0.2.2
328
- #
329
- def proxy
330
- @sessions.proxy
331
- end
332
-
333
- #
334
- # Sets the proxy information that the agent uses.
335
- #
336
- # @param [Hash] new_proxy
337
- # The new proxy information.
338
- #
339
- # @return [Hash]
340
- # The new proxy information.
341
- #
342
- # @see SessionCache#proxy=
343
- #
344
- # @since 0.2.2
345
- #
346
- def proxy=(new_proxy)
347
- @sessions.proxy = new_proxy
348
- end
349
-
350
395
  #
351
396
  # Sets the history of URLs that were previously visited.
352
397
  #
@@ -408,19 +453,6 @@ module Spidr
408
453
  return @history.include?(url)
409
454
  end
410
455
 
411
- #
412
- # Determines whether a URL is allowed by the robot policy.
413
- #
414
- # @param [URI::HTTP, String] url
415
- # The URL to check.
416
- #
417
- # @return [Boolean]
418
- # Specifies whether a URL is allowed by the robot policy.
419
- #
420
- def robot_allowed?(url)
421
- @robots ? @robots.allowed?(url) : true
422
- end
423
-
424
456
  #
425
457
  # Sets the list of failed URLs.
426
458
  #
@@ -536,7 +568,7 @@ module Spidr
536
568
  return false
537
569
  rescue Actions::Action
538
570
  end
539
-
571
+
540
572
  @queue << url
541
573
  @levels[url] = level
542
574
  return true
@@ -544,7 +576,7 @@ module Spidr
544
576
 
545
577
  return false
546
578
  end
547
-
579
+
548
580
  #
549
581
  # Requests and creates a new Page object from a given URL.
550
582
  #
@@ -676,6 +708,45 @@ module Spidr
676
708
 
677
709
  protected
678
710
 
711
+ #
712
+ # Prepares request headers for the given URL.
713
+ #
714
+ # @param [URI::HTTP] url
715
+ # The URL to prepare the request headers for.
716
+ #
717
+ # @return [Hash{String => String}]
718
+ # The prepared headers.
719
+ #
720
+ # @since 0.6.0
721
+ #
722
+ def prepare_request_headers(url)
723
+ # set any additional HTTP headers
724
+ headers = @default_headers.dup
725
+
726
+ unless @host_headers.empty?
727
+ @host_headers.each do |name,header|
728
+ if host.match(name)
729
+ headers['Host'] = header
730
+ break
731
+ end
732
+ end
733
+ end
734
+
735
+ headers['Host'] ||= @host_header if @host_header
736
+ headers['User-Agent'] = @user_agent if @user_agent
737
+ headers['Referer'] = @referer if @referer
738
+
739
+ if (authorization = @authorized.for_url(url))
740
+ headers['Authorization'] = "Basic #{authorization}"
741
+ end
742
+
743
+ if (header_cookies = @cookies.for_host(url.host))
744
+ headers['Cookie'] = header_cookies
745
+ end
746
+
747
+ return headers
748
+ end
749
+
679
750
  #
680
751
  # Normalizes the request path and grabs a session to handle page
681
752
  # get and post requests.
@@ -709,29 +780,7 @@ module Spidr
709
780
  # append the URL query to the path
710
781
  path += "?#{url.query}" if url.query
711
782
 
712
- # set any additional HTTP headers
713
- headers = {}
714
-
715
- unless @host_headers.empty?
716
- @host_headers.each do |name,header|
717
- if host.match(name)
718
- headers['Host'] = header
719
- break
720
- end
721
- end
722
- end
723
-
724
- headers['Host'] ||= @host_header if @host_header
725
- headers['User-Agent'] = @user_agent if @user_agent
726
- headers['Referer'] = @referer if @referer
727
-
728
- if (authorization = @authorized.for_url(url))
729
- headers['Authorization'] = "Basic #{authorization}"
730
- end
731
-
732
- if (header_cookies = @cookies.for_host(url.host))
733
- headers['Cookie'] = header_cookies
734
- end
783
+ headers = prepare_request_headers(url)
735
784
 
736
785
  begin
737
786
  sleep(@delay) if @delay > 0
@@ -762,6 +811,17 @@ module Spidr
762
811
  @queue.shift
763
812
  end
764
813
 
814
+ #
815
+ # Determines if the maximum limit has been reached.
816
+ #
817
+ # @return [Boolean]
818
+ #
819
+ # @since 0.6.0
820
+ #
821
+ def limit_reached?
822
+ @limit && @history.length >= @limit
823
+ end
824
+
765
825
  #
766
826
  # Determines if a given URL should be visited.
767
827
  #