spidr 0.5.0 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. checksums.yaml +4 -4
  2. data/.travis.yml +14 -0
  3. data/ChangeLog.md +20 -2
  4. data/Gemfile +2 -2
  5. data/README.md +4 -2
  6. data/Rakefile +1 -0
  7. data/gemspec.yml +1 -1
  8. data/lib/spidr/agent.rb +145 -85
  9. data/lib/spidr/agent/filters.rb +1 -9
  10. data/lib/spidr/agent/robots.rb +36 -0
  11. data/lib/spidr/page.rb +76 -28
  12. data/lib/spidr/page/{headers.rb → content_types.rb} +2 -147
  13. data/lib/spidr/page/cookies.rb +60 -0
  14. data/lib/spidr/page/{links.rb → html.rb} +47 -23
  15. data/lib/spidr/page/status_codes.rb +112 -0
  16. data/lib/spidr/proxy.rb +56 -0
  17. data/lib/spidr/session_cache.rb +60 -24
  18. data/lib/spidr/settings.rb +3 -0
  19. data/lib/spidr/settings/proxy.rb +61 -0
  20. data/lib/spidr/settings/timeouts.rb +33 -0
  21. data/lib/spidr/settings/user_agent.rb +14 -0
  22. data/lib/spidr/spidr.rb +15 -79
  23. data/lib/spidr/version.rb +1 -1
  24. data/spec/agent/actions_spec.rb +158 -32
  25. data/spec/agent/filters_spec.rb +46 -29
  26. data/spec/agent/sanitizers_spec.rb +25 -31
  27. data/spec/agent_spec.rb +772 -50
  28. data/spec/example_app.rb +27 -0
  29. data/spec/example_page.rb +33 -0
  30. data/spec/page/content_types_spec.rb +150 -0
  31. data/spec/page/cookies_spec.rb +58 -0
  32. data/spec/page/html_spec.rb +524 -0
  33. data/spec/page/status_codes_spec.rb +87 -0
  34. data/spec/page_spec.rb +114 -78
  35. data/spec/proxy_spec.rb +45 -0
  36. data/spec/session_cache.rb +103 -2
  37. data/spec/settings/proxy_examples.rb +82 -0
  38. data/spec/settings/timeouts_examples.rb +93 -0
  39. data/spec/settings/user_agent_examples.rb +25 -0
  40. data/spec/spidr_spec.rb +6 -29
  41. data/spidr.gemspec +38 -109
  42. metadata +35 -31
  43. data/lib/spidr/page/body.rb +0 -98
  44. data/spec/helpers/history.rb +0 -34
  45. data/spec/helpers/page.rb +0 -8
  46. data/spec/helpers/wsoc.rb +0 -83
  47. data/spec/page_examples.rb +0 -21
@@ -400,7 +400,7 @@ module Spidr
400
400
  @schemes = []
401
401
 
402
402
  if options[:schemes]
403
- @schemes += options[:schemes]
403
+ self.schemes = options[:schemes]
404
404
  else
405
405
  @schemes << 'http'
406
406
 
@@ -439,14 +439,6 @@ module Spidr
439
439
  if options[:host]
440
440
  visit_hosts_like(options[:host])
441
441
  end
442
-
443
- if options[:queue]
444
- self.queue = options[:queue]
445
- end
446
-
447
- if options[:history]
448
- self.history = options[:history]
449
- end
450
442
  end
451
443
 
452
444
  #
@@ -0,0 +1,36 @@
1
+ begin
2
+ require 'robots'
3
+ rescue LoadError
4
+ end
5
+
6
+ module Spidr
7
+ class Agent
8
+ #
9
+ # Initializes the robots filter.
10
+ #
11
+ def initialize_robots
12
+ unless Object.const_defined?(:Robots)
13
+ raise(ArgumentError,":robots option given but unable to require 'robots' gem")
14
+ end
15
+
16
+ @robots = Robots.new(@user_agent)
17
+ end
18
+
19
+ #
20
+ # Determines whether a URL is allowed by the robot policy.
21
+ #
22
+ # @param [URI::HTTP, String] url
23
+ # The URL to check.
24
+ #
25
+ # @return [Boolean]
26
+ # Specifies whether a URL is allowed by the robot policy.
27
+ #
28
+ def robot_allowed?(url)
29
+ if @robots
30
+ @robots.allowed?(url)
31
+ else
32
+ true
33
+ end
34
+ end
35
+ end
36
+ end
@@ -1,7 +1,3 @@
1
- require 'spidr/page/headers'
2
- require 'spidr/page/body'
3
- require 'spidr/page/links'
4
-
5
1
  module Spidr
6
2
  #
7
3
  # Represents a requested page from a website.
@@ -34,42 +30,89 @@ module Spidr
34
30
  end
35
31
 
36
32
  #
37
- # The meta-redirect links of the page.
33
+ # The body of the response.
34
+ #
35
+ # @return [String]
36
+ # The body of the response.
38
37
  #
39
- # @return [Array<String>]
40
- # All meta-redirect links in the page.
38
+ def body
39
+ (response.body || '')
40
+ end
41
+
42
+ alias to_s body
43
+
41
44
  #
42
- # @deprecated
43
- # Deprecated in 0.3.0 and will be removed in 0.4.0.
44
- # Use {#meta_redirects} instead.
45
+ # Returns a parsed document object for HTML, XML, RSS and Atom pages.
45
46
  #
46
- def meta_redirect
47
- STDERR.puts 'DEPRECATION: Spidr::Page#meta_redirect will be removed in 0.3.0'
48
- STDERR.puts 'DEPRECATION: Use Spidr::Page#meta_redirects instead'
47
+ # @return [Nokogiri::HTML::Document, Nokogiri::XML::Document, nil]
48
+ # The document that represents HTML or XML pages.
49
+ # Returns `nil` if the page is neither HTML, XML, RSS, Atom or if
50
+ # the page could not be parsed properly.
51
+ #
52
+ # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Document.html
53
+ # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/HTML/Document.html
54
+ #
55
+ def doc
56
+ unless body.empty?
57
+ doc_class = if html?
58
+ Nokogiri::HTML::Document
59
+ elsif rss? || atom? || xml? || xsl?
60
+ Nokogiri::XML::Document
61
+ end
49
62
 
50
- meta_redirects
63
+ if doc_class
64
+ begin
65
+ @doc ||= doc_class.parse(body, @url.to_s, content_charset)
66
+ rescue
67
+ end
68
+ end
69
+ end
51
70
  end
52
71
 
53
72
  #
54
- # Determines if the response code is `300`, `301`, `302`, `303`
55
- # or `307`. Also checks for "soft" redirects added at the page
56
- # level by a meta refresh tag.
73
+ # Searches the document for XPath or CSS Path paths.
74
+ #
75
+ # @param [Array<String>] paths
76
+ # CSS or XPath expressions to search the document with.
57
77
  #
58
- # @return [Boolean]
59
- # Specifies whether the response code is a HTTP Redirect code.
78
+ # @return [Array]
79
+ # The matched nodes from the document.
80
+ # Returns an empty Array if no nodes were matched, or if the page
81
+ # is not an HTML or XML document.
60
82
  #
61
- def is_redirect?
62
- case code
63
- when 300..303, 307
64
- true
65
- when 200
66
- meta_redirect?
83
+ # @example
84
+ # page.search('//a[@href]')
85
+ #
86
+ # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Node.html#M000239
87
+ #
88
+ def search(*paths)
89
+ if doc
90
+ doc.search(*paths)
67
91
  else
68
- false
92
+ []
69
93
  end
70
94
  end
71
95
 
72
- alias redirect? is_redirect?
96
+ #
97
+ # Searches for the first occurrence an XPath or CSS Path expression.
98
+ #
99
+ # @return [Nokogiri::HTML::Node, Nokogiri::XML::Node, nil]
100
+ # The first matched node. Returns `nil` if no nodes could be matched,
101
+ # or if the page is not a HTML or XML document.
102
+ #
103
+ # @example
104
+ # page.at('//title')
105
+ #
106
+ # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Node.html#M000251
107
+ #
108
+ def at(*arguments)
109
+ if doc
110
+ doc.at(*arguments)
111
+ end
112
+ end
113
+
114
+ alias / search
115
+ alias % at
73
116
 
74
117
  protected
75
118
 
@@ -90,7 +133,7 @@ module Spidr
90
133
  #
91
134
  def method_missing(name,*arguments,&block)
92
135
  if (arguments.empty? && block.nil?)
93
- header_name = name.to_s.sub('_','-')
136
+ header_name = name.to_s.tr('_','-')
94
137
 
95
138
  if @response.key?(header_name)
96
139
  return @response[header_name]
@@ -102,3 +145,8 @@ module Spidr
102
145
 
103
146
  end
104
147
  end
148
+
149
+ require 'spidr/page/status_codes'
150
+ require 'spidr/page/content_types'
151
+ require 'spidr/page/cookies'
152
+ require 'spidr/page/html'
@@ -1,98 +1,5 @@
1
- require 'set'
2
-
3
1
  module Spidr
4
2
  class Page
5
- # Reserved names used within Cookie strings
6
- RESERVED_COOKIE_NAMES = Set['path', 'expires', 'domain']
7
-
8
- #
9
- # The response code from the page.
10
- #
11
- # @return [Integer]
12
- # Response code from the page.
13
- #
14
- def code
15
- response.code.to_i
16
- end
17
-
18
- #
19
- # Determines if the response code is `200`.
20
- #
21
- # @return [Boolean]
22
- # Specifies whether the response code is `200`.
23
- #
24
- def is_ok?
25
- code == 200
26
- end
27
-
28
- alias ok? is_ok?
29
-
30
- #
31
- # Determines if the response code is `308`.
32
- #
33
- # @return [Boolean]
34
- # Specifies whether the response code is `308`.
35
- #
36
- def timedout?
37
- code == 308
38
- end
39
-
40
- #
41
- # Determines if the response code is `400`.
42
- #
43
- # @return [Boolean]
44
- # Specifies whether the response code is `400`.
45
- #
46
- def bad_request?
47
- code == 400
48
- end
49
-
50
- #
51
- # Determines if the response code is `401`.
52
- #
53
- # @return [Boolean]
54
- # Specifies whether the response code is `401`.
55
- #
56
- def is_unauthorized?
57
- code == 401
58
- end
59
-
60
- alias unauthorized? is_unauthorized?
61
-
62
- #
63
- # Determines if the response code is `403`.
64
- #
65
- # @return [Boolean]
66
- # Specifies whether the response code is `403`.
67
- #
68
- def is_forbidden?
69
- code == 403
70
- end
71
-
72
- alias forbidden? is_forbidden?
73
-
74
- #
75
- # Determines if the response code is `404`.
76
- #
77
- # @return [Boolean]
78
- # Specifies whether the response code is `404`.
79
- #
80
- def is_missing?
81
- code == 404
82
- end
83
-
84
- alias missing? is_missing?
85
-
86
- #
87
- # Determines if the response code is `500`.
88
- #
89
- # @return [Boolean]
90
- # Specifies whether the response code is `500`.
91
- #
92
- def had_internal_server_error?
93
- code == 500
94
- end
95
-
96
3
  #
97
4
  # The Content-Type of the page.
98
5
  #
@@ -100,7 +7,7 @@ module Spidr
100
7
  # The Content-Type of the page.
101
8
  #
102
9
  def content_type
103
- (response['Content-Type'] || '')
10
+ @response['Content-Type'] || ''
104
11
  end
105
12
 
106
13
  #
@@ -112,7 +19,7 @@ module Spidr
112
19
  # @since 0.2.2
113
20
  #
114
21
  def content_types
115
- (headers['content-type'] || [])
22
+ @response.get_fields('content-type') || []
116
23
  end
117
24
 
118
25
  #
@@ -314,57 +221,5 @@ module Spidr
314
221
  def zip?
315
222
  is_content_type?('application/zip')
316
223
  end
317
-
318
- #
319
- # The raw Cookie String sent along with the page.
320
- #
321
- # @return [String]
322
- # The raw Cookie from the response.
323
- #
324
- # @since 0.2.7
325
- #
326
- def cookie
327
- (response['Set-Cookie'] || '')
328
- end
329
-
330
- alias raw_cookie cookie
331
-
332
- #
333
- # The Cookie values sent along with the page.
334
- #
335
- # @return [Array<String>]
336
- # The Cookies from the response.
337
- #
338
- # @since 0.2.2
339
- #
340
- def cookies
341
- (headers['set-cookie'] || [])
342
- end
343
-
344
- #
345
- # The Cookie key -> value pairs returned with the response.
346
- #
347
- # @return [Hash{String => String}]
348
- # The cookie keys and values.
349
- #
350
- # @since 0.2.2
351
- #
352
- def cookie_params
353
- params = {}
354
-
355
- cookies.each do |value|
356
- value.split(';').each do |param|
357
- param.strip!
358
-
359
- name, value = param.split('=',2)
360
-
361
- unless RESERVED_COOKIE_NAMES.include?(name)
362
- params[name] = (value || '')
363
- end
364
- end
365
- end
366
-
367
- return params
368
- end
369
224
  end
370
225
  end
@@ -0,0 +1,60 @@
1
+ require 'set'
2
+
3
+ module Spidr
4
+ class Page
5
+ # Reserved names used within Cookie strings
6
+ RESERVED_COOKIE_NAMES = /^(?:Path|Expires|Domain|Secure|HTTPOnly)$/i
7
+
8
+ #
9
+ # The raw Cookie String sent along with the page.
10
+ #
11
+ # @return [String]
12
+ # The raw Cookie from the response.
13
+ #
14
+ # @since 0.2.7
15
+ #
16
+ def cookie
17
+ @response['Set-Cookie'] || ''
18
+ end
19
+
20
+ alias raw_cookie cookie
21
+
22
+ #
23
+ # The Cookie values sent along with the page.
24
+ #
25
+ # @return [Array<String>]
26
+ # The Cookies from the response.
27
+ #
28
+ # @since 0.2.2
29
+ #
30
+ def cookies
31
+ (@response.get_fields('Set-Cookie') || [])
32
+ end
33
+
34
+ #
35
+ # The Cookie key -> value pairs returned with the response.
36
+ #
37
+ # @return [Hash{String => String}]
38
+ # The cookie keys and values.
39
+ #
40
+ # @since 0.2.2
41
+ #
42
+ def cookie_params
43
+ params = {}
44
+
45
+ cookies.each do |value|
46
+ value.split(';').each do |param|
47
+ param.strip!
48
+
49
+ name, value = param.split('=',2)
50
+
51
+ unless name =~ RESERVED_COOKIE_NAMES
52
+ params[name] = (value || '')
53
+ end
54
+ end
55
+ end
56
+
57
+ return params
58
+ end
59
+ end
60
+ end
@@ -1,10 +1,22 @@
1
+ require 'nokogiri'
1
2
  require 'spidr/extensions/uri'
2
- require 'uri'
3
3
 
4
4
  module Spidr
5
5
  class Page
6
6
  include Enumerable
7
7
 
8
+ #
9
+ # The title of the HTML page.
10
+ #
11
+ # @return [String]
12
+ # The inner-text of the title element of the page.
13
+ #
14
+ def title
15
+ if (node = at('//title'))
16
+ node.inner_text
17
+ end
18
+ end
19
+
8
20
  #
9
21
  # Enumerates over the meta-redirect links in the page.
10
22
  #
@@ -21,7 +33,7 @@ module Spidr
21
33
  # @since 0.3.0
22
34
  #
23
35
  def each_meta_redirect
24
- return enum_for(:each_meta_redirect) unless block_given?
36
+ return enum_for(__method__) unless block_given?
25
37
 
26
38
  if (html? && doc)
27
39
  search('//meta[@http-equiv and @content]').each do |node|
@@ -44,7 +56,7 @@ module Spidr
44
56
  # Specifies whether the page includes page-level redirects.
45
57
  #
46
58
  def meta_redirect?
47
- !(each_meta_redirect.first.nil?)
59
+ !each_meta_redirect.first.nil?
48
60
  end
49
61
 
50
62
  #
@@ -59,6 +71,23 @@ module Spidr
59
71
  each_meta_redirect.to_a
60
72
  end
61
73
 
74
+ #
75
+ # The meta-redirect links of the page.
76
+ #
77
+ # @return [Array<String>]
78
+ # All meta-redirect links in the page.
79
+ #
80
+ # @deprecated
81
+ # Deprecated in 0.3.0 and will be removed in 0.4.0.
82
+ # Use {#meta_redirects} instead.
83
+ #
84
+ def meta_redirect
85
+ warn 'DEPRECATION: Spidr::Page#meta_redirect will be removed in 0.3.0'
86
+ warn 'DEPRECATION: Use Spidr::Page#meta_redirects instead'
87
+
88
+ meta_redirects
89
+ end
90
+
62
91
  #
63
92
  # Enumerates over every HTTP or meta-redirect link in the page.
64
93
  #
@@ -74,18 +103,14 @@ module Spidr
74
103
  # @since 0.3.0
75
104
  #
76
105
  def each_redirect(&block)
77
- return enum_for(:each_redirect) unless block
106
+ return enum_for(__method__) unless block
78
107
 
79
- location = headers['location']
80
-
81
- if location.nil?
108
+ if (locations = @response.get_fields('Location'))
109
+ # Location headers override any meta-refresh redirects in the HTML
110
+ locations.each(&block)
111
+ else
82
112
  # check page-level meta redirects if there isn't a location header
83
113
  each_meta_redirect(&block)
84
- elsif location.kind_of?(Array)
85
- location.each(&block)
86
- else
87
- # usually the location header contains a single String
88
- yield location
89
114
  end
90
115
  end
91
116
 
@@ -115,7 +140,7 @@ module Spidr
115
140
  # @since 0.5.0
116
141
  #
117
142
  def each_mailto
118
- return enum_for(:each_mailto) unless block_given?
143
+ return enum_for(__method__) unless block_given?
119
144
 
120
145
  if (html? && doc)
121
146
  doc.search('//a[starts-with(@href,"mailto:")]').each do |a|
@@ -151,7 +176,7 @@ module Spidr
151
176
  # @since 0.3.0
152
177
  #
153
178
  def each_link
154
- return enum_for(:each_link) unless block_given?
179
+ return enum_for(__method__) unless block_given?
155
180
 
156
181
  filter = lambda { |url|
157
182
  yield url unless (url.nil? || url.empty?)
@@ -208,7 +233,7 @@ module Spidr
208
233
  # @since 0.3.0
209
234
  #
210
235
  def each_url
211
- return enum_for(:each_url) unless block_given?
236
+ return enum_for(__method__) unless block_given?
212
237
 
213
238
  each_link do |link|
214
239
  if (url = to_absolute(link))
@@ -239,15 +264,14 @@ module Spidr
239
264
  # The normalized URI.
240
265
  #
241
266
  def to_absolute(link)
242
- begin
243
- new_url = url.merge(link.to_s)
244
- rescue Exception
245
- return nil
246
- end
247
-
248
- if new_url.path
249
- path = new_url.path
267
+ link = link.to_s
268
+ new_url = begin
269
+ url.merge(link)
270
+ rescue Exception
271
+ return
272
+ end
250
273
 
274
+ if (path = new_url.path)
251
275
  # ensure that paths begin with a leading '/' for URI::FTP
252
276
  if (new_url.scheme == 'ftp' && !path.start_with?('/'))
253
277
  path.insert(0,'/')