spidr 0.5.0 → 0.6.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (47) hide show
  1. checksums.yaml +4 -4
  2. data/.travis.yml +14 -0
  3. data/ChangeLog.md +20 -2
  4. data/Gemfile +2 -2
  5. data/README.md +4 -2
  6. data/Rakefile +1 -0
  7. data/gemspec.yml +1 -1
  8. data/lib/spidr/agent.rb +145 -85
  9. data/lib/spidr/agent/filters.rb +1 -9
  10. data/lib/spidr/agent/robots.rb +36 -0
  11. data/lib/spidr/page.rb +76 -28
  12. data/lib/spidr/page/{headers.rb → content_types.rb} +2 -147
  13. data/lib/spidr/page/cookies.rb +60 -0
  14. data/lib/spidr/page/{links.rb → html.rb} +47 -23
  15. data/lib/spidr/page/status_codes.rb +112 -0
  16. data/lib/spidr/proxy.rb +56 -0
  17. data/lib/spidr/session_cache.rb +60 -24
  18. data/lib/spidr/settings.rb +3 -0
  19. data/lib/spidr/settings/proxy.rb +61 -0
  20. data/lib/spidr/settings/timeouts.rb +33 -0
  21. data/lib/spidr/settings/user_agent.rb +14 -0
  22. data/lib/spidr/spidr.rb +15 -79
  23. data/lib/spidr/version.rb +1 -1
  24. data/spec/agent/actions_spec.rb +158 -32
  25. data/spec/agent/filters_spec.rb +46 -29
  26. data/spec/agent/sanitizers_spec.rb +25 -31
  27. data/spec/agent_spec.rb +772 -50
  28. data/spec/example_app.rb +27 -0
  29. data/spec/example_page.rb +33 -0
  30. data/spec/page/content_types_spec.rb +150 -0
  31. data/spec/page/cookies_spec.rb +58 -0
  32. data/spec/page/html_spec.rb +524 -0
  33. data/spec/page/status_codes_spec.rb +87 -0
  34. data/spec/page_spec.rb +114 -78
  35. data/spec/proxy_spec.rb +45 -0
  36. data/spec/session_cache.rb +103 -2
  37. data/spec/settings/proxy_examples.rb +82 -0
  38. data/spec/settings/timeouts_examples.rb +93 -0
  39. data/spec/settings/user_agent_examples.rb +25 -0
  40. data/spec/spidr_spec.rb +6 -29
  41. data/spidr.gemspec +38 -109
  42. metadata +35 -31
  43. data/lib/spidr/page/body.rb +0 -98
  44. data/spec/helpers/history.rb +0 -34
  45. data/spec/helpers/page.rb +0 -8
  46. data/spec/helpers/wsoc.rb +0 -83
  47. data/spec/page_examples.rb +0 -21
@@ -400,7 +400,7 @@ module Spidr
400
400
  @schemes = []
401
401
 
402
402
  if options[:schemes]
403
- @schemes += options[:schemes]
403
+ self.schemes = options[:schemes]
404
404
  else
405
405
  @schemes << 'http'
406
406
 
@@ -439,14 +439,6 @@ module Spidr
439
439
  if options[:host]
440
440
  visit_hosts_like(options[:host])
441
441
  end
442
-
443
- if options[:queue]
444
- self.queue = options[:queue]
445
- end
446
-
447
- if options[:history]
448
- self.history = options[:history]
449
- end
450
442
  end
451
443
 
452
444
  #
@@ -0,0 +1,36 @@
1
+ begin
2
+ require 'robots'
3
+ rescue LoadError
4
+ end
5
+
6
+ module Spidr
7
+ class Agent
8
+ #
9
+ # Initializes the robots filter.
10
+ #
11
+ def initialize_robots
12
+ unless Object.const_defined?(:Robots)
13
+ raise(ArgumentError,":robots option given but unable to require 'robots' gem")
14
+ end
15
+
16
+ @robots = Robots.new(@user_agent)
17
+ end
18
+
19
+ #
20
+ # Determines whether a URL is allowed by the robot policy.
21
+ #
22
+ # @param [URI::HTTP, String] url
23
+ # The URL to check.
24
+ #
25
+ # @return [Boolean]
26
+ # Specifies whether a URL is allowed by the robot policy.
27
+ #
28
+ def robot_allowed?(url)
29
+ if @robots
30
+ @robots.allowed?(url)
31
+ else
32
+ true
33
+ end
34
+ end
35
+ end
36
+ end
@@ -1,7 +1,3 @@
1
- require 'spidr/page/headers'
2
- require 'spidr/page/body'
3
- require 'spidr/page/links'
4
-
5
1
  module Spidr
6
2
  #
7
3
  # Represents a requested page from a website.
@@ -34,42 +30,89 @@ module Spidr
34
30
  end
35
31
 
36
32
  #
37
- # The meta-redirect links of the page.
33
+ # The body of the response.
34
+ #
35
+ # @return [String]
36
+ # The body of the response.
38
37
  #
39
- # @return [Array<String>]
40
- # All meta-redirect links in the page.
38
+ def body
39
+ (response.body || '')
40
+ end
41
+
42
+ alias to_s body
43
+
41
44
  #
42
- # @deprecated
43
- # Deprecated in 0.3.0 and will be removed in 0.4.0.
44
- # Use {#meta_redirects} instead.
45
+ # Returns a parsed document object for HTML, XML, RSS and Atom pages.
45
46
  #
46
- def meta_redirect
47
- STDERR.puts 'DEPRECATION: Spidr::Page#meta_redirect will be removed in 0.3.0'
48
- STDERR.puts 'DEPRECATION: Use Spidr::Page#meta_redirects instead'
47
+ # @return [Nokogiri::HTML::Document, Nokogiri::XML::Document, nil]
48
+ # The document that represents HTML or XML pages.
49
+ # Returns `nil` if the page is neither HTML, XML, RSS, Atom or if
50
+ # the page could not be parsed properly.
51
+ #
52
+ # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Document.html
53
+ # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/HTML/Document.html
54
+ #
55
+ def doc
56
+ unless body.empty?
57
+ doc_class = if html?
58
+ Nokogiri::HTML::Document
59
+ elsif rss? || atom? || xml? || xsl?
60
+ Nokogiri::XML::Document
61
+ end
49
62
 
50
- meta_redirects
63
+ if doc_class
64
+ begin
65
+ @doc ||= doc_class.parse(body, @url.to_s, content_charset)
66
+ rescue
67
+ end
68
+ end
69
+ end
51
70
  end
52
71
 
53
72
  #
54
- # Determines if the response code is `300`, `301`, `302`, `303`
55
- # or `307`. Also checks for "soft" redirects added at the page
56
- # level by a meta refresh tag.
73
+ # Searches the document for XPath or CSS Path paths.
74
+ #
75
+ # @param [Array<String>] paths
76
+ # CSS or XPath expressions to search the document with.
57
77
  #
58
- # @return [Boolean]
59
- # Specifies whether the response code is a HTTP Redirect code.
78
+ # @return [Array]
79
+ # The matched nodes from the document.
80
+ # Returns an empty Array if no nodes were matched, or if the page
81
+ # is not an HTML or XML document.
60
82
  #
61
- def is_redirect?
62
- case code
63
- when 300..303, 307
64
- true
65
- when 200
66
- meta_redirect?
83
+ # @example
84
+ # page.search('//a[@href]')
85
+ #
86
+ # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Node.html#M000239
87
+ #
88
+ def search(*paths)
89
+ if doc
90
+ doc.search(*paths)
67
91
  else
68
- false
92
+ []
69
93
  end
70
94
  end
71
95
 
72
- alias redirect? is_redirect?
96
+ #
97
+ # Searches for the first occurrence an XPath or CSS Path expression.
98
+ #
99
+ # @return [Nokogiri::HTML::Node, Nokogiri::XML::Node, nil]
100
+ # The first matched node. Returns `nil` if no nodes could be matched,
101
+ # or if the page is not a HTML or XML document.
102
+ #
103
+ # @example
104
+ # page.at('//title')
105
+ #
106
+ # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Node.html#M000251
107
+ #
108
+ def at(*arguments)
109
+ if doc
110
+ doc.at(*arguments)
111
+ end
112
+ end
113
+
114
+ alias / search
115
+ alias % at
73
116
 
74
117
  protected
75
118
 
@@ -90,7 +133,7 @@ module Spidr
90
133
  #
91
134
  def method_missing(name,*arguments,&block)
92
135
  if (arguments.empty? && block.nil?)
93
- header_name = name.to_s.sub('_','-')
136
+ header_name = name.to_s.tr('_','-')
94
137
 
95
138
  if @response.key?(header_name)
96
139
  return @response[header_name]
@@ -102,3 +145,8 @@ module Spidr
102
145
 
103
146
  end
104
147
  end
148
+
149
+ require 'spidr/page/status_codes'
150
+ require 'spidr/page/content_types'
151
+ require 'spidr/page/cookies'
152
+ require 'spidr/page/html'
@@ -1,98 +1,5 @@
1
- require 'set'
2
-
3
1
  module Spidr
4
2
  class Page
5
- # Reserved names used within Cookie strings
6
- RESERVED_COOKIE_NAMES = Set['path', 'expires', 'domain']
7
-
8
- #
9
- # The response code from the page.
10
- #
11
- # @return [Integer]
12
- # Response code from the page.
13
- #
14
- def code
15
- response.code.to_i
16
- end
17
-
18
- #
19
- # Determines if the response code is `200`.
20
- #
21
- # @return [Boolean]
22
- # Specifies whether the response code is `200`.
23
- #
24
- def is_ok?
25
- code == 200
26
- end
27
-
28
- alias ok? is_ok?
29
-
30
- #
31
- # Determines if the response code is `308`.
32
- #
33
- # @return [Boolean]
34
- # Specifies whether the response code is `308`.
35
- #
36
- def timedout?
37
- code == 308
38
- end
39
-
40
- #
41
- # Determines if the response code is `400`.
42
- #
43
- # @return [Boolean]
44
- # Specifies whether the response code is `400`.
45
- #
46
- def bad_request?
47
- code == 400
48
- end
49
-
50
- #
51
- # Determines if the response code is `401`.
52
- #
53
- # @return [Boolean]
54
- # Specifies whether the response code is `401`.
55
- #
56
- def is_unauthorized?
57
- code == 401
58
- end
59
-
60
- alias unauthorized? is_unauthorized?
61
-
62
- #
63
- # Determines if the response code is `403`.
64
- #
65
- # @return [Boolean]
66
- # Specifies whether the response code is `403`.
67
- #
68
- def is_forbidden?
69
- code == 403
70
- end
71
-
72
- alias forbidden? is_forbidden?
73
-
74
- #
75
- # Determines if the response code is `404`.
76
- #
77
- # @return [Boolean]
78
- # Specifies whether the response code is `404`.
79
- #
80
- def is_missing?
81
- code == 404
82
- end
83
-
84
- alias missing? is_missing?
85
-
86
- #
87
- # Determines if the response code is `500`.
88
- #
89
- # @return [Boolean]
90
- # Specifies whether the response code is `500`.
91
- #
92
- def had_internal_server_error?
93
- code == 500
94
- end
95
-
96
3
  #
97
4
  # The Content-Type of the page.
98
5
  #
@@ -100,7 +7,7 @@ module Spidr
100
7
  # The Content-Type of the page.
101
8
  #
102
9
  def content_type
103
- (response['Content-Type'] || '')
10
+ @response['Content-Type'] || ''
104
11
  end
105
12
 
106
13
  #
@@ -112,7 +19,7 @@ module Spidr
112
19
  # @since 0.2.2
113
20
  #
114
21
  def content_types
115
- (headers['content-type'] || [])
22
+ @response.get_fields('content-type') || []
116
23
  end
117
24
 
118
25
  #
@@ -314,57 +221,5 @@ module Spidr
314
221
  def zip?
315
222
  is_content_type?('application/zip')
316
223
  end
317
-
318
- #
319
- # The raw Cookie String sent along with the page.
320
- #
321
- # @return [String]
322
- # The raw Cookie from the response.
323
- #
324
- # @since 0.2.7
325
- #
326
- def cookie
327
- (response['Set-Cookie'] || '')
328
- end
329
-
330
- alias raw_cookie cookie
331
-
332
- #
333
- # The Cookie values sent along with the page.
334
- #
335
- # @return [Array<String>]
336
- # The Cookies from the response.
337
- #
338
- # @since 0.2.2
339
- #
340
- def cookies
341
- (headers['set-cookie'] || [])
342
- end
343
-
344
- #
345
- # The Cookie key -> value pairs returned with the response.
346
- #
347
- # @return [Hash{String => String}]
348
- # The cookie keys and values.
349
- #
350
- # @since 0.2.2
351
- #
352
- def cookie_params
353
- params = {}
354
-
355
- cookies.each do |value|
356
- value.split(';').each do |param|
357
- param.strip!
358
-
359
- name, value = param.split('=',2)
360
-
361
- unless RESERVED_COOKIE_NAMES.include?(name)
362
- params[name] = (value || '')
363
- end
364
- end
365
- end
366
-
367
- return params
368
- end
369
224
  end
370
225
  end
@@ -0,0 +1,60 @@
1
+ require 'set'
2
+
3
+ module Spidr
4
+ class Page
5
+ # Reserved names used within Cookie strings
6
+ RESERVED_COOKIE_NAMES = /^(?:Path|Expires|Domain|Secure|HTTPOnly)$/i
7
+
8
+ #
9
+ # The raw Cookie String sent along with the page.
10
+ #
11
+ # @return [String]
12
+ # The raw Cookie from the response.
13
+ #
14
+ # @since 0.2.7
15
+ #
16
+ def cookie
17
+ @response['Set-Cookie'] || ''
18
+ end
19
+
20
+ alias raw_cookie cookie
21
+
22
+ #
23
+ # The Cookie values sent along with the page.
24
+ #
25
+ # @return [Array<String>]
26
+ # The Cookies from the response.
27
+ #
28
+ # @since 0.2.2
29
+ #
30
+ def cookies
31
+ (@response.get_fields('Set-Cookie') || [])
32
+ end
33
+
34
+ #
35
+ # The Cookie key -> value pairs returned with the response.
36
+ #
37
+ # @return [Hash{String => String}]
38
+ # The cookie keys and values.
39
+ #
40
+ # @since 0.2.2
41
+ #
42
+ def cookie_params
43
+ params = {}
44
+
45
+ cookies.each do |value|
46
+ value.split(';').each do |param|
47
+ param.strip!
48
+
49
+ name, value = param.split('=',2)
50
+
51
+ unless name =~ RESERVED_COOKIE_NAMES
52
+ params[name] = (value || '')
53
+ end
54
+ end
55
+ end
56
+
57
+ return params
58
+ end
59
+ end
60
+ end
@@ -1,10 +1,22 @@
1
+ require 'nokogiri'
1
2
  require 'spidr/extensions/uri'
2
- require 'uri'
3
3
 
4
4
  module Spidr
5
5
  class Page
6
6
  include Enumerable
7
7
 
8
+ #
9
+ # The title of the HTML page.
10
+ #
11
+ # @return [String]
12
+ # The inner-text of the title element of the page.
13
+ #
14
+ def title
15
+ if (node = at('//title'))
16
+ node.inner_text
17
+ end
18
+ end
19
+
8
20
  #
9
21
  # Enumerates over the meta-redirect links in the page.
10
22
  #
@@ -21,7 +33,7 @@ module Spidr
21
33
  # @since 0.3.0
22
34
  #
23
35
  def each_meta_redirect
24
- return enum_for(:each_meta_redirect) unless block_given?
36
+ return enum_for(__method__) unless block_given?
25
37
 
26
38
  if (html? && doc)
27
39
  search('//meta[@http-equiv and @content]').each do |node|
@@ -44,7 +56,7 @@ module Spidr
44
56
  # Specifies whether the page includes page-level redirects.
45
57
  #
46
58
  def meta_redirect?
47
- !(each_meta_redirect.first.nil?)
59
+ !each_meta_redirect.first.nil?
48
60
  end
49
61
 
50
62
  #
@@ -59,6 +71,23 @@ module Spidr
59
71
  each_meta_redirect.to_a
60
72
  end
61
73
 
74
+ #
75
+ # The meta-redirect links of the page.
76
+ #
77
+ # @return [Array<String>]
78
+ # All meta-redirect links in the page.
79
+ #
80
+ # @deprecated
81
+ # Deprecated in 0.3.0 and will be removed in 0.4.0.
82
+ # Use {#meta_redirects} instead.
83
+ #
84
+ def meta_redirect
85
+ warn 'DEPRECATION: Spidr::Page#meta_redirect will be removed in 0.3.0'
86
+ warn 'DEPRECATION: Use Spidr::Page#meta_redirects instead'
87
+
88
+ meta_redirects
89
+ end
90
+
62
91
  #
63
92
  # Enumerates over every HTTP or meta-redirect link in the page.
64
93
  #
@@ -74,18 +103,14 @@ module Spidr
74
103
  # @since 0.3.0
75
104
  #
76
105
  def each_redirect(&block)
77
- return enum_for(:each_redirect) unless block
106
+ return enum_for(__method__) unless block
78
107
 
79
- location = headers['location']
80
-
81
- if location.nil?
108
+ if (locations = @response.get_fields('Location'))
109
+ # Location headers override any meta-refresh redirects in the HTML
110
+ locations.each(&block)
111
+ else
82
112
  # check page-level meta redirects if there isn't a location header
83
113
  each_meta_redirect(&block)
84
- elsif location.kind_of?(Array)
85
- location.each(&block)
86
- else
87
- # usually the location header contains a single String
88
- yield location
89
114
  end
90
115
  end
91
116
 
@@ -115,7 +140,7 @@ module Spidr
115
140
  # @since 0.5.0
116
141
  #
117
142
  def each_mailto
118
- return enum_for(:each_mailto) unless block_given?
143
+ return enum_for(__method__) unless block_given?
119
144
 
120
145
  if (html? && doc)
121
146
  doc.search('//a[starts-with(@href,"mailto:")]').each do |a|
@@ -151,7 +176,7 @@ module Spidr
151
176
  # @since 0.3.0
152
177
  #
153
178
  def each_link
154
- return enum_for(:each_link) unless block_given?
179
+ return enum_for(__method__) unless block_given?
155
180
 
156
181
  filter = lambda { |url|
157
182
  yield url unless (url.nil? || url.empty?)
@@ -208,7 +233,7 @@ module Spidr
208
233
  # @since 0.3.0
209
234
  #
210
235
  def each_url
211
- return enum_for(:each_url) unless block_given?
236
+ return enum_for(__method__) unless block_given?
212
237
 
213
238
  each_link do |link|
214
239
  if (url = to_absolute(link))
@@ -239,15 +264,14 @@ module Spidr
239
264
  # The normalized URI.
240
265
  #
241
266
  def to_absolute(link)
242
- begin
243
- new_url = url.merge(link.to_s)
244
- rescue Exception
245
- return nil
246
- end
247
-
248
- if new_url.path
249
- path = new_url.path
267
+ link = link.to_s
268
+ new_url = begin
269
+ url.merge(link)
270
+ rescue Exception
271
+ return
272
+ end
250
273
 
274
+ if (path = new_url.path)
251
275
  # ensure that paths begin with a leading '/' for URI::FTP
252
276
  if (new_url.scheme == 'ftp' && !path.start_with?('/'))
253
277
  path.insert(0,'/')