spidr 0.4.1 → 0.5.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (46) hide show
  1. checksums.yaml +7 -0
  2. data/ChangeLog.md +69 -54
  3. data/Gemfile +9 -5
  4. data/LICENSE.txt +1 -1
  5. data/README.md +34 -26
  6. data/Rakefile +4 -15
  7. data/gemspec.yml +3 -2
  8. data/lib/spidr/agent.rb +101 -44
  9. data/lib/spidr/{actions → agent}/actions.rb +32 -12
  10. data/lib/spidr/{events.rb → agent/events.rb} +4 -8
  11. data/lib/spidr/{filters.rb → agent/filters.rb} +14 -16
  12. data/lib/spidr/{sanitizers.rb → agent/sanitizers.rb} +5 -7
  13. data/lib/spidr/auth_store.rb +2 -2
  14. data/lib/spidr/cookie_jar.rb +2 -2
  15. data/lib/spidr/extensions/uri.rb +28 -16
  16. data/lib/spidr/page.rb +7 -11
  17. data/lib/spidr/{body.rb → page/body.rb} +1 -1
  18. data/lib/spidr/{headers.rb → page/headers.rb} +1 -1
  19. data/lib/spidr/{links.rb → page/links.rb} +43 -7
  20. data/lib/spidr/session_cache.rb +2 -2
  21. data/lib/spidr/spidr.rb +32 -5
  22. data/lib/spidr/version.rb +1 -1
  23. data/spec/agent/actions_spec.rb +60 -0
  24. data/spec/agent/filters_spec.rb +62 -0
  25. data/spec/agent/sanitizers_spec.rb +62 -0
  26. data/spec/agent_spec.rb +13 -13
  27. data/spec/auth_store_spec.rb +17 -17
  28. data/spec/cookie_jar_spec.rb +26 -26
  29. data/spec/extensions/uri_spec.rb +19 -9
  30. data/spec/helpers/history.rb +5 -5
  31. data/spec/helpers/wsoc.rb +2 -2
  32. data/spec/page_examples.rb +4 -4
  33. data/spec/page_spec.rb +28 -25
  34. data/spec/rules_spec.rb +14 -14
  35. data/spec/session_cache.rb +7 -7
  36. data/spec/spidr_spec.rb +10 -10
  37. metadata +37 -51
  38. data/lib/spidr/actions.rb +0 -2
  39. data/lib/spidr/actions/exceptions.rb +0 -4
  40. data/lib/spidr/actions/exceptions/action.rb +0 -9
  41. data/lib/spidr/actions/exceptions/paused.rb +0 -11
  42. data/lib/spidr/actions/exceptions/skip_link.rb +0 -12
  43. data/lib/spidr/actions/exceptions/skip_page.rb +0 -12
  44. data/spec/actions_spec.rb +0 -59
  45. data/spec/filters_spec.rb +0 -61
  46. data/spec/sanitizers_spec.rb +0 -61
@@ -1,13 +1,33 @@
1
- require 'spidr/actions/exceptions/paused'
2
- require 'spidr/actions/exceptions/skip_link'
3
- require 'spidr/actions/exceptions/skip_page'
4
-
5
1
  module Spidr
6
- #
7
- # The {Actions} module adds methods to {Agent} for controlling the
8
- # spidering of links.
9
- #
10
- module Actions
2
+ class Agent
3
+ module Actions
4
+ #
5
+ # The base {Actions} exception class.
6
+ #
7
+ class Action < RuntimeError
8
+ end
9
+
10
+ #
11
+ # An {Actions} exception class used to pause a running {Agent}.
12
+ #
13
+ class Paused < Action
14
+ end
15
+
16
+ #
17
+ # An {Actions} exception class which causes a running {Agent} to
18
+ # skip a link.
19
+ #
20
+ class SkipLink < Action
21
+ end
22
+
23
+ #
24
+ # An {Actions} exception class which causes a running {Agent} to
25
+ # skip a {Page}, and all links within that page.
26
+ #
27
+ class SkipPage < Action
28
+ end
29
+ end
30
+
11
31
  #
12
32
  # Continue spidering.
13
33
  #
@@ -40,7 +60,7 @@ module Spidr
40
60
  #
41
61
  def pause!
42
62
  @paused = true
43
- raise(Paused)
63
+ raise(Actions::Paused)
44
64
  end
45
65
 
46
66
  #
@@ -61,7 +81,7 @@ module Spidr
61
81
  # and not enqueued or visited.
62
82
  #
63
83
  def skip_link!
64
- raise(SkipLink)
84
+ raise(Actions::SkipLink)
65
85
  end
66
86
 
67
87
  #
@@ -71,7 +91,7 @@ module Spidr
71
91
  # Indicates to the agent, that the current page should be skipped.
72
92
  #
73
93
  def skip_page!
74
- raise(SkipPage)
94
+ raise(Actions::SkipPage)
75
95
  end
76
96
 
77
97
  protected
@@ -1,10 +1,5 @@
1
1
  module Spidr
2
- #
3
- # The {Events} module adds methods to {Agent} for registering
4
- # callbacks which will receive URLs, links, headers and pages, when
5
- # they are visited.
6
- #
7
- module Events
2
+ class Agent
8
3
  #
9
4
  # Pass each URL from each page visited to the given block.
10
5
  #
@@ -526,12 +521,13 @@ module Spidr
526
521
  protected
527
522
 
528
523
  def initialize_events(options={})
529
- @every_url_blocks = []
524
+ @every_url_blocks = []
530
525
  @every_failed_url_blocks = []
531
- @every_url_like_blocks = Hash.new { |hash,key| hash[key] = [] }
526
+ @every_url_like_blocks = Hash.new { |hash,key| hash[key] = [] }
532
527
 
533
528
  @every_page_blocks = []
534
529
  @every_link_blocks = []
535
530
  end
531
+
536
532
  end
537
533
  end
@@ -1,11 +1,8 @@
1
1
  require 'spidr/rules'
2
2
 
3
3
  module Spidr
4
- #
5
- # The {Filters} module adds methods to {Agent} for controlling which
6
- # URLs the agent will visit.
7
- #
8
- module Filters
4
+ class Agent
5
+
9
6
  # List of acceptable URL schemes to follow
10
7
  attr_reader :schemes
11
8
 
@@ -419,24 +416,24 @@ module Spidr
419
416
  end
420
417
 
421
418
  @host_rules = Rules.new(
422
- :accept => options[:hosts],
423
- :reject => options[:ignore_hosts]
419
+ accept: options[:hosts],
420
+ reject: options[:ignore_hosts]
424
421
  )
425
422
  @port_rules = Rules.new(
426
- :accept => options[:ports],
427
- :reject => options[:ignore_ports]
423
+ accept: options[:ports],
424
+ reject: options[:ignore_ports]
428
425
  )
429
426
  @link_rules = Rules.new(
430
- :accept => options[:links],
431
- :reject => options[:ignore_links]
427
+ accept: options[:links],
428
+ reject: options[:ignore_links]
432
429
  )
433
430
  @url_rules = Rules.new(
434
- :accept => options[:urls],
435
- :reject => options[:ignore_urls]
431
+ accept: options[:urls],
432
+ reject: options[:ignore_urls]
436
433
  )
437
434
  @ext_rules = Rules.new(
438
- :accept => options[:exts],
439
- :reject => options[:ignore_exts]
435
+ accept: options[:exts],
436
+ reject: options[:ignore_exts]
440
437
  )
441
438
 
442
439
  if options[:host]
@@ -511,7 +508,7 @@ module Spidr
511
508
  #
512
509
  # Determines if a given URL should be visited.
513
510
  #
514
- # @param [URI::HTTP, URI::HTTPS] url
511
+ # @param [URI::HTTP, URI::HTTPS] link
515
512
  # The URL.
516
513
  #
517
514
  # @return [Boolean]
@@ -535,5 +532,6 @@ module Spidr
535
532
  def visit_ext?(path)
536
533
  @ext_rules.accept?(File.extname(path)[1..-1])
537
534
  end
535
+
538
536
  end
539
537
  end
@@ -1,11 +1,8 @@
1
1
  require 'uri'
2
2
 
3
3
  module Spidr
4
- #
5
- # The {Sanitizers} module adds methods to {Agent} which control the
6
- # sanitation of incoming links.
7
- #
8
- module Sanitizers
4
+ class Agent
5
+
9
6
  # Specifies whether the Agent will strip URI fragments
10
7
  attr_accessor :strip_fragments
11
8
 
@@ -27,7 +24,7 @@ module Spidr
27
24
  url = URI(url.to_s) unless url.kind_of?(URI)
28
25
 
29
26
  url.fragment = nil if @strip_fragments
30
- url.query = nil if @strip_query
27
+ url.query = nil if @strip_query
31
28
 
32
29
  return url
33
30
  end
@@ -50,7 +47,8 @@ module Spidr
50
47
  #
51
48
  def initialize_sanitizers(options={})
52
49
  @strip_fragments = options.fetch(:strip_fragments,true)
53
- @strip_query = options.fetch(:strip_query,false)
50
+ @strip_query = options.fetch(:strip_query,false)
54
51
  end
52
+
55
53
  end
56
54
  end
@@ -57,10 +57,10 @@ module Spidr
57
57
  #
58
58
  # Add an auth credential to the store for supplied base URL.
59
59
  #
60
- # @param [URI] url_base
60
+ # @param [URI] url
61
61
  # A URL pattern to associate with a set of auth credentials.
62
62
  #
63
- # @param [AuthCredential]
63
+ # @param [AuthCredential] auth
64
64
  # The auth credential for this URL pattern.
65
65
  #
66
66
  # @return [AuthCredential]
@@ -18,7 +18,7 @@ module Spidr
18
18
  def initialize
19
19
  @params = {}
20
20
 
21
- @dirty = Set[]
21
+ @dirty = Set[]
22
22
  @cookies = {}
23
23
  end
24
24
 
@@ -147,7 +147,7 @@ module Spidr
147
147
  #
148
148
  def cookies_for_host(host)
149
149
  host_cookies = (@params[host] || {})
150
- sub_domains = host.split('.')
150
+ sub_domains = host.split('.')
151
151
 
152
152
  while sub_domains.length > 2
153
153
  sub_domains.shift
@@ -1,4 +1,5 @@
1
1
  require 'uri'
2
+ require 'strscan'
2
3
 
3
4
  module URI
4
5
  #
@@ -26,27 +27,38 @@ module URI
26
27
  # URI.expand_path('/test/../path')
27
28
  # # => "/path"
28
29
  #
29
- def URI.expand_path(path)
30
- dirs = path.split(/\/+/)
30
+ def self.expand_path(path)
31
+ if path.start_with?('/')
32
+ leading_slash, path = path[0,1], path[1..-1]
33
+ else
34
+ leading_slash = ''
35
+ end
31
36
 
32
- # append any tailing '/' chars, lost due to String#split
33
- dirs << '' if path[-1,1] == '/'
37
+ if path.end_with?('/')
38
+ trailing_slash, path = path[-1,1], path[0..-2]
39
+ else
40
+ trailing_slash = ''
41
+ end
34
42
 
35
- new_dirs = []
43
+ scanner = StringScanner.new(path)
44
+ stack = []
36
45
 
37
- dirs.each do |dir|
38
- if dir == '..'
39
- new_dirs.pop
40
- elsif dir != '.'
41
- new_dirs.push(dir)
46
+ until scanner.eos?
47
+ if (dir = scanner.scan(/^[^\/]+/))
48
+ case dir
49
+ when '..' then stack.pop
50
+ when '.' then false
51
+ else stack.push(dir)
52
+ end
53
+ else
54
+ scanner.skip(/\/+/)
42
55
  end
43
56
  end
44
57
 
45
- full_path = new_dirs.join('/')
46
-
47
- # default empty paths to '/'
48
- full_path = '/' if full_path.empty?
49
-
50
- return full_path
58
+ unless stack.empty?
59
+ "#{leading_slash}#{stack.join('/')}#{trailing_slash}"
60
+ else
61
+ '/'
62
+ end
51
63
  end
52
64
  end
@@ -1,6 +1,6 @@
1
- require 'spidr/headers'
2
- require 'spidr/body'
3
- require 'spidr/links'
1
+ require 'spidr/page/headers'
2
+ require 'spidr/page/body'
3
+ require 'spidr/page/links'
4
4
 
5
5
  module Spidr
6
6
  #
@@ -8,10 +8,6 @@ module Spidr
8
8
  #
9
9
  class Page
10
10
 
11
- include Headers
12
- include Body
13
- include Links
14
-
15
11
  # URL of the page
16
12
  attr_reader :url
17
13
 
@@ -27,14 +23,14 @@ module Spidr
27
23
  # @param [URI::HTTP] url
28
24
  # The URL of the page.
29
25
  #
30
- # @param [Net::HTTP::Response] response
26
+ # @param [Net::HTTPResponse] response
31
27
  # The response from the request for the page.
32
28
  #
33
29
  def initialize(url,response)
34
- @url = url
30
+ @url = url
35
31
  @response = response
36
- @headers = response.to_hash
37
- @doc = nil
32
+ @headers = response.to_hash
33
+ @doc = nil
38
34
  end
39
35
 
40
36
  #
@@ -1,7 +1,7 @@
1
1
  require 'nokogiri'
2
2
 
3
3
  module Spidr
4
- module Body
4
+ class Page
5
5
  #
6
6
  # The body of the response.
7
7
  #
@@ -1,7 +1,7 @@
1
1
  require 'set'
2
2
 
3
3
  module Spidr
4
- module Headers
4
+ class Page
5
5
  # Reserved names used within Cookie strings
6
6
  RESERVED_COOKIE_NAMES = Set['path', 'expires', 'domain']
7
7
 
@@ -2,7 +2,7 @@ require 'spidr/extensions/uri'
2
2
  require 'uri'
3
3
 
4
4
  module Spidr
5
- module Links
5
+ class Page
6
6
  include Enumerable
7
7
 
8
8
  #
@@ -100,6 +100,42 @@ module Spidr
100
100
  each_redirect.to_a
101
101
  end
102
102
 
103
+ #
104
+ # Enumerates over every `mailto:` link in the page.
105
+ #
106
+ # @yield [link]
107
+ # The given block will be passed every `mailto:` link from the page.
108
+ #
109
+ # @yieldparam [String] link
110
+ # A `mailto:` link from the page.
111
+ #
112
+ # @return [Enumerator]
113
+ # If no block is given, an enumerator object will be returned.
114
+ #
115
+ # @since 0.5.0
116
+ #
117
+ def each_mailto
118
+ return enum_for(:each_mailto) unless block_given?
119
+
120
+ if (html? && doc)
121
+ doc.search('//a[starts-with(@href,"mailto:")]').each do |a|
122
+ yield a.get_attribute('href')[7..-1]
123
+ end
124
+ end
125
+ end
126
+
127
+ #
128
+ # `mailto:` links in the page.
129
+ #
130
+ # @return [Array<String>]
131
+ # The `mailto:` links found within the page.
132
+ #
133
+ # @since 0.5.0
134
+ #
135
+ def mailtos
136
+ each_mailto.to_a
137
+ end
138
+
103
139
  #
104
140
  # Enumerates over every link in the page.
105
141
  #
@@ -124,23 +160,23 @@ module Spidr
124
160
  each_redirect(&filter) if is_redirect?
125
161
 
126
162
  if (html? && doc)
127
- doc.search('a[@href]').each do |a|
163
+ doc.search('//a[@href]').each do |a|
128
164
  filter.call(a.get_attribute('href'))
129
165
  end
130
166
 
131
- doc.search('frame[@src]').each do |iframe|
167
+ doc.search('//frame[@src]').each do |iframe|
132
168
  filter.call(iframe.get_attribute('src'))
133
169
  end
134
170
 
135
- doc.search('iframe[@src]').each do |iframe|
171
+ doc.search('//iframe[@src]').each do |iframe|
136
172
  filter.call(iframe.get_attribute('src'))
137
173
  end
138
174
 
139
- doc.search('link[@href]').each do |link|
175
+ doc.search('//link[@href]').each do |link|
140
176
  filter.call(link.get_attribute('href'))
141
177
  end
142
178
 
143
- doc.search('script[@src]').each do |script|
179
+ doc.search('//script[@src]').each do |script|
144
180
  filter.call(script.get_attribute('src'))
145
181
  end
146
182
  end
@@ -213,7 +249,7 @@ module Spidr
213
249
  path = new_url.path
214
250
 
215
251
  # ensure that paths begin with a leading '/' for URI::FTP
216
- if (new_url.scheme == 'ftp' && path[0,1] != '/')
252
+ if (new_url.scheme == 'ftp' && !path.start_with?('/'))
217
253
  path.insert(0,'/')
218
254
  end
219
255
 
@@ -32,7 +32,7 @@ module Spidr
32
32
  # @since 0.2.2
33
33
  #
34
34
  def initialize(proxy=Spidr.proxy)
35
- @proxy = proxy
35
+ @proxy = proxy
36
36
  @sessions = {}
37
37
  end
38
38
 
@@ -82,7 +82,7 @@ module Spidr
82
82
  ).new(url.host,url.port)
83
83
 
84
84
  if url.scheme == 'https'
85
- session.use_ssl = true
85
+ session.use_ssl = true
86
86
  session.verify_mode = OpenSSL::SSL::VERIFY_NONE
87
87
  session.start
88
88
  end