spidr 0.4.1 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. checksums.yaml +7 -0
  2. data/ChangeLog.md +69 -54
  3. data/Gemfile +9 -5
  4. data/LICENSE.txt +1 -1
  5. data/README.md +34 -26
  6. data/Rakefile +4 -15
  7. data/gemspec.yml +3 -2
  8. data/lib/spidr/agent.rb +101 -44
  9. data/lib/spidr/{actions → agent}/actions.rb +32 -12
  10. data/lib/spidr/{events.rb → agent/events.rb} +4 -8
  11. data/lib/spidr/{filters.rb → agent/filters.rb} +14 -16
  12. data/lib/spidr/{sanitizers.rb → agent/sanitizers.rb} +5 -7
  13. data/lib/spidr/auth_store.rb +2 -2
  14. data/lib/spidr/cookie_jar.rb +2 -2
  15. data/lib/spidr/extensions/uri.rb +28 -16
  16. data/lib/spidr/page.rb +7 -11
  17. data/lib/spidr/{body.rb → page/body.rb} +1 -1
  18. data/lib/spidr/{headers.rb → page/headers.rb} +1 -1
  19. data/lib/spidr/{links.rb → page/links.rb} +43 -7
  20. data/lib/spidr/session_cache.rb +2 -2
  21. data/lib/spidr/spidr.rb +32 -5
  22. data/lib/spidr/version.rb +1 -1
  23. data/spec/agent/actions_spec.rb +60 -0
  24. data/spec/agent/filters_spec.rb +62 -0
  25. data/spec/agent/sanitizers_spec.rb +62 -0
  26. data/spec/agent_spec.rb +13 -13
  27. data/spec/auth_store_spec.rb +17 -17
  28. data/spec/cookie_jar_spec.rb +26 -26
  29. data/spec/extensions/uri_spec.rb +19 -9
  30. data/spec/helpers/history.rb +5 -5
  31. data/spec/helpers/wsoc.rb +2 -2
  32. data/spec/page_examples.rb +4 -4
  33. data/spec/page_spec.rb +28 -25
  34. data/spec/rules_spec.rb +14 -14
  35. data/spec/session_cache.rb +7 -7
  36. data/spec/spidr_spec.rb +10 -10
  37. metadata +37 -51
  38. data/lib/spidr/actions.rb +0 -2
  39. data/lib/spidr/actions/exceptions.rb +0 -4
  40. data/lib/spidr/actions/exceptions/action.rb +0 -9
  41. data/lib/spidr/actions/exceptions/paused.rb +0 -11
  42. data/lib/spidr/actions/exceptions/skip_link.rb +0 -12
  43. data/lib/spidr/actions/exceptions/skip_page.rb +0 -12
  44. data/spec/actions_spec.rb +0 -59
  45. data/spec/filters_spec.rb +0 -61
  46. data/spec/sanitizers_spec.rb +0 -61
@@ -1,13 +1,33 @@
1
- require 'spidr/actions/exceptions/paused'
2
- require 'spidr/actions/exceptions/skip_link'
3
- require 'spidr/actions/exceptions/skip_page'
4
-
5
1
  module Spidr
6
- #
7
- # The {Actions} module adds methods to {Agent} for controlling the
8
- # spidering of links.
9
- #
10
- module Actions
2
+ class Agent
3
+ module Actions
4
+ #
5
+ # The base {Actions} exception class.
6
+ #
7
+ class Action < RuntimeError
8
+ end
9
+
10
+ #
11
+ # An {Actions} exception class used to pause a running {Agent}.
12
+ #
13
+ class Paused < Action
14
+ end
15
+
16
+ #
17
+ # An {Actions} exception class which causes a running {Agent} to
18
+ # skip a link.
19
+ #
20
+ class SkipLink < Action
21
+ end
22
+
23
+ #
24
+ # An {Actions} exception class which causes a running {Agent} to
25
+ # skip a {Page}, and all links within that page.
26
+ #
27
+ class SkipPage < Action
28
+ end
29
+ end
30
+
11
31
  #
12
32
  # Continue spidering.
13
33
  #
@@ -40,7 +60,7 @@ module Spidr
40
60
  #
41
61
  def pause!
42
62
  @paused = true
43
- raise(Paused)
63
+ raise(Actions::Paused)
44
64
  end
45
65
 
46
66
  #
@@ -61,7 +81,7 @@ module Spidr
61
81
  # and not enqueued or visited.
62
82
  #
63
83
  def skip_link!
64
- raise(SkipLink)
84
+ raise(Actions::SkipLink)
65
85
  end
66
86
 
67
87
  #
@@ -71,7 +91,7 @@ module Spidr
71
91
  # Indicates to the agent, that the current page should be skipped.
72
92
  #
73
93
  def skip_page!
74
- raise(SkipPage)
94
+ raise(Actions::SkipPage)
75
95
  end
76
96
 
77
97
  protected
@@ -1,10 +1,5 @@
1
1
  module Spidr
2
- #
3
- # The {Events} module adds methods to {Agent} for registering
4
- # callbacks which will receive URLs, links, headers and pages, when
5
- # they are visited.
6
- #
7
- module Events
2
+ class Agent
8
3
  #
9
4
  # Pass each URL from each page visited to the given block.
10
5
  #
@@ -526,12 +521,13 @@ module Spidr
526
521
  protected
527
522
 
528
523
  def initialize_events(options={})
529
- @every_url_blocks = []
524
+ @every_url_blocks = []
530
525
  @every_failed_url_blocks = []
531
- @every_url_like_blocks = Hash.new { |hash,key| hash[key] = [] }
526
+ @every_url_like_blocks = Hash.new { |hash,key| hash[key] = [] }
532
527
 
533
528
  @every_page_blocks = []
534
529
  @every_link_blocks = []
535
530
  end
531
+
536
532
  end
537
533
  end
@@ -1,11 +1,8 @@
1
1
  require 'spidr/rules'
2
2
 
3
3
  module Spidr
4
- #
5
- # The {Filters} module adds methods to {Agent} for controlling which
6
- # URLs the agent will visit.
7
- #
8
- module Filters
4
+ class Agent
5
+
9
6
  # List of acceptable URL schemes to follow
10
7
  attr_reader :schemes
11
8
 
@@ -419,24 +416,24 @@ module Spidr
419
416
  end
420
417
 
421
418
  @host_rules = Rules.new(
422
- :accept => options[:hosts],
423
- :reject => options[:ignore_hosts]
419
+ accept: options[:hosts],
420
+ reject: options[:ignore_hosts]
424
421
  )
425
422
  @port_rules = Rules.new(
426
- :accept => options[:ports],
427
- :reject => options[:ignore_ports]
423
+ accept: options[:ports],
424
+ reject: options[:ignore_ports]
428
425
  )
429
426
  @link_rules = Rules.new(
430
- :accept => options[:links],
431
- :reject => options[:ignore_links]
427
+ accept: options[:links],
428
+ reject: options[:ignore_links]
432
429
  )
433
430
  @url_rules = Rules.new(
434
- :accept => options[:urls],
435
- :reject => options[:ignore_urls]
431
+ accept: options[:urls],
432
+ reject: options[:ignore_urls]
436
433
  )
437
434
  @ext_rules = Rules.new(
438
- :accept => options[:exts],
439
- :reject => options[:ignore_exts]
435
+ accept: options[:exts],
436
+ reject: options[:ignore_exts]
440
437
  )
441
438
 
442
439
  if options[:host]
@@ -511,7 +508,7 @@ module Spidr
511
508
  #
512
509
  # Determines if a given URL should be visited.
513
510
  #
514
- # @param [URI::HTTP, URI::HTTPS] url
511
+ # @param [URI::HTTP, URI::HTTPS] link
515
512
  # The URL.
516
513
  #
517
514
  # @return [Boolean]
@@ -535,5 +532,6 @@ module Spidr
535
532
  def visit_ext?(path)
536
533
  @ext_rules.accept?(File.extname(path)[1..-1])
537
534
  end
535
+
538
536
  end
539
537
  end
@@ -1,11 +1,8 @@
1
1
  require 'uri'
2
2
 
3
3
  module Spidr
4
- #
5
- # The {Sanitizers} module adds methods to {Agent} which control the
6
- # sanitation of incoming links.
7
- #
8
- module Sanitizers
4
+ class Agent
5
+
9
6
  # Specifies whether the Agent will strip URI fragments
10
7
  attr_accessor :strip_fragments
11
8
 
@@ -27,7 +24,7 @@ module Spidr
27
24
  url = URI(url.to_s) unless url.kind_of?(URI)
28
25
 
29
26
  url.fragment = nil if @strip_fragments
30
- url.query = nil if @strip_query
27
+ url.query = nil if @strip_query
31
28
 
32
29
  return url
33
30
  end
@@ -50,7 +47,8 @@ module Spidr
50
47
  #
51
48
  def initialize_sanitizers(options={})
52
49
  @strip_fragments = options.fetch(:strip_fragments,true)
53
- @strip_query = options.fetch(:strip_query,false)
50
+ @strip_query = options.fetch(:strip_query,false)
54
51
  end
52
+
55
53
  end
56
54
  end
@@ -57,10 +57,10 @@ module Spidr
57
57
  #
58
58
  # Add an auth credential to the store for supplied base URL.
59
59
  #
60
- # @param [URI] url_base
60
+ # @param [URI] url
61
61
  # A URL pattern to associate with a set of auth credentials.
62
62
  #
63
- # @param [AuthCredential]
63
+ # @param [AuthCredential] auth
64
64
  # The auth credential for this URL pattern.
65
65
  #
66
66
  # @return [AuthCredential]
@@ -18,7 +18,7 @@ module Spidr
18
18
  def initialize
19
19
  @params = {}
20
20
 
21
- @dirty = Set[]
21
+ @dirty = Set[]
22
22
  @cookies = {}
23
23
  end
24
24
 
@@ -147,7 +147,7 @@ module Spidr
147
147
  #
148
148
  def cookies_for_host(host)
149
149
  host_cookies = (@params[host] || {})
150
- sub_domains = host.split('.')
150
+ sub_domains = host.split('.')
151
151
 
152
152
  while sub_domains.length > 2
153
153
  sub_domains.shift
@@ -1,4 +1,5 @@
1
1
  require 'uri'
2
+ require 'strscan'
2
3
 
3
4
  module URI
4
5
  #
@@ -26,27 +27,38 @@ module URI
26
27
  # URI.expand_path('/test/../path')
27
28
  # # => "/path"
28
29
  #
29
- def URI.expand_path(path)
30
- dirs = path.split(/\/+/)
30
+ def self.expand_path(path)
31
+ if path.start_with?('/')
32
+ leading_slash, path = path[0,1], path[1..-1]
33
+ else
34
+ leading_slash = ''
35
+ end
31
36
 
32
- # append any tailing '/' chars, lost due to String#split
33
- dirs << '' if path[-1,1] == '/'
37
+ if path.end_with?('/')
38
+ trailing_slash, path = path[-1,1], path[0..-2]
39
+ else
40
+ trailing_slash = ''
41
+ end
34
42
 
35
- new_dirs = []
43
+ scanner = StringScanner.new(path)
44
+ stack = []
36
45
 
37
- dirs.each do |dir|
38
- if dir == '..'
39
- new_dirs.pop
40
- elsif dir != '.'
41
- new_dirs.push(dir)
46
+ until scanner.eos?
47
+ if (dir = scanner.scan(/^[^\/]+/))
48
+ case dir
49
+ when '..' then stack.pop
50
+ when '.' then false
51
+ else stack.push(dir)
52
+ end
53
+ else
54
+ scanner.skip(/\/+/)
42
55
  end
43
56
  end
44
57
 
45
- full_path = new_dirs.join('/')
46
-
47
- # default empty paths to '/'
48
- full_path = '/' if full_path.empty?
49
-
50
- return full_path
58
+ unless stack.empty?
59
+ "#{leading_slash}#{stack.join('/')}#{trailing_slash}"
60
+ else
61
+ '/'
62
+ end
51
63
  end
52
64
  end
@@ -1,6 +1,6 @@
1
- require 'spidr/headers'
2
- require 'spidr/body'
3
- require 'spidr/links'
1
+ require 'spidr/page/headers'
2
+ require 'spidr/page/body'
3
+ require 'spidr/page/links'
4
4
 
5
5
  module Spidr
6
6
  #
@@ -8,10 +8,6 @@ module Spidr
8
8
  #
9
9
  class Page
10
10
 
11
- include Headers
12
- include Body
13
- include Links
14
-
15
11
  # URL of the page
16
12
  attr_reader :url
17
13
 
@@ -27,14 +23,14 @@ module Spidr
27
23
  # @param [URI::HTTP] url
28
24
  # The URL of the page.
29
25
  #
30
- # @param [Net::HTTP::Response] response
26
+ # @param [Net::HTTPResponse] response
31
27
  # The response from the request for the page.
32
28
  #
33
29
  def initialize(url,response)
34
- @url = url
30
+ @url = url
35
31
  @response = response
36
- @headers = response.to_hash
37
- @doc = nil
32
+ @headers = response.to_hash
33
+ @doc = nil
38
34
  end
39
35
 
40
36
  #
@@ -1,7 +1,7 @@
1
1
  require 'nokogiri'
2
2
 
3
3
  module Spidr
4
- module Body
4
+ class Page
5
5
  #
6
6
  # The body of the response.
7
7
  #
@@ -1,7 +1,7 @@
1
1
  require 'set'
2
2
 
3
3
  module Spidr
4
- module Headers
4
+ class Page
5
5
  # Reserved names used within Cookie strings
6
6
  RESERVED_COOKIE_NAMES = Set['path', 'expires', 'domain']
7
7
 
@@ -2,7 +2,7 @@ require 'spidr/extensions/uri'
2
2
  require 'uri'
3
3
 
4
4
  module Spidr
5
- module Links
5
+ class Page
6
6
  include Enumerable
7
7
 
8
8
  #
@@ -100,6 +100,42 @@ module Spidr
100
100
  each_redirect.to_a
101
101
  end
102
102
 
103
+ #
104
+ # Enumerates over every `mailto:` link in the page.
105
+ #
106
+ # @yield [link]
107
+ # The given block will be passed every `mailto:` link from the page.
108
+ #
109
+ # @yieldparam [String] link
110
+ # A `mailto:` link from the page.
111
+ #
112
+ # @return [Enumerator]
113
+ # If no block is given, an enumerator object will be returned.
114
+ #
115
+ # @since 0.5.0
116
+ #
117
+ def each_mailto
118
+ return enum_for(:each_mailto) unless block_given?
119
+
120
+ if (html? && doc)
121
+ doc.search('//a[starts-with(@href,"mailto:")]').each do |a|
122
+ yield a.get_attribute('href')[7..-1]
123
+ end
124
+ end
125
+ end
126
+
127
+ #
128
+ # `mailto:` links in the page.
129
+ #
130
+ # @return [Array<String>]
131
+ # The `mailto:` links found within the page.
132
+ #
133
+ # @since 0.5.0
134
+ #
135
+ def mailtos
136
+ each_mailto.to_a
137
+ end
138
+
103
139
  #
104
140
  # Enumerates over every link in the page.
105
141
  #
@@ -124,23 +160,23 @@ module Spidr
124
160
  each_redirect(&filter) if is_redirect?
125
161
 
126
162
  if (html? && doc)
127
- doc.search('a[@href]').each do |a|
163
+ doc.search('//a[@href]').each do |a|
128
164
  filter.call(a.get_attribute('href'))
129
165
  end
130
166
 
131
- doc.search('frame[@src]').each do |iframe|
167
+ doc.search('//frame[@src]').each do |iframe|
132
168
  filter.call(iframe.get_attribute('src'))
133
169
  end
134
170
 
135
- doc.search('iframe[@src]').each do |iframe|
171
+ doc.search('//iframe[@src]').each do |iframe|
136
172
  filter.call(iframe.get_attribute('src'))
137
173
  end
138
174
 
139
- doc.search('link[@href]').each do |link|
175
+ doc.search('//link[@href]').each do |link|
140
176
  filter.call(link.get_attribute('href'))
141
177
  end
142
178
 
143
- doc.search('script[@src]').each do |script|
179
+ doc.search('//script[@src]').each do |script|
144
180
  filter.call(script.get_attribute('src'))
145
181
  end
146
182
  end
@@ -213,7 +249,7 @@ module Spidr
213
249
  path = new_url.path
214
250
 
215
251
  # ensure that paths begin with a leading '/' for URI::FTP
216
- if (new_url.scheme == 'ftp' && path[0,1] != '/')
252
+ if (new_url.scheme == 'ftp' && !path.start_with?('/'))
217
253
  path.insert(0,'/')
218
254
  end
219
255
 
@@ -32,7 +32,7 @@ module Spidr
32
32
  # @since 0.2.2
33
33
  #
34
34
  def initialize(proxy=Spidr.proxy)
35
- @proxy = proxy
35
+ @proxy = proxy
36
36
  @sessions = {}
37
37
  end
38
38
 
@@ -82,7 +82,7 @@ module Spidr
82
82
  ).new(url.host,url.port)
83
83
 
84
84
  if url.scheme == 'https'
85
- session.use_ssl = true
85
+ session.use_ssl = true
86
86
  session.verify_mode = OpenSSL::SSL::VERIFY_NONE
87
87
  session.start
88
88
  end