spidr 0.6.0 → 0.7.0

This diff shows the changes between two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in the public registry.
@@ -105,7 +105,9 @@ module Spidr
105
105
  def each_redirect(&block)
106
106
  return enum_for(__method__) unless block
107
107
 
108
- if (locations = @response.get_fields('Location'))
108
+ locations = @response.get_fields('Location')
109
+
110
+ unless (locations.nil? || locations.empty?)
109
111
  # Location headers override any meta-refresh redirects in the HTML
110
112
  locations.each(&block)
111
113
  else
@@ -175,34 +177,30 @@ module Spidr
175
177
  #
176
178
  # @since 0.3.0
177
179
  #
178
- def each_link
180
+ def each_link(&block)
179
181
  return enum_for(__method__) unless block_given?
180
182
 
181
- filter = lambda { |url|
182
- yield url unless (url.nil? || url.empty?)
183
- }
184
-
185
- each_redirect(&filter) if is_redirect?
183
+ each_redirect(&block) if is_redirect?
186
184
 
187
185
  if (html? && doc)
188
- doc.search('//a[@href]').each do |a|
189
- filter.call(a.get_attribute('href'))
186
+ doc.search('//a[@href[string()]]').each do |a|
187
+ yield a.get_attribute('href')
190
188
  end
191
189
 
192
- doc.search('//frame[@src]').each do |iframe|
193
- filter.call(iframe.get_attribute('src'))
190
+ doc.search('//frame[@src[string()]]').each do |iframe|
191
+ yield iframe.get_attribute('src')
194
192
  end
195
193
 
196
- doc.search('//iframe[@src]').each do |iframe|
197
- filter.call(iframe.get_attribute('src'))
194
+ doc.search('//iframe[@src[string()]]').each do |iframe|
195
+ yield iframe.get_attribute('src')
198
196
  end
199
197
 
200
- doc.search('//link[@href]').each do |link|
201
- filter.call(link.get_attribute('href'))
198
+ doc.search('//link[@href[string()]]').each do |link|
199
+ yield link.get_attribute('href')
202
200
  end
203
201
 
204
- doc.search('//script[@src]').each do |script|
205
- filter.call(script.get_attribute('src'))
202
+ doc.search('//script[@src[string()]]').each do |script|
203
+ yield script.get_attribute('src')
206
204
  end
207
205
  end
208
206
  end
@@ -211,7 +209,7 @@ module Spidr
211
209
  # The links from within the page.
212
210
  #
213
211
  # @return [Array<String>]
214
- # All links within the HTML page, frame/iframe source URLs and any
212
+ # All links within the HTML page, `frame`/`iframe` source URLs and any
215
213
  # links in the `Location` header.
216
214
  #
217
215
  def links
@@ -271,7 +269,7 @@ module Spidr
271
269
  return
272
270
  end
273
271
 
274
- if (path = new_url.path)
272
+ if (!new_url.opaque) && (path = new_url.path)
275
273
  # ensure that paths begin with a leading '/' for URI::FTP
276
274
  if (new_url.scheme == 'ftp' && !path.start_with?('/'))
277
275
  path.insert(0,'/')
@@ -22,16 +22,6 @@ module Spidr
22
22
 
23
23
  alias ok? is_ok?
24
24
 
25
- #
26
- # Determines if the response code is `308`.
27
- #
28
- # @return [Boolean]
29
- # Specifies whether the response code is `308`.
30
- #
31
- def timedout?
32
- code == 308
33
- end
34
-
35
25
  #
36
26
  # Determines if the response code is `400`.
37
27
  #
@@ -78,6 +68,18 @@ module Spidr
78
68
 
79
69
  alias missing? is_missing?
80
70
 
71
+ #
72
+ # Determines if the response code is `408`.
73
+ #
74
+ # @return [Boolean]
75
+ # Specifies whether the response code is `408`.
76
+ #
77
+ def is_timedout?
78
+ code == 408
79
+ end
80
+
81
+ alias timedout? is_timedout?
82
+
81
83
  #
82
84
  # Determines if the response code is `500`.
83
85
  #
data/lib/spidr/proxy.rb CHANGED
@@ -10,28 +10,20 @@ module Spidr
10
10
  #
11
11
  # Initializes the proxy.
12
12
  #
13
- # @param [Hash] attributes
14
- # Attributes for the proxy.
15
- #
16
- # @option attributes [String] :host
13
+ # @param [String] host
17
14
  # The host the proxy is running on.
18
15
  #
19
- # @option attributes [Integer] :port
16
+ # @param [Integer] port
20
17
  # The port the proxy is running on.
21
18
  #
22
- # @option attributes [String] :user
19
+ # @param [String] user
23
20
  # The user to authenticate as with the proxy.
24
21
  #
25
- # @option attributes [String] :password
22
+ # @param [String] password
26
23
  # The password to authenticate with.
27
24
  #
28
- def initialize(attributes={})
29
- super(
30
- attributes[:host],
31
- attributes.fetch(:port,DEFAULT_PORT),
32
- attributes[:user],
33
- attributes[:password]
34
- )
25
+ def initialize(host: nil, port: DEFAULT_PORT, user: nil, password: nil)
26
+ super(host,port,user,password)
35
27
  end
36
28
 
37
29
  #
data/lib/spidr/rules.rb CHANGED
@@ -14,21 +14,18 @@ module Spidr
14
14
  #
15
15
  # Creates a new Rules object.
16
16
  #
17
- # @param [Hash] options
18
- # Additional options.
19
- #
20
- # @option options [Array<String, Regexp, Proc>] :accept
17
+ # @param [Array<String, Regexp, Proc>, nil] accept
21
18
  # The patterns to accept data with.
22
19
  #
23
- # @option options [Array<String, Regexp, Proc>] :reject
20
+ # @param [Array<String, Regexp, Proc>, nil] reject
24
21
  # The patterns to reject data with.
25
22
  #
26
- def initialize(options={})
23
+ def initialize(accept: nil, reject: nil)
27
24
  @accept = []
28
25
  @reject = []
29
26
 
30
- @accept += options[:accept] if options[:accept]
31
- @reject += options[:reject] if options[:reject]
27
+ @accept += accept if accept
28
+ @reject += reject if reject
32
29
  end
33
30
 
34
31
  #
@@ -17,37 +17,39 @@ module Spidr
17
17
  #
18
18
  # Creates a new session cache.
19
19
  #
20
- # @param [Hash] options
21
- # Configuration options.
22
- #
23
- # @option [Hash] :proxy (Spidr.proxy)
20
+ # @param [Hash] proxy
24
21
  # Proxy options.
25
22
  #
26
- # @option [Integer] :open_timeout (Spidr.open_timeout)
27
- # Optional open timeout.
23
+ # @param [Integer] open_timeout
24
+ # Optional open connection timeout.
28
25
  #
29
- # @option [Integer] :ssl_timeout (Spidr.ssl_timeout)
30
- # Optional ssl timeout.
26
+ # @param [Integer] ssl_timeout
27
+ # Optional SSL connection timeout.
31
28
  #
32
- # @option [Integer] :read_timeout (Spidr.read_timeout)
29
+ # @param [Integer] read_timeout
33
30
  # Optional read timeout.
34
31
  #
35
- # @option [Integer] :continue_timeout (Spidr.continue_timeout)
32
+ # @param [Integer] continue_timeout
36
33
  # Optional `Continue` timeout.
37
34
  #
38
- # @option [Integer] :keep_alive_timeout (Spidr.keep_alive_timeout)
35
+ # @param [Integer] keep_alive_timeout
39
36
  # Optional `Keep-Alive` timeout.
40
37
  #
41
38
  # @since 0.6.0
42
39
  #
43
- def initialize(options={})
44
- @proxy = options.fetch(:proxy,Spidr.proxy)
40
+ def initialize(proxy: Spidr.proxy,
41
+ open_timeout: Spidr.open_timeout,
42
+ ssl_timeout: Spidr.ssl_timeout,
43
+ read_timeout: Spidr.read_timeout,
44
+ continue_timeout: Spidr.continue_timeout,
45
+ keep_alive_timeout: Spidr.keep_alive_timeout)
46
+ self.proxy = proxy
45
47
 
46
- @open_timeout = options.fetch(:open_timeout,Spidr.open_timeout)
47
- @ssl_timeout = options.fetch(:ssl_timeout,Spidr.ssl_timeout)
48
- @read_timeout = options.fetch(:read_timeout,Spidr.read_timeout)
49
- @continue_timeout = options.fetch(:continue_timeout,Spidr.continue_timeout)
50
- @keep_alive_timeout = options.fetch(:keep_alive_timeout,Spidr.keep_alive_timeout)
48
+ self.open_timeout = open_timeout
49
+ self.ssl_timeout = ssl_timeout
50
+ self.read_timeout = read_timeout
51
+ self.continue_timeout = continue_timeout
52
+ self.keep_alive_timeout = keep_alive_timeout
51
53
 
52
54
  @sessions = {}
53
55
  end
@@ -65,7 +67,7 @@ module Spidr
65
67
  #
66
68
  def active?(url)
67
69
  # normalize the url
68
- url = URI(url.to_s) unless url.kind_of?(URI)
70
+ url = URI(url)
69
71
 
70
72
  # session key
71
73
  key = key_for(url)
@@ -84,7 +86,7 @@ module Spidr
84
86
  #
85
87
  def [](url)
86
88
  # normalize the url
87
- url = URI(url.to_s) unless url.kind_of?(URI)
89
+ url = URI(url)
88
90
 
89
91
  # session key
90
92
  key = key_for(url)
@@ -127,7 +129,7 @@ module Spidr
127
129
  #
128
130
  def kill!(url)
129
131
  # normalize the url
130
- url = URI(url.to_s) unless url.kind_of?(URI)
132
+ url = URI(url)
131
133
 
132
134
  # session key
133
135
  key = key_for(url)
@@ -1,5 +1,7 @@
1
1
  require 'spidr/proxy'
2
2
 
3
+ require 'uri/http'
4
+
3
5
  module Spidr
4
6
  module Settings
5
7
  #
@@ -21,7 +23,7 @@ module Spidr
21
23
  #
22
24
  # Sets the proxy information used by Agent objects.
23
25
  #
24
- # @param [Spidr::Proxy, Hash, nil] new_proxy
26
+ # @param [Spidr::Proxy, Hash, URI::HTTP, String, nil] new_proxy
25
27
  # The new proxy information.
26
28
  #
27
29
  # @option new_proxy [String] :host
@@ -41,11 +43,23 @@ module Spidr
41
43
  #
42
44
  def proxy=(new_proxy)
43
45
  @proxy = case new_proxy
44
- when Spidr::Proxy then new_proxy
45
- when Hash then Spidr::Proxy.new(new_proxy)
46
- when nil then Spidr::Proxy.new
46
+ when Spidr::Proxy
47
+ new_proxy
48
+ when Hash
49
+ Spidr::Proxy.new(**new_proxy)
50
+ when String, URI::HTTP
51
+ proxy_uri = URI(new_proxy)
52
+
53
+ Spidr::Proxy.new(
54
+ host: proxy_uri.host,
55
+ port: proxy_uri.port,
56
+ user: proxy_uri.user,
57
+ password: proxy_uri.password
58
+ )
59
+ when nil
60
+ Spidr::Proxy.new
47
61
  else
48
- raise(TypeError,"#{self.class}#{__method__} only accepts Proxy, Hash or nil")
62
+ raise(TypeError,"#{self.class}#{__method__} only accepts Spidr::Proxy, URI::HTTP, Hash, or nil")
49
63
  end
50
64
  end
51
65
 
data/lib/spidr/spidr.rb CHANGED
@@ -16,6 +16,7 @@ module Spidr
16
16
  # @since 0.5.0
17
17
  #
18
18
  def self.robots?
19
+ @robots ||= false
19
20
  @robots
20
21
  end
21
22
 
@@ -35,22 +36,31 @@ module Spidr
35
36
  #
36
37
  # @see Agent.start_at
37
38
  #
38
- def self.start_at(url,options={},&block)
39
- Agent.start_at(url,options,&block)
39
+ def self.start_at(url,**kwargs,&block)
40
+ Agent.start_at(url,**kwargs,&block)
40
41
  end
41
42
 
42
43
  #
43
44
  # @see Agent.host
44
45
  #
45
- def self.host(name,options={},&block)
46
- Agent.host(name,options,&block)
46
+ def self.host(name,**kwargs,&block)
47
+ Agent.host(name,**kwargs,&block)
48
+ end
49
+
50
+ #
51
+ # @see Agent.domain
52
+ #
53
+ # @since 0.7.0
54
+ #
55
+ def self.domain(name,options={},&block)
56
+ Agent.domain(name,options,&block)
47
57
  end
48
58
 
49
59
  #
50
60
  # @see Agent.site
51
61
  #
52
- def self.site(url,options={},&block)
53
- Agent.site(url,options,&block)
62
+ def self.site(url,**kwargs,&block)
63
+ Agent.site(url,**kwargs,&block)
54
64
  end
55
65
 
56
66
  #
data/lib/spidr/version.rb CHANGED
@@ -1,4 +1,4 @@
1
1
  module Spidr
2
2
  # Spidr version
3
- VERSION = '0.6.0'
3
+ VERSION = '0.7.0'
4
4
  end