spidr 0.1.3 → 0.1.4

This diff shows the changes between two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in the respective public registry.
data/History.txt CHANGED
@@ -1,3 +1,7 @@
1
+ === 0.1.4 / 2009-01-15
2
+
3
+ * Use Nokogiri for HTML and XML parsing.
4
+
1
5
  === 0.1.3 / 2009-01-10
2
6
 
3
7
  * Added the :host options to Spidr::Agent#initialize.
data/README.txt CHANGED
@@ -25,7 +25,7 @@ and easy to use.
25
25
 
26
26
  == REQUIREMENTS:
27
27
 
28
- * Hpricot
28
+ * nokogiri
29
29
 
30
30
  == INSTALL:
31
31
 
data/Rakefile CHANGED
@@ -10,7 +10,7 @@ Hoe.new('spidr', Spidr::VERSION) do |p|
10
10
  p.rubyforge_name = 'spidr'
11
11
  p.developer('Postmodern', 'postmodern.mod3@gmail.com')
12
12
  p.remote_rdoc_dir = 'docs'
13
- p.extra_deps = ['hpricot']
13
+ p.extra_deps = ['nokogiri']
14
14
  end
15
15
 
16
16
  # vim: syntax=Ruby
data/lib/spidr/agent.rb CHANGED
@@ -3,7 +3,6 @@ require 'spidr/page'
3
3
  require 'spidr/spidr'
4
4
 
5
5
  require 'net/http'
6
- require 'hpricot'
7
6
 
8
7
  module Spidr
9
8
  class Agent
data/lib/spidr/page.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  require 'uri'
2
- require 'hpricot'
2
+ require 'nokogiri'
3
3
 
4
4
  module Spidr
5
5
  class Page
@@ -185,12 +185,17 @@ module Spidr
185
185
  end
186
186
 
187
187
  #
188
- # Returns an Hpricot::Doc if the page represents a HTML document,
189
- # returns +nil+ otherwise.
188
+ # If the page has a <tt>text/html</tt> content-type, a
189
+ # Nokogiri::HTML::Document object will be returned. If the page has a
190
+ # <tt>text/xml</tt> content-type, a Nokogiri::XML::Document object
191
+ # will be returned. Other content-types will cause +nil+ to be
192
+ # returned.
190
193
  #
191
194
  def doc
192
195
  if html?
193
- return @doc ||= Hpricot(body)
196
+ return @doc ||= Nokogiri::HTML(body)
197
+ elsif xml?
198
+ return @doc ||= Nokogiri::XML(body)
194
199
  end
195
200
  end
196
201
 
@@ -201,8 +206,8 @@ module Spidr
201
206
  urls = []
202
207
 
203
208
  if html?
204
- doc.search('a[@href]') do |a|
205
- url = a.attributes['href'].strip
209
+ self.doc.search('a[@href]').each do |a|
210
+ url = a.get_attribute('href')
206
211
 
207
212
  urls << url unless url.empty?
208
213
  end
data/lib/spidr/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Spidr
2
- VERSION = '0.1.3'
2
+ VERSION = '0.1.4'
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: spidr
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.3
4
+ version: 0.1.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Postmodern
@@ -9,11 +9,11 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2009-01-10 00:00:00 -08:00
12
+ date: 2009-01-15 00:00:00 -08:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
16
- name: hpricot
16
+ name: nokogiri
17
17
  type: :runtime
18
18
  version_requirement:
19
19
  version_requirements: !ruby/object:Gem::Requirement