spidr 0.1.3 → 0.1.4

Sign up to get free protection for your applications and to get access to all the features.
data/History.txt CHANGED
@@ -1,3 +1,7 @@
1
+ === 0.1.4 / 2009-01-15
2
+
3
+ * Use Nokogiri for HTML and XML parsing.
4
+
1
5
  === 0.1.3 / 2009-01-10
2
6
 
3
7
  * Added the :host options to Spidr::Agent#initialize.
data/README.txt CHANGED
@@ -25,7 +25,7 @@ and easy to use.
25
25
 
26
26
  == REQUIREMENTS:
27
27
 
28
- * Hpricot
28
+ * nokogiri
29
29
 
30
30
  == INSTALL:
31
31
 
data/Rakefile CHANGED
@@ -10,7 +10,7 @@ Hoe.new('spidr', Spidr::VERSION) do |p|
10
10
  p.rubyforge_name = 'spidr'
11
11
  p.developer('Postmodern', 'postmodern.mod3@gmail.com')
12
12
  p.remote_rdoc_dir = 'docs'
13
- p.extra_deps = ['hpricot']
13
+ p.extra_deps = ['nokogiri']
14
14
  end
15
15
 
16
16
  # vim: syntax=Ruby
data/lib/spidr/agent.rb CHANGED
@@ -3,7 +3,6 @@ require 'spidr/page'
3
3
  require 'spidr/spidr'
4
4
 
5
5
  require 'net/http'
6
- require 'hpricot'
7
6
 
8
7
  module Spidr
9
8
  class Agent
data/lib/spidr/page.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  require 'uri'
2
- require 'hpricot'
2
+ require 'nokogiri'
3
3
 
4
4
  module Spidr
5
5
  class Page
@@ -185,12 +185,17 @@ module Spidr
185
185
  end
186
186
 
187
187
  #
188
- # Returns an Hpricot::Doc if the page represents a HTML document,
189
- # returns +nil+ otherwise.
188
+ # If the page has a <tt>text/html</tt> content-type, a
189
+ # Nokogiri::HTML::Document object will be returned. If the page has a
190
+ # <tt>text/xml</tt> content-type, a Nokogiri::XML::Document object
191
+ # will be returned. Other content-types will cause +nil+ to be
192
+ # returned.
190
193
  #
191
194
  def doc
192
195
  if html?
193
- return @doc ||= Hpricot(body)
196
+ return @doc ||= Nokogiri::HTML(body)
197
+ elsif xml?
198
+ return @doc ||= Nokogiri::XML(body)
194
199
  end
195
200
  end
196
201
 
@@ -201,8 +206,8 @@ module Spidr
201
206
  urls = []
202
207
 
203
208
  if html?
204
- doc.search('a[@href]') do |a|
205
- url = a.attributes['href'].strip
209
+ self.doc.search('a[@href]').each do |a|
210
+ url = a.get_attribute('href')
206
211
 
207
212
  urls << url unless url.empty?
208
213
  end
data/lib/spidr/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Spidr
2
- VERSION = '0.1.3'
2
+ VERSION = '0.1.4'
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: spidr
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.3
4
+ version: 0.1.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Postmodern
@@ -9,11 +9,11 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2009-01-10 00:00:00 -08:00
12
+ date: 2009-01-15 00:00:00 -08:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
16
- name: hpricot
16
+ name: nokogiri
17
17
  type: :runtime
18
18
  version_requirement:
19
19
  version_requirements: !ruby/object:Gem::Requirement