spidr 0.1.4 → 0.1.5

Sign up to get free protection for your applications and to get access to all the features.
data/History.txt CHANGED
@@ -1,3 +1,8 @@
1
+ === 0.1.5 / 2009-03-22
2
+
3
+ * Catch malformed URIs in Page#to_absolute and return +nil+.
4
+ * Filter out +nil+ URIs in Page#urls.
5
+
1
6
  === 0.1.4 / 2009-01-15
2
7
 
3
8
  * Use Nokogiri for HTML and XML parsing.
data/README.txt CHANGED
@@ -1,6 +1,7 @@
1
1
  = Spidr
2
2
 
3
3
  * http://spidr.rubyforge.org/
4
+ * http://github.com/postmodern/spidr/
4
5
  * Postmodern (postmodern.mod3 at gmail.com)
5
6
 
6
7
  == DESCRIPTION:
@@ -25,7 +26,7 @@ and easy to use.
25
26
 
26
27
  == REQUIREMENTS:
27
28
 
28
- * nokogiri
29
+ * {nokogiri}[http://nokogiri.rubyforge.org/]
29
30
 
30
31
  == INSTALL:
31
32
 
data/lib/spidr/page.rb CHANGED
@@ -192,10 +192,14 @@ module Spidr
192
192
  # returned.
193
193
  #
194
194
  def doc
195
- if html?
196
- return @doc ||= Nokogiri::HTML(body)
197
- elsif xml?
198
- return @doc ||= Nokogiri::XML(body)
195
+ begin
196
+ if html?
197
+ return @doc ||= Nokogiri::HTML(body)
198
+ elsif xml?
199
+ return @doc ||= Nokogiri::XML(body)
200
+ end
201
+ rescue
202
+ return nil
199
203
  end
200
204
  end
201
205
 
@@ -205,7 +209,7 @@ module Spidr
205
209
  def links
206
210
  urls = []
207
211
 
208
- if html?
212
+ if (html? && self.doc)
209
213
  self.doc.search('a[@href]').each do |a|
210
214
  url = a.get_attribute('href')
211
215
 
@@ -220,7 +224,7 @@ module Spidr
220
224
  # Returns all links from the HTML page as absolute URLs.
221
225
  #
222
226
  def urls
223
- links.map { |link| to_absolute(link) }
227
+ links.map { |link| to_absolute(link) }.compact
224
228
  end
225
229
 
226
230
  protected
@@ -233,20 +237,24 @@ module Spidr
233
237
  # clean the link
234
238
  link = URI.encode(link.to_s.gsub(/#[a-zA-Z0-9_-]*$/,''))
235
239
 
236
- relative = URI(link)
237
- absolute = @url.merge(relative)
238
-
239
- if absolute.path
240
- if absolute.path.empty?
241
- # default the absolute path to '/'
242
- absolute.path = '/'
243
- else
244
- # make sure the path does not contain any .. or . directories.
245
- absolute.path = File.expand_path(absolute.path)
240
+ begin
241
+ relative = URI(link)
242
+ absolute = @url.merge(relative)
243
+
244
+ if absolute.path
245
+ if absolute.path.empty?
246
+ # default the absolute path to '/'
247
+ absolute.path = '/'
248
+ else
249
+ # make sure the path does not contain any .. or . directories.
250
+ absolute.path = File.expand_path(absolute.path)
251
+ end
246
252
  end
247
- end
248
253
 
249
- return absolute
254
+ return absolute
255
+ rescue URI::InvalidURIError => e
256
+ return nil
257
+ end
250
258
  end
251
259
 
252
260
  #
data/lib/spidr/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Spidr
2
- VERSION = '0.1.4'
2
+ VERSION = '0.1.5'
3
3
  end
@@ -1 +1 @@
1
- [{"url":"javascript:fail();","link":"javascript:fail();","example":"<a href=\"javascript:fail();\">should ignore links beginning with \"javascript:\"<\/a>","message":"should ignore links beginning with \"javascript:\"","behavior":"ignore"},{"url":"http:\/\/spidr.rubyforge.org\/course\/javascript\/%23","link":"#","example":"<a href=\"#\" onclick=\"fail();\">should ignore links with an onclick attribute and a href pointing to the page.<\/a>","message":"should ignore links with an onclick attribute and a href pointing to the page.","behavior":"ignore"},{"url":"http:\/\/spidr.rubyforge.org\/course\/loop\/next.html","link":"next.html","example":"<a href=\"next.html\">should follow links pointing to other pages<\/a>","message":"should follow links pointing to other pages","behavior":"follow"},{"url":"http:\/\/spidr.rubyforge.org\/course\/loop\/start.html","link":"start.html","example":"<a href=\"start.html\">should not follow links pointing to the current page<\/a>","message":"should not follow links pointing to the current page","behavior":"nofollow"},{"url":"http:\/\/spidr.rubyforge.org\/course\/loop\/start.html","link":"start.html","example":"<a href=\"start.html\">should not follow links to previously visited pages<\/a>","message":"should not follow links to previously visited pages","behavior":"nofollow"},{"url":"http:\/\/spidr.rubyforge.org\/course\/relative\/normal.html","link":"normal.html","example":"<a href=\"normal.html\">should follow relative links<\/a>","message":"should follow relative links","behavior":"follow"},{"url":"http:\/\/spidr.rubyforge.org\/course\/relative\/current_directory.html","link":".\/current_directory.html","example":"<a href=\".\/current_directory.html\">should follow relative links to files in the current directory<\/a>","message":"should follow relative links to files in the current 
directory","behavior":"follow"},{"url":"http:\/\/spidr.rubyforge.org\/course\/relative\/same_directory.html","link":"..\/relative\/same_directory.html","example":"<a href=\"..\/relative\/same_directory.html\">should follow links that transverse directories<\/a>","message":"should follow links that transverse directories","behavior":"follow"},{"url":"http:\/\/spidr.rubyforge.org\/course\/relative\/%23","link":"#","example":"<a href=\"#\">should ignore in-page links<\/a>","message":"should ignore in-page links","behavior":"ignore"},{"url":"http:\/\/spidr.rubyforge.org\/course\/empty\/start.html","link":"","example":"<a>should not follow links with no href attributes<\/a>","message":"should not follow links with no href attributes","behavior":"nofollow"},{"url":"http:\/\/spidr.rubyforge.org\/course\/empty\/start.html","link":"","example":"<a href=\"\">should not follow links with empty href attributes<\/a>","message":"should not follow links with empty href attributes","behavior":"nofollow"},{"url":"http:\/\/spidr.rubyforge.org\/course\/empty\/%20","link":" ","example":"<a href=\" \">should ignore links with blank href attributes<\/a>","message":"should ignore links with blank href attributes","behavior":"ignore"},{"url":"http:\/\/spidr.rubyforge.org\/course\/remote\/next.html","link":"http:\/\/spidr.rubyforge.org\/course\/remote\/next.html","example":"<a href=\"http:\/\/spidr.rubyforge.org\/course\/remote\/next.html\">should follow remote links to unvisited pages<\/a>","message":"should follow remote links to unvisited pages","behavior":"follow"},{"url":"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html","link":"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html","example":"<a href=\"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html\">should not follow remote links to the same page<\/a>","message":"should not follow remote links to the same 
page","behavior":"nofollow"},{"url":"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html","link":"http:\/\/spidr.rubyforge.org\/course\/loop\/..\/remote\/start.html","example":"<a href=\"http:\/\/spidr.rubyforge.org\/course\/loop\/..\/remote\/start.html\">should not follow remote links with a relative path to the same page<\/a>","message":"should not follow remote links with a relative path to the same page","behavior":"nofollow"},{"url":"http:\/\/spidr.rubyforge.org\/course\/absolute\/next.html","link":"\/course\/absolute\/next.html","example":"<a href=\"\/course\/absolute\/next.html\">should follow absolute links to unvisited pages<\/a>","message":"should follow absolute links to unvisited pages","behavior":"follow"},{"url":"http:\/\/spidr.rubyforge.org\/course\/absolute\/start.html","link":"\/course\/absolute\/start.html","example":"<a href=\"\/course\/absolute\/start.html\">should not follow absolute links to the current page<\/a>","message":"should not follow absolute links to the current page","behavior":"nofollow"}]
1
+ [{"behavior":"ignore","link":"javascript:fail();","url":"javascript:fail();","message":"should ignore links beginning with \"javascript:\"","example":"<a href=\"javascript:fail();\">should ignore links beginning with \"javascript:\"<\/a>"},{"behavior":"ignore","link":"#","url":"http:\/\/spidr.rubyforge.org\/course\/javascript\/%23","message":"should ignore links with an onclick attribute and a href pointing to the page.","example":"<a href=\"#\" onclick=\"fail();\">should ignore links with an onclick attribute and a href pointing to the page.<\/a>"},{"behavior":"follow","link":"next.html","url":"http:\/\/spidr.rubyforge.org\/course\/loop\/next.html","message":"should follow links pointing to other pages","example":"<a href=\"next.html\">should follow links pointing to other pages<\/a>"},{"behavior":"nofollow","link":"start.html","url":"http:\/\/spidr.rubyforge.org\/course\/loop\/start.html","message":"should not follow links pointing to the current page","example":"<a href=\"start.html\">should not follow links pointing to the current page<\/a>"},{"behavior":"nofollow","link":"start.html","url":"http:\/\/spidr.rubyforge.org\/course\/loop\/start.html","message":"should not follow links to previously visited pages","example":"<a href=\"start.html\">should not follow links to previously visited pages<\/a>"},{"behavior":"follow","link":"normal.html","url":"http:\/\/spidr.rubyforge.org\/course\/relative\/normal.html","message":"should follow relative links","example":"<a href=\"normal.html\">should follow relative links<\/a>"},{"behavior":"follow","link":".\/current_directory.html","url":"http:\/\/spidr.rubyforge.org\/course\/relative\/current_directory.html","message":"should follow relative links to files in the current directory","example":"<a href=\".\/current_directory.html\">should follow relative links to files in the current 
directory<\/a>"},{"behavior":"follow","link":"..\/relative\/same_directory.html","url":"http:\/\/spidr.rubyforge.org\/course\/relative\/same_directory.html","message":"should follow links that transverse directories","example":"<a href=\"..\/relative\/same_directory.html\">should follow links that transverse directories<\/a>"},{"behavior":"ignore","link":"#","url":"http:\/\/spidr.rubyforge.org\/course\/relative\/%23","message":"should ignore in-page links","example":"<a href=\"#\">should ignore in-page links<\/a>"},{"behavior":"nofollow","link":"","url":"http:\/\/spidr.rubyforge.org\/course\/empty\/start.html","message":"should not follow links with no href attributes","example":"<a>should not follow links with no href attributes<\/a>"},{"behavior":"nofollow","link":"","url":"http:\/\/spidr.rubyforge.org\/course\/empty\/start.html","message":"should not follow links with empty href attributes","example":"<a href=\"\">should not follow links with empty href attributes<\/a>"},{"behavior":"ignore","link":" ","url":"http:\/\/spidr.rubyforge.org\/course\/empty\/%20","message":"should ignore links with blank href attributes","example":"<a href=\"\">should ignore links with blank href attributes<\/a>"},{"behavior":"follow","link":"http:\/\/spidr.rubyforge.org\/course\/remote\/next.html","url":"http:\/\/spidr.rubyforge.org\/course\/remote\/next.html","message":"should follow remote links to unvisited pages","example":"<a href=\"http:\/\/spidr.rubyforge.org\/course\/remote\/next.html\">should follow remote links to unvisited pages<\/a>"},{"behavior":"nofollow","link":"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html","url":"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html","message":"should not follow remote links to the same page","example":"<a href=\"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html\">should not follow remote links to the same 
page<\/a>"},{"behavior":"nofollow","link":"http:\/\/spidr.rubyforge.org\/course\/loop\/..\/remote\/start.html","url":"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html","message":"should not follow remote links with a relative path to the same page","example":"<a href=\"http:\/\/spidr.rubyforge.org\/course\/loop\/..\/remote\/start.html\">should not follow remote links with a relative path to the same page<\/a>"},{"behavior":"follow","link":"\/course\/absolute\/next.html","url":"http:\/\/spidr.rubyforge.org\/course\/absolute\/next.html","message":"should follow absolute links to unvisited pages","example":"<a href=\"\/course\/absolute\/next.html\">should follow absolute links to unvisited pages<\/a>"},{"behavior":"nofollow","link":"\/course\/absolute\/start.html","url":"http:\/\/spidr.rubyforge.org\/course\/absolute\/start.html","message":"should not follow absolute links to the current page","example":"<a href=\"\/course\/absolute\/start.html\">should not follow absolute links to the current page<\/a>"}]
data/tasks/course.rb CHANGED
@@ -1,4 +1,4 @@
1
- require 'hpricot'
1
+ require 'nokogiri'
2
2
  require 'json'
3
3
 
4
4
  namespace :course do
@@ -14,13 +14,11 @@ namespace :course do
14
14
  specs = []
15
15
 
16
16
  Dir[File.join(COURSE_DIR,'**','*.html')].each do |page|
17
- doc = Hpricot(open(page))
17
+ doc = Nokogiri::HTML(open(page))
18
18
  page_url = COURSE_URL.merge(page.sub(STATIC_DIR,''))
19
19
 
20
- link_to_spec = lambda { |container,spec_data|
21
- link = container.at('a')
22
-
23
- relative_url = link['href'].to_s
20
+ link_to_spec = lambda { |link,spec_data|
21
+ relative_url = (link.get_attribute('href') || '')
24
22
  absolute_url = page_url.merge(URI.encode(relative_url))
25
23
 
26
24
  if absolute_url.path
@@ -35,15 +33,15 @@ namespace :course do
35
33
  )
36
34
  }
37
35
 
38
- doc.search('.follow[a]') do |follow|
36
+ doc.search('.follow//a').each do |follow|
39
37
  specs << link_to_spec.call(follow, :behavior => :follow)
40
38
  end
41
39
 
42
- doc.search('.nofollow[a]') do |nofollow|
40
+ doc.search('.nofollow//a').each do |nofollow|
43
41
  specs << link_to_spec.call(nofollow, :behavior => :nofollow)
44
42
  end
45
43
 
46
- doc.search('.ignore[a]') do |ignore|
44
+ doc.search('.ignore//a').each do |ignore|
47
45
  specs << link_to_spec.call(ignore, :behavior => :ignore)
48
46
  end
49
47
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: spidr
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.4
4
+ version: 0.1.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Postmodern
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2009-01-15 00:00:00 -08:00
12
+ date: 2009-03-22 00:00:00 -07:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
@@ -30,7 +30,7 @@ dependencies:
30
30
  requirements:
31
31
  - - ">="
32
32
  - !ruby/object:Gem::Version
33
- version: 1.8.2
33
+ version: 1.11.0
34
34
  version:
35
35
  description: Spidr is a versatile Ruby web spidering library that can spider a site, multiple domains, certain links or infinitely. Spidr is designed to be fast and easy to use.
36
36
  email: