spidr 0.1.4 → 0.1.5
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +5 -0
- data/README.txt +2 -1
- data/lib/spidr/page.rb +26 -18
- data/lib/spidr/version.rb +1 -1
- data/static/course/specs.json +1 -1
- data/tasks/course.rb +7 -9
- metadata +3 -3
data/History.txt
CHANGED
data/README.txt
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
= Spidr
|
2
2
|
|
3
3
|
* http://spidr.rubyforge.org/
|
4
|
+
* http://github.com/postmodern/spidr/
|
4
5
|
* Postmodern (postmodern.mod3 at gmail.com)
|
5
6
|
|
6
7
|
== DESCRIPTION:
|
@@ -25,7 +26,7 @@ and easy to use.
|
|
25
26
|
|
26
27
|
== REQUIREMENTS:
|
27
28
|
|
28
|
-
* nokogiri
|
29
|
+
* {nokogiri}[http://nokogiri.rubyforge.org/]
|
29
30
|
|
30
31
|
== INSTALL:
|
31
32
|
|
data/lib/spidr/page.rb
CHANGED
@@ -192,10 +192,14 @@ module Spidr
|
|
192
192
|
# returned.
|
193
193
|
#
|
194
194
|
def doc
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
195
|
+
begin
|
196
|
+
if html?
|
197
|
+
return @doc ||= Nokogiri::HTML(body)
|
198
|
+
elsif xml?
|
199
|
+
return @doc ||= Nokogiri::XML(body)
|
200
|
+
end
|
201
|
+
rescue
|
202
|
+
return nil
|
199
203
|
end
|
200
204
|
end
|
201
205
|
|
@@ -205,7 +209,7 @@ module Spidr
|
|
205
209
|
def links
|
206
210
|
urls = []
|
207
211
|
|
208
|
-
if html?
|
212
|
+
if (html? && self.doc)
|
209
213
|
self.doc.search('a[@href]').each do |a|
|
210
214
|
url = a.get_attribute('href')
|
211
215
|
|
@@ -220,7 +224,7 @@ module Spidr
|
|
220
224
|
# Returns all links from the HTML page as absolute URLs.
|
221
225
|
#
|
222
226
|
def urls
|
223
|
-
links.map { |link| to_absolute(link) }
|
227
|
+
links.map { |link| to_absolute(link) }.compact
|
224
228
|
end
|
225
229
|
|
226
230
|
protected
|
@@ -233,20 +237,24 @@ module Spidr
|
|
233
237
|
# clean the link
|
234
238
|
link = URI.encode(link.to_s.gsub(/#[a-zA-Z0-9_-]*$/,''))
|
235
239
|
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
if absolute.path
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
240
|
+
begin
|
241
|
+
relative = URI(link)
|
242
|
+
absolute = @url.merge(relative)
|
243
|
+
|
244
|
+
if absolute.path
|
245
|
+
if absolute.path.empty?
|
246
|
+
# default the absolute path to '/'
|
247
|
+
absolute.path = '/'
|
248
|
+
else
|
249
|
+
# make sure the path does not contain any .. or . directories.
|
250
|
+
absolute.path = File.expand_path(absolute.path)
|
251
|
+
end
|
246
252
|
end
|
247
|
-
end
|
248
253
|
|
249
|
-
|
254
|
+
return absolute
|
255
|
+
rescue URI::InvalidURIError => e
|
256
|
+
return nil
|
257
|
+
end
|
250
258
|
end
|
251
259
|
|
252
260
|
#
|
data/lib/spidr/version.rb
CHANGED
data/static/course/specs.json
CHANGED
@@ -1 +1 @@
|
|
1
|
-
[{"
|
1
|
+
[{"behavior":"ignore","link":"javascript:fail();","url":"javascript:fail();","message":"should ignore links beginning with \"javascript:\"","example":"<a href=\"javascript:fail();\">should ignore links beginning with \"javascript:\"<\/a>"},{"behavior":"ignore","link":"#","url":"http:\/\/spidr.rubyforge.org\/course\/javascript\/%23","message":"should ignore links with an onclick attribute and a href pointing to the page.","example":"<a href=\"#\" onclick=\"fail();\">should ignore links with an onclick attribute and a href pointing to the page.<\/a>"},{"behavior":"follow","link":"next.html","url":"http:\/\/spidr.rubyforge.org\/course\/loop\/next.html","message":"should follow links pointing to other pages","example":"<a href=\"next.html\">should follow links pointing to other pages<\/a>"},{"behavior":"nofollow","link":"start.html","url":"http:\/\/spidr.rubyforge.org\/course\/loop\/start.html","message":"should not follow links pointing to the current page","example":"<a href=\"start.html\">should not follow links pointing to the current page<\/a>"},{"behavior":"nofollow","link":"start.html","url":"http:\/\/spidr.rubyforge.org\/course\/loop\/start.html","message":"should not follow links to previously visited pages","example":"<a href=\"start.html\">should not follow links to previously visited pages<\/a>"},{"behavior":"follow","link":"normal.html","url":"http:\/\/spidr.rubyforge.org\/course\/relative\/normal.html","message":"should follow relative links","example":"<a href=\"normal.html\">should follow relative links<\/a>"},{"behavior":"follow","link":".\/current_directory.html","url":"http:\/\/spidr.rubyforge.org\/course\/relative\/current_directory.html","message":"should follow relative links to files in the current directory","example":"<a href=\".\/current_directory.html\">should follow relative links to files in the current 
directory<\/a>"},{"behavior":"follow","link":"..\/relative\/same_directory.html","url":"http:\/\/spidr.rubyforge.org\/course\/relative\/same_directory.html","message":"should follow links that transverse directories","example":"<a href=\"..\/relative\/same_directory.html\">should follow links that transverse directories<\/a>"},{"behavior":"ignore","link":"#","url":"http:\/\/spidr.rubyforge.org\/course\/relative\/%23","message":"should ignore in-page links","example":"<a href=\"#\">should ignore in-page links<\/a>"},{"behavior":"nofollow","link":"","url":"http:\/\/spidr.rubyforge.org\/course\/empty\/start.html","message":"should not follow links with no href attributes","example":"<a>should not follow links with no href attributes<\/a>"},{"behavior":"nofollow","link":"","url":"http:\/\/spidr.rubyforge.org\/course\/empty\/start.html","message":"should not follow links with empty href attributes","example":"<a href=\"\">should not follow links with empty href attributes<\/a>"},{"behavior":"ignore","link":" ","url":"http:\/\/spidr.rubyforge.org\/course\/empty\/%20","message":"should ignore links with blank href attributes","example":"<a href=\"\">should ignore links with blank href attributes<\/a>"},{"behavior":"follow","link":"http:\/\/spidr.rubyforge.org\/course\/remote\/next.html","url":"http:\/\/spidr.rubyforge.org\/course\/remote\/next.html","message":"should follow remote links to unvisited pages","example":"<a href=\"http:\/\/spidr.rubyforge.org\/course\/remote\/next.html\">should follow remote links to unvisited pages<\/a>"},{"behavior":"nofollow","link":"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html","url":"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html","message":"should not follow remote links to the same page","example":"<a href=\"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html\">should not follow remote links to the same 
page<\/a>"},{"behavior":"nofollow","link":"http:\/\/spidr.rubyforge.org\/course\/loop\/..\/remote\/start.html","url":"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html","message":"should not follow remote links with a relative path to the same page","example":"<a href=\"http:\/\/spidr.rubyforge.org\/course\/loop\/..\/remote\/start.html\">should not follow remote links with a relative path to the same page<\/a>"},{"behavior":"follow","link":"\/course\/absolute\/next.html","url":"http:\/\/spidr.rubyforge.org\/course\/absolute\/next.html","message":"should follow absolute links to unvisited pages","example":"<a href=\"\/course\/absolute\/next.html\">should follow absolute links to unvisited pages<\/a>"},{"behavior":"nofollow","link":"\/course\/absolute\/start.html","url":"http:\/\/spidr.rubyforge.org\/course\/absolute\/start.html","message":"should not follow absolute links to the current page","example":"<a href=\"\/course\/absolute\/start.html\">should not follow absolute links to the current page<\/a>"}]
|
data/tasks/course.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
require '
|
1
|
+
require 'nokogiri'
|
2
2
|
require 'json'
|
3
3
|
|
4
4
|
namespace :course do
|
@@ -14,13 +14,11 @@ namespace :course do
|
|
14
14
|
specs = []
|
15
15
|
|
16
16
|
Dir[File.join(COURSE_DIR,'**','*.html')].each do |page|
|
17
|
-
doc =
|
17
|
+
doc = Nokogiri::HTML(open(page))
|
18
18
|
page_url = COURSE_URL.merge(page.sub(STATIC_DIR,''))
|
19
19
|
|
20
|
-
link_to_spec = lambda { |
|
21
|
-
|
22
|
-
|
23
|
-
relative_url = link['href'].to_s
|
20
|
+
link_to_spec = lambda { |link,spec_data|
|
21
|
+
relative_url = (link.get_attribute('href') || '')
|
24
22
|
absolute_url = page_url.merge(URI.encode(relative_url))
|
25
23
|
|
26
24
|
if absolute_url.path
|
@@ -35,15 +33,15 @@ namespace :course do
|
|
35
33
|
)
|
36
34
|
}
|
37
35
|
|
38
|
-
doc.search('.follow
|
36
|
+
doc.search('.follow//a').each do |follow|
|
39
37
|
specs << link_to_spec.call(follow, :behavior => :follow)
|
40
38
|
end
|
41
39
|
|
42
|
-
doc.search('.nofollow
|
40
|
+
doc.search('.nofollow//a').each do |nofollow|
|
43
41
|
specs << link_to_spec.call(nofollow, :behavior => :nofollow)
|
44
42
|
end
|
45
43
|
|
46
|
-
doc.search('.ignore
|
44
|
+
doc.search('.ignore//a').each do |ignore|
|
47
45
|
specs << link_to_spec.call(ignore, :behavior => :ignore)
|
48
46
|
end
|
49
47
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: spidr
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Postmodern
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2009-
|
12
|
+
date: 2009-03-22 00:00:00 -07:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
@@ -30,7 +30,7 @@ dependencies:
|
|
30
30
|
requirements:
|
31
31
|
- - ">="
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version: 1.
|
33
|
+
version: 1.11.0
|
34
34
|
version:
|
35
35
|
description: Spidr is a versatile Ruby web spidering library that can spider a site, multiple domains, certain links or infinitely. Spidr is designed to be fast and easy to use.
|
36
36
|
email:
|