spidr 0.1.4 → 0.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +5 -0
- data/README.txt +2 -1
- data/lib/spidr/page.rb +26 -18
- data/lib/spidr/version.rb +1 -1
- data/static/course/specs.json +1 -1
- data/tasks/course.rb +7 -9
- metadata +3 -3
data/History.txt
CHANGED
data/README.txt
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
= Spidr
|
2
2
|
|
3
3
|
* http://spidr.rubyforge.org/
|
4
|
+
* http://github.com/postmodern/spidr/
|
4
5
|
* Postmodern (postmodern.mod3 at gmail.com)
|
5
6
|
|
6
7
|
== DESCRIPTION:
|
@@ -25,7 +26,7 @@ and easy to use.
|
|
25
26
|
|
26
27
|
== REQUIREMENTS:
|
27
28
|
|
28
|
-
* nokogiri
|
29
|
+
* {nokogiri}[http://nokogiri.rubyforge.org/]
|
29
30
|
|
30
31
|
== INSTALL:
|
31
32
|
|
data/lib/spidr/page.rb
CHANGED
@@ -192,10 +192,14 @@ module Spidr
|
|
192
192
|
# returned.
|
193
193
|
#
|
194
194
|
def doc
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
195
|
+
begin
|
196
|
+
if html?
|
197
|
+
return @doc ||= Nokogiri::HTML(body)
|
198
|
+
elsif xml?
|
199
|
+
return @doc ||= Nokogiri::XML(body)
|
200
|
+
end
|
201
|
+
rescue
|
202
|
+
return nil
|
199
203
|
end
|
200
204
|
end
|
201
205
|
|
@@ -205,7 +209,7 @@ module Spidr
|
|
205
209
|
def links
|
206
210
|
urls = []
|
207
211
|
|
208
|
-
if html?
|
212
|
+
if (html? && self.doc)
|
209
213
|
self.doc.search('a[@href]').each do |a|
|
210
214
|
url = a.get_attribute('href')
|
211
215
|
|
@@ -220,7 +224,7 @@ module Spidr
|
|
220
224
|
# Returns all links from the HTML page as absolute URLs.
|
221
225
|
#
|
222
226
|
def urls
|
223
|
-
links.map { |link| to_absolute(link) }
|
227
|
+
links.map { |link| to_absolute(link) }.compact
|
224
228
|
end
|
225
229
|
|
226
230
|
protected
|
@@ -233,20 +237,24 @@ module Spidr
|
|
233
237
|
# clean the link
|
234
238
|
link = URI.encode(link.to_s.gsub(/#[a-zA-Z0-9_-]*$/,''))
|
235
239
|
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
if absolute.path
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
240
|
+
begin
|
241
|
+
relative = URI(link)
|
242
|
+
absolute = @url.merge(relative)
|
243
|
+
|
244
|
+
if absolute.path
|
245
|
+
if absolute.path.empty?
|
246
|
+
# default the absolute path to '/'
|
247
|
+
absolute.path = '/'
|
248
|
+
else
|
249
|
+
# make sure the path does not contain any .. or . directories.
|
250
|
+
absolute.path = File.expand_path(absolute.path)
|
251
|
+
end
|
246
252
|
end
|
247
|
-
end
|
248
253
|
|
249
|
-
|
254
|
+
return absolute
|
255
|
+
rescue URI::InvalidURIError => e
|
256
|
+
return nil
|
257
|
+
end
|
250
258
|
end
|
251
259
|
|
252
260
|
#
|
data/lib/spidr/version.rb
CHANGED
data/static/course/specs.json
CHANGED
@@ -1 +1 @@
|
|
1
|
-
[{"
|
1
|
+
[{"behavior":"ignore","link":"javascript:fail();","url":"javascript:fail();","message":"should ignore links beginning with \"javascript:\"","example":"<a href=\"javascript:fail();\">should ignore links beginning with \"javascript:\"<\/a>"},{"behavior":"ignore","link":"#","url":"http:\/\/spidr.rubyforge.org\/course\/javascript\/%23","message":"should ignore links with an onclick attribute and a href pointing to the page.","example":"<a href=\"#\" onclick=\"fail();\">should ignore links with an onclick attribute and a href pointing to the page.<\/a>"},{"behavior":"follow","link":"next.html","url":"http:\/\/spidr.rubyforge.org\/course\/loop\/next.html","message":"should follow links pointing to other pages","example":"<a href=\"next.html\">should follow links pointing to other pages<\/a>"},{"behavior":"nofollow","link":"start.html","url":"http:\/\/spidr.rubyforge.org\/course\/loop\/start.html","message":"should not follow links pointing to the current page","example":"<a href=\"start.html\">should not follow links pointing to the current page<\/a>"},{"behavior":"nofollow","link":"start.html","url":"http:\/\/spidr.rubyforge.org\/course\/loop\/start.html","message":"should not follow links to previously visited pages","example":"<a href=\"start.html\">should not follow links to previously visited pages<\/a>"},{"behavior":"follow","link":"normal.html","url":"http:\/\/spidr.rubyforge.org\/course\/relative\/normal.html","message":"should follow relative links","example":"<a href=\"normal.html\">should follow relative links<\/a>"},{"behavior":"follow","link":".\/current_directory.html","url":"http:\/\/spidr.rubyforge.org\/course\/relative\/current_directory.html","message":"should follow relative links to files in the current directory","example":"<a href=\".\/current_directory.html\">should follow relative links to files in the current 
directory<\/a>"},{"behavior":"follow","link":"..\/relative\/same_directory.html","url":"http:\/\/spidr.rubyforge.org\/course\/relative\/same_directory.html","message":"should follow links that transverse directories","example":"<a href=\"..\/relative\/same_directory.html\">should follow links that transverse directories<\/a>"},{"behavior":"ignore","link":"#","url":"http:\/\/spidr.rubyforge.org\/course\/relative\/%23","message":"should ignore in-page links","example":"<a href=\"#\">should ignore in-page links<\/a>"},{"behavior":"nofollow","link":"","url":"http:\/\/spidr.rubyforge.org\/course\/empty\/start.html","message":"should not follow links with no href attributes","example":"<a>should not follow links with no href attributes<\/a>"},{"behavior":"nofollow","link":"","url":"http:\/\/spidr.rubyforge.org\/course\/empty\/start.html","message":"should not follow links with empty href attributes","example":"<a href=\"\">should not follow links with empty href attributes<\/a>"},{"behavior":"ignore","link":" ","url":"http:\/\/spidr.rubyforge.org\/course\/empty\/%20","message":"should ignore links with blank href attributes","example":"<a href=\"\">should ignore links with blank href attributes<\/a>"},{"behavior":"follow","link":"http:\/\/spidr.rubyforge.org\/course\/remote\/next.html","url":"http:\/\/spidr.rubyforge.org\/course\/remote\/next.html","message":"should follow remote links to unvisited pages","example":"<a href=\"http:\/\/spidr.rubyforge.org\/course\/remote\/next.html\">should follow remote links to unvisited pages<\/a>"},{"behavior":"nofollow","link":"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html","url":"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html","message":"should not follow remote links to the same page","example":"<a href=\"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html\">should not follow remote links to the same 
page<\/a>"},{"behavior":"nofollow","link":"http:\/\/spidr.rubyforge.org\/course\/loop\/..\/remote\/start.html","url":"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html","message":"should not follow remote links with a relative path to the same page","example":"<a href=\"http:\/\/spidr.rubyforge.org\/course\/loop\/..\/remote\/start.html\">should not follow remote links with a relative path to the same page<\/a>"},{"behavior":"follow","link":"\/course\/absolute\/next.html","url":"http:\/\/spidr.rubyforge.org\/course\/absolute\/next.html","message":"should follow absolute links to unvisited pages","example":"<a href=\"\/course\/absolute\/next.html\">should follow absolute links to unvisited pages<\/a>"},{"behavior":"nofollow","link":"\/course\/absolute\/start.html","url":"http:\/\/spidr.rubyforge.org\/course\/absolute\/start.html","message":"should not follow absolute links to the current page","example":"<a href=\"\/course\/absolute\/start.html\">should not follow absolute links to the current page<\/a>"}]
|
data/tasks/course.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
require '
|
1
|
+
require 'nokogiri'
|
2
2
|
require 'json'
|
3
3
|
|
4
4
|
namespace :course do
|
@@ -14,13 +14,11 @@ namespace :course do
|
|
14
14
|
specs = []
|
15
15
|
|
16
16
|
Dir[File.join(COURSE_DIR,'**','*.html')].each do |page|
|
17
|
-
doc =
|
17
|
+
doc = Nokogiri::HTML(open(page))
|
18
18
|
page_url = COURSE_URL.merge(page.sub(STATIC_DIR,''))
|
19
19
|
|
20
|
-
link_to_spec = lambda { |
|
21
|
-
|
22
|
-
|
23
|
-
relative_url = link['href'].to_s
|
20
|
+
link_to_spec = lambda { |link,spec_data|
|
21
|
+
relative_url = (link.get_attribute('href') || '')
|
24
22
|
absolute_url = page_url.merge(URI.encode(relative_url))
|
25
23
|
|
26
24
|
if absolute_url.path
|
@@ -35,15 +33,15 @@ namespace :course do
|
|
35
33
|
)
|
36
34
|
}
|
37
35
|
|
38
|
-
doc.search('.follow
|
36
|
+
doc.search('.follow//a').each do |follow|
|
39
37
|
specs << link_to_spec.call(follow, :behavior => :follow)
|
40
38
|
end
|
41
39
|
|
42
|
-
doc.search('.nofollow
|
40
|
+
doc.search('.nofollow//a').each do |nofollow|
|
43
41
|
specs << link_to_spec.call(nofollow, :behavior => :nofollow)
|
44
42
|
end
|
45
43
|
|
46
|
-
doc.search('.ignore
|
44
|
+
doc.search('.ignore//a').each do |ignore|
|
47
45
|
specs << link_to_spec.call(ignore, :behavior => :ignore)
|
48
46
|
end
|
49
47
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: spidr
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.4
|
4
|
+
version: 0.1.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Postmodern
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2009-
|
12
|
+
date: 2009-03-22 00:00:00 -07:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
@@ -30,7 +30,7 @@ dependencies:
|
|
30
30
|
requirements:
|
31
31
|
- - ">="
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version: 1.
|
33
|
+
version: 1.11.0
|
34
34
|
version:
|
35
35
|
description: Spidr is a versatile Ruby web spidering library that can spider a site, multiple domains, certain links or infinitely. Spidr is designed to be fast and easy to use.
|
36
36
|
email:
|