spiderz 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +2 -0
- data/Manifest.txt +7 -0
- data/README.txt +60 -0
- data/Rakefile +14 -0
- data/bin/spiderz +0 -0
- data/lib/spiderz.rb +98 -0
- data/test/test_spiderz.rb +0 -0
- metadata +72 -0
data/History.txt
ADDED
data/Manifest.txt
ADDED
data/README.txt
ADDED
@@ -0,0 +1,60 @@
|
|
1
|
+
= spiderz
|
2
|
+
|
3
|
+
http://parkerfox.co.uk/labs/spiderz
|
4
|
+
|
5
|
+
== DESCRIPTION:
|
6
|
+
|
7
|
+
Scarily easy spidering
|
8
|
+
|
9
|
+
== FEATURES/PROBLEMS:
|
10
|
+
|
11
|
+
* Very simple spidering using Hpricot
|
12
|
+
|
13
|
+
== SYNOPSIS:
|
14
|
+
|
15
|
+
Create a site map.
|
16
|
+
|
17
|
+
# create the class
|
18
|
+
spider = Spiderz.new "http://mysite.com"
|
19
|
+
|
20
|
+
# set up a custom handler that runs for every page read successfully
|
21
|
+
spider.success do |url, doc|
|
22
|
+
title = (doc / "title").text.strip
|
23
|
+
puts "<a href='#{url}' >#{title}</a>"
|
24
|
+
end
|
25
|
+
|
26
|
+
# set it going
|
27
|
+
spider.crawl "/"
|
28
|
+
|
29
|
+
== REQUIREMENTS:
|
30
|
+
|
31
|
+
* Hpricot
|
32
|
+
|
33
|
+
== INSTALL:
|
34
|
+
|
35
|
+
sudo gem install spiderz
|
36
|
+
|
37
|
+
== LICENSE:
|
38
|
+
|
39
|
+
(The MIT License)
|
40
|
+
|
41
|
+
Copyright (c) 2008 Jonah Fox
|
42
|
+
|
43
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
44
|
+
a copy of this software and associated documentation files (the
|
45
|
+
'Software'), to deal in the Software without restriction, including
|
46
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
47
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
48
|
+
permit persons to whom the Software is furnished to do so, subject to
|
49
|
+
the following conditions:
|
50
|
+
|
51
|
+
The above copyright notice and this permission notice shall be
|
52
|
+
included in all copies or substantial portions of the Software.
|
53
|
+
|
54
|
+
THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
|
55
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
56
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
57
|
+
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
58
|
+
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
59
|
+
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
60
|
+
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/Rakefile
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
# -*- ruby -*-
|
2
|
+
|
3
|
+
require 'rubygems'
|
4
|
+
require 'hoe'
|
5
|
+
require './lib/spiderz.rb'
|
6
|
+
|
7
|
+
# Configure the gem through Hoe: the project is named 'spiderz' and its
# version is read from Spiderz::VERSION (loaded from ./lib/spiderz.rb above).
Hoe.new('spiderz', Spiderz::VERSION) do |p|
|
8
|
+
# p.rubyforge_name = 'spiderzx' # if different than lowercase project name
|
9
|
+
# NOTE(review): the email domain below is 'parkefox.co.uk' while the README
# homepage uses 'parkerfox.co.uk' -- confirm which spelling is intended.
p.developer('Jonah Fox', 'jonah@parkefox.co.uk')
|
10
|
+
p.remote_rdoc_dir = '' # Release to root
|
11
|
+
|
12
|
+
end
|
13
|
+
|
14
|
+
# vim: syntax=Ruby
|
data/bin/spiderz
ADDED
File without changes
|
data/lib/spiderz.rb
ADDED
@@ -0,0 +1,98 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'hpricot'
|
3
|
+
require 'open-uri'
|
4
|
+
|
5
|
+
# A small single-site spider. Starting from one URL it fetches each page,
# parses it with Hpricot, reports the result through user-replaceable
# callbacks and queues every link that the skip filter lets through.
#
#   spider = Spiderz.new "http://mysite.com"
#   spider.success { |url, doc| puts url }
#   spider.crawl
class Spiderz

  VERSION = "0.1.0"

  # Matches an href that begins with a URL scheme such as "http://".
  SCHEME_PATTERN = %r{\A[a-z][a-z0-9+.\-]*://}i

  # root should be like http://www.google.com (i.e. with http://)
  #
  # Installs default callbacks that print progress with puts, and a default
  # skip filter that rejects missing hrefs, external links, mailto links and
  # in-page bookmarks.
  def initialize(root)
    @crawled = {}
    @root = root

    @success   = Proc.new { |url, _doc| puts "Successfully read url: #{url}" }
    @failure   = Proc.new { |url| puts "failure to read/parse url: #{url}" }
    @started   = Proc.new { |url| puts "Started crawling from url: #{url}" }
    @completed = Proc.new { |_url| puts "Crawling complete" }

    @skip = Proc.new do |href|
      !href || external?(href) || mail?(href) || bookmark?(href)
    end
  end

  # Crawls breadth-first starting at +url+ until no unvisited links remain.
  #
  # url - the starting point; either a path appended to the root or an
  #       absolute URL. Defaults to "/" so that a bare `spider.crawl`
  #       starts at the site root.
  #
  # Calls the started callback first and the completed callback last, both
  # with +url+. Returns whatever the completed callback returns.
  def crawl(url = "/")
    @started.call(url)

    @to_crawl = [url]

    # concat grows the queue in place instead of reallocating it per page.
    @to_crawl.concat(page_links(@to_crawl.shift)) until @to_crawl.empty?

    @completed.call(url)
  end

  # True when +href+ is an absolute URL that does not live under the root.
  # Relative hrefs are never external, even if a URL appears inside their
  # query string, and the root is compared literally rather than as a regex.
  def external?(href)
    !!(href =~ SCHEME_PATTERN) && !same_site?(href)
  end

  # True when +href+ points at an anchor inside the current page.
  def bookmark?(href)
    href.start_with?("#")
  end

  # True when +href+ uses the mailto: scheme. Paths that merely contain the
  # word "mailto" are not mail links.
  def mail?(href)
    !!(href =~ /\Amailto:/i)
  end

  # Replaces the callback run once when a crawl begins; it receives the
  # start url. Returns the block.
  def started(&action)
    @started = action
  end

  # Replaces the callback run once when a crawl ends; it receives the start
  # url. Returns the block.
  def completed(&action)
    @completed = action
  end

  # Replaces the callback run when a page cannot be read or parsed; it
  # receives the url. Returns the block.
  def failure(&action)
    @failure = action
  end

  # Replaces the callback run after each page is read; it receives the url
  # and the parsed Hpricot document. Returns the block.
  def success(&action)
    @success = action
  end

  # Replaces the link filter. The block receives each href found on a page
  # (possibly nil) and returns true for links that must not be followed.
  # Returns the block.
  def skip(&action)
    @skip = action
  end

  # Fetches and parses +url+, fires the success or failure callback, and
  # returns the hrefs on that page that are neither already crawled nor
  # skipped. Returns [] when the url was crawled before or could not be
  # read.
  def page_links(url)
    return [] if @crawled[url]

    @crawled[url] = true

    begin
      # Read the whole body inside the block so the handle can be closed
      # as soon as parsing input is in memory.
      doc = fetch(url) { |io| Hpricot(io.read) }
    rescue
      @failure.call(url)
      return []
    end

    @success.call(url, doc)

    hrefs = (doc / "a").map { |anchor| anchor.attributes["href"] }
    hrefs.reject { |href| @crawled[href] || @skip.call(href) }
  end

  private

  # True when the absolute +href+ is the root itself or sits below it.
  # The character after the root must end the host part, so that
  # "http://mysite.com.evil.org" is not mistaken for "http://mysite.com".
  def same_site?(href)
    base = @root.chomp("/")
    return false unless href.start_with?(base)

    rest = href[base.length..-1]
    rest.empty? || rest.start_with?("/", "?", "#")
  end

  # Opens +url+ over the network and yields the response body IO.
  # Absolute URLs are used as they are; anything else is appended to the
  # root. URI.open is preferred because Kernel#open stopped handling URLs
  # in Ruby 3.0; older Rubies without URI.open fall back to Kernel#open.
  def fetch(url, &block)
    target = url =~ SCHEME_PATTERN ? url : @root + url

    if defined?(URI) && URI.respond_to?(:open)
      URI.open(target, &block)
    else
      open(target, &block)
    end
  end

end
|
97
|
+
|
98
|
+
|
File without changes
|
metadata
ADDED
@@ -0,0 +1,72 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: spiderz
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Jonah Fox
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2008-11-30 00:00:00 +00:00
|
13
|
+
default_executable:
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: hoe
|
17
|
+
type: :development
|
18
|
+
version_requirement:
|
19
|
+
version_requirements: !ruby/object:Gem::Requirement
|
20
|
+
requirements:
|
21
|
+
- - ">="
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: 1.8.2
|
24
|
+
version:
|
25
|
+
description: Scarily easy spidering
|
26
|
+
email:
|
27
|
+
- jonah@parkefox.co.uk
|
28
|
+
executables:
|
29
|
+
- spiderz
|
30
|
+
extensions: []
|
31
|
+
|
32
|
+
extra_rdoc_files:
|
33
|
+
- History.txt
|
34
|
+
- Manifest.txt
|
35
|
+
- README.txt
|
36
|
+
files:
|
37
|
+
- History.txt
|
38
|
+
- Manifest.txt
|
39
|
+
- README.txt
|
40
|
+
- Rakefile
|
41
|
+
- bin/spiderz
|
42
|
+
- lib/spiderz.rb
|
43
|
+
- test/test_spiderz.rb
|
44
|
+
has_rdoc: true
|
45
|
+
homepage: http://parkerfox.co.uk/labs/spiderz
|
46
|
+
post_install_message:
|
47
|
+
rdoc_options:
|
48
|
+
- --main
|
49
|
+
- README.txt
|
50
|
+
require_paths:
|
51
|
+
- lib
|
52
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
53
|
+
requirements:
|
54
|
+
- - ">="
|
55
|
+
- !ruby/object:Gem::Version
|
56
|
+
version: "0"
|
57
|
+
version:
|
58
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
59
|
+
requirements:
|
60
|
+
- - ">="
|
61
|
+
- !ruby/object:Gem::Version
|
62
|
+
version: "0"
|
63
|
+
version:
|
64
|
+
requirements: []
|
65
|
+
|
66
|
+
rubyforge_project: spiderz
|
67
|
+
rubygems_version: 1.3.1
|
68
|
+
signing_key:
|
69
|
+
specification_version: 2
|
70
|
+
summary: Scarily easy spidering
|
71
|
+
test_files:
|
72
|
+
- test/test_spiderz.rb
|