spiderz 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/History.txt ADDED
@@ -0,0 +1,2 @@
1
+ === 0.1.0 / 2008-11-30
2
+
data/Manifest.txt ADDED
@@ -0,0 +1,7 @@
1
+ History.txt
2
+ Manifest.txt
3
+ README.txt
4
+ Rakefile
5
+ bin/spiderz
6
+ lib/spiderz.rb
7
+ test/test_spiderz.rb
data/README.txt ADDED
@@ -0,0 +1,60 @@
1
+ = spiderz
2
+
3
+ http://parkerfox.co.uk/labs/spiderz
4
+
5
+ == DESCRIPTION:
6
+
7
+ Scarily easy spidering
8
+
9
+ == FEATURES/PROBLEMS:
10
+
11
+ * Very simple spidering using Hpricot
12
+
13
+ == SYNOPSIS:
14
+
15
+ Create a site map.
16
+
17
+ # create the class
18
+ spider = Spiderz.new "http://mysite.com"
19
+
20
+ # set up a custom success handler
21
+ spider.success do |url, doc|
22
+ title = (doc / "title").text.strip
23
+ puts "<a href='#{url}' >#{title}</a>"
24
+ end
25
+
26
+ # set it going
27
+ spider.crawl "/"
28
+
29
+ == REQUIREMENTS:
30
+
31
+ * Hpricot
32
+
33
+ == INSTALL:
34
+
35
+ sudo gem install spiderz
36
+
37
+ == LICENSE:
38
+
39
+ (The MIT License)
40
+
41
+ Copyright (c) 2008 Jonah Fox
42
+
43
+ Permission is hereby granted, free of charge, to any person obtaining
44
+ a copy of this software and associated documentation files (the
45
+ 'Software'), to deal in the Software without restriction, including
46
+ without limitation the rights to use, copy, modify, merge, publish,
47
+ distribute, sublicense, and/or sell copies of the Software, and to
48
+ permit persons to whom the Software is furnished to do so, subject to
49
+ the following conditions:
50
+
51
+ The above copyright notice and this permission notice shall be
52
+ included in all copies or substantial portions of the Software.
53
+
54
+ THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
55
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
56
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
57
+ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
58
+ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
59
+ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
60
+ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/Rakefile ADDED
@@ -0,0 +1,14 @@
1
+ # -*- ruby -*-
2
+
3
+ require 'rubygems'
4
+ require 'hoe'
5
+ require './lib/spiderz.rb'
6
+
7
# Declare the spiderz gem through Hoe; the version comes from the library
# itself so it is only ever defined in one place.
Hoe.new('spiderz', Spiderz::VERSION) do |project|
  # project.rubyforge_name = 'spiderzx' # if different than lowercase project name
  project.developer('Jonah Fox', 'jonah@parkefox.co.uk')
  project.remote_rdoc_dir = '' # Release to root
end
13
+
14
+ # vim: syntax=Ruby
data/bin/spiderz ADDED
File without changes
data/lib/spiderz.rb ADDED
@@ -0,0 +1,98 @@
1
+ require 'rubygems'
2
+ require 'hpricot'
3
+ require 'open-uri'
4
+
5
# A small same-site web spider built on Hpricot.
#
# Starting from a URL, each page is fetched and parsed, the parsed document is
# handed to the +success+ callback, and every <a href> that the +skip+
# predicate does not reject is queued for crawling. Each URL is fetched at
# most once per Spiderz instance.
#
#   spider = Spiderz.new "http://mysite.com"
#   spider.success { |url, doc| puts url }
#   spider.crawl
class Spiderz

  VERSION = "0.1.0"

  # An href that starts with a URL scheme, e.g. "http://" or "ftp://".
  ABSOLUTE_URL = %r{\A[a-z][a-z0-9+.\-]*://}i

  # root should be like http://www.google.com (i.e. with http://).
  # It is prepended to every site-relative URL that gets crawled.
  def initialize(root)
    @crawled = {}
    @root = root

    @success = Proc.new { |url, doc| puts "Successfully read url: #{url}" }
    @failure = Proc.new { |url| puts "failure to read/parse url: #{url}" }
    @started = Proc.new { |url| puts "Started crawling from url: #{url}" }
    @completed = Proc.new { |url| puts "Crawling complete" }

    # Default policy: stay on this site and ignore mail links and in-page anchors.
    @skip = Proc.new do |href|
      !href || external?(href) || mail?(href) || bookmark?(href)
    end
  end

  # Crawls breadth-first starting at +url+ (the site root by default).
  #
  # Calls the +started+ callback, then visits queued URLs until none are
  # left, then calls the +completed+ callback and returns its result.
  def crawl(url = "/")
    @started.call(url)

    @to_crawl = [url]
    @to_crawl.concat(page_links(@to_crawl.shift)) until @to_crawl.empty?

    @completed.call(url)
  end

  # True when +href+ is an absolute (or protocol-relative) URL that does not
  # live under the root. The root is compared literally, not as a regex.
  def external?(href)
    return true if href.start_with?("//")

    !!(href =~ ABSOLUTE_URL) && !under_root?(href)
  end

  # True when +href+ only points at an anchor within the current page.
  def bookmark?(href)
    href.start_with?("#")
  end

  # True when +href+ is a mailto: link.
  def mail?(href)
    !!(href =~ /\Amailto:/i)
  end

  # Sets the block called with the start URL when a crawl begins.
  def started(&action)
    @started = action
  end

  # Sets the block called with the start URL when a crawl finishes.
  def completed(&action)
    @completed = action
  end

  # Sets the block called with the URL of a page that could not be read or parsed.
  def failure(&action)
    @failure = action
  end

  # Sets the block called with the URL and parsed document of every page read.
  def success(&action)
    @success = action
  end

  # Sets the predicate deciding which hrefs are ignored. The block receives
  # a non-nil href and returns a truthy value to skip it.
  def skip(&action)
    @skip = action
  end

  # Fetches +url+, reports it through the +success+ or +failure+ callback and
  # returns the hrefs found on the page that still need crawling.
  # Returns [] when the page was already crawled or could not be read.
  def page_links(url)
    return [] if @crawled[url]

    @crawled[url] = true

    begin
      doc = fetch_document(absolute_url(url))
    rescue StandardError
      @failure.call(url)
      return []
    end

    @success.call(url, doc)

    hrefs = (doc / "a").map { |anchor| anchor.attributes["href"] }

    # Anchors without an href yield nil; drop them before consulting the
    # (possibly user-supplied) skip predicate.
    hrefs.compact.uniq.reject { |href| @crawled[href] || @skip.call(href) }
  end

  private

  # Downloads +target+ and parses it with Hpricot, closing the connection
  # afterwards. Going through URI avoids Kernel#open, which no longer handles
  # URLs on Ruby 3 and would execute hrefs that start with "|".
  def fetch_document(target)
    URI.parse(target).open { |io| Hpricot(io) }
  end

  # Turns a crawled URL into the address to fetch: absolute URLs are used
  # as-is, anything else is joined to the root with exactly one slash.
  def absolute_url(url)
    return url if url =~ ABSOLUTE_URL

    "#{@root.chomp('/')}/#{url.sub(%r{\A/+}, '')}"
  end

  # True when +href+ is the root itself or a path, query or fragment below it.
  # The boundary check keeps "http://site.com.evil.com" from passing for the
  # root "http://site.com".
  def under_root?(href)
    return false unless href.start_with?(@root)
    return true if @root.end_with?("/")

    rest = href[@root.length..-1]
    rest.empty? || "/?#".include?(rest[0, 1])
  end

end
97
+
98
+
File without changes
metadata ADDED
@@ -0,0 +1,72 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: spiderz
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Jonah Fox
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2008-11-30 00:00:00 +00:00
13
+ default_executable:
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: hoe
17
+ type: :development
18
+ version_requirement:
19
+ version_requirements: !ruby/object:Gem::Requirement
20
+ requirements:
21
+ - - ">="
22
+ - !ruby/object:Gem::Version
23
+ version: 1.8.2
24
+ version:
25
+ description: Scarily easy spidering
26
+ email:
27
+ - jonah@parkefox.co.uk
28
+ executables:
29
+ - spiderz
30
+ extensions: []
31
+
32
+ extra_rdoc_files:
33
+ - History.txt
34
+ - Manifest.txt
35
+ - README.txt
36
+ files:
37
+ - History.txt
38
+ - Manifest.txt
39
+ - README.txt
40
+ - Rakefile
41
+ - bin/spiderz
42
+ - lib/spiderz.rb
43
+ - test/test_spiderz.rb
44
+ has_rdoc: true
45
+ homepage: http://parkerfox.co.uk/labs/spiderz
46
+ post_install_message:
47
+ rdoc_options:
48
+ - --main
49
+ - README.txt
50
+ require_paths:
51
+ - lib
52
+ required_ruby_version: !ruby/object:Gem::Requirement
53
+ requirements:
54
+ - - ">="
55
+ - !ruby/object:Gem::Version
56
+ version: "0"
57
+ version:
58
+ required_rubygems_version: !ruby/object:Gem::Requirement
59
+ requirements:
60
+ - - ">="
61
+ - !ruby/object:Gem::Version
62
+ version: "0"
63
+ version:
64
+ requirements: []
65
+
66
+ rubyforge_project: spiderz
67
+ rubygems_version: 1.3.1
68
+ signing_key:
69
+ specification_version: 2
70
+ summary: Scarily easy spidering
71
+ test_files:
72
+ - test/test_spiderz.rb