spiderz 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/History.txt ADDED
@@ -0,0 +1,2 @@
1
+ === 0.1.0 / 2008-11-30
2
+
data/Manifest.txt ADDED
@@ -0,0 +1,7 @@
1
+ History.txt
2
+ Manifest.txt
3
+ README.txt
4
+ Rakefile
5
+ bin/spiderz
6
+ lib/spiderz.rb
7
+ test/test_spiderz.rb
data/README.txt ADDED
@@ -0,0 +1,60 @@
1
+ = spiderz
2
+
3
+ http://parkerfox.co.uk/labs/spiderz
4
+
5
+ == DESCRIPTION:
6
+
7
+ Scarily easy spidering
8
+
9
+ == FEATURES/PROBLEMS:
10
+
11
+ * Very simple spidering using Hpricot
12
+
13
+ == SYNOPSIS:
14
+
15
+ Create a site map.
16
+
17
+ # create the class
18
+ spider = Spiderz.new "http://mysite.com"
19
+
20
+ # set up a custom success callback
21
+ spider.success do |url, doc|
22
+ title = (doc / "title").text.strip
23
+ puts "<a href='#{url}' >#{title}</a>"
24
+ end
25
+
26
+ # set it going
27
+ spider.crawl "/"
28
+
29
+ == REQUIREMENTS:
30
+
31
+ * Hpricot
32
+
33
+ == INSTALL:
34
+
35
+ sudo gem install spiderz
36
+
37
+ == LICENSE:
38
+
39
+ (The MIT License)
40
+
41
+ Copyright (c) 2008 Jonah Fox
42
+
43
+ Permission is hereby granted, free of charge, to any person obtaining
44
+ a copy of this software and associated documentation files (the
45
+ 'Software'), to deal in the Software without restriction, including
46
+ without limitation the rights to use, copy, modify, merge, publish,
47
+ distribute, sublicense, and/or sell copies of the Software, and to
48
+ permit persons to whom the Software is furnished to do so, subject to
49
+ the following conditions:
50
+
51
+ The above copyright notice and this permission notice shall be
52
+ included in all copies or substantial portions of the Software.
53
+
54
+ THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
55
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
56
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
57
+ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
58
+ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
59
+ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
60
+ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/Rakefile ADDED
@@ -0,0 +1,14 @@
1
+ # -*- ruby -*-
2
+
3
+ require 'rubygems'
4
+ require 'hoe'
5
+ require './lib/spiderz.rb'
6
+
7
+ Hoe.new('spiderz', Spiderz::VERSION) do |p| # Declare the gem via Hoe; the version is read from Spiderz::VERSION in lib/spiderz.rb
8
+ # p.rubyforge_name = 'spiderzx' # if different than lowercase project name
9
+ p.developer('Jonah Fox', 'jonah@parkefox.co.uk') # NOTE(review): domain differs from the README homepage (parkerfox.co.uk) - confirm this is not a typo
10
+ p.remote_rdoc_dir = '' # Release to root
11
+
12
+ end
13
+
14
+ # vim: syntax=Ruby
data/bin/spiderz ADDED
File without changes
data/lib/spiderz.rb ADDED
@@ -0,0 +1,98 @@
1
+ require 'rubygems'
2
+ require 'hpricot'
3
+ require 'open-uri'
4
+
5
# A minimal same-site web spider.
#
# Starting from one URL it fetches each page, hands the parsed document to a
# user-supplied callback, collects the page's <a href> values and keeps going
# until no unvisited, non-skipped links remain.
#
# URLs are tracked and reported *relative to the root*: the page actually
# fetched is always <tt>root + url</tt>.
class Spiderz

  VERSION = "0.1.0"

  # Matches hrefs that begin with a URL scheme followed by "://".
  ABSOLUTE_URL = %r{\A[a-z][a-z0-9+.\-]*://}i

  # root:: site prefix that every crawled URL is appended to. It should
  #        include the scheme, e.g. "http://www.google.com".
  #
  # Installs default callbacks that print progress with +puts+, and a default
  # skip rule that ignores missing hrefs, external links, mailto: links and
  # in-page "#bookmark" links.
  def initialize(root)
    @crawled = {}
    @root = root

    @success = Proc.new { |url, doc| puts "Successfully read url: #{url}" }
    @failure = Proc.new { |url| puts "failure to read/parse url: #{url}" }
    @started = Proc.new { |url| puts "Started crawling from url: #{url}" }
    @completed = Proc.new { |url| puts "Crawling complete" }

    @skip = Proc.new do |href|
      !href || (external?(href) || mail?(href) || bookmark?(href))
    end
  end

  # Crawls the site starting at +url+ (relative to the root, "/" by default).
  #
  # Calls the +started+ callback, processes the queue of pending URLs until
  # it is empty, then calls the +completed+ callback and returns its result.
  def crawl(url = "/")
    @started.call(url)

    @to_crawl = [url]
    @to_crawl.concat(page_links(@to_crawl.shift)) until @to_crawl.empty?

    @completed.call(url)
  end

  # True when +href+ is an absolute URL that does not point into the root.
  #
  # The root is compared as a literal prefix (not as a regular expression),
  # so "http://evil.com/?r=<root>" and "<root>.evil.com" count as external.
  def external?(href)
    (href =~ ABSOLUTE_URL) ? !internal_absolute?(href) : false
  end

  # True when +href+ is an in-page anchor such as "#top".
  def bookmark?(href)
    !!(href =~ /\A#/)
  end

  # True when +href+ is a mailto: link. Only the scheme is tested, so a path
  # that merely contains the word "mailto" is not treated as mail.
  def mail?(href)
    !!(href =~ /\A\s*mailto:/i)
  end

  # Sets the callback invoked with the start URL before crawling begins.
  def started(&action)
    @started = action
  end

  # Sets the callback invoked with the start URL once the queue is empty.
  def completed(&action)
    @completed = action
  end

  # Sets the callback invoked with the URL of a page that could not be
  # fetched or parsed.
  def failure(&action)
    @failure = action
  end

  # Sets the callback invoked with (url, doc) for every page that was read.
  def success(&action)
    @success = action
  end

  # Replaces the skip rule. The block receives each raw href (possibly nil)
  # and returns a truthy value for links that must not be followed.
  def skip(&action)
    @skip = action
  end

  # Fetches +url+ and returns the root-relative links found on it that are
  # neither skipped nor already crawled.
  #
  # Returns [] without fetching when +url+ was already visited, and [] after
  # calling the +failure+ callback when the page cannot be read or parsed.
  def page_links(url)
    return [] if @crawled[url]

    @crawled[url] = true

    begin
      doc = load_document(url)
    rescue
      @failure.call(url)
      return []
    end

    @success.call(url, doc)

    hrefs = (doc / "a").map { |anchor| anchor.attributes["href"] }

    # Apply the skip rule to the raw href, then rewrite absolute same-site
    # links to root-relative form so that root + path is a valid URL and the
    # visited check sees one key per page.
    paths = hrefs.reject { |href| @skip.call(href) }.map { |href| local_path(href) }
    paths.reject { |path| @crawled[path] }.uniq
  end

  private

  # True when +href+ starts with the root and the root ends on a URL
  # boundary (so "http://a.com" does not claim "http://a.com.evil.org").
  def internal_absolute?(href)
    return false unless href && href[0, @root.length] == @root

    rest = href[@root.length..-1]
    !!(@root =~ %r{/\z} || rest.empty? || rest =~ %r{\A[/?#]})
  end

  # Strips the root from an absolute same-site href; other hrefs are
  # returned unchanged. A bare root maps to "/" (unless the root itself ends
  # with a slash) so it shares a key with the default start URL.
  def local_path(href)
    return href unless internal_absolute?(href)

    rest = href[@root.length..-1]
    rest.empty? && @root !~ %r{/\z} ? "/" : rest
  end

  # Fetches root + url and parses it with Hpricot.
  #
  # Kernel#open stopped handling URLs in Ruby 3.0, so URI.open is used when
  # open-uri provides it; older Rubies fall back to Kernel#open.
  def load_document(url)
    target = @root + url
    Hpricot(URI.respond_to?(:open) ? URI.open(target) : open(target))
  end

end
97
+
98
+
File without changes
metadata ADDED
@@ -0,0 +1,72 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: spiderz
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Jonah Fox
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2008-11-30 00:00:00 +00:00
13
+ default_executable:
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: hoe
17
+ type: :development
18
+ version_requirement:
19
+ version_requirements: !ruby/object:Gem::Requirement
20
+ requirements:
21
+ - - ">="
22
+ - !ruby/object:Gem::Version
23
+ version: 1.8.2
24
+ version:
25
+ description: Scarily easy spidering
26
+ email:
27
+ - jonah@parkefox.co.uk
28
+ executables:
29
+ - spiderz
30
+ extensions: []
31
+
32
+ extra_rdoc_files:
33
+ - History.txt
34
+ - Manifest.txt
35
+ - README.txt
36
+ files:
37
+ - History.txt
38
+ - Manifest.txt
39
+ - README.txt
40
+ - Rakefile
41
+ - bin/spiderz
42
+ - lib/spiderz.rb
43
+ - test/test_spiderz.rb
44
+ has_rdoc: true
45
+ homepage: http://parkerfox.co.uk/labs/spiderz
46
+ post_install_message:
47
+ rdoc_options:
48
+ - --main
49
+ - README.txt
50
+ require_paths:
51
+ - lib
52
+ required_ruby_version: !ruby/object:Gem::Requirement
53
+ requirements:
54
+ - - ">="
55
+ - !ruby/object:Gem::Version
56
+ version: "0"
57
+ version:
58
+ required_rubygems_version: !ruby/object:Gem::Requirement
59
+ requirements:
60
+ - - ">="
61
+ - !ruby/object:Gem::Version
62
+ version: "0"
63
+ version:
64
+ requirements: []
65
+
66
+ rubyforge_project: spiderz
67
+ rubygems_version: 1.3.1
68
+ signing_key:
69
+ specification_version: 2
70
+ summary: Scarily easy spidering
71
+ test_files:
72
+ - test/test_spiderz.rb