ssoroka-spider_bot 0.9.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +50 -0
- data/Rakefile +14 -0
- data/bin/spider_bot +6 -0
- data/lib/spider_bot.rb +118 -0
- data/spider_bot.gemspec +37 -0
- metadata +82 -0
data/README.rdoc
ADDED
@@ -0,0 +1,50 @@
|
|
1
|
+
= Spider bot
|
2
|
+
|
3
|
+
A non-threaded spider bot that crawls a site and reports response-time stats. Easily extendable.
|
4
|
+
|
5
|
+
== Installation
|
6
|
+
|
7
|
+
sudo gem install ssoroka-spider_bot
|
8
|
+
|
9
|
+
== Usage
|
10
|
+
|
11
|
+
Usable in code or as executable!
|
12
|
+
|
13
|
+
spider_bot http://www.example.com
|
14
|
+
spider_bot http://0.0.0.0:3000
|
15
|
+
|
16
|
+
Example as script/spider:
|
17
|
+
|
18
|
+
#!/usr/bin/env ruby
|
19
|
+
require 'rubygems'
|
20
|
+
require 'spider_bot'
|
21
|
+
|
22
|
+
class MySpider < SpiderBot
|
23
|
+
# override these for handling events
|
24
|
+
def on_page(page)
|
25
|
+
end
|
26
|
+
|
27
|
+
def on_404(link)
|
28
|
+
end
|
29
|
+
|
30
|
+
def on_500(link)
|
31
|
+
end
|
32
|
+
|
33
|
+
# override these for changing how urls are classified as links
|
34
|
+
def off_site?(url)
|
35
|
+
url !~ /^\// # urls not starting with a /
|
36
|
+
end
|
37
|
+
|
38
|
+
def ignorable?(url)
|
39
|
+
url =~ /\/.*\..+/ && # files with extensions
|
40
|
+
url !~ /\.html$/ # but not html files
|
41
|
+
end
|
42
|
+
end
|
43
|
+
|
44
|
+
spider = MySpider.new(:quiet => false)
|
45
|
+
spider.start(ARGV[0])
|
46
|
+
|
47
|
+
== Gem Requirements
|
48
|
+
|
49
|
+
* ssoroka-ansi
|
50
|
+
* mechanize
|
data/Rakefile
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
require 'rubygems'
require 'rake'
require 'echoe'

# Packaging configuration for the spider_bot gem (built with Echoe).
Echoe.new('spider_bot', '0.9.1') do |p|
  p.description = 'A non-threaded spider bot that spiders a site with response time stats. easily extendable'
  p.url = 'http://github.com/ssoroka/spider_bot'
  p.author = 'Steven Soroka'
  p.email = 'ssoroka78@gmail.com'
  p.ignore_pattern = ["tmp/*"]
  # lib/spider_bot.rb requires 'mechanize' and 'ansi' at load time, so both
  # must be installed together with the gem: they are runtime dependencies,
  # not development-only ones.
  p.runtime_dependencies = ['mechanize', 'ssoroka-ansi']
  # NOTE(review): set explicitly so that no development dependency beyond the
  # ones previously declared gets added by Echoe's defaults -- confirm against
  # the Echoe version in use.
  p.development_dependencies = []
end

# Pull in any project-specific rake tasks, in a stable (sorted) order.
Dir["#{File.dirname(__FILE__)}/tasks/*.rake"].sort.each{|f| load f }
|
data/bin/spider_bot
ADDED
data/lib/spider_bot.rb
ADDED
@@ -0,0 +1,118 @@
|
|
1
|
+
require 'mechanize'
|
2
|
+
require 'ansi'
|
3
|
+
require 'benchmark'
|
4
|
+
|
5
|
+
# A simple, non-threaded site spider.
#
# Starting from one URL it follows every on-site link it finds, one request
# at a time, recording the response time of each page and which links came
# back as 404 or 500. Subclass it and override the event hooks and/or the
# link classification predicates to customise its behaviour.
class SpiderBot
  # -- Event hooks: override these for handling events ---------------------

  # Called with every successfully fetched page, after its links were queued.
  def on_page(page)
  end

  # Called with the link whose request was answered with a 404.
  def on_404(link)
  end

  # Called with the link whose request was answered with a 500.
  def on_500(link)
  end

  # -- Override these for changing how urls are classified as links --------

  # Returns true for urls that must not be followed because they leave the
  # site. Only urls starting with a single "/" are on-site; a leading "//"
  # is a protocol-relative url pointing at another host.
  def off_site?(url)
    url !~ %r{\A/(?!/)}
  end

  # Returns a truthy value for urls that look like files with an extension,
  # unless that extension is .html.
  def ignorable?(url)
    url =~ %r{/.*\..+} && # files with extensions
      url !~ /\.html$/    # but not html files
  end

  ## implementation! :)
  attr_accessor :fourohfour, :fivehundred, :response_times, :been_to, :go_to

  # options:
  #   :agent - User-Agent string sent with every request (default 'spider bot')
  #   :quiet - when truthy, suppresses the progress screen and final report
  def initialize(options = {})
    @agent = mechanize_class.new
    @agent.user_agent = options[:agent] || 'spider bot'
    @been_to = ['/']       # links already requested
    @go_to = []            # links queued for a visit
    @fourohfour = []       # links answered with 404
    @fivehundred = []      # links answered with 500
    @response_times = []   # [link, seconds] pairs
    @key = "\n\nKey:\n{#{ANSI.color(:yellow){'queued'}}}/{#{ANSI.color(:green){'hit'}}}/{#{ANSI.color(:red){'404s'}}}/{#{ANSI.color(:red){'500s'}}} {current action}"
    @quiet = options[:quiet]
  end

  # Spiders the site starting at +url+ (default: http://0.0.0.0:3000/).
  def start(url = nil)
    @starting_url = url || 'http://0.0.0.0:3000/'
    fetch @starting_url
  end

  # Fetches +link+ and then every link queued while crawling, until the queue
  # is empty, and finally prints the report. Implemented as a loop rather than
  # by recursion so that the call stack does not grow with the number of pages.
  def fetch(link)
    while link
      visit(link)
      link = @go_to.shift # fetch next link in list.
    end
    close_up_shop
  end

  # Redraws the progress screen with the counters and the message +s+.
  # Does nothing in quiet mode.
  def status(s)
    return if @quiet
    to_go = ANSI.color(:yellow) { @go_to.size.to_s }
    done = ANSI.color(:green) { @been_to.size.to_s }
    fourohfour = ANSI.color(:red) { @fourohfour.size.to_s }
    fivehundred = ANSI.color(:red) { @fivehundred.size.to_s }
    STDOUT.print(ANSI.clear_screen + ANSI.up(100) + ANSI.left(100))
    STDOUT.print("spidering #{@starting_url}..\n")
    STDOUT.print("#{to_go}/#{done}/#{fourohfour}/#{fivehundred}: #{s}")
    STDOUT.print("\n\nNext 15 links:\n#{@go_to[0..14].join("\n")}")
    STDOUT.print(@key)
    STDOUT.flush
  end

  # Prints the final report: broken links, dead (500) links and the five
  # slowest pages. Does nothing in quiet mode.
  def close_up_shop
    return if @quiet
    STDOUT.puts "\n\nDone!"
    if @fourohfour.any?
      STDOUT.puts "Here are all your broken links:\n#{@fourohfour.join("\n")}"
    else
      STDOUT.puts "You have no broken links that I could find"
    end
    if @fivehundred.any?
      STDOUT.puts "Here are all your dead 500 links:\n#{@fivehundred.join("\n")}"
    else
      STDOUT.puts "You have no dead (500) pages that I could find"
    end
    STDOUT.puts "\n5 slowest pages: "
    @response_times.sort_by{|link, time|
      -time
    }.first(5).each{|link, time|
      STDOUT.printf "%0.3fs %s\n", time, link
    }
  end

  private

  # The Mechanize class to use: the WWW-namespaced one when it is loaded,
  # otherwise the top-level constant.
  def mechanize_class
    defined?(::WWW::Mechanize) ? ::WWW::Mechanize : ::Mechanize
  end

  # Requests a single link, timing the request, queueing the links found on
  # the page and firing the matching event hook.
  def visit(link)
    status "fetching #{link}"
    @been_to << link
    page = nil
    elapsed = Benchmark.realtime { page = @agent.get(link) }
    @response_times << [link, elapsed]
    queue_links(page)
    on_page(page) # on_page event
  rescue mechanize_class::ResponseCodeError => e
    # Error statuses arrive as a ResponseCodeError; the Net::HTTP* response
    # classes are not exceptions and are never raised.
    handle_error_response(link, e)
  rescue Interrupt
    status "Interrupt caught, shutting down."
    close_up_shop
    exit
  end

  # Queues every not-yet-seen, on-site, non-ignorable link found on +page+.
  def queue_links(page)
    # Non-HTML responses cannot be searched for anchors; skip them.
    return unless page.respond_to?(:/)
    (page / 'a').each do |el|
      href = el['href']
      next unless href
      url = href.gsub(/\#.*/, '') # drop the fragment
      next if @go_to.include?(url) || @been_to.include?(url) || off_site?(url) || ignorable?(url)
      @go_to.push url
    end
  end

  # Records a 404 or 500 answer and fires its hook. Any other error status
  # is re-raised untouched.
  def handle_error_response(link, error)
    case error.response_code.to_s
    when '404'
      @fourohfour << link
      status "#{link} not found, 404"
      on_404(link)
    when '500'
      @fivehundred << link
      status ANSI.color(:red) {"#{link} dead! 500 error!"}
      on_500(link)
    else
      raise error
    end
  end
end
|
data/spider_bot.gemspec
ADDED
@@ -0,0 +1,37 @@
|
|
1
|
+
# Gem specification for spider_bot 0.9.1.
Gem::Specification.new do |s|
  s.name = %q{spider_bot}
  s.version = "0.9.1"

  # Setters that are not part of every RubyGems release are guarded with
  # respond_to? so the file loads wherever the setter is missing.
  s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
  s.authors = ["Steven Soroka"]
  s.date = %q{2008-12-04}
  s.default_executable = %q{spider_bot} if s.respond_to? :default_executable=
  s.description = %q{A non-threaded spider bot that spiders a site with response time stats. easily extendable}
  s.email = %q{ssoroka78@gmail.com}
  s.executables = ["spider_bot"]
  s.extra_rdoc_files = ["bin/spider_bot", "lib/spider_bot.rb", "README.rdoc"]
  s.files = ["bin/spider_bot", "lib/spider_bot.rb", "Rakefile", "README.rdoc", "spider_bot.gemspec", "Manifest"]
  s.has_rdoc = true if s.respond_to? :has_rdoc=
  s.homepage = %q{http://github.com/ssoroka/spider_bot}
  s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "Spider_bot", "--main", "README.rdoc"]
  s.require_paths = ["lib"]
  s.rubyforge_project = %q{spider_bot} if s.respond_to? :rubyforge_project=
  s.rubygems_version = %q{1.2.0}
  s.summary = %q{A non-threaded spider bot that spiders a site with response time stats. easily extendable}

  s.specification_version = 2 if s.respond_to? :specification_version

  # lib/spider_bot.rb requires both libraries at load time, so they are
  # runtime dependencies and are declared as such unconditionally.
  s.add_dependency(%q<mechanize>, [">= 0"])
  s.add_dependency(%q<ssoroka-ansi>, [">= 0"])
end
|
metadata
ADDED
@@ -0,0 +1,82 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: ssoroka-spider_bot
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.9.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Steven Soroka
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2008-12-04 00:00:00 -08:00
|
13
|
+
default_executable: spider_bot
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: mechanize
|
17
|
+
version_requirement:
|
18
|
+
version_requirements: !ruby/object:Gem::Requirement
|
19
|
+
requirements:
|
20
|
+
- - ">="
|
21
|
+
- !ruby/object:Gem::Version
|
22
|
+
version: "0"
|
23
|
+
version:
|
24
|
+
- !ruby/object:Gem::Dependency
|
25
|
+
name: ssoroka-ansi
|
26
|
+
version_requirement:
|
27
|
+
version_requirements: !ruby/object:Gem::Requirement
|
28
|
+
requirements:
|
29
|
+
- - ">="
|
30
|
+
- !ruby/object:Gem::Version
|
31
|
+
version: "0"
|
32
|
+
version:
|
33
|
+
description: A non-threaded spider bot that spiders a site with response time stats. easily extendable
|
34
|
+
email: ssoroka78@gmail.com
|
35
|
+
executables:
|
36
|
+
- spider_bot
|
37
|
+
extensions: []
|
38
|
+
|
39
|
+
extra_rdoc_files:
|
40
|
+
- bin/spider_bot
|
41
|
+
- lib/spider_bot.rb
|
42
|
+
- README.rdoc
|
43
|
+
files:
|
44
|
+
- bin/spider_bot
|
45
|
+
- lib/spider_bot.rb
|
46
|
+
- Rakefile
|
47
|
+
- README.rdoc
|
48
|
+
- spider_bot.gemspec
|
49
|
+
- Manifest
|
50
|
+
has_rdoc: true
|
51
|
+
homepage: http://github.com/ssoroka/spider_bot
|
52
|
+
post_install_message:
|
53
|
+
rdoc_options:
|
54
|
+
- --line-numbers
|
55
|
+
- --inline-source
|
56
|
+
- --title
|
57
|
+
- Spider_bot
|
58
|
+
- --main
|
59
|
+
- README.rdoc
|
60
|
+
require_paths:
|
61
|
+
- lib
|
62
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
63
|
+
requirements:
|
64
|
+
- - ">="
|
65
|
+
- !ruby/object:Gem::Version
|
66
|
+
version: "0"
|
67
|
+
version:
|
68
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
69
|
+
requirements:
|
70
|
+
- - ">="
|
71
|
+
- !ruby/object:Gem::Version
|
72
|
+
version: "1.2"
|
73
|
+
version:
|
74
|
+
requirements: []
|
75
|
+
|
76
|
+
rubyforge_project: spider_bot
|
77
|
+
rubygems_version: 1.2.0
|
78
|
+
signing_key:
|
79
|
+
specification_version: 2
|
80
|
+
summary: A non-threaded spider bot that spiders a site with response time stats. easily extendable
|
81
|
+
test_files: []
|
82
|
+
|