ssoroka-spider_bot 0.9.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.rdoc ADDED
@@ -0,0 +1,50 @@
1
+ = Spider bot
2
+
3
+ A non-threaded spider bot that crawls a site and collects response time stats. Easily extendable.
4
+
5
+ == Installation
6
+
7
+ sudo gem install ssoroka-spider_bot
8
+
9
+ == Usage
10
+
11
+ Usable in code or as an executable!
12
+
13
+ spider_bot http://www.example.com
14
+ spider_bot http://0.0.0.0:3000
15
+
16
+ Example as script/spider:
17
+
18
+ #!/usr/bin/env ruby
19
+ require 'rubygems'
20
+ require 'spider_bot'
21
+
22
+ class MySpider < SpiderBot
23
+ # override these for handling events
24
+ def on_page(page)
25
+ end
26
+
27
+ def on_404(link)
28
+ end
29
+
30
+ def on_500(link)
31
+ end
32
+
33
+ # override these for changing how urls are classified as links
34
+ def off_site?(url)
35
+ url !~ /^\// # urls not starting with a /
36
+ end
37
+
38
+ def ignorable?(url)
39
+ url =~ /\/.*\..+/ && # files with extensions
40
+ url !~ /\.html$/ # but not html files
41
+ end
42
+ end
43
+
44
+ spider = MySpider.new(:quiet => false)
45
+ spider.start(ARGV[0])
46
+
47
+ == Gem Requirements
48
+
49
+ * ssoroka-ansi
50
+ * mechanize
data/Rakefile ADDED
@@ -0,0 +1,14 @@
1
require 'rubygems'
require 'rake'
require 'echoe'

# Packaging configuration for the spider_bot gem.
Echoe.new('spider_bot', '0.9.1') do |p|
  p.description = 'A non-threaded spider bot that spiders a site with response time stats. easily extendable'
  p.url = 'http://github.com/ssoroka/spider_bot'
  p.author = 'Steven Soroka'
  p.email = 'ssoroka78@gmail.com'
  p.ignore_pattern = ["tmp/*"]
  # lib/spider_bot.rb requires both libraries when it is loaded, so they must
  # be installed together with the gem rather than only for development.
  p.runtime_dependencies = ['mechanize', 'ssoroka-ansi']
end

# Pull in any extra rake tasks that live next to this Rakefile.
Dir["#{File.dirname(__FILE__)}/tasks/*.rake"].sort.each{|f| load f }
data/bin/spider_bot ADDED
@@ -0,0 +1,6 @@
1
#!/usr/bin/env ruby
# Command-line entry point.
#
# Usage: spider_bot [URL]
require 'rubygems'

# Resolve the library relative to this script rather than the current working
# directory, so the executable works both from a checkout and once installed.
$LOAD_PATH.unshift File.expand_path(File.join(File.dirname(__FILE__), '..', 'lib'))
require 'spider_bot'

spider = SpiderBot.new(:quiet => false)
# ARGV[0] is the first command-line argument (the program name is not part of
# ARGV); when it is missing, SpiderBot#start falls back to its default URL.
spider.start(ARGV[0])
data/lib/spider_bot.rb ADDED
@@ -0,0 +1,118 @@
1
+ require 'mechanize'
2
+ require 'ansi'
3
+ require 'benchmark'
4
+
5
# A single-threaded site crawler built on WWW::Mechanize.
#
# Starting from one URL it follows the anchors found on each fetched page,
# records how long every successful request took, and collects the links
# that answered with a 404 or a 500. Subclass it and override the hook and
# classification methods below to customise its behaviour.
class SpiderBot
  # override these for handling events

  # Called with the fetched page after its links have been queued.
  def on_page(page)
  end

  # Called with the link that answered 404.
  def on_404(link)
  end

  # Called with the link that answered 500.
  def on_500(link)
  end

  # override these for changing how urls are classified as links

  # True for urls that should not be followed because they leave the site.
  def off_site?(url)
    url !~ /^\// # urls not starting with a /
  end

  # Truthy for urls that should be skipped even though they are on-site.
  def ignorable?(url)
    url =~ /\/.*\..+/ && # files with extensions
      url !~ /\.html$/   # but not html files
  end

  ## implementation! :)
  attr_accessor :fourohfour, :fivehundred, :response_times, :been_to, :go_to

  # options:
  #   :agent - user agent string sent with each request (default 'spider bot')
  #   :quiet - when truthy, suppresses progress output and the final report
  def initialize(options = {})
    @agent = WWW::Mechanize.new
    @agent.user_agent = options[:agent] || 'spider bot'
    @been_to = ['/']
    @go_to = []
    @fourohfour = []
    @fivehundred = []
    @response_times = []
    @key = "\n\nKey:\n{#{ANSI.color(:yellow){'queued'}}}/{#{ANSI.color(:green){'hit'}}}/{#{ANSI.color(:red){'404s'}}}/{#{ANSI.color(:red){'500s'}}} {current action}"
    @quiet = options[:quiet]
  end

  # Crawls starting at +url+ (default 'http://0.0.0.0:3000/').
  def start(url = nil)
    @starting_url = url || 'http://0.0.0.0:3000/'
    fetch @starting_url
  end

  # Fetches +link+, then keeps taking links off the front of the queue until
  # it is empty, and finally prints the report.
  #
  # The queue is drained with a loop instead of one nested call per page, so
  # the call stack stays flat no matter how many pages are crawled.
  #
  # An Interrupt (Ctrl-C) during the crawl prints the report and exits.
  def fetch(link)
    begin
      while link
        visit_link(link)
        link = @go_to.shift
      end
    rescue Interrupt
      status "Interrupt caught, shutting down."
      close_up_shop
      exit
    end

    close_up_shop
  end

  # Redraws the progress screen with the message +s+ (no-op when quiet).
  def status(s)
    return if @quiet
    to_go = ANSI.color(:yellow) { @go_to.size.to_s }
    done = ANSI.color(:green) { @been_to.size.to_s }
    fourohfour = ANSI.color(:red) { @fourohfour.size.to_s }
    fivehundred = ANSI.color(:red) { @fivehundred.size.to_s }
    STDOUT.print(ANSI.clear_screen + ANSI.up(100) + ANSI.left(100))
    STDOUT.print("spidering #{@starting_url}..\n")
    STDOUT.print("#{to_go}/#{done}/#{fourohfour}/#{fivehundred}: #{s}")
    STDOUT.print("\n\nNext 15 links:\n#{@go_to[0..14].join("\n")}")
    STDOUT.print(@key)
    STDOUT.flush
  end

  # Prints the final report: broken links, dead links and the five slowest
  # pages (no-op when quiet).
  def close_up_shop
    return if @quiet
    STDOUT.puts "\n\nDone!"
    if @fourohfour.any?
      STDOUT.puts "Here are all your broken links:\n#{@fourohfour.join("\n")}"
    else
      STDOUT.puts "You have no broken links that I could find"
    end
    if @fivehundred.any?
      STDOUT.puts "Here are all your dead 500 links:\n#{@fivehundred.join("\n")}"
    else
      STDOUT.puts "You have no dead (500) pages that I could find"
    end
    STDOUT.puts "\n5 slowest pages: "
    @response_times.sort_by{|link, time|
      -time
    }.first(5).each{|link, time|
      STDOUT.printf "%0.3fs %s\n", time, link
    }
  end

  private

  # Requests a single link, times it, queues the links found on the page and
  # fires the matching event hook.
  def visit_link(link)
    status "fetching #{link}"
    @been_to << link
    page = nil
    elapsed = Benchmark.measure { page = @agent.get(link) }.real
    @response_times << [link, elapsed]
    enqueue_links_from(page)
    on_page(page) # on_page event
  rescue WWW::Mechanize::ResponseCodeError => error
    # Mechanize signals a non-success HTTP status by raising
    # ResponseCodeError; the Net::HTTP* response classes are not exceptions
    # and can never be matched by a rescue clause.
    case error.response_code.to_s
    when '404'
      @fourohfour << link
      status "#{link} not found, 404"
      on_404(link)
    when '500'
      @fivehundred << link
      status ANSI.color(:red) {"#{link} dead! 500 error!"}
      on_500(link)
    else
      raise # any other status is not handled here and propagates to the caller
    end
  end

  # Adds every not-yet-seen, on-site, non-ignorable anchor target of +page+
  # to the end of the queue. Fragments ("#...") are stripped first so the
  # same page is not queued once per in-page anchor.
  def enqueue_links_from(page)
    (page/'a').each do |anchor|
      href = anchor['href']
      next unless href
      url = href.gsub(/\#.*/, '')
      next if @go_to.include?(url) || @been_to.include?(url) || off_site?(url) || ignorable?(url)
      @go_to.push url
    end
  end
end
@@ -0,0 +1,37 @@
1
# Gem specification for spider_bot 0.9.1.
# NOTE(review): the layout looks machine-generated (presumably by the Echoe
# task in the Rakefile) -- confirm before relying on hand edits surviving a
# regeneration.
Gem::Specification.new do |s|
  s.name = %q{spider_bot}
  s.version = "0.9.1"

  # Optional setters are guarded with respond_to? so the file loads on
  # RubyGems versions that do not provide them.
  s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
  s.authors = ["Steven Soroka"]
  s.date = %q{2008-12-04}
  s.default_executable = %q{spider_bot} if s.respond_to? :default_executable=
  s.description = %q{A non-threaded spider bot that spiders a site with response time stats. easily extendable}
  s.email = %q{ssoroka78@gmail.com}
  s.executables = ["spider_bot"]
  s.extra_rdoc_files = ["bin/spider_bot", "lib/spider_bot.rb", "README.rdoc"]
  s.files = ["bin/spider_bot", "lib/spider_bot.rb", "Rakefile", "README.rdoc", "spider_bot.gemspec", "Manifest"]
  s.has_rdoc = true if s.respond_to? :has_rdoc=
  s.homepage = %q{http://github.com/ssoroka/spider_bot}
  s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "Spider_bot", "--main", "README.rdoc"]
  s.require_paths = ["lib"]
  s.rubyforge_project = %q{spider_bot} if s.respond_to? :rubyforge_project=
  s.rubygems_version = %q{1.2.0}
  s.summary = %q{A non-threaded spider bot that spiders a site with response time stats. easily extendable}

  s.specification_version = 2 if s.respond_to? :specification_version=

  # lib/spider_bot.rb requires both libraries when it is loaded, so they are
  # runtime dependencies and must be installed along with the gem.
  s.add_dependency(%q<mechanize>, [">= 0"])
  s.add_dependency(%q<ssoroka-ansi>, [">= 0"])
end
metadata ADDED
@@ -0,0 +1,82 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: ssoroka-spider_bot
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.9.1
5
+ platform: ruby
6
+ authors:
7
+ - Steven Soroka
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2008-12-04 00:00:00 -08:00
13
+ default_executable: spider_bot
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: mechanize
17
+ version_requirement:
18
+ version_requirements: !ruby/object:Gem::Requirement
19
+ requirements:
20
+ - - ">="
21
+ - !ruby/object:Gem::Version
22
+ version: "0"
23
+ version:
24
+ - !ruby/object:Gem::Dependency
25
+ name: ssoroka-ansi
26
+ version_requirement:
27
+ version_requirements: !ruby/object:Gem::Requirement
28
+ requirements:
29
+ - - ">="
30
+ - !ruby/object:Gem::Version
31
+ version: "0"
32
+ version:
33
+ description: A non-threaded spider bot that spiders a site with response time stats. easily extendable
34
+ email: ssoroka78@gmail.com
35
+ executables:
36
+ - spider_bot
37
+ extensions: []
38
+
39
+ extra_rdoc_files:
40
+ - bin/spider_bot
41
+ - lib/spider_bot.rb
42
+ - README.rdoc
43
+ files:
44
+ - bin/spider_bot
45
+ - lib/spider_bot.rb
46
+ - Rakefile
47
+ - README.rdoc
48
+ - spider_bot.gemspec
49
+ - Manifest
50
+ has_rdoc: true
51
+ homepage: http://github.com/ssoroka/spider_bot
52
+ post_install_message:
53
+ rdoc_options:
54
+ - --line-numbers
55
+ - --inline-source
56
+ - --title
57
+ - Spider_bot
58
+ - --main
59
+ - README.rdoc
60
+ require_paths:
61
+ - lib
62
+ required_ruby_version: !ruby/object:Gem::Requirement
63
+ requirements:
64
+ - - ">="
65
+ - !ruby/object:Gem::Version
66
+ version: "0"
67
+ version:
68
+ required_rubygems_version: !ruby/object:Gem::Requirement
69
+ requirements:
70
+ - - ">="
71
+ - !ruby/object:Gem::Version
72
+ version: "1.2"
73
+ version:
74
+ requirements: []
75
+
76
+ rubyforge_project: spider_bot
77
+ rubygems_version: 1.2.0
78
+ signing_key:
79
+ specification_version: 2
80
+ summary: A non-threaded spider bot that spiders a site with response time stats. easily extendable
81
+ test_files: []
82
+