krawler 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore ADDED
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in krawler.gemspec
4
+ gemspec
data/LICENSE ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2012 Mike Evans
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,29 @@
1
+ # Krawler
2
+
3
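+ Krawler crawls a site starting from a base URL, follows same-site links, and reports bad links (error responses) and suspect links (timeouts).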
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ gem 'krawler'
10
+
11
+ And then execute:
12
+
13
+ $ bundle
14
+
15
+ Or install it yourself as:
16
+
17
+ $ gem install krawler
18
+
19
+ ## Usage
20
+
21
+ Run `krawl URL` to crawl a site; when no URL is given it defaults to `http://localhost:3000/`.
22
+
23
+ ## Contributing
24
+
25
+ 1. Fork it
26
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
27
+ 3. Commit your changes (`git commit -am 'Added some feature'`)
28
+ 4. Push to the branch (`git push origin my-new-feature`)
29
+ 5. Create new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,2 @@
1
+ #!/usr/bin/env rake
2
+ require 'bundler/gem_tasks'
data/bin/krawl ADDED
@@ -0,0 +1,4 @@
1
#!/usr/bin/env ruby
# Command-line entry point for krawler.
#
# Crawls the URL passed as the first argument; when no argument is given it
# falls back to http://localhost:3000/.
require 'krawler'

start_url = ARGV.first || 'http://localhost:3000/'
Krawler::Base.new(start_url).base
data/krawler.gemspec ADDED
@@ -0,0 +1,20 @@
1
+ # -*- encoding: utf-8 -*-
2
+ require File.expand_path('../lib/krawler/version', __FILE__)
3
+
4
# Packaging metadata for the krawler gem.
Gem::Specification.new do |gem|
  gem.authors       = ["Mike Evans"]
  gem.email         = ["mike@urlgonomics.com"]
  gem.description   = %q{Simple little rake task to crawl a site.}
  # A non-empty summary avoids the "no summary specified" build warning.
  gem.summary       = %q{Crawls a site and reports bad and suspect (timed-out) links.}
  # NOTE(review): no project URL is visible here; fill in once one exists.
  gem.homepage      = ""
  # Matches the MIT text shipped in the LICENSE file.
  gem.license       = "MIT"

  gem.add_dependency 'mechanize', '~> 2.5.0'
  gem.rubyforge_project = 'krawler'

  # NUL-separated output keeps file names containing whitespace intact.
  # Splitting on $\ (nil by default) splits on any whitespace instead.
  gem.files         = `git ls-files -z`.split("\x0")
  gem.executables   = gem.files.grep(%r{^bin/}).map { |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.name          = "krawler"
  gem.require_paths = ["lib"]
  gem.version       = Krawler::VERSION
end
@@ -0,0 +1,3 @@
1
module Krawler
  # Current gem version; frozen so the shared string cannot be mutated.
  VERSION = "0.0.1".freeze
end
data/lib/krawler.rb ADDED
@@ -0,0 +1,65 @@
1
+ require 'krawler/version'
2
+ require 'mechanize'
3
+
4
module Krawler

  # Crawls a site starting from a base URL, following links that stay on the
  # same site, and prints a summary of bad and suspect links at the end.
  class Base

    # Responses slower than this many seconds are reported as slow pages.
    SLOW_PAGE_SECONDS = 7.0

    # @param url [String] the URL to start from; absolute links are followed
    #   only when they start with this string
    def initialize(url)
      @base           = url
      @agent          = Mechanize.new
      @links_to_crawl = [@base]
      @crawled_links  = []
      @bad_links      = []
      @suspect_links  = []
    end

    # Crawls every queued link until the queue is empty, then prints the
    # number of good links followed by the bad and suspect links.
    def base
      puts "Crawling #{@base}"

      crawl_page(@links_to_crawl.pop) until @links_to_crawl.empty?

      # Every visited link lands in @crawled_links, including the ones that
      # failed or timed out, so those are subtracted to count the good ones.
      good_links = @crawled_links.size - @bad_links.size - @suspect_links.size
      puts "#{good_links} total Good Links"

      puts "Bad Links:"
      @bad_links.each { |link| puts link }

      puts "Suspect Links:"
      @suspect_links.each { |link| puts link }
    end

    # Fetches one link, records the outcome and queues any new same-site
    # links found on the page.
    #
    # @param link [String] absolute URL or site-relative path to fetch
    def crawl_page(link)
      @crawled_links << link
      puts link

      start = Time.now
      begin
        page = @agent.get(link)
      rescue Mechanize::ResponseCodeError => e
        puts e
        @bad_links << link
        return
      rescue Timeout::Error
        puts "SLOW PAGE, timeout at #{Time.now - start} seconds"
        @suspect_links << link
        return
      end

      elapsed = Time.now - start
      puts "SLOW PAGE, #{elapsed} seconds" if elapsed > SLOW_PAGE_SECONDS

      # Responses without a links collection have nothing to follow.
      return unless page.respond_to?(:links)

      page.links.each do |new_link|
        href = new_link.href
        next unless crawlable?(href)
        # Skip links already visited or already waiting in the queue so each
        # one is fetched at most once.
        next if @crawled_links.include?(href) || @links_to_crawl.include?(href)

        @links_to_crawl << href
      end
    end

    private

    # True when href stays on the crawled site: it either starts with the
    # base URL or is a site-relative path. Protocol-relative links
    # ("//host/path") point at another host and are rejected.
    def crawlable?(href)
      return false unless href

      href.start_with?(@base) ||
        (href.start_with?('/') && !href.start_with?('//'))
    end
  end
end
@@ -0,0 +1,9 @@
1
namespace :crawl do

  # Crawls the site given by the URL environment variable, defaulting to
  # http://localhost:3000. This file does not load krawler itself, so
  # Krawler::Base must already be loaded when the task runs.
  desc "Crawl a site looking for errors"
  task :base do
    url = ENV['URL'] || 'http://localhost:3000'
    # The crawler class is Krawler::Base; a bare `Crawler` constant is not
    # defined anywhere in the gem and raised NameError.
    Krawler::Base.new(url).base
  end

end
metadata ADDED
@@ -0,0 +1,67 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: krawler
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Mike Evans
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-05-10 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: mechanize
16
+ requirement: &70271191131080 !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ~>
20
+ - !ruby/object:Gem::Version
21
+ version: 2.5.0
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: *70271191131080
25
+ description: Simple little rake task to crawl a site.
26
+ email:
27
+ - mike@urlgonomics.com
28
+ executables:
29
+ - krawl
30
+ extensions: []
31
+ extra_rdoc_files: []
32
+ files:
33
+ - .gitignore
34
+ - Gemfile
35
+ - LICENSE
36
+ - README.md
37
+ - Rakefile
38
+ - bin/krawl
39
+ - krawler.gemspec
40
+ - lib/krawler.rb
41
+ - lib/krawler/version.rb
42
+ - tasks/krawler.rake
43
+ homepage: ''
44
+ licenses: []
45
+ post_install_message:
46
+ rdoc_options: []
47
+ require_paths:
48
+ - lib
49
+ required_ruby_version: !ruby/object:Gem::Requirement
50
+ none: false
51
+ requirements:
52
+ - - ! '>='
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ required_rubygems_version: !ruby/object:Gem::Requirement
56
+ none: false
57
+ requirements:
58
+ - - ! '>='
59
+ - !ruby/object:Gem::Version
60
+ version: '0'
61
+ requirements: []
62
+ rubyforge_project: krawler
63
+ rubygems_version: 1.8.17
64
+ signing_key:
65
+ specification_version: 3
66
+ summary: ''
67
+ test_files: []