validate-website 0.1

data/README ADDED
@@ -0,0 +1,39 @@
+ == validate-website
+
+ == DESCRIPTION
+
+ Web crawler that prints whether each page is valid against its DTD.
+ Compatible with Ruby 1.9.
+
+ == SYNOPSIS
+
+   validate-website --help
+   validate-website -s "http://localhost:4567/" -u "Mozilla 5.0" -f not-well-formed.txt --auth=user,pass -e 'redirect|news'
+
+ == REQUIREMENTS
+
+ libxml-ruby >= 1.1.3
+
+ == LICENSE
+ (The MIT License)
+
+ Copyright (c) 2009 spk
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ 'Software'), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/Rakefile ADDED
@@ -0,0 +1,71 @@
+ require 'rake/testtask'
+ require 'rake/packagetask'
+ require 'rake/rdoctask'
+ require 'rake'
+ require 'find'
+
+ # Globals
+
+ PKG_NAME = 'validate-website'
+ PKG_VERSION = '0.1'
+
+ PKG_FILES = ['README', 'Rakefile']
+ Find.find('lib/', 'bin/') do |f|
+   if FileTest.directory?(f) and f =~ /\.svn|\.git/
+     Find.prune
+   else
+     PKG_FILES << f
+   end
+ end
+
+ # Tasks
+
+ task :default => [:clean, :repackage]
+
+ #Rake::TestTask.new do |t|
+ #  t.libs << "test"
+ #  t.test_files = FileList['test/tc_*.rb']
+ #end
+
+ Rake::RDocTask.new do |rd|
+   f = []
+   require 'find'
+   Find.find('lib/') do |file|
+     # Skip hidden files (.svn/ directories and Vim swapfiles)
+     if file.split(/\//).last =~ /^\./
+       Find.prune
+     else
+       f << file if not FileTest.directory?(file)
+     end
+   end
+   rd.rdoc_files.include(f)
+   rd.options << '--all'
+ end
+
+ Rake::PackageTask.new(PKG_NAME, PKG_VERSION) do |p|
+   p.need_tar = true
+   p.package_files = PKG_FILES
+ end
+
+ # "Gem" part of the Rakefile
+ require 'rake/gempackagetask'
+
+ spec = Gem::Specification.new do |s|
+   s.author = 'spk'
+   s.email = 'spk@tuxfamily.org'
+   s.platform = Gem::Platform::RUBY
+   s.summary = "Web crawler for testing webpage validity"
+   s.name = PKG_NAME
+   s.version = PKG_VERSION
+   s.requirements << 'libxml-ruby'
+   s.require_path = 'lib'
+   s.bindir = 'bin'
+   s.executables << 'validate-website'
+   s.files = PKG_FILES
+   s.description = "Web crawler that print if the page is valid with the dtd"
+ end
+
+ Rake::GemPackageTask.new(spec) do |pkg|
+   pkg.need_zip = true
+   pkg.need_tar = true
+ end
data/bin/validate-website ADDED
@@ -0,0 +1,80 @@
+ #!/usr/bin/env ruby
+ $:.unshift File.expand_path('../lib', File.dirname(__FILE__))
+ require 'spkspider'
+ require 'colorful_messages'
+ require 'open-uri'
+ require 'xml'
+ require 'optparse'
+
+ include ColorfulMessages
+
+ XML.default_validity_checking = true
+ XML.default_load_external_dtd = true
+
+ # default options
+ OPTIONS = {
+   :site => 'http://localhost:3000/',
+   :useragent => '',
+   :exclude => nil,
+   :file => nil,
+   :auth => nil,
+ }
+
+ ARGV.options do |o|
+   script_name = File.basename($0)
+   o.set_summary_indent(' ')
+   o.banner = "Usage: #{script_name} [OPTIONS]"
+   o.define_head "validate website"
+   o.separator ""
+
+   o.on("-s", "--site=val", String,
+        "Default: #{OPTIONS[:site]}") { |v| OPTIONS[:site] = v }
+   o.on("-u", "--useragent=val", String,
+        "Default: #{OPTIONS[:useragent]}") { |v| OPTIONS[:useragent] = v }
+   o.on("-e", "--exclude=val", String,
+        "Exclude URLs matching this pattern") { |v| OPTIONS[:exclude] = v }
+   o.on("-f", "--file=val", String,
+        "Save not-well-formed URLs to this file") { |v| OPTIONS[:file] = v }
+   o.on("--auth=[user,pass]", Array,
+        "Basic HTTP authentication") { |v| OPTIONS[:auth] = v }
+
+   o.separator ""
+   o.on_tail("-h", "--help", "Show this help message.") { puts o; exit }
+   o.parse!
+ end
+
+ spider = SpkSpider.new(OPTIONS[:site])
+ spider.user_agent = OPTIONS[:useragent]
+ spider.exclude = Regexp.new(OPTIONS[:exclude]) if OPTIONS[:exclude]
+ spider.basic_auth = OPTIONS[:auth]
+
+ if OPTIONS[:file]
+   file = OPTIONS[:file]
+   # truncate the report file before crawling
+   File.open(file, 'w') {}
+ end
+
+ spider.crawl do |url, document|
+   begin
+     xp = XML::Parser.string(document)
+     # capture libxml errors instead of letting them print to stderr
+     exception = nil
+     XML::Error.set_handler do |error|
+       exception = error
+     end
+
+     # parse to populate the validation context
+     xp.parse
+
+     msg = " well formed? %s" % xp.context.well_formed?
+     if xp.context.well_formed?
+       print success(msg)
+     else
+       print error(msg)
+       File.open(OPTIONS[:file], 'a') { |f| f.puts(url) } if OPTIONS[:file]
+     end
+   rescue => e
+     print error(" parse error: #{e.message}")
+     File.open(OPTIONS[:file], 'a') { |f| f.puts(url) } if OPTIONS[:file]
+   end
+ end
data/lib/colorful_messages.rb ADDED
@@ -0,0 +1,30 @@
+ module ColorfulMessages
+
+   # red
+   def error(message)
+     "\033[1;31m#{message}\033[0m"
+   end
+
+   # yellow
+   def warning(message)
+     "\033[1;33m#{message}\033[0m"
+   end
+
+   # green
+   def success(message)
+     "\033[1;32m#{message}\033[0m"
+   end
+
+   alias_method :message, :success
+
+   # magenta
+   def note(message)
+     "\033[1;35m#{message}\033[0m"
+   end
+
+   # blue
+   def info(message)
+     "\033[1;34m#{message}\033[0m"
+   end
+
+ end
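
The helpers above return ANSI-escaped strings rather than printing them, which is how bin/validate-website composes its colored output with print. A minimal usage sketch, assuming lib/ is on the load path and a terminal that understands ANSI colors:

    require 'colorful_messages'

    include ColorfulMessages

    puts success('200 OK')            # green
    puts warning('redirected')        # yellow
    puts error('not well formed')     # red
    puts note('crawl finished')       # magenta
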
data/lib/spkspider.rb ADDED
@@ -0,0 +1,148 @@
+ # encoding: utf-8
+ require 'open-uri'
+ # SpkSpider is a simple Ruby web crawler
+
+ class SpkSpider
+   VERSION = '0.0.5'
+
+   attr_accessor :links_to_visit, :site, :user_agent, :basic_auth
+   attr_accessor :parser, :exclude
+   attr_reader :visited_links, :external_links, :errors
+
+   # Takes the site to crawl as an argument
+   def initialize(site)
+     puts "SpkSpider #{VERSION} initializing..."
+     @site = URI.parse(site) || raise("You didn't give me a site to crawl")
+     @user_agent = "SpkSpr/#{VERSION}"
+     @links_to_visit = Array.new
+     @visited_links = Array.new
+     @external_links = Array.new
+     @errors = Hash.new
+     @links_to_visit << site
+     @parser = 'xml'
+     puts "Ready to crawl"
+   end
+
+   def init_xml_parser(doc)
+     require 'xml'
+     xp = XML::HTMLParser.string(doc, {:options => XML::HTMLParser::Options::RECOVER | XML::HTMLParser::Options::NOERROR | XML::HTMLParser::Options::NOWARNING })
+     XML::Error.set_handler do |error|
+       exception = error
+     end
+     document = xp.parse
+     links = document.find("//a[@href]")
+   end
+
+   def fetch_links(doc)
+     case @parser
+     when 'xml'
+       init_xml_parser(doc)
+     when 'hpricot'
+       require 'hpricot'
+       Hpricot.buffer_size = 204800
+       Hpricot(doc).search("//a[@href]")
+     else
+       init_xml_parser(doc)
+     end
+   rescue
+     init_xml_parser(doc)
+   end
+
+   # Download the document
+   def fetch_html(url)
+     uri = URI.parse(url)
+     print "Visiting: #{url}"
+     @document = nil
+     begin
+       @document = uri.read('User-Agent' => @user_agent, 'Referer' => url, :http_basic_authentication => @basic_auth)
+     rescue
+       # OpenURI::HTTPError and friends: leave @document as nil
+     end
+     @visited_links << url
+     @document
+   end
+
+   # Read the document and extract the URLs
+   def read_document(document, url)
+     if document
+       case document.content_type
+       when "text/html"
+         link_extractor(document, url)
+       else
+         print " ... not text/html, skipping ..."
+       end
+     else
+       print " ... document does not exist, skipping ..."
+     end
+   end
+
+   # Extract the links and resolve relative URLs
+   def link_extractor(document, document_url)
+     links = fetch_links(document)
+     links.each do |link|
+       href = link.attributes['href']
+       if href && href.length > 0 && (@exclude && !href.match(@exclude) || @exclude.nil?)
+         begin
+           url = href
+           uri = URI.parse(url)
+           document_uri = URI.parse(document_url)
+         rescue
+           #print " #{url} skip this link"
+           next
+         end
+       else
+         #print " skip this link"
+         next
+       end
+
+       # Make relative links absolute if necessary
+       if uri.relative?
+         url = document_uri.merge(url).to_s if url[0,1] == '?'
+         url = @site.merge(url).to_s
+         uri = URI.parse(url)
+       end
+
+       # Skip anchor links
+       if url.include?('#')
+         #print '... Anchor link found, skipping ...'
+         next
+       end
+
+       # Keep links on the same host, record the others as external
+       if uri.host != @site.host
+         @external_links << url
+         @external_links.uniq!
+         next
+       end
+
+       # Find out if we've seen this link already
+       if (@visited_links.include? url) || (@links_to_visit.include? url)
+         next
+       end
+
+       @links_to_visit << url
+     end
+   end
+
+   # Launch the crawl
+   def crawl
+     while !@links_to_visit.empty?
+       # get the first element of the links_to_visit
+       url = @links_to_visit.shift
+       document = fetch_html(url)
+       read_document(document, url)
+       if block_given?
+         yield(url, document)
+       end
+       puts ' done!'
+     end
+   end
+ end
+
+ if __FILE__ == $0
+   site = 'http://localhost:4567/'
+   site = ARGV[0] if ARGV[0]
+   spider = SpkSpider.new(site)
+   spider.user_agent = ''
+   spider.crawl
+ end
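
SpkSpider can also be driven from Ruby code: crawl yields each visited URL together with the raw response body (nil when the fetch failed), which is the hook bin/validate-website uses for its libxml check. A minimal sketch, assuming lib/ is on the load path; the site URL, user agent and exclude pattern below are only examples:

    require 'spkspider'

    spider = SpkSpider.new('http://localhost:4567/')
    spider.user_agent = 'MyBot/1.0'
    spider.exclude = Regexp.new('logout|admin')   # skip links whose href matches

    spider.crawl do |url, body|
      puts " fetched #{body.length} bytes" if body
    end
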
metadata ADDED
@@ -0,0 +1,59 @@
+ --- !ruby/object:Gem::Specification
+ name: validate-website
+ version: !ruby/object:Gem::Version
+   version: "0.1"
+ platform: ruby
+ authors:
+ - spk
+ autorequire:
+ bindir: bin
+ cert_chain: []
+
+ date: 2009-10-24 00:00:00 +02:00
+ default_executable:
+ dependencies: []
+
+ description: Web crawler that print if the page is valid with the dtd
+ email: spk@tuxfamily.org
+ executables:
+ - validate-website
+ extensions: []
+
+ extra_rdoc_files: []
+
+ files:
+ - README
+ - Rakefile
+ - lib/colorful_messages.rb
+ - lib/spkspider.rb
+ - bin/validate-website
+ has_rdoc: true
+ homepage:
+ licenses: []
+
+ post_install_message:
+ rdoc_options: []
+
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: "0"
+   version:
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: "0"
+   version:
+ requirements:
+ - libxml-ruby
+ rubyforge_project:
+ rubygems_version: 1.3.5
+ signing_key:
+ specification_version: 3
+ summary: Web crawler for testing webpage validity
+ test_files: []
+