validate-website 0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README ADDED
@@ -0,0 +1,39 @@
+ == validate-website
+
+ == DESCRIPTION
+
+ Web crawler that prints whether each crawled page is valid against its DTD.
+ Compatible with Ruby 1.9.
+
+ == SYNOPSIS
+
+ validate-website --help
+ validate-website -s "http://localhost:4567/" -u "Mozilla 5.0" -f not-well-formed.txt --auth=user,pass -e 'redirect|news'
+
+ == REQUIREMENTS:
+
+ libxml-ruby >= 1.1.3
+
+ == LICENSE
+ (The MIT License)
+
+ Copyright (c) 2009 spk
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ 'Software'), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
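The validity check that bin/validate-website (further down in this diff) runs on every crawled page reduces to a few libxml-ruby calls. A minimal standalone sketch of that check, assuming libxml-ruby is installed and a hypothetical test page is served locally:

    require 'xml'
    require 'open-uri'

    XML.default_validity_checking = true   # check the document against its DTD
    XML.default_load_external_dtd = true   # fetch the DTD referenced by the page

    html = open('http://localhost:4567/').read          # hypothetical local test server
    parser = XML::Parser.string(html)
    last_error = nil
    XML::Error.set_handler { |err| last_error = err }   # collect libxml errors instead of printing them
    begin
      parser.parse
    rescue
      # a failed parse is reported through well_formed? below
    end
    puts "well formed? #{parser.context.well_formed?}"
    puts last_error if last_error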
data/Rakefile ADDED
@@ -0,0 +1,71 @@
+ require 'rake/testtask'
+ require 'rake/packagetask'
+ require 'rake/rdoctask'
+ require 'rake'
+ require 'find'
+
+ # Globals
+
+ PKG_NAME = 'validate-website'
+ PKG_VERSION = '0.1'
+
+ PKG_FILES = ['README', 'Rakefile']
+ Find.find('lib/', 'bin/') do |f|
+   if FileTest.directory?(f) and f =~ /\.svn|\.git/
+     Find.prune
+   else
+     PKG_FILES << f
+   end
+ end
+
+ # Tasks
+
+ task :default => [:clean, :repackage]
+
+ #Rake::TestTask.new do |t|
+ #t.libs << "test"
+ #t.test_files = FileList['test/tc_*.rb']
+ #end
+
+ Rake::RDocTask.new do |rd|
+   f = []
+   require 'find'
+   Find.find('lib/') do |file|
+     # Skip hidden files (.svn/ directories and Vim swapfiles)
+     if file.split(/\//).last =~ /^\./
+       Find.prune
+     else
+       f << file if not FileTest.directory?(file)
+     end
+   end
+   rd.rdoc_files.include(f)
+   rd.options << '--all'
+ end
+
+ Rake::PackageTask.new(PKG_NAME, PKG_VERSION) do |p|
+   p.need_tar = true
+   p.package_files = PKG_FILES
+ end
+
+ # "Gem" part of the Rakefile
+ require 'rake/gempackagetask'
+
+ spec = Gem::Specification.new do |s|
+   s.author = 'spk'
+   s.email = 'spk@tuxfamily.org'
+   s.platform = Gem::Platform::RUBY
+   s.summary = "Web crawler for testing webpage validity"
+   s.name = PKG_NAME
+   s.version = PKG_VERSION
+   s.requirements << 'libxml-ruby'
+   s.require_path = 'lib'
+   s.bindir = 'bin'
+   s.executables << 'validate-website'
+   s.files = PKG_FILES
+   s.description = "Web crawler that print if the page is valid with the dtd"
+ end
+
+ Rake::GemPackageTask.new(spec) do |pkg|
+   pkg.need_zip = true
+   pkg.need_tar = true
+ end
data/bin/validate-website ADDED
@@ -0,0 +1,77 @@
+ #!/usr/bin/env ruby
+ $:.unshift '../lib'
+ require 'spkspider'
+ require 'colorful_messages'
+ require 'open-uri'
+ require 'xml'
+ require 'optparse'
+
+ include ColorfulMessages
+
+ XML.default_validity_checking = true
+ XML.default_load_external_dtd = true
+
+ # default options
+ OPTIONS = {
+   :site => 'http://localhost:3000/',
+   :useragent => '',
+   :exclude => nil,
+   :file => nil,
+   :auth => nil,
+ }
+
+ ARGV.options do |o|
+   script_name = File.basename($0)
+   o.set_summary_indent(' ')
+   o.banner = "Usage: #{script_name} [OPTIONS]"
+   o.define_head "validate website"
+   o.separator ""
+
+   o.on("-s", "--site=val", String,
+        "Default: #{OPTIONS[:site]}") { |v| OPTIONS[:site] = v }
+   o.on("-u", "--useragent=val", String,
+        "Default: #{OPTIONS[:useragent]}") { |v| OPTIONS[:useragent] = v }
+   o.on("-e", "--exclude=val", String,
+        "URL to exclude") { |v| OPTIONS[:exclude] = v }
+   o.on("-f", "--file=val", String,
+        "Save not well-formed URLs to this file") { |v| OPTIONS[:file] = v }
+   o.on("--auth=[user,pass]", Array,
+        "Basic HTTP authentication") { |v| OPTIONS[:auth] = v }
+
+   o.separator ""
+   o.on_tail("-h", "--help", "Show this help message.") { puts o; exit }
+   o.parse!
+ end
+
+ spider = SpkSpider.new(OPTIONS[:site])
+ spider.user_agent = OPTIONS[:useragent]
+ spider.exclude = Regexp.new(OPTIONS[:exclude]) if OPTIONS[:exclude]
+ spider.basic_auth = OPTIONS[:auth]
+
+ if OPTIONS[:file]
+   file = OPTIONS[:file]
+   open(file, 'w').write('') # truncate the report file before crawling
+ end
+
+ spider.crawl do |url, document|
+   begin
+     xp = XML::Parser.string(document)
+     exception = nil
+     XML::Error.set_handler do |error|
+       exception = error
+     end
+
+     doc = xp.parse
+
+     msg = " well formed? %s" % xp.context.well_formed?
+     if xp.context.well_formed?
+       print success(msg)
+     else
+       print error(msg)
+       open(file, 'a').write(url + "\n") if OPTIONS[:file]
+     end
+   rescue
+     print error(" well formed? false") # msg may be unset if parsing raised before it was built
+     open(file, 'a').write(url + "\n") if OPTIONS[:file]
+   end
+ end
data/lib/colorful_messages.rb ADDED
@@ -0,0 +1,30 @@
+ module ColorfulMessages
+
+   # red
+   def error(message)
+     "\033[1;31m#{message}\033[0m"
+   end
+
+   # yellow
+   def warning(message)
+     "\033[1;33m#{message}\033[0m"
+   end
+
+   # green
+   def success(message)
+     "\033[1;32m#{message}\033[0m"
+   end
+
+   alias_method :message, :success
+
+   # magenta
+   def note(message)
+     "\033[1;35m#{message}\033[0m"
+   end
+
+   # blue
+   def info(message)
+     "\033[1;34m#{message}\033[0m"
+   end
+
+ end
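ColorfulMessages only wraps a message in ANSI escape sequences (1;31 red, 1;33 yellow, 1;32 green, 1;35 magenta, 1;34 blue) and resets with \033[0m; printing is left to the caller, as bin/validate-website does with print. A small usage sketch, assuming lib/ is on the load path:

    $:.unshift 'lib'
    require 'colorful_messages'
    include ColorfulMessages

    puts success('well formed')        # green
    puts warning('redirect followed')  # yellow
    puts error('not well formed')      # red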
data/lib/spkspider.rb ADDED
@@ -0,0 +1,147 @@
+ # encoding: utf-8
+ require 'open-uri'
+ # SpkSpider is a simple Ruby web crawler
+
+ class SpkSpider
+   VERSION = '0.0.5'
+
+   attr_accessor :links_to_visit, :site, :user_agent, :basic_auth
+   attr_accessor :parser, :exclude
+   attr_reader :visited_links, :external_links, :errors
+
+   # initialize takes the site to crawl as its argument
+   def initialize(site)
+     puts "SpkSpider #{VERSION} initializing..."
+     @site = URI.parse(site) || raise("You didn't give me a site to crawl")
+     @user_agent = "SpkSpr/#{VERSION}"
+     @links_to_visit = Array.new
+     @visited_links = Array.new
+     @external_links = Array.new
+     @errors = Hash.new
+     @links_to_visit << site
+     @parser = 'xml'
+     puts "Ready to crawl"
+   end
+
+   def init_xml_parser(doc)
+     require 'xml'
+     xp = XML::HTMLParser.string(doc, {:options => XML::HTMLParser::Options::RECOVER | XML::HTMLParser::Options::NOERROR | XML::HTMLParser::Options::NOWARNING })
+     XML::Error.set_handler do |error|
+       exception = error
+     end
+     document = xp.parse
+     links = document.find("//a[@href]")
+   end
+
+   def fetch_links(doc)
+     case @parser
+     when 'xml'
+       init_xml_parser(doc)
+     when 'hpricot'
+       require 'hpricot'
+       Hpricot.buffer_size = 204800
+       Hpricot(doc).search("//a[@href]")
+     else
+       init_xml_parser(doc)
+     end
+   rescue
+     init_xml_parser(doc)
+   end
+
+   # download the document
+   def fetch_html(url)
+     uri = URI.parse(url)
+     print "Visiting: #{url}"
+     begin
+       @document = uri.read('User-Agent' => @user_agent, 'Referer' => url, :http_basic_authentication => @basic_auth)
+     rescue
+       # OpenURI::HTTPError
+     end
+     @visited_links << url
+     @document
+   end
+
+   # read the document and extract its urls
+   def read_document(document, url)
+     if document
+       case document.content_type
+       when "text/html"
+         link_extractor(document, url)
+       else
+         print " ... not text/html, skipping ..."
+       end
+     else
+       print " ... document does not exist, skipping ..."
+     end
+   end
+
+   # extract the links and make relative ones absolute
+   def link_extractor(document, document_url)
+     links = fetch_links(document)
+     links.each do |link|
+       href = link.attributes['href']
+       if href && href.length > 0 && (@exclude && !href.match(@exclude) || @exclude.nil?)
+         begin
+           url = href
+           uri = URI.parse(url)
+           document_uri = URI.parse(document_url)
+         rescue
+           #print " #{url} skip this link"
+           next
+         end
+       else
+         #print " skip this link"
+         next
+       end
+
+       # Derelativize links if necessary
+       if uri.relative?
+         url = document_uri.merge(url).to_s if url[0,1] == '?'
+         url = @site.merge(url).to_s
+         uri = URI.parse(url)
+       end
+
+       # skip anchor links
+       if url.include?('#')
+         #print '... Anchor link found, skipping ...'
+         next
+       end
+
+       # Check the domain: keep links on the same host, record the rest as external
+       if uri.host != @site.host
+         @external_links << url
+         @external_links.uniq!
+         next
+       end
+
+       # Find out if we've seen this link already
+       if (@visited_links.include? url) || (@links_to_visit.include? url)
+         next
+       end
+
+       @links_to_visit << url
+     end
+   end
+
+   # launch the crawl; yields each url and its document when a block is given
+   def crawl
+     while !@links_to_visit.empty?
+       # get the first element of links_to_visit
+       url = @links_to_visit.shift
+       document = fetch_html(url)
+       read_document(document, url)
+       if block_given?
+         yield(url, document)
+       end
+       puts ' done!'
+     end
+   end
+ end
+
+ if __FILE__ == $0
+   site = 'http://localhost:4567/'
+   site = ARGV[0] if ARGV[0]
+   spider = SpkSpider.new(site)
+   spider.user_agent = ''
+   spider.crawl
+ end
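SpkSpider exposes its link parser through the parser accessor: 'xml' (the default) uses libxml's HTML parser, 'hpricot' switches fetch_links to Hpricot, and any parse failure falls back to libxml. A short sketch of driving the crawler programmatically with the Hpricot backend, assuming the hpricot gem is installed and a hypothetical site runs locally:

    $:.unshift 'lib'
    require 'spkspider'

    spider = SpkSpider.new('http://localhost:4567/')  # hypothetical local site
    spider.parser = 'hpricot'                         # switch link extraction to Hpricot
    spider.exclude = /logout|\.pdf$/                  # hrefs matching this pattern are skipped
    spider.crawl do |url, document|
      size = document ? document.length : 0
      puts "#{url} (#{size} bytes)"
    end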
metadata ADDED
@@ -0,0 +1,59 @@
+ --- !ruby/object:Gem::Specification
+ name: validate-website
+ version: !ruby/object:Gem::Version
+   version: "0.1"
+ platform: ruby
+ authors:
+ - spk
+ autorequire:
+ bindir: bin
+ cert_chain: []
+
+ date: 2009-10-24 00:00:00 +02:00
+ default_executable:
+ dependencies: []
+
+ description: Web crawler that print if the page is valid with the dtd
+ email: spk@tuxfamily.org
+ executables:
+ - validate-website
+ extensions: []
+
+ extra_rdoc_files: []
+
+ files:
+ - README
+ - Rakefile
+ - lib/colorful_messages.rb
+ - lib/spkspider.rb
+ - bin/validate-website
+ has_rdoc: true
+ homepage:
+ licenses: []
+
+ post_install_message:
+ rdoc_options: []
+
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: "0"
+   version:
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: "0"
+   version:
+ requirements:
+ - libxml-ruby
+ rubyforge_project:
+ rubygems_version: 1.3.5
+ signing_key:
+ specification_version: 3
+ summary: Web crawler for testing webpage validity
+ test_files: []
+