validate-website 0.1 → 0.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (93):
  1. data/README +15 -4
  2. data/Rakefile +6 -4
  3. data/bin/validate-website +33 -36
  4. data/lib/colorful_messages.rb +7 -10
  5. data/lib/validator.rb +37 -0
  6. data/lib/xhtml/xframes-1.xsd +166 -0
  7. data/lib/xhtml/xhtml-access-1.xsd +43 -0
  8. data/lib/xhtml/xhtml-applet-1.xsd +66 -0
  9. data/lib/xhtml/xhtml-attribs-1.xsd +67 -0
  10. data/lib/xhtml/xhtml-base-1.xsd +31 -0
  11. data/lib/xhtml/xhtml-basic-form-1.xsd +195 -0
  12. data/lib/xhtml/xhtml-basic-table-1.xsd +169 -0
  13. data/lib/xhtml/xhtml-basic10-model-1.xsd +385 -0
  14. data/lib/xhtml/xhtml-basic10-module-redefines-1.xsd +61 -0
  15. data/lib/xhtml/xhtml-basic10-modules-1.xsd +233 -0
  16. data/lib/xhtml/xhtml-basic10.xsd +99 -0
  17. data/lib/xhtml/xhtml-basic11-model-1.xsd +622 -0
  18. data/lib/xhtml/xhtml-basic11-modules-1.xsd +508 -0
  19. data/lib/xhtml/xhtml-basic11.xsd +105 -0
  20. data/lib/xhtml/xhtml-bdo-1.xsd +72 -0
  21. data/lib/xhtml/xhtml-blkphras-1.xsd +155 -0
  22. data/lib/xhtml/xhtml-blkpres-1.xsd +32 -0
  23. data/lib/xhtml/xhtml-blkstruct-1.xsd +44 -0
  24. data/lib/xhtml/xhtml-charent-1.xsd +38 -0
  25. data/lib/xhtml/xhtml-copyright-1.xsd +29 -0
  26. data/lib/xhtml/xhtml-csismap-1.xsd +91 -0
  27. data/lib/xhtml/xhtml-datatypes-1.xsd +177 -0
  28. data/lib/xhtml/xhtml-edit-1.xsd +34 -0
  29. data/lib/xhtml/xhtml-events-1.xsd +130 -0
  30. data/lib/xhtml/xhtml-form-1.xsd +321 -0
  31. data/lib/xhtml/xhtml-frames-1.xsd +113 -0
  32. data/lib/xhtml/xhtml-framework-1.xsd +62 -0
  33. data/lib/xhtml/xhtml-hypertext-1.xsd +47 -0
  34. data/lib/xhtml/xhtml-iframe-1.xsd +68 -0
  35. data/lib/xhtml/xhtml-image-1.xsd +40 -0
  36. data/lib/xhtml/xhtml-inlphras-1.xsd +158 -0
  37. data/lib/xhtml/xhtml-inlpres-1.xsd +34 -0
  38. data/lib/xhtml/xhtml-inlstruct-1.xsd +45 -0
  39. data/lib/xhtml/xhtml-inlstyle-1.xsd +22 -0
  40. data/lib/xhtml/xhtml-inputmode-1.xsd +35 -0
  41. data/lib/xhtml/xhtml-lat1.ent +196 -0
  42. data/lib/xhtml/xhtml-legacy-1.xsd +97 -0
  43. data/lib/xhtml/xhtml-link-1.xsd +45 -0
  44. data/lib/xhtml/xhtml-list-1.xsd +94 -0
  45. data/lib/xhtml/xhtml-meta-1.xsd +54 -0
  46. data/lib/xhtml/xhtml-metaAttributes-1.xsd +39 -0
  47. data/lib/xhtml/xhtml-misc-1.xsd +441 -0
  48. data/lib/xhtml/xhtml-nameident-1.xsd +63 -0
  49. data/lib/xhtml/xhtml-notations-1.xsd +69 -0
  50. data/lib/xhtml/xhtml-object-1.xsd +71 -0
  51. data/lib/xhtml/xhtml-param-1.xsd +46 -0
  52. data/lib/xhtml/xhtml-pres-1.xsd +46 -0
  53. data/lib/xhtml/xhtml-print-1.xsd +85 -0
  54. data/lib/xhtml/xhtml-print-model-1.xsd +604 -0
  55. data/lib/xhtml/xhtml-print-modules-1.xsd +422 -0
  56. data/lib/xhtml/xhtml-rdfa-1.dtd +438 -0
  57. data/lib/xhtml/xhtml-rdfa-1.xsd +116 -0
  58. data/lib/xhtml/xhtml-rdfa-model-1.xsd +461 -0
  59. data/lib/xhtml/xhtml-rdfa-modules-1.xsd +548 -0
  60. data/lib/xhtml/xhtml-ruby-1.xsd +170 -0
  61. data/lib/xhtml/xhtml-ruby-basic-1.xsd +84 -0
  62. data/lib/xhtml/xhtml-script-1.xsd +65 -0
  63. data/lib/xhtml/xhtml-special.ent +80 -0
  64. data/lib/xhtml/xhtml-ssismap-1.xsd +38 -0
  65. data/lib/xhtml/xhtml-struct-1.xsd +85 -0
  66. data/lib/xhtml/xhtml-style-1.xsd +47 -0
  67. data/lib/xhtml/xhtml-symbol.ent +237 -0
  68. data/lib/xhtml/xhtml-table-1.xsd +267 -0
  69. data/lib/xhtml/xhtml-target-1.xsd +49 -0
  70. data/lib/xhtml/xhtml-text-1.xsd +62 -0
  71. data/lib/xhtml/xhtml1-frameset.dtd +1235 -0
  72. data/lib/xhtml/xhtml1-frameset.xsd +2847 -0
  73. data/lib/xhtml/xhtml1-strict.dtd +978 -0
  74. data/lib/xhtml/xhtml1-strict.xsd +2211 -0
  75. data/lib/xhtml/xhtml1-transitional.dtd +1201 -0
  76. data/lib/xhtml/xhtml1-transitional.xsd +2755 -0
  77. data/lib/xhtml/xhtml11-model-1.xsd +715 -0
  78. data/lib/xhtml/xhtml11-module-redefines-1.xsd +335 -0
  79. data/lib/xhtml/xhtml11-modules-1.xsd +605 -0
  80. data/lib/xhtml/xhtml11.xsd +107 -0
  81. data/lib/xhtml/xhtml2.xsd +21 -0
  82. data/lib/xhtml/xml-events-1.xsd +73 -0
  83. data/lib/xhtml/xml-events-2.xsd +73 -0
  84. data/lib/xhtml/xml-events-attribs-1.xsd +73 -0
  85. data/lib/xhtml/xml-events-attribs-2.xsd +75 -0
  86. data/lib/xhtml/xml-events-copyright-1.xsd +34 -0
  87. data/lib/xhtml/xml-events-copyright-2.xsd +34 -0
  88. data/lib/xhtml/xml-handlers-1.xsd +136 -0
  89. data/lib/xhtml/xml-handlers-2.xsd +98 -0
  90. data/lib/xhtml/xml-script-1.xsd +38 -0
  91. data/lib/xhtml/xml.xsd +286 -0
  92. metadata +114 -8
  93. data/lib/spkspider.rb +0 -147
data/lib/spkspider.rb DELETED
@@ -1,147 +0,0 @@
1
- # encoding: utf-8
2
- require 'open-uri'
3
- # SpkSpider is a ruby crawler
4
-
5
- class SpkSpider
6
- VERSION = '0.0.5'
7
-
8
- attr_accessor :links_to_visit, :site, :user_agent, :basic_auth
9
- attr_accessor :parser, :exclude
10
- attr_reader :visited_links, :external_links, :errors
11
-
12
- # initialize method take the site to crawl in argument
13
- def initialize(site)
14
- puts "SpkSpider #{VERSION} initializing..."
15
- @site = URI.parse(site) || raise("You didn't give me a site to crawl")
16
- @user_agent = "SpkSpr/#{VERSION}"
17
- @links_to_visit = Array.new
18
- @visited_links = Array.new
19
- @external_links = Array.new
20
- @errors = Hash.new
21
- @links_to_visit << site
22
- @parser = 'xml'
23
- puts "Ready to crawl"
24
- end
25
-
26
- def init_xml_parser(doc)
27
- require 'xml'
28
- xp = XML::HTMLParser.string(doc, {:options => XML::HTMLParser::Options::RECOVER | XML::HTMLParser::Options::NOERROR | XML::HTMLParser::Options::NOWARNING })
29
- XML::Error.set_handler do |error|
30
- exception = error
31
- end
32
- document = xp.parse
33
- links = document.find("//a[@href]")
34
- end
35
-
36
- def fetch_links(doc)
37
- case @parser
38
- when 'xml'
39
- init_xml_parser(doc)
40
- when 'hpricot'
41
- require 'hpricot'
42
- Hpricot.buffer_size = 204800
43
- Hpricot(doc).search("//a[@href]")
44
- else
45
- init_xml_parser(doc)
46
- end
47
- rescue
48
- init_xml_parser(doc)
49
- end
50
-
51
- # download the document
52
- def fetch_html(url)
53
- uri = URI.parse(url)
54
- print "Visiting: #{url}"
55
- begin
56
- @document = uri.read('User-Agent' => @user_agent, 'Referer' => url, :http_basic_authentication => @basic_auth)
57
- rescue
58
- # OpenURI::HTTPError
59
- end
60
- @visited_links << url
61
- @document
62
- end
63
-
64
- # reading the document and extract the urls
65
- def read_document(document, url)
66
- if document
67
- case document.content_type
68
- when "text/html"
69
- link_extractor(document, url)
70
- else
71
- print " ... not text/html, skipping ..."
72
- end
73
- else
74
- print " ... document does not exist, skipping ..."
75
- end
76
- end
77
-
78
- # extract the link and un-relative
79
- def link_extractor(document, document_url)
80
- links = fetch_links(document)
81
- links.each do |link|
82
- href = link.attributes['href']
83
- if href && href.length > 0 && (@exclude && !href.match(@exclude) || @exclude.nil?)
84
- begin
85
- url = href
86
- uri = URI.parse(url)
87
- document_uri = URI.parse(document_url)
88
- rescue
89
- #print " #{url} skip this link"
90
- next
91
- end
92
- else
93
- #print " skip this link"
94
- next
95
- end
96
-
97
- # Derelativeize links if necessary
98
- if uri.relative?
99
- url = document_uri.merge(url).to_s if url[0,1] == '?'
100
- url = @site.merge(url).to_s
101
- uri = URI.parse(url)
102
- end
103
-
104
- # skip anchor link
105
- if url.include?('#')
106
- #print '... Anchor link found, skipping ...'
107
- next
108
- end
109
-
110
- # Check domain, if in same domain, keep link, else trash it
111
- if uri.host != @site.host
112
- @external_links << url
113
- @external_links.uniq!
114
- next
115
- end
116
-
117
- # Find out if we've seen this link already
118
- if (@visited_links.include? url) || (@links_to_visit.include? url)
119
- next
120
- end
121
-
122
- @links_to_visit << url
123
- end
124
- end
125
-
126
- # lunch the crawling
127
- def crawl
128
- while !@links_to_visit.empty?
129
- # get the first element of the links_to_visit
130
- url = @links_to_visit.shift
131
- document = fetch_html(url)
132
- read_document(document, url)
133
- if block_given?
134
- yield(url, document)
135
- end
136
- puts ' done!'
137
- end
138
- end
139
- end
140
-
141
- if __FILE__ == $0
142
- site = 'http://localhost:4567/'
143
- site = ARGV[0] if ARGV[0]
144
- spider = SpkSpider.new(site)
145
- spider.user_agent = ''
146
- spider.crawl
147
- end