validate-website 0.1 → 0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93)
  1. data/README +15 -4
  2. data/Rakefile +6 -4
  3. data/bin/validate-website +33 -36
  4. data/lib/colorful_messages.rb +7 -10
  5. data/lib/validator.rb +37 -0
  6. data/lib/xhtml/xframes-1.xsd +166 -0
  7. data/lib/xhtml/xhtml-access-1.xsd +43 -0
  8. data/lib/xhtml/xhtml-applet-1.xsd +66 -0
  9. data/lib/xhtml/xhtml-attribs-1.xsd +67 -0
  10. data/lib/xhtml/xhtml-base-1.xsd +31 -0
  11. data/lib/xhtml/xhtml-basic-form-1.xsd +195 -0
  12. data/lib/xhtml/xhtml-basic-table-1.xsd +169 -0
  13. data/lib/xhtml/xhtml-basic10-model-1.xsd +385 -0
  14. data/lib/xhtml/xhtml-basic10-module-redefines-1.xsd +61 -0
  15. data/lib/xhtml/xhtml-basic10-modules-1.xsd +233 -0
  16. data/lib/xhtml/xhtml-basic10.xsd +99 -0
  17. data/lib/xhtml/xhtml-basic11-model-1.xsd +622 -0
  18. data/lib/xhtml/xhtml-basic11-modules-1.xsd +508 -0
  19. data/lib/xhtml/xhtml-basic11.xsd +105 -0
  20. data/lib/xhtml/xhtml-bdo-1.xsd +72 -0
  21. data/lib/xhtml/xhtml-blkphras-1.xsd +155 -0
  22. data/lib/xhtml/xhtml-blkpres-1.xsd +32 -0
  23. data/lib/xhtml/xhtml-blkstruct-1.xsd +44 -0
  24. data/lib/xhtml/xhtml-charent-1.xsd +38 -0
  25. data/lib/xhtml/xhtml-copyright-1.xsd +29 -0
  26. data/lib/xhtml/xhtml-csismap-1.xsd +91 -0
  27. data/lib/xhtml/xhtml-datatypes-1.xsd +177 -0
  28. data/lib/xhtml/xhtml-edit-1.xsd +34 -0
  29. data/lib/xhtml/xhtml-events-1.xsd +130 -0
  30. data/lib/xhtml/xhtml-form-1.xsd +321 -0
  31. data/lib/xhtml/xhtml-frames-1.xsd +113 -0
  32. data/lib/xhtml/xhtml-framework-1.xsd +62 -0
  33. data/lib/xhtml/xhtml-hypertext-1.xsd +47 -0
  34. data/lib/xhtml/xhtml-iframe-1.xsd +68 -0
  35. data/lib/xhtml/xhtml-image-1.xsd +40 -0
  36. data/lib/xhtml/xhtml-inlphras-1.xsd +158 -0
  37. data/lib/xhtml/xhtml-inlpres-1.xsd +34 -0
  38. data/lib/xhtml/xhtml-inlstruct-1.xsd +45 -0
  39. data/lib/xhtml/xhtml-inlstyle-1.xsd +22 -0
  40. data/lib/xhtml/xhtml-inputmode-1.xsd +35 -0
  41. data/lib/xhtml/xhtml-lat1.ent +196 -0
  42. data/lib/xhtml/xhtml-legacy-1.xsd +97 -0
  43. data/lib/xhtml/xhtml-link-1.xsd +45 -0
  44. data/lib/xhtml/xhtml-list-1.xsd +94 -0
  45. data/lib/xhtml/xhtml-meta-1.xsd +54 -0
  46. data/lib/xhtml/xhtml-metaAttributes-1.xsd +39 -0
  47. data/lib/xhtml/xhtml-misc-1.xsd +441 -0
  48. data/lib/xhtml/xhtml-nameident-1.xsd +63 -0
  49. data/lib/xhtml/xhtml-notations-1.xsd +69 -0
  50. data/lib/xhtml/xhtml-object-1.xsd +71 -0
  51. data/lib/xhtml/xhtml-param-1.xsd +46 -0
  52. data/lib/xhtml/xhtml-pres-1.xsd +46 -0
  53. data/lib/xhtml/xhtml-print-1.xsd +85 -0
  54. data/lib/xhtml/xhtml-print-model-1.xsd +604 -0
  55. data/lib/xhtml/xhtml-print-modules-1.xsd +422 -0
  56. data/lib/xhtml/xhtml-rdfa-1.dtd +438 -0
  57. data/lib/xhtml/xhtml-rdfa-1.xsd +116 -0
  58. data/lib/xhtml/xhtml-rdfa-model-1.xsd +461 -0
  59. data/lib/xhtml/xhtml-rdfa-modules-1.xsd +548 -0
  60. data/lib/xhtml/xhtml-ruby-1.xsd +170 -0
  61. data/lib/xhtml/xhtml-ruby-basic-1.xsd +84 -0
  62. data/lib/xhtml/xhtml-script-1.xsd +65 -0
  63. data/lib/xhtml/xhtml-special.ent +80 -0
  64. data/lib/xhtml/xhtml-ssismap-1.xsd +38 -0
  65. data/lib/xhtml/xhtml-struct-1.xsd +85 -0
  66. data/lib/xhtml/xhtml-style-1.xsd +47 -0
  67. data/lib/xhtml/xhtml-symbol.ent +237 -0
  68. data/lib/xhtml/xhtml-table-1.xsd +267 -0
  69. data/lib/xhtml/xhtml-target-1.xsd +49 -0
  70. data/lib/xhtml/xhtml-text-1.xsd +62 -0
  71. data/lib/xhtml/xhtml1-frameset.dtd +1235 -0
  72. data/lib/xhtml/xhtml1-frameset.xsd +2847 -0
  73. data/lib/xhtml/xhtml1-strict.dtd +978 -0
  74. data/lib/xhtml/xhtml1-strict.xsd +2211 -0
  75. data/lib/xhtml/xhtml1-transitional.dtd +1201 -0
  76. data/lib/xhtml/xhtml1-transitional.xsd +2755 -0
  77. data/lib/xhtml/xhtml11-model-1.xsd +715 -0
  78. data/lib/xhtml/xhtml11-module-redefines-1.xsd +335 -0
  79. data/lib/xhtml/xhtml11-modules-1.xsd +605 -0
  80. data/lib/xhtml/xhtml11.xsd +107 -0
  81. data/lib/xhtml/xhtml2.xsd +21 -0
  82. data/lib/xhtml/xml-events-1.xsd +73 -0
  83. data/lib/xhtml/xml-events-2.xsd +73 -0
  84. data/lib/xhtml/xml-events-attribs-1.xsd +73 -0
  85. data/lib/xhtml/xml-events-attribs-2.xsd +75 -0
  86. data/lib/xhtml/xml-events-copyright-1.xsd +34 -0
  87. data/lib/xhtml/xml-events-copyright-2.xsd +34 -0
  88. data/lib/xhtml/xml-handlers-1.xsd +136 -0
  89. data/lib/xhtml/xml-handlers-2.xsd +98 -0
  90. data/lib/xhtml/xml-script-1.xsd +38 -0
  91. data/lib/xhtml/xml.xsd +286 -0
  92. metadata +114 -8
  93. data/lib/spkspider.rb +0 -147
data/lib/spkspider.rb DELETED
@@ -1,147 +0,0 @@
1
- # encoding: utf-8
2
- require 'open-uri'
3
- # SpkSpider is a ruby crawler
4
-
5
- class SpkSpider
6
- VERSION = '0.0.5'
7
-
8
- attr_accessor :links_to_visit, :site, :user_agent, :basic_auth
9
- attr_accessor :parser, :exclude
10
- attr_reader :visited_links, :external_links, :errors
11
-
12
- # initialize method take the site to crawl in argument
13
- def initialize(site)
14
- puts "SpkSpider #{VERSION} initializing..."
15
- @site = URI.parse(site) || raise("You didn't give me a site to crawl")
16
- @user_agent = "SpkSpr/#{VERSION}"
17
- @links_to_visit = Array.new
18
- @visited_links = Array.new
19
- @external_links = Array.new
20
- @errors = Hash.new
21
- @links_to_visit << site
22
- @parser = 'xml'
23
- puts "Ready to crawl"
24
- end
25
-
26
- def init_xml_parser(doc)
27
- require 'xml'
28
- xp = XML::HTMLParser.string(doc, {:options => XML::HTMLParser::Options::RECOVER | XML::HTMLParser::Options::NOERROR | XML::HTMLParser::Options::NOWARNING })
29
- XML::Error.set_handler do |error|
30
- exception = error
31
- end
32
- document = xp.parse
33
- links = document.find("//a[@href]")
34
- end
35
-
36
- def fetch_links(doc)
37
- case @parser
38
- when 'xml'
39
- init_xml_parser(doc)
40
- when 'hpricot'
41
- require 'hpricot'
42
- Hpricot.buffer_size = 204800
43
- Hpricot(doc).search("//a[@href]")
44
- else
45
- init_xml_parser(doc)
46
- end
47
- rescue
48
- init_xml_parser(doc)
49
- end
50
-
51
- # download the document
52
- def fetch_html(url)
53
- uri = URI.parse(url)
54
- print "Visiting: #{url}"
55
- begin
56
- @document = uri.read('User-Agent' => @user_agent, 'Referer' => url, :http_basic_authentication => @basic_auth)
57
- rescue
58
- # OpenURI::HTTPError
59
- end
60
- @visited_links << url
61
- @document
62
- end
63
-
64
- # reading the document and extract the urls
65
- def read_document(document, url)
66
- if document
67
- case document.content_type
68
- when "text/html"
69
- link_extractor(document, url)
70
- else
71
- print " ... not text/html, skipping ..."
72
- end
73
- else
74
- print " ... document does not exist, skipping ..."
75
- end
76
- end
77
-
78
- # extract the link and un-relative
79
- def link_extractor(document, document_url)
80
- links = fetch_links(document)
81
- links.each do |link|
82
- href = link.attributes['href']
83
- if href && href.length > 0 && (@exclude && !href.match(@exclude) || @exclude.nil?)
84
- begin
85
- url = href
86
- uri = URI.parse(url)
87
- document_uri = URI.parse(document_url)
88
- rescue
89
- #print " #{url} skip this link"
90
- next
91
- end
92
- else
93
- #print " skip this link"
94
- next
95
- end
96
-
97
- # Derelativeize links if necessary
98
- if uri.relative?
99
- url = document_uri.merge(url).to_s if url[0,1] == '?'
100
- url = @site.merge(url).to_s
101
- uri = URI.parse(url)
102
- end
103
-
104
- # skip anchor link
105
- if url.include?('#')
106
- #print '... Anchor link found, skipping ...'
107
- next
108
- end
109
-
110
- # Check domain, if in same domain, keep link, else trash it
111
- if uri.host != @site.host
112
- @external_links << url
113
- @external_links.uniq!
114
- next
115
- end
116
-
117
- # Find out if we've seen this link already
118
- if (@visited_links.include? url) || (@links_to_visit.include? url)
119
- next
120
- end
121
-
122
- @links_to_visit << url
123
- end
124
- end
125
-
126
- # lunch the crawling
127
- def crawl
128
- while !@links_to_visit.empty?
129
- # get the first element of the links_to_visit
130
- url = @links_to_visit.shift
131
- document = fetch_html(url)
132
- read_document(document, url)
133
- if block_given?
134
- yield(url, document)
135
- end
136
- puts ' done!'
137
- end
138
- end
139
- end
140
-
141
- if __FILE__ == $0
142
- site = 'http://localhost:4567/'
143
- site = ARGV[0] if ARGV[0]
144
- spider = SpkSpider.new(site)
145
- spider.user_agent = ''
146
- spider.crawl
147
- end