w3clove 0.6.2 → 0.7
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/w3clove/sitemap.rb +13 -2
- data/lib/w3clove/version.rb +1 -1
- data/spec/samples/exclusions.html +43 -0
- data/spec/sitemap_spec.rb +7 -0
- data/spec/spec_helper.rb +1 -0
- data/w3clove.gemspec +4 -4
- metadata +19 -20
data/lib/w3clove/sitemap.rb
CHANGED
@@ -48,18 +48,29 @@ module W3Clove
|
|
48
48
|
# scrape the links from HTML.
|
49
49
|
#
|
50
50
|
# For HTML sources, it will only get the links that start with the sitemap url, convert relative links
|
51
|
-
# to absolute links, remove anchors from links,
|
51
|
+
# to absolute links, remove anchors from links, include the sitemap url, and exclude links that don't
|
52
|
+
# seem to point to HTML (like images, multimedia, text, javascript...)
|
52
53
|
def pages_in_sitemap
|
53
54
|
pages = xml_locations.map {|loc| W3Clove::Page.new(loc.text)}
|
54
55
|
if pages.empty?
|
55
56
|
m = MetaInspector.new(url)
|
56
|
-
links = m.absolute_links.select {|l| l.start_with?(m.url)}.map {|l| l.split('#')[0]}.uniq
|
57
|
+
links = m.absolute_links.select {|l| l.start_with?(m.url) && looks_like_html?(l)}.map {|l| l.split('#')[0]}.uniq
|
57
58
|
links << m.url unless (links.include?(m.url) || links.include?("#{m.url}/"))
|
58
59
|
pages = links.map {|link| W3Clove::Page.new(link)}
|
59
60
|
end
|
60
61
|
pages
|
61
62
|
end
|
62
63
|
|
64
|
+
# Tells if the given url looks like an HTML page.
|
65
|
+
# That is, it does not look like javascript, image, pdf...
|
66
|
+
def looks_like_html?(url)
|
67
|
+
u = URI.parse(url)
|
68
|
+
scheme = u.scheme
|
69
|
+
extension = u.path.split(".").last
|
70
|
+
|
71
|
+
(scheme =~ /http[s]?/i) && (extension !~ /gif|jpg|jpeg|png|tiff|bmp|txt|pdf|doc|xls|wav|mp3|ogg/i)
|
72
|
+
end
|
73
|
+
|
63
74
|
def xml_locations
|
64
75
|
Nokogiri::XML(doc).css('loc')
|
65
76
|
end
|
data/lib/w3clove/version.rb
CHANGED
data/spec/samples/exclusions.html
ADDED
@@ -0,0 +1,43 @@
|
|
1
|
+
HTTP/1.1 200 OK
|
2
|
+
Server: nginx/0.7.67
|
3
|
+
Date: Fri, 18 Nov 2011 21:46:46 GMT
|
4
|
+
Content-Type: text/html
|
5
|
+
Connection: keep-alive
|
6
|
+
Last-Modified: Mon, 14 Nov 2011 16:53:18 GMT
|
7
|
+
Content-Length: 4987
|
8
|
+
X-Varnish: 2000423390
|
9
|
+
Age: 0
|
10
|
+
Via: 1.1 varnish
|
11
|
+
|
12
|
+
<html>
|
13
|
+
<head>
|
14
|
+
<title>Examples of exclusions</title>
|
15
|
+
</head>
|
16
|
+
<body>
|
17
|
+
<a href="/">Front page</a><br />
|
18
|
+
<a href="/faqs">FAQs</a><br />
|
19
|
+
|
20
|
+
<a href="/file.gif">link</a><br />
|
21
|
+
<a href="/file.GIF">link</a><br />
|
22
|
+
<a href="/file.gif?size=thumb">link</a><br />
|
23
|
+
<a href="/file.jpg">link</a><br />
|
24
|
+
<a href="/file.jpeg">link</a><br />
|
25
|
+
<a href="/file.png">link</a><br />
|
26
|
+
<a href="/file.tiff">link</a><br />
|
27
|
+
<a href="/file.bmp">link</a><br />
|
28
|
+
|
29
|
+
<a href="/file.pdf">link</a><br />
|
30
|
+
<a href="/file.txt">link</a><br />
|
31
|
+
<a href="/file.doc">link</a><br />
|
32
|
+
<a href="/file.xls">link</a><br />
|
33
|
+
|
34
|
+
<a href="/file.wav">link</a><br />
|
35
|
+
<a href="/file.mp3">link</a><br />
|
36
|
+
<a href="/file.ogg">link</a><br />
|
37
|
+
|
38
|
+
<a href="mailto:user@example.com">link</a><br />
|
39
|
+
<a href="ftp://user:password@example.com">link</a><br />
|
40
|
+
<a href="telnet://user:password@example.com">link</a><br />
|
41
|
+
<a href="javascript:alert('hey');">link</a><br />
|
42
|
+
</body>
|
43
|
+
</html>
|
data/spec/sitemap_spec.rb
CHANGED
@@ -10,6 +10,7 @@ describe W3Clove::Sitemap do
|
|
10
10
|
@sitemap_with_trailing_slash = W3Clove::Sitemap.new('http://eparreno.com')
|
11
11
|
@sitemap_with_protocol_relative = W3Clove::Sitemap.new('http://protocol-relative.com')
|
12
12
|
@sitemap_with_protocol_relative_https = W3Clove::Sitemap.new('https://protocol-relative.com')
|
13
|
+
@sitemap_for_exclusions = W3Clove::Sitemap.new('http://example.com/exclusions')
|
13
14
|
|
14
15
|
MarkupValidator.any_instance.stubs(:validate_uri).returns(stubbed_validator_results)
|
15
16
|
end
|
@@ -75,6 +76,12 @@ describe W3Clove::Sitemap do
|
|
75
76
|
urls.should_not include 'http://eparreno.com'
|
76
77
|
end
|
77
78
|
|
79
|
+
it "should exclude non-html pages" do
|
80
|
+
@sitemap_for_exclusions.pages.length.should == 2
|
81
|
+
@sitemap_for_exclusions.pages.first.url.should == 'http://example.com/exclusions/'
|
82
|
+
@sitemap_for_exclusions.pages.last.url.should == 'http://example.com/exclusions/faqs'
|
83
|
+
end
|
84
|
+
|
78
85
|
context "protocol-relative links" do
|
79
86
|
it "should include only internal links" do
|
80
87
|
@sitemap_with_protocol_relative.pages.size.should == 3
|
data/spec/spec_helper.rb
CHANGED
@@ -13,6 +13,7 @@ FakeWeb.register_uri(:get, "http://www.eparreno.com", :response => open("#{$samp
|
|
13
13
|
FakeWeb.register_uri(:get, "http://zigotica.com", :response => open("#{$samples_dir}/zigotica.com.html").read)
|
14
14
|
FakeWeb.register_uri(:get, "http://protocol-relative.com", :response => open("#{$samples_dir}/protocol_relative.html").read)
|
15
15
|
FakeWeb.register_uri(:get, "https://protocol-relative.com", :response => open("#{$samples_dir}/protocol_relative.html").read)
|
16
|
+
FakeWeb.register_uri(:get, "http://example.com/exclusions", :response => open("#{$samples_dir}/exclusions.html").read)
|
16
17
|
|
17
18
|
def message_text(message_id)
|
18
19
|
message_texts = {
|
data/w3clove.gemspec
CHANGED
@@ -16,12 +16,12 @@ and outputs a detailed report with all errors and warnings}
|
|
16
16
|
gem.require_paths = ["lib"]
|
17
17
|
gem.version = W3Clove::VERSION
|
18
18
|
|
19
|
-
gem.add_dependency 'w3c_validators', '~> 1.
|
20
|
-
gem.add_dependency 'nokogiri', '~> 1.5.
|
21
|
-
gem.add_dependency 'metainspector', '~> 1.
|
19
|
+
gem.add_dependency 'w3c_validators', '~> 1.2'
|
20
|
+
gem.add_dependency 'nokogiri', '~> 1.5.3'
|
21
|
+
gem.add_dependency 'metainspector', '~> 1.9.0'
|
22
22
|
|
23
23
|
gem.add_development_dependency 'rspec', '~> 2.5.0'
|
24
|
-
gem.add_development_dependency 'mocha', '~> 0.
|
24
|
+
gem.add_development_dependency 'mocha', '~> 0.11.4'
|
25
25
|
gem.add_development_dependency 'rake', '~> 0.9.2'
|
26
26
|
gem.add_development_dependency 'fakeweb', '~> 1.3.0'
|
27
27
|
end
|
metadata
CHANGED
@@ -1,13 +1,12 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: w3clove
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 5
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 0
|
8
|
-
-
|
9
|
-
|
10
|
-
version: 0.6.2
|
8
|
+
- 7
|
9
|
+
version: "0.7"
|
11
10
|
platform: ruby
|
12
11
|
authors:
|
13
12
|
- Jaime Iniesta
|
@@ -15,7 +14,7 @@ autorequire:
|
|
15
14
|
bindir: bin
|
16
15
|
cert_chain: []
|
17
16
|
|
18
|
-
date:
|
17
|
+
date: 2012-06-03 00:00:00 Z
|
19
18
|
dependencies:
|
20
19
|
- !ruby/object:Gem::Dependency
|
21
20
|
name: w3c_validators
|
@@ -25,12 +24,11 @@ dependencies:
|
|
25
24
|
requirements:
|
26
25
|
- - ~>
|
27
26
|
- !ruby/object:Gem::Version
|
28
|
-
hash:
|
27
|
+
hash: 11
|
29
28
|
segments:
|
30
29
|
- 1
|
31
|
-
- 0
|
32
30
|
- 2
|
33
|
-
version: 1.
|
31
|
+
version: "1.2"
|
34
32
|
type: :runtime
|
35
33
|
version_requirements: *id001
|
36
34
|
- !ruby/object:Gem::Dependency
|
@@ -41,12 +39,12 @@ dependencies:
|
|
41
39
|
requirements:
|
42
40
|
- - ~>
|
43
41
|
- !ruby/object:Gem::Version
|
44
|
-
hash:
|
42
|
+
hash: 5
|
45
43
|
segments:
|
46
44
|
- 1
|
47
45
|
- 5
|
48
|
-
-
|
49
|
-
version: 1.5.
|
46
|
+
- 3
|
47
|
+
version: 1.5.3
|
50
48
|
type: :runtime
|
51
49
|
version_requirements: *id002
|
52
50
|
- !ruby/object:Gem::Dependency
|
@@ -57,12 +55,12 @@ dependencies:
|
|
57
55
|
requirements:
|
58
56
|
- - ~>
|
59
57
|
- !ruby/object:Gem::Version
|
60
|
-
hash:
|
58
|
+
hash: 51
|
61
59
|
segments:
|
62
60
|
- 1
|
63
|
-
-
|
64
|
-
-
|
65
|
-
version: 1.
|
61
|
+
- 9
|
62
|
+
- 0
|
63
|
+
version: 1.9.0
|
66
64
|
type: :runtime
|
67
65
|
version_requirements: *id003
|
68
66
|
- !ruby/object:Gem::Dependency
|
@@ -89,12 +87,12 @@ dependencies:
|
|
89
87
|
requirements:
|
90
88
|
- - ~>
|
91
89
|
- !ruby/object:Gem::Version
|
92
|
-
hash:
|
90
|
+
hash: 59
|
93
91
|
segments:
|
94
92
|
- 0
|
95
|
-
-
|
96
|
-
-
|
97
|
-
version: 0.
|
93
|
+
- 11
|
94
|
+
- 4
|
95
|
+
version: 0.11.4
|
98
96
|
type: :development
|
99
97
|
version_requirements: *id005
|
100
98
|
- !ruby/object:Gem::Dependency
|
@@ -162,6 +160,7 @@ files:
|
|
162
160
|
- spec/page_spec.rb
|
163
161
|
- spec/samples/absolute_links.html
|
164
162
|
- spec/samples/eparreno.com.html
|
163
|
+
- spec/samples/exclusions.html
|
165
164
|
- spec/samples/guides.rubyonrails.org.html
|
166
165
|
- spec/samples/protocol_relative.html
|
167
166
|
- spec/samples/sitemap.xml
|
@@ -198,7 +197,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
198
197
|
requirements: []
|
199
198
|
|
200
199
|
rubyforge_project:
|
201
|
-
rubygems_version: 1.8.
|
200
|
+
rubygems_version: 1.8.15
|
202
201
|
signing_key:
|
203
202
|
specification_version: 3
|
204
203
|
summary: command-line tool to validate the markup of a whole site against the W3C validator
|