w3clove 0.6.2 → 0.7

Sign up to get free protection for your applications and to get access to all the features.
@@ -48,18 +48,29 @@ module W3Clove
48
48
  # scrape the links from HTML.
49
49
  #
50
50
  # For HTML sources, it will only get the links that start with the sitemap url, convert relative links
51
- # to absolute links, remove anchors from links, and include the sitemap url
51
+ # to absolute links, remove anchors from links, include the sitemap url, and exclude links that don't
52
+ # seem to point to HTML (like images, multimedia, text, javascript...)
52
53
  def pages_in_sitemap
53
54
  pages = xml_locations.map {|loc| W3Clove::Page.new(loc.text)}
54
55
  if pages.empty?
55
56
  m = MetaInspector.new(url)
56
- links = m.absolute_links.select {|l| l.start_with?(m.url)}.map {|l| l.split('#')[0]}.uniq
57
+ links = m.absolute_links.select {|l| l.start_with?(m.url) && looks_like_html?(l)}.map {|l| l.split('#')[0]}.uniq
57
58
  links << m.url unless (links.include?(m.url) || links.include?("#{m.url}/"))
58
59
  pages = links.map {|link| W3Clove::Page.new(link)}
59
60
  end
60
61
  pages
61
62
  end
62
63
 
64
+ # Tells if the given url looks like an HTML page.
65
+ # That is, it does not look like javascript, image, pdf...
66
+ def looks_like_html?(url)
67
+ u = URI.parse(url)
68
+ scheme = u.scheme
69
+ extension = u.path.split(".").last
70
+
71
+ (scheme =~ /http[s]?/i) && (extension !~ /gif|jpg|jpeg|png|tiff|bmp|txt|pdf|doc|xls|wav|mp3|ogg/i)
72
+ end
73
+
63
74
  def xml_locations
64
75
  Nokogiri::XML(doc).css('loc')
65
76
  end
@@ -1,5 +1,5 @@
1
1
  # -*- encoding: utf-8 -*-
2
2
 
3
3
  module W3Clove
4
- VERSION = "0.6.2"
4
+ VERSION = "0.7"
5
5
  end
@@ -0,0 +1,43 @@
1
+ HTTP/1.1 200 OK
2
+ Server: nginx/0.7.67
3
+ Date: Fri, 18 Nov 2011 21:46:46 GMT
4
+ Content-Type: text/html
5
+ Connection: keep-alive
6
+ Last-Modified: Mon, 14 Nov 2011 16:53:18 GMT
7
+ Content-Length: 4987
8
+ X-Varnish: 2000423390
9
+ Age: 0
10
+ Via: 1.1 varnish
11
+
12
+ <html>
13
+ <head>
14
+ <title>Examples of exclusions</title>
15
+ </head>
16
+ <body>
17
+ <a href="/">Front page</a><br />
18
+ <a href="/faqs">FAQs</a><br />
19
+
20
+ <a href="/file.gif">link</a><br />
21
+ <a href="/file.GIF">link</a><br />
22
+ <a href="/file.gif?size=thumb">link</a><br />
23
+ <a href="/file.jpg">link</a><br />
24
+ <a href="/file.jpeg">link</a><br />
25
+ <a href="/file.png">link</a><br />
26
+ <a href="/file.tiff">link</a><br />
27
+ <a href="/file.bmp">link</a><br />
28
+
29
+ <a href="/file.pdf">link</a><br />
30
+ <a href="/file.txt">link</a><br />
31
+ <a href="/file.doc">link</a><br />
32
+ <a href="/file.xls">link</a><br />
33
+
34
+ <a href="/file.wav">link</a><br />
35
+ <a href="/file.mp3">link</a><br />
36
+ <a href="/file.ogg">link</a><br />
37
+
38
+ <a href="mailto:user@example.com">link</a><br />
39
+ <a href="ftp://user:password@example.com">link</a><br />
40
+ <a href="telnet://user:password@example.com">link</a><br />
41
+ <a href="javascript:alert('hey');">link</a><br />
42
+ </body>
43
+ </html>
data/spec/sitemap_spec.rb CHANGED
@@ -10,6 +10,7 @@ describe W3Clove::Sitemap do
10
10
  @sitemap_with_trailing_slash = W3Clove::Sitemap.new('http://eparreno.com')
11
11
  @sitemap_with_protocol_relative = W3Clove::Sitemap.new('http://protocol-relative.com')
12
12
  @sitemap_with_protocol_relative_https = W3Clove::Sitemap.new('https://protocol-relative.com')
13
+ @sitemap_for_exclusions = W3Clove::Sitemap.new('http://example.com/exclusions')
13
14
 
14
15
  MarkupValidator.any_instance.stubs(:validate_uri).returns(stubbed_validator_results)
15
16
  end
@@ -75,6 +76,12 @@ describe W3Clove::Sitemap do
75
76
  urls.should_not include 'http://eparreno.com'
76
77
  end
77
78
 
79
+ it "should exclude non-html pages" do
80
+ @sitemap_for_exclusions.pages.length.should == 2
81
+ @sitemap_for_exclusions.pages.first.url.should == 'http://example.com/exclusions/'
82
+ @sitemap_for_exclusions.pages.last.url.should == 'http://example.com/exclusions/faqs'
83
+ end
84
+
78
85
  context "protocol-relative links" do
79
86
  it "should include only internal links" do
80
87
  @sitemap_with_protocol_relative.pages.size.should == 3
data/spec/spec_helper.rb CHANGED
@@ -13,6 +13,7 @@ FakeWeb.register_uri(:get, "http://www.eparreno.com", :response => open("#{$samp
13
13
  FakeWeb.register_uri(:get, "http://zigotica.com", :response => open("#{$samples_dir}/zigotica.com.html").read)
14
14
  FakeWeb.register_uri(:get, "http://protocol-relative.com", :response => open("#{$samples_dir}/protocol_relative.html").read)
15
15
  FakeWeb.register_uri(:get, "https://protocol-relative.com", :response => open("#{$samples_dir}/protocol_relative.html").read)
16
+ FakeWeb.register_uri(:get, "http://example.com/exclusions", :response => open("#{$samples_dir}/exclusions.html").read)
16
17
 
17
18
  def message_text(message_id)
18
19
  message_texts = {
data/w3clove.gemspec CHANGED
@@ -16,12 +16,12 @@ and outputs a detailed report with all errors and warnings}
16
16
  gem.require_paths = ["lib"]
17
17
  gem.version = W3Clove::VERSION
18
18
 
19
- gem.add_dependency 'w3c_validators', '~> 1.0.2'
20
- gem.add_dependency 'nokogiri', '~> 1.5.0'
21
- gem.add_dependency 'metainspector', '~> 1.8.8'
19
+ gem.add_dependency 'w3c_validators', '~> 1.2'
20
+ gem.add_dependency 'nokogiri', '~> 1.5.3'
21
+ gem.add_dependency 'metainspector', '~> 1.9.0'
22
22
 
23
23
  gem.add_development_dependency 'rspec', '~> 2.5.0'
24
- gem.add_development_dependency 'mocha', '~> 0.9.12'
24
+ gem.add_development_dependency 'mocha', '~> 0.11.4'
25
25
  gem.add_development_dependency 'rake', '~> 0.9.2'
26
26
  gem.add_development_dependency 'fakeweb', '~> 1.3.0'
27
27
  end
metadata CHANGED
@@ -1,13 +1,12 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: w3clove
3
3
  version: !ruby/object:Gem::Version
4
- hash: 3
4
+ hash: 5
5
5
  prerelease:
6
6
  segments:
7
7
  - 0
8
- - 6
9
- - 2
10
- version: 0.6.2
8
+ - 7
9
+ version: "0.7"
11
10
  platform: ruby
12
11
  authors:
13
12
  - Jaime Iniesta
@@ -15,7 +14,7 @@ autorequire:
15
14
  bindir: bin
16
15
  cert_chain: []
17
16
 
18
- date: 2011-12-30 00:00:00 Z
17
+ date: 2012-06-03 00:00:00 Z
19
18
  dependencies:
20
19
  - !ruby/object:Gem::Dependency
21
20
  name: w3c_validators
@@ -25,12 +24,11 @@ dependencies:
25
24
  requirements:
26
25
  - - ~>
27
26
  - !ruby/object:Gem::Version
28
- hash: 19
27
+ hash: 11
29
28
  segments:
30
29
  - 1
31
- - 0
32
30
  - 2
33
- version: 1.0.2
31
+ version: "1.2"
34
32
  type: :runtime
35
33
  version_requirements: *id001
36
34
  - !ruby/object:Gem::Dependency
@@ -41,12 +39,12 @@ dependencies:
41
39
  requirements:
42
40
  - - ~>
43
41
  - !ruby/object:Gem::Version
44
- hash: 3
42
+ hash: 5
45
43
  segments:
46
44
  - 1
47
45
  - 5
48
- - 0
49
- version: 1.5.0
46
+ - 3
47
+ version: 1.5.3
50
48
  type: :runtime
51
49
  version_requirements: *id002
52
50
  - !ruby/object:Gem::Dependency
@@ -57,12 +55,12 @@ dependencies:
57
55
  requirements:
58
56
  - - ~>
59
57
  - !ruby/object:Gem::Version
60
- hash: 39
58
+ hash: 51
61
59
  segments:
62
60
  - 1
63
- - 8
64
- - 8
65
- version: 1.8.8
61
+ - 9
62
+ - 0
63
+ version: 1.9.0
66
64
  type: :runtime
67
65
  version_requirements: *id003
68
66
  - !ruby/object:Gem::Dependency
@@ -89,12 +87,12 @@ dependencies:
89
87
  requirements:
90
88
  - - ~>
91
89
  - !ruby/object:Gem::Version
92
- hash: 35
90
+ hash: 59
93
91
  segments:
94
92
  - 0
95
- - 9
96
- - 12
97
- version: 0.9.12
93
+ - 11
94
+ - 4
95
+ version: 0.11.4
98
96
  type: :development
99
97
  version_requirements: *id005
100
98
  - !ruby/object:Gem::Dependency
@@ -162,6 +160,7 @@ files:
162
160
  - spec/page_spec.rb
163
161
  - spec/samples/absolute_links.html
164
162
  - spec/samples/eparreno.com.html
163
+ - spec/samples/exclusions.html
165
164
  - spec/samples/guides.rubyonrails.org.html
166
165
  - spec/samples/protocol_relative.html
167
166
  - spec/samples/sitemap.xml
@@ -198,7 +197,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
198
197
  requirements: []
199
198
 
200
199
  rubyforge_project:
201
- rubygems_version: 1.8.10
200
+ rubygems_version: 1.8.15
202
201
  signing_key:
203
202
  specification_version: 3
204
203
  summary: command-line tool to validate the markup of a whole site against the W3C validator