w3clove 0.6.2 → 0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -48,18 +48,29 @@ module W3Clove
48
48
  # scrape the links from HTML.
49
49
  #
50
50
  # For HTML sources, it will only get the links that start with the sitemap url, convert relative links
51
- # to absolute links, remove anchors from links, and include the sitemap url
51
+ # to absolute links, remove anchors from links, include the sitemap url, and exclude links that don't
52
+ # seem to point to HTML (like images, multimedia, text, javascript...)
52
53
  def pages_in_sitemap
53
54
  # First choice: build one page per <loc> entry found by xml_locations.
  pages = xml_locations.map {|loc| W3Clove::Page.new(loc.text)}
54
55
  # No <loc> entries were found, so fall back to collecting links with MetaInspector.
  if pages.empty?
55
56
  m = MetaInspector.new(url)
56
  # Keep only links that start with the inspected url, cut everything from '#' on,
  # and drop duplicates. The added (+) line also requires looks_like_html?(l).
- links = m.absolute_links.select {|l| l.start_with?(m.url)}.map {|l| l.split('#')[0]}.uniq
57
+ links = m.absolute_links.select {|l| l.start_with?(m.url) && looks_like_html?(l)}.map {|l| l.split('#')[0]}.uniq
57
58
  # Add the inspected url itself unless it is already listed, with or without a trailing slash.
  links << m.url unless (links.include?(m.url) || links.include?("#{m.url}/"))
58
59
  pages = links.map {|link| W3Clove::Page.new(link)}
59
60
  end
60
61
  pages
61
62
  end
62
63
 
64
+ # Tells if the given url looks like an HTML page.
65
+ # That is, it does not look like javascript, image, pdf...
66
# Decides whether +url+ is worth treating as an HTML page.
#
# A link qualifies when its scheme is exactly http or https and the file
# extension of its path is not one of the known non-HTML types (images,
# documents, audio). Only the real extension of the path is inspected, so
# paths such as "/documentation" or "/v1.2/docs" are not mistaken for
# ".doc" files.
#
# @param url [String] absolute link to check
# @return [Boolean] true when the link looks like HTML; false otherwise,
#   including links that URI cannot parse
def looks_like_html?(url)
  u = URI.parse(url)
  # Check the scheme first, so non-web links (mailto:, ftp:, javascript:...)
  # are rejected before their path, which may be missing, is touched.
  return false unless u.scheme =~ /\Ahttps?\z/i

  extension = File.extname(u.path.to_s).delete('.')
  # Anchored on purpose: the whole extension must match, not a substring of it.
  # docx/xlsx are listed explicitly because the previous substring match on
  # doc/xls used to exclude them as well.
  extension !~ /\A(?:gif|jpe?g|png|tiff|bmp|txt|pdf|docx?|xlsx?|wav|mp3|ogg)\z/i
rescue URI::InvalidURIError
  # A link that cannot even be parsed is skipped instead of aborting the crawl.
  false
end
73
+
63
74
  def xml_locations
64
75
  Nokogiri::XML(doc).css('loc')
65
76
  end
@@ -1,5 +1,5 @@
1
1
  # -*- encoding: utf-8 -*-
2
2
 
3
3
  module W3Clove
4
- VERSION = "0.6.2"
4
+ VERSION = "0.7"
5
5
  end
@@ -0,0 +1,43 @@
1
+ HTTP/1.1 200 OK
2
+ Server: nginx/0.7.67
3
+ Date: Fri, 18 Nov 2011 21:46:46 GMT
4
+ Content-Type: text/html
5
+ Connection: keep-alive
6
+ Last-Modified: Mon, 14 Nov 2011 16:53:18 GMT
7
+ Content-Length: 4987
8
+ X-Varnish: 2000423390
9
+ Age: 0
10
+ Via: 1.1 varnish
11
+
12
+ <html>
13
+ <head>
14
+ <title>Examples of exclusions</title>
15
+ </head>
16
+ <body>
17
+ <a href="/">Front page</a><br />
18
+ <a href="/faqs">FAQs</a><br />
19
+
20
+ <a href="/file.gif">link</a><br />
21
+ <a href="/file.GIF">link</a><br />
22
+ <a href="/file.gif?size=thumb">link</a><br />
23
+ <a href="/file.jpg">link</a><br />
24
+ <a href="/file.jpeg">link</a><br />
25
+ <a href="/file.png">link</a><br />
26
+ <a href="/file.tiff">link</a><br />
27
+ <a href="/file.bmp">link</a><br />
28
+
29
+ <a href="/file.pdf">link</a><br />
30
+ <a href="/file.txt">link</a><br />
31
+ <a href="/file.doc">link</a><br />
32
+ <a href="/file.xls">link</a><br />
33
+
34
+ <a href="/file.wav">link</a><br />
35
+ <a href="/file.mp3">link</a><br />
36
+ <a href="/file.ogg">link</a><br />
37
+
38
+ <a href="mailto:user@example.com">link</a><br />
39
+ <a href="ftp://user:password@example.com">link</a><br />
40
+ <a href="telnet://user:password@example.com">link</a><br />
41
+ <a href="javascript:alert('hey');">link</a><br />
42
+ </body>
43
+ </html>
data/spec/sitemap_spec.rb CHANGED
@@ -10,6 +10,7 @@ describe W3Clove::Sitemap do
10
10
  @sitemap_with_trailing_slash = W3Clove::Sitemap.new('http://eparreno.com')
11
11
  @sitemap_with_protocol_relative = W3Clove::Sitemap.new('http://protocol-relative.com')
12
12
  @sitemap_with_protocol_relative_https = W3Clove::Sitemap.new('https://protocol-relative.com')
13
+ @sitemap_for_exclusions = W3Clove::Sitemap.new('http://example.com/exclusions')
13
14
 
14
15
  MarkupValidator.any_instance.stubs(:validate_uri).returns(stubbed_validator_results)
15
16
  end
@@ -75,6 +76,12 @@ describe W3Clove::Sitemap do
75
76
  urls.should_not include 'http://eparreno.com'
76
77
  end
77
78
 
79
+ it "should exclude non-html pages" do
80
+ @sitemap_for_exclusions.pages.length.should == 2
81
+ @sitemap_for_exclusions.pages.first.url.should == 'http://example.com/exclusions/'
82
+ @sitemap_for_exclusions.pages.last.url.should == 'http://example.com/exclusions/faqs'
83
+ end
84
+
78
85
  context "protocol-relative links" do
79
86
  it "should include only internal links" do
80
87
  @sitemap_with_protocol_relative.pages.size.should == 3
data/spec/spec_helper.rb CHANGED
@@ -13,6 +13,7 @@ FakeWeb.register_uri(:get, "http://www.eparreno.com", :response => open("#{$samp
13
13
  FakeWeb.register_uri(:get, "http://zigotica.com", :response => open("#{$samples_dir}/zigotica.com.html").read)
14
14
  FakeWeb.register_uri(:get, "http://protocol-relative.com", :response => open("#{$samples_dir}/protocol_relative.html").read)
15
15
  FakeWeb.register_uri(:get, "https://protocol-relative.com", :response => open("#{$samples_dir}/protocol_relative.html").read)
16
+ FakeWeb.register_uri(:get, "http://example.com/exclusions", :response => open("#{$samples_dir}/exclusions.html").read)
16
17
 
17
18
  def message_text(message_id)
18
19
  message_texts = {
data/w3clove.gemspec CHANGED
@@ -16,12 +16,12 @@ and outputs a detailed report with all errors and warnings}
16
16
  gem.require_paths = ["lib"]
17
17
  gem.version = W3Clove::VERSION
18
18
 
19
- gem.add_dependency 'w3c_validators', '~> 1.0.2'
20
- gem.add_dependency 'nokogiri', '~> 1.5.0'
21
- gem.add_dependency 'metainspector', '~> 1.8.8'
19
+ gem.add_dependency 'w3c_validators', '~> 1.2'
20
+ gem.add_dependency 'nokogiri', '~> 1.5.3'
21
+ gem.add_dependency 'metainspector', '~> 1.9.0'
22
22
 
23
23
  gem.add_development_dependency 'rspec', '~> 2.5.0'
24
- gem.add_development_dependency 'mocha', '~> 0.9.12'
24
+ gem.add_development_dependency 'mocha', '~> 0.11.4'
25
25
  gem.add_development_dependency 'rake', '~> 0.9.2'
26
26
  gem.add_development_dependency 'fakeweb', '~> 1.3.0'
27
27
  end
metadata CHANGED
@@ -1,13 +1,12 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: w3clove
3
3
  version: !ruby/object:Gem::Version
4
- hash: 3
4
+ hash: 5
5
5
  prerelease:
6
6
  segments:
7
7
  - 0
8
- - 6
9
- - 2
10
- version: 0.6.2
8
+ - 7
9
+ version: "0.7"
11
10
  platform: ruby
12
11
  authors:
13
12
  - Jaime Iniesta
@@ -15,7 +14,7 @@ autorequire:
15
14
  bindir: bin
16
15
  cert_chain: []
17
16
 
18
- date: 2011-12-30 00:00:00 Z
17
+ date: 2012-06-03 00:00:00 Z
19
18
  dependencies:
20
19
  - !ruby/object:Gem::Dependency
21
20
  name: w3c_validators
@@ -25,12 +24,11 @@ dependencies:
25
24
  requirements:
26
25
  - - ~>
27
26
  - !ruby/object:Gem::Version
28
- hash: 19
27
+ hash: 11
29
28
  segments:
30
29
  - 1
31
- - 0
32
30
  - 2
33
- version: 1.0.2
31
+ version: "1.2"
34
32
  type: :runtime
35
33
  version_requirements: *id001
36
34
  - !ruby/object:Gem::Dependency
@@ -41,12 +39,12 @@ dependencies:
41
39
  requirements:
42
40
  - - ~>
43
41
  - !ruby/object:Gem::Version
44
- hash: 3
42
+ hash: 5
45
43
  segments:
46
44
  - 1
47
45
  - 5
48
- - 0
49
- version: 1.5.0
46
+ - 3
47
+ version: 1.5.3
50
48
  type: :runtime
51
49
  version_requirements: *id002
52
50
  - !ruby/object:Gem::Dependency
@@ -57,12 +55,12 @@ dependencies:
57
55
  requirements:
58
56
  - - ~>
59
57
  - !ruby/object:Gem::Version
60
- hash: 39
58
+ hash: 51
61
59
  segments:
62
60
  - 1
63
- - 8
64
- - 8
65
- version: 1.8.8
61
+ - 9
62
+ - 0
63
+ version: 1.9.0
66
64
  type: :runtime
67
65
  version_requirements: *id003
68
66
  - !ruby/object:Gem::Dependency
@@ -89,12 +87,12 @@ dependencies:
89
87
  requirements:
90
88
  - - ~>
91
89
  - !ruby/object:Gem::Version
92
- hash: 35
90
+ hash: 59
93
91
  segments:
94
92
  - 0
95
- - 9
96
- - 12
97
- version: 0.9.12
93
+ - 11
94
+ - 4
95
+ version: 0.11.4
98
96
  type: :development
99
97
  version_requirements: *id005
100
98
  - !ruby/object:Gem::Dependency
@@ -162,6 +160,7 @@ files:
162
160
  - spec/page_spec.rb
163
161
  - spec/samples/absolute_links.html
164
162
  - spec/samples/eparreno.com.html
163
+ - spec/samples/exclusions.html
165
164
  - spec/samples/guides.rubyonrails.org.html
166
165
  - spec/samples/protocol_relative.html
167
166
  - spec/samples/sitemap.xml
@@ -198,7 +197,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
198
197
  requirements: []
199
198
 
200
199
  rubyforge_project:
201
- rubygems_version: 1.8.10
200
+ rubygems_version: 1.8.15
202
201
  signing_key:
203
202
  specification_version: 3
204
203
  summary: command-line tool to validate the markup of a whole site against the W3C validator