validate-website 0.8.1 → 0.9.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 3c732e2a061a486000368013967fdd598da0b29a
-  data.tar.gz: 9977f374877aa34f18a3740faf79e8f77f8ed474
+  metadata.gz: 7c6d263126db73e28c33fddb7ba58358f522d305
+  data.tar.gz: bc4d5405e1872d425dea1691456159d523b9c2d2
 SHA512:
-  metadata.gz: 70fab85d94c458bf16a36a4780249ba3d7cfd2d591208a3266f74535520084ce3f9a922c633c46fd47e41ce73b3f113f3f1aef88571d0767e3d452fe33e83b83
-  data.tar.gz: 3416c45ea279abf768d44cefeece27f2e8eb0bcb0d6552f67e0d8e91d914533270a72687fe4136f27b64ca0f72dd215de59dbedc64b851d9daff11ff81475834
+  metadata.gz: 3039cb7c6d5082a2d69915ddf2c1d102362049e5297899eb2aeba0b8b493edc32301aa3803a20411bbfb82bbb9fe05cbebc514083ae1aef9ae954769cd308521
+  data.tar.gz: 14682a11d7e68083aefecd1ab2ca978de5c3d257f0a8d6d5bcacf4427586f5814947f7ea0c3b52bd821d72c3e4dbc0803ae9c950b629576fa10bf05facec1758
lib/validate_website/core.rb CHANGED
@@ -7,7 +7,7 @@ require 'validate_website/option_parser'
 require 'validate_website/validator'
 require 'validate_website/colorful_messages'
 
-require 'anemone'
+require 'spidr'
 
 module ValidateWebsite
 
@@ -15,7 +15,7 @@ module ValidateWebsite
   class Core
 
     attr_accessor :site
-    attr_reader :options, :anemone
+    attr_reader :options, :crawler
 
     include ColorfulMessages
 
@@ -52,29 +52,35 @@ module ValidateWebsite
     #
     def crawl(opts={})
       opts = @options.merge(opts)
+      opts.merge!(:ignore_links => Regexp.new(opts[:exclude])) if opts[:exclude]
+
       puts color(:note, "validating #{@site}", opts[:color]) unless opts[:quiet]
       puts color(:warning, "No internet connection") unless internet_connection?
 
-      @anemone = Anemone.crawl(@site, opts) do |anemone|
-        anemone.skip_links_like Regexp.new(opts[:exclude]) if opts[:exclude]
+      @crawler = Spidr.site(@site, opts) do |crawler|
+        crawler.every_css_page do |page|
+          extract_urls_from_css(page).each do |u|
+            crawler.enqueue(u)
+          end
+        end
 
-        # select the links on each page to follow (iframe, link, css url)
-        anemone.focus_crawl { |page|
-          page.links.concat(extract_urls(page))
-        }
+        crawler.every_html_page do |page|
+          extract_imgs_from_page(page).each do |i|
+            crawler.enqueue(i)
+          end
 
-        anemone.on_every_page { |page|
-          url = page.url.to_s
-          if opts[:markup_validation] && page.html? && page.fetched?
-            validate(page.doc, page.body, url, opts)
+          if opts[:markup_validation] && page.html?
+            validate(page.doc, page.body, page.url, opts)
           end
+        end
 
-          if opts[:not_found] && page.not_found?
+        crawler.every_failed_url do |url|
+          if opts[:not_found]
             @not_found_error = true
-            puts color(:error, "%s linked in %s but not exist" % [url, page.referer], opts[:color])
+            puts color(:error, "%s linked but not exist" % [url], opts[:color])
             to_file(url)
           end
-        }
+        end
       end
     end
 
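Note on the new crawl flow: Anemone's focus_crawl/on_every_page hooks are replaced by Spidr's per-content-type callbacks, and skip_links_like is replaced by passing an :ignore_links pattern in the options hash (hence the opts.merge! at the top of crawl). A minimal sketch of the Spidr callback API the diff relies on (the site URL is a placeholder):

```ruby
require 'spidr'

Spidr.site('http://localhost:3000/') do |agent|
  # runs for every page served as HTML; these get markup validation
  agent.every_html_page do |page|
    puts "html: #{page.url}"
  end

  # runs for every page served as CSS; url() references are re-enqueued
  agent.every_css_page do |page|
    puts "css: #{page.url}"
  end

  # runs for every URL that could not be fetched; feeds the not_found report
  agent.every_failed_url do |url|
    puts "failed: #{url}"
  end
end
```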
@@ -93,17 +99,14 @@ module ValidateWebsite
       files.each do |f|
         next unless File.file?(f)
 
-        page = Anemone::Page.new(URI.parse(opts[:site] + URI.encode(f)),
-          :body => open(f).read,
-          :headers => {'content-type' => ['text/html', 'application/xhtml+xml']})
+        response = fake_http_response(open(f).read)
+        page = Spidr::Page.new(URI.parse(opts[:site] + URI.encode(f)), response)
 
         if opts[:markup_validation]
           validate(page.doc, page.body, f)
         end
         if opts[:not_found]
-          links = page.links
-          links.concat extract_urls_from_img_script_iframe_link(page)
-          check_static_not_found(links.uniq)
+          check_static_not_found(page.links)
         end
       end
     end
@@ -128,13 +131,6 @@ module ValidateWebsite
       end
     end
 
-    def get_url(page, elem, attrname)
-      u = elem.attributes[attrname].to_s
-      return if u.nil? || u.empty?
-      abs = page.to_absolute(u) rescue nil
-      abs if abs && page.in_domain?(abs)
-    end
-
     # check files linked on static document
     # see lib/validate_website/runner.rb
     def check_static_not_found(links, opts={})
@@ -143,8 +139,8 @@ module ValidateWebsite
         file_location = URI.parse(File.join(Dir.getwd, l.path)).path
         # Check CSS url()
         if File.exists?(file_location) && File.extname(file_location) == '.css'
-          css_page = Anemone::Page.new(l, :body => File.read(file_location),
-            :headers => {'content-type' => ['text/css']})
+          response = fake_http_response(open(file_location).read, ['text/css'])
+          css_page = Spidr::Page.new(l, response)
           links.concat extract_urls_from_css(css_page)
           links.uniq!
         end
@@ -156,42 +152,29 @@ module ValidateWebsite
       end
     end
 
-    # Extract urls from img script iframe and link element
-    #
-    # @param [Anemone::Page] an Anemone::Page object
-    # @return [Array] Lists of urls
-    #
-    def extract_urls_from_img_script_iframe_link(page)
-      links = Set.new
-      page.doc.css('img, script, iframe, link').each do |elem|
-        if elem.name == 'link'
-          url = get_url(page, elem, "href")
-        else
-          url = get_url(page, elem, "src")
-        end
-        links << url unless url.nil? || url.to_s.empty?
-      end
-      links
-    end
-
     # Extract urls from CSS page
     #
-    # @param [Anemone::Page] an Anemone::Page object
+    # @param [Spidr::Page] an Spidr::Page object
     # @return [Array] Lists of urls
     #
     def extract_urls_from_css(page)
-      page.body.scan(/url\((['".\/\w-]+)\)/).inject([]) do |result, url|
+      page.body.scan(/url\((['".\/\w-]+)\)/).inject(Set[]) do |result, url|
         url = url.first.gsub("'", "").gsub('"', '')
         abs = page.to_absolute(URI.parse(url))
         result << abs
       end
     end
 
-    def extract_urls(page)
-      links = Set.new
-      links.merge extract_urls_from_img_script_iframe_link(page) if page.html?
-      links.merge extract_urls_from_css(page) if page.content_type == 'text/css'
-      links.to_a
+    # Extract imgs urls from page
+    #
+    # @param [Spidr::Page] an Spidr::Page object
+    # @return [Array] Lists of urls
+    #
+    def extract_imgs_from_page(page)
+      page.doc.search('//img[@src]').inject(Set[]) do |result, elem|
+        u = elem.attributes['src']
+        result << page.to_absolute(URI.parse(u))
+      end
     end
 
     ##
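The CSS url() extraction above is a plain regex scan over the stylesheet body. A quick illustration of what the pattern captures (the sample CSS is made up for the example):

```ruby
css = %q{body { background: url('/img/bg.png') } a { cursor: url("point.cur") }}
css.scan(/url\((['".\/\w-]+)\)/)
# => [["'/img/bg.png'"], ["\"point.cur\""]]
# each capture is stripped of quotes, parsed as a URI, and resolved
# against the page with page.to_absolute
```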
@@ -220,5 +203,20 @@ module ValidateWebsite
       end
     end
 
+    # Fake http response for Spidr static crawling
+    # see https://github.com/ruby/ruby/blob/trunk/lib/net/http/response.rb
+    #
+    # @param [String] response body
+    # @param [Array] content types
+    # @return [Net::HTTPResponse] fake http response
+    def fake_http_response(body, content_types=['text/html', 'text/xhtml+xml'])
+      response = Net::HTTPResponse.new '1.1', 200, 'OK'
+      response.instance_variable_set(:@read, true)
+      response.body = body
+      content_types.each do |c|
+        response.add_field('content-type', c)
+      end
+      response
+    end
   end
 end
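fake_http_response exists because Spidr::Page wraps a Net::HTTPResponse, whereas Anemone::Page accepted a raw body and header hash. A sketch of how a local file can be wrapped this way (the file path and URL are illustrative):

```ruby
require 'net/http'
require 'spidr'
require 'uri'

body = File.read('public/index.html')

response = Net::HTTPResponse.new('1.1', 200, 'OK')
# Net::HTTPResponse only exposes a body it believes it has read from a
# socket, so mark it as already read before assigning the body directly.
response.instance_variable_set(:@read, true)
response.body = body
response.add_field('content-type', 'text/html')

page = Spidr::Page.new(URI.parse('http://www.example.com/index.html'), response)
page.html? # => true, so the page can go through markup validation
```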
lib/validate_website/option_parser.rb CHANGED
@@ -16,11 +16,6 @@ module ValidateWebsite
     :color => true,
     # internal verbose for ValidateWebsite
     :validate_verbose => false,
-    # Anemone options see anemone/lib/anemone/core.rb
-    :verbose => false,
-    :cookies => nil,
-    :accept_cookies => true,
-    :redirect_limit => 0,
   }
 
   DEFAULT_OPTS_CRAWL = {
@@ -101,10 +96,6 @@ module ValidateWebsite
            "Only report errors (Default: #{@@default_opts[:quiet]})") { |v|
         options[:quiet] = v
       }
-      o.on("-d", "--debug",
-           "Show anemone log (Default: #{@@default_opts[:verbose]})") { |v|
-        options[:verbose] = v
-      }
 
       o.separator ""
       o.on_tail("-h", "--help", "Show this help message.") { puts o; exit }
man/man1/validate-website-static.1 CHANGED
@@ -1,13 +1,13 @@
 '\" t
 .\" Title: validate-website-static
 .\" Author: [see the "AUTHOR" section]
-.\" Generator: DocBook XSL Stylesheets v1.76.1 <http://docbook.sf.net/>
-.\" Date: 04/05/2012
+.\" Generator: DocBook XSL Stylesheets v1.78.1 <http://docbook.sf.net/>
+.\" Date: 09/20/2014
 .\" Manual: \ \&
 .\" Source: \ \&
 .\" Language: English
 .\"
-.TH "VALIDATE\-WEBSITE\-S" "1" "04/05/2012" "\ \&" "\ \&"
+.TH "VALIDATE\-WEBSITE\-S" "1" "09/20/2014" "\ \&" "\ \&"
 .\" -----------------------------------------------------------------
 .\" * Define some portability stuff
 .\" -----------------------------------------------------------------
man/man1/validate-website.1 CHANGED
@@ -1,13 +1,13 @@
 '\" t
 .\" Title: validate-website
 .\" Author: [see the "AUTHOR" section]
-.\" Generator: DocBook XSL Stylesheets v1.76.1 <http://docbook.sf.net/>
-.\" Date: 04/05/2012
+.\" Generator: DocBook XSL Stylesheets v1.78.1 <http://docbook.sf.net/>
+.\" Date: 09/20/2014
 .\" Manual: \ \&
 .\" Source: \ \&
 .\" Language: English
 .\"
-.TH "VALIDATE\-WEBSITE" "1" "04/05/2012" "\ \&" "\ \&"
+.TH "VALIDATE\-WEBSITE" "1" "09/20/2014" "\ \&" "\ \&"
 .\" -----------------------------------------------------------------
 .\" * Define some portability stuff
 .\" -----------------------------------------------------------------
@@ -45,7 +45,7 @@ http://localhost:3000/)
 .PP
 \fB\-u\fR, \fB\-\-user\-agent\fR \fIUSERAGENT\fR
 .RS 4
-Change user agent (Default: Anemone/VERSION)
+Change user agent (Default: Spidr\&.user_agent)
 .RE
 .PP
 \fB\-e\fR, \fB\-\-exclude\fR \fIEXCLUDE\fR
@@ -95,11 +95,6 @@ Show detail of validator errors (Default: false)\&.
 Only report errors (Default: false)\&.
 .RE
 .PP
-\fB\-d\fR, \fB\-\-debug\fR
-.RS 4
-Show anemone log (Default: false)
-.RE
-.PP
 \fB\-h\fR, \fB\-\-help\fR
 .RS 4
 Show help message and exit\&.
spec/core_spec.rb CHANGED
@@ -5,7 +5,8 @@ describe ValidateWebsite::Core do
 
   before do
     WebMock.reset!
-    stub_request(:get, ValidateWebsite::Core::PING_URL).with(:status => 200)
+    stub_request(:get, ValidateWebsite::Core::PING_URL).to_return(:status => 200)
+    stub_request(:get, /#{SPEC_DOMAIN}/).to_return(:status => 200)
     @validate_website = ValidateWebsite::Core.new(:color => false)
   end
 
@@ -18,7 +19,7 @@ describe ValidateWebsite::Core do
                         :content_type => 'text/html')
     @validate_website.site = page.url
     @validate_website.crawl(:quiet => true)
-    @validate_website.anemone.pages.size.must_equal 5
+    @validate_website.crawler.history.size.must_equal 5
   end
 
   it 'extract link' do
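These assertions change from anemone.pages.size to crawler.history.size: Spidr::Agent#history is the set of URLs visited during the crawl, so its size matches the page count Anemone used to report. A hypothetical snippet mirroring the spec above:

```ruby
@validate_website.crawl(:quiet => true)
@validate_website.crawler.history      # => Set of visited URIs
@validate_website.crawler.history.size # number of pages fetched
```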
@@ -29,7 +30,7 @@ describe ValidateWebsite::Core do
                         :content_type => 'text/html')
     @validate_website.site = page.url
     @validate_website.crawl(:quiet => true)
-    @validate_website.anemone.pages.size.must_equal 98
+    @validate_website.crawler.history.size.must_equal 98
   end
 end
 
@@ -43,7 +44,7 @@ describe ValidateWebsite::Core do
                         :content_type => 'text/css')
     @validate_website.site = page.url
     @validate_website.crawl(:quiet => true)
-    @validate_website.anemone.pages.size.must_equal 5
+    @validate_website.crawler.history.size.must_equal 5
   end
 
   it "should extract url with single quote" do
@@ -52,7 +53,7 @@ describe ValidateWebsite::Core do
                         :content_type => 'text/css')
     @validate_website.site = page.url
     @validate_website.crawl(:quiet => true)
-    @validate_website.anemone.pages.size.must_equal 2
+    @validate_website.crawler.history.size.must_equal 2
   end
 
   it "should extract url with double quote" do
@@ -61,7 +62,7 @@ describe ValidateWebsite::Core do
                         :content_type => 'text/css')
     @validate_website.site = page.url
     @validate_website.crawl(:quiet => true)
-    @validate_website.anemone.pages.size.must_equal 2
+    @validate_website.crawler.history.size.must_equal 2
   end
 end
 
spec/spec_helper.rb CHANGED
@@ -1,7 +1,7 @@
 # encoding: UTF-8
 require 'minitest/autorun'
 require_relative 'webmock_helper'
-require 'anemone'
+require 'spidr'
 require 'pry'
 
 require 'validate_website/core'
spec/validator_spec.rb CHANGED
@@ -4,7 +4,7 @@ require File.expand_path('../spec_helper', __FILE__)
 describe ValidateWebsite::Validator do
   before do
     WebMock.reset!
-    @http = Anemone::HTTP.new
+    @http = Spidr::Agent.new
   end
 
   describe("xhtml1") do
@@ -15,7 +15,7 @@ describe ValidateWebsite::Validator do
       page = FakePage.new(name,
                           :body => open(file).read,
                           :content_type => 'text/html')
-      @xhtml1_page = @http.fetch_page(page.url)
+      @xhtml1_page = @http.get_page(page.url)
       validator = ValidateWebsite::Validator.new(@xhtml1_page.doc, @xhtml1_page.body)
       validator.dtd.system_id.must_equal dtd_uri
       validator.namespace.must_equal name
@@ -36,7 +36,7 @@ describe ValidateWebsite::Validator do
       page = FakePage.new(name,
                           :body => open(file).read,
                           :content_type => 'text/html')
-      @html5_page = @http.fetch_page(page.url)
+      @html5_page = @http.get_page(page.url)
       validator = ValidateWebsite::Validator.new(@html5_page.doc, @html5_page.body)
       validator.valid?.must_equal true
     end
@@ -46,7 +46,7 @@ describe ValidateWebsite::Validator do
       page = FakePage.new(name,
                           :body => open(file).read,
                           :content_type => 'text/html')
-      @html5_page = @http.fetch_page(page.url)
+      @html5_page = @http.get_page(page.url)
       validator = ValidateWebsite::Validator.new(@html5_page.doc, @html5_page.body)
       validator.valid?.must_equal true
     end
@@ -61,7 +61,7 @@ describe ValidateWebsite::Validator do
       page = FakePage.new(name,
                           :body => open(file).read,
                           :content_type => 'text/html')
-      @html5_page = @http.fetch_page(page.url)
+      @html5_page = @http.get_page(page.url)
     end
 
     it 'should have an array of errors' do
@@ -85,7 +85,7 @@ describe ValidateWebsite::Validator do
       page = FakePage.new(name,
                           :body => open(file).read,
                           :content_type => 'text/html')
-      @html4_strict_page = @http.fetch_page(page.url)
+      @html4_strict_page = @http.get_page(page.url)
       validator = ValidateWebsite::Validator.new(@html4_strict_page.doc, @html4_strict_page.body)
       validator.valid?.must_equal true
     end
metadata CHANGED
@@ -1,29 +1,29 @@
 --- !ruby/object:Gem::Specification
 name: validate-website
 version: !ruby/object:Gem::Version
-  version: 0.8.1
+  version: 0.9.0
 platform: ruby
 authors:
 - Laurent Arnoud
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2014-09-18 00:00:00.000000000 Z
+date: 2014-09-20 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
-  name: anemone
+  name: spidr
   requirement: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '0.7'
+        version: '0.4'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '0.7'
+        version: '0.4'
 - !ruby/object:Gem::Dependency
   name: paint
   requirement: !ruby/object:Gem::Requirement
@@ -250,7 +250,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     - !ruby/object:Gem::Version
       version: '0'
 requirements:
-- anemone
+- spidr
 - rainbow
 - multipart_body
 rubyforge_project: