validate-website 1.0.5 → 1.1.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: 1f9ca28a5036d0be57d7b0bdea0a22977c3d6a0f
- data.tar.gz: 9add05e1e19342356734344c065fa8f727de5a39
+ metadata.gz: 134cbdd3da2da6847c525ffe5a2ee68f1a380ae2
+ data.tar.gz: 678ff514b9f6f368bbb78e93a8dc42f38a35a803
  SHA512:
- metadata.gz: 21b55b544e3db2d75598e43e7c2d41980e045b17f1ea2d9a2961770a71ee32d66f80cd3ced92ce0a36baaae33a0eccf90519f5de3cbdd972af97b496362ae8b1
- data.tar.gz: 84a3117d4d7d1a125df96b779de04454858024269493ae519c6a301e8069bf90b14b9489b92a69dead1a5d4aadc39d224fd55a9d583ecff198eb04d517c73f34
+ metadata.gz: dd90c5dec7d0c80ea9b94abcb1a38a425ee59be32c73fe9fc97620a5a00ee4c9cf2dae52aa099509c8573f2dfca117377ed15373bce2d93ed7c25a43e6ed067e
+ data.tar.gz: 4d24ac9b1dccd744a3b7bd7ebbb18b946c05c4043a0c76898eafc543892e886715e2d99670ea91f3b9d0c3203425cf0de420445d041360c44fb67a934c6165c2
data/Rakefile CHANGED
@@ -4,8 +4,8 @@ require 'rake/testtask'
  task default: [:test]

  RDoc::Task.new do |rd|
- rd.main = "README.rdoc"
- rd.rdoc_files.include("README.rdoc", "lib/**/*.rb")
+ rd.main = 'README.rdoc'
+ rd.rdoc_files.include('README.rdoc', 'lib/**/*.rb')
  end

  # install asciidoc libxml2-utils xmlto docbook-xsl docbook-xml
@@ -15,6 +15,6 @@ task :manpage do
  end

  Rake::TestTask.new do |t|
- t.pattern = "spec/*_spec.rb"
+ t.pattern = 'spec/*_spec.rb'
  end
  task spec: :test
@@ -1,2 +1,3 @@
  # encoding: utf-8
  require 'validate_website/core'
+ require 'validate_website/version'
@@ -1,5 +1,3 @@
- # encoding: utf-8
-
  require 'set'
  require 'open-uri'
  require 'webrick/cookie'
@@ -10,7 +8,11 @@ require 'validate_website/colorful_messages'

  require 'spidr'

+ # Base module ValidateWebsite
  module ValidateWebsite
+ autoload :Crawl, 'validate_website/crawl'
+ autoload :Static, 'validate_website/static'
+
  # Core class for static or website validation
  class Core
  attr_accessor :site
@@ -23,55 +25,16 @@ module ValidateWebsite
  EXIT_FAILURE_NOT_FOUND = 65
  EXIT_FAILURE_MARKUP_NOT_FOUND = 66

- PING_URL = 'http://www.google.com/'
-
- def initialize(options = {}, validation_type = :crawl)
+ def initialize(options = {}, validation_type)
  @not_founds_count = 0
  @errors_count = 0
- @options = Parser.parse(options, validation_type)
+ @options = Parser.parse(options, validation_type).to_h
  @site = @options[:site]
- @service_url = @options[:'html5-validator-service-url']
+ @service_url = @options[:html5_validator_service_url]
  Validator.html5_validator_service_url = @service_url if @service_url
  puts color(:note, "validating #{@site}\n", @options[:color])
  end

- # @param [Hash] options
- # :color [Boolean] color output (true, false)
- # :exclude [String] a String used by Regexp.new
- # :markup [Boolean] Check the markup validity
- # :not_found [Boolean] Check for not found page (404)
- #
- def crawl(options = {})
- @options = @options.to_hash.merge(options)
- @options.merge!(ignore_links: @options[:exclude]) if @options[:exclude]
- puts color(:warning, "No internet connection") unless internet_connection?
-
- @crawler = spidr_crawler(@site, @options)
- print_status_line(@crawler.history.size,
- @crawler.failures.size,
- @not_founds_count,
- @errors_count)
- end
-
- # @param [Hash] options
- #
- def crawl_static(options = {})
- @options = @options.to_hash.merge(options)
- @site = @options[:site]
-
- files = Dir.glob(@options[:pattern])
- files.each do |f|
- next unless File.file?(f)
-
- response = fake_httpresponse(open(f).read)
- page = Spidr::Page.new(URI.join(@site, URI.encode(f)), response)
-
- validate(page.doc, page.body, f) if @options[:markup]
- check_static_not_found(page.links) if @options[:not_found]
- end
- print_status_line(files.size, 0, @not_founds_count, @errors_count)
- end
-
  def errors?
  @errors_count > 0
  end
@@ -99,71 +62,33 @@ module ValidateWebsite
  end
  end

- private
-
- def internet_connection?
- true if open(ValidateWebsite::Core::PING_URL)
- rescue
- false
- end
-
- def static_site_link(l)
- link = URI.parse(URI.encode(l))
- link = URI.join(@site, link) if link.host.nil?
- link
- end
-
- def in_static_domain?(site, link)
- URI.parse(site).host == link.host
- end
-
- # check files linked on static document
- # see lib/validate_website/runner.rb
- def check_static_not_found(links)
- links.each_with_object(Set[]) do |l, result|
- next if l.include?('#')
- link = static_site_link(l)
- next unless in_static_domain?(@site, link)
- file_path = URI.parse(File.join(Dir.getwd, link.path || '/')).path
- not_found_error(file_path) && next unless File.exist?(file_path)
- # Check CSS url()
- if File.extname(file_path) == '.css'
- response = fake_httpresponse(open(file_path).read, ['text/css'])
- css_page = Spidr::Page.new(l, response)
- result.merge extract_urls_from_css(css_page)
- end
- end
- end
-
- def not_found_error(location)
- puts "\n"
- puts color(:error, "#{location} linked but not exist", @options[:color])
- @not_founds_count += 1
- end
-
  # Extract urls from CSS page
  #
  # @param [Spidr::Page] an Spidr::Page object
  # @return [Array] Lists of urls
  #
- def extract_urls_from_css(page)
- page.body.scan(/url\((['".\/\w-]+)\)/).reduce(Set[]) do |result, url|
- url = url.first.gsub("'", "").gsub('"', '')
- abs = page.to_absolute(URI.parse(url))
- result << abs
+ def self.extract_urls_from_css(page)
+ page.body.scan(%r{url\((['".\/\w-]+)\)}).reduce(Set[]) do |result, url|
+ url = url.first.gsub("'", '').gsub('"', '')
+ abs = page.to_absolute(url)
+ result << abs.to_s
  end
  end

- # Extract imgs urls from page
- #
- # @param [Spidr::Page] an Spidr::Page object
- # @return [Array] Lists of urls
- #
- def extract_imgs_from_page(page)
- page.doc.search('//img[@src]').reduce(Set[]) do |result, elem|
- u = elem.attributes['src']
- result << page.to_absolute(URI.parse(u))
- end
+ private
+
+ def print_status_line(total, failures, not_founds, errors)
+ puts "\n\n"
+ puts color(:info, ["#{total} visited",
+ "#{failures} failures",
+ "#{not_founds} not founds",
+ "#{errors} errors"].join(', '), options[:color])
+ end
+
+ def not_found_error(location)
+ puts "\n"
+ puts color(:error, "#{location} linked but not exist", options[:color])
+ @not_founds_count += 1
  end

  ##
@@ -177,65 +102,16 @@ module ValidateWebsite
  if validator.valid?
  print color(:success, '.', options[:color]) # rspec style
  else
- @errors_count += 1
- puts "\n"
- puts color(:error, "* #{url}", options[:color])
- if options[:verbose]
- puts color(:error, validator.errors.join(', '), options[:color])
- end
- end
- end
-
- # Fake http response for Spidr static crawling
- # see https://github.com/ruby/ruby/blob/trunk/lib/net/http/response.rb
- #
- # @param [String] response body
- # @param [Array] content types
- # @return [Net::HTTPResponse] fake http response
- def fake_httpresponse(body, content_types = ['text/html', 'text/xhtml+xml'])
- response = Net::HTTPResponse.new '1.1', 200, 'OK'
- response.instance_variable_set(:@read, true)
- response.body = body
- content_types.each do |c|
- response.add_field('content-type', c)
+ handle_validation_error(validator, url)
  end
- response
  end

- def print_status_line(total, failures, not_founds, errors)
- puts "\n\n"
- puts color(:info, ["#{total} visited",
- "#{failures} failures",
- "#{not_founds} not founds",
- "#{errors} errors"].join(', '), @options[:color])
- end
-
- def spidr_crawler(site, options)
- @host = URI(site).host
- Spidr.site(site, options) do |crawler|
- crawler.cookies[@host] = default_cookies if options[:cookies]
- crawler.every_css_page do |page|
- extract_urls_from_css(page).each do |u|
- crawler.enqueue(u)
- end
- end
-
- crawler.every_html_page do |page|
- extract_imgs_from_page(page).each do |i|
- crawler.enqueue(i)
- end
-
- if options[:markup] && page.html?
- validate(page.doc, page.body, page.url, options[:ignore])
- end
- end
-
- if options[:not_found]
- crawler.every_failed_url do |url|
- not_found_error(url)
- end
- end
- end
+ def handle_validation_error(validator, url)
+ @errors_count += 1
+ puts "\n"
+ puts color(:error, "* #{url}", options[:color])
+ return unless options[:verbose]
+ puts color(:error, validator.errors.join(', '), options[:color])
  end
  end
  end
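
One visible effect of this hunk is that extract_urls_from_css is now a class method on Core that returns absolute URL strings, so other classes (the new Crawl file below) can call it without a Core instance. A minimal sketch of calling it directly, reusing the in-memory Net::HTTPResponse trick from the removed fake_httpresponse helper; the example URL and CSS body are made up:

    require 'net/http'
    require 'spidr'
    require 'validate_website/core'

    # Build an in-memory response, mirroring the removed fake_httpresponse helper.
    response = Net::HTTPResponse.new('1.1', 200, 'OK')
    response.instance_variable_set(:@read, true)
    response.body = "body { background: url('bg.png') }"
    response.add_field('content-type', 'text/css')

    page = Spidr::Page.new(URI('http://example.com/style.css'), response)
    ValidateWebsite::Core.extract_urls_from_css(page)
    # => a Set of absolute URL strings, e.g. {"http://example.com/bg.png"}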
@@ -0,0 +1,78 @@
+ require 'validate_website/core'
+
+ module ValidateWebsite
+ # Class for http website validation
+ class Crawl < Core
+ def initialize(options = {}, validation_type = :crawl)
+ super
+ end
+
+ # @param [Hash] options
+ # :color [Boolean] color output (true, false)
+ # :exclude [String] a String used by Regexp.new
+ # :markup [Boolean] Check the markup validity
+ # :not_found [Boolean] Check for not found page (404)
+ #
+ def crawl(options = {})
+ @options = @options.merge(options)
+ @options.merge!(ignore_links: @options[:exclude]) if @options[:exclude]
+
+ @crawler = spidr_crawler(@site, @options)
+ print_status_line(@crawler.history.size,
+ @crawler.failures.size,
+ @not_founds_count,
+ @errors_count)
+ end
+
+ private
+
+ # Extract imgs urls from page
+ #
+ # @param [Spidr::Page] an Spidr::Page object
+ # @return [Array] Lists of urls
+ #
+ def extract_imgs_from_page(page)
+ page.doc.search('//img[@src]').reduce(Set[]) do |result, elem|
+ u = elem.attributes['src']
+ result << page.to_absolute(URI.parse(u))
+ end
+ end
+
+ def spidr_crawler(site, options)
+ @host = URI(site).host
+ Spidr.site(site, options) do |crawler|
+ crawler.cookies[@host] = default_cookies if options[:cookies]
+ on_every_css_page(crawler)
+ on_every_html_page(crawler)
+ on_every_failed_url(crawler)
+ end
+ end
+
+ def on_every_css_page(crawler)
+ crawler.every_css_page do |page|
+ ValidateWebsite::Core.extract_urls_from_css(page).each do |u|
+ crawler.enqueue(u)
+ end
+ end
+ end
+
+ def on_every_html_page(crawler)
+ crawler.every_html_page do |page|
+ extract_imgs_from_page(page).each do |i|
+ crawler.enqueue(i)
+ end
+
+ if options[:markup] && page.html?
+ validate(page.doc, page.body, page.url, options[:ignore])
+ end
+ end
+ end
+
+ def on_every_failed_url(crawler)
+ return unless options[:not_found]
+ crawler.every_failed_url do |url|
+ not_found_error(url)
+ end
+ end
+ end
+ end
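
After this split, driving a crawl from Ruby would presumably look like the sketch below. The option keys come from the crawl documentation above; the target URL is a placeholder, and the call performs a real HTTP crawl via Spidr, so treat it as a sketch rather than a drop-in script:

    require 'validate_website'  # loads core, which autoloads ValidateWebsite::Crawl

    crawler = ValidateWebsite::Crawl.new(site: 'http://localhost/', color: true)
    crawler.crawl(markup: true, not_found: true)
    puts 'markup errors found' if crawler.errors?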
@@ -4,38 +4,69 @@ require 'slop'
  module ValidateWebsite
  # Internal class for parse command line args
  class Parser
+ VALID_TYPES = [:crawl, :static].freeze
+
  DEFAULT_OPTIONS = {
+ site: 'http://localhost/',
+ pattern: '**/*.html',
+ exclude: nil,
+ user_agent: nil,
  markup: true,
  # crawler: log not found url (404 status code)
  # static: log not found url (not on filesystem, `pwd` considered
- # as root « / »)
+ # as root " / ")
  not_found: false,
  file: nil,
  # regex to ignore certain validation errors
  ignore: nil,
  color: true,
  # internal verbose for ValidateWebsite
- verbose: false,
+ verbose: false
  }

- DEFAULT_OPTIONS_CRAWL = {
- site: 'http://localhost:3000/',
- exclude: nil,
- user_agent: nil,
- }.merge(DEFAULT_OPTIONS)
-
- DEFAULT_OPTIONS_STATIC = {
- site: 'http://www.example.com/',
- pattern: '**/*.html',
- }.merge(DEFAULT_OPTIONS)
-
+ # Generic parse method for crawl or static options
  def self.parse(options, type)
- const = "DEFAULT_OPTIONS_#{type.to_s.upcase}"
- fail ArgumentError unless const_defined?(const)
- if Array === options
+ fail ArgumentError unless VALID_TYPES.include?(type)
+ # We are in command line (ARGV)
+ if options.is_a?(Array)
  send("command_line_parse_#{type}", options)
  else
- const_get(const).merge(options)
+ # for testing or Ruby usage with a Hash
+ DEFAULT_OPTIONS.merge(options)
+ end
+ end
+
+ def self.default_args
+ Slop.parse do |o|
+ yield o if block_given?
+ boolean_options(o)
+ o.regexp('-i', '--ignore',
+ 'Validation errors to ignore (ex: "valign|autocorrect")')
+ o.string('-5', '--html5-validator-service-url',
+ 'Change default html5 validator service URL')
+ verbose_help_options(o)
+ end
+ end
+
+ def self.boolean_options(o)
+ o.bool('-m', '--markup',
+ "Markup validation (default: #{DEFAULT_OPTIONS[:markup]})",
+ default: DEFAULT_OPTIONS[:markup])
+ o.bool('-n', '--not-found',
+ "Log not found url (default: #{DEFAULT_OPTIONS[:not_found]})",
+ default: DEFAULT_OPTIONS[:not_found])
+ o.bool('--color',
+ "Show colored output (default: #{DEFAULT_OPTIONS[:color]})",
+ default: DEFAULT_OPTIONS[:color])
+ end
+
+ def self.verbose_help_options(o)
+ o.bool('-v', '--verbose',
+ "Show validator errors (default: #{DEFAULT_OPTIONS[:verbose]})",
+ default: DEFAULT_OPTIONS[:verbose])
+ o.on('-h', '--help', 'Display this help message.') do
+ puts o
+ exit
  end
  end

@@ -43,28 +74,15 @@ module ValidateWebsite
  # @params [ARGV]
  # @return [Hash]
  def self.command_line_parse_crawl(_args)
- Slop.parse(help: true) do
- banner 'Usage: validate-website [OPTIONS]'
-
- on("s", "site=", "Website to crawl",
- default: DEFAULT_OPTIONS_CRAWL[:site])
- on(:u, :user_agent=, "Change user agent",
- default: DEFAULT_OPTIONS_CRAWL[:user_agent])
- on("e", "exclude=", "Url to exclude (ex: 'redirect|news')",
- type: :regexp)
- on("c", "cookies=", "Set defaults cookies")
- on("m", "markup", "Markup validation",
- default: DEFAULT_OPTIONS_CRAWL[:markup])
- on("i", "ignore=", "Validation errors to ignore",
- type: :regexp)
- on(:n, :not_found, "Log not found url",
- default: DEFAULT_OPTIONS_CRAWL[:not_found])
- on("color", "Show colored output",
- default: DEFAULT_OPTIONS_CRAWL[:color])
- on("5", "html5-validator-service-url=",
- "Change default html5 validator service URL")
- on("v", "verbose", "Show validator errors",
- default: DEFAULT_OPTIONS_CRAWL[:verbose])
+ default_args do |o|
+ o.string('-s', '--site',
+ "Website to crawl (default: #{DEFAULT_OPTIONS[:site]})",
+ default: DEFAULT_OPTIONS[:site])
+ o.string('-u', '--user-agent',
+ 'Change user agent',
+ default: DEFAULT_OPTIONS[:user_agent])
+ o.regexp('-e', '--exclude', 'Url to exclude (ex: "redirect|news")')
+ o.string('-c', '--cookies', 'Set defaults cookies')
  end
  end

@@ -72,26 +90,13 @@ module ValidateWebsite
  # @params [ARGV]
  # @return [Hash]
  def self.command_line_parse_static(_args)
- Slop.parse(help: true) do
- banner 'Usage: validate-website-static [OPTIONS]'
-
- on("s", "site=", "Website to crawl",
- default: DEFAULT_OPTIONS_STATIC[:site])
- on("p", "pattern=", "Change filenames pattern",
- type: :regexp, default: DEFAULT_OPTIONS_STATIC[:pattern])
- on("c", "cookies=", "Set defaults cookies")
- on("m", "markup", "Markup validation",
- default: DEFAULT_OPTIONS_STATIC[:markup])
- on("i", "ignore=", "Validation errors to ignore",
- type: :regexp)
- on(:n, :not_found, "Log not found url",
- default: DEFAULT_OPTIONS_STATIC[:not_found])
- on("color", "Show colored output",
- default: DEFAULT_OPTIONS_STATIC[:color])
- on("5", "html5-validator-service-url=",
- "Change default html5 validator service URL")
- on("v", "verbose", "Show validator errors",
- default: DEFAULT_OPTIONS_STATIC[:verbose])
+ default_args do |o|
+ o.string('-s', '--site',
+ "Website to crawl (default: #{DEFAULT_OPTIONS[:site]})",
+ default: DEFAULT_OPTIONS[:site])
+ o.regexp('-p', '--pattern',
+ "Filename pattern (default: #{DEFAULT_OPTIONS[:pattern]})",
+ default: DEFAULT_OPTIONS[:pattern])
  end
  end
  end
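
With the two per-type defaults collapsed into one DEFAULT_OPTIONS hash, passing a Hash to Parser.parse now simply merges it over those defaults, while an Array is treated as command-line arguments and routed through the Slop definitions above. A minimal sketch of the Hash path, assuming the parser class is already loaded; the sample site value is arbitrary:

    opts = ValidateWebsite::Parser.parse({ site: 'http://localhost:3000/' }, :static)
    opts[:site]     # => "http://localhost:3000/"
    opts[:pattern]  # => "**/*.html" (from DEFAULT_OPTIONS)

    ValidateWebsite::Parser.parse({}, :spider)
    # => raises ArgumentError, because :spider is not in VALID_TYPES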