validate-website 1.0.5 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: 1f9ca28a5036d0be57d7b0bdea0a22977c3d6a0f
- data.tar.gz: 9add05e1e19342356734344c065fa8f727de5a39
+ metadata.gz: 134cbdd3da2da6847c525ffe5a2ee68f1a380ae2
+ data.tar.gz: 678ff514b9f6f368bbb78e93a8dc42f38a35a803
  SHA512:
- metadata.gz: 21b55b544e3db2d75598e43e7c2d41980e045b17f1ea2d9a2961770a71ee32d66f80cd3ced92ce0a36baaae33a0eccf90519f5de3cbdd972af97b496362ae8b1
- data.tar.gz: 84a3117d4d7d1a125df96b779de04454858024269493ae519c6a301e8069bf90b14b9489b92a69dead1a5d4aadc39d224fd55a9d583ecff198eb04d517c73f34
+ metadata.gz: dd90c5dec7d0c80ea9b94abcb1a38a425ee59be32c73fe9fc97620a5a00ee4c9cf2dae52aa099509c8573f2dfca117377ed15373bce2d93ed7c25a43e6ed067e
+ data.tar.gz: 4d24ac9b1dccd744a3b7bd7ebbb18b946c05c4043a0c76898eafc543892e886715e2d99670ea91f3b9d0c3203425cf0de420445d041360c44fb67a934c6165c2
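
These values can be re-checked by hand after fetching the release (a minimal sketch; it assumes `gem fetch validate-website -v 1.1.0` followed by `tar xf validate-website-1.1.0.gem` has left metadata.gz and data.tar.gz in the current directory):

    require 'digest'

    # Published SHA512 for data.tar.gz, copied from the entry above.
    expected = '4d24ac9b1dccd744a3b7bd7ebbb18b946c05c4043a0c76898eafc543892e886715e2d99670ea91f3b9d0c3203425cf0de420445d041360c44fb67a934c6165c2'
    actual = Digest::SHA512.file('data.tar.gz').hexdigest
    puts(actual == expected ? 'data.tar.gz: OK' : 'data.tar.gz: MISMATCH')
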
data/Rakefile CHANGED
@@ -4,8 +4,8 @@ require 'rake/testtask'
  task default: [:test]

  RDoc::Task.new do |rd|
- rd.main = "README.rdoc"
- rd.rdoc_files.include("README.rdoc", "lib/**/*.rb")
+ rd.main = 'README.rdoc'
+ rd.rdoc_files.include('README.rdoc', 'lib/**/*.rb')
  end

  # install asciidoc libxml2-utils xmlto docbook-xsl docbook-xml
@@ -15,6 +15,6 @@ task :manpage do
  end

  Rake::TestTask.new do |t|
- t.pattern = "spec/*_spec.rb"
+ t.pattern = 'spec/*_spec.rb'
  end
  task spec: :test
data/lib/validate_website.rb CHANGED
@@ -1,2 +1,3 @@
  # encoding: utf-8
  require 'validate_website/core'
+ require 'validate_website/version'
data/lib/validate_website/core.rb CHANGED
@@ -1,5 +1,3 @@
- # encoding: utf-8
-
  require 'set'
  require 'open-uri'
  require 'webrick/cookie'
@@ -10,7 +8,11 @@ require 'validate_website/colorful_messages'

  require 'spidr'

+ # Base module ValidateWebsite
  module ValidateWebsite
+ autoload :Crawl, 'validate_website/crawl'
+ autoload :Static, 'validate_website/static'
+
  # Core class for static or website validation
  class Core
  attr_accessor :site
@@ -23,55 +25,16 @@ module ValidateWebsite
  EXIT_FAILURE_NOT_FOUND = 65
  EXIT_FAILURE_MARKUP_NOT_FOUND = 66

- PING_URL = 'http://www.google.com/'
-
- def initialize(options = {}, validation_type = :crawl)
+ def initialize(options = {}, validation_type)
  @not_founds_count = 0
  @errors_count = 0
- @options = Parser.parse(options, validation_type)
+ @options = Parser.parse(options, validation_type).to_h
  @site = @options[:site]
- @service_url = @options[:'html5-validator-service-url']
+ @service_url = @options[:html5_validator_service_url]
  Validator.html5_validator_service_url = @service_url if @service_url
  puts color(:note, "validating #{@site}\n", @options[:color])
  end

- # @param [Hash] options
- # :color [Boolean] color output (true, false)
- # :exclude [String] a String used by Regexp.new
- # :markup [Boolean] Check the markup validity
- # :not_found [Boolean] Check for not found page (404)
- #
- def crawl(options = {})
- @options = @options.to_hash.merge(options)
- @options.merge!(ignore_links: @options[:exclude]) if @options[:exclude]
- puts color(:warning, "No internet connection") unless internet_connection?
-
- @crawler = spidr_crawler(@site, @options)
- print_status_line(@crawler.history.size,
- @crawler.failures.size,
- @not_founds_count,
- @errors_count)
- end
-
- # @param [Hash] options
- #
- def crawl_static(options = {})
- @options = @options.to_hash.merge(options)
- @site = @options[:site]
-
- files = Dir.glob(@options[:pattern])
- files.each do |f|
- next unless File.file?(f)
-
- response = fake_httpresponse(open(f).read)
- page = Spidr::Page.new(URI.join(@site, URI.encode(f)), response)
-
- validate(page.doc, page.body, f) if @options[:markup]
- check_static_not_found(page.links) if @options[:not_found]
- end
- print_status_line(files.size, 0, @not_founds_count, @errors_count)
- end
-
  def errors?
  @errors_count > 0
  end
@@ -99,71 +62,33 @@ module ValidateWebsite
  end
  end

- private
-
- def internet_connection?
- true if open(ValidateWebsite::Core::PING_URL)
- rescue
- false
- end
-
- def static_site_link(l)
- link = URI.parse(URI.encode(l))
- link = URI.join(@site, link) if link.host.nil?
- link
- end
-
- def in_static_domain?(site, link)
- URI.parse(site).host == link.host
- end
-
- # check files linked on static document
- # see lib/validate_website/runner.rb
- def check_static_not_found(links)
- links.each_with_object(Set[]) do |l, result|
- next if l.include?('#')
- link = static_site_link(l)
- next unless in_static_domain?(@site, link)
- file_path = URI.parse(File.join(Dir.getwd, link.path || '/')).path
- not_found_error(file_path) && next unless File.exist?(file_path)
- # Check CSS url()
- if File.extname(file_path) == '.css'
- response = fake_httpresponse(open(file_path).read, ['text/css'])
- css_page = Spidr::Page.new(l, response)
- result.merge extract_urls_from_css(css_page)
- end
- end
- end
-
- def not_found_error(location)
- puts "\n"
- puts color(:error, "#{location} linked but not exist", @options[:color])
- @not_founds_count += 1
- end
-
  # Extract urls from CSS page
  #
  # @param [Spidr::Page] an Spidr::Page object
  # @return [Array] Lists of urls
  #
- def extract_urls_from_css(page)
- page.body.scan(/url\((['".\/\w-]+)\)/).reduce(Set[]) do |result, url|
- url = url.first.gsub("'", "").gsub('"', '')
- abs = page.to_absolute(URI.parse(url))
- result << abs
+ def self.extract_urls_from_css(page)
+ page.body.scan(%r{url\((['".\/\w-]+)\)}).reduce(Set[]) do |result, url|
+ url = url.first.gsub("'", '').gsub('"', '')
+ abs = page.to_absolute(url)
+ result << abs.to_s
  end
  end

- # Extract imgs urls from page
- #
- # @param [Spidr::Page] an Spidr::Page object
- # @return [Array] Lists of urls
- #
- def extract_imgs_from_page(page)
- page.doc.search('//img[@src]').reduce(Set[]) do |result, elem|
- u = elem.attributes['src']
- result << page.to_absolute(URI.parse(u))
- end
+ private
+
+ def print_status_line(total, failures, not_founds, errors)
+ puts "\n\n"
+ puts color(:info, ["#{total} visited",
+ "#{failures} failures",
+ "#{not_founds} not founds",
+ "#{errors} errors"].join(', '), options[:color])
+ end
+
+ def not_found_error(location)
+ puts "\n"
+ puts color(:error, "#{location} linked but not exist", options[:color])
+ @not_founds_count += 1
  end

  ##
@@ -177,65 +102,16 @@ module ValidateWebsite
  if validator.valid?
  print color(:success, '.', options[:color]) # rspec style
  else
- @errors_count += 1
- puts "\n"
- puts color(:error, "* #{url}", options[:color])
- if options[:verbose]
- puts color(:error, validator.errors.join(', '), options[:color])
- end
- end
- end
-
- # Fake http response for Spidr static crawling
- # see https://github.com/ruby/ruby/blob/trunk/lib/net/http/response.rb
- #
- # @param [String] response body
- # @param [Array] content types
- # @return [Net::HTTPResponse] fake http response
- def fake_httpresponse(body, content_types = ['text/html', 'text/xhtml+xml'])
- response = Net::HTTPResponse.new '1.1', 200, 'OK'
- response.instance_variable_set(:@read, true)
- response.body = body
- content_types.each do |c|
- response.add_field('content-type', c)
+ handle_validation_error(validator, url)
  end
- response
  end

- def print_status_line(total, failures, not_founds, errors)
- puts "\n\n"
- puts color(:info, ["#{total} visited",
- "#{failures} failures",
- "#{not_founds} not founds",
- "#{errors} errors"].join(', '), @options[:color])
- end
-
- def spidr_crawler(site, options)
- @host = URI(site).host
- Spidr.site(site, options) do |crawler|
- crawler.cookies[@host] = default_cookies if options[:cookies]
- crawler.every_css_page do |page|
- extract_urls_from_css(page).each do |u|
- crawler.enqueue(u)
- end
- end
-
- crawler.every_html_page do |page|
- extract_imgs_from_page(page).each do |i|
- crawler.enqueue(i)
- end
-
- if options[:markup] && page.html?
- validate(page.doc, page.body, page.url, options[:ignore])
- end
- end
-
- if options[:not_found]
- crawler.every_failed_url do |url|
- not_found_error(url)
- end
- end
- end
+ def handle_validation_error(validator, url)
+ @errors_count += 1
+ puts "\n"
+ puts color(:error, "* #{url}", options[:color])
+ return unless options[:verbose]
+ puts color(:error, validator.errors.join(', '), options[:color])
  end
  end
  end
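
Net effect in Core: crawling moves out to the new Crawl subclass (autoloaded above), the option key becomes an underscored symbol (:html5_validator_service_url), and extract_urls_from_css becomes a public class method returning absolute URL strings. A minimal sketch of the new call, assuming `page` is a Spidr::Page whose body is a stylesheet:

    require 'validate_website/core'

    # `page` is assumed to be a Spidr::Page with a CSS body.
    urls = ValidateWebsite::Core.extract_urls_from_css(page)
    urls.each { |u| puts u } # absolute URLs as strings (see abs.to_s above)
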
data/lib/validate_website/crawl.rb ADDED
@@ -0,0 +1,78 @@
+ require 'validate_website/core'
+
+ module ValidateWebsite
+ # Class for http website validation
+ class Crawl < Core
+ def initialize(options = {}, validation_type = :crawl)
+ super
+ end
+
+ # @param [Hash] options
+ # :color [Boolean] color output (true, false)
+ # :exclude [String] a String used by Regexp.new
+ # :markup [Boolean] Check the markup validity
+ # :not_found [Boolean] Check for not found page (404)
+ #
+ def crawl(options = {})
+ @options = @options.merge(options)
+ @options.merge!(ignore_links: @options[:exclude]) if @options[:exclude]
+
+ @crawler = spidr_crawler(@site, @options)
+ print_status_line(@crawler.history.size,
+ @crawler.failures.size,
+ @not_founds_count,
+ @errors_count)
+ end
+
+ private
+
+ # Extract imgs urls from page
+ #
+ # @param [Spidr::Page] an Spidr::Page object
+ # @return [Array] Lists of urls
+ #
+ def extract_imgs_from_page(page)
+ page.doc.search('//img[@src]').reduce(Set[]) do |result, elem|
+ u = elem.attributes['src']
+ result << page.to_absolute(URI.parse(u))
+ end
+ end
+
+ def spidr_crawler(site, options)
+ @host = URI(site).host
+ Spidr.site(site, options) do |crawler|
+ crawler.cookies[@host] = default_cookies if options[:cookies]
+ on_every_css_page(crawler)
+ on_every_html_page(crawler)
+ on_every_failed_url(crawler)
+ end
+ end
+
+ def on_every_css_page(crawler)
+ crawler.every_css_page do |page|
+ ValidateWebsite::Core.extract_urls_from_css(page).each do |u|
+ crawler.enqueue(u)
+ end
+ end
+ end
+
+ def on_every_html_page(crawler)
+ crawler.every_html_page do |page|
+ extract_imgs_from_page(page).each do |i|
+ crawler.enqueue(i)
+ end
+
+ if options[:markup] && page.html?
+ validate(page.doc, page.body, page.url, options[:ignore])
+ end
+ end
+ end
+
+ def on_every_failed_url(crawler)
+ return unless options[:not_found]
+ crawler.every_failed_url do |url|
+ not_found_error(url)
+ end
+ end
+ end
+ end
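
Programmatic use now goes through the subclass rather than Core#crawl. A minimal usage sketch, assuming a server is reachable at the given site (every other key falls back to Parser::DEFAULT_OPTIONS):

    require 'validate_website/crawl'

    # Hash options are merged over DEFAULT_OPTIONS by Parser.parse.
    crawler = ValidateWebsite::Crawl.new(site: 'http://localhost/', markup: true)
    crawler.crawl(not_found: true)
    warn 'validation failed' if crawler.errors?
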
data/lib/validate_website/option_parser.rb CHANGED
@@ -4,38 +4,69 @@ require 'slop'
  module ValidateWebsite
  # Internal class for parse command line args
  class Parser
+ VALID_TYPES = [:crawl, :static].freeze
+
  DEFAULT_OPTIONS = {
+ site: 'http://localhost/',
+ pattern: '**/*.html',
+ exclude: nil,
+ user_agent: nil,
  markup: true,
  # crawler: log not found url (404 status code)
  # static: log not found url (not on filesystem, `pwd` considered
- # as root « / »)
+ # as root " / ")
  not_found: false,
  file: nil,
  # regex to ignore certain validation errors
  ignore: nil,
  color: true,
  # internal verbose for ValidateWebsite
- verbose: false,
+ verbose: false
  }

- DEFAULT_OPTIONS_CRAWL = {
- site: 'http://localhost:3000/',
- exclude: nil,
- user_agent: nil,
- }.merge(DEFAULT_OPTIONS)
-
- DEFAULT_OPTIONS_STATIC = {
- site: 'http://www.example.com/',
- pattern: '**/*.html',
- }.merge(DEFAULT_OPTIONS)
-
+ # Generic parse method for crawl or static options
  def self.parse(options, type)
- const = "DEFAULT_OPTIONS_#{type.to_s.upcase}"
- fail ArgumentError unless const_defined?(const)
- if Array === options
+ fail ArgumentError unless VALID_TYPES.include?(type)
+ # We are in command line (ARGV)
+ if options.is_a?(Array)
  send("command_line_parse_#{type}", options)
  else
- const_get(const).merge(options)
+ # for testing or Ruby usage with a Hash
+ DEFAULT_OPTIONS.merge(options)
+ end
+ end
+
+ def self.default_args
+ Slop.parse do |o|
+ yield o if block_given?
+ boolean_options(o)
+ o.regexp('-i', '--ignore',
+ 'Validation errors to ignore (ex: "valign|autocorrect")')
+ o.string('-5', '--html5-validator-service-url',
+ 'Change default html5 validator service URL')
+ verbose_help_options(o)
+ end
+ end
+
+ def self.boolean_options(o)
+ o.bool('-m', '--markup',
+ "Markup validation (default: #{DEFAULT_OPTIONS[:markup]})",
+ default: DEFAULT_OPTIONS[:markup])
+ o.bool('-n', '--not-found',
+ "Log not found url (default: #{DEFAULT_OPTIONS[:not_found]})",
+ default: DEFAULT_OPTIONS[:not_found])
+ o.bool('--color',
+ "Show colored output (default: #{DEFAULT_OPTIONS[:color]})",
+ default: DEFAULT_OPTIONS[:color])
+ end
+
+ def self.verbose_help_options(o)
+ o.bool('-v', '--verbose',
+ "Show validator errors (default: #{DEFAULT_OPTIONS[:verbose]})",
+ default: DEFAULT_OPTIONS[:verbose])
+ o.on('-h', '--help', 'Display this help message.') do
+ puts o
+ exit
  end
  end

@@ -43,28 +74,15 @@ module ValidateWebsite
  # @params [ARGV]
  # @return [Hash]
  def self.command_line_parse_crawl(_args)
- Slop.parse(help: true) do
- banner 'Usage: validate-website [OPTIONS]'
-
- on("s", "site=", "Website to crawl",
- default: DEFAULT_OPTIONS_CRAWL[:site])
- on(:u, :user_agent=, "Change user agent",
- default: DEFAULT_OPTIONS_CRAWL[:user_agent])
- on("e", "exclude=", "Url to exclude (ex: 'redirect|news')",
- type: :regexp)
- on("c", "cookies=", "Set defaults cookies")
- on("m", "markup", "Markup validation",
- default: DEFAULT_OPTIONS_CRAWL[:markup])
- on("i", "ignore=", "Validation errors to ignore",
- type: :regexp)
- on(:n, :not_found, "Log not found url",
- default: DEFAULT_OPTIONS_CRAWL[:not_found])
- on("color", "Show colored output",
- default: DEFAULT_OPTIONS_CRAWL[:color])
- on("5", "html5-validator-service-url=",
- "Change default html5 validator service URL")
- on("v", "verbose", "Show validator errors",
- default: DEFAULT_OPTIONS_CRAWL[:verbose])
+ default_args do |o|
+ o.string('-s', '--site',
+ "Website to crawl (default: #{DEFAULT_OPTIONS[:site]})",
+ default: DEFAULT_OPTIONS[:site])
+ o.string('-u', '--user-agent',
+ 'Change user agent',
+ default: DEFAULT_OPTIONS[:user_agent])
+ o.regexp('-e', '--exclude', 'Url to exclude (ex: "redirect|news")')
+ o.string('-c', '--cookies', 'Set defaults cookies')
  end
  end

@@ -72,26 +90,13 @@ module ValidateWebsite
  # @params [ARGV]
  # @return [Hash]
  def self.command_line_parse_static(_args)
- Slop.parse(help: true) do
- banner 'Usage: validate-website-static [OPTIONS]'
-
- on("s", "site=", "Website to crawl",
- default: DEFAULT_OPTIONS_STATIC[:site])
- on("p", "pattern=", "Change filenames pattern",
- type: :regexp, default: DEFAULT_OPTIONS_STATIC[:pattern])
- on("c", "cookies=", "Set defaults cookies")
- on("m", "markup", "Markup validation",
- default: DEFAULT_OPTIONS_STATIC[:markup])
- on("i", "ignore=", "Validation errors to ignore",
- type: :regexp)
- on(:n, :not_found, "Log not found url",
- default: DEFAULT_OPTIONS_STATIC[:not_found])
- on("color", "Show colored output",
- default: DEFAULT_OPTIONS_STATIC[:color])
- on("5", "html5-validator-service-url=",
- "Change default html5 validator service URL")
- on("v", "verbose", "Show validator errors",
- default: DEFAULT_OPTIONS_STATIC[:verbose])
+ default_args do |o|
+ o.string('-s', '--site',
+ "Website to crawl (default: #{DEFAULT_OPTIONS[:site]})",
+ default: DEFAULT_OPTIONS[:site])
+ o.regexp('-p', '--pattern',
+ "Filename pattern (default: #{DEFAULT_OPTIONS[:pattern]})",
+ default: DEFAULT_OPTIONS[:pattern])
  end
  end
  end
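
The parser is rewritten against Slop 4's block API (o.string/o.bool/o.regexp on a builder, returning a Slop::Result) instead of Slop 3's on(...) DSL, with the flags shared by both commands factored into default_args. A short sketch of the two entry points (names as defined above; note that Slop.parse reads ARGV, so the Array form is driven by the actual command line):

    # From bin/validate-website: parse the command line for the crawl command.
    opts = ValidateWebsite::Parser.parse(ARGV, :crawl)

    # From Ruby or tests: a Hash is simply merged over DEFAULT_OPTIONS.
    opts = ValidateWebsite::Parser.parse({ site: 'http://localhost/' }, :static)
    opts[:pattern] # => "**/*.html"
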