site_checker 0.0.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/site_checker.rb +224 -0
- metadata +45 -11
data/lib/site_checker.rb
CHANGED
@@ -0,0 +1,224 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
require 'open-uri'
|
3
|
+
|
4
|
+
# Crawls a site starting from a given page and records broken references.
#
# Pages and images that live under the configured root are "local"; anything
# with its own URI scheme is "remote". Local pages are followed recursively,
# remote references are only opened when +visit_references+ is enabled.
#
# Example:
#   checker = SiteChecker.new { |c| c.max_recursion_depth = 2 }
#   checker.check("http://localhost:4000", "http://localhost:4000")
#   checker.problems  # => { "page" => ["link (404 Not Found)", ...] }
class SiteChecker
  # Hash of page => array of problem messages collected by the last #check.
  attr_accessor :problems
  # ignore_list: references (exact href/src values) that are skipped.
  # visit_references: when true, remote pages and images are opened as well.
  # max_recursion_depth: -1 means unlimited, otherwise the maximum link depth.
  attr_accessor :ignore_list, :visit_references, :max_recursion_depth

  # Yields self for configuration, then fills in defaults for anything the
  # block left unset.
  def initialize
    yield self if block_given?
    @ignore_list ||= []
    @visit_references ||= false
    @max_recursion_depth ||= -1
    # Initialized here so the readers below do not fail before #check runs.
    @visits = {}
    @problems ||= {}
  end

  # Resets all state and crawls the site.
  #
  # url  - the page to start from (URL or file system path).
  # root - the site root; an absolute URI means the site is fetched remotely,
  #        anything else is treated as a directory on the file system.
  def check(url, root)
    @visits = {}
    @problems = {}
    @recursion_depth = 0

    @root = root

    register_visit(:local_page, url)
    process_local_page(url, nil)
  end

  # The readers below return the references registered for one kind, or nil
  # when nothing of that kind has been registered.
  def local_pages
    @visits[:local_page]
  end

  def remote_pages
    @visits[:remote_page]
  end

  def local_images
    @visits[:local_image]
  end

  def remote_images
    @visits[:remote_image]
  end

  private

  # Collects the references of one page, reports broken in-page anchors and
  # visits every reference that has not been visited yet.
  def process_local_page(url, parent_url)
    links = collect_links(url, parent_url)

    filter_out_working_anchors!(links)
    # Anchor links were found on +url+, so their problems belong to +url+,
    # just like every other problem of a reference found on this page.
    report_and_remove_anchors!(links, url)

    links.each do |link, kind|
      next if kind == :anchor
      visit(kind, url, link) unless visited?(kind, link)
    end
  end

  def register_visit(kind, link)
    (@visits[kind] ||= []) << link
  end

  def visited?(kind, link)
    (@visits[kind] ||= []).include?(link)
  end

  # Registers the reference, then either opens it (images, remote pages) or
  # descends into it (local pages) while the recursion limit allows.
  def visit(kind, parent_url, link)
    register_visit(kind, link)
    return open_reference(kind, link, parent_url) unless kind == :local_page
    return if stop_recursion?

    @recursion_depth += 1
    process_local_page(link, parent_url)
    @recursion_depth -= 1
  end

  # Tries to open a reference and records a problem for +parent_url+ when that
  # fails. Returns the page content for local pages, nil otherwise.
  def open_reference(kind, link, parent_url)
    content = nil
    begin
      case kind
      when :local_page
        if remote_root?
          # Kept as an IO object: that is what the parser received before.
          content = open_remote(link)
        else
          link = add_index_html(link)
          content = File.read(link)
        end
      when :local_image
        # Only existence matters here; the block forms close the handle.
        if remote_root?
          open_remote(link) {}
        else
          File.open(link) {}
        end
      else
        open_remote(link) {} if @visit_references
      end
    rescue OpenURI::HTTPError => e
      new_problem(strip_root(parent_url), "#{strip_root(link)} (#{e.message.strip})")
    rescue Errno::ENOENT
      link = remove_index_html(link) if kind == :local_page
      new_problem(strip_root(parent_url), "#{strip_root(link)} (404 Not Found)")
    rescue => e
      new_problem(strip_root(parent_url), "#{strip_root(link)} (#{e.message.strip})")
    end
    content
  end

  # True when the site root is a URI with a scheme (i.e. not a directory).
  def remote_root?
    URI(@root).absolute?
  end

  # Opens +link+ through open-uri. Kernel#open is not used because it no
  # longer handles URLs on Ruby 3.0 and later. With a block the handle is
  # closed afterwards, without one the opened IO is returned.
  def open_remote(link, &block)
    uri = URI.parse(link)
    # Schemes open-uri cannot fetch (mailto:, javascript:, ...) used to end up
    # as a missing file, so they keep being reported as not found.
    raise Errno::ENOENT, link unless uri.respond_to?(:open)
    uri.open(&block)
  end

  # Drops local page links whose fragment matches an anchor on the same page.
  def filter_out_working_anchors!(links)
    links.delete_if { |link, kind| kind == :local_page && has_anchor?(links, link) }
  end

  # Whatever local link still carries a fragment at this point is broken:
  # report it and remove it so it is not visited as a page.
  def report_and_remove_anchors!(links, parent_url)
    broken = links.select { |link, kind| kind == :local_page && link =~ /^.+#.+$/ }
    broken.each do |anchor, _kind|
      new_problem(strip_root(parent_url), "#{strip_root(anchor)} (404 Not Found)")
      links.delete(anchor)
    end
  end

  def has_anchor?(links, link)
    anchor = link.gsub(/^.+#/, "")
    links.has_key?(anchor) && links[anchor] == :anchor
  end

  def absolute_reference?(link)
    link.start_with?(@root)
  end

  def relative_reference?(link)
    link =~ /^\/.+/
  end

  # Parses the page and returns a hash of reference => kind, where kind is one
  # of :local_page, :remote_page, :local_image, :remote_image or :anchor.
  # Returns an empty hash when the page could not be opened.
  def collect_links(url, parent_url)
    links = {}
    content = open_reference(:local_page, url, parent_url)
    return links unless content

    doc = Nokogiri(content)
    doc.xpath("//img").each do |img|
      next if ignored?(img['src'])
      links.merge!(detect_link_and_kind(img['src'], url, :remote_image, :local_image))
    end
    doc.xpath("//a").each do |a|
      next if ignored?(a['href'])
      links.merge!(detect_link_and_kind(a['href'], url, :remote_page, :local_page))
    end

    # A fragment may target any element by id, or an <a> by its name.
    doc.xpath("//*[@id]").each { |node| links[node['id']] = :anchor }
    doc.xpath("//a[@name]").each { |a| links[a['name']] = :anchor }
    links
  end

  # Classifies one reference found on +url+. Returns a single-entry hash of
  # reference => kind, or an empty hash when the reference is reported as a
  # problem instead (absolute path into the own site, or not a valid URI).
  def detect_link_and_kind(reference, url, external_kind, local_kind)
    begin
      link = URI(strip_trailing_slash(reference))
      external = URI(reference).absolute?
    rescue URI::InvalidURIError
      # One malformed href must not abort the whole crawl.
      new_problem(url, "#{reference} (invalid URI)")
      return {}
    end

    if link.to_s.start_with?(@root)
      new_problem(url, "#{link} (absolute path)")
      {}
    elsif external
      { link.to_s => external_kind }
    else
      { create_absolute_reference(link.to_s) => local_kind }
    end
  end

  def strip_trailing_slash(link)
    link.gsub(/\/$/, "")
  end

  # Removes the leading root (and the slash after it) from a link; nil
  # becomes an empty string. The root is escaped so that characters such as
  # "." or "+" in it are matched literally.
  def strip_root(link)
    return "" unless link
    link.sub(/\A#{Regexp.escape(@root)}\/?/, "")
  end

  def add_index_html(path)
    path.end_with?(".html") ? path : File.join(path, "index.html")
  end

  def remove_index_html(path)
    path.sub(/\/index\.html\z/, "")
  end

  # Resolves a site-relative link against the root: URI merge for remote
  # roots, path join for directories.
  def create_absolute_reference(link)
    root = URI(@root)
    if root.absolute?
      strip_trailing_slash(root.merge(link).to_s)
    else
      File.join(root.path, link)
    end
  end

  # Records +message+ for page +url+; an empty url stands for the root.
  def new_problem(url, message)
    url = @root if url.empty?
    (@problems[url] ||= []) << message
  end

  # A missing reference (nil) counts as ignored.
  def ignored?(link)
    link ? @ignore_list.include?(link) : true
  end

  def stop_recursion?
    @max_recursion_depth != -1 && @max_recursion_depth <= @recursion_depth
  end
end
|
metadata
CHANGED
@@ -1,12 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: site_checker
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
|
4
|
+
hash: 25
|
5
|
+
prerelease:
|
5
6
|
segments:
|
6
7
|
- 0
|
7
|
-
-
|
8
|
-
-
|
9
|
-
version: 0.
|
8
|
+
- 1
|
9
|
+
- 1
|
10
|
+
version: 0.1.1
|
10
11
|
platform: ruby
|
11
12
|
authors:
|
12
13
|
- Zsolt Fabok
|
@@ -14,10 +15,40 @@ autorequire:
|
|
14
15
|
bindir: bin
|
15
16
|
cert_chain: []
|
16
17
|
|
17
|
-
date: 2012-07-
|
18
|
-
|
19
|
-
|
20
|
-
|
18
|
+
date: 2012-07-18 00:00:00 Z
|
19
|
+
dependencies:
|
20
|
+
- !ruby/object:Gem::Dependency
|
21
|
+
name: rspec
|
22
|
+
prerelease: false
|
23
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
24
|
+
none: false
|
25
|
+
requirements:
|
26
|
+
- - ">="
|
27
|
+
- !ruby/object:Gem::Version
|
28
|
+
hash: 35
|
29
|
+
segments:
|
30
|
+
- 2
|
31
|
+
- 11
|
32
|
+
- 0
|
33
|
+
version: 2.11.0
|
34
|
+
type: :development
|
35
|
+
version_requirements: *id001
|
36
|
+
- !ruby/object:Gem::Dependency
|
37
|
+
name: nokogiri
|
38
|
+
prerelease: false
|
39
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
40
|
+
none: false
|
41
|
+
requirements:
|
42
|
+
- - ">="
|
43
|
+
- !ruby/object:Gem::Version
|
44
|
+
hash: 9
|
45
|
+
segments:
|
46
|
+
- 1
|
47
|
+
- 5
|
48
|
+
- 5
|
49
|
+
version: 1.5.5
|
50
|
+
type: :runtime
|
51
|
+
version_requirements: *id002
|
21
52
|
description: A simple tool for checking references on your website
|
22
53
|
email: me@zsoltfabok.com
|
23
54
|
executables: []
|
@@ -28,7 +59,6 @@ extra_rdoc_files: []
|
|
28
59
|
|
29
60
|
files:
|
30
61
|
- lib/site_checker.rb
|
31
|
-
has_rdoc: true
|
32
62
|
homepage: https://github.com/ZsoltFabok/site_checker
|
33
63
|
licenses: []
|
34
64
|
|
@@ -38,25 +68,29 @@ rdoc_options: []
|
|
38
68
|
require_paths:
|
39
69
|
- lib
|
40
70
|
required_ruby_version: !ruby/object:Gem::Requirement
|
71
|
+
none: false
|
41
72
|
requirements:
|
42
73
|
- - ">="
|
43
74
|
- !ruby/object:Gem::Version
|
75
|
+
hash: 3
|
44
76
|
segments:
|
45
77
|
- 0
|
46
78
|
version: "0"
|
47
79
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
80
|
+
none: false
|
48
81
|
requirements:
|
49
82
|
- - ">="
|
50
83
|
- !ruby/object:Gem::Version
|
84
|
+
hash: 3
|
51
85
|
segments:
|
52
86
|
- 0
|
53
87
|
version: "0"
|
54
88
|
requirements: []
|
55
89
|
|
56
90
|
rubyforge_project:
|
57
|
-
rubygems_version: 1.
|
91
|
+
rubygems_version: 1.8.15
|
58
92
|
signing_key:
|
59
93
|
specification_version: 3
|
60
|
-
summary: site_checker-0.
|
94
|
+
summary: site_checker-0.1.1
|
61
95
|
test_files: []
|
62
96
|
|