site_checker 0.0.0 → 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (2) hide show
  1. data/lib/site_checker.rb +224 -0
  2. metadata +45 -11
data/lib/site_checker.rb CHANGED
@@ -0,0 +1,224 @@
1
+ require 'nokogiri'
2
+ require 'open-uri'
3
+
4
# Crawls a site starting from a given page and records broken references.
#
# The site root may be an absolute URL (pages are fetched through open-uri)
# or a directory on the local filesystem (pages are read as files, with
# "index.html" appended to paths that do not end in ".html").
#
# Configuration is done through the block passed to +new+:
#
#   checker = SiteChecker.new do |config|
#     config.ignore_list = ["/", "/atom.xml"]
#     config.visit_references = true
#     config.max_recursion_depth = 3
#   end
#   checker.check("http://localhost:4000", "http://localhost:4000")
#   checker.problems  # => { "page" => ["link (404 Not Found)", ...] }
class SiteChecker
  # Hash mapping a page to the list of problem messages recorded for it.
  attr_accessor :problems
  # ignore_list::         references (exact href/src values) that are skipped
  # visit_references::    when true, remote pages and images are opened too
  # max_recursion_depth:: maximum depth of local page recursion, -1 = no limit
  attr_accessor :ignore_list, :visit_references, :max_recursion_depth

  # Yields the new instance for configuration, then fills in defaults for
  # every setting the block left untouched.
  def initialize
    yield self if block_given?
    @ignore_list ||= []
    @visit_references ||= false
    @max_recursion_depth ||= -1
    # Initialised here so the accessors below do not fail with
    # NoMethodError when they are called before #check.
    @visits ||= {}
    @problems ||= {}
  end

  # Resets all state and crawls the site starting at +url+.
  #
  # url::  the first page to process
  # root:: the site root; links starting with it are treated as local
  def check(url, root)
    @visits = {}
    @problems = {}
    @recursion_depth = 0
    @root = root

    register_visit(:local_page, url)
    process_local_page(url, nil)
  end

  # Local pages registered so far (nil when none were registered).
  def local_pages
    @visits[:local_page]
  end

  # Remote pages registered so far (nil when none were registered).
  def remote_pages
    @visits[:remote_page]
  end

  # Local images registered so far (nil when none were registered).
  def local_images
    @visits[:local_image]
  end

  # Remote images registered so far (nil when none were registered).
  def remote_images
    @visits[:remote_image]
  end

  private

  # Collects the links of the page at +url+, reports broken anchors and
  # visits every remaining link that has not been visited yet.
  def process_local_page(url, parent_url)
    links = collect_links(url, parent_url)

    filter_out_working_anchors!(links)
    # NOTE(review): broken anchors are filed under parent_url, although the
    # links were collected from url - confirm this is intended.
    report_and_remove_anchors!(links, parent_url)

    links.each do |link, kind|
      next if kind == :anchor
      visit(kind, url, link) unless visited?(kind, link)
    end
  end

  # Remembers that +link+ of the given +kind+ has been seen.
  def register_visit(kind, link)
    (@visits[kind] ||= []) << link
  end

  # True when +link+ was already registered under +kind+.
  def visited?(kind, link)
    @visits.fetch(kind, []).include?(link)
  end

  # Registers +link+ and either opens it (images, remote pages) or recurses
  # into it (local pages) while the recursion limit allows.
  def visit(kind, parent_url, link)
    register_visit(kind, link)
    if kind != :local_page
      open_reference(kind, link, parent_url)
    else
      unless stop_recursion?
        @recursion_depth += 1
        process_local_page(link, parent_url)
        @recursion_depth -= 1
      end
    end
  end

  # Tries to open +link+ and records a problem under +parent_url+ when that
  # fails.
  #
  # Returns the page content for :local_page links (an IO for a remote root,
  # a String for a filesystem root) and nil for every other kind or on error.
  def open_reference(kind, link, parent_url)
    content = nil
    begin
      case kind
      when :local_page
        if remote_root?
          content = open_remote(link)
        else
          link = add_index_html(link)
          content = File.read(link)
        end
      when :local_image
        if remote_root?
          open_remote(link) { |_io| nil }
        else
          # Only existence/readability matters; the block form closes the file.
          File.open(link) { |_file| nil }
        end
      else
        open_remote(link) { |_io| nil } if @visit_references
      end
    rescue Errno::ENOENT
      link = remove_index_html(link) if kind == :local_page
      new_problem(strip_root(parent_url), "#{strip_root(link)} (404 Not Found)")
    rescue => e
      # Covers OpenURI::HTTPError as well as any other StandardError.
      new_problem(strip_root(parent_url), "#{strip_root(link)} (#{e.message.strip})")
    end
    content
  end

  # True when the configured root is an absolute URL rather than a path.
  def remote_root?
    URI(@root).absolute?
  end

  # Opens +link+ through open-uri. Kernel#open stopped handling URLs in
  # Ruby 3.0, so URI.open is used whenever it is available; older rubies
  # fall back to the open-uri patched Kernel#open.
  # Without a block the opened IO is returned and the caller must close it.
  def open_remote(link, &block)
    if URI.respond_to?(:open)
      URI.open(link, &block)
    else
      open(link, &block)
    end
  end

  # Drops local page links whose fragment matches an anchor found on the page.
  def filter_out_working_anchors!(links)
    links.delete_if { |link, kind| kind == :local_page && has_anchor?(links, link) }
  end

  # Every local page link that still carries a fragment points to a missing
  # anchor: report it under +parent_url+ and remove it from +links+.
  def report_and_remove_anchors!(links, parent_url)
    broken = links.select { |link, kind| link.match(/^.+#.+$/) && kind == :local_page }
    broken.each do |anchor, _kind|
      new_problem(strip_root(parent_url), "#{strip_root(anchor)} (404 Not Found)")
      links.delete(anchor)
    end
  end

  # True when the fragment of +link+ is registered in +links+ as an :anchor.
  def has_anchor?(links, link)
    anchor = link.gsub(/^.+#/, "")
    links.has_key?(anchor) && links[anchor] == :anchor
  end

  # True when +link+ starts with the site root.
  def absolute_reference?(link)
    link.start_with?(@root)
  end

  # Truthy when +link+ starts with a slash followed by at least one character.
  def relative_reference?(link)
    link =~ /^\/.+/
  end

  # Loads the page at +url+ and returns a hash of link => kind for every
  # non-ignored <img src>, <a href> and <a id> found in it. Returns an empty
  # hash when the page could not be opened.
  def collect_links(url, parent_url)
    links = {}
    content = open_reference(:local_page, url, parent_url)
    return links unless content

    begin
      doc = Nokogiri(content)
    ensure
      # Remote pages arrive as an IO from open-uri; release it after parsing.
      content.close if content.respond_to?(:close)
    end

    doc.xpath("//img").each do |img|
      next if ignored?(img['src'])
      links.merge!(detect_link_and_kind(img['src'], url, :remote_image, :local_image))
    end
    doc.xpath("//a").each do |a|
      next if ignored?(a['href'])
      links.merge!(detect_link_and_kind(a['href'], url, :remote_page, :local_page))
    end
    doc.xpath("//a").each do |a|
      links[a['id']] = :anchor if a['id']
    end
    links
  end

  # Classifies +reference+ found on the page +url+.
  #
  # Returns a one-element hash { link => kind }, or an empty hash when the
  # reference was reported as a problem instead (it starts with the site
  # root, or it cannot be parsed as a URI).
  def detect_link_and_kind(reference, url, external_kind, local_kind)
    link_kind = {}
    begin
      link = URI(strip_trailing_slash(reference))
    rescue URI::InvalidURIError
      # A malformed href/src must not abort the whole crawl.
      new_problem(url, "#{reference} (invalid URI)")
      return link_kind
    end

    if link.to_s.start_with?(@root)
      new_problem(url, "#{link} (absolute path)")
    elsif link.absolute?
      link_kind[link.to_s] = external_kind
    else
      link_kind[create_absolute_reference(link.to_s)] = local_kind
    end
    link_kind
  end

  # Removes one trailing slash from +link+.
  def strip_trailing_slash(link)
    link.gsub(/\/$/, "")
  end

  # Removes the leading site root (and one following slash) from +link+.
  # Returns "" for nil. The root is escaped so that characters such as "."
  # in a host name are matched literally.
  def strip_root(link)
    return "" unless link
    link.sub(/\A#{Regexp.escape(@root.to_s)}\/?/, "")
  end

  # Appends "index.html" unless +path+ already ends with ".html".
  def add_index_html(path)
    path.end_with?(".html") ? path : File.join(path, "index.html")
  end

  # Removes a trailing "/index.html" from +path+.
  def remove_index_html(path)
    path.sub(/\/index\.html\z/, "")
  end

  # Resolves +link+ against the site root: URI merge for a remote root,
  # File.join for a filesystem root.
  def create_absolute_reference(link)
    root = URI(@root)
    if root.absolute?
      strip_trailing_slash(root.merge(link).to_s)
    else
      File.join(root.path, link)
    end
  end

  # Records +message+ under +url+; a nil or empty +url+ stands for the root.
  def new_problem(url, message)
    url = @root if url.nil? || url.empty?
    (@problems[url] ||= []) << message
  end

  # True when +link+ is missing or listed in the ignore list.
  def ignored?(link)
    !link || @ignore_list.include?(link)
  end

  # True when the configured recursion limit has been reached
  # (-1 disables the limit).
  def stop_recursion?
    return false if @max_recursion_depth == -1
    @max_recursion_depth <= @recursion_depth
  end
end
metadata CHANGED
@@ -1,12 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: site_checker
3
3
  version: !ruby/object:Gem::Version
4
- prerelease: false
4
+ hash: 25
5
+ prerelease:
5
6
  segments:
6
7
  - 0
7
- - 0
8
- - 0
9
- version: 0.0.0
8
+ - 1
9
+ - 1
10
+ version: 0.1.1
10
11
  platform: ruby
11
12
  authors:
12
13
  - Zsolt Fabok
@@ -14,10 +15,40 @@ autorequire:
14
15
  bindir: bin
15
16
  cert_chain: []
16
17
 
17
- date: 2012-07-15 00:00:00 +03:00
18
- default_executable:
19
- dependencies: []
20
-
18
+ date: 2012-07-18 00:00:00 Z
19
+ dependencies:
20
+ - !ruby/object:Gem::Dependency
21
+ name: rspec
22
+ prerelease: false
23
+ requirement: &id001 !ruby/object:Gem::Requirement
24
+ none: false
25
+ requirements:
26
+ - - ">="
27
+ - !ruby/object:Gem::Version
28
+ hash: 35
29
+ segments:
30
+ - 2
31
+ - 11
32
+ - 0
33
+ version: 2.11.0
34
+ type: :development
35
+ version_requirements: *id001
36
+ - !ruby/object:Gem::Dependency
37
+ name: nokogiri
38
+ prerelease: false
39
+ requirement: &id002 !ruby/object:Gem::Requirement
40
+ none: false
41
+ requirements:
42
+ - - ">="
43
+ - !ruby/object:Gem::Version
44
+ hash: 9
45
+ segments:
46
+ - 1
47
+ - 5
48
+ - 5
49
+ version: 1.5.5
50
+ type: :runtime
51
+ version_requirements: *id002
21
52
  description: A simple tool for checking references on your website
22
53
  email: me@zsoltfabok.com
23
54
  executables: []
@@ -28,7 +59,6 @@ extra_rdoc_files: []
28
59
 
29
60
  files:
30
61
  - lib/site_checker.rb
31
- has_rdoc: true
32
62
  homepage: https://github.com/ZsoltFabok/site_checker
33
63
  licenses: []
34
64
 
@@ -38,25 +68,29 @@ rdoc_options: []
38
68
  require_paths:
39
69
  - lib
40
70
  required_ruby_version: !ruby/object:Gem::Requirement
71
+ none: false
41
72
  requirements:
42
73
  - - ">="
43
74
  - !ruby/object:Gem::Version
75
+ hash: 3
44
76
  segments:
45
77
  - 0
46
78
  version: "0"
47
79
  required_rubygems_version: !ruby/object:Gem::Requirement
80
+ none: false
48
81
  requirements:
49
82
  - - ">="
50
83
  - !ruby/object:Gem::Version
84
+ hash: 3
51
85
  segments:
52
86
  - 0
53
87
  version: "0"
54
88
  requirements: []
55
89
 
56
90
  rubyforge_project:
57
- rubygems_version: 1.3.6
91
+ rubygems_version: 1.8.15
58
92
  signing_key:
59
93
  specification_version: 3
60
- summary: site_checker-0.0.0
94
+ summary: site_checker-0.1.1
61
95
  test_files: []
62
96