site_checker 0.0.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2) hide show
  1. data/lib/site_checker.rb +224 -0
  2. metadata +45 -11
data/lib/site_checker.rb CHANGED
@@ -0,0 +1,224 @@
1
+ require 'nokogiri'
2
+ require 'open-uri'
3
+
4
# Crawls a site starting from one page and records broken references.
#
# A "site" is identified by +root+: when +root+ is an absolute URI the pages
# are fetched with open-uri, otherwise +root+ is treated as a directory on the
# local filesystem. Every <a href> and <img src> found on a local page is
# classified as local/remote page/image, local pages are followed
# recursively, and everything that cannot be opened is collected in
# #problems.
#
# Configuration is done through the block given to #new:
#
#   checker = SiteChecker.new do |config|
#     config.ignore_list = ["/rss"]
#     config.visit_references = true
#     config.max_recursion_depth = 3
#   end
#   checker.check("http://localhost:4000", "http://localhost:4000")
#   checker.problems
class SiteChecker
  # Hash mapping a page to the Array of problem messages recorded for it.
  # Empty until #check has been run.
  attr_accessor :problems

  # ignore_list::         references (exact href/src values) that are skipped.
  # visit_references::    when true, remote pages/images are opened as well.
  # max_recursion_depth:: how deep local pages are followed; -1 means no limit.
  attr_accessor :ignore_list, :visit_references, :max_recursion_depth

  # Yields the new instance for configuration, then fills in defaults for
  # everything the block left unset.
  def initialize
    # Initialised up front so the reader methods are usable before #check.
    @visits = new_visit_log
    @problems = {}
    @recursion_depth = 0

    yield self if block_given?

    @ignore_list ||= []
    @visit_references ||= false
    @max_recursion_depth ||= -1
  end

  # Checks the page at +url+ and everything reachable from it.
  #
  # url::  the first page to open (URL or filesystem path).
  # root:: the site root; decides whether a reference is local and whether
  #        pages are read over HTTP or from disk.
  #
  # Results from a previous run are discarded. Findings are available through
  # #problems, #local_pages, #remote_pages, #local_images and #remote_images.
  def check(url, root)
    @visits = new_visit_log
    @problems = {}
    @recursion_depth = 0
    @root = root

    register_visit(:local_page, url)
    process_local_page(url, nil)
  end

  # Local pages registered during the last #check (always an Array).
  def local_pages
    @visits[:local_page]
  end

  # Remote pages registered during the last #check (always an Array).
  def remote_pages
    @visits[:remote_page]
  end

  # Local images registered during the last #check (always an Array).
  def local_images
    @visits[:local_image]
  end

  # Remote images registered during the last #check (always an Array).
  def remote_images
    @visits[:remote_image]
  end

  private

  # Visit log: kind => Array of links. The default block creates the Array on
  # first access, so lookups never return nil.
  def new_visit_log
    Hash.new { |log, kind| log[kind] = [] }
  end

  # Collects the references on the page at +url+, reports anchors that do not
  # resolve, and visits every remaining reference that was not seen before.
  def process_local_page(url, parent_url)
    links = collect_links(url, parent_url)

    filter_out_working_anchors!(links)
    report_and_remove_anchors!(links, parent_url)

    links.each do |link, kind|
      next if kind == :anchor
      visit(kind, url, link) unless visited?(kind, link)
    end
  end

  def register_visit(kind, link)
    @visits[kind] << link
  end

  def visited?(kind, link)
    @visits[kind].include?(link)
  end

  # Registers +link+ and then either recurses into it (local pages, subject
  # to the recursion limit) or just tries to open it (everything else).
  def visit(kind, parent_url, link)
    register_visit(kind, link)

    if kind != :local_page
      open_reference(kind, link, parent_url)
    elsif !stop_recursion?
      @recursion_depth += 1
      process_local_page(link, parent_url)
      @recursion_depth -= 1
    end
  end

  # Tries to open +link+ and records a problem against +parent_url+ when that
  # fails. Never raises for StandardError failures.
  #
  # Returns the page content as a String for a :local_page that could be
  # read, nil in every other case.
  def open_reference(kind, link, parent_url)
    content = nil
    begin
      case kind
      when :local_page
        if remote_root?
          content = open_remote(link) { |io| io.read }
        else
          link = add_index_html(link)
          content = File.read(link)
        end
      when :local_image
        # Only reachability matters; the block forms close the handle.
        if remote_root?
          open_remote(link) { |_io| }
        else
          File.open(link) { |_file| }
        end
      else
        open_remote(link) { |_io| } if @visit_references
      end
    rescue Errno::ENOENT
      # Report the link as it appeared on the page, without the index.html
      # that add_index_html appended.
      link = remove_index_html(link) if kind == :local_page
      new_problem(strip_root(parent_url), "#{strip_root(link)} (404 Not Found)")
    rescue StandardError => e
      # Includes OpenURI::HTTPError, whose message is the HTTP status line.
      new_problem(strip_root(parent_url), "#{strip_root(link)} (#{e.message.strip})")
    end
    content
  end

  # True when the site root is an absolute URI (pages are fetched remotely).
  def remote_root?
    URI(@root).absolute?
  end

  # Opens +link+ through open-uri and yields the IO to the block.
  # URI.open is preferred because Kernel#open stopped handling URLs in
  # Ruby 3.0; older Rubies without URI.open fall back to Kernel#open.
  def open_remote(link, &block)
    if URI.respond_to?(:open)
      URI.open(link, &block)
    else
      open(link, &block)
    end
  end

  # Drops local page links whose fragment matches an anchor collected from
  # the same page.
  def filter_out_working_anchors!(links)
    links.delete_if { |link, kind| kind == :local_page && has_anchor?(links, link) }
  end

  # Reports every remaining local "page#fragment" link as not found and
  # removes it from +links+.
  #
  # NOTE(review): the problem is recorded against +parent_url+, although the
  # links were collected from the page being processed - confirm this is the
  # intended attribution.
  def report_and_remove_anchors!(links, parent_url)
    broken = links.select { |link, kind| link.match(/^.+#.+$/) && kind == :local_page }
    broken.each do |anchor, _kind|
      new_problem(strip_root(parent_url), "#{strip_root(anchor)} (404 Not Found)")
      links.delete(anchor)
    end
  end

  # True when the fragment part of +link+ is registered as an :anchor.
  def has_anchor?(links, link)
    anchor = link.gsub(/^.+#/, "")
    links[anchor] == :anchor
  end

  # Not called anywhere inside this class.
  def absolute_reference?(link)
    link.start_with?(@root)
  end

  # Not called anywhere inside this class.
  def relative_reference?(link)
    link =~ /^\/.+/
  end

  # Opens the page at +url+ and returns a Hash of link => kind for its
  # <img src> and <a href> references, plus id => :anchor for every <a>
  # element carrying an id. Returns an empty Hash when the page cannot be
  # opened (the failure is recorded by open_reference).
  #
  # NOTE(review): only <a id="..."> elements count as anchor targets here;
  # ids on other elements and <a name="..."> are not collected.
  def collect_links(url, parent_url)
    links = {}
    content = open_reference(:local_page, url, parent_url)
    return links unless content

    doc = Nokogiri(content)

    doc.xpath("//img").each do |img|
      next if ignored?(img['src'])
      links.merge!(detect_link_and_kind(img['src'], url, :remote_image, :local_image))
    end

    doc.xpath("//a").each do |a|
      next if ignored?(a['href'])
      links.merge!(detect_link_and_kind(a['href'], url, :remote_page, :local_page))
    end

    doc.xpath("//a[@id]").each { |a| links[a['id']] = :anchor }

    links
  end

  # Classifies one reference found on the page +url+.
  #
  # Returns a single-entry Hash { link => kind }, or an empty Hash when the
  # reference is rejected. Rejected references are recorded as problems:
  # references that spell out the site root ("absolute path") and references
  # that are not parseable as a URI ("invalid URI").
  def detect_link_and_kind(reference, url, external_kind, local_kind)
    begin
      absolute = URI(reference).absolute?
      link = URI(strip_trailing_slash(reference)).to_s
    rescue URI::InvalidURIError
      # One malformed href/src must not abort the whole crawl.
      new_problem(url, "#{reference} (invalid URI)")
      return {}
    end

    if link.start_with?(@root)
      new_problem(url, "#{link} (absolute path)")
      {}
    elsif absolute
      { link => external_kind }
    else
      { create_absolute_reference(link) => local_kind }
    end
  end

  def strip_trailing_slash(link)
    link.gsub(/\/$/, "")
  end

  # Removes the leading site root (and one following slash) from +link+.
  # Returns "" for nil. The root is escaped so that characters such as "."
  # or "(" in it are matched literally.
  def strip_root(link)
    return "" unless link

    link.sub(/\A#{Regexp.escape(@root.to_s)}\/?/, "")
  end

  # Filesystem pages: a path not ending in ".html" is treated as a directory
  # and gets "index.html" appended.
  def add_index_html(path)
    path.end_with?(".html") ? path : File.join(path, "index.html")
  end

  def remove_index_html(path)
    path.sub(/\/index\.html\z/, "")
  end

  # Resolves a relative +link+ against the site root: URI merge for a remote
  # root, path join for a filesystem root.
  def create_absolute_reference(link)
    root = URI(@root)
    if root.absolute?
      root.merge(link).to_s.gsub(/\/$/, "")
    else
      File.join(root.path, link)
    end
  end

  # Appends +message+ to the problems of +url+; an empty +url+ stands for the
  # site root.
  def new_problem(url, message)
    url = @root if url.empty?
    (@problems[url] ||= []) << message
  end

  # A missing reference (nil) is treated as ignored.
  def ignored?(link)
    link ? @ignore_list.include?(link) : true
  end

  # True once the configured depth is reached; -1 disables the limit.
  def stop_recursion?
    return false if @max_recursion_depth == -1

    @max_recursion_depth <= @recursion_depth
  end
end
metadata CHANGED
@@ -1,12 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: site_checker
3
3
  version: !ruby/object:Gem::Version
4
- prerelease: false
4
+ hash: 25
5
+ prerelease:
5
6
  segments:
6
7
  - 0
7
- - 0
8
- - 0
9
- version: 0.0.0
8
+ - 1
9
+ - 1
10
+ version: 0.1.1
10
11
  platform: ruby
11
12
  authors:
12
13
  - Zsolt Fabok
@@ -14,10 +15,40 @@ autorequire:
14
15
  bindir: bin
15
16
  cert_chain: []
16
17
 
17
- date: 2012-07-15 00:00:00 +03:00
18
- default_executable:
19
- dependencies: []
20
-
18
+ date: 2012-07-18 00:00:00 Z
19
+ dependencies:
20
+ - !ruby/object:Gem::Dependency
21
+ name: rspec
22
+ prerelease: false
23
+ requirement: &id001 !ruby/object:Gem::Requirement
24
+ none: false
25
+ requirements:
26
+ - - ">="
27
+ - !ruby/object:Gem::Version
28
+ hash: 35
29
+ segments:
30
+ - 2
31
+ - 11
32
+ - 0
33
+ version: 2.11.0
34
+ type: :development
35
+ version_requirements: *id001
36
+ - !ruby/object:Gem::Dependency
37
+ name: nokogiri
38
+ prerelease: false
39
+ requirement: &id002 !ruby/object:Gem::Requirement
40
+ none: false
41
+ requirements:
42
+ - - ">="
43
+ - !ruby/object:Gem::Version
44
+ hash: 9
45
+ segments:
46
+ - 1
47
+ - 5
48
+ - 5
49
+ version: 1.5.5
50
+ type: :runtime
51
+ version_requirements: *id002
21
52
  description: A simple tool for checking references on your website
22
53
  email: me@zsoltfabok.com
23
54
  executables: []
@@ -28,7 +59,6 @@ extra_rdoc_files: []
28
59
 
29
60
  files:
30
61
  - lib/site_checker.rb
31
- has_rdoc: true
32
62
  homepage: https://github.com/ZsoltFabok/site_checker
33
63
  licenses: []
34
64
 
@@ -38,25 +68,29 @@ rdoc_options: []
38
68
  require_paths:
39
69
  - lib
40
70
  required_ruby_version: !ruby/object:Gem::Requirement
71
+ none: false
41
72
  requirements:
42
73
  - - ">="
43
74
  - !ruby/object:Gem::Version
75
+ hash: 3
44
76
  segments:
45
77
  - 0
46
78
  version: "0"
47
79
  required_rubygems_version: !ruby/object:Gem::Requirement
80
+ none: false
48
81
  requirements:
49
82
  - - ">="
50
83
  - !ruby/object:Gem::Version
84
+ hash: 3
51
85
  segments:
52
86
  - 0
53
87
  version: "0"
54
88
  requirements: []
55
89
 
56
90
  rubyforge_project:
57
- rubygems_version: 1.3.6
91
+ rubygems_version: 1.8.15
58
92
  signing_key:
59
93
  specification_version: 3
60
- summary: site_checker-0.0.0
94
+ summary: site_checker-0.1.1
61
95
  test_files: []
62
96