grell 1.6.8 → 1.6.9
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +2 -0
- data/lib/grell/page.rb +48 -39
- data/lib/grell/version.rb +1 -1
- data/spec/lib/page_spec.rb +1 -0
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2a7136c81652b0260ee867e65380e69e5d3a2264
|
4
|
+
data.tar.gz: f95af6c4e4a99aa1216f8842c8829af521f7b1d9
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5b6c238c9894254531f5448dd5435603009b01c0c1fef015b748f3a91a662fa81fd0c7bbe882b95d88e8cbe91beee342222877f889931d5ed186cf0e2d03ef4e
|
7
|
+
data.tar.gz: 76a390dc30e53275ded279b27a8bd6ebabe4d9cefb0cc7367563467cd86fd37e6bb20440810d237137f400d72af6dd289602603ff6f70dce8f3bd6906f6d24e2
|
data/CHANGELOG.md
CHANGED
data/lib/grell/page.rb
CHANGED
@@ -43,7 +43,7 @@ module Grell
|
|
43
43
|
|
44
44
|
# Number of times we have retried the current page
|
45
45
|
def retries
|
46
|
-
[@times_visited -1, 0].max
|
46
|
+
[@times_visited - 1, 0].max
|
47
47
|
end
|
48
48
|
|
49
49
|
# The current URL, this may be different from the URL we asked for if there was some redirect
|
@@ -205,17 +205,9 @@ module Grell
|
|
205
205
|
|
206
206
|
private
|
207
207
|
def all_links
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
# Do not follow disabled links
|
212
|
-
enabled_links = anchors_in_body.reject { |anchor| anchor.disabled? }
|
213
|
-
|
214
|
-
unique_links = enabled_links.map do |anchor|
|
215
|
-
anchor['href'] || anchor['data-href']
|
216
|
-
end.compact
|
217
|
-
|
218
|
-
unique_links.map{|link| link_to_url(link)}.uniq.compact
|
208
|
+
links = @rawpage.all_anchors.map { |anchor| Link.new(anchor) }
|
209
|
+
body_enabled_links = links.reject { |link| link.inside_header? || link.disabled? }
|
210
|
+
body_enabled_links.map { |link| link.to_url(host) }.uniq.compact
|
219
211
|
|
220
212
|
rescue Capybara::Poltergeist::ObsoleteNode
|
221
213
|
Grell.logger.warn "We found an obsolete node in #{@url}. Ignoring all links"
|
@@ -224,37 +216,54 @@ module Grell
|
|
224
216
|
[]
|
225
217
|
end
|
226
218
|
|
227
|
-
#
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
if uri.absolute?
|
232
|
-
if uri.host != URI.parse(host).host
|
233
|
-
Grell.logger.debug "GRELL does not follow links to external hosts: #{link}"
|
234
|
-
nil
|
235
|
-
else
|
236
|
-
link # Absolute link to our own host
|
237
|
-
end
|
238
|
-
else
|
239
|
-
if uri.path.nil?
|
240
|
-
Grell.logger.debug "GRELL does not follow links without a path: #{uri}"
|
241
|
-
nil
|
242
|
-
end
|
243
|
-
if uri.path.start_with?('/')
|
244
|
-
host + link #convert to full URL
|
245
|
-
else #links like href="google.com" the browser would go to http://google.com like "http://#{link}"
|
246
|
-
Grell.logger.debug "GRELL Bad formatted link: #{link}, assuming external"
|
247
|
-
nil
|
248
|
-
end
|
219
|
+
# Private class to group all the methods related to links.
|
220
|
+
class Link
|
221
|
+
def initialize(anchor)
|
222
|
+
@anchor = anchor
|
249
223
|
end
|
250
224
|
|
251
|
-
|
252
|
-
|
253
|
-
|
254
|
-
|
225
|
+
# <link> can only be used in the <head> as of: https://developer.mozilla.org/en/docs/Web/HTML/Element/link
|
226
|
+
def inside_header?
|
227
|
+
@anchor.tag_name == 'link'
|
228
|
+
end
|
255
229
|
|
230
|
+
# Is the link disabled by either Javascript or CSS?
|
231
|
+
def disabled?
|
232
|
+
@anchor.disabled? || !!@anchor.native.attributes['disabled']
|
233
|
+
end
|
256
234
|
|
235
|
+
# Some links may use data-href + javascript to do interesting things
|
236
|
+
def href
|
237
|
+
@anchor['href'] || @anchor['data-href']
|
238
|
+
end
|
257
239
|
|
258
|
-
|
240
|
+
# We only accept links in this same host that start with a path
|
241
|
+
def to_url(host)
|
242
|
+
uri = URI.parse(href)
|
243
|
+
if uri.absolute?
|
244
|
+
if uri.host != URI.parse(host).host
|
245
|
+
Grell.logger.debug "GRELL does not follow links to external hosts: #{href}"
|
246
|
+
nil
|
247
|
+
else
|
248
|
+
href # Absolute link to our own host
|
249
|
+
end
|
250
|
+
else
|
251
|
+
if uri.path.nil?
|
252
|
+
Grell.logger.debug "GRELL does not follow links without a path: #{uri}"
|
253
|
+
nil
|
254
|
+
end
|
255
|
+
if uri.path.start_with?('/')
|
256
|
+
host + href # convert to full URL
|
257
|
+
else # links like href="google.com" the browser would go to http://google.com like "http://#{link}"
|
258
|
+
Grell.logger.debug "GRELL Bad formatted link: #{href}, assuming external"
|
259
|
+
nil
|
260
|
+
end
|
261
|
+
end
|
262
|
+
rescue URI::InvalidURIError # Invalid links propagating till we navigate to them
|
263
|
+
href
|
264
|
+
end
|
265
|
+
end
|
259
266
|
|
267
|
+
end
|
268
|
+
end
|
260
269
|
end
|
data/lib/grell/version.rb
CHANGED
data/spec/lib/page_spec.rb
CHANGED
@@ -253,6 +253,7 @@ RSpec.describe Grell::Page do
|
|
253
253
|
<a href=\"/trusmis.html\">trusmis</a>
|
254
254
|
<a href=\"/help.html\">help</a>
|
255
255
|
<a href=\"javascript: void(0)\">help</a>
|
256
|
+
<a href=\"/helpdisabled.html\" disabled=\"\">helpdisabled</a>
|
256
257
|
</body></html>"
|
257
258
|
end
|
258
259
|
let(:links) { ['http://www.example.com/trusmis.html', 'http://www.example.com/help.html'] }
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: grell
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.6.8
|
4
|
+
version: 1.6.9
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jordi Polo Carres
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-07-
|
11
|
+
date: 2016-07-13 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: capybara
|