grell 1.6.8 → 1.6.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +2 -0
- data/lib/grell/page.rb +48 -39
- data/lib/grell/version.rb +1 -1
- data/spec/lib/page_spec.rb +1 -0
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2a7136c81652b0260ee867e65380e69e5d3a2264
|
4
|
+
data.tar.gz: f95af6c4e4a99aa1216f8842c8829af521f7b1d9
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5b6c238c9894254531f5448dd5435603009b01c0c1fef015b748f3a91a662fa81fd0c7bbe882b95d88e8cbe91beee342222877f889931d5ed186cf0e2d03ef4e
|
7
|
+
data.tar.gz: 76a390dc30e53275ded279b27a8bd6ebabe4d9cefb0cc7367563467cd86fd37e6bb20440810d237137f400d72af6dd289602603ff6f70dce8f3bd6906f6d24e2
|
data/CHANGELOG.md
CHANGED
data/lib/grell/page.rb
CHANGED
@@ -43,7 +43,7 @@ module Grell
|
|
43
43
|
|
44
44
|
# Number of times we have retried the current page
|
45
45
|
def retries
|
46
|
-
[@times_visited -1, 0].max
|
46
|
+
[@times_visited - 1, 0].max
|
47
47
|
end
|
48
48
|
|
49
49
|
# The current URL, this may be different from the URL we asked for if there was some redirect
|
@@ -205,17 +205,9 @@ module Grell
|
|
205
205
|
|
206
206
|
private
|
207
207
|
def all_links
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
# Do not follow disabled links
|
212
|
-
enabled_links = anchors_in_body.reject { |anchor| anchor.disabled? }
|
213
|
-
|
214
|
-
unique_links = enabled_links.map do |anchor|
|
215
|
-
anchor['href'] || anchor['data-href']
|
216
|
-
end.compact
|
217
|
-
|
218
|
-
unique_links.map{|link| link_to_url(link)}.uniq.compact
|
208
|
+
links = @rawpage.all_anchors.map { |anchor| Link.new(anchor) }
|
209
|
+
body_enabled_links = links.reject { |link| link.inside_header? || link.disabled? }
|
210
|
+
body_enabled_links.map { |link| link.to_url(host) }.uniq.compact
|
219
211
|
|
220
212
|
rescue Capybara::Poltergeist::ObsoleteNode
|
221
213
|
Grell.logger.warn "We found an obsolete node in #{@url}. Ignoring all links"
|
@@ -224,37 +216,54 @@ module Grell
|
|
224
216
|
[]
|
225
217
|
end
|
226
218
|
|
227
|
-
#
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
if uri.absolute?
|
232
|
-
if uri.host != URI.parse(host).host
|
233
|
-
Grell.logger.debug "GRELL does not follow links to external hosts: #{link}"
|
234
|
-
nil
|
235
|
-
else
|
236
|
-
link # Absolute link to our own host
|
237
|
-
end
|
238
|
-
else
|
239
|
-
if uri.path.nil?
|
240
|
-
Grell.logger.debug "GRELL does not follow links without a path: #{uri}"
|
241
|
-
nil
|
242
|
-
end
|
243
|
-
if uri.path.start_with?('/')
|
244
|
-
host + link #convert to full URL
|
245
|
-
else #links like href="google.com" the browser would go to http://google.com like "http://#{link}"
|
246
|
-
Grell.logger.debug "GRELL Bad formatted link: #{link}, assuming external"
|
247
|
-
nil
|
248
|
-
end
|
219
|
+
# Private class to group all the methods related to links.
|
220
|
+
class Link
|
221
|
+
def initialize(anchor)
|
222
|
+
@anchor = anchor
|
249
223
|
end
|
250
224
|
|
251
|
-
|
252
|
-
|
253
|
-
|
254
|
-
|
225
|
+
# <link> can only be used in the <head> as of: https://developer.mozilla.org/en/docs/Web/HTML/Element/link
|
226
|
+
def inside_header?
|
227
|
+
@anchor.tag_name == 'link'
|
228
|
+
end
|
255
229
|
|
230
|
+
# Is the link disabled by either Javascript or CSS?
|
231
|
+
def disabled?
|
232
|
+
@anchor.disabled? || !!@anchor.native.attributes['disabled']
|
233
|
+
end
|
256
234
|
|
235
|
+
# Some links may use data-href + javascript to do interesting things
|
236
|
+
def href
|
237
|
+
@anchor['href'] || @anchor['data-href']
|
238
|
+
end
|
257
239
|
|
258
|
-
|
240
|
+
# We only accept links in this same host that start with a path
|
241
|
+
def to_url(host)
|
242
|
+
uri = URI.parse(href)
|
243
|
+
if uri.absolute?
|
244
|
+
if uri.host != URI.parse(host).host
|
245
|
+
Grell.logger.debug "GRELL does not follow links to external hosts: #{href}"
|
246
|
+
nil
|
247
|
+
else
|
248
|
+
href # Absolute link to our own host
|
249
|
+
end
|
250
|
+
else
|
251
|
+
if uri.path.nil?
|
252
|
+
Grell.logger.debug "GRELL does not follow links without a path: #{uri}"
|
253
|
+
nil
|
254
|
+
end
|
255
|
+
if uri.path.start_with?('/')
|
256
|
+
host + href # convert to full URL
|
257
|
+
else # links like href="google.com" the browser would go to http://google.com like "http://#{link}"
|
258
|
+
Grell.logger.debug "GRELL Bad formatted link: #{href}, assuming external"
|
259
|
+
nil
|
260
|
+
end
|
261
|
+
end
|
262
|
+
rescue URI::InvalidURIError # Invalid links propagating till we navigate to them
|
263
|
+
href
|
264
|
+
end
|
265
|
+
end
|
259
266
|
|
267
|
+
end
|
268
|
+
end
|
260
269
|
end
|
data/lib/grell/version.rb
CHANGED
data/spec/lib/page_spec.rb
CHANGED
@@ -253,6 +253,7 @@ RSpec.describe Grell::Page do
|
|
253
253
|
<a href=\"/trusmis.html\">trusmis</a>
|
254
254
|
<a href=\"/help.html\">help</a>
|
255
255
|
<a href=\"javascript: void(0)\">help</a>
|
256
|
+
<a href=\"/helpdisabled.html\" disabled=\"\">helpdisabled</a>
|
256
257
|
</body></html>"
|
257
258
|
end
|
258
259
|
let(:links) { ['http://www.example.com/trusmis.html', 'http://www.example.com/help.html'] }
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: grell
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.6.8
|
4
|
+
version: 1.6.9
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jordi Polo Carres
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-07-
|
11
|
+
date: 2016-07-13 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: capybara
|