pageflow-chart 0.2.1 → 0.2.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -0
- data/app/jobs/pageflow/chart/scrape_site_job.rb +3 -2
- data/lib/pageflow/chart/refresh_tag_following_downloader.rb +62 -0
- data/lib/pageflow/chart/version.rb +1 -1
- data/spec/jobs/pageflow/chart/scrape_site_job_spec.rb +3 -3
- data/spec/pageflow/chart/refresh_tag_following_downloader_spec.rb +178 -0
- metadata +4 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 1d165bb8184706fe6b0de27586f90d10ea710380
|
4
|
+
data.tar.gz: 5c330166481ba0d5ebf61256dc0d0ca81c47e97c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 031a5819cda5782c92e801f1cc229ccf73be853df6c7cb0a5843f9abe6cdc1bcc36b7a4da10d8464ad6251a43228cb661dab057eb0f1f9eff568a51cab3755d3
|
7
|
+
data.tar.gz: 5aadd7313e897e65b0d834dc3e47f130724141612c3bf22af7862388e336337e9532f6e416b0b260fd9c7b3acfb3a2c50e948e296b41e4b9373dddfd705c18d5
|
data/CHANGELOG.md
CHANGED
@@ -1,5 +1,14 @@
|
|
1
1
|
# CHANGELOG
|
2
2
|
|
3
|
+
### Version 0.2.2
|
4
|
+
|
5
|
+
2017-07-12
|
6
|
+
|
7
|
+
[Compare changes](https://github.com/codevise/pageflow-chart/compare/v0.2.1...v0.2.2)
|
8
|
+
|
9
|
+
- Follow redirects from refresh meta tags
|
10
|
+
([#37](https://github.com/codevise/pageflow-chart/pull/37))
|
11
|
+
|
3
12
|
### Version 0.2.1
|
4
13
|
|
5
14
|
2017-07-12
|
data/app/jobs/pageflow/chart/scrape_site_job.rb
CHANGED
@@ -11,7 +11,7 @@ module Pageflow
|
|
11
11
|
end
|
12
12
|
|
13
13
|
def perform(scraped_site)
|
14
|
-
downloader.load(scraped_site.url) do |file|
|
14
|
+
downloader.load_following_refresh_tags(scraped_site.url) do |file|
|
15
15
|
scraper = Scraper.new(file.read, Chart.config.scraper_options)
|
16
16
|
scraped_site.html_file = StringIOWithContentType.new(
|
17
17
|
scraper.html,
|
@@ -42,7 +42,8 @@ module Pageflow
|
|
42
42
|
|
43
43
|
def self.perform_with_result(scraped_site, options = {})
|
44
44
|
# This is where the downloader passed to `initialize` is created.
|
45
|
-
new(Downloader.new(base_url: scraped_site.url))
|
45
|
+
new(RefreshTagFollowingDownloader.new(Downloader.new(base_url: scraped_site.url)))
|
46
|
+
.perform(scraped_site)
|
46
47
|
end
|
47
48
|
|
48
49
|
def begin_try_catch
|
data/lib/pageflow/chart/refresh_tag_following_downloader.rb
ADDED
@@ -0,0 +1,62 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
require 'uri'
|
3
|
+
|
4
|
+
module Pageflow
|
5
|
+
module Chart
|
6
|
+
class RefreshTagFollowingDownloader < SimpleDelegator
|
7
|
+
MAX_REDIRECT_COUNT = 3
|
8
|
+
|
9
|
+
class TooManyRedirects < StandardError; end
|
10
|
+
class NoUrlInRefreshMetaTag < StandardError; end
|
11
|
+
|
12
|
+
def load_following_refresh_tags(url, redirect_count = 0, &block)
|
13
|
+
load(url) do |file|
|
14
|
+
if (redirect_url = find_refresh_meta_tag_url(file.read))
|
15
|
+
if redirect_count >= MAX_REDIRECT_COUNT
|
16
|
+
raise TooManyRedirects, 'Too many redirects via refresh meta tags.'
|
17
|
+
end
|
18
|
+
|
19
|
+
redirect_url = ensure_absolute(redirect_url, url)
|
20
|
+
return load_following_refresh_tags(redirect_url, redirect_count + 1, &block)
|
21
|
+
end
|
22
|
+
|
23
|
+
file.rewind
|
24
|
+
yield file if block_given?
|
25
|
+
end
|
26
|
+
end
|
27
|
+
|
28
|
+
private
|
29
|
+
|
30
|
+
def find_refresh_meta_tag_url(html)
|
31
|
+
tag = find_refresh_meta_tag(html)
|
32
|
+
|
33
|
+
extract_redirect_url(tag) if tag
|
34
|
+
end
|
35
|
+
|
36
|
+
def find_refresh_meta_tag(html)
|
37
|
+
document = Nokogiri::HTML(html)
|
38
|
+
document.at_css('head meta[http-equiv="REFRESH"]')
|
39
|
+
end
|
40
|
+
|
41
|
+
def extract_redirect_url(tag)
|
42
|
+
if tag[:content] && tag[:content] =~ /url=/
|
43
|
+
tag[:content].split('url=').last
|
44
|
+
else
|
45
|
+
raise NoUrlInRefreshMetaTag, "Could not extract url from #{tag}."
|
46
|
+
end
|
47
|
+
end
|
48
|
+
|
49
|
+
def ensure_absolute(url, context_url)
|
50
|
+
uri = URI(url)
|
51
|
+
context_uri = URI(context_url)
|
52
|
+
|
53
|
+
[
|
54
|
+
uri.scheme || context_uri.scheme,
|
55
|
+
'://',
|
56
|
+
uri.host || context_uri.host,
|
57
|
+
uri.path
|
58
|
+
].join('')
|
59
|
+
end
|
60
|
+
end
|
61
|
+
end
|
62
|
+
end
|
data/spec/jobs/pageflow/chart/scrape_site_job_spec.rb
CHANGED
@@ -5,14 +5,14 @@ module Pageflow
|
|
5
5
|
describe ScrapeSiteJob do
|
6
6
|
describe '#perform' do
|
7
7
|
it 'scrapes html' do
|
8
|
-
scraper = double(
|
9
|
-
downloader = double(
|
8
|
+
scraper = double('Scraper', html: '<html>rewritten</html>')
|
9
|
+
downloader = double('Downloader', load: '<html>original</html>')
|
10
10
|
job = ScrapeSiteJob.new(downloader)
|
11
11
|
scraped_site = create(:scraped_site, url: 'http://example.com')
|
12
12
|
|
13
13
|
allow(Scraper).to receive(:new).and_return(scraper)
|
14
14
|
|
15
|
-
expect(downloader).to receive(:load).with('http://example.com')
|
15
|
+
expect(downloader).to receive(:load_following_refresh_tags).with('http://example.com')
|
16
16
|
|
17
17
|
job.perform(scraped_site)
|
18
18
|
end
|
data/spec/pageflow/chart/refresh_tag_following_downloader_spec.rb
ADDED
@@ -0,0 +1,178 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
module Pageflow
|
4
|
+
module Chart
|
5
|
+
describe RefreshTagFollowingDownloader do
|
6
|
+
describe '#load_following_refresh_tags' do
|
7
|
+
it 'delegates to downloader if no refresh meta tag is found' do
|
8
|
+
downloader = double(Downloader)
|
9
|
+
refresh_tag_following_downloader = RefreshTagFollowingDownloader.new(downloader)
|
10
|
+
|
11
|
+
original_url = 'http://datawrapper.dwcdn.net/HPKfl/2/'
|
12
|
+
|
13
|
+
chart_html = <<-HTML
|
14
|
+
<html><head><title>A chart</title></head></html>
|
15
|
+
HTML
|
16
|
+
|
17
|
+
result = ''
|
18
|
+
|
19
|
+
allow(downloader).to receive(:load)
|
20
|
+
.with(original_url)
|
21
|
+
.and_yield(StringIO.new(chart_html))
|
22
|
+
|
23
|
+
refresh_tag_following_downloader.load_following_refresh_tags(original_url) do |file|
|
24
|
+
result = file.read
|
25
|
+
end
|
26
|
+
|
27
|
+
expect(result).to eq(chart_html)
|
28
|
+
end
|
29
|
+
|
30
|
+
it 'looks for refresh meta tags and loads their url instead' do
|
31
|
+
downloader = double(Downloader)
|
32
|
+
refresh_tag_following_downloader = RefreshTagFollowingDownloader.new(downloader)
|
33
|
+
|
34
|
+
original_url = 'http://datawrapper.dwcdn.net/HPKfl/2/'
|
35
|
+
target_url = 'http://other.dwcdn.net/HPKfl/5/'
|
36
|
+
|
37
|
+
redirect_html = <<-HTML
|
38
|
+
<html><head><meta http-equiv="REFRESH" content="0; url=http://other.dwcdn.net/HPKfl/5/"></head></html>
|
39
|
+
HTML
|
40
|
+
chart_html = <<-HTML
|
41
|
+
<html><head><title>A chart</title></head></html>
|
42
|
+
HTML
|
43
|
+
|
44
|
+
result = ''
|
45
|
+
|
46
|
+
allow(downloader).to receive(:load)
|
47
|
+
.with(original_url)
|
48
|
+
.and_yield(StringIO.new(redirect_html))
|
49
|
+
|
50
|
+
allow(downloader).to receive(:load)
|
51
|
+
.with(target_url)
|
52
|
+
.and_yield(StringIO.new(chart_html))
|
53
|
+
|
54
|
+
refresh_tag_following_downloader.load_following_refresh_tags(original_url) do |file|
|
55
|
+
result = file.read
|
56
|
+
end
|
57
|
+
|
58
|
+
expect(result).to eq(chart_html)
|
59
|
+
end
|
60
|
+
|
61
|
+
it 'supports schema relative urls' do
|
62
|
+
downloader = double(Downloader)
|
63
|
+
refresh_tag_following_downloader = RefreshTagFollowingDownloader.new(downloader)
|
64
|
+
|
65
|
+
original_url = 'http://datawrapper.dwcdn.net/HPKfl/2/'
|
66
|
+
target_url = 'http://other.dwcdn.net/HPKfl/5/'
|
67
|
+
|
68
|
+
redirect_html = <<-HTML
|
69
|
+
<html><head><meta http-equiv="REFRESH" content="0; url=//other.dwcdn.net/HPKfl/5/"></head></html>
|
70
|
+
HTML
|
71
|
+
chart_html = <<-HTML
|
72
|
+
<html><head><title>A chart</title></head></html>
|
73
|
+
HTML
|
74
|
+
|
75
|
+
result = ''
|
76
|
+
|
77
|
+
allow(downloader).to receive(:load)
|
78
|
+
.with(original_url)
|
79
|
+
.and_yield(StringIO.new(redirect_html))
|
80
|
+
|
81
|
+
allow(downloader).to receive(:load)
|
82
|
+
.with(target_url)
|
83
|
+
.and_yield(StringIO.new(chart_html))
|
84
|
+
|
85
|
+
refresh_tag_following_downloader.load_following_refresh_tags(original_url) do |file|
|
86
|
+
result = file.read
|
87
|
+
end
|
88
|
+
|
89
|
+
expect(result).to eq(chart_html)
|
90
|
+
end
|
91
|
+
|
92
|
+
it 'supports relative urls' do
|
93
|
+
downloader = double(Downloader)
|
94
|
+
refresh_tag_following_downloader = RefreshTagFollowingDownloader.new(downloader)
|
95
|
+
|
96
|
+
original_url = 'http://datawrapper.dwcdn.net/HPKfl/2/'
|
97
|
+
target_url = 'http://datawrapper.dwcdn.net/HPKfl/5/'
|
98
|
+
|
99
|
+
redirect_html = <<-HTML
|
100
|
+
<html><head><meta http-equiv="REFRESH" content="0; url=/HPKfl/5/"></head></html>
|
101
|
+
HTML
|
102
|
+
chart_html = <<-HTML
|
103
|
+
<html><head><title>A chart</title></head></html>
|
104
|
+
HTML
|
105
|
+
|
106
|
+
result = ''
|
107
|
+
|
108
|
+
allow(downloader).to receive(:load)
|
109
|
+
.with(original_url)
|
110
|
+
.and_yield(StringIO.new(redirect_html))
|
111
|
+
|
112
|
+
allow(downloader).to receive(:load)
|
113
|
+
.with(target_url)
|
114
|
+
.and_yield(StringIO.new(chart_html))
|
115
|
+
|
116
|
+
refresh_tag_following_downloader.load_following_refresh_tags(original_url) do |file|
|
117
|
+
result = file.read
|
118
|
+
end
|
119
|
+
|
120
|
+
expect(result).to eq(chart_html)
|
121
|
+
end
|
122
|
+
|
123
|
+
it 'fails on too many redirects' do
|
124
|
+
downloader = double(Downloader)
|
125
|
+
refresh_tag_following_downloader = RefreshTagFollowingDownloader.new(downloader)
|
126
|
+
|
127
|
+
original_url = 'http://datawrapper.dwcdn.net/HPKfl/2/'
|
128
|
+
|
129
|
+
redirect_html = <<-HTML
|
130
|
+
<html><head><meta http-equiv="REFRESH" content="0; url=#{original_url}"></head></html>
|
131
|
+
HTML
|
132
|
+
|
133
|
+
allow(downloader).to receive(:load).with(original_url) do |&block|
|
134
|
+
block.call(StringIO.new(redirect_html))
|
135
|
+
end
|
136
|
+
|
137
|
+
expect {
|
138
|
+
refresh_tag_following_downloader.load_following_refresh_tags(original_url)
|
139
|
+
}.to raise_error(RefreshTagFollowingDownloader::TooManyRedirects)
|
140
|
+
end
|
141
|
+
|
142
|
+
it 'fails on invalid refresh meta tag' do
|
143
|
+
downloader = double(Downloader)
|
144
|
+
refresh_tag_following_downloader = RefreshTagFollowingDownloader.new(downloader)
|
145
|
+
|
146
|
+
original_url = 'http://datawrapper.dwcdn.net/HPKfl/2/'
|
147
|
+
|
148
|
+
redirect_html = <<-HTML
|
149
|
+
<html><head><meta http-equiv="REFRESH" content="something strange"></head></html>
|
150
|
+
HTML
|
151
|
+
|
152
|
+
allow(downloader).to receive(:load).with(original_url).and_yield(StringIO.new(redirect_html))
|
153
|
+
|
154
|
+
expect {
|
155
|
+
refresh_tag_following_downloader.load_following_refresh_tags(original_url)
|
156
|
+
}.to raise_error(RefreshTagFollowingDownloader::NoUrlInRefreshMetaTag)
|
157
|
+
end
|
158
|
+
|
159
|
+
it 'fails on refresh meta tag without content attribute' do
|
160
|
+
downloader = double(Downloader)
|
161
|
+
refresh_tag_following_downloader = RefreshTagFollowingDownloader.new(downloader)
|
162
|
+
|
163
|
+
original_url = 'http://datawrapper.dwcdn.net/HPKfl/2/'
|
164
|
+
|
165
|
+
redirect_html = <<-HTML
|
166
|
+
<html><head><meta http-equiv="REFRESH"></head></html>
|
167
|
+
HTML
|
168
|
+
|
169
|
+
allow(downloader).to receive(:load).with(original_url).and_yield(StringIO.new(redirect_html))
|
170
|
+
|
171
|
+
expect {
|
172
|
+
refresh_tag_following_downloader.load_following_refresh_tags(original_url)
|
173
|
+
}.to raise_error(RefreshTagFollowingDownloader::NoUrlInRefreshMetaTag)
|
174
|
+
end
|
175
|
+
end
|
176
|
+
end
|
177
|
+
end
|
178
|
+
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pageflow-chart
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.1
|
4
|
+
version: 0.2.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Tim Fischbach
|
@@ -269,6 +269,7 @@ files:
|
|
269
269
|
- lib/pageflow/chart/downloader.rb
|
270
270
|
- lib/pageflow/chart/engine.rb
|
271
271
|
- lib/pageflow/chart/page_type.rb
|
272
|
+
- lib/pageflow/chart/refresh_tag_following_downloader.rb
|
272
273
|
- lib/pageflow/chart/scraper.rb
|
273
274
|
- lib/pageflow/chart/version.rb
|
274
275
|
- spec/controllers/pageflow/chart/scraped_sites_controller_spec.rb
|
@@ -316,6 +317,7 @@ files:
|
|
316
317
|
- spec/models/pageflow/chart/scraped_site_spec.rb
|
317
318
|
- spec/pageflow/chart/configuration_spec.rb
|
318
319
|
- spec/pageflow/chart/downloader_spec.rb
|
320
|
+
- spec/pageflow/chart/refresh_tag_following_downloader_spec.rb
|
319
321
|
- spec/pageflow/chart/scraper_spec.rb
|
320
322
|
- spec/requests/scraping_site_spec.rb
|
321
323
|
- spec/spec_helper.rb
|
@@ -394,6 +396,7 @@ test_files:
|
|
394
396
|
- spec/models/pageflow/chart/scraped_site_spec.rb
|
395
397
|
- spec/pageflow/chart/configuration_spec.rb
|
396
398
|
- spec/pageflow/chart/downloader_spec.rb
|
399
|
+
- spec/pageflow/chart/refresh_tag_following_downloader_spec.rb
|
397
400
|
- spec/pageflow/chart/scraper_spec.rb
|
398
401
|
- spec/requests/scraping_site_spec.rb
|
399
402
|
- spec/spec_helper.rb
|