pageflow-chart 0.2.1 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -0
- data/app/jobs/pageflow/chart/scrape_site_job.rb +3 -2
- data/lib/pageflow/chart/refresh_tag_following_downloader.rb +62 -0
- data/lib/pageflow/chart/version.rb +1 -1
- data/spec/jobs/pageflow/chart/scrape_site_job_spec.rb +3 -3
- data/spec/pageflow/chart/refresh_tag_following_downloader_spec.rb +178 -0
- metadata +4 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 1d165bb8184706fe6b0de27586f90d10ea710380
|
4
|
+
data.tar.gz: 5c330166481ba0d5ebf61256dc0d0ca81c47e97c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 031a5819cda5782c92e801f1cc229ccf73be853df6c7cb0a5843f9abe6cdc1bcc36b7a4da10d8464ad6251a43228cb661dab057eb0f1f9eff568a51cab3755d3
|
7
|
+
data.tar.gz: 5aadd7313e897e65b0d834dc3e47f130724141612c3bf22af7862388e336337e9532f6e416b0b260fd9c7b3acfb3a2c50e948e296b41e4b9373dddfd705c18d5
|
data/CHANGELOG.md
CHANGED
@@ -1,5 +1,14 @@
|
|
1
1
|
# CHANGELOG
|
2
2
|
|
3
|
+
### Version 0.2.2
|
4
|
+
|
5
|
+
2017-07-12
|
6
|
+
|
7
|
+
[Compare changes](https://github.com/codevise/pageflow-chart/compare/v0.2.1...v0.2.2)
|
8
|
+
|
9
|
+
- Follow redirects from refresh meta tags
|
10
|
+
([#37](https://github.com/codevise/pageflow-chart/pull/37))
|
11
|
+
|
3
12
|
### Version 0.2.1
|
4
13
|
|
5
14
|
2017-07-12
|
@@ -11,7 +11,7 @@ module Pageflow
|
|
11
11
|
end
|
12
12
|
|
13
13
|
def perform(scraped_site)
|
14
|
-
downloader.load(scraped_site.url) do |file|
|
14
|
+
downloader.load_following_refresh_tags(scraped_site.url) do |file|
|
15
15
|
scraper = Scraper.new(file.read, Chart.config.scraper_options)
|
16
16
|
scraped_site.html_file = StringIOWithContentType.new(
|
17
17
|
scraper.html,
|
@@ -42,7 +42,8 @@ module Pageflow
|
|
42
42
|
|
43
43
|
def self.perform_with_result(scraped_site, options = {})
|
44
44
|
# This is were the downloader passed to `initialize` is created.
|
45
|
-
new(Downloader.new(base_url: scraped_site.url))
|
45
|
+
new(RefreshTagFollowingDownloader.new(Downloader.new(base_url: scraped_site.url)))
|
46
|
+
.perform(scraped_site)
|
46
47
|
end
|
47
48
|
|
48
49
|
def begin_try_catch
|
@@ -0,0 +1,62 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
require 'uri'
|
3
|
+
|
4
|
+
module Pageflow
|
5
|
+
module Chart
|
6
|
+
class RefreshTagFollowingDownloader < SimpleDelegator
|
7
|
+
MAX_REDIRECT_COUNT = 3
|
8
|
+
|
9
|
+
class TooManyRedirects < StandardError; end
|
10
|
+
class NoUrlInRefreshMetaTag < StandardError; end
|
11
|
+
|
12
|
+
def load_following_refresh_tags(url, redirect_count = 0, &block)
|
13
|
+
load(url) do |file|
|
14
|
+
if (redirect_url = find_refresh_meta_tag_url(file.read))
|
15
|
+
if redirect_count >= MAX_REDIRECT_COUNT
|
16
|
+
raise TooManyRedirects, 'Too many redirects via refresh meta tags.'
|
17
|
+
end
|
18
|
+
|
19
|
+
redirect_url = ensure_absolute(redirect_url, url)
|
20
|
+
return load_following_refresh_tags(redirect_url, redirect_count + 1, &block)
|
21
|
+
end
|
22
|
+
|
23
|
+
file.rewind
|
24
|
+
yield file if block_given?
|
25
|
+
end
|
26
|
+
end
|
27
|
+
|
28
|
+
private
|
29
|
+
|
30
|
+
def find_refresh_meta_tag_url(html)
|
31
|
+
tag = find_refresh_meta_tag(html)
|
32
|
+
|
33
|
+
extract_redirect_url(tag) if tag
|
34
|
+
end
|
35
|
+
|
36
|
+
def find_refresh_meta_tag(html)
|
37
|
+
document = Nokogiri::HTML(html)
|
38
|
+
document.at_css('head meta[http-equiv="REFRESH"]')
|
39
|
+
end
|
40
|
+
|
41
|
+
def extract_redirect_url(tag)
|
42
|
+
if tag[:content] && tag[:content] =~ /url=/
|
43
|
+
tag[:content].split('url=').last
|
44
|
+
else
|
45
|
+
raise NoUrlInRefreshMetaTag, "Could not extract url from #{tag}."
|
46
|
+
end
|
47
|
+
end
|
48
|
+
|
49
|
+
def ensure_absolute(url, context_url)
|
50
|
+
uri = URI(url)
|
51
|
+
context_uri = URI(context_url)
|
52
|
+
|
53
|
+
[
|
54
|
+
uri.scheme || context_uri.scheme,
|
55
|
+
'://',
|
56
|
+
uri.host || context_uri.host,
|
57
|
+
uri.path
|
58
|
+
].join('')
|
59
|
+
end
|
60
|
+
end
|
61
|
+
end
|
62
|
+
end
|
@@ -5,14 +5,14 @@ module Pageflow
|
|
5
5
|
describe ScrapeSiteJob do
|
6
6
|
describe '#perform' do
|
7
7
|
it 'scrapes html' do
|
8
|
-
scraper = double(
|
9
|
-
downloader = double(
|
8
|
+
scraper = double('Scraper', html: '<html>rewritten</html>')
|
9
|
+
downloader = double('Downloader', load: '<html>original</html>')
|
10
10
|
job = ScrapeSiteJob.new(downloader)
|
11
11
|
scraped_site = create(:scraped_site, url: 'http://example.com')
|
12
12
|
|
13
13
|
allow(Scraper).to receive(:new).and_return(scraper)
|
14
14
|
|
15
|
-
expect(downloader).to receive(:load).with('http://example.com')
|
15
|
+
expect(downloader).to receive(:load_following_refresh_tags).with('http://example.com')
|
16
16
|
|
17
17
|
job.perform(scraped_site)
|
18
18
|
end
|
@@ -0,0 +1,178 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
module Pageflow
|
4
|
+
module Chart
|
5
|
+
describe RefreshTagFollowingDownloader do
|
6
|
+
describe '#load_following_refresh_tags' do
|
7
|
+
it 'delegates to downloader if no refresh meta tag is found' do
|
8
|
+
downloader = double(Downloader)
|
9
|
+
refresh_tag_following_downloader = RefreshTagFollowingDownloader.new(downloader)
|
10
|
+
|
11
|
+
original_url = 'http://datawrapper.dwcdn.net/HPKfl/2/'
|
12
|
+
|
13
|
+
chart_html = <<-HTML
|
14
|
+
<html><head><title>A chart</title></head></html>
|
15
|
+
HTML
|
16
|
+
|
17
|
+
result = ''
|
18
|
+
|
19
|
+
allow(downloader).to receive(:load)
|
20
|
+
.with(original_url)
|
21
|
+
.and_yield(StringIO.new(chart_html))
|
22
|
+
|
23
|
+
refresh_tag_following_downloader.load_following_refresh_tags(original_url) do |file|
|
24
|
+
result = file.read
|
25
|
+
end
|
26
|
+
|
27
|
+
expect(result).to eq(chart_html)
|
28
|
+
end
|
29
|
+
|
30
|
+
it 'looks for refresh meta tags and loads their url instead' do
|
31
|
+
downloader = double(Downloader)
|
32
|
+
refresh_tag_following_downloader = RefreshTagFollowingDownloader.new(downloader)
|
33
|
+
|
34
|
+
original_url = 'http://datawrapper.dwcdn.net/HPKfl/2/'
|
35
|
+
target_url = 'http://other.dwcdn.net/HPKfl/5/'
|
36
|
+
|
37
|
+
redirect_html = <<-HTML
|
38
|
+
<html><head><meta http-equiv="REFRESH" content="0; url=http://other.dwcdn.net/HPKfl/5/"></head></html>
|
39
|
+
HTML
|
40
|
+
chart_html = <<-HTML
|
41
|
+
<html><head><title>A chart</title></head></html>
|
42
|
+
HTML
|
43
|
+
|
44
|
+
result = ''
|
45
|
+
|
46
|
+
allow(downloader).to receive(:load)
|
47
|
+
.with(original_url)
|
48
|
+
.and_yield(StringIO.new(redirect_html))
|
49
|
+
|
50
|
+
allow(downloader).to receive(:load)
|
51
|
+
.with(target_url)
|
52
|
+
.and_yield(StringIO.new(chart_html))
|
53
|
+
|
54
|
+
refresh_tag_following_downloader.load_following_refresh_tags(original_url) do |file|
|
55
|
+
result = file.read
|
56
|
+
end
|
57
|
+
|
58
|
+
expect(result).to eq(chart_html)
|
59
|
+
end
|
60
|
+
|
61
|
+
it 'supports schema relative urls' do
|
62
|
+
downloader = double(Downloader)
|
63
|
+
refresh_tag_following_downloader = RefreshTagFollowingDownloader.new(downloader)
|
64
|
+
|
65
|
+
original_url = 'http://datawrapper.dwcdn.net/HPKfl/2/'
|
66
|
+
target_url = 'http://other.dwcdn.net/HPKfl/5/'
|
67
|
+
|
68
|
+
redirect_html = <<-HTML
|
69
|
+
<html><head><meta http-equiv="REFRESH" content="0; url=//other.dwcdn.net/HPKfl/5/"></head></html>
|
70
|
+
HTML
|
71
|
+
chart_html = <<-HTML
|
72
|
+
<html><head><title>A chart</title></head></html>
|
73
|
+
HTML
|
74
|
+
|
75
|
+
result = ''
|
76
|
+
|
77
|
+
allow(downloader).to receive(:load)
|
78
|
+
.with(original_url)
|
79
|
+
.and_yield(StringIO.new(redirect_html))
|
80
|
+
|
81
|
+
allow(downloader).to receive(:load)
|
82
|
+
.with(target_url)
|
83
|
+
.and_yield(StringIO.new(chart_html))
|
84
|
+
|
85
|
+
refresh_tag_following_downloader.load_following_refresh_tags(original_url) do |file|
|
86
|
+
result = file.read
|
87
|
+
end
|
88
|
+
|
89
|
+
expect(result).to eq(chart_html)
|
90
|
+
end
|
91
|
+
|
92
|
+
it 'supports relative urls' do
|
93
|
+
downloader = double(Downloader)
|
94
|
+
refresh_tag_following_downloader = RefreshTagFollowingDownloader.new(downloader)
|
95
|
+
|
96
|
+
original_url = 'http://datawrapper.dwcdn.net/HPKfl/2/'
|
97
|
+
target_url = 'http://datawrapper.dwcdn.net/HPKfl/5/'
|
98
|
+
|
99
|
+
redirect_html = <<-HTML
|
100
|
+
<html><head><meta http-equiv="REFRESH" content="0; url=/HPKfl/5/"></head></html>
|
101
|
+
HTML
|
102
|
+
chart_html = <<-HTML
|
103
|
+
<html><head><title>A chart</title></head></html>
|
104
|
+
HTML
|
105
|
+
|
106
|
+
result = ''
|
107
|
+
|
108
|
+
allow(downloader).to receive(:load)
|
109
|
+
.with(original_url)
|
110
|
+
.and_yield(StringIO.new(redirect_html))
|
111
|
+
|
112
|
+
allow(downloader).to receive(:load)
|
113
|
+
.with(target_url)
|
114
|
+
.and_yield(StringIO.new(chart_html))
|
115
|
+
|
116
|
+
refresh_tag_following_downloader.load_following_refresh_tags(original_url) do |file|
|
117
|
+
result = file.read
|
118
|
+
end
|
119
|
+
|
120
|
+
expect(result).to eq(chart_html)
|
121
|
+
end
|
122
|
+
|
123
|
+
it 'fails on too many redirects' do
|
124
|
+
downloader = double(Downloader)
|
125
|
+
refresh_tag_following_downloader = RefreshTagFollowingDownloader.new(downloader)
|
126
|
+
|
127
|
+
original_url = 'http://datawrapper.dwcdn.net/HPKfl/2/'
|
128
|
+
|
129
|
+
redirect_html = <<-HTML
|
130
|
+
<html><head><meta http-equiv="REFRESH" content="0; url=#{original_url}"></head></html>
|
131
|
+
HTML
|
132
|
+
|
133
|
+
allow(downloader).to receive(:load).with(original_url) do |&block|
|
134
|
+
block.call(StringIO.new(redirect_html))
|
135
|
+
end
|
136
|
+
|
137
|
+
expect {
|
138
|
+
refresh_tag_following_downloader.load_following_refresh_tags(original_url)
|
139
|
+
}.to raise_error(RefreshTagFollowingDownloader::TooManyRedirects)
|
140
|
+
end
|
141
|
+
|
142
|
+
it 'fails on invalid refresh meta tag' do
|
143
|
+
downloader = double(Downloader)
|
144
|
+
refresh_tag_following_downloader = RefreshTagFollowingDownloader.new(downloader)
|
145
|
+
|
146
|
+
original_url = 'http://datawrapper.dwcdn.net/HPKfl/2/'
|
147
|
+
|
148
|
+
redirect_html = <<-HTML
|
149
|
+
<html><head><meta http-equiv="REFRESH" content="something strange"></head></html>
|
150
|
+
HTML
|
151
|
+
|
152
|
+
allow(downloader).to receive(:load).with(original_url).and_yield(StringIO.new(redirect_html))
|
153
|
+
|
154
|
+
expect {
|
155
|
+
refresh_tag_following_downloader.load_following_refresh_tags(original_url)
|
156
|
+
}.to raise_error(RefreshTagFollowingDownloader::NoUrlInRefreshMetaTag)
|
157
|
+
end
|
158
|
+
|
159
|
+
it 'fails on refresh meta tag without content attribute' do
|
160
|
+
downloader = double(Downloader)
|
161
|
+
refresh_tag_following_downloader = RefreshTagFollowingDownloader.new(downloader)
|
162
|
+
|
163
|
+
original_url = 'http://datawrapper.dwcdn.net/HPKfl/2/'
|
164
|
+
|
165
|
+
redirect_html = <<-HTML
|
166
|
+
<html><head><meta http-equiv="REFRESH"></head></html>
|
167
|
+
HTML
|
168
|
+
|
169
|
+
allow(downloader).to receive(:load).with(original_url).and_yield(StringIO.new(redirect_html))
|
170
|
+
|
171
|
+
expect {
|
172
|
+
refresh_tag_following_downloader.load_following_refresh_tags(original_url)
|
173
|
+
}.to raise_error(RefreshTagFollowingDownloader::NoUrlInRefreshMetaTag)
|
174
|
+
end
|
175
|
+
end
|
176
|
+
end
|
177
|
+
end
|
178
|
+
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pageflow-chart
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.1
|
4
|
+
version: 0.2.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Tim Fischbach
|
@@ -269,6 +269,7 @@ files:
|
|
269
269
|
- lib/pageflow/chart/downloader.rb
|
270
270
|
- lib/pageflow/chart/engine.rb
|
271
271
|
- lib/pageflow/chart/page_type.rb
|
272
|
+
- lib/pageflow/chart/refresh_tag_following_downloader.rb
|
272
273
|
- lib/pageflow/chart/scraper.rb
|
273
274
|
- lib/pageflow/chart/version.rb
|
274
275
|
- spec/controllers/pageflow/chart/scraped_sites_controller_spec.rb
|
@@ -316,6 +317,7 @@ files:
|
|
316
317
|
- spec/models/pageflow/chart/scraped_site_spec.rb
|
317
318
|
- spec/pageflow/chart/configuration_spec.rb
|
318
319
|
- spec/pageflow/chart/downloader_spec.rb
|
320
|
+
- spec/pageflow/chart/refresh_tag_following_downloader_spec.rb
|
319
321
|
- spec/pageflow/chart/scraper_spec.rb
|
320
322
|
- spec/requests/scraping_site_spec.rb
|
321
323
|
- spec/spec_helper.rb
|
@@ -394,6 +396,7 @@ test_files:
|
|
394
396
|
- spec/models/pageflow/chart/scraped_site_spec.rb
|
395
397
|
- spec/pageflow/chart/configuration_spec.rb
|
396
398
|
- spec/pageflow/chart/downloader_spec.rb
|
399
|
+
- spec/pageflow/chart/refresh_tag_following_downloader_spec.rb
|
397
400
|
- spec/pageflow/chart/scraper_spec.rb
|
398
401
|
- spec/requests/scraping_site_spec.rb
|
399
402
|
- spec/spec_helper.rb
|