pageflow-chart 0.2.1 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 11c6001c705a34c2d17a065441d9686e344d7599
4
- data.tar.gz: 46f602eb70655ddd464c18eda8de6d609361d6b3
3
+ metadata.gz: 1d165bb8184706fe6b0de27586f90d10ea710380
4
+ data.tar.gz: 5c330166481ba0d5ebf61256dc0d0ca81c47e97c
5
5
  SHA512:
6
- metadata.gz: 4bd30df75eb0b9604b04f4d7a7a4aabd713b622a171453f58107318acb2dc4fe1a86421c6ccabe958f4d9b04531c262e0a3839975934d43a98823ac4d904aba6
7
- data.tar.gz: ae52e80fa81f6cbd3e94102874c6902a151b6c77b4b55bd0422db8c0b7595e91b993ad18b5164c467e3c5b455d2c3d722c27635c113535bd5db1d03a18ff6b1e
6
+ metadata.gz: 031a5819cda5782c92e801f1cc229ccf73be853df6c7cb0a5843f9abe6cdc1bcc36b7a4da10d8464ad6251a43228cb661dab057eb0f1f9eff568a51cab3755d3
7
+ data.tar.gz: 5aadd7313e897e65b0d834dc3e47f130724141612c3bf22af7862388e336337e9532f6e416b0b260fd9c7b3acfb3a2c50e948e296b41e4b9373dddfd705c18d5
data/CHANGELOG.md CHANGED
@@ -1,5 +1,14 @@
1
1
  # CHANGELOG
2
2
 
3
+ ### Version 0.2.2
4
+
5
+ 2017-07-12
6
+
7
+ [Compare changes](https://github.com/codevise/pageflow-chart/compare/v0.2.1...v0.2.2)
8
+
9
+ - Follow redirects from refresh meta tags
10
+ ([#37](https://github.com/codevise/pageflow-chart/pull/37))
11
+
3
12
  ### Version 0.2.1
4
13
 
5
14
  2017-07-12
@@ -11,7 +11,7 @@ module Pageflow
11
11
  end
12
12
 
13
13
  def perform(scraped_site)
14
- downloader.load(scraped_site.url) do |file|
14
+ downloader.load_following_refresh_tags(scraped_site.url) do |file|
15
15
  scraper = Scraper.new(file.read, Chart.config.scraper_options)
16
16
  scraped_site.html_file = StringIOWithContentType.new(
17
17
  scraper.html,
@@ -42,7 +42,8 @@ module Pageflow
42
42
 
43
43
  def self.perform_with_result(scraped_site, options = {})
44
44
  # This is where the downloader passed to `initialize` is created.
45
- new(Downloader.new(base_url: scraped_site.url)).perform(scraped_site)
45
+ new(RefreshTagFollowingDownloader.new(Downloader.new(base_url: scraped_site.url)))
46
+ .perform(scraped_site)
46
47
  end
47
48
 
48
49
  def begin_try_catch
@@ -0,0 +1,62 @@
1
+ require 'nokogiri'
2
+ require 'uri'
3
+
4
module Pageflow
  module Chart
    # Decorator around a downloader that follows client-side redirects
    # announced via `<meta http-equiv="refresh" content="0; url=...">`.
    #
    # Every method other than `load_following_refresh_tags` is delegated
    # to the wrapped downloader, which has to respond to `load(url)` and
    # yield an IO-like object (`read`, `rewind`) to the given block.
    class RefreshTagFollowingDownloader < SimpleDelegator
      # Maximum number of refresh redirects followed before giving up.
      MAX_REDIRECT_COUNT = 3

      # Captures everything after the first `url=` of a refresh
      # `content` attribute. Matching is case-insensitive and anchored
      # to the start or to a separator, so that a `url=` query
      # parameter inside the target itself is left untouched.
      REFRESH_URL_PATTERN = /(?:\A|[;,\s])url\s*=\s*(.*)\z/im

      # Raised when more than MAX_REDIRECT_COUNT redirects are chained.
      class TooManyRedirects < StandardError; end

      # Raised when a refresh meta tag does not contain a target url.
      class NoUrlInRefreshMetaTag < StandardError; end

      # Loads `url` through the wrapped downloader. If the document
      # contains a refresh meta tag, the redirect target is loaded
      # instead (recursively). The final document is yielded rewound.
      #
      # @param url [String] url of the document to load
      # @param redirect_count [Integer] redirects followed so far
      #   (internal, used by the recursion)
      # @yield [file] IO-like object of the final document
      # @raise [TooManyRedirects] if the redirect chain is too long
      # @raise [NoUrlInRefreshMetaTag] if a refresh tag has no target
      def load_following_refresh_tags(url, redirect_count = 0, &block)
        load(url) do |file|
          redirect_url = find_refresh_meta_tag_url(file.read)

          if redirect_url
            if redirect_count >= MAX_REDIRECT_COUNT
              raise TooManyRedirects, 'Too many redirects via refresh meta tags.'
            end

            # Non-local return: leaves the wrapped `load` call and hands
            # back the result of loading the redirect target.
            return load_following_refresh_tags(ensure_absolute(redirect_url, url),
                                               redirect_count + 1,
                                               &block)
          end

          file.rewind
          # NOTE(review): uses the captured block instead of
          # `block_given?`, which is a Kernel method and therefore
          # depends on how this Delegator dispatches it.
          block.call(file) if block
        end
      end

      private

      # Returns the redirect target of the document's refresh meta tag
      # or nil if the document has no such tag.
      def find_refresh_meta_tag_url(html)
        tag = find_refresh_meta_tag(html)

        extract_redirect_url(tag) if tag
      end

      # Finds the first refresh meta tag inside `<head>`. The
      # `http-equiv` value is compared case-insensitively since both
      # `REFRESH` and `refresh` are in common use.
      def find_refresh_meta_tag(html)
        Nokogiri::HTML(html).css('head meta[http-equiv]').find do |meta|
          meta['http-equiv'].to_s.strip.casecmp('refresh').zero?
        end
      end

      # Extracts the target url from the tag's `content` attribute,
      # removing surrounding whitespace and an optional pair of quotes.
      #
      # @raise [NoUrlInRefreshMetaTag] if the attribute is missing or
      #   contains no (non-empty) url
      def extract_redirect_url(tag)
        target = tag[:content].to_s[REFRESH_URL_PATTERN, 1].to_s.strip
        target = target.sub(/\A(["'])(.*)\1\z/m, '\2').strip

        raise NoUrlInRefreshMetaTag, "Could not extract url from #{tag}." if target.empty?

        target
      end

      # Resolves `url` against `context_url`. Handles absolute,
      # scheme-relative, root-relative and document-relative references
      # and keeps port and query string intact.
      def ensure_absolute(url, context_url)
        URI.join(context_url, url).to_s
      end
    end
  end
end
@@ -1,5 +1,5 @@
1
1
  module Pageflow
2
2
  module Chart
3
- VERSION = '0.2.1'.freeze
3
+ VERSION = '0.2.2'.freeze
4
4
  end
5
5
  end
@@ -5,14 +5,14 @@ module Pageflow
5
5
  describe ScrapeSiteJob do
6
6
  describe '#perform' do
7
7
  it 'scrapes html' do
8
- scraper = double("Scraper", html: '<html>rewritten</html>')
9
- downloader = double("Downloader", load: '<html>original</html>')
8
+ scraper = double('Scraper', html: '<html>rewritten</html>')
9
+ downloader = double('Downloader', load: '<html>original</html>')
10
10
  job = ScrapeSiteJob.new(downloader)
11
11
  scraped_site = create(:scraped_site, url: 'http://example.com')
12
12
 
13
13
  allow(Scraper).to receive(:new).and_return(scraper)
14
14
 
15
- expect(downloader).to receive(:load).with('http://example.com')
15
+ expect(downloader).to receive(:load_following_refresh_tags).with('http://example.com')
16
16
 
17
17
  job.perform(scraped_site)
18
18
  end
@@ -0,0 +1,178 @@
1
+ require 'spec_helper'
2
+
3
module Pageflow
  module Chart
    # Examples for the downloader decorator that follows redirects
    # announced via refresh meta tags.
    describe RefreshTagFollowingDownloader do
      describe '#load_following_refresh_tags' do
        let(:downloader) { double(Downloader) }
        let(:refresh_tag_following_downloader) { RefreshTagFollowingDownloader.new(downloader) }
        let(:original_url) { 'http://datawrapper.dwcdn.net/HPKfl/2/' }

        let(:chart_html) do
          <<-HTML
            <html><head><title>A chart</title></head></html>
          HTML
        end

        # Lets the downloader double yield `html` for `url`.
        def stub_download(url, html)
          allow(downloader).to receive(:load)
            .with(url)
            .and_yield(StringIO.new(html))
        end

        # Loads the original url and returns what the passed block read
        # from the yielded file.
        def loaded_body
          body = ''

          refresh_tag_following_downloader.load_following_refresh_tags(original_url) do |file|
            body = file.read
          end

          body
        end

        it 'delegates to downloader if no refresh meta tag is found' do
          stub_download(original_url, chart_html)

          expect(loaded_body).to eq(chart_html)
        end

        it 'looks for refresh meta tags and loads their url instead' do
          redirect_html = <<-HTML
            <html><head><meta http-equiv="REFRESH" content="0; url=http://other.dwcdn.net/HPKfl/5/"></head></html>
          HTML

          stub_download(original_url, redirect_html)
          stub_download('http://other.dwcdn.net/HPKfl/5/', chart_html)

          expect(loaded_body).to eq(chart_html)
        end

        it 'supports schema relative urls' do
          redirect_html = <<-HTML
            <html><head><meta http-equiv="REFRESH" content="0; url=//other.dwcdn.net/HPKfl/5/"></head></html>
          HTML

          stub_download(original_url, redirect_html)
          stub_download('http://other.dwcdn.net/HPKfl/5/', chart_html)

          expect(loaded_body).to eq(chart_html)
        end

        it 'supports relative urls' do
          redirect_html = <<-HTML
            <html><head><meta http-equiv="REFRESH" content="0; url=/HPKfl/5/"></head></html>
          HTML

          stub_download(original_url, redirect_html)
          stub_download('http://datawrapper.dwcdn.net/HPKfl/5/', chart_html)

          expect(loaded_body).to eq(chart_html)
        end

        it 'fails on too many redirects' do
          redirect_html = <<-HTML
            <html><head><meta http-equiv="REFRESH" content="0; url=#{original_url}"></head></html>
          HTML

          # A fresh StringIO per call: the page redirects to itself and
          # has to be readable on every round trip.
          allow(downloader).to receive(:load).with(original_url) do |&block|
            block.call(StringIO.new(redirect_html))
          end

          expect {
            refresh_tag_following_downloader.load_following_refresh_tags(original_url)
          }.to raise_error(RefreshTagFollowingDownloader::TooManyRedirects)
        end

        it 'fails on invalid refresh meta tag' do
          redirect_html = <<-HTML
            <html><head><meta http-equiv="REFRESH" content="something strange"></head></html>
          HTML

          stub_download(original_url, redirect_html)

          expect {
            refresh_tag_following_downloader.load_following_refresh_tags(original_url)
          }.to raise_error(RefreshTagFollowingDownloader::NoUrlInRefreshMetaTag)
        end

        it 'fails on refresh meta tag without content attribute' do
          redirect_html = <<-HTML
            <html><head><meta http-equiv="REFRESH"></head></html>
          HTML

          stub_download(original_url, redirect_html)

          expect {
            refresh_tag_following_downloader.load_following_refresh_tags(original_url)
          }.to raise_error(RefreshTagFollowingDownloader::NoUrlInRefreshMetaTag)
        end
      end
    end
  end
end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pageflow-chart
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.1
4
+ version: 0.2.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Tim Fischbach
@@ -269,6 +269,7 @@ files:
269
269
  - lib/pageflow/chart/downloader.rb
270
270
  - lib/pageflow/chart/engine.rb
271
271
  - lib/pageflow/chart/page_type.rb
272
+ - lib/pageflow/chart/refresh_tag_following_downloader.rb
272
273
  - lib/pageflow/chart/scraper.rb
273
274
  - lib/pageflow/chart/version.rb
274
275
  - spec/controllers/pageflow/chart/scraped_sites_controller_spec.rb
@@ -316,6 +317,7 @@ files:
316
317
  - spec/models/pageflow/chart/scraped_site_spec.rb
317
318
  - spec/pageflow/chart/configuration_spec.rb
318
319
  - spec/pageflow/chart/downloader_spec.rb
320
+ - spec/pageflow/chart/refresh_tag_following_downloader_spec.rb
319
321
  - spec/pageflow/chart/scraper_spec.rb
320
322
  - spec/requests/scraping_site_spec.rb
321
323
  - spec/spec_helper.rb
@@ -394,6 +396,7 @@ test_files:
394
396
  - spec/models/pageflow/chart/scraped_site_spec.rb
395
397
  - spec/pageflow/chart/configuration_spec.rb
396
398
  - spec/pageflow/chart/downloader_spec.rb
399
+ - spec/pageflow/chart/refresh_tag_following_downloader_spec.rb
397
400
  - spec/pageflow/chart/scraper_spec.rb
398
401
  - spec/requests/scraping_site_spec.rb
399
402
  - spec/spec_helper.rb