pageflow-chart 0.2.1 → 0.2.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 11c6001c705a34c2d17a065441d9686e344d7599
4
- data.tar.gz: 46f602eb70655ddd464c18eda8de6d609361d6b3
3
+ metadata.gz: 1d165bb8184706fe6b0de27586f90d10ea710380
4
+ data.tar.gz: 5c330166481ba0d5ebf61256dc0d0ca81c47e97c
5
5
  SHA512:
6
- metadata.gz: 4bd30df75eb0b9604b04f4d7a7a4aabd713b622a171453f58107318acb2dc4fe1a86421c6ccabe958f4d9b04531c262e0a3839975934d43a98823ac4d904aba6
7
- data.tar.gz: ae52e80fa81f6cbd3e94102874c6902a151b6c77b4b55bd0422db8c0b7595e91b993ad18b5164c467e3c5b455d2c3d722c27635c113535bd5db1d03a18ff6b1e
6
+ metadata.gz: 031a5819cda5782c92e801f1cc229ccf73be853df6c7cb0a5843f9abe6cdc1bcc36b7a4da10d8464ad6251a43228cb661dab057eb0f1f9eff568a51cab3755d3
7
+ data.tar.gz: 5aadd7313e897e65b0d834dc3e47f130724141612c3bf22af7862388e336337e9532f6e416b0b260fd9c7b3acfb3a2c50e948e296b41e4b9373dddfd705c18d5
data/CHANGELOG.md CHANGED
@@ -1,5 +1,14 @@
1
1
  # CHANGELOG
2
2
 
3
+ ### Version 0.2.2
4
+
5
+ 2017-07-12
6
+
7
+ [Compare changes](https://github.com/codevise/pageflow-chart/compare/v0.2.1...v0.2.2)
8
+
9
+ - Follow redirects from refresh meta tags
10
+ ([#37](https://github.com/codevise/pageflow-chart/pull/37))
11
+
3
12
  ### Version 0.2.1
4
13
 
5
14
  2017-07-12
@@ -11,7 +11,7 @@ module Pageflow
11
11
  end
12
12
 
13
13
  def perform(scraped_site)
14
- downloader.load(scraped_site.url) do |file|
14
+ downloader.load_following_refresh_tags(scraped_site.url) do |file|
15
15
  scraper = Scraper.new(file.read, Chart.config.scraper_options)
16
16
  scraped_site.html_file = StringIOWithContentType.new(
17
17
  scraper.html,
@@ -42,7 +42,8 @@ module Pageflow
42
42
 
43
43
  def self.perform_with_result(scraped_site, options = {})
44
44
  # This is where the downloader passed to `initialize` is created.
45
- new(Downloader.new(base_url: scraped_site.url)).perform(scraped_site)
45
+ new(RefreshTagFollowingDownloader.new(Downloader.new(base_url: scraped_site.url)))
46
+ .perform(scraped_site)
46
47
  end
47
48
 
48
49
  def begin_try_catch
@@ -0,0 +1,62 @@
1
+ require 'nokogiri'
2
+ require 'uri'
3
+
4
+ module Pageflow
5
+ module Chart
6
+ class RefreshTagFollowingDownloader < SimpleDelegator
7
+ MAX_REDIRECT_COUNT = 3
8
+
9
+ class TooManyRedirects < StandardError; end
10
+ class NoUrlInRefreshMetaTag < StandardError; end
11
+
12
+ def load_following_refresh_tags(url, redirect_count = 0, &block)
13
+ load(url) do |file|
14
+ if (redirect_url = find_refresh_meta_tag_url(file.read))
15
+ if redirect_count >= MAX_REDIRECT_COUNT
16
+ raise TooManyRedirects, 'Too many redirects via refresh meta tags.'
17
+ end
18
+
19
+ redirect_url = ensure_absolute(redirect_url, url)
20
+ return load_following_refresh_tags(redirect_url, redirect_count + 1, &block)
21
+ end
22
+
23
+ file.rewind
24
+ yield file if block_given?
25
+ end
26
+ end
27
+
28
+ private
29
+
30
+ def find_refresh_meta_tag_url(html)
31
+ tag = find_refresh_meta_tag(html)
32
+
33
+ extract_redirect_url(tag) if tag
34
+ end
35
+
36
+ def find_refresh_meta_tag(html)
37
+ document = Nokogiri::HTML(html)
38
+ document.at_css('head meta[http-equiv="REFRESH"]')
39
+ end
40
+
41
+ def extract_redirect_url(tag)
42
+ if tag[:content] && tag[:content] =~ /url=/
43
+ tag[:content].split('url=').last
44
+ else
45
+ raise NoUrlInRefreshMetaTag, "Could not extract url from #{tag}."
46
+ end
47
+ end
48
+
49
+ def ensure_absolute(url, context_url)
50
+ uri = URI(url)
51
+ context_uri = URI(context_url)
52
+
53
+ [
54
+ uri.scheme || context_uri.scheme,
55
+ '://',
56
+ uri.host || context_uri.host,
57
+ uri.path
58
+ ].join('')
59
+ end
60
+ end
61
+ end
62
+ end
@@ -1,5 +1,5 @@
1
1
  module Pageflow
2
2
  module Chart
3
- VERSION = '0.2.1'.freeze
3
+ VERSION = '0.2.2'.freeze
4
4
  end
5
5
  end
@@ -5,14 +5,14 @@ module Pageflow
5
5
  describe ScrapeSiteJob do
6
6
  describe '#perform' do
7
7
  it 'scrapes html' do
8
- scraper = double("Scraper", html: '<html>rewritten</html>')
9
- downloader = double("Downloader", load: '<html>original</html>')
8
+ scraper = double('Scraper', html: '<html>rewritten</html>')
9
+ downloader = double('Downloader', load: '<html>original</html>')
10
10
  job = ScrapeSiteJob.new(downloader)
11
11
  scraped_site = create(:scraped_site, url: 'http://example.com')
12
12
 
13
13
  allow(Scraper).to receive(:new).and_return(scraper)
14
14
 
15
- expect(downloader).to receive(:load).with('http://example.com')
15
+ expect(downloader).to receive(:load_following_refresh_tags).with('http://example.com')
16
16
 
17
17
  job.perform(scraped_site)
18
18
  end
@@ -0,0 +1,178 @@
1
+ require 'spec_helper'
2
+
3
+ module Pageflow
4
+ module Chart
5
+ describe RefreshTagFollowingDownloader do
6
+ describe '#load_following_refresh_tags' do
7
+ it 'delegates to downloader if no refresh meta tag is found' do
8
+ downloader = double(Downloader)
9
+ refresh_tag_following_downloader = RefreshTagFollowingDownloader.new(downloader)
10
+
11
+ original_url = 'http://datawrapper.dwcdn.net/HPKfl/2/'
12
+
13
+ chart_html = <<-HTML
14
+ <html><head><title>A chart</title></head></html>
15
+ HTML
16
+
17
+ result = ''
18
+
19
+ allow(downloader).to receive(:load)
20
+ .with(original_url)
21
+ .and_yield(StringIO.new(chart_html))
22
+
23
+ refresh_tag_following_downloader.load_following_refresh_tags(original_url) do |file|
24
+ result = file.read
25
+ end
26
+
27
+ expect(result).to eq(chart_html)
28
+ end
29
+
30
+ it 'looks for refresh meta tags and loads their url instead' do
31
+ downloader = double(Downloader)
32
+ refresh_tag_following_downloader = RefreshTagFollowingDownloader.new(downloader)
33
+
34
+ original_url = 'http://datawrapper.dwcdn.net/HPKfl/2/'
35
+ target_url = 'http://other.dwcdn.net/HPKfl/5/'
36
+
37
+ redirect_html = <<-HTML
38
+ <html><head><meta http-equiv="REFRESH" content="0; url=http://other.dwcdn.net/HPKfl/5/"></head></html>
39
+ HTML
40
+ chart_html = <<-HTML
41
+ <html><head><title>A chart</title></head></html>
42
+ HTML
43
+
44
+ result = ''
45
+
46
+ allow(downloader).to receive(:load)
47
+ .with(original_url)
48
+ .and_yield(StringIO.new(redirect_html))
49
+
50
+ allow(downloader).to receive(:load)
51
+ .with(target_url)
52
+ .and_yield(StringIO.new(chart_html))
53
+
54
+ refresh_tag_following_downloader.load_following_refresh_tags(original_url) do |file|
55
+ result = file.read
56
+ end
57
+
58
+ expect(result).to eq(chart_html)
59
+ end
60
+
61
+ it 'supports schema relative urls' do
62
+ downloader = double(Downloader)
63
+ refresh_tag_following_downloader = RefreshTagFollowingDownloader.new(downloader)
64
+
65
+ original_url = 'http://datawrapper.dwcdn.net/HPKfl/2/'
66
+ target_url = 'http://other.dwcdn.net/HPKfl/5/'
67
+
68
+ redirect_html = <<-HTML
69
+ <html><head><meta http-equiv="REFRESH" content="0; url=//other.dwcdn.net/HPKfl/5/"></head></html>
70
+ HTML
71
+ chart_html = <<-HTML
72
+ <html><head><title>A chart</title></head></html>
73
+ HTML
74
+
75
+ result = ''
76
+
77
+ allow(downloader).to receive(:load)
78
+ .with(original_url)
79
+ .and_yield(StringIO.new(redirect_html))
80
+
81
+ allow(downloader).to receive(:load)
82
+ .with(target_url)
83
+ .and_yield(StringIO.new(chart_html))
84
+
85
+ refresh_tag_following_downloader.load_following_refresh_tags(original_url) do |file|
86
+ result = file.read
87
+ end
88
+
89
+ expect(result).to eq(chart_html)
90
+ end
91
+
92
+ it 'supports relative urls' do
93
+ downloader = double(Downloader)
94
+ refresh_tag_following_downloader = RefreshTagFollowingDownloader.new(downloader)
95
+
96
+ original_url = 'http://datawrapper.dwcdn.net/HPKfl/2/'
97
+ target_url = 'http://datawrapper.dwcdn.net/HPKfl/5/'
98
+
99
+ redirect_html = <<-HTML
100
+ <html><head><meta http-equiv="REFRESH" content="0; url=/HPKfl/5/"></head></html>
101
+ HTML
102
+ chart_html = <<-HTML
103
+ <html><head><title>A chart</title></head></html>
104
+ HTML
105
+
106
+ result = ''
107
+
108
+ allow(downloader).to receive(:load)
109
+ .with(original_url)
110
+ .and_yield(StringIO.new(redirect_html))
111
+
112
+ allow(downloader).to receive(:load)
113
+ .with(target_url)
114
+ .and_yield(StringIO.new(chart_html))
115
+
116
+ refresh_tag_following_downloader.load_following_refresh_tags(original_url) do |file|
117
+ result = file.read
118
+ end
119
+
120
+ expect(result).to eq(chart_html)
121
+ end
122
+
123
+ it 'fails on too many redirects' do
124
+ downloader = double(Downloader)
125
+ refresh_tag_following_downloader = RefreshTagFollowingDownloader.new(downloader)
126
+
127
+ original_url = 'http://datawrapper.dwcdn.net/HPKfl/2/'
128
+
129
+ redirect_html = <<-HTML
130
+ <html><head><meta http-equiv="REFRESH" content="0; url=#{original_url}"></head></html>
131
+ HTML
132
+
133
+ allow(downloader).to receive(:load).with(original_url) do |&block|
134
+ block.call(StringIO.new(redirect_html))
135
+ end
136
+
137
+ expect {
138
+ refresh_tag_following_downloader.load_following_refresh_tags(original_url)
139
+ }.to raise_error(RefreshTagFollowingDownloader::TooManyRedirects)
140
+ end
141
+
142
+ it 'fails on invalid refresh meta tag' do
143
+ downloader = double(Downloader)
144
+ refresh_tag_following_downloader = RefreshTagFollowingDownloader.new(downloader)
145
+
146
+ original_url = 'http://datawrapper.dwcdn.net/HPKfl/2/'
147
+
148
+ redirect_html = <<-HTML
149
+ <html><head><meta http-equiv="REFRESH" content="something strange"></head></html>
150
+ HTML
151
+
152
+ allow(downloader).to receive(:load).with(original_url).and_yield(StringIO.new(redirect_html))
153
+
154
+ expect {
155
+ refresh_tag_following_downloader.load_following_refresh_tags(original_url)
156
+ }.to raise_error(RefreshTagFollowingDownloader::NoUrlInRefreshMetaTag)
157
+ end
158
+
159
+ it 'fails on refresh meta tag without content attribute' do
160
+ downloader = double(Downloader)
161
+ refresh_tag_following_downloader = RefreshTagFollowingDownloader.new(downloader)
162
+
163
+ original_url = 'http://datawrapper.dwcdn.net/HPKfl/2/'
164
+
165
+ redirect_html = <<-HTML
166
+ <html><head><meta http-equiv="REFRESH"></head></html>
167
+ HTML
168
+
169
+ allow(downloader).to receive(:load).with(original_url).and_yield(StringIO.new(redirect_html))
170
+
171
+ expect {
172
+ refresh_tag_following_downloader.load_following_refresh_tags(original_url)
173
+ }.to raise_error(RefreshTagFollowingDownloader::NoUrlInRefreshMetaTag)
174
+ end
175
+ end
176
+ end
177
+ end
178
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pageflow-chart
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.1
4
+ version: 0.2.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Tim Fischbach
@@ -269,6 +269,7 @@ files:
269
269
  - lib/pageflow/chart/downloader.rb
270
270
  - lib/pageflow/chart/engine.rb
271
271
  - lib/pageflow/chart/page_type.rb
272
+ - lib/pageflow/chart/refresh_tag_following_downloader.rb
272
273
  - lib/pageflow/chart/scraper.rb
273
274
  - lib/pageflow/chart/version.rb
274
275
  - spec/controllers/pageflow/chart/scraped_sites_controller_spec.rb
@@ -316,6 +317,7 @@ files:
316
317
  - spec/models/pageflow/chart/scraped_site_spec.rb
317
318
  - spec/pageflow/chart/configuration_spec.rb
318
319
  - spec/pageflow/chart/downloader_spec.rb
320
+ - spec/pageflow/chart/refresh_tag_following_downloader_spec.rb
319
321
  - spec/pageflow/chart/scraper_spec.rb
320
322
  - spec/requests/scraping_site_spec.rb
321
323
  - spec/spec_helper.rb
@@ -394,6 +396,7 @@ test_files:
394
396
  - spec/models/pageflow/chart/scraped_site_spec.rb
395
397
  - spec/pageflow/chart/configuration_spec.rb
396
398
  - spec/pageflow/chart/downloader_spec.rb
399
+ - spec/pageflow/chart/refresh_tag_following_downloader_spec.rb
397
400
  - spec/pageflow/chart/scraper_spec.rb
398
401
  - spec/requests/scraping_site_spec.rb
399
402
  - spec/spec_helper.rb