pageflow-chart 2.1.0 → 2.4.0
This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.travis.yml +1 -0
- data/CHANGELOG.md +12 -8
- data/README.md +2 -2
- data/app/assets/javascripts/pageflow/chart/consent.js +16 -0
- data/app/assets/javascripts/pageflow/chart/editor/config.js +7 -0
- data/app/assets/javascripts/pageflow/chart/editor/models/scraped_site.js +11 -50
- data/app/assets/javascripts/pageflow/chart/editor/views/configuration_editor.js +6 -4
- data/app/assets/javascripts/pageflow/chart/editor/views/embedded/iframe_embedded_view.js +32 -16
- data/app/assets/javascripts/pageflow/chart/editor/views/inputs/scraped_url_input_view.js +18 -39
- data/app/assets/javascripts/pageflow/chart/editor.js +4 -3
- data/app/assets/javascripts/pageflow/chart/page_type.js +61 -53
- data/app/assets/javascripts/pageflow/chart.js +2 -3
- data/app/assets/stylesheets/pageflow/chart/editor.scss +3 -20
- data/app/assets/stylesheets/pageflow/chart/themes/default.scss +3 -0
- data/app/assets/stylesheets/pageflow/chart.scss +9 -16
- data/app/helpers/pageflow/chart/scraped_sites_helper.rb +17 -8
- data/app/jobs/pageflow/chart/scrape_site_job.rb +14 -4
- data/app/models/pageflow/chart/scraped_site.rb +37 -4
- data/app/views/pageflow/chart/editor/scraped_sites/_scraped_site.json.jbuilder +1 -0
- data/app/views/pageflow/chart/page.html.erb +9 -2
- data/chart.gemspec +2 -2
- data/config/locales/de.yml +4 -0
- data/config/locales/en.yml +4 -0
- data/db/migrate/20190531141820_add_file_attributes_to_scraped_sites.rb +8 -0
- data/db/migrate/20190531145431_insert_file_usages_for_scraped_sites.rb +59 -0
- data/db/migrate/20200507141608_add_javascript_body_attachment_to_scraped_site.rb +5 -0
- data/lib/pageflow/chart/configuration.rb +6 -3
- data/lib/pageflow/chart/downloader.rb +4 -1
- data/lib/pageflow/chart/page_type.rb +17 -0
- data/lib/pageflow/chart/plugin.rb +10 -0
- data/lib/pageflow/chart/refresh_tag_following_downloader.rb +3 -3
- data/lib/pageflow/chart/scraper.rb +25 -13
- data/lib/pageflow/chart/version.rb +1 -1
- data/lib/pageflow/chart.rb +4 -0
- data/spec/factories/scraped_sites.rb +17 -3
- data/spec/fixtures/all.css +3 -0
- data/spec/fixtures/all.js +1 -0
- data/spec/fixtures/all_body.js +1 -0
- data/spec/fixtures/data.csv +1 -0
- data/spec/fixtures/index.html +7 -0
- data/spec/helpers/pageflow/chart/scraped_sites_helper_spec.rb +59 -0
- data/spec/integration/file_type_spec.rb +10 -0
- data/spec/jobs/pageflow/chart/scrape_site_job_spec.rb +14 -1
- data/spec/models/pageflow/chart/scraped_site_spec.rb +54 -0
- data/spec/pageflow/chart/downloader_spec.rb +13 -3
- data/spec/pageflow/chart/refresh_tag_following_downloader_spec.rb +23 -10
- data/spec/pageflow/chart/scraper_spec.rb +201 -63
- metadata +28 -20
- data/app/assets/javascripts/pageflow/chart/editor/collections/scraped_sites_collection.js +0 -23
- data/app/assets/javascripts/pageflow/chart/editor/initializers/setup_collections.js +0 -1
- data/app/assets/javascripts/pageflow/chart/editor/templates/scraped_site_status.jst.ejs +0 -2
- data/app/assets/javascripts/pageflow/chart/editor/templates/url_input.jst.ejs +0 -7
- data/app/assets/javascripts/pageflow/chart/editor/views/scraped_site_status_view.js +0 -18
- data/app/controllers/pageflow/chart/application_controller.rb +0 -6
- data/app/controllers/pageflow/chart/scraped_sites_controller.rb +0 -25
- data/config/routes.rb +0 -3
- data/spec/controllers/pageflow/chart/scraped_sites_controller_spec.rb +0 -35
- data/spec/requests/scraping_site_spec.rb +0 -23
data/spec/models/pageflow/chart/scraped_site_spec.rb

@@ -26,5 +26,59 @@ module Pageflow::Chart
       expect(scraped_site_with_custom_theme.use_custom_theme).to eq(true)
       expect(scraped_site_without_custom_theme.use_custom_theme).to eq(false)
     end
+
+    it 'exposes all attachments for export' do
+      scraped_site = ScrapedSite.new(url: 'http://example.com/foo/index.html')
+
+      expect(scraped_site.attachments_for_export.map(&:name))
+        .to eq(%i[javascript_file javascript_body_file stylesheet_file html_file csv_file])
+    end
+
+    describe '#publish!' do
+      it 'transitions state to processing for new site' do
+        scraped_site = ScrapedSite.new(url: 'http://example.com/foo/index.html')
+
+        scraped_site.publish!
+
+        expect(scraped_site.state).to eq('processing')
+      end
+
+      it 'transitions state to processed if html file is already set ' \
+         '(e.g. for sites that have been created via entry import)' do
+        scraped_site = ScrapedSite.new(url: 'http://example.com/foo/index.html',
+                                       html_file_file_name: 'index.html')
+
+        scraped_site.publish!
+
+        expect(scraped_site.state).to eq('processed')
+      end
+    end
+
+    describe '#retryable?' do
+      it 'is true if processing_failed' do
+        scraped_site = ScrapedSite.new(url: 'http://example.com/foo/index.html',
+                                       state: 'processing_failed')
+
+        expect(scraped_site).to be_retryable
+      end
+
+      it 'is false if processed' do
+        scraped_site = ScrapedSite.new(url: 'http://example.com/foo/index.html',
+                                       state: 'processed')
+
+        expect(scraped_site).not_to be_retryable
+      end
+    end
+
+    describe '#retry!' do
+      it 'transitions state to processing if processing_failed' do
+        scraped_site = ScrapedSite.new(url: 'http://example.com/foo/index.html',
+                                       state: 'processing_failed')
+
+        scraped_site.retry!
+
+        expect(scraped_site.state).to eq('processing')
+      end
+    end
   end
 end
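The new examples pin down the publish/retry state handling of scraped sites. The model itself (data/app/models/pageflow/chart/scraped_site.rb) is not reproduced in this diff; the following plain-Ruby sketch only mirrors the transitions the specs describe, with a hypothetical class name and no persistence or attachment handling:

```ruby
# Hypothetical, simplified stand-in for the transitions exercised above.
# The real ScrapedSite is an ActiveRecord model with file attachments;
# this object only mirrors the state field behavior.
class ScrapedSiteSketch
  attr_reader :url, :state

  def initialize(url:, html_file_file_name: nil, state: nil)
    @url = url
    @html_file_file_name = html_file_file_name
    @state = state
  end

  # Sites that already carry an html file (e.g. created via entry import)
  # skip scraping and count as processed right away.
  def publish!
    @state = @html_file_file_name ? 'processed' : 'processing'
  end

  def retryable?
    state == 'processing_failed'
  end

  # Only failed sites get re-queued for scraping.
  def retry!
    @state = 'processing' if retryable?
  end
end

site = ScrapedSiteSketch.new(url: 'http://example.com/foo/index.html')
site.publish!
site.state # => "processing"
```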
data/spec/pageflow/chart/downloader_spec.rb

@@ -17,17 +17,27 @@ module Pageflow
       expect(result).to eq("aaa")
     end
 
-    it 'ignores HTTP response 404' do
+    it 'ignores HTTP response 404 by default' do
       downloader = Downloader.new
       result = ''
 
-      stub_request(:get,
+      stub_request(:get, 'http://example.com/a').to_return(status: 404, body: 'aaa')
 
       downloader.load('http://example.com/a') do |io|
         result = io.read
       end
 
-      expect(result).to eq(
+      expect(result).to eq('')
+    end
+
+    it 'supports raising error on HTTP response 404 ' do
+      downloader = Downloader.new
+
+      stub_request(:get, 'http://example.com/a').to_return(status: 404, body: 'aaa')
+
+      expect {
+        downloader.load('http://example.com/a', raise_on_http_error: true)
+      }.to raise_error(Downloader::HTTPError)
     end
 
     it 'derives protocol from base_url' do
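These specs describe a new `raise_on_http_error` option: error responses are still skipped silently by default, but can be turned into a `Downloader::HTTPError`. The actual implementation lives in data/lib/pageflow/chart/downloader.rb and is not shown here; a minimal sketch of that behavior, assuming a plain Net::HTTP request and a hypothetical class name:

```ruby
require 'net/http'
require 'stringio'
require 'uri'

# Sketch only: mirrors the option exercised by the specs above, not the
# gem's actual Downloader (which also handles base URLs and file output).
class DownloaderSketch
  HTTPError = Class.new(StandardError)

  # Yields an IO with the response body. On a non-success status the block
  # is skipped, unless raise_on_http_error is set, in which case it raises.
  def load(url, raise_on_http_error: false)
    response = Net::HTTP.get_response(URI(url))

    unless response.is_a?(Net::HTTPSuccess)
      raise HTTPError, "#{url} responded with status #{response.code}" if raise_on_http_error
      return
    end

    yield StringIO.new(response.body) if block_given?
  end
end
```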
data/spec/pageflow/chart/refresh_tag_following_downloader_spec.rb

@@ -17,7 +17,7 @@ module Pageflow
       result = ''
 
       allow(downloader).to receive(:load)
-        .with(original_url)
+        .with(original_url, {})
         .and_yield(StringIO.new(chart_html))
 
       refresh_tag_following_downloader.load_following_refresh_tags(original_url) do |file|
@@ -27,6 +27,19 @@ module Pageflow
       expect(result).to eq(chart_html)
     end
 
+    it 'passes raise_on_http_error to downloader' do
+      downloader = double(Downloader).as_null_object
+      refresh_tag_following_downloader = RefreshTagFollowingDownloader.new(downloader)
+
+      original_url = 'http://datawrapper.dwcdn.net/HPKfl/2/'
+
+      expect(downloader).to receive(:load)
+        .with(original_url, raise_on_http_error: true)
+
+      refresh_tag_following_downloader.load_following_refresh_tags(original_url,
+                                                                   raise_on_http_error: true)
+    end
+
     it 'looks for refresh meta tags and loads their url instead' do
       downloader = double(Downloader)
       refresh_tag_following_downloader = RefreshTagFollowingDownloader.new(downloader)
@@ -44,11 +57,11 @@ module Pageflow
       result = ''
 
       allow(downloader).to receive(:load)
-        .with(original_url)
+        .with(original_url, {})
         .and_yield(StringIO.new(redirect_html))
 
       allow(downloader).to receive(:load)
-        .with(target_url)
+        .with(target_url, {})
         .and_yield(StringIO.new(chart_html))
 
       refresh_tag_following_downloader.load_following_refresh_tags(original_url) do |file|
@@ -75,11 +88,11 @@ module Pageflow
       result = ''
 
       allow(downloader).to receive(:load)
-        .with(original_url)
+        .with(original_url, {})
         .and_yield(StringIO.new(redirect_html))
 
       allow(downloader).to receive(:load)
-        .with(target_url)
+        .with(target_url, {})
         .and_yield(StringIO.new(chart_html))
 
       refresh_tag_following_downloader.load_following_refresh_tags(original_url) do |file|
@@ -106,11 +119,11 @@ module Pageflow
       result = ''
 
       allow(downloader).to receive(:load)
-        .with(original_url)
+        .with(original_url, {})
         .and_yield(StringIO.new(redirect_html))
 
       allow(downloader).to receive(:load)
-        .with(target_url)
+        .with(target_url, {})
         .and_yield(StringIO.new(chart_html))
 
       refresh_tag_following_downloader.load_following_refresh_tags(original_url) do |file|
@@ -130,7 +143,7 @@ module Pageflow
         <html><head><meta http-equiv="REFRESH" content="0; url=#{original_url}"></head></html>
       HTML
 
-      allow(downloader).to receive(:load).with(original_url) do |&block|
+      allow(downloader).to receive(:load).with(original_url, {}) do |&block|
         block.call(StringIO.new(redirect_html))
       end
 
@@ -149,7 +162,7 @@ module Pageflow
         <html><head><meta http-equiv="REFRESH" content="something strange"></head></html>
       HTML
 
-      allow(downloader).to receive(:load).with(original_url).and_yield(StringIO.new(redirect_html))
+      allow(downloader).to receive(:load).with(original_url, {}).and_yield(StringIO.new(redirect_html))
 
       expect {
         refresh_tag_following_downloader.load_following_refresh_tags(original_url)
@@ -166,7 +179,7 @@ module Pageflow
         <html><head><meta http-equiv="REFRESH"></head></html>
       HTML
 
-      allow(downloader).to receive(:load).with(original_url).and_yield(StringIO.new(redirect_html))
+      allow(downloader).to receive(:load).with(original_url, {}).and_yield(StringIO.new(redirect_html))
 
       expect {
         refresh_tag_following_downloader.load_following_refresh_tags(original_url)
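The updated `.with(original_url, {})` expectations show that options are now forwarded to the wrapped downloader as a trailing hash, so `raise_on_http_error: true` also applies while a `<meta http-equiv="refresh">` redirect is being followed. A sketch of that forwarding, using a hypothetical class name and a simple regex lookup for the refresh target (the gem's implementation in data/lib/pageflow/chart/refresh_tag_following_downloader.rb may differ and also handles broken or circular refresh tags):

```ruby
require 'stringio'

# Sketch of forwarding options while following a single refresh meta tag.
class RefreshTagFollowingDownloaderSketch
  def initialize(downloader)
    @downloader = downloader
  end

  def load_following_refresh_tags(url, options = {}, &block)
    @downloader.load(url, options) do |io|
      html = io.read

      if (target_url = refresh_url(html))
        # Load the redirect target with the same options.
        @downloader.load(target_url, options, &block)
      else
        block&.call(StringIO.new(html))
      end
    end
  end

  private

  # Extracts the url from tags like <meta http-equiv="refresh" content="0; url=...">.
  def refresh_url(html)
    match = html.match(/http-equiv=["']refresh["'][^>]*url=([^"'\s>]+)/i)
    match && match[1]
  end
end
```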
data/spec/pageflow/chart/scraper_spec.rb

@@ -19,27 +19,67 @@ module Pageflow
       expect(scraper.html).to include('contents')
     end
 
-    it '
+    it 'filters blacklisted selectors' do
       html = <<-HTML
         <!DOCTYPE html>
         <html>
           <head>
-            <
-            <script type="text/javascript" src="/other.js"></script>
+            <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
           </head>
           <body>
+            <div id="bad" class="noscript"></div>
+            <div id="good"></div>
           </body>
         </html>
       HTML
-      scraper = Scraper.new(html)
+      scraper = Scraper.new(html, selector_blacklist: ['body .noscript'])
 
-      expect(HtmlFragment.new(scraper.html)).
-      expect(HtmlFragment.new(scraper.html)).
+      expect(HtmlFragment.new(scraper.html)).to have_tag('body #good')
+      expect(HtmlFragment.new(scraper.html)).not_to have_tag('body #bad')
     end
 
-
+    describe 'stylesheets in head' do
+      it 'combines link tags in head' do
+        html = <<-HTML
+          <!DOCTYPE html>
+          <html>
+            <head>
+              <link rel="stylesheet" type="text/css" href="/some.css">
+              <link rel="stylesheet" type="text/css" href="/other.css">
+            </head>
+            <body>
+            </body>
+          </html>
+        HTML
+        scraper = Scraper.new(html)
+
+        expect(HtmlFragment.new(scraper.html)).not_to have_tag('head link[href="/some.css"]')
+        expect(HtmlFragment.new(scraper.html)).to have_tag('head link[href="all.css"]')
+      end
+    end
+
+    describe 'scripts in head' do
+      it 'combines script tags in head' do
+        html = <<-HTML
+          <!DOCTYPE html>
+          <html>
+            <head>
+              <script type="text/javascript" src="/some.js"></script>
+              <script type="text/javascript" src="/other.js"></script>
+            </head>
+            <body>
+            </body>
+          </html>
+        HTML
+        scraper = Scraper.new(html)
+
+        expect(HtmlFragment.new(scraper.html)).not_to have_tag('head script[src="/some.js"]')
+        expect(HtmlFragment.new(scraper.html)).to have_tag('head script[src="all.js"]')
+      end
+
+      it 'inserts script tag at position of first script src tag to keep position ' \
         'between inline scripts' do
-
+        html = <<-HTML
          <!DOCTYPE html>
          <html>
            <head>
@@ -55,122 +95,220 @@ module Pageflow
            <body>
            </body>
          </html>
-
-
+        HTML
+        scraper = Scraper.new(html)
 
-
+        fragment = HtmlFragment.new(scraper.html)
 
-
-
-
+        expect(fragment).to have_tags_in_order('head script#setup',
+                                               'head script[src="all.js"]',
+                                               'head script#usage')
+      end
     end
 
-
-
+    describe 'scripts in body' do
+      it 'combines script tags in body' do
+        html = <<-HTML
          <!DOCTYPE html>
          <html>
            <head>
-             <link rel="stylesheet" type="text/css" href="/some.css">
-             <link rel="stylesheet" type="text/css" href="/other.css">
            </head>
            <body>
+            <script type="text/javascript" src="/some.js"></script>
+            <script type="text/javascript" src="/other.js"></script>
           </body>
         </html>
-
-
+        HTML
+        scraper = Scraper.new(html)
 
-
-
-
+        expect(HtmlFragment.new(scraper.html)).not_to have_tag('body script[src="/some.js"]')
+        expect(HtmlFragment.new(scraper.html)).to have_tag('body script[src="all_body.js"]')
+      end
 
-
-
+      it 'inserts script tag at position of first script src tag to keep position ' \
+         'between inline scripts' do
+        html = <<-HTML
          <!DOCTYPE html>
          <html>
            <head>
-             <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
           </head>
           <body>
-            <script id="
-
+            <script id="setup">
+              // Some setup required for scripts below to execute
+            </script>
+            <script type="text/javascript" src="/some.js"></script>
+            <script type="text/javascript" src="/other.js"></script>
+            <script id="usage">
+              // Some script using stuff loading above
+            </script>
           </body>
         </html>
-
-
+        HTML
+        scraper = Scraper.new(html)
+
+        fragment = HtmlFragment.new(scraper.html)
+
+        expect(fragment).to have_tags_in_order('body script#setup',
+                                               'body script[src="all_body.js"]',
+                                               'body script#usage')
+      end
 
-
-
+      it 'filters blacklisted inline scripts' do
+        html = <<-HTML
+          <!DOCTYPE html>
+          <html>
+            <head>
+              <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
+            </head>
+            <body>
+              <script id="good">window.ok = true;</script>
+              <script id="bad">alert();</script>
+            </body>
+          </html>
+        HTML
+        scraper = Scraper.new(html, inline_script_blacklist: [/alert/])
+
+        expect(HtmlFragment.new(scraper.html)).to have_tag('body script#good')
+        expect(HtmlFragment.new(scraper.html)).not_to have_tag('body script#bad')
+      end
     end
+  end
 
-
-
+  describe '#javascript_urls' do
+    describe 'scripts in head' do
+      it 'returns list of urls to javascript files' do
+        html = <<-HTML
          <!DOCTYPE html>
          <html>
            <head>
-            <
+            <script type="text/javascript" src="/some.js"></script>
+            <script type="text/javascript" src="/other.js"></script>
           </head>
           <body>
-            <div id="bad" class="noscript"></div>
-            <div id="good"></div>
           </body>
         </html>
-
-
+        HTML
+        scraper = Scraper.new(html)
 
-
-
-        end
-      end
+        expect(scraper.javascript_urls_in_head).to eq(['/some.js', '/other.js'])
+      end
 
-
-
-      html = <<-HTML
+      it 'filters by blacklist' do
+        html = <<-HTML
          <!DOCTYPE html>
          <html>
            <head>
             <script type="text/javascript" src="/some.js"></script>
-            <script type="text/javascript" src="/
+            <script type="text/javascript" src="http://example.com/piwik.js"></script>
           </head>
           <body>
           </body>
         </html>
-
-
+        HTML
+        scraper = Scraper.new(html, head_script_blacklist: [/piwik/])
+
+        expect(scraper.javascript_urls_in_head).to eq(['/some.js'])
+      end
 
-
+      it 'ignores inline scripts in head' do
+        html = <<-HTML
+          <!DOCTYPE html>
+          <html>
+            <head>
+              <script type="text/javascript"></script>
+            </head>
+            <body>
+            </body>
+          </html>
+        HTML
+        scraper = Scraper.new(html)
+
+        expect(scraper.javascript_urls_in_head).to eq([])
+      end
+
+      it 'ignores scripts in body' do
+        html = <<-HTML
+          <!DOCTYPE html>
+          <html>
+            <head>
+            </head>
+            <body>
+              <script type="text/javascript" src="/some.js"></script>
+            </body>
+          </html>
+        HTML
+        scraper = Scraper.new(html)
+
+        expect(scraper.javascript_urls_in_head).to eq([])
+      end
     end
 
-
-
+    describe 'scripts in body' do
+      it 'ignores scripts in head' do
+        html = <<-HTML
          <!DOCTYPE html>
          <html>
            <head>
             <script type="text/javascript" src="/some.js"></script>
-            <script type="text/javascript" src="http://example.com/piwik.js"></script>
           </head>
           <body>
           </body>
         </html>
-
-
+        HTML
+        scraper = Scraper.new(html)
 
-
-
+        expect(scraper.javascript_urls_in_body).to eq([])
+      end
 
-
-
+      it 'returns list of urls to javascript files' do
+        html = <<-HTML
          <!DOCTYPE html>
          <html>
            <head>
-            <script type="text/javascript"></script>
           </head>
           <body>
+            <script type="text/javascript" src="/some.js"></script>
+            <script type="text/javascript" src="/other.js"></script>
           </body>
         </html>
-
-
+        HTML
+        scraper = Scraper.new(html)
+
+        expect(scraper.javascript_urls_in_body).to eq(['/some.js', '/other.js'])
+      end
+
+      it 'filters by blacklist' do
+        html = <<-HTML
+          <!DOCTYPE html>
+          <html>
+            <head>
+            </head>
+            <body>
+              <script type="text/javascript" src="/some.js"></script>
+              <script type="text/javascript" src="http://example.com/piwik.js"></script>
+            </body>
+          </html>
+        HTML
+        scraper = Scraper.new(html, body_script_blacklist: [/piwik/])
+
+        expect(scraper.javascript_urls_in_body).to eq(['/some.js'])
+      end
+
+      it 'ignores inline scripts in body' do
+        html = <<-HTML
+          <!DOCTYPE html>
+          <html>
+            <head>
+            </head>
+            <body>
+              <script type="text/javascript"></script>
+            </body>
+          </html>
+        HTML
+        scraper = Scraper.new(html)
 
-
+        expect(scraper.javascript_urls_in_body).to eq([])
+      end
     end
   end
 
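Taken together, the new scraper specs document a richer constructor (selector, head script, body script and inline script blacklists) and two query methods for the URLs of external scripts. A usage sketch as exercised by those specs, assuming the engine is loaded; the option and method names come straight from the specs, while the class itself lives in data/lib/pageflow/chart/scraper.rb and is not reproduced in this diff:

```ruby
html = <<-HTML
  <!DOCTYPE html>
  <html>
    <head><script type="text/javascript" src="/some.js"></script></head>
    <body><div class="noscript"></div></body>
  </html>
HTML

# Option names below are taken from the specs above.
scraper = Pageflow::Chart::Scraper.new(html,
                                       selector_blacklist: ['body .noscript'],
                                       head_script_blacklist: [/piwik/],
                                       body_script_blacklist: [/piwik/],
                                       inline_script_blacklist: [/alert/])

scraper.html                     # cleaned markup with combined asset references
                                 # (all.css, all.js, all_body.js)
scraper.javascript_urls_in_head  # e.g. ['/some.js']
scraper.javascript_urls_in_body  # script srcs found in <body>, minus blacklisted ones
```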