pageflow-chart 2.2.0 → 2.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -8
- data/app/jobs/pageflow/chart/scrape_site_job.rb +10 -3
- data/app/models/pageflow/chart/scraped_site.rb +3 -1
- data/db/migrate/20200507141608_add_javascript_body_attachment_to_scraped_site.rb +5 -0
- data/lib/pageflow/chart/configuration.rb +2 -0
- data/lib/pageflow/chart/scraper.rb +25 -13
- data/lib/pageflow/chart/version.rb +1 -1
- data/spec/factories/scraped_sites.rb +1 -0
- data/spec/fixtures/all_body.js +1 -0
- data/spec/models/pageflow/chart/scraped_site_spec.rb +1 -1
- data/spec/pageflow/chart/scraper_spec.rb +201 -63
- metadata +5 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: cf67574bc3fc0e11ee66f37634eeab5b73ff6105ef6c281b022b24fad99fd854
|
4
|
+
data.tar.gz: 21cad9381bcc4c3cc312726b8b4b3b2e8c8a3f4b4f8ef54255848d348ed31a90
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 0f778a73ae92747c4a43e80e8443edd74dc0ecd5056278195b623182108c905b3abef2f48833fb64e7b49d58b9f64fffffe7eaf854c9fb61a896bf57c37c4205
|
7
|
+
data.tar.gz: 15ec876b421dc8ba87d71ec9dc88d007cff39ec3658ce44e8c2bf702d75be6f1130506b8daec5d4ed0ebad453bf3726088d7b7aa2b8ea9ce645f6dc809206dc8
|
data/CHANGELOG.md
CHANGED
@@ -1,16 +1,14 @@
|
|
1
1
|
# CHANGELOG
|
2
2
|
|
3
|
-
### Version 2.2.0
|
3
|
+
### Version 2.3.0
|
4
4
|
|
5
|
-
|
5
|
+
2020-05-11
|
6
6
|
|
7
|
-
[Compare changes](https://github.com/codevise/pageflow-chart/compare/2-1-stable...v2.2.0)
|
7
|
+
[Compare changes](https://github.com/codevise/pageflow-chart/compare/2-2-stable...v2.3.0)
|
8
8
|
|
9
|
-
|
10
|
-
|
11
|
-
- Turn scraped site into file type. Install migrations.
|
12
|
-
([#55](https://github.com/codevise/pageflow-chart/pull/55))
|
9
|
+
- Support Datawrapper charts with script tags in body
|
10
|
+
([#56](https://github.com/codevise/pageflow-chart/pull/56))
|
13
11
|
|
14
12
|
See
|
15
|
-
[2-1-stable branch](https://github.com/codevise/pageflow-chart/blob/2-1-stable/CHANGELOG.md)
|
13
|
+
[2-2-stable branch](https://github.com/codevise/pageflow-chart/blob/2-2-stable/CHANGELOG.md)
|
16
14
|
for previous changes.
|
@@ -17,11 +17,18 @@ module Pageflow
|
|
17
17
|
content_type: 'text/html'
|
18
18
|
)
|
19
19
|
|
20
|
-
downloader.load_all(scraper.
|
20
|
+
downloader.load_all(scraper.javascript_urls_in_head,
|
21
21
|
extension: '.js',
|
22
22
|
before_each: begin_try_catch,
|
23
|
-
after_each: end_try_catch) do |
|
24
|
-
scraped_site.javascript_file =
|
23
|
+
after_each: end_try_catch) do |javascript_head_file|
|
24
|
+
scraped_site.javascript_file = javascript_head_file
|
25
|
+
end
|
26
|
+
|
27
|
+
downloader.load_all(scraper.javascript_urls_in_body,
|
28
|
+
extension: '.js',
|
29
|
+
before_each: begin_try_catch,
|
30
|
+
after_each: end_try_catch) do |javascript_body_file|
|
31
|
+
scraped_site.javascript_body_file = javascript_body_file
|
25
32
|
end
|
26
33
|
|
27
34
|
downloader.load_all(scraper.stylesheet_urls,
|
@@ -4,11 +4,13 @@ module Pageflow
|
|
4
4
|
include Pageflow::ReusableFile
|
5
5
|
|
6
6
|
has_attached_file :javascript_file, Chart.config.paperclip_options(extension: 'js')
|
7
|
+
has_attached_file :javascript_body_file, Chart.config.paperclip_options(basename: 'all_body', extension: 'js')
|
7
8
|
has_attached_file :stylesheet_file, Chart.config.paperclip_options(extension: 'css')
|
8
9
|
has_attached_file :html_file, Chart.config.paperclip_options(extension: 'html')
|
9
10
|
has_attached_file :csv_file, Chart.config.paperclip_options(basename: 'data', extension: 'csv')
|
10
11
|
|
11
12
|
do_not_validate_attachment_file_type(:javascript_file)
|
13
|
+
do_not_validate_attachment_file_type(:javascript_body_file)
|
12
14
|
do_not_validate_attachment_file_type(:stylesheet_file)
|
13
15
|
do_not_validate_attachment_file_type(:html_file)
|
14
16
|
do_not_validate_attachment_file_type(:csv_file)
|
@@ -85,7 +87,7 @@ module Pageflow
|
|
85
87
|
end
|
86
88
|
|
87
89
|
def attachments_for_export
|
88
|
-
[javascript_file, stylesheet_file, html_file, csv_file]
|
90
|
+
[javascript_file, javascript_body_file, stylesheet_file, html_file, csv_file]
|
89
91
|
end
|
90
92
|
end
|
91
93
|
end
|
@@ -22,6 +22,7 @@ module Pageflow
|
|
22
22
|
#
|
23
23
|
# @param [Hash] opts
|
24
24
|
# @option opts [Array<Regexp>] :head_script_blacklist Script tags in page head are ignored if they match any of this list of regexes.
|
25
|
+
# @option opts [Array<Regexp>] :body_script_blacklist Script tags in page body are ignored if they match any of this list of regexes.
|
25
26
|
# @option opts [Array<Regexp>] :inline_script_blacklist Inline script tags are ignored if they match any of this list of regexes.
|
26
27
|
# @option opts [Array<String>] :selector_blacklist HTML-elements matched by selectors in this list will not be scraped.
|
27
28
|
# @return [Hash]
|
@@ -55,6 +56,7 @@ module Pageflow
|
|
55
56
|
def initialize
|
56
57
|
@scraper_options = {
|
57
58
|
head_script_blacklist: [/piwik/],
|
59
|
+
body_script_blacklist: [/piwik/],
|
58
60
|
inline_script_blacklist: [/piwik/],
|
59
61
|
selector_blacklist: ['body .noscript']
|
60
62
|
}
|
@@ -3,7 +3,11 @@ require 'nokogiri'
|
|
3
3
|
module Pageflow
|
4
4
|
module Chart
|
5
5
|
class Scraper
|
6
|
-
attr_reader :document,
|
6
|
+
attr_reader :document,
|
7
|
+
:options,
|
8
|
+
:javascript_urls_in_head,
|
9
|
+
:javascript_urls_in_body,
|
10
|
+
:stylesheet_urls
|
7
11
|
|
8
12
|
def initialize(html, options = {})
|
9
13
|
@document = Nokogiri::HTML(html)
|
@@ -23,14 +27,21 @@ module Pageflow
|
|
23
27
|
private
|
24
28
|
|
25
29
|
def parse
|
26
|
-
parse_javascript_urls
|
30
|
+
parse_javascript_urls(:head)
|
31
|
+
parse_javascript_urls(:body)
|
27
32
|
parse_stylesheet_urls
|
28
33
|
end
|
29
34
|
|
30
|
-
def parse_javascript_urls
|
31
|
-
|
35
|
+
def parse_javascript_urls(container)
|
36
|
+
script_tags = filtered_script_tags_in(container).map do |tag|
|
32
37
|
tag[:src]
|
33
38
|
end
|
39
|
+
|
40
|
+
if container.eql?(:head)
|
41
|
+
@javascript_urls_in_head = script_tags
|
42
|
+
else
|
43
|
+
@javascript_urls_in_body = script_tags
|
44
|
+
end
|
34
45
|
end
|
35
46
|
|
36
47
|
def parse_stylesheet_urls
|
@@ -42,7 +53,8 @@ module Pageflow
|
|
42
53
|
def rewrite
|
43
54
|
filter_inline_scripts
|
44
55
|
filter_by_selectors
|
45
|
-
|
56
|
+
combine_script_tags_in(:head)
|
57
|
+
combine_script_tags_in(:body)
|
46
58
|
combine_css_link_tags
|
47
59
|
end
|
48
60
|
|
@@ -66,12 +78,12 @@ module Pageflow
|
|
66
78
|
end
|
67
79
|
end
|
68
80
|
|
69
|
-
def
|
70
|
-
script_tags_to_remove =
|
81
|
+
def combine_script_tags_in(container)
|
82
|
+
script_tags_to_remove = script_src_tags_in(container)
|
71
83
|
return if script_tags_to_remove.empty?
|
72
84
|
|
73
85
|
all_script_src_tag = Nokogiri::XML::Node.new('script', document)
|
74
|
-
all_script_src_tag[:src] = 'all.js'
|
86
|
+
all_script_src_tag[:src] = container.eql?(:head) ? 'all.js' : 'all_body.js'
|
75
87
|
all_script_src_tag[:type] = 'text/javascript'
|
76
88
|
|
77
89
|
script_tags_to_remove
|
@@ -91,16 +103,16 @@ module Pageflow
|
|
91
103
|
document.at_css('head') << all_css_link_tag
|
92
104
|
end
|
93
105
|
|
94
|
-
def
|
95
|
-
|
96
|
-
options.fetch(
|
106
|
+
def filtered_script_tags_in(container)
|
107
|
+
script_src_tags_in(container).reject do |tag|
|
108
|
+
options.fetch("#{container}_script_blacklist".to_sym, []).any? do |regexp|
|
97
109
|
tag[:src] =~ regexp
|
98
110
|
end
|
99
111
|
end
|
100
112
|
end
|
101
113
|
|
102
|
-
def
|
103
|
-
document.css(
|
114
|
+
def script_src_tags_in(container)
|
115
|
+
document.css("#{container} script[src]")
|
104
116
|
end
|
105
117
|
|
106
118
|
def css_link_tags
|
@@ -8,6 +8,7 @@ module Pageflow
|
|
8
8
|
state { 'processed' }
|
9
9
|
|
10
10
|
javascript_file { File.open(Engine.root.join('spec', 'fixtures', 'all.js')) }
|
11
|
+
javascript_body_file { File.open(Engine.root.join('spec', 'fixtures', 'all_body.js')) }
|
11
12
|
stylesheet_file { File.open(Engine.root.join('spec', 'fixtures', 'all.css')) }
|
12
13
|
html_file { File.open(Engine.root.join('spec', 'fixtures', 'index.html')) }
|
13
14
|
csv_file { File.open(Engine.root.join('spec', 'fixtures', 'data.csv')) }
|
@@ -0,0 +1 @@
|
|
1
|
+
var chart_body = {};
|
@@ -31,7 +31,7 @@ module Pageflow::Chart
|
|
31
31
|
scraped_site = ScrapedSite.new(url: 'http://example.com/foo/index.html')
|
32
32
|
|
33
33
|
expect(scraped_site.attachments_for_export.map(&:name))
|
34
|
-
.to eq(%i[javascript_file stylesheet_file html_file csv_file])
|
34
|
+
.to eq(%i[javascript_file javascript_body_file stylesheet_file html_file csv_file])
|
35
35
|
end
|
36
36
|
|
37
37
|
describe '#publish!' do
|
@@ -19,27 +19,67 @@ module Pageflow
|
|
19
19
|
expect(scraper.html).to include('contents')
|
20
20
|
end
|
21
21
|
|
22
|
-
it '
|
22
|
+
it 'filters blacklisted selectors' do
|
23
23
|
html = <<-HTML
|
24
24
|
<!DOCTYPE html>
|
25
25
|
<html>
|
26
26
|
<head>
|
27
|
-
<
|
28
|
-
<script type="text/javascript" src="/other.js"></script>
|
27
|
+
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
|
29
28
|
</head>
|
30
29
|
<body>
|
30
|
+
<div id="bad" class="noscript"></div>
|
31
|
+
<div id="good"></div>
|
31
32
|
</body>
|
32
33
|
</html>
|
33
34
|
HTML
|
34
|
-
scraper = Scraper.new(html)
|
35
|
+
scraper = Scraper.new(html, selector_blacklist: ['body .noscript'])
|
35
36
|
|
36
|
-
expect(HtmlFragment.new(scraper.html)).
|
37
|
-
expect(HtmlFragment.new(scraper.html)).
|
37
|
+
expect(HtmlFragment.new(scraper.html)).to have_tag('body #good')
|
38
|
+
expect(HtmlFragment.new(scraper.html)).not_to have_tag('body #bad')
|
38
39
|
end
|
39
40
|
|
40
|
-
|
41
|
+
describe 'stylesheets in head' do
|
42
|
+
it 'combines link tags in head' do
|
43
|
+
html = <<-HTML
|
44
|
+
<!DOCTYPE html>
|
45
|
+
<html>
|
46
|
+
<head>
|
47
|
+
<link rel="stylesheet" type="text/css" href="/some.css">
|
48
|
+
<link rel="stylesheet" type="text/css" href="/other.css">
|
49
|
+
</head>
|
50
|
+
<body>
|
51
|
+
</body>
|
52
|
+
</html>
|
53
|
+
HTML
|
54
|
+
scraper = Scraper.new(html)
|
55
|
+
|
56
|
+
expect(HtmlFragment.new(scraper.html)).not_to have_tag('head link[href="/some.css"]')
|
57
|
+
expect(HtmlFragment.new(scraper.html)).to have_tag('head link[href="all.css"]')
|
58
|
+
end
|
59
|
+
end
|
60
|
+
|
61
|
+
describe 'scripts in head' do
|
62
|
+
it 'combines script tags in head' do
|
63
|
+
html = <<-HTML
|
64
|
+
<!DOCTYPE html>
|
65
|
+
<html>
|
66
|
+
<head>
|
67
|
+
<script type="text/javascript" src="/some.js"></script>
|
68
|
+
<script type="text/javascript" src="/other.js"></script>
|
69
|
+
</head>
|
70
|
+
<body>
|
71
|
+
</body>
|
72
|
+
</html>
|
73
|
+
HTML
|
74
|
+
scraper = Scraper.new(html)
|
75
|
+
|
76
|
+
expect(HtmlFragment.new(scraper.html)).not_to have_tag('head script[src="/some.js"]')
|
77
|
+
expect(HtmlFragment.new(scraper.html)).to have_tag('head script[src="all.js"]')
|
78
|
+
end
|
79
|
+
|
80
|
+
it 'inserts script tag at position of first script src tag to keep position ' \
|
41
81
|
'between inline scripts' do
|
42
|
-
|
82
|
+
html = <<-HTML
|
43
83
|
<!DOCTYPE html>
|
44
84
|
<html>
|
45
85
|
<head>
|
@@ -55,122 +95,220 @@ module Pageflow
|
|
55
95
|
<body>
|
56
96
|
</body>
|
57
97
|
</html>
|
58
|
-
|
59
|
-
|
98
|
+
HTML
|
99
|
+
scraper = Scraper.new(html)
|
60
100
|
|
61
|
-
|
101
|
+
fragment = HtmlFragment.new(scraper.html)
|
62
102
|
|
63
|
-
|
64
|
-
|
65
|
-
|
103
|
+
expect(fragment).to have_tags_in_order('head script#setup',
|
104
|
+
'head script[src="all.js"]',
|
105
|
+
'head script#usage')
|
106
|
+
end
|
66
107
|
end
|
67
108
|
|
68
|
-
|
69
|
-
|
109
|
+
describe 'scripts in body' do
|
110
|
+
it 'combines script tags in body' do
|
111
|
+
html = <<-HTML
|
70
112
|
<!DOCTYPE html>
|
71
113
|
<html>
|
72
114
|
<head>
|
73
|
-
<link rel="stylesheet" type="text/css" href="/some.css">
|
74
|
-
<link rel="stylesheet" type="text/css" href="/other.css">
|
75
115
|
</head>
|
76
116
|
<body>
|
117
|
+
<script type="text/javascript" src="/some.js"></script>
|
118
|
+
<script type="text/javascript" src="/other.js"></script>
|
77
119
|
</body>
|
78
120
|
</html>
|
79
|
-
|
80
|
-
|
121
|
+
HTML
|
122
|
+
scraper = Scraper.new(html)
|
81
123
|
|
82
|
-
|
83
|
-
|
84
|
-
|
124
|
+
expect(HtmlFragment.new(scraper.html)).not_to have_tag('body script[src="/some.js"]')
|
125
|
+
expect(HtmlFragment.new(scraper.html)).to have_tag('body script[src="all_body.js"]')
|
126
|
+
end
|
85
127
|
|
86
|
-
|
87
|
-
|
128
|
+
it 'inserts script tag at position of first script src tag to keep position ' \
|
129
|
+
'between inline scripts' do
|
130
|
+
html = <<-HTML
|
88
131
|
<!DOCTYPE html>
|
89
132
|
<html>
|
90
133
|
<head>
|
91
|
-
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
|
92
134
|
</head>
|
93
135
|
<body>
|
94
|
-
<script id="
|
95
|
-
|
136
|
+
<script id="setup">
|
137
|
+
// Some setup required for scripts below to execute
|
138
|
+
</script>
|
139
|
+
<script type="text/javascript" src="/some.js"></script>
|
140
|
+
<script type="text/javascript" src="/other.js"></script>
|
141
|
+
<script id="usage">
|
142
|
+
// Some script using stuff loading above
|
143
|
+
</script>
|
96
144
|
</body>
|
97
145
|
</html>
|
98
|
-
|
99
|
-
|
146
|
+
HTML
|
147
|
+
scraper = Scraper.new(html)
|
148
|
+
|
149
|
+
fragment = HtmlFragment.new(scraper.html)
|
150
|
+
|
151
|
+
expect(fragment).to have_tags_in_order('body script#setup',
|
152
|
+
'body script[src="all_body.js"]',
|
153
|
+
'body script#usage')
|
154
|
+
end
|
100
155
|
|
101
|
-
|
102
|
-
|
156
|
+
it 'filters blacklisted inline scripts' do
|
157
|
+
html = <<-HTML
|
158
|
+
<!DOCTYPE html>
|
159
|
+
<html>
|
160
|
+
<head>
|
161
|
+
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
|
162
|
+
</head>
|
163
|
+
<body>
|
164
|
+
<script id="good">window.ok = true;</script>
|
165
|
+
<script id="bad">alert();</script>
|
166
|
+
</body>
|
167
|
+
</html>
|
168
|
+
HTML
|
169
|
+
scraper = Scraper.new(html, inline_script_blacklist: [/alert/])
|
170
|
+
|
171
|
+
expect(HtmlFragment.new(scraper.html)).to have_tag('body script#good')
|
172
|
+
expect(HtmlFragment.new(scraper.html)).not_to have_tag('body script#bad')
|
173
|
+
end
|
103
174
|
end
|
175
|
+
end
|
104
176
|
|
105
|
-
|
106
|
-
|
177
|
+
describe '#javascript_urls' do
|
178
|
+
describe 'scripts in head' do
|
179
|
+
it 'returns list of urls to javascript files' do
|
180
|
+
html = <<-HTML
|
107
181
|
<!DOCTYPE html>
|
108
182
|
<html>
|
109
183
|
<head>
|
110
|
-
<
|
184
|
+
<script type="text/javascript" src="/some.js"></script>
|
185
|
+
<script type="text/javascript" src="/other.js"></script>
|
111
186
|
</head>
|
112
187
|
<body>
|
113
|
-
<div id="bad" class="noscript"></div>
|
114
|
-
<div id="good"></div>
|
115
188
|
</body>
|
116
189
|
</html>
|
117
|
-
|
118
|
-
|
190
|
+
HTML
|
191
|
+
scraper = Scraper.new(html)
|
119
192
|
|
120
|
-
|
121
|
-
|
122
|
-
end
|
123
|
-
end
|
193
|
+
expect(scraper.javascript_urls_in_head).to eq(['/some.js', '/other.js'])
|
194
|
+
end
|
124
195
|
|
125
|
-
|
126
|
-
|
127
|
-
html = <<-HTML
|
196
|
+
it 'filters by blacklist' do
|
197
|
+
html = <<-HTML
|
128
198
|
<!DOCTYPE html>
|
129
199
|
<html>
|
130
200
|
<head>
|
131
201
|
<script type="text/javascript" src="/some.js"></script>
|
132
|
-
<script type="text/javascript" src="/
|
202
|
+
<script type="text/javascript" src="http://example.com/piwik.js"></script>
|
133
203
|
</head>
|
134
204
|
<body>
|
135
205
|
</body>
|
136
206
|
</html>
|
137
|
-
|
138
|
-
|
207
|
+
HTML
|
208
|
+
scraper = Scraper.new(html, head_script_blacklist: [/piwik/])
|
209
|
+
|
210
|
+
expect(scraper.javascript_urls_in_head).to eq(['/some.js'])
|
211
|
+
end
|
139
212
|
|
140
|
-
|
213
|
+
it 'ignores inline scripts in head' do
|
214
|
+
html = <<-HTML
|
215
|
+
<!DOCTYPE html>
|
216
|
+
<html>
|
217
|
+
<head>
|
218
|
+
<script type="text/javascript"></script>
|
219
|
+
</head>
|
220
|
+
<body>
|
221
|
+
</body>
|
222
|
+
</html>
|
223
|
+
HTML
|
224
|
+
scraper = Scraper.new(html)
|
225
|
+
|
226
|
+
expect(scraper.javascript_urls_in_head).to eq([])
|
227
|
+
end
|
228
|
+
|
229
|
+
it 'ignores scripts in body' do
|
230
|
+
html = <<-HTML
|
231
|
+
<!DOCTYPE html>
|
232
|
+
<html>
|
233
|
+
<head>
|
234
|
+
</head>
|
235
|
+
<body>
|
236
|
+
<script type="text/javascript" src="/some.js"></script>
|
237
|
+
</body>
|
238
|
+
</html>
|
239
|
+
HTML
|
240
|
+
scraper = Scraper.new(html)
|
241
|
+
|
242
|
+
expect(scraper.javascript_urls_in_head).to eq([])
|
243
|
+
end
|
141
244
|
end
|
142
245
|
|
143
|
-
|
144
|
-
|
246
|
+
describe 'scripts in body' do
|
247
|
+
it 'ignores scripts in head' do
|
248
|
+
html = <<-HTML
|
145
249
|
<!DOCTYPE html>
|
146
250
|
<html>
|
147
251
|
<head>
|
148
252
|
<script type="text/javascript" src="/some.js"></script>
|
149
|
-
<script type="text/javascript" src="http://example.com/piwik.js"></script>
|
150
253
|
</head>
|
151
254
|
<body>
|
152
255
|
</body>
|
153
256
|
</html>
|
154
|
-
|
155
|
-
|
257
|
+
HTML
|
258
|
+
scraper = Scraper.new(html)
|
156
259
|
|
157
|
-
|
158
|
-
|
260
|
+
expect(scraper.javascript_urls_in_body).to eq([])
|
261
|
+
end
|
159
262
|
|
160
|
-
|
161
|
-
|
263
|
+
it 'returns list of urls to javascript files' do
|
264
|
+
html = <<-HTML
|
162
265
|
<!DOCTYPE html>
|
163
266
|
<html>
|
164
267
|
<head>
|
165
|
-
<script type="text/javascript"></script>
|
166
268
|
</head>
|
167
269
|
<body>
|
270
|
+
<script type="text/javascript" src="/some.js"></script>
|
271
|
+
<script type="text/javascript" src="/other.js"></script>
|
168
272
|
</body>
|
169
273
|
</html>
|
170
|
-
|
171
|
-
|
274
|
+
HTML
|
275
|
+
scraper = Scraper.new(html)
|
276
|
+
|
277
|
+
expect(scraper.javascript_urls_in_body).to eq(['/some.js', '/other.js'])
|
278
|
+
end
|
279
|
+
|
280
|
+
it 'filters by blacklist' do
|
281
|
+
html = <<-HTML
|
282
|
+
<!DOCTYPE html>
|
283
|
+
<html>
|
284
|
+
<head>
|
285
|
+
</head>
|
286
|
+
<body>
|
287
|
+
<script type="text/javascript" src="/some.js"></script>
|
288
|
+
<script type="text/javascript" src="http://example.com/piwik.js"></script>
|
289
|
+
</body>
|
290
|
+
</html>
|
291
|
+
HTML
|
292
|
+
scraper = Scraper.new(html, body_script_blacklist: [/piwik/])
|
293
|
+
|
294
|
+
expect(scraper.javascript_urls_in_body).to eq(['/some.js'])
|
295
|
+
end
|
296
|
+
|
297
|
+
it 'ignores inline scripts in body' do
|
298
|
+
html = <<-HTML
|
299
|
+
<!DOCTYPE html>
|
300
|
+
<html>
|
301
|
+
<head>
|
302
|
+
</head>
|
303
|
+
<body>
|
304
|
+
<script type="text/javascript"></script>
|
305
|
+
</body>
|
306
|
+
</html>
|
307
|
+
HTML
|
308
|
+
scraper = Scraper.new(html)
|
172
309
|
|
173
|
-
|
310
|
+
expect(scraper.javascript_urls_in_body).to eq([])
|
311
|
+
end
|
174
312
|
end
|
175
313
|
end
|
176
314
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pageflow-chart
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.2.0
|
4
|
+
version: 2.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Codevise Solutions Ltd
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2020-05-11 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: pageflow
|
@@ -209,6 +209,7 @@ files:
|
|
209
209
|
- db/migrate/20160211085234_add_use_custom_theme_to_scraped_sites.rb
|
210
210
|
- db/migrate/20190531141820_add_file_attributes_to_scraped_sites.rb
|
211
211
|
- db/migrate/20190531145431_insert_file_usages_for_scraped_sites.rb
|
212
|
+
- db/migrate/20200507141608_add_javascript_body_attachment_to_scraped_site.rb
|
212
213
|
- lib/generators/pageflow_chart/install/install_generator.rb
|
213
214
|
- lib/pageflow/chart.rb
|
214
215
|
- lib/pageflow/chart/configuration.rb
|
@@ -221,6 +222,7 @@ files:
|
|
221
222
|
- spec/factories/scraped_sites.rb
|
222
223
|
- spec/fixtures/all.css
|
223
224
|
- spec/fixtures/all.js
|
225
|
+
- spec/fixtures/all_body.js
|
224
226
|
- spec/fixtures/data.csv
|
225
227
|
- spec/fixtures/datawrapper.html
|
226
228
|
- spec/fixtures/index.html
|
@@ -266,6 +268,7 @@ test_files:
|
|
266
268
|
- spec/factories/scraped_sites.rb
|
267
269
|
- spec/fixtures/all.css
|
268
270
|
- spec/fixtures/all.js
|
271
|
+
- spec/fixtures/all_body.js
|
269
272
|
- spec/fixtures/data.csv
|
270
273
|
- spec/fixtures/datawrapper.html
|
271
274
|
- spec/fixtures/index.html
|