pageflow-chart 2.2.0 → 2.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -8
- data/app/jobs/pageflow/chart/scrape_site_job.rb +10 -3
- data/app/models/pageflow/chart/scraped_site.rb +3 -1
- data/db/migrate/20200507141608_add_javascript_body_attachment_to_scraped_site.rb +5 -0
- data/lib/pageflow/chart/configuration.rb +2 -0
- data/lib/pageflow/chart/scraper.rb +25 -13
- data/lib/pageflow/chart/version.rb +1 -1
- data/spec/factories/scraped_sites.rb +1 -0
- data/spec/fixtures/all_body.js +1 -0
- data/spec/models/pageflow/chart/scraped_site_spec.rb +1 -1
- data/spec/pageflow/chart/scraper_spec.rb +201 -63
- metadata +5 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: cf67574bc3fc0e11ee66f37634eeab5b73ff6105ef6c281b022b24fad99fd854
|
4
|
+
data.tar.gz: 21cad9381bcc4c3cc312726b8b4b3b2e8c8a3f4b4f8ef54255848d348ed31a90
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 0f778a73ae92747c4a43e80e8443edd74dc0ecd5056278195b623182108c905b3abef2f48833fb64e7b49d58b9f64fffffe7eaf854c9fb61a896bf57c37c4205
|
7
|
+
data.tar.gz: 15ec876b421dc8ba87d71ec9dc88d007cff39ec3658ce44e8c2bf702d75be6f1130506b8daec5d4ed0ebad453bf3726088d7b7aa2b8ea9ce645f6dc809206dc8
|
data/CHANGELOG.md
CHANGED
@@ -1,16 +1,14 @@
|
|
1
1
|
# CHANGELOG
|
2
2
|
|
3
|
-
### Version 2.
|
3
|
+
### Version 2.3.0
|
4
4
|
|
5
|
-
|
5
|
+
2020-05-11
|
6
6
|
|
7
|
-
[Compare changes](https://github.com/codevise/pageflow-chart/compare/2-
|
7
|
+
[Compare changes](https://github.com/codevise/pageflow-chart/compare/2-2-stable...v2.3.0)
|
8
8
|
|
9
|
-
|
10
|
-
|
11
|
-
- Turn scraped site into file type. Install migrations.
|
12
|
-
([#55](https://github.com/codevise/pageflow-chart/pull/55))
|
9
|
+
- Support Datawrapper charts with script tags in body
|
10
|
+
([#56](https://github.com/codevise/pageflow-chart/pull/56))
|
13
11
|
|
14
12
|
See
|
15
|
-
[2-
|
13
|
+
[2-2-stable branch](https://github.com/codevise/pageflow-chart/blob/2-2-stable/CHANGELOG.md)
|
16
14
|
for previous changes.
|
@@ -17,11 +17,18 @@ module Pageflow
|
|
17
17
|
content_type: 'text/html'
|
18
18
|
)
|
19
19
|
|
20
|
-
downloader.load_all(scraper.
|
20
|
+
downloader.load_all(scraper.javascript_urls_in_head,
|
21
21
|
extension: '.js',
|
22
22
|
before_each: begin_try_catch,
|
23
|
-
after_each: end_try_catch) do |
|
24
|
-
scraped_site.javascript_file =
|
23
|
+
after_each: end_try_catch) do |javascript_head_file|
|
24
|
+
scraped_site.javascript_file = javascript_head_file
|
25
|
+
end
|
26
|
+
|
27
|
+
downloader.load_all(scraper.javascript_urls_in_body,
|
28
|
+
extension: '.js',
|
29
|
+
before_each: begin_try_catch,
|
30
|
+
after_each: end_try_catch) do |javascript_body_file|
|
31
|
+
scraped_site.javascript_body_file = javascript_body_file
|
25
32
|
end
|
26
33
|
|
27
34
|
downloader.load_all(scraper.stylesheet_urls,
|
@@ -4,11 +4,13 @@ module Pageflow
|
|
4
4
|
include Pageflow::ReusableFile
|
5
5
|
|
6
6
|
has_attached_file :javascript_file, Chart.config.paperclip_options(extension: 'js')
|
7
|
+
has_attached_file :javascript_body_file, Chart.config.paperclip_options(basename: 'all_body', extension: 'js')
|
7
8
|
has_attached_file :stylesheet_file, Chart.config.paperclip_options(extension: 'css')
|
8
9
|
has_attached_file :html_file, Chart.config.paperclip_options(extension: 'html')
|
9
10
|
has_attached_file :csv_file, Chart.config.paperclip_options(basename: 'data', extension: 'csv')
|
10
11
|
|
11
12
|
do_not_validate_attachment_file_type(:javascript_file)
|
13
|
+
do_not_validate_attachment_file_type(:javascript_body_file)
|
12
14
|
do_not_validate_attachment_file_type(:stylesheet_file)
|
13
15
|
do_not_validate_attachment_file_type(:html_file)
|
14
16
|
do_not_validate_attachment_file_type(:csv_file)
|
@@ -85,7 +87,7 @@ module Pageflow
|
|
85
87
|
end
|
86
88
|
|
87
89
|
def attachments_for_export
|
88
|
-
[javascript_file, stylesheet_file, html_file, csv_file]
|
90
|
+
[javascript_file, javascript_body_file, stylesheet_file, html_file, csv_file]
|
89
91
|
end
|
90
92
|
end
|
91
93
|
end
|
@@ -22,6 +22,7 @@ module Pageflow
|
|
22
22
|
#
|
23
23
|
# @param [Hash] opts
|
24
24
|
# @option opts [Array<Regexp>] :head_script_blacklist Script tags in page head are ignored if they match any of this list of regexes.
|
25
|
+
# @option opts [Array<Regexp>] :body_script_blacklist Script tags in page body are ignored if they match any of this list of regexes.
|
25
26
|
# @option opts [Array<Regexp>] :inline_script_blacklist Inline script tags are ignored if they match any of this list of regexes.
|
26
27
|
# @option opts [Array<String>] :selector_blacklist HTML-elements matched by selectors in this list will not be scraped.
|
27
28
|
# @return [Hash]
|
@@ -55,6 +56,7 @@ module Pageflow
|
|
55
56
|
def initialize
|
56
57
|
@scraper_options = {
|
57
58
|
head_script_blacklist: [/piwik/],
|
59
|
+
body_script_blacklist: [/piwik/],
|
58
60
|
inline_script_blacklist: [/piwik/],
|
59
61
|
selector_blacklist: ['body .noscript']
|
60
62
|
}
|
@@ -3,7 +3,11 @@ require 'nokogiri'
|
|
3
3
|
module Pageflow
|
4
4
|
module Chart
|
5
5
|
class Scraper
|
6
|
-
attr_reader :document,
|
6
|
+
attr_reader :document,
|
7
|
+
:options,
|
8
|
+
:javascript_urls_in_head,
|
9
|
+
:javascript_urls_in_body,
|
10
|
+
:stylesheet_urls
|
7
11
|
|
8
12
|
def initialize(html, options = {})
|
9
13
|
@document = Nokogiri::HTML(html)
|
@@ -23,14 +27,21 @@ module Pageflow
|
|
23
27
|
private
|
24
28
|
|
25
29
|
def parse
|
26
|
-
parse_javascript_urls
|
30
|
+
parse_javascript_urls(:head)
|
31
|
+
parse_javascript_urls(:body)
|
27
32
|
parse_stylesheet_urls
|
28
33
|
end
|
29
34
|
|
30
|
-
def parse_javascript_urls
|
31
|
-
|
35
|
+
def parse_javascript_urls(container)
|
36
|
+
script_tags = filtered_script_tags_in(container).map do |tag|
|
32
37
|
tag[:src]
|
33
38
|
end
|
39
|
+
|
40
|
+
if container.eql?(:head)
|
41
|
+
@javascript_urls_in_head = script_tags
|
42
|
+
else
|
43
|
+
@javascript_urls_in_body = script_tags
|
44
|
+
end
|
34
45
|
end
|
35
46
|
|
36
47
|
def parse_stylesheet_urls
|
@@ -42,7 +53,8 @@ module Pageflow
|
|
42
53
|
def rewrite
|
43
54
|
filter_inline_scripts
|
44
55
|
filter_by_selectors
|
45
|
-
|
56
|
+
combine_script_tags_in(:head)
|
57
|
+
combine_script_tags_in(:body)
|
46
58
|
combine_css_link_tags
|
47
59
|
end
|
48
60
|
|
@@ -66,12 +78,12 @@ module Pageflow
|
|
66
78
|
end
|
67
79
|
end
|
68
80
|
|
69
|
-
def
|
70
|
-
script_tags_to_remove =
|
81
|
+
def combine_script_tags_in(container)
|
82
|
+
script_tags_to_remove = script_src_tags_in(container)
|
71
83
|
return if script_tags_to_remove.empty?
|
72
84
|
|
73
85
|
all_script_src_tag = Nokogiri::XML::Node.new('script', document)
|
74
|
-
all_script_src_tag[:src] = 'all.js'
|
86
|
+
all_script_src_tag[:src] = container.eql?(:head) ? 'all.js' : 'all_body.js'
|
75
87
|
all_script_src_tag[:type] = 'text/javascript'
|
76
88
|
|
77
89
|
script_tags_to_remove
|
@@ -91,16 +103,16 @@ module Pageflow
|
|
91
103
|
document.at_css('head') << all_css_link_tag
|
92
104
|
end
|
93
105
|
|
94
|
-
def
|
95
|
-
|
96
|
-
options.fetch(
|
106
|
+
def filtered_script_tags_in(container)
|
107
|
+
script_src_tags_in(container).reject do |tag|
|
108
|
+
options.fetch("#{container}_script_blacklist".to_sym, []).any? do |regexp|
|
97
109
|
tag[:src] =~ regexp
|
98
110
|
end
|
99
111
|
end
|
100
112
|
end
|
101
113
|
|
102
|
-
def
|
103
|
-
document.css(
|
114
|
+
def script_src_tags_in(container)
|
115
|
+
document.css("#{container} script[src]")
|
104
116
|
end
|
105
117
|
|
106
118
|
def css_link_tags
|
@@ -8,6 +8,7 @@ module Pageflow
|
|
8
8
|
state { 'processed' }
|
9
9
|
|
10
10
|
javascript_file { File.open(Engine.root.join('spec', 'fixtures', 'all.js')) }
|
11
|
+
javascript_body_file { File.open(Engine.root.join('spec', 'fixtures', 'all_body.js')) }
|
11
12
|
stylesheet_file { File.open(Engine.root.join('spec', 'fixtures', 'all.css')) }
|
12
13
|
html_file { File.open(Engine.root.join('spec', 'fixtures', 'index.html')) }
|
13
14
|
csv_file { File.open(Engine.root.join('spec', 'fixtures', 'data.csv')) }
|
@@ -0,0 +1 @@
|
|
1
|
+
var chart_body = {};
|
@@ -31,7 +31,7 @@ module Pageflow::Chart
|
|
31
31
|
scraped_site = ScrapedSite.new(url: 'http://example.com/foo/index.html')
|
32
32
|
|
33
33
|
expect(scraped_site.attachments_for_export.map(&:name))
|
34
|
-
.to eq(%i[javascript_file stylesheet_file html_file csv_file])
|
34
|
+
.to eq(%i[javascript_file javascript_body_file stylesheet_file html_file csv_file])
|
35
35
|
end
|
36
36
|
|
37
37
|
describe '#publish!' do
|
@@ -19,27 +19,67 @@ module Pageflow
|
|
19
19
|
expect(scraper.html).to include('contents')
|
20
20
|
end
|
21
21
|
|
22
|
-
it '
|
22
|
+
it 'filters blacklisted selectors' do
|
23
23
|
html = <<-HTML
|
24
24
|
<!DOCTYPE html>
|
25
25
|
<html>
|
26
26
|
<head>
|
27
|
-
<
|
28
|
-
<script type="text/javascript" src="/other.js"></script>
|
27
|
+
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
|
29
28
|
</head>
|
30
29
|
<body>
|
30
|
+
<div id="bad" class="noscript"></div>
|
31
|
+
<div id="good"></div>
|
31
32
|
</body>
|
32
33
|
</html>
|
33
34
|
HTML
|
34
|
-
scraper = Scraper.new(html)
|
35
|
+
scraper = Scraper.new(html, selector_blacklist: ['body .noscript'])
|
35
36
|
|
36
|
-
expect(HtmlFragment.new(scraper.html)).
|
37
|
-
expect(HtmlFragment.new(scraper.html)).
|
37
|
+
expect(HtmlFragment.new(scraper.html)).to have_tag('body #good')
|
38
|
+
expect(HtmlFragment.new(scraper.html)).not_to have_tag('body #bad')
|
38
39
|
end
|
39
40
|
|
40
|
-
|
41
|
+
describe 'stylesheets in head' do
|
42
|
+
it 'combines link tags in head' do
|
43
|
+
html = <<-HTML
|
44
|
+
<!DOCTYPE html>
|
45
|
+
<html>
|
46
|
+
<head>
|
47
|
+
<link rel="stylesheet" type="text/css" href="/some.css">
|
48
|
+
<link rel="stylesheet" type="text/css" href="/other.css">
|
49
|
+
</head>
|
50
|
+
<body>
|
51
|
+
</body>
|
52
|
+
</html>
|
53
|
+
HTML
|
54
|
+
scraper = Scraper.new(html)
|
55
|
+
|
56
|
+
expect(HtmlFragment.new(scraper.html)).not_to have_tag('head link[href="/some.css"]')
|
57
|
+
expect(HtmlFragment.new(scraper.html)).to have_tag('head link[href="all.css"]')
|
58
|
+
end
|
59
|
+
end
|
60
|
+
|
61
|
+
describe 'scripts in head' do
|
62
|
+
it 'combines script tags in head' do
|
63
|
+
html = <<-HTML
|
64
|
+
<!DOCTYPE html>
|
65
|
+
<html>
|
66
|
+
<head>
|
67
|
+
<script type="text/javascript" src="/some.js"></script>
|
68
|
+
<script type="text/javascript" src="/other.js"></script>
|
69
|
+
</head>
|
70
|
+
<body>
|
71
|
+
</body>
|
72
|
+
</html>
|
73
|
+
HTML
|
74
|
+
scraper = Scraper.new(html)
|
75
|
+
|
76
|
+
expect(HtmlFragment.new(scraper.html)).not_to have_tag('head script[src="/some.js"]')
|
77
|
+
expect(HtmlFragment.new(scraper.html)).to have_tag('head script[src="all.js"]')
|
78
|
+
end
|
79
|
+
|
80
|
+
it 'inserts script tag at position of first script src tag to keep position ' \
|
41
81
|
'between inline scripts' do
|
42
|
-
|
82
|
+
html = <<-HTML
|
43
83
|
<!DOCTYPE html>
|
44
84
|
<html>
|
45
85
|
<head>
|
@@ -55,122 +95,220 @@ module Pageflow
|
|
55
95
|
<body>
|
56
96
|
</body>
|
57
97
|
</html>
|
58
|
-
|
59
|
-
|
98
|
+
HTML
|
99
|
+
scraper = Scraper.new(html)
|
60
100
|
|
61
|
-
|
101
|
+
fragment = HtmlFragment.new(scraper.html)
|
62
102
|
|
63
|
-
|
64
|
-
|
65
|
-
|
103
|
+
expect(fragment).to have_tags_in_order('head script#setup',
|
104
|
+
'head script[src="all.js"]',
|
105
|
+
'head script#usage')
|
106
|
+
end
|
66
107
|
end
|
67
108
|
|
68
|
-
|
69
|
-
|
109
|
+
describe 'scripts in body' do
|
110
|
+
it 'combines script tags in body' do
|
111
|
+
html = <<-HTML
|
70
112
|
<!DOCTYPE html>
|
71
113
|
<html>
|
72
114
|
<head>
|
73
|
-
<link rel="stylesheet" type="text/css" href="/some.css">
|
74
|
-
<link rel="stylesheet" type="text/css" href="/other.css">
|
75
115
|
</head>
|
76
116
|
<body>
|
117
|
+
<script type="text/javascript" src="/some.js"></script>
|
118
|
+
<script type="text/javascript" src="/other.js"></script>
|
77
119
|
</body>
|
78
120
|
</html>
|
79
|
-
|
80
|
-
|
121
|
+
HTML
|
122
|
+
scraper = Scraper.new(html)
|
81
123
|
|
82
|
-
|
83
|
-
|
84
|
-
|
124
|
+
expect(HtmlFragment.new(scraper.html)).not_to have_tag('body script[src="/some.js"]')
|
125
|
+
expect(HtmlFragment.new(scraper.html)).to have_tag('body script[src="all_body.js"]')
|
126
|
+
end
|
85
127
|
|
86
|
-
|
87
|
-
|
128
|
+
it 'inserts script tag at position of first script src tag to keep position ' \
|
129
|
+
'between inline scripts' do
|
130
|
+
html = <<-HTML
|
88
131
|
<!DOCTYPE html>
|
89
132
|
<html>
|
90
133
|
<head>
|
91
|
-
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
|
92
134
|
</head>
|
93
135
|
<body>
|
94
|
-
<script id="
|
95
|
-
|
136
|
+
<script id="setup">
|
137
|
+
// Some setup required for scripts below to execute
|
138
|
+
</script>
|
139
|
+
<script type="text/javascript" src="/some.js"></script>
|
140
|
+
<script type="text/javascript" src="/other.js"></script>
|
141
|
+
<script id="usage">
|
142
|
+
// Some script using stuff loading above
|
143
|
+
</script>
|
96
144
|
</body>
|
97
145
|
</html>
|
98
|
-
|
99
|
-
|
146
|
+
HTML
|
147
|
+
scraper = Scraper.new(html)
|
148
|
+
|
149
|
+
fragment = HtmlFragment.new(scraper.html)
|
150
|
+
|
151
|
+
expect(fragment).to have_tags_in_order('body script#setup',
|
152
|
+
'body script[src="all_body.js"]',
|
153
|
+
'body script#usage')
|
154
|
+
end
|
100
155
|
|
101
|
-
|
102
|
-
|
156
|
+
it 'filters blacklisted inline scripts' do
|
157
|
+
html = <<-HTML
|
158
|
+
<!DOCTYPE html>
|
159
|
+
<html>
|
160
|
+
<head>
|
161
|
+
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
|
162
|
+
</head>
|
163
|
+
<body>
|
164
|
+
<script id="good">window.ok = true;</script>
|
165
|
+
<script id="bad">alert();</script>
|
166
|
+
</body>
|
167
|
+
</html>
|
168
|
+
HTML
|
169
|
+
scraper = Scraper.new(html, inline_script_blacklist: [/alert/])
|
170
|
+
|
171
|
+
expect(HtmlFragment.new(scraper.html)).to have_tag('body script#good')
|
172
|
+
expect(HtmlFragment.new(scraper.html)).not_to have_tag('body script#bad')
|
173
|
+
end
|
103
174
|
end
|
175
|
+
end
|
104
176
|
|
105
|
-
|
106
|
-
|
177
|
+
describe '#javascript_urls' do
|
178
|
+
describe 'scripts in head' do
|
179
|
+
it 'returns list of urls to javascript files' do
|
180
|
+
html = <<-HTML
|
107
181
|
<!DOCTYPE html>
|
108
182
|
<html>
|
109
183
|
<head>
|
110
|
-
<
|
184
|
+
<script type="text/javascript" src="/some.js"></script>
|
185
|
+
<script type="text/javascript" src="/other.js"></script>
|
111
186
|
</head>
|
112
187
|
<body>
|
113
|
-
<div id="bad" class="noscript"></div>
|
114
|
-
<div id="good"></div>
|
115
188
|
</body>
|
116
189
|
</html>
|
117
|
-
|
118
|
-
|
190
|
+
HTML
|
191
|
+
scraper = Scraper.new(html)
|
119
192
|
|
120
|
-
|
121
|
-
|
122
|
-
end
|
123
|
-
end
|
193
|
+
expect(scraper.javascript_urls_in_head).to eq(['/some.js', '/other.js'])
|
194
|
+
end
|
124
195
|
|
125
|
-
|
126
|
-
|
127
|
-
html = <<-HTML
|
196
|
+
it 'filters by blacklist' do
|
197
|
+
html = <<-HTML
|
128
198
|
<!DOCTYPE html>
|
129
199
|
<html>
|
130
200
|
<head>
|
131
201
|
<script type="text/javascript" src="/some.js"></script>
|
132
|
-
<script type="text/javascript" src="/
|
202
|
+
<script type="text/javascript" src="http://example.com/piwik.js"></script>
|
133
203
|
</head>
|
134
204
|
<body>
|
135
205
|
</body>
|
136
206
|
</html>
|
137
|
-
|
138
|
-
|
207
|
+
HTML
|
208
|
+
scraper = Scraper.new(html, head_script_blacklist: [/piwik/])
|
209
|
+
|
210
|
+
expect(scraper.javascript_urls_in_head).to eq(['/some.js'])
|
211
|
+
end
|
139
212
|
|
140
|
-
|
213
|
+
it 'ignores inline scripts in head' do
|
214
|
+
html = <<-HTML
|
215
|
+
<!DOCTYPE html>
|
216
|
+
<html>
|
217
|
+
<head>
|
218
|
+
<script type="text/javascript"></script>
|
219
|
+
</head>
|
220
|
+
<body>
|
221
|
+
</body>
|
222
|
+
</html>
|
223
|
+
HTML
|
224
|
+
scraper = Scraper.new(html)
|
225
|
+
|
226
|
+
expect(scraper.javascript_urls_in_head).to eq([])
|
227
|
+
end
|
228
|
+
|
229
|
+
it 'ignores scripts in body' do
|
230
|
+
html = <<-HTML
|
231
|
+
<!DOCTYPE html>
|
232
|
+
<html>
|
233
|
+
<head>
|
234
|
+
</head>
|
235
|
+
<body>
|
236
|
+
<script type="text/javascript" src="/some.js"></script>
|
237
|
+
</body>
|
238
|
+
</html>
|
239
|
+
HTML
|
240
|
+
scraper = Scraper.new(html)
|
241
|
+
|
242
|
+
expect(scraper.javascript_urls_in_head).to eq([])
|
243
|
+
end
|
141
244
|
end
|
142
245
|
|
143
|
-
|
144
|
-
|
246
|
+
describe 'scripts in body' do
|
247
|
+
it 'ignores scripts in head' do
|
248
|
+
html = <<-HTML
|
145
249
|
<!DOCTYPE html>
|
146
250
|
<html>
|
147
251
|
<head>
|
148
252
|
<script type="text/javascript" src="/some.js"></script>
|
149
|
-
<script type="text/javascript" src="http://example.com/piwik.js"></script>
|
150
253
|
</head>
|
151
254
|
<body>
|
152
255
|
</body>
|
153
256
|
</html>
|
154
|
-
|
155
|
-
|
257
|
+
HTML
|
258
|
+
scraper = Scraper.new(html)
|
156
259
|
|
157
|
-
|
158
|
-
|
260
|
+
expect(scraper.javascript_urls_in_body).to eq([])
|
261
|
+
end
|
159
262
|
|
160
|
-
|
161
|
-
|
263
|
+
it 'returns list of urls to javascript files' do
|
264
|
+
html = <<-HTML
|
162
265
|
<!DOCTYPE html>
|
163
266
|
<html>
|
164
267
|
<head>
|
165
|
-
<script type="text/javascript"></script>
|
166
268
|
</head>
|
167
269
|
<body>
|
270
|
+
<script type="text/javascript" src="/some.js"></script>
|
271
|
+
<script type="text/javascript" src="/other.js"></script>
|
168
272
|
</body>
|
169
273
|
</html>
|
170
|
-
|
171
|
-
|
274
|
+
HTML
|
275
|
+
scraper = Scraper.new(html)
|
276
|
+
|
277
|
+
expect(scraper.javascript_urls_in_body).to eq(['/some.js', '/other.js'])
|
278
|
+
end
|
279
|
+
|
280
|
+
it 'filters by blacklist' do
|
281
|
+
html = <<-HTML
|
282
|
+
<!DOCTYPE html>
|
283
|
+
<html>
|
284
|
+
<head>
|
285
|
+
</head>
|
286
|
+
<body>
|
287
|
+
<script type="text/javascript" src="/some.js"></script>
|
288
|
+
<script type="text/javascript" src="http://example.com/piwik.js"></script>
|
289
|
+
</body>
|
290
|
+
</html>
|
291
|
+
HTML
|
292
|
+
scraper = Scraper.new(html, body_script_blacklist: [/piwik/])
|
293
|
+
|
294
|
+
expect(scraper.javascript_urls_in_body).to eq(['/some.js'])
|
295
|
+
end
|
296
|
+
|
297
|
+
it 'ignores inline scripts in body' do
|
298
|
+
html = <<-HTML
|
299
|
+
<!DOCTYPE html>
|
300
|
+
<html>
|
301
|
+
<head>
|
302
|
+
</head>
|
303
|
+
<body>
|
304
|
+
<script type="text/javascript"></script>
|
305
|
+
</body>
|
306
|
+
</html>
|
307
|
+
HTML
|
308
|
+
scraper = Scraper.new(html)
|
172
309
|
|
173
|
-
|
310
|
+
expect(scraper.javascript_urls_in_body).to eq([])
|
311
|
+
end
|
174
312
|
end
|
175
313
|
end
|
176
314
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pageflow-chart
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.
|
4
|
+
version: 2.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Codevise Solutions Ltd
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2020-05-11 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: pageflow
|
@@ -209,6 +209,7 @@ files:
|
|
209
209
|
- db/migrate/20160211085234_add_use_custom_theme_to_scraped_sites.rb
|
210
210
|
- db/migrate/20190531141820_add_file_attributes_to_scraped_sites.rb
|
211
211
|
- db/migrate/20190531145431_insert_file_usages_for_scraped_sites.rb
|
212
|
+
- db/migrate/20200507141608_add_javascript_body_attachment_to_scraped_site.rb
|
212
213
|
- lib/generators/pageflow_chart/install/install_generator.rb
|
213
214
|
- lib/pageflow/chart.rb
|
214
215
|
- lib/pageflow/chart/configuration.rb
|
@@ -221,6 +222,7 @@ files:
|
|
221
222
|
- spec/factories/scraped_sites.rb
|
222
223
|
- spec/fixtures/all.css
|
223
224
|
- spec/fixtures/all.js
|
225
|
+
- spec/fixtures/all_body.js
|
224
226
|
- spec/fixtures/data.csv
|
225
227
|
- spec/fixtures/datawrapper.html
|
226
228
|
- spec/fixtures/index.html
|
@@ -266,6 +268,7 @@ test_files:
|
|
266
268
|
- spec/factories/scraped_sites.rb
|
267
269
|
- spec/fixtures/all.css
|
268
270
|
- spec/fixtures/all.js
|
271
|
+
- spec/fixtures/all_body.js
|
269
272
|
- spec/fixtures/data.csv
|
270
273
|
- spec/fixtures/datawrapper.html
|
271
274
|
- spec/fixtures/index.html
|