safe_image 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +193 -0
- data/README.md +166 -11
- data/lib/safe_image/discourse_compat.rb +2 -13
- data/lib/safe_image/ico.rb +1 -1
- data/lib/safe_image/native.rb +24 -15
- data/lib/safe_image/optimizer.rb +79 -4
- data/lib/safe_image/processor.rb +1 -1
- data/lib/safe_image/remote.rb +174 -8
- data/lib/safe_image/runner.rb +9 -1
- data/lib/safe_image/sandbox.rb +41 -14
- data/lib/safe_image/svg_css.rb +314 -0
- data/lib/safe_image/svg_metadata.rb +179 -53
- data/lib/safe_image/svg_sanitizer.rb +524 -43
- data/lib/safe_image/version.rb +1 -1
- data/lib/safe_image/zygote.rb +619 -0
- data/lib/safe_image.rb +12 -0
- metadata +18 -2
|
@@ -0,0 +1,314 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "strscan"
|
|
4
|
+
|
|
5
|
+
module SafeImage
  # Allowlist sanitizer for the small CSS subset SVG files legitimately use
  # (Inkscape writes style="" attributes, Illustrator writes class rules in
  # <style> elements). Output is constructed from validated tokens, never
  # echoed from the input, so nothing outside the vocabulary below can appear
  # in it. Anything the grammar does not recognise — escapes, quotes,
  # at-rules, comments, unknown properties or functions, non-fragment url() —
  # drops the declaration rather than being decoded.
  module SvgCss
    NO_FUNCTIONS = [].freeze
    URL_FUNCTIONS = %w[url].freeze
    COLOR_FUNCTIONS = %w[rgb rgba hsl hsla].freeze
    PAINT_FUNCTIONS = (COLOR_FUNCTIONS + URL_FUNCTIONS).freeze
    # Lowercase: function names are matched case-insensitively (the scanned
    # name is downcased before lookup) and always emitted in lowercase.
    TRANSFORM_FUNCTIONS = %w[matrix translate translatex translatey scale rotate skewx skewy].freeze

    # A CSS property is allowed exactly when its presentation-attribute twin is
    # in SvgSanitizer::ALLOWED_ATTRIBUTES (a test asserts this); the value is the
    # list of functions that property's values may call. The only functions that
    # reach a resource are url(...) — and those are constrained to same-document
    # #fragment references — so the URL surface is exactly the url()-bearing
    # rows below (paint servers, clip/mask, and markers), all fragment-only.
    ALLOWED_PROPERTIES = {
      # Paint and color. url() = paint-server reference (gradient/pattern).
      "fill" => PAINT_FUNCTIONS,
      "stroke" => PAINT_FUNCTIONS,
      "stop-color" => COLOR_FUNCTIONS,
      "color" => COLOR_FUNCTIONS, # currentColor resolution; Inkscape/Illustrator
      "opacity" => NO_FUNCTIONS,
      "fill-opacity" => NO_FUNCTIONS,
      "stroke-opacity" => NO_FUNCTIONS,
      "stop-opacity" => NO_FUNCTIONS,
      "fill-rule" => NO_FUNCTIONS,
      "clip-rule" => NO_FUNCTIONS,
      # Stroke geometry. stroke-dasharray/dashoffset are how every dashed line
      # is expressed; vector-effect:non-scaling-stroke is an Inkscape default.
      "stroke-width" => NO_FUNCTIONS,
      "stroke-linecap" => NO_FUNCTIONS,
      "stroke-linejoin" => NO_FUNCTIONS,
      "stroke-miterlimit" => NO_FUNCTIONS,
      "stroke-dasharray" => NO_FUNCTIONS,
      "stroke-dashoffset" => NO_FUNCTIONS,
      "vector-effect" => NO_FUNCTIONS,
      # Geometry references. url() = clipPath/mask/marker element by #id.
      "clip-path" => URL_FUNCTIONS,
      "mask" => URL_FUNCTIONS,
      "marker" => URL_FUNCTIONS,
      "marker-start" => URL_FUNCTIONS,
      "marker-mid" => URL_FUNCTIONS,
      "marker-end" => URL_FUNCTIONS,
      "transform" => TRANSFORM_FUNCTIONS,
      # Visibility and rendering hints. Keywords/numbers only.
      "display" => NO_FUNCTIONS,
      "visibility" => NO_FUNCTIONS,
      "overflow" => NO_FUNCTIONS,
      "paint-order" => NO_FUNCTIONS,
      "mix-blend-mode" => NO_FUNCTIONS,
      "isolation" => NO_FUNCTIONS,
      "shape-rendering" => NO_FUNCTIONS,
      "image-rendering" => NO_FUNCTIONS,
      "color-interpolation" => NO_FUNCTIONS,
      # Text. Keywords/lengths only; no url() anywhere in text styling.
      "font-family" => NO_FUNCTIONS,
      "font-size" => NO_FUNCTIONS,
      "font-weight" => NO_FUNCTIONS,
      "font-style" => NO_FUNCTIONS,
      "font-variant" => NO_FUNCTIONS,
      "font-stretch" => NO_FUNCTIONS,
      "text-anchor" => NO_FUNCTIONS,
      "text-decoration" => NO_FUNCTIONS,
      "letter-spacing" => NO_FUNCTIONS,
      "word-spacing" => NO_FUNCTIONS,
      "dominant-baseline" => NO_FUNCTIONS,
      "baseline-shift" => NO_FUNCTIONS,
      "writing-mode" => NO_FUNCTIONS,
      "direction" => NO_FUNCTIONS
    }.freeze

    # Every character a declaration may contain. The exclusions do the work:
    # no backslash (CSS escapes re-form tokens after any pattern check), no
    # quotes (no strings, so no string-URL functions), no "@" (no at-rules),
    # no "*" — which keeps CSS comments (/* */) structurally impossible even
    # though "/" is admitted for modern color alpha (rgb(R G B / A)). A "/"
    # survives to output only via the color-function parser below; anywhere
    # else it fails tokenisation and the declaration drops.
    #
    # "_" is admitted so that url(#frag) references to ids containing an
    # underscore (FRAGMENT allows \w, and the selector grammar allows "_" too)
    # are not dropped before they reach the tokeniser. No property name,
    # IDENT, NUMBER, HEX_COLOR or FUNCTION_NAME pattern matches "_", so it can
    # reach output only inside a validated fragment; anywhere else it fails
    # tokenisation and the declaration drops.
    DECLARATION_CHARSET = %r{\A[a-zA-Z0-9_ #%+.,()/:-]*\z}.freeze

    # CSS priority flag. Parsed out of the value structurally and re-emitted
    # canonically, so "!" never enters the value tokeniser.
    IMPORTANT = /\s*!\s*important\s*\z/i.freeze

    # url() may only reference the current document; same fragment shape
    # SvgSanitizer.dangerous_value? accepts.
    FRAGMENT = /#[A-Za-z][\w.-]*/.freeze

    HEX_COLOR = /#\h{3,8}/.freeze
    NUMBER = /[+-]?(?:\d+\.\d+|\.\d+|\d+)(?:%|px|pt|pc|em|rem|ex|ch|cm|mm|in|deg|rad|grad|turn)?/.freeze
    IDENT = /[a-zA-Z][a-zA-Z0-9-]*/.freeze
    FUNCTION_NAME = /[a-zA-Z][a-zA-Z0-9-]*(?=\()/.freeze
    SEPARATOR = /\s*,\s*|\s+/.freeze

    # Selectors: type/.class/#id compounds joined by descendant or child
    # combinators, in comma lists. The charset shuts out pseudo-classes (:),
    # attribute selectors ([), and everything the declaration charset already
    # excludes.
    SELECTOR_CHARSET = /\A[a-zA-Z0-9_ #.,*>-]*\z/.freeze
    SELECTOR_TYPE = /\*|[a-zA-Z][a-zA-Z0-9-]*/.freeze
    SELECTOR_QUALIFIER = /[.#][A-Za-z_][\w-]*/.freeze
    COMBINATOR = /\s*>\s*|\s+/.freeze

    module_function

    # Prefixes a bare id/fragment name with the document namespace, unless it is
    # already prefixed (so re-sanitising is a fixed point). A nil namespace is a
    # no-op, preserving the document-scoped (non-inline) behaviour.
    def apply_namespace(namespace, name)
      return name if namespace.nil? || name.start_with?("#{namespace}-")

      "#{namespace}-#{name}"
    end

    # Sanitizes a style="" declaration list. When a namespace is given, url(#id)
    # references are rewritten to url(#namespace-id) so they keep pointing at the
    # namespaced ids in the same document. Returns the constructed declaration
    # list, or nil when no declaration survives.
    def sanitize_declarations(css, namespace: nil)
      declarations = normalize(css).split(";").filter_map { |declaration| sanitize_declaration(declaration, namespace) }
      declarations.empty? ? nil : declarations.join(";")
    end

    # Sanitizes a <style> element's stylesheet. The structure scan accepts
    # only a flat list of "selectors { declarations }" rules — at-rules,
    # nested blocks, and unbalanced braces fail the whole sheet closed rather
    # than surviving in degraded form. Within a well-formed sheet, individual
    # selectors and declarations drop independently. Returns the constructed
    # stylesheet, or nil when no rule survives.
    def sanitize_stylesheet(css, namespace: nil)
      css = normalize(css)
      # At-rules (@import, @media, @font-face, @keyframes, ...) have no place in
      # the allowed subset, and "@" appears nowhere else in it. Rejecting it up
      # front fails the whole element closed — the rule-by-rule scan below would
      # otherwise drop only the at-rule and keep later rules, which contradicts
      # the documented guarantee and risks parser edge cases at the boundary.
      return nil if css.include?("@")

      scanner = StringScanner.new(css)
      rules = []
      until scanner.eos?
        scanner.skip(/\s+/)
        break if scanner.eos?

        # A rule is "<no braces>{<no braces>}"; anything else is structural
        # damage and rejects the whole sheet.
        selectors_src = scanner.scan(/[^{}]+/)
        return nil unless selectors_src && scanner.skip(/\{/)

        body = scanner.scan(/[^{}]*/)
        return nil unless scanner.skip(/\}/)

        selectors = sanitize_selectors(selectors_src, namespace)
        declarations = sanitize_declarations(body, namespace: namespace)
        rules << "#{selectors}{#{declarations}}" if selectors && declarations
      end
      rules.empty? ? nil : rules.join
    end

    # Sanitizes a comma-separated selector list. Each selector is validated
    # independently; failures are dropped. Returns the surviving selectors
    # joined by ",", or nil when none survive.
    def sanitize_selectors(src, namespace = nil)
      selectors = src.split(",").filter_map { |selector| sanitize_selector(selector.strip, namespace) }
      selectors.empty? ? nil : selectors.join(",")
    end

    # Sanitizes one (already stripped) complex selector: compounds joined by
    # descendant (" ") or child (">") combinators. The output is rebuilt from
    # the matched pieces with canonical combinators and, when a namespace is
    # given, namespaced qualifiers and a leading scope class. Returns nil when
    # the selector is empty, contains a character outside SELECTOR_CHARSET, or
    # does not parse completely.
    def sanitize_selector(selector, namespace = nil)
      return nil if selector.empty? || !selector.match?(SELECTOR_CHARSET)

      scanner = StringScanner.new(selector)
      out = +""
      loop do
        compound = scan_compound(scanner, namespace)
        return nil unless compound

        out << compound
        break if scanner.eos?

        # A combinator must be followed by another compound; a trailing one
        # (or unparseable residue) rejects the selector.
        combinator = scanner.scan(COMBINATOR)
        return nil if combinator.nil? || scanner.eos?

        out << (combinator.include?(">") ? ">" : " ")
      end
      scope_selector(namespace, out)
    end

    # Confines a selector to the namespaced document by anchoring it under the
    # root's scope class, so a preserved <style> cannot reach a host page if the
    # SVG is inlined. Universal/type/class selectors that would otherwise match
    # host elements only match descendants of this document's root. Idempotent:
    # an already-scoped selector is returned unchanged.
    def scope_selector(namespace, selector)
      return selector if namespace.nil?

      scope = ".#{namespace}-scope"
      selector.start_with?("#{scope} ") ? selector : "#{scope} #{selector}"
    end

    # Scans one compound selector at the scanner's position: an optional
    # type/universal selector followed by any number of .class/#id qualifiers
    # (namespaced when a namespace is given). Returns the rebuilt compound, or
    # nil when nothing matched.
    def scan_compound(scanner, namespace = nil)
      out = +""
      if (type = scanner.scan(SELECTOR_TYPE))
        out << type
      end
      while (qualifier = scanner.scan(SELECTOR_QUALIFIER))
        out << namespace_qualifier(namespace, qualifier)
      end
      out.empty? ? nil : out
    end

    # Prefix an id (#x) or class (.x) selector's name with the namespace so it
    # matches only this document's namespaced ids/classes, never a host element's.
    # Type and universal selectors are left alone (they are confined by the root
    # scope class instead). Idempotent via apply_namespace.
    def namespace_qualifier(namespace, qualifier)
      return qualifier if namespace.nil?

      "#{qualifier[0]}#{apply_namespace(namespace, qualifier[1..])}"
    end

    # Coerces the input to a String and folds tab, CR, LF, form feed and
    # vertical tab to plain spaces so the charsets only need to admit " ".
    def normalize(css)
      css.to_s.tr("\t\r\n\f\v", " ")
    end

    # Sanitizes a single "property: value [!important]" declaration. Returns
    # the rebuilt "property:value" (with "!important" appended when the flag
    # was present), or nil when the declaration has a character outside
    # DECLARATION_CHARSET, has no ":", names a property that is not in
    # ALLOWED_PROPERTIES, or has a value that does not tokenise completely.
    def sanitize_declaration(declaration, namespace = nil)
      important = ""
      if declaration.match?(IMPORTANT)
        declaration = declaration.sub(IMPORTANT, "")
        important = "!important"
      end
      return nil unless declaration.match?(DECLARATION_CHARSET)

      property, value = declaration.split(":", 2)
      return nil if value.nil?

      property = property.strip.downcase
      functions = ALLOWED_PROPERTIES[property]
      return nil unless functions

      value = sanitize_value(value.strip, functions, namespace)
      value && "#{property}:#{value}#{important}"
    end

    # A value is a comma- or space-separated list of tokens: keywords, numbers
    # with an allowlisted unit, hex colors, and allowlisted functions. The
    # output is reassembled from the matched tokens.
    def sanitize_value(value, functions, namespace = nil)
      scanner = StringScanner.new(value)
      out = +""
      loop do
        token = scan_token(scanner, functions, namespace)
        return nil unless token

        out << token
        break if scanner.eos?

        # Anything after a token must be a separator followed by another
        # token; a trailing separator or unparseable residue drops the value.
        separator = scanner.scan(SEPARATOR)
        return nil if separator.nil? || scanner.eos?

        out << (separator.include?(",") ? "," : " ")
      end
      out
    end

    # Scans one value token at the scanner's position. A name immediately
    # followed by "(" is handed to scan_function (and is rejected there when
    # the property does not allow it); otherwise a hex color, number or
    # identifier is tried in that order. Returns the token text or nil.
    def scan_token(scanner, functions, namespace = nil)
      if (name = scanner.scan(FUNCTION_NAME))
        scan_function(scanner, name.downcase, functions, namespace)
      else
        scanner.scan(HEX_COLOR) || scanner.scan(NUMBER) || scanner.scan(IDENT)
      end
    end

    # The scanner is positioned at the "(". url() takes exactly one
    # same-document fragment; every other allowed function takes numbers.
    # Returns the rebuilt function call, or nil when the function is not in
    # +functions+ or its arguments do not parse.
    def scan_function(scanner, name, functions, namespace = nil)
      return nil unless functions.include?(name)

      scanner.skip(/\(\s*/)
      if name == "url"
        fragment = scanner.scan(FRAGMENT)
        return nil unless fragment && scanner.skip(/\s*\)/)

        "url(##{apply_namespace(namespace, fragment[1..])})"
      else
        args = []
        loop do
          arg = scanner.scan(NUMBER)
          return nil unless arg

          args << arg
          break if scanner.skip(/\s*\)/)
          # Modern color syntax: rgb(R G B / A). The slash separates the alpha,
          # and is accepted only here, only for color functions — the single
          # path by which "/" can reach output. Re-emitted in the space form
          # (mixing commas with "/" is invalid CSS), so the result is valid.
          if COLOR_FUNCTIONS.include?(name) && scanner.skip(%r{\s*/\s*})
            alpha = scanner.scan(NUMBER)
            return nil unless alpha && scanner.skip(/\s*\)/)

            return "#{name}(#{args.join(" ")} / #{alpha})"
          end
          return nil unless scanner.skip(SEPARATOR)
        end
        "#{name}(#{args.join(",")})"
      end
    end
  end
end
|
|
@@ -1,8 +1,6 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
require "pathname"
|
|
4
|
-
require "rexml/document"
|
|
5
|
-
require "rexml/parsers/pullparser"
|
|
6
4
|
|
|
7
5
|
module SafeImage
|
|
8
6
|
module SvgMetadata
|
|
@@ -14,10 +12,55 @@ module SafeImage
|
|
|
14
12
|
MAX_SVG_ATTRIBUTES = 50_000
|
|
15
13
|
MAX_SVG_DIMENSION = 100_000
|
|
16
14
|
MAX_SVG_PIXELS = 100_000_000
|
|
15
|
+
# Upper bound on the render tree the document instantiates. The caps above
|
|
16
|
+
# bound the *source* document, but several allowlisted features replicate
|
|
17
|
+
# referenced content at render time, so a small source can cost a consumer
|
|
18
|
+
# (browser/rasterizer) orders of magnitude more work:
|
|
19
|
+
# * <use href="#id"> deep-copies its target subtree — a chain of doubling
|
|
20
|
+
# groups fans a few dozen nodes into billions ("use bomb"), and a cyclic
|
|
21
|
+
# reference expands forever.
|
|
22
|
+
# * a <marker> is drawn once per vertex of every path/line/polyline/polygon
|
|
23
|
+
# that references it, so (vertex count) x (marker subtree size) draws — a
|
|
24
|
+
# dense `d` (~200k vertices fit in 1 MB) times a non-trivial marker is a
|
|
25
|
+
# linear-but-huge "draw bomb" no node/byte/element cap can see.
|
|
26
|
+
# SvgSanitizer charges both against this single budget over the sanitized
|
|
27
|
+
# tree (renderer-free static accounting) and rejects when it is exceeded.
|
|
28
|
+
MAX_SVG_RENDER_UNITS = 1_000_000
|
|
17
29
|
|
|
18
30
|
LENGTH_PATTERN = /\A\s*([+]?(?:\d+(?:\.\d+)?|\.\d+))(?:px)?\s*\z/i.freeze
|
|
19
31
|
VIEWBOX_SPLIT = /[\s,]+/.freeze
|
|
20
32
|
|
|
33
|
+
# Byte-order marks for the multi-byte encodings whose ASCII characters our
|
|
34
|
+
# byte-level scans below cannot see through. XML mandates a BOM for UTF-16
|
|
35
|
+
# and UTF-32, so a document in one of these encodings either carries a BOM
|
|
36
|
+
# here or contains NUL bytes for its ASCII characters (caught separately).
|
|
37
|
+
# Order matters: the UTF-32 LE mark begins with the UTF-16 LE mark.
|
|
38
|
+
NON_UTF8_BOMS = [
|
|
39
|
+
"\xFF\xFE\x00\x00".b, # UTF-32 LE
|
|
40
|
+
"\x00\x00\xFE\xFF".b, # UTF-32 BE
|
|
41
|
+
"\xFF\xFE".b, # UTF-16 LE
|
|
42
|
+
"\xFE\xFF".b # UTF-16 BE
|
|
43
|
+
].freeze
|
|
44
|
+
|
|
45
|
+
UTF8_BOM = "\xEF\xBB\xBF".b.freeze
|
|
46
|
+
# Declared encodings we accept: UTF-8/ASCII plus the single-byte,
|
|
47
|
+
# ASCII-transparent legacy charsets (ISO-8859-*, Windows-125x). Their bytes
|
|
48
|
+
# below 0x80 decode to identical ASCII, so the byte scans below see the same
|
|
49
|
+
# markup any decoder (REXML or a browser) does; and being single-byte, no
|
|
50
|
+
# lead byte can swallow a following quote the way Shift-JIS, GBK, or Big5
|
|
51
|
+
# can. Multi-byte (Shift-JIS, GBK, EUC-*, ISO-2022-*), transforming (UTF-7:
|
|
52
|
+
# "+ADw-" decodes to "<"), and NUL-interleaved (UTF-16/32) encodings are
|
|
53
|
+
# deliberately excluded — they let bytes our ASCII scans cannot see become
|
|
54
|
+
# markup the parser acts on. The shape match alone is not airtight:
|
|
55
|
+
# "utf8" or "windows-1259" fit the pattern yet name no real encoding, so a
|
|
56
|
+
# name must also resolve via Encoding.find to pass — lookalikes fail
|
|
57
|
+
# closed here instead of leaking REXML's bare ArgumentError to the caller.
|
|
58
|
+
SAFE_DECLARED_ENCODING =
|
|
59
|
+
/\A(?:utf-?8|us-ascii|ascii|iso-?8859-?\d{1,2}|(?:windows|cp)-?125\d)\z/i.freeze
|
|
60
|
+
# ASCII-only so it matches the binary buffer; the optional BOM is stripped
|
|
61
|
+
# before matching rather than embedded here (which would make this UTF-8).
|
|
62
|
+
XML_DECL_ENCODING = /\A\s*<\?xml\b[^>]*?\bencoding\s*=\s*["']([^"']+)["']/i.freeze
|
|
63
|
+
|
|
21
64
|
def probe(path, max_pixels: nil, max_bytes: MAX_SVG_BYTES)
|
|
22
65
|
started = Process.clock_gettime(Process::CLOCK_MONOTONIC)
|
|
23
66
|
path = safe_svg_path(path)
|
|
@@ -34,6 +77,14 @@ module SafeImage
|
|
|
34
77
|
# Reads the SVG at +path+ (bounded by +max_bytes+), scans it, and derives the
# document dimensions from the root element's attributes, passing
# +max_pixels+ through to dimensions_from_attributes.
def dimensions(path, max_pixels: nil, max_bytes: MAX_SVG_BYTES)
  _root_name, root_attributes = scan_svg!(read_svg(path, max_bytes: max_bytes))
  dimensions_from_attributes(root_attributes, max_pixels: max_pixels)
end
|
|
82
|
+
|
|
83
|
+
# Computes and validates the document dimensions from the already-scanned
|
|
84
|
+
# root attributes, so a caller that has run scan_svg! does not re-read or
|
|
85
|
+
# re-scan the file. Same width/height-then-viewBox fallback and limits as
|
|
86
|
+
# dimensions above.
|
|
87
|
+
def dimensions_from_attributes(attributes, max_pixels: nil)
|
|
37
88
|
width = parse_length(attributes["width"])
|
|
38
89
|
height = parse_length(attributes["height"])
|
|
39
90
|
|
|
@@ -46,27 +97,12 @@ module SafeImage
|
|
|
46
97
|
validate_dimensions!(width, height, max_pixels: max_pixels)
|
|
47
98
|
end
|
|
48
99
|
|
|
49
|
-
# Builds the full REXML tree. Used only by the SVG sanitizer, which needs to
|
|
50
|
-
# walk and rewrite the document; metadata reads go through the DOM-free
|
|
51
|
-
# streaming path above. The streaming validation runs first so a document
|
|
52
|
-
# that breaches the structural caps is rejected before the tree is built.
|
|
53
|
-
def parse(path, max_bytes: MAX_SVG_BYTES)
|
|
54
|
-
xml = read_svg(path, max_bytes: max_bytes)
|
|
55
|
-
scan_svg!(xml)
|
|
56
|
-
doc = REXML::Document.new(xml)
|
|
57
|
-
raise InvalidImageError, "SVG root required" unless doc.root&.name == "svg"
|
|
58
|
-
|
|
59
|
-
doc
|
|
60
|
-
rescue REXML::ParseException => e
|
|
61
|
-
raise InvalidImageError, "invalid SVG: #{e.message}"
|
|
62
|
-
end
|
|
63
|
-
|
|
64
100
|
def read_svg(path, max_bytes: MAX_SVG_BYTES)
|
|
65
101
|
path = safe_svg_path(path)
|
|
66
102
|
size = File.size(path)
|
|
67
103
|
raise LimitError, "SVG exceeds #{max_bytes} bytes" if size > max_bytes
|
|
68
104
|
|
|
69
|
-
xml = File.binread(path, max_bytes + 1)
|
|
105
|
+
xml = File.binread(path, max_bytes + 1) || "".b
|
|
70
106
|
raise LimitError, "SVG exceeds #{max_bytes} bytes" if xml.bytesize > max_bytes
|
|
71
107
|
reject_unsafe_xml!(xml)
|
|
72
108
|
xml
|
|
@@ -79,10 +115,41 @@ module SafeImage
|
|
|
79
115
|
end
|
|
80
116
|
|
|
81
117
|
# Raises InvalidImageError when the raw document uses an encoding the byte
# scans cannot see through, contains a DOCTYPE, or contains any processing
# instruction other than the XML declaration.
def reject_unsafe_xml!(xml)
  # The two scans below are ASCII byte regexes, so they are only meaningful
  # once the encoding gate has established that the bytes scanned here read
  # as the same markup the XML parser will see. Hence the gate runs first.
  reject_unsafe_encoding!(xml)

  if xml.match?(/<!DOCTYPE/i)
    raise InvalidImageError, "doctype is not allowed in SVG"
  end

  if xml.match?(/<\?(?!xml\s)/i)
    raise InvalidImageError, "XML processing instructions are not allowed in SVG"
  end
end
|
|
85
127
|
|
|
128
|
+
# Raises InvalidImageError unless the document is safe for ASCII byte scans:
# no UTF-16/UTF-32 byte-order mark, no NUL byte, and — if the XML declaration
# names an encoding — a name that both fits SAFE_DECLARED_ENCODING and
# resolves to a real encoding. A document with no declared encoding passes.
def reject_unsafe_encoding!(xml)
  raw = xml.b

  # UTF-16/UTF-32 put NUL bytes between ASCII characters, which would hide
  # "<!DOCTYPE" from the ASCII scans while a parser still decodes it. NUL is
  # not a legal XML 1.0 character anyway, so this also rejects garbage.
  wide_bom = NON_UTF8_BOMS.any? { |mark| raw.start_with?(mark) }
  if wide_bom || raw.include?("\x00".b)
    raise InvalidImageError, "SVG must use a single-byte or UTF-8 encoding"
  end

  # Drop a leading UTF-8 BOM so the declaration regex can anchor at \A.
  raw = raw.byteslice(UTF8_BOM.bytesize..) if raw.start_with?(UTF8_BOM)

  declaration = raw.match(XML_DECL_ENCODING)
  return if declaration.nil?

  declared = declaration[1]
  unless declared.match?(SAFE_DECLARED_ENCODING) && known_encoding?(declared)
    raise InvalidImageError, "unsupported SVG encoding: #{declared}"
  end
end
|
|
145
|
+
|
|
146
|
+
# Returns true when Encoding.find accepts +name+, false when it raises
# ArgumentError (the error Ruby uses for an unknown encoding name).
def known_encoding?(name)
  begin
    Encoding.find(name)
  rescue ArgumentError
    return false
  end
  true
end
|
|
152
|
+
|
|
86
153
|
def parse_length(value)
|
|
87
154
|
value = value.to_s
|
|
88
155
|
match = LENGTH_PATTERN.match(value)
|
|
@@ -119,44 +186,103 @@ module SafeImage
|
|
|
119
186
|
[width.ceil, height.ceil]
|
|
120
187
|
end
|
|
121
188
|
|
|
122
|
-
# Streams the document with a
|
|
123
|
-
# events arrive, so a hostile "millions of tiny
|
|
124
|
-
# rejected at the cap without ever retaining the
|
|
125
|
-
#
|
|
126
|
-
# element's name and
|
|
189
|
+
# Streams the document with a SAX parser, enforcing the structural caps as
|
|
190
|
+
# events arrive (see cap_scanner_class), so a hostile "millions of tiny
|
|
191
|
+
# elements" document is rejected at the cap without ever retaining the
|
|
192
|
+
# multi-million-object DOM a parse-then-validate approach would build.
|
|
193
|
+
# Returns the root element's local name and a localname=>value hash of its
|
|
194
|
+
# attributes, matching the contract dimensions_from_attributes consumes.
|
|
195
|
+
#
|
|
196
|
+
# SAX does NOT raise on malformed XML even with recovery disabled — it
|
|
197
|
+
# reports through the error callback and keeps going — so well-formedness is
|
|
198
|
+
# enforced by recording any reported error and rejecting after the parse.
|
|
199
|
+
# This reproduces the old REXML pull-parser's reject set (unclosed/mismatched
|
|
200
|
+
# tags, trailing junk) and is strictly stricter on multiple root elements,
|
|
201
|
+
# which is a safe direction for a gate.
|
|
127
202
|
def scan_svg!(xml)
  require_nokogiri
  scanner = cap_scanner_class.new
  sax = Nokogiri::XML::SAX::Parser.new(scanner)

  begin
    # Recovery is switched off so malformed markup is not silently repaired.
    # Problems are still delivered to the handler's error callback instead of
    # being raised, which is why parse_error is inspected after the parse.
    sax.parse(xml) { |context| context.recovery = false }
  rescue LimitError, InvalidImageError
    # Raised by our own handler callbacks; pass through untouched.
    raise
  rescue StandardError => error
    # Some inputs make Nokogiri raise directly (e.g. empty input ->
    # "input string cannot be empty"); keep those inside our error hierarchy.
    raise InvalidImageError, "invalid SVG: #{error.message}"
  end

  failure = scanner.parse_error
  raise InvalidImageError, "invalid SVG: #{failure}" if failure
  raise InvalidImageError, "SVG root required" unless scanner.root_name == "svg"

  [scanner.root_name, scanner.root_attributes]
end
|
|
225
|
+
|
|
226
|
+
# Loaded on first SVG use, not at file load: keeping the XML library off the
|
|
227
|
+
# hot path of every non-SVG operation (and every sandbox worker boot) where
|
|
228
|
+
# it would otherwise be paid for nothing.
|
|
229
|
+
def require_nokogiri
  # Deliberate in-method require: the library is loaded only when an SVG is
  # actually scanned.
  require("nokogiri")
end
|
|
232
|
+
|
|
233
|
+
# The SAX cap-enforcement handler, built lazily and memoised the first time
|
|
234
|
+
# an SVG is scanned. It subclasses Nokogiri::XML::SAX::Document, so it cannot
|
|
235
|
+
# be declared at file-load time without forcing nokogiri to load eagerly and
|
|
236
|
+
# defeating the lazy require above. A breached cap raises LimitError straight
|
|
237
|
+
# out of a callback; libxml2 propagates it at the next event boundary, so the
|
|
238
|
+
# parse aborts promptly rather than scanning to the end (verified: rejection
|
|
239
|
+
# time grows far slower than input size).
|
|
240
|
+
def cap_scanner_class
  @cap_scanner_class ||= Class.new(Nokogiri::XML::SAX::Document) do
    attr_reader :root_name, :root_attributes, :parse_error

    def initialize
      super
      # Starts at -1 so the root element sits at depth 0 after the first
      # start-element event.
      @depth = -1
      @elements = 0
      @attributes = 0
      @root_name = nil
      @root_attributes = nil
      @parse_error = nil
    end

    # attrs holds the element's attribute objects (localname/value) and does
    # not include namespace declarations; those arrive separately in ns. Both
    # are charged to the attribute cap so xmlns declarations cannot be used to
    # get around it.
    def start_element_namespace(name, attrs = [], _prefix = nil, _uri = nil, ns = [])
      @depth += 1
      raise LimitError, "SVG nesting exceeds #{MAX_SVG_DEPTH}" if @depth > MAX_SVG_DEPTH

      @elements += 1
      raise LimitError, "SVG has too many elements" if @elements > MAX_SVG_ELEMENTS

      @attributes += attrs.length + ns.length
      raise LimitError, "SVG has too many attributes" if @attributes > MAX_SVG_ATTRIBUTES

      # Only the first element seen is remembered as the root.
      if @root_name.nil?
        @root_name = name
        @root_attributes = attrs.to_h { |attribute| [attribute.localname, attribute.value] }
      end
    end

    def end_element_namespace(_name, _prefix = nil, _uri = nil)
      @depth -= 1
    end

    # Keeps only the first reported error; scan_svg! rejects the document
    # when one is recorded.
    def error(message)
      @parse_error ||= message.to_s.strip
    end

    # Warnings are intentionally ignored.
    def warning(_message); end
  end
end
|
|
161
287
|
end
|
|
162
288
|
end
|