safe_image 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,314 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "strscan"
4
+
5
module SafeImage
  # Allowlist sanitizer for the small CSS subset SVG files legitimately use
  # (Inkscape writes style="" attributes, Illustrator writes class rules in
  # <style> elements). Output is constructed from validated tokens, never
  # echoed from the input, so nothing outside the vocabulary below can appear
  # in it. Anything the grammar does not recognise — escapes, quotes,
  # at-rules, comments, unknown properties or functions, non-fragment url() —
  # drops the declaration rather than being decoded.
  module SvgCss
    NO_FUNCTIONS = [].freeze
    URL_FUNCTIONS = %w[url].freeze
    COLOR_FUNCTIONS = %w[rgb rgba hsl hsla].freeze
    PAINT_FUNCTIONS = (COLOR_FUNCTIONS + URL_FUNCTIONS).freeze
    # Lowercase: function names are matched and emitted case-insensitively.
    TRANSFORM_FUNCTIONS = %w[matrix translate translatex translatey scale rotate skewx skewy].freeze

    # A CSS property is allowed exactly when its presentation-attribute twin is
    # in SvgSanitizer::ALLOWED_ATTRIBUTES (a test asserts this); the value is the
    # list of functions that property's values may call. The only functions that
    # reach a resource are url(...) — and those are constrained to same-document
    # #fragment references — so the URL surface is exactly the url()-bearing
    # rows below (paint servers, clip/mask, and markers), all fragment-only.
    ALLOWED_PROPERTIES = {
      # Paint and color. url() = paint-server reference (gradient/pattern).
      "fill" => PAINT_FUNCTIONS,
      "stroke" => PAINT_FUNCTIONS,
      "stop-color" => COLOR_FUNCTIONS,
      "color" => COLOR_FUNCTIONS, # currentColor resolution; Inkscape/Illustrator
      "opacity" => NO_FUNCTIONS,
      "fill-opacity" => NO_FUNCTIONS,
      "stroke-opacity" => NO_FUNCTIONS,
      "stop-opacity" => NO_FUNCTIONS,
      "fill-rule" => NO_FUNCTIONS,
      "clip-rule" => NO_FUNCTIONS,
      # Stroke geometry. stroke-dasharray/dashoffset are how every dashed line
      # is expressed; vector-effect:non-scaling-stroke is an Inkscape default.
      "stroke-width" => NO_FUNCTIONS,
      "stroke-linecap" => NO_FUNCTIONS,
      "stroke-linejoin" => NO_FUNCTIONS,
      "stroke-miterlimit" => NO_FUNCTIONS,
      "stroke-dasharray" => NO_FUNCTIONS,
      "stroke-dashoffset" => NO_FUNCTIONS,
      "vector-effect" => NO_FUNCTIONS,
      # Geometry references. url() = clipPath/mask/marker element by #id.
      "clip-path" => URL_FUNCTIONS,
      "mask" => URL_FUNCTIONS,
      "marker" => URL_FUNCTIONS,
      "marker-start" => URL_FUNCTIONS,
      "marker-mid" => URL_FUNCTIONS,
      "marker-end" => URL_FUNCTIONS,
      "transform" => TRANSFORM_FUNCTIONS,
      # Visibility and rendering hints. Keywords/numbers only.
      "display" => NO_FUNCTIONS,
      "visibility" => NO_FUNCTIONS,
      "overflow" => NO_FUNCTIONS,
      "paint-order" => NO_FUNCTIONS,
      "mix-blend-mode" => NO_FUNCTIONS,
      "isolation" => NO_FUNCTIONS,
      "shape-rendering" => NO_FUNCTIONS,
      "image-rendering" => NO_FUNCTIONS,
      "color-interpolation" => NO_FUNCTIONS,
      # Text. Keywords/lengths only; no url() anywhere in text styling.
      "font-family" => NO_FUNCTIONS,
      "font-size" => NO_FUNCTIONS,
      "font-weight" => NO_FUNCTIONS,
      "font-style" => NO_FUNCTIONS,
      "font-variant" => NO_FUNCTIONS,
      "font-stretch" => NO_FUNCTIONS,
      "text-anchor" => NO_FUNCTIONS,
      "text-decoration" => NO_FUNCTIONS,
      "letter-spacing" => NO_FUNCTIONS,
      "word-spacing" => NO_FUNCTIONS,
      "dominant-baseline" => NO_FUNCTIONS,
      "baseline-shift" => NO_FUNCTIONS,
      "writing-mode" => NO_FUNCTIONS,
      "direction" => NO_FUNCTIONS
    }.freeze

    # Every character a declaration may contain. The exclusions do the work:
    # no backslash (CSS escapes re-form tokens after any pattern check), no
    # quotes (no strings, so no string-URL functions), no "@" (no at-rules),
    # no "*" — which keeps CSS comments (/* */) structurally impossible even
    # though "/" is admitted for modern color alpha (rgb(R G B / A)). A "/"
    # survives to output only via the color-function parser below; anywhere
    # else it fails tokenisation and the declaration drops.
    #
    # "_" is admitted so that url(#fragment) references to ids containing an
    # underscore (e.g. SVGID_1_) are not dropped before FRAGMENT — which
    # already accepts "_" via \w — gets to validate them. No other value token
    # (IDENT, NUMBER, HEX_COLOR, FUNCTION_NAME) matches an underscore and no
    # allowed property name contains one, so "_" can reach output only inside
    # a validated same-document fragment.
    DECLARATION_CHARSET = %r{\A[a-zA-Z0-9_ #%+.,()/:-]*\z}.freeze

    # CSS priority flag. Parsed out of the value structurally and re-emitted
    # canonically, so "!" never enters the value tokeniser.
    IMPORTANT = /\s*!\s*important\s*\z/i.freeze

    # url() may only reference the current document; same fragment shape
    # SvgSanitizer.dangerous_value? accepts.
    FRAGMENT = /#[A-Za-z][\w.-]*/.freeze

    HEX_COLOR = /#\h{3,8}/.freeze
    NUMBER = /[+-]?(?:\d+\.\d+|\.\d+|\d+)(?:%|px|pt|pc|em|rem|ex|ch|cm|mm|in|deg|rad|grad|turn)?/.freeze
    IDENT = /[a-zA-Z][a-zA-Z0-9-]*/.freeze
    FUNCTION_NAME = /[a-zA-Z][a-zA-Z0-9-]*(?=\()/.freeze
    SEPARATOR = /\s*,\s*|\s+/.freeze

    # Selectors: type/.class/#id compounds joined by descendant or child
    # combinators, in comma lists. The charset shuts out pseudo-classes (:),
    # attribute selectors ([), and everything the declaration charset already
    # excludes.
    SELECTOR_CHARSET = /\A[a-zA-Z0-9_ #.,*>-]*\z/.freeze
    SELECTOR_TYPE = /\*|[a-zA-Z][a-zA-Z0-9-]*/.freeze
    SELECTOR_QUALIFIER = /[.#][A-Za-z_][\w-]*/.freeze
    COMBINATOR = /\s*>\s*|\s+/.freeze

    module_function

    # Prefixes a bare id/fragment name with the document namespace, unless it is
    # already prefixed (so re-sanitising is a fixed point). A nil namespace is a
    # no-op, preserving the document-scoped (non-inline) behaviour.
    def apply_namespace(namespace, name)
      return name if namespace.nil? || name.start_with?("#{namespace}-")

      "#{namespace}-#{name}"
    end

    # Sanitizes a style="" declaration list. When a namespace is given, url(#id)
    # references are rewritten to url(#namespace-id) so they keep pointing at the
    # namespaced ids in the same document. Returns the constructed declaration
    # list, or nil when no declaration survives.
    def sanitize_declarations(css, namespace: nil)
      declarations = normalize(css).split(";").filter_map { |declaration| sanitize_declaration(declaration, namespace) }
      declarations.empty? ? nil : declarations.join(";")
    end

    # Sanitizes a <style> element's stylesheet. The structure scan accepts
    # only a flat list of "selectors { declarations }" rules — at-rules,
    # nested blocks, and unbalanced braces fail the whole sheet closed rather
    # than surviving in degraded form. Within a well-formed sheet, individual
    # selectors and declarations drop independently. Returns the constructed
    # stylesheet, or nil when no rule survives.
    def sanitize_stylesheet(css, namespace: nil)
      css = normalize(css)
      # At-rules (@import, @media, @font-face, @keyframes, ...) have no place in
      # the allowed subset, and "@" appears nowhere else in it. Rejecting it up
      # front fails the whole element closed — the rule-by-rule scan below would
      # otherwise drop only the at-rule and keep later rules, which contradicts
      # the documented guarantee and risks parser edge cases at the boundary.
      return nil if css.include?("@")

      scanner = StringScanner.new(css)
      rules = []
      until scanner.eos?
        scanner.skip(/\s+/)
        break if scanner.eos?

        selectors_src = scanner.scan(/[^{}]+/)
        return nil unless selectors_src && scanner.skip(/\{/)

        body = scanner.scan(/[^{}]*/)
        return nil unless scanner.skip(/\}/)

        selectors = sanitize_selectors(selectors_src, namespace)
        declarations = sanitize_declarations(body, namespace: namespace)
        rules << "#{selectors}{#{declarations}}" if selectors && declarations
      end
      rules.empty? ? nil : rules.join
    end

    # Sanitizes a comma-separated selector list. Each selector is validated
    # independently; returns the surviving selectors joined by ",", or nil
    # when none survives.
    def sanitize_selectors(src, namespace = nil)
      selectors = src.split(",").filter_map { |selector| sanitize_selector(selector.strip, namespace) }
      selectors.empty? ? nil : selectors.join(",")
    end

    # Sanitizes one complex selector: compounds joined by descendant (" ") or
    # child (">") combinators. Returns the rebuilt (and, with a namespace,
    # scoped) selector, or nil if any part falls outside the grammar.
    def sanitize_selector(selector, namespace = nil)
      return nil if selector.empty? || !selector.match?(SELECTOR_CHARSET)

      scanner = StringScanner.new(selector)
      out = +""
      loop do
        compound = scan_compound(scanner, namespace)
        return nil unless compound

        out << compound
        break if scanner.eos?

        combinator = scanner.scan(COMBINATOR)
        # A trailing combinator with nothing after it is malformed.
        return nil if combinator.nil? || scanner.eos?

        out << (combinator.include?(">") ? ">" : " ")
      end
      scope_selector(namespace, out)
    end

    # Confines a selector to the namespaced document by anchoring it under the
    # root's scope class, so a preserved <style> cannot reach a host page if the
    # SVG is inlined. Universal/type/class selectors that would otherwise match
    # host elements only match descendants of this document's root. Idempotent:
    # an already-scoped selector is returned unchanged.
    def scope_selector(namespace, selector)
      return selector if namespace.nil?

      scope = ".#{namespace}-scope"
      selector.start_with?("#{scope} ") ? selector : "#{scope} #{selector}"
    end

    # Scans one compound selector (optional type/universal followed by any
    # number of .class/#id qualifiers) at the scanner position. Returns the
    # rebuilt compound, or nil when nothing matched.
    def scan_compound(scanner, namespace = nil)
      out = +""
      if (type = scanner.scan(SELECTOR_TYPE))
        out << type
      end
      while (qualifier = scanner.scan(SELECTOR_QUALIFIER))
        out << namespace_qualifier(namespace, qualifier)
      end
      out.empty? ? nil : out
    end

    # Prefix an id (#x) or class (.x) selector's name with the namespace so it
    # matches only this document's namespaced ids/classes, never a host element's.
    # Type and universal selectors are left alone (they are confined by the root
    # scope class instead). Idempotent via apply_namespace.
    def namespace_qualifier(namespace, qualifier)
      return qualifier if namespace.nil?

      "#{qualifier[0]}#{apply_namespace(namespace, qualifier[1..])}"
    end

    # Coerces the input to a String and folds every non-space whitespace
    # character to a plain space so the charsets only need to admit " ".
    def normalize(css)
      css.to_s.tr("\t\r\n\f\v", " ")
    end

    # Sanitizes a single "property: value [!important]" declaration. Returns
    # the canonical "property:value" text (with "!important" re-appended when
    # present), or nil when the declaration is not in the allowed subset.
    def sanitize_declaration(declaration, namespace = nil)
      important = ""
      if declaration.match?(IMPORTANT)
        declaration = declaration.sub(IMPORTANT, "")
        important = "!important"
      end
      return nil unless declaration.match?(DECLARATION_CHARSET)

      property, value = declaration.split(":", 2)
      return nil if value.nil?

      property = property.strip.downcase
      functions = ALLOWED_PROPERTIES[property]
      return nil unless functions

      value = sanitize_value(value.strip, functions, namespace)
      value && "#{property}:#{value}#{important}"
    end

    # A value is a comma- or space-separated list of tokens: keywords, numbers
    # with an allowlisted unit, hex colors, and allowlisted functions. The
    # output is reassembled from the matched tokens.
    def sanitize_value(value, functions, namespace = nil)
      scanner = StringScanner.new(value)
      out = +""
      loop do
        token = scan_token(scanner, functions, namespace)
        return nil unless token

        out << token
        break if scanner.eos?

        separator = scanner.scan(SEPARATOR)
        # A trailing separator with no token after it is malformed.
        return nil if separator.nil? || scanner.eos?

        out << (separator.include?(",") ? "," : " ")
      end
      out
    end

    # Scans one value token at the scanner position: an allowlisted function
    # call, a hex color, a number, or a keyword. Returns the token text to
    # emit, or nil when nothing in the vocabulary matches.
    def scan_token(scanner, functions, namespace = nil)
      if (name = scanner.scan(FUNCTION_NAME))
        scan_function(scanner, name.downcase, functions, namespace)
      else
        scanner.scan(HEX_COLOR) || scanner.scan(NUMBER) || scanner.scan(IDENT)
      end
    end

    # The scanner is positioned at the "(". url() takes exactly one
    # same-document fragment; every other allowed function takes numbers.
    def scan_function(scanner, name, functions, namespace = nil)
      return nil unless functions.include?(name)

      scanner.skip(/\(\s*/)
      if name == "url"
        fragment = scanner.scan(FRAGMENT)
        return nil unless fragment && scanner.skip(/\s*\)/)

        "url(##{apply_namespace(namespace, fragment[1..])})"
      else
        args = []
        loop do
          arg = scanner.scan(NUMBER)
          return nil unless arg

          args << arg
          break if scanner.skip(/\s*\)/)
          # Modern color syntax: rgb(R G B / A). The slash separates the alpha,
          # and is accepted only here, only for color functions — the single
          # path by which "/" can reach output. Re-emitted in the space form
          # (mixing commas with "/" is invalid CSS), so the result is valid.
          if COLOR_FUNCTIONS.include?(name) && scanner.skip(%r{\s*/\s*})
            alpha = scanner.scan(NUMBER)
            return nil unless alpha && scanner.skip(/\s*\)/)

            return "#{name}(#{args.join(" ")} / #{alpha})"
          end
          return nil unless scanner.skip(SEPARATOR)
        end
        "#{name}(#{args.join(",")})"
      end
    end
  end
end
@@ -1,8 +1,6 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require "pathname"
4
- require "rexml/document"
5
- require "rexml/parsers/pullparser"
6
4
 
7
5
  module SafeImage
8
6
  module SvgMetadata
@@ -14,10 +12,55 @@ module SafeImage
14
12
  MAX_SVG_ATTRIBUTES = 50_000
15
13
  MAX_SVG_DIMENSION = 100_000
16
14
  MAX_SVG_PIXELS = 100_000_000
15
+ # Upper bound on the render tree the document instantiates. The caps above
16
+ # bound the *source* document, but several allowlisted features replicate
17
+ # referenced content at render time, so a small source can cost a consumer
18
+ # (browser/rasterizer) orders of magnitude more work:
19
+ # * <use href="#id"> deep-copies its target subtree — a chain of doubling
20
+ # groups fans a few dozen nodes into billions ("use bomb"), and a cyclic
21
+ # reference expands forever.
22
+ # * a <marker> is drawn once per vertex of every path/line/polyline/polygon
23
+ # that references it, so (vertex count) x (marker subtree size) draws — a
24
+ # dense `d` (~200k vertices fit in 1 MB) times a non-trivial marker is a
25
+ # linear-but-huge "draw bomb" no node/byte/element cap can see.
26
+ # SvgSanitizer charges both against this single budget over the sanitized
27
+ # tree (renderer-free static accounting) and rejects when it is exceeded.
28
+ MAX_SVG_RENDER_UNITS = 1_000_000
17
29
 
18
30
  LENGTH_PATTERN = /\A\s*([+]?(?:\d+(?:\.\d+)?|\.\d+))(?:px)?\s*\z/i.freeze
19
31
  VIEWBOX_SPLIT = /[\s,]+/.freeze
20
32
 
33
+ # Byte-order marks for the multi-byte encodings whose ASCII characters our
34
+ # byte-level scans below cannot see through. XML mandates a BOM for UTF-16
35
+ # and UTF-32, so a document in one of these encodings either carries a BOM
36
+ # here or contains NUL bytes for its ASCII characters (caught separately).
37
+ # Order matters: the UTF-32 LE mark begins with the UTF-16 LE mark.
38
+ NON_UTF8_BOMS = [
39
+ "\xFF\xFE\x00\x00".b, # UTF-32 LE
40
+ "\x00\x00\xFE\xFF".b, # UTF-32 BE
41
+ "\xFF\xFE".b, # UTF-16 LE
42
+ "\xFE\xFF".b # UTF-16 BE
43
+ ].freeze
44
+
45
+ UTF8_BOM = "\xEF\xBB\xBF".b.freeze
46
+ # Declared encodings we accept: UTF-8/ASCII plus the single-byte,
47
+ # ASCII-transparent legacy charsets (ISO-8859-*, Windows-125x). Their bytes
48
+ # below 0x80 decode to identical ASCII, so the byte scans below see the same
49
+ # markup any decoder (REXML or a browser) does; and being single-byte, no
50
+ # lead byte can swallow a following quote the way Shift-JIS, GBK, or Big5
51
+ # can. Multi-byte (Shift-JIS, GBK, EUC-*, ISO-2022-*), transforming (UTF-7:
52
+ # "+ADw-" decodes to "<"), and NUL-interleaved (UTF-16/32) encodings are
53
+ # deliberately excluded — they let bytes our ASCII scans cannot see become
54
+ # markup the parser acts on. The shape match alone is not airtight:
55
+ # "utf8" or "windows-1259" fit the pattern yet name no real encoding, so a
56
+ # name must also resolve via Encoding.find to pass — lookalikes fail
57
+ # closed here instead of leaking REXML's bare ArgumentError to the caller.
58
+ SAFE_DECLARED_ENCODING =
59
+ /\A(?:utf-?8|us-ascii|ascii|iso-?8859-?\d{1,2}|(?:windows|cp)-?125\d)\z/i.freeze
60
+ # ASCII-only so it matches the binary buffer; the optional BOM is stripped
61
+ # before matching rather than embedded here (which would make this UTF-8).
62
+ XML_DECL_ENCODING = /\A\s*<\?xml\b[^>]*?\bencoding\s*=\s*["']([^"']+)["']/i.freeze
63
+
21
64
  def probe(path, max_pixels: nil, max_bytes: MAX_SVG_BYTES)
22
65
  started = Process.clock_gettime(Process::CLOCK_MONOTONIC)
23
66
  path = safe_svg_path(path)
@@ -34,6 +77,14 @@ module SafeImage
34
77
  def dimensions(path, max_pixels: nil, max_bytes: MAX_SVG_BYTES)
35
78
  xml = read_svg(path, max_bytes: max_bytes)
36
79
  _name, attributes = scan_svg!(xml)
80
+ dimensions_from_attributes(attributes, max_pixels: max_pixels)
81
+ end
82
+
83
+ # Computes and validates the document dimensions from the already-scanned
84
+ # root attributes, so a caller that has run scan_svg! does not re-read or
85
+ # re-scan the file. Same width/height-then-viewBox fallback and limits as
86
+ # dimensions above.
87
+ def dimensions_from_attributes(attributes, max_pixels: nil)
37
88
  width = parse_length(attributes["width"])
38
89
  height = parse_length(attributes["height"])
39
90
 
@@ -46,27 +97,12 @@ module SafeImage
46
97
  validate_dimensions!(width, height, max_pixels: max_pixels)
47
98
  end
48
99
 
49
- # Builds the full REXML tree. Used only by the SVG sanitizer, which needs to
50
- # walk and rewrite the document; metadata reads go through the DOM-free
51
- # streaming path above. The streaming validation runs first so a document
52
- # that breaches the structural caps is rejected before the tree is built.
53
- def parse(path, max_bytes: MAX_SVG_BYTES)
54
- xml = read_svg(path, max_bytes: max_bytes)
55
- scan_svg!(xml)
56
- doc = REXML::Document.new(xml)
57
- raise InvalidImageError, "SVG root required" unless doc.root&.name == "svg"
58
-
59
- doc
60
- rescue REXML::ParseException => e
61
- raise InvalidImageError, "invalid SVG: #{e.message}"
62
- end
63
-
64
100
  def read_svg(path, max_bytes: MAX_SVG_BYTES)
65
101
  path = safe_svg_path(path)
66
102
  size = File.size(path)
67
103
  raise LimitError, "SVG exceeds #{max_bytes} bytes" if size > max_bytes
68
104
 
69
- xml = File.binread(path, max_bytes + 1)
105
+ xml = File.binread(path, max_bytes + 1) || "".b
70
106
  raise LimitError, "SVG exceeds #{max_bytes} bytes" if xml.bytesize > max_bytes
71
107
  reject_unsafe_xml!(xml)
72
108
  xml
@@ -79,10 +115,41 @@ module SafeImage
79
115
  end
80
116
 
81
117
  def reject_unsafe_xml!(xml)
118
+ # The DOCTYPE/PI scans below are ASCII byte regexes; they only see what
119
+ # they expect when the bytes we scan decode to the same markup the XML
120
+ # parser sees. That holds for UTF-8 and single-byte ASCII-transparent
121
+ # charsets but not for UTF-16/32 or multi-byte/transforming encodings, so
122
+ # reject those first.
123
+ reject_unsafe_encoding!(xml)
82
124
  raise InvalidImageError, "doctype is not allowed in SVG" if xml.match?(/<!DOCTYPE/i)
83
125
  raise InvalidImageError, "XML processing instructions are not allowed in SVG" if xml.match?(/<\?(?!xml\s)/i)
84
126
  end
85
127
 
128
+ def reject_unsafe_encoding!(xml)
129
+ bytes = xml.b
130
+ # UTF-16/UTF-32 interleave NUL bytes between ASCII characters, hiding
131
+ # "<!DOCTYPE" from the ASCII scans while the XML parser still decodes and
132
+ # honours it. (NUL is invalid in XML 1.0 regardless, so this also rejects
133
+ # garbage.)
134
+ if NON_UTF8_BOMS.any? { |bom| bytes.start_with?(bom) } || bytes.include?("\x00".b)
135
+ raise InvalidImageError, "SVG must use a single-byte or UTF-8 encoding"
136
+ end
137
+
138
+ bytes = bytes.byteslice(UTF8_BOM.bytesize..) if bytes.start_with?(UTF8_BOM)
139
+ match = bytes.match(XML_DECL_ENCODING)
140
+ return unless match
141
+ return if match[1].match?(SAFE_DECLARED_ENCODING) && known_encoding?(match[1])
142
+
143
+ raise InvalidImageError, "unsupported SVG encoding: #{match[1]}"
144
+ end
145
+
146
+ def known_encoding?(name)
147
+ Encoding.find(name)
148
+ true
149
+ rescue ArgumentError
150
+ false
151
+ end
152
+
86
153
  def parse_length(value)
87
154
  value = value.to_s
88
155
  match = LENGTH_PATTERN.match(value)
@@ -119,44 +186,103 @@ module SafeImage
119
186
  [width.ceil, height.ceil]
120
187
  end
121
188
 
122
- # Streams the document with a pull parser, enforcing the structural caps as
123
- # events arrive, so a hostile "millions of tiny elements" document is
124
- # rejected at the cap without ever retaining the multi-million-object DOM
125
- # that a parse-then-validate approach would build first. Returns the root
126
- # element's name and its attributes hash.
189
+ # Streams the document with a SAX parser, enforcing the structural caps as
190
+ # events arrive (see cap_scanner_class), so a hostile "millions of tiny
191
+ # elements" document is rejected at the cap without ever retaining the
192
+ # multi-million-object DOM a parse-then-validate approach would build.
193
+ # Returns the root element's local name and a localname=>value hash of its
194
+ # attributes, matching the contract dimensions_from_attributes consumes.
195
+ #
196
+ # SAX does NOT raise on malformed XML even with recovery disabled — it
197
+ # reports through the error callback and keeps going — so well-formedness is
198
+ # enforced by recording any reported error and rejecting after the parse.
199
+ # This reproduces the old REXML pull-parser's reject set (unclosed/mismatched
200
+ # tags, trailing junk) and is strictly stricter on multiple root elements,
201
+ # which is a safe direction for a gate.
127
202
  def scan_svg!(xml)
128
- parser = REXML::Parsers::PullParser.new(xml)
129
- depth = -1
130
- elements = 0
131
- attributes = 0
132
- root_name = nil
133
- root_attributes = nil
134
-
135
- while parser.has_next?
136
- event = parser.pull
137
- if event.start_element?
138
- depth += 1
139
- raise LimitError, "SVG nesting exceeds #{MAX_SVG_DEPTH}" if depth > MAX_SVG_DEPTH
140
-
141
- elements += 1
142
- raise LimitError, "SVG has too many elements" if elements > MAX_SVG_ELEMENTS
143
-
144
- attributes += event[1].size
145
- raise LimitError, "SVG has too many attributes" if attributes > MAX_SVG_ATTRIBUTES
146
-
147
- if root_name.nil?
148
- root_name = event[0]
149
- root_attributes = event[1]
150
- end
151
- elsif event.end_element?
152
- depth -= 1
153
- end
203
+ require_nokogiri
204
+ handler = cap_scanner_class.new
205
+ parser = Nokogiri::XML::SAX::Parser.new(handler)
206
+ begin
207
+ # recovery: false — do not silently repair malformed markup. Errors still
208
+ # arrive via the error callback rather than as exceptions, so they are
209
+ # checked explicitly below.
210
+ parser.parse(xml) { |ctx| ctx.recovery = false }
211
+ rescue LimitError, InvalidImageError
212
+ raise # our own cap/validation rejections, surfaced from a callback
213
+ rescue StandardError => e
214
+ # Nokogiri rejects some inputs by raising rather than via the error
215
+ # callback (e.g. empty input -> "input string cannot be empty"). Keep
216
+ # untrusted-input failures inside our error hierarchy.
217
+ raise InvalidImageError, "invalid SVG: #{e.message}"
154
218
  end
155
219
 
156
- raise InvalidImageError, "SVG root required" unless root_name == "svg"
157
- [root_name, root_attributes]
158
- rescue REXML::ParseException => e
159
- raise InvalidImageError, "invalid SVG: #{e.message}"
220
+ raise InvalidImageError, "invalid SVG: #{handler.parse_error}" if handler.parse_error
221
+ raise InvalidImageError, "SVG root required" unless handler.root_name == "svg"
222
+
223
+ [handler.root_name, handler.root_attributes]
224
+ end
225
+
226
+ # Loaded on first SVG use, not at file load: keeping the XML library off the
227
+ # hot path of every non-SVG operation (and every sandbox worker boot) where
228
+ # it would otherwise be paid for nothing.
229
  def require_nokogiri
    # Deliberate in-method require (lazy load). Kernel#require is a no-op
    # once the feature is loaded, so calling this on every scan is cheap.
    require "nokogiri"
  end
232
+
233
+ # The SAX cap-enforcement handler, built lazily and memoised the first time
234
+ # an SVG is scanned. It subclasses Nokogiri::XML::SAX::Document, so it cannot
235
+ # be declared at file-load time without forcing nokogiri to load eagerly and
236
+ # defeating the lazy require above. A breached cap raises LimitError straight
237
+ # out of a callback; libxml2 propagates it at the next event boundary, so the
238
+ # parse aborts promptly rather than scanning to the end (verified: rejection
239
+ # time grows far slower than input size).
240
+ def cap_scanner_class
241
+ @cap_scanner_class ||= Class.new(Nokogiri::XML::SAX::Document) do
242
+ attr_reader :root_name, :root_attributes, :parse_error
243
+
244
+ def initialize
245
+ super
246
+ @depth = -1
247
+ @elements = 0
248
+ @attributes = 0
249
+ @root_name = nil
250
+ @root_attributes = nil
251
+ @parse_error = nil
252
+ end
253
+
254
+ # attrs: array of Nokogiri::XML::SAX::Parser::Attribute (localname/value),
255
+ # NOT including namespace declarations; `ns` carries the xmlns decls. Both
256
+ # count toward the attribute cap so the bound cannot be sidestepped by
257
+ # spraying namespace declarations.
258
+ def start_element_namespace(name, attrs = [], _prefix = nil, _uri = nil, ns = [])
259
+ @depth += 1
260
+ raise LimitError, "SVG nesting exceeds #{MAX_SVG_DEPTH}" if @depth > MAX_SVG_DEPTH
261
+
262
+ @elements += 1
263
+ raise LimitError, "SVG has too many elements" if @elements > MAX_SVG_ELEMENTS
264
+
265
+ @attributes += attrs.length + ns.length
266
+ raise LimitError, "SVG has too many attributes" if @attributes > MAX_SVG_ATTRIBUTES
267
+
268
+ return unless @root_name.nil?
269
+
270
+ @root_name = name
271
+ @root_attributes = attrs.each_with_object({}) { |attr, hash| hash[attr.localname] = attr.value }
272
+ end
273
+
274
+ def end_element_namespace(_name, _prefix = nil, _uri = nil)
275
+ @depth -= 1
276
+ end
277
+
278
+ # libxml2 reports well-formedness violations here rather than raising;
279
+ # record the first so scan_svg! can reject on it.
280
+ def error(message)
281
+ @parse_error ||= message.to_s.strip
282
+ end
283
+
284
+ def warning(_message); end
285
+ end
160
286
  end
161
287
  end
162
288
  end