dandruff 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rubocop.yml +23 -0
- data/CHANGELOG.md +69 -0
- data/COMPARISON.md +175 -0
- data/Gemfile +5 -0
- data/Gemfile.lock +142 -0
- data/LICENSE.txt +21 -0
- data/Makefile +41 -0
- data/README.md +1196 -0
- data/Rakefile +12 -0
- data/examples/basic_usage.rb +84 -0
- data/examples/email_sanitization_example.md +268 -0
- data/failed-expectations.md +192 -0
- data/lib/dandruff/attributes.rb +223 -0
- data/lib/dandruff/config.rb +500 -0
- data/lib/dandruff/expressions.rb +103 -0
- data/lib/dandruff/tags.rb +160 -0
- data/lib/dandruff/utils.rb +27 -0
- data/lib/dandruff/version.rb +5 -0
- data/lib/dandruff.rb +1095 -0
- metadata +134 -0
data/lib/dandruff.rb
ADDED
|
@@ -0,0 +1,1095 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'nokogiri'
|
|
4
|
+
require 'set'
|
|
5
|
+
require 'uri'
|
|
6
|
+
|
|
7
|
+
require_relative 'dandruff/version'
|
|
8
|
+
require_relative 'dandruff/config'
|
|
9
|
+
require_relative 'dandruff/tags'
|
|
10
|
+
require_relative 'dandruff/attributes'
|
|
11
|
+
require_relative 'dandruff/expressions'
|
|
12
|
+
require_relative 'dandruff/utils'
|
|
13
|
+
|
|
14
|
+
# Dandruff - A robust HTML sanitizer for Ruby
|
|
15
|
+
#
|
|
16
|
+
# Dandruff is a Ruby implementation inspired by DOMPurify, providing comprehensive XSS protection
|
|
17
|
+
# by sanitizing HTML strings and removing malicious payloads. It's designed for excellent developer
|
|
18
|
+
# experience while maintaining battle-tested security.
|
|
19
|
+
#
|
|
20
|
+
# ## Key Features
|
|
21
|
+
#
|
|
22
|
+
# - **Comprehensive XSS Protection**: Defends against XSS, mXSS, DOM clobbering, and protocol injection
|
|
23
|
+
# - **Flexible Configuration**: Fine-grained control over tags, attributes, and sanitization behavior
|
|
24
|
+
# - **Content Type Profiles**: Pre-configured settings for HTML, SVG, MathML, and HTML email
|
|
25
|
+
# - **Hook System**: Extend sanitization with custom processing logic
|
|
26
|
+
# - **Developer-Friendly API**: Intuitive Ruby idioms with block-based configuration
|
|
27
|
+
# - **Battle-Tested Security**: Based on DOMPurify's proven security model
|
|
28
|
+
# - **Performance Optimized**: Efficient multi-pass sanitization with configurable limits
|
|
29
|
+
#
|
|
30
|
+
# ## Quick Start
|
|
31
|
+
#
|
|
32
|
+
# @example Basic sanitization
|
|
33
|
+
# require 'dandruff'
|
|
34
|
+
#
|
|
35
|
+
# dandruff = Dandruff.new
|
|
36
|
+
# clean = dandruff.sanitize('<script>alert("xss")</script><p>Safe content</p>')
|
|
37
|
+
# # => "<p>Safe content</p>"
|
|
38
|
+
#
|
|
39
|
+
# @example Configure with block
|
|
40
|
+
# dandruff = Dandruff.new do |config|
|
|
41
|
+
# config.allowed_tags = ['p', 'strong', 'em', 'a']
|
|
42
|
+
# config.allowed_attributes = ['href', 'title', 'class']
|
|
43
|
+
# end
|
|
44
|
+
#
|
|
45
|
+
# @example Use convenience class method
|
|
46
|
+
# clean = Dandruff.sanitize(dirty_html, allowed_tags: ['p', 'strong'])
|
|
47
|
+
#
|
|
48
|
+
# @example Profile-based configuration
|
|
49
|
+
# dandruff = Dandruff.new do |config|
|
|
50
|
+
# config.use_profiles = { html: true, svg: true }
|
|
51
|
+
# end
|
|
52
|
+
#
|
|
53
|
+
# @example Per-tag attribute control
|
|
54
|
+
# dandruff = Dandruff.new do |config|
|
|
55
|
+
# config.allowed_attributes_per_tag = {
|
|
56
|
+
# 'a' => ['href', 'title'],
|
|
57
|
+
# 'img' => ['src', 'alt', 'width', 'height']
|
|
58
|
+
# }
|
|
59
|
+
# end
|
|
60
|
+
#
|
|
61
|
+
# @example Custom hooks
|
|
62
|
+
# dandruff = Dandruff.new
|
|
63
|
+
# dandruff.add_hook(:upon_sanitize_attribute) do |node, data, config|
|
|
64
|
+
# # Custom attribute processing
|
|
65
|
+
# if data[:attr_name] == 'data-safe'
|
|
66
|
+
# data[:keep_attr] = true
|
|
67
|
+
# end
|
|
68
|
+
# end
|
|
69
|
+
#
|
|
70
|
+
# ## Security
|
|
71
|
+
#
|
|
72
|
+
# Dandruff protects against multiple attack vectors:
|
|
73
|
+
# - **XSS**: Removes script tags, event handlers, javascript: URIs
|
|
74
|
+
# - **mXSS**: Multi-pass sanitization prevents mutation-based attacks
|
|
75
|
+
# - **DOM Clobbering**: Blocks dangerous id/name attribute values
|
|
76
|
+
# - **Protocol Injection**: Validates URI protocols (javascript:, vbscript:, data:text/html)
|
|
77
|
+
# - **Namespace Confusion**: Prevents mXSS via SVG/MathML namespace attacks
|
|
78
|
+
# - **CSS Injection**: Sanitizes inline styles and style tag content
|
|
79
|
+
#
|
|
80
|
+
# @see https://github.com/kuyio/dandruff GitHub repository
|
|
81
|
+
# @see https://github.com/cure53/DOMPurify Original JavaScript implementation
|
|
82
|
+
# @see Config Configuration options reference
|
|
83
|
+
# @see Sanitizer Core sanitization engine
|
|
84
|
+
module Dandruff
|
|
85
|
+
class Error < StandardError; end
|
|
86
|
+
|
|
87
|
+
# Main sanitizer class handling HTML sanitization logic
|
|
88
|
+
#
|
|
89
|
+
# This class manages the core sanitization process, configuration, and hooks.
|
|
90
|
+
# It parses HTML, removes dangerous elements and attributes, and serializes the result.
|
|
91
|
+
class Sanitizer
|
|
92
|
+
MATH_SVG_TAGS = %w[math svg].freeze
|
|
93
|
+
attr_reader :removed, :config, :hooks
|
|
94
|
+
|
|
95
|
+
# Initializes a new sanitizer instance
|
|
96
|
+
#
|
|
97
|
+
# @param config [Config, Hash, nil] optional configuration object or options hash (nil uses defaults)
|
|
98
|
+
# @yield [config] optional block to configure instance config
|
|
99
|
+
def initialize(config = nil)
  # Hook table, support flag and removal log do not depend on the configuration.
  @is_supported = check_support
  @hooks = create_hooks_map
  @removed = []
  # build_config accepts a Hash, an existing Config, or anything else (defaults).
  @config = build_config(config)
  yield @config if block_given?
end
|
|
106
|
+
|
|
107
|
+
# Hook management
|
|
108
|
+
# Registers a block to run at the given hook entry point.
#
# @param entry_point [Symbol] hook name (see create_hooks_map for the built-in ones)
# @yield the hook body; execute_hooks calls it with (node, data, config)
# @return [Array<Proc>, nil] the hooks registered for the entry point, or nil when no block is given
def add_hook(entry_point, &hook_function)
  return unless hook_function.is_a?(Proc)

  # Unknown entry points get their own list on first use.
  (@hooks[entry_point] ||= []) << hook_function
end
|
|
114
|
+
|
|
115
|
+
# Removes a single hook from an entry point.
#
# @param entry_point [Symbol] hook name
# @param hook_function [Proc, nil] hook to remove; when omitted the most recently added hook is removed
# @return [Proc, nil] the removed hook, or nil if the entry point or hook was not found
def remove_hook(entry_point, hook_function = nil)
  registered = @hooks[entry_point]
  return nil unless registered
  return registered.pop unless hook_function

  # Search from the end so the most recent registration of the hook goes first.
  position = registered.rindex(hook_function)
  position && registered.delete_at(position)
end
|
|
128
|
+
|
|
129
|
+
# Drops every hook registered for one entry point.
#
# @param entry_point [Symbol] hook name
# @return [Array] the new, empty hook list
def remove_hooks(entry_point)
  @hooks.store(entry_point, [])
end
|
|
132
|
+
|
|
133
|
+
# Discards all registered hooks by replacing the hook table with a fresh one.
#
# @return [Hash] the new hook table
def remove_all_hooks
  @hooks = create_hooks_map
end
|
|
136
|
+
|
|
137
|
+
# Checks if the current environment supports Dandruff functionality
|
|
138
|
+
#
|
|
139
|
+
# @return [Boolean] true if Nokogiri is available, false otherwise
|
|
140
|
+
def supported?
  # Computed once in initialize via check_support.
  @is_supported
end
|
|
143
|
+
|
|
144
|
+
# Sets configuration for the sanitizer
|
|
145
|
+
#
|
|
146
|
+
# @param cfg [Hash] configuration options
|
|
147
|
+
def set_config(cfg = {})
  # Replaces the current configuration outright with one built from cfg.
  @config = parse_config(cfg)
end
|
|
150
|
+
|
|
151
|
+
# Configures the sanitizer with a block
|
|
152
|
+
#
|
|
153
|
+
# @yield [config] the configuration object to modify
|
|
154
|
+
# @return [Sanitizer] the sanitizer instance
|
|
155
|
+
def configure
  # tap returns self, so calls can be chained whether or not a block was given.
  tap { yield @config if block_given? }
end
|
|
159
|
+
|
|
160
|
+
# Clears current configuration, resetting to defaults
|
|
161
|
+
def clear_config
  # An empty options hash yields a configuration with default values.
  @config = parse_config({})
end
|
|
164
|
+
|
|
165
|
+
# Main sanitization method
|
|
166
|
+
#
|
|
167
|
+
# Parses the input HTML, sanitizes elements and attributes, and returns clean HTML.
|
|
168
|
+
#
|
|
169
|
+
# @param dirty [String, Nokogiri::XML::Node] the input to sanitize
|
|
170
|
+
# @param cfg [Hash] optional configuration override
|
|
171
|
+
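# @return [String, Nokogiri::XML::Document, Nokogiri::HTML5::DocumentFragment] sanitized HTML string, or a parsed document/fragment when return_dom/return_dom_fragment is set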
|
|
172
|
+
def sanitize(dirty, cfg = {})
  # Without parser support the input is handed back untouched.
  return dirty unless supported?

  # A non-empty per-call cfg replaces the instance configuration (it is not merged).
  if cfg.empty?
    ensure_config
  else
    set_config(cfg)
  end
  @removed = []
  return '' if dirty.nil?

  markup = dirty.is_a?(String) ? dirty : dirty.to_s
  # Blank input is returned as-is (whitespace preserved) without parsing.
  return markup if markup.strip.empty?

  tree = parse_html(markup)
  sanitize_document(tree)
  clean = serialize_html(tree)
  clean = resanitize_until_stable(clean) if @config.sanitize_until_stable

  # DOM results are produced by re-parsing the sanitized string.
  return parse_html(clean) if @config.return_dom
  return Nokogiri::HTML5::DocumentFragment.parse(clean) if @config.return_dom_fragment

  clean
end
|
|
195
|
+
alias_method :scrub, :sanitize
|
|
196
|
+
|
|
197
|
+
private
|
|
198
|
+
|
|
199
|
+
# Checks if required dependencies are available
|
|
200
|
+
def check_support
  # defined? on the qualified constant yields nil (it never raises) when either
  # Nokogiri or Nokogiri::VERSION is missing. The ternary pins the result to
  # true/false so supported? returns the Boolean its documentation promises,
  # rather than a version string or nil.
  defined?(Nokogiri::VERSION) && Nokogiri::VERSION ? true : false
end
|
|
203
|
+
|
|
204
|
+
# Creates the default hooks map
|
|
205
|
+
#
|
|
206
|
+
# @return [Hash] hash of hook arrays keyed by hook name
|
|
207
|
+
def create_hooks_map
  # Every built-in entry point starts with its own empty hook list.
  %i[
    before_sanitize_attributes after_sanitize_attributes
    before_sanitize_elements after_sanitize_elements
    upon_sanitize_attribute upon_sanitize_element
  ].to_h { |entry_point| [entry_point, []] }
end
|
|
217
|
+
|
|
218
|
+
# Parses configuration options
|
|
219
|
+
def parse_config(cfg = {})
  # Thin wrapper: all option handling lives in Config.
  Config.new(cfg)
end
|
|
222
|
+
|
|
223
|
+
# Builds a configuration from hash or existing Config
|
|
224
|
+
def build_config(cfg)
  case cfg
  when Hash then parse_config(cfg)
  when Config then cfg # an existing Config is used as-is, not copied
  else parse_config({}) # nil or anything unrecognised falls back to defaults
  end
end
|
|
230
|
+
|
|
231
|
+
# Ensures configuration is set
|
|
232
|
+
def ensure_config
  # Keep whatever configuration is already present; only build defaults when none is set.
  return @config if @config

  @config = parse_config({})
end
|
|
235
|
+
|
|
236
|
+
# Parses HTML string into Nokogiri document
|
|
237
|
+
#
|
|
238
|
+
# @param html [String] HTML string to parse
|
|
239
|
+
# @return [Nokogiri::XML::Document, Nokogiri::XML::DocumentFragment] parsed document, or a fragment when no whole-document option applies
|
|
240
|
+
def parse_html(html)
  # force_body prepends a placeholder element; sanitize_document strips it again.
  markup = @config.force_body ? "<remove></remove>#{html}" : html

  xhtml = @config.parser_media_type == 'application/xhtml+xml' &&
          @config.namespace == 'http://www.w3.org/1999/xhtml'
  markup = "<html xmlns=\"http://www.w3.org/1999/xhtml\"><head></head><body>#{markup}</body></html>" if xhtml

  # Framesets and any whole-document option need the document parser;
  # everything else is parsed as a fragment.
  needs_document = @config.whole_document || @config.return_dom ||
                   @config.allow_document_elements || markup.match?(/<frameset/i)
  needs_document ? Nokogiri::HTML5.parse(markup) : Nokogiri::HTML5.fragment(markup)
end
|
|
251
|
+
|
|
252
|
+
# Sanitizes the document by processing elements and attributes
|
|
253
|
+
#
|
|
254
|
+
# @param doc [Nokogiri::XML::Document] document to sanitize
|
|
255
|
+
# @return [Nokogiri::XML::Document] sanitized document
|
|
256
|
+
def sanitize_document(doc) # rubocop:disable Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
  # parse_html prepends a <remove> placeholder when force_body is set; drop it here.
  if @config.force_body
    placeholder = doc.children.first
    placeholder.remove if placeholder&.name == 'remove'
  end

  execute_hooks(:before_sanitize_elements, doc)

  doc.traverse do |node|
    if node.element?
      case node.name
      when 'script', 'iframe', 'frame', 'frameset', 'object', 'embed'
        # These are always stripped, before any allow-list is consulted.
        @removed << { element: node }
        # A frameset takes its parent with it; look the parent up before unlinking.
        container = node.name == 'frameset' ? node.parent : nil
        node.remove
        container&.remove
      when 'style'
        # NOTE(review): a style element that survives this check is not passed to
        # sanitize_element, so its attributes are not filtered in this method.
        # Confirm that is intended or handled elsewhere.
        node.remove if !@config.allow_style_tags || unsafe_style_node?(node)
      else
        sanitize_element(node)
      end
    elsif node.text? && @config.safe_for_templates
      sanitize_text_node(node)
    elsif node.comment? && @config.safe_for_xml
      sanitize_comment_node(node)
    elsif node.cdata?
      # CDATA sections are downgraded to plain text nodes.
      node.replace(Nokogiri::XML::Text.new(node.text, node.document))
    end
  end

  execute_hooks(:after_sanitize_elements, doc)
end
|
|
290
|
+
|
|
291
|
+
# Sanitizes a single element: special-case handlers first, then the tag
# allow-list, then the element's attributes.
#
# @param node [Nokogiri::XML::Element] element to sanitize
def sanitize_element(node)
  tag_name = transform_case(node.name)

  # Each handler returns truthy once it has dealt with the node; the first one
  # to do so ends processing (later handlers are not called).
  handled = handle_isindex(node, tag_name) ||
            handle_dangerous_math_svg(node) ||
            handle_namespace_check(node) ||
            handle_prefixed_element(node, tag_name)
  return if handled

  execute_hooks(:upon_sanitize_element, node, { tag_name: tag_name })

  if allowed_element?(tag_name)
    sanitize_attributes(node)
    handle_vml_namespace(node)
  else
    handle_disallowed_element(node, tag_name)
    nil
  end
end
|
|
309
|
+
|
|
310
|
+
# Sanitizes attributes of an element
|
|
311
|
+
#
|
|
312
|
+
# @param node [Nokogiri::XML::Element] element to sanitize attributes for
|
|
313
|
+
def sanitize_attributes(node)
  tag_name = transform_case(node.name)
  rejected = []
  dangerous_removed = false
  had_xlink_href = node.key?('xlink:href')

  execute_hooks(:before_sanitize_attributes, node)

  node.attributes.each do |name, attr|
    lc_name = normalize_attribute_name(name, attr)
    handle_is_attribute(attr, lc_name)
    value = attr.value
    handle_xlink_namespace_definition(node, lc_name)

    # Remember whether any attribute was an xlink one, by name or by namespace.
    had_xlink_href ||= lc_name == 'xlink:href' || attr.namespace&.href == 'http://www.w3.org/1999/xlink'

    # NOTE(review): the data hash handed to the hooks is a throwaway literal, so
    # keys a hook sets on it (e.g. :keep_attr, :value) are never read back here.
    # Confirm whether hooks are meant to influence the decision below.
    execute_hooks(:upon_sanitize_attribute, attr, { tag_name: tag_name, attr_name: lc_name, value: value })

    if valid_attribute?(tag_name, lc_name, value)
      # Restore the value captured above if it changed in the meantime.
      attr.value = value unless value == attr.value
      next
    end

    rejected << name
    @removed << { attribute: attr, from: node }
    dangerous_removed = true if dangerous_attribute_removed?(lc_name, tag_name)
  end

  # Deletion is deferred so the attribute collection is not mutated while iterating.
  rejected.each { |attr_name| node.delete(attr_name) }

  # Remove meta/link tags entirely if dangerous attributes were removed
  if dangerous_removed && %w[meta link].include?(tag_name)
    node.remove
    return
  end

  ensure_alt_attribute(node, tag_name)
  ensure_xlink_namespace(node) if had_xlink_href || node.key?('xlink:href')

  execute_hooks(:after_sanitize_attributes, node)
end
|
|
356
|
+
|
|
357
|
+
# Builds the form that stands in for a deprecated <isindex> element:
# <form><hr><label>prompt<input name="isindex"></label><hr></form>.
#
# @param node [Nokogiri::XML::Element] the isindex element being replaced
# @return [Nokogiri::XML::Node, nil] the replacement form, or nil if building it raised
def build_isindex_replacement(node)
  doc = node.document
  form = Nokogiri::XML::Node.new('form', doc)
  rule_before = Nokogiri::XML::Node.new('hr', doc)
  rule_after = Nokogiri::XML::Node.new('hr', doc)
  label = Nokogiri::XML::Node.new('label', doc)
  label.content = 'This is a searchable index. Enter search keywords: '
  input = Nokogiri::XML::Node.new('input', doc)

  # Both branches set the same attributes; only the order in which they are
  # assigned depends on whether the source element carried a src attribute.
  label_text = node['label']
  if node['src']
    input['name'] = 'isindex'
    input['label'] = label_text if label_text
  else
    input['label'] = label_text if label_text
    input['name'] = 'isindex'
  end

  label.add_child(input)
  [rule_before, label, rule_after].each { |child| form.add_child(child) }
  form
rescue StandardError
  # Best effort: the caller treats nil as "no replacement".
  nil
end
|
|
380
|
+
|
|
381
|
+
# Checks if an element tag is allowed
|
|
382
|
+
#
|
|
383
|
+
# @param tag_name [String] the tag name to check
|
|
384
|
+
# @return [Boolean] true if the tag is allowed, false otherwise
|
|
385
|
+
# rubocop:disable Metrics/CyclomaticComplexity
|
|
386
|
+
def allowed_element?(tag_name)
  # html/head/body are only meaningful when a whole document is being kept.
  keeps_document = @config.whole_document || @config.allow_document_elements || @config.return_dom
  return false if !keeps_document && %w[html head body].include?(tag_name)

  # The deny-list wins over every allow-list.
  return false if @config.forbidden_tags&.include?(tag_name)

  # additional_tags is case-normalised the same way in both modes below, and
  # may be any Enumerable (Array or Set).
  extra = (@config.additional_tags || []).map { |tag| transform_case(tag) }

  if @config.allowed_tags.nil?
    # No explicit allow-list: built-in defaults plus the additions.
    extra.include?(tag_name) || default_allowed_tags.include?(tag_name)
  else
    # An explicit allow-list replaces the defaults; additions still apply.
    @config.allowed_tags.any? { |tag| transform_case(tag) == tag_name } || extra.include?(tag_name)
  end
end
|
|
404
|
+
# rubocop:enable Metrics/CyclomaticComplexity
|
|
405
|
+
|
|
406
|
+
# Checks if an attribute is valid for a given tag
|
|
407
|
+
#
|
|
408
|
+
# @param tag_name [String] the element tag name
|
|
409
|
+
# @param attr_name [String] the attribute name
|
|
410
|
+
# @param value [String] the attribute value
|
|
411
|
+
# @return [Boolean] true if the attribute is valid, false otherwise
|
|
412
|
+
# Checks if an attribute is valid for a given tag
|
|
413
|
+
#
|
|
414
|
+
# @param tag_name [String] the element tag name
|
|
415
|
+
# @param attr_name [String] the attribute name
|
|
416
|
+
# @param value [String] the attribute value
|
|
417
|
+
# @return [Boolean] true if the attribute is valid, false otherwise
|
|
418
|
+
# rubocop:disable Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
|
|
419
|
+
def valid_attribute?(tag_name, attr_name, value)
  # Hard rejections come first and cannot be overridden further down.
  return false if forbidden_attribute?(attr_name) || dangerous_attribute?(attr_name)

  # Tri-state: true / false when the allow-lists give a definite answer, nil otherwise.
  attr_allowed = attribute_allowed?(tag_name, attr_name)

  return true if data_attribute_allowed?(attr_name) || aria_attribute_allowed?(attr_name) || attr_name == 'is'

  if attr_name == 'style'
    # Only an explicit "false" blocks style; nil (no opinion) still goes to value validation.
    explicitly_denied = attr_allowed == false
    return explicitly_denied ? false : valid_style_attribute?(value)
  end

  return false if @config.sanitize_dom && dom_clobbering_attribute?(attr_name, value)

  return valid_uri_attribute?(tag_name, value, attr_allowed) if uri_like?(attr_name) && value

  return attr_allowed if [true, false].include?(attr_allowed)

  # Default permissive checks
  return true if @config.additional_attributes&.include?(attr_name)

  allow_unknown_protocols_fallback?(value)
end
|
|
446
|
+
# rubocop:enable Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
|
|
447
|
+
|
|
448
|
+
# Checks if an attribute name is URI-like
|
|
449
|
+
#
|
|
450
|
+
# @param attr_name [String] the attribute name
|
|
451
|
+
# @return [Boolean] true if the attribute is URI-like, false otherwise
|
|
452
|
+
def uri_like?(attr_name)
  return true if default_uri_safe_attributes.include?(attr_name)

  # May be nil when no additional URI attributes are configured.
  @config.additional_uri_safe_attributes&.include?(attr_name)
end
|
|
455
|
+
|
|
456
|
+
# Reports whether a piece of CSS (an inline style value or style-block text)
# contains one of the patterns treated as dangerous.
#
# The text is normalised before matching: CSS hex escapes are decoded, the
# result is lower-cased, and backslashes and all whitespace are removed.
# Lower-casing happens AFTER decoding so an escape that decodes to a capital
# letter (e.g. "\45" => "E") cannot slip past the case-sensitive checks.
#
# @param value [String] CSS text to inspect
# @return [Boolean] true if a dangerous pattern is present
def unsafe_inline_style?(value)
  # Decode CSS hex escapes to surface hidden protocol names
  decoded = value.gsub(/\\([0-9a-f]{1,6})\s?/i) do
    codepoint = Regexp.last_match(1).to_i(16)
    # Out-of-range and surrogate code points cannot be encoded as valid UTF-8; drop them.
    next '' if codepoint > 0x10FFFF || (0xD800..0xDFFF).cover?(codepoint)

    [codepoint].pack('U')
  end
  normalized = decoded.downcase.delete('\\').gsub(/\s+/, '')

  # Dangerous: javascript: together with background/@import; vbscript: anywhere
  return true if normalized.match?(/javascript:/i) && normalized.match?(/background|@import/i)
  return true if normalized.match?(/vbscript:/i)

  # Dangerous: expression() (IE)
  return true if normalized.include?('expression(')

  # Dangerous: @import (can load external stylesheets)
  return true if normalized.match?(/@import\s*url/i)

  # Dangerous: data:text/html (can contain scripts)
  return true if normalized.include?('data:text/html')

  # behavior:, binding: and data:image/svg+xml are deliberately not flagged here.
  false
end
|
|
487
|
+
|
|
488
|
+
# Filters an inline style value down to allow-listed properties.
#
# @param value [String] raw style attribute value
# @return [String, nil] the kept declarations joined with "; ", or nil when the
#   value is unsafe, contains a CSS hex escape, or nothing survives
def sanitize_style_value(value)
  # Any hex escape is rejected outright, in addition to the pattern check.
  return nil if unsafe_inline_style?(value) || value.match?(/\\[0-9a-f]{1,6}/i)

  permitted = Set.new(%w[
    align-content align-items align-self all animation animation-delay animation-direction animation-duration
    animation-fill-mode animation-iteration-count animation-name animation-play-state animation-timing-function
    background background-clip background-color background-image background-origin background-position
    background-repeat background-size border border-bottom border-bottom-color border-bottom-style
    border-bottom-width
    border-collapse border-color border-image border-left border-left-color border-left-style border-left-width
    border-radius border-right border-right-color border-right-style border-right-width border-spacing border-style
    border-top border-top-color border-top-style border-top-width border-width bottom box-shadow box-sizing
    caption-side clear clip color column-count column-fill column-gap column-rule column-rule-color
    column-rule-style
    column-rule-width column-span column-width columns content cursor direction display empty-cells filter flex
    flex-basis flex-direction flex-flow flex-grow flex-shrink flex-wrap float font font-family font-size
    font-size-adjust font-stretch font-style font-variant font-weight gap grid grid-area grid-auto-columns
    grid-auto-flow grid-auto-rows grid-column grid-column-end grid-column-gap grid-column-start grid-gap grid-row
    grid-row-end grid-row-gap grid-row-start grid-template grid-template-areas grid-template-columns
    grid-template-rows
    height justify-content left letter-spacing line-height list-style list-style-image
    list-style-position list-style-type
    margin margin-bottom margin-left margin-right margin-top max-height max-width min-height min-width opacity order
    outline outline-color outline-offset outline-style outline-width overflow overflow-x overflow-y padding
    padding-bottom padding-left padding-right padding-top page-break-after page-break-before page-break-inside
    perspective perspective-origin pointer-events position quotes resize right row-gap table-layout text-align
    text-align-last text-decoration text-decoration-color text-decoration-line text-decoration-style text-indent
    text-justify text-overflow text-shadow text-transform top transform transform-origin transition transition-delay
    transition-duration transition-property transition-timing-function unicode-bidi vertical-align visibility
    white-space width word-break word-spacing word-wrap writing-mode z-index
  ])

  kept = []
  value.split(';').each do |raw|
    declaration = raw.strip
    next if declaration.empty?

    # Split on the first colon only, so values containing ':' (e.g. URLs) stay intact.
    property, setting = declaration.split(':', 2).map { |part| part&.strip }
    next unless property && setting

    property = property.downcase
    next unless permitted.include?(property)
    # reject dangerous urls/protocols in values
    next if unsafe_inline_style?(setting)

    kept << "#{property}:#{setting}"
  end

  kept.empty? ? nil : kept.join('; ')
end
|
|
538
|
+
|
|
539
|
+
# Applies the inline-style danger check to the text of a style block.
#
# @param content [String, nil] style block text
# @return [Boolean] false for nil/blank content, otherwise the result of unsafe_inline_style?
def unsafe_style_block?(content)
  blank = content.nil? || content.strip.empty?
  blank ? false : unsafe_inline_style?(content)
end
|
|
544
|
+
|
|
545
|
+
# Checks if a node is within a MathML or SVG context
|
|
546
|
+
#
|
|
547
|
+
# @param node [Nokogiri::XML::Element] element to check
|
|
548
|
+
# @return [Boolean] true if inside math or svg element
|
|
549
|
+
def in_math_or_svg_context?(node)
  ancestor = node.parent
  while ancestor
    # Only element ancestors count; their names are compared case-insensitively.
    is_element = ancestor.respond_to?(:element?) && ancestor.element?
    return true if is_element && MATH_SVG_TAGS.include?(ancestor.name.downcase)
    # Stop climbing once an ancestor cannot report a parent.
    return false unless ancestor.respond_to?(:parent)

    ancestor = ancestor.parent
  end
  false
end
|
|
561
|
+
|
|
562
|
+
# Checks if an element is dangerous when inside MathML/SVG context
|
|
563
|
+
#
|
|
564
|
+
# @param node [Nokogiri::XML::Element] element to check
|
|
565
|
+
# @return [Boolean] true if element can cause mXSS in math/svg context
|
|
566
|
+
def dangerous_in_math_svg?(node)
  return false unless node.element?

  tag = node.name.downcase
  # style, title and mglyph are the elements singled out as mXSS vectors when
  # they appear inside a math or svg ancestor.
  in_math_or_svg_context?(node) && %w[style title mglyph].include?(tag)
end
|
|
578
|
+
|
|
579
|
+
# Decides whether a <style> element must be removed, based on where it sits
# and what it contains.
#
# @param node [Nokogiri::XML::Element] the style element
# @return [Boolean] true if the element should be removed
def unsafe_style_node?(node)
  parent_name = node.parent&.name
  inside_form_list = %w[option select].include?(parent_name)

  # Whole-document mode with style tags enabled: only style directly inside
  # option/select is rejected; the CSS content itself is not inspected here.
  return inside_form_list if @config.whole_document && @config.allow_style_tags

  # Otherwise a style element at the top level (no parent, directly under the
  # document/fragment, or under html/head/body) is rejected.
  top_level = parent_name.nil? || %w[#document #document-fragment html head body].include?(parent_name)
  return true if top_level || inside_form_list

  # Markup-looking content or child elements inside a style element are rejected.
  node.content.include?('<') || node.element_children.any?
end
|
|
601
|
+
|
|
602
|
+
# Re-runs parse -> sanitize -> serialize on already sanitized output until a
# pass changes nothing or the pass budget is used up.
#
# @param html [String] output of the first sanitization pass
# @return [String] the last output produced
def resanitize_until_stable(html)
  # The first pass has already happened, so at most (limit - 1) extra passes
  # run here; a limit of 1 or less means none.
  limit = @config.mutation_max_passes.to_i
  stable = html
  (limit - 1).times do
    tree = parse_html(stable)
    sanitize_document(tree)
    candidate = serialize_html(tree)
    break if candidate == stable

    stable = candidate
  end
  stable
end
|
|
619
|
+
|
|
620
|
+
# Serializes the document back to HTML string
|
|
621
|
+
#
|
|
622
|
+
# @param doc [Nokogiri::XML::Document] document to serialize
|
|
623
|
+
# @return [String] HTML string
|
|
624
|
+
def serialize_html(doc)
  markup = doc.respond_to?(:to_html) ? doc.to_html : doc.to_s
  markup = markup.sub(/\A\n+/, '')
  # NOTE(review): as written, this gsub replaces '&unknown;' with itself.
  # Confirm the intended search pattern.
  markup = fix_svg_self_closing_tags(markup).gsub('&unknown;', '&unknown;')
  # Remove encoded script blocks
  markup = markup.gsub(%r{<script>.*?</script>}i, '')

  # Outside the whole-document modes, html/head/body wrapper tags are stripped
  # from the string (their contents stay).
  keeps_document = @config.whole_document || @config.allow_document_elements || @config.return_dom
  markup = markup.gsub(%r{</?(?:html|head|body)(?:\s[^>]*)?>}i, '') unless keeps_document
  markup
end
|
|
635
|
+
|
|
636
|
+
# Expands self-closing forms of selected SVG elements (<circle .../>) into
# explicit open/close pairs (<circle ...></circle>).
#
# The tag name must be followed by whitespace or the closing "/>", so longer
# names that merely start with a listed name (linearGradient, glyph, ...)
# are left untouched instead of receiving a mismatched closing tag.
#
# @param html [String] serialized markup
# @return [String] markup with the listed self-closing tags expanded
def fix_svg_self_closing_tags(html)
  names = %w[circle ellipse line path polygon polyline rect stop use feimage mask g defs]
  # \1 = tag name, \2 = attribute text (possibly empty) up to the "/>".
  html.gsub(%r{<(#{names.join('|')})((?:\s[^>]*)?)/>}, '<\1\2></\1>')
end
|
|
642
|
+
|
|
643
|
+
# Transforms tag/attribute names to lowercase if not XHTML
|
|
644
|
+
#
|
|
645
|
+
# @param str [String] string to transform
|
|
646
|
+
# @return [String] transformed string
|
|
647
|
+
def transform_case(str)
  # Names are kept case-sensitive only when parsing as XHTML.
  return str if @config&.parser_media_type == 'application/xhtml+xml'

  str.downcase
end
|
|
650
|
+
|
|
651
|
+
# Returns the default set of allowed tags
|
|
652
|
+
#
|
|
653
|
+
# @return [Set] set of allowed HTML, SVG, MathML, and text tags
|
|
654
|
+
def default_allowed_tags
  # The set depends on two config values (profile choice, and the media type
  # that transform_case consults). The configuration can change after the
  # first call (sanitize with a per-call cfg, set_config, configure), so the
  # cached set is rebuilt whenever those inputs differ from the ones it was
  # built with.
  minimal = @config.minimal_profile ? true : false
  profile = [minimal, @config.parser_media_type]

  if @default_allowed_tags.nil? || @default_allowed_tags_profile != profile
    groups = minimal ? [Tags::MINIMAL_HTML] : [Tags::HTML, Tags::SVG, Tags::SVG_FILTERS, Tags::MATH_ML]
    groups << Tags::TEXT
    @default_allowed_tags = groups.each_with_object(Set.new) do |group, tags|
      tags.merge(group.map { |tag| transform_case(tag) })
    end
    @default_allowed_tags_profile = profile
  end

  @default_allowed_tags
end
|
|
667
|
+
|
|
668
|
+
# Returns the default set of URI-safe attributes
|
|
669
|
+
#
|
|
670
|
+
# @return [Set] set of attributes that can contain URIs
|
|
671
|
+
def default_uri_safe_attributes
  # Fixed list, built once per sanitizer instance.
  @default_uri_safe_attributes ||= Set.new(
    %w[href src xlink:href action formaction cite data poster background srcset]
  )
end
|
|
675
|
+
|
|
676
|
+
# Checks if a tag's content should be forbidden
|
|
677
|
+
#
|
|
678
|
+
# @param tag_name [String] the tag name to check
|
|
679
|
+
# @return [Boolean] true if content should be forbidden, false otherwise
|
|
680
|
+
def forbidden_content?(tag_name)
  return true if default_forbid_contents.include?(tag_name)

  # May be nil when no extra forbid_contents list is configured.
  @config.forbid_contents&.include?(tag_name)
end
|
|
683
|
+
|
|
684
|
+
# Returns the default set of tags whose content should be forbidden
|
|
685
|
+
#
|
|
686
|
+
# @return [Set] set of tags with forbidden content
|
|
687
|
+
def default_forbid_contents
|
|
688
|
+
@default_forbid_contents ||= Set.new(%w[annotation-xml audio colgroup desc foreignobject head iframe math mi mn
|
|
689
|
+
mo ms mtext noembed noframes noscript plaintext script style svg template thead title video xmp])
|
|
690
|
+
end
|
|
691
|
+
|
|
692
|
+
# Returns the set of tags that can have data URIs
|
|
693
|
+
#
|
|
694
|
+
# @return [Set] set of tags allowed to have data URIs
|
|
695
|
+
def data_uri_tags
  # The set depends on @config.add_data_uri_tags, which can change after the
  # first call (sanitize with a per-call cfg, set_config, configure). A
  # snapshot of the additions is kept with the cached set, and the set is
  # rebuilt whenever the current additions differ from that snapshot.
  extra = @config.add_data_uri_tags
  snapshot = extra ? extra.to_a.dup : nil

  if @data_uri_tags.nil? || @data_uri_tags_extra != snapshot
    @data_uri_tags = Set.new(%w[audio video img source image track])
    @data_uri_tags.merge(extra) if extra
    @data_uri_tags_extra = snapshot
  end

  @data_uri_tags
end
|
|
702
|
+
|
|
703
|
+
# Sanitizes text nodes by removing template expressions
|
|
704
|
+
#
|
|
705
|
+
# @param node [Nokogiri::XML::Text] text node to sanitize
|
|
706
|
+
def sanitize_text_node(node)
  original = node.content
  patterns = [Expressions::MUSTACHE_EXPR, Expressions::ERB_EXPR, Expressions::TMPLIT_EXPR]
  # Each template expression is replaced by a single space, one pattern after another.
  scrubbed = patterns.reduce(original) { |text, pattern| text.gsub(pattern, ' ') }
  return if scrubbed == original

  # Record a copy of the node as it was before its text is rewritten.
  @removed.push(element: node.dup)
  node.content = scrubbed
end
|
|
716
|
+
|
|
717
|
+
# Sanitizes comment nodes by removing them entirely
|
|
718
|
+
#
|
|
719
|
+
# @param node [Nokogiri::XML::Comment] comment node to sanitize
|
|
720
|
+
def sanitize_comment_node(node)
  # Comments are never kept: log the node, then detach it.
  @removed.push(element: node)
  node.remove
end
|
|
724
|
+
|
|
725
|
+
# Executes hooks for a given entry point
|
|
726
|
+
#
|
|
727
|
+
# @param entry_point [Symbol] the hook entry point
|
|
728
|
+
# @param node [Nokogiri::XML::Node] the node being processed
|
|
729
|
+
# @param data [Hash] additional data for the hook
|
|
730
|
+
def execute_hooks(entry_point, node, data = nil)
  registered = @hooks[entry_point]
  # Nothing to do when no hooks exist for this entry point.
  if registered
    registered.each do |hook|
      hook.call(node, data, @config)
    end
  end
end
|
|
736
|
+
|
|
737
|
+
# Helper methods for sanitize_element
|
|
738
|
+
|
|
739
|
+
# Handles the deprecated isindex element by converting it to a form
|
|
740
|
+
#
|
|
741
|
+
# @param node [Nokogiri::XML::Element] the element node
|
|
742
|
+
# @param tag_name [String] the tag name
|
|
743
|
+
# @return [Boolean] true if handled (removed/replaced), false otherwise
|
|
744
|
+
def handle_isindex(node, tag_name)
  return false if tag_name != 'isindex'

  # Insert the replacement (when one could be built) after the node, then drop the node.
  substitute = build_isindex_replacement(node)
  node.add_next_sibling(substitute) if substitute
  @removed.push(element: node)
  node.remove
  true
end
|
|
753
|
+
|
|
754
|
+
# Removes elements that are dangerous in MathML/SVG contexts
|
|
755
|
+
#
|
|
756
|
+
# @param node [Nokogiri::XML::Element] the element node
|
|
757
|
+
# @return [Boolean] true if removed, false otherwise
|
|
758
|
+
def handle_dangerous_math_svg(node)
  if dangerous_in_math_svg?(node)
    # Log the node and detach it from the tree.
    @removed.push(element: node)
    node.remove
    true
  else
    false
  end
end
|
|
765
|
+
|
|
766
|
+
# Checks and handles element namespaces
|
|
767
|
+
#
|
|
768
|
+
# @param node [Nokogiri::XML::Element] the element node
|
|
769
|
+
# @return [Boolean] true if removed due to invalid namespace, false otherwise
|
|
770
|
+
def handle_namespace_check(node)
  href = node.namespace&.href
  return false unless href

  # XHTML, SVG and MathML are the only namespaces left in place.
  accepted = %w[http://www.w3.org/1999/xhtml http://www.w3.org/2000/svg http://www.w3.org/1998/Math/MathML]
  return false if accepted.include?(node.namespace.href)

  if @config.keep_content
    node.children.to_a.each do |child|
      node.add_previous_sibling(child)
    end
  end
  @removed.push(element: node)
  node.remove
  true
end
|
|
779
|
+
|
|
780
|
+
# Handles elements with namespace prefixes
|
|
781
|
+
#
|
|
782
|
+
# @param node [Nokogiri::XML::Element] the element node
|
|
783
|
+
# @param tag_name [String] the tag name
|
|
784
|
+
# @return [Boolean] true if handled (removed), false otherwise
|
|
785
|
+
def handle_prefixed_element(node, tag_name)
  return false unless tag_name.include?(':')

  # partition always yields a String, whereas split(':').first is nil for a
  # name made only of colons and would make downcase raise.
  prefix = tag_name.partition(':').first.downcase

  if @config.keep_content
    if %w[xml xmlns].include?(prefix)
      # For xml:/xmlns: elements only copies of the descendant text nodes are kept.
      text_nodes = []
      node.traverse { |descendant| text_nodes << descendant if descendant.text? }
      text_nodes.each { |text_node| node.add_previous_sibling(text_node.dup) }
    else
      # Any other prefix: move the direct children in front of the element.
      node.children.to_a.each { |child| node.add_previous_sibling(child) }
    end
  end

  # Every prefixed element is logged and removed, whatever its prefix.
  @removed << { element: node }
  node.remove
  true
end
|
|
805
|
+
|
|
806
|
+
# Handles elements that are not allowed by the configuration
|
|
807
|
+
#
|
|
808
|
+
# @param node [Nokogiri::XML::Element] the element node
|
|
809
|
+
# @param tag_name [String] the tag name
|
|
810
|
+
def handle_disallowed_element(node, tag_name)
  if @config.allowed_tags
    # With an explicit tag allowlist the children are not hoisted; a single
    # space is inserted after the element when it had any children
    # (presumably so the surrounding text does not run together).
    node.add_next_sibling(Nokogiri::XML::Text.new(' ', node.document)) if node.children.any?
  elsif @config.keep_content && !forbidden_content?(tag_name)
    # Keep the content: move every child in front of the element.
    node.children.to_a.each { |child| node.add_previous_sibling(child) }
  end

  # The disallowed element is always logged and detached. Previously it was
  # left in the tree (emptied) whenever its children had been hoisted, and
  # removed twice when it had none.
  @removed << { element: node }
  node.remove
end
|
|
825
|
+
|
|
826
|
+
# Removes elements with VML namespace
|
|
827
|
+
#
|
|
828
|
+
# @param node [Nokogiri::XML::Element] the element node
|
|
829
|
+
def handle_vml_namespace(node)
  # Only elements whose own xmlns attribute mentions "vml" (any case) are removed.
  declared = node['xmlns']
  return if declared.nil? || !declared.match?(/vml/i)

  @removed.push(element: node)
  node.remove
end
|
|
835
|
+
|
|
836
|
+
# Helper methods for sanitize_attributes
|
|
837
|
+
|
|
838
|
+
# Normalizes attribute name handling namespaces
|
|
839
|
+
#
|
|
840
|
+
# @param name [String] attribute name
|
|
841
|
+
# @param attr [Nokogiri::XML::Attr] attribute object
|
|
842
|
+
# @return [String] normalized attribute name
|
|
843
|
+
def normalize_attribute_name(name, attr)
  # Attributes outside the xmlns prefix only get their case transformed.
  return transform_case(name) unless attr.namespace&.prefix == 'xmlns'
  return 'xmlns' if name == 'xmlns'

  "xmlns:#{transform_case(name)}"
end
|
|
850
|
+
|
|
851
|
+
# Handles the 'is' attribute by clearing its value
|
|
852
|
+
#
|
|
853
|
+
# @param attr [Nokogiri::XML::Attr] attribute object
|
|
854
|
+
# @param lc_name [String] lowercased attribute name
|
|
855
|
+
def handle_is_attribute(attr, lc_name)
  # The attribute is kept but its value is emptied.
  attr.value = '' if lc_name == 'is'
end
|
|
860
|
+
|
|
861
|
+
# Adds xlink namespace definition if needed
|
|
862
|
+
#
|
|
863
|
+
# @param node [Nokogiri::XML::Element] the element node
|
|
864
|
+
# @param lc_name [String] lowercased attribute name
|
|
865
|
+
def handle_xlink_namespace_definition(node, lc_name)
  if lc_name.start_with?('xlink:')
    begin
      node.add_namespace_definition('xlink', 'http://www.w3.org/1999/xlink')
    rescue StandardError
      # Best effort: a failure to register the namespace is ignored.
      nil
    end
  end
end
|
|
874
|
+
|
|
875
|
+
# Checks if a removed attribute was dangerous enough to warrant removing the element
|
|
876
|
+
#
|
|
877
|
+
# @param lc_name [String] lowercased attribute name
|
|
878
|
+
# @param tag_name [String] tag name
|
|
879
|
+
# @return [Boolean] true if dangerous
|
|
880
|
+
def dangerous_attribute_removed?(lc_name, tag_name)
  # Only href/content on meta/link elements count.
  return false unless %w[href content].include?(lc_name)

  %w[meta link].include?(tag_name)
end
|
|
883
|
+
|
|
884
|
+
# Ensures img tags have an alt attribute if allowed
|
|
885
|
+
#
|
|
886
|
+
# @param node [Nokogiri::XML::Element] the element node
|
|
887
|
+
# @param tag_name [String] tag name
|
|
888
|
+
def ensure_alt_attribute(node, tag_name)
|
|
889
|
+
return unless tag_name == 'img' && @config.allowed_attributes_per_tag.is_a?(Hash)
|
|
890
|
+
|
|
891
|
+
allowed = @config.allowed_attributes_per_tag['img']
|
|
892
|
+
node['alt'] = '' if allowed&.include?('alt') && !node.key?('alt')
|
|
893
|
+
end
|
|
894
|
+
|
|
895
|
+
# Ensures xlink namespace is present if needed
|
|
896
|
+
#
|
|
897
|
+
# @param node [Nokogiri::XML::Element] the element node
|
|
898
|
+
def ensure_xlink_namespace(node)
  unless node['xmlns:xlink']
    # Set the plain attribute first, then try to register the namespace as well.
    node['xmlns:xlink'] = 'http://www.w3.org/1999/xlink'
    begin
      node.add_namespace_definition('xlink', 'http://www.w3.org/1999/xlink')
    rescue StandardError
      # Best effort: a failure to register the namespace is ignored.
      nil
    end
  end
end
|
|
908
|
+
|
|
909
|
+
# Helper methods for valid_attribute?
|
|
910
|
+
|
|
911
|
+
# Checks if an attribute is explicitly forbidden
|
|
912
|
+
#
|
|
913
|
+
# @param attr_name [String] attribute name
|
|
914
|
+
# @return [Boolean] true if forbidden
|
|
915
|
+
def forbidden_attribute?(attr_name)
  denied = @config.forbidden_attributes
  # No configured list yields nil rather than false.
  return if denied.nil?

  denied.include?(attr_name)
end
|
|
918
|
+
|
|
919
|
+
# Checks if an attribute is inherently dangerous
|
|
920
|
+
#
|
|
921
|
+
# @param attr_name [String] attribute name
|
|
922
|
+
# @return [Boolean] true if dangerous
|
|
923
|
+
def dangerous_attribute?(attr_name)
  # Compile the case-insensitive patterns once per instance instead of
  # rebuilding a Regexp for every entry on every attribute check.
  @dangerous_attribute_patterns ||= Attributes::DANGEROUS.map { |source| /#{source}/i }
  @dangerous_attribute_patterns.any? { |pattern| attr_name.match?(pattern) }
end
|
|
926
|
+
|
|
927
|
+
# Checks if an attribute is allowed for a specific tag
|
|
928
|
+
#
|
|
929
|
+
# @param tag_name [String] tag name
|
|
930
|
+
# @param attr_name [String] attribute name
|
|
931
|
+
# @return [Boolean, nil] true/false if determined, nil if no rule found
|
|
932
|
+
def attribute_allowed?(tag_name, attr_name)
  per_tag = @config.allowed_attributes_per_tag
  if per_tag.is_a?(Hash)
    # A per-tag entry, when present, is the only rule consulted for that tag.
    tag_attrs = @config.allowed_attributes_per_tag[tag_name]
    return tag_attrs.any? { |candidate| transform_case(candidate) == attr_name } if tag_attrs
  end

  if @config.allowed_attributes.nil?
    check_default_allowed_attributes(tag_name, attr_name)
  else
    check_global_allowed_attributes(attr_name)
  end
end
|
|
942
|
+
|
|
943
|
+
# Checks global allowed attributes list
|
|
944
|
+
#
|
|
945
|
+
# @param attr_name [String] attribute name
|
|
946
|
+
# @return [Boolean] true if allowed
|
|
947
|
+
def check_global_allowed_attributes(attr_name)
  # Both lists are case-transformed before the lookup.
  names = @config.allowed_attributes.map { |entry| transform_case(entry) }
  extras = @config.additional_attributes
  names += extras.map { |entry| transform_case(entry) } if extras
  names.include?(attr_name)
end
|
|
952
|
+
|
|
953
|
+
# Checks default allowed attributes based on tag type
|
|
954
|
+
#
|
|
955
|
+
# @param tag_name [String] tag name
|
|
956
|
+
# @param attr_name [String] attribute name
|
|
957
|
+
# @return [Boolean, nil] true if allowed, nil otherwise
|
|
958
|
+
# rubocop:disable Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
|
|
959
|
+
def check_default_allowed_attributes(tag_name, attr_name)
  # Case-transformed lookup sets, built on first use.
  @html_attrs ||= Attributes::HTML.map { |a| transform_case(a) }.to_set
  @svg_attrs ||= (Attributes::SVG + Attributes::XML).map { |a| transform_case(a) }.to_set
  @math_attrs ||= (Attributes::MATH_ML + Attributes::XML).map { |a| transform_case(a) }.to_set

  @html_tags_set ||= Tags::HTML.map { |t| transform_case(t) }.to_set
  @svg_tags_set ||= (Tags::SVG + Tags::SVG_FILTERS).map { |t| transform_case(t) }.to_set
  @math_tags_set ||= Tags::MATH_ML.map { |t| transform_case(t) }.to_set

  svg_tag = @svg_tags_set.include?(tag_name)
  math_tag = @math_tags_set.include?(tag_name)
  # A tag that is neither SVG nor MathML is checked against the HTML attributes.
  html_tag = @html_tags_set.include?(tag_name) || (!svg_tag && !math_tag)

  permitted = (svg_tag && @svg_attrs.include?(attr_name)) ||
              (math_tag && @math_attrs.include?(attr_name)) ||
              (html_tag && @html_attrs.include?(attr_name))
  # nil (not false) signals that no default rule allowed the attribute.
  permitted ? true : nil
end
|
|
981
|
+
# rubocop:enable Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
|
|
982
|
+
|
|
983
|
+
# Checks if data attributes are allowed
|
|
984
|
+
#
|
|
985
|
+
# @param attr_name [String] attribute name
|
|
986
|
+
# @return [Boolean] true if allowed
|
|
987
|
+
def data_attribute_allowed?(attr_name)
  enabled = @config.allow_data_attributes
  # When the switch is off its own falsy value is returned.
  return enabled unless enabled

  attr_name.match?(Expressions::DATA_ATTR)
end
|
|
990
|
+
|
|
991
|
+
# Checks if ARIA attributes are allowed
|
|
992
|
+
#
|
|
993
|
+
# @param attr_name [String] attribute name
|
|
994
|
+
# @return [Boolean] true if allowed
|
|
995
|
+
def aria_attribute_allowed?(attr_name)
  enabled = @config.allow_aria_attributes
  # When the switch is off its own falsy value is returned.
  return enabled unless enabled

  attr_name.match?(Expressions::ARIA_ATTR)
end
|
|
998
|
+
|
|
999
|
+
# Validates style attribute value
|
|
1000
|
+
#
|
|
1001
|
+
# @param value [String] attribute value
|
|
1002
|
+
# @return [Boolean] true if valid
|
|
1003
|
+
def valid_style_attribute?(value)
  # A missing value is valid; otherwise the inline-style check decides.
  if value && unsafe_inline_style?(value.to_s)
    false
  else
    true
  end
end
|
|
1008
|
+
|
|
1009
|
+
# Checks for DOM clobbering via attributes
|
|
1010
|
+
#
|
|
1011
|
+
# @param attr_name [String] attribute name
|
|
1012
|
+
# @param value [String] attribute value
|
|
1013
|
+
# @return [Boolean] true if clobbering detected
|
|
1014
|
+
def dom_clobbering_attribute?(attr_name, value)
  return false unless value

  # Convert once so the blank check and the lookup see the same text; the
  # lookup previously called downcase on the raw value and raised for
  # non-String input.
  text = value.to_s
  return false if text.strip.empty?
  return false unless %w[name id].include?(attr_name)

  Attributes::DOM_CLOBBERING.include?(text.downcase)
end
|
|
1018
|
+
|
|
1019
|
+
# Validates URI attributes
|
|
1020
|
+
#
|
|
1021
|
+
# @param tag_name [String] tag name
|
|
1022
|
+
# @note The attribute name is not a parameter; only the tag name, value and allow-state are passed in.
|
|
1023
|
+
# @param value [String] attribute value
|
|
1024
|
+
# @param attr_allowed [Boolean] whether attribute is allowed
|
|
1025
|
+
# @return [Boolean] true if valid
|
|
1026
|
+
# rubocop:disable Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
|
|
1027
|
+
def valid_uri_attribute?(tag_name, value, attr_allowed)
  # Trim ASCII and Unicode whitespace from both ends before any check.
  val = value.to_s
             .sub(/\A[\s\u0085\u00a0\u1680\u180e\u2000-\u200b\u2028\u2029\u205f\u3000]+/, '')
             .sub(/[\s\u0085\u00a0\u1680\u180e\u2000-\u200b\u2028\u2029\u205f\u3000]+\z/, '')
  # Write the trimmed text back into the caller's string when it can be
  # mutated; a frozen string is left alone instead of raising FrozenError.
  value.replace(val) if value.respond_to?(:replace) && !value.frozen? && value != val
  # Any remaining control character makes the value invalid.
  return false if val.match?(/[\x00-\x1f\x7f]/)

  decoded = begin
    URI.decode_www_form_component(val)
  rescue StandardError
    val
  end
  # Escapes such as %ff decode to bytes that are not valid UTF-8, and matching
  # a Regexp against such a string raises ArgumentError. Replace them first.
  decoded = decoded.scrub unless decoded.valid_encoding?

  # The configured allowlist pattern is applied to the trimmed, undecoded value.
  return false if @config.allowed_uri_regexp && !val.match?(@config.allowed_uri_regexp)

  # A nil attr_allowed counts as allowed.
  uri_allowed = attr_allowed.nil? || attr_allowed
  return false if decoded.match?(Expressions::IS_SCRIPT_OR_DATA)

  # \A (not ^): only a value that starts with "data:" is a data URI. With ^
  # an encoded newline followed by "data:" anywhere in the value matched too.
  if decoded.match?(/\Adata:/i)
    return true if uri_allowed && @config.allow_data_uri && data_uri_tags.include?(tag_name)

    return false
  end

  return true if uri_allowed && decoded.match?(Expressions::IS_ALLOWED_URI)
  # IS_SCRIPT_OR_DATA was already rejected above, so it is not tested again.
  return true if uri_allowed && @config.allow_unknown_protocols

  false # Reject invalid URIs or non-allowed URI attributes
end
|
|
1057
|
+
# rubocop:enable Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
|
|
1058
|
+
|
|
1059
|
+
# Fallback check for unknown protocols
|
|
1060
|
+
#
|
|
1061
|
+
# @param value [String] attribute value
|
|
1062
|
+
# @return [Boolean] true if allowed
|
|
1063
|
+
def allow_unknown_protocols_fallback?(value)
  # The fallback only applies when unknown protocols are enabled and a value exists.
  return false unless @config.allow_unknown_protocols && value
  return false if value.match?(Expressions::IS_SCRIPT_OR_DATA)

  # data: values additionally need allow_data_uri.
  data_blocked = value.match?(/^data:/i) && !@config.allow_data_uri
  !data_blocked
end
|
|
1072
|
+
end
|
|
1073
|
+
|
|
1074
|
+
# Builds a new sanitizer instance with optional configuration
|
|
1075
|
+
#
|
|
1076
|
+
# @param cfg [Hash, Config] optional configuration to initialize with
|
|
1077
|
+
# @yield [config] optional block to mutate configuration before use
|
|
1078
|
+
# @return [Sanitizer] a new sanitizer instance
|
|
1079
|
+
def self.new(cfg = {}, &block)
  # Delegates to Sanitizer, passing the configuration and block through unchanged.
  sanitizer_class = Sanitizer
  sanitizer_class.new(cfg, &block)
end
|
|
1082
|
+
|
|
1083
|
+
# Convenience helper to sanitize with a fresh, default-configured instance.
|
|
1084
|
+
#
|
|
1085
|
+
# @param dirty [String, Nokogiri::XML::Node] the input to sanitize
|
|
1086
|
+
# @param cfg [Hash] optional configuration override
|
|
1087
|
+
# @return [String, Nokogiri::XML::Document] sanitized HTML or DOM
|
|
1088
|
+
def self.sanitize(dirty, cfg = {})
  # A new instance is built for every call.
  sanitizer = new(cfg)
  sanitizer.sanitize(dirty)
end
|
|
1091
|
+
|
|
1092
|
+
class << self
  # scrub is a second name for sanitize.
  alias scrub sanitize
end
|
|
1095
|
+
end
|