dandruff 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/dandruff.rb ADDED
@@ -0,0 +1,1095 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'nokogiri'
4
+ require 'set'
5
+ require 'uri'
6
+
7
+ require_relative 'dandruff/version'
8
+ require_relative 'dandruff/config'
9
+ require_relative 'dandruff/tags'
10
+ require_relative 'dandruff/attributes'
11
+ require_relative 'dandruff/expressions'
12
+ require_relative 'dandruff/utils'
13
+
14
+ # Dandruff - A robust HTML sanitizer for Ruby
15
+ #
16
+ # Dandruff is a Ruby implementation inspired by DOMPurify, providing comprehensive XSS protection
17
+ # by sanitizing HTML strings and removing malicious payloads. It's designed for excellent developer
18
+ # experience while maintaining battle-tested security.
19
+ #
20
+ # ## Key Features
21
+ #
22
+ # - **Comprehensive XSS Protection**: Defends against XSS, mXSS, DOM clobbering, and protocol injection
23
+ # - **Flexible Configuration**: Fine-grained control over tags, attributes, and sanitization behavior
24
+ # - **Content Type Profiles**: Pre-configured settings for HTML, SVG, MathML, and HTML email
25
+ # - **Hook System**: Extend sanitization with custom processing logic
26
+ # - **Developer-Friendly API**: Intuitive Ruby idioms with block-based configuration
27
+ # - **Battle-Tested Security**: Based on DOMPurify's proven security model
28
+ # - **Performance Optimized**: Efficient multi-pass sanitization with configurable limits
29
+ #
30
+ # ## Quick Start
31
+ #
32
+ # @example Basic sanitization
33
+ # require 'dandruff'
34
+ #
35
+ # dandruff = Dandruff.new
36
+ # clean = dandruff.sanitize('<script>alert("xss")</script><p>Safe content</p>')
37
+ # # => "<p>Safe content</p>"
38
+ #
39
+ # @example Configure with block
40
+ # dandruff = Dandruff.new do |config|
41
+ # config.allowed_tags = ['p', 'strong', 'em', 'a']
42
+ # config.allowed_attributes = ['href', 'title', 'class']
43
+ # end
44
+ #
45
+ # @example Use convenience class method
46
+ # clean = Dandruff.sanitize(dirty_html, allowed_tags: ['p', 'strong'])
47
+ #
48
+ # @example Profile-based configuration
49
+ # dandruff = Dandruff.new do |config|
50
+ # config.use_profiles = { html: true, svg: true }
51
+ # end
52
+ #
53
+ # @example Per-tag attribute control
54
+ # dandruff = Dandruff.new do |config|
55
+ # config.allowed_attributes_per_tag = {
56
+ # 'a' => ['href', 'title'],
57
+ # 'img' => ['src', 'alt', 'width', 'height']
58
+ # }
59
+ # end
60
+ #
61
+ # @example Custom hooks
62
+ # dandruff = Dandruff.new
63
+ # dandruff.add_hook(:upon_sanitize_attribute) do |node, data, config|
64
+ # # Custom attribute processing
65
+ # if data[:attr_name] == 'data-safe'
66
+ # data[:keep_attr] = true
67
+ # end
68
+ # end
69
+ #
70
+ # ## Security
71
+ #
72
+ # Dandruff protects against multiple attack vectors:
73
+ # - **XSS**: Removes script tags, event handlers, javascript: URIs
74
+ # - **mXSS**: Multi-pass sanitization prevents mutation-based attacks
75
+ # - **DOM Clobbering**: Blocks dangerous id/name attribute values
76
+ # - **Protocol Injection**: Validates URI protocols (javascript:, vbscript:, data:text/html)
77
+ # - **Namespace Confusion**: Prevents mXSS via SVG/MathML namespace attacks
78
+ # - **CSS Injection**: Sanitizes inline styles and style tag content
79
+ #
80
+ # @see https://github.com/kuyio/dandruff GitHub repository
81
+ # @see https://github.com/cure53/DOMPurify Original JavaScript implementation
82
+ # @see Config Configuration options reference
83
+ # @see Sanitizer Core sanitization engine
84
+ module Dandruff
85
# Base error type for failures raised by Dandruff itself.
class Error < StandardError
end
86
+
87
+ # Main sanitizer class handling HTML sanitization logic
88
+ #
89
+ # This class manages the core sanitization process, configuration, and hooks.
90
+ # It parses HTML, removes dangerous elements and attributes, and serializes the result.
91
+ class Sanitizer
92
# Element names that open a MathML or SVG context (consulted by the ancestor walk).
MATH_SVG_TAGS = ['math', 'svg'].freeze
# Read-only access to the removal log, the active configuration and the hook registry.
attr_reader :removed, :config, :hooks
94
+
95
# Creates a sanitizer with an empty removal log, a resolved configuration,
# a fresh hook registry and a cached dependency check.
#
# @param config [Config, Hash, nil] configuration object or options hash;
#   any other value results in a default configuration
# @yield [config] optional block receiving the resolved configuration so it
#   can be adjusted in place
def initialize(config = nil)
  @removed = []
  @config = build_config(config)
  @hooks = create_hooks_map
  @is_supported = check_support
  return unless block_given?

  yield(@config)
end
106
+
107
+ # Hook management
108
# Registers a callback for a hook entry point.
#
# @param entry_point [Symbol] hook name (unknown names get a new list)
# @yield the callback to append; calls without a block are ignored
# @return [Array<Proc>, nil] the callbacks for the entry point, or nil when no block was given
def add_hook(entry_point, &hook_function)
  return unless hook_function.is_a?(Proc)

  (@hooks[entry_point] ||= []) << hook_function
end
114
+
115
# Removes a single callback from a hook entry point.
#
# With +hook_function+ the last registration of that exact callback is
# removed; without it the most recently added callback is popped.
#
# @param entry_point [Symbol] hook name
# @param hook_function [Proc, nil] specific callback to remove
# @return [Proc, nil] the removed callback, or nil when nothing was removed
def remove_hook(entry_point, hook_function = nil)
  registered = @hooks[entry_point]
  return nil unless registered
  return registered.pop unless hook_function

  position = registered.rindex(hook_function)
  position && registered.delete_at(position)
end
128
+
129
# Drops every callback registered for one entry point.
#
# @param entry_point [Symbol] hook name
# @return [Array] the new, empty callback list
def remove_hooks(entry_point)
  @hooks.store(entry_point, [])
end
132
+
133
# Discards all registered callbacks by swapping in a freshly built hook map.
#
# @return [Hash] the new hook map
def remove_all_hooks
  fresh_map = create_hooks_map
  @hooks = fresh_map
end
136
+
137
# Reports whether the dependency check performed at construction succeeded.
#
# @return [Object] the cached result of the support check; truthy when
#   sanitization can run, falsy otherwise
def supported?
  @is_supported
end
143
+
144
# Replaces the active configuration with one parsed from +cfg+.
#
# @param cfg [Hash] configuration options
# @return [Config] the newly installed configuration
def set_config(cfg = {})
  replacement = parse_config(cfg)
  @config = replacement
end
150
+
151
# Yields the active configuration for in-place modification.
#
# @yield [config] the current configuration object
# @return [Sanitizer] self, so calls can be chained
def configure
  return self unless block_given?

  yield(@config)
  self
end
159
+
160
# Resets the active configuration to the defaults produced by parsing an empty hash.
#
# @return [Config] the default configuration now in effect
def clear_config
  defaults = parse_config({})
  @config = defaults
end
164
+
165
# Main sanitization entry point.
#
# Parses the input, sanitizes elements and attributes, serializes the result
# and optionally re-sanitizes until the output stops changing.
#
# @param dirty [String, #to_s, nil] the input to sanitize (non-strings are converted with to_s)
# @param cfg [Hash] optional configuration; when non-empty it REPLACES the
#   instance configuration and stays in effect for later calls
# @return [String, Nokogiri::XML::Node] sanitized HTML, or a parsed
#   document/fragment when return_dom / return_dom_fragment is configured
def sanitize(dirty, cfg = {})
  # Fail open: when the support check failed the input is handed back untouched.
  return dirty unless supported?

  cfg.empty? ? ensure_config : set_config(cfg)
  @removed = []
  return '' if dirty.nil?

  dirty = dirty.to_s unless dirty.is_a?(String)
  return dirty if dirty.strip.empty?

  doc = parse_html(dirty)
  sanitize_document(doc)
  output = serialize_html(doc)
  output = resanitize_until_stable(output) if @config.sanitize_until_stable

  if @config.return_dom
    dom = parse_html(output)
    # parse_html prepends a "<remove></remove>" placeholder when force_body is
    # set; it is the first such element in document order and must not leak
    # into the DOM handed back to the caller.
    dom.at_css('remove')&.remove if @config.force_body
    return dom
  end
  return Nokogiri::HTML5::DocumentFragment.parse(output) if @config.return_dom_fragment

  output
end
195
+ alias_method :scrub, :sanitize
196
+
197
+ private
198
+
199
# Checks whether the Nokogiri dependency is loaded.
#
# @return [Boolean] true when the Nokogiri constant is defined and exposes a
#   version, false otherwise (a strict boolean, matching the documented
#   contract of #supported?)
def check_support
  return false unless defined?(Nokogiri)

  !Nokogiri::VERSION.nil?
end
203
+
204
# Builds an empty hook registry.
#
# @return [Hash{Symbol => Array}] one independent, empty callback list per
#   supported entry point
def create_hooks_map
  %i[
    before_sanitize_attributes
    after_sanitize_attributes
    before_sanitize_elements
    after_sanitize_elements
    upon_sanitize_attribute
    upon_sanitize_element
  ].to_h { |entry_point| [entry_point, []] }
end
217
+
218
# Wraps an options hash in a Config object.
#
# @param cfg [Hash] configuration options
# @return [Config] configuration built from the options
def parse_config(cfg = {})
  Config.new(cfg)
end
222
+
223
# Resolves the constructor argument into a Config.
#
# @param cfg [Hash, Config, Object] options hash, ready-made Config, or anything else
# @return [Config] parsed hash, the given Config itself, or defaults for any other input
def build_config(cfg)
  case cfg
  when Hash then parse_config(cfg)
  when Config then cfg
  else parse_config({})
  end
end
230
+
231
# Guarantees a configuration is present, creating defaults only when none is set.
#
# @return [Config] the active configuration
def ensure_config
  return @config if @config

  @config = parse_config({})
end
235
+
236
# Parses an HTML string with Nokogiri's HTML5 parser.
#
# When force_body is configured a "<remove></remove>" placeholder is
# prepended; in XHTML mode the markup is wrapped in a namespaced html/body
# skeleton. A full document is produced for whole-document style
# configurations or when the markup contains "<frameset"; otherwise a
# fragment is produced.
#
# @param html [String] HTML string to parse
# @return [Nokogiri::XML::Node] parsed document or document fragment
def parse_html(html)
  html = "<remove></remove>#{html}" if @config.force_body

  xhtml_mode = @config.parser_media_type == 'application/xhtml+xml' &&
               @config.namespace == 'http://www.w3.org/1999/xhtml'
  html = "<html xmlns=\"http://www.w3.org/1999/xhtml\"><head></head><body>#{html}</body></html>" if xhtml_mode

  needs_document = @config.whole_document || @config.return_dom ||
                   @config.allow_document_elements || html.match?(/<frameset/i)
  needs_document ? Nokogiri::HTML5.parse(html) : Nokogiri::HTML5.fragment(html)
end
251
+
252
# Walks the parsed tree and sanitizes every node in place.
#
# Always-forbidden containers (script, iframe, frame, frameset, object, embed)
# are removed outright; a frameset also takes its parent with it. Style
# elements are removed unless allowed and judged safe; kept style elements
# still have their attributes sanitized so event handlers such as onload
# cannot ride along. Every removal is recorded in @removed.
#
# @param doc [Nokogiri::XML::Node] document or fragment to sanitize
# @return [void]
def sanitize_document(doc) # rubocop:disable Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
  doc.children.first.remove if @config.force_body && doc.children.first&.name == 'remove'
  execute_hooks(:before_sanitize_elements, doc)
  doc.traverse do |node|
    if node.element? && %w[script iframe frame frameset object embed].include?(node.name)
      @removed << { element: node }
      # Capture the parent before unlinking: a frameset's parent is dropped too.
      parent = node.parent if node.name == 'frameset'
      node.remove
      parent&.remove
    elsif node.element? && node.name == 'style'
      if !@config.allow_style_tags || unsafe_style_node?(node)
        @removed << { element: node }
        node.remove
      else
        sanitize_attributes(node)
      end
    elsif node.element?
      sanitize_element(node)
    elsif node.text? && @config.safe_for_templates
      sanitize_text_node(node)
    elsif node.comment? && @config.safe_for_xml
      sanitize_comment_node(node)
    elsif node.cdata?
      # CDATA sections are flattened to plain text nodes.
      node.replace(Nokogiri::XML::Text.new(node.text, node.document))
    end
  end
  execute_hooks(:after_sanitize_elements, doc)
end
290
+
291
# Sanitizes a single element node.
#
# Special-case handlers run first (isindex, dangerous math/svg children,
# foreign namespaces, prefixed names); the first one that handles the node
# ends processing. Otherwise the element hook fires, disallowed elements are
# dispatched to the disallowed handler, and allowed ones get their attributes
# sanitized followed by the VML namespace check.
#
# @param node [Nokogiri::XML::Element] element to sanitize
def sanitize_element(node)
  tag_name = transform_case(node.name)

  handled = handle_isindex(node, tag_name) ||
            handle_dangerous_math_svg(node) ||
            handle_namespace_check(node) ||
            handle_prefixed_element(node, tag_name)
  return if handled

  execute_hooks(:upon_sanitize_element, node, { tag_name: tag_name })

  unless allowed_element?(tag_name)
    handle_disallowed_element(node, tag_name)
    return
  end

  sanitize_attributes(node)
  handle_vml_namespace(node)
end
309
+
310
# Sanitizes the attributes of an element in place.
#
# For each attribute the :upon_sanitize_attribute hooks receive a data hash
# (:tag_name, :attr_name, :value). Hooks may influence the outcome through
# that hash:
# - data[:value]      replaces the value that is validated and written back
# - data[:keep_attr]  true keeps the attribute without validation,
#                     false removes it; leaving it unset uses normal validation
#
# meta/link elements are removed entirely when a dangerous attribute
# (href/content) had to be stripped.
#
# @param node [Nokogiri::XML::Element] element whose attributes are sanitized
def sanitize_attributes(node)
  tag_name = transform_case(node.name)
  to_remove = []
  dangerous_removed = false
  had_xlink_href = node.key?('xlink:href')

  execute_hooks(:before_sanitize_attributes, node)

  node.attributes.each do |name, attr|
    lc_name = normalize_attribute_name(name, attr)

    handle_is_attribute(attr, lc_name)
    handle_xlink_namespace_definition(node, lc_name)

    had_xlink_href ||= lc_name == 'xlink:href' ||
                       attr.namespace&.href == 'http://www.w3.org/1999/xlink'

    # Keep a reference to the hash so changes made by hooks are visible here.
    hook_data = { tag_name: tag_name, attr_name: lc_name, value: attr.value }
    execute_hooks(:upon_sanitize_attribute, attr, hook_data)

    value = hook_data[:value]
    value = attr.value if value.nil?
    forced = hook_data[:keep_attr]
    keep = forced.nil? ? valid_attribute?(tag_name, lc_name, value) : forced

    if keep
      attr.value = value if value != attr.value
    else
      to_remove << name
      @removed << { attribute: attr, from: node }
      dangerous_removed = true if dangerous_attribute_removed?(lc_name, tag_name)
    end
  end

  # Deletion is deferred so the attribute collection is not mutated mid-iteration.
  to_remove.each { |attr_name| node.delete(attr_name) }

  # Remove meta/link tags entirely if dangerous attributes were removed
  if dangerous_removed && %w[meta link].include?(tag_name)
    node.remove
    return
  end

  ensure_alt_attribute(node, tag_name)
  ensure_xlink_namespace(node) if had_xlink_href || node.key?('xlink:href')

  execute_hooks(:after_sanitize_attributes, node)
end
356
+
357
# Builds the form that stands in for a deprecated isindex element:
# <form><hr><label>prompt<input></label><hr></form>.
#
# The input always gets name="isindex" and copies the source element's label
# attribute when present; attribute order depends on whether the source has a
# src attribute (name first) or not (label first).
#
# @param node [Nokogiri::XML::Element] the isindex element being replaced
# @return [Nokogiri::XML::Node, nil] the replacement form, or nil if building it raised
def build_isindex_replacement(node)
  doc = node.document
  make = ->(tag) { Nokogiri::XML::Node.new(tag, doc) }

  form = make.call('form')
  first_rule = make.call('hr')
  second_rule = make.call('hr')
  label = make.call('label')
  label.content = 'This is a searchable index. Enter search keywords: '
  input = make.call('input')

  attribute_order = node['src'] ? %w[name label] : %w[label name]
  attribute_order.each do |attribute|
    if attribute == 'name'
      input['name'] = 'isindex'
    elsif node['label']
      input['label'] = node['label']
    end
  end

  label.add_child(input)
  [first_rule, label, second_rule].each { |child| form.add_child(child) }
  form
rescue StandardError
  nil
end
380
+
381
# Checks if an element tag is allowed.
#
# Document-structure tags (html/head/body) are rejected unless a
# document-oriented mode is configured. Forbidden tags always lose. When an
# explicit allow list is configured it (plus additional_tags) is
# authoritative; otherwise additional_tags and the default tag set apply.
# Both allowed_tags and additional_tags are case-normalized the same way as
# tag names, in every branch.
#
# @param tag_name [String] the (already case-normalized) tag name to check
# @return [Boolean] true if the tag is allowed, false otherwise
# rubocop:disable Metrics/CyclomaticComplexity
def allowed_element?(tag_name)
  document_mode = @config.whole_document || @config.allow_document_elements || @config.return_dom
  return false if !document_mode && %w[html head body].include?(tag_name)
  return false if @config.forbidden_tags&.include?(tag_name)

  extra = (@config.additional_tags || []).map { |t| transform_case(t) }

  unless @config.allowed_tags.nil?
    allowed = @config.allowed_tags.map { |t| transform_case(t) }
    return allowed.include?(tag_name) || extra.include?(tag_name)
  end

  extra.include?(tag_name) || default_allowed_tags.include?(tag_name)
end
# rubocop:enable Metrics/CyclomaticComplexity
405
+
406
# Decides whether an attribute may stay on an element.
#
# Checks run in a fixed order: forbidden and dangerous names are rejected;
# allowed data-*, aria-* and "is" attributes are accepted; style values are
# validated separately; DOM-clobbering values are rejected when sanitize_dom
# is on; URI-like attributes are validated as URIs; an explicit true/false
# allow-list verdict is returned as-is; anything else falls back to the
# additional-attributes list and the unknown-protocol check.
#
# @param tag_name [String] the element tag name
# @param attr_name [String] the attribute name
# @param value [String] the attribute value
# @return [Boolean] true if the attribute is valid, false otherwise
# rubocop:disable Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
def valid_attribute?(tag_name, attr_name, value)
  return false if forbidden_attribute?(attr_name) || dangerous_attribute?(attr_name)

  attr_allowed = attribute_allowed?(tag_name, attr_name)

  return true if data_attribute_allowed?(attr_name) ||
                 aria_attribute_allowed?(attr_name) ||
                 attr_name == 'is'

  if attr_name == 'style'
    # A nil verdict means "no opinion", which still lets the style value be validated.
    style_permitted = attr_allowed || attr_allowed.nil?
    return style_permitted ? valid_style_attribute?(value) : false
  end

  return false if @config.sanitize_dom && dom_clobbering_attribute?(attr_name, value)
  return valid_uri_attribute?(tag_name, value, attr_allowed) if uri_like?(attr_name) && value
  return attr_allowed if [true, false].include?(attr_allowed)

  # Default permissive checks
  return true if @config.additional_attributes&.include?(attr_name)

  allow_unknown_protocols_fallback?(value)
end
# rubocop:enable Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
447
+
448
# Tells whether an attribute is expected to carry a URI.
#
# @param attr_name [String] the attribute name
# @return [Boolean, nil] true when the name is in the built-in or configured
#   URI attribute lists; false or nil otherwise
def uri_like?(attr_name)
  return true if default_uri_safe_attributes.include?(attr_name)

  @config.additional_uri_safe_attributes&.include?(attr_name)
end
455
+
456
# Detects CSS that can execute script or pull in external resources.
#
# The value is normalized before matching: CSS hex escapes are decoded,
# leftover backslashes and all whitespace are removed, and only then is the
# text lowercased, so an escaped uppercase letter (e.g. "\45 xpression")
# cannot slip past the case-sensitive checks.
#
# @param value [String, nil] inline style value or stylesheet text
# @return [Boolean] true when a dangerous construct is present
def unsafe_inline_style?(value)
  # Decode CSS hex escapes to surface hidden protocol names
  decoded = value.to_s.gsub(/\\([0-9a-f]{1,6})\s?/i) do
    code_point = ::Regexp.last_match(1).to_i(16)
    # NUL, surrogates and out-of-range values are not valid scalar values;
    # substitute U+FFFD so the string stays valid UTF-8.
    invalid = code_point.zero? || code_point > 0x10FFFF || (0xD800..0xDFFF).cover?(code_point)
    invalid ? "\uFFFD" : [code_point].pack('U')
  rescue StandardError
    ''
  end
  normalized = decoded.delete('\\').gsub(/\s+/, '').downcase

  # Dangerous: javascript protocol combined with background
  return true if normalized.include?('javascript:') && normalized.include?('background')
  return true if normalized.include?('vbscript:')

  # Dangerous: expression() (IE)
  return true if normalized.include?('expression(')

  # Dangerous: @import in any form (url(...) or a bare string) loads external stylesheets
  return true if normalized.include?('@import')

  # Dangerous: data:text/html (can contain scripts)
  return true if normalized.include?('data:text/html')

  # behavior:, binding: and data:image/svg+xml are deliberately not flagged here.
  false
end
487
+
488
# Filters an inline style value down to allow-listed, safe declarations.
#
# The whole value is rejected when it looks dangerous or contains CSS hex
# escapes. Declarations are split on semicolons that sit outside parentheses
# and quotes, so values such as url(data:image/png;base64,...) survive intact.
#
# @param value [String] raw style attribute value
# @return [String, nil] "prop:value; prop:value" with lowercased property
#   names, or nil when nothing safe remains
def sanitize_style_value(value)
  return nil if unsafe_inline_style?(value)
  return nil if value.match?(/\\[0-9a-f]{1,6}/i)

  sanitized = split_style_declarations(value).filter_map do |declaration|
    prop, val = declaration.split(':', 2).map(&:strip)
    next if prop.nil? || val.nil?

    lc_prop = prop.downcase
    next unless allowed_style_properties.include?(lc_prop)
    # reject dangerous urls/protocols in values
    next if unsafe_inline_style?(val)

    "#{lc_prop}:#{val}"
  end

  sanitized.empty? ? nil : sanitized.join('; ')
end

# Splits CSS text into declarations on top-level semicolons only.
# A trailing declaration with an unclosed quote or parenthesis is discarded.
def split_style_declarations(css)
  declarations = []
  buffer = +''
  depth = 0
  quote = nil
  escaped = false

  css.each_char do |char|
    if escaped
      escaped = false
    elsif char == '\\'
      escaped = true
    elsif quote
      quote = nil if char == quote
    elsif ['"', "'"].include?(char)
      quote = char
    elsif char == '('
      depth += 1
    elsif char == ')'
      depth -= 1 if depth.positive?
    elsif char == ';' && depth.zero?
      declarations << buffer
      buffer = +''
      next
    end
    buffer << char
  end

  declarations << buffer if quote.nil? && depth.zero?
  declarations.map(&:strip).reject(&:empty?)
end

# Allow-listed CSS property names, built once per sanitizer instance.
def allowed_style_properties
  @allowed_style_properties ||= Set.new(%w[
    align-content align-items align-self all animation animation-delay animation-direction animation-duration
    animation-fill-mode animation-iteration-count animation-name animation-play-state animation-timing-function
    background background-clip background-color background-image background-origin background-position
    background-repeat background-size border border-bottom border-bottom-color border-bottom-style
    border-bottom-width
    border-collapse border-color border-image border-left border-left-color border-left-style border-left-width
    border-radius border-right border-right-color border-right-style border-right-width border-spacing border-style
    border-top border-top-color border-top-style border-top-width border-width bottom box-shadow box-sizing
    caption-side clear clip color column-count column-fill column-gap column-rule column-rule-color
    column-rule-style
    column-rule-width column-span column-width columns content cursor direction display empty-cells filter flex
    flex-basis flex-direction flex-flow flex-grow flex-shrink flex-wrap float font font-family font-size
    font-size-adjust font-stretch font-style font-variant font-weight gap grid grid-area grid-auto-columns
    grid-auto-flow grid-auto-rows grid-column grid-column-end grid-column-gap grid-column-start grid-gap grid-row
    grid-row-end grid-row-gap grid-row-start grid-template grid-template-areas grid-template-columns
    grid-template-rows
    height justify-content left letter-spacing line-height list-style list-style-image
    list-style-position list-style-type
    margin margin-bottom margin-left margin-right margin-top max-height max-width min-height min-width opacity order
    outline outline-color outline-offset outline-style outline-width overflow overflow-x overflow-y padding
    padding-bottom padding-left padding-right padding-top page-break-after page-break-before page-break-inside
    perspective perspective-origin pointer-events position quotes resize right row-gap table-layout text-align
    text-align-last text-decoration text-decoration-color text-decoration-line text-decoration-style text-indent
    text-justify text-overflow text-shadow text-transform top transform transform-origin transition transition-delay
    transition-duration transition-property transition-timing-function unicode-bidi vertical-align visibility
    white-space width word-break word-spacing word-wrap writing-mode z-index
  ]).freeze
end
538
+
539
# Applies the inline-style danger check to stylesheet text.
#
# @param content [String, nil] contents of a style block
# @return [Boolean] false for nil/blank content, otherwise the verdict of the inline-style check
def unsafe_style_block?(content)
  blank = content.nil? || content.strip.empty?
  blank ? false : unsafe_inline_style?(content)
end
544
+
545
# Determines whether any ancestor of the node is a math or svg element.
#
# The walk starts at the node's parent and stops at the first ancestor that
# does not expose a parent.
#
# @param node [Nokogiri::XML::Node] node whose ancestry is inspected
# @return [Boolean] true if inside a math or svg element
def in_math_or_svg_context?(node)
  ancestor = node.parent
  while ancestor
    is_element = ancestor.respond_to?(:element?) && ancestor.element?
    return true if is_element && MATH_SVG_TAGS.include?(ancestor.name.downcase)
    return false unless ancestor.respond_to?(:parent)

    ancestor = ancestor.parent
  end
  false
end
561
+
562
# Flags elements that enable mutation XSS when nested in MathML/SVG.
#
# style, title and mglyph are treated as dangerous, but only for element
# nodes that actually sit inside a math or svg ancestor.
#
# @param node [Nokogiri::XML::Node] node to check
# @return [Boolean] true if the element must be removed in this context
def dangerous_in_math_svg?(node)
  return false unless node.element?

  risky = %w[style title mglyph].include?(node.name.downcase)
  return false unless in_math_or_svg_context?(node)

  risky
end
578
+
579
# Judges whether a style element must be dropped based on where it sits.
#
# With whole_document and allow_style_tags both enabled, only style elements
# directly under option/select are unsafe. Otherwise top-level style elements
# (no parent, document/fragment root, html, head or body), style under
# option/select, and style whose content contains "<" or child elements are
# all unsafe.
#
# @param node [Nokogiri::XML::Element] the style element
# @return [Boolean] true when the element should be removed
def unsafe_style_node?(node)
  parent_name = node.parent&.name
  inside_select = %w[option select].include?(parent_name)

  return inside_select if @config.whole_document && @config.allow_style_tags

  top_level = parent_name.nil? ||
              ['#document', '#document-fragment', 'html', 'head', 'body'].include?(parent_name)
  return true if top_level || inside_select

  node.content.include?('<') || node.element_children.any?
end
601
+
602
# Re-runs parse/sanitize/serialize on already sanitized output until it stops
# changing or the configured pass budget is used up.
#
# The initial sanitization counts as pass one, so at most
# mutation_max_passes - 1 additional passes are made.
#
# @param html [String] output of the first sanitization pass
# @return [String] the stabilized (or last produced) output
def resanitize_until_stable(html)
  max_passes = @config.mutation_max_passes.to_i
  return html if max_passes <= 1

  current = html
  (max_passes - 1).times do
    doc = parse_html(current)
    sanitize_document(doc)
    candidate = serialize_html(doc)
    break if candidate == current

    current = candidate
  end
  current
end
619
+
620
# Serializes the sanitized tree back to an HTML string and post-processes it.
#
# Post-processing strips leading newlines, expands self-closing SVG tags,
# restores the "&unknown;" entity, deletes entity-encoded script blocks
# (including ones that span several lines) and, outside document-oriented
# modes, removes html/head/body wrapper tags.
#
# @param doc [#to_html, #to_s] document or fragment to serialize
# @return [String] HTML string
def serialize_html(doc)
  result = doc.respond_to?(:to_html) ? doc.to_html : doc.to_s
  result = result.sub(/\A\n+/, '')
  result = fix_svg_self_closing_tags(result).gsub('&amp;unknown;', '&unknown;')
  # Remove encoded script blocks; /m lets ".*?" cross line breaks.
  result = result.gsub(%r{&lt;script&gt;.*?&lt;/script&gt;}im, '')

  document_mode = @config.whole_document || @config.allow_document_elements || @config.return_dom
  result = result.gsub(%r{</?(?:html|head|body)(?:\s[^>]*)?>}i, '') unless document_mode
  result
end
635
+
636
# Expands self-closing SVG tags (<rect .../>) into explicit open/close pairs.
#
# Only exact tag names are rewritten: the name must be followed by whitespace
# or "/", so "<glyph/>" is not mistaken for "<g/>" and "<linearGradient/>" is
# not mistaken for "<line/>".
#
# @param html [String] serialized markup
# @return [String] markup with the listed tags expanded
def fix_svg_self_closing_tags(html)
  pattern = %r{<(circle|ellipse|line|path|polygon|polyline|rect|stop|use|feimage|mask|g|defs)(?=[\s/])([^>]*)/>}
  html.gsub(pattern) do
    tag = Regexp.last_match(1)
    "<#{tag}#{Regexp.last_match(2)}></#{tag}>"
  end
end
642
+
643
# Normalizes the case of a tag or attribute name.
#
# Names are left untouched when the parser media type is
# application/xhtml+xml and lowercased in every other case (including when no
# configuration is set).
#
# @param str [String] name to normalize
# @return [String] the normalized name
def transform_case(str)
  return str if @config&.parser_media_type == 'application/xhtml+xml'

  str.downcase
end
650
+
651
# Returns the default set of allowed tags for the active configuration.
#
# The minimal profile yields the minimal HTML tags plus text tags; otherwise
# HTML, SVG, SVG filter, MathML and text tags are combined. The set is cached
# and rebuilt whenever the settings it depends on (minimal_profile, parser
# media type) differ from those it was built with, so configuration changes
# made after the first sanitization take effect.
#
# @return [Set<String>] case-normalized tag names
def default_allowed_tags
  cache_key = [@config.minimal_profile ? true : false, @config.parser_media_type]
  return @default_allowed_tags if @default_allowed_tags && @default_allowed_tags_key == cache_key

  groups = if @config.minimal_profile
             [Tags::MINIMAL_HTML, Tags::TEXT]
           else
             [Tags::HTML, Tags::SVG, Tags::SVG_FILTERS, Tags::MATH_ML, Tags::TEXT]
           end

  @default_allowed_tags_key = cache_key
  @default_allowed_tags = groups.each_with_object(Set.new) do |group, tags|
    group.each { |tag| tags << transform_case(tag) }
  end
end
667
+
668
# Returns the built-in attribute names whose values are treated as URIs.
#
# @return [Set<String>] memoized set of URI-carrying attribute names
def default_uri_safe_attributes
  return @default_uri_safe_attributes if @default_uri_safe_attributes

  names = %w[href src xlink:href action formaction cite data poster background srcset]
  @default_uri_safe_attributes = Set.new(names)
end
675
+
676
# Tells whether the content of a removed tag must be dropped along with it.
#
# @param tag_name [String] the tag name to check
# @return [Boolean, nil] true when the tag is in the built-in or configured
#   forbid-contents lists; false or nil otherwise
def forbidden_content?(tag_name)
  return true if default_forbid_contents.include?(tag_name)

  @config.forbid_contents&.include?(tag_name)
end
683
+
684
# Returns the built-in tags whose content is discarded when the tag is removed.
#
# @return [Set<String>] memoized set of tag names
def default_forbid_contents
  return @default_forbid_contents if @default_forbid_contents

  names = %w[
    annotation-xml audio colgroup desc foreignobject head iframe math mi mn
    mo ms mtext noembed noframes noscript plaintext script style svg template thead title video xmp
  ]
  @default_forbid_contents = Set.new(names)
end
691
+
692
# Returns the tags that may carry data: URIs.
#
# The built-in list is extended with the configured add_data_uri_tags. The
# result is cached and rebuilt whenever the configured extras differ from the
# ones it was built with, so later configuration changes are honored.
#
# @return [Set<String>] tag names allowed to have data URIs
def data_uri_tags
  extras = @config.add_data_uri_tags
  # Snapshot the extras so in-place edits to the configured list are detected too.
  cache_key = extras ? extras.to_a.dup : nil
  return @data_uri_tags if @data_uri_tags && @data_uri_tags_key == cache_key

  tags = Set.new(%w[audio video img source image track])
  tags.merge(extras) if extras
  @data_uri_tags_key = cache_key
  @data_uri_tags = tags
end
702
+
703
# Blanks out template expressions (mustache, ERB, template-literal) in a text node.
#
# Each match is replaced by a single space. When anything changed, a copy of
# the original node is logged in @removed before the node content is rewritten.
#
# @param node [Nokogiri::XML::Text] text node to sanitize
def sanitize_text_node(node)
  original = node.content
  patterns = [Expressions::MUSTACHE_EXPR, Expressions::ERB_EXPR, Expressions::TMPLIT_EXPR]
  scrubbed = patterns.reduce(original) { |text, pattern| text.gsub(pattern, ' ') }
  return if scrubbed == original

  @removed << { element: node.dup }
  node.content = scrubbed
end
716
+
717
# Removes a comment node from the tree and records it in the removal log.
#
# @param node [Nokogiri::XML::Comment] comment node to remove
def sanitize_comment_node(node)
  @removed.push({ element: node })
  node.remove
end
724
+
725
# Invokes every callback registered for an entry point, in registration order.
#
# Each callback receives the node, the data hash and the active configuration.
#
# @param entry_point [Symbol] the hook entry point
# @param node [Nokogiri::XML::Node] the node being processed
# @param data [Hash, nil] additional data for the hook
def execute_hooks(entry_point, node, data = nil)
  callbacks = @hooks[entry_point]
  return unless callbacks

  callbacks.each { |callback| callback.call(node, data, @config) }
end
736
+
737
+ # Helper methods for sanitize_element
738
+
739
# Replaces a deprecated isindex element with an equivalent form.
#
# The replacement (when one could be built) is inserted right after the
# element; the element itself is logged in @removed and unlinked.
#
# @param node [Nokogiri::XML::Element] the element node
# @param tag_name [String] the tag name
# @return [Boolean] true if the node was an isindex element and was handled, false otherwise
def handle_isindex(node, tag_name)
  return false if tag_name != 'isindex'

  substitute = build_isindex_replacement(node)
  node.add_next_sibling(substitute) if substitute
  @removed.push({ element: node })
  node.remove
  true
end
753
+
754
# Removes an element that is dangerous inside a MathML/SVG context.
#
# @param node [Nokogiri::XML::Element] the element node
# @return [Boolean] true if the node was logged and removed, false otherwise
def handle_dangerous_math_svg(node)
  if dangerous_in_math_svg?(node)
    @removed.push({ element: node })
    node.remove
    true
  else
    false
  end
end
765
+
766
# Removes elements that live in a namespace other than XHTML, SVG or MathML.
#
# Elements without a namespace href are left alone. When keep_content is
# enabled the element's children are hoisted in front of it before removal.
#
# @param node [Nokogiri::XML::Element] the element node
# @return [Boolean] true if removed due to a foreign namespace, false otherwise
def handle_namespace_check(node)
  href = node.namespace&.href
  return false unless href

  known = ['http://www.w3.org/1999/xhtml', 'http://www.w3.org/2000/svg', 'http://www.w3.org/1998/Math/MathML']
  return false if known.include?(href)

  node.children.to_a.each { |child| node.add_previous_sibling(child) } if @config.keep_content
  @removed.push({ element: node })
  node.remove
  true
end
779
+
780
# Removes elements whose tag name carries a namespace prefix ("prefix:name").
#
# With keep_content enabled, content is preserved before removal: for the
# reserved xml/xmlns prefixes copies of all descendant text nodes are placed
# in front of the element; for any other prefix the element's children are
# moved in front of it.
#
# @param node [Nokogiri::XML::Element] the element node
# @param tag_name [String] the tag name
# @return [Boolean] true if the element was prefixed and removed, false otherwise
def handle_prefixed_element(node, tag_name)
  return false unless tag_name.include?(':')

  prefix = tag_name.split(':').first.downcase
  if @config.keep_content
    if %w[xml xmlns].include?(prefix)
      text_nodes = []
      node.traverse { |descendant| text_nodes << descendant if descendant.text? }
      text_nodes.each { |text_node| node.add_previous_sibling(text_node.dup) }
    else
      node.children.to_a.each { |child| node.add_previous_sibling(child) }
    end
  end

  @removed.push({ element: node })
  node.remove
  true
end
805
+
806
# Removes an element that the configuration does not allow.
#
# - keep_content on, content not forbidden, no explicit allow list: children
#   are hoisted in front of the element so their content survives.
# - explicit allow list and the element has children: a single space is
#   inserted after the element so neighbouring text does not run together.
#
# In every case the element itself is logged in @removed and unlinked, so a
# disallowed tag (and its unsanitized attributes) never stays in the tree
# after its children were hoisted.
#
# @param node [Nokogiri::XML::Element] the element node
# @param tag_name [String] the tag name
def handle_disallowed_element(node, tag_name)
  if @config.keep_content && !forbidden_content?(tag_name) && !@config.allowed_tags
    node.children.to_a.each { |child| node.add_previous_sibling(child) }
  elsif @config.allowed_tags && node.children.any?
    node.add_next_sibling(Nokogiri::XML::Text.new(' ', node.document))
  end

  @removed << { element: node }
  node.remove
end
825
+
826
# Removes an element whose xmlns attribute mentions VML (case-insensitive).
#
# @param node [Nokogiri::XML::Element] the element node
def handle_vml_namespace(node)
  declared = node['xmlns']
  return unless declared&.match?(/vml/i)

  @removed.push({ element: node })
  node.remove
end
835
+
836
+ # Helper methods for sanitize_attributes
837
+
838
# Produces the canonical name used to validate an attribute.
#
# Attributes in the "xmlns" namespace prefix are reported as "xmlns" (for the
# bare declaration) or "xmlns:<name>"; all other names are just case-normalized.
#
# @param name [String] attribute name
# @param attr [Nokogiri::XML::Attr] attribute object
# @return [String] normalized attribute name
def normalize_attribute_name(name, attr)
  return transform_case(name) unless attr.namespace&.prefix == 'xmlns'
  return 'xmlns' if name == 'xmlns'

  "xmlns:#{transform_case(name)}"
end
850
+
851
# Blanks the value of an `is` attribute; any other attribute is untouched.
#
# @param attr [Nokogiri::XML::Attr] attribute object
# @param lc_name [String] lowercased attribute name
# @return [String, nil] the empty string when the value was cleared, else nil
def handle_is_attribute(attr, lc_name)
  attr.value = '' if lc_name == 'is'
end
860
+
861
# Declares the `xlink` namespace on the node when the attribute being
# processed uses the `xlink:` prefix.
#
# The declaration is best-effort: any StandardError raised while adding the
# namespace definition is swallowed and nil is returned.
#
# @param node [Nokogiri::XML::Element] the element node
# @param lc_name [String] lowercased attribute name
# @return [Object, nil] result of `add_namespace_definition`, or nil
def handle_xlink_namespace_definition(node, lc_name)
  if lc_name.start_with?('xlink:')
    begin
      node.add_namespace_definition('xlink', 'http://www.w3.org/1999/xlink')
    rescue StandardError
      nil
    end
  end
end
874
+
875
# Tells whether stripping this attribute should take the whole element with
# it: true only for `href` or `content` on a `meta` or `link` tag.
#
# @param lc_name [String] lowercased attribute name
# @param tag_name [String] tag name
# @return [Boolean] true if dangerous
def dangerous_attribute_removed?(lc_name, tag_name)
  return false unless %w[href content].include?(lc_name)

  %w[meta link].include?(tag_name)
end
883
+
884
# Gives an `img` element an empty `alt` attribute when it has none.
#
# This only happens when `allowed_attributes_per_tag` is a Hash whose `'img'`
# entry includes `'alt'`; an existing `alt` is never overwritten.
#
# @param node [Nokogiri::XML::Element] the element node
# @param tag_name [String] tag name
# @return [String, nil] the empty string when `alt` was added, else nil
def ensure_alt_attribute(node, tag_name)
  return unless tag_name == 'img'

  per_tag = @config.allowed_attributes_per_tag
  return unless per_tag.is_a?(Hash)

  img_attrs = per_tag['img']
  return unless img_attrs&.include?('alt')
  return if node.key?('alt')

  node['alt'] = ''
end
894
+
895
# Makes sure the node carries an `xmlns:xlink` declaration.
#
# Nothing happens when the attribute is already set. Otherwise the attribute
# is written and a namespace definition is added on a best-effort basis
# (a StandardError from the latter is swallowed).
#
# @param node [Nokogiri::XML::Element] the element node
# @return [Object, nil] result of `add_namespace_definition`, or nil
def ensure_xlink_namespace(node)
  return if node['xmlns:xlink']

  xlink_uri = 'http://www.w3.org/1999/xlink'
  node['xmlns:xlink'] = xlink_uri
  begin
    node.add_namespace_definition('xlink', xlink_uri)
  rescue StandardError
    nil
  end
end
908
+
909
+ # Helper methods for valid_attribute?
910
+
911
# Reports whether the attribute appears in the configured forbidden list.
#
# @param attr_name [String] attribute name
# @return [Boolean, nil] membership result, or nil when no forbidden list is configured
def forbidden_attribute?(attr_name)
  denylist = @config.forbidden_attributes
  denylist&.include?(attr_name)
end
918
+
919
# Reports whether the attribute name matches any entry of
# `Attributes::DANGEROUS`.
#
# Each entry is interpolated into a case-insensitive regular expression and
# tested against the name, so an entry matches anywhere in the name unless
# the entry itself carries anchors.
#
# @param attr_name [String] attribute name
# @return [Boolean] true if dangerous
def dangerous_attribute?(attr_name)
  Attributes::DANGEROUS.each do |pattern|
    return true if attr_name.match?(/#{pattern}/i)
  end
  false
end
926
+
927
# Decides whether an attribute may appear on a given tag.
#
# Resolution order:
# 1. a per-tag rule from `allowed_attributes_per_tag` (when that is a Hash
#    and has an entry for the tag) decides alone;
# 2. otherwise a configured global `allowed_attributes` list decides;
# 3. otherwise the built-in defaults for the tag's family are consulted.
#
# @param tag_name [String] tag name
# @param attr_name [String] attribute name
# @return [Boolean, nil] true/false if determined, nil if no rule found
def attribute_allowed?(tag_name, attr_name)
  per_tag = @config.allowed_attributes_per_tag
  tag_rules = per_tag[tag_name] if per_tag.is_a?(Hash)
  return tag_rules.map { |a| transform_case(a) }.include?(attr_name) if tag_rules

  if @config.allowed_attributes.nil?
    check_default_allowed_attributes(tag_name, attr_name)
  else
    check_global_allowed_attributes(attr_name)
  end
end
942
+
943
# Tests an attribute against the configured global allow-list, extended by
# `additional_attributes` when present. Every entry is passed through
# `transform_case` before comparison.
#
# @param attr_name [String] attribute name
# @return [Boolean] true if allowed
def check_global_allowed_attributes(attr_name)
  permitted = @config.allowed_attributes.map { |a| transform_case(a) }
  extras = @config.additional_attributes
  permitted.concat(extras.map { |a| transform_case(a) }) if extras
  permitted.include?(attr_name)
end
952
+
953
# Tests an attribute against the built-in attribute lists for the tag's
# family (SVG, MathML, HTML).
#
# The case-transformed tag and attribute sets are built once and memoized in
# instance variables. A tag that is neither an SVG nor a MathML tag is
# treated as HTML, whether or not it is in the HTML tag list.
#
# @param tag_name [String] tag name
# @param attr_name [String] attribute name
# @return [Boolean, nil] true if allowed, nil otherwise
# rubocop:disable Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
def check_default_allowed_attributes(tag_name, attr_name)
  @html_attrs ||= Attributes::HTML.map { |a| transform_case(a) }.to_set
  @svg_attrs ||= (Attributes::SVG + Attributes::XML).map { |a| transform_case(a) }.to_set
  @math_attrs ||= (Attributes::MATH_ML + Attributes::XML).map { |a| transform_case(a) }.to_set

  @html_tags_set ||= Tags::HTML.map { |t| transform_case(t) }.to_set
  @svg_tags_set ||= (Tags::SVG + Tags::SVG_FILTERS).map { |t| transform_case(t) }.to_set
  @math_tags_set ||= Tags::MATH_ML.map { |t| transform_case(t) }.to_set

  svg_tag = @svg_tags_set.include?(tag_name)
  math_tag = @math_tags_set.include?(tag_name)
  # Unrecognized tags fall back to the HTML attribute list.
  html_tag = @html_tags_set.include?(tag_name) || (!svg_tag && !math_tag)

  permitted = (svg_tag && @svg_attrs.include?(attr_name)) ||
              (math_tag && @math_attrs.include?(attr_name)) ||
              (html_tag && @html_attrs.include?(attr_name))
  permitted ? true : nil
end
# rubocop:enable Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
982
+
983
# Reports whether the name is an acceptable `data-*` attribute: the
# `allow_data_attributes` switch must be on and the name must match
# `Expressions::DATA_ATTR`.
#
# @param attr_name [String] attribute name
# @return [Boolean] true if allowed (the falsy switch value itself when disabled)
def data_attribute_allowed?(attr_name)
  enabled = @config.allow_data_attributes
  enabled ? attr_name.match?(Expressions::DATA_ATTR) : enabled
end
990
+
991
# Reports whether the name is an acceptable `aria-*` attribute: the
# `allow_aria_attributes` switch must be on and the name must match
# `Expressions::ARIA_ATTR`.
#
# @param attr_name [String] attribute name
# @return [Boolean] true if allowed (the falsy switch value itself when disabled)
def aria_attribute_allowed?(attr_name)
  enabled = @config.allow_aria_attributes
  enabled ? attr_name.match?(Expressions::ARIA_ATTR) : enabled
end
998
+
999
# Accepts a `style` attribute value unless `unsafe_inline_style?` flags it.
# A nil value is accepted without being inspected.
#
# @param value [String] attribute value
# @return [Boolean] true if valid
def valid_style_attribute?(value)
  !(value && unsafe_inline_style?(value.to_s))
end
1008
+
1009
# Detects DOM clobbering through `name` / `id` attributes.
#
# The value is coerced to a String once and that same String is used both for
# the blank check and for the case-insensitive lookup in
# `Attributes::DOM_CLOBBERING`. Previously the lookup called `downcase` on the
# raw value, so a non-String value either raised or was never matched.
#
# @param attr_name [String] attribute name
# @param value [String] attribute value
# @return [Boolean] true if clobbering detected
def dom_clobbering_attribute?(attr_name, value)
  return false unless %w[name id].include?(attr_name)

  text = value.to_s
  return false if text.strip.empty?

  Attributes::DOM_CLOBBERING.include?(text.downcase)
end
1018
+
1019
# Validates the value of a URI-carrying attribute.
#
# Steps, in order:
# 1. strip leading/trailing ASCII and Unicode whitespace; the trimmed text is
#    written back into `value` when it is a mutable object responding to
#    `replace` (a frozen string is left alone instead of raising FrozenError);
# 2. reject values containing ASCII control characters;
# 3. reject values that do not match `allowed_uri_regexp`, when one is set
#    (checked against the trimmed, still-encoded value);
# 4. reject percent-decoded values matching `Expressions::IS_SCRIPT_OR_DATA`;
# 5. accept `data:` values only with `allow_data_uri` on a tag listed by
#    `data_uri_tags`;
# 6. accept values matching `Expressions::IS_ALLOWED_URI`, or any remaining
#    value when `allow_unknown_protocols` is on.
#
# Steps 5 and 6 additionally require that the attribute is not explicitly
# disallowed (`attr_allowed` nil counts as allowed).
#
# @param tag_name [String] tag name
# @param value [String] attribute value (may be trimmed in place)
# @param attr_allowed [Boolean, nil] whether attribute is allowed
# @return [Boolean] true if valid
# rubocop:disable Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
def valid_uri_attribute?(tag_name, value, attr_allowed)
  leading_space_pattern = /\A[\s\u0085\u00a0\u1680\u180e\u2000-\u200b\u2028\u2029\u205f\u3000]+/
  trailing_space_pattern = /[\s\u0085\u00a0\u1680\u180e\u2000-\u200b\u2028\u2029\u205f\u3000]+\z/
  val = value.to_s.sub(leading_space_pattern, '').sub(trailing_space_pattern, '')

  # Propagate the trimmed text to the caller's object, but never try to
  # mutate a frozen one.
  value.replace(val) if value.respond_to?(:replace) && !value.frozen? && value != val
  return false if val.match?(/[\x00-\x1f\x7f]/)

  # Malformed percent-encoding falls back to the raw (trimmed) value.
  decoded = begin
    URI.decode_www_form_component(val)
  rescue StandardError
    val
  end
  return false if @config.allowed_uri_regexp && !val.match?(@config.allowed_uri_regexp)

  uri_allowed = attr_allowed.nil? || attr_allowed # default to allowed if not explicitly set
  return false if decoded.match?(Expressions::IS_SCRIPT_OR_DATA)

  # NOTE(review): `^` anchors at the start of any line and `decoded` can
  # contain newlines produced by decoding (e.g. %0A); confirm whether `\A`
  # is what is intended here.
  if decoded.match?(/^data:/i)
    return true if uri_allowed && @config.allow_data_uri && data_uri_tags.include?(tag_name)

    return false
  end

  return true if uri_allowed && decoded.match?(Expressions::IS_ALLOWED_URI)
  # Script/data schemes were already rejected above, so no second check is needed.
  return true if uri_allowed && @config.allow_unknown_protocols

  false # Reject invalid URIs or non-allowed URI attributes
end
# rubocop:enable Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
1058
+
1059
# Last-resort acceptance of a value with an unrecognized protocol.
#
# Returns true only when `allow_unknown_protocols` is on, a value is present,
# the value does not match `Expressions::IS_SCRIPT_OR_DATA`, and it is not a
# `data:` value while `allow_data_uri` is off.
#
# @param value [String] attribute value
# @return [Boolean] true if allowed
def allow_unknown_protocols_fallback?(value)
  return false unless @config.allow_unknown_protocols && value
  return false if value.match?(Expressions::IS_SCRIPT_OR_DATA)
  return false if value.match?(/^data:/i) && !@config.allow_data_uri

  true
end
1072
+ end
1073
+
1074
# Factory for sanitizer instances: `Dandruff.new` forwards its arguments and
# block unchanged to `Sanitizer.new`.
#
# @param cfg [Hash, Config] optional configuration to initialize with
# @yield [config] optional block to mutate configuration before use
# @return [Sanitizer] a new sanitizer instance
def self.new(cfg = {}, &block)
  Sanitizer.new(cfg, &block)
end
1082
+
1083
# One-shot sanitization: builds a throwaway sanitizer from `cfg` and runs it
# on `dirty`.
#
# @param dirty [String, Nokogiri::XML::Node] the input to sanitize
# @param cfg [Hash] optional configuration override
# @return [String, Nokogiri::XML::Document] sanitized HTML or DOM
def self.sanitize(dirty, cfg = {})
  sanitizer = new(cfg)
  sanitizer.sanitize(dirty)
end
1091
+
1092
# `scrub` is a module-level alias for `sanitize`.
class << self
  alias scrub sanitize
end
1095
+ end