llm-docs-builder 0.11.0 → 0.12.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,792 @@
1
+ # frozen_string_literal: true
2
+
3
+ module LlmDocsBuilder
4
+ # A lightweight HTML → Markdown converter using only Nokogiri's public API.
5
+ #
6
+ # Design goals:
7
+ # - Traverse with Nokogiri and keep logic small, readable, and predictable
8
+ # - Preserve the existing public behavior covered by specs
9
+ # - Convert tables into Markdown while preserving inline formatting
10
+ class HtmlToMarkdownConverter
11
# Heading tag name => Markdown heading depth ('h1' => 1 ... 'h6' => 6)
HEADING_LEVEL = (1..6).to_h { |level| ["h#{level}", level] }.freeze

# Tags whose children are rendered as blocks without any wrapper of their own
BLOCK_CONTAINERS = %w[
  div aside figure article section main header footer nav body html
].freeze

# Tags emitted with the `**` wrapper
INLINE_STRONG_TAGS = %w[strong b].freeze

# Tags emitted with the `*` wrapper
INLINE_EM_TAGS = %w[em i].freeze

# Tags that open a (possibly nested) list
LIST_TAGS = %w[ul ol].freeze

# Tags skipped entirely, together with everything inside them
IGNORE_TAGS = %w[script style head noscript iframe svg canvas].freeze

# Characters that get a backslash prefix inside link labels and image alt text
MARKDOWN_LABEL_ESCAPE_PATTERN = /[\\\[\]()*_`!]/

# Explicit URL schemes accepted for link destinations
SAFE_URI_SCHEMES = %w[http https mailto ftp tel].freeze
41
+
42
# Converts an HTML string to Markdown.
#
# Nil and whitespace-only input short-circuit to an empty string before any
# parsing happens. Otherwise the HTML is parsed as a fragment, its top-level
# nodes are rendered as blocks, and the result goes through #clean_output.
#
# @param html [String, nil] HTML content to convert
# @return [String] converted markdown content
def convert(html)
  blank_input = html.nil? || html.strip.empty?
  return '' if blank_input

  top_level_nodes = Nokogiri::HTML::DocumentFragment.parse(html).children
  clean_output(render_blocks(top_level_nodes, depth: 0))
end
53
+
54
# Lazily builds and memoizes the renderer used for <table> elements.
#
# The renderer receives two callbacks into this converter:
# #collapsed_inline_for (as inline_collapser) and #render_blocks
# (as block_renderer).
#
# @return [HtmlToMarkdown::TableMarkupRenderer] the memoized renderer
def table_renderer
  return @table_renderer if @table_renderer

  @table_renderer = HtmlToMarkdown::TableMarkupRenderer.new(
    inline_collapser: method(:collapsed_inline_for),
    block_renderer: method(:render_blocks)
  )
end
61
+
62
+ private
63
+
64
# Renders a run of sibling nodes as blocks separated by one blank line.
#
# Text nodes and non-block elements are collected into a pending inline run;
# the run is rendered as a single paragraph-like block whenever a block-level
# element (or the end of the node list) is reached. Elements listed in
# IGNORE_TAGS and nodes that are neither text nor element are skipped.
#
# @param nodes [Nokogiri::XML::NodeSet, Array<Nokogiri::XML::Node>]
# @param depth [Integer] nesting depth for lists
# @return [String] rendered markdown
def render_blocks(nodes, depth: 0)
  blocks = []
  pending_inline = []

  # Turns the buffered inline nodes into one block and empties the buffer.
  emit_pending = lambda do
    unless pending_inline.empty?
      text = collapse_inline_preserving_newlines(render_inline_nodes(pending_inline))
      pending_inline.clear
      blocks << text unless text.empty?
    end
  end

  nodes.each do |node|
    if node.text?
      pending_inline << node
    elsif node.element? && !IGNORE_TAGS.include?(node.name.downcase)
      if block_like?(node)
        emit_pending.call
        block = render_element_block(node, depth: depth)
        blocks << block unless block.nil? || block.strip.empty?
      else
        pending_inline << node
      end
    end
  end

  emit_pending.call
  blocks.join("\n\n")
end
105
+
106
# Renders one block-level element by dispatching on its lower-cased tag name.
#
# Headings become `#`-prefixed lines (an empty heading renders as ''), and
# <hr> becomes `---`. Tables, blockquotes, <pre>, images, lists, definition
# lists and figures go to their dedicated renderers. Remaining
# BLOCK_CONTAINERS are rendered transparently; any other tag is rendered as
# collapsed inline content.
#
# @param element [Nokogiri::XML::Element]
# @param depth [Integer] nesting depth
# @return [String] rendered markdown
def render_element_block(element, depth: 0)
  tag = element.name.downcase

  level = HEADING_LEVEL[tag]
  if level
    heading_text = collapsed_inline_for(element)
    return heading_text.empty? ? '' : "#{'#' * level} #{heading_text}"
  end

  case tag
  when 'table' then table_renderer.render_table(element)
  when 'hr' then '---'
  when 'blockquote' then render_blockquote(element)
  when 'pre' then render_fenced_code(element)
  when 'img' then render_image(element) # image standing on its own, e.g. inside <figure>
  when 'ul' then render_list(element, ordered: false, depth: depth)
  when 'ol'
    first_number = parse_integer(element['start']) || 1
    render_list(element, ordered: true, depth: depth, start: first_number)
  when 'dl' then render_definition_list(element)
  when 'figure' then render_figure(element, depth: depth) # checked before the generic containers
  else
    if BLOCK_CONTAINERS.include?(tag)
      render_transparent_container(element, depth: depth)
    else
      # Fallback: inline container at block level
      collapsed_inline_for(element)
    end
  end
end
150
+
151
# Renders a single node as inline markdown.
#
# Text nodes go through #inline_text (and label escaping when requested).
# <br> becomes a newline; <img>, <a> and <code> use their own renderers;
# strong/em tags are wrapped in `**` / `*`. Every other node is rendered as
# the concatenation of its children.
#
# @param node [Nokogiri::XML::Node]
# @param escape_for_label [Boolean] whether to escape markdown in labels
# @return [Array<String, Boolean, Symbol>] rendered text, has_markdown flag, and optional metadata
def render_inline(node, escape_for_label: false)
  if node.text?
    plain = inline_text(node.text)
    plain = escape_markdown_label(plain) if escape_for_label
    return [plain, false]
  end

  tag = node.element? ? node.name.downcase : nil
  return ["\n", false] if tag == 'br'
  return [render_image(node), true] if tag == 'img'
  return render_link(node) if tag == 'a'
  return [render_inline_code(node), true] if tag == 'code'

  wrapper =
    if INLINE_STRONG_TAGS.include?(tag)
      '**'
    elsif INLINE_EM_TAGS.include?(tag)
      '*'
    end

  if wrapper
    render_wrapped_inline(node, wrapper, escape_for_label: escape_for_label)
  else
    render_inline_children(node, escape_for_label: escape_for_label)
  end
end
180
+
181
# Renders a container element that contributes no markup of its own.
#
# The children are rendered as blocks first; when that yields only
# whitespace, the element is rendered as collapsed inline content instead.
#
# @param element [Nokogiri::XML::Element] container element
# @param depth [Integer] nesting depth
# @return [String] rendered content
def render_transparent_container(element, depth:)
  as_blocks = render_blocks(element.children, depth: depth)
  return as_blocks unless as_blocks.strip.empty?

  collapsed_inline_for(element)
end
194
+
195
# Renders a <figure> element.
#
# A FigureCodeBlockRenderer is asked to render the figure first. When it
# returns nil or blank output the figure is treated as a plain transparent
# container; otherwise the rendered code is merged with the figure's other
# children in document order.
#
# @param element [Nokogiri::XML::Element] figure element
# @param depth [Integer] nesting depth
# @return [String] rendered markdown
def render_figure(element, depth:)
  code_renderer = HtmlToMarkdown::FigureCodeBlockRenderer.new(
    element,
    inline_collapser: method(:collapsed_inline_for),
    fence_calculator: method(:compute_code_fence)
  )
  code_markdown = code_renderer.render
  has_code = !code_markdown.nil? && !code_markdown.strip.empty?

  if has_code
    render_figure_children_in_original_order(
      element,
      code_block_node: code_renderer.code_block_node,
      rendered_code: code_markdown,
      depth: depth
    )
  else
    render_transparent_container(element, depth: depth)
  end
end
216
+
217
# Renders a figure's children in document order, substituting the already
# rendered code block for the direct child that contains the code node.
#
# <figcaption> children and whitespace-only text nodes are skipped. If no
# direct child could be matched to the code node, the rendered code is placed
# first.
#
# @param element [Nokogiri::XML::Element] figure element
# @param code_block_node [Nokogiri::XML::Element] code block node
# @param rendered_code [String] rendered code
# @param depth [Integer] nesting depth
# @return [String] rendered content
def render_figure_children_in_original_order(element, code_block_node:, rendered_code:, depth:)
  anchor = figure_direct_child_for(element, code_block_node)
  code_placed = false

  pieces = element.children.each_with_object([]) do |child, acc|
    next if figcaption?(child)
    next if child.text? && child.text.strip.empty?

    if anchor && child.equal?(anchor)
      acc << rendered_code
      code_placed = true
    else
      block = render_blocks([child], depth: depth)
      acc << block unless block.nil? || block.strip.empty?
    end
  end

  pieces.unshift(rendered_code) unless code_placed
  pieces.join("\n\n")
end
246
+
247
# Walks up from +node+ until reaching the ancestor-or-self whose parent is
# +element+ (compared by identity).
#
# @param element [Nokogiri::XML::Element] figure element
# @param node [Nokogiri::XML::Element, nil]
# @return [Nokogiri::XML::Element, nil] the direct child of +element+ on the
#   path to +node+, or nil when +node+ is nil or not inside +element+
def figure_direct_child_for(element, node)
  candidate = node

  while candidate
    holder = candidate.parent
    # Identity check comes first so the result matches for every input.
    return candidate if holder.equal?(element)
    return nil if holder.nil?

    candidate = holder
  end

  nil
end
262
+
263
# Tells whether +node+ is a <figcaption> element (tag name compared
# case-insensitively).
#
# @param node [Nokogiri::XML::Node]
# @return [Boolean] true if figcaption
def figcaption?(node)
  return false unless node.element?

  node.name.downcase == 'figcaption'
end
270
+
271
# Renders every child of +parent+ inline and concatenates the results.
#
# Children whose parent has become nil (detached while iterating) are
# skipped. When a child reports the :unsafe_link_pruned marker, the pieces
# collected so far are handed to
# Helpers.prune_trailing_unsafe_link_separator!.
#
# @param parent [Nokogiri::XML::Element] parent element
# @param escape_for_label [Boolean] whether to escape for labels
# @return [Array<String, Boolean>] rendered text and has_markdown flag
def render_inline_children(parent, escape_for_label: false)
  any_markdown = false
  chunks = []

  parent.children.each do |child|
    next if child.parent.nil?

    text, marked, note = render_inline(child, escape_for_label: escape_for_label)
    Helpers.prune_trailing_unsafe_link_separator!(chunks) if note == :unsafe_link_pruned

    unless text.nil? || text.empty?
      chunks << text
      any_markdown ||= marked
    end
  end

  [chunks.join, any_markdown]
end
293
+
294
# Renders the children of +parent+ inline and returns only the text,
# discarding the has_markdown flag.
#
# @param parent [Nokogiri::XML::Element] parent element
# @return [String] rendered inline text
def render_inline_string(parent)
  render_inline_children(parent).first
end
302
+
303
# Renders the inline content of +parent+ and collapses its whitespace while
# keeping newlines.
#
# @param parent [Nokogiri::XML::Element] parent element
# @return [String] collapsed inline text
def collapsed_inline_for(parent)
  raw_inline = render_inline_string(parent)
  collapse_inline_preserving_newlines(raw_inline)
end
310
+
311
# Wraps the collapsed inline content of +node+ in +wrapper+ on both sides
# (used for strong and em).
#
# An element with no remaining content yields ['', false], so no stray
# wrapper characters are emitted.
#
# @param node [Nokogiri::XML::Element] element to wrap
# @param wrapper [String] wrapper characters
# @param escape_for_label [Boolean] whether to escape for labels
# @return [Array<String, Boolean>] wrapped text and has_markdown flag
def render_wrapped_inline(node, wrapper, escape_for_label: false)
  raw, = render_inline_children(node, escape_for_label: escape_for_label ? true : false)
  inner = collapse_inline_preserving_newlines(raw)

  inner.empty? ? ['', false] : ["#{wrapper}#{inner}#{wrapper}", true]
end
328
+
329
# Renders a list of nodes inline and joins the pieces without a separator.
#
# @param nodes [Array<Nokogiri::XML::Node>, nil]
# @return [String] rendered text ('' for nil or empty input)
def render_inline_nodes(nodes)
  return '' if nodes.nil? || nodes.empty?

  # Nil and empty pieces contribute nothing to the joined string.
  nodes.map { |node| render_inline(node).first.to_s }.join
end
344
+
345
# Renders an <a> element.
#
# - Missing or blank href: only the label is returned (no link syntax).
# - Destination rejected by #safe_link_destination?: neighbouring separator
#   text is pruned and an empty result tagged :unsafe_link_pruned is returned.
# - Otherwise: `[label](destination)` with the label escaped for link text.
#
# @param node [Nokogiri::XML::Element] link element
# @return [Array<String, Boolean, Symbol>] rendered link, has_markdown flag, and optional metadata
def render_link(node)
  target = (node['href'] || '').to_s.strip

  if target.empty?
    text, has_markdown = render_inline_children(node)
    return [collapse_inline_preserving_newlines(text), has_markdown]
  end

  unless safe_link_destination?(target)
    prune_unsafe_link_separators(node)
    return ['', false, :unsafe_link_pruned]
  end

  text, = render_inline_children(node, escape_for_label: true)
  link_text = collapse_inline_preserving_newlines(text)
  ["[#{link_text}](#{format_markdown_link_destination(target)})", true]
end
369
+
370
# Renders an <img> element as `![alt](src "title")`.
#
# The src is trimmed first, so a missing or whitespace-only src yields ''
# instead of an image with a blank destination. Double quotes and backslashes
# in the title are backslash-escaped so they cannot terminate the quoted
# title early.
#
# @param node [Nokogiri::XML::Element] image element
# @return [String] rendered image markdown ('' when there is no usable src)
def render_image(node)
  src = (node['src'] || '').to_s.strip
  return '' if src.empty?

  alt = (node['alt'] || '').to_s
  title = (node['title'] || '').to_s
  escaped_title = title.gsub(/["\\]/) { |char| "\\#{char}" }
  title_part = title.empty? ? '' : %( "#{escaped_title}")
  destination = format_markdown_link_destination(src)

  "![#{escape_markdown_label(alt)}](#{destination}#{title_part})"
end
384
+
385
# Renders a <code> element as a Markdown code span.
#
# Line breaks inside the code are flattened to single spaces and the text is
# trimmed. The delimiter is one backtick longer than the longest backtick run
# in the text. When the text itself starts or ends with a backtick, it is
# padded with one space on each side so the delimiter does not merge with the
# content (CommonMark strips that padding again when rendering).
#
# @param node [Nokogiri::XML::Element] code element
# @return [String] rendered inline code ('' when the element has no text)
def render_inline_code(node)
  text = node.text.to_s.gsub(/\r\n?/, "\n").gsub(/\n+/, ' ').strip
  return '' if text.empty?

  longest_run = text.scan(/`+/).map(&:length).max || 0
  fence = '`' * (longest_run + 1)
  padding = text.start_with?('`') || text.end_with?('`') ? ' ' : ''

  "#{fence}#{padding}#{text}#{padding}#{fence}"
end
397
+
398
# Renders a <blockquote> by prefixing every line of its content with `> `
# (blank lines become a bare `>`).
#
# When any element child is block-like the children are rendered as blocks;
# otherwise the content is rendered as one collapsed inline run so that text
# surrounding inline elements keeps its order.
#
# @param node [Nokogiri::XML::Element] blockquote element
# @return [String] rendered blockquote markdown ('' when there is no content)
def render_blockquote(node)
  contains_blocks = node.element_children.any? { |child| block_like?(child) }
  body = contains_blocks ? render_blocks(node.children, depth: 0) : collapsed_inline_for(node)
  return '' if body.strip.empty?

  quoted = body.split("\n").map do |line|
    line.strip.empty? ? '>' : "> #{line}"
  end
  quoted.join("\n")
end
419
+
420
# Renders a <pre> element as a fenced code block.
#
# The text comes from the first descendant <code> element when there is one,
# otherwise from the <pre> itself. Line endings are normalized to "\n" and
# trailing whitespace is removed before the fence is computed.
#
# @param node [Nokogiri::XML::Element] pre element
# @return [String] rendered code block
def render_fenced_code(node)
  source_node = node.at_css('code') || node
  body = source_node.text.to_s.gsub(/\r\n?/, "\n").rstrip
  fence = compute_code_fence(body)

  [fence, body, fence].join("\n")
end
431
+
432
# Picks a backtick fence that cannot be closed by the code itself: at least
# three backticks, and always one more than the longest backtick run found in
# the code.
#
# @param code [String, nil] code content
# @return [String] fence string
def compute_code_fence(code)
  backtick_runs = code.to_s.scan(/`+/)
  longest = backtick_runs.map(&:length).max || 0
  required = longest + 1

  '`' * (required < 3 ? 3 : required)
end
442
+
443
# Renders a <ul>/<ol> element, one Markdown item per direct <li> child
# (other children are ignored).
#
# Ordered items are numbered from +start+ (default 1); an integer `value`
# attribute on an <li> resets the counter for that item and the ones after
# it. The first inline text of an item goes on the marker line; remaining
# segments follow on their own lines. A blank line precedes every block or
# inline segment, and precedes a nested list only when the previous segment
# was a block or inline one.
#
# NOTE(review): the per-level indent unit is the single-space literal as it
# appears in this source; confirm it matches the intended nesting width.
#
# @param list_node [Nokogiri::XML::Element] list element
# @param ordered [Boolean] whether list is ordered
# @param depth [Integer] nesting depth
# @param start [Integer, nil] starting number for ordered lists
# @return [String] rendered list markdown
def render_list(list_node, ordered:, depth:, start: nil)
  counter = ordered ? (start || 1) : nil
  pad = ' ' * depth
  items = list_node.element_children.select { |child| child.name.downcase == 'li' }

  rendered_items = items.map do |item|
    if ordered
      explicit = parse_integer(item['value'])
      counter = explicit unless explicit.nil?
      marker = "#{pad}#{counter}. "
      counter = (counter || 0) + 1
    else
      marker = "#{pad}- "
    end

    segments = build_list_item_segments(item)
    lead, segments = extract_leading_inline_text(segments, depth: depth)
    lead = collapse_inline_preserving_newlines(lead)

    body = [lead.empty? ? marker.rstrip : "#{marker}#{lead}"]
    last_kind = nil

    segments.each do |segment|
      rendered = render_list_item_segment(segment, depth: depth)
      next if rendered.empty?

      kind = segment.first
      needs_gap = kind != :nested_list || %i[block inline].include?(last_kind)
      body << '' if needs_gap && !body.last.to_s.empty?
      body.concat(rendered)
      last_kind = kind
    end

    body.join("\n")
  end

  rendered_items.join("\n")
end
500
+
501
# Splits the children of an <li> into ordered segments.
#
# Each segment is a [type, value] pair: [:nested_list, element] for ul/ol
# children, [:block, element] for other block-like children, and
# [:inline, nodes] for each maximal run of everything else.
#
# @param list_item [Nokogiri::XML::Element] list item element
# @return [Array<Array>] array of segment tuples [type, value]
def build_list_item_segments(list_item)
  segments = []
  run = []

  # Closes the current inline run; a fresh array is started so the stored
  # segment keeps its own nodes.
  close_run = lambda do
    segments << [:inline, run] unless run.empty?
    run = []
  end

  list_item.children.each do |child|
    kind =
      if child.element? && LIST_TAGS.include?(child.name.downcase)
        :nested_list
      elsif block_like?(child)
        :block
      end

    if kind
      close_run.call
      segments << [kind, child]
    else
      run << child
    end
  end

  close_run.call
  segments
end
526
+
527
# Pulls the text for a list item's marker line off the front of +segments+.
#
# Leading inline segments that render to nothing are discarded. The first
# inline segment with content is removed and returned. A leading block
# segment is consumed only when it renders without a newline; otherwise (and
# for any other segment type) the segments are left as they are and '' is
# returned. +segments+ is modified in place.
#
# @param segments [Array<Array>] segment tuples
# @param depth [Integer] nesting depth
# @return [Array<String, Array>] inline text and remaining segments
def extract_leading_inline_text(segments, depth:)
  until segments.empty?
    kind, payload = segments.first

    if kind == :inline
      segments.shift
      text = collapse_inline_preserving_newlines(render_inline_nodes(payload))
      return [text, segments] unless text.empty?
    elsif kind == :block
      rendered = render_element_block(payload, depth: depth + 1)
      return ['', segments] if !rendered || rendered.include?("\n")

      segments.shift
      return [collapse_inline_preserving_newlines(rendered), segments]
    else
      return ['', segments]
    end
  end

  ['', segments]
end
558
+
559
# Renders one list-item segment into output lines.
#
# Block and inline segments are rendered and indented one level deeper than
# the owning list. Nested lists are rendered recursively at depth + 1 and
# split into lines. Unknown segment types produce no lines.
#
# @param segment [Array] segment tuple [type, value]
# @param depth [Integer] nesting depth
# @return [Array<String>] rendered lines
def render_list_item_segment(segment, depth:)
  kind, payload = segment
  child_depth = depth + 1

  case kind
  when :nested_list
    is_ordered = payload.name.downcase == 'ol'
    first_number = is_ordered ? parse_integer(payload['start']) : nil
    markdown = render_list(payload, ordered: is_ordered, depth: child_depth, start: first_number)
    markdown.empty? ? [] : markdown.split("\n")
  when :block
    markdown = render_element_block(payload, depth: child_depth)
    blank = markdown.nil? || markdown.strip.empty?
    blank ? [] : indent_list_block_lines(markdown, child_depth)
  when :inline
    markdown = collapse_inline_preserving_newlines(render_inline_nodes(payload))
    markdown.empty? ? [] : indent_list_block_lines(markdown, child_depth)
  else
    []
  end
end
591
+
592
# Splits +text+ into lines and prefixes each non-blank line with one indent
# unit per +depth+ level; blank lines become empty strings.
#
# @param text [String]
# @param depth [Integer] nesting depth
# @return [Array<String>] indented lines
def indent_list_block_lines(text, depth)
  pad = ' ' * depth
  lines = text.split("\n")

  lines.map { |line| line.strip.empty? ? '' : pad + line }
end
604
+
605
# Renders a <dl> as `term` followed by one `: definition` line per <dd>,
# with entries separated by a blank line.
#
# Each <dt> starts a new entry and each <dd> is attached to the most recent
# <dt>. A <dd> that appears before any <dt> is ignored, and a term that ends
# up with no definitions is left out of the output.
#
# @param dl_node [Nokogiri::XML::Element] definition list element
# @return [String] rendered definition list
def render_definition_list(dl_node)
  groups = [] # [term, [definitions]] pairs in document order

  dl_node.element_children.each do |child|
    case child.name.downcase
    when 'dt'
      groups << [collapsed_inline_for(child), []]
    when 'dd'
      definition = collapsed_inline_for(child)
      groups.last[1] << definition unless groups.empty?
    end
  end

  entries = groups.reject { |_term, definitions| definitions.empty? }.map do |term, definitions|
    ([term] + definitions.map { |definition| ": #{definition}" }).join("\n")
  end

  entries.join("\n\n")
end
643
+
644
+ # Helpers
645
+
646
# Replaces every run of spaces, tabs, carriage returns, newlines, form feeds
# and vertical tabs with a single space. The ends are not trimmed.
#
# @param text [String]
# @return [String] normalized text
def normalize_whitespace(text)
  whitespace_run = /[ \t\r\n\f\v]+/
  text.gsub(whitespace_run, ' ')
end
653
+
654
# Prepares the text of a text node for inline output: HTML entities are
# decoded, literal `<` / `>` are written back as `&lt;` / `&gt;`, and
# whitespace runs are collapsed to single spaces.
#
# NOTE(review): if the caller's text is already entity-decoded, the decode
# here runs a second time (e.g. a literal "&amp;" becomes "&") - confirm
# this is intended.
#
# @param text [String, nil]
# @return [String] processed text ('' for nil or empty input)
def inline_text(text)
  return '' if text.nil? || text.empty?

  decoded = CGI.unescapeHTML(text)
  return '' if decoded.empty?

  normalize_whitespace(decoded.gsub('<', '&lt;').gsub('>', '&gt;'))
end
667
+
668
# Collapses whitespace runs to single spaces and trims the ends, while
# keeping line breaks.
#
# Line endings are normalized to "\n" and temporarily swapped for a
# placeholder token so that whitespace normalization and trimming leave them
# alone. Any literal occurrence of that token in the input also comes back as
# a newline.
#
# @param text [String, nil]
# @return [String] collapsed text ('' for nil or empty input)
def collapse_inline_preserving_newlines(text)
  return '' if text.nil? || text.empty?

  newline_token = '__LLM_BR__'
  unix = text.gsub("\r\n", "\n").tr("\r", "\n")
  shielded = unix.gsub("\n", newline_token)

  normalize_whitespace(shielded).strip.gsub(newline_token, "\n")
end
680
+
681
# Backslash-escapes every character matched by
# MARKDOWN_LABEL_ESCAPE_PATTERN so it is taken literally inside a link label
# or image alt text.
#
# @param text [String, nil]
# @return [String] escaped text ('' for nil)
def escape_markdown_label(text)
  label = text.to_s
  label.gsub(MARKDOWN_LABEL_ESCAPE_PATTERN) { "\\#{Regexp.last_match(0)}" }
end
688
+
689
# Formats a URL for use as a Markdown link or image destination.
#
# URLs containing whitespace or parentheses, or starting with `<`, are
# written in the CommonMark angle-bracket form `<url>`, because the bare form
# would be misparsed. Inside that form every `<` and `>` of the URL is
# backslash-escaped so it cannot end the destination early. All other URLs
# are returned unchanged.
#
# @param url [String, nil]
# @return [String] formatted URL ('' for nil)
def format_markdown_link_destination(url)
  return '' if url.nil?

  str = url.to_s
  return str if str.empty?
  return str unless str.match?(/[\s()]/) || str.start_with?('<')

  escaped = str.gsub(/[<>]/) { |bracket| "\\#{bracket}" }
  "<#{escaped}>"
end
708
+
709
# Decides whether +href+ may be emitted as a link destination.
#
# Before the scheme is inspected, tabs and line breaks are removed from the
# whole value and leading control characters/spaces are dropped, mirroring
# how browsers pre-process URLs. This keeps obfuscated forms such as
# "java\tscript:" from slipping past the scheme checks.
#
# Fragment and path-relative references are accepted. javascript:, vbscript:
# and data: are always rejected. Any other explicit scheme must be listed in
# SAFE_URI_SCHEMES. Values without a scheme are accepted.
#
# @param href [String, nil] link href
# @return [Boolean] true if safe
def safe_link_destination?(href)
  return false if href.nil?

  sanitized = href.strip
  return false if sanitized.empty?

  probe = sanitized.gsub(/[\t\r\n]/, '').sub(/\A[\x00-\x20]+/, '')
  return false if probe.empty?
  return true if probe.start_with?('#', '/', './', '../')
  return false if probe.match?(/\A(?:javascript|vbscript|data)\s*:/i)

  scheme = probe[/\A([a-z][a-z0-9+\-.]*):/i, 1]
  scheme.nil? || SAFE_URI_SCHEMES.include?(scheme.downcase)
end
727
+
728
# Offers the nodes immediately before and after +node+ to
# #prune_separator_text_node, which removes them when they are bare
# separators. Does nothing when +node+ is nil.
#
# @param node [Nokogiri::XML::Element, nil] link node
# @return [void]
def prune_unsafe_link_separators(node)
  return unless node

  neighbours = [node.previous_sibling, node.next_sibling]
  neighbours.each { |neighbour| prune_separator_text_node(neighbour) }
end
739
+
740
# Detaches +sibling+ from the tree when it is a text node whose trimmed
# content is exactly "|". Anything else (nil, elements, other text) is left
# alone.
#
# @param sibling [Nokogiri::XML::Node, nil] sibling node
# @return [void]
def prune_separator_text_node(sibling)
  sibling.remove if sibling&.text? && sibling.text.strip == '|'
end
750
+
751
# Parses an optionally signed decimal integer from an attribute value.
#
# Surrounding whitespace is ignored; anything that is not purely digits with
# an optional leading sign yields nil.
#
# @param raw [String, nil] raw value
# @return [Integer, nil] parsed integer or nil
def parse_integer(raw)
  return nil if raw.nil?

  candidate = raw.to_s.strip
  candidate.match?(/\A[+-]?\d+\z/) ? candidate.to_i : nil
end
763
+
764
# Final tidy-up of the rendered markdown.
#
# Steps, in order: normalize line endings to "\n"; drop spaces/tabs that sit
# directly before a newline; pass the text to
# Helpers.squeeze_blank_lines_outside_fences with max_blank: 2; remove
# leading blank lines; remove trailing newlines (and the spaces/tabs
# following them).
#
# @param output [String] raw output
# @return [String] cleaned output
def clean_output(output)
  unix = output.gsub(/\r\n?/, "\n")
  without_line_end_spaces = unix.gsub(/[ \t]+\n/, "\n")
  squeezed = Helpers.squeeze_blank_lines_outside_fences(without_line_end_spaces, max_blank: 2)

  # Only whole blank lines are trimmed at either end.
  squeezed.gsub(/\A(?:[ \t]*\n)+/, '').gsub(/(?:\n[ \t]*)+\z/, '')
end
776
+
777
# Tells whether +node+ is rendered as a block rather than inline.
#
# Block-like elements are headings, the transparent BLOCK_CONTAINERS, and
# p, pre, ul, ol, dl, table, blockquote, hr and figcaption. Non-element
# nodes are never block-like.
#
# @param node [Nokogiri::XML::Node]
# @return [Boolean] true if block-like
def block_like?(node)
  return false unless node.element?

  tag = node.name.downcase
  HEADING_LEVEL.key?(tag) ||
    BLOCK_CONTAINERS.include?(tag) ||
    %w[p pre ul ol dl table blockquote hr figcaption].include?(tag)
end
791
+ end
792
+ end