llm-docs-builder 0.11.0 → 0.13.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36) hide show
  1. checksums.yaml +4 -4
  2. data/.github/workflows/ci.yml +35 -4
  3. data/.github/workflows/docker.yml +7 -7
  4. data/.github/workflows/push.yml +3 -3
  5. data/.gitignore +8 -0
  6. data/.rubocop.yml +1 -14
  7. data/.ruby-version +1 -1
  8. data/.yard-lint.yml +275 -0
  9. data/CHANGELOG.md +16 -0
  10. data/Dockerfile +14 -7
  11. data/Gemfile +1 -1
  12. data/Gemfile.lock +33 -25
  13. data/README.md +16 -0
  14. data/lib/llm_docs_builder/cli.rb +0 -1
  15. data/lib/llm_docs_builder/config.rb +33 -0
  16. data/lib/llm_docs_builder/helpers/prune_trailing_unsafe_link_separator.rb +31 -0
  17. data/lib/llm_docs_builder/helpers/squeeze_blank_lines_outside_fences.rb +71 -0
  18. data/lib/llm_docs_builder/helpers.rb +9 -0
  19. data/lib/llm_docs_builder/html_detector.rb +159 -0
  20. data/lib/llm_docs_builder/html_to_markdown/figure_code_block_renderer.rb +181 -0
  21. data/lib/llm_docs_builder/html_to_markdown/table_markup_renderer.rb +597 -0
  22. data/lib/llm_docs_builder/html_to_markdown_converter.rb +826 -0
  23. data/lib/llm_docs_builder/markdown_transformer.rb +23 -9
  24. data/lib/llm_docs_builder/output_formatter.rb +1 -1
  25. data/lib/llm_docs_builder/text_compressor.rb +2 -2
  26. data/lib/llm_docs_builder/transformers/base_transformer.rb +13 -1
  27. data/lib/llm_docs_builder/transformers/heading_transformer.rb +19 -7
  28. data/lib/llm_docs_builder/url_fetcher.rb +18 -0
  29. data/lib/llm_docs_builder/version.rb +1 -1
  30. data/lib/llm_docs_builder.rb +10 -0
  31. data/llm-docs-builder.gemspec +3 -2
  32. data/package-lock.json +331 -0
  33. data/package.json +9 -0
  34. data/renovate.json +22 -9
  35. metadata +31 -8
  36. data/AGENTS.md +0 -20
@@ -0,0 +1,826 @@
1
+ # frozen_string_literal: true
2
+
3
+ module LlmDocsBuilder
4
+ # A lightweight HTML → Markdown converter using only Nokogiri's public API.
5
+ #
6
+ # Design goals:
7
+ # - Traverse with Nokogiri and keep logic small, readable, and predictable
8
+ # - Preserve the existing public behavior covered by specs
9
+ # - Convert tables into Markdown while preserving inline formatting
10
+ class HtmlToMarkdownConverter
11
+ # Mapping of HTML heading tags to their numeric levels
12
+ HEADING_LEVEL = {
13
+ 'h1' => 1,
14
+ 'h2' => 2,
15
+ 'h3' => 3,
16
+ 'h4' => 4,
17
+ 'h5' => 5,
18
+ 'h6' => 6
19
+ }.freeze
20
+
21
+ # HTML tags treated as transparent block containers
22
+ BLOCK_CONTAINERS = %w[div aside figure article section main header footer nav body html].freeze
23
+
24
+ # HTML tags rendered as bold/strong in markdown
25
+ INLINE_STRONG_TAGS = %w[strong b].freeze
26
+
27
+ # HTML tags rendered as italic/emphasis in markdown
28
+ INLINE_EM_TAGS = %w[em i].freeze
29
+
30
+ # HTML list container tags
31
+ LIST_TAGS = %w[ul ol].freeze
32
+
33
+ # HTML tags that should be completely ignored during conversion
34
+ IGNORE_TAGS = %w[script style head noscript iframe svg canvas].freeze
35
+
36
+ # Pattern for escaping markdown special characters in link labels
37
+ MARKDOWN_LABEL_ESCAPE_PATTERN = /[\\\[\]()*_`!]/
38
+
39
+ # URL schemes considered safe for link destinations
40
+ SAFE_URI_SCHEMES = %w[http https mailto ftp tel].freeze
41
+
42
# Converts an HTML string into Markdown.
#
# Nil or whitespace-only input short-circuits to an empty string. Otherwise
# the HTML is parsed as a document fragment, its top-level nodes are rendered
# as blocks, and the result is normalized by clean_output.
#
# @param html [String, nil] HTML content to convert
# @return [String] converted markdown content
def convert(html)
  return '' if html.nil?
  return '' if html.strip.empty?

  top_level_nodes = Nokogiri::HTML::DocumentFragment.parse(html).children
  clean_output(render_blocks(top_level_nodes, depth: 0))
end
53
+
54
# Returns the table renderer, building it on first use and memoizing it in
# @table_renderer. The renderer is handed this converter's inline collapser
# and block renderer as bound method objects.
#
# @return [HtmlToMarkdown::TableMarkupRenderer]
def table_renderer
  return @table_renderer if @table_renderer

  @table_renderer = HtmlToMarkdown::TableMarkupRenderer.new(
    inline_collapser: method(:collapsed_inline_for),
    block_renderer: method(:render_blocks)
  )
end
61
+
62
+ private
63
+
64
# Renders a sequence of nodes as Markdown blocks separated by blank lines.
#
# Text nodes and non-block elements are accumulated into a pending inline run;
# the run is rendered and emitted as one block whenever a block-level element
# is reached and once more at the end. Ignored tags and non-element,
# non-text nodes are skipped without interrupting the pending run.
#
# @param nodes [Nokogiri::XML::NodeSet, Array<Nokogiri::XML::Node>]
# @param depth [Integer] nesting depth for lists
# @return [String] rendered markdown
def render_blocks(nodes, depth: 0)
  blocks = []
  pending_inline = []

  emit_pending = lambda do
    next if pending_inline.empty?

    text = collapse_inline_preserving_newlines(render_inline_nodes(pending_inline))
    pending_inline.clear
    blocks << text unless text.empty?
  end

  nodes.each do |node|
    if node.text?
      pending_inline << node
    elsif node.element? && !IGNORE_TAGS.include?(node.name.downcase)
      if block_like?(node)
        emit_pending.call
        block = render_element_block(node, depth: depth)
        blocks << block unless block.nil? || block.strip.empty?
      else
        pending_inline << node
      end
    end
  end

  emit_pending.call
  blocks.join("\n\n")
end
105
+
106
# Renders a single block-level element by dispatching on its tag name.
#
# Headings become ATX headings (an empty heading yields ''), 'figure' is
# handled before the generic transparent containers, and any tag without a
# dedicated branch falls back to its collapsed inline content.
#
# @param element [Nokogiri::XML::Element]
# @param depth [Integer] nesting depth
# @return [String] rendered markdown
def render_element_block(element, depth: 0)
  tag = element.name.downcase
  heading_level = HEADING_LEVEL[tag]

  if heading_level
    heading_text = collapsed_inline_for(element)
    return '' if heading_text.empty?

    return "#{'#' * effective_heading_level(element, heading_level)} #{heading_text}"
  end

  case tag
  when 'table' then table_renderer.render_table(element)
  when 'hr' then '---'
  when 'blockquote' then render_blockquote(element)
  when 'pre' then render_fenced_code(element)
  when 'img'
    # Images may appear directly under block containers (e.g. inside <figure>),
    # in which case they are emitted as their own block.
    render_image(element)
  when 'ul' then render_list(element, ordered: false, depth: depth)
  when 'ol'
    first_number = parse_integer(element['start']) || 1
    render_list(element, ordered: true, depth: depth, start: first_number)
  when 'dl' then render_definition_list(element)
  when 'figure' then render_figure(element, depth: depth)
  when *BLOCK_CONTAINERS
    # Transparent container: render children as blocks, or as inline content
    # when there is nothing block-level to show.
    render_transparent_container(element, depth: depth)
  else
    # Inline container that ended up at block level.
    collapsed_inline_for(element)
  end
end
151
+
152
# Renders one node in inline context.
#
# Text nodes are passed through inline_text (and label-escaped on request).
# Elements are dispatched by tag; anything without a dedicated branch,
# including non-element nodes, is rendered through its children.
#
# @param node [Nokogiri::XML::Node]
# @param escape_for_label [Boolean] whether to escape markdown in labels
# @return [Array<String, Boolean, Symbol>] rendered text, has_markdown flag, and optional metadata
def render_inline(node, escape_for_label: false)
  if node.text?
    plain = inline_text(node.text)
    plain = escape_markdown_label(plain) if escape_for_label
    return [plain, false]
  end

  tag = node.element? ? node.name.downcase : nil

  if tag == 'br'
    ["\n", false]
  elsif tag == 'img'
    [render_image(node), true]
  elsif tag == 'a'
    render_link(node)
  elsif INLINE_STRONG_TAGS.include?(tag)
    render_wrapped_inline(node, '**', escape_for_label: escape_for_label)
  elsif INLINE_EM_TAGS.include?(tag)
    render_wrapped_inline(node, '*', escape_for_label: escape_for_label)
  elsif tag == 'code'
    [render_inline_code(node), true]
  else
    render_inline_children(node, escape_for_label: escape_for_label)
  end
end
181
+
182
# Renders a container that contributes no markup of its own.
#
# The children are rendered as blocks; when that produces only whitespace,
# the container's collapsed inline content is returned instead.
#
# @param element [Nokogiri::XML::Element] container element
# @param depth [Integer] nesting depth
# @return [String] rendered content
def render_transparent_container(element, depth:)
  rendered = render_blocks(element.children, depth: depth)
  return rendered unless rendered.strip.empty?

  collapsed_inline_for(element)
end
195
+
196
# Renders a <figure> element.
#
# A FigureCodeBlockRenderer is asked to render the figure first. When it
# produces nothing, the figure is treated as a transparent container;
# otherwise its output is merged with the remaining children in document order.
#
# @param element [Nokogiri::XML::Element] figure element
# @param depth [Integer] nesting depth
# @return [String] rendered markdown
def render_figure(element, depth:)
  code_renderer = HtmlToMarkdown::FigureCodeBlockRenderer.new(
    element,
    inline_collapser: method(:collapsed_inline_for),
    fence_calculator: method(:compute_code_fence)
  )
  code_markdown = code_renderer.render

  if code_markdown.nil? || code_markdown.strip.empty?
    render_transparent_container(element, depth: depth)
  else
    render_figure_children_in_original_order(
      element,
      code_block_node: code_renderer.code_block_node,
      rendered_code: code_markdown,
      depth: depth
    )
  end
end
217
+
218
# Renders a figure's children in document order, substituting the
# pre-rendered code block for the direct child that contains it.
#
# <figcaption> children and whitespace-only text nodes are skipped. If no
# direct child hosts the code block, the rendered code is placed first.
#
# @param element [Nokogiri::XML::Element] figure element
# @param code_block_node [Nokogiri::XML::Element] code block node
# @param rendered_code [String] rendered code
# @param depth [Integer] nesting depth
# @return [String] rendered content
def render_figure_children_in_original_order(element, code_block_node:, rendered_code:, depth:)
  code_host = figure_direct_child_for(element, code_block_node)
  code_placed = false

  pieces = element.children.each_with_object([]) do |child, collected|
    next if figcaption?(child)
    next if child.text? && child.text.strip.empty?

    if !code_host.nil? && child.equal?(code_host)
      collected << rendered_code
      code_placed = true
    else
      markdown = render_blocks([child], depth: depth)
      collected << markdown unless markdown.nil? || markdown.strip.empty?
    end
  end

  pieces.unshift(rendered_code) unless code_placed
  pieces.join("\n\n")
end
247
+
248
# Walks up from +node+ to find the ancestor-or-self whose parent is +element+.
#
# @param element [Nokogiri::XML::Element] figure element
# @param node [Nokogiri::XML::Element, nil] node somewhere below the figure
# @return [Nokogiri::XML::Element, nil] the direct child of +element+ on the
#   path to +node+, or nil when +node+ is nil or not a descendant
def figure_direct_child_for(element, node)
  candidate = node

  while candidate
    parent = candidate.parent
    return candidate if parent.equal?(element)
    return nil if parent.nil?

    candidate = parent
  end

  nil
end
263
+
264
# Tells whether +node+ is a <figcaption> element (tag name compared
# case-insensitively).
#
# @param node [Nokogiri::XML::Node]
# @return [Boolean] true if figcaption
def figcaption?(node)
  return false unless node.element?

  node.name.casecmp('figcaption').zero?
end
271
+
272
# Renders every child of +parent+ inline and concatenates the results.
#
# @param parent [Nokogiri::XML::Element] parent element
# @param escape_for_label [Boolean] whether to escape for labels
# @return [Array<String, Boolean>] rendered text and has_markdown flag
def render_inline_children(parent, escape_for_label: false)
  fragments = []
  saw_markdown = false

  parent.children.each do |child|
    # Rendering an unsafe link can detach neighbouring separator text nodes;
    # skip anything that is no longer attached to a parent.
    next if child.parent.nil?

    text, marked, meta = render_inline(child, escape_for_label: escape_for_label)
    Helpers.prune_trailing_unsafe_link_separator!(fragments) if meta == :unsafe_link_pruned
    next if text.nil? || text.empty?

    fragments << text
    saw_markdown ||= marked
  end

  [fragments.join, saw_markdown]
end
294
+
295
# Renders the inline children of +parent+ and returns only the text,
# discarding the has_markdown flag.
#
# @param parent [Nokogiri::XML::Element] parent element
# @return [String] rendered inline text
def render_inline_string(parent)
  render_inline_children(parent).first
end
303
+
304
# Renders the inline content of +parent+ and collapses its whitespace while
# keeping newlines.
#
# @param parent [Nokogiri::XML::Element] parent element
# @return [String] collapsed inline text
def collapsed_inline_for(parent)
  raw_inline = render_inline_string(parent)
  collapse_inline_preserving_newlines(raw_inline)
end
311
+
312
# Renders an element whose content is surrounded by a Markdown marker
# (e.g. '**' for strong, '*' for emphasis).
#
# @param node [Nokogiri::XML::Element] element to wrap
# @param wrapper [String] wrapper characters
# @param escape_for_label [Boolean] whether to escape for labels
# @return [Array<String, Boolean>] wrapped text and has_markdown flag;
#   ['', false] when the element has no visible content
def render_wrapped_inline(node, wrapper, escape_for_label: false)
  inner =
    if escape_for_label
      escaped = render_inline_children(node, escape_for_label: true).first
      collapse_inline_preserving_newlines(escaped)
    else
      collapsed_inline_for(node)
    end

  inner.empty? ? ['', false] : ["#{wrapper}#{inner}#{wrapper}", true]
end
329
+
330
# Renders a list of nodes inline and concatenates the non-empty results.
#
# @param nodes [Array<Nokogiri::XML::Node>, nil]
# @return [String] rendered text ('' for nil or empty input)
def render_inline_nodes(nodes)
  return '' if nodes.nil? || nodes.empty?

  rendered = nodes.filter_map do |node|
    text = render_inline(node).first
    text unless text.nil? || text.empty?
  end

  rendered.join
end
345
+
346
# Renders an <a> element.
#
# - No (or blank) href: only the label is emitted.
# - Unsafe href: nothing is emitted, adjacent separator text nodes are pruned,
#   and :unsafe_link_pruned is returned as metadata.
# - Otherwise: a Markdown link with an escaped label.
#
# @param node [Nokogiri::XML::Element] link element
# @return [Array<String, Boolean, Symbol>] rendered link, has_markdown flag, and optional metadata
def render_link(node)
  target = (node['href'] || '').to_s.strip

  if target.empty?
    text, has_markdown = render_inline_children(node)
    return [collapse_inline_preserving_newlines(text), has_markdown]
  end

  unless safe_link_destination?(target)
    prune_unsafe_link_separators(node)
    return ['', false, :unsafe_link_pruned]
  end

  escaped_text, = render_inline_children(node, escape_for_label: true)
  link_text = collapse_inline_preserving_newlines(escaped_text)
  ["[#{link_text}](#{format_markdown_link_destination(target)})", true]
end
370
+
371
# Renders an <img> element as a Markdown image.
#
# The alt text is label-escaped and the source is formatted as a link
# destination. A non-empty title is emitted as a double-quoted link title;
# double quotes and backslashes inside it are backslash-escaped so the title
# cannot terminate early and corrupt the image syntax.
#
# @param node [Nokogiri::XML::Element] image element
# @return [String] rendered image markdown ('' when src is missing or empty)
def render_image(node)
  src = (node['src'] || '').to_s
  return '' if src.empty?

  alt = (node['alt'] || '').to_s
  title = (node['title'] || '').to_s
  title_part =
    if title.empty?
      ''
    else
      escaped_title = title.gsub(/["\\]/) { |char| "\\#{char}" }
      %( "#{escaped_title}")
    end

  destination = format_markdown_link_destination(src)
  "![#{escape_markdown_label(alt)}](#{destination}#{title_part})"
end
385
+
386
# Renders a <code> element as a Markdown code span.
#
# Line breaks are flattened to single spaces and the text is trimmed. The
# delimiter is one backtick longer than the longest backtick run in the text.
# When the text itself starts or ends with a backtick, a space is inserted on
# both sides so the content does not merge with the delimiter (Markdown strips
# one such space from each side when rendering).
#
# @param node [Nokogiri::XML::Element] code element
# @return [String] rendered inline code ('' when the element has no text)
def render_inline_code(node)
  text = node.text.to_s.gsub(/\r\n?/, "\n").gsub(/\n+/, ' ').strip
  return '' if text.empty?

  fence_len = (text.scan(/`+/).map(&:length).max || 0) + 1
  fence = '`' * fence_len
  padding = text.start_with?('`') || text.end_with?('`') ? ' ' : ''
  "#{fence}#{padding}#{text}#{padding}#{fence}"
end
398
+
399
# Renders a <blockquote> by prefixing every line of its content with '>'.
#
# When the blockquote has block-level element children, its content is
# rendered as blocks; when it holds only inline/text content, the inline
# sequence is rendered as a whole so surrounding text nodes are kept.
#
# @param node [Nokogiri::XML::Element] blockquote element
# @return [String] rendered blockquote markdown ('' when there is no content)
def render_blockquote(node)
  body =
    if node.element_children.any? { |child| block_like?(child) }
      render_blocks(node.children, depth: 0)
    else
      collapsed_inline_for(node)
    end
  return '' if body.strip.empty?

  quoted = body.split("\n").map do |line|
    line.strip.empty? ? '>' : "> #{line}"
  end
  quoted.join("\n")
end
420
+
421
# Renders a <pre> element as a fenced code block.
#
# The text of the first nested <code> element is used when present, otherwise
# the text of the <pre> itself. Line endings are normalized to "\n" and
# trailing whitespace is removed before the fence is computed.
#
# @param node [Nokogiri::XML::Element] pre element
# @return [String] rendered code block
def render_fenced_code(node)
  source_node = node.at_css('code') || node
  body = source_node.text.to_s.gsub(/\r\n?/, "\n").rstrip
  fence = compute_code_fence(body)
  [fence, body, fence].join("\n")
end
432
+
433
# Picks a backtick fence that cannot be closed by the code it wraps: at least
# three backticks, and always one more than the longest backtick run in +code+.
#
# @param code [String, nil] code content
# @return [String] fence string
def compute_code_fence(code)
  longest_run = code.to_s.scan(/`+/).map(&:length).max.to_i
  '`' * [longest_run + 1, 3].max
end
443
+
444
# Renders a <ul>/<ol> element. Only direct <li> children are considered.
#
# For ordered lists the counter starts at +start+ (default 1), is overridden
# by an integer 'value' attribute on an item, and increases by one per item.
# Each item's first line carries the marker plus any leading inline text;
# remaining segments (blocks, inline runs, nested lists) follow on their own
# lines. A blank line precedes every block/inline segment, and a nested list
# only when it follows a block or inline segment.
#
# @param list_node [Nokogiri::XML::Element] list element
# @param ordered [Boolean] whether list is ordered
# @param depth [Integer] nesting depth
# @param start [Integer, nil] starting number for ordered lists
# @return [String] rendered list markdown
def render_list(list_node, ordered:, depth:, start: nil)
  # NOTE(review): the indent unit is a single space as shown in this view;
  # confirm against the repository file, since diff renderings can collapse
  # runs of spaces.
  indent = ' ' * depth
  counter = ordered ? (start || 1) : nil
  rendered_items = []

  list_node.element_children.each do |item|
    next unless item.name.downcase == 'li'

    if ordered
      explicit_value = parse_integer(item['value'])
      counter = explicit_value unless explicit_value.nil?
      marker = "#{indent}#{counter}. "
      counter = (counter || 0) + 1
    else
      marker = "#{indent}- "
    end

    segments = build_list_item_segments(item)
    lead_text, segments = extract_leading_inline_text(segments, depth: depth)
    lead_text = collapse_inline_preserving_newlines(lead_text)

    item_lines = [lead_text.empty? ? marker.rstrip : "#{marker}#{lead_text}"]
    last_kind = nil

    segments.each do |segment|
      segment_lines = render_list_item_segment(segment, depth: depth)
      next if segment_lines.empty?

      kind = segment.first
      needs_gap = kind != :nested_list || %i[block inline].include?(last_kind)

      item_lines << '' if needs_gap && !item_lines.last.to_s.empty?
      item_lines.concat(segment_lines)
      last_kind = kind
    end

    rendered_items << item_lines.join("\n")
  end

  rendered_items.join("\n")
end
501
+
502
# Splits the children of a list item into ordered segments.
#
# Nested <ul>/<ol> children become [:nested_list, node], other block-like
# children become [:block, node], and each maximal run of remaining children
# becomes [:inline, [nodes...]].
#
# @param list_item [Nokogiri::XML::Element] list item element
# @return [Array<Array>] array of segment tuples [type, value]
def build_list_item_segments(list_item)
  segments = []
  inline_run = []

  list_item.children.each do |child|
    kind =
      if child.element? && LIST_TAGS.include?(child.name.downcase)
        :nested_list
      elsif block_like?(child)
        :block
      end

    if kind.nil?
      inline_run << child
      next
    end

    unless inline_run.empty?
      segments << [:inline, inline_run]
      inline_run = []
    end
    segments << [kind, child]
  end

  segments << [:inline, inline_run] unless inline_run.empty?
  segments
end
527
+
528
# Pulls the text that belongs on a list item's marker line off the front of
# +segments+ (the array is mutated in place).
#
# Leading inline segments that render to nothing are dropped. The first
# non-empty inline segment, or a leading block that renders to a single line,
# is consumed and returned as the text. A multi-line leading block or a
# nested list leaves the segments untouched and yields ''.
#
# @param segments [Array<Array>] segment tuples
# @param depth [Integer] nesting depth
# @return [Array<String, Array>] inline text and remaining segments
def extract_leading_inline_text(segments, depth:)
  until segments.empty?
    type, value = segments.first

    if type == :inline
      segments.shift
      text = collapse_inline_preserving_newlines(render_inline_nodes(value))
      return [text, segments] unless text.empty?
    elsif type == :block
      rendered = render_element_block(value, depth: depth + 1)
      break unless rendered && !rendered.include?("\n")

      segments.shift
      return [collapse_inline_preserving_newlines(rendered), segments]
    else
      break
    end
  end

  ['', segments]
end
559
+
560
# Renders one list-item segment into output lines.
#
# Block and inline segments are rendered one level deeper and indented;
# nested lists are rendered recursively one level deeper and split into lines.
# Segments that render to nothing, and unknown segment types, yield [].
#
# @param segment [Array] segment tuple [type, value]
# @param depth [Integer] nesting depth
# @return [Array<String>] rendered lines
def render_list_item_segment(segment, depth:)
  type, value = segment
  child_depth = depth + 1

  case type
  when :nested_list
    is_ordered = value.name.downcase == 'ol'
    first_number = is_ordered ? parse_integer(value['start']) : nil
    markdown = render_list(value, ordered: is_ordered, depth: child_depth, start: first_number)
    markdown.empty? ? [] : markdown.split("\n")
  when :inline
    text = collapse_inline_preserving_newlines(render_inline_nodes(value))
    text.empty? ? [] : indent_list_block_lines(text, child_depth)
  when :block
    markdown = render_element_block(value, depth: child_depth)
    return [] if markdown.nil? || markdown.strip.empty?

    indent_list_block_lines(markdown, child_depth)
  else
    []
  end
end
592
+
593
# Splits +text+ into lines and indents each non-blank line for the given
# nesting depth; blank lines become empty strings.
#
# @param text [String]
# @param depth [Integer] nesting depth
# @return [Array<String>] indented lines
def indent_list_block_lines(text, depth)
  # NOTE(review): the indent unit is a single space as shown in this view;
  # confirm against the repository file, since diff renderings can collapse
  # runs of spaces.
  prefix = ' ' * depth

  text.split("\n").map { |line| line.strip.empty? ? '' : prefix + line }
end
605
+
606
# Renders a <dl> element as "term" lines followed by ": definition" lines,
# with entries separated by a blank line.
#
# Each <dt> starts a new entry and each <dd> is attached to the most recent
# <dt>. A <dd> that appears before any <dt> is ignored, and a <dt> that never
# receives a <dd> produces no output.
#
# @param dl_node [Nokogiri::XML::Element] definition list element
# @return [String] rendered definition list
def render_definition_list(dl_node)
  groups = []

  dl_node.element_children.each do |child|
    case child.name.downcase
    when 'dt'
      groups << [collapsed_inline_for(child), []]
    when 'dd'
      definition = collapsed_inline_for(child)
      groups.last[1] << definition unless groups.empty?
    end
  end

  entries = groups.reject { |_term, definitions| definitions.empty? }.map do |term, definitions|
    [term, *definitions.map { |definition| ": #{definition}" }].join("\n")
  end

  entries.join("\n\n")
end
644
+
645
# Computes the Markdown heading level for a heading, adjusted for <section>
# nesting.
#
# A heading tag of level N is expected to sit inside N-1 sections (h1: 0,
# h2: 1, ...). Every section ancestor beyond that expectation pushes the
# heading one level deeper; fewer sections than expected never raise it.
# The result is capped at 6.
#
# @param element [Nokogiri::XML::Element] heading element
# @param base_level [Integer] HTML heading level (1-6)
# @return [Integer] effective markdown heading level (1-6)
def effective_heading_level(element, base_level)
  surplus_sections = section_ancestor_count(element) - (base_level - 1)
  level = surplus_sections.positive? ? base_level + surplus_sections : base_level
  level > 6 ? 6 : level
end
661
+
662
# Counts how many ancestors of +element+ are <section> elements, walking
# parent links until there is no parent left.
#
# @param element [Nokogiri::XML::Element]
# @return [Integer] number of section ancestors
def section_ancestor_count(element)
  total = 0
  ancestor = element.parent

  until ancestor.nil?
    total += 1 if ancestor.element? && ancestor.name.downcase == 'section'
    ancestor = ancestor.parent
  end

  total
end
677
+
678
+ # Helpers
679
+
680
# Replaces every run of spaces, tabs, carriage returns, newlines, form feeds
# and vertical tabs with a single space. Leading and trailing whitespace is
# collapsed but not removed.
#
# @param text [String]
# @return [String] normalized text
def normalize_whitespace(text)
  single_space = ' '
  text.gsub(/[ \t\r\n\f\v]+/, single_space)
end
687
+
688
# Prepares the content of a text node for inline output: HTML entities are
# decoded, literal angle brackets are re-encoded as &lt; / &gt;, and
# whitespace runs are collapsed to single spaces.
#
# @param text [String, nil]
# @return [String] processed text ('' for nil or empty input)
def inline_text(text)
  return '' if text.nil? || text.empty?

  decoded = CGI.unescapeHTML(text)
  return '' if decoded.empty?

  escaped = decoded.gsub(/[<>]/, '<' => '&lt;', '>' => '&gt;')
  normalize_whitespace(escaped)
end
701
+
702
# Collapses runs of whitespace to single spaces while keeping line breaks.
#
# CRLF and lone CR are normalized to "\n" first. Each line is then collapsed
# on its own, so no in-band placeholder is needed to protect the newlines and
# text that happens to contain a marker-like token such as "__LLM_BR__" is
# left untouched. Spaces at the very start and end of the result are removed;
# leading/trailing newlines and single spaces next to inner newlines are kept.
#
# @param text [String, nil]
# @return [String] collapsed text ('' for nil or empty input)
def collapse_inline_preserving_newlines(text)
  return '' if text.nil? || text.empty?

  # limit -1 keeps trailing empty lines so trailing newlines survive the join
  lines = text.gsub(/\r\n?/, "\n").split("\n", -1)
  collapsed = lines.map { |line| normalize_whitespace(line) }.join("\n")
  collapsed.sub(/\A +/, '').sub(/ +\z/, '')
end
714
+
715
# Backslash-escapes every character matched by MARKDOWN_LABEL_ESCAPE_PATTERN
# so the text can be used inside a link or image label.
#
# @param text [String, nil]
# @return [String] escaped text
def escape_markdown_label(text)
  # Replacement string is a literal backslash followed by the matched character.
  text.to_s.gsub(MARKDOWN_LABEL_ESCAPE_PATTERN, '\\\\\0')
end
722
+
723
# Formats a URL for use as a Markdown link destination.
#
# URLs containing whitespace or parentheses are wrapped in angle brackets
# (the CommonMark `[label](<url>)` form). Inside that form a literal '<' or
# '>' would end or invalidate the destination and a line break is not
# allowed, so angle brackets are percent-encoded and CR/LF are removed before
# wrapping. All other URLs are returned unchanged.
#
# @param url [String, nil]
# @return [String] formatted URL ('' for nil)
def format_markdown_link_destination(url)
  return '' if url.nil?

  str = url.to_s
  return str if str.empty?
  return str unless str.match?(/[\s()]/)

  bracket_safe = str.gsub(/[\r\n]+/, '').gsub('<', '%3C').gsub('>', '%3E')
  "<#{bracket_safe}>"
end
742
+
743
# Decides whether a link href may be emitted.
#
# Anchors and path-relative references are allowed; javascript:, vbscript:
# and data: are rejected; any other explicit scheme must be listed in
# SAFE_URI_SCHEMES; scheme-less references are allowed.
#
# Browsers drop ASCII tab/CR/LF anywhere in a URL and ignore leading control
# characters, so "java\tscript:..." is executed as "javascript:...". The
# scheme checks therefore run on a copy with those characters removed;
# otherwise such an href would look scheme-less and be treated as safe.
#
# @param href [String, nil] link href
# @return [Boolean] true if safe
def safe_link_destination?(href)
  return false if href.nil?

  sanitized = href.strip
  return false if sanitized.empty?

  probe = sanitized.delete("\t\r\n").sub(/\A[\x00-\x20]+/, '')
  return false if probe.empty?
  return true if probe.start_with?('#', '/', './', '../')
  return false if probe.match?(/\A(?:javascript|vbscript|data)\s*:/i)

  scheme = probe[/\A([a-z][a-z0-9+\-.]*):/i, 1]
  scheme.nil? || SAFE_URI_SCHEMES.include?(scheme.downcase)
end
761
+
762
# Removes separator text nodes sitting directly before and after an unsafe
# link. Both siblings are looked up before either one is pruned.
#
# @param node [Nokogiri::XML::Element, nil] link node
# @return [void]
def prune_unsafe_link_separators(node)
  return unless node

  before = node.previous_sibling
  after = node.next_sibling

  prune_separator_text_node(before)
  prune_separator_text_node(after)
end
773
+
774
# Detaches +sibling+ from the document when it is a text node whose trimmed
# content is exactly a single pipe character. Anything else is left alone.
#
# @param sibling [Nokogiri::XML::Node, nil] sibling node
# @return [void]
def prune_separator_text_node(sibling)
  return if sibling.nil? || !sibling.text?

  sibling.remove if sibling.text.strip == '|'
end
784
+
785
# Parses an attribute value as a base-10 integer.
#
# Surrounding whitespace is ignored and an optional leading sign is accepted;
# anything else (decimals, letters, empty strings) yields nil.
#
# @param raw [String, nil] raw value
# @return [Integer, nil] parsed integer or nil
def parse_integer(raw)
  return nil if raw.nil?

  candidate = raw.to_s.strip
  candidate.match?(/\A[+-]?\d+\z/) ? candidate.to_i : nil
end
797
+
798
# Normalizes the rendered Markdown: line endings become "\n", spaces and tabs
# directly before a newline are dropped, blank lines are squeezed via
# Helpers.squeeze_blank_lines_outside_fences (max_blank: 2), and leading and
# trailing blank lines are trimmed. Spaces at the end of the final line are
# kept when no newline follows them.
#
# @param output [String] raw output
# @return [String] cleaned output
def clean_output(output)
  unix_newlines = output.gsub(/\r\n?/, "\n")
  without_trailing_ws = unix_newlines.gsub(/[ \t]+\n/, "\n")
  squeezed = Helpers.squeeze_blank_lines_outside_fences(without_trailing_ws, max_blank: 2)

  squeezed
    .gsub(/\A(?:[ \t]*\n)+/, '')
    .gsub(/(?:\n[ \t]*)+\z/, '')
end
810
+
811
# Tells whether +node+ is rendered as a block: a heading, one of the
# transparent block containers, or one of p, pre, ul, ol, dl, table,
# blockquote, hr, figcaption. Non-element nodes are never block-like.
#
# @param node [Nokogiri::XML::Node]
# @return [Boolean] true if block-like
def block_like?(node)
  return false unless node.element?

  tag = node.name.downcase

  HEADING_LEVEL.key?(tag) ||
    BLOCK_CONTAINERS.include?(tag) ||
    %w[p pre ul ol dl table blockquote hr figcaption].include?(tag)
end
825
+ end
826
+ end