llm-docs-builder 0.10.0 → 0.12.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/ci.yml +13 -0
- data/.github/workflows/docker.yml +2 -2
- data/.github/workflows/push.yml +2 -2
- data/.gitignore +8 -0
- data/CHANGELOG.md +13 -0
- data/Gemfile +4 -0
- data/Gemfile.lock +47 -18
- data/README.md +19 -0
- data/lib/llm_docs_builder/cli.rb +32 -10
- data/lib/llm_docs_builder/comparator.rb +5 -75
- data/lib/llm_docs_builder/config.rb +42 -2
- data/lib/llm_docs_builder/helpers/prune_trailing_unsafe_link_separator.rb +31 -0
- data/lib/llm_docs_builder/helpers/squeeze_blank_lines_outside_fences.rb +71 -0
- data/lib/llm_docs_builder/helpers.rb +9 -0
- data/lib/llm_docs_builder/html_detector.rb +159 -0
- data/lib/llm_docs_builder/html_to_markdown/figure_code_block_renderer.rb +181 -0
- data/lib/llm_docs_builder/html_to_markdown/table_markup_renderer.rb +597 -0
- data/lib/llm_docs_builder/html_to_markdown_converter.rb +792 -0
- data/lib/llm_docs_builder/markdown_transformer.rb +30 -5
- data/lib/llm_docs_builder/output_formatter.rb +1 -1
- data/lib/llm_docs_builder/transformers/base_transformer.rb +13 -1
- data/lib/llm_docs_builder/url_fetcher.rb +138 -0
- data/lib/llm_docs_builder/version.rb +1 -1
- data/lib/llm_docs_builder.rb +11 -0
- data/llm-docs-builder.gemspec +1 -0
- metadata +23 -1
|
@@ -0,0 +1,792 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module LlmDocsBuilder
  # A lightweight HTML → Markdown converter using only Nokogiri's public API.
  #
  # Design goals:
  # - Traverse with Nokogiri and keep logic small, readable, and predictable
  # - Preserve the existing public behavior covered by specs
  # - Convert tables into Markdown while preserving inline formatting
  #
  # Rendering is split in two layers: "block" rendering (headings, lists,
  # blockquotes, code fences, tables, transparent containers) and "inline"
  # rendering (text, links, images, emphasis, inline code). Block renderers
  # join their parts with blank lines; inline renderers return a tuple of
  # `[text, has_markdown, optional_metadata]`.
  class HtmlToMarkdownConverter
    # Mapping of HTML heading tags to their numeric levels
    HEADING_LEVEL = {
      'h1' => 1,
      'h2' => 2,
      'h3' => 3,
      'h4' => 4,
      'h5' => 5,
      'h6' => 6
    }.freeze

    # HTML tags treated as transparent block containers
    BLOCK_CONTAINERS = %w[div aside figure article section main header footer nav body html].freeze

    # HTML tags rendered as bold/strong in markdown
    INLINE_STRONG_TAGS = %w[strong b].freeze

    # HTML tags rendered as italic/emphasis in markdown
    INLINE_EM_TAGS = %w[em i].freeze

    # HTML list container tags
    LIST_TAGS = %w[ul ol].freeze

    # HTML tags that should be completely ignored during conversion
    IGNORE_TAGS = %w[script style head noscript iframe svg canvas].freeze

    # Pattern for escaping markdown special characters in link labels
    MARKDOWN_LABEL_ESCAPE_PATTERN = /[\\\[\]()*_`!]/

    # URL schemes considered safe for link destinations
    SAFE_URI_SCHEMES = %w[http https mailto ftp tel].freeze

    # Indentation emitted per list nesting level. Two columns matches the
    # content offset of a "- " bullet, which is the minimum a nested item or
    # continuation block needs to be parsed as part of its parent list item.
    LIST_INDENT = '  '

    # Entry point for HTML to Markdown conversion
    #
    # @param html [String] HTML content to convert
    # @return [String] converted markdown content
    def convert(html)
      return '' if html.nil? || html.strip.empty?

      fragment = Nokogiri::HTML::DocumentFragment.parse(html)
      rendered = render_blocks(fragment.children, depth: 0)
      clean_output(rendered)
    end

    # Lazily builds (and memoizes) the table renderer, wiring it back to this
    # converter's inline and block rendering methods.
    #
    # @return [HtmlToMarkdown::TableMarkupRenderer]
    def table_renderer
      @table_renderer ||= HtmlToMarkdown::TableMarkupRenderer.new(
        inline_collapser: method(:collapsed_inline_for),
        block_renderer: method(:render_blocks)
      )
    end

    private

    # Renders a sequence of block-level nodes, inserting a blank line between blocks
    #
    # Consecutive text/inline nodes are buffered and emitted as one paragraph
    # whenever a block-level element (or the end of the sequence) is reached.
    #
    # @param nodes [Nokogiri::XML::NodeSet]
    # @param depth [Integer] nesting depth for lists
    # @return [String] rendered markdown
    def render_blocks(nodes, depth: 0)
      parts = []
      inline_buffer = []

      flush_inline = lambda do
        unless inline_buffer.empty?
          rendered_inline = collapse_inline_preserving_newlines(render_inline_nodes(inline_buffer))
          inline_buffer.clear
          parts << rendered_inline unless rendered_inline.empty?
        end
      end

      nodes.each do |node|
        if node.text?
          inline_buffer << node
          next
        end

        # Comments, processing instructions, etc. carry no renderable content
        next unless node.element?

        tag = node.name.downcase
        next if IGNORE_TAGS.include?(tag)

        if block_like?(node)
          flush_inline.call
          rendered = render_element_block(node, depth: depth)
          parts << rendered unless rendered.nil? || rendered.strip.empty?
        else
          inline_buffer << node
        end
      end

      flush_inline.call

      parts.join("\n\n")
    end

    # Render individual block element
    #
    # @param element [Nokogiri::XML::Element]
    # @param depth [Integer] nesting depth
    # @return [String] rendered markdown
    def render_element_block(element, depth: 0)
      tag = element.name.downcase

      return table_renderer.render_table(element) if tag == 'table'

      case tag
      when 'hr'
        '---'
      when *HEADING_LEVEL.keys
        text = collapsed_inline_for(element)
        return '' if text.empty?

        "#{'#' * HEADING_LEVEL[tag]} #{text}"
      when 'blockquote'
        render_blockquote(element)
      when 'pre'
        render_fenced_code(element)
      when 'img'
        # Allow images to be emitted as their own block when they appear
        # directly under block containers (e.g., inside <figure>).
        render_image(element)
      when 'ul'
        render_list(element, ordered: false, depth: depth)
      when 'ol'
        start_index = parse_integer(element['start']) || 1
        render_list(element, ordered: true, depth: depth, start: start_index)
      when 'dl'
        render_definition_list(element)
      when 'figure'
        # Must precede BLOCK_CONTAINERS, which also lists 'figure'
        render_figure(element, depth: depth)
      when *BLOCK_CONTAINERS
        # Transparent block container: render its children as blocks.
        # If the container only has inline/text content, render that inline instead.
        render_transparent_container(element, depth: depth)
      else
        # Fallback: inline container at block level
        collapsed_inline_for(element)
      end
    end

    # Inline rendering
    #
    # @param node [Nokogiri::XML::Node]
    # @param escape_for_label [Boolean] whether to escape markdown in labels
    # @return [Array<String, Boolean, Symbol>] rendered text, has_markdown flag, and optional metadata
    def render_inline(node, escape_for_label: false)
      if node.text?
        text = inline_text(node.text)
        return [escape_for_label ? escape_markdown_label(text) : text, false]
      end

      # Non-element nodes leave tag nil and fall through to the generic branch
      tag = node.name.downcase if node.element?
      case tag
      when 'br'
        ["\n", false]
      when 'img'
        [render_image(node), true]
      when 'a'
        render_link(node)
      when *INLINE_STRONG_TAGS
        render_wrapped_inline(node, '**', escape_for_label: escape_for_label)
      when *INLINE_EM_TAGS
        render_wrapped_inline(node, '*', escape_for_label: escape_for_label)
      when 'code'
        [render_inline_code(node), true]
      else
        render_inline_children(node, escape_for_label: escape_for_label)
      end
    end

    # Render transparent block container
    #
    # @param element [Nokogiri::XML::Element] container element
    # @param depth [Integer] nesting depth
    # @return [String] rendered content
    def render_transparent_container(element, depth:)
      blocks = render_blocks(element.children, depth: depth)
      if blocks.strip.empty?
        collapsed_inline_for(element)
      else
        blocks
      end
    end

    # Render figure element
    #
    # Tries the dedicated figure/code-block renderer first; when it produces
    # nothing the figure is treated as a plain transparent container.
    #
    # @param element [Nokogiri::XML::Element] figure element
    # @param depth [Integer] nesting depth
    # @return [String] rendered markdown
    def render_figure(element, depth:)
      renderer = HtmlToMarkdown::FigureCodeBlockRenderer.new(
        element,
        inline_collapser: method(:collapsed_inline_for),
        fence_calculator: method(:compute_code_fence)
      )
      rendered = renderer.render
      return render_transparent_container(element, depth: depth) if rendered.nil? || rendered.strip.empty?

      render_figure_children_in_original_order(
        element,
        code_block_node: renderer.code_block_node,
        rendered_code: rendered,
        depth: depth
      )
    end

    # Render figure children preserving order
    #
    # The pre-rendered code block replaces whichever direct child of the figure
    # contains the code node; figcaptions and whitespace-only text are skipped.
    #
    # @param element [Nokogiri::XML::Element] figure element
    # @param code_block_node [Nokogiri::XML::Element] code block node
    # @param rendered_code [String] rendered code
    # @param depth [Integer] nesting depth
    # @return [String] rendered content
    def render_figure_children_in_original_order(element, code_block_node:, rendered_code:, depth:)
      direct_code_child = figure_direct_child_for(element, code_block_node)
      parts = []
      code_inserted = false

      element.children.each do |child|
        next if figcaption?(child)
        next if child.text? && child.text.strip.empty?

        if !direct_code_child.nil? && child.equal?(direct_code_child)
          parts << rendered_code
          code_inserted = true
          next
        end

        rendered_child = render_blocks([child], depth: depth)
        parts << rendered_child unless rendered_child.nil? || rendered_child.strip.empty?
      end

      # The code node was not found among the children: emit the code first
      parts.unshift(rendered_code) unless code_inserted
      parts.join("\n\n")
    end

    # Find direct child of figure containing the node
    #
    # @param element [Nokogiri::XML::Element] figure element
    # @param node [Nokogiri::XML::Element]
    # @return [Nokogiri::XML::Element, nil] direct child or nil
    def figure_direct_child_for(element, node)
      return nil if node.nil?

      current = node
      # Walk up until the parent is the figure itself (or the tree runs out)
      current = current.parent until current.nil? || current.parent.nil? || current.parent.equal?(element)

      return nil if current.nil? || !current.parent.equal?(element)

      current
    end

    # Check if node is a figcaption element
    #
    # @param node [Nokogiri::XML::Node]
    # @return [Boolean] true if figcaption
    def figcaption?(node)
      node.element? && node.name.casecmp('figcaption').zero?
    end

    # Render inline children of parent element
    #
    # @param parent [Nokogiri::XML::Element] parent element
    # @param escape_for_label [Boolean] whether to escape for labels
    # @return [Array<String, Boolean>] rendered text and has_markdown flag
    def render_inline_children(parent, escape_for_label: false)
      has_markdown = false
      parts = []

      parent.children.each do |child|
        # Skip nodes detached while rendering an earlier sibling
        # (see #prune_separator_text_node)
        next if child.parent.nil?

        s, marked, metadata = render_inline(child, escape_for_label: escape_for_label)
        Helpers.prune_trailing_unsafe_link_separator!(parts) if metadata == :unsafe_link_pruned
        next if s.nil? || s.empty?

        parts << s
        has_markdown ||= marked
      end

      [parts.join, has_markdown]
    end

    # Render inline children as string
    #
    # @param parent [Nokogiri::XML::Element] parent element
    # @return [String] rendered inline text
    def render_inline_string(parent)
      s, = render_inline_children(parent)
      s
    end

    # Collapse inline whitespace preserving newlines
    #
    # @param parent [Nokogiri::XML::Element] parent element
    # @return [String] collapsed inline text
    def collapsed_inline_for(parent)
      collapse_inline_preserving_newlines(render_inline_string(parent))
    end

    # Render wrapped inline element (strong, em)
    #
    # @param node [Nokogiri::XML::Element] element to wrap
    # @param wrapper [String] wrapper characters
    # @param escape_for_label [Boolean] whether to escape for labels
    # @return [Array<String, Boolean>] wrapped text and has_markdown flag
    def render_wrapped_inline(node, wrapper, escape_for_label: false)
      if escape_for_label
        s, = render_inline_children(node, escape_for_label: true)
        content = collapse_inline_preserving_newlines(s)
      else
        content = collapsed_inline_for(node)
      end
      return ['', false] if content.empty?

      ["#{wrapper}#{content}#{wrapper}", true]
    end

    # Render sequence of inline nodes
    #
    # @param nodes [Array<Nokogiri::XML::Node>]
    # @return [String] rendered text
    def render_inline_nodes(nodes)
      return '' if nodes.nil? || nodes.empty?

      parts = []
      nodes.each do |node|
        s, = render_inline(node)
        parts << s unless s.nil? || s.empty?
      end

      parts.join
    end

    # Render link element
    #
    # Links without an href degrade to their label; links with an unsafe
    # destination are dropped entirely and flagged with `:unsafe_link_pruned`.
    #
    # @param node [Nokogiri::XML::Element] link element
    # @return [Array<String, Boolean, Symbol>] rendered link, has_markdown flag, and optional metadata
    def render_link(node)
      href = (node['href'] || '').to_s
      sanitized_href = href.strip

      if sanitized_href.empty?
        label_str, label_has_md = render_inline_children(node)
        label = collapse_inline_preserving_newlines(label_str)
        return [label, label_has_md]
      end

      unless safe_link_destination?(sanitized_href)
        prune_unsafe_link_separators(node)
        return ['', false, :unsafe_link_pruned]
      end

      label_str, = render_inline_children(node, escape_for_label: true)
      label = collapse_inline_preserving_newlines(label_str)
      destination = format_markdown_link_destination(sanitized_href)
      ["[#{label}](#{destination})", true]
    end

    # Render image element
    #
    # Produces `![alt](src)` or `![alt](src "title")`. Images without a `src`
    # render as an empty string.
    #
    # @param node [Nokogiri::XML::Element] image element
    # @return [String] rendered image markdown
    def render_image(node)
      src = (node['src'] || '').to_s
      return '' if src.empty?

      alt = (node['alt'] || '').to_s
      title = (node['title'] || '').to_s
      title_part = title.empty? ? '' : %( "#{title}")
      destination = format_markdown_link_destination(src)
      "![#{alt}](#{destination}#{title_part})"
    end

    # Render inline code element
    #
    # @param node [Nokogiri::XML::Element] code element
    # @return [String] rendered inline code
    def render_inline_code(node)
      text = node.text.to_s.gsub(/\r\n?/, "\n").gsub(/\n+/, ' ').strip
      return '' if text.empty?

      # The delimiter must be longer than any backtick run inside the code
      fence_len = (text.scan(/`+/).map(&:length).max || 0) + 1
      fence = '`' * fence_len
      "#{fence}#{text}#{fence}"
    end

    # Render blockquote element
    #
    # @param node [Nokogiri::XML::Element] blockquote element
    # @return [String] rendered blockquote markdown
    def render_blockquote(node)
      # Render blockquote differently based on whether it contains block-level elements.
      # If it only has inline/text content, preserve the inline sequence instead of
      # attempting block rendering (which would drop surrounding text nodes).
      has_block_children = node.element_children.any? { |child| block_like?(child) }

      inner =
        if has_block_children
          render_blocks(node.children, depth: 0)
        else
          collapsed_inline_for(node)
        end
      return '' if inner.strip.empty?

      lines = inner.split("\n")
      lines.map { |line| line.strip.empty? ? '>' : "> #{line}" }.join("\n")
    end

    # Render fenced code block
    #
    # @param node [Nokogiri::XML::Element] pre element
    # @return [String] rendered code block
    def render_fenced_code(node)
      inner_code = node.at_css('code')
      code = inner_code ? inner_code.text.to_s : node.text.to_s
      code = code.gsub(/\r\n?/, "\n").rstrip
      fence = compute_code_fence(code)
      "#{fence}\n#{code}\n#{fence}"
    end

    # Compute appropriate code fence length
    #
    # @param code [String] code content
    # @return [String] fence string (at least three backticks, and longer than
    #   any backtick run found in the code)
    def compute_code_fence(code)
      text = code.to_s
      longest_sequence = text.scan(/`+/).map(&:length).max || 0
      fence_length = [3, longest_sequence + 1].max
      '`' * fence_length
    end

    # Render list (ordered or unordered)
    #
    # @param list_node [Nokogiri::XML::Element] list element
    # @param ordered [Boolean] whether list is ordered
    # @param depth [Integer] nesting depth
    # @param start [Integer, nil] starting number for ordered lists
    # @return [String] rendered list markdown
    def render_list(list_node, ordered:, depth:, start: nil)
      lines = []
      index = ordered ? (start || 1) : nil
      indent = LIST_INDENT * depth

      list_node.element_children.each do |child|
        next unless child.name.downcase == 'li'

        # <li value="N"> restarts numbering from N
        override = ordered ? parse_integer(child['value']) : nil
        index = override unless override.nil?

        prefix =
          if ordered
            "#{indent}#{index}. "
          else
            "#{indent}- "
          end

        index = (index || 0) + 1 if ordered

        segments = build_list_item_segments(child)
        inline_text, segments = extract_leading_inline_text(segments, depth: depth)
        inline_text = collapse_inline_preserving_newlines(inline_text)

        bullet_line = inline_text.empty? ? prefix.rstrip : "#{prefix}#{inline_text}"
        item_lines = [bullet_line]

        previous_type = nil
        segments.each do |segment|
          segment_lines = render_list_item_segment(segment, depth: depth)
          next if segment_lines.empty?

          # A nested list directly after the bullet line stays tight; every
          # other segment is separated from what precedes it by a blank line.
          insert_blank_line =
            case segment.first
            when :nested_list
              %i[block inline].include?(previous_type)
            else
              true
            end

          item_lines << '' if insert_blank_line && !item_lines.last.to_s.empty?
          item_lines.concat(segment_lines)
          previous_type = segment.first
        end

        lines << item_lines.join("\n")
      end

      lines.join("\n")
    end

    # Build segments for list item content
    #
    # @param list_item [Nokogiri::XML::Element] list item element
    # @return [Array<Array>] array of segment tuples [type, value] where type is
    #   :inline (value: array of nodes), :block or :nested_list (value: element)
    def build_list_item_segments(list_item)
      segments = []
      inline_buffer = []

      list_item.children.each do |child|
        if child.element? && LIST_TAGS.include?(child.name.downcase)
          segments << [:inline, inline_buffer] unless inline_buffer.empty?
          inline_buffer = []
          segments << [:nested_list, child]
        elsif block_like?(child)
          segments << [:inline, inline_buffer] unless inline_buffer.empty?
          inline_buffer = []
          segments << [:block, child]
        else
          inline_buffer << child
        end
      end

      segments << [:inline, inline_buffer] unless inline_buffer.empty?
      segments
    end

    # Extract leading inline text from segments
    #
    # Consumes segments from the front (mutating the array) until it finds text
    # suitable for the bullet line: the first non-empty inline run, or a leading
    # block that renders to a single line.
    #
    # @param segments [Array<Array>] segment tuples
    # @param depth [Integer] nesting depth
    # @return [Array<String, Array>] inline text and remaining segments
    def extract_leading_inline_text(segments, depth:)
      loop do
        return ['', segments] if segments.empty?

        type, value = segments.first

        case type
        when :inline
          segments.shift
          candidate = collapse_inline_preserving_newlines(render_inline_nodes(value))
          # Whitespace-only run: drop it and look at the next segment
          next if candidate.empty?

          return [candidate, segments]
        when :block
          rendered = render_element_block(value, depth: depth + 1)
          if rendered && !rendered.include?("\n")
            segments.shift
            return [collapse_inline_preserving_newlines(rendered), segments]
          end

          return ['', segments]
        else
          return ['', segments]
        end
      end
    end

    # Render individual list item segment
    #
    # @param segment [Array] segment tuple [type, value]
    # @param depth [Integer] nesting depth
    # @return [Array<String>] rendered lines
    def render_list_item_segment(segment, depth:)
      type, value = segment

      case type
      when :block
        rendered = render_element_block(value, depth: depth + 1)
        return [] if rendered.nil? || rendered.strip.empty?

        indent_list_block_lines(rendered, depth + 1)
      when :inline
        rendered = collapse_inline_preserving_newlines(render_inline_nodes(value))
        return [] if rendered.empty?

        indent_list_block_lines(rendered, depth + 1)
      when :nested_list
        ordered = value.name.downcase == 'ol'
        nested = render_list(
          value,
          ordered: ordered,
          depth: depth + 1,
          start: ordered ? parse_integer(value['start']) : nil
        )
        nested.empty? ? [] : nested.split("\n")
      else
        []
      end
    end

    # Indent lines for list blocks
    #
    # @param text [String]
    # @param depth [Integer] nesting depth
    # @return [Array<String>] indented lines (blank lines are left empty)
    def indent_list_block_lines(text, depth)
      indent = LIST_INDENT * depth

      text.split("\n").map do |line|
        line.strip.empty? ? '' : "#{indent}#{line}"
      end
    end

    # Render definition list element
    #
    # Each <dt> followed by one or more <dd> becomes "term\n: definition" with
    # one ": " line per definition. A term without definitions, or a definition
    # without a preceding term, is not emitted.
    #
    # @param dl_node [Nokogiri::XML::Element] definition list element
    # @return [String] rendered definition list
    def render_definition_list(dl_node)
      out = []
      pending_term = nil
      pending_definitions = []

      flush_pending = lambda do
        return if pending_term.nil? || pending_definitions.empty?

        entry = "#{pending_term}\n: #{pending_definitions.first}"
        pending_definitions.drop(1).each do |definition|
          entry << "\n: #{definition}"
        end

        out << entry
        pending_term = nil
        pending_definitions = []
      end

      dl_node.element_children.each do |child|
        case child.name.downcase
        when 'dt'
          flush_pending.call
          pending_term = collapsed_inline_for(child)
          pending_definitions = []
        when 'dd'
          defn = collapsed_inline_for(child)
          pending_definitions << defn if pending_term
        end
      end

      flush_pending.call

      out.join("\n\n")
    end

    # Helpers

    # Normalize whitespace in text
    #
    # @param text [String]
    # @return [String] text with every whitespace run replaced by a single space
    def normalize_whitespace(text)
      text.gsub(/[ \t\r\n\f\v]+/, ' ')
    end

    # Process inline text node
    #
    # Decodes HTML entities, then re-escapes angle brackets so literal "<" / ">"
    # from the source text are never interpreted as inline HTML by a Markdown
    # renderer, and finally collapses whitespace runs.
    #
    # @param text [String]
    # @return [String] processed text
    def inline_text(text)
      return '' if text.nil? || text.empty?

      decoded = CGI.unescapeHTML(text)
      return '' if decoded.empty?

      safe = decoded.gsub('<', '&lt;').gsub('>', '&gt;')
      normalize_whitespace(safe)
    end

    # Collapse whitespace while preserving newlines
    #
    # Newlines are swapped for a placeholder token so that whitespace
    # normalization does not fold them into spaces, then restored.
    #
    # @param text [String]
    # @return [String] collapsed text
    def collapse_inline_preserving_newlines(text)
      return '' if text.nil? || text.empty?

      placeholder = '__LLM_BR__'
      marked = text.gsub("\r\n", "\n").tr("\r", "\n").gsub("\n", placeholder)
      collapsed = normalize_whitespace(marked).strip
      collapsed.gsub(placeholder, "\n")
    end

    # Escape special characters in markdown label
    #
    # @param text [String]
    # @return [String] escaped text
    def escape_markdown_label(text)
      text.to_s.gsub(MARKDOWN_LABEL_ESCAPE_PATTERN) { |char| "\\#{char}" }
    end

    # Format URL for markdown link destination
    #
    # @param url [String]
    # @return [String] formatted URL
    def format_markdown_link_destination(url)
      return '' if url.nil?

      str = url.to_s
      return str if str.empty?

      # Wrap in angle brackets when the URL contains characters that often
      # confuse markdown link destination parsing (e.g., spaces or parentheses).
      # CommonMark allows the form: [label](<url>)
      if str.match?(/[\s()]/)
        "<#{str}>"
      else
        str
      end
    end

    # Check if link destination is safe
    #
    # Fragment and path-relative destinations are always accepted. Otherwise
    # script-capable schemes are rejected and any explicit scheme must be listed
    # in SAFE_URI_SCHEMES; scheme-less (relative) destinations are accepted.
    #
    # @param href [String] link href
    # @return [Boolean] true if safe
    def safe_link_destination?(href)
      return false if href.nil?

      sanitized = href.strip
      return false if sanitized.empty?
      return true if sanitized.start_with?('#', '/', './', '../')

      # Browsers remove ASCII tab/CR/LF from a URL before parsing it, so
      # "java\tscript:..." is still a javascript: URL. Detect the scheme on the
      # compacted form so such obfuscation cannot slip past the checks below.
      compact = sanitized.delete("\t\r\n")
      return false if compact.match?(/\A(?:javascript|vbscript|data)\s*:/i)

      if (match = compact.match(/\A([a-z][a-z0-9+\-.]*):/i))
        SAFE_URI_SCHEMES.include?(match[1].downcase)
      else
        true
      end
    end

    # Remove separator characters around unsafe links
    #
    # @param node [Nokogiri::XML::Element] link node
    # @return [void]
    def prune_unsafe_link_separators(node)
      return unless node

      [node.previous_sibling, node.next_sibling].each do |sibling|
        prune_separator_text_node(sibling)
      end
    end

    # Remove separator from text node if it's only a pipe
    #
    # @param sibling [Nokogiri::XML::Node, nil] sibling node
    # @return [void]
    def prune_separator_text_node(sibling)
      return unless sibling&.text?

      stripped = sibling.text.strip
      sibling.remove if stripped == '|'
    end

    # Parse integer from string value
    #
    # @param raw [String, nil] raw value
    # @return [Integer, nil] parsed integer or nil when raw is not an optionally
    #   signed run of digits
    def parse_integer(raw)
      return nil if raw.nil?

      str = raw.to_s.strip
      return nil unless str.match?(/\A[+-]?\d+\z/)

      str.to_i
    end

    # Clean and normalize output markdown
    #
    # @param output [String] raw output
    # @return [String] cleaned output
    def clean_output(output)
      cleaned = output.gsub(/\r\n?/, "\n")
      cleaned = cleaned.gsub(/[ \t]+\n/, "\n")
      cleaned = Helpers.squeeze_blank_lines_outside_fences(cleaned, max_blank: 2)
      # Trim leading/trailing blank lines but preserve significant trailing spaces
      cleaned = cleaned.gsub(/\A(?:[ \t]*\n)+/, '')
      cleaned.gsub(/(?:\n[ \t]*)+\z/, '')
    end

    # Check if node should be treated as a block element
    #
    # @param node [Nokogiri::XML::Node]
    # @return [Boolean] true if block-like
    def block_like?(node)
      return false unless node.element?

      tag = node.name.downcase
      return true if HEADING_LEVEL.key?(tag)
      return true if BLOCK_CONTAINERS.include?(tag)
      return true if %w[p pre ul ol dl table blockquote hr figcaption].include?(tag)

      false
    end
  end
end
|