llm-docs-builder 0.10.0 → 0.12.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,597 @@
1
+ # frozen_string_literal: true
2
+
3
+ module LlmDocsBuilder
4
+ module HtmlToMarkdown
5
+ # Handles conversion of HTML table markup to Markdown table format
6
+ class TableMarkupRenderer
7
# Build a renderer around the two callables it delegates to.
#
# @param inline_collapser [Proc] callable invoked with a single node when
#   collapsed inline text is needed
# @param block_renderer [Proc] callable invoked with a node's children and a
#   +depth:+ keyword when block content is rendered
def initialize(inline_collapser:, block_renderer:)
  @inline_collapser, @block_renderer = inline_collapser, block_renderer
end
15
+
16
# Convert an HTML table element to a Markdown table.
#
# Tables that contain nested tables are returned as raw HTML. Tables with a
# significant rowspan or colspan attribute are handed to the dedicated
# renderers. For every other table the first header row (a row inside
# <thead>, or one made only of <th> cells) becomes the Markdown header; when
# no such row exists the first row is used instead.
#
# @param table_node [Nokogiri::XML::Node] the HTML table element to convert
# @return [String] markdown table (prefixed by the caption when present),
#   the table's HTML for nested tables, or '' when no row has th/td cells
def render_table(table_node)
  return table_node.to_html if table_contains_nested_tables?(table_node)
  return render_table_with_rowspan_cells(table_node) if table_contains_rowspan_cells?(table_node)
  return render_table_with_colspan_cells(table_node) if table_contains_colspan_cells?(table_node)

  caption_text = caption_text_for(table_node)

  parsed_rows = table_node.css('tr').filter_map do |tr|
    cell_nodes = tr.element_children.select { |node| %w[th td].include?(node.name.downcase) }
    next if cell_nodes.empty?

    is_header = tr.ancestors('thead').any? ||
                cell_nodes.all? { |node| node.name.casecmp('th').zero? }

    { header: is_header, values: cell_nodes.map { |node| render_table_cell(node) } }
  end
  return '' if parsed_rows.empty?

  # Falling back to index 0 promotes the first row to header when none is marked.
  header_at = parsed_rows.index { |entry| entry[:header] } || 0
  header_values = parsed_rows[header_at][:values]
  body_values = parsed_rows
                .each_with_index
                .reject { |_entry, position| position == header_at }
                .map { |entry, _position| entry[:values] }

  column_count = [header_values.length, body_values.map(&:length).max || 0, 1].max

  header_cells = pad_table_row(header_values, column_count).map { |value| table_cell_data(value) }
  body_cells = body_values.map do |values|
    pad_table_row(values, column_count).map { |value| table_cell_data(value) }
  end

  specs = compute_table_column_specs(header_cells, body_cells)

  output = []
  output.concat(format_table_row(header_cells, specs))
  output << render_table_separator(specs.map { |spec| spec[:width] })
  body_cells.each { |cells| output.concat(format_table_row(cells, specs)) }

  with_optional_caption(caption_text, output.join("\n"))
end
78
+
79
+ private
80
+
81
# Render a table in which at least one cell carries a significant rowspan.
#
# Rows are expanded so that columns occupied by a cell spanning down from an
# earlier row receive an empty placeholder. Each expanded row is then
# written as one or more bordered lines, with the separator placed after the
# header row's lines.
#
# @param table_node [Nokogiri::XML::Element] table element
# @return [String] markdown table (with caption when present), or the
#   table's HTML when no row contains th/td cells
def render_table_with_rowspan_cells(table_node)
  caption_text = caption_text_for(table_node)

  # Shared across rows: remaining rowspan count per column.
  span_slots = []

  rows = table_node.css('tr').filter_map do |tr|
    cell_nodes = tr.element_children.select { |node| %w[th td].include?(node.name.downcase) }
    next if cell_nodes.empty?

    is_header = tr.ancestors('thead').any? ||
                cell_nodes.all? { |node| node.name.casecmp('th').zero? }

    { header: is_header, cells: expand_row_for_rowspans(cell_nodes, span_slots) }
  end
  return table_node.to_html if rows.empty?

  column_count = [rows.map { |row| row[:cells].length }.max || 1, 1].max
  header_index = find_header_index(rows, default: 0)

  # Turns one padded row into its bordered output lines; blank lines become a single space.
  bordered_lines = lambda do |cell_values|
    text_lines = format_rowspan_row_text(cell_values).to_s.split("\n", -1)
    text_lines = [''] if text_lines.empty?
    text_lines.map { |text| format_bordered_row_content(text.empty? ? ' ' : text) }
  end

  header_values = pad_table_row(rows[header_index][:cells], column_count)
  column_widths = column_widths_from_cells(header_values.map { |value| table_cell_data(value) })

  lines = bordered_lines.call(header_values)
  lines << render_table_separator(column_widths)

  rows.each_with_index do |row, index|
    next if index == header_index

    lines.concat(bordered_lines.call(pad_table_row(row[:cells], column_count)))
  end

  with_optional_caption(caption_text, lines.join("\n"))
end
137
+
138
# Render a table in which at least one cell carries a significant colspan.
#
# Each row is emitted as a single bordered line whose cells are pipe-escaped
# and joined with ' | '. The header line is built from the header values
# padded to the widest row, so it always has as many cells as the separator
# line beneath it; GFM only recognises a table when those two counts match.
# Data rows are emitted with the cells they actually contain.
#
# @param table_node [Nokogiri::XML::Element] table element
# @return [String] markdown table (with caption when present), or the
#   table's HTML when no row contains th/td cells
def render_table_with_colspan_cells(table_node)
  caption_text = caption_text_for(table_node)

  rows = table_node.css('tr').filter_map do |row|
    cells = row.element_children.select { |child| %w[th td].include?(child.name.downcase) }
    next if cells.empty?

    header_candidate = row.ancestors('thead').any? ||
                       cells.all? { |cell| cell.name.casecmp('th').zero? }

    { header: header_candidate, values: cells.map { |cell| render_table_cell(cell) } }
  end
  return table_node.to_html if rows.empty?

  # Escape literal pipes per cell so joining with ' | ' cannot create bogus columns.
  row_text = lambda do |values|
    values.map { |value| sanitize_table_cell_line(value, escape_pipes: true) }.join(' | ').strip
  end

  header_index = find_header_index(rows, default: 0)
  column_count = [rows.map { |row| row[:values].length }.max || 1, 1].max

  # The padded header drives both the header line and the separator widths,
  # keeping their cell counts identical even when a header cell spans columns.
  header_values = pad_table_row(rows[header_index][:values], column_count)
  column_widths = column_widths_from_cells(header_values.map { |value| table_cell_data(value) })

  lines = [
    format_bordered_row_content(row_text.call(header_values)),
    render_table_separator(column_widths)
  ]
  rows.each_with_index do |row, index|
    lines << format_bordered_row_content(row_text.call(row[:values])) unless index == header_index
  end

  with_optional_caption(caption_text, lines.join("\n"))
end
185
+
186
# Expand one row's cells into per-column values, honouring spans.
#
# Columns still covered by a rowspan from an earlier row receive '' and have
# their remaining count decremented. A cell's rendered value is placed in its
# first column and '' in the additional columns its colspan covers. Missing,
# non-integer or non-positive span attributes count as 1.
#
# @param cells [Array<Nokogiri::XML::Element>] cell elements
# @param span_slots [Array<Integer, nil>] remaining rowspan rows per column;
#   mutated in place so state carries over to the next row
# @return [Array<String>] expanded cell values
def expand_row_for_rowspans(cells, span_slots)
  expanded = []
  position = 0

  # Emit a placeholder for every consecutive column still held by an earlier rowspan.
  fill_carried = lambda do
    while span_slots[position].to_i.positive?
      expanded << ''
      remaining = span_slots[position].to_i - 1
      span_slots[position] = remaining.positive? ? remaining : nil
      position += 1
    end
  end

  cells.each do |cell|
    fill_carried.call

    text = render_table_cell(cell)
    colspan = [parse_integer(cell['colspan']) || 1, 1].max
    rowspan = [parse_integer(cell['rowspan']) || 1, 1].max
    carry = rowspan > 1 ? rowspan - 1 : nil

    colspan.times do |offset|
      expanded << (offset.zero? ? text : '')
      span_slots[position + offset] = carry
    end

    position += colspan
  end

  # Spans may also cover columns to the right of the last real cell.
  fill_carried.call

  expanded
end
227
+
228
# Build the text of one row for the rowspan renderer.
#
# Every cell is pipe-escaped, split into lines, and the n-th lines of all
# cells are joined with ' | '. Non-empty segments are left-justified to the
# widest segment of their own cell; empty segments stay empty.
#
# @param cells [Array<String>, String] cell values; anything that is not an
#   Array is converted to a string and split on ' | '
# @return [String] row text, one output line per cell line, joined by "\n";
#   '' when there are no cells
def format_rowspan_row_text(cells)
  raw_values = cells.is_a?(Array) ? cells.map(&:to_s) : cells.to_s.split(' | ')

  columns = raw_values.map do |raw|
    # Escape literal pipes per cell so joining with ' | ' adds no extra columns.
    escaped = sanitize_table_cell_line(raw, escape_pipes: true)
    pieces = escaped.gsub(/\r\n?/, "\n").split("\n")
    pieces.empty? ? [''] : pieces
  end

  height = columns.map(&:length).max || 0
  return '' if height.zero?

  widths = columns.map { |pieces| pieces.map(&:length).max || 0 }

  rendered = (0...height).map do |line_no|
    columns.zip(widths).map do |pieces, width|
      piece = pieces[line_no] || ''
      width.positive? && !piece.empty? ? piece.ljust(width) : piece
    end.join(' | ')
  end

  rendered.join("\n")
end
276
+
277
# Render the content of a single table cell.
#
# The cell's children go through the block renderer at depth 0. When that
# yields only whitespace, the inline collapser is applied to the cell itself.
#
# @param cell [Nokogiri::XML::Element] cell element
# @return [String] stripped block output, the inline collapser's result when
#   the block output is blank, or '' when the block renderer returns nil
def render_table_cell(cell)
  rendered = @block_renderer.call(cell.children, depth: 0)
  return '' if rendered.nil?

  trimmed = rendered.strip
  trimmed.empty? ? @inline_collapser.call(cell) : trimmed
end
290
+
291
# Break a rendered cell value into sanitized display lines.
#
# The value is split on line breaks (CRLF, CR or LF), each line is passed
# through split_table_cell_line, and whitespace-only segments are dropped.
#
# @param value [String, nil] cell value
# @return [Hash] +:lines+ (Array<String>, never empty; [''] for blank cells)
#   and +:pipe_split+ (true when any line reported a pipe split)
def table_cell_data(value)
  text = value.to_s
  return { lines: [''], pipe_split: false } if text.empty?

  any_split = false
  collected = []

  text.gsub(/\r\n?/, "\n").split("\n").each do |raw_line|
    parts, was_split = split_table_cell_line(raw_line)
    any_split ||= was_split
    collected.concat(parts)
  end

  kept = collected.reject { |part| part.strip.empty? }

  { lines: kept.empty? ? [''] : kept, pipe_split: any_split }
end
316
+
317
# Turn one cell line into its output segments.
#
# A line is always kept as a single segment; its literal pipes are escaped by
# sanitize_table_cell_line so they cannot break column boundaries. The split
# flag is therefore always false.
#
# @param line [String, nil] cell line text
# @return [Array<Array<String>, Boolean>] one-element segment list and the
#   split flag
def split_table_cell_line(line)
  blank = line.nil? || line.empty?
  segment = blank ? '' : sanitize_table_cell_line(line, escape_pipes: true)

  [[segment], false]
end
329
+
330
# Sanitize text for use inside a Markdown table cell.
#
# * Every backslash is doubled so it is emitted as a literal backslash.
# * Backtick runs are copied unchanged. A run opens a code span, and a later
#   run of the same length closes it.
# * When +escape_pipes+ is true, pipes outside code spans become "\|".
# * The result is stripped of surrounding whitespace.
#
# The character that follows a backslash is processed like any other
# character. Because the backslash itself is emitted doubled, it cannot
# escape what follows in the output, so a following pipe must still be
# escaped and a following backtick is still a real code-span delimiter.
#
# @param text [String, nil] cell text
# @param escape_pipes [Boolean] whether to escape pipe characters
# @return [String] sanitized text ('' for nil or empty input)
def sanitize_table_cell_line(text, escape_pipes: false)
  raw = text.to_s
  return '' if raw.empty?

  sanitized = +''
  inside_code = false
  fence_length = 0

  # Tokens: a run of backticks, a single backslash, a single pipe, or a run of anything else.
  raw.scan(/`+|[\\|]|[^`\\|]+/) do |token|
    case token[0]
    when '`'
      sanitized << token
      if !inside_code
        inside_code = true
        fence_length = token.length
      elsif token.length == fence_length
        # Only a run of the opening length closes the span.
        inside_code = false
        fence_length = 0
      end
    when '\\'
      sanitized << '\\\\'
    when '|'
      sanitized << (escape_pipes && !inside_code ? '\\|' : '|')
    else
      sanitized << token
    end
  end

  sanitized.strip
end
388
+
389
# Tell whether another table is nested somewhere inside this table.
#
# @param table_node [Nokogiri::XML::Element] table element
# @return [Boolean] true if the 'table' selector matches any descendant
def table_contains_nested_tables?(table_node)
  nested = table_node.css('table')
  !nested.empty?
end
396
+
397
# Tell whether any td/th in the table carries a significant rowspan.
#
# @param table_node [Nokogiri::XML::Element] table element
# @return [Boolean] true if at least one rowspan attribute is significant
#   according to span_value_significant?
def table_contains_rowspan_cells?(table_node)
  raw_spans = table_node.css('td[rowspan], th[rowspan]').map { |cell| cell['rowspan'] }
  raw_spans.any? { |raw| span_value_significant?(raw) }
end
406
+
407
# Tell whether any td/th in the table carries a significant colspan.
#
# @param table_node [Nokogiri::XML::Element] table element
# @return [Boolean] true if at least one colspan attribute is significant
#   according to span_value_significant?
def table_contains_colspan_cells?(table_node)
  raw_spans = table_node.css('td[colspan], th[colspan]').map { |cell| cell['colspan'] }
  raw_spans.any? { |raw| span_value_significant?(raw) }
end
416
+
417
# Decide whether a rowspan/colspan attribute value is significant.
#
# A missing attribute (nil) is not significant. Any present value is
# significant unless, after stripping whitespace, it is exactly "1"; this
# includes empty, zero, negative, non-numeric and non-canonical values such
# as "01".
#
# @param raw_value [String, nil] span attribute value
# @return [Boolean] true if the span is significant
def span_value_significant?(raw_value)
  return false if raw_value.nil?

  raw_value.to_s.strip != '1'
end
433
+
434
# Return a copy of the row that has exactly +length+ entries.
#
# Short rows are filled with '' on the right; long rows are truncated. The
# input array is not modified.
#
# @param values [Array<String>, nil] row values (nil is treated as empty)
# @param length [Integer] desired length
# @return [Array<String>] padded row
def pad_table_row(values, length)
  row = values.nil? ? [] : values.dup
  missing = length - row.length
  row.concat(Array.new(missing) { '' }) if missing.positive?

  row[0, length]
end
447
+
448
# Work out the width and padding flag of each column.
#
# A column needs padding when any of its cells (header included) has more
# than one line and was not produced by a pipe split. Padded columns are as
# wide as their longest line; other columns are as wide as the header's
# longest line. The width is never below 1.
#
# @param header_cells [Array<Hash>] header cell data (defines the column count)
# @param data_cells [Array<Array<Hash>>] data cell data, one array per row
# @return [Array<Hash>] one +{ width:, pad: }+ hash per column
def compute_table_column_specs(header_cells, data_cells)
  blank = { lines: [''], pipe_split: false }
  widest_line = ->(cell) { cell[:lines].map(&:length).max || 0 }

  header_cells.each_with_index.map do |maybe_header, index|
    header = maybe_header || blank
    body = data_cells.map { |row| row[index] || blank }

    pad = [header, *body].any? { |cell| cell[:lines].length > 1 && !cell[:pipe_split] }

    candidates = [widest_line.call(header), 1]
    candidates.concat(body.map(&widest_line)) if pad

    { width: candidates.max, pad: pad }
  end
end
479
+
480
# Format one logical row as one or more bordered Markdown lines.
#
# A row is as tall as its tallest cell. Output lines whose values are all
# blank are skipped; if that leaves nothing, a single placeholder line of
# spaces sized by the column widths is returned.
#
# @param row_cells [Array<Hash>] cell data
# @param column_specs [Array<Hash>] column specifications (+:width+, +:pad+)
# @return [Array<String>] formatted row lines
def format_table_row(row_cells, column_specs)
  blank = { lines: [''], pipe_split: false }
  height = [row_cells.map { |cell| cell[:lines].length }.max || 0, 1].max

  rendered = (0...height).filter_map do |line_index|
    parts = column_specs.each_with_index.map do |spec, column_index|
      cell = row_cells[column_index] || blank
      text = cell[:lines][line_index] || ''
      spec[:pad] ? pad_table_cell_line(text, spec[:width]) : text.to_s
    end

    next if parts.all? { |part| part.strip.empty? }

    "| #{parts.join(' | ')} |"
  end
  return rendered unless rendered.empty?

  ["| #{column_specs.map { |spec| ' ' * spec[:width] }.join(' | ')} |"]
end
511
+
512
# Left-justify cell text to a target width.
#
# @param text [String, nil] cell text
# @param width [Integer] target width; zero or negative leaves the text as is
# @return [String] text padded with trailing spaces up to +width+
def pad_table_cell_line(text, width)
  value = text.to_s
  return value unless width.positive?

  value.ljust(width)
end
521
+
522
# Build the header/body separator line of a Markdown table.
#
# Each column gets width + 2 dashes (matching the one-space padding on both
# sides of a cell), with a minimum of three.
#
# @param column_widths [Array<Integer>] column widths
# @return [String] separator line
def render_table_separator(column_widths)
  dashes = column_widths.map { |width| '-' * [width + 2, 3].max }
  "|#{dashes.join('|')}|"
end
529
+
530
+ # Table helpers
531
+
532
# Read the table's caption as stripped inline text.
#
# @param table_node [Nokogiri::XML::Element] table element
# @return [String, nil] caption text, or nil when there is no caption
#   element or its collapsed text is blank
def caption_text_for(table_node)
  caption = table_node.at_css('caption')
  return nil unless caption

  text = @inline_collapser.call(caption).strip
  text.empty? ? nil : text
end
542
+
543
# Put the caption, followed by a blank line, in front of the table.
#
# @param caption_text [String, nil] caption text
# @param table_markdown [String] table markdown
# @return [String] the table alone when there is no caption, otherwise
#   caption and table separated by an empty line
def with_optional_caption(caption_text, table_markdown)
  return table_markdown unless caption_text

  [caption_text, table_markdown].join("\n\n")
end
551
+
552
# Locate the first row flagged as a header.
#
# @param rows [Array<Hash>] row data; a row counts as a header when its
#   +:header+ entry is truthy
# @param default [Integer, nil] value returned when no row is flagged
# @return [Integer, nil] header row index, or +default+
def find_header_index(rows, default: nil)
  rows.each_with_index { |row, position| return position if row[:header] }
  default
end
561
+
562
# Wrap row content in the leading and trailing table borders.
#
# @param content [String, nil] row content
# @return [String] bordered row; empty content is shown as a single space
def format_bordered_row_content(content)
  inner = content.to_s
  inner.empty? ? '|   |' : "| #{inner} |"
end
571
+
572
# Derive one width per cell from its longest line.
#
# @param cells [Array<Hash>] cell data with a +:lines+ array
# @return [Array<Integer>] column widths, each at least 1
def column_widths_from_cells(cells)
  cells.map { |cell| [*cell[:lines].map(&:length), 1].max }
end
582
+
583
# Parse a strictly formatted integer.
#
# Surrounding whitespace is ignored. The remaining text must be an optional
# sign followed only by digits.
#
# @param raw [String, nil] raw value
# @return [Integer, nil] parsed integer, or nil for nil or malformed input
def parse_integer(raw)
  text = raw.nil? ? nil : raw.to_s.strip
  return nil unless text&.match?(/\A[+-]?\d+\z/)

  text.to_i
end
595
+ end
596
+ end
597
+ end