llm-docs-builder 0.11.0 → 0.12.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,181 @@
1
+ # frozen_string_literal: true
2
+
3
module LlmDocsBuilder
  # Provides HTML to Markdown conversion functionality
  #
  # This module contains specialized renderers for converting HTML elements
  # to Markdown format, with support for complex structures like tables,
  # figures, and syntax-highlighted code blocks.
  #
  # @api private
  module HtmlToMarkdown
    # Converts <figure> elements that actually contain syntax-highlighted code back into fenced Markdown.
    #
    # A figure is treated as a code figure when one of its class tokens is
    # "code" (case-insensitive). The code is read from the most specific
    # <pre> found inside the figure, the language is looked up in a handful
    # of attributes / class names, and an optional <figcaption> is appended
    # to the fence's info string.
    class FigureCodeBlockRenderer
      # Generic CSS class names commonly used for code formatting that should be ignored
      GENERIC_CODE_CLASSES = %w[highlight code main gutter numbers line-numbers line-number line wrap table].freeze

      # Attributes inspected (in this order) for an explicit language name
      LANGUAGE_ATTRIBUTES = %w[data-language data-lang lang].freeze

      # Matches class tokens of the form "language-xxx" / "lang-xxx"; group 1 is the language
      LANGUAGE_CLASS_PREFIX = /\A(?:language|lang)-(.*)\z/i

      private_constant :LANGUAGE_ATTRIBUTES, :LANGUAGE_CLASS_PREFIX

      # @return [Nokogiri::XML::Node, nil] the identified code block node
      #   (the <pre> chosen by the last {#render} call, or nil when none was found)
      attr_reader :code_block_node

      # Initialize a new figure code block renderer
      #
      # @param element [Nokogiri::XML::Node] the figure element to render
      # @param inline_collapser [Proc] callable for collapsing inline content
      # @param fence_calculator [Proc] callable for calculating fence length
      def initialize(element, inline_collapser:, fence_calculator:)
        @element = element
        @inline_collapser = inline_collapser
        @fence_calculator = fence_calculator
      end

      # Render the figure as a fenced code block
      #
      # Resets {#code_block_node} on every call, so the reader always reflects
      # the most recent render attempt.
      #
      # @return [String, nil] markdown fenced code block or nil if not a code figure
      #   (nil is also returned when the figure holds no non-blank code lines)
      def render
        @code_block_node = nil
        return unless code_figure?

        lines = extract_figure_code_lines
        return if lines.empty?

        language = detect_code_language
        caption = caption_text
        info_string = [language, caption].compact.reject(&:empty?).join(' ')
        code_body = lines.join("\n")
        fence = fence_calculator.call(code_body)

        # An empty info string simply leaves the opening fence bare.
        "#{fence}#{info_string}\n#{code_body}\n#{fence}"
      end

      private

      # @!attribute [r] element
      #   @return [Nokogiri::XML::Node] the figure element being processed
      # @!attribute [r] inline_collapser
      #   @return [Proc] callable for collapsing inline content
      # @!attribute [r] fence_calculator
      #   @return [Proc] callable for calculating fence length
      attr_reader :element, :inline_collapser, :fence_calculator

      # Extract caption text from figcaption element
      #
      # @return [String, nil] caption text or nil if no caption
      def caption_text
        caption_node = element.at_css('figcaption')
        return if caption_node.nil?

        inline_collapser.call(caption_node)
      end

      # Check if figure element represents a code block
      #
      # @return [Boolean] true if figure contains code
      def code_figure?
        class_tokens(element).any? { |token| token.casecmp('code').zero? }
      end

      # Extract code lines from figure element
      #
      # The <pre> is looked up from the most specific selector to the least
      # specific one so that a line-number gutter is not mistaken for the code.
      # The chosen node is remembered in {#code_block_node}.
      #
      # @return [Array<String>] array of code lines
      def extract_figure_code_lines
        pre = element.at_css('td.main pre') ||
              element.at_css('td:not(.line-numbers) pre') ||
              element.at_css('div.highlight pre') ||
              element.at_css('pre')
        @code_block_node = pre
        return [] unless pre

        line_nodes = pre.css('.line')
        lines =
          if line_nodes.any?
            # One element per source line: read each of them individually.
            line_nodes.map { |line| extract_code_line_text(line) }
          else
            # No per-line markup: split the raw text, keeping trailing empty lines (-1).
            source = pre.at_css('code') || pre
            source.text.to_s.gsub(/\r\n?/, "\n").split("\n", -1)
          end

        clean_code_lines(lines)
      end

      # Extract text from a single code line node
      #
      # Non-breaking spaces become regular spaces, carriage returns are dropped
      # and trailing whitespace is removed.
      #
      # @param line_node [Nokogiri::XML::Element] line element
      # @return [String] extracted text
      def extract_code_line_text(line_node)
        text = line_node.xpath('.//text()').map(&:text).join
        text.tr("\u00a0", ' ').gsub(/\r\n?/, '').rstrip
      end

      # Clean and normalize code lines
      #
      # Normalizes CR / CRLF to LF inside each line and strips blank lines from
      # both ends of the list; blank lines in the middle are preserved.
      #
      # @param lines [Array<String>] raw code lines
      # @return [Array<String>] cleaned lines
      def clean_code_lines(lines)
        sanitized = lines.map { |line| line.to_s.gsub(/\r\n?/, "\n") }
        sanitized.shift while sanitized.first&.strip&.empty?
        sanitized.pop while sanitized.last&.strip&.empty?
        sanitized
      end

      # Detect programming language from element attributes
      #
      # Candidates are tried in order: the first <code>, <pre>, td.main and
      # div.highlight inside the figure, the figure itself, and finally every
      # descendant carrying a language-ish attribute or a class.
      #
      # NOTE(review): the descendant fallback accepts any non-generic class
      # token, so it can return an unrelated class name when no explicit
      # language is present - confirm this is intended.
      #
      # @return [String, nil] detected language or nil
      def detect_code_language
        preferred = ['code', 'pre', 'td.main', 'div.highlight'].map { |selector| element.at_css(selector) }
        candidates = [*preferred, element].compact
        candidates.concat(element.css('[data-language], [data-lang], [lang], [class]'))

        candidates.each do |node|
          language = extract_language_from_node(node)
          return language unless language.nil? || language.empty?
        end

        nil
      end

      # Extract language identifier from node attributes
      #
      # Explicit attributes win over class names. Among class tokens, a
      # "language-xxx" / "lang-xxx" token yields "xxx"; otherwise the first
      # token that is not a generic code-formatting class is returned as is.
      #
      # @param node [Nokogiri::XML::Element] element to examine
      # @return [String, nil] language identifier or nil
      def extract_language_from_node(node)
        LANGUAGE_ATTRIBUTES.each do |attr|
          value = node[attr].to_s.strip
          return value unless value.empty?
        end

        class_tokens(node).each do |token|
          if (match = token.match(LANGUAGE_CLASS_PREFIX))
            candidate = match[1].to_s.strip
            return candidate unless candidate.empty?

            # A bare "language-" / "lang-" prefix names no language; skip it
            # instead of reporting the prefix itself as the language.
            next
          end

          next if GENERIC_CODE_CLASSES.include?(token.downcase)

          return token
        end

        nil
      end

      # Extract class tokens from node's class attribute
      #
      # @param node [Nokogiri::XML::Element] element to examine
      # @return [Array<String>] array of class names (empty when there is no class attribute)
      def class_tokens(node)
        (node['class'] || '').split(/\s+/).reject(&:empty?)
      end
    end
  end
end