llm-docs-builder 0.10.0 → 0.12.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,159 @@
1
+ # frozen_string_literal: true
2
+
3
module LlmDocsBuilder
  # Detects whether input should be treated as HTML and related snippet checks.
  #
  # Detection happens in two stages: a regex check against a short leading
  # snippet of the content ({#html_content_snippet?}) and, only when that
  # passes, a full parse of the content ({#full_html_document?}).
  #
  # NOTE(review): Nokogiri is referenced but not required here; it is assumed
  # to be loaded by the surrounding library before {#full_html_document?} runs.
  class HtmlDetector
    # Maximum number of characters kept in a detection snippet
    SNIPPET_LENGTH = 500

    # One or more leading HTML comments, each followed by optional whitespace
    LEADING_COMMENTS = /\A(?:<!--.*?-->\s*)+/m

    # Doctype or opening tag names that make a snippet an HTML candidate
    HTML_CANDIDATE_START = /
      \A<\s*(?:
        !DOCTYPE\s+html |
        (?:html|body|head|article|section|main|p|div|table|thead|tbody|tr|td|th|
           meta|link|h[1-6]|ul|ol|li|blockquote)\b
      )
    /ix

    # Doctype, <html> or <body> appearing anywhere in the content
    DOCUMENT_WRAPPER = /<\s*(?:!DOCTYPE\s+html|html\b|body\b)/i

    # Opening tag of a table or one of its structural children
    TABLE_FRAGMENT_START = /\A<\s*(?:table|thead|tbody|tr|td|th)\b/i

    # Line starting with one or more '#' followed by whitespace
    MARKDOWN_HEADING = /\A#+\s+/

    # Line starting a bullet, ordered list item, blockquote or code fence
    MARKDOWN_BLOCK_START = /\A(?:[*+-]\s+\S|\d+\.\s+\S|>\s+\S|```|~~~)/

    # Line consisting solely of three or more '-', '_' or '=' characters
    MARKDOWN_RULE = /\A(?:-{3,}|_{3,}|={3,})\z/

    # Detect if loaded content is HTML instead of markdown
    #
    # The snippet check runs first so that content which does not even start
    # like HTML never reaches the full document parse.
    #
    # @param content [String] raw content
    # @param snippet [String, nil] optional precomputed snippet
    # @return [Boolean]
    def html_content?(content, snippet = detection_snippet(content))
      return false unless html_content_snippet?(snippet)

      full_html_document?(content)
    end

    # Prepare a snippet of content for HTML detection by removing leading whitespace
    # and build metadata comments.
    #
    # @param content [String, nil]
    # @return [String, nil] at most {SNIPPET_LENGTH} characters, or nil when content is nil
    def detection_snippet(content)
      return unless content

      # Remote docs often include build metadata comments; skip them before tag detection.
      content.lstrip.sub(LEADING_COMMENTS, '').lstrip[0, SNIPPET_LENGTH]
    end

    # Determine whether a snippet should be treated as HTML.
    #
    # A snippet containing a markdown heading line is never treated as HTML,
    # even when it starts with an HTML tag.
    #
    # @param snippet [String, nil]
    # @return [Boolean]
    def html_content_snippet?(snippet)
      return false if snippet.nil? || snippet.empty?
      return false if markdown_heading_snippet?(snippet)

      html_candidate_snippet?(snippet)
    end

    # Determine whether a snippet appears to start with HTML markup.
    #
    # @param snippet [String]
    # @return [Boolean]
    def html_candidate_snippet?(snippet)
      snippet.match?(HTML_CANDIDATE_START)
    end

    # Check if the full document should be treated as HTML by parsing it and
    # ensuring we do not observe unwrapped markdown constructs like plain text or lists.
    #
    # @param content [String]
    # @return [Boolean] false when there is no body, when meaningful text sits
    #   at the document root, when a direct text child of the body is rejected
    #   by {#allow_inline_body_text?}, or when parsing raises a syntax error
    def full_html_document?(content)
      document = Nokogiri::HTML::Document.parse(content)
      body = document.at('body')
      return false unless body
      return false if document.xpath('/text()').any? { |node| meaningful_text?(node.text) }

      body.xpath('./text()').all? do |node|
        text = node.text
        # Whitespace-only text nodes between elements are not evidence of markdown
        !meaningful_text?(text) || allow_inline_body_text?(content, text)
      end
    rescue Nokogiri::XML::SyntaxError
      false
    end

    # Checks if text contains meaningful non-whitespace content
    #
    # @param text [String, nil]
    # @return [Boolean] true if text contains non-whitespace characters
    def meaningful_text?(text)
      return false if text.nil?

      !text.strip.empty?
    end

    # Checks if text looks like markdown syntax
    #
    # Besides headings, a line counts as markdown when it starts a bullet,
    # an ordered list item, a blockquote or a code fence, or when it is a
    # horizontal rule / setext underline.
    #
    # @param text [String, nil]
    # @return [Boolean] true if text contains markdown-like patterns
    def markdown_like_text?(text)
      return false if text.nil?
      return true if markdown_heading_snippet?(text)

      text.each_line.any? do |line|
        trimmed = line.lstrip
        # Blank lines and lines that open with a tag are never markdown markers
        next false if trimmed.empty? || trimmed.start_with?('<')

        trimmed.match?(MARKDOWN_BLOCK_START) || trimmed.strip.match?(MARKDOWN_RULE)
      end
    end

    # Determines if inline body text should be allowed in HTML context
    #
    # @param content [String] full content being processed
    # @param text [String] specific text to check
    # @return [Boolean] true if inline body text is acceptable
    def allow_inline_body_text?(content, text)
      return false if markdown_like_text?(text)

      html_with_body_wrapper?(content)
    end

    # Checks if content has HTML document structure wrapper tags
    #
    # @param content [String] content to check for HTML wrapper tags
    # @return [Boolean] true if content contains DOCTYPE, html, or body tags
    def html_with_body_wrapper?(content)
      content.match?(DOCUMENT_WRAPPER)
    end

    # Detect whether the snippet represents a table fragment we should preserve.
    #
    # @param snippet [String, nil]
    # @return [Boolean]
    def table_fragment?(snippet)
      return false if snippet.nil? || snippet.empty?

      snippet.match?(TABLE_FRAGMENT_START)
    end

    # Detect common markdown heading syntax within the snippet.
    #
    # Leading whitespace on each line is ignored; any line that then starts
    # with '#' characters followed by whitespace counts as a heading.
    #
    # @param snippet [String]
    # @return [Boolean]
    def markdown_heading_snippet?(snippet)
      snippet.each_line.any? { |line| line.lstrip.match?(MARKDOWN_HEADING) }
    end
  end
end
@@ -0,0 +1,181 @@
1
+ # frozen_string_literal: true
2
+
3
module LlmDocsBuilder
  # Provides HTML to Markdown conversion functionality
  #
  # This module contains specialized renderers for converting HTML elements
  # to Markdown format, with support for complex structures like tables,
  # figures, and syntax-highlighted code blocks.
  #
  # @api private
  module HtmlToMarkdown
    # Converts <figure> elements that actually contain syntax-highlighted code back into fenced Markdown.
    class FigureCodeBlockRenderer
      # Generic CSS class names commonly used for code formatting that should be ignored
      GENERIC_CODE_CLASSES = %w[highlight code main gutter numbers line-numbers line-number line wrap table].freeze

      # Attributes checked, in order, for an explicit language declaration
      LANGUAGE_ATTRIBUTES = %w[data-language data-lang lang].freeze

      # Class token of the form "language-xxx" / "lang-xxx"; capture 1 is the language
      LANGUAGE_CLASS_PREFIX = /\A(?:language|lang)-(.*)\z/i

      # Selectors tried, in order, to locate the <pre> that holds the code
      CODE_PRE_SELECTORS = ['td.main pre', 'td:not(.line-numbers) pre', 'div.highlight pre', 'pre'].freeze

      # Selectors whose first match is inspected for a language before the figure itself
      LANGUAGE_HINT_SELECTORS = ['code', 'pre', 'td.main', 'div.highlight'].freeze

      private_constant :LANGUAGE_ATTRIBUTES, :LANGUAGE_CLASS_PREFIX,
                       :CODE_PRE_SELECTORS, :LANGUAGE_HINT_SELECTORS

      # @return [Nokogiri::XML::Node, nil] the identified code block node
      attr_reader :code_block_node

      # Initialize a new figure code block renderer
      #
      # @param element [Nokogiri::XML::Node] the figure element to render
      # @param inline_collapser [Proc] callable for collapsing inline content
      # @param fence_calculator [Proc] callable for calculating fence length
      def initialize(element, inline_collapser:, fence_calculator:)
        @element = element
        @inline_collapser = inline_collapser
        @fence_calculator = fence_calculator
      end

      # Render the figure as a fenced code block
      #
      # The opening fence carries the detected language and the caption text
      # (space separated) when either is present. {#code_block_node} is reset
      # on every call and set to the located <pre>, if any.
      #
      # @return [String, nil] markdown fenced code block or nil if not a code figure
      def render
        @code_block_node = nil
        return unless code_figure?

        lines = extract_figure_code_lines
        return if lines.empty?

        info_string = [detect_code_language, caption_text].compact.reject(&:empty?).join(' ')
        code_body = lines.join("\n")
        # The fence is computed from the body by the injected calculator
        fence = fence_calculator.call(code_body)

        "#{fence}#{info_string}\n#{code_body}\n#{fence}"
      end

      private

      # @!attribute [r] element
      #   @return [Nokogiri::XML::Node] the figure element being processed
      # @!attribute [r] inline_collapser
      #   @return [Proc] callable for collapsing inline content
      # @!attribute [r] fence_calculator
      #   @return [Proc] callable for calculating fence length
      attr_reader :element, :inline_collapser, :fence_calculator

      # Extract caption text from figcaption element
      #
      # @return [String, nil] caption text or nil if no caption
      def caption_text
        caption_node = element.at_css('figcaption')
        return if caption_node.nil?

        inline_collapser.call(caption_node)
      end

      # Check if figure element represents a code block
      #
      # @return [Boolean] true if the figure has a "code" class (case-insensitive)
      def code_figure?
        class_tokens(element).any? { |token| token.casecmp?('code') }
      end

      # Extract code lines from figure element
      #
      # Records the located <pre> in {#code_block_node}. When the <pre> has
      # `.line` descendants each one becomes a line; otherwise the text of the
      # nested <code> (or of the <pre> itself) is split on newlines.
      #
      # @return [Array<String>] array of code lines
      def extract_figure_code_lines
        pre = find_code_pre
        @code_block_node = pre
        return [] unless pre

        line_nodes = pre.css('.line')
        lines =
          if line_nodes.any?
            line_nodes.map { |line| extract_code_line_text(line) }
          else
            # -1 keeps trailing empty fields so blank-line trimming sees them
            (pre.at_css('code') || pre).text.to_s.gsub(/\r\n?/, "\n").split("\n", -1)
          end

        clean_code_lines(lines)
      end

      # Locate the <pre> holding the code, preferring the most specific selector
      #
      # @return [Nokogiri::XML::Node, nil] first match across {CODE_PRE_SELECTORS}
      def find_code_pre
        CODE_PRE_SELECTORS.each do |selector|
          node = element.at_css(selector)
          return node if node
        end

        nil
      end

      # Extract text from a single code line node
      #
      # Non-breaking spaces become regular spaces, carriage returns are
      # dropped and trailing whitespace is removed.
      #
      # @param line_node [Nokogiri::XML::Element] line element
      # @return [String] extracted text
      def extract_code_line_text(line_node)
        text = line_node.xpath('.//text()').map(&:text).join
        text.tr("\u00a0", ' ').gsub(/\r\n?/, '').rstrip
      end

      # Clean and normalize code lines
      #
      # Normalizes line endings and drops blank lines from both ends;
      # blank lines in the middle are preserved.
      #
      # @param lines [Array<String>] raw code lines
      # @return [Array<String>] cleaned lines
      def clean_code_lines(lines)
        sanitized = lines.map { |line| line.to_s.gsub(/\r\n?/, "\n") }
        sanitized.shift while sanitized.first&.strip&.empty?
        sanitized.pop while sanitized.last&.strip&.empty?
        sanitized
      end

      # Detect programming language from element attributes
      #
      # Inspects the first match of each hint selector, then the figure
      # itself, then every descendant carrying a language attribute or class.
      #
      # @return [String, nil] detected language or nil
      def detect_code_language
        hinted = LANGUAGE_HINT_SELECTORS.map { |selector| element.at_css(selector) }
        descendants = element.css('[data-language], [data-lang], [lang], [class]').to_a

        ([*hinted, element].compact + descendants).each do |node|
          language = extract_language_from_node(node)
          return language unless language.nil? || language.empty?
        end

        nil
      end

      # Extract language identifier from node attributes
      #
      # Precedence: explicit language attributes, then any class token with a
      # "language-"/"lang-" prefix (regardless of its position in the class
      # list), then the first class token that is not a generic styling class.
      # Prefixed tokens with nothing after the dash are ignored.
      #
      # @param node [Nokogiri::XML::Element] element to examine
      # @return [String, nil] language identifier or nil
      def extract_language_from_node(node)
        LANGUAGE_ATTRIBUTES.each do |attr|
          value = node[attr].to_s.strip
          return value unless value.empty?
        end

        tokens = class_tokens(node)

        # A prefixed token is an explicit declaration, so it must win over
        # unrelated classes such as "hljs" that may precede it.
        tokens.each do |token|
          candidate = token[LANGUAGE_CLASS_PREFIX, 1].to_s.strip
          return candidate unless candidate.empty?
        end

        tokens.find do |token|
          !token.match?(LANGUAGE_CLASS_PREFIX) && !GENERIC_CODE_CLASSES.include?(token.downcase)
        end
      end

      # Extract class tokens from node's class attribute
      #
      # @param node [Nokogiri::XML::Element] element to examine
      # @return [Array<String>] array of class names
      def class_tokens(node)
        (node['class'] || '').split(/\s+/).reject(&:empty?)
      end
    end
  end
end