llm-docs-builder 0.11.0 → 0.12.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/ci.yml +13 -0
- data/.github/workflows/docker.yml +2 -2
- data/.github/workflows/push.yml +2 -2
- data/.gitignore +8 -0
- data/CHANGELOG.md +7 -0
- data/Gemfile +4 -0
- data/Gemfile.lock +34 -5
- data/README.md +16 -0
- data/lib/llm_docs_builder/config.rb +33 -0
- data/lib/llm_docs_builder/helpers/prune_trailing_unsafe_link_separator.rb +31 -0
- data/lib/llm_docs_builder/helpers/squeeze_blank_lines_outside_fences.rb +71 -0
- data/lib/llm_docs_builder/helpers.rb +9 -0
- data/lib/llm_docs_builder/html_detector.rb +159 -0
- data/lib/llm_docs_builder/html_to_markdown/figure_code_block_renderer.rb +181 -0
- data/lib/llm_docs_builder/html_to_markdown/table_markup_renderer.rb +597 -0
- data/lib/llm_docs_builder/html_to_markdown_converter.rb +792 -0
- data/lib/llm_docs_builder/markdown_transformer.rb +23 -9
- data/lib/llm_docs_builder/output_formatter.rb +1 -1
- data/lib/llm_docs_builder/transformers/base_transformer.rb +13 -1
- data/lib/llm_docs_builder/url_fetcher.rb +18 -0
- data/lib/llm_docs_builder/version.rb +1 -1
- data/lib/llm_docs_builder.rb +10 -0
- data/llm-docs-builder.gemspec +1 -0
- metadata +22 -2
- data/AGENTS.md +0 -20
|
@@ -0,0 +1,181 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module LlmDocsBuilder
  # Provides HTML to Markdown conversion functionality
  #
  # This module contains specialized renderers for converting HTML elements
  # to Markdown format, with support for complex structures like tables,
  # figures, and syntax-highlighted code blocks.
  #
  # @api private
  module HtmlToMarkdown
    # Converts HTML figure elements that actually contain syntax-highlighted code back into fenced Markdown.
    #
    # A figure is treated as a code figure when its class attribute contains the
    # token "code" (case-insensitive). The code is read from the first matching
    # pre element, the language is detected from attributes and class names, and
    # an optional figcaption is appended to the fence info string.
    class FigureCodeBlockRenderer
      # Generic CSS class names commonly used for code formatting that should be ignored
      GENERIC_CODE_CLASSES = %w[highlight code main gutter numbers line-numbers line-number line wrap table].freeze

      # CSS selectors tried, in order, to locate the pre element holding the code
      CODE_PRE_SELECTORS = [
        'td.main pre',
        'td:not(.line-numbers) pre',
        'div.highlight pre',
        'pre'
      ].freeze

      # CSS selectors of the nodes inspected first (in order) during language detection
      LANGUAGE_HINT_SELECTORS = ['code', 'pre', 'td.main', 'div.highlight'].freeze

      # Selector for every descendant that could carry a language hint
      LANGUAGE_FALLBACK_SELECTOR = '[data-language], [data-lang], [lang], [class]'

      # Attributes that name the language directly, in priority order
      LANGUAGE_ATTRIBUTES = %w[data-language data-lang lang].freeze

      # Matches class tokens such as "language-ruby" or "lang-js"; group 1 is the language
      LANGUAGE_CLASS_PATTERN = /\A(?:language|lang)-(.*)\z/i.freeze

      # @return [Nokogiri::XML::Node, nil] the identified code block node
      attr_reader :code_block_node

      # Initialize a new figure code block renderer
      #
      # @param element [Nokogiri::XML::Node] the figure element to render
      # @param inline_collapser [Proc] callable for collapsing inline content
      # @param fence_calculator [Proc] callable for calculating fence length
      def initialize(element, inline_collapser:, fence_calculator:)
        @element = element
        @inline_collapser = inline_collapser
        @fence_calculator = fence_calculator
      end

      # Render the figure as a fenced code block
      #
      # Resets {#code_block_node} on every call; it is set again as soon as a
      # pre element is located, even if the figure turns out to hold no code lines.
      #
      # @return [String, nil] markdown fenced code block or nil if not a code figure
      #   or if no non-blank code lines were found
      def render
        @code_block_node = nil
        return unless code_figure?

        lines = extract_figure_code_lines
        return if lines.empty?

        language = detect_code_language
        caption = caption_text
        code_body = lines.join("\n")
        fence = fence_calculator.call(code_body)
        info_string = build_info_string([language, caption], fence)

        "#{fence}#{info_string}\n#{code_body}\n#{fence}"
      end

      private

      # @!attribute [r] element
      #   @return [Nokogiri::XML::Node] the figure element being processed
      # @!attribute [r] inline_collapser
      #   @return [Proc] callable for collapsing inline content
      # @!attribute [r] fence_calculator
      #   @return [Proc] callable for calculating fence length
      attr_reader :element, :inline_collapser, :fence_calculator

      # Build the info string that follows the opening fence
      #
      # Each part has whitespace runs (including newlines) collapsed to a single
      # space so the info string always stays on the fence line. When the fence
      # is made of backticks, backticks are removed from the parts because an
      # info string containing a backtick prevents the line from being parsed
      # as an opening fence.
      #
      # @param parts [Array<String, nil>] language and caption, in output order
      # @param fence [String] the fence returned by the fence calculator
      # @return [String] space-joined info string, possibly empty
      def build_info_string(parts, fence)
        backtick_fence = fence.to_s.start_with?('`')

        cleaned = parts.map do |part|
          text = part.to_s
          text = text.delete('`') if backtick_fence
          text.gsub(/\s+/, ' ').strip
        end

        cleaned.reject(&:empty?).join(' ')
      end

      # Extract caption text from figcaption element
      #
      # @return [String, nil] caption text or nil if no caption
      def caption_text
        caption_node = element.at_css('figcaption')
        return if caption_node.nil?

        inline_collapser.call(caption_node)
      end

      # Check if figure element represents a code block
      #
      # @return [Boolean] true if the figure's class list contains "code" (any letter case)
      def code_figure?
        class_tokens(element).any? { |token| token.casecmp('code').zero? }
      end

      # Extract code lines from figure element
      #
      # Also records the located pre element in {#code_block_node}.
      #
      # @return [Array<String>] array of code lines, empty when no pre element exists
      def extract_figure_code_lines
        pre = locate_code_pre
        @code_block_node = pre
        return [] unless pre

        line_nodes = pre.css('.line')
        lines =
          if line_nodes.any?
            # Highlighter output with one wrapper element per source line
            line_nodes.map { |line| extract_code_line_text(line) }
          else
            code_node = pre.at_css('code')
            raw = code_node ? code_node.text : pre.text
            # Limit -1 keeps trailing empty fields; clean_code_lines trims the outer blanks
            raw.to_s.gsub(/\r\n?/, "\n").split("\n", -1)
          end

        clean_code_lines(lines)
      end

      # Find the pre element that holds the code, trying the most specific selector first
      #
      # @return [Nokogiri::XML::Node, nil] first match of CODE_PRE_SELECTORS or nil
      def locate_code_pre
        CODE_PRE_SELECTORS.each do |selector|
          node = element.at_css(selector)
          return node if node
        end

        nil
      end

      # Extract text from a single code line node
      #
      # Non-breaking spaces become regular spaces, carriage returns (together with
      # a directly following line feed) are removed, and trailing whitespace is stripped.
      #
      # @param line_node [Nokogiri::XML::Element] line element
      # @return [String] extracted text
      def extract_code_line_text(line_node)
        text = line_node.xpath('.//text()').map(&:text).join
        text.tr("\u00a0", ' ').gsub(/\r\n?/, '').rstrip
      end

      # Clean and normalize code lines
      #
      # Normalizes CR/CRLF to LF inside each line and drops blank lines from the
      # start and the end; blank lines in the middle are preserved.
      #
      # @param lines [Array<String>] raw code lines
      # @return [Array<String>] cleaned lines
      def clean_code_lines(lines)
        sanitized = lines.map { |line| line.to_s.gsub(/\r\n?/, "\n") }
        sanitized.shift while sanitized.first&.strip&.empty?
        sanitized.pop while sanitized.last&.strip&.empty?
        sanitized
      end

      # Detect programming language from element attributes
      #
      # Checks the first code, pre, td.main and div.highlight descendants, then the
      # figure itself, then every descendant carrying a language attribute or a class.
      #
      # NOTE(review): the descendant fallback accepts any non-generic class name, so
      # a class on e.g. a highlighted token element could be reported as the language
      # when nothing more specific is present - confirm this is intended.
      #
      # @return [String, nil] detected language or nil
      def detect_code_language
        candidates = LANGUAGE_HINT_SELECTORS.map { |selector| element.at_css(selector) }
        candidates << element
        candidates = candidates.compact
        candidates.concat(element.css(LANGUAGE_FALLBACK_SELECTOR).to_a)

        candidates.each do |node|
          language = extract_language_from_node(node)
          return language unless language.nil? || language.empty?
        end

        nil
      end

      # Extract language identifier from node attributes
      #
      # Priority: data-language, data-lang, lang attributes; then a
      # "language-xxx"/"lang-xxx" class token; then the first class token that is
      # not listed in GENERIC_CODE_CLASSES.
      #
      # @param node [Nokogiri::XML::Element] element to examine
      # @return [String, nil] language identifier or nil
      def extract_language_from_node(node)
        LANGUAGE_ATTRIBUTES.each do |attr|
          value = node[attr].to_s.strip
          return value unless value.empty?
        end

        class_tokens(node).each do |token|
          if (match = token.match(LANGUAGE_CLASS_PATTERN))
            candidate = match[1].to_s.strip
            return candidate unless candidate.empty?

            # A bare "language-"/"lang-" prefix names no language; never report it
            next
          end

          next if GENERIC_CODE_CLASSES.include?(token.downcase)

          return token
        end

        nil
      end

      # Extract class tokens from node's class attribute
      #
      # @param node [Nokogiri::XML::Element] element to examine
      # @return [Array<String>] array of class names, empty when the attribute is missing or blank
      def class_tokens(node)
        node['class'].to_s.split(/\s+/).reject(&:empty?)
      end
    end
  end
end
|