llm-docs-builder 0.11.0 → 0.13.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36) hide show
  1. checksums.yaml +4 -4
  2. data/.github/workflows/ci.yml +35 -4
  3. data/.github/workflows/docker.yml +7 -7
  4. data/.github/workflows/push.yml +3 -3
  5. data/.gitignore +8 -0
  6. data/.rubocop.yml +1 -14
  7. data/.ruby-version +1 -1
  8. data/.yard-lint.yml +275 -0
  9. data/CHANGELOG.md +16 -0
  10. data/Dockerfile +14 -7
  11. data/Gemfile +1 -1
  12. data/Gemfile.lock +33 -25
  13. data/README.md +16 -0
  14. data/lib/llm_docs_builder/cli.rb +0 -1
  15. data/lib/llm_docs_builder/config.rb +33 -0
  16. data/lib/llm_docs_builder/helpers/prune_trailing_unsafe_link_separator.rb +31 -0
  17. data/lib/llm_docs_builder/helpers/squeeze_blank_lines_outside_fences.rb +71 -0
  18. data/lib/llm_docs_builder/helpers.rb +9 -0
  19. data/lib/llm_docs_builder/html_detector.rb +159 -0
  20. data/lib/llm_docs_builder/html_to_markdown/figure_code_block_renderer.rb +181 -0
  21. data/lib/llm_docs_builder/html_to_markdown/table_markup_renderer.rb +597 -0
  22. data/lib/llm_docs_builder/html_to_markdown_converter.rb +826 -0
  23. data/lib/llm_docs_builder/markdown_transformer.rb +23 -9
  24. data/lib/llm_docs_builder/output_formatter.rb +1 -1
  25. data/lib/llm_docs_builder/text_compressor.rb +2 -2
  26. data/lib/llm_docs_builder/transformers/base_transformer.rb +13 -1
  27. data/lib/llm_docs_builder/transformers/heading_transformer.rb +19 -7
  28. data/lib/llm_docs_builder/url_fetcher.rb +18 -0
  29. data/lib/llm_docs_builder/version.rb +1 -1
  30. data/lib/llm_docs_builder.rb +10 -0
  31. data/llm-docs-builder.gemspec +3 -2
  32. data/package-lock.json +331 -0
  33. data/package.json +9 -0
  34. data/renovate.json +22 -9
  35. metadata +31 -8
  36. data/AGENTS.md +0 -20
data/README.md CHANGED
@@ -68,6 +68,8 @@ llm-docs-builder transform --url https://yoursite.com/docs/page.html
68
68
  llm-docs-builder bulk-transform --config llm-docs-builder.yml
69
69
  ```
70
70
 
71
+ **HTML to Markdown Conversion:** The transformer automatically detects and converts HTML content to clean markdown format. This works seamlessly with both local files and remote URLs, converting HTML tables, code blocks, and other elements into their markdown equivalents.
72
+
71
73
  ## Installation
72
74
 
73
75
  ### Docker (Recommended)
@@ -85,6 +87,20 @@ gem install llm-docs-builder
85
87
 
86
88
  ## Features
87
89
 
90
+ ### Automatic HTML to Markdown Conversion
91
+
92
+ The tool automatically detects and converts HTML content to clean markdown:
93
+ - **HTML Tables** → Markdown tables
94
+ - **HTML Code Blocks** → Fenced code blocks
95
+ - **Figures & Captions** → Clean markdown equivalents
96
+ - **Seamless Integration** - Works with local files and remote URLs without special configuration
97
+
98
+ ```bash
99
+ # Transform HTML content automatically
100
+ llm-docs-builder transform --docs page-with-html.md
101
+ llm-docs-builder transform --url https://site.com/docs/api.html
102
+ ```
103
+
88
104
  ### Measure and Compare
89
105
 
90
106
  ```bash
@@ -399,7 +399,6 @@ module LlmDocsBuilder
399
399
  end
400
400
 
401
401
  # Display version information
402
- #
403
402
  def show_version
404
403
  puts "llm-docs-builder version #{LlmDocsBuilder::VERSION}"
405
404
  end
@@ -53,6 +53,39 @@ module LlmDocsBuilder
53
53
  # defaults for any options not specified via CLI.
54
54
  #
55
55
  # @param options [Hash] CLI options hash
56
+ # @option options [String] :docs path to documentation directory or file
57
+ # @option options [String] :base_url base URL for expanding relative links
58
+ # @option options [String] :title project title
59
+ # @option options [String] :description project description
60
+ # @option options [String] :body additional body content
61
+ # @option options [String] :output output file path
62
+ # @option options [Boolean] :convert_urls convert HTML URLs to markdown format
63
+ # @option options [Boolean] :remove_comments remove HTML comments
64
+ # @option options [Boolean] :normalize_whitespace normalize whitespace
65
+ # @option options [Boolean] :remove_badges remove badge images
66
+ # @option options [Boolean] :remove_frontmatter remove YAML/TOML frontmatter
67
+ # @option options [Boolean] :verbose enable verbose output
68
+ # @option options [String] :suffix suffix for transformed files
69
+ # @option options [Array<String>] :excludes glob patterns for files to exclude
70
+ # @option options [Boolean] :bulk enable bulk transformation mode
71
+ # @option options [Boolean] :include_hidden include hidden files
72
+ # @option options [Boolean] :remove_code_examples remove code blocks
73
+ # @option options [Boolean] :remove_images remove image syntax
74
+ # @option options [Boolean] :simplify_links simplify link text
75
+ # @option options [Boolean] :remove_blockquotes remove blockquote formatting
76
+ # @option options [Boolean] :generate_toc generate table of contents
77
+ # @option options [String] :custom_instruction custom instruction text
78
+ # @option options [Boolean] :remove_stopwords remove common stopwords
79
+ # @option options [Boolean] :remove_duplicates remove duplicate paragraphs
80
+ # @option options [Boolean] :normalize_headings normalize heading hierarchy
81
+ # @option options [String] :heading_separator separator for heading paths
82
+ # @option options [Boolean] :include_metadata include metadata in output
83
+ # @option options [Boolean] :include_tokens include token counts
84
+ # @option options [Boolean] :include_timestamps include timestamps
85
+ # @option options [Boolean] :include_priority include priority metadata
86
+ # @option options [Boolean] :calculate_compression calculate compression ratios
87
+ # @option options [String] :content raw markdown content
88
+ # @option options [String] :source_url source URL for content
56
89
  # @return [Hash] merged configuration with CLI overrides applied
57
90
  def merge_with_options(options)
58
91
  # CLI options override config file, config file provides defaults
@@ -0,0 +1,31 @@
1
+ # frozen_string_literal: true
2
+
3
module LlmDocsBuilder
  # Helper methods for content transformation
  #
  # @api private
  module Helpers
    # Strips a trailing pipe separator (and any whitespace around it) from the
    # end of a list of string parts, mutating the list in place.
    #
    # Working from the last element backwards, the method repeatedly:
    # - removes a trailing "|" plus surrounding whitespace from the last part,
    #   dropping the part entirely when nothing is left of it;
    # - drops the last part when it is blank;
    # - stops as soon as the last part ends in ordinary content.
    #
    # @param parts [Array<String>] array of string parts to process (mutated)
    # @return [void]
    def prune_trailing_unsafe_link_separator!(parts)
      until parts.empty?
        tail = parts.last
        without_pipe = tail.sub(/[ \t]*\|\s*\z/, '')

        if without_pipe == tail
          # No separator at the end: only blank parts may still be discarded.
          break unless tail.strip.empty?

          parts.pop
        else
          cleaned = without_pipe.rstrip
          if cleaned.empty?
            parts.pop
          else
            # Keep the shortened part and re-check it on the next pass, so
            # stacked separators such as "a | |" are removed completely.
            parts[-1] = cleaned
          end
        end
      end
    end

    module_function :prune_trailing_unsafe_link_separator!
  end
end
@@ -0,0 +1,71 @@
1
+ # frozen_string_literal: true
2
+
3
module LlmDocsBuilder
  module Helpers
    # Reduces consecutive blank lines outside of fenced code blocks.
    #
    # Lines inside a fence are copied through untouched. A fence opens on a
    # line made of optional leading whitespace, at least +min_fence+ identical
    # characters from +fence_chars+, and an optional info string. It closes on
    # a line with the same leading whitespace and a run of the same character
    # that is at least as long as the opening run (a closing fence may be
    # longer than its opener), followed only by whitespace.
    #
    # @param text [String, nil] input text to process; nil or empty yields ''
    # @param max_blank [Integer] maximum number of consecutive blank lines to allow
    # @param fence_chars [Array<String>] characters that can be used for code fences
    # @param min_fence [Integer] minimum length of fence character sequence
    # @return [String] processed text with squeezed blank lines
    def squeeze_blank_lines_outside_fences(text, max_blank: 2, fence_chars: %w[` ~], min_fence: 3)
      source = text.to_s
      return '' if source.empty?

      # Leading whitespace + a repeated fence character (length >= min_fence)
      # + optional info string.
      fence_set = Regexp.escape(fence_chars.join)
      open_re = /\A(\s*)([#{fence_set}])\2{#{min_fence - 1},}.*\z/

      close_re = nil # non-nil only while we are inside a fenced block
      blank_streak = 0

      # Split with -1 so trailing empty lines survive the round trip.
      kept = source.split("\n", -1).select do |line|
        if close_re
          close_re = nil if line.match?(close_re)
          next true
        end

        if (opening = line.match(open_re))
          indent = opening[1]
          marker = Regexp.escape(opening[2])
          # Measure the actual run length on this line, not just min_fence.
          run_length = line[indent.length..][/\A#{marker}+/].length
          # Built once per fence: same indent, same character, run >= opener.
          close_re = /\A#{Regexp.escape(indent)}#{marker}{#{run_length},}\s*\z/
          blank_streak = 0
          next true
        end

        if line.strip.empty?
          blank_streak += 1
          # Keep at most max_blank blank lines; drop the extras.
          blank_streak <= max_blank
        else
          blank_streak = 0
          true
        end
      end

      kept.join("\n")
    end

    module_function :squeeze_blank_lines_outside_fences
  end
end
@@ -0,0 +1,9 @@
1
+ # frozen_string_literal: true
2
+
3
module LlmDocsBuilder
  # Namespace declaration for the helper functions. It is intentionally empty
  # here; each helper reopens this module from its own file under helpers/.
  module Helpers
  end
end
7
+
8
+ require_relative 'helpers/squeeze_blank_lines_outside_fences'
9
+ require_relative 'helpers/prune_trailing_unsafe_link_separator'
@@ -0,0 +1,159 @@
1
+ # frozen_string_literal: true
2
+
3
module LlmDocsBuilder
  # Heuristics that decide whether loaded content should be handled as an HTML
  # document rather than markdown, plus related checks on content snippets.
  class HtmlDetector
    # Detect if loaded content is HTML instead of markdown.
    #
    # The cheap snippet check runs first; the full document is parsed only
    # when the snippet looks like HTML.
    #
    # @param content [String] raw content
    # @param snippet [String, nil] optional precomputed snippet
    # @return [Boolean]
    def html_content?(content, snippet = detection_snippet(content))
      html_content_snippet?(snippet) && full_html_document?(content)
    end

    # Build the snippet used for HTML detection: leading whitespace and any
    # leading HTML comments are dropped, then the first 500 characters are kept.
    #
    # @param content [String, nil]
    # @return [String, nil] nil when no content was given
    def detection_snippet(content)
      return unless content

      remaining = content.lstrip

      # Remote docs often include build metadata comments; skip them before tag detection.
      while remaining.sub!(/\A<!--.*?-->\s*/m, '')
        return '' if remaining.empty?
      end

      remaining.lstrip[0, 500]
    end

    # Determine whether a snippet should be treated as HTML: it must not
    # contain a markdown heading line and must start with a known HTML tag.
    #
    # @param snippet [String, nil]
    # @return [Boolean]
    def html_content_snippet?(snippet)
      return false if !snippet || snippet.empty?

      !markdown_heading_snippet?(snippet) && html_candidate_snippet?(snippet)
    end

    # Determine whether a snippet appears to start with HTML markup, i.e. a
    # doctype or one of a fixed list of common element names.
    #
    # @param snippet [String]
    # @return [Boolean]
    def html_candidate_snippet?(snippet)
      snippet.match?(/\A<\s*(?:!DOCTYPE\s+html|html\b|body\b|head\b|article\b|section\b|main\b|p\b|div\b|table\b|thead\b|tbody\b|tr\b|td\b|th\b|meta\b|link\b|h[1-6]\b|ul\b|ol\b|li\b|blockquote\b)/i)
    end

    # Parse the full content as HTML and reject it when bare text shows up
    # where wrapped markup is expected: any meaningful text directly under the
    # document root, or meaningful text directly under <body> that is not
    # acceptable inline body text.
    #
    # @param content [String]
    # @return [Boolean] false as well when the parser raises a syntax error
    def full_html_document?(content)
      parsed = Nokogiri::HTML::Document.parse(content)
      body_node = parsed.at('body')

      return false unless body_node
      return false if parsed.xpath('/text()').any? { |node| meaningful_text?(node.text) }

      body_node.xpath('./text()').all? do |node|
        fragment = node.text
        !meaningful_text?(fragment) || allow_inline_body_text?(content, fragment)
      end
    rescue Nokogiri::XML::SyntaxError
      false
    end

    # Checks if text contains meaningful non-whitespace content.
    #
    # @param text [String, nil]
    # @return [Boolean] true if text contains non-whitespace characters
    def meaningful_text?(text)
      text.nil? ? false : text.strip.match?(/\S/)
    end

    # Checks if text looks like markdown syntax: a heading, a bullet or
    # numbered list item, a blockquote, a code fence, or a rule/underline made
    # of three or more "-", "_" or "=" characters. Lines that start with "<"
    # are ignored.
    #
    # @param text [String, nil]
    # @return [Boolean] true if text contains markdown-like patterns
    def markdown_like_text?(text)
      return false if text.nil?
      return true if markdown_heading_snippet?(text)

      text.each_line.any? do |raw_line|
        candidate = raw_line.lstrip
        next false if candidate.empty? || candidate.start_with?('<')

        candidate.match?(/\A[*+-]\s+\S/) ||
          candidate.match?(/\A\d+\.\s+\S/) ||
          candidate.match?(/\A>\s+\S/) ||
          candidate.start_with?('```', '~~~') ||
          candidate.strip.match?(/\A(?:-{3,}|_{3,}|={3,})\z/)
      end
    end

    # Determines if inline body text should be allowed in HTML context: the
    # text must not look like markdown and the content must carry explicit
    # document wrapper tags.
    #
    # @param content [String] full content being processed
    # @param text [String] specific text to check
    # @return [Boolean] true if inline body text is acceptable
    def allow_inline_body_text?(content, text)
      !markdown_like_text?(text) && html_with_body_wrapper?(content)
    end

    # Checks if content has HTML document structure wrapper tags.
    #
    # @param content [String] content to check for HTML wrapper tags
    # @return [Boolean] true if content contains DOCTYPE, html, or body tags
    def html_with_body_wrapper?(content)
      [
        /<\s*!DOCTYPE\s+html/i,
        /<\s*html\b/i,
        /<\s*body\b/i
      ].any? { |wrapper| content.match?(wrapper) }
    end

    # Detect whether the snippet represents a table fragment we should preserve.
    #
    # @param snippet [String, nil]
    # @return [Boolean] true when the snippet starts with a table-related tag
    def table_fragment?(snippet)
      return false if !snippet || snippet.empty?

      snippet.match?(/\A<\s*(?:table|thead|tbody|tr|td|th)\b/i)
    end

    # Detect common markdown heading syntax within the snippet: any line
    # whose first non-blank characters are one or more "#" followed by
    # whitespace.
    #
    # @param snippet [String]
    # @return [Boolean]
    def markdown_heading_snippet?(snippet)
      snippet.each_line.any? { |raw_line| raw_line.lstrip.match?(/\A#+\s+/) }
    end
  end
end
@@ -0,0 +1,181 @@
1
+ # frozen_string_literal: true
2
+
3
module LlmDocsBuilder
  # Provides HTML to Markdown conversion functionality
  #
  # This module contains specialized renderers for converting HTML elements
  # to Markdown format, with support for complex structures like tables,
  # figures, and syntax-highlighted code blocks.
  #
  # @api private
  module HtmlToMarkdown
    # Converts <figure> elements that actually contain syntax-highlighted code back into fenced Markdown.
    class FigureCodeBlockRenderer
      # Generic CSS class names commonly used for code formatting that should be ignored
      GENERIC_CODE_CLASSES = %w[highlight code main gutter numbers line-numbers line-number line wrap table].freeze

      # Class tokens of the form "language-xxx" / "lang-xxx"; capture 1 is the (possibly empty) suffix
      LANGUAGE_CLASS_PATTERN = /\A(?:language|lang)-(.*)\z/i.freeze
      private_constant :LANGUAGE_CLASS_PATTERN

      # @return [Nokogiri::XML::Node, nil] the <pre> node picked as the code source by the
      #   last {#render} call, or nil when none was found
      attr_reader :code_block_node

      # Initialize a new figure code block renderer
      #
      # @param element [Nokogiri::XML::Node] the figure element to render
      # @param inline_collapser [Proc] callable for collapsing inline content
      # @param fence_calculator [Proc] callable for calculating fence length
      def initialize(element, inline_collapser:, fence_calculator:)
        @element = element
        @inline_collapser = inline_collapser
        @fence_calculator = fence_calculator
      end

      # Render the figure as a fenced code block.
      #
      # The info string after the opening fence is the detected language and
      # the caption text, joined by a space; empty parts are left out.
      #
      # @return [String, nil] markdown fenced code block, or nil if the element is not a
      #   code figure or contains no code lines
      def render
        @code_block_node = nil
        return unless code_figure?

        lines = extract_figure_code_lines
        return if lines.empty?

        info_string = [detect_code_language, caption_text].compact.reject(&:empty?).join(' ')
        code_body = lines.join("\n")
        fence = fence_calculator.call(code_body)
        "#{fence}#{info_string}\n#{code_body}\n#{fence}"
      end

      private

      # @!attribute [r] element
      #   @return [Nokogiri::XML::Node] the figure element being processed
      # @!attribute [r] inline_collapser
      #   @return [Proc] callable for collapsing inline content
      # @!attribute [r] fence_calculator
      #   @return [Proc] callable for calculating fence length
      attr_reader :element, :inline_collapser, :fence_calculator

      # Extract caption text from figcaption element
      #
      # @return [String, nil] caption text or nil if no caption
      def caption_text
        caption_node = element.at_css('figcaption')
        return if caption_node.nil?

        inline_collapser.call(caption_node)
      end

      # Check if figure element represents a code block, i.e. its class
      # attribute contains the token "code" (case-insensitive).
      #
      # @return [Boolean] true if figure is marked as code
      def code_figure?
        class_tokens(element).any? { |token| token.casecmp('code').zero? }
      end

      # Extract code lines from figure element and remember the chosen <pre>
      # in {#code_block_node}.
      #
      # Selectors are tried from most to least specific so that a line-number
      # gutter <pre> is not mistaken for the code itself.
      #
      # @return [Array<String>] array of code lines, empty when no <pre> exists
      def extract_figure_code_lines
        pre = element.at_css('td.main pre') ||
              element.at_css('td:not(.line-numbers) pre') ||
              element.at_css('div.highlight pre') ||
              element.at_css('pre')
        @code_block_node = pre
        return [] unless pre

        line_nodes = pre.css('.line')
        lines =
          if line_nodes.any?
            line_nodes.map { |line_node| extract_code_line_text(line_node) }
          else
            source = pre.at_css('code') || pre
            source.text.to_s.gsub(/\r\n?/, "\n").split("\n", -1)
          end

        clean_code_lines(lines)
      end

      # Extract text from a single code line node: non-breaking spaces become
      # plain spaces, carriage returns are removed and trailing whitespace is
      # stripped.
      #
      # @param line_node [Nokogiri::XML::Element] line element
      # @return [String] extracted text
      def extract_code_line_text(line_node)
        text = line_node.xpath('.//text()').map(&:text).join
        text.tr("\u00a0", ' ').gsub(/\r\n?/, '').rstrip
      end

      # Normalize line endings and drop blank lines at the start and end
      #
      # @param lines [Array<String>] raw code lines
      # @return [Array<String>] cleaned lines
      def clean_code_lines(lines)
        sanitized = lines.map { |line| line.to_s.gsub(/\r\n?/, "\n") }
        sanitized.shift while sanitized.first&.strip&.empty?
        sanitized.pop while sanitized.last&.strip&.empty?
        sanitized
      end

      # Detect programming language from element attributes.
      #
      # The most likely carriers are checked first (code, pre, td.main,
      # div.highlight, the figure itself), then every descendant that has a
      # language-ish or class attribute.
      #
      # @return [String, nil] detected language or nil
      def detect_code_language
        candidates = %w[code pre td.main div.highlight].map { |selector| element.at_css(selector) }
        candidates << element
        candidates.compact!
        candidates.concat(element.css('[data-language], [data-lang], [lang], [class]'))

        candidates.each do |node|
          language = extract_language_from_node(node)
          return language unless language.nil? || language.empty?
        end

        nil
      end

      # Extract language identifier from node attributes.
      #
      # Precedence: data-language, data-lang, lang attributes; then a
      # "language-xxx"/"lang-xxx" class token anywhere in the class list; then
      # the first class token that is not a generic code-formatting class.
      #
      # @param node [Nokogiri::XML::Element] element to examine
      # @return [String, nil] language identifier or nil
      def extract_language_from_node(node)
        %w[data-language data-lang lang].each do |attr|
          value = node[attr].to_s.strip
          return value unless value.empty?
        end

        tokens = class_tokens(node)

        # An explicit language-/lang- token wins regardless of its position,
        # so class="hljs language-ruby" resolves to "ruby", not "hljs".
        tokens.each do |token|
          suffix = token[LANGUAGE_CLASS_PATTERN, 1]
          return suffix unless suffix.nil? || suffix.empty?
        end

        # Fallback: first token that is neither generic nor a bare
        # "language-"/"lang-" prefix with nothing after it.
        tokens.find do |token|
          !token.match?(LANGUAGE_CLASS_PATTERN) && !GENERIC_CODE_CLASSES.include?(token.downcase)
        end
      end

      # Extract class tokens from node's class attribute
      #
      # @param node [Nokogiri::XML::Element] element to examine
      # @return [Array<String>] array of class names (empty when there is no class attribute)
      def class_tokens(node)
        (node['class'] || '').split(/\s+/).reject(&:empty?)
      end
    end
  end
end