llm-docs-builder 0.10.0 → 0.12.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/ci.yml +13 -0
- data/.github/workflows/docker.yml +2 -2
- data/.github/workflows/push.yml +2 -2
- data/.gitignore +8 -0
- data/CHANGELOG.md +13 -0
- data/Gemfile +4 -0
- data/Gemfile.lock +47 -18
- data/README.md +19 -0
- data/lib/llm_docs_builder/cli.rb +32 -10
- data/lib/llm_docs_builder/comparator.rb +5 -75
- data/lib/llm_docs_builder/config.rb +42 -2
- data/lib/llm_docs_builder/helpers/prune_trailing_unsafe_link_separator.rb +31 -0
- data/lib/llm_docs_builder/helpers/squeeze_blank_lines_outside_fences.rb +71 -0
- data/lib/llm_docs_builder/helpers.rb +9 -0
- data/lib/llm_docs_builder/html_detector.rb +159 -0
- data/lib/llm_docs_builder/html_to_markdown/figure_code_block_renderer.rb +181 -0
- data/lib/llm_docs_builder/html_to_markdown/table_markup_renderer.rb +597 -0
- data/lib/llm_docs_builder/html_to_markdown_converter.rb +792 -0
- data/lib/llm_docs_builder/markdown_transformer.rb +30 -5
- data/lib/llm_docs_builder/output_formatter.rb +1 -1
- data/lib/llm_docs_builder/transformers/base_transformer.rb +13 -1
- data/lib/llm_docs_builder/url_fetcher.rb +138 -0
- data/lib/llm_docs_builder/version.rb +1 -1
- data/lib/llm_docs_builder.rb +11 -0
- data/llm-docs-builder.gemspec +1 -0
- metadata +23 -1
|
@@ -55,7 +55,7 @@ module LlmDocsBuilder
|
|
|
55
55
|
#
|
|
56
56
|
# @return [String] transformed markdown content
|
|
57
57
|
def transform
|
|
58
|
-
content =
|
|
58
|
+
content = load_content
|
|
59
59
|
|
|
60
60
|
# Build and execute transformation pipeline
|
|
61
61
|
content = cleanup_transformer.transform(content, options)
|
|
@@ -63,9 +63,7 @@ module LlmDocsBuilder
|
|
|
63
63
|
content = heading_transformer.transform(content, options)
|
|
64
64
|
content = compress_content(content) if should_compress?
|
|
65
65
|
content = enhancement_transformer.transform(content, options)
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
content
|
|
66
|
+
whitespace_transformer.transform(content, options)
|
|
69
67
|
end
|
|
70
68
|
|
|
71
69
|
private
|
|
@@ -114,7 +112,7 @@ module LlmDocsBuilder
|
|
|
114
112
|
|
|
115
113
|
# Compress content using TextCompressor
|
|
116
114
|
#
|
|
117
|
-
# @param content [String] content
|
|
115
|
+
# @param content [String] markdown content
|
|
118
116
|
# @return [String] compressed content
|
|
119
117
|
def compress_content(content)
|
|
120
118
|
compressor = TextCompressor.new
|
|
@@ -124,5 +122,32 @@ module LlmDocsBuilder
|
|
|
124
122
|
}
|
|
125
123
|
compressor.compress(content, compression_methods)
|
|
126
124
|
end
|
|
125
|
+
|
|
126
|
+
# Load source content either from provided string or file path
|
|
127
|
+
#
|
|
128
|
+
# @return [String] markdown content to transform
|
|
129
|
+
def load_content
|
|
130
|
+
content = options[:content] ? options[:content].dup : File.read(file_path)
|
|
131
|
+
snippet = html_detector.detection_snippet(content)
|
|
132
|
+
|
|
133
|
+
return content if html_detector.table_fragment?(snippet)
|
|
134
|
+
return html_to_markdown_converter.convert(content) if html_detector.html_content?(content, snippet)
|
|
135
|
+
|
|
136
|
+
content
|
|
137
|
+
end
|
|
138
|
+
|
|
139
|
+
# Memoized HTML to markdown converter
|
|
140
|
+
#
|
|
141
|
+
# @return [HtmlToMarkdownConverter]
|
|
142
|
+
def html_to_markdown_converter
|
|
143
|
+
@html_to_markdown_converter ||= HtmlToMarkdownConverter.new
|
|
144
|
+
end
|
|
145
|
+
|
|
146
|
+
# Memoized HTML detector
|
|
147
|
+
#
|
|
148
|
+
# @return [HtmlDetector]
|
|
149
|
+
def html_detector
|
|
150
|
+
@html_detector ||= HtmlDetector.new
|
|
151
|
+
end
|
|
127
152
|
end
|
|
128
153
|
end
|
|
@@ -1,6 +1,13 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
module LlmDocsBuilder
|
|
4
|
+
# Provides content transformation functionality
|
|
5
|
+
#
|
|
6
|
+
# This module contains specialized transformers for modifying markdown content,
|
|
7
|
+
# including cleanup operations, link processing, heading normalization, and
|
|
8
|
+
# content enhancement for AI consumption.
|
|
9
|
+
#
|
|
10
|
+
# @api private
|
|
4
11
|
module Transformers
|
|
5
12
|
# Base module for all transformers
|
|
6
13
|
#
|
|
@@ -11,9 +18,12 @@ module LlmDocsBuilder
|
|
|
11
18
|
module BaseTransformer
|
|
12
19
|
# Transform content
|
|
13
20
|
#
|
|
14
|
-
# @
|
|
21
|
+
# @abstract Subclasses must implement this method and document specific options
|
|
22
|
+
# @param content [String] markdown content
|
|
15
23
|
# @param options [Hash] transformation options
|
|
24
|
+
# @option options [Object] :* options vary by implementation - see specific transformer classes
|
|
16
25
|
# @return [String] transformed content
|
|
26
|
+
# @note Options vary by implementation - see specific transformer classes for supported keys
|
|
17
27
|
def transform(content, options = {})
|
|
18
28
|
raise NotImplementedError, "#{self.class} must implement #transform"
|
|
19
29
|
end
|
|
@@ -21,7 +31,9 @@ module LlmDocsBuilder
|
|
|
21
31
|
# Check if transformation should be applied
|
|
22
32
|
#
|
|
23
33
|
# @param options [Hash] transformation options
|
|
34
|
+
# @option options [Object] :* options vary by implementation - see specific transformer classes
|
|
24
35
|
# @return [Boolean] true if transformation should be applied
|
|
36
|
+
# @note Options vary by implementation - see specific transformer classes for supported keys
|
|
25
37
|
def should_transform?(options)
|
|
26
38
|
true
|
|
27
39
|
end
|
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'net/http'
|
|
4
|
+
require 'uri'
|
|
5
|
+
|
|
6
|
+
module LlmDocsBuilder
|
|
7
|
+
# Lightweight HTTP client for fetching remote documentation pages.
|
|
8
|
+
#
|
|
9
|
+
# Provides common functionality needed by multiple commands (transform, compare)
|
|
10
|
+
# including strict scheme validation, redirect handling and sensible timeouts.
|
|
11
|
+
class UrlFetcher
|
|
12
|
+
# Default user agent string for HTTP requests
|
|
13
|
+
DEFAULT_USER_AGENT = 'llm-docs-builder/1.0 (+https://github.com/mensfeld/llm-docs-builder)'
|
|
14
|
+
|
|
15
|
+
# Maximum number of redirects to follow
|
|
16
|
+
MAX_REDIRECTS = 10
|
|
17
|
+
|
|
18
|
+
# @param user_agent [String] HTTP user agent header value
|
|
19
|
+
# @param verbose [Boolean] enable redirect logging
|
|
20
|
+
# @param output [IO] IO stream used for redirect logging
|
|
21
|
+
def initialize(user_agent: DEFAULT_USER_AGENT, verbose: false, output: $stdout)
|
|
22
|
+
@user_agent = user_agent
|
|
23
|
+
@verbose = verbose
|
|
24
|
+
@output = output
|
|
25
|
+
end
|
|
26
|
+
|
|
27
|
+
# Fetch remote URL content while following redirects.
|
|
28
|
+
#
|
|
29
|
+
# @param url_string [String] URL to fetch
|
|
30
|
+
# @param redirect_count [Integer] current redirect depth (internal use)
|
|
31
|
+
# @return [String] response body
|
|
32
|
+
# @raise [Errors::GenerationError] on invalid URLs, network failures, or redirect loops
|
|
33
|
+
def fetch(url_string, redirect_count = 0)
|
|
34
|
+
if redirect_count >= MAX_REDIRECTS
|
|
35
|
+
raise(
|
|
36
|
+
Errors::GenerationError,
|
|
37
|
+
"Too many redirects (#{MAX_REDIRECTS}) when fetching #{url_string}"
|
|
38
|
+
)
|
|
39
|
+
end
|
|
40
|
+
|
|
41
|
+
uri = validate_and_parse_url(url_string)
|
|
42
|
+
|
|
43
|
+
http = Net::HTTP.new(uri.host, uri.port)
|
|
44
|
+
http.use_ssl = uri.scheme == 'https'
|
|
45
|
+
http.open_timeout = 10
|
|
46
|
+
http.read_timeout = 30
|
|
47
|
+
|
|
48
|
+
request = Net::HTTP::Get.new(uri.request_uri)
|
|
49
|
+
request['User-Agent'] = @user_agent
|
|
50
|
+
|
|
51
|
+
response = http.request(request)
|
|
52
|
+
|
|
53
|
+
case response
|
|
54
|
+
when Net::HTTPSuccess
|
|
55
|
+
response.body
|
|
56
|
+
when Net::HTTPRedirection
|
|
57
|
+
redirect_url = absolute_redirect_url(uri, response['location'])
|
|
58
|
+
log_redirect(redirect_url)
|
|
59
|
+
fetch(redirect_url, redirect_count + 1)
|
|
60
|
+
else
|
|
61
|
+
raise(
|
|
62
|
+
Errors::GenerationError,
|
|
63
|
+
"Failed to fetch #{url_string}: #{response.code} #{response.message}"
|
|
64
|
+
)
|
|
65
|
+
end
|
|
66
|
+
rescue Errors::GenerationError
|
|
67
|
+
raise
|
|
68
|
+
rescue StandardError => e
|
|
69
|
+
raise(
|
|
70
|
+
Errors::GenerationError,
|
|
71
|
+
"Error fetching #{url_string}: #{e.message}"
|
|
72
|
+
)
|
|
73
|
+
end
|
|
74
|
+
|
|
75
|
+
private
|
|
76
|
+
|
|
77
|
+
# Validate and parse URL string
|
|
78
|
+
#
|
|
79
|
+
# @param url_string [String] URL to validate
|
|
80
|
+
# @return [URI::HTTP, URI::HTTPS] parsed URI
|
|
81
|
+
# @raise [Errors::GenerationError] if URL is invalid or unsupported
|
|
82
|
+
def validate_and_parse_url(url_string)
|
|
83
|
+
uri = URI.parse(url_string)
|
|
84
|
+
|
|
85
|
+
unless %w[http https].include?(uri.scheme&.downcase)
|
|
86
|
+
raise(
|
|
87
|
+
Errors::GenerationError,
|
|
88
|
+
"Unsupported URL scheme: #{uri.scheme || 'none'} (only http/https allowed)"
|
|
89
|
+
)
|
|
90
|
+
end
|
|
91
|
+
|
|
92
|
+
if uri.host.nil? || uri.host.empty?
|
|
93
|
+
raise(
|
|
94
|
+
Errors::GenerationError,
|
|
95
|
+
"Invalid URL: missing host in #{url_string}"
|
|
96
|
+
)
|
|
97
|
+
end
|
|
98
|
+
|
|
99
|
+
uri
|
|
100
|
+
rescue URI::InvalidURIError => e
|
|
101
|
+
raise(
|
|
102
|
+
Errors::GenerationError,
|
|
103
|
+
"Invalid URL format: #{e.message}"
|
|
104
|
+
)
|
|
105
|
+
end
|
|
106
|
+
|
|
107
|
+
# Convert redirect location to absolute URL
|
|
108
|
+
#
|
|
109
|
+
# @param base_uri [URI] base URI
|
|
110
|
+
# @param location [String] redirect location
|
|
111
|
+
# @return [String] absolute redirect URL
|
|
112
|
+
# @raise [Errors::GenerationError] if location is invalid
|
|
113
|
+
def absolute_redirect_url(base_uri, location)
|
|
114
|
+
raise(
|
|
115
|
+
Errors::GenerationError,
|
|
116
|
+
"Redirect missing location header for #{base_uri}"
|
|
117
|
+
) if location.nil? || location.empty?
|
|
118
|
+
|
|
119
|
+
URI.join(base_uri, location).to_s
|
|
120
|
+
rescue URI::InvalidURIError => e
|
|
121
|
+
raise(
|
|
122
|
+
Errors::GenerationError,
|
|
123
|
+
"Invalid redirect URL from #{base_uri}: #{e.message}"
|
|
124
|
+
)
|
|
125
|
+
end
|
|
126
|
+
|
|
127
|
+
# Log redirect if verbose mode enabled
|
|
128
|
+
#
|
|
129
|
+
# @param url [String] redirect URL
|
|
130
|
+
# @return [void]
|
|
131
|
+
def log_redirect(url)
|
|
132
|
+
return unless @verbose
|
|
133
|
+
|
|
134
|
+
@output.puts(" Redirecting to #{url}...")
|
|
135
|
+
end
|
|
136
|
+
end
|
|
137
|
+
end
|
|
138
|
+
|
data/lib/llm_docs_builder.rb
CHANGED
|
@@ -4,10 +4,20 @@ require 'zeitwerk'
|
|
|
4
4
|
require 'pathname'
|
|
5
5
|
require 'find'
|
|
6
6
|
|
|
7
|
+
autoload(:Nokogiri, 'nokogiri')
|
|
8
|
+
autoload(:CGI, 'cgi')
|
|
9
|
+
|
|
7
10
|
loader = Zeitwerk::Loader.for_gem
|
|
8
11
|
loader.inflector.inflect('cli' => 'CLI')
|
|
9
12
|
loader.setup
|
|
10
13
|
|
|
14
|
+
# Build and optimize documentation for LLMs
|
|
15
|
+
#
|
|
16
|
+
# This gem provides tools for generating llms.txt files and transforming markdown
|
|
17
|
+
# documentation to be AI-friendly. It can reduce token consumption by 67-95% while
|
|
18
|
+
# preserving essential documentation content.
|
|
19
|
+
#
|
|
20
|
+
# @api public
|
|
11
21
|
module LlmDocsBuilder
|
|
12
22
|
class << self
|
|
13
23
|
# Generates llms.txt from existing markdown documentation
|
|
@@ -25,6 +35,7 @@ module LlmDocsBuilder
|
|
|
25
35
|
# @option options [Boolean] :convert_urls convert HTML URLs to markdown format (overrides
|
|
26
36
|
# config)
|
|
27
37
|
# @option options [Boolean] :verbose enable verbose output (overrides config)
|
|
38
|
+
# @option options [String] :content raw markdown content (used for remote sources)
|
|
28
39
|
# @return [String] generated llms.txt content
|
|
29
40
|
#
|
|
30
41
|
# @example Generate from docs directory
|
data/llm-docs-builder.gemspec
CHANGED
|
@@ -35,6 +35,7 @@ Gem::Specification.new do |spec|
|
|
|
35
35
|
spec.executables = spec.files.grep(%r{\Abin/}) { |f| File.basename(f) }
|
|
36
36
|
spec.require_paths = ['lib']
|
|
37
37
|
|
|
38
|
+
spec.add_dependency 'nokogiri', '~> 1.17'
|
|
38
39
|
spec.add_dependency 'zeitwerk', '~> 2.6'
|
|
39
40
|
|
|
40
41
|
spec.add_development_dependency 'bundler', '~> 2.0'
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: llm-docs-builder
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.
|
|
4
|
+
version: 0.12.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Maciej Mensfeld
|
|
@@ -9,6 +9,20 @@ bindir: bin
|
|
|
9
9
|
cert_chain: []
|
|
10
10
|
date: 1980-01-02 00:00:00.000000000 Z
|
|
11
11
|
dependencies:
|
|
12
|
+
- !ruby/object:Gem::Dependency
|
|
13
|
+
name: nokogiri
|
|
14
|
+
requirement: !ruby/object:Gem::Requirement
|
|
15
|
+
requirements:
|
|
16
|
+
- - "~>"
|
|
17
|
+
- !ruby/object:Gem::Version
|
|
18
|
+
version: '1.17'
|
|
19
|
+
type: :runtime
|
|
20
|
+
prerelease: false
|
|
21
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
22
|
+
requirements:
|
|
23
|
+
- - "~>"
|
|
24
|
+
- !ruby/object:Gem::Version
|
|
25
|
+
version: '1.17'
|
|
12
26
|
- !ruby/object:Gem::Dependency
|
|
13
27
|
name: zeitwerk
|
|
14
28
|
requirement: !ruby/object:Gem::Requirement
|
|
@@ -132,6 +146,13 @@ files:
|
|
|
132
146
|
- lib/llm_docs_builder/config.rb
|
|
133
147
|
- lib/llm_docs_builder/errors.rb
|
|
134
148
|
- lib/llm_docs_builder/generator.rb
|
|
149
|
+
- lib/llm_docs_builder/helpers.rb
|
|
150
|
+
- lib/llm_docs_builder/helpers/prune_trailing_unsafe_link_separator.rb
|
|
151
|
+
- lib/llm_docs_builder/helpers/squeeze_blank_lines_outside_fences.rb
|
|
152
|
+
- lib/llm_docs_builder/html_detector.rb
|
|
153
|
+
- lib/llm_docs_builder/html_to_markdown/figure_code_block_renderer.rb
|
|
154
|
+
- lib/llm_docs_builder/html_to_markdown/table_markup_renderer.rb
|
|
155
|
+
- lib/llm_docs_builder/html_to_markdown_converter.rb
|
|
135
156
|
- lib/llm_docs_builder/markdown_transformer.rb
|
|
136
157
|
- lib/llm_docs_builder/output_formatter.rb
|
|
137
158
|
- lib/llm_docs_builder/parser.rb
|
|
@@ -143,6 +164,7 @@ files:
|
|
|
143
164
|
- lib/llm_docs_builder/transformers/heading_transformer.rb
|
|
144
165
|
- lib/llm_docs_builder/transformers/link_transformer.rb
|
|
145
166
|
- lib/llm_docs_builder/transformers/whitespace_transformer.rb
|
|
167
|
+
- lib/llm_docs_builder/url_fetcher.rb
|
|
146
168
|
- lib/llm_docs_builder/validator.rb
|
|
147
169
|
- lib/llm_docs_builder/version.rb
|
|
148
170
|
- llm-docs-builder.gemspec
|