llm-docs-builder 0.11.0 → 0.12.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/ci.yml +13 -0
- data/.github/workflows/docker.yml +2 -2
- data/.github/workflows/push.yml +2 -2
- data/.gitignore +8 -0
- data/CHANGELOG.md +7 -0
- data/Gemfile +4 -0
- data/Gemfile.lock +34 -5
- data/README.md +16 -0
- data/lib/llm_docs_builder/config.rb +33 -0
- data/lib/llm_docs_builder/helpers/prune_trailing_unsafe_link_separator.rb +31 -0
- data/lib/llm_docs_builder/helpers/squeeze_blank_lines_outside_fences.rb +71 -0
- data/lib/llm_docs_builder/helpers.rb +9 -0
- data/lib/llm_docs_builder/html_detector.rb +159 -0
- data/lib/llm_docs_builder/html_to_markdown/figure_code_block_renderer.rb +181 -0
- data/lib/llm_docs_builder/html_to_markdown/table_markup_renderer.rb +597 -0
- data/lib/llm_docs_builder/html_to_markdown_converter.rb +792 -0
- data/lib/llm_docs_builder/markdown_transformer.rb +23 -9
- data/lib/llm_docs_builder/output_formatter.rb +1 -1
- data/lib/llm_docs_builder/transformers/base_transformer.rb +13 -1
- data/lib/llm_docs_builder/url_fetcher.rb +18 -0
- data/lib/llm_docs_builder/version.rb +1 -1
- data/lib/llm_docs_builder.rb +10 -0
- data/llm-docs-builder.gemspec +1 -0
- metadata +22 -2
- data/AGENTS.md +0 -20
|
@@ -63,9 +63,7 @@ module LlmDocsBuilder
|
|
|
63
63
|
content = heading_transformer.transform(content, options)
|
|
64
64
|
content = compress_content(content) if should_compress?
|
|
65
65
|
content = enhancement_transformer.transform(content, options)
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
content
|
|
66
|
+
whitespace_transformer.transform(content, options)
|
|
69
67
|
end
|
|
70
68
|
|
|
71
69
|
private
|
|
@@ -114,7 +112,7 @@ module LlmDocsBuilder
|
|
|
114
112
|
|
|
115
113
|
# Compress content using TextCompressor
|
|
116
114
|
#
|
|
117
|
-
# @param content [String] content
|
|
115
|
+
# @param content [String] markdown content
|
|
118
116
|
# @return [String] compressed content
|
|
119
117
|
def compress_content(content)
|
|
120
118
|
compressor = TextCompressor.new
|
|
@@ -129,11 +127,27 @@ module LlmDocsBuilder
|
|
|
129
127
|
#
|
|
130
128
|
# @return [String] markdown content to transform
|
|
131
129
|
def load_content
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
130
|
+
content = options[:content] ? options[:content].dup : File.read(file_path)
|
|
131
|
+
snippet = html_detector.detection_snippet(content)
|
|
132
|
+
|
|
133
|
+
return content if html_detector.table_fragment?(snippet)
|
|
134
|
+
return html_to_markdown_converter.convert(content) if html_detector.html_content?(content, snippet)
|
|
135
|
+
|
|
136
|
+
content
|
|
137
|
+
end
|
|
138
|
+
|
|
139
|
+
# Memoized HTML to markdown converter
|
|
140
|
+
#
|
|
141
|
+
# @return [HtmlToMarkdownConverter]
|
|
142
|
+
def html_to_markdown_converter
|
|
143
|
+
@html_to_markdown_converter ||= HtmlToMarkdownConverter.new
|
|
144
|
+
end
|
|
145
|
+
|
|
146
|
+
# Memoized HTML detector
|
|
147
|
+
#
|
|
148
|
+
# @return [HtmlDetector]
|
|
149
|
+
def html_detector
|
|
150
|
+
@html_detector ||= HtmlDetector.new
|
|
137
151
|
end
|
|
138
152
|
end
|
|
139
153
|
end
|
|
@@ -1,6 +1,13 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
module LlmDocsBuilder
|
|
4
|
+
# Provides content transformation functionality
|
|
5
|
+
#
|
|
6
|
+
# This module contains specialized transformers for modifying markdown content,
|
|
7
|
+
# including cleanup operations, link processing, heading normalization, and
|
|
8
|
+
# content enhancement for AI consumption.
|
|
9
|
+
#
|
|
10
|
+
# @api private
|
|
4
11
|
module Transformers
|
|
5
12
|
# Base module for all transformers
|
|
6
13
|
#
|
|
@@ -11,9 +18,12 @@ module LlmDocsBuilder
|
|
|
11
18
|
module BaseTransformer
|
|
12
19
|
# Transform content
|
|
13
20
|
#
|
|
14
|
-
# @
|
|
21
|
+
# @abstract Subclasses must implement this method and document specific options
|
|
22
|
+
# @param content [String] markdown content
|
|
15
23
|
# @param options [Hash] transformation options
|
|
24
|
+
# @option options [Object] :* options vary by implementation - see specific transformer classes
|
|
16
25
|
# @return [String] transformed content
|
|
26
|
+
# @note Options vary by implementation - see specific transformer classes for supported keys
|
|
17
27
|
def transform(content, options = {})
|
|
18
28
|
raise NotImplementedError, "#{self.class} must implement #transform"
|
|
19
29
|
end
|
|
@@ -21,7 +31,9 @@ module LlmDocsBuilder
|
|
|
21
31
|
# Check if transformation should be applied
|
|
22
32
|
#
|
|
23
33
|
# @param options [Hash] transformation options
|
|
34
|
+
# @option options [Object] :* options vary by implementation - see specific transformer classes
|
|
24
35
|
# @return [Boolean] true if transformation should be applied
|
|
36
|
+
# @note Options vary by implementation - see specific transformer classes for supported keys
|
|
25
37
|
def should_transform?(options)
|
|
26
38
|
true
|
|
27
39
|
end
|
|
@@ -9,7 +9,10 @@ module LlmDocsBuilder
|
|
|
9
9
|
# Provides common functionality needed by multiple commands (transform, compare)
|
|
10
10
|
# including strict scheme validation, redirect handling and sensible timeouts.
|
|
11
11
|
class UrlFetcher
|
|
12
|
+
# Default user agent string for HTTP requests
|
|
12
13
|
DEFAULT_USER_AGENT = 'llm-docs-builder/1.0 (+https://github.com/mensfeld/llm-docs-builder)'
|
|
14
|
+
|
|
15
|
+
# Maximum number of redirects to follow
|
|
13
16
|
MAX_REDIRECTS = 10
|
|
14
17
|
|
|
15
18
|
# @param user_agent [String] HTTP user agent header value
|
|
@@ -71,6 +74,11 @@ module LlmDocsBuilder
|
|
|
71
74
|
|
|
72
75
|
private
|
|
73
76
|
|
|
77
|
+
# Validate and parse URL string
|
|
78
|
+
#
|
|
79
|
+
# @param url_string [String] URL to validate
|
|
80
|
+
# @return [URI::HTTP, URI::HTTPS] parsed URI
|
|
81
|
+
# @raise [Errors::GenerationError] if URL is invalid or unsupported
|
|
74
82
|
def validate_and_parse_url(url_string)
|
|
75
83
|
uri = URI.parse(url_string)
|
|
76
84
|
|
|
@@ -96,6 +104,12 @@ module LlmDocsBuilder
|
|
|
96
104
|
)
|
|
97
105
|
end
|
|
98
106
|
|
|
107
|
+
# Convert redirect location to absolute URL
|
|
108
|
+
#
|
|
109
|
+
# @param base_uri [URI] base URI
|
|
110
|
+
# @param location [String] redirect location
|
|
111
|
+
# @return [String] absolute redirect URL
|
|
112
|
+
# @raise [Errors::GenerationError] if location is invalid
|
|
99
113
|
def absolute_redirect_url(base_uri, location)
|
|
100
114
|
raise(
|
|
101
115
|
Errors::GenerationError,
|
|
@@ -110,6 +124,10 @@ module LlmDocsBuilder
|
|
|
110
124
|
)
|
|
111
125
|
end
|
|
112
126
|
|
|
127
|
+
# Log redirect if verbose mode enabled
|
|
128
|
+
#
|
|
129
|
+
# @param url [String] redirect URL
|
|
130
|
+
# @return [void]
|
|
113
131
|
def log_redirect(url)
|
|
114
132
|
return unless @verbose
|
|
115
133
|
|
data/lib/llm_docs_builder.rb
CHANGED
|
@@ -4,10 +4,20 @@ require 'zeitwerk'
|
|
|
4
4
|
require 'pathname'
|
|
5
5
|
require 'find'
|
|
6
6
|
|
|
7
|
+
autoload(:Nokogiri, 'nokogiri')
|
|
8
|
+
autoload(:CGI, 'cgi')
|
|
9
|
+
|
|
7
10
|
loader = Zeitwerk::Loader.for_gem
|
|
8
11
|
loader.inflector.inflect('cli' => 'CLI')
|
|
9
12
|
loader.setup
|
|
10
13
|
|
|
14
|
+
# Build and optimize documentation for LLMs
|
|
15
|
+
#
|
|
16
|
+
# This gem provides tools for generating llms.txt files and transforming markdown
|
|
17
|
+
# documentation to be AI-friendly. It can reduce token consumption by 67-95% while
|
|
18
|
+
# preserving essential documentation content.
|
|
19
|
+
#
|
|
20
|
+
# @api public
|
|
11
21
|
module LlmDocsBuilder
|
|
12
22
|
class << self
|
|
13
23
|
# Generates llms.txt from existing markdown documentation
|
data/llm-docs-builder.gemspec
CHANGED
|
@@ -35,6 +35,7 @@ Gem::Specification.new do |spec|
|
|
|
35
35
|
spec.executables = spec.files.grep(%r{\Abin/}) { |f| File.basename(f) }
|
|
36
36
|
spec.require_paths = ['lib']
|
|
37
37
|
|
|
38
|
+
spec.add_dependency 'nokogiri', '~> 1.17'
|
|
38
39
|
spec.add_dependency 'zeitwerk', '~> 2.6'
|
|
39
40
|
|
|
40
41
|
spec.add_development_dependency 'bundler', '~> 2.0'
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: llm-docs-builder
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.11.0
|
|
4
|
+
version: 0.12.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Maciej Mensfeld
|
|
@@ -9,6 +9,20 @@ bindir: bin
|
|
|
9
9
|
cert_chain: []
|
|
10
10
|
date: 1980-01-02 00:00:00.000000000 Z
|
|
11
11
|
dependencies:
|
|
12
|
+
- !ruby/object:Gem::Dependency
|
|
13
|
+
name: nokogiri
|
|
14
|
+
requirement: !ruby/object:Gem::Requirement
|
|
15
|
+
requirements:
|
|
16
|
+
- - "~>"
|
|
17
|
+
- !ruby/object:Gem::Version
|
|
18
|
+
version: '1.17'
|
|
19
|
+
type: :runtime
|
|
20
|
+
prerelease: false
|
|
21
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
22
|
+
requirements:
|
|
23
|
+
- - "~>"
|
|
24
|
+
- !ruby/object:Gem::Version
|
|
25
|
+
version: '1.17'
|
|
12
26
|
- !ruby/object:Gem::Dependency
|
|
13
27
|
name: zeitwerk
|
|
14
28
|
requirement: !ruby/object:Gem::Requirement
|
|
@@ -116,7 +130,6 @@ files:
|
|
|
116
130
|
- ".rspec"
|
|
117
131
|
- ".rubocop.yml"
|
|
118
132
|
- ".ruby-version"
|
|
119
|
-
- AGENTS.md
|
|
120
133
|
- CHANGELOG.md
|
|
121
134
|
- Dockerfile
|
|
122
135
|
- Gemfile
|
|
@@ -133,6 +146,13 @@ files:
|
|
|
133
146
|
- lib/llm_docs_builder/config.rb
|
|
134
147
|
- lib/llm_docs_builder/errors.rb
|
|
135
148
|
- lib/llm_docs_builder/generator.rb
|
|
149
|
+
- lib/llm_docs_builder/helpers.rb
|
|
150
|
+
- lib/llm_docs_builder/helpers/prune_trailing_unsafe_link_separator.rb
|
|
151
|
+
- lib/llm_docs_builder/helpers/squeeze_blank_lines_outside_fences.rb
|
|
152
|
+
- lib/llm_docs_builder/html_detector.rb
|
|
153
|
+
- lib/llm_docs_builder/html_to_markdown/figure_code_block_renderer.rb
|
|
154
|
+
- lib/llm_docs_builder/html_to_markdown/table_markup_renderer.rb
|
|
155
|
+
- lib/llm_docs_builder/html_to_markdown_converter.rb
|
|
136
156
|
- lib/llm_docs_builder/markdown_transformer.rb
|
|
137
157
|
- lib/llm_docs_builder/output_formatter.rb
|
|
138
158
|
- lib/llm_docs_builder/parser.rb
|
data/AGENTS.md
DELETED
|
@@ -1,20 +0,0 @@
|
|
|
1
|
-
# Repository Guidelines
|
|
2
|
-
|
|
3
|
-
## Project Structure & Module Organization
|
|
4
|
-
Core gem code lives in `lib/llm_docs_builder`, with single-responsibility modules such as `generator.rb`, `validator.rb`, and the CLI glue in `cli.rb`. Shared entrypoint `lib/llm_docs_builder.rb` wires dependencies. Executables reside in `bin/`: `llm-docs-builder` boots the CLI, while `rspecs` runs the full test matrix. Specs mirror library files under `spec/` with command-level coverage in `spec/integrations`. Static assets (logos, diff screenshots) are in `misc/`. Example configuration templates live at `llm-docs-builder.yml.example`.
|
|
5
|
-
|
|
6
|
-
## Build, Test, and Development Commands
|
|
7
|
-
- `bundle install` — sync gem dependencies defined in `Gemfile`.
|
|
8
|
-
- `bundle exec rake` — default task; runs RSpec and RuboCop together.
|
|
9
|
-
- `bundle exec rspec` or `bin/rspecs` — execute unit and integration specs with doc formatter.
|
|
10
|
-
- `bundle exec rubocop` — enforce the Ruby style guide; mirrors CI.
|
|
11
|
-
- `bin/llm-docs-builder transform --docs README.md` — smoke-test the CLI against a local file.
|
|
12
|
-
|
|
13
|
-
## Coding Style & Naming Conventions
|
|
14
|
-
Target Ruby 3.2 with two-space indentation and trailing newline. Prefer single-quoted strings; enable `# frozen_string_literal: true` headers on Ruby files. Keep lines ≤120 characters except where the RuboCop config allows. Use descriptive module/class names (e.g., `LlmDocsBuilder::Generator`) and predicate methods ending with `?` when returning booleans. Place supporting fixtures in `spec/support` if added, and name files after the class they extend.
|
|
15
|
-
|
|
16
|
-
## Testing Guidelines
|
|
17
|
-
RSpec is the sole testing framework. Name files `*_spec.rb` and align describe blocks with constant paths. Integration scenarios belong in `spec/integrations` to capture CLI behaviors. SimpleCov is enabled by default for line and branch coverage; export `SIMPLECOV=false` for quick local runs. Persist example statuses with the automatically managed `spec/examples.txt`.
|
|
18
|
-
|
|
19
|
-
## Commit & Pull Request Guidelines
|
|
20
|
-
Keep commit subjects short, present-tense, and focused (e.g., `Align CLI config (#27)`). Group related changes together so `git log` remains readable. Pull requests should describe motivation, summarize behavioral impact, link related issues or discussions, and include CLI output or screenshots when touching generated docs. Ensure CI passes (`bundle exec rake`) before requesting review, and note any follow-up work in the PR description.
|