llm-docs-builder 0.10.0 → 0.12.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/ci.yml +13 -0
- data/.github/workflows/docker.yml +2 -2
- data/.github/workflows/push.yml +2 -2
- data/.gitignore +8 -0
- data/CHANGELOG.md +13 -0
- data/Gemfile +4 -0
- data/Gemfile.lock +47 -18
- data/README.md +19 -0
- data/lib/llm_docs_builder/cli.rb +32 -10
- data/lib/llm_docs_builder/comparator.rb +5 -75
- data/lib/llm_docs_builder/config.rb +42 -2
- data/lib/llm_docs_builder/helpers/prune_trailing_unsafe_link_separator.rb +31 -0
- data/lib/llm_docs_builder/helpers/squeeze_blank_lines_outside_fences.rb +71 -0
- data/lib/llm_docs_builder/helpers.rb +9 -0
- data/lib/llm_docs_builder/html_detector.rb +159 -0
- data/lib/llm_docs_builder/html_to_markdown/figure_code_block_renderer.rb +181 -0
- data/lib/llm_docs_builder/html_to_markdown/table_markup_renderer.rb +597 -0
- data/lib/llm_docs_builder/html_to_markdown_converter.rb +792 -0
- data/lib/llm_docs_builder/markdown_transformer.rb +30 -5
- data/lib/llm_docs_builder/output_formatter.rb +1 -1
- data/lib/llm_docs_builder/transformers/base_transformer.rb +13 -1
- data/lib/llm_docs_builder/url_fetcher.rb +138 -0
- data/lib/llm_docs_builder/version.rb +1 -1
- data/lib/llm_docs_builder.rb +11 -0
- data/llm-docs-builder.gemspec +1 -0
- metadata +23 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: ac257dad79f49ed6993f784f8a28ee1e996e735fef4581449ad521ea9414a5d4
|
|
4
|
+
data.tar.gz: 29e1d2d578d57ea6f17aafca070c61b6161b6313d6614f0e4f798933ceae082d
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: f82216cca621e942c0e6ad3d92aba5d099159cc9c0d10c1d010a85e2a740511103cebd0198c0056195775064853e749472dcb7f0939b8d3fda7753d291a5b0da
|
|
7
|
+
data.tar.gz: 31aa5737e215439b11a2e79d793dabb9ff342206b660a2ecd846920bc2f6501c3d5910da4cdc52ecfcfa9f7b9acef14213b17936edd23d86808c0bcb2f391952
|
data/.github/workflows/ci.yml
CHANGED
|
@@ -54,6 +54,18 @@ jobs:
|
|
|
54
54
|
GITHUB_COVERAGE: ${{ matrix.coverage }}
|
|
55
55
|
run: bin/rspecs
|
|
56
56
|
|
|
57
|
+
yard-lint:
|
|
58
|
+
timeout-minutes: 5
|
|
59
|
+
runs-on: ubuntu-latest
|
|
60
|
+
steps:
|
|
61
|
+
- uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5
|
|
62
|
+
- name: Set up Ruby
|
|
63
|
+
uses: ruby/setup-ruby@v1
|
|
64
|
+
with:
|
|
65
|
+
ruby-version: '3.4.7'
|
|
66
|
+
bundler-cache: true
|
|
67
|
+
- name: Run yard-lint
|
|
68
|
+
run: bundle exec yard-lint lib/
|
|
57
69
|
|
|
58
70
|
ci-success:
|
|
59
71
|
name: CI Success
|
|
@@ -61,6 +73,7 @@ jobs:
|
|
|
61
73
|
if: always()
|
|
62
74
|
needs:
|
|
63
75
|
- specs
|
|
76
|
+
- yard-lint
|
|
64
77
|
steps:
|
|
65
78
|
- name: Check all jobs passed
|
|
66
79
|
if: |
|
|
data/.github/workflows/docker.yml
CHANGED
|
@@ -31,7 +31,7 @@ jobs:
|
|
|
31
31
|
|
|
32
32
|
- name: Docker meta
|
|
33
33
|
id: meta
|
|
34
|
-
uses: docker/metadata-action@
|
|
34
|
+
uses: docker/metadata-action@318604b99e75e41977312d83839a89be02ca4893 # v5
|
|
35
35
|
with:
|
|
36
36
|
images: |
|
|
37
37
|
mensfeld/llm-docs-builder
|
|
@@ -45,7 +45,7 @@ jobs:
|
|
|
45
45
|
type=raw,value=latest,enable={{is_default_branch}}
|
|
46
46
|
|
|
47
47
|
- name: Set up QEMU
|
|
48
|
-
uses: docker/setup-qemu-action@
|
|
48
|
+
uses: docker/setup-qemu-action@c7c53464625b32c7a7e944ae62b3e17d2b600130 # v3
|
|
49
49
|
|
|
50
50
|
- name: Set up Docker Buildx
|
|
51
51
|
uses: docker/setup-buildx-action@e468171a9de216ec08956ac3ada2f0791b6bd435 # v3
|
data/.github/workflows/push.yml
CHANGED
|
@@ -24,7 +24,7 @@ jobs:
|
|
|
24
24
|
fetch-depth: 0
|
|
25
25
|
|
|
26
26
|
- name: Set up Ruby
|
|
27
|
-
uses: ruby/setup-ruby@
|
|
27
|
+
uses: ruby/setup-ruby@d5126b9b3579e429dd52e51e68624dda2e05be25 # v1.267.0
|
|
28
28
|
with:
|
|
29
29
|
bundler-cache: false
|
|
30
30
|
|
|
@@ -32,4 +32,4 @@ jobs:
|
|
|
32
32
|
run: |
|
|
33
33
|
bundle install --jobs 4 --retry 3
|
|
34
34
|
|
|
35
|
-
- uses: rubygems/release-gem@
|
|
35
|
+
- uses: rubygems/release-gem@1c162a739e8b4cb21a676e97b087e8268d8fc40b # v1.1.2
|
data/.gitignore
CHANGED
|
@@ -10,6 +10,8 @@
|
|
|
10
10
|
/test/version_tmp/
|
|
11
11
|
/tmp/
|
|
12
12
|
mise.toml
|
|
13
|
+
.DS_Store
|
|
14
|
+
.vscode/launch.json
|
|
13
15
|
|
|
14
16
|
# Used by dotenv library to load environment variables.
|
|
15
17
|
.env
|
|
@@ -64,3 +66,9 @@ llms.txt
|
|
|
64
66
|
# Config files that might contain sensitive data
|
|
65
67
|
llms-txt.yml
|
|
66
68
|
.llms-txt.yml
|
|
69
|
+
|
|
70
|
+
# AI coding agent
|
|
71
|
+
AGENTS.md
|
|
72
|
+
CLAUDE.md
|
|
73
|
+
GEMINI.md
|
|
74
|
+
|
data/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,18 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## 0.12.0 (2025-11-12)
|
|
4
|
+
- [Feature] **HTML to Markdown Reverse Converter** — Added support for converting HTML content to markdown format.
|
|
5
|
+
- Enables processing of HTML documentation sources
|
|
6
|
+
- Integrates seamlessly with the transformer pipeline
|
|
7
|
+
- Useful for converting web-based docs to markdown for further processing
|
|
8
|
+
- By @Eric-Guo in PR #32.
|
|
9
|
+
|
|
10
|
+
## 0.11.0 (2025-11-03)
|
|
11
|
+
- [Feature] **Transform from URL** — The `transform` command now accepts a remote URL via `--url` and processes fetched content through the standard transformer pipeline.
|
|
12
|
+
- Example: `llm-docs-builder transform --url https://example.com/docs/page.html`
|
|
13
|
+
- Applies all configured transformations and output options identically to local files
|
|
14
|
+
- By @Eric-Guo and @codex in PR #28.
|
|
15
|
+
|
|
3
16
|
## 0.10.0 (2025-10-27)
|
|
4
17
|
- [Feature] **llms.txt Specification Compliance** - Updated output format to fully comply with the llms.txt specification from llmstxt.org.
|
|
5
18
|
- **Metadata Format**: Metadata now appears within the description field using parentheses and comma separators: `- [title](url): description (tokens:450, updated:2025-10-13, priority:high)`
|
data/Gemfile
CHANGED
data/Gemfile.lock
CHANGED
|
@@ -1,7 +1,8 @@
|
|
|
1
1
|
PATH
|
|
2
2
|
remote: .
|
|
3
3
|
specs:
|
|
4
|
-
llm-docs-builder (0.
|
|
4
|
+
llm-docs-builder (0.12.0)
|
|
5
|
+
nokogiri (~> 1.17)
|
|
5
6
|
zeitwerk (~> 2.6)
|
|
6
7
|
|
|
7
8
|
GEM
|
|
@@ -10,41 +11,62 @@ GEM
|
|
|
10
11
|
ast (2.4.3)
|
|
11
12
|
byebug (12.0.0)
|
|
12
13
|
coderay (1.1.3)
|
|
14
|
+
date (3.5.0)
|
|
13
15
|
diff-lcs (1.6.2)
|
|
14
16
|
docile (1.4.1)
|
|
15
|
-
|
|
17
|
+
erb (5.1.3)
|
|
18
|
+
io-console (0.8.1)
|
|
19
|
+
irb (1.15.3)
|
|
20
|
+
pp (>= 0.6.0)
|
|
21
|
+
rdoc (>= 4.0.0)
|
|
22
|
+
reline (>= 0.4.2)
|
|
23
|
+
json (2.16.0)
|
|
16
24
|
language_server-protocol (3.17.0.5)
|
|
17
25
|
lint_roller (1.1.0)
|
|
18
26
|
method_source (1.1.0)
|
|
27
|
+
nokogiri (1.18.10-x86_64-linux-gnu)
|
|
28
|
+
racc (~> 1.4)
|
|
19
29
|
parallel (1.27.0)
|
|
20
|
-
parser (3.3.
|
|
30
|
+
parser (3.3.10.0)
|
|
21
31
|
ast (~> 2.4.1)
|
|
22
32
|
racc
|
|
23
|
-
|
|
33
|
+
pp (0.6.3)
|
|
34
|
+
prettyprint
|
|
35
|
+
prettyprint (0.2.0)
|
|
36
|
+
prism (1.6.0)
|
|
24
37
|
pry (0.15.2)
|
|
25
38
|
coderay (~> 1.1)
|
|
26
39
|
method_source (~> 1.0)
|
|
27
40
|
pry-byebug (3.11.0)
|
|
28
41
|
byebug (~> 12.0)
|
|
29
42
|
pry (>= 0.13, < 0.16)
|
|
43
|
+
psych (5.2.6)
|
|
44
|
+
date
|
|
45
|
+
stringio
|
|
30
46
|
racc (1.8.1)
|
|
31
47
|
rainbow (3.1.1)
|
|
32
|
-
rake (13.3.
|
|
33
|
-
|
|
34
|
-
|
|
48
|
+
rake (13.3.1)
|
|
49
|
+
rdoc (6.15.1)
|
|
50
|
+
erb
|
|
51
|
+
psych (>= 4.0.0)
|
|
52
|
+
tsort
|
|
53
|
+
regexp_parser (2.11.3)
|
|
54
|
+
reline (0.6.3)
|
|
55
|
+
io-console (~> 0.5)
|
|
56
|
+
rspec (3.13.2)
|
|
35
57
|
rspec-core (~> 3.13.0)
|
|
36
58
|
rspec-expectations (~> 3.13.0)
|
|
37
59
|
rspec-mocks (~> 3.13.0)
|
|
38
|
-
rspec-core (3.13.
|
|
60
|
+
rspec-core (3.13.6)
|
|
39
61
|
rspec-support (~> 3.13.0)
|
|
40
62
|
rspec-expectations (3.13.5)
|
|
41
63
|
diff-lcs (>= 1.2.0, < 2.0)
|
|
42
64
|
rspec-support (~> 3.13.0)
|
|
43
|
-
rspec-mocks (3.13.
|
|
65
|
+
rspec-mocks (3.13.7)
|
|
44
66
|
diff-lcs (>= 1.2.0, < 2.0)
|
|
45
67
|
rspec-support (~> 3.13.0)
|
|
46
|
-
rspec-support (3.13.
|
|
47
|
-
rubocop (1.
|
|
68
|
+
rspec-support (3.13.6)
|
|
69
|
+
rubocop (1.81.7)
|
|
48
70
|
json (~> 2.3)
|
|
49
71
|
language_server-protocol (~> 3.17.0.2)
|
|
50
72
|
lint_roller (~> 1.1.0)
|
|
@@ -52,10 +74,10 @@ GEM
|
|
|
52
74
|
parser (>= 3.3.0.2)
|
|
53
75
|
rainbow (>= 2.2.2, < 4.0)
|
|
54
76
|
regexp_parser (>= 2.9.3, < 3.0)
|
|
55
|
-
rubocop-ast (>= 1.
|
|
77
|
+
rubocop-ast (>= 1.47.1, < 2.0)
|
|
56
78
|
ruby-progressbar (~> 1.7)
|
|
57
79
|
unicode-display_width (>= 2.4.0, < 4.0)
|
|
58
|
-
rubocop-ast (1.
|
|
80
|
+
rubocop-ast (1.47.1)
|
|
59
81
|
parser (>= 3.3.7.2)
|
|
60
82
|
prism (~> 1.4)
|
|
61
83
|
ruby-progressbar (1.13.0)
|
|
@@ -65,13 +87,19 @@ GEM
|
|
|
65
87
|
simplecov_json_formatter (~> 0.1)
|
|
66
88
|
simplecov-html (0.13.2)
|
|
67
89
|
simplecov_json_formatter (0.1.4)
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
unicode-
|
|
90
|
+
stringio (3.1.8)
|
|
91
|
+
tsort (0.2.0)
|
|
92
|
+
unicode-display_width (3.2.0)
|
|
93
|
+
unicode-emoji (~> 4.1)
|
|
94
|
+
unicode-emoji (4.1.0)
|
|
95
|
+
yard (0.9.37)
|
|
96
|
+
yard-lint (1.1.0)
|
|
97
|
+
irb
|
|
98
|
+
yard (~> 0.9)
|
|
99
|
+
zeitwerk (~> 2.6)
|
|
71
100
|
zeitwerk (2.7.3)
|
|
72
101
|
|
|
73
102
|
PLATFORMS
|
|
74
|
-
ruby
|
|
75
103
|
x86_64-linux
|
|
76
104
|
|
|
77
105
|
DEPENDENCIES
|
|
@@ -83,6 +111,7 @@ DEPENDENCIES
|
|
|
83
111
|
rspec (~> 3.0)
|
|
84
112
|
rubocop (~> 1.0)
|
|
85
113
|
simplecov (~> 0.21)
|
|
114
|
+
yard-lint
|
|
86
115
|
|
|
87
116
|
BUNDLED WITH
|
|
88
|
-
2.7.
|
|
117
|
+
2.7.2
|
data/README.md
CHANGED
|
@@ -61,10 +61,15 @@ Factor: 2.8x smaller
|
|
|
61
61
|
# Single file
|
|
62
62
|
llm-docs-builder transform --docs README.md
|
|
63
63
|
|
|
64
|
+
# Fetch and transform a remote page
|
|
65
|
+
llm-docs-builder transform --url https://yoursite.com/docs/page.html
|
|
66
|
+
|
|
64
67
|
# Bulk transform with config
|
|
65
68
|
llm-docs-builder bulk-transform --config llm-docs-builder.yml
|
|
66
69
|
```
|
|
67
70
|
|
|
71
|
+
**HTML to Markdown Conversion:** The transformer automatically detects and converts HTML content to clean markdown format. This works seamlessly with both local files and remote URLs, converting HTML tables, code blocks, and other elements into their markdown equivalents.
|
|
72
|
+
|
|
68
73
|
## Installation
|
|
69
74
|
|
|
70
75
|
### Docker (Recommended)
|
|
@@ -82,6 +87,20 @@ gem install llm-docs-builder
|
|
|
82
87
|
|
|
83
88
|
## Features
|
|
84
89
|
|
|
90
|
+
### Automatic HTML to Markdown Conversion
|
|
91
|
+
|
|
92
|
+
The tool automatically detects and converts HTML content to clean markdown:
|
|
93
|
+
- **HTML Tables** → Markdown tables
|
|
94
|
+
- **HTML Code Blocks** → Fenced code blocks
|
|
95
|
+
- **Figures & Captions** → Clean markdown equivalents
|
|
96
|
+
- **Seamless Integration** - Works with local files and remote URLs without special configuration
|
|
97
|
+
|
|
98
|
+
```bash
|
|
99
|
+
# Transform HTML content automatically
|
|
100
|
+
llm-docs-builder transform --docs page-with-html.md
|
|
101
|
+
llm-docs-builder transform --url https://site.com/docs/api.html
|
|
102
|
+
```
|
|
103
|
+
|
|
85
104
|
### Measure and Compare
|
|
86
105
|
|
|
87
106
|
```bash
|
data/lib/llm_docs_builder/cli.rb
CHANGED
|
@@ -68,8 +68,9 @@ module LlmDocsBuilder
|
|
|
68
68
|
# @param argv [Array<String>] command-line arguments
|
|
69
69
|
# @return [Hash] parsed options including :command, :config, :docs, :output, :verbose
|
|
70
70
|
def parse_options(argv)
|
|
71
|
+
command_token = argv.first
|
|
71
72
|
options = {
|
|
72
|
-
command:
|
|
73
|
+
command: command_token&.match?(/\A[a-z](?:[a-z-]*[a-z])?\z/) ? argv.shift : nil
|
|
73
74
|
}
|
|
74
75
|
|
|
75
76
|
OptionParser.new do |opts|
|
|
@@ -100,7 +101,7 @@ module LlmDocsBuilder
|
|
|
100
101
|
options[:output] = path
|
|
101
102
|
end
|
|
102
103
|
|
|
103
|
-
opts.on('-u', '--url URL', 'URL to fetch for comparison') do |url|
|
|
104
|
+
opts.on('-u', '--url URL', 'URL to fetch for transform or comparison') do |url|
|
|
104
105
|
options[:url] = url
|
|
105
106
|
end
|
|
106
107
|
|
|
@@ -185,21 +186,42 @@ module LlmDocsBuilder
|
|
|
185
186
|
config = LlmDocsBuilder::Config.new(options[:config])
|
|
186
187
|
merged_options = config.merge_with_options(options)
|
|
187
188
|
|
|
188
|
-
|
|
189
|
+
url = options[:url]
|
|
190
|
+
cli_file_path = options[:docs]
|
|
191
|
+
config_file_path = config['docs']
|
|
192
|
+
file_path = url ? cli_file_path : (cli_file_path || config_file_path)
|
|
189
193
|
|
|
190
|
-
|
|
191
|
-
puts '
|
|
194
|
+
if url && cli_file_path
|
|
195
|
+
puts 'Cannot use both --docs and --url for transform command'
|
|
192
196
|
exit 1
|
|
193
197
|
end
|
|
194
198
|
|
|
195
|
-
unless
|
|
196
|
-
|
|
197
|
-
|
|
199
|
+
unless file_path
|
|
200
|
+
unless url
|
|
201
|
+
puts 'File path required for transform command (use -d/--docs)'
|
|
202
|
+
exit 1
|
|
203
|
+
end
|
|
198
204
|
end
|
|
199
205
|
|
|
200
|
-
|
|
206
|
+
content =
|
|
207
|
+
if url
|
|
208
|
+
puts "Fetching #{url}..." if merged_options[:verbose]
|
|
209
|
+
fetcher = LlmDocsBuilder::UrlFetcher.new(verbose: merged_options[:verbose])
|
|
210
|
+
remote_content = fetcher.fetch(url)
|
|
211
|
+
puts "Transforming content from #{url}..." if merged_options[:verbose]
|
|
212
|
+
transform_options = merged_options.merge(content: remote_content, docs: nil, source_url: url)
|
|
213
|
+
LlmDocsBuilder.transform_markdown(nil, transform_options)
|
|
214
|
+
else
|
|
215
|
+
unless File.exist?(file_path)
|
|
216
|
+
puts "File not found: #{file_path}"
|
|
217
|
+
exit 1
|
|
218
|
+
end
|
|
201
219
|
|
|
202
|
-
|
|
220
|
+
puts "Transforming #{file_path}..." if merged_options[:verbose]
|
|
221
|
+
|
|
222
|
+
merged_options[:docs] = file_path
|
|
223
|
+
LlmDocsBuilder.transform_markdown(file_path, merged_options)
|
|
224
|
+
end
|
|
203
225
|
|
|
204
226
|
if merged_options[:output] && merged_options[:output] != 'llms.txt'
|
|
205
227
|
File.write(merged_options[:output], content)
|
|
data/lib/llm_docs_builder/comparator.rb
CHANGED
|
@@ -1,8 +1,5 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
-
require 'net/http'
|
|
4
|
-
require 'uri'
|
|
5
|
-
|
|
6
3
|
module LlmDocsBuilder
|
|
7
4
|
# Compares content sizes between human and AI versions
|
|
8
5
|
#
|
|
@@ -30,7 +27,7 @@ module LlmDocsBuilder
|
|
|
30
27
|
AI_USER_AGENT = 'Claude-Web/1.0 (Anthropic AI Assistant)'
|
|
31
28
|
|
|
32
29
|
# Maximum number of redirects to follow before raising an error
|
|
33
|
-
MAX_REDIRECTS =
|
|
30
|
+
MAX_REDIRECTS = UrlFetcher::MAX_REDIRECTS
|
|
34
31
|
|
|
35
32
|
# @return [String] URL to compare
|
|
36
33
|
attr_reader :url
|
|
@@ -133,78 +130,11 @@ module LlmDocsBuilder
|
|
|
133
130
|
# @return [String] response body
|
|
134
131
|
# @raise [Errors::GenerationError] if fetch fails or too many redirects
|
|
135
132
|
def fetch_url(url_string, user_agent, redirect_count = 0)
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
"Too many redirects (#{MAX_REDIRECTS}) when fetching #{url_string}"
|
|
140
|
-
)
|
|
141
|
-
end
|
|
142
|
-
|
|
143
|
-
uri = validate_and_parse_url(url_string)
|
|
144
|
-
|
|
145
|
-
http = Net::HTTP.new(uri.host, uri.port)
|
|
146
|
-
http.use_ssl = uri.scheme == 'https'
|
|
147
|
-
http.open_timeout = 10
|
|
148
|
-
http.read_timeout = 30
|
|
149
|
-
|
|
150
|
-
request = Net::HTTP::Get.new(uri.request_uri)
|
|
151
|
-
request['User-Agent'] = user_agent
|
|
152
|
-
|
|
153
|
-
response = http.request(request)
|
|
154
|
-
|
|
155
|
-
case response
|
|
156
|
-
when Net::HTTPSuccess
|
|
157
|
-
response.body
|
|
158
|
-
when Net::HTTPRedirection
|
|
159
|
-
# Follow redirect with incremented counter
|
|
160
|
-
redirect_url = response['location']
|
|
161
|
-
puts " Redirecting to #{redirect_url}..." if options[:verbose] && redirect_count.positive?
|
|
162
|
-
fetch_url(redirect_url, user_agent, redirect_count + 1)
|
|
163
|
-
else
|
|
164
|
-
raise(
|
|
165
|
-
Errors::GenerationError,
|
|
166
|
-
"Failed to fetch #{url_string}: #{response.code} #{response.message}"
|
|
167
|
-
)
|
|
168
|
-
end
|
|
169
|
-
rescue Errors::GenerationError
|
|
170
|
-
raise
|
|
171
|
-
rescue StandardError => e
|
|
172
|
-
raise(
|
|
173
|
-
Errors::GenerationError,
|
|
174
|
-
"Error fetching #{url_string}: #{e.message}"
|
|
175
|
-
)
|
|
176
|
-
end
|
|
177
|
-
|
|
178
|
-
# Validates and parses URL to prevent malformed URLs
|
|
179
|
-
#
|
|
180
|
-
# @param url_string [String] URL to validate and parse
|
|
181
|
-
# @return [URI::HTTP, URI::HTTPS] parsed URI
|
|
182
|
-
# @raise [Errors::GenerationError] if URL is invalid or uses unsupported scheme
|
|
183
|
-
def validate_and_parse_url(url_string)
|
|
184
|
-
uri = URI.parse(url_string)
|
|
185
|
-
|
|
186
|
-
# Only allow HTTP and HTTPS schemes
|
|
187
|
-
unless %w[http https].include?(uri.scheme&.downcase)
|
|
188
|
-
raise(
|
|
189
|
-
Errors::GenerationError,
|
|
190
|
-
"Unsupported URL scheme: #{uri.scheme || 'none'} (only http/https allowed)"
|
|
191
|
-
)
|
|
192
|
-
end
|
|
193
|
-
|
|
194
|
-
# Ensure host is present
|
|
195
|
-
if uri.host.nil? || uri.host.empty?
|
|
196
|
-
raise(
|
|
197
|
-
Errors::GenerationError,
|
|
198
|
-
"Invalid URL: missing host in #{url_string}"
|
|
199
|
-
)
|
|
200
|
-
end
|
|
201
|
-
|
|
202
|
-
uri
|
|
203
|
-
rescue URI::InvalidURIError => e
|
|
204
|
-
raise(
|
|
205
|
-
Errors::GenerationError,
|
|
206
|
-
"Invalid URL format: #{e.message}"
|
|
133
|
+
fetcher = UrlFetcher.new(
|
|
134
|
+
user_agent: user_agent,
|
|
135
|
+
verbose: options[:verbose]
|
|
207
136
|
)
|
|
137
|
+
fetcher.fetch(url_string, redirect_count)
|
|
208
138
|
end
|
|
209
139
|
|
|
210
140
|
# Calculate comparison statistics
|
|
data/lib/llm_docs_builder/config.rb
CHANGED
|
@@ -53,11 +53,48 @@ module LlmDocsBuilder
|
|
|
53
53
|
# defaults for any options not specified via CLI.
|
|
54
54
|
#
|
|
55
55
|
# @param options [Hash] CLI options hash
|
|
56
|
+
# @option options [String] :docs path to documentation directory or file
|
|
57
|
+
# @option options [String] :base_url base URL for expanding relative links
|
|
58
|
+
# @option options [String] :title project title
|
|
59
|
+
# @option options [String] :description project description
|
|
60
|
+
# @option options [String] :body additional body content
|
|
61
|
+
# @option options [String] :output output file path
|
|
62
|
+
# @option options [Boolean] :convert_urls convert HTML URLs to markdown format
|
|
63
|
+
# @option options [Boolean] :remove_comments remove HTML comments
|
|
64
|
+
# @option options [Boolean] :normalize_whitespace normalize whitespace
|
|
65
|
+
# @option options [Boolean] :remove_badges remove badge images
|
|
66
|
+
# @option options [Boolean] :remove_frontmatter remove YAML/TOML frontmatter
|
|
67
|
+
# @option options [Boolean] :verbose enable verbose output
|
|
68
|
+
# @option options [String] :suffix suffix for transformed files
|
|
69
|
+
# @option options [Array<String>] :excludes glob patterns for files to exclude
|
|
70
|
+
# @option options [Boolean] :bulk enable bulk transformation mode
|
|
71
|
+
# @option options [Boolean] :include_hidden include hidden files
|
|
72
|
+
# @option options [Boolean] :remove_code_examples remove code blocks
|
|
73
|
+
# @option options [Boolean] :remove_images remove image syntax
|
|
74
|
+
# @option options [Boolean] :simplify_links simplify link text
|
|
75
|
+
# @option options [Boolean] :remove_blockquotes remove blockquote formatting
|
|
76
|
+
# @option options [Boolean] :generate_toc generate table of contents
|
|
77
|
+
# @option options [String] :custom_instruction custom instruction text
|
|
78
|
+
# @option options [Boolean] :remove_stopwords remove common stopwords
|
|
79
|
+
# @option options [Boolean] :remove_duplicates remove duplicate paragraphs
|
|
80
|
+
# @option options [Boolean] :normalize_headings normalize heading hierarchy
|
|
81
|
+
# @option options [String] :heading_separator separator for heading paths
|
|
82
|
+
# @option options [Boolean] :include_metadata include metadata in output
|
|
83
|
+
# @option options [Boolean] :include_tokens include token counts
|
|
84
|
+
# @option options [Boolean] :include_timestamps include timestamps
|
|
85
|
+
# @option options [Boolean] :include_priority include priority metadata
|
|
86
|
+
# @option options [Boolean] :calculate_compression calculate compression ratios
|
|
87
|
+
# @option options [String] :content raw markdown content
|
|
88
|
+
# @option options [String] :source_url source URL for content
|
|
56
89
|
# @return [Hash] merged configuration with CLI overrides applied
|
|
57
90
|
def merge_with_options(options)
|
|
58
91
|
# CLI options override config file, config file provides defaults
|
|
59
92
|
{
|
|
60
|
-
docs: options
|
|
93
|
+
docs: if options.key?(:docs)
|
|
94
|
+
options[:docs]
|
|
95
|
+
else
|
|
96
|
+
self['docs'] || '.'
|
|
97
|
+
end,
|
|
61
98
|
base_url: options[:base_url] || self['base_url'],
|
|
62
99
|
title: options[:title] || self['title'],
|
|
63
100
|
description: options[:description] || self['description'],
|
|
@@ -171,7 +208,10 @@ module LlmDocsBuilder
|
|
|
171
208
|
else
|
|
172
209
|
self['calculate_compression'] || false
|
|
173
210
|
end
|
|
174
|
-
}
|
|
211
|
+
}.tap do |merged|
|
|
212
|
+
merged[:content] = options[:content] if options.key?(:content)
|
|
213
|
+
merged[:source_url] = options[:source_url] if options.key?(:source_url)
|
|
214
|
+
end
|
|
175
215
|
end
|
|
176
216
|
|
|
177
217
|
# Check if a config file was found and exists
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module LlmDocsBuilder
|
|
4
|
+
# Helper methods for content transformation
|
|
5
|
+
#
|
|
6
|
+
# @api private
|
|
7
|
+
module Helpers
|
|
8
|
+
# Removes trailing pipe characters and whitespace from array of string parts
|
|
9
|
+
#
|
|
10
|
+
# @param parts [Array<String>] array of string parts to process
|
|
11
|
+
# @return [void]
|
|
12
|
+
def prune_trailing_unsafe_link_separator!(parts)
|
|
13
|
+
while parts.any?
|
|
14
|
+
last = parts.last
|
|
15
|
+
new_last = last.sub(/[ \t]*\|\s*\z/, '')
|
|
16
|
+
|
|
17
|
+
if new_last != last
|
|
18
|
+
trimmed = new_last.rstrip
|
|
19
|
+
parts[-1] = trimmed
|
|
20
|
+
parts.pop if trimmed.empty?
|
|
21
|
+
elsif last.strip.empty?
|
|
22
|
+
parts.pop
|
|
23
|
+
else
|
|
24
|
+
break
|
|
25
|
+
end
|
|
26
|
+
end
|
|
27
|
+
end
|
|
28
|
+
|
|
29
|
+
module_function :prune_trailing_unsafe_link_separator!
|
|
30
|
+
end
|
|
31
|
+
end
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module LlmDocsBuilder
|
|
4
|
+
module Helpers
|
|
5
|
+
# Reduces consecutive blank lines outside of code fences
|
|
6
|
+
#
|
|
7
|
+
# @param text [String] input text to process
|
|
8
|
+
# @param max_blank [Integer] maximum number of consecutive blank lines to allow
|
|
9
|
+
# @param fence_chars [Array<String>] characters that can be used for code fences
|
|
10
|
+
# @param min_fence [Integer] minimum length of fence character sequence
|
|
11
|
+
# @return [String] processed text with squeezed blank lines
|
|
12
|
+
def squeeze_blank_lines_outside_fences(text, max_blank: 2, fence_chars: %w[` ~], min_fence: 3)
|
|
13
|
+
return '' if text.to_s.empty?
|
|
14
|
+
|
|
15
|
+
lines = text.split("\n", -1)
|
|
16
|
+
|
|
17
|
+
inside_fence = false
|
|
18
|
+
fence_indent = ''.dup
|
|
19
|
+
fence_char = nil
|
|
20
|
+
fence_len = 0
|
|
21
|
+
|
|
22
|
+
# Build a fast “does this look like an opening fence?” regex
|
|
23
|
+
# e.g., leading spaces + ``` or ~~~ (length >= min_fence) + optional info string
|
|
24
|
+
fence_set = Regexp.escape(fence_chars.join)
|
|
25
|
+
open_re = /\A(\s*)([#{fence_set}])\2{#{min_fence - 1},}.*\z/
|
|
26
|
+
|
|
27
|
+
out = []
|
|
28
|
+
blank_streak = 0
|
|
29
|
+
|
|
30
|
+
lines.each_with_index do |line, _idx|
|
|
31
|
+
if inside_fence
|
|
32
|
+
out << line
|
|
33
|
+
# Closing fence must match indent, char, and fence length
|
|
34
|
+
if line.match?(/\A#{Regexp.escape(fence_indent)}#{Regexp.escape(fence_char * fence_len)}\s*\z/)
|
|
35
|
+
inside_fence = false
|
|
36
|
+
fence_indent = ''.dup
|
|
37
|
+
fence_char = nil
|
|
38
|
+
fence_len = 0
|
|
39
|
+
end
|
|
40
|
+
next
|
|
41
|
+
end
|
|
42
|
+
|
|
43
|
+
if (m = line.match(open_re))
|
|
44
|
+
# Enter fenced block; compute the *actual* fence length from the line
|
|
45
|
+
fence_indent = m[1]
|
|
46
|
+
fence_char = m[2]
|
|
47
|
+
after_indent = line[fence_indent.length..]
|
|
48
|
+
fence_len = after_indent[/\A#{Regexp.escape(fence_char)}+/].length
|
|
49
|
+
inside_fence = true
|
|
50
|
+
blank_streak = 0
|
|
51
|
+
out << line
|
|
52
|
+
next
|
|
53
|
+
end
|
|
54
|
+
|
|
55
|
+
# Outside fences: squeeze blank lines
|
|
56
|
+
if line.strip.empty?
|
|
57
|
+
blank_streak += 1
|
|
58
|
+
# Keep at most max_blank blank lines; skip extras
|
|
59
|
+
out << line if blank_streak <= max_blank
|
|
60
|
+
else
|
|
61
|
+
blank_streak = 0
|
|
62
|
+
out << line
|
|
63
|
+
end
|
|
64
|
+
end
|
|
65
|
+
|
|
66
|
+
out.join("\n")
|
|
67
|
+
end
|
|
68
|
+
|
|
69
|
+
module_function :squeeze_blank_lines_outside_fences
|
|
70
|
+
end
|
|
71
|
+
end
|