llmstxt-standalone 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- llmstxt_standalone-0.1.0/PKG-INFO +179 -0
- llmstxt_standalone-0.1.0/README.md +148 -0
- llmstxt_standalone-0.1.0/pyproject.toml +61 -0
- llmstxt_standalone-0.1.0/src/llmstxt_standalone/__init__.py +5 -0
- llmstxt_standalone-0.1.0/src/llmstxt_standalone/__main__.py +6 -0
- llmstxt_standalone-0.1.0/src/llmstxt_standalone/cli.py +172 -0
- llmstxt_standalone-0.1.0/src/llmstxt_standalone/config/__init__.py +6 -0
- llmstxt_standalone-0.1.0/src/llmstxt_standalone/config/derive.py +39 -0
- llmstxt_standalone-0.1.0/src/llmstxt_standalone/config/load.py +93 -0
- llmstxt_standalone-0.1.0/src/llmstxt_standalone/config/model.py +60 -0
- llmstxt_standalone-0.1.0/src/llmstxt_standalone/config/plugin.py +42 -0
- llmstxt_standalone-0.1.0/src/llmstxt_standalone/convert.py +152 -0
- llmstxt_standalone-0.1.0/src/llmstxt_standalone/generate.py +343 -0
- llmstxt_standalone-0.1.0/src/llmstxt_standalone/py.typed +0 -0
|
@@ -0,0 +1,179 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: llmstxt-standalone
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Generate llms.txt from built HTML documentation
|
|
5
|
+
Keywords: llms,documentation,markdown,mkdocs
|
|
6
|
+
Author: Shaan Majid
|
|
7
|
+
Author-email: Shaan Majid <shaanmajid64@gmail.com>
|
|
8
|
+
License: MIT
|
|
9
|
+
Classifier: Development Status :: 5 - Production/Stable
|
|
10
|
+
Classifier: Environment :: Console
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
19
|
+
Classifier: Topic :: Documentation
|
|
20
|
+
Classifier: Typing :: Typed
|
|
21
|
+
Requires-Dist: typer>=0.9.0
|
|
22
|
+
Requires-Dist: pyyaml>=6.0
|
|
23
|
+
Requires-Dist: beautifulsoup4>=4.12
|
|
24
|
+
Requires-Dist: markdownify>=0.14,<2.0
|
|
25
|
+
Requires-Dist: mdformat>=0.7,<2.0
|
|
26
|
+
Requires-Dist: mdformat-tables>=1.0
|
|
27
|
+
Requires-Python: >=3.10
|
|
28
|
+
Project-URL: Repository, https://github.com/shaanmajid/llmstxt-standalone
|
|
29
|
+
Project-URL: Issues, https://github.com/shaanmajid/llmstxt-standalone/issues
|
|
30
|
+
Description-Content-Type: text/markdown
|
|
31
|
+
|
|
32
|
+
# llmstxt-standalone
|
|
33
|
+
|
|
34
|
+
[CI](https://github.com/shaanmajid/llmstxt-standalone/actions/workflows/ci.yml)
|
|
35
|
+
[PyPI](https://pypi.org/project/llmstxt-standalone/)
|
|
36
|
+
[Python versions](https://pypi.org/project/llmstxt-standalone/)
|
|
37
|
+
[License](https://github.com/shaanmajid/llmstxt-standalone/blob/main/LICENSE)
|
|
38
|
+
[Coverage](https://codecov.io/gh/shaanmajid/llmstxt-standalone)
|
|
39
|
+
[prek](https://github.com/j178/prek)
|
|
40
|
+
|
|
41
|
+
Generate `/llms.txt`, `/llms-full.txt`, and per-page markdown files from built HTML documentation, following the [llms.txt spec](https://llmstxt.org/).
|
|
42
|
+
|
|
43
|
+
This tool works on pre-built HTML, making it useful for environments that cannot run MkDocs plugins (e.g., [Zensical](https://zensical.com/)) or when you want llms.txt generation as a separate build step. For standard MkDocs workflows, see [mkdocs-llmstxt](https://github.com/pawamoy/mkdocs-llmstxt).
|
|
44
|
+
|
|
45
|
+
## Installation
|
|
46
|
+
|
|
47
|
+
Requires Python 3.10+.
|
|
48
|
+
|
|
49
|
+
```bash
|
|
50
|
+
# Run without installing
|
|
51
|
+
uvx llmstxt-standalone
|
|
52
|
+
|
|
53
|
+
# Install as a CLI tool
|
|
54
|
+
uv tool install llmstxt-standalone # or: pipx install
|
|
55
|
+
|
|
56
|
+
# Add to a project
|
|
57
|
+
uv add llmstxt-standalone # or: pip install
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
## Usage
|
|
61
|
+
|
|
62
|
+
```bash
|
|
63
|
+
# Run from project root (expects mkdocs.yml and site/)
|
|
64
|
+
llmstxt-standalone
|
|
65
|
+
|
|
66
|
+
# Explicit paths
|
|
67
|
+
llmstxt-standalone --config mkdocs.yml --site-dir ./build --output-dir ./dist
|
|
68
|
+
|
|
69
|
+
# Preview without writing files
|
|
70
|
+
llmstxt-standalone --dry-run
|
|
71
|
+
|
|
72
|
+
# Suppress output
|
|
73
|
+
llmstxt-standalone --quiet
|
|
74
|
+
|
|
75
|
+
# Show detailed progress
|
|
76
|
+
llmstxt-standalone --verbose
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
| Option | Short | Default | Description |
|
|
80
|
+
|--------|-------|---------|-------------|
|
|
81
|
+
| `--config` | `-c` | `mkdocs.yml` | Path to MkDocs config file |
|
|
82
|
+
| `--site-dir` | `-s` | `site` | Path to built HTML directory |
|
|
83
|
+
| `--output-dir` | `-o` | same as site-dir | Where to write output files |
|
|
84
|
+
| `--dry-run` | `-n` | | Preview without writing |
|
|
85
|
+
| `--quiet` | `-q` | | Suppress output |
|
|
86
|
+
| `--verbose` | `-v` | | Show detailed progress |
|
|
87
|
+
| `--version` | `-V` | | Show version |
|
|
88
|
+
|
|
89
|
+
## Output
|
|
90
|
+
|
|
91
|
+
The tool generates three outputs:
|
|
92
|
+
|
|
93
|
+
1. `llms.txt` — an index file with markdown links to all pages
|
|
94
|
+
1. `llms-full.txt` — concatenated content of all pages
|
|
95
|
+
1. Per-page `.md` files alongside the HTML
|
|
96
|
+
|
|
97
|
+
The per-page markdown files make the URLs in `llms.txt` resolve to actual content. If your site is at `https://docs.example.com/`, the URL `https://docs.example.com/install/index.md` returns markdown instead of HTML.
|
|
98
|
+
|
|
99
|
+
## Configuration
|
|
100
|
+
|
|
101
|
+
The tool reads your `mkdocs.yml` for site metadata. You can configure llmstxt output explicitly or let it derive structure from your nav.
|
|
102
|
+
|
|
103
|
+
### Explicit configuration
|
|
104
|
+
|
|
105
|
+
```yaml
|
|
106
|
+
plugins:
|
|
107
|
+
- llmstxt:
|
|
108
|
+
markdown_description: |
|
|
109
|
+
Extra context for LLMs about your project.
|
|
110
|
+
full_output: llms-full.txt
|
|
111
|
+
content_selector: article.md-content__inner
|
|
112
|
+
sections:
|
|
113
|
+
Getting Started:
|
|
114
|
+
- index.md
|
|
115
|
+
- install.md
|
|
116
|
+
Usage:
|
|
117
|
+
- guide/basics.md
|
|
118
|
+
- guide/advanced.md
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
| Option | Default | Description |
|
|
122
|
+
|--------|---------|-------------|
|
|
123
|
+
| `markdown_description` | `""` | Additional context for LLMs, appears after site description |
|
|
124
|
+
| `full_output` | `llms-full.txt` | Filename for concatenated content |
|
|
125
|
+
| `content_selector` | auto-detect | CSS selector for main content |
|
|
126
|
+
| `sections` | derived from nav | Section names mapped to page lists |
|
|
127
|
+
|
|
128
|
+
### Automatic fallback
|
|
129
|
+
|
|
130
|
+
Without an explicit `llmstxt` plugin config, sections derive from your `nav` structure. Top-level pages go into a "Pages" section; nested nav items become sections named by their keys.
|
|
131
|
+
|
|
132
|
+
### MkDocs settings
|
|
133
|
+
|
|
134
|
+
The tool respects `use_directory_urls` from your mkdocs.yml. When enabled (the default), `install.md` maps to `install/index.md`; when disabled, it maps to `install.md`.
|
|
135
|
+
|
|
136
|
+
### Content extraction
|
|
137
|
+
|
|
138
|
+
If `content_selector` is not set, the tool tries these selectors in order:
|
|
139
|
+
|
|
140
|
+
1. `.md-content__inner` (Material for MkDocs)
|
|
141
|
+
1. `[role="main"]` (default MkDocs theme)
|
|
142
|
+
1. `article`
|
|
143
|
+
1. `main`
|
|
144
|
+
1. The entire document
|
|
145
|
+
|
|
146
|
+
### Title resolution
|
|
147
|
+
|
|
148
|
+
Page titles resolve in this order:
|
|
149
|
+
|
|
150
|
+
1. The title from your `nav` structure
|
|
151
|
+
1. The HTML `<title>` tag (with site name suffix stripped)
|
|
152
|
+
1. The first `<h1>` tag
|
|
153
|
+
1. A title derived from the filename
|
|
154
|
+
|
|
155
|
+
## Programmatic use
|
|
156
|
+
|
|
157
|
+
```python
|
|
158
|
+
from pathlib import Path
|
|
159
|
+
from llmstxt_standalone.config import load_config
|
|
160
|
+
from llmstxt_standalone.generate import generate_llms_txt
|
|
161
|
+
|
|
162
|
+
config = load_config(Path("mkdocs.yml"))
|
|
163
|
+
result = generate_llms_txt(config, site_dir=Path("site"))
|
|
164
|
+
|
|
165
|
+
print(result.llms_txt) # Index content
|
|
166
|
+
print(result.llms_full_txt) # Full content
|
|
167
|
+
print(result.markdown_files) # List of written .md paths
|
|
168
|
+
```
|
|
169
|
+
|
|
170
|
+
## Compatibility
|
|
171
|
+
|
|
172
|
+
- Produces output identical to mkdocs-llmstxt when configured the same way
|
|
173
|
+
- Handles Unicode, international characters, and special characters
|
|
174
|
+
- Works with Material for MkDocs, ReadTheDocs, and the default MkDocs theme
|
|
175
|
+
- Parses configs containing Python YAML tags like `!python/object/apply`
|
|
176
|
+
|
|
177
|
+
## License
|
|
178
|
+
|
|
179
|
+
MIT
|
|
@@ -0,0 +1,148 @@
|
|
|
1
|
+
# llmstxt-standalone
|
|
2
|
+
|
|
3
|
+
[CI](https://github.com/shaanmajid/llmstxt-standalone/actions/workflows/ci.yml)
|
|
4
|
+
[PyPI](https://pypi.org/project/llmstxt-standalone/)
|
|
5
|
+
[Python versions](https://pypi.org/project/llmstxt-standalone/)
|
|
6
|
+
[License](https://github.com/shaanmajid/llmstxt-standalone/blob/main/LICENSE)
|
|
7
|
+
[Coverage](https://codecov.io/gh/shaanmajid/llmstxt-standalone)
|
|
8
|
+
[prek](https://github.com/j178/prek)
|
|
9
|
+
|
|
10
|
+
Generate `/llms.txt`, `/llms-full.txt`, and per-page markdown files from built HTML documentation, following the [llms.txt spec](https://llmstxt.org/).
|
|
11
|
+
|
|
12
|
+
This tool works on pre-built HTML, making it useful for environments that cannot run MkDocs plugins (e.g., [Zensical](https://zensical.com/)) or when you want llms.txt generation as a separate build step. For standard MkDocs workflows, see [mkdocs-llmstxt](https://github.com/pawamoy/mkdocs-llmstxt).
|
|
13
|
+
|
|
14
|
+
## Installation
|
|
15
|
+
|
|
16
|
+
Requires Python 3.10+.
|
|
17
|
+
|
|
18
|
+
```bash
|
|
19
|
+
# Run without installing
|
|
20
|
+
uvx llmstxt-standalone
|
|
21
|
+
|
|
22
|
+
# Install as a CLI tool
|
|
23
|
+
uv tool install llmstxt-standalone # or: pipx install
|
|
24
|
+
|
|
25
|
+
# Add to a project
|
|
26
|
+
uv add llmstxt-standalone # or: pip install
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
## Usage
|
|
30
|
+
|
|
31
|
+
```bash
|
|
32
|
+
# Run from project root (expects mkdocs.yml and site/)
|
|
33
|
+
llmstxt-standalone
|
|
34
|
+
|
|
35
|
+
# Explicit paths
|
|
36
|
+
llmstxt-standalone --config mkdocs.yml --site-dir ./build --output-dir ./dist
|
|
37
|
+
|
|
38
|
+
# Preview without writing files
|
|
39
|
+
llmstxt-standalone --dry-run
|
|
40
|
+
|
|
41
|
+
# Suppress output
|
|
42
|
+
llmstxt-standalone --quiet
|
|
43
|
+
|
|
44
|
+
# Show detailed progress
|
|
45
|
+
llmstxt-standalone --verbose
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
| Option | Short | Default | Description |
|
|
49
|
+
|--------|-------|---------|-------------|
|
|
50
|
+
| `--config` | `-c` | `mkdocs.yml` | Path to MkDocs config file |
|
|
51
|
+
| `--site-dir` | `-s` | `site` | Path to built HTML directory |
|
|
52
|
+
| `--output-dir` | `-o` | same as site-dir | Where to write output files |
|
|
53
|
+
| `--dry-run` | `-n` | | Preview without writing |
|
|
54
|
+
| `--quiet` | `-q` | | Suppress output |
|
|
55
|
+
| `--verbose` | `-v` | | Show detailed progress |
|
|
56
|
+
| `--version` | `-V` | | Show version |
|
|
57
|
+
|
|
58
|
+
## Output
|
|
59
|
+
|
|
60
|
+
The tool generates three outputs:
|
|
61
|
+
|
|
62
|
+
1. `llms.txt` — an index file with markdown links to all pages
|
|
63
|
+
1. `llms-full.txt` — concatenated content of all pages
|
|
64
|
+
1. Per-page `.md` files alongside the HTML
|
|
65
|
+
|
|
66
|
+
The per-page markdown files make the URLs in `llms.txt` resolve to actual content. If your site is at `https://docs.example.com/`, the URL `https://docs.example.com/install/index.md` returns markdown instead of HTML.
|
|
67
|
+
|
|
68
|
+
## Configuration
|
|
69
|
+
|
|
70
|
+
The tool reads your `mkdocs.yml` for site metadata. You can configure llmstxt output explicitly or let it derive structure from your nav.
|
|
71
|
+
|
|
72
|
+
### Explicit configuration
|
|
73
|
+
|
|
74
|
+
```yaml
|
|
75
|
+
plugins:
|
|
76
|
+
- llmstxt:
|
|
77
|
+
markdown_description: |
|
|
78
|
+
Extra context for LLMs about your project.
|
|
79
|
+
full_output: llms-full.txt
|
|
80
|
+
content_selector: article.md-content__inner
|
|
81
|
+
sections:
|
|
82
|
+
Getting Started:
|
|
83
|
+
- index.md
|
|
84
|
+
- install.md
|
|
85
|
+
Usage:
|
|
86
|
+
- guide/basics.md
|
|
87
|
+
- guide/advanced.md
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
| Option | Default | Description |
|
|
91
|
+
|--------|---------|-------------|
|
|
92
|
+
| `markdown_description` | `""` | Additional context for LLMs, appears after site description |
|
|
93
|
+
| `full_output` | `llms-full.txt` | Filename for concatenated content |
|
|
94
|
+
| `content_selector` | auto-detect | CSS selector for main content |
|
|
95
|
+
| `sections` | derived from nav | Section names mapped to page lists |
|
|
96
|
+
|
|
97
|
+
### Automatic fallback
|
|
98
|
+
|
|
99
|
+
Without an explicit `llmstxt` plugin config, sections derive from your `nav` structure. Top-level pages go into a "Pages" section; nested nav items become sections named by their keys.
|
|
100
|
+
|
|
101
|
+
### MkDocs settings
|
|
102
|
+
|
|
103
|
+
The tool respects `use_directory_urls` from your mkdocs.yml. When enabled (the default), `install.md` maps to `install/index.md`; when disabled, it maps to `install.md`.
|
|
104
|
+
|
|
105
|
+
### Content extraction
|
|
106
|
+
|
|
107
|
+
If `content_selector` is not set, the tool tries these selectors in order:
|
|
108
|
+
|
|
109
|
+
1. `.md-content__inner` (Material for MkDocs)
|
|
110
|
+
1. `[role="main"]` (default MkDocs theme)
|
|
111
|
+
1. `article`
|
|
112
|
+
1. `main`
|
|
113
|
+
1. The entire document
|
|
114
|
+
|
|
115
|
+
### Title resolution
|
|
116
|
+
|
|
117
|
+
Page titles resolve in this order:
|
|
118
|
+
|
|
119
|
+
1. The title from your `nav` structure
|
|
120
|
+
1. The HTML `<title>` tag (with site name suffix stripped)
|
|
121
|
+
1. The first `<h1>` tag
|
|
122
|
+
1. A title derived from the filename
|
|
123
|
+
|
|
124
|
+
## Programmatic use
|
|
125
|
+
|
|
126
|
+
```python
|
|
127
|
+
from pathlib import Path
|
|
128
|
+
from llmstxt_standalone.config import load_config
|
|
129
|
+
from llmstxt_standalone.generate import generate_llms_txt
|
|
130
|
+
|
|
131
|
+
config = load_config(Path("mkdocs.yml"))
|
|
132
|
+
result = generate_llms_txt(config, site_dir=Path("site"))
|
|
133
|
+
|
|
134
|
+
print(result.llms_txt) # Index content
|
|
135
|
+
print(result.llms_full_txt) # Full content
|
|
136
|
+
print(result.markdown_files) # List of written .md paths
|
|
137
|
+
```
|
|
138
|
+
|
|
139
|
+
## Compatibility
|
|
140
|
+
|
|
141
|
+
- Produces output identical to mkdocs-llmstxt when configured the same way
|
|
142
|
+
- Handles Unicode, international characters, and special characters
|
|
143
|
+
- Works with Material for MkDocs, ReadTheDocs, and the default MkDocs theme
|
|
144
|
+
- Parses configs containing Python YAML tags like `!python/object/apply`
|
|
145
|
+
|
|
146
|
+
## License
|
|
147
|
+
|
|
148
|
+
MIT
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "llmstxt-standalone"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "Generate llms.txt from built HTML documentation"
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
requires-python = ">=3.10"
|
|
7
|
+
license = { text = "MIT" }
|
|
8
|
+
keywords = ["llms", "documentation", "markdown", "mkdocs"]
|
|
9
|
+
classifiers = [
|
|
10
|
+
"Development Status :: 5 - Production/Stable",
|
|
11
|
+
"Environment :: Console",
|
|
12
|
+
"Intended Audience :: Developers",
|
|
13
|
+
"License :: OSI Approved :: MIT License",
|
|
14
|
+
"Programming Language :: Python :: 3",
|
|
15
|
+
"Programming Language :: Python :: 3.10",
|
|
16
|
+
"Programming Language :: Python :: 3.11",
|
|
17
|
+
"Programming Language :: Python :: 3.12",
|
|
18
|
+
"Programming Language :: Python :: 3.13",
|
|
19
|
+
"Programming Language :: Python :: 3.14",
|
|
20
|
+
"Topic :: Documentation",
|
|
21
|
+
"Typing :: Typed",
|
|
22
|
+
]
|
|
23
|
+
|
|
24
|
+
authors = [{ name = "Shaan Majid", email = "shaanmajid64@gmail.com" }]
|
|
25
|
+
|
|
26
|
+
dependencies = [
|
|
27
|
+
"typer>=0.9.0",
|
|
28
|
+
"pyyaml>=6.0",
|
|
29
|
+
"beautifulsoup4>=4.12",
|
|
30
|
+
"markdownify>=0.14,<2.0",
|
|
31
|
+
"mdformat>=0.7,<2.0",
|
|
32
|
+
"mdformat-tables>=1.0",
|
|
33
|
+
]
|
|
34
|
+
|
|
35
|
+
[project.scripts]
|
|
36
|
+
llmstxt-standalone = "llmstxt_standalone.cli:app"
|
|
37
|
+
|
|
38
|
+
[project.urls]
|
|
39
|
+
Repository = "https://github.com/shaanmajid/llmstxt-standalone"
|
|
40
|
+
Issues = "https://github.com/shaanmajid/llmstxt-standalone/issues"
|
|
41
|
+
|
|
42
|
+
[dependency-groups]
|
|
43
|
+
dev = [
|
|
44
|
+
{ include-group = "hooks" },
|
|
45
|
+
{ include-group = "lint" },
|
|
46
|
+
{ include-group = "typecheck" },
|
|
47
|
+
{ include-group = "test" },
|
|
48
|
+
]
|
|
49
|
+
hooks = ["prek>=0.3.0"]
|
|
50
|
+
lint = ["ruff>=0.14.14"]
|
|
51
|
+
typecheck = ["ty>=0.0.14"]
|
|
52
|
+
test = ["pytest>=8.0.0", "pytest-cov>=6.0.0"]
|
|
53
|
+
|
|
54
|
+
[tool.uv]
|
|
55
|
+
# renovate: datasource=pypi depName=uv
|
|
56
|
+
required-version = ">=0.9.27"
|
|
57
|
+
|
|
58
|
+
[build-system]
|
|
59
|
+
# renovate: datasource=pypi depName=uv_build
|
|
60
|
+
requires = ["uv_build>=0.9.0,<=0.9.27"]
|
|
61
|
+
build-backend = "uv_build"
|
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
"""Command-line interface."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Annotated
|
|
7
|
+
|
|
8
|
+
import typer
|
|
9
|
+
|
|
10
|
+
from llmstxt_standalone import __version__
|
|
11
|
+
from llmstxt_standalone.config import load_config
|
|
12
|
+
from llmstxt_standalone.generate import build_llms_output, write_markdown_files
|
|
13
|
+
|
|
14
|
+
# Typer application object; it is also the console-script entry point
# (`llmstxt_standalone.cli:app`). `no_args_is_help=False` lets a bare
# invocation run with the default option values instead of printing help.
app = typer.Typer(
    context_settings={"help_option_names": ["-h", "--help"]},
    no_args_is_help=False,
    help="Generate llms.txt from built HTML documentation.",
)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def version_callback(value: bool) -> None:
    """Handle the eager ``--version`` option.

    Args:
        value: Whether the flag was supplied on the command line.

    Raises:
        typer.Exit: After echoing the version, so that no further option
            processing or command logic runs.
    """
    if not value:
        # Flag absent: let normal command processing continue.
        return
    typer.echo(f"llmstxt-standalone {__version__}")
    raise typer.Exit()
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
@app.command()
def main(
    config: Annotated[
        Path,
        typer.Option("--config", "-c", help="Path to mkdocs.yml config file"),
    ] = Path("mkdocs.yml"),
    site_dir: Annotated[
        Path,
        typer.Option("--site-dir", "-s", help="Path to built HTML site directory"),
    ] = Path("site"),
    output_dir: Annotated[
        Path | None,
        typer.Option(
            "--output-dir", "-o", help="Output directory (defaults to site-dir)"
        ),
    ] = None,
    dry_run: Annotated[
        bool,
        typer.Option(
            "--dry-run",
            "-n",
            help="Preview what would be generated without writing files",
        ),
    ] = False,
    quiet: Annotated[
        bool,
        typer.Option("--quiet", "-q", help="Suppress output (exit code only)"),
    ] = False,
    verbose: Annotated[
        bool,
        typer.Option("--verbose", "-v", help="Show detailed progress"),
    ] = False,
    version: Annotated[
        bool,
        typer.Option(
            "--version",
            "-V",
            callback=version_callback,
            is_eager=True,
            help="Show version",
        ),
    ] = False,
) -> None:
    """Generate llms.txt and llms-full.txt from built HTML documentation."""
    # NOTE: the docstring above is shown by Typer as the command help text, so
    # implementation notes live in comments instead.
    #
    # Flow: validate inputs -> load config -> build content -> write per-page
    # markdown -> write llms.txt / full output -> print a summary.
    # Every failure path prints to stderr (even with --quiet) and exits 1.
    # `version` is consumed by its eager callback and is unused in the body.

    # Resolve output directory
    out_dir = output_dir or site_dir

    # quiet overrides verbose
    if quiet:
        verbose = False

    def log(msg: str, color: str = "green", err: bool = False) -> None:
        # Informational output only; errors bypass this so --quiet never
        # hides them.
        if not quiet:
            typer.secho(msg, fg=color, err=err)

    # Validate inputs
    if not config.exists():
        typer.secho(f"Error: Config file not found: {config}", fg="red", err=True)
        raise typer.Exit(1)

    if not site_dir.exists():
        typer.secho(f"Error: Site directory not found: {site_dir}", fg="red", err=True)
        typer.secho(
            "Hint: Run 'mkdocs build' first to generate the HTML documentation.",
            fg="yellow",
            err=True,
        )
        raise typer.Exit(1)

    # Load config
    try:
        cfg = load_config(config)
    except Exception as e:
        # Top-level CLI boundary: turn any load/parse failure into a clean
        # one-line error instead of a traceback.
        typer.secho(f"Error loading config: {e}", fg="red", err=True)
        raise typer.Exit(1) from None

    # Validate sections
    if not cfg.sections:
        typer.secho("Error: No sections configured.", fg="red", err=True)
        typer.secho(
            "Add a 'nav' to your mkdocs.yml, or configure 'sections' "
            "in the llmstxt plugin.",
            fg="yellow",
            err=True,
        )
        raise typer.Exit(1)

    if verbose:
        typer.echo(f"Site: {cfg.site_name}")
        typer.echo(f"Sections: {list(cfg.sections.keys())}")
        if dry_run:
            typer.echo("Dry run - no files will be written")

    # Generate content
    build = build_llms_output(
        config=cfg,
        site_dir=site_dir,
    )
    try:
        markdown_files = write_markdown_files(
            build.pages,
            output_dir=out_dir,
            use_directory_urls=cfg.use_directory_urls,
            dry_run=dry_run,
        )
    except (OSError, ValueError) as exc:
        typer.secho(f"Error writing markdown files: {exc}", fg="red", err=True)
        raise typer.Exit(1) from None

    # Define output paths
    llms_path = out_dir / "llms.txt"
    full_path = out_dir / cfg.full_output

    # Write output files (skip in dry-run mode)
    if dry_run:
        action = "Would generate"
        color = "yellow"
    else:
        action = "Generated"
        color = "green"
        try:
            out_dir.mkdir(parents=True, exist_ok=True)
            llms_path.write_text(build.llms_txt, encoding="utf-8")
            full_path.write_text(build.llms_full_txt, encoding="utf-8")
        except OSError as exc:
            typer.secho(f"Error writing output files: {exc}", fg="red", err=True)
            raise typer.Exit(1) from None

    # Report on-disk sizes: len() of a str counts characters, which
    # under-reports any non-ASCII content. Encode with the same codec used
    # by write_text() above so the figure really is bytes.
    llms_size = len(build.llms_txt.encode("utf-8"))
    full_size = len(build.llms_full_txt.encode("utf-8"))

    log(f"{action} {llms_path} ({llms_size:,} bytes)", color)
    log(f"{action} {full_path} ({full_size:,} bytes)", color)
    log(f"{action} {len(markdown_files)} markdown files", color)

    if verbose and build.skipped:
        log("Skipped files:", color="yellow", err=True)
        for path, reason in build.skipped:
            log(f"- {path} ({reason})", color="yellow", err=True)

    if build.warnings:
        log("Warnings:", color="yellow", err=True)
        for warning in build.warnings:
            log(f"- {warning}", color="yellow", err=True)
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
# Run the Typer app when this module is executed directly as a script.
if __name__ == "__main__":
    app()
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
"""Helpers for deriving config sections from nav."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def nav_to_sections(nav: list[Any]) -> dict[str, list[str]]:
    """Convert an MkDocs ``nav`` structure to a sections dict.

    Top-level pages, whether titled (``- Home: index.md``) or bare
    (``- index.md``), are collected under a ``"Pages"`` section. A top-level
    entry whose value is a list becomes a section named by its key and holds
    every page found beneath it, at any depth.

    Args:
        nav: Parsed ``nav`` value from mkdocs.yml. ``None`` (an empty ``nav:``
            key) is treated as an empty list.

    Returns:
        Mapping of section name to page paths, in nav order. Sections that
        contain no pages are omitted.
    """
    sections: dict[str, list[str]] = {}

    for item in nav or []:
        if isinstance(item, str):
            # Untitled top-level page; previously these were silently dropped.
            sections.setdefault("Pages", []).append(item)
        elif isinstance(item, dict):
            for key, value in item.items():
                if isinstance(value, str):
                    # Top-level page, add to "Pages" section
                    sections.setdefault("Pages", []).append(value)
                elif isinstance(value, list):
                    # Section with children
                    pages = _extract_pages(value)
                    if pages:
                        # Extend instead of assign so a repeated section title
                        # (or a section literally named "Pages") does not
                        # discard pages collected earlier.
                        sections.setdefault(key, []).extend(pages)

    return sections
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _extract_pages(items: list[Any]) -> list[str]:
|
|
28
|
+
"""Extract page paths from nav items."""
|
|
29
|
+
pages: list[str] = []
|
|
30
|
+
for item in items:
|
|
31
|
+
if isinstance(item, str):
|
|
32
|
+
pages.append(item)
|
|
33
|
+
elif isinstance(item, dict):
|
|
34
|
+
for value in item.values():
|
|
35
|
+
if isinstance(value, str):
|
|
36
|
+
pages.append(value)
|
|
37
|
+
elif isinstance(value, list):
|
|
38
|
+
pages.extend(_extract_pages(value))
|
|
39
|
+
return pages
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
"""Configuration loading from mkdocs.yml."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
import yaml
|
|
9
|
+
|
|
10
|
+
from llmstxt_standalone.config.derive import nav_to_sections
|
|
11
|
+
from llmstxt_standalone.config.model import Config
|
|
12
|
+
from llmstxt_standalone.config.plugin import get_llmstxt_config
|
|
13
|
+
|
|
14
|
+
# Fallback values used when mkdocs.yml does not provide the setting.
DEFAULT_FULL_OUTPUT = "llms-full.txt"  # filename of the concatenated output
DEFAULT_SITE_NAME = "Documentation"  # used when `site_name` is absent
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class _PermissiveLoader(yaml.SafeLoader):
    """SafeLoader variant that tolerates Python-specific YAML tags.

    Some mkdocs.yml files (for example those configuring pymdownx.slugs)
    contain tags such as ``!python/object/apply`` that a plain SafeLoader
    refuses to parse. Multi-constructors registered on this class at module
    level replace such nodes with placeholder strings, so the rest of the
    config can still be read without constructing any Python objects.
    """
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _ignore_unknown(loader: yaml.Loader, tag_suffix: str, node: yaml.Node) -> str:
|
|
28
|
+
"""Return the raw tag as a placeholder string."""
|
|
29
|
+
return f"<{node.tag}>"
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
# Route every Python-specific tag to the placeholder handler. Both the fully
# resolved prefix and the shorthand "!python/" spelling are registered.
for _python_tag_prefix in ("tag:yaml.org,2002:python/", "!python/"):
    _PermissiveLoader.add_multi_constructor(_python_tag_prefix, _ignore_unknown)
del _python_tag_prefix  # keep the module namespace unchanged
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def load_config(config_path: Path) -> Config:
    """Read mkdocs.yml and turn it into a resolved ``Config``.

    Args:
        config_path: Location of the mkdocs.yml file.

    Returns:
        The ``Config`` built from the parsed file.

    Raises:
        FileNotFoundError: If ``config_path`` does not exist.
        ValueError: If the top-level YAML value is not a mapping (this
            includes an empty file, which parses to ``None``).
    """
    if not config_path.exists():
        raise FileNotFoundError(f"Config file not found: {config_path}")

    # The permissive loader is SafeLoader-based; Python-specific tags become
    # placeholder strings and are never executed.
    with config_path.open(encoding="utf-8") as stream:
        parsed = yaml.load(stream, Loader=_PermissiveLoader)

    if isinstance(parsed, dict):
        return _config_from_mkdocs(parsed)

    raise ValueError(f"Config file must be a mapping: {config_path}")
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def _config_from_mkdocs(raw: dict[str, Any]) -> Config:
    """Build a Config from a parsed mkdocs.yml mapping.

    A key that is present but null (e.g. a bare ``site_url:`` line) is
    treated the same as a missing key, so it receives the default instead of
    propagating ``None`` into string operations.

    Args:
        raw: Top-level mapping parsed from mkdocs.yml.

    Returns:
        Config populated from the site settings and, when present, the
        ``llmstxt`` plugin options. Sections come from the plugin's
        ``sections`` option when it is given; otherwise they are derived
        from ``nav``.
    """

    def _setting(mapping: dict[str, Any], key: str, default: Any) -> Any:
        # dict.get() only falls back when the key is absent; YAML nulls must
        # fall back as well.
        value = mapping.get(key)
        return default if value is None else value

    site_name = _setting(raw, "site_name", DEFAULT_SITE_NAME)
    site_description = _setting(raw, "site_description", "")
    # Strip the trailing slash from the configured site URL.
    site_url = _setting(raw, "site_url", "").rstrip("/")
    nav = _setting(raw, "nav", [])
    # MkDocs defaults use_directory_urls to true
    use_directory_urls = _setting(raw, "use_directory_urls", True)

    llmstxt_config = get_llmstxt_config(raw)

    if llmstxt_config is not None:
        markdown_description = _setting(llmstxt_config, "markdown_description", "")
        full_output = _setting(llmstxt_config, "full_output", DEFAULT_FULL_OUTPUT)
        content_selector = llmstxt_config.get("content_selector")
        sections = llmstxt_config.get("sections")
        if sections is None:
            # Plugin enabled without explicit sections: use the documented
            # default (derived from nav) rather than ending up with none.
            sections = nav_to_sections(nav)
    else:
        markdown_description = ""
        full_output = DEFAULT_FULL_OUTPUT
        content_selector = None
        sections = nav_to_sections(nav)

    return Config(
        site_name=site_name,
        site_description=site_description,
        site_url=site_url,
        markdown_description=markdown_description,
        full_output=full_output,
        content_selector=content_selector,
        sections=sections,
        nav=nav,
        use_directory_urls=use_directory_urls,
    )
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
"""Configuration model and derived helpers."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@dataclass
class Config:
    """Resolved configuration for llmstxt generation."""

    # Site metadata.
    site_name: str
    site_description: str
    site_url: str
    # llmstxt plugin options.
    markdown_description: str
    full_output: str
    content_selector: str | None
    # Section name -> page paths listed under it.
    sections: dict[str, list[str]]
    # Nav structure; searched for page titles by the methods below.
    nav: list[Any]
    use_directory_urls: bool = True

    def get_nav_title(self, md_path: str) -> str | None:
        """Find the title for a page from the nav structure only.

        Args:
            md_path: Markdown path exactly as it appears in nav.

        Returns:
            The nav title, or None if the page is not found in nav or has no
            explicit title.
        """
        return self._search_nav(self.nav, md_path, section_title=None)

    def get_filename_title(self, md_path: str) -> str:
        """Derive a title from the file path.

        Only a trailing ``.md`` extension is removed, so a ``.md`` that
        occurs elsewhere in the path is left intact. Hyphens become spaces,
        path separators become `` - ``, and the result is title-cased, e.g.
        ``guide/getting-started.md`` -> ``Guide - Getting Started``.
        """
        stem = md_path.removesuffix(".md")
        return stem.replace("-", " ").replace("/", " - ").title()

    def get_page_title(self, md_path: str) -> str:
        """Return the nav title for a page, or a filename-derived title."""
        return self.get_nav_title(md_path) or self.get_filename_title(md_path)

    def _search_nav(
        self, items: list[Any], md_path: str, section_title: str | None
    ) -> str | None:
        """Recursively search nav for a page title.

        Args:
            items: Nav items to search (list of dicts or strings).
            md_path: Markdown file path to find.
            section_title: Title of the current section for bare string inheritance.

        Returns:
            The matching title, or None when the page is not found (or is a
            bare top-level entry, which has no section to inherit from).
        """
        for item in items:
            # Bare string in a section list inherits section title
            if isinstance(item, str) and item == md_path:
                return section_title
            if isinstance(item, dict):
                for key, value in item.items():
                    if isinstance(value, str) and value == md_path:
                        return key
                    if isinstance(value, list):
                        result = self._search_nav(value, md_path, section_title=key)
                        if result:
                            return result
        return None
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
"""MkDocs plugin configuration helpers."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def get_llmstxt_config(raw: dict[str, Any]) -> dict[str, Any] | None:
    """Pull the ``llmstxt`` plugin options out of a parsed mkdocs.yml.

    Both plugin declaration styles are understood.

    List form:
        plugins:
          - llmstxt:
              sections: ...

    Mapping form:
        plugins:
          llmstxt:
            sections: ...

    Args:
        raw: Parsed mkdocs.yml content.

    Returns:
        The plugin's options dict (empty when the plugin is enabled without
        options), or None when the plugin is not configured at all.
    """

    def _as_options(value: Any) -> dict[str, Any]:
        # A plugin listed without options may carry None (or another
        # non-dict value); treat that as "no options".
        return value if isinstance(value, dict) else {}

    plugins = raw.get("plugins")
    if plugins is None:
        return None

    # Mapping form: plugin names are the keys.
    if isinstance(plugins, dict):
        return _as_options(plugins["llmstxt"]) if "llmstxt" in plugins else None

    # List form: each entry is either a bare name or a one-key dict.
    for entry in plugins:
        if entry == "llmstxt":
            return {}
        if isinstance(entry, dict) and "llmstxt" in entry:
            return _as_options(entry["llmstxt"])
    return None
|
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
"""HTML to Markdown conversion."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import mdformat
|
|
6
|
+
from bs4 import BeautifulSoup, NavigableString, Tag
|
|
7
|
+
from markdownify import ATX, MarkdownConverter
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def _should_remove(tag: Tag) -> bool:
|
|
11
|
+
"""Check if a tag should be removed during autoclean."""
|
|
12
|
+
if tag.name in {"img", "svg"}:
|
|
13
|
+
return True
|
|
14
|
+
if tag.name == "a" and tag.img:
|
|
15
|
+
return True
|
|
16
|
+
classes = tag.get("class") or ()
|
|
17
|
+
if tag.name == "a" and "headerlink" in classes:
|
|
18
|
+
return True
|
|
19
|
+
if "twemoji" in classes:
|
|
20
|
+
return True
|
|
21
|
+
return "tabbed-labels" in classes
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _autoclean(soup: BeautifulSoup | Tag) -> None:
    """Strip non-content markup from a parsed HTML tree, in place.

    Three passes are made:
      1. Elements matched by ``_should_remove`` are deleted.
      2. ``<autoref>`` elements are replaced by their plain text.
      3. ``highlighttable`` tables (code blocks with a line-number column)
         are replaced by a plain ``<pre>`` holding only the code text.

    Args:
        soup: Parsed document or subtree to clean; modified in place.
    """
    from html import escape  # stdlib; local import keeps this block self-contained

    for element in soup.find_all(_should_remove):
        element.decompose()

    # Unwrap autoref elements
    for element in soup.find_all("autoref"):
        element.replace_with(NavigableString(element.get_text()))

    # Remove line numbers from code blocks
    for element in soup.find_all("table", attrs={"class": "highlighttable"}):
        code = element.find("code")
        if code:
            # get_text() returns decoded text; it must be re-escaped before
            # being parsed again, otherwise code containing "<", ">" or "&"
            # (e.g. an HTML sample) is interpreted as markup and corrupted.
            element.replace_with(
                BeautifulSoup(f"<pre>{escape(code.get_text())}</pre>", "html.parser")
            )
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def _get_language(tag: Tag) -> str:
|
|
43
|
+
"""Extract language from code block classes.
|
|
44
|
+
|
|
45
|
+
The callback receives the <pre> tag, so we need to check:
|
|
46
|
+
1. Classes on the <pre> tag itself
|
|
47
|
+
2. Classes on the parent of <pre>
|
|
48
|
+
3. Classes on child <code> element (common pattern: <pre><code class="language-X">)
|
|
49
|
+
"""
|
|
50
|
+
classes: list[str] = list(tag.get("class") or ())
|
|
51
|
+
|
|
52
|
+
# Check parent classes
|
|
53
|
+
if tag.parent:
|
|
54
|
+
classes.extend(tag.parent.get("class") or ())
|
|
55
|
+
|
|
56
|
+
# Check child <code> element classes
|
|
57
|
+
code_child = tag.find("code")
|
|
58
|
+
if code_child:
|
|
59
|
+
classes.extend(code_child.get("class") or ())
|
|
60
|
+
|
|
61
|
+
for css_class in classes:
|
|
62
|
+
if css_class.startswith("language-"):
|
|
63
|
+
return css_class[9:]
|
|
64
|
+
return ""
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
# Converter with mkdocs-llmstxt-compatible settings
|
|
68
|
+
_converter = MarkdownConverter(
    heading_style=ATX,
    bullets="-",
    escape_underscores=False,
    # Language lookup for fenced code blocks; the callback gets the <pre> tag.
    code_language_callback=_get_language,
)
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def extract_title_from_html(html: str, site_name: str | None = None) -> str | None:
    """Extract page title from HTML.

    Tries the <title> tag first, then falls back to the first <h1>.
    A trailing site name is stripped from the <title> text
    (e.g., "Page - Site Name" -> "Page") when ``site_name`` is provided and
    matches case-insensitively.

    Args:
        html: Raw HTML content.
        site_name: Site name to strip from title suffixes (e.g., "Page - Site").

    Returns:
        The page title, or None if not found.
    """
    soup = BeautifulSoup(html, "html.parser")

    # Try <title> tag first
    title_tag = soup.find("title")
    if title_tag:
        title = title_tag.get_text().strip()
        # Strip site name suffix only when it matches the configured site name.
        if site_name and " - " in title:
            base, suffix = title.rsplit(" - ", 1)
            if suffix.strip().casefold() == site_name.strip().casefold():
                title = base.strip()
        if title:
            return title

    # Fall back to first <h1>
    h1_tag = soup.find("h1")
    if h1_tag:
        # Permalink anchors inside headings contribute their marker text
        # (e.g. a pilcrow) to get_text(); drop them so the title is clean.
        # ``soup`` is local to this call, so mutating it is safe.
        for anchor in h1_tag.find_all("a", class_="headerlink"):
            anchor.decompose()
        text = h1_tag.get_text().strip()
        if text:
            return text

    return None
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def html_to_markdown(html: str, content_selector: str | None = None) -> str:
    """Convert HTML to clean Markdown.

    Args:
        html: Raw HTML content.
        content_selector: Optional CSS selector for main content.
            When omitted, a series of built-in selectors is tried
            (Material for MkDocs, default MkDocs theme, ``article``,
            ``main``) before falling back to the whole document.

    Returns:
        Cleaned Markdown text. An empty string is returned when a valid
        ``content_selector`` matches nothing.
    """
    soup = BeautifulSoup(html, "html.parser")

    # Find main content
    content = None
    if content_selector:
        try:
            content = soup.select_one(content_selector)
        except Exception:
            # Best-effort: a selector that cannot be evaluated falls through
            # to the built-in defaults below instead of failing the page.
            content = None
        else:
            if content is None:
                # The selector was valid but matched nothing: no content.
                return ""

    if content is None:
        # The chain always yields something because it ends in ``soup``,
        # so no further None check is needed after this point.
        content = (
            soup.select_one(".md-content__inner")  # Material for MkDocs
            or soup.select_one('[role="main"]')  # Default MkDocs theme
            or soup.select_one("article")
            or soup.select_one("main")
            or soup
        )

    _autoclean(content)
    md = _converter.convert_soup(content)
    return mdformat.text(md, options={"wrap": "no"}, extensions=("tables",))
|
|
@@ -0,0 +1,343 @@
|
|
|
1
|
+
"""Main generation orchestration."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
from llmstxt_standalone.config import Config
|
|
9
|
+
from llmstxt_standalone.convert import extract_title_from_html, html_to_markdown
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def _escape_markdown_link_text(text: str) -> str:
|
|
13
|
+
r"""Escape characters that break markdown link syntax.
|
|
14
|
+
|
|
15
|
+
Args:
|
|
16
|
+
text: The link text to escape.
|
|
17
|
+
|
|
18
|
+
Returns:
|
|
19
|
+
Text with backslashes, brackets escaped and newlines replaced with spaces.
|
|
20
|
+
"""
|
|
21
|
+
return (
|
|
22
|
+
text.replace("\\", "\\\\")
|
|
23
|
+
.replace("[", r"\[")
|
|
24
|
+
.replace("]", r"\]")
|
|
25
|
+
.replace("\r\n", " ")
|
|
26
|
+
.replace("\n", " ")
|
|
27
|
+
.replace("\r", " ")
|
|
28
|
+
)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _is_index_md(md_path: str) -> bool:
|
|
32
|
+
return md_path == "index.md" or md_path.endswith("/index.md")
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def _ensure_safe_md_path(md_path: str) -> Path:
|
|
36
|
+
path = Path(md_path)
|
|
37
|
+
if path.is_absolute() or path.drive:
|
|
38
|
+
raise ValueError(f"Markdown path must be relative: {md_path}")
|
|
39
|
+
if ".." in path.parts:
|
|
40
|
+
raise ValueError(f"Markdown path must not contain '..': {md_path}")
|
|
41
|
+
return path
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _ensure_within_dir(base_dir: Path, path: Path, label: str) -> Path:
|
|
45
|
+
base_resolved = base_dir.resolve(strict=False)
|
|
46
|
+
path_resolved = path.resolve(strict=False)
|
|
47
|
+
if not path_resolved.is_relative_to(base_resolved):
|
|
48
|
+
raise ValueError(f"{label} resolves outside {base_resolved}: {path}")
|
|
49
|
+
return path
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def md_path_to_html_path(
    site_dir: Path, md_path: str, use_directory_urls: bool = True
) -> Path:
    """Map a markdown source path to its built HTML file under ``site_dir``.

    Args:
        site_dir: Path to the built site directory.
        md_path: Relative markdown file path (e.g., "install.md").
        use_directory_urls: If True, ``foo.md`` maps to ``foo/index.html``;
            if False, it maps to ``foo.html``. ``index.md`` files (at any
            depth) always map to the sibling ``index.html``.

    Returns:
        Path to the corresponding HTML file.

    Raises:
        ValueError: If ``md_path`` is unsafe or the result resolves outside
            ``site_dir``.
    """
    relative = _ensure_safe_md_path(md_path)
    if use_directory_urls and not _is_index_md(md_path):
        # Directory-style URL: the page lives in its own folder.
        target = site_dir / relative.with_suffix("") / "index.html"
    else:
        # index.md at any level, or flat URLs: same location, .html suffix.
        target = site_dir / relative.with_suffix(".html")
    return _ensure_within_dir(site_dir, target, "HTML path")
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def md_path_to_page_url(
    site_url: str,
    md_path: str,
    use_directory_urls: bool = True,
) -> str:
    """Convert docs/foo.md path to markdown file URL on the deployed site.

    Args:
        site_url: Base URL of the site. When empty, a site-relative path is
            returned instead of an absolute URL.
        md_path: Relative markdown file path (e.g., "install.md").
        use_directory_urls: If True, ``foo.md`` becomes ``foo/index.md``;
            if False, the path is used as-is. ``index.md`` files (at any
            depth) are never rewritten.

    Returns:
        URL (or relative path) of the markdown file on the deployed site.
    """
    if use_directory_urls and not _is_index_md(md_path):
        # Strip only the trailing extension. A blanket replace would also
        # remove ".md" inside directory names and point at a location that
        # differs from where the per-page markdown file is written.
        relative = f"{md_path.removesuffix('.md')}/index.md"
    else:
        relative = md_path

    if not site_url:
        return relative
    # Tolerate a trailing slash on the base URL so the join never yields "//".
    return f"{site_url.rstrip('/')}/{relative}"
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def md_path_to_output_md_path(
    site_dir: Path, md_path: str, use_directory_urls: bool = True
) -> Path:
    """Map a markdown source path to where its generated markdown is written.

    Args:
        site_dir: Directory the output is written under.
        md_path: Relative markdown file path (e.g., "install.md").
        use_directory_urls: If True, ``foo.md`` is written to
            ``foo/index.md``; if False, to ``foo.md``. ``index.md`` files
            (at any depth) keep their path.

    Returns:
        Path where the markdown file should be written.

    Raises:
        ValueError: If ``md_path`` is unsafe or the result resolves outside
            ``site_dir``.
    """
    relative = _ensure_safe_md_path(md_path)
    if use_directory_urls and not _is_index_md(md_path):
        destination = site_dir / relative.with_suffix("") / "index.md"
    else:
        # index.md at any level, or flat URLs: path is kept unchanged.
        destination = site_dir / relative
    return _ensure_within_dir(site_dir, destination, "Output path")
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
@dataclass
class PageMarkdown:
    """Per-page markdown output.

    Pairs a page's source markdown path with the markdown text produced for it.
    """

    # Relative markdown path of the page, as listed in the configured sections.
    md_path: str
    # Markdown converted from the page's HTML; may be an empty string.
    content: str
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
@dataclass
class BuildResult:
    """Result of building llms.txt content (no files written)."""

    # Text of the llms.txt index.
    llms_txt: str
    # Text of llms-full.txt (the index header plus converted page content).
    llms_full_txt: str
    # One entry per page that was read successfully.
    pages: list[PageMarkdown]
    # (path, reason) pairs for pages that were skipped.
    skipped: list[tuple[Path, str]]
    # Human-readable messages about non-fatal problems.
    warnings: list[str]
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
@dataclass
class GenerateResult:
    """Result of llms.txt generation with files written."""

    # Text of the llms.txt index.
    llms_txt: str
    # Text of llms-full.txt.
    llms_full_txt: str
    # Per-page markdown output paths (written, or would-be in a dry run).
    markdown_files: list[Path]
    # (path, reason) pairs for pages that were skipped.
    skipped: list[tuple[Path, str]]
    # Human-readable messages about non-fatal problems.
    warnings: list[str]
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
def build_llms_output(
    config: Config,
    site_dir: Path,
) -> BuildResult:
    """Assemble llms.txt, llms-full.txt and per-page markdown in memory.

    For every page listed in ``config.sections`` the built HTML file is
    located and read, a title is chosen, an index entry is added, and the
    HTML is converted to markdown. Nothing is written to disk.

    Args:
        config: Resolved configuration.
        site_dir: Path to built HTML site directory.

    Returns:
        BuildResult with both documents' text, per-page markdown, the
        pages that were skipped (with reasons) and any warnings.
    """
    # Both documents open with the same title / description header.
    header = [f"# {config.site_name}", ""]
    if config.site_description:
        header += [f"> {config.site_description}", ""]

    full_lines = list(header)
    llms_lines = list(header)
    if config.markdown_description:
        # Only the index carries the extra markdown description.
        llms_lines += [config.markdown_description.strip(), ""]

    page_outputs: list[PageMarkdown] = []
    skipped: list[tuple[Path, str]] = []
    warnings: list[str] = []

    for section_name, section_pages in config.sections.items():
        section_entries: list[str] = []

        for md_path in section_pages:
            # Locate the built HTML; unsafe paths are skipped, not fatal.
            try:
                html_path = md_path_to_html_path(
                    site_dir, md_path, config.use_directory_urls
                )
            except ValueError as exc:
                skipped.append((site_dir / md_path, str(exc)))
                continue

            if not html_path.exists():
                skipped.append((html_path, "HTML file not found"))
                continue

            try:
                html = html_path.read_text(encoding="utf-8")
            except UnicodeDecodeError:
                skipped.append((html_path, "HTML file has encoding errors"))
                continue
            except OSError as exc:
                skipped.append((html_path, f"Failed to read HTML file: {exc}"))
                continue

            # Title precedence: nav (mkdocs-llmstxt compat), then the HTML
            # itself, then a title derived from the filename.
            title = config.get_nav_title(md_path)
            if not title:
                title = extract_title_from_html(html, site_name=config.site_name)
            if not title:
                title = config.get_filename_title(md_path)

            page_url = md_path_to_page_url(
                config.site_url,
                md_path,
                config.use_directory_urls,
            )
            # Brackets in the title would break the markdown link syntax.
            escaped_title = _escape_markdown_link_text(title)
            section_entries.append(f"- [{escaped_title}]({page_url})")

            # Page body for llms-full.txt; conversion failure is a warning.
            try:
                content = html_to_markdown(html, config.content_selector)
            except Exception as exc:
                warnings.append(f"Failed to convert HTML from {html_path}: {exc}")
                content = ""

            if content:
                full_lines.extend([f"## {title}", "", content, ""])
            else:
                warnings.append(
                    f"No markdown content extracted from {html_path}; content empty"
                )

            page_outputs.append(PageMarkdown(md_path=md_path, content=content))

        # Sections whose pages were all skipped are left out of the index.
        if section_entries:
            llms_lines.extend([f"## {section_name}", "", *section_entries, ""])

    return BuildResult(
        llms_txt="\n".join(llms_lines),
        llms_full_txt="\n".join(full_lines),
        pages=page_outputs,
        skipped=skipped,
        warnings=warnings,
    )
|
|
276
|
+
|
|
277
|
+
|
|
278
|
+
def write_markdown_files(
    pages: list[PageMarkdown],
    output_dir: Path,
    use_directory_urls: bool,
    dry_run: bool = False,
) -> list[Path]:
    """Write each page's markdown to its output location.

    Args:
        pages: Per-page markdown content.
        output_dir: Directory to write output files under.
        use_directory_urls: If True, outputs to foo/index.md; if False, outputs to foo.md.
        dry_run: If True, compute the output paths but write nothing.

    Returns:
        List of output markdown paths (written or would-be), in page order.

    Raises:
        OSError: If a directory cannot be created or a file cannot be
            written; the message names the failing path.
    """
    destinations: list[Path] = []
    for page in pages:
        destination = md_path_to_output_md_path(
            output_dir, page.md_path, use_directory_urls
        )
        if dry_run:
            destinations.append(destination)
            continue
        try:
            destination.parent.mkdir(parents=True, exist_ok=True)
            destination.write_text(page.content, encoding="utf-8")
        except OSError as exc:
            raise OSError(f"Failed to write {destination}: {exc}") from exc
        destinations.append(destination)
    return destinations
|
|
308
|
+
|
|
309
|
+
|
|
310
|
+
def generate_llms_txt(
    config: Config,
    site_dir: Path,
    output_dir: Path | None = None,
    dry_run: bool = False,
) -> GenerateResult:
    """Build all llms.txt content and write the per-page markdown files.

    Args:
        config: Resolved configuration.
        site_dir: Path to built HTML site directory.
        output_dir: Path to write output files. Defaults to site_dir.
        dry_run: If True, don't write markdown files.

    Returns:
        GenerateResult carrying the llms.txt and llms-full.txt text, the
        list of markdown files (written or would-be), skipped pages and
        warnings.
    """
    build = build_llms_output(config=config, site_dir=site_dir)

    target_dir = site_dir if output_dir is None else output_dir
    markdown_files = write_markdown_files(
        build.pages,
        output_dir=target_dir,
        use_directory_urls=config.use_directory_urls,
        dry_run=dry_run,
    )

    return GenerateResult(
        llms_txt=build.llms_txt,
        llms_full_txt=build.llms_full_txt,
        markdown_files=markdown_files,
        skipped=build.skipped,
        warnings=build.warnings,
    )
|
|
File without changes
|