llmsbrieftxt 1.3.1__tar.gz → 1.4.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of llmsbrieftxt might be problematic.
- llmsbrieftxt-1.4.0/.github/copilot-instructions.md +115 -0
- {llmsbrieftxt-1.3.1 → llmsbrieftxt-1.4.0}/CLAUDE.md +27 -11
- {llmsbrieftxt-1.3.1 → llmsbrieftxt-1.4.0}/PKG-INFO +44 -12
- {llmsbrieftxt-1.3.1 → llmsbrieftxt-1.4.0}/README.md +43 -11
- {llmsbrieftxt-1.3.1 → llmsbrieftxt-1.4.0}/llmsbrieftxt/cli.py +112 -15
- {llmsbrieftxt-1.3.1 → llmsbrieftxt-1.4.0}/llmsbrieftxt/constants.py +26 -0
- llmsbrieftxt-1.4.0/llmsbrieftxt/main.py +227 -0
- {llmsbrieftxt-1.3.1 → llmsbrieftxt-1.4.0}/pyproject.toml +1 -1
- {llmsbrieftxt-1.3.1 → llmsbrieftxt-1.4.0}/tests/unit/test_cli.py +128 -1
- llmsbrieftxt-1.3.1/llmsbrieftxt/main.py +0 -142
- {llmsbrieftxt-1.3.1 → llmsbrieftxt-1.4.0}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {llmsbrieftxt-1.3.1 → llmsbrieftxt-1.4.0}/.github/ISSUE_TEMPLATE/config.yml +0 -0
- {llmsbrieftxt-1.3.1 → llmsbrieftxt-1.4.0}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {llmsbrieftxt-1.3.1 → llmsbrieftxt-1.4.0}/.github/ISSUE_TEMPLATE/question.yml +0 -0
- {llmsbrieftxt-1.3.1 → llmsbrieftxt-1.4.0}/.github/PULL_REQUEST_TEMPLATE.md +0 -0
- {llmsbrieftxt-1.3.1 → llmsbrieftxt-1.4.0}/.github/pull_request_template.md +0 -0
- {llmsbrieftxt-1.3.1 → llmsbrieftxt-1.4.0}/.github/workflows/ci.yml +0 -0
- {llmsbrieftxt-1.3.1 → llmsbrieftxt-1.4.0}/.github/workflows/pr-title-check.yml +0 -0
- {llmsbrieftxt-1.3.1 → llmsbrieftxt-1.4.0}/.github/workflows/release.yml +0 -0
- {llmsbrieftxt-1.3.1 → llmsbrieftxt-1.4.0}/.gitignore +0 -0
- {llmsbrieftxt-1.3.1 → llmsbrieftxt-1.4.0}/CONTRIBUTING.md +0 -0
- {llmsbrieftxt-1.3.1 → llmsbrieftxt-1.4.0}/LICENSE +0 -0
- {llmsbrieftxt-1.3.1 → llmsbrieftxt-1.4.0}/PRODUCTION_CLEANUP_PLAN.md +0 -0
- {llmsbrieftxt-1.3.1 → llmsbrieftxt-1.4.0}/llmsbrieftxt/__init__.py +0 -0
- {llmsbrieftxt-1.3.1 → llmsbrieftxt-1.4.0}/llmsbrieftxt/crawler.py +0 -0
- {llmsbrieftxt-1.3.1 → llmsbrieftxt-1.4.0}/llmsbrieftxt/doc_loader.py +0 -0
- {llmsbrieftxt-1.3.1 → llmsbrieftxt-1.4.0}/llmsbrieftxt/extractor.py +0 -0
- {llmsbrieftxt-1.3.1 → llmsbrieftxt-1.4.0}/llmsbrieftxt/schema.py +0 -0
- {llmsbrieftxt-1.3.1 → llmsbrieftxt-1.4.0}/llmsbrieftxt/summarizer.py +0 -0
- {llmsbrieftxt-1.3.1 → llmsbrieftxt-1.4.0}/llmsbrieftxt/url_filters.py +0 -0
- {llmsbrieftxt-1.3.1 → llmsbrieftxt-1.4.0}/llmsbrieftxt/url_utils.py +0 -0
- {llmsbrieftxt-1.3.1 → llmsbrieftxt-1.4.0}/pytest.ini +0 -0
- {llmsbrieftxt-1.3.1 → llmsbrieftxt-1.4.0}/scripts/bump_version.py +0 -0
- {llmsbrieftxt-1.3.1 → llmsbrieftxt-1.4.0}/tests/__init__.py +0 -0
- {llmsbrieftxt-1.3.1 → llmsbrieftxt-1.4.0}/tests/conftest.py +0 -0
- {llmsbrieftxt-1.3.1 → llmsbrieftxt-1.4.0}/tests/fixtures/__init__.py +0 -0
- {llmsbrieftxt-1.3.1 → llmsbrieftxt-1.4.0}/tests/integration/__init__.py +0 -0
- {llmsbrieftxt-1.3.1 → llmsbrieftxt-1.4.0}/tests/integration/test_doc_loader_integration.py +0 -0
- {llmsbrieftxt-1.3.1 → llmsbrieftxt-1.4.0}/tests/unit/__init__.py +0 -0
- {llmsbrieftxt-1.3.1 → llmsbrieftxt-1.4.0}/tests/unit/test_doc_loader.py +0 -0
- {llmsbrieftxt-1.3.1 → llmsbrieftxt-1.4.0}/tests/unit/test_extractor.py +0 -0
- {llmsbrieftxt-1.3.1 → llmsbrieftxt-1.4.0}/tests/unit/test_robustness.py +0 -0
- {llmsbrieftxt-1.3.1 → llmsbrieftxt-1.4.0}/tests/unit/test_summarizer.py +0 -0
- {llmsbrieftxt-1.3.1 → llmsbrieftxt-1.4.0}/uv.lock +0 -0
llmsbrieftxt-1.4.0/.github/copilot-instructions.md (new file)

@@ -0,0 +1,115 @@
+# GitHub Copilot Instructions for llmsbrieftxt
+
+## Project Overview
+
+This is `llmsbrieftxt`, a Python package that generates llms-brief.txt files by crawling documentation websites and using OpenAI to create structured descriptions. The CLI command is `llmtxt` (not `llmsbrieftxt`).
+
+## Architecture and Code Patterns
+
+### Async-First Design
+All main functions use async/await patterns. Use `asyncio.gather()` for concurrent operations and semaphore control for rate limiting. The processing pipeline flows: URL Discovery → Content Extraction → LLM Summarization → File Generation.
+
+### Module Organization
+- **cli.py**: Simple CLI with positional URL argument (no subcommands)
+- **main.py**: Orchestrates the async generation pipeline
+- **crawler.py**: RobustDocCrawler for breadth-first URL discovery
+- **doc_loader.py**: DocLoader wraps crawler with document loading
+- **extractor.py**: HTML to markdown via trafilatura
+- **summarizer.py**: OpenAI integration with retry logic (tenacity)
+- **url_utils.py**: URLNormalizer for deduplication
+- **url_filters.py**: Filter non-documentation URLs
+- **schema.py**: Pydantic models (PageSummary)
+- **constants.py**: Configuration constants
+
+### Type Safety
+Use Pydantic models for all structured data. The OpenAI integration uses structured output with the PageSummary model.
+
+### Error Handling
+Failed URL loads should be logged but not stop processing. LLM failures use exponential backoff retries via tenacity. Never let one failure break the entire pipeline.
+
+## Development Practices
+
+### Testing Requirements
+Write tests before implementing features. Use pytest with these markers:
+- `@pytest.mark.unit` for fast, isolated tests
+- `@pytest.mark.requires_openai` for tests needing OPENAI_API_KEY
+- `@pytest.mark.slow` for tests making external API calls
+
+Tests go in:
+- `tests/unit/` for fast tests with no external dependencies
+- `tests/integration/` for tests requiring OPENAI_API_KEY
+
+### Code Quality Tools
+Before committing, always run:
+1. Format: `uv run ruff format llmsbrieftxt/ tests/`
+2. Lint: `uv run ruff check llmsbrieftxt/ tests/`
+3. Type check: `uv run pyright llmsbrieftxt/`
+4. Tests: `uv run pytest tests/unit/`
+
+### Package Management
+Use `uv` for all package operations:
+- Install: `uv sync --group dev`
+- Add dependency: `uv add package-name`
+- Build: `uv build`
+
+## Design Philosophy
+
+### Unix Philosophy
+This project follows "do one thing and do it well":
+- Generate llms-brief.txt files only (no built-in search/list features)
+- Compose with standard Unix tools (rg, grep, ls)
+- Simple CLI: URL is a positional argument, no subcommands
+- Plain text output for scriptability
+
+### Simplicity Over Features
+Avoid adding functionality that duplicates mature Unix tools. Every line of code must serve the core mission of generating llms-brief.txt files.
+
+## Configuration Defaults
+
+- **Crawl Depth**: 3 levels (hardcoded in crawler.py)
+- **Output**: `~/.claude/docs/<domain>.txt` (override with `--output`)
+- **Cache**: `.llmsbrieftxt_cache/` for intermediate results
+- **OpenAI Model**: `gpt-5-mini` (override with `--model`)
+- **Concurrency**: 10 concurrent LLM requests (prevents rate limiting)
+
+## Commit Convention
+
+Use conventional commits for automated versioning:
+- `fix:` → patch bump (1.0.0 → 1.0.1)
+- `feat:` → minor bump (1.0.0 → 1.1.0)
+- `BREAKING CHANGE` or `feat!:`/`fix!:` → major bump (1.0.0 → 2.0.0)
+
+Examples:
+```bash
+git commit -m "fix: handle empty sitemap gracefully"
+git commit -m "feat: add --depth option for custom crawl depth"
+git commit -m "feat!: change default output location"
+```
+
+## Non-Obvious Behaviors
+
+1. URL Discovery discovers ALL pages up to depth 3, not just direct links
+2. URLs like `/page`, `/page/`, and `/page#section` are deduplicated as the same URL
+3. Summaries are automatically cached in `.llmsbrieftxt_cache/summaries.json`
+4. Content extraction uses trafilatura to preserve HTML structure in markdown
+5. File I/O is synchronous (uses standard `Path.write_text()` for simplicity)
+
+## Known Limitations
+
+1. Only supports OpenAI API (no other LLM providers)
+2. Crawl depth is hardcoded to 3 in crawler.py
+3. No CLI flag to force resume from cache (though cache exists)
+4. No progress persistence if interrupted
+5. Prompts and parsing assume English documentation
+
+## Code Review Checklist
+
+When reviewing code changes:
+- Ensure async patterns are used correctly (no blocking I/O in async functions)
+- Verify all functions have type hints
+- Check that tests are included for new functionality
+- Confirm error handling doesn't break the pipeline
+- Validate that conventional commit format is used
+- Ensure code follows Unix philosophy (simplicity, composability)
+- Check that ruff and pyright pass without errors
+- **IMPORTANT**: Always include specific file names and line numbers when providing review feedback (e.g., "main.py:165" or "line 182 in cli.py")
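The "Async-First Design" section above describes bounding `asyncio.gather()` with a semaphore for rate limiting. A minimal, self-contained sketch of that pattern is shown below; `summarize_page` and the URLs are placeholders for illustration, not the package's actual Summarizer API.

```python
import asyncio


async def summarize_page(url: str) -> str:
    """Stand-in for a real LLM call (hypothetical helper)."""
    await asyncio.sleep(0.1)  # simulate network/LLM latency
    return f"summary of {url}"


async def summarize_all(urls: list[str], max_concurrent: int = 10) -> list[str]:
    # At most `max_concurrent` coroutines run the guarded body at once,
    # which keeps the request rate under provider limits.
    semaphore = asyncio.Semaphore(max_concurrent)

    async def bounded(url: str) -> str:
        async with semaphore:
            return await summarize_page(url)

    return await asyncio.gather(*(bounded(u) for u in urls))


if __name__ == "__main__":
    pages = [f"https://example.com/page{i}" for i in range(25)]
    print(asyncio.run(summarize_all(pages))[:2])
```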
{llmsbrieftxt-1.3.1 → llmsbrieftxt-1.4.0}/CLAUDE.md

@@ -95,8 +95,12 @@ llmtxt https://docs.python.org/3/
 
 # With options
 llmtxt https://example.com --model gpt-4o
-llmtxt https://example.com --show-urls
+llmtxt https://example.com --show-urls  # Preview URLs with cost estimate
 llmtxt https://example.com --max-urls 50
+llmtxt https://example.com --depth 2  # Control crawl depth (default: 3)
+llmtxt https://example.com --use-cache-only  # No API calls, cache only
+llmtxt https://example.com --force-refresh  # Ignore cache, regenerate all
+llmtxt https://example.com --cache-dir /tmp/cache  # Custom cache location
 llmtxt https://example.com --output custom-path.txt
 ```
 
@@ -114,9 +118,15 @@ ls -lh ~/.claude/docs/
 
 ### Default Behavior
 These are the production defaults:
-- **Crawl Depth**: 3 levels from starting URL (
+- **Crawl Depth**: 3 levels from starting URL (configurable with `--depth`)
 - **Output Location**: `~/.claude/docs/<domain>.txt` (can override with `--output`)
-- **Cache Directory**: `.llmsbrieftxt_cache/` for intermediate results
+- **Cache Directory**: `.llmsbrieftxt_cache/` for intermediate results (can override with `--cache-dir`)
+
+### New Features (as of latest update)
+- **Cost Estimation**: `--show-urls` now displays estimated API cost before processing
+- **Cache Control**: `--use-cache-only` and `--force-refresh` flags for cache management
+- **Failed URL Tracking**: Failed URLs are written to `failed_urls.txt` next to output file
+- **Depth Configuration**: Crawl depth is now configurable via `--depth` flag
 
 ### Default Model
 - **OpenAI Model**: `gpt-5-mini` (defined in constants.py, can override with `--model`)
@@ -169,12 +179,16 @@ Test markers:
 
 ## Non-Obvious Behaviors
 
-1. **URL Discovery**: Discovers ALL pages up to depth 3, not just pages linked from your starting URL
+1. **URL Discovery**: Discovers ALL pages up to configured depth (default 3), not just pages linked from your starting URL
 2. **Duplicate Handling**: `/page`, `/page/`, and `/page#section` are treated as the same URL
 3. **Concurrency Limit**: Default 10 concurrent LLM requests prevents rate limiting
 4. **Automatic Caching**: Summaries cached in `.llmsbrieftxt_cache/summaries.json` and reused automatically
 5. **Content Extraction**: Uses `trafilatura` for HTML→markdown, preserving structure
 6. **Sync File I/O**: Uses standard `Path.write_text()` instead of async file I/O (simpler, sufficient)
+7. **Cost Estimation**: `--show-urls` shows both discovered URLs count AND estimated API cost
+8. **Cache-First**: When using cache, shows "Cached: X | New: Y" breakdown before processing
+9. **Failed URL Reporting**: Failed URLs saved to `failed_urls.txt` in same directory as output
+10. **Environment Variables**: `--output` and `--cache-dir` support `$HOME` and other env var expansion
 
 ## Using llms-brief.txt Files
 
@@ -240,9 +254,12 @@ grep -rn "hooks" ~/.claude/docs/
 
 ### Debugging Issues
 1. Check logs - logger is configured in most modules
-2. Use `--show-urls` to preview URL discovery
-3. Check cache: `.llmsbrieftxt_cache/summaries.json`
-4.
+2. Use `--show-urls` to preview URL discovery and cost estimate
+3. Check cache: `.llmsbrieftxt_cache/summaries.json` (or custom `--cache-dir`)
+4. Check failed URLs: `failed_urls.txt` in output directory
+5. Test with limited scope: `--max-urls 10 --depth 1` for quick testing
+6. Use `--use-cache-only` to test output generation without API calls
+7. Run with verbose pytest: `uv run pytest -vv -s`
 
 ### Modifying URL Discovery Logic
 - Edit `crawler.py` for crawling behavior
@@ -308,10 +325,9 @@ uv build
 ## Known Limitations
 
 1. **OpenAI Only**: Currently only supports OpenAI API (no other LLM providers)
-2. **
-3. **
-4. **No
-5. **English-Centric**: Prompts and parsing assume English documentation
+2. **No Progress Persistence**: If interrupted, must restart (though cache helps and is used automatically on restart)
+3. **English-Centric**: Prompts and parsing assume English documentation
+4. **No Incremental Timestamp Checking**: Force refresh or cache-only mode, but no "only update changed pages" mode
 
 ## Migration from v0.x
 
{llmsbrieftxt-1.3.1 → llmsbrieftxt-1.4.0}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: llmsbrieftxt
-Version: 1.3.1
+Version: 1.4.0
 Summary: Generate llms-brief.txt files from documentation websites using AI
 Project-URL: Homepage, https://github.com/stevennevins/llmsbrief
 Project-URL: Repository, https://github.com/stevennevins/llmsbrief
@@ -98,8 +98,12 @@ Output is automatically saved to `~/.claude/docs/<domain>.txt` (e.g., `docs.pyth
 - `--output PATH` - Custom output path (default: `~/.claude/docs/<domain>.txt`)
 - `--model MODEL` - OpenAI model to use (default: `gpt-5-mini`)
 - `--max-concurrent-summaries N` - Concurrent LLM requests (default: 10)
-- `--show-urls` - Preview discovered URLs
+- `--show-urls` - Preview discovered URLs with cost estimate (no API calls)
 - `--max-urls N` - Limit number of URLs to process
+- `--depth N` - Maximum crawl depth (default: 3)
+- `--cache-dir PATH` - Cache directory path (default: `.llmsbrieftxt_cache`)
+- `--use-cache-only` - Use only cached summaries, skip API calls for new pages
+- `--force-refresh` - Ignore cache and regenerate all summaries
 
 ### Examples
 
@@ -110,12 +114,24 @@ llmtxt https://docs.python.org/3/
 # Use a different model
 llmtxt https://react.dev --model gpt-4o
 
-# Preview URLs before processing (no API calls)
+# Preview URLs with cost estimate before processing (no API calls)
 llmtxt https://react.dev --show-urls
 
 # Limit scope for testing
 llmtxt https://docs.python.org --max-urls 50
 
+# Custom crawl depth (explore deeper or shallower)
+llmtxt https://example.com --depth 2
+
+# Use only cached summaries (no API calls)
+llmtxt https://docs.python.org/3/ --use-cache-only
+
+# Force refresh all summaries (ignore cache)
+llmtxt https://docs.python.org/3/ --force-refresh
+
+# Custom cache directory
+llmtxt https://example.com --cache-dir /tmp/my-cache
+
 # Custom output location
 llmtxt https://react.dev --output ./my-docs/react.txt
 
@@ -245,11 +261,11 @@ uv run mypy llmsbrieftxt/
 
 ### Default Settings
 
-- **Crawl Depth**: 3 levels (
-- **Output Location**: `~/.claude/docs/<domain>.txt`
-- **Cache Directory**: `.llmsbrieftxt_cache/`
-- **OpenAI Model**: `gpt-5-mini`
-- **Concurrent Requests**: 10
+- **Crawl Depth**: 3 levels (configurable via `--depth`)
+- **Output Location**: `~/.claude/docs/<domain>.txt` (configurable via `--output`)
+- **Cache Directory**: `.llmsbrieftxt_cache/` (configurable via `--cache-dir`)
+- **OpenAI Model**: `gpt-5-mini` (configurable via `--model`)
+- **Concurrent Requests**: 10 (configurable via `--max-concurrent-summaries`)
 
 ### Environment Variables
 
@@ -259,10 +275,26 @@ uv run mypy llmsbrieftxt/
 
 ### Managing API Costs
 
-- Use `--show-urls`
-- Use `--max-urls` to limit processing during testing
-- Summaries are cached automatically - rerunning is cheap
--
+- **Preview with cost estimate**: Use `--show-urls` to see discovered URLs and estimated API cost before processing
+- **Limit scope**: Use `--max-urls` to limit processing during testing
+- **Automatic caching**: Summaries are cached automatically - rerunning is cheap
+- **Cache-only mode**: Use `--use-cache-only` to generate output from cache without API calls
+- **Force refresh**: Use `--force-refresh` when you need to regenerate all summaries
+- **Cost-effective model**: Default model `gpt-5-mini` is cost-effective for most documentation
+
+### Controlling Crawl Depth
+
+- **Default depth (3)**: Good for most documentation sites (100-300 pages)
+- **Shallow crawl (1-2)**: Use for large sites or to focus on main pages only
+- **Deep crawl (4-5)**: Use for small sites or comprehensive coverage
+- Example: `llmtxt https://example.com --depth 2 --show-urls` to preview scope
+
+### Cache Management
+
+- **Default location**: `.llmsbrieftxt_cache/` in current directory
+- **Custom location**: Use `--cache-dir` for shared caches or different organization
+- **Cache benefits**: Speeds up reruns, reduces API costs, enables incremental updates
+- **Failed URLs tracking**: Failed URLs are written to `failed_urls.txt` next to output file
 
 ### Organizing Documentation
 
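The cache described in this section is a single JSON file mapping each crawled page URL to its cached summary (see the rewritten `llmsbrieftxt/main.py` later in this diff). A small inspection sketch, assuming the default cache location and an existing cache:

```python
import json
from pathlib import Path

# Peek at the summary cache that llmtxt maintains between runs.
cache_file = Path(".llmsbrieftxt_cache") / "summaries.json"

if cache_file.exists():
    summaries: dict[str, str] = json.loads(cache_file.read_text(encoding="utf-8"))
    print(f"{len(summaries)} cached summaries")
    for url in sorted(summaries)[:5]:  # show a handful of cached page URLs
        print(" -", url)
else:
    print("No cache yet - run llmtxt against a site first")
```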
{llmsbrieftxt-1.3.1 → llmsbrieftxt-1.4.0}/README.md

@@ -65,8 +65,12 @@ Output is automatically saved to `~/.claude/docs/<domain>.txt` (e.g., `docs.pyth
 - `--output PATH` - Custom output path (default: `~/.claude/docs/<domain>.txt`)
 - `--model MODEL` - OpenAI model to use (default: `gpt-5-mini`)
 - `--max-concurrent-summaries N` - Concurrent LLM requests (default: 10)
-- `--show-urls` - Preview discovered URLs
+- `--show-urls` - Preview discovered URLs with cost estimate (no API calls)
 - `--max-urls N` - Limit number of URLs to process
+- `--depth N` - Maximum crawl depth (default: 3)
+- `--cache-dir PATH` - Cache directory path (default: `.llmsbrieftxt_cache`)
+- `--use-cache-only` - Use only cached summaries, skip API calls for new pages
+- `--force-refresh` - Ignore cache and regenerate all summaries
 
 ### Examples
 
@@ -77,12 +81,24 @@ llmtxt https://docs.python.org/3/
 # Use a different model
 llmtxt https://react.dev --model gpt-4o
 
-# Preview URLs before processing (no API calls)
+# Preview URLs with cost estimate before processing (no API calls)
 llmtxt https://react.dev --show-urls
 
 # Limit scope for testing
 llmtxt https://docs.python.org --max-urls 50
 
+# Custom crawl depth (explore deeper or shallower)
+llmtxt https://example.com --depth 2
+
+# Use only cached summaries (no API calls)
+llmtxt https://docs.python.org/3/ --use-cache-only
+
+# Force refresh all summaries (ignore cache)
+llmtxt https://docs.python.org/3/ --force-refresh
+
+# Custom cache directory
+llmtxt https://example.com --cache-dir /tmp/my-cache
+
 # Custom output location
 llmtxt https://react.dev --output ./my-docs/react.txt
 
@@ -212,11 +228,11 @@ uv run mypy llmsbrieftxt/
 
 ### Default Settings
 
-- **Crawl Depth**: 3 levels (
-- **Output Location**: `~/.claude/docs/<domain>.txt`
-- **Cache Directory**: `.llmsbrieftxt_cache/`
-- **OpenAI Model**: `gpt-5-mini`
-- **Concurrent Requests**: 10
+- **Crawl Depth**: 3 levels (configurable via `--depth`)
+- **Output Location**: `~/.claude/docs/<domain>.txt` (configurable via `--output`)
+- **Cache Directory**: `.llmsbrieftxt_cache/` (configurable via `--cache-dir`)
+- **OpenAI Model**: `gpt-5-mini` (configurable via `--model`)
+- **Concurrent Requests**: 10 (configurable via `--max-concurrent-summaries`)
 
 ### Environment Variables
 
@@ -226,10 +242,26 @@ uv run mypy llmsbrieftxt/
 
 ### Managing API Costs
 
-- Use `--show-urls`
-- Use `--max-urls` to limit processing during testing
-- Summaries are cached automatically - rerunning is cheap
--
+- **Preview with cost estimate**: Use `--show-urls` to see discovered URLs and estimated API cost before processing
+- **Limit scope**: Use `--max-urls` to limit processing during testing
+- **Automatic caching**: Summaries are cached automatically - rerunning is cheap
+- **Cache-only mode**: Use `--use-cache-only` to generate output from cache without API calls
+- **Force refresh**: Use `--force-refresh` when you need to regenerate all summaries
+- **Cost-effective model**: Default model `gpt-5-mini` is cost-effective for most documentation
+
+### Controlling Crawl Depth
+
+- **Default depth (3)**: Good for most documentation sites (100-300 pages)
+- **Shallow crawl (1-2)**: Use for large sites or to focus on main pages only
+- **Deep crawl (4-5)**: Use for small sites or comprehensive coverage
+- Example: `llmtxt https://example.com --depth 2 --show-urls` to preview scope
+
+### Cache Management
+
+- **Default location**: `.llmsbrieftxt_cache/` in current directory
+- **Custom location**: Use `--cache-dir` for shared caches or different organization
+- **Cache benefits**: Speeds up reruns, reduces API costs, enables incremental updates
+- **Failed URLs tracking**: Failed URLs are written to `failed_urls.txt` next to output file
 
 ### Organizing Documentation
 
{llmsbrieftxt-1.3.1 → llmsbrieftxt-1.4.0}/llmsbrieftxt/cli.py

@@ -8,9 +8,14 @@ from pathlib import Path
 from urllib.parse import urlparse
 
 from llmsbrieftxt.constants import (
+    DEFAULT_CACHE_DIR,
     DEFAULT_CONCURRENT_SUMMARIES,
+    DEFAULT_CRAWL_DEPTH,
     DEFAULT_OPENAI_MODEL,
     DOCS_DIR,
+    ESTIMATED_TOKENS_PER_PAGE_INPUT,
+    ESTIMATED_TOKENS_PER_PAGE_OUTPUT,
+    OPENAI_PRICING,
 )
 from llmsbrieftxt.main import generate_llms_txt
 
@@ -48,13 +53,39 @@ def parse_args(test_args: list[str] | None = None) -> argparse.Namespace:
     parser.add_argument(
         "--show-urls",
         action="store_true",
-        help="Preview discovered URLs
+        help="Preview discovered URLs with cost estimate (no processing or API calls)",
     )
 
     parser.add_argument(
         "--max-urls", type=int, help="Maximum number of URLs to discover and process"
     )
 
+    parser.add_argument(
+        "--depth",
+        type=int,
+        default=DEFAULT_CRAWL_DEPTH,
+        help=f"Maximum crawl depth (default: {DEFAULT_CRAWL_DEPTH})",
+    )
+
+    parser.add_argument(
+        "--cache-dir",
+        type=str,
+        default=DEFAULT_CACHE_DIR,
+        help=f"Cache directory path (default: {DEFAULT_CACHE_DIR})",
+    )
+
+    parser.add_argument(
+        "--use-cache-only",
+        action="store_true",
+        help="Use only cached summaries, skip API calls for new pages",
+    )
+
+    parser.add_argument(
+        "--force-refresh",
+        action="store_true",
+        help="Ignore cache and regenerate all summaries",
+    )
+
     return parser.parse_args(test_args)
 
 
@@ -72,6 +103,39 @@ def check_openai_api_key() -> bool:
     return bool(os.environ.get("OPENAI_API_KEY"))
 
 
+def estimate_cost(num_pages: int, model: str) -> str:
+    """
+    Estimate the API cost for processing a given number of pages.
+
+    Args:
+        num_pages: Number of pages to process
+        model: OpenAI model name
+
+    Returns:
+        Formatted cost estimate string
+    """
+    if model not in OPENAI_PRICING:
+        return "Cost estimation not available for this model"
+
+    input_price, output_price = OPENAI_PRICING[model]
+
+    # Calculate total tokens
+    total_input_tokens = num_pages * ESTIMATED_TOKENS_PER_PAGE_INPUT
+    total_output_tokens = num_pages * ESTIMATED_TOKENS_PER_PAGE_OUTPUT
+
+    # Calculate cost (prices are per 1M tokens)
+    input_cost = (total_input_tokens / 1_000_000) * input_price
+    output_cost = (total_output_tokens / 1_000_000) * output_price
+    total_cost = input_cost + output_cost
+
+    if total_cost < 0.01:
+        return f"~${total_cost:.4f}"
+    elif total_cost < 1.00:
+        return f"~${total_cost:.3f}"
+    else:
+        return f"~${total_cost:.2f}"
+
+
 def get_output_path(url: str, custom_output: str | None = None) -> Path:
     """
     Get the output file path for a given URL.
@@ -84,7 +148,9 @@ def get_output_path(url: str, custom_output: str | None = None) -> Path:
         Path object for the output file
     """
     if custom_output:
-
+        # Expand environment variables and user home directory
+        expanded = os.path.expandvars(custom_output)
+        return Path(expanded).expanduser()
 
     # Extract domain from URL
     parsed = urlparse(url)
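The expansion order introduced here (`os.path.expandvars` first, then `Path.expanduser`) can be checked in isolation with a short standalone sketch; the printed results naturally depend on the local environment:

```python
import os
from pathlib import Path


def expand(path: str) -> Path:
    # Same two steps as the change above: $VARS first, then ~.
    return Path(os.path.expandvars(path)).expanduser()


print(expand("~/docs/output.txt"))      # home-relative path becomes absolute
print(expand("$HOME/docs/output.txt"))  # env var is substituted before expanduser
print(expand("$UNSET_VAR/output.txt"))  # unknown variables are left untouched by expandvars
```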
@@ -116,8 +182,25 @@ def main() -> None:
         print("Example: https://docs.python.org/3/", file=sys.stderr)
         sys.exit(1)
 
-    #
-    if
+    # Validate depth parameter
+    if args.depth < 1:
+        print("Error: --depth must be at least 1", file=sys.stderr)
+        sys.exit(1)
+
+    # Check for conflicting cache flags
+    if args.use_cache_only and args.force_refresh:
+        print(
+            "Error: Cannot use --use-cache-only and --force-refresh together",
+            file=sys.stderr,
+        )
+        sys.exit(1)
+
+    # Check for API key (unless just showing URLs or using cache only)
+    if (
+        not args.show_urls
+        and not args.use_cache_only
+        and not check_openai_api_key()
+    ):
         print("Error: OPENAI_API_KEY not found", file=sys.stderr)
         print("Please set your OpenAI API key:", file=sys.stderr)
         print(" export OPENAI_API_KEY='sk-your-api-key-here'", file=sys.stderr)
@@ -131,24 +214,24 @@ def main() -> None:
         # Determine output path
         output_path = get_output_path(args.url, args.output)
 
+        # Expand cache directory path
+        cache_dir = Path(os.path.expandvars(args.cache_dir)).expanduser()
+
         # Print configuration
         print(f"Processing URL: {args.url}")
-
+        if not args.show_urls:
+            print(f"Using model: {args.model}")
+            print(f"Crawl depth: {args.depth}")
         print(f"Output: {output_path}")
         if args.max_urls:
             print(f"Max URLs: {args.max_urls}")
-
-
-
-        print("")
-        print(
-            "Note: This will discover and process all documentation pages (depth=3)"
-        )
-        print("Tip: Use --show-urls first to preview scope, or --max-urls to limit")
-        print("")
+        if args.use_cache_only:
+            print("Mode: Cache-only (no API calls)")
+        elif args.force_refresh:
+            print("Mode: Force refresh (ignoring cache)")
 
         # Run generation
-        asyncio.run(
+        result = asyncio.run(
            generate_llms_txt(
                 url=args.url,
                 llm_name=args.model,
@@ -156,9 +239,23 @@ def main() -> None:
                 output_path=str(output_path),
                 show_urls=args.show_urls,
                 max_urls=args.max_urls,
+                max_depth=args.depth,
+                cache_dir=str(cache_dir),
+                use_cache_only=args.use_cache_only,
+                force_refresh=args.force_refresh,
             )
         )
 
+        # Show cost estimate and failed URLs if available
+        if args.show_urls and result:
+            num_urls_value = result.get("num_urls", 0)
+            # Type guard to ensure we have an int
+            if isinstance(num_urls_value, int):
+                print(
+                    f"\nEstimated cost for {num_urls_value} pages: {estimate_cost(num_urls_value, args.model)}"
+                )
+                print("Note: Actual cost may vary based on page content size and caching")
+
     except KeyboardInterrupt:
         print("\nOperation cancelled by user.", file=sys.stderr)
         sys.exit(1)
{llmsbrieftxt-1.3.1 → llmsbrieftxt-1.4.0}/llmsbrieftxt/constants.py

@@ -9,6 +9,32 @@ DEFAULT_OPENAI_MODEL = "gpt-5-mini"
 # Docs Directory
 DOCS_DIR = "~/.claude/docs"  # Will be expanded to full path at runtime
 
+# Default Cache Directory
+DEFAULT_CACHE_DIR = ".llmsbrieftxt_cache"
+
+# Default Crawl Depth
+DEFAULT_CRAWL_DEPTH = 3
+
+# OpenAI Pricing (per 1M tokens) - prices subject to change
+# Format: {model: (input_price, output_price)}
+# Note: Verify current pricing at https://openai.com/api/pricing/
+OPENAI_PRICING = {
+    "gpt-5-mini": (0.15, 0.60),  # $0.15 input, $0.60 output per 1M tokens
+    "gpt-4o-mini": (0.15, 0.60),
+    "gpt-4o": (2.50, 10.00),
+    "gpt-4-turbo": (10.00, 30.00),
+    "gpt-4": (30.00, 60.00),
+}
+
+# Estimated tokens per page for cost calculation
+# These estimates are based on typical documentation page sizes:
+# - Input: ~2000-4000 words per doc page → ~3000 tokens (conservative estimate)
+# - Output: ~300 tokens for structured PageSummary with all fields
+# Accuracy: Estimates typically within ±30% of actual cost
+# Pages with code examples or very long content may exceed these estimates
+ESTIMATED_TOKENS_PER_PAGE_INPUT = 3000
+ESTIMATED_TOKENS_PER_PAGE_OUTPUT = 400
+
 
 # Prompt Templates
 DEFAULT_SUMMARY_PROMPT = """You are a specialized content analyzer creating structured summaries for llms-brief.txt files. Your role is to help LLMs understand web content by providing comprehensive yet concise summaries.
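As a sanity check on the numbers above: for a 100-page crawl with the default `gpt-5-mini` model, the estimate produced by `estimate_cost` in `cli.py` (earlier in this diff) works out as follows.

```python
# Worked example using the constants above (prices are per 1M tokens).
pages = 100
input_tokens = pages * 3000   # ESTIMATED_TOKENS_PER_PAGE_INPUT
output_tokens = pages * 400   # ESTIMATED_TOKENS_PER_PAGE_OUTPUT

input_cost = input_tokens / 1_000_000 * 0.15    # gpt-5-mini input price
output_cost = output_tokens / 1_000_000 * 0.60  # gpt-5-mini output price
total = input_cost + output_cost                # 0.045 + 0.024 = 0.069

print(f"~${total:.3f}")  # "~$0.069" (three decimals because 0.01 <= total < 1.00)
```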
llmsbrieftxt-1.4.0/llmsbrieftxt/main.py (new file)

@@ -0,0 +1,227 @@
+"""Main generation pipeline for llmsbrieftxt."""
+
+import json
+import re
+from pathlib import Path
+
+from llmsbrieftxt.doc_loader import DocLoader
+from llmsbrieftxt.extractor import default_extractor
+from llmsbrieftxt.summarizer import Summarizer
+
+
+def extract_url_from_summary(summary: str) -> str | None:
+    """
+    Extract URL from a summary in the format: Title: [title](URL).
+
+    Args:
+        summary: Formatted summary string
+
+    Returns:
+        Extracted URL or None if not found
+    """
+    # Match markdown link format: [text](url)
+    match = re.search(r"\[([^\]]+)\]\(([^)]+)\)", summary)
+    if match:
+        return match.group(2)
+    return None
+
+
+def ensure_directory_exists(file_path: str) -> None:
+    """Ensure the parent directory of the given file path exists.
+
+    Args:
+        file_path: Path to the file whose parent directory should be created
+
+    Raises:
+        RuntimeError: If directory creation fails due to permissions or other issues
+    """
+    dir_path = Path(file_path).parent
+    if dir_path == Path("."):
+        return  # Current directory, no need to create
+
+    try:
+        dir_path.mkdir(parents=True, exist_ok=True)
+        if not dir_path.exists():
+            print(f"Created directory: {dir_path}")
+    except OSError as e:
+        raise RuntimeError(f"Failed to create directory {dir_path}: {e}") from e
+
+
+async def generate_llms_txt(
+    url: str,
+    llm_name: str = "o4-mini",
+    max_concurrent_summaries: int = 10,
+    output_path: str = "llms.txt",
+    show_urls: bool = False,
+    max_urls: int | None = None,
+    max_depth: int = 3,
+    cache_dir: str = ".llmsbrieftxt_cache",
+    use_cache_only: bool = False,
+    force_refresh: bool = False,
+) -> dict[str, int | list[str]] | None:
+    """
+    Generate llms-brief.txt file from a documentation website.
+
+    Args:
+        url: URL of the documentation site to crawl
+        llm_name: OpenAI model to use for summarization
+        max_concurrent_summaries: Maximum concurrent LLM requests
+        output_path: Path to write the output file
+        show_urls: If True, only show discovered URLs without processing
+        max_urls: Maximum number of URLs to discover/process
+        max_depth: Maximum crawl depth for URL discovery
+        cache_dir: Directory to store cached summaries
+        use_cache_only: If True, only use cached summaries (no API calls)
+        force_refresh: If True, ignore cache and regenerate all summaries
+
+    Returns:
+        Dictionary with metadata (for show_urls mode) or None
+    """
+    urls_processed = 0
+    summaries_generated = 0
+    failed_urls: set[str] = set()  # Use set to avoid duplicates
+
+    # Set up cache directory
+    cache_path = Path(cache_dir)
+    cache_path.mkdir(parents=True, exist_ok=True)
+    cache_file = cache_path / "summaries.json"
+
+    # Load existing summaries from cache if available (unless force refresh)
+    existing_summaries: dict[str, str] = {}
+    if cache_file.exists() and not force_refresh:
+        try:
+            with open(cache_file) as f:
+                existing_summaries = json.load(f)
+            print(f"Found {len(existing_summaries)} cached summaries")
+        except Exception as e:
+            print(f"Warning: Could not load cache: {str(e)}")
+    elif force_refresh and cache_file.exists():
+        print("Force refresh enabled - ignoring existing cache")
+
+    extractor = default_extractor
+    output_file = output_path
+
+    # If show_urls is True, just show discovered URLs and exit
+    if show_urls:
+        print("Discovering documentation URLs...")
+        doc_loader = DocLoader(max_urls=max_urls, max_depth=max_depth)
+        _, discovered_urls = await doc_loader.load_docs(
+            url, extractor=extractor, show_urls=True
+        )
+        print("\nDiscovered URLs:")
+        for discovered_url in discovered_urls:
+            print(f"  - {discovered_url}")
+        print(f"\nTotal: {len(discovered_urls)} unique URLs")
+
+        # Calculate how many would be cached vs new
+        num_cached = sum(1 for u in discovered_urls if u in existing_summaries)
+        num_new = len(discovered_urls) - num_cached
+        if existing_summaries:
+            print(f"Cached: {num_cached} | New: {num_new}")
+
+        return {"num_urls": len(discovered_urls), "failed_urls": []}
+
+    # Load and process documents
+    doc_loader = DocLoader(max_urls=max_urls, max_depth=max_depth)
+    docs, discovered_urls = await doc_loader.load_docs(url, extractor=extractor)
+    urls_processed = len(docs)
+
+    # Track which URLs failed to load
+    loaded_urls = {doc.metadata.get("source") for doc in docs}
+    failed_urls.update(u for u in discovered_urls if u not in loaded_urls)
+
+    # Handle cache-only mode
+    if use_cache_only:
+        print("\nCache-only mode: Using only cached summaries")
+        summaries: list[str] = []
+        for doc in docs:
+            doc_url = doc.metadata.get("source", "")
+            if doc_url in existing_summaries:
+                summaries.append(existing_summaries[doc_url])
+            else:
+                print(f"  Warning: No cache for {doc_url}")
+                failed_urls.add(doc_url)
+        summaries_generated = len(summaries)
+    else:
+        # Initialize summarizer
+        print(f"\nGenerating summaries with {llm_name}...")
+        summarizer = Summarizer(
+            llm_name=llm_name,
+            max_concurrent=max_concurrent_summaries,
+        )
+
+        summaries: list[str] = []
+        try:
+            summaries = await summarizer.summarize_all(
+                docs, existing_summaries=existing_summaries, cache_file=cache_file
+            )
+            summaries_generated = len(summaries)
+
+            # Track URLs that failed summarization by extracting URLs from summaries
+            summarized_urls: set[str] = set()
+            for summary in summaries:
+                if summary:
+                    extracted_url: str | None = extract_url_from_summary(summary)
+                    if extracted_url:
+                        summarized_urls.add(extracted_url)
+
+            # Add docs that weren't successfully summarized to failed_urls
+            for doc in docs:
+                doc_url = doc.metadata.get("source", "")
+                if doc_url and doc_url not in summarized_urls:
+                    failed_urls.add(doc_url)
+        except KeyboardInterrupt:
+            print("Process interrupted by user. Saving partial results...")
+            if cache_file.exists():
+                try:
+                    with open(cache_file) as f:
+                        partial_summaries = json.load(f)
+                    summaries = list(partial_summaries.values())
+                    summaries_generated = len(summaries)
+                    print(f"Recovered {len(summaries)} summaries from cache")
+                except Exception:
+                    # Silently ignore cache read errors during interrupt recovery
+                    # If we can't recover from cache, we'll continue with empty results
+                    pass
+        except Exception as e:
+            print(f"Summarization process error: {str(e)}")
+            if cache_file.exists():
+                try:
+                    with open(cache_file) as f:
+                        partial_summaries = json.load(f)
+                    summaries = list(partial_summaries.values())
+                    summaries_generated = len(summaries)
+                    print(
+                        f"Recovered {len(summaries)} partial summaries from cache"
+                    )
+                except Exception:
+                    # If cache recovery fails during error handling, continue with empty results
+                    summaries = []
+        finally:
+            # Write results to file
+            if summaries:
+                ensure_directory_exists(output_file)
+                output_content = "".join(summaries)
+                Path(output_file).write_text(output_content, encoding="utf-8")
+            else:
+                ensure_directory_exists(output_file)
+                Path(output_file).write_text("", encoding="utf-8")
+
+            # Print summary
+            print(f"\n{'=' * 50}")
+            print(f"Processed: {summaries_generated}/{urls_processed} pages")
+            if urls_processed > 0:
+                success_rate = summaries_generated / urls_processed * 100
+                print(f"Success rate: {success_rate:.1f}%")
+            print(f"Output: {output_file}")
+
+            # Report failed URLs
+            if failed_urls:
+                print(f"Failed URLs: {len(failed_urls)}")
+                failed_file = Path(output_file).parent / "failed_urls.txt"
+                # Sort URLs for consistent output
+                failed_file.write_text("\n".join(sorted(failed_urls)), encoding="utf-8")
+                print(f"Failed URLs written to: {failed_file}")
+            print(f"{'=' * 50}")
+
+    return None
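The `extract_url_from_summary` helper above pulls the first markdown-style link out of a summary string. A quick standalone check of the same regex (the sample summary text is made up):

```python
import re

# Same pattern as extract_url_from_summary in the new main.py.
pattern = r"\[([^\]]+)\]\(([^)]+)\)"

summary = "Title: [Quickstart](https://docs.example.com/quickstart) - install and first run"
match = re.search(pattern, summary)
if match:
    print(match.group(1))  # Quickstart
    print(match.group(2))  # https://docs.example.com/quickstart
```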
{llmsbrieftxt-1.3.1 → llmsbrieftxt-1.4.0}/tests/unit/test_cli.py

@@ -1,8 +1,10 @@
 """Tests for CLI argument parsing."""
 
+import os
+
 import pytest
 
-from llmsbrieftxt.cli import parse_args, validate_url
+from llmsbrieftxt.cli import estimate_cost, get_output_path, parse_args, validate_url
 
 
 class TestCLIArgumentParsing:
@@ -110,6 +112,61 @@ class TestCLIArgumentParsing:
         args = parse_args(["https://example.com", "--max-urls", "50"])
         assert args.max_urls == 50
 
+    @pytest.mark.unit
+    def test_depth_flag_default(self):
+        """Test default depth value."""
+        args = parse_args(["https://example.com"])
+        assert args.depth == 3
+
+    @pytest.mark.unit
+    def test_depth_flag_custom(self):
+        """Test custom depth value."""
+        args = parse_args(["https://example.com", "--depth", "5"])
+        assert args.depth == 5
+
+    @pytest.mark.unit
+    def test_cache_dir_flag_default(self):
+        """Test default cache directory."""
+        args = parse_args(["https://example.com"])
+        assert args.cache_dir == ".llmsbrieftxt_cache"
+
+    @pytest.mark.unit
+    def test_cache_dir_flag_custom(self):
+        """Test custom cache directory."""
+        args = parse_args(["https://example.com", "--cache-dir", "/tmp/mycache"])
+        assert args.cache_dir == "/tmp/mycache"
+
+    @pytest.mark.unit
+    def test_use_cache_only_flag(self):
+        """Test --use-cache-only flag."""
+        args = parse_args(["https://example.com", "--use-cache-only"])
+        assert args.use_cache_only is True
+
+    @pytest.mark.unit
+    def test_force_refresh_flag(self):
+        """Test --force-refresh flag."""
+        args = parse_args(["https://example.com", "--force-refresh"])
+        assert args.force_refresh is True
+
+    @pytest.mark.unit
+    def test_all_new_arguments_together(self):
+        """Test all new arguments combined."""
+        args = parse_args(
+            [
+                "https://example.com",
+                "--depth",
+                "2",
+                "--cache-dir",
+                "custom_cache",
+                "--max-urls",
+                "100",
+            ]
+        )
+        assert args.url == "https://example.com"
+        assert args.depth == 2
+        assert args.cache_dir == "custom_cache"
+        assert args.max_urls == 100
+
     @pytest.mark.unit
     def test_no_url_exits(self):
         """Test that providing no URL exits with error."""
@@ -154,3 +211,73 @@ class TestURLValidation:
     def test_invalid_url_malformed(self):
         """Test invalid malformed URL."""
         assert validate_url("not-a-url") is False
+
+
+class TestCostEstimation:
+    """Tests for cost estimation."""
+
+    @pytest.mark.unit
+    def test_cost_estimate_small_job(self):
+        """Test cost estimation for small number of pages."""
+        cost = estimate_cost(10, "gpt-5-mini")
+        assert cost.startswith("~$")
+        assert "$0." in cost
+
+    @pytest.mark.unit
+    def test_cost_estimate_medium_job(self):
+        """Test cost estimation for medium number of pages."""
+        cost = estimate_cost(100, "gpt-4o-mini")
+        assert cost.startswith("~$")
+        assert "$" in cost
+
+    @pytest.mark.unit
+    def test_cost_estimate_large_job(self):
+        """Test cost estimation for large number of pages."""
+        cost = estimate_cost(500, "gpt-4o")
+        assert cost.startswith("~$")
+        assert float(cost.replace("~$", "")) > 1.0
+
+    @pytest.mark.unit
+    def test_cost_estimate_unknown_model(self):
+        """Test cost estimation for unknown model."""
+        cost = estimate_cost(100, "unknown-model")
+        assert "not available" in cost
+
+    @pytest.mark.unit
+    def test_cost_estimate_zero_pages(self):
+        """Test cost estimation for zero pages."""
+        cost = estimate_cost(0, "gpt-5-mini")
+        assert cost == "~$0.0000"
+
+
+class TestOutputPathExpansion:
+    """Tests for output path with environment variable expansion."""
+
+    @pytest.mark.unit
+    def test_output_path_with_tilde_expansion(self):
+        """Test output path with ~ expands to home directory."""
+        path = get_output_path("https://example.com", "~/docs/output.txt")
+        assert "~" not in str(path)
+        assert str(path).startswith(os.path.expanduser("~"))
+
+    @pytest.mark.unit
+    def test_output_path_with_env_var(self, monkeypatch):
+        """Test output path with $VAR environment variable."""
+        monkeypatch.setenv("MYDIR", "/tmp/testdir")
+        path = get_output_path("https://example.com", "$MYDIR/output.txt")
+        assert str(path) == "/tmp/testdir/output.txt"
+
+    @pytest.mark.unit
+    def test_output_path_with_env_var_braces(self, monkeypatch):
+        """Test output path with ${VAR} environment variable."""
+        monkeypatch.setenv("TESTDIR", "/tmp/test")
+        path = get_output_path("https://example.com", "${TESTDIR}/docs/output.txt")
+        assert str(path) == "/tmp/test/docs/output.txt"
+
+    @pytest.mark.unit
+    def test_output_path_default_no_expansion(self):
+        """Test default output path (no custom path) works correctly."""
+        path = get_output_path("https://docs.example.com")
+        # Should contain .claude/docs in path
+        assert ".claude/docs" in str(path)
+        assert str(path).endswith("docs.example.com.txt")
llmsbrieftxt-1.3.1/llmsbrieftxt/main.py (removed)

@@ -1,142 +0,0 @@
-"""Main generation pipeline for llmsbrieftxt."""
-
-import json
-from pathlib import Path
-
-from llmsbrieftxt.doc_loader import DocLoader
-from llmsbrieftxt.extractor import default_extractor
-from llmsbrieftxt.summarizer import Summarizer
-
-
-def ensure_directory_exists(file_path: str) -> None:
-    """Ensure the parent directory of the given file path exists.
-
-    Args:
-        file_path: Path to the file whose parent directory should be created
-
-    Raises:
-        RuntimeError: If directory creation fails due to permissions or other issues
-    """
-    dir_path = Path(file_path).parent
-    if dir_path == Path("."):
-        return  # Current directory, no need to create
-
-    try:
-        dir_path.mkdir(parents=True, exist_ok=True)
-        if not dir_path.exists():
-            print(f"Created directory: {dir_path}")
-    except OSError as e:
-        raise RuntimeError(f"Failed to create directory {dir_path}: {e}") from e
-
-
-async def generate_llms_txt(
-    url: str,
-    llm_name: str = "o4-mini",
-    max_concurrent_summaries: int = 10,
-    output_path: str = "llms.txt",
-    show_urls: bool = False,
-    max_urls: int | None = None,
-) -> None:
-    """
-    Generate llms-brief.txt file from a documentation website.
-
-    Args:
-        url: URL of the documentation site to crawl
-        llm_name: OpenAI model to use for summarization
-        max_concurrent_summaries: Maximum concurrent LLM requests
-        output_path: Path to write the output file
-        show_urls: If True, only show discovered URLs without processing
-        max_urls: Maximum number of URLs to discover/process
-    """
-    urls_processed = 0
-    summaries_generated = 0
-
-    # Set up cache directory
-    cache_dir = Path(".llmsbrieftxt_cache")
-    cache_dir.mkdir(exist_ok=True)
-    cache_file = cache_dir / "summaries.json"
-
-    # Load existing summaries from cache if available
-    existing_summaries: dict[str, str] = {}
-    if cache_file.exists():
-        try:
-            with open(cache_file) as f:
-                existing_summaries = json.load(f)
-            print(f"Using {len(existing_summaries)} cached summaries")
-        except Exception as e:
-            print(f"Warning: Could not load cache: {str(e)}")
-
-    extractor = default_extractor
-    output_file = output_path
-
-    # If show_urls is True, just show discovered URLs and exit
-    if show_urls:
-        print("Discovering documentation URLs...")
-        doc_loader = DocLoader(max_urls=max_urls)
-        _, discovered_urls = await doc_loader.load_docs(
-            url, extractor=extractor, show_urls=True
-        )
-        print("\nDiscovered URLs:")
-        for discovered_url in discovered_urls:
-            print(f"  - {discovered_url}")
-        print(f"\nTotal: {len(discovered_urls)} unique URLs")
-        return
-
-    # Load and process documents
-    doc_loader = DocLoader(max_urls=max_urls)
-    docs, discovered_urls = await doc_loader.load_docs(url, extractor=extractor)
-    urls_processed = len(docs)
-
-    # Initialize summarizer
-    print(f"\nGenerating summaries with {llm_name}...")
-    summarizer = Summarizer(
-        llm_name=llm_name,
-        max_concurrent=max_concurrent_summaries,
-    )
-
-    summaries = []
-    try:
-        summaries = await summarizer.summarize_all(
-            docs, existing_summaries=existing_summaries, cache_file=cache_file
-        )
-        summaries_generated = len(summaries)
-    except KeyboardInterrupt:
-        print("Process interrupted by user. Saving partial results...")
-        if cache_file.exists():
-            try:
-                with open(cache_file) as f:
-                    partial_summaries = json.load(f)
-                summaries = list(partial_summaries.values())
-                summaries_generated = len(summaries)
-                print(f"Recovered {len(summaries)} summaries from cache")
-            except Exception:
-                pass
-    except Exception as e:
-        print(f"Summarization process error: {str(e)}")
-        if cache_file.exists():
-            try:
-                with open(cache_file) as f:
-                    partial_summaries = json.load(f)
-                summaries = list(partial_summaries.values())
-                summaries_generated = len(summaries)
-                print(f"Recovered {len(summaries)} partial summaries from cache")
-            except Exception:
-                summaries = []
-    finally:
-        # Write results to file
-        if summaries:
-            ensure_directory_exists(output_file)
-            output_content = "".join(summaries)
-            Path(output_file).write_text(output_content, encoding="utf-8")
-        else:
-            ensure_directory_exists(output_file)
-            Path(output_file).write_text("", encoding="utf-8")
-
-        # Print summary
-        print(f"\n{'=' * 50}")
-        print(f"Processed: {summaries_generated}/{urls_processed} pages")
-        if urls_processed > 0:
-            success_rate = summaries_generated / urls_processed * 100
-            print(f"Success rate: {success_rate:.1f}%")
-        print(f"Output: {output_file}")
-        print(f"{'=' * 50}")

The remaining 34 files listed above are unchanged between 1.3.1 and 1.4.0.