llmsbrieftxt 1.3.1__tar.gz → 1.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of llmsbrieftxt might be problematic.

Files changed (44)
  1. llmsbrieftxt-1.4.0/.github/copilot-instructions.md +115 -0
  2. {llmsbrieftxt-1.3.1 → llmsbrieftxt-1.4.0}/CLAUDE.md +27 -11
  3. {llmsbrieftxt-1.3.1 → llmsbrieftxt-1.4.0}/PKG-INFO +44 -12
  4. {llmsbrieftxt-1.3.1 → llmsbrieftxt-1.4.0}/README.md +43 -11
  5. {llmsbrieftxt-1.3.1 → llmsbrieftxt-1.4.0}/llmsbrieftxt/cli.py +112 -15
  6. {llmsbrieftxt-1.3.1 → llmsbrieftxt-1.4.0}/llmsbrieftxt/constants.py +26 -0
  7. llmsbrieftxt-1.4.0/llmsbrieftxt/main.py +227 -0
  8. {llmsbrieftxt-1.3.1 → llmsbrieftxt-1.4.0}/pyproject.toml +1 -1
  9. {llmsbrieftxt-1.3.1 → llmsbrieftxt-1.4.0}/tests/unit/test_cli.py +128 -1
  10. llmsbrieftxt-1.3.1/llmsbrieftxt/main.py +0 -142
  11. {llmsbrieftxt-1.3.1 → llmsbrieftxt-1.4.0}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  12. {llmsbrieftxt-1.3.1 → llmsbrieftxt-1.4.0}/.github/ISSUE_TEMPLATE/config.yml +0 -0
  13. {llmsbrieftxt-1.3.1 → llmsbrieftxt-1.4.0}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  14. {llmsbrieftxt-1.3.1 → llmsbrieftxt-1.4.0}/.github/ISSUE_TEMPLATE/question.yml +0 -0
  15. {llmsbrieftxt-1.3.1 → llmsbrieftxt-1.4.0}/.github/PULL_REQUEST_TEMPLATE.md +0 -0
  16. {llmsbrieftxt-1.3.1 → llmsbrieftxt-1.4.0}/.github/pull_request_template.md +0 -0
  17. {llmsbrieftxt-1.3.1 → llmsbrieftxt-1.4.0}/.github/workflows/ci.yml +0 -0
  18. {llmsbrieftxt-1.3.1 → llmsbrieftxt-1.4.0}/.github/workflows/pr-title-check.yml +0 -0
  19. {llmsbrieftxt-1.3.1 → llmsbrieftxt-1.4.0}/.github/workflows/release.yml +0 -0
  20. {llmsbrieftxt-1.3.1 → llmsbrieftxt-1.4.0}/.gitignore +0 -0
  21. {llmsbrieftxt-1.3.1 → llmsbrieftxt-1.4.0}/CONTRIBUTING.md +0 -0
  22. {llmsbrieftxt-1.3.1 → llmsbrieftxt-1.4.0}/LICENSE +0 -0
  23. {llmsbrieftxt-1.3.1 → llmsbrieftxt-1.4.0}/PRODUCTION_CLEANUP_PLAN.md +0 -0
  24. {llmsbrieftxt-1.3.1 → llmsbrieftxt-1.4.0}/llmsbrieftxt/__init__.py +0 -0
  25. {llmsbrieftxt-1.3.1 → llmsbrieftxt-1.4.0}/llmsbrieftxt/crawler.py +0 -0
  26. {llmsbrieftxt-1.3.1 → llmsbrieftxt-1.4.0}/llmsbrieftxt/doc_loader.py +0 -0
  27. {llmsbrieftxt-1.3.1 → llmsbrieftxt-1.4.0}/llmsbrieftxt/extractor.py +0 -0
  28. {llmsbrieftxt-1.3.1 → llmsbrieftxt-1.4.0}/llmsbrieftxt/schema.py +0 -0
  29. {llmsbrieftxt-1.3.1 → llmsbrieftxt-1.4.0}/llmsbrieftxt/summarizer.py +0 -0
  30. {llmsbrieftxt-1.3.1 → llmsbrieftxt-1.4.0}/llmsbrieftxt/url_filters.py +0 -0
  31. {llmsbrieftxt-1.3.1 → llmsbrieftxt-1.4.0}/llmsbrieftxt/url_utils.py +0 -0
  32. {llmsbrieftxt-1.3.1 → llmsbrieftxt-1.4.0}/pytest.ini +0 -0
  33. {llmsbrieftxt-1.3.1 → llmsbrieftxt-1.4.0}/scripts/bump_version.py +0 -0
  34. {llmsbrieftxt-1.3.1 → llmsbrieftxt-1.4.0}/tests/__init__.py +0 -0
  35. {llmsbrieftxt-1.3.1 → llmsbrieftxt-1.4.0}/tests/conftest.py +0 -0
  36. {llmsbrieftxt-1.3.1 → llmsbrieftxt-1.4.0}/tests/fixtures/__init__.py +0 -0
  37. {llmsbrieftxt-1.3.1 → llmsbrieftxt-1.4.0}/tests/integration/__init__.py +0 -0
  38. {llmsbrieftxt-1.3.1 → llmsbrieftxt-1.4.0}/tests/integration/test_doc_loader_integration.py +0 -0
  39. {llmsbrieftxt-1.3.1 → llmsbrieftxt-1.4.0}/tests/unit/__init__.py +0 -0
  40. {llmsbrieftxt-1.3.1 → llmsbrieftxt-1.4.0}/tests/unit/test_doc_loader.py +0 -0
  41. {llmsbrieftxt-1.3.1 → llmsbrieftxt-1.4.0}/tests/unit/test_extractor.py +0 -0
  42. {llmsbrieftxt-1.3.1 → llmsbrieftxt-1.4.0}/tests/unit/test_robustness.py +0 -0
  43. {llmsbrieftxt-1.3.1 → llmsbrieftxt-1.4.0}/tests/unit/test_summarizer.py +0 -0
  44. {llmsbrieftxt-1.3.1 → llmsbrieftxt-1.4.0}/uv.lock +0 -0
@@ -0,0 +1,115 @@
1
+ # GitHub Copilot Instructions for llmsbrieftxt
2
+
3
+ ## Project Overview
4
+
5
+ This is `llmsbrieftxt`, a Python package that generates llms-brief.txt files by crawling documentation websites and using OpenAI to create structured descriptions. The CLI command is `llmtxt` (not `llmsbrieftxt`).
6
+
7
+ ## Architecture and Code Patterns
8
+
9
+ ### Async-First Design
10
+ All main functions use async/await patterns. Use `asyncio.gather()` for concurrent operations and semaphore control for rate limiting. The processing pipeline flows: URL Discovery → Content Extraction → LLM Summarization → File Generation.
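A minimal sketch of this pattern (illustrative only, not the package's actual code), assuming a per-page coroutine and the default limit of 10 concurrent requests:

```python
import asyncio

async def summarize_page(url: str, sem: asyncio.Semaphore) -> str:
    async with sem:               # cap the number of in-flight LLM calls
        await asyncio.sleep(0.1)  # stand-in for the real API request
        return f"summary of {url}"

async def summarize_all(urls: list[str], max_concurrent: int = 10) -> list[str]:
    sem = asyncio.Semaphore(max_concurrent)
    return await asyncio.gather(*(summarize_page(u, sem) for u in urls))

print(asyncio.run(summarize_all(["https://example.com/a", "https://example.com/b"])))
```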
11
+
12
+ ### Module Organization
13
+ - **cli.py**: Simple CLI with positional URL argument (no subcommands)
14
+ - **main.py**: Orchestrates the async generation pipeline
15
+ - **crawler.py**: RobustDocCrawler for breadth-first URL discovery
16
+ - **doc_loader.py**: DocLoader wraps crawler with document loading
17
+ - **extractor.py**: HTML to markdown via trafilatura
18
+ - **summarizer.py**: OpenAI integration with retry logic (tenacity)
19
+ - **url_utils.py**: URLNormalizer for deduplication
20
+ - **url_filters.py**: Filter non-documentation URLs
21
+ - **schema.py**: Pydantic models (PageSummary)
22
+ - **constants.py**: Configuration constants
23
+
24
+ ### Type Safety
25
+ Use Pydantic models for all structured data. The OpenAI integration uses structured output with the PageSummary model.
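The sketch below illustrates that pattern; the real `PageSummary` fields live in `schema.py`, so the field names here are placeholders, and the OpenAI call is shown only as a comment because client setup is omitted:

```python
from pydantic import BaseModel, Field

class PageSummary(BaseModel):  # placeholder fields, not the real schema
    title: str = Field(description="Page title")
    url: str = Field(description="Canonical page URL")
    description: str = Field(description="Concise summary of the page content")

# With the official OpenAI Python SDK, a Pydantic model can be passed as the
# structured output format (assuming a recent SDK version):
# completion = client.beta.chat.completions.parse(
#     model="gpt-4o-mini",
#     messages=[{"role": "user", "content": page_markdown}],
#     response_format=PageSummary,
# )
# page: PageSummary = completion.choices[0].message.parsed
```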
26
+
27
+ ### Error Handling
28
+ Failed URL loads should be logged but not stop processing. LLM failures use exponential backoff retries via tenacity. Never let one failure break the entire pipeline.
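For illustration, a retry wrapper in the style described above (the package's actual retry settings may differ):

```python
import random
from tenacity import retry, stop_after_attempt, wait_exponential

@retry(stop=stop_after_attempt(3), wait=wait_exponential(min=1, max=30))
async def call_llm(prompt: str) -> str:
    # stand-in for the real OpenAI call; fails randomly to exercise the retry,
    # which re-awaits the coroutine with exponential backoff between attempts
    if random.random() < 0.5:
        raise RuntimeError("transient API error")
    return f"summary for: {prompt[:30]}"
```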
29
+
30
+ ## Development Practices
31
+
32
+ ### Testing Requirements
33
+ Write tests before implementing features. Use pytest with these markers:
34
+ - `@pytest.mark.unit` for fast, isolated tests
35
+ - `@pytest.mark.requires_openai` for tests needing OPENAI_API_KEY
36
+ - `@pytest.mark.slow` for tests making external API calls
37
+
38
+ Tests go in:
39
+ - `tests/unit/` for fast tests with no external dependencies
40
+ - `tests/integration/` for tests requiring OPENAI_API_KEY
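A small illustration of the markers and layout described above (file and test names are hypothetical):

```python
# tests/unit/test_example.py (hypothetical)
import pytest

@pytest.mark.unit
def test_fast_isolated_behavior():
    assert "a/b".split("/") == ["a", "b"]  # no network, no API key

# tests/integration/test_summarizer_integration.py (hypothetical)
@pytest.mark.requires_openai
@pytest.mark.slow
def test_summarize_real_page():
    ...  # would call the OpenAI API, so it needs OPENAI_API_KEY
```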
41
+
42
+ ### Code Quality Tools
43
+ Before committing, always run:
44
+ 1. Format: `uv run ruff format llmsbrieftxt/ tests/`
45
+ 2. Lint: `uv run ruff check llmsbrieftxt/ tests/`
46
+ 3. Type check: `uv run pyright llmsbrieftxt/`
47
+ 4. Tests: `uv run pytest tests/unit/`
48
+
49
+ ### Package Management
50
+ Use `uv` for all package operations:
51
+ - Install: `uv sync --group dev`
52
+ - Add dependency: `uv add package-name`
53
+ - Build: `uv build`
54
+
55
+ ## Design Philosophy
56
+
57
+ ### Unix Philosophy
58
+ This project follows "do one thing and do it well":
59
+ - Generate llms-brief.txt files only (no built-in search/list features)
60
+ - Compose with standard Unix tools (rg, grep, ls)
61
+ - Simple CLI: URL is a positional argument, no subcommands
62
+ - Plain text output for scriptability
63
+
64
+ ### Simplicity Over Features
65
+ Avoid adding functionality that duplicates mature Unix tools. Every line of code must serve the core mission of generating llms-brief.txt files.
66
+
67
+ ## Configuration Defaults
68
+
69
+ - **Crawl Depth**: 3 levels by default (configurable with `--depth`)
70
+ - **Output**: `~/.claude/docs/<domain>.txt` (override with `--output`)
71
+ - **Cache**: `.llmsbrieftxt_cache/` for intermediate results
72
+ - **OpenAI Model**: `gpt-5-mini` (override with `--model`)
73
+ - **Concurrency**: 10 concurrent LLM requests (prevents rate limiting)
74
+
75
+ ## Commit Convention
76
+
77
+ Use conventional commits for automated versioning:
78
+ - `fix:` → patch bump (1.0.0 → 1.0.1)
79
+ - `feat:` → minor bump (1.0.0 → 1.1.0)
80
+ - `BREAKING CHANGE` or `feat!:`/`fix!:` → major bump (1.0.0 → 2.0.0)
81
+
82
+ Examples:
83
+ ```bash
84
+ git commit -m "fix: handle empty sitemap gracefully"
85
+ git commit -m "feat: add --depth option for custom crawl depth"
86
+ git commit -m "feat!: change default output location"
87
+ ```
88
+
89
+ ## Non-Obvious Behaviors
90
+
91
+ 1. URL discovery finds ALL pages up to the configured depth (default 3), not just pages linked directly from the starting URL
92
+ 2. URLs like `/page`, `/page/`, and `/page#section` are deduplicated as the same URL
93
+ 3. Summaries are automatically cached in `.llmsbrieftxt_cache/summaries.json`
94
+ 4. Content extraction uses trafilatura to preserve HTML structure in markdown
95
+ 5. File I/O is synchronous (uses standard `Path.write_text()` for simplicity)
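A minimal sketch of the deduplication in item 2 above (illustrative only; the real logic lives in `url_utils.URLNormalizer`):

```python
from urllib.parse import urlsplit, urlunsplit

def normalize(url: str) -> str:
    scheme, netloc, path, query, _fragment = urlsplit(url)  # drop the #fragment
    path = path.rstrip("/") or "/"                          # treat /page/ as /page
    return urlunsplit((scheme, netloc, path, query, ""))

assert normalize("https://x.dev/page/") == normalize("https://x.dev/page#section")
```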
96
+
97
+ ## Known Limitations
98
+
99
+ 1. Only supports OpenAI API (no other LLM providers)
100
+ 2. No incremental timestamp checking (force refresh or cache-only, but no "only update changed pages" mode)
101
+ 3. The cache stores only page summaries, so pages are re-crawled on every run
102
+ 4. No progress persistence if interrupted
103
+ 5. Prompts and parsing assume English documentation
104
+
105
+ ## Code Review Checklist
106
+
107
+ When reviewing code changes:
108
+ - Ensure async patterns are used correctly (no blocking I/O in async functions)
109
+ - Verify all functions have type hints
110
+ - Check that tests are included for new functionality
111
+ - Confirm error handling doesn't break the pipeline
112
+ - Validate that conventional commit format is used
113
+ - Ensure code follows Unix philosophy (simplicity, composability)
114
+ - Check that ruff and pyright pass without errors
115
+ - **IMPORTANT**: Always include specific file names and line numbers when providing review feedback (e.g., "main.py:165" or "line 182 in cli.py")
@@ -95,8 +95,12 @@ llmtxt https://docs.python.org/3/
95
95
 
96
96
  # With options
97
97
  llmtxt https://example.com --model gpt-4o
98
- llmtxt https://example.com --show-urls
98
+ llmtxt https://example.com --show-urls # Preview URLs with cost estimate
99
99
  llmtxt https://example.com --max-urls 50
100
+ llmtxt https://example.com --depth 2 # Control crawl depth (default: 3)
101
+ llmtxt https://example.com --use-cache-only # No API calls, cache only
102
+ llmtxt https://example.com --force-refresh # Ignore cache, regenerate all
103
+ llmtxt https://example.com --cache-dir /tmp/cache # Custom cache location
100
104
  llmtxt https://example.com --output custom-path.txt
101
105
  ```
102
106
 
@@ -114,9 +118,15 @@ ls -lh ~/.claude/docs/
114
118
 
115
119
  ### Default Behavior
116
120
  These are the production defaults:
117
- - **Crawl Depth**: 3 levels from starting URL (hardcoded in crawler.py)
121
+ - **Crawl Depth**: 3 levels from starting URL (configurable with `--depth`)
118
122
  - **Output Location**: `~/.claude/docs/<domain>.txt` (can override with `--output`)
119
- - **Cache Directory**: `.llmsbrieftxt_cache/` for intermediate results
123
+ - **Cache Directory**: `.llmsbrieftxt_cache/` for intermediate results (can override with `--cache-dir`)
124
+
125
+ ### New Features (v1.4.0)
126
+ - **Cost Estimation**: `--show-urls` now displays estimated API cost before processing
127
+ - **Cache Control**: `--use-cache-only` and `--force-refresh` flags for cache management
128
+ - **Failed URL Tracking**: Failed URLs are written to `failed_urls.txt` next to output file
129
+ - **Depth Configuration**: Crawl depth is now configurable via `--depth` flag
120
130
 
121
131
  ### Default Model
122
132
  - **OpenAI Model**: `gpt-5-mini` (defined in constants.py, can override with `--model`)
@@ -169,12 +179,16 @@ Test markers:
169
179
 
170
180
  ## Non-Obvious Behaviors
171
181
 
172
- 1. **URL Discovery**: Discovers ALL pages up to depth 3, not just pages linked from your starting URL
182
+ 1. **URL Discovery**: Discovers ALL pages up to configured depth (default 3), not just pages linked from your starting URL
173
183
  2. **Duplicate Handling**: `/page`, `/page/`, and `/page#section` are treated as the same URL
174
184
  3. **Concurrency Limit**: Default 10 concurrent LLM requests prevents rate limiting
175
185
  4. **Automatic Caching**: Summaries cached in `.llmsbrieftxt_cache/summaries.json` and reused automatically
176
186
  5. **Content Extraction**: Uses `trafilatura` for HTML→markdown, preserving structure
177
187
  6. **Sync File I/O**: Uses standard `Path.write_text()` instead of async file I/O (simpler, sufficient)
188
+ 7. **Cost Estimation**: `--show-urls` shows both discovered URLs count AND estimated API cost
189
+ 8. **Cache-First**: When using cache, shows "Cached: X | New: Y" breakdown before processing
190
+ 9. **Failed URL Reporting**: Failed URLs saved to `failed_urls.txt` in same directory as output
191
+ 10. **Environment Variables**: `--output` and `--cache-dir` support `$HOME` and other env var expansion
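For example, `--output '$HOME/docs/react.txt'` is expanded via `os.path.expandvars()` and `Path.expanduser()` into an absolute path under the current user's home directory before the file is written.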
178
192
 
179
193
  ## Using llms-brief.txt Files
180
194
 
@@ -240,9 +254,12 @@ grep -rn "hooks" ~/.claude/docs/
240
254
 
241
255
  ### Debugging Issues
242
256
  1. Check logs - logger is configured in most modules
243
- 2. Use `--show-urls` to preview URL discovery
244
- 3. Check cache: `.llmsbrieftxt_cache/summaries.json`
245
- 4. Run with verbose pytest: `uv run pytest -vv -s`
257
+ 2. Use `--show-urls` to preview URL discovery and cost estimate
258
+ 3. Check cache: `.llmsbrieftxt_cache/summaries.json` (or custom `--cache-dir`)
259
+ 4. Check failed URLs: `failed_urls.txt` in output directory
260
+ 5. Test with limited scope: `--max-urls 10 --depth 1` for quick testing
261
+ 6. Use `--use-cache-only` to test output generation without API calls
262
+ 7. Run with verbose pytest: `uv run pytest -vv -s`
246
263
 
247
264
  ### Modifying URL Discovery Logic
248
265
  - Edit `crawler.py` for crawling behavior
@@ -308,10 +325,9 @@ uv build
308
325
  ## Known Limitations
309
326
 
310
327
  1. **OpenAI Only**: Currently only supports OpenAI API (no other LLM providers)
311
- 2. **Depth Hardcoded**: Crawl depth is hardcoded to 3 in crawler.py
312
- 3. **No Resume Flag**: Cache exists but no CLI flag to force resume from cache
313
- 4. **No Progress Persistence**: If interrupted, must restart (though cache helps)
314
- 5. **English-Centric**: Prompts and parsing assume English documentation
328
+ 2. **No Progress Persistence**: If interrupted, must restart (though cache helps and is used automatically on restart)
329
+ 3. **English-Centric**: Prompts and parsing assume English documentation
330
+ 4. **No Incremental Timestamp Checking**: You can force a full refresh or run cache-only, but there is no "only update changed pages" mode
315
331
 
316
332
  ## Migration from v0.x
317
333
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: llmsbrieftxt
3
- Version: 1.3.1
3
+ Version: 1.4.0
4
4
  Summary: Generate llms-brief.txt files from documentation websites using AI
5
5
  Project-URL: Homepage, https://github.com/stevennevins/llmsbrief
6
6
  Project-URL: Repository, https://github.com/stevennevins/llmsbrief
@@ -98,8 +98,12 @@ Output is automatically saved to `~/.claude/docs/<domain>.txt` (e.g., `docs.pyth
98
98
  - `--output PATH` - Custom output path (default: `~/.claude/docs/<domain>.txt`)
99
99
  - `--model MODEL` - OpenAI model to use (default: `gpt-5-mini`)
100
100
  - `--max-concurrent-summaries N` - Concurrent LLM requests (default: 10)
101
- - `--show-urls` - Preview discovered URLs without processing
101
+ - `--show-urls` - Preview discovered URLs with cost estimate (no API calls)
102
102
  - `--max-urls N` - Limit number of URLs to process
103
+ - `--depth N` - Maximum crawl depth (default: 3)
104
+ - `--cache-dir PATH` - Cache directory path (default: `.llmsbrieftxt_cache`)
105
+ - `--use-cache-only` - Use only cached summaries, skip API calls for new pages
106
+ - `--force-refresh` - Ignore cache and regenerate all summaries
103
107
 
104
108
  ### Examples
105
109
 
@@ -110,12 +114,24 @@ llmtxt https://docs.python.org/3/
110
114
  # Use a different model
111
115
  llmtxt https://react.dev --model gpt-4o
112
116
 
113
- # Preview URLs before processing (no API calls)
117
+ # Preview URLs with cost estimate before processing (no API calls)
114
118
  llmtxt https://react.dev --show-urls
115
119
 
116
120
  # Limit scope for testing
117
121
  llmtxt https://docs.python.org --max-urls 50
118
122
 
123
+ # Custom crawl depth (explore deeper or shallower)
124
+ llmtxt https://example.com --depth 2
125
+
126
+ # Use only cached summaries (no API calls)
127
+ llmtxt https://docs.python.org/3/ --use-cache-only
128
+
129
+ # Force refresh all summaries (ignore cache)
130
+ llmtxt https://docs.python.org/3/ --force-refresh
131
+
132
+ # Custom cache directory
133
+ llmtxt https://example.com --cache-dir /tmp/my-cache
134
+
119
135
  # Custom output location
120
136
  llmtxt https://react.dev --output ./my-docs/react.txt
121
137
 
@@ -245,11 +261,11 @@ uv run mypy llmsbrieftxt/
245
261
 
246
262
  ### Default Settings
247
263
 
248
- - **Crawl Depth**: 3 levels (hardcoded)
249
- - **Output Location**: `~/.claude/docs/<domain>.txt`
250
- - **Cache Directory**: `.llmsbrieftxt_cache/`
251
- - **OpenAI Model**: `gpt-5-mini`
252
- - **Concurrent Requests**: 10
264
+ - **Crawl Depth**: 3 levels (configurable via `--depth`)
265
+ - **Output Location**: `~/.claude/docs/<domain>.txt` (configurable via `--output`)
266
+ - **Cache Directory**: `.llmsbrieftxt_cache/` (configurable via `--cache-dir`)
267
+ - **OpenAI Model**: `gpt-5-mini` (configurable via `--model`)
268
+ - **Concurrent Requests**: 10 (configurable via `--max-concurrent-summaries`)
253
269
 
254
270
  ### Environment Variables
255
271
 
@@ -259,10 +275,26 @@ uv run mypy llmsbrieftxt/
259
275
 
260
276
  ### Managing API Costs
261
277
 
262
- - Use `--show-urls` first to preview scope
263
- - Use `--max-urls` to limit processing during testing
264
- - Summaries are cached automatically - rerunning is cheap
265
- - Default model `gpt-5-mini` is cost-effective for most documentation
278
+ - **Preview with cost estimate**: Use `--show-urls` to see discovered URLs and estimated API cost before processing
279
+ - **Limit scope**: Use `--max-urls` to limit processing during testing
280
+ - **Automatic caching**: Summaries are cached automatically - rerunning is cheap
281
+ - **Cache-only mode**: Use `--use-cache-only` to generate output from cache without API calls
282
+ - **Force refresh**: Use `--force-refresh` when you need to regenerate all summaries
283
+ - **Cost-effective model**: Default model `gpt-5-mini` is cost-effective for most documentation
284
+
285
+ ### Controlling Crawl Depth
286
+
287
+ - **Default depth (3)**: Good for most documentation sites (100-300 pages)
288
+ - **Shallow crawl (1-2)**: Use for large sites or to focus on main pages only
289
+ - **Deep crawl (4-5)**: Use for small sites or comprehensive coverage
290
+ - Example: `llmtxt https://example.com --depth 2 --show-urls` to preview scope
291
+
292
+ ### Cache Management
293
+
294
+ - **Default location**: `.llmsbrieftxt_cache/` in current directory
295
+ - **Custom location**: Use `--cache-dir` for shared caches or different organization
296
+ - **Cache benefits**: Speeds up reruns, reduces API costs, enables incremental updates
297
+ - **Failed URLs tracking**: Failed URLs are written to `failed_urls.txt` next to output file
266
298
 
267
299
  ### Organizing Documentation
268
300
 
@@ -65,8 +65,12 @@ Output is automatically saved to `~/.claude/docs/<domain>.txt` (e.g., `docs.pyth
65
65
  - `--output PATH` - Custom output path (default: `~/.claude/docs/<domain>.txt`)
66
66
  - `--model MODEL` - OpenAI model to use (default: `gpt-5-mini`)
67
67
  - `--max-concurrent-summaries N` - Concurrent LLM requests (default: 10)
68
- - `--show-urls` - Preview discovered URLs without processing
68
+ - `--show-urls` - Preview discovered URLs with cost estimate (no API calls)
69
69
  - `--max-urls N` - Limit number of URLs to process
70
+ - `--depth N` - Maximum crawl depth (default: 3)
71
+ - `--cache-dir PATH` - Cache directory path (default: `.llmsbrieftxt_cache`)
72
+ - `--use-cache-only` - Use only cached summaries, skip API calls for new pages
73
+ - `--force-refresh` - Ignore cache and regenerate all summaries
70
74
 
71
75
  ### Examples
72
76
 
@@ -77,12 +81,24 @@ llmtxt https://docs.python.org/3/
77
81
  # Use a different model
78
82
  llmtxt https://react.dev --model gpt-4o
79
83
 
80
- # Preview URLs before processing (no API calls)
84
+ # Preview URLs with cost estimate before processing (no API calls)
81
85
  llmtxt https://react.dev --show-urls
82
86
 
83
87
  # Limit scope for testing
84
88
  llmtxt https://docs.python.org --max-urls 50
85
89
 
90
+ # Custom crawl depth (explore deeper or shallower)
91
+ llmtxt https://example.com --depth 2
92
+
93
+ # Use only cached summaries (no API calls)
94
+ llmtxt https://docs.python.org/3/ --use-cache-only
95
+
96
+ # Force refresh all summaries (ignore cache)
97
+ llmtxt https://docs.python.org/3/ --force-refresh
98
+
99
+ # Custom cache directory
100
+ llmtxt https://example.com --cache-dir /tmp/my-cache
101
+
86
102
  # Custom output location
87
103
  llmtxt https://react.dev --output ./my-docs/react.txt
88
104
 
@@ -212,11 +228,11 @@ uv run mypy llmsbrieftxt/
212
228
 
213
229
  ### Default Settings
214
230
 
215
- - **Crawl Depth**: 3 levels (hardcoded)
216
- - **Output Location**: `~/.claude/docs/<domain>.txt`
217
- - **Cache Directory**: `.llmsbrieftxt_cache/`
218
- - **OpenAI Model**: `gpt-5-mini`
219
- - **Concurrent Requests**: 10
231
+ - **Crawl Depth**: 3 levels (configurable via `--depth`)
232
+ - **Output Location**: `~/.claude/docs/<domain>.txt` (configurable via `--output`)
233
+ - **Cache Directory**: `.llmsbrieftxt_cache/` (configurable via `--cache-dir`)
234
+ - **OpenAI Model**: `gpt-5-mini` (configurable via `--model`)
235
+ - **Concurrent Requests**: 10 (configurable via `--max-concurrent-summaries`)
220
236
 
221
237
  ### Environment Variables
222
238
 
@@ -226,10 +242,26 @@ uv run mypy llmsbrieftxt/
226
242
 
227
243
  ### Managing API Costs
228
244
 
229
- - Use `--show-urls` first to preview scope
230
- - Use `--max-urls` to limit processing during testing
231
- - Summaries are cached automatically - rerunning is cheap
232
- - Default model `gpt-5-mini` is cost-effective for most documentation
245
+ - **Preview with cost estimate**: Use `--show-urls` to see discovered URLs and estimated API cost before processing
246
+ - **Limit scope**: Use `--max-urls` to limit processing during testing
247
+ - **Automatic caching**: Summaries are cached automatically - rerunning is cheap
248
+ - **Cache-only mode**: Use `--use-cache-only` to generate output from cache without API calls
249
+ - **Force refresh**: Use `--force-refresh` when you need to regenerate all summaries
250
+ - **Cost-effective model**: Default model `gpt-5-mini` is cost-effective for most documentation
251
+
252
+ ### Controlling Crawl Depth
253
+
254
+ - **Default depth (3)**: Good for most documentation sites (100-300 pages)
255
+ - **Shallow crawl (1-2)**: Use for large sites or to focus on main pages only
256
+ - **Deep crawl (4-5)**: Use for small sites or comprehensive coverage
257
+ - Example: `llmtxt https://example.com --depth 2 --show-urls` to preview scope
258
+
259
+ ### Cache Management
260
+
261
+ - **Default location**: `.llmsbrieftxt_cache/` in current directory
262
+ - **Custom location**: Use `--cache-dir` for shared caches or different organization
263
+ - **Cache benefits**: Speeds up reruns, reduces API costs, enables incremental updates
264
+ - **Failed URLs tracking**: Failed URLs are written to `failed_urls.txt` next to output file
233
265
 
234
266
  ### Organizing Documentation
235
267
 
@@ -8,9 +8,14 @@ from pathlib import Path
8
8
  from urllib.parse import urlparse
9
9
 
10
10
  from llmsbrieftxt.constants import (
11
+ DEFAULT_CACHE_DIR,
11
12
  DEFAULT_CONCURRENT_SUMMARIES,
13
+ DEFAULT_CRAWL_DEPTH,
12
14
  DEFAULT_OPENAI_MODEL,
13
15
  DOCS_DIR,
16
+ ESTIMATED_TOKENS_PER_PAGE_INPUT,
17
+ ESTIMATED_TOKENS_PER_PAGE_OUTPUT,
18
+ OPENAI_PRICING,
14
19
  )
15
20
  from llmsbrieftxt.main import generate_llms_txt
16
21
 
@@ -48,13 +53,39 @@ def parse_args(test_args: list[str] | None = None) -> argparse.Namespace:
48
53
  parser.add_argument(
49
54
  "--show-urls",
50
55
  action="store_true",
51
- help="Preview discovered URLs without processing them",
56
+ help="Preview discovered URLs with cost estimate (no processing or API calls)",
52
57
  )
53
58
 
54
59
  parser.add_argument(
55
60
  "--max-urls", type=int, help="Maximum number of URLs to discover and process"
56
61
  )
57
62
 
63
+ parser.add_argument(
64
+ "--depth",
65
+ type=int,
66
+ default=DEFAULT_CRAWL_DEPTH,
67
+ help=f"Maximum crawl depth (default: {DEFAULT_CRAWL_DEPTH})",
68
+ )
69
+
70
+ parser.add_argument(
71
+ "--cache-dir",
72
+ type=str,
73
+ default=DEFAULT_CACHE_DIR,
74
+ help=f"Cache directory path (default: {DEFAULT_CACHE_DIR})",
75
+ )
76
+
77
+ parser.add_argument(
78
+ "--use-cache-only",
79
+ action="store_true",
80
+ help="Use only cached summaries, skip API calls for new pages",
81
+ )
82
+
83
+ parser.add_argument(
84
+ "--force-refresh",
85
+ action="store_true",
86
+ help="Ignore cache and regenerate all summaries",
87
+ )
88
+
58
89
  return parser.parse_args(test_args)
59
90
 
60
91
 
@@ -72,6 +103,39 @@ def check_openai_api_key() -> bool:
72
103
  return bool(os.environ.get("OPENAI_API_KEY"))
73
104
 
74
105
 
106
+ def estimate_cost(num_pages: int, model: str) -> str:
107
+ """
108
+ Estimate the API cost for processing a given number of pages.
109
+
110
+ Args:
111
+ num_pages: Number of pages to process
112
+ model: OpenAI model name
113
+
114
+ Returns:
115
+ Formatted cost estimate string
116
+ """
117
+ if model not in OPENAI_PRICING:
118
+ return "Cost estimation not available for this model"
119
+
120
+ input_price, output_price = OPENAI_PRICING[model]
121
+
122
+ # Calculate total tokens
123
+ total_input_tokens = num_pages * ESTIMATED_TOKENS_PER_PAGE_INPUT
124
+ total_output_tokens = num_pages * ESTIMATED_TOKENS_PER_PAGE_OUTPUT
125
+
126
+ # Calculate cost (prices are per 1M tokens)
127
+ input_cost = (total_input_tokens / 1_000_000) * input_price
128
+ output_cost = (total_output_tokens / 1_000_000) * output_price
129
+ total_cost = input_cost + output_cost
130
+
131
+ if total_cost < 0.01:
132
+ return f"~${total_cost:.4f}"
133
+ elif total_cost < 1.00:
134
+ return f"~${total_cost:.3f}"
135
+ else:
136
+ return f"~${total_cost:.2f}"
137
+
138
+
75
139
  def get_output_path(url: str, custom_output: str | None = None) -> Path:
76
140
  """
77
141
  Get the output file path for a given URL.
@@ -84,7 +148,9 @@ def get_output_path(url: str, custom_output: str | None = None) -> Path:
84
148
  Path object for the output file
85
149
  """
86
150
  if custom_output:
87
- return Path(custom_output)
151
+ # Expand environment variables and user home directory
152
+ expanded = os.path.expandvars(custom_output)
153
+ return Path(expanded).expanduser()
88
154
 
89
155
  # Extract domain from URL
90
156
  parsed = urlparse(url)
@@ -116,8 +182,25 @@ def main() -> None:
116
182
  print("Example: https://docs.python.org/3/", file=sys.stderr)
117
183
  sys.exit(1)
118
184
 
119
- # Check for API key (unless just showing URLs)
120
- if not args.show_urls and not check_openai_api_key():
185
+ # Validate depth parameter
186
+ if args.depth < 1:
187
+ print("Error: --depth must be at least 1", file=sys.stderr)
188
+ sys.exit(1)
189
+
190
+ # Check for conflicting cache flags
191
+ if args.use_cache_only and args.force_refresh:
192
+ print(
193
+ "Error: Cannot use --use-cache-only and --force-refresh together",
194
+ file=sys.stderr,
195
+ )
196
+ sys.exit(1)
197
+
198
+ # Check for API key (unless just showing URLs or using cache only)
199
+ if (
200
+ not args.show_urls
201
+ and not args.use_cache_only
202
+ and not check_openai_api_key()
203
+ ):
121
204
  print("Error: OPENAI_API_KEY not found", file=sys.stderr)
122
205
  print("Please set your OpenAI API key:", file=sys.stderr)
123
206
  print(" export OPENAI_API_KEY='sk-your-api-key-here'", file=sys.stderr)
@@ -131,24 +214,24 @@ def main() -> None:
131
214
  # Determine output path
132
215
  output_path = get_output_path(args.url, args.output)
133
216
 
217
+ # Expand cache directory path
218
+ cache_dir = Path(os.path.expandvars(args.cache_dir)).expanduser()
219
+
134
220
  # Print configuration
135
221
  print(f"Processing URL: {args.url}")
136
- print(f"Using model: {args.model}")
222
+ if not args.show_urls:
223
+ print(f"Using model: {args.model}")
224
+ print(f"Crawl depth: {args.depth}")
137
225
  print(f"Output: {output_path}")
138
226
  if args.max_urls:
139
227
  print(f"Max URLs: {args.max_urls}")
140
-
141
- # Warn about API costs for large jobs
142
- if not args.show_urls and not args.max_urls:
143
- print("")
144
- print(
145
- "Note: This will discover and process all documentation pages (depth=3)"
146
- )
147
- print("Tip: Use --show-urls first to preview scope, or --max-urls to limit")
148
- print("")
228
+ if args.use_cache_only:
229
+ print("Mode: Cache-only (no API calls)")
230
+ elif args.force_refresh:
231
+ print("Mode: Force refresh (ignoring cache)")
149
232
 
150
233
  # Run generation
151
- asyncio.run(
234
+ result = asyncio.run(
152
235
  generate_llms_txt(
153
236
  url=args.url,
154
237
  llm_name=args.model,
@@ -156,9 +239,23 @@ def main() -> None:
156
239
  output_path=str(output_path),
157
240
  show_urls=args.show_urls,
158
241
  max_urls=args.max_urls,
242
+ max_depth=args.depth,
243
+ cache_dir=str(cache_dir),
244
+ use_cache_only=args.use_cache_only,
245
+ force_refresh=args.force_refresh,
159
246
  )
160
247
  )
161
248
 
249
+ # Show cost estimate when previewing URLs (--show-urls)
250
+ if args.show_urls and result:
251
+ num_urls_value = result.get("num_urls", 0)
252
+ # Type guard to ensure we have an int
253
+ if isinstance(num_urls_value, int):
254
+ print(
255
+ f"\nEstimated cost for {num_urls_value} pages: {estimate_cost(num_urls_value, args.model)}"
256
+ )
257
+ print("Note: Actual cost may vary based on page content size and caching")
258
+
162
259
  except KeyboardInterrupt:
163
260
  print("\nOperation cancelled by user.", file=sys.stderr)
164
261
  sys.exit(1)
@@ -9,6 +9,32 @@ DEFAULT_OPENAI_MODEL = "gpt-5-mini"
9
9
  # Docs Directory
10
10
  DOCS_DIR = "~/.claude/docs" # Will be expanded to full path at runtime
11
11
 
12
+ # Default Cache Directory
13
+ DEFAULT_CACHE_DIR = ".llmsbrieftxt_cache"
14
+
15
+ # Default Crawl Depth
16
+ DEFAULT_CRAWL_DEPTH = 3
17
+
18
+ # OpenAI Pricing (per 1M tokens) - prices subject to change
19
+ # Format: {model: (input_price, output_price)}
20
+ # Note: Verify current pricing at https://openai.com/api/pricing/
21
+ OPENAI_PRICING = {
22
+ "gpt-5-mini": (0.15, 0.60), # $0.15 input, $0.60 output per 1M tokens
23
+ "gpt-4o-mini": (0.15, 0.60),
24
+ "gpt-4o": (2.50, 10.00),
25
+ "gpt-4-turbo": (10.00, 30.00),
26
+ "gpt-4": (30.00, 60.00),
27
+ }
28
+
29
+ # Estimated tokens per page for cost calculation
30
+ # These estimates are based on typical documentation page sizes:
31
+ # - Input: ~2000-4000 words per doc page → ~3000 tokens (conservative estimate)
32
+ # - Output: ~300 tokens for structured PageSummary with all fields
33
+ # Accuracy: Estimates typically within ±30% of actual cost
34
+ # Pages with code examples or very long content may exceed these estimates
35
+ ESTIMATED_TOKENS_PER_PAGE_INPUT = 3000
36
+ ESTIMATED_TOKENS_PER_PAGE_OUTPUT = 400
37
+
12
38
 
13
39
  # Prompt Templates
14
40
  DEFAULT_SUMMARY_PROMPT = """You are a specialized content analyzer creating structured summaries for llms-brief.txt files. Your role is to help LLMs understand web content by providing comprehensive yet concise summaries.
@@ -0,0 +1,227 @@
1
+ """Main generation pipeline for llmsbrieftxt."""
2
+
3
+ import json
4
+ import re
5
+ from pathlib import Path
6
+
7
+ from llmsbrieftxt.doc_loader import DocLoader
8
+ from llmsbrieftxt.extractor import default_extractor
9
+ from llmsbrieftxt.summarizer import Summarizer
10
+
11
+
12
+ def extract_url_from_summary(summary: str) -> str | None:
13
+ """
14
+ Extract URL from a summary in the format: Title: [title](URL).
15
+
16
+ Args:
17
+ summary: Formatted summary string
18
+
19
+ Returns:
20
+ Extracted URL or None if not found
21
+ """
22
+ # Match markdown link format: [text](url)
23
+ match = re.search(r"\[([^\]]+)\]\(([^)]+)\)", summary)
24
+ if match:
25
+ return match.group(2)
26
+ return None
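For instance, `extract_url_from_summary("Title: [Quickstart](https://x.dev/quickstart)")` returns `"https://x.dev/quickstart"` (the example URL is illustrative).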
27
+
28
+
29
+ def ensure_directory_exists(file_path: str) -> None:
30
+ """Ensure the parent directory of the given file path exists.
31
+
32
+ Args:
33
+ file_path: Path to the file whose parent directory should be created
34
+
35
+ Raises:
36
+ RuntimeError: If directory creation fails due to permissions or other issues
37
+ """
38
+ dir_path = Path(file_path).parent
39
+ if dir_path == Path("."):
40
+ return # Current directory, no need to create
41
+
42
+ try:
43
+ dir_path.mkdir(parents=True, exist_ok=True)
44
+ if not dir_path.exists():
45
+ print(f"Created directory: {dir_path}")
46
+ except OSError as e:
47
+ raise RuntimeError(f"Failed to create directory {dir_path}: {e}") from e
48
+
49
+
50
+ async def generate_llms_txt(
51
+ url: str,
52
+ llm_name: str = "o4-mini",
53
+ max_concurrent_summaries: int = 10,
54
+ output_path: str = "llms.txt",
55
+ show_urls: bool = False,
56
+ max_urls: int | None = None,
57
+ max_depth: int = 3,
58
+ cache_dir: str = ".llmsbrieftxt_cache",
59
+ use_cache_only: bool = False,
60
+ force_refresh: bool = False,
61
+ ) -> dict[str, int | list[str]] | None:
62
+ """
63
+ Generate llms-brief.txt file from a documentation website.
64
+
65
+ Args:
66
+ url: URL of the documentation site to crawl
67
+ llm_name: OpenAI model to use for summarization
68
+ max_concurrent_summaries: Maximum concurrent LLM requests
69
+ output_path: Path to write the output file
70
+ show_urls: If True, only show discovered URLs without processing
71
+ max_urls: Maximum number of URLs to discover/process
72
+ max_depth: Maximum crawl depth for URL discovery
73
+ cache_dir: Directory to store cached summaries
74
+ use_cache_only: If True, only use cached summaries (no API calls)
75
+ force_refresh: If True, ignore cache and regenerate all summaries
76
+
77
+ Returns:
78
+ Dictionary with metadata (for show_urls mode) or None
79
+ """
80
+ urls_processed = 0
81
+ summaries_generated = 0
82
+ failed_urls: set[str] = set() # Use set to avoid duplicates
83
+
84
+ # Set up cache directory
85
+ cache_path = Path(cache_dir)
86
+ cache_path.mkdir(parents=True, exist_ok=True)
87
+ cache_file = cache_path / "summaries.json"
88
+
89
+ # Load existing summaries from cache if available (unless force refresh)
90
+ existing_summaries: dict[str, str] = {}
91
+ if cache_file.exists() and not force_refresh:
92
+ try:
93
+ with open(cache_file) as f:
94
+ existing_summaries = json.load(f)
95
+ print(f"Found {len(existing_summaries)} cached summaries")
96
+ except Exception as e:
97
+ print(f"Warning: Could not load cache: {str(e)}")
98
+ elif force_refresh and cache_file.exists():
99
+ print("Force refresh enabled - ignoring existing cache")
100
+
101
+ extractor = default_extractor
102
+ output_file = output_path
103
+
104
+ # If show_urls is True, just show discovered URLs and exit
105
+ if show_urls:
106
+ print("Discovering documentation URLs...")
107
+ doc_loader = DocLoader(max_urls=max_urls, max_depth=max_depth)
108
+ _, discovered_urls = await doc_loader.load_docs(
109
+ url, extractor=extractor, show_urls=True
110
+ )
111
+ print("\nDiscovered URLs:")
112
+ for discovered_url in discovered_urls:
113
+ print(f" - {discovered_url}")
114
+ print(f"\nTotal: {len(discovered_urls)} unique URLs")
115
+
116
+ # Calculate how many would be cached vs new
117
+ num_cached = sum(1 for u in discovered_urls if u in existing_summaries)
118
+ num_new = len(discovered_urls) - num_cached
119
+ if existing_summaries:
120
+ print(f"Cached: {num_cached} | New: {num_new}")
121
+
122
+ return {"num_urls": len(discovered_urls), "failed_urls": []}
123
+
124
+ # Load and process documents
125
+ doc_loader = DocLoader(max_urls=max_urls, max_depth=max_depth)
126
+ docs, discovered_urls = await doc_loader.load_docs(url, extractor=extractor)
127
+ urls_processed = len(docs)
128
+
129
+ # Track which URLs failed to load
130
+ loaded_urls = {doc.metadata.get("source") for doc in docs}
131
+ failed_urls.update(u for u in discovered_urls if u not in loaded_urls)
132
+
133
+ # Handle cache-only mode
134
+ if use_cache_only:
135
+ print("\nCache-only mode: Using only cached summaries")
136
+ summaries: list[str] = []
137
+ for doc in docs:
138
+ doc_url = doc.metadata.get("source", "")
139
+ if doc_url in existing_summaries:
140
+ summaries.append(existing_summaries[doc_url])
141
+ else:
142
+ print(f" Warning: No cache for {doc_url}")
143
+ failed_urls.add(doc_url)
144
+ summaries_generated = len(summaries)
145
+ else:
146
+ # Initialize summarizer
147
+ print(f"\nGenerating summaries with {llm_name}...")
148
+ summarizer = Summarizer(
149
+ llm_name=llm_name,
150
+ max_concurrent=max_concurrent_summaries,
151
+ )
152
+
153
+ summaries: list[str] = []
154
+ try:
155
+ summaries = await summarizer.summarize_all(
156
+ docs, existing_summaries=existing_summaries, cache_file=cache_file
157
+ )
158
+ summaries_generated = len(summaries)
159
+
160
+ # Track URLs that failed summarization by extracting URLs from summaries
161
+ summarized_urls: set[str] = set()
162
+ for summary in summaries:
163
+ if summary:
164
+ extracted_url: str | None = extract_url_from_summary(summary)
165
+ if extracted_url:
166
+ summarized_urls.add(extracted_url)
167
+
168
+ # Add docs that weren't successfully summarized to failed_urls
169
+ for doc in docs:
170
+ doc_url = doc.metadata.get("source", "")
171
+ if doc_url and doc_url not in summarized_urls:
172
+ failed_urls.add(doc_url)
173
+ except KeyboardInterrupt:
174
+ print("Process interrupted by user. Saving partial results...")
175
+ if cache_file.exists():
176
+ try:
177
+ with open(cache_file) as f:
178
+ partial_summaries = json.load(f)
179
+ summaries = list(partial_summaries.values())
180
+ summaries_generated = len(summaries)
181
+ print(f"Recovered {len(summaries)} summaries from cache")
182
+ except Exception:
183
+ # Silently ignore cache read errors during interrupt recovery
184
+ # If we can't recover from cache, we'll continue with empty results
185
+ pass
186
+ except Exception as e:
187
+ print(f"Summarization process error: {str(e)}")
188
+ if cache_file.exists():
189
+ try:
190
+ with open(cache_file) as f:
191
+ partial_summaries = json.load(f)
192
+ summaries = list(partial_summaries.values())
193
+ summaries_generated = len(summaries)
194
+ print(
195
+ f"Recovered {len(summaries)} partial summaries from cache"
196
+ )
197
+ except Exception:
198
+ # If cache recovery fails during error handling, continue with empty results
199
+ summaries = []
200
+ finally:
201
+ # Write results to file
202
+ if summaries:
203
+ ensure_directory_exists(output_file)
204
+ output_content = "".join(summaries)
205
+ Path(output_file).write_text(output_content, encoding="utf-8")
206
+ else:
207
+ ensure_directory_exists(output_file)
208
+ Path(output_file).write_text("", encoding="utf-8")
209
+
210
+ # Print summary
211
+ print(f"\n{'=' * 50}")
212
+ print(f"Processed: {summaries_generated}/{urls_processed} pages")
213
+ if urls_processed > 0:
214
+ success_rate = summaries_generated / urls_processed * 100
215
+ print(f"Success rate: {success_rate:.1f}%")
216
+ print(f"Output: {output_file}")
217
+
218
+ # Report failed URLs
219
+ if failed_urls:
220
+ print(f"Failed URLs: {len(failed_urls)}")
221
+ failed_file = Path(output_file).parent / "failed_urls.txt"
222
+ # Sort URLs for consistent output
223
+ failed_file.write_text("\n".join(sorted(failed_urls)), encoding="utf-8")
224
+ print(f"Failed URLs written to: {failed_file}")
225
+ print(f"{'=' * 50}")
226
+
227
+ return None
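For reference, a hedged sketch of calling the new signature programmatically (the tool is normally invoked through the `llmtxt` CLI, and the argument values here are illustrative):

```python
import asyncio
from llmsbrieftxt.main import generate_llms_txt

result = asyncio.run(
    generate_llms_txt(
        url="https://docs.example.com/",
        llm_name="gpt-5-mini",
        max_concurrent_summaries=10,
        output_path="docs.example.com.txt",
        show_urls=True,   # preview mode: returns {"num_urls": ..., "failed_urls": []}
        max_depth=2,
        cache_dir=".llmsbrieftxt_cache",
    )
)
print(result)
```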
@@ -7,7 +7,7 @@ packages = ["llmsbrieftxt"]
7
7
 
8
8
  [project]
9
9
  name = "llmsbrieftxt"
10
- version = "1.3.1"
10
+ version = "1.4.0"
11
11
  description = "Generate llms-brief.txt files from documentation websites using AI"
12
12
  readme = "README.md"
13
13
  requires-python = ">=3.10"
@@ -1,8 +1,10 @@
1
1
  """Tests for CLI argument parsing."""
2
2
 
3
+ import os
4
+
3
5
  import pytest
4
6
 
5
- from llmsbrieftxt.cli import parse_args, validate_url
7
+ from llmsbrieftxt.cli import estimate_cost, get_output_path, parse_args, validate_url
6
8
 
7
9
 
8
10
  class TestCLIArgumentParsing:
@@ -110,6 +112,61 @@ class TestCLIArgumentParsing:
110
112
  args = parse_args(["https://example.com", "--max-urls", "50"])
111
113
  assert args.max_urls == 50
112
114
 
115
+ @pytest.mark.unit
116
+ def test_depth_flag_default(self):
117
+ """Test default depth value."""
118
+ args = parse_args(["https://example.com"])
119
+ assert args.depth == 3
120
+
121
+ @pytest.mark.unit
122
+ def test_depth_flag_custom(self):
123
+ """Test custom depth value."""
124
+ args = parse_args(["https://example.com", "--depth", "5"])
125
+ assert args.depth == 5
126
+
127
+ @pytest.mark.unit
128
+ def test_cache_dir_flag_default(self):
129
+ """Test default cache directory."""
130
+ args = parse_args(["https://example.com"])
131
+ assert args.cache_dir == ".llmsbrieftxt_cache"
132
+
133
+ @pytest.mark.unit
134
+ def test_cache_dir_flag_custom(self):
135
+ """Test custom cache directory."""
136
+ args = parse_args(["https://example.com", "--cache-dir", "/tmp/mycache"])
137
+ assert args.cache_dir == "/tmp/mycache"
138
+
139
+ @pytest.mark.unit
140
+ def test_use_cache_only_flag(self):
141
+ """Test --use-cache-only flag."""
142
+ args = parse_args(["https://example.com", "--use-cache-only"])
143
+ assert args.use_cache_only is True
144
+
145
+ @pytest.mark.unit
146
+ def test_force_refresh_flag(self):
147
+ """Test --force-refresh flag."""
148
+ args = parse_args(["https://example.com", "--force-refresh"])
149
+ assert args.force_refresh is True
150
+
151
+ @pytest.mark.unit
152
+ def test_all_new_arguments_together(self):
153
+ """Test all new arguments combined."""
154
+ args = parse_args(
155
+ [
156
+ "https://example.com",
157
+ "--depth",
158
+ "2",
159
+ "--cache-dir",
160
+ "custom_cache",
161
+ "--max-urls",
162
+ "100",
163
+ ]
164
+ )
165
+ assert args.url == "https://example.com"
166
+ assert args.depth == 2
167
+ assert args.cache_dir == "custom_cache"
168
+ assert args.max_urls == 100
169
+
113
170
  @pytest.mark.unit
114
171
  def test_no_url_exits(self):
115
172
  """Test that providing no URL exits with error."""
@@ -154,3 +211,73 @@ class TestURLValidation:
154
211
  def test_invalid_url_malformed(self):
155
212
  """Test invalid malformed URL."""
156
213
  assert validate_url("not-a-url") is False
214
+
215
+
216
+ class TestCostEstimation:
217
+ """Tests for cost estimation."""
218
+
219
+ @pytest.mark.unit
220
+ def test_cost_estimate_small_job(self):
221
+ """Test cost estimation for small number of pages."""
222
+ cost = estimate_cost(10, "gpt-5-mini")
223
+ assert cost.startswith("~$")
224
+ assert "$0." in cost
225
+
226
+ @pytest.mark.unit
227
+ def test_cost_estimate_medium_job(self):
228
+ """Test cost estimation for medium number of pages."""
229
+ cost = estimate_cost(100, "gpt-4o-mini")
230
+ assert cost.startswith("~$")
231
+ assert "$" in cost
232
+
233
+ @pytest.mark.unit
234
+ def test_cost_estimate_large_job(self):
235
+ """Test cost estimation for large number of pages."""
236
+ cost = estimate_cost(500, "gpt-4o")
237
+ assert cost.startswith("~$")
238
+ assert float(cost.replace("~$", "")) > 1.0
239
+
240
+ @pytest.mark.unit
241
+ def test_cost_estimate_unknown_model(self):
242
+ """Test cost estimation for unknown model."""
243
+ cost = estimate_cost(100, "unknown-model")
244
+ assert "not available" in cost
245
+
246
+ @pytest.mark.unit
247
+ def test_cost_estimate_zero_pages(self):
248
+ """Test cost estimation for zero pages."""
249
+ cost = estimate_cost(0, "gpt-5-mini")
250
+ assert cost == "~$0.0000"
251
+
252
+
253
+ class TestOutputPathExpansion:
254
+ """Tests for output path with environment variable expansion."""
255
+
256
+ @pytest.mark.unit
257
+ def test_output_path_with_tilde_expansion(self):
258
+ """Test output path with ~ expands to home directory."""
259
+ path = get_output_path("https://example.com", "~/docs/output.txt")
260
+ assert "~" not in str(path)
261
+ assert str(path).startswith(os.path.expanduser("~"))
262
+
263
+ @pytest.mark.unit
264
+ def test_output_path_with_env_var(self, monkeypatch):
265
+ """Test output path with $VAR environment variable."""
266
+ monkeypatch.setenv("MYDIR", "/tmp/testdir")
267
+ path = get_output_path("https://example.com", "$MYDIR/output.txt")
268
+ assert str(path) == "/tmp/testdir/output.txt"
269
+
270
+ @pytest.mark.unit
271
+ def test_output_path_with_env_var_braces(self, monkeypatch):
272
+ """Test output path with ${VAR} environment variable."""
273
+ monkeypatch.setenv("TESTDIR", "/tmp/test")
274
+ path = get_output_path("https://example.com", "${TESTDIR}/docs/output.txt")
275
+ assert str(path) == "/tmp/test/docs/output.txt"
276
+
277
+ @pytest.mark.unit
278
+ def test_output_path_default_no_expansion(self):
279
+ """Test default output path (no custom path) works correctly."""
280
+ path = get_output_path("https://docs.example.com")
281
+ # Should contain .claude/docs in path
282
+ assert ".claude/docs" in str(path)
283
+ assert str(path).endswith("docs.example.com.txt")
@@ -1,142 +0,0 @@
1
- """Main generation pipeline for llmsbrieftxt."""
2
-
3
- import json
4
- from pathlib import Path
5
-
6
- from llmsbrieftxt.doc_loader import DocLoader
7
- from llmsbrieftxt.extractor import default_extractor
8
- from llmsbrieftxt.summarizer import Summarizer
9
-
10
-
11
- def ensure_directory_exists(file_path: str) -> None:
12
- """Ensure the parent directory of the given file path exists.
13
-
14
- Args:
15
- file_path: Path to the file whose parent directory should be created
16
-
17
- Raises:
18
- RuntimeError: If directory creation fails due to permissions or other issues
19
- """
20
- dir_path = Path(file_path).parent
21
- if dir_path == Path("."):
22
- return # Current directory, no need to create
23
-
24
- try:
25
- dir_path.mkdir(parents=True, exist_ok=True)
26
- if not dir_path.exists():
27
- print(f"Created directory: {dir_path}")
28
- except OSError as e:
29
- raise RuntimeError(f"Failed to create directory {dir_path}: {e}") from e
30
-
31
-
32
- async def generate_llms_txt(
33
- url: str,
34
- llm_name: str = "o4-mini",
35
- max_concurrent_summaries: int = 10,
36
- output_path: str = "llms.txt",
37
- show_urls: bool = False,
38
- max_urls: int | None = None,
39
- ) -> None:
40
- """
41
- Generate llms-brief.txt file from a documentation website.
42
-
43
- Args:
44
- url: URL of the documentation site to crawl
45
- llm_name: OpenAI model to use for summarization
46
- max_concurrent_summaries: Maximum concurrent LLM requests
47
- output_path: Path to write the output file
48
- show_urls: If True, only show discovered URLs without processing
49
- max_urls: Maximum number of URLs to discover/process
50
- """
51
- urls_processed = 0
52
- summaries_generated = 0
53
-
54
- # Set up cache directory
55
- cache_dir = Path(".llmsbrieftxt_cache")
56
- cache_dir.mkdir(exist_ok=True)
57
- cache_file = cache_dir / "summaries.json"
58
-
59
- # Load existing summaries from cache if available
60
- existing_summaries: dict[str, str] = {}
61
- if cache_file.exists():
62
- try:
63
- with open(cache_file) as f:
64
- existing_summaries = json.load(f)
65
- print(f"Using {len(existing_summaries)} cached summaries")
66
- except Exception as e:
67
- print(f"Warning: Could not load cache: {str(e)}")
68
-
69
- extractor = default_extractor
70
- output_file = output_path
71
-
72
- # If show_urls is True, just show discovered URLs and exit
73
- if show_urls:
74
- print("Discovering documentation URLs...")
75
- doc_loader = DocLoader(max_urls=max_urls)
76
- _, discovered_urls = await doc_loader.load_docs(
77
- url, extractor=extractor, show_urls=True
78
- )
79
- print("\nDiscovered URLs:")
80
- for discovered_url in discovered_urls:
81
- print(f" - {discovered_url}")
82
- print(f"\nTotal: {len(discovered_urls)} unique URLs")
83
- return
84
-
85
- # Load and process documents
86
- doc_loader = DocLoader(max_urls=max_urls)
87
- docs, discovered_urls = await doc_loader.load_docs(url, extractor=extractor)
88
- urls_processed = len(docs)
89
-
90
- # Initialize summarizer
91
- print(f"\nGenerating summaries with {llm_name}...")
92
- summarizer = Summarizer(
93
- llm_name=llm_name,
94
- max_concurrent=max_concurrent_summaries,
95
- )
96
-
97
- summaries = []
98
- try:
99
- summaries = await summarizer.summarize_all(
100
- docs, existing_summaries=existing_summaries, cache_file=cache_file
101
- )
102
- summaries_generated = len(summaries)
103
- except KeyboardInterrupt:
104
- print("Process interrupted by user. Saving partial results...")
105
- if cache_file.exists():
106
- try:
107
- with open(cache_file) as f:
108
- partial_summaries = json.load(f)
109
- summaries = list(partial_summaries.values())
110
- summaries_generated = len(summaries)
111
- print(f"Recovered {len(summaries)} summaries from cache")
112
- except Exception:
113
- pass
114
- except Exception as e:
115
- print(f"Summarization process error: {str(e)}")
116
- if cache_file.exists():
117
- try:
118
- with open(cache_file) as f:
119
- partial_summaries = json.load(f)
120
- summaries = list(partial_summaries.values())
121
- summaries_generated = len(summaries)
122
- print(f"Recovered {len(summaries)} partial summaries from cache")
123
- except Exception:
124
- summaries = []
125
- finally:
126
- # Write results to file
127
- if summaries:
128
- ensure_directory_exists(output_file)
129
- output_content = "".join(summaries)
130
- Path(output_file).write_text(output_content, encoding="utf-8")
131
- else:
132
- ensure_directory_exists(output_file)
133
- Path(output_file).write_text("", encoding="utf-8")
134
-
135
- # Print summary
136
- print(f"\n{'=' * 50}")
137
- print(f"Processed: {summaries_generated}/{urls_processed} pages")
138
- if urls_processed > 0:
139
- success_rate = summaries_generated / urls_processed * 100
140
- print(f"Success rate: {success_rate:.1f}%")
141
- print(f"Output: {output_file}")
142
- print(f"{'=' * 50}")