llmsbrieftxt 1.5.0__py3-none-any.whl → 1.11.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
llmsbrieftxt/cli.py CHANGED
@@ -254,7 +254,7 @@ def main() -> None:
         )
         )
 
-        # Show cost estimate and failed URLs if available
+        # Show cost estimate if in show-urls mode
         if args.show_urls and result:
            num_urls_value = result.get("num_urls", 0)
            # Type guard to ensure we have an int
@@ -264,6 +264,12 @@ def main() -> None:
         )
         print("Note: Actual cost may vary based on page content size and caching")
 
+        # Check success and exit with appropriate code
+        if result is not None:
+            success = result.get("success", True)
+            if not success:
+                sys.exit(1)
+
     except KeyboardInterrupt:
         print("\nOperation cancelled by user.", file=sys.stderr)
         sys.exit(1)
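The lines added above make the CLI's exit status reflect the `success` flag that `generate_llms_txt` now returns (see main.py below). A minimal sketch of that contract, using a hypothetical standalone helper rather than the actual `main()` wiring:

```python
import sys

def exit_code_for(result: dict | None) -> int:
    # None (no metadata returned) is treated as success, matching
    # result.get("success", True) in the diff above.
    if result is not None and not result.get("success", True):
        return 1
    return 0

assert exit_code_for(None) == 0
assert exit_code_for({"success": True, "new_summaries": 3}) == 0
assert exit_code_for({"success": False, "new_summaries": 0}) == 1
```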
llmsbrieftxt/crawler.py CHANGED
@@ -183,6 +183,10 @@ class RobustDocCrawler:
 
         # Process in batches
         for i in range(0, len(current_level), self.max_concurrent):
+            # Check if we've reached max_urls before processing next batch
+            if len(discovered) >= self.max_urls:
+                break
+
             batch = current_level[i : i + self.max_concurrent]
             tasks = [
                 self._extract_links(url, client, base_path) for url in batch
@@ -190,6 +194,10 @@ class RobustDocCrawler:
             results = await asyncio.gather(*tasks, return_exceptions=True)
 
             for url, result in zip(batch, results, strict=False):
+                # Check max_urls before adding each URL
+                if len(discovered) >= self.max_urls:
+                    break
+
                 visited.add(url)
                 discovered.add(url)
 
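Together the two guards cap discovery strictly at `max_urls`: the first stops before a new batch launches, the second stops mid-batch. A self-contained sketch of the pattern, with hypothetical stand-ins for the crawler's state (`current_level`, `discovered`, `max_concurrent`):

```python
discovered: set[str] = set()
max_urls, max_concurrent = 5, 2
current_level = [f"https://example.com/page{n}" for n in range(10)]

for i in range(0, len(current_level), max_concurrent):
    if len(discovered) >= max_urls:  # stop before launching another batch
        break
    batch = current_level[i : i + max_concurrent]
    for url in batch:
        if len(discovered) >= max_urls:  # stop mid-batch as well
            break
        discovered.add(url)

assert len(discovered) == max_urls  # the cap is never overshot
```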
llmsbrieftxt/main.py CHANGED
@@ -148,7 +148,7 @@ async def generate_llms_txt(
     use_cache_only: bool = False,
     force_refresh: bool = False,
     skip_confirmation: bool = False,
-) -> dict[str, int | list[str]] | None:
+) -> dict[str, int | list[str] | bool] | None:
     """
     Generate llms-brief.txt file from a documentation website.
 
@@ -166,10 +166,11 @@ async def generate_llms_txt(
         skip_confirmation: If True, skip confirmation prompt for high costs
 
     Returns:
-        Dictionary with metadata (for show_urls mode) or None
+        Dictionary with metadata including 'success' boolean (for show_urls mode returns dict, otherwise None on success)
     """
     urls_processed = 0
     summaries_generated = 0
+    new_summaries_generated = 0  # Track new (non-cached) summaries
     failed_urls: set[str] = set()  # Use set to avoid duplicates
 
     # Set up cache directory
@@ -217,7 +218,7 @@ async def generate_llms_txt(
         if existing_summaries:
             print(f"Cached: {num_cached} | New: {num_new}")
 
-        return {"num_urls": len(discovered_urls), "failed_urls": []}
+        return {"num_urls": len(discovered_urls), "failed_urls": [], "success": True}
 
     # Load and process documents
     doc_loader = DocLoader(max_urls=max_urls, max_depth=max_depth)
@@ -263,6 +264,12 @@ async def generate_llms_txt(
 
     # Handle cache-only mode
     usage_stats: dict[str, int] = {"input_tokens": 0, "output_tokens": 0}
+    num_docs_to_process = len(docs)
+    num_cached_used = sum(
+        1 for doc in docs if doc.metadata.get("source", "") in existing_summaries
+    )
+    num_new_needed = num_docs_to_process - num_cached_used
+
     if use_cache_only:
         print("\nCache-only mode: Using only cached summaries")
         summaries: list[str] = []
@@ -274,6 +281,7 @@ async def generate_llms_txt(
                 print(f" Warning: No cache for {doc_url}")
                 failed_urls.add(doc_url)
         summaries_generated = len(summaries)
+        new_summaries_generated = 0  # No new summaries in cache-only mode
     else:
         # Initialize summarizer
         print(f"\nGenerating summaries with {llm_name}...")
@@ -288,6 +296,8 @@ async def generate_llms_txt(
                 docs, existing_summaries=existing_summaries, cache_file=cache_file
             )
             summaries_generated = len(summaries)
+            # Calculate new summaries (total - cached)
+            new_summaries_generated = summaries_generated - num_cached_used
 
             # Track URLs that failed summarization by extracting URLs from summaries
             summarized_urls: set[str] = set()
@@ -304,12 +314,16 @@ async def generate_llms_txt(
                         failed_urls.add(doc_url)
         except KeyboardInterrupt:
             print("Process interrupted by user. Saving partial results...")
+            new_summaries_generated = 0  # Initialize in case recovery fails
             if cache_file.exists():
                 try:
                     with open(cache_file) as f:
                         partial_summaries = json.load(f)
                     summaries = list(partial_summaries.values())
                     summaries_generated = len(summaries)
+                    new_summaries_generated = max(
+                        0, summaries_generated - num_cached_used
+                    )
                     print(f"Recovered {len(summaries)} summaries from cache")
                 except Exception:
                     # Silently ignore cache read errors during interrupt recovery
@@ -317,12 +331,16 @@ async def generate_llms_txt(
                     pass
         except Exception as e:
             print(f"Summarization process error: {str(e)}")
+            new_summaries_generated = 0  # Initialize in case recovery fails
             if cache_file.exists():
                 try:
                     with open(cache_file) as f:
                         partial_summaries = json.load(f)
                     summaries = list(partial_summaries.values())
                     summaries_generated = len(summaries)
+                    new_summaries_generated = max(
+                        0, summaries_generated - num_cached_used
+                    )
                     print(
                         f"Recovered {len(summaries)} partial summaries from cache"
                     )
@@ -376,4 +394,31 @@ async def generate_llms_txt(
         print(f"Failed URLs written to: {failed_file}")
     print(f"{'=' * 50}")
 
+    # Determine success based on whether we generated new summaries when needed
+    success = True
+    if not use_cache_only:
+        # If there were new pages that needed API calls
+        if num_new_needed > 0:
+            # Success only if we generated at least one new summary
+            if new_summaries_generated == 0:
+                print("\nERROR: All API calls failed - no new summaries generated")
+                success = False
+            elif new_summaries_generated < num_new_needed:
+                print(
+                    f"\nWARNING: Some API calls failed ({new_summaries_generated}/{num_new_needed} successful)"
+                )
+        # If all pages were cached, that's fine
+    else:
+        # Cache-only mode: success if we have any summaries
+        success = summaries_generated > 0
+        if not success:
+            print("\nERROR: No cached summaries found")
+
+    # Return success indicator (for CLI exit code)
+    return {
+        "success": success,
+        "summaries_generated": summaries_generated,
+        "new_summaries": new_summaries_generated,
+    }
+
     return None
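The success rule added in the last hunk distills to a small decision function. A worked illustration (not part of the package API) that matches the exit-code behavior documented in the README below:

```python
def is_success(use_cache_only: bool, num_new_needed: int,
               new_summaries_generated: int, summaries_generated: int) -> bool:
    if use_cache_only:
        return summaries_generated > 0      # any cached summary counts
    if num_new_needed > 0:
        return new_summaries_generated > 0  # at least one API call succeeded
    return True                             # everything was already cached

# All API calls failed -> ERROR, exit 1
assert is_success(False, num_new_needed=4, new_summaries_generated=0, summaries_generated=6) is False
# Partial failure -> WARNING, but still exit 0
assert is_success(False, num_new_needed=4, new_summaries_generated=2, summaries_generated=6) is True
# Fully cached run needs no API calls
assert is_success(False, num_new_needed=0, new_summaries_generated=0, summaries_generated=6) is True
# --use-cache-only with an empty cache fails
assert is_success(True, num_new_needed=0, new_summaries_generated=0, summaries_generated=0) is False
```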
llmsbrieftxt/summarizer.py CHANGED
@@ -25,15 +25,7 @@ from .schema import Document, PageSummary
 logger = logging.getLogger(__name__)
 
 
-# Fallback summary used when LLM summarization fails
-FALLBACK_SUMMARY = PageSummary(
-    content_analysis="This page contains web content relevant to the topic.",
-    primary_use_cases="When accessing general web content",
-    key_takeaways="Contains general information",
-    related_topics="Web content",
-    keywords="web, content, information",
-    concise_summary="This page contains web content relevant to the topic.",
-)
+# Note: No fallback summary - we want failures to be properly reported
 
 
 class Summarizer:
@@ -62,6 +54,9 @@ class Summarizer:
             raise ValueError(
                 "OPENAI_API_KEY environment variable is required. Please set your OpenAI API key in your environment variables."
             )
+        base_url = os.getenv("OPENAI_BASE_URL")
+        if base_url:
+            return AsyncOpenAI(api_key=api_key, base_url=base_url)
         return AsyncOpenAI(api_key=api_key)
 
     @retry(
@@ -97,8 +92,8 @@ class Summarizer:
             },
         )
 
-    async def _summarize(self, doc: Any, loop: Any) -> PageSummary:
-        """Summarize document using OpenAI API."""
+    async def _summarize(self, doc: Any, loop: Any) -> PageSummary | None:
+        """Summarize document using OpenAI API. Returns None on failure."""
         url = doc.metadata.get("source", "unknown")
         try:
             # Truncate content if it's too long (keep first 10000 chars for now)
@@ -164,7 +159,7 @@ class Summarizer:
 
         except Exception as e:
             # Log with full traceback for debugging
-            logger.exception(
+            logger.error(
                 f"Failed to summarize {url}: {str(e)}",
                 exc_info=e,
                 extra={
@@ -172,8 +167,8 @@ class Summarizer:
                     "model": self.llm_name,
                 },
             )
-            # Return cached fallback PageSummary object
-            return FALLBACK_SUMMARY
+            # Return None to indicate failure (no fallback)
+            return None
 
     async def summarize_document(
         self, doc: Any, cache_file: Path | None = None
@@ -184,6 +179,11 @@ class Summarizer:
             loop = asyncio.get_event_loop()
             page_summary = await self._summarize(doc, loop)
 
+            # Check if summarization failed
+            if page_summary is None:
+                logger.warning(f"Summarization failed for {url}")
+                return None
+
             # Format the summary with new structure
             title = doc.metadata.get("title", url.split("/")[-1])
             formatted_summary = f"Title: [{title}]({url})\nKeywords: {page_summary.keywords}\nSummary: {page_summary.concise_summary}\n\n"
@@ -194,7 +194,7 @@ class Summarizer:
 
             return formatted_summary
         except Exception as e:
-            logger.exception(
+            logger.error(
                 f"Error summarizing {url}: {str(e)}",
                 exc_info=e,
                 extra={"url": url},
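With the fallback removed, `None` now propagates from `_summarize` through `summarize_document`, so callers must treat a missing summary as a reported failure rather than silently receiving placeholder text. A sketch of a hypothetical caller under that contract (`summarizer` and `docs` are assumed inputs, not names from the package):

```python
async def summarize_all(summarizer, docs) -> tuple[list[str], set[str]]:
    summaries: list[str] = []
    failed: set[str] = set()
    for doc in docs:
        formatted = await summarizer.summarize_document(doc)
        if formatted is None:  # failure is surfaced, not masked by a fallback
            failed.add(doc.metadata.get("source", "unknown"))
        else:
            summaries.append(formatted)
    return summaries, failed
```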
llmsbrieftxt-1.5.0.dist-info/METADATA → llmsbrieftxt-1.11.1.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: llmsbrieftxt
-Version: 1.5.0
+Version: 1.11.1
 Summary: Generate llms-brief.txt files from documentation websites using AI
 Project-URL: Homepage, https://github.com/stevennevins/llmsbrief
 Project-URL: Repository, https://github.com/stevennevins/llmsbrief
@@ -99,10 +99,10 @@ Output is automatically saved to `~/.claude/docs/<domain>.txt` (e.g., `docs.pyth
 - `--model MODEL` - OpenAI model to use (default: `gpt-5-mini`)
 - `--max-concurrent-summaries N` - Concurrent LLM requests (default: 10)
 - `--show-urls` - Preview discovered URLs with cost estimate (no API calls)
-- `--max-urls N` - Limit number of URLs to process
+- `--max-urls N` - Strictly limit number of URLs to process (may stop mid-crawl)
 - `--depth N` - Maximum crawl depth (default: 3)
 - `--cache-dir PATH` - Cache directory path (default: `.llmsbrieftxt_cache`)
-- `--use-cache-only` - Use only cached summaries, skip API calls for new pages
+- `--use-cache-only` - Use only cached summaries (fails with exit 1 if no cache exists)
 - `--force-refresh` - Ignore cache and regenerate all summaries
 
 ### Examples
@@ -244,6 +244,42 @@ uv run pytest tests/unit/test_cli.py
 uv run pytest -v
 ```
 
+### E2E Testing with Ollama (No API Costs)
+
+For testing without OpenAI API costs, use [Ollama](https://ollama.com) as a local LLM provider:
+
+```bash
+# 1. Install Ollama (one-time setup)
+curl -fsSL https://ollama.com/install.sh | sh
+# Or download from: https://ollama.com/download
+
+# 2. Start Ollama service
+ollama serve &
+
+# 3. Pull a lightweight model
+ollama pull tinyllama   # 637MB, fastest
+# Or: ollama pull phi3:mini   # 2.3GB, better quality
+
+# 4. Run E2E tests with Ollama
+export OPENAI_BASE_URL="http://localhost:11434/v1"
+export OPENAI_API_KEY="ollama-dummy-key"
+uv run pytest tests/integration/test_ollama_e2e.py -v
+
+# 5. Or test the CLI directly
+llmtxt https://example.com --model tinyllama --max-urls 5 --depth 1
+```
+
+**Benefits:**
+- ✅ Zero API costs - runs completely local
+- ✅ OpenAI-compatible endpoint
+- ✅ Same code path as production
+- ✅ Cached in GitHub Actions for CI/CD
+
+**Recommended Models:**
+- `tinyllama` (637MB) - Fastest, great for CI/CD
+- `phi3:mini` (2.3GB) - Better quality, still fast
+- `gemma2:2b` (1.6GB) - Balanced option
+
 ### Code Quality
 
 ```bash
@@ -270,6 +306,7 @@ uv run mypy llmsbrieftxt/
 ### Environment Variables
 
 - `OPENAI_API_KEY` - Required for all operations
+- `OPENAI_BASE_URL` - Optional. Set to use OpenAI-compatible endpoints (e.g., Ollama at `http://localhost:11434/v1`)
 
 ## Usage Tips
 
@@ -319,8 +356,45 @@ This tool is designed to work seamlessly with Claude Code. Once you've generated
 
 Generated llms-brief.txt files can be served via MCP (Model Context Protocol) servers. See the [mcpdoc project](https://github.com/langchain-ai/mcpdoc) for an example integration.
 
+## Exit Codes
+
+The CLI returns specific exit codes for scripting and automation:
+
+- `0` - Success (documentation generated successfully)
+- `1` - Failure (all API calls failed, no summaries generated, keyboard interrupt, or other errors)
+
+This enables reliable shell scripting:
+
+```bash
+if llmtxt https://docs.python.org/3/; then
+    echo "Documentation generated successfully"
+else
+    echo "Generation failed - check error message above"
+fi
+```
+
+### Exit Code Behavior by Mode
+
+- **Normal mode**: Exit 0 if any summaries generated (new or cached). Exit 1 only if no summaries generated.
+- **--use-cache-only mode**: Exit 0 if cached summaries found. Exit 1 if no cache exists.
+- **Partial failures**: Exit 0 if some summaries generated (shows WARNING). Exit 1 only if all API calls failed.
+
 ## Troubleshooting
 
+### Common Errors
+
+**"ERROR: All API calls failed - no new summaries generated"**
+- **Cause**: OpenAI API unavailable, authentication failed, or rate limited
+- **Solution**: Check `OPENAI_API_KEY`, verify API access, retry with `--force-refresh`, or reduce `--max-concurrent-summaries`
+
+**"ERROR: No cached summaries found"**
+- **Cause**: Using `--use-cache-only` but no cache exists at the specified location
+- **Solution**: Run without `--use-cache-only` to generate new summaries, or check `--cache-dir` location
+
+**"WARNING: Some API calls failed (X/Y successful)"**
+- **Cause**: Some but not all pages were successfully summarized
+- **Solution**: Check network connection, verify API key, retry with `--force-refresh`
+
 ### API Key Issues
 
 ```bash
llmsbrieftxt-1.11.1.dist-info/RECORD ADDED
@@ -0,0 +1,16 @@
+llmsbrieftxt/__init__.py,sha256=baAcEjLSYFIeNZF51tOMmA_zAMhN8HvKael-UU-Ruec,22
+llmsbrieftxt/cli.py,sha256=v8ZWykJ1QclX7zM7L_IpdoSkJ_TxkaNkZg6drngD4zU,8625
+llmsbrieftxt/constants.py,sha256=cjV_W5MqfVINM78__6eKnFPOGPHAI4ZYz8GqbIEEKz8,2565
+llmsbrieftxt/crawler.py,sha256=zmilV_QwO9pvrqQvjMZbP357_c5z9rvIIvRCBnLWZ1I,12884
+llmsbrieftxt/doc_loader.py,sha256=dGeHnEVCqtTQgdowMCFxrhrmh3QV5n8l3TIOgDYaU9g,5167
+llmsbrieftxt/extractor.py,sha256=28jckOcYf7u5zmZrhOZ-PmcWvPwTLZhMHxISSkFdeXk,1955
+llmsbrieftxt/main.py,sha256=vQOf0kHgI6MnQTeT4OBKxIDxzEY8RJhuCOytc4-7bZA,16565
+llmsbrieftxt/schema.py,sha256=ix9666XBpSbHUuYF1-jIK88sijK5Cvaer6gwbdLlWfs,2186
+llmsbrieftxt/summarizer.py,sha256=2dkOyuk20Xafo7qqazxcjr_Qct-8mcYgfJUPXOu3qAQ,10866
+llmsbrieftxt/url_filters.py,sha256=1KWO9yfPEqOIFXVts5xraErVQKPDAw4Nls3yuXzbRE8,2182
+llmsbrieftxt/url_utils.py,sha256=vFc_MNyLZ6QflhDF0oyiZJPYuF2_GyQmtKK7etwCmcs,2212
+llmsbrieftxt-1.11.1.dist-info/METADATA,sha256=qej3UnHZXh2oVyYxxtXinFCPjG-8JHN4LfBzWQH1jqY,13692
+llmsbrieftxt-1.11.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+llmsbrieftxt-1.11.1.dist-info/entry_points.txt,sha256=lY7gjN9DS7cv3Kd3LjezvgFBum7BhpMHSPGvdCzBtFU,49
+llmsbrieftxt-1.11.1.dist-info/licenses/LICENSE,sha256=Bf6uF7ggkMcXEXAdu2lGR7u-voH5CJIWOzU5vnKQVJI,1082
+llmsbrieftxt-1.11.1.dist-info/RECORD,,
llmsbrieftxt-1.5.0.dist-info/RECORD DELETED
@@ -1,16 +0,0 @@
-llmsbrieftxt/__init__.py,sha256=baAcEjLSYFIeNZF51tOMmA_zAMhN8HvKael-UU-Ruec,22
-llmsbrieftxt/cli.py,sha256=TSSSKtDydMpa6rApZ6sJQwCgGkMXf2cSeDe_lp80F1g,8440
-llmsbrieftxt/constants.py,sha256=cjV_W5MqfVINM78__6eKnFPOGPHAI4ZYz8GqbIEEKz8,2565
-llmsbrieftxt/crawler.py,sha256=ryt6pZ8Ed5vzEa78qeu93eSDlSyuFBqePlYZZMUFvGM,12553
-llmsbrieftxt/doc_loader.py,sha256=dGeHnEVCqtTQgdowMCFxrhrmh3QV5n8l3TIOgDYaU9g,5167
-llmsbrieftxt/extractor.py,sha256=28jckOcYf7u5zmZrhOZ-PmcWvPwTLZhMHxISSkFdeXk,1955
-llmsbrieftxt/main.py,sha256=5R6cAKFou9_FCluHQaktHKQU_nn_n3asnveB_g7o3yA,14346
-llmsbrieftxt/schema.py,sha256=ix9666XBpSbHUuYF1-jIK88sijK5Cvaer6gwbdLlWfs,2186
-llmsbrieftxt/summarizer.py,sha256=6RDAwbtw7baniwAp6mVbn6RfFVjOpAvsIXIWNYk5hFk,10879
-llmsbrieftxt/url_filters.py,sha256=1KWO9yfPEqOIFXVts5xraErVQKPDAw4Nls3yuXzbRE8,2182
-llmsbrieftxt/url_utils.py,sha256=vFc_MNyLZ6QflhDF0oyiZJPYuF2_GyQmtKK7etwCmcs,2212
-llmsbrieftxt-1.5.0.dist-info/METADATA,sha256=5FORT6_SuCTbY21xLaExQY9-zOmbEGOgxinOwV8F2uM,10961
-llmsbrieftxt-1.5.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-llmsbrieftxt-1.5.0.dist-info/entry_points.txt,sha256=lY7gjN9DS7cv3Kd3LjezvgFBum7BhpMHSPGvdCzBtFU,49
-llmsbrieftxt-1.5.0.dist-info/licenses/LICENSE,sha256=Bf6uF7ggkMcXEXAdu2lGR7u-voH5CJIWOzU5vnKQVJI,1082
-llmsbrieftxt-1.5.0.dist-info/RECORD,,