llmsbrieftxt 1.5.0__py3-none-any.whl → 1.11.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
llmsbrieftxt/cli.py CHANGED
@@ -254,7 +254,7 @@ def main() -> None:
         )
         )
 
-        # Show cost estimate and failed URLs if available
+        # Show cost estimate if in show-urls mode
         if args.show_urls and result:
            num_urls_value = result.get("num_urls", 0)
            # Type guard to ensure we have an int
@@ -264,6 +264,12 @@ def main() -> None:
         )
         print("Note: Actual cost may vary based on page content size and caching")
 
+        # Check success and exit with appropriate code
+        if result is not None:
+            success = result.get("success", True)
+            if not success:
+                sys.exit(1)
+
     except KeyboardInterrupt:
         print("\nOperation cancelled by user.", file=sys.stderr)
         sys.exit(1)
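The lines added above make the CLI's exit status reflect the `success` flag that `generate_llms_txt` now returns (see main.py below). A minimal sketch of that contract, using a hypothetical standalone helper rather than the actual `main()` wiring:

```python
import sys

def exit_code_for(result: dict | None) -> int:
    # None (no metadata returned) is treated as success, matching
    # result.get("success", True) in the diff above.
    if result is not None and not result.get("success", True):
        return 1
    return 0

assert exit_code_for(None) == 0
assert exit_code_for({"success": True, "new_summaries": 3}) == 0
assert exit_code_for({"success": False, "new_summaries": 0}) == 1
```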
llmsbrieftxt/crawler.py CHANGED
@@ -183,6 +183,10 @@ class RobustDocCrawler:
 
         # Process in batches
         for i in range(0, len(current_level), self.max_concurrent):
+            # Check if we've reached max_urls before processing next batch
+            if len(discovered) >= self.max_urls:
+                break
+
             batch = current_level[i : i + self.max_concurrent]
             tasks = [
                 self._extract_links(url, client, base_path) for url in batch
@@ -190,6 +194,10 @@ class RobustDocCrawler:
             results = await asyncio.gather(*tasks, return_exceptions=True)
 
             for url, result in zip(batch, results, strict=False):
+                # Check max_urls before adding each URL
+                if len(discovered) >= self.max_urls:
+                    break
+
                 visited.add(url)
                 discovered.add(url)
 
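Together the two guards cap discovery strictly at `max_urls`: the first stops before a new batch launches, the second stops mid-batch. A self-contained sketch of the pattern, with hypothetical stand-ins for the crawler's state (`current_level`, `discovered`, `max_concurrent`):

```python
discovered: set[str] = set()
max_urls, max_concurrent = 5, 2
current_level = [f"https://example.com/page{n}" for n in range(10)]

for i in range(0, len(current_level), max_concurrent):
    if len(discovered) >= max_urls:  # stop before launching another batch
        break
    batch = current_level[i : i + max_concurrent]
    for url in batch:
        if len(discovered) >= max_urls:  # stop mid-batch as well
            break
        discovered.add(url)

assert len(discovered) == max_urls  # the cap is never overshot
```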
llmsbrieftxt/main.py CHANGED
@@ -148,7 +148,7 @@ async def generate_llms_txt(
     use_cache_only: bool = False,
     force_refresh: bool = False,
     skip_confirmation: bool = False,
-) -> dict[str, int | list[str]] | None:
+) -> dict[str, int | list[str] | bool] | None:
     """
     Generate llms-brief.txt file from a documentation website.
 
@@ -166,10 +166,11 @@ async def generate_llms_txt(
         skip_confirmation: If True, skip confirmation prompt for high costs
 
     Returns:
-        Dictionary with metadata (for show_urls mode) or None
+        Dictionary with metadata including 'success' boolean (for show_urls mode returns dict, otherwise None on success)
     """
     urls_processed = 0
     summaries_generated = 0
+    new_summaries_generated = 0  # Track new (non-cached) summaries
     failed_urls: set[str] = set()  # Use set to avoid duplicates
 
     # Set up cache directory
@@ -217,7 +218,7 @@ async def generate_llms_txt(
         if existing_summaries:
             print(f"Cached: {num_cached} | New: {num_new}")
 
-        return {"num_urls": len(discovered_urls), "failed_urls": []}
+        return {"num_urls": len(discovered_urls), "failed_urls": [], "success": True}
 
     # Load and process documents
     doc_loader = DocLoader(max_urls=max_urls, max_depth=max_depth)
@@ -263,6 +264,12 @@ async def generate_llms_txt(
 
     # Handle cache-only mode
     usage_stats: dict[str, int] = {"input_tokens": 0, "output_tokens": 0}
+    num_docs_to_process = len(docs)
+    num_cached_used = sum(
+        1 for doc in docs if doc.metadata.get("source", "") in existing_summaries
+    )
+    num_new_needed = num_docs_to_process - num_cached_used
+
     if use_cache_only:
         print("\nCache-only mode: Using only cached summaries")
         summaries: list[str] = []
@@ -274,6 +281,7 @@ async def generate_llms_txt(
                 print(f" Warning: No cache for {doc_url}")
                 failed_urls.add(doc_url)
         summaries_generated = len(summaries)
+        new_summaries_generated = 0  # No new summaries in cache-only mode
     else:
         # Initialize summarizer
         print(f"\nGenerating summaries with {llm_name}...")
@@ -288,6 +296,8 @@ async def generate_llms_txt(
                 docs, existing_summaries=existing_summaries, cache_file=cache_file
             )
             summaries_generated = len(summaries)
+            # Calculate new summaries (total - cached)
+            new_summaries_generated = summaries_generated - num_cached_used
 
             # Track URLs that failed summarization by extracting URLs from summaries
             summarized_urls: set[str] = set()
@@ -304,12 +314,16 @@ async def generate_llms_txt(
                         failed_urls.add(doc_url)
         except KeyboardInterrupt:
             print("Process interrupted by user. Saving partial results...")
+            new_summaries_generated = 0  # Initialize in case recovery fails
             if cache_file.exists():
                 try:
                     with open(cache_file) as f:
                         partial_summaries = json.load(f)
                     summaries = list(partial_summaries.values())
                     summaries_generated = len(summaries)
+                    new_summaries_generated = max(
+                        0, summaries_generated - num_cached_used
+                    )
                     print(f"Recovered {len(summaries)} summaries from cache")
                 except Exception:
                     # Silently ignore cache read errors during interrupt recovery
@@ -317,12 +331,16 @@ async def generate_llms_txt(
                     pass
         except Exception as e:
             print(f"Summarization process error: {str(e)}")
+            new_summaries_generated = 0  # Initialize in case recovery fails
             if cache_file.exists():
                 try:
                     with open(cache_file) as f:
                         partial_summaries = json.load(f)
                     summaries = list(partial_summaries.values())
                     summaries_generated = len(summaries)
+                    new_summaries_generated = max(
+                        0, summaries_generated - num_cached_used
+                    )
                     print(
                         f"Recovered {len(summaries)} partial summaries from cache"
                     )
@@ -376,4 +394,31 @@ async def generate_llms_txt(
         print(f"Failed URLs written to: {failed_file}")
     print(f"{'=' * 50}")
 
+    # Determine success based on whether we generated new summaries when needed
+    success = True
+    if not use_cache_only:
+        # If there were new pages that needed API calls
+        if num_new_needed > 0:
+            # Success only if we generated at least one new summary
+            if new_summaries_generated == 0:
+                print("\nERROR: All API calls failed - no new summaries generated")
+                success = False
+            elif new_summaries_generated < num_new_needed:
+                print(
+                    f"\nWARNING: Some API calls failed ({new_summaries_generated}/{num_new_needed} successful)"
+                )
+        # If all pages were cached, that's fine
+    else:
+        # Cache-only mode: success if we have any summaries
+        success = summaries_generated > 0
+        if not success:
+            print("\nERROR: No cached summaries found")
+
+    # Return success indicator (for CLI exit code)
+    return {
+        "success": success,
+        "summaries_generated": summaries_generated,
+        "new_summaries": new_summaries_generated,
+    }
+
     return None
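The success rule added in the last hunk distills to a small decision function. A worked illustration (not part of the package API) that matches the exit-code behavior documented in the README below:

```python
def is_success(use_cache_only: bool, num_new_needed: int,
               new_summaries_generated: int, summaries_generated: int) -> bool:
    if use_cache_only:
        return summaries_generated > 0      # any cached summary counts
    if num_new_needed > 0:
        return new_summaries_generated > 0  # at least one API call succeeded
    return True                             # everything was already cached

# All API calls failed -> ERROR, exit 1
assert is_success(False, num_new_needed=4, new_summaries_generated=0, summaries_generated=6) is False
# Partial failure -> WARNING, but still exit 0
assert is_success(False, num_new_needed=4, new_summaries_generated=2, summaries_generated=6) is True
# Fully cached run needs no API calls
assert is_success(False, num_new_needed=0, new_summaries_generated=0, summaries_generated=6) is True
# --use-cache-only with an empty cache fails
assert is_success(True, num_new_needed=0, new_summaries_generated=0, summaries_generated=0) is False
```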
llmsbrieftxt/summarizer.py CHANGED
@@ -25,15 +25,7 @@ from .schema import Document, PageSummary
 logger = logging.getLogger(__name__)
 
 
-# Fallback summary used when LLM summarization fails
-FALLBACK_SUMMARY = PageSummary(
-    content_analysis="This page contains web content relevant to the topic.",
-    primary_use_cases="When accessing general web content",
-    key_takeaways="Contains general information",
-    related_topics="Web content",
-    keywords="web, content, information",
-    concise_summary="This page contains web content relevant to the topic.",
-)
+# Note: No fallback summary - we want failures to be properly reported
 
 
 class Summarizer:
@@ -62,6 +54,9 @@ class Summarizer:
             raise ValueError(
                 "OPENAI_API_KEY environment variable is required. Please set your OpenAI API key in your environment variables."
             )
+        base_url = os.getenv("OPENAI_BASE_URL")
+        if base_url:
+            return AsyncOpenAI(api_key=api_key, base_url=base_url)
         return AsyncOpenAI(api_key=api_key)
 
     @retry(
@@ -97,8 +92,8 @@ class Summarizer:
             },
         )
 
-    async def _summarize(self, doc: Any, loop: Any) -> PageSummary:
-        """Summarize document using OpenAI API."""
+    async def _summarize(self, doc: Any, loop: Any) -> PageSummary | None:
+        """Summarize document using OpenAI API. Returns None on failure."""
         url = doc.metadata.get("source", "unknown")
         try:
             # Truncate content if it's too long (keep first 10000 chars for now)
@@ -164,7 +159,7 @@ class Summarizer:
 
         except Exception as e:
             # Log with full traceback for debugging
-            logger.exception(
+            logger.error(
                 f"Failed to summarize {url}: {str(e)}",
                 exc_info=e,
                 extra={
@@ -172,8 +167,8 @@ class Summarizer:
                     "model": self.llm_name,
                 },
             )
-            # Return cached fallback PageSummary object
-            return FALLBACK_SUMMARY
+            # Return None to indicate failure (no fallback)
+            return None
 
     async def summarize_document(
         self, doc: Any, cache_file: Path | None = None
@@ -184,6 +179,11 @@ class Summarizer:
             loop = asyncio.get_event_loop()
             page_summary = await self._summarize(doc, loop)
 
+            # Check if summarization failed
+            if page_summary is None:
+                logger.warning(f"Summarization failed for {url}")
+                return None
+
             # Format the summary with new structure
             title = doc.metadata.get("title", url.split("/")[-1])
             formatted_summary = f"Title: [{title}]({url})\nKeywords: {page_summary.keywords}\nSummary: {page_summary.concise_summary}\n\n"
@@ -194,7 +194,7 @@ class Summarizer:
 
             return formatted_summary
         except Exception as e:
-            logger.exception(
+            logger.error(
                 f"Error summarizing {url}: {str(e)}",
                 exc_info=e,
                 extra={"url": url},
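With the fallback removed, `None` now propagates from `_summarize` through `summarize_document`, so callers must treat a missing summary as a reported failure rather than silently receiving placeholder text. A sketch of a hypothetical caller under that contract (`summarizer` and `docs` are assumed inputs, not names from the package):

```python
async def summarize_all(summarizer, docs) -> tuple[list[str], set[str]]:
    summaries: list[str] = []
    failed: set[str] = set()
    for doc in docs:
        formatted = await summarizer.summarize_document(doc)
        if formatted is None:  # failure is surfaced, not masked by a fallback
            failed.add(doc.metadata.get("source", "unknown"))
        else:
            summaries.append(formatted)
    return summaries, failed
```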
llmsbrieftxt-1.5.0.dist-info/METADATA → llmsbrieftxt-1.11.1.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: llmsbrieftxt
-Version: 1.5.0
+Version: 1.11.1
 Summary: Generate llms-brief.txt files from documentation websites using AI
 Project-URL: Homepage, https://github.com/stevennevins/llmsbrief
 Project-URL: Repository, https://github.com/stevennevins/llmsbrief
@@ -99,10 +99,10 @@ Output is automatically saved to `~/.claude/docs/<domain>.txt` (e.g., `docs.pyth
 - `--model MODEL` - OpenAI model to use (default: `gpt-5-mini`)
 - `--max-concurrent-summaries N` - Concurrent LLM requests (default: 10)
 - `--show-urls` - Preview discovered URLs with cost estimate (no API calls)
-- `--max-urls N` - Limit number of URLs to process
+- `--max-urls N` - Strictly limit number of URLs to process (may stop mid-crawl)
 - `--depth N` - Maximum crawl depth (default: 3)
 - `--cache-dir PATH` - Cache directory path (default: `.llmsbrieftxt_cache`)
-- `--use-cache-only` - Use only cached summaries, skip API calls for new pages
+- `--use-cache-only` - Use only cached summaries (fails with exit 1 if no cache exists)
 - `--force-refresh` - Ignore cache and regenerate all summaries
 
 ### Examples
@@ -244,6 +244,42 @@ uv run pytest tests/unit/test_cli.py
 uv run pytest -v
 ```
 
+### E2E Testing with Ollama (No API Costs)
+
+For testing without OpenAI API costs, use [Ollama](https://ollama.com) as a local LLM provider:
+
+```bash
+# 1. Install Ollama (one-time setup)
+curl -fsSL https://ollama.com/install.sh | sh
+# Or download from: https://ollama.com/download
+
+# 2. Start Ollama service
+ollama serve &
+
+# 3. Pull a lightweight model
+ollama pull tinyllama   # 637MB, fastest
+# Or: ollama pull phi3:mini   # 2.3GB, better quality
+
+# 4. Run E2E tests with Ollama
+export OPENAI_BASE_URL="http://localhost:11434/v1"
+export OPENAI_API_KEY="ollama-dummy-key"
+uv run pytest tests/integration/test_ollama_e2e.py -v
+
+# 5. Or test the CLI directly
+llmtxt https://example.com --model tinyllama --max-urls 5 --depth 1
+```
+
+**Benefits:**
+- ✅ Zero API costs - runs completely local
+- ✅ OpenAI-compatible endpoint
+- ✅ Same code path as production
+- ✅ Cached in GitHub Actions for CI/CD
+
+**Recommended Models:**
+- `tinyllama` (637MB) - Fastest, great for CI/CD
+- `phi3:mini` (2.3GB) - Better quality, still fast
+- `gemma2:2b` (1.6GB) - Balanced option
+
 ### Code Quality
 
 ```bash
@@ -270,6 +306,7 @@ uv run mypy llmsbrieftxt/
 ### Environment Variables
 
 - `OPENAI_API_KEY` - Required for all operations
+- `OPENAI_BASE_URL` - Optional. Set to use OpenAI-compatible endpoints (e.g., Ollama at `http://localhost:11434/v1`)
 
 ## Usage Tips
 
@@ -319,8 +356,45 @@ This tool is designed to work seamlessly with Claude Code. Once you've generated
 
 Generated llms-brief.txt files can be served via MCP (Model Context Protocol) servers. See the [mcpdoc project](https://github.com/langchain-ai/mcpdoc) for an example integration.
 
+## Exit Codes
+
+The CLI returns specific exit codes for scripting and automation:
+
+- `0` - Success (documentation generated successfully)
+- `1` - Failure (all API calls failed, no summaries generated, keyboard interrupt, or other errors)
+
+This enables reliable shell scripting:
+
+```bash
+if llmtxt https://docs.python.org/3/; then
+    echo "Documentation generated successfully"
+else
+    echo "Generation failed - check error message above"
+fi
+```
+
+### Exit Code Behavior by Mode
+
+- **Normal mode**: Exit 0 if any summaries generated (new or cached). Exit 1 only if no summaries generated.
+- **--use-cache-only mode**: Exit 0 if cached summaries found. Exit 1 if no cache exists.
+- **Partial failures**: Exit 0 if some summaries generated (shows WARNING). Exit 1 only if all API calls failed.
+
 ## Troubleshooting
 
+### Common Errors
+
+**"ERROR: All API calls failed - no new summaries generated"**
+- **Cause**: OpenAI API unavailable, authentication failed, or rate limited
+- **Solution**: Check `OPENAI_API_KEY`, verify API access, retry with `--force-refresh`, or reduce `--max-concurrent-summaries`
+
+**"ERROR: No cached summaries found"**
+- **Cause**: Using `--use-cache-only` but no cache exists at the specified location
+- **Solution**: Run without `--use-cache-only` to generate new summaries, or check `--cache-dir` location
+
+**"WARNING: Some API calls failed (X/Y successful)"**
+- **Cause**: Some but not all pages were successfully summarized
+- **Solution**: Check network connection, verify API key, retry with `--force-refresh`
+
 ### API Key Issues
 
 ```bash
llmsbrieftxt-1.11.1.dist-info/RECORD ADDED
@@ -0,0 +1,16 @@
+llmsbrieftxt/__init__.py,sha256=baAcEjLSYFIeNZF51tOMmA_zAMhN8HvKael-UU-Ruec,22
+llmsbrieftxt/cli.py,sha256=v8ZWykJ1QclX7zM7L_IpdoSkJ_TxkaNkZg6drngD4zU,8625
+llmsbrieftxt/constants.py,sha256=cjV_W5MqfVINM78__6eKnFPOGPHAI4ZYz8GqbIEEKz8,2565
+llmsbrieftxt/crawler.py,sha256=zmilV_QwO9pvrqQvjMZbP357_c5z9rvIIvRCBnLWZ1I,12884
+llmsbrieftxt/doc_loader.py,sha256=dGeHnEVCqtTQgdowMCFxrhrmh3QV5n8l3TIOgDYaU9g,5167
+llmsbrieftxt/extractor.py,sha256=28jckOcYf7u5zmZrhOZ-PmcWvPwTLZhMHxISSkFdeXk,1955
+llmsbrieftxt/main.py,sha256=vQOf0kHgI6MnQTeT4OBKxIDxzEY8RJhuCOytc4-7bZA,16565
+llmsbrieftxt/schema.py,sha256=ix9666XBpSbHUuYF1-jIK88sijK5Cvaer6gwbdLlWfs,2186
+llmsbrieftxt/summarizer.py,sha256=2dkOyuk20Xafo7qqazxcjr_Qct-8mcYgfJUPXOu3qAQ,10866
+llmsbrieftxt/url_filters.py,sha256=1KWO9yfPEqOIFXVts5xraErVQKPDAw4Nls3yuXzbRE8,2182
+llmsbrieftxt/url_utils.py,sha256=vFc_MNyLZ6QflhDF0oyiZJPYuF2_GyQmtKK7etwCmcs,2212
+llmsbrieftxt-1.11.1.dist-info/METADATA,sha256=qej3UnHZXh2oVyYxxtXinFCPjG-8JHN4LfBzWQH1jqY,13692
+llmsbrieftxt-1.11.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+llmsbrieftxt-1.11.1.dist-info/entry_points.txt,sha256=lY7gjN9DS7cv3Kd3LjezvgFBum7BhpMHSPGvdCzBtFU,49
+llmsbrieftxt-1.11.1.dist-info/licenses/LICENSE,sha256=Bf6uF7ggkMcXEXAdu2lGR7u-voH5CJIWOzU5vnKQVJI,1082
+llmsbrieftxt-1.11.1.dist-info/RECORD,,
llmsbrieftxt-1.5.0.dist-info/RECORD DELETED
@@ -1,16 +0,0 @@
-llmsbrieftxt/__init__.py,sha256=baAcEjLSYFIeNZF51tOMmA_zAMhN8HvKael-UU-Ruec,22
-llmsbrieftxt/cli.py,sha256=TSSSKtDydMpa6rApZ6sJQwCgGkMXf2cSeDe_lp80F1g,8440
-llmsbrieftxt/constants.py,sha256=cjV_W5MqfVINM78__6eKnFPOGPHAI4ZYz8GqbIEEKz8,2565
-llmsbrieftxt/crawler.py,sha256=ryt6pZ8Ed5vzEa78qeu93eSDlSyuFBqePlYZZMUFvGM,12553
-llmsbrieftxt/doc_loader.py,sha256=dGeHnEVCqtTQgdowMCFxrhrmh3QV5n8l3TIOgDYaU9g,5167
-llmsbrieftxt/extractor.py,sha256=28jckOcYf7u5zmZrhOZ-PmcWvPwTLZhMHxISSkFdeXk,1955
-llmsbrieftxt/main.py,sha256=5R6cAKFou9_FCluHQaktHKQU_nn_n3asnveB_g7o3yA,14346
-llmsbrieftxt/schema.py,sha256=ix9666XBpSbHUuYF1-jIK88sijK5Cvaer6gwbdLlWfs,2186
-llmsbrieftxt/summarizer.py,sha256=6RDAwbtw7baniwAp6mVbn6RfFVjOpAvsIXIWNYk5hFk,10879
-llmsbrieftxt/url_filters.py,sha256=1KWO9yfPEqOIFXVts5xraErVQKPDAw4Nls3yuXzbRE8,2182
-llmsbrieftxt/url_utils.py,sha256=vFc_MNyLZ6QflhDF0oyiZJPYuF2_GyQmtKK7etwCmcs,2212
-llmsbrieftxt-1.5.0.dist-info/METADATA,sha256=5FORT6_SuCTbY21xLaExQY9-zOmbEGOgxinOwV8F2uM,10961
-llmsbrieftxt-1.5.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-llmsbrieftxt-1.5.0.dist-info/entry_points.txt,sha256=lY7gjN9DS7cv3Kd3LjezvgFBum7BhpMHSPGvdCzBtFU,49
-llmsbrieftxt-1.5.0.dist-info/licenses/LICENSE,sha256=Bf6uF7ggkMcXEXAdu2lGR7u-voH5CJIWOzU5vnKQVJI,1082
-llmsbrieftxt-1.5.0.dist-info/RECORD,,