llmsbrieftxt-1.5.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
llmsbrieftxt/doc_loader.py ADDED
@@ -0,0 +1,150 @@
+ """Documentation-aware loader for intelligently discovering and crawling documentation sites."""
+
+ import asyncio
+ import logging
+ from collections.abc import Callable
+
+ import httpx
+ from tqdm import tqdm
+
+ from llmsbrieftxt.crawler import RobustDocCrawler
+ from llmsbrieftxt.extractor import default_extractor
+ from llmsbrieftxt.schema import Document
+
+ logger = logging.getLogger(__name__)
+
+
+ class DocLoader:
+     """Main documentation loader using robust crawling strategies."""
+
+     def __init__(
+         self,
+         max_urls: int | None = None,
+         max_concurrent: int = 10,
+         max_depth: int = 3,
+     ):
+         """Initialize the documentation loader.
+
+         Args:
+             max_urls: Optional maximum number of URLs to discover
+             max_concurrent: Maximum concurrent requests (default 10)
+             max_depth: Maximum crawl depth (default 3)
+         """
+         self.max_urls = max_urls
+         self.max_concurrent = max_concurrent
+         self.max_depth = max_depth
+         self.crawler = RobustDocCrawler(
+             max_urls=max_urls,
+             max_depth=max_depth,
+             max_concurrent=max_concurrent,
+         )
+
+     async def load_docs(
+         self,
+         url: str,
+         extractor: Callable[[str], str] | None = None,
+         show_urls: bool = False,
+     ) -> tuple[list[Document], list[str]]:
+         """Load documentation pages using robust discovery strategies.
+
+         Args:
+             url: The base URL to start from
+             extractor: Optional content extractor function
+             show_urls: Whether to return discovered URLs without loading
+
+         Returns:
+             Tuple of (documents, discovered_urls)
+         """
+         if extractor is None:
+             extractor = default_extractor
+
+         logger.info(f"Starting documentation discovery for {url}")
+         print(f"Discovering documentation from {url}...")
+
+         # Use RobustDocCrawler to discover URLs
+         discovered_urls = await self.crawler.discover_urls(url)
+
+         print(f"\nFound {len(discovered_urls)} pages")
+
+         if show_urls:
+             # Return empty documents but include URLs for preview
+             return [], sorted(discovered_urls)
+
+         # Load content from discovered URLs
+         documents = await self._load_documents(list(discovered_urls), extractor)
+
+         return documents, sorted(discovered_urls)
+
+     async def _load_documents(
+         self, urls: list[str], extractor: Callable[[str], str]
+     ) -> list[Document]:
+         """Load content from URLs and create documents.
+
+         Args:
+             urls: List of URLs to load
+             extractor: Function to extract content from HTML
+
+         Returns:
+             List of Document objects
+         """
+         documents: list[Document] = []
+         url_list: list[str] = urls
+
+         async with httpx.AsyncClient(
+             follow_redirects=True, timeout=httpx.Timeout(30.0)
+         ) as client:
+             # Create semaphore for concurrency control
+             semaphore = asyncio.Semaphore(self.max_concurrent)
+
+             async def load_with_limit(url: str) -> Document | None:
+                 """Load document with semaphore-controlled concurrency."""
+                 async with semaphore:
+                     return await self._load_single_document(url, client, extractor)
+
+             # Process all URLs concurrently with semaphore limiting parallelism
+             with tqdm(
+                 total=len(url_list), desc="Loading documents", unit="doc"
+             ) as pbar:
+                 tasks = [load_with_limit(url) for url in url_list]
+
+                 # Use as_completed to update progress bar as tasks finish
+                 for coro in asyncio.as_completed(tasks):
+                     result = await coro
+                     if isinstance(result, Exception):
+                         logger.warning(f"Failed to load: {result}")
+                     elif result is not None:
+                         documents.append(result)
+                     pbar.update(1)
+
+         return documents
+
+     async def _load_single_document(
+         self, url: str, client: httpx.AsyncClient, extractor: Callable[[str], str]
+     ) -> Document | None:
+         """Load a single document from a URL.
+
+         Args:
+             url: The URL to load
+             client: HTTP client
+             extractor: Content extraction function
+
+         Returns:
+             Document object or None if failed
+         """
+         try:
+             response = await client.get(url, timeout=30.0, follow_redirects=True)
+             if response.status_code == 200:
+                 content = extractor(response.text)
+                 if content and len(content.strip()) > 100:
+                     return Document(
+                         page_content=content, metadata={"source": url, "url": url}
+                     )
+                 else:
+                     logger.debug(f"No meaningful content extracted from {url}")
+                     return None
+             else:
+                 logger.debug(f"HTTP {response.status_code} for {url}")
+                 return None
+         except Exception as e:
+             logger.debug(f"Failed to load {url}: {e}")
+             return None
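For orientation, a minimal sketch (not part of the package) of how this loader is driven; the documentation URL is a placeholder:

import asyncio
from llmsbrieftxt.doc_loader import DocLoader

async def preview() -> None:
    loader = DocLoader(max_urls=50, max_concurrent=5, max_depth=2)
    # show_urls=True returns no documents, only the sorted list of discovered URLs
    docs, urls = await loader.load_docs("https://docs.example.com", show_urls=True)
    print(f"Discovered {len(urls)} pages")

asyncio.run(preview())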
llmsbrieftxt/extractor.py ADDED
@@ -0,0 +1,69 @@
+ import logging
+ from typing import Any
+
+ from trafilatura import extract
+ from trafilatura.settings import use_config
+
+ logger = logging.getLogger(__name__)
+
+
+ # Cache Trafilatura config to avoid recreating on every extraction
+ _trafilatura_config: Any = None
+
+
+ def _get_trafilatura_config() -> Any:
+     """Get or create cached Trafilatura config."""
+     global _trafilatura_config
+     if _trafilatura_config is None:
+         _trafilatura_config = use_config()
+         _trafilatura_config.set("DEFAULT", "MIN_EXTRACTED_SIZE", "200")
+         _trafilatura_config.set("DEFAULT", "MIN_FILE_SIZE", "100")
+     return _trafilatura_config
+
+
+ def default_extractor(html: str) -> str:
+     """
+     Extract main content from HTML using Trafilatura.
+
+     Trafilatura intelligently extracts the main content while filtering out:
+     - Navigation menus
+     - Sidebars
+     - Footers
+     - Cookie banners
+     - Advertisements
+
+     Args:
+         html: Raw HTML content
+
+     Returns:
+         Extracted content as markdown string (empty string if extraction fails)
+     """
+     # Get cached Trafilatura config
+     config = _get_trafilatura_config()
+
+     # Extract with Trafilatura
+     try:
+         result = extract(
+             html,
+             config=config,
+             output_format="markdown",
+             include_links=True,
+             include_images=False,  # Images not needed for text summaries
+             include_tables=True,
+             include_comments=False,
+             favor_recall=True,  # Prefer extracting more rather than less
+         )
+
+         if result:
+             logger.debug(
+                 f"Trafilatura extracted {len(result)} chars "
+                 f"(reduced from {len(html)} chars HTML)"
+             )
+             return str(result)
+         else:
+             logger.debug("Trafilatura extraction returned no content")
+             return ""
+
+     except Exception as e:
+         logger.warning(f"Trafilatura extraction failed: {e}")
+         return ""
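A minimal sketch of calling the extractor directly (not part of the package; the URL is a placeholder, and httpx is already a dependency of the loader):

import httpx
from llmsbrieftxt.extractor import default_extractor

html = httpx.get("https://docs.example.com/guide", follow_redirects=True).text
markdown = default_extractor(html)  # returns "" if Trafilatura finds no main content
print(markdown[:200])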
llmsbrieftxt/main.py ADDED
@@ -0,0 +1,379 @@
+ """Main generation pipeline for llmsbrieftxt."""
+
+ import json
+ import logging
+ import re
+ from pathlib import Path
+
+ from llmsbrieftxt.constants import (
+     ESTIMATED_TOKENS_PER_PAGE_INPUT,
+     ESTIMATED_TOKENS_PER_PAGE_OUTPUT,
+     OPENAI_PRICING,
+ )
+ from llmsbrieftxt.doc_loader import DocLoader
+ from llmsbrieftxt.extractor import default_extractor
+ from llmsbrieftxt.summarizer import Summarizer
+
+ logger = logging.getLogger(__name__)
+
+
+ def calculate_actual_cost(input_tokens: int, output_tokens: int, model: str) -> float:
+     """
+     Calculate actual API cost from token usage.
+
+     Args:
+         input_tokens: Number of input tokens used
+         output_tokens: Number of output tokens generated
+         model: OpenAI model name
+
+     Returns:
+         Total cost in dollars
+     """
+     if model not in OPENAI_PRICING:
+         return 0.0
+
+     input_price, output_price = OPENAI_PRICING[model]
+     input_cost = (input_tokens / 1_000_000) * input_price
+     output_cost = (output_tokens / 1_000_000) * output_price
+     return input_cost + output_cost
+
+
+ def format_cost(cost: float) -> str:
+     """Format cost as a dollar string."""
+     if cost < 0.01:
+         return f"${cost:.4f}"
+     elif cost < 1.00:
+         return f"${cost:.3f}"
+     else:
+         return f"${cost:.2f}"
+
+
+ def get_cache_stats(cache_file: Path, model: str) -> dict[str, int | float | str]:
+     """
+     Get cache statistics including size and estimated savings.
+
+     Args:
+         cache_file: Path to cache file
+         model: OpenAI model name for cost calculation
+
+     Returns:
+         Dictionary with cache statistics
+     """
+     if not cache_file.exists():
+         return {
+             "num_entries": 0,
+             "size_mb": 0.0,
+             "estimated_savings": "$0.00",
+         }
+
+     try:
+         # Get file size
+         size_bytes = cache_file.stat().st_size
+         size_mb = size_bytes / (1024 * 1024)
+
+         # Count entries
+         with open(cache_file) as f:
+             cache_data = json.load(f)
+         num_entries = len(cache_data)
+
+         # Estimate savings (num_entries * avg cost per page)
+         avg_input_tokens = ESTIMATED_TOKENS_PER_PAGE_INPUT
+         avg_output_tokens = ESTIMATED_TOKENS_PER_PAGE_OUTPUT
+         savings_per_page = calculate_actual_cost(
+             avg_input_tokens, avg_output_tokens, model
+         )
+         total_savings = num_entries * savings_per_page
+
+         return {
+             "num_entries": num_entries,
+             "size_mb": size_mb,
+             "estimated_savings": format_cost(total_savings),
+         }
+     except Exception as e:
+         logger.warning(f"Could not read cache stats from {cache_file}: {str(e)}")
+         return {
+             "num_entries": 0,
+             "size_mb": 0.0,
+             "estimated_savings": "$0.00",
+         }
+
+
+ def extract_url_from_summary(summary: str) -> str | None:
+     """
+     Extract URL from a summary in the format: Title: [title](URL).
+
+     Args:
+         summary: Formatted summary string
+
+     Returns:
+         Extracted URL or None if not found
+     """
+     # Match markdown link format: [text](url)
+     match = re.search(r"\[([^\]]+)\]\(([^)]+)\)", summary)
+     if match:
+         return match.group(2)
+     return None
+
+
+ def ensure_directory_exists(file_path: str) -> None:
+     """Ensure the parent directory of the given file path exists.
+
+     Args:
+         file_path: Path to the file whose parent directory should be created
+
+     Raises:
+         RuntimeError: If directory creation fails due to permissions or other issues
+     """
+     dir_path = Path(file_path).parent
+     if dir_path == Path("."):
+         return  # Current directory, no need to create
+
+     try:
+         dir_path.mkdir(parents=True, exist_ok=True)
+         if not dir_path.exists():
+             print(f"Created directory: {dir_path}")
+     except OSError as e:
+         raise RuntimeError(f"Failed to create directory {dir_path}: {e}") from e
+
+
+ async def generate_llms_txt(
+     url: str,
+     llm_name: str = "o4-mini",
+     max_concurrent_summaries: int = 10,
+     output_path: str = "llms.txt",
+     show_urls: bool = False,
+     max_urls: int | None = None,
+     max_depth: int = 3,
+     cache_dir: str = ".llmsbrieftxt_cache",
+     use_cache_only: bool = False,
+     force_refresh: bool = False,
+     skip_confirmation: bool = False,
+ ) -> dict[str, int | list[str]] | None:
+     """
+     Generate llms-brief.txt file from a documentation website.
+
+     Args:
+         url: URL of the documentation site to crawl
+         llm_name: OpenAI model to use for summarization
+         max_concurrent_summaries: Maximum concurrent LLM requests
+         output_path: Path to write the output file
+         show_urls: If True, only show discovered URLs without processing
+         max_urls: Maximum number of URLs to discover/process
+         max_depth: Maximum crawl depth for URL discovery
+         cache_dir: Directory to store cached summaries
+         use_cache_only: If True, only use cached summaries (no API calls)
+         force_refresh: If True, ignore cache and regenerate all summaries
+         skip_confirmation: If True, skip confirmation prompt for high costs
+
+     Returns:
+         Dictionary with metadata (for show_urls mode) or None
+     """
+     urls_processed = 0
+     summaries_generated = 0
+     failed_urls: set[str] = set()  # Use set to avoid duplicates
+
+     # Set up cache directory
+     cache_path = Path(cache_dir)
+     cache_path.mkdir(parents=True, exist_ok=True)
+     cache_file = cache_path / "summaries.json"
+
+     # Load existing summaries from cache if available (unless force refresh)
+     existing_summaries: dict[str, str] = {}
+     if not force_refresh and cache_file.exists():
+         try:
+             with open(cache_file) as f:
+                 existing_summaries = json.load(f)
+             # Show cache stats
+             cache_stats = get_cache_stats(cache_file, llm_name)
+             print(
+                 f"\nCache: {cache_stats['num_entries']} entries ({cache_stats['size_mb']:.1f}MB on disk)"
+             )
+             print(
+                 f"Approximate value from cache: ~{cache_stats['estimated_savings']} in saved API calls"
+             )
+         except Exception as e:
+             print(f"Warning: Could not load cache: {str(e)}")
+     elif force_refresh and cache_file.exists():
+         print("\nForce refresh enabled - ignoring existing cache")
+
+     extractor = default_extractor
+     output_file = output_path
+
+     # If show_urls is True, just show discovered URLs and exit
+     if show_urls:
+         print("Discovering documentation URLs...")
+         doc_loader = DocLoader(max_urls=max_urls, max_depth=max_depth)
+         _, discovered_urls = await doc_loader.load_docs(
+             url, extractor=extractor, show_urls=True
+         )
+         print("\nDiscovered URLs:")
+         for discovered_url in discovered_urls:
+             print(f" - {discovered_url}")
+         print(f"\nTotal: {len(discovered_urls)} unique URLs")
+
+         # Calculate how many would be cached vs new
+         num_cached = sum(1 for u in discovered_urls if u in existing_summaries)
+         num_new = len(discovered_urls) - num_cached
+         if existing_summaries:
+             print(f"Cached: {num_cached} | New: {num_new}")
+
+         return {"num_urls": len(discovered_urls), "failed_urls": []}
+
+     # Load and process documents
+     doc_loader = DocLoader(max_urls=max_urls, max_depth=max_depth)
+     docs, discovered_urls = await doc_loader.load_docs(url, extractor=extractor)
+     urls_processed = len(docs)
+
+     # Track which URLs failed to load
+     loaded_urls = {doc.metadata.get("source") for doc in docs}
+     failed_urls.update(u for u in discovered_urls if u not in loaded_urls)
+
+     # Show cost estimate and get confirmation (unless using cache-only or skip_confirmation)
+     if not use_cache_only and not skip_confirmation:
+         num_cached = sum(1 for u in discovered_urls if u in existing_summaries)
+         num_new = len(discovered_urls) - num_cached
+         estimated_cost_new = calculate_actual_cost(
+             num_new * ESTIMATED_TOKENS_PER_PAGE_INPUT,
+             num_new * ESTIMATED_TOKENS_PER_PAGE_OUTPUT,
+             llm_name,
+         )
+
+         print(f"\nThis run: {num_new} new pages, {num_cached} cached")
+         if num_cached > 0:
+             saved_cost = calculate_actual_cost(
+                 num_cached * ESTIMATED_TOKENS_PER_PAGE_INPUT,
+                 num_cached * ESTIMATED_TOKENS_PER_PAGE_OUTPUT,
+                 llm_name,
+             )
+             print(
+                 f"Estimated cost: {format_cost(estimated_cost_new)} (saving {format_cost(saved_cost)} via cache)"
+             )
+         else:
+             print(f"Estimated cost: {format_cost(estimated_cost_new)}")
+
+         # Prompt for confirmation if cost is significant (> $1.00)
+         if estimated_cost_new > 1.00:
+             print(
+                 f"\nWARNING: This will cost approximately {format_cost(estimated_cost_new)}"
+             )
+             response = input("Continue? [y/N]: ").strip().lower()
+             if response not in ["y", "yes"]:
+                 print("Cancelled by user")
+                 return None
+
+     # Handle cache-only mode
+     usage_stats: dict[str, int] = {"input_tokens": 0, "output_tokens": 0}
+     if use_cache_only:
+         print("\nCache-only mode: Using only cached summaries")
+         summaries: list[str] = []
+         for doc in docs:
+             doc_url = doc.metadata.get("source", "")
+             if doc_url in existing_summaries:
+                 summaries.append(existing_summaries[doc_url])
+             else:
+                 print(f" Warning: No cache for {doc_url}")
+                 failed_urls.add(doc_url)
+         summaries_generated = len(summaries)
+     else:
+         # Initialize summarizer
+         print(f"\nGenerating summaries with {llm_name}...")
+         summarizer = Summarizer(
+             llm_name=llm_name,
+             max_concurrent=max_concurrent_summaries,
+         )
+
+         summaries: list[str] = []
+         try:
+             summaries, usage_stats = await summarizer.summarize_all(
+                 docs, existing_summaries=existing_summaries, cache_file=cache_file
+             )
+             summaries_generated = len(summaries)
+
+             # Track URLs that failed summarization by extracting URLs from summaries
+             summarized_urls: set[str] = set()
+             for summary in summaries:
+                 if summary:
+                     extracted_url: str | None = extract_url_from_summary(summary)
+                     if extracted_url:
+                         summarized_urls.add(extracted_url)
+
+             # Add docs that weren't successfully summarized to failed_urls
+             for doc in docs:
+                 doc_url = doc.metadata.get("source", "")
+                 if doc_url and doc_url not in summarized_urls:
+                     failed_urls.add(doc_url)
+         except KeyboardInterrupt:
+             print("Process interrupted by user. Saving partial results...")
+             if cache_file.exists():
+                 try:
+                     with open(cache_file) as f:
+                         partial_summaries = json.load(f)
+                     summaries = list(partial_summaries.values())
+                     summaries_generated = len(summaries)
+                     print(f"Recovered {len(summaries)} summaries from cache")
+                 except Exception:
+                     # Silently ignore cache read errors during interrupt recovery
+                     # If we can't recover from cache, we'll continue with empty results
+                     pass
+         except Exception as e:
+             print(f"Summarization process error: {str(e)}")
+             if cache_file.exists():
+                 try:
+                     with open(cache_file) as f:
+                         partial_summaries = json.load(f)
+                     summaries = list(partial_summaries.values())
+                     summaries_generated = len(summaries)
+                     print(
+                         f"Recovered {len(summaries)} partial summaries from cache"
+                     )
+                 except Exception:
+                     # If cache recovery fails during error handling, continue with empty results
+                     summaries = []
+         finally:
+             # Write results to file
+             if summaries:
+                 ensure_directory_exists(output_file)
+                 output_content = "".join(summaries)
+                 Path(output_file).write_text(output_content, encoding="utf-8")
+             else:
+                 ensure_directory_exists(output_file)
+                 Path(output_file).write_text("", encoding="utf-8")
+
+     # Print summary
+     print(f"\n{'=' * 50}")
+     print(f"Processed: {summaries_generated}/{urls_processed} pages")
+     if urls_processed > 0:
+         success_rate = summaries_generated / urls_processed * 100
+         print(f"Success rate: {success_rate:.1f}%")
+
+     # Show actual API cost if tokens were used
+     if usage_stats["input_tokens"] > 0 or usage_stats["output_tokens"] > 0:
+         actual_cost = calculate_actual_cost(
+             usage_stats["input_tokens"], usage_stats["output_tokens"], llm_name
+         )
+         num_cached = len(existing_summaries)
+         if num_cached > 0:
+             # Calculate how much we saved via cache
+             saved_cost = calculate_actual_cost(
+                 num_cached * ESTIMATED_TOKENS_PER_PAGE_INPUT,
+                 num_cached * ESTIMATED_TOKENS_PER_PAGE_OUTPUT,
+                 llm_name,
+             )
+             print(
+                 f"Actual cost: {format_cost(actual_cost)} (saved {format_cost(saved_cost)} via cache)"
+             )
+         else:
+             print(f"Actual cost: {format_cost(actual_cost)}")
+
+     print(f"Output: {output_file}")
+
+     # Report failed URLs
+     if failed_urls:
+         print(f"Failed URLs: {len(failed_urls)}")
+         failed_file = Path(output_file).parent / "failed_urls.txt"
+         # Sort URLs for consistent output
+         failed_file.write_text("\n".join(sorted(failed_urls)), encoding="utf-8")
+         print(f"Failed URLs written to: {failed_file}")
+     print(f"{'=' * 50}")
+
+     return None
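A minimal sketch of driving the pipeline end to end (not part of the package; the URL is a placeholder, and it assumes OpenAI credentials are configured for the summarizer):

import asyncio
from llmsbrieftxt.main import generate_llms_txt

# Dry run: list the pages that would be summarized, without calling the API.
asyncio.run(generate_llms_txt("https://docs.example.com", show_urls=True))

# Full run: crawl up to 100 pages and write llms.txt, skipping the cost prompt.
asyncio.run(
    generate_llms_txt(
        "https://docs.example.com",
        llm_name="o4-mini",
        max_urls=100,
        output_path="llms.txt",
        skip_confirmation=True,
    )
)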
llmsbrieftxt/schema.py ADDED
@@ -0,0 +1,42 @@
+ from typing import Any
+
+ from pydantic import BaseModel, Field
+
+
+ class Document(BaseModel):
+     """Simple document class to replace langchain.schema.Document."""
+
+     page_content: str
+     metadata: dict[str, Any] = Field(default_factory=dict)
+
+
+ class PageSummary(BaseModel):
+     content_analysis: str = Field(
+         description="Comprehensive analysis of the page content (2-3 sentences, ~50-80 words). "
+         "Include main topics, key concepts, important features, and unique value propositions. "
+         "Be specific about what makes this content valuable and distinctive."
+     )
+     primary_use_cases: str = Field(
+         description="3-5 specific, actionable scenarios when an LLM should reference this page (2-3 sentences total, ~40-60 words). "
+         'Format as concrete use cases like: "When implementing X feature", "To understand Y concept", '
+         '"For troubleshooting Z issue". Focus on practical applications.'
+     )
+     key_takeaways: str = Field(
+         description="2-3 most valuable insights, capabilities, or pieces of information (2-3 sentences, ~40-60 words). "
+         "Highlight unique knowledge, practical tips, or critical information that makes this page worth consulting. "
+         "Format as distinct points separated by semicolons."
+     )
+     related_topics: str = Field(
+         description="Relevant domains, technologies, and concepts this page relates to (1-2 sentences, ~20-30 words). "
+         'List connected topics that provide context, like: "API design, REST principles, microservices architecture". '
+         "Help establish the knowledge domain."
+     )
+     keywords: str = Field(
+         description="5-10 specific, searchable terms for discovery and indexing (comma-separated list, ~15-25 words). "
+         "Include technical terms, product names, methodologies, and key concepts. "
+         'Example: "GraphQL, API Gateway, schema stitching, federation, Apollo Server, type safety".'
+     )
+     concise_summary: str = Field(
+         description="Single comprehensive sentence capturing the essence of the page (15-25 words). "
+         "Summarize what the page offers and its primary value in one clear, informative statement."
+     )