lean-explore 0.3.0-py3-none-any.whl → 1.0.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. lean_explore/__init__.py +14 -1
  2. lean_explore/api/__init__.py +12 -1
  3. lean_explore/api/client.py +64 -176
  4. lean_explore/cli/__init__.py +10 -1
  5. lean_explore/cli/data_commands.py +184 -489
  6. lean_explore/cli/display.py +171 -0
  7. lean_explore/cli/main.py +51 -608
  8. lean_explore/config.py +244 -0
  9. lean_explore/extract/__init__.py +5 -0
  10. lean_explore/extract/__main__.py +368 -0
  11. lean_explore/extract/doc_gen4.py +200 -0
  12. lean_explore/extract/doc_parser.py +499 -0
  13. lean_explore/extract/embeddings.py +369 -0
  14. lean_explore/extract/github.py +110 -0
  15. lean_explore/extract/index.py +316 -0
  16. lean_explore/extract/informalize.py +653 -0
  17. lean_explore/extract/package_config.py +59 -0
  18. lean_explore/extract/package_registry.py +45 -0
  19. lean_explore/extract/package_utils.py +105 -0
  20. lean_explore/extract/types.py +25 -0
  21. lean_explore/mcp/__init__.py +11 -1
  22. lean_explore/mcp/app.py +14 -46
  23. lean_explore/mcp/server.py +20 -35
  24. lean_explore/mcp/tools.py +71 -205
  25. lean_explore/models/__init__.py +9 -0
  26. lean_explore/models/search_db.py +76 -0
  27. lean_explore/models/search_types.py +53 -0
  28. lean_explore/search/__init__.py +32 -0
  29. lean_explore/search/engine.py +651 -0
  30. lean_explore/search/scoring.py +156 -0
  31. lean_explore/search/service.py +68 -0
  32. lean_explore/search/tokenization.py +71 -0
  33. lean_explore/util/__init__.py +28 -0
  34. lean_explore/util/embedding_client.py +92 -0
  35. lean_explore/util/logging.py +22 -0
  36. lean_explore/util/openrouter_client.py +63 -0
  37. lean_explore/util/reranker_client.py +187 -0
  38. {lean_explore-0.3.0.dist-info → lean_explore-1.0.1.dist-info}/METADATA +32 -9
  39. lean_explore-1.0.1.dist-info/RECORD +43 -0
  40. {lean_explore-0.3.0.dist-info → lean_explore-1.0.1.dist-info}/WHEEL +1 -1
  41. lean_explore-1.0.1.dist-info/entry_points.txt +2 -0
  42. lean_explore/cli/agent.py +0 -788
  43. lean_explore/cli/config_utils.py +0 -481
  44. lean_explore/defaults.py +0 -114
  45. lean_explore/local/__init__.py +0 -1
  46. lean_explore/local/search.py +0 -1050
  47. lean_explore/local/service.py +0 -479
  48. lean_explore/shared/__init__.py +0 -1
  49. lean_explore/shared/models/__init__.py +0 -1
  50. lean_explore/shared/models/api.py +0 -117
  51. lean_explore/shared/models/db.py +0 -396
  52. lean_explore-0.3.0.dist-info/RECORD +0 -26
  53. lean_explore-0.3.0.dist-info/entry_points.txt +0 -2
  54. {lean_explore-0.3.0.dist-info → lean_explore-1.0.1.dist-info}/licenses/LICENSE +0 -0
  55. {lean_explore-0.3.0.dist-info → lean_explore-1.0.1.dist-info}/top_level.txt +0 -0
lean_explore/extract/embeddings.py
@@ -0,0 +1,369 @@
+ """Generate embeddings for Lean declarations.
+
+ Reads declarations from the database and generates informalization embeddings
+ for semantic search.
+ """
+
+ import logging
+ import sqlite3
+ import struct
+ import time
+ from collections import deque
+ from dataclasses import dataclass
+ from pathlib import Path
+
+ from rich.progress import (
+     BarColumn,
+     Progress,
+     ProgressColumn,
+     SpinnerColumn,
+     Task,
+     TaskProgressColumn,
+     TextColumn,
+     TimeRemainingColumn,
+ )
+ from rich.text import Text
+ from sqlalchemy import select
+ from sqlalchemy.ext.asyncio import AsyncEngine, AsyncSession
+
+ from lean_explore.config import Config
+ from lean_explore.models import Declaration
+ from lean_explore.util import EmbeddingClient
+
+ logger = logging.getLogger(__name__)
+
+
+ class RateColumn(ProgressColumn):
+     """Custom column showing embeddings per second over a rolling window."""
+
+     def __init__(self, window_seconds: int = 300):
+         """Initialize rate column.
+
+         Args:
+             window_seconds: Rolling window size in seconds for rate calculation
+         """
+         super().__init__()
+         self.window_seconds = window_seconds
+         self.history: deque[tuple[float, int]] = deque()
+         self.total_count = 0
+
+     def add_count(self, count: int) -> None:
+         """Add embedding count with timestamp."""
+         now = time.time()
+         self.history.append((now, count))
+         self.total_count += count
+         # Remove old entries outside window
+         cutoff = now - self.window_seconds
+         while self.history and self.history[0][0] < cutoff:
+             self.history.popleft()
+
+     def render(self, task: Task) -> Text:
+         """Render the rate column."""
+         if not self.history:
+             return Text("-- emb/s", style="cyan")
+
+         now = time.time()
+         cutoff = now - self.window_seconds
+         # Sum counts within window
+         window_count = sum(c for t, c in self.history if t >= cutoff)
+         # Calculate elapsed time in window
+         if self.history:
+             oldest_in_window = max(self.history[0][0], cutoff)
+             elapsed = now - oldest_in_window
+             if elapsed > 0:
+                 rate = window_count / elapsed
+                 return Text(f"{rate:.1f} emb/s", style="cyan")
+
+         return Text("-- emb/s", style="cyan")
+
+
+ # --- Data Classes ---
+
+
+ @dataclass
+ class EmbeddingCaches:
+     """Container for embedding caches.
+
+     Stores embeddings as raw bytes for efficiency. Use _deserialize_embedding()
+     to convert to list[float] when actually needed.
+     """
+
+     by_informalization: dict[str, bytes]
+
+
+ def _deserialize_embedding(data: bytes) -> list[float]:
+     """Convert raw binary embedding to list[float].
+
+     Args:
+         data: Binary embedding data (float32 packed)
+
+     Returns:
+         List of float values
+     """
+     num_floats = len(data) // 4
+     return list(struct.unpack(f"{num_floats}f", data))
+
+
+ # --- Cross-Database Cache Loading ---
+
+
+ def _discover_database_files() -> list[Path]:
+     """Discover all lean_explore.db files in data/ and cache/ directories.
+
+     Returns:
+         List of paths to discovered database files
+     """
+     database_files = []
+
+     # Search in data directory
+     data_dir = Config.DATA_DIRECTORY
+     if data_dir.exists():
+         database_files.extend(data_dir.rglob("lean_explore.db"))
+
+     # Search in cache directory
+     cache_dir = Config.CACHE_DIRECTORY
+     if cache_dir.exists():
+         database_files.extend(cache_dir.rglob("lean_explore.db"))
+
+     logger.info(f"Discovered {len(database_files)} database files")
+     return database_files
+
+
+ def _load_embedding_caches(database_files: list[Path]) -> EmbeddingCaches:
+     """Load embeddings from all discovered databases.
+
+     Builds a cache mapping informalization text to raw embedding bytes by scanning
+     all databases for declarations that have embeddings.
+
+     Uses sync sqlite3 directly to avoid SQLAlchemy ORM overhead and TypeDecorator
+     deserialization. Embeddings are stored as raw bytes and only deserialized
+     when actually used.
+
+     Args:
+         database_files: List of database file paths to scan
+
+     Returns:
+         EmbeddingCaches with cache dictionary populated (as bytes)
+     """
+     cache_by_informalization: dict[str, bytes] = {}
+
+     for db_path in database_files:
+         logger.info(f"Loading embedding cache from {db_path}")
+
+         try:
+             connection = sqlite3.connect(db_path)
+             cursor = connection.execute(
+                 """
+                 SELECT informalization, informalization_embedding
+                 FROM declarations
+                 WHERE informalization_embedding IS NOT NULL
+                 """
+             )
+
+             count = 0
+             for row in cursor:
+                 count += 1
+                 (informalization, informalization_embedding) = row
+
+                 # Cache informalization embedding
+                 if (
+                     informalization is not None
+                     and informalization not in cache_by_informalization
+                 ):
+                     cache_by_informalization[informalization] = (
+                         informalization_embedding
+                     )
+
+             connection.close()
+             logger.info(f"Loaded {count} declarations from {db_path}")
+
+         except Exception as e:
+             logger.warning(f"Failed to load embedding cache from {db_path}: {e}")
+             continue
+
+     logger.info(f"Total cache size - informalization: {len(cache_by_informalization)}")
+
+     return EmbeddingCaches(by_informalization=cache_by_informalization)
+
+
+ async def _get_declarations_needing_embeddings(
+     session: AsyncSession, limit: int | None
+ ) -> list[Declaration]:
+     """Get declarations that need informalization embeddings.
+
+     Only returns declarations that have an informalization but no embedding yet.
+
+     Args:
+         session: Async database session
+         limit: Maximum number of declarations to retrieve (None for all)
+
+     Returns:
+         List of declarations needing embeddings
+     """
+     stmt = select(Declaration).where(
+         Declaration.informalization.isnot(None),
+         Declaration.informalization_embedding.is_(None),
+     )
+     if limit:
+         stmt = stmt.limit(limit)
+     result = await session.execute(stmt)
+     return list(result.scalars().all())
+
+
+ async def _apply_cache_to_declarations(
+     session: AsyncSession,
+     declarations: list[Declaration],
+     caches: EmbeddingCaches,
+     commit_batch_size: int = 1000,
+ ) -> tuple[int, list[Declaration]]:
+     """Apply cached embeddings to declarations.
+
+     This is a fast first pass that applies all cache hits before generating
+     new embeddings, allowing the user to see exactly how many need generation.
+
+     Args:
+         session: Async database session
+         declarations: List of declarations to check against cache
+         caches: Embedding caches from cross-database loading
+         commit_batch_size: Number of updates to batch before committing
+
+     Returns:
+         Tuple of (cache_hits_count, list of declarations still needing generation)
+     """
+     cache_hits = 0
+     remaining: list[Declaration] = []
+     batch_count = 0
+
+     for declaration in declarations:
+         if not declaration.informalization:
+             continue
+
+         if declaration.informalization in caches.by_informalization:
+             declaration.informalization_embedding = _deserialize_embedding(
+                 caches.by_informalization[declaration.informalization]
+             )
+             cache_hits += 1
+             batch_count += 1
+
+             if batch_count >= commit_batch_size:
+                 await session.commit()
+                 batch_count = 0
+         else:
+             remaining.append(declaration)
+
+     if batch_count > 0:
+         await session.commit()
+
+     return cache_hits, remaining
+
+
+ async def _process_batch(
+     session: AsyncSession,
+     declarations: list[Declaration],
+     client: EmbeddingClient,
+ ) -> int:
+     """Process a batch of declarations and generate informalization embeddings.
+
+     Args:
+         session: Async database session
+         declarations: List of declarations to process (already filtered, no cache)
+         client: Embedding client for generating embeddings
+
+     Returns:
+         Number of embeddings generated
+     """
+     texts_to_embed = []
+     declarations_to_embed = []
+
+     for declaration in declarations:
+         if not declaration.informalization:
+             continue
+         if declaration.informalization_embedding is not None:
+             continue
+         texts_to_embed.append(declaration.informalization)
+         declarations_to_embed.append(declaration)
+
+     if texts_to_embed:
+         response = await client.embed(texts_to_embed)
+
+         for declaration, embedding in zip(declarations_to_embed, response.embeddings):
+             declaration.informalization_embedding = embedding
+
+         await session.commit()
+
+     return len(texts_to_embed)
+
+
+ async def generate_embeddings(
+     engine: AsyncEngine,
+     model_name: str,
+     batch_size: int = 128,
+     limit: int | None = None,
+     max_seq_length: int = 512,
+ ) -> None:
+     """Generate embeddings for all declarations.
+
+     Args:
+         engine: Async database engine
+         model_name: Name of the sentence transformer model to use
+         batch_size: Number of declarations to process in each batch (default 128)
+         limit: Maximum number of declarations to process (None for all)
+         max_seq_length: Maximum sequence length for tokenization (default 512).
+             Lower values reduce memory usage but may truncate long texts.
+     """
+     # Discover and load embedding caches from all existing databases
+     logger.info("Discovering existing databases for embedding cache...")
+     database_files = _discover_database_files()
+     caches = _load_embedding_caches(database_files)
+
+     async with AsyncSession(engine, expire_on_commit=False) as session:
+         declarations = await _get_declarations_needing_embeddings(session, limit)
+         logger.info(f"Found {len(declarations)} declarations needing embeddings")
+
+         if not declarations:
+             logger.info("No declarations to process")
+             return
+
+         # Phase 1: Apply all cache hits first
+         logger.info("Phase 1: Applying cached embeddings...")
+         cache_hits, remaining = await _apply_cache_to_declarations(
+             session, declarations, caches
+         )
+         logger.info(
+             f"Applied {cache_hits} embeddings from cache, "
+             f"{len(remaining)} remaining need generation"
+         )
+
+         if not remaining:
+             logger.info("All embeddings served from cache, no generation needed")
+             return
+
+         # Phase 2: Generate embeddings for remaining declarations
+         logger.info("Phase 2: Generating embeddings for remaining declarations...")
+         client = EmbeddingClient(model_name=model_name, max_length=max_seq_length)
+         logger.info(f"Using {client.model_name} on {client.device}")
+
+         total = len(remaining)
+         total_embeddings = 0
+         rate_column = RateColumn(window_seconds=60)
+         with Progress(
+             SpinnerColumn(),
+             TextColumn("[progress.description]{task.description}"),
+             BarColumn(),
+             TaskProgressColumn(),
+             rate_column,
+             TimeRemainingColumn(),
+         ) as progress:
+             task = progress.add_task("Generating embeddings", total=total)
+
+             for i in range(0, total, batch_size):
+                 batch = remaining[i : i + batch_size]
+                 count = await _process_batch(session, batch, client)
+                 total_embeddings += count
+                 rate_column.add_count(count)
+                 progress.update(task, advance=len(batch))
+
+         logger.info(
+             f"Embedding generation complete: {cache_hits} applied from cache, "
+             f"{total_embeddings} newly generated"
+         )
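For orientation, the new module is driven through a SQLAlchemy AsyncEngine. Below is a minimal sketch of calling generate_embeddings directly, assuming an aiosqlite-backed database; the database path and model name are illustrative placeholders, not values shipped with the package.

import asyncio

from sqlalchemy.ext.asyncio import create_async_engine

from lean_explore.extract.embeddings import generate_embeddings

# Hypothetical database path and embedding model; substitute your own.
engine = create_async_engine("sqlite+aiosqlite:///lean_explore.db")
asyncio.run(
    generate_embeddings(
        engine,
        model_name="BAAI/bge-base-en-v1.5",  # assumed model, not the package default
        batch_size=128,
    )
)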
lean_explore/extract/github.py
@@ -0,0 +1,110 @@
+ """GitHub utilities for fetching package metadata.
+
+ This module provides functions to interact with GitHub repositories
+ for fetching toolchain versions and release tags.
+ """
+
+ import json
+ import logging
+ import re
+ import urllib.request
+
+ logger = logging.getLogger(__name__)
+
+
+ def github_url_to_raw(git_url: str, branch: str, file_path: str) -> str:
+     """Convert GitHub repo URL to raw file URL.
+
+     Args:
+         git_url: GitHub repository URL (e.g., https://github.com/owner/repo)
+         branch: Branch or tag name
+         file_path: Path to file in repo
+
+     Returns:
+         Raw GitHub URL for the file.
+     """
+     match = re.search(r"github\.com/([^/]+)/([^/]+?)(?:\.git)?$", git_url)
+     if not match:
+         raise ValueError(f"Could not parse GitHub URL: {git_url}")
+     owner, repo = match.groups()
+     return f"https://raw.githubusercontent.com/{owner}/{repo}/{branch}/{file_path}"
+
+
+ def fetch_lean_toolchain(git_url: str, ref: str = "main") -> str:
+     """Fetch lean-toolchain content from a GitHub repository.
+
+     Args:
+         git_url: GitHub repository URL
+         ref: Branch name or tag (default: main)
+
+     Returns:
+         Content of the lean-toolchain file (e.g., 'leanprover/lean4:v4.27.0')
+     """
+     raw_url = github_url_to_raw(git_url, ref, "lean-toolchain")
+     logger.info(f"Fetching lean-toolchain from {raw_url}")
+
+     try:
+         with urllib.request.urlopen(raw_url, timeout=30) as response:
+             return response.read().decode("utf-8").strip()
+     except Exception as e:
+         raise RuntimeError(f"Failed to fetch lean-toolchain from {raw_url}: {e}") from e
+
+
+ def fetch_latest_tag(git_url: str) -> str:
+     """Fetch the latest semver tag from a GitHub repository.
+
+     Args:
+         git_url: GitHub repository URL
+
+     Returns:
+         Latest tag name (e.g., 'v4.26.0')
+     """
+     match = re.search(r"github\.com/([^/]+)/([^/]+?)(?:\.git)?$", git_url)
+     if not match:
+         raise ValueError(f"Could not parse GitHub URL: {git_url}")
+     owner, repo = match.groups()
+
+     api_url = f"https://api.github.com/repos/{owner}/{repo}/tags?per_page=100"
+     logger.info(f"Fetching tags from {api_url}")
+
+     try:
+         request = urllib.request.Request(
+             api_url,
+             headers={"Accept": "application/vnd.github.v3+json"},
+         )
+         with urllib.request.urlopen(request, timeout=30) as response:
+             tags = json.loads(response.read().decode("utf-8"))
+     except Exception as e:
+         raise RuntimeError(f"Failed to fetch tags from {api_url}: {e}") from e
+
+     if not tags:
+         raise RuntimeError(f"No tags found for {git_url}")
+
+     # Filter to semver-like tags (v*.*.*)
+     semver_pattern = re.compile(r"^v?\d+\.\d+\.\d+")
+     semver_tags = [t["name"] for t in tags if semver_pattern.match(t["name"])]
+
+     if not semver_tags:
+         return tags[0]["name"]
+
+     def semver_key(tag: str) -> list[int]:
+         return [int(x) for x in re.findall(r"\d+", tag)]
+
+     semver_tags.sort(key=semver_key, reverse=True)
+     return semver_tags[0]
+
+
+ def extract_lean_version(toolchain: str) -> str:
+     """Extract version from lean-toolchain content.
+
+     Args:
+         toolchain: Toolchain content like 'leanprover/lean4:v4.27.0'
+             or 'leanprover/lean4:v4.28.0-rc1'.
+
+     Returns:
+         Version string like 'v4.27.0' or 'v4.28.0-rc1'
+     """
+     match = re.search(r"v\d+\.\d+\.\d+(?:-rc\d+)?", toolchain)
+     if not match:
+         raise ValueError(f"Could not extract version from toolchain: {toolchain}")
+     return match.group()
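A usage sketch of the helpers above; the mathlib4 URL is just an example repository and the printed values are illustrative, not guaranteed outputs.

from lean_explore.extract.github import (
    extract_lean_version,
    fetch_latest_tag,
    fetch_lean_toolchain,
)

repo = "https://github.com/leanprover-community/mathlib4"
tag = fetch_latest_tag(repo)  # latest semver-like tag, e.g. 'v4.26.0'
toolchain = fetch_lean_toolchain(repo, ref=tag)  # lean-toolchain contents at that tag
print(extract_lean_version(toolchain))  # e.g. 'v4.27.0' or 'v4.28.0-rc1'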