PyPI - okb - Versions diffs - 1.0.0__py3-none-any.whl → 1.1.0__py3-none-any.whl - Mend

okb-1.0.0-py3-none-any.whl → okb-1.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (28) hide show

okb/cli.py +1209 -16
okb/config.py +122 -4
okb/http_server.py +208 -2
okb/llm/analyze.py +524 -0
okb/llm/consolidate.py +685 -0
okb/llm/enrich.py +723 -0
okb/llm/extractors/__init__.py +13 -0
okb/llm/extractors/base.py +44 -0
okb/llm/extractors/cross_doc.py +478 -0
okb/llm/extractors/dedup.py +499 -0
okb/llm/extractors/entity.py +369 -0
okb/llm/extractors/todo.py +149 -0
okb/llm/providers.py +9 -6
okb/mcp_server.py +1279 -12
okb/migrations/0008.enrichment.sql +46 -0
okb/migrations/0009.entity-consolidation.sql +120 -0
okb/migrations/0010.token-id.sql +7 -0
okb/modal_llm.py +26 -8
okb/plugins/sources/__init__.py +2 -1
okb/plugins/sources/dropbox_paper.py +44 -9
okb/plugins/sources/github.py +5 -5
okb/plugins/sources/todoist.py +254 -0
okb/tokens.py +25 -3
{okb-1.0.0.dist-info → okb-1.1.0.dist-info}/METADATA +119 -68
okb-1.1.0.dist-info/RECORD +49 -0
{okb-1.0.0.dist-info → okb-1.1.0.dist-info}/entry_points.txt +1 -0
okb-1.0.0.dist-info/RECORD +0 -36
{okb-1.0.0.dist-info → okb-1.1.0.dist-info}/WHEEL +0 -0

okb/cli.py CHANGED Viewed

@@ -4,6 +4,7 @@ from __future__ import annotations
 import importlib.resources
 import json
+import os
 import shutil
 import subprocess
 import sys
@@ -44,7 +45,7 @@ def _check_docker() -> bool:
 def _get_container_status() -> str | None:
-    """Get the status of the lkb container. Returns None if not found."""
+    """Get the status of the okb container. Returns None if not found."""
     try:
         result = subprocess.run(
             [
@@ -841,7 +842,7 @@ def sync_run(
 ):
     """Sync from API sources.
-    Example: lkb sync run github --repo owner/repo
+    Example: okb sync run github --repo owner/repo
     """
     import psycopg
     from psycopg.rows import dict_row
@@ -968,6 +969,132 @@ def sync_list():
             click.echo(f"  {name}")
+@sync.command("list-projects")
+@click.argument("source")
+def sync_list_projects(source: str):
+    """List projects from an API source (for finding project IDs).
+    Example: okb sync list-projects todoist
+    """
+    from .plugins.registry import PluginRegistry
+    # Resolve the plugin by name; an unknown name is a user error.
+    source_obj = PluginRegistry.get_source(source)
+    if source_obj is None:
+        click.echo(f"Error: Source '{source}' not found.", err=True)
+        # Follow-up hints go to stderr together with the error they explain.
+        click.echo(f"Installed sources: {', '.join(PluginRegistry.list_sources())}", err=True)
+        sys.exit(1)
+    # Project listing is optional for sources; support is detected by the
+    # presence of a list_projects attribute.
+    if not hasattr(source_obj, "list_projects"):
+        click.echo(f"Error: Source '{source}' does not support listing projects.", err=True)
+        sys.exit(1)
+    # The source needs a config entry before it can be configured.
+    source_cfg = config.get_source_config(source)
+    if source_cfg is None:
+        click.echo(f"Error: Source '{source}' not configured.", err=True)
+        click.echo("Add it to your config file under plugins.sources", err=True)
+        sys.exit(1)
+    try:
+        source_obj.configure(source_cfg)
+    except Exception as e:
+        click.echo(f"Error configuring '{source}': {e}", err=True)
+        sys.exit(1)
+    # Fetching and printing share one handler so that a malformed result
+    # (e.g. entries that are not 2-tuples) is reported as a listing error too.
+    try:
+        projects = source_obj.list_projects()
+        if projects:
+            click.echo(f"Projects in {source}:")
+            for project_id, name in projects:
+                click.echo(f"  {project_id}: {name}")
+        else:
+            click.echo("No projects found.")
+    except Exception as e:
+        click.echo(f"Error listing projects: {e}", err=True)
+        sys.exit(1)
+@sync.command("auth")
+@click.argument("source")
+def sync_auth(source: str):
+    """Authenticate with an API source (get tokens).
+    Currently supports: dropbox-paper
+    Example: okb sync auth dropbox-paper
+    """
+    # Only Dropbox Paper has an interactive helper; anything else is rejected.
+    if source == "dropbox-paper":
+        _auth_dropbox()
+    else:
+        click.echo(f"Error: Authentication helper not available for '{source}'", err=True)
+        # The hint goes to stderr together with the error it explains.
+        click.echo("Supported: dropbox-paper", err=True)
+        sys.exit(1)
+def _auth_dropbox():
+    """Walk the user through Dropbox's no-redirect OAuth flow on the terminal.
+    Prompts for the app key and secret, prints the authorization URL, exchanges
+    the pasted authorization code for tokens, then prints environment variables
+    and a config snippet. Exits with status 1 if the dropbox package cannot be
+    imported or the code exchange fails.
+    """
+    try:
+        import dropbox
+        from dropbox import DropboxOAuth2FlowNoRedirect
+    except ImportError:
+        for message in (
+            "Error: dropbox package not installed",
+            "Install with: pip install dropbox",
+        ):
+            click.echo(message, err=True)
+        sys.exit(1)
+    intro_lines = (
+        "Dropbox OAuth Setup",
+        "=" * 50,
+        "",
+        "You'll need your Dropbox app credentials.",
+        "Get them at: https://www.dropbox.com/developers/apps",
+        "",
+    )
+    for line in intro_lines:
+        click.echo(line)
+    app_key = click.prompt("App key")
+    app_secret = click.prompt("App secret")
+    # "offline" access is what yields a refresh token.
+    flow = DropboxOAuth2FlowNoRedirect(app_key, app_secret, token_access_type="offline")
+    authorize_url = flow.start()
+    instruction_lines = (
+        "",
+        "1. Go to this URL in your browser:",
+        f"   {authorize_url}",
+        "",
+        "2. Click 'Allow' to authorize the app",
+        "3. Copy the authorization code",
+        "",
+    )
+    for line in instruction_lines:
+        click.echo(line)
+    auth_code = click.prompt("Enter the authorization code")
+    try:
+        oauth_result = flow.finish(auth_code.strip())
+    except Exception as e:
+        click.echo(f"Error: Failed to get tokens - {e}", err=True)
+        sys.exit(1)
+    # The ${...} placeholders in the config example are printed literally.
+    result_lines = (
+        "",
+        "Success! Add these to your environment or config:",
+        "",
+        f"DROPBOX_APP_KEY={app_key}",
+        f"DROPBOX_APP_SECRET={app_secret}",
+        f"DROPBOX_REFRESH_TOKEN={oauth_result.refresh_token}",
+        "",
+        "Config example (~/.config/okb/config.yaml):",
+        "",
+        "plugins:",
+        "  sources:",
+        "    dropbox-paper:",
+        "      enabled: true",
+        "      app_key: ${DROPBOX_APP_KEY}",
+        "      app_secret: ${DROPBOX_APP_SECRET}",
+        "      refresh_token: ${DROPBOX_REFRESH_TOKEN}",
+    )
+    for line in result_lines:
+        click.echo(line)
 @sync.command("status")
 @click.argument("source", required=False)
 @click.option("--db", "database", default=None, help="Database to check")
@@ -1083,7 +1210,7 @@ def token_list(ctx, database: str | None):
         for t in tokens:
             desc = f" - {t.description}" if t.description else ""
             last_used = t.last_used_at.strftime("%Y-%m-%d %H:%M") if t.last_used_at else "never"
-            click.echo(f"  [{t.permissions}] {t.token_hash[:12]}...{desc}")
+            click.echo(f"  ID {t.id} [{t.permissions}] {t.token_hash[:12]}...{desc}")
             created = t.created_at.strftime("%Y-%m-%d %H:%M")
             click.echo(f"      Created: {created}, Last used: {last_used}")
     except Exception as e:
@@ -1092,26 +1219,43 @@ def token_list(ctx, database: str | None):
 @token.command("revoke")
-@click.argument("token_value")
+@click.argument("token_value", required=False)
+@click.option("--id", "token_id", type=int, default=None, help="Token ID to revoke (from 'okb token list')")
 @click.option("--db", "database", default=None, help="Database to revoke token from")
 @click.pass_context
-def token_revoke(ctx, token_value: str, database: str | None):
+def token_revoke(ctx, token_value: str | None, token_id: int | None, database: str | None):
     """Revoke (delete) an API token.
-    TOKEN_VALUE must be the full token string.
+    Either provide the full TOKEN_VALUE or use --id with the token ID from 'okb token list'.
     """
-    from .tokens import delete_token
+    from .tokens import delete_token, delete_token_by_id
+    # Compare --id against None rather than by truthiness: 0 is falsy but is
+    # still an explicitly supplied ID and must follow the by-ID path.
+    if not token_value and token_id is None:
+        click.echo("Error: Provide either TOKEN_VALUE or --id", err=True)
+        sys.exit(1)
+    if token_value and token_id is not None:
+        click.echo("Error: Provide either TOKEN_VALUE or --id, not both", err=True)
+        sys.exit(1)
     db_name = database or ctx.obj.get("database")
     db_cfg = config.get_database(db_name)
     try:
-        deleted = delete_token(db_cfg.url, token_value)
-        if deleted:
-            click.echo("Token revoked.")
+        if token_id is not None:
+            deleted = delete_token_by_id(db_cfg.url, token_id)
+            if deleted:
+                click.echo(f"Token ID {token_id} revoked.")
+            else:
+                click.echo(f"Token ID {token_id} not found.", err=True)
+                sys.exit(1)
         else:
-            click.echo("Token not found. Make sure you're using the full token string.", err=True)
-            sys.exit(1)
+            deleted = delete_token(db_cfg.url, token_value)
+            if deleted:
+                click.echo("Token revoked.")
+            else:
+                click.echo("Token not found. Use --id or provide the full token string.", err=True)
+                sys.exit(1)
     except Exception as e:
         click.echo(f"Error revoking token: {e}", err=True)
         sys.exit(1)
@@ -1150,7 +1294,7 @@ def llm_status(ctx, database: str | None):
         click.echo(f"Cache responses: {config.llm_cache_responses}")
         if config.llm_provider == "modal":
-            click.echo("Backend: Modal GPU (deploy with: lkb llm deploy)")
+            click.echo("Backend: Modal GPU (deploy with: okb llm deploy)")
         elif config.llm_use_bedrock:
             click.echo(f"Backend: AWS Bedrock (region: {config.llm_aws_region})")
         else:
@@ -1240,7 +1384,9 @@ def llm_clear_cache(ctx, database: str | None, days: int | None, yes: bool):
 def llm_deploy():
     """Deploy the Modal LLM app for open model inference.
-    This deploys a GPU-accelerated LLM service on Modal using Llama 3.2.
+    Deploys a GPU-accelerated LLM service on Modal using the model from your config.
+    Default: microsoft/Phi-3-mini-4k-instruct (no HuggingFace approval needed).
     Required for using provider: modal in your config.
     Requires Modal CLI to be installed and authenticated:
@@ -1259,14 +1405,1061 @@ def llm_deploy():
         click.echo(f"Error: modal_llm.py not found at {llm_path}", err=True)
         sys.exit(1)
-    click.echo(f"Deploying {llm_path} to Modal...")
-    click.echo("Note: First deploy downloads the model (~2GB) and may take a few minutes.")
+    # Get model and GPU from config
+    model = config.llm_model or "microsoft/Phi-3-mini-4k-instruct"
+    gpu = config.llm_modal_gpu or "L4"
+    click.echo("Deploying Modal LLM:")
+    click.echo(f"  Model: {model}")
+    click.echo(f"  GPU: {gpu}")
+    click.echo("Note: First deploy downloads the model and may take a few minutes.")
+    # Set model and GPU in environment for Modal to pick up
+    env = os.environ.copy()
+    env["OKB_LLM_MODEL"] = model
+    env["OKB_MODAL_GPU"] = gpu
     result = subprocess.run(
         ["modal", "deploy", str(llm_path)],
         cwd=llm_path.parent,
+        env=env,
     )
     sys.exit(result.returncode)
+# =============================================================================
+# Enrich commands
+# =============================================================================
+@main.group()
+def enrich():
+    """LLM-based document enrichment (extract TODOs and entities)."""
+    # The group has no behaviour of its own; subcommands attach to it.
+@enrich.command("run")
+@click.option("--db", "database", default=None, help="Database to enrich")
+@click.option("--source-type", default=None, help="Filter by source type")
+@click.option("--project", default=None, help="Filter by project")
+@click.option("--query", default=None, help="Semantic search query to filter documents")
+@click.option("--path-pattern", default=None, help="SQL LIKE pattern for source_path")
+@click.option(
+    "--all", "enrich_all", is_flag=True, help="Re-enrich all documents (ignore enriched_at)"
+)
+@click.option("--dry-run", is_flag=True, help="Show what would be enriched without executing")
+@click.option("--limit", default=100, help="Maximum documents to process")
+@click.option(
+    "--workers",
+    default=None,
+    # ThreadPoolExecutor raises ValueError for max_workers <= 0, so reject
+    # such values at parse time with a normal usage error.
+    type=click.IntRange(min=1),
+    help="Parallel workers (default: docs/5, min 1)",
+)
+@click.pass_context
+def enrich_run(
+    ctx,
+    database: str | None,
+    source_type: str | None,
+    project: str | None,
+    query: str | None,
+    path_pattern: str | None,
+    enrich_all: bool,
+    dry_run: bool,
+    limit: int,
+    workers: int | None,
+):
+    """Run enrichment on documents to extract TODOs and entities.
+    By default, only processes documents that haven't been enriched yet.
+    Use --all to re-enrich all documents (e.g., after changing enrichment config).
+    Examples:
+        okb enrich run                  # Enrich un-enriched documents
+        okb enrich run --dry-run        # Show what would be enriched
+        okb enrich run --all            # Re-enrich everything
+        okb enrich run --source-type markdown  # Only markdown files
+        okb enrich run --query "meeting notes"  # Filter by semantic search
+        okb enrich run --path-pattern '%myrepo%'  # Filter by source path
+        okb enrich run --workers 8      # Use 8 parallel workers
+    """
+    from concurrent.futures import ThreadPoolExecutor, as_completed
+    from .llm import get_llm
+    from .llm.enrich import EnrichmentConfig, get_unenriched_documents, process_enrichment
+    # Check LLM is configured before doing any work
+    if get_llm() is None:
+        click.echo("Error: No LLM provider configured.", err=True)
+        click.echo("", err=True)
+        click.echo("Enrichment requires an LLM to extract TODOs and entities.", err=True)
+        click.echo("Set ANTHROPIC_API_KEY or configure in ~/.config/okb/config.yaml:", err=True)
+        click.echo("", err=True)
+        click.echo("  llm:", err=True)
+        click.echo("    provider: claude", err=True)
+        click.echo("    model: claude-haiku-4-5-20251001", err=True)
+        click.echo("", err=True)
+        click.echo("Run 'okb llm status' to check configuration.", err=True)
+        ctx.exit(1)
+    db_name = database or ctx.obj.get("database")
+    db_cfg = config.get_database(db_name)
+    # The configured version is only passed along when --all was given;
+    # otherwise the document query receives None.
+    enrichment_version = config.enrichment_version if enrich_all else None
+    click.echo(f"Scanning database '{db_cfg.name}' for documents to enrich...")
+    if dry_run:
+        click.echo("(dry run - no changes will be made)")
+    docs = get_unenriched_documents(
+        db_url=db_cfg.url,
+        source_type=source_type,
+        project=project,
+        query=query,
+        path_pattern=path_pattern,
+        enrichment_version=enrichment_version,
+        limit=limit,
+    )
+    if not docs:
+        click.echo("No documents need enrichment.")
+        return
+    click.echo(f"Found {len(docs)} documents to enrich")
+    if dry_run:
+        # Preview at most 20 titles, then summarise the remainder.
+        for doc in docs[:20]:
+            click.echo(f"  - {doc['title']} ({doc['source_type']})")
+        if len(docs) > 20:
+            click.echo(f"  ... and {len(docs) - 20} more")
+        return
+    # Calculate workers if not specified: floor(docs/5), minimum 1
+    if workers is None:
+        workers = max(1, len(docs) // 5)
+    # Build the enrichment config from the global settings.
+    enrich_config = EnrichmentConfig.from_config(
+        {
+            "enabled": config.enrichment_enabled,
+            "version": config.enrichment_version,
+            "extract_todos": config.enrichment_extract_todos,
+            "extract_entities": config.enrichment_extract_entities,
+            "auto_create_todos": config.enrichment_auto_create_todos,
+            "auto_create_entities": config.enrichment_auto_create_entities,
+            "min_confidence_todo": config.enrichment_min_confidence_todo,
+            "min_confidence_entity": config.enrichment_min_confidence_entity,
+        }
+    )
+    total_todos = 0
+    total_entities_pending = 0
+    total_entities_created = 0
+    completed = 0
+    errors = 0
+    def enrich_one(doc: dict) -> tuple[dict, dict | None, str | None]:
+        """Enrich one document in a worker thread.
+        Returns (doc, stats, error). Exactly one of stats and error is None;
+        exceptions are captured as text so one failure cannot abort the batch.
+        """
+        proj = doc["metadata"].get("project") if doc["metadata"] else None
+        try:
+            stats = process_enrichment(
+                document_id=str(doc["id"]),
+                source_path=doc["source_path"],
+                title=doc["title"],
+                content=doc["content"],
+                source_type=doc["source_type"],
+                db_url=db_cfg.url,
+                config=enrich_config,
+                project=proj,
+            )
+            return doc, stats, None
+        except Exception as e:
+            return doc, None, str(e)
+    click.echo(f"Processing with {workers} parallel workers...")
+    with ThreadPoolExecutor(max_workers=workers) as executor:
+        # enrich_one returns its own doc, so no future-to-doc mapping is needed.
+        futures = [executor.submit(enrich_one, doc) for doc in docs]
+        # Results are reported in completion order, not submission order.
+        for future in as_completed(futures):
+            doc, stats, error = future.result()
+            completed += 1
+            title = doc["title"][:40] if doc["title"] else "Untitled"
+            if error:
+                errors += 1
+                click.echo(f"[{completed}/{len(docs)}] {title}... -> error: {error[:50]}")
+                continue
+            total_todos += stats["todos_created"]
+            total_entities_pending += stats["entities_pending"]
+            total_entities_created += stats["entities_created"]
+            parts = []
+            if stats["todos_created"]:
+                parts.append(f"{stats['todos_created']} TODOs")
+            if stats["entities_pending"]:
+                parts.append(f"{stats['entities_pending']} pending")
+            if stats["entities_created"]:
+                parts.append(f"{stats['entities_created']} entities")
+            if parts:
+                click.echo(f"[{completed}/{len(docs)}] {title}... -> {', '.join(parts)}")
+            else:
+                click.echo(f"[{completed}/{len(docs)}] {title}... -> nothing extracted")
+    click.echo("")
+    click.echo("Summary:")
+    click.echo(f"  Documents processed: {len(docs)}")
+    if errors:
+        click.echo(f"  Errors: {errors}")
+    click.echo(f"  TODOs created: {total_todos}")
+    click.echo(f"  Entities pending review: {total_entities_pending}")
+    click.echo(f"  Entities auto-created: {total_entities_created}")
+@enrich.command("pending")
+@click.option("--db", "database", default=None, help="Database to check")
+@click.option("--type", "entity_type", default=None, help="Filter by entity type")
+@click.option("--limit", default=50, help="Maximum results")
+@click.pass_context
+def enrich_pending(ctx, database: str | None, entity_type: str | None, limit: int):
+    """List pending entity suggestions awaiting review.
+    Shows entities extracted from documents that need approval before
+    becoming searchable. Use 'okb enrich approve' or 'okb enrich reject'
+    to process them.
+    """
+    from .llm.enrich import list_pending_entities
+    db_cfg = config.get_database(database or ctx.obj.get("database"))
+    pending = list_pending_entities(db_cfg.url, entity_type=entity_type, limit=limit)
+    if not pending:
+        click.echo("No pending entity suggestions.")
+        return
+    click.echo(f"Pending entities ({len(pending)}):\n")
+    for entity in pending:
+        # A missing or zero confidence is simply not shown.
+        score = entity.get("confidence", 0)
+        suffix = f" ({score:.0%})" if score else ""
+        click.echo(f"  [{entity['entity_type']}] {entity['entity_name']}{suffix}")
+        click.echo(f"    ID: {entity['id']}")
+        description = entity.get("description")
+        if description:
+            # Keep listings compact: cut long descriptions at 60 characters.
+            if len(description) > 60:
+                description = description[:60] + "..."
+            click.echo(f"    {description}")
+        aliases = entity.get("aliases")
+        if aliases:
+            # Show at most the first three aliases.
+            click.echo(f"    Aliases: {', '.join(aliases[:3])}")
+        click.echo(f"    Source: {entity['source_title']}")
+        click.echo("")
+    click.echo("Use 'okb enrich approve <id>' or 'okb enrich reject <id>' to process.")
+@enrich.command("approve")
+@click.argument("pending_id")
+@click.option("--db", "database", default=None, help="Database")
+@click.option("--local", is_flag=True, help="Use local CPU embedding instead of Modal")
+@click.pass_context
+def enrich_approve(ctx, pending_id: str, database: str | None, local: bool):
+    """Approve a pending entity, creating it as a searchable document."""
+    from .llm.enrich import approve_entity
+    db_cfg = config.get_database(database or ctx.obj.get("database"))
+    # --local turns Modal off; without it, use_modal is True.
+    created_path = approve_entity(db_cfg.url, pending_id, use_modal=not local)
+    # A falsy result is treated as failure.
+    if not created_path:
+        click.echo("Failed to approve entity. ID may be invalid or already processed.", err=True)
+        sys.exit(1)
+    click.echo(f"Entity approved and created: {created_path}")
+@enrich.command("reject")
+@click.argument("pending_id")
+@click.option("--db", "database", default=None, help="Database")
+@click.pass_context
+def enrich_reject(ctx, pending_id: str, database: str | None):
+    """Reject a pending entity suggestion."""
+    from .llm.enrich import reject_entity
+    db_cfg = config.get_database(database or ctx.obj.get("database"))
+    # A falsy result is treated as failure.
+    rejected = reject_entity(db_cfg.url, pending_id)
+    if not rejected:
+        click.echo("Failed to reject entity. ID may be invalid or already processed.", err=True)
+        sys.exit(1)
+    click.echo("Entity rejected.")
+@enrich.command("analyze")
+@click.option("--db", "database", default=None, help="Database to analyze")
+@click.option("--project", default=None, help="Analyze specific project only")
+@click.option("--sample-size", default=15, help="Number of documents to sample")
+@click.option("--no-update", is_flag=True, help="Don't update database metadata")
+@click.option("--stats-only", is_flag=True, help="Show stats without LLM analysis")
+@click.pass_context
+def enrich_analyze(
+    ctx,
+    database: str | None,
+    project: str | None,
+    sample_size: int,
+    no_update: bool,
+    stats_only: bool,
+):
+    """Analyze knowledge base and update description/topics.
+    Uses entity aggregation and document sampling to understand the overall
+    content and themes in the knowledge base. Generates a description and
+    topic keywords using LLM analysis.
+    Examples:
+        okb enrich analyze              # Analyze entire database
+        okb enrich analyze --stats-only # Show stats without LLM call
+        okb enrich analyze --project myproject  # Analyze specific project
+        okb enrich analyze --no-update  # Analyze without updating metadata
+    """
+    from .llm.analyze import analyze_database, get_content_stats, get_entity_summary
+    db_cfg = config.get_database(database or ctx.obj.get("database"))
+    if project:
+        scope = f"project '{project}'"
+    else:
+        scope = f"database '{db_cfg.name}'"
+    click.echo(f"Analyzing {scope}...\n")
+    # Stats and the entity summary are gathered on every run, including --stats-only.
+    stats = get_content_stats(db_cfg.url, project)
+    entities = get_entity_summary(db_cfg.url, project, limit=20)
+    click.echo("Content Statistics:")
+    click.echo(f"  Documents: {stats['total_documents']:,}")
+    click.echo(f"  Tokens: ~{stats['total_tokens']:,}")
+    type_counts = stats["source_types"]
+    if type_counts:
+        # Largest count first.
+        ranked = sorted(type_counts.items(), key=lambda pair: -pair[1])
+        labels = [f"{kind}: {count}" for kind, count in ranked]
+        if len(labels) <= 4:
+            click.echo(f"  Source types: {', '.join(labels)}")
+        else:
+            # More than four types: print one per line instead.
+            click.echo("  Source types:")
+            for label in labels:
+                click.echo(f"    {label}")
+    if stats["projects"]:
+        click.echo(f"  Projects: {', '.join(stats['projects'])}")
+    date_range = stats["date_range"]
+    if date_range["earliest"]:
+        click.echo(f"  Date range: {date_range['earliest']} to {date_range['latest']}")
+    click.echo("")
+    if not entities:
+        click.echo("No entities extracted yet.")
+        click.echo("Run 'okb enrich run' to extract entities from documents.\n")
+    else:
+        # Up to 20 entities are fetched, but only the first 10 are shown.
+        click.echo("Top Entities (by mentions):")
+        for rank, entity in enumerate(entities[:10], 1):
+            click.echo(
+                f"  {rank}. {entity['name']} ({entity['type']})"
+                f" - {entity['ref_count']} mentions in {entity['doc_count']} docs"
+            )
+        click.echo("")
+    if stats_only:
+        return
+    # The LLM is only needed from here on, so it is imported and checked late.
+    from .llm import get_llm
+    if get_llm() is None:
+        missing_llm_help = (
+            "Error: No LLM provider configured.",
+            "",
+            "Analysis requires an LLM to generate description and topics.",
+            "Set ANTHROPIC_API_KEY or configure in ~/.config/okb/config.yaml:",
+            "",
+            "  llm:",
+            "    provider: claude",
+            "",
+            "Use --stats-only to see statistics without LLM analysis.",
+        )
+        for line in missing_llm_help:
+            click.echo(line, err=True)
+        ctx.exit(1)
+    click.echo(f"Sampling {sample_size} documents for analysis...")
+    click.echo("Generating description and topics...")
+    click.echo("")
+    try:
+        result = analyze_database(
+            db_url=db_cfg.url,
+            project=project,
+            sample_size=sample_size,
+            auto_update=not no_update,
+        )
+        click.echo("Analysis Complete:")
+        click.echo(f"  Description: {result.description}")
+        click.echo(f"  Topics: {', '.join(result.topics)}")
+        click.echo("")
+        if no_update:
+            click.echo("(metadata not updated - use without --no-update to save)")
+        else:
+            click.echo("Updated database metadata.")
+    except Exception as e:
+        click.echo(f"Error during analysis: {e}", err=True)
+        ctx.exit(1)
+@enrich.command("consolidate")
+@click.option("--db", "database", default=None, help="Database to consolidate")
+@click.option(
+    "--duplicates/--no-duplicates",
+    "detect_duplicates",
+    default=True,
+    help="Detect duplicate entities",
+)
+@click.option(
+    "--cross-doc/--no-cross-doc",
+    "detect_cross_doc",
+    default=True,
+    help="Detect cross-document entities",
+)
+@click.option(
+    "--clusters/--no-clusters",
+    "build_clusters",
+    default=True,
+    help="Build topic clusters",
+)
+@click.option(
+    "--relationships/--no-relationships",
+    "extract_relationships",
+    default=True,
+    help="Extract entity relationships",
+)
+@click.option("--dry-run", is_flag=True, help="Show what would be found without creating proposals")
+@click.pass_context
+def enrich_consolidate(
+    ctx,
+    database: str | None,
+    detect_duplicates: bool,
+    detect_cross_doc: bool,
+    build_clusters: bool,
+    extract_relationships: bool,
+    dry_run: bool,
+):
+    """Run entity consolidation pipeline.
+    Detects duplicate entities, cross-document mentions, builds topic clusters,
+    and extracts entity relationships. Creates pending proposals for review
+    rather than auto-applying changes.
+    Examples:
+        okb enrich consolidate              # Run full consolidation
+        okb enrich consolidate --dry-run    # Show what would be found
+        okb enrich consolidate --no-clusters  # Skip clustering
+        okb enrich consolidate --duplicates --no-cross-doc --no-clusters --no-relationships
+    """
+    from .llm import get_llm
+    from .llm.consolidate import format_consolidation_result, run_consolidation
+    # The LLM check is unconditional, whichever stages are enabled.
+    if get_llm() is None:
+        for line in (
+            "Error: No LLM provider configured.",
+            "Consolidation requires an LLM for deduplication and clustering.",
+            "Set ANTHROPIC_API_KEY or configure in ~/.config/okb/config.yaml",
+        ):
+            click.echo(line, err=True)
+        ctx.exit(1)
+    db_cfg = config.get_database(database or ctx.obj.get("database"))
+    click.echo(f"Running consolidation on database '{db_cfg.name}'...")
+    if dry_run:
+        click.echo("(dry run - no proposals will be created)")
+    result = run_consolidation(
+        db_url=db_cfg.url,
+        detect_duplicates=detect_duplicates,
+        detect_cross_doc=detect_cross_doc,
+        build_clusters=build_clusters,
+        extract_relationships=extract_relationships,
+        dry_run=dry_run,
+    )
+    report = format_consolidation_result(result)
+    click.echo("")
+    click.echo(report)
+    # Point at the review command only after a real run that found duplicates
+    # or cross-document candidates.
+    if not dry_run:
+        if result.duplicates_found > 0 or result.cross_doc_candidates > 0:
+            click.echo("")
+            click.echo("Use 'okb enrich merge-proposals' to review pending merges.")
+@enrich.command("merge-proposals")
+@click.option("--db", "database", default=None, help="Database to check")
+@click.option("--limit", default=50, help="Maximum results")
+@click.pass_context
+def enrich_merge_proposals(ctx, database: str | None, limit: int):
+    """List pending entity merge proposals.
+    Shows duplicate entities and cross-document mentions awaiting review.
+    Use 'okb enrich approve-merge' or 'okb enrich reject-merge' to process.
+    """
+    from .llm.extractors.dedup import list_pending_merges
+    db_cfg = config.get_database(database or ctx.obj.get("database"))
+    proposals = list_pending_merges(db_cfg.url, limit=limit)
+    if not proposals:
+        click.echo("No pending merge proposals.")
+        return
+    click.echo(f"Pending merge proposals ({len(proposals)}):\n")
+    for proposal in proposals:
+        # A missing or zero confidence is simply not shown.
+        score = proposal.get("confidence", 0)
+        suffix = f" ({score:.0%})" if score else ""
+        # The arrow points from the duplicate to the canonical entity.
+        click.echo(f"  {proposal['canonical_name']} <- {proposal['duplicate_name']}{suffix}")
+        click.echo(f"    ID: {proposal['id']}")
+        click.echo(f"    Reason: {proposal.get('reason', 'similarity')}")
+        click.echo("")
+    click.echo("Use 'okb enrich approve-merge <id>' or 'okb enrich reject-merge <id>' to process.")
+@enrich.command("approve-merge")
+@click.argument("merge_id")
+@click.option("--db", "database", default=None, help="Database")
+@click.pass_context
+def enrich_approve_merge(ctx, merge_id: str, database: str | None):
+    """Approve a pending entity merge.
+    Merges the duplicate entity into the canonical entity:
+    - Redirects all entity references from duplicate to canonical
+    - Adds duplicate's name as an alias for canonical
+    - Deletes the duplicate entity document
+    """
+    from .llm.extractors.dedup import approve_merge
+    db_cfg = config.get_database(database or ctx.obj.get("database"))
+    # A falsy result is treated as failure.
+    merged = approve_merge(db_cfg.url, merge_id)
+    if not merged:
+        click.echo("Failed to approve merge. ID may be invalid or already processed.", err=True)
+        sys.exit(1)
+    click.echo("Merge approved and executed.")
+@enrich.command("reject-merge")
+@click.argument("merge_id")
+@click.option("--db", "database", default=None, help="Database")
+@click.pass_context
+def enrich_reject_merge(ctx, merge_id: str, database: str | None):
+    """Reject a pending entity merge proposal."""
+    from .llm.extractors.dedup import reject_merge
+    db_cfg = config.get_database(database or ctx.obj.get("database"))
+    # A falsy result is treated as failure.
+    rejected = reject_merge(db_cfg.url, merge_id)
+    if not rejected:
+        click.echo("Failed to reject merge. ID may be invalid or already processed.", err=True)
+        sys.exit(1)
+    click.echo("Merge rejected.")
+@enrich.command("clusters")
+@click.option("--db", "database", default=None, help="Database to check")
+@click.option("--limit", default=20, help="Maximum clusters to show")
+@click.pass_context
+def enrich_clusters(ctx, database: str | None, limit: int):
+    """List topic clusters.
+    Shows groups of related entities and documents organized by theme.
+    """
+    from .llm.consolidate import get_topic_clusters
+    # An explicit --db wins; otherwise use the database stored on the click context.
+    target = config.get_database(database or ctx.obj.get("database"))
+    found = get_topic_clusters(target.url, limit=limit)
+    if not found:
+        click.echo("No topic clusters found.")
+        click.echo("Run 'okb enrich consolidate' to generate clusters.")
+        return
+    click.echo(f"Topic clusters ({len(found)}):\n")
+    for cluster in found:
+        click.echo(f"  {cluster['name']}")
+        summary = cluster.get("description")
+        if summary:
+            # Keep the listing compact: descriptions are cut at 70 characters.
+            if len(summary) > 70:
+                summary = summary[:70] + "..."
+            click.echo(f"    {summary}")
+        click.echo(f"    Members: {cluster['member_count']} entities/documents")
+        members = cluster.get("sample_members")
+        if members:
+            # At most five sample members are shown per cluster.
+            click.echo(f"    Examples: {', '.join(members[:5])}")
+        click.echo("")
+@enrich.command("relationships")
+@click.option("--db", "database", default=None, help="Database to check")
+@click.option("--entity", "entity_name", default=None, help="Filter to specific entity")
+@click.option("--type", "relationship_type", default=None,
+              help="Filter by relationship type (works_for, uses, belongs_to, related_to)")
+@click.option("--limit", default=50, help="Maximum results")
+@click.pass_context
+def enrich_relationships(
+    ctx,
+    database: str | None,
+    entity_name: str | None,
+    relationship_type: str | None,
+    limit: int,
+):
+    """List entity relationships.
+    Shows connections between entities (person→org, tech→project, etc.).
+    Examples:
+        okb enrich relationships                    # All relationships
+        okb enrich relationships --entity "Django"  # Filter to one entity
+        okb enrich relationships --type works_for   # Filter by type
+    """
+    from .llm.consolidate import get_entity_relationships
+    # An explicit --db wins; otherwise use the database stored on the click context.
+    target = config.get_database(database or ctx.obj.get("database"))
+    found = get_entity_relationships(
+        target.url,
+        entity_name=entity_name,
+        relationship_type=relationship_type,
+        limit=limit,
+    )
+    if not found:
+        if entity_name:
+            click.echo(f"No relationships found for entity '{entity_name}'.")
+            return
+        # Without an entity filter, point the user at the command that produces relationships.
+        click.echo("No relationships found.")
+        click.echo("Run 'okb enrich consolidate' to extract relationships.")
+        return
+    click.echo(f"Entity relationships ({len(found)}):\n")
+    for rel in found:
+        score = rel.get("confidence", 0)
+        # A missing or zero confidence is simply omitted from the line.
+        suffix = f" ({score:.0%})" if score else ""
+        click.echo(f"  {rel['source_name']} --[{rel['relationship_type']}]--> {rel['target_name']}{suffix}")
+        quote = rel.get("evidence")
+        if quote:
+            # Evidence snippets are cut at 60 characters.
+            if len(quote) > 60:
+                quote = quote[:60] + "..."
+            click.echo(f"    Evidence: {quote}")
+    click.echo("")
+@enrich.command("all")
+@click.option("--db", "database", default=None, help="Database to enrich")
+@click.option("--source-type", default=None, help="Filter by source type")
+@click.option("--project", default=None, help="Filter by project")
+@click.option("--query", default=None, help="Semantic search query to filter documents")
+@click.option("--path-pattern", default=None, help="SQL LIKE pattern for source_path")
+@click.option("--limit", default=100, help="Maximum documents to process")
+@click.option("--workers", default=None, type=int, help="Parallel workers (default: docs/5, min 1)")
+@click.option("--dry-run", is_flag=True, help="Show what would be done without executing")
+@click.option("--skip-consolidate", is_flag=True, help="Skip consolidation phase")
+@click.option("--duplicates/--no-duplicates", "detect_duplicates", default=True,
+              help="Detect duplicate entities during consolidation")
+@click.option("--clusters/--no-clusters", "build_clusters", default=True,
+              help="Build topic clusters during consolidation")
+@click.option("--relationships/--no-relationships", "extract_relationships", default=True,
+              help="Extract entity relationships during consolidation")
+@click.pass_context
+def enrich_all(
+    ctx,
+    database: str | None,
+    source_type: str | None,
+    project: str | None,
+    query: str | None,
+    path_pattern: str | None,
+    limit: int,
+    workers: int | None,
+    dry_run: bool,
+    skip_consolidate: bool,
+    detect_duplicates: bool,
+    build_clusters: bool,
+    extract_relationships: bool,
+):
+    """Run full enrichment pipeline: extraction + consolidation.
+    Combines 'enrich run' and 'enrich consolidate' in one command for
+    one-shot enrichment of documents.
+    Examples:
+        okb enrich all                          # Run full pipeline
+        okb enrich all --dry-run                # Preview what would happen
+        okb enrich all --skip-consolidate       # Run extraction only
+        okb enrich all --source-type markdown   # Filter to markdown files
+        okb enrich all --no-clusters            # Skip cluster building
+    """
+    from concurrent.futures import ThreadPoolExecutor, as_completed
+    from .llm import get_llm
+    from .llm.consolidate import format_consolidation_result, run_consolidation
+    from .llm.enrich import EnrichmentConfig, get_unenriched_documents, process_enrichment
+    # Reject an unusable worker count before doing any work: ThreadPoolExecutor raises
+    # ValueError for max_workers <= 0, which would otherwise surface as a traceback
+    # only after the database scan.
+    if workers is not None and workers < 1:
+        raise click.BadParameter("must be at least 1", param_hint="--workers")
+    # Check LLM is configured
+    if get_llm() is None:
+        click.echo("Error: No LLM provider configured.", err=True)
+        click.echo("Set ANTHROPIC_API_KEY or configure in ~/.config/okb/config.yaml", err=True)
+        ctx.exit(1)
+    # An explicit --db wins; otherwise use the database stored on the click context.
+    db_name = database or ctx.obj.get("database")
+    db_cfg = config.get_database(db_name)
+    # Phase 1: Enrichment
+    click.echo("=== Phase 1: Enrichment ===")
+    click.echo(f"Scanning database '{db_cfg.name}' for documents to enrich...")
+    if dry_run:
+        click.echo("(dry run - no changes will be made)")
+    docs = get_unenriched_documents(
+        db_url=db_cfg.url,
+        source_type=source_type,
+        project=project,
+        query=query,
+        path_pattern=path_pattern,
+        limit=limit,
+    )
+    # Running totals across all successfully enriched documents.
+    total_todos = 0
+    total_entities_pending = 0
+    total_entities_created = 0
+    if not docs:
+        click.echo("No documents need enrichment.")
+    else:
+        click.echo(f"Found {len(docs)} documents to enrich")
+        if dry_run:
+            # Preview only: list at most 20 candidates, then a count of the rest.
+            for doc in docs[:20]:
+                click.echo(f"  - {doc['title']} ({doc['source_type']})")
+            if len(docs) > 20:
+                click.echo(f"  ... and {len(docs) - 20} more")
+        else:
+            # Build config
+            enrich_config = EnrichmentConfig.from_config(
+                {
+                    "enabled": config.enrichment_enabled,
+                    "version": config.enrichment_version,
+                    "extract_todos": config.enrichment_extract_todos,
+                    "extract_entities": config.enrichment_extract_entities,
+                    "auto_create_todos": config.enrichment_auto_create_todos,
+                    "auto_create_entities": config.enrichment_auto_create_entities,
+                    "min_confidence_todo": config.enrichment_min_confidence_todo,
+                    "min_confidence_entity": config.enrichment_min_confidence_entity,
+                }
+            )
+            # Calculate workers: one per five documents, never fewer than one.
+            if workers is None:
+                workers = max(1, len(docs) // 5)
+            completed = 0
+            errors = 0
+            def enrich_one(doc: dict) -> tuple[dict, dict | None, str | None]:
+                # Runs in a worker thread and returns (doc, stats, error); error is None
+                # on success and a non-empty message on failure.
+                try:
+                    # Looked up inside the try so a malformed metadata value is reported
+                    # as this document's error instead of aborting the whole run.
+                    proj = doc["metadata"].get("project") if doc["metadata"] else None
+                    stats = process_enrichment(
+                        document_id=str(doc["id"]),
+                        source_path=doc["source_path"],
+                        title=doc["title"],
+                        content=doc["content"],
+                        source_type=doc["source_type"],
+                        db_url=db_cfg.url,
+                        config=enrich_config,
+                        project=proj,
+                    )
+                except Exception as e:
+                    # Some exceptions stringify to ""; fall back to the type name so the
+                    # failure is never mistaken for success by the caller.
+                    return doc, None, str(e) or type(e).__name__
+                return doc, stats, None
+            click.echo(f"Processing with {workers} parallel workers...")
+            with ThreadPoolExecutor(max_workers=workers) as executor:
+                futures = {executor.submit(enrich_one, doc): doc for doc in docs}
+                # Report each document as soon as its worker finishes.
+                for future in as_completed(futures):
+                    doc, stats, error = future.result()
+                    completed += 1
+                    title = doc["title"][:40] if doc["title"] else "Untitled"
+                    prefix = f"[{completed}/{len(docs)}] {title}..."
+                    if error is not None:
+                        errors += 1
+                        click.echo(f"{prefix} -> error: {error[:50]}")
+                        continue
+                    total_todos += stats["todos_created"]
+                    total_entities_pending += stats["entities_pending"]
+                    total_entities_created += stats["entities_created"]
+                    # Only non-zero counters are mentioned on the progress line.
+                    parts = []
+                    if stats["todos_created"]:
+                        parts.append(f"{stats['todos_created']} TODOs")
+                    if stats["entities_pending"]:
+                        parts.append(f"{stats['entities_pending']} pending")
+                    if stats["entities_created"]:
+                        parts.append(f"{stats['entities_created']} entities")
+                    if parts:
+                        click.echo(f"{prefix} -> {', '.join(parts)}")
+                    else:
+                        click.echo(f"{prefix} -> nothing extracted")
+            click.echo("")
+            click.echo("Enrichment summary:")
+            click.echo(f"  Documents processed: {len(docs)}")
+            if errors:
+                click.echo(f"  Errors: {errors}")
+            click.echo(f"  TODOs created: {total_todos}")
+            click.echo(f"  Entities pending review: {total_entities_pending}")
+            click.echo(f"  Entities auto-created: {total_entities_created}")
+    # Phase 2: Consolidation
+    if skip_consolidate:
+        click.echo("")
+        click.echo("Skipping consolidation (--skip-consolidate)")
+        return
+    click.echo("")
+    click.echo("=== Phase 2: Consolidation ===")
+    # dry_run is forwarded; cross-document detection is always requested here.
+    result = run_consolidation(
+        db_url=db_cfg.url,
+        detect_duplicates=detect_duplicates,
+        detect_cross_doc=True,
+        build_clusters=build_clusters,
+        extract_relationships=extract_relationships,
+        dry_run=dry_run,
+    )
+    output = format_consolidation_result(result)
+    click.echo(output)
+    if not dry_run and (result.duplicates_found > 0 or result.cross_doc_candidates > 0):
+        click.echo("")
+        click.echo("Use 'okb enrich review' to review pending entities and merges.")
+@enrich.command("review")
+@click.option("--db", "database", default=None, help="Database to review")
+@click.option("--entities-only", is_flag=True, help="Only review pending entities")
+@click.option("--merges-only", is_flag=True, help="Only review pending merges")
+@click.option("--local", is_flag=True, help="Use local CPU embedding instead of Modal")
+@click.option("--wait/--no-wait", default=True, help="Wait for embeddings to complete")
+@click.pass_context
+def enrich_review(
+    ctx, database: str | None, entities_only: bool, merges_only: bool, local: bool, wait: bool
+):
+    """Interactive review of pending entities and merge proposals.
+    Loops through pending items with approve/reject prompts.
+    Press Q to quit early - remaining items stay pending for later.
+    Entity approvals run asynchronously - you can continue reviewing while
+    embeddings are generated. Use --no-wait to exit immediately after reviewing.
+    Examples:
+        okb enrich review                    # Review all pending items
+        okb enrich review --entities-only    # Only review entities
+        okb enrich review --merges-only      # Only review merges
+        okb enrich review --local            # Use local CPU embedding
+        okb enrich review --no-wait          # Don't wait for embeddings
+    """
+    from .llm.enrich import (
+        approve_entity_async,
+        list_pending_entities,
+        reject_entity,
+        shutdown_executor,
+    )
+    from .llm.extractors.dedup import approve_merge, list_pending_merges, reject_merge
+    # The two filters exclude each other: together they would select nothing and the
+    # command would misleadingly report that no items are pending.
+    if entities_only and merges_only:
+        raise click.UsageError("--entities-only and --merges-only cannot be used together.")
+    # An explicit --db wins; otherwise use the database stored on the click context.
+    db_name = database or ctx.obj.get("database")
+    db_cfg = config.get_database(db_name)
+    use_modal = not local
+    # Get pending items (a filter flag leaves the other list empty, so its loop is a no-op)
+    entities = [] if merges_only else list_pending_entities(db_cfg.url, limit=100)
+    merges = [] if entities_only else list_pending_merges(db_cfg.url, limit=100)
+    if not entities and not merges:
+        click.echo("No pending items to review.")
+        return
+    click.echo(f"Pending: {len(entities)} entities, {len(merges)} merges")
+    click.echo("")
+    # Counters
+    approved = 0
+    rejected = 0
+    skipped = 0
+    # Track async approval futures
+    pending_futures: list[tuple] = []  # (future, entity_name)
+    def ask() -> str:
+        # Accept either case at the prompt and normalise to one upper-case letter.
+        return click.prompt(
+            "[A]pprove  [R]eject  [S]kip  [Q]uit",
+            type=click.Choice(["A", "R", "S", "Q", "a", "r", "s", "q"]),
+            show_choices=False,
+        ).upper()
+    # Set when the user presses Q; stops the current loop and skips the merge review.
+    quit_requested = False
+    # Review entities
+    for i, e in enumerate(entities, 1):
+        # Check for completed futures
+        done_count = sum(1 for f, _ in pending_futures if f.done())
+        if done_count:
+            total = len(pending_futures)
+            click.echo(click.style(f"  ({done_count}/{total} embeddings done)", dim=True))
+        click.echo(click.style(f"=== Entity Review [{i}/{len(entities)}] ===", bold=True))
+        click.echo(f"Name: {click.style(e['entity_name'], fg='cyan')}")
+        click.echo(f"Type: {e['entity_type']}")
+        confidence = e.get("confidence", 0)
+        if confidence:
+            click.echo(f"Confidence: {confidence:.0%}")
+        if e.get("description"):
+            d = e["description"]
+            # Descriptions are cut at 80 characters to keep the prompt readable.
+            desc = d[:80] + "..." if len(d) > 80 else d
+            click.echo(f"Description: {desc}")
+        if e.get("aliases"):
+            click.echo(f"Aliases: {', '.join(e['aliases'][:5])}")
+        click.echo(f"Source: {e['source_title']}")
+        click.echo("")
+        choice = ask()
+        if choice == "Q":
+            click.echo("Quitting review...")
+            quit_requested = True
+            break
+        elif choice == "A":
+            # Submit async approval; the outcome is collected after the review loops.
+            future = approve_entity_async(db_cfg.url, str(e["id"]), use_modal)
+            pending_futures.append((future, e["entity_name"]))
+            click.echo(click.style("⏳ Queued for approval", fg="cyan"))
+            approved += 1
+        elif choice == "R":
+            if reject_entity(db_cfg.url, str(e["id"])):
+                click.echo(click.style("✗ Rejected", fg="yellow"))
+                rejected += 1
+            else:
+                click.echo(click.style("✗ Failed to reject", fg="red"))
+        else:
+            click.echo("Skipped")
+            skipped += 1
+        click.echo("")
+    # Review merges (only if we didn't quit early)
+    if not quit_requested:
+        for i, m in enumerate(merges, 1):
+            click.echo(click.style(f"=== Merge Review [{i}/{len(merges)}] ===", bold=True))
+            cname = click.style(m["canonical_name"], fg="cyan")
+            ctype = m.get("canonical_type", "unknown")
+            click.echo(f"Canonical: {cname} ({ctype})")
+            dname = click.style(m["duplicate_name"], fg="yellow")
+            dtype = m.get("duplicate_type", "unknown")
+            click.echo(f"Duplicate: {dname} ({dtype})")
+            confidence = m.get("confidence", 0)
+            if confidence:
+                click.echo(f"Confidence: {confidence:.0%}")
+            click.echo(f"Reason: {m.get('reason', 'similarity')}")
+            click.echo("")
+            choice = ask()
+            if choice == "Q":
+                click.echo("Quitting review...")
+                break
+            elif choice == "A":
+                # Merges are applied synchronously, unlike entity approvals.
+                if approve_merge(db_cfg.url, str(m["id"])):
+                    click.echo(click.style("✓ Merged", fg="green"))
+                    approved += 1
+                else:
+                    click.echo(click.style("✗ Failed to merge", fg="red"))
+            elif choice == "R":
+                if reject_merge(db_cfg.url, str(m["id"])):
+                    click.echo(click.style("✗ Rejected", fg="yellow"))
+                    rejected += 1
+                else:
+                    click.echo(click.style("✗ Failed to reject", fg="red"))
+            else:
+                click.echo("Skipped")
+                skipped += 1
+            click.echo("")
+    # Wait for pending approvals if requested
+    if pending_futures:
+        if wait:
+            click.echo(f"Waiting for {len(pending_futures)} pending approvals...")
+            succeeded = 0
+            failed = 0
+            for future, name in pending_futures:
+                try:
+                    # Each queued approval gets up to 120 seconds; a timeout counts as a failure.
+                    result = future.result(timeout=120)
+                    if result:
+                        click.echo(click.style(f"  ✓ {name}", fg="green"))
+                        succeeded += 1
+                    else:
+                        click.echo(click.style(f"  ✗ {name} failed", fg="red"))
+                        failed += 1
+                except Exception as e:
+                    click.echo(click.style(f"  ✗ {name}: {e}", fg="red"))
+                    failed += 1
+            click.echo(f"Embeddings: {succeeded} succeeded, {failed} failed")
+        else:
+            done_count = sum(1 for f, _ in pending_futures if f.done())
+            pending_count = len(pending_futures) - done_count
+            if pending_count > 0:
+                click.echo(f"{pending_count} embeddings still processing in background...")
+    # Cleanup executor
+    shutdown_executor(wait=wait)
+    # Summary ("approved" counts queued entity approvals plus completed merges)
+    click.echo("")
+    click.echo(click.style("Review complete:", bold=True))
+    click.echo(f"  {click.style(str(approved), fg='green')} approved")
+    click.echo(f"  {click.style(str(rejected), fg='yellow')} rejected")
+    click.echo(f"  {skipped} skipped")
 # Call main() only when this module is executed directly, not when it is imported.
 if __name__ == "__main__":
     main()

okb 1.0.0__py3-none-any.whl → 1.1.0__py3-none-any.whl

okb 1.0.0py3-none-any.whl → 1.1.0py3-none-any.whl