PyPI - okb - Versions diffs - 1.0.0__py3-none-any.whl → 1.1.0__py3-none-any.whl - Mend

okb 1.0.0__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff shows the differences between the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (28) hide show

okb/cli.py +1209 -16
okb/config.py +122 -4
okb/http_server.py +208 -2
okb/llm/analyze.py +524 -0
okb/llm/consolidate.py +685 -0
okb/llm/enrich.py +723 -0
okb/llm/extractors/__init__.py +13 -0
okb/llm/extractors/base.py +44 -0
okb/llm/extractors/cross_doc.py +478 -0
okb/llm/extractors/dedup.py +499 -0
okb/llm/extractors/entity.py +369 -0
okb/llm/extractors/todo.py +149 -0
okb/llm/providers.py +9 -6
okb/mcp_server.py +1279 -12
okb/migrations/0008.enrichment.sql +46 -0
okb/migrations/0009.entity-consolidation.sql +120 -0
okb/migrations/0010.token-id.sql +7 -0
okb/modal_llm.py +26 -8
okb/plugins/sources/__init__.py +2 -1
okb/plugins/sources/dropbox_paper.py +44 -9
okb/plugins/sources/github.py +5 -5
okb/plugins/sources/todoist.py +254 -0
okb/tokens.py +25 -3
{okb-1.0.0.dist-info → okb-1.1.0.dist-info}/METADATA +119 -68
okb-1.1.0.dist-info/RECORD +49 -0
{okb-1.0.0.dist-info → okb-1.1.0.dist-info}/entry_points.txt +1 -0
okb-1.0.0.dist-info/RECORD +0 -36
{okb-1.0.0.dist-info → okb-1.1.0.dist-info}/WHEEL +0 -0

okb/config.py CHANGED Viewed

@@ -53,7 +53,7 @@ class DatabaseConfig:
     name: str
     url: str
-    managed: bool = True  # Whether lkb manages this (Docker) or external
+    managed: bool = True  # Whether okb manages this (Docker) or external
     default: bool = False
     description: str | None = None  # Human-readable description for LLM context
     topics: list[str] | None = None  # Topic keywords to help LLM route queries
@@ -259,6 +259,7 @@ DEFAULTS = {
             "yarn.lock",
             "uv.lock",
             "Cargo.lock",
+            "poetry.lock",
             "*.pyc",
             "*.pyo",
             "*.tmp",
@@ -281,7 +282,7 @@ DEFAULTS = {
     },
     "llm": {
         # LLM provider configuration
-        # provider: None = disabled, "claude" = Anthropic API
+        # provider: None = disabled, "claude" = Anthropic API, "modal" = Modal GPU
         "provider": None,
         "model": "claude-haiku-4-5-20251001",
         "timeout": 30,
@@ -289,6 +290,38 @@ DEFAULTS = {
         # Bedrock settings (when use_bedrock is True)
         "use_bedrock": False,
         "aws_region": "us-west-2",
+        # Modal settings (when provider is "modal")
+        "modal_gpu": "L4",  # GPU type: T4, L4, A10G, A100, etc.
+    },
+    "enrichment": {
+        # LLM-based document enrichment
+        "enabled": True,
+        "version": 1,  # Increment to force re-enrichment
+        # What to extract
+        "extract_todos": True,
+        "extract_entities": True,
+        # Auto-create behavior
+        "auto_create_todos": True,      # TODOs created immediately
+        "auto_create_entities": False,  # Entities go to pending_entities table
+        # Confidence thresholds
+        "min_confidence_todo": 0.7,
+        "min_confidence_entity": 0.8,
+        # Auto-enrich during ingest (per source type)
+        "auto_enrich": {
+            "markdown": True,
+            "org": True,
+            "text": True,
+            "code": False,      # Skip code files
+            "web": False,       # Skip web pages
+            "todoist-task": False,  # Already structured
+        },
+        # Entity consolidation settings
+        "consolidation": {
+            "cross_doc_min_mentions": 3,       # Min docs for cross-doc detection
+            "embedding_similarity_threshold": 0.85,  # For duplicate detection
+            "auto_merge_threshold": 0.95,      # Auto-approve above this
+            "min_cluster_size": 3,             # Min entities per cluster
+        },
     },
 }
@@ -349,12 +382,30 @@ class Config:
     llm_cache_responses: bool = True
     llm_use_bedrock: bool = False
     llm_aws_region: str = "us-west-2"
+    llm_modal_gpu: str = "L4"
+    # Enrichment settings (loaded from config in __post_init__)
+    enrichment_enabled: bool = True
+    enrichment_version: int = 1
+    enrichment_extract_todos: bool = True
+    enrichment_extract_entities: bool = True
+    enrichment_auto_create_todos: bool = True
+    enrichment_auto_create_entities: bool = False
+    enrichment_min_confidence_todo: float = 0.7
+    enrichment_min_confidence_entity: float = 0.8
+    enrichment_auto_enrich: dict[str, bool] = field(default_factory=dict)
+    # Consolidation settings (loaded from config in __post_init__)
+    consolidation_cross_doc_min_mentions: int = 3
+    consolidation_embedding_similarity_threshold: float = 0.85
+    consolidation_auto_merge_threshold: float = 0.95
+    consolidation_min_cluster_size: int = 3
     def __post_init__(self):
         """Load configuration from file and environment."""
         file_config = load_config_file()
-        # Load and merge local config overlay (.lkbconf.yaml)
+        # Load and merge local config overlay (.okbconf.yaml)
         local_path = find_local_config()
         local_default_db: str | None = None
         if local_path:
@@ -417,7 +468,7 @@ class Config:
         else:
             # Legacy: single database_url (env > file > default)
             legacy_url = os.environ.get(
-                "KB_DATABASE_URL",
+                "OKB_DATABASE_URL",
                 file_config.get("database_url", DEFAULTS["databases"]["default"]["url"]),
             )
             self.databases["default"] = DatabaseConfig(
@@ -535,6 +586,55 @@ class Config:
         )
         self.llm_use_bedrock = llm_cfg.get("use_bedrock", DEFAULTS["llm"]["use_bedrock"])
         self.llm_aws_region = llm_cfg.get("aws_region", DEFAULTS["llm"]["aws_region"])
+        self.llm_modal_gpu = os.environ.get(
+            "OKB_MODAL_GPU",
+            llm_cfg.get("modal_gpu", DEFAULTS["llm"]["modal_gpu"]),
+        )
+        # Enrichment settings
+        enrich_cfg = file_config.get("enrichment", {})
+        self.enrichment_enabled = enrich_cfg.get("enabled", DEFAULTS["enrichment"]["enabled"])
+        self.enrichment_version = enrich_cfg.get("version", DEFAULTS["enrichment"]["version"])
+        self.enrichment_extract_todos = enrich_cfg.get(
+            "extract_todos", DEFAULTS["enrichment"]["extract_todos"]
+        )
+        self.enrichment_extract_entities = enrich_cfg.get(
+            "extract_entities", DEFAULTS["enrichment"]["extract_entities"]
+        )
+        self.enrichment_auto_create_todos = enrich_cfg.get(
+            "auto_create_todos", DEFAULTS["enrichment"]["auto_create_todos"]
+        )
+        self.enrichment_auto_create_entities = enrich_cfg.get(
+            "auto_create_entities", DEFAULTS["enrichment"]["auto_create_entities"]
+        )
+        self.enrichment_min_confidence_todo = enrich_cfg.get(
+            "min_confidence_todo", DEFAULTS["enrichment"]["min_confidence_todo"]
+        )
+        self.enrichment_min_confidence_entity = enrich_cfg.get(
+            "min_confidence_entity", DEFAULTS["enrichment"]["min_confidence_entity"]
+        )
+        self.enrichment_auto_enrich = enrich_cfg.get(
+            "auto_enrich", DEFAULTS["enrichment"]["auto_enrich"]
+        )
+        # Consolidation settings
+        consolidation_cfg = enrich_cfg.get("consolidation", {})
+        self.consolidation_cross_doc_min_mentions = consolidation_cfg.get(
+            "cross_doc_min_mentions",
+            DEFAULTS["enrichment"]["consolidation"]["cross_doc_min_mentions"],
+        )
+        self.consolidation_embedding_similarity_threshold = consolidation_cfg.get(
+            "embedding_similarity_threshold",
+            DEFAULTS["enrichment"]["consolidation"]["embedding_similarity_threshold"],
+        )
+        self.consolidation_auto_merge_threshold = consolidation_cfg.get(
+            "auto_merge_threshold",
+            DEFAULTS["enrichment"]["consolidation"]["auto_merge_threshold"],
+        )
+        self.consolidation_min_cluster_size = consolidation_cfg.get(
+            "min_cluster_size",
+            DEFAULTS["enrichment"]["consolidation"]["min_cluster_size"],
+        )
     def get_database(self, name: str | None = None) -> DatabaseConfig:
         """Get database config by name, or default if None."""
@@ -648,6 +748,24 @@ class Config:
                 "cache_responses": self.llm_cache_responses,
                 "use_bedrock": self.llm_use_bedrock,
                 "aws_region": self.llm_aws_region,
+                "modal_gpu": self.llm_modal_gpu,
+            },
+            "enrichment": {
+                "enabled": self.enrichment_enabled,
+                "version": self.enrichment_version,
+                "extract_todos": self.enrichment_extract_todos,
+                "extract_entities": self.enrichment_extract_entities,
+                "auto_create_todos": self.enrichment_auto_create_todos,
+                "auto_create_entities": self.enrichment_auto_create_entities,
+                "min_confidence_todo": self.enrichment_min_confidence_todo,
+                "min_confidence_entity": self.enrichment_min_confidence_entity,
+                "auto_enrich": self.enrichment_auto_enrich,
+                "consolidation": {
+                    "cross_doc_min_mentions": self.consolidation_cross_doc_min_mentions,
+                    "embedding_similarity_threshold": self.consolidation_embedding_similarity_threshold,
+                    "auto_merge_threshold": self.consolidation_auto_merge_threshold,
+                    "min_cluster_size": self.consolidation_min_cluster_size,
+                },
             },
         }

okb/http_server.py CHANGED Viewed

@@ -37,9 +37,15 @@ READ_ONLY_TOOLS = frozenset(
         "get_document",
         "list_sources",
         "list_projects",
+        "list_documents_by_project",
         "recent_documents",
         "get_actionable_items",
         "get_database_info",
+        "list_sync_sources",
+        "list_pending_entities",
+        "list_pending_merges",
+        "get_topic_clusters",
+        "get_entity_relationships",
     }
 )
@@ -49,6 +55,17 @@ WRITE_TOOLS = frozenset(
         "delete_knowledge",
         "set_database_description",
         "add_todo",
+        "trigger_sync",
+        "trigger_rescan",
+        "enrich_document",
+        "approve_entity",
+        "reject_entity",
+        "analyze_knowledge_base",
+        "find_entity_duplicates",
+        "merge_entities",
+        "approve_merge",
+        "reject_merge",
+        "run_consolidation",
     }
 )
@@ -206,6 +223,24 @@ class HTTPMCPServer:
                     content=[TextContent(type="text", text=f"## Projects\n\n{project_list}")]
                 )
+            elif name == "list_documents_by_project":
+                project = arguments["project"]
+                limit = arguments.get("limit", 100)
+                docs = kb.list_documents_by_project(project, limit)
+                if not docs:
+                    return CallToolResult(
+                        content=[
+                            TextContent(
+                                type="text", text=f"No documents found for project '{project}'."
+                            )
+                        ]
+                    )
+                output = [f"## Documents in '{project}' ({len(docs)} documents)\n"]
+                for d in docs:
+                    output.append(f"- **{d['title'] or d['source_path']}** ({d['source_type']})")
+                    output.append(f"  - `{d['source_path']}`")
+                return CallToolResult(content=[TextContent(type="text", text="\n".join(output))])
             elif name == "recent_documents":
                 from .mcp_server import format_relative_time, get_document_date
@@ -263,13 +298,13 @@ class HTTPMCPServer:
                 deleted = kb.delete_knowledge(arguments["source_path"])
                 if deleted:
                     return CallToolResult(
-                        content=[TextContent(type="text", text="Knowledge entry deleted.")]
+                        content=[TextContent(type="text", text="Document deleted.")]
                     )
                 return CallToolResult(
                     content=[
                         TextContent(
                             type="text",
-                            text="Could not delete. Entry not found or not a Claude-saved entry.",
+                            text="Could not delete. Document not found.",
                         )
                     ]
                 )
@@ -349,6 +384,177 @@ class HTTPMCPServer:
                     content=[TextContent(type="text", text="No fields provided to update.")]
                 )
+            elif name == "add_todo":
+                result = kb.save_todo(
+                    title=arguments["title"],
+                    content=arguments.get("content"),
+                    due_date=arguments.get("due_date"),
+                    priority=arguments.get("priority"),
+                    project=arguments.get("project"),
+                    tags=arguments.get("tags"),
+                )
+                parts = [
+                    "TODO created:",
+                    f"- Title: {result['title']}",
+                    f"- Path: `{result['source_path']}`",
+                ]
+                if result.get("priority"):
+                    parts.append(f"- Priority: P{result['priority']}")
+                if result.get("due_date"):
+                    parts.append(f"- Due: {result['due_date']}")
+                return CallToolResult(content=[TextContent(type="text", text="\n".join(parts))])
+            elif name == "trigger_sync":
+                from .mcp_server import _run_sync
+                # Get the db_url from the knowledge base
+                result = _run_sync(
+                    kb.db_url,
+                    sources=arguments.get("sources", []),
+                    sync_all=arguments.get("all", False),
+                    full=arguments.get("full", False),
+                    doc_ids=arguments.get("doc_ids"),
+                )
+                return CallToolResult(content=[TextContent(type="text", text=result)])
+            elif name == "trigger_rescan":
+                from .mcp_server import _run_rescan
+                result = _run_rescan(
+                    kb.db_url,
+                    dry_run=arguments.get("dry_run", False),
+                    delete_missing=arguments.get("delete_missing", False),
+                )
+                return CallToolResult(content=[TextContent(type="text", text=result)])
+            elif name == "list_sync_sources":
+                from .mcp_server import _list_sync_sources
+                token_info = getattr(self.server, "_current_token_info", None)
+                db_name = token_info.database if token_info else config.get_database().name
+                result = _list_sync_sources(kb.db_url, db_name)
+                return CallToolResult(content=[TextContent(type="text", text=result)])
+            elif name == "enrich_document":
+                from .mcp_server import _enrich_document
+                result = _enrich_document(
+                    kb.db_url,
+                    source_path=arguments["source_path"],
+                    extract_todos=arguments.get("extract_todos", True),
+                    extract_entities=arguments.get("extract_entities", True),
+                    auto_create_entities=arguments.get("auto_create_entities", False),
+                )
+                return CallToolResult(content=[TextContent(type="text", text=result)])
+            elif name == "list_pending_entities":
+                from .mcp_server import _list_pending_entities
+                result = _list_pending_entities(
+                    kb.db_url,
+                    entity_type=arguments.get("entity_type"),
+                    limit=arguments.get("limit", 20),
+                )
+                return CallToolResult(content=[TextContent(type="text", text=result)])
+            elif name == "approve_entity":
+                from .mcp_server import _approve_entity
+                result = _approve_entity(kb.db_url, arguments["pending_id"])
+                return CallToolResult(content=[TextContent(type="text", text=result)])
+            elif name == "reject_entity":
+                from .mcp_server import _reject_entity
+                result = _reject_entity(kb.db_url, arguments["pending_id"])
+                return CallToolResult(content=[TextContent(type="text", text=result)])
+            elif name == "analyze_knowledge_base":
+                from .mcp_server import _analyze_knowledge_base
+                result = _analyze_knowledge_base(
+                    kb.db_url,
+                    project=arguments.get("project"),
+                    sample_size=arguments.get("sample_size", 15),
+                    auto_update=arguments.get("auto_update", True),
+                )
+                return CallToolResult(content=[TextContent(type="text", text=result)])
+            # Entity consolidation tools
+            elif name == "find_entity_duplicates":
+                from .mcp_server import _find_entity_duplicates
+                result = _find_entity_duplicates(
+                    kb.db_url,
+                    similarity_threshold=arguments.get("similarity_threshold", 0.85),
+                    limit=arguments.get("limit", 50),
+                )
+                return CallToolResult(content=[TextContent(type="text", text=result)])
+            elif name == "merge_entities":
+                from .mcp_server import _merge_entities
+                result = _merge_entities(
+                    kb.db_url,
+                    canonical_path=arguments["canonical_path"],
+                    duplicate_path=arguments["duplicate_path"],
+                )
+                return CallToolResult(content=[TextContent(type="text", text=result)])
+            elif name == "list_pending_merges":
+                from .mcp_server import _list_pending_merges
+                result = _list_pending_merges(
+                    kb.db_url,
+                    limit=arguments.get("limit", 50),
+                )
+                return CallToolResult(content=[TextContent(type="text", text=result)])
+            elif name == "approve_merge":
+                from .mcp_server import _approve_merge
+                result = _approve_merge(kb.db_url, arguments["merge_id"])
+                return CallToolResult(content=[TextContent(type="text", text=result)])
+            elif name == "reject_merge":
+                from .mcp_server import _reject_merge
+                result = _reject_merge(kb.db_url, arguments["merge_id"])
+                return CallToolResult(content=[TextContent(type="text", text=result)])
+            elif name == "get_topic_clusters":
+                from .mcp_server import _get_topic_clusters
+                result = _get_topic_clusters(
+                    kb.db_url,
+                    limit=arguments.get("limit", 20),
+                )
+                return CallToolResult(content=[TextContent(type="text", text=result)])
+            elif name == "get_entity_relationships":
+                from .mcp_server import _get_entity_relationships
+                result = _get_entity_relationships(
+                    kb.db_url,
+                    entity_name=arguments.get("entity_name"),
+                    relationship_type=arguments.get("relationship_type"),
+                    limit=arguments.get("limit", 50),
+                )
+                return CallToolResult(content=[TextContent(type="text", text=result)])
+            elif name == "run_consolidation":
+                from .mcp_server import _run_consolidation
+                result = _run_consolidation(
+                    kb.db_url,
+                    detect_duplicates=arguments.get("detect_duplicates", True),
+                    detect_cross_doc=arguments.get("detect_cross_doc", True),
+                    build_clusters=arguments.get("build_clusters", True),
+                    extract_relationships=arguments.get("extract_relationships", True),
+                    dry_run=arguments.get("dry_run", False),
+                )
+                return CallToolResult(content=[TextContent(type="text", text=result)])
             else:
                 return CallToolResult(
                     content=[TextContent(type="text", text=f"Unknown tool: {name}")]

okb 1.0.0__py3-none-any.whl → 1.1.0__py3-none-any.whl

okb 1.0.0__py3-none-any.whl → 1.1.0__py3-none-any.whl