PyPI - okb - Versions diffs - 1.1.0a0__py3-none-any.whl → 1.1.2__py3-none-any.whl - Mend

okb 1.1.0a0__py3-none-any.whl → 1.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (24) hide show

okb/cli.py +1083 -16
okb/config.py +122 -4
okb/http_server.py +356 -91
okb/llm/analyze.py +524 -0
okb/llm/consolidate.py +685 -0
okb/llm/enrich.py +723 -0
okb/llm/extractors/__init__.py +13 -0
okb/llm/extractors/base.py +44 -0
okb/llm/extractors/cross_doc.py +478 -0
okb/llm/extractors/dedup.py +499 -0
okb/llm/extractors/entity.py +369 -0
okb/llm/extractors/todo.py +149 -0
okb/llm/providers.py +9 -6
okb/mcp_server.py +1036 -12
okb/migrations/0008.enrichment.sql +46 -0
okb/migrations/0009.entity-consolidation.sql +120 -0
okb/migrations/0010.token-id.sql +7 -0
okb/modal_llm.py +26 -8
okb/plugins/sources/github.py +5 -5
okb/tokens.py +25 -3
{okb-1.1.0a0.dist-info → okb-1.1.2.dist-info}/METADATA +91 -8
{okb-1.1.0a0.dist-info → okb-1.1.2.dist-info}/RECORD +24 -12
{okb-1.1.0a0.dist-info → okb-1.1.2.dist-info}/WHEEL +0 -0
{okb-1.1.0a0.dist-info → okb-1.1.2.dist-info}/entry_points.txt +0 -0

okb/llm/extractors/entity.py ADDED Viewed

@@ -0,0 +1,369 @@
+"""Entity extraction from document content using LLM."""
+from __future__ import annotations
+import json
+import re
+from .base import ExtractedEntity
+# System prompt passed as `system=` to complete() by extract_entities(). It names
+# the four accepted entity types, the categories to ignore, and the JSON fields
+# that _parse_entity_response() reads back from each array item.
+ENTITY_SYSTEM_PROMPT = """\
+You are an expert at identifying named entities in text for a PERSONAL knowledge base.
+Extract only entities that are specific to the author's context - things an LLM wouldn't know about.
+Entity types to extract:
+- person: People the author knows, works with, or references (colleagues, contacts, clients)
+- project: Specific named projects/products/codebases (e.g., "Acme Dashboard", "customer-portal")
+          NOT git branches, environments, or workflow stages
+- technology: ONLY obscure/niche tools or internal systems - NOT well-known technologies
+- organization: Specific companies, teams, clients the author works with
+DO NOT extract:
+- Well-known technologies: JSON, HTTP, SQL, Python, JavaScript, Docker, AWS, PostgreSQL, React, etc.
+  (The LLM already knows these - they add no value to a personal knowledge base)
+- Code symbols: function names, method calls, variables, class names
+- Generic terms: "user", "data", "system", "database", "API", "server", "client"
+- Git branches/workflow terms: main, master, develop, release, staging, production, feature, hotfix
+- Generic process terms: deploy, build, test, migration, setup, config
+- Environment names: dev, prod, qa, uat, local
+- Issue or bug descriptions - those are documents, not entities
+- Famous people, major companies (Google, Microsoft, etc.) unless contextually relevant to author
+ONLY extract entities that would help answer "Who/what is X?" where X is specific to this person.
+For each entity found, extract:
+- name: The canonical name (proper noun)
+- entity_type: One of: person, project, technology, organization
+- aliases: Other names/abbreviations (optional)
+- description: Brief description based on context (optional)
+- mentions: Text snippets where entity appears (max 3)
+- confidence: How confident you are (0.0-1.0)
+Return JSON array. Return empty array [] if no context-specific entities found.
+"""
+# User-prompt template filled by extract_entities() via str.format() with the
+# keys `title`, `source_type` and `content`. Because it goes through str.format(),
+# any literal brace added to this template would have to be doubled.
+ENTITY_USER_PROMPT = """\
+Document title: {title}
+Source type: {source_type}
+Content:
+{content}
+Extract named entities as JSON array.
+"""
+def extract_entities(
+    content: str,
+    title: str,
+    source_type: str | None = None,
+    min_confidence: float = 0.8,
+) -> list[ExtractedEntity]:
+    """Ask the LLM for context-specific entities found in a document.
+    Content longer than 20,000 characters is cut to that length and a truncation
+    marker is appended before it is placed into ENTITY_USER_PROMPT.
+    Args:
+        content: Document text to analyze
+        title: Document title, included in the prompt and handed to the parser
+        source_type: Type of document; "unknown" is sent when it is falsy
+        min_confidence: Minimum confidence threshold (0-1) applied by the parser
+    Returns:
+        Parsed entities, or an empty list when the LLM call returns None
+    """
+    from .. import complete
+    max_chars = 20000
+    body = content
+    if len(body) > max_chars:
+        body = body[:max_chars] + "\n\n[... content truncated ...]"
+    llm_response = complete(
+        prompt=ENTITY_USER_PROMPT.format(
+            title=title,
+            source_type=source_type or "unknown",
+            content=body,
+        ),
+        system=ENTITY_SYSTEM_PROMPT,
+        max_tokens=2048,
+        use_cache=True,
+    )
+    if llm_response is not None:
+        return _parse_entity_response(llm_response.content, min_confidence, title)
+    return []
+def _looks_like_code(name: str) -> bool:
+    """Return True when an entity name resembles a code symbol.
+    A name is treated as code when it contains a parenthesis, is all-lowercase
+    with an underscore, or starts with a lowercase letter and also contains a
+    dot or an uppercase letter.
+    """
+    if not name:
+        # Nothing to inspect; none of the heuristics below can match.
+        return False
+    # Parentheses suggest a function call.
+    if any(paren in name for paren in "()"):
+        return True
+    # All-lowercase with underscores suggests a snake_case identifier.
+    if name.islower() and "_" in name:
+        return True
+    # The remaining heuristics only apply to names starting in lowercase.
+    if not name[0].islower():
+        return False
+    # Dotted method chain, or camelCase variable/method name.
+    return "." in name or any(ch.isupper() for ch in name)
+# Lower-cased names of well-known technologies. They add no value to a personal
+# knowledge base, so entity names matching one of them are filtered out.
+COMMON_TECHNOLOGIES = frozenset(
+    map(
+        str.lower,
+        (
+            # Data formats
+            "JSON", "XML", "YAML", "CSV", "HTML", "CSS", "Markdown",
+            # Protocols
+            "HTTP", "HTTPS", "REST", "GraphQL", "WebSocket",
+            "TCP", "UDP", "SSH", "FTP", "SMTP",
+            # Languages
+            "Python", "JavaScript", "TypeScript", "Java", "Go", "Rust",
+            "C", "C++", "Ruby", "PHP", "Swift", "Kotlin",
+            "Scala", "Bash", "Shell", "SQL", "Lua",
+            # Major frameworks/tools
+            "React", "Vue", "Angular", "Node.js", "Django", "Flask",
+            "FastAPI", "Rails", "Spring", "Express", "Next.js",
+            # Databases
+            "PostgreSQL", "MySQL", "MongoDB", "Redis",
+            "SQLite", "Elasticsearch", "DynamoDB",
+            # Cloud/infra
+            "AWS", "Azure", "GCP", "Docker", "Kubernetes",
+            "Linux", "Windows", "macOS", "Nginx", "Apache",
+            # Tools
+            "Git", "GitHub", "GitLab", "npm", "pip",
+            "Webpack", "VS Code", "Vim", "Emacs",
+        ),
+    )
+)
+# Lower-cased git, workflow, environment and architecture words that are too
+# generic to be context-specific; entity names matching one are filtered out.
+GENERIC_TERMS = frozenset(
+    {
+        term.lower()
+        for term in (
+            # Git branches
+            "main", "master", "develop", "development", "release",
+            "staging", "production", "feature", "hotfix", "bugfix",
+            # Environments
+            "dev", "prod", "test", "qa", "uat", "local", "sandbox",
+            # Workflow/process terms
+            "deploy", "build", "migration", "setup",
+            "config", "configuration", "rollback", "rollout",
+            # Generic architectural terms
+            "frontend", "backend", "api", "service", "server", "client", "app",
+            "application", "module", "component", "library", "package", "plugin", "extension",
+            # Generic data terms
+            "database", "cache", "queue", "worker", "scheduler", "cron",
+        )
+    }
+)
+def _parse_entity_response(
+    response_text: str, min_confidence: float, title: str = ""
+) -> list[ExtractedEntity]:
+    """Parse LLM response into ExtractedEntity objects.
+    The span from the first "[" to the last "]" of the response is decoded as
+    JSON, so prose or code fences around the array are tolerated. Malformed
+    items are skipped one by one instead of aborting the whole parse.
+    Args:
+        response_text: Raw text returned by the LLM
+        min_confidence: Entities with a lower confidence are discarded
+        title: Document title; an entity whose name equals it (ignoring case)
+            is discarded
+    Returns:
+        Entities that passed every filter, in response order. Empty when no
+        JSON array can be found or decoded.
+    """
+    # Try to extract JSON from response
+    json_match = re.search(r"\[.*\]", response_text, re.DOTALL)
+    if not json_match:
+        return []
+    try:
+        entities_data = json.loads(json_match.group())
+    except json.JSONDecodeError:
+        return []
+    if not isinstance(entities_data, list):
+        return []
+    valid_types = {"person", "project", "technology", "organization"}
+    title_key = title.lower() if title else ""
+    entities = []
+    for item in entities_data:
+        if not isinstance(item, dict):
+            continue
+        # Check the type BEFORE calling str methods: the LLM may emit
+        # `"name": null` or a number, which must skip the item, not raise.
+        raw_name = item.get("name")
+        if not isinstance(raw_name, str):
+            continue
+        name = raw_name.strip()
+        if not name:
+            continue
+        # A list/dict entity_type is unhashable and would raise TypeError in
+        # the set membership test, so require a string first.
+        entity_type = item.get("entity_type")
+        if not isinstance(entity_type, str) or entity_type not in valid_types:
+            continue
+        # Filter: too short or too long
+        if len(name) < 3 or len(name) > 80:
+            continue
+        # Filter: looks like code
+        if _looks_like_code(name):
+            continue
+        name_key = name.lower()
+        # Filter: well-known technologies (LLM already knows these)
+        if name_key in COMMON_TECHNOLOGIES:
+            continue
+        # Filter: generic git/workflow/environment terms
+        if name_key in GENERIC_TERMS:
+            continue
+        # Filter: matches document title (source shouldn't be extracted as entity)
+        if title_key and name_key == title_key:
+            continue
+        # Get confidence (default to 0.85 if missing or not numeric)
+        confidence = item.get("confidence", 0.85)
+        if not isinstance(confidence, (int, float)):
+            confidence = 0.85
+        if confidence < min_confidence:
+            continue
+        # Parse aliases, keeping only string entries
+        aliases = item.get("aliases", [])
+        if not isinstance(aliases, list):
+            aliases = []
+        aliases = [a for a in aliases if isinstance(a, str)]
+        # Parse mentions, keeping at most three string entries
+        mentions = item.get("mentions", [])
+        if not isinstance(mentions, list):
+            mentions = []
+        mentions = [m for m in mentions if isinstance(m, str)][:3]
+        # Description is optional; anything that is not text is treated as absent
+        description = item.get("description")
+        if not isinstance(description, str):
+            description = None
+        entities.append(
+            ExtractedEntity(
+                name=name,
+                entity_type=entity_type,
+                aliases=aliases,
+                description=description,
+                mentions=mentions,
+                confidence=float(confidence),
+            )
+        )
+    return entities
+def normalize_entity_name(name: str) -> str:
+    """Turn an entity name into a lowercase, hyphen-separated slug.
+    Used for deduplication and URL generation. Every character other than
+    a-z, 0-9 or whitespace acts as a word separator.
+    Examples:
+        "John Smith" -> "john-smith"
+        "AWS (Amazon Web Services)" -> "aws-amazon-web-services"
+        "React.js" -> "react-js"
+    """
+    # Lowercase, then blank out everything that is not alphanumeric/whitespace
+    spaced = re.sub(r"[^a-z0-9\s]", " ", name.lower())
+    # split() drops leading/trailing whitespace and collapses inner runs, so the
+    # joined result can never start or end with a hyphen
+    return "-".join(spaced.split())
+def entity_source_path(entity_type: str, name: str) -> str:
+    """Build the source_path of an entity document.
+    The name is passed through normalize_entity_name(); the entity type is
+    inserted as given.
+    Format: okb://entity/{type}/{normalized-name}
+    """
+    return f"okb://entity/{entity_type}/{normalize_entity_name(name)}"

okb/llm/extractors/todo.py ADDED Viewed

@@ -0,0 +1,149 @@
+"""TODO extraction from document content using LLM."""
+from __future__ import annotations
+import json
+import re
+from datetime import UTC, datetime
+from .base import ExtractedTodo
+# System prompt passed as `system=` to complete() by extract_todos(). It lists
+# the cues that mark an action item and the JSON fields expected per TODO.
+# NOTE(review): _parse_todo_response() also reads an optional `confidence` key
+# that this prompt does not request - confirm whether that is intended.
+TODO_SYSTEM_PROMPT = """\
+You are an expert at identifying action items and tasks in text.
+Extract TODO items from the given document content.
+Look for:
+- Explicit markers: TODO, FIXME, HACK, XXX, ACTION
+- Action phrases: "need to", "should", "must", "have to", "action item"
+- Deadlines and commitments: "by Friday", "before the meeting", "this week"
+- Questions implying needed work: "What about X?", "How do we handle Y?"
+- Incomplete items marked for follow-up
+For each TODO found, extract:
+- title: A concise description of the task (imperative form: "Fix the bug", not "The bug needs fixing")
+- content: Additional context or details (optional)
+- due_date: If a deadline is mentioned, in ISO format YYYY-MM-DD (optional)
+- priority: 1=urgent, 2=high, 3=normal, 4=low, 5=someday (optional)
+- assignee: Person responsible if mentioned (optional)
+- source_context: The exact text snippet where this TODO was found
+Return JSON array of extracted TODOs. Return empty array [] if none found.
+Be conservative - only extract clear action items, not vague mentions.
+"""
+# User-prompt template filled by extract_todos() via str.format() with the keys
+# `title`, `source_type` and `content`. Because it goes through str.format(),
+# any literal brace added to this template would have to be doubled.
+TODO_USER_PROMPT = """\
+Document title: {title}
+Source type: {source_type}
+Content:
+{content}
+Extract all TODO items from this content as JSON array.
+"""
+def extract_todos(
+    content: str,
+    title: str,
+    source_type: str,
+    min_confidence: float = 0.7,
+) -> list[ExtractedTodo]:
+    """Ask the LLM for the action items contained in a document.
+    Content longer than 20,000 characters is cut to that length and a truncation
+    marker is appended before it is placed into TODO_USER_PROMPT.
+    Args:
+        content: Document text to analyze
+        title: Document title, included in the prompt for context
+        source_type: Type of document (markdown, code, org, etc.)
+        min_confidence: Minimum confidence threshold (0-1) applied by the parser
+    Returns:
+        Parsed TODO items, or an empty list when the LLM call returns None
+    """
+    from .. import complete
+    max_chars = 20000
+    body = content
+    if len(body) > max_chars:
+        # Keep only the leading part of the document for context
+        body = body[:max_chars] + "\n\n[... content truncated ...]"
+    llm_response = complete(
+        prompt=TODO_USER_PROMPT.format(
+            title=title,
+            source_type=source_type,
+            content=body,
+        ),
+        system=TODO_SYSTEM_PROMPT,
+        max_tokens=2048,
+        use_cache=True,
+    )
+    if llm_response is not None:
+        return _parse_todo_response(llm_response.content, min_confidence)
+    return []
+def _parse_todo_response(response_text: str, min_confidence: float) -> list[ExtractedTodo]:
+    """Parse LLM response into ExtractedTodo objects.
+    The span from the first "[" to the last "]" of the response is decoded as
+    JSON, so prose or code fences around the array are tolerated. Malformed
+    items and malformed optional fields are dropped individually.
+    Args:
+        response_text: Raw text returned by the LLM
+        min_confidence: TODOs with a lower confidence are discarded
+    Returns:
+        TODO items in response order. Empty when no JSON array can be found or
+        decoded.
+    """
+    # Try to extract JSON from response
+    json_match = re.search(r"\[.*\]", response_text, re.DOTALL)
+    if not json_match:
+        return []
+    try:
+        todos_data = json.loads(json_match.group())
+    except json.JSONDecodeError:
+        return []
+    if not isinstance(todos_data, list):
+        return []
+    todos = []
+    for item in todos_data:
+        if not isinstance(item, dict):
+            continue
+        # Strip before the emptiness check so a whitespace-only title is
+        # rejected instead of producing a TODO with an empty title.
+        raw_title = item.get("title")
+        if not isinstance(raw_title, str):
+            continue
+        title = raw_title.strip()
+        if not title:
+            continue
+        # Parse due_date if present
+        due_date = None
+        due_str = item.get("due_date")
+        if isinstance(due_str, str) and due_str:
+            try:
+                parsed = datetime.fromisoformat(due_str)
+                if parsed.tzinfo is None:
+                    # Naive value (e.g. "2024-05-01"): interpret it as UTC.
+                    due_date = parsed.replace(tzinfo=UTC)
+                else:
+                    # Explicit offset: convert, so the instant is preserved
+                    # rather than relabelled as UTC.
+                    due_date = parsed.astimezone(UTC)
+            except (ValueError, OverflowError):
+                due_date = None
+        # Parse priority (1-5); bools are rejected so `true` is not read as 1
+        priority = None
+        raw_priority = item.get("priority")
+        if raw_priority and not isinstance(raw_priority, bool):
+            try:
+                candidate = int(raw_priority)
+            except (ValueError, TypeError, OverflowError):
+                # OverflowError: json.loads accepts `Infinity`, which int() rejects
+                candidate = None
+            if candidate is not None and 1 <= candidate <= 5:
+                priority = candidate
+        # Get confidence (default to 0.8 if missing or not numeric)
+        confidence = item.get("confidence", 0.8)
+        if not isinstance(confidence, (int, float)):
+            confidence = 0.8
+        if confidence < min_confidence:
+            continue
+        # Optional text fields: anything that is not a string is treated as absent
+        content = item.get("content")
+        if not isinstance(content, str):
+            content = None
+        assignee = item.get("assignee")
+        if not isinstance(assignee, str):
+            assignee = None
+        source_context = item.get("source_context")
+        if not isinstance(source_context, str):
+            source_context = None
+        todos.append(
+            ExtractedTodo(
+                title=title,
+                content=content,
+                due_date=due_date,
+                priority=priority,
+                assignee=assignee,
+                confidence=float(confidence),
+                source_context=source_context,
+            )
+        )
+    return todos

okb/llm/providers.py CHANGED Viewed

@@ -165,13 +165,13 @@ class ClaudeProvider:
 class ModalProvider:
-    """Modal-based LLM provider using open models (Llama, Mistral, etc.).
+    """Modal-based LLM provider using open models (Phi-3, Llama, Mistral, etc.).
     Runs on Modal GPU infrastructure - no API key needed, pay per compute.
-    Requires deploying the Modal app first: `modal deploy lkb/modal_llm.py`
+    Requires deploying the Modal app first: `okb llm deploy`
     Config:
-        model: Model name (default: meta-llama/Llama-3.2-3B-Instruct)
+        model: Model name (default: microsoft/Phi-3-mini-4k-instruct)
         timeout: Request timeout in seconds (default: 60)
     """
@@ -179,7 +179,7 @@ class ModalProvider:
     def __init__(self) -> None:
         self._llm = None
-        self._model: str = "meta-llama/Llama-3.2-3B-Instruct"
+        self._model: str = "microsoft/Phi-3-mini-4k-instruct"
         self._timeout: int = 60
     def configure(self, config: dict) -> None:
@@ -202,7 +202,7 @@ class ModalProvider:
             self._llm = modal.Cls.from_name("knowledge-llm", "LLM")()
         except modal.exception.NotFoundError:
             raise RuntimeError(
-                "Modal LLM app not deployed. Deploy with: modal deploy lkb/modal_llm.py"
+                "Modal LLM app not deployed. Deploy with: okb llm deploy"
             )
     def complete(
@@ -244,9 +244,12 @@ class ModalProvider:
     def list_models(self) -> list[str]:
         """List recommended models for Modal."""
         return [
+            # Non-gated (work immediately)
+            "microsoft/Phi-3-mini-4k-instruct",
+            "Qwen/Qwen2-1.5B-Instruct",
+            # Gated (require HuggingFace approval + HF_TOKEN)
             "meta-llama/Llama-3.2-3B-Instruct",
             "meta-llama/Llama-3.2-1B-Instruct",
-            "mistralai/Mistral-7B-Instruct-v0.3",
         ]

okb 1.1.0a0__py3-none-any.whl → 1.1.2__py3-none-any.whl

okb 1.1.0a0__py3-none-any.whl → 1.1.2__py3-none-any.whl