threadkeeper-0.4.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. threadkeeper/__init__.py +8 -0
  2. threadkeeper/_mcp.py +6 -0
  3. threadkeeper/_setup.py +299 -0
  4. threadkeeper/adapters/__init__.py +40 -0
  5. threadkeeper/adapters/_hook_helpers.py +72 -0
  6. threadkeeper/adapters/base.py +152 -0
  7. threadkeeper/adapters/claude_code.py +178 -0
  8. threadkeeper/adapters/claude_desktop.py +128 -0
  9. threadkeeper/adapters/codex.py +259 -0
  10. threadkeeper/adapters/copilot.py +195 -0
  11. threadkeeper/adapters/gemini.py +169 -0
  12. threadkeeper/adapters/vscode.py +144 -0
  13. threadkeeper/brief.py +735 -0
  14. threadkeeper/config.py +216 -0
  15. threadkeeper/curator.py +390 -0
  16. threadkeeper/db.py +474 -0
  17. threadkeeper/embeddings.py +232 -0
  18. threadkeeper/extract_daemon.py +125 -0
  19. threadkeeper/helpers.py +101 -0
  20. threadkeeper/i18n.py +342 -0
  21. threadkeeper/identity.py +237 -0
  22. threadkeeper/ingest.py +507 -0
  23. threadkeeper/lessons.py +170 -0
  24. threadkeeper/nudges.py +257 -0
  25. threadkeeper/process_health.py +202 -0
  26. threadkeeper/review_prompts.py +207 -0
  27. threadkeeper/search_proxy.py +160 -0
  28. threadkeeper/server.py +55 -0
  29. threadkeeper/shadow_review.py +358 -0
  30. threadkeeper/skill_watcher.py +96 -0
  31. threadkeeper/spawn_budget.py +246 -0
  32. threadkeeper/tools/__init__.py +2 -0
  33. threadkeeper/tools/concepts.py +111 -0
  34. threadkeeper/tools/consolidate.py +222 -0
  35. threadkeeper/tools/core_memory.py +109 -0
  36. threadkeeper/tools/correlation.py +116 -0
  37. threadkeeper/tools/curator.py +121 -0
  38. threadkeeper/tools/dialectic.py +359 -0
  39. threadkeeper/tools/dialog.py +131 -0
  40. threadkeeper/tools/distill.py +184 -0
  41. threadkeeper/tools/extract.py +411 -0
  42. threadkeeper/tools/graph.py +183 -0
  43. threadkeeper/tools/invariants.py +177 -0
  44. threadkeeper/tools/lessons.py +110 -0
  45. threadkeeper/tools/missed_spawns.py +142 -0
  46. threadkeeper/tools/peers.py +579 -0
  47. threadkeeper/tools/pickup.py +148 -0
  48. threadkeeper/tools/probes.py +251 -0
  49. threadkeeper/tools/process_health.py +90 -0
  50. threadkeeper/tools/session.py +34 -0
  51. threadkeeper/tools/shadow_review.py +106 -0
  52. threadkeeper/tools/skills.py +856 -0
  53. threadkeeper/tools/spawn.py +871 -0
  54. threadkeeper/tools/style.py +44 -0
  55. threadkeeper/tools/threads.py +299 -0
  56. threadkeeper-0.4.0.dist-info/METADATA +351 -0
  57. threadkeeper-0.4.0.dist-info/RECORD +61 -0
  58. threadkeeper-0.4.0.dist-info/WHEEL +5 -0
  59. threadkeeper-0.4.0.dist-info/entry_points.txt +2 -0
  60. threadkeeper-0.4.0.dist-info/licenses/LICENSE +21 -0
  61. threadkeeper-0.4.0.dist-info/top_level.txt +1 -0
threadkeeper/embeddings.py
@@ -0,0 +1,232 @@
+ """Embedding model loader, vectorization, and cosine/FTS/RRF search primitives
+ over notes and dialog_messages.
+
+ Two cosine paths:
+ - Fast: sqlite-vec `vec0` virtual tables (notes_vec, dialog_vec) when the
+   extension is loaded. Sub-linear search via the vec0 KNN backend.
+ - Fallback: legacy Python-side dot product over the BLOB column. Used when
+   sqlite-vec isn't available (extension build disabled / package missing).
+   Correct, just slower at scale.
+
+ Embeddings are dual-written: every new note/dialog_message gets its
+ vector in BOTH the BLOB column AND the vec0 virtual table, so the legacy
+ path keeps working and we can roll back without data loss. Old rows are
+ backfilled to vec0 lazily by the ingester.
+ """
+ import sqlite3
+ from typing import Optional
+
+ from .config import SEMANTIC_AVAILABLE, EMBED_MODEL_NAME
+ from . import db as _db
+
+
+ def _vec_on() -> bool:
+     """Indirect lookup so monkeypatching db.vec_available in tests works."""
+     return _db.vec_available()
+
+ _model = None
+
+ def _get_model():
+     global _model
+     if not SEMANTIC_AVAILABLE:
+         return None
+     if _model is None:
+         from sentence_transformers import SentenceTransformer  # type: ignore
+         _model = SentenceTransformer(EMBED_MODEL_NAME)
+     return _model
+
+ def _embed(text: str) -> Optional[bytes]:
+     m = _get_model()
+     if m is None:
+         return None
+     v = m.encode([text], normalize_embeddings=True)[0].astype("float32")
+     return v.tobytes()
+
+
+ def _cosine_search(conn: sqlite3.Connection, query: str, k: int) -> list[dict]:
+     """Top-k cosine over notes. Uses vec0 ANN when available."""
+     m = _get_model()
+     if m is None:
+         return []
+     import numpy as np  # type: ignore
+     qv = m.encode([query], normalize_embeddings=True)[0].astype("float32")
+     if _vec_on():
+         try:
+             return _vec0_notes_search(conn, qv.tobytes(), k)
+         except sqlite3.OperationalError:
+             pass  # fall through to legacy
+     # Legacy Python-side path
+     rows = conn.execute(
+         "SELECT id, content, kind, thread_id, created_at, embedding "
+         "FROM notes WHERE embedding IS NOT NULL"
+     ).fetchall()
+     if not rows:
+         return []
+     scored = []
+     for r in rows:
+         v = np.frombuffer(r["embedding"], dtype="float32")
+         scored.append((float(np.dot(qv, v)), r))
+     scored.sort(key=lambda x: -x[0])
+     return [{"score": s, **dict(r)} for s, r in scored[:k]]
+
+
+ def _vec0_notes_search(conn: sqlite3.Connection, qv_blob: bytes,
+                        k: int) -> list[dict]:
+     """vec0 KNN over notes_vec, joined back to notes for payload.
+     Distance is Euclidean (L2) on normalized vectors; we convert it to a
+     cosine score for compatibility with the legacy result shape:
+     cos(q, v) = 1 - dist²/2 for unit-norm vectors.
+     """
+     rows = conn.execute(
+         "SELECT n.id, n.content, n.kind, n.thread_id, n.created_at, "
+         "       v.distance "
+         "FROM notes_vec v "
+         "JOIN notes n ON n.id = v.id "
+         "WHERE v.embedding MATCH ? AND k = ? "
+         "ORDER BY v.distance",
+         (qv_blob, max(1, int(k))),
+     ).fetchall()
+     out = []
+     for r in rows:
+         score = max(-1.0, min(1.0, 1.0 - (r["distance"] ** 2) / 2.0))
+         d = {k_: r[k_] for k_ in ("id", "content", "kind",
+                                   "thread_id", "created_at")}
+         d["score"] = float(score)
+         out.append(d)
+     return out
+
+
+ def _dialog_cosine_search(conn: sqlite3.Connection, query: str,
+                           k: int) -> list[dict]:
+     """Top-k cosine over dialog_messages. Uses vec0 ANN when available."""
+     m = _get_model()
+     if m is None:
+         return []
+     import numpy as np  # type: ignore
+     qv = m.encode([query], normalize_embeddings=True)[0].astype("float32")
+     if _vec_on():
+         try:
+             return _vec0_dialog_search(conn, qv.tobytes(), k)
+         except sqlite3.OperationalError:
+             pass
+     rows = conn.execute(
+         "SELECT uuid, role, project, session_id, content, created_at, embedding "
+         "FROM dialog_messages WHERE embedding IS NOT NULL"
+     ).fetchall()
+     if not rows:
+         return []
+     scored = []
+     for r in rows:
+         v = np.frombuffer(r["embedding"], dtype="float32")
+         scored.append((float(np.dot(qv, v)), r))
+     scored.sort(key=lambda x: -x[0])
+     return [{"score": s, **dict(r)} for s, r in scored[:k]]
+
+
+ def _vec_upsert_note(conn: sqlite3.Connection, note_id: int,
+                      emb_blob: Optional[bytes]) -> None:
+     """Mirror a note's embedding into notes_vec. No-op when vec0 isn't
+     loaded or the blob is None. Safe to call multiple times — uses
+     INSERT OR REPLACE keyed by integer id."""
+     if not _vec_on() or emb_blob is None:
+         return
+     try:
+         conn.execute(
+             "INSERT OR REPLACE INTO notes_vec(id, embedding) VALUES (?, ?)",
+             (note_id, emb_blob),
+         )
+     except sqlite3.OperationalError:
+         pass  # vec0 table missing on this connection — silent fall-through
+
+
+ def _vec_upsert_dialog(conn: sqlite3.Connection, uuid: str,
+                        emb_blob: Optional[bytes]) -> None:
+     """Mirror a dialog_message embedding into dialog_vec via the uuid map.
+     Resolves or assigns a rowid for the given uuid in dialog_vec_map, then
+     INSERT-OR-REPLACEs keyed by that rowid in dialog_vec."""
+     if not _vec_on() or emb_blob is None:
+         return
+     try:
+         row = conn.execute(
+             "SELECT rowid FROM dialog_vec_map WHERE uuid=?", (uuid,)
+         ).fetchone()
+         if row is None:
+             cur = conn.execute(
+                 "INSERT INTO dialog_vec_map(uuid) VALUES (?)", (uuid,)
+             )
+             vec_rowid = cur.lastrowid
+         else:
+             vec_rowid = row[0] if not hasattr(row, "keys") else row["rowid"]
+         conn.execute(
+             "INSERT OR REPLACE INTO dialog_vec(rowid, embedding) VALUES (?, ?)",
+             (vec_rowid, emb_blob),
+         )
+     except sqlite3.OperationalError:
+         pass
+
+
+ def _vec0_dialog_search(conn: sqlite3.Connection, qv_blob: bytes,
+                         k: int) -> list[dict]:
+     """vec0 KNN over dialog_vec, joined via dialog_vec_map.uuid back to
+     dialog_messages for payload."""
+     rows = conn.execute(
+         "SELECT d.uuid, d.role, d.project, d.session_id, d.content, "
+         "       d.created_at, v.distance "
+         "FROM dialog_vec v "
+         "JOIN dialog_vec_map m ON m.rowid = v.rowid "
+         "JOIN dialog_messages d ON d.uuid = m.uuid "
+         "WHERE v.embedding MATCH ? AND k = ? "
+         "ORDER BY v.distance",
+         (qv_blob, max(1, int(k))),
+     ).fetchall()
+     out = []
+     for r in rows:
+         score = max(-1.0, min(1.0, 1.0 - (r["distance"] ** 2) / 2.0))
+         d = {k_: r[k_] for k_ in ("uuid", "role", "project",
+                                   "session_id", "content", "created_at")}
+         d["score"] = float(score)
+         out.append(d)
+     return out
+
+ def _fts_search(conn: sqlite3.Connection, query: str,
+                 k: int) -> list[dict]:
+     """FTS5 search over dialog_fts joined to dialog_messages. FTS5 ranks
+     by BM25 (lower = better); we keep the FTS5 result order, which is
+     already best-first, as the ranking fed into RRF."""
+     try:
+         rows = conn.execute(
+             "SELECT f.uuid, d.role, d.session_id, d.content, d.created_at "
+             "FROM dialog_fts f "
+             "JOIN dialog_messages d ON d.uuid = f.uuid "
+             "WHERE dialog_fts MATCH ? ORDER BY rank LIMIT ?",
+             (query, max(1, int(k))),
+         ).fetchall()
+     except sqlite3.OperationalError:
+         # FTS reserved-char syntax error or table missing
+         return []
+     return [
+         {
+             "uuid": r["uuid"],
+             "role": r["role"],
+             "session_id": r["session_id"],
+             "content": r["content"],
+             "created_at": r["created_at"],
+         }
+         for r in rows
+     ]
+
+ def _rrf_combine(lists: list[list[dict]], top_n: int,
+                  k_rrf: int = 60) -> list[dict]:
+     """Reciprocal Rank Fusion: score = Σ 1/(rank + k_rrf) across input lists.
+     De-duplicates by uuid. Returns up to top_n payloads sorted by score."""
+     scores: dict[str, float] = {}
+     payloads: dict[str, dict] = {}
+     for lst in lists:
+         for rank, item in enumerate(lst):
+             uid = item.get("uuid")
+             if not uid:
+                 continue
+             scores[uid] = scores.get(uid, 0.0) + 1.0 / (rank + k_rrf)
+             if uid not in payloads:
+                 payloads[uid] = item
+     ranked = sorted(scores.items(), key=lambda x: -x[1])[:top_n]
+     return [payloads[uid] for uid, _ in ranked]
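
A quick standalone check of the two scoring conventions used above, assuming vec0 returns plain Euclidean distance on unit-norm vectors. This sketch is illustrative and not shipped in the wheel; all names are local to the example.

import numpy as np

# Unit vectors: ||q - v||^2 = 2 - 2*cos(q, v), hence cos = 1 - dist^2 / 2.
rng = np.random.default_rng(0)
q = rng.normal(size=384).astype("float32")
v = rng.normal(size=384).astype("float32")
q /= np.linalg.norm(q)
v /= np.linalg.norm(v)
dist = float(np.linalg.norm(q - v))  # what an L2 KNN backend would return
assert abs((1.0 - dist**2 / 2.0) - float(q @ v)) < 1e-5

# RRF with 0-based ranks, as in _rrf_combine: a uuid ranked first in both
# input lists scores 1/60 + 1/60 and beats any single-list hit (1/61, ...).
semantic = [{"uuid": "a"}, {"uuid": "b"}]
keyword = [{"uuid": "a"}, {"uuid": "c"}]
scores: dict[str, float] = {}
for lst in (semantic, keyword):
    for rank, item in enumerate(lst):
        scores[item["uuid"]] = scores.get(item["uuid"], 0.0) + 1.0 / (rank + 60)
assert max(scores, key=lambda u: scores[u]) == "a"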
threadkeeper/extract_daemon.py
@@ -0,0 +1,125 @@
+ """Extract daemon — periodic auto-harvest of decision-shaped utterances
+ from dialog_messages into the extract_candidates queue.
+
+ The architecture mirrors shadow_review:
+
+ 1. A daemon thread wakes every EXTRACT_INTERVAL_S seconds (0 = off).
+ 2. It calls extract_recent(window_min=EXTRACT_WINDOW_MIN) — the same logic
+    as the MCP tool, just invoked automatically instead of waiting for the
+    agent to remember to call it.
+ 3. It records events.kind='extract_pass' with the per-pass counters so
+    `extract_review_status()` can show health at a glance.
+
+ Where shadow_review extracts CLASS-LEVEL durable RULES (skills, lessons),
+ extract harvests PER-INCIDENT DECISION-SHAPED utterances and adds them
+ to the agent's review queue. The agent then materializes the survivors
+ via review_candidates() + accept_candidate().
+
+ Designed around an empirical finding (audit log, 2026-05-16): no
+ parallel session calls `note()` / `verbatim_user()` / `open_thread()`
+ on its own. Memory bookkeeping fights against the agent's primary task
+ focus. This daemon side-steps the incentive problem by harvesting from
+ what the agent ALREADY said out loud — no agent-side action required.
+ """
+
+ from __future__ import annotations
+
+ import logging
+ import sqlite3
+ import threading
+ import time
+
+ from .config import EXTRACT_INTERVAL_S, EXTRACT_WINDOW_MIN
+ from .db import get_db
+ from . import identity
+
+ logger = logging.getLogger(__name__)
+
+ _started = False
+
+
+ def _last_extract_ts(conn: sqlite3.Connection) -> int:
+     """High-water timestamp of the most recent extract pass, or 0."""
+     try:
+         row = conn.execute(
+             "SELECT target FROM events WHERE kind='extract_pass' "
+             "ORDER BY id DESC LIMIT 1"
+         ).fetchone()
+     except sqlite3.OperationalError:
+         return 0
+     if not row or not row["target"]:
+         return 0
+     try:
+         return int(row["target"])
+     except (ValueError, TypeError):
+         return 0
+
+
+ def _record_extract_pass(conn: sqlite3.Connection,
+                          ts: int,
+                          outcome: str) -> None:
+     try:
+         conn.execute(
+             "INSERT INTO events (session_id, kind, target, summary, "
+             "created_at) VALUES (?, 'extract_pass', ?, ?, ?)",
+             (identity._session_id or "", str(ts), outcome[:300],
+              int(time.time())),
+         )
+         conn.commit()
+     except sqlite3.OperationalError:
+         logger.debug("extract_daemon: failed to record pass", exc_info=True)
+
+
+ def run_extract_pass(force: bool = False) -> str:
+     """Execute one extract pass synchronously. Used by the daemon AND by
+     the MCP tool for manual triggering.
+
+     Returns the same status string `extract_recent` returns ("ok
+     window=… scanned=… verbatim=… distill=… concept=… note=…
+     skipped_existing=…" or "no_dialog window=…m"), and advances the
+     `extract_pass` cursor for telemetry.
+     """
+     if EXTRACT_INTERVAL_S <= 0 and not force:
+         return "disabled"
+     # Late import — tools.extract registers MCP tools at import time, and
+     # the daemon module loads before all tools are registered.
+     from .tools.extract import extract_recent
+     try:
+         result = extract_recent(window_min=EXTRACT_WINDOW_MIN)
+     except Exception as e:
+         logger.debug("extract_daemon: pass failed", exc_info=True)
+         _record_extract_pass(get_db(), int(time.time()),
+                              f"error: {e}")
+         return f"error: {e}"
+     _record_extract_pass(get_db(), int(time.time()), str(result)[:200])
+     return str(result)
+
+
+ def _serve_loop() -> None:
+     """Daemon body. Sleep → tick → sleep, until the process dies."""
+     while True:
+         try:
+             run_extract_pass()
+         except Exception:
+             logger.debug("extract_daemon tick failed", exc_info=True)
+         time.sleep(EXTRACT_INTERVAL_S)
+
+
+ def start_extract_daemon() -> None:
+     """Idempotent daemon starter. Honors the env gate: no-op when
+     EXTRACT_INTERVAL_S <= 0. Same cascade prevention as shadow_review:
+     slim children (SEMANTIC_AVAILABLE=False) refuse to start the
+     daemon so spawn() doesn't recurse."""
+     global _started
+     if _started:
+         return
+     if EXTRACT_INTERVAL_S <= 0:
+         return
+     from .config import SEMANTIC_AVAILABLE
+     if not SEMANTIC_AVAILABLE:
+         return  # slim child: don't fire extract from here
+     t = threading.Thread(
+         target=_serve_loop, name="extract_daemon", daemon=True,
+     )
+     t.start()
+     _started = True
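
The start/loop skeleton extract_daemon.py follows, reduced to its moving parts: an env-gated interval, an idempotency flag, and a tick that must never kill the loop. Names here are illustrative, not package code.

import os
import threading
import time

INTERVAL_S = int(os.environ.get("DEMO_INTERVAL_S", "0"))  # 0 = disabled
_started = False

def _tick() -> None:
    print("pass at", int(time.time()))  # stand-in for run_extract_pass()

def _loop() -> None:
    while True:
        try:
            _tick()
        except Exception:
            pass  # a failed pass must never take the daemon down
        time.sleep(INTERVAL_S)

def start_daemon() -> None:
    global _started
    if _started or INTERVAL_S <= 0:  # idempotent + env gate
        return
    threading.Thread(target=_loop, name="demo_daemon", daemon=True).start()
    _started = True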
threadkeeper/helpers.py
@@ -0,0 +1,101 @@
+ """Stateless utility helpers used across the package: time formatting,
+ short-quoting, ID generation, process-aliveness check."""
+ from __future__ import annotations
+
+ import os
+ import secrets
+ import sqlite3
+ import subprocess
+ from typing import Optional
+
+
+ def fmt_age(seconds: int) -> str:
+     """Compact human-readable age. 0..59 → 's', then 'm', 'h', 'd'."""
+     if seconds < 60:
+         return f"{seconds}s"
+     m = seconds // 60
+     if m < 60:
+         return f"{m}m"
+     h = m // 60
+     if h < 24:
+         return f"{h}h"
+     d = h // 24
+     return f"{d}d"
+
+
+ def q(s: str) -> str:
+     """Compact double-quote escape for brief lines."""
+     return '"' + s.replace("\\", "\\\\").replace('"', '\\"') + '"'
+
+
+ def _gen_short_id(conn: sqlite3.Connection, prefix: str, table: str,
+                   id_col: str = "id") -> str:
+     """prefix + 3 hex chars (4096 unique). Retries on collision; falls
+     back to 5 hex chars (~1M unique) if 64 retries all collide."""
+     for _ in range(64):
+         cand = prefix + secrets.token_hex(2)[:3]
+         if not conn.execute(
+             f"SELECT 1 FROM {table} WHERE {id_col}=?", (cand,)
+         ).fetchone():
+             return cand
+     return prefix + secrets.token_hex(3)[:5]
+
+
+ def gen_thread_id(conn: sqlite3.Connection) -> str:
+     return _gen_short_id(conn, "T", "threads")
+
+
+ def gen_probe_id(conn: sqlite3.Connection) -> str:
+     return _gen_short_id(conn, "P", "probes")
+
+
+ def gen_concept_id(conn: sqlite3.Connection) -> str:
+     return _gen_short_id(conn, "C", "concepts")
+
+
+ def gen_distill_id(conn: sqlite3.Connection) -> str:
+     return _gen_short_id(conn, "D", "distill")
+
+
+ def gen_dialectic_id(conn: sqlite3.Connection) -> str:
+     return _gen_short_id(conn, "UC", "user_dialectic")
+
+
+ def alive(pid: Optional[int]) -> bool:
+     """True if pid corresponds to a running (non-zombie) process. Reaps
+     zombies opportunistically when pid is our own child. pid <= 0 is a
+     sentinel (used for visible spawns we don't track) → False."""
+     if pid is None or pid <= 0:
+         return False
+     try:
+         # Reap if it's our own exited child; waitpid returning the pid
+         # means the child is gone.
+         wpid, _ = os.waitpid(pid, os.WNOHANG)
+         if wpid == pid:
+             return False
+     except (ChildProcessError, OSError):
+         pass
+     try:
+         os.kill(pid, 0)
+     except ProcessLookupError:
+         return False
+     except PermissionError:
+         return True
+     except OSError:
+         return False
+     # Process exists; distinguish zombie via `ps -o state=`. Zombies show
+     # 'Z' on macOS/Linux. If ps fails (rare), assume alive.
+     try:
+         r = subprocess.run(
+             ["ps", "-p", str(pid), "-o", "state="],
+             capture_output=True, text=True, timeout=2,
+         )
+         state = (r.stdout or "").strip()
+         if state.startswith("Z") or state == "":
+             return False
+     except (subprocess.SubprocessError, OSError):
+         pass
+     return True
+
+
+ def normalize_text(s: str) -> str:
+     """Whitespace-collapsed lower for fuzzy duplicate detection."""
+     return " ".join(s.lower().strip().split())