claude-sql 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,155 @@
1
+ """Blind-handover stripper for grader-safe session bundles.
2
+
3
+ Goal: hand a session transcript to an external grader (cross-provider
4
+ judge) with all identity markers removed, so the grader cannot use
5
+ "who sent this" as a cue. The stripper removes:
6
+
7
+ * Slack IDs: users (``U[A-Z0-9]{8,}``), channels (``C[A-Z0-9]+``),
8
+ teams (``T[A-Z0-9]+``), DM channels (``D[A-Z0-9]+``), workflows
9
+ (``W[A-Z0-9]+``).
10
+ * Agent persona markers: ``:moyai:``, ``Bonk →``, ``Bonk:``,
11
+ ``Clod:``, ``Clod →``, ``-- Bonk`` sign-offs.
12
+ * Protocol tokens: ``over :radio:``, ``out :radio:``.
13
+ * Tool names: ``mcp__<server>__<tool>`` calls in plain text.
14
+ * System IDs: OTel trace IDs, session UUIDs, work item IDs
15
+ (``wi_[0-9a-f]{12}``), thread timestamps (``\d{10}\.\d{6,}``).
16
+ * Mrkdwn/markdown formatting: ``<@UXXX>``, ``<#CXXX|name>``,
17
+ ``<!here>``, ``<!channel>``.
18
+
19
+ Original session_id is hashed (SHA256[:16]) so bundles remain
20
+ re-linkable for post-grading analysis without leaking identity.
21
+ """
22
+
23
+ from __future__ import annotations
24
+
25
+ import hashlib
26
+ import re
27
+ from dataclasses import dataclass
28
+
29
# ---------------------------------------------------------------------------
# Precompiled patterns
# ---------------------------------------------------------------------------

# Bare Slack IDs. User IDs require >= 8 trailing chars; channel/team/DM/
# workflow IDs use the same length floor to avoid eating ordinary words
# that start with C/T/D/W.
_SLACK_USER_RE = re.compile(r"\bU[A-Z0-9]{8,}\b")
_SLACK_CHANNEL_RE = re.compile(r"\bC[A-Z0-9]{8,}\b")
_SLACK_TEAM_RE = re.compile(r"\bT[A-Z0-9]{8,}\b")
_SLACK_DM_RE = re.compile(r"\bD[A-Z0-9]{8,}\b")
_SLACK_WORKFLOW_RE = re.compile(r"\bW[A-Z0-9]{8,}\b")

# Mrkdwn refs like <@UABC123>, <#CABC123|chan>, <!here>, <!channel>.
# These must be stripped BEFORE the bare-ID patterns above, otherwise the
# inner ID would be replaced first and leave stray angle brackets behind.
_MRKDWN_USER_REF_RE = re.compile(r"<@U[A-Z0-9]{8,}>")
_MRKDWN_CHANNEL_REF_RE = re.compile(r"<#C[A-Z0-9]{8,}(?:\|[^>]*)?>")
_MRKDWN_BROADCAST_RE = re.compile(r"<!(?:here|channel|everyone)>")

# Agent persona markers (case-sensitive on purpose — these are brand tokens).
# Order matters: the arrow forms must run before the bare-name forms (a bare
# "Bonk" match would otherwise leave "→ name:" dangling), and the final
# "-- [agent]" sign-off pattern only matches text PRODUCED by the bare-name
# substitutions, so it must run last.
_PERSONA_PATTERNS: tuple[tuple[re.Pattern[str], str], ...] = (
    (re.compile(r":moyai:\s*"), ""),
    (re.compile(r"\bBonk\s*(?:→|->)\s*\w+:?\s*"), "[agent] "),
    (re.compile(r"\bClod\s*(?:→|->)\s*\w+:?\s*"), "[agent] "),
    (re.compile(r"(?<![a-zA-Z])Bonk\b"), "[agent]"),
    (re.compile(r"(?<![a-zA-Z])Clod\b"), "[agent]"),
    (re.compile(r"--\s*\[agent\]\s*"), ""),
)

# Protocol tokens ("over :radio:" end-of-turn, "out :radio:" sign-off).
_RADIO_OVER_RE = re.compile(r"over\s*:radio:\s*", re.IGNORECASE)
_RADIO_OUT_RE = re.compile(r"out\s*:radio:\s*", re.IGNORECASE)

# System IDs: work items, Slack thread timestamps, OTel trace IDs, UUIDs.
_WORK_ITEM_RE = re.compile(r"\bwi_[0-9a-f]{12}\b")
_THREAD_TS_RE = re.compile(r"\b\d{10}\.\d{6,}\b")
_OTEL_TRACE_ID_RE = re.compile(r"\b[0-9a-f]{32}\b")
_SESSION_UUID_RE = re.compile(r"\b[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}\b")

# MCP tool names in plain text, e.g. ``mcp__server__tool``.
_MCP_TOOL_RE = re.compile(r"\bmcp__[a-z0-9_-]+__[a-z0-9_-]+\b")


@dataclass(frozen=True)
class BlindResult:
    """Output of ``strip_text``: the cleaned text + diagnostic counts."""

    text: str              # blinded transcript text
    n_user_ids: int        # mrkdwn user refs + bare user IDs removed
    n_channel_ids: int     # channel/team/DM/workflow/broadcast refs removed
    n_persona_markers: int  # persona-pattern substitutions performed
    n_protocol_tokens: int  # "over :radio:" / "out :radio:" tokens removed
    n_system_ids: int      # work items + thread ts + trace IDs + UUIDs
    n_tool_names: int      # mcp__server__tool occurrences removed


def strip_text(text: str) -> BlindResult:
    """Strip every identity marker from ``text``.

    Returns both the cleaned text and a count of each category, so
    callers can log per-strip diagnostics (e.g., "session X had 12
    persona markers, 4 thread timestamps removed").

    Counts come from the substitutions actually performed (``subn``)
    rather than a pre-scan of the original text. A pre-scan double-counts
    overlapping patterns — ``<@UABC...>`` matches both the mrkdwn-ref
    pattern and the bare-ID pattern — and can never see the
    ``-- [agent]`` sign-off, which only exists after the Bonk/Clod
    substitutions have run.
    """
    # Mrkdwn refs first: longest-tokens-first, otherwise ``<@U...>`` would
    # have the inner ID stripped and leave stray angle brackets.
    text, n_mrkdwn_users = _MRKDWN_USER_REF_RE.subn("[user]", text)
    text, n_mrkdwn_channels = _MRKDWN_CHANNEL_REF_RE.subn("[channel]", text)
    text, n_broadcasts = _MRKDWN_BROADCAST_RE.subn("[broadcast]", text)
    text, n_bare_users = _SLACK_USER_RE.subn("[user]", text)
    text, n_bare_channels = _SLACK_CHANNEL_RE.subn("[channel]", text)
    text, n_teams = _SLACK_TEAM_RE.subn("[team]", text)
    text, n_dms = _SLACK_DM_RE.subn("[dm]", text)
    text, n_workflows = _SLACK_WORKFLOW_RE.subn("[workflow]", text)

    # Persona markers, in declared order (arrow forms, bare names, sign-off).
    n_persona_markers = 0
    for pat, replacement in _PERSONA_PATTERNS:
        text, n = pat.subn(replacement, text)
        n_persona_markers += n

    text, n_over = _RADIO_OVER_RE.subn("[end-of-turn] ", text)
    text, n_out = _RADIO_OUT_RE.subn("[sign-off] ", text)

    text, n_tool_names = _MCP_TOOL_RE.subn("[tool]", text)
    text, n_work_items = _WORK_ITEM_RE.subn("[work-item]", text)
    text, n_thread_ts = _THREAD_TS_RE.subn("[ts]", text)
    # OTel trace IDs before session UUIDs — they overlap in character set
    # but not in structure (dashed vs not).
    text, n_traces = _OTEL_TRACE_ID_RE.subn("[trace]", text)
    text, n_uuids = _SESSION_UUID_RE.subn("[session]", text)

    # Collapse runs of whitespace introduced by strip operations
    text = re.sub(r"[ \t]{2,}", " ", text).strip()

    return BlindResult(
        text=text,
        n_user_ids=n_mrkdwn_users + n_bare_users,
        n_channel_ids=(
            n_mrkdwn_channels
            + n_broadcasts
            + n_bare_channels
            + n_teams
            + n_dms
            + n_workflows
        ),
        n_persona_markers=n_persona_markers,
        n_protocol_tokens=n_over + n_out,
        n_system_ids=n_work_items + n_thread_ts + n_traces + n_uuids,
        n_tool_names=n_tool_names,
    )
146
+
147
+
148
def original_hash(session_id: str) -> str:
    """Return a stable 16-char hash of ``session_id`` for re-linkage.

    Used so the blinded bundle keeps an ``original_hash`` column that
    lets post-grading analysis re-associate scores with the true
    session without exposing the ID to the grader.
    """
    digest = hashlib.sha256(session_id.encode("utf-8"))
    return digest.hexdigest()[:16]
@@ -0,0 +1,202 @@
1
+ """Per-(session_id, pipeline) checkpoint backed by a persistent DuckDB file.
2
+
3
+ Tracks when each LLM pipeline last processed each session so re-runs skip
4
+ sessions whose transcripts have not advanced. One row per
5
+ ``(session_id, pipeline)``; ``INSERT OR REPLACE`` is the upsert primitive.
6
+
7
+ Schema::
8
+
9
+ CREATE TABLE session_checkpoint (
10
+ session_id VARCHAR,
11
+ pipeline VARCHAR,
12
+ last_ts_processed TIMESTAMP,
13
+ last_mtime_processed TIMESTAMP,
14
+ completed_at TIMESTAMP NOT NULL,
15
+ PRIMARY KEY (session_id, pipeline)
16
+ );
17
+
18
+ All timestamps are UTC. Plain ``TIMESTAMP`` (not ``TIMESTAMP WITH TIME
19
+ ZONE``) because DuckDB's tz-aware type requires ``pytz`` at query time —
20
+ an extra dep we don't want. We stash tz-aware UTC datetimes by stripping
21
+ ``tzinfo`` at the boundary and re-attaching ``UTC`` on read.
22
+
23
+ The file lives at ``~/.claude/claude_sql.duckdb`` (overridable via
24
+ ``CLAUDE_SQL_CHECKPOINT_DB_PATH``).
25
+ """
26
+
27
+ from __future__ import annotations
28
+
29
+ import time
30
+ from collections.abc import Iterable
31
+ from datetime import UTC, datetime
32
+ from pathlib import Path
33
+
34
+ import duckdb
35
+
36
# The three LLM pipelines that checkpoint against this table; each gets its
# own row per session via the (session_id, pipeline) primary key.
PIPELINE_NAMES: tuple[str, ...] = ("classify", "trajectory", "conflicts")

# Idempotent DDL executed on every connect (see ``_connect``). The composite
# primary key is what makes ``INSERT OR REPLACE`` the natural upsert.
# Plain TIMESTAMP (naive) on purpose — see the module docstring.
_CREATE_TABLE_SQL = """
CREATE TABLE IF NOT EXISTS session_checkpoint (
    session_id VARCHAR NOT NULL,
    pipeline VARCHAR NOT NULL,
    last_ts_processed TIMESTAMP,
    last_mtime_processed TIMESTAMP,
    completed_at TIMESTAMP NOT NULL,
    PRIMARY KEY (session_id, pipeline)
);
"""
48
+
49
+
50
def _strip_tz(dt: datetime | None) -> datetime | None:
    """Normalise to naive UTC so DuckDB's plain TIMESTAMP round-trips without pytz."""
    return None if dt is None else dt.astimezone(UTC).replace(tzinfo=None)
55
+
56
+
57
def _attach_tz(dt: datetime | None) -> datetime | None:
    """Tag a stored naive timestamp as UTC so callers always get aware datetimes."""
    return None if dt is None else dt.replace(tzinfo=UTC)
62
+
63
+
64
def _connect(path: Path, *, max_attempts: int = 20) -> duckdb.DuckDBPyConnection:
    """Open the checkpoint DB and ensure the table exists.

    DuckDB file connections are exclusive at the process level — when three
    pipelines run in parallel, one grabs the lock and the others see
    ``IOException: Could not set lock``. Retry with exponential backoff so
    concurrent callers serialize rather than crash. 20 attempts x up to
    1.6s each covers the multi-minute chunk cadence comfortably.

    Raises:
        ValueError: when ``max_attempts`` is not positive.
        duckdb.IOException: the last error, when every attempt fails.
    """
    if max_attempts < 1:
        raise ValueError("max_attempts must be >= 1")
    path.parent.mkdir(parents=True, exist_ok=True)
    delay = 0.05
    last_err: duckdb.IOException | None = None
    for _ in range(max_attempts):
        try:
            con = duckdb.connect(str(path))
        except duckdb.IOException as exc:
            last_err = exc
        else:
            try:
                con.execute(_CREATE_TABLE_SQL)
            except duckdb.IOException as exc:
                # Close before retrying — otherwise the open handle keeps
                # the file lock and leaks a connection per failed attempt.
                con.close()
                last_err = exc
            else:
                return con
        time.sleep(delay)
        delay = min(delay * 1.5, 1.6)
    # The loop ran at least once without returning, so last_err is set.
    # Raise it explicitly rather than via `assert` (stripped under -O).
    raise last_err
87
+
88
+
89
def load_as_map(db_path: Path, pipeline: str) -> dict[str, tuple[datetime | None, datetime | None]]:
    """Return ``{session_id: (last_ts, last_mtime)}`` for one pipeline.

    Empty dict when the DB doesn't exist yet or the pipeline has no rows.
    All returned datetimes are tz-aware UTC (re-attached on read).
    """
    if not db_path.exists():
        return {}
    con = _connect(db_path)
    try:
        records = con.execute(
            "SELECT session_id, last_ts_processed, last_mtime_processed "
            "FROM session_checkpoint WHERE pipeline = ?",
            [pipeline],
        ).fetchall()
    finally:
        con.close()
    result: dict[str, tuple[datetime | None, datetime | None]] = {}
    for sid, ts, mtime in records:
        result[str(sid)] = (_attach_tz(ts), _attach_tz(mtime))
    return result
108
+
109
+
110
def filter_unchanged(
    candidates: Iterable[tuple[str, datetime | None, datetime | None]],
    *,
    pipeline: str,
    checkpoint_db_path: Path,
) -> tuple[list[str], int]:
    """Drop sessions whose ``(last_ts, last_mtime)`` has not advanced.

    ``candidates`` is an iterable of ``(session_id, current_last_ts,
    current_last_mtime)``. Returns ``(pending_session_ids, skipped_count)``.

    A session is skipped iff a checkpoint row exists for ``pipeline`` AND
    both ``current_last_ts <= ckpt.last_ts`` AND ``current_last_mtime <=
    ckpt.last_mtime``. Either bound moving forward invalidates the skip.
    """
    checkpoints = load_as_map(checkpoint_db_path, pipeline)
    pending: list[str] = []
    skipped = 0
    for sid, cur_ts, cur_mtime in candidates:
        entry = checkpoints.get(sid)
        if entry is not None:
            prev_ts, prev_mtime = entry
            unchanged = _stale_or_equal(cur_ts, prev_ts) and _stale_or_equal(cur_mtime, prev_mtime)
            if unchanged:
                skipped += 1
                continue
        pending.append(sid)
    return pending, skipped
139
+
140
+
141
def _stale_or_equal(cur: datetime | None, prev: datetime | None) -> bool:
    """True iff both are present and ``cur`` has not advanced past ``prev``.

    Normalises both operands to naive-UTC before comparing so aware-vs-naive
    drift from different upstream sources (read_json → aware, checkpoint
    re-attach → aware, raw DuckDB TIMESTAMP fetch → naive) never raises
    ``TypeError``.

    A naive operand here is already a UTC wall time (raw DuckDB fetch), so
    it is compared as-is. Routing it through ``astimezone(UTC)`` (as
    ``_strip_tz`` does) would interpret it as *local* time and shift it by
    the host's UTC offset, corrupting the comparison on non-UTC hosts.
    """
    if cur is None or prev is None:
        return False

    def as_naive_utc(dt: datetime) -> datetime:
        # Only convert when tzinfo is actually present; naive == UTC here.
        if dt.tzinfo is None:
            return dt
        return dt.astimezone(UTC).replace(tzinfo=None)

    return as_naive_utc(cur) <= as_naive_utc(prev)
156
+
157
+
158
def mark_completed(
    db_path: Path,
    *,
    pipeline: str,
    rows: Iterable[tuple[str, datetime | None, datetime | None]],
) -> int:
    """Upsert checkpoint rows for ``(session_id, pipeline)``.

    Each row is ``(session_id, last_ts_processed, last_mtime_processed)``.
    The ``completed_at`` column is stamped with ``datetime.now(UTC)``.

    Returns the number of upserted rows. When ``rows`` is empty, the DB is
    left untouched.
    """
    batch = list(rows)
    if not batch:
        return 0
    # Single stamp for the whole batch; stored naive per the module rule.
    stamp = datetime.now(UTC).replace(tzinfo=None)
    payload = [
        (sid, pipeline, _strip_tz(ts), _strip_tz(mtime), stamp)
        for sid, ts, mtime in batch
    ]
    con = _connect(db_path)
    try:
        con.executemany(
            "INSERT OR REPLACE INTO session_checkpoint "
            "(session_id, pipeline, last_ts_processed, last_mtime_processed, completed_at) "
            "VALUES (?, ?, ?, ?, ?)",
            payload,
        )
    finally:
        con.close()
    return len(batch)
191
+
192
+
193
def count_rows(db_path: Path) -> int:
    """Return the total number of checkpoint rows, or 0 when the DB is missing."""
    if not db_path.exists():
        return 0
    con = _connect(db_path)
    try:
        result = con.execute("SELECT count(*) FROM session_checkpoint").fetchone()
    finally:
        con.close()
    return 0 if result is None else int(result[0])