claude-sql 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,155 @@
1
+ """Blind-handover stripper for grader-safe session bundles.
2
+
3
+ Goal: hand a session transcript to an external grader (cross-provider
4
+ judge) with all identity markers removed, so the grader cannot use
5
+ "who sent this" as a cue. The stripper removes:
6
+
7
+ * Slack IDs: users (``U[A-Z0-9]{8,}``), channels (``C[A-Z0-9]+``),
8
+ teams (``T[A-Z0-9]+``), DM channels (``D[A-Z0-9]+``), workflows
9
+ (``W[A-Z0-9]+``).
10
+ * Agent persona markers: ``:moyai:``, ``Bonk →``, ``Bonk:``,
11
+ ``Clod:``, ``Clod →``, ``-- Bonk`` sign-offs.
12
+ * Protocol tokens: ``over :radio:``, ``out :radio:``.
13
+ * Tool names: ``mcp__<server>__<tool>`` calls in plain text.
14
+ * System IDs: OTel trace IDs, session UUIDs, work item IDs
15
+ (``wi_[0-9a-f]{12}``), thread timestamps (``\d{10}\.\d{6,}``).
16
+ * Mrkdwn/markdown formatting: ``<@UXXX>``, ``<#CXXX|name>``,
17
+ ``<!here>``, ``<!channel>``.
18
+
19
+ Original session_id is hashed (SHA256[:16]) so bundles remain
20
+ re-linkable for post-grading analysis without leaking identity.
21
+ """
22
+
23
+ from __future__ import annotations
24
+
25
+ import hashlib
26
+ import re
27
+ from dataclasses import dataclass
28
+
29
# ---------------------------------------------------------------------------
# Precompiled patterns
# ---------------------------------------------------------------------------

# Bare Slack IDs. User IDs require >= 8 trailing chars; channel/team/DM/
# workflow IDs use the same length floor to avoid eating ordinary words
# that start with C/T/D/W.
_SLACK_USER_RE = re.compile(r"\bU[A-Z0-9]{8,}\b")
_SLACK_CHANNEL_RE = re.compile(r"\bC[A-Z0-9]{8,}\b")
_SLACK_TEAM_RE = re.compile(r"\bT[A-Z0-9]{8,}\b")
_SLACK_DM_RE = re.compile(r"\bD[A-Z0-9]{8,}\b")
_SLACK_WORKFLOW_RE = re.compile(r"\bW[A-Z0-9]{8,}\b")

# Mrkdwn refs like <@UABC123>, <#CABC123|chan>, <!here>, <!channel>.
# These must be stripped BEFORE the bare-ID patterns above, otherwise the
# inner ID would be replaced first and leave stray angle brackets behind.
_MRKDWN_USER_REF_RE = re.compile(r"<@U[A-Z0-9]{8,}>")
_MRKDWN_CHANNEL_REF_RE = re.compile(r"<#C[A-Z0-9]{8,}(?:\|[^>]*)?>")
_MRKDWN_BROADCAST_RE = re.compile(r"<!(?:here|channel|everyone)>")

# Agent persona markers (case-sensitive on purpose — these are brand tokens).
# Order matters: the arrow forms must run before the bare-name forms (a bare
# "Bonk" match would otherwise leave "→ name:" dangling), and the final
# "-- [agent]" sign-off pattern only matches text PRODUCED by the bare-name
# substitutions, so it must run last.
_PERSONA_PATTERNS: tuple[tuple[re.Pattern[str], str], ...] = (
    (re.compile(r":moyai:\s*"), ""),
    (re.compile(r"\bBonk\s*(?:→|->)\s*\w+:?\s*"), "[agent] "),
    (re.compile(r"\bClod\s*(?:→|->)\s*\w+:?\s*"), "[agent] "),
    (re.compile(r"(?<![a-zA-Z])Bonk\b"), "[agent]"),
    (re.compile(r"(?<![a-zA-Z])Clod\b"), "[agent]"),
    (re.compile(r"--\s*\[agent\]\s*"), ""),
)

# Protocol tokens ("over :radio:" end-of-turn, "out :radio:" sign-off).
_RADIO_OVER_RE = re.compile(r"over\s*:radio:\s*", re.IGNORECASE)
_RADIO_OUT_RE = re.compile(r"out\s*:radio:\s*", re.IGNORECASE)

# System IDs: work items, Slack thread timestamps, OTel trace IDs, UUIDs.
_WORK_ITEM_RE = re.compile(r"\bwi_[0-9a-f]{12}\b")
_THREAD_TS_RE = re.compile(r"\b\d{10}\.\d{6,}\b")
_OTEL_TRACE_ID_RE = re.compile(r"\b[0-9a-f]{32}\b")
_SESSION_UUID_RE = re.compile(r"\b[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}\b")

# MCP tool names in plain text, e.g. ``mcp__server__tool``.
_MCP_TOOL_RE = re.compile(r"\bmcp__[a-z0-9_-]+__[a-z0-9_-]+\b")


@dataclass(frozen=True)
class BlindResult:
    """Output of ``strip_text``: the cleaned text + diagnostic counts."""

    text: str              # blinded transcript text
    n_user_ids: int        # mrkdwn user refs + bare user IDs removed
    n_channel_ids: int     # channel/team/DM/workflow/broadcast refs removed
    n_persona_markers: int  # persona-pattern substitutions performed
    n_protocol_tokens: int  # "over :radio:" / "out :radio:" tokens removed
    n_system_ids: int      # work items + thread ts + trace IDs + UUIDs
    n_tool_names: int      # mcp__server__tool occurrences removed


def strip_text(text: str) -> BlindResult:
    """Strip every identity marker from ``text``.

    Returns both the cleaned text and a count of each category, so
    callers can log per-strip diagnostics (e.g., "session X had 12
    persona markers, 4 thread timestamps removed").

    Counts come from the substitutions actually performed (``subn``)
    rather than a pre-scan of the original text. A pre-scan double-counts
    overlapping patterns — ``<@UABC...>`` matches both the mrkdwn-ref
    pattern and the bare-ID pattern — and can never see the
    ``-- [agent]`` sign-off, which only exists after the Bonk/Clod
    substitutions have run.
    """
    # Mrkdwn refs first: longest-tokens-first, otherwise ``<@U...>`` would
    # have the inner ID stripped and leave stray angle brackets.
    text, n_mrkdwn_users = _MRKDWN_USER_REF_RE.subn("[user]", text)
    text, n_mrkdwn_channels = _MRKDWN_CHANNEL_REF_RE.subn("[channel]", text)
    text, n_broadcasts = _MRKDWN_BROADCAST_RE.subn("[broadcast]", text)
    text, n_bare_users = _SLACK_USER_RE.subn("[user]", text)
    text, n_bare_channels = _SLACK_CHANNEL_RE.subn("[channel]", text)
    text, n_teams = _SLACK_TEAM_RE.subn("[team]", text)
    text, n_dms = _SLACK_DM_RE.subn("[dm]", text)
    text, n_workflows = _SLACK_WORKFLOW_RE.subn("[workflow]", text)

    # Persona markers, in declared order (arrow forms, bare names, sign-off).
    n_persona_markers = 0
    for pat, replacement in _PERSONA_PATTERNS:
        text, n = pat.subn(replacement, text)
        n_persona_markers += n

    text, n_over = _RADIO_OVER_RE.subn("[end-of-turn] ", text)
    text, n_out = _RADIO_OUT_RE.subn("[sign-off] ", text)

    text, n_tool_names = _MCP_TOOL_RE.subn("[tool]", text)
    text, n_work_items = _WORK_ITEM_RE.subn("[work-item]", text)
    text, n_thread_ts = _THREAD_TS_RE.subn("[ts]", text)
    # OTel trace IDs before session UUIDs — they overlap in character set
    # but not in structure (dashed vs not).
    text, n_traces = _OTEL_TRACE_ID_RE.subn("[trace]", text)
    text, n_uuids = _SESSION_UUID_RE.subn("[session]", text)

    # Collapse runs of whitespace introduced by strip operations
    text = re.sub(r"[ \t]{2,}", " ", text).strip()

    return BlindResult(
        text=text,
        n_user_ids=n_mrkdwn_users + n_bare_users,
        n_channel_ids=(
            n_mrkdwn_channels
            + n_broadcasts
            + n_bare_channels
            + n_teams
            + n_dms
            + n_workflows
        ),
        n_persona_markers=n_persona_markers,
        n_protocol_tokens=n_over + n_out,
        n_system_ids=n_work_items + n_thread_ts + n_traces + n_uuids,
        n_tool_names=n_tool_names,
    )
146
+
147
+
148
def original_hash(session_id: str) -> str:
    """Return a stable 16-char hash of ``session_id`` for re-linkage.

    Used so the blinded bundle keeps an ``original_hash`` column that
    lets post-grading analysis re-associate scores with the true
    session without exposing the ID to the grader.
    """
    digest = hashlib.sha256(session_id.encode("utf-8"))
    return digest.hexdigest()[:16]
@@ -0,0 +1,202 @@
1
+ """Per-(session_id, pipeline) checkpoint backed by a persistent DuckDB file.
2
+
3
+ Tracks when each LLM pipeline last processed each session so re-runs skip
4
+ sessions whose transcripts have not advanced. One row per
5
+ ``(session_id, pipeline)``; ``INSERT OR REPLACE`` is the upsert primitive.
6
+
7
+ Schema::
8
+
9
+ CREATE TABLE session_checkpoint (
10
+ session_id VARCHAR,
11
+ pipeline VARCHAR,
12
+ last_ts_processed TIMESTAMP,
13
+ last_mtime_processed TIMESTAMP,
14
+ completed_at TIMESTAMP NOT NULL,
15
+ PRIMARY KEY (session_id, pipeline)
16
+ );
17
+
18
+ All timestamps are UTC. Plain ``TIMESTAMP`` (not ``TIMESTAMP WITH TIME
19
+ ZONE``) because DuckDB's tz-aware type requires ``pytz`` at query time —
20
+ an extra dep we don't want. We stash tz-aware UTC datetimes by stripping
21
+ ``tzinfo`` at the boundary and re-attaching ``UTC`` on read.
22
+
23
+ The file lives at ``~/.claude/claude_sql.duckdb`` (overridable via
24
+ ``CLAUDE_SQL_CHECKPOINT_DB_PATH``).
25
+ """
26
+
27
+ from __future__ import annotations
28
+
29
+ import time
30
+ from collections.abc import Iterable
31
+ from datetime import UTC, datetime
32
+ from pathlib import Path
33
+
34
+ import duckdb
35
+
36
# The three LLM pipelines that checkpoint against this table; each gets its
# own row per session via the (session_id, pipeline) primary key.
PIPELINE_NAMES: tuple[str, ...] = ("classify", "trajectory", "conflicts")

# Idempotent DDL executed on every connect (see ``_connect``). The composite
# primary key is what makes ``INSERT OR REPLACE`` the natural upsert.
# Plain TIMESTAMP (naive) on purpose — see the module docstring.
_CREATE_TABLE_SQL = """
CREATE TABLE IF NOT EXISTS session_checkpoint (
    session_id VARCHAR NOT NULL,
    pipeline VARCHAR NOT NULL,
    last_ts_processed TIMESTAMP,
    last_mtime_processed TIMESTAMP,
    completed_at TIMESTAMP NOT NULL,
    PRIMARY KEY (session_id, pipeline)
);
"""
48
+
49
+
50
def _strip_tz(dt: datetime | None) -> datetime | None:
    """Normalise to naive UTC so DuckDB's plain TIMESTAMP round-trips without pytz."""
    return None if dt is None else dt.astimezone(UTC).replace(tzinfo=None)
55
+
56
+
57
def _attach_tz(dt: datetime | None) -> datetime | None:
    """Tag a stored naive timestamp as UTC so callers always get aware datetimes."""
    return None if dt is None else dt.replace(tzinfo=UTC)
62
+
63
+
64
def _connect(path: Path, *, max_attempts: int = 20) -> duckdb.DuckDBPyConnection:
    """Open the checkpoint DB and ensure the table exists.

    DuckDB file connections are exclusive at the process level — when three
    pipelines run in parallel, one grabs the lock and the others see
    ``IOException: Could not set lock``. Retry with exponential backoff so
    concurrent callers serialize rather than crash. 20 attempts x up to
    1.6s each covers the multi-minute chunk cadence comfortably.

    Raises:
        ValueError: when ``max_attempts`` is not positive.
        duckdb.IOException: the last error, when every attempt fails.
    """
    if max_attempts < 1:
        raise ValueError("max_attempts must be >= 1")
    path.parent.mkdir(parents=True, exist_ok=True)
    delay = 0.05
    last_err: duckdb.IOException | None = None
    for _ in range(max_attempts):
        try:
            con = duckdb.connect(str(path))
        except duckdb.IOException as exc:
            last_err = exc
        else:
            try:
                con.execute(_CREATE_TABLE_SQL)
            except duckdb.IOException as exc:
                # Close before retrying — otherwise the open handle keeps
                # the file lock and leaks a connection per failed attempt.
                con.close()
                last_err = exc
            else:
                return con
        time.sleep(delay)
        delay = min(delay * 1.5, 1.6)
    # The loop ran at least once without returning, so last_err is set.
    # Raise it explicitly rather than via `assert` (stripped under -O).
    raise last_err
87
+
88
+
89
def load_as_map(db_path: Path, pipeline: str) -> dict[str, tuple[datetime | None, datetime | None]]:
    """Return ``{session_id: (last_ts, last_mtime)}`` for one pipeline.

    Empty dict when the DB doesn't exist yet or the pipeline has no rows.
    All returned datetimes are tz-aware UTC (re-attached on read).
    """
    if not db_path.exists():
        return {}
    con = _connect(db_path)
    try:
        records = con.execute(
            "SELECT session_id, last_ts_processed, last_mtime_processed "
            "FROM session_checkpoint WHERE pipeline = ?",
            [pipeline],
        ).fetchall()
    finally:
        con.close()
    result: dict[str, tuple[datetime | None, datetime | None]] = {}
    for sid, ts, mtime in records:
        result[str(sid)] = (_attach_tz(ts), _attach_tz(mtime))
    return result
108
+
109
+
110
def filter_unchanged(
    candidates: Iterable[tuple[str, datetime | None, datetime | None]],
    *,
    pipeline: str,
    checkpoint_db_path: Path,
) -> tuple[list[str], int]:
    """Drop sessions whose ``(last_ts, last_mtime)`` has not advanced.

    ``candidates`` is an iterable of ``(session_id, current_last_ts,
    current_last_mtime)``. Returns ``(pending_session_ids, skipped_count)``.

    A session is skipped iff a checkpoint row exists for ``pipeline`` AND
    both ``current_last_ts <= ckpt.last_ts`` AND ``current_last_mtime <=
    ckpt.last_mtime``. Either bound moving forward invalidates the skip.
    """
    checkpoints = load_as_map(checkpoint_db_path, pipeline)
    pending: list[str] = []
    skipped = 0
    for sid, cur_ts, cur_mtime in candidates:
        entry = checkpoints.get(sid)
        if entry is not None:
            prev_ts, prev_mtime = entry
            unchanged = _stale_or_equal(cur_ts, prev_ts) and _stale_or_equal(cur_mtime, prev_mtime)
            if unchanged:
                skipped += 1
                continue
        pending.append(sid)
    return pending, skipped
139
+
140
+
141
def _stale_or_equal(cur: datetime | None, prev: datetime | None) -> bool:
    """True iff both are present and ``cur`` has not advanced past ``prev``.

    Normalises both operands to naive-UTC before comparing so aware-vs-naive
    drift from different upstream sources (read_json → aware, checkpoint
    re-attach → aware, raw DuckDB TIMESTAMP fetch → naive) never raises
    ``TypeError``.

    A naive operand here is already a UTC wall time (raw DuckDB fetch), so
    it is compared as-is. Routing it through ``astimezone(UTC)`` (as
    ``_strip_tz`` does) would interpret it as *local* time and shift it by
    the host's UTC offset, corrupting the comparison on non-UTC hosts.
    """
    if cur is None or prev is None:
        return False

    def as_naive_utc(dt: datetime) -> datetime:
        # Only convert when tzinfo is actually present; naive == UTC here.
        if dt.tzinfo is None:
            return dt
        return dt.astimezone(UTC).replace(tzinfo=None)

    return as_naive_utc(cur) <= as_naive_utc(prev)
156
+
157
+
158
def mark_completed(
    db_path: Path,
    *,
    pipeline: str,
    rows: Iterable[tuple[str, datetime | None, datetime | None]],
) -> int:
    """Upsert checkpoint rows for ``(session_id, pipeline)``.

    Each row is ``(session_id, last_ts_processed, last_mtime_processed)``.
    The ``completed_at`` column is stamped with ``datetime.now(UTC)``.

    Returns the number of upserted rows. When ``rows`` is empty, the DB is
    left untouched.
    """
    batch = list(rows)
    if not batch:
        return 0
    # Single stamp for the whole batch; stored naive per the module rule.
    stamp = datetime.now(UTC).replace(tzinfo=None)
    payload = [
        (sid, pipeline, _strip_tz(ts), _strip_tz(mtime), stamp)
        for sid, ts, mtime in batch
    ]
    con = _connect(db_path)
    try:
        con.executemany(
            "INSERT OR REPLACE INTO session_checkpoint "
            "(session_id, pipeline, last_ts_processed, last_mtime_processed, completed_at) "
            "VALUES (?, ?, ?, ?, ?)",
            payload,
        )
    finally:
        con.close()
    return len(batch)
191
+
192
+
193
def count_rows(db_path: Path) -> int:
    """Return the total number of checkpoint rows, or 0 when the DB is missing."""
    if not db_path.exists():
        return 0
    con = _connect(db_path)
    try:
        result = con.execute("SELECT count(*) FROM session_checkpoint").fetchone()
    finally:
        con.close()
    return 0 if result is None else int(result[0])