claude-sql 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- claude_sql/__init__.py +5 -0
- claude_sql/binding.py +740 -0
- claude_sql/blind_handover.py +155 -0
- claude_sql/checkpointer.py +202 -0
- claude_sql/cli.py +2344 -0
- claude_sql/cluster_worker.py +208 -0
- claude_sql/community_worker.py +306 -0
- claude_sql/config.py +380 -0
- claude_sql/embed_worker.py +482 -0
- claude_sql/freeze.py +189 -0
- claude_sql/friction_worker.py +561 -0
- claude_sql/install_source.py +77 -0
- claude_sql/judge_worker.py +459 -0
- claude_sql/judges.py +239 -0
- claude_sql/kappa_worker.py +257 -0
- claude_sql/llm_worker.py +1760 -0
- claude_sql/logging_setup.py +95 -0
- claude_sql/output.py +248 -0
- claude_sql/parquet_shards.py +172 -0
- claude_sql/retry_queue.py +180 -0
- claude_sql/review_sheet_render.py +167 -0
- claude_sql/review_sheet_worker.py +463 -0
- claude_sql/schemas.py +454 -0
- claude_sql/session_text.py +387 -0
- claude_sql/skills_catalog.py +354 -0
- claude_sql/sql_views.py +1751 -0
- claude_sql/terms_worker.py +145 -0
- claude_sql/ungrounded_worker.py +190 -0
- claude_sql-0.4.0.dist-info/METADATA +530 -0
- claude_sql-0.4.0.dist-info/RECORD +32 -0
- claude_sql-0.4.0.dist-info/WHEEL +4 -0
- claude_sql-0.4.0.dist-info/entry_points.txt +3 -0
claude_sql/llm_worker.py
ADDED
|
@@ -0,0 +1,1760 @@
|
|
|
1
|
+
"""Bedrock Sonnet 4.6 classification worker.
|
|
2
|
+
|
|
3
|
+
Uses ``invoke_model`` with ``output_config.format`` (GA structured output) --
|
|
4
|
+
NO ``tool_use`` / ``tool_choice`` machinery. Pydantic v2 models in
|
|
5
|
+
``schemas.py`` supply the flattened JSON Schema dicts.
|
|
6
|
+
|
|
7
|
+
Three public pipelines
|
|
8
|
+
----------------------
|
|
9
|
+
classify_sessions(con, settings, *, since_days, limit, dry_run, no_thinking) -> int
|
|
10
|
+
trajectory_messages(con, settings, *, since_days, limit, dry_run, no_thinking) -> int
|
|
11
|
+
detect_conflicts(con, settings, *, since_days, limit, dry_run, no_thinking) -> int
|
|
12
|
+
|
|
13
|
+
Each pipeline discovers unfinished rows (anti-join against its parquet),
|
|
14
|
+
dispatches parallel Bedrock calls under a semaphore, and writes results in
|
|
15
|
+
chunks of ``max(batch_size * 4, 256)`` for crash-resilience.
|
|
16
|
+
|
|
17
|
+
Tenacity + botocore retry shape mirrors ``embed_worker._is_retryable`` exactly
|
|
18
|
+
so throttling behaves the same.
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
from __future__ import annotations
|
|
22
|
+
|
|
23
|
+
import asyncio
|
|
24
|
+
import json
|
|
25
|
+
import os
|
|
26
|
+
import re
|
|
27
|
+
import threading
|
|
28
|
+
import time
|
|
29
|
+
from datetime import UTC, datetime
|
|
30
|
+
from pathlib import Path
|
|
31
|
+
from typing import TYPE_CHECKING, Any
|
|
32
|
+
|
|
33
|
+
import anyio
|
|
34
|
+
import anyio.to_thread
|
|
35
|
+
import boto3
|
|
36
|
+
import polars as pl
|
|
37
|
+
from botocore.config import Config as BotoConfig
|
|
38
|
+
from botocore.exceptions import (
|
|
39
|
+
ClientError,
|
|
40
|
+
ConnectionError as BotoConnectionError,
|
|
41
|
+
EndpointConnectionError,
|
|
42
|
+
ReadTimeoutError,
|
|
43
|
+
SSLError,
|
|
44
|
+
)
|
|
45
|
+
from loguru import logger
|
|
46
|
+
from tenacity import (
|
|
47
|
+
retry,
|
|
48
|
+
retry_if_exception,
|
|
49
|
+
stop_after_attempt,
|
|
50
|
+
wait_exponential,
|
|
51
|
+
)
|
|
52
|
+
|
|
53
|
+
from claude_sql import checkpointer, retry_queue
|
|
54
|
+
from claude_sql.logging_setup import loguru_before_sleep
|
|
55
|
+
from claude_sql.parquet_shards import read_all, write_part
|
|
56
|
+
from claude_sql.schemas import (
|
|
57
|
+
MESSAGE_TRAJECTORY_SCHEMA,
|
|
58
|
+
SESSION_CLASSIFICATION_SCHEMA,
|
|
59
|
+
SESSION_CONFLICTS_SCHEMA,
|
|
60
|
+
)
|
|
61
|
+
from claude_sql.session_text import iter_session_texts, session_bounds
|
|
62
|
+
|
|
63
|
+
if TYPE_CHECKING:
|
|
64
|
+
import duckdb
|
|
65
|
+
|
|
66
|
+
from claude_sql.config import Settings
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
#: Bedrock ``ClientError`` codes that ``_is_retryable`` treats as transient.
#: Grouped by failure family; the union is what gets consulted at runtime.
_RETRY_CODES: set[str] = (
    # Standard Bedrock throttle + transient-service errors.
    {
        "ThrottlingException",
        "ServiceUnavailableException",
        "ModelTimeoutException",
        "ModelErrorException",
    }
    # Bedrock-specific on-demand capacity errors (per AWS re:Post
    # "Troubleshoot Bedrock on-demand 429 Throttling", 2026-05-08).
    | {
        "ProvisionedThroughputExceededException",
        "TooManyRequestsException",
    }
    # 5xx spikes on CRIS routing during global region failover — these
    # are idempotent for structured-output invocations so retry is safe.
    | {
        "InternalServerException",
        "InternalFailure",
    }
)
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
#: When set, every classifier call appends a JSONL trace row to this path
|
|
87
|
+
#: capturing model id, input/output token counts, prompt-cache hits, and
|
|
88
|
+
#: wall-clock ms. Used to verify that ``cache_control`` on the system block
|
|
89
|
+
#: actually triggers Anthropic prompt caching and to compare the real
|
|
90
|
+
#: token mix against the static dry-run estimates. No-op in normal use.
|
|
91
|
+
_BEDROCK_TRACE_PATH = os.environ.get("CLAUDE_SQL_BEDROCK_TRACE")
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def _maybe_log_bedrock_call(pipeline: str, model_id: str, payload: dict, elapsed_ms: float) -> None:
|
|
95
|
+
"""Append a single trace row when ``CLAUDE_SQL_BEDROCK_TRACE`` is set.
|
|
96
|
+
|
|
97
|
+
Anthropic returns prompt-cache stats under ``payload["usage"]``; we
|
|
98
|
+
capture the full shape so downstream cost accounting can split
|
|
99
|
+
5-minute-TTL writes (1.25× input rate) from 1-hour-TTL writes
|
|
100
|
+
(2× input rate) and cache reads (0.1× input rate). See Anthropic's
|
|
101
|
+
prompt-caching docs for the schema, and AWS's prompt-caching page
|
|
102
|
+
for the per-model cache minimums. Failures are swallowed — tracing
|
|
103
|
+
must never break a real run.
|
|
104
|
+
"""
|
|
105
|
+
if not _BEDROCK_TRACE_PATH:
|
|
106
|
+
return
|
|
107
|
+
try:
|
|
108
|
+
usage = payload.get("usage") or {}
|
|
109
|
+
cache_creation = usage.get("cache_creation") or {}
|
|
110
|
+
row = {
|
|
111
|
+
"ts": datetime.now(UTC).isoformat(),
|
|
112
|
+
"pipeline": pipeline,
|
|
113
|
+
"model": model_id,
|
|
114
|
+
"input_tokens": usage.get("input_tokens"),
|
|
115
|
+
"output_tokens": usage.get("output_tokens"),
|
|
116
|
+
"cache_creation_input_tokens": usage.get("cache_creation_input_tokens"),
|
|
117
|
+
"cache_read_input_tokens": usage.get("cache_read_input_tokens"),
|
|
118
|
+
# New-shape fields (present when the model returns the
|
|
119
|
+
# ``cache_creation`` sub-object; older responses omit them).
|
|
120
|
+
"ephemeral_5m_input_tokens": cache_creation.get("ephemeral_5m_input_tokens"),
|
|
121
|
+
"ephemeral_1h_input_tokens": cache_creation.get("ephemeral_1h_input_tokens"),
|
|
122
|
+
"stop_reason": payload.get("stop_reason"),
|
|
123
|
+
"elapsed_ms": round(elapsed_ms, 1),
|
|
124
|
+
}
|
|
125
|
+
path = Path(_BEDROCK_TRACE_PATH)
|
|
126
|
+
path.parent.mkdir(parents=True, exist_ok=True)
|
|
127
|
+
with path.open("a") as fh:
|
|
128
|
+
fh.write(json.dumps(row) + "\n")
|
|
129
|
+
except OSError:
|
|
130
|
+
# Tracing must never break a real run.
|
|
131
|
+
pass
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def _is_retryable(exc: BaseException) -> bool:
    """Decide whether a failed Bedrock call should be attempted again.

    Transport-level failures (SSL, connection, endpoint-connection and
    read-timeout errors from botocore) are always retryable. A
    ``ClientError`` is retryable only when its ``Error.Code`` is listed in
    ``_RETRY_CODES``. Every other exception type is treated as terminal.
    """
    transport_failures = (
        SSLError,
        BotoConnectionError,
        EndpointConnectionError,
        ReadTimeoutError,
    )
    if isinstance(exc, transport_failures):
        return True
    if isinstance(exc, ClientError):
        # A response without an ``Error`` section yields ``None``, which is
        # never a member of the retry set.
        error_code = exc.response.get("Error", {}).get("Code")
        return error_code in _RETRY_CODES
    return False
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
# Guards every read and write of ``_CLIENT_CACHE``.
_CLIENT_LOCK = threading.Lock()
# Maps ``(region, pool_size)`` to the client built for that combination.
_CLIENT_CACHE: dict[tuple[str, int], Any] = {}


def _build_bedrock_client(settings: Settings) -> Any:
    """Return the shared ``bedrock-runtime`` client for ``settings``.

    Clients are memoised per ``(region, pool_size)``: the first call for a
    given pair constructs the client under ``_CLIENT_LOCK`` and later calls
    get the same instance back. boto3 documents clients as safe to share
    across threads, so one instance per key serves every worker.

    The connection pool is sized to twice the larger of
    ``settings.embed_concurrency`` and ``settings.llm_concurrency``, with a
    floor of 32, so a different concurrency setting maps to a different
    cache key and therefore a fresh client.

    Client configuration:

    * ``max_pool_connections`` — the pool size computed above (botocore's
      default of 10 would starve higher concurrency).
    * ``connect_timeout=10`` — fail fast on network hiccups.
    * ``read_timeout=600`` — long generations can hold the connection well
      past botocore's 60-second default.
    * ``retries={"max_attempts": 0, "mode": "adaptive"}`` — zero SDK-level
      retries, so errors surface immediately to this module's tenacity
      decorator, which owns the retry policy.
    """
    busiest = max(settings.embed_concurrency, settings.llm_concurrency)
    pool_size = max(32, busiest * 2)
    cache_key = (settings.region, pool_size)
    with _CLIENT_LOCK:
        cached = _CLIENT_CACHE.get(cache_key)
        if cached is not None:
            return cached
        fresh = boto3.client(
            "bedrock-runtime",
            config=BotoConfig(
                region_name=settings.region,
                retries={"max_attempts": 0, "mode": "adaptive"},
                max_pool_connections=pool_size,
                connect_timeout=10,
                read_timeout=600,
            ),
        )
        _CLIENT_CACHE[cache_key] = fresh
        return fresh
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
@retry(
    stop=stop_after_attempt(10),
    wait=wait_exponential(multiplier=2, min=2, max=60),
    retry=retry_if_exception(_is_retryable),
    before_sleep=loguru_before_sleep("WARNING"),
    reraise=True,
)
def _invoke_classifier_sync(
    client: Any,
    model_id: str,
    schema: dict,
    user_text: str,
    *,
    max_tokens: int,
    thinking_mode: str,
    system: str | None = None,
) -> dict:
    """Issue one blocking ``invoke_model`` request and return the structured result.

    The request asks for structured output via ``output_config.format`` with
    the supplied JSON Schema. Exceptions accepted by ``_is_retryable`` are
    retried with exponential backoff (up to 10 attempts in total); the last
    exception is re-raised unchanged.

    Parameters
    ----------
    client
        A boto3 ``bedrock-runtime`` client.
    model_id
        Identifier passed as ``modelId``.
    schema
        JSON Schema dict sent as the structured-output format. Its
        ``"title"`` (when present) labels the optional trace row.
    user_text
        Body of the single user-role message.
    max_tokens
        Value sent as ``max_tokens``.
    thinking_mode
        ``"adaptive"`` adds ``thinking={"type": "adaptive"}`` to the
        request; any other value sends no ``thinking`` key.
    system
        Optional system prompt. When truthy it is sent as a one-element
        content-block list carrying ``cache_control: ephemeral``.

    Returns
    -------
    dict
        Whatever ``_parse_structured_payload`` extracts from the response.
    """
    request: dict[str, Any] = {
        "anthropic_version": "bedrock-2023-05-31",
        "max_tokens": max_tokens,
        "output_config": {"format": {"type": "json_schema", "schema": schema}},
        "messages": [{"role": "user", "content": user_text}],
    }
    if system:
        # Content-block form (rather than a bare string) so the
        # ``cache_control`` marker can be attached to the system prompt.
        system_block = {
            "type": "text",
            "text": system,
            "cache_control": {"type": "ephemeral"},
        }
        request["system"] = [system_block]
    if thinking_mode == "adaptive":
        request["thinking"] = {"type": "adaptive"}

    # Time only the network call; decoding and tracing are excluded.
    started = time.monotonic()
    response = client.invoke_model(
        modelId=model_id,
        body=json.dumps(request),
        contentType="application/json",
        accept="application/json",
    )
    elapsed_ms = (time.monotonic() - started) * 1000.0

    payload = json.loads(response["body"].read())

    trace_label = "classifier"
    if isinstance(schema, dict):
        trace_label = schema.get("title", "classifier")
    _maybe_log_bedrock_call(
        pipeline=trace_label,
        model_id=model_id,
        payload=payload,
        elapsed_ms=elapsed_ms,
    )
    return _parse_structured_payload(payload)
|
|
281
|
+
|
|
282
|
+
|
|
283
|
+
class BedrockRefusalError(Exception):
    """Bedrock declined to classify the input under its content policy.

    Raised by ``_parse_structured_payload`` when the response carries
    ``stop_reason == "refusal"``; the parser checks only the stop reason
    and does not inspect the content blocks. ``_is_retryable`` returns
    ``False`` for this type, so the retry decorator does not re-attempt
    the call. Callers are expected to treat it as a terminal outcome and
    can write a neutral placeholder row so the message is not re-tried in
    every future run.
    """
|
|
291
|
+
|
|
292
|
+
|
|
293
|
+
# Opening Markdown fence plus optional info string ("json", "JSON", ...).
_FENCE_OPEN_RE = re.compile(r"\A`{3,}[A-Za-z0-9_+-]*\s*")
# Closing Markdown fence at the very end of the text.
_FENCE_CLOSE_RE = re.compile(r"\s*`{3,}\Z")


def _strip_code_fence(text: str) -> str:
    """Return ``text`` without its surrounding Markdown code fence."""
    return _FENCE_CLOSE_RE.sub("", _FENCE_OPEN_RE.sub("", text.strip()))


def _loads_object(text: Any) -> dict | None:
    """Decode ``text`` as JSON; return the value only if it is a JSON object."""
    if not isinstance(text, str):
        return None
    try:
        decoded = json.loads(text)
    except json.JSONDecodeError:
        return None
    return decoded if isinstance(decoded, dict) else None


def _parse_structured_payload(payload: dict) -> dict:
    """Pull the structured JSON object out of a Bedrock response.

    Shapes handled, in order:

    1. ``payload["output"]`` is a dict — returned as-is.
    2. A content block with ``type == "output"`` — the object is taken from
       the block's ``"output"`` / ``"json"`` / ``"content"`` key, either as
       a dict or as a JSON-encoded string.
    3. A content block with ``type == "text"`` whose text is the JSON
       object, optionally wrapped in a Markdown code fence. Blocks that do
       not decode to an object are skipped and the next one is tried.
    4. A payload whose only key is ``"output"`` holding a JSON-encoded
       string.

    Only JSON *objects* are accepted: text that decodes to a list, number
    or string is treated as "no match" so the ``-> dict`` contract holds.

    Raises
    ------
    BedrockRefusalError
        When ``payload["stop_reason"] == "refusal"``.
    RuntimeError
        When no shape matches; the message lists the payload's top-level
        keys.
    """
    if payload.get("stop_reason") == "refusal":
        raise BedrockRefusalError("Bedrock refused the input (stop_reason=refusal)")

    output = payload.get("output")
    if isinstance(output, dict):
        return output

    content = payload.get("content")
    if isinstance(content, list):
        items = [item for item in content if isinstance(item, dict)]

        # Shape 2: structured-output block.
        for item in items:
            if item.get("type") != "output":
                continue
            for key in ("output", "json", "content"):
                candidate = item.get(key)
                if isinstance(candidate, dict):
                    return candidate
                decoded = _loads_object(candidate)
                if decoded is not None:
                    return decoded

        # Shape 3: text block whose body is the structured JSON.
        for item in items:
            if item.get("type") != "text":
                continue
            text = item.get("text", "")
            decoded = _loads_object(text)
            if decoded is None and isinstance(text, str) and text.strip().startswith("```"):
                # ``lstrip("json")`` strips a character *set* and is
                # case-sensitive; remove the fence and its info string
                # explicitly instead.
                decoded = _loads_object(_strip_code_fence(text))
            if decoded is not None:
                return decoded

    # Shape 4: envelope consisting solely of a JSON-encoded "output" string.
    if payload.keys() == {"output"}:
        decoded = _loads_object(output)
        if decoded is not None:
            return decoded

    raise RuntimeError(f"Unexpected response shape: {sorted(payload.keys())}")
|
|
350
|
+
|
|
351
|
+
|
|
352
|
+
async def _classify_one(
    client: Any,
    model_id: str,
    schema: dict,
    text: str,
    *,
    max_tokens: int,
    thinking_mode: str,
    sem: asyncio.Semaphore | anyio.CapacityLimiter,
    system: str | None = None,
) -> dict:
    """Classify one unit of text while holding a slot on ``sem``.

    ``sem`` may be an ``asyncio.Semaphore`` or an ``anyio.CapacityLimiter``;
    it is only ever used through ``async with``. Because the underlying
    ``invoke_model`` call blocks, the work is delegated to a worker thread
    with ``anyio.to_thread.run_sync`` and the slot is held until that call
    returns or raises.

    Returns the dict produced by ``_invoke_classifier_sync``; its
    exceptions propagate unchanged.
    """

    def _blocking_call() -> dict:
        # Closure binds the keyword-only arguments for the worker thread.
        return _invoke_classifier_sync(
            client,
            model_id,
            schema,
            text,
            max_tokens=max_tokens,
            thinking_mode=thinking_mode,
            system=system,
        )

    async with sem:
        return await anyio.to_thread.run_sync(_blocking_call)
|
|
384
|
+
|
|
385
|
+
|
|
386
|
+
# ---------------------------------------------------------------------------
|
|
387
|
+
# Per-pipeline system prompts
|
|
388
|
+
# ---------------------------------------------------------------------------
|
|
389
|
+
#
|
|
390
|
+
# The schema descriptions in :mod:`claude_sql.schemas` carry label semantics,
|
|
391
|
+
# but a system prompt is the right surface for *task framing*: what is being
|
|
392
|
+
# classified, what counts as evidence, when to abstain, and what NOT to do.
|
|
393
|
+
# The prior implementation passed only ``messages: [{"role": "user", ...}]``
|
|
394
|
+
# and let the schema do everything — workable on Sonnet, but quality
|
|
395
|
+
# degrades on smaller models and the model has no anchor for ambiguous
|
|
396
|
+
# cases. These constants give every classifier the same anchor.
|
|
397
|
+
|
|
398
|
+
|
|
399
|
+
CLASSIFY_SYSTEM_PROMPT = """\
|
|
400
|
+
<instructions>
|
|
401
|
+
You are an offline post-hoc analyst classifying complete Claude Code coding
|
|
402
|
+
sessions. The user message contains the full session transcript (user turns,
|
|
403
|
+
assistant turns, tool calls, and tool results) already concatenated.
|
|
404
|
+
|
|
405
|
+
Emit exactly one JSON object matching the schema. Four label fields plus a
|
|
406
|
+
self-assessed confidence, no surrounding prose, no markdown fences.
|
|
407
|
+
</instructions>
|
|
408
|
+
|
|
409
|
+
<context>
|
|
410
|
+
How to read the transcript:
|
|
411
|
+
|
|
412
|
+
- The opening user message states or implies the goal.
|
|
413
|
+
- Closing exchanges show whether the goal was met.
|
|
414
|
+
- Tool calls plus tool results are the strongest evidence of what actually
|
|
415
|
+
happened — read past chitchat to the actions.
|
|
416
|
+
|
|
417
|
+
Pacing patterns:
|
|
418
|
+
|
|
419
|
+
- Confirmation pattern (user replies "ok", "thanks", "looks good", short
|
|
420
|
+
turns separated by long agent runs) → autonomous.
|
|
421
|
+
- Course correction (user re-instructs, names files the agent missed,
|
|
422
|
+
rewrites the plan mid-flight) → assisted.
|
|
423
|
+
- Step-by-step (user types every instruction, confirms each step, rejects
|
|
424
|
+
more than they accept) → manual.
|
|
425
|
+
|
|
426
|
+
Work category cues:
|
|
427
|
+
|
|
428
|
+
- sde: code, tests, refactors, CI failures, debugging, package management,
|
|
429
|
+
type errors, lint output, anything in src/ or tests/. Default for any
|
|
430
|
+
coding-tool session.
|
|
431
|
+
- admin: scheduling, calendar, expense reports, low-signal email triage,
|
|
432
|
+
routine ops with no code changes.
|
|
433
|
+
- strategy_business: business analysis, competitive landscape, strategic
|
|
434
|
+
memos, proposals, market sizing. Reading and writing strategy documents.
|
|
435
|
+
- events: speaker prep, agenda building, event logistics.
|
|
436
|
+
- thought_leadership: writing for external audiences (blog posts,
|
|
437
|
+
conference abstracts, LinkedIn). Polished prose, not internal docs.
|
|
438
|
+
- other: only when nothing else fits. Sessions that mix sde plus a second
|
|
439
|
+
category should pick the one with more turns / tool calls.
|
|
440
|
+
|
|
441
|
+
Success semantics:
|
|
442
|
+
|
|
443
|
+
- success: goal as stated was clearly met. Tests pass, feature works,
|
|
444
|
+
document is done, decision is made.
|
|
445
|
+
- partial: the work landed with explicit caveats or leftover TODOs the
|
|
446
|
+
user acknowledged.
|
|
447
|
+
- failure: session ended without reaching the goal — agent gave up,
|
|
448
|
+
blocked indefinitely, or wrong path landed.
|
|
449
|
+
- unknown: insufficient signal. Session ends mid-task, no clear close,
|
|
450
|
+
too short to judge.
|
|
451
|
+
</context>
|
|
452
|
+
|
|
453
|
+
<calibration>
|
|
454
|
+
- Use unknown plus confidence < 0.5 when the evidence is genuinely mixed.
|
|
455
|
+
Do not manufacture certainty to fill the schema.
|
|
456
|
+
- goal must be one sentence in present tense, paraphrasing the user — not
|
|
457
|
+
a literal quote, not two goals concatenated with "and".
|
|
458
|
+
- A session that explores three options and doesn't pick one is partial,
|
|
459
|
+
with unknown only if the user never confirmed the session was over.
|
|
460
|
+
- Confidence is per-row, not per-field. If you're sure of three fields
|
|
461
|
+
and uncertain about work_category, pick the most likely and reflect
|
|
462
|
+
the uncertainty in the overall confidence.
|
|
463
|
+
</calibration>
|
|
464
|
+
|
|
465
|
+
<examples>
|
|
466
|
+
<example>
|
|
467
|
+
<input>A 4-hour session where the user opens with "implement Phase 2 of the
|
|
468
|
+
auth migration", the agent runs ~80 tool calls, the user replies "ok",
|
|
469
|
+
"good", "ship it" between long agent runs, ends with green tests plus a
|
|
470
|
+
successful merge.</input>
|
|
471
|
+
<output>autonomy_tier=autonomous, work_category=sde, success=success,
|
|
472
|
+
confidence=0.9</output>
|
|
473
|
+
</example>
|
|
474
|
+
<example>
|
|
475
|
+
<input>A 30-minute session where the user pastes a stack trace, the agent
|
|
476
|
+
reads the offending file and proposes a fix, the user says "actually I
|
|
477
|
+
think the bug is in module Y, can you check there", the agent verifies,
|
|
478
|
+
fixes Y, tests pass, the user thanks the agent and ends.</input>
|
|
479
|
+
<output>autonomy_tier=assisted (user redirected), work_category=sde,
|
|
480
|
+
success=success, confidence=0.85</output>
|
|
481
|
+
</example>
|
|
482
|
+
<example>
|
|
483
|
+
<input>A 2-hour session of strategic memo work — user dictates section
|
|
484
|
+
outlines, agent drafts, user rewrites paragraphs heavily, three rounds
|
|
485
|
+
of revision, ends with a published draft.</input>
|
|
486
|
+
<output>autonomy_tier=assisted, work_category=strategy_business,
|
|
487
|
+
success=success, confidence=0.85</output>
|
|
488
|
+
</example>
|
|
489
|
+
<example>
|
|
490
|
+
<input>A session that opens with "schedule a 1:1 with X", the agent calls
|
|
491
|
+
calendar, finds slots, user picks one, agent books, user confirms.</input>
|
|
492
|
+
<output>autonomy_tier=manual, work_category=admin, success=success,
|
|
493
|
+
confidence=0.95</output>
|
|
494
|
+
</example>
|
|
495
|
+
<example>
|
|
496
|
+
<input>A 5-minute session where the user asks "how should I structure the
|
|
497
|
+
test fixture?", the agent explains, the user says "got it" and ends
|
|
498
|
+
without writing code.</input>
|
|
499
|
+
<output>autonomy_tier=manual, work_category=sde, success=success (goal was
|
|
500
|
+
advice, which was given), confidence=0.7</output>
|
|
501
|
+
</example>
|
|
502
|
+
<example>
|
|
503
|
+
<input>A session where the user pastes a 500-line markdown plan and says
|
|
504
|
+
"let's start", the agent runs through the first three sections, but the
|
|
505
|
+
session ends mid-flight with five sections still unaddressed.</input>
|
|
506
|
+
<output>autonomy_tier=assisted, work_category=sde, success=partial,
|
|
507
|
+
confidence=0.85</output>
|
|
508
|
+
</example>
|
|
509
|
+
</examples>
|
|
510
|
+
|
|
511
|
+
<anti_patterns>
|
|
512
|
+
- Don't grade on agent skill. success means the goal was met, even if
|
|
513
|
+
the path was meandering. failure doesn't mean the agent was bad; it
|
|
514
|
+
means the goal wasn't met.
|
|
515
|
+
- Don't infer goals from agent actions. The user's opening message is
|
|
516
|
+
the ground truth for goal. If the agent went on a tangent, the goal is
|
|
517
|
+
still what the user asked for.
|
|
518
|
+
- Don't confuse autonomous with "agent did a lot". Autonomous requires
|
|
519
|
+
the user to step back and let the agent run. A session where the
|
|
520
|
+
agent produces lots of code but the user reviews each diff is assisted.
|
|
521
|
+
- goal is the user's goal, not the session's outcome. If the user asked
|
|
522
|
+
to refactor X but the agent ended up debugging an unrelated test
|
|
523
|
+
failure, goal is still "refactor X". The detour shows up in success.
|
|
524
|
+
</anti_patterns>
|
|
525
|
+
"""
|
|
526
|
+
|
|
527
|
+
|
|
528
|
+
TRAJECTORY_SYSTEM_PROMPT = """\
|
|
529
|
+
<instructions>
|
|
530
|
+
You score the emotional polarity of ONE message inside a Claude Code coding
|
|
531
|
+
session. The user input is the message text in isolation — you will not
|
|
532
|
+
see the prior turn, by design.
|
|
533
|
+
|
|
534
|
+
Emit exactly one JSON object matching the schema:
|
|
535
|
+
|
|
536
|
+
- sentiment_delta: one of positive / neutral / negative. The field name
|
|
537
|
+
is historical; semantics are absolute polarity, not "vs prior".
|
|
538
|
+
- is_transition: true when the message is pure filler / acknowledgement,
|
|
539
|
+
false otherwise.
|
|
540
|
+
- confidence: self-assessed certainty 0.0-1.0.
|
|
541
|
+
|
|
542
|
+
Output JSON only. No surrounding prose, no markdown fences.
|
|
543
|
+
</instructions>
|
|
544
|
+
|
|
545
|
+
<calibration>
|
|
546
|
+
Prior on label distribution for a coding session is roughly:
|
|
547
|
+
neutral 70%, positive 25%, negative 5%.
|
|
548
|
+
If your output drifts away from this distribution on a sustained run, you
|
|
549
|
+
are manufacturing affect. neutral is the default; deviations need explicit
|
|
550
|
+
cues.
|
|
551
|
+
|
|
552
|
+
NEUTRAL — the majority class. Pick this for:
|
|
553
|
+
- Factual statements: "The function returns a list.", "Tests pass."
|
|
554
|
+
- Procedural turns: "Running the linter.", "Here is the diff.",
|
|
555
|
+
"Updated foo.py."
|
|
556
|
+
- Plain instructions: "Add a test for the empty case.", "Refactor module X."
|
|
557
|
+
- Plain questions: "Where does the config live?", "Why is this private?"
|
|
558
|
+
- Tool-use narration: "Calling the search API now.", "Reading file Y."
|
|
559
|
+
- Status reports without affect: "I'm done with the migration."
|
|
560
|
+
|
|
561
|
+
POSITIVE — visible excitement, approval, or momentum:
|
|
562
|
+
- Direct praise: "nice!", "love this", "perfect", "this is great"
|
|
563
|
+
- Energetic agreement: "yes exactly", "shipping it", "do it"
|
|
564
|
+
- Celebration: "finally working", "huge win"
|
|
565
|
+
- Explicit thanks that goes beyond "thanks": "thanks, this is exactly
|
|
566
|
+
what I needed"
|
|
567
|
+
NOT positive: polite "thanks", procedural "ok", a "sounds good" that's
|
|
568
|
+
just pacing the conversation.
|
|
569
|
+
|
|
570
|
+
NEGATIVE — friction, frustration, blocked:
|
|
571
|
+
- Frustration: "ugh", "seriously?", "are you kidding", "this is broken"
|
|
572
|
+
- Pushback: "no, that's wrong", "I don't think that works", "not what
|
|
573
|
+
I asked for"
|
|
574
|
+
- Blocked: "this is failing", "I'm stuck", "can't get past X"
|
|
575
|
+
- Sharp correction: "stop doing that", "you keep messing this up"
|
|
576
|
+
NOT negative: a calm correction ("actually let me clarify"), a flagged
|
|
577
|
+
bug report ("noticed an off-by-one"), a curt instruction.
|
|
578
|
+
|
|
579
|
+
is_transition:
|
|
580
|
+
|
|
581
|
+
Set true when the message has no substantive content — it's filler:
|
|
582
|
+
- "Ok let me check that.", "Running...", "Done.", "Clean.", "Right.",
|
|
583
|
+
"Cool.", "Got it, moving on.", "Yep.", "Sure."
|
|
584
|
+
|
|
585
|
+
Set false when the message carries information, instruction, question,
|
|
586
|
+
or affect, even if it's short. "tests pass" is neutral but NOT a
|
|
587
|
+
transition. "ugh" is negative and not a transition either.
|
|
588
|
+
</calibration>
|
|
589
|
+
|
|
590
|
+
<examples>
|
|
591
|
+
<example>
|
|
592
|
+
<input>Tests pass.</input>
|
|
593
|
+
<output>sentiment_delta=neutral, is_transition=true, confidence=0.9</output>
|
|
594
|
+
</example>
|
|
595
|
+
<example>
|
|
596
|
+
<input>All 240 passed, 4 warnings in 53s.</input>
|
|
597
|
+
<output>sentiment_delta=neutral, is_transition=false, confidence=0.9.
|
|
598
|
+
Information-dense report.</output>
|
|
599
|
+
</example>
|
|
600
|
+
<example>
|
|
601
|
+
<input>shipping it</input>
|
|
602
|
+
<output>sentiment_delta=positive, is_transition=false, confidence=0.95.
|
|
603
|
+
Explicit decision verb plus momentum.</output>
|
|
604
|
+
</example>
|
|
605
|
+
<example>
|
|
606
|
+
<input>ok let me check that</input>
|
|
607
|
+
<output>sentiment_delta=neutral, is_transition=true, confidence=0.9.
|
|
608
|
+
Acknowledgement filler. The "ok" doesn't carry affect; it's pacing.</output>
|
|
609
|
+
</example>
|
|
610
|
+
<example>
|
|
611
|
+
<input>this entire approach is wrong because the cache key is per-tenant</input>
|
|
612
|
+
<output>sentiment_delta=negative, is_transition=false, confidence=0.9.
|
|
613
|
+
Substantive disagreement with reasoning. Negative even though articulate.</output>
|
|
614
|
+
</example>
|
|
615
|
+
<example>
|
|
616
|
+
<input>running pytest now</input>
|
|
617
|
+
<output>sentiment_delta=neutral, is_transition=true, confidence=0.85.
|
|
618
|
+
Procedural narration.</output>
|
|
619
|
+
</example>
|
|
620
|
+
<example>
|
|
621
|
+
<input>that's not what I meant — I want X to be derived, not stored</input>
|
|
622
|
+
<output>sentiment_delta=negative, is_transition=false, confidence=0.85.
|
|
623
|
+
Correction with a substantive counter-proposal.</output>
|
|
624
|
+
</example>
|
|
625
|
+
<example>
|
|
626
|
+
<input>perfect, this is exactly what I wanted</input>
|
|
627
|
+
<output>sentiment_delta=positive, is_transition=false, confidence=0.95.
|
|
628
|
+
Direct praise plus specificity.</output>
|
|
629
|
+
</example>
|
|
630
|
+
<example>
|
|
631
|
+
<input>hmm, that doesn't seem right</input>
|
|
632
|
+
<output>sentiment_delta=negative, is_transition=false, confidence=0.65.
|
|
633
|
+
Mild pushback. Confidence below 0.7 because "hmm" is genuinely ambiguous
|
|
634
|
+
— could be deliberation.</output>
|
|
635
|
+
</example>
|
|
636
|
+
<example>
|
|
637
|
+
<input>thanks</input>
|
|
638
|
+
<output>sentiment_delta=neutral, is_transition=true, confidence=0.7.
|
|
639
|
+
Bare politeness. Not positive (no specificity), a social close.</output>
|
|
640
|
+
</example>
|
|
641
|
+
<example>
|
|
642
|
+
<input>Done.</input>
|
|
643
|
+
<output>sentiment_delta=neutral, is_transition=true, confidence=0.9.
|
|
644
|
+
Single-word close-out. Filler.</output>
|
|
645
|
+
</example>
|
|
646
|
+
<example>
|
|
647
|
+
<input>no don't do that</input>
|
|
648
|
+
<output>sentiment_delta=negative, is_transition=false, confidence=0.9.
|
|
649
|
+
Hard correction. The "no" plus "don't" is the cue.</output>
|
|
650
|
+
</example>
|
|
651
|
+
<example>
|
|
652
|
+
<input>I think we should go with the simple version for now</input>
|
|
653
|
+
<output>sentiment_delta=neutral, is_transition=false, confidence=0.85.
|
|
654
|
+
Substantive opinion without affect. "For now" signals pragmatism, not
|
|
655
|
+
positivity.</output>
|
|
656
|
+
</example>
|
|
657
|
+
</examples>
|
|
658
|
+
|
|
659
|
+
<anti_patterns>
|
|
660
|
+
- Don't manufacture certainty. Confidence < 0.7 is appropriate when the
|
|
661
|
+
message is short, single-word, or context-dependent. The downstream
|
|
662
|
+
pipeline weights by confidence — don't hand-wave.
|
|
663
|
+
- Don't conflate length with neutrality. A long technical message can
|
|
664
|
+
still be negative ("This entire approach is wrong because..."). A
|
|
665
|
+
short message can still be positive ("ship it!").
|
|
666
|
+
- Don't read intent into procedural text. A bare "Done." is a
|
|
667
|
+
transition, not triumphant positive. A bare "Running tests" is
|
|
668
|
+
neutral, not anxious negative.
|
|
669
|
+
- Avoid the "slightly positive" / "mildly negative" trap. The schema
|
|
670
|
+
has three labels for a reason. If tempted to pick a side, the answer
|
|
671
|
+
is neutral.
|
|
672
|
+
- Tool-use narration ("calling X", "reading Y", "checking Z") is
|
|
673
|
+
overwhelmingly neutral. Don't score the agent's procedural play-by-
|
|
674
|
+
play as positive momentum unless the wording itself is enthusiastic.
|
|
675
|
+
- Length is not affect. A 50-word careful explanation can be neutral.
|
|
676
|
+
A 3-word reply ("ship it!") can be positive. Polarity is in the words,
|
|
677
|
+
not the size of the message.
|
|
678
|
+
</anti_patterns>
|
|
679
|
+
"""
|
|
680
|
+
|
|
681
|
+
|
|
682
|
+
CONFLICTS_SYSTEM_PROMPT = """\
|
|
683
|
+
<instructions>
|
|
684
|
+
You analyze a complete Claude Code coding session for STANCE CONFLICTS —
|
|
685
|
+
moments where the user and the agent (or the agent's own reasoning) hold
|
|
686
|
+
mutually-exclusive positions on the same substantive question.
|
|
687
|
+
|
|
688
|
+
Emit exactly one JSON object with a conflicts array. Each conflict has
|
|
689
|
+
three fields: stance_a, stance_b (one-sentence summaries) and resolution
|
|
690
|
+
(resolved / unresolved / abandoned). An empty list is valid and common.
|
|
691
|
+
|
|
692
|
+
Output JSON only. No surrounding prose, no markdown fences.
|
|
693
|
+
</instructions>
|
|
694
|
+
|
|
695
|
+
<context>
|
|
696
|
+
What counts as a conflict:
|
|
697
|
+
|
|
698
|
+
- Two stances on the same technical decision held by different parties,
|
|
699
|
+
or by the same party at different points. "Use Sonnet" vs "use Opus".
|
|
700
|
+
"Ship the simple version now" vs "wait for the architectural cleanup".
|
|
701
|
+
"Cache the embeddings" vs "rebuild from parquet every run".
|
|
702
|
+
- Two stances on a strategic / scope decision: "rename the field" vs
|
|
703
|
+
"keep the field name and shift semantics". "One bundled PR" vs
|
|
704
|
+
"split into three". "Fix it on this branch" vs "open a follow-up".
|
|
705
|
+
- The conflict must be SUBSTANTIVE — measurable consequences, not style.
|
|
706
|
+
|
|
707
|
+
Resolution semantics:
|
|
708
|
+
|
|
709
|
+
- resolved: the session converged on one stance with explicit agreement.
|
|
710
|
+
Look for "ok let's do that", "you're right", "going with X".
|
|
711
|
+
- unresolved: both stances were still live at session end. User punted,
|
|
712
|
+
agent didn't pick, or session ran out of time.
|
|
713
|
+
- abandoned: topic was dropped without a decision. Different from
|
|
714
|
+
unresolved — abandoned means the conversation moved on, not that they
|
|
715
|
+
failed to decide.
|
|
716
|
+
|
|
717
|
+
Identification heuristics:
|
|
718
|
+
|
|
719
|
+
1. Strongest signal is structural: stance A proposed, counter-stance B
|
|
720
|
+
raised explicitly, then a decision made (or not). Without an explicit
|
|
721
|
+
counter-stance, you don't have a conflict.
|
|
722
|
+
2. Verbal markers: "but I think", "actually I'd argue", "I disagree",
|
|
723
|
+
"the other side of that is", "alternatively", "or we could".
|
|
724
|
+
3. Skip agent's internal monologue ("on one hand X, on the other Y") when
|
|
725
|
+
the agent immediately picks one — that's deliberation. Only count when
|
|
726
|
+
the user (or another party) holds the other stance.
|
|
727
|
+
</context>
|
|
728
|
+
|
|
729
|
+
<calibration>
|
|
730
|
+
When in doubt, return an empty conflicts array. False positives pollute
|
|
731
|
+
the corpus more than missed conflicts hurt — downstream views
|
|
732
|
+
(session_conflicts) are used by humans to find interesting decision
|
|
733
|
+
points, and noise drowns signal.
|
|
734
|
+
|
|
735
|
+
Typical coding session has 0 conflicts. Typical strategy / planning
|
|
736
|
+
session has 0-2. Sessions with 3+ conflicts exist but are rare;
|
|
737
|
+
double-check your output if you're emitting that many.
|
|
738
|
+
</calibration>
|
|
739
|
+
|
|
740
|
+
<examples>
|
|
741
|
+
<example>
|
|
742
|
+
<input>User wants to optimize a slow query. Agent proposes denormalizing
|
|
743
|
+
the table. User counters: "no, let's add a covering index instead — I
|
|
744
|
+
don't want to touch the schema". Agent accepts the index approach and
|
|
745
|
+
ships it.</input>
|
|
746
|
+
<output>conflicts=[{stance_a: "Denormalize the table to make the query
|
|
747
|
+
faster.", stance_b: "Keep the schema; add a covering index instead.",
|
|
748
|
+
resolution: "resolved"}]</output>
|
|
749
|
+
</example>
|
|
750
|
+
<example>
|
|
751
|
+
<input>User proposes a 3-step plan. Agent says "I think step 2 is risky
|
|
752
|
+
because of X — should we add a rollback first?" User agrees, plan
|
|
753
|
+
becomes 4 steps. Both proceed.</input>
|
|
754
|
+
<output>conflicts=[]. Agent flagged a risk, user incorporated it. No
|
|
755
|
+
counter-stance held.</output>
|
|
756
|
+
</example>
|
|
757
|
+
<example>
|
|
758
|
+
<input>Agent's reasoning shows "I could use approach A or approach B,
|
|
759
|
+
but A is simpler so I'll go with A". User says "ok".</input>
|
|
760
|
+
<output>conflicts=[]. Agent considered alternatives in its own
|
|
761
|
+
thinking. User didn't hold a counter-stance.</output>
|
|
762
|
+
</example>
|
|
763
|
+
<example>
|
|
764
|
+
<input>User says "wait, isn't that going to break X?" Agent explains
|
|
765
|
+
why not. User: "oh you're right, never mind."</input>
|
|
766
|
+
<output>conflicts=[]. Question surfaced, answered, retracted. No
|
|
767
|
+
sustained position.</output>
|
|
768
|
+
</example>
|
|
769
|
+
<example>
|
|
770
|
+
<input>User leans toward "ship simple version now", agent leans toward
|
|
771
|
+
"wait for architectural cleanup". Session ends without a decision; user
|
|
772
|
+
says "let me think about it".</input>
|
|
773
|
+
<output>conflicts=[{stance_a: "Ship the simple version now to unblock
|
|
774
|
+
users.", stance_b: "Wait for the architectural cleanup so we don't ship
|
|
775
|
+
debt.", resolution: "unresolved"}]</output>
|
|
776
|
+
</example>
|
|
777
|
+
<example>
|
|
778
|
+
<input>Brief disagreement about which CI config to use. User pivots to
|
|
779
|
+
a different topic. Never returns to the CI question.</input>
|
|
780
|
+
<output>conflicts=[{stance_a: "Use GitHub Actions for the new pipeline.",
|
|
781
|
+
stance_b: "Stick with the existing CodeBuild setup.",
|
|
782
|
+
resolution: "abandoned"}]</output>
|
|
783
|
+
</example>
|
|
784
|
+
</examples>
|
|
785
|
+
|
|
786
|
+
<anti_patterns>
|
|
787
|
+
- Don't count collaboration as conflict. Agent proposes a plan, user
|
|
788
|
+
agrees with caveats and the agent adapts. That's collaboration.
|
|
789
|
+
- Don't count agent deliberation. Agent considers two approaches in its
|
|
790
|
+
own reasoning, then picks one with the user's blessing. That's
|
|
791
|
+
deliberation, not conflict.
|
|
792
|
+
- Don't count surface-level pushback that the user immediately retracts.
|
|
793
|
+
- Don't count style / formatting disagreements ("I'd phrase that
|
|
794
|
+
differently", "use semicolons not commas").
|
|
795
|
+
- Don't count accepted risk. Agent flags risk, user accepts it. That's
|
|
796
|
+
a noted caveat, not a conflict.
|
|
797
|
+
- Don't count iteration. Two failed attempts at the same task (agent
|
|
798
|
+
tried X, then Y). That's iteration, not conflict.
|
|
799
|
+
- Don't count tooling preferences without consequence ("I'd use jq here"
|
|
800
|
+
vs "I'd use python -c").
|
|
801
|
+
</anti_patterns>
|
|
802
|
+
"""
|
|
803
|
+
|
|
804
|
+
|
|
805
|
+
USER_FRICTION_SYSTEM_PROMPT = """\
|
|
806
|
+
<instructions>
|
|
807
|
+
You classify ONE short user message from a Claude Code coding session for
|
|
808
|
+
friction signals — cues that the human is impatient, confused,
|
|
809
|
+
interrupting the agent, correcting it, or asking for something the agent
|
|
810
|
+
should have provided proactively but didn't.
|
|
811
|
+
|
|
812
|
+
The message is presented in isolation. You will not see prior turns or
|
|
813
|
+
the agent response that preceded it. Make the call from the message
|
|
814
|
+
text alone.
|
|
815
|
+
|
|
816
|
+
Emit exactly one JSON object with three fields: label (one of the seven
|
|
817
|
+
values below), rationale (one short sentence naming the cue), and
|
|
818
|
+
confidence (0.0-1.0). Output JSON only. No surrounding prose, no
|
|
819
|
+
markdown fences.
|
|
820
|
+
</instructions>
|
|
821
|
+
|
|
822
|
+
<context>
|
|
823
|
+
Label semantics:
|
|
824
|
+
|
|
825
|
+
- status_ping: progress / ETA query.
|
|
826
|
+
Triggers: "how's it going?", "any update?", "where are we?",
|
|
827
|
+
"still working?", "what's your eta?", "are you alive?"
|
|
828
|
+
NOT triggers: "where does the config live?" (technical question),
|
|
829
|
+
"where are we in the migration plan?" (substantive scope question).
|
|
830
|
+
|
|
831
|
+
- unmet_expectation: short question pointing at something the agent
|
|
832
|
+
should have produced.
|
|
833
|
+
Triggers: bare one-word questions ending in "?": "screenshot?",
|
|
834
|
+
"tests?", "diff?", "link?", "logs?", "stacktrace?".
|
|
835
|
+
NOT triggers: "what's the type of X?" (substantive),
|
|
836
|
+
"tests for which file?" (clarification, not friction).
|
|
837
|
+
|
|
838
|
+
- confusion: user signals they don't follow the output or state.
|
|
839
|
+
Triggers: "what does that mean?", "I don't get it", "huh?",
|
|
840
|
+
"why did you do X?" (when X already happened), "wait, what?"
|
|
841
|
+
NOT triggers: a calm question about a future action, a request for
|
|
842
|
+
explanation ("explain that step please" — neutral instruction).
|
|
843
|
+
|
|
844
|
+
- interruption: user cuts the agent off or pivots mid-task.
|
|
845
|
+
Triggers: "wait", "stop", "hold on", "pause", "actually...",
|
|
846
|
+
"before you do that", "nvm", "never mind".
|
|
847
|
+
NOT triggers: "wait until tests pass" (instruction, not interrupt),
|
|
848
|
+
"stop the server" (action request).
|
|
849
|
+
|
|
850
|
+
- correction: explicit "you got it wrong".
|
|
851
|
+
Triggers: "no, not that", "that's wrong", "nope", "try again",
|
|
852
|
+
"you're doing it wrong", "incorrect".
|
|
853
|
+
NOT triggers: "actually let me clarify" (re-framing, not correcting),
|
|
854
|
+
technical bug reports ("X returns None instead of []" — substantive).
|
|
855
|
+
|
|
856
|
+
- frustration: terse annoyance or sarcasm.
|
|
857
|
+
Triggers: "ugh", "seriously?", "are you kidding", "really?",
|
|
858
|
+
"come on".
|
|
859
|
+
NOT triggers: a curt but neutral instruction.
|
|
860
|
+
|
|
861
|
+
- none: ordinary task turn. THIS IS THE MAJORITY CLASS — use it
|
|
862
|
+
aggressively. Anything that's a substantive instruction, a plain
|
|
863
|
+
technical question, an acknowledgement, a routing decision, or text
|
|
864
|
+
the user typed to advance the task is none. The threshold for
|
|
865
|
+
friction is high.
|
|
866
|
+
</context>
|
|
867
|
+
|
|
868
|
+
<calibration>
|
|
869
|
+
- confidence < 0.5 is correct when the message is genuinely ambiguous
|
|
870
|
+
between none and a friction label. Don't manufacture certainty.
|
|
871
|
+
- confidence > 0.8 requires an unambiguous cue you can name in the
|
|
872
|
+
rationale field.
|
|
873
|
+
- For obvious cases ("ugh"), 0.95 is fine.
|
|
874
|
+
</calibration>
|
|
875
|
+
|
|
876
|
+
<examples>
|
|
877
|
+
<example>
|
|
878
|
+
<input>screenshot?</input>
|
|
879
|
+
<output>label=unmet_expectation, confidence=0.7. Bare one-word
|
|
880
|
+
question pointing at a missed artifact.</output>
|
|
881
|
+
</example>
|
|
882
|
+
<example>
|
|
883
|
+
<input>stop</input>
|
|
884
|
+
<output>label=interruption, confidence=0.95. Hard interruption keyword
|
|
885
|
+
as the entire message.</output>
|
|
886
|
+
</example>
|
|
887
|
+
<example>
|
|
888
|
+
<input>delete that file</input>
|
|
889
|
+
<output>label=none, confidence=0.9. Bare instruction, not friction.</output>
|
|
890
|
+
</example>
|
|
891
|
+
<example>
|
|
892
|
+
<input>ugh</input>
|
|
893
|
+
<output>label=frustration, confidence=0.95. Unambiguous annoyance.</output>
|
|
894
|
+
</example>
|
|
895
|
+
<example>
|
|
896
|
+
<input>why did you do that?</input>
|
|
897
|
+
<output>label=confusion, confidence=0.85. Questioning a completed
|
|
898
|
+
action.</output>
|
|
899
|
+
</example>
|
|
900
|
+
<example>
|
|
901
|
+
<input>where does the config live?</input>
|
|
902
|
+
<output>label=none, confidence=0.9. Substantive technical question.</output>
|
|
903
|
+
</example>
|
|
904
|
+
<example>
|
|
905
|
+
<input>nope, try again</input>
|
|
906
|
+
<output>label=correction, confidence=0.95. Explicit rejection plus
|
|
907
|
+
redo.</output>
|
|
908
|
+
</example>
|
|
909
|
+
<example>
|
|
910
|
+
<input>tests for the auth module</input>
|
|
911
|
+
<output>label=none, confidence=0.9. Substantive instruction — what
|
|
912
|
+
tests, not a bare "tests?".</output>
|
|
913
|
+
</example>
|
|
914
|
+
</examples>
|
|
915
|
+
|
|
916
|
+
<anti_patterns>
|
|
917
|
+
- A bare instruction is none, even if it sounds curt. "delete that file"
|
|
918
|
+
is not correction. "add a test for X" is not unmet_expectation.
|
|
919
|
+
- A short technical question is none. "what's the type?" /
|
|
920
|
+
"where is X?" are not friction signals. Friction requires affect or
|
|
921
|
+
implicit complaint.
|
|
922
|
+
- Don't flag based on tone alone. "ok" is none, even if you imagine
|
|
923
|
+
it's sarcastic — without surrounding context you can't tell, so
|
|
924
|
+
default to none.
|
|
925
|
+
- Claude Code injects two strings as user-role messages that look like
|
|
926
|
+
friction but are CLI bookkeeping: "Continue from where you left off."
|
|
927
|
+
and "[Request interrupted by user for tool use]". Both should be
|
|
928
|
+
none. (They're filtered upstream so you'll rarely see them, but be
|
|
929
|
+
safe.)
|
|
930
|
+
</anti_patterns>
|
|
931
|
+
"""
|
|
932
|
+
|
|
933
|
+
|
|
934
|
+
_CLASSIFIER_APPENDIX = """\
|
|
935
|
+
|
|
936
|
+
<operating_context>
|
|
937
|
+
You are running offline against a snapshot of Claude Code transcripts
|
|
938
|
+
already on disk. There is no live user to clarify with — you must commit
|
|
939
|
+
to one output for each call. The downstream pipeline writes your output
|
|
940
|
+
to a parquet file used by SQL views and analytics macros; future you (or
|
|
941
|
+
a human auditor) will read these rows in aggregate, not in isolation.
|
|
942
|
+
</operating_context>
|
|
943
|
+
|
|
944
|
+
<quality_bar>
|
|
945
|
+
- Idempotence: the same input must produce the same output across runs.
|
|
946
|
+
Don't introduce randomness or invent details that aren't in the input.
|
|
947
|
+
- Calibration over confidence: a low confidence with the correct label
|
|
948
|
+
is more useful than a high confidence with the wrong one. Confidence
|
|
949
|
+
is downstream-weighted; honesty pays.
|
|
950
|
+
- Failure mode: if the input is genuinely undecidable, pick the most
|
|
951
|
+
conservative / abstaining label the schema allows (unknown, none,
|
|
952
|
+
empty list) and set confidence below 0.5. Do not guess.
|
|
953
|
+
- The schema is the contract: every field is required, no field may be
|
|
954
|
+
null unless the schema marks it optional, and string fields have
|
|
955
|
+
practical length budgets stated in their descriptions — respect them.
|
|
956
|
+
</quality_bar>
|
|
957
|
+
|
|
958
|
+
<output_rules>
|
|
959
|
+
- Output is parsed as JSON. Bedrock's output_config.format enforces the
|
|
960
|
+
schema, but you should still produce valid JSON without surrounding
|
|
961
|
+
text or fences. The parser ignores prose; you waste tokens by emitting
|
|
962
|
+
it.
|
|
963
|
+
- Do not echo the schema, the system prompt, or the user message back.
|
|
964
|
+
Just the structured object.
|
|
965
|
+
- Field order in your output should match the order in the schema. This
|
|
966
|
+
is conventional, not enforced, but it makes the parquet rows readable.
|
|
967
|
+
</output_rules>
|
|
968
|
+
"""
|
|
969
|
+
|
|
970
|
+
|
|
971
|
+
# Append the shared appendix (operating context, quality bar, output rules)
# to each of the four system prompts, so the common guidance is written once
# in _CLASSIFIER_APPENDIX instead of being repeated in every prompt body.
CLASSIFY_SYSTEM_PROMPT += _CLASSIFIER_APPENDIX
TRAJECTORY_SYSTEM_PROMPT += _CLASSIFIER_APPENDIX
CONFLICTS_SYSTEM_PROMPT += _CLASSIFIER_APPENDIX
USER_FRICTION_SYSTEM_PROMPT += _CLASSIFIER_APPENDIX
|
|
975
|
+
|
|
976
|
+
|
|
977
|
+
def _estimate_cost(
|
|
978
|
+
n_items: int,
|
|
979
|
+
avg_in_tokens: int,
|
|
980
|
+
avg_out_tokens: int,
|
|
981
|
+
pricing: tuple[float, float],
|
|
982
|
+
) -> float:
|
|
983
|
+
"""Back-of-envelope dollar estimate for ``n_items`` classification calls."""
|
|
984
|
+
in_rate, out_rate = pricing
|
|
985
|
+
return (n_items * avg_in_tokens * in_rate + n_items * avg_out_tokens * out_rate) / 1_000_000
|
|
986
|
+
|
|
987
|
+
|
|
988
|
+
# ---------------------------------------------------------------------------
|
|
989
|
+
# Pipeline 1: session classification
|
|
990
|
+
# ---------------------------------------------------------------------------
|
|
991
|
+
|
|
992
|
+
|
|
993
|
+
async def _classify_sessions_async(
    con: duckdb.DuckDBPyConnection,
    settings: Settings,
    *,
    since_days: int | None,
    limit: int | None,
    thinking_mode: str,
) -> int:
    """Async implementation behind :func:`classify_sessions`.

    Selects the sessions that still need a classification, sends them to the
    model in chunks, appends the successful rows to the classifications
    parquet, records a checkpoint for each classified session, and queues
    failed sessions for retry.

    Args:
        con: DuckDB connection passed through to ``session_bounds`` and
            ``iter_session_texts``.
        settings: Supplies the parquet / checkpoint paths, model id,
            concurrency, batch size and token budget used below.
        since_days: Optional recency window forwarded to the session queries.
        limit: Optional session cap forwarded to the session queries.
        thinking_mode: Thinking setting forwarded to every ``_classify_one``
            call.

    Returns:
        Total number of classification rows written across all chunks.
    """
    already: set[str] = set()
    done_df = read_all(settings.classifications_parquet_path)
    if done_df is not None and done_df.height > 0:
        already = set(done_df["session_id"].to_list())

    # Checkpoint skip: compare current (last_ts, mtime) against the last run.
    # NOTE(review): despite its name, `unchanged_pending` is used as the set of
    # sessions to KEEP processing; `skipped` is only ever logged as a count.
    bounds = session_bounds(con, since_days=since_days, limit=limit)
    unchanged_pending, skipped = checkpointer.filter_unchanged(
        ((sid, lt, mt) for sid, (lt, mt) in bounds.items()),
        pipeline="classify",
        checkpoint_db_path=settings.checkpoint_db_path,
    )
    keep = set(unchanged_pending)

    # Retry queue: pull pending retries first so they're re-enqueued into
    # `keep` even when the checkpoint would otherwise skip them.
    retry_ids = set(retry_queue.drain(settings.checkpoint_db_path, pipeline="classify"))
    if retry_ids:
        logger.info("classify: draining {} retry-queue entries", len(retry_ids))
        keep |= retry_ids

    pending: list[tuple[str, str]] = []
    for sid, text in iter_session_texts(con, settings=settings, since_days=since_days, limit=limit):
        # Already-classified sessions are only redone when explicitly retried.
        if sid in already and sid not in retry_ids:
            continue
        if sid not in keep:
            continue
        pending.append((sid, text))

    if not pending:
        logger.info("classify: no pending sessions (skipped={} via checkpoint)", skipped)
        return 0
    if skipped:
        logger.info("classify: skipped {} sessions via checkpoint", skipped)

    client = _build_bedrock_client(settings)
    sem = anyio.CapacityLimiter(settings.llm_concurrency)
    chunk_size = max(settings.batch_size * 4, 256)
    n_chunks = (len(pending) + chunk_size - 1) // chunk_size
    logger.info(
        "classify: {} pending, model={}, thinking={}, concurrency={}, chunks of {}",
        len(pending),
        settings.sonnet_model_id,
        thinking_mode,
        settings.llm_concurrency,
        chunk_size,
    )

    written = 0
    for i in range(0, len(pending), chunk_size):
        chunk = pending[i : i + chunk_size]
        t0 = time.monotonic()
        coros = [
            _classify_one(
                client,
                settings.sonnet_model_id,
                SESSION_CLASSIFICATION_SCHEMA,
                text,
                max_tokens=settings.classify_max_tokens,
                thinking_mode=thinking_mode,
                sem=sem,
                system=CLASSIFY_SYSTEM_PROMPT,
            )
            for _, text in chunk
        ]
        # return_exceptions=True: one failed call must not sink the chunk;
        # failures come back as exception objects in the matching position.
        results = await asyncio.gather(*coros, return_exceptions=True)
        elapsed = time.monotonic() - t0

        now = datetime.now(UTC)
        ok_rows: list[dict[str, Any]] = []
        errors = 0
        for (sid, _), res in zip(chunk, results, strict=True):
            if isinstance(res, BaseException):
                errors += 1
                logger.warning("classify: {} failed (queued for retry): {}", sid, res)
                retry_queue.enqueue(
                    settings.checkpoint_db_path,
                    pipeline="classify",
                    unit_id=sid,
                    error=str(res),
                )
                continue
            res_dict: dict[str, Any] = res
            # `.get(key, 0.0)` only covers a missing key. An explicit null
            # would reach float(None) and raise, aborting this loop before the
            # chunk's rows are written; treat null like "missing" instead.
            raw_confidence = res_dict.get("confidence")
            ok_rows.append(
                {
                    "session_id": sid,
                    "autonomy_tier": res_dict.get("autonomy_tier"),
                    "work_category": res_dict.get("work_category"),
                    "success": res_dict.get("success"),
                    "goal": res_dict.get("goal"),
                    "confidence": float(raw_confidence) if raw_confidence is not None else 0.0,
                    "classified_at": now,
                }
            )

        if ok_rows:
            df = pl.DataFrame(
                ok_rows,
                schema={
                    "session_id": pl.Utf8,
                    "autonomy_tier": pl.Utf8,
                    "work_category": pl.Utf8,
                    "success": pl.Utf8,
                    "goal": pl.Utf8,
                    "confidence": pl.Float32,
                    "classified_at": pl.Datetime("us", "UTC"),
                },
            )
            write_part(settings.classifications_parquet_path, df)

            # Checkpoint the sessions we just classified — at their CURRENT
            # bounds, so a later re-run with no new messages is a no-op. Also
            # clear those sessions from the retry queue. Retried ids that are
            # absent from `bounds` are recorded with (None, None).
            ok_sids = [row["session_id"] for row in ok_rows]
            checkpointer.mark_completed(
                settings.checkpoint_db_path,
                pipeline="classify",
                rows=[(sid, *bounds.get(sid, (None, None))) for sid in ok_sids],
            )
            retry_queue.mark_done(
                settings.checkpoint_db_path,
                pipeline="classify",
                unit_ids=ok_sids,
            )

        written += len(ok_rows)
        logger.info(
            "classify chunk {}/{}: {} ok, {} errors, {:.1f}s ({:.1f} sess/s)",
            i // chunk_size + 1,
            n_chunks,
            len(ok_rows),
            errors,
            elapsed,
            len(ok_rows) / elapsed if elapsed > 0 else 0,
        )

    logger.info("classify: wrote {} total rows", written)
    return written
|
|
1140
|
+
|
|
1141
|
+
|
|
1142
|
+
def _count_pending_sessions(
|
|
1143
|
+
con: duckdb.DuckDBPyConnection,
|
|
1144
|
+
*,
|
|
1145
|
+
already: set[str],
|
|
1146
|
+
since_days: int | None,
|
|
1147
|
+
limit: int | None,
|
|
1148
|
+
) -> int:
|
|
1149
|
+
"""Return the count of sessions that have text messages but no classification yet.
|
|
1150
|
+
|
|
1151
|
+
Pure SQL — does NOT materialize any session text. This is the fast path for
|
|
1152
|
+
``--dry-run`` cost estimation against the full corpus (the previous path
|
|
1153
|
+
iterated :func:`iter_session_texts`, which took ~15 min on 6K+ sessions).
|
|
1154
|
+
"""
|
|
1155
|
+
where = ["mt.text_content IS NOT NULL", "length(mt.text_content) >= 1"]
|
|
1156
|
+
if since_days is not None:
|
|
1157
|
+
where.append(f"mt.ts >= current_timestamp - INTERVAL {int(since_days)} DAY")
|
|
1158
|
+
sql = f"""
|
|
1159
|
+
SELECT count(DISTINCT CAST(mt.session_id AS VARCHAR))
|
|
1160
|
+
FROM messages_text mt
|
|
1161
|
+
WHERE {" AND ".join(where)}
|
|
1162
|
+
"""
|
|
1163
|
+
row = con.execute(sql).fetchone()
|
|
1164
|
+
total = int(row[0]) if row is not None else 0
|
|
1165
|
+
if already:
|
|
1166
|
+
# Subtract sessions that already have a classification. We pull only
|
|
1167
|
+
# the overlap via a parameterized IN so we don't double-count sessions
|
|
1168
|
+
# in ``already`` that aren't actually in the corpus anymore.
|
|
1169
|
+
placeholders = ",".join("?" for _ in already)
|
|
1170
|
+
overlap_sql = f"""
|
|
1171
|
+
SELECT count(DISTINCT CAST(mt.session_id AS VARCHAR))
|
|
1172
|
+
FROM messages_text mt
|
|
1173
|
+
WHERE {" AND ".join(where)}
|
|
1174
|
+
AND CAST(mt.session_id AS VARCHAR) IN ({placeholders})
|
|
1175
|
+
"""
|
|
1176
|
+
overlap_row = con.execute(overlap_sql, list(already)).fetchone()
|
|
1177
|
+
overlap = int(overlap_row[0]) if overlap_row is not None else 0
|
|
1178
|
+
total = max(0, total - overlap)
|
|
1179
|
+
if limit is not None:
|
|
1180
|
+
total = min(total, int(limit))
|
|
1181
|
+
return total
|
|
1182
|
+
|
|
1183
|
+
|
|
1184
|
+
def classify_sessions(
    con: duckdb.DuckDBPyConnection,
    settings: Settings,
    *,
    since_days: int | None = None,
    limit: int | None = None,
    dry_run: bool = False,
    no_thinking: bool = False,
) -> int | dict[str, Any]:
    """Classify pending sessions, or describe what a run would cost.

    A normal run drives :func:`_classify_sessions_async` to completion and
    returns the number of successful classifications.

    With ``dry_run=True`` no model call is made. The return value is a plan
    dict with keys ``{pipeline, candidates, llm_calls, avg_input_tokens,
    avg_output_tokens, estimated_cost_usd, model, thinking, since_days,
    limit, dry_run}`` so the CLI can emit it as structured JSON.

    ``no_thinking=True`` forces the thinking mode to ``"disabled"``;
    otherwise ``settings.classify_thinking`` is used.
    """
    thinking_mode = "disabled" if no_thinking else settings.classify_thinking

    if not dry_run:
        return asyncio.run(
            _classify_sessions_async(
                con,
                settings,
                since_days=since_days,
                limit=limit,
                thinking_mode=thinking_mode,
            )
        )

    # Dry run: count what is pending without touching any session text.
    done_df = read_all(settings.classifications_parquet_path)
    has_done_rows = done_df is not None and done_df.height > 0
    already: set[str] = set(done_df["session_id"].to_list()) if has_done_rows else set()
    pending_count = _count_pending_sessions(
        con, already=already, since_days=since_days, limit=limit
    )

    # Back-of-envelope: avg 8K input tokens, 300 output per session.
    avg_input_tokens = 8000
    avg_output_tokens = 300
    cost = _estimate_cost(
        pending_count, avg_input_tokens, avg_output_tokens, settings.sonnet_pricing
    )
    logger.info(
        "classify --dry-run: {} sessions pending. Estimated cost ~${:.2f} "
        "(thinking={}, model={})",
        pending_count,
        cost,
        thinking_mode,
        settings.sonnet_model_id,
    )
    return {
        "pipeline": "classify",
        "candidates": pending_count,
        "llm_calls": pending_count,
        "avg_input_tokens": avg_input_tokens,
        "avg_output_tokens": avg_output_tokens,
        "estimated_cost_usd": round(cost, 4),
        "model": settings.sonnet_model_id,
        "thinking": thinking_mode,
        "since_days": since_days,
        "limit": limit,
        "dry_run": True,
    }
|
|
1243
|
+
|
|
1244
|
+
|
|
1245
|
+
# ---------------------------------------------------------------------------
|
|
1246
|
+
# Pipeline 2: message trajectory
|
|
1247
|
+
# ---------------------------------------------------------------------------
|
|
1248
|
+
|
|
1249
|
+
# Cheap prefilter: short + starts with acknowledgement pattern -> is_transition, skip LLM.
|
|
1250
|
+
_TRANSITION_RE = re.compile(
|
|
1251
|
+
r"^\s*(ok|okay|alright|now|let me|great[,!]?|sure|got it|sounds good|perfect|clean)\b",
|
|
1252
|
+
re.IGNORECASE,
|
|
1253
|
+
)
|
|
1254
|
+
|
|
1255
|
+
|
|
1256
|
+
def _heuristic_trajectory(text: str) -> dict | None:
|
|
1257
|
+
"""Fast path -- return a result dict if confident, else None."""
|
|
1258
|
+
if not text:
|
|
1259
|
+
return None
|
|
1260
|
+
if len(text) < 80 and _TRANSITION_RE.match(text):
|
|
1261
|
+
return {"sentiment_delta": "neutral", "is_transition": True, "confidence": 0.9}
|
|
1262
|
+
return None
|
|
1263
|
+
|
|
1264
|
+
|
|
1265
|
+
async def _trajectory_async(
    con: duckdb.DuckDBPyConnection,
    settings: Settings,
    *,
    since_days: int | None,
    limit: int | None,
    thinking_mode: str,
) -> int:
    """Async implementation behind :func:`trajectory_messages`.

    Selects rows from ``messages_text`` that are not yet in the trajectory
    parquet, labels short acknowledgement messages with the local heuristic,
    sends the rest to Bedrock in chunks and appends every result to the
    parquet.  Failed calls are queued for retry; refusals are written as a
    neutral placeholder.

    A session is checkpointed only once every message selected for it in
    this run has been written (heuristic row, LLM result or refusal
    placeholder).  A session that still has a failed or not-yet-reached
    message stays un-stamped, so the next run selects it again and the uuid
    anti-join against the parquet picks up exactly what is missing.

    Args:
        con: DuckDB connection on which ``messages_text`` can be queried.
        settings: Paths, model id, concurrency and batch-size configuration.
        since_days: Only consider messages newer than this many days, if set.
        limit: Cap on fetched messages (also forwarded to ``session_bounds``).
        thinking_mode: Thinking mode forwarded to ``_classify_one``.

    Returns:
        Number of rows written to the trajectory parquet in this run.
    """
    row_schema = {
        "uuid": pl.Utf8,
        "sentiment_delta": pl.Utf8,
        "is_transition": pl.Boolean,
        "confidence": pl.Float32,
        "classified_at": pl.Datetime("us", "UTC"),
    }

    already: set[str] = set()
    done_df = read_all(settings.trajectory_parquet_path)
    if done_df is not None and done_df.height > 0:
        already = set(done_df["uuid"].to_list())

    # Session-level checkpoint: drop messages whose host session has not
    # advanced since the last trajectory run.  This cuts the per-message SQL
    # down before the anti-join on uuid.
    bounds = session_bounds(con, since_days=since_days, limit=limit)
    unchanged_pending, skipped_sessions = checkpointer.filter_unchanged(
        ((sid, lt, mt) for sid, (lt, mt) in bounds.items()),
        pipeline="trajectory",
        checkpoint_db_path=settings.checkpoint_db_path,
    )
    active_sessions: set[str] = set(unchanged_pending)

    # Retry queue: drain pending failed uuids into the `already`-bypass set
    # so they get retried even though they landed in the parquet the first
    # time they were attempted.
    retry_uuids = set(retry_queue.drain(settings.checkpoint_db_path, pipeline="trajectory"))
    if retry_uuids:
        logger.info("trajectory: draining {} retry-queue entries", len(retry_uuids))
        already -= retry_uuids

    where = ["mt.text_content IS NOT NULL", "length(mt.text_content) >= 1"]
    if since_days is not None:
        where.append(f"mt.ts >= current_timestamp - INTERVAL {int(since_days)} DAY")
    if active_sessions:
        where.append(
            "CAST(mt.session_id AS VARCHAR) IN (SELECT unnest(?))",
        )
    where_sql = " AND ".join(where)
    sql = f"""
        SELECT CAST(mt.uuid AS VARCHAR) AS uuid,
               CAST(mt.session_id AS VARCHAR) AS sid,
               mt.text_content
        FROM messages_text mt
        WHERE {where_sql}
        ORDER BY mt.ts
    """
    if limit is not None:
        sql += f"\nLIMIT {int(limit)}"
    params = [list(active_sessions)] if active_sessions else []
    # Bounds present but nothing active means every session is checkpointed,
    # so there is nothing to fetch.  With no bounds at all the query runs
    # without a session filter.
    rows_raw = con.execute(sql, params).fetchall() if active_sessions or not bounds else []
    session_for_uuid = {r[0]: r[1] for r in rows_raw if r[0] not in already}
    rows = [(r[0], r[2]) for r in rows_raw if r[0] in session_for_uuid]
    if skipped_sessions:
        logger.info(
            "trajectory: skipped {} sessions via checkpoint",
            skipped_sessions,
        )
    logger.info("trajectory: {} pending messages", len(rows))

    if not rows:
        logger.info("trajectory: wrote 0 total rows (nothing pending)")
        return 0

    # Messages still to be written per session.  A session may be stamped
    # only when its count reaches zero; rows are ordered by timestamp across
    # sessions, so one session's messages can straddle several chunks.
    # NOTE(review): with `limit` set, messages beyond the LIMIT are not
    # counted here, so a session can still be stamped with unfetched
    # messages -- same as before this change; confirm whether that matters.
    outstanding: dict[str, int] = {}
    for uuid, _ in rows:
        sid = session_for_uuid[uuid]
        if sid is not None:
            outstanding[sid] = outstanding.get(sid, 0) + 1

    def _checkpoint_finished(written_uuids: list[str]) -> None:
        """Stamp sessions whose last outstanding message was just written."""
        finished: list[str] = []
        for written_uuid in written_uuids:
            sid = session_for_uuid.get(written_uuid)
            if sid is None or sid not in outstanding:
                continue
            outstanding[sid] -= 1
            if outstanding[sid] == 0:
                del outstanding[sid]
                finished.append(sid)
        if finished:
            checkpointer.mark_completed(
                settings.checkpoint_db_path,
                pipeline="trajectory",
                rows=[(sid, *bounds.get(sid, (None, None))) for sid in finished],
            )

    heuristic_rows: list[dict[str, Any]] = []
    llm_pending: list[tuple[str, str]] = []
    now = datetime.now(UTC)
    for uuid, text in rows:
        fast = _heuristic_trajectory(text)
        if fast is not None:
            heuristic_rows.append({"uuid": uuid, **fast, "classified_at": now})
        else:
            llm_pending.append((uuid, text))

    logger.info(
        "trajectory: {} heuristic, {} LLM",
        len(heuristic_rows),
        len(llm_pending),
    )

    if heuristic_rows:
        write_part(settings.trajectory_parquet_path, pl.DataFrame(heuristic_rows, schema=row_schema))
        _checkpoint_finished([row["uuid"] for row in heuristic_rows])

    if not llm_pending:
        logger.info("trajectory: wrote {} total rows", len(heuristic_rows))
        return len(heuristic_rows)

    client = _build_bedrock_client(settings)
    sem = anyio.CapacityLimiter(settings.llm_concurrency)
    chunk_size = max(settings.batch_size * 4, 256)
    written = len(heuristic_rows)

    for i in range(0, len(llm_pending), chunk_size):
        chunk = llm_pending[i : i + chunk_size]
        t0 = time.monotonic()
        coros = [
            _classify_one(
                client,
                settings.sonnet_model_id,
                MESSAGE_TRAJECTORY_SCHEMA,
                text,
                max_tokens=settings.classify_max_tokens,
                thinking_mode=thinking_mode,
                sem=sem,
                system=TRAJECTORY_SYSTEM_PROMPT,
            )
            for _, text in chunk
        ]
        results = await asyncio.gather(*coros, return_exceptions=True)
        # One timestamp per chunk, shared by every row the chunk produces.
        now = datetime.now(UTC)

        ok: list[dict[str, Any]] = []
        ok_uuids: list[str] = []
        refused_uuids: list[str] = []
        errors = 0
        for (uuid, _), res in zip(chunk, results, strict=True):
            if isinstance(res, BedrockRefusalError):
                # Terminal: Bedrock won't classify this body. Stamp a neutral
                # placeholder so the session moves on and the retry queue
                # doesn't cycle forever on the same refusal.
                logger.info("trajectory: {} refused by Bedrock — marking neutral", uuid)
                ok.append(
                    {
                        "uuid": uuid,
                        "sentiment_delta": "neutral",
                        "is_transition": False,
                        "confidence": 0.0,
                        "classified_at": now,
                    }
                )
                refused_uuids.append(uuid)
                continue
            if isinstance(res, BaseException):
                errors += 1
                logger.warning("trajectory: {} failed (queued for retry): {}", uuid, res)
                retry_queue.enqueue(
                    settings.checkpoint_db_path,
                    pipeline="trajectory",
                    unit_id=uuid,
                    error=str(res),
                )
                continue
            res_dict: dict[str, Any] = res
            ok.append(
                {
                    "uuid": uuid,
                    "sentiment_delta": res_dict.get("sentiment_delta"),
                    "is_transition": bool(res_dict.get("is_transition", False)),
                    # `or 0.0` keeps an explicit null from raising in float()
                    # and discarding the rest of the chunk.
                    "confidence": float(res_dict.get("confidence") or 0.0),
                    "classified_at": now,
                }
            )
            ok_uuids.append(uuid)
        if ok:
            write_part(settings.trajectory_parquet_path, pl.DataFrame(ok, schema=row_schema))
        # Clear retry queue for both successful uuids AND refusals we just
        # neutralised — the refusal placeholder lives in the parquet now,
        # so these uuids must not loop back through the queue.
        done_uuids = ok_uuids + refused_uuids
        if done_uuids:
            retry_queue.mark_done(
                settings.checkpoint_db_path,
                pipeline="trajectory",
                unit_ids=done_uuids,
            )
        # Per-chunk checkpoint: stamp only sessions that are now completely
        # written.  Failed uuids are never counted down, so their sessions
        # stay un-stamped and are selected again on the next run.
        _checkpoint_finished(done_uuids)
        written += len(ok)
        logger.info(
            "trajectory chunk {}/{}: {} ok, {} errors, {:.1f}s",
            i // chunk_size + 1,
            (len(llm_pending) + chunk_size - 1) // chunk_size,
            len(ok),
            errors,
            time.monotonic() - t0,
        )

    logger.info("trajectory: wrote {} total rows", written)
    return written
|
|
1494
|
+
|
|
1495
|
+
|
|
1496
|
+
def trajectory_messages(
    con: duckdb.DuckDBPyConnection,
    settings: Settings,
    *,
    since_days: int | None = None,
    limit: int | None = None,
    dry_run: bool = False,
    no_thinking: bool = False,
) -> int | dict[str, Any]:
    """Per-message sentiment + transition classification.

    In ``--dry-run`` mode returns a plan dict (see :func:`classify_sessions`).
    The dry-run count applies the same text predicate as the real run but
    does not subtract messages already present in the trajectory parquet, and
    it assumes half of the candidates reach the LLM.

    Args:
        con: DuckDB connection on which ``messages_text`` can be queried.
        settings: Model id, pricing and thinking configuration.
        since_days: Only consider messages newer than this many days, if set.
        limit: Cap on the number of messages considered, if set.
        dry_run: Return the cost plan instead of calling the model.
        no_thinking: Force thinking mode to ``"disabled"``.

    Returns:
        The number of rows written, or the plan dict when *dry_run* is true.
    """
    thinking_mode = "disabled" if no_thinking else settings.trajectory_thinking
    if dry_run:
        # Same candidate predicate as `_trajectory_async`, so empty messages
        # the real run never selects are not counted in the plan.
        where = ["mt.text_content IS NOT NULL", "length(mt.text_content) >= 1"]
        if since_days is not None:
            where.append(f"mt.ts >= current_timestamp - INTERVAL {int(since_days)} DAY")
        where_sql = " AND ".join(where)
        if limit is not None:
            sql = f"SELECT least({int(limit)}, count(*)) FROM messages_text mt WHERE {where_sql}"
        else:
            sql = f"SELECT count(*) FROM messages_text mt WHERE {where_sql}"
        row = con.execute(sql).fetchone()
        n = int(row[0]) if row is not None else 0
        # Roughly half survive heuristic pre-filter.
        llm_n = n // 2
        cost = _estimate_cost(llm_n, 500, 50, settings.sonnet_pricing)
        logger.info(
            "trajectory --dry-run: {} messages, estimated LLM cost ~${:.2f}",
            n,
            cost,
        )
        return {
            "pipeline": "trajectory",
            "candidates": n,
            "llm_calls": llm_n,
            "avg_input_tokens": 500,
            "avg_output_tokens": 50,
            "estimated_cost_usd": round(cost, 4),
            "model": settings.sonnet_model_id,
            "thinking": thinking_mode,
            "since_days": since_days,
            "limit": limit,
            "dry_run": True,
        }
    return asyncio.run(
        _trajectory_async(
            con,
            settings,
            since_days=since_days,
            limit=limit,
            thinking_mode=thinking_mode,
        )
    )
|
|
1553
|
+
|
|
1554
|
+
|
|
1555
|
+
# ---------------------------------------------------------------------------
|
|
1556
|
+
# Pipeline 3: conflict detection
|
|
1557
|
+
# ---------------------------------------------------------------------------
|
|
1558
|
+
|
|
1559
|
+
|
|
1560
|
+
async def _conflicts_async(
    con: duckdb.DuckDBPyConnection,
    settings: Settings,
    *,
    since_days: int | None,
    limit: int | None,
    thinking_mode: str,
) -> int:
    """Async implementation behind :func:`detect_conflicts`.

    Sends each pending session's text to Bedrock in chunks and appends one
    parquet row per detected conflict.  A session with no conflicts, or one
    Bedrock refuses to classify, gets a single ``empty=True`` sentinel row so
    it is not selected again.  Other failures are queued for retry.

    Args:
        con: DuckDB connection passed to ``session_bounds`` and
            ``iter_session_texts``.
        settings: Paths, model id, concurrency and batch-size configuration.
        since_days: Only consider sessions newer than this many days, if set.
        limit: Cap forwarded to ``session_bounds`` / ``iter_session_texts``.
        thinking_mode: Thinking mode forwarded to ``_classify_one``.

    Returns:
        Number of sessions written (with conflicts or a sentinel) in this run.
    """
    already: set[str] = set()
    done_df = read_all(settings.conflicts_parquet_path)
    if done_df is not None and done_df.height > 0:
        already = set(done_df["session_id"].to_list())

    bounds = session_bounds(con, since_days=since_days, limit=limit)
    unchanged_pending, skipped = checkpointer.filter_unchanged(
        ((sid, lt, mt) for sid, (lt, mt) in bounds.items()),
        pipeline="conflicts",
        checkpoint_db_path=settings.checkpoint_db_path,
    )
    keep = set(unchanged_pending)

    # Queued retries are re-admitted even when their session is checkpointed
    # or already has rows in the parquet.
    retry_ids = set(retry_queue.drain(settings.checkpoint_db_path, pipeline="conflicts"))
    if retry_ids:
        logger.info("conflicts: draining {} retry-queue entries", len(retry_ids))
        keep |= retry_ids

    pending: list[tuple[str, str]] = []
    for sid, text in iter_session_texts(con, settings=settings, since_days=since_days, limit=limit):
        if sid in already and sid not in retry_ids:
            continue
        if sid not in keep:
            continue
        pending.append((sid, text))

    if not pending:
        logger.info("conflicts: no pending sessions (skipped={} via checkpoint)", skipped)
        return 0
    if skipped:
        logger.info("conflicts: skipped {} sessions via checkpoint", skipped)

    client = _build_bedrock_client(settings)
    sem = anyio.CapacityLimiter(settings.llm_concurrency)
    chunk_size = max(settings.batch_size * 4, 256)
    logger.info("conflicts: {} pending sessions", len(pending))

    written = 0
    for i in range(0, len(pending), chunk_size):
        chunk = pending[i : i + chunk_size]
        t0 = time.monotonic()
        coros = [
            _classify_one(
                client,
                settings.sonnet_model_id,
                SESSION_CONFLICTS_SCHEMA,
                text,
                max_tokens=settings.classify_max_tokens,
                thinking_mode=thinking_mode,
                sem=sem,
                system=CONFLICTS_SYSTEM_PROMPT,
            )
            for _, text in chunk
        ]
        results = await asyncio.gather(*coros, return_exceptions=True)
        now = datetime.now(UTC)

        rows: list[dict[str, Any]] = []
        done_sids: set[str] = set()
        errors = 0
        for (sid, _), res in zip(chunk, results, strict=True):
            refused = isinstance(res, BedrockRefusalError)
            if isinstance(res, BaseException) and not refused:
                errors += 1
                logger.warning("conflicts: {} failed (queued for retry): {}", sid, res)
                retry_queue.enqueue(
                    settings.checkpoint_db_path,
                    pipeline="conflicts",
                    unit_id=sid,
                    error=str(res),
                )
                continue
            done_sids.add(sid)
            if refused:
                # Terminal, as in the trajectory pipeline: re-queuing a
                # refusal would repeat the same call on every run.
                logger.info("conflicts: {} refused by Bedrock — marking empty", sid)
                conflicts = []
            else:
                res_dict: dict[str, Any] = res
                conflicts = res_dict.get("conflicts") or []
            if not conflicts:
                # Write a sentinel row so we don't re-classify this session.
                rows.append(
                    {
                        "session_id": sid,
                        "conflict_idx": 0,
                        "stance_a": None,
                        "stance_b": None,
                        "resolution": None,
                        "detected_at": now,
                        "empty": True,
                    }
                )
                continue
            for idx, c in enumerate(conflicts):
                rows.append(
                    {
                        "session_id": sid,
                        "conflict_idx": idx,
                        "stance_a": c.get("stance_a"),
                        "stance_b": c.get("stance_b"),
                        "resolution": c.get("resolution"),
                        "detected_at": now,
                        "empty": False,
                    }
                )
        if rows:
            df = pl.DataFrame(
                rows,
                schema={
                    "session_id": pl.Utf8,
                    "conflict_idx": pl.Int32,
                    "stance_a": pl.Utf8,
                    "stance_b": pl.Utf8,
                    "resolution": pl.Utf8,
                    "detected_at": pl.Datetime("us", "UTC"),
                    "empty": pl.Boolean,
                },
            )
            write_part(settings.conflicts_parquet_path, df)
        if done_sids:
            # Checkpoint and clear retry entries only after the rows landed.
            checkpointer.mark_completed(
                settings.checkpoint_db_path,
                pipeline="conflicts",
                rows=[(sid, *bounds.get(sid, (None, None))) for sid in done_sids],
            )
            retry_queue.mark_done(
                settings.checkpoint_db_path,
                pipeline="conflicts",
                unit_ids=list(done_sids),
            )
        written += len(done_sids)
        logger.info(
            "conflicts chunk {}/{}: {} sessions processed, {} errors, {:.1f}s",
            i // chunk_size + 1,
            (len(pending) + chunk_size - 1) // chunk_size,
            len(chunk) - errors,
            errors,
            time.monotonic() - t0,
        )

    logger.info("conflicts: processed {} sessions", written)
    return written
|
|
1709
|
+
|
|
1710
|
+
|
|
1711
|
+
def detect_conflicts(
    con: duckdb.DuckDBPyConnection,
    settings: Settings,
    *,
    since_days: int | None = None,
    limit: int | None = None,
    dry_run: bool = False,
    no_thinking: bool = False,
) -> int | dict[str, Any]:
    """Run stance-conflict detection per session and return the processed count.

    With ``dry_run`` set, nothing is sent to the model; a plan dict with the
    pending-session count and an estimated cost is returned instead (see
    :func:`classify_sessions`).
    """
    thinking_mode = settings.classify_thinking if not no_thinking else "disabled"

    if not dry_run:
        job = _conflicts_async(
            con,
            settings,
            since_days=since_days,
            limit=limit,
            thinking_mode=thinking_mode,
        )
        return asyncio.run(job)

    # Dry run: sessions that already have rows in the parquet are excluded
    # from the pending count.
    previous = read_all(settings.conflicts_parquet_path)
    seen: set[str] = set()
    if previous is not None and previous.height > 0:
        seen = set(previous["session_id"].to_list())

    pending_count = _count_pending_sessions(
        con, already=seen, since_days=since_days, limit=limit
    )
    cost = _estimate_cost(pending_count, 6000, 400, settings.sonnet_pricing)
    logger.info(
        "conflicts --dry-run: {} sessions, estimated cost ~${:.2f}",
        pending_count,
        cost,
    )
    plan: dict[str, Any] = {
        "pipeline": "conflicts",
        "candidates": pending_count,
        "llm_calls": pending_count,
        "avg_input_tokens": 6000,
        "avg_output_tokens": 400,
        "estimated_cost_usd": round(cost, 4),
        "model": settings.sonnet_model_id,
        "thinking": thinking_mode,
        "since_days": since_days,
        "limit": limit,
        "dry_run": True,
    }
    return plan
|