claude-sql 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1760 @@
1
+ """Bedrock Sonnet 4.6 classification worker.
2
+
3
+ Uses ``invoke_model`` with ``output_config.format`` (GA structured output) --
4
+ NO ``tool_use`` / ``tool_choice`` machinery. Pydantic v2 models in
5
+ ``schemas.py`` supply the flattened JSON Schema dicts.
6
+
7
+ Three public pipelines
8
+ ----------------------
9
+ classify_sessions(con, settings, *, since_days, limit, dry_run, no_thinking) -> int
10
+ trajectory_messages(con, settings, *, since_days, limit, dry_run, no_thinking) -> int
11
+ detect_conflicts(con, settings, *, since_days, limit, dry_run, no_thinking) -> int
12
+
13
+ Each pipeline discovers unfinished rows (anti-join against its parquet),
14
+ dispatches parallel Bedrock calls under a semaphore, and writes results in
15
+ chunks of ``max(batch_size * 4, 256)`` for crash-resilience.
16
+
17
+ Tenacity + botocore retry shape mirrors ``embed_worker._is_retryable`` exactly
18
+ so throttling behaves the same.
19
+ """
20
+
21
+ from __future__ import annotations
22
+
23
+ import asyncio
24
+ import json
25
+ import os
26
+ import re
27
+ import threading
28
+ import time
29
+ from datetime import UTC, datetime
30
+ from pathlib import Path
31
+ from typing import TYPE_CHECKING, Any
32
+
33
+ import anyio
34
+ import anyio.to_thread
35
+ import boto3
36
+ import polars as pl
37
+ from botocore.config import Config as BotoConfig
38
+ from botocore.exceptions import (
39
+ ClientError,
40
+ ConnectionError as BotoConnectionError,
41
+ EndpointConnectionError,
42
+ ReadTimeoutError,
43
+ SSLError,
44
+ )
45
+ from loguru import logger
46
+ from tenacity import (
47
+ retry,
48
+ retry_if_exception,
49
+ stop_after_attempt,
50
+ wait_exponential,
51
+ )
52
+
53
+ from claude_sql import checkpointer, retry_queue
54
+ from claude_sql.logging_setup import loguru_before_sleep
55
+ from claude_sql.parquet_shards import read_all, write_part
56
+ from claude_sql.schemas import (
57
+ MESSAGE_TRAJECTORY_SCHEMA,
58
+ SESSION_CLASSIFICATION_SCHEMA,
59
+ SESSION_CONFLICTS_SCHEMA,
60
+ )
61
+ from claude_sql.session_text import iter_session_texts, session_bounds
62
+
63
+ if TYPE_CHECKING:
64
+ import duckdb
65
+
66
+ from claude_sql.config import Settings
67
+
68
+
69
# ``ClientError`` codes that ``_is_retryable`` accepts as transient. Any code
# not listed here is treated as terminal.
_RETRY_CODES: set[str] = {
    # --- Throttling and transient-service errors --------------------------
    "ThrottlingException",
    "ServiceUnavailableException",
    "ModelTimeoutException",
    "ModelErrorException",
    # --- On-demand capacity errors -----------------------------------------
    # Source noted by the original author: AWS re:Post "Troubleshoot Bedrock
    # on-demand 429 Throttling" (2026-05-08).
    "ProvisionedThroughputExceededException",
    "TooManyRequestsException",
    # --- 5xx spikes during cross-region (CRIS) failover --------------------
    # Structured-output invocations are idempotent, so repeating them is safe.
    "InternalServerException",
    "InternalFailure",
}
84
+
85
+
86
#: Optional JSONL trace destination, read once at import time from the
#: ``CLAUDE_SQL_BEDROCK_TRACE`` environment variable. When it is set, each
#: classifier call appends one row (model id, input/output token counts,
#: prompt-cache counters, wall-clock ms) so that prompt caching via
#: ``cache_control`` on the system block can be verified and the real token
#: mix compared against dry-run estimates. Unset means tracing is a no-op.
_BEDROCK_TRACE_PATH = os.getenv("CLAUDE_SQL_BEDROCK_TRACE")
92
+
93
+
94
+ def _maybe_log_bedrock_call(pipeline: str, model_id: str, payload: dict, elapsed_ms: float) -> None:
95
+ """Append a single trace row when ``CLAUDE_SQL_BEDROCK_TRACE`` is set.
96
+
97
+ Anthropic returns prompt-cache stats under ``payload["usage"]``; we
98
+ capture the full shape so downstream cost accounting can split
99
+ 5-minute-TTL writes (1.25× input rate) from 1-hour-TTL writes
100
+ (2× input rate) and cache reads (0.1× input rate). See Anthropic's
101
+ prompt-caching docs for the schema, and AWS's prompt-caching page
102
+ for the per-model cache minimums. Failures are swallowed — tracing
103
+ must never break a real run.
104
+ """
105
+ if not _BEDROCK_TRACE_PATH:
106
+ return
107
+ try:
108
+ usage = payload.get("usage") or {}
109
+ cache_creation = usage.get("cache_creation") or {}
110
+ row = {
111
+ "ts": datetime.now(UTC).isoformat(),
112
+ "pipeline": pipeline,
113
+ "model": model_id,
114
+ "input_tokens": usage.get("input_tokens"),
115
+ "output_tokens": usage.get("output_tokens"),
116
+ "cache_creation_input_tokens": usage.get("cache_creation_input_tokens"),
117
+ "cache_read_input_tokens": usage.get("cache_read_input_tokens"),
118
+ # New-shape fields (present when the model returns the
119
+ # ``cache_creation`` sub-object; older responses omit them).
120
+ "ephemeral_5m_input_tokens": cache_creation.get("ephemeral_5m_input_tokens"),
121
+ "ephemeral_1h_input_tokens": cache_creation.get("ephemeral_1h_input_tokens"),
122
+ "stop_reason": payload.get("stop_reason"),
123
+ "elapsed_ms": round(elapsed_ms, 1),
124
+ }
125
+ path = Path(_BEDROCK_TRACE_PATH)
126
+ path.parent.mkdir(parents=True, exist_ok=True)
127
+ with path.open("a") as fh:
128
+ fh.write(json.dumps(row) + "\n")
129
+ except OSError:
130
+ # Tracing must never break a real run.
131
+ pass
132
+
133
+
134
def _is_retryable(exc: BaseException) -> bool:
    """Decide whether ``exc`` is a transient Bedrock failure worth retrying.

    Transport-level failures (SSL, connection, endpoint-connection and
    read-timeout errors from botocore) are always retryable. A
    ``ClientError`` is retryable only when its ``Error.Code`` is listed in
    ``_RETRY_CODES``. Every other exception is treated as terminal.

    The original author notes this is meant to match
    ``embed_worker._is_retryable``.
    """
    transport_failures = (
        SSLError,
        BotoConnectionError,
        EndpointConnectionError,
        ReadTimeoutError,
    )
    if isinstance(exc, transport_failures):
        return True
    if isinstance(exc, ClientError):
        error_code = exc.response.get("Error", {}).get("Code")
        return error_code in _RETRY_CODES
    return False
146
+
147
+
148
# Serializes lookups and inserts on ``_CLIENT_CACHE``.
_CLIENT_LOCK = threading.Lock()
# Cached ``bedrock-runtime`` clients keyed by ``(region, pool_size)``.
_CLIENT_CACHE: dict[tuple[str, int], Any] = dict()
150
+
151
+
152
def _build_bedrock_client(settings: Settings) -> Any:
    """Return the shared ``bedrock-runtime`` client for these settings.

    Clients are memoized in ``_CLIENT_CACHE`` under ``(region, pool_size)``
    and guarded by ``_CLIENT_LOCK``, so repeated calls with the same region
    and concurrency reuse one client, while a changed concurrency setting
    yields a fresh client with a matching connection pool.

    Configuration applied to a newly built client:

    * ``max_pool_connections`` — twice the larger of
      ``settings.embed_concurrency`` and ``settings.llm_concurrency``, with
      a floor of 32. (The original author's rationale: botocore's default
      pool of 10 starves higher concurrency.)
    * ``connect_timeout=10`` — fail fast on connection problems.
    * ``read_timeout=600`` — allow a single long-running invocation up to
      ten minutes before the read times out.
    * ``retries={"max_attempts": 0, "mode": "adaptive"}`` — per the original
      author's design notes, botocore's own retry loop is turned off so the
      tenacity decorator on ``_invoke_classifier_sync`` owns the retry
      policy, while adaptive mode keeps client-side rate limiting.
    """
    widest_concurrency = max(settings.embed_concurrency, settings.llm_concurrency)
    pool_size = max(32, widest_concurrency * 2)
    cache_key = (settings.region, pool_size)
    with _CLIENT_LOCK:
        cached = _CLIENT_CACHE.get(cache_key)
        if cached is not None:
            return cached
        fresh = boto3.client(
            "bedrock-runtime",
            config=BotoConfig(
                region_name=settings.region,
                retries={"max_attempts": 0, "mode": "adaptive"},
                max_pool_connections=pool_size,
                connect_timeout=10,
                read_timeout=600,
            ),
        )
        _CLIENT_CACHE[cache_key] = fresh
        return fresh
199
+
200
+
201
@retry(
    stop=stop_after_attempt(10),
    wait=wait_exponential(multiplier=2, min=2, max=60),
    retry=retry_if_exception(_is_retryable),
    before_sleep=loguru_before_sleep("WARNING"),
    reraise=True,
)
def _invoke_classifier_sync(
    client: Any,
    model_id: str,
    schema: dict,
    user_text: str,
    *,
    max_tokens: int,
    thinking_mode: str,
    system: str | None = None,
) -> dict:
    """Issue one blocking ``invoke_model`` request and return its structured output.

    The request asks for JSON-schema structured output through
    ``output_config.format``. The call is retried by tenacity for up to 10
    attempts with exponential back-off (2 s minimum, 60 s cap) whenever
    ``_is_retryable`` accepts the raised exception; the last exception is
    re-raised unchanged.

    Parameters
    ----------
    client
        A boto3 ``bedrock-runtime`` client.
    model_id
        Model or inference-profile identifier passed as ``modelId``.
    schema
        JSON Schema dict placed under ``output_config.format.schema``. Its
        ``"title"`` (when present) labels the optional trace row.
    user_text
        Content of the single user-role message.
    max_tokens
        Value sent as ``max_tokens`` in the request body.
    thinking_mode
        ``"adaptive"`` adds ``thinking={"type": "adaptive"}`` to the request;
        any other value sends no ``thinking`` key.
    system
        Optional system prompt. When non-empty it is sent as a one-element
        content-block list carrying ``cache_control: ephemeral``.

    Returns
    -------
    dict
        Whatever ``_parse_structured_payload`` extracts from the response.
    """
    request_body: dict[str, Any] = {
        "anthropic_version": "bedrock-2023-05-31",
        "max_tokens": max_tokens,
        "output_config": {"format": {"type": "json_schema", "schema": schema}},
        "messages": [{"role": "user", "content": user_text}],
    }
    if system:
        # A content-block list (rather than a bare string) is what lets
        # ``cache_control`` be attached to the system prompt. According to
        # the original author's notes, prompts below the model's minimum
        # cacheable size have the marker ignored silently.
        system_block = {"type": "text", "text": system, "cache_control": {"type": "ephemeral"}}
        request_body["system"] = [system_block]
    if thinking_mode == "adaptive":
        request_body["thinking"] = {"type": "adaptive"}

    started = time.monotonic()
    response = client.invoke_model(
        modelId=model_id,
        body=json.dumps(request_body),
        contentType="application/json",
        accept="application/json",
    )
    elapsed_ms = (time.monotonic() - started) * 1000.0

    payload = json.loads(response["body"].read())
    if isinstance(schema, dict):
        trace_label = schema.get("title", "classifier")
    else:
        trace_label = "classifier"
    _maybe_log_bedrock_call(
        pipeline=trace_label,
        model_id=model_id,
        payload=payload,
        elapsed_ms=elapsed_ms,
    )
    return _parse_structured_payload(payload)
281
+
282
+
283
class BedrockRefusalError(Exception):
    """Signal that Bedrock declined to classify the input.

    ``_parse_structured_payload`` raises this when a response carries
    ``stop_reason == "refusal"``. ``_is_retryable`` does not accept it, so
    the tenacity retry loop lets it propagate immediately. It is intended as
    a terminal outcome: callers can record a neutral placeholder row so the
    same input is not re-submitted on every future run.
    """
291
+
292
+
293
def _strip_code_fence(text: str) -> str | None:
    """Return the body of a Markdown-fenced ``text``, or ``None`` if it is not fenced."""
    stripped = text.strip()
    if not stripped.startswith("```"):
        return None
    inner = stripped.strip("`")
    # Remove an optional language tag ("json", "JSON", "python", ...) that
    # directly follows the opening fence. The lookahead only treats the word
    # as a tag when whitespace or the start of a JSON container follows, so a
    # bare literal such as ``true`` is left intact.
    inner = re.sub(r"\A[A-Za-z0-9_+-]+(?=[\s\[{])", "", inner, count=1)
    return inner.strip()


def _parse_structured_payload(payload: dict) -> dict:
    """Pull the structured JSON object out of a Bedrock response.

    Shapes handled, in order:

    1. ``payload["output"]`` is a dict — returned as-is.
    2. A content block with ``type == "output"`` — the first of its
       ``"output"`` / ``"json"`` / ``"content"`` values that is a dict, or a
       string that decodes as JSON, is returned.
    3. A content block with ``type == "text"`` — its text is decoded as
       JSON, directly or after removing a surrounding Markdown code fence
       (with or without a language tag, in any letter case).
    4. ``payload`` consists solely of an ``"output"`` key holding a JSON
       string — the decoded string is returned.

    Raises
    ------
    BedrockRefusalError
        If ``payload["stop_reason"] == "refusal"``; checked before any shape.
    RuntimeError
        If no shape matches. The message lists the top-level keys observed.
    json.JSONDecodeError
        If shape 4 applies but the string is not valid JSON.
    """
    if payload.get("stop_reason") == "refusal":
        raise BedrockRefusalError("Bedrock refused the input (stop_reason=refusal)")
    if "output" in payload and isinstance(payload["output"], dict):
        return payload["output"]
    content = payload.get("content")
    if isinstance(content, list):
        blocks = [block for block in content if isinstance(block, dict)]
        # Shape 2: structured-output block.
        for block in blocks:
            if block.get("type") != "output":
                continue
            for key in ("output", "json", "content"):
                val = block.get(key)
                if isinstance(val, dict):
                    return val
                if isinstance(val, str):
                    try:
                        return json.loads(val)
                    except json.JSONDecodeError:
                        continue
        # Shape 3: text block whose body is the structured JSON.
        for block in blocks:
            if block.get("type") != "text":
                continue
            text = block.get("text", "")
            if not isinstance(text, str):
                # A null / non-string body cannot hold JSON; keep looking so
                # the failure surfaces as the documented RuntimeError.
                continue
            try:
                return json.loads(text)
            except json.JSONDecodeError:
                pass
            unfenced = _strip_code_fence(text)
            if unfenced is None:
                continue
            try:
                return json.loads(unfenced)
            except json.JSONDecodeError:
                pass
    # Shape 4: the whole payload is {"output": "<json string>"}.
    if payload.keys() == {"output"} and isinstance(payload["output"], str):
        return json.loads(payload["output"])
    raise RuntimeError(f"Unexpected response shape: {sorted(payload.keys())}")
350
+
351
+
352
async def _classify_one(
    client: Any,
    model_id: str,
    schema: dict,
    text: str,
    *,
    max_tokens: int,
    thinking_mode: str,
    sem: asyncio.Semaphore | anyio.CapacityLimiter,
    system: str | None = None,
) -> dict:
    """Perform a single classification call while holding ``sem``.

    ``sem`` may be an ``asyncio.Semaphore`` or an ``anyio.CapacityLimiter``;
    it is only used as an async context manager, which both provide. Because
    the boto3 ``invoke_model`` call inside ``_invoke_classifier_sync``
    blocks, it is executed on a worker thread through
    ``anyio.to_thread.run_sync`` and awaited here.

    NOTE(review): the original author expected ``run_sync`` to respect the
    enclosing cancellation scope — confirm against the anyio version in use.
    """

    def _blocking_call() -> dict:
        return _invoke_classifier_sync(
            client,
            model_id,
            schema,
            text,
            max_tokens=max_tokens,
            thinking_mode=thinking_mode,
            system=system,
        )

    async with sem:
        outcome = await anyio.to_thread.run_sync(_blocking_call)
    return outcome
384
+
385
+
386
+ # ---------------------------------------------------------------------------
387
+ # Per-pipeline system prompts
388
+ # ---------------------------------------------------------------------------
389
+ #
390
+ # The schema descriptions in :mod:`claude_sql.schemas` carry label semantics,
391
+ # but a system prompt is the right surface for *task framing*: what is being
392
+ # classified, what counts as evidence, when to abstain, and what NOT to do.
393
+ # The prior implementation passed only ``messages: [{"role": "user", ...}]``
394
+ # and let the schema do everything — workable on Sonnet, but quality
395
+ # degrades on smaller models and the model has no anchor for ambiguous
396
+ # cases. These constants give every classifier the same anchor.
397
+
398
+
399
# System prompt for whole-session classification. The text is sent to the
# model verbatim, so any edit changes classifier behavior. It instructs the
# model to emit four label fields (goal, autonomy_tier, work_category,
# success) plus an overall confidence.
CLASSIFY_SYSTEM_PROMPT = """\
<instructions>
You are an offline post-hoc analyst classifying complete Claude Code coding
sessions. The user message contains the full session transcript (user turns,
assistant turns, tool calls, and tool results) already concatenated.

Emit exactly one JSON object matching the schema. Four label fields plus a
self-assessed confidence, no surrounding prose, no markdown fences.
</instructions>

<context>
How to read the transcript:

- The opening user message states or implies the goal.
- Closing exchanges show whether the goal was met.
- Tool calls plus tool results are the strongest evidence of what actually
happened — read past chitchat to the actions.

Pacing patterns:

- Confirmation pattern (user replies "ok", "thanks", "looks good", short
turns separated by long agent runs) → autonomous.
- Course correction (user re-instructs, names files the agent missed,
rewrites the plan mid-flight) → assisted.
- Step-by-step (user types every instruction, confirms each step, rejects
more than they accept) → manual.

Work category cues:

- sde: code, tests, refactors, CI failures, debugging, package management,
type errors, lint output, anything in src/ or tests/. Default for any
coding-tool session.
- admin: scheduling, calendar, expense reports, low-signal email triage,
routine ops with no code changes.
- strategy_business: business analysis, competitive landscape, strategic
memos, proposals, market sizing. Reading and writing strategy documents.
- events: speaker prep, agenda building, event logistics.
- thought_leadership: writing for external audiences (blog posts,
conference abstracts, LinkedIn). Polished prose, not internal docs.
- other: only when nothing else fits. Sessions that mix sde plus a second
category should pick the one with more turns / tool calls.

Success semantics:

- success: goal as stated was clearly met. Tests pass, feature works,
document is done, decision is made.
- partial: the work landed with explicit caveats or leftover TODOs the
user acknowledged.
- failure: session ended without reaching the goal — agent gave up,
blocked indefinitely, or wrong path landed.
- unknown: insufficient signal. Session ends mid-task, no clear close,
too short to judge.
</context>

<calibration>
- Use unknown plus confidence < 0.5 when the evidence is genuinely mixed.
Do not manufacture certainty to fill the schema.
- goal must be one sentence in present tense, paraphrasing the user — not
a literal quote, not two goals concatenated with "and".
- A session that explores three options and doesn't pick one is partial,
with unknown only if the user never confirmed the session was over.
- Confidence is per-row, not per-field. If you're sure of three fields
and uncertain about work_category, pick the most likely and reflect
the uncertainty in the overall confidence.
</calibration>

<examples>
<example>
<input>A 4-hour session where the user opens with "implement Phase 2 of the
auth migration", the agent runs ~80 tool calls, the user replies "ok",
"good", "ship it" between long agent runs, ends with green tests plus a
successful merge.</input>
<output>autonomy_tier=autonomous, work_category=sde, success=success,
confidence=0.9</output>
</example>
<example>
<input>A 30-minute session where the user pastes a stack trace, the agent
reads the offending file and proposes a fix, the user says "actually I
think the bug is in module Y, can you check there", the agent verifies,
fixes Y, tests pass, the user thanks the agent and ends.</input>
<output>autonomy_tier=assisted (user redirected), work_category=sde,
success=success, confidence=0.85</output>
</example>
<example>
<input>A 2-hour session of strategic memo work — user dictates section
outlines, agent drafts, user rewrites paragraphs heavily, three rounds
of revision, ends with a published draft.</input>
<output>autonomy_tier=assisted, work_category=strategy_business,
success=success, confidence=0.85</output>
</example>
<example>
<input>A session that opens with "schedule a 1:1 with X", the agent calls
calendar, finds slots, user picks one, agent books, user confirms.</input>
<output>autonomy_tier=manual, work_category=admin, success=success,
confidence=0.95</output>
</example>
<example>
<input>A 5-minute session where the user asks "how should I structure the
test fixture?", the agent explains, the user says "got it" and ends
without writing code.</input>
<output>autonomy_tier=manual, work_category=sde, success=success (goal was
advice, which was given), confidence=0.7</output>
</example>
<example>
<input>A session where the user pastes a 500-line markdown plan and says
"let's start", the agent runs through the first three sections, but the
session ends mid-flight with five sections still unaddressed.</input>
<output>autonomy_tier=assisted, work_category=sde, success=partial,
confidence=0.85</output>
</example>
</examples>

<anti_patterns>
- Don't grade on agent skill. success means the goal was met, even if
the path was meandering. failure doesn't mean the agent was bad; it
means the goal wasn't met.
- Don't infer goals from agent actions. The user's opening message is
the ground truth for goal. If the agent went on a tangent, the goal is
still what the user asked for.
- Don't confuse autonomous with "agent did a lot". Autonomous requires
the user to step back and let the agent run. A session where the
agent produces lots of code but the user reviews each diff is assisted.
- goal is the user's goal, not the session's outcome. If the user asked
to refactor X but the agent ended up debugging an unrelated test
failure, goal is still "refactor X". The detour shows up in success.
</anti_patterns>
"""
526
+
527
+
528
# System prompt for per-message polarity scoring. The text is sent to the
# model verbatim, so any edit changes classifier behavior. The few-shot
# examples must agree with the <calibration> rules: in particular a bare
# "Tests pass." reports a result and is therefore NOT a transition.
TRAJECTORY_SYSTEM_PROMPT = """\
<instructions>
You score the emotional polarity of ONE message inside a Claude Code coding
session. The user input is the message text in isolation — you will not
see the prior turn, by design.

Emit exactly one JSON object matching the schema:

- sentiment_delta: one of positive / neutral / negative. The field name
is historical; semantics are absolute polarity, not "vs prior".
- is_transition: true when the message is pure filler / acknowledgement,
false otherwise.
- confidence: self-assessed certainty 0.0-1.0.

Output JSON only. No surrounding prose, no markdown fences.
</instructions>

<calibration>
Prior on label distribution for a coding session is roughly:
neutral 70%, positive 25%, negative 5%.
If your output drifts away from this distribution on a sustained run, you
are manufacturing affect. neutral is the default; deviations need explicit
cues.

NEUTRAL — the majority class. Pick this for:
- Factual statements: "The function returns a list.", "Tests pass."
- Procedural turns: "Running the linter.", "Here is the diff.",
"Updated foo.py."
- Plain instructions: "Add a test for the empty case.", "Refactor module X."
- Plain questions: "Where does the config live?", "Why is this private?"
- Tool-use narration: "Calling the search API now.", "Reading file Y."
- Status reports without affect: "I'm done with the migration."

POSITIVE — visible excitement, approval, or momentum:
- Direct praise: "nice!", "love this", "perfect", "this is great"
- Energetic agreement: "yes exactly", "shipping it", "do it"
- Celebration: "finally working", "huge win"
- Explicit thanks that goes beyond "thanks": "thanks, this is exactly
what I needed"
NOT positive: polite "thanks", procedural "ok", a "sounds good" that's
just pacing the conversation.

NEGATIVE — friction, frustration, blocked:
- Frustration: "ugh", "seriously?", "are you kidding", "this is broken"
- Pushback: "no, that's wrong", "I don't think that works", "not what
I asked for"
- Blocked: "this is failing", "I'm stuck", "can't get past X"
- Sharp correction: "stop doing that", "you keep messing this up"
NOT negative: a calm correction ("actually let me clarify"), a flagged
bug report ("noticed an off-by-one"), a curt instruction.

is_transition:

Set true when the message has no substantive content — it's filler:
- "Ok let me check that.", "Running...", "Done.", "Clean.", "Right.",
"Cool.", "Got it, moving on.", "Yep.", "Sure."

Set false when the message carries information, instruction, question,
or affect, even if it's short. "tests pass" is neutral but NOT a
transition. "ugh" is negative and not a transition either.
</calibration>

<examples>
<example>
<input>Tests pass.</input>
<output>sentiment_delta=neutral, is_transition=false, confidence=0.9</output>
</example>
<example>
<input>All 240 passed, 4 warnings in 53s.</input>
<output>sentiment_delta=neutral, is_transition=false, confidence=0.9.
Information-dense report.</output>
</example>
<example>
<input>shipping it</input>
<output>sentiment_delta=positive, is_transition=false, confidence=0.95.
Explicit decision verb plus momentum.</output>
</example>
<example>
<input>ok let me check that</input>
<output>sentiment_delta=neutral, is_transition=true, confidence=0.9.
Acknowledgement filler. The "ok" doesn't carry affect; it's pacing.</output>
</example>
<example>
<input>this entire approach is wrong because the cache key is per-tenant</input>
<output>sentiment_delta=negative, is_transition=false, confidence=0.9.
Substantive disagreement with reasoning. Negative even though articulate.</output>
</example>
<example>
<input>running pytest now</input>
<output>sentiment_delta=neutral, is_transition=true, confidence=0.85.
Procedural narration.</output>
</example>
<example>
<input>that's not what I meant — I want X to be derived, not stored</input>
<output>sentiment_delta=negative, is_transition=false, confidence=0.85.
Correction with a substantive counter-proposal.</output>
</example>
<example>
<input>perfect, this is exactly what I wanted</input>
<output>sentiment_delta=positive, is_transition=false, confidence=0.95.
Direct praise plus specificity.</output>
</example>
<example>
<input>hmm, that doesn't seem right</input>
<output>sentiment_delta=negative, is_transition=false, confidence=0.65.
Mild pushback. Confidence below 0.7 because "hmm" is genuinely ambiguous
— could be deliberation.</output>
</example>
<example>
<input>thanks</input>
<output>sentiment_delta=neutral, is_transition=true, confidence=0.7.
Bare politeness. Not positive (no specificity), a social close.</output>
</example>
<example>
<input>Done.</input>
<output>sentiment_delta=neutral, is_transition=true, confidence=0.9.
Single-word close-out. Filler.</output>
</example>
<example>
<input>no don't do that</input>
<output>sentiment_delta=negative, is_transition=false, confidence=0.9.
Hard correction. The "no" plus "don't" is the cue.</output>
</example>
<example>
<input>I think we should go with the simple version for now</input>
<output>sentiment_delta=neutral, is_transition=false, confidence=0.85.
Substantive opinion without affect. "For now" signals pragmatism, not
positivity.</output>
</example>
</examples>

<anti_patterns>
- Don't manufacture certainty. Confidence < 0.7 is appropriate when the
message is short, single-word, or context-dependent. The downstream
pipeline weights by confidence — don't hand-wave.
- Don't conflate length with neutrality. A long technical message can
still be negative ("This entire approach is wrong because..."). A
short message can still be positive ("ship it!").
- Don't read intent into procedural text. A bare "Done." is a
transition, not triumphant positive. A bare "Running tests" is
neutral, not anxious negative.
- Avoid the "slightly positive" / "mildly negative" trap. The schema
has three labels for a reason. If tempted to pick a side, the answer
is neutral.
- Tool-use narration ("calling X", "reading Y", "checking Z") is
overwhelmingly neutral. Don't score the agent's procedural play-by-
play as positive momentum unless the wording itself is enthusiastic.
- Length is not affect. A 50-word careful explanation can be neutral.
A 3-word reply ("ship it!") can be positive. Polarity is in the words,
not the size of the message.
</anti_patterns>
"""
680
+
681
+
682
+ CONFLICTS_SYSTEM_PROMPT = """\
683
+ <instructions>
684
+ You analyze a complete Claude Code coding session for STANCE CONFLICTS —
685
+ moments where the user and the agent (or the agent's own reasoning) hold
686
+ mutually-exclusive positions on the same substantive question.
687
+
688
+ Emit exactly one JSON object with a conflicts array. Each conflict has
689
+ three fields: stance_a, stance_b (one-sentence summaries) and resolution
690
+ (resolved / unresolved / abandoned). An empty list is valid and common.
691
+
692
+ Output JSON only. No surrounding prose, no markdown fences.
693
+ </instructions>
694
+
695
+ <context>
696
+ What counts as a conflict:
697
+
698
+ - Two stances on the same technical decision held by different parties,
699
+ or by the same party at different points. "Use Sonnet" vs "use Opus".
700
+ "Ship the simple version now" vs "wait for the architectural cleanup".
701
+ "Cache the embeddings" vs "rebuild from parquet every run".
702
+ - Two stances on a strategic / scope decision: "rename the field" vs
703
+ "keep the field name and shift semantics". "One bundled PR" vs
704
+ "split into three". "Fix it on this branch" vs "open a follow-up".
705
+ - The conflict must be SUBSTANTIVE — measurable consequences, not style.
706
+
707
+ Resolution semantics:
708
+
709
+ - resolved: the session converged on one stance with explicit agreement.
710
+ Look for "ok let's do that", "you're right", "going with X".
711
+ - unresolved: both stances were still live at session end. User punted,
712
+ agent didn't pick, or session ran out of time.
713
+ - abandoned: topic was dropped without a decision. Different from
714
+ unresolved — abandoned means the conversation moved on, not that they
715
+ failed to decide.
716
+
717
+ Identification heuristics:
718
+
719
+ 1. Strongest signal is structural: stance A proposed, counter-stance B
720
+ raised explicitly, then a decision made (or not). Without an explicit
721
+ counter-stance, you don't have a conflict.
722
+ 2. Verbal markers: "but I think", "actually I'd argue", "I disagree",
723
+ "the other side of that is", "alternatively", "or we could".
724
+ 3. Skip agent's internal monologue ("on one hand X, on the other Y") when
725
+ the agent immediately picks one — that's deliberation. Only count when
726
+ the user (or another party) holds the other stance.
727
+ </context>
728
+
729
+ <calibration>
730
+ When in doubt, return an empty conflicts array. False positives pollute
731
+ the corpus more than missed conflicts hurt — downstream views
732
+ (session_conflicts) are used by humans to find interesting decision
733
+ points, and noise drowns signal.
734
+
735
+ Typical coding session has 0 conflicts. Typical strategy / planning
736
+ session has 0-2. Sessions with 3+ conflicts exist but are rare;
737
+ double-check your output if you're emitting that many.
738
+ </calibration>
739
+
740
+ <examples>
741
+ <example>
742
+ <input>User wants to optimize a slow query. Agent proposes denormalizing
743
+ the table. User counters: "no, let's add a covering index instead — I
744
+ don't want to touch the schema". Agent accepts the index approach and
745
+ ships it.</input>
746
+ <output>conflicts=[{stance_a: "Denormalize the table to make the query
747
+ faster.", stance_b: "Keep the schema; add a covering index instead.",
748
+ resolution: "resolved"}]</output>
749
+ </example>
750
+ <example>
751
+ <input>User proposes a 3-step plan. Agent says "I think step 2 is risky
752
+ because of X — should we add a rollback first?" User agrees, plan
753
+ becomes 4 steps. Both proceed.</input>
754
+ <output>conflicts=[]. Agent flagged a risk, user incorporated it. No
755
+ counter-stance held.</output>
756
+ </example>
757
+ <example>
758
+ <input>Agent's reasoning shows "I could use approach A or approach B,
759
+ but A is simpler so I'll go with A". User says "ok".</input>
760
+ <output>conflicts=[]. Agent considered alternatives in its own
761
+ thinking. User didn't hold a counter-stance.</output>
762
+ </example>
763
+ <example>
764
+ <input>User says "wait, isn't that going to break X?" Agent explains
765
+ why not. User: "oh you're right, never mind."</input>
766
+ <output>conflicts=[]. Question surfaced, answered, retracted. No
767
+ sustained position.</output>
768
+ </example>
769
+ <example>
770
+ <input>User leans toward "ship simple version now", agent leans toward
771
+ "wait for architectural cleanup". Session ends without a decision; user
772
+ says "let me think about it".</input>
773
+ <output>conflicts=[{stance_a: "Ship the simple version now to unblock
774
+ users.", stance_b: "Wait for the architectural cleanup so we don't ship
775
+ debt.", resolution: "unresolved"}]</output>
776
+ </example>
777
+ <example>
778
+ <input>Brief disagreement about which CI config to use. User pivots to
779
+ a different topic. Never returns to the CI question.</input>
780
+ <output>conflicts=[{stance_a: "Use GitHub Actions for the new pipeline.",
781
+ stance_b: "Stick with the existing CodeBuild setup.",
782
+ resolution: "abandoned"}]</output>
783
+ </example>
784
+ </examples>
785
+
786
+ <anti_patterns>
787
+ - Don't count collaboration as conflict. Agent proposes a plan, user
788
+ agrees with caveats and the agent adapts. That's collaboration.
789
+ - Don't count agent deliberation. Agent considers two approaches in its
790
+ own reasoning, then picks one with the user's blessing. That's
791
+ deliberation, not conflict.
792
+ - Don't count surface-level pushback that the user immediately retracts.
793
+ - Don't count style / formatting disagreements ("I'd phrase that
794
+ differently", "use semicolons not commas").
795
+ - Don't count accepted risk. Agent flags risk, user accepts it. That's
796
+ a noted caveat, not a conflict.
797
+ - Don't count iteration. Two failed attempts at the same task (agent
798
+ tried X, then Y). That's iteration, not conflict.
799
+ - Don't count tooling preferences without consequence ("I'd use jq here"
800
+ vs "I'd use python -c").
801
+ </anti_patterns>
802
+ """
803
+
804
+
805
+ USER_FRICTION_SYSTEM_PROMPT = """\
806
+ <instructions>
807
+ You classify ONE short user message from a Claude Code coding session for
808
+ friction signals — cues that the human is impatient, confused,
809
+ interrupting the agent, correcting it, or asking for something the agent
810
+ should have provided proactively but didn't.
811
+
812
+ The message is presented in isolation. You will not see prior turns or
813
+ the agent response that preceded it. Make the call from the message
814
+ text alone.
815
+
816
+ Emit exactly one JSON object with three fields: label (one of the seven
817
+ values below), rationale (one short sentence naming the cue), and
818
+ confidence (0.0-1.0). Output JSON only. No surrounding prose, no
819
+ markdown fences.
820
+ </instructions>
821
+
822
+ <context>
823
+ Label semantics:
824
+
825
+ - status_ping: progress / ETA query.
826
+ Triggers: "how's it going?", "any update?", "where are we?",
827
+ "still working?", "what's your eta?", "are you alive?"
828
+ NOT triggers: "where does the config live?" (technical question),
829
+ "where are we in the migration plan?" (substantive scope question).
830
+
831
+ - unmet_expectation: short question pointing at something the agent
832
+ should have produced.
833
+ Triggers: bare one-word questions ending in "?": "screenshot?",
834
+ "tests?", "diff?", "link?", "logs?", "stacktrace?".
835
+ NOT triggers: "what's the type of X?" (substantive),
836
+ "tests for which file?" (clarification, not friction).
837
+
838
+ - confusion: user signals they don't follow the output or state.
839
+ Triggers: "what does that mean?", "I don't get it", "huh?",
840
+ "why did you do X?" (when X already happened), "wait, what?"
841
+ NOT triggers: a calm question about a future action, a request for
842
+ explanation ("explain that step please" — neutral instruction).
843
+
844
+ - interruption: user cuts the agent off or pivots mid-task.
845
+ Triggers: "wait", "stop", "hold on", "pause", "actually...",
846
+ "before you do that", "nvm", "never mind".
847
+ NOT triggers: "wait until tests pass" (instruction, not interrupt),
848
+ "stop the server" (action request).
849
+
850
+ - correction: explicit "you got it wrong".
851
+ Triggers: "no, not that", "that's wrong", "nope", "try again",
852
+ "you're doing it wrong", "incorrect".
853
+ NOT triggers: "actually let me clarify" (re-framing, not correcting),
854
+ technical bug reports ("X returns None instead of []" — substantive).
855
+
856
+ - frustration: terse annoyance or sarcasm.
857
+ Triggers: "ugh", "seriously?", "are you kidding", "really?",
858
+ "come on".
859
+ NOT triggers: a curt but neutral instruction.
860
+
861
+ - none: ordinary task turn. THIS IS THE MAJORITY CLASS — use it
862
+ aggressively. Anything that's a substantive instruction, a plain
863
+ technical question, an acknowledgement, a routing decision, or text
864
+ the user typed to advance the task is none. The threshold for
865
+ friction is high.
866
+ </context>
867
+
868
+ <calibration>
869
+ - confidence < 0.5 is correct when the message is genuinely ambiguous
870
+ between none and a friction label. Don't manufacture certainty.
871
+ - confidence > 0.8 requires an unambiguous cue you can name in the
872
+ rationale field.
873
+ - For obvious cases ("ugh"), 0.95 is fine.
874
+ </calibration>
875
+
876
+ <examples>
877
+ <example>
878
+ <input>screenshot?</input>
879
+ <output>label=unmet_expectation, confidence=0.7. Bare one-word
880
+ question pointing at a missed artifact.</output>
881
+ </example>
882
+ <example>
883
+ <input>stop</input>
884
+ <output>label=interruption, confidence=0.95. Hard interruption keyword
885
+ as the entire message.</output>
886
+ </example>
887
+ <example>
888
+ <input>delete that file</input>
889
+ <output>label=none, confidence=0.9. Bare instruction, not friction.</output>
890
+ </example>
891
+ <example>
892
+ <input>ugh</input>
893
+ <output>label=frustration, confidence=0.95. Unambiguous annoyance.</output>
894
+ </example>
895
+ <example>
896
+ <input>why did you do that?</input>
897
+ <output>label=confusion, confidence=0.85. Questioning a completed
898
+ action.</output>
899
+ </example>
900
+ <example>
901
+ <input>where does the config live?</input>
902
+ <output>label=none, confidence=0.9. Substantive technical question.</output>
903
+ </example>
904
+ <example>
905
+ <input>nope, try again</input>
906
+ <output>label=correction, confidence=0.95. Explicit rejection plus
907
+ redo.</output>
908
+ </example>
909
+ <example>
910
+ <input>tests for the auth module</input>
911
+ <output>label=none, confidence=0.9. Substantive instruction — what
912
+ tests, not a bare "tests?".</output>
913
+ </example>
914
+ </examples>
915
+
916
+ <anti_patterns>
917
+ - A bare instruction is none, even if it sounds curt. "delete that file"
918
+ is not correction. "add a test for X" is not unmet_expectation.
919
+ - A short technical question is none. "what's the type?" /
920
+ "where is X?" are not friction signals. Friction requires affect or
921
+ implicit complaint.
922
+ - Don't flag based on tone alone. "ok" is none, even if you imagine
923
+ it's sarcastic — without surrounding context you can't tell, so
924
+ default to none.
925
+ - Claude Code injects two strings as user-role messages that look like
926
+ friction but are CLI bookkeeping: "Continue from where you left off."
927
+ and "[Request interrupted by user for tool use]". Both should be
928
+ none. (They're filtered upstream so you'll rarely see them, but be
929
+ safe.)
930
+ </anti_patterns>
931
+ """
932
+
933
+
934
+ _CLASSIFIER_APPENDIX = """\
935
+
936
+ <operating_context>
937
+ You are running offline against a snapshot of Claude Code transcripts
938
+ already on disk. There is no live user to clarify with — you must commit
939
+ to one output for each call. The downstream pipeline writes your output
940
+ to a parquet file used by SQL views and analytics macros; future you (or
941
+ a human auditor) will read these rows in aggregate, not in isolation.
942
+ </operating_context>
943
+
944
+ <quality_bar>
945
+ - Idempotence: the same input must produce the same output across runs.
946
+ Don't introduce randomness or invent details that aren't in the input.
947
+ - Calibration over confidence: a low confidence with the correct label
948
+ is more useful than a high confidence with the wrong one. Confidence
949
+ is downstream-weighted; honesty pays.
950
+ - Failure mode: if the input is genuinely undecidable, pick the most
951
+ conservative / abstaining label the schema allows (unknown, none,
952
+ empty list) and set confidence below 0.5. Do not guess.
953
+ - The schema is the contract: every field is required, no field may be
954
+ null unless the schema marks it optional, and string fields have
955
+ practical length budgets stated in their descriptions — respect them.
956
+ </quality_bar>
957
+
958
+ <output_rules>
959
+ - Output is parsed as JSON. Bedrock's output_config.format enforces the
960
+ schema, but you should still produce valid JSON without surrounding
961
+ text or fences. The parser ignores prose; you waste tokens by emitting
962
+ it.
963
+ - Do not echo the schema, the system prompt, or the user message back.
964
+ Just the structured object.
965
+ - Field order in your output should match the order in the schema. This
966
+ is conventional, not enforced, but it makes the parquet rows readable.
967
+ </output_rules>
968
+ """
969
+
970
+
971
# Every classifier system prompt ends with the same shared appendix
# (operating context, quality bar, output rules). It is appended once, at
# module import, so the four prompts cannot drift apart on those sections.
CLASSIFY_SYSTEM_PROMPT += _CLASSIFIER_APPENDIX
TRAJECTORY_SYSTEM_PROMPT += _CLASSIFIER_APPENDIX
CONFLICTS_SYSTEM_PROMPT += _CLASSIFIER_APPENDIX
USER_FRICTION_SYSTEM_PROMPT += _CLASSIFIER_APPENDIX
975
+
976
+
977
+ def _estimate_cost(
978
+ n_items: int,
979
+ avg_in_tokens: int,
980
+ avg_out_tokens: int,
981
+ pricing: tuple[float, float],
982
+ ) -> float:
983
+ """Back-of-envelope dollar estimate for ``n_items`` classification calls."""
984
+ in_rate, out_rate = pricing
985
+ return (n_items * avg_in_tokens * in_rate + n_items * avg_out_tokens * out_rate) / 1_000_000
986
+
987
+
988
+ # ---------------------------------------------------------------------------
989
+ # Pipeline 1: session classification
990
+ # ---------------------------------------------------------------------------
991
+
992
+
993
async def _classify_sessions_async(
    con: duckdb.DuckDBPyConnection,
    settings: Settings,
    *,
    since_days: int | None,
    limit: int | None,
    thinking_mode: str,
) -> int:
    """Run the session-classification pipeline; return how many rows were written.

    Order of work: read the session ids already in the classifications
    parquet, run the checkpoint filter over the current session bounds, merge
    in drained retry-queue ids, collect the session texts that survive both
    filters, then classify them chunk by chunk. Each chunk's successes are
    written, checkpointed and cleared from the retry queue before the next
    chunk starts; each failure is enqueued for retry.
    """
    db_path = settings.checkpoint_db_path

    prior = read_all(settings.classifications_parquet_path)
    has_prior = prior is not None and prior.height > 0
    already: set[str] = set(prior["session_id"].to_list()) if has_prior else set()

    # Checkpoint skip: current (last_ts, mtime) per session vs. the last run.
    bounds = session_bounds(con, since_days=since_days, limit=limit)
    survivors, skipped = checkpointer.filter_unchanged(
        ((sid, last_ts, mtime) for sid, (last_ts, mtime) in bounds.items()),
        pipeline="classify",
        checkpoint_db_path=db_path,
    )
    keep = set(survivors)

    # Retry-queue ids bypass both the checkpoint filter and the parquet
    # anti-join below.
    retry_ids = set(retry_queue.drain(db_path, pipeline="classify"))
    if retry_ids:
        logger.info("classify: draining {} retry-queue entries", len(retry_ids))
    keep |= retry_ids

    pending: list[tuple[str, str]] = [
        (sid, text)
        for sid, text in iter_session_texts(
            con, settings=settings, since_days=since_days, limit=limit
        )
        if sid in keep and (sid not in already or sid in retry_ids)
    ]

    if not pending:
        logger.info("classify: no pending sessions (skipped={} via checkpoint)", skipped)
        return 0
    if skipped:
        logger.info("classify: skipped {} sessions via checkpoint", skipped)

    client = _build_bedrock_client(settings)
    sem = anyio.CapacityLimiter(settings.llm_concurrency)
    chunk_size = max(settings.batch_size * 4, 256)
    total_chunks = (len(pending) + chunk_size - 1) // chunk_size
    logger.info(
        "classify: {} pending, model={}, thinking={}, concurrency={}, chunks of {}",
        len(pending),
        settings.sonnet_model_id,
        thinking_mode,
        settings.llm_concurrency,
        chunk_size,
    )

    row_schema = {
        "session_id": pl.Utf8,
        "autonomy_tier": pl.Utf8,
        "work_category": pl.Utf8,
        "success": pl.Utf8,
        "goal": pl.Utf8,
        "confidence": pl.Float32,
        "classified_at": pl.Datetime("us", "UTC"),
    }

    written = 0
    for chunk_no, start in enumerate(range(0, len(pending), chunk_size), start=1):
        chunk = pending[start : start + chunk_size]
        started = time.monotonic()
        results = await asyncio.gather(
            *(
                _classify_one(
                    client,
                    settings.sonnet_model_id,
                    SESSION_CLASSIFICATION_SCHEMA,
                    text,
                    max_tokens=settings.classify_max_tokens,
                    thinking_mode=thinking_mode,
                    sem=sem,
                    system=CLASSIFY_SYSTEM_PROMPT,
                )
                for _, text in chunk
            ),
            return_exceptions=True,
        )
        elapsed = time.monotonic() - started

        stamped_at = datetime.now(UTC)
        ok_rows: list[dict[str, Any]] = []
        errors = 0
        for (sid, _), outcome in zip(chunk, results, strict=True):
            if isinstance(outcome, BaseException):
                errors += 1
                logger.warning("classify: {} failed (queued for retry): {}", sid, outcome)
                retry_queue.enqueue(
                    db_path,
                    pipeline="classify",
                    unit_id=sid,
                    error=str(outcome),
                )
                continue
            payload: dict[str, Any] = outcome
            ok_rows.append(
                {
                    "session_id": sid,
                    "autonomy_tier": payload.get("autonomy_tier"),
                    "work_category": payload.get("work_category"),
                    "success": payload.get("success"),
                    "goal": payload.get("goal"),
                    "confidence": float(payload.get("confidence", 0.0)),
                    "classified_at": stamped_at,
                }
            )

        if ok_rows:
            write_part(
                settings.classifications_parquet_path,
                pl.DataFrame(ok_rows, schema=row_schema),
            )
            # Checkpoint at the CURRENT bounds so a re-run with no new
            # messages is a no-op, then clear the same ids from the retry
            # queue.
            ok_sids = [row["session_id"] for row in ok_rows]
            checkpointer.mark_completed(
                db_path,
                pipeline="classify",
                rows=[(sid, *bounds.get(sid, (None, None))) for sid in ok_sids],
            )
            retry_queue.mark_done(
                db_path,
                pipeline="classify",
                unit_ids=ok_sids,
            )

        written += len(ok_rows)
        logger.info(
            "classify chunk {}/{}: {} ok, {} errors, {:.1f}s ({:.1f} sess/s)",
            chunk_no,
            total_chunks,
            len(ok_rows),
            errors,
            elapsed,
            len(ok_rows) / elapsed if elapsed > 0 else 0,
        )

    logger.info("classify: wrote {} total rows", written)
    return written
1140
+
1141
+
1142
+ def _count_pending_sessions(
1143
+ con: duckdb.DuckDBPyConnection,
1144
+ *,
1145
+ already: set[str],
1146
+ since_days: int | None,
1147
+ limit: int | None,
1148
+ ) -> int:
1149
+ """Return the count of sessions that have text messages but no classification yet.
1150
+
1151
+ Pure SQL — does NOT materialize any session text. This is the fast path for
1152
+ ``--dry-run`` cost estimation against the full corpus (the previous path
1153
+ iterated :func:`iter_session_texts`, which took ~15 min on 6K+ sessions).
1154
+ """
1155
+ where = ["mt.text_content IS NOT NULL", "length(mt.text_content) >= 1"]
1156
+ if since_days is not None:
1157
+ where.append(f"mt.ts >= current_timestamp - INTERVAL {int(since_days)} DAY")
1158
+ sql = f"""
1159
+ SELECT count(DISTINCT CAST(mt.session_id AS VARCHAR))
1160
+ FROM messages_text mt
1161
+ WHERE {" AND ".join(where)}
1162
+ """
1163
+ row = con.execute(sql).fetchone()
1164
+ total = int(row[0]) if row is not None else 0
1165
+ if already:
1166
+ # Subtract sessions that already have a classification. We pull only
1167
+ # the overlap via a parameterized IN so we don't double-count sessions
1168
+ # in ``already`` that aren't actually in the corpus anymore.
1169
+ placeholders = ",".join("?" for _ in already)
1170
+ overlap_sql = f"""
1171
+ SELECT count(DISTINCT CAST(mt.session_id AS VARCHAR))
1172
+ FROM messages_text mt
1173
+ WHERE {" AND ".join(where)}
1174
+ AND CAST(mt.session_id AS VARCHAR) IN ({placeholders})
1175
+ """
1176
+ overlap_row = con.execute(overlap_sql, list(already)).fetchone()
1177
+ overlap = int(overlap_row[0]) if overlap_row is not None else 0
1178
+ total = max(0, total - overlap)
1179
+ if limit is not None:
1180
+ total = min(total, int(limit))
1181
+ return total
1182
+
1183
+
1184
def classify_sessions(
    con: duckdb.DuckDBPyConnection,
    settings: Settings,
    *,
    since_days: int | None = None,
    limit: int | None = None,
    dry_run: bool = False,
    no_thinking: bool = False,
) -> int | dict[str, Any]:
    """Classify pending sessions, or describe the work when ``dry_run`` is set.

    A normal run drives :func:`_classify_sessions_async` to completion and
    returns the number of successful classifications. A dry run makes no
    Bedrock calls: it counts pending sessions in SQL and returns a plan dict
    with keys ``{pipeline, candidates, llm_calls, avg_input_tokens,
    avg_output_tokens, estimated_cost_usd, model, thinking, since_days,
    limit, dry_run}`` so the CLI can emit it as structured JSON.
    """
    thinking_mode = "disabled" if no_thinking else settings.classify_thinking

    if not dry_run:
        return asyncio.run(
            _classify_sessions_async(
                con,
                settings,
                since_days=since_days,
                limit=limit,
                thinking_mode=thinking_mode,
            )
        )

    prior = read_all(settings.classifications_parquet_path)
    has_prior = prior is not None and prior.height > 0
    already: set[str] = set(prior["session_id"].to_list()) if has_prior else set()
    pending_count = _count_pending_sessions(
        con, already=already, since_days=since_days, limit=limit
    )

    # Back-of-envelope: avg 8K input tokens, 300 output per session.
    avg_in, avg_out = 8000, 300
    cost = _estimate_cost(pending_count, avg_in, avg_out, settings.sonnet_pricing)
    logger.info(
        "classify --dry-run: {} sessions pending. Estimated cost ~${:.2f} "
        "(thinking={}, model={})",
        pending_count,
        cost,
        thinking_mode,
        settings.sonnet_model_id,
    )
    return {
        "pipeline": "classify",
        "candidates": pending_count,
        "llm_calls": pending_count,
        "avg_input_tokens": avg_in,
        "avg_output_tokens": avg_out,
        "estimated_cost_usd": round(cost, 4),
        "model": settings.sonnet_model_id,
        "thinking": thinking_mode,
        "since_days": since_days,
        "limit": limit,
        "dry_run": True,
    }
1243
+
1244
+
1245
+ # ---------------------------------------------------------------------------
1246
+ # Pipeline 2: message trajectory
1247
+ # ---------------------------------------------------------------------------
1248
+
1249
+ # Cheap prefilter: short + starts with acknowledgement pattern -> is_transition, skip LLM.
1250
+ _TRANSITION_RE = re.compile(
1251
+ r"^\s*(ok|okay|alright|now|let me|great[,!]?|sure|got it|sounds good|perfect|clean)\b",
1252
+ re.IGNORECASE,
1253
+ )
1254
+
1255
+
1256
+ def _heuristic_trajectory(text: str) -> dict | None:
1257
+ """Fast path -- return a result dict if confident, else None."""
1258
+ if not text:
1259
+ return None
1260
+ if len(text) < 80 and _TRANSITION_RE.match(text):
1261
+ return {"sentiment_delta": "neutral", "is_transition": True, "confidence": 0.9}
1262
+ return None
1263
+
1264
+
1265
async def _trajectory_async(
    con: duckdb.DuckDBPyConnection,
    settings: Settings,
    *,
    since_days: int | None,
    limit: int | None,
    thinking_mode: str,
) -> int:
    """Run the per-message trajectory pipeline; return how many rows were written.

    Order of work: read uuids already in the trajectory parquet, run the
    session-level checkpoint filter, drain the retry queue, fetch candidate
    messages, label short acknowledgements with :func:`_heuristic_trajectory`,
    and send the rest to Bedrock in chunks. Refusals are written as a neutral
    placeholder; other failures are enqueued for retry.

    Two behaviours differ from a naive reading of the checkpoint logic:

    * Drained retry uuids are fetched even when their session was skipped by
      the checkpoint filter, mirroring the ``keep |= retry_ids`` step in the
      classify pipeline. Without this a failed message in an already-stamped
      session could never be retried.
    * A session is checkpointed per chunk only once every one of its
      LLM-pending messages in this run has been attempted, so a crash mid-run
      cannot hide not-yet-attempted messages behind a completed checkpoint.
    """
    already: set[str] = set()
    done_df = read_all(settings.trajectory_parquet_path)
    if done_df is not None and done_df.height > 0:
        already = set(done_df["uuid"].to_list())

    # Session-level checkpoint: narrow the message query to sessions the
    # checkpointer did not skip.
    bounds = session_bounds(con, since_days=since_days, limit=limit)
    unchanged_pending, skipped_sessions = checkpointer.filter_unchanged(
        ((sid, lt, mt) for sid, (lt, mt) in bounds.items()),
        pipeline="trajectory",
        checkpoint_db_path=settings.checkpoint_db_path,
    )
    active_sessions: set[str] = set(unchanged_pending)

    # Retry queue: drained uuids bypass the parquet anti-join below.
    retry_uuids = set(retry_queue.drain(settings.checkpoint_db_path, pipeline="trajectory"))
    if retry_uuids:
        logger.info("trajectory: draining {} retry-queue entries", len(retry_uuids))
    already -= retry_uuids

    where = ["mt.text_content IS NOT NULL", "length(mt.text_content) >= 1"]
    if since_days is not None:
        where.append(f"mt.ts >= current_timestamp - INTERVAL {int(since_days)} DAY")

    # Scope the query to active sessions OR explicitly retried uuids. Only
    # non-empty lists are bound; `params` stays in placeholder order.
    scope: list[str] = []
    params: list[list[str]] = []
    if active_sessions:
        scope.append("CAST(mt.session_id AS VARCHAR) IN (SELECT unnest(?))")
        params.append(list(active_sessions))
    if retry_uuids and bounds:
        # With no bounds at all the query below is unscoped, so retried uuids
        # are already covered and must not narrow it.
        scope.append("CAST(mt.uuid AS VARCHAR) IN (SELECT unnest(?))")
        params.append(list(retry_uuids))
    if len(scope) == 1:
        where.append(scope[0])
    elif scope:
        where.append("(" + " OR ".join(scope) + ")")

    sql = f"""
        SELECT CAST(mt.uuid AS VARCHAR) AS uuid,
               CAST(mt.session_id AS VARCHAR) AS sid,
               mt.text_content
        FROM messages_text mt
        WHERE {" AND ".join(where)}
        ORDER BY mt.ts
    """
    # NOTE(review): LIMIT is applied before the anti-join on `already`, and a
    # truncated result can cover only part of a session; the checkpoint logic
    # below cannot see messages cut off by the limit. Confirm intended
    # semantics of `limit` with the CLI.
    if limit is not None:
        sql += f"\nLIMIT {int(limit)}"
    # Bounds present but nothing in scope means every session was skipped and
    # there is nothing to retry: avoid the query entirely.
    rows_raw = con.execute(sql, params).fetchall() if scope or not bounds else []

    rows: list[tuple[str, str]] = []
    session_for_uuid: dict[str, str] = {}
    for msg_uuid, sid, text in rows_raw:
        if msg_uuid in already:
            continue
        rows.append((msg_uuid, text))
        session_for_uuid[msg_uuid] = sid

    if skipped_sessions:
        logger.info(
            "trajectory: skipped {} sessions via checkpoint",
            skipped_sessions,
        )
    logger.info("trajectory: {} pending messages", len(rows))

    if not rows:
        logger.info("trajectory: wrote 0 total rows (nothing pending)")
        return 0

    row_schema = {
        "uuid": pl.Utf8,
        "sentiment_delta": pl.Utf8,
        "is_transition": pl.Boolean,
        "confidence": pl.Float32,
        "classified_at": pl.Datetime("us", "UTC"),
    }

    heuristic_rows: list[dict[str, Any]] = []
    llm_pending: list[tuple[str, str]] = []
    now = datetime.now(UTC)
    for msg_uuid, text in rows:
        fast = _heuristic_trajectory(text)
        if fast is not None:
            heuristic_rows.append({"uuid": msg_uuid, **fast, "classified_at": now})
        else:
            llm_pending.append((msg_uuid, text))

    logger.info(
        "trajectory: {} heuristic, {} LLM",
        len(heuristic_rows),
        len(llm_pending),
    )

    if heuristic_rows:
        write_part(
            settings.trajectory_parquet_path,
            pl.DataFrame(heuristic_rows, schema=row_schema),
        )

    processed_sessions: set[str] = set()
    for row in heuristic_rows:
        sid = session_for_uuid.get(row["uuid"])
        if sid is not None:
            processed_sessions.add(sid)

    if not llm_pending:
        if processed_sessions:
            checkpointer.mark_completed(
                settings.checkpoint_db_path,
                pipeline="trajectory",
                rows=[(sid, *bounds.get(sid, (None, None))) for sid in processed_sessions],
            )
        logger.info("trajectory: wrote {} total rows", len(heuristic_rows))
        return len(heuristic_rows)

    # How many LLM-bound messages each session still has to attempt in this
    # run; a session is only safe to checkpoint once this reaches zero.
    unattempted: dict[str, int] = {}
    for msg_uuid, _ in llm_pending:
        sid = session_for_uuid[msg_uuid]
        unattempted[sid] = unattempted.get(sid, 0) + 1
    stamped: set[str] = set()

    client = _build_bedrock_client(settings)
    sem = anyio.CapacityLimiter(settings.llm_concurrency)
    chunk_size = max(settings.batch_size * 4, 256)
    total_chunks = (len(llm_pending) + chunk_size - 1) // chunk_size
    written = len(heuristic_rows)

    for i in range(0, len(llm_pending), chunk_size):
        chunk = llm_pending[i : i + chunk_size]
        t0 = time.monotonic()
        coros = [
            _classify_one(
                client,
                settings.sonnet_model_id,
                MESSAGE_TRAJECTORY_SCHEMA,
                text,
                max_tokens=settings.classify_max_tokens,
                thinking_mode=thinking_mode,
                sem=sem,
                system=TRAJECTORY_SYSTEM_PROMPT,
            )
            for _, text in chunk
        ]
        results = await asyncio.gather(*coros, return_exceptions=True)
        now = datetime.now(UTC)

        ok: list[dict[str, Any]] = []
        ok_uuids: list[str] = []
        refused_uuids: list[str] = []
        errors = 0
        for (msg_uuid, _), res in zip(chunk, results, strict=True):
            if isinstance(res, BedrockRefusalError):
                # Terminal: Bedrock won't classify this body. Stamp a neutral
                # placeholder so the retry queue doesn't cycle forever on the
                # same refusal.
                logger.info("trajectory: {} refused by Bedrock — marking neutral", msg_uuid)
                ok.append(
                    {
                        "uuid": msg_uuid,
                        "sentiment_delta": "neutral",
                        "is_transition": False,
                        "confidence": 0.0,
                        "classified_at": now,
                    }
                )
                refused_uuids.append(msg_uuid)
                continue
            if isinstance(res, BaseException):
                errors += 1
                logger.warning("trajectory: {} failed (queued for retry): {}", msg_uuid, res)
                retry_queue.enqueue(
                    settings.checkpoint_db_path,
                    pipeline="trajectory",
                    unit_id=msg_uuid,
                    error=str(res),
                )
                continue
            res_dict: dict[str, Any] = res
            ok.append(
                {
                    "uuid": msg_uuid,
                    "sentiment_delta": res_dict.get("sentiment_delta"),
                    "is_transition": bool(res_dict.get("is_transition", False)),
                    "confidence": float(res_dict.get("confidence", 0.0)),
                    "classified_at": now,
                }
            )
            ok_uuids.append(msg_uuid)
            sid = session_for_uuid.get(msg_uuid)
            if sid is not None:
                processed_sessions.add(sid)

        if ok:
            write_part(settings.trajectory_parquet_path, pl.DataFrame(ok, schema=row_schema))

        # Clear the retry queue for successes AND neutralised refusals: both
        # now have a row in the parquet and must not loop back.
        done_uuids = ok_uuids + refused_uuids
        if done_uuids:
            retry_queue.mark_done(
                settings.checkpoint_db_path,
                pipeline="trajectory",
                unit_ids=done_uuids,
            )

        # Every message in this chunk has now been attempted (written, or
        # enqueued for retry), whatever its outcome.
        for msg_uuid, _ in chunk:
            unattempted[session_for_uuid[msg_uuid]] -= 1

        # Per-chunk checkpoint, limited to sessions with nothing left to
        # attempt in later chunks, so a mid-run crash loses no messages.
        settled = {
            sid for sid in processed_sessions - stamped if unattempted.get(sid, 0) == 0
        }
        if settled:
            checkpointer.mark_completed(
                settings.checkpoint_db_path,
                pipeline="trajectory",
                rows=[(sid, *bounds.get(sid, (None, None))) for sid in settled],
            )
            stamped |= settled

        written += len(ok)
        logger.info(
            "trajectory chunk {}/{}: {} ok, {} errors, {:.1f}s",
            i // chunk_size + 1,
            total_chunks,
            len(ok),
            errors,
            time.monotonic() - t0,
        )

    # Safety net: stamp any processed session the per-chunk pass did not.
    leftover = processed_sessions - stamped
    if leftover:
        checkpointer.mark_completed(
            settings.checkpoint_db_path,
            pipeline="trajectory",
            rows=[(sid, *bounds.get(sid, (None, None))) for sid in leftover],
        )
    logger.info("trajectory: wrote {} total rows", written)
    return written
1494
+
1495
+
1496
def trajectory_messages(
    con: duckdb.DuckDBPyConnection,
    settings: Settings,
    *,
    since_days: int | None = None,
    limit: int | None = None,
    dry_run: bool = False,
    no_thinking: bool = False,
) -> int | dict[str, Any]:
    """Per-message sentiment + transition classification.

    A normal run drives :func:`_trajectory_async` to completion and returns
    the number of rows written. In ``--dry-run`` mode no Bedrock calls are
    made and a plan dict is returned instead (same keys as
    :func:`classify_sessions`).

    The dry-run candidate count applies the same text filters as the real
    run (non-null, non-empty ``text_content``) but does not subtract messages
    already present in the trajectory parquet, and it assumes half of the
    candidates reach the LLM, so treat the cost as an upper-bound sketch.
    """
    thinking_mode = "disabled" if no_thinking else settings.trajectory_thinking
    if not dry_run:
        return asyncio.run(
            _trajectory_async(
                con,
                settings,
                since_days=since_days,
                limit=limit,
                thinking_mode=thinking_mode,
            )
        )

    # Match the filters used by `_trajectory_async`; without the length check
    # the estimate would count empty-bodied messages that are never sent.
    where = ["mt.text_content IS NOT NULL", "length(mt.text_content) >= 1"]
    if since_days is not None:
        where.append(f"mt.ts >= current_timestamp - INTERVAL {int(since_days)} DAY")
    predicate = " AND ".join(where)
    # Cap inside SQL so the count never exceeds the requested limit.
    counted = f"least({int(limit)}, count(*))" if limit is not None else "count(*)"
    sql = f"SELECT {counted} FROM messages_text mt WHERE {predicate}"
    row = con.execute(sql).fetchone()
    n = int(row[0]) if row is not None else 0

    # Roughly half survive heuristic pre-filter.
    llm_n = n // 2
    avg_in, avg_out = 500, 50
    cost = _estimate_cost(llm_n, avg_in, avg_out, settings.sonnet_pricing)
    logger.info(
        "trajectory --dry-run: {} messages, estimated LLM cost ~${:.2f}",
        n,
        cost,
    )
    return {
        "pipeline": "trajectory",
        "candidates": n,
        "llm_calls": llm_n,
        "avg_input_tokens": avg_in,
        "avg_output_tokens": avg_out,
        "estimated_cost_usd": round(cost, 4),
        "model": settings.sonnet_model_id,
        "thinking": thinking_mode,
        "since_days": since_days,
        "limit": limit,
        "dry_run": True,
    }
1553
+
1554
+
1555
+ # ---------------------------------------------------------------------------
1556
+ # Pipeline 3: conflict detection
1557
+ # ---------------------------------------------------------------------------
1558
+
1559
+
1560
async def _conflicts_async(
    con: duckdb.DuckDBPyConnection,
    settings: Settings,
    *,
    since_days: int | None,
    limit: int | None,
    thinking_mode: str,
) -> int:
    """Async implementation behind :func:`detect_conflicts`.

    Selects the sessions that still need conflict detection, classifies them
    chunk by chunk, appends the resulting rows to the conflicts parquet and
    records successes in the checkpoint store. Failed sessions are pushed to
    the retry queue rather than aborting the run.

    Returns:
        The number of sessions classified successfully.
    """
    # Session ids that already have rows in the conflicts parquet.
    previous = read_all(settings.conflicts_parquet_path)
    if previous is not None and previous.height > 0:
        seen: set[str] = set(previous["session_id"].to_list())
    else:
        seen = set()

    bounds = session_bounds(con, since_days=since_days, limit=limit)
    survivors, skipped = checkpointer.filter_unchanged(
        ((sid, lt, mt) for sid, (lt, mt) in bounds.items()),
        pipeline="conflicts",
        checkpoint_db_path=settings.checkpoint_db_path,
    )
    eligible = set(survivors)

    requeued = set(retry_queue.drain(settings.checkpoint_db_path, pipeline="conflicts"))
    if requeued:
        logger.info("conflicts: draining {} retry-queue entries", len(requeued))
        eligible |= requeued

    # A session is pending when it is eligible and either has no parquet rows
    # yet or was explicitly queued for a retry.
    pending: list[tuple[str, str]] = [
        (sid, text)
        for sid, text in iter_session_texts(
            con, settings=settings, since_days=since_days, limit=limit
        )
        if sid in eligible and (sid in requeued or sid not in seen)
    ]

    if not pending:
        logger.info("conflicts: no pending sessions (skipped={} via checkpoint)", skipped)
        return 0
    if skipped:
        logger.info("conflicts: skipped {} sessions via checkpoint", skipped)

    client = _build_bedrock_client(settings)
    sem = anyio.CapacityLimiter(settings.llm_concurrency)
    chunk_size = max(settings.batch_size * 4, 256)
    logger.info("conflicts: {} pending sessions", len(pending))

    total_chunks = (len(pending) + chunk_size - 1) // chunk_size
    row_schema = {
        "session_id": pl.Utf8,
        "conflict_idx": pl.Int32,
        "stance_a": pl.Utf8,
        "stance_b": pl.Utf8,
        "resolution": pl.Utf8,
        "detected_at": pl.Datetime("us", "UTC"),
        "empty": pl.Boolean,
    }

    written = 0
    for chunk_no, offset in enumerate(range(0, len(pending), chunk_size), start=1):
        batch = pending[offset : offset + chunk_size]
        started = time.monotonic()

        calls = []
        for _sid, text in batch:
            calls.append(
                _classify_one(
                    client,
                    settings.sonnet_model_id,
                    SESSION_CONFLICTS_SCHEMA,
                    text,
                    max_tokens=settings.classify_max_tokens,
                    thinking_mode=thinking_mode,
                    sem=sem,
                    system=CONFLICTS_SYSTEM_PROMPT,
                )
            )
        # Exceptions come back as values so one failure cannot sink the chunk.
        outcomes = await asyncio.gather(*calls, return_exceptions=True)
        now = datetime.now(UTC)

        rows: list[dict[str, Any]] = []
        ok_sids: set[str] = set()
        errors = 0
        for (sid, _text), outcome in zip(batch, outcomes, strict=True):
            if isinstance(outcome, BaseException):
                errors += 1
                logger.warning("conflicts: {} failed (queued for retry): {}", sid, outcome)
                retry_queue.enqueue(
                    settings.checkpoint_db_path,
                    pipeline="conflicts",
                    unit_id=sid,
                    error=str(outcome),
                )
                continue

            ok_sids.add(sid)
            payload: dict[str, Any] = outcome
            found = payload.get("conflicts") or []
            if found:
                rows.extend(
                    {
                        "session_id": sid,
                        "conflict_idx": idx,
                        "stance_a": item.get("stance_a"),
                        "stance_b": item.get("stance_b"),
                        "resolution": item.get("resolution"),
                        "detected_at": now,
                        "empty": False,
                    }
                    for idx, item in enumerate(found)
                )
            else:
                # Write a sentinel row so we don't re-classify this session.
                rows.append(
                    {
                        "session_id": sid,
                        "conflict_idx": 0,
                        "stance_a": None,
                        "stance_b": None,
                        "resolution": None,
                        "detected_at": now,
                        "empty": True,
                    }
                )

        if rows:
            write_part(
                settings.conflicts_parquet_path,
                pl.DataFrame(rows, schema=row_schema),
            )

        if ok_sids:
            checkpointer.mark_completed(
                settings.checkpoint_db_path,
                pipeline="conflicts",
                rows=[(sid, *bounds.get(sid, (None, None))) for sid in ok_sids],
            )
            retry_queue.mark_done(
                settings.checkpoint_db_path,
                pipeline="conflicts",
                unit_ids=list(ok_sids),
            )
        written += len(ok_sids)

        logger.info(
            "conflicts chunk {}/{}: {} sessions processed, {} errors, {:.1f}s",
            chunk_no,
            total_chunks,
            len(batch) - errors,
            errors,
            time.monotonic() - started,
        )

    logger.info("conflicts: processed {} sessions", written)
    return written
1709
+
1710
+
1711
def detect_conflicts(
    con: duckdb.DuckDBPyConnection,
    settings: Settings,
    *,
    since_days: int | None = None,
    limit: int | None = None,
    dry_run: bool = False,
    no_thinking: bool = False,
) -> int | dict[str, Any]:
    """Detect stance conflicts per session and return count processed.

    Args:
        con: DuckDB connection handed to the pending-session counter or the
            async worker.
        settings: Supplies the thinking mode, parquet path, model id and
            pricing.
        since_days: Optional look-back window in days, forwarded unchanged.
        limit: Optional cap, forwarded unchanged.
        dry_run: Return a cost plan instead of invoking the model.
        no_thinking: Force the thinking mode to ``"disabled"``.

    Returns:
        The processed-session count from ``_conflicts_async`` on a real run,
        or, in ``--dry-run`` mode, a plan dict (see :func:`classify_sessions`).
    """
    if no_thinking:
        thinking_mode = "disabled"
    else:
        thinking_mode = settings.classify_thinking

    # Real run: hand everything to the async implementation.
    if not dry_run:
        worker = _conflicts_async(
            con,
            settings,
            since_days=since_days,
            limit=limit,
            thinking_mode=thinking_mode,
        )
        return asyncio.run(worker)

    # Dry run: sessions already present in the parquet are excluded from the
    # estimate.
    existing = read_all(settings.conflicts_parquet_path)
    has_rows = existing is not None and existing.height > 0
    already: set[str] = set(existing["session_id"].to_list()) if has_rows else set()

    pending_count = _count_pending_sessions(
        con, already=already, since_days=since_days, limit=limit
    )
    # Per-call token figures used for the estimate.
    avg_in, avg_out = 6000, 400
    cost = _estimate_cost(pending_count, avg_in, avg_out, settings.sonnet_pricing)
    logger.info(
        "conflicts --dry-run: {} sessions, estimated cost ~${:.2f}",
        pending_count,
        cost,
    )
    plan: dict[str, Any] = {
        "pipeline": "conflicts",
        "candidates": pending_count,
        "llm_calls": pending_count,
        "avg_input_tokens": avg_in,
        "avg_output_tokens": avg_out,
        "estimated_cost_usd": round(cost, 4),
        "model": settings.sonnet_model_id,
        "thinking": thinking_mode,
        "since_days": since_days,
        "limit": limit,
        "dry_run": True,
    }
    return plan