@pentatonic-ai/ai-agent-sdk 0.10.17 → 0.10.19
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +1 -1
- package/dist/index.js +1 -1
- package/package.json +1 -1
- package/packages/memory-engine-v2/docs/redistill-execution-plan-2026-06-22.md +269 -0
- package/packages/memory-engine-v2/docs/redistill-plan-2026-06-21.md +101 -0
- package/packages/memory-engine-v2/extractor-async/extraction_diff.py +218 -0
- package/packages/memory-engine-v2/extractor-async/test_extraction_diff.py +180 -0
- package/packages/memory-engine-v2/extractor-async/test_prompt_rules.py +58 -0
- package/packages/memory-engine-v2/extractor-async/test_queue_attempts.py +69 -0
- package/packages/memory-engine-v2/extractor-async/worker.py +94 -5
package/dist/index.cjs
CHANGED
|
@@ -878,7 +878,7 @@ function fireAndForgetEmit(clientConfig, sessionOpts, messages, result, model) {
|
|
|
878
878
|
}
|
|
879
879
|
|
|
880
880
|
// src/telemetry.js
|
|
881
|
-
var VERSION = "0.10.17";
|
|
881
|
+
var VERSION = "0.10.19";
|
|
882
882
|
var TELEMETRY_URL = "https://sdk-telemetry.philip-134.workers.dev";
|
|
883
883
|
function machineId() {
|
|
884
884
|
const raw = typeof process !== "undefined" ? `${process.env?.USER || process.env?.USERNAME || "u"}:${process.platform || "x"}:${process.arch || "x"}` : "browser";
|
package/dist/index.js
CHANGED
|
@@ -847,7 +847,7 @@ function fireAndForgetEmit(clientConfig, sessionOpts, messages, result, model) {
|
|
|
847
847
|
}
|
|
848
848
|
|
|
849
849
|
// src/telemetry.js
|
|
850
|
-
var VERSION = "0.10.17";
|
|
850
|
+
var VERSION = "0.10.19";
|
|
851
851
|
var TELEMETRY_URL = "https://sdk-telemetry.philip-134.workers.dev";
|
|
852
852
|
function machineId() {
|
|
853
853
|
const raw = typeof process !== "undefined" ? `${process.env?.USER || process.env?.USERNAME || "u"}:${process.platform || "x"}:${process.arch || "x"}` : "browser";
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@pentatonic-ai/ai-agent-sdk",
|
|
3
|
-
"version": "0.10.17",
|
|
3
|
+
"version": "0.10.19",
|
|
4
4
|
"description": "TES SDK — LLM observability and lifecycle tracking via Pentatonic Thing Event System. Track token usage, tool calls, and conversations. Manage things through event-sourced lifecycle stages with AI enrichment and vector search.",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "./dist/index.cjs",
|
|
@@ -0,0 +1,269 @@
|
|
|
1
|
+
# Re-distill EXECUTION plan — pentatonic-team, #126 modality fix (2026-06-22)
|
|
2
|
+
|
|
3
|
+
> **For review (Phil H) before any prod write.** This is the concrete, gated
|
|
4
|
+
> execution plan for deploying the #126 distiller fix and re-distilling the
|
|
5
|
+
> `pentatonic-team` graph. It supersedes the operational detail in
|
|
6
|
+
> `redistill-plan-2026-06-21.md` where live findings differ (scale, deploy
|
|
7
|
+
> mechanism, queue schema). The *why* and the guardrails there still hold.
|
|
8
|
+
>
|
|
9
|
+
> **Nothing here has been run except the outage fix in §1.** The irreversible
|
|
10
|
+
> full delete (§4) is explicitly gated on the pilot audit (§3) + a human go.
|
|
11
|
+
>
|
|
12
|
+
> **⚠️ Three prerequisites added after review (§0) are BLOCKING** — without them
|
|
13
|
+
> the re-distill would (a) re-introduce the same fabrications via the cascade
|
|
14
|
+
> student, (b) leave the vector index inconsistent, and (c) starve every other
|
|
15
|
+
> tenant's live ingest. Read §0 first.
|
|
16
|
+
|
|
17
|
+
## Why (recap)
|
|
18
|
+
|
|
19
|
+
A fact-by-fact audit found ~18.7% of distilled `pentatonic-team` facts were
|
|
20
|
+
wrong — dominated by **modality collapse** (future/scheduled/planned content
|
|
21
|
+
asserted as established fact), plus attribution errors and same-name
|
|
22
|
+
conflations. The teacher prompt is fixed in **ai-agent-sdk PR #126** (TENSE &
|
|
23
|
+
MODALITY / ATTRIBUTION FIDELITY / IDENTITY rules, in both `BATCH_SYSTEM_PROMPT`
|
|
24
|
+
and the active `GUIDED_JSON_SYSTEM_PROMPT`). The prompt only governs *future*
|
|
25
|
+
extractions, so the historical graph must be re-distilled to inherit the fix.
|
|
26
|
+
|
|
27
|
+
## Live findings that change the 2026-06-21 runbook
|
|
28
|
+
|
|
29
|
+
| Claim in 06-21 runbook | Live reality (verified 2026-06-22) |
|
|
30
|
+
|---|---|
|
|
31
|
+
| "~163k events", "434 facts" | **272,893 events · 483,097 facts · 291,797 rels** for `arena LIKE 'pentatonic-team%'`. The 434 was a 49-entity sample. |
|
|
32
|
+
| Deploy = "the L4 box running extractor-async" | **The distiller runs on the DB box `i-0559922cf59ac6975`** (`pme-prod-us-east-1`), container `pme2-extractor-async`. The `seesa-distiller-bakeoff` box (`i-0d65…`) is **NOT** the host (empty, stopped since 06-10) — earlier handoff was wrong. Models are external: teacher `seesa-distiller-l40s` (`172.31.26.202:8005`, `qwen3.6-27b-fp8`), student `seesa-student-l4` (`172.31.29.121:8005`). |
|
|
33
|
+
| `UPDATE distillation_queue ... WHERE arena LIKE` | **`distillation_queue` has NO `arena` column** (cols: id, event_id, enqueued_at, claimed_by, claimed_at, claim_expires_at, status, attempts, last_error, completed_at). Scope via a join to `events.arena` (see §3/§4). |
|
|
34
|
+
| disk: "prior 30GB root outage — watch disk" | Root is now **485G, 412G free (16% used)** — ample. |
|
|
35
|
+
| "Merge #126 and deploy the new distiller" | The running container **may** build from a local copy `/opt/engine-v2/extractor-async/worker.py`, but the engine-deploy path also installs the SDK tarball **from S3 into `node_modules`** and builds from there — these have caused a *week of drift* before (editing a reference-only copy). **§2.0 makes verifying the real build context a hard step, not an assumption.** |
|
|
36
|
+
|
|
37
|
+
## 0. BLOCKING prerequisites (added 2026-06-22 review)
|
|
38
|
+
|
|
39
|
+
### 0.1 The re-distill MUST run TEACHER-ONLY (else it re-creates the bug)
|
|
40
|
+
|
|
41
|
+
`CASCADE_ENABLED=true` is live in prod (the student→teacher cascade, #99). Under
|
|
42
|
+
it, re-enqueued events flow **student-first**: ~75–80% are handled by the
|
|
43
|
+
fine-tuned student, only gated/escalated events reach the teacher. **#126 fixes
|
|
44
|
+
the *teacher* prompt only.** The deployed student is the `f1e0ff`-trained
|
|
45
|
+
fine-tune — it *learned the modality-collapse behaviour from the old teacher* —
|
|
46
|
+
so re-distilling through the live cascade would have the **student re-assert the
|
|
47
|
+
same future-as-fact fabrications on the majority of events**, and the audit
|
|
48
|
+
would not reach `<3%`.
|
|
49
|
+
|
|
50
|
+
**Therefore, for the entire re-distill (pilot §3 + full §4):**
|
|
51
|
+
|
|
52
|
+
```bash
|
|
53
|
+
# turn the cascade OFF so every re-enqueued event goes to the #126 teacher
|
|
54
|
+
aws ssm put-parameter --name /pme/prod-us-east-1/CASCADE_ENABLED --value false \
|
|
55
|
+
--type String --overwrite --region us-east-1
|
|
56
|
+
# then redeploy/restart extractor-async so the env reaches the worker (§2),
|
|
57
|
+
# and CONFIRM in the startup log: "cascade DISABLED" / no "student-primary" line.
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
Re-enable the cascade (`=true` + restart) only **after** §4 completes and the
|
|
61
|
+
audit passes. While it's off, the teacher fleet carries 100% of distillation —
|
|
62
|
+
factor that into §4.4 fleet sizing.
|
|
63
|
+
|
|
64
|
+
**Forward note (not part of this run):** deploying #126 advances the teacher
|
|
65
|
+
`prompt_hash`, which strands the live student (trained on `f1e0ff`). Once the
|
|
66
|
+
cascade is re-enabled, watch the random-sample student↔teacher agreement for
|
|
67
|
+
drift; the student will eventually want a refresh on #126 traces. Tracked
|
|
68
|
+
separately from this re-distill.
|
|
69
|
+
|
|
70
|
+
### 0.2 Vector index (Qdrant) must be reconciled — not just Postgres
|
|
71
|
+
|
|
72
|
+
Hybrid retrieval is live (Qdrant 1.18.2, `evidence` collection). The §4 DELETE
|
|
73
|
+
removes 483k facts from Postgres but **leaves their vectors orphaned in Qdrant**,
|
|
74
|
+
and re-distilled facts are content-hash-distinct → *new* vectors. Net without
|
|
75
|
+
reconciliation: search returns deleted facts + stale duplicates.
|
|
76
|
+
|
|
77
|
+
- **Confirm** what the `evidence` collection is keyed on (fact id vs event id)
|
|
78
|
+
and how a PG fact delete maps to vector points (`vector_provenance` table).
|
|
79
|
+
- **Purge** the arena's old vectors as part of §4.2 (delete the corresponding
|
|
80
|
+
Qdrant points / `vector_provenance` rows), and **verify** re-distilled facts
|
|
81
|
+
get re-embedded (the embedder lanes must keep up — `NV_EMBED_URL` interactive
|
|
82
|
+
+ bulk).
|
|
83
|
+
- The §4.1 snapshot + §Rollback are **Postgres-only**; add the vector state (see
|
|
84
|
+
§4.1 / Rollback below) or a rollback leaves Qdrant inconsistent with restored
|
|
85
|
+
PG.
|
|
86
|
+
|
|
87
|
+
### 0.3 Re-enqueue must NOT starve live ingest (all tenants)
|
|
88
|
+
|
|
89
|
+
`claim_next_batch` claims rows `ORDER BY id`. Re-enqueued events are *old* → **low
|
|
90
|
+
ids** → they would be claimed **ahead of** live ingest (high ids) for **every
|
|
91
|
+
tenant**, stalling all forward memory ingest behind a multi-day job. Mitigate by
|
|
92
|
+
**dripping the re-enqueue in batches** (§4.3) rather than flipping all 273k to
|
|
93
|
+
`pending` at once — keep the pending re-distill backlog bounded (e.g. ≤ a few k)
|
|
94
|
+
so live ingest interleaves. Monitor that non-`pentatonic-team` pending age
|
|
95
|
+
doesn't climb.
|
|
96
|
+
|
|
97
|
+
## 1. Outage already fixed (2026-06-22 ~08:26 UTC)
|
|
98
|
+
|
|
99
|
+
`pme2-extractor-async` had been SIGKILL'd at 05:03 UTC (`OOMKilled=false` → a
|
|
100
|
+
failed deploy attempt) and had silently not been draining for ~3.5h while forward ingest
|
|
101
|
+
kept enqueuing — **no alert fired** (see Seesa SEE-189). Fixed: `docker start
|
|
102
|
+
pme2-extractor-async`; reset 60 orphaned claims (`status='claimed' AND
|
|
103
|
+
claim_expires_at < now()` → `pending`); stopped the wrongly-started bakeoff box.
|
|
104
|
+
Verified draining (~60/min, 0 new failures, active claims with future expiry).
|
|
105
|
+
**This restored the OLD prompt** (`prompt_hash=f1e0ff554f708d05`); §2 replaces it.
|
|
106
|
+
|
|
107
|
+
## 2. Deploy #126 (prompt fix) + disable cascade
|
|
108
|
+
|
|
109
|
+
SDK version bumped **0.10.18 → 0.10.19** (this PR) and tarball built
|
|
110
|
+
(`ai-agent-sdk-0.10.19.tgz`, bundled `worker.py` carries the 4 fix markers).
|
|
111
|
+
|
|
112
|
+
0. **Verify the real build context FIRST** (do not assume local-copy — this is
|
|
113
|
+
the "week of drift" failure mode). On `i-0559…`:
|
|
114
|
+
`docker inspect pme2-extractor-async --format '{{.Config.Image}}'` and read
|
|
115
|
+
the `extractor-async` service `build.context` in the *deployed* compose
|
|
116
|
+
(`/opt/engine-v2/...`). Confirm the file you are about to edit is the one that
|
|
117
|
+
image actually builds from. If the context resolves into
|
|
118
|
+
`node_modules/@pentatonic-ai/ai-agent-sdk/...` (the S3-tarball install), edit
|
|
119
|
+
THAT path (or replace the tarball), not a stray `/opt/engine-v2/extractor-async`
|
|
120
|
+
copy. Resolve Open-Q #1 (npm/S3 source of truth) here too.
|
|
121
|
+
1. **Diff** (safety — don't silently revert any box-local change): copy the #126
|
|
122
|
+
`worker.py` to `/tmp/worker.new.py`, then `diff` it against the file the build
|
|
123
|
+
context actually uses (from step 0). Expect **only** the
|
|
124
|
+
TENSE/MODALITY/ATTRIBUTION/IDENTITY prompt additions. If other diffs appear,
|
|
125
|
+
STOP and reconcile.
|
|
126
|
+
2. Replace that worker.py with the #126 version (keep a timestamped backup for
|
|
127
|
+
the Rollback worker-revert).
|
|
128
|
+
3. **Set `CASCADE_ENABLED=false`** (§0.1) so the re-distill is teacher-only.
|
|
129
|
+
4. `cd /opt/engine-v2 && sudo docker compose up -d --build extractor-async`.
|
|
130
|
+
5. **Verify**: startup log prints a NEW `prompt_hash` (≠ `f1e0ff554f708d05`)
|
|
131
|
+
AND shows the cascade is OFF (no "cascade ENABLED — student-primary" line);
|
|
132
|
+
`grep -c MODALITY` in the running container's worker.py > 0; a few fresh
|
|
133
|
+
completions look sane.
|
|
134
|
+
|
|
135
|
+
## 3. Pilot re-distill (~100 events) — GATE
|
|
136
|
+
|
|
137
|
+
1. Pilot set: the known-bad source events (Will Vickers, Catherine Hayes, Johann
|
|
138
|
+
Boedecker, Katrin) + ~80 random `pentatonic-team` events.
|
|
139
|
+
2. Scoped clear + re-enqueue (note the `events` join — no `arena` on the queue;
|
|
140
|
+
reset `attempts` so the rows are claim-eligible — eligibility is
|
|
141
|
+
`attempts < MAX_ATTEMPTS`):
|
|
142
|
+
```sql
|
|
143
|
+
-- clear old facts for the pilot events. NB provenance_event_ids[1] catches
|
|
144
|
+
-- facts whose FIRST source is a pilot event; a fact corroborated by (but not
|
|
145
|
+
-- first-seen from) a pilot event is missed — acceptable for a pilot, exact
|
|
146
|
+
-- scope comes in §4's whole-arena delete.
|
|
147
|
+
DELETE FROM facts f
|
|
148
|
+
WHERE f.arena LIKE 'pentatonic-team%'
|
|
149
|
+
AND f.provenance_event_ids[1] = ANY(:pilot_event_ids);
|
|
150
|
+
-- re-enqueue (queue has no arena; scope by event_id set; reset attempts)
|
|
151
|
+
UPDATE distillation_queue
|
|
152
|
+
SET status='pending', claimed_by=NULL, claimed_at=NULL,
|
|
153
|
+
claim_expires_at=NULL, attempts=0
|
|
154
|
+
WHERE event_id = ANY(:pilot_event_ids);
|
|
155
|
+
```
|
|
156
|
+
(Cascade is OFF per §0.1, so these run on the #126 teacher — the pilot
|
|
157
|
+
actually tests the fix, not the stale student.)
|
|
158
|
+
3. Let the fleet drain. **Audit** the pilot entities with the validation harness:
|
|
159
|
+
modality/attribution rate should fall toward ~0; "attended/is" → "is scheduled
|
|
160
|
+
to / plans to" or dropped. Vickers "Board Observer" should be modal/absent.
|
|
161
|
+
4. **🚦 GATE: report the audit; require an explicit human go before §4.**
|
|
162
|
+
|
|
163
|
+
## 4. Full re-distill (only after the pilot passes + go)
|
|
164
|
+
|
|
165
|
+
1. **Snapshot** (rollback point), off-box — Postgres AND vector state:
|
|
166
|
+
```bash
|
|
167
|
+
pg_dump ... -t facts -t entities -t relationships -t vector_provenance \
|
|
168
|
+
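--where "arena LIKE 'pentatonic-team%'" > pt_graph_pre_redistill_2026-06-22.sql  # NB(review): stock pg_dump has no --where row filter — confirm the tooling, or snapshot per table with \copy (SELECT ... WHERE arena LIKE 'pentatonic-team%') TO ...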
|
|
169
|
+
# also snapshot/record the Qdrant evidence points for the arena (or a full
|
|
170
|
+
# collection snapshot) so §Rollback can restore the index, not just PG.
|
|
171
|
+
```
|
|
172
|
+
2. Clear (keep entities — Fusion + projection sweeps rebuild downstream) +
|
|
173
|
+
purge the orphaned vectors (§0.2):
|
|
174
|
+
```sql
|
|
175
|
+
DELETE FROM facts WHERE arena LIKE 'pentatonic-team%';
|
|
176
|
+
DELETE FROM relationships WHERE arena LIKE 'pentatonic-team%';
|
|
177
|
+
-- + delete the corresponding Qdrant points and vector_provenance rows for
|
|
178
|
+
-- the arena (see §0.2 — confirm the keying first).
|
|
179
|
+
```
|
|
180
|
+
Note: entities are retained, so old-prompt **junk entities** the new prompt
|
|
181
|
+
won't re-create (e.g. the "Pentatonic GmbH" footer affiliation) will persist
|
|
182
|
+
until Fusion decay/eviction clears them — flag for a post-run entity sweep if
|
|
183
|
+
the audit still spots them.
|
|
184
|
+
3. Re-enqueue in **bounded batches** (§0.3 — do NOT flip all 273k at once, or
|
|
185
|
+
live ingest for all tenants starves behind the low-id backlog). First confirm
|
|
186
|
+
completeness, then drip:
|
|
187
|
+
```sql
|
|
188
|
+
-- completeness check: how many arena events have a queue row vs not?
|
|
189
|
+
SELECT count(*) FILTER (WHERE q.id IS NOT NULL) AS have_row,
|
|
190
|
+
count(*) FILTER (WHERE q.id IS NULL) AS missing
|
|
191
|
+
FROM events e LEFT JOIN distillation_queue q ON q.event_id = e.id
|
|
192
|
+
WHERE e.arena LIKE 'pentatonic-team%';
|
|
193
|
+
-- for events WITH a row: re-enqueue a batch (reset attempts), repeat as the
|
|
194
|
+
-- pending re-distill backlog drains below a cap (e.g. 3k):
|
|
195
|
+
UPDATE distillation_queue q
|
|
196
|
+
SET status='pending', claimed_by=NULL, claimed_at=NULL,
|
|
197
|
+
claim_expires_at=NULL, attempts=0
|
|
198
|
+
FROM events e
|
|
199
|
+
WHERE e.id = q.event_id AND e.arena LIKE 'pentatonic-team%'
|
|
200
|
+
AND q.event_id IN (:next_batch_event_ids);
|
|
201
|
+
-- for events MISSING a row (pruned post-done): re-insert from events.
|
|
202
|
+
INSERT INTO distillation_queue (event_id, status, enqueued_at, attempts)
|
|
203
|
+
SELECT e.id, 'pending', now(), 0 FROM events e
|
|
204
|
+
WHERE e.arena LIKE 'pentatonic-team%'
|
|
205
|
+
AND NOT EXISTS (SELECT 1 FROM distillation_queue q WHERE q.event_id = e.id)
|
|
206
|
+
AND e.id IN (:next_batch_event_ids);
|
|
207
|
+
```
|
|
208
|
+
**Assert** the total re-enqueued (UPDATE + INSERT) across all batches == the
|
|
209
|
+
arena event count (~272,893) before declaring the enqueue complete.
|
|
210
|
+
4. **Scale the fleet** for ~273k events, **teacher-only** (cascade off → the
|
|
211
|
+
teacher carries 100%, not the usual ~25%). Single-worker ~60/min ⇒ ~76h;
|
|
212
|
+
`extractor-async-2/-3` + `distiller-autoscale.sh` help, BUT **g6e capacity is
|
|
213
|
+
currently severe** — this week we could not launch *or restart a stopped*
|
|
214
|
+
g6e. So: do **not** let the autoscaler scale the fleet to zero mid-run (a
|
|
215
|
+
stopped box may not come back); treat the currently-running L40S boxes as the
|
|
216
|
+
ceiling and the 76h as optimistic. Monitor `distillation_queue` depth → 0,
|
|
217
|
+
GPU, disk, and the `failed`/`ReadTimeout` rate (teacher ~20–49s/call).
|
|
218
|
+
|
|
219
|
+
## 5. Downstream reconciliation
|
|
220
|
+
|
|
221
|
+
1. **Fusion Drive** re-run for the arena (de-dup; run AFTER extraction settles) —
|
|
222
|
+
`backfill_entity_reconciliation.py` / `fusion-drive-*.sh`.
|
|
223
|
+
2. **Projection re-fold** (Seesa side): org/person projection sweeps re-fold from
|
|
224
|
+
the corrected graph — the Seesa **SEE-184** watermark sweep (now live, 03:45
|
|
225
|
+
UTC) detects the advanced `graphAsOf` and marks projections stale → SEE-168
|
|
226
|
+
refresh re-folds. Worker D1 read-models inherit the heal.
|
|
227
|
+
3. **Unblocks Seesa SEE-183** — the content-sensitivity retag (forward detection
|
|
228
|
+
already live) was deliberately held until this re-distill completes, to avoid
|
|
229
|
+
piling re-emitted events onto the shared queue mid-run.
|
|
230
|
+
4. **Re-enable the cascade** (§0.1): `CASCADE_ENABLED=true` + restart
|
|
231
|
+
extractor-async; confirm "cascade ENABLED — student-primary" returns. Then
|
|
232
|
+
watch student↔teacher agreement (the student is now `f1e0ff`-stale vs the
|
|
233
|
+
#126 teacher — schedule a student refresh if drift shows).
|
|
234
|
+
|
|
235
|
+
## Validation gate (before declaring done)
|
|
236
|
+
Re-run the audit harness over a fresh 30–50 entity sample → **bad-fact rate
|
|
237
|
+
< 3%** (from ~18.7%). Spot the canonical failures: Vickers "Board Observer"
|
|
238
|
+
(modal/absent), Catherine "will send / to organise" (→ commitments), Matvii
|
|
239
|
+
"Pentatonic GmbH" footer affiliation (gone), Sebastian conflation (split/keyed).
|
|
240
|
+
|
|
241
|
+
## Rollback
|
|
242
|
+
Restore `facts`/`relationships`/`vector_provenance` for `pentatonic-team%` from
|
|
243
|
+
the §4.1 snapshot AND restore the Qdrant `evidence` points (new-prompt
|
|
244
|
+
extractions are content-hash-distinct, so a restore is clean — but PG and Qdrant
|
|
245
|
+
must be rolled back *together* or search desyncs). Worker revert = redeploy the
|
|
246
|
+
prior worker.py (the §2.2 backup) + rebuild. Cascade: set `CASCADE_ENABLED=true`
|
|
247
|
+
back if it was changed.
|
|
248
|
+
|
|
249
|
+
## Open questions for Phil
|
|
250
|
+
1. **Durable SDK release / source of truth.** Resolve in §2.0: is the deployed
|
|
251
|
+
worker built from a local copy, or from the S3/npm SDK install? If the
|
|
252
|
+
lockfile/`npm install` path is authoritative, a future redeploy reverts #126
|
|
253
|
+
unless 0.10.19 is published there + the `/opt/engine-v2` pin bumped. (S3
|
|
254
|
+
`sdk/` has tarballs to 0.10.18; the lockfile root version was stale at 0.10.1
|
|
255
|
+
— confirm the real install path before relying on either.)
|
|
256
|
+
2. **Fleet sizing** for the **teacher-only** 273k run — given g6e capacity is
|
|
257
|
+
currently unreliable (can't restart stopped boxes), is the running L40S fleet
|
|
258
|
+
enough, or do we need reserved/alt-region capacity (or to accept a longer
|
|
259
|
+
wall-clock)?
|
|
260
|
+
3. ~~Re-enqueue completeness~~ → **resolved in §4.3** (completeness check +
|
|
261
|
+
INSERT-from-events fallback + count assertion).
|
|
262
|
+
4. Run **off-peak / batched**? §0.3 + §4.3 make it batched to protect live
|
|
263
|
+
ingest; still worth starting off-peak given the multi-day duration.
|
|
264
|
+
|
|
265
|
+
## Authorization note (Seesa-side / Claude Code)
|
|
266
|
+
The auto-mode Bash classifier blocks these shared-prod writes (SSM/ec2/psql) even
|
|
267
|
+
with verbal authority; they must run via the `!` prefix or a settings permission
|
|
268
|
+
rule. Read-only SSM queries are fine. The full DELETE (§4) will be kept behind an
|
|
269
|
+
explicit human go regardless of any rule.
|
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
# Re-distill plan — pentatonic-team, post modality/attribution fix (2026-06-21)
|
|
2
|
+
|
|
3
|
+
**Why.** A fact-by-fact audit of `org_model` (49 entities, 434 facts) found **~18.7%
|
|
4
|
+
of distilled facts wrong**, dominated by *modality collapse* (future/scheduled/
|
|
5
|
+
planned content asserted as established fact), plus attribution errors (unauthored
|
|
6
|
+
docs → person, attendee → organiser, org activity → person) and a few same-name
|
|
7
|
+
conflations. The teacher prompt is fixed in **ai-agent-sdk PR #126** (TENSE &
|
|
8
|
+
MODALITY / ATTRIBUTION FIDELITY / IDENTITY rules). The prompt only governs
|
|
9
|
+
*future* extractions, so the historical graph must be **re-distilled** to inherit
|
|
10
|
+
the fix.
|
|
11
|
+
|
|
12
|
+
**Already done (2026-06-21):** a targeted cleanup retracted **229** facts whose
|
|
13
|
+
provenance source event is dated in the future (`emitted_at > now()`) and whose
|
|
14
|
+
category is established (`state/mention/decision`), scoped to `pentatonic-team`.
|
|
15
|
+
Future *commitments* (29) were spared; the frozen `pip-agents` legacy was
|
|
16
|
+
untouched. ~11 "attends standup"-class facts (future *content* but a past-dated
|
|
17
|
+
source event) are NOT deterministically catchable and remain for this re-distill.
|
|
18
|
+
|
|
19
|
+
---
|
|
20
|
+
|
|
21
|
+
## Scope & guardrails
|
|
22
|
+
- **Arena: `pentatonic-team%` ONLY.** `org_model` is multi-tenant — `pip-agents`
|
|
23
|
+
(~246k facts) is LEGACY/frozen and must never be touched. Every statement here
|
|
24
|
+
carries `WHERE arena LIKE 'pentatonic-team%'`.
|
|
25
|
+
- **Idempotency caveat (load-bearing).** `worker.py` IDs entities/facts/rels by
|
|
26
|
+
content-hash, so re-running the *same* prompt converges. But the **prompt
|
|
27
|
+
changed** (`SYSTEM_PROMPT_HASH` is new), so re-extraction yields *different*
|
|
28
|
+
facts with *different* IDs — the old wrong facts will **coexist** unless
|
|
29
|
+
cleared. ⇒ the re-distill MUST delete the existing pentatonic-team facts for
|
|
30
|
+
each re-processed event before/with re-extraction.
|
|
31
|
+
|
|
32
|
+
## Prerequisites
|
|
33
|
+
1. **Merge PR #126** and **deploy the new distiller** to the extractor fleet
|
|
34
|
+
(the L4 box running `extractor-async/worker.py`). Confirm the running worker's
|
|
35
|
+
`SYSTEM_PROMPT_HASH` matches the new prompt (it's logged on the trace line).
|
|
36
|
+
2. **Snapshot** (rollback point):
|
|
37
|
+
`pg_dump … -t facts -t entities -t relationships --where "arena LIKE 'pentatonic-team%'"`
|
|
38
|
+
(or a filtered `COPY … TO` per table). Store off-box.
|
|
39
|
+
|
|
40
|
+
## Procedure
|
|
41
|
+
|
|
42
|
+
### Stage 0 — pilot (≈100 events incl. the known-bad ones)
|
|
43
|
+
1. Pick a pilot set: the Will Vickers / Catherine Hayes / Johann / Katrin source
|
|
44
|
+
events + a random ~80 pentatonic-team events.
|
|
45
|
+
2. Delete existing facts for those events (scoped), then re-enqueue:
|
|
46
|
+
```sql
|
|
47
|
+
-- clear old facts for the pilot events
|
|
48
|
+
DELETE FROM facts f
|
|
49
|
+
WHERE f.arena LIKE 'pentatonic-team%'
|
|
50
|
+
AND f.provenance_event_ids[1] = ANY(:pilot_event_ids);
|
|
51
|
+
-- re-enqueue them
|
|
52
|
+
UPDATE distillation_queue
|
|
53
|
+
SET status='pending', claimed_by=NULL, claim_expires_at=NULL
|
|
54
|
+
WHERE arena LIKE 'pentatonic-team%'
|
|
55
|
+
AND event_id = ANY(:pilot_event_ids);
|
|
56
|
+
```
|
|
57
|
+
3. Let the fleet drain the queue. **Validate**: re-run the audit harness over the
|
|
58
|
+
pilot entities; the modality/attribution rate should fall toward ~0. Eyeball
|
|
59
|
+
the Vickers/Katrin facts — "attended/is" should become "is scheduled to /
|
|
60
|
+
plans to" (or drop).
|
|
61
|
+
|
|
62
|
+
### Stage 1 — full pentatonic-team re-distill (only if the pilot passes)
|
|
63
|
+
1. Clear pentatonic-team facts (keep entities; Fusion + the projection sweep
|
|
64
|
+
rebuild downstream). Relationships likewise.
|
|
65
|
+
```sql
|
|
66
|
+
DELETE FROM facts WHERE arena LIKE 'pentatonic-team%';
|
|
67
|
+
DELETE FROM relationships WHERE arena LIKE 'pentatonic-team%';
|
|
68
|
+
```
|
|
69
|
+
2. Re-enqueue every pentatonic-team event:
|
|
70
|
+
```sql
|
|
71
|
+
UPDATE distillation_queue
|
|
72
|
+
SET status='pending', claimed_by=NULL, claim_expires_at=NULL
|
|
73
|
+
WHERE arena LIKE 'pentatonic-team%';
|
|
74
|
+
```
|
|
75
|
+
(If queue rows were pruned post-done, re-insert from `events` for the arena.)
|
|
76
|
+
3. Scale the fleet for the backlog; monitor `distillation_queue` depth until 0.
|
|
77
|
+
|
|
78
|
+
### Stage 2 — downstream reconciliation
|
|
79
|
+
1. **Fusion Drive** re-run for the arena (consolidate the freshly-extracted
|
|
80
|
+
nodes/facts) — it's a de-duplicator, so run AFTER extraction settles.
|
|
81
|
+
`backfill_entity_reconciliation.py` covers the entity side.
|
|
82
|
+
2. **Projection re-fold** (Seesa side): the org/person projection sweeps re-fold
|
|
83
|
+
from the corrected graph (SEE-168 nightly sweep does this automatically; or
|
|
84
|
+
trigger manually). The Worker D1 read-models then inherit the heal.
|
|
85
|
+
|
|
86
|
+
## Validation (gate before declaring done)
|
|
87
|
+
- Re-run the audit harness over a fresh 30–50 entity sample → **bad-fact rate
|
|
88
|
+
target < 3%** (from ~18.7%).
|
|
89
|
+
- Spot the canonical failures: Vickers "Board Observer" (should be modal/absent),
|
|
90
|
+
Catherine's "will send / to organise" (→ commitments), Matvii's "Pentatonic
|
|
91
|
+
GmbH" footer affiliation (gone), Sebastian conflation (split or correctly keyed).
|
|
92
|
+
|
|
93
|
+
## Rollback
|
|
94
|
+
Restore `facts`/`relationships` for `pentatonic-team%` from the Stage-0 snapshot;
|
|
95
|
+
the new-prompt extractions are content-hash-distinct so a restore is clean.
|
|
96
|
+
|
|
97
|
+
## Cost / time
|
|
98
|
+
~163k pentatonic-team events × 7B extraction on the L4 fleet. Bounded by fleet
|
|
99
|
+
size × per-batch latency; run off-peak, monitor GPU/disk (prior outage: v2 wrote
|
|
100
|
+
to a 30 GB root — watch disk). Pilot first to estimate throughput before the full
|
|
101
|
+
run.
|
|
@@ -0,0 +1,218 @@
|
|
|
1
|
+
"""Structured diff between two extractions (student vs teacher gold).
|
|
2
|
+
|
|
3
|
+
Replaces the crude agreement proxies we'd been quoting (entity-name-exact-match;
|
|
4
|
+
word-Jaccard on whole statements) with a STRUCTURED comparison that:
|
|
5
|
+
|
|
6
|
+
- matches entities on fuzzy name + compatible type (not exact lowercased string,
|
|
7
|
+
which penalised "Acme" vs "Acme Corp" / normalisation variants),
|
|
8
|
+
- matches facts on their s·p·o STRUCTURE plus the statement as a fallback
|
|
9
|
+
(not bag-of-words on the statement, which ignored who-did-what),
|
|
10
|
+
- matches relationships as (from, type, to) triples,
|
|
11
|
+
- reports precision / recall / F1 PER AXIS, and facts broken down PER CATEGORY
|
|
12
|
+
(so `decision`/`commitment` agreement is isolated — the cascade's
|
|
13
|
+
high-value gate question).
|
|
14
|
+
|
|
15
|
+
Deterministic + stdlib-only (difflib) so it runs offline, in CI, and on any box
|
|
16
|
+
without a GPU. Semantic-embedding matching is a deliberate non-goal here: a
|
|
17
|
+
deterministic structural diff is the defensible, un-game-able baseline (no model
|
|
18
|
+
judging another model's output); an embedding tiebreak can layer on later if the
|
|
19
|
+
fuzzy threshold proves too strict.
|
|
20
|
+
|
|
21
|
+
Shapes (mirror _parse_guided_json output):
|
|
22
|
+
entity = {"name", "type", "aliases"?: [emails]}
|
|
23
|
+
fact = {"category", "subject", "predicate", "object"?, "statement"}
|
|
24
|
+
relationship = {"from", "to", "type"}
|
|
25
|
+
"""
|
|
26
|
+
|
|
27
|
+
from __future__ import annotations
|
|
28
|
+
|
|
29
|
+
import re
|
|
30
|
+
from dataclasses import dataclass
|
|
31
|
+
from difflib import SequenceMatcher
|
|
32
|
+
from typing import Any
|
|
33
|
+
|
|
34
|
+
# Org/legal-form suffixes stripped before name comparison so "Acme" == "Acme Inc".
|
|
35
|
+
_ORG_SUFFIX = re.compile(
|
|
36
|
+
r"\b(inc|incorporated|ltd|limited|llc|llp|plc|corp|corporation|co|gmbh|ag|sa|"
|
|
37
|
+
r"sas|bv|nv|pty|group|holdings?|company)\b\.?",
|
|
38
|
+
re.IGNORECASE,
|
|
39
|
+
)
|
|
40
|
+
_NONWORD = re.compile(r"[^a-z0-9 ]+")
|
|
41
|
+
_WS = re.compile(r"\s+")
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def normalize_name(s: str | None) -> str:
|
|
45
|
+
"""Lowercase, drop punctuation + org suffixes, collapse whitespace."""
|
|
46
|
+
if not s:
|
|
47
|
+
return ""
|
|
48
|
+
s = s.lower()
|
|
49
|
+
s = _ORG_SUFFIX.sub(" ", s)
|
|
50
|
+
s = _NONWORD.sub(" ", s)
|
|
51
|
+
return _WS.sub(" ", s).strip()
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def _tokens(s: str) -> set[str]:
|
|
55
|
+
return set(normalize_name(s).split())
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def sim(a: str | None, b: str | None) -> float:
|
|
59
|
+
"""Similarity in [0,1]: max of char-level ratio and token-set Jaccard on the
|
|
60
|
+
normalised strings. The token-set arm rewards word overlap regardless of
|
|
61
|
+
order/length ("ship the Q3 release" vs "Q3 release will ship"); the
|
|
62
|
+
char-ratio arm rewards near-identical short strings."""
|
|
63
|
+
na, nb = normalize_name(a), normalize_name(b)
|
|
64
|
+
if not na and not nb:
|
|
65
|
+
return 1.0
|
|
66
|
+
if not na or not nb:
|
|
67
|
+
return 0.0
|
|
68
|
+
if na == nb:
|
|
69
|
+
return 1.0
|
|
70
|
+
char = SequenceMatcher(None, na, nb).ratio()
|
|
71
|
+
ta, tb = set(na.split()), set(nb.split())
|
|
72
|
+
jac = len(ta & tb) / len(ta | tb) if (ta | tb) else 0.0
|
|
73
|
+
return max(char, jac)
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
@dataclass
|
|
77
|
+
class PRF:
|
|
78
|
+
"""Precision/recall/F1 for one axis. n_gold/n_pred are the item counts."""
|
|
79
|
+
n_gold: int
|
|
80
|
+
n_pred: int
|
|
81
|
+
matched: int
|
|
82
|
+
|
|
83
|
+
@property
|
|
84
|
+
def precision(self) -> float:
|
|
85
|
+
if self.n_pred == 0:
|
|
86
|
+
return 1.0 if self.n_gold == 0 else 0.0
|
|
87
|
+
return self.matched / self.n_pred
|
|
88
|
+
|
|
89
|
+
@property
|
|
90
|
+
def recall(self) -> float:
|
|
91
|
+
if self.n_gold == 0:
|
|
92
|
+
return 1.0 if self.n_pred == 0 else 0.0
|
|
93
|
+
return self.matched / self.n_gold
|
|
94
|
+
|
|
95
|
+
@property
|
|
96
|
+
def f1(self) -> float:
|
|
97
|
+
p, r = self.precision, self.recall
|
|
98
|
+
return 2 * p * r / (p + r) if (p + r) else (1.0 if self.n_gold == 0 and self.n_pred == 0 else 0.0)
|
|
99
|
+
|
|
100
|
+
def as_dict(self) -> dict[str, float | int]:
|
|
101
|
+
return {
|
|
102
|
+
"n_gold": self.n_gold, "n_pred": self.n_pred, "matched": self.matched,
|
|
103
|
+
"precision": round(self.precision, 4), "recall": round(self.recall, 4),
|
|
104
|
+
"f1": round(self.f1, 4),
|
|
105
|
+
}
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def _greedy_match(gold: list, pred: list, score_fn, threshold: float) -> int:
|
|
109
|
+
"""Count matched gold items via greedy 1:1 alignment. All (gold, pred) pairs
|
|
110
|
+
scored, sorted desc, claimed highest-first so each item matches at most once.
|
|
111
|
+
Deterministic (stable sort, then index tiebreak)."""
|
|
112
|
+
pairs = []
|
|
113
|
+
for gi, g in enumerate(gold):
|
|
114
|
+
for pi, p in enumerate(pred):
|
|
115
|
+
s = score_fn(g, p)
|
|
116
|
+
if s >= threshold:
|
|
117
|
+
pairs.append((-s, gi, pi))
|
|
118
|
+
pairs.sort()
|
|
119
|
+
used_g: set[int] = set()
|
|
120
|
+
used_p: set[int] = set()
|
|
121
|
+
matched = 0
|
|
122
|
+
for _, gi, pi in pairs:
|
|
123
|
+
if gi in used_g or pi in used_p:
|
|
124
|
+
continue
|
|
125
|
+
used_g.add(gi)
|
|
126
|
+
used_p.add(pi)
|
|
127
|
+
matched += 1
|
|
128
|
+
return matched
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
# ── per-item scorers ─────────────────────────────────────────────────────
|
|
132
|
+
|
|
133
|
+
def _entity_score(g: dict, p: dict) -> float:
|
|
134
|
+
"""Name similarity, gated by type compatibility (equal, or either side
|
|
135
|
+
'other'/missing — the LLMs disagree on type far more than on identity)."""
|
|
136
|
+
gt = (g.get("type") or "").lower()
|
|
137
|
+
pt = (p.get("type") or "").lower()
|
|
138
|
+
if gt and pt and gt != pt and "other" not in (gt, pt):
|
|
139
|
+
return 0.0
|
|
140
|
+
return sim(g.get("name"), p.get("name"))
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def _fact_score(g: dict, p: dict) -> float:
    """Score a fact pair: structural s·p·o similarity, or statement similarity.

    Structure is the mean of the subject / predicate / object similarities
    (an object absent on both sides is neutral, 1.0 for that term). The
    result is the max of the structural score and the statement similarity,
    so a well-phrased statement still matches even if s/p/o were split
    differently.

    The statement arm only takes part when at least one side actually has a
    statement. ``sim`` scores two empty inputs as 1.0, which would otherwise
    make any two statement-less facts a perfect match regardless of their
    subject / predicate / object.
    """
    subj = sim(g.get("subject"), p.get("subject"))
    pred = sim(g.get("predicate"), p.get("predicate"))
    go, po = g.get("object"), p.get("object")
    obj = 1.0 if not go and not po else sim(go, po)
    structural = (subj + pred + obj) / 3
    gs, ps = g.get("statement"), p.get("statement")
    # Both statements absent is "no evidence", not agreement: defer to structure.
    statement = sim(gs, ps) if (gs or ps) else 0.0
    return max(structural, statement)
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
def _rel_score(g: dict, p: dict) -> float:
    """Score a relationship pair as the mean similarity of from, type and to."""
    from_sim = sim(g.get("from"), p.get("from"))
    type_sim = sim(g.get("type"), p.get("type"))
    to_sim = sim(g.get("to"), p.get("to"))
    return (from_sim + type_sim + to_sim) / 3
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
# ── public API ─────────────────────────────────────────────────────────────
|
|
165
|
+
|
|
166
|
+
# Minimum pair score for a (gold, pred) pair to count as a match on each axis;
# diff_axis pairs each threshold with the scorer named alongside it.
ENTITY_THRESHOLD = 0.85  # entities, scored by _entity_score
FACT_THRESHOLD = 0.60  # facts, scored by _fact_score
REL_THRESHOLD = 0.60  # relationships, scored by _rel_score
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
def diff_axis(gold: list[dict], pred: list[dict], kind: str) -> PRF:
    """Align ``gold`` against ``pred`` on one axis and return the counts as a PRF.

    ``kind`` selects the scorer and threshold and must be one of 'entities',
    'facts' or 'relationships'; any other value raises ``KeyError``.
    """
    axis_config = {
        "entities": (_entity_score, ENTITY_THRESHOLD),
        "facts": (_fact_score, FACT_THRESHOLD),
        "relationships": (_rel_score, REL_THRESHOLD),
    }
    score_fn, cutoff = axis_config[kind]
    n_matched = _greedy_match(gold, pred, score_fn, cutoff)
    return PRF(n_gold=len(gold), n_pred=len(pred), matched=n_matched)
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
def _facts_in(extraction: dict, categories: set[str] | None) -> list[dict]:
|
|
183
|
+
facts = extraction.get("facts") or []
|
|
184
|
+
if categories is None:
|
|
185
|
+
return facts
|
|
186
|
+
return [f for f in facts if (f.get("category") or "").lower() in categories]
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
def diff_extractions(
    gold: dict, pred: dict, fact_categories: set[str] | None = None
) -> dict[str, Any]:
    """Build the full structured diff between two extraction dicts.

    The result holds one PRF dict each for 'entities', 'facts' and
    'relationships', plus 'facts_by_category': a PRF dict for every non-empty
    (lower-cased) fact category seen on either side. When ``fact_categories``
    is given, it also holds 'facts_filtered' (a PRF over just those
    categories, e.g. {'decision','commitment'} for the high-value-gate
    question) and 'facts_filtered_categories' (the sorted category names).
    """
    report: dict[str, Any] = {}
    for axis in ("entities", "facts", "relationships"):
        report[axis] = diff_axis(gold.get(axis) or [], pred.get(axis) or [], axis).as_dict()

    # Per-category fact breakdown over every category present on either side.
    all_facts = [*(gold.get("facts") or []), *(pred.get("facts") or [])]
    seen = {(fact.get("category") or "").lower() for fact in all_facts}
    seen.discard("")
    report["facts_by_category"] = {
        cat: diff_axis(_facts_in(gold, {cat}), _facts_in(pred, {cat}), "facts").as_dict()
        for cat in sorted(seen)
    }

    if fact_categories is not None:
        filtered = diff_axis(
            _facts_in(gold, fact_categories), _facts_in(pred, fact_categories), "facts"
        )
        report["facts_filtered"] = filtered.as_dict()
        report["facts_filtered_categories"] = sorted(fact_categories)
    return report
|
|
@@ -0,0 +1,180 @@
|
|
|
1
|
+
"""Unit tests for the structured-diff agreement metric (extraction_diff).
|
|
2
|
+
|
|
3
|
+
Pins the behaviours that make it a real metric rather than the old proxies:
|
|
4
|
+
fuzzy/normalised name matching, structural s·p·o fact matching, per-axis P/R/F1,
|
|
5
|
+
per-category fact breakdown, and the high-value filtered view.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import importlib.util
|
|
11
|
+
import sys
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
|
|
14
|
+
import pytest
|
|
15
|
+
|
|
16
|
+
_THIS = Path(__file__).resolve().parent
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _load(name="extraction_diff_mod"):
    """Load extraction_diff.py from this directory as module ``name`` and return it."""
    module_path = _THIS / "extraction_diff.py"
    spec = importlib.util.spec_from_file_location(name, module_path)
    module = importlib.util.module_from_spec(spec)
    # Register before exec so @dataclass's type resolution (which walks
    # sys.modules[cls.__module__]) works under the importlib custom-name load
    # on Python 3.12+/3.14. A normal `import extraction_diff` (CI/prod) is fine.
    sys.modules[name] = module
    spec.loader.exec_module(module)
    return module
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
ed = _load()
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
# ── normalize_name / sim ─────────────────────────────────────────────────
|
|
34
|
+
|
|
35
|
+
def test_normalize_strips_org_suffix_and_punct():
    """Org suffixes, punctuation and case all normalise away to the bare name."""
    for raw in ("Acme Corp.", "Acme, Inc.", "ACME Limited"):
        assert ed.normalize_name(raw) == "acme"
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def test_sim_normalisation_variants_are_high():
    """Names differing only by org suffix, punctuation or case score ~1."""
    # the exact-string proxy scored these 0; structured sim should be ~1.
    variants = [("Acme", "Acme Corp"), ("Acme Inc.", "ACME")]
    for left, right in variants:
        assert ed.sim(left, right) >= 0.99
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def test_sim_token_overlap_order_invariant():
    """Reordered phrasing that shares most of its words still scores >= 0.6."""
    score = ed.sim("ship the Q3 release", "Q3 release will ship")
    assert score >= 0.6
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def test_sim_unrelated_low():
    """Two unrelated names stay below 0.5."""
    score = ed.sim("Acme", "Globex")
    assert score < 0.5
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def test_sim_both_empty_is_one():
    """Two empty sides agree perfectly; a single missing side scores zero."""
    both_empty = ed.sim("", "")
    one_missing = ed.sim(None, "x")
    assert both_empty == 1.0
    assert one_missing == 0.0
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
# ── PRF arithmetic ───────────────────────────────────────────────────────
|
|
61
|
+
|
|
62
|
+
def test_prf_basic():
    """3 matches out of 4 gold / 5 pred: recall 0.75, precision 0.6, F1 ~0.667."""
    counts = ed.PRF(n_gold=4, n_pred=5, matched=3)
    assert (counts.recall, counts.precision) == (0.75, 0.6)
    assert round(counts.f1, 3) == 0.667
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def test_prf_empty_both_is_perfect():
    """An axis with no gold and no pred items counts as perfect agreement."""
    empty = ed.PRF(n_gold=0, n_pred=0, matched=0)
    assert empty.precision == 1.0
    assert empty.recall == 1.0
    assert empty.f1 == 1.0
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def test_prf_pred_without_gold_is_zero_precision():
    """Predictions with nothing in gold to match have zero precision."""
    spurious = ed.PRF(n_gold=0, n_pred=2, matched=0)
    assert spurious.precision == 0.0
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
# ── entity matching ──────────────────────────────────────────────────────
|
|
80
|
+
|
|
81
|
+
def test_entity_match_fuzzy_name_same_type():
    """Same-type entities whose names differ only by an org suffix match."""
    gold = [{"name": "Acme Corp", "type": "org"}]
    pred = [{"name": "Acme", "type": "org"}]
    result = ed.diff_axis(gold, pred, "entities")
    assert result.matched == 1
    assert result.f1 == 1.0
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def test_entity_type_mismatch_blocks_match():
    """Identical names with conflicting concrete types do not match."""
    gold = [{"name": "Apple", "type": "org"}]
    pred = [{"name": "Apple", "type": "person"}]
    result = ed.diff_axis(gold, pred, "entities")
    assert result.matched == 0
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def test_entity_other_type_is_compatible():
    """Type 'other' on one side is compatible with any type on the other."""
    gold = [{"name": "Apple", "type": "org"}]
    pred = [{"name": "Apple", "type": "other"}]
    result = ed.diff_axis(gold, pred, "entities")
    assert result.matched == 1
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def test_entity_greedy_one_to_one():
    """A single pred can satisfy only one of two identical gold entities."""
    # two golds, one pred → at most one match
    acme = {"name": "Acme", "type": "org"}
    gold = [dict(acme), dict(acme)]
    pred = [dict(acme)]
    result = ed.diff_axis(gold, pred, "entities")
    assert result.matched == 1
    assert result.recall == 0.5
    assert result.precision == 1.0
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
# ── fact matching (structural vs statement) ──────────────────────────────
|
|
109
|
+
|
|
110
|
+
def test_fact_match_on_structure_despite_statement_rephrase():
    """Facts with close subject/predicate/object match even when the statement is reworded."""
    gold_fact = {
        "category": "decision",
        "subject": "Acme",
        "predicate": "will renew",
        "object": "the contract",
        "statement": "Acme decided to renew the contract for 2027.",
    }
    pred_fact = {
        "category": "decision",
        "subject": "Acme",
        "predicate": "renews",
        "object": "contract",
        "statement": "The 2027 contract renewal was agreed by Acme.",
    }
    result = ed.diff_axis([gold_fact], [pred_fact], "facts")
    assert result.matched == 1
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def test_fact_match_on_statement_when_spo_split_differs():
    """An identical statement carries the match when s/p/o were split differently."""
    shared_statement = "Bob will lead the migration starting in March."
    gold_fact = {
        "category": "commitment",
        "subject": "Bob",
        "predicate": "owns",
        "object": "migration",
        "statement": shared_statement,
    }
    pred_fact = {
        "category": "commitment",
        "subject": "Bob Chen",
        "predicate": "leads",
        "object": "the data migration",
        "statement": shared_statement,
    }
    result = ed.diff_axis([gold_fact], [pred_fact], "facts")
    assert result.matched == 1
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def test_fact_unrelated_no_match():
    """Facts about different subjects and events do not match."""
    gold_fact = {
        "category": "decision",
        "subject": "Acme",
        "predicate": "hired",
        "object": "a CFO",
        "statement": "Acme hired a new CFO.",
    }
    pred_fact = {
        "category": "decision",
        "subject": "Globex",
        "predicate": "closed",
        "object": "the Berlin office",
        "statement": "Globex shut its Berlin office.",
    }
    result = ed.diff_axis([gold_fact], [pred_fact], "facts")
    assert result.matched == 0
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
# ── relationships ────────────────────────────────────────────────────────
|
|
136
|
+
|
|
137
|
+
def test_relationship_triple_match():
    """A relationship whose endpoints agree matches even when the type is worded differently."""
    g = [{"from": "Jane", "to": "Acme Corp", "type": "works at"}]
    p = [{"from": "Jane", "to": "Acme", "type": "employed by"}]
    prf = ed.diff_axis(g, p, "relationships")
    assert prf.n_gold == 1 and prf.n_pred == 1
    # "from" is identical (1.0) and "to" differs only by an org suffix (>= 0.99,
    # pinned by the sim tests above), so the three-term mean is at least ~0.66
    # whatever the type similarity is — above REL_THRESHOLD (0.60). Counting
    # items alone would let a broken scorer pass, so pin the match itself.
    assert prf.matched == 1
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
# ── full diff + per-category + filtered ──────────────────────────────────
|
|
146
|
+
|
|
147
|
+
def test_diff_extractions_per_category_and_filtered():
    """The full diff reports per-axis, per-category and category-filtered results."""
    gold_decision = {
        "category": "decision",
        "subject": "Acme",
        "predicate": "will renew",
        "object": "contract",
        "statement": "Acme will renew the contract.",
    }
    gold_state = {
        "category": "state",
        "subject": "Acme",
        "predicate": "is",
        "object": "a customer",
        "statement": "Acme is a customer.",
    }
    pred_decision = {
        "category": "decision",
        "subject": "Acme",
        "predicate": "renews",
        "object": "the contract",
        "statement": "Acme renews its contract.",
    }
    gold = {
        "entities": [{"name": "Acme", "type": "org"}],
        "facts": [gold_decision, gold_state],
        "relationships": [],
    }
    pred = {
        "entities": [{"name": "Acme Corp", "type": "org"}],
        "facts": [pred_decision],
        "relationships": [],
    }
    out = ed.diff_extractions(gold, pred, fact_categories={"decision", "commitment"})
    assert out["entities"]["matched"] == 1
    # per-category: decision matched 1/1; state missing (recall 0)
    by_category = out["facts_by_category"]
    assert by_category["decision"]["matched"] == 1
    assert by_category["state"]["recall"] == 0.0
    # filtered to decision/commitment: 1 gold, 1 pred, 1 matched → perfect
    filtered = out["facts_filtered"]
    assert filtered["recall"] == 1.0
    assert filtered["precision"] == 1.0
    assert out["facts_filtered_categories"] == ["commitment", "decision"]
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
def test_diff_extractions_empty_both():
    """Two empty extractions agree perfectly on the facts axis."""
    gold = {"entities": [], "facts": [], "relationships": []}
    pred = {"entities": [], "facts": [], "relationships": []}
    out = ed.diff_extractions(gold, pred)
    assert out["facts"]["f1"] == 1.0
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
"""Guard tests for the distiller system-prompt content rules.
|
|
2
|
+
|
|
3
|
+
Pins that the email-discipline + entity-separation rules (this change) and the
|
|
4
|
+
#126 modality/attribution rules are present in BOTH prompt variants — a cheap
|
|
5
|
+
regression guard so a future prompt edit can't silently drop them.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import importlib.util
|
|
11
|
+
import sys
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
|
|
14
|
+
import pytest
|
|
15
|
+
|
|
16
|
+
_THIS = Path(__file__).resolve().parent
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _load(name="extractor_async_worker_prompts"):
    """Load worker.py from this directory as module ``name`` and return it."""
    worker_path = _THIS / "worker.py"
    spec = importlib.util.spec_from_file_location(name, worker_path)
    module = importlib.util.module_from_spec(spec)
    # Registered under its name before the module body is executed.
    sys.modules[name] = module
    spec.loader.exec_module(module)
    return module
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
try:
|
|
28
|
+
worker = _load()
|
|
29
|
+
except ImportError as e:
|
|
30
|
+
pytest.skip(f"extractor-async deps unavailable: {e}", allow_module_level=True)
|
|
31
|
+
|
|
32
|
+
def PROMPTS():
    """Return both system-prompt variants (batch, guided-JSON) from the loaded worker.

    A ``def`` rather than a lambda bound to a name, so failures show a real
    function name in tracebacks; it is still called as ``PROMPTS()``.
    """
    return (worker.BATCH_SYSTEM_PROMPT, worker.GUIDED_JSON_SYSTEM_PROMPT)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def test_email_discipline_in_both_prompts():
    """Both prompt variants carry the email-discipline rules."""
    required = (
        "An email address is NOT a person",
        "reservations@",  # role/transactional examples present
        "clearly THEIRS",  # bystander-attachment ban
    )
    for prompt in PROMPTS():
        for needle in required:
            assert needle in prompt
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def test_distinct_entities_rule_in_both_prompts():
    """Both prompt variants carry the entity-separation rules."""
    required = (
        "DISTINCT ENTITIES",
        "Acme & Globex",  # conflation-split example
        "warehouse",  # generic-token suppression
    )
    for prompt in PROMPTS():
        for needle in required:
            assert needle in prompt
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def test_126_rules_not_regressed():
    """Both prompt variants still carry the #126 modality and attribution headings."""
    headings = ("TENSE & MODALITY", "ATTRIBUTION FIDELITY")
    for prompt in PROMPTS():
        for heading in headings:
            assert heading in prompt
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def test_active_prompt_carries_the_rules_and_fresh_hash():
    """The active prompt carries the new rules and the prompt hash is 16 characters."""
    active = worker.ACTIVE_SYSTEM_PROMPT
    for needle in ("An email address is NOT a person", "DISTINCT ENTITIES"):
        assert needle in active
    # hash recomputed off ACTIVE prompt
    assert len(worker.SYSTEM_PROMPT_HASH) == 16
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
"""Unit tests for the distillation_queue attempts/retry accounting.
|
|
2
|
+
|
|
3
|
+
Regression guard for the lease-reclaim bug (gotcha #11): claiming must NOT
|
|
4
|
+
consume the retry budget — only genuine processing failures do — so a worker
|
|
5
|
+
restart (deploy recreating the container) can re-claim stranded in-flight work
|
|
6
|
+
indefinitely instead of stranding it in `claimed` forever. The DB-touching
|
|
7
|
+
claim/release/fail SQL isn't unit-testable here (no DB in this suite), but the
|
|
8
|
+
give-up decision is pure logic, so we pin it.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import importlib.util
|
|
14
|
+
from pathlib import Path
|
|
15
|
+
|
|
16
|
+
import pytest
|
|
17
|
+
|
|
18
|
+
_THIS = Path(__file__).resolve().parent
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _load_worker(name: str = "extractor_async_worker_qa"):
    """Load worker.py from this directory as module ``name`` and return it.

    The module is registered in ``sys.modules`` before its body runs, as the
    importlib "importing a source file directly" recipe does, so code that
    resolves ``sys.modules[cls.__module__]`` during import (e.g. @dataclass)
    finds it. If execution fails, the half-initialised entry is removed again
    and the original exception propagates.
    """
    import sys  # local import: this test module does not import sys at file level

    spec = importlib.util.spec_from_file_location(name, _THIS / "worker.py")
    assert spec and spec.loader
    mod = importlib.util.module_from_spec(spec)
    sys.modules[name] = mod
    try:
        spec.loader.exec_module(mod)
    except BaseException:
        # Don't leave a partially executed module importable by name.
        sys.modules.pop(name, None)
        raise
    return mod
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
try:
|
|
30
|
+
worker = _load_worker()
|
|
31
|
+
except ImportError as e:
|
|
32
|
+
pytest.skip(f"extractor-async deps unavailable: {e}", allow_module_level=True)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def test_attempts_exhausted_gives_exactly_max_genuine_tries(monkeypatch) -> None:
    """`attempts` is the count of PRIOR genuine failures at claim time. With
    MAX_ATTEMPTS=3 the sequence is: fail#1 (attempts=0)→retry, fail#2
    (attempts=1)→retry, fail#3 (attempts=2)→terminal. Exactly 3 tries."""
    monkeypatch.setattr(worker, "MAX_ATTEMPTS", 3)
    expected = {
        0: False,  # 1st failure → retry
        1: False,  # 2nd failure → retry
        2: True,  # 3rd failure → give up
        3: True,
    }
    for prior_failures, terminal in expected.items():
        assert worker._attempts_exhausted(prior_failures) is terminal
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def test_attempts_exhausted_respects_max(monkeypatch) -> None:
    """The give-up point follows whatever MAX_ATTEMPTS is currently set to."""
    monkeypatch.setattr(worker, "MAX_ATTEMPTS", 1)
    # single try, no retry
    assert worker._attempts_exhausted(0) is True

    monkeypatch.setattr(worker, "MAX_ATTEMPTS", 5)
    for prior_failures, terminal in ((3, False), (4, True)):
        assert worker._attempts_exhausted(prior_failures) is terminal
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def test_claim_sql_does_not_increment_attempts() -> None:
    """The fix: claiming must not touch `attempts` (only release/fail do). Guard
    against a regression that reintroduces the increment at claim time. We check
    the source of claim_next_batch rather than execute it (no DB here)."""
    import inspect

    claim_source = inspect.getsource(worker.claim_next_batch)
    # the claim UPDATE must not bump attempts; the only attempts reference is the
    # eligibility predicate `attempts < %s`.
    bumps_attempts = "attempts = attempts + 1" in claim_source
    assert not bumps_attempts
    # eligibility gate still present
    assert "attempts <" in claim_source
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def test_release_and_fail_increment_attempts() -> None:
    """Both failure paths (release_claim, mark_failed) increment `attempts` in their source."""
    import inspect

    for failure_path in (worker.release_claim, worker.mark_failed):
        assert "attempts = attempts + 1" in inspect.getsource(failure_path)
|
|
@@ -284,6 +284,38 @@ and NEVER drop a field.
|
|
|
284
284
|
object MAY be an entity name OR a literal string OR `-` if absent.
|
|
285
285
|
statement ≤ 140 characters, a self-contained sentence.
|
|
286
286
|
WORKED EXAMPLE: `FCT|commitment|Timothy Bradley|agreed to|SAFE amendments|Timothy confirmed the SAFE amendments are set (14 May 2026)`
|
|
287
|
+
- TENSE & MODALITY — never record the future or the merely-planned as done:
|
|
288
|
+
* A SCHEDULED or FUTURE event (a calendar invite, a meeting/call dated \
|
|
289
|
+
later than this event, a recurring-meeting instance) is UPCOMING, not \
|
|
290
|
+
completed. NEVER emit "attended / hosted / met / reported / decided" for a \
|
|
291
|
+
meeting that has not happened — use category=commitment with predicate "is \
|
|
292
|
+
scheduled to" / "plans to".
|
|
293
|
+
* A PLAN, PROPOSAL, INTENT or next-step ("will", "plans to", "aims to", \
|
|
294
|
+
"to organise", "we'll", "next steps:") is category=commitment, NOT state and \
|
|
295
|
+
NOT a completed act. Keep "will send" as a commitment — never record it as "sent".
|
|
296
|
+
- ATTRIBUTION FIDELITY — the subject must be who the source actually credits:
|
|
297
|
+
* Attribute a document/deck/message's content to a person ONLY if the source \
|
|
298
|
+
names them as its author or speaker. Do NOT attribute an unauthored doc, agenda \
|
|
299
|
+
or deck to whoever it merely mentions or whoever shared it.
|
|
300
|
+
* Do NOT promote a meeting ATTENDEE to organiser/host without explicit evidence.
|
|
301
|
+
* Do NOT attribute an ORGANISATION's activity (deals, intros, pipeline) to an \
|
|
302
|
+
individual person.
|
|
303
|
+
- IDENTITY & EMAILS — do NOT infer a person's employer/affiliation from an email \
|
|
304
|
+
signature or a company's standard footer/boilerplate; affiliation needs an \
|
|
305
|
+
explicit statement in the body. \
|
|
306
|
+
An email address is NOT a person: NEVER emit an entity whose name is an email \
|
|
307
|
+
address, and NEVER treat a role/transactional address (reservations@, bookings@, \
|
|
308
|
+
no-reply@, info@, office@, marketing@, support@, notifications@, admin@) as a \
|
|
309
|
+
person. Attach an email to a person ONLY when it is clearly THEIRS (the author, \
|
|
310
|
+
or a local-part matching their name) — NEVER attach an address that merely \
|
|
311
|
+
co-occurs in the same thread, CC list, or document (a hotel booking, a \
|
|
312
|
+
newsletter, an unrelated contact).
|
|
313
|
+
- DISTINCT ENTITIES — two names joined by "and" / "&" / "/" are TWO separate \
|
|
314
|
+
entities, never one merged node ("Acme & Globex" → emit Acme AND Globex). Do NOT \
|
|
315
|
+
turn a sentence fragment or phrase into an entity, and do NOT mint generic \
|
|
316
|
+
infrastructure / environment tokens (prod, staging, preview, UAT, warehouse, \
|
|
317
|
+
main, PRD, CRM, platform, localhost) as entities — they are not named people, \
|
|
318
|
+
orgs, products, or projects.
|
|
287
319
|
- REL lines have exactly 4 fields: `REL`, from, to, rel_type.
|
|
288
320
|
from and to MUST be entity names declared in THIS event's ENT lines.
|
|
289
321
|
rel_type is a short verb / preposition phrase.
|
|
@@ -337,6 +369,38 @@ observation, preference}.
|
|
|
337
369
|
WORKED EXAMPLE: {"category": "commitment", "subject": "Timothy \
|
|
338
370
|
Bradley", "predicate": "agreed to", "object": "SAFE amendments", \
|
|
339
371
|
"statement": "Timothy confirmed the SAFE amendments are set (14 May 2026)"}
|
|
372
|
+
- TENSE & MODALITY — never record the future or the merely-planned as done:
|
|
373
|
+
* A SCHEDULED or FUTURE event (a calendar invite, a meeting/call dated \
|
|
374
|
+
later than this event, a recurring-meeting instance) is UPCOMING, not \
|
|
375
|
+
completed. NEVER emit "attended / hosted / met / reported / decided" for a \
|
|
376
|
+
meeting that has not happened — use category=commitment with predicate "is \
|
|
377
|
+
scheduled to" / "plans to".
|
|
378
|
+
* A PLAN, PROPOSAL, INTENT or next-step ("will", "plans to", "aims to", \
|
|
379
|
+
"to organise", "we'll", "next steps:") is category=commitment, NOT state and \
|
|
380
|
+
NOT a completed act. Keep "will send" as a commitment — never record it as "sent".
|
|
381
|
+
- ATTRIBUTION FIDELITY — the subject must be who the source actually credits:
|
|
382
|
+
* Attribute a document/deck/message's content to a person ONLY if the source \
|
|
383
|
+
names them as its author or speaker. Do NOT attribute an unauthored doc, agenda \
|
|
384
|
+
or deck to whoever it merely mentions or whoever shared it.
|
|
385
|
+
* Do NOT promote a meeting ATTENDEE to organiser/host without explicit evidence.
|
|
386
|
+
* Do NOT attribute an ORGANISATION's activity (deals, intros, pipeline) to an \
|
|
387
|
+
individual person.
|
|
388
|
+
- IDENTITY & EMAILS — do NOT infer a person's employer/affiliation from an email \
|
|
389
|
+
signature or a company's standard footer/boilerplate; affiliation needs an \
|
|
390
|
+
explicit statement in the body. \
|
|
391
|
+
An email address is NOT a person: NEVER emit an entity whose name is an email \
|
|
392
|
+
address, and NEVER treat a role/transactional address (reservations@, bookings@, \
|
|
393
|
+
no-reply@, info@, office@, marketing@, support@, notifications@, admin@) as a \
|
|
394
|
+
person. Attach an email to a person ONLY when it is clearly THEIRS (the author, \
|
|
395
|
+
or a local-part matching their name) — NEVER attach an address that merely \
|
|
396
|
+
co-occurs in the same thread, CC list, or document (a hotel booking, a \
|
|
397
|
+
newsletter, an unrelated contact).
|
|
398
|
+
- DISTINCT ENTITIES — two names joined by "and" / "&" / "/" are TWO separate \
|
|
399
|
+
entities, never one merged node ("Acme & Globex" → emit Acme AND Globex). Do NOT \
|
|
400
|
+
turn a sentence fragment or phrase into an entity, and do NOT mint generic \
|
|
401
|
+
infrastructure / environment tokens (prod, staging, preview, UAT, warehouse, \
|
|
402
|
+
main, PRD, CRM, platform, localhost) as entities — they are not named people, \
|
|
403
|
+
orgs, products, or projects.
|
|
340
404
|
- relationships: "from" and "to" MUST be entity names declared in THIS \
|
|
341
405
|
event's "entities". "type" is a short verb / preposition phrase.
|
|
342
406
|
- HARD CAPS per event: 8 entities, 6 facts, 6 relationships. Pick the \
|
|
@@ -1905,8 +1969,16 @@ def claim_next_batch(conn: psycopg.Connection) -> list[dict[str, Any]]:
|
|
|
1905
1969
|
status = 'claimed',
|
|
1906
1970
|
claimed_by = %s,
|
|
1907
1971
|
claimed_at = NOW(),
|
|
1908
|
-
claim_expires_at = NOW() + (%s || ' seconds')::interval
|
|
1909
|
-
|
|
1972
|
+
claim_expires_at = NOW() + (%s || ' seconds')::interval
|
|
1973
|
+
-- NB: claiming does NOT increment `attempts`. `attempts` counts
|
|
1974
|
+
-- genuine PROCESSING failures (release_claim / mark_failed), not
|
|
1975
|
+
-- claim-grabs. A worker that dies mid-batch (e.g. a deploy
|
|
1976
|
+
-- recreates the container) leaves its rows in `claimed`; the lease
|
|
1977
|
+
-- expires and they are re-claimed here WITHOUT burning the retry
|
|
1978
|
+
-- budget — so restarts can't strand in-flight work. (Pre-fix, the
|
|
1979
|
+
-- increment lived here and ~3 deploys could push a row to
|
|
1980
|
+
-- attempts=MAX, making it forever-ineligible for reclaim AND never
|
|
1981
|
+
-- marked failed → orphaned in `claimed`. See gotcha #11.)
|
|
1910
1982
|
WHERE id IN (
|
|
1911
1983
|
SELECT id FROM distillation_queue
|
|
1912
1984
|
WHERE (
|
|
@@ -1943,14 +2015,21 @@ def mark_done(conn: psycopg.Connection, queue_id: int) -> None:
|
|
|
1943
2015
|
|
|
1944
2016
|
|
|
1945
2017
|
def mark_failed(conn: psycopg.Connection, queue_id: int, error: str) -> None:
    """Mark a queue row as terminally failed and count the attempt.

    Sets status to 'failed', increments `attempts`, and stores the first 1024
    characters of ``error`` as `last_error`.
    """
    # Terminal genuine-failure path → count the attempt (claiming no longer
    # does; see claim_next_batch), so a failed row's `attempts` reflects the
    # true number of processing attempts.
    sql = (
        "UPDATE distillation_queue SET status = 'failed', "
        "attempts = attempts + 1, last_error = %s WHERE id = %s"
    )
    with conn.cursor() as cur:
        params = (error[:1024], queue_id)
        cur.execute(sql, params)
|
|
1951
2027
|
|
|
1952
2028
|
|
|
1953
2029
|
def release_claim(conn: psycopg.Connection, queue_id: int, error: str) -> None:
|
|
2030
|
+
# Recoverable genuine-failure path (will retry) → count the attempt. This is
|
|
2031
|
+
# where the retry budget is spent — NOT at claim time — so a deploy-induced
|
|
2032
|
+
# reclaim never consumes it.
|
|
1954
2033
|
with conn.cursor() as cur:
|
|
1955
2034
|
cur.execute(
|
|
1956
2035
|
"""
|
|
@@ -1959,6 +2038,7 @@ def release_claim(conn: psycopg.Connection, queue_id: int, error: str) -> None:
|
|
|
1959
2038
|
claimed_by = NULL,
|
|
1960
2039
|
claimed_at = NULL,
|
|
1961
2040
|
claim_expires_at = NULL,
|
|
2041
|
+
attempts = attempts + 1,
|
|
1962
2042
|
last_error = %s
|
|
1963
2043
|
WHERE id = %s
|
|
1964
2044
|
""",
|
|
@@ -1966,6 +2046,15 @@ def release_claim(conn: psycopg.Connection, queue_id: int, error: str) -> None:
|
|
|
1966
2046
|
)
|
|
1967
2047
|
|
|
1968
2048
|
|
|
2049
|
+
def _attempts_exhausted(attempts: int) -> bool:
    """Decide whether the current processing failure is terminal.

    ``attempts`` is the row's value at claim time, i.e. the number of PRIOR
    genuine failures (claiming does not increment it). The failure being
    handled is therefore try number ``attempts + 1``. Returns True (caller
    should mark_failed rather than release_claim) once that try number reaches
    MAX_ATTEMPTS, which allows exactly MAX_ATTEMPTS genuine tries per row.
    """
    this_try = attempts + 1
    return this_try >= MAX_ATTEMPTS
|
|
2056
|
+
|
|
2057
|
+
|
|
1969
2058
|
# --------------------------------------------------------------------
|
|
1970
2059
|
# Main loop
|
|
1971
2060
|
# --------------------------------------------------------------------
|
|
@@ -2142,7 +2231,7 @@ async def _run_teacher(
|
|
|
2142
2231
|
log.warning(
|
|
2143
2232
|
f"extraction failed queue_id={queue_id} attempts={attempts}: {err}"
|
|
2144
2233
|
)
|
|
2145
|
-
if attempts
|
|
2234
|
+
if _attempts_exhausted(attempts):
|
|
2146
2235
|
mark_failed(conn, queue_id, err)
|
|
2147
2236
|
else:
|
|
2148
2237
|
release_claim(conn, queue_id, err)
|
|
@@ -2254,7 +2343,7 @@ def _apply_extraction(
|
|
|
2254
2343
|
log.warning(
|
|
2255
2344
|
f"db upsert failed queue_id={queue_id} attempts={attempts}: {err}"
|
|
2256
2345
|
)
|
|
2257
|
-
if attempts
|
|
2346
|
+
if _attempts_exhausted(attempts):
|
|
2258
2347
|
mark_failed(conn, queue_id, err)
|
|
2259
2348
|
else:
|
|
2260
2349
|
release_claim(conn, queue_id, err)
|