@pentatonic-ai/ai-agent-sdk 0.10.4 → 0.10.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -41,6 +41,14 @@ import psycopg.rows
41
41
 
42
42
  from confidence import corroborated_confidence
43
43
  from entity_id import entity_id, normalize_surface_form
44
+ from extraction_schema import (
45
+ ALLOWED_ENT_TYPES,
46
+ ALLOWED_FCT_CATEGORIES,
47
+ EXTRACTION_SCHEMA,
48
+ MAX_ENTITIES_PER_EVENT,
49
+ MAX_FACTS_PER_EVENT,
50
+ MAX_RELATIONSHIPS_PER_EVENT,
51
+ )
44
52
  from noise_filter import is_noise_entity_name
45
53
  from sensitive_filter import SKIP_SENSITIVE_CONTENT, is_sensitive_event
46
54
 
@@ -90,15 +98,65 @@ DISTILL_TRACE_ENABLED = os.environ.get(
90
98
  # chunk via a JSONDecodeError. Pipe-delimited records, one per line,
91
99
  # recover at line granularity — a malformed line skips itself, the rest
92
100
  # of the chunk lands. See 2026-05-18 ops notes.
101
+ #
102
+ # 2026-06-11 update: guided JSON is back as an OPT-IN second mode
103
+ # (DISTILL_OUTPUT_MODE=guided_json, default "kv" — a no-op until an
104
+ # operator flips it). Both halves of the 2026-05-18 removal rationale
105
+ # are answered this time:
106
+ # (a) the self-hosted Qwen2.5-7B vLLM box enforces structured output
107
+ # via logit masking (xgrammar/outlines) — the model CANNOT emit
108
+ # schema-invalid bytes, unlike the old VL gateway which
109
+ # half-ignored response_format;
110
+ # (b) blast radius is solved structurally — the schema is an array
111
+ # of per-event objects (see extraction_schema.py), so one
112
+ # event's content can't corrupt another's parse; the only
113
+ # residual failure is max_tokens truncation, and
114
+ # _parse_guided_json salvages every complete event object.
115
+ # ALLOWED_ENT_TYPES / ALLOWED_FCT_CATEGORIES now live in
116
+ # extraction_schema.py (imported above) so the schema enums and the
117
+ # KV prompt pin to the same single source.
93
118
  EVENT_HEADER_RE = re.compile(r"^===?\s*event\s+(\d+)\s*===?\s*$", re.IGNORECASE)
94
- ALLOWED_ENT_TYPES = {
95
- "person", "org", "product", "place", "project",
96
- "concept", "topic", "date", "other",
97
- }
98
- ALLOWED_FCT_CATEGORIES = {
99
- "decision", "commitment", "state", "mention",
100
- "observation", "preference",
101
- }
119
+
120
+ # Output mode flag. "kv" (default) keeps today's pipe-delimited path
121
+ # byte-for-byte; "guided_json" switches the prompt, request params and
122
+ # parser. Anything unrecognised falls back to "kv" — fail-safe.
123
+ DISTILL_OUTPUT_MODE = os.environ.get("DISTILL_OUTPUT_MODE", "kv").strip().lower()
124
+ if DISTILL_OUTPUT_MODE not in ("kv", "guided_json"):
125
+ log.warning(
126
+ f"DISTILL_OUTPUT_MODE={DISTILL_OUTPUT_MODE!r} unrecognised — using 'kv'"
127
+ )
128
+ DISTILL_OUTPUT_MODE = "kv"
129
+
130
+ # How the structured-output schema is attached to the request in
131
+ # guided_json mode. The repo carries no pin for the engine box's vLLM
132
+ # version, so this is operator-selectable:
133
+ # - "response_format" (default): OpenAI-style
134
+ # response_format={"type":"json_schema","json_schema":{...}} —
135
+ # supported by vLLM >= 0.6.x OpenAI-compat server.
136
+ # - "guided_json": vLLM's legacy extension param (top-level
137
+ # `guided_json` in the request body; what openai-client users pass
138
+ # via extra_body). FALLBACK for older vLLM builds that predate
139
+ # json_schema response_format.
140
+ # Exactly one is sent — some vLLM versions reject requests that carry
141
+ # both guided-decoding params at once.
142
+ DISTILL_GUIDED_PARAM_STYLE = os.environ.get(
143
+ "DISTILL_GUIDED_PARAM_STYLE", "response_format"
144
+ ).strip().lower()
145
+ if DISTILL_GUIDED_PARAM_STYLE not in ("response_format", "guided_json"):
146
+ log.warning(
147
+ f"DISTILL_GUIDED_PARAM_STYLE={DISTILL_GUIDED_PARAM_STYLE!r} unrecognised "
148
+ f"— using 'response_format'"
149
+ )
150
+ DISTILL_GUIDED_PARAM_STYLE = "response_format"
151
+
152
+ # JSON output carries structural overhead (braces, quotes, key names)
153
+ # the KV format doesn't, so guided mode gets its own per-event token
154
+ # budget. Truncation is guided mode's ONLY parse-failure mode (the
155
+ # schema enforcer guarantees validity up to the cut), so this errs
156
+ # higher than the KV 300.
157
+ LLM_MAX_TOKENS_PER_EVENT_JSON = int(
158
+ os.environ.get("LLM_MAX_TOKENS_PER_EVENT_JSON", "400")
159
+ )
102
160
 
103
161
 
104
162
  # --------------------------------------------------------------------
@@ -165,10 +223,71 @@ A whole file is one entity, not twenty.
165
223
  - Output ONLY the formatted records. No header, no footer, no prose."""
166
224
 
167
225
 
226
+ # Guided-JSON variant of BATCH_SYSTEM_PROMPT. Same CONTENT rules
227
+ # (conservatism, per-event caps, code-content rule, subject-must-be-a-
228
+ # declared-entity, email-alias pairing, statement <= 140 chars, never
229
+ # skip an event) — only the output-format scaffolding changes. The
230
+ # pipe-format anchoring ("COUNT THE PIPES", pipe/newline substitution)
231
+ # is dropped: vLLM's guided decoding enforces the schema mechanically,
232
+ # so the prompt no longer needs to beg for format compliance, and JSON
233
+ # string escaping makes the pipe/newline substitution rules moot.
234
+ GUIDED_JSON_SYSTEM_PROMPT = """You extract structured knowledge from N \
235
+ events for a personal-memory graph.
236
+
237
+ You will receive N events, each prefixed with `[event K]`. Respond \
238
+ with a single JSON object: {"events": [...]} containing one object \
239
+ per input event. Be conservative — only emit things explicitly stated.
240
+
241
+ Each per-event object has:
242
+ "index": the zero-indexed event number, matching the input `[event K]`.
243
+ "entities": array of {"name", "type", "email"?}.
244
+ "facts": array of {"category", "subject", "predicate", "object", "statement"}.
245
+ "relationships": array of {"from", "to", "type"}.
246
+
247
+ RULES:
248
+ - NEVER skip an event — if an event has nothing to extract, emit its \
249
+ object with "index" set and empty arrays.
250
+ - entities: type ∈ {person, org, product, place, project, concept, \
251
+ topic, date, other}.
252
+ email (OPTIONAL, person only): when the event body or attributes
253
+ show an email address that unambiguously identifies the person,
254
+ include it. This pairs the name+email forms so a later event seeing
255
+ only the email resolves to the same entity. Omit the key otherwise.
256
+ - facts: category ∈ {decision, commitment, state, mention, \
257
+ observation, preference}.
258
+ subject MUST be an entity name declared in THIS event's "entities".
259
+ predicate is a short verb phrase (e.g. "agreed to", "owns", "works at").
260
+ object MAY be an entity name OR a literal string OR null if absent.
261
+ statement ≤ 140 characters, a self-contained sentence.
262
+ WORKED EXAMPLE: {"category": "commitment", "subject": "Timothy \
263
+ Bradley", "predicate": "agreed to", "object": "SAFE amendments", \
264
+ "statement": "Timothy confirmed the SAFE amendments are set (14 May 2026)"}
265
+ - relationships: "from" and "to" MUST be entity names declared in THIS \
266
+ event's "entities". "type" is a short verb / preposition phrase.
267
+ - HARD CAPS per event: 8 entities, 6 facts, 6 relationships. Pick the \
268
+ most salient.
269
+ - For code / technical content: extract only top-level services, \
270
+ modules, or domain concepts. NOT variables, types, or method names. \
271
+ A whole file is one entity, not twenty.
272
+ - Output ONLY the JSON object. No markdown fences, no prose."""
273
+
274
+
275
+ # The system prompt actually sent to the LLM under the current output
276
+ # mode. Everything downstream (request body, trace fingerprint) hangs
277
+ # off this so the two can never disagree.
278
+ ACTIVE_SYSTEM_PROMPT = (
279
+ GUIDED_JSON_SYSTEM_PROMPT
280
+ if DISTILL_OUTPUT_MODE == "guided_json"
281
+ else BATCH_SYSTEM_PROMPT
282
+ )
283
+
168
284
  # Teacher-prompt fingerprint for trace logging. If the prompt changes,
169
285
  # the hash changes — lets training-data exports filter by teacher
170
- # version so we never mix outputs from a retired prompt.
171
- SYSTEM_PROMPT_HASH = hashlib.sha256(BATCH_SYSTEM_PROMPT.encode()).hexdigest()[:16]
286
+ # version so we never mix outputs from a retired prompt. Computed from
287
+ # the ACTIVE prompt, so flipping DISTILL_OUTPUT_MODE auto-segments
288
+ # distillation_traces into a new teacher version (KV-format traces and
289
+ # guided-JSON traces never mix in a training export).
290
+ SYSTEM_PROMPT_HASH = hashlib.sha256(ACTIVE_SYSTEM_PROMPT.encode()).hexdigest()[:16]
172
291
 
173
292
 
174
293
  # --------------------------------------------------------------------
@@ -353,14 +472,232 @@ def _split_event_blocks(text: str, expected_n: int) -> list[str]:
353
472
  return slices
354
473
 
355
474
 
475
+ # --------------------------------------------------------------------
476
+ # Guided-JSON parsing (DISTILL_OUTPUT_MODE=guided_json)
477
+ # --------------------------------------------------------------------
478
+
479
+
480
+ def _load_guided_payload(text: str) -> dict[str, Any] | None:
481
+ """Parse the guided-JSON chunk output into the {"events": [...]}
482
+ payload, salvaging what's complete if the output was truncated.
483
+
484
+ Under guided decoding the server's logit masking guarantees every
485
+ emitted byte is schema-consistent, so the ONLY way the payload can
486
+ fail to parse is max_tokens truncation mid-stream. Salvage is
487
+ therefore simple and structural: walk back to the last complete
488
+ `}` (the close of the last fully-emitted event object), close the
489
+ events array + root object, and re-parse. Each step back drops at
490
+ most one (incomplete) event — per-event degradation, never
491
+ chunk-level loss. Returns None if nothing parseable remains."""
492
+ raw = (text or "").strip()
493
+ if not raw:
494
+ return None
495
+ # Defensive fence strip — can't occur under guided decoding, but
496
+ # the bake-off script replays this parser over unguided output too.
497
+ if raw.startswith("```"):
498
+ raw = raw.strip("`").strip()
499
+ if raw.lower().startswith("json"):
500
+ raw = raw[4:].lstrip()
501
+ try:
502
+ payload = json.loads(raw)
503
+ return payload if isinstance(payload, dict) else None
504
+ except json.JSONDecodeError:
505
+ pass
506
+ # Truncated: trim to the last complete `}` of the events array and
507
+ # close the structure. Walk back through `}` occurrences until a
508
+ # candidate parses (bounded — each iteration discards at least one
509
+ # char, and 200 closing braces covers far more events than a chunk
510
+ # can hold).
511
+ end = len(raw)
512
+ for _ in range(200):
513
+ idx = raw.rfind("}", 0, end)
514
+ if idx < 0:
515
+ return None
516
+ candidate = raw[: idx + 1] + "]}"
517
+ try:
518
+ payload = json.loads(candidate)
519
+ return payload if isinstance(payload, dict) else None
520
+ except json.JSONDecodeError:
521
+ end = idx
522
+ return None
523
+
524
+
525
+ def _resolve_event_index(ev: dict[str, Any], pos: int, expected_n: int) -> int | None:
526
+ """Map a parsed event object to its result slot. Trust the model's
527
+ "index" field when it's a valid in-range int (it mirrors the
528
+ `[event K]` input header); fall back to array position otherwise.
529
+ None = undeliverable (both out of range) — the object is dropped
530
+ without corrupting any other event's slot."""
531
+ idx = ev.get("index")
532
+ if isinstance(idx, int) and not isinstance(idx, bool) and 0 <= idx < expected_n:
533
+ return idx
534
+ if 0 <= pos < expected_n:
535
+ return pos
536
+ return None
537
+
538
+
539
def _guided_entity(item: Any) -> dict[str, Any] | None:
    """Normalise one raw "entities" item; None when it is unusable."""
    if not isinstance(item, dict):
        return None
    name = str(item.get("name") or "").strip()
    if not name:
        return None
    entity: dict[str, Any] = {
        "type": str(item.get("type") or "").strip().lower(),
        "name": name,
    }
    # Mirror the KV 4th-field rule: promote into aliases only when the
    # value actually looks like an email; drop junk silently.
    email = item.get("email")
    if isinstance(email, str):
        email = email.strip()
        if email and "@" in email and " " not in email:
            entity["aliases"] = [email]
    return entity


def _guided_fact(item: Any) -> dict[str, Any] | None:
    """Normalise one raw "facts" item; None when it has no statement."""
    if not isinstance(item, dict):
        return None
    statement = str(item.get("statement") or "").strip()
    if not statement:
        return None
    obj = item.get("object")
    obj = obj.strip() if isinstance(obj, str) else None
    return {
        "category": str(item.get("category") or "").strip().lower(),
        "subject": str(item.get("subject") or "").strip(),
        "predicate": str(item.get("predicate") or "").strip(),
        # Placeholder spellings of "no object" collapse to None.
        "object": None if obj in (None, "", "-", "null", "None") else obj,
        "statement": statement,
    }


def _guided_relationship(item: Any) -> dict[str, Any] | None:
    """Normalise one raw "relationships" item; None unless all three
    of from / to / type are non-empty."""
    if not isinstance(item, dict):
        return None
    frm = str(item.get("from") or "").strip()
    to = str(item.get("to") or "").strip()
    rtype = str(item.get("type") or "").strip()
    if not (frm and to and rtype):
        return None
    return {"from": frm, "to": to, "type": rtype}


def _parse_guided_json(text: str, expected_n: int) -> list[dict[str, Any]]:
    """Parse guided-JSON output into per-event extraction dicts.

    Each returned entry has the shape
    ``{"entities": [...], "facts": [...], "relationships": [...]}``,
    with a plausible entity "email" promoted into ``"aliases"``.

    Parsing is defensive: a truncated payload is salvaged by
    _load_guided_payload, non-dict or incomplete items are skipped, and
    string fields are stripped (type/category also lower-cased).

    The per-event hard caps are enforced on the number of VALID items
    accumulated in a result slot, not on a slice of the raw list. That
    way junk items do not use up cap budget, and two event objects that
    resolve to the same slot (a repeated "index", or a positional
    fallback landing on an already-filled slot) cannot push that slot
    past its cap.

    Args:
        text: Raw model output for one chunk.
        expected_n: Number of events that were sent in the chunk.

    Returns:
        A list of exactly ``expected_n`` dicts in input order; events the
        model omitted, or that could not be salvaged, stay empty.
    """
    results: list[dict[str, Any]] = [
        {"entities": [], "facts": [], "relationships": []} for _ in range(expected_n)
    ]
    payload = _load_guided_payload(text)
    events = payload.get("events") if payload is not None else None
    if not isinstance(events, list):
        return results

    # (result key, per-item normaliser, hard cap for the slot)
    sections = (
        ("entities", _guided_entity, MAX_ENTITIES_PER_EVENT),
        ("facts", _guided_fact, MAX_FACTS_PER_EVENT),
        ("relationships", _guided_relationship, MAX_RELATIONSHIPS_PER_EVENT),
    )
    for pos, ev in enumerate(events):
        if not isinstance(ev, dict):
            continue
        idx = _resolve_event_index(ev, pos, expected_n)
        if idx is None:
            continue
        slot = results[idx]
        for key, clean, cap in sections:
            raw_items = ev.get(key)
            if not isinstance(raw_items, list):
                continue
            kept = slot[key]
            for item in raw_items:
                if len(kept) >= cap:
                    break  # slot is full; ignore the remainder
                cleaned = clean(item)
                if cleaned is not None:
                    kept.append(cleaned)
    return results
613
+
614
+
615
def _guided_event_slices(text: str, expected_n: int) -> list[str]:
    """Build the per-event raw strings used for trace logging in guided mode.

    The payload is decoded with _load_guided_payload and every event
    object is serialised again with ``json.dumps(..., ensure_ascii=False)``,
    so key order and non-ASCII characters are kept as decoded.

    Args:
        text: Raw model output for one chunk.
        expected_n: Number of events that were sent in the chunk.

    Returns:
        A list of exactly ``expected_n`` strings. Slots with no matching
        event object hold ``""``; when several objects resolve to the
        same slot, the last one is the one stored.
    """
    raw_by_slot: list[str] = ["" for _ in range(expected_n)]
    root = _load_guided_payload(text)
    event_objs = None if root is None else root.get("events")
    if isinstance(event_objs, list):
        for position, obj in enumerate(event_objs):
            if isinstance(obj, dict):
                slot = _resolve_event_index(obj, position, expected_n)
                if slot is not None:
                    raw_by_slot[slot] = json.dumps(obj, ensure_ascii=False)
    return raw_by_slot
636
+
637
+
638
def _build_request_body(user_prompt: str, n: int) -> dict[str, Any]:
    """Assemble the chat-completions request body for one N-event chunk.

    Nothing is sent from here; the body is derived from the arguments
    and the module-level flags only.

    kv mode: model, system + user messages, temperature 0.0 and
    ``LLM_MAX_TOKENS_PER_EVENT * n`` tokens, with no structured-output
    parameter.

    guided_json mode: the token budget becomes
    ``LLM_MAX_TOKENS_PER_EVENT_JSON * n`` and EXTRACTION_SCHEMA is
    attached through exactly one parameter, selected by
    DISTILL_GUIDED_PARAM_STYLE:
      - "guided_json": a top-level ``guided_json`` key;
      - anything else: an OpenAI-style ``response_format`` of type
        ``json_schema`` (name "memory_extraction", strict).

    Args:
        user_prompt: The concatenated event blocks for this chunk.
        n: Number of events in the chunk; scales ``max_tokens``.

    Returns:
        The request body as a JSON-serialisable dict.
    """
    guided = DISTILL_OUTPUT_MODE == "guided_json"
    per_event_budget = (
        LLM_MAX_TOKENS_PER_EVENT_JSON if guided else LLM_MAX_TOKENS_PER_EVENT
    )
    request: dict[str, Any] = {
        "model": LLM_MODEL,
        "messages": [
            {"role": "system", "content": ACTIVE_SYSTEM_PROMPT},
            {"role": "user", "content": user_prompt},
        ],
        "temperature": 0.0,
        "max_tokens": per_event_budget * n,
    }
    if not guided:
        return request

    # Only one structured-output parameter is ever attached.
    if DISTILL_GUIDED_PARAM_STYLE == "guided_json":
        request["guided_json"] = EXTRACTION_SCHEMA
        return request

    request["response_format"] = {
        "type": "json_schema",
        "json_schema": {
            "name": "memory_extraction",
            "strict": True,
            "schema": EXTRACTION_SCHEMA,
        },
    }
    return request
683
+
684
+
356
685
  async def call_llm_batch(
357
686
  client: httpx.AsyncClient, events: list[dict[str, Any]]
358
687
  ) -> list[dict[str, Any]]:
359
688
  """Send N events in a single chat-completion call, return the list
360
- of per-event extraction dicts in input order. The model emits
361
- pipe-delimited KV records (see BATCH_SYSTEM_PROMPT); the parser is
362
- line-tolerant so a malformed record skips itself rather than
363
- failing the chunk. Raises only on transport failure or completely
689
+ of per-event extraction dicts in input order.
690
+
691
+ kv mode (default): the model emits pipe-delimited KV records (see
692
+ BATCH_SYSTEM_PROMPT); the parser is line-tolerant so a malformed
693
+ record skips itself rather than failing the chunk.
694
+
695
+ guided_json mode: the model emits the EXTRACTION_SCHEMA-constrained
696
+ JSON envelope under server-side guided decoding; the parser
697
+ salvages complete event objects from a truncated stream so failure
698
+ degrades per-event, never per-chunk. Both parsers return the same
699
+ per-event dict shape, so everything downstream of this function is
700
+ mode-agnostic. Raises only on transport failure or completely
364
701
  empty output."""
365
702
  n = len(events)
366
703
  if n == 0:
@@ -378,20 +715,7 @@ async def call_llm_batch(
378
715
  build_event_block(i, ev) for i, ev in enumerate(events)
379
716
  )
380
717
 
381
- body: dict[str, Any] = {
382
- "model": LLM_MODEL,
383
- "messages": [
384
- {"role": "system", "content": BATCH_SYSTEM_PROMPT},
385
- {"role": "user", "content": user_prompt},
386
- ],
387
- "temperature": 0.0,
388
- "max_tokens": LLM_MAX_TOKENS_PER_EVENT * n,
389
- # KV-text output — no guided_json / response_format. The
390
- # benefit of structured-output enforcement was already
391
- # half-ignored by VL upstream, and the parser now recovers
392
- # from per-line drift so the schema enforcement isn't worth
393
- # the JSON brittleness it brought.
394
- }
718
+ body = _build_request_body(user_prompt, n)
395
719
  r = await client.post(LLM_ENDPOINT, json=body, headers=headers)
396
720
  r.raise_for_status()
397
721
  data = r.json()
@@ -400,12 +724,16 @@ async def call_llm_batch(
400
724
  text = data.get("message", {}).get("content", "")
401
725
  if not text:
402
726
  raise RuntimeError(f"llm returned no content: {json.dumps(data)[:300]}")
403
- parsed = _parse_kv_records(text, n)
727
+ if DISTILL_OUTPUT_MODE == "guided_json":
728
+ parsed = _parse_guided_json(text, n)
729
+ slices = _guided_event_slices(text, n)
730
+ else:
731
+ parsed = _parse_kv_records(text, n)
732
+ slices = _split_event_blocks(text, n)
404
733
  # Attach the per-event raw slice so downstream trace logging gets
405
734
  # the model's verbatim output for THIS event without re-splitting
406
735
  # the chunk-level text. Parser semantics are unaffected — the
407
736
  # raw_slice key is ignored by upsert paths.
408
- slices = _split_event_blocks(text, n)
409
737
  for record, slice_text in zip(parsed, slices):
410
738
  record["raw_slice"] = slice_text
411
739
  return parsed
@@ -761,6 +1089,25 @@ SKIP_ATTRIBUTE_SOURCES = set(
761
1089
  )
762
1090
  DISTILL_MAX_AGE_DAYS = int(os.environ.get("DISTILL_MAX_AGE_DAYS", "90"))
763
1091
 
1092
+ # Layer-1 content pre-filter (cascade tier 1 — the cheap deterministic gate in
1093
+ # front of the student/7B). Skips events with NO extractable signal BEFORE the
1094
+ # LLM, so GPU is spent only on text that can yield facts.
1095
+ # - bytes-garbage: a binary doc (raw PDF bytes) stored as text decodes to a
1096
+ # wall of U+FFFD replacement chars. `build_event_block` feeds `content` to
1097
+ # the LLM, so it sees the garbage and extracts nothing (live 2026-06-10:
1098
+ # 35,296/39,453 pentatonic-team `doc` events are bytes-garbage). HIGH
1099
+ # PRECISION — real text effectively never crosses a 5–10% replacement-char
1100
+ # ratio, so this is a zero-quality-loss skip. (Durable fix = extract PDF
1101
+ # text at INGEST; this stops the GPU waste meanwhile.)
1102
+ # - too-short: trivially short content (one-line acks / emoji) has no facts.
1103
+ # Conservative and OFF by default (0) to guarantee zero quality loss; tune
1104
+ # up once layer-2 (the student model) owns the borderline cases.
1105
+ SKIP_BYTES_GARBAGE = os.environ.get(
1106
+ "DISTILL_SKIP_BYTES_GARBAGE", "true"
1107
+ ).strip().lower() not in ("false", "0", "no", "off")
1108
+ GARBAGE_CHAR_RATIO = float(os.environ.get("DISTILL_GARBAGE_CHAR_RATIO", "0.10"))
1109
+ MIN_CONTENT_CHARS = int(os.environ.get("DISTILL_MIN_CONTENT_CHARS", "0"))
1110
+
764
1111
 
765
1112
  def claim_next_batch(conn: psycopg.Connection) -> list[dict[str, Any]]:
766
1113
  """Atomically claim up to BATCH_SIZE pending items. SKIP LOCKED so
@@ -843,6 +1190,43 @@ def claim_next_batch(conn: psycopg.Connection) -> list[dict[str, Any]]:
843
1190
  """,
844
1191
  (DISTILL_MAX_AGE_DAYS, DISTILL_MAX_AGE_DAYS),
845
1192
  )
1193
+ # Pre-filter: bytes-garbage content. A binary doc (raw PDF bytes)
1194
+ # stored as text decodes to mostly U+FFFD (chr(65533)); the LLM
1195
+ # extracts nothing from it. Skip when the replacement-char ratio
1196
+ # exceeds GARBAGE_CHAR_RATIO — real text never crosses it, so no
1197
+ # quality loss. Scoped to the pending set; one cheap UPDATE/cycle.
1198
+ if SKIP_BYTES_GARBAGE:
1199
+ cur.execute(
1200
+ """
1201
+ UPDATE distillation_queue dq SET
1202
+ status = 'done',
1203
+ completed_at = NOW(),
1204
+ last_error = 'filtered: bytes_garbage'
1205
+ FROM events e
1206
+ WHERE dq.event_id = e.id
1207
+ AND dq.status = 'pending'
1208
+ AND length(e.content) > 0
1209
+ AND (length(e.content) - length(replace(e.content, chr(65533), '')))::float
1210
+ / length(e.content) > %s
1211
+ """,
1212
+ (GARBAGE_CHAR_RATIO,),
1213
+ )
1214
+ # Pre-filter: trivially-short content (one-line acks / emoji). OFF by
1215
+ # default (MIN_CONTENT_CHARS=0) so it never costs a fact unless tuned on.
1216
+ if MIN_CONTENT_CHARS > 0:
1217
+ cur.execute(
1218
+ """
1219
+ UPDATE distillation_queue dq SET
1220
+ status = 'done',
1221
+ completed_at = NOW(),
1222
+ last_error = 'filtered: too_short'
1223
+ FROM events e
1224
+ WHERE dq.event_id = e.id
1225
+ AND dq.status = 'pending'
1226
+ AND length(trim(e.content)) < %s
1227
+ """,
1228
+ (MIN_CONTENT_CHARS,),
1229
+ )
846
1230
 
847
1231
  with conn.cursor(row_factory=psycopg.rows.dict_row) as cur:
848
1232
  cur.execute(
@@ -1085,7 +1469,9 @@ async def amain():
1085
1469
  f"endpoint={LLM_ENDPOINT or '(stub)'}, model={LLM_MODEL}, "
1086
1470
  f"poll={POLL_INTERVAL_SEC}s, claim={BATCH_SIZE}, "
1087
1471
  f"events_per_call={EVENTS_PER_LLM_CALL}, "
1088
- f"concurrent_calls={CONCURRENT_LLM_CALLS})"
1472
+ f"concurrent_calls={CONCURRENT_LLM_CALLS}, "
1473
+ f"output_mode={DISTILL_OUTPUT_MODE}, "
1474
+ f"prompt_hash={SYSTEM_PROMPT_HASH})"
1089
1475
  )
1090
1476
  stub_mode = not LLM_ENDPOINT
1091
1477
  if stub_mode: