@pentatonic-ai/ai-agent-sdk 0.10.8 → 0.10.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -878,7 +878,7 @@ function fireAndForgetEmit(clientConfig, sessionOpts, messages, result, model) {
878
878
  }
879
879
 
880
880
  // src/telemetry.js
881
- var VERSION = "0.10.8";
881
+ var VERSION = "0.10.10";
882
882
  var TELEMETRY_URL = "https://sdk-telemetry.philip-134.workers.dev";
883
883
  function machineId() {
884
884
  const raw = typeof process !== "undefined" ? `${process.env?.USER || process.env?.USERNAME || "u"}:${process.platform || "x"}:${process.arch || "x"}` : "browser";
package/dist/index.js CHANGED
@@ -847,7 +847,7 @@ function fireAndForgetEmit(clientConfig, sessionOpts, messages, result, model) {
847
847
  }
848
848
 
849
849
  // src/telemetry.js
850
- var VERSION = "0.10.8";
850
+ var VERSION = "0.10.10";
851
851
  var TELEMETRY_URL = "https://sdk-telemetry.philip-134.workers.dev";
852
852
  function machineId() {
853
853
  const raw = typeof process !== "undefined" ? `${process.env?.USER || process.env?.USERNAME || "u"}:${process.platform || "x"}:${process.arch || "x"}` : "browser";
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@pentatonic-ai/ai-agent-sdk",
3
- "version": "0.10.8",
3
+ "version": "0.10.10",
4
4
  "description": "TES SDK — LLM observability and lifecycle tracking via Pentatonic Thing Event System. Track token usage, tool calls, and conversations. Manage things through event-sourced lifecycle stages with AI enrichment and vector search.",
5
5
  "type": "module",
6
6
  "main": "./dist/index.cjs",
@@ -6,15 +6,21 @@
6
6
  > low-value, and junk nodes out of existence (vertical aging). Named for the drive that
7
7
  > does the fusing — the decay pass rides the same engine.
8
8
 
9
- **Status:** spec + initial implementation (PR #92) — 2026-06-12. Implemented: salience
10
- scoring + decay, **eviction** (`fusion_drive_decay.py --evict`, reversible via
11
- `node_evictions`), and **fusion** of exact + cross-run-shared-provenance entity dupes and
12
- exact-triple fact dupes (`fusion_drive_fuse.py --apply`, reversible via `entity_merges`/
13
- `fact_merges`), with scored directory-anchored master selection. All arena-scoped,
14
- dry-run-default, transactional, audited. TODO (later PRs): embedding-band + LLM-adjudicated
15
- detection (in `entity_resolution_v2.py`), semantic fact fusion, authority-table wiring for
16
- canonical scoring, continuous scheduling, and a half-life/threshold calibration pass before
17
- `--evict` runs in prod.
9
+ **Status:** spec + implementation (PR #92, then completion PR) — 2026-06-13.
10
+ **Implemented:** salience scoring + decay; **eviction** (`fusion_drive_decay.py --evict`,
11
+ reversible via `node_evictions`); **entity AND relationship decay**; **fusion** of exact +
12
+ cross-run-shared-provenance entity dupes and exact-triple fact dupes, plus an **LLM
13
+ adjudication tier via the in-VPC distiller** (Qwen3.6 NO egress) for ambiguous cross-run
14
+ entities and semantic (same-assertion-different-words) facts; **authority signals** wired
15
+ into canonical scoring (`grounded` = name verbatim in a provenance event;
16
+ `from_current_teacher` = `distillation_traces.llm_model`); **born-salience** in BOTH the
17
+ async distiller and the sync extractor (+ backfill for existing rows); **continuous
18
+ scheduling** (the `fusion-drive-sweep` 6h timer — dry-run-default, never `--evict` from
19
+ cron). All arena-scoped, dry-run-default, transactional, reversible, audited.
20
+ **Remaining:** `in_directory` anchoring (needs an authoritative directory/contacts source —
21
+ no such table exists yet; the scorer already supports it for when one lands); and the
22
+ **half-life / threshold / salience-constant CALIBRATION pass on a real arena before
23
+ `--evict` is ever run in prod** — eviction stays a deliberate manual op until then.
18
24
  **Builds on:** `RFC-entity-reconciliation.md`, `scripts/entity_resolution_v2.py` (#82),
19
25
  `org-model/migrations/002_entity_merges_audit.sql`.
20
26
  **Motivated by:** the v2 store is currently **pure-accretion** — three independent
@@ -14,6 +14,6 @@ COPY worker.py .
14
14
  # add a new sibling module, add it here too — missing COPY makes the
15
15
  # container crash-loop on import at startup (observed 2026-06-08 deploy).
16
16
  # The test_*.py files are intentionally excluded; pytest only, not runtime.
17
- COPY noise_filter.py confidence.py entity_id.py sensitive_filter.py extraction_schema.py ./
17
+ COPY noise_filter.py confidence.py entity_id.py sensitive_filter.py extraction_schema.py source_time.py ./
18
18
 
19
19
  CMD ["python", "worker.py"]
@@ -0,0 +1,63 @@
1
+ """source_time — robust ISO-8601 source-time parsing for graph stamping.
2
+
3
+ The memory graph must stamp `events.emitted_at` and the graph rows'
4
+ `first_seen` / `last_seen` / `asserted_at` from the SOURCE time of the
5
+ content (when the email/meeting/message actually happened), NOT the
6
+ ingest wall-clock (`NOW()`). The source time is carried on the event as
7
+ `attributes.timestamp` (ISO-8601). This helper promotes it.
8
+
9
+ Mirrors `compat/server.py:_parse_ts` (handles the bare `Z` suffix that
10
+ `datetime.fromisoformat` only learned in 3.11) but returns a tz-aware
11
+ `datetime` rather than a unix float, because the destination columns are
12
+ `TIMESTAMPTZ` and we want psycopg to bind a datetime, not an epoch.
13
+
14
+ CONTRACT (load-bearing): callers MUST fall back to the existing default
15
+ (received / NOW) when the source time is absent or unparseable. This
16
+ helper NEVER raises and returns `None` on anything it can't parse — the
17
+ caller is responsible for the `or NOW()` fallback so we never NULL a
18
+ NOT NULL column or crash the ingest/distill path.
19
+
20
+ NOTE: keep this byte-identical with the copy in extractor-sync/. Same
21
+ convention as entity_id.py — two services, one parsing rule.
22
+ """
23
+
24
+ from __future__ import annotations
25
+
26
+ from datetime import datetime, timezone
27
+ from typing import Any
28
+
29
+
30
def parse_source_time(value: Any) -> datetime | None:
    """Parse an ISO-8601 string into a tz-aware ``datetime``, best effort.

    Accepts a trailing ``Z`` / ``z`` (UTC designator) as well as explicit
    offsets, and tolerates surrounding whitespace (e.g. a trailing
    newline). A parsed value with no offset is assumed to be UTC, so the
    result is never naive.

    Args:
        value: Candidate source time. Only ``str`` values are parsed;
            any other type yields ``None``.

    Returns:
        A tz-aware ``datetime``, or ``None`` when ``value`` is not a
        string, is empty / whitespace-only, or cannot be parsed. The
        caller is responsible for falling back to NOW() on ``None``.

    Never raises.

    NOTE: the module docstring asks that this file stay identical to the
    copy in extractor-sync/ — mirror any change made here over there.
    """
    if not isinstance(value, str):
        return None
    text = value.strip()
    if not text:
        return None
    # `fromisoformat` only learned the bare `Z` suffix in Python 3.11;
    # normalise it ourselves so behaviour is the same on every runtime.
    # Only the *suffix* is rewritten — a stray `Z` elsewhere in the
    # string is left for `fromisoformat` to accept or reject.
    if text[-1] in "Zz":
        text = text[:-1] + "+00:00"
    try:
        dt = datetime.fromisoformat(text)
    except Exception:
        # Deliberately broad: this sits on the ingest/distill path and
        # the contract is "None on anything we can't parse", not a crash.
        return None
    if dt.tzinfo is None:
        # Producer emitted a naive ISO string; treat as UTC rather than
        # letting psycopg interpret it in the server's local zone.
        dt = dt.replace(tzinfo=timezone.utc)
    return dt
51
+
52
+
53
def event_source_time(event: dict[str, Any]) -> datetime | None:
    """Pull the source time off an event dict's attributes.

    Precedence: `attributes.timestamp` (the source/content time) wins
    over `attributes.emitted_at` (a producer-supplied emit-now, which is
    closer to ingest time).

    Args:
        event: Event mapping; its optional `attributes` entry is
            expected to be a mapping as well.

    Returns:
        The first of `attributes.timestamp` / `attributes.emitted_at`
        that `parse_source_time` can parse, or ``None`` when neither
        parses, when `attributes` is missing/empty, or when `event` or
        `attributes` is not a mapping at all (e.g. ``None`` or an
        undecoded JSON string). The caller falls back to NOW().

    Never raises on a malformed event: a non-mapping `event` or
    `attributes` is treated the same as an absent source time.
    """
    try:
        attrs = event.get("attributes") or {}
        raw_timestamp = attrs.get("timestamp")
        raw_emitted_at = attrs.get("emitted_at")
    except AttributeError:
        # `event` or its `attributes` has no `.get` — not a mapping.
        # Treat as "no source time" instead of crashing the caller.
        return None
    return parse_source_time(raw_timestamp) or parse_source_time(raw_emitted_at)
@@ -0,0 +1,102 @@
1
+ """Tests for source_time — promoting source event time onto graph rows.
2
+
3
+ The contract under test: source time present and parseable → used;
4
+ absent, empty, or garbage → returns None so the caller falls back to
5
+ NOW() (never crashes, never NULLs a NOT NULL column).
6
+
7
+ Run: pytest packages/memory-engine-v2/extractor-async/test_source_time.py
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ from datetime import datetime, timezone
13
+
14
+ import pytest
15
+
16
+ from source_time import event_source_time, parse_source_time
17
+
18
+
19
+ class TestParseSourceTime:
20
+ def test_iso_with_z_suffix(self):
21
+ dt = parse_source_time("2025-03-14T09:30:00Z")
22
+ assert dt == datetime(2025, 3, 14, 9, 30, tzinfo=timezone.utc)
23
+
24
+ def test_iso_with_explicit_offset(self):
25
+ dt = parse_source_time("2025-03-14T09:30:00+00:00")
26
+ assert dt == datetime(2025, 3, 14, 9, 30, tzinfo=timezone.utc)
27
+
28
+ def test_iso_with_nonzero_offset_preserved(self):
29
+ dt = parse_source_time("2025-03-14T12:30:00+03:00")
30
+ # 12:30+03:00 == 09:30 UTC
31
+ assert dt.utcoffset().total_seconds() == 3 * 3600
32
+ assert dt.astimezone(timezone.utc) == datetime(
33
+ 2025, 3, 14, 9, 30, tzinfo=timezone.utc
34
+ )
35
+
36
+ def test_naive_iso_assumed_utc(self):
37
+ # No offset → must NOT come back naive (would break TIMESTAMPTZ
38
+ # comparisons); we assume UTC.
39
+ dt = parse_source_time("2025-03-14T09:30:00")
40
+ assert dt is not None
41
+ assert dt.tzinfo is not None
42
+ assert dt == datetime(2025, 3, 14, 9, 30, tzinfo=timezone.utc)
43
+
44
+ # --- fallback cases: must return None, never raise ---
45
+
46
+ @pytest.mark.parametrize(
47
+ "bad",
48
+ [
49
+ None,
50
+ "",
51
+ "not-a-date",
52
+ "2025-13-99T99:99:99Z", # structurally ISO-ish but invalid
53
+ "14/03/2025", # wrong format
54
+ 12345, # not a string
55
+ [], # not a string
56
+ {"timestamp": "x"}, # not a string
57
+ ],
58
+ )
59
+ def test_garbage_or_absent_returns_none(self, bad):
60
+ assert parse_source_time(bad) is None
61
+
62
+
63
+ class TestEventSourceTime:
64
+ def test_prefers_timestamp_over_emitted_at(self):
65
+ ev = {
66
+ "attributes": {
67
+ "timestamp": "2025-01-01T00:00:00Z", # source time
68
+ "emitted_at": "2025-06-01T00:00:00Z", # producer emit-now
69
+ }
70
+ }
71
+ assert event_source_time(ev) == datetime(
72
+ 2025, 1, 1, 0, 0, tzinfo=timezone.utc
73
+ )
74
+
75
+ def test_falls_back_to_emitted_at_when_no_timestamp(self):
76
+ ev = {"attributes": {"emitted_at": "2025-06-01T00:00:00Z"}}
77
+ assert event_source_time(ev) == datetime(
78
+ 2025, 6, 1, 0, 0, tzinfo=timezone.utc
79
+ )
80
+
81
+ def test_none_when_neither_present(self):
82
+ assert event_source_time({"attributes": {}}) is None
83
+
84
+ def test_none_when_no_attributes(self):
85
+ # Must not crash on an event with a missing/None attributes bag.
86
+ assert event_source_time({}) is None
87
+ assert event_source_time({"attributes": None}) is None
88
+
89
+ def test_garbage_timestamp_falls_back_to_emitted_at(self):
90
+ ev = {
91
+ "attributes": {
92
+ "timestamp": "garbage",
93
+ "emitted_at": "2025-06-01T00:00:00Z",
94
+ }
95
+ }
96
+ assert event_source_time(ev) == datetime(
97
+ 2025, 6, 1, 0, 0, tzinfo=timezone.utc
98
+ )
99
+
100
+ def test_all_garbage_returns_none(self):
101
+ ev = {"attributes": {"timestamp": "nope", "emitted_at": "also-nope"}}
102
+ assert event_source_time(ev) is None
@@ -33,6 +33,7 @@ import os
33
33
  import re
34
34
  import socket
35
35
  import time
36
+ from datetime import datetime
36
37
  from typing import Any
37
38
 
38
39
  import httpx
@@ -41,6 +42,7 @@ import psycopg.rows
41
42
 
42
43
  from confidence import born_salience, corroborated_confidence
43
44
  from entity_id import entity_id, normalize_surface_form
45
+ from source_time import event_source_time, parse_source_time
44
46
  from extraction_schema import (
45
47
  ALLOWED_ENT_TYPES,
46
48
  ALLOWED_FCT_CATEGORIES,
@@ -372,7 +374,10 @@ def build_event_block(idx: int, event: dict[str, Any]) -> str:
372
374
  src = event.get("source_kind", "unknown")
373
375
  content = clean_content(event.get("content") or "")[:MAX_CONTENT_CHARS]
374
376
  attrs = event.get("attributes") or {}
375
- when = attrs.get("emitted_at") or attrs.get("timestamp")
377
+ # Prefer the SOURCE time (`timestamp`) over the producer's emit-now
378
+ # (`emitted_at`) so the LLM anchors "when" to when the content
379
+ # actually happened, not when it was forwarded into ingest.
380
+ when = attrs.get("timestamp") or attrs.get("emitted_at")
376
381
  author = attrs.get("author") or attrs.get("user_id")
377
382
  header = [f"[event {idx}]", f"source_kind: {src}"]
378
383
  if when:
@@ -798,10 +803,18 @@ def upsert_entities(
798
803
  participant_set: list[str],
799
804
  disclosure_class: str,
800
805
  entities: list[dict],
806
+ event_time: datetime | None,
801
807
  ) -> dict[str, str]:
802
808
  """Alias-aware insert (or merge) of entities; returns a name→id
803
809
  map so facts and relationships can link to the inserted rows.
804
810
 
811
+ `event_time` is the SOURCE time of the event being distilled (parsed
812
+ from `attributes.timestamp`); it stamps `first_seen`/`last_seen` so
813
+ the graph tracks content time, not ingest time. `None` (no/garbage
814
+ source time) falls back to NOW() via COALESCE — never NULLs a NOT
815
+ NULL column. Re-corroboration widens the window with LEAST/GREATEST
816
+ on the SOURCE time, so "most recent evidence" = newest source time.
817
+
805
818
  Two concerns layered together:
806
819
 
807
820
  1. **ID derivation** uses the shared `entity_id()` helper from
@@ -884,10 +897,13 @@ def upsert_entities(
884
897
  UPDATE entities SET
885
898
  aliases = ARRAY(SELECT DISTINCT UNNEST(aliases || %s::text[])),
886
899
  provenance_event_ids = ARRAY(SELECT DISTINCT UNNEST(provenance_event_ids || %s::text[])),
887
- last_seen = NOW()
900
+ -- Widen the seen-window with this event's SOURCE
901
+ -- time, not NOW(): newest evidence = newest source.
902
+ last_seen = GREATEST(last_seen, COALESCE(%s, NOW())),
903
+ first_seen = LEAST(first_seen, COALESCE(%s, NOW()))
888
904
  WHERE id = %s
889
905
  """,
890
- (aliases, [event_id], eid),
906
+ (aliases, [event_id], event_time, event_time, eid),
891
907
  )
892
908
  else:
893
909
  # 3b. No match — insert new.
@@ -904,8 +920,12 @@ def upsert_entities(
904
920
  """
905
921
  INSERT INTO entities (
906
922
  id, arena, entity_type, canonical_name, aliases,
907
- provenance_event_ids, participant_set, disclosure_class, salience
908
- ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s::disclosure_class, %s)
923
+ provenance_event_ids, participant_set, disclosure_class, salience,
924
+ first_seen, last_seen
925
+ ) VALUES (
926
+ %s, %s, %s, %s, %s, %s, %s, %s::disclosure_class, %s,
927
+ COALESCE(%s, NOW()), COALESCE(%s, NOW())
928
+ )
909
929
  ON CONFLICT (id) DO UPDATE SET
910
930
  aliases = (
911
931
  SELECT ARRAY(SELECT DISTINCT UNNEST(entities.aliases || EXCLUDED.aliases))
@@ -915,11 +935,14 @@ def upsert_entities(
915
935
  ),
916
936
  -- re-corroboration can only RAISE salience, never lower it
917
937
  salience = GREATEST(entities.salience, EXCLUDED.salience),
918
- last_seen = NOW()
938
+ -- widen the seen-window on SOURCE time, not NOW()
939
+ last_seen = GREATEST(entities.last_seen, EXCLUDED.last_seen),
940
+ first_seen = LEAST(entities.first_seen, EXCLUDED.first_seen)
919
941
  """,
920
942
  (
921
943
  eid, arena, etype, name, aliases,
922
944
  [event_id], participant_set, disclosure_class, _sal,
945
+ event_time, event_time,
923
946
  ),
924
947
  )
925
948
  name_to_id[name] = eid
@@ -934,6 +957,8 @@ def upsert_facts(
934
957
  disclosure_class: str,
935
958
  facts: list[dict],
936
959
  name_to_id: dict[str, str],
960
+ event_time: datetime | None,
961
+ due_at: datetime | None = None,
937
962
  ) -> int:
938
963
  """Facts are content-hashed on (arena, statement). Same statement
939
964
  extracted from any event in the arena converges to the same row,
@@ -950,7 +975,16 @@ def upsert_facts(
950
975
  (see confidence.py — caps at 0.9 to reserve [0.9, 1.0] for
951
976
  `stage = 'verified'` which only a human can produce). Stage stays
952
977
  `provisional`; corroboration is a signal, not a graduation.
953
- """
978
+
979
+ `asserted_at` is stamped from the event's SOURCE time (`event_time`,
980
+ parsed from `attributes.timestamp`), falling back to NOW() via
981
+ COALESCE — so the temporal anchor is when the fact was actually
982
+ asserted, not when we distilled it. On corroboration it moves
983
+ FORWARD with GREATEST to the newest source time across all
984
+ corroborating events: facts have no `last_seen`, so #92's decay uses
985
+ `asserted_at` as the recency clock and resets it on re-corroboration
986
+ — order-stable regardless of distill order. `due_at` (the source
987
+ event's structured deadline, if any) populates `effective_until`."""
954
988
  if not facts:
955
989
  return 0
956
990
  inserted = 0
@@ -975,10 +1009,12 @@ def upsert_facts(
975
1009
  INSERT INTO facts (
976
1010
  id, arena, category, subject_entity_id, predicate,
977
1011
  object_entity_id, statement, provenance_event_ids,
978
- stage, confidence, participant_set, disclosure_class, salience
1012
+ stage, confidence, participant_set, disclosure_class, salience,
1013
+ asserted_at, effective_until
979
1014
  ) VALUES (
980
1015
  %s, %s, %s, %s, %s, %s, %s, %s,
981
- 'provisional'::extraction_stage, %s, %s, %s::disclosure_class, %s
1016
+ 'provisional'::extraction_stage, %s, %s, %s::disclosure_class, %s,
1017
+ COALESCE(%s, NOW()), %s
982
1018
  )
983
1019
  ON CONFLICT (id) DO UPDATE SET
984
1020
  provenance_event_ids = (
@@ -1005,7 +1041,18 @@ def upsert_facts(
1005
1041
  )
1006
1042
  ),
1007
1043
  0.9
1008
- )
1044
+ ),
1045
+ -- `asserted_at` doubles as the decay clock for facts:
1046
+ -- #92's fusion_drive_decay ages off
1047
+ -- max(last_accessed, asserted_at) and resets that
1048
+ -- clock on re-corroboration (facts have no `last_seen`
1049
+ -- of their own). So on conflict we move it FORWARD
1050
+ -- with GREATEST to the newest source time across all
1051
+ -- corroborating events — newest evidence, not oldest.
1052
+ -- This also makes it order-stable (independent of
1053
+ -- distill order). EXCLUDED.asserted_at is the
1054
+ -- COALESCE(event_time, NOW()) from the INSERT above.
1055
+ asserted_at = GREATEST(facts.asserted_at, EXCLUDED.asserted_at)
1009
1056
  """,
1010
1057
  (
1011
1058
  _content_id(arena, stmt),
@@ -1020,6 +1067,8 @@ def upsert_facts(
1020
1067
  participant_set,
1021
1068
  disclosure_class,
1022
1069
  _fsal,
1070
+ event_time,
1071
+ due_at,
1023
1072
  ),
1024
1073
  )
1025
1074
  inserted += 1
@@ -1034,9 +1083,14 @@ def upsert_relationships(
1034
1083
  disclosure_class: str,
1035
1084
  relationships: list[dict],
1036
1085
  name_to_id: dict[str, str],
1086
+ event_time: datetime | None,
1037
1087
  ) -> int:
1038
1088
  """Edge identity is (arena, from, to, type). ON CONFLICT bumps
1039
- weight + last_seen rather than duplicating."""
1089
+ weight + widens the seen-window rather than duplicating.
1090
+
1091
+ `first_seen`/`last_seen` are stamped from the event's SOURCE time
1092
+ (`event_time`), falling back to NOW() via COALESCE; re-corroboration
1093
+ widens with LEAST/GREATEST on the source time, not ingest time."""
1040
1094
  if not relationships:
1041
1095
  return 0
1042
1096
  inserted = 0
@@ -1052,21 +1106,25 @@ def upsert_relationships(
1052
1106
  """
1053
1107
  INSERT INTO relationships (
1054
1108
  id, arena, from_entity_id, to_entity_id, relationship_type,
1055
- weight, provenance_event_ids, participant_set, disclosure_class
1109
+ weight, provenance_event_ids, participant_set, disclosure_class,
1110
+ first_seen, last_seen
1056
1111
  ) VALUES (
1057
- %s, %s, %s, %s, %s, %s, %s, %s, %s::disclosure_class
1112
+ %s, %s, %s, %s, %s, %s, %s, %s, %s::disclosure_class,
1113
+ COALESCE(%s, NOW()), COALESCE(%s, NOW())
1058
1114
  )
1059
1115
  ON CONFLICT (id) DO UPDATE SET
1060
1116
  weight = relationships.weight + EXCLUDED.weight,
1061
1117
  provenance_event_ids = (
1062
1118
  SELECT ARRAY(SELECT DISTINCT UNNEST(relationships.provenance_event_ids || EXCLUDED.provenance_event_ids))
1063
1119
  ),
1064
- last_seen = NOW()
1120
+ last_seen = GREATEST(relationships.last_seen, EXCLUDED.last_seen),
1121
+ first_seen = LEAST(relationships.first_seen, EXCLUDED.first_seen)
1065
1122
  """,
1066
1123
  (
1067
1124
  rid, arena, from_id, to_id, rtype,
1068
1125
  float(r.get("confidence") or 0.5),
1069
1126
  [event_id], participant_set, disclosure_class,
1127
+ event_time, event_time,
1070
1128
  ),
1071
1129
  )
1072
1130
  inserted += 1
@@ -1318,7 +1376,7 @@ def fetch_event(conn: psycopg.Connection, event_id: str) -> dict[str, Any] | Non
1318
1376
  with conn.cursor(row_factory=psycopg.rows.dict_row) as cur:
1319
1377
  cur.execute(
1320
1378
  "SELECT id, arena, source_kind, content, attributes, participant_set, "
1321
- "disclosure_class FROM events WHERE id = %s",
1379
+ "disclosure_class, emitted_at FROM events WHERE id = %s",
1322
1380
  (event_id,),
1323
1381
  )
1324
1382
  return cur.fetchone()
@@ -1476,16 +1534,31 @@ async def process_batch(
1476
1534
  arena = event["arena"]
1477
1535
  participant_set = event.get("participant_set") or [arena]
1478
1536
  disclosure = event.get("disclosure_class") or "private"
1537
+ # SOURCE time of this event: prefer the parsed
1538
+ # `attributes.timestamp` (canonical), falling back to the
1539
+ # stored `emitted_at` column (which the sync path now also
1540
+ # stamps from source time). `None` ⇒ upserts fall back to
1541
+ # NOW() in-SQL. NEVER crash on a bad/absent source time.
1542
+ event_time = event_source_time(event) or event.get("emitted_at")
1543
+ # A structured deadline on the source event, if the producer
1544
+ # supplied one — populates facts.effective_until. Absent or
1545
+ # unparseable ⇒ None (column stays NULL, its existing
1546
+ # behaviour). Only `attributes.due_at` is honoured; we do NOT
1547
+ # guess deadlines from free text here.
1548
+ due_at = parse_source_time((event.get("attributes") or {}).get("due_at"))
1479
1549
 
1480
1550
  try:
1481
1551
  name_to_id = upsert_entities(
1482
- conn, arena, event_id, participant_set, disclosure, ents
1552
+ conn, arena, event_id, participant_set, disclosure, ents,
1553
+ event_time,
1483
1554
  )
1484
1555
  n_facts = upsert_facts(
1485
1556
  conn, arena, event_id, participant_set, disclosure, facts, name_to_id,
1557
+ event_time, due_at,
1486
1558
  )
1487
1559
  n_rels = upsert_relationships(
1488
1560
  conn, arena, event_id, participant_set, disclosure, rels, name_to_id,
1561
+ event_time,
1489
1562
  )
1490
1563
  mark_done(conn, queue_id)
1491
1564
  log.info(
@@ -5,7 +5,9 @@ WORKDIR /app
5
5
  COPY requirements.txt .
6
6
  RUN pip install --no-cache-dir -r requirements.txt
7
7
 
8
- COPY entity_id.py server.py .
8
+ # confidence.py is a byte-identical copy of extractor-async's (the born_salience
9
+ # scale must match the decay side). test_born_salience_parity guards drift.
10
+ COPY entity_id.py confidence.py source_time.py server.py .
9
11
 
10
12
  EXPOSE 8101
11
13
  CMD ["uvicorn", "server:app", "--host", "0.0.0.0", "--port", "8101", "--workers", "2"]
@@ -0,0 +1,99 @@
1
+ """confidence — fact confidence promotion based on multi-source corroboration.
2
+
3
+ Today every fact lands in org_model at confidence 0.5 / stage 'provisional'
4
+ and never moves. Live-data audit (2026-05-25): EVERY fact across 200
5
+ sampled rows in pentatonic-team is stuck at 0.5 — no signal of
6
+ "how trustworthy is this?" reaches the read side.
7
+
8
+ The right signal is **multi-source corroboration**: the same statement
9
+ appearing in two emails AND a calendar event is meaningfully more
10
+ trustworthy than a one-off mention in a Slack DM. The extractor
11
+ already records `provenance_event_ids` (the list of source events
12
+ that mention each fact), so the data needed for promotion is there
13
+ — we just don't use it.
14
+
15
+ Formula:
16
+
17
+ confidence = min(0.5 + 0.15 * (n_sources - 1), 0.9)
18
+
19
+ Concretely:
20
+
21
+ 1 source → 0.50 (single mention, default)
22
+ 2 sources → 0.65 (one corroboration)
23
+ 3 sources → 0.80
24
+ 4 sources → 0.90 (cap; "verified" remains human-only)
25
+ 5+ → 0.90
26
+
27
+ Cap at 0.9 reserves the [0.9, 1.0] range for human-verified facts
28
+ (`stage = 'verified'`), which the extractor cannot produce on its
29
+ own. We never bump the stage from `provisional` to `distilled` or
30
+ `verified` from this code path — corroboration is a signal, not a
31
+ promotion. Stage transitions stay deliberate / explicit.
32
+
33
+ Pure module — no I/O, no deps. Importable from worker.py without
34
+ pulling in psycopg / httpx.
35
+ """
36
+
37
+ from __future__ import annotations
38
+
39
+ # Bump-per-additional-source. Tuned so:
40
+ # 1 → 0.50 (base)
41
+ # 2 → 0.65
42
+ # 3 → 0.80
43
+ # 4 → 0.90 (cap reached)
44
+ # Picked instead of a smooth log/sqrt because the read-side bucket
45
+ # boundaries (UI badge colours) align cleanly with these steps.
46
+ _CONF_PER_SOURCE = 0.15
47
+ _CONF_BASE = 0.5
48
+ _CONF_CAP = 0.9
49
+
50
+
51
def corroborated_confidence(n_sources: int) -> float:
    """Confidence score for a fact corroborated by `n_sources` events.

    Starts from `_CONF_BASE` and adds `_CONF_PER_SOURCE` for every
    source beyond the first, never exceeding `_CONF_CAP`.

    Args:
        n_sources: Number of source events that mention the fact.

    Returns:
        `_CONF_BASE` when `n_sources <= 1` (so zero and negative counts
        are treated like a single mention — never below the base);
        `_CONF_CAP` once the bumped score would exceed the cap;
        otherwise the bumped score rounded to 2 decimals.

    Pure function for easy unit testing.
    """
    if n_sources <= 1:
        # A single mention — or a nonsensical zero/negative count —
        # gets the base confidence.
        return _CONF_BASE
    bumped = _CONF_BASE + _CONF_PER_SOURCE * (n_sources - 1)
    if bumped > _CONF_CAP:
        return _CONF_CAP
    # Rounded to 2 decimals — presumably to keep float noise out of the
    # stored score; confirm before changing the precision.
    return round(bumped, 2)
63
+
64
+
65
+ # ── born salience (Fusion Drive) ─────────────────────────────────────
66
+ # Retention priority a node is stamped with at extraction time, SEPARATE
67
+ # from confidence (confidence = corroboration/truth; salience = how long
68
+ # it's worth keeping). Junk — flagged by the extractor's own quality
69
+ # detectors (noise name, numeric-ID-as-person, hallucinated email,
70
+ # ungrounded, etc.) — is born near the floor so the Fusion Drive decay
71
+ # pass evicts it on a short clock instead of the multi-year default.
72
+ #
73
+ # This MUST stay byte-identical to fusion_drive/salience.py:born_salience
74
+ # (the decay side uses the same scale). test_born_salience_parity.py
75
+ # guards the two against drift — same pattern as entity_id.py's parity
76
+ # test across the sync/async build contexts.
77
+ _SAL_BASE = 0.50
78
+ _SAL_CORROB_PER_SOURCE = 0.10
79
+ _SAL_CORROB_CAP = 0.30
80
+ _SAL_FLOOR = 0.01
81
+ _SAL_CEIL = 1.00
82
+ _SAL_PENALTIES = {
83
+ "noise_name": 0.45,
84
+ "numeric_id_person": 0.45,
85
+ "hallucinated_email": 0.40,
86
+ "ungrounded": 0.35,
87
+ "subject_undeclared": 0.25,
88
+ "low_signal": 0.15,
89
+ }
90
+
91
+
92
def born_salience(n_sources: int = 1, quality_flags: list[str] | None = None) -> float:
    """Salience to stamp on a freshly extracted node. See the module note.

    Args:
        n_sources: Number of corroborating sources. Each source beyond
            the first adds `_SAL_CORROB_PER_SOURCE`, with the total
            bonus capped at `_SAL_CORROB_CAP`.
        quality_flags: Quality-detector flags raised for the node.
            Each entry subtracts its `_SAL_PENALTIES` value; flags not
            present in that table subtract nothing, and a flag listed
            more than once is subtracted once per occurrence. `None`
            is treated as "no flags".

    Returns:
        The score clamped to `[_SAL_FLOOR, _SAL_CEIL]` and rounded to
        4 decimals.
    """
    s = _SAL_BASE
    if n_sources > 1:
        # Corroboration bonus, capped so it cannot grow without bound.
        s += min(_SAL_CORROB_CAP, _SAL_CORROB_PER_SOURCE * (n_sources - 1))
    for flag in quality_flags or []:
        # Unknown flags carry a 0.0 penalty rather than raising.
        s -= _SAL_PENALTIES.get(flag, 0.0)
    # Clamp last, so stacked penalties bottom out at the floor.
    return round(max(_SAL_FLOOR, min(_SAL_CEIL, s)), 4)