@pentatonic-ai/ai-agent-sdk 0.10.7 → 0.10.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30) hide show
  1. package/dist/index.cjs +1 -1
  2. package/dist/index.js +1 -1
  3. package/package.json +1 -1
  4. package/packages/memory-engine-v2/RFC-decay-and-fusion.md +185 -0
  5. package/packages/memory-engine-v2/RFC-fusion-drive.md +199 -0
  6. package/packages/memory-engine-v2/extractor-async/confidence.py +37 -0
  7. package/packages/memory-engine-v2/extractor-async/source_time.py +63 -0
  8. package/packages/memory-engine-v2/extractor-async/test_born_salience_parity.py +35 -0
  9. package/packages/memory-engine-v2/extractor-async/test_source_time.py +102 -0
  10. package/packages/memory-engine-v2/extractor-async/worker.py +121 -18
  11. package/packages/memory-engine-v2/extractor-sync/Dockerfile +3 -1
  12. package/packages/memory-engine-v2/extractor-sync/confidence.py +99 -0
  13. package/packages/memory-engine-v2/extractor-sync/server.py +61 -11
  14. package/packages/memory-engine-v2/extractor-sync/source_time.py +63 -0
  15. package/packages/memory-engine-v2/extractor-sync/test_confidence_parity.py +18 -0
  16. package/packages/memory-engine-v2/extractor-sync/test_paired_extraction.py +2 -2
  17. package/packages/memory-engine-v2/fusion_drive/__init__.py +0 -0
  18. package/packages/memory-engine-v2/fusion_drive/adjudicate.py +85 -0
  19. package/packages/memory-engine-v2/fusion_drive/canonical.py +94 -0
  20. package/packages/memory-engine-v2/fusion_drive/conftest.py +8 -0
  21. package/packages/memory-engine-v2/fusion_drive/merge.py +178 -0
  22. package/packages/memory-engine-v2/fusion_drive/salience.py +118 -0
  23. package/packages/memory-engine-v2/fusion_drive/test_adjudicate.py +65 -0
  24. package/packages/memory-engine-v2/fusion_drive/test_canonical.py +76 -0
  25. package/packages/memory-engine-v2/fusion_drive/test_merge.py +112 -0
  26. package/packages/memory-engine-v2/fusion_drive/test_salience.py +93 -0
  27. package/packages/memory-engine-v2/org-model/migrations/006_fusion_drive.sql +80 -0
  28. package/packages/memory-engine-v2/scripts/fusion_drive_born_salience_backfill.py +113 -0
  29. package/packages/memory-engine-v2/scripts/fusion_drive_decay.py +200 -0
  30. package/packages/memory-engine-v2/scripts/fusion_drive_fuse.py +434 -0
@@ -33,14 +33,16 @@ import os
33
33
  import re
34
34
  import socket
35
35
  import time
36
+ from datetime import datetime
36
37
  from typing import Any
37
38
 
38
39
  import httpx
39
40
  import psycopg
40
41
  import psycopg.rows
41
42
 
42
- from confidence import corroborated_confidence
43
+ from confidence import born_salience, corroborated_confidence
43
44
  from entity_id import entity_id, normalize_surface_form
45
+ from source_time import event_source_time, parse_source_time
44
46
  from extraction_schema import (
45
47
  ALLOWED_ENT_TYPES,
46
48
  ALLOWED_FCT_CATEGORIES,
@@ -372,7 +374,10 @@ def build_event_block(idx: int, event: dict[str, Any]) -> str:
372
374
  src = event.get("source_kind", "unknown")
373
375
  content = clean_content(event.get("content") or "")[:MAX_CONTENT_CHARS]
374
376
  attrs = event.get("attributes") or {}
375
- when = attrs.get("emitted_at") or attrs.get("timestamp")
377
+ # Prefer the SOURCE time (`timestamp`) over the producer's emit-now
378
+ # (`emitted_at`) so the LLM anchors "when" to when the content
379
+ # actually happened, not when it was forwarded into ingest.
380
+ when = attrs.get("timestamp") or attrs.get("emitted_at")
376
381
  author = attrs.get("author") or attrs.get("user_id")
377
382
  header = [f"[event {idx}]", f"source_kind: {src}"]
378
383
  if when:
@@ -782,6 +787,15 @@ def _content_id(*parts: str) -> str:
782
787
  return hashlib.sha256("\x1f".join(parts).encode()).hexdigest()[:32]
783
788
 
784
789
 
790
+ def _digit_ratio(s: str) -> float:
791
+ """Fraction of non-whitespace chars that are digits. Used to flag
792
+ numeric-ID-as-person junk for Fusion Drive born-salience."""
793
+ stripped = "".join(s.split())
794
+ if not stripped:
795
+ return 0.0
796
+ return sum(c.isdigit() for c in stripped) / len(stripped)
797
+
798
+
785
799
  def upsert_entities(
786
800
  conn: psycopg.Connection,
787
801
  arena: str,
@@ -789,10 +803,18 @@ def upsert_entities(
789
803
  participant_set: list[str],
790
804
  disclosure_class: str,
791
805
  entities: list[dict],
806
+ event_time: datetime | None,
792
807
  ) -> dict[str, str]:
793
808
  """Alias-aware insert (or merge) of entities; returns a name→id
794
809
  map so facts and relationships can link to the inserted rows.
795
810
 
811
+ `event_time` is the SOURCE time of the event being distilled (parsed
812
+ from `attributes.timestamp`); it stamps `first_seen`/`last_seen` so
813
+ the graph tracks content time, not ingest time. `None` (no/garbage
814
+ source time) falls back to NOW() via COALESCE — never NULLs a NOT
815
+ NULL column. Re-corroboration widens the window with LEAST/GREATEST
816
+ on the SOURCE time, so "most recent evidence" = newest source time.
817
+
796
818
  Two concerns layered together:
797
819
 
798
820
  1. **ID derivation** uses the shared `entity_id()` helper from
@@ -875,20 +897,35 @@ def upsert_entities(
875
897
  UPDATE entities SET
876
898
  aliases = ARRAY(SELECT DISTINCT UNNEST(aliases || %s::text[])),
877
899
  provenance_event_ids = ARRAY(SELECT DISTINCT UNNEST(provenance_event_ids || %s::text[])),
878
- last_seen = NOW()
900
+ -- Widen the seen-window with this event's SOURCE
901
+ -- time, not NOW(): newest evidence = newest source.
902
+ last_seen = GREATEST(last_seen, COALESCE(%s, NOW())),
903
+ first_seen = LEAST(first_seen, COALESCE(%s, NOW()))
879
904
  WHERE id = %s
880
905
  """,
881
- (aliases, [event_id], eid),
906
+ (aliases, [event_id], event_time, event_time, eid),
882
907
  )
883
908
  else:
884
909
  # 3b. No match — insert new.
885
910
  eid = entity_id(arena, etype, name)
911
+ # Fusion Drive born-salience: a numeric-ID-as-person (classic
912
+ # 7B junk that slips past noise_filter, e.g. "1716801984") is
913
+ # born near the floor so the decay pass can evict it on a short
914
+ # clock instead of the multi-year entity default.
915
+ _qflags = []
916
+ if etype == "person" and _digit_ratio(name) > 0.5:
917
+ _qflags.append("numeric_id_person")
918
+ _sal = born_salience(1, _qflags)
886
919
  cur.execute(
887
920
  """
888
921
  INSERT INTO entities (
889
922
  id, arena, entity_type, canonical_name, aliases,
890
- provenance_event_ids, participant_set, disclosure_class
891
- ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s::disclosure_class)
923
+ provenance_event_ids, participant_set, disclosure_class, salience,
924
+ first_seen, last_seen
925
+ ) VALUES (
926
+ %s, %s, %s, %s, %s, %s, %s, %s::disclosure_class, %s,
927
+ COALESCE(%s, NOW()), COALESCE(%s, NOW())
928
+ )
892
929
  ON CONFLICT (id) DO UPDATE SET
893
930
  aliases = (
894
931
  SELECT ARRAY(SELECT DISTINCT UNNEST(entities.aliases || EXCLUDED.aliases))
@@ -896,11 +933,16 @@ def upsert_entities(
896
933
  provenance_event_ids = (
897
934
  SELECT ARRAY(SELECT DISTINCT UNNEST(entities.provenance_event_ids || EXCLUDED.provenance_event_ids))
898
935
  ),
899
- last_seen = NOW()
936
+ -- re-corroboration can only RAISE salience, never lower it
937
+ salience = GREATEST(entities.salience, EXCLUDED.salience),
938
+ -- widen the seen-window on SOURCE time, not NOW()
939
+ last_seen = GREATEST(entities.last_seen, EXCLUDED.last_seen),
940
+ first_seen = LEAST(entities.first_seen, EXCLUDED.first_seen)
900
941
  """,
901
942
  (
902
943
  eid, arena, etype, name, aliases,
903
- [event_id], participant_set, disclosure_class,
944
+ [event_id], participant_set, disclosure_class, _sal,
945
+ event_time, event_time,
904
946
  ),
905
947
  )
906
948
  name_to_id[name] = eid
@@ -915,6 +957,8 @@ def upsert_facts(
915
957
  disclosure_class: str,
916
958
  facts: list[dict],
917
959
  name_to_id: dict[str, str],
960
+ event_time: datetime | None,
961
+ due_at: datetime | None = None,
918
962
  ) -> int:
919
963
  """Facts are content-hashed on (arena, statement). Same statement
920
964
  extracted from any event in the arena converges to the same row,
@@ -931,7 +975,16 @@ def upsert_facts(
931
975
  (see confidence.py — caps at 0.9 to reserve [0.9, 1.0] for
932
976
  `stage = 'verified'` which only a human can produce). Stage stays
933
977
  `provisional`; corroboration is a signal, not a graduation.
934
- """
978
+
979
+ `asserted_at` is stamped from the event's SOURCE time (`event_time`,
980
+ parsed from `attributes.timestamp`), falling back to NOW() via
981
+ COALESCE — so the temporal anchor is when the fact was actually
982
+ asserted, not when we distilled it. On corroboration it moves
983
+ FORWARD with GREATEST to the newest source time across all
984
+ corroborating events: facts have no `last_seen`, so #92's decay uses
985
+ `asserted_at` as the recency clock and resets it on re-corroboration
986
+ — order-stable regardless of distill order. `due_at` (the source
987
+ event's structured deadline, if any) populates `effective_until`."""
935
988
  if not facts:
936
989
  return 0
937
990
  inserted = 0
@@ -942,15 +995,26 @@ def upsert_facts(
942
995
  continue
943
996
  subj_name = f.get("subject")
944
997
  obj_name = f.get("object")
998
+ # Fusion Drive born-salience: a fact whose subject isn't among the
999
+ # event's declared entities (ungrounded subject) or that's barely
1000
+ # a sentence is born low so decay can clear it. n_sources=1 here.
1001
+ _fflags = []
1002
+ if subj_name and not name_to_id.get(subj_name):
1003
+ _fflags.append("subject_undeclared")
1004
+ if len(stmt) < 60:
1005
+ _fflags.append("low_signal")
1006
+ _fsal = born_salience(1, _fflags)
945
1007
  cur.execute(
946
1008
  """
947
1009
  INSERT INTO facts (
948
1010
  id, arena, category, subject_entity_id, predicate,
949
1011
  object_entity_id, statement, provenance_event_ids,
950
- stage, confidence, participant_set, disclosure_class
1012
+ stage, confidence, participant_set, disclosure_class, salience,
1013
+ asserted_at, effective_until
951
1014
  ) VALUES (
952
1015
  %s, %s, %s, %s, %s, %s, %s, %s,
953
- 'provisional'::extraction_stage, %s, %s, %s::disclosure_class
1016
+ 'provisional'::extraction_stage, %s, %s, %s::disclosure_class, %s,
1017
+ COALESCE(%s, NOW()), %s
954
1018
  )
955
1019
  ON CONFLICT (id) DO UPDATE SET
956
1020
  provenance_event_ids = (
@@ -958,6 +1022,7 @@ def upsert_facts(
958
1022
  facts.provenance_event_ids || EXCLUDED.provenance_event_ids
959
1023
  ))
960
1024
  ),
1025
+ salience = GREATEST(facts.salience, EXCLUDED.salience),
961
1026
  -- Confidence bumps with each additional independent
962
1027
  -- source. The cardinality of the merged provenance
963
1028
  -- array IS the corroboration count, so the formula
@@ -976,7 +1041,18 @@ def upsert_facts(
976
1041
  )
977
1042
  ),
978
1043
  0.9
979
- )
1044
+ ),
1045
+ -- `asserted_at` doubles as the decay clock for facts:
1046
+ -- #92's fusion_drive_decay ages off
1047
+ -- max(last_accessed, asserted_at) and resets that
1048
+ -- clock on re-corroboration (facts have no `last_seen`
1049
+ -- of their own). So on conflict we move it FORWARD
1050
+ -- with GREATEST to the newest source time across all
1051
+ -- corroborating events — newest evidence, not oldest.
1052
+ -- This also makes it order-stable (independent of
1053
+ -- distill order). EXCLUDED.asserted_at is the
1054
+ -- COALESCE(event_time, NOW()) from the INSERT above.
1055
+ asserted_at = GREATEST(facts.asserted_at, EXCLUDED.asserted_at)
980
1056
  """,
981
1057
  (
982
1058
  _content_id(arena, stmt),
@@ -990,6 +1066,9 @@ def upsert_facts(
990
1066
  float(f.get("confidence") or corroborated_confidence(1)),
991
1067
  participant_set,
992
1068
  disclosure_class,
1069
+ _fsal,
1070
+ event_time,
1071
+ due_at,
993
1072
  ),
994
1073
  )
995
1074
  inserted += 1
@@ -1004,9 +1083,14 @@ def upsert_relationships(
1004
1083
  disclosure_class: str,
1005
1084
  relationships: list[dict],
1006
1085
  name_to_id: dict[str, str],
1086
+ event_time: datetime | None,
1007
1087
  ) -> int:
1008
1088
  """Edge identity is (arena, from, to, type). ON CONFLICT bumps
1009
- weight + last_seen rather than duplicating."""
1089
+ weight + widens the seen-window rather than duplicating.
1090
+
1091
+ `first_seen`/`last_seen` are stamped from the event's SOURCE time
1092
+ (`event_time`), falling back to NOW() via COALESCE; re-corroboration
1093
+ widens with LEAST/GREATEST on the source time, not ingest time."""
1010
1094
  if not relationships:
1011
1095
  return 0
1012
1096
  inserted = 0
@@ -1022,21 +1106,25 @@ def upsert_relationships(
1022
1106
  """
1023
1107
  INSERT INTO relationships (
1024
1108
  id, arena, from_entity_id, to_entity_id, relationship_type,
1025
- weight, provenance_event_ids, participant_set, disclosure_class
1109
+ weight, provenance_event_ids, participant_set, disclosure_class,
1110
+ first_seen, last_seen
1026
1111
  ) VALUES (
1027
- %s, %s, %s, %s, %s, %s, %s, %s, %s::disclosure_class
1112
+ %s, %s, %s, %s, %s, %s, %s, %s, %s::disclosure_class,
1113
+ COALESCE(%s, NOW()), COALESCE(%s, NOW())
1028
1114
  )
1029
1115
  ON CONFLICT (id) DO UPDATE SET
1030
1116
  weight = relationships.weight + EXCLUDED.weight,
1031
1117
  provenance_event_ids = (
1032
1118
  SELECT ARRAY(SELECT DISTINCT UNNEST(relationships.provenance_event_ids || EXCLUDED.provenance_event_ids))
1033
1119
  ),
1034
- last_seen = NOW()
1120
+ last_seen = GREATEST(relationships.last_seen, EXCLUDED.last_seen),
1121
+ first_seen = LEAST(relationships.first_seen, EXCLUDED.first_seen)
1035
1122
  """,
1036
1123
  (
1037
1124
  rid, arena, from_id, to_id, rtype,
1038
1125
  float(r.get("confidence") or 0.5),
1039
1126
  [event_id], participant_set, disclosure_class,
1127
+ event_time, event_time,
1040
1128
  ),
1041
1129
  )
1042
1130
  inserted += 1
@@ -1288,7 +1376,7 @@ def fetch_event(conn: psycopg.Connection, event_id: str) -> dict[str, Any] | Non
1288
1376
  with conn.cursor(row_factory=psycopg.rows.dict_row) as cur:
1289
1377
  cur.execute(
1290
1378
  "SELECT id, arena, source_kind, content, attributes, participant_set, "
1291
- "disclosure_class FROM events WHERE id = %s",
1379
+ "disclosure_class, emitted_at FROM events WHERE id = %s",
1292
1380
  (event_id,),
1293
1381
  )
1294
1382
  return cur.fetchone()
@@ -1446,16 +1534,31 @@ async def process_batch(
1446
1534
  arena = event["arena"]
1447
1535
  participant_set = event.get("participant_set") or [arena]
1448
1536
  disclosure = event.get("disclosure_class") or "private"
1537
+ # SOURCE time of this event: prefer the parsed
1538
+ # `attributes.timestamp` (canonical), falling back to the
1539
+ # stored `emitted_at` column (which the sync path now also
1540
+ # stamps from source time). `None` ⇒ upserts fall back to
1541
+ # NOW() in-SQL. NEVER crash on a bad/absent source time.
1542
+ event_time = event_source_time(event) or event.get("emitted_at")
1543
+ # A structured deadline on the source event, if the producer
1544
+ # supplied one — populates facts.effective_until. Absent or
1545
+ # unparseable ⇒ None (column stays NULL, its existing
1546
+ # behaviour). Only `attributes.due_at` is honoured; we do NOT
1547
+ # guess deadlines from free text here.
1548
+ due_at = parse_source_time((event.get("attributes") or {}).get("due_at"))
1449
1549
 
1450
1550
  try:
1451
1551
  name_to_id = upsert_entities(
1452
- conn, arena, event_id, participant_set, disclosure, ents
1552
+ conn, arena, event_id, participant_set, disclosure, ents,
1553
+ event_time,
1453
1554
  )
1454
1555
  n_facts = upsert_facts(
1455
1556
  conn, arena, event_id, participant_set, disclosure, facts, name_to_id,
1557
+ event_time, due_at,
1456
1558
  )
1457
1559
  n_rels = upsert_relationships(
1458
1560
  conn, arena, event_id, participant_set, disclosure, rels, name_to_id,
1561
+ event_time,
1459
1562
  )
1460
1563
  mark_done(conn, queue_id)
1461
1564
  log.info(
@@ -5,7 +5,9 @@ WORKDIR /app
5
5
  COPY requirements.txt .
6
6
  RUN pip install --no-cache-dir -r requirements.txt
7
7
 
8
- COPY entity_id.py server.py .
8
+ # confidence.py is a byte-identical copy of extractor-async's (the born_salience
9
+ # scale must match the decay side; test_born_salience_parity guards drift). server.py also imports source_time.py.
10
+ COPY entity_id.py confidence.py source_time.py server.py .
9
11
 
10
12
  EXPOSE 8101
11
13
  CMD ["uvicorn", "server:app", "--host", "0.0.0.0", "--port", "8101", "--workers", "2"]
@@ -0,0 +1,99 @@
1
+ """confidence — fact confidence promotion based on multi-source corroboration.
2
+
3
+ Today every fact lands in org_model at confidence 0.5 / stage 'provisional'
4
+ and never moves. Live-data audit (2026-05-25): EVERY fact across 200
5
+ sampled rows in pentatonic-team is stuck at 0.5 — no signal of
6
+ "how trustworthy is this?" reaches the read side.
7
+
8
+ The right signal is **multi-source corroboration**: the same statement
9
+ appearing in two emails AND a calendar event is meaningfully more
10
+ trustworthy than a one-off mention in a Slack DM. The extractor
11
+ already records `provenance_event_ids` (the list of source events
12
+ that mention each fact), so the data needed for promotion is there
13
+ — we just don't use it.
14
+
15
+ Formula:
16
+
17
+ confidence = min(0.5 + 0.15 * (n_sources - 1), 0.9)
18
+
19
+ Concretely:
20
+
21
+ 1 source → 0.50 (single mention, default)
22
+ 2 sources → 0.65 (one corroboration)
23
+ 3 sources → 0.80
24
+ 4 sources → 0.90 (cap; "verified" remains human-only)
25
+ 5+ → 0.90
26
+
27
+ Cap at 0.9 reserves the [0.9, 1.0] range for human-verified facts
28
+ (`stage = 'verified'`), which the extractor cannot produce on its
29
+ own. We never bump the stage from `provisional` to `distilled` or
30
+ `verified` from this code path — corroboration is a signal, not a
31
+ promotion. Stage transitions stay deliberate / explicit.
32
+
33
+ Pure module — no I/O, no deps. Importable from worker.py without
34
+ pulling in psycopg / httpx.
35
+ """
36
+
37
+ from __future__ import annotations
38
+
39
+ # Bump-per-additional-source. Tuned so:
40
+ # 1 → 0.50 (base)
41
+ # 2 → 0.65
42
+ # 3 → 0.80
43
+ # 4 → 0.90 (cap reached)
44
+ # Picked instead of a smooth log/sqrt because the read-side bucket
45
+ # boundaries (UI badge colours) align cleanly with these steps.
46
+ _CONF_PER_SOURCE = 0.15
47
+ _CONF_BASE = 0.5
48
+ _CONF_CAP = 0.9
49
+
50
+
51
+ def corroborated_confidence(n_sources: int) -> float:
52
+ """Confidence score for a fact corroborated by `n_sources` events.
53
+
54
+ `n_sources <= 0` returns the base confidence — never negative,
55
+ never above the cap. Pure function for easy unit testing.
56
+ """
57
+ if n_sources <= 1:
58
+ return _CONF_BASE
59
+ bumped = _CONF_BASE + _CONF_PER_SOURCE * (n_sources - 1)
60
+ if bumped > _CONF_CAP:
61
+ return _CONF_CAP
62
+ return round(bumped, 2)
63
+
64
+
65
+ # ── born salience (Fusion Drive) ─────────────────────────────────────
66
+ # Retention priority a node is stamped with at extraction time, SEPARATE
67
+ # from confidence (confidence = corroboration/truth; salience = how long
68
+ # it's worth keeping). Junk — flagged by the extractor's own quality
69
+ # detectors (noise name, numeric-ID-as-person, hallucinated email,
70
+ # ungrounded, etc.) — is born near the floor so the Fusion Drive decay
71
+ # pass evicts it on a short clock instead of the multi-year default.
72
+ #
73
+ # This MUST stay byte-identical to fusion_drive/salience.py:born_salience
74
+ # (the decay side uses the same scale). test_born_salience_parity.py
75
+ # guards the two against drift — same pattern as entity_id.py's parity
76
+ # test across the sync/async build contexts.
77
+ _SAL_BASE = 0.50
78
+ _SAL_CORROB_PER_SOURCE = 0.10
79
+ _SAL_CORROB_CAP = 0.30
80
+ _SAL_FLOOR = 0.01
81
+ _SAL_CEIL = 1.00
82
+ _SAL_PENALTIES = {
83
+ "noise_name": 0.45,
84
+ "numeric_id_person": 0.45,
85
+ "hallucinated_email": 0.40,
86
+ "ungrounded": 0.35,
87
+ "subject_undeclared": 0.25,
88
+ "low_signal": 0.15,
89
+ }
90
+
91
+
92
+ def born_salience(n_sources: int = 1, quality_flags: list[str] | None = None) -> float:
93
+ """Salience to stamp on a freshly extracted node. See the module note."""
94
+ s = _SAL_BASE
95
+ if n_sources > 1:
96
+ s += min(_SAL_CORROB_CAP, _SAL_CORROB_PER_SOURCE * (n_sources - 1))
97
+ for flag in quality_flags or []:
98
+ s -= _SAL_PENALTIES.get(flag, 0.0)
99
+ return round(max(_SAL_FLOOR, min(_SAL_CEIL, s)), 4)
@@ -27,10 +27,14 @@ import os
27
27
  import re
28
28
  import time
29
29
  from contextlib import asynccontextmanager
30
+ from datetime import datetime # noqa: F401 (used in type hints)
30
31
  from typing import Any
31
32
 
32
33
  # Canonical entity-ID scheme — byte-identical copy in extractor-async (entity_id.py).
34
+ from confidence import born_salience
33
35
  from entity_id import entity_id, normalize_surface_form # noqa: F401
36
+ # Source-time parsing — byte-identical copy in extractor-async (source_time.py).
37
+ from source_time import event_source_time
34
38
 
35
39
  import psycopg
36
40
  import psycopg.rows
@@ -394,17 +398,27 @@ RULES = {
394
398
 
395
399
  async def _upsert_event(cur: psycopg.AsyncCursor, req: ExtractRequest,
396
400
  event_id: str, content_hash: str) -> None:
397
- """ON CONFLICT DO NOTHING — re-emitting the same event is a no-op."""
401
+ """ON CONFLICT DO NOTHING — re-emitting the same event is a no-op.
402
+
403
+ `emitted_at` is the SOURCE time of the content (when the
404
+ email/meeting/message actually happened), parsed from
405
+ `attributes.timestamp`; `received_at` keeps its NOW() default and
406
+ means ingest time — exactly the split the schema comment at
407
+ 001_init.sql:112 promises. When the source time is absent or
408
+ unparseable we fall back to NOW() via COALESCE (never NULL a
409
+ NOT NULL column)."""
410
+ emitted_at = event_source_time({"attributes": req.attributes})
398
411
  await cur.execute(
399
412
  """
400
413
  INSERT INTO events (
401
414
  id, arena, client_id, user_id, event_type, source_kind,
402
415
  source_id, content, content_hash, participant_set,
403
- participant_kind, disclosure_class, attributes
416
+ participant_kind, disclosure_class, attributes, emitted_at
404
417
  ) VALUES (
405
418
  %s, %s, %s, %s, %s, %s::source_kind,
406
419
  %s, %s, %s, %s,
407
- %s::participant_kind, %s::disclosure_class, %s::jsonb
420
+ %s::participant_kind, %s::disclosure_class, %s::jsonb,
421
+ COALESCE(%s, NOW())
408
422
  )
409
423
  ON CONFLICT (id) DO NOTHING
410
424
  """,
@@ -416,13 +430,26 @@ async def _upsert_event(cur: psycopg.AsyncCursor, req: ExtractRequest,
416
430
  req.attributes.get("participant_kind", "unknown"),
417
431
  req.attributes.get("disclosure_class", "private"),
418
432
  psycopg.types.json.Json(req.attributes),
433
+ emitted_at,
419
434
  ),
420
435
  )
421
436
 
422
437
 
423
- async def _upsert_entities(cur: psycopg.AsyncCursor, entities: list[dict]) -> None:
438
+ async def _upsert_entities(
439
+ cur: psycopg.AsyncCursor,
440
+ entities: list[dict],
441
+ event_time: "datetime | None",
442
+ ) -> None:
424
443
  """Alias-aware idempotent entity upsert.
425
444
 
445
+ `event_time` is the SOURCE time of the originating event (parsed from
446
+ `attributes.timestamp`); it stamps `first_seen`/`last_seen` so the
447
+ graph tracks when the evidence actually happened, not when we
448
+ ingested it. `None` (no/garbage source time) falls back to NOW() via
449
+ COALESCE. On re-corroboration we widen the window with
450
+ LEAST(first_seen, ...) / GREATEST(last_seen, ...): "most recent
451
+ evidence" = newest SOURCE time, not newest ingest.
452
+
426
453
  For each entity, before inserting, look for an existing row in the
427
454
  same (arena, entity_type) whose canonical_name OR aliases overlap
428
455
  any of the incoming surface forms. If found, merge aliases +
@@ -488,23 +515,40 @@ async def _upsert_entities(cur: psycopg.AsyncCursor, entities: list[dict]) -> No
488
515
  UPDATE entities SET
489
516
  aliases = ARRAY(SELECT DISTINCT UNNEST(aliases || %s::text[])),
490
517
  provenance_event_ids = ARRAY(SELECT DISTINCT UNNEST(provenance_event_ids || %s::text[])),
491
- last_seen = NOW()
518
+ -- Widen the seen-window with this event's SOURCE time,
519
+ -- not NOW(): newest evidence = newest source time.
520
+ last_seen = GREATEST(last_seen, COALESCE(%s, NOW())),
521
+ first_seen = LEAST(first_seen, COALESCE(%s, NOW()))
492
522
  WHERE id = %s
493
523
  """,
494
- (e["aliases"], e["provenance_event_ids"], existing_id),
524
+ (e["aliases"], e["provenance_event_ids"],
525
+ event_time, event_time, existing_id),
495
526
  )
496
527
  else:
497
528
  # 3b. No match — insert new. ON CONFLICT (id) is a belt-
498
529
  # and-braces fallback for the rare case where two writers
499
530
  # collide on the same id under different surface forms;
500
531
  # the advisory lock above is the primary defence.
532
+ # Fusion Drive born-salience via the SHARED born_salience (no
533
+ # inline constants — they'd drift from the async path; #96 review
534
+ # §4). Sync entities are deterministic (names from structured
535
+ # email/calendar fields) so they're high-quality; the one junk
536
+ # class sync can still emit is a numeric-ID-as-person, flagged so
537
+ # it's born low and decay can evict it. The async distiller owns
538
+ # the full quality-flag set.
539
+ _digits = sum(c.isdigit() for c in e["canonical_name"] if not c.isspace())
540
+ _nonspace = sum(1 for c in e["canonical_name"] if not c.isspace()) or 1
541
+ _flags = ["numeric_id_person"] if (e["entity_type"] == "person" and _digits / _nonspace > 0.5) else []
542
+ _sal = born_salience(1, _flags)
501
543
  await cur.execute(
502
544
  """
503
545
  INSERT INTO entities (
504
546
  id, arena, entity_type, canonical_name, aliases,
505
- provenance_event_ids, participant_set, disclosure_class
547
+ provenance_event_ids, participant_set, disclosure_class,
548
+ first_seen, last_seen, salience
506
549
  ) VALUES (
507
- %s, %s, %s, %s, %s, %s, %s, %s::disclosure_class
550
+ %s, %s, %s, %s, %s, %s, %s, %s::disclosure_class,
551
+ COALESCE(%s, NOW()), COALESCE(%s, NOW()), %s
508
552
  )
509
553
  ON CONFLICT (id) DO UPDATE SET
510
554
  aliases = (
@@ -513,11 +557,14 @@ async def _upsert_entities(cur: psycopg.AsyncCursor, entities: list[dict]) -> No
513
557
  provenance_event_ids = (
514
558
  SELECT ARRAY(SELECT DISTINCT UNNEST(entities.provenance_event_ids || EXCLUDED.provenance_event_ids))
515
559
  ),
516
- last_seen = NOW()
560
+ salience = GREATEST(entities.salience, EXCLUDED.salience),
561
+ last_seen = GREATEST(entities.last_seen, EXCLUDED.last_seen),
562
+ first_seen = LEAST(entities.first_seen, EXCLUDED.first_seen)
517
563
  """,
518
564
  (e["id"], e["arena"], e["entity_type"], e["canonical_name"],
519
565
  e["aliases"], e["provenance_event_ids"],
520
- e["participant_set"], e["disclosure_class"]),
566
+ e["participant_set"], e["disclosure_class"],
567
+ event_time, event_time, _sal),
521
568
  )
522
569
 
523
570
 
@@ -584,7 +631,10 @@ async def extract(req: ExtractRequest):
584
631
  async with _pool.connection() as conn:
585
632
  async with conn.cursor() as cur:
586
633
  await _upsert_event(cur, req, event_id, content_hash)
587
- await _upsert_entities(cur, entities)
634
+ # Source time of THIS event — stamps the graph rows so
635
+ # first/last_seen track content time, not ingest time.
636
+ event_time = event_source_time({"attributes": req.attributes})
637
+ await _upsert_entities(cur, entities, event_time)
588
638
  # Facts + relationships are deliberately left to the async
589
639
  # distillation worker — the deterministic path can't
590
640
  # reliably extract decisions/commitments without LLM context.
@@ -0,0 +1,63 @@
1
+ """source_time — robust ISO-8601 source-time parsing for graph stamping.
2
+
3
+ The memory graph must stamp `events.emitted_at` and the graph rows'
4
+ `first_seen` / `last_seen` / `asserted_at` from the SOURCE time of the
5
+ content (when the email/meeting/message actually happened), NOT the
6
+ ingest wall-clock (`NOW()`). The source time is carried on the event as
7
+ `attributes.timestamp` (ISO-8601). This helper promotes it.
8
+
9
+ Mirrors `compat/server.py:_parse_ts` (handles the bare `Z` suffix that
10
+ `datetime.fromisoformat` only learned in 3.11) but returns a tz-aware
11
+ `datetime` rather than a unix float, because the destination columns are
12
+ `TIMESTAMPTZ` and we want psycopg to bind a datetime, not an epoch.
13
+
14
+ CONTRACT (load-bearing): callers MUST fall back to the existing default
15
+ (received / NOW) when the source time is absent or unparseable. This
16
+ helper NEVER raises and returns `None` on anything it can't parse — the
17
+ caller is responsible for the `or NOW()` fallback so we never NULL a
18
+ NOT NULL column or crash the ingest/distill path.
19
+
20
+ NOTE: keep this byte-identical with the copy in extractor-sync/. Same
21
+ convention as entity_id.py — two services, one parsing rule.
22
+ """
23
+
24
+ from __future__ import annotations
25
+
26
+ from datetime import datetime, timezone
27
+ from typing import Any
28
+
29
+
30
+ def parse_source_time(value: Any) -> datetime | None:
31
+ """Best-effort ISO-8601 -> tz-aware datetime. Returns None on
32
+ anything we can't parse (caller falls back to NOW()).
33
+
34
+ Accepts both the bare `Z` suffix and explicit offsets. A parsed
35
+ value with no offset is assumed UTC (the producers emit UTC ISO
36
+ strings; a naive datetime would break TIMESTAMPTZ comparisons)."""
37
+ if not isinstance(value, str) or not value:
38
+ return None
39
+ try:
40
+ # `fromisoformat` handles `+00:00` but not the bare `Z` suffix
41
+ # until Python 3.11; normalise to be safe across runtime
42
+ # versions on the engine box.
43
+ dt = datetime.fromisoformat(value.replace("Z", "+00:00"))
44
+ except Exception:
45
+ return None
46
+ if dt.tzinfo is None:
47
+ # Producer emitted a naive ISO string; treat as UTC rather than
48
+ # letting psycopg interpret it in the server's local zone.
49
+ dt = dt.replace(tzinfo=timezone.utc)
50
+ return dt
51
+
52
+
53
+ def event_source_time(event: dict[str, Any]) -> datetime | None:
54
+ """Pull the source time off an event dict's attributes.
55
+
56
+ Precedence: `attributes.timestamp` (the source/content time) wins
57
+ over `attributes.emitted_at` (a producer-supplied emit-now, which is
58
+ closer to ingest time). Returns None if neither parses — caller
59
+ falls back to NOW()."""
60
+ attrs = event.get("attributes") or {}
61
+ return parse_source_time(attrs.get("timestamp")) or parse_source_time(
62
+ attrs.get("emitted_at")
63
+ )
@@ -0,0 +1,18 @@
1
+ """extractor-sync/confidence.py must stay byte-identical to extractor-async's
2
+ copy — both carry born_salience, whose scale must match the Fusion Drive decay
3
+ side. Same drift guard as test_entity_id_parity.py across the build contexts."""
4
+
5
+ from __future__ import annotations
6
+
7
+ import os
8
+
9
+
10
+ def test_sync_confidence_is_byte_identical_to_async():
11
+ here = os.path.dirname(__file__)
12
+ sync = os.path.join(here, "confidence.py")
13
+ async_ = os.path.join(here, "..", "extractor-async", "confidence.py")
14
+ with open(sync, "rb") as f:
15
+ a = f.read()
16
+ with open(async_, "rb") as f:
17
+ b = f.read()
18
+ assert a == b, "extractor-sync/confidence.py drifted from extractor-async/confidence.py"
@@ -273,7 +273,7 @@ def test_pool_keeps_default_tuple_row_factory() -> None:
273
273
  def test_upsert_entities_merge_branch_with_tuple_rows() -> None:
274
274
  """Entity already exists → UPDATE branch runs, id taken from row[0]."""
275
275
  cur = _FakeCursor(existing_id="e_existing")
276
- asyncio.run(sync_server._upsert_entities(cur, [_entity_stub()]))
276
+ asyncio.run(sync_server._upsert_entities(cur, [_entity_stub()], None))
277
277
  updates = [(s, p) for s, p in cur.executed if s.startswith("UPDATE entities")]
278
278
  assert len(updates) == 1
279
279
  _, params = updates[0]
@@ -283,7 +283,7 @@ def test_upsert_entities_merge_branch_with_tuple_rows() -> None:
283
283
 
284
284
  def test_upsert_entities_insert_branch_when_no_match() -> None:
285
285
  cur = _FakeCursor(existing_id=None)
286
- asyncio.run(sync_server._upsert_entities(cur, [_entity_stub()]))
286
+ asyncio.run(sync_server._upsert_entities(cur, [_entity_stub()], None))
287
287
  inserts = [s for s, _ in cur.executed if s.startswith("INSERT INTO entities")]
288
288
  assert len(inserts) == 1
289
289
  assert not any(s.startswith("UPDATE entities") for s, _ in cur.executed)