@pentatonic-ai/ai-agent-sdk 0.10.7 → 0.10.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +1 -1
- package/dist/index.js +1 -1
- package/package.json +1 -1
- package/packages/memory-engine-v2/RFC-decay-and-fusion.md +185 -0
- package/packages/memory-engine-v2/RFC-fusion-drive.md +199 -0
- package/packages/memory-engine-v2/extractor-async/confidence.py +37 -0
- package/packages/memory-engine-v2/extractor-async/source_time.py +63 -0
- package/packages/memory-engine-v2/extractor-async/test_born_salience_parity.py +35 -0
- package/packages/memory-engine-v2/extractor-async/test_source_time.py +102 -0
- package/packages/memory-engine-v2/extractor-async/worker.py +121 -18
- package/packages/memory-engine-v2/extractor-sync/Dockerfile +3 -1
- package/packages/memory-engine-v2/extractor-sync/confidence.py +99 -0
- package/packages/memory-engine-v2/extractor-sync/server.py +61 -11
- package/packages/memory-engine-v2/extractor-sync/source_time.py +63 -0
- package/packages/memory-engine-v2/extractor-sync/test_confidence_parity.py +18 -0
- package/packages/memory-engine-v2/extractor-sync/test_paired_extraction.py +2 -2
- package/packages/memory-engine-v2/fusion_drive/__init__.py +0 -0
- package/packages/memory-engine-v2/fusion_drive/adjudicate.py +85 -0
- package/packages/memory-engine-v2/fusion_drive/canonical.py +94 -0
- package/packages/memory-engine-v2/fusion_drive/conftest.py +8 -0
- package/packages/memory-engine-v2/fusion_drive/merge.py +178 -0
- package/packages/memory-engine-v2/fusion_drive/salience.py +118 -0
- package/packages/memory-engine-v2/fusion_drive/test_adjudicate.py +65 -0
- package/packages/memory-engine-v2/fusion_drive/test_canonical.py +76 -0
- package/packages/memory-engine-v2/fusion_drive/test_merge.py +112 -0
- package/packages/memory-engine-v2/fusion_drive/test_salience.py +93 -0
- package/packages/memory-engine-v2/org-model/migrations/006_fusion_drive.sql +80 -0
- package/packages/memory-engine-v2/scripts/fusion_drive_born_salience_backfill.py +113 -0
- package/packages/memory-engine-v2/scripts/fusion_drive_decay.py +200 -0
- package/packages/memory-engine-v2/scripts/fusion_drive_fuse.py +434 -0
|
@@ -33,14 +33,16 @@ import os
|
|
|
33
33
|
import re
|
|
34
34
|
import socket
|
|
35
35
|
import time
|
|
36
|
+
from datetime import datetime
|
|
36
37
|
from typing import Any
|
|
37
38
|
|
|
38
39
|
import httpx
|
|
39
40
|
import psycopg
|
|
40
41
|
import psycopg.rows
|
|
41
42
|
|
|
42
|
-
from confidence import corroborated_confidence
|
|
43
|
+
from confidence import born_salience, corroborated_confidence
|
|
43
44
|
from entity_id import entity_id, normalize_surface_form
|
|
45
|
+
from source_time import event_source_time, parse_source_time
|
|
44
46
|
from extraction_schema import (
|
|
45
47
|
ALLOWED_ENT_TYPES,
|
|
46
48
|
ALLOWED_FCT_CATEGORIES,
|
|
@@ -372,7 +374,10 @@ def build_event_block(idx: int, event: dict[str, Any]) -> str:
|
|
|
372
374
|
src = event.get("source_kind", "unknown")
|
|
373
375
|
content = clean_content(event.get("content") or "")[:MAX_CONTENT_CHARS]
|
|
374
376
|
attrs = event.get("attributes") or {}
|
|
375
|
-
|
|
377
|
+
# Prefer the SOURCE time (`timestamp`) over the producer's emit-now
|
|
378
|
+
# (`emitted_at`) so the LLM anchors "when" to when the content
|
|
379
|
+
# actually happened, not when it was forwarded into ingest.
|
|
380
|
+
when = attrs.get("timestamp") or attrs.get("emitted_at")
|
|
376
381
|
author = attrs.get("author") or attrs.get("user_id")
|
|
377
382
|
header = [f"[event {idx}]", f"source_kind: {src}"]
|
|
378
383
|
if when:
|
|
@@ -782,6 +787,15 @@ def _content_id(*parts: str) -> str:
|
|
|
782
787
|
return hashlib.sha256("\x1f".join(parts).encode()).hexdigest()[:32]
|
|
783
788
|
|
|
784
789
|
|
|
790
|
+
def _digit_ratio(s: str) -> float:
|
|
791
|
+
"""Fraction of non-whitespace chars that are digits. Used to flag
|
|
792
|
+
numeric-ID-as-person junk for Fusion Drive born-salience."""
|
|
793
|
+
stripped = "".join(s.split())
|
|
794
|
+
if not stripped:
|
|
795
|
+
return 0.0
|
|
796
|
+
return sum(c.isdigit() for c in stripped) / len(stripped)
|
|
797
|
+
|
|
798
|
+
|
|
785
799
|
def upsert_entities(
|
|
786
800
|
conn: psycopg.Connection,
|
|
787
801
|
arena: str,
|
|
@@ -789,10 +803,18 @@ def upsert_entities(
|
|
|
789
803
|
participant_set: list[str],
|
|
790
804
|
disclosure_class: str,
|
|
791
805
|
entities: list[dict],
|
|
806
|
+
event_time: datetime | None,
|
|
792
807
|
) -> dict[str, str]:
|
|
793
808
|
"""Alias-aware insert (or merge) of entities; returns a name→id
|
|
794
809
|
map so facts and relationships can link to the inserted rows.
|
|
795
810
|
|
|
811
|
+
`event_time` is the SOURCE time of the event being distilled (parsed
|
|
812
|
+
from `attributes.timestamp`); it stamps `first_seen`/`last_seen` so
|
|
813
|
+
the graph tracks content time, not ingest time. `None` (no/garbage
|
|
814
|
+
source time) falls back to NOW() via COALESCE — never NULLs a NOT
|
|
815
|
+
NULL column. Re-corroboration widens the window with LEAST/GREATEST
|
|
816
|
+
on the SOURCE time, so "most recent evidence" = newest source time.
|
|
817
|
+
|
|
796
818
|
Two concerns layered together:
|
|
797
819
|
|
|
798
820
|
1. **ID derivation** uses the shared `entity_id()` helper from
|
|
@@ -875,20 +897,35 @@ def upsert_entities(
|
|
|
875
897
|
UPDATE entities SET
|
|
876
898
|
aliases = ARRAY(SELECT DISTINCT UNNEST(aliases || %s::text[])),
|
|
877
899
|
provenance_event_ids = ARRAY(SELECT DISTINCT UNNEST(provenance_event_ids || %s::text[])),
|
|
878
|
-
|
|
900
|
+
-- Widen the seen-window with this event's SOURCE
|
|
901
|
+
-- time, not NOW(): newest evidence = newest source.
|
|
902
|
+
last_seen = GREATEST(last_seen, COALESCE(%s, NOW())),
|
|
903
|
+
first_seen = LEAST(first_seen, COALESCE(%s, NOW()))
|
|
879
904
|
WHERE id = %s
|
|
880
905
|
""",
|
|
881
|
-
(aliases, [event_id], eid),
|
|
906
|
+
(aliases, [event_id], event_time, event_time, eid),
|
|
882
907
|
)
|
|
883
908
|
else:
|
|
884
909
|
# 3b. No match — insert new.
|
|
885
910
|
eid = entity_id(arena, etype, name)
|
|
911
|
+
# Fusion Drive born-salience: a numeric-ID-as-person (classic
|
|
912
|
+
# 7B junk that slips past noise_filter, e.g. "1716801984") is
|
|
913
|
+
# born near the floor so the decay pass can evict it on a short
|
|
914
|
+
# clock instead of the multi-year entity default.
|
|
915
|
+
_qflags = []
|
|
916
|
+
if etype == "person" and _digit_ratio(name) > 0.5:
|
|
917
|
+
_qflags.append("numeric_id_person")
|
|
918
|
+
_sal = born_salience(1, _qflags)
|
|
886
919
|
cur.execute(
|
|
887
920
|
"""
|
|
888
921
|
INSERT INTO entities (
|
|
889
922
|
id, arena, entity_type, canonical_name, aliases,
|
|
890
|
-
provenance_event_ids, participant_set, disclosure_class
|
|
891
|
-
|
|
923
|
+
provenance_event_ids, participant_set, disclosure_class, salience,
|
|
924
|
+
first_seen, last_seen
|
|
925
|
+
) VALUES (
|
|
926
|
+
%s, %s, %s, %s, %s, %s, %s, %s::disclosure_class, %s,
|
|
927
|
+
COALESCE(%s, NOW()), COALESCE(%s, NOW())
|
|
928
|
+
)
|
|
892
929
|
ON CONFLICT (id) DO UPDATE SET
|
|
893
930
|
aliases = (
|
|
894
931
|
SELECT ARRAY(SELECT DISTINCT UNNEST(entities.aliases || EXCLUDED.aliases))
|
|
@@ -896,11 +933,16 @@ def upsert_entities(
|
|
|
896
933
|
provenance_event_ids = (
|
|
897
934
|
SELECT ARRAY(SELECT DISTINCT UNNEST(entities.provenance_event_ids || EXCLUDED.provenance_event_ids))
|
|
898
935
|
),
|
|
899
|
-
|
|
936
|
+
-- re-corroboration can only RAISE salience, never lower it
|
|
937
|
+
salience = GREATEST(entities.salience, EXCLUDED.salience),
|
|
938
|
+
-- widen the seen-window on SOURCE time, not NOW()
|
|
939
|
+
last_seen = GREATEST(entities.last_seen, EXCLUDED.last_seen),
|
|
940
|
+
first_seen = LEAST(entities.first_seen, EXCLUDED.first_seen)
|
|
900
941
|
""",
|
|
901
942
|
(
|
|
902
943
|
eid, arena, etype, name, aliases,
|
|
903
|
-
[event_id], participant_set, disclosure_class,
|
|
944
|
+
[event_id], participant_set, disclosure_class, _sal,
|
|
945
|
+
event_time, event_time,
|
|
904
946
|
),
|
|
905
947
|
)
|
|
906
948
|
name_to_id[name] = eid
|
|
@@ -915,6 +957,8 @@ def upsert_facts(
|
|
|
915
957
|
disclosure_class: str,
|
|
916
958
|
facts: list[dict],
|
|
917
959
|
name_to_id: dict[str, str],
|
|
960
|
+
event_time: datetime | None,
|
|
961
|
+
due_at: datetime | None = None,
|
|
918
962
|
) -> int:
|
|
919
963
|
"""Facts are content-hashed on (arena, statement). Same statement
|
|
920
964
|
extracted from any event in the arena converges to the same row,
|
|
@@ -931,7 +975,16 @@ def upsert_facts(
|
|
|
931
975
|
(see confidence.py — caps at 0.9 to reserve [0.9, 1.0] for
|
|
932
976
|
`stage = 'verified'` which only a human can produce). Stage stays
|
|
933
977
|
`provisional`; corroboration is a signal, not a graduation.
|
|
934
|
-
|
|
978
|
+
|
|
979
|
+
`asserted_at` is stamped from the event's SOURCE time (`event_time`,
|
|
980
|
+
parsed from `attributes.timestamp`), falling back to NOW() via
|
|
981
|
+
COALESCE — so the temporal anchor is when the fact was actually
|
|
982
|
+
asserted, not when we distilled it. On corroboration it moves
|
|
983
|
+
FORWARD with GREATEST to the newest source time across all
|
|
984
|
+
corroborating events: facts have no `last_seen`, so #92's decay uses
|
|
985
|
+
`asserted_at` as the recency clock and resets it on re-corroboration
|
|
986
|
+
— order-stable regardless of distill order. `due_at` (the source
|
|
987
|
+
event's structured deadline, if any) populates `effective_until`."""
|
|
935
988
|
if not facts:
|
|
936
989
|
return 0
|
|
937
990
|
inserted = 0
|
|
@@ -942,15 +995,26 @@ def upsert_facts(
|
|
|
942
995
|
continue
|
|
943
996
|
subj_name = f.get("subject")
|
|
944
997
|
obj_name = f.get("object")
|
|
998
|
+
# Fusion Drive born-salience: a fact whose subject isn't among the
|
|
999
|
+
# event's declared entities (ungrounded subject) or that's barely
|
|
1000
|
+
# a sentence is born low so decay can clear it. n_sources=1 here.
|
|
1001
|
+
_fflags = []
|
|
1002
|
+
if subj_name and not name_to_id.get(subj_name):
|
|
1003
|
+
_fflags.append("subject_undeclared")
|
|
1004
|
+
if len(stmt) < 60:
|
|
1005
|
+
_fflags.append("low_signal")
|
|
1006
|
+
_fsal = born_salience(1, _fflags)
|
|
945
1007
|
cur.execute(
|
|
946
1008
|
"""
|
|
947
1009
|
INSERT INTO facts (
|
|
948
1010
|
id, arena, category, subject_entity_id, predicate,
|
|
949
1011
|
object_entity_id, statement, provenance_event_ids,
|
|
950
|
-
stage, confidence, participant_set, disclosure_class
|
|
1012
|
+
stage, confidence, participant_set, disclosure_class, salience,
|
|
1013
|
+
asserted_at, effective_until
|
|
951
1014
|
) VALUES (
|
|
952
1015
|
%s, %s, %s, %s, %s, %s, %s, %s,
|
|
953
|
-
'provisional'::extraction_stage, %s, %s, %s::disclosure_class
|
|
1016
|
+
'provisional'::extraction_stage, %s, %s, %s::disclosure_class, %s,
|
|
1017
|
+
COALESCE(%s, NOW()), %s
|
|
954
1018
|
)
|
|
955
1019
|
ON CONFLICT (id) DO UPDATE SET
|
|
956
1020
|
provenance_event_ids = (
|
|
@@ -958,6 +1022,7 @@ def upsert_facts(
|
|
|
958
1022
|
facts.provenance_event_ids || EXCLUDED.provenance_event_ids
|
|
959
1023
|
))
|
|
960
1024
|
),
|
|
1025
|
+
salience = GREATEST(facts.salience, EXCLUDED.salience),
|
|
961
1026
|
-- Confidence bumps with each additional independent
|
|
962
1027
|
-- source. The cardinality of the merged provenance
|
|
963
1028
|
-- array IS the corroboration count, so the formula
|
|
@@ -976,7 +1041,18 @@ def upsert_facts(
|
|
|
976
1041
|
)
|
|
977
1042
|
),
|
|
978
1043
|
0.9
|
|
979
|
-
)
|
|
1044
|
+
),
|
|
1045
|
+
-- `asserted_at` doubles as the decay clock for facts:
|
|
1046
|
+
-- #92's fusion_drive_decay ages off
|
|
1047
|
+
-- max(last_accessed, asserted_at) and resets that
|
|
1048
|
+
-- clock on re-corroboration (facts have no `last_seen`
|
|
1049
|
+
-- of their own). So on conflict we move it FORWARD
|
|
1050
|
+
-- with GREATEST to the newest source time across all
|
|
1051
|
+
-- corroborating events — newest evidence, not oldest.
|
|
1052
|
+
-- This also makes it order-stable (independent of
|
|
1053
|
+
-- distill order). EXCLUDED.asserted_at is the
|
|
1054
|
+
-- COALESCE(event_time, NOW()) from the INSERT above.
|
|
1055
|
+
asserted_at = GREATEST(facts.asserted_at, EXCLUDED.asserted_at)
|
|
980
1056
|
""",
|
|
981
1057
|
(
|
|
982
1058
|
_content_id(arena, stmt),
|
|
@@ -990,6 +1066,9 @@ def upsert_facts(
|
|
|
990
1066
|
float(f.get("confidence") or corroborated_confidence(1)),
|
|
991
1067
|
participant_set,
|
|
992
1068
|
disclosure_class,
|
|
1069
|
+
_fsal,
|
|
1070
|
+
event_time,
|
|
1071
|
+
due_at,
|
|
993
1072
|
),
|
|
994
1073
|
)
|
|
995
1074
|
inserted += 1
|
|
@@ -1004,9 +1083,14 @@ def upsert_relationships(
|
|
|
1004
1083
|
disclosure_class: str,
|
|
1005
1084
|
relationships: list[dict],
|
|
1006
1085
|
name_to_id: dict[str, str],
|
|
1086
|
+
event_time: datetime | None,
|
|
1007
1087
|
) -> int:
|
|
1008
1088
|
"""Edge identity is (arena, from, to, type). ON CONFLICT bumps
|
|
1009
|
-
weight +
|
|
1089
|
+
weight + widens the seen-window rather than duplicating.
|
|
1090
|
+
|
|
1091
|
+
`first_seen`/`last_seen` are stamped from the event's SOURCE time
|
|
1092
|
+
(`event_time`), falling back to NOW() via COALESCE; re-corroboration
|
|
1093
|
+
widens with LEAST/GREATEST on the source time, not ingest time."""
|
|
1010
1094
|
if not relationships:
|
|
1011
1095
|
return 0
|
|
1012
1096
|
inserted = 0
|
|
@@ -1022,21 +1106,25 @@ def upsert_relationships(
|
|
|
1022
1106
|
"""
|
|
1023
1107
|
INSERT INTO relationships (
|
|
1024
1108
|
id, arena, from_entity_id, to_entity_id, relationship_type,
|
|
1025
|
-
weight, provenance_event_ids, participant_set, disclosure_class
|
|
1109
|
+
weight, provenance_event_ids, participant_set, disclosure_class,
|
|
1110
|
+
first_seen, last_seen
|
|
1026
1111
|
) VALUES (
|
|
1027
|
-
%s, %s, %s, %s, %s, %s, %s, %s, %s::disclosure_class
|
|
1112
|
+
%s, %s, %s, %s, %s, %s, %s, %s, %s::disclosure_class,
|
|
1113
|
+
COALESCE(%s, NOW()), COALESCE(%s, NOW())
|
|
1028
1114
|
)
|
|
1029
1115
|
ON CONFLICT (id) DO UPDATE SET
|
|
1030
1116
|
weight = relationships.weight + EXCLUDED.weight,
|
|
1031
1117
|
provenance_event_ids = (
|
|
1032
1118
|
SELECT ARRAY(SELECT DISTINCT UNNEST(relationships.provenance_event_ids || EXCLUDED.provenance_event_ids))
|
|
1033
1119
|
),
|
|
1034
|
-
last_seen =
|
|
1120
|
+
last_seen = GREATEST(relationships.last_seen, EXCLUDED.last_seen),
|
|
1121
|
+
first_seen = LEAST(relationships.first_seen, EXCLUDED.first_seen)
|
|
1035
1122
|
""",
|
|
1036
1123
|
(
|
|
1037
1124
|
rid, arena, from_id, to_id, rtype,
|
|
1038
1125
|
float(r.get("confidence") or 0.5),
|
|
1039
1126
|
[event_id], participant_set, disclosure_class,
|
|
1127
|
+
event_time, event_time,
|
|
1040
1128
|
),
|
|
1041
1129
|
)
|
|
1042
1130
|
inserted += 1
|
|
@@ -1288,7 +1376,7 @@ def fetch_event(conn: psycopg.Connection, event_id: str) -> dict[str, Any] | Non
|
|
|
1288
1376
|
with conn.cursor(row_factory=psycopg.rows.dict_row) as cur:
|
|
1289
1377
|
cur.execute(
|
|
1290
1378
|
"SELECT id, arena, source_kind, content, attributes, participant_set, "
|
|
1291
|
-
"disclosure_class FROM events WHERE id = %s",
|
|
1379
|
+
"disclosure_class, emitted_at FROM events WHERE id = %s",
|
|
1292
1380
|
(event_id,),
|
|
1293
1381
|
)
|
|
1294
1382
|
return cur.fetchone()
|
|
@@ -1446,16 +1534,31 @@ async def process_batch(
|
|
|
1446
1534
|
arena = event["arena"]
|
|
1447
1535
|
participant_set = event.get("participant_set") or [arena]
|
|
1448
1536
|
disclosure = event.get("disclosure_class") or "private"
|
|
1537
|
+
# SOURCE time of this event: prefer the parsed
|
|
1538
|
+
# `attributes.timestamp` (canonical), falling back to the
|
|
1539
|
+
# stored `emitted_at` column (which the sync path now also
|
|
1540
|
+
# stamps from source time). `None` ⇒ upserts fall back to
|
|
1541
|
+
# NOW() in-SQL. NEVER crash on a bad/absent source time.
|
|
1542
|
+
event_time = event_source_time(event) or event.get("emitted_at")
|
|
1543
|
+
# A structured deadline on the source event, if the producer
|
|
1544
|
+
# supplied one — populates facts.effective_until. Absent or
|
|
1545
|
+
# unparseable ⇒ None (column stays NULL, its existing
|
|
1546
|
+
# behaviour). Only `attributes.due_at` is honoured; we do NOT
|
|
1547
|
+
# guess deadlines from free text here.
|
|
1548
|
+
due_at = parse_source_time((event.get("attributes") or {}).get("due_at"))
|
|
1449
1549
|
|
|
1450
1550
|
try:
|
|
1451
1551
|
name_to_id = upsert_entities(
|
|
1452
|
-
conn, arena, event_id, participant_set, disclosure, ents
|
|
1552
|
+
conn, arena, event_id, participant_set, disclosure, ents,
|
|
1553
|
+
event_time,
|
|
1453
1554
|
)
|
|
1454
1555
|
n_facts = upsert_facts(
|
|
1455
1556
|
conn, arena, event_id, participant_set, disclosure, facts, name_to_id,
|
|
1557
|
+
event_time, due_at,
|
|
1456
1558
|
)
|
|
1457
1559
|
n_rels = upsert_relationships(
|
|
1458
1560
|
conn, arena, event_id, participant_set, disclosure, rels, name_to_id,
|
|
1561
|
+
event_time,
|
|
1459
1562
|
)
|
|
1460
1563
|
mark_done(conn, queue_id)
|
|
1461
1564
|
log.info(
|
|
@@ -5,7 +5,9 @@ WORKDIR /app
|
|
|
5
5
|
COPY requirements.txt .
|
|
6
6
|
RUN pip install --no-cache-dir -r requirements.txt
|
|
7
7
|
|
|
8
|
-
|
|
8
|
+
# confidence.py is a byte-identical copy of extractor-async's (the born_salience
|
|
9
|
+
# scale must match the decay side). test_confidence_parity.py guards drift.
|
|
10
|
+
# server.py imports source_time at module load, so it has to ship in the image
# alongside entity_id.py and confidence.py or the import of server:app fails.
COPY entity_id.py confidence.py source_time.py server.py .
|
|
9
11
|
|
|
10
12
|
EXPOSE 8101
|
|
11
13
|
CMD ["uvicorn", "server:app", "--host", "0.0.0.0", "--port", "8101", "--workers", "2"]
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
"""confidence — fact confidence promotion based on multi-source corroboration.
|
|
2
|
+
|
|
3
|
+
Today every fact lands in org_model at confidence 0.5 / stage 'provisional'
|
|
4
|
+
and never moves. Live-data audit (2026-05-25): EVERY fact across 200
|
|
5
|
+
sampled rows in pentatonic-team is stuck at 0.5 — no signal of
|
|
6
|
+
"how trustworthy is this?" reaches the read side.
|
|
7
|
+
|
|
8
|
+
The right signal is **multi-source corroboration**: the same statement
|
|
9
|
+
appearing in two emails AND a calendar event is meaningfully more
|
|
10
|
+
trustworthy than a one-off mention in a Slack DM. The extractor
|
|
11
|
+
already records `provenance_event_ids` (the list of source events
|
|
12
|
+
that mention each fact), so the data needed for promotion is there
|
|
13
|
+
— we just don't use it.
|
|
14
|
+
|
|
15
|
+
Formula:
|
|
16
|
+
|
|
17
|
+
confidence = min(0.5 + 0.15 * (n_sources - 1), 0.9)
|
|
18
|
+
|
|
19
|
+
Concretely:
|
|
20
|
+
|
|
21
|
+
1 source → 0.50 (single mention, default)
|
|
22
|
+
2 sources → 0.65 (one corroboration)
|
|
23
|
+
3 sources → 0.80
|
|
24
|
+
4 sources → 0.90 (cap; "verified" remains human-only)
|
|
25
|
+
5+ → 0.90
|
|
26
|
+
|
|
27
|
+
Cap at 0.9 reserves the [0.9, 1.0] range for human-verified facts
|
|
28
|
+
(`stage = 'verified'`), which the extractor cannot produce on its
|
|
29
|
+
own. We never bump the stage from `provisional` to `distilled` or
|
|
30
|
+
`verified` from this code path — corroboration is a signal, not a
|
|
31
|
+
promotion. Stage transitions stay deliberate / explicit.
|
|
32
|
+
|
|
33
|
+
Pure module — no I/O, no deps. Importable from worker.py without
|
|
34
|
+
pulling in psycopg / httpx.
|
|
35
|
+
"""
|
|
36
|
+
|
|
37
|
+
from __future__ import annotations
|
|
38
|
+
|
|
39
|
+
# Bump-per-additional-source. Tuned so:
|
|
40
|
+
# 1 → 0.50 (base)
|
|
41
|
+
# 2 → 0.65
|
|
42
|
+
# 3 → 0.80
|
|
43
|
+
# 4 → 0.90 (cap reached)
|
|
44
|
+
# Picked instead of a smooth log/sqrt because the read-side bucket
|
|
45
|
+
# boundaries (UI badge colours) align cleanly with these steps.
|
|
46
|
+
_CONF_PER_SOURCE = 0.15
|
|
47
|
+
_CONF_BASE = 0.5
|
|
48
|
+
_CONF_CAP = 0.9
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def corroborated_confidence(n_sources: int) -> float:
|
|
52
|
+
"""Confidence score for a fact corroborated by `n_sources` events.
|
|
53
|
+
|
|
54
|
+
`n_sources <= 0` returns the base confidence — never negative,
|
|
55
|
+
never above the cap. Pure function for easy unit testing.
|
|
56
|
+
"""
|
|
57
|
+
if n_sources <= 1:
|
|
58
|
+
return _CONF_BASE
|
|
59
|
+
bumped = _CONF_BASE + _CONF_PER_SOURCE * (n_sources - 1)
|
|
60
|
+
if bumped > _CONF_CAP:
|
|
61
|
+
return _CONF_CAP
|
|
62
|
+
return round(bumped, 2)
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
# ── born salience (Fusion Drive) ─────────────────────────────────────
|
|
66
|
+
# Retention priority a node is stamped with at extraction time, SEPARATE
|
|
67
|
+
# from confidence (confidence = corroboration/truth; salience = how long
|
|
68
|
+
# it's worth keeping). Junk — flagged by the extractor's own quality
|
|
69
|
+
# detectors (noise name, numeric-ID-as-person, hallucinated email,
|
|
70
|
+
# ungrounded, etc.) — is born near the floor so the Fusion Drive decay
|
|
71
|
+
# pass evicts it on a short clock instead of the multi-year default.
|
|
72
|
+
#
|
|
73
|
+
# This MUST stay byte-identical to fusion_drive/salience.py:born_salience
|
|
74
|
+
# (the decay side uses the same scale). test_born_salience_parity.py
|
|
75
|
+
# guards the two against drift — same pattern as entity_id.py's parity
|
|
76
|
+
# test across the sync/async build contexts.
|
|
77
|
+
_SAL_BASE = 0.50
|
|
78
|
+
_SAL_CORROB_PER_SOURCE = 0.10
|
|
79
|
+
_SAL_CORROB_CAP = 0.30
|
|
80
|
+
_SAL_FLOOR = 0.01
|
|
81
|
+
_SAL_CEIL = 1.00
|
|
82
|
+
_SAL_PENALTIES = {
|
|
83
|
+
"noise_name": 0.45,
|
|
84
|
+
"numeric_id_person": 0.45,
|
|
85
|
+
"hallucinated_email": 0.40,
|
|
86
|
+
"ungrounded": 0.35,
|
|
87
|
+
"subject_undeclared": 0.25,
|
|
88
|
+
"low_signal": 0.15,
|
|
89
|
+
}
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def born_salience(n_sources: int = 1, quality_flags: list[str] | None = None) -> float:
|
|
93
|
+
"""Salience to stamp on a freshly extracted node. See the module note."""
|
|
94
|
+
s = _SAL_BASE
|
|
95
|
+
if n_sources > 1:
|
|
96
|
+
s += min(_SAL_CORROB_CAP, _SAL_CORROB_PER_SOURCE * (n_sources - 1))
|
|
97
|
+
for flag in quality_flags or []:
|
|
98
|
+
s -= _SAL_PENALTIES.get(flag, 0.0)
|
|
99
|
+
return round(max(_SAL_FLOOR, min(_SAL_CEIL, s)), 4)
|
|
@@ -27,10 +27,14 @@ import os
|
|
|
27
27
|
import re
|
|
28
28
|
import time
|
|
29
29
|
from contextlib import asynccontextmanager
|
|
30
|
+
from datetime import datetime # noqa: F401 (used in type hints)
|
|
30
31
|
from typing import Any
|
|
31
32
|
|
|
32
33
|
# Canonical entity-ID scheme — byte-identical copy in extractor-async (entity_id.py).
|
|
34
|
+
from confidence import born_salience
|
|
33
35
|
from entity_id import entity_id, normalize_surface_form # noqa: F401
|
|
36
|
+
# Source-time parsing — byte-identical copy in extractor-async (source_time.py).
|
|
37
|
+
from source_time import event_source_time
|
|
34
38
|
|
|
35
39
|
import psycopg
|
|
36
40
|
import psycopg.rows
|
|
@@ -394,17 +398,27 @@ RULES = {
|
|
|
394
398
|
|
|
395
399
|
async def _upsert_event(cur: psycopg.AsyncCursor, req: ExtractRequest,
|
|
396
400
|
event_id: str, content_hash: str) -> None:
|
|
397
|
-
"""ON CONFLICT DO NOTHING — re-emitting the same event is a no-op.
|
|
401
|
+
"""ON CONFLICT DO NOTHING — re-emitting the same event is a no-op.
|
|
402
|
+
|
|
403
|
+
`emitted_at` is the SOURCE time of the content (when the
|
|
404
|
+
email/meeting/message actually happened), parsed from
|
|
405
|
+
`attributes.timestamp`; `received_at` keeps its NOW() default and
|
|
406
|
+
means ingest time — exactly the split the schema comment at
|
|
407
|
+
001_init.sql:112 promises. When the source time is absent or
|
|
408
|
+
unparseable we fall back to NOW() via COALESCE (never NULL a
|
|
409
|
+
NOT NULL column)."""
|
|
410
|
+
emitted_at = event_source_time({"attributes": req.attributes})
|
|
398
411
|
await cur.execute(
|
|
399
412
|
"""
|
|
400
413
|
INSERT INTO events (
|
|
401
414
|
id, arena, client_id, user_id, event_type, source_kind,
|
|
402
415
|
source_id, content, content_hash, participant_set,
|
|
403
|
-
participant_kind, disclosure_class, attributes
|
|
416
|
+
participant_kind, disclosure_class, attributes, emitted_at
|
|
404
417
|
) VALUES (
|
|
405
418
|
%s, %s, %s, %s, %s, %s::source_kind,
|
|
406
419
|
%s, %s, %s, %s,
|
|
407
|
-
%s::participant_kind, %s::disclosure_class, %s::jsonb
|
|
420
|
+
%s::participant_kind, %s::disclosure_class, %s::jsonb,
|
|
421
|
+
COALESCE(%s, NOW())
|
|
408
422
|
)
|
|
409
423
|
ON CONFLICT (id) DO NOTHING
|
|
410
424
|
""",
|
|
@@ -416,13 +430,26 @@ async def _upsert_event(cur: psycopg.AsyncCursor, req: ExtractRequest,
|
|
|
416
430
|
req.attributes.get("participant_kind", "unknown"),
|
|
417
431
|
req.attributes.get("disclosure_class", "private"),
|
|
418
432
|
psycopg.types.json.Json(req.attributes),
|
|
433
|
+
emitted_at,
|
|
419
434
|
),
|
|
420
435
|
)
|
|
421
436
|
|
|
422
437
|
|
|
423
|
-
async def _upsert_entities(
|
|
438
|
+
async def _upsert_entities(
|
|
439
|
+
cur: psycopg.AsyncCursor,
|
|
440
|
+
entities: list[dict],
|
|
441
|
+
event_time: "datetime | None",
|
|
442
|
+
) -> None:
|
|
424
443
|
"""Alias-aware idempotent entity upsert.
|
|
425
444
|
|
|
445
|
+
`event_time` is the SOURCE time of the originating event (parsed from
|
|
446
|
+
`attributes.timestamp`); it stamps `first_seen`/`last_seen` so the
|
|
447
|
+
graph tracks when the evidence actually happened, not when we
|
|
448
|
+
ingested it. `None` (no/garbage source time) falls back to NOW() via
|
|
449
|
+
COALESCE. On re-corroboration we widen the window with
|
|
450
|
+
LEAST(first_seen, ...) / GREATEST(last_seen, ...): "most recent
|
|
451
|
+
evidence" = newest SOURCE time, not newest ingest.
|
|
452
|
+
|
|
426
453
|
For each entity, before inserting, look for an existing row in the
|
|
427
454
|
same (arena, entity_type) whose canonical_name OR aliases overlap
|
|
428
455
|
any of the incoming surface forms. If found, merge aliases +
|
|
@@ -488,23 +515,40 @@ async def _upsert_entities(cur: psycopg.AsyncCursor, entities: list[dict]) -> No
|
|
|
488
515
|
UPDATE entities SET
|
|
489
516
|
aliases = ARRAY(SELECT DISTINCT UNNEST(aliases || %s::text[])),
|
|
490
517
|
provenance_event_ids = ARRAY(SELECT DISTINCT UNNEST(provenance_event_ids || %s::text[])),
|
|
491
|
-
|
|
518
|
+
-- Widen the seen-window with this event's SOURCE time,
|
|
519
|
+
-- not NOW(): newest evidence = newest source time.
|
|
520
|
+
last_seen = GREATEST(last_seen, COALESCE(%s, NOW())),
|
|
521
|
+
first_seen = LEAST(first_seen, COALESCE(%s, NOW()))
|
|
492
522
|
WHERE id = %s
|
|
493
523
|
""",
|
|
494
|
-
(e["aliases"], e["provenance_event_ids"],
|
|
524
|
+
(e["aliases"], e["provenance_event_ids"],
|
|
525
|
+
event_time, event_time, existing_id),
|
|
495
526
|
)
|
|
496
527
|
else:
|
|
497
528
|
# 3b. No match — insert new. ON CONFLICT (id) is a belt-
|
|
498
529
|
# and-braces fallback for the rare case where two writers
|
|
499
530
|
# collide on the same id under different surface forms;
|
|
500
531
|
# the advisory lock above is the primary defence.
|
|
532
|
+
# Fusion Drive born-salience via the SHARED born_salience (no
|
|
533
|
+
# inline constants — they'd drift from the async path; #96 review
|
|
534
|
+
# §4). Sync entities are deterministic (names from structured
|
|
535
|
+
# email/calendar fields) so they're high-quality; the one junk
|
|
536
|
+
# class sync can still emit is a numeric-ID-as-person, flagged so
|
|
537
|
+
# it's born low and decay can evict it. The async distiller owns
|
|
538
|
+
# the full quality-flag set.
|
|
539
|
+
_digits = sum(c.isdigit() for c in e["canonical_name"] if not c.isspace())
|
|
540
|
+
_nonspace = sum(1 for c in e["canonical_name"] if not c.isspace()) or 1
|
|
541
|
+
_flags = ["numeric_id_person"] if (e["entity_type"] == "person" and _digits / _nonspace > 0.5) else []
|
|
542
|
+
_sal = born_salience(1, _flags)
|
|
501
543
|
await cur.execute(
|
|
502
544
|
"""
|
|
503
545
|
INSERT INTO entities (
|
|
504
546
|
id, arena, entity_type, canonical_name, aliases,
|
|
505
|
-
provenance_event_ids, participant_set, disclosure_class
|
|
547
|
+
provenance_event_ids, participant_set, disclosure_class,
|
|
548
|
+
first_seen, last_seen, salience
|
|
506
549
|
) VALUES (
|
|
507
|
-
%s, %s, %s, %s, %s, %s, %s, %s::disclosure_class
|
|
550
|
+
%s, %s, %s, %s, %s, %s, %s, %s::disclosure_class,
|
|
551
|
+
COALESCE(%s, NOW()), COALESCE(%s, NOW()), %s
|
|
508
552
|
)
|
|
509
553
|
ON CONFLICT (id) DO UPDATE SET
|
|
510
554
|
aliases = (
|
|
@@ -513,11 +557,14 @@ async def _upsert_entities(cur: psycopg.AsyncCursor, entities: list[dict]) -> No
|
|
|
513
557
|
provenance_event_ids = (
|
|
514
558
|
SELECT ARRAY(SELECT DISTINCT UNNEST(entities.provenance_event_ids || EXCLUDED.provenance_event_ids))
|
|
515
559
|
),
|
|
516
|
-
|
|
560
|
+
salience = GREATEST(entities.salience, EXCLUDED.salience),
|
|
561
|
+
last_seen = GREATEST(entities.last_seen, EXCLUDED.last_seen),
|
|
562
|
+
first_seen = LEAST(entities.first_seen, EXCLUDED.first_seen)
|
|
517
563
|
""",
|
|
518
564
|
(e["id"], e["arena"], e["entity_type"], e["canonical_name"],
|
|
519
565
|
e["aliases"], e["provenance_event_ids"],
|
|
520
|
-
e["participant_set"], e["disclosure_class"]
|
|
566
|
+
e["participant_set"], e["disclosure_class"],
|
|
567
|
+
event_time, event_time, _sal),
|
|
521
568
|
)
|
|
522
569
|
|
|
523
570
|
|
|
@@ -584,7 +631,10 @@ async def extract(req: ExtractRequest):
|
|
|
584
631
|
async with _pool.connection() as conn:
|
|
585
632
|
async with conn.cursor() as cur:
|
|
586
633
|
await _upsert_event(cur, req, event_id, content_hash)
|
|
587
|
-
|
|
634
|
+
# Source time of THIS event — stamps the graph rows so
|
|
635
|
+
# first/last_seen track content time, not ingest time.
|
|
636
|
+
event_time = event_source_time({"attributes": req.attributes})
|
|
637
|
+
await _upsert_entities(cur, entities, event_time)
|
|
588
638
|
# Facts + relationships are deliberately left to the async
|
|
589
639
|
# distillation worker — the deterministic path can't
|
|
590
640
|
# reliably extract decisions/commitments without LLM context.
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
"""source_time — robust ISO-8601 source-time parsing for graph stamping.
|
|
2
|
+
|
|
3
|
+
The memory graph must stamp `events.emitted_at` and the graph rows'
|
|
4
|
+
`first_seen` / `last_seen` / `asserted_at` from the SOURCE time of the
|
|
5
|
+
content (when the email/meeting/message actually happened), NOT the
|
|
6
|
+
ingest wall-clock (`NOW()`). The source time is carried on the event as
|
|
7
|
+
`attributes.timestamp` (ISO-8601). This helper promotes it.
|
|
8
|
+
|
|
9
|
+
Mirrors `compat/server.py:_parse_ts` (handles the bare `Z` suffix that
|
|
10
|
+
`datetime.fromisoformat` only learned in 3.11) but returns a tz-aware
|
|
11
|
+
`datetime` rather than a unix float, because the destination columns are
|
|
12
|
+
`TIMESTAMPTZ` and we want psycopg to bind a datetime, not an epoch.
|
|
13
|
+
|
|
14
|
+
CONTRACT (load-bearing): callers MUST fall back to the existing default
|
|
15
|
+
(received / NOW) when the source time is absent or unparseable. This
|
|
16
|
+
helper NEVER raises and returns `None` on anything it can't parse — the
|
|
17
|
+
caller is responsible for the `or NOW()` fallback so we never NULL a
|
|
18
|
+
NOT NULL column or crash the ingest/distill path.
|
|
19
|
+
|
|
20
|
+
NOTE: keep this byte-identical with the copy in extractor-sync/. Same
|
|
21
|
+
convention as entity_id.py — two services, one parsing rule.
|
|
22
|
+
"""
|
|
23
|
+
|
|
24
|
+
from __future__ import annotations
|
|
25
|
+
|
|
26
|
+
from datetime import datetime, timezone
|
|
27
|
+
from typing import Any
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def parse_source_time(value: Any) -> datetime | None:
    """Best-effort ISO-8601 -> tz-aware datetime.

    Args:
        value: Candidate source time, normally the ISO-8601 string carried
            on an event's attributes. Anything that is not a non-blank
            string is rejected.

    Returns:
        A tz-aware ``datetime``, or ``None`` for anything that cannot be
        parsed (the caller falls back to NOW()). A parsed value with no
        offset is assumed UTC, because a naive datetime would break
        TIMESTAMPTZ comparisons.

    Never raises: a bad source time must not crash ingest/distill.

    NOTE(review): this module is kept as an identical copy in both
    extractor services; mirror any change here in the other copy.
    """
    if not isinstance(value, str):
        return None
    text = value.strip()
    if not text:
        return None
    # `fromisoformat` handles `+00:00` but only learned the bare `Z`
    # suffix in Python 3.11, and does not take the lowercase `z` form.
    # Rewrite the UTC designator only where it is the trailing suffix,
    # leaving the rest of the string untouched.
    if text[-1] in "Zz":
        text = f"{text[:-1]}+00:00"
    try:
        dt = datetime.fromisoformat(text)
    except Exception:
        # Deliberately broad: the contract is "return None, never raise".
        return None
    if dt.tzinfo is None:
        # Producer emitted a naive ISO string; treat as UTC rather than
        # letting psycopg interpret it in the server's local zone.
        dt = dt.replace(tzinfo=timezone.utc)
    return dt
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def event_source_time(event: dict[str, Any]) -> datetime | None:
    """Pull the source time off an event dict's attributes.

    Precedence: `attributes.timestamp` (the source/content time) wins
    over `attributes.emitted_at` (a producer-supplied emit-now, which is
    closer to ingest time).

    Args:
        event: Event mapping; only its ``"attributes"`` entry is read.

    Returns:
        The first of the two attributes that parses, as a tz-aware
        ``datetime``; ``None`` if neither parses or if the event or its
        attributes are not mapping-like. The caller falls back to NOW().

    Never raises, matching `parse_source_time`.
    """
    try:
        attrs = event.get("attributes") or {}
        candidates = (attrs.get("timestamp"), attrs.get("emitted_at"))
    except AttributeError:
        # Malformed event (e.g. attributes stored as a list or string):
        # treat it as "no source time" instead of crashing the caller.
        return None
    for raw in candidates:
        parsed = parse_source_time(raw)
        if parsed is not None:
            return parsed
    return None
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
"""extractor-sync/confidence.py must stay byte-identical to extractor-async's
|
|
2
|
+
copy — both carry born_salience, whose scale must match the Fusion Drive decay
|
|
3
|
+
side. Same drift guard as test_entity_id_parity.py across the build contexts."""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
import os
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def test_sync_confidence_is_byte_identical_to_async():
    """Fail when this directory's confidence.py differs, byte for byte,
    from the copy under ../extractor-async."""
    base_dir = os.path.dirname(__file__)
    candidates = (
        os.path.join(base_dir, "confidence.py"),
        os.path.join(base_dir, "..", "extractor-async", "confidence.py"),
    )
    payloads = []
    for path in candidates:
        # Binary mode: the guard is on raw bytes, not decoded text.
        with open(path, "rb") as handle:
            payloads.append(handle.read())
    sync_bytes, async_bytes = payloads
    assert sync_bytes == async_bytes, "extractor-sync/confidence.py drifted from extractor-async/confidence.py"
|
|
@@ -273,7 +273,7 @@ def test_pool_keeps_default_tuple_row_factory() -> None:
|
|
|
273
273
|
def test_upsert_entities_merge_branch_with_tuple_rows() -> None:
|
|
274
274
|
"""Entity already exists → UPDATE branch runs, id taken from row[0]."""
|
|
275
275
|
cur = _FakeCursor(existing_id="e_existing")
|
|
276
|
-
asyncio.run(sync_server._upsert_entities(cur, [_entity_stub()]))
|
|
276
|
+
asyncio.run(sync_server._upsert_entities(cur, [_entity_stub()], None))
|
|
277
277
|
updates = [(s, p) for s, p in cur.executed if s.startswith("UPDATE entities")]
|
|
278
278
|
assert len(updates) == 1
|
|
279
279
|
_, params = updates[0]
|
|
@@ -283,7 +283,7 @@ def test_upsert_entities_merge_branch_with_tuple_rows() -> None:
|
|
|
283
283
|
|
|
284
284
|
def test_upsert_entities_insert_branch_when_no_match() -> None:
|
|
285
285
|
cur = _FakeCursor(existing_id=None)
|
|
286
|
-
asyncio.run(sync_server._upsert_entities(cur, [_entity_stub()]))
|
|
286
|
+
asyncio.run(sync_server._upsert_entities(cur, [_entity_stub()], None))
|
|
287
287
|
inserts = [s for s, _ in cur.executed if s.startswith("INSERT INTO entities")]
|
|
288
288
|
assert len(inserts) == 1
|
|
289
289
|
assert not any(s.startswith("UPDATE entities") for s, _ in cur.executed)
|
|
File without changes
|