npm - @pentatonic-ai/ai-agent-sdk - Versions diffs - 0.10.7 → 0.10.9 - Mend

@pentatonic-ai/ai-agent-sdk 0.10.7 → 0.10.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (30) hide show

package/packages/memory-engine-v2/extractor-async/worker.py CHANGED Viewed

@@ -33,14 +33,16 @@ import os
 import re
 import socket
 import time
+from datetime import datetime
 from typing import Any
 import httpx
 import psycopg
 import psycopg.rows
-from confidence import corroborated_confidence
+from confidence import born_salience, corroborated_confidence
 from entity_id import entity_id, normalize_surface_form
+from source_time import event_source_time, parse_source_time
 from extraction_schema import (
     ALLOWED_ENT_TYPES,
     ALLOWED_FCT_CATEGORIES,
@@ -372,7 +374,10 @@ def build_event_block(idx: int, event: dict[str, Any]) -> str:
     src = event.get("source_kind", "unknown")
     content = clean_content(event.get("content") or "")[:MAX_CONTENT_CHARS]
     attrs = event.get("attributes") or {}
-    when = attrs.get("emitted_at") or attrs.get("timestamp")
+    # Prefer the SOURCE time (`timestamp`) over the producer's emit-now
+    # (`emitted_at`) so the LLM anchors "when" to when the content
+    # actually happened, not when it was forwarded into ingest.
+    when = attrs.get("timestamp") or attrs.get("emitted_at")
     author = attrs.get("author") or attrs.get("user_id")
     header = [f"[event {idx}]", f"source_kind: {src}"]
     if when:
@@ -782,6 +787,15 @@ def _content_id(*parts: str) -> str:
     return hashlib.sha256("\x1f".join(parts).encode()).hexdigest()[:32]
+def _digit_ratio(s: str) -> float:
+    """Fraction of non-whitespace chars that are digits. Used to flag
+    numeric-ID-as-person junk for Fusion Drive born-salience."""
+    stripped = "".join(s.split())
+    if not stripped:
+        return 0.0
+    return sum(c.isdigit() for c in stripped) / len(stripped)
 def upsert_entities(
     conn: psycopg.Connection,
     arena: str,
@@ -789,10 +803,18 @@ def upsert_entities(
     participant_set: list[str],
     disclosure_class: str,
     entities: list[dict],
+    event_time: datetime | None,
 ) -> dict[str, str]:
     """Alias-aware insert (or merge) of entities; returns a name→id
     map so facts and relationships can link to the inserted rows.
+    `event_time` is the SOURCE time of the event being distilled (parsed
+    from `attributes.timestamp`); it stamps `first_seen`/`last_seen` so
+    the graph tracks content time, not ingest time. `None` (no/garbage
+    source time) falls back to NOW() via COALESCE — never NULLs a NOT
+    NULL column. Re-corroboration widens the window with LEAST/GREATEST
+    on the SOURCE time, so "most recent evidence" = newest source time.
     Two concerns layered together:
     1. **ID derivation** uses the shared `entity_id()` helper from
@@ -875,20 +897,35 @@ def upsert_entities(
                     UPDATE entities SET
                       aliases = ARRAY(SELECT DISTINCT UNNEST(aliases || %s::text[])),
                       provenance_event_ids = ARRAY(SELECT DISTINCT UNNEST(provenance_event_ids || %s::text[])),
-                      last_seen = NOW()
+                      -- Widen the seen-window with this event's SOURCE
+                      -- time, not NOW(): newest evidence = newest source.
+                      last_seen = GREATEST(last_seen, COALESCE(%s, NOW())),
+                      first_seen = LEAST(first_seen, COALESCE(%s, NOW()))
                     WHERE id = %s
                     """,
-                    (aliases, [event_id], eid),
+                    (aliases, [event_id], event_time, event_time, eid),
                 )
             else:
                 # 3b. No match — insert new.
                 eid = entity_id(arena, etype, name)
+                # Fusion Drive born-salience: a numeric-ID-as-person (classic
+                # 7B junk that slips past noise_filter, e.g. "1716801984") is
+                # born near the floor so the decay pass can evict it on a short
+                # clock instead of the multi-year entity default.
+                _qflags = []
+                if etype == "person" and _digit_ratio(name) > 0.5:
+                    _qflags.append("numeric_id_person")
+                _sal = born_salience(1, _qflags)
                 cur.execute(
                     """
                     INSERT INTO entities (
                       id, arena, entity_type, canonical_name, aliases,
-                      provenance_event_ids, participant_set, disclosure_class
-                    ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s::disclosure_class)
+                      provenance_event_ids, participant_set, disclosure_class, salience,
+                      first_seen, last_seen
+                    ) VALUES (
+                      %s, %s, %s, %s, %s, %s, %s, %s::disclosure_class, %s,
+                      COALESCE(%s, NOW()), COALESCE(%s, NOW())
+                    )
                     ON CONFLICT (id) DO UPDATE SET
                       aliases = (
                         SELECT ARRAY(SELECT DISTINCT UNNEST(entities.aliases || EXCLUDED.aliases))
@@ -896,11 +933,16 @@ def upsert_entities(
                       provenance_event_ids = (
                         SELECT ARRAY(SELECT DISTINCT UNNEST(entities.provenance_event_ids || EXCLUDED.provenance_event_ids))
                       ),
-                      last_seen = NOW()
+                      -- re-corroboration can only RAISE salience, never lower it
+                      salience = GREATEST(entities.salience, EXCLUDED.salience),
+                      -- widen the seen-window on SOURCE time, not NOW()
+                      last_seen = GREATEST(entities.last_seen, EXCLUDED.last_seen),
+                      first_seen = LEAST(entities.first_seen, EXCLUDED.first_seen)
                     """,
                     (
                         eid, arena, etype, name, aliases,
-                        [event_id], participant_set, disclosure_class,
+                        [event_id], participant_set, disclosure_class, _sal,
+                        event_time, event_time,
                     ),
                 )
             name_to_id[name] = eid
@@ -915,6 +957,8 @@ def upsert_facts(
     disclosure_class: str,
     facts: list[dict],
     name_to_id: dict[str, str],
+    event_time: datetime | None,
+    due_at: datetime | None = None,
 ) -> int:
     """Facts are content-hashed on (arena, statement). Same statement
     extracted from any event in the arena converges to the same row,
@@ -931,7 +975,16 @@ def upsert_facts(
     (see confidence.py — caps at 0.9 to reserve [0.9, 1.0] for
     `stage = 'verified'` which only a human can produce). Stage stays
     `provisional`; corroboration is a signal, not a graduation.
-    """
+    `asserted_at` is stamped from the event's SOURCE time (`event_time`,
+    parsed from `attributes.timestamp`), falling back to NOW() via
+    COALESCE — so the temporal anchor is when the fact was actually
+    asserted, not when we distilled it. On corroboration it moves
+    FORWARD with GREATEST to the newest source time across all
+    corroborating events: facts have no `last_seen`, so #92's decay uses
+    `asserted_at` as the recency clock and resets it on re-corroboration
+    — order-stable regardless of distill order. `due_at` (the source
+    event's structured deadline, if any) populates `effective_until`."""
     if not facts:
         return 0
     inserted = 0
@@ -942,15 +995,26 @@ def upsert_facts(
                 continue
             subj_name = f.get("subject")
             obj_name = f.get("object")
+            # Fusion Drive born-salience: a fact whose subject isn't among the
+            # event's declared entities (ungrounded subject) or that's barely
+            # a sentence is born low so decay can clear it. n_sources=1 here.
+            _fflags = []
+            if subj_name and not name_to_id.get(subj_name):
+                _fflags.append("subject_undeclared")
+            if len(stmt) < 60:
+                _fflags.append("low_signal")
+            _fsal = born_salience(1, _fflags)
             cur.execute(
                 """
                 INSERT INTO facts (
                   id, arena, category, subject_entity_id, predicate,
                   object_entity_id, statement, provenance_event_ids,
-                  stage, confidence, participant_set, disclosure_class
+                  stage, confidence, participant_set, disclosure_class, salience,
+                  asserted_at, effective_until
                 ) VALUES (
                   %s, %s, %s, %s, %s, %s, %s, %s,
-                  'provisional'::extraction_stage, %s, %s, %s::disclosure_class
+                  'provisional'::extraction_stage, %s, %s, %s::disclosure_class, %s,
+                  COALESCE(%s, NOW()), %s
                 )
                 ON CONFLICT (id) DO UPDATE SET
                   provenance_event_ids = (
@@ -958,6 +1022,7 @@ def upsert_facts(
                       facts.provenance_event_ids || EXCLUDED.provenance_event_ids
                     ))
                   ),
+                  salience = GREATEST(facts.salience, EXCLUDED.salience),
                   -- Confidence bumps with each additional independent
                   -- source. The cardinality of the merged provenance
                   -- array IS the corroboration count, so the formula
@@ -976,7 +1041,18 @@ def upsert_facts(
                       )
                     ),
                     0.9
-                  )
+                  ),
+                  -- `asserted_at` doubles as the decay clock for facts:
+                  -- #92's fusion_drive_decay ages off
+                  -- max(last_accessed, asserted_at) and resets that
+                  -- clock on re-corroboration (facts have no `last_seen`
+                  -- of their own). So on conflict we move it FORWARD
+                  -- with GREATEST to the newest source time across all
+                  -- corroborating events — newest evidence, not oldest.
+                  -- This also makes it order-stable (independent of
+                  -- distill order). EXCLUDED.asserted_at is the
+                  -- COALESCE(event_time, NOW()) from the INSERT above.
+                  asserted_at = GREATEST(facts.asserted_at, EXCLUDED.asserted_at)
                 """,
                 (
                     _content_id(arena, stmt),
@@ -990,6 +1066,9 @@ def upsert_facts(
                     float(f.get("confidence") or corroborated_confidence(1)),
                     participant_set,
                     disclosure_class,
+                    _fsal,
+                    event_time,
+                    due_at,
                 ),
             )
             inserted += 1
@@ -1004,9 +1083,14 @@ def upsert_relationships(
     disclosure_class: str,
     relationships: list[dict],
     name_to_id: dict[str, str],
+    event_time: datetime | None,
 ) -> int:
     """Edge identity is (arena, from, to, type). ON CONFLICT bumps
-    weight + last_seen rather than duplicating."""
+    weight + widens the seen-window rather than duplicating.
+    `first_seen`/`last_seen` are stamped from the event's SOURCE time
+    (`event_time`), falling back to NOW() via COALESCE; re-corroboration
+    widens with LEAST/GREATEST on the source time, not ingest time."""
     if not relationships:
         return 0
     inserted = 0
@@ -1022,21 +1106,25 @@ def upsert_relationships(
                 """
                 INSERT INTO relationships (
                   id, arena, from_entity_id, to_entity_id, relationship_type,
-                  weight, provenance_event_ids, participant_set, disclosure_class
+                  weight, provenance_event_ids, participant_set, disclosure_class,
+                  first_seen, last_seen
                 ) VALUES (
-                  %s, %s, %s, %s, %s, %s, %s, %s, %s::disclosure_class
+                  %s, %s, %s, %s, %s, %s, %s, %s, %s::disclosure_class,
+                  COALESCE(%s, NOW()), COALESCE(%s, NOW())
                 )
                 ON CONFLICT (id) DO UPDATE SET
                   weight = relationships.weight + EXCLUDED.weight,
                   provenance_event_ids = (
                     SELECT ARRAY(SELECT DISTINCT UNNEST(relationships.provenance_event_ids || EXCLUDED.provenance_event_ids))
                   ),
-                  last_seen = NOW()
+                  last_seen = GREATEST(relationships.last_seen, EXCLUDED.last_seen),
+                  first_seen = LEAST(relationships.first_seen, EXCLUDED.first_seen)
                 """,
                 (
                     rid, arena, from_id, to_id, rtype,
                     float(r.get("confidence") or 0.5),
                     [event_id], participant_set, disclosure_class,
+                    event_time, event_time,
                 ),
             )
             inserted += 1
@@ -1288,7 +1376,7 @@ def fetch_event(conn: psycopg.Connection, event_id: str) -> dict[str, Any] | Non
     with conn.cursor(row_factory=psycopg.rows.dict_row) as cur:
         cur.execute(
             "SELECT id, arena, source_kind, content, attributes, participant_set, "
-            "disclosure_class FROM events WHERE id = %s",
+            "disclosure_class, emitted_at FROM events WHERE id = %s",
             (event_id,),
         )
         return cur.fetchone()
@@ -1446,16 +1534,31 @@ async def process_batch(
             arena = event["arena"]
             participant_set = event.get("participant_set") or [arena]
             disclosure = event.get("disclosure_class") or "private"
+            # SOURCE time of this event: prefer the parsed
+            # `attributes.timestamp` (canonical), then the parsed
+            # `attributes.emitted_at`, and finally the stored
+            # `emitted_at` column (which the sync path now also stamps
+            # from source time). `None` ⇒ upserts fall back to NOW()
+            # in-SQL. NEVER crash on a bad/absent source time.
+            event_time = event_source_time(event) or event.get("emitted_at")
+            # A structured deadline on the source event, if the producer
+            # supplied one — populates facts.effective_until. Absent or
+            # unparseable ⇒ None (column stays NULL, its existing
+            # behaviour). Only `attributes.due_at` is honoured; we do NOT
+            # guess deadlines from free text here.
+            due_at = parse_source_time((event.get("attributes") or {}).get("due_at"))
             try:
                 name_to_id = upsert_entities(
-                    conn, arena, event_id, participant_set, disclosure, ents
+                    conn, arena, event_id, participant_set, disclosure, ents,
+                    event_time,
                 )
                 n_facts = upsert_facts(
                     conn, arena, event_id, participant_set, disclosure, facts, name_to_id,
+                    event_time, due_at,
                 )
                 n_rels = upsert_relationships(
                     conn, arena, event_id, participant_set, disclosure, rels, name_to_id,
+                    event_time,
                 )
                 mark_done(conn, queue_id)
                 log.info(

package/packages/memory-engine-v2/extractor-sync/Dockerfile CHANGED Viewed

@@ -5,7 +5,9 @@ WORKDIR /app
 COPY requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt
-COPY entity_id.py server.py .
+# confidence.py is a byte-identical copy of extractor-async's (the born_salience
+# scale must match the decay side). test_confidence_parity.py guards drift.
+COPY entity_id.py confidence.py server.py .
 EXPOSE 8101
 CMD ["uvicorn", "server:app", "--host", "0.0.0.0", "--port", "8101", "--workers", "2"]

package/packages/memory-engine-v2/extractor-sync/confidence.py ADDED Viewed

@@ -0,0 +1,99 @@
+"""confidence — fact confidence promotion based on multi-source corroboration.
+Today every fact lands in org_model at confidence 0.5 / stage 'provisional'
+and never moves. Live-data audit (2026-05-25): EVERY fact across 200
+sampled rows in pentatonic-team is stuck at 0.5 — no signal of
+"how trustworthy is this?" reaches the read side.
+The right signal is **multi-source corroboration**: the same statement
+appearing in two emails AND a calendar event is meaningfully more
+trustworthy than a one-off mention in a Slack DM. The extractor
+already records `provenance_event_ids` (the list of source events
+that mention each fact), so the data needed for promotion is there
+— we just don't use it.
+Formula:
+    confidence = min(0.5 + 0.15 * (n_sources - 1), 0.9)
+Concretely:
+    1 source  →  0.50  (single mention, default)
+    2 sources →  0.65  (one corroboration)
+    3 sources →  0.80
+    4 sources →  0.90  (cap; "verified" remains human-only)
+    5+        →  0.90
+Cap at 0.9 reserves the [0.9, 1.0] range for human-verified facts
+(`stage = 'verified'`), which the extractor cannot produce on its
+own. We never bump the stage from `provisional` to `distilled` or
+`verified` from this code path — corroboration is a signal, not a
+promotion. Stage transitions stay deliberate / explicit.
+Pure module — no I/O, no deps. Importable from worker.py without
+pulling in psycopg / httpx.
+"""
+from __future__ import annotations
+# Bump-per-additional-source. Tuned so:
+#   1 → 0.50  (base)
+#   2 → 0.65
+#   3 → 0.80
+#   4 → 0.90 (cap reached)
+# Picked instead of a smooth log/sqrt because the read-side bucket
+# boundaries (UI badge colours) align cleanly with these steps.
+_CONF_PER_SOURCE = 0.15
+_CONF_BASE = 0.5
+_CONF_CAP = 0.9
+def corroborated_confidence(n_sources: int) -> float:
+    """Confidence score for a fact corroborated by `n_sources` events.
+    `n_sources <= 0` returns the base confidence — never negative,
+    never above the cap. Pure function for easy unit testing.
+    """
+    if n_sources <= 1:
+        return _CONF_BASE
+    bumped = _CONF_BASE + _CONF_PER_SOURCE * (n_sources - 1)
+    if bumped > _CONF_CAP:
+        return _CONF_CAP
+    return round(bumped, 2)
+# ── born salience (Fusion Drive) ─────────────────────────────────────
+# Retention priority a node is stamped with at extraction time, SEPARATE
+# from confidence (confidence = corroboration/truth; salience = how long
+# it's worth keeping). Junk — flagged by the extractor's own quality
+# detectors (noise name, numeric-ID-as-person, hallucinated email,
+# ungrounded, etc.) — is born near the floor so the Fusion Drive decay
+# pass evicts it on a short clock instead of the multi-year default.
+#
+# This MUST stay byte-identical to fusion_drive/salience.py:born_salience
+# (the decay side uses the same scale). test_born_salience_parity.py
+# guards the two against drift — same pattern as entity_id.py's parity
+# test across the sync/async build contexts.
+_SAL_BASE = 0.50
+_SAL_CORROB_PER_SOURCE = 0.10
+_SAL_CORROB_CAP = 0.30
+_SAL_FLOOR = 0.01
+_SAL_CEIL = 1.00
+_SAL_PENALTIES = {
+    "noise_name": 0.45,
+    "numeric_id_person": 0.45,
+    "hallucinated_email": 0.40,
+    "ungrounded": 0.35,
+    "subject_undeclared": 0.25,
+    "low_signal": 0.15,
+}
+def born_salience(n_sources: int = 1, quality_flags: list[str] | None = None) -> float:
+    """Salience to stamp on a freshly extracted node. See the module note."""
+    s = _SAL_BASE
+    if n_sources > 1:
+        s += min(_SAL_CORROB_CAP, _SAL_CORROB_PER_SOURCE * (n_sources - 1))
+    for flag in quality_flags or []:
+        s -= _SAL_PENALTIES.get(flag, 0.0)
+    return round(max(_SAL_FLOOR, min(_SAL_CEIL, s)), 4)

package/packages/memory-engine-v2/extractor-sync/server.py CHANGED Viewed

@@ -27,10 +27,14 @@ import os
 import re
 import time
 from contextlib import asynccontextmanager
+from datetime import datetime  # noqa: F401  (used in type hints)
 from typing import Any
 # Canonical entity-ID scheme — byte-identical copy in extractor-async (entity_id.py).
+from confidence import born_salience
 from entity_id import entity_id, normalize_surface_form  # noqa: F401
+# Source-time parsing — byte-identical copy in extractor-async (source_time.py).
+from source_time import event_source_time
 import psycopg
 import psycopg.rows
@@ -394,17 +398,27 @@ RULES = {
 async def _upsert_event(cur: psycopg.AsyncCursor, req: ExtractRequest,
                         event_id: str, content_hash: str) -> None:
-    """ON CONFLICT DO NOTHING — re-emitting the same event is a no-op."""
+    """ON CONFLICT DO NOTHING — re-emitting the same event is a no-op.
+    `emitted_at` is the SOURCE time of the content (when the
+    email/meeting/message actually happened), parsed from
+    `attributes.timestamp`; `received_at` keeps its NOW() default and
+    means ingest time — exactly the split the schema comment at
+    001_init.sql:112 promises. When the source time is absent or
+    unparseable we fall back to NOW() via COALESCE (never NULL a
+    NOT NULL column)."""
+    emitted_at = event_source_time({"attributes": req.attributes})
     await cur.execute(
         """
         INSERT INTO events (
           id, arena, client_id, user_id, event_type, source_kind,
           source_id, content, content_hash, participant_set,
-          participant_kind, disclosure_class, attributes
+          participant_kind, disclosure_class, attributes, emitted_at
         ) VALUES (
           %s, %s, %s, %s, %s, %s::source_kind,
           %s, %s, %s, %s,
-          %s::participant_kind, %s::disclosure_class, %s::jsonb
+          %s::participant_kind, %s::disclosure_class, %s::jsonb,
+          COALESCE(%s, NOW())
         )
         ON CONFLICT (id) DO NOTHING
         """,
@@ -416,13 +430,26 @@ async def _upsert_event(cur: psycopg.AsyncCursor, req: ExtractRequest,
             req.attributes.get("participant_kind", "unknown"),
             req.attributes.get("disclosure_class", "private"),
             psycopg.types.json.Json(req.attributes),
+            emitted_at,
         ),
     )
-async def _upsert_entities(cur: psycopg.AsyncCursor, entities: list[dict]) -> None:
+async def _upsert_entities(
+    cur: psycopg.AsyncCursor,
+    entities: list[dict],
+    event_time: "datetime | None",
+) -> None:
     """Alias-aware idempotent entity upsert.
+    `event_time` is the SOURCE time of the originating event (parsed from
+    `attributes.timestamp`); it stamps `first_seen`/`last_seen` so the
+    graph tracks when the evidence actually happened, not when we
+    ingested it. `None` (no/garbage source time) falls back to NOW() via
+    COALESCE. On re-corroboration we widen the window with
+    LEAST(first_seen, ...) / GREATEST(last_seen, ...): "most recent
+    evidence" = newest SOURCE time, not newest ingest.
     For each entity, before inserting, look for an existing row in the
     same (arena, entity_type) whose canonical_name OR aliases overlap
     any of the incoming surface forms. If found, merge aliases +
@@ -488,23 +515,40 @@ async def _upsert_entities(cur: psycopg.AsyncCursor, entities: list[dict]) -> No
                 UPDATE entities SET
                   aliases = ARRAY(SELECT DISTINCT UNNEST(aliases || %s::text[])),
                   provenance_event_ids = ARRAY(SELECT DISTINCT UNNEST(provenance_event_ids || %s::text[])),
-                  last_seen = NOW()
+                  -- Widen the seen-window with this event's SOURCE time,
+                  -- not NOW(): newest evidence = newest source time.
+                  last_seen = GREATEST(last_seen, COALESCE(%s, NOW())),
+                  first_seen = LEAST(first_seen, COALESCE(%s, NOW()))
                 WHERE id = %s
                 """,
-                (e["aliases"], e["provenance_event_ids"], existing_id),
+                (e["aliases"], e["provenance_event_ids"],
+                 event_time, event_time, existing_id),
             )
         else:
             # 3b. No match — insert new. ON CONFLICT (id) is a belt-
             # and-braces fallback for the rare case where two writers
             # collide on the same id under different surface forms;
             # the advisory lock above is the primary defence.
+            # Fusion Drive born-salience via the SHARED born_salience (no
+            # inline constants — they'd drift from the async path; #96 review
+            # §4). Sync entities are deterministic (names from structured
+            # email/calendar fields) so they're high-quality; the one junk
+            # class sync can still emit is a numeric-ID-as-person, flagged so
+            # it's born low and decay can evict it. The async distiller owns
+            # the full quality-flag set.
+            _digits = sum(c.isdigit() for c in e["canonical_name"] if not c.isspace())
+            _nonspace = sum(1 for c in e["canonical_name"] if not c.isspace()) or 1
+            _flags = ["numeric_id_person"] if (e["entity_type"] == "person" and _digits / _nonspace > 0.5) else []
+            _sal = born_salience(1, _flags)
             await cur.execute(
                 """
                 INSERT INTO entities (
                   id, arena, entity_type, canonical_name, aliases,
-                  provenance_event_ids, participant_set, disclosure_class
+                  provenance_event_ids, participant_set, disclosure_class,
+                  first_seen, last_seen, salience
                 ) VALUES (
-                  %s, %s, %s, %s, %s, %s, %s, %s::disclosure_class
+                  %s, %s, %s, %s, %s, %s, %s, %s::disclosure_class,
+                  COALESCE(%s, NOW()), COALESCE(%s, NOW()), %s
                 )
                 ON CONFLICT (id) DO UPDATE SET
                   aliases = (
@@ -513,11 +557,14 @@ async def _upsert_entities(cur: psycopg.AsyncCursor, entities: list[dict]) -> No
                   provenance_event_ids = (
                     SELECT ARRAY(SELECT DISTINCT UNNEST(entities.provenance_event_ids || EXCLUDED.provenance_event_ids))
                   ),
-                  last_seen = NOW()
+                  salience = GREATEST(entities.salience, EXCLUDED.salience),
+                  last_seen = GREATEST(entities.last_seen, EXCLUDED.last_seen),
+                  first_seen = LEAST(entities.first_seen, EXCLUDED.first_seen)
                 """,
                 (e["id"], e["arena"], e["entity_type"], e["canonical_name"],
                  e["aliases"], e["provenance_event_ids"],
-                 e["participant_set"], e["disclosure_class"]),
+                 e["participant_set"], e["disclosure_class"],
+                 event_time, event_time, _sal),
             )
@@ -584,7 +631,10 @@ async def extract(req: ExtractRequest):
     async with _pool.connection() as conn:
         async with conn.cursor() as cur:
             await _upsert_event(cur, req, event_id, content_hash)
-            await _upsert_entities(cur, entities)
+            # Source time of THIS event — stamps the graph rows so
+            # first/last_seen track content time, not ingest time.
+            event_time = event_source_time({"attributes": req.attributes})
+            await _upsert_entities(cur, entities, event_time)
             # Facts + relationships are deliberately left to the async
             # distillation worker — the deterministic path can't
             # reliably extract decisions/commitments without LLM context.

package/packages/memory-engine-v2/extractor-sync/source_time.py ADDED Viewed

@@ -0,0 +1,63 @@
+"""source_time — robust ISO-8601 source-time parsing for graph stamping.
+The memory graph must stamp `events.emitted_at` and the graph rows'
+`first_seen` / `last_seen` / `asserted_at` from the SOURCE time of the
+content (when the email/meeting/message actually happened), NOT the
+ingest wall-clock (`NOW()`). The source time is carried on the event as
+`attributes.timestamp` (ISO-8601). This helper promotes it.
+Mirrors `compat/server.py:_parse_ts` (handles the bare `Z` suffix that
+`datetime.fromisoformat` only learned in 3.11) but returns a tz-aware
+`datetime` rather than a unix float, because the destination columns are
+`TIMESTAMPTZ` and we want psycopg to bind a datetime, not an epoch.
+CONTRACT (load-bearing): callers MUST fall back to the existing default
+(received / NOW) when the source time is absent or unparseable. This
+helper NEVER raises and returns `None` on anything it can't parse — the
+caller is responsible for the `or NOW()` fallback so we never NULL a
+NOT NULL column or crash the ingest/distill path.
+NOTE: keep the extractor-sync/ and extractor-async/ copies of this file
+byte-identical. Same convention as entity_id.py — two services, one
+parsing rule.
+"""
+from __future__ import annotations
+from datetime import datetime, timezone
+from typing import Any
+def parse_source_time(value: Any) -> datetime | None:
+    """Best-effort ISO-8601 -> tz-aware datetime. Returns None on
+    anything we can't parse (caller falls back to NOW()).
+    Accepts both the bare `Z` suffix and explicit offsets. A parsed
+    value with no offset is assumed UTC (the producers emit UTC ISO
+    strings; a naive datetime would break TIMESTAMPTZ comparisons)."""
+    if not isinstance(value, str) or not value:
+        return None
+    try:
+        # `fromisoformat` handles `+00:00` but not the bare `Z` suffix
+        # until Python 3.11; normalise to be safe across runtime
+        # versions on the engine box.
+        dt = datetime.fromisoformat(value.replace("Z", "+00:00"))
+    except Exception:
+        return None
+    if dt.tzinfo is None:
+        # Producer emitted a naive ISO string; treat as UTC rather than
+        # letting psycopg interpret it in the server's local zone.
+        dt = dt.replace(tzinfo=timezone.utc)
+    return dt
+def event_source_time(event: dict[str, Any]) -> datetime | None:
+    """Pull the source time off an event dict's attributes.
+    Precedence: `attributes.timestamp` (the source/content time) wins
+    over `attributes.emitted_at` (a producer-supplied emit-now, which is
+    closer to ingest time). Returns None if neither parses — caller
+    falls back to NOW()."""
+    attrs = event.get("attributes") or {}
+    return parse_source_time(attrs.get("timestamp")) or parse_source_time(
+        attrs.get("emitted_at")
+    )

package/packages/memory-engine-v2/extractor-sync/test_confidence_parity.py ADDED Viewed

@@ -0,0 +1,18 @@
+"""extractor-sync/confidence.py must stay byte-identical to extractor-async's
+copy — both carry born_salience, whose scale must match the Fusion Drive decay
+side. Same drift guard as test_entity_id_parity.py across the build contexts."""
+from __future__ import annotations
+import os
+def test_sync_confidence_is_byte_identical_to_async():
+    here = os.path.dirname(__file__)
+    sync = os.path.join(here, "confidence.py")
+    async_ = os.path.join(here, "..", "extractor-async", "confidence.py")
+    with open(sync, "rb") as f:
+        a = f.read()
+    with open(async_, "rb") as f:
+        b = f.read()
+    assert a == b, "extractor-sync/confidence.py drifted from extractor-async/confidence.py"

package/packages/memory-engine-v2/extractor-sync/test_paired_extraction.py CHANGED Viewed

@@ -273,7 +273,7 @@ def test_pool_keeps_default_tuple_row_factory() -> None:
 def test_upsert_entities_merge_branch_with_tuple_rows() -> None:
     """Entity already exists → UPDATE branch runs, id taken from row[0]."""
     cur = _FakeCursor(existing_id="e_existing")
-    asyncio.run(sync_server._upsert_entities(cur, [_entity_stub()]))
+    asyncio.run(sync_server._upsert_entities(cur, [_entity_stub()], None))
     updates = [(s, p) for s, p in cur.executed if s.startswith("UPDATE entities")]
     assert len(updates) == 1
     _, params = updates[0]
@@ -283,7 +283,7 @@ def test_upsert_entities_merge_branch_with_tuple_rows() -> None:
 def test_upsert_entities_insert_branch_when_no_match() -> None:
     cur = _FakeCursor(existing_id=None)
-    asyncio.run(sync_server._upsert_entities(cur, [_entity_stub()]))
+    asyncio.run(sync_server._upsert_entities(cur, [_entity_stub()], None))
     inserts = [s for s, _ in cur.executed if s.startswith("INSERT INTO entities")]
     assert len(inserts) == 1
     assert not any(s.startswith("UPDATE entities") for s, _ in cur.executed)

package/packages/memory-engine-v2/fusion_drive/__init__.py ADDED Viewed

File without changes