@pentatonic-ai/ai-agent-sdk 0.9.6 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127) hide show
  1. package/README.md +3 -3
  2. package/bin/cli.js +1 -1
  3. package/bin/commands/config.js +1 -1
  4. package/dist/index.cjs +1 -1
  5. package/dist/index.js +1 -1
  6. package/package.json +2 -2
  7. package/packages/doctor/src/checks/local-memory.js +2 -2
  8. package/packages/memory/README.md +2 -2
  9. package/packages/memory/openclaw-plugin/README.md +2 -2
  10. package/packages/memory/openclaw-plugin/openclaw.plugin.json +1 -1
  11. package/packages/memory/src/server.js +2 -2
  12. package/packages/memory-engine-v2/.env.example +30 -0
  13. package/packages/memory-engine-v2/README.md +125 -0
  14. package/packages/memory-engine-v2/compat/Dockerfile +11 -0
  15. package/packages/memory-engine-v2/compat/requirements.txt +6 -0
  16. package/packages/memory-engine-v2/compat/server.py +1047 -0
  17. package/packages/memory-engine-v2/docker-compose.aws.yml +78 -0
  18. package/packages/memory-engine-v2/docker-compose.yml +206 -0
  19. package/packages/memory-engine-v2/extractor-async/Dockerfile +14 -0
  20. package/packages/memory-engine-v2/extractor-async/confidence.py +62 -0
  21. package/packages/memory-engine-v2/extractor-async/noise_filter.py +144 -0
  22. package/packages/memory-engine-v2/extractor-async/requirements.txt +2 -0
  23. package/packages/memory-engine-v2/extractor-async/test_confidence.py +76 -0
  24. package/packages/memory-engine-v2/extractor-async/test_noise_filter.py +177 -0
  25. package/packages/memory-engine-v2/extractor-async/worker.py +797 -0
  26. package/packages/memory-engine-v2/extractor-sync/Dockerfile +11 -0
  27. package/packages/memory-engine-v2/extractor-sync/requirements.txt +4 -0
  28. package/packages/memory-engine-v2/extractor-sync/server.py +424 -0
  29. package/packages/memory-engine-v2/org-model/migrations/001_init.sql +390 -0
  30. package/packages/memory-engine-v2/tests/e2e_smoke.py +356 -0
  31. package/packages/memory-engine-v2/tests/fixtures/generate_synthetic_corpus.py +758 -0
  32. package/packages/memory-engine/.env.example +0 -13
  33. package/packages/memory-engine/MIGRATION.md +0 -219
  34. package/packages/memory-engine/README.md +0 -145
  35. package/packages/memory-engine/bench/README.md +0 -99
  36. package/packages/memory-engine/bench/scorecards-engine/agent-coding__pentatonic-baseline__20260427-142523.json +0 -1115
  37. package/packages/memory-engine/bench/scorecards-engine/chat-recall__pentatonic-baseline__20260427-142648.json +0 -819
  38. package/packages/memory-engine/bench/scorecards-engine/circular-economy__pentatonic-baseline__20260427-142757.json +0 -1278
  39. package/packages/memory-engine/bench/scorecards-engine/customer-support__pentatonic-baseline__20260427-142900.json +0 -1018
  40. package/packages/memory-engine/bench/scorecards-engine/marketplace-ops__pentatonic-baseline__20260427-142957.json +0 -1038
  41. package/packages/memory-engine/bench/scorecards-engine/product-catalogue__pentatonic-baseline__20260427-143122.json +0 -961
  42. package/packages/memory-engine/bench/scorecards-engine-via-docker/agent-coding__pentatonic-memory__20260427-161812.json +0 -1115
  43. package/packages/memory-engine/bench/scorecards-engine-via-docker/chat-recall__pentatonic-memory__20260427-161701.json +0 -819
  44. package/packages/memory-engine/bench/scorecards-engine-via-docker/circular-economy__pentatonic-memory__20260427-161713.json +0 -1278
  45. package/packages/memory-engine/bench/scorecards-engine-via-docker/customer-support__pentatonic-memory__20260427-161723.json +0 -1018
  46. package/packages/memory-engine/bench/scorecards-engine-via-docker/marketplace-ops__pentatonic-memory__20260427-161732.json +0 -1038
  47. package/packages/memory-engine/bench/scorecards-engine-via-docker/product-catalogue__pentatonic-memory__20260427-161741.json +0 -937
  48. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/agent-coding__pentatonic-memory__20260427-184718.json +0 -1115
  49. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/chat-recall__pentatonic-memory__20260427-184614.json +0 -819
  50. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/circular-economy__pentatonic-memory__20260427-184809.json +0 -1278
  51. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/customer-support__pentatonic-memory__20260427-184854.json +0 -1018
  52. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/marketplace-ops__pentatonic-memory__20260427-184929.json +0 -1038
  53. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/product-catalogue__pentatonic-memory__20260427-185015.json +0 -961
  54. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/agent-coding__pentatonic-memory__20260427-175252.json +0 -1115
  55. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/chat-recall__pentatonic-memory__20260427-175312.json +0 -819
  56. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/circular-economy__pentatonic-memory__20260427-175335.json +0 -1278
  57. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/customer-support__pentatonic-memory__20260427-175355.json +0 -1018
  58. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/marketplace-ops__pentatonic-memory__20260427-175413.json +0 -1038
  59. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/product-catalogue__pentatonic-memory__20260427-175430.json +0 -883
  60. package/packages/memory-engine/bench/scorecards-engine-via-shim/agent-coding__pentatonic-memory__20260427-155409.json +0 -1115
  61. package/packages/memory-engine/bench/scorecards-engine-via-shim/chat-recall__pentatonic-memory__20260427-155421.json +0 -819
  62. package/packages/memory-engine/bench/scorecards-engine-via-shim/circular-economy__pentatonic-memory__20260427-155433.json +0 -1278
  63. package/packages/memory-engine/bench/scorecards-engine-via-shim/customer-support__pentatonic-memory__20260427-155443.json +0 -1018
  64. package/packages/memory-engine/bench/scorecards-engine-via-shim/marketplace-ops__pentatonic-memory__20260427-155453.json +0 -1038
  65. package/packages/memory-engine/bench/scorecards-engine-via-shim/product-catalogue__pentatonic-memory__20260427-155503.json +0 -937
  66. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/agent-coding__pentatonic-memory-latest__20260427-145103.json +0 -1115
  67. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/agent-coding__pentatonic-memory__20260427-144909.json +0 -1115
  68. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/chat-recall__pentatonic-memory-latest__20260427-145153.json +0 -819
  69. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/chat-recall__pentatonic-memory__20260427-145120.json +0 -542
  70. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/circular-economy__pentatonic-memory-latest__20260427-145313.json +0 -1278
  71. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/circular-economy__pentatonic-memory__20260427-145207.json +0 -894
  72. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/customer-support__pentatonic-memory-latest__20260427-145412.json +0 -1018
  73. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/customer-support__pentatonic-memory__20260427-145327.json +0 -680
  74. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/marketplace-ops__pentatonic-memory-latest__20260427-145517.json +0 -1038
  75. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/marketplace-ops__pentatonic-memory__20260427-145422.json +0 -693
  76. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/product-catalogue__pentatonic-memory-latest__20260427-145616.json +0 -961
  77. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/product-catalogue__pentatonic-memory__20260427-145528.json +0 -727
  78. package/packages/memory-engine/compat/Dockerfile +0 -22
  79. package/packages/memory-engine/compat/server.py +0 -1255
  80. package/packages/memory-engine/docker-compose.test.yml +0 -59
  81. package/packages/memory-engine/docker-compose.yml +0 -255
  82. package/packages/memory-engine/engine/README.md +0 -52
  83. package/packages/memory-engine/engine/l2-hybridrag-proxy.py +0 -1543
  84. package/packages/memory-engine/engine/l5-comms-layer.py +0 -663
  85. package/packages/memory-engine/engine/l6-document-store.py +0 -1018
  86. package/packages/memory-engine/engine/services/_shared/__init__.py +0 -1
  87. package/packages/memory-engine/engine/services/_shared/embed_provider.py +0 -562
  88. package/packages/memory-engine/engine/services/l2/Dockerfile +0 -50
  89. package/packages/memory-engine/engine/services/l2/init_databases.py +0 -81
  90. package/packages/memory-engine/engine/services/l2/l2-hybridrag-proxy.py +0 -2721
  91. package/packages/memory-engine/engine/services/l5/Dockerfile +0 -11
  92. package/packages/memory-engine/engine/services/l5/l5-comms-layer.py +0 -808
  93. package/packages/memory-engine/engine/services/l6/Dockerfile +0 -30
  94. package/packages/memory-engine/engine/services/l6/l6-document-store.py +0 -1221
  95. package/packages/memory-engine/engine/services/nv-embed/Dockerfile +0 -28
  96. package/packages/memory-engine/engine/services/nv-embed/server.py +0 -152
  97. package/packages/memory-engine/pme_memory/__init__.py +0 -0
  98. package/packages/memory-engine/pme_memory/__main__.py +0 -129
  99. package/packages/memory-engine/pme_memory/artifacts.py +0 -95
  100. package/packages/memory-engine/pme_memory/embed.py +0 -74
  101. package/packages/memory-engine/pme_memory/health.py +0 -36
  102. package/packages/memory-engine/pme_memory/hygiene.py +0 -159
  103. package/packages/memory-engine/pme_memory/indexer.py +0 -200
  104. package/packages/memory-engine/pme_memory/needs.py +0 -55
  105. package/packages/memory-engine/pme_memory/provenance.py +0 -80
  106. package/packages/memory-engine/pme_memory/scoring.py +0 -168
  107. package/packages/memory-engine/pme_memory/search.py +0 -52
  108. package/packages/memory-engine/pme_memory/store.py +0 -86
  109. package/packages/memory-engine/pme_memory/synthesis.py +0 -114
  110. package/packages/memory-engine/pyproject.toml +0 -65
  111. package/packages/memory-engine/scripts/kg-extractor.py +0 -557
  112. package/packages/memory-engine/scripts/kg-preflexor-v2.py +0 -738
  113. package/packages/memory-engine/scripts/wipe-legacy-l3-entities.py +0 -128
  114. package/packages/memory-engine/tests/e2e_arena.sh +0 -259
  115. package/packages/memory-engine/tests/embed_stub/Dockerfile +0 -13
  116. package/packages/memory-engine/tests/embed_stub/server.py +0 -80
  117. package/packages/memory-engine/tests/test_aggregate.py +0 -333
  118. package/packages/memory-engine/tests/test_api_contract.sh +0 -57
  119. package/packages/memory-engine/tests/test_arena_safety.py +0 -232
  120. package/packages/memory-engine/tests/test_channel_stat_reader.py +0 -437
  121. package/packages/memory-engine/tests/test_channel_stat_rollups.py +0 -308
  122. package/packages/memory-engine/tests/test_compat_nv_embed_probe.py +0 -48
  123. package/packages/memory-engine/tests/test_embed_provider.py +0 -693
  124. package/packages/memory-engine/tests/test_l2_qmd_vec_search.py +0 -280
  125. package/packages/memory-engine/tests/test_l3_arena_isolation.py +0 -412
  126. package/packages/memory-engine/tests/test_l6_module_load.py +0 -84
  127. package/packages/memory-engine/tests/test_people_list_reader.py +0 -432
@@ -0,0 +1,11 @@
1
+ FROM python:3.12-slim
2
+
3
+ WORKDIR /app
4
+
5
+ COPY requirements.txt .
6
+ RUN pip install --no-cache-dir -r requirements.txt
7
+
8
+ COPY server.py .
9
+
10
+ EXPOSE 8101
11
+ CMD ["uvicorn", "server:app", "--host", "0.0.0.0", "--port", "8101", "--workers", "2"]
@@ -0,0 +1,4 @@
1
+ fastapi==0.115.0
2
+ uvicorn[standard]==0.32.0
3
+ psycopg[binary,pool]==3.2.3
4
+ pydantic==2.9.2
@@ -0,0 +1,424 @@
1
+ """extractor-sync — deterministic fast-path extraction.
2
+
3
+ Called inline by compat on the write path. No LLM. Pure Python rules
4
+ per source_kind. Writes events + provisional facts/entities into
5
+ org-model, enqueues an entry on distillation_queue for the LLM
6
+ distillation worker to upgrade later.
7
+
8
+ Per-source rule registry is in `RULES` at the bottom of this file.
9
+ Adding a new source = adding an entry there, not changing this file's
10
+ core. Each rule is `(event) -> (entities[], facts[], relationships[])`
11
+ where entities/facts/relationships are provisional — the async LLM
12
+ distillation worker will upgrade them with high-confidence content.
13
+
14
+ Why this exists (vs putting it in compat): keeping the extractor
15
+ behind its own HTTP boundary lets the compat shim be the pure wire-
16
+ contract layer + lets the extractor be replaced/re-implemented
17
+ without touching compat. Also lets it scale horizontally — N extractor
18
+ replicas behind one compat — if the deterministic path ever becomes a
19
+ hot spot.
20
+ """
21
+
22
+ from __future__ import annotations
23
+
24
+ import hashlib
25
+ import logging
26
+ import os
27
+ import re
28
+ import time
29
+ from contextlib import asynccontextmanager
30
+ from typing import Any
31
+
32
+ import psycopg
33
+ import psycopg.rows
34
+ from fastapi import FastAPI, HTTPException
35
+ from psycopg_pool import AsyncConnectionPool
36
+ from pydantic import BaseModel
37
+
38
+ logging.basicConfig(level=logging.INFO)
39
+ log = logging.getLogger("extractor-sync")
40
+
41
+ PG_DSN = os.environ.get("PG_DSN", "postgresql://pme:local-dev-pw@org-model:5432/org_model")
42
+
43
+ # Connection pool. Sized for high-concurrency bursts: with TES at
44
+ # shardCount=8 and BATCH_SIZE=50 in the DO drain, compat's parallel
45
+ # asyncio.gather in /store-batch can fan ~400 simultaneous _extract
46
+ # calls at us. max_size=20 caused engine_500 cascades under that load
47
+ # (DLQ losses observed 2026-05-17). 200 covers the 8×50 ceiling with
48
+ # headroom; min_size=8 keeps a few warm so the first batch in a burst
49
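+ # doesn't pay connection-setup cost. NOTE(review): the pool below is actually created with max_size=50, not 200 — confirm which value is intended.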
50
+ _pool: AsyncConnectionPool | None = None
51
+
52
+
53
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Own the shared Postgres connection pool for the app's lifetime.

    The pool is constructed closed (``open=False``), opened explicitly
    before the app is yielded to, and closed again on the way out. The
    close runs in a ``finally`` so the connections are released even when
    shutdown is delivered as an exception thrown into this context
    manager.

    NOTE(review): the sizing comment above this function argues for a
    ceiling of 200 connections, but the pool is created with max_size=50
    (and logs the same). Confirm which value is intended.
    """
    global _pool
    _pool = AsyncConnectionPool(
        conninfo=PG_DSN,
        min_size=8,
        max_size=50,
        # Every connection hands rows back as dicts.
        kwargs={"row_factory": psycopg.rows.dict_row},
        open=False,
    )
    await _pool.open()
    log.info("extractor-sync: pool opened (min=8 max=50)")
    try:
        yield
    finally:
        # Always release the pool, including on an error-driven shutdown.
        await _pool.close()
67
+
68
+
69
+ app = FastAPI(title="pme2-extractor-sync", lifespan=lifespan)
70
+
71
+
72
+ # ----------------------------------------------------------------------
73
+ # Wire shape
74
+ # ----------------------------------------------------------------------
75
+
76
+
77
class ExtractRequest(BaseModel):
    """Request body for ``POST /extract``: a single event to process.

    Mirrors the relevant subset of TES's STORE_MEMORY envelope. The
    broader payload is validated by compat before it reaches this
    service, so only the fields the extraction rules and DB writes read
    are declared here.
    """

    # --- tenancy / identity ---
    arena: str
    clientId: str
    userId: str | None = None

    # --- provenance ---
    event_type: str = "STORE_MEMORY"
    # One of: 'chat' | 'note' | 'doc' | 'event' | 'ticket' | 'commit' | 'system' | 'agent'
    source_kind: str
    source_id: str | None = None

    # --- payload ---
    content: str
    # Free-form envelope attributes; rules read keys such as
    # participant_set, disclosure_class, organizer_email from here.
    attributes: dict[str, Any] = {}
    # ISO timestamp; defaults to now() in DB.
    # NOTE(review): the event INSERT in this module does not pass this
    # field — confirm it is meant to be unused here.
    emitted_at: str | None = None
94
+
95
+
96
class ExtractResponse(BaseModel):
    """Response body for ``POST /extract``: what one call produced."""

    # Identifier of the event row (the arena-scoped content hash).
    event_id: str
    # Sizes of the three lists returned by the per-source rule.
    entities_extracted: int
    facts_extracted: int
    relationships_extracted: int
    # Whether the event was handed to the distillation queue.
    distillation_queued: bool
102
+
103
+
104
+ # ----------------------------------------------------------------------
105
+ # Hashing + ID helpers
106
+ # ----------------------------------------------------------------------
107
+
108
+
109
def _content_hash(arena: str, content: str) -> str:
    """Return the first 32 hex chars of sha256 over ``"<arena>:<content>"``.

    Kept compatible with the v1 ``_arena_scoped_rid`` convention so that
    IDs predicted by callers continue to resolve.
    """
    scoped = ":".join((arena, content))
    digest = hashlib.sha256(scoped.encode()).hexdigest()
    return digest[:32]
114
+
115
+
116
def _entity_id(arena: str, entity_type: str, canonical_name: str) -> str:
    """Build a deterministic entity ID from arena, type and name.

    The name is lower-cased and stripped before hashing, so the same
    canonical name in the same arena always maps to the same ID and
    repeated extractions converge on one entity.
    """
    normalised = canonical_name.lower().strip()
    material = "|".join((arena, entity_type, normalised))
    return f"e_{hashlib.sha256(material.encode()).hexdigest()[:24]}"
121
+
122
+
123
def _fact_id(arena: str, category: str, subject: str | None, predicate: str | None,
             object_: str | None, statement: str) -> str:
    """Build a deterministic fact ID from the fact's full shape.

    Missing subject/predicate/object contribute an empty segment. The
    statement is part of the key, so two facts about the same subject
    with different statements get distinct IDs, while an identical fact
    converges across re-extractions.
    """
    segments = (
        arena,
        category,
        subject or "",
        predicate or "",
        object_ or "",
        statement,
    )
    digest = hashlib.sha256("|".join(segments).encode()).hexdigest()
    return "f_" + digest[:24]
130
+
131
+
132
def _relationship_id(arena: str, from_id: str, to_id: str, rel_type: str) -> str:
    """Build a deterministic relationship ID.

    The ID depends on the arena, both endpoint IDs (in order) and the
    relationship type, so the same edge always hashes to the same value.
    """
    material = "|".join((arena, from_id, to_id, rel_type))
    digest = hashlib.sha256(material.encode()).hexdigest()
    return "r_" + digest[:24]
135
+
136
+
137
+ # ----------------------------------------------------------------------
138
+ # Per-source extraction rules (deterministic, no LLM).
139
+ #
140
+ # Each rule returns `(entities, facts, relationships)`. The compat shim
141
+ # may have already pulled out emails/mentions from the message envelope
142
+ # itself; we treat those as authoritative inputs in `req.attributes`
143
+ # and only extract from `content` where unambiguous (e.g., regex
144
+ # matches on email addresses, @-mentions).
145
+ #
146
+ # These are intentionally cautious. Better to extract less and let the
147
+ # async LLM distillation fill in than to extract noisy junk that
148
+ # pollutes the org model. The keystone spec (#285) will set the κ /
149
+ # false-inclusion bars for each rule.
150
+ # ----------------------------------------------------------------------
151
+
152
+
153
# Bare email addresses, bounded by word boundaries on both sides.
EMAIL_RE = re.compile(r"\b([a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,})\b")

# Slack @-mentions of the form <@U123ABC>; captures the upper-case user ID.
MENTION_RE = re.compile(r"<@([A-Z0-9]+)>")

# http(s) URLs, running up to the next whitespace character.
URL_RE = re.compile(r"https?://\S+")
156
+
157
+
158
def _extract_chat(req: ExtractRequest, event_id: str) -> tuple[list, list, list]:
    """slack / teams / DM: extract person entities from the message body.

    Emails matched by ``EMAIL_RE`` and Slack-style ``<@ID>`` mentions
    (stored as ``slack:<ID>``) each become a provisional ``person``
    entity. Body distillation is skipped here and left to the LLM worker;
    this rule only captures who is referenced, so that peopleList /
    spreading-activation queries work on day one.

    Names that resolve to the same entity ID (a repeated address, or the
    same address in different letter case) are collapsed into one entity
    whose ``aliases`` lists every distinct spelling seen. This avoids
    redundant upserts and an inflated entity count.

    Returns:
        ``(entities, facts, relationships)``; facts and relationships are
        always empty for this rule.
    """
    participant_set = req.attributes.get("participant_set", [req.arena])
    disclosure_class = req.attributes.get("disclosure_class", "private")

    # Emails in the body first, then Slack-style mentions (upper-case IDs).
    names = list(EMAIL_RE.findall(req.content))
    names.extend(f"slack:{sid}" for sid in MENTION_RE.findall(req.content))

    by_id: dict[str, dict] = {}
    for name in names:
        eid = _entity_id(req.arena, "person", name)
        seen = by_id.get(eid)
        if seen is not None:
            # Same entity again: only remember a new spelling, if any.
            if name not in seen["aliases"]:
                seen["aliases"].append(name)
            continue
        by_id[eid] = {
            "id": eid,
            "arena": req.arena,
            "entity_type": "person",
            "canonical_name": name,
            "aliases": [name],
            "provenance_event_ids": [event_id],
            "participant_set": participant_set,
            "disclosure_class": disclosure_class,
        }
    return list(by_id.values()), [], []
193
+
194
+
195
def _extract_doc(req: ExtractRequest, event_id: str) -> tuple[list, list, list]:
    """drive / notion / confluence: record the source document as an entity.

    Content distillation is left entirely to the LLM worker. The only
    thing extracted here is a ``document`` entity named after
    ``attributes["source_url"]``, falling back to ``attributes["path"]``.

    Only non-empty string values are accepted: ``_entity_id`` calls
    ``.lower()`` on the name, so any other type would raise instead of
    being skipped.

    Returns:
        ``(entities, facts, relationships)`` with at most one entity;
        facts and relationships are always empty for this rule.
    """
    entities: list = []
    candidates = (req.attributes.get("source_url"), req.attributes.get("path"))
    src = next((c for c in candidates if isinstance(c, str) and c), None)
    if src is not None:
        entities.append({
            "id": _entity_id(req.arena, "document", src),
            "arena": req.arena,
            "entity_type": "document",
            "canonical_name": src,
            "aliases": [src],
            "provenance_event_ids": [event_id],
            "participant_set": req.attributes.get("participant_set", [req.arena]),
            "disclosure_class": req.attributes.get("disclosure_class", "private"),
        })
    return entities, [], []
213
+
214
+
215
def _extract_note(req: ExtractRequest, event_id: str) -> tuple[list, list, list]:
    """gmail / drafts: extract sender and recipient addresses as people.

    Reads ``from_email``, ``to_email`` and ``cc_email`` from
    ``req.attributes``; each string value containing ``"@"`` becomes a
    provisional ``person`` entity. The subject is not extracted here.

    An address that appears under more than one key (or differs only in
    letter case) resolves to one entity ID and is emitted once, with every
    distinct spelling kept in ``aliases``.

    NOTE(review): only string values are read; if callers send lists of
    recipients they are ignored — confirm the attribute schema.

    Returns:
        ``(entities, facts, relationships)``; facts and relationships are
        always empty for this rule.
    """
    participant_set = req.attributes.get("participant_set", [req.arena])
    disclosure_class = req.attributes.get("disclosure_class", "private")

    by_id: dict[str, dict] = {}
    for key in ("from_email", "to_email", "cc_email"):
        val = req.attributes.get(key)
        if not (isinstance(val, str) and "@" in val):
            continue
        eid = _entity_id(req.arena, "person", val)
        seen = by_id.get(eid)
        if seen is not None:
            if val not in seen["aliases"]:
                seen["aliases"].append(val)
            continue
        by_id[eid] = {
            "id": eid,
            "arena": req.arena,
            "entity_type": "person",
            "canonical_name": val,
            "aliases": [val],
            "provenance_event_ids": [event_id],
            "participant_set": participant_set,
            "disclosure_class": disclosure_class,
        }
    return list(by_id.values()), [], []
233
+
234
+
235
def _extract_event(req: ExtractRequest, event_id: str) -> tuple[list, list, list]:
    """calendar: extract organizer + attendees as people entities.

    Reads ``organizer_email`` and ``attendee_emails`` from
    ``req.attributes``. Each string containing ``"@"`` becomes a
    provisional ``person`` entity, organizer first.

    ``attendee_emails`` is normalised before use: a single string is
    treated as one address (rather than being unpacked character by
    character), and any value that is not a list or tuple is ignored
    instead of raising. An organizer who is also listed as an attendee
    is emitted once.

    Returns:
        ``(entities, facts, relationships)``; facts and relationships are
        always empty for this rule.
    """
    participant_set = req.attributes.get("participant_set", [req.arena])
    disclosure_class = req.attributes.get("disclosure_class", "private")

    organizer = req.attributes.get("organizer_email")
    attendees = req.attributes.get("attendee_emails") or []
    if isinstance(attendees, str):
        attendees = [attendees]
    elif not isinstance(attendees, (list, tuple)):
        attendees = []

    by_id: dict[str, dict] = {}
    for email in (organizer, *attendees):
        if not (isinstance(email, str) and "@" in email):
            continue
        eid = _entity_id(req.arena, "person", email)
        seen = by_id.get(eid)
        if seen is not None:
            # Same person again: only remember a new spelling, if any.
            if email not in seen["aliases"]:
                seen["aliases"].append(email)
            continue
        by_id[eid] = {
            "id": eid,
            "arena": req.arena,
            "entity_type": "person",
            "canonical_name": email,
            "aliases": [email],
            "provenance_event_ids": [event_id],
            "participant_set": participant_set,
            "disclosure_class": disclosure_class,
        }
    return list(by_id.values()), [], []
254
+
255
+
256
def _extract_default(req: ExtractRequest, event_id: str) -> tuple[list, list, list]:
    """Fallback rule for source kinds without a dedicated extractor.

    Extracts bare email addresses from ``req.content`` as provisional
    ``person`` entities and nothing else; async distillation does the
    heavy lifting.

    Addresses that resolve to the same entity ID (repeats, or the same
    address in different letter case) are emitted once, with every
    distinct spelling kept in ``aliases``.

    Returns:
        ``(entities, facts, relationships)``; facts and relationships are
        always empty for this rule.
    """
    participant_set = req.attributes.get("participant_set", [req.arena])
    disclosure_class = req.attributes.get("disclosure_class", "private")

    by_id: dict[str, dict] = {}
    for address in EMAIL_RE.findall(req.content):
        eid = _entity_id(req.arena, "person", address)
        seen = by_id.get(eid)
        if seen is not None:
            if address not in seen["aliases"]:
                seen["aliases"].append(address)
            continue
        by_id[eid] = {
            "id": eid,
            "arena": req.arena,
            "entity_type": "person",
            "canonical_name": address,
            "aliases": [address],
            "provenance_event_ids": [event_id],
            "participant_set": participant_set,
            "disclosure_class": disclosure_class,
        }
    return list(by_id.values()), [], []
273
+
274
+
275
# Per-source rule registry, keyed by ``source_kind``. A kind with no
# entry here is handled by ``_extract_default`` at dispatch time.
RULES = dict(
    chat=_extract_chat,
    doc=_extract_doc,
    note=_extract_note,
    event=_extract_event,
)
281
+
282
+
283
+ # ----------------------------------------------------------------------
284
+ # DB writes (idempotent upsert)
285
+ # ----------------------------------------------------------------------
286
+
287
+
288
async def _upsert_event(cur: psycopg.AsyncCursor, req: ExtractRequest,
                        event_id: str, content_hash: str) -> None:
    """Insert the raw event row; re-emitting the same event is a no-op.

    Uses ``ON CONFLICT (id) DO NOTHING``, so a second insert with the same
    ``event_id`` leaves the existing row untouched.

    Args:
        cur: Open async cursor; the caller owns the transaction.
        req: The extraction request supplying the column values.
        event_id: Primary key for the row.
        content_hash: Value stored in the ``content_hash`` column.

    ``participant_set``, ``participant_kind`` and ``disclosure_class`` come
    from ``req.attributes`` and default to ``[req.arena]``, ``"unknown"``
    and ``"private"`` respectively.
    """
    # Imported explicitly: this module only imports ``psycopg`` and
    # ``psycopg.rows``, so the ``psycopg.types.json`` attribute path was
    # only reachable as a side effect of psycopg's own imports.
    from psycopg.types.json import Json

    attrs = req.attributes
    await cur.execute(
        """
        INSERT INTO events (
            id, arena, client_id, user_id, event_type, source_kind,
            source_id, content, content_hash, participant_set,
            participant_kind, disclosure_class, attributes
        ) VALUES (
            %s, %s, %s, %s, %s, %s::source_kind,
            %s, %s, %s, %s,
            %s::participant_kind, %s::disclosure_class, %s::jsonb
        )
        ON CONFLICT (id) DO NOTHING
        """,
        (
            event_id, req.arena, req.clientId, req.userId,
            req.event_type, req.source_kind, req.source_id,
            req.content, content_hash,
            attrs.get("participant_set", [req.arena]),
            attrs.get("participant_kind", "unknown"),
            attrs.get("disclosure_class", "private"),
            Json(attrs),
        ),
    )
314
+
315
+
316
async def _upsert_entities(cur: psycopg.AsyncCursor, entities: list[dict]) -> None:
    """Upsert each entity, one statement per entity.

    A new ID inserts a row. An existing ID keeps its row and only grows:
    ``aliases`` and ``provenance_event_ids`` become the de-duplicated
    union of the stored and incoming arrays (concatenate, UNNEST, SELECT
    DISTINCT), and ``last_seen`` is set to ``NOW()``. Existing array
    values are never replaced.
    """
    statement = """
        INSERT INTO entities (
            id, arena, entity_type, canonical_name, aliases,
            provenance_event_ids, participant_set, disclosure_class
        ) VALUES (
            %s, %s, %s, %s, %s, %s, %s, %s::disclosure_class
        )
        ON CONFLICT (id) DO UPDATE SET
            aliases = (
                SELECT ARRAY(SELECT DISTINCT UNNEST(entities.aliases || EXCLUDED.aliases))
            ),
            provenance_event_ids = (
                SELECT ARRAY(SELECT DISTINCT UNNEST(entities.provenance_event_ids || EXCLUDED.provenance_event_ids))
            ),
            last_seen = NOW()
        """
    columns = (
        "id", "arena", "entity_type", "canonical_name",
        "aliases", "provenance_event_ids",
        "participant_set", "disclosure_class",
    )
    for entity in entities:
        params = tuple(entity[col] for col in columns)
        await cur.execute(statement, params)
342
+
343
+
344
async def _enqueue_distillation(cur: psycopg.AsyncCursor, event_id: str) -> None:
    """Queue an LLM distillation pass for ``event_id``.

    Inserts a ``pending`` row unless a row for the same event already
    exists with status ``pending``, ``claimed`` or ``done``, in which case
    the statement inserts nothing.

    NOTE(review): the existence check and the insert are one statement but
    not guarded by a constraint visible here; whether concurrent callers
    can both insert depends on the table definition — confirm.
    """
    statement = """
        INSERT INTO distillation_queue (event_id, status)
        SELECT %s, 'pending'
        WHERE NOT EXISTS (
            SELECT 1 FROM distillation_queue
            WHERE event_id = %s AND status IN ('pending', 'claimed', 'done')
        )
        """
    # The event ID is bound twice: once for the insert, once for the check.
    params = (event_id, event_id)
    await cur.execute(statement, params)
358
+
359
+
360
+ # ----------------------------------------------------------------------
361
+ # HTTP endpoints
362
+ # ----------------------------------------------------------------------
363
+
364
+
365
@app.get("/health")
async def health():
    """Liveness probe that never touches the database.

    Kept DB-free on purpose so compose healthchecks do not depend on the
    database being reachable at the moment they run.
    """
    payload = {"status": "healthy", "service": "extractor-sync"}
    return payload
370
+
371
+
372
@app.get("/health/deep")
async def health_deep():
    """Readiness probe that includes a DB round-trip through the pool.

    Raises:
        HTTPException(503): if the pool has not been created yet, if the
            ``SELECT 1`` round-trip raises, or if it returns no row.
    """
    if _pool is None:
        raise HTTPException(503, "pool not initialised")
    try:
        async with _pool.connection() as conn:
            async with conn.cursor() as cur:
                await cur.execute("SELECT 1")
                row = await cur.fetchone()
    except Exception as e:
        # Probe boundary: any failure to reach the DB means "not healthy".
        raise HTTPException(503, f"db error: {e}") from e
    # Checked outside the try so this 503 is not caught by the handler
    # above and re-wrapped as a "db error: ..." response.
    if not row:
        raise HTTPException(503, "db query returned empty")
    return {"status": "healthy", "service": "extractor-sync", "db": "ok"}
387
+
388
+
389
@app.post("/extract", response_model=ExtractResponse)
async def extract(req: ExtractRequest):
    """Process one event through the deterministic fast path.

    Steps: derive the event ID from the arena-scoped content hash, run the
    rule registered for ``req.source_kind`` (or the default rule), then on
    one pooled connection upsert the event, upsert the extracted entities
    and enqueue the event for distillation.

    Re-submitting the same event converges to the same state, so callers
    may retry safely.

    Raises:
        HTTPException(503): if the connection pool has not been created.
    """
    if _pool is None:
        raise HTTPException(503, "pool not initialised")

    started = time.perf_counter()

    # Identity == content-hash, per the schema doc.
    content_hash = _content_hash(req.arena, req.content)
    event_id = content_hash

    # Pick the per-source rule; unregistered kinds use the default rule.
    handler = RULES.get(req.source_kind, _extract_default)
    entities, facts, relationships = handler(req, event_id)

    async with _pool.connection() as conn, conn.cursor() as cur:
        await _upsert_event(cur, req, event_id, content_hash)
        await _upsert_entities(cur, entities)
        # Facts + relationships are deliberately not written here: the
        # deterministic path can't reliably extract decisions/commitments
        # without LLM context, so the async distillation worker owns them.
        # Follow-up idea: deterministic relationships from an explicit
        # participant_set (e.g. calendar: "organizer communicated_with
        # attendee").
        await _enqueue_distillation(cur, event_id)

    elapsed_ms = (time.perf_counter() - started) * 1000
    log.info(
        "extract event_id=%s entities=%d ms=%.1f",
        event_id, len(entities), elapsed_ms,
    )
    return ExtractResponse(
        event_id=event_id,
        entities_extracted=len(entities),
        facts_extracted=len(facts),
        relationships_extracted=len(relationships),
        distillation_queued=True,
    )