@cello-protocol/daemon 0.0.3 → 0.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/agent-loader.d.ts +41 -0
- package/dist/agent-loader.d.ts.map +1 -0
- package/dist/agent-loader.js +94 -0
- package/dist/agent-loader.js.map +1 -0
- package/dist/bin/cello-daemon.d.ts +13 -0
- package/dist/bin/cello-daemon.d.ts.map +1 -0
- package/dist/bin/cello-daemon.js +170 -0
- package/dist/bin/cello-daemon.js.map +1 -0
- package/dist/cello-node-transport-dialer.d.ts +59 -0
- package/dist/cello-node-transport-dialer.d.ts.map +1 -0
- package/dist/cello-node-transport-dialer.js +108 -0
- package/dist/cello-node-transport-dialer.js.map +1 -0
- package/dist/challenge-verifier.d.ts +12 -0
- package/dist/challenge-verifier.d.ts.map +1 -0
- package/dist/challenge-verifier.js +11 -0
- package/dist/challenge-verifier.js.map +1 -0
- package/dist/connect-or-start.d.ts +25 -0
- package/dist/connect-or-start.d.ts.map +1 -0
- package/dist/connect-or-start.js +117 -0
- package/dist/connect-or-start.js.map +1 -0
- package/dist/content-park-client.d.ts +49 -0
- package/dist/content-park-client.d.ts.map +1 -0
- package/dist/content-park-client.js +196 -0
- package/dist/content-park-client.js.map +1 -0
- package/dist/daemon.d.ts +65 -0
- package/dist/daemon.d.ts.map +1 -0
- package/dist/daemon.js +3202 -0
- package/dist/daemon.js.map +1 -0
- package/dist/directory-bootstrap.d.ts +55 -0
- package/dist/directory-bootstrap.d.ts.map +1 -0
- package/dist/directory-bootstrap.js +102 -0
- package/dist/directory-bootstrap.js.map +1 -0
- package/dist/file-manifest-provider.d.ts +18 -0
- package/dist/file-manifest-provider.d.ts.map +1 -0
- package/dist/file-manifest-provider.js +72 -0
- package/dist/file-manifest-provider.js.map +1 -0
- package/dist/index.d.ts +18 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +18 -0
- package/dist/index.js.map +1 -0
- package/dist/ipc-client.d.ts +31 -0
- package/dist/ipc-client.d.ts.map +1 -0
- package/dist/ipc-client.js +112 -0
- package/dist/ipc-client.js.map +1 -0
- package/dist/ipc-server.d.ts +49 -0
- package/dist/ipc-server.d.ts.map +1 -0
- package/dist/ipc-server.js +268 -0
- package/dist/ipc-server.js.map +1 -0
- package/dist/lock-file.d.ts +27 -0
- package/dist/lock-file.d.ts.map +1 -0
- package/dist/lock-file.js +84 -0
- package/dist/lock-file.js.map +1 -0
- package/dist/manifest-loader.d.ts +33 -0
- package/dist/manifest-loader.d.ts.map +1 -0
- package/dist/manifest-loader.js +70 -0
- package/dist/manifest-loader.js.map +1 -0
- package/dist/manifest-poll-scheduler.d.ts +31 -0
- package/dist/manifest-poll-scheduler.d.ts.map +1 -0
- package/dist/manifest-poll-scheduler.js +59 -0
- package/dist/manifest-poll-scheduler.js.map +1 -0
- package/dist/manifest-version-store-file.d.ts +18 -0
- package/dist/manifest-version-store-file.d.ts.map +1 -0
- package/dist/manifest-version-store-file.js +40 -0
- package/dist/manifest-version-store-file.js.map +1 -0
- package/dist/manifest-version-store.d.ts +14 -0
- package/dist/manifest-version-store.d.ts.map +1 -0
- package/dist/manifest-version-store.js +13 -0
- package/dist/manifest-version-store.js.map +1 -0
- package/dist/network-directory-node.d.ts +94 -0
- package/dist/network-directory-node.d.ts.map +1 -0
- package/dist/network-directory-node.js +626 -0
- package/dist/network-directory-node.js.map +1 -0
- package/dist/nonce-dedup.d.ts +68 -0
- package/dist/nonce-dedup.d.ts.map +1 -0
- package/dist/nonce-dedup.js +204 -0
- package/dist/nonce-dedup.js.map +1 -0
- package/dist/notification-dispatcher.d.ts +65 -0
- package/dist/notification-dispatcher.d.ts.map +1 -0
- package/dist/notification-dispatcher.js +138 -0
- package/dist/notification-dispatcher.js.map +1 -0
- package/dist/registration-context.d.ts +69 -0
- package/dist/registration-context.d.ts.map +1 -0
- package/dist/registration-context.js +118 -0
- package/dist/registration-context.js.map +1 -0
- package/dist/registration-manager.d.ts +72 -0
- package/dist/registration-manager.d.ts.map +1 -0
- package/dist/registration-manager.js +267 -0
- package/dist/registration-manager.js.map +1 -0
- package/dist/registration-persistence.d.ts +131 -0
- package/dist/registration-persistence.d.ts.map +1 -0
- package/dist/registration-persistence.js +233 -0
- package/dist/registration-persistence.js.map +1 -0
- package/dist/retry-queue.d.ts +144 -0
- package/dist/retry-queue.d.ts.map +1 -0
- package/dist/retry-queue.js +444 -0
- package/dist/retry-queue.js.map +1 -0
- package/dist/seal-frontier-verify.d.ts +58 -0
- package/dist/seal-frontier-verify.d.ts.map +1 -0
- package/dist/seal-frontier-verify.js +87 -0
- package/dist/seal-frontier-verify.js.map +1 -0
- package/dist/seal-legibility-tbs.d.ts +25 -0
- package/dist/seal-legibility-tbs.d.ts.map +1 -0
- package/dist/seal-legibility-tbs.js +78 -0
- package/dist/seal-legibility-tbs.js.map +1 -0
- package/dist/seal-upgrade.d.ts +90 -0
- package/dist/seal-upgrade.d.ts.map +1 -0
- package/dist/seal-upgrade.js +178 -0
- package/dist/seal-upgrade.js.map +1 -0
- package/dist/session-assignment-parser.d.ts +22 -0
- package/dist/session-assignment-parser.d.ts.map +1 -0
- package/dist/session-assignment-parser.js +139 -0
- package/dist/session-assignment-parser.js.map +1 -0
- package/dist/session-ceremony.d.ts +156 -0
- package/dist/session-ceremony.d.ts.map +1 -0
- package/dist/session-ceremony.js +447 -0
- package/dist/session-ceremony.js.map +1 -0
- package/dist/session-connection-gater.d.ts +91 -0
- package/dist/session-connection-gater.d.ts.map +1 -0
- package/dist/session-connection-gater.js +146 -0
- package/dist/session-connection-gater.js.map +1 -0
- package/dist/session-node-manager.d.ts +585 -0
- package/dist/session-node-manager.d.ts.map +1 -0
- package/dist/session-node-manager.js +2609 -0
- package/dist/session-node-manager.js.map +1 -0
- package/dist/session-relay-client.d.ts +101 -0
- package/dist/session-relay-client.d.ts.map +1 -0
- package/dist/session-relay-client.js +520 -0
- package/dist/session-relay-client.js.map +1 -0
- package/dist/session-tree.d.ts +80 -0
- package/dist/session-tree.d.ts.map +1 -0
- package/dist/session-tree.js +123 -0
- package/dist/session-tree.js.map +1 -0
- package/dist/signaling-connect.d.ts +83 -0
- package/dist/signaling-connect.d.ts.map +1 -0
- package/dist/signaling-connect.js +266 -0
- package/dist/signaling-connect.js.map +1 -0
- package/dist/transcript-cipher.d.ts +31 -0
- package/dist/transcript-cipher.d.ts.map +1 -0
- package/dist/transcript-cipher.js +74 -0
- package/dist/transcript-cipher.js.map +1 -0
- package/dist/transport-composition.d.ts +31 -0
- package/dist/transport-composition.d.ts.map +1 -0
- package/dist/transport-composition.js +55 -0
- package/dist/transport-composition.js.map +1 -0
- package/dist/transport-selector.d.ts +189 -0
- package/dist/transport-selector.d.ts.map +1 -0
- package/dist/transport-selector.js +195 -0
- package/dist/transport-selector.js.map +1 -0
- package/dist/types.d.ts +265 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +33 -0
- package/dist/types.js.map +1 -0
- package/package.json +4 -4
|
@@ -0,0 +1,2609 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* CELLO Daemon — SessionNodeManager
|
|
3
|
+
*
|
|
4
|
+
* Manages the lifecycle of all ephemeral session nodes:
|
|
5
|
+
* 1. Per-session nodes: fresh transport key + Peer ID, connectionGater allows
|
|
6
|
+
* only the designated counterparty. Created during cello_initiate_session
|
|
7
|
+
* (outbound) or cello_await_session (inbound, via standing receiver handoff).
|
|
8
|
+
* 2. Standing receiver node: pre-created, open gater, kept alive at all times.
|
|
9
|
+
* Handed to the first inbound session; immediately replaced.
|
|
10
|
+
* 3. 32-node cap: enforced before any new node is created.
|
|
11
|
+
* 4. Session status in SQLite: active → sealed (on close) or interrupted
|
|
12
|
+
* (on graceful shutdown or SIGKILL-restart detection).
|
|
13
|
+
*
|
|
14
|
+
* Pseudocode (SPARC Phase P):
|
|
15
|
+
*
|
|
16
|
+
* initialize():
|
|
17
|
+
* 1. Open SQLite (node:sqlite), create sessions table if not exists
|
|
18
|
+
* 2. Detect interrupted sessions: SELECT * FROM sessions WHERE status='active'
|
|
19
|
+
* → batch-update to 'interrupted', log session.interrupted.detected for each
|
|
20
|
+
* (source: 'daemon_restart') — runs before IPC socket opens so no race
|
|
21
|
+
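* 3. [Superseded by DOD-LOOP-1: steps 3–5 no longer run inside initialize(). Standing receivers
*    are per-agent and are created when an agent comes online (ensureStandingReceiverForAgent).]
*    Create standing receiver node (fresh libp2p, open gater, sentinel agentName)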
|
|
22
|
+
* 4. Start standing receiver, set standingReceiverReady=true
|
|
23
|
+
* 5. Log session.node.created for the standing receiver
|
|
24
|
+
*
|
|
25
|
+
* createSessionNode(sessionId, agentName, counterpartyPubkey, counterpartyPeerId, correlationId):
|
|
26
|
+
* Pseudocode:
|
|
27
|
+
* 1. Check activeNodes.size >= MAX_SESSION_NODES → log cap.reached, return error
|
|
28
|
+
* 2. Create SessionConnectionGater(counterpartyPeerId) — restricted from birth
|
|
29
|
+
* 3. nodeFactory.createNode({gater}) → fresh libp2p node
|
|
30
|
+
* 4. node.start() — bind TCP ephemeral port
|
|
31
|
+
* 5. Insert SQLite row status='active'
|
|
32
|
+
* 6. Log session.node.created
|
|
33
|
+
* 7. Add to activeNodes map
|
|
34
|
+
* 8. Return {ok:true, peerId, addrs}
|
|
35
|
+
* On libp2p error: extract error.message (never ${error}), log create.failed, return error
|
|
36
|
+
*
|
|
37
|
+
* acceptSession(sessionId, agentName, counterpartyPubkey, initiatorPeerId, correlationId):
|
|
38
|
+
* Pseudocode:
|
|
39
|
+
* 1. If !standingReceiverReady → return standing_receiver_unavailable
|
|
40
|
+
* 2. Take standing receiver from slot (clear slot atomically)
|
|
41
|
+
* 3. gater.setAllowedPeer(initiatorPeerId) ← BEFORE returning multiaddr (AC-015)
|
|
42
|
+
* 4. Insert SQLite row status='active'
|
|
43
|
+
* 5. Log session.node.created
|
|
44
|
+
* 6. Add to activeNodes map
|
|
45
|
+
* 7. Trigger async replacement of standing receiver (do NOT await)
|
|
46
|
+
* 8. Return {ok:true, peerId, addrs}
|
|
47
|
+
*
|
|
48
|
+
* destroySessionNode(sessionId, reason):
|
|
49
|
+
* Pseudocode:
|
|
50
|
+
* 1. Find node in activeNodes
|
|
51
|
+
* 2. stop node
|
|
52
|
+
* 3. Update SQLite status to sealed/interrupted/error
|
|
53
|
+
* 4. Remove from activeNodes
|
|
54
|
+
* 5. Log session.node.destroyed
|
|
55
|
+
*
|
|
56
|
+
* gracefulShutdown():
|
|
57
|
+
* Pseudocode:
|
|
58
|
+
* 1. Get all activeNodes
|
|
59
|
+
* 2. For each: update SQLite 'interrupted', log destroyed(reason:'interrupted')
|
|
60
|
+
* 3. Stop all nodes
|
|
61
|
+
* 4. Stop standing receiver
|
|
62
|
+
*
|
|
63
|
+
* getStatus(): { standingReceiverReady: boolean }
|
|
64
|
+
*/
|
|
65
|
+
// node:sqlite (DatabaseSync) requires Node.js >= 24 (stable in 24 LTS).
|
|
66
|
+
// The engines field in package.json is set to ">=24" specifically because of this
|
|
67
|
+
// dependency — do not lower the engine floor without replacing this import.
|
|
68
|
+
import { DatabaseSync } from "node:sqlite";
|
|
69
|
+
import { TranscriptCipher } from "./transcript-cipher.js";
|
|
70
|
+
import { randomUUID, createHash } from "node:crypto";
|
|
71
|
+
import * as lp from "it-length-prefixed";
|
|
72
|
+
import { decode, Encoder } from "cbor-x";
|
|
73
|
+
import { MAX_SESSION_NODES, STANDING_RECEIVER_AGENT_NAME } from "./types.js";
|
|
74
|
+
import { SessionConnectionGater } from "./session-connection-gater.js";
|
|
75
|
+
import { SessionTree } from "./session-tree.js";
|
|
76
|
+
import { CELLO_CONTENT_PROTOCOL_ID, NodeAutoNatService } from "@cello-protocol/transport";
|
|
77
|
+
import { verify } from "@cello-protocol/crypto";
|
|
78
|
+
import { encodeSealPayload } from "@cello-protocol/protocol-types";
|
|
79
|
+
import { AgentRelayClient, LEAF_KIND_CTRL } from "./session-relay-client.js";
|
|
80
|
+
const CBOR_ENC = new Encoder({ tagUint8Array: false });
|
|
81
|
+
// ─── SessionNodeManager ───────────────────────────────────────────────────────
|
|
82
|
+
export class SessionNodeManager {
|
|
83
|
+
#factory;
|
|
84
|
+
#logger;
|
|
85
|
+
#dbPath;
|
|
86
|
+
#db = null;
|
|
87
|
+
// DOD-LOG-1: at-rest cipher for the durable transcript blobs (loaded in init()).
|
|
88
|
+
#transcriptCipher = null;
|
|
89
|
+
#activeNodes = new Map();
|
|
90
|
+
// M7 DOD-SPINE-6 / MSG-001-3b: ONE relay witness client per AGENT (keyed by agent name).
|
|
91
|
+
// The relay authenticates and keys delivery by the agent's K_local pubkey, so all of an
|
|
92
|
+
// agent's sessions share one authenticated relay stream (each frame carries session_id).
|
|
93
|
+
#relayClients = new Map();
|
|
94
|
+
// DOD-LOOP-1: the standing receiver is PER-AGENT, not per-daemon. A daemon hosting two agents
|
|
95
|
+
// (the loopback case) needs each agent to have its OWN inbound receiver node — otherwise the
|
|
96
|
+
// initiator (consuming its agent's standing receiver) and the responder (consuming its agent's)
|
|
97
|
+
// would contend for a single node and thrash. Keyed by agentName. A creation-in-flight guard set
|
|
98
|
+
// prevents two concurrent ensure() calls from building two nodes for the same agent.
|
|
99
|
+
#standingReceivers = new Map();
|
|
100
|
+
#standingReceiverCreating = new Set();
|
|
101
|
+
// Agents whose removeStandingReceiverForAgent ran while an #ensureStandingReceiver for them was
|
|
102
|
+
// in flight (parked on createNode/start, so the map had no entry to delete yet). The in-flight
|
|
103
|
+
// ensure checks this after start() and tears the fresh node down instead of installing an SR for
|
|
104
|
+
// an agent that has since gone offline (cello_stop_agent race). A fresh ensure clears it.
|
|
105
|
+
#standingReceiverRemoving = new Set();
|
|
106
|
+
// Set once gracefulShutdown begins. The standing-receiver replacement that
|
|
107
|
+
// acceptSession kicks off runs un-awaited (AC-003), so it can be in flight when
|
|
108
|
+
// shutdown starts; #createStandingReceiver checks this flag and stops a freshly
|
|
109
|
+
// built node instead of leaving an orphan bound to a TCP port (review M2).
|
|
110
|
+
#shuttingDown = false;
|
|
111
|
+
// DAEMON-004: lazily-loaded in-memory cache of each session's daemon-owned
|
|
112
|
+
// Merkle tree. The authoritative store is the session_tree_leaves table —
|
|
113
|
+
// the cache is rebuilt from it on first access (so it survives a restart).
|
|
114
|
+
#trees = new Map();
|
|
115
|
+
// DAEMON-004: per-session FIFO buffer of verified received content awaiting
|
|
116
|
+
// cello_receive. Populated by ingestReceivedContent / the content stream handler.
|
|
117
|
+
#receivedContent = new Map();
|
|
118
|
+
// CELLO-M7-TRANSPORT-001: the directory-node multiaddrs serving as AutoNAT
|
|
119
|
+
// probers (SI-002). Empty () => [] when the directory is in 'reconnecting'
|
|
120
|
+
// state — AutoNAT cannot run and dialability stays the conservative default.
|
|
121
|
+
#autoNatProbers;
|
|
122
|
+
// M7-SESSION-003: per-session direct-path counterparty liveness, observed on the
|
|
123
|
+
// session node's onPeerConnect ('alive') / onPeerDisconnect ('gone'). This is
|
|
124
|
+
// the liveness authority for direct sessions — the unilateral-seal gate reads
|
|
125
|
+
// it (relay sessions query the relay instead). NEVER the directory (SI-002).
|
|
126
|
+
#sessionLiveness = new Map();
|
|
127
|
+
// M7-UPGRADE-002: sessions whose content integrity could NOT be verified (a content_hash
|
|
128
|
+
// mismatch = tamper was observed). The auto-acknowledge gate (SI-002) refuses to auto-co-sign
|
|
129
|
+
// for a desynced session — B must never blind-sign a tail it cannot verify. Keyed by sessionId hex.
|
|
130
|
+
#contentDesynced = new Set();
|
|
131
|
+
// DOD-MSG-4 (strict in-order): the RELAY is the ordering authority (Structure 2). For each
|
|
132
|
+
// message the relay witnesses, it delivers B a (content_hash -> canonical sequence) binding via
|
|
133
|
+
// the leaf_deliver stream. B records it here — keyed #k(agent,session) -> (contentHashHex -> seq)
|
|
134
|
+
// — and orders its transcript by THIS, never by a sender-stamped field (sovereign-node: B does
|
|
135
|
+
// not trust the counterparty for ordering). When B has no witness for an arriving hash
|
|
136
|
+
// (relay-degraded), it falls back to arrival-order append.
|
|
137
|
+
#witnessedSeq = new Map();
|
|
138
|
+
// DOD-MSG-4: out-of-order direct arrivals. A content frame whose canonical sequence is AHEAD of
|
|
139
|
+
// the next expected leaf is HELD here (keyed #k(agent,session) -> (canonicalSeq -> entry)) instead
|
|
140
|
+
// of being appended out of order. Once the missing in-between sequence(s) land (recovered from the
|
|
141
|
+
// relay mailbox), #releaseHeld drains the held entries in canonical order. content is plaintext in
|
|
142
|
+
// memory only — evicted on teardown, same as #receivedContent.
|
|
143
|
+
#heldContent = new Map();
|
|
144
|
+
// DOD-MSG-4: the relay's high-water canonical sequence for this session — the largest sequence the
|
|
145
|
+
// relay has witnessed (max over leaf_deliver). Keyed #k(agent,session). EXPOSED for the next
|
|
146
|
+
// sub-increment (catch-up-before-live: on reconnect, hold live arrivals until the tree reaches this
|
|
147
|
+
// so a fresh message can't append ahead of earlier ones still parked) — it is NOT yet consumed by
|
|
148
|
+
// the gate, which today holds purely on the per-message `canonicalSeq > nextExpected` test.
|
|
149
|
+
#highWaterSeq = new Map();
|
|
150
|
+
// M7-UPGRADE-002: sessions for which B has already submitted its responder SEAL leaf (via
|
|
151
|
+
// auto-ack OR cello_close_session). Idempotency guard — A's SEAL ctrl leaf may be delivered
|
|
152
|
+
// more than once (and the relay echoes leaves), so auto-ack fires AT MOST ONCE per session.
|
|
153
|
+
#responderSealSubmitted = new Set();
|
|
154
|
+
// M7-SESSION-001 (M-1 PUSH): optional callback fired when a session changes
|
|
155
|
+
// state, so the composition root can dispatch a session_state_changed
|
|
156
|
+
// notification to live MCP clients. Injected via a setter AFTER construction
|
|
157
|
+
// because the NotificationDispatcher is built later than this manager in
|
|
158
|
+
// daemon.ts (it depends on the IPC server). Never required — when unset,
|
|
159
|
+
// state changes are persisted and logged but no push notification is emitted.
|
|
160
|
+
#onSessionStateChanged = null;
|
|
161
|
+
// CELLO-M7-MSG-001 (AC-001/AC-002/AC-003): the send is no longer fire-and-forget.
|
|
162
|
+
// After a content_frame is delivered over the direct session channel, the sender
|
|
163
|
+
// arms a TTF timer and waits for an unsigned, transport-authenticated `persisted`
|
|
164
|
+
// delivery ACK on the same /cello/content/1.0.0 protocol. A persisted ACK cancels
|
|
165
|
+
// the timer (content.delivery.acked); TTF expiry hands the content to the park
|
|
166
|
+
// backstop. Keyed sessionId → contentHashHex → entry.
|
|
167
|
+
#awaitingAck = new Map();
|
|
168
|
+
// TTF (time-to-flush) for an un-acked content entry. Injectable so tests can drive
|
|
169
|
+
// expiry deterministically; production default sits in the Part-4 proposed 10–30s band.
|
|
170
|
+
#contentTtfMs = 20_000;
|
|
171
|
+
// CELLO-M7-MSG-001: side-effect hooks the composition root wires to the durable
|
|
172
|
+
// retry_queue (and, in 3b, the relay park deposit). Injected after construction
|
|
173
|
+
// because RetryQueue is built later in daemon.ts. When unset, the awaiting-ACK timer
|
|
174
|
+
// still fires and the ACK still resolves — only the durable crash-backstop is skipped.
|
|
175
|
+
#onAwaitingPersisted = null;
|
|
176
|
+
#onAwaitingTtf = null;
|
|
177
|
+
/**
|
|
178
|
+
* MSG-001-3b (2b): the live content-park deposit. The manager resolves the recipient + relay
|
|
179
|
+
* endpoint from the session entry and calls this when a send is NOT confirmed delivered
|
|
180
|
+
* (direct-fail or TTF expiry). The daemon's hook seals (sealToRecipient) + deposits via
|
|
181
|
+
* ContentParkClient. Best-effort.
|
|
182
|
+
*/
|
|
183
|
+
#contentParkHook = null;
|
|
184
|
+
constructor(opts) {
|
|
185
|
+
this.#factory = opts.factory;
|
|
186
|
+
this.#logger = opts.logger;
|
|
187
|
+
this.#dbPath = opts.dbPath;
|
|
188
|
+
if (typeof opts.contentTtfMs === "number" && opts.contentTtfMs > 0) {
|
|
189
|
+
this.#contentTtfMs = opts.contentTtfMs;
|
|
190
|
+
}
|
|
191
|
+
this.#autoNatProbers = opts.autoNatProbers ?? (() => []);
|
|
192
|
+
}
|
|
193
|
+
/**
|
|
194
|
+
* CELLO-M7-MSG-001: wire the durable-backstop side effects of the awaiting-ACK
|
|
195
|
+
* lifecycle. `onPersisted` clears the durable retry_queue entry when a persisted ACK
|
|
196
|
+
* arrives; `onTtf` records/parks the un-acked content when the TTF timer fires.
|
|
197
|
+
* Injected by the composition root (daemon.ts) after the RetryQueue exists.
|
|
198
|
+
*/
|
|
199
|
+
setAwaitingAckHooks(hooks) {
|
|
200
|
+
this.#onAwaitingPersisted = hooks.onPersisted ?? null;
|
|
201
|
+
this.#onAwaitingTtf = hooks.onTtf ?? null;
|
|
202
|
+
}
|
|
203
|
+
/**
|
|
204
|
+
* MSG-001-3b (2b): inject the live content-park deposit (seal + ContentParkClient.deposit).
|
|
205
|
+
* Injected by the composition root (daemon.ts). When absent, a not-confirmed send still records
|
|
206
|
+
* the durable awaiting entry (crash backstop) but does not deposit live.
|
|
207
|
+
*/
|
|
208
|
+
setContentParkHook(fn) {
|
|
209
|
+
this.#contentParkHook = fn;
|
|
210
|
+
}
|
|
211
|
+
/**
|
|
212
|
+
* MSG-001-3b (2b): deposit un-confirmed content to the relay store-and-forward backstop — keyed
|
|
213
|
+
* to the recipient, on the SAME relay this session is witnessed by — so an offline recipient
|
|
214
|
+
* recovers it (at the sequence the witness already assigned, R1). Best-effort, never throws.
|
|
215
|
+
*/
|
|
216
|
+
#parkContent(agentName, sessionId, contentHashHex, content, structure1Cbor, structure2Cbor) {
|
|
217
|
+
const hook = this.#contentParkHook;
|
|
218
|
+
const entry = this.#activeNodes.get(this.#k(agentName, sessionId));
|
|
219
|
+
if (!hook || !entry || !entry.relayPeerId || !entry.relayAddrs)
|
|
220
|
+
return;
|
|
221
|
+
void hook({
|
|
222
|
+
sessionId,
|
|
223
|
+
recipientPubkeyHex: entry.counterpartyPubkey,
|
|
224
|
+
relayPeerId: entry.relayPeerId,
|
|
225
|
+
relayAddrs: entry.relayAddrs,
|
|
226
|
+
contentHashHex,
|
|
227
|
+
content,
|
|
228
|
+
// DOD-MSG-4 (2b): carry the relay's signed ordering record so the parked entry is self-ordering
|
|
229
|
+
// on recover too (sealed INTO the ciphertext envelope — INV-3: the relay still sees only ciphertext).
|
|
230
|
+
structure1Cbor,
|
|
231
|
+
structure2Cbor,
|
|
232
|
+
}).catch((err) => {
|
|
233
|
+
this.#logger.warn("content.park.deposit.failed", {
|
|
234
|
+
sessionId,
|
|
235
|
+
contentHash: contentHashHex,
|
|
236
|
+
error: err instanceof Error ? err.message : String(err),
|
|
237
|
+
});
|
|
238
|
+
});
|
|
239
|
+
}
|
|
240
|
+
// ─── Initialization ──────────────────────────────────────────────────────
|
|
241
|
+
async initialize() {
|
|
242
|
+
// Step 1: Open SQLite and create sessions table
|
|
243
|
+
this.#db = new DatabaseSync(this.#dbPath);
|
|
244
|
+
// DOD-LOG-1: at-rest cipher for the durable transcript. Dedicated 0600 key beside the DB.
|
|
245
|
+
this.#transcriptCipher = TranscriptCipher.loadOrCreate(`${this.#dbPath}.transcript-key`);
|
|
246
|
+
this.#db.exec(`
|
|
247
|
+
CREATE TABLE IF NOT EXISTS sessions (
|
|
248
|
+
session_id TEXT NOT NULL,
|
|
249
|
+
agent_name TEXT NOT NULL,
|
|
250
|
+
counterparty_pubkey TEXT NOT NULL,
|
|
251
|
+
status TEXT NOT NULL,
|
|
252
|
+
created_at INTEGER NOT NULL,
|
|
253
|
+
updated_at INTEGER NOT NULL,
|
|
254
|
+
-- DOD-LOOP-1: composite key so two of the operator's agents can hold both ends of the
|
|
255
|
+
-- SAME session_id on ONE daemon (the loopback case). A bare session_id PK would reject
|
|
256
|
+
-- the second end's row.
|
|
257
|
+
PRIMARY KEY (agent_name, session_id)
|
|
258
|
+
)
|
|
259
|
+
`);
|
|
260
|
+
// M7-SESSION-001: idempotent schema extension — add message_count and interrupted_at
|
|
261
|
+
// columns if they do not exist. ALTER TABLE IF NOT EXISTS COLUMN is not supported by
|
|
262
|
+
// older SQLite; we use a try/catch per column as the idempotent approach.
|
|
263
|
+
for (const ddl of [
|
|
264
|
+
"ALTER TABLE sessions ADD COLUMN message_count INTEGER NOT NULL DEFAULT 0",
|
|
265
|
+
"ALTER TABLE sessions ADD COLUMN interrupted_at TEXT",
|
|
266
|
+
// MSG-001-3b (MSG-2 startup-flush): persist the session's relay endpoint so the
|
|
267
|
+
// crash-backstop flush can deposit un-acked content after a restart, when the
|
|
268
|
+
// in-memory entry is gone. relay_addrs is a JSON array of multiaddr strings.
|
|
269
|
+
"ALTER TABLE sessions ADD COLUMN relay_peer_id TEXT",
|
|
270
|
+
"ALTER TABLE sessions ADD COLUMN relay_addrs TEXT",
|
|
271
|
+
// M7-SESSION-004 (AC-005): persist the seal certificate's legibility object with the
|
|
272
|
+
// sealed record so it survives a daemon restart and is readable on the cert-read surface
|
|
273
|
+
// (cello_get_sealed_receipt). JSON string with hex-encoded pubkeys; NULL until sealed.
|
|
274
|
+
// Inline idempotent migration (NOT Flyway — this is the client-side SQLite, AC-011).
|
|
275
|
+
"ALTER TABLE sessions ADD COLUMN seal_legibility TEXT",
|
|
276
|
+
"ALTER TABLE sessions ADD COLUMN sealed_root_hex TEXT",
|
|
277
|
+
// M7 legibility-TBS-binding (responder verify): the counterparty's FROST primary (group)
|
|
278
|
+
// pubkey, taken from the FROST-signed SessionAssignment's signer_pubkey. The responder uses
|
|
279
|
+
// it to VERIFY the bilateral seal signature locally (the seal is signed by the initiator's
|
|
280
|
+
// primary), not just accept it. NULL when this party initiated (it uses its own primary).
|
|
281
|
+
"ALTER TABLE sessions ADD COLUMN counterparty_primary_pubkey TEXT",
|
|
282
|
+
]) {
|
|
283
|
+
try {
|
|
284
|
+
this.#db.exec(ddl);
|
|
285
|
+
}
|
|
286
|
+
catch (err) {
|
|
287
|
+
// Only swallow the idempotent "duplicate column name" case (the column
|
|
288
|
+
// already exists from a prior init). Any other failure — disk full,
|
|
289
|
+
// SQLITE_LOCKED, corruption — must propagate, otherwise the daemon would
|
|
290
|
+
// run without these columns and later silently read undefined.
|
|
291
|
+
const msg = err instanceof Error ? err.message : String(err);
|
|
292
|
+
if (!msg.includes("duplicate column name"))
|
|
293
|
+
throw err;
|
|
294
|
+
}
|
|
295
|
+
}
|
|
296
|
+
// M7-SESSION-001 (H-1): side table holding the verified bilateral
|
|
297
|
+
// SEAL-INTERRUPTED commitment artifacts. A side table (CREATE TABLE IF NOT
|
|
298
|
+
// EXISTS) is inherently idempotent — no ALTER TABLE / duplicate-column
|
|
299
|
+
// handling required. We keep BOTH parties' signed leaves and the agreed
|
|
300
|
+
// Merkle root so the achieved commitment is never discarded.
|
|
301
|
+
this.#db.exec(`
|
|
302
|
+
CREATE TABLE IF NOT EXISTS seal_interrupted_artifacts (
|
|
303
|
+
agent_name TEXT NOT NULL,
|
|
304
|
+
session_id TEXT NOT NULL,
|
|
305
|
+
role TEXT NOT NULL,
|
|
306
|
+
own_leaf TEXT NOT NULL,
|
|
307
|
+
counterparty_leaf TEXT NOT NULL,
|
|
308
|
+
merkle_root TEXT NOT NULL,
|
|
309
|
+
nonce TEXT NOT NULL,
|
|
310
|
+
created_at INTEGER NOT NULL,
|
|
311
|
+
-- DOD-LOOP-1: composite key (per-agent end of a loopback session).
|
|
312
|
+
PRIMARY KEY (agent_name, session_id)
|
|
313
|
+
)
|
|
314
|
+
`);
|
|
315
|
+
// DAEMON-004 (AC-007 / SI-001): the daemon-owned per-session Merkle tree,
|
|
316
|
+
// persisted as an ordered list of leaf hashes. The (session_id, leaf_index)
|
|
317
|
+
// primary key enforces append-order uniqueness; a fresh daemon reconstructs
|
|
318
|
+
// each tree from these rows so the transcript survives a restart. Querying
|
|
319
|
+
// by session_id ORDER BY leaf_index is the only read pattern.
|
|
320
|
+
this.#db.exec(`
|
|
321
|
+
CREATE TABLE IF NOT EXISTS session_tree_leaves (
|
|
322
|
+
agent_name TEXT NOT NULL,
|
|
323
|
+
session_id TEXT NOT NULL,
|
|
324
|
+
leaf_index INTEGER NOT NULL,
|
|
325
|
+
leaf_kind TEXT NOT NULL,
|
|
326
|
+
leaf_hash_hex TEXT NOT NULL,
|
|
327
|
+
created_at INTEGER NOT NULL,
|
|
328
|
+
-- DOD-LOOP-1: composite key so each agent's end has its own append-ordered tree.
|
|
329
|
+
PRIMARY KEY (agent_name, session_id, leaf_index)
|
|
330
|
+
)
|
|
331
|
+
`);
|
|
332
|
+
// DOD-LOG-1 (PERSIST-LOG-001): the durable, ENCRYPTED-at-rest readable transcript. Each row
|
|
333
|
+
// is keyed by the canonical leaf `sequence`, so it JOINS to session_tree_leaves(leaf_index) —
|
|
334
|
+
// a stored message is provably behind a committed hash-chain leaf, not a loose dump. `blob` is
|
|
335
|
+
// the AES-256-GCM envelope of the readable plaintext (relay/directory never see this — INV-3).
|
|
336
|
+
this.#db.exec(`
|
|
337
|
+
CREATE TABLE IF NOT EXISTS transcript (
|
|
338
|
+
agent_name TEXT NOT NULL,
|
|
339
|
+
session_id TEXT NOT NULL,
|
|
340
|
+
sequence INTEGER NOT NULL,
|
|
341
|
+
direction TEXT NOT NULL, -- 'sent' | 'received'
|
|
342
|
+
blob BLOB NOT NULL, -- AES-256-GCM(iv||ct||tag) of the plaintext
|
|
343
|
+
created_at INTEGER NOT NULL,
|
|
344
|
+
PRIMARY KEY (agent_name, session_id, sequence, direction)
|
|
345
|
+
)
|
|
346
|
+
`);
|
|
347
|
+
// Step 2: Detect interrupted sessions (SIGKILL detection — AC-010).
|
|
348
|
+
// Any 'active' row in a freshly-started daemon is a remnant of a prior
|
|
349
|
+
// killed process. Batch-update to 'interrupted' before IPC opens.
|
|
350
|
+
const activeRows = this.#db
|
|
351
|
+
.prepare("SELECT * FROM sessions WHERE status = 'active'")
|
|
352
|
+
.all();
|
|
353
|
+
if (activeRows.length > 0) {
|
|
354
|
+
const now = Date.now();
|
|
355
|
+
const interruptedAt = new Date(now).toISOString();
|
|
356
|
+
for (const row of activeRows) {
|
|
357
|
+
try {
|
|
358
|
+
this.#db
|
|
359
|
+
.prepare("UPDATE sessions SET status = 'interrupted', updated_at = ?, interrupted_at = COALESCE(interrupted_at, ?) WHERE agent_name = ? AND session_id = ?")
|
|
360
|
+
.run(now, interruptedAt, row.agent_name, row.session_id);
|
|
361
|
+
this.#logger.warn("session.interrupted.detected", {
|
|
362
|
+
sessionId: row.session_id,
|
|
363
|
+
agentName: row.agent_name,
|
|
364
|
+
source: "daemon_restart",
|
|
365
|
+
});
|
|
366
|
+
}
|
|
367
|
+
catch (err) {
|
|
368
|
+
this.#logger.error("session.interrupt.db.write.failed", {
|
|
369
|
+
sessionId: row.session_id,
|
|
370
|
+
error: err instanceof Error ? err.message : String(err),
|
|
371
|
+
});
|
|
372
|
+
}
|
|
373
|
+
}
|
|
374
|
+
}
|
|
375
|
+
// DOD-LOOP-1: standing receivers are now PER-AGENT, created when each agent comes online
|
|
376
|
+
// (cello_start_agent → ensureStandingReceiverForAgent). No daemon-global receiver is created at
|
|
377
|
+
// init (no agent is online yet). The initiate/accept paths kick off creation on demand if missing.
|
|
378
|
+
}
|
|
379
|
+
// ─── Public API ──────────────────────────────────────────────────────────
|
|
380
|
+
/**
|
|
381
|
+
* Get the underlying DatabaseSync handle.
|
|
382
|
+
* Used by the composition root (daemon.ts) to pass to RetryQueue and
|
|
383
|
+
* NonceDedupStore — they share the same SQLite (node:sqlite DatabaseSync) DB file (DAEMON-003 AC-008).
|
|
384
|
+
*/
|
|
385
|
+
getDb() {
|
|
386
|
+
if (!this.#db) {
|
|
387
|
+
throw new Error("SessionNodeManager not initialized — call initialize() first");
|
|
388
|
+
}
|
|
389
|
+
return this.#db;
|
|
390
|
+
}
|
|
391
|
+
/**
|
|
392
|
+
* DOD-LOG-1: the at-rest cipher, shared with the RetryQueue so its content_blob is encrypted with
|
|
393
|
+
* the SAME key as the transcript. Available after initialize().
|
|
394
|
+
*/
|
|
395
|
+
getTranscriptCipher() {
|
|
396
|
+
if (!this.#transcriptCipher) {
|
|
397
|
+
throw new Error("SessionNodeManager not initialized — call initialize() first");
|
|
398
|
+
}
|
|
399
|
+
return this.#transcriptCipher;
|
|
400
|
+
}
|
|
401
|
+
/**
|
|
402
|
+
* DOD-LOG-1: append one readable message to the durable, encrypted-at-rest transcript, keyed by
|
|
403
|
+
* the canonical leaf `sequence` so it joins to the committed hash chain. Idempotent on replay
|
|
404
|
+
* (INSERT OR IGNORE — the same (session, sequence, direction) is written at most once). Never
|
|
405
|
+
* throws into the caller's content path: a transcript-write failure is logged, not fatal.
|
|
406
|
+
*/
|
|
407
|
+
recordTranscriptMessage(agentName, sessionId, sequence, direction, plaintext, correlationId) {
|
|
408
|
+
if (!this.#db || !this.#transcriptCipher)
|
|
409
|
+
return;
|
|
410
|
+
try {
|
|
411
|
+
const blob = this.#transcriptCipher.encrypt(plaintext);
|
|
412
|
+
this.#db
|
|
413
|
+
.prepare(`INSERT OR IGNORE INTO transcript (agent_name, session_id, sequence, direction, blob, created_at)
|
|
414
|
+
VALUES (?, ?, ?, ?, ?, ?)`)
|
|
415
|
+
.run(agentName, sessionId, sequence, direction, blob, Date.now());
|
|
416
|
+
this.#logger.info("transcript.message.recorded", { sessionId, agentName, sequence, direction, correlationId });
|
|
417
|
+
}
|
|
418
|
+
catch (err) {
|
|
419
|
+
this.#logger.warn("transcript.message.record.failed", {
|
|
420
|
+
sessionId, agentName, sequence, direction,
|
|
421
|
+
reason: err instanceof Error ? err.message : String(err),
|
|
422
|
+
correlationId,
|
|
423
|
+
});
|
|
424
|
+
}
|
|
425
|
+
}
|
|
426
|
+
/**
|
|
427
|
+
* DOD-LOG-1: read a session's durable transcript back (after a restart), decrypted and ordered by
|
|
428
|
+
* canonical sequence then direction. A blob that fails to decrypt (tamper/wrong key) is skipped
|
|
429
|
+
* with a loud log rather than crashing the read.
|
|
430
|
+
*/
|
|
431
|
+
readTranscript(agentName, sessionId) {
|
|
432
|
+
if (!this.#db || !this.#transcriptCipher)
|
|
433
|
+
return { messages: [], undecryptable: 0 };
|
|
434
|
+
const rows = this.#db
|
|
435
|
+
.prepare(`SELECT sequence, direction, blob, created_at FROM transcript
|
|
436
|
+
WHERE agent_name = ? AND session_id = ? ORDER BY sequence ASC, direction ASC`)
|
|
437
|
+
.all(agentName, sessionId);
|
|
438
|
+
const messages = [];
|
|
439
|
+
let undecryptable = 0;
|
|
440
|
+
for (const r of rows) {
|
|
441
|
+
const pt = this.#transcriptCipher.decrypt(r.blob instanceof Uint8Array ? r.blob : new Uint8Array(r.blob));
|
|
442
|
+
if (pt === null) {
|
|
443
|
+
// A row that fails GCM auth (tamper / wrong key) is REPORTED to the reader, not silently
|
|
444
|
+
// dropped — a gap in the transcript must be visible, not invisible (the reader needs to
|
|
445
|
+
// distinguish "never existed" from "tampered/unreadable").
|
|
446
|
+
undecryptable += 1;
|
|
447
|
+
this.#logger.warn("transcript.message.decrypt.failed", { sessionId, agentName, sequence: r.sequence, direction: r.direction });
|
|
448
|
+
continue;
|
|
449
|
+
}
|
|
450
|
+
messages.push({
|
|
451
|
+
sequence: r.sequence,
|
|
452
|
+
direction: r.direction === "sent" ? "sent" : "received",
|
|
453
|
+
text: new TextDecoder().decode(pt),
|
|
454
|
+
createdAt: r.created_at,
|
|
455
|
+
});
|
|
456
|
+
}
|
|
457
|
+
return { messages, undecryptable };
|
|
458
|
+
}
|
|
459
|
+
/** DOD-LOOP-1: whether the given agent has a standing receiver ready (any agent if omitted). */
|
|
460
|
+
getStandingReceiverReady(agentName) {
|
|
461
|
+
if (agentName !== undefined)
|
|
462
|
+
return this.#standingReceivers.has(agentName);
|
|
463
|
+
return this.#standingReceivers.size > 0;
|
|
464
|
+
}
|
|
465
|
+
/** First ready standing receiver (any agent) — for agent-agnostic OUTBOUND use (gater-open). */
|
|
466
|
+
#anyStandingReceiver() {
|
|
467
|
+
for (const sr of this.#standingReceivers.values())
|
|
468
|
+
return sr;
|
|
469
|
+
return null;
|
|
470
|
+
}
|
|
471
|
+
/**
|
|
472
|
+
* The current standing receiver node's session-transport coordinates (peer id +
|
|
473
|
+
* listen multiaddrs), or null if it is not ready. These are the addresses a local
|
|
474
|
+
* SessionNegotiator advertises as this node's counterparty endpoint so the initiator
|
|
475
|
+
* can dial it, and the value an inbound session_assignment carries in its
|
|
476
|
+
* counterparty_session_* fields. Read-only — does NOT consume the standing receiver
|
|
477
|
+
* (unlike acceptSession, which hands it off).
|
|
478
|
+
*/
|
|
479
|
+
getStandingReceiverInfo(agentName) {
|
|
480
|
+
// DOD-LOOP-1: the initiator advertises ITS OWN agent's standing receiver, which it then reuses
|
|
481
|
+
// as the session node — so the advertised endpoint matches the node the counterparty dials.
|
|
482
|
+
const sr = this.#standingReceivers.get(agentName);
|
|
483
|
+
if (!sr)
|
|
484
|
+
return null;
|
|
485
|
+
return { peerId: sr.node.getPeerId(), addrs: sr.node.listenAddresses() };
|
|
486
|
+
}
|
|
487
|
+
/**
|
|
488
|
+
* The standing receiver's libp2p node — a general-purpose, OPEN-gater node usable for
|
|
489
|
+
* OUTBOUND dials that are not session-scoped (e.g. the content-park deposit/pull to the
|
|
490
|
+
* relay, MSG-001-3b). Session nodes have restrictive gaters; the standing receiver does not.
|
|
491
|
+
* Returns null until the receiver is ready.
|
|
492
|
+
*/
|
|
493
|
+
getStandingReceiverNode(agentName) {
|
|
494
|
+
// With an agentName: that agent's own standing-receiver node (needed when the dial must
|
|
495
|
+
// originate from a SPECIFIC agent — e.g. the startup content-park re-park, where the
|
|
496
|
+
// depositor is the original sender). Without one: any ready standing receiver (outbound
|
|
497
|
+
// content-park deposit/pull to the relay — open gater, not session-scoped).
|
|
498
|
+
if (agentName !== undefined)
|
|
499
|
+
return this.#standingReceivers.get(agentName)?.node ?? null;
|
|
500
|
+
return this.#anyStandingReceiver()?.node ?? null;
|
|
501
|
+
}
|
|
502
|
+
/**
|
|
503
|
+
* The libp2p Peer ID of an active session's node (N_A for an initiated session), or
|
|
504
|
+
* null if no active node exists for it. This is the initiator's session peer id that an
|
|
505
|
+
* inbound session_assignment must carry to the counterparty (so the counterparty gates
|
|
506
|
+
* its handed-off receiver to it). Read-only.
|
|
507
|
+
*/
|
|
508
|
+
getSessionNodePeerId(agentName, sessionId) {
|
|
509
|
+
return this.#activeNodes.get(this.#k(agentName, sessionId))?.node.getPeerId() ?? null;
|
|
510
|
+
}
|
|
511
|
+
/**
|
|
512
|
+
* CELLO-M7-TRANSPORT-001: the AutoNAT service wrapping the current standing
|
|
513
|
+
* receiver node, or null if the standing receiver is not ready. The composition
|
|
514
|
+
* root uses this as the daemon's runtime IAutoNatService — its getDialability()
|
|
515
|
+
* drives the SessionAssignment advertised address (AC-004/AC-019), and it is the
|
|
516
|
+
* source of the transport.autonat.result / transport.autonat.unavailable events.
|
|
517
|
+
*/
|
|
518
|
+
getStandingReceiverAutoNat() {
|
|
519
|
+
// DOD-LOOP-1: the daemon-level autonat source is any ready standing receiver; null until one
|
|
520
|
+
// exists (the composition root falls back to LocalAutoNatStub). Per-session advertised dialability
|
|
521
|
+
// comes from the initiating agent's own SR via getStandingReceiverInfo, not this daemon-level value.
|
|
522
|
+
return this.#anyStandingReceiver()?.autoNat ?? null;
|
|
523
|
+
}
|
|
524
|
+
/**
|
|
525
|
+
* M7-SESSION-001 (M-1 PUSH): register the session-state-change callback.
|
|
526
|
+
* Called by the composition root (daemon.ts) after the NotificationDispatcher
|
|
527
|
+
* exists. Setter injection avoids a construction-order/circular dependency.
|
|
528
|
+
*/
|
|
529
|
+
setOnSessionStateChanged(cb) {
|
|
530
|
+
this.#onSessionStateChanged = cb;
|
|
531
|
+
}
|
|
532
|
+
/**
|
|
533
|
+
* DOD-LOOP-1: the session core is keyed by (agentName, sessionId), NOT sessionId alone. Two of
|
|
534
|
+
* the operator's own agents (the loopback case) can hold the two ends of the SAME session_id on
|
|
535
|
+
* ONE daemon, so a bare session_id is ambiguous between them. This composite string key — the
|
|
536
|
+
* agent name and the hex session id joined by a 0x1f unit separator (which appears in neither) —
|
|
537
|
+
* is the key for every in-memory session-core map (#activeNodes, #trees, #receivedContent,
|
|
538
|
+
* #sessionLiveness, #contentDesynced, #responderSealSubmitted, #awaitingAck). #relayClients is
|
|
539
|
+
* already per-agent (its own key), and the standing receivers are keyed by agent name directly.
|
|
540
|
+
*/
|
|
541
|
+
#k(agentName, sessionId) {
|
|
542
|
+
return `${agentName}\x1f${sessionId}`;
|
|
543
|
+
}
|
|
544
|
+
/**
|
|
545
|
+
* Create a new outbound session node.
|
|
546
|
+
* Called during cello_initiate_session.
|
|
547
|
+
*
|
|
548
|
+
* @param sessionId Unique session ID (hex string)
|
|
549
|
+
* @param agentName Name of the initiating agent
|
|
550
|
+
* @param counterpartyPubkey Counterparty's K_local public key (hex)
|
|
551
|
+
* @param counterpartyPeerId Counterparty's session-layer Peer ID (for gater)
|
|
552
|
+
* @param correlationId Correlation ID minted at session initiation
|
|
553
|
+
*/
|
|
554
|
+
async createSessionNode(sessionId, agentName, counterpartyPubkey, counterpartyPeerId, correlationId, reuseStandingReceiver = false, relay) {
|
|
555
|
+
// Cap enforcement (AC-006)
|
|
556
|
+
if (this.#activeNodes.size >= MAX_SESSION_NODES) {
|
|
557
|
+
this.#logger.warn("session.node.cap.reached", {
|
|
558
|
+
agentName,
|
|
559
|
+
currentCount: this.#activeNodes.size,
|
|
560
|
+
maxCount: MAX_SESSION_NODES,
|
|
561
|
+
});
|
|
562
|
+
return {
|
|
563
|
+
ok: false,
|
|
564
|
+
reason: "max_sessions_reached",
|
|
565
|
+
guidance: "The daemon has reached its maximum of 32 concurrent session nodes. " +
|
|
566
|
+
"Close an existing session before starting a new one.",
|
|
567
|
+
};
|
|
568
|
+
}
|
|
569
|
+
// The session node N_A: either a FRESH ephemeral node (default), or — for the initiator
|
|
570
|
+
// path (reuseStandingReceiver) — the standing receiver handed off as the session node. The
|
|
571
|
+
// latter makes N_A's peer id equal the SESSION endpoint the initiator ADVERTISED to the
|
|
572
|
+
// directory (its standing receiver), so the counterparty's connection gater (set to that
|
|
573
|
+
// advertised peer id) admits N_A's dial. Mirrors acceptSession, which already hands off the
|
|
574
|
+
// standing receiver on the receiver side. WIRE-001/INV-5: a fully-fresh ephemeral initiator
|
|
575
|
+
// node would require advertising N_A's peer id pre-negotiation (a session-node lifecycle
|
|
576
|
+
// split); the symmetric standing-receiver handoff is the consistent interim model.
|
|
577
|
+
let node;
|
|
578
|
+
let gater;
|
|
579
|
+
let autoNat;
|
|
580
|
+
if (reuseStandingReceiver) {
|
|
581
|
+
const sr = this.#standingReceivers.get(agentName);
|
|
582
|
+
if (!sr) {
|
|
583
|
+
// DOD-LOOP-1: this agent has no standing receiver ready — kick off (idempotent) creation
|
|
584
|
+
// so a retry finds it, and report unavailable. Per-agent, so the initiator consuming its
|
|
585
|
+
// OWN agent's receiver never contends with a co-resident responder agent (the loopback case).
|
|
586
|
+
void this.#ensureStandingReceiver(agentName, correlationId);
|
|
587
|
+
return {
|
|
588
|
+
ok: false,
|
|
589
|
+
reason: "standing_receiver_unavailable",
|
|
590
|
+
guidance: "The standing receiver node is initializing (completes within 200ms). Retry the session in a moment.",
|
|
591
|
+
};
|
|
592
|
+
}
|
|
593
|
+
({ node, gater, autoNat } = sr);
|
|
594
|
+
gater.setAllowedPeer(counterpartyPeerId);
|
|
595
|
+
// Hand this agent's standing receiver off to this session; a replacement is spun up below.
|
|
596
|
+
this.#standingReceivers.delete(agentName);
|
|
597
|
+
}
|
|
598
|
+
else {
|
|
599
|
+
gater = new SessionConnectionGater({
|
|
600
|
+
sessionId,
|
|
601
|
+
allowedPeerId: counterpartyPeerId,
|
|
602
|
+
logger: this.#logger,
|
|
603
|
+
});
|
|
604
|
+
try {
|
|
605
|
+
node = await this.#factory.createNode({ sessionId, connectionGater: gater, nodeType: "session" });
|
|
606
|
+
await node.start();
|
|
607
|
+
}
|
|
608
|
+
catch (err) {
|
|
609
|
+
const errorMessage = err instanceof Error ? err.message : String(err);
|
|
610
|
+
this.#logger.error("session.node.create.failed", {
|
|
611
|
+
sessionId,
|
|
612
|
+
agentName,
|
|
613
|
+
error: errorMessage,
|
|
614
|
+
correlationId,
|
|
615
|
+
});
|
|
616
|
+
return {
|
|
617
|
+
ok: false,
|
|
618
|
+
reason: "session_node_creation_failed",
|
|
619
|
+
guidance: "Failed to create session transport node. The daemon logged the cause in " +
|
|
620
|
+
"session.node.create.failed. Check that the system has available ports and sufficient memory.",
|
|
621
|
+
};
|
|
622
|
+
}
|
|
623
|
+
// CELLO-M7-TRANSPORT-001: session nodes also need dialability awareness for the
|
|
624
|
+
// dcutr decision path (AC-002). Wrap the node in a NodeAutoNatService and emit
|
|
625
|
+
// its initial result (nodeType: 'session').
|
|
626
|
+
autoNat = new NodeAutoNatService({
|
|
627
|
+
node,
|
|
628
|
+
logger: this.#logger,
|
|
629
|
+
nodeType: "session",
|
|
630
|
+
probers: this.#autoNatProbers(),
|
|
631
|
+
});
|
|
632
|
+
autoNat.emitInitialResult();
|
|
633
|
+
}
|
|
634
|
+
const peerId = node.getPeerId();
|
|
635
|
+
const addrs = node.listenAddresses();
|
|
636
|
+
// Persist to SQLite
|
|
637
|
+
this.#insertSessionRow(sessionId, agentName, counterpartyPubkey, "active");
|
|
638
|
+
// Log observability event (session.node.created)
|
|
639
|
+
this.#logger.info("session.node.created", {
|
|
640
|
+
sessionId,
|
|
641
|
+
agentName,
|
|
642
|
+
sessionPeerId: peerId,
|
|
643
|
+
correlationId,
|
|
644
|
+
});
|
|
645
|
+
// Add to active map (keyed by (agentName, sessionId) — DOD-LOOP-1)
|
|
646
|
+
this.#activeNodes.set(this.#k(agentName, sessionId), {
|
|
647
|
+
node,
|
|
648
|
+
agentName,
|
|
649
|
+
sessionId,
|
|
650
|
+
counterpartyPubkey,
|
|
651
|
+
gater,
|
|
652
|
+
correlationId,
|
|
653
|
+
counterpartySessionPeerId: counterpartyPeerId,
|
|
654
|
+
autoNat,
|
|
655
|
+
});
|
|
656
|
+
// DAEMON-004: register the content stream handler so inbound content_frames
|
|
657
|
+
// are cross-checked, appended to the daemon-owned tree, and buffered.
|
|
658
|
+
await this.#registerContentHandler(agentName, sessionId, node, counterpartyPubkey);
|
|
659
|
+
// M7-SESSION-003 AC-004: act on the session node's peer events for direct-path
|
|
660
|
+
// liveness. The session connection IS the authority for a direct session.
|
|
661
|
+
this.#wireSessionLiveness(agentName, sessionId, node, counterpartyPubkey, correlationId);
|
|
662
|
+
// M7 DOD-SPINE-6 / MSG-001-3b: connect this session node to the relay as the
|
|
663
|
+
// Structure-2 witness (non-fatal — direct content still works without it).
|
|
664
|
+
if (relay) {
|
|
665
|
+
await this.#connectSessionRelay(sessionId, node, agentName, relay, correlationId);
|
|
666
|
+
}
|
|
667
|
+
// If we consumed this agent's standing receiver, spin up a replacement (async — do NOT await).
|
|
668
|
+
if (reuseStandingReceiver) {
|
|
669
|
+
void this.#ensureStandingReceiver(agentName, correlationId);
|
|
670
|
+
}
|
|
671
|
+
return { ok: true, peerId, addrs };
|
|
672
|
+
}
|
|
673
|
+
/**
|
|
674
|
+
* M7 DOD-SPINE-6 / MSG-001-3b: connect a session node to the relay witness and
|
|
675
|
+
* store the client on the active entry. Best-effort: a connect/auth failure logs
|
|
676
|
+
* and leaves relayClient undefined — the session is NOT destroyed and the direct
|
|
677
|
+
* content path keeps working (the relay-park/recovery path is MSG-001-3b's domain).
|
|
678
|
+
*/
|
|
679
|
+
async #connectSessionRelay(sessionId, node, agentName, relay, correlationId) {
|
|
680
|
+
try {
|
|
681
|
+
// The session node's gater admits only the counterparty; the relay witness is a
|
|
682
|
+
// third peer. Permit it OUTBOUND so the dial isn't denied — inbound stays
|
|
683
|
+
// counterparty-only (INV-5). The relay peer id comes from the signed assignment.
|
|
684
|
+
this.#activeNodes.get(this.#k(agentName, sessionId))?.gater.setAllowedOutboundPeer(relay.relayPeerId);
|
|
685
|
+
// One relay client per (AGENT, RELAY NODE). The relay keys by agent pubkey, so the
|
|
686
|
+
// collision H1 addresses is per relay; CELLO is federated, so a different session for
|
|
687
|
+
// the same agent may be assigned a DIFFERENT relay — that needs its own client.
|
|
688
|
+
const clientKey = `${agentName}::${relay.relayPeerId}`;
|
|
689
|
+
let client = this.#relayClients.get(clientKey);
|
|
690
|
+
if (!client) {
|
|
691
|
+
client = new AgentRelayClient({
|
|
692
|
+
relayPeerId: relay.relayPeerId,
|
|
693
|
+
relayAddrs: relay.relayAddrs,
|
|
694
|
+
keyProvider: relay.keyProvider,
|
|
695
|
+
senderPubkey: relay.senderPubkey,
|
|
696
|
+
logger: this.#logger,
|
|
697
|
+
});
|
|
698
|
+
this.#relayClients.set(clientKey, client);
|
|
699
|
+
}
|
|
700
|
+
const sessionIdHexForRelay = Buffer.from(relay.sessionIdBytes).toString("hex");
|
|
701
|
+
client.registerSession(sessionIdHexForRelay, node, (frame) => {
|
|
702
|
+
// The counterparty's witnessed leaf arrived with its canonical sequence. The
|
|
703
|
+
// plaintext is delivered separately over the direct content stream; this is the
|
|
704
|
+
// ordering/witness signal. Full canonical-sequence reconciliation against the
|
|
705
|
+
// local tree is MSG-001-3b (J-CONTENT).
|
|
706
|
+
this.#logger.info("session.relay.leaf.delivered", {
|
|
707
|
+
sessionId,
|
|
708
|
+
sequenceNumber: frame.sequence_number,
|
|
709
|
+
leafKind: frame.leaf_kind,
|
|
710
|
+
correlationId,
|
|
711
|
+
});
|
|
712
|
+
// DOD-MSG-4 (strict in-order): record the relay-witnessed canonical sequence for the
|
|
713
|
+
// counterparty's MSG leaves. The relay is the ordering authority; structure1_cbor =
|
|
714
|
+
// [1, content_hash(32), sender_pubkey, session_id, last_seen_seq, ts]. The relay sequence
|
|
715
|
+
// is 1-based and global per session; the daemon tree is 0-based — normalize with -1. Only
|
|
716
|
+
// COUNTERPARTY leaves (the ones B will ingest); our own echoed leaf already lands via the
|
|
717
|
+
// send path. The gate (ingestReceivedContent) reads this map to hold out-of-order arrivals.
|
|
718
|
+
if (!frame.authored_by_us && frame.leaf_kind !== LEAF_KIND_CTRL) {
|
|
719
|
+
try {
|
|
720
|
+
const s1 = decode(frame.structure1_cbor);
|
|
721
|
+
const contentHash = s1?.[1];
|
|
722
|
+
if (contentHash instanceof Uint8Array && frame.sequence_number > 0) {
|
|
723
|
+
this.recordWitnessedSequence(agentName, sessionId, Buffer.from(contentHash).toString("hex"), frame.sequence_number - 1);
|
|
724
|
+
}
|
|
725
|
+
}
|
|
726
|
+
catch (err) {
|
|
727
|
+
this.#logger.warn("session.relay.leaf.witness.decode.failed", {
|
|
728
|
+
sessionId,
|
|
729
|
+
error: err instanceof Error ? err.message : String(err),
|
|
730
|
+
correlationId,
|
|
731
|
+
});
|
|
732
|
+
}
|
|
733
|
+
}
|
|
734
|
+
// M7-UPGRADE-002: auto-acknowledge close. When the COUNTERPARTY's SEAL ctrl leaf (0x02)
|
|
735
|
+
// arrives and B has verified the content, B's OWN node auto-co-signs the responder SEAL
|
|
736
|
+
// leaf — no agent prompt — so the bilateral seal completes promptly instead of degrading
|
|
737
|
+
// to unilateral on a slow/busy/crashed agent. Never auto-ack our OWN echoed ctrl leaf.
|
|
738
|
+
if (frame.leaf_kind === LEAF_KIND_CTRL && !frame.authored_by_us) {
|
|
739
|
+
this.#maybeAutoAcknowledgeSeal(agentName, sessionId, correlationId);
|
|
740
|
+
}
|
|
741
|
+
});
|
|
742
|
+
const entry = this.#activeNodes.get(this.#k(agentName, sessionId));
|
|
743
|
+
if (entry) {
|
|
744
|
+
entry.relayClient = client;
|
|
745
|
+
entry.relaySessionIdBytes = relay.sessionIdBytes;
|
|
746
|
+
entry.relayClientKey = clientKey;
|
|
747
|
+
// 2b: remember the relay endpoint so the content-park backstop deposits to the SAME relay.
|
|
748
|
+
entry.relayPeerId = relay.relayPeerId;
|
|
749
|
+
entry.relayAddrs = relay.relayAddrs;
|
|
750
|
+
// MSG-2 startup-flush: also PERSIST it, so a restart's crash-backstop flush (which runs
|
|
751
|
+
// before the in-memory entry exists) can deposit un-acked content to the same relay.
|
|
752
|
+
try {
|
|
753
|
+
this.#db
|
|
754
|
+
?.prepare("UPDATE sessions SET relay_peer_id = ?, relay_addrs = ?, updated_at = ? WHERE agent_name = ? AND session_id = ?")
|
|
755
|
+
.run(relay.relayPeerId, JSON.stringify(relay.relayAddrs), Date.now(), agentName, sessionId);
|
|
756
|
+
}
|
|
757
|
+
catch (err) {
|
|
758
|
+
this.#logger.warn("session.relay.endpoint.persist.failed", {
|
|
759
|
+
sessionId,
|
|
760
|
+
error: err instanceof Error ? err.message : String(err),
|
|
761
|
+
});
|
|
762
|
+
}
|
|
763
|
+
}
|
|
764
|
+
else {
|
|
765
|
+
// The session was torn down while we were wiring — undo the registration.
|
|
766
|
+
client.unregisterSession(sessionIdHexForRelay);
|
|
767
|
+
if (!client.hasSessions() && this.#relayClients.get(clientKey) === client) {
|
|
768
|
+
client.close();
|
|
769
|
+
this.#relayClients.delete(clientKey);
|
|
770
|
+
}
|
|
771
|
+
return;
|
|
772
|
+
}
|
|
773
|
+
// Proactively connect so the relay has this agent's stream to deliver leaves to
|
|
774
|
+
// (the RECEIVER must be connected before the counterparty submits). Best-effort.
|
|
775
|
+
await client.connect(node);
|
|
776
|
+
}
|
|
777
|
+
catch (err) {
|
|
778
|
+
this.#logger.warn("session.relay.connect.error", {
|
|
779
|
+
sessionId,
|
|
780
|
+
error: err instanceof Error ? err.message : String(err),
|
|
781
|
+
correlationId,
|
|
782
|
+
});
|
|
783
|
+
}
|
|
784
|
+
}
|
|
785
|
+
/**
|
|
786
|
+
* M7 DOD-SPINE-6 / MSG-001-3b: detach a session from its (agent, relay) client and
|
|
787
|
+
* close the client when it has no remaining sessions. Idempotent and identity-guarded:
|
|
788
|
+
* the map delete only fires if the map still holds THIS client (a racing teardown of a
|
|
789
|
+
* sibling session must not close a freshly-created replacement client for the same key).
|
|
790
|
+
*/
|
|
791
|
+
#detachSessionRelay(entry) {
|
|
792
|
+
const client = entry.relayClient;
|
|
793
|
+
const key = entry.relayClientKey;
|
|
794
|
+
if (!client || !entry.relaySessionIdBytes)
|
|
795
|
+
return;
|
|
796
|
+
// Idempotent: clear the entry's reference so a second teardown of the same entry no-ops.
|
|
797
|
+
entry.relayClient = undefined;
|
|
798
|
+
const sidHex = Buffer.from(entry.relaySessionIdBytes).toString("hex");
|
|
799
|
+
client.unregisterSession(sidHex);
|
|
800
|
+
if (!client.hasSessions() && key && this.#relayClients.get(key) === client) {
|
|
801
|
+
client.close();
|
|
802
|
+
this.#relayClients.delete(key);
|
|
803
|
+
}
|
|
804
|
+
}
|
|
805
|
+
/**
|
|
806
|
+
* M7-SESSION-003 AC-004: wire a session node's peer-connect / peer-disconnect
|
|
807
|
+
* events to per-session direct-path liveness. onPeerConnect → 'alive'; the
|
|
808
|
+
* session node's gater restricts connections to the designated counterparty, so
|
|
809
|
+
* a connect/disconnect on this node is the counterparty's session-path liveness.
|
|
810
|
+
* onPeerDisconnect → 'gone' (the hook the client did not act on before),
|
|
811
|
+
* emitting session.liveness.changed at WARN. Combined with the transport
|
|
812
|
+
* keepalive (AC-005), a peer that vanished without a clean close still surfaces
|
|
813
|
+
* a disconnect and drives 'gone'.
|
|
814
|
+
*/
|
|
815
|
+
#wireSessionLiveness(agentName, sessionId, node, counterpartyPubkey, correlationId) {
|
|
816
|
+
const key = this.#k(agentName, sessionId);
|
|
817
|
+
node.onPeerConnect(() => {
|
|
818
|
+
const prior = this.#sessionLiveness.get(key);
|
|
819
|
+
this.#sessionLiveness.set(key, "alive");
|
|
820
|
+
if (prior !== "alive") {
|
|
821
|
+
this.#logger.info("session.liveness.changed", {
|
|
822
|
+
sessionId,
|
|
823
|
+
counterpartyPubkey,
|
|
824
|
+
transportPath: "direct",
|
|
825
|
+
liveness: "alive",
|
|
826
|
+
observedBy: "session_node",
|
|
827
|
+
correlationId,
|
|
828
|
+
});
|
|
829
|
+
}
|
|
830
|
+
});
|
|
831
|
+
node.onPeerDisconnect(() => {
|
|
832
|
+
const prior = this.#sessionLiveness.get(key);
|
|
833
|
+
this.#sessionLiveness.set(key, "gone");
|
|
834
|
+
if (prior !== "gone") {
|
|
835
|
+
this.#logger.warn("session.liveness.changed", {
|
|
836
|
+
sessionId,
|
|
837
|
+
counterpartyPubkey,
|
|
838
|
+
transportPath: "direct",
|
|
839
|
+
liveness: "gone",
|
|
840
|
+
observedBy: "session_node",
|
|
841
|
+
correlationId,
|
|
842
|
+
});
|
|
843
|
+
}
|
|
844
|
+
});
|
|
845
|
+
}
|
|
846
|
+
/**
|
|
847
|
+
* M7-SESSION-003: read the direct-path counterparty liveness for a session.
|
|
848
|
+
* 'unknown' when no session node observation has occurred yet.
|
|
849
|
+
*/
|
|
850
|
+
getSessionLiveness(agentName, sessionId) {
|
|
851
|
+
return this.#sessionLiveness.get(this.#k(agentName, sessionId)) ?? "unknown";
|
|
852
|
+
}
|
|
853
|
+
/**
|
|
854
|
+
* Hand the standing receiver to an inbound session.
|
|
855
|
+
* Called during cello_await_session.
|
|
856
|
+
*
|
|
857
|
+
* CRITICAL (AC-015): gater.setAllowedPeer() is called BEFORE returning
|
|
858
|
+
* the node's multiaddr to the caller. This closes the window where an
|
|
859
|
+
* unexpected peer could connect during the hand-off.
|
|
860
|
+
*/
|
|
861
|
+
async acceptSession(sessionId, agentName, counterpartyPubkey, initiatorPeerId, correlationId, relay) {
|
|
862
|
+
const inboundSr = this.#standingReceivers.get(agentName);
|
|
863
|
+
if (!inboundSr) {
|
|
864
|
+
// DOD-LOOP-1: per-agent — kick off (idempotent) creation so a retry finds it.
|
|
865
|
+
void this.#ensureStandingReceiver(agentName, correlationId);
|
|
866
|
+
return {
|
|
867
|
+
ok: false,
|
|
868
|
+
reason: "standing_receiver_unavailable",
|
|
869
|
+
guidance: "The standing receiver node is initializing (completes within 200ms). " +
|
|
870
|
+
"Retry cello_await_session in a moment.",
|
|
871
|
+
};
|
|
872
|
+
}
|
|
873
|
+
// Cap enforcement — inbound sessions count against the same limit (AC-006)
|
|
874
|
+
if (this.#activeNodes.size >= MAX_SESSION_NODES) {
|
|
875
|
+
this.#logger.warn("session.node.cap.reached", {
|
|
876
|
+
agentName,
|
|
877
|
+
currentCount: this.#activeNodes.size,
|
|
878
|
+
maxCount: MAX_SESSION_NODES,
|
|
879
|
+
});
|
|
880
|
+
return {
|
|
881
|
+
ok: false,
|
|
882
|
+
reason: "max_sessions_reached",
|
|
883
|
+
guidance: "The daemon has reached its maximum of 32 concurrent session nodes. " +
|
|
884
|
+
"Close an existing session before starting a new one.",
|
|
885
|
+
};
|
|
886
|
+
}
|
|
887
|
+
const { node, gater, autoNat } = inboundSr;
|
|
888
|
+
// AC-015: update gater BEFORE retrieving multiaddr / returning to caller
|
|
889
|
+
gater.setAllowedPeer(initiatorPeerId);
|
|
890
|
+
const peerId = node.getPeerId();
|
|
891
|
+
const addrs = node.listenAddresses();
|
|
892
|
+
// Persist to SQLite
|
|
893
|
+
this.#insertSessionRow(sessionId, agentName, counterpartyPubkey, "active");
|
|
894
|
+
// Log observability event
|
|
895
|
+
this.#logger.info("session.node.created", {
|
|
896
|
+
sessionId,
|
|
897
|
+
agentName,
|
|
898
|
+
sessionPeerId: peerId,
|
|
899
|
+
correlationId,
|
|
900
|
+
});
|
|
901
|
+
// Remove this agent's standing receiver from the slot and add to active map. The handed-off
|
|
902
|
+
// node keeps its AutoNAT service (it continues to surface dialability).
|
|
903
|
+
this.#standingReceivers.delete(agentName);
|
|
904
|
+
this.#activeNodes.set(this.#k(agentName, sessionId), {
|
|
905
|
+
node,
|
|
906
|
+
agentName,
|
|
907
|
+
sessionId,
|
|
908
|
+
counterpartyPubkey,
|
|
909
|
+
gater,
|
|
910
|
+
correlationId,
|
|
911
|
+
counterpartySessionPeerId: initiatorPeerId,
|
|
912
|
+
autoNat,
|
|
913
|
+
});
|
|
914
|
+
// DAEMON-004: register the content stream handler for the inbound session.
|
|
915
|
+
await this.#registerContentHandler(agentName, sessionId, node, counterpartyPubkey);
|
|
916
|
+
// M7-SESSION-003 AC-004: act on the inbound session node's peer events too.
|
|
917
|
+
this.#wireSessionLiveness(agentName, sessionId, node, counterpartyPubkey, correlationId);
|
|
918
|
+
// M7 DOD-SPINE-6 / MSG-001-3b: the receiver also connects to the relay witness so
|
|
919
|
+
// the relay can deliver the initiator's witnessed leaves (leaf_deliver) to it.
|
|
920
|
+
if (relay) {
|
|
921
|
+
await this.#connectSessionRelay(sessionId, node, agentName, relay, correlationId);
|
|
922
|
+
}
|
|
923
|
+
// Immediately spin up a replacement for THIS agent (async — do NOT await, AC-003)
|
|
924
|
+
void this.#ensureStandingReceiver(agentName, correlationId);
|
|
925
|
+
return { ok: true, peerId, addrs };
|
|
926
|
+
}
|
|
927
|
+
/**
|
|
928
|
+
* Destroy a session node after seal or on error teardown.
|
|
929
|
+
* Status written to SQLite.
|
|
930
|
+
*/
|
|
931
|
+
async destroySessionNode(agentName, sessionId, reason) {
|
|
932
|
+
const entry = this.#activeNodes.get(this.#k(agentName, sessionId));
|
|
933
|
+
if (!entry)
|
|
934
|
+
return;
|
|
935
|
+
entry.autoNat.stop();
|
|
936
|
+
// M7 DOD-SPINE-6 / MSG-001-3b: close the relay witness stream so we don't leak it.
|
|
937
|
+
this.#detachSessionRelay(entry);
|
|
938
|
+
try {
|
|
939
|
+
await entry.node.stop();
|
|
940
|
+
}
|
|
941
|
+
catch (err) {
|
|
942
|
+
this.#logger.error("session.node.stop.failed", {
|
|
943
|
+
sessionId,
|
|
944
|
+
agentName: entry.agentName,
|
|
945
|
+
error: err instanceof Error ? err.message : String(err),
|
|
946
|
+
correlationId: entry.correlationId,
|
|
947
|
+
});
|
|
948
|
+
// Fall through — still remove from active map and update DB
|
|
949
|
+
}
|
|
950
|
+
// Update SQLite — 'sealed' → 'sealed', 'interrupted'/'error' → 'interrupted'.
|
|
951
|
+
// 'error' is not a valid SessionStatus in SQLite; error-torn-down sessions
|
|
952
|
+
// surface as interrupted so AC-010 recovery handles them at next login.
|
|
953
|
+
// The session.node.destroyed log preserves the original reason for observability.
|
|
954
|
+
const dbStatus = reason === "sealed" ? "sealed" : "interrupted";
|
|
955
|
+
this.#updateSessionStatus(agentName, sessionId, dbStatus);
|
|
956
|
+
this.#activeNodes.delete(this.#k(agentName, sessionId));
|
|
957
|
+
// Evict the in-memory per-session caches on teardown. The tree is durable in
|
|
958
|
+
// SQLite (getSessionTree reloads it on demand), and the received-content buffer
|
|
959
|
+
// holds plaintext that must not linger after a session ends. Without this, both
|
|
960
|
+
// maps grow unbounded by total sessions seen over a long-lived daemon.
|
|
961
|
+
// (#evictSessionCaches also drops the M7-SESSION-003 liveness flag, so both the
|
|
962
|
+
// destroy and retire teardown paths clear it — no stale verdict survives.)
|
|
963
|
+
this.#evictSessionCaches(agentName, sessionId);
|
|
964
|
+
this.#logger.info("session.node.destroyed", {
|
|
965
|
+
sessionId,
|
|
966
|
+
agentName: entry.agentName,
|
|
967
|
+
reason,
|
|
968
|
+
});
|
|
969
|
+
}
|
|
970
|
+
/**
|
|
971
|
+
* round-2 finding #5: retire a session's live libp2p node WITHOUT changing its
|
|
972
|
+
* DB status. Used after the active-session bilateral seal commitment has already
|
|
973
|
+
* advanced the row to 'seal_interrupted_pending': the session is frozen, so we
|
|
974
|
+
* stop the node and unregister its /cello/content handler (no more inbound leaves,
|
|
975
|
+
* no leaked node per active close) but must NOT overwrite the pending/sealed status
|
|
976
|
+
* the way destroySessionNode would. The durable tree stays in SQLite (getSessionTree
|
|
977
|
+
* reloads it); the in-memory plaintext buffer is evicted.
|
|
978
|
+
*/
|
|
979
|
+
async retireSessionNode(agentName, sessionId) {
|
|
980
|
+
const entry = this.#activeNodes.get(this.#k(agentName, sessionId));
|
|
981
|
+
if (!entry)
|
|
982
|
+
return;
|
|
983
|
+
this.#detachSessionRelay(entry);
|
|
984
|
+
try {
|
|
985
|
+
await entry.node.stop();
|
|
986
|
+
}
|
|
987
|
+
catch (err) {
|
|
988
|
+
this.#logger.error("session.node.stop.failed", {
|
|
989
|
+
sessionId,
|
|
990
|
+
agentName: entry.agentName,
|
|
991
|
+
error: err instanceof Error ? err.message : String(err),
|
|
992
|
+
correlationId: entry.correlationId,
|
|
993
|
+
});
|
|
994
|
+
// Fall through — still remove from active map.
|
|
995
|
+
}
|
|
996
|
+
this.#activeNodes.delete(this.#k(agentName, sessionId));
|
|
997
|
+
this.#evictSessionCaches(agentName, sessionId);
|
|
998
|
+
this.#logger.info("session.node.destroyed", {
|
|
999
|
+
sessionId,
|
|
1000
|
+
agentName: entry.agentName,
|
|
1001
|
+
reason: "sealing",
|
|
1002
|
+
});
|
|
1003
|
+
}
|
|
1004
|
+
/** Drop the in-memory tree + received-content caches for a torn-down session (DOD-LOOP-1: per (agent, session)). */
|
|
1005
|
+
#evictSessionCaches(agentName, sessionId) {
|
|
1006
|
+
const key = this.#k(agentName, sessionId);
|
|
1007
|
+
this.#trees.delete(key);
|
|
1008
|
+
this.#receivedContent.delete(key);
|
|
1009
|
+
// CELLO-M7-MSG-001: cancel any armed TTF timers so a torn-down session never
|
|
1010
|
+
// fires a park backstop (or keeps a timer) after it is gone.
|
|
1011
|
+
this.#clearAwaitingForSession(agentName, sessionId);
|
|
1012
|
+
// M7-SESSION-003: drop the direct-path liveness flag (the seal gate already read
|
|
1013
|
+
// its verdict) so a destroyed/retired session retains no stale alive/gone state.
|
|
1014
|
+
this.#sessionLiveness.delete(key);
|
|
1015
|
+
// M7-UPGRADE-002: drop the auto-acknowledge bookkeeping for a torn-down session.
|
|
1016
|
+
this.#contentDesynced.delete(key);
|
|
1017
|
+
this.#responderSealSubmitted.delete(key);
|
|
1018
|
+
// DOD-MSG-4: drop the strict-in-order bookkeeping (witness map, held plaintext, high-water)
|
|
1019
|
+
// so a torn-down session retains no stale ordering state or buffered plaintext.
|
|
1020
|
+
this.#witnessedSeq.delete(key);
|
|
1021
|
+
this.#heldContent.delete(key);
|
|
1022
|
+
this.#highWaterSeq.delete(key);
|
|
1023
|
+
}
|
|
1024
|
+
/**
|
|
1025
|
+
* Graceful shutdown: mark all active sessions as interrupted, stop all nodes.
|
|
1026
|
+
* Called from the SIGTERM / cello logout path (AC-009).
|
|
1027
|
+
* SQLite writes complete before this method returns.
|
|
1028
|
+
*/
|
|
1029
|
+
async gracefulShutdown() {
|
|
1030
|
+
// Signal any in-flight standing-receiver replacement to self-stop (review M2).
|
|
1031
|
+
this.#shuttingDown = true;
|
|
1032
|
+
// Cancel every armed awaiting-ACK timer so an un-acked send (e.g. a rejected /
|
|
1033
|
+
// tampered frame that never produced a `persisted` ACK) does not leave a 20s
|
|
1034
|
+
// timer pinning the content + this manager in memory past teardown (review M1).
|
|
1035
|
+
for (const bySession of this.#awaitingAck.values()) {
|
|
1036
|
+
for (const entry of bySession.values())
|
|
1037
|
+
clearTimeout(entry.timer);
|
|
1038
|
+
}
|
|
1039
|
+
this.#awaitingAck.clear();
|
|
1040
|
+
// Mark ALL 'active' rows interrupted in SQLite — single batch UPDATE covers
|
|
1041
|
+
// both in-memory managed nodes AND any rows that were inserted directly
|
|
1042
|
+
// (e.g. by the binary AC-009 SIGTERM test inserting synthetic rows).
|
|
1043
|
+
// This is the authoritative persistence step; in-memory map is secondary.
|
|
1044
|
+
const now = Date.now();
|
|
1045
|
+
if (!this.#db) {
|
|
1046
|
+
this.#logger.error("session.interrupt.db.write.failed", {
|
|
1047
|
+
sessionId: "__all__",
|
|
1048
|
+
error: "db not initialized",
|
|
1049
|
+
});
|
|
1050
|
+
}
|
|
1051
|
+
else {
|
|
1052
|
+
const interruptedAt = new Date(now).toISOString();
|
|
1053
|
+
try {
|
|
1054
|
+
this.#db.prepare("UPDATE sessions SET status = 'interrupted', updated_at = ?, interrupted_at = COALESCE(interrupted_at, ?) WHERE status = 'active'").run(now, interruptedAt);
|
|
1055
|
+
}
|
|
1056
|
+
catch (err) {
|
|
1057
|
+
this.#logger.error("session.interrupt.db.write.failed", {
|
|
1058
|
+
sessionId: "__all__",
|
|
1059
|
+
error: err instanceof Error ? err.message : String(err),
|
|
1060
|
+
});
|
|
1061
|
+
}
|
|
1062
|
+
}
|
|
1063
|
+
// Stop all session nodes, then emit session.node.destroyed only on success
|
|
1064
|
+
// (mirrors destroySessionNode ordering: stop first, log destroyed after)
|
|
1065
|
+
const stopPromises = [];
|
|
1066
|
+
for (const entry of this.#activeNodes.values()) {
|
|
1067
|
+
entry.autoNat.stop();
|
|
1068
|
+
// M7 DOD-SPINE-6: detach from the agent relay client (closes it when its last
|
|
1069
|
+
// session goes) — consistent with the other teardown paths.
|
|
1070
|
+
this.#detachSessionRelay(entry);
|
|
1071
|
+
stopPromises.push(entry.node.stop().then(() => {
|
|
1072
|
+
this.#logger.info("session.node.destroyed", {
|
|
1073
|
+
sessionId: entry.sessionId,
|
|
1074
|
+
agentName: entry.agentName,
|
|
1075
|
+
reason: "interrupted",
|
|
1076
|
+
});
|
|
1077
|
+
}).catch((err) => {
|
|
1078
|
+
this.#logger.error("session.node.stop.failed", {
|
|
1079
|
+
sessionId: entry.sessionId,
|
|
1080
|
+
agentName: entry.agentName,
|
|
1081
|
+
error: err instanceof Error ? err.message : String(err),
|
|
1082
|
+
correlationId: entry.correlationId,
|
|
1083
|
+
});
|
|
1084
|
+
}));
|
|
1085
|
+
}
|
|
1086
|
+
await Promise.all(stopPromises);
|
|
1087
|
+
this.#activeNodes.clear();
|
|
1088
|
+
// Evict in-memory per-session caches (trees reload from SQLite; received-content
|
|
1089
|
+
// plaintext must not survive shutdown in memory).
|
|
1090
|
+
this.#trees.clear();
|
|
1091
|
+
this.#receivedContent.clear();
|
|
1092
|
+
// Stop ALL per-agent standing receivers (DOD-LOOP-1).
|
|
1093
|
+
for (const [agentName, sr] of this.#standingReceivers) {
|
|
1094
|
+
sr.autoNat.stop();
|
|
1095
|
+
try {
|
|
1096
|
+
await sr.node.stop();
|
|
1097
|
+
}
|
|
1098
|
+
catch (err) {
|
|
1099
|
+
this.#logger.error("session.node.stop.failed", {
|
|
1100
|
+
sessionId: "standing_receiver_shutdown",
|
|
1101
|
+
agentName: `${STANDING_RECEIVER_AGENT_NAME}:${agentName}`,
|
|
1102
|
+
error: err instanceof Error ? err.message : String(err),
|
|
1103
|
+
correlationId: "n/a",
|
|
1104
|
+
});
|
|
1105
|
+
}
|
|
1106
|
+
}
|
|
1107
|
+
this.#standingReceivers.clear();
|
|
1108
|
+
// Release the SQLite handle so the DB file is no longer held open after shutdown
|
|
1109
|
+
// (review L5). Queries guard on `#db === null` and degrade to empty/null.
|
|
1110
|
+
if (this.#db) {
|
|
1111
|
+
try {
|
|
1112
|
+
this.#db.close();
|
|
1113
|
+
}
|
|
1114
|
+
catch { /* already closed */ }
|
|
1115
|
+
this.#db = null;
|
|
1116
|
+
}
|
|
1117
|
+
}
|
|
1118
|
+
/**
|
|
1119
|
+
* Return all sessions with a given status from SQLite.
|
|
1120
|
+
* Used by cello status to surface interrupted sessions.
|
|
1121
|
+
*/
|
|
1122
|
+
getSessionsByStatus(status) {
|
|
1123
|
+
if (!this.#db)
|
|
1124
|
+
return [];
|
|
1125
|
+
return this.#db
|
|
1126
|
+
.prepare("SELECT * FROM sessions WHERE status = ?")
|
|
1127
|
+
.all(status);
|
|
1128
|
+
}
|
|
1129
|
+
/**
|
|
1130
|
+
* M7-SESSION-004 (AC-005): persist the seal certificate's legibility object with the
|
|
1131
|
+
* sealed record. Stored as a JSON string (hex-encoded pubkeys) so it round-trips a
|
|
1132
|
+
* daemon restart and is returned intact on the cert-read surface. The caller normalises
|
|
1133
|
+
* the raw wire legibility (Uint8Array pubkeys) into a JSON-safe shape before storing.
|
|
1134
|
+
* Best-effort: a session row may not yet exist (the seal arrived before the row was
|
|
1135
|
+
* persisted); in that case we no-op rather than throw — the cert still flows through the
|
|
1136
|
+
* live return path. The legibility content is identical regardless of delivery timing.
|
|
1137
|
+
*/
|
|
1138
|
+
recordSealCertificate(agentName, sessionId, sealedRootHex, legibilityJson) {
|
|
1139
|
+
if (!this.#db)
|
|
1140
|
+
return;
|
|
1141
|
+
this.#db
|
|
1142
|
+
.prepare("UPDATE sessions SET seal_legibility = ?, sealed_root_hex = ?, updated_at = ? WHERE agent_name = ? AND session_id = ?")
|
|
1143
|
+
.run(legibilityJson, sealedRootHex, Date.now(), agentName, sessionId);
|
|
1144
|
+
}
|
|
1145
|
+
/**
|
|
1146
|
+
* M7 legibility-TBS-binding (responder verify): record the counterparty's FROST primary (group)
|
|
1147
|
+
* pubkey from the FROST-signed SessionAssignment, so the responder can VERIFY the bilateral seal
|
|
1148
|
+
* signature locally. Best-effort — a missing row (race) is a no-op; the seal then falls back to
|
|
1149
|
+
* accept-without-verify (still sound: the live frame arrives over the authenticated Noise channel).
|
|
1150
|
+
*/
|
|
1151
|
+
recordCounterpartyPrimary(agentName, sessionId, primaryPubkeyHex) {
|
|
1152
|
+
if (!this.#db)
|
|
1153
|
+
return;
|
|
1154
|
+
this.#db
|
|
1155
|
+
.prepare("UPDATE sessions SET counterparty_primary_pubkey = ?, updated_at = ? WHERE agent_name = ? AND session_id = ?")
|
|
1156
|
+
.run(primaryPubkeyHex, Date.now(), agentName, sessionId);
|
|
1157
|
+
}
|
|
1158
|
+
/**
|
|
1159
|
+
* M7-SESSION-004 (AC-005/AC-006): read the persisted seal certificate for a session.
|
|
1160
|
+
* Returns the sealed root and the parsed legibility object (JSON-safe, hex pubkeys), or
|
|
1161
|
+
* null if the session is unknown or not yet sealed. This is the cert-read surface a
|
|
1162
|
+
* reader (operator, agent, arbitrator) — possibly in a DIFFERENT process than the one
|
|
1163
|
+
* that built the certificate — uses to determine receipt-not-assent, per-party frontiers,
|
|
1164
|
+
* attestation modes, and whether the final message was answered.
|
|
1165
|
+
*/
|
|
1166
|
+
getSealCertificate(agentName, sessionId) {
|
|
1167
|
+
if (!this.#db)
|
|
1168
|
+
return null;
|
|
1169
|
+
const row = this.#db
|
|
1170
|
+
.prepare("SELECT sealed_root_hex, seal_legibility FROM sessions WHERE agent_name = ? AND session_id = ?")
|
|
1171
|
+
.get(agentName, sessionId);
|
|
1172
|
+
if (!row || !row.seal_legibility || !row.sealed_root_hex)
|
|
1173
|
+
return null;
|
|
1174
|
+
let legibility;
|
|
1175
|
+
try {
|
|
1176
|
+
legibility = JSON.parse(row.seal_legibility);
|
|
1177
|
+
}
|
|
1178
|
+
catch {
|
|
1179
|
+
return null;
|
|
1180
|
+
}
|
|
1181
|
+
return { sealed_root: row.sealed_root_hex, legibility };
|
|
1182
|
+
}
|
|
1183
|
+
/**
|
|
1184
|
+
* M7-SESSION-001: Mark a session as interrupted with message count and timestamp.
|
|
1185
|
+
* Called when a relay session_interrupted frame arrives or a relay stream closes.
|
|
1186
|
+
* Also tears down the in-memory session node if one exists for this sessionId.
|
|
1187
|
+
*
|
|
1188
|
+
* @param sessionId The hex session ID from the relay frame
|
|
1189
|
+
* @param messageCount Number of message leaves at interruption
|
|
1190
|
+
* @param source 'relay_frame' | 'stream_close'
|
|
1191
|
+
*/
|
|
1192
|
+
async markInterruptedWithDetails(agentName, sessionId, messageCount, source) {
|
|
1193
|
+
if (!this.#db)
|
|
1194
|
+
return;
|
|
1195
|
+
// H-3 SECURITY: only an 'active' session may transition to 'interrupted'.
|
|
1196
|
+
// A late or forged relay frame must NOT revert a 'sealed', 'seal_interrupted_pending',
|
|
1197
|
+
// or already-'interrupted' session back to 'interrupted'. This mirrors the
|
|
1198
|
+
// stream-close guard in #watchRelayStream below — the two paths must agree.
|
|
1199
|
+
const existing = this.getSessionRecord(agentName, sessionId);
|
|
1200
|
+
if (!existing || existing.status !== "active") {
|
|
1201
|
+
this.#logger.warn("session.interrupt.ignored", {
|
|
1202
|
+
sessionId,
|
|
1203
|
+
source,
|
|
1204
|
+
currentStatus: existing?.status ?? "absent",
|
|
1205
|
+
reason: "session_not_active",
|
|
1206
|
+
});
|
|
1207
|
+
return;
|
|
1208
|
+
}
|
|
1209
|
+
const now = Date.now();
|
|
1210
|
+
const interruptedAt = new Date(now).toISOString();
|
|
1211
|
+
// round-2 finding #7: the daemon-owned tree is the authoritative transcript
|
|
1212
|
+
// length. The `messageCount` arg comes from registerRelayStream time and defaults
|
|
1213
|
+
// to 0, so writing it blindly would clobber the column out of sync with the tree
|
|
1214
|
+
// (both seal flows prefer tree.size(), but the column must not lie). When a tree
|
|
1215
|
+
// exists for this session, persist its size; otherwise fall back to the arg.
|
|
1216
|
+
const treeSize = this.getSessionTree(agentName, sessionId).size();
|
|
1217
|
+
const authoritativeCount = treeSize > 0 ? treeSize : messageCount;
|
|
1218
|
+
try {
|
|
1219
|
+
// The `AND status = 'active'` predicate is the authoritative guard: even if
|
|
1220
|
+
// the pre-check above raced (it cannot — DatabaseSync is synchronous), the
|
|
1221
|
+
// UPDATE only mutates a row that is still active.
|
|
1222
|
+
this.#db
|
|
1223
|
+
.prepare("UPDATE sessions SET status = 'interrupted', updated_at = ?, message_count = ?, interrupted_at = ? WHERE agent_name = ? AND session_id = ? AND status = 'active'")
|
|
1224
|
+
.run(now, authoritativeCount, interruptedAt, agentName, sessionId);
|
|
1225
|
+
}
|
|
1226
|
+
catch (err) {
|
|
1227
|
+
this.#logger.error("session.interrupt.db.write.failed", {
|
|
1228
|
+
sessionId,
|
|
1229
|
+
error: err instanceof Error ? err.message : String(err),
|
|
1230
|
+
});
|
|
1231
|
+
}
|
|
1232
|
+
// Look up the in-memory entry (keyed by (agent, session)) for teardown.
|
|
1233
|
+
const entry = this.#activeNodes.get(this.#k(agentName, sessionId));
|
|
1234
|
+
// Tear down the in-memory session node if it exists
|
|
1235
|
+
if (entry) {
|
|
1236
|
+
entry.autoNat.stop();
|
|
1237
|
+
this.#detachSessionRelay(entry);
|
|
1238
|
+
try {
|
|
1239
|
+
await entry.node.stop();
|
|
1240
|
+
}
|
|
1241
|
+
catch (err) {
|
|
1242
|
+
this.#logger.error("session.node.stop.failed", {
|
|
1243
|
+
sessionId,
|
|
1244
|
+
agentName,
|
|
1245
|
+
error: err instanceof Error ? err.message : String(err),
|
|
1246
|
+
correlationId: entry.correlationId,
|
|
1247
|
+
});
|
|
1248
|
+
// Fall through — still remove from active map
|
|
1249
|
+
}
|
|
1250
|
+
this.#activeNodes.delete(this.#k(agentName, sessionId));
|
|
1251
|
+
this.#logger.info("session.node.destroyed", {
|
|
1252
|
+
sessionId,
|
|
1253
|
+
agentName,
|
|
1254
|
+
reason: "interrupted",
|
|
1255
|
+
});
|
|
1256
|
+
}
|
|
1257
|
+
this.#logger.warn("session.interrupted.detected", {
|
|
1258
|
+
sessionId,
|
|
1259
|
+
agentName,
|
|
1260
|
+
source,
|
|
1261
|
+
});
|
|
1262
|
+
// M7-SESSION-001 (M-1 PUSH): notify live MCP clients that this session is now
|
|
1263
|
+
// interrupted. Only fires on a real active→interrupted transition (the guard
|
|
1264
|
+
// above already returned for any non-active session).
|
|
1265
|
+
try {
|
|
1266
|
+
this.#onSessionStateChanged?.(agentName, sessionId, "interrupted", existing.counterparty_pubkey);
|
|
1267
|
+
}
|
|
1268
|
+
catch (err) {
|
|
1269
|
+
this.#logger.debug("session.state.notify.failed", {
|
|
1270
|
+
sessionId,
|
|
1271
|
+
error: err instanceof Error ? err.message : String(err),
|
|
1272
|
+
});
|
|
1273
|
+
}
|
|
1274
|
+
}
|
|
1275
|
+
/**
|
|
1276
|
+
* M7-SESSION-001 (H-1): persist a verified bilateral SEAL-INTERRUPTED
|
|
1277
|
+
* commitment and transition the session to 'seal_interrupted_pending'.
|
|
1278
|
+
*
|
|
1279
|
+
* This is NOT a seal. It records that both parties produced and exchanged
|
|
1280
|
+
* K_local-signed SEAL-INTERRUPTED leaves over the same {leafCount, merkleRoot}.
|
|
1281
|
+
* The FROST threshold notarization is a separate, currently-unwired step (see
|
|
1282
|
+
* daemon.ts handleSealInterruptedFlow H-1 note), which is precisely why the
|
|
1283
|
+
* status is 'seal_interrupted_pending' and never 'sealed'.
|
|
1284
|
+
*
|
|
1285
|
+
* The status update is guarded so it only advances a session out of the
|
|
1286
|
+
* 'interrupted' state — it will not overwrite a 'sealed' row.
|
|
1287
|
+
*
|
|
1288
|
+
* @returns true if the session row was advanced to seal_interrupted_pending.
|
|
1289
|
+
*/
|
|
1290
|
+
persistSealInterruptedCommitment(opts) {
|
|
1291
|
+
if (!this.#db)
|
|
1292
|
+
return false;
|
|
1293
|
+
const now = Date.now();
|
|
1294
|
+
try {
|
|
1295
|
+
this.#db
|
|
1296
|
+
.prepare(`INSERT OR REPLACE INTO seal_interrupted_artifacts
|
|
1297
|
+
(agent_name, session_id, role, own_leaf, counterparty_leaf, merkle_root, nonce, created_at)
|
|
1298
|
+
VALUES (?, ?, ?, ?, ?, ?, ?, ?)`)
|
|
1299
|
+
.run(opts.agentName, opts.sessionId, opts.role, JSON.stringify(opts.ownLeaf), JSON.stringify(opts.counterpartyLeaf), opts.merkleRoot, opts.nonce, now);
|
|
1300
|
+
}
|
|
1301
|
+
catch (err) {
|
|
1302
|
+
this.#logger.error("session.interrupted.db.write.failed", {
|
|
1303
|
+
sessionId: opts.sessionId,
|
|
1304
|
+
error: err instanceof Error ? err.message : String(err),
|
|
1305
|
+
});
|
|
1306
|
+
return false;
|
|
1307
|
+
}
|
|
1308
|
+
// DAEMON-004: the bilateral commitment advances a session out of either
|
|
1309
|
+
// 'interrupted' (SESSION-001 interrupted-seal flow) OR 'active' (DAEMON-004
|
|
1310
|
+
// active-session seal). The guard still refuses to overwrite a terminal
|
|
1311
|
+
// 'sealed' row or an already-pending one.
|
|
1312
|
+
const result = this.#db
|
|
1313
|
+
.prepare("UPDATE sessions SET status = 'seal_interrupted_pending', updated_at = ? WHERE agent_name = ? AND session_id = ? AND status IN ('active', 'interrupted')")
|
|
1314
|
+
.run(now, opts.agentName, opts.sessionId);
|
|
1315
|
+
return Number(result.changes) > 0;
|
|
1316
|
+
}
|
|
1317
|
+
/**
|
|
1318
|
+
* M7-SESSION-001 (H-1): read back the persisted bilateral commitment artifacts
|
|
1319
|
+
* for a session. Returns null when none exist.
|
|
1320
|
+
*/
|
|
1321
|
+
getSealInterruptedArtifacts(agentName, sessionId) {
|
|
1322
|
+
if (!this.#db)
|
|
1323
|
+
return null;
|
|
1324
|
+
const row = this.#db
|
|
1325
|
+
.prepare("SELECT * FROM seal_interrupted_artifacts WHERE agent_name = ? AND session_id = ?")
|
|
1326
|
+
.get(agentName, sessionId);
|
|
1327
|
+
if (!row)
|
|
1328
|
+
return null;
|
|
1329
|
+
return {
|
|
1330
|
+
role: row.role,
|
|
1331
|
+
ownLeaf: JSON.parse(row.own_leaf),
|
|
1332
|
+
counterpartyLeaf: JSON.parse(row.counterparty_leaf),
|
|
1333
|
+
merkleRoot: row.merkle_root,
|
|
1334
|
+
nonce: row.nonce,
|
|
1335
|
+
};
|
|
1336
|
+
}
|
|
1337
|
+
/**
|
|
1338
|
+
* Return the session record for a specific sessionId, regardless of status.
|
|
1339
|
+
* Used by cello_close_session to inspect session state.
|
|
1340
|
+
*/
|
|
1341
|
+
getSessionRecord(agentName, sessionId) {
|
|
1342
|
+
if (!this.#db)
|
|
1343
|
+
return null;
|
|
1344
|
+
const row = this.#db
|
|
1345
|
+
.prepare("SELECT * FROM sessions WHERE agent_name = ? AND session_id = ?")
|
|
1346
|
+
.get(agentName, sessionId);
|
|
1347
|
+
return row ?? null;
|
|
1348
|
+
}
|
|
1349
|
+
/**
|
|
1350
|
+
* MSG-2 startup-flush: the persisted relay endpoint for a session, or null if none was
|
|
1351
|
+
* recorded. Used by the crash-backstop flush, which runs at startup BEFORE the in-memory
|
|
1352
|
+
* session entries exist, so it cannot use `entry.relayPeerId`.
|
|
1353
|
+
*/
|
|
1354
|
+
getPersistedRelayEndpoint(agentName, sessionId) {
|
|
1355
|
+
if (!this.#db)
|
|
1356
|
+
return null;
|
|
1357
|
+
const row = this.#db
|
|
1358
|
+
.prepare("SELECT relay_peer_id, relay_addrs FROM sessions WHERE agent_name = ? AND session_id = ?")
|
|
1359
|
+
.get(agentName, sessionId);
|
|
1360
|
+
if (!row?.relay_peer_id || !row?.relay_addrs)
|
|
1361
|
+
return null;
|
|
1362
|
+
try {
|
|
1363
|
+
const addrs = JSON.parse(row.relay_addrs);
|
|
1364
|
+
if (!Array.isArray(addrs) || addrs.length === 0)
|
|
1365
|
+
return null;
|
|
1366
|
+
return { relayPeerId: row.relay_peer_id, relayAddrs: addrs };
|
|
1367
|
+
}
|
|
1368
|
+
catch {
|
|
1369
|
+
return null;
|
|
1370
|
+
}
|
|
1371
|
+
}
|
|
1372
|
+
/**
|
|
1373
|
+
* DOD-MSG-4 (auto-recover): the DISTINCT relay endpoints this agent has sessions on, so the daemon
|
|
1374
|
+
* can pull the agent's parked mailbox from each on reconnect (the relay mailbox is keyed by recipient
|
|
1375
|
+
* pubkey, so one pull per relay drains all of the agent's parked content there). Distinct by relay
|
|
1376
|
+
* peer id.
|
|
1377
|
+
*/
|
|
1378
|
+
getAgentRelayEndpoints(agentName) {
|
|
1379
|
+
if (!this.#db)
|
|
1380
|
+
return [];
|
|
1381
|
+
const rows = this.#db
|
|
1382
|
+
.prepare("SELECT DISTINCT relay_peer_id, relay_addrs FROM sessions WHERE agent_name = ? AND relay_peer_id IS NOT NULL")
|
|
1383
|
+
.all(agentName);
|
|
1384
|
+
const byPeer = new Map();
|
|
1385
|
+
for (const row of rows) {
|
|
1386
|
+
if (!row.relay_peer_id || !row.relay_addrs)
|
|
1387
|
+
continue;
|
|
1388
|
+
try {
|
|
1389
|
+
const addrs = JSON.parse(row.relay_addrs);
|
|
1390
|
+
if (!Array.isArray(addrs) || addrs.length === 0)
|
|
1391
|
+
continue;
|
|
1392
|
+
if (!byPeer.has(row.relay_peer_id))
|
|
1393
|
+
byPeer.set(row.relay_peer_id, { relayPeerId: row.relay_peer_id, relayAddrs: addrs });
|
|
1394
|
+
}
|
|
1395
|
+
catch {
|
|
1396
|
+
/* skip malformed */
|
|
1397
|
+
}
|
|
1398
|
+
}
|
|
1399
|
+
return [...byPeer.values()];
|
|
1400
|
+
}
|
|
1401
|
+
// ─── DAEMON-004: daemon-owned Merkle tree ──────────────────────────────────
|
|
1402
|
+
/**
|
|
1403
|
+
* Return the daemon-owned Merkle tree for a session, loading it from SQLite
|
|
1404
|
+
* on first access (so it survives a restart — AC-007). Never returns null;
|
|
1405
|
+
* an unknown session yields an empty tree.
|
|
1406
|
+
*/
|
|
1407
|
+
getSessionTree(agentName, sessionId) {
|
|
1408
|
+
const key = this.#k(agentName, sessionId);
|
|
1409
|
+
const cached = this.#trees.get(key);
|
|
1410
|
+
if (cached)
|
|
1411
|
+
return cached;
|
|
1412
|
+
const tree = this.#loadTreeFromDb(agentName, sessionId);
|
|
1413
|
+
this.#trees.set(key, tree);
|
|
1414
|
+
return tree;
|
|
1415
|
+
}
|
|
1416
|
+
/** Current daemon-owned tree root for a session, as hex. */
|
|
1417
|
+
getSessionTreeRootHex(agentName, sessionId) {
|
|
1418
|
+
return this.getSessionTree(agentName, sessionId).rootHex();
|
|
1419
|
+
}
|
|
1420
|
+
/**
|
|
1421
|
+
* Append a leaf (by its 32-byte leaf-hash hex) to the daemon-owned tree,
|
|
1422
|
+
* persist it, advance the root, and fire session.tree.appended.
|
|
1423
|
+
*
|
|
1424
|
+
* @returns the new leaf index and the recomputed root hex.
|
|
1425
|
+
*/
|
|
1426
|
+
appendSessionLeaf(agentName, sessionId, kind, leafHashHex, correlationId) {
|
|
1427
|
+
const tree = this.getSessionTree(agentName, sessionId);
|
|
1428
|
+
const { leafIndex, newRootHex } = tree.appendLeafHash(kind, leafHashHex);
|
|
1429
|
+
if (this.#db) {
|
|
1430
|
+
try {
|
|
1431
|
+
this.#db
|
|
1432
|
+
.prepare(`INSERT INTO session_tree_leaves
|
|
1433
|
+
(agent_name, session_id, leaf_index, leaf_kind, leaf_hash_hex, created_at)
|
|
1434
|
+
VALUES (?, ?, ?, ?, ?, ?)`)
|
|
1435
|
+
.run(agentName, sessionId, leafIndex, kind, leafHashHex, Date.now());
|
|
1436
|
+
// DAEMON-004 (finding #2): keep sessions.message_count synced to the tree
|
|
1437
|
+
// size. message_count is the bilateral leafCount the seal flow signs over
|
|
1438
|
+
// (handleSealInterruptedFlow / the responder). If it diverged from the
|
|
1439
|
+
// daemon-owned tree, a post-active-messaging seal would attest to a
|
|
1440
|
+
// truncated transcript and the bilateral leafCount check would mismatch.
|
|
1441
|
+
// The tree (leafIndex + 1 leaves) is authoritative; the column tracks it.
|
|
1442
|
+
this.#db
|
|
1443
|
+
.prepare("UPDATE sessions SET message_count = ?, updated_at = ? WHERE agent_name = ? AND session_id = ?")
|
|
1444
|
+
.run(leafIndex + 1, Date.now(), agentName, sessionId);
|
|
1445
|
+
}
|
|
1446
|
+
catch (err) {
|
|
1447
|
+
// A persist failure must be visible, not swallowed: the in-memory tree
|
|
1448
|
+
// has advanced but the durable transcript has not, which would diverge
|
|
1449
|
+
// on restart. Surface it loudly.
|
|
1450
|
+
this.#logger.error("session.tree.persist.failed", {
|
|
1451
|
+
sessionId,
|
|
1452
|
+
leafIndex,
|
|
1453
|
+
error: err instanceof Error ? err.message : String(err),
|
|
1454
|
+
correlationId,
|
|
1455
|
+
});
|
|
1456
|
+
}
|
|
1457
|
+
}
|
|
1458
|
+
this.#logger.info("session.tree.appended", {
|
|
1459
|
+
sessionId,
|
|
1460
|
+
leafIndex,
|
|
1461
|
+
newRootHex,
|
|
1462
|
+
correlationId,
|
|
1463
|
+
});
|
|
1464
|
+
return { leafIndex, newRootHex };
|
|
1465
|
+
}
|
|
1466
|
+
/**
|
|
1467
|
+
* SEAM 1b (dialer ⇄ session-node reconciliation): dial the counterparty THROUGH
|
|
1468
|
+
* this session's OWN node, so the session node N_A holds the connection its content
|
|
1469
|
+
* newStream actually rides. TRANSPORT-001's transport selector dialed on a separate
|
|
1470
|
+
* (composition-root) node whose connection N_A could not use — the per-session node
|
|
1471
|
+
* must be the dialer. Direct mode only here (the default content path, Part 4 D-a);
|
|
1472
|
+
* relay-circuit + dcutr strategy via N_A is a later seam. Tries each addr in turn;
|
|
1473
|
+
* succeeds on the first connection, returns a named failure if none connect.
|
|
1474
|
+
*/
|
|
1475
|
+
async connectToCounterparty(agentName, sessionId, addrs) {
|
|
1476
|
+
const entry = this.#activeNodes.get(this.#k(agentName, sessionId));
|
|
1477
|
+
if (!entry) {
|
|
1478
|
+
return { ok: false, reason: "session_node_unavailable", error: "no active session node for this session" };
|
|
1479
|
+
}
|
|
1480
|
+
if (addrs.length === 0) {
|
|
1481
|
+
return { ok: false, reason: "no_counterparty_addrs", error: "the assignment carried no counterparty session addrs to dial" };
|
|
1482
|
+
}
|
|
1483
|
+
let lastError = "";
|
|
1484
|
+
for (const addr of addrs) {
|
|
1485
|
+
try {
|
|
1486
|
+
await entry.node.dial(addr);
|
|
1487
|
+
this.#logger.info("session.transport.connected", {
|
|
1488
|
+
sessionId,
|
|
1489
|
+
addr,
|
|
1490
|
+
correlationId: entry.correlationId,
|
|
1491
|
+
});
|
|
1492
|
+
return { ok: true };
|
|
1493
|
+
}
|
|
1494
|
+
catch (err) {
|
|
1495
|
+
// error.message extracted — never [object Object]; try the next addr.
|
|
1496
|
+
lastError = err instanceof Error ? err.message : String(err);
|
|
1497
|
+
}
|
|
1498
|
+
}
|
|
1499
|
+
this.#logger.warn("session.transport.connect.failed", {
|
|
1500
|
+
sessionId,
|
|
1501
|
+
reason: "counterparty_dial_failed",
|
|
1502
|
+
error: lastError,
|
|
1503
|
+
correlationId: entry.correlationId,
|
|
1504
|
+
});
|
|
1505
|
+
return { ok: false, reason: "counterparty_dial_failed", error: lastError };
|
|
1506
|
+
}
|
|
1507
|
+
/**
|
|
1508
|
+
* DAEMON-004: send content over the session node's direct P2P content stream.
|
|
1509
|
+
* On a dead/missing stream this returns a NAMED, diagnosable failure — never a
|
|
1510
|
+
* silent success and never a desync (closing the old silent fire-and-forget
|
|
1511
|
+
* content catch in the retired in-process client send path).
|
|
1512
|
+
*
|
|
1513
|
+
* SCOPE / findings #3 + #4 — what this send path does and does NOT do today:
|
|
1514
|
+
* - #4: it delivers the content over the direct /cello/content/1.0.0 P2P
|
|
1515
|
+
* stream only. It does NOT also submit a K_local-SIGNED content_hash leaf to
|
|
1516
|
+
* the RELAY on /cello/relay/1.0.0 (EARS behavior #1). That relay hash-submit
|
|
1517
|
+
* is MSG-001's scope; AC-001's "relay log shows a hash_submit" evidence is
|
|
1518
|
+
* produced once MSG-001 lands.
|
|
1519
|
+
* - #3: because there is no relay yet, the sequence number cello_send returns
|
|
1520
|
+
* is the LOCAL leaf index, not a relay-assigned canonical global sequence.
|
|
1521
|
+
* Each daemon appends leaves in its own LOCAL observation order, so two
|
|
1522
|
+
* daemons' roots agree only under perfectly ping-ponged traffic. Canonical
|
|
1523
|
+
* cross-process ordering (and thus AC-002 root agreement under concurrent
|
|
1524
|
+
* bidirectional traffic) requires the relay-assigned sequence from MSG-001.
|
|
1525
|
+
*/
|
|
1526
|
+
async sendContent(agentName, sessionId, content, contentHash, correlationId) {
|
|
1527
|
+
const entry = this.#activeNodes.get(this.#k(agentName, sessionId));
|
|
1528
|
+
if (!entry) {
|
|
1529
|
+
return { ok: false, reason: "session_node_unavailable", error: "no active session node for this session" };
|
|
1530
|
+
}
|
|
1531
|
+
// R1 (MSG-001-3b): witness the message-leaf HASH to the relay FIRST, INDEPENDENT of
|
|
1532
|
+
// direct delivery. The relay is the ordering authority (Structure 2): it assigns the
|
|
1533
|
+
// canonical sequence from the hash whether or not the counterparty is reachable for direct
|
|
1534
|
+
// content. So an OFFLINE recipient still gets a sequence, and the parked content is later
|
|
1535
|
+
// recovered AT that sequence (DOD-MSG-4 recovery-not-desync). The relay only ever sees the
|
|
1536
|
+
// hash (INV-3). Best-effort: a relay miss degrades to local-only sequencing. Previously this
|
|
1537
|
+
// ran AFTER a successful direct send, so an offline recipient's content got NO sequence — the
|
|
1538
|
+
// gap R1 closes.
|
|
1539
|
+
// DOD-MSG-4 (self-ordering content frame): the relay's committed ordering record for this leaf,
|
|
1540
|
+
// captured from the hash submit so it can be stamped into the content frame (and the parked
|
|
1541
|
+
// entry). Undefined if the relay is unreachable / an old relay — the receiver then falls back to
|
|
1542
|
+
// the leaf_deliver witness stream / arrival order.
|
|
1543
|
+
let orderingS1;
|
|
1544
|
+
let orderingS2;
|
|
1545
|
+
if (entry.relayClient && entry.relaySessionIdBytes) {
|
|
1546
|
+
try {
|
|
1547
|
+
const witnessed = await entry.relayClient.submitMessageHash(entry.node, entry.relaySessionIdBytes, contentHash);
|
|
1548
|
+
if (witnessed.ok) {
|
|
1549
|
+
orderingS1 = witnessed.structure1_cbor;
|
|
1550
|
+
orderingS2 = witnessed.structure2_cbor;
|
|
1551
|
+
this.#logger.info("session.relay.hash.submitted", {
|
|
1552
|
+
sessionId,
|
|
1553
|
+
sequenceNumber: witnessed.sequence_number,
|
|
1554
|
+
correlationId,
|
|
1555
|
+
});
|
|
1556
|
+
}
|
|
1557
|
+
else {
|
|
1558
|
+
this.#logger.warn("session.relay.hash.submit.failed", {
|
|
1559
|
+
sessionId,
|
|
1560
|
+
reason: witnessed.reason,
|
|
1561
|
+
correlationId,
|
|
1562
|
+
});
|
|
1563
|
+
}
|
|
1564
|
+
}
|
|
1565
|
+
catch (relayErr) {
|
|
1566
|
+
this.#logger.warn("session.relay.hash.submit.failed", {
|
|
1567
|
+
sessionId,
|
|
1568
|
+
reason: relayErr instanceof Error ? relayErr.message : String(relayErr),
|
|
1569
|
+
correlationId,
|
|
1570
|
+
});
|
|
1571
|
+
}
|
|
1572
|
+
}
|
|
1573
|
+
// Attempt direct peer↔peer content delivery. On success the receiver's `persisted` ACK
|
|
1574
|
+
// resolves the awaiting timer; on failure (counterparty offline) the hash is already
|
|
1575
|
+
// witnessed above, so the caller / TTF path parks the SEALED content to the relay
|
|
1576
|
+
// store-and-forward backstop and the recipient recovers it at the witnessed sequence (2b).
|
|
1577
|
+
try {
|
|
1578
|
+
const stream = await entry.node.newStream(entry.counterpartySessionPeerId, CELLO_CONTENT_PROTOCOL_ID);
|
|
1579
|
+
// AC-001/AC-003: arm the TTF tracking BEFORE the frame goes on the wire. The
|
|
1580
|
+
// receiver's `persisted` ACK can come back fast (in-process / low-latency
|
|
1581
|
+
// transports), so registering the awaiting entry after send would let the ACK
|
|
1582
|
+
// race ahead of it and be dropped — the timer would then spuriously fire. The
|
|
1583
|
+
// content is delivered to the wire but NOT yet confirmed persisted; the ACK
|
|
1584
|
+
// resolves it (content.delivery.acked) and TTF expiry hands it to the park
|
|
1585
|
+
// backstop. The correlationId rides in the frame so the receiver's
|
|
1586
|
+
// session.content.received shares ONE flow id with the sender.
|
|
1587
|
+
this.#trackAwaitingAck(agentName, sessionId, content, contentHash, correlationId, orderingS1, orderingS2);
|
|
1588
|
+
const frame = CBOR_ENC.encode({
|
|
1589
|
+
type: "content_frame",
|
|
1590
|
+
session_id: sessionId,
|
|
1591
|
+
content_hash: contentHash,
|
|
1592
|
+
content_bytes: content,
|
|
1593
|
+
correlation_id: correlationId,
|
|
1594
|
+
// DOD-MSG-4 (self-ordering): the relay's signed ordering record, so the receiver verifies +
|
|
1595
|
+
// orders from the frame ALONE (no dependence on the separate leaf_deliver witness timing).
|
|
1596
|
+
// structure1_cbor = sender-signed bytes (verify); structure2_cbor = relay's committed seq +
|
|
1597
|
+
// prev_root (order). Omitted if the relay was unreachable — receiver falls back to the witness.
|
|
1598
|
+
structure1_cbor: orderingS1,
|
|
1599
|
+
structure2_cbor: orderingS2,
|
|
1600
|
+
});
|
|
1601
|
+
stream.send(lp.encode.single(frame));
|
|
1602
|
+
try {
|
|
1603
|
+
await stream.close();
|
|
1604
|
+
}
|
|
1605
|
+
catch { /* best-effort close */ }
|
|
1606
|
+
return { ok: true };
|
|
1607
|
+
}
|
|
1608
|
+
catch (err) {
|
|
1609
|
+
// The send failed after (possibly) arming the awaiting tracking — drop it so a
|
|
1610
|
+
// never-delivered frame does not later fire a spurious TTF park.
|
|
1611
|
+
this.#untrackAwaitingAck(agentName, sessionId, contentHash);
|
|
1612
|
+
// 2b: direct delivery failed (counterparty offline). The hash is already witnessed (R1, the
|
|
1613
|
+
// sequence is assigned), so deposit the content to the relay store-and-forward backstop now;
|
|
1614
|
+
// the recipient pulls + recovers it on next online (DOD-MSG-3/4).
|
|
1615
|
+
this.#parkContent(agentName, sessionId, Buffer.from(contentHash).toString("hex"), content, orderingS1, orderingS2);
|
|
1616
|
+
// error.message extracted — never [object Object]. libp2p/cross-package errors are not
|
|
1617
|
+
// always `instanceof Error` in this realm, so fall back to a message property / JSON.
|
|
1618
|
+
const errMsg = err instanceof Error
|
|
1619
|
+
? err.message
|
|
1620
|
+
: err && typeof err === "object" && typeof err.message === "string"
|
|
1621
|
+
? err.message
|
|
1622
|
+
: (() => {
|
|
1623
|
+
try {
|
|
1624
|
+
return JSON.stringify(err);
|
|
1625
|
+
}
|
|
1626
|
+
catch {
|
|
1627
|
+
return String(err);
|
|
1628
|
+
}
|
|
1629
|
+
})();
|
|
1630
|
+
return { ok: false, reason: "session_stream_unavailable", error: errMsg };
|
|
1631
|
+
}
|
|
1632
|
+
}
|
|
1633
|
+
/**
|
|
1634
|
+
* M7 DOD-SPINE-7: submit THIS party's SEAL ctrl leaf (0x02) to the relay witness.
|
|
1635
|
+
* Structure: content_hash = SHA-256(0x02 || encodeSealPayload({session_id, final_root,
|
|
1636
|
+
* close_timestamp, "PENDING"})), where final_root is the daemon's OWN tree root. Two
|
|
1637
|
+
* distinct-sender SEAL leaves in the relay's log trigger the relay's #maybeProcessSeal
|
|
1638
|
+
* → directory processSeal (rebuild + verify the signed chain) → FROST notarization →
|
|
1639
|
+
* session_sealed. Requires an active relay client; the caller falls back to the
|
|
1640
|
+
* directory-mediated path when this returns relay_unavailable.
|
|
1641
|
+
*/
|
|
1642
|
+
async submitSealLeaf(agentName, sessionId, correlationId) {
|
|
1643
|
+
const sealKey = this.#k(agentName, sessionId);
|
|
1644
|
+
const entry = this.#activeNodes.get(sealKey);
|
|
1645
|
+
if (!entry)
|
|
1646
|
+
return { ok: false, reason: "session_node_unavailable" };
|
|
1647
|
+
if (!entry.relayClient || !entry.relaySessionIdBytes)
|
|
1648
|
+
return { ok: false, reason: "relay_unavailable" };
|
|
1649
|
+
// M7-UPGRADE-002 idempotency: this party submits its responder SEAL leaf AT MOST ONCE per
|
|
1650
|
+
// session. BOTH cello_close_session and the auto-acknowledge path call here; the first to reach
|
|
1651
|
+
// this point wins, the second short-circuits. The check+set is SYNCHRONOUS (before any await) so
|
|
1652
|
+
// two near-simultaneous triggers (e.g. B's own close racing A's delivered SEAL ctrl leaf) cannot
|
|
1653
|
+
// both submit. Cleared below on a relay submit failure so a genuine retry can proceed.
|
|
1654
|
+
if (this.#responderSealSubmitted.has(sealKey)) {
|
|
1655
|
+
return { ok: false, reason: "responder_seal_already_submitted" };
|
|
1656
|
+
}
|
|
1657
|
+
this.#responderSealSubmitted.add(sealKey);
|
|
1658
|
+
const finalRootHex = this.getSessionTreeRootHex(agentName, sessionId);
|
|
1659
|
+
const sealPayload = encodeSealPayload({
|
|
1660
|
+
session_id: entry.relaySessionIdBytes,
|
|
1661
|
+
final_root: new Uint8Array(Buffer.from(finalRootHex, "hex")),
|
|
1662
|
+
close_timestamp: Date.now(),
|
|
1663
|
+
attestation: "PENDING",
|
|
1664
|
+
});
|
|
1665
|
+
// content_hash = SHA-256(0x02 || seal_payload) — the ctrl leaf kind byte is 0x02.
|
|
1666
|
+
const contentHash = new Uint8Array(createHash("sha256").update(new Uint8Array([LEAF_KIND_CTRL])).update(sealPayload).digest());
|
|
1667
|
+
const result = await entry.relayClient.submitLeaf(entry.node, entry.relaySessionIdBytes, contentHash, LEAF_KIND_CTRL);
|
|
1668
|
+
if (!result.ok) {
|
|
1669
|
+
// Clear the idempotency mark so a genuine retry (agent close / reconnect) can proceed (DB-001).
|
|
1670
|
+
this.#responderSealSubmitted.delete(sealKey);
|
|
1671
|
+
this.#logger.warn("session.seal.leaf.submit.failed", { sessionId, reason: result.reason, correlationId });
|
|
1672
|
+
return { ok: false, reason: result.reason };
|
|
1673
|
+
}
|
|
1674
|
+
// SESSION-002: the reported_root for a unilateral seal is the content-hash root the
|
|
1675
|
+
// local tree WOULD have with this SEAL ctrl leaf appended — the same root the directory
|
|
1676
|
+
// rebuilds from the relay's content-hash chain (the relay records the identical
|
|
1677
|
+
// content_hash for this ctrl leaf). Computed without mutating the durable tree /
|
|
1678
|
+
// message_count, so the bilateral + interrupted seal paths are unaffected.
|
|
1679
|
+
const contentHashHex = Buffer.from(contentHash).toString("hex");
|
|
1680
|
+
const reportedRootHex = this.getSessionTree(agentName, sessionId).rootWithAppendedHex(contentHashHex);
|
|
1681
|
+
this.#logger.info("session.seal.leaf.submitted", {
|
|
1682
|
+
sessionId,
|
|
1683
|
+
sequenceNumber: result.sequence_number,
|
|
1684
|
+
correlationId,
|
|
1685
|
+
});
|
|
1686
|
+
// M7-UPGRADE-002: #responderSealSubmitted was set synchronously at the top of this method —
|
|
1687
|
+
// the guard now blocks any second submit (auto-ack OR a redelivered counterparty SEAL ctrl leaf).
|
|
1688
|
+
return { ok: true, sequenceNumber: result.sequence_number, reportedRootHex };
|
|
1689
|
+
}
|
|
1690
|
+
/**
|
|
1691
|
+
* CELLO-M7-UPGRADE-001 (DOD-UP-1): readiness of a session for B to RATIFY a unilateral seal
|
|
1692
|
+
* (the returning absent party). This is the SAME verifiability bar as the UP-2 auto-ack gate:
|
|
1693
|
+
*
|
|
1694
|
+
* - `known`: the session exists locally with its content (B has a transcript to ratify). After a
|
|
1695
|
+
* restart B reloads it from SQLite, and autoRecoverForAgent re-pulls any parked content first.
|
|
1696
|
+
* - `tampered`: the content cross-check flagged a content_hash mismatch (#contentDesynced) — B
|
|
1697
|
+
* must NEVER ratify content it could not integrity-verify (the KERNEL refusal, AC-003).
|
|
1698
|
+
*
|
|
1699
|
+
* The directory separately verifies B's ack signature is genuine; B separately verifies the
|
|
1700
|
+
* unilateral cert signature (R1 is authentic). NOTE: a full "B's frontier covers R1's tail"
|
|
1701
|
+
* completeness check (the `desynced` reason) requires the deferred MSG-001-3b canonical-sequence
|
|
1702
|
+
* reconciliation — same documented limitation as the UP-2 gate above.
|
|
1703
|
+
*/
|
|
1704
|
+
getSealUpgradeReadiness(agentName, sessionId) {
|
|
1705
|
+
const record = this.getSessionRecord(agentName, sessionId);
|
|
1706
|
+
return {
|
|
1707
|
+
known: !!record,
|
|
1708
|
+
tampered: this.#contentDesynced.has(this.#k(agentName, sessionId)),
|
|
1709
|
+
};
|
|
1710
|
+
}
|
|
1711
|
+
/**
|
|
1712
|
+
* M7-UPGRADE-002: auto-acknowledge close (POSTMORTEM Workstream E / C-5). When B's daemon
|
|
1713
|
+
* ingests the COUNTERPARTY's SEAL control leaf and B has verified the content, B's OWN node
|
|
1714
|
+
* auto-co-signs + submits its responder SEAL leaf WITHOUT waiting for B's agent to call
|
|
1715
|
+
* cello_close_session — so a bilateral seal completes promptly instead of degrading to
|
|
1716
|
+
* unilateral on a slow/busy/crashed agent.
|
|
1717
|
+
*
|
|
1718
|
+
* SI-001 (non-negotiable): B's signature is ALWAYS produced by B's own node — submitSealLeaf
|
|
1719
|
+
* signs the responder SEAL leaf with B's K_local. We remove the agent PROMPT, never the SIGNER;
|
|
1720
|
+
* nothing here lets the directory or the peer synthesize B's acknowledgement.
|
|
1721
|
+
*
|
|
1722
|
+
* SI-002 (verifiability gate): auto-ack ONLY content B has verified. A session whose content
|
|
1723
|
+
* cross-check failed (content_hash_mismatch = tamper, recorded in #contentDesynced) is NEVER
|
|
1724
|
+
* auto-signed — it surfaces to the agent as a genuine decision point. DISAGREEMENT with the
|
|
1725
|
+
* content is NOT a gate failure (C-6): the gate is "can I verify integrity?", never "do I agree?"
|
|
1726
|
+
* — a verified-but-disliked tail is auto-sealed and the transcript speaks for B.
|
|
1727
|
+
*
|
|
1728
|
+
* Idempotent + non-throwing: marks #responderSealSubmitted BEFORE the async submit so a
|
|
1729
|
+
* redelivered ctrl leaf cannot double-submit; clears the mark on submit failure so a later
|
|
1730
|
+
* agent close / reconnect can still complete the seal (DB-001 — never a silent half-seal).
|
|
1731
|
+
*/
|
|
1732
|
+
#maybeAutoAcknowledgeSeal(agentName, sessionId, correlationId) {
|
|
1733
|
+
const ackKey = this.#k(agentName, sessionId);
|
|
1734
|
+
// Idempotency: at most one responder seal per session (auto-ack or agent close).
|
|
1735
|
+
if (this.#responderSealSubmitted.has(ackKey))
|
|
1736
|
+
return;
|
|
1737
|
+
const record = this.getSessionRecord(agentName, sessionId);
|
|
1738
|
+
// Only an ACTIVE session auto-acks. A committed/sealing/sealed/interrupted session is out of
|
|
1739
|
+
// scope (already sealing, or needs the interrupted/upgrade path), not an auto-ack candidate.
|
|
1740
|
+
if (!record || record.status !== "active")
|
|
1741
|
+
return;
|
|
1742
|
+
// SI-002 verifiability gate: never auto-sign a session whose content we could not verify.
|
|
1743
|
+
// Today the ONLY tracked unverifiable cause is a content_hash mismatch = TAMPER (#contentDesynced
|
|
1744
|
+
// is set only there). Genuine tamper is a SECURITY event — log it at ERROR with the distinct
|
|
1745
|
+
// reason `content_tamper` so the AC-008 tamper alarm can fire (it keys on that reason). The other
|
|
1746
|
+
// two specced reasons — `desynced` (B's tree is behind the canonical sealed tail) and
|
|
1747
|
+
// `content_unverifiable` (parked content unrecoverable) — require the MSG-001-3b canonical-
|
|
1748
|
+
// sequence reconciliation that is deferred; they are reserved for that follow-on.
|
|
1749
|
+
if (this.#contentDesynced.has(ackKey)) {
|
|
1750
|
+
this.#logger.error("session.seal.autoack.skipped", {
|
|
1751
|
+
sessionId,
|
|
1752
|
+
reason: "content_tamper",
|
|
1753
|
+
correlationId,
|
|
1754
|
+
});
|
|
1755
|
+
// AC-002: the verifiability gate refused — surface counterparty_closing to B's agent as a
|
|
1756
|
+
// GENUINE decision point (the seal will not auto-complete; B must decide). Uses the existing
|
|
1757
|
+
// session-state push to the live MCP clients; best-effort (never throws out of this gate).
|
|
1758
|
+
try {
|
|
1759
|
+
this.#onSessionStateChanged?.(record.agent_name, sessionId, "counterparty_closing", record.counterparty_pubkey);
|
|
1760
|
+
}
|
|
1761
|
+
catch (err) {
|
|
1762
|
+
this.#logger.debug("session.state.notify.failed", {
|
|
1763
|
+
sessionId,
|
|
1764
|
+
reason: err instanceof Error ? err.message : String(err),
|
|
1765
|
+
});
|
|
1766
|
+
}
|
|
1767
|
+
return;
|
|
1768
|
+
}
|
|
1769
|
+
const entry = this.#activeNodes.get(ackKey);
|
|
1770
|
+
const responderPubkey = entry?.relayClient?.senderPubkeyHex ?? "unknown";
|
|
1771
|
+
// submitSealLeaf owns the #responderSealSubmitted idempotency mark (set synchronously at its
|
|
1772
|
+
// top), so the auto-ack does not pre-mark — it just reacts to the result.
|
|
1773
|
+
void this.submitSealLeaf(agentName, sessionId, correlationId)
|
|
1774
|
+
.then((result) => {
|
|
1775
|
+
if (result.ok) {
|
|
1776
|
+
// SI-001: the responder SEAL leaf was signed by B's OWN node (K_local) in submitSealLeaf.
|
|
1777
|
+
this.#logger.info("session.seal.autoacknowledged", {
|
|
1778
|
+
sessionId,
|
|
1779
|
+
responderPubkey,
|
|
1780
|
+
correlationId,
|
|
1781
|
+
});
|
|
1782
|
+
}
|
|
1783
|
+
else if (result.reason === "responder_seal_already_submitted") {
|
|
1784
|
+
// B's agent close already submitted the responder seal (it won the race) — nothing to do.
|
|
1785
|
+
return;
|
|
1786
|
+
}
|
|
1787
|
+
else {
|
|
1788
|
+
// Submission failed (e.g. relay path down) — the agent close / reconnect can still
|
|
1789
|
+
// complete the seal; never a silent half-seal (DB-001).
|
|
1790
|
+
this.#logger.warn("session.seal.autoack.skipped", {
|
|
1791
|
+
sessionId,
|
|
1792
|
+
reason: result.reason,
|
|
1793
|
+
correlationId,
|
|
1794
|
+
});
|
|
1795
|
+
}
|
|
1796
|
+
})
|
|
1797
|
+
.catch((err) => {
|
|
1798
|
+
this.#logger.warn("session.seal.autoack.skipped", {
|
|
1799
|
+
sessionId,
|
|
1800
|
+
reason: err instanceof Error ? err.message : String(err),
|
|
1801
|
+
correlationId,
|
|
1802
|
+
});
|
|
1803
|
+
});
|
|
1804
|
+
}
|
|
1805
|
+
/**
|
|
1806
|
+
* DAEMON-004: cross-check received content against its hash, append the
|
|
1807
|
+
* verified leaf to the daemon-owned tree, and buffer it for cello_receive.
|
|
1808
|
+
* A hash MISMATCH is genuine tamper — rejected without append or buffer.
|
|
1809
|
+
*
|
|
1810
|
+
* SCOPE / finding #5 — what this cross-check does and does NOT prove today:
|
|
1811
|
+
* `contentHash` here is carried in the SAME content_frame as `content`, so this
|
|
1812
|
+
* comparison only catches wire corruption of a single frame — it does NOT prove
|
|
1813
|
+
* the content matches what the sender independently committed. Full tamper-
|
|
1814
|
+
* evidence (EARS behavior #2) requires cross-checking against the K_local-signed
|
|
1815
|
+
* content_hash leaf the sender submits to the RELAY on a separate channel; that
|
|
1816
|
+
* relay hash-submit path is MSG-001's scope and does not exist yet. Until MSG-001
|
|
1817
|
+
* lands, a malicious sender that sends matching (content, hash) in one frame is
|
|
1818
|
+
* not detected here — only the relay-relayed signed leaf closes that gap.
|
|
1819
|
+
*
|
|
1820
|
+
* @returns the appended leaf index (as sequenceNumber) on success.
|
|
1821
|
+
*/
|
|
1822
|
+
ingestReceivedContent(agentName, sessionId, content, contentHash, correlationId) {
|
|
1823
|
+
// The transcript is frozen ONLY once it is COMMITTED + signed — 'sealed' or
|
|
1824
|
+
// 'seal_interrupted_pending' (the bilateral seal commitment) — because a later FROST
|
|
1825
|
+
// notarization attests that exact root; a late leaf would diverge from it.
|
|
1826
|
+
//
|
|
1827
|
+
// MSG-001-3b recovery: a merely 'interrupted' session is NOT yet committed. The
|
|
1828
|
+
// counterparty's last message(s) may have been parked while this party was offline, so its
|
|
1829
|
+
// local transcript is INCOMPLETE (not frozen-final). Recovering that parked content COMPLETES
|
|
1830
|
+
// the local view to match the counterparty BEFORE the bilateral seal — it is not a resumption
|
|
1831
|
+
// (no new activity, no re-accept) and its root was never committed. So allow 'active' AND
|
|
1832
|
+
// 'interrupted'; reject only the two committed states. (No DB row = test-only path, allowed.)
|
|
1833
|
+
const record = this.getSessionRecord(agentName, sessionId);
|
|
1834
|
+
if (record && (record.status === "sealed" || record.status === "seal_interrupted_pending")) {
|
|
1835
|
+
this.#logger.warn("session.content.cross_check.failed", {
|
|
1836
|
+
sessionId,
|
|
1837
|
+
reason: "session_committed",
|
|
1838
|
+
currentStatus: record.status,
|
|
1839
|
+
correlationId,
|
|
1840
|
+
});
|
|
1841
|
+
return { ok: false, reason: "session_committed" };
|
|
1842
|
+
}
|
|
1843
|
+
const computed = createHash("sha256").update(new Uint8Array([0x00])).update(content).digest();
|
|
1844
|
+
const contentHashHex = Buffer.from(contentHash).toString("hex");
|
|
1845
|
+
if (Buffer.from(computed).toString("hex") !== contentHashHex) {
|
|
1846
|
+
this.#logger.warn("session.content.cross_check.failed", {
|
|
1847
|
+
sessionId,
|
|
1848
|
+
reason: "content_hash_mismatch",
|
|
1849
|
+
correlationId,
|
|
1850
|
+
});
|
|
1851
|
+
// M7-UPGRADE-002 (SI-002): a tamper makes this session's content unverifiable — the
|
|
1852
|
+
// auto-acknowledge gate must never auto-co-sign it. The session stays alive (DOD-MSG-7),
|
|
1853
|
+
// but the responder seal now requires the agent's explicit decision, not an auto-ack.
|
|
1854
|
+
this.#contentDesynced.add(this.#k(agentName, sessionId));
|
|
1855
|
+
return { ok: false, reason: "content_hash_mismatch" };
|
|
1856
|
+
}
|
|
1857
|
+
const entry = this.#activeNodes.get(this.#k(agentName, sessionId));
|
|
1858
|
+
const senderPubkey = entry?.counterpartyPubkey
|
|
1859
|
+
?? this.getSessionRecord(agentName, sessionId)?.counterparty_pubkey
|
|
1860
|
+
?? "unknown";
|
|
1861
|
+
// DOD-MSG-5: a content_hash satisfies AT MOST ONE Merkle leaf, exactly once. If this hash is
|
|
1862
|
+
// already a leaf in the tree — it arrived BOTH directly and via the relay-park backstop, or it
|
|
1863
|
+
// is a replay — do NOT append a second leaf and do NOT double-count it. The recipient already
|
|
1864
|
+
// holds this message at its assigned sequence. (In the normal single-delivery case this find is
|
|
1865
|
+
// -1, so the live/recover append paths are unchanged.)
|
|
1866
|
+
const existingIdx = this.getSessionTree(agentName, sessionId).indexOfHash(contentHashHex);
|
|
1867
|
+
if (existingIdx >= 0) {
|
|
1868
|
+
this.#logger.info("session.content.deduplicated", {
|
|
1869
|
+
sessionId,
|
|
1870
|
+
contentHashHex,
|
|
1871
|
+
sequenceNumber: existingIdx,
|
|
1872
|
+
correlationId,
|
|
1873
|
+
});
|
|
1874
|
+
// appendedCount 0 — a dedup appends NO new leaf, so a recover that re-pulls an already-ingested
|
|
1875
|
+
// entry (e.g. after auto-recover already drained it) must not count it as a fresh recovery.
|
|
1876
|
+
return { ok: true, leafIndex: existingIdx, sequenceNumber: existingIdx, appendedCount: 0 };
|
|
1877
|
+
}
|
|
1878
|
+
// DOD-MSG-4 (strict in-order gate): the RELAY is the ordering authority. If B holds the
|
|
1879
|
+
// canonical sequence for this hash (witnessed via leaf_deliver) and it is AHEAD of the next
|
|
1880
|
+
// expected leaf, HOLD the content rather than append it out of order. The missing in-between
|
|
1881
|
+
// sequence(s) are recovered from the relay mailbox; #releaseHeld then drains the held entries
|
|
1882
|
+
// in canonical order. This keeps the daemon-owned leaf index === the canonical sequence by
|
|
1883
|
+
// construction, so two parties' roots match even when direct delivery and park-recovery
|
|
1884
|
+
// interleave. With NO witness for this hash (relay-degraded) B falls back to arrival-order
|
|
1885
|
+
// append — the pre-MSG-4 behavior (no ordering signal available).
|
|
1886
|
+
const key = this.#k(agentName, sessionId);
|
|
1887
|
+
const canonicalSeq = this.#witnessedSeq.get(key)?.get(contentHashHex);
|
|
1888
|
+
const nextExpected = this.getSessionTree(agentName, sessionId).size();
|
|
1889
|
+
if (canonicalSeq !== undefined && canonicalSeq > nextExpected) {
|
|
1890
|
+
let held = this.#heldContent.get(key);
|
|
1891
|
+
if (!held) {
|
|
1892
|
+
held = new Map();
|
|
1893
|
+
this.#heldContent.set(key, held);
|
|
1894
|
+
}
|
|
1895
|
+
held.set(canonicalSeq, { content, contentHashHex, correlationId });
|
|
1896
|
+
this.#logger.info("session.content.held", {
|
|
1897
|
+
sessionId,
|
|
1898
|
+
canonicalSeq,
|
|
1899
|
+
nextExpected,
|
|
1900
|
+
gap: canonicalSeq - nextExpected,
|
|
1901
|
+
correlationId,
|
|
1902
|
+
});
|
|
1903
|
+
// Held content is NOT yet a durable leaf, so it is deliberately NOT acknowledged `persisted`
|
|
1904
|
+
// (the caller checks `held`). The sender's TTF→park backstop and the recover/dedup path
|
|
1905
|
+
// guarantee eventual delivery; B never claims persisted for content it only holds in memory.
|
|
1906
|
+
return { ok: true, leafIndex: canonicalSeq, sequenceNumber: canonicalSeq, held: true };
|
|
1907
|
+
}
|
|
1908
|
+
if (canonicalSeq !== undefined && canonicalSeq < nextExpected) {
|
|
1909
|
+
// Contradiction (review finding #2): the witness says this hash belongs BEHIND the current
|
|
1910
|
+
// tree, yet the dedup scan above found no existing leaf for it — so it is neither a duplicate
|
|
1911
|
+
// nor in canonical order. This is only reachable via the accepted content-before-witness /
|
|
1912
|
+
// relay-degraded interleaving (the next sub-increment's pending-witness buffer closes it). Log
|
|
1913
|
+
// it loudly (the leaf-index===sequence invariant is at risk) and append rather than DROP the
|
|
1914
|
+
// message — losing content is worse than a transient mis-order the seal cross-check will catch.
|
|
1915
|
+
this.#logger.warn("session.content.sequence_behind_tree", {
|
|
1916
|
+
sessionId,
|
|
1917
|
+
canonicalSeq,
|
|
1918
|
+
nextExpected,
|
|
1919
|
+
correlationId,
|
|
1920
|
+
});
|
|
1921
|
+
}
|
|
1922
|
+
const { leafIndex } = this.#appendVerifiedContent(agentName, sessionId, content, contentHashHex, senderPubkey, correlationId);
|
|
1923
|
+
// A just-appended leaf may unblock held out-of-order arrivals whose turn is now next.
|
|
1924
|
+
// appendedCount = this leaf + any held leaves released by it, so a caller (recover) can tally the
|
|
1925
|
+
// leaves ACTUALLY written, not just the directly-ingested one (review #3).
|
|
1926
|
+
const released = this.#releaseHeld(agentName, sessionId, senderPubkey);
|
|
1927
|
+
return { ok: true, leafIndex, sequenceNumber: leafIndex, appendedCount: 1 + released };
|
|
1928
|
+
}
|
|
1929
|
+
/**
|
|
1930
|
+
* DOD-MSG-4: record the relay-witnessed canonical sequence for a content hash. The relay is the
|
|
1931
|
+
* ordering authority (Structure 2): it assigns each message a sequence from its hash and delivers
|
|
1932
|
+
* B the (content_hash -> sequence) binding via leaf_deliver. The strict-in-order gate orders the
|
|
1933
|
+
* transcript by THIS — never a sender-stamped field. Also advances the per-session high-water mark
|
|
1934
|
+
* (the largest witnessed sequence) reserved for the future catch-up-before-live increment. Idempotent.
|
|
1935
|
+
*/
|
|
1936
|
+
recordWitnessedSequence(agentName, sessionId, contentHashHex, sequenceNumber) {
|
|
1937
|
+
if (sequenceNumber < 0)
|
|
1938
|
+
return;
|
|
1939
|
+
const key = this.#k(agentName, sessionId);
|
|
1940
|
+
let map = this.#witnessedSeq.get(key);
|
|
1941
|
+
if (!map) {
|
|
1942
|
+
map = new Map();
|
|
1943
|
+
this.#witnessedSeq.set(key, map);
|
|
1944
|
+
}
|
|
1945
|
+
map.set(contentHashHex, sequenceNumber);
|
|
1946
|
+
const hw = this.#highWaterSeq.get(key) ?? -1;
|
|
1947
|
+
if (sequenceNumber > hw)
|
|
1948
|
+
this.#highWaterSeq.set(key, sequenceNumber);
|
|
1949
|
+
}
|
|
1950
|
+
/**
|
|
1951
|
+
* DOD-MSG-4: the relay's high-water canonical sequence for this session (largest witnessed leaf),
|
|
1952
|
+
* or -1 if none. Exposed for the next sub-increment (catch-up-before-live — on reconnect B holds
|
|
1953
|
+
* live arrivals until its tree reaches this, because it has more to recover than it has appended);
|
|
1954
|
+
* NOT yet consumed by the gate. Also `recordWitnessedSequence` maintains it.
|
|
1955
|
+
*/
|
|
1956
|
+
getHighWaterSeq(agentName, sessionId) {
|
|
1957
|
+
return this.#highWaterSeq.get(this.#k(agentName, sessionId)) ?? -1;
|
|
1958
|
+
}
|
|
1959
|
+
/** DOD-MSG-4 / DAEMON-004: append a verified message leaf and buffer it for cello_receive. */
|
|
1960
|
+
#appendVerifiedContent(agentName, sessionId, content, contentHashHex, senderPubkey, correlationId) {
|
|
1961
|
+
const { leafIndex } = this.appendSessionLeaf(agentName, sessionId, "msg", contentHashHex, correlationId);
|
|
1962
|
+
// DOD-LOG-1: persist the readable RECEIVED plaintext to the durable transcript, keyed by the
|
|
1963
|
+
// canonical leaf sequence so it joins the committed hash chain (survives restart; INV-3 — the
|
|
1964
|
+
// relay/directory never see this plaintext, only the hash).
|
|
1965
|
+
this.recordTranscriptMessage(agentName, sessionId, leafIndex, "received", content, correlationId);
|
|
1966
|
+
const recvKey = this.#k(agentName, sessionId);
|
|
1967
|
+
// Review finding #6: the witness for this hash has done its ordering job once the leaf is
|
|
1968
|
+
// appended — drop it so #witnessedSeq stays proportional to held/pending content, not the whole
|
|
1969
|
+
// transcript. A later replay of the same hash is still caught by the dedup leaf-scan, which is
|
|
1970
|
+
// independent of the witness map.
|
|
1971
|
+
this.#witnessedSeq.get(recvKey)?.delete(contentHashHex);
|
|
1972
|
+
let buf = this.#receivedContent.get(recvKey);
|
|
1973
|
+
if (!buf) {
|
|
1974
|
+
buf = [];
|
|
1975
|
+
this.#receivedContent.set(recvKey, buf);
|
|
1976
|
+
}
|
|
1977
|
+
buf.push({ contentHex: Buffer.from(content).toString("hex"), senderPubkey, sequenceNumber: leafIndex });
|
|
1978
|
+
this.#logger.info("session.content.received", {
|
|
1979
|
+
sessionId,
|
|
1980
|
+
senderPubkey,
|
|
1981
|
+
contentHashHex,
|
|
1982
|
+
sequenceNumber: leafIndex,
|
|
1983
|
+
correlationId,
|
|
1984
|
+
});
|
|
1985
|
+
return { leafIndex };
|
|
1986
|
+
}
|
|
1987
|
+
/**
|
|
1988
|
+
* DOD-MSG-4: drain held out-of-order content in canonical order. After a leaf is appended, any
|
|
1989
|
+
* held entry whose canonical sequence equals the new next-expected index is now in order — append
|
|
1990
|
+
* it, then check again (a single fill can release a run of consecutive held messages).
|
|
1991
|
+
*/
|
|
1992
|
+
#releaseHeld(agentName, sessionId, senderPubkey) {
|
|
1993
|
+
const key = this.#k(agentName, sessionId);
|
|
1994
|
+
const held = this.#heldContent.get(key);
|
|
1995
|
+
if (!held)
|
|
1996
|
+
return 0;
|
|
1997
|
+
let released = 0;
|
|
1998
|
+
for (;;) {
|
|
1999
|
+
const nextExpected = this.getSessionTree(agentName, sessionId).size();
|
|
2000
|
+
const entry = held.get(nextExpected);
|
|
2001
|
+
if (!entry)
|
|
2002
|
+
break;
|
|
2003
|
+
held.delete(nextExpected);
|
|
2004
|
+
this.#appendVerifiedContent(agentName, sessionId, entry.content, entry.contentHashHex, senderPubkey, entry.correlationId);
|
|
2005
|
+
released++;
|
|
2006
|
+
this.#logger.info("session.content.released", {
|
|
2007
|
+
sessionId,
|
|
2008
|
+
sequenceNumber: nextExpected,
|
|
2009
|
+
correlationId: entry.correlationId,
|
|
2010
|
+
});
|
|
2011
|
+
if (held.size === 0) {
|
|
2012
|
+
this.#heldContent.delete(key);
|
|
2013
|
+
break;
|
|
2014
|
+
}
|
|
2015
|
+
}
|
|
2016
|
+
return released;
|
|
2017
|
+
}
|
|
2018
|
+
/** DAEMON-004: pop the oldest verified received content for cello_receive. */
|
|
2019
|
+
takeReceivedContent(agentName, sessionId) {
|
|
2020
|
+
const buf = this.#receivedContent.get(this.#k(agentName, sessionId));
|
|
2021
|
+
if (!buf || buf.length === 0)
|
|
2022
|
+
return null;
|
|
2023
|
+
return buf.shift() ?? null;
|
|
2024
|
+
}
|
|
2025
|
+
// ─── CELLO-M7-MSG-001: delivery ACK / TTF tracking (send side) ──────────────
|
|
2026
|
+
/**
|
|
2027
|
+
* Arm awaiting-ACK tracking for a just-sent content frame (AC-001/AC-003). Records
|
|
2028
|
+
* the content + a TTF timer keyed by content hash; a `persisted` ACK on the inbound
|
|
2029
|
+
* content stream resolves it, TTF expiry hands it to the park backstop. The timer is
|
|
2030
|
+
* `unref`'d so an in-flight wait never keeps the daemon process (or a test runner)
|
|
2031
|
+
* alive on its own.
|
|
2032
|
+
*/
|
|
2033
|
+
#trackAwaitingAck(agentName, sessionId, content, contentHash, correlationId, structure1Cbor, structure2Cbor) {
|
|
2034
|
+
const hashHex = Buffer.from(contentHash).toString("hex");
|
|
2035
|
+
const ackKey = this.#k(agentName, sessionId);
|
|
2036
|
+
let bySession = this.#awaitingAck.get(ackKey);
|
|
2037
|
+
if (!bySession) {
|
|
2038
|
+
bySession = new Map();
|
|
2039
|
+
this.#awaitingAck.set(ackKey, bySession);
|
|
2040
|
+
}
|
|
2041
|
+
// Replace any prior timer for the same (session, hash) so we never leak a timer.
|
|
2042
|
+
const prior = bySession.get(hashHex);
|
|
2043
|
+
if (prior)
|
|
2044
|
+
clearTimeout(prior.timer);
|
|
2045
|
+
const timer = setTimeout(() => {
|
|
2046
|
+
this.#handleTtfExpiry(agentName, sessionId, hashHex);
|
|
2047
|
+
}, this.#contentTtfMs);
|
|
2048
|
+
if (typeof timer.unref === "function")
|
|
2049
|
+
timer.unref();
|
|
2050
|
+
// DOD-MSG-4 (2b, review #1): retain the relay's ordering record so a TTF-triggered park carries
|
|
2051
|
+
// it too (not only the direct-dial-fail park) — so a TTF-parked entry is self-ordering on recover.
|
|
2052
|
+
bySession.set(hashHex, { timer, content, correlationId, structure1Cbor, structure2Cbor });
|
|
2053
|
+
}
|
|
2054
|
+
/**
|
|
2055
|
+
* Resolve an awaiting-ACK entry on a `persisted` delivery ACK (AC-001/AC-002): cancel
|
|
2056
|
+
* the TTF timer, emit content.delivery.acked, and clear the durable backstop entry.
|
|
2057
|
+
* A `received`-level ACK is NOT handled here — the protocol acts on `persisted` only,
|
|
2058
|
+
* so a received ACK leaves the timer armed.
|
|
2059
|
+
*/
|
|
2060
|
+
#resolveAwaitingAck(agentName, sessionId, contentHash) {
|
|
2061
|
+
const hashHex = Buffer.from(contentHash).toString("hex");
|
|
2062
|
+
const ackKey = this.#k(agentName, sessionId);
|
|
2063
|
+
const bySession = this.#awaitingAck.get(ackKey);
|
|
2064
|
+
const entry = bySession?.get(hashHex);
|
|
2065
|
+
if (!entry || !bySession)
|
|
2066
|
+
return; // unknown / already resolved — idempotent
|
|
2067
|
+
clearTimeout(entry.timer);
|
|
2068
|
+
bySession.delete(hashHex);
|
|
2069
|
+
if (bySession.size === 0)
|
|
2070
|
+
this.#awaitingAck.delete(ackKey);
|
|
2071
|
+
this.#logger.info("content.delivery.acked", {
|
|
2072
|
+
sessionId,
|
|
2073
|
+
contentHash: hashHex,
|
|
2074
|
+
level: "persisted",
|
|
2075
|
+
correlationId: entry.correlationId,
|
|
2076
|
+
});
|
|
2077
|
+
// Clear the durable crash-backstop entry so the startup flush does not re-park
|
|
2078
|
+
// already-delivered content.
|
|
2079
|
+
try {
|
|
2080
|
+
this.#onAwaitingPersisted?.(agentName, sessionId, hashHex);
|
|
2081
|
+
}
|
|
2082
|
+
catch (err) {
|
|
2083
|
+
this.#logger.error("content.delivery.ack.backstop.failed", {
|
|
2084
|
+
sessionId, contentHash: hashHex, error: err instanceof Error ? err.message : String(err),
|
|
2085
|
+
});
|
|
2086
|
+
}
|
|
2087
|
+
}
|
|
2088
|
+
/**
|
|
2089
|
+
* TTF timer fired with no `persisted` ACK (AC-003/AC-019): hand the un-acked content
|
|
2090
|
+
* to the park backstop (the durable retry_queue today; the relay store-and-forward
|
|
2091
|
+
* deposit in 3b). The session is never killed and the operator is never interrupted —
|
|
2092
|
+
* parking is best-effort durability.
|
|
2093
|
+
*/
|
|
2094
|
+
#handleTtfExpiry(agentName, sessionId, hashHex) {
|
|
2095
|
+
const ackKey = this.#k(agentName, sessionId);
|
|
2096
|
+
const bySession = this.#awaitingAck.get(ackKey);
|
|
2097
|
+
const entry = bySession?.get(hashHex);
|
|
2098
|
+
if (!entry || !bySession)
|
|
2099
|
+
return;
|
|
2100
|
+
bySession.delete(hashHex);
|
|
2101
|
+
if (bySession.size === 0)
|
|
2102
|
+
this.#awaitingAck.delete(ackKey);
|
|
2103
|
+
this.#logger.debug("content.delivery.ttf_expired", { sessionId, contentHash: hashHex });
|
|
2104
|
+
try {
|
|
2105
|
+
this.#onAwaitingTtf?.(agentName, sessionId, hashHex, entry.content);
|
|
2106
|
+
}
|
|
2107
|
+
catch (err) {
|
|
2108
|
+
this.#logger.error("content.park.backstop.failed", {
|
|
2109
|
+
sessionId, contentHash: hashHex, error: err instanceof Error ? err.message : String(err),
|
|
2110
|
+
});
|
|
2111
|
+
}
|
|
2112
|
+
// 2b: delivered to the wire but never confirmed `persisted` — deposit it to the relay
|
|
2113
|
+
// store-and-forward so the recipient recovers it (at the witnessed sequence). The durable
|
|
2114
|
+
// awaiting entry above remains the crash backstop. Carry the retained ordering record (review #1)
|
|
2115
|
+
// so a TTF-parked entry self-orders on recover, exactly like the direct-dial-fail park.
|
|
2116
|
+
this.#parkContent(agentName, sessionId, hashHex, entry.content, entry.structure1Cbor, entry.structure2Cbor);
|
|
2117
|
+
}
|
|
2118
|
+
/**
|
|
2119
|
+
* Send an unsigned `persisted` delivery ACK back to the sender over the same
|
|
2120
|
+
* /cello/content/1.0.0 protocol (AC-001). Best-effort: authentication is the Noise
|
|
2121
|
+
* session channel, so the ACK carries no signature; a failed ACK send is logged and
|
|
2122
|
+
* the sender recovers via its TTF/recovery path rather than a thrown error here.
|
|
2123
|
+
*/
|
|
2124
|
+
async #sendDeliveryAck(agentName, sessionId, contentHash, correlationId) {
|
|
2125
|
+
const entry = this.#activeNodes.get(this.#k(agentName, sessionId));
|
|
2126
|
+
if (!entry)
|
|
2127
|
+
return;
|
|
2128
|
+
try {
|
|
2129
|
+
const stream = await entry.node.newStream(entry.counterpartySessionPeerId, CELLO_CONTENT_PROTOCOL_ID);
|
|
2130
|
+
const frame = CBOR_ENC.encode({
|
|
2131
|
+
type: "content_delivery_ack",
|
|
2132
|
+
session_id: sessionId,
|
|
2133
|
+
content_hash: contentHash,
|
|
2134
|
+
level: "persisted",
|
|
2135
|
+
correlation_id: correlationId,
|
|
2136
|
+
});
|
|
2137
|
+
stream.send(lp.encode.single(frame));
|
|
2138
|
+
try {
|
|
2139
|
+
await stream.close();
|
|
2140
|
+
}
|
|
2141
|
+
catch { /* best-effort close */ }
|
|
2142
|
+
}
|
|
2143
|
+
catch (err) {
|
|
2144
|
+
this.#logger.warn("content.delivery.ack.send.failed", {
|
|
2145
|
+
sessionId,
|
|
2146
|
+
contentHash: Buffer.from(contentHash).toString("hex"),
|
|
2147
|
+
error: err instanceof Error ? err.message : String(err),
|
|
2148
|
+
correlationId,
|
|
2149
|
+
});
|
|
2150
|
+
}
|
|
2151
|
+
}
|
|
2152
|
+
/** Cancel and drop a single awaiting-ACK entry (e.g. the send failed after arming). */
|
|
2153
|
+
#untrackAwaitingAck(agentName, sessionId, contentHash) {
|
|
2154
|
+
const hashHex = Buffer.from(contentHash).toString("hex");
|
|
2155
|
+
const ackKey = this.#k(agentName, sessionId);
|
|
2156
|
+
const bySession = this.#awaitingAck.get(ackKey);
|
|
2157
|
+
const entry = bySession?.get(hashHex);
|
|
2158
|
+
if (!entry || !bySession)
|
|
2159
|
+
return;
|
|
2160
|
+
clearTimeout(entry.timer);
|
|
2161
|
+
bySession.delete(hashHex);
|
|
2162
|
+
if (bySession.size === 0)
|
|
2163
|
+
this.#awaitingAck.delete(ackKey);
|
|
2164
|
+
}
|
|
2165
|
+
/** Cancel and drop all awaiting-ACK timers for a session (teardown). */
|
|
2166
|
+
#clearAwaitingForSession(agentName, sessionId) {
|
|
2167
|
+
const ackKey = this.#k(agentName, sessionId);
|
|
2168
|
+
const bySession = this.#awaitingAck.get(ackKey);
|
|
2169
|
+
if (!bySession)
|
|
2170
|
+
return;
|
|
2171
|
+
for (const entry of bySession.values())
|
|
2172
|
+
clearTimeout(entry.timer);
|
|
2173
|
+
this.#awaitingAck.delete(ackKey);
|
|
2174
|
+
}
|
|
2175
|
+
#loadTreeFromDb(agentName, sessionId) {
|
|
2176
|
+
if (!this.#db)
|
|
2177
|
+
return SessionTree.empty();
|
|
2178
|
+
const rows = this.#db
|
|
2179
|
+
.prepare("SELECT leaf_kind, leaf_hash_hex FROM session_tree_leaves WHERE agent_name = ? AND session_id = ? ORDER BY leaf_index ASC")
|
|
2180
|
+
.all(agentName, sessionId);
|
|
2181
|
+
return SessionTree.fromLeaves(rows.map((r) => ({ kind: r.leaf_kind === "ctrl" ? "ctrl" : "msg", hashHex: r.leaf_hash_hex })));
|
|
2182
|
+
}
|
|
2183
|
+
/**
|
|
2184
|
+
* DAEMON-004: register the /cello/content/1.0.0 handler on a session node so
|
|
2185
|
+
* inbound content_frames are decoded, cross-checked, and ingested.
|
|
2186
|
+
*/
|
|
2187
|
+
// Awaited by createSessionNode / acceptSession so the /cello/content/1.0.0 handler
|
|
2188
|
+
// is provably registered before the caller returns (and thus before any peer sends
|
|
2189
|
+
// content). libp2p registers the protocol synchronously today, but awaiting removes
|
|
2190
|
+
// the fragile dependency on that internal timing (review L4).
|
|
2191
|
+
async #registerContentHandler(agentName, sessionId, node, _counterpartyPubkey) {
|
|
2192
|
+
try {
|
|
2193
|
+
await node.handle(CELLO_CONTENT_PROTOCOL_ID, (stream) => {
|
|
2194
|
+
void this.#handleContentStream(agentName, sessionId, stream);
|
|
2195
|
+
});
|
|
2196
|
+
}
|
|
2197
|
+
catch (err) {
|
|
2198
|
+
this.#logger.error("session.content.handler.register.failed", {
|
|
2199
|
+
sessionId,
|
|
2200
|
+
error: err instanceof Error ? err.message : String(err),
|
|
2201
|
+
});
|
|
2202
|
+
}
|
|
2203
|
+
}
|
|
2204
|
+
/**
|
|
2205
|
+
* DOD-MSG-4 (self-ordering content frame): verify the relay's signed ordering record carried IN the
|
|
2206
|
+
* content frame and record the canonical sequence for the strict-in-order gate — so ordering does
|
|
2207
|
+
* not depend on the separate leaf_deliver witness arriving first. Best-effort: any failure (malformed,
|
|
2208
|
+
* hash mismatch, bad signature, wrong signer) is logged and ignored — the content still ingests and
|
|
2209
|
+
* orders via the witness stream / arrival, so a bad record cannot block delivery.
|
|
2210
|
+
*
|
|
2211
|
+
* structure1_cbor = [1, content_hash(32), sender_pubkey(32), session_id(16), last_seen_seq, ts] —
|
|
2212
|
+
* the EXACT bytes the sender signed (needed to verify; Structure2 omits session_id/last_seen/ts).
|
|
2213
|
+
* structure2_cbor = [seq, sender_pubkey, content_hash, sender_signature, scan_result, prev_root].
|
|
2214
|
+
*/
|
|
2215
|
+
/**
|
|
2216
|
+
* DOD-MSG-4 (2b): encode the pre-seal park envelope `[1, content, structure1_cbor|null,
|
|
2217
|
+
* structure2_cbor|null]`. The daemon seals THIS (not the bare content) so a parked entry carries
|
|
2218
|
+
* its own signed ordering record — recover then orders it the same way the direct frame does. The
|
|
2219
|
+
* relay still only ever holds the sealed ciphertext (INV-3 preserved).
|
|
2220
|
+
*/
|
|
2221
|
+
encodeParkEnvelope(content, structure1Cbor, structure2Cbor) {
|
|
2222
|
+
return CBOR_ENC.encode([1, content, structure1Cbor ?? null, structure2Cbor ?? null]);
|
|
2223
|
+
}
|
|
2224
|
+
/**
|
|
2225
|
+
* DOD-MSG-4 (2b): decode a park envelope produced by encodeParkEnvelope. Falls back to treating the
|
|
2226
|
+
* whole plaintext as raw content (no ordering record) for entries sealed the old way (e.g. test
|
|
2227
|
+
* fixtures that seal bare content) — so recover stays backward-compatible.
|
|
2228
|
+
*/
|
|
2229
|
+
decodeParkEnvelope(plaintext) {
|
|
2230
|
+
try {
|
|
2231
|
+
const arr = decode(plaintext);
|
|
2232
|
+
// Discriminator (review #2): a 4-element array tagged with version 1 + a byte-string content.
|
|
2233
|
+
// The content-hash cross-check in ingestReceivedContent is the real safety net, but narrowing
|
|
2234
|
+
// to length 4 makes a bare-content false-positive astronomically less likely still.
|
|
2235
|
+
if (Array.isArray(arr) && arr.length === 4 && arr[0] === 1 && arr[1] instanceof Uint8Array) {
|
|
2236
|
+
return {
|
|
2237
|
+
content: arr[1],
|
|
2238
|
+
structure1Cbor: arr[2] instanceof Uint8Array ? arr[2] : undefined,
|
|
2239
|
+
structure2Cbor: arr[3] instanceof Uint8Array ? arr[3] : undefined,
|
|
2240
|
+
};
|
|
2241
|
+
}
|
|
2242
|
+
}
|
|
2243
|
+
catch {
|
|
2244
|
+
/* not an envelope — fall through to raw */
|
|
2245
|
+
}
|
|
2246
|
+
return { content: plaintext };
|
|
2247
|
+
}
|
|
2248
|
+
/**
|
|
2249
|
+
* DOD-MSG-4 (2b): public entry for the recover path to verify + record a parked entry's ordering
|
|
2250
|
+
* record (the recover handler lives in daemon.ts, which has no access to the private method).
|
|
2251
|
+
*/
|
|
2252
|
+
recordOrderingRecord(agentName, sessionId, structure1Cbor, structure2Cbor, contentHash, correlationId) {
|
|
2253
|
+
this.#recordFrameOrdering(agentName, sessionId, structure1Cbor, structure2Cbor, contentHash, correlationId, "park");
|
|
2254
|
+
}
|
|
2255
|
+
#recordFrameOrdering(agentName, sessionId, structure1Cbor, structure2Cbor, contentHash, correlationId, source = "content_frame") {
|
|
2256
|
+
try {
|
|
2257
|
+
const s1 = decode(structure1Cbor);
|
|
2258
|
+
const s2 = decode(structure2Cbor);
|
|
2259
|
+
const s1Hash = s1?.[1];
|
|
2260
|
+
const s1Pubkey = s1?.[2];
|
|
2261
|
+
const seq = typeof s2?.[0] === "number" ? s2[0] : -1;
|
|
2262
|
+
const s2Sig = s2?.[3];
|
|
2263
|
+
if (!(s1Hash instanceof Uint8Array) || !(s1Pubkey instanceof Uint8Array) || !(s2Sig instanceof Uint8Array) || seq < 1) {
|
|
2264
|
+
this.#logger.warn("session.content.ordering.malformed", { sessionId, correlationId });
|
|
2265
|
+
return;
|
|
2266
|
+
}
|
|
2267
|
+
// The framed ordering record must bind to THIS content (its hash) — else it orders the wrong bytes.
|
|
2268
|
+
const contentHashHex = Buffer.from(contentHash).toString("hex");
|
|
2269
|
+
if (Buffer.from(s1Hash).toString("hex") !== contentHashHex) {
|
|
2270
|
+
this.#logger.warn("session.content.ordering.hash_mismatch", { sessionId, correlationId });
|
|
2271
|
+
return;
|
|
2272
|
+
}
|
|
2273
|
+
// Verify the SENDER's Ed25519 signature over the exact signed bytes (structure1_cbor) — the same
|
|
2274
|
+
// check the relay performs. Proves the counterparty committed to this (content_hash @ sequence).
|
|
2275
|
+
if (!verify(s1Pubkey, structure1Cbor, s2Sig)) {
|
|
2276
|
+
this.#logger.warn("session.content.ordering.bad_signature", { sessionId, correlationId });
|
|
2277
|
+
return;
|
|
2278
|
+
}
|
|
2279
|
+
// Sovereign-node cross-check: the signer MUST be THIS session's counterparty, not an unrelated
|
|
2280
|
+
// key. FAIL CLOSED (review L) — if the counterparty pubkey is unknown we cannot prove the signer,
|
|
2281
|
+
// so we do NOT trust the framed ordering record (fall back to the witness stream / arrival). The
|
|
2282
|
+
// "B does not trust the counterparty for ordering" invariant is non-negotiable; never fail open.
|
|
2283
|
+
const counterparty = this.getSessionRecord(agentName, sessionId)?.counterparty_pubkey;
|
|
2284
|
+
if (!counterparty || Buffer.from(s1Pubkey).toString("hex") !== counterparty) {
|
|
2285
|
+
this.#logger.warn("session.content.ordering.wrong_signer", {
|
|
2286
|
+
sessionId,
|
|
2287
|
+
reason: counterparty ? "signer_not_counterparty" : "counterparty_unknown",
|
|
2288
|
+
correlationId,
|
|
2289
|
+
});
|
|
2290
|
+
return;
|
|
2291
|
+
}
|
|
2292
|
+
// Verified — record the relay-assigned canonical sequence (1-based → 0-based leaf index) for the gate.
|
|
2293
|
+
this.recordWitnessedSequence(agentName, sessionId, contentHashHex, seq - 1);
|
|
2294
|
+
this.#logger.info("session.content.ordering.recorded", {
|
|
2295
|
+
sessionId,
|
|
2296
|
+
canonicalSeq: seq - 1,
|
|
2297
|
+
source,
|
|
2298
|
+
correlationId,
|
|
2299
|
+
});
|
|
2300
|
+
}
|
|
2301
|
+
catch (err) {
|
|
2302
|
+
this.#logger.warn("session.content.ordering.decode_failed", {
|
|
2303
|
+
sessionId,
|
|
2304
|
+
error: err instanceof Error ? err.message : String(err),
|
|
2305
|
+
correlationId,
|
|
2306
|
+
});
|
|
2307
|
+
}
|
|
2308
|
+
}
|
|
2309
|
+
async #handleContentStream(agentName, sessionId, stream) {
|
|
2310
|
+
const iter = lp.decode(stream)[Symbol.asyncIterator]();
|
|
2311
|
+
try {
|
|
2312
|
+
const result = await iter.next();
|
|
2313
|
+
if (result.done || result.value === undefined)
|
|
2314
|
+
return;
|
|
2315
|
+
const bytes = result.value instanceof Uint8Array ? result.value
|
|
2316
|
+
: Buffer.isBuffer(result.value) ? new Uint8Array(result.value)
|
|
2317
|
+
: result.value.slice();
|
|
2318
|
+
const frame = decode(bytes);
|
|
2319
|
+
const correlationId = typeof frame["correlation_id"] === "string" ? frame["correlation_id"] : undefined;
|
|
2320
|
+
// CELLO-M7-MSG-001 (AC-001/AC-002): a `persisted` delivery ACK arriving on the
|
|
2321
|
+
// same /cello/content/1.0.0 protocol resolves the sender's awaiting-ACK timer.
|
|
2322
|
+
// The protocol acts on `persisted` ONLY — any other level leaves the timer armed.
|
|
2323
|
+
if (frame["type"] === "content_delivery_ack") {
|
|
2324
|
+
const ackHash = frame["content_hash"];
|
|
2325
|
+
const level = frame["level"];
|
|
2326
|
+
if (ackHash instanceof Uint8Array && level === "persisted") {
|
|
2327
|
+
this.#resolveAwaitingAck(agentName, sessionId, ackHash);
|
|
2328
|
+
}
|
|
2329
|
+
return;
|
|
2330
|
+
}
|
|
2331
|
+
if (frame["type"] !== "content_frame")
|
|
2332
|
+
return;
|
|
2333
|
+
const contentBytes = frame["content_bytes"];
|
|
2334
|
+
const contentHash = frame["content_hash"];
|
|
2335
|
+
if (!(contentBytes instanceof Uint8Array) || !(contentHash instanceof Uint8Array))
|
|
2336
|
+
return;
|
|
2337
|
+
// DOD-MSG-4 (self-ordering content frame): if the frame carries the relay's signed ordering
|
|
2338
|
+
// record, verify the sender signature and record the canonical sequence FROM THE FRAME, BEFORE
|
|
2339
|
+
// ingest — so the strict-in-order gate has the position without waiting on the separate
|
|
2340
|
+
// leaf_deliver witness (removes the content-before-witness race). A bad/absent record is
|
|
2341
|
+
// non-fatal: the content still ingests, ordered by the witness stream / arrival as before.
|
|
2342
|
+
const s1Cbor = frame["structure1_cbor"];
|
|
2343
|
+
const s2Cbor = frame["structure2_cbor"];
|
|
2344
|
+
if (s1Cbor instanceof Uint8Array && s2Cbor instanceof Uint8Array) {
|
|
2345
|
+
this.#recordFrameOrdering(agentName, sessionId, s1Cbor, s2Cbor, contentHash, correlationId);
|
|
2346
|
+
}
|
|
2347
|
+
// AC-001: carry the sender's correlationId from the frame into the receive
|
|
2348
|
+
// path so both sides log the same flow id (never re-minted on receipt).
|
|
2349
|
+
const ingest = this.ingestReceivedContent(agentName, sessionId, contentBytes, contentHash, correlationId);
|
|
2350
|
+
// AC-001: after the content is durably ingested AND its hash cross-check
|
|
2351
|
+
// succeeds, emit an unsigned `persisted` delivery ACK back to the sender. A
|
|
2352
|
+
// rejected ingest (tamper / not-active) produces NO ACK, so the sender's TTF
|
|
2353
|
+
// path can park / recover.
|
|
2354
|
+
// DOD-MSG-4: a HELD (out-of-order) frame is NOT yet a durable leaf, so it is NOT
|
|
2355
|
+
// acknowledged `persisted` — the sender's TTF→park backstop then guarantees the
|
|
2356
|
+
// missing-earlier message is fetchable, and dedup absorbs the redundant copy.
|
|
2357
|
+
if (ingest.ok && !ingest.held) {
|
|
2358
|
+
void this.#sendDeliveryAck(agentName, sessionId, contentHash, correlationId);
|
|
2359
|
+
}
|
|
2360
|
+
}
|
|
2361
|
+
catch (err) {
|
|
2362
|
+
this.#logger.warn("session.content.stream.read.failed", {
|
|
2363
|
+
sessionId,
|
|
2364
|
+
error: err instanceof Error ? err.message : String(err),
|
|
2365
|
+
});
|
|
2366
|
+
}
|
|
2367
|
+
}
|
|
2368
|
+
/**
|
|
2369
|
+
* M7-SESSION-001 AC-004/AC-005: Register a relay stream for an active session.
|
|
2370
|
+
* Starts a background reader that watches for session_interrupted frames and
|
|
2371
|
+
* stream close events. Both detection paths call markInterruptedWithDetails().
|
|
2372
|
+
*
|
|
2373
|
+
* The reader runs for the lifetime of the relay stream. If the stream closes
|
|
2374
|
+
* without delivering a session_interrupted frame (AC-005 / 'stream_close' path),
|
|
2375
|
+
* the session is still marked interrupted.
|
|
2376
|
+
*
|
|
2377
|
+
* @param sessionId The hex session ID
|
|
2378
|
+
* @param stream The relay stream to monitor
|
|
2379
|
+
* @param messageCount Number of message leaves at the time of registration
|
|
2380
|
+
* (used as the count at interruption — best effort since exact count at frame
|
|
2381
|
+
* receipt may differ, but this is the value available at stream setup time)
|
|
2382
|
+
*/
|
|
2383
|
+
registerRelayStream(agentName, sessionId, stream, messageCount = 0) {
|
|
2384
|
+
void this.#watchRelayStream(agentName, sessionId, stream, messageCount);
|
|
2385
|
+
}
|
|
2386
|
+
/**
|
|
2387
|
+
* Background relay stream watcher.
|
|
2388
|
+
* Pseudocode:
|
|
2389
|
+
* 1. Create LP-framed iterator over the stream
|
|
2390
|
+
* 2. For each frame:
|
|
2391
|
+
* a. If type === 'session_interrupted':
|
|
2392
|
+
* - Record receivedInterruptFrame = true
|
|
2393
|
+
* - Call markInterruptedWithDetails(sessionId, messageCount, 'relay_frame')
|
|
2394
|
+
* - Break (no more frames expected)
|
|
2395
|
+
* 3. On stream close (loop ends normally or with error):
|
|
2396
|
+
* a. If !receivedInterruptFrame:
|
|
2397
|
+
* - Call markInterruptedWithDetails(sessionId, messageCount, 'stream_close')
|
|
2398
|
+
*/
|
|
2399
|
+
async #watchRelayStream(agentName, sessionId, stream, messageCount) {
|
|
2400
|
+
let receivedInterruptFrame = false;
|
|
2401
|
+
// CELLO-M7-TRANSPORT-001: cast the stream input to lp.decode. Adding the
|
|
2402
|
+
// @libp2p/autonat service (interface@3.2.2 / uint8arraylist v2) to the
|
|
2403
|
+
// transport package surfaced a benign mixed-version split between the Stream
|
|
2404
|
+
// type (now v2) and it-length-prefixed's expected Uint8ArrayList (v3). The two
|
|
2405
|
+
// are structurally identical at runtime — this is a build-time-only artifact.
|
|
2406
|
+
const lpSource = stream;
|
|
2407
|
+
const source = lp.decode(lpSource)[Symbol.asyncIterator]();
|
|
2408
|
+
try {
|
|
2409
|
+
while (true) {
|
|
2410
|
+
let result;
|
|
2411
|
+
try {
|
|
2412
|
+
result = await source.next();
|
|
2413
|
+
}
|
|
2414
|
+
catch {
|
|
2415
|
+
// Stream error (e.g. stream aborted) — treat as stream close
|
|
2416
|
+
break;
|
|
2417
|
+
}
|
|
2418
|
+
if (result.done || result.value === undefined)
|
|
2419
|
+
break;
|
|
2420
|
+
let frame;
|
|
2421
|
+
try {
|
|
2422
|
+
const bytes = result.value instanceof Uint8Array ? result.value
|
|
2423
|
+
: Buffer.isBuffer(result.value) ? new Uint8Array(result.value)
|
|
2424
|
+
: result.value.slice();
|
|
2425
|
+
frame = decode(bytes);
|
|
2426
|
+
}
|
|
2427
|
+
catch {
|
|
2428
|
+
continue;
|
|
2429
|
+
}
|
|
2430
|
+
if (frame["type"] === "session_interrupted") {
|
|
2431
|
+
// H-3 SECURITY: this stream is registered (bound) to a specific
|
|
2432
|
+
// sessionId. A malicious or buggy relay could put a DIFFERENT session_id
|
|
2433
|
+
// in the frame body to target a session this stream is not authorized
|
|
2434
|
+
// for (cross-session targeting). Never trust the frame's id: if the frame
|
|
2435
|
+
// names a different session, reject it and keep watching the bound one.
|
|
2436
|
+
const frameSessionId = typeof frame["session_id"] === "string"
|
|
2437
|
+
? frame["session_id"]
|
|
2438
|
+
: (frame["session_id"] instanceof Uint8Array
|
|
2439
|
+
? Buffer.from(frame["session_id"]).toString("hex")
|
|
2440
|
+
: null);
|
|
2441
|
+
if (frameSessionId !== null && frameSessionId !== sessionId) {
|
|
2442
|
+
this.#logger.warn("session.interrupt.frame.session_mismatch", {
|
|
2443
|
+
boundSessionId: sessionId,
|
|
2444
|
+
frameSessionId,
|
|
2445
|
+
reason: "cross_session_frame_rejected",
|
|
2446
|
+
});
|
|
2447
|
+
continue; // ignore the hostile/mismatched frame; keep reading
|
|
2448
|
+
}
|
|
2449
|
+
receivedInterruptFrame = true;
|
|
2450
|
+
// Always mark the BOUND sessionId — never the id carried in the frame.
|
|
2451
|
+
await this.markInterruptedWithDetails(agentName, sessionId, messageCount, "relay_frame");
|
|
2452
|
+
break; // No more relay frames expected after session_interrupted
|
|
2453
|
+
}
|
|
2454
|
+
}
|
|
2455
|
+
}
|
|
2456
|
+
catch {
|
|
2457
|
+
// Stream read loop ended — fall through to stream_close check
|
|
2458
|
+
}
|
|
2459
|
+
// AC-005: stream closed without a session_interrupted frame
|
|
2460
|
+
if (!receivedInterruptFrame) {
|
|
2461
|
+
// Only mark interrupted if this session is still active in SQLite
|
|
2462
|
+
const record = this.getSessionRecord(agentName, sessionId);
|
|
2463
|
+
if (record && record.status === "active") {
|
|
2464
|
+
await this.markInterruptedWithDetails(agentName, sessionId, messageCount, "stream_close");
|
|
2465
|
+
}
|
|
2466
|
+
}
|
|
2467
|
+
}
|
|
2468
|
+
// ─── Private helpers ──────────────────────────────────────────────────────
|
|
2469
|
+
/**
|
|
2470
|
+
* DOD-LOOP-1: ensure the given agent has a standing receiver node (idempotent). Created when an
|
|
2471
|
+
* agent comes online (cello_start_agent) and replaced after it is handed off to a session. The
|
|
2472
|
+
* `#standingReceiverCreating` guard prevents two concurrent ensure() calls (e.g. the
|
|
2473
|
+
* cello_start_agent hook racing a consume-site retry) from building two nodes for one agent. A
|
|
2474
|
+
* create failure logs + leaves no entry; the next consume-site ensure() call retries on demand.
|
|
2475
|
+
*/
|
|
2476
|
+
async #ensureStandingReceiver(agentName, correlationId = randomUUID()) {
|
|
2477
|
+
if (this.#standingReceivers.has(agentName) || this.#standingReceiverCreating.has(agentName))
|
|
2478
|
+
return;
|
|
2479
|
+
if (this.#shuttingDown)
|
|
2480
|
+
return;
|
|
2481
|
+
// A fresh ensure request supersedes any pending removal (agent toggled offline→online).
|
|
2482
|
+
this.#standingReceiverRemoving.delete(agentName);
|
|
2483
|
+
this.#standingReceiverCreating.add(agentName);
|
|
2484
|
+
try {
|
|
2485
|
+
const sessionId = `standing_receiver_${randomUUID()}`;
|
|
2486
|
+
const gater = new SessionConnectionGater({
|
|
2487
|
+
sessionId,
|
|
2488
|
+
allowedPeerId: null, // open — counterparty unknown at creation time
|
|
2489
|
+
logger: this.#logger,
|
|
2490
|
+
});
|
|
2491
|
+
let node;
|
|
2492
|
+
try {
|
|
2493
|
+
node = await this.#factory.createNode({ sessionId, connectionGater: gater, nodeType: "standing_receiver" });
|
|
2494
|
+
await node.start();
|
|
2495
|
+
}
|
|
2496
|
+
catch (err) {
|
|
2497
|
+
this.#logger.error("session.node.create.failed", {
|
|
2498
|
+
sessionId,
|
|
2499
|
+
agentName: `${STANDING_RECEIVER_AGENT_NAME}:${agentName}`,
|
|
2500
|
+
error: err instanceof Error ? err.message : String(err),
|
|
2501
|
+
correlationId,
|
|
2502
|
+
});
|
|
2503
|
+
return; // not ready — callers check getStandingReceiverReady / retry via ensure on demand
|
|
2504
|
+
}
|
|
2505
|
+
// M2: gracefulShutdown may have begun while this node was starting (ensure runs un-awaited).
|
|
2506
|
+
// Don't install an orphan bound to a TCP port — stop it and bail.
|
|
2507
|
+
if (this.#shuttingDown) {
|
|
2508
|
+
try {
|
|
2509
|
+
await node.stop();
|
|
2510
|
+
}
|
|
2511
|
+
catch { /* best-effort */ }
|
|
2512
|
+
return;
|
|
2513
|
+
}
|
|
2514
|
+
// L1: the agent may have gone offline (cello_stop_agent → removeStandingReceiverForAgent)
|
|
2515
|
+
// while this ensure was parked on start(). Removal found no map entry to delete, so the
|
|
2516
|
+
// tombstone is how we learn of it — tear the fresh node down rather than install an SR for
|
|
2517
|
+
// an offline agent.
|
|
2518
|
+
if (this.#standingReceiverRemoving.has(agentName)) {
|
|
2519
|
+
this.#standingReceiverRemoving.delete(agentName);
|
|
2520
|
+
try {
|
|
2521
|
+
await node.stop();
|
|
2522
|
+
}
|
|
2523
|
+
catch { /* best-effort */ }
|
|
2524
|
+
return;
|
|
2525
|
+
}
|
|
2526
|
+
// CELLO-M7-TRANSPORT-001: wrap in a NodeAutoNatService so its dialability drives session-
|
|
2527
|
+
// address advertisement and the transport.autonat.* events fire.
|
|
2528
|
+
const autoNat = new NodeAutoNatService({
|
|
2529
|
+
node,
|
|
2530
|
+
logger: this.#logger,
|
|
2531
|
+
nodeType: "standing_receiver",
|
|
2532
|
+
probers: this.#autoNatProbers(),
|
|
2533
|
+
});
|
|
2534
|
+
autoNat.emitInitialResult();
|
|
2535
|
+
this.#standingReceivers.set(agentName, { node, gater, autoNat });
|
|
2536
|
+
this.#logger.info("session.node.created", {
|
|
2537
|
+
sessionId,
|
|
2538
|
+
agentName: `${STANDING_RECEIVER_AGENT_NAME}:${agentName}`,
|
|
2539
|
+
sessionPeerId: node.getPeerId(),
|
|
2540
|
+
correlationId,
|
|
2541
|
+
});
|
|
2542
|
+
}
|
|
2543
|
+
finally {
|
|
2544
|
+
this.#standingReceiverCreating.delete(agentName);
|
|
2545
|
+
}
|
|
2546
|
+
}
|
|
2547
|
+
/**
|
|
2548
|
+
* DOD-LOOP-1: public hook for the composition root to create an agent's standing receiver when
|
|
2549
|
+
* the agent comes online (cello_start_agent), and to tear it down when it goes offline.
|
|
2550
|
+
*/
|
|
2551
|
+
async ensureStandingReceiverForAgent(agentName) {
|
|
2552
|
+
await this.#ensureStandingReceiver(agentName);
|
|
2553
|
+
}
|
|
2554
|
+
async removeStandingReceiverForAgent(agentName) {
|
|
2555
|
+
const sr = this.#standingReceivers.get(agentName);
|
|
2556
|
+
if (!sr) {
|
|
2557
|
+
// L1: an #ensureStandingReceiver for this agent may be in flight (parked on start(), so no
|
|
2558
|
+
// map entry yet). Leave a tombstone — that ensure tears its fresh node down on completion
|
|
2559
|
+
// instead of installing an SR for an agent that is now offline. Also drop any stale creating
|
|
2560
|
+
// marker so a later start can re-ensure.
|
|
2561
|
+
if (this.#standingReceiverCreating.has(agentName))
|
|
2562
|
+
this.#standingReceiverRemoving.add(agentName);
|
|
2563
|
+
return;
|
|
2564
|
+
}
|
|
2565
|
+
this.#standingReceivers.delete(agentName);
|
|
2566
|
+
sr.autoNat.stop();
|
|
2567
|
+
try {
|
|
2568
|
+
await sr.node.stop();
|
|
2569
|
+
}
|
|
2570
|
+
catch { /* best-effort */ }
|
|
2571
|
+
}
|
|
2572
|
+
#insertSessionRow(sessionId, agentName, counterpartyPubkey, status) {
|
|
2573
|
+
if (!this.#db)
|
|
2574
|
+
return false;
|
|
2575
|
+
const now = Date.now();
|
|
2576
|
+
try {
|
|
2577
|
+
this.#db
|
|
2578
|
+
.prepare(`INSERT INTO sessions
|
|
2579
|
+
(session_id, agent_name, counterparty_pubkey, status, created_at, updated_at)
|
|
2580
|
+
VALUES (?, ?, ?, ?, ?, ?)`)
|
|
2581
|
+
.run(sessionId, agentName, counterpartyPubkey, status, now, now);
|
|
2582
|
+
return true;
|
|
2583
|
+
}
|
|
2584
|
+
catch (err) {
|
|
2585
|
+
this.#logger.error("session.interrupt.db.write.failed", {
|
|
2586
|
+
sessionId,
|
|
2587
|
+
error: err instanceof Error ? err.message : String(err),
|
|
2588
|
+
});
|
|
2589
|
+
return false;
|
|
2590
|
+
}
|
|
2591
|
+
}
|
|
2592
|
+
#updateSessionStatus(agentName, sessionId, status) {
|
|
2593
|
+
if (!this.#db)
|
|
2594
|
+
return;
|
|
2595
|
+
const now = Date.now();
|
|
2596
|
+
try {
|
|
2597
|
+
this.#db
|
|
2598
|
+
.prepare("UPDATE sessions SET status = ?, updated_at = ? WHERE agent_name = ? AND session_id = ?")
|
|
2599
|
+
.run(status, now, agentName, sessionId);
|
|
2600
|
+
}
|
|
2601
|
+
catch (err) {
|
|
2602
|
+
this.#logger.error("session.interrupt.db.write.failed", {
|
|
2603
|
+
sessionId,
|
|
2604
|
+
error: err instanceof Error ? err.message : String(err),
|
|
2605
|
+
});
|
|
2606
|
+
}
|
|
2607
|
+
}
|
|
2608
|
+
}
|
|
2609
|
+
//# sourceMappingURL=session-node-manager.js.map
|