@desplega.ai/agent-swarm 1.74.4 → 1.76.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/openapi.json +1264 -46
- package/package.json +2 -2
- package/src/be/db.ts +563 -9
- package/src/be/memory/edges-store.ts +69 -0
- package/src/be/memory/providers/sqlite-store.ts +4 -0
- package/src/be/memory/raters/explicit-self.ts +22 -0
- package/src/be/memory/raters/implicit-citation.ts +44 -0
- package/src/be/memory/raters/llm-client.ts +172 -0
- package/src/be/memory/raters/llm-summarizer.ts +218 -0
- package/src/be/memory/raters/llm.ts +375 -0
- package/src/be/memory/raters/noop.ts +14 -0
- package/src/be/memory/raters/registry.ts +86 -0
- package/src/be/memory/raters/retrieval.ts +88 -0
- package/src/be/memory/raters/run-server-raters.ts +97 -0
- package/src/be/memory/raters/store.ts +228 -0
- package/src/be/memory/raters/types.ts +101 -0
- package/src/be/memory/reranker.ts +32 -2
- package/src/be/memory/retrieval-store.ts +116 -0
- package/src/be/memory/types.ts +3 -0
- package/src/be/migrations/051_memory_posteriors_and_retrieval.sql +67 -0
- package/src/be/migrations/052_memory_edges.sql +36 -0
- package/src/be/migrations/053_agent_waiting_for_credentials_status.sql +61 -0
- package/src/be/migrations/054_agent_harness_provider.sql +21 -0
- package/src/be/migrations/055_agent_cred_status.sql +15 -0
- package/src/be/migrations/056_drop_agent_tasks_source_check.sql +139 -0
- package/src/be/migrations/057_inbox_item_state.sql +27 -0
- package/src/be/migrations/058_task_templates.sql +31 -0
- package/src/be/swarm-config-guard.ts +24 -0
- package/src/commands/credential-wait.ts +186 -0
- package/src/commands/provider-credentials.ts +434 -0
- package/src/commands/runner.ts +253 -21
- package/src/hooks/hook.ts +143 -66
- package/src/http/agents.ts +191 -1
- package/src/http/config.ts +11 -1
- package/src/http/core.ts +5 -0
- package/src/http/inbox-state.ts +89 -0
- package/src/http/index.ts +10 -0
- package/src/http/memory.ts +230 -1
- package/src/http/sessions.ts +86 -0
- package/src/http/status.ts +665 -0
- package/src/http/task-templates.ts +51 -0
- package/src/http/tasks.ts +85 -5
- package/src/http/users.ts +134 -0
- package/src/prompts/memories.ts +62 -0
- package/src/providers/claude-adapter.ts +22 -0
- package/src/providers/claude-managed-adapter.ts +24 -0
- package/src/providers/codex-adapter.ts +43 -1
- package/src/providers/devin-adapter.ts +18 -0
- package/src/providers/index.ts +7 -0
- package/src/providers/opencode-adapter.ts +60 -0
- package/src/providers/pi-mono-adapter.ts +71 -0
- package/src/providers/types.ts +34 -0
- package/src/server.ts +2 -0
- package/src/slack/handlers.ts +0 -1
- package/src/tests/agents-harness-provider.test.ts +333 -0
- package/src/tests/credential-check.test.ts +367 -0
- package/src/tests/credential-status-api.test.ts +223 -0
- package/src/tests/credential-status-routing.test.ts +150 -0
- package/src/tests/credential-wait.test.ts +282 -0
- package/src/tests/harness-provider-resolution.test.ts +242 -0
- package/src/tests/jira-sync.test.ts +1 -1
- package/src/tests/memory-edges.test.ts +722 -0
- package/src/tests/memory-rate-endpoint.test.ts +330 -0
- package/src/tests/memory-rate-tool.test.ts +252 -0
- package/src/tests/memory-rater-e2e.test.ts +578 -0
- package/src/tests/memory-rater-implicit-citation.test.ts +304 -0
- package/src/tests/memory-rater-llm-summarizer.test.ts +317 -0
- package/src/tests/memory-rater-llm.test.ts +964 -0
- package/src/tests/memory-rater-store.test.ts +249 -0
- package/src/tests/memory-reranker.test.ts +161 -2
- package/src/tests/migration-runner-regressions.test.ts +17 -2
- package/src/tests/mocks/mock-llm-rater-client.ts +35 -0
- package/src/tests/run-server-raters.test.ts +291 -0
- package/src/tests/sessions.test.ts +141 -0
- package/src/tests/status.test.ts +843 -0
- package/src/tests/stop-hook-task-resolution.test.ts +98 -0
- package/src/tests/template-recommendations.test.ts +148 -0
- package/src/tests/tool-annotations.test.ts +2 -2
- package/src/tests/use-dismissible-card.test.ts +140 -0
- package/src/tools/memory-rate.ts +166 -0
- package/src/tools/memory-search.ts +18 -0
- package/src/tools/store-progress.ts +37 -0
- package/src/tools/swarm-config/set-config.ts +17 -1
- package/src/tools/tool-config.ts +1 -0
- package/src/types.ts +122 -1
- package/src/utils/harness-provider.ts +32 -0
- package/tsconfig.json +0 -2
|
@@ -0,0 +1,578 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Cross-cutting end-to-end test for memory-rater v1.5.
|
|
3
|
+
*
|
|
4
|
+
* Plan: thoughts/taras/plans/2026-05-05-memory-rater-v1.5/step-7.md §6
|
|
5
|
+
*
|
|
6
|
+
* Exercises the full flow that ships in v1.5:
|
|
7
|
+
* A. retrieval bridge writes `memory_retrieval` rows
|
|
8
|
+
* B. explicit rating with `referencesSource` updates the memory's posterior
|
|
9
|
+
* AND creates an `agent_memory_edge` row
|
|
10
|
+
* C. implicit-citation server rater fires on task completion, hits the
|
|
11
|
+
* cited memory and misses the uncited one
|
|
12
|
+
* D. LlmRater piggyback adds llm-source ratings + a second edge
|
|
13
|
+
* E. read endpoints surface the retrievals and the edges
|
|
14
|
+
* F. reranker — usefulness(α, β) > 1 after positive ratings, so the cited
|
|
15
|
+
* memory ranks above its baseline score
|
|
16
|
+
* G. backward compat — with no ratings on a fresh DB, reranker scores are
|
|
17
|
+
* byte-identical to a pre-v1.5 baseline
|
|
18
|
+
*
|
|
19
|
+
* Implementation notes:
|
|
20
|
+
* - Spawns the API server on an isolated SQLite file so the HTTP read
|
|
21
|
+
* endpoints can be exercised against the same DB the test mutates.
|
|
22
|
+
* - The retrieval bridge and the implicit-citation rater fire are invoked
|
|
23
|
+
* in-process via `recordRetrievals` and `runServerRaters` rather than
|
|
24
|
+
* through their HTTP entry points, because both call sites depend on
|
|
25
|
+
* external services (OpenAI embeddings for `/api/memory/search`,
|
|
26
|
+
* `claude -p` for the hook). Their HTTP wrappers are exercised by the
|
|
27
|
+
* dedicated step-2/step-3/step-4 suites; this test focuses on the
|
|
28
|
+
* cross-cutting state transitions the dedicated suites don't cover
|
|
29
|
+
* end-to-end.
|
|
30
|
+
* - The LlmRater path uses `buildRatingsFromLlm` with a hand-built
|
|
31
|
+
* `SummaryWithRatings`-shaped payload — this matches what the hook
|
|
32
|
+
* would have constructed from a successful `claude -p` summary call.
|
|
33
|
+
*/
|
|
34
|
+
import { afterAll, beforeAll, beforeEach, describe, expect, test } from "bun:test";
|
|
35
|
+
import { randomUUID } from "node:crypto";
|
|
36
|
+
import { unlink } from "node:fs/promises";
|
|
37
|
+
import type { Subprocess } from "bun";
|
|
38
|
+
import { closeDb, createAgent, createSessionLogs, getDb, initDb } from "../be/db";
|
|
39
|
+
import { SqliteMemoryStore } from "../be/memory/providers/sqlite-store";
|
|
40
|
+
import { ImplicitCitationRater } from "../be/memory/raters/implicit-citation";
|
|
41
|
+
import { buildRatingsFromLlm } from "../be/memory/raters/llm";
|
|
42
|
+
import { recordRetrievals } from "../be/memory/raters/retrieval";
|
|
43
|
+
import { runServerRaters } from "../be/memory/raters/run-server-raters";
|
|
44
|
+
import { applyRating } from "../be/memory/raters/store";
|
|
45
|
+
import { rerank } from "../be/memory/reranker";
|
|
46
|
+
import type { MemoryCandidate } from "../be/memory/types";
|
|
47
|
+
|
|
48
|
+
// --- Fixture configuration -------------------------------------------------

/** Shape of the extra globals this suite reads and writes on `globalThis`. */
type TemplateGlobals = typeof globalThis & {
  __testMigrationTemplate?: Uint8Array;
  __savedE2eTemplate?: Uint8Array;
};

/** Port handed to the spawned API server through the `PORT` env variable. */
const TEST_PORT = 19131;
/** SQLite file shared by the spawned server and this test process. */
const TEST_DB_PATH = `/tmp/test-memory-rater-e2e-${Date.now()}.sqlite`;
/** Origin every HTTP call in this suite is issued against. */
const BASE = `http://localhost:${TEST_PORT}`;
/** Bearer token given to the server and sent back on every request. */
const API_KEY = "test-key";

// --- Mutable suite state (assigned in `beforeAll`) ---------------------------

let serverProc: Subprocess;
let store: SqliteMemoryStore;

// Identifiers of the single agent and task every step operates on.
const agentId = randomUUID();
const taskId = randomUUID();

// `globalThis` viewed through the template-carrying shape declared above.
const testTemplateGlobals = globalThis as TemplateGlobals;
|
|
62
|
+
|
|
63
|
+
/**
 * Issues an authenticated JSON request against the spawned test server.
 *
 * @param method HTTP verb to use.
 * @param path Path (including any query string) appended to `BASE`.
 * @param opts Optional JSON `body` plus values for the `x-agent-id` and
 *   `x-source-task-id` headers; each header is only sent when its value is truthy.
 * @returns The response status and the body parsed as JSON, or the raw text
 *   when the payload is not valid JSON.
 */
async function api(
  method: string,
  path: string,
  opts: { body?: unknown; agentId?: string; sourceTaskId?: string } = {},
  // biome-ignore lint/suspicious/noExplicitAny: test helper
): Promise<{ status: number; body: any }> {
  const { body: payload, agentId: callerAgentId, sourceTaskId } = opts;

  const headers: Record<string, string> = {
    "Content-Type": "application/json",
    Authorization: `Bearer ${API_KEY}`,
    ...(callerAgentId ? { "x-agent-id": callerAgentId } : {}),
    ...(sourceTaskId ? { "x-source-task-id": sourceTaskId } : {}),
  };

  const response = await fetch(`${BASE}${path}`, {
    method,
    headers,
    body: payload === undefined ? undefined : JSON.stringify(payload),
  });

  const raw = await response.text();
  // biome-ignore lint/suspicious/noExplicitAny: body may be JSON or text
  let parsed: any = raw;
  try {
    parsed = JSON.parse(raw);
  } catch {
    // Not JSON — keep the raw text assigned above.
  }
  return { status: response.status, body: parsed };
}
|
|
90
|
+
|
|
91
|
+
/**
 * Polls `url` until it answers with a 2xx status or the time budget runs out.
 *
 * Each probe is aborted once the remaining budget elapses, so a connection
 * that is accepted but never answered cannot stall the loop beyond
 * `timeoutMs`.
 *
 * @param url Endpoint to probe.
 * @param timeoutMs Total time budget in milliseconds (default 15000).
 * @throws Error when no successful response arrives within the budget.
 */
async function waitForServer(url: string, timeoutMs = 15000): Promise<void> {
  const deadline = Date.now() + timeoutMs;
  while (Date.now() < deadline) {
    // Bound this single probe by whatever is left of the overall budget.
    const controller = new AbortController();
    const remainingMs = Math.max(1, deadline - Date.now());
    const probeTimer = setTimeout(() => controller.abort(), remainingMs);
    try {
      const r = await fetch(url, { signal: controller.signal });
      if (r.ok) return;
    } catch {
      // not ready (connection refused, or the probe was aborted)
    } finally {
      clearTimeout(probeTimer);
    }
    await Bun.sleep(50);
  }
  throw new Error(`Server did not start within ${timeoutMs}ms`);
}
|
|
104
|
+
|
|
105
|
+
/**
 * Reads the `alpha` / `beta` columns of one `agent_memory` row.
 *
 * @param id Primary key of the memory row.
 * @returns The stored `alpha` and `beta` values.
 * @throws Error when no row with that id exists.
 */
function readPosterior(id: string): { alpha: number; beta: number } {
  const lookup = getDb().prepare<{ alpha: number; beta: number }, [string]>(
    "SELECT alpha, beta FROM agent_memory WHERE id = ?",
  );
  const posterior = lookup.get(id);
  if (posterior) {
    const { alpha, beta } = posterior;
    return { alpha, beta };
  }
  throw new Error(`memory ${id} not found`);
}
|
|
114
|
+
|
|
115
|
+
/**
 * Returns every `memory_rating` row recorded for the given task.
 *
 * @param taskIdArg Task whose ratings should be listed.
 * @returns Rows carrying `memoryId`, `source`, `signal` and `weight`.
 */
function getRatings(taskIdArg: string) {
  type RatingRow = {
    memoryId: string;
    source: string;
    signal: number;
    weight: number;
  };
  const ratingsForTask = getDb().prepare<RatingRow, [string]>(
    "SELECT memoryId, source, signal, weight FROM memory_rating WHERE taskId = ?",
  );
  return ratingsForTask.all(taskIdArg);
}
|
|
128
|
+
|
|
129
|
+
/**
 * Counts the `references-source` edges that start at the given memory.
 *
 * @param memoryId Value matched against `agent_memory_edge.from_id`.
 * @returns The number of matching edges, or 0 when the query yields no row.
 */
function countEdges(memoryId: string): number {
  const sql =
    "SELECT COUNT(*) as n FROM agent_memory_edge WHERE from_id = ? AND type = 'references-source'";
  const counted = getDb().prepare<{ n: number }, [string]>(sql).get(memoryId);
  return counted?.n ?? 0;
}
|
|
137
|
+
|
|
138
|
+
// Suite setup: start the API server on a fresh SQLite file, then open the same
// file in-process and seed the agent and task rows every step relies on.
beforeAll(async () => {
  // Best-effort removal of a leftover database and its WAL/SHM side files.
  for (const suffix of ["", "-wal", "-shm"]) {
    await unlink(TEST_DB_PATH + suffix).catch(() => {});
  }

  const serverEnv = {
    ...process.env,
    PORT: String(TEST_PORT),
    DATABASE_PATH: TEST_DB_PATH,
    API_KEY,
    // The cross-cutting flow gates on this allow-list — without it the
    // implicit-citation server rater is a no-op (the byte-identical
    // off-mode litmus from PR #429).
    MEMORY_RATERS: "implicit-citation,llm,explicit-self",
    CAPABILITIES: "core",
    SLACK_BOT_TOKEN: "",
    LINEAR_DISABLE: "true",
    JIRA_DISABLE: "true",
    GITHUB_DISABLE: "true",
    SLACK_DISABLE: "true",
    HEARTBEAT_DISABLE: "true",
    OAUTH_KEEPALIVE_DISABLE: "true",
    ANONYMIZED_TELEMETRY: "false",
  };

  serverProc = Bun.spawn(["bun", "src/http.ts"], {
    cwd: `${import.meta.dir}/../..`,
    env: serverEnv,
    stdout: "ignore",
    stderr: "ignore",
  });

  await waitForServer(`${BASE}/health`);

  // Hide preload.ts's in-memory template so initDb opens the real file.
  // The saved value is put back in `afterAll`.
  testTemplateGlobals.__savedE2eTemplate = testTemplateGlobals.__testMigrationTemplate;
  testTemplateGlobals.__testMigrationTemplate = undefined;
  closeDb();
  initDb(TEST_DB_PATH);

  // Seed the one agent and one in-progress task shared by all steps.
  createAgent({ id: agentId, name: "E2E Agent", isLead: false, status: "idle" });
  const seededAt = new Date().toISOString();
  getDb()
    .prepare(
      `INSERT INTO agent_tasks (id, agentId, task, status, source, createdAt, lastUpdatedAt)
VALUES (?, ?, ?, 'in_progress', 'mcp', ?, ?)`,
    )
    .run(taskId, agentId, "e2e task", seededAt, seededAt);

  store = new SqliteMemoryStore();
}, 20000);
|
|
188
|
+
|
|
189
|
+
// Suite teardown: close the in-process DB handle, restore the migration
// template hidden in `beforeAll`, stop the server and delete the SQLite files.
afterAll(async () => {
  closeDb();

  const { __savedE2eTemplate: savedTemplate } = testTemplateGlobals;
  testTemplateGlobals.__testMigrationTemplate = savedTemplate;
  testTemplateGlobals.__savedE2eTemplate = undefined;

  if (serverProc) {
    serverProc.kill();
    try {
      await serverProc.exited;
    } catch {
      // Exit failures are irrelevant once the process has been told to stop.
    }
  }

  // Short pause before the database files are removed.
  await Bun.sleep(50);

  // Best-effort cleanup; a file that is already gone is not an error here.
  for (const suffix of ["", "-wal", "-shm"]) {
    await unlink(TEST_DB_PATH + suffix).catch(() => {});
  }
});
|
|
206
|
+
|
|
207
|
+
// Per-test reset: clear every table the raters write to, leave the agent and
// task rows alone, and put all memory posteriors back to Beta(1,1).
beforeEach(() => {
  const resetStatements = [
    "DELETE FROM memory_rating",
    "DELETE FROM memory_retrieval",
    "DELETE FROM session_logs",
    "DELETE FROM agent_memory_edge",
    "UPDATE agent_memory SET alpha = 1.0, beta = 1.0",
  ];
  for (const sql of resetStatements) {
    getDb().run(sql);
  }
});
|
|
216
|
+
|
|
217
|
+
/**
 * Stores a manual memory owned by the suite's agent.
 *
 * @param name Memory name; the content is derived from it as `"<name> content"`.
 * @param scope Visibility scope to store the memory under.
 * @returns Whatever `store.store` returns, typed down to its `id`.
 */
function makeMemory(name: string, scope: "agent" | "swarm"): { id: string } {
  const content = `${name} content`;
  const stored = store.store({ agentId, scope, name, content, source: "manual" });
  return stored;
}
|
|
226
|
+
|
|
227
|
+
describe("memory-rater v1.5 — cross-cutting e2e", () => {
|
|
228
|
+
test("Step A: retrieval bridge writes memory_retrieval rows", () => {
  const agentScoped = makeMemory("mem-A-step-a", "agent");
  const swarmScoped = makeMemory("mem-B-step-a", "swarm");

  // Record one retrieval per memory for the shared task.
  recordRetrievals(taskId, agentId, [
    { memoryId: agentScoped.id, similarity: 0.9 },
    { memoryId: swarmScoped.id, similarity: 0.7 },
  ]);

  const persisted = getDb()
    .prepare<{ memoryId: string }, [string]>(
      "SELECT memoryId FROM memory_retrieval WHERE taskId = ? ORDER BY memoryId",
    )
    .all(taskId);

  // Exactly the two memories above must have been written for this task.
  expect(persisted).toHaveLength(2);
  const persistedIds = persisted.map((row) => row.memoryId).sort();
  const expectedIds = [agentScoped.id, swarmScoped.id].sort();
  expect(persistedIds).toEqual(expectedIds);
});
|
|
245
|
+
|
|
246
|
+
test("Step B: explicit-self rating with edge updates posterior + creates edge", async () => {
  const rated = makeMemory("mem-A-step-b", "agent");
  insertRetrieval(taskId, agentId, rated.id);

  // One fully positive explicit-self rating that also names an external source.
  const referencedSource = "github:desplega-ai/agent-swarm#999";
  const ratingEvent = {
    memoryId: rated.id,
    signal: 1,
    weight: 1,
    source: "explicit-self",
    taskId,
    referencesSource: referencedSource,
  };
  const response = await api("POST", "/api/memory/rate", {
    agentId,
    body: { events: [ratingEvent] },
  });

  expect(response.status).toBe(200);
  expect(response.body.applied).toBe(1);

  // The memory's alpha moves from the reset value of 1 to 2.
  const { alpha: memoryAlpha } = readPosterior(rated.id);
  expect(memoryAlpha).toBeCloseTo(2.0, 5);

  // A single edge from the memory to the referenced source must now exist.
  const edgeRows = getDb()
    .prepare<{ from_id: string; to_id: string; alpha: number; beta: number }, [string]>(
      "SELECT from_id, to_id, alpha, beta FROM agent_memory_edge WHERE from_id = ?",
    )
    .all(rated.id);
  expect(edgeRows).toHaveLength(1);
  const onlyEdge = edgeRows[0]!;
  expect(onlyEdge.to_id).toBe(referencedSource);
  expect(onlyEdge.alpha).toBeCloseTo(2.0, 5);
  expect(onlyEdge.beta).toBeCloseTo(1.0, 5);
});
|
|
281
|
+
|
|
282
|
+
test("Step C: implicit-citation rater hits cited memory, misses the other", async () => {
  const cited = makeMemory("mem-A-step-c", "agent");
  const uncited = makeMemory("mem-B-step-c", "swarm");

  // Both memories were retrieved for the task.
  insertRetrieval(taskId, agentId, cited.id);
  insertRetrieval(taskId, agentId, uncited.id);

  // Pre-condition mirroring step B: an explicit-self rating has already
  // lifted the cited memory's alpha to 2.0.
  applyRating(
    [
      {
        memoryId: cited.id,
        signal: 1,
        weight: 1,
        source: "explicit-self",
      },
    ],
    { taskId },
  );
  expect(readPosterior(cited.id).alpha).toBeCloseTo(2.0, 5);

  // The session log mentions the cited memory's id and never the other one.
  createSessionLogs({
    taskId,
    sessionId: "session-c",
    iteration: 1,
    cli: "claude",
    lines: [`I used ${cited.id} to solve this`, "more progress"],
  });

  // Evidence text = every logged line for the task, joined in log order.
  const evidence = getDb()
    .prepare<{ content: string }, [string]>(
      "SELECT content FROM session_logs WHERE taskId = ? ORDER BY iteration, lineNumber",
    )
    .all(taskId)
    .map((logLine) => logLine.content)
    .join("\n");

  // Fire the server-rater orchestration the way `store-progress` does.
  // The rater is injected explicitly: this test process has its own
  // MEMORY_RATERS env (typically unset to avoid disturbing other suites),
  // and the rater must run here regardless of it.
  const outcome = await runServerRaters(
    {
      taskId,
      agentId,
      retrievedMemoryIds: [cited.id, uncited.id],
      evidence,
    },
    {
      raters: [new ImplicitCitationRater()],
    },
  );
  expect(outcome.ratersFired).toBeGreaterThanOrEqual(1);

  // cited.alpha  = 2.0 (explicit) + 0.5  (implicit hit)  = 2.5
  // uncited.beta = 1.0 (prior)    + 0.25 (implicit miss) = 1.25
  expect(readPosterior(cited.id)).toEqual({ alpha: 2.5, beta: 1.0 });
  expect(readPosterior(uncited.id)).toEqual({ alpha: 1.0, beta: 1.25 });

  // Both rating sources must be present in the task's rating rows.
  const recordedSources = getRatings(taskId)
    .map((rating) => rating.source)
    .sort();
  expect(recordedSources).toContain("implicit-citation");
  expect(recordedSources).toContain("explicit-self");
});
|
|
346
|
+
|
|
347
|
+
test("Step D: LlmRater piggyback updates posteriors + emits a second edge", async () => {
  const helpful = makeMemory("mem-A-step-d", "agent");
  const noisy = makeMemory("mem-B-step-d", "swarm");

  insertRetrieval(taskId, agentId, helpful.id);
  insertRetrieval(taskId, agentId, noisy.id);

  // Hand-assembled stand-in for what the `claude -p` summary call returns
  // when the hook piggybacks — same structure as `SummaryWithRatingsSchema`.
  // Building it by hand (instead of parsing through the schema) keeps the
  // mapping under test explicit.
  const llmRatings = [
    {
      id: helpful.id,
      score: 0.9,
      reasoning: "directly answered the question",
      referencesSource: "linear:DES-294",
    },
    {
      id: noisy.id,
      score: 0.2,
      reasoning: "tangentially related, mostly noise",
    },
  ];

  const events = buildRatingsFromLlm(llmRatings, [{ id: helpful.id }, { id: noisy.id }]);
  expect(events).toHaveLength(2);

  // Sanity-check the mapping (signal = 2*score - 1, weight = 0.8).
  const helpfulEvent = events.find((event) => event.memoryId === helpful.id)!;
  const noisyEvent = events.find((event) => event.memoryId === noisy.id)!;
  expect(helpfulEvent.signal).toBeCloseTo(0.8, 5); // 2*0.9 - 1
  expect(noisyEvent.signal).toBeCloseTo(-0.6, 5); // 2*0.2 - 1
  expect(helpfulEvent.weight).toBeCloseTo(0.8, 5);
  expect(helpfulEvent.referencesSource).toBe("linear:DES-294");
  expect(noisyEvent.referencesSource).toBeUndefined();

  // Submit the mapped events through the HTTP rate endpoint, tagged with the task.
  const eventsForTask = events.map((event) => ({ ...event, taskId }));
  const response = await api("POST", "/api/memory/rate", {
    agentId,
    body: {
      events: eventsForTask,
    },
  });
  expect(response.status).toBe(200);
  expect(response.body.applied).toBe(2);

  // Posterior shifts: alphaDelta = max(0, signal) * weight,
  //                   betaDelta  = max(0, -signal) * weight.
  // helpful: alpha = 1 + 0.8 * 0.8 = 1.64, beta = 1
  // noisy:   alpha = 1, beta = 1 + 0.6 * 0.8 = 1.48
  expect(readPosterior(helpful.id).alpha).toBeCloseTo(1.64, 5);
  expect(readPosterior(helpful.id).beta).toBeCloseTo(1.0, 5);
  expect(readPosterior(noisy.id).alpha).toBeCloseTo(1.0, 5);
  expect(readPosterior(noisy.id).beta).toBeCloseTo(1.48, 5);

  // Only the event that named a source produced an edge.
  expect(countEdges(helpful.id)).toBe(1);
  expect(countEdges(noisy.id)).toBe(0);

  const edgeRows = getDb()
    .prepare<{ to_id: string }, [string]>("SELECT to_id FROM agent_memory_edge WHERE from_id = ?")
    .all(helpful.id);
  expect(edgeRows[0]!.to_id).toBe("linear:DES-294");
});
|
|
409
|
+
|
|
410
|
+
test("Step E: GET /api/memory/retrievals + GET /api/memory/edges return what was written", async () => {
  const linked = makeMemory("mem-A-step-e", "agent");
  const plain = makeMemory("mem-B-step-e", "swarm");

  insertRetrieval(taskId, agentId, linked.id);
  insertRetrieval(taskId, agentId, plain.id);

  // Give the first memory two edges: one via explicit-self (github) ...
  applyRating(
    [
      {
        memoryId: linked.id,
        signal: 1,
        weight: 1,
        source: "explicit-self",
        referencesSource: "github:desplega-ai/agent-swarm#999",
      },
    ],
    { taskId },
  );
  // ... and one via the llm source (linear).
  applyRating(
    [
      {
        memoryId: linked.id,
        signal: 0.8,
        weight: 0.8,
        source: "llm",
        referencesSource: "linear:DES-294",
      },
    ],
    { taskId },
  );

  // Retrievals endpoint: both memories are listed for the task.
  const retrievalsResponse = await api("GET", `/api/memory/retrievals?taskId=${taskId}`, {
    agentId,
  });
  expect(retrievalsResponse.status).toBe(200);
  expect(Array.isArray(retrievalsResponse.body.results)).toBe(true);
  expect(retrievalsResponse.body.results).toHaveLength(2);
  const retrievedIds = (retrievalsResponse.body.results as { id: string }[])
    .map((entry) => entry.id)
    .sort();
  expect(retrievedIds).toEqual([linked.id, plain.id].sort());

  // Edges endpoint: both edges of the first memory are listed.
  const edgesResponse = await api("GET", `/api/memory/edges?memoryId=${linked.id}`, {
    agentId,
  });
  expect(edgesResponse.status).toBe(200);
  expect(Array.isArray(edgesResponse.body.edges)).toBe(true);
  expect(edgesResponse.body.edges).toHaveLength(2);
  const targets = (edgesResponse.body.edges as { to: string }[]).map((edge) => edge.to).sort();
  expect(targets).toEqual(["github:desplega-ai/agent-swarm#999", "linear:DES-294"]);

  // Each edge received a positive rating, so alpha is above 1 and the
  // reported usefulness lies in [1, 2].
  const ratedEdges = edgesResponse.body.edges as {
    alpha: number;
    beta: number;
    usefulness: number;
  }[];
  for (const ratedEdge of ratedEdges) {
    expect(ratedEdge.alpha).toBeGreaterThan(1);
    expect(ratedEdge.usefulness).toBeGreaterThanOrEqual(1.0);
    expect(ratedEdge.usefulness).toBeLessThanOrEqual(2.0);
  }
});
|
|
470
|
+
|
|
471
|
+
test("Step F: reranker — usefulness > 1 after positive ratings, mem-A ranks higher than baseline", () => {
  const boosted = makeMemory("mem-A-step-f", "agent");
  const untouched = makeMemory("mem-B-step-f", "swarm");

  // Deterministic candidate factory — only id, similarity and the posterior
  // vary; every other field is fixed (timestamps are taken at call time).
  function candidateFor(
    id: string,
    similarity: number,
    alpha: number,
    beta: number,
  ): MemoryCandidate {
    return {
      id,
      agentId,
      scope: "agent",
      name: id,
      content: id,
      source: "manual",
      similarity,
      createdAt: new Date().toISOString(),
      accessedAt: new Date().toISOString(),
      accessCount: 0,
      alpha,
      beta,
      tags: null,
      sourceTaskId: null,
      sourcePath: null,
      chunkIndex: null,
      totalChunks: null,
      expiresAt: null,
      embeddingModel: null,
    };
  }

  // Baseline: both memories sit at Beta(1,1), i.e. usefulness = 1.
  const unratedBoosted = candidateFor(boosted.id, 0.5, 1, 1);
  const unratedOther = candidateFor(untouched.id, 0.5, 1, 1);
  const baselineOrder = rerank([unratedBoosted, unratedOther], { limit: 2 });
  const scoreBefore = baselineOrder.find((entry) => entry.id === boosted.id)!.similarity;

  // Rated: posterior (2.5, 1.0) — the numbers asserted in step C — gives
  // usefulness = clamp(2 * 2.5 / 3.5, 1.0, 2.0) ≈ 1.428, so the rescaled
  // score must end up strictly above the baseline.
  const ratedBoosted = candidateFor(boosted.id, 0.5, 2.5, 1.0);
  const ratedOrder = rerank([ratedBoosted, unratedOther], { limit: 2 });
  const scoreAfter = ratedOrder.find((entry) => entry.id === boosted.id)!.similarity;

  expect(scoreAfter).toBeGreaterThan(scoreBefore);
  // The rated memory now leads (it was tied with the other one before).
  expect(ratedOrder[0]!.id).toBe(boosted.id);
});
|
|
521
|
+
|
|
522
|
+
test("Step G: backward compat — Beta(1,1) yields a usefulness factor of exactly 1.0 (byte-identical)", () => {
  const id = randomUUID();

  // Unrated candidate factory: posterior fixed at Beta(1,1), no accesses,
  // timestamps taken at call time; only the similarity varies.
  function unratedCandidate(similarity: number): MemoryCandidate {
    return {
      id,
      agentId,
      scope: "agent",
      name: id,
      content: id,
      source: "manual",
      similarity,
      createdAt: new Date().toISOString(),
      accessedAt: new Date().toISOString(),
      accessCount: 0,
      alpha: 1,
      beta: 1,
      tags: null,
      sourceTaskId: null,
      sourcePath: null,
      chunkIndex: null,
      totalChunks: null,
      expiresAt: null,
      embeddingModel: null,
    };
  }

  // The pre-rater reranker computed: similarity * recency_decay * access_boost.
  // With access_boost = 1 (accessCount = 0) and a fresh timestamp
  // (recency_decay ≈ 1 for ages well below the half-life) the score is
  // approximately the input similarity. The usefulness factor MUST be exactly
  // 1.0 so the v1.5 reranker matches pre-v1.5 output for unrated memories.
  const pool = [unratedCandidate(0.5), unratedCandidate(0.7), unratedCandidate(0.3)];
  const ranked = rerank(pool, { limit: 3 });

  // With every other multiplier equal, the output must already be in
  // descending similarity order (sorting a copy changes nothing).
  const outputScores = ranked.map((entry) => entry.similarity);
  const descendingScores = ranked
    .map((entry) => entry.similarity)
    .sort((a, b) => b - a);
  expect(descendingScores).toEqual(outputScores);

  // A single unrated memory should score within numerical noise of
  // similarity * recency * access (the original pre-v1.5 formula): recency at
  // age = 0 is 1, access_boost at count = 0 is 1, usefulness at (1,1) is 1.
  // NOTE(review): the timestamps come from `new Date()` just before `rerank`
  // runs, so the age is only approximately 0 — confirm the reranker's decay
  // keeps the score within the 1e-10 tolerance checked below.
  const fresh = unratedCandidate(0.5);
  const freshScore = rerank([fresh], { limit: 1 })[0]!.similarity;
  expect(freshScore).toBeCloseTo(0.5, 10);
});
|
|
569
|
+
});
|
|
570
|
+
|
|
571
|
+
/**
 * Inserts one `memory_retrieval` row directly, bypassing the retrieval bridge.
 *
 * The row gets a random id, a NULL `sessionId`, a fixed similarity of 0.85 and
 * the current time as `retrievedAt`.
 *
 * @param taskIdArg Task the retrieval is attributed to.
 * @param agentIdArg Agent the retrieval is attributed to.
 * @param memoryId Memory that was "retrieved".
 */
function insertRetrieval(taskIdArg: string, agentIdArg: string, memoryId: string): void {
  const insert = getDb().prepare(
    `INSERT INTO memory_retrieval (id, taskId, agentId, sessionId, memoryId, similarity, retrievedAt)
VALUES (?, ?, ?, NULL, ?, 0.85, ?)`,
  );
  const rowId = randomUUID();
  const retrievedAt = new Date().toISOString();
  insert.run(rowId, taskIdArg, agentIdArg, memoryId, retrievedAt);
}
|