@desplega.ai/agent-swarm 1.74.4 → 1.76.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/openapi.json +1264 -46
- package/package.json +2 -2
- package/src/be/db.ts +563 -9
- package/src/be/memory/edges-store.ts +69 -0
- package/src/be/memory/providers/sqlite-store.ts +4 -0
- package/src/be/memory/raters/explicit-self.ts +22 -0
- package/src/be/memory/raters/implicit-citation.ts +44 -0
- package/src/be/memory/raters/llm-client.ts +172 -0
- package/src/be/memory/raters/llm-summarizer.ts +218 -0
- package/src/be/memory/raters/llm.ts +375 -0
- package/src/be/memory/raters/noop.ts +14 -0
- package/src/be/memory/raters/registry.ts +86 -0
- package/src/be/memory/raters/retrieval.ts +88 -0
- package/src/be/memory/raters/run-server-raters.ts +97 -0
- package/src/be/memory/raters/store.ts +228 -0
- package/src/be/memory/raters/types.ts +101 -0
- package/src/be/memory/reranker.ts +32 -2
- package/src/be/memory/retrieval-store.ts +116 -0
- package/src/be/memory/types.ts +3 -0
- package/src/be/migrations/051_memory_posteriors_and_retrieval.sql +67 -0
- package/src/be/migrations/052_memory_edges.sql +36 -0
- package/src/be/migrations/053_agent_waiting_for_credentials_status.sql +61 -0
- package/src/be/migrations/054_agent_harness_provider.sql +21 -0
- package/src/be/migrations/055_agent_cred_status.sql +15 -0
- package/src/be/migrations/056_drop_agent_tasks_source_check.sql +139 -0
- package/src/be/migrations/057_inbox_item_state.sql +27 -0
- package/src/be/migrations/058_task_templates.sql +31 -0
- package/src/be/swarm-config-guard.ts +24 -0
- package/src/commands/credential-wait.ts +186 -0
- package/src/commands/provider-credentials.ts +434 -0
- package/src/commands/runner.ts +253 -21
- package/src/hooks/hook.ts +143 -66
- package/src/http/agents.ts +191 -1
- package/src/http/config.ts +11 -1
- package/src/http/core.ts +5 -0
- package/src/http/inbox-state.ts +89 -0
- package/src/http/index.ts +10 -0
- package/src/http/memory.ts +230 -1
- package/src/http/sessions.ts +86 -0
- package/src/http/status.ts +665 -0
- package/src/http/task-templates.ts +51 -0
- package/src/http/tasks.ts +85 -5
- package/src/http/users.ts +134 -0
- package/src/prompts/memories.ts +62 -0
- package/src/providers/claude-adapter.ts +22 -0
- package/src/providers/claude-managed-adapter.ts +24 -0
- package/src/providers/codex-adapter.ts +43 -1
- package/src/providers/devin-adapter.ts +18 -0
- package/src/providers/index.ts +7 -0
- package/src/providers/opencode-adapter.ts +60 -0
- package/src/providers/pi-mono-adapter.ts +71 -0
- package/src/providers/types.ts +34 -0
- package/src/server.ts +2 -0
- package/src/slack/handlers.ts +0 -1
- package/src/tests/agents-harness-provider.test.ts +333 -0
- package/src/tests/credential-check.test.ts +367 -0
- package/src/tests/credential-status-api.test.ts +223 -0
- package/src/tests/credential-status-routing.test.ts +150 -0
- package/src/tests/credential-wait.test.ts +282 -0
- package/src/tests/harness-provider-resolution.test.ts +242 -0
- package/src/tests/jira-sync.test.ts +1 -1
- package/src/tests/memory-edges.test.ts +722 -0
- package/src/tests/memory-rate-endpoint.test.ts +330 -0
- package/src/tests/memory-rate-tool.test.ts +252 -0
- package/src/tests/memory-rater-e2e.test.ts +578 -0
- package/src/tests/memory-rater-implicit-citation.test.ts +304 -0
- package/src/tests/memory-rater-llm-summarizer.test.ts +317 -0
- package/src/tests/memory-rater-llm.test.ts +964 -0
- package/src/tests/memory-rater-store.test.ts +249 -0
- package/src/tests/memory-reranker.test.ts +161 -2
- package/src/tests/migration-runner-regressions.test.ts +17 -2
- package/src/tests/mocks/mock-llm-rater-client.ts +35 -0
- package/src/tests/run-server-raters.test.ts +291 -0
- package/src/tests/sessions.test.ts +141 -0
- package/src/tests/status.test.ts +843 -0
- package/src/tests/stop-hook-task-resolution.test.ts +98 -0
- package/src/tests/template-recommendations.test.ts +148 -0
- package/src/tests/tool-annotations.test.ts +2 -2
- package/src/tests/use-dismissible-card.test.ts +140 -0
- package/src/tools/memory-rate.ts +166 -0
- package/src/tools/memory-search.ts +18 -0
- package/src/tools/store-progress.ts +37 -0
- package/src/tools/swarm-config/set-config.ts +17 -1
- package/src/tools/tool-config.ts +1 -0
- package/src/types.ts +122 -1
- package/src/utils/harness-provider.ts +32 -0
- package/tsconfig.json +0 -2
|
@@ -0,0 +1,964 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Unit + integration tests for `LlmRater` and the worker-side hook piggyback.
|
|
3
|
+
*
|
|
4
|
+
* Plan: thoughts/taras/plans/2026-05-05-memory-rater-v1.5/step-4.md §6, §7
|
|
5
|
+
*
|
|
6
|
+
* Layout:
|
|
7
|
+
* 1. Pure unit tests — Zod schema parse/reject, `buildRatingsFromLlm`
|
|
8
|
+
* mapping, prompt construction.
|
|
9
|
+
* 2. `LlmRater.rate(ctx)` per-memory path with `MockLlmRaterClient`.
|
|
10
|
+
* 3. HTTP integration: spawn the API server against an isolated SQLite
|
|
11
|
+
* file, simulate the hook's piggyback flow (`generateObject` is mocked
|
|
12
|
+
* by feeding the parsed object directly into `buildRatingsFromLlm`),
|
|
13
|
+
* and assert `agent_memory.alpha/beta` move + `memory_rating` rows are
|
|
14
|
+
* written.
|
|
15
|
+
* 4. Negative path: `MEMORY_RATERS` unset → no `/api/memory/rate` call.
|
|
16
|
+
*/
|
|
17
|
+
import { afterAll, beforeAll, beforeEach, describe, expect, test } from "bun:test";
|
|
18
|
+
import { randomUUID } from "node:crypto";
|
|
19
|
+
import { unlink } from "node:fs/promises";
|
|
20
|
+
import type { Subprocess } from "bun";
|
|
21
|
+
import { closeDb, createAgent, getDb, initDb } from "../be/db";
|
|
22
|
+
import { SqliteMemoryStore } from "../be/memory/providers/sqlite-store";
|
|
23
|
+
import {
|
|
24
|
+
buildRatingsFromLlm,
|
|
25
|
+
buildSummaryWithRatingsPrompt,
|
|
26
|
+
dedupeRetrievalsForRater,
|
|
27
|
+
fetchRetrievalsForTask,
|
|
28
|
+
isLlmRaterEnabled,
|
|
29
|
+
LLM_RATER_WEIGHT,
|
|
30
|
+
LlmRater,
|
|
31
|
+
postRatings,
|
|
32
|
+
type RetrievalRow,
|
|
33
|
+
SummaryWithRatingsSchema,
|
|
34
|
+
} from "../be/memory/raters/llm";
|
|
35
|
+
import { getRegisteredRaters, SERVER_RATERS } from "../be/memory/raters/registry";
|
|
36
|
+
import type { RatingEvent } from "../be/memory/raters/types";
|
|
37
|
+
import { MockLlmRaterClient } from "./mocks/mock-llm-rater-client";
|
|
38
|
+
|
|
39
|
+
// ─────────────────────────────────────────────────────────────────────────────
|
|
40
|
+
// 1. Pure unit tests — schema, mapping, prompt. No DB / network required.
|
|
41
|
+
// ─────────────────────────────────────────────────────────────────────────────
|
|
42
|
+
|
|
43
|
+
describe("SummaryWithRatingsSchema", () => {
  // Parses a payload that is valid apart from the single rating under test.
  const parseSingleRating = (rating: unknown) =>
    SummaryWithRatingsSchema.safeParse({ summary: "x", ratings: [rating] });

  test("accepts a well-formed response", () => {
    const parsed = SummaryWithRatingsSchema.safeParse({
      summary: "key learnings",
      ratings: [
        { id: "mem-1", score: 0.9, reasoning: "directly answered" },
        { id: "mem-2", score: 0, reasoning: "irrelevant" },
      ],
    });
    expect(parsed.success).toBe(true);
    if (parsed.success) expect(parsed.data.ratings).toHaveLength(2);
  });

  test("defaults `ratings` to [] when omitted", () => {
    const parsed = SummaryWithRatingsSchema.safeParse({ summary: "no retrievals" });
    expect(parsed.success).toBe(true);
    if (parsed.success) expect(parsed.data.ratings).toEqual([]);
  });

  // Each entry is [test-name suffix, malformed rating]; the schema must reject all of them.
  const malformedRatings: ReadonlyArray<readonly [string, unknown]> = [
    ["score > 1", { id: "m", score: 1.2, reasoning: "n/a" }],
    ["score < 0", { id: "m", score: -0.1, reasoning: "n/a" }],
    ["empty reasoning", { id: "m", score: 0.5, reasoning: "" }],
    ["reasoning > 500 chars", { id: "m", score: 0.5, reasoning: "a".repeat(501) }],
    ["missing reasoning", { id: "m", score: 0.5 }],
    ["non-string id", { id: 42, score: 0.5, reasoning: "ok" }],
  ];

  for (const [label, rating] of malformedRatings) {
    test(`rejects ${label}`, () => {
      expect(parseSingleRating(rating).success).toBe(false);
    });
  }
});
|
|
110
|
+
|
|
111
|
+
describe("buildRatingsFromLlm", () => {
  // The retrieval set every test maps against: three known memory ids.
  const retrievals = ["A", "B", "C"].map((letter) => ({
    id: `mem-${letter}`,
    name: letter.toLowerCase(),
    content: "",
  }));

  test("score=0 → signal=-1, score=0.5 → signal=0, score=1 → signal=+1", () => {
    const llmRatings = [
      { id: "mem-A", score: 0, reasoning: "useless" },
      { id: "mem-B", score: 0.5, reasoning: "neutral" },
      { id: "mem-C", score: 1, reasoning: "perfect" },
    ];
    const events = buildRatingsFromLlm(llmRatings, retrievals);
    expect(events).toHaveLength(3);

    const signalOf = (memoryId: string) => events.find((e) => e.memoryId === memoryId)!.signal;
    const expectedSignals = [
      ["mem-A", -1],
      ["mem-B", 0],
      ["mem-C", 1],
    ] as const;
    for (const [memoryId, expected] of expectedSignals) {
      expect(signalOf(memoryId)).toBeCloseTo(expected, 6);
    }
  });

  test("weight is exactly 0.8 for every event (research-doc constant)", () => {
    const llmRatings = [
      { id: "mem-A", score: 0.2, reasoning: "x" },
      { id: "mem-B", score: 0.7, reasoning: "y" },
    ];
    buildRatingsFromLlm(llmRatings, retrievals).forEach((event) => {
      expect(event.weight).toBe(0.8);
    });
    expect(LLM_RATER_WEIGHT).toBe(0.8);
  });

  test("source is set to 'llm' (the HTTP rate endpoint enums it)", () => {
    const [first] = buildRatingsFromLlm([{ id: "mem-A", score: 0.5, reasoning: "x" }], retrievals);
    expect(first!.source).toBe("llm");
  });

  test("reasoning is preserved on each event", () => {
    const reasoning = "directly answered the question";
    const [first] = buildRatingsFromLlm([{ id: "mem-A", score: 0.7, reasoning }], retrievals);
    expect(first!.reasoning).toBe("directly answered the question");
  });

  test("drops ratings whose id is not in the retrieval set (anti-hallucination)", () => {
    const llmRatings = [
      { id: "mem-A", score: 0.9, reasoning: "real" },
      // This id is absent from `retrievals`, so it must not produce an event.
      { id: "mem-HALLUCINATED", score: 0.9, reasoning: "fake" },
    ];
    const events = buildRatingsFromLlm(llmRatings, retrievals);
    expect(events).toHaveLength(1);
    expect(events[0]!.memoryId).toBe("mem-A");
  });

  test("empty ratings array → empty events", () => {
    expect(buildRatingsFromLlm([], retrievals)).toEqual([]);
  });
});
|
|
180
|
+
|
|
181
|
+
describe("buildSummaryWithRatingsPrompt", () => {
  test("returns base prompt unchanged when retrievals is empty", () => {
    const basePrompt = "BASE_PROMPT";
    const built = buildSummaryWithRatingsPrompt(basePrompt, []);
    expect(built).toBe(basePrompt);
  });

  test("appends schema instruction + memory list when retrievals is non-empty", () => {
    const memories = [
      { id: "mem-1", name: "first", content: "alpha content" },
      { id: "mem-2", name: "second", content: "beta content" },
    ];
    const prompt = buildSummaryWithRatingsPrompt("BASE", memories);

    expect(prompt.startsWith("BASE")).toBe(true);
    // Schema field names first, then every id, content, and name of the listed memories.
    const expectedFragments = [
      '"summary"',
      '"ratings"',
      "mem-1",
      "mem-2",
      "alpha content",
      "beta content",
      "first",
      "second",
    ];
    for (const fragment of expectedFragments) {
      expect(prompt).toContain(fragment);
    }
  });

  test("truncates long memory content into the prompt", () => {
    const oversized = "x".repeat(5000);
    const prompt = buildSummaryWithRatingsPrompt("BASE", [
      { id: "mem-long", name: "L", content: oversized },
    ]);
    // Truncation cap is 600 chars + ellipsis, so the full 5000-char body must not be echoed.
    const echoedInFull = prompt.includes(oversized);
    expect(echoedInFull).toBe(false);
    expect(prompt).toContain("…");
  });
});
|
|
213
|
+
|
|
214
|
+
describe("dedupeRetrievalsForRater", () => {
  // Regression: the LLM rater audit (post-PR #450) found that scheduled-task
  // self-similarity inflated alpha posteriors 5x in a single rater pass. The
  // Claude Code Changelog Monitor cron surfaced 5 memories from prior hourly
  // runs and each one was rated +1.0. Dedup keys on `scheduleId`, so only
  // memories from the same scheduled job collapse; distinct one-shot tasks
  // pass through even when their truncated 80-char names collide.

  // Projects a row list down to its ids for order-sensitive comparisons.
  const idsOf = (rows: ReadonlyArray<{ id: string }>): string[] => rows.map((row) => row.id);

  test("happy path: 5 cron memories sharing scheduleId + 1 distinct → 2 rows", () => {
    const cronName = "Task: Claude Code Changelog Monitor — check for new entries";
    const cronScheduleId = "sched-claude-code-changelog";

    // Newest cron run first (API returns DESC by retrievedAt).
    const cronRuns: RetrievalRow[] = [5, 4, 3, 2, 1].map((run) => ({
      id: `cron-${run}`,
      name: cronName,
      content: `run ${run}`,
      scheduleId: cronScheduleId,
      retrievedAt: `2026-05-08T0${run}:00:00Z`,
    }));
    // Different one-shot task — null scheduleId, must pass through.
    const oneShot: RetrievalRow = {
      id: "distinct",
      name: "Task: Refactor MCP tool list",
      content: "x",
      scheduleId: null,
      retrievedAt: "2026-05-07T12:00:00Z",
    };

    const deduped = dedupeRetrievalsForRater([...cronRuns, oneShot]);

    expect(deduped).toHaveLength(2);
    // First-seen wins → freshest cron run is the representative.
    expect(idsOf(deduped)).toEqual(["cron-5", "distinct"]);
  });

  test("two distinct one-shot tasks sharing the truncated 80-char name prefix → both kept", () => {
    // Reviewer's flagged false-positive: `Task: ${task.task.slice(0, 80)}`
    // collapses two distinct tasks whose first 80 chars happen to match. With
    // scheduleId-keyed dedup, both have `null` scheduleId and pass through.
    const sharedPrefix = `Task: ${"x".repeat(80)}`;
    const oneShotRow = (id: string, suffix: string, retrievedAt: string): RetrievalRow => ({
      id,
      name: sharedPrefix,
      content: `${sharedPrefix} unique-suffix-${suffix}\n\nOutput:\n…`,
      scheduleId: null,
      retrievedAt,
    });

    const deduped = dedupeRetrievalsForRater([
      oneShotRow("task-a", "A", "2026-05-08T05:00:00Z"),
      oneShotRow("task-b", "B", "2026-05-08T04:00:00Z"),
    ]);

    expect(deduped).toHaveLength(2);
    expect(idsOf(deduped)).toEqual(["task-a", "task-b"]);
  });

  test("Task: vs Session: with the same prefix → both kept (different memory types)", () => {
    // Both names share their first 80 chars after the type prefix; both have
    // null scheduleId (one-shot work). Must pass through.
    const sharedSuffix = "Refactor MCP tool list to use deferred discovery";
    const taskRow: RetrievalRow = {
      id: "task",
      name: `Task: ${sharedSuffix}`,
      content: "task body",
      source: "task_completion",
      scheduleId: null,
      retrievedAt: "2026-05-08T05:00:00Z",
    };
    const sessionRow: RetrievalRow = {
      id: "session",
      name: `Session: ${sharedSuffix}`,
      content: "session summary",
      source: "session_summary",
      scheduleId: null,
      retrievedAt: "2026-05-08T04:00:00Z",
    };

    const deduped = dedupeRetrievalsForRater([taskRow, sessionRow]);

    expect(deduped).toHaveLength(2);
    expect(idsOf(deduped)).toEqual(["task", "session"]);
  });

  test("two different scheduled jobs surface in the same set → both representatives kept", () => {
    const scheduledRow = (id: string, name: string, scheduleId: string): RetrievalRow => ({
      id,
      name,
      content: "",
      scheduleId,
    });

    const deduped = dedupeRetrievalsForRater([
      scheduledRow("j1-r2", "Task: Job One", "sched-1"),
      scheduledRow("j1-r1", "Task: Job One", "sched-1"),
      scheduledRow("j2-r2", "Task: Job Two", "sched-2"),
      scheduledRow("j2-r1", "Task: Job Two", "sched-2"),
    ]);

    expect(deduped).toHaveLength(2);
    expect(idsOf(deduped)).toEqual(["j1-r2", "j2-r2"]);
  });

  test("rows without scheduleId pass through unchanged (manual / file_index memories)", () => {
    const unscheduled: RetrievalRow[] = [
      { id: "m1", name: "Manual note", content: "", source: "manual" },
      { id: "m2", name: "Manual note", content: "", source: "manual" },
      { id: "m3", name: "Indexed file", content: "", source: "file_index", scheduleId: null },
    ];
    const deduped = dedupeRetrievalsForRater(unscheduled);
    expect(deduped).toEqual(unscheduled);
  });

  test("empty input → empty output", () => {
    const deduped = dedupeRetrievalsForRater([]);
    expect(deduped).toEqual([]);
  });
});
|
|
363
|
+
|
|
364
|
+
describe("isLlmRaterEnabled", () => {
  /**
   * Runs `body` with `process.env.MEMORY_RATERS` set to `value` (or removed
   * when `value` is undefined), then puts the variable back exactly as it was,
   * even if `body` throws.
   */
  const withMemoryRaters = (value: string | undefined, body: () => void): void => {
    const saved = process.env.MEMORY_RATERS;
    if (value === undefined) delete process.env.MEMORY_RATERS;
    else process.env.MEMORY_RATERS = value;
    try {
      body();
    } finally {
      if (saved === undefined) delete process.env.MEMORY_RATERS;
      else process.env.MEMORY_RATERS = saved;
    }
  };

  test("false when MEMORY_RATERS unset", () => {
    withMemoryRaters(undefined, () => {
      expect(isLlmRaterEnabled()).toBe(false);
    });
  });

  test("false when MEMORY_RATERS lacks 'llm'", () => {
    withMemoryRaters("implicit-citation,noop", () => {
      expect(isLlmRaterEnabled()).toBe(false);
    });
  });

  test("true when MEMORY_RATERS includes 'llm'", () => {
    withMemoryRaters("implicit-citation,llm", () => {
      expect(isLlmRaterEnabled()).toBe(true);
    });
  });
});
|
|
397
|
+
|
|
398
|
+
describe("registry: 'llm' is registered but not in SERVER_RATERS", () => {
  test("getRegisteredRaters() with MEMORY_RATERS='llm' yields LlmRater", () => {
    // Remember the ambient value so the override cannot leak into other tests.
    const saved = process.env.MEMORY_RATERS;
    process.env.MEMORY_RATERS = "llm";
    try {
      const raterNames = getRegisteredRaters().map((rater) => rater.name);
      expect(raterNames).toContain("llm");
    } finally {
      if (saved !== undefined) {
        process.env.MEMORY_RATERS = saved;
      } else {
        delete process.env.MEMORY_RATERS;
      }
    }
  });

  test("'llm' is NOT in SERVER_RATERS — only worker-driven", () => {
    const listedAsServerRater = SERVER_RATERS.has("llm");
    expect(listedAsServerRater).toBe(false);
  });
});
|
|
415
|
+
|
|
416
|
+
// ─────────────────────────────────────────────────────────────────────────────
|
|
417
|
+
// 2. LlmRater.rate(ctx) per-memory path with MockLlmRaterClient.
|
|
418
|
+
// ─────────────────────────────────────────────────────────────────────────────
|
|
419
|
+
|
|
420
|
+
describe("LlmRater.rate(ctx) — per-memory path with MockLlmRaterClient", () => {
  // Builds the minimal retrieved-memory record handed to the rater.
  const memory = (id: string, name: string, content = "") => ({ id, name, content });

  test("name is 'llm'", () => {
    const { name } = new LlmRater(new MockLlmRaterClient({}));
    expect(name).toBe("llm");
  });

  test("returns [] when retrievedMemoryIds is empty", async () => {
    const rater = new LlmRater(new MockLlmRaterClient({}));
    const ctx = { agentId: "agent-x", retrievedMemoryIds: [], evidence: "anything" };
    expect(await rater.rate(ctx)).toEqual([]);
  });

  test("returns [] when retrievedMemories is missing (RatingContext only)", async () => {
    // The mock has an answer for mem-A, but without `retrievedMemories` the
    // rater is expected to emit no events.
    const client = new MockLlmRaterClient({ "mem-A": { score: 1, reasoning: "x" } });
    const rater = new LlmRater(client);
    const ctx = { agentId: "agent-x", retrievedMemoryIds: ["mem-A"], evidence: "x" };
    expect(await rater.rate(ctx)).toEqual([]);
  });

  test("calls client per memory and maps score → signal correctly", async () => {
    const client = new MockLlmRaterClient({
      "mem-A": { score: 1, reasoning: "perfect" },
      "mem-B": { score: 0, reasoning: "useless" },
      "mem-C": { score: 0.5, reasoning: "neutral" },
    });
    const events = await new LlmRater(client).rate({
      agentId: "agent-x",
      taskId: "task-1",
      retrievedMemoryIds: ["mem-A", "mem-B", "mem-C"],
      retrievedMemories: [
        memory("mem-A", "A", "ca"),
        memory("mem-B", "B", "cb"),
        memory("mem-C", "C", "cc"),
      ],
      query: "the question",
      response: "the response",
      evidence: null,
    });

    expect(events).toHaveLength(3);
    expect(client.calls).toHaveLength(3);
    // Source is empty (framework will stamp via runServerRaters / store).
    events.forEach((event) => expect(event.source).toBe(""));
    events.forEach((event) => expect(event.weight).toBe(0.8));

    const eventFor = (memoryId: string) => events.find((e) => e.memoryId === memoryId)!;
    const ratedA = eventFor("mem-A");
    const ratedB = eventFor("mem-B");
    const ratedC = eventFor("mem-C");
    expect(ratedA.signal).toBeCloseTo(1, 6);
    expect(ratedB.signal).toBeCloseTo(-1, 6);
    expect(ratedC.signal).toBeCloseTo(0, 6);
    expect(ratedA.reasoning).toBe("perfect");
  });

  test("client returns null → that memory is skipped (no event emitted)", async () => {
    const client = new MockLlmRaterClient({
      "mem-A": { score: 0.7, reasoning: "useful" },
      "mem-B": null, // simulates LLM parse failure
    });
    const events = await new LlmRater(client).rate({
      agentId: "agent-x",
      retrievedMemoryIds: ["mem-A", "mem-B"],
      retrievedMemories: [memory("mem-A", "A"), memory("mem-B", "B")],
      evidence: null,
    });
    expect(events).toHaveLength(1);
    expect(events[0]!.memoryId).toBe("mem-A");
  });

  test("uses ctx.evidence as `response` when ctx.response is missing", async () => {
    const client = new MockLlmRaterClient({ "mem-A": { score: 1, reasoning: "x" } });
    await new LlmRater(client).rate({
      agentId: "agent-x",
      retrievedMemoryIds: ["mem-A"],
      retrievedMemories: [memory("mem-A", "A", "ca")],
      evidence: "fallback evidence",
    });
    // The only recorded call must carry the evidence text in its `response` slot.
    const [firstCall] = client.calls;
    expect(firstCall!.response).toBe("fallback evidence");
  });
});
|
|
513
|
+
|
|
514
|
+
// ─────────────────────────────────────────────────────────────────────────────
|
|
515
|
+
// 3. HTTP integration — hook-piggyback dry-run.
|
|
516
|
+
// 4. Negative path — MEMORY_RATERS unset → no rate call.
|
|
517
|
+
// ─────────────────────────────────────────────────────────────────────────────
|
|
518
|
+
|
|
519
|
+
// Port handed to the spawned API server through the `PORT` env var.
const TEST_PORT = 19119;
// SQLite file shared by the spawned server (`DATABASE_PATH`) and this test
// process (`initDb`). The timestamp keeps separate runs from reusing a file.
const TEST_DB_PATH = `/tmp/test-memory-rater-llm-${Date.now()}.sqlite`;
// Base URL used for every request against the spawned server.
const BASE = `http://localhost:${TEST_PORT}`;
// API key passed to the server's environment and to the client helpers under test.
const API_KEY = "test-key";

// Handle to the spawned server process; assigned in `beforeAll`, killed in `afterAll`.
let serverProc: Subprocess;
// Fresh ids for the single test agent and its two tasks.
const agentA = randomUUID();
const taskA = randomUUID();
const taskB = randomUUID();
// Memory store bound to the test database; assigned in `beforeAll`.
let store: SqliteMemoryStore;

// Typed view of `globalThis` for the two template slots this suite touches:
// `__testMigrationTemplate` is cleared in `beforeAll` before `initDb` runs and
// restored in `afterAll`; `__savedRaterLlmTemplate` holds the saved value in
// between.
const testTemplateGlobals = globalThis as typeof globalThis & {
  __testMigrationTemplate?: Uint8Array;
  __savedRaterLlmTemplate?: Uint8Array;
};
|
|
534
|
+
|
|
535
|
+
/**
 * Polls `url` until it answers with a 2xx status or the deadline passes.
 *
 * Each probe is aborted once the overall deadline is reached, so a server that
 * accepts the connection but never responds cannot keep this helper waiting
 * longer than `timeoutMs` (an un-aborted `fetch` would otherwise block the
 * loop indefinitely).
 *
 * @param url       Endpoint to probe (e.g. the server's health route).
 * @param timeoutMs Total time budget in milliseconds; defaults to 15000.
 * @returns Resolves as soon as one probe returns an OK response.
 * @throws Error when no probe succeeded within `timeoutMs`.
 */
async function waitForServer(url: string, timeoutMs = 15000): Promise<void> {
  const deadline = Date.now() + timeoutMs;
  while (Date.now() < deadline) {
    // Bound this probe by whatever is left of the overall budget.
    const controller = new AbortController();
    const remainingMs = Math.max(1, deadline - Date.now());
    const abortTimer = setTimeout(() => controller.abort(), remainingMs);
    try {
      const r = await fetch(url, { signal: controller.signal });
      if (r.ok) return;
    } catch {
      // Not ready yet (connection refused, or the probe was aborted) — retry.
    } finally {
      clearTimeout(abortTimer);
    }
    await Bun.sleep(50);
  }
  throw new Error(`Server did not start within ${timeoutMs}ms`);
}
|
|
548
|
+
|
|
549
|
+
/**
 * Stores a manual, agent-scoped memory for the test agent and returns the
 * store's result. The content is derived from `name`.
 */
function makeMemory(name: string): { id: string } {
  const content = `${name} content`;
  const created = store.store({
    agentId: agentA,
    scope: "agent",
    name,
    content,
    source: "manual",
  });
  return created;
}
|
|
558
|
+
|
|
559
|
+
/**
 * Inserts one `memory_retrieval` row linking `memoryId` to `taskId` for the
 * test agent, with a NULL session, a fixed similarity of 0.85, and the current
 * time as `retrievedAt`.
 */
function insertRetrieval(taskId: string, memoryId: string): void {
  const retrievalId = randomUUID();
  const retrievedAt = new Date().toISOString();
  const statement = getDb().prepare(
    `INSERT INTO memory_retrieval (id, taskId, agentId, sessionId, memoryId, similarity, retrievedAt)
VALUES (?, ?, ?, NULL, ?, 0.85, ?)`,
  );
  statement.run(retrievalId, taskId, agentA, memoryId, retrievedAt);
}
|
|
567
|
+
|
|
568
|
+
/**
 * Reads the `alpha` / `beta` columns of one `agent_memory` row.
 *
 * @throws Error when no row with the given id exists.
 */
function readPosterior(id: string): { alpha: number; beta: number } {
  type PosteriorRow = { alpha: number; beta: number };
  const statement = getDb().prepare<PosteriorRow, [string]>(
    "SELECT alpha, beta FROM agent_memory WHERE id = ?",
  );
  const found = statement.get(id);
  if (!found) {
    throw new Error(`memory ${id} not found`);
  }
  const { alpha, beta } = found;
  return { alpha, beta };
}
|
|
577
|
+
|
|
578
|
+
/**
 * Returns every `memory_rating` row recorded for `taskId`, limited to the
 * columns the assertions inspect.
 */
function getRatings(taskId: string) {
  type RatingRow = {
    memoryId: string;
    source: string;
    signal: number;
    weight: number;
    reasoning: string | null;
  };
  const statement = getDb().prepare<RatingRow, [string]>(
    "SELECT memoryId, source, signal, weight, reasoning FROM memory_rating WHERE taskId = ?",
  );
  return statement.all(taskId);
}
|
|
592
|
+
|
|
593
|
+
describe("HTTP integration: hook-piggyback dry-run", () => {
|
|
594
|
+
beforeAll(async () => {
  // Best-effort removal of a stale database file and its `-wal` / `-shm`
  // companions; a failed unlink (e.g. nothing to delete) is ignored.
  const staleDbFiles = ["", "-wal", "-shm"].map((ext) => TEST_DB_PATH + ext);
  for (const stalePath of staleDbFiles) {
    try {
      await unlink(stalePath);
    } catch {}
  }

  // Environment for the spawned API server: inherit the current environment,
  // then override port / database path / API key and set the flags below.
  const serverEnv = {
    ...process.env,
    PORT: String(TEST_PORT),
    DATABASE_PATH: TEST_DB_PATH,
    API_KEY,
    CAPABILITIES: "core",
    SLACK_BOT_TOKEN: "",
    LINEAR_DISABLE: "true",
    JIRA_DISABLE: "true",
    GITHUB_DISABLE: "true",
    SLACK_DISABLE: "true",
    HEARTBEAT_DISABLE: "true",
    OAUTH_KEEPALIVE_DISABLE: "true",
    ANONYMIZED_TELEMETRY: "false",
  };
  serverProc = Bun.spawn(["bun", "src/http.ts"], {
    cwd: `${import.meta.dir}/../..`,
    env: serverEnv,
    stdout: "ignore",
    stderr: "ignore",
  });

  // Block until the health endpoint answers before touching the database.
  await waitForServer(`${BASE}/health`);

  // Stash the current migration template and clear it for this suite;
  // afterAll puts the stashed value back.
  const previousTemplate = testTemplateGlobals.__testMigrationTemplate;
  testTemplateGlobals.__savedRaterLlmTemplate = previousTemplate;
  testTemplateGlobals.__testMigrationTemplate = undefined;

  // A prior test in the same Bun worker may have left a DB open. initDb is a
  // no-op when `db` is already set, so close first: otherwise this process
  // could keep writing to the earlier template-restored DB while the spawned
  // server reads from TEST_DB_PATH. Defensive even if the current CI ordering
  // happens to leave `db` null here.
  closeDb();
  initDb(TEST_DB_PATH);
  createAgent({ id: agentA, name: "Rater LLM Test", isLead: false, status: "idle" });

  // Seed two in-progress tasks owned by the test agent.
  const taskInsert = getDb().prepare(
    `INSERT INTO agent_tasks (id, agentId, task, status, source, createdAt, lastUpdatedAt)
       VALUES (?, ?, ?, 'in_progress', 'mcp', ?, ?)`,
  );
  const createdAt = new Date().toISOString();
  const seedTasks = [
    [taskA, "rater llm task A"],
    [taskB, "rater llm task B"],
  ] as const;
  for (const [taskId, description] of seedTasks) {
    taskInsert.run(taskId, agentA, description, createdAt, createdAt);
  }

  store = new SqliteMemoryStore();
}, 20000);
|
|
645
|
+
|
|
646
|
+
afterAll(async () => {
  // Release this process's DB handle, then put the migration-template global
  // back to the value stashed in beforeAll and clear the stash.
  closeDb();
  const stashedTemplate = testTemplateGlobals.__savedRaterLlmTemplate;
  testTemplateGlobals.__testMigrationTemplate = stashedTemplate;
  testTemplateGlobals.__savedRaterLlmTemplate = undefined;

  if (serverProc) {
    serverProc.kill();
    // Wait for the child to exit; a rejection here is deliberately ignored.
    await serverProc.exited.catch(() => {});
  }

  // Short pause before deleting the database files.
  await Bun.sleep(50);

  // Best-effort cleanup of the database file and its `-wal` / `-shm` companions.
  const dbFiles = ["", "-wal", "-shm"].map((ext) => TEST_DB_PATH + ext);
  for (const dbFile of dbFiles) {
    try {
      await unlink(dbFile);
    } catch {}
  }
});
|
|
663
|
+
|
|
664
|
+
beforeEach(() => {
  // Reset per-test state: clear recorded ratings and retrievals, and set every
  // memory's alpha/beta back to 1.0. Statements run in the listed order.
  const resetStatements = [
    "DELETE FROM memory_rating",
    "DELETE FROM memory_retrieval",
    "UPDATE agent_memory SET alpha = 1.0, beta = 1.0",
  ];
  for (const sql of resetStatements) {
    getDb().run(sql);
  }
});
|
|
669
|
+
|
|
670
|
+
test("fetchRetrievalsForTask returns rows for the requesting agent", async () => {
  // Record a single retrieval for taskA, then read it back over HTTP.
  const memory = makeMemory("retr-fetch-1");
  insertRetrieval(taskA, memory.id);

  const request = { apiUrl: BASE, apiKey: API_KEY, agentId: agentA, taskId: taskA };
  const fetched = await fetchRetrievalsForTask(request);

  expect(fetched).toHaveLength(1);
  const [onlyRow] = fetched;
  expect(onlyRow!.id).toBe(memory.id);
});
|
|
682
|
+
|
|
683
|
+
test("fetchRetrievalsForTask → [] on transport failure (best-effort)", async () => {
  // Point the client at a port where the connection is expected to be refused;
  // the helper must resolve to an empty list rather than throw.
  const unreachableApi = "http://localhost:1";
  const fetched = await fetchRetrievalsForTask({
    apiUrl: unreachableApi,
    apiKey: API_KEY,
    agentId: agentA,
    taskId: taskA,
  });

  expect(fetched).toEqual([]);
});
|
|
692
|
+
|
|
693
|
+
test("postRatings → applies events; alpha/beta posteriors move per mocked generateObject result", async () => {
  const useful = makeMemory("piggyback-useful");
  const misleading = makeMemory("piggyback-misleading");
  const neutral = makeMemory("piggyback-neutral");

  // The worker has retrieved all three memories for taskA.
  for (const memory of [useful, misleading, neutral]) {
    insertRetrieval(taskA, memory.id);
  }

  // Simulated hook flow, step 1: fetch the retrievals recorded for the task.
  const fetched = await fetchRetrievalsForTask({
    apiUrl: BASE,
    apiKey: API_KEY,
    agentId: agentA,
    taskId: taskA,
  });
  expect(fetched).toHaveLength(3);

  // Step 2: stand-in for the `generateObject` result. It is a plain object,
  // not a stringified envelope, because the AI SDK hands back an already
  // parsed object.
  const llmOutput = {
    summary: "Found a couple of helpful patterns; one memory was misleading.",
    ratings: [
      { id: useful.id, score: 1, reasoning: "directly answered the question" },
      { id: misleading.id, score: 0, reasoning: "this memory contradicted the docs" },
      { id: neutral.id, score: 0.5, reasoning: "tangential but interesting" },
    ],
  };

  // `generateObject` validates against the Zod schema before returning; do
  // the same here so schema drift makes this test fail fast.
  const validation = SummaryWithRatingsSchema.safeParse(llmOutput);
  expect(validation.success).toBe(true);
  if (!validation.success) return;

  // Step 3: convert LLM ratings into rating events.
  const ratingEvents = buildRatingsFromLlm(validation.data.ratings, fetched);
  expect(ratingEvents).toHaveLength(3);
  for (const event of ratingEvents) {
    expect(event.weight).toBe(0.8);
    expect(event.source).toBe("llm");
  }

  // Step 4: POST the events to the server.
  const postResult = await postRatings({
    apiUrl: BASE,
    apiKey: API_KEY,
    agentId: agentA,
    taskId: taskA,
    events: ratingEvents,
  });
  expect(postResult.ok).toBe(true);

  // Expected posterior shift: alphaDelta/betaDelta = max(0, ±signal) * 0.8.
  //   useful:     signal=+1 → alpha += 0.8
  //   misleading: signal=-1 → beta  += 0.8
  //   neutral:    signal=0  → unchanged
  expect(readPosterior(useful.id)).toEqual({ alpha: 1.8, beta: 1.0 });
  expect(readPosterior(misleading.id)).toEqual({ alpha: 1.0, beta: 1.8 });
  expect(readPosterior(neutral.id)).toEqual({ alpha: 1.0, beta: 1.0 });

  // Every stored rating row carries the LLM source, weight and a non-empty reasoning.
  const storedRatings = getRatings(taskA);
  expect(storedRatings).toHaveLength(3);
  for (const stored of storedRatings) {
    expect(stored.source).toBe("llm");
    expect(stored.weight).toBe(0.8);
    expect(stored.reasoning).not.toBeNull();
    expect((stored.reasoning ?? "").length).toBeGreaterThan(0);
  }
});
|
|
763
|
+
|
|
764
|
+
test("hallucinated memoryId is dropped before POST (defence-in-depth)", async () => {
  // Only one real memory was retrieved for taskB.
  const realMemory = makeMemory("piggyback-real");
  insertRetrieval(taskB, realMemory.id);

  const fetched = await fetchRetrievalsForTask({
    apiUrl: BASE,
    apiKey: API_KEY,
    agentId: agentA,
    taskId: taskB,
  });

  // The LLM output rates the real memory plus an id that was never retrieved.
  const llmRatings = [
    { id: realMemory.id, score: 1, reasoning: "real memory" },
    { id: "mem-FAKE-NOT-IN-DB", score: 1, reasoning: "hallucinated" },
  ];
  const ratingEvents = buildRatingsFromLlm(llmRatings, fetched);

  // Only the retrieved memory survives the conversion.
  expect(ratingEvents).toHaveLength(1);
  const [survivor] = ratingEvents;
  expect(survivor!.memoryId).toBe(realMemory.id);

  const postResult = await postRatings({
    apiUrl: BASE,
    apiKey: API_KEY,
    agentId: agentA,
    taskId: taskB,
    events: ratingEvents,
  });
  expect(postResult.ok).toBe(true);
  expect(getRatings(taskB)).toHaveLength(1);
});
|
|
793
|
+
|
|
794
|
+
test("negative path: simulated hook with MEMORY_RATERS unset → no /api/memory/rate call", async () => {
  const m = makeMemory("piggyback-negative");
  insertRetrieval(taskA, m.id);

  // Run with MEMORY_RATERS removed from the environment; the previous value
  // (if any) is restored in `finally` so other tests are unaffected.
  const prev = process.env.MEMORY_RATERS;
  delete process.env.MEMORY_RATERS;
  try {
    // Mirror the hook's gate: when isLlmRaterEnabled() is false, the hook
    // never calls fetchRetrievalsForTask / generateObject / postRatings —
    // it falls back to the existing summary-only path.
    //
    // Assert the gate itself. Checking `postCalled` inside an
    // `if (!isLlmRaterEnabled())` branch could never fail, and would be
    // skipped entirely if the gate regressed to `true`.
    expect(isLlmRaterEnabled()).toBe(false);

    // Fetch stub that records whether any HTTP call was attempted.
    let postCalled = false;
    const fakeFetch: typeof fetch = async () => {
      postCalled = true;
      return new Response("{}", { status: 200 });
    };

    // Sanity: even if postRatings WERE called, an empty event list must
    // succeed without issuing a request.
    const r = await postRatings({
      apiUrl: BASE,
      apiKey: API_KEY,
      agentId: agentA,
      taskId: taskA,
      events: [],
      fetchImpl: fakeFetch,
    });
    expect(r.ok).toBe(true);
    expect(postCalled).toBe(false); // events=[] short-circuits before fetch
  } finally {
    if (prev !== undefined) process.env.MEMORY_RATERS = prev;
  }

  // No memory_rating rows for taskA in this test, and the posterior is untouched.
  expect(getRatings(taskA)).toHaveLength(0);
  expect(readPosterior(m.id)).toEqual({ alpha: 1.0, beta: 1.0 });
});
|
|
832
|
+
|
|
833
|
+
test("postRatings logs but does not throw on 4xx (best-effort)", async () => {
  const memory = makeMemory("piggyback-4xx");
  insertRetrieval(taskA, memory.id);

  // The event deliberately claims a non-"llm" source ("implicit-citation").
  // The assertions below expect the server to answer with a 4xx for it and
  // postRatings to report that as a result instead of throwing.
  const spoofedEvent: RatingEvent = {
    memoryId: memory.id,
    signal: 1,
    weight: 0.8,
    source: "implicit-citation",
    reasoning: "spoof attempt",
  };

  const outcome = await postRatings({
    apiUrl: BASE,
    apiKey: API_KEY,
    agentId: agentA,
    taskId: taskA,
    events: [spoofedEvent],
  });

  expect(outcome.ok).toBe(false);
  expect(outcome.status).toBeGreaterThanOrEqual(400);
  // A rejected request must leave the posterior at its reset value.
  expect(readPosterior(memory.id)).toEqual({ alpha: 1, beta: 1 });
});
|
|
857
|
+
|
|
858
|
+
test("OPENROUTER_API_KEY unset → hook is a no-op (no fetch, no index, no rate POST)", async () => {
  const memory = makeMemory("piggyback-openrouter-unset");
  insertRetrieval(taskA, memory.id);

  // Reproduce the hook's outer gate: with OPENROUTER_API_KEY absent the whole
  // summary + rating block returns early. No /api/memory/index call, no
  // /api/memory/rate call and no LLM invocation.
  const savedKey = process.env.OPENROUTER_API_KEY;
  delete process.env.OPENROUTER_API_KEY;
  try {
    // Fetch stub that flips a flag if anything tries to go over the wire.
    let anyFetchCalled = false;
    const recordingFetch: typeof fetch = async () => {
      anyFetchCalled = true;
      return new Response("{}", { status: 200 });
    };

    const keyMissing = !process.env.OPENROUTER_API_KEY;
    expect(keyMissing).toBe(true);

    // With the key missing the guarded block is never entered, so neither
    // fetchRetrievalsForTask nor postRatings runs on this path. The branch
    // below is unreachable here and kept only as a defensive mirror of the hook.
    if (keyMissing === false) {
      await fetchRetrievalsForTask({
        apiUrl: BASE,
        apiKey: API_KEY,
        agentId: agentA,
        taskId: taskA,
        fetchImpl: recordingFetch,
      });
    }
    expect(anyFetchCalled).toBe(false);
  } finally {
    // Put the key back only if there was one to begin with.
    if (savedKey !== undefined) process.env.OPENROUTER_API_KEY = savedKey;
  }

  // Nothing was rated for taskA and the posterior is still at its reset value.
  expect(getRatings(taskA)).toHaveLength(0);
  expect(readPosterior(memory.id)).toEqual({ alpha: 1, beta: 1 });
});
|
|
899
|
+
|
|
900
|
+
test("happy path: mocked generateObject result → postRatings called with expected events", async () => {
  const useful = makeMemory("happy-useful");
  const misleading = makeMemory("happy-misleading");

  insertRetrieval(taskB, useful.id);
  insertRetrieval(taskB, misleading.id);

  const retrievals = await fetchRetrievalsForTask({
    apiUrl: BASE,
    apiKey: API_KEY,
    agentId: agentA,
    taskId: taskB,
  });
  expect(retrievals).toHaveLength(2);

  // Stand in for `const { object } = await generateObject(...)` — the AI
  // SDK guarantees `object` is already validated against the Zod schema.
  const generateObjectResult: {
    object: { summary: string; ratings: Array<{ id: string; score: number; reasoning: string }> };
  } = {
    object: {
      summary: "Two patterns surfaced; one was misleading.",
      ratings: [
        { id: useful.id, score: 1, reasoning: "directly answered" },
        { id: misleading.id, score: 0, reasoning: "contradicted the docs" },
      ],
    },
  };

  // Schema gate is implicit in the SDK, but assert here so a future schema
  // change doesn't silently make this test pass on garbage data.
  const validated = SummaryWithRatingsSchema.parse(generateObjectResult.object);

  const events = buildRatingsFromLlm(validated.ratings, retrievals);
  expect(events).toHaveLength(2);

  // Look the events up by memory id. A missing event fails on `toBeDefined`
  // instead of being hidden behind a non-null assertion.
  const usefulEvent = events.find((e) => e.memoryId === useful.id);
  const misleadingEvent = events.find((e) => e.memoryId === misleading.id);
  expect(usefulEvent).toBeDefined();
  expect(misleadingEvent).toBeDefined();
  expect(usefulEvent?.signal).toBeCloseTo(1, 6);
  expect(misleadingEvent?.signal).toBeCloseTo(-1, 6);
  expect(usefulEvent?.source).toBe("llm");
  expect(misleadingEvent?.source).toBe("llm");

  // Track that postRatings actually attempts the POST with our events.
  // The capture lives in an object property: a `let` initialised to `null`
  // and written only inside the closure stays narrowed to `null` in this
  // scope, which forces `!` assertions to read it back.
  const captured: { events: RatingEvent[] | null } = { events: null };
  const trackingFetch: typeof fetch = async (url, init) => {
    // fetch accepts a string, a URL or a Request; normalise to a string so
    // the endpoint check does not depend on which form the caller used.
    const href = typeof url === "string" ? url : url instanceof URL ? url.href : url.url;
    if (href.endsWith("/api/memory/rate")) {
      const body = JSON.parse(String(init?.body ?? "{}"));
      captured.events = body.events;
    }
    return new Response("{}", { status: 200 });
  };
  const r = await postRatings({
    apiUrl: BASE,
    apiKey: API_KEY,
    agentId: agentA,
    taskId: taskB,
    events,
    fetchImpl: trackingFetch,
  });
  expect(r.ok).toBe(true);

  // The stubbed endpoint must have received exactly our two events.
  expect(captured.events).not.toBeNull();
  const postedEvents = captured.events ?? [];
  expect(postedEvents).toHaveLength(2);
  expect(postedEvents.map((e) => e.memoryId).sort()).toEqual([useful.id, misleading.id].sort());
});
|
|
964
|
+
});
|