@desplega.ai/agent-swarm 1.75.0 → 1.76.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/openapi.json +973 -36
- package/package.json +2 -2
- package/src/be/db.ts +527 -9
- package/src/be/memory/raters/llm-summarizer.ts +218 -0
- package/src/be/memory/raters/llm.ts +56 -75
- package/src/be/memory/retrieval-store.ts +21 -0
- package/src/be/migrations/054_agent_harness_provider.sql +21 -0
- package/src/be/migrations/055_agent_cred_status.sql +15 -0
- package/src/be/migrations/056_drop_agent_tasks_source_check.sql +139 -0
- package/src/be/migrations/057_inbox_item_state.sql +27 -0
- package/src/be/migrations/058_task_templates.sql +31 -0
- package/src/be/swarm-config-guard.ts +24 -0
- package/src/commands/credential-wait.ts +1 -1
- package/src/commands/provider-credentials.ts +434 -0
- package/src/commands/runner.ts +229 -42
- package/src/hooks/hook.ts +115 -95
- package/src/http/agents.ts +82 -2
- package/src/http/config.ts +11 -1
- package/src/http/inbox-state.ts +89 -0
- package/src/http/index.ts +10 -0
- package/src/http/sessions.ts +86 -0
- package/src/http/status.ts +665 -0
- package/src/http/task-templates.ts +51 -0
- package/src/http/tasks.ts +85 -5
- package/src/http/users.ts +134 -0
- package/src/providers/claude-adapter.ts +5 -0
- package/src/providers/codex-adapter.ts +1 -1
- package/src/providers/index.ts +1 -1
- package/src/slack/handlers.ts +0 -1
- package/src/tests/agents-harness-provider.test.ts +333 -0
- package/src/tests/credential-check.test.ts +32 -1
- package/src/tests/credential-status-api.test.ts +42 -0
- package/src/tests/harness-provider-resolution.test.ts +242 -0
- package/src/tests/jira-sync.test.ts +1 -1
- package/src/tests/memory-rater-llm-summarizer.test.ts +317 -0
- package/src/tests/memory-rater-llm.test.ts +265 -107
- package/src/tests/migration-runner-regressions.test.ts +17 -2
- package/src/tests/sessions.test.ts +141 -0
- package/src/tests/status.test.ts +843 -0
- package/src/tests/stop-hook-task-resolution.test.ts +98 -0
- package/src/tests/template-recommendations.test.ts +148 -0
- package/src/tests/use-dismissible-card.test.ts +140 -0
- package/src/tools/swarm-config/set-config.ts +17 -1
- package/src/types.ts +117 -0
- package/src/utils/harness-provider.ts +32 -0
- package/tsconfig.json +0 -2
- package/src/providers/credentials.ts +0 -74
|
@@ -8,9 +8,10 @@
|
|
|
8
8
|
* mapping, prompt construction.
|
|
9
9
|
* 2. `LlmRater.rate(ctx)` per-memory path with `MockLlmRaterClient`.
|
|
10
10
|
* 3. HTTP integration: spawn the API server against an isolated SQLite
|
|
11
|
-
* file, simulate the hook's piggyback flow (
|
|
12
|
-
*
|
|
13
|
-
* `agent_memory.alpha/beta` move + `memory_rating` rows are
|
|
11
|
+
* file, simulate the hook's piggyback flow (`generateObject` is mocked
|
|
12
|
+
* by feeding the parsed object directly into `buildRatingsFromLlm`),
|
|
13
|
+
* and assert `agent_memory.alpha/beta` move + `memory_rating` rows are
|
|
14
|
+
* written.
|
|
14
15
|
* 4. Negative path: `MEMORY_RATERS` unset → no `/api/memory/rate` call.
|
|
15
16
|
*/
|
|
16
17
|
import { afterAll, beforeAll, beforeEach, describe, expect, test } from "bun:test";
|
|
@@ -22,13 +23,13 @@ import { SqliteMemoryStore } from "../be/memory/providers/sqlite-store";
|
|
|
22
23
|
import {
|
|
23
24
|
buildRatingsFromLlm,
|
|
24
25
|
buildSummaryWithRatingsPrompt,
|
|
25
|
-
|
|
26
|
+
dedupeRetrievalsForRater,
|
|
26
27
|
fetchRetrievalsForTask,
|
|
27
28
|
isLlmRaterEnabled,
|
|
28
29
|
LLM_RATER_WEIGHT,
|
|
29
30
|
LlmRater,
|
|
30
|
-
parseSummaryWithRatings,
|
|
31
31
|
postRatings,
|
|
32
|
+
type RetrievalRow,
|
|
32
33
|
SummaryWithRatingsSchema,
|
|
33
34
|
} from "../be/memory/raters/llm";
|
|
34
35
|
import { getRegisteredRaters, SERVER_RATERS } from "../be/memory/raters/registry";
|
|
@@ -210,108 +211,153 @@ describe("buildSummaryWithRatingsPrompt", () => {
|
|
|
210
211
|
});
|
|
211
212
|
});
|
|
212
213
|
|
|
213
|
-
describe("
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
214
|
+
describe("dedupeRetrievalsForRater", () => {
|
|
215
|
+
// Regression: the LLM rater audit (post-PR #450) found scheduled-task self-
|
|
216
|
+
// similarity inflated alpha posteriors 5x in one rater pass — the Claude
|
|
217
|
+
// Code Changelog Monitor cron surfaced 5 memories from prior hourly runs
|
|
218
|
+
// and got each rated +1.0. Dedup keys on `scheduleId` so only memories
|
|
219
|
+
// from the same scheduled job collapse; distinct one-shot tasks pass
|
|
220
|
+
// through even when their truncated 80-char names collide.
|
|
221
|
+
|
|
222
|
+
test("happy path: 5 cron memories sharing scheduleId + 1 distinct → 2 rows", () => {
|
|
223
|
+
const cronName = "Task: Claude Code Changelog Monitor — check for new entries";
|
|
224
|
+
const cronScheduleId = "sched-claude-code-changelog";
|
|
225
|
+
const rows: RetrievalRow[] = [
|
|
226
|
+
// Newest cron run first (API returns DESC by retrievedAt).
|
|
227
|
+
{
|
|
228
|
+
id: "cron-5",
|
|
229
|
+
name: cronName,
|
|
230
|
+
content: "run 5",
|
|
231
|
+
scheduleId: cronScheduleId,
|
|
232
|
+
retrievedAt: "2026-05-08T05:00:00Z",
|
|
233
|
+
},
|
|
234
|
+
{
|
|
235
|
+
id: "cron-4",
|
|
236
|
+
name: cronName,
|
|
237
|
+
content: "run 4",
|
|
238
|
+
scheduleId: cronScheduleId,
|
|
239
|
+
retrievedAt: "2026-05-08T04:00:00Z",
|
|
240
|
+
},
|
|
241
|
+
{
|
|
242
|
+
id: "cron-3",
|
|
243
|
+
name: cronName,
|
|
244
|
+
content: "run 3",
|
|
245
|
+
scheduleId: cronScheduleId,
|
|
246
|
+
retrievedAt: "2026-05-08T03:00:00Z",
|
|
247
|
+
},
|
|
248
|
+
{
|
|
249
|
+
id: "cron-2",
|
|
250
|
+
name: cronName,
|
|
251
|
+
content: "run 2",
|
|
252
|
+
scheduleId: cronScheduleId,
|
|
253
|
+
retrievedAt: "2026-05-08T02:00:00Z",
|
|
254
|
+
},
|
|
255
|
+
{
|
|
256
|
+
id: "cron-1",
|
|
257
|
+
name: cronName,
|
|
258
|
+
content: "run 1",
|
|
259
|
+
scheduleId: cronScheduleId,
|
|
260
|
+
retrievedAt: "2026-05-08T01:00:00Z",
|
|
261
|
+
},
|
|
262
|
+
// Different one-shot task — null scheduleId, must pass through.
|
|
263
|
+
{
|
|
264
|
+
id: "distinct",
|
|
265
|
+
name: "Task: Refactor MCP tool list",
|
|
266
|
+
content: "x",
|
|
267
|
+
scheduleId: null,
|
|
268
|
+
retrievedAt: "2026-05-07T12:00:00Z",
|
|
269
|
+
},
|
|
270
|
+
];
|
|
225
271
|
|
|
226
|
-
|
|
227
|
-
const envelope = JSON.stringify({
|
|
228
|
-
result: { summary: "S", ratings: [{ id: "m", score: 1, reasoning: "yes" }] },
|
|
229
|
-
});
|
|
230
|
-
const out = parseSummaryWithRatings(envelope);
|
|
231
|
-
expect(out).not.toBeNull();
|
|
232
|
-
if (!out) return;
|
|
233
|
-
expect(out.ratings[0]!.score).toBe(1);
|
|
234
|
-
});
|
|
272
|
+
const out = dedupeRetrievalsForRater(rows);
|
|
235
273
|
|
|
236
|
-
|
|
237
|
-
|
|
274
|
+
expect(out).toHaveLength(2);
|
|
275
|
+
// First-seen wins → freshest cron run is the representative.
|
|
276
|
+
expect(out.map((r) => r.id)).toEqual(["cron-5", "distinct"]);
|
|
238
277
|
});
|
|
239
278
|
|
|
240
|
-
test("
|
|
241
|
-
|
|
242
|
-
|
|
279
|
+
test("two distinct one-shot tasks sharing the truncated 80-char name prefix → both kept", () => {
|
|
280
|
+
// Reviewer's flagged false-positive: `Task: ${task.task.slice(0, 80)}`
|
|
281
|
+
// collapses two distinct tasks whose first 80 chars happen to match. With
|
|
282
|
+
// scheduleId-keyed dedup, both have `null` scheduleId and pass through.
|
|
283
|
+
const sharedPrefix = `Task: ${"x".repeat(80)}`;
|
|
284
|
+
const rows: RetrievalRow[] = [
|
|
285
|
+
{
|
|
286
|
+
id: "task-a",
|
|
287
|
+
name: sharedPrefix,
|
|
288
|
+
content: `Task: ${"x".repeat(80)} unique-suffix-A\n\nOutput:\n…`,
|
|
289
|
+
scheduleId: null,
|
|
290
|
+
retrievedAt: "2026-05-08T05:00:00Z",
|
|
291
|
+
},
|
|
292
|
+
{
|
|
293
|
+
id: "task-b",
|
|
294
|
+
name: sharedPrefix,
|
|
295
|
+
content: `Task: ${"x".repeat(80)} unique-suffix-B\n\nOutput:\n…`,
|
|
296
|
+
scheduleId: null,
|
|
297
|
+
retrievedAt: "2026-05-08T04:00:00Z",
|
|
298
|
+
},
|
|
299
|
+
];
|
|
300
|
+
|
|
301
|
+
const out = dedupeRetrievalsForRater(rows);
|
|
302
|
+
|
|
303
|
+
expect(out).toHaveLength(2);
|
|
304
|
+
expect(out.map((r) => r.id)).toEqual(["task-a", "task-b"]);
|
|
243
305
|
});
|
|
244
306
|
|
|
245
|
-
test("
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
307
|
+
test("Task: vs Session: with the same prefix → both kept (different memory types)", () => {
|
|
308
|
+
// Both names share their first 80 chars after the type prefix; both have
|
|
309
|
+
// null scheduleId (one-shot work). Must pass through.
|
|
310
|
+
const sharedSuffix = "Refactor MCP tool list to use deferred discovery";
|
|
311
|
+
const rows: RetrievalRow[] = [
|
|
312
|
+
{
|
|
313
|
+
id: "task",
|
|
314
|
+
name: `Task: ${sharedSuffix}`,
|
|
315
|
+
content: "task body",
|
|
316
|
+
source: "task_completion",
|
|
317
|
+
scheduleId: null,
|
|
318
|
+
retrievedAt: "2026-05-08T05:00:00Z",
|
|
319
|
+
},
|
|
320
|
+
{
|
|
321
|
+
id: "session",
|
|
322
|
+
name: `Session: ${sharedSuffix}`,
|
|
323
|
+
content: "session summary",
|
|
324
|
+
source: "session_summary",
|
|
325
|
+
scheduleId: null,
|
|
326
|
+
retrievedAt: "2026-05-08T04:00:00Z",
|
|
327
|
+
},
|
|
328
|
+
];
|
|
329
|
+
|
|
330
|
+
const out = dedupeRetrievalsForRater(rows);
|
|
331
|
+
|
|
332
|
+
expect(out).toHaveLength(2);
|
|
333
|
+
expect(out.map((r) => r.id)).toEqual(["task", "session"]);
|
|
252
334
|
});
|
|
253
|
-
});
|
|
254
335
|
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
const envelope = JSON.stringify({ result: inner });
|
|
268
|
-
expect(parseSummaryWithRatings(envelope)).toBeNull();
|
|
269
|
-
const out = extractSummaryFromClaudeStdout(envelope);
|
|
270
|
-
expect(out).toBe(summaryText);
|
|
271
|
-
// Hard guarantee for the indexer: must NOT be raw JSON.
|
|
272
|
-
expect(out.startsWith("{")).toBe(false);
|
|
273
|
-
expect(out.includes('"ratings"')).toBe(false);
|
|
274
|
-
});
|
|
275
|
-
|
|
276
|
-
test("structured envelope missing the `ratings` field entirely → extracts summary", () => {
|
|
277
|
-
const summaryText = "No retrievals this session.";
|
|
278
|
-
const inner = JSON.stringify({ summary: summaryText });
|
|
279
|
-
const envelope = JSON.stringify({ result: inner });
|
|
280
|
-
const out = extractSummaryFromClaudeStdout(envelope);
|
|
281
|
-
expect(out).toBe(summaryText);
|
|
282
|
-
});
|
|
283
|
-
|
|
284
|
-
test("structured envelope with non-string summary field → falls through to inner string", () => {
|
|
285
|
-
// Defensive: if `summary` itself is malformed, we still don't crash; the
|
|
286
|
-
// best-effort fallback is to return the inner JSON as a string. The
|
|
287
|
-
// length/keyword heuristics in the hook will likely skip indexing.
|
|
288
|
-
const inner = JSON.stringify({ summary: 42, ratings: [] });
|
|
289
|
-
const envelope = JSON.stringify({ result: inner });
|
|
290
|
-
const out = extractSummaryFromClaudeStdout(envelope);
|
|
291
|
-
expect(out).toBe(inner);
|
|
292
|
-
});
|
|
293
|
-
|
|
294
|
-
test("unstructured envelope with plain text result → returns the text unchanged", () => {
|
|
295
|
-
const text = "- Discovered that the API requires Bearer prefix.\n- No other learnings.";
|
|
296
|
-
const envelope = JSON.stringify({ result: text });
|
|
297
|
-
expect(extractSummaryFromClaudeStdout(envelope)).toBe(text);
|
|
298
|
-
});
|
|
299
|
-
|
|
300
|
-
test("envelope.result is an object with a string summary field → extracts it", () => {
|
|
301
|
-
const envelope = JSON.stringify({
|
|
302
|
-
result: { summary: "object form", ratings: [] },
|
|
303
|
-
});
|
|
304
|
-
expect(extractSummaryFromClaudeStdout(envelope)).toBe("object form");
|
|
336
|
+
test("two different scheduled jobs surface in the same set → both representatives kept", () => {
|
|
337
|
+
const rows: RetrievalRow[] = [
|
|
338
|
+
{ id: "j1-r2", name: "Task: Job One", content: "", scheduleId: "sched-1" },
|
|
339
|
+
{ id: "j1-r1", name: "Task: Job One", content: "", scheduleId: "sched-1" },
|
|
340
|
+
{ id: "j2-r2", name: "Task: Job Two", content: "", scheduleId: "sched-2" },
|
|
341
|
+
{ id: "j2-r1", name: "Task: Job Two", content: "", scheduleId: "sched-2" },
|
|
342
|
+
];
|
|
343
|
+
|
|
344
|
+
const out = dedupeRetrievalsForRater(rows);
|
|
345
|
+
|
|
346
|
+
expect(out).toHaveLength(2);
|
|
347
|
+
expect(out.map((r) => r.id)).toEqual(["j1-r2", "j2-r2"]);
|
|
305
348
|
});
|
|
306
349
|
|
|
307
|
-
test("
|
|
308
|
-
const
|
|
309
|
-
|
|
350
|
+
test("rows without scheduleId pass through unchanged (manual / file_index memories)", () => {
|
|
351
|
+
const rows: RetrievalRow[] = [
|
|
352
|
+
{ id: "m1", name: "Manual note", content: "", source: "manual" },
|
|
353
|
+
{ id: "m2", name: "Manual note", content: "", source: "manual" },
|
|
354
|
+
{ id: "m3", name: "Indexed file", content: "", source: "file_index", scheduleId: null },
|
|
355
|
+
];
|
|
356
|
+
expect(dedupeRetrievalsForRater(rows)).toEqual(rows);
|
|
310
357
|
});
|
|
311
358
|
|
|
312
|
-
test("
|
|
313
|
-
|
|
314
|
-
expect(extractSummaryFromClaudeStdout(stdout)).toBe(stdout);
|
|
359
|
+
test("empty input → empty output", () => {
|
|
360
|
+
expect(dedupeRetrievalsForRater([])).toEqual([]);
|
|
315
361
|
});
|
|
316
362
|
});
|
|
317
363
|
|
|
@@ -644,7 +690,7 @@ describe("HTTP integration: hook-piggyback dry-run", () => {
|
|
|
644
690
|
expect(rows).toEqual([]);
|
|
645
691
|
});
|
|
646
692
|
|
|
647
|
-
test("postRatings → applies events; alpha/beta posteriors move per mocked
|
|
693
|
+
test("postRatings → applies events; alpha/beta posteriors move per mocked generateObject result", async () => {
|
|
648
694
|
const useful = makeMemory("piggyback-useful");
|
|
649
695
|
const misleading = makeMemory("piggyback-misleading");
|
|
650
696
|
const neutral = makeMemory("piggyback-neutral");
|
|
@@ -654,7 +700,10 @@ describe("HTTP integration: hook-piggyback dry-run", () => {
|
|
|
654
700
|
insertRetrieval(taskA, misleading.id);
|
|
655
701
|
insertRetrieval(taskA, neutral.id);
|
|
656
702
|
|
|
657
|
-
// Simulate hook flow: fetch retrievals,
|
|
703
|
+
// Simulate hook flow: fetch retrievals, run schema validation against a
|
|
704
|
+
// mocked `generateObject` result (object — not stringified envelope —
|
|
705
|
+
// because the AI SDK returns a parsed/validated object directly), then
|
|
706
|
+
// POST.
|
|
658
707
|
const retrievals = await fetchRetrievalsForTask({
|
|
659
708
|
apiUrl: BASE,
|
|
660
709
|
apiKey: API_KEY,
|
|
@@ -663,20 +712,22 @@ describe("HTTP integration: hook-piggyback dry-run", () => {
|
|
|
663
712
|
});
|
|
664
713
|
expect(retrievals).toHaveLength(3);
|
|
665
714
|
|
|
666
|
-
|
|
667
|
-
const mockedSummaryJson = JSON.stringify({
|
|
715
|
+
const mockedGenerateObjectResult = {
|
|
668
716
|
summary: "Found a couple of helpful patterns; one memory was misleading.",
|
|
669
717
|
ratings: [
|
|
670
718
|
{ id: useful.id, score: 1, reasoning: "directly answered the question" },
|
|
671
719
|
{ id: misleading.id, score: 0, reasoning: "this memory contradicted the docs" },
|
|
672
720
|
{ id: neutral.id, score: 0.5, reasoning: "tangential but interesting" },
|
|
673
721
|
],
|
|
674
|
-
}
|
|
675
|
-
|
|
676
|
-
|
|
677
|
-
|
|
678
|
-
|
|
679
|
-
|
|
722
|
+
};
|
|
723
|
+
// The AI SDK's `generateObject` validates against the Zod schema before
|
|
724
|
+
// returning; mirror that contract here so the test fails fast if the
|
|
725
|
+
// schema drifts.
|
|
726
|
+
const parsed = SummaryWithRatingsSchema.safeParse(mockedGenerateObjectResult);
|
|
727
|
+
expect(parsed.success).toBe(true);
|
|
728
|
+
if (!parsed.success) return;
|
|
729
|
+
|
|
730
|
+
const events = buildRatingsFromLlm(parsed.data.ratings, retrievals);
|
|
680
731
|
expect(events).toHaveLength(3);
|
|
681
732
|
for (const e of events) {
|
|
682
733
|
expect(e.weight).toBe(0.8);
|
|
@@ -748,8 +799,8 @@ describe("HTTP integration: hook-piggyback dry-run", () => {
|
|
|
748
799
|
delete process.env.MEMORY_RATERS;
|
|
749
800
|
try {
|
|
750
801
|
// Mirror the hook's gate: when isLlmRaterEnabled() is false, the hook
|
|
751
|
-
// never calls fetchRetrievalsForTask /
|
|
752
|
-
//
|
|
802
|
+
// never calls fetchRetrievalsForTask / generateObject / postRatings —
|
|
803
|
+
// it falls back to the existing summary-only path.
|
|
753
804
|
let postCalled = false;
|
|
754
805
|
const fakeFetch: typeof fetch = async () => {
|
|
755
806
|
postCalled = true;
|
|
@@ -803,4 +854,111 @@ describe("HTTP integration: hook-piggyback dry-run", () => {
|
|
|
803
854
|
// Posterior unchanged — 400 means nothing was applied.
|
|
804
855
|
expect(readPosterior(m.id)).toEqual({ alpha: 1.0, beta: 1.0 });
|
|
805
856
|
});
|
|
857
|
+
|
|
858
|
+
test("OPENROUTER_API_KEY unset → hook is a no-op (no fetch, no index, no rate POST)", async () => {
|
|
859
|
+
const m = makeMemory("piggyback-openrouter-unset");
|
|
860
|
+
insertRetrieval(taskA, m.id);
|
|
861
|
+
|
|
862
|
+
// Mirror the hook's outer gate exactly: when OPENROUTER_API_KEY is unset,
|
|
863
|
+
// the entire summary + rating block must early-return. No call to
|
|
864
|
+
// /api/memory/index, no call to /api/memory/rate, no LLM invocation.
|
|
865
|
+
const prev = process.env.OPENROUTER_API_KEY;
|
|
866
|
+
delete process.env.OPENROUTER_API_KEY;
|
|
867
|
+
try {
|
|
868
|
+
let anyFetchCalled = false;
|
|
869
|
+
const fakeFetch: typeof fetch = async () => {
|
|
870
|
+
anyFetchCalled = true;
|
|
871
|
+
return new Response("{}", { status: 200 });
|
|
872
|
+
};
|
|
873
|
+
|
|
874
|
+
const skip = !process.env.OPENROUTER_API_KEY;
|
|
875
|
+
expect(skip).toBe(true);
|
|
876
|
+
|
|
877
|
+
// The hook block is entirely guarded — no fetch, no postRatings.
|
|
878
|
+
// We never reach fetchRetrievalsForTask or postRatings, so neither is
|
|
879
|
+
// exercised in this branch.
|
|
880
|
+
if (!skip) {
|
|
881
|
+
// Unreachable in this test — defensive assertion only.
|
|
882
|
+
await fetchRetrievalsForTask({
|
|
883
|
+
apiUrl: BASE,
|
|
884
|
+
apiKey: API_KEY,
|
|
885
|
+
agentId: agentA,
|
|
886
|
+
taskId: taskA,
|
|
887
|
+
fetchImpl: fakeFetch,
|
|
888
|
+
});
|
|
889
|
+
}
|
|
890
|
+
expect(anyFetchCalled).toBe(false);
|
|
891
|
+
} finally {
|
|
892
|
+
if (prev !== undefined) process.env.OPENROUTER_API_KEY = prev;
|
|
893
|
+
}
|
|
894
|
+
|
|
895
|
+
// No memory_rating rows for taskA, posterior unchanged.
|
|
896
|
+
expect(getRatings(taskA)).toHaveLength(0);
|
|
897
|
+
expect(readPosterior(m.id)).toEqual({ alpha: 1.0, beta: 1.0 });
|
|
898
|
+
});
|
|
899
|
+
|
|
900
|
+
test("happy path: mocked generateObject result → postRatings called with expected events", async () => {
|
|
901
|
+
const useful = makeMemory("happy-useful");
|
|
902
|
+
const misleading = makeMemory("happy-misleading");
|
|
903
|
+
|
|
904
|
+
insertRetrieval(taskB, useful.id);
|
|
905
|
+
insertRetrieval(taskB, misleading.id);
|
|
906
|
+
|
|
907
|
+
const retrievals = await fetchRetrievalsForTask({
|
|
908
|
+
apiUrl: BASE,
|
|
909
|
+
apiKey: API_KEY,
|
|
910
|
+
agentId: agentA,
|
|
911
|
+
taskId: taskB,
|
|
912
|
+
});
|
|
913
|
+
expect(retrievals).toHaveLength(2);
|
|
914
|
+
|
|
915
|
+
// Stand in for `const { object } = await generateObject(...)` — the AI
|
|
916
|
+
// SDK guarantees `object` is already validated against the Zod schema.
|
|
917
|
+
const generateObjectResult: {
|
|
918
|
+
object: { summary: string; ratings: Array<{ id: string; score: number; reasoning: string }> };
|
|
919
|
+
} = {
|
|
920
|
+
object: {
|
|
921
|
+
summary: "Two patterns surfaced; one was misleading.",
|
|
922
|
+
ratings: [
|
|
923
|
+
{ id: useful.id, score: 1, reasoning: "directly answered" },
|
|
924
|
+
{ id: misleading.id, score: 0, reasoning: "contradicted the docs" },
|
|
925
|
+
],
|
|
926
|
+
},
|
|
927
|
+
};
|
|
928
|
+
|
|
929
|
+
// Schema gate is implicit in the SDK, but assert here so a future schema
|
|
930
|
+
// change doesn't silently make this test pass on garbage data.
|
|
931
|
+
const validated = SummaryWithRatingsSchema.parse(generateObjectResult.object);
|
|
932
|
+
|
|
933
|
+
const events = buildRatingsFromLlm(validated.ratings, retrievals);
|
|
934
|
+
expect(events).toHaveLength(2);
|
|
935
|
+
const usefulEvent = events.find((e) => e.memoryId === useful.id)!;
|
|
936
|
+
const misleadingEvent = events.find((e) => e.memoryId === misleading.id)!;
|
|
937
|
+
expect(usefulEvent.signal).toBeCloseTo(1, 6);
|
|
938
|
+
expect(misleadingEvent.signal).toBeCloseTo(-1, 6);
|
|
939
|
+
expect(usefulEvent.source).toBe("llm");
|
|
940
|
+
expect(misleadingEvent.source).toBe("llm");
|
|
941
|
+
|
|
942
|
+
// Track that postRatings actually attempts the POST with our events.
|
|
943
|
+
let postedEvents: RatingEvent[] | null = null;
|
|
944
|
+
const trackingFetch: typeof fetch = async (url, init) => {
|
|
945
|
+
if (typeof url === "string" && url.endsWith("/api/memory/rate")) {
|
|
946
|
+
const body = JSON.parse(String(init?.body ?? "{}"));
|
|
947
|
+
postedEvents = body.events;
|
|
948
|
+
}
|
|
949
|
+
return new Response("{}", { status: 200 });
|
|
950
|
+
};
|
|
951
|
+
const r = await postRatings({
|
|
952
|
+
apiUrl: BASE,
|
|
953
|
+
apiKey: API_KEY,
|
|
954
|
+
agentId: agentA,
|
|
955
|
+
taskId: taskB,
|
|
956
|
+
events,
|
|
957
|
+
fetchImpl: trackingFetch,
|
|
958
|
+
});
|
|
959
|
+
expect(r.ok).toBe(true);
|
|
960
|
+
expect(postedEvents).not.toBeNull();
|
|
961
|
+
expect(postedEvents!).toHaveLength(2);
|
|
962
|
+
expect(postedEvents!.map((e) => e.memoryId).sort()).toEqual([useful.id, misleading.id].sort());
|
|
963
|
+
});
|
|
806
964
|
});
|
|
@@ -71,7 +71,12 @@ describe("migration regressions", () => {
|
|
|
71
71
|
expect(columns).toContain("setupScript");
|
|
72
72
|
});
|
|
73
73
|
|
|
74
|
-
test("fresh DB
|
|
74
|
+
test("fresh DB drops source CHECK constraint on agent_tasks (Zod is the gate)", () => {
|
|
75
|
+
// Migration 056 removes the SQL CHECK on agent_tasks.source — the Zod
|
|
76
|
+
// `AgentTaskSourceSchema` in src/types.ts is now the single source of
|
|
77
|
+
// truth for the allowed enum, and is enforced at the HTTP/MCP ingress.
|
|
78
|
+
// Direct SQL inserts no longer fail on unknown sources by design;
|
|
79
|
+
// adding a new source no longer requires a forward-only migration.
|
|
75
80
|
const database = initDb(FRESH_DB_PATH);
|
|
76
81
|
const now = new Date().toISOString();
|
|
77
82
|
|
|
@@ -81,6 +86,16 @@ describe("migration regressions", () => {
|
|
|
81
86
|
VALUES (?, ?, ?, ?, ?, ?)`,
|
|
82
87
|
[crypto.randomUUID(), "invalid source", "pending", "not-valid", now, now],
|
|
83
88
|
);
|
|
84
|
-
}).toThrow();
|
|
89
|
+
}).not.toThrow();
|
|
90
|
+
|
|
91
|
+
// The requestedByUserId FK survives the table-rebuild in migration 056.
|
|
92
|
+
const fkList = database
|
|
93
|
+
.prepare<{ table: string; from: string; to: string }, []>(
|
|
94
|
+
'SELECT "table" as "table", "from", "to" FROM pragma_foreign_key_list(\'agent_tasks\')',
|
|
95
|
+
)
|
|
96
|
+
.all();
|
|
97
|
+
const requestedByFk = fkList.find((fk) => fk.from === "requestedByUserId");
|
|
98
|
+
expect(requestedByFk?.table).toBe("users");
|
|
99
|
+
expect(requestedByFk?.to).toBe("id");
|
|
85
100
|
});
|
|
86
101
|
});
|
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
import { afterAll, beforeAll, describe, expect, test } from "bun:test";
|
|
2
|
+
import { unlink } from "node:fs/promises";
|
|
3
|
+
import {
|
|
4
|
+
closeDb,
|
|
5
|
+
createAgent,
|
|
6
|
+
createTaskExtended,
|
|
7
|
+
getRootTaskChain,
|
|
8
|
+
initDb,
|
|
9
|
+
listRecentSessions,
|
|
10
|
+
} from "../be/db";
|
|
11
|
+
|
|
12
|
+
const TEST_DB_PATH = "./test-sessions.sqlite";
|
|
13
|
+
|
|
14
|
+
describe("sessions — getRootTaskChain + listRecentSessions", () => {
|
|
15
|
+
beforeAll(async () => {
|
|
16
|
+
for (const suffix of ["", "-wal", "-shm"]) {
|
|
17
|
+
try {
|
|
18
|
+
await unlink(`${TEST_DB_PATH}${suffix}`);
|
|
19
|
+
} catch {}
|
|
20
|
+
}
|
|
21
|
+
initDb(TEST_DB_PATH);
|
|
22
|
+
});
|
|
23
|
+
|
|
24
|
+
afterAll(async () => {
|
|
25
|
+
closeDb();
|
|
26
|
+
for (const suffix of ["", "-wal", "-shm"]) {
|
|
27
|
+
try {
|
|
28
|
+
await unlink(`${TEST_DB_PATH}${suffix}`);
|
|
29
|
+
} catch {}
|
|
30
|
+
}
|
|
31
|
+
});
|
|
32
|
+
|
|
33
|
+
test("empty chain — no rows for non-existent root", () => {
|
|
34
|
+
const chain = getRootTaskChain("nonexistent-root-id");
|
|
35
|
+
expect(chain).toEqual([]);
|
|
36
|
+
});
|
|
37
|
+
|
|
38
|
+
test("single-root chain — chain length 1", () => {
|
|
39
|
+
const agent = createAgent({
|
|
40
|
+
id: "sessions-test-agent-1",
|
|
41
|
+
name: "Sessions Test Agent 1",
|
|
42
|
+
isLead: false,
|
|
43
|
+
status: "idle",
|
|
44
|
+
});
|
|
45
|
+
const root = createTaskExtended("root only", { agentId: agent.id });
|
|
46
|
+
|
|
47
|
+
const chain = getRootTaskChain(root.id);
|
|
48
|
+
expect(chain).toHaveLength(1);
|
|
49
|
+
expect(chain[0].id).toBe(root.id);
|
|
50
|
+
expect(chain[0].parentTaskId).toBeUndefined();
|
|
51
|
+
});
|
|
52
|
+
|
|
53
|
+
test("3-level chain — root → child → grandchild", () => {
|
|
54
|
+
const agent = createAgent({
|
|
55
|
+
id: "sessions-test-agent-2",
|
|
56
|
+
name: "Sessions Test Agent 2",
|
|
57
|
+
isLead: false,
|
|
58
|
+
status: "idle",
|
|
59
|
+
});
|
|
60
|
+
const root = createTaskExtended("root", { agentId: agent.id });
|
|
61
|
+
const child = createTaskExtended("child", {
|
|
62
|
+
agentId: agent.id,
|
|
63
|
+
parentTaskId: root.id,
|
|
64
|
+
});
|
|
65
|
+
const grandchild = createTaskExtended("grandchild", {
|
|
66
|
+
agentId: agent.id,
|
|
67
|
+
parentTaskId: child.id,
|
|
68
|
+
});
|
|
69
|
+
|
|
70
|
+
const chain = getRootTaskChain(root.id);
|
|
71
|
+
expect(chain).toHaveLength(3);
|
|
72
|
+
|
|
73
|
+
// ordered by createdAt — root first, then child, then grandchild
|
|
74
|
+
expect(chain.map((t) => t.id)).toEqual([root.id, child.id, grandchild.id]);
|
|
75
|
+
expect(chain[0].parentTaskId).toBeUndefined();
|
|
76
|
+
expect(chain[1].parentTaskId).toBe(root.id);
|
|
77
|
+
expect(chain[2].parentTaskId).toBe(child.id);
|
|
78
|
+
});
|
|
79
|
+
|
|
80
|
+
test("parallel siblings — root with two children", () => {
|
|
81
|
+
const agent = createAgent({
|
|
82
|
+
id: "sessions-test-agent-3",
|
|
83
|
+
name: "Sessions Test Agent 3",
|
|
84
|
+
isLead: false,
|
|
85
|
+
status: "idle",
|
|
86
|
+
});
|
|
87
|
+
const root = createTaskExtended("parallel root", { agentId: agent.id });
|
|
88
|
+
const sibA = createTaskExtended("sibling A", {
|
|
89
|
+
agentId: agent.id,
|
|
90
|
+
parentTaskId: root.id,
|
|
91
|
+
});
|
|
92
|
+
const sibB = createTaskExtended("sibling B", {
|
|
93
|
+
agentId: agent.id,
|
|
94
|
+
parentTaskId: root.id,
|
|
95
|
+
});
|
|
96
|
+
|
|
97
|
+
const chain = getRootTaskChain(root.id);
|
|
98
|
+
expect(chain).toHaveLength(3);
|
|
99
|
+
expect(chain[0].id).toBe(root.id);
|
|
100
|
+
// siblings appear in createdAt order (sibA before sibB)
|
|
101
|
+
const ids = chain.map((t) => t.id);
|
|
102
|
+
expect(ids.indexOf(sibA.id)).toBeLessThan(ids.indexOf(sibB.id));
|
|
103
|
+
});
|
|
104
|
+
|
|
105
|
+
test("listRecentSessions returns root tasks with chain summary", () => {
|
|
106
|
+
const sessions = listRecentSessions({ limit: 50 });
|
|
107
|
+
// We've created multiple roots above; each non-empty session must surface.
|
|
108
|
+
expect(sessions.length).toBeGreaterThanOrEqual(3);
|
|
109
|
+
|
|
110
|
+
for (const s of sessions) {
|
|
111
|
+
// Root tasks only — never have parentTaskId
|
|
112
|
+
expect(s.root.parentTaskId).toBeUndefined();
|
|
113
|
+
expect(typeof s.chainTaskCount).toBe("number");
|
|
114
|
+
expect(s.chainTaskCount).toBeGreaterThanOrEqual(1);
|
|
115
|
+
expect(typeof s.lastActivityAt).toBe("string");
|
|
116
|
+
expect(typeof s.latestStatus).toBe("string");
|
|
117
|
+
}
|
|
118
|
+
|
|
119
|
+
// The 3-level chain must report chainTaskCount of 3
|
|
120
|
+
const threeLevel = sessions.find((s) => s.root.task === "root");
|
|
121
|
+
expect(threeLevel).toBeDefined();
|
|
122
|
+
expect(threeLevel?.chainTaskCount).toBe(3);
|
|
123
|
+
|
|
124
|
+
// The parallel-root must report chainTaskCount of 3 (root + 2 siblings)
|
|
125
|
+
const parallel = sessions.find((s) => s.root.task === "parallel root");
|
|
126
|
+
expect(parallel).toBeDefined();
|
|
127
|
+
expect(parallel?.chainTaskCount).toBe(3);
|
|
128
|
+
|
|
129
|
+
// The single-root chain must report chainTaskCount of 1
|
|
130
|
+
const single = sessions.find((s) => s.root.task === "root only");
|
|
131
|
+
expect(single).toBeDefined();
|
|
132
|
+
expect(single?.chainTaskCount).toBe(1);
|
|
133
|
+
});
|
|
134
|
+
|
|
135
|
+
test("listRecentSessions ordered by lastActivityAt DESC", () => {
|
|
136
|
+
const sessions = listRecentSessions({ limit: 50 });
|
|
137
|
+
for (let i = 1; i < sessions.length; i++) {
|
|
138
|
+
expect(sessions[i - 1].lastActivityAt >= sessions[i].lastActivityAt).toBe(true);
|
|
139
|
+
}
|
|
140
|
+
});
|
|
141
|
+
});
|