@desplega.ai/agent-swarm 1.74.4 → 1.76.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88) hide show
  1. package/README.md +1 -1
  2. package/openapi.json +1264 -46
  3. package/package.json +2 -2
  4. package/src/be/db.ts +563 -9
  5. package/src/be/memory/edges-store.ts +69 -0
  6. package/src/be/memory/providers/sqlite-store.ts +4 -0
  7. package/src/be/memory/raters/explicit-self.ts +22 -0
  8. package/src/be/memory/raters/implicit-citation.ts +44 -0
  9. package/src/be/memory/raters/llm-client.ts +172 -0
  10. package/src/be/memory/raters/llm-summarizer.ts +218 -0
  11. package/src/be/memory/raters/llm.ts +375 -0
  12. package/src/be/memory/raters/noop.ts +14 -0
  13. package/src/be/memory/raters/registry.ts +86 -0
  14. package/src/be/memory/raters/retrieval.ts +88 -0
  15. package/src/be/memory/raters/run-server-raters.ts +97 -0
  16. package/src/be/memory/raters/store.ts +228 -0
  17. package/src/be/memory/raters/types.ts +101 -0
  18. package/src/be/memory/reranker.ts +32 -2
  19. package/src/be/memory/retrieval-store.ts +116 -0
  20. package/src/be/memory/types.ts +3 -0
  21. package/src/be/migrations/051_memory_posteriors_and_retrieval.sql +67 -0
  22. package/src/be/migrations/052_memory_edges.sql +36 -0
  23. package/src/be/migrations/053_agent_waiting_for_credentials_status.sql +61 -0
  24. package/src/be/migrations/054_agent_harness_provider.sql +21 -0
  25. package/src/be/migrations/055_agent_cred_status.sql +15 -0
  26. package/src/be/migrations/056_drop_agent_tasks_source_check.sql +139 -0
  27. package/src/be/migrations/057_inbox_item_state.sql +27 -0
  28. package/src/be/migrations/058_task_templates.sql +31 -0
  29. package/src/be/swarm-config-guard.ts +24 -0
  30. package/src/commands/credential-wait.ts +186 -0
  31. package/src/commands/provider-credentials.ts +434 -0
  32. package/src/commands/runner.ts +253 -21
  33. package/src/hooks/hook.ts +143 -66
  34. package/src/http/agents.ts +191 -1
  35. package/src/http/config.ts +11 -1
  36. package/src/http/core.ts +5 -0
  37. package/src/http/inbox-state.ts +89 -0
  38. package/src/http/index.ts +10 -0
  39. package/src/http/memory.ts +230 -1
  40. package/src/http/sessions.ts +86 -0
  41. package/src/http/status.ts +665 -0
  42. package/src/http/task-templates.ts +51 -0
  43. package/src/http/tasks.ts +85 -5
  44. package/src/http/users.ts +134 -0
  45. package/src/prompts/memories.ts +62 -0
  46. package/src/providers/claude-adapter.ts +22 -0
  47. package/src/providers/claude-managed-adapter.ts +24 -0
  48. package/src/providers/codex-adapter.ts +43 -1
  49. package/src/providers/devin-adapter.ts +18 -0
  50. package/src/providers/index.ts +7 -0
  51. package/src/providers/opencode-adapter.ts +60 -0
  52. package/src/providers/pi-mono-adapter.ts +71 -0
  53. package/src/providers/types.ts +34 -0
  54. package/src/server.ts +2 -0
  55. package/src/slack/handlers.ts +0 -1
  56. package/src/tests/agents-harness-provider.test.ts +333 -0
  57. package/src/tests/credential-check.test.ts +367 -0
  58. package/src/tests/credential-status-api.test.ts +223 -0
  59. package/src/tests/credential-status-routing.test.ts +150 -0
  60. package/src/tests/credential-wait.test.ts +282 -0
  61. package/src/tests/harness-provider-resolution.test.ts +242 -0
  62. package/src/tests/jira-sync.test.ts +1 -1
  63. package/src/tests/memory-edges.test.ts +722 -0
  64. package/src/tests/memory-rate-endpoint.test.ts +330 -0
  65. package/src/tests/memory-rate-tool.test.ts +252 -0
  66. package/src/tests/memory-rater-e2e.test.ts +578 -0
  67. package/src/tests/memory-rater-implicit-citation.test.ts +304 -0
  68. package/src/tests/memory-rater-llm-summarizer.test.ts +317 -0
  69. package/src/tests/memory-rater-llm.test.ts +964 -0
  70. package/src/tests/memory-rater-store.test.ts +249 -0
  71. package/src/tests/memory-reranker.test.ts +161 -2
  72. package/src/tests/migration-runner-regressions.test.ts +17 -2
  73. package/src/tests/mocks/mock-llm-rater-client.ts +35 -0
  74. package/src/tests/run-server-raters.test.ts +291 -0
  75. package/src/tests/sessions.test.ts +141 -0
  76. package/src/tests/status.test.ts +843 -0
  77. package/src/tests/stop-hook-task-resolution.test.ts +98 -0
  78. package/src/tests/template-recommendations.test.ts +148 -0
  79. package/src/tests/tool-annotations.test.ts +2 -2
  80. package/src/tests/use-dismissible-card.test.ts +140 -0
  81. package/src/tools/memory-rate.ts +166 -0
  82. package/src/tools/memory-search.ts +18 -0
  83. package/src/tools/store-progress.ts +37 -0
  84. package/src/tools/swarm-config/set-config.ts +17 -1
  85. package/src/tools/tool-config.ts +1 -0
  86. package/src/types.ts +122 -1
  87. package/src/utils/harness-provider.ts +32 -0
  88. package/tsconfig.json +0 -2
@@ -0,0 +1,375 @@
1
+ /**
2
+ * `LlmRater` — second live rater, source = "llm".
3
+ *
4
+ * Plan: thoughts/taras/plans/2026-05-05-memory-rater-v1.5/step-4.md §2-3
5
+ *
6
+ * The worker-side flow does NOT call `LlmRater.rate(ctx)` from the in-process
7
+ * server-rater orchestrator. Instead, the rating LLM call is piggybacked on
8
+ * the existing session-summary call in `src/hooks/hook.ts` (cost optimization
9
+ * — same Haiku invocation produces both summary text + per-memory ratings).
10
+ * The hook then POSTs the constructed `RatingEvent[]` to `/api/memory/rate`.
11
+ *
12
+ * `LlmRater.rate(ctx)` is wired up so the class still satisfies `MemoryRater`
13
+ * for registry consistency / future direct integrations / unit tests, but is
14
+ * never invoked by `runServerRaters` (LlmRater is NOT in `SERVER_RATERS`).
15
+ *
16
+ * This module is imported from worker-side `src/hooks/hook.ts` so it MUST NOT
17
+ * touch `bun:sqlite` or `src/be/db`. The boundary check enforces it.
18
+ */
19
+ import { z } from "zod";
20
+ import { ClaudeCliLlmRaterClient, type LlmRaterClient, type LlmRaterResult } from "./llm-client";
21
+ import {
22
+ type MemoryRater,
23
+ type RatingContext,
24
+ type RatingEvent,
25
+ REFERENCES_SOURCE_MAX_LENGTH,
26
+ sanitizeReferencesSource,
27
+ } from "./types";
28
+
29
+ /**
30
+ * Per-rating weight, fixed at 0.8 per the research-doc convention
31
+ * ("LLM intent_weight"). Encoded here once so neither callers nor tests can
32
+ * silently drift the constant.
33
+ */
34
+ export const LLM_RATER_WEIGHT = 0.8;
35
+
36
+ const RatingSchema = z.object({
37
+ id: z.string().min(1),
38
+ score: z.number().min(0).max(1),
39
+ reasoning: z.string().min(1).max(500),
40
+ // Step-6 §6 — optional free-form external source ID. Q2 contract: ≤512
41
+ // chars, no closed enum, no prefix parser. Sanitization (control-char
42
+ // strip + NUL rejection) happens in `buildRatingsFromLlm` so a single
43
+ // bad rating drops the field rather than failing the whole batch.
44
+ referencesSource: z.string().min(1).max(REFERENCES_SOURCE_MAX_LENGTH).optional(),
45
+ });
46
+
47
+ /**
48
+ * Zod schema for the structured-output piggyback prompt. The hook asks the
49
+ * summarizer LLM to return summary + per-memory ratings in one JSON object so
50
+ * we don't pay for N additional LLM calls.
51
+ */
52
+ export const SummaryWithRatingsSchema = z.object({
53
+ summary: z.string(),
54
+ ratings: z.array(RatingSchema).default([]),
55
+ });
56
+
57
+ export type LlmRating = z.infer<typeof RatingSchema>;
58
+ export type SummaryWithRatings = z.infer<typeof SummaryWithRatingsSchema>;
59
+
60
+ /** Context augmentations LlmRater consumes when called directly (per-memory path). */
61
+ export type LlmRatingContext = RatingContext & {
62
+ /** What the agent asked the memory system. */
63
+ query?: string;
64
+ /** Final agent response / summary used as the "did this help?" signal. */
65
+ response?: string;
66
+ /** Snapshots for the memories listed in `retrievedMemoryIds`, matched to them by `id`. */
67
+ retrievedMemories?: { id: string; name: string; content: string }[];
68
+ };
69
+
70
/**
 * `MemoryRater` implementation registered under the name `"llm"`.
 *
 * Scores each retrieved memory by delegating to an {@link LlmRaterClient}.
 * The client is injectable so tests can substitute a stub; by default a
 * `ClaudeCliLlmRaterClient` is constructed.
 */
export class LlmRater implements MemoryRater {
  readonly name = "llm";

  constructor(public readonly client: LlmRaterClient = new ClaudeCliLlmRaterClient()) {}

  /**
   * Per-memory scoring path. The production hook bypasses this method and
   * calls {@link buildRatingsFromLlm} on the piggybacked summarizer JSON
   * (one LLM invocation, not N). Direct callers (tests, future integrations)
   * MUST pass {@link LlmRatingContext} — the base `RatingContext` carries
   * only memory IDs, which is insufficient to drive `LlmRaterClient.rate`.
   *
   * Best-effort per memory: an id without a snapshot, a client failure, or a
   * `null` client result skips that memory and the loop carries on.
   *
   * @param ctx - Rating context; treated as an {@link LlmRatingContext}.
   * @returns One event per successfully scored memory, in
   *   `retrievedMemoryIds` order. `[]` when no ids or no snapshots are given.
   */
  async rate(ctx: RatingContext): Promise<RatingEvent[]> {
    const enriched = ctx as LlmRatingContext;
    if (enriched.retrievedMemoryIds.length === 0) return [];
    const memories = enriched.retrievedMemories;
    if (!memories || memories.length === 0) return [];

    // Index the snapshots once instead of scanning the array for every id.
    // The first snapshot wins on duplicate ids, matching `Array.find`.
    const snapshotById = new Map<string, (typeof memories)[number]>();
    for (const snapshot of memories) {
      if (!snapshotById.has(snapshot.id)) snapshotById.set(snapshot.id, snapshot);
    }

    const events: RatingEvent[] = [];
    for (const memoryId of enriched.retrievedMemoryIds) {
      const memory = snapshotById.get(memoryId);
      if (!memory) continue;
      let result: LlmRaterResult | null;
      try {
        result = await this.client.rate({
          query: enriched.query ?? "",
          memory,
          response: enriched.response ?? enriched.evidence ?? "",
        });
      } catch (err) {
        // A rejection value is not guaranteed to be an Error (it can even be
        // null); reading `.message` unguarded would throw from inside this
        // handler and abort the remaining memories.
        console.error(
          `[memory-rater:llm] client.rate threw for memoryId=${memoryId}:`,
          err instanceof Error ? err.message : String(err),
        );
        continue;
      }
      if (!result) continue;
      events.push({
        memoryId,
        // Map score in [0, 1] onto signal in [-1, +1].
        signal: 2 * result.score - 1,
        weight: LLM_RATER_WEIGHT,
        // Framework stamps `source = rater.name` in `runServerRaters`. Raters
        // that populate `source` themselves are rejected by `applyRating`.
        source: "",
        reasoning: result.reasoning,
      });
    }
    return events;
  }
}
123
+
124
/**
 * Turn the `ratings` array of the piggybacked summary into the
 * `RatingEvent[]` payload for `POST /api/memory/rate`.
 *
 * Ratings whose `id` is not part of `retrievals` are discarded
 * (defence-in-depth — the LLM occasionally hallucinates memory IDs; the
 * server-side R6 check catches it too, but rejecting upstream keeps the
 * audit log cleaner).
 *
 * Per-event mapping:
 * - `signal = 2 * score - 1` (0 → -1, 0.5 → 0, 1 → +1)
 * - `weight` = {@link LLM_RATER_WEIGHT} (0.8)
 * - `source` = `"llm"` (the HTTP rate endpoint enums `["llm", "explicit-self"]`)
 *
 * @param ratings - Ratings parsed from the summarizer output.
 * @param retrievals - Memories that were actually retrieved; only `id` is read.
 * @returns Events for the accepted ratings, in input order.
 */
export function buildRatingsFromLlm(
  ratings: LlmRating[],
  retrievals: { id: string }[],
): RatingEvent[] {
  const retrievedIds = new Set<string>();
  for (const retrieval of retrievals) retrievedIds.add(retrieval.id);

  return ratings
    .filter((rating) => retrievedIds.has(rating.id))
    .map((rating): RatingEvent => {
      // Step-6 §6 — sanitize before propagation. A rejected value (null from
      // the sanitizer) only drops the edge; the rating itself is still
      // emitted so the memory's own posterior gets the signal.
      const sanitized =
        rating.referencesSource === undefined
          ? null
          : sanitizeReferencesSource(rating.referencesSource);
      const edge = sanitized !== null && sanitized !== undefined ? { referencesSource: sanitized } : {};

      return {
        memoryId: rating.id,
        signal: 2 * rating.score - 1,
        weight: LLM_RATER_WEIGHT,
        source: "llm",
        reasoning: rating.reasoning,
        ...edge,
      };
    });
}
166
+
167
+ /**
168
+ * Maximum number of characters of a memory's `content` that
169
+ * `buildSummaryWithRatingsPrompt` embeds in the prompt. Longer content is
170
+ * cut to this many characters and suffixed with "…".
171
+ *
172
+ * The cap keeps the prompt within Haiku's context budget on long sessions.
173
+ * The server already truncates `agent_memory.content` to 500 chars in the
174
+ * retrievals endpoint, so content normally arrives below the cap and is
175
+ * embedded unchanged.
176
+ */
177
+ const RETRIEVAL_PROMPT_CONTENT_CAP = 600;
178
+
+ /**
+ * Append a structured-output instruction to the existing summary prompt so
+ * the same `claude -p` invocation produces both summary text AND per-memory
+ * ratings against `SummaryWithRatingsSchema`.
+ *
+ * @param basePrompt - The existing summary prompt. Returned unchanged when
+ *   `retrievals` is empty.
+ * @param retrievals - Memories to list at the end of the prompt. Each one is
+ *   rendered with a 1-based position, its `id`, `name` and `content`;
+ *   `content` is capped at `RETRIEVAL_PROMPT_CONTENT_CAP` characters.
+ * @returns `basePrompt` followed by the JSON-only instruction, the schema,
+ *   the scoring rules and the memory list.
+ */
179
+ export function buildSummaryWithRatingsPrompt(
180
+ basePrompt: string,
181
+ retrievals: { id: string; name: string; content: string }[],
182
+ ): string {
+ // Nothing to rate: leave the summary prompt exactly as it was.
183
+ if (retrievals.length === 0) return basePrompt;
+ // One entry per memory, separated by blank lines.
184
+ const memoryBlock = retrievals
185
+ .map((m, i) => {
186
+ const content =
187
+ m.content.length > RETRIEVAL_PROMPT_CONTENT_CAP
188
+ ? `${m.content.slice(0, RETRIEVAL_PROMPT_CONTENT_CAP)}…`
189
+ : m.content;
190
+ return `Memory #${i + 1}\n id: ${m.id}\n name: ${m.name}\n content: ${content}`;
191
+ })
192
+ .join("\n\n");
193
+
194
+ return `${basePrompt}
195
+
196
+ CRITICAL: Your entire response MUST be a single JSON object that conforms to the schema below. Do NOT wrap it in triple-backtick fences (no \`\`\`json or \`\`\`), do NOT add a prose preamble, do NOT add trailing commentary. Just the JSON object, nothing else.
197
+
198
+ Schema:
199
+ {
200
+ "summary": string, // your existing summary text
201
+ "ratings": [ // one entry per memory you can score
202
+ {
203
+ "id": string, // memory id, copied from the list below
204
+ "score": number, // 0 = misleading/unhelpful, 1 = highly useful
205
+ "reasoning": string, // 1..500 chars, why
206
+ "referencesSource": string // OPTIONAL — see note below
207
+ }
208
+ ]
209
+ }
210
+
211
+ Score ONLY memories present in the list below. Use the exact ids. Omit any you cannot evaluate.
212
+
213
+ Optionally for each rating, if the memory clearly references a specific external source (a GitHub PR/issue, a Linear issue, a customer, a Slack thread, an AgentMail thread, etc.), include a \`referencesSource\` string using the convention "<source>:<identifier>" (e.g. "github:owner/repo#N", "linear:KEY-N", "customer:<slug>"). Any prefix is fine — pick what matches the source. Omit the field if no clear external source.
214
+
215
+ Memories retrieved during this session:
216
+
217
+ ${memoryBlock}`;
218
+ }
219
+
220
/**
 * Reports whether `llm` is one of the comma-separated names in the
 * `MEMORY_RATERS` environment variable. The hook uses this to gate the
 * piggyback path — strict opt-in, so deployments that leave the variable
 * unset behave exactly as before.
 *
 * @returns `true` only when an entry equals `llm` after trimming whitespace;
 *   `false` when the variable is unset, empty or blank.
 */
export function isLlmRaterEnabled(): boolean {
  const configured = process.env.MEMORY_RATERS;
  if (configured === undefined || configured.trim().length === 0) return false;

  for (const entry of configured.split(",")) {
    if (entry.trim() === "llm") return true;
  }
  return false;
}
232
+
233
+ /** Memory snapshot returned by `GET /api/memory/retrievals`. */
234
+ export type RetrievalRow = {
235
+ id: string;
236
+ name: string;
237
+ content: string;
238
+ scope?: string;
239
+ /** `agent_memory.source` — present once the API surfaces it (post-PR #451 amendment). */
240
+ source?: string;
241
+ /** `agent_tasks.scheduleId` for the writing task, or null when not a scheduled run. */
242
+ scheduleId?: string | null;
243
+ similarity?: number | null;
244
+ retrievedAt?: string;
245
+ };
246
+
247
+ /**
248
+ * Dedupe candidate memories before LLM rating to prevent posterior inflation
249
+ * from scheduled-task self-similarity.
250
+ *
251
+ * **Why this exists.** Scheduled tasks fire identical task text on every
252
+ * run, and the task-completion path names each memory
253
+ * `"Task: ${task.task.slice(0, 80)}"` (`src/tools/store-progress.ts`). When
254
+ * the next run searches memory, its own past runs surface as "highly
255
+ * similar" rows. Without dedup, the LLM rater scored 5+ near-clones at +1.0
256
+ * each — bumping alpha 5x in a single session and distorting the Beta(α,β)
257
+ * ranking vs. a normal one-shot session. Concrete case (Lead's audit of the
258
+ * first 37 `llm` ratings, post-PR #450): the Claude Code Changelog Monitor
259
+ * hourly cron (taskId `f938d74d-05af-44a7-a0aa-3463d22be502`) produced 5
260
+ * saturating +1s in one rater pass — every rated memory was a prior hourly
261
+ * run.
262
+ *
263
+ * **Discriminator.** `agent_tasks.scheduleId`. Memories sharing a non-null
264
+ * `scheduleId` are by definition from the same scheduled job — that is the
265
+ * exact duplicate class the audit identified, and the only one we want to
266
+ * collapse. We do NOT key on `name` alone, because the 80-char truncation in
267
+ * task-completion names ("Task: …") and session-summary names ("Session: …")
268
+ * means two distinct one-shot tasks/summaries that happen to share the first
269
+ * 80 chars of their description would silently collapse — the false-positive
270
+ * path the PR #451 reviewer flagged.
271
+ *
272
+ * **Pass-through cases (NOT deduped).**
273
+ * - `scheduleId` is null/undefined (manual one-shot tasks, manual memories,
274
+ * file-index memories) — no scheduled-clone risk.
275
+ * - Two memories from different scheduled jobs that happen to surface in
276
+ * the same retrieval set — different `scheduleId`s, both kept.
277
+ *
278
+ * **Tie-break.** Input is `ORDER BY mr.retrievedAt DESC` from
279
+ * `getRetrievalsForAgent`, so "first occurrence per scheduleId" = "freshest
280
+ * surfaced run", which is the representative we want.
281
+ */
282
export function dedupeRetrievalsForRater<T extends { scheduleId?: string | null }>(rows: T[]): T[] {
  // Schedule ids that already have a representative in the output.
  const claimed = new Set<string>();

  return rows.filter(({ scheduleId }) => {
    // Rows without a usable schedule id (null, undefined, empty string) are
    // never collapsed.
    if (typeof scheduleId !== "string" || scheduleId === "") return true;
    // Only the first row seen for a schedule id is kept.
    if (claimed.has(scheduleId)) return false;
    claimed.add(scheduleId);
    return true;
  });
}
295
+
296
/**
 * GET `/api/memory/retrievals?taskId=` — best-effort. Resolves to `[]` on any
 * failure (non-2xx status, network error, malformed payload) so a transient
 * API outage never blocks the summary-indexing path. Never rejects.
 *
 * @param opts.apiUrl - Base URL of the API; the path is appended as-is.
 * @param opts.apiKey - Sent as a `Bearer` token when non-empty.
 * @param opts.agentId - Sent in the `X-Agent-ID` header.
 * @param opts.taskId - Task whose retrievals are requested (URL-encoded).
 * @param opts.fetchImpl - Optional `fetch` replacement, e.g. for tests.
 * @returns The `results` array of the response body, or `[]`.
 */
export async function fetchRetrievalsForTask(opts: {
  apiUrl: string;
  apiKey: string;
  agentId: string;
  taskId: string;
  fetchImpl?: typeof fetch;
}): Promise<RetrievalRow[]> {
  const fetchFn = opts.fetchImpl ?? fetch;
  try {
    const url = `${opts.apiUrl}/api/memory/retrievals?taskId=${encodeURIComponent(opts.taskId)}`;
    const res = await fetchFn(url, {
      headers: {
        "X-Agent-ID": opts.agentId,
        ...(opts.apiKey ? { Authorization: `Bearer ${opts.apiKey}` } : {}),
      },
    });
    if (!res.ok) {
      console.error(
        `[memory-rater:llm] GET /api/memory/retrievals failed: ${res.status} ${res.statusText}`,
      );
      return [];
    }
    // The payload is untrusted JSON: validate the shape before handing it to
    // callers that will iterate it as an array.
    const body = (await res.json()) as { results?: unknown } | null;
    const results = body?.results;
    if (results === undefined || results === null) return [];
    if (!Array.isArray(results)) {
      console.error("[memory-rater:llm] GET /api/memory/retrievals returned non-array results");
      return [];
    }
    return results as RetrievalRow[];
  } catch (err) {
    // The thrown value may not be an Error (it can even be null); reading
    // `.message` unguarded would throw from inside this handler.
    console.error(
      "[memory-rater:llm] fetchRetrievalsForTask threw:",
      err instanceof Error ? err.message : String(err),
    );
    return [];
  }
}
329
+
330
/**
 * POST `/api/memory/rate` — best-effort. Logs on 4xx/5xx and on transport
 * errors, never throws. The worker hook wraps the whole rating block in its
 * own try/catch as a final line of defence — rater failure must never block
 * summary indexing.
 *
 * @param opts.apiUrl - Base URL of the API; the path is appended as-is.
 * @param opts.apiKey - Sent as a `Bearer` token when non-empty.
 * @param opts.agentId - Sent in the `X-Agent-ID` header.
 * @param opts.taskId - When set, copied onto every posted event.
 * @param opts.events - Events to post. An empty list skips the request.
 * @param opts.fetchImpl - Optional `fetch` replacement, e.g. for tests.
 * @returns `{ ok, status }` of the HTTP response; `{ ok: true, status: 0 }`
 *   when there was nothing to send; `{ ok: false, status: 0 }` when the
 *   request itself failed.
 */
export async function postRatings(opts: {
  apiUrl: string;
  apiKey: string;
  agentId: string;
  taskId?: string;
  events: RatingEvent[];
  fetchImpl?: typeof fetch;
}): Promise<{ ok: boolean; status: number }> {
  if (opts.events.length === 0) return { ok: true, status: 0 };
  const fetchFn = opts.fetchImpl ?? fetch;
  // Send an explicit field list; optional fields are omitted when absent.
  const events = opts.events.map((e) => ({
    memoryId: e.memoryId,
    signal: e.signal,
    weight: e.weight,
    source: e.source,
    ...(e.reasoning !== undefined ? { reasoning: e.reasoning } : {}),
    ...(e.referencesSource !== undefined ? { referencesSource: e.referencesSource } : {}),
    ...(opts.taskId ? { taskId: opts.taskId } : {}),
  }));
  try {
    const res = await fetchFn(`${opts.apiUrl}/api/memory/rate`, {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        "X-Agent-ID": opts.agentId,
        ...(opts.apiKey ? { Authorization: `Bearer ${opts.apiKey}` } : {}),
      },
      body: JSON.stringify({ events }),
    });
    if (!res.ok) {
      // The body is only used for the log line; a failure to read it is ignored.
      const text = await res.text().catch(() => "");
      console.error(
        `[memory-rater:llm] POST /api/memory/rate failed: ${res.status} ${res.statusText} ${text.slice(0, 200)}`,
      );
    }
    return { ok: res.ok, status: res.status };
  } catch (err) {
    // The thrown value may not be an Error (it can even be null); reading
    // `.message` unguarded would throw and break the never-throws contract.
    console.error(
      "[memory-rater:llm] postRatings threw:",
      err instanceof Error ? err.message : String(err),
    );
    return { ok: false, status: 0 };
  }
}
@@ -0,0 +1,14 @@
1
+ import type { MemoryRater, RatingEvent } from "./types";
2
+
3
/**
 * The do-nothing rater: it produces no rating events and performs no DB
 * access. The registry falls back to it when `MEMORY_RATERS` is unset or
 * empty, so the framework behaves byte-identically to pre-rater builds.
 */
export class NoopRater implements MemoryRater {
  readonly name = "noop";

  /** Always resolves to a fresh empty list; takes no arguments. */
  rate(): Promise<RatingEvent[]> {
    return Promise.resolve([]);
  }
}
@@ -0,0 +1,86 @@
1
+ import { ExplicitSelfRatingRater } from "./explicit-self";
2
+ import { ImplicitCitationRater } from "./implicit-citation";
3
+ import { LlmRater } from "./llm";
4
+ import { NoopRater } from "./noop";
5
+ import type { MemoryRater } from "./types";
6
+
7
+ /**
8
+ * Plan: thoughts/taras/plans/2026-05-05-memory-rater-v1.5/step-1.md §4
9
+ *
10
+ * `MEMORY_RATERS` env — comma-separated list of rater names. Defaults to
11
+ * `[NoopRater]` when unset/empty so existing deployments stay byte-identical.
12
+ *
13
+ * `MEMORY_RATER_WEIGHTS` env — optional `name:multiplier,...` overrides.
14
+ * Multiplier is applied to every emitted RatingEvent.weight before
15
+ * `applyRating`. Default = 1.0.
16
+ *
17
+ * Each later step touches *only* its own line in the factory map:
18
+ * - step-1: noop only (this PR).
19
+ * - step-2: implicit-citation.
20
+ * - step-4: llm.
21
+ * - step-5: explicit-self.
22
+ *
23
+ * Unknown names are logged and skipped — startup never fails on this.
24
+ */
25
+
26
+ type RaterFactory = () => MemoryRater;
27
+
28
+ const FACTORIES: Record<string, RaterFactory> = {
29
+ noop: () => new NoopRater(),
30
+ "implicit-citation": () => new ImplicitCitationRater(),
31
+ "explicit-self": () => new ExplicitSelfRatingRater(),
32
+ llm: () => new LlmRater(),
33
+ };
34
+
35
+ /**
36
+ * Raters whose `rate(ctx)` runs server-side (in `store-progress.ts` after task
37
+ * completion). Worker-driven raters (e.g. step-4's `LlmRater`, step-5's
38
+ * `ExplicitSelfRatingRater`) emit events from outside this set and POST them to
39
+ * `/api/memory/rate`. The store-progress hook only fires raters listed here.
40
+ *
41
+ * Plan: thoughts/taras/plans/2026-05-05-memory-rater-v1.5/step-2.md §6
42
+ */
43
+ export const SERVER_RATERS = new Set<string>(["implicit-citation"]);
44
+
45
/**
 * Instantiate the raters named in the `MEMORY_RATERS` environment variable
 * (comma-separated, whitespace around names ignored, empty entries skipped).
 *
 * Names without a factory in `FACTORIES` are logged with `console.warn` and
 * skipped; this function never throws because of a bad name.
 *
 * @returns One new rater per recognised name, in the order listed. A single
 *   `NoopRater` when the variable is unset, blank, or names nothing known.
 */
export function getRegisteredRaters(): MemoryRater[] {
  const raw = process.env.MEMORY_RATERS;
  if (!raw || raw.trim() === "") {
    return [new NoopRater()];
  }

  const raters: MemoryRater[] = [];
  for (const entry of raw.split(",")) {
    const name = entry.trim();
    if (name.length === 0) continue;

    // Own-property lookup only. `FACTORIES` is a plain object, so a bare
    // `FACTORIES[name]` would also resolve inherited keys: `__proto__` yields
    // a non-callable object (calling it throws), while `constructor` and
    // `toString` yield functions whose results are not raters.
    const factory = Object.prototype.hasOwnProperty.call(FACTORIES, name)
      ? FACTORIES[name]
      : undefined;
    if (!factory) {
      console.warn(`[memory-rater] Unknown rater "${name}" in MEMORY_RATERS — skipping`);
      continue;
    }
    raters.push(factory());
  }

  return raters.length > 0 ? raters : [new NoopRater()];
}
71
+
72
/**
 * Look up the weight multiplier configured for a rater in the
 * `MEMORY_RATER_WEIGHTS` environment variable (`name:multiplier,...`).
 *
 * Entries are scanned left to right. The first entry whose name matches and
 * whose multiplier parses to a finite, non-negative number wins; entries
 * that are malformed or carry an unusable multiplier are passed over.
 *
 * @param name - Rater name to look up.
 * @returns The configured multiplier, or `1.0` when none applies.
 */
export function getRaterWeightMultiplier(name: string): number {
  const spec = process.env.MEMORY_RATER_WEIGHTS;
  if (spec === undefined || spec.trim().length === 0) return 1.0;

  const entries = spec
    .split(",")
    .map((entry) => entry.trim())
    .filter((entry) => entry !== "");

  for (const entry of entries) {
    const parts = entry.split(":");
    const key = parts[0];
    const value = parts[1];
    // Both halves must be present and non-empty.
    if (!key || !value) continue;
    if (key.trim() !== name) continue;

    const parsed = Number(value);
    if (Number.isFinite(parsed) && parsed >= 0) return parsed;
  }
  return 1.0;
}
@@ -0,0 +1,88 @@
1
+ import { ensure } from "@desplega.ai/business-use";
2
+ import { getDb } from "@/be/db";
3
+
4
+ /**
5
+ * Retrieval-bridge helper — appends `memory_retrieval` audit rows so
6
+ * server-side raters (currently `ImplicitCitationRater`) can correlate the
7
+ * memories surfaced to a task with the evidence emitted during that task.
8
+ *
9
+ * Plan: thoughts/taras/plans/2026-05-05-memory-rater-v1.5/step-2.md §1, §3
10
+ *
11
+ * Both call sites — `POST /api/memory/search` (HTTP) and the in-process
12
+ * `memory-search` MCP tool — call this helper post-rerank when a
13
+ * `X-Source-Task-ID` header is present. When `taskId` is absent or the
14
+ * results array is empty, the function is a no-op so the existing search
15
+ * paths stay byte-identical to today.
16
+ *
17
+ * Best-effort by design: a retrieval-bridge failure must NOT poison search.
18
+ * Callers wrap this in their own try/catch and return search results either
19
+ * way (see `src/http/memory.ts` and `src/tools/memory-search.ts`).
20
+ */
21
+
22
+ export type RetrievalRecord = {
23
+ memoryId: string;
24
+ similarity: number;
25
+ };
26
+
27
+ /**
+ * Append one `memory_retrieval` row per search result for the given task,
+ * then emit a single `memory_retrieved` business-use event.
+ *
+ * Returns immediately, without touching the database, when `taskId` is
+ * missing/empty or `results` is empty.
+ *
+ * @param taskId - Task the retrievals belong to; also used as the event's `runId`.
+ * @param agentId - Agent stored on every row and reported in the event.
+ * @param results - Memory id and similarity of each surfaced memory.
+ * @param sessionId - Optional session id; stored as NULL when omitted.
+ */
+ export function recordRetrievals(
28
+ taskId: string | undefined,
29
+ agentId: string,
30
+ results: RetrievalRecord[],
31
+ sessionId?: string,
32
+ ): void {
33
+ if (!taskId || results.length === 0) return;
34
+
35
+ const db = getDb();
36
+ const insert = db.prepare(
37
+ `INSERT INTO memory_retrieval
38
+ (id, taskId, agentId, sessionId, memoryId, similarity, retrievedAt)
39
+ VALUES (?, ?, ?, ?, ?, ?, ?)`,
40
+ );
+ // One timestamp shared by every row written in this call.
41
+ const now = new Date().toISOString();
42
+
43
+ // Single transaction: even on a 100-row paginated search this is one
44
+ // commit, not N. An empty `results` never gets here (early return above).
45
+ db.transaction(() => {
46
+ for (const r of results) {
47
+ insert.run(
48
+ crypto.randomUUID(),
49
+ taskId,
50
+ agentId,
51
+ sessionId ?? null,
52
+ r.memoryId,
53
+ r.similarity,
54
+ now,
55
+ );
56
+ }
57
+ })();
58
+
59
+ // Business-use instrumentation — one `memory_retrieved` event per call,
60
+ // OUTSIDE the transaction. The validator only inspects the event's own
61
+ // `data` payload (positive count, non-empty task and agent ids).
61
+ ensure({
62
+ id: "memory_retrieved",
63
+ flow: "task",
64
+ runId: taskId,
65
+ data: {
66
+ count: results.length,
67
+ taskId,
68
+ agentId,
69
+ },
70
+ validator: (data) =>
71
+ typeof data.count === "number" &&
72
+ data.count > 0 &&
73
+ typeof data.taskId === "string" &&
74
+ data.taskId.length > 0 &&
75
+ typeof data.agentId === "string" &&
76
+ data.agentId.length > 0,
77
+ });
78
+ }
79
+
80
/**
 * Read back the `memoryId` / `similarity` pairs stored in `memory_retrieval`
 * for one task.
 *
 * @param taskId - Task whose retrieval rows are wanted.
 * @returns Every matching row; `similarity` may be null.
 */
export function getRetrievalsForTask(
  taskId: string,
): { memoryId: string; similarity: number | null }[] {
  type Row = { memoryId: string; similarity: number | null };

  const statement = getDb().prepare<Row, [string]>(
    "SELECT memoryId, similarity FROM memory_retrieval WHERE taskId = ?",
  );
  return statement.all(taskId);
}
@@ -0,0 +1,97 @@
1
+ import {
2
+ getRaterWeightMultiplier as defaultGetRaterWeightMultiplier,
3
+ getRegisteredRaters as defaultGetRegisteredRaters,
4
+ SERVER_RATERS,
5
+ } from "./registry";
6
+ import { type ApplyRatingResult, applyRating as defaultApplyRating } from "./store";
7
+ import type { MemoryRater, RatingEvent } from "./types";
8
+
9
+ /**
10
+ * Inputs for `runServerRaters`. The caller is responsible for fetching
11
+ * `retrievedMemoryIds` from `memory_retrieval` and the concatenated
12
+ * `evidence` text from `session_logs` (both are trivial SELECTs already
13
+ * covered by integration tests in memory-rater-implicit-citation.test.ts).
14
+ */
15
+ export type RunServerRatersInput = {
16
+ taskId: string;
17
+ agentId: string;
18
+ retrievedMemoryIds: string[];
19
+ evidence: string;
20
+ };
21
+
22
+ /**
23
+ * Optional overrides — primarily for unit tests so the orchestration logic
24
+ * (filter → rate → stamp source → clamp weight → applyRating) can be
25
+ * exercised with stub raters and an in-memory `applyRating`.
26
+ */
27
+ export type RunServerRatersDeps = {
28
+ raters?: MemoryRater[];
29
+ serverRaterNames?: ReadonlySet<string>;
30
+ weightMultiplierFor?: (name: string) => number;
31
+ applyRating?: (events: RatingEvent[], ctx: { taskId?: string }) => ApplyRatingResult;
32
+ };
33
+
34
+ export type ServerRaterFireOutcome = {
35
+ rater: string;
36
+ events: RatingEvent[];
37
+ result: ApplyRatingResult;
38
+ };
39
+
40
+ export type RunServerRatersResult = {
41
+ ratersFired: number;
42
+ outcomes: ServerRaterFireOutcome[];
43
+ };
44
+
45
/**
 * Run every allow-listed server-side memory rater for a completed task and
 * persist what they emit.
 *
 * Pipeline per rater: filter by allow-list → `rate(ctx)` → overwrite `source`
 * with the rater's name (the framework's anti-spoof guarantee) → multiply
 * `weight` by the configured `MEMORY_RATER_WEIGHTS` multiplier and clamp to
 * [0, 1] → `applyRating`. A rater that emits nothing is not counted as fired.
 *
 * Plan: thoughts/taras/plans/2026-05-05-memory-rater-v1.5/step-2.md §5
 *
 * Extracted from the previous inline IIFE in `store-progress.ts` (PR #426
 * review feedback) so the orchestration is unit-testable in isolation —
 * `raters`, `serverRaterNames`, `weightMultiplierFor`, and `applyRating` are
 * all injectable. With no overrides, behaviour is byte-identical to the
 * original inline block.
 *
 * Does nothing when `retrievedMemoryIds` is empty. Rater errors propagate;
 * callers are expected to wrap the call in try/catch (rater failure must
 * NEVER affect task status — see the `console.error` site in
 * `store-progress.ts`).
 *
 * @param input - Task, agent, retrieved memory ids and evidence text.
 * @param deps - Optional overrides for the registry and store functions.
 * @returns How many raters produced events, and the stamped events plus the
 *   `applyRating` result for each of them.
 */
export async function runServerRaters(
  input: RunServerRatersInput,
  deps: RunServerRatersDeps = {},
): Promise<RunServerRatersResult> {
  const summary: RunServerRatersResult = { ratersFired: 0, outcomes: [] };
  if (input.retrievedMemoryIds.length === 0) {
    return summary;
  }

  // Resolve collaborators; the registry/store defaults are only consulted
  // when the corresponding override is absent.
  const candidates = deps.raters ?? defaultGetRegisteredRaters();
  const allowList = deps.serverRaterNames ?? SERVER_RATERS;
  const multiplierFor = deps.weightMultiplierFor ?? defaultGetRaterWeightMultiplier;
  const persist = deps.applyRating ?? defaultApplyRating;

  const eligible = candidates.filter((candidate) => allowList.has(candidate.name));

  for (const rater of eligible) {
    const emitted = await rater.rate({
      taskId: input.taskId,
      agentId: input.agentId,
      retrievedMemoryIds: input.retrievedMemoryIds,
      evidence: input.evidence,
    });
    if (emitted.length === 0) {
      continue;
    }

    const factor = multiplierFor(rater.name);
    const stamped: RatingEvent[] = emitted.map((event) => {
      const scaled = event.weight * factor;
      return {
        ...event,
        // Whatever the rater put in `source` is replaced by its own name.
        source: rater.name,
        weight: Math.max(0, Math.min(1, scaled)),
      };
    });

    const applied = persist(stamped, { taskId: input.taskId });
    summary.ratersFired += 1;
    summary.outcomes.push({ rater: rater.name, events: stamped, result: applied });
  }

  return summary;
}