@desplega.ai/agent-swarm 1.74.4 → 1.76.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/openapi.json +1264 -46
- package/package.json +2 -2
- package/src/be/db.ts +563 -9
- package/src/be/memory/edges-store.ts +69 -0
- package/src/be/memory/providers/sqlite-store.ts +4 -0
- package/src/be/memory/raters/explicit-self.ts +22 -0
- package/src/be/memory/raters/implicit-citation.ts +44 -0
- package/src/be/memory/raters/llm-client.ts +172 -0
- package/src/be/memory/raters/llm-summarizer.ts +218 -0
- package/src/be/memory/raters/llm.ts +375 -0
- package/src/be/memory/raters/noop.ts +14 -0
- package/src/be/memory/raters/registry.ts +86 -0
- package/src/be/memory/raters/retrieval.ts +88 -0
- package/src/be/memory/raters/run-server-raters.ts +97 -0
- package/src/be/memory/raters/store.ts +228 -0
- package/src/be/memory/raters/types.ts +101 -0
- package/src/be/memory/reranker.ts +32 -2
- package/src/be/memory/retrieval-store.ts +116 -0
- package/src/be/memory/types.ts +3 -0
- package/src/be/migrations/051_memory_posteriors_and_retrieval.sql +67 -0
- package/src/be/migrations/052_memory_edges.sql +36 -0
- package/src/be/migrations/053_agent_waiting_for_credentials_status.sql +61 -0
- package/src/be/migrations/054_agent_harness_provider.sql +21 -0
- package/src/be/migrations/055_agent_cred_status.sql +15 -0
- package/src/be/migrations/056_drop_agent_tasks_source_check.sql +139 -0
- package/src/be/migrations/057_inbox_item_state.sql +27 -0
- package/src/be/migrations/058_task_templates.sql +31 -0
- package/src/be/swarm-config-guard.ts +24 -0
- package/src/commands/credential-wait.ts +186 -0
- package/src/commands/provider-credentials.ts +434 -0
- package/src/commands/runner.ts +253 -21
- package/src/hooks/hook.ts +143 -66
- package/src/http/agents.ts +191 -1
- package/src/http/config.ts +11 -1
- package/src/http/core.ts +5 -0
- package/src/http/inbox-state.ts +89 -0
- package/src/http/index.ts +10 -0
- package/src/http/memory.ts +230 -1
- package/src/http/sessions.ts +86 -0
- package/src/http/status.ts +665 -0
- package/src/http/task-templates.ts +51 -0
- package/src/http/tasks.ts +85 -5
- package/src/http/users.ts +134 -0
- package/src/prompts/memories.ts +62 -0
- package/src/providers/claude-adapter.ts +22 -0
- package/src/providers/claude-managed-adapter.ts +24 -0
- package/src/providers/codex-adapter.ts +43 -1
- package/src/providers/devin-adapter.ts +18 -0
- package/src/providers/index.ts +7 -0
- package/src/providers/opencode-adapter.ts +60 -0
- package/src/providers/pi-mono-adapter.ts +71 -0
- package/src/providers/types.ts +34 -0
- package/src/server.ts +2 -0
- package/src/slack/handlers.ts +0 -1
- package/src/tests/agents-harness-provider.test.ts +333 -0
- package/src/tests/credential-check.test.ts +367 -0
- package/src/tests/credential-status-api.test.ts +223 -0
- package/src/tests/credential-status-routing.test.ts +150 -0
- package/src/tests/credential-wait.test.ts +282 -0
- package/src/tests/harness-provider-resolution.test.ts +242 -0
- package/src/tests/jira-sync.test.ts +1 -1
- package/src/tests/memory-edges.test.ts +722 -0
- package/src/tests/memory-rate-endpoint.test.ts +330 -0
- package/src/tests/memory-rate-tool.test.ts +252 -0
- package/src/tests/memory-rater-e2e.test.ts +578 -0
- package/src/tests/memory-rater-implicit-citation.test.ts +304 -0
- package/src/tests/memory-rater-llm-summarizer.test.ts +317 -0
- package/src/tests/memory-rater-llm.test.ts +964 -0
- package/src/tests/memory-rater-store.test.ts +249 -0
- package/src/tests/memory-reranker.test.ts +161 -2
- package/src/tests/migration-runner-regressions.test.ts +17 -2
- package/src/tests/mocks/mock-llm-rater-client.ts +35 -0
- package/src/tests/run-server-raters.test.ts +291 -0
- package/src/tests/sessions.test.ts +141 -0
- package/src/tests/status.test.ts +843 -0
- package/src/tests/stop-hook-task-resolution.test.ts +98 -0
- package/src/tests/template-recommendations.test.ts +148 -0
- package/src/tests/tool-annotations.test.ts +2 -2
- package/src/tests/use-dismissible-card.test.ts +140 -0
- package/src/tools/memory-rate.ts +166 -0
- package/src/tools/memory-search.ts +18 -0
- package/src/tools/store-progress.ts +37 -0
- package/src/tools/swarm-config/set-config.ts +17 -1
- package/src/tools/tool-config.ts +1 -0
- package/src/types.ts +122 -1
- package/src/utils/harness-provider.ts +32 -0
- package/tsconfig.json +0 -2
|
@@ -0,0 +1,375 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* `LlmRater` — second live rater, source = "llm".
|
|
3
|
+
*
|
|
4
|
+
* Plan: thoughts/taras/plans/2026-05-05-memory-rater-v1.5/step-4.md §2-3
|
|
5
|
+
*
|
|
6
|
+
* The worker-side flow does NOT call `LlmRater.rate(ctx)` from the in-process
|
|
7
|
+
* server-rater orchestrator. Instead, the rating LLM call is piggybacked on
|
|
8
|
+
* the existing session-summary call in `src/hooks/hook.ts` (cost optimization
|
|
9
|
+
* — same Haiku invocation produces both summary text + per-memory ratings).
|
|
10
|
+
* The hook then POSTs the constructed `RatingEvent[]` to `/api/memory/rate`.
|
|
11
|
+
*
|
|
12
|
+
* `LlmRater.rate(ctx)` is wired up so the class still satisfies `MemoryRater`
|
|
13
|
+
* for registry consistency / future direct integrations / unit tests, but is
|
|
14
|
+
* never invoked by `runServerRaters` (LlmRater is NOT in `SERVER_RATERS`).
|
|
15
|
+
*
|
|
16
|
+
* This module is imported from worker-side `src/hooks/hook.ts` so it MUST NOT
|
|
17
|
+
* touch `bun:sqlite` or `src/be/db`. The boundary check enforces it.
|
|
18
|
+
*/
|
|
19
|
+
import { z } from "zod";
|
|
20
|
+
import { ClaudeCliLlmRaterClient, type LlmRaterClient, type LlmRaterResult } from "./llm-client";
|
|
21
|
+
import {
|
|
22
|
+
type MemoryRater,
|
|
23
|
+
type RatingContext,
|
|
24
|
+
type RatingEvent,
|
|
25
|
+
REFERENCES_SOURCE_MAX_LENGTH,
|
|
26
|
+
sanitizeReferencesSource,
|
|
27
|
+
} from "./types";
|
|
28
|
+
|
|
29
|
+
/**
|
|
30
|
+
* Per-rating weight, fixed at 0.8 per the research-doc convention
|
|
31
|
+
* ("LLM intent_weight"). Encoded here once so neither callers nor tests can
|
|
32
|
+
* silently drift the constant.
|
|
33
|
+
*/
|
|
34
|
+
export const LLM_RATER_WEIGHT = 0.8;
|
|
35
|
+
|
|
36
|
+
/**
 * Zod schema for a single per-memory rating inside the summarizer LLM's
 * structured output.
 */
const RatingSchema = z.object({
  // Id of the memory being rated; must be a non-empty string.
  id: z.string().min(1),
  // Score on the closed interval [0, 1].
  score: z.number().min(0).max(1),
  // Free-text justification, between 1 and 500 characters.
  reasoning: z.string().min(1).max(500),
  // Step-6 §6 — optional free-form external source ID. Q2 contract: ≤512
  // chars, no closed enum, no prefix parser. Sanitization (control-char
  // strip + NUL rejection) happens in `buildRatingsFromLlm` so a single
  // bad rating drops the field rather than failing the whole batch.
  // NOTE(review): the `.min(1)` / `.max(...)` bounds below are still enforced
  // by the schema itself, so an empty or over-long value fails parsing instead
  // of being dropped — confirm that is the intended behaviour.
  referencesSource: z.string().min(1).max(REFERENCES_SOURCE_MAX_LENGTH).optional(),
});

/**
 * Zod schema for the structured-output piggyback prompt. The hook asks the
 * summarizer LLM to return summary + per-memory ratings in one JSON object so
 * we don't pay for N additional LLM calls.
 */
export const SummaryWithRatingsSchema = z.object({
  // The summary text itself (no length constraint is applied here).
  summary: z.string(),
  // Per-memory ratings; a missing `ratings` key parses as an empty array.
  ratings: z.array(RatingSchema).default([]),
});
|
|
56
|
+
|
|
57
|
+
export type LlmRating = z.infer<typeof RatingSchema>;
|
|
58
|
+
export type SummaryWithRatings = z.infer<typeof SummaryWithRatingsSchema>;
|
|
59
|
+
|
|
60
|
+
/** Context augmentations LlmRater consumes when called directly (per-memory path). */
|
|
61
|
+
export type LlmRatingContext = RatingContext & {
|
|
62
|
+
/** What the agent asked the memory system. */
|
|
63
|
+
query?: string;
|
|
64
|
+
/** Final agent response / summary used as the "did this help?" signal. */
|
|
65
|
+
response?: string;
|
|
66
|
+
/** Snapshots for memories listed in `retrievedMemoryIds` (id-aligned by id). */
|
|
67
|
+
retrievedMemories?: { id: string; name: string; content: string }[];
|
|
68
|
+
};
|
|
69
|
+
|
|
70
|
+
/**
 * `MemoryRater` implementation whose rater name is `"llm"`.
 *
 * The LLM client is injectable through the constructor; when omitted a
 * `ClaudeCliLlmRaterClient` is created.
 */
export class LlmRater implements MemoryRater {
  readonly name = "llm";

  constructor(public readonly client: LlmRaterClient = new ClaudeCliLlmRaterClient()) {}

  /**
   * Per-memory scoring path. The production hook bypasses this method and
   * calls {@link buildRatingsFromLlm} on the piggybacked summarizer JSON
   * (one LLM invocation, not N). Direct callers (tests, future integrations)
   * MUST pass {@link LlmRatingContext} — the base `RatingContext` carries
   * only memory IDs, which is insufficient to drive `LlmRaterClient.rate`.
   *
   * Returns `[]` when the augmented fields are missing so the rater stays a
   * no-op rather than crashing on a `RatingContext`-only invocation.
   *
   * Memories are rated one at a time, in `retrievedMemoryIds` order. A client
   * call that throws or returns a falsy result is logged (throw only) and
   * skipped; it never aborts the remaining memories.
   *
   * @param ctx Rating context; expected to actually be an `LlmRatingContext`.
   * @returns One event per successfully rated memory, with
   *   `signal = 2 * score - 1` and `weight = LLM_RATER_WEIGHT`.
   */
  async rate(ctx: RatingContext): Promise<RatingEvent[]> {
    const enriched = ctx as LlmRatingContext;
    if (enriched.retrievedMemoryIds.length === 0) return [];
    const memories = enriched.retrievedMemories;
    if (!memories || memories.length === 0) return [];

    // Index the snapshots once instead of scanning the array for every id.
    // The first snapshot for an id wins, matching `Array.prototype.find`.
    type MemorySnapshot = NonNullable<LlmRatingContext["retrievedMemories"]>[number];
    const memoryById = new Map<string, MemorySnapshot>();
    for (const snapshot of memories) {
      if (!memoryById.has(snapshot.id)) memoryById.set(snapshot.id, snapshot);
    }

    const events: RatingEvent[] = [];
    for (const memoryId of enriched.retrievedMemoryIds) {
      const memory = memoryById.get(memoryId);
      // No snapshot for this id — there is nothing to show the LLM.
      if (!memory) continue;
      let result: LlmRaterResult | null;
      try {
        result = await this.client.rate({
          query: enriched.query ?? "",
          memory,
          response: enriched.response ?? enriched.evidence ?? "",
        });
      } catch (err) {
        // Narrow before reading `.message`: a thrown non-Error value would
        // otherwise be logged as `undefined`.
        console.error(
          `[memory-rater:llm] client.rate threw for memoryId=${memoryId}:`,
          err instanceof Error ? err.message : String(err),
        );
        continue;
      }
      if (!result) continue;
      events.push({
        memoryId,
        signal: 2 * result.score - 1,
        weight: LLM_RATER_WEIGHT,
        // Framework stamps `source = rater.name` in `runServerRaters`. Raters
        // that populate `source` themselves are rejected by `applyRating`.
        source: "",
        reasoning: result.reasoning,
      });
    }
    return events;
  }
}
|
|
123
|
+
|
|
124
|
+
/**
 * Turn the `ratings` array of the piggybacked summary into the
 * `RatingEvent[]` payload for `POST /api/memory/rate`.
 *
 * Ratings whose `id` is not part of `retrievals` are discarded
 * (defence-in-depth: the LLM occasionally hallucinates memory IDs; the
 * server-side R6 check catches that too, but rejecting upstream keeps the
 * audit log cleaner).
 *
 * Per-event mapping:
 * - `signal = 2 * score - 1` (0 → -1, 0.5 → 0, 1 → +1).
 * - `weight` = {@link LLM_RATER_WEIGHT} (0.8).
 * - `source` = `"llm"` (the HTTP rate endpoint enums `["llm", "explicit-self"]`).
 * - `referencesSource` is included only when the rating carries one and
 *   `sanitizeReferencesSource` does not reject it.
 *
 * @param ratings Ratings parsed from the LLM output.
 * @param retrievals Memories that were actually retrieved; only `id` is read.
 * @returns Events in the same relative order as the accepted ratings.
 */
export function buildRatingsFromLlm(
  ratings: LlmRating[],
  retrievals: { id: string }[],
): RatingEvent[] {
  const retrievedIds = new Set(retrievals.map(({ id }) => id));

  return ratings
    .filter((rating) => retrievedIds.has(rating.id))
    .map((rating): RatingEvent => {
      // Step-6 §6 — sanitize before propagation. If the LLM emits a NUL byte
      // or an all-control-chars string, drop the edge but keep the rating
      // (best-effort: the memory's own posterior still gets the signal).
      let safeReferencesSource: string | undefined;
      if (rating.referencesSource !== undefined) {
        const sanitized = sanitizeReferencesSource(rating.referencesSource);
        if (sanitized !== null) safeReferencesSource = sanitized;
      }

      return {
        memoryId: rating.id,
        signal: 2 * rating.score - 1,
        weight: LLM_RATER_WEIGHT,
        source: "llm",
        reasoning: rating.reasoning,
        ...(safeReferencesSource !== undefined ? { referencesSource: safeReferencesSource } : {}),
      };
    });
}
|
|
166
|
+
|
|
167
|
+
/**
 * Maximum number of UTF-16 code units of a memory's `content` that is copied
 * into the rating prompt. Keeps the prompt within Haiku's context budget on
 * long sessions; the server already truncates `agent_memory.content` to 500
 * chars in the retrievals endpoint, so staying under the cap is the typical
 * case.
 */
const RETRIEVAL_PROMPT_CONTENT_CAP = 600;

/**
 * Append a structured-output instruction to the existing summary prompt so
 * the same `claude -p` invocation produces both summary text AND per-memory
 * ratings against `SummaryWithRatingsSchema`.
 *
 * Memory `content` longer than {@link RETRIEVAL_PROMPT_CONTENT_CAP} is cut
 * and suffixed with `…`. The cut never lands in the middle of a UTF-16
 * surrogate pair, so the prompt cannot contain a dangling half character.
 *
 * @param basePrompt The summary prompt to extend.
 * @param retrievals Memories retrieved during the session.
 * @returns `basePrompt` unchanged when `retrievals` is empty; otherwise the
 *   base prompt followed by the JSON instructions and the memory list.
 */
export function buildSummaryWithRatingsPrompt(
  basePrompt: string,
  retrievals: { id: string; name: string; content: string }[],
): string {
  if (retrievals.length === 0) return basePrompt;
  const memoryBlock = retrievals
    .map((m, i) => {
      let content = m.content;
      if (content.length > RETRIEVAL_PROMPT_CONTENT_CAP) {
        let end = RETRIEVAL_PROMPT_CONTENT_CAP;
        // If the last kept code unit is a high surrogate (0xD800–0xDBFF) its
        // low half would be cut off; drop it too rather than emit a lone
        // surrogate.
        const lastUnit = content.charCodeAt(end - 1);
        if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) end -= 1;
        content = `${content.slice(0, end)}…`;
      }
      return `Memory #${i + 1}\n id: ${m.id}\n name: ${m.name}\n content: ${content}`;
    })
    .join("\n\n");

  return `${basePrompt}

CRITICAL: Your entire response MUST be a single JSON object that conforms to the schema below. Do NOT wrap it in triple-backtick fences (no \`\`\`json or \`\`\`), do NOT add a prose preamble, do NOT add trailing commentary. Just the JSON object, nothing else.

Schema:
{
"summary": string, // your existing summary text
"ratings": [ // one entry per memory you can score
{
"id": string, // memory id, copied from the list below
"score": number, // 0 = misleading/unhelpful, 1 = highly useful
"reasoning": string, // 1..500 chars, why
"referencesSource": string // OPTIONAL — see note below
}
]
}

Score ONLY memories present in the list below. Use the exact ids. Omit any you cannot evaluate.

Optionally for each rating, if the memory clearly references a specific external source (a GitHub PR/issue, a Linear issue, a customer, a Slack thread, an AgentMail thread, etc.), include a \`referencesSource\` string using the convention "<source>:<identifier>" (e.g. "github:owner/repo#N", "linear:KEY-N", "customer:<slug>"). Any prefix is fine — pick what matches the source. Omit the field if no clear external source.

Memories retrieved during this session:

${memoryBlock}`;
}
|
|
219
|
+
|
|
220
|
+
/**
 * Reports whether the comma-separated `MEMORY_RATERS` environment variable
 * lists `llm`. The hook uses this to gate the piggyback path — strict opt-in
 * so existing deployments are byte-identical when the variable is unset.
 *
 * Entries are compared after trimming surrounding whitespace; the match is
 * exact and case-sensitive.
 *
 * @returns `true` only when one entry equals `"llm"`.
 */
export function isLlmRaterEnabled(): boolean {
  const configured = process.env.MEMORY_RATERS;
  // Unset, empty, or whitespace-only means the rater was never opted into.
  if (!configured) return false;
  if (configured.trim() === "") return false;

  for (const entry of configured.split(",")) {
    if (entry.trim() === "llm") return true;
  }
  return false;
}
|
|
232
|
+
|
|
233
|
+
/** Memory snapshot returned by `GET /api/memory/retrievals`. */
|
|
234
|
+
export type RetrievalRow = {
|
|
235
|
+
id: string;
|
|
236
|
+
name: string;
|
|
237
|
+
content: string;
|
|
238
|
+
scope?: string;
|
|
239
|
+
/** `agent_memory.source` — present once the API surfaces it (post-PR #451 amendment). */
|
|
240
|
+
source?: string;
|
|
241
|
+
/** `agent_tasks.scheduleId` for the writing task, or null when not a scheduled run. */
|
|
242
|
+
scheduleId?: string | null;
|
|
243
|
+
similarity?: number | null;
|
|
244
|
+
retrievedAt?: string;
|
|
245
|
+
};
|
|
246
|
+
|
|
247
|
+
/**
|
|
248
|
+
* Dedupe candidate memories before LLM rating to prevent posterior inflation
|
|
249
|
+
* from scheduled-task self-similarity.
|
|
250
|
+
*
|
|
251
|
+
* **Why this exists.** Scheduled tasks fire identical task text on every
|
|
252
|
+
* run, and the task-completion path names each memory
|
|
253
|
+
* `"Task: ${task.task.slice(0, 80)}"` (`src/tools/store-progress.ts`). When
|
|
254
|
+
* the next run searches memory, its own past runs surface as "highly
|
|
255
|
+
* similar" rows. Without dedup, the LLM rater scored 5+ near-clones at +1.0
|
|
256
|
+
* each — bumping alpha 5x in a single session and distorting the Beta(α,β)
|
|
257
|
+
* ranking vs. a normal one-shot session. Concrete case (Lead's audit of the
|
|
258
|
+
* first 37 `llm` ratings, post-PR #450): the Claude Code Changelog Monitor
|
|
259
|
+
* hourly cron (taskId `f938d74d-05af-44a7-a0aa-3463d22be502`) produced 5
|
|
260
|
+
* saturating +1s in one rater pass — every rated memory was a prior hourly
|
|
261
|
+
* run.
|
|
262
|
+
*
|
|
263
|
+
* **Discriminator.** `agent_tasks.scheduleId`. Memories sharing a non-null
|
|
264
|
+
* `scheduleId` are by definition from the same scheduled job — that is the
|
|
265
|
+
* exact duplicate class the audit identified, and the only one we want to
|
|
266
|
+
* collapse. We do NOT key on `name` alone, because the 80-char truncation in
|
|
267
|
+
* task-completion names ("Task: …") and session-summary names ("Session: …")
|
|
268
|
+
* means two distinct one-shot tasks/summaries that happen to share the first
|
|
269
|
+
* 80 chars of their description would silently collapse — the false-positive
|
|
270
|
+
* path the PR #451 reviewer flagged.
|
|
271
|
+
*
|
|
272
|
+
* **Pass-through cases (NOT deduped).**
|
|
273
|
+
* - `scheduleId` is null/undefined (manual one-shot tasks, manual memories,
|
|
274
|
+
* file-index memories) — no scheduled-clone risk.
|
|
275
|
+
* - Two memories from different scheduled jobs that happen to surface in
|
|
276
|
+
* the same retrieval set — different `scheduleId`s, both kept.
|
|
277
|
+
*
|
|
278
|
+
* **Tie-break.** Input is `ORDER BY mr.retrievedAt DESC` from
|
|
279
|
+
* `getRetrievalsForAgent`, so "first occurrence per scheduleId" = "freshest
|
|
280
|
+
* surfaced run", which is the representative we want.
|
|
281
|
+
*/
|
|
282
|
+
export function dedupeRetrievalsForRater<T extends { scheduleId?: string | null }>(rows: T[]): T[] {
  // Schedule ids that already have a representative in the output.
  const claimedSchedules = new Set<string>();

  return rows.filter(({ scheduleId }) => {
    // Rows without a usable schedule id (null, undefined, empty string) are
    // never collapsed — they always pass through.
    if (typeof scheduleId !== "string" || scheduleId.length === 0) return true;
    // A later row of an already-seen schedule is dropped; the first
    // occurrence in input order is the one that is kept.
    if (claimedSchedules.has(scheduleId)) return false;
    claimedSchedules.add(scheduleId);
    return true;
  });
}
|
|
295
|
+
|
|
296
|
+
/**
 * GET `/api/memory/retrievals?taskId=` — best-effort. Returns `[]` on any
 * failure so a transient API outage never blocks the summary-indexing path.
 *
 * "Any failure" covers a non-2xx response, a thrown/rejected fetch or JSON
 * parse, and a response body whose `results` is missing or not an array.
 *
 * @param opts.apiUrl Base URL of the API (the path is appended as-is).
 * @param opts.apiKey Bearer token; the `Authorization` header is omitted when empty.
 * @param opts.agentId Sent as the `X-Agent-ID` header.
 * @param opts.taskId Task whose retrievals are requested (URL-encoded).
 * @param opts.fetchImpl Optional `fetch` replacement, e.g. for tests.
 * @returns The `results` array from the response body, or `[]`.
 */
export async function fetchRetrievalsForTask(opts: {
  apiUrl: string;
  apiKey: string;
  agentId: string;
  taskId: string;
  fetchImpl?: typeof fetch;
}): Promise<RetrievalRow[]> {
  const fetchFn = opts.fetchImpl ?? fetch;
  try {
    const url = `${opts.apiUrl}/api/memory/retrievals?taskId=${encodeURIComponent(opts.taskId)}`;
    const res = await fetchFn(url, {
      headers: {
        "X-Agent-ID": opts.agentId,
        ...(opts.apiKey ? { Authorization: `Bearer ${opts.apiKey}` } : {}),
      },
    });
    if (!res.ok) {
      console.error(
        `[memory-rater:llm] GET /api/memory/retrievals failed: ${res.status} ${res.statusText}`,
      );
      return [];
    }
    // The body is external input: check its shape before handing it to
    // callers that will iterate over it.
    const body: unknown = await res.json();
    const results =
      typeof body === "object" && body !== null
        ? (body as { results?: unknown }).results
        : undefined;
    if (!Array.isArray(results)) return [];
    return results as RetrievalRow[];
  } catch (err) {
    // Narrow before reading `.message`: a non-Error rejection would otherwise
    // be logged as `undefined`.
    console.error(
      "[memory-rater:llm] fetchRetrievalsForTask threw:",
      err instanceof Error ? err.message : String(err),
    );
    return [];
  }
}
|
|
329
|
+
|
|
330
|
+
/**
 * POST `/api/memory/rate` — best-effort. Logs on 4xx/5xx, never throws. The
 * worker hook wraps the whole rating block in its own try/catch as a final
 * line of defence — rater failure must never block summary indexing.
 *
 * @param opts.apiUrl Base URL of the API (the path is appended as-is).
 * @param opts.apiKey Bearer token; the `Authorization` header is omitted when empty.
 * @param opts.agentId Sent as the `X-Agent-ID` header.
 * @param opts.taskId When truthy, copied onto every posted event.
 * @param opts.events Events to post; an empty list short-circuits.
 * @param opts.fetchImpl Optional `fetch` replacement, e.g. for tests.
 * @returns `{ ok: true, status: 0 }` when there was nothing to send,
 *   `{ ok, status }` mirroring the HTTP response otherwise, and
 *   `{ ok: false, status: 0 }` when the request itself threw.
 */
export async function postRatings(opts: {
  apiUrl: string;
  apiKey: string;
  agentId: string;
  taskId?: string;
  events: RatingEvent[];
  fetchImpl?: typeof fetch;
}): Promise<{ ok: boolean; status: number }> {
  // Nothing to send: report success without touching the network.
  if (opts.events.length === 0) return { ok: true, status: 0 };
  const fetchFn = opts.fetchImpl ?? fetch;
  // Build the wire payload explicitly so only known fields are sent and
  // optional fields are omitted (not sent as `undefined`/`null`).
  const events = opts.events.map((e) => ({
    memoryId: e.memoryId,
    signal: e.signal,
    weight: e.weight,
    source: e.source,
    ...(e.reasoning !== undefined ? { reasoning: e.reasoning } : {}),
    ...(e.referencesSource !== undefined ? { referencesSource: e.referencesSource } : {}),
    ...(opts.taskId ? { taskId: opts.taskId } : {}),
  }));
  try {
    const res = await fetchFn(`${opts.apiUrl}/api/memory/rate`, {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        "X-Agent-ID": opts.agentId,
        ...(opts.apiKey ? { Authorization: `Bearer ${opts.apiKey}` } : {}),
      },
      body: JSON.stringify({ events }),
    });
    if (!res.ok) {
      // Reading the error body is itself best-effort; only the first 200
      // characters are logged.
      const text = await res.text().catch(() => "");
      console.error(
        `[memory-rater:llm] POST /api/memory/rate failed: ${res.status} ${res.statusText} ${text.slice(0, 200)}`,
      );
    }
    return { ok: res.ok, status: res.status };
  } catch (err) {
    // Narrow before reading `.message`: a non-Error rejection would otherwise
    // be logged as `undefined`.
    console.error(
      "[memory-rater:llm] postRatings threw:",
      err instanceof Error ? err.message : String(err),
    );
    return { ok: false, status: 0 };
  }
}
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
import type { MemoryRater, RatingEvent } from "./types";
|
|
2
|
+
|
|
3
|
+
/**
 * The do-nothing rater (name `"noop"`): it never produces a rating event and
 * never touches the database. It is what the framework falls back to when
 * MEMORY_RATERS is unset or empty, so such deployments behave
 * byte-identically to pre-rater builds.
 */
export class NoopRater implements MemoryRater {
  readonly name = "noop";

  /** Always resolves to a new, empty list of events. */
  async rate(): Promise<RatingEvent[]> {
    const nothing: RatingEvent[] = [];
    return nothing;
  }
}
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
import { ExplicitSelfRatingRater } from "./explicit-self";
|
|
2
|
+
import { ImplicitCitationRater } from "./implicit-citation";
|
|
3
|
+
import { LlmRater } from "./llm";
|
|
4
|
+
import { NoopRater } from "./noop";
|
|
5
|
+
import type { MemoryRater } from "./types";
|
|
6
|
+
|
|
7
|
+
/**
|
|
8
|
+
* Plan: thoughts/taras/plans/2026-05-05-memory-rater-v1.5/step-1.md §4
|
|
9
|
+
*
|
|
10
|
+
* `MEMORY_RATERS` env — comma-separated list of rater names. Defaults to
|
|
11
|
+
* `[NoopRater]` when unset/empty so existing deployments stay byte-identical.
|
|
12
|
+
*
|
|
13
|
+
* `MEMORY_RATER_WEIGHTS` env — optional `name:multiplier,...` overrides.
|
|
14
|
+
* Multiplier is applied to every emitted RatingEvent.weight before
|
|
15
|
+
* `applyRating`. Default = 1.0.
|
|
16
|
+
*
|
|
17
|
+
* Each later step touches *only* its own line in the factory map:
|
|
18
|
+
* - step-1: noop only (this PR).
|
|
19
|
+
* - step-2: implicit-citation.
|
|
20
|
+
* - step-4: llm.
|
|
21
|
+
* - step-5: explicit-self.
|
|
22
|
+
*
|
|
23
|
+
* Unknown names are logged and skipped — startup never fails on this.
|
|
24
|
+
*/
|
|
25
|
+
|
|
26
|
+
type RaterFactory = () => MemoryRater;
|
|
27
|
+
|
|
28
|
+
const FACTORIES: Record<string, RaterFactory> = {
|
|
29
|
+
noop: () => new NoopRater(),
|
|
30
|
+
"implicit-citation": () => new ImplicitCitationRater(),
|
|
31
|
+
"explicit-self": () => new ExplicitSelfRatingRater(),
|
|
32
|
+
llm: () => new LlmRater(),
|
|
33
|
+
};
|
|
34
|
+
|
|
35
|
+
/**
|
|
36
|
+
* Raters whose `rate(ctx)` runs server-side (in `store-progress.ts` after task
|
|
37
|
+
* completion). Worker-driven raters (e.g. step-4's `LlmRater`, step-5's
|
|
38
|
+
* `ExplicitSelfRater`) emit events from outside this set and POST them to
|
|
39
|
+
* `/api/memory/rate`. The store-progress hook only fires raters listed here.
|
|
40
|
+
*
|
|
41
|
+
* Plan: thoughts/taras/plans/2026-05-05-memory-rater-v1.5/step-2.md §6
|
|
42
|
+
*/
|
|
43
|
+
export const SERVER_RATERS = new Set<string>(["implicit-citation"]);
|
|
44
|
+
|
|
45
|
+
/**
 * Instantiate the raters named in the comma-separated `MEMORY_RATERS`
 * environment variable.
 *
 * - Unset / blank variable → `[NoopRater]`.
 * - Names are trimmed; empty entries are ignored.
 * - A name is only accepted when it is an OWN key of `FACTORIES`. Names that
 *   merely resolve through the prototype chain (`__proto__`, `constructor`,
 *   `toString`, …) are treated like any other unknown name: logged and
 *   skipped, so this function never throws on configuration input.
 * - A name listed more than once is instantiated once (first position wins),
 *   so a repeated entry cannot register the same rater twice.
 * - If no listed name is known → `[NoopRater]`.
 *
 * @returns A non-empty list of rater instances, in configuration order.
 */
export function getRegisteredRaters(): MemoryRater[] {
  const raw = process.env.MEMORY_RATERS;
  if (!raw || raw.trim() === "") {
    return [new NoopRater()];
  }

  // `Set` keeps insertion order, so de-duplicating preserves the configured
  // order of first occurrences.
  const names = new Set(
    raw
      .split(",")
      .map((s) => s.trim())
      .filter((s) => s.length > 0),
  );

  const raters: MemoryRater[] = [];
  for (const name of names) {
    // Own-property check: a plain `FACTORIES[name]` lookup would also find
    // inherited `Object.prototype` members.
    const factory = Object.prototype.hasOwnProperty.call(FACTORIES, name)
      ? FACTORIES[name]
      : undefined;
    if (!factory) {
      console.warn(`[memory-rater] Unknown rater "${name}" in MEMORY_RATERS — skipping`);
      continue;
    }
    raters.push(factory());
  }

  if (raters.length === 0) {
    return [new NoopRater()];
  }
  return raters;
}
|
|
71
|
+
|
|
72
|
+
/**
 * Look up the weight multiplier configured for a rater in the
 * `MEMORY_RATER_WEIGHTS` environment variable (`name:multiplier,...`).
 *
 * Entries are scanned left to right. The first entry whose name matches and
 * whose multiplier parses (via `Number`) to a finite value `>= 0` wins;
 * malformed or negative entries are passed over.
 *
 * @param name Rater name to look up (compared exactly, after trimming the
 *   configured name).
 * @returns The configured multiplier, or `1.0` when none applies.
 */
export function getRaterWeightMultiplier(name: string): number {
  const spec = process.env.MEMORY_RATER_WEIGHTS;
  if (spec === undefined || spec.trim() === "") return 1.0;

  const entries = spec
    .split(",")
    .map((piece) => piece.trim())
    .filter((piece) => piece !== "");

  for (const entry of entries) {
    // Only the first two colon-separated fields are considered.
    const fields = entry.split(":");
    const key = fields[0];
    const value = fields[1];
    if (!key || !value) continue;
    if (key.trim() === name) {
      const parsed = Number(value);
      if (Number.isFinite(parsed) && parsed >= 0) return parsed;
    }
  }
  return 1.0;
}
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
import { ensure } from "@desplega.ai/business-use";
|
|
2
|
+
import { getDb } from "@/be/db";
|
|
3
|
+
|
|
4
|
+
/**
|
|
5
|
+
* Retrieval-bridge helper — appends `memory_retrieval` audit rows so
|
|
6
|
+
* server-side raters (currently `ImplicitCitationRater`) can correlate the
|
|
7
|
+
* memories surfaced to a task with the evidence emitted during that task.
|
|
8
|
+
*
|
|
9
|
+
* Plan: thoughts/taras/plans/2026-05-05-memory-rater-v1.5/step-2.md §1, §3
|
|
10
|
+
*
|
|
11
|
+
* Both call sites — `POST /api/memory/search` (HTTP) and the in-process
|
|
12
|
+
* `memory-search` MCP tool — call this helper post-rerank when a
|
|
13
|
+
* `X-Source-Task-ID` header is present. When `taskId` is absent or the
|
|
14
|
+
* results array is empty, the function is a no-op so the existing search
|
|
15
|
+
* paths stay byte-identical to today.
|
|
16
|
+
*
|
|
17
|
+
* Best-effort by design: a retrieval-bridge failure must NOT poison search.
|
|
18
|
+
* Callers wrap this in their own try/catch and return search results either
|
|
19
|
+
* way (see `src/http/memory.ts` and `src/tools/memory-search.ts`).
|
|
20
|
+
*/
|
|
21
|
+
|
|
22
|
+
export type RetrievalRecord = {
|
|
23
|
+
memoryId: string;
|
|
24
|
+
similarity: number;
|
|
25
|
+
};
|
|
26
|
+
|
|
27
|
+
/**
 * Append one `memory_retrieval` row per search result for the given task,
 * then emit a single `memory_retrieved` instrumentation event.
 *
 * Does nothing when `taskId` is falsy or `results` is empty. Errors from the
 * database or from `ensure` are not caught here.
 *
 * @param taskId Task the search ran for; falsy means "do not record".
 * @param agentId Agent that performed the search.
 * @param results Memories returned by the search, with their similarity.
 * @param sessionId Optional session id; stored as `null` when omitted.
 */
export function recordRetrievals(
  taskId: string | undefined,
  agentId: string,
  results: RetrievalRecord[],
  sessionId?: string,
): void {
  if (!taskId || results.length === 0) return;

  const db = getDb();
  // Prepared once and reused for every row inserted below.
  const insert = db.prepare(
    `INSERT INTO memory_retrieval
(id, taskId, agentId, sessionId, memoryId, similarity, retrievedAt)
VALUES (?, ?, ?, ?, ?, ?, ?)`,
  );
  // One timestamp for the whole batch: every row of this call shares the
  // same `retrievedAt`.
  const now = new Date().toISOString();

  // Single transaction: even on a 100-row paginated search this is one
  // commit, not N. No-op when results is empty.
  db.transaction(() => {
    for (const r of results) {
      insert.run(
        // Fresh row id per retrieval.
        crypto.randomUUID(),
        taskId,
        agentId,
        sessionId ?? null,
        r.memoryId,
        r.similarity,
        now,
      );
    }
  })();

  // Business-use instrumentation — one `memory_retrieved` event per call,
  // OUTSIDE the transaction. Validator self-contained.
  ensure({
    id: "memory_retrieved",
    flow: "task",
    runId: taskId,
    data: {
      count: results.length,
      taskId,
      agentId,
    },
    // Accepts the event only when count is a positive number and both ids
    // are non-empty strings.
    validator: (data) =>
      typeof data.count === "number" &&
      data.count > 0 &&
      typeof data.taskId === "string" &&
      data.taskId.length > 0 &&
      typeof data.agentId === "string" &&
      data.agentId.length > 0,
  });
}
|
|
79
|
+
|
|
80
|
+
/**
 * Read back every `memory_retrieval` row recorded for a task.
 *
 * No ordering or de-duplication is applied by the query, so a memory
 * recorded several times for the task appears several times.
 *
 * @param taskId Task whose retrievals are wanted.
 * @returns The `memoryId` / `similarity` pair of each matching row.
 */
export function getRetrievalsForTask(
  taskId: string,
): { memoryId: string; similarity: number | null }[] {
  type RetrievalPair = { memoryId: string; similarity: number | null };

  const statement = getDb().prepare<RetrievalPair, [string]>(
    "SELECT memoryId, similarity FROM memory_retrieval WHERE taskId = ?",
  );
  return statement.all(taskId);
}
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
import {
|
|
2
|
+
getRaterWeightMultiplier as defaultGetRaterWeightMultiplier,
|
|
3
|
+
getRegisteredRaters as defaultGetRegisteredRaters,
|
|
4
|
+
SERVER_RATERS,
|
|
5
|
+
} from "./registry";
|
|
6
|
+
import { type ApplyRatingResult, applyRating as defaultApplyRating } from "./store";
|
|
7
|
+
import type { MemoryRater, RatingEvent } from "./types";
|
|
8
|
+
|
|
9
|
+
/**
|
|
10
|
+
* Inputs for `runServerRaters`. The caller is responsible for fetching
|
|
11
|
+
* `retrievedMemoryIds` from `memory_retrieval` and the concatenated
|
|
12
|
+
* `evidence` text from `session_logs` (both are trivial SELECTs already
|
|
13
|
+
* covered by integration tests in memory-rater-implicit-citation.test.ts).
|
|
14
|
+
*/
|
|
15
|
+
export type RunServerRatersInput = {
|
|
16
|
+
taskId: string;
|
|
17
|
+
agentId: string;
|
|
18
|
+
retrievedMemoryIds: string[];
|
|
19
|
+
evidence: string;
|
|
20
|
+
};
|
|
21
|
+
|
|
22
|
+
/**
|
|
23
|
+
* Optional overrides — primarily for unit tests so the orchestration logic
|
|
24
|
+
* (filter → rate → stamp source → clamp weight → applyRating) can be
|
|
25
|
+
* exercised with stub raters and an in-memory `applyRating`.
|
|
26
|
+
*/
|
|
27
|
+
export type RunServerRatersDeps = {
|
|
28
|
+
raters?: MemoryRater[];
|
|
29
|
+
serverRaterNames?: ReadonlySet<string>;
|
|
30
|
+
weightMultiplierFor?: (name: string) => number;
|
|
31
|
+
applyRating?: (events: RatingEvent[], ctx: { taskId?: string }) => ApplyRatingResult;
|
|
32
|
+
};
|
|
33
|
+
|
|
34
|
+
export type ServerRaterFireOutcome = {
|
|
35
|
+
rater: string;
|
|
36
|
+
events: RatingEvent[];
|
|
37
|
+
result: ApplyRatingResult;
|
|
38
|
+
};
|
|
39
|
+
|
|
40
|
+
export type RunServerRatersResult = {
|
|
41
|
+
ratersFired: number;
|
|
42
|
+
outcomes: ServerRaterFireOutcome[];
|
|
43
|
+
};
|
|
44
|
+
|
|
45
|
+
/**
|
|
46
|
+
* Fire every allow-listed server-side memory rater for a completed task,
|
|
47
|
+
* stamp `source` from the rater's name (the framework's anti-spoof guarantee),
|
|
48
|
+
* apply the configured `MEMORY_RATER_WEIGHTS` multiplier with a [0, 1] clamp,
|
|
49
|
+
* then persist the resulting `RatingEvent`s via `applyRating`.
|
|
50
|
+
*
|
|
51
|
+
* Plan: thoughts/taras/plans/2026-05-05-memory-rater-v1.5/step-2.md §5
|
|
52
|
+
*
|
|
53
|
+
* Extracted from the previous inline IIFE in `store-progress.ts` (PR #426
|
|
54
|
+
* review feedback) so the orchestration is unit-testable in isolation —
|
|
55
|
+
* `raters`, `serverRaterNames`, `weightMultiplierFor`, and `applyRating` are
|
|
56
|
+
* all injectable. With no overrides, behaviour is byte-identical to the
|
|
57
|
+
* original inline block.
|
|
58
|
+
*
|
|
59
|
+
* No-ops when `retrievedMemoryIds` is empty. Rater errors propagate; callers
|
|
60
|
+
* are expected to wrap in try/catch (rater failure must NEVER affect task
|
|
61
|
+
* status — see the `console.error` site in `store-progress.ts`).
|
|
62
|
+
*/
|
|
63
|
+
export async function runServerRaters(
  input: RunServerRatersInput,
  deps: RunServerRatersDeps = {},
): Promise<RunServerRatersResult> {
  const outcomes: ServerRaterFireOutcome[] = [];
  // Nothing was retrieved for this task, so there is nothing to rate.
  if (input.retrievedMemoryIds.length === 0) return { ratersFired: 0, outcomes };

  // Resolve every collaborator up front, preferring injected overrides.
  const candidates = deps.raters ?? defaultGetRegisteredRaters();
  const serverNames = deps.serverRaterNames ?? SERVER_RATERS;
  const multiplierFor = deps.weightMultiplierFor ?? defaultGetRaterWeightMultiplier;
  const persist = deps.applyRating ?? defaultApplyRating;

  // Only allow-listed raters run here; they are awaited one after another.
  const eligible = candidates.filter((candidate) => serverNames.has(candidate.name));

  const { taskId, agentId, retrievedMemoryIds, evidence } = input;
  for (const rater of eligible) {
    const emitted = await rater.rate({ taskId, agentId, retrievedMemoryIds, evidence });
    // A rater that emits nothing does not count as fired.
    if (emitted.length === 0) continue;

    const multiplier = multiplierFor(rater.name);
    const stamped: RatingEvent[] = emitted.map((event) => {
      const scaled = event.weight * multiplier;
      return {
        ...event,
        // The framework, not the rater, decides `source`.
        source: rater.name,
        // Clamp the scaled weight into [0, 1].
        weight: Math.max(0, Math.min(1, scaled)),
      };
    });

    outcomes.push({
      rater: rater.name,
      events: stamped,
      result: persist(stamped, { taskId }),
    });
  }

  // Every recorded outcome corresponds to exactly one fired rater.
  return { ratersFired: outcomes.length, outcomes };
}
|