@desplega.ai/agent-swarm 1.74.4 → 1.75.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/openapi.json +282 -1
- package/package.json +1 -1
- package/src/be/db.ts +36 -0
- package/src/be/memory/edges-store.ts +69 -0
- package/src/be/memory/providers/sqlite-store.ts +4 -0
- package/src/be/memory/raters/explicit-self.ts +22 -0
- package/src/be/memory/raters/implicit-citation.ts +44 -0
- package/src/be/memory/raters/llm-client.ts +172 -0
- package/src/be/memory/raters/llm.ts +394 -0
- package/src/be/memory/raters/noop.ts +14 -0
- package/src/be/memory/raters/registry.ts +86 -0
- package/src/be/memory/raters/retrieval.ts +88 -0
- package/src/be/memory/raters/run-server-raters.ts +97 -0
- package/src/be/memory/raters/store.ts +228 -0
- package/src/be/memory/raters/types.ts +101 -0
- package/src/be/memory/reranker.ts +32 -2
- package/src/be/memory/retrieval-store.ts +95 -0
- package/src/be/memory/types.ts +3 -0
- package/src/be/migrations/051_memory_posteriors_and_retrieval.sql +67 -0
- package/src/be/migrations/052_memory_edges.sql +36 -0
- package/src/be/migrations/053_agent_waiting_for_credentials_status.sql +61 -0
- package/src/commands/credential-wait.ts +186 -0
- package/src/commands/runner.ts +54 -9
- package/src/hooks/hook.ts +67 -10
- package/src/http/agents.ts +110 -0
- package/src/http/core.ts +5 -0
- package/src/http/memory.ts +230 -1
- package/src/prompts/memories.ts +62 -0
- package/src/providers/claude-adapter.ts +17 -0
- package/src/providers/claude-managed-adapter.ts +24 -0
- package/src/providers/codex-adapter.ts +42 -0
- package/src/providers/credentials.ts +74 -0
- package/src/providers/devin-adapter.ts +18 -0
- package/src/providers/index.ts +7 -0
- package/src/providers/opencode-adapter.ts +60 -0
- package/src/providers/pi-mono-adapter.ts +71 -0
- package/src/providers/types.ts +34 -0
- package/src/server.ts +2 -0
- package/src/tests/credential-check.test.ts +336 -0
- package/src/tests/credential-status-api.test.ts +181 -0
- package/src/tests/credential-status-routing.test.ts +150 -0
- package/src/tests/credential-wait.test.ts +282 -0
- package/src/tests/memory-edges.test.ts +722 -0
- package/src/tests/memory-rate-endpoint.test.ts +330 -0
- package/src/tests/memory-rate-tool.test.ts +252 -0
- package/src/tests/memory-rater-e2e.test.ts +578 -0
- package/src/tests/memory-rater-implicit-citation.test.ts +304 -0
- package/src/tests/memory-rater-llm.test.ts +806 -0
- package/src/tests/memory-rater-store.test.ts +249 -0
- package/src/tests/memory-reranker.test.ts +161 -2
- package/src/tests/mocks/mock-llm-rater-client.ts +35 -0
- package/src/tests/run-server-raters.test.ts +291 -0
- package/src/tests/tool-annotations.test.ts +2 -2
- package/src/tools/memory-rate.ts +166 -0
- package/src/tools/memory-search.ts +18 -0
- package/src/tools/store-progress.ts +37 -0
- package/src/tools/tool-config.ts +1 -0
- package/src/types.ts +5 -1
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* `LlmRaterClient` — pluggable LLM driver used by `LlmRater` to score the
|
|
3
|
+
* usefulness of a single retrieved memory against a (query, response) pair.
|
|
4
|
+
*
|
|
5
|
+
* Plan: thoughts/taras/plans/2026-05-05-memory-rater-v1.5/step-4.md §1
|
|
6
|
+
*
|
|
7
|
+
* This module is imported from worker-side `src/hooks/hook.ts` (the session-
|
|
8
|
+
* summary piggyback path), so it MUST NOT touch `bun:sqlite` or `src/be/db`.
|
|
9
|
+
* The DB-boundary check in `scripts/check-db-boundary.sh` enforces this.
|
|
10
|
+
*
|
|
11
|
+
* Default implementation shells out to the same `claude -p` CLI the hook
|
|
12
|
+
* already uses for session summarization — zero new SDK dependencies.
|
|
13
|
+
*/
|
|
14
|
+
|
|
15
|
+
export type LlmRaterInput = {
  /** What the agent asked the memory system for. */
  query: string;
  /** The memory we're scoring. */
  memory: {
    id: string;
    name: string;
    content: string;
  };
  /** The agent's eventual response (or session summary) — the "did this help?" signal. */
  response: string;
};

export type LlmRaterResult = {
  /** Usefulness score in [0, 1]. 0 = misleading, 1 = highly useful. */
  score: number;
  /** Short human-readable explanation. */
  reasoning: string;
};

export interface LlmRaterClient {
  /**
   * Score one memory. Returns null on parse failure / non-JSON output / timeout
   * — the caller (`LlmRater`) treats `null` as "skip this rating", no posterior
   * change. Implementations MUST NOT throw on transport errors; swallow + log
   * + return null so the worker hook can never crash on rater failure.
   */
  rate(input: LlmRaterInput): Promise<LlmRaterResult | null>;
}

/**
 * Configuration for the Claude-CLI implementation.
 */
export type ClaudeCliLlmRaterClientOptions = {
  /** Override the model. Defaults to `MEMORY_LLM_RATER_MODEL` env var or "haiku". */
  model?: string;
  /** Soft timeout (ms) for the `claude -p` shell-out. Default 30s. */
  timeoutMs?: number;
};

/** Default soft timeout for one `claude -p` invocation (30 seconds). */
const DEFAULT_TIMEOUT_MS = 30000;

/**
 * Per-memory scoring prompt. The `\${...}` tokens below are ESCAPED, so they
 * are literal placeholder text in the string — `buildPrompt` substitutes them
 * with `String.replace`; they are NOT template-literal interpolations.
 */
const PROMPT_TEMPLATE = `You are scoring the usefulness of one retrieved memory.

Return ONLY a JSON object with these fields (no prose, no markdown):
{
  "score": number,     // 0 = misleading/unhelpful, 1 = highly useful
  "reasoning": string  // 1..500 chars, why
}

QUERY:
\${query}

MEMORY:
id: \${memoryId}
name: \${memoryName}
content: \${memoryContent}

AGENT RESPONSE / SUMMARY:
\${response}

Score 0..1.`;

/**
 * `claude -p --output-format json` returns a JSON envelope of the shape
 * `{ result: string, ... }`. We parse the envelope, then JSON-parse the
 * inner `result` to recover the score+reasoning object.
 */
type ClaudeCliEnvelope = { result?: unknown };
|
|
84
|
+
|
|
85
|
+
function buildPrompt(input: LlmRaterInput): string {
|
|
86
|
+
return PROMPT_TEMPLATE.replace("${query}", input.query)
|
|
87
|
+
.replace("${memoryId}", input.memory.id)
|
|
88
|
+
.replace("${memoryName}", input.memory.name)
|
|
89
|
+
.replace("${memoryContent}", input.memory.content)
|
|
90
|
+
.replace("${response}", input.response);
|
|
91
|
+
}
|
|
92
|
+
|
|
93
|
+
function parseScoreAndReasoning(raw: unknown): LlmRaterResult | null {
|
|
94
|
+
if (typeof raw !== "string") return null;
|
|
95
|
+
let parsed: unknown;
|
|
96
|
+
try {
|
|
97
|
+
parsed = JSON.parse(raw.trim());
|
|
98
|
+
} catch {
|
|
99
|
+
return null;
|
|
100
|
+
}
|
|
101
|
+
if (!parsed || typeof parsed !== "object") return null;
|
|
102
|
+
const obj = parsed as { score?: unknown; reasoning?: unknown };
|
|
103
|
+
const score = typeof obj.score === "number" ? obj.score : null;
|
|
104
|
+
const reasoning = typeof obj.reasoning === "string" ? obj.reasoning : null;
|
|
105
|
+
if (score == null || reasoning == null) return null;
|
|
106
|
+
if (!Number.isFinite(score) || score < 0 || score > 1) return null;
|
|
107
|
+
if (reasoning.length === 0 || reasoning.length > 500) return null;
|
|
108
|
+
return { score, reasoning };
|
|
109
|
+
}
|
|
110
|
+
|
|
111
|
+
/**
 * Default `LlmRaterClient`: shells out to the `claude -p` CLI (same binary the
 * hook already uses for session summarization — zero new SDK dependencies).
 * Returns null on any failure; never throws past `rate`'s own try/catch.
 */
export class ClaudeCliLlmRaterClient implements LlmRaterClient {
  // Model passed to `claude --model`; from opts, MEMORY_LLM_RATER_MODEL, or "haiku".
  private readonly model: string;
  // Soft timeout for the shell-out; the child process is killed when it fires.
  private readonly timeoutMs: number;

  constructor(opts: ClaudeCliLlmRaterClientOptions = {}) {
    this.model = opts.model ?? process.env.MEMORY_LLM_RATER_MODEL ?? "haiku";
    this.timeoutMs = opts.timeoutMs ?? DEFAULT_TIMEOUT_MS;
  }

  /**
   * Score one memory by piping the built prompt into `claude -p` and parsing
   * the JSON envelope from stdout. Returns null on spawn failure, timeout,
   * non-JSON output, or out-of-range score — the caller skips the rating.
   */
  async rate(input: LlmRaterInput): Promise<LlmRaterResult | null> {
    const prompt = buildPrompt(input);
    // Prompt goes through a temp file (cat | claude) so arbitrary prompt
    // content never appears on the shell command line.
    const tmpFile = `/tmp/llm-rater-${Date.now()}-${Math.random().toString(36).slice(2)}.txt`;

    let stdout = "";
    try {
      await Bun.write(tmpFile, prompt);
      // NOTE(review): `this.model` is interpolated unquoted into a `bash -c`
      // string; it comes from opts or the MEMORY_LLM_RATER_MODEL env var, so a
      // hostile env value could inject shell — confirm the env is trusted, or
      // pass argv directly to the child instead of going through bash.
      const proc = Bun.spawn(
        ["bash", "-c", `cat "${tmpFile}" | claude -p --model ${this.model} --output-format json`],
        {
          stdout: "pipe",
          stderr: "pipe",
          // Prevent the nested claude invocation from recursing into the
          // session-summary hook path.
          env: { ...process.env, SKIP_SESSION_SUMMARY: "1" },
        },
      );
      // Soft timeout: kill the child if it overruns; the stdout read below
      // then resolves with whatever was produced (likely unparseable → null).
      const timeoutId = setTimeout(() => proc.kill(), this.timeoutMs);
      stdout = await new Response(proc.stdout).text();
      clearTimeout(timeoutId);
    } catch (err) {
      console.error("[memory-rater:llm] claude -p shell-out failed:", (err as Error).message);
      return null;
    } finally {
      // Always remove the temp prompt file, even on failure paths.
      try {
        await Bun.$`rm -f ${tmpFile}`.quiet();
      } catch {
        // best-effort
      }
    }

    // Outer parse: the `--output-format json` envelope. Inner parse +
    // range validation happen in parseScoreAndReasoning.
    let envelope: ClaudeCliEnvelope;
    try {
      envelope = JSON.parse(stdout) as ClaudeCliEnvelope;
    } catch {
      return null;
    }
    return parseScoreAndReasoning(envelope.result);
  }
}
|
|
158
|
+
|
|
159
|
+
/**
|
|
160
|
+
* Factory honouring `MEMORY_LLM_RATER_PROVIDER` — defaults to `claude-cli`.
|
|
161
|
+
* Unknown providers fall back to the Claude CLI default and log a warning so
|
|
162
|
+
* misconfiguration never crashes the worker.
|
|
163
|
+
*/
|
|
164
|
+
export function getDefaultLlmRaterClient(): LlmRaterClient {
|
|
165
|
+
const provider = (process.env.MEMORY_LLM_RATER_PROVIDER ?? "claude-cli").trim();
|
|
166
|
+
if (provider !== "claude-cli") {
|
|
167
|
+
console.warn(
|
|
168
|
+
`[memory-rater:llm] Unknown MEMORY_LLM_RATER_PROVIDER "${provider}" — falling back to claude-cli`,
|
|
169
|
+
);
|
|
170
|
+
}
|
|
171
|
+
return new ClaudeCliLlmRaterClient();
|
|
172
|
+
}
|
|
@@ -0,0 +1,394 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* `LlmRater` — second live rater, source = "llm".
|
|
3
|
+
*
|
|
4
|
+
* Plan: thoughts/taras/plans/2026-05-05-memory-rater-v1.5/step-4.md §2-3
|
|
5
|
+
*
|
|
6
|
+
* The worker-side flow does NOT call `LlmRater.rate(ctx)` from the in-process
|
|
7
|
+
* server-rater orchestrator. Instead, the rating LLM call is piggybacked on
|
|
8
|
+
* the existing session-summary call in `src/hooks/hook.ts` (cost optimization
|
|
9
|
+
* — same Haiku invocation produces both summary text + per-memory ratings).
|
|
10
|
+
* The hook then POSTs the constructed `RatingEvent[]` to `/api/memory/rate`.
|
|
11
|
+
*
|
|
12
|
+
* `LlmRater.rate(ctx)` is wired up so the class still satisfies `MemoryRater`
|
|
13
|
+
* for registry consistency / future direct integrations / unit tests, but is
|
|
14
|
+
* never invoked by `runServerRaters` (LlmRater is NOT in `SERVER_RATERS`).
|
|
15
|
+
*
|
|
16
|
+
* This module is imported from worker-side `src/hooks/hook.ts` so it MUST NOT
|
|
17
|
+
* touch `bun:sqlite` or `src/be/db`. The boundary check enforces it.
|
|
18
|
+
*/
|
|
19
|
+
import { z } from "zod";
|
|
20
|
+
import { ClaudeCliLlmRaterClient, type LlmRaterClient, type LlmRaterResult } from "./llm-client";
|
|
21
|
+
import {
|
|
22
|
+
type MemoryRater,
|
|
23
|
+
type RatingContext,
|
|
24
|
+
type RatingEvent,
|
|
25
|
+
REFERENCES_SOURCE_MAX_LENGTH,
|
|
26
|
+
sanitizeReferencesSource,
|
|
27
|
+
} from "./types";
|
|
28
|
+
|
|
29
|
+
/**
 * Per-rating weight, fixed at 0.8 per the research-doc convention
 * ("LLM intent_weight"). Encoded here once so neither callers nor tests can
 * silently drift the constant.
 */
export const LLM_RATER_WEIGHT = 0.8;

/** One per-memory rating emitted by the summarizer LLM. */
const RatingSchema = z.object({
  id: z.string().min(1),
  score: z.number().min(0).max(1),
  reasoning: z.string().min(1).max(500),
  // Step-6 §6 — optional free-form external source ID. Q2 contract: ≤512
  // chars, no closed enum, no prefix parser. Sanitization (control-char
  // strip + NUL rejection) happens in `buildRatingsFromLlm` so a single
  // bad rating drops the field rather than failing the whole batch.
  referencesSource: z.string().min(1).max(REFERENCES_SOURCE_MAX_LENGTH).optional(),
});

/**
 * Zod schema for the structured-output piggyback prompt. The hook asks the
 * summarizer LLM to return summary + per-memory ratings in one JSON object so
 * we don't pay for N additional LLM calls. `ratings` defaults to [] so a
 * summary-only reply still validates.
 */
export const SummaryWithRatingsSchema = z.object({
  summary: z.string(),
  ratings: z.array(RatingSchema).default([]),
});

export type LlmRating = z.infer<typeof RatingSchema>;
export type SummaryWithRatings = z.infer<typeof SummaryWithRatingsSchema>;

/** Context augmentations LlmRater consumes when called directly (per-memory path). */
export type LlmRatingContext = RatingContext & {
  /** What the agent asked the memory system. */
  query?: string;
  /** Final agent response / summary used as the "did this help?" signal. */
  response?: string;
  /** Snapshots for memories listed in `retrievedMemoryIds` (matched by id). */
  retrievedMemories?: { id: string; name: string; content: string }[];
};
|
|
69
|
+
|
|
70
|
+
export class LlmRater implements MemoryRater {
|
|
71
|
+
readonly name = "llm";
|
|
72
|
+
|
|
73
|
+
constructor(public readonly client: LlmRaterClient = new ClaudeCliLlmRaterClient()) {}
|
|
74
|
+
|
|
75
|
+
/**
|
|
76
|
+
* Per-memory scoring path. The production hook bypasses this method and
|
|
77
|
+
* calls {@link buildRatingsFromLlm} on the piggybacked summarizer JSON
|
|
78
|
+
* (one LLM invocation, not N). Direct callers (tests, future integrations)
|
|
79
|
+
* MUST pass {@link LlmRatingContext} — the base `RatingContext` carries
|
|
80
|
+
* only memory IDs, which is insufficient to drive `LlmRaterClient.rate`.
|
|
81
|
+
*
|
|
82
|
+
* Returns `[]` when the augmented fields are missing so the rater stays a
|
|
83
|
+
* no-op rather than crashing on a `RatingContext`-only invocation.
|
|
84
|
+
*/
|
|
85
|
+
async rate(ctx: RatingContext): Promise<RatingEvent[]> {
|
|
86
|
+
const enriched = ctx as LlmRatingContext;
|
|
87
|
+
if (enriched.retrievedMemoryIds.length === 0) return [];
|
|
88
|
+
const memories = enriched.retrievedMemories;
|
|
89
|
+
if (!memories || memories.length === 0) return [];
|
|
90
|
+
|
|
91
|
+
const events: RatingEvent[] = [];
|
|
92
|
+
for (const memoryId of enriched.retrievedMemoryIds) {
|
|
93
|
+
const memory = memories.find((m) => m.id === memoryId);
|
|
94
|
+
if (!memory) continue;
|
|
95
|
+
let result: LlmRaterResult | null;
|
|
96
|
+
try {
|
|
97
|
+
result = await this.client.rate({
|
|
98
|
+
query: enriched.query ?? "",
|
|
99
|
+
memory,
|
|
100
|
+
response: enriched.response ?? enriched.evidence ?? "",
|
|
101
|
+
});
|
|
102
|
+
} catch (err) {
|
|
103
|
+
console.error(
|
|
104
|
+
`[memory-rater:llm] client.rate threw for memoryId=${memoryId}:`,
|
|
105
|
+
(err as Error).message,
|
|
106
|
+
);
|
|
107
|
+
continue;
|
|
108
|
+
}
|
|
109
|
+
if (!result) continue;
|
|
110
|
+
events.push({
|
|
111
|
+
memoryId,
|
|
112
|
+
signal: 2 * result.score - 1,
|
|
113
|
+
weight: LLM_RATER_WEIGHT,
|
|
114
|
+
// Framework stamps `source = rater.name` in `runServerRaters`. Raters
|
|
115
|
+
// that populate `source` themselves are rejected by `applyRating`.
|
|
116
|
+
source: "",
|
|
117
|
+
reasoning: result.reasoning,
|
|
118
|
+
});
|
|
119
|
+
}
|
|
120
|
+
return events;
|
|
121
|
+
}
|
|
122
|
+
}
|
|
123
|
+
|
|
124
|
+
/**
|
|
125
|
+
* Convert the piggybacked summary's `ratings` array into `RatingEvent[]` for
|
|
126
|
+
* `POST /api/memory/rate`. Drops ratings whose `id` was not in the original
|
|
127
|
+
* retrieval set (defence-in-depth — the LLM occasionally hallucinates memory
|
|
128
|
+
* IDs; the server-side R6 check catches it too, but rejecting upstream keeps
|
|
129
|
+
* the audit log cleaner).
|
|
130
|
+
*
|
|
131
|
+
* Mapping: `signal = 2 * score - 1` (0 → -1, 0.5 → 0, 1 → +1).
|
|
132
|
+
* Weight = {@link LLM_RATER_WEIGHT} (0.8).
|
|
133
|
+
* Source = `"llm"` (the HTTP rate endpoint enums `["llm", "explicit-self"]`).
|
|
134
|
+
*/
|
|
135
|
+
export function buildRatingsFromLlm(
|
|
136
|
+
ratings: LlmRating[],
|
|
137
|
+
retrievals: { id: string }[],
|
|
138
|
+
): RatingEvent[] {
|
|
139
|
+
const allowed = new Set(retrievals.map((r) => r.id));
|
|
140
|
+
const events: RatingEvent[] = [];
|
|
141
|
+
for (const r of ratings) {
|
|
142
|
+
if (!allowed.has(r.id)) continue;
|
|
143
|
+
// Step-6 §6 — sanitize before propagation. If the LLM emits a NUL byte
|
|
144
|
+
// or an all-control-chars string, drop the edge but keep the rating
|
|
145
|
+
// (best-effort: the memory's own posterior still gets the signal).
|
|
146
|
+
let cleanedReferencesSource: string | undefined;
|
|
147
|
+
if (r.referencesSource !== undefined) {
|
|
148
|
+
const cleaned = sanitizeReferencesSource(r.referencesSource);
|
|
149
|
+
if (cleaned !== null) {
|
|
150
|
+
cleanedReferencesSource = cleaned;
|
|
151
|
+
}
|
|
152
|
+
}
|
|
153
|
+
events.push({
|
|
154
|
+
memoryId: r.id,
|
|
155
|
+
signal: 2 * r.score - 1,
|
|
156
|
+
weight: LLM_RATER_WEIGHT,
|
|
157
|
+
source: "llm",
|
|
158
|
+
reasoning: r.reasoning,
|
|
159
|
+
...(cleanedReferencesSource !== undefined
|
|
160
|
+
? { referencesSource: cleanedReferencesSource }
|
|
161
|
+
: {}),
|
|
162
|
+
});
|
|
163
|
+
}
|
|
164
|
+
return events;
|
|
165
|
+
}
|
|
166
|
+
|
|
167
|
+
/**
|
|
168
|
+
* Append a structured-output instruction to the existing summary prompt so
|
|
169
|
+
* the same `claude -p` invocation produces both summary text AND per-memory
|
|
170
|
+
* ratings against `SummaryWithRatingsSchema`.
|
|
171
|
+
*
|
|
172
|
+
* Memory `content` is truncated to {@link RETRIEVAL_PROMPT_CONTENT_CAP} chars
|
|
173
|
+
* to keep the prompt within Haiku's context budget on long sessions; the
|
|
174
|
+
* server already truncates `agent_memory.content` to 500 chars in the
|
|
175
|
+
* retrievals endpoint, so this is the typical case.
|
|
176
|
+
*/
|
|
177
|
+
const RETRIEVAL_PROMPT_CONTENT_CAP = 600;
|
|
178
|
+
|
|
179
|
+
export function buildSummaryWithRatingsPrompt(
|
|
180
|
+
basePrompt: string,
|
|
181
|
+
retrievals: { id: string; name: string; content: string }[],
|
|
182
|
+
): string {
|
|
183
|
+
if (retrievals.length === 0) return basePrompt;
|
|
184
|
+
const memoryBlock = retrievals
|
|
185
|
+
.map((m, i) => {
|
|
186
|
+
const content =
|
|
187
|
+
m.content.length > RETRIEVAL_PROMPT_CONTENT_CAP
|
|
188
|
+
? `${m.content.slice(0, RETRIEVAL_PROMPT_CONTENT_CAP)}…`
|
|
189
|
+
: m.content;
|
|
190
|
+
return `Memory #${i + 1}\n id: ${m.id}\n name: ${m.name}\n content: ${content}`;
|
|
191
|
+
})
|
|
192
|
+
.join("\n\n");
|
|
193
|
+
|
|
194
|
+
return `${basePrompt}
|
|
195
|
+
|
|
196
|
+
CRITICAL: Return JSON conforming to this schema (no prose outside the JSON, no markdown fences):
|
|
197
|
+
{
|
|
198
|
+
"summary": string, // your existing summary text
|
|
199
|
+
"ratings": [ // one entry per memory you can score
|
|
200
|
+
{
|
|
201
|
+
"id": string, // memory id, copied from the list below
|
|
202
|
+
"score": number, // 0 = misleading/unhelpful, 1 = highly useful
|
|
203
|
+
"reasoning": string, // 1..500 chars, why
|
|
204
|
+
"referencesSource": string // OPTIONAL — see note below
|
|
205
|
+
}
|
|
206
|
+
]
|
|
207
|
+
}
|
|
208
|
+
|
|
209
|
+
Score ONLY memories present in the list below. Use the exact ids. Omit any you cannot evaluate.
|
|
210
|
+
|
|
211
|
+
Optionally for each rating, if the memory clearly references a specific external source (a GitHub PR/issue, a Linear issue, a customer, a Slack thread, an AgentMail thread, etc.), include a \`referencesSource\` string using the convention "<source>:<identifier>" (e.g. "github:owner/repo#N", "linear:KEY-N", "customer:<slug>"). Any prefix is fine — pick what matches the source. Omit the field if no clear external source.
|
|
212
|
+
|
|
213
|
+
Memories retrieved during this session:
|
|
214
|
+
|
|
215
|
+
${memoryBlock}`;
|
|
216
|
+
}
|
|
217
|
+
|
|
218
|
+
/**
|
|
219
|
+
* Best-effort parse of the structured `SummaryWithRatingsSchema` JSON out of
|
|
220
|
+
* the `claude -p --output-format json` envelope (`{ result: "<inner json>" }`).
|
|
221
|
+
*
|
|
222
|
+
* Returns `null` on any parse failure — the caller falls back to the existing
|
|
223
|
+
* summary-only path. NEVER throws.
|
|
224
|
+
*/
|
|
225
|
+
export function parseSummaryWithRatings(claudeStdout: string): SummaryWithRatings | null {
|
|
226
|
+
let envelope: { result?: unknown };
|
|
227
|
+
try {
|
|
228
|
+
envelope = JSON.parse(claudeStdout) as { result?: unknown };
|
|
229
|
+
} catch {
|
|
230
|
+
return null;
|
|
231
|
+
}
|
|
232
|
+
const inner = envelope.result;
|
|
233
|
+
let candidate: unknown;
|
|
234
|
+
if (typeof inner === "string") {
|
|
235
|
+
try {
|
|
236
|
+
candidate = JSON.parse(inner.trim());
|
|
237
|
+
} catch {
|
|
238
|
+
return null;
|
|
239
|
+
}
|
|
240
|
+
} else if (inner && typeof inner === "object") {
|
|
241
|
+
candidate = inner;
|
|
242
|
+
} else {
|
|
243
|
+
return null;
|
|
244
|
+
}
|
|
245
|
+
const parsed = SummaryWithRatingsSchema.safeParse(candidate);
|
|
246
|
+
return parsed.success ? parsed.data : null;
|
|
247
|
+
}
|
|
248
|
+
|
|
249
|
+
/**
|
|
250
|
+
* Fallback summary-text extractor for the hook's `claude -p` envelope. Used
|
|
251
|
+
* when {@link parseSummaryWithRatings} returns null — i.e., when the LLM
|
|
252
|
+
* returned a valid envelope but the inner payload either wasn't structured
|
|
253
|
+
* JSON (unstructured prompt path) OR was structured JSON whose ratings failed
|
|
254
|
+
* `SummaryWithRatingsSchema` validation (e.g., out-of-range scores).
|
|
255
|
+
*
|
|
256
|
+
* In the latter case `envelope.result` is the full inner JSON STRING such as
|
|
257
|
+
* `{"summary":"...","ratings":[...]}`; indexing that verbatim into agent
|
|
258
|
+
* memory would violate the step-4 contract that ratings are best-effort and
|
|
259
|
+
* the existing summary-indexing behavior remains unchanged. We extract the
|
|
260
|
+
* inner `summary` field if present, else return the inner string (treating
|
|
261
|
+
* it as plain summary text). NEVER throws.
|
|
262
|
+
*/
|
|
263
|
+
export function extractSummaryFromClaudeStdout(claudeStdout: string): string {
|
|
264
|
+
let envelope: { result?: unknown };
|
|
265
|
+
try {
|
|
266
|
+
envelope = JSON.parse(claudeStdout) as { result?: unknown };
|
|
267
|
+
} catch {
|
|
268
|
+
return claudeStdout;
|
|
269
|
+
}
|
|
270
|
+
const inner = envelope.result;
|
|
271
|
+
if (typeof inner === "string") {
|
|
272
|
+
try {
|
|
273
|
+
const innerParsed = JSON.parse(inner.trim()) as { summary?: unknown };
|
|
274
|
+
if (innerParsed && typeof innerParsed.summary === "string") {
|
|
275
|
+
return innerParsed.summary;
|
|
276
|
+
}
|
|
277
|
+
} catch {
|
|
278
|
+
// inner wasn't JSON — treat it as plain summary text
|
|
279
|
+
}
|
|
280
|
+
return inner;
|
|
281
|
+
}
|
|
282
|
+
if (
|
|
283
|
+
inner &&
|
|
284
|
+
typeof inner === "object" &&
|
|
285
|
+
typeof (inner as { summary?: unknown }).summary === "string"
|
|
286
|
+
) {
|
|
287
|
+
return (inner as { summary: string }).summary;
|
|
288
|
+
}
|
|
289
|
+
return claudeStdout;
|
|
290
|
+
}
|
|
291
|
+
|
|
292
|
+
/**
|
|
293
|
+
* `MEMORY_RATERS=...` includes `llm`? Used by the hook to gate the piggyback
|
|
294
|
+
* path — strict opt-in so existing deployments are byte-identical when unset.
|
|
295
|
+
*/
|
|
296
|
+
export function isLlmRaterEnabled(): boolean {
|
|
297
|
+
const raw = process.env.MEMORY_RATERS;
|
|
298
|
+
if (!raw || raw.trim() === "") return false;
|
|
299
|
+
return raw
|
|
300
|
+
.split(",")
|
|
301
|
+
.map((s) => s.trim())
|
|
302
|
+
.includes("llm");
|
|
303
|
+
}
|
|
304
|
+
|
|
305
|
+
/** Memory snapshot returned by `GET /api/memory/retrievals`. */
export type RetrievalRow = {
  id: string;
  name: string;
  content: string;
  // NOTE(review): optional fields below mirror the endpoint's payload;
  // exact semantics (scope values, similarity metric, retrievedAt format)
  // are defined server-side — confirm against the retrievals handler.
  scope?: string;
  similarity?: number | null;
  retrievedAt?: string;
};
|
|
314
|
+
|
|
315
|
+
/**
|
|
316
|
+
* GET `/api/memory/retrievals?taskId=` — best-effort. Returns `[]` on any
|
|
317
|
+
* failure so a transient API outage never blocks the summary-indexing path.
|
|
318
|
+
*/
|
|
319
|
+
export async function fetchRetrievalsForTask(opts: {
|
|
320
|
+
apiUrl: string;
|
|
321
|
+
apiKey: string;
|
|
322
|
+
agentId: string;
|
|
323
|
+
taskId: string;
|
|
324
|
+
fetchImpl?: typeof fetch;
|
|
325
|
+
}): Promise<RetrievalRow[]> {
|
|
326
|
+
const fetchFn = opts.fetchImpl ?? fetch;
|
|
327
|
+
try {
|
|
328
|
+
const url = `${opts.apiUrl}/api/memory/retrievals?taskId=${encodeURIComponent(opts.taskId)}`;
|
|
329
|
+
const res = await fetchFn(url, {
|
|
330
|
+
headers: {
|
|
331
|
+
"X-Agent-ID": opts.agentId,
|
|
332
|
+
...(opts.apiKey ? { Authorization: `Bearer ${opts.apiKey}` } : {}),
|
|
333
|
+
},
|
|
334
|
+
});
|
|
335
|
+
if (!res.ok) {
|
|
336
|
+
console.error(
|
|
337
|
+
`[memory-rater:llm] GET /api/memory/retrievals failed: ${res.status} ${res.statusText}`,
|
|
338
|
+
);
|
|
339
|
+
return [];
|
|
340
|
+
}
|
|
341
|
+
const body = (await res.json()) as { results?: RetrievalRow[] };
|
|
342
|
+
return body.results ?? [];
|
|
343
|
+
} catch (err) {
|
|
344
|
+
console.error("[memory-rater:llm] fetchRetrievalsForTask threw:", (err as Error).message);
|
|
345
|
+
return [];
|
|
346
|
+
}
|
|
347
|
+
}
|
|
348
|
+
|
|
349
|
+
/**
|
|
350
|
+
* POST `/api/memory/rate` — best-effort. Logs on 4xx/5xx, never throws. The
|
|
351
|
+
* worker hook wraps the whole rating block in its own try/catch as a final
|
|
352
|
+
* line of defence — rater failure must never block summary indexing.
|
|
353
|
+
*/
|
|
354
|
+
export async function postRatings(opts: {
|
|
355
|
+
apiUrl: string;
|
|
356
|
+
apiKey: string;
|
|
357
|
+
agentId: string;
|
|
358
|
+
taskId?: string;
|
|
359
|
+
events: RatingEvent[];
|
|
360
|
+
fetchImpl?: typeof fetch;
|
|
361
|
+
}): Promise<{ ok: boolean; status: number }> {
|
|
362
|
+
if (opts.events.length === 0) return { ok: true, status: 0 };
|
|
363
|
+
const fetchFn = opts.fetchImpl ?? fetch;
|
|
364
|
+
const events = opts.events.map((e) => ({
|
|
365
|
+
memoryId: e.memoryId,
|
|
366
|
+
signal: e.signal,
|
|
367
|
+
weight: e.weight,
|
|
368
|
+
source: e.source,
|
|
369
|
+
...(e.reasoning !== undefined ? { reasoning: e.reasoning } : {}),
|
|
370
|
+
...(e.referencesSource !== undefined ? { referencesSource: e.referencesSource } : {}),
|
|
371
|
+
...(opts.taskId ? { taskId: opts.taskId } : {}),
|
|
372
|
+
}));
|
|
373
|
+
try {
|
|
374
|
+
const res = await fetchFn(`${opts.apiUrl}/api/memory/rate`, {
|
|
375
|
+
method: "POST",
|
|
376
|
+
headers: {
|
|
377
|
+
"Content-Type": "application/json",
|
|
378
|
+
"X-Agent-ID": opts.agentId,
|
|
379
|
+
...(opts.apiKey ? { Authorization: `Bearer ${opts.apiKey}` } : {}),
|
|
380
|
+
},
|
|
381
|
+
body: JSON.stringify({ events }),
|
|
382
|
+
});
|
|
383
|
+
if (!res.ok) {
|
|
384
|
+
const text = await res.text().catch(() => "");
|
|
385
|
+
console.error(
|
|
386
|
+
`[memory-rater:llm] POST /api/memory/rate failed: ${res.status} ${res.statusText} ${text.slice(0, 200)}`,
|
|
387
|
+
);
|
|
388
|
+
}
|
|
389
|
+
return { ok: res.ok, status: res.status };
|
|
390
|
+
} catch (err) {
|
|
391
|
+
console.error("[memory-rater:llm] postRatings threw:", (err as Error).message);
|
|
392
|
+
return { ok: false, status: 0 };
|
|
393
|
+
}
|
|
394
|
+
}
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
import type { MemoryRater, RatingEvent } from "./types";
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* Default rater. Emits no events, makes no DB calls. Selected when
|
|
5
|
+
* MEMORY_RATERS is unset or empty so the framework defaults to behaving
|
|
6
|
+
* byte-identically to pre-rater builds.
|
|
7
|
+
*/
|
|
8
|
+
export class NoopRater implements MemoryRater {
|
|
9
|
+
readonly name = "noop";
|
|
10
|
+
|
|
11
|
+
async rate(): Promise<RatingEvent[]> {
|
|
12
|
+
return [];
|
|
13
|
+
}
|
|
14
|
+
}
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
import { ExplicitSelfRatingRater } from "./explicit-self";
|
|
2
|
+
import { ImplicitCitationRater } from "./implicit-citation";
|
|
3
|
+
import { LlmRater } from "./llm";
|
|
4
|
+
import { NoopRater } from "./noop";
|
|
5
|
+
import type { MemoryRater } from "./types";
|
|
6
|
+
|
|
7
|
+
/**
 * Plan: thoughts/taras/plans/2026-05-05-memory-rater-v1.5/step-1.md §4
 *
 * `MEMORY_RATERS` env — comma-separated list of rater names. Defaults to
 * `[NoopRater]` when unset/empty so existing deployments stay byte-identical.
 *
 * `MEMORY_RATER_WEIGHTS` env — optional `name:multiplier,...` overrides.
 * Multiplier is applied to every emitted RatingEvent.weight before
 * `applyRating`. Default = 1.0.
 *
 * Each later step touches *only* its own line in the factory map:
 * - step-1: noop only (this PR).
 * - step-2: implicit-citation.
 * - step-4: llm.
 * - step-5: explicit-self.
 *
 * Unknown names are logged and skipped — startup never fails on this.
 */

/** Zero-argument constructor for a rater; looked up by name from MEMORY_RATERS. */
type RaterFactory = () => MemoryRater;

// Name → factory map. Keys are exactly the values accepted in MEMORY_RATERS.
const FACTORIES: Record<string, RaterFactory> = {
  noop: () => new NoopRater(),
  "implicit-citation": () => new ImplicitCitationRater(),
  "explicit-self": () => new ExplicitSelfRatingRater(),
  llm: () => new LlmRater(),
};

/**
 * Raters whose `rate(ctx)` runs server-side (in `store-progress.ts` after task
 * completion). Worker-driven raters (e.g. step-4's `LlmRater`, step-5's
 * `ExplicitSelfRater`) emit events from outside this set and POST them to
 * `/api/memory/rate`. The store-progress hook only fires raters listed here.
 *
 * Plan: thoughts/taras/plans/2026-05-05-memory-rater-v1.5/step-2.md §6
 */
export const SERVER_RATERS = new Set<string>(["implicit-citation"]);
|
|
44
|
+
|
|
45
|
+
export function getRegisteredRaters(): MemoryRater[] {
|
|
46
|
+
const raw = process.env.MEMORY_RATERS;
|
|
47
|
+
if (!raw || raw.trim() === "") {
|
|
48
|
+
return [new NoopRater()];
|
|
49
|
+
}
|
|
50
|
+
|
|
51
|
+
const names = raw
|
|
52
|
+
.split(",")
|
|
53
|
+
.map((s) => s.trim())
|
|
54
|
+
.filter((s) => s.length > 0);
|
|
55
|
+
|
|
56
|
+
const raters: MemoryRater[] = [];
|
|
57
|
+
for (const name of names) {
|
|
58
|
+
const factory = FACTORIES[name];
|
|
59
|
+
if (!factory) {
|
|
60
|
+
console.warn(`[memory-rater] Unknown rater "${name}" in MEMORY_RATERS — skipping`);
|
|
61
|
+
continue;
|
|
62
|
+
}
|
|
63
|
+
raters.push(factory());
|
|
64
|
+
}
|
|
65
|
+
|
|
66
|
+
if (raters.length === 0) {
|
|
67
|
+
return [new NoopRater()];
|
|
68
|
+
}
|
|
69
|
+
return raters;
|
|
70
|
+
}
|
|
71
|
+
|
|
72
|
+
export function getRaterWeightMultiplier(name: string): number {
|
|
73
|
+
const raw = process.env.MEMORY_RATER_WEIGHTS;
|
|
74
|
+
if (!raw || raw.trim() === "") return 1.0;
|
|
75
|
+
|
|
76
|
+
for (const pair of raw.split(",")) {
|
|
77
|
+
const trimmed = pair.trim();
|
|
78
|
+
if (trimmed === "") continue;
|
|
79
|
+
const [rawName, rawMult] = trimmed.split(":");
|
|
80
|
+
if (!rawName || !rawMult) continue;
|
|
81
|
+
if (rawName.trim() !== name) continue;
|
|
82
|
+
const mult = Number(rawMult);
|
|
83
|
+
if (Number.isFinite(mult) && mult >= 0) return mult;
|
|
84
|
+
}
|
|
85
|
+
return 1.0;
|
|
86
|
+
}
|