clementine-agent 1.18.185 → 1.18.187
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/agent/background-tasks.d.ts +12 -1
- package/dist/agent/background-tasks.js +30 -1
- package/dist/agent/claim-verification.d.ts +64 -0
- package/dist/agent/claim-verification.js +197 -0
- package/dist/agent/clementine-turn-context.d.ts +10 -0
- package/dist/agent/clementine-turn-context.js +168 -2
- package/dist/agent/project-resolver.d.ts +116 -0
- package/dist/agent/project-resolver.js +332 -0
- package/dist/agent/run-agent-context.js +7 -1
- package/dist/agent/run-agent.js +24 -4
- package/dist/agent/run-skill.js +15 -1
- package/dist/gateway/router.js +47 -0
- package/dist/tools/mcp-server.js +2 -0
- package/dist/tools/project-tools.d.ts +35 -0
- package/dist/tools/project-tools.js +218 -0
- package/dist/types.d.ts +2 -0
- package/package.json +1 -1
|
@@ -48,7 +48,18 @@ export declare function markRunning(id: string, opts?: BackgroundTaskOptions, me
|
|
|
48
48
|
}): BackgroundTask | null;
|
|
49
49
|
/** Patch non-status metadata on a task. Used for notification bookkeeping. */
|
|
50
50
|
export declare function updateBackgroundTask(id: string, patch: Partial<Omit<BackgroundTask, 'id'>>, opts?: BackgroundTaskOptions): BackgroundTask | null;
|
|
51
|
-
/** Transition to 'done' with final result.
|
|
51
|
+
/** Transition to 'done' with final result.
|
|
52
|
+
*
|
|
53
|
+
* 1.18.187 — Before stamping `done`, run claim-verification: parse the
|
|
54
|
+
* result text for active-voice action claims ("I deployed X", "I sent
|
|
55
|
+
* the email") and check the run's event log for matching tool calls.
|
|
56
|
+
* If the claims have no evidence, stamp `done` but flag
|
|
57
|
+
* `verificationFlag: 'claimed-without-evidence'` so the dashboard +
|
|
58
|
+
* future recall surfaces can show the discrepancy. We DON'T refuse the
|
|
59
|
+
* `done` transition outright — the task did complete from the SDK's
|
|
60
|
+
* point of view, and refusing would leave it stuck. Instead the flag
|
|
61
|
+
* makes the hallucination visible and downstream recall blocks can
|
|
62
|
+
* downweight flagged items. */
|
|
52
63
|
export declare function markDone(id: string, result: string, deliverableNote?: string, opts?: BackgroundTaskOptions): BackgroundTask | null;
|
|
53
64
|
/** Transition to 'failed' or 'aborted' with error message. */
|
|
54
65
|
export declare function markFailed(id: string, error: string, reason?: 'failed' | 'aborted' | 'interrupted', opts?: BackgroundTaskOptions): BackgroundTask | null;
|
|
@@ -138,7 +138,18 @@ function writeFullResultFile(id, result, opts) {
|
|
|
138
138
|
writeFileSync(file, result);
|
|
139
139
|
return file;
|
|
140
140
|
}
|
|
141
|
-
/** Transition to 'done' with final result.
|
|
141
|
+
/** Transition to 'done' with final result.
|
|
142
|
+
*
|
|
143
|
+
* 1.18.187 — Before stamping `done`, run claim-verification: parse the
|
|
144
|
+
* result text for active-voice action claims ("I deployed X", "I sent
|
|
145
|
+
* the email") and check the run's event log for matching tool calls.
|
|
146
|
+
* If the claims have no evidence, stamp `done` but flag
|
|
147
|
+
* `verificationFlag: 'claimed-without-evidence'` so the dashboard +
|
|
148
|
+
* future recall surfaces can show the discrepancy. We DON'T refuse the
|
|
149
|
+
* `done` transition outright — the task did complete from the SDK's
|
|
150
|
+
* point of view, and refusing would leave it stuck. Instead the flag
|
|
151
|
+
* makes the hallucination visible and downstream recall blocks can
|
|
152
|
+
* downweight flagged items. */
|
|
142
153
|
export function markDone(id, result, deliverableNote, opts) {
|
|
143
154
|
const task = loadBackgroundTask(id, opts);
|
|
144
155
|
if (!task)
|
|
@@ -155,6 +166,24 @@ export function markDone(id, result, deliverableNote, opts) {
|
|
|
155
166
|
task.deliverableNote = deliverableNote;
|
|
156
167
|
else if (resultPath)
|
|
157
168
|
task.deliverableNote = resultPath;
|
|
169
|
+
// 1.18.187 — claim verification (Part D). Best-effort: load the
|
|
170
|
+
// events file for this run, check whether the result's first-person
|
|
171
|
+
// action claims have matching tool calls. Failure to verify (e.g.,
|
|
172
|
+
// missing event log on older installs) is non-fatal — we just skip
|
|
173
|
+
// the flag.
|
|
174
|
+
try {
|
|
175
|
+
// eslint-disable-next-line @typescript-eslint/no-var-requires, @typescript-eslint/no-require-imports
|
|
176
|
+
const { verifyTaskClaims } = require('./claim-verification.js');
|
|
177
|
+
const verdict = verifyTaskClaims(result, task.runId);
|
|
178
|
+
if (verdict && verdict.ok === false) {
|
|
179
|
+
task.verificationFlag = 'claimed-without-evidence';
|
|
180
|
+
task.verificationDetails = verdict.missingEvidence
|
|
181
|
+
.map((m) => `${m.label}: expected any of {${m.expectedAnyOf.join(', ')}} — none found`)
|
|
182
|
+
.join('; ')
|
|
183
|
+
.slice(0, 1000);
|
|
184
|
+
}
|
|
185
|
+
}
|
|
186
|
+
catch { /* claim verification is best-effort */ }
|
|
158
187
|
safeWrite(pathFor(id, opts), task);
|
|
159
188
|
return task;
|
|
160
189
|
}
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* claim-verification — detect when an agent claims to have done
|
|
3
|
+
* something but the run's tool-call history shows no matching action.
|
|
4
|
+
*
|
|
5
|
+
* Why this exists (1.18.187)
|
|
6
|
+
* ──────────────────────────
|
|
7
|
+
* On 2026-05-11 a bg task was diagnosed where Clementine said "The
|
|
8
|
+
* site is live again at https://X.netlify.app — all 100 coaches with
|
|
9
|
+
* search/filter/sort intact" — but the live URL returned HTTP 404,
|
|
10
|
+
* and the run had zero tool calls matching a deploy. She had
|
|
11
|
+
* confabulated success from a recall summary of a PRIOR task.
|
|
12
|
+
*
|
|
13
|
+
* The bg-task framework's `markDone` had no verification: it accepted
|
|
14
|
+
* the agent's claim verbatim and stamped status='done'. Downstream,
|
|
15
|
+
* the "Recently completed background work" recall block then
|
|
16
|
+
* re-injected that hallucinated "done" claim into the next session's
|
|
17
|
+
* prompt, perpetuating the lie.
|
|
18
|
+
*
|
|
19
|
+
* This module breaks the cycle by inspecting the result text for
|
|
20
|
+
* active-voice action claims ("I deployed X", "I sent the email")
|
|
21
|
+
* and cross-referencing against the run's event log
|
|
22
|
+
* (~/.clementine/events/<runId>.jsonl). When a claim has no matching
|
|
23
|
+
* evidence, the task is still stamped `done` (the run did finish) but
|
|
24
|
+
* carries `verificationFlag: 'claimed-without-evidence'` for owner review.
|
|
25
|
+
*
|
|
26
|
+
* Pure functions where possible; one I/O call to read the event log.
|
|
27
|
+
*/
|
|
28
|
+
import type { RunEvent } from '../types.js';
|
|
29
|
+
export type VerificationVerdict = {
|
|
30
|
+
ok: true;
|
|
31
|
+
reason: 'no-claims';
|
|
32
|
+
} | {
|
|
33
|
+
ok: true;
|
|
34
|
+
reason: 'evidence-found';
|
|
35
|
+
matchedClaims: Array<{
|
|
36
|
+
label: string;
|
|
37
|
+
evidence: string;
|
|
38
|
+
}>;
|
|
39
|
+
} | {
|
|
40
|
+
ok: false;
|
|
41
|
+
reason: 'claimed-without-evidence';
|
|
42
|
+
missingEvidence: Array<{
|
|
43
|
+
label: string;
|
|
44
|
+
expectedAnyOf: string[];
|
|
45
|
+
}>;
|
|
46
|
+
};
|
|
47
|
+
export interface VerifyOptions {
|
|
48
|
+
/** Path to the events directory. Defaults to ~/.clementine/events. */
|
|
49
|
+
eventsDir?: string;
|
|
50
|
+
/** Pre-loaded events (for tests). When set, eventsDir is ignored. */
|
|
51
|
+
events?: RunEvent[];
|
|
52
|
+
}
|
|
53
|
+
/**
|
|
54
|
+
* Verify that a bg-task's result text matches the tool calls actually
|
|
55
|
+
* made during the run.
|
|
56
|
+
*
|
|
57
|
+
* @param resultText The text the agent produced as the final response.
|
|
58
|
+
* @param runId The run id; used to find the event log on disk.
|
|
59
|
+
* @param opts Test injection: pre-loaded events or alt directory.
|
|
60
|
+
* @returns A verdict object the caller (markDone) can use to
|
|
61
|
+
* decide whether to attach a verification flag to the task.
|
|
62
|
+
*/
|
|
63
|
+
export declare function verifyTaskClaims(resultText: string, runId: string | undefined, opts?: VerifyOptions): VerificationVerdict;
|
|
64
|
+
//# sourceMappingURL=claim-verification.d.ts.map
|
|
@@ -0,0 +1,197 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* claim-verification — detect when an agent claims to have done
|
|
3
|
+
* something but the run's tool-call history shows no matching action.
|
|
4
|
+
*
|
|
5
|
+
* Why this exists (1.18.187)
|
|
6
|
+
* ──────────────────────────
|
|
7
|
+
* On 2026-05-11 a bg task was diagnosed where Clementine said "The
|
|
8
|
+
* site is live again at https://X.netlify.app — all 100 coaches with
|
|
9
|
+
* search/filter/sort intact" — but the live URL returned HTTP 404,
|
|
10
|
+
* and the run had zero tool calls matching a deploy. She had
|
|
11
|
+
* confabulated success from a recall summary of a PRIOR task.
|
|
12
|
+
*
|
|
13
|
+
* The bg-task framework's `markDone` had no verification: it accepted
|
|
14
|
+
* the agent's claim verbatim and stamped status='done'. Downstream,
|
|
15
|
+
* the "Recently completed background work" recall block then
|
|
16
|
+
* re-injected that hallucinated "done" claim into the next session's
|
|
17
|
+
* prompt, perpetuating the lie.
|
|
18
|
+
*
|
|
19
|
+
* This module breaks the cycle by inspecting the result text for
|
|
20
|
+
* active-voice action claims ("I deployed X", "I sent the email")
|
|
21
|
+
* and cross-referencing against the run's event log
|
|
22
|
+
* (~/.clementine/events/<runId>.jsonl). When a claim has no matching
|
|
23
|
+
* evidence, the task is still stamped `done` (the run did finish) but
|
|
24
|
+
* carries `verificationFlag: 'claimed-without-evidence'` for owner review.
|
|
25
|
+
*
|
|
26
|
+
* Pure functions where possible; one I/O call to read the event log.
|
|
27
|
+
*/
|
|
28
|
+
import fs from 'node:fs';
|
|
29
|
+
import path from 'node:path';
|
|
30
|
+
import { BASE_DIR } from '../config.js';
|
|
31
|
+
/**
 * Claim rules ordered by specificity. Each rule pairs a verb pattern
 * (first-person active voice with optional adverbs/modifiers) with the
 * set of tool-call shapes that count as evidence for that claim.
 *
 * Pattern shape: `\bI\s+(have\s+)?(verb-tensed)\b`
 *
 * Active voice + first person is required — "X is deployed" or
 * "the site has been deployed" don't trigger (those are status
 * references, not action claims).
 *
 * Evidence matchers come in two kinds:
 *   - `toolName`    — tested against the tool name of a tool call.
 *   - `bashCommand` — tested against the command string of a `Bash` call.
 *
 * Shell-redirect matchers deliberately ignore redirects whose target is
 * `/dev/null`, `/dev/stdout`, `/dev/stderr` or `/dev/tty`: silencing
 * output (`ls x 2>/dev/null`) writes no file and must not be accepted as
 * proof that something was created or merged.
 */
const CLAIM_RULES = [
    {
        label: 'deploy',
        pattern: /\bI\s+(?:have\s+)?(?:just\s+|now\s+)?(?:deployed|published|pushed|launched|uploaded)\b/i,
        evidenceMatchers: [
            { kind: 'bashCommand', pattern: /\bnetlify\s+deploy\b/i, describe: 'netlify deploy' },
            { kind: 'bashCommand', pattern: /\bvercel\s+(?:--prod|deploy)\b/i, describe: 'vercel deploy' },
            { kind: 'bashCommand', pattern: /\bgh-pages\b/i, describe: 'gh-pages publish' },
            { kind: 'bashCommand', pattern: /\bgit\s+push\b/i, describe: 'git push' },
            { kind: 'bashCommand', pattern: /\brsync\b/i, describe: 'rsync upload' },
            { kind: 'bashCommand', pattern: /\baws\s+s3\s+(?:cp|sync)\b/i, describe: 'aws s3 upload' },
            { kind: 'bashCommand', pattern: /\bcurl\s+.*-X\s+(?:POST|PUT)\b/i, describe: 'curl POST/PUT (API deploy)' },
            { kind: 'toolName', pattern: /^mcp__.*__(?:deploy|publish|upload)/i, describe: 'MCP deploy tool' },
            { kind: 'toolName', pattern: /^project_deploy$/i, describe: 'project_deploy tool' },
        ],
    },
    {
        label: 'send',
        pattern: /\bI\s+(?:have\s+)?(?:just\s+)?(?:sent|emailed|messaged|posted|notified)\b/i,
        evidenceMatchers: [
            { kind: 'toolName', pattern: /^mcp__.*__(?:send|reply|create_message|post|notify)/i, describe: 'integration send tool' },
            { kind: 'toolName', pattern: /^discord_(?:channel_send|send_dm|reply)/i, describe: 'discord send' },
            { kind: 'toolName', pattern: /^outlook_(?:send|reply)/i, describe: 'outlook send' },
            { kind: 'toolName', pattern: /^gmail_send/i, describe: 'gmail send' },
            { kind: 'toolName', pattern: /^slack_(?:send|post)/i, describe: 'slack post' },
            { kind: 'bashCommand', pattern: /\bcurl\s+.*-X\s+POST\b/i, describe: 'curl POST (webhook)' },
        ],
    },
    {
        label: 'write/create',
        pattern: /\bI\s+(?:have\s+)?(?:just\s+)?(?:created|wrote|saved|built|generated)\s+(?:the|a|an|new)\b/i,
        evidenceMatchers: [
            { kind: 'toolName', pattern: /^Write$/i, describe: 'Write tool' },
            { kind: 'toolName', pattern: /^Edit$/i, describe: 'Edit tool' },
            { kind: 'toolName', pattern: /^NotebookEdit$/i, describe: 'NotebookEdit tool' },
            // `>` or `>>` into a real target; pseudo-device sinks are excluded.
            { kind: 'bashCommand', pattern: />>?\s*(?!\/dev\/(?:null|stdout|stderr|tty)\b)[^\s|;&>]+/, describe: 'shell write redirect' },
            { kind: 'bashCommand', pattern: /\bmkdir\b/i, describe: 'mkdir' },
            { kind: 'bashCommand', pattern: /\bcp\b|\bmv\b/i, describe: 'cp/mv' },
            { kind: 'toolName', pattern: /^note_create$/i, describe: 'note_create' },
            { kind: 'toolName', pattern: /^memory_write$/i, describe: 'memory_write' },
        ],
    },
    {
        label: 'merge',
        pattern: /\bI\s+(?:have\s+)?(?:just\s+)?(?:merged|combined|consolidated|joined)\s+(?:the|a|two|all|both)\b/i,
        evidenceMatchers: [
            { kind: 'toolName', pattern: /^Write$/i, describe: 'Write tool (output file)' },
            { kind: 'toolName', pattern: /^Edit$/i, describe: 'Edit tool' },
            // Same pseudo-device exclusion as the write/create redirect matcher.
            { kind: 'bashCommand', pattern: /\b(?:cat|paste|jq|awk)\b.*>>?\s*(?!\/dev\/(?:null|stdout|stderr|tty)\b)[^\s|;&>]+/, describe: 'shell merge into file' },
            { kind: 'bashCommand', pattern: /\bgit\s+merge\b/i, describe: 'git merge' },
        ],
    },
];
|
|
95
|
+
/**
|
|
96
|
+
* Verify that a bg-task's result text matches the tool calls actually
|
|
97
|
+
* made during the run.
|
|
98
|
+
*
|
|
99
|
+
* @param resultText The text the agent produced as the final response.
|
|
100
|
+
* @param runId The run id; used to find the event log on disk.
|
|
101
|
+
* @param opts Test injection: pre-loaded events or alt directory.
|
|
102
|
+
* @returns A verdict object the caller (markDone) can use to
|
|
103
|
+
* decide whether to stamp `done` or flag the task.
|
|
104
|
+
*/
|
|
105
|
+
export function verifyTaskClaims(resultText, runId, opts = {}) {
|
|
106
|
+
const text = String(resultText ?? '');
|
|
107
|
+
if (!text.trim())
|
|
108
|
+
return { ok: true, reason: 'no-claims' };
|
|
109
|
+
// 1. Find all claim rules whose pattern matches the result text.
|
|
110
|
+
const triggeredRules = CLAIM_RULES.filter((r) => r.pattern.test(text));
|
|
111
|
+
if (triggeredRules.length === 0) {
|
|
112
|
+
// No first-person active-voice action claims — nothing to verify.
|
|
113
|
+
// (Pure status reports like "the file is at /x/y/z" pass through.)
|
|
114
|
+
return { ok: true, reason: 'no-claims' };
|
|
115
|
+
}
|
|
116
|
+
// 2. Load the run's event log to inspect tool calls.
|
|
117
|
+
const events = opts.events ?? loadEvents(runId, opts.eventsDir);
|
|
118
|
+
// 3. For each triggered rule, check whether ANY matching evidence
|
|
119
|
+
// exists in the event log. If at least one rule has zero evidence,
|
|
120
|
+
// the verdict is claimed-without-evidence.
|
|
121
|
+
const matched = [];
|
|
122
|
+
const missing = [];
|
|
123
|
+
for (const rule of triggeredRules) {
|
|
124
|
+
const hit = findEvidence(events, rule);
|
|
125
|
+
if (hit) {
|
|
126
|
+
matched.push({ label: rule.label, evidence: hit });
|
|
127
|
+
}
|
|
128
|
+
else {
|
|
129
|
+
missing.push({
|
|
130
|
+
label: rule.label,
|
|
131
|
+
expectedAnyOf: rule.evidenceMatchers.map((m) => m.describe),
|
|
132
|
+
});
|
|
133
|
+
}
|
|
134
|
+
}
|
|
135
|
+
if (missing.length > 0) {
|
|
136
|
+
return { ok: false, reason: 'claimed-without-evidence', missingEvidence: missing };
|
|
137
|
+
}
|
|
138
|
+
return { ok: true, reason: 'evidence-found', matchedClaims: matched };
|
|
139
|
+
}
|
|
140
|
+
// ── Internals ────────────────────────────────────────────────────────
|
|
141
|
+
function loadEvents(runId, eventsDir) {
|
|
142
|
+
if (!runId)
|
|
143
|
+
return [];
|
|
144
|
+
const dir = eventsDir ?? path.join(BASE_DIR, 'events');
|
|
145
|
+
const safe = String(runId).replace(/[^a-zA-Z0-9_-]/g, '_').slice(0, 128);
|
|
146
|
+
const file = path.join(dir, `${safe}.jsonl`);
|
|
147
|
+
if (!fs.existsSync(file))
|
|
148
|
+
return [];
|
|
149
|
+
try {
|
|
150
|
+
const lines = fs.readFileSync(file, 'utf-8').split('\n').filter(Boolean);
|
|
151
|
+
const out = [];
|
|
152
|
+
for (const line of lines) {
|
|
153
|
+
try {
|
|
154
|
+
out.push(JSON.parse(line));
|
|
155
|
+
}
|
|
156
|
+
catch { /* skip malformed */ }
|
|
157
|
+
}
|
|
158
|
+
return out;
|
|
159
|
+
}
|
|
160
|
+
catch {
|
|
161
|
+
return [];
|
|
162
|
+
}
|
|
163
|
+
}
|
|
164
|
+
/**
 * Scan a run's events for the first tool call that counts as evidence
 * for a claim rule.
 *
 * Only events with `kind === 'tool_call'` are considered. For each one
 * the rule's `toolName` matchers are tried against the tool name first;
 * if none match and the tool is `Bash`, the `bashCommand` matchers are
 * tried against the call's command string.
 *
 * Entries that are not objects (e.g. a `null` parsed from the log) are
 * skipped rather than dereferenced, so one bad entry cannot abort the
 * whole verification.
 *
 * @param events Parsed run events.
 * @param rule   A claim rule carrying `evidenceMatchers`.
 * @returns A short human-readable description of the first matching
 *          evidence, or `null` when nothing matches.
 */
function findEvidence(events, rule) {
    for (const event of events ?? []) {
        if (!event || typeof event !== 'object' || event.kind !== 'tool_call') {
            continue;
        }
        const toolName = String(event.toolName ?? '');
        // Match against tool name patterns.
        const byName = rule.evidenceMatchers.find((m) => m.kind === 'toolName' && m.pattern.test(toolName));
        if (byName) {
            return `${byName.describe} (tool: ${toolName})`;
        }
        // For Bash tool calls, inspect the command argument.
        if (toolName !== 'Bash') {
            continue;
        }
        const cmd = extractBashCommand(event.toolInput);
        if (!cmd) {
            continue;
        }
        const byCommand = rule.evidenceMatchers.find((m) => m.kind === 'bashCommand' && m.pattern.test(cmd));
        if (byCommand) {
            // Cap the echoed command so the evidence string stays short.
            return `${byCommand.describe} (Bash: ${cmd.slice(0, 80)})`;
        }
    }
    return null;
}
/**
 * Pull the shell command out of a Bash tool-call input.
 *
 * @param toolInput The tool call's input payload.
 * @returns `toolInput.command` when the payload is an object whose
 *          `command` is a string; otherwise `null`.
 */
function extractBashCommand(toolInput) {
    if (!toolInput || typeof toolInput !== 'object') {
        return null;
    }
    return typeof toolInput.command === 'string' ? toolInput.command : null;
}
|
|
197
|
+
//# sourceMappingURL=claim-verification.js.map
|
|
@@ -41,6 +41,7 @@
|
|
|
41
41
|
* note on prompt caching boundaries.
|
|
42
42
|
*/
|
|
43
43
|
import type { BackgroundTask } from '../types.js';
|
|
44
|
+
import type { ProjectMeta } from './assistant.js';
|
|
44
45
|
export interface BuildTurnContextOptions {
|
|
45
46
|
/** The user's current message — used as the query for retrieved memory. */
|
|
46
47
|
userMessage: string;
|
|
@@ -79,6 +80,11 @@ export interface BuildTurnContextOptions {
|
|
|
79
80
|
}) => BackgroundTask[];
|
|
80
81
|
/** Clock injection for tests. Defaults to Date.now(). */
|
|
81
82
|
now?: () => number;
|
|
83
|
+
/** 1.18.187 — active project for this turn. When set, the
|
|
84
|
+
* "Active project" section renders with path, STATUS.md preview,
|
|
85
|
+
* source/output file inventory, and deploy.json summary (if any).
|
|
86
|
+
* Set by the router's resolver before the chat call. */
|
|
87
|
+
activeProject?: ProjectMeta | null;
|
|
82
88
|
}
|
|
83
89
|
export interface BuildTurnContextResult {
|
|
84
90
|
/** The full ready-to-prepend context block, INCLUDING outer
|
|
@@ -93,6 +99,10 @@ export interface BuildTurnContextResult {
|
|
|
93
99
|
recentBgTasks: number;
|
|
94
100
|
liveState: boolean;
|
|
95
101
|
identityFrame: boolean;
|
|
102
|
+
/** 1.18.187 — whether the active-project block rendered. */
|
|
103
|
+
activeProject: boolean;
|
|
104
|
+
/** 1.18.187 — whether the dispute gate fired (suppressed past-success recall). */
|
|
105
|
+
disputeDetected: boolean;
|
|
96
106
|
};
|
|
97
107
|
/** Final character count of the block. Useful for logging + the
|
|
98
108
|
* Anthropic prompt-cache-health analysis. */
|
|
@@ -40,7 +40,10 @@
|
|
|
40
40
|
* in the USER message, NOT in the system prompt. See the SDK reference
|
|
41
41
|
* note on prompt caching boundaries.
|
|
42
42
|
*/
|
|
43
|
+
import fs from 'node:fs';
|
|
44
|
+
import path from 'node:path';
|
|
43
45
|
import pino from 'pino';
|
|
46
|
+
import { detectDisputePattern } from './project-resolver.js';
|
|
44
47
|
const logger = pino({ name: 'clementine.turn-context' });
|
|
45
48
|
// ── Tunables ──────────────────────────────────────────────────────────
|
|
46
49
|
/** Hard cap on the entire block. Keep volatile content small so the
|
|
@@ -59,10 +62,42 @@ export function buildClementineTurnContext(opts) {
|
|
|
59
62
|
recentBgTasks: 0,
|
|
60
63
|
liveState: false,
|
|
61
64
|
identityFrame: false,
|
|
65
|
+
activeProject: false,
|
|
66
|
+
disputeDetected: false,
|
|
62
67
|
};
|
|
63
68
|
const parts = [];
|
|
64
69
|
const nowMs = (opts.now ?? Date.now)();
|
|
65
70
|
const nowDate = new Date(nowMs);
|
|
71
|
+
// 1.18.187 — detect dispute pattern (Part E). When the owner is
|
|
72
|
+
// reporting a failure of prior work, we want to suppress "past
|
|
73
|
+
// success" recall items (they bias the model toward defending its
|
|
74
|
+
// memory instead of verifying reality) and add a verification
|
|
75
|
+
// directive at the top of the block.
|
|
76
|
+
const disputeDetected = detectDisputePattern(opts.userMessage);
|
|
77
|
+
sections.disputeDetected = disputeDetected;
|
|
78
|
+
if (disputeDetected) {
|
|
79
|
+
parts.push('### Dispute mode — verification posture\n' +
|
|
80
|
+
'The owner is disputing prior work. **Treat recalled `done` claims as suspect.** ' +
|
|
81
|
+
'Before defending a past success in memory, verify reality with tools — ' +
|
|
82
|
+
'curl any URLs you previously claimed, check files that should exist, run status commands. ' +
|
|
83
|
+
'Honest "I claimed X but on re-check it failed because Y" is better than " I deployed it, see memory."');
|
|
84
|
+
}
|
|
85
|
+
// 1.18.187 — active project block (Part B). Renders when the
|
|
86
|
+
// session has a linked project resolved for this turn. Includes
|
|
87
|
+
// path, STATUS.md preview, source/output inventory, and deploy.json
|
|
88
|
+
// summary so the model knows where to read, write, and deploy.
|
|
89
|
+
if (opts.activeProject?.path && fs.existsSync(opts.activeProject.path)) {
|
|
90
|
+
try {
|
|
91
|
+
const projectBlock = buildActiveProjectBlock(opts.activeProject);
|
|
92
|
+
if (projectBlock) {
|
|
93
|
+
parts.push(projectBlock);
|
|
94
|
+
sections.activeProject = true;
|
|
95
|
+
}
|
|
96
|
+
}
|
|
97
|
+
catch (err) {
|
|
98
|
+
logger.debug({ err, project: opts.activeProject.path }, 'turn-context: active project block failed (non-fatal)');
|
|
99
|
+
}
|
|
100
|
+
}
|
|
66
101
|
// ── 1. Retrieved memory hits ──────────────────────────────────────
|
|
67
102
|
// The single most important section. Pulls the top semantic + FTS
|
|
68
103
|
// hits from the SQLite memory store, scored against the user's
|
|
@@ -98,9 +133,17 @@ export function buildClementineTurnContext(opts) {
|
|
|
98
133
|
// ── 2. Recent background task headlines ───────────────────────────
|
|
99
134
|
// Last 24h of terminal-state bg tasks. So when the owner asks "what
|
|
100
135
|
// happened with that job?" she knows without re-asking.
|
|
136
|
+
//
|
|
137
|
+
// 1.18.187 dispute gate (Part E): when the owner is disputing prior
|
|
138
|
+
// work, exclude `done`-status tasks — those are exactly the entries
|
|
139
|
+
// that bias the model toward "but my memory says it succeeded."
|
|
140
|
+
// Failed/aborted/interrupted tasks STAY because they're useful
|
|
141
|
+
// signal for the verification posture.
|
|
101
142
|
if (opts.listBackgroundTasks) {
|
|
102
143
|
try {
|
|
103
|
-
const TERMINAL =
|
|
144
|
+
const TERMINAL = disputeDetected
|
|
145
|
+
? ['failed', 'interrupted', 'aborted']
|
|
146
|
+
: ['done', 'failed', 'interrupted', 'aborted'];
|
|
104
147
|
const recent = [];
|
|
105
148
|
for (const status of TERMINAL) {
|
|
106
149
|
const tasks = opts.listBackgroundTasks({ status });
|
|
@@ -123,10 +166,18 @@ export function buildClementineTurnContext(opts) {
|
|
|
123
166
|
const lines = ['### Recently completed background work (last 24h)'];
|
|
124
167
|
for (const task of recent.slice(0, MAX_BG_TASKS)) {
|
|
125
168
|
const promptPreview = (task.prompt ?? '').slice(0, 80).replace(/\s+/g, ' ').trim();
|
|
169
|
+
// 1.18.187 — flagged tasks get an explicit "claim not
|
|
170
|
+
// verified" warning so the model doesn't read them as
|
|
171
|
+
// authoritative. The verificationFlag is set by markDone
|
|
172
|
+
// when the result text claimed actions the event log can't
|
|
173
|
+
// back up. See agent/claim-verification.ts.
|
|
174
|
+
const flag = task.verificationFlag === 'claimed-without-evidence'
|
|
175
|
+
? ' ⚠ CLAIM NOT VERIFIED'
|
|
176
|
+
: '';
|
|
126
177
|
const tail = task.status === 'done'
|
|
127
178
|
? (task.result ?? task.deliverableNote ?? 'done').slice(0, 100).replace(/\s+/g, ' ').trim()
|
|
128
179
|
: (task.error ?? task.status).slice(0, 100).replace(/\s+/g, ' ').trim();
|
|
129
|
-
const line = `- **${task.status}
|
|
180
|
+
const line = `- **${task.status}**${flag}: ${promptPreview} → ${tail}`;
|
|
130
181
|
lines.push(line.slice(0, MAX_BG_TASK_LINE_CHARS));
|
|
131
182
|
sections.recentBgTasks += 1;
|
|
132
183
|
}
|
|
@@ -191,6 +242,121 @@ export function buildClementineTurnContext(opts) {
|
|
|
191
242
|
};
|
|
192
243
|
}
|
|
193
244
|
// ── Helpers ───────────────────────────────────────────────────────────
|
|
245
|
+
// ── Active project block (1.18.187) ──────────────────────────────────
/** Maximum number of STATUS.md characters copied into the block. */
const STATUS_PREVIEW_CHARS = 800;
/** Maximum names listed per folder before collapsing into "+N more". */
const MAX_FILES_LISTED = 8;
/**
 * Render an "Active project" block summarizing the project's name,
 * path, current STATUS.md content (capped), source/output file
 * inventory, and deploy.json config (if present).
 *
 * Designed to give the model everything it needs to continue work on
 * the project in one shot: WHERE the data lives, WHAT'S been done,
 * HOW to deploy.
 *
 * Returns empty string when the project folder is missing, is not a
 * directory, or cannot be stat'ed — caller treats empty as "no project
 * block this turn."
 *
 * deploy.json handling:
 *   - file absent                    → a "No deploy config yet" hint is added;
 *   - file unreadable / invalid JSON / not a plain object → nothing is added;
 *   - otherwise each field is rendered only if it is a string, number or
 *     boolean; anything else falls back to the placeholder so values
 *     such as nested objects never render as "[object Object]".
 *
 * @param project Project metadata; `path` and the optional `description`
 *                are the only fields read here.
 * @returns The markdown block, or `''` when there is nothing to render.
 */
function buildActiveProjectBlock(project) {
    const projectPath = project.path;
    try {
        if (!fs.statSync(projectPath).isDirectory()) {
            return '';
        }
    }
    catch {
        return '';
    }
    const basename = path.basename(projectPath);
    const displayName = project.description ? `${basename} (${project.description})` : basename;
    const lines = [
        `### Active project: ${displayName}`,
        `Path: \`${projectPath}\``,
    ];
    // Inventory the top-level (and sources/ + output/ + .clementine/ if present).
    const inventory = inventoryProject(projectPath);
    if (inventory.length > 0) {
        lines.push('', '**Layout:**');
        for (const item of inventory) {
            lines.push(` - ${item}`);
        }
    }
    // STATUS.md preview.
    const statusPath = path.join(projectPath, '.clementine', 'STATUS.md');
    if (fs.existsSync(statusPath)) {
        try {
            const status = fs.readFileSync(statusPath, 'utf-8').trim();
            if (status) {
                // Reserve three characters for the ellipsis so the preview
                // never exceeds STATUS_PREVIEW_CHARS.
                const preview = status.length > STATUS_PREVIEW_CHARS
                    ? status.slice(0, STATUS_PREVIEW_CHARS - 3) + '...'
                    : status;
                lines.push('', '**STATUS.md:**', preview);
            }
        }
        catch { /* unreadable STATUS.md — leave the section out */ }
    }
    // Deploy config summary.
    const deployPath = path.join(projectPath, '.clementine', 'deploy.json');
    if (fs.existsSync(deployPath)) {
        try {
            const parsed = JSON.parse(fs.readFileSync(deployPath, 'utf-8'));
            if (parsed && typeof parsed === 'object' && !Array.isArray(parsed)) {
                const kind = deployConfigField(parsed.kind, 'unknown');
                const site = deployConfigField(parsed.site, '?');
                const dir = deployConfigField(parsed.dir, 'output');
                const verifyUrl = deployConfigField(parsed.verifyUrl, '?');
                lines.push('');
                lines.push(`**Deploy config:** ${kind} → ${site} (deploy dir: \`${dir}\`, verifies at ${verifyUrl}).`);
                lines.push('Use the \`project_deploy\` tool when ready — it runs the command AND curls the URL to verify before reporting success.');
            }
        }
        catch { /* malformed deploy.json — leave the section out */ }
    }
    else {
        lines.push('');
        lines.push('**No deploy config yet.** If this project should deploy somewhere, ask the owner where and ' +
            'write `.clementine/deploy.json` with shape: `{kind: "netlify", site: "...", dir: "output", verifyUrl: "..."}`.');
    }
    return lines.join('\n');
}
/** Render one deploy.json field, falling back when it is not a usable scalar. */
function deployConfigField(value, fallback) {
    if (typeof value === 'string') {
        return value.trim() ? value : fallback;
    }
    if (typeof value === 'number' || typeof value === 'boolean') {
        return String(value);
    }
    return fallback;
}
/** Join names with ", ", collapsing everything past MAX_FILES_LISTED into "+N more". */
function summarizeProjectEntries(names) {
    if (names.length <= MAX_FILES_LISTED) {
        return names.join(', ');
    }
    return `${names.slice(0, MAX_FILES_LISTED).join(', ')}, +${names.length - MAX_FILES_LISTED} more`;
}
/**
 * Build the "Layout" lines for a project folder.
 *
 * Lists the conventional sub-folders (`sources`, `output`, `.clementine`)
 * that exist, each with its non-hidden entries, followed by one line for
 * the non-hidden top-level files. Names are sorted so the output does
 * not depend on the order in which the filesystem returns entries.
 *
 * @param projectPath Absolute or relative path of the project folder.
 * @returns One markdown fragment per line; empty when the folder cannot
 *          be read.
 */
function inventoryProject(projectPath) {
    const out = [];
    let entries;
    try {
        entries = fs.readdirSync(projectPath, { withFileTypes: true });
    }
    catch {
        return out;
    }
    // Highlight the conventional folders first.
    for (const name of ['sources', 'output', '.clementine']) {
        if (!entries.some((e) => e.name === name && e.isDirectory())) {
            continue;
        }
        try {
            const items = fs.readdirSync(path.join(projectPath, name))
                .filter((n) => !n.startsWith('.'))
                .sort();
            const summary = items.length === 0 ? '(empty)' : summarizeProjectEntries(items);
            out.push(`\`${name}/\` — ${summary}`);
        }
        catch {
            // Folder exists but cannot be listed — still mention it.
            out.push(`\`${name}/\``);
        }
    }
    // Then top-level files (data + code) up to MAX_FILES_LISTED.
    const topFiles = entries
        .filter((e) => e.isFile() && !e.name.startsWith('.'))
        .map((e) => e.name)
        .sort();
    if (topFiles.length > 0) {
        out.push(`top-level files: ${summarizeProjectEntries(topFiles)}`);
    }
    return out;
}
|
|
194
360
|
function buildIdentityLine(opts) {
|
|
195
361
|
const parts = [];
|
|
196
362
|
if (opts.ownerName) {
|