@synergenius/flow-weaver-pack-weaver 0.9.193 → 0.9.196

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (117) hide show
  1. package/dist/bot/ai-client.d.ts +5 -0
  2. package/dist/bot/ai-client.d.ts.map +1 -1
  3. package/dist/bot/ai-client.js +43 -0
  4. package/dist/bot/ai-client.js.map +1 -1
  5. package/dist/bot/assistant-core.js +2 -2
  6. package/dist/bot/assistant-core.js.map +1 -1
  7. package/dist/bot/behavior-defaults.d.ts +3 -1
  8. package/dist/bot/behavior-defaults.d.ts.map +1 -1
  9. package/dist/bot/behavior-defaults.js +7 -0
  10. package/dist/bot/behavior-defaults.js.map +1 -1
  11. package/dist/bot/capability-registry.js +3 -3
  12. package/dist/bot/capability-registry.js.map +1 -1
  13. package/dist/bot/context-compactor.d.ts +35 -0
  14. package/dist/bot/context-compactor.d.ts.map +1 -0
  15. package/dist/bot/context-compactor.js +130 -0
  16. package/dist/bot/context-compactor.js.map +1 -0
  17. package/dist/bot/dream-task.d.ts +45 -0
  18. package/dist/bot/dream-task.d.ts.map +1 -0
  19. package/dist/bot/dream-task.js +125 -0
  20. package/dist/bot/dream-task.js.map +1 -0
  21. package/dist/bot/knowledge-store.d.ts +9 -0
  22. package/dist/bot/knowledge-store.d.ts.map +1 -1
  23. package/dist/bot/knowledge-store.js +21 -0
  24. package/dist/bot/knowledge-store.js.map +1 -1
  25. package/dist/bot/memory-extraction-worker.d.ts +14 -0
  26. package/dist/bot/memory-extraction-worker.d.ts.map +1 -0
  27. package/dist/bot/memory-extraction-worker.js +42 -0
  28. package/dist/bot/memory-extraction-worker.js.map +1 -0
  29. package/dist/bot/memory-extractor.d.ts +27 -0
  30. package/dist/bot/memory-extractor.d.ts.map +1 -0
  31. package/dist/bot/memory-extractor.js +155 -0
  32. package/dist/bot/memory-extractor.js.map +1 -0
  33. package/dist/bot/operations.d.ts +3 -1
  34. package/dist/bot/operations.d.ts.map +1 -1
  35. package/dist/bot/operations.js +3 -1
  36. package/dist/bot/operations.js.map +1 -1
  37. package/dist/bot/post-turn-hooks.d.ts +57 -0
  38. package/dist/bot/post-turn-hooks.d.ts.map +1 -0
  39. package/dist/bot/post-turn-hooks.js +108 -0
  40. package/dist/bot/post-turn-hooks.js.map +1 -0
  41. package/dist/bot/profile-types.d.ts +16 -0
  42. package/dist/bot/profile-types.d.ts.map +1 -1
  43. package/dist/bot/swarm-controller.d.ts +7 -0
  44. package/dist/bot/swarm-controller.d.ts.map +1 -1
  45. package/dist/bot/swarm-controller.js +121 -1
  46. package/dist/bot/swarm-controller.js.map +1 -1
  47. package/dist/bot/task-prompt-builder.js +35 -21
  48. package/dist/bot/task-prompt-builder.js.map +1 -1
  49. package/dist/bot/task-types.d.ts +13 -0
  50. package/dist/bot/task-types.d.ts.map +1 -1
  51. package/dist/bot/tool-registry.d.ts +13 -0
  52. package/dist/bot/tool-registry.d.ts.map +1 -1
  53. package/dist/bot/tool-registry.js +80 -0
  54. package/dist/bot/tool-registry.js.map +1 -1
  55. package/dist/bot/types.d.ts +2 -0
  56. package/dist/bot/types.d.ts.map +1 -1
  57. package/dist/node-types/agent-execute.d.ts.map +1 -1
  58. package/dist/node-types/agent-execute.js +38 -17
  59. package/dist/node-types/agent-execute.js.map +1 -1
  60. package/dist/node-types/build-context.d.ts +4 -3
  61. package/dist/node-types/build-context.d.ts.map +1 -1
  62. package/dist/node-types/build-context.js +37 -6
  63. package/dist/node-types/build-context.js.map +1 -1
  64. package/dist/node-types/receive-task.d.ts +2 -1
  65. package/dist/node-types/receive-task.d.ts.map +1 -1
  66. package/dist/node-types/receive-task.js +4 -1
  67. package/dist/node-types/receive-task.js.map +1 -1
  68. package/dist/node-types/review-result.d.ts +9 -0
  69. package/dist/node-types/review-result.d.ts.map +1 -1
  70. package/dist/node-types/review-result.js +20 -5
  71. package/dist/node-types/review-result.js.map +1 -1
  72. package/dist/node-types/verify-task.d.ts +22 -0
  73. package/dist/node-types/verify-task.d.ts.map +1 -0
  74. package/dist/node-types/verify-task.js +143 -0
  75. package/dist/node-types/verify-task.js.map +1 -0
  76. package/dist/ui/capability-editor.js +3 -3
  77. package/dist/ui/profile-editor.js +3 -3
  78. package/dist/ui/swarm-dashboard.js +3 -3
  79. package/dist/workflows/weaver-agent.d.ts +3 -3
  80. package/dist/workflows/weaver-agent.d.ts.map +1 -1
  81. package/dist/workflows/weaver-agent.js +267 -18
  82. package/dist/workflows/weaver-agent.js.map +1 -1
  83. package/dist/workflows/weaver-bot-batch.d.ts +3 -3
  84. package/dist/workflows/weaver-bot-batch.d.ts.map +1 -1
  85. package/dist/workflows/weaver-bot-batch.js +280 -24
  86. package/dist/workflows/weaver-bot-batch.js.map +1 -1
  87. package/dist/workflows/weaver-bot.d.ts +2 -0
  88. package/dist/workflows/weaver-bot.d.ts.map +1 -1
  89. package/dist/workflows/weaver-bot.js +15 -10
  90. package/dist/workflows/weaver-bot.js.map +1 -1
  91. package/flowweaver.manifest.json +1 -1
  92. package/package.json +3 -3
  93. package/src/bot/ai-client.ts +54 -0
  94. package/src/bot/assistant-core.ts +2 -2
  95. package/src/bot/behavior-defaults.ts +9 -1
  96. package/src/bot/capability-registry.ts +3 -3
  97. package/src/bot/context-compactor.ts +147 -0
  98. package/src/bot/dream-task.ts +167 -0
  99. package/src/bot/knowledge-store.ts +27 -0
  100. package/src/bot/memory-extraction-worker.ts +58 -0
  101. package/src/bot/memory-extractor.ts +213 -0
  102. package/src/bot/operations.ts +3 -1
  103. package/src/bot/post-turn-hooks.ts +137 -0
  104. package/src/bot/profile-types.ts +17 -0
  105. package/src/bot/swarm-controller.ts +129 -2
  106. package/src/bot/task-prompt-builder.ts +37 -21
  107. package/src/bot/task-types.ts +21 -0
  108. package/src/bot/tool-registry.ts +89 -0
  109. package/src/bot/types.ts +2 -0
  110. package/src/node-types/agent-execute.ts +44 -17
  111. package/src/node-types/build-context.ts +45 -7
  112. package/src/node-types/receive-task.ts +3 -0
  113. package/src/node-types/review-result.ts +22 -5
  114. package/src/node-types/verify-task.ts +181 -0
  115. package/src/workflows/weaver-agent.ts +429 -18
  116. package/src/workflows/weaver-bot-batch.ts +443 -24
  117. package/src/workflows/weaver-bot.ts +16 -11
@@ -4,17 +4,21 @@ import {
4
4
  createAnthropicProvider,
5
5
  getOrCreateCliSession,
6
6
  killAllCliSessions,
7
+ joinSplitPrompt,
7
8
  type AgentProvider,
8
9
  type AgentMessage,
9
10
  type ToolDefinition,
10
11
  type StreamEvent,
11
12
  type StreamOptions,
12
13
  type ToolEvent,
14
+ type SplitPrompt,
13
15
  } from '@synergenius/flow-weaver/agent';
14
16
  import { WEAVER_TOOLS, createWeaverExecutor } from '../bot/weaver-tools.js';
17
+ import { resolveToolsForTask } from '../bot/tool-registry.js';
15
18
  import { auditEmit } from '../bot/audit-logger.js';
16
19
  import { withRetry, getErrorGuidance } from '../bot/error-classifier.js';
17
20
  import { CostTracker } from '../bot/cost-tracker.js';
21
+ import { PostTurnHookRunner, CostCheckpointHook, ProgressReportHook } from '../bot/post-turn-hooks.js';
18
22
 
19
23
  // Clean up persistent sessions on process exit
20
24
  let cleanupRegistered = false;
@@ -64,15 +68,16 @@ class CliSessionProvider implements AgentProvider {
64
68
 
65
69
  if (!prompt) return;
66
70
 
67
- // Only pass system prompt on the first call
68
- const systemPrompt = this.sentCount <= messages.length ? options?.systemPrompt : undefined;
71
+ // Only pass system prompt on the first call — CLI sessions accept a string
72
+ const splitPrompt = this.sentCount <= messages.length ? options?.systemPrompt : undefined;
73
+ const systemPromptStr = splitPrompt ? joinSplitPrompt(splitPrompt) : undefined;
69
74
 
70
75
  // Forward usage events to the runner's CostTracker via the global callback.
71
76
  // This bridges CLI session usage → runner cost tracking → swarm budget enforcement.
72
77
  const usageCb = (globalThis as Record<string, unknown>).__fw_ai_usage_callback__ as
73
78
  ((model: string, usage: { inputTokens: number; outputTokens: number }) => void) | undefined;
74
79
 
75
- for await (const event of this.session.send(prompt, systemPrompt)) {
80
+ for await (const event of this.session.send(prompt, systemPromptStr)) {
76
81
  if (event.type === 'usage' && usageCb) {
77
82
  usageCb(this.model, {
78
83
  inputTokens: event.promptTokens,
@@ -150,21 +155,24 @@ export async function weaverAgentExecute(
150
155
  return { onSuccess: false, onFailure: true, ctx: JSON.stringify(context) };
151
156
  }
152
157
 
153
- // Build system prompt
154
- let systemPrompt: string;
158
+ // Build system prompt as SplitPrompt — prefix is stable (cacheable),
159
+ // suffix is per-task (contextBundle, project plan).
160
+ // If frozenPromptPrefix is available from the swarm controller, use it
161
+ // to ensure all bot slots share the same cached prefix bytes.
162
+ let systemPrompt: SplitPrompt;
155
163
  try {
156
164
  const mod = await import('../bot/system-prompt.js');
157
- const basePrompt = await mod.buildSystemPrompt();
165
+ const prefix = context.frozenPromptPrefix ?? await mod.buildSystemPrompt();
158
166
  let cliCommands: { name: string; description: string; botCompatible?: boolean; options?: { flags: string; arg?: string; description: string }[] }[] = [];
159
167
  try {
160
168
  const docMeta = await import('@synergenius/flow-weaver/doc-metadata');
161
169
  cliCommands = docMeta.CLI_COMMANDS ?? [];
162
170
  } catch (err) { if (process.env.WEAVER_VERBOSE) console.error('[agent-execute] doc-metadata unavailable (older fw):', err); }
163
- const botPrompt = mod.buildBotSystemPrompt(context.contextBundle, cliCommands, projectDir);
164
- systemPrompt = basePrompt + '\n\n' + botPrompt;
171
+ const suffix = mod.buildBotSystemPrompt(context.contextBundle, cliCommands, projectDir);
172
+ systemPrompt = { prefix, suffix };
165
173
  } catch (err) {
166
174
  if (process.env.WEAVER_VERBOSE) console.error('[agent-execute] system prompt build failed, using fallback:', err);
167
- systemPrompt = 'You are Weaver, an AI workflow bot. Use the provided tools to complete tasks.';
175
+ systemPrompt = { prefix: 'You are Weaver, an AI workflow bot. Use the provided tools to complete tasks.', suffix: '' };
168
176
  }
169
177
 
170
178
  const taskPrompt = task.instruction.startsWith('## Task:')
@@ -219,14 +227,34 @@ export async function weaverAgentExecute(
219
227
 
220
228
  const onStreamEvent = (event: StreamEvent) => renderer.onStreamEvent(event);
221
229
 
222
- // Filter tools by profile: only orchestrators get task_create.
223
- // Without this, the AI sees task_create and delegates instead of doing work.
230
+ // Filter tools by task mode and profile capabilities.
231
+ // Mode-based filtering removes tools the task doesn't need (e.g., modify mode
232
+ // excludes write_file). Capability intersection ensures profiles only get their
233
+ // granted tools (e.g., orchestrator gets task_create, developer does not).
224
234
  const behavior = context.behaviorJson ? JSON.parse(context.behaviorJson) : undefined;
225
235
  const caps: string[] = behavior?.capabilities ?? [];
226
- const isOrchestrator = caps.includes('role-orchestrator') || caps.includes('task-mgmt') || caps.includes('decomposition');
227
- const tools = isOrchestrator
228
- ? WEAVER_TOOLS
229
- : WEAVER_TOOLS.filter(t => t.name !== 'task_create');
236
+ const grantedToolNames = resolveToolsForTask(
237
+ { mode: task.mode },
238
+ caps.length > 0 ? caps : undefined,
239
+ );
240
+ const tools = WEAVER_TOOLS.filter(t => grantedToolNames.has(t.name));
241
+
242
+ // Set up post-turn hooks — cost checkpoint + progress reporting.
243
+ // CostCheckpointHook aborts the loop when cumulative cost exceeds budget.
244
+ // ProgressReportHook emits turn-progress events for UI updates.
245
+ const hookRunner = new PostTurnHookRunner();
246
+ const model = pInfo.model ?? 'claude-sonnet-4-6';
247
+ const budget = behavior?.budget;
248
+ if (budget != null && budget > 0) {
249
+ hookRunner.register(new CostCheckpointHook(budget, model));
250
+ }
251
+ hookRunner.register(new ProgressReportHook((event) => {
252
+ renderer.onStreamEvent?.({ type: 'text_delta', text: '' }); // keep renderer alive
253
+ if (process.env.WEAVER_VERBOSE) {
254
+ console.log(`[post-turn] ${event.type}: iter=${event.data.iteration} tools=${event.data.toolCallCount}`);
255
+ }
256
+ }));
257
+ const onTurnEnd = hookRunner.createCallback();
230
258
 
231
259
  const result = await withRetry(
232
260
  () => runAgentLoop(
@@ -234,7 +262,7 @@ export async function weaverAgentExecute(
234
262
  tools,
235
263
  executor,
236
264
  [{ role: 'user', content: taskPrompt }],
237
- { systemPrompt, maxIterations: 15, onToolEvent, onStreamEvent },
265
+ { systemPrompt, maxIterations: 15, onToolEvent, onStreamEvent, onTurnEnd },
238
266
  ),
239
267
  {
240
268
  maxRetries: 3,
@@ -246,7 +274,6 @@ export async function weaverAgentExecute(
246
274
  );
247
275
 
248
276
  const usage = result.usage;
249
- const model = pInfo.model ?? 'claude-sonnet-4-6';
250
277
  const estimatedCost = CostTracker.estimateCost(model, {
251
278
  inputTokens: usage.promptTokens,
252
279
  outputTokens: usage.completionTokens,
@@ -4,6 +4,7 @@ import * as path from 'node:path';
4
4
  import type { WeaverContext } from '../bot/types.js';
5
5
  import { HierarchyEventLog } from '../bot/hierarchy-event-log.js';
6
6
  import { KnowledgeStore } from '../bot/knowledge-store.js';
7
+ import { selectRelevantKnowledge } from '../bot/ai-client.js';
7
8
 
8
9
  /** Profiles that work with Flow Weaver workflows and need FW context. */
9
10
  const FW_PROFILES = new Set(['fw-developer']);
@@ -65,7 +66,6 @@ function resolveFwBin(projectDir: string): string | null {
65
66
  * For create tasks, includes full authoring context + templates.
66
67
  *
67
68
  * @flowWeaver nodeType
68
- * @expression
69
69
  * @label Build Context
70
70
  * @icon build
71
71
  * @color cyan
@@ -73,10 +73,13 @@ function resolveFwBin(projectDir: string): string | null {
73
73
  * @output ctx [order:0] - Weaver context with contextBundle (JSON)
74
74
  * @output onFailure [hidden]
75
75
  */
76
- export function weaverBuildContext(ctx: string): { ctx: string } {
76
+ export async function weaverBuildContext(execute: boolean, ctx: string): Promise<{ onSuccess: boolean; onFailure: boolean; ctx: string }> {
77
+ if (!execute) {
78
+ return { onSuccess: true, onFailure: false, ctx };
79
+ }
77
80
  const context = JSON.parse(ctx) as WeaverContext;
78
81
  const { projectDir } = context.env;
79
- const task = JSON.parse(context.taskJson!) as { mode?: string; targets?: string[] };
82
+ const task = JSON.parse(context.taskJson!) as { mode?: string; targets?: string[]; instruction?: string; title?: string; description?: string };
80
83
  const sections: string[] = [];
81
84
  const needsFw = needsFwContext(context.taskJson);
82
85
  const fwBin = needsFw ? resolveFwBin(projectDir) : null;
@@ -102,13 +105,48 @@ export function weaverBuildContext(ctx: string): { ctx: string } {
102
105
  }
103
106
  } catch { /* non-fatal — memory is best-effort */ }
104
107
 
105
- // Auto-recall learned knowledge from previous bot runs
108
+ // Auto-recall learned knowledge from previous bot runs (with aging caveats + LLM relevance)
106
109
  try {
107
110
  const knowledge = new KnowledgeStore(projectDir);
108
111
  const entries = knowledge.list();
109
112
  if (entries.length > 0) {
110
- const knowledgeLines = entries.map((e: { key: string; value: string }) => `- **${e.key}**: ${e.value}`);
111
- sections.push(`## Learned Knowledge\n\nFacts discovered by previous runs use these instead of re-discovering:\n${knowledgeLines.join('\n')}`);
113
+ const now = Date.now();
114
+ const NINETY_DAYS_MS = 90 * 24 * 60 * 60 * 1000;
115
+
116
+ // Auto-prune entries older than 90 days
117
+ const staleKeys = entries.filter(e => now - e.createdAt > NINETY_DAYS_MS).map(e => e.key);
118
+ for (const key of staleKeys) knowledge.forget(key);
119
+
120
+ let fresh = entries.filter(e => now - e.createdAt <= NINETY_DAYS_MS);
121
+
122
+ // LLM-based relevance selection when > 10 entries
123
+ if (fresh.length > 10) {
124
+ try {
125
+ const { manifest, entries: sorted } = KnowledgeStore.buildManifest(fresh);
126
+ const instruction = task.instruction ?? task.title ?? task.description ?? '';
127
+ const indices = await selectRelevantKnowledge(
128
+ { type: context.env.providerInfo?.type ?? 'anthropic', apiKey: process.env.ANTHROPIC_API_KEY },
129
+ instruction,
130
+ task.mode,
131
+ manifest,
132
+ );
133
+ if (indices && indices.length > 0) {
134
+ fresh = indices.filter(i => i < sorted.length).map(i => sorted[i]);
135
+ }
136
+ // Fallback: if LLM returns null, use all fresh entries (current behavior)
137
+ } catch { /* LLM failure non-fatal — use all entries */ }
138
+ }
139
+
140
+ if (fresh.length > 0) {
141
+ const knowledgeLines = fresh.map((e: { key: string; value: string; createdAt: number }) => {
142
+ const ageDays = Math.floor((now - e.createdAt) / (24 * 60 * 60 * 1000));
143
+ const caveat = ageDays >= 1
144
+ ? ` _(${ageDays}d ago — may be outdated, verify before asserting)_`
145
+ : '';
146
+ return `- **${e.key}**: ${e.value}${caveat}`;
147
+ });
148
+ sections.push(`## Learned Knowledge\n\nFacts discovered by previous runs — use these instead of re-discovering:\n${knowledgeLines.join('\n')}`);
149
+ }
112
150
  }
113
151
  } catch { /* non-fatal — knowledge recall is best-effort */ }
114
152
 
@@ -135,7 +173,7 @@ export function weaverBuildContext(ctx: string): { ctx: string } {
135
173
  if (process.env.WEAVER_VERBOSE) process.stderr.write(`\x1b[2m Context: ${bundle.length} chars\x1b[0m\n`);
136
174
 
137
175
  context.contextBundle = bundle;
138
- return { ctx: JSON.stringify(context) };
176
+ return { onSuccess: true, onFailure: false, ctx: JSON.stringify(context) };
139
177
  }
140
178
 
141
179
  /** Minimal context for modify tasks: grammar + annotations + target sources + referenced node types. */
@@ -10,6 +10,7 @@ import type { WeaverEnv, WeaverContext } from '../bot/types.js';
10
10
  * @color purple
11
11
  * @input env [order:0] - Weaver environment bundle
12
12
  * @input [taskJson] [order:1] - Pre-supplied task (JSON, optional)
13
+ * @input [frozenPromptPrefix] [order:2] [hidden] - Frozen system prompt prefix for cache sharing
13
14
  * @output ctx [order:0] - Weaver context (JSON)
14
15
  * @output onSuccess [order:-2] - On Success
15
16
  * @output onFailure [order:-1] [hidden] - On Failure
@@ -18,11 +19,13 @@ export async function weaverReceiveTask(
18
19
  execute: boolean,
19
20
  env: WeaverEnv,
20
21
  taskJson?: string,
22
+ frozenPromptPrefix?: string,
21
23
  ): Promise<{
22
24
  onSuccess: boolean; onFailure: boolean;
23
25
  ctx: string;
24
26
  }> {
25
27
  const context: WeaverContext = { env, taskJson: '{}', hasTask: false };
28
+ if (frozenPromptPrefix) context.frozenPromptPrefix = frozenPromptPrefix;
26
29
 
27
30
  if (!execute) {
28
31
  return { onSuccess: true, onFailure: false, ctx: JSON.stringify(context) };
@@ -11,6 +11,18 @@ import {
11
11
  } from '@synergenius/flow-weaver/agent';
12
12
  import { createWeaverExecutor } from '../bot/weaver-tools.js';
13
13
 
14
/**
 * Strip `<analysis>...</analysis>` scratchpad blocks from LLM response text.
 * The analysis is a reasoning scaffold that improves verdict quality but
 * should not leak into the parsed JSON output.
 *
 * Every block is removed from `cleaned`, and every non-empty block is kept in
 * `analysis` (joined by a blank line), so the two halves of the result always
 * describe the same set of blocks. Tag matching is case-sensitive and an
 * unclosed `<analysis>` tag is left untouched.
 *
 * @param text - Raw model response.
 * @returns `cleaned`: the trimmed text with all analysis blocks removed;
 *          `analysis`: the trimmed block contents, or `undefined` when there
 *          is no block or every block is empty.
 */
export function stripAnalysis(text: string): { cleaned: string; analysis: string | undefined } {
  const blockPattern = /<analysis>([\s\S]*?)<\/analysis>/g;

  // matchAll works on a clone of the regex, so the shared pattern's lastIndex
  // is not disturbed before the replace() below.
  const bodies: string[] = [];
  for (const match of text.matchAll(blockPattern)) {
    const body = (match[1] ?? '').trim();
    if (body.length > 0) bodies.push(body);
  }

  const cleaned = text.replace(blockPattern, '').trim();
  const analysis = bodies.length > 0 ? bodies.join('\n\n') : undefined;
  return { cleaned, analysis };
}
25
+
14
26
  /**
15
27
  * LLM-powered task completion reviewer.
16
28
  * Makes a single judgment call: did the bot accomplish the assigned task?
@@ -95,7 +107,9 @@ Rate each criterion as PASS or FAIL:
95
107
 
96
108
  If you need to verify file contents to judge the RESULT criterion, use the read_file tool. Only read files if the evidence is ambiguous.
97
109
 
98
- Respond with exactly:
110
+ First, write your reasoning inside <analysis> tags. Work through each criterion step by step, examining the evidence.
111
+
112
+ After your <analysis> block, output only the following JSON — no other text outside the tags:
99
113
  {"pass": true/false, "intent": "PASS/FAIL", "execution": "PASS/FAIL", "result": "PASS/FAIL", "completeness": "PASS/FAIL", "reason": "one sentence summary"}`;
100
114
 
101
115
  try {
@@ -129,11 +143,14 @@ Respond with exactly:
129
143
  { maxIterations: 2 },
130
144
  );
131
145
 
146
+ // Strip <analysis> scratchpad before parsing JSON verdict
147
+ const { cleaned: cleanedSummary } = stripAnalysis(result.summary);
148
+
132
149
  // Parse the structured response
133
150
  let pass = true;
134
151
  let reason = 'Review completed';
135
152
  let criteria: Record<string, string> = {};
136
- const jsonMatch = result.summary.match(/\{[\s\S]*"pass"[\s\S]*\}/);
153
+ const jsonMatch = cleanedSummary.match(/\{[\s\S]*"pass"[\s\S]*\}/);
137
154
  if (jsonMatch) {
138
155
  try {
139
156
  const parsed = JSON.parse(jsonMatch[0]);
@@ -146,14 +163,14 @@ Respond with exactly:
146
163
  } catch {
147
164
  if (jsonMatch[0].includes('"pass": false') || jsonMatch[0].includes('"pass":false')) {
148
165
  pass = false;
149
- reason = result.summary.slice(0, 200);
166
+ reason = cleanedSummary.slice(0, 200);
150
167
  }
151
168
  }
152
169
  } else {
153
- const lower = result.summary.toLowerCase();
170
+ const lower = cleanedSummary.toLowerCase();
154
171
  if (lower.includes('"pass": false') || lower.includes('"pass":false')) {
155
172
  pass = false;
156
- reason = result.summary.slice(0, 200);
173
+ reason = cleanedSummary.slice(0, 200);
157
174
  }
158
175
  }
159
176
 
@@ -0,0 +1,181 @@
1
+ /**
2
+ * Verification agent — independent post-run review of completed work.
3
+ *
4
+ * Uses a fresh provider session with a different model tier to ensure
5
+ * structurally independent review. Only has read_file and run_shell
6
+ * (read-only) — cannot modify the workspace.
7
+ *
8
+ * Produces a structured VerificationResult: pass/fail/inconclusive.
9
+ */
10
+
11
+ import type { VerificationResult, VerificationVerdict } from '../bot/task-types.js';
12
+ import { callAI } from '../bot/ai-client.js';
13
+ import { resolveProviderConfig } from '../bot/agent-provider.js';
14
+ import { resolveModelTier } from '../bot/behavior-defaults.js';
15
+ import { stripAnalysis } from './review-result.js';
16
+ import { CostTracker } from '../bot/cost-tracker.js';
17
+
18
+ // ---------------------------------------------------------------------------
19
+ // Verification prompt
20
+ // ---------------------------------------------------------------------------
21
+
22
// System prompt for the verification call: runVerification passes this string
// to callAI together with the user prompt from buildVerificationPrompt.
// NOTE(review): the text below asks the model to verify by READING files and
// RUNNING commands, yet also forbids every tool call, and the user prompt built
// in this file lists file paths only (no file contents). Presumably the model
// can judge nothing beyond the supplied summary and check results — confirm
// against callAI's tool support and reconcile the wording.
// NOTE(review): the <analysis> section is written as a filled-in block rather
// than as an instruction — confirm the model is meant to emit its own block.
+ const VERIFICATION_SYSTEM_PROMPT = `You are an independent verification agent. Your job is to review
23
+ work completed by another AI agent and determine if it meets the task requirements.
24
+
25
+ You have NO context about what happened during execution. You only see:
26
+ - The task description
27
+ - Files that were created or modified
28
+ - Current state of those files
29
+
30
+ You must verify by READING the actual files and optionally RUNNING read-only
31
+ commands (tests, linting, type checking). You CANNOT and MUST NOT modify any files.
32
+
33
+ CRITICAL: Respond with TEXT ONLY. Do NOT call any tools. Tool calls will be
34
+ REJECTED and will waste your only turn.
35
+
36
+ <analysis>
37
+ Examine the task description, then check each modified/created file.
38
+ Look for:
39
+ 1. Does the code actually implement what was requested?
40
+ 2. Are there obvious bugs, missing error handling, or incomplete implementations?
41
+ 3. Do tests pass? Does the code compile?
42
+ 4. Are there security issues or bad practices?
43
+ </analysis>
44
+
45
+ After your <analysis> block, output ONLY the following JSON:
46
+
47
+ {
48
+ "verdict": "pass" | "fail" | "inconclusive",
49
+ "summary": "One sentence explaining the verdict",
50
+ "issues": ["List of specific issues found (empty if pass)"],
51
+ "filesReviewed": ["List of files you examined"]
52
+ }`;
53
+
54
+ // ---------------------------------------------------------------------------
55
+ // Build user prompt from task + run data
56
+ // ---------------------------------------------------------------------------
57
+
58
/** Everything the verification prompt is built from. */
export interface VerifyTaskInput {
  /** Rendered on the `Title:` line. */
  taskTitle: string;
  /** Rendered on the `Description:` line. */
  taskDescription: string;
  /** Paths listed under "Files Created"; the section is omitted when empty. */
  filesCreated: string[];
  /** Paths listed under "Files Modified"; the section is omitted when empty. */
  filesModified: string[];
  /** Free text placed under "Work Summary". */
  summary: string;
  /**
   * Optional check name → result map. A result of exactly `'pass'` is shown
   * as `PASS`; any other result is shown as its first 200 characters.
   */
  checks?: Record<string, string>;
}

/**
 * Render the user-facing prompt for the verification agent.
 *
 * The output is a newline-joined markdown document: task header, work summary,
 * optional file lists, optional automated check results, then fixed
 * instructions. Optional sections appear only when they have at least one entry.
 *
 * @param input - Task metadata and run results to describe.
 * @returns The complete prompt text.
 */
export function buildVerificationPrompt(input: VerifyTaskInput): string {
  // A heading followed by one bullet per item and a blank separator line,
  // or nothing at all when there are no items.
  const bulletSection = (heading: string, bullets: string[]): string[] =>
    bullets.length > 0 ? [heading, ...bullets, ''] : [];

  const asBullets = (paths: string[]): string[] => paths.map((path) => `- ${path}`);

  const checkBullets = Object.entries(input.checks ?? {}).map(
    ([checkName, outcome]) => `- ${checkName}: ${outcome === 'pass' ? 'PASS' : outcome.slice(0, 200)}`,
  );

  const document: string[] = [
    `## Task to Verify`,
    `Title: ${input.taskTitle}`,
    `Description: ${input.taskDescription}`,
    '',
    `## Work Summary`,
    input.summary,
    '',
    ...bulletSection(`## Files Created`, asBullets(input.filesCreated)),
    ...bulletSection(`## Files Modified`, asBullets(input.filesModified)),
    ...bulletSection(`## Automated Check Results`, checkBullets),
    `## Instructions`,
    'Read the files listed above and verify the work meets the task requirements.',
    'Output your analysis in <analysis> tags, then the JSON verdict.',
  ];

  return document.join('\n');
}
103
+
104
+ // ---------------------------------------------------------------------------
105
+ // Parse verification response
106
+ // ---------------------------------------------------------------------------
107
+
108
/**
 * Locate the first parseable JSON object in `text` that ends at `lastClose`.
 * Starts at `firstOpen` and, on a parse failure, retries from each later `{`.
 */
function findVerdictObject(text: string, firstOpen: number, lastClose: number): Record<string, unknown> | undefined {
  for (let open = firstOpen; open !== -1 && open < lastClose; open = text.indexOf('{', open + 1)) {
    try {
      const candidate: unknown = JSON.parse(text.slice(open, lastClose + 1));
      if (typeof candidate === 'object' && candidate !== null && !Array.isArray(candidate)) {
        return candidate as Record<string, unknown>;
      }
    } catch {
      // Not valid JSON from this brace — a stray `{` in surrounding prose; try the next one.
    }
  }
  return undefined;
}

/**
 * Parse the verification agent's raw reply into a structured verdict.
 *
 * `<analysis>` blocks are removed with `stripAnalysis` first. The verdict
 * object is then searched for between the first `{` and the last `}` of the
 * remaining text; if that span is not valid JSON (for example because prose
 * before the object contains a brace), later opening braces are tried in turn.
 *
 * The verdict string is matched case-insensitively after trimming, so
 * `"PASS"` is accepted as `pass`. Anything unrecognised becomes
 * `inconclusive`. Non-string entries in `issues` / `filesReviewed` are dropped.
 *
 * @param response - Raw text returned by the verification model.
 * @returns The verdict fields (without `verifiedAt` and `cost`). The verdict
 *          is `inconclusive` with an explanatory issue when no brace pair is
 *          present or no JSON object can be parsed.
 */
export function parseVerificationResponse(response: string): Omit<VerificationResult, 'verifiedAt' | 'cost'> {
  const { cleaned } = stripAnalysis(response);

  // Same precondition the greedy /\{[\s\S]*\}/ match had: a `{` followed by a `}`.
  const firstOpen = cleaned.indexOf('{');
  const lastClose = cleaned.lastIndexOf('}');
  if (firstOpen === -1 || lastClose < firstOpen) {
    return {
      verdict: 'inconclusive',
      summary: 'Failed to parse verification response',
      issues: ['Verification agent did not produce valid JSON'],
      filesReviewed: [],
    };
  }

  const parsed = findVerdictObject(cleaned, firstOpen, lastClose);
  if (parsed === undefined) {
    return {
      verdict: 'inconclusive',
      summary: 'Failed to parse verification JSON',
      issues: ['JSON parse error in verification response'],
      filesReviewed: [],
    };
  }

  const rawVerdict = typeof parsed.verdict === 'string' ? parsed.verdict.trim().toLowerCase() : '';
  const verdict: VerificationVerdict =
    rawVerdict === 'pass' || rawVerdict === 'fail' || rawVerdict === 'inconclusive'
      ? rawVerdict
      : 'inconclusive';

  const onlyStrings = (value: unknown): string[] =>
    Array.isArray(value) ? value.filter((item: unknown): item is string => typeof item === 'string') : [];

  return {
    verdict,
    summary: typeof parsed.summary === 'string' ? parsed.summary : 'No summary provided',
    issues: onlyStrings(parsed.issues),
    filesReviewed: onlyStrings(parsed.filesReviewed),
  };
}
144
+
145
+ // ---------------------------------------------------------------------------
146
+ // Run verification
147
+ // ---------------------------------------------------------------------------
148
+
149
/**
 * Run one verification pass over a completed task.
 *
 * Builds the user prompt from `input`, sends it with
 * `VERIFICATION_SYSTEM_PROMPT` through `callAI` (2048 passed as the last
 * argument), parses the reply, and attaches a timestamp plus a cost estimate.
 *
 * @param input - Task metadata and run results to verify.
 * @param providerType - Provider kind; asserted (not validated) to be
 *        'anthropic', 'claude-cli' or 'platform'.
 * @param tier - Used verbatim as the model identifier for both the call and
 *        the cost estimate. NOTE(review): the name suggests a tier that should
 *        be resolved to a model first — confirm with callers.
 * @param apiKey - Optional API key forwarded to the provider.
 * @returns The parsed verdict with `verifiedAt` (ISO timestamp) and `cost`.
 *          Errors thrown by `callAI` propagate to the caller.
 */
export async function runVerification(
  input: VerifyTaskInput,
  providerType: string,
  tier: string,
  apiKey?: string,
): Promise<VerificationResult> {
  const provider = {
    type: providerType as 'anthropic' | 'claude-cli' | 'platform',
    apiKey,
    model: tier,
  };

  const prompt = buildVerificationPrompt(input);
  const reply = await callAI(provider, VERIFICATION_SYSTEM_PROMPT, prompt, 2048);
  const verdictFields = parseVerificationResponse(reply);

  // No usage data is read from the call here, so tokens are approximated
  // at four characters per token.
  const approxTokens = (charCount: number): number => Math.round(charCount / 4);
  const cost = CostTracker.estimateCost(tier, {
    inputTokens: approxTokens(VERIFICATION_SYSTEM_PROMPT.length + prompt.length),
    outputTokens: approxTokens(reply.length),
  });

  return { ...verdictFields, verifiedAt: new Date().toISOString(), cost };
}