prism-mcp-server 7.3.1 → 7.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -21,10 +21,15 @@ import { getStorage } from '../storage/index.js';
21
21
  import { VALID_ACTION_TYPES } from './schema.js';
22
22
  import { SafetyController } from './safetyController.js';
23
23
  import { invokeClawAgent } from './clawInvocation.js';
24
- import { PRISM_DARK_FACTORY_POLL_MS, PRISM_DARK_FACTORY_MAX_RUNTIME_MS, PRISM_USER_ID } from '../config.js';
24
+ import { PRISM_DARK_FACTORY_POLL_MS, PRISM_DARK_FACTORY_MAX_RUNTIME_MS, PRISM_USER_ID, PRISM_VERIFICATION_LAYERS, PRISM_VERIFICATION_DEFAULT_SEVERITY } from '../config.js';
25
25
  import { debugLog } from '../utils/logger.js';
26
26
  import path from 'path';
27
27
  import fs from 'fs';
28
+ import * as crypto from 'crypto';
29
+ import { Gatekeeper } from '../verification/gatekeeper.js';
30
+ import { VerificationRunner } from '../verification/runner.js';
31
+ import { computeRubricHash } from '../verification/schema.js';
32
+ import { VerificationGateError } from '../errors.js';
28
33
  /** Interval handle for graceful shutdown */
29
34
  let runnerInterval = null;
30
35
  /** Tracks whether the runner is currently processing a tick (prevents overlap) */
@@ -186,25 +191,21 @@ async function emitExperienceEvent(pipeline, eventType, outcome) {
186
191
  *
187
192
  * @internal Exported for unit testing only. Not part of the public API.
188
193
  */
189
- export function parseExecuteOutput(raw) {
194
+ function extractJsonFromLlmOutput(raw) {
190
195
  if (!raw || typeof raw !== 'string' || raw.trim() === '') {
191
- return { parsed: null, error: 'JSON Parse Error: empty or non-string input' };
196
+ return { json: null, error: 'JSON Parse Error: empty or non-string input' };
192
197
  }
193
198
  const cleaned = raw.trim();
194
199
  let jsonCandidate = null;
195
- // Strategy 1: Try raw trimmed input as-is
196
200
  if (cleaned.startsWith('{')) {
197
201
  jsonCandidate = cleaned;
198
202
  }
199
- // Strategy 2: Strip markdown code fences
200
203
  if (!jsonCandidate) {
201
- // Match ```json or ``` blocks anywhere in the text (not just start/end of string)
202
204
  const fenceMatch = cleaned.match(/```(?:json)?\s*\n?([\s\S]*?)\n?\s*```/);
203
205
  if (fenceMatch) {
204
206
  jsonCandidate = fenceMatch[1].trim();
205
207
  }
206
208
  }
207
- // Strategy 3: Brace extraction — find first { to last }
208
209
  if (!jsonCandidate) {
209
210
  const firstBrace = cleaned.indexOf('{');
210
211
  const lastBrace = cleaned.lastIndexOf('}');
@@ -213,17 +214,21 @@ export function parseExecuteOutput(raw) {
213
214
  }
214
215
  }
215
216
  if (!jsonCandidate) {
216
- return { parsed: null, error: 'JSON Parse Error: no JSON object found in LLM output' };
217
+ return { json: null, error: 'JSON Parse Error: no JSON object found in LLM output' };
217
218
  }
218
- // Attempt JSON parse
219
+ return { json: jsonCandidate, error: null };
220
+ }
221
+ export function parseExecuteOutput(raw) {
222
+ const ext = extractJsonFromLlmOutput(raw);
223
+ if (ext.error || !ext.json)
224
+ return { parsed: null, error: ext.error };
219
225
  let parsed;
220
226
  try {
221
- parsed = JSON.parse(jsonCandidate);
227
+ parsed = JSON.parse(ext.json);
222
228
  }
223
229
  catch {
224
230
  return { parsed: null, error: 'JSON Parse Error: LLM output is not valid JSON' };
225
231
  }
226
- // Shape validation: must be an object with an 'actions' array
227
232
  if (!parsed || typeof parsed !== 'object' || Array.isArray(parsed)) {
228
233
  return { parsed: null, error: 'Shape Error: output is not a JSON object' };
229
234
  }
@@ -231,7 +236,6 @@ export function parseExecuteOutput(raw) {
231
236
  return { parsed: null, error: 'Shape Error: output missing required "actions" array' };
232
237
  }
233
238
  const result = parsed;
234
- // Validate each action in the array
235
239
  for (let i = 0; i < result.actions.length; i++) {
236
240
  const action = result.actions[i];
237
241
  if (!action || typeof action !== 'object' || Array.isArray(action)) {
@@ -246,6 +250,62 @@ export function parseExecuteOutput(raw) {
246
250
  }
247
251
  return { parsed: result, error: null };
248
252
  }
253
+ export function parseContractOutput(raw) {
254
+ const ext = extractJsonFromLlmOutput(raw);
255
+ if (ext.error || !ext.json)
256
+ return { parsed: null, error: ext.error };
257
+ let parsed;
258
+ try {
259
+ parsed = JSON.parse(ext.json);
260
+ }
261
+ catch {
262
+ return { parsed: null, error: 'JSON Parse Error: LLM output is not valid JSON' };
263
+ }
264
+ if (!parsed || typeof parsed !== 'object' || !Array.isArray(parsed.criteria)) {
265
+ return { parsed: null, error: 'Shape Error: output missing required "criteria" array' };
266
+ }
267
+ // Validate each criterion element has the required string fields
268
+ for (let i = 0; i < parsed.criteria.length; i++) {
269
+ const c = parsed.criteria[i];
270
+ if (!c || typeof c !== 'object' || typeof c.id !== 'string' || typeof c.description !== 'string') {
271
+ return { parsed: null, error: `Shape Error: criteria[${i}] must have string "id" and "description"` };
272
+ }
273
+ }
274
+ return { parsed: parsed, error: null };
275
+ }
276
+ export function parseEvaluationOutput(raw) {
277
+ const ext = extractJsonFromLlmOutput(raw);
278
+ if (ext.error || !ext.json)
279
+ return { parsed: null, error: ext.error };
280
+ let parsed;
281
+ try {
282
+ parsed = JSON.parse(ext.json);
283
+ }
284
+ catch {
285
+ return { parsed: null, error: 'JSON Parse Error: LLM output is not valid JSON' };
286
+ }
287
+ if (!parsed || typeof parsed !== 'object' || typeof parsed.pass !== 'boolean') {
288
+ return { parsed: null, error: 'Shape Error: output missing required "pass" boolean' };
289
+ }
290
+ const p = parsed;
291
+ if (p.findings !== undefined) {
292
+ if (!Array.isArray(p.findings)) {
293
+ return { parsed: null, error: 'Shape Error: "findings" must be an array when present' };
294
+ }
295
+ // Fix #3: Each failing finding must supply an evidence object so the
296
+ // Evaluator cannot submit bare severity claims without evidence pointers.
297
+ for (let i = 0; i < p.findings.length; i++) {
298
+ const f = p.findings[i];
299
+ if (!f || typeof f !== 'object') {
300
+ return { parsed: null, error: `Shape Error: findings[${i}] must be an object` };
301
+ }
302
+ if (f.pass_fail === false && (!f.evidence || typeof f.evidence !== 'object')) {
303
+ return { parsed: null, error: `Shape Error: findings[${i}] is missing required "evidence" object for failure` };
304
+ }
305
+ }
306
+ }
307
+ return { parsed: parsed, error: null };
308
+ }
249
309
  // ─── Step Execution ────────────────────────────────────────────
250
310
  /**
251
311
  * Execute a single step of the pipeline.
@@ -268,8 +328,8 @@ async function executeStep(pipeline, spec) {
268
328
  // - BYOM model override
269
329
  // - Timeout enforcement
270
330
  const { success, resultText } = await invokeClawAgent(spec, pipeline);
271
- // For non-EXECUTE steps, return as-is (free-form text)
272
- if (step !== 'EXECUTE') {
331
+ // For non-JSON steps, return as-is (free-form text)
332
+ if (step !== 'EXECUTE' && step !== 'PLAN_CONTRACT' && step !== 'EVALUATE') {
273
333
  return {
274
334
  iteration: pipeline.iteration,
275
335
  step,
@@ -279,7 +339,6 @@ async function executeStep(pipeline, spec) {
279
339
  notes: resultText.slice(0, 2000),
280
340
  };
281
341
  }
282
- // ── v7.3.1: EXECUTE step — parse and validate structured output ──
283
342
  if (!success) {
284
343
  // LLM invocation itself failed (timeout, error, etc.)
285
344
  return {
@@ -291,7 +350,59 @@ async function executeStep(pipeline, spec) {
291
350
  notes: `LLM invocation failed: ${resultText.slice(0, 500)}`,
292
351
  };
293
352
  }
294
- // Parse the structured JSON output
353
+ // Parse appropriate JSON output depending on step
354
+ if (step === 'PLAN_CONTRACT') {
355
+ const { parsed, error: parseError } = parseContractOutput(resultText);
356
+ if (parseError || !parsed) {
357
+ debugLog(`[DarkFactory] PLAN_CONTRACT output parse failure: ${parseError}`);
358
+ return {
359
+ iteration: pipeline.iteration,
360
+ step,
361
+ started_at: stepStart,
362
+ completed_at: new Date().toISOString(),
363
+ success: false,
364
+ notes: parseError || 'Unknown parse error',
365
+ };
366
+ }
367
+ return {
368
+ iteration: pipeline.iteration,
369
+ step,
370
+ started_at: stepStart,
371
+ completed_at: new Date().toISOString(),
372
+ success: true,
373
+ notes: `Contract accepted with ${parsed.criteria.length} criteria.`,
374
+ contractPayload: parsed, // Passthrough for runner to write to disk
375
+ };
376
+ }
377
+ if (step === 'EVALUATE') {
378
+ const { parsed, error: parseError } = parseEvaluationOutput(resultText);
379
+ if (parseError || !parsed) {
380
+ debugLog(`[DarkFactory] EVALUATE output parse failure: ${parseError}`);
381
+ return {
382
+ iteration: pipeline.iteration,
383
+ step,
384
+ started_at: stepStart,
385
+ completed_at: new Date().toISOString(),
386
+ success: false,
387
+ notes: parseError || 'Unknown parse error',
388
+ };
389
+ }
390
+ // Fix #2: Serialize findings array into notes so the Generator's retry
391
+ // prompt receives the full line-by-line critique, not just a summary string.
392
+ const findingsText = parsed.findings && parsed.findings.length > 0
393
+ ? '\nFindings:\n' + parsed.findings.map((f) => `- [${f.severity}] Criterion ${f.criterion_id}: ${f.evidence?.description || 'Failed'} (${f.evidence?.file || 'unknown'}:${f.evidence?.line ?? '?'})`).join('\n')
394
+ : '';
395
+ return {
396
+ iteration: pipeline.iteration,
397
+ step,
398
+ started_at: stepStart,
399
+ completed_at: new Date().toISOString(),
400
+ success: parsed.pass,
401
+ notes: (parsed.notes || `Evaluation complete: ${parsed.pass ? 'PASS' : 'FAIL'}`) + findingsText,
402
+ evaluationPayload: parsed, // Passthrough for orchestrator logic
403
+ };
404
+ }
405
+ // EXECUTE
295
406
  const { parsed, error: parseError } = parseExecuteOutput(resultText);
296
407
  if (parseError || !parsed) {
297
408
  debugLog(`[DarkFactory] EXECUTE output parse failure: ${parseError}`);
@@ -482,11 +593,152 @@ async function runnerTick() {
482
593
  await emitExperienceEvent(pipeline, 'failure', `Scope violation: ${result.scopeViolation}`);
483
594
  return;
484
595
  }
485
- // Determine next step based on result
486
596
  const currentStep = pipeline.current_step;
487
- const nextStep = SafetyController.getNextStep(currentStep, pipeline.iteration, spec, result.success // For VERIFY step: success means tests passed
488
- );
489
- if (nextStep === null || currentStep === 'FINALIZE') {
597
+ // ── Phase 4: Verification Pipeline Orchestrator ──
598
+ if (currentStep === 'VERIFY' && spec.workingDirectory) {
599
+ const harnessPath = path.join(path.resolve(spec.workingDirectory), 'verification_harness.json');
600
+ if (fs.existsSync(harnessPath)) {
601
+ try {
602
+ const rawHarness = fs.readFileSync(harnessPath, 'utf8');
603
+ const harnessData = JSON.parse(rawHarness);
604
+ // GAP-5 fix: Persist the harness so CLI drift detection works for DarkFactory runs
605
+ const rubricHash = computeRubricHash(harnessData.tests);
606
+ const harness = {
607
+ ...harnessData,
608
+ project: pipeline.project,
609
+ conversation_id: `dark-factory-${pipeline.id}`,
610
+ created_at: new Date().toISOString(),
611
+ rubric_hash: rubricHash,
612
+ };
613
+ await storage.saveVerificationHarness(harness, pipeline.user_id);
614
+ // GAP-2 fix: Build VerificationConfig from env vars so PRISM_VERIFICATION_LAYERS
615
+ // and PRISM_VERIFICATION_DEFAULT_SEVERITY are respected in DarkFactory pipelines
616
+ const vConfig = {
617
+ enabled: true,
618
+ layers: PRISM_VERIFICATION_LAYERS,
619
+ default_severity: PRISM_VERIFICATION_DEFAULT_SEVERITY,
620
+ };
621
+ const verificationResult = await VerificationRunner.runSuite(rawHarness, {
622
+ harness,
623
+ layers: PRISM_VERIFICATION_LAYERS,
624
+ config: vConfig,
625
+ });
626
+ const coverageScore = verificationResult.total > 0 ? (verificationResult.total - verificationResult.skipped_count) / verificationResult.total : 0;
627
+ const executedCount = verificationResult.total - verificationResult.skipped_count;
628
+ const passRate = executedCount > 0 ? verificationResult.passed_count / executedCount : 0;
629
+ // GAP-4 fix: Use proper ValidationResult type instead of `any`
630
+ const valResult = {
631
+ id: crypto.randomUUID(),
632
+ rubric_hash: rubricHash,
633
+ project: pipeline.project,
634
+ conversation_id: `dark-factory-${pipeline.id}`,
635
+ run_at: new Date().toISOString(),
636
+ passed: passRate >= harnessData.min_pass_rate && verificationResult.severity_gate.action !== "abort",
637
+ pass_rate: passRate,
638
+ critical_failures: verificationResult.severity_gate.failed_assertions.length,
639
+ coverage_score: coverageScore,
640
+ result_json: JSON.stringify(verificationResult),
641
+ gate_action: verificationResult.severity_gate.action,
642
+ gate_override: false,
643
+ };
644
+ const { canContinue, validatedResult } = Gatekeeper.executeGate(valResult);
645
+ await storage.saveVerificationRun(validatedResult, pipeline.user_id);
646
+ // GAP-3 fix: Emit verification experience event for ML routing feedback
647
+ try {
648
+ const confidenceScore = Math.round(passRate * 100);
649
+ await storage.saveLedger({
650
+ project: pipeline.project,
651
+ conversation_id: `dark-factory-${pipeline.id}`,
652
+ user_id: pipeline.user_id,
653
+ event_type: 'validation_result',
654
+ summary: `[VERIFY] ${verificationResult.passed_count}/${verificationResult.total} passed (gate: ${verificationResult.severity_gate.action})`,
655
+ keywords: ['dark-factory', 'verification', pipeline.project],
656
+ importance: verificationResult.severity_gate.action === 'abort' ? 2 : 0,
657
+ confidence_score: confidenceScore,
658
+ });
659
+ }
660
+ catch { /* experience events are advisory — never block execution */ }
661
+ if (!canContinue) {
662
+ result.success = false;
663
+ result.notes = (result.notes ? result.notes + '\n\n' : '') + `[GATE BLOCKED] Pipeline verification runner failed the security gate.`;
664
+ }
665
+ else {
666
+ result.success = result.success && validatedResult.passed;
667
+ }
668
+ }
669
+ catch (err) {
670
+ if (err instanceof VerificationGateError) {
671
+ debugLog(`[DarkFactory] Pipeline ${pipeline.id} ABORTED by Verification Gate.`);
672
+ try {
673
+ await storage.savePipeline({
674
+ ...pipeline,
675
+ status: 'FAILED',
676
+ error: `[GATE ABORT] ${err.message}`,
677
+ });
678
+ }
679
+ catch { /* Status guard */ }
680
+ await emitExperienceEvent(pipeline, 'failure', `[GATE ABORT] ${err.message}`);
681
+ return;
682
+ }
683
+ else {
684
+ console.error(`[DarkFactory] Verification harness crash: ${err.message}`);
685
+ result.success = false;
686
+ result.notes = `[GATE CRASH] Verification suite failed to execute: ${err.message}`;
687
+ }
688
+ }
689
+ }
690
+ }
691
+ if (currentStep === 'PLAN_CONTRACT' && spec.workingDirectory && result.success && result.contractPayload) {
692
+ const contractPath = path.join(path.resolve(spec.workingDirectory), 'contract_rubric.json');
693
+ try {
694
+ fs.writeFileSync(contractPath, JSON.stringify(result.contractPayload, null, 2), 'utf8');
695
+ debugLog(`[DarkFactory] contract_rubric.json written to ${contractPath}`);
696
+ }
697
+ catch (writeErr) {
698
+ // Disk/permissions error — fail the pipeline immediately so it doesn't
699
+ // loop on PLAN_CONTRACT forever (each tick would re-attempt the write).
700
+ debugLog(`[DarkFactory] Failed to write contract_rubric.json: ${writeErr.message}`);
701
+ try {
702
+ await storage.savePipeline({
703
+ ...pipeline,
704
+ status: 'FAILED',
705
+ error: `PLAN_CONTRACT failed: could not write contract_rubric.json — ${writeErr.message}`,
706
+ });
707
+ }
708
+ catch { /* status guard */ }
709
+ await emitExperienceEvent(pipeline, 'failure', `contract_rubric.json write failed: ${writeErr.message}`);
710
+ return;
711
+ }
712
+ }
713
+ if (currentStep === 'EVALUATE' && result.evaluationPayload) {
714
+ // Emit ML learning event for evaluation outcome.
715
+ // Using 'learning' (valid LedgerEntry event type) rather than
716
+ // a non-existent 'evaluation_result' to avoid runtime cast issues.
717
+ try {
718
+ await storage.saveLedger({
719
+ project: pipeline.project,
720
+ conversation_id: `dark-factory-${pipeline.id}`,
721
+ user_id: pipeline.user_id,
722
+ event_type: 'learning',
723
+ summary: `[EVALUATE] ${result.success ? 'PASS' : 'FAIL'} on iter ${pipeline.iteration} rev ${pipeline.eval_revisions ?? 0}`,
724
+ keywords: ['dark-factory', 'evaluation', pipeline.project],
725
+ importance: result.success ? 3 : 1,
726
+ confidence_score: result.success ? 90 : 50,
727
+ });
728
+ }
729
+ catch { /* advisory — never block execution */ }
730
+ }
731
+ // ─── Determine plan_viable from evaluation payload ───
732
+ // Default to false (conservative): a parse failure or missing payload means
733
+ // we don't know if the plan is viable, so escalate to PLAN re-planning
734
+ // rather than burning eval_revisions on more EXECUTE retries.
735
+ let evalPlanViable = false;
736
+ if (currentStep === 'EVALUATE' && result.evaluationPayload) {
737
+ // plan_viable defaults false if null/missing (same conservative principle)
738
+ evalPlanViable = result.evaluationPayload.plan_viable ?? false;
739
+ }
740
+ const nextStepInfo = SafetyController.getNextStep(pipeline, spec, result.success, evalPlanViable);
741
+ if (nextStepInfo === null || currentStep === 'FINALIZE') {
490
742
  // Pipeline complete — determine final status
491
743
  const finalStatus = result.success ? 'COMPLETED' : 'FAILED';
492
744
  const finalError = result.success ? null : `Pipeline ended at step=${currentStep}: ${result.notes?.slice(0, 500)}`;
@@ -514,13 +766,25 @@ async function runnerTick() {
514
766
  debugLog(`[DarkFactory] Pipeline ${pipeline.id} finished: ${finalStatus}`);
515
767
  }
516
768
  else {
517
- // Advance to next step
518
769
  try {
770
+ const updatedPayload = currentStep === 'PLAN_CONTRACT' && result.contractPayload
771
+ ? result.contractPayload
772
+ : pipeline.contract_payload;
773
+ // Forward the most informative notes available:
774
+ // EXECUTE notes = what the generator did
775
+ // EVALUATE notes = what the evaluator found
776
+ // Other steps: preserve existing notes
777
+ const updatedNotes = (currentStep === 'EXECUTE' || currentStep === 'EVALUATE') && result.notes
778
+ ? result.notes
779
+ : pipeline.notes;
519
780
  await storage.savePipeline({
520
781
  ...pipeline,
521
- current_step: nextStep.step,
522
- iteration: nextStep.iteration,
782
+ current_step: nextStepInfo.step,
783
+ iteration: nextStepInfo.iteration,
784
+ eval_revisions: nextStepInfo.eval_revisions,
523
785
  last_heartbeat: new Date().toISOString(),
786
+ contract_payload: updatedPayload,
787
+ notes: updatedNotes,
524
788
  });
525
789
  }
526
790
  catch (err) {
@@ -531,7 +795,7 @@ async function runnerTick() {
531
795
  }
532
796
  throw err;
533
797
  }
534
- debugLog(`[DarkFactory] Pipeline ${pipeline.id} advanced: ${currentStep} → ${nextStep.step} (iter ${nextStep.iteration})`);
798
+ debugLog(`[DarkFactory] Pipeline ${pipeline.id} advanced: ${currentStep} → ${nextStepInfo.step} (iter ${nextStepInfo.iteration}, rev ${nextStepInfo.eval_revisions ?? 0})`);
535
799
  }
536
800
  }
537
801
  catch (err) {
@@ -1,4 +1,4 @@
1
- import { VALID_ACTION_TYPES } from './schema.js';
1
+ import { VALID_ACTION_TYPES, DEFAULT_MAX_REVISIONS } from './schema.js';
2
2
  import { PRISM_DARK_FACTORY_MAX_RUNTIME_MS } from '../config.js';
3
3
  import { debugLog } from '../utils/logger.js';
4
4
  import path from 'path';
@@ -31,13 +31,6 @@ export class SafetyController {
31
31
  'COMPLETED': [], // Terminal — no exits
32
32
  'FAILED': ['RUNNING'], // Allow retry from failed state
33
33
  };
34
- /**
35
- * Legal step transitions for the pipeline execution state machine.
36
- * FINALIZE is entered from VERIFY when iteration == maxIterations or success.
37
- */
38
- static STEP_ORDER = [
39
- 'INIT', 'PLAN', 'EXECUTE', 'VERIFY', 'FINALIZE'
40
- ];
41
34
  /**
42
35
  * Prevents runaway LLM invocation loops by enforcing the max iteration envelope.
43
36
  */
@@ -147,8 +140,15 @@ export class SafetyController {
147
140
  * Used by clawInvocation.ts instead of inline prompt construction.
148
141
  */
149
142
  static generateBoundaryPrompt(spec, state) {
143
+ let modeDescription = 'an autonomous code agent';
144
+ if (state.current_step === 'PLAN_CONTRACT' || state.current_step === 'EVALUATE') {
145
+ modeDescription = 'an ADVERSARIAL EVALUATOR enforcing strict quality constraints against a generated output';
146
+ }
147
+ else if (state.current_step === 'EXECUTE') {
148
+ modeDescription = 'a GENERATOR executing code constrained by a strict rubric';
149
+ }
150
150
  const lines = [
151
- `You are Prism Dark Factory, operating in the background as an autonomous code agent.`,
151
+ `You are Prism Dark Factory, operating in the background as ${modeDescription}.`,
152
152
  `You are strictly limited to code actions within the defined scope.`,
153
153
  ``,
154
154
  `── Operational Boundaries ──`,
@@ -156,6 +156,7 @@ export class SafetyController {
156
156
  `Project: ${state.project}`,
157
157
  `Current Step: ${state.current_step}`,
158
158
  `Iteration: ${state.iteration} / ${spec.maxIterations}`,
159
+ `Revision: ${state.eval_revisions ?? 0} / ${spec.maxRevisions ?? DEFAULT_MAX_REVISIONS}`,
159
160
  `Restricted Workspace: ${spec.workingDirectory || '(unrestricted)'}`,
160
161
  ];
161
162
  if (spec.contextFiles && spec.contextFiles.length > 0) {
@@ -164,29 +165,54 @@ export class SafetyController {
164
165
  lines.push(``, `── Objective ──`, spec.objective, ``, `── Safety Rules ──`, `1. Do NOT modify files outside the Restricted Workspace.`, `2. Do NOT make network requests unless the objective explicitly requires it.`, `3. Do NOT execute destructive operations (rm -rf, DROP TABLE, etc.).`, `4. Respond ONLY with actions relevant to the current step.`, `5. If you cannot complete the step, explain why and stop.`);
165
166
  return lines.join('\n');
166
167
  }
167
- /**
168
- * Determine the next step in the pipeline execution sequence.
169
- * Returns null if the pipeline should terminate (FINALIZE reached or iteration exceeded).
170
- */
171
- static getNextStep(currentStep, iteration, spec, verifyPassed) {
168
+ static getNextStep(state, spec, stepPassed, planViable = true) {
169
+ const currentStep = state.current_step;
170
+ const iteration = state.iteration;
171
+ const eval_revisions = state.eval_revisions ?? 0;
172
172
  switch (currentStep) {
173
173
  case 'INIT':
174
- return { step: 'PLAN', iteration };
174
+ return { step: 'PLAN', iteration, eval_revisions };
175
175
  case 'PLAN':
176
- return { step: 'EXECUTE', iteration };
176
+ return { step: 'PLAN_CONTRACT', iteration, eval_revisions };
177
+ case 'PLAN_CONTRACT':
178
+ return { step: 'EXECUTE', iteration, eval_revisions };
177
179
  case 'EXECUTE':
178
- return { step: 'VERIFY', iteration };
180
+ return { step: 'EVALUATE', iteration, eval_revisions };
181
+ case 'EVALUATE':
182
+ if (stepPassed) {
183
+ // Contract passed, move to VERIFY
184
+ return { step: 'VERIFY', iteration, eval_revisions: 0 };
185
+ }
186
+ // Contract failed.
187
+ if (planViable) {
188
+ // Fall back to EXECUTE but increment revision counter
189
+ const nextRevision = eval_revisions + 1;
190
+ const maxRev = spec.maxRevisions ?? DEFAULT_MAX_REVISIONS;
191
+ if (nextRevision >= maxRev) {
192
+ // Exceeded max revisions — pipeline fails
193
+ return null;
194
+ }
195
+ return { step: 'EXECUTE', iteration, eval_revisions: nextRevision };
196
+ }
197
+ else {
198
+ // Fall back all the way to PLAN
199
+ const nextIteration = iteration + 1;
200
+ if (!SafetyController.validateIterationLimit(nextIteration, spec)) {
201
+ return null;
202
+ }
203
+ return { step: 'PLAN', iteration: nextIteration, eval_revisions: 0 };
204
+ }
179
205
  case 'VERIFY':
180
- if (verifyPassed) {
181
- return { step: 'FINALIZE', iteration };
206
+ if (stepPassed) {
207
+ return { step: 'FINALIZE', iteration, eval_revisions };
182
208
  }
183
209
  // Verification failed — loop back to PLAN with incremented iteration
184
- const nextIteration = iteration + 1;
185
- if (!SafetyController.validateIterationLimit(nextIteration, spec)) {
210
+ const nextIterationVerify = iteration + 1;
211
+ if (!SafetyController.validateIterationLimit(nextIterationVerify, spec)) {
186
212
  // Exceeded max iterations — force finalize with failure
187
213
  return null;
188
214
  }
189
- return { step: 'PLAN', iteration: nextIteration };
215
+ return { step: 'PLAN', iteration: nextIterationVerify, eval_revisions: 0 };
190
216
  case 'FINALIZE':
191
217
  return null; // Terminal step
192
218
  default:
@@ -2,3 +2,5 @@
2
2
  export const VALID_ACTION_TYPES = [
3
3
  'READ_FILE', 'WRITE_FILE', 'PATCH_FILE', 'RUN_TEST'
4
4
  ];
5
+ /** Default max adversarial revisions per EXECUTE phase (used when spec.maxRevisions is unset). */
6
+ export const DEFAULT_MAX_REVISIONS = 3;