planpong 0.3.0 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. package/dist/src/config/defaults.js +1 -0
  2. package/dist/src/config/defaults.js.map +1 -1
  3. package/dist/src/config/loader.d.ts +1 -0
  4. package/dist/src/config/loader.js +3 -0
  5. package/dist/src/config/loader.js.map +1 -1
  6. package/dist/src/core/apply-edits.d.ts +40 -0
  7. package/dist/src/core/apply-edits.js +220 -0
  8. package/dist/src/core/apply-edits.js.map +1 -0
  9. package/dist/src/core/convergence.d.ts +57 -4
  10. package/dist/src/core/convergence.js +134 -6
  11. package/dist/src/core/convergence.js.map +1 -1
  12. package/dist/src/core/loop.js +3 -3
  13. package/dist/src/core/loop.js.map +1 -1
  14. package/dist/src/core/operations.d.ts +14 -1
  15. package/dist/src/core/operations.js +592 -56
  16. package/dist/src/core/operations.js.map +1 -1
  17. package/dist/src/core/plan-diff.d.ts +23 -0
  18. package/dist/src/core/plan-diff.js +135 -0
  19. package/dist/src/core/plan-diff.js.map +1 -0
  20. package/dist/src/core/session.d.ts +11 -0
  21. package/dist/src/core/session.js +51 -1
  22. package/dist/src/core/session.js.map +1 -1
  23. package/dist/src/mcp/tools/get-feedback.d.ts +16 -0
  24. package/dist/src/mcp/tools/get-feedback.js +118 -114
  25. package/dist/src/mcp/tools/get-feedback.js.map +1 -1
  26. package/dist/src/mcp/tools/revise.d.ts +16 -0
  27. package/dist/src/mcp/tools/revise.js +76 -61
  28. package/dist/src/mcp/tools/revise.js.map +1 -1
  29. package/dist/src/mcp/tools/status.js +15 -1
  30. package/dist/src/mcp/tools/status.js.map +1 -1
  31. package/dist/src/prompts/planner.d.ts +34 -1
  32. package/dist/src/prompts/planner.js +272 -17
  33. package/dist/src/prompts/planner.js.map +1 -1
  34. package/dist/src/prompts/reviewer.d.ts +14 -1
  35. package/dist/src/prompts/reviewer.js +84 -1
  36. package/dist/src/prompts/reviewer.js.map +1 -1
  37. package/dist/src/providers/claude.d.ts +3 -0
  38. package/dist/src/providers/claude.js +151 -13
  39. package/dist/src/providers/claude.js.map +1 -1
  40. package/dist/src/providers/codex.d.ts +3 -0
  41. package/dist/src/providers/codex.js +150 -14
  42. package/dist/src/providers/codex.js.map +1 -1
  43. package/dist/src/providers/types.d.ts +69 -3
  44. package/dist/src/schemas/config.d.ts +3 -0
  45. package/dist/src/schemas/config.js +6 -0
  46. package/dist/src/schemas/config.js.map +1 -1
  47. package/dist/src/schemas/json-schema.d.ts +21 -0
  48. package/dist/src/schemas/json-schema.js +172 -0
  49. package/dist/src/schemas/json-schema.js.map +1 -0
  50. package/dist/src/schemas/metrics.d.ts +171 -0
  51. package/dist/src/schemas/metrics.js +49 -0
  52. package/dist/src/schemas/metrics.js.map +1 -0
  53. package/dist/src/schemas/revision.d.ts +166 -2
  54. package/dist/src/schemas/revision.js +35 -2
  55. package/dist/src/schemas/revision.js.map +1 -1
  56. package/dist/src/schemas/session.d.ts +6 -0
  57. package/dist/src/schemas/session.js +10 -0
  58. package/dist/src/schemas/session.js.map +1 -1
  59. package/package.json +4 -2
@@ -1,10 +1,15 @@
1
1
  import { createHash } from "node:crypto";
2
2
  import { readFileSync, writeFileSync, existsSync } from "node:fs";
3
3
  import { relative, resolve } from "node:path";
4
- import { buildRevisionPrompt } from "../prompts/planner.js";
5
- import { buildReviewPrompt, formatPriorDecisions, getReviewPhase, } from "../prompts/reviewer.js";
6
- import { parseFeedbackForPhase, parseRevision, isConverged, } from "./convergence.js";
7
- import { createSession, writeSessionState, writeRoundFeedback, writeRoundResponse, readRoundFeedback, readRoundResponse, writeInitialPlan, } from "./session.js";
4
+ import { isEditsRevision, isDirectionRevision, } from "../schemas/revision.js";
5
+ import { buildRevisionPrompt, buildEditsRetryPrompt, } from "../prompts/planner.js";
6
+ import { buildReviewPrompt, buildIncrementalReviewPrompt, formatPriorDecisions, getReviewPhase, } from "../prompts/reviewer.js";
7
+ import { buildPlanDiff } from "./plan-diff.js";
8
+ import { parseFeedbackForPhase, parseRevision, parseStructuredFeedbackForPhase, parseStructuredRevision, isConverged, StructuredOutputParseError, ZodValidationError, } from "./convergence.js";
9
+ import { getFeedbackJsonSchemaForPhase, getRevisionJsonSchema, } from "../schemas/json-schema.js";
10
+ import { applyEdits, logFailures, summarizeApply, } from "./apply-edits.js";
11
+ import { createSession, writeSessionState, writeRoundFeedback, writeRoundResponse, readRoundFeedback, readRoundResponse, writeInitialPlan, writeRoundMetrics, writeRoundPlanSnapshot, readRoundPlanSnapshot, } from "./session.js";
12
+ import { summarizeTiming, } from "../schemas/metrics.js";
8
13
  // --- Utility functions ---
9
14
  export function hashFile(path) {
10
15
  const content = readFileSync(path, "utf-8");
@@ -226,6 +231,216 @@ function buildPriorDecisions(cwd, sessionId, currentRound) {
226
231
  return null;
227
232
  return formatPriorDecisions(priorRounds);
228
233
  }
234
+ /**
235
+ * Invocation state machine — single owner of all retry/downgrade logic for
236
+ * provider invocations. Providers are single-shot; this function decides
237
+ * when to downgrade from structured output to legacy mode.
238
+ *
239
+ * Strict 2-attempt cap: structured (1) -> legacy fallback (1) -> terminal.
240
+ *
241
+ * Failure handling:
242
+ * - Provider `capability` error in structured mode → downgrade
243
+ * - Provider `fatal` error → terminal (no downgrade)
244
+ * - JSON.parse failure on structured output → downgrade
245
+ * - Zod validation failure on structured output → terminal (NOT retried)
246
+ * - Any failure in legacy mode → terminal
247
+ *
248
+ * Observability: when `metricsContext` is provided, each attempt emits a
249
+ * start/end line to stderr, collects `InvocationAttempt` records, and
250
+ * persists a `RoundMetrics` file in the session directory. All telemetry
251
+ * I/O is fail-open — failures log a warning and are swallowed, never
252
+ * altering the invocation outcome. The in-memory metrics object is
253
+ * returned alongside the result so callers get timing data without a
254
+ * filesystem round-trip.
255
+ */
256
+ async function invokeWithStateMachine(args) {
257
+ const { provider, invokeOptions, jsonSchema, buildPrompt, parseStructured, parseLegacy, roundLabel, metricsContext, } = args;
258
+ const supported = await provider.checkStructuredOutputSupport();
259
+ let mode = supported ? "structured" : "legacy";
260
+ let attempt = 0;
261
+ const maxAttempts = 2;
262
+ let lastError = null;
263
+ // Metrics collection — only active when metricsContext is provided.
264
+ const attempts = [];
265
+ const startedAt = new Date().toISOString();
266
+ const startedAtMs = Date.now();
267
+ const providerLabel = buildProviderLabel(provider.name, invokeOptions.model, invokeOptions.effort);
268
+ const writeMetricsNow = () => {
269
+ if (!metricsContext)
270
+ return;
271
+ try {
272
+ const metrics = {
273
+ schema_version: 1,
274
+ session_id: metricsContext.sessionId,
275
+ round: metricsContext.round,
276
+ phase: metricsContext.phase,
277
+ role: metricsContext.role,
278
+ started_at: startedAt,
279
+ completed_at: new Date().toISOString(),
280
+ total_duration_ms: Date.now() - startedAtMs,
281
+ attempts,
282
+ };
283
+ writeRoundMetrics(invokeOptions.cwd, metricsContext.sessionId, metricsContext.round, metricsContext.role, metrics);
284
+ }
285
+ catch {
286
+ // writeRoundMetrics is already fail-open; catch here belts-and-braces
287
+ // against unexpected synchronous errors building the metrics object.
288
+ }
289
+ };
290
+ const buildMetrics = () => {
291
+ if (!metricsContext)
292
+ return null;
293
+ try {
294
+ return {
295
+ schema_version: 1,
296
+ session_id: metricsContext.sessionId,
297
+ round: metricsContext.round,
298
+ phase: metricsContext.phase,
299
+ role: metricsContext.role,
300
+ started_at: startedAt,
301
+ completed_at: new Date().toISOString(),
302
+ total_duration_ms: Date.now() - startedAtMs,
303
+ attempts,
304
+ };
305
+ }
306
+ catch {
307
+ return null;
308
+ }
309
+ };
310
+ try {
311
+ while (attempt < maxAttempts) {
312
+ attempt++;
313
+ const prompt = buildPrompt(mode === "structured");
314
+ const promptChars = prompt.length;
315
+ const promptLines = prompt.split("\n").length;
316
+ const options = mode === "structured"
317
+ ? { ...invokeOptions, jsonSchema }
318
+ : { ...invokeOptions };
319
+ logStart(roundLabel, providerLabel, mode, promptChars, metricsContext);
320
+ const response = await provider.invoke(prompt, options);
321
+ // Base attempt record — filled in below.
322
+ const attemptRecord = {
323
+ mode,
324
+ provider: provider.name,
325
+ model: invokeOptions.model ?? null,
326
+ effort: invokeOptions.effort ?? null,
327
+ prompt_chars: promptChars,
328
+ prompt_lines: promptLines,
329
+ output_chars: null,
330
+ output_lines: null,
331
+ duration_ms: response.duration ?? 0,
332
+ ok: false,
333
+ error_kind: null,
334
+ error_exit_code: null,
335
+ };
336
+ if (!response.ok) {
337
+ attemptRecord.ok = false;
338
+ attemptRecord.error_kind = response.error.kind;
339
+ attemptRecord.error_exit_code = response.error.exitCode;
340
+ attempts.push(attemptRecord);
341
+ logEnd(roundLabel, providerLabel, mode, promptChars, null, response.duration ?? 0, false, `${response.error.kind}: ${truncate(response.error.message, 200)}`, metricsContext);
342
+ if (mode === "structured" &&
343
+ response.error.kind === "capability" &&
344
+ attempt < maxAttempts) {
345
+ provider.markNonCapable();
346
+ mode = "legacy";
347
+ continue;
348
+ }
349
+ // Fatal, or already in legacy mode — terminal
350
+ throw new Error(`${roundLabel} failed (exit ${response.error.exitCode}, ${response.error.kind}):\n${response.error.message}`);
351
+ }
352
+ // Provider returned output — record output size, try to parse.
353
+ const outputChars = response.output.length;
354
+ const outputLines = response.output.split("\n").length;
355
+ attemptRecord.output_chars = outputChars;
356
+ attemptRecord.output_lines = outputLines;
357
+ try {
358
+ const parsed = mode === "structured"
359
+ ? parseStructured(response.output)
360
+ : parseLegacy(response.output);
361
+ attemptRecord.ok = true;
362
+ attempts.push(attemptRecord);
363
+ logEnd(roundLabel, providerLabel, mode, promptChars, outputChars, response.duration ?? 0, true, null, metricsContext);
364
+ return {
365
+ result: parsed,
366
+ metrics: buildMetrics(),
367
+ sessionId: response.ok ? response.sessionId : undefined,
368
+ };
369
+ }
370
+ catch (parseError) {
371
+ lastError = parseError instanceof Error ? parseError : new Error(String(parseError));
372
+ // Zod validation failure on structured output is terminal — the model
373
+ // produced semantically invalid content, retrying won't help.
374
+ if (parseError instanceof ZodValidationError) {
375
+ attemptRecord.ok = false;
376
+ attemptRecord.error_kind = "zod";
377
+ attempts.push(attemptRecord);
378
+ logEnd(roundLabel, providerLabel, mode, promptChars, outputChars, response.duration ?? 0, false, `zod: ${truncate(lastError.message, 200)}`, metricsContext);
379
+ throw parseError;
380
+ }
381
+ // JSON.parse failure on structured output triggers downgrade
382
+ if (mode === "structured" &&
383
+ parseError instanceof StructuredOutputParseError &&
384
+ attempt < maxAttempts) {
385
+ attemptRecord.ok = false;
386
+ attemptRecord.error_kind = "parse";
387
+ attempts.push(attemptRecord);
388
+ logEnd(roundLabel, providerLabel, mode, promptChars, outputChars, response.duration ?? 0, false, `parse: ${truncate(lastError.message, 200)}`, metricsContext);
389
+ provider.markNonCapable();
390
+ mode = "legacy";
391
+ continue;
392
+ }
393
+ // Legacy parse failure — terminal
394
+ attemptRecord.ok = false;
395
+ attemptRecord.error_kind = "parse";
396
+ attempts.push(attemptRecord);
397
+ logEnd(roundLabel, providerLabel, mode, promptChars, outputChars, response.duration ?? 0, false, `parse: ${truncate(lastError.message, 200)}`, metricsContext);
398
+ throw new Error(`${roundLabel} parse failed in ${mode} mode: ${lastError.message}`);
399
+ }
400
+ }
401
+ // Unreachable in normal flow — defensive
402
+ throw lastError ?? new Error(`${roundLabel} exhausted all attempts`);
403
+ }
404
+ finally {
405
+ // Persist metrics on every exit (success or throw). Fail-open — this
406
+ // never throws; writeRoundMetrics catches its own errors.
407
+ writeMetricsNow();
408
+ }
409
+ }
410
+ function buildProviderLabel(providerName, model, effort) {
411
+ return formatProviderLabel({
412
+ provider: providerName,
413
+ model: model ?? undefined,
414
+ effort: effort ?? undefined,
415
+ });
416
+ }
417
+ function truncate(text, max) {
418
+ return text.length > max ? text.slice(0, max) : text;
419
+ }
420
+ function safeStderr(line) {
421
+ try {
422
+ process.stderr.write(line);
423
+ }
424
+ catch {
425
+ // stderr unavailable — nothing else we can do
426
+ }
427
+ }
428
+ function logStart(roundLabel, providerLabel, mode, promptChars, ctx) {
429
+ if (!ctx)
430
+ return;
431
+ safeStderr(`[planpong] R${ctx.round} ${ctx.role} | ${providerLabel} | ${mode} | prompt=${promptChars}c\n`);
432
+ }
433
+ function logEnd(roundLabel, providerLabel, mode, promptChars, outputChars, durationMs, ok, failDetail, ctx) {
434
+ if (!ctx)
435
+ return;
436
+ const durationStr = formatDuration(durationMs);
437
+ if (ok && outputChars !== null) {
438
+ safeStderr(`[planpong] R${ctx.round} ${ctx.role} | ${providerLabel} | ${mode} | prompt=${promptChars}c output=${outputChars}c duration=${durationStr} | ok\n`);
439
+ }
440
+ else {
441
+ safeStderr(`[planpong] R${ctx.round} ${ctx.role} | ${providerLabel} | ${mode} | prompt=${promptChars}c duration=${durationStr} | fail (${failDetail ?? "unknown"})\n`);
442
+ }
443
+ }
229
444
  /**
230
445
  * Run a single review round: send current plan to the reviewer for critique.
231
446
  */
@@ -235,34 +450,68 @@ export async function runReviewRound(session, cwd, config, reviewerProvider) {
235
450
  const planContent = readFileSync(planPath, "utf-8");
236
451
  const phase = getReviewPhase(round);
237
452
  const priorDecisions = buildPriorDecisions(cwd, session.id, round);
238
- const reviewPrompt = buildReviewPrompt(planContent, priorDecisions, phase);
239
- const reviewResponse = await reviewerProvider.invoke(reviewPrompt, {
240
- cwd,
241
- model: config.reviewer.model,
242
- effort: config.reviewer.effort,
243
- });
244
- // Try to parse even on non-zero exit — CLIs can exit 1 with valid output
245
- let feedback;
246
- try {
247
- feedback = parseFeedbackForPhase(reviewResponse.content, phase);
248
- }
249
- catch (parseError) {
250
- // If exit code was also non-zero, the provider genuinely failed
251
- if (reviewResponse.exitCode !== 0) {
252
- throw new Error(`Reviewer failed (exit ${reviewResponse.exitCode}):\n${reviewResponse.content.slice(0, 500)}`);
253
- }
254
- // Exit was 0 but parse failed — retry
255
- const retryPrompt = `Your previous response could not be parsed. Please output ONLY a valid JSON object wrapped in <planpong-feedback> tags. The error was: ${parseError instanceof Error ? parseError.message : "parse error"}\n\nOriginal prompt:\n${reviewPrompt}`;
256
- const retryResponse = await reviewerProvider.invoke(retryPrompt, {
453
+ // Persist a snapshot of the plan as the reviewer is about to see it. On
454
+ // round N+1 we'll diff against this snapshot to produce the incremental
455
+ // "what changed" content for the resumed reviewer session.
456
+ writeRoundPlanSnapshot(cwd, session.id, round, planContent);
457
+ // Reviewer-side persistent sessions. Both claude and codex support this:
458
+ // - claude: we generate the UUID and pass it via --session-id (first)
459
+ // or --resume (subsequent).
460
+ // - codex: codex generates its own thread_id; we capture it from the
461
+ // `--json` event stream and pass it via `codex exec resume <id>`
462
+ // on subsequent calls.
463
+ // The canonical reviewer session ID is `session.reviewerSessionId` — for
464
+ // claude this is the pre-generated UUID; for codex it's overwritten
465
+ // after the first call with the captured thread_id.
466
+ const reviewerSessionInited = session.reviewerSessionInitialized === true;
467
+ const isResumedReviewerSession = reviewerSessionInited;
468
+ const priorPlanContent = isResumedReviewerSession
469
+ ? readRoundPlanSnapshot(cwd, session.id, round - 1)
470
+ : null;
471
+ const planDiff = priorPlanContent
472
+ ? buildPlanDiff(priorPlanContent, planContent)
473
+ : null;
474
+ const newSessionId = !reviewerSessionInited && reviewerProvider.name === "claude"
475
+ ? session.reviewerSessionId
476
+ : undefined;
477
+ const resumeSessionId = reviewerSessionInited
478
+ ? session.reviewerSessionId
479
+ : undefined;
480
+ const { result: feedback, metrics, sessionId: capturedSessionId, } = await invokeWithStateMachine({
481
+ provider: reviewerProvider,
482
+ invokeOptions: {
257
483
  cwd,
258
484
  model: config.reviewer.model,
259
485
  effort: config.reviewer.effort,
260
- });
261
- feedback = parseFeedbackForPhase(retryResponse.content, phase);
262
- }
486
+ newSessionId,
487
+ resumeSessionId,
488
+ },
489
+ jsonSchema: getFeedbackJsonSchemaForPhase(phase),
490
+ buildPrompt: (structuredOutput) => isResumedReviewerSession
491
+ ? buildIncrementalReviewPrompt(planDiff ?? planContent, priorDecisions, phase, structuredOutput)
492
+ : buildReviewPrompt(planContent, priorDecisions, phase, structuredOutput),
493
+ parseStructured: (output) => parseStructuredFeedbackForPhase(output, phase),
494
+ parseLegacy: (output) => parseFeedbackForPhase(output, phase),
495
+ roundLabel: `Round ${round} review`,
496
+ metricsContext: {
497
+ sessionId: session.id,
498
+ round,
499
+ phase,
500
+ role: "review",
501
+ },
502
+ });
263
503
  writeRoundFeedback(cwd, session.id, round, feedback);
264
504
  const severity = severityFromFeedback(feedback);
265
505
  const converged = isConverged(feedback);
506
+ const timing = metrics ? summarizeTiming(metrics) : undefined;
507
+ // Persist the canonical reviewer session ID. For claude this is the
508
+ // UUID we generated; for codex it's the thread_id captured from --json
509
+ // output. Either way, future rounds resume this conversation.
510
+ if (!reviewerSessionInited && capturedSessionId) {
511
+ session.reviewerSessionId = capturedSessionId;
512
+ session.reviewerSessionInitialized = true;
513
+ writeSessionState(cwd, session);
514
+ }
266
515
  // Extract phase-specific extras for status line
267
516
  const phaseExtras = {};
268
517
  if (feedback.verdict === "blocked") {
@@ -281,7 +530,7 @@ export async function runReviewRound(session, cwd, config, reviewerProvider) {
281
530
  phaseExtras.risks_promoted = feedback.issues.length;
282
531
  }
283
532
  }
284
- return { round, feedback, severity, converged, phaseExtras };
533
+ return { round, feedback, severity, converged, phaseExtras, timing };
285
534
  }
286
535
  /**
287
536
  * Run a single revision round: send plan + feedback to the planner for revision.
@@ -296,37 +545,86 @@ export async function runRevisionRound(session, cwd, config, plannerProvider) {
296
545
  }
297
546
  const phase = getReviewPhase(round);
298
547
  const keyDecisions = extractKeyDecisions(planContent);
299
- const revisionPrompt = buildRevisionPrompt(planContent, feedback, keyDecisions, null, phase);
300
- const revisionResponse = await plannerProvider.invoke(revisionPrompt, {
301
- cwd,
302
- model: config.planner.model,
303
- effort: config.planner.effort,
304
- });
305
- // Try to parse even on non-zero exit — CLIs can exit 1 with valid output
306
- let revision;
307
- try {
308
- revision = parseRevision(revisionResponse.content);
309
- }
310
- catch (parseError) {
311
- // If exit code was also non-zero, the provider genuinely failed
312
- if (revisionResponse.exitCode !== 0) {
313
- throw new Error(`Planner revision failed (exit ${revisionResponse.exitCode}):\n${revisionResponse.content.slice(0, 500)}`);
314
- }
315
- // Exit was 0 but parse failed — retry
316
- const retryPrompt = `Your previous response could not be parsed. Please output ONLY a valid JSON object wrapped in <planpong-revision> tags. The error was: ${parseError instanceof Error ? parseError.message : "parse error"}\n\nOriginal prompt:\n${revisionPrompt}`;
317
- const retryResponse = await plannerProvider.invoke(retryPrompt, {
548
+ // Direction phase always uses full-plan output. Risk + detail honor
549
+ // config.revision_mode. The shape decision is made once here and threaded
550
+ // through prompt + JSON schema + parser.
551
+ const useEdits = phase !== "direction" && config.revision_mode === "edits";
552
+ const revisionShape = useEdits ? "edits" : "full";
553
+ const jsonSchema = getRevisionJsonSchema(phase, config.revision_mode);
554
+ // Planner-side persistent sessions were tested and found to INCREASE wall
555
+ // time — the model used the spared context budget to do more work per
556
+ // round (more edits, deeper revisions), not to do the same work faster.
557
+ // Reviewer-side persistent sessions are kept (see runReviewRound).
558
+ const { result: revision, metrics } = await invokeWithStateMachine({
559
+ provider: plannerProvider,
560
+ invokeOptions: {
318
561
  cwd,
319
562
  model: config.planner.model,
320
563
  effort: config.planner.effort,
564
+ },
565
+ jsonSchema,
566
+ buildPrompt: (structuredOutput) => buildRevisionPrompt(planContent, feedback, keyDecisions, null, phase, structuredOutput, config.revision_mode),
567
+ parseStructured: (output) => parseStructuredRevision(output, revisionShape),
568
+ parseLegacy: (output) => parseRevision(output, revisionShape),
569
+ roundLabel: `Round ${round} revision`,
570
+ metricsContext: {
571
+ sessionId: session.id,
572
+ round,
573
+ phase,
574
+ role: "revision",
575
+ },
576
+ });
577
+ writeRoundResponse(cwd, session.id, round, revision);
578
+ const timing = metrics ? summarizeTiming(metrics) : undefined;
579
+ // Apply revision to disk. Two paths: full (today's behavior) or edits
580
+ // (apply edit list, retry failures, atomic write).
581
+ let editTelemetry;
582
+ let finalRevision = revision;
583
+ if (useEdits && isEditsRevision(revision)) {
584
+ const result = await applyRevisionEdits({
585
+ session,
586
+ cwd,
587
+ planPath,
588
+ planContent,
589
+ revision,
590
+ plannerProvider,
591
+ config,
592
+ phase,
593
+ metrics,
321
594
  });
322
- revision = parseRevision(retryResponse.content);
595
+ finalRevision = result.revision;
596
+ editTelemetry = result.telemetry;
323
597
  }
324
- writeRoundResponse(cwd, session.id, round, revision);
325
- // Tally responses
598
+ else if (isDirectionRevision(revision)) {
599
+ writeFileSync(planPath, revision.updated_plan);
600
+ editTelemetry = {
601
+ revision_mode: "full",
602
+ edits_attempted: null,
603
+ edits_applied: null,
604
+ edits_failed: null,
605
+ edits_retried: null,
606
+ edits_recovered: null,
607
+ retry_invoked: false,
608
+ };
609
+ persistRevisionMetrics({
610
+ cwd,
611
+ session,
612
+ round,
613
+ phase,
614
+ metrics,
615
+ telemetry: editTelemetry,
616
+ });
617
+ }
618
+ else {
619
+ throw new Error(`runRevisionRound: revision shape mismatch — expected ${useEdits ? "edits" : "full"} but got ${"updated_plan" in revision ? "full" : "edits"}`);
620
+ }
621
+ session.planHash = hashFile(planPath);
622
+ writeSessionState(cwd, session);
623
+ // Tally responses (use the possibly-downgraded responses from finalRevision).
326
624
  let accepted = 0;
327
625
  let rejected = 0;
328
626
  let deferred = 0;
329
- for (const resp of revision.responses) {
627
+ for (const resp of finalRevision.responses) {
330
628
  if (resp.action === "accepted")
331
629
  accepted++;
332
630
  else if (resp.action === "rejected")
@@ -334,19 +632,257 @@ export async function runRevisionRound(session, cwd, config, plannerProvider) {
334
632
  else if (resp.action === "deferred")
335
633
  deferred++;
336
634
  }
337
- // Write updated plan to disk
338
- const updatedPlan = revision.updated_plan;
339
- writeFileSync(planPath, updatedPlan);
340
- session.planHash = hashFile(planPath);
341
- writeSessionState(cwd, session);
342
635
  return {
343
636
  round,
344
- revision,
637
+ revision: finalRevision,
345
638
  accepted,
346
639
  rejected,
347
640
  deferred,
348
641
  planUpdated: true,
642
+ timing,
643
+ edits: editTelemetry,
644
+ };
645
+ }
646
+ /**
647
+ * Apply an edits-mode revision: first-pass apply, targeted retry on failures,
648
+ * atomic write, response-edit consistency check. All mutations to the plan
649
+ * happen in memory; a single writeFileSync persists the final state.
650
+ */
651
+ async function applyRevisionEdits(args) {
652
+ const { session, cwd, planPath, planContent, revision, plannerProvider, config, phase, metrics, } = args;
653
+ const round = session.currentRound;
654
+ const editsAttempted = revision.edits.length;
655
+ // First-pass apply.
656
+ const firstPass = applyEdits(planContent, revision.edits);
657
+ if (firstPass.failures.length > 0) {
658
+ logFailures(`R${round} edits first-pass`, firstPass.failures);
659
+ }
660
+ safeStderr(`[planpong] R${round} edits | first-pass | ${summarizeApply(firstPass)}\n`);
661
+ let working = firstPass.plan;
662
+ const successfulEdits = firstPass.applied.map((a) => a.edit);
663
+ const recoveredEdits = [];
664
+ const unrecoverableFailures = [];
665
+ let retryInvoked = false;
666
+ let retriedCount = 0;
667
+ if (firstPass.failures.length > 0) {
668
+ retryInvoked = true;
669
+ retriedCount = firstPass.failures.length;
670
+ try {
671
+ const retryResult = await runEditsRetry({
672
+ cwd,
673
+ session,
674
+ round,
675
+ phase,
676
+ plannerProvider,
677
+ config,
678
+ currentPlan: working,
679
+ failures: firstPass.failures,
680
+ });
681
+ const secondPass = applyEdits(working, retryResult.edits);
682
+ if (secondPass.failures.length > 0) {
683
+ logFailures(`R${round} edits retry`, secondPass.failures);
684
+ }
685
+ safeStderr(`[planpong] R${round} edits | retry | ${summarizeApply(secondPass)}\n`);
686
+ working = secondPass.plan;
687
+ for (const a of secondPass.applied)
688
+ recoveredEdits.push(a.edit);
689
+ unrecoverableFailures.push(...secondPass.failures);
690
+ // Track the retry as an additional invocation attempt in metrics.
691
+ if (metrics) {
692
+ metrics.attempts.push(retryResult.attemptRecord);
693
+ }
694
+ }
695
+ catch (err) {
696
+ // Retry failed entirely (provider error, parse error). Surface but
697
+ // keep first-pass partial result — strictly better than nothing.
698
+ safeStderr(`[planpong] R${round} edits | retry failed: ${err instanceof Error ? err.message : String(err)}\n`);
699
+ unrecoverableFailures.push(...firstPass.failures);
700
+ }
701
+ }
702
+ // Atomic write of the final plan state.
703
+ writeFileSync(planPath, working);
704
+ // Response-edit consistency check: when at least one edit could not be
705
+ // recovered and no edit survived at all, the planner's `accepted` responses
706
+ // claim plan changes that never landed, so they are downgraded to
707
+ // `deferred`. The check is deliberately coarse — responses carry no section
708
+ // or edit ID to match against (see downgradeOrphanedResponses), so accepts
709
+ // are trusted whenever any edit survives. This is the same tradeoff the
710
+ // plan documents (R3 F2 issue, accepted as heuristic).
711
+ const survivingEdits = [...successfulEdits, ...recoveredEdits];
712
+ const downgraded = downgradeOrphanedResponses(revision, survivingEdits, unrecoverableFailures);
713
+ // Persist failure metadata in the round response JSON alongside responses.
714
+ // We rewrite the response file to include the (possibly-downgraded)
715
+ // responses + edit application result.
716
+ writeRoundResponse(cwd, session.id, round, downgraded);
717
+ const telemetry = {
718
+ revision_mode: "edits",
719
+ edits_attempted: editsAttempted,
720
+ edits_applied: successfulEdits.length,
721
+ edits_failed: firstPass.failures.length,
722
+ edits_retried: retriedCount,
723
+ edits_recovered: recoveredEdits.length,
724
+ retry_invoked: retryInvoked,
725
+ };
726
+ persistRevisionMetrics({
727
+ cwd,
728
+ session,
729
+ round,
730
+ phase,
731
+ metrics,
732
+ telemetry,
733
+ });
734
+ return { revision: downgraded, telemetry };
735
+ }
736
+ /**
737
+ * One-shot retry for failed edits. Builds a targeted prompt with only the
738
+ * failures + current (partially-edited) plan and asks the planner to
739
+ * re-express each failed edit. The retry is best-effort — provider/parse
740
+ * errors are caught by the caller and treated as "no recovery."
741
+ */
742
+ async function runEditsRetry(args) {
743
+ const { plannerProvider, config, currentPlan, failures } = args;
744
+ const supported = await plannerProvider.checkStructuredOutputSupport();
745
+ const useStructured = supported;
746
+ const prompt = buildEditsRetryPrompt(currentPlan, failures.map((f) => ({
747
+ edit: f.edit,
748
+ reason: f.reason,
749
+ section_searched: f.section_searched,
750
+ diagnostic: f.diagnostic,
751
+ })), useStructured);
752
+ // The retry only needs an `edits` array. Rather than defining a separate
753
+ // minimal schema, we reuse the full edits-revision schema and ignore its
754
+ // `responses` field when extracting the result (see
755
+ // extractEditsFromRetryPayload). The retry prompt explicitly tells the
756
+ // planner not to include `responses`.
757
+ const jsonSchema = getRevisionJsonSchema("detail", "edits");
758
+ const promptChars = prompt.length;
759
+ const promptLines = prompt.split("\n").length;
760
+ const options = useStructured
761
+ ? {
762
+ cwd: args.cwd,
763
+ model: config.planner.model,
764
+ effort: config.planner.effort,
765
+ jsonSchema,
766
+ }
767
+ : {
768
+ cwd: args.cwd,
769
+ model: config.planner.model,
770
+ effort: config.planner.effort,
771
+ };
772
+ const response = await plannerProvider.invoke(prompt, options);
773
+ const attemptRecord = {
774
+ mode: useStructured ? "structured" : "legacy",
775
+ provider: plannerProvider.name,
776
+ model: config.planner.model ?? null,
777
+ effort: config.planner.effort ?? null,
778
+ prompt_chars: promptChars,
779
+ prompt_lines: promptLines,
780
+ output_chars: response.ok ? response.output.length : null,
781
+ output_lines: response.ok ? response.output.split("\n").length : null,
782
+ duration_ms: response.duration ?? 0,
783
+ ok: false,
784
+ error_kind: "edit-retry",
785
+ error_exit_code: null,
349
786
  };
787
+ if (!response.ok) {
788
+ throw new Error(`edits retry: provider error (${response.error.kind}: ${response.error.exitCode})`);
789
+ }
790
+ // Parse the retry response — accept either a full edits revision (with
791
+ // empty responses) or just an `edits` array wrapped in the standard tags.
792
+ let edits;
793
+ try {
794
+ if (useStructured) {
795
+ const parsed = JSON.parse(response.output);
796
+ edits = extractEditsFromRetryPayload(parsed);
797
+ }
798
+ else {
799
+ const json = response.output.match(/<planpong-revision>([\s\S]*?)<\/planpong-revision>/i)?.[1] ??
800
+ response.output;
801
+ const parsed = JSON.parse(json);
802
+ edits = extractEditsFromRetryPayload(parsed);
803
+ }
804
+ }
805
+ catch (err) {
806
+ throw new Error(`edits retry: parse failed: ${err instanceof Error ? err.message : String(err)}`);
807
+ }
808
+ attemptRecord.ok = true;
809
+ return { edits, attemptRecord };
810
+ }
811
+ function extractEditsFromRetryPayload(payload) {
812
+ if (payload &&
813
+ typeof payload === "object" &&
814
+ "edits" in payload &&
815
+ Array.isArray(payload.edits)) {
816
+ return payload.edits;
817
+ }
818
+ if (Array.isArray(payload))
819
+ return payload;
820
+ throw new Error("retry payload missing `edits` array");
821
+ }
822
+ /**
823
+ * Heuristic response-edit consistency check.
824
+ *
825
+ * Responses carry no section or edit ID, so an `accepted` response cannot be
826
+ * matched to a specific edit. Instead, when no edit survived at all, every
827
+ * `accepted` response is downgraded to `deferred` with its rationale prefixed
828
+ * `edit_not_applied: ...`. The plan acknowledges this is heuristic (no
829
+ * explicit issue↔edit ID mapping in the schema). False positives are possible
830
+ * — an accepted response that didn't require a plan change (e.g., "this was
831
+ * already addressed") is still downgraded when every edit failed. To reduce
832
+ * noise, we only downgrade when there's at least one unrecoverable failure —
833
+ * if every edit succeeded, the planner's accepts are taken at face value.
834
+ */
835
+ function downgradeOrphanedResponses(revision, survivingEdits, unrecoverableFailures) {
836
+ if (unrecoverableFailures.length === 0)
837
+ return revision;
838
+ // Build a set of sections that have at least one surviving edit.
839
+ const editedSections = new Set(survivingEdits.map((e) => e.section.trim()));
840
+ const downgradedResponses = revision.responses.map((resp) => {
841
+ if (resp.action !== "accepted")
842
+ return resp;
843
+ // Section is not on IssueResponse; we have no per-issue section mapping
844
+ // (R3 F2 limitation). Without that, we treat ANY surviving-edit set as
845
+ // "the planner did some work" and only downgrade accepts when ALL edits
846
+ // failed — i.e., the plan didn't change at all. This is conservative
847
+ // but minimizes false-positive downgrades while still preventing the
848
+ // worst case ("everything accepted, no edits applied").
849
+ if (editedSections.size === 0) {
850
+ return {
851
+ ...resp,
852
+ action: "deferred",
853
+ rationale: `edit_not_applied: corresponding plan edit failed and could not be recovered. Original rationale: ${resp.rationale}`,
854
+ };
855
+ }
856
+ return resp;
857
+ });
858
+ return { ...revision, responses: downgradedResponses };
859
+ }
860
+ /**
861
+ * Re-persist the revision metrics file with augmented edit telemetry. The
862
+ * state machine has already written the basic metrics file in its finally
863
+ * block; this overwrites with the same data plus revision_mode + edit
864
+ * counts. Fail-open — telemetry write errors never propagate.
865
+ */
866
+ function persistRevisionMetrics(args) {
867
+ const { cwd, session, round, metrics, telemetry } = args;
868
+ if (!metrics)
869
+ return;
870
+ try {
871
+ const augmented = {
872
+ ...metrics,
873
+ revision_mode: telemetry.revision_mode,
874
+ edits_attempted: telemetry.edits_attempted,
875
+ edits_applied: telemetry.edits_applied,
876
+ edits_failed: telemetry.edits_failed,
877
+ edits_retried: telemetry.edits_retried,
878
+ edits_recovered: telemetry.edits_recovered,
879
+ retry_invoked: telemetry.retry_invoked,
880
+ };
881
+ writeRoundMetrics(cwd, session.id, round, "revision", augmented);
882
+ }
883
+ catch {
884
+ // fail-open — telemetry never breaks the run
885
+ }
350
886
  }
351
887
  /**
352
888
  * Mark the session as approved and update the plan's status line.