planpong 0.4.2 → 0.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57) hide show
  1. package/dist/bin/planpong.js +37 -1
  2. package/dist/bin/planpong.js.map +1 -1
  3. package/dist/src/config/defaults.js +1 -0
  4. package/dist/src/config/defaults.js.map +1 -1
  5. package/dist/src/config/loader.d.ts +1 -0
  6. package/dist/src/config/loader.js +3 -0
  7. package/dist/src/config/loader.js.map +1 -1
  8. package/dist/src/core/apply-edits.d.ts +40 -0
  9. package/dist/src/core/apply-edits.js +220 -0
  10. package/dist/src/core/apply-edits.js.map +1 -0
  11. package/dist/src/core/convergence.d.ts +18 -2
  12. package/dist/src/core/convergence.js +21 -9
  13. package/dist/src/core/convergence.js.map +1 -1
  14. package/dist/src/core/operations.d.ts +14 -1
  15. package/dist/src/core/operations.js +551 -62
  16. package/dist/src/core/operations.js.map +1 -1
  17. package/dist/src/core/plan-diff.d.ts +23 -0
  18. package/dist/src/core/plan-diff.js +135 -0
  19. package/dist/src/core/plan-diff.js.map +1 -0
  20. package/dist/src/core/session.d.ts +11 -0
  21. package/dist/src/core/session.js +51 -1
  22. package/dist/src/core/session.js.map +1 -1
  23. package/dist/src/mcp/tools/get-feedback.d.ts +16 -0
  24. package/dist/src/mcp/tools/get-feedback.js +118 -114
  25. package/dist/src/mcp/tools/get-feedback.js.map +1 -1
  26. package/dist/src/mcp/tools/revise.d.ts +16 -0
  27. package/dist/src/mcp/tools/revise.js +76 -61
  28. package/dist/src/mcp/tools/revise.js.map +1 -1
  29. package/dist/src/mcp/tools/status.js +15 -1
  30. package/dist/src/mcp/tools/status.js.map +1 -1
  31. package/dist/src/prompts/planner.d.ts +34 -1
  32. package/dist/src/prompts/planner.js +239 -4
  33. package/dist/src/prompts/planner.js.map +1 -1
  34. package/dist/src/prompts/reviewer.d.ts +13 -0
  35. package/dist/src/prompts/reviewer.js +65 -0
  36. package/dist/src/prompts/reviewer.js.map +1 -1
  37. package/dist/src/providers/claude.js +19 -3
  38. package/dist/src/providers/claude.js.map +1 -1
  39. package/dist/src/providers/codex.js +50 -3
  40. package/dist/src/providers/codex.js.map +1 -1
  41. package/dist/src/providers/types.d.ts +20 -0
  42. package/dist/src/schemas/config.d.ts +3 -0
  43. package/dist/src/schemas/config.js +6 -0
  44. package/dist/src/schemas/config.js.map +1 -1
  45. package/dist/src/schemas/json-schema.d.ts +12 -0
  46. package/dist/src/schemas/json-schema.js +20 -1
  47. package/dist/src/schemas/json-schema.js.map +1 -1
  48. package/dist/src/schemas/metrics.d.ts +171 -0
  49. package/dist/src/schemas/metrics.js +49 -0
  50. package/dist/src/schemas/metrics.js.map +1 -0
  51. package/dist/src/schemas/revision.d.ts +166 -2
  52. package/dist/src/schemas/revision.js +35 -2
  53. package/dist/src/schemas/revision.js.map +1 -1
  54. package/dist/src/schemas/session.d.ts +6 -0
  55. package/dist/src/schemas/session.js +10 -0
  56. package/dist/src/schemas/session.js.map +1 -1
  57. package/package.json +1 -1
@@ -1,11 +1,15 @@
1
1
  import { createHash } from "node:crypto";
2
2
  import { readFileSync, writeFileSync, existsSync } from "node:fs";
3
3
  import { relative, resolve } from "node:path";
4
- import { buildRevisionPrompt } from "../prompts/planner.js";
5
- import { buildReviewPrompt, formatPriorDecisions, getReviewPhase, } from "../prompts/reviewer.js";
4
+ import { isEditsRevision, isDirectionRevision, } from "../schemas/revision.js";
5
+ import { buildRevisionPrompt, buildEditsRetryPrompt, } from "../prompts/planner.js";
6
+ import { buildReviewPrompt, buildIncrementalReviewPrompt, formatPriorDecisions, getReviewPhase, } from "../prompts/reviewer.js";
7
+ import { buildPlanDiff } from "./plan-diff.js";
6
8
  import { parseFeedbackForPhase, parseRevision, parseStructuredFeedbackForPhase, parseStructuredRevision, isConverged, StructuredOutputParseError, ZodValidationError, } from "./convergence.js";
7
- import { getFeedbackJsonSchemaForPhase, PlannerRevisionJsonSchema } from "../schemas/json-schema.js";
8
- import { createSession, writeSessionState, writeRoundFeedback, writeRoundResponse, readRoundFeedback, readRoundResponse, writeInitialPlan, } from "./session.js";
9
+ import { getFeedbackJsonSchemaForPhase, getRevisionJsonSchema, } from "../schemas/json-schema.js";
10
+ import { applyEdits, logFailures, summarizeApply, } from "./apply-edits.js";
11
+ import { createSession, writeSessionState, writeRoundFeedback, writeRoundResponse, readRoundFeedback, readRoundResponse, writeInitialPlan, writeRoundMetrics, writeRoundPlanSnapshot, readRoundPlanSnapshot, } from "./session.js";
12
+ import { summarizeTiming, } from "../schemas/metrics.js";
9
13
  // --- Utility functions ---
10
14
  export function hashFile(path) {
11
15
  const content = readFileSync(path, "utf-8");
@@ -240,62 +244,202 @@ function buildPriorDecisions(cwd, sessionId, currentRound) {
240
244
  * - JSON.parse failure on structured output → downgrade
241
245
  * - Zod validation failure on structured output → terminal (NOT retried)
242
246
  * - Any failure in legacy mode → terminal
247
+ *
248
+ * Observability: when `metricsContext` is provided, each attempt emits a
249
+ * start/end line to stderr, collects `InvocationAttempt` records, and
250
+ * persists a `RoundMetrics` file in the session directory. All telemetry
251
+ * I/O is fail-open — failures log a warning and are swallowed, never
252
+ * altering the invocation outcome. The in-memory metrics object is
253
+ * returned alongside the result so callers get timing data without a
254
+ * filesystem round-trip.
243
255
  */
244
256
async function invokeWithStateMachine(args) {
    const { provider, invokeOptions, jsonSchema, buildPrompt, parseStructured, parseLegacy, roundLabel, metricsContext, } = args;
    const supported = await provider.checkStructuredOutputSupport();
    let mode = supported ? "structured" : "legacy";
    let attempt = 0;
    const maxAttempts = 2;
    let lastError = null;
    // Attempt records are always collected; they are only surfaced (returned
    // and persisted) when metricsContext is provided.
    const attempts = [];
    const startedAt = new Date().toISOString();
    const startedAtMs = Date.now();
    const providerLabel = buildProviderLabel(provider.name, invokeOptions.model, invokeOptions.effort);
    // Single source of truth for the metrics payload, shared by the returned
    // value and the persisted file. Returns null when metrics are disabled.
    const buildMetrics = () => {
        if (!metricsContext)
            return null;
        try {
            return {
                schema_version: 1,
                session_id: metricsContext.sessionId,
                round: metricsContext.round,
                phase: metricsContext.phase,
                role: metricsContext.role,
                started_at: startedAt,
                completed_at: new Date().toISOString(),
                total_duration_ms: Date.now() - startedAtMs,
                attempts,
            };
        }
        catch {
            return null;
        }
    };
    // Fail-open persistence: a telemetry write must never change the outcome
    // of the invocation, so every error is swallowed here.
    const writeMetricsNow = () => {
        const metrics = buildMetrics();
        if (!metrics)
            return;
        try {
            writeRoundMetrics(invokeOptions.cwd, metricsContext.sessionId, metricsContext.round, metricsContext.role, metrics);
        }
        catch {
            // Intentionally ignored — see comment above.
        }
    };
    try {
        while (attempt < maxAttempts) {
            attempt++;
            // Freeze the mode for this attempt; `mode` itself may be downgraded below.
            const attemptMode = mode;
            const structured = attemptMode === "structured";
            const prompt = buildPrompt(structured);
            const promptChars = prompt.length;
            const promptLines = prompt.split("\n").length;
            const options = structured
                ? { ...invokeOptions, jsonSchema }
                : { ...invokeOptions };
            logStart(roundLabel, providerLabel, attemptMode, promptChars, metricsContext);
            const response = await provider.invoke(prompt, options);
            const durationMs = response.duration ?? 0;
            // Base attempt record — completed below once the outcome is known.
            const attemptRecord = {
                mode: attemptMode,
                provider: provider.name,
                model: invokeOptions.model ?? null,
                effort: invokeOptions.effort ?? null,
                prompt_chars: promptChars,
                prompt_lines: promptLines,
                output_chars: null,
                output_lines: null,
                duration_ms: durationMs,
                ok: false,
                error_kind: null,
                error_exit_code: null,
            };
            // A downgrade to legacy mode is only possible from structured mode
            // and only while another attempt remains.
            const canDowngrade = structured && attempt < maxAttempts;
            // Record this attempt as failed and emit its end-of-attempt log line.
            const recordFailure = (kind, detail) => {
                attemptRecord.error_kind = kind;
                attempts.push(attemptRecord);
                logEnd(roundLabel, providerLabel, attemptMode, promptChars, attemptRecord.output_chars, durationMs, false, `${kind}: ${truncate(detail, 200)}`, metricsContext);
            };
            if (!response.ok) {
                attemptRecord.error_exit_code = response.error.exitCode;
                recordFailure(response.error.kind, response.error.message);
                if (canDowngrade && response.error.kind === "capability") {
                    provider.markNonCapable();
                    mode = "legacy";
                    continue;
                }
                // Fatal, or already in legacy mode — terminal
                throw new Error(`${roundLabel} failed (exit ${response.error.exitCode}, ${response.error.kind}):\n${response.error.message}`);
            }
            // Provider returned output — record its size, then try to parse.
            attemptRecord.output_chars = response.output.length;
            attemptRecord.output_lines = response.output.split("\n").length;
            let parsed;
            try {
                // Only the parser call is guarded, so an unexpected error in the
                // success bookkeeping below cannot be mistaken for a parse failure
                // (which would also record the same attempt twice).
                parsed = structured
                    ? parseStructured(response.output)
                    : parseLegacy(response.output);
            }
            catch (parseError) {
                lastError = parseError instanceof Error ? parseError : new Error(String(parseError));
                // Zod validation failure on structured output is terminal — the model
                // produced semantically invalid content, retrying won't help.
                if (parseError instanceof ZodValidationError) {
                    recordFailure("zod", lastError.message);
                    throw parseError;
                }
                recordFailure("parse", lastError.message);
                // JSON.parse failure on structured output triggers downgrade
                if (canDowngrade && parseError instanceof StructuredOutputParseError) {
                    provider.markNonCapable();
                    mode = "legacy";
                    continue;
                }
                // Any other parse failure is terminal; keep the original error as cause.
                throw new Error(`${roundLabel} parse failed in ${attemptMode} mode: ${lastError.message}`, { cause: lastError });
            }
            attemptRecord.ok = true;
            attempts.push(attemptRecord);
            logEnd(roundLabel, providerLabel, attemptMode, promptChars, attemptRecord.output_chars, durationMs, true, null, metricsContext);
            return {
                result: parsed,
                metrics: buildMetrics(),
                sessionId: response.sessionId,
            };
        }
        // Unreachable in normal flow — defensive
        throw lastError ?? new Error(`${roundLabel} exhausted all attempts`);
    }
    finally {
        // Persist metrics on every exit (success or throw). Never throws.
        writeMetricsNow();
    }
}
410
/**
 * Build the provider label shown in the per-attempt stderr lines.
 *
 * Nullish `model` / `effort` values are normalised to `undefined` before
 * being handed to `formatProviderLabel`.
 */
function buildProviderLabel(providerName, model, effort) {
    const labelFields = {
        provider: providerName,
        model: model ?? undefined,
        effort: effort ?? undefined,
    };
    return formatProviderLabel(labelFields);
}
417
/**
 * Shorten `text` to at most `max` UTF-16 code units for log output.
 *
 * This runs on error-reporting paths, so it must never throw itself:
 * `null` / `undefined` become an empty string and any other non-string is
 * coerced with `String()`. If the cut would land in the middle of a
 * surrogate pair, the dangling high surrogate is dropped so the result is
 * always well-formed text.
 *
 * @param {unknown} text - value to shorten (normally an error message)
 * @param {number} max - maximum length in UTF-16 code units
 * @returns {string} the (possibly shortened) string
 */
function truncate(text, max) {
    if (text == null)
        return "";
    const str = typeof text === "string" ? text : String(text);
    if (str.length <= max)
        return str;
    const cut = str.slice(0, max);
    const lastUnit = cut.charCodeAt(cut.length - 1);
    // 0xD800–0xDBFF is the high-surrogate range; ending on one means the
    // pair was split by the slice above.
    const endsOnHighSurrogate = lastUnit >= 0xd800 && lastUnit <= 0xdbff;
    return endsOnHighSurrogate ? cut.slice(0, -1) : cut;
}
420
/**
 * Best-effort write of one line to stderr.
 *
 * A synchronous error thrown by the write is deliberately discarded.
 */
function safeStderr(line) {
    try {
        process.stderr.write(line);
    }
    catch (_unavailable) {
        // stderr could not be written to; there is no other channel to report on.
    }
}
428
/**
 * Emit the "attempt started" stderr line for a provider invocation.
 *
 * Does nothing when `ctx` (the metrics context) is absent. `roundLabel` is
 * accepted but not used by this function.
 */
function logStart(roundLabel, providerLabel, mode, promptChars, ctx) {
    if (ctx) {
        const line = `[planpong] R${ctx.round} ${ctx.role} | ${providerLabel} | ${mode} | prompt=${promptChars}c\n`;
        safeStderr(line);
    }
}
433
/**
 * Emit the "attempt finished" stderr line for a provider invocation.
 *
 * Does nothing when `ctx` (the metrics context) is absent. The success form
 * is used only when `ok` is truthy AND an output size is known; every other
 * combination is reported as a failure, with `failDetail` defaulting to
 * "unknown". `roundLabel` is accepted but not used by this function.
 */
function logEnd(roundLabel, providerLabel, mode, promptChars, outputChars, durationMs, ok, failDetail, ctx) {
    if (!ctx)
        return;
    const durationStr = formatDuration(durationMs);
    const head = `[planpong] R${ctx.round} ${ctx.role} | ${providerLabel} | ${mode} | prompt=${promptChars}c`;
    const succeeded = ok && outputChars !== null;
    const tail = succeeded
        ? ` output=${outputChars}c duration=${durationStr} | ok\n`
        : ` duration=${durationStr} | fail (${failDetail ?? "unknown"})\n`;
    safeStderr(head + tail);
}
300
444
  /**
301
445
  * Run a single review round: send current plan to the reviewer for critique.
@@ -306,22 +450,68 @@ export async function runReviewRound(session, cwd, config, reviewerProvider) {
306
450
  const planContent = readFileSync(planPath, "utf-8");
307
451
  const phase = getReviewPhase(round);
308
452
  const priorDecisions = buildPriorDecisions(cwd, session.id, round);
309
- const feedback = await invokeWithStateMachine({
453
+ // Persist a snapshot of the plan as the reviewer is about to see it. On
454
+ // round N+1 we'll diff against this snapshot to produce the incremental
455
+ // "what changed" content for the resumed reviewer session.
456
+ writeRoundPlanSnapshot(cwd, session.id, round, planContent);
457
+ // Reviewer-side persistent sessions. Both claude and codex support this:
458
+ // - claude: we generate the UUID and pass it via --session-id (first)
459
+ // or --resume (subsequent).
460
+ // - codex: codex generates its own thread_id; we capture it from the
461
+ // `--json` event stream and pass it via `codex exec resume <id>`
462
+ // on subsequent calls.
463
+ // The canonical reviewer session ID is `session.reviewerSessionId` — for
464
+ // claude this is the pre-generated UUID; for codex it's overwritten
465
+ // after the first call with the captured thread_id.
466
+ const reviewerSessionInited = session.reviewerSessionInitialized === true;
467
+ const isResumedReviewerSession = reviewerSessionInited;
468
+ const priorPlanContent = isResumedReviewerSession
469
+ ? readRoundPlanSnapshot(cwd, session.id, round - 1)
470
+ : null;
471
+ const planDiff = priorPlanContent
472
+ ? buildPlanDiff(priorPlanContent, planContent)
473
+ : null;
474
+ const newSessionId = !reviewerSessionInited && reviewerProvider.name === "claude"
475
+ ? session.reviewerSessionId
476
+ : undefined;
477
+ const resumeSessionId = reviewerSessionInited
478
+ ? session.reviewerSessionId
479
+ : undefined;
480
+ const { result: feedback, metrics, sessionId: capturedSessionId, } = await invokeWithStateMachine({
310
481
  provider: reviewerProvider,
311
482
  invokeOptions: {
312
483
  cwd,
313
484
  model: config.reviewer.model,
314
485
  effort: config.reviewer.effort,
486
+ newSessionId,
487
+ resumeSessionId,
315
488
  },
316
489
  jsonSchema: getFeedbackJsonSchemaForPhase(phase),
317
- buildPrompt: (structuredOutput) => buildReviewPrompt(planContent, priorDecisions, phase, structuredOutput),
490
+ buildPrompt: (structuredOutput) => isResumedReviewerSession
491
+ ? buildIncrementalReviewPrompt(planDiff ?? planContent, priorDecisions, phase, structuredOutput)
492
+ : buildReviewPrompt(planContent, priorDecisions, phase, structuredOutput),
318
493
  parseStructured: (output) => parseStructuredFeedbackForPhase(output, phase),
319
494
  parseLegacy: (output) => parseFeedbackForPhase(output, phase),
320
495
  roundLabel: `Round ${round} review`,
496
+ metricsContext: {
497
+ sessionId: session.id,
498
+ round,
499
+ phase,
500
+ role: "review",
501
+ },
321
502
  });
322
503
  writeRoundFeedback(cwd, session.id, round, feedback);
323
504
  const severity = severityFromFeedback(feedback);
324
505
  const converged = isConverged(feedback);
506
+ const timing = metrics ? summarizeTiming(metrics) : undefined;
507
+ // Persist the canonical reviewer session ID. For claude this is the
508
+ // UUID we generated; for codex it's the thread_id captured from --json
509
+ // output. Either way, future rounds resume this conversation.
510
+ if (!reviewerSessionInited && capturedSessionId) {
511
+ session.reviewerSessionId = capturedSessionId;
512
+ session.reviewerSessionInitialized = true;
513
+ writeSessionState(cwd, session);
514
+ }
325
515
  // Extract phase-specific extras for status line
326
516
  const phaseExtras = {};
327
517
  if (feedback.verdict === "blocked") {
@@ -340,7 +530,7 @@ export async function runReviewRound(session, cwd, config, reviewerProvider) {
340
530
  phaseExtras.risks_promoted = feedback.issues.length;
341
531
  }
342
532
  }
343
- return { round, feedback, severity, converged, phaseExtras };
533
+ return { round, feedback, severity, converged, phaseExtras, timing };
344
534
  }
345
535
  /**
346
536
  * Run a single revision round: send plan + feedback to the planner for revision.
@@ -355,25 +545,86 @@ export async function runRevisionRound(session, cwd, config, plannerProvider) {
355
545
  }
356
546
  const phase = getReviewPhase(round);
357
547
  const keyDecisions = extractKeyDecisions(planContent);
358
- const revision = await invokeWithStateMachine({
548
+ // Direction phase always uses full-plan output. Risk + detail honor
549
+ // config.revision_mode. The shape decision is made once here and threaded
550
+ // through prompt + JSON schema + parser.
551
+ const useEdits = phase !== "direction" && config.revision_mode === "edits";
552
+ const revisionShape = useEdits ? "edits" : "full";
553
+ const jsonSchema = getRevisionJsonSchema(phase, config.revision_mode);
554
+ // Planner-side persistent sessions were tested and found to INCREASE wall
555
+ // time — the model used the spared context budget to do more work per
556
+ // round (more edits, deeper revisions), not to do the same work faster.
557
+ // Reviewer-side persistent sessions are kept (see runReviewRound).
558
+ const { result: revision, metrics } = await invokeWithStateMachine({
359
559
  provider: plannerProvider,
360
560
  invokeOptions: {
361
561
  cwd,
362
562
  model: config.planner.model,
363
563
  effort: config.planner.effort,
364
564
  },
365
- jsonSchema: PlannerRevisionJsonSchema,
366
- buildPrompt: (structuredOutput) => buildRevisionPrompt(planContent, feedback, keyDecisions, null, phase, structuredOutput),
367
- parseStructured: (output) => parseStructuredRevision(output),
368
- parseLegacy: (output) => parseRevision(output),
565
+ jsonSchema,
566
+ buildPrompt: (structuredOutput) => buildRevisionPrompt(planContent, feedback, keyDecisions, null, phase, structuredOutput, config.revision_mode),
567
+ parseStructured: (output) => parseStructuredRevision(output, revisionShape),
568
+ parseLegacy: (output) => parseRevision(output, revisionShape),
369
569
  roundLabel: `Round ${round} revision`,
570
+ metricsContext: {
571
+ sessionId: session.id,
572
+ round,
573
+ phase,
574
+ role: "revision",
575
+ },
370
576
  });
371
577
  writeRoundResponse(cwd, session.id, round, revision);
372
- // Tally responses
578
+ const timing = metrics ? summarizeTiming(metrics) : undefined;
579
+ // Apply revision to disk. Two paths: full (today's behavior) or edits
580
+ // (apply edit list, retry failures, atomic write).
581
+ let editTelemetry;
582
+ let finalRevision = revision;
583
+ if (useEdits && isEditsRevision(revision)) {
584
+ const result = await applyRevisionEdits({
585
+ session,
586
+ cwd,
587
+ planPath,
588
+ planContent,
589
+ revision,
590
+ plannerProvider,
591
+ config,
592
+ phase,
593
+ metrics,
594
+ });
595
+ finalRevision = result.revision;
596
+ editTelemetry = result.telemetry;
597
+ }
598
+ else if (isDirectionRevision(revision)) {
599
+ writeFileSync(planPath, revision.updated_plan);
600
+ editTelemetry = {
601
+ revision_mode: "full",
602
+ edits_attempted: null,
603
+ edits_applied: null,
604
+ edits_failed: null,
605
+ edits_retried: null,
606
+ edits_recovered: null,
607
+ retry_invoked: false,
608
+ };
609
+ persistRevisionMetrics({
610
+ cwd,
611
+ session,
612
+ round,
613
+ phase,
614
+ metrics,
615
+ telemetry: editTelemetry,
616
+ });
617
+ }
618
+ else {
619
+ throw new Error(`runRevisionRound: revision shape mismatch — expected ${useEdits ? "edits" : "full"} but got ${"updated_plan" in revision ? "full" : "edits"}`);
620
+ }
621
+ session.planHash = hashFile(planPath);
622
+ writeSessionState(cwd, session);
623
+ // Tally responses (use the possibly-downgraded responses from finalRevision).
373
624
  let accepted = 0;
374
625
  let rejected = 0;
375
626
  let deferred = 0;
376
- for (const resp of revision.responses) {
627
+ for (const resp of finalRevision.responses) {
377
628
  if (resp.action === "accepted")
378
629
  accepted++;
379
630
  else if (resp.action === "rejected")
@@ -381,20 +632,258 @@ export async function runRevisionRound(session, cwd, config, plannerProvider) {
381
632
  else if (resp.action === "deferred")
382
633
  deferred++;
383
634
  }
384
- // Write updated plan to disk
385
- const updatedPlan = revision.updated_plan;
386
- writeFileSync(planPath, updatedPlan);
387
- session.planHash = hashFile(planPath);
388
- writeSessionState(cwd, session);
389
635
  return {
390
636
  round,
391
- revision,
637
+ revision: finalRevision,
392
638
  accepted,
393
639
  rejected,
394
640
  deferred,
395
641
  planUpdated: true,
642
+ timing,
643
+ edits: editTelemetry,
396
644
  };
397
645
  }
646
/**
 * Apply an edits-mode revision to the plan.
 *
 * Steps, in order:
 *   1. apply the revision's edits to `planContent` in memory;
 *   2. if any edit failed, ask the planner once to re-express the failures
 *      and apply that second batch on top of the first-pass result;
 *   3. write the resulting plan to `planPath` with a single writeFileSync;
 *   4. downgrade orphaned `accepted` responses, rewrite the round response
 *      file, and persist revision metrics with edit telemetry.
 *
 * A retry that throws is logged and treated as "nothing recovered"; the
 * first-pass result is still written.
 *
 * @returns {Promise<{revision: object, telemetry: object}>} the
 *   possibly-downgraded revision and the edit telemetry counters
 */
async function applyRevisionEdits(args) {
    const { session, cwd, planPath, planContent, revision, plannerProvider, config, phase, metrics, } = args;
    const round = session.currentRound;
    const editsAttempted = revision.edits.length;
    // First pass over the original plan text.
    const initial = applyEdits(planContent, revision.edits);
    if (initial.failures.length > 0) {
        logFailures(`R${round} edits first-pass`, initial.failures);
    }
    safeStderr(`[planpong] R${round} edits | first-pass | ${summarizeApply(initial)}\n`);
    let planText = initial.plan;
    const appliedFirst = initial.applied.map(({ edit }) => edit);
    const appliedOnRetry = [];
    const lostFailures = [];
    const retryInvoked = initial.failures.length > 0;
    const retriedCount = retryInvoked ? initial.failures.length : 0;
    if (retryInvoked) {
        try {
            const retry = await runEditsRetry({
                cwd,
                session,
                round,
                phase,
                plannerProvider,
                config,
                currentPlan: planText,
                failures: initial.failures,
            });
            const second = applyEdits(planText, retry.edits);
            if (second.failures.length > 0) {
                logFailures(`R${round} edits retry`, second.failures);
            }
            safeStderr(`[planpong] R${round} edits | retry | ${summarizeApply(second)}\n`);
            planText = second.plan;
            second.applied.forEach((entry) => {
                appliedOnRetry.push(entry.edit);
            });
            lostFailures.push(...second.failures);
            // The retry counts as one more invocation attempt in the metrics.
            if (metrics) {
                metrics.attempts.push(retry.attemptRecord);
            }
        }
        catch (err) {
            // The retry itself failed (provider or parse error): report it and
            // keep whatever the first pass managed to apply.
            const reason = err instanceof Error ? err.message : String(err);
            safeStderr(`[planpong] R${round} edits | retry failed: ${reason}\n`);
            lostFailures.push(...initial.failures);
        }
    }
    // One write of the final in-memory plan.
    writeFileSync(planPath, planText);
    // Accepted responses may be downgraded to `deferred` when edits could not
    // be recovered; the decision is made by downgradeOrphanedResponses.
    const downgraded = downgradeOrphanedResponses(revision, [...appliedFirst, ...appliedOnRetry], lostFailures);
    // Rewrite the round response file with the (possibly-downgraded) responses.
    writeRoundResponse(cwd, session.id, round, downgraded);
    const telemetry = {
        revision_mode: "edits",
        edits_attempted: editsAttempted,
        edits_applied: appliedFirst.length,
        edits_failed: initial.failures.length,
        edits_retried: retriedCount,
        edits_recovered: appliedOnRetry.length,
        retry_invoked: retryInvoked,
    };
    persistRevisionMetrics({
        cwd,
        session,
        round,
        phase,
        metrics,
        telemetry,
    });
    return { revision: downgraded, telemetry };
}
736
/**
 * One-shot retry for failed edits.
 *
 * Builds a targeted prompt from the current (partially-edited) plan and the
 * failed edits, invokes the planner once, and extracts the replacement edit
 * list from the reply. Structured output is requested when the provider
 * reports support for it; otherwise the reply is read from inside a
 * `<planpong-revision>` tag if present, else parsed as a whole.
 *
 * @param {object} args - `cwd`, `plannerProvider`, `config`, `currentPlan`
 *   and `failures` are read; other fields are ignored here
 * @returns {Promise<{edits: Array, attemptRecord: object}>} the re-expressed
 *   edits and an invocation record for metrics
 * @throws {Error} on a provider error, or when the reply cannot be parsed
 *   into an edit list (the underlying error is attached as `cause`)
 */
async function runEditsRetry(args) {
    const { plannerProvider, config, currentPlan, failures } = args;
    const useStructured = await plannerProvider.checkStructuredOutputSupport();
    const prompt = buildEditsRetryPrompt(currentPlan, failures.map((f) => ({
        edit: f.edit,
        reason: f.reason,
        section_searched: f.section_searched,
        diagnostic: f.diagnostic,
    })), useStructured);
    // The full edits schema is reused for the retry; the retry prompt is what
    // tells the planner to leave `responses` out.
    const jsonSchema = getRevisionJsonSchema("detail", "edits");
    const options = {
        cwd: args.cwd,
        model: config.planner.model,
        effort: config.planner.effort,
        // Only request schema-constrained output when the provider supports it.
        ...(useStructured ? { jsonSchema } : {}),
    };
    const response = await plannerProvider.invoke(prompt, options);
    const attemptRecord = {
        mode: useStructured ? "structured" : "legacy",
        provider: plannerProvider.name,
        model: config.planner.model ?? null,
        effort: config.planner.effort ?? null,
        prompt_chars: prompt.length,
        prompt_lines: prompt.split("\n").length,
        output_chars: response.ok ? response.output.length : null,
        output_lines: response.ok ? response.output.split("\n").length : null,
        duration_ms: response.duration ?? 0,
        ok: false,
        // NOTE(review): this stays "edit-retry" even after `ok` flips to true —
        // presumably a tag that marks the record as a retry; confirm.
        error_kind: "edit-retry",
        error_exit_code: null,
    };
    if (!response.ok) {
        throw new Error(`edits retry: provider error (${response.error.kind}: ${response.error.exitCode})`);
    }
    // Accept either a full edits revision object or a bare `edits` array.
    let edits;
    try {
        const rawJson = useStructured
            ? response.output
            : response.output.match(/<planpong-revision>([\s\S]*?)<\/planpong-revision>/i)?.[1] ??
                response.output;
        edits = extractEditsFromRetryPayload(JSON.parse(rawJson));
    }
    catch (err) {
        // Keep the underlying error reachable for debugging.
        throw new Error(`edits retry: parse failed: ${err instanceof Error ? err.message : String(err)}`, { cause: err });
    }
    attemptRecord.ok = true;
    return { edits, attemptRecord };
}
811
/**
 * Pull the edit list out of a parsed retry reply.
 *
 * Two shapes are accepted: an object carrying an array under `edits`, or a
 * bare array. The object form is checked first.
 *
 * @param {unknown} payload - result of JSON.parse on the planner reply
 * @returns {Array} the edit list, returned by reference
 * @throws {Error} when neither shape matches
 */
function extractEditsFromRetryPayload(payload) {
    const isObjectLike = Boolean(payload) && typeof payload === "object";
    const wrapped = isObjectLike && "edits" in payload ? payload.edits : undefined;
    if (Array.isArray(wrapped)) {
        return wrapped;
    }
    if (Array.isArray(payload)) {
        return payload;
    }
    throw new Error("retry payload missing `edits` array");
}
822
/**
 * Coarse response/edit consistency check.
 *
 * Responses carry no section or edit reference, so an individual `accepted`
 * response cannot be matched to the edit that was meant to implement it.
 * The check is therefore all-or-nothing:
 *
 *   - no unrecoverable failures  → `revision` is returned as-is;
 *   - at least one edit survived → responses are left untouched;
 *   - failures and NO surviving edit (the plan did not change at all) →
 *     every `accepted` response becomes `deferred`, with its rationale
 *     prefixed by `edit_not_applied: ...`.
 *
 * This is conservative: it avoids false-positive downgrades (e.g. an accept
 * that needed no plan change) while still catching the worst case of
 * "everything accepted, nothing applied". Inputs are never mutated.
 *
 * @param {object} revision - revision whose `responses` array is inspected
 * @param {Array} survivingEdits - edits applied in the first pass or retry
 * @param {Array} unrecoverableFailures - edits that could not be applied
 * @returns {object} `revision` itself, or a shallow copy with new responses
 */
function downgradeOrphanedResponses(revision, survivingEdits, unrecoverableFailures) {
    if (unrecoverableFailures.length === 0)
        return revision;
    // Only the count of surviving edits matters. Checking the length directly
    // (rather than reading each edit's `section`) keeps this from throwing on
    // an edit without a string `section` after the plan is already on disk.
    const nothingSurvived = survivingEdits.length === 0;
    const responses = revision.responses.map((resp) => {
        if (!nothingSurvived || resp.action !== "accepted")
            return resp;
        return {
            ...resp,
            action: "deferred",
            rationale: `edit_not_applied: corresponding plan edit failed and could not be recovered. Original rationale: ${resp.rationale}`,
        };
    });
    return { ...revision, responses };
}
860
/**
 * Write the revision-role metrics file again, this time including the edit
 * telemetry (revision mode plus the edit counters).
 *
 * Does nothing when `metrics` is falsy. Any error raised while building or
 * writing the payload is discarded so telemetry can never break the run.
 */
function persistRevisionMetrics(args) {
    const { cwd, session, round, metrics, telemetry } = args;
    if (metrics) {
        try {
            const { revision_mode, edits_attempted, edits_applied, edits_failed, edits_retried, edits_recovered, retry_invoked, } = telemetry;
            writeRoundMetrics(cwd, session.id, round, "revision", {
                ...metrics,
                revision_mode,
                edits_attempted,
                edits_applied,
                edits_failed,
                edits_retried,
                edits_recovered,
                retry_invoked,
            });
        }
        catch {
            // fail-open — telemetry never breaks the run
        }
    }
}
398
887
  /**
399
888
  * Mark the session as approved and update the plan's status line.
400
889
  */