planpong 0.4.2 → 0.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/bin/planpong.js +37 -1
- package/dist/bin/planpong.js.map +1 -1
- package/dist/src/config/defaults.js +1 -0
- package/dist/src/config/defaults.js.map +1 -1
- package/dist/src/config/loader.d.ts +1 -0
- package/dist/src/config/loader.js +3 -0
- package/dist/src/config/loader.js.map +1 -1
- package/dist/src/core/apply-edits.d.ts +40 -0
- package/dist/src/core/apply-edits.js +220 -0
- package/dist/src/core/apply-edits.js.map +1 -0
- package/dist/src/core/convergence.d.ts +18 -2
- package/dist/src/core/convergence.js +21 -9
- package/dist/src/core/convergence.js.map +1 -1
- package/dist/src/core/operations.d.ts +14 -1
- package/dist/src/core/operations.js +551 -62
- package/dist/src/core/operations.js.map +1 -1
- package/dist/src/core/plan-diff.d.ts +23 -0
- package/dist/src/core/plan-diff.js +135 -0
- package/dist/src/core/plan-diff.js.map +1 -0
- package/dist/src/core/session.d.ts +11 -0
- package/dist/src/core/session.js +51 -1
- package/dist/src/core/session.js.map +1 -1
- package/dist/src/mcp/tools/get-feedback.d.ts +16 -0
- package/dist/src/mcp/tools/get-feedback.js +118 -114
- package/dist/src/mcp/tools/get-feedback.js.map +1 -1
- package/dist/src/mcp/tools/revise.d.ts +16 -0
- package/dist/src/mcp/tools/revise.js +76 -61
- package/dist/src/mcp/tools/revise.js.map +1 -1
- package/dist/src/mcp/tools/status.js +15 -1
- package/dist/src/mcp/tools/status.js.map +1 -1
- package/dist/src/prompts/planner.d.ts +34 -1
- package/dist/src/prompts/planner.js +239 -4
- package/dist/src/prompts/planner.js.map +1 -1
- package/dist/src/prompts/reviewer.d.ts +13 -0
- package/dist/src/prompts/reviewer.js +65 -0
- package/dist/src/prompts/reviewer.js.map +1 -1
- package/dist/src/providers/claude.js +19 -3
- package/dist/src/providers/claude.js.map +1 -1
- package/dist/src/providers/codex.js +50 -3
- package/dist/src/providers/codex.js.map +1 -1
- package/dist/src/providers/types.d.ts +20 -0
- package/dist/src/schemas/config.d.ts +3 -0
- package/dist/src/schemas/config.js +6 -0
- package/dist/src/schemas/config.js.map +1 -1
- package/dist/src/schemas/json-schema.d.ts +12 -0
- package/dist/src/schemas/json-schema.js +20 -1
- package/dist/src/schemas/json-schema.js.map +1 -1
- package/dist/src/schemas/metrics.d.ts +171 -0
- package/dist/src/schemas/metrics.js +49 -0
- package/dist/src/schemas/metrics.js.map +1 -0
- package/dist/src/schemas/revision.d.ts +166 -2
- package/dist/src/schemas/revision.js +35 -2
- package/dist/src/schemas/revision.js.map +1 -1
- package/dist/src/schemas/session.d.ts +6 -0
- package/dist/src/schemas/session.js +10 -0
- package/dist/src/schemas/session.js.map +1 -1
- package/package.json +1 -1
|
@@ -1,11 +1,15 @@
|
|
|
1
1
|
import { createHash } from "node:crypto";
|
|
2
2
|
import { readFileSync, writeFileSync, existsSync } from "node:fs";
|
|
3
3
|
import { relative, resolve } from "node:path";
|
|
4
|
-
import {
|
|
5
|
-
import {
|
|
4
|
+
import { isEditsRevision, isDirectionRevision, } from "../schemas/revision.js";
|
|
5
|
+
import { buildRevisionPrompt, buildEditsRetryPrompt, } from "../prompts/planner.js";
|
|
6
|
+
import { buildReviewPrompt, buildIncrementalReviewPrompt, formatPriorDecisions, getReviewPhase, } from "../prompts/reviewer.js";
|
|
7
|
+
import { buildPlanDiff } from "./plan-diff.js";
|
|
6
8
|
import { parseFeedbackForPhase, parseRevision, parseStructuredFeedbackForPhase, parseStructuredRevision, isConverged, StructuredOutputParseError, ZodValidationError, } from "./convergence.js";
|
|
7
|
-
import { getFeedbackJsonSchemaForPhase,
|
|
8
|
-
import {
|
|
9
|
+
import { getFeedbackJsonSchemaForPhase, getRevisionJsonSchema, } from "../schemas/json-schema.js";
|
|
10
|
+
import { applyEdits, logFailures, summarizeApply, } from "./apply-edits.js";
|
|
11
|
+
import { createSession, writeSessionState, writeRoundFeedback, writeRoundResponse, readRoundFeedback, readRoundResponse, writeInitialPlan, writeRoundMetrics, writeRoundPlanSnapshot, readRoundPlanSnapshot, } from "./session.js";
|
|
12
|
+
import { summarizeTiming, } from "../schemas/metrics.js";
|
|
9
13
|
// --- Utility functions ---
|
|
10
14
|
export function hashFile(path) {
|
|
11
15
|
const content = readFileSync(path, "utf-8");
|
|
@@ -240,62 +244,202 @@ function buildPriorDecisions(cwd, sessionId, currentRound) {
|
|
|
240
244
|
* - JSON.parse failure on structured output → downgrade
|
|
241
245
|
* - Zod validation failure on structured output → terminal (NOT retried)
|
|
242
246
|
* - Any failure in legacy mode → terminal
|
|
247
|
+
*
|
|
248
|
+
* Observability: when `metricsContext` is provided, each attempt emits a
|
|
249
|
+
* start/end line to stderr, collects `InvocationAttempt` records, and
|
|
250
|
+
* persists a `RoundMetrics` file in the session directory. All telemetry
|
|
251
|
+
* I/O is fail-open — failures log a warning and are swallowed, never
|
|
252
|
+
* altering the invocation outcome. The in-memory metrics object is
|
|
253
|
+
* returned alongside the result so callers get timing data without a
|
|
254
|
+
* filesystem round-trip.
|
|
243
255
|
*/
|
|
244
256
|
async function invokeWithStateMachine(args) {
|
|
245
|
-
const { provider, invokeOptions, jsonSchema, buildPrompt, parseStructured, parseLegacy, roundLabel, } = args;
|
|
257
|
+
const { provider, invokeOptions, jsonSchema, buildPrompt, parseStructured, parseLegacy, roundLabel, metricsContext, } = args;
|
|
246
258
|
const supported = await provider.checkStructuredOutputSupport();
|
|
247
259
|
let mode = supported ? "structured" : "legacy";
|
|
248
260
|
let attempt = 0;
|
|
249
261
|
const maxAttempts = 2;
|
|
250
262
|
let lastError = null;
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
263
|
+
// Metrics collection — only active when metricsContext is provided.
|
|
264
|
+
const attempts = [];
|
|
265
|
+
const startedAt = new Date().toISOString();
|
|
266
|
+
const startedAtMs = Date.now();
|
|
267
|
+
const providerLabel = buildProviderLabel(provider.name, invokeOptions.model, invokeOptions.effort);
|
|
268
|
+
const writeMetricsNow = () => {
|
|
269
|
+
if (!metricsContext)
|
|
270
|
+
return;
|
|
271
|
+
try {
|
|
272
|
+
const metrics = {
|
|
273
|
+
schema_version: 1,
|
|
274
|
+
session_id: metricsContext.sessionId,
|
|
275
|
+
round: metricsContext.round,
|
|
276
|
+
phase: metricsContext.phase,
|
|
277
|
+
role: metricsContext.role,
|
|
278
|
+
started_at: startedAt,
|
|
279
|
+
completed_at: new Date().toISOString(),
|
|
280
|
+
total_duration_ms: Date.now() - startedAtMs,
|
|
281
|
+
attempts,
|
|
282
|
+
};
|
|
283
|
+
writeRoundMetrics(invokeOptions.cwd, metricsContext.sessionId, metricsContext.round, metricsContext.role, metrics);
|
|
269
284
|
}
|
|
270
|
-
|
|
285
|
+
catch {
|
|
286
|
+
// writeRoundMetrics is already fail-open; this catch is belt-and-braces
|
|
287
|
+
// against unexpected synchronous errors building the metrics object.
|
|
288
|
+
}
|
|
289
|
+
};
|
|
290
|
+
const buildMetrics = () => {
|
|
291
|
+
if (!metricsContext)
|
|
292
|
+
return null;
|
|
271
293
|
try {
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
294
|
+
return {
|
|
295
|
+
schema_version: 1,
|
|
296
|
+
session_id: metricsContext.sessionId,
|
|
297
|
+
round: metricsContext.round,
|
|
298
|
+
phase: metricsContext.phase,
|
|
299
|
+
role: metricsContext.role,
|
|
300
|
+
started_at: startedAt,
|
|
301
|
+
completed_at: new Date().toISOString(),
|
|
302
|
+
total_duration_ms: Date.now() - startedAtMs,
|
|
303
|
+
attempts,
|
|
304
|
+
};
|
|
305
|
+
}
|
|
306
|
+
catch {
|
|
307
|
+
return null;
|
|
276
308
|
}
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
309
|
+
};
|
|
310
|
+
try {
|
|
311
|
+
while (attempt < maxAttempts) {
|
|
312
|
+
attempt++;
|
|
313
|
+
const prompt = buildPrompt(mode === "structured");
|
|
314
|
+
const promptChars = prompt.length;
|
|
315
|
+
const promptLines = prompt.split("\n").length;
|
|
316
|
+
const options = mode === "structured"
|
|
317
|
+
? { ...invokeOptions, jsonSchema }
|
|
318
|
+
: { ...invokeOptions };
|
|
319
|
+
logStart(roundLabel, providerLabel, mode, promptChars, metricsContext);
|
|
320
|
+
const response = await provider.invoke(prompt, options);
|
|
321
|
+
// Base attempt record — filled in below.
|
|
322
|
+
const attemptRecord = {
|
|
323
|
+
mode,
|
|
324
|
+
provider: provider.name,
|
|
325
|
+
model: invokeOptions.model ?? null,
|
|
326
|
+
effort: invokeOptions.effort ?? null,
|
|
327
|
+
prompt_chars: promptChars,
|
|
328
|
+
prompt_lines: promptLines,
|
|
329
|
+
output_chars: null,
|
|
330
|
+
output_lines: null,
|
|
331
|
+
duration_ms: response.duration ?? 0,
|
|
332
|
+
ok: false,
|
|
333
|
+
error_kind: null,
|
|
334
|
+
error_exit_code: null,
|
|
335
|
+
};
|
|
336
|
+
if (!response.ok) {
|
|
337
|
+
attemptRecord.ok = false;
|
|
338
|
+
attemptRecord.error_kind = response.error.kind;
|
|
339
|
+
attemptRecord.error_exit_code = response.error.exitCode;
|
|
340
|
+
attempts.push(attemptRecord);
|
|
341
|
+
logEnd(roundLabel, providerLabel, mode, promptChars, null, response.duration ?? 0, false, `${response.error.kind}: ${truncate(response.error.message, 200)}`, metricsContext);
|
|
342
|
+
if (mode === "structured" &&
|
|
343
|
+
response.error.kind === "capability" &&
|
|
344
|
+
attempt < maxAttempts) {
|
|
345
|
+
provider.markNonCapable();
|
|
346
|
+
mode = "legacy";
|
|
347
|
+
continue;
|
|
348
|
+
}
|
|
349
|
+
// Fatal, or already in legacy mode — terminal
|
|
350
|
+
throw new Error(`${roundLabel} failed (exit ${response.error.exitCode}, ${response.error.kind}):\n${response.error.message}`);
|
|
283
351
|
}
|
|
284
|
-
//
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
352
|
+
// Provider returned output — record output size, try to parse.
|
|
353
|
+
const outputChars = response.output.length;
|
|
354
|
+
const outputLines = response.output.split("\n").length;
|
|
355
|
+
attemptRecord.output_chars = outputChars;
|
|
356
|
+
attemptRecord.output_lines = outputLines;
|
|
357
|
+
try {
|
|
358
|
+
const parsed = mode === "structured"
|
|
359
|
+
? parseStructured(response.output)
|
|
360
|
+
: parseLegacy(response.output);
|
|
361
|
+
attemptRecord.ok = true;
|
|
362
|
+
attempts.push(attemptRecord);
|
|
363
|
+
logEnd(roundLabel, providerLabel, mode, promptChars, outputChars, response.duration ?? 0, true, null, metricsContext);
|
|
364
|
+
return {
|
|
365
|
+
result: parsed,
|
|
366
|
+
metrics: buildMetrics(),
|
|
367
|
+
sessionId: response.ok ? response.sessionId : undefined,
|
|
368
|
+
};
|
|
369
|
+
}
|
|
370
|
+
catch (parseError) {
|
|
371
|
+
lastError = parseError instanceof Error ? parseError : new Error(String(parseError));
|
|
372
|
+
// Zod validation failure on structured output is terminal — the model
|
|
373
|
+
// produced semantically invalid content, retrying won't help.
|
|
374
|
+
if (parseError instanceof ZodValidationError) {
|
|
375
|
+
attemptRecord.ok = false;
|
|
376
|
+
attemptRecord.error_kind = "zod";
|
|
377
|
+
attempts.push(attemptRecord);
|
|
378
|
+
logEnd(roundLabel, providerLabel, mode, promptChars, outputChars, response.duration ?? 0, false, `zod: ${truncate(lastError.message, 200)}`, metricsContext);
|
|
379
|
+
throw parseError;
|
|
380
|
+
}
|
|
381
|
+
// JSON.parse failure on structured output triggers downgrade
|
|
382
|
+
if (mode === "structured" &&
|
|
383
|
+
parseError instanceof StructuredOutputParseError &&
|
|
384
|
+
attempt < maxAttempts) {
|
|
385
|
+
attemptRecord.ok = false;
|
|
386
|
+
attemptRecord.error_kind = "parse";
|
|
387
|
+
attempts.push(attemptRecord);
|
|
388
|
+
logEnd(roundLabel, providerLabel, mode, promptChars, outputChars, response.duration ?? 0, false, `parse: ${truncate(lastError.message, 200)}`, metricsContext);
|
|
389
|
+
provider.markNonCapable();
|
|
390
|
+
mode = "legacy";
|
|
391
|
+
continue;
|
|
392
|
+
}
|
|
393
|
+
// Legacy parse failure — terminal
|
|
394
|
+
attemptRecord.ok = false;
|
|
395
|
+
attemptRecord.error_kind = "parse";
|
|
396
|
+
attempts.push(attemptRecord);
|
|
397
|
+
logEnd(roundLabel, providerLabel, mode, promptChars, outputChars, response.duration ?? 0, false, `parse: ${truncate(lastError.message, 200)}`, metricsContext);
|
|
398
|
+
throw new Error(`${roundLabel} parse failed in ${mode} mode: ${lastError.message}`);
|
|
292
399
|
}
|
|
293
|
-
// Legacy parse failure — terminal
|
|
294
|
-
throw new Error(`${roundLabel} parse failed in ${mode} mode: ${lastError.message}`);
|
|
295
400
|
}
|
|
401
|
+
// Unreachable in normal flow — defensive
|
|
402
|
+
throw lastError ?? new Error(`${roundLabel} exhausted all attempts`);
|
|
403
|
+
}
|
|
404
|
+
finally {
|
|
405
|
+
// Persist metrics on every exit (success or throw). Fail-open — this
|
|
406
|
+
// never throws; writeRoundMetrics catches its own errors.
|
|
407
|
+
writeMetricsNow();
|
|
408
|
+
}
|
|
409
|
+
}
|
|
410
|
+
function buildProviderLabel(providerName, model, effort) {
|
|
411
|
+
return formatProviderLabel({
|
|
412
|
+
provider: providerName,
|
|
413
|
+
model: model ?? undefined,
|
|
414
|
+
effort: effort ?? undefined,
|
|
415
|
+
});
|
|
416
|
+
}
|
|
417
|
+
function truncate(text, max) {
|
|
418
|
+
return text.length > max ? text.slice(0, max) : text;
|
|
419
|
+
}
|
|
420
|
+
function safeStderr(line) {
|
|
421
|
+
try {
|
|
422
|
+
process.stderr.write(line);
|
|
423
|
+
}
|
|
424
|
+
catch {
|
|
425
|
+
// stderr unavailable — nothing else we can do
|
|
426
|
+
}
|
|
427
|
+
}
|
|
428
|
+
function logStart(roundLabel, providerLabel, mode, promptChars, ctx) {
|
|
429
|
+
if (!ctx)
|
|
430
|
+
return;
|
|
431
|
+
safeStderr(`[planpong] R${ctx.round} ${ctx.role} | ${providerLabel} | ${mode} | prompt=${promptChars}c\n`);
|
|
432
|
+
}
|
|
433
|
+
function logEnd(roundLabel, providerLabel, mode, promptChars, outputChars, durationMs, ok, failDetail, ctx) {
|
|
434
|
+
if (!ctx)
|
|
435
|
+
return;
|
|
436
|
+
const durationStr = formatDuration(durationMs);
|
|
437
|
+
if (ok && outputChars !== null) {
|
|
438
|
+
safeStderr(`[planpong] R${ctx.round} ${ctx.role} | ${providerLabel} | ${mode} | prompt=${promptChars}c output=${outputChars}c duration=${durationStr} | ok\n`);
|
|
439
|
+
}
|
|
440
|
+
else {
|
|
441
|
+
safeStderr(`[planpong] R${ctx.round} ${ctx.role} | ${providerLabel} | ${mode} | prompt=${promptChars}c duration=${durationStr} | fail (${failDetail ?? "unknown"})\n`);
|
|
296
442
|
}
|
|
297
|
-
// Unreachable in normal flow — defensive
|
|
298
|
-
throw lastError ?? new Error(`${roundLabel} exhausted all attempts`);
|
|
299
443
|
}
|
|
300
444
|
/**
|
|
301
445
|
* Run a single review round: send current plan to the reviewer for critique.
|
|
@@ -306,22 +450,68 @@ export async function runReviewRound(session, cwd, config, reviewerProvider) {
|
|
|
306
450
|
const planContent = readFileSync(planPath, "utf-8");
|
|
307
451
|
const phase = getReviewPhase(round);
|
|
308
452
|
const priorDecisions = buildPriorDecisions(cwd, session.id, round);
|
|
309
|
-
|
|
453
|
+
// Persist a snapshot of the plan as the reviewer is about to see it. On
|
|
454
|
+
// round N+1 we'll diff against this snapshot to produce the incremental
|
|
455
|
+
// "what changed" content for the resumed reviewer session.
|
|
456
|
+
writeRoundPlanSnapshot(cwd, session.id, round, planContent);
|
|
457
|
+
// Reviewer-side persistent sessions. Both claude and codex support this:
|
|
458
|
+
// - claude: we generate the UUID and pass it via --session-id (first)
|
|
459
|
+
// or --resume (subsequent).
|
|
460
|
+
// - codex: codex generates its own thread_id; we capture it from the
|
|
461
|
+
// `--json` event stream and pass it via `codex exec resume <id>`
|
|
462
|
+
// on subsequent calls.
|
|
463
|
+
// The canonical reviewer session ID is `session.reviewerSessionId` — for
|
|
464
|
+
// claude this is the pre-generated UUID; for codex it's overwritten
|
|
465
|
+
// after the first call with the captured thread_id.
|
|
466
|
+
const reviewerSessionInited = session.reviewerSessionInitialized === true;
|
|
467
|
+
const isResumedReviewerSession = reviewerSessionInited;
|
|
468
|
+
const priorPlanContent = isResumedReviewerSession
|
|
469
|
+
? readRoundPlanSnapshot(cwd, session.id, round - 1)
|
|
470
|
+
: null;
|
|
471
|
+
const planDiff = priorPlanContent
|
|
472
|
+
? buildPlanDiff(priorPlanContent, planContent)
|
|
473
|
+
: null;
|
|
474
|
+
const newSessionId = !reviewerSessionInited && reviewerProvider.name === "claude"
|
|
475
|
+
? session.reviewerSessionId
|
|
476
|
+
: undefined;
|
|
477
|
+
const resumeSessionId = reviewerSessionInited
|
|
478
|
+
? session.reviewerSessionId
|
|
479
|
+
: undefined;
|
|
480
|
+
const { result: feedback, metrics, sessionId: capturedSessionId, } = await invokeWithStateMachine({
|
|
310
481
|
provider: reviewerProvider,
|
|
311
482
|
invokeOptions: {
|
|
312
483
|
cwd,
|
|
313
484
|
model: config.reviewer.model,
|
|
314
485
|
effort: config.reviewer.effort,
|
|
486
|
+
newSessionId,
|
|
487
|
+
resumeSessionId,
|
|
315
488
|
},
|
|
316
489
|
jsonSchema: getFeedbackJsonSchemaForPhase(phase),
|
|
317
|
-
buildPrompt: (structuredOutput) =>
|
|
490
|
+
buildPrompt: (structuredOutput) => isResumedReviewerSession
|
|
491
|
+
? buildIncrementalReviewPrompt(planDiff ?? planContent, priorDecisions, phase, structuredOutput)
|
|
492
|
+
: buildReviewPrompt(planContent, priorDecisions, phase, structuredOutput),
|
|
318
493
|
parseStructured: (output) => parseStructuredFeedbackForPhase(output, phase),
|
|
319
494
|
parseLegacy: (output) => parseFeedbackForPhase(output, phase),
|
|
320
495
|
roundLabel: `Round ${round} review`,
|
|
496
|
+
metricsContext: {
|
|
497
|
+
sessionId: session.id,
|
|
498
|
+
round,
|
|
499
|
+
phase,
|
|
500
|
+
role: "review",
|
|
501
|
+
},
|
|
321
502
|
});
|
|
322
503
|
writeRoundFeedback(cwd, session.id, round, feedback);
|
|
323
504
|
const severity = severityFromFeedback(feedback);
|
|
324
505
|
const converged = isConverged(feedback);
|
|
506
|
+
const timing = metrics ? summarizeTiming(metrics) : undefined;
|
|
507
|
+
// Persist the canonical reviewer session ID. For claude this is the
|
|
508
|
+
// UUID we generated; for codex it's the thread_id captured from --json
|
|
509
|
+
// output. Either way, future rounds resume this conversation.
|
|
510
|
+
if (!reviewerSessionInited && capturedSessionId) {
|
|
511
|
+
session.reviewerSessionId = capturedSessionId;
|
|
512
|
+
session.reviewerSessionInitialized = true;
|
|
513
|
+
writeSessionState(cwd, session);
|
|
514
|
+
}
|
|
325
515
|
// Extract phase-specific extras for status line
|
|
326
516
|
const phaseExtras = {};
|
|
327
517
|
if (feedback.verdict === "blocked") {
|
|
@@ -340,7 +530,7 @@ export async function runReviewRound(session, cwd, config, reviewerProvider) {
|
|
|
340
530
|
phaseExtras.risks_promoted = feedback.issues.length;
|
|
341
531
|
}
|
|
342
532
|
}
|
|
343
|
-
return { round, feedback, severity, converged, phaseExtras };
|
|
533
|
+
return { round, feedback, severity, converged, phaseExtras, timing };
|
|
344
534
|
}
|
|
345
535
|
/**
|
|
346
536
|
* Run a single revision round: send plan + feedback to the planner for revision.
|
|
@@ -355,25 +545,86 @@ export async function runRevisionRound(session, cwd, config, plannerProvider) {
|
|
|
355
545
|
}
|
|
356
546
|
const phase = getReviewPhase(round);
|
|
357
547
|
const keyDecisions = extractKeyDecisions(planContent);
|
|
358
|
-
|
|
548
|
+
// Direction phase always uses full-plan output. Risk + detail honor
|
|
549
|
+
// config.revision_mode. The shape decision is made once here and threaded
|
|
550
|
+
// through prompt + JSON schema + parser.
|
|
551
|
+
const useEdits = phase !== "direction" && config.revision_mode === "edits";
|
|
552
|
+
const revisionShape = useEdits ? "edits" : "full";
|
|
553
|
+
const jsonSchema = getRevisionJsonSchema(phase, config.revision_mode);
|
|
554
|
+
// Planner-side persistent sessions were tested and found to INCREASE wall
|
|
555
|
+
// time — the model used the spared context budget to do more work per
|
|
556
|
+
// round (more edits, deeper revisions), not to do the same work faster.
|
|
557
|
+
// Reviewer-side persistent sessions are kept (see runReviewRound).
|
|
558
|
+
const { result: revision, metrics } = await invokeWithStateMachine({
|
|
359
559
|
provider: plannerProvider,
|
|
360
560
|
invokeOptions: {
|
|
361
561
|
cwd,
|
|
362
562
|
model: config.planner.model,
|
|
363
563
|
effort: config.planner.effort,
|
|
364
564
|
},
|
|
365
|
-
jsonSchema
|
|
366
|
-
buildPrompt: (structuredOutput) => buildRevisionPrompt(planContent, feedback, keyDecisions, null, phase, structuredOutput),
|
|
367
|
-
parseStructured: (output) => parseStructuredRevision(output),
|
|
368
|
-
parseLegacy: (output) => parseRevision(output),
|
|
565
|
+
jsonSchema,
|
|
566
|
+
buildPrompt: (structuredOutput) => buildRevisionPrompt(planContent, feedback, keyDecisions, null, phase, structuredOutput, config.revision_mode),
|
|
567
|
+
parseStructured: (output) => parseStructuredRevision(output, revisionShape),
|
|
568
|
+
parseLegacy: (output) => parseRevision(output, revisionShape),
|
|
369
569
|
roundLabel: `Round ${round} revision`,
|
|
570
|
+
metricsContext: {
|
|
571
|
+
sessionId: session.id,
|
|
572
|
+
round,
|
|
573
|
+
phase,
|
|
574
|
+
role: "revision",
|
|
575
|
+
},
|
|
370
576
|
});
|
|
371
577
|
writeRoundResponse(cwd, session.id, round, revision);
|
|
372
|
-
|
|
578
|
+
const timing = metrics ? summarizeTiming(metrics) : undefined;
|
|
579
|
+
// Apply revision to disk. Two paths: full (today's behavior) or edits
|
|
580
|
+
// (apply edit list, retry failures, single write).
|
|
581
|
+
let editTelemetry;
|
|
582
|
+
let finalRevision = revision;
|
|
583
|
+
if (useEdits && isEditsRevision(revision)) {
|
|
584
|
+
const result = await applyRevisionEdits({
|
|
585
|
+
session,
|
|
586
|
+
cwd,
|
|
587
|
+
planPath,
|
|
588
|
+
planContent,
|
|
589
|
+
revision,
|
|
590
|
+
plannerProvider,
|
|
591
|
+
config,
|
|
592
|
+
phase,
|
|
593
|
+
metrics,
|
|
594
|
+
});
|
|
595
|
+
finalRevision = result.revision;
|
|
596
|
+
editTelemetry = result.telemetry;
|
|
597
|
+
}
|
|
598
|
+
else if (isDirectionRevision(revision)) {
|
|
599
|
+
writeFileSync(planPath, revision.updated_plan);
|
|
600
|
+
editTelemetry = {
|
|
601
|
+
revision_mode: "full",
|
|
602
|
+
edits_attempted: null,
|
|
603
|
+
edits_applied: null,
|
|
604
|
+
edits_failed: null,
|
|
605
|
+
edits_retried: null,
|
|
606
|
+
edits_recovered: null,
|
|
607
|
+
retry_invoked: false,
|
|
608
|
+
};
|
|
609
|
+
persistRevisionMetrics({
|
|
610
|
+
cwd,
|
|
611
|
+
session,
|
|
612
|
+
round,
|
|
613
|
+
phase,
|
|
614
|
+
metrics,
|
|
615
|
+
telemetry: editTelemetry,
|
|
616
|
+
});
|
|
617
|
+
}
|
|
618
|
+
else {
|
|
619
|
+
throw new Error(`runRevisionRound: revision shape mismatch — expected ${useEdits ? "edits" : "full"} but got ${"updated_plan" in revision ? "full" : "edits"}`);
|
|
620
|
+
}
|
|
621
|
+
session.planHash = hashFile(planPath);
|
|
622
|
+
writeSessionState(cwd, session);
|
|
623
|
+
// Tally responses (use the possibly-downgraded responses from finalRevision).
|
|
373
624
|
let accepted = 0;
|
|
374
625
|
let rejected = 0;
|
|
375
626
|
let deferred = 0;
|
|
376
|
-
for (const resp of
|
|
627
|
+
for (const resp of finalRevision.responses) {
|
|
377
628
|
if (resp.action === "accepted")
|
|
378
629
|
accepted++;
|
|
379
630
|
else if (resp.action === "rejected")
|
|
@@ -381,20 +632,258 @@ export async function runRevisionRound(session, cwd, config, plannerProvider) {
|
|
|
381
632
|
else if (resp.action === "deferred")
|
|
382
633
|
deferred++;
|
|
383
634
|
}
|
|
384
|
-
// Write updated plan to disk
|
|
385
|
-
const updatedPlan = revision.updated_plan;
|
|
386
|
-
writeFileSync(planPath, updatedPlan);
|
|
387
|
-
session.planHash = hashFile(planPath);
|
|
388
|
-
writeSessionState(cwd, session);
|
|
389
635
|
return {
|
|
390
636
|
round,
|
|
391
|
-
revision,
|
|
637
|
+
revision: finalRevision,
|
|
392
638
|
accepted,
|
|
393
639
|
rejected,
|
|
394
640
|
deferred,
|
|
395
641
|
planUpdated: true,
|
|
642
|
+
timing,
|
|
643
|
+
edits: editTelemetry,
|
|
396
644
|
};
|
|
397
645
|
}
|
|
646
|
+
/**
|
|
647
|
+
* Apply an edits-mode revision: first-pass apply, targeted retry on failures,
|
|
648
|
+
* single write, response-edit consistency check. All mutations to the plan
|
|
649
|
+
* happen in memory; a single writeFileSync persists the final state.
|
|
650
|
+
*/
|
|
651
|
+
async function applyRevisionEdits(args) {
|
|
652
|
+
const { session, cwd, planPath, planContent, revision, plannerProvider, config, phase, metrics, } = args;
|
|
653
|
+
const round = session.currentRound;
|
|
654
|
+
const editsAttempted = revision.edits.length;
|
|
655
|
+
// First-pass apply.
|
|
656
|
+
const firstPass = applyEdits(planContent, revision.edits);
|
|
657
|
+
if (firstPass.failures.length > 0) {
|
|
658
|
+
logFailures(`R${round} edits first-pass`, firstPass.failures);
|
|
659
|
+
}
|
|
660
|
+
safeStderr(`[planpong] R${round} edits | first-pass | ${summarizeApply(firstPass)}\n`);
|
|
661
|
+
let working = firstPass.plan;
|
|
662
|
+
const successfulEdits = firstPass.applied.map((a) => a.edit);
|
|
663
|
+
const recoveredEdits = [];
|
|
664
|
+
const unrecoverableFailures = [];
|
|
665
|
+
let retryInvoked = false;
|
|
666
|
+
let retriedCount = 0;
|
|
667
|
+
if (firstPass.failures.length > 0) {
|
|
668
|
+
retryInvoked = true;
|
|
669
|
+
retriedCount = firstPass.failures.length;
|
|
670
|
+
try {
|
|
671
|
+
const retryResult = await runEditsRetry({
|
|
672
|
+
cwd,
|
|
673
|
+
session,
|
|
674
|
+
round,
|
|
675
|
+
phase,
|
|
676
|
+
plannerProvider,
|
|
677
|
+
config,
|
|
678
|
+
currentPlan: working,
|
|
679
|
+
failures: firstPass.failures,
|
|
680
|
+
});
|
|
681
|
+
const secondPass = applyEdits(working, retryResult.edits);
|
|
682
|
+
if (secondPass.failures.length > 0) {
|
|
683
|
+
logFailures(`R${round} edits retry`, secondPass.failures);
|
|
684
|
+
}
|
|
685
|
+
safeStderr(`[planpong] R${round} edits | retry | ${summarizeApply(secondPass)}\n`);
|
|
686
|
+
working = secondPass.plan;
|
|
687
|
+
for (const a of secondPass.applied)
|
|
688
|
+
recoveredEdits.push(a.edit);
|
|
689
|
+
unrecoverableFailures.push(...secondPass.failures);
|
|
690
|
+
// Track the retry as an additional invocation attempt in metrics.
|
|
691
|
+
if (metrics) {
|
|
692
|
+
metrics.attempts.push(retryResult.attemptRecord);
|
|
693
|
+
}
|
|
694
|
+
}
|
|
695
|
+
catch (err) {
|
|
696
|
+
// Retry failed entirely (provider error, parse error). Surface but
|
|
697
|
+
// keep first-pass partial result — strictly better than nothing.
|
|
698
|
+
safeStderr(`[planpong] R${round} edits | retry failed: ${err instanceof Error ? err.message : String(err)}\n`);
|
|
699
|
+
unrecoverableFailures.push(...firstPass.failures);
|
|
700
|
+
}
|
|
701
|
+
}
|
|
702
|
+
// Single write of the final plan state (plain writeFileSync, no temp-file-and-rename, so not crash-atomic).
|
|
703
|
+
writeFileSync(planPath, working);
|
|
704
|
+
// Response-edit consistency check: if an `accepted` response has no
|
|
705
|
+
// surviving edit anywhere in its rationale or suggestion's section, the
|
|
706
|
+
// planner claimed to have addressed an issue without a corresponding plan
|
|
707
|
+
// change. Downgrade to `deferred`. The match is heuristic — keyed on the
|
|
708
|
+
// response's `issue_id` appearing in the edit's after text or in any
|
|
709
|
+
// edit's section that maps to the issue's section field. This is the same
|
|
710
|
+
// tradeoff the plan documents (R3 F2 issue, accepted as heuristic).
|
|
711
|
+
const survivingEdits = [...successfulEdits, ...recoveredEdits];
|
|
712
|
+
const downgraded = downgradeOrphanedResponses(revision, survivingEdits, unrecoverableFailures);
|
|
713
|
+
// Persist failure metadata in the round response JSON alongside responses.
|
|
714
|
+
// We rewrite the response file to include the (possibly-downgraded)
|
|
715
|
+
// responses + edit application result.
|
|
716
|
+
writeRoundResponse(cwd, session.id, round, downgraded);
|
|
717
|
+
const telemetry = {
|
|
718
|
+
revision_mode: "edits",
|
|
719
|
+
edits_attempted: editsAttempted,
|
|
720
|
+
edits_applied: successfulEdits.length,
|
|
721
|
+
edits_failed: firstPass.failures.length,
|
|
722
|
+
edits_retried: retriedCount,
|
|
723
|
+
edits_recovered: recoveredEdits.length,
|
|
724
|
+
retry_invoked: retryInvoked,
|
|
725
|
+
};
|
|
726
|
+
persistRevisionMetrics({
|
|
727
|
+
cwd,
|
|
728
|
+
session,
|
|
729
|
+
round,
|
|
730
|
+
phase,
|
|
731
|
+
metrics,
|
|
732
|
+
telemetry,
|
|
733
|
+
});
|
|
734
|
+
return { revision: downgraded, telemetry };
|
|
735
|
+
}
|
|
736
|
+
/**
|
|
737
|
+
* One-shot retry for failed edits. Builds a targeted prompt with only the
|
|
738
|
+
* failures + current (partially-edited) plan and asks the planner to
|
|
739
|
+
* re-express each failed edit. The retry is best-effort — provider/parse
|
|
740
|
+
* errors are caught by the caller and treated as "no recovery."
|
|
741
|
+
*/
|
|
742
|
+
async function runEditsRetry(args) {
|
|
743
|
+
const { plannerProvider, config, currentPlan, failures } = args;
|
|
744
|
+
const supported = await plannerProvider.checkStructuredOutputSupport();
|
|
745
|
+
const useStructured = supported;
|
|
746
|
+
const prompt = buildEditsRetryPrompt(currentPlan, failures.map((f) => ({
|
|
747
|
+
edit: f.edit,
|
|
748
|
+
reason: f.reason,
|
|
749
|
+
section_searched: f.section_searched,
|
|
750
|
+
diagnostic: f.diagnostic,
|
|
751
|
+
})), useStructured);
|
|
752
|
+
// The retry only needs an `edits` array, not a full revision. We lift
|
|
753
|
+
// the EditsRevisionJsonSchema's `edits` block by using the full schema
|
|
754
|
+
// and then ignoring the `responses` field (the planner is asked to omit
|
|
755
|
+
// it). For simplicity reuse the full edits schema; the retry prompt
|
|
756
|
+
// explicitly tells the planner not to include `responses`.
|
|
757
|
+
const jsonSchema = getRevisionJsonSchema("detail", "edits");
|
|
758
|
+
const promptChars = prompt.length;
|
|
759
|
+
const promptLines = prompt.split("\n").length;
|
|
760
|
+
const options = useStructured
|
|
761
|
+
? {
|
|
762
|
+
cwd: args.cwd,
|
|
763
|
+
model: config.planner.model,
|
|
764
|
+
effort: config.planner.effort,
|
|
765
|
+
jsonSchema,
|
|
766
|
+
}
|
|
767
|
+
: {
|
|
768
|
+
cwd: args.cwd,
|
|
769
|
+
model: config.planner.model,
|
|
770
|
+
effort: config.planner.effort,
|
|
771
|
+
};
|
|
772
|
+
const response = await plannerProvider.invoke(prompt, options);
|
|
773
|
+
const attemptRecord = {
|
|
774
|
+
mode: useStructured ? "structured" : "legacy",
|
|
775
|
+
provider: plannerProvider.name,
|
|
776
|
+
model: config.planner.model ?? null,
|
|
777
|
+
effort: config.planner.effort ?? null,
|
|
778
|
+
prompt_chars: promptChars,
|
|
779
|
+
prompt_lines: promptLines,
|
|
780
|
+
output_chars: response.ok ? response.output.length : null,
|
|
781
|
+
output_lines: response.ok ? response.output.split("\n").length : null,
|
|
782
|
+
duration_ms: response.duration ?? 0,
|
|
783
|
+
ok: false,
|
|
784
|
+
error_kind: "edit-retry",
|
|
785
|
+
error_exit_code: null,
|
|
786
|
+
};
|
|
787
|
+
if (!response.ok) {
|
|
788
|
+
throw new Error(`edits retry: provider error (${response.error.kind}: ${response.error.exitCode})`);
|
|
789
|
+
}
|
|
790
|
+
// Parse the retry response — accept either a full edits revision (with
|
|
791
|
+
// empty responses) or just an `edits` array wrapped in the standard tags.
|
|
792
|
+
let edits;
|
|
793
|
+
try {
|
|
794
|
+
if (useStructured) {
|
|
795
|
+
const parsed = JSON.parse(response.output);
|
|
796
|
+
edits = extractEditsFromRetryPayload(parsed);
|
|
797
|
+
}
|
|
798
|
+
else {
|
|
799
|
+
const json = response.output.match(/<planpong-revision>([\s\S]*?)<\/planpong-revision>/i)?.[1] ??
|
|
800
|
+
response.output;
|
|
801
|
+
const parsed = JSON.parse(json);
|
|
802
|
+
edits = extractEditsFromRetryPayload(parsed);
|
|
803
|
+
}
|
|
804
|
+
}
|
|
805
|
+
catch (err) {
|
|
806
|
+
throw new Error(`edits retry: parse failed: ${err instanceof Error ? err.message : String(err)}`);
|
|
807
|
+
}
|
|
808
|
+
attemptRecord.ok = true;
|
|
809
|
+
return { edits, attemptRecord };
|
|
810
|
+
}
|
|
811
|
+
/**
 * Pull the list of edits out of a parsed retry response.
 *
 * Two payload shapes are accepted:
 *   - an object carrying an `edits` array (any other fields are ignored);
 *   - a bare array, which is returned as-is.
 *
 * @param {unknown} payload - Parsed JSON from the planner's retry response.
 * @returns {Array} The edits array found in (or equal to) the payload.
 * @throws {Error} When the payload matches neither accepted shape.
 */
function extractEditsFromRetryPayload(payload) {
    const isObjectLike = payload !== null && typeof payload === "object";
    // The wrapped form wins over the bare-array form, so it is checked first.
    const found = isObjectLike && Array.isArray(payload.edits)
        ? payload.edits
        : Array.isArray(payload)
            ? payload
            : undefined;
    if (found === undefined) {
        throw new Error("retry payload missing `edits` array");
    }
    return found;
}
|
|
822
|
+
/**
 * Heuristic response/edit consistency check.
 *
 * Responses carry no section or edit identifier, so an individual `accepted`
 * response cannot be matched to the edit that was meant to implement it.
 * The check is therefore all-or-nothing:
 *
 *   - If there were no unrecoverable edit failures, the revision is returned
 *     unchanged (same object) and the planner's accepts are taken at face
 *     value.
 *   - Otherwise, if not a single edit survived (the plan did not change at
 *     all), every `accepted` response is downgraded to `deferred` and its
 *     rationale is prefixed with `edit_not_applied: ...`.
 *   - Otherwise (some edits survived) the responses are left as they are.
 *
 * This is deliberately conservative. It can still produce false positives:
 * an accepted response that needed no plan change (e.g. "this was already
 * addressed") is downgraded when every edit failed.
 *
 * The input revision and its responses are never mutated.
 *
 * @param {{responses: Array<{action: string, rationale: string}>}} revision
 *   Parsed planner revision.
 * @param {Array} survivingEdits - Edits that ended up applied to the plan.
 *   Only the count is inspected, so entries need no particular shape.
 * @param {Array} unrecoverableFailures - Edits that failed and could not be
 *   recovered.
 * @returns {object} `revision` itself when there were no unrecoverable
 *   failures, otherwise a shallow copy with a new `responses` array.
 */
function downgradeOrphanedResponses(revision, survivingEdits, unrecoverableFailures) {
    if (unrecoverableFailures.length === 0)
        return revision;
    // Only "did anything land at all" matters here, so check the count
    // directly rather than deriving a set of section names; that way an edit
    // without a string `section` cannot make this check throw.
    const noEditSurvived = survivingEdits.length === 0;
    const responses = revision.responses.map((resp) => {
        if (!noEditSurvived || resp.action !== "accepted")
            return resp;
        return {
            ...resp,
            action: "deferred",
            rationale: `edit_not_applied: corresponding plan edit failed and could not be recovered. Original rationale: ${resp.rationale}`,
        };
    });
    return { ...revision, responses };
}
|
|
860
|
+
/**
 * Rewrite the revision-phase metrics for a round with edit telemetry added.
 *
 * The given `metrics` are copied and extended with `revision_mode`, the
 * edit counters and `retry_invoked` taken from `telemetry`, then handed to
 * `writeRoundMetrics` under the "revision" phase. When `metrics` is falsy
 * nothing is written.
 *
 * Fail-open: any error raised while building or writing the record is
 * swallowed on purpose, so a telemetry problem never breaks the run.
 *
 * @param {object} args
 * @param {string} args.cwd - Passed through to `writeRoundMetrics`.
 * @param {{id: string}} args.session - Session whose `id` keys the record.
 * @param {number} args.round - Round the metrics belong to.
 * @param {object|null|undefined} args.metrics - Base metrics to extend.
 * @param {object} args.telemetry - Source of the edit telemetry fields.
 * @returns {void}
 */
function persistRevisionMetrics(args) {
    const { metrics } = args;
    if (!metrics) {
        return;
    }
    try {
        const {
            revision_mode,
            edits_attempted,
            edits_applied,
            edits_failed,
            edits_retried,
            edits_recovered,
            retry_invoked,
        } = args.telemetry;
        // Telemetry keys come after the spread so they override any
        // same-named keys already present in the base metrics.
        writeRoundMetrics(args.cwd, args.session.id, args.round, "revision", {
            ...metrics,
            revision_mode,
            edits_attempted,
            edits_applied,
            edits_failed,
            edits_retried,
            edits_recovered,
            retry_invoked,
        });
    }
    catch {
        // fail-open — telemetry never breaks the run
    }
}
|
|
398
887
|
/**
|
|
399
888
|
* Mark the session as approved and update the plan's status line.
|
|
400
889
|
*/
|