@sebastianandreasson/pi-autonomous-agents 0.11.0 → 0.12.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +9 -1
- package/docs/PI_SUPERVISOR.md +10 -0
- package/package.json +1 -1
- package/src/cli.mjs +4 -1
- package/src/pi-config.mjs +9 -0
- package/src/pi-prompts.mjs +19 -0
- package/src/pi-repo.mjs +5 -2
- package/src/pi-report.mjs +15 -0
- package/src/pi-supervisor.mjs +522 -28
- package/src/pi-telemetry.mjs +6 -1
- package/visualizer-ui/dist/assets/index-Bbj-UfL5.js +12 -0
- package/visualizer-ui/dist/assets/index-CO5voAk0.css +1 -0
- package/visualizer-ui/dist/index.html +2 -2
- package/visualizer-ui/dist/assets/index-C5V0jXPE.css +0 -1
- package/visualizer-ui/dist/assets/index-CpHvuv0C.js +0 -12
package/README.md
CHANGED
|
@@ -190,10 +190,13 @@ Common fields in `pi.config.json`:
|
|
|
190
190
|
- `testCommand`
|
|
191
191
|
- `visualReviewEnabled`
|
|
192
192
|
- `visualCaptureCommand`
|
|
193
|
+
- `failureArtifactDir`
|
|
193
194
|
- `continueAfterSeconds`
|
|
194
195
|
- `toolContinueAfterSeconds`
|
|
195
196
|
- `noEventTimeoutSeconds`
|
|
196
197
|
- `toolNoEventTimeoutSeconds`
|
|
198
|
+
- `sameFileLoopBudget`
|
|
199
|
+
- `loopHistoryLimit`
|
|
197
200
|
- `largeFileWarningLines`
|
|
198
201
|
- `largeSpecWarningLines`
|
|
199
202
|
|
|
@@ -207,6 +210,8 @@ Key defaults:
|
|
|
207
210
|
- `toolContinueAfterSeconds`: `900`
|
|
208
211
|
- `noEventTimeoutSeconds`: `900`
|
|
209
212
|
- `toolNoEventTimeoutSeconds`: `1800`
|
|
213
|
+
- `sameFileLoopBudget`: `2`
|
|
214
|
+
- `loopHistoryLimit`: `25`
|
|
210
215
|
|
|
211
216
|
## Prompt and Tooling Behavior
|
|
212
217
|
|
|
@@ -217,6 +222,7 @@ The package is optimized for local models by default:
|
|
|
217
222
|
- prompts prefer `read` for source inspection
|
|
218
223
|
- shell is intended for `git`, tests, and narrow diagnostics
|
|
219
224
|
- SDK transport carries forward oversized shell-read warnings and loop/timeout guards
|
|
225
|
+
- repeated same-file loop failures are remembered across iterations and escalate the next edit strategy
|
|
220
226
|
- the supervisor emits large-file/spec warnings when touched files are getting risky
|
|
221
227
|
|
|
222
228
|
This is deliberate. Large monolith files, huge e2e specs, and broad TODO items are one of the main causes of local-model drift and retry loops.
|
|
@@ -255,6 +261,8 @@ Useful files during a run:
|
|
|
255
261
|
Latest verification output snapshot.
|
|
256
262
|
- `.pi-last-iteration.json`
|
|
257
263
|
Structured summary of the last completed iteration.
|
|
264
|
+
- `pi-output/failure-artifacts/`
|
|
265
|
+
Compact failure artifacts with command, exit code, changed files, tester summary, and output excerpt.
|
|
258
266
|
- `.pi-state.json`
|
|
259
267
|
Persistent harness state, including in-progress iteration data.
|
|
260
268
|
- `pi.log`
|
|
@@ -264,7 +272,7 @@ Useful files during a run:
|
|
|
264
272
|
- `.pi-runtime/active-run.json`
|
|
265
273
|
- `.pi-runtime/runs/<runId>/...`
|
|
266
274
|
|
|
267
|
-
`pi-harness report` summarizes recent telemetry and surfaces things like terminal reasons
|
|
275
|
+
`pi-harness report` summarizes recent telemetry and surfaces things like terminal reasons, large-file warnings, and recent failure artifacts.
|
|
268
276
|
|
|
269
277
|
`pi-harness run` now also starts a lightweight local web UI for the orchestration flow by default. By default it listens on `127.0.0.1:4317`. Override with `PI_VISUALIZER_HOST` and `PI_VISUALIZER_PORT`. Set `PI_VISUALIZER=0` to disable the embedded web UI for a run.
|
|
270
278
|
|
package/docs/PI_SUPERVISOR.md
CHANGED
|
@@ -80,10 +80,13 @@ Projects typically provide their own `pi.config.json` with fields such as:
|
|
|
80
80
|
- `visualCaptureCommand`
|
|
81
81
|
- `visualFeedbackFile`
|
|
82
82
|
- `testerFeedbackFile`
|
|
83
|
+
- `failureArtifactDir`
|
|
83
84
|
- `models`
|
|
84
85
|
- `piModel`
|
|
85
86
|
- `visualReviewModel`
|
|
86
87
|
- `commitMode`
|
|
88
|
+
- `sameFileLoopBudget`
|
|
89
|
+
- `loopHistoryLimit`
|
|
87
90
|
|
|
88
91
|
Model entries may carry their own OpenAI-compatible endpoint settings, so the PI text loop and the multimodal visual reviewer can point at different backends without changing code.
|
|
89
92
|
|
|
@@ -124,6 +127,10 @@ The default flow keeps commit ownership with the active agent:
|
|
|
124
127
|
2. `tester` should review functionality and, on `PASS`, stage only the task-related files and create the commit directly.
|
|
125
128
|
3. If the working tree is too messy to isolate safely, tester should return `VERDICT: BLOCKED` instead of guessing.
|
|
126
129
|
|
|
130
|
+
If tester returns `PASS` but leaves a dirty tree without creating the commit, the harness now treats that as a protocol error and automatically falls back to a commit-plan follow-up instead of stalling the iteration.
|
|
131
|
+
|
|
132
|
+
If tester edits files before finalization, the harness re-runs the configured smoke verification command immediately and records which files tester touched.
|
|
133
|
+
|
|
127
134
|
If a repo explicitly needs the older harness-managed commit-plan flow, set `commitMode` to `plan`. In that mode, `testerCommit` and parsed commit plans are used as a compatibility path rather than the default.
|
|
128
135
|
|
|
129
136
|
For source inspection, prompts prefer `read` and reserve shell usage for `git`, tests, and narrow diagnostics. Large shell file reads are more likely to truncate under context pressure than focused `read` calls.
|
|
@@ -175,6 +182,7 @@ SDK transport mitigates obvious local loops by watching agent and tool events:
|
|
|
175
182
|
|
|
176
183
|
- repeated identical tool calls are aborted
|
|
177
184
|
- repeated same-path churn is aborted
|
|
185
|
+
- repeated same-file loop targets are persisted in harness state and escalate the next retry strategy
|
|
178
186
|
- a soft `continue` can be sent after inactivity
|
|
179
187
|
- a separate tool-aware watchdog can tolerate long-running `bash` or browser work without treating the turn as dead
|
|
180
188
|
- a hard no-event timeout aborts a wedged turn instead of hanging indefinitely
|
|
@@ -200,4 +208,6 @@ Each step records:
|
|
|
200
208
|
- changed file count
|
|
201
209
|
- verification status
|
|
202
210
|
- retry count
|
|
211
|
+
- artifact path for compact failure diagnostics when available
|
|
212
|
+
- output excerpt for failed verification-style events
|
|
203
213
|
- notes
|
package/package.json
CHANGED
package/src/cli.mjs
CHANGED
|
@@ -36,11 +36,14 @@ function main() {
|
|
|
36
36
|
if (subcommand === 'once' || subcommand === 'run') {
|
|
37
37
|
childArgs.push(subcommand)
|
|
38
38
|
}
|
|
39
|
+
const childStdio = subcommand === 'once' || subcommand === 'run'
|
|
40
|
+
? ['pipe', 'inherit', 'inherit']
|
|
41
|
+
: 'inherit'
|
|
39
42
|
|
|
40
43
|
const child = spawn(process.execPath, childArgs, {
|
|
41
44
|
cwd: process.cwd(),
|
|
42
45
|
env: process.env,
|
|
43
|
-
stdio:
|
|
46
|
+
stdio: childStdio,
|
|
44
47
|
})
|
|
45
48
|
registerOwnedChildProcess(child)
|
|
46
49
|
|
package/src/pi-config.mjs
CHANGED
|
@@ -259,6 +259,7 @@ export function loadConfig(mode = 'once') {
|
|
|
259
259
|
maxTesterFeedbackLines: readInt('PI_MAX_TESTER_FEEDBACK_LINES', file.maxTesterFeedbackLines, 32),
|
|
260
260
|
maxPromptNotesLines: readInt('PI_MAX_PROMPT_NOTES_LINES', file.maxPromptNotesLines, 16),
|
|
261
261
|
maxVerificationExcerptLines: readInt('PI_MAX_VERIFICATION_EXCERPT_LINES', file.maxVerificationExcerptLines, 40),
|
|
262
|
+
maxFailureArtifactLines: readInt('PI_MAX_FAILURE_ARTIFACT_LINES', file.maxFailureArtifactLines, 80),
|
|
262
263
|
largeFileWarningLines: readInt('PI_LARGE_FILE_WARNING_LINES', file.largeFileWarningLines, 500),
|
|
263
264
|
largeSpecWarningLines: readInt('PI_LARGE_SPEC_WARNING_LINES', file.largeSpecWarningLines, 300),
|
|
264
265
|
piTools: readString('PI_TOOLS', file.piTools, 'read,edit,write,find,ls,bash'),
|
|
@@ -280,6 +281,8 @@ export function loadConfig(mode = 'once') {
|
|
|
280
281
|
verificationTimeoutSeconds: readInt('PI_VERIFICATION_TIMEOUT', file.verificationTimeoutSeconds, 300),
|
|
281
282
|
idleRetryLimit: readInt('PI_IDLE_RETRY_LIMIT', file.idleRetryLimit, 1),
|
|
282
283
|
noChangeRetryLimit: readInt('PI_NO_CHANGE_RETRY_LIMIT', file.noChangeRetryLimit, 1),
|
|
284
|
+
sameFileLoopBudget: readInt('PI_SAME_FILE_LOOP_BUDGET', file.sameFileLoopBudget, 2),
|
|
285
|
+
loopHistoryLimit: readInt('PI_LOOP_HISTORY_LIMIT', file.loopHistoryLimit, 25),
|
|
283
286
|
visualFeedbackFile: resolveFromCwd(
|
|
284
287
|
cwd,
|
|
285
288
|
'PI_VISUAL_FEEDBACK_FILE',
|
|
@@ -298,6 +301,12 @@ export function loadConfig(mode = 'once') {
|
|
|
298
301
|
file.testerFeedbackHistoryDir,
|
|
299
302
|
'pi-output/tester-feedback/history'
|
|
300
303
|
),
|
|
304
|
+
failureArtifactDir: resolveFromCwd(
|
|
305
|
+
cwd,
|
|
306
|
+
'PI_FAILURE_ARTIFACT_DIR',
|
|
307
|
+
file.failureArtifactDir,
|
|
308
|
+
'pi-output/failure-artifacts'
|
|
309
|
+
),
|
|
301
310
|
visualReviewHistoryDir: resolveFromCwd(
|
|
302
311
|
cwd,
|
|
303
312
|
'PI_VISUAL_REVIEW_HISTORY_DIR',
|
package/src/pi-prompts.mjs
CHANGED
|
@@ -54,6 +54,16 @@ function formatLargeFileRiskHint(warnings) {
|
|
|
54
54
|
return `\nLarge file risk in touched files:\n${lines}\nPrefer helper extraction, smaller scoped edits, or test splitting over broad in-place edits.\n`
|
|
55
55
|
}
|
|
56
56
|
|
|
57
|
+
function formatLoopRecoveryHint(hints) {
|
|
58
|
+
const list = Array.isArray(hints) ? hints.filter(Boolean) : []
|
|
59
|
+
if (list.length === 0) {
|
|
60
|
+
return ''
|
|
61
|
+
}
|
|
62
|
+
|
|
63
|
+
const lines = list.slice(0, 3).map((hint) => `- ${hint}`).join('\n')
|
|
64
|
+
return `\nRecent loop-recovery constraints:\n${lines}\n`
|
|
65
|
+
}
|
|
66
|
+
|
|
57
67
|
function displayPath(config, filePath) {
|
|
58
68
|
const relativePath = path.relative(config.cwd, filePath)
|
|
59
69
|
if (
|
|
@@ -190,11 +200,13 @@ export function buildMainPrompt(config, options = {}) {
|
|
|
190
200
|
config.developerInstructionsFile,
|
|
191
201
|
config.usingBundledDeveloperInstructions,
|
|
192
202
|
)
|
|
203
|
+
const loopRecoveryHint = formatLoopRecoveryHint(options.loopRecoveryHints)
|
|
193
204
|
|
|
194
205
|
if (!config.usingBundledDeveloperInstructions) {
|
|
195
206
|
return `Read ${taskFile} and ${instructionsFile}.
|
|
196
207
|
${authorityLine}${visualFeedbackSection}
|
|
197
208
|
${testerFeedbackSection}
|
|
209
|
+
${loopRecoveryHint}
|
|
198
210
|
|
|
199
211
|
Work only on the current phase.
|
|
200
212
|
Select the first unchecked actionable checkbox in phase order.
|
|
@@ -220,6 +232,7 @@ Before stopping:
|
|
|
220
232
|
return `Read ${taskFile} and ${instructionsFile}.
|
|
221
233
|
${authorityLine}${visualFeedbackSection}
|
|
222
234
|
${testerFeedbackSection}
|
|
235
|
+
${loopRecoveryHint}
|
|
223
236
|
|
|
224
237
|
Do one current-phase unchecked task.
|
|
225
238
|
|
|
@@ -254,12 +267,14 @@ export function buildFixPrompt(config, recentVerificationOutput, options = {}) {
|
|
|
254
267
|
)
|
|
255
268
|
const findings = clampLines(recentVerificationOutput, configMaxLines(config, 'maxVerificationExcerptLines', 40))
|
|
256
269
|
const largeFileRiskHint = formatLargeFileRiskHint(options.largeFileWarnings)
|
|
270
|
+
const loopRecoveryHint = formatLoopRecoveryHint(options.loopRecoveryHints)
|
|
257
271
|
|
|
258
272
|
if (!config.usingBundledDeveloperInstructions) {
|
|
259
273
|
return `Read ${taskFile} and ${instructionsFile}.
|
|
260
274
|
${authorityLine}${visualFeedbackSection}
|
|
261
275
|
${testerFeedbackSection}
|
|
262
276
|
${largeFileRiskHint}
|
|
277
|
+
${loopRecoveryHint}
|
|
263
278
|
|
|
264
279
|
The tester step found a real problem in the current implementation. Fix only the product behavior related to the current phase and current task.
|
|
265
280
|
|
|
@@ -286,6 +301,7 @@ Before stopping:
|
|
|
286
301
|
${authorityLine}${visualFeedbackSection}
|
|
287
302
|
${testerFeedbackSection}
|
|
288
303
|
${largeFileRiskHint}
|
|
304
|
+
${loopRecoveryHint}
|
|
289
305
|
|
|
290
306
|
The tester step found a real problem in the current implementation. Fix only the product behavior related to the current phase and current task.
|
|
291
307
|
|
|
@@ -319,6 +335,7 @@ export function buildSteeringPrompt(config, reason, options = {}) {
|
|
|
319
335
|
config.usingBundledDeveloperInstructions,
|
|
320
336
|
)
|
|
321
337
|
const largeFileRiskHint = formatLargeFileRiskHint(options.largeFileWarnings)
|
|
338
|
+
const loopRecoveryHint = formatLoopRecoveryHint(options.loopRecoveryHints)
|
|
322
339
|
|
|
323
340
|
if (!config.usingBundledDeveloperInstructions) {
|
|
324
341
|
return `Continue from the current repo state.
|
|
@@ -326,6 +343,7 @@ Read ${taskFile} and ${instructionsFile}.
|
|
|
326
343
|
${authorityLine}${visualFeedbackSection}
|
|
327
344
|
${testerFeedbackSection}
|
|
328
345
|
${largeFileRiskHint}
|
|
346
|
+
${loopRecoveryHint}
|
|
329
347
|
|
|
330
348
|
Reason for this follow-up: ${reason}
|
|
331
349
|
|
|
@@ -346,6 +364,7 @@ Read ${taskFile} and ${instructionsFile}.
|
|
|
346
364
|
${authorityLine}${visualFeedbackSection}
|
|
347
365
|
${testerFeedbackSection}
|
|
348
366
|
${largeFileRiskHint}
|
|
367
|
+
${loopRecoveryHint}
|
|
349
368
|
|
|
350
369
|
Reason for this follow-up: ${reason}
|
|
351
370
|
|
package/src/pi-repo.mjs
CHANGED
|
@@ -57,6 +57,7 @@ export async function readState(stateFile) {
|
|
|
57
57
|
lastStatus: '',
|
|
58
58
|
lastVerificationStatus: '',
|
|
59
59
|
lastVisualStatus: '',
|
|
60
|
+
loopHistory: {},
|
|
60
61
|
lastRunAt: '',
|
|
61
62
|
runId: '',
|
|
62
63
|
inProgress: null,
|
|
@@ -75,6 +76,7 @@ export async function readState(stateFile) {
|
|
|
75
76
|
lastStatus: '',
|
|
76
77
|
lastVerificationStatus: '',
|
|
77
78
|
lastVisualStatus: '',
|
|
79
|
+
loopHistory: {},
|
|
78
80
|
lastRunAt: '',
|
|
79
81
|
runId: '',
|
|
80
82
|
inProgress: null,
|
|
@@ -282,7 +284,8 @@ export function watchParentProcess(onParentExit, options = {}) {
|
|
|
282
284
|
}
|
|
283
285
|
|
|
284
286
|
const currentParentPid = normalizePid(process.ppid)
|
|
285
|
-
|
|
287
|
+
const parentStillRunning = isProcessRunning(expectedParentPid)
|
|
288
|
+
if (currentParentPid === expectedParentPid && currentParentPid > 1 && parentStillRunning) {
|
|
286
289
|
return
|
|
287
290
|
}
|
|
288
291
|
|
|
@@ -483,7 +486,7 @@ function countLines(text) {
|
|
|
483
486
|
return normalized.split('\n').length
|
|
484
487
|
}
|
|
485
488
|
|
|
486
|
-
function isSpecLikeFile(filePath) {
|
|
489
|
+
export function isSpecLikeFile(filePath) {
|
|
487
490
|
const normalized = String(filePath ?? '').replaceAll('\\', '/')
|
|
488
491
|
return /(^|\/)(e2e|test|tests|spec|specs)\//.test(normalized)
|
|
489
492
|
|| /\.(spec|test)\.[cm]?[jt]sx?$/.test(normalized)
|
package/src/pi-report.mjs
CHANGED
|
@@ -46,6 +46,21 @@ async function main() {
|
|
|
46
46
|
}
|
|
47
47
|
}
|
|
48
48
|
|
|
49
|
+
const failureArtifacts = recent
|
|
50
|
+
.filter((event) => String(event.artifactPath ?? '').trim() !== '')
|
|
51
|
+
.slice(-5)
|
|
52
|
+
|
|
53
|
+
if (failureArtifacts.length > 0) {
|
|
54
|
+
console.log('\nFailure artifacts:')
|
|
55
|
+
for (const event of failureArtifacts) {
|
|
56
|
+
const excerpt = String(event.outputExcerpt ?? '').trim()
|
|
57
|
+
console.log(`- iteration ${event.iteration} ${event.kind}: ${event.artifactPath}`)
|
|
58
|
+
if (excerpt !== '') {
|
|
59
|
+
console.log(` excerpt: ${excerpt.split('\n')[0]}`)
|
|
60
|
+
}
|
|
61
|
+
}
|
|
62
|
+
}
|
|
63
|
+
|
|
49
64
|
const last = recent.at(-1)
|
|
50
65
|
if (!last) {
|
|
51
66
|
return
|