@sebastianandreasson/pi-autonomous-agents 0.11.0 → 0.12.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -190,10 +190,13 @@ Common fields in `pi.config.json`:
190
190
  - `testCommand`
191
191
  - `visualReviewEnabled`
192
192
  - `visualCaptureCommand`
193
+ - `failureArtifactDir`
193
194
  - `continueAfterSeconds`
194
195
  - `toolContinueAfterSeconds`
195
196
  - `noEventTimeoutSeconds`
196
197
  - `toolNoEventTimeoutSeconds`
198
+ - `sameFileLoopBudget`
199
+ - `loopHistoryLimit`
197
200
  - `largeFileWarningLines`
198
201
  - `largeSpecWarningLines`
199
202
 
@@ -207,6 +210,8 @@ Key defaults:
207
210
  - `toolContinueAfterSeconds`: `900`
208
211
  - `noEventTimeoutSeconds`: `900`
209
212
  - `toolNoEventTimeoutSeconds`: `1800`
213
+ - `sameFileLoopBudget`: `2`
214
+ - `loopHistoryLimit`: `25`
210
215
 
211
216
  ## Prompt and Tooling Behavior
212
217
 
@@ -217,6 +222,7 @@ The package is optimized for local models by default:
217
222
  - prompts prefer `read` for source inspection
218
223
  - shell is intended for `git`, tests, and narrow diagnostics
219
224
  - SDK transport carries forward oversized shell-read warnings and loop/timeout guards
225
+ - repeated same-file loop failures are remembered across iterations and escalate the next edit strategy
220
226
  - the supervisor emits large-file/spec warnings when touched files are getting risky
221
227
 
222
228
  This is deliberate. Large monolith files, huge e2e specs, and broad TODO items are among the main causes of local-model drift and retry loops.
@@ -255,6 +261,8 @@ Useful files during a run:
255
261
  Latest verification output snapshot.
256
262
  - `.pi-last-iteration.json`
257
263
  Structured summary of the last completed iteration.
264
+ - `pi-output/failure-artifacts/`
265
+ Compact failure artifacts with command, exit code, changed files, tester summary, and output excerpt.
258
266
  - `.pi-state.json`
259
267
  Persistent harness state, including in-progress iteration data.
260
268
  - `pi.log`
@@ -264,7 +272,7 @@ Useful files during a run:
264
272
  - `.pi-runtime/active-run.json`
265
273
  - `.pi-runtime/runs/<runId>/...`
266
274
 
267
- `pi-harness report` summarizes recent telemetry and surfaces things like terminal reasons and large-file warnings.
275
+ `pi-harness report` summarizes recent telemetry and surfaces things like terminal reasons, large-file warnings, and recent failure artifacts.
268
276
 
269
277
  `pi-harness run` now also starts a lightweight local web UI for the orchestration flow by default. It listens on `127.0.0.1:4317`; override with `PI_VISUALIZER_HOST` and `PI_VISUALIZER_PORT`. Set `PI_VISUALIZER=0` to disable the embedded web UI for a run.
270
278
 
@@ -80,10 +80,13 @@ Projects typically provide their own `pi.config.json` with fields such as:
80
80
  - `visualCaptureCommand`
81
81
  - `visualFeedbackFile`
82
82
  - `testerFeedbackFile`
83
+ - `failureArtifactDir`
83
84
  - `models`
84
85
  - `piModel`
85
86
  - `visualReviewModel`
86
87
  - `commitMode`
88
+ - `sameFileLoopBudget`
89
+ - `loopHistoryLimit`
87
90
 
88
91
  Model entries may carry their own OpenAI-compatible endpoint settings, so the PI text loop and the multimodal visual reviewer can point at different backends without changing code.
89
92
 
@@ -124,6 +127,10 @@ The default flow keeps commit ownership with the active agent:
124
127
  2. `tester` should review functionality and, on `PASS`, stage only the task-related files and create the commit directly.
125
128
  3. If the working tree is too messy to isolate safely, tester should return `VERDICT: BLOCKED` instead of guessing.
126
129
 
130
+ If tester returns `PASS` but leaves a dirty tree without creating the commit, the harness now treats that as a protocol error and automatically falls back to a commit-plan follow-up instead of stalling the iteration.
131
+
132
+ If tester edits files before finalization, the harness re-runs the configured smoke verification command immediately and records which files tester touched.
133
+
127
134
  If a repo explicitly needs the older harness-managed commit-plan flow, set `commitMode` to `plan`. In that mode, `testerCommit` and parsed commit plans are used as a compatibility path rather than the default.
128
135
 
129
136
  For source inspection, prompts prefer `read` and reserve shell usage for `git`, tests, and narrow diagnostics. Large shell file reads are more likely to truncate under context pressure than focused `read` calls.
@@ -175,6 +182,7 @@ SDK transport mitigates obvious local loops by watching agent and tool events:
175
182
 
176
183
  - repeated identical tool calls are aborted
177
184
  - repeated same-path churn is aborted
185
+ - repeated same-file loop targets are persisted in harness state and escalate the next retry strategy
178
186
  - a soft `continue` can be sent after inactivity
179
187
  - a separate tool-aware watchdog can tolerate long-running `bash` or browser work without treating the turn as dead
180
188
  - a hard no-event timeout aborts a wedged turn instead of hanging indefinitely
@@ -200,4 +208,6 @@ Each step records:
200
208
  - changed file count
201
209
  - verification status
202
210
  - retry count
211
+ - artifact path for compact failure diagnostics when available
212
+ - output excerpt for failed verification-style events
203
213
  - notes
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "@sebastianandreasson/pi-autonomous-agents",
3
3
  "private": false,
4
- "version": "0.11.0",
4
+ "version": "0.12.1",
5
5
  "type": "module",
6
6
  "description": "Portable unattended PI harness for developer/tester/visual-review loops.",
7
7
  "license": "MIT",
package/src/cli.mjs CHANGED
@@ -36,11 +36,14 @@ function main() {
36
36
  if (subcommand === 'once' || subcommand === 'run') {
37
37
  childArgs.push(subcommand)
38
38
  }
39
+ const childStdio = subcommand === 'once' || subcommand === 'run'
40
+ ? ['pipe', 'inherit', 'inherit']
41
+ : 'inherit'
39
42
 
40
43
  const child = spawn(process.execPath, childArgs, {
41
44
  cwd: process.cwd(),
42
45
  env: process.env,
43
- stdio: 'inherit',
46
+ stdio: childStdio,
44
47
  })
45
48
  registerOwnedChildProcess(child)
46
49
 
package/src/pi-config.mjs CHANGED
@@ -259,6 +259,7 @@ export function loadConfig(mode = 'once') {
259
259
  maxTesterFeedbackLines: readInt('PI_MAX_TESTER_FEEDBACK_LINES', file.maxTesterFeedbackLines, 32),
260
260
  maxPromptNotesLines: readInt('PI_MAX_PROMPT_NOTES_LINES', file.maxPromptNotesLines, 16),
261
261
  maxVerificationExcerptLines: readInt('PI_MAX_VERIFICATION_EXCERPT_LINES', file.maxVerificationExcerptLines, 40),
262
+ maxFailureArtifactLines: readInt('PI_MAX_FAILURE_ARTIFACT_LINES', file.maxFailureArtifactLines, 80),
262
263
  largeFileWarningLines: readInt('PI_LARGE_FILE_WARNING_LINES', file.largeFileWarningLines, 500),
263
264
  largeSpecWarningLines: readInt('PI_LARGE_SPEC_WARNING_LINES', file.largeSpecWarningLines, 300),
264
265
  piTools: readString('PI_TOOLS', file.piTools, 'read,edit,write,find,ls,bash'),
@@ -280,6 +281,8 @@ export function loadConfig(mode = 'once') {
280
281
  verificationTimeoutSeconds: readInt('PI_VERIFICATION_TIMEOUT', file.verificationTimeoutSeconds, 300),
281
282
  idleRetryLimit: readInt('PI_IDLE_RETRY_LIMIT', file.idleRetryLimit, 1),
282
283
  noChangeRetryLimit: readInt('PI_NO_CHANGE_RETRY_LIMIT', file.noChangeRetryLimit, 1),
284
+ sameFileLoopBudget: readInt('PI_SAME_FILE_LOOP_BUDGET', file.sameFileLoopBudget, 2),
285
+ loopHistoryLimit: readInt('PI_LOOP_HISTORY_LIMIT', file.loopHistoryLimit, 25),
283
286
  visualFeedbackFile: resolveFromCwd(
284
287
  cwd,
285
288
  'PI_VISUAL_FEEDBACK_FILE',
@@ -298,6 +301,12 @@ export function loadConfig(mode = 'once') {
298
301
  file.testerFeedbackHistoryDir,
299
302
  'pi-output/tester-feedback/history'
300
303
  ),
304
+ failureArtifactDir: resolveFromCwd(
305
+ cwd,
306
+ 'PI_FAILURE_ARTIFACT_DIR',
307
+ file.failureArtifactDir,
308
+ 'pi-output/failure-artifacts'
309
+ ),
301
310
  visualReviewHistoryDir: resolveFromCwd(
302
311
  cwd,
303
312
  'PI_VISUAL_REVIEW_HISTORY_DIR',
@@ -54,6 +54,16 @@ function formatLargeFileRiskHint(warnings) {
54
54
  return `\nLarge file risk in touched files:\n${lines}\nPrefer helper extraction, smaller scoped edits, or test splitting over broad in-place edits.\n`
55
55
  }
56
56
 
57
+ function formatLoopRecoveryHint(hints) {
58
+ const list = Array.isArray(hints) ? hints.filter(Boolean) : []
59
+ if (list.length === 0) {
60
+ return ''
61
+ }
62
+
63
+ const lines = list.slice(0, 3).map((hint) => `- ${hint}`).join('\n')
64
+ return `\nRecent loop-recovery constraints:\n${lines}\n`
65
+ }
66
+
57
67
  function displayPath(config, filePath) {
58
68
  const relativePath = path.relative(config.cwd, filePath)
59
69
  if (
@@ -190,11 +200,13 @@ export function buildMainPrompt(config, options = {}) {
190
200
  config.developerInstructionsFile,
191
201
  config.usingBundledDeveloperInstructions,
192
202
  )
203
+ const loopRecoveryHint = formatLoopRecoveryHint(options.loopRecoveryHints)
193
204
 
194
205
  if (!config.usingBundledDeveloperInstructions) {
195
206
  return `Read ${taskFile} and ${instructionsFile}.
196
207
  ${authorityLine}${visualFeedbackSection}
197
208
  ${testerFeedbackSection}
209
+ ${loopRecoveryHint}
198
210
 
199
211
  Work only on the current phase.
200
212
  Select the first unchecked actionable checkbox in phase order.
@@ -220,6 +232,7 @@ Before stopping:
220
232
  return `Read ${taskFile} and ${instructionsFile}.
221
233
  ${authorityLine}${visualFeedbackSection}
222
234
  ${testerFeedbackSection}
235
+ ${loopRecoveryHint}
223
236
 
224
237
  Do one current-phase unchecked task.
225
238
 
@@ -254,12 +267,14 @@ export function buildFixPrompt(config, recentVerificationOutput, options = {}) {
254
267
  )
255
268
  const findings = clampLines(recentVerificationOutput, configMaxLines(config, 'maxVerificationExcerptLines', 40))
256
269
  const largeFileRiskHint = formatLargeFileRiskHint(options.largeFileWarnings)
270
+ const loopRecoveryHint = formatLoopRecoveryHint(options.loopRecoveryHints)
257
271
 
258
272
  if (!config.usingBundledDeveloperInstructions) {
259
273
  return `Read ${taskFile} and ${instructionsFile}.
260
274
  ${authorityLine}${visualFeedbackSection}
261
275
  ${testerFeedbackSection}
262
276
  ${largeFileRiskHint}
277
+ ${loopRecoveryHint}
263
278
 
264
279
  The tester step found a real problem in the current implementation. Fix only the product behavior related to the current phase and current task.
265
280
 
@@ -286,6 +301,7 @@ Before stopping:
286
301
  ${authorityLine}${visualFeedbackSection}
287
302
  ${testerFeedbackSection}
288
303
  ${largeFileRiskHint}
304
+ ${loopRecoveryHint}
289
305
 
290
306
  The tester step found a real problem in the current implementation. Fix only the product behavior related to the current phase and current task.
291
307
 
@@ -319,6 +335,7 @@ export function buildSteeringPrompt(config, reason, options = {}) {
319
335
  config.usingBundledDeveloperInstructions,
320
336
  )
321
337
  const largeFileRiskHint = formatLargeFileRiskHint(options.largeFileWarnings)
338
+ const loopRecoveryHint = formatLoopRecoveryHint(options.loopRecoveryHints)
322
339
 
323
340
  if (!config.usingBundledDeveloperInstructions) {
324
341
  return `Continue from the current repo state.
@@ -326,6 +343,7 @@ Read ${taskFile} and ${instructionsFile}.
326
343
  ${authorityLine}${visualFeedbackSection}
327
344
  ${testerFeedbackSection}
328
345
  ${largeFileRiskHint}
346
+ ${loopRecoveryHint}
329
347
 
330
348
  Reason for this follow-up: ${reason}
331
349
 
@@ -346,6 +364,7 @@ Read ${taskFile} and ${instructionsFile}.
346
364
  ${authorityLine}${visualFeedbackSection}
347
365
  ${testerFeedbackSection}
348
366
  ${largeFileRiskHint}
367
+ ${loopRecoveryHint}
349
368
 
350
369
  Reason for this follow-up: ${reason}
351
370
 
package/src/pi-repo.mjs CHANGED
@@ -57,6 +57,7 @@ export async function readState(stateFile) {
57
57
  lastStatus: '',
58
58
  lastVerificationStatus: '',
59
59
  lastVisualStatus: '',
60
+ loopHistory: {},
60
61
  lastRunAt: '',
61
62
  runId: '',
62
63
  inProgress: null,
@@ -75,6 +76,7 @@ export async function readState(stateFile) {
75
76
  lastStatus: '',
76
77
  lastVerificationStatus: '',
77
78
  lastVisualStatus: '',
79
+ loopHistory: {},
78
80
  lastRunAt: '',
79
81
  runId: '',
80
82
  inProgress: null,
@@ -282,7 +284,8 @@ export function watchParentProcess(onParentExit, options = {}) {
282
284
  }
283
285
 
284
286
  const currentParentPid = normalizePid(process.ppid)
285
- if (currentParentPid === expectedParentPid && currentParentPid > 1) {
287
+ const parentStillRunning = isProcessRunning(expectedParentPid)
288
+ if (currentParentPid === expectedParentPid && currentParentPid > 1 && parentStillRunning) {
286
289
  return
287
290
  }
288
291
 
@@ -483,7 +486,7 @@ function countLines(text) {
483
486
  return normalized.split('\n').length
484
487
  }
485
488
 
486
- function isSpecLikeFile(filePath) {
489
+ export function isSpecLikeFile(filePath) {
487
490
  const normalized = String(filePath ?? '').replaceAll('\\', '/')
488
491
  return /(^|\/)(e2e|test|tests|spec|specs)\//.test(normalized)
489
492
  || /\.(spec|test)\.[cm]?[jt]sx?$/.test(normalized)
package/src/pi-report.mjs CHANGED
@@ -46,6 +46,21 @@ async function main() {
46
46
  }
47
47
  }
48
48
 
49
+ const failureArtifacts = recent
50
+ .filter((event) => String(event.artifactPath ?? '').trim() !== '')
51
+ .slice(-5)
52
+
53
+ if (failureArtifacts.length > 0) {
54
+ console.log('\nFailure artifacts:')
55
+ for (const event of failureArtifacts) {
56
+ const excerpt = String(event.outputExcerpt ?? '').trim()
57
+ console.log(`- iteration ${event.iteration} ${event.kind}: ${event.artifactPath}`)
58
+ if (excerpt !== '') {
59
+ console.log(` excerpt: ${excerpt.split('\n')[0]}`)
60
+ }
61
+ }
62
+ }
63
+
49
64
  const last = recent.at(-1)
50
65
  if (!last) {
51
66
  return