@probelabs/visor 0.1.181 → 0.1.182-ee

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (144) hide show
  1. package/defaults/code-talk.yaml +80 -14
  2. package/defaults/engineer.yaml +33 -15
  3. package/defaults/skills/code-explorer.yaml +5 -0
  4. package/dist/agent-protocol/a2a-frontend.d.ts +10 -0
  5. package/dist/agent-protocol/a2a-frontend.d.ts.map +1 -1
  6. package/dist/agent-protocol/task-evaluator.d.ts +52 -0
  7. package/dist/agent-protocol/task-evaluator.d.ts.map +1 -0
  8. package/dist/agent-protocol/task-store.d.ts +5 -3
  9. package/dist/agent-protocol/task-store.d.ts.map +1 -1
  10. package/dist/agent-protocol/tasks-cli-handler.d.ts.map +1 -1
  11. package/dist/agent-protocol/tasks-tui.d.ts +34 -0
  12. package/dist/agent-protocol/tasks-tui.d.ts.map +1 -0
  13. package/dist/agent-protocol/trace-serializer.d.ts +90 -0
  14. package/dist/agent-protocol/trace-serializer.d.ts.map +1 -0
  15. package/dist/agent-protocol/track-execution.d.ts +2 -0
  16. package/dist/agent-protocol/track-execution.d.ts.map +1 -1
  17. package/dist/cli-main.d.ts.map +1 -1
  18. package/dist/defaults/code-talk.yaml +80 -14
  19. package/dist/defaults/engineer.yaml +33 -15
  20. package/dist/defaults/skills/code-explorer.yaml +5 -0
  21. package/dist/docs/commands.md +57 -14
  22. package/dist/docs/configuration.md +2 -0
  23. package/dist/docs/guides/graceful-restart.md +178 -0
  24. package/dist/docs/observability.md +69 -0
  25. package/dist/docs/production-deployment.md +17 -0
  26. package/dist/email/polling-runner.d.ts +4 -0
  27. package/dist/email/polling-runner.d.ts.map +1 -1
  28. package/dist/generated/config-schema.d.ts +70 -6
  29. package/dist/generated/config-schema.d.ts.map +1 -1
  30. package/dist/generated/config-schema.json +36 -6
  31. package/dist/index.js +7420 -1483
  32. package/dist/providers/mcp-custom-sse-server.d.ts +4 -0
  33. package/dist/providers/mcp-custom-sse-server.d.ts.map +1 -1
  34. package/dist/runners/graceful-restart.d.ts +46 -0
  35. package/dist/runners/graceful-restart.d.ts.map +1 -0
  36. package/dist/runners/mcp-server-runner.d.ts +12 -0
  37. package/dist/runners/mcp-server-runner.d.ts.map +1 -1
  38. package/dist/runners/runner-factory.d.ts.map +1 -1
  39. package/dist/runners/runner-host.d.ts +12 -0
  40. package/dist/runners/runner-host.d.ts.map +1 -1
  41. package/dist/runners/runner.d.ts +12 -0
  42. package/dist/runners/runner.d.ts.map +1 -1
  43. package/dist/sdk/{a2a-frontend-IWOUJOIZ.mjs → a2a-frontend-4LP3MLTS.mjs} +47 -5
  44. package/dist/sdk/a2a-frontend-4LP3MLTS.mjs.map +1 -0
  45. package/dist/sdk/{a2a-frontend-BDACLGMA.mjs → a2a-frontend-MU5EO2HZ.mjs} +35 -1
  46. package/dist/sdk/a2a-frontend-MU5EO2HZ.mjs.map +1 -0
  47. package/dist/sdk/{check-provider-registry-4YKTEDKF.mjs → check-provider-registry-I4BCWKRU.mjs} +7 -7
  48. package/dist/sdk/{check-provider-registry-4YFVBGYU.mjs → check-provider-registry-RRWCXSTG.mjs} +3 -3
  49. package/dist/sdk/{chunk-VMVIM4JB.mjs → chunk-4I3TJ7UJ.mjs} +37 -7
  50. package/dist/sdk/chunk-4I3TJ7UJ.mjs.map +1 -0
  51. package/dist/sdk/{chunk-7YZSSO4X.mjs → chunk-6DPPP7LD.mjs} +10 -10
  52. package/dist/sdk/chunk-7ERVRLDV.mjs +296 -0
  53. package/dist/sdk/chunk-7ERVRLDV.mjs.map +1 -0
  54. package/dist/sdk/{chunk-VXC2XNQJ.mjs → chunk-ANUT54HW.mjs} +3 -3
  55. package/dist/sdk/{chunk-J73GEFPT.mjs → chunk-DHETLQIX.mjs} +2 -2
  56. package/dist/sdk/{chunk-DGIH6EX3.mjs → chunk-QXT47ZHR.mjs} +151 -281
  57. package/dist/sdk/chunk-QXT47ZHR.mjs.map +1 -0
  58. package/dist/sdk/{chunk-4DVP6KVC.mjs → chunk-TQQNSHQV.mjs} +72 -31
  59. package/dist/sdk/chunk-TQQNSHQV.mjs.map +1 -0
  60. package/dist/sdk/chunk-ZOF5QT6U.mjs +5943 -0
  61. package/dist/sdk/chunk-ZOF5QT6U.mjs.map +1 -0
  62. package/dist/sdk/{config-TSA5FUOM.mjs → config-2STD74CJ.mjs} +2 -2
  63. package/dist/sdk/config-JE4HKTWW.mjs +16 -0
  64. package/dist/sdk/{failure-condition-evaluator-HTPB5FYW.mjs → failure-condition-evaluator-5DZYMCGW.mjs} +4 -4
  65. package/dist/sdk/{github-frontend-3SDFCCKI.mjs → github-frontend-L3F5JXPJ.mjs} +4 -4
  66. package/dist/sdk/{host-QE4L7UXE.mjs → host-QBJ7TOWG.mjs} +3 -3
  67. package/dist/sdk/{host-CVH2CSHM.mjs → host-X5ZZCEWN.mjs} +2 -2
  68. package/dist/sdk/knex-store-QCEW4I4R.mjs +527 -0
  69. package/dist/sdk/knex-store-QCEW4I4R.mjs.map +1 -0
  70. package/dist/sdk/loader-ZNKKJEZ3.mjs +89 -0
  71. package/dist/sdk/loader-ZNKKJEZ3.mjs.map +1 -0
  72. package/dist/sdk/opa-policy-engine-QCSSIMUF.mjs +655 -0
  73. package/dist/sdk/opa-policy-engine-QCSSIMUF.mjs.map +1 -0
  74. package/dist/sdk/{routing-YVMTKFDZ.mjs → routing-CVQT4KHX.mjs} +5 -5
  75. package/dist/sdk/{schedule-tool-Z5VG67JK.mjs → schedule-tool-AECLFHSY.mjs} +7 -7
  76. package/dist/sdk/{schedule-tool-ZMX3Y7LF.mjs → schedule-tool-Z6QYL2B3.mjs} +3 -3
  77. package/dist/sdk/{schedule-tool-handler-PCERK6ZZ.mjs → schedule-tool-handler-6QLZRTQA.mjs} +7 -7
  78. package/dist/sdk/{schedule-tool-handler-N7UNABOA.mjs → schedule-tool-handler-J4NUETJ6.mjs} +3 -3
  79. package/dist/sdk/sdk.d.mts +33 -0
  80. package/dist/sdk/sdk.d.ts +33 -0
  81. package/dist/sdk/sdk.js +3545 -455
  82. package/dist/sdk/sdk.js.map +1 -1
  83. package/dist/sdk/sdk.mjs +6 -6
  84. package/dist/sdk/task-evaluator-HLNXKKVV.mjs +1278 -0
  85. package/dist/sdk/task-evaluator-HLNXKKVV.mjs.map +1 -0
  86. package/dist/sdk/{trace-helpers-KXDOJWBL.mjs → trace-helpers-WJXYVV4S.mjs} +3 -3
  87. package/dist/sdk/trace-reader-ZY77OFNM.mjs +266 -0
  88. package/dist/sdk/trace-reader-ZY77OFNM.mjs.map +1 -0
  89. package/dist/sdk/track-execution-MKIQXP2C.mjs +136 -0
  90. package/dist/sdk/track-execution-MKIQXP2C.mjs.map +1 -0
  91. package/dist/sdk/validator-XTZJZZJH.mjs +134 -0
  92. package/dist/sdk/validator-XTZJZZJH.mjs.map +1 -0
  93. package/dist/sdk/{workflow-check-provider-NTHC5ZBF.mjs → workflow-check-provider-EXMC6JIS.mjs} +7 -7
  94. package/dist/sdk/{workflow-check-provider-SRIMWKLQ.mjs → workflow-check-provider-VKYGI5GK.mjs} +3 -3
  95. package/dist/slack/socket-runner.d.ts +12 -0
  96. package/dist/slack/socket-runner.d.ts.map +1 -1
  97. package/dist/teams/webhook-runner.d.ts +4 -0
  98. package/dist/teams/webhook-runner.d.ts.map +1 -1
  99. package/dist/telegram/polling-runner.d.ts +2 -0
  100. package/dist/telegram/polling-runner.d.ts.map +1 -1
  101. package/dist/types/config.d.ts +33 -0
  102. package/dist/types/config.d.ts.map +1 -1
  103. package/dist/whatsapp/webhook-runner.d.ts +4 -0
  104. package/dist/whatsapp/webhook-runner.d.ts.map +1 -1
  105. package/package.json +2 -2
  106. package/dist/output/traces/run-2026-03-17T13-58-29-402Z.ndjson +0 -157
  107. package/dist/output/traces/run-2026-03-17T13-59-10-403Z.ndjson +0 -2333
  108. package/dist/sdk/a2a-frontend-BDACLGMA.mjs.map +0 -1
  109. package/dist/sdk/a2a-frontend-IWOUJOIZ.mjs.map +0 -1
  110. package/dist/sdk/check-provider-registry-67ZLGDDQ.mjs +0 -31
  111. package/dist/sdk/chunk-4DVP6KVC.mjs.map +0 -1
  112. package/dist/sdk/chunk-DGIH6EX3.mjs.map +0 -1
  113. package/dist/sdk/chunk-QGBASDYP.mjs +0 -46153
  114. package/dist/sdk/chunk-QGBASDYP.mjs.map +0 -1
  115. package/dist/sdk/chunk-VMVIM4JB.mjs.map +0 -1
  116. package/dist/sdk/host-VBBSLUWG.mjs +0 -87
  117. package/dist/sdk/host-VBBSLUWG.mjs.map +0 -1
  118. package/dist/sdk/schedule-tool-ADUXTCY7.mjs +0 -37
  119. package/dist/sdk/schedule-tool-handler-QOJVFRB4.mjs +0 -41
  120. package/dist/sdk/workflow-check-provider-CJXW2Z4F.mjs +0 -31
  121. package/dist/sdk/workflow-check-provider-CJXW2Z4F.mjs.map +0 -1
  122. package/dist/sdk/workflow-check-provider-NTHC5ZBF.mjs.map +0 -1
  123. package/dist/sdk/workflow-check-provider-SRIMWKLQ.mjs.map +0 -1
  124. package/dist/traces/run-2026-03-17T13-58-29-402Z.ndjson +0 -157
  125. package/dist/traces/run-2026-03-17T13-59-10-403Z.ndjson +0 -2333
  126. /package/dist/sdk/{check-provider-registry-4YFVBGYU.mjs.map → check-provider-registry-I4BCWKRU.mjs.map} +0 -0
  127. /package/dist/sdk/{check-provider-registry-4YKTEDKF.mjs.map → check-provider-registry-RRWCXSTG.mjs.map} +0 -0
  128. /package/dist/sdk/{chunk-7YZSSO4X.mjs.map → chunk-6DPPP7LD.mjs.map} +0 -0
  129. /package/dist/sdk/{chunk-VXC2XNQJ.mjs.map → chunk-ANUT54HW.mjs.map} +0 -0
  130. /package/dist/sdk/{chunk-J73GEFPT.mjs.map → chunk-DHETLQIX.mjs.map} +0 -0
  131. /package/dist/sdk/{check-provider-registry-67ZLGDDQ.mjs.map → config-2STD74CJ.mjs.map} +0 -0
  132. /package/dist/sdk/{config-TSA5FUOM.mjs.map → config-JE4HKTWW.mjs.map} +0 -0
  133. /package/dist/sdk/{failure-condition-evaluator-HTPB5FYW.mjs.map → failure-condition-evaluator-5DZYMCGW.mjs.map} +0 -0
  134. /package/dist/sdk/{github-frontend-3SDFCCKI.mjs.map → github-frontend-L3F5JXPJ.mjs.map} +0 -0
  135. /package/dist/sdk/{host-CVH2CSHM.mjs.map → host-QBJ7TOWG.mjs.map} +0 -0
  136. /package/dist/sdk/{host-QE4L7UXE.mjs.map → host-X5ZZCEWN.mjs.map} +0 -0
  137. /package/dist/sdk/{routing-YVMTKFDZ.mjs.map → routing-CVQT4KHX.mjs.map} +0 -0
  138. /package/dist/sdk/{schedule-tool-ADUXTCY7.mjs.map → schedule-tool-AECLFHSY.mjs.map} +0 -0
  139. /package/dist/sdk/{schedule-tool-Z5VG67JK.mjs.map → schedule-tool-Z6QYL2B3.mjs.map} +0 -0
  140. /package/dist/sdk/{schedule-tool-ZMX3Y7LF.mjs.map → schedule-tool-handler-6QLZRTQA.mjs.map} +0 -0
  141. /package/dist/sdk/{schedule-tool-handler-N7UNABOA.mjs.map → schedule-tool-handler-J4NUETJ6.mjs.map} +0 -0
  142. /package/dist/sdk/{schedule-tool-handler-PCERK6ZZ.mjs.map → trace-helpers-WJXYVV4S.mjs.map} +0 -0
  143. /package/dist/sdk/{schedule-tool-handler-QOJVFRB4.mjs.map → workflow-check-provider-EXMC6JIS.mjs.map} +0 -0
  144. /package/dist/sdk/{trace-helpers-KXDOJWBL.mjs.map → workflow-check-provider-VKYGI5GK.mjs.map} +0 -0
@@ -136,18 +136,59 @@ outputs:
136
136
  value_js: |
137
137
  const result = outputs?.['explore-code'];
138
138
  if (result?.answer) return result.answer;
139
- const routeOutput = outputs?.['setup-projects']?.routing_decision;
140
- // Handle proper notes field
141
- const routeNotes = routeOutput?.notes;
142
- if (typeof routeNotes === 'string' && routeNotes.trim().length > 0) {
143
- return { text: routeNotes };
139
+ const resultText = typeof result?.text === 'string' ? result.text.trim() : '';
140
+ if (resultText.length > 0) {
141
+ return { text: resultText };
144
142
  }
145
- // Fallback: if AI returned {text: "..."} instead of proper schema
146
- const routeText = routeOutput?.text;
147
- if (typeof routeText === 'string' && routeText.trim().length > 0) {
148
- return { text: routeText };
143
+ const routeNotes = outputs?.['setup-projects']?.routing_decision?.notes;
144
+ const checkoutProjects = outputs?.['setup-projects']?.checkout_projects;
145
+ if (
146
+ (!Array.isArray(checkoutProjects) || checkoutProjects.length === 0) &&
147
+ typeof routeNotes === 'string' &&
148
+ routeNotes.trim().length > 0
149
+ ) {
150
+ return { text: routeNotes.trim() };
151
+ }
152
+ return { text: 'Code exploration did not produce an answer.' };
153
+
154
+ - name: exploration_status
155
+ description: Outcome of the exploration step
156
+ value_js: |
157
+ const result = outputs?.['explore-code'];
158
+ if (result?.answer?.text) return 'success';
159
+ const resultText = typeof result?.text === 'string' ? result.text.trim() : '';
160
+ if (resultText.length > 0) {
161
+ if (/timed out/i.test(resultText)) return 'timeout';
162
+ return 'failed';
149
163
  }
150
- return null;
164
+ const routeNotes = outputs?.['setup-projects']?.routing_decision?.notes;
165
+ const checkoutProjects = outputs?.['setup-projects']?.checkout_projects;
166
+ if (
167
+ (!Array.isArray(checkoutProjects) || checkoutProjects.length === 0) &&
168
+ typeof routeNotes === 'string' &&
169
+ routeNotes.trim().length > 0
170
+ ) {
171
+ return 'no_projects';
172
+ }
173
+ return 'failed';
174
+
175
+ - name: exploration_error
176
+ description: Timeout or failure detail when exploration did not return a real answer
177
+ value_js: |
178
+ const result = outputs?.['explore-code'];
179
+ if (result?.answer?.text) return '';
180
+ const resultText = typeof result?.text === 'string' ? result.text.trim() : '';
181
+ if (resultText.length > 0) return resultText;
182
+ const routeNotes = outputs?.['setup-projects']?.routing_decision?.notes;
183
+ const checkoutProjects = outputs?.['setup-projects']?.checkout_projects;
184
+ if (
185
+ (!Array.isArray(checkoutProjects) || checkoutProjects.length === 0) &&
186
+ typeof routeNotes === 'string' &&
187
+ routeNotes.trim().length > 0
188
+ ) {
189
+ return routeNotes.trim();
190
+ }
191
+ return 'Code exploration did not produce an answer.';
151
192
 
152
193
  - name: references
153
194
  description: Code/doc references from exploration
@@ -174,11 +215,20 @@ outputs:
174
215
  const result = outputs?.['explore-code'];
175
216
  const confidence = result?.confidence;
176
217
  const reason = result?.confidence_reason;
177
- if (typeof reason === 'string') return reason;
218
+ if (typeof reason === 'string' && reason.trim().length > 0) return reason;
178
219
  if (confidence === 'high') return '';
220
+ const resultText = typeof result?.text === 'string' ? result.text.trim() : '';
221
+ if (resultText.length > 0) return resultText;
179
222
  const routeNotes = outputs?.['setup-projects']?.routing_decision?.notes;
180
- if (typeof routeNotes === 'string' && routeNotes.trim().length > 0) return routeNotes;
181
- return 'No confidence explanation was provided by explore-code.';
223
+ const checkoutProjects = outputs?.['setup-projects']?.checkout_projects;
224
+ if (
225
+ (!Array.isArray(checkoutProjects) || checkoutProjects.length === 0) &&
226
+ typeof routeNotes === 'string' &&
227
+ routeNotes.trim().length > 0
228
+ ) {
229
+ return routeNotes.trim();
230
+ }
231
+ return 'Code exploration did not produce an answer.';
182
232
 
183
233
  - name: projects_explored
184
234
  description: Which project IDs were checked out
@@ -261,7 +311,7 @@ steps:
261
311
  skip_code_context: true
262
312
  enableDelegate: true
263
313
  enableExecutePlan: false
264
- max_iterations: 50
314
+ max_iterations: 100
265
315
  prompt_type: code-explorer
266
316
  allowBash: true
267
317
  bashConfig:
@@ -415,8 +465,17 @@ steps:
415
465
  - Each delegate should answer ONE specific question (not "look at the code")
416
466
  - Run multiple delegates in PARALLEL for different hypotheses or components
417
467
  - Ask delegates to return specific file paths and line numbers
468
+ - Do NOT delegate or re-search the same question twice in one investigation
469
+ - If a delegate returns enough evidence for the current claim, stop and use it
418
470
 
419
471
  Relay complete data from tools — do not summarize or compress tool output.
472
+
473
+ Investigation scope:
474
+ - Stop once you have enough evidence to answer the question accurately
475
+ - If this is an implementation handoff for engineer, optimize for the minimum
476
+ sufficient handoff: repo, branch/ref, target files, relevant tests, and the
477
+ key evidence explaining why those files matter
478
+ - Prefer one search followed by targeted extract over repeated broad searches
420
479
  </instructions>
421
480
 
422
481
  {% if inputs.exploration_prompt %}
@@ -468,6 +527,13 @@ steps:
468
527
  implementation, then consult docs to confirm semantics. When multiple projects
469
528
  are involved, trace data and config flow across them.
470
529
 
530
+ Efficiency rules for this investigation:
531
+ - Reuse evidence already found in earlier tool results
532
+ - If the question is narrow and the relevant files are already identified,
533
+ stop exploring and answer
534
+ - If the next consumer is engineer, avoid broad code archaeology once the
535
+ implementation target and validation path are clear
536
+
471
537
  Synthesize a single answer:
472
538
  - Ground everything in code/docs evidence
473
539
  - End with a "## References" section with clickable GitHub links:
@@ -338,19 +338,25 @@ steps:
338
338
  <delegation>
339
339
  Use the delegate tool for parallel work, plan validation, and build discovery.
340
340
 
341
- MANDATORY FIRST STEP — delegate "Discover build system" for each repository:
341
+ FIRST decide whether delegation is needed.
342
+
343
+ Delegate "Discover build system" ONLY when the exact commands are not already
344
+ available in the provided context, code-explorer output, project metadata, or
345
+ recent tool results.
342
346
  - Check: Makefile, package.json (scripts), Cargo.toml, go.mod, pyproject.toml
343
347
  - Check: CI config (.github/workflows/, .gitlab-ci.yml, Jenkinsfile)
344
348
  - Check: README for build/test/lint instructions
345
349
  - Return the EXACT commands for: build, test, lint/format, and any pre-commit hooks
346
350
  - Example output: "build: make, test: make test, lint: gofmt -l . && golangci-lint run"
347
- Use these commands throughout the session. Do NOT guess — use what the delegate found.
351
+ - Reuse these commands throughout the session. Do NOT rediscover them once known.
348
352
 
349
- MANDATORY BEFORE IMPLEMENTATION — delegate "Plan validation":
353
+ Delegate "Plan validation" ONLY when the task is broad, high-risk, multi-repo,
354
+ or the implementation path is still unclear after reviewing existing context.
350
355
  - Describe: files to change, approach, patterns to follow
351
356
  - Ask the delegate to verify: do these files exist? Are there existing tests?
352
357
  Are there related utilities or patterns to reuse? Any API contracts to respect?
353
- - Wait for the response before writing code
358
+ - Skip this delegate for narrow single-repo changes when code-explorer or direct
359
+ inspection already identified the target files, branch, and validation path.
354
360
 
355
361
  Also delegate for:
356
362
  - Multi-repo changes (one delegate per repo, in parallel)
@@ -361,9 +367,12 @@ steps:
361
367
  - Sequential dependent work (step B needs step A's output)
362
368
  - Simple single-file edits (fewer than 5 iterations)
363
369
  - Git operations (commit, push, PR) — always do these yourself
370
+ - Questions you already delegated once in this session
364
371
 
365
372
  Delegates have fewer iterations and no access to your conversation.
366
373
  Provide all necessary context in the delegate prompt.
374
+ If a delegate returns empty output, times out, or repeats information already
375
+ known, do NOT call the same delegate again. Fall back to direct tools.
367
376
  </delegation>
368
377
 
369
378
  <git-workflow>
@@ -372,12 +381,17 @@ steps:
372
381
 
373
382
  Before your final response, verify:
374
383
  □ Build passes (using exact commands from "Discover build system" delegate)
375
- □ Tests pass (run the full test suite, not just your new tests)
384
+ □ Tests pass (start with the narrowest relevant tests; run broader suites only
385
+ when required by repo policy, when the change is cross-cutting, or when focused
386
+ tests indicate wider impact)
376
387
  □ Lint/format passes (if the project has a linter)
377
388
  □ git add <files>
378
389
  □ git commit -m "descriptive message"
379
390
  □ git push -u origin <branch-name>
380
391
  □ gh pr create (for new PRs) or update existing PR
392
+ - For a new branch: ALWAYS push first, then use `gh pr create --head <branch-name>`
393
+ - If PR creation fails, inspect stderr, fix the missing prerequisite, and retry once
394
+ - Do NOT repeat the same `gh pr create` command after the same error
381
395
  No PR URL = failed task. Report errors honestly, never claim false success.
382
396
 
383
397
  If build/test/lint fails, fix the issue before committing. If you cannot fix it,
@@ -397,19 +411,23 @@ steps:
397
411
  <efficiency>
398
412
  - Use context data directly — don't re-read files or re-run searches for
399
413
  information already provided by code-explorer.
414
+ - If code-explorer already identified the repo, branch, files, tests, or exact
415
+ commands, treat that as the default source of truth unless a tool result proves
416
+ it wrong.
400
417
  - If a project has <setup> commands listed, run them FIRST (in the project's
401
418
  directory) before any other work. These are prerequisites (e.g., `npm install`,
402
419
  `make deps`, database migrations).
403
- - Use tasks to track multi-step work. Create these tasks at minimum:
404
- 1. "Run setup commands" — execute <setup> commands for each project (if any)
405
- 2. "Discover build system" (delegate) — find exact build/test/lint commands
406
- 3. "Plan validation" (delegate) — verify approach before coding
407
- 4. "Implement changes" — the actual code/file changes
408
- 5. "Verify build" — run build, test, and lint commands; fix any failures
409
- 6. "Create pull request" — git branch, add, commit, push, gh pr create
410
- Mark in_progress/completed as you go. Do NOT skip "Verify build".
411
- - If a bash command fails, try a different approach. Don't retry the same
412
- command or get stuck in loops.
420
+ - Use tasks to track real phases of work, not every obvious micro-step.
421
+ - For narrow single-repo changes, keep the task list minimal:
422
+ 1. "Implement changes"
423
+ 2. "Verify build"
424
+ 3. "Create pull request"
425
+ - Add "Run setup commands", "Discover build system", or "Plan validation"
426
+ ONLY when you actually need to perform those steps.
427
+ - Mark in_progress/completed as you go. Do NOT skip "Verify build".
428
+ - If a bash command fails, diagnose the cause before retrying.
429
+ - Do NOT repeat the same logical action after the same error unless you changed
430
+ a prerequisite (for example: push before re-running `gh pr create`).
413
431
  </efficiency>
414
432
  {% assign has_trace = inputs.trace_id | size %}
415
433
  {% assign has_slack_user = inputs.slack_user_id | size %}
@@ -29,6 +29,11 @@ knowledge: |
29
29
  - If confidence "high", trust the answer — do NOT re-call with rephrased question
30
30
  - Only call again for a genuinely DIFFERENT aspect of the codebase
31
31
  - If confidence "medium" or "low", check confidence_reason for what to refine
32
+ - If `exploration_status` is `timeout`, `failed`, or `no_projects`, do NOT re-call
33
+ with a paraphrase of the same question. Report the failure honestly and only retry
34
+ if you can narrow the question or change the scope.
35
+ - If `references` is empty and confidence is low, treat that as "not answered yet",
36
+ not as a usable code answer.
32
37
 
33
38
  ## Usage Instructions
34
39
  1. Call the `code-explorer` tool with the user's question — do NOT try to answer code questions yourself
@@ -148,35 +148,78 @@ visor mcp-server --transport http --config defaults/code-review.yaml \
148
148
 
149
149
  #### `visor tasks`
150
150
 
151
- Monitor and manage A2A agent tasks.
151
+ Monitor, inspect, and evaluate agent tasks. Requires `task_tracking: true` (or `--task-tracking` CLI flag).
152
152
 
153
153
  ```bash
154
154
  visor tasks [command] [options]
155
155
  ```
156
156
 
157
157
  **Subcommands:**
158
- - `list` (default) — List tasks with optional filters
158
+ - `list` (default) — List tasks (interactive TUI in TTY, table otherwise)
159
+ - `show <task-id>` — Show full task details including response and evaluation
160
+ - `trace <task-id>` — Show execution trace tree (YAML-formatted span hierarchy)
161
+ - `evaluate <task-id>` — Evaluate task quality with LLM judge
159
162
  - `stats` — Queue summary statistics
160
163
  - `cancel <task-id>` — Cancel a running task
161
- - `help` — Show usage
164
+ - `purge` — Delete old completed/failed tasks
162
165
 
163
- **Options:**
164
- - `--state <state>` — Filter by state: `submitted`, `working`, `completed`, `failed`, `canceled`
166
+ Task IDs support prefix matching — use the first 8 characters.
167
+
168
+ **List options:**
169
+ - `--all` — Show all tasks including completed/failed history
170
+ - `--state <state>` — Filter: `submitted`, `working`, `completed`, `failed`, `canceled`
171
+ - `--search <text>` — Full-text search on task input
165
172
  - `--agent <workflow-id>` — Filter by workflow
166
- - `--limit <n>` — Number of tasks to show (default: 20)
167
- - `--output <format>` — Output format: `table` (default), `json`, `markdown`
168
- - `--watch` — Live refresh every 2 seconds
173
+ - `--instance <id>` — Filter by visor instance
174
+ - `--limit <n>` — Tasks per page (default: 20)
175
+ - `--page <n>` — Page number
176
+ - `--output <format>` — Output: `table`, `json`, `markdown` (disables TUI)
177
+ - `--tui` — Force interactive TUI mode
178
+ - `--watch` — Auto-refresh every 2 seconds
179
+
180
+ **Trace options:**
181
+ - `--full` — Show full output without truncation
182
+ - `--output <format>` — Output: `tree` (default), `json`
183
+
184
+ **Evaluate options:**
185
+ - `--model <model>` — LLM model for evaluation (default: from config or env)
186
+ - `--provider <provider>` — AI provider: `google`, `openai`, `anthropic`
187
+ - `--last <n>` — Batch evaluate last N tasks
188
+ - `--state <state>` — Filter for batch mode (default: `completed`)
189
+ - `--prompt <text>` — Custom evaluation system prompt
190
+ - `--output <format>` — Output: `table`, `json`
191
+
192
+ **Purge options:**
193
+ - `--age <duration>` — Maximum age, e.g. `24h`, `7d`, `30d` (default: `7d`)
169
194
 
170
195
  **Examples:**
171
196
  ```bash
172
- visor tasks # List all tasks
173
- visor tasks list --state working # Show only working tasks
174
- visor tasks list --agent security-review # Tasks for a specific workflow
175
- visor tasks list --output json # JSON output
176
- visor tasks list --watch # Live monitoring
197
+ # Browsing tasks
198
+ visor tasks # Interactive TUI browser
199
+ visor tasks --output table # Plain table output
200
+ visor tasks --all # Include completed/failed history
201
+ visor tasks --state failed # Show only failed tasks
202
+ visor tasks --search "auth middleware" # Search by input text
203
+
204
+ # Inspecting individual tasks
205
+ visor tasks show abc123 # Task details with response
206
+ visor tasks show abc123 --output json # Full JSON with evaluation data
207
+
208
+ # Execution traces
209
+ visor tasks trace abc123 # Compact trace tree
210
+ visor tasks trace abc123 --full # Full trace with untruncated outputs
211
+
212
+ # Quality evaluation
213
+ visor tasks evaluate abc123 # Evaluate a single task
214
+ visor tasks evaluate abc123 --output json # Evaluation as JSON
215
+ visor tasks evaluate --last 10 # Batch evaluate last 10 tasks
216
+ visor tasks evaluate --last 5 --model gpt-4o # Use specific model
217
+
218
+ # Administration
177
219
  visor tasks stats # Queue summary
178
220
  visor tasks stats --output json # Stats as JSON
179
- visor tasks cancel abc123 # Cancel a task
221
+ visor tasks cancel abc123 # Cancel a running task
222
+ visor tasks purge --age 30d # Delete tasks older than 30 days
180
223
  ```
181
224
 
182
225
  ### Common CLI Options
@@ -430,6 +430,8 @@ The following global configuration options are available and documented in detai
430
430
  | `sandbox` | Default sandbox name for all steps | [Sandbox Engines](./sandbox-engines.md) |
431
431
  | `sandboxes` | Named sandbox definitions (Docker, Bubblewrap, Seatbelt) | [Sandbox Engines](./sandbox-engines.md) |
432
432
  | `workspace` | Workspace isolation configuration | [Workspace Isolation RFC](./rfc/workspace-isolation.md) |
433
+ | `task_tracking` | Enable cross-frontend task tracking (`true`/`false`) | [Observability](./observability.md) |
434
+ | `task_evaluate` | Auto-evaluate completed tasks with LLM judge (`true` or object) | [Observability](./observability.md) |
433
435
 
434
436
  Example combining several options:
435
437
 
@@ -0,0 +1,178 @@
1
+ # Graceful Restart
2
+
3
+ Visor supports zero-disruption restarts via `SIGUSR1`. When triggered, the old process stops accepting new work, a new process spawns and begins accepting requests, and the old process waits for all in-flight work to complete before exiting. Both processes run in parallel during the transition.
4
+
5
+ ## How It Works
6
+
7
+ ```
8
+ SIGUSR1 received by old process
9
+ → Stop listening on all ports (free ports instantly)
10
+ → Spawn new process with same args/env
11
+ → New process starts, binds ports, sends IPC "ready" signal
12
+ → Old process drains: waits for ALL in-flight work to complete
13
+ → Old process runs cleanup callbacks
14
+ → Old process exits
15
+ ```
16
+
17
+ **Key behavior:** By default, the old process runs **indefinitely** until all in-flight work completes. There is no timeout — active conversations, tool calls, and webhook handlers are never interrupted. You can optionally set a hard timeout via configuration.
18
+
19
+ ## Usage
20
+
21
+ ### Trigger a Restart
22
+
23
+ ```bash
24
+ # Find the Visor PID
25
+ pgrep -f visor
26
+
27
+ # Send SIGUSR1
28
+ kill -USR1 <pid>
29
+ ```
30
+
31
+ ### Kubernetes / Docker
32
+
33
+ ```bash
34
+ # Kubernetes
35
+ kubectl exec -n visor deploy/visor -- kill -USR1 1
36
+
37
+ # Docker
38
+ docker kill --signal=USR1 visor
39
+ ```
40
+
41
+ ### systemd
42
+
43
+ ```ini
44
+ [Service]
45
+ ExecReload=/bin/kill -USR1 $MAINPID
46
+ ```
47
+
48
+ Then reload with:
49
+ ```bash
50
+ systemctl reload visor
51
+ ```
52
+
53
+ ## Configuration
54
+
55
+ Add `graceful_restart` to your `.visor.yaml`:
56
+
57
+ ```yaml
58
+ graceful_restart:
59
+ # Maximum time to wait for in-flight work to complete (milliseconds).
60
+ # 0 = unlimited (default). Old process waits as long as needed.
61
+ drain_timeout_ms: 0
62
+
63
+ # Maximum time to wait for the new process to start and signal readiness.
64
+ # Default: 15000 (15 seconds).
65
+ child_ready_timeout_ms: 15000
66
+
67
+ # Send "bot is restarting" messages to active conversations.
68
+ # Default: true.
69
+ notify_users: true
70
+
71
+ # Override the auto-detected spawn command.
72
+ # Leave empty to auto-detect (recommended).
73
+ restart_command: ""
74
+ ```
75
+
76
+ ## Auto-Detection of Spawn Method
77
+
78
+ Visor automatically detects how it was invoked and spawns the new process accordingly:
79
+
80
+ | Invocation | Spawn behavior |
81
+ |---|---|
82
+ | `npx -y @probelabs/visor@latest --slack` | Re-runs `npx -y @probelabs/visor@latest` + original args (fetches latest version) |
83
+ | `node dist/index.js --slack` | Re-runs `node dist/index.js` + same args (picks up updated binary on disk) |
84
+ | `./dist/index.js --slack` | Re-runs with `process.execPath` + same argv |
85
+ | Custom (`restart_command` set) | Runs the configured command + original Visor args |
86
+
87
+ The `VISOR_RESTART_GENERATION` environment variable is incremented on each restart, letting you track restart generations in logs.
88
+
89
+ ## Graceful Restart vs Config Reload
90
+
91
+ Visor supports two complementary mechanisms for applying changes without disruption:
92
+
93
+ | Mechanism | Signal | Use case | Process lifecycle |
94
+ |---|---|---|---|
95
+ | **Graceful restart** (`SIGUSR1`) | `kill -USR1` | New code, binary updates, dependency changes | Old process drains, new process spawns |
96
+ | **Hot config reload** (`SIGUSR2` / `--watch`) | `kill -USR2` | Config-only changes (thresholds, checks, routing) | Same process, config reloaded in-place |
97
+
98
+ **When to use `--watch`:** If you only need to update `.visor.yaml` (e.g., add a check, change a threshold, adjust routing), use `--watch` to auto-reload on file changes — no restart needed:
99
+
100
+ ```bash
101
+ visor --slack --config .visor.yaml --watch
102
+ ```
103
+
104
+ The `--watch` flag monitors the config file for changes and applies them without restarting. This is faster and lighter than a full graceful restart. Use graceful restart (`SIGUSR1`) when you need to pick up new code or binary changes.
105
+
106
+ ## Signal Reference
107
+
108
+ | Signal | Behavior |
109
+ |---|---|
110
+ | `SIGUSR1` | Graceful restart — spawns new process, drains old |
111
+ | `SIGUSR2` | Hot config reload — reloads `.visor.yaml` in-place (also triggered by `--watch`) |
112
+ | `SIGTERM` | Graceful shutdown (stop + exit) |
113
+ | `SIGINT` | Graceful shutdown (stop + exit) |
114
+
115
+ ## What Gets Drained
116
+
117
+ Each runner type handles draining differently:
118
+
119
+ | Runner | stopListening | drain |
120
+ |---|---|---|
121
+ | **Slack** | Closes WebSocket, stops scheduler | Waits for all active threads to finish |
122
+ | **MCP Server** | Closes HTTP server, frees port | Waits for all active tool calls to complete |
123
+ | **Telegram** | Stops long-polling | Waits for active chat handlers |
124
+ | **Email** | Stops polling interval | Waits for active email processing |
125
+ | **WhatsApp** | Closes webhook HTTP server | Waits for active request handlers |
126
+ | **Teams** | Closes webhook HTTP server | Waits for active request handlers |
127
+ | **A2A** | Closes HTTP server | Waits for active tasks in queue |
128
+
129
+ ## Error Handling
130
+
131
+ | Scenario | Behavior |
132
+ |---|---|
133
+ | New process fails to start | Restart aborted, old process continues serving |
134
+ | New process doesn't become ready in time | Restart aborted, child killed, old process continues |
135
+ | Drain timeout exceeded (if configured) | Old process force-exits; new process is already running |
136
+ | Double SIGUSR1 | Second signal ignored while restart is in progress |
137
+ | SIGTERM during restart | Standard shutdown handler takes over |
138
+
139
+ ## Deployment Patterns
140
+
141
+ ### Blue-Green with SIGUSR1
142
+
143
+ 1. Deploy new code to disk (e.g., `npm install -g @probelabs/visor@latest`)
144
+ 2. Send `SIGUSR1` to the running process
145
+ 3. New process picks up updated binary automatically
146
+ 4. Old process drains and exits
147
+
148
+ ### Rolling Restart in Kubernetes
149
+
150
+ For Kubernetes deployments with multiple replicas, you can use the built-in rolling update strategy instead of SIGUSR1. However, SIGUSR1 is useful for single-replica deployments or when you want to avoid pod recreation:
151
+
152
+ ```bash
153
+ # Restart single instance without pod recreation
154
+ kubectl exec -n visor deploy/visor -- kill -USR1 1
155
+ ```
156
+
157
+ ### CI/CD Integration
158
+
159
+ ```yaml
160
+ # GitHub Actions example
161
+ - name: Deploy and restart
162
+ run: |
163
+ ssh deploy@server "cd /opt/visor && git pull && npm ci && npm run build"
164
+ ssh deploy@server 'kill -USR1 $(cat /var/run/visor.pid)'
165
+ ```
166
+
167
+ ## Monitoring
168
+
169
+ Track restarts via:
170
+ - **Logs:** Look for `[GracefulRestart]` log entries
171
+ - **Environment:** `VISOR_RESTART_GENERATION` shows current generation
172
+ - **OTel:** Restart events appear as spans in telemetry traces
173
+
174
+ ## Limitations
175
+
176
+ - **Windows:** `SIGUSR1` is not available on Windows. Use process restart via your service manager instead.
177
+ - **Slack WebSocket:** The WebSocket connection cannot be transferred between processes. The new process opens a fresh Socket Mode connection. Slack automatically routes new events to the new connection.
178
+ - **npx mode:** When running via npx, each restart fetches the latest published version. Pin versions in `restart_command` if you need deterministic restarts.
@@ -223,6 +223,75 @@ When using `--output json`, full `executionStatistics` object is included with:
223
223
  | `totalDuration` | Total execution time in milliseconds |
224
224
  | Issue counts | By severity: critical, error, warning, info |
225
225
 
226
+ ## Task Tracking & Evaluation
227
+
228
+ Task tracking records every workflow execution (CLI, Slack, TUI, Scheduler) in a shared SQLite store, making each one visible via `visor tasks`.
229
+
230
+ ### Enabling Task Tracking
231
+
232
+ ```yaml
233
+ # .visor.yaml
234
+ task_tracking: true
235
+ ```
236
+
237
+ Or via CLI flag: `visor --task-tracking --slack --config .visor.yaml`
238
+
239
+ ### Automatic Task Evaluation
240
+
241
+ When enabled, every completed task is automatically evaluated by an LLM judge that scores response quality and execution efficiency. Evaluations run asynchronously (non-blocking) after task completion and are stored as task artifacts.
242
+
243
+ ```yaml
244
+ # Simple — enable with defaults
245
+ task_evaluate: true
246
+
247
+ # With configuration
248
+ task_evaluate:
249
+ enabled: true
250
+ model: gemini-2.5-flash # LLM model (default: auto-detect from API keys)
251
+ provider: google # google, openai, anthropic
252
+ prompt: "Custom evaluation..." # Override default evaluation prompt
253
+ ```
254
+
255
+ Environment variables (override config):
256
+ - `VISOR_TASK_EVALUATE=true` — enable auto-evaluation
257
+ - `VISOR_EVAL_MODEL` — evaluation model
258
+ - `VISOR_EVAL_PROVIDER` — evaluation provider
259
+ - `VISOR_EVAL_PROMPT` — custom system prompt
260
+
261
+ ### Execution Traces
262
+
263
+ Each task captures an OpenTelemetry trace that records the full execution pipeline: check ordering, AI model calls with token counts, tool calls with result sizes, and delegation chains. View traces with:
264
+
265
+ ```bash
266
+ visor tasks trace <task-id> # Compact YAML tree
267
+ visor tasks trace <task-id> --full # Full untruncated output
268
+ ```
269
+
270
+ The trace tree shows:
271
+ - **visor.run** — root span with metadata (trace_id, version, source, duration)
272
+ - **Checks** — named steps with type (ai/script/workflow), duration, input context, and output
273
+ - **AI blocks** — LLM calls with model, token counts, and intent
274
+ - **Tool calls** — search, extract, listFiles with input queries and result sizes (or "no results")
275
+ - **Delegations** — sub-agent searches with nested AI/tool chains
276
+
277
+ Traces are also included in the LLM evaluation prompt, allowing the judge to assess execution efficiency alongside response quality.
278
+
279
+ ### Evaluation Results
280
+
281
+ Evaluations rate tasks on two axes:
282
+
283
+ | Axis | Rating | Categories |
284
+ |------|--------|------------|
285
+ | **Response quality** | 1-5 | excellent, good, adequate, poor, off-topic, error |
286
+ | **Execution quality** | 1-5 | efficient, adequate, wasteful, error |
287
+
288
+ View stored evaluations:
289
+ ```bash
290
+ visor tasks show <task-id> # Includes evaluation inline
291
+ visor tasks show <task-id> --output json # Full evaluation object
292
+ visor tasks evaluate --last 10 # Batch evaluate recent tasks
293
+ ```
294
+
226
295
  ## Related Documentation
227
296
 
228
297
  - [Output Formats](./output-formats.md) - Detailed format specifications
@@ -527,6 +527,23 @@ visor config restore 1 --output restored.yaml
527
527
 
528
528
  ## Upgrading
529
529
 
530
+ ### Graceful Restart (Zero-Disruption)
531
+
532
+ Visor supports zero-disruption restarts via `SIGUSR1`. The old process stops accepting new work, a new process spawns, and the old process waits for all in-flight work to complete before exiting. Both processes run in parallel during the transition.
533
+
534
+ ```bash
535
+ # Deploy new code, then trigger graceful restart
536
+ kill -USR1 $(pgrep -f visor)
537
+
538
+ # Kubernetes
539
+ kubectl exec -n visor deploy/visor -- kill -USR1 1
540
+
541
+ # Docker
542
+ docker kill --signal=USR1 visor
543
+ ```
544
+
545
+ By default, the old process waits **indefinitely** for active conversations and requests to complete. See the [Graceful Restart Guide](./guides/graceful-restart.md) for full configuration options.
546
+
530
547
  ### Rolling Update (Kubernetes)
531
548
 
532
549
  ```bash
@@ -50,6 +50,7 @@ export declare class EmailPollingRunner implements Runner {
50
50
  private sendConfig?;
51
51
  private resendLastSeenId?;
52
52
  private hasWebhookSecret;
53
+ private activeProcessing;
53
54
  constructor(engine: StateMachineExecutionEngine, cfg: VisorConfig, opts: EmailPollingConfig);
54
55
  /** Get the EmailClient instance (for shared access) */
55
56
  getClient(): EmailClient;
@@ -58,6 +59,8 @@ export declare class EmailPollingRunner implements Runner {
58
59
  /** Hot-swap config for future requests */
59
60
  updateConfig(cfg: VisorConfig): void;
60
61
  start(): Promise<void>;
62
+ stopListening(): Promise<void>;
63
+ drain(timeoutMs?: number): Promise<void>;
61
64
  stop(): Promise<void>;
62
65
  private startImapPolling;
63
66
  private pollOnce;
@@ -72,6 +75,7 @@ export declare class EmailPollingRunner implements Runner {
72
75
  error?: string;
73
76
  }>;
74
77
  private handleMessage;
78
+ private handleMessageInner;
75
79
  /** Ensure email frontend is in the config for this run */
76
80
  private prepareConfigForRun;
77
81
  /** Deduplication: track processed messages by Message-ID */