npm - @probelabs/visor - Versions diffs - 0.1.181 → 0.1.182 - Mend

@probelabs/visor 0.1.181 → 0.1.182

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (158) hide show

package/dist/defaults/code-talk.yaml CHANGED Viewed

@@ -136,18 +136,59 @@ outputs:
     value_js: |
       const result = outputs?.['explore-code'];
       if (result?.answer) return result.answer;
-      const routeOutput = outputs?.['setup-projects']?.routing_decision;
-      // Handle proper notes field
-      const routeNotes = routeOutput?.notes;
-      if (typeof routeNotes === 'string' && routeNotes.trim().length > 0) {
-        return { text: routeNotes };
+      const resultText = typeof result?.text === 'string' ? result.text.trim() : '';
+      if (resultText.length > 0) {
+        return { text: resultText };
       }
-      // Fallback: if AI returned {text: "..."} instead of proper schema
-      const routeText = routeOutput?.text;
-      if (typeof routeText === 'string' && routeText.trim().length > 0) {
-        return { text: routeText };
+      const routeNotes = outputs?.['setup-projects']?.routing_decision?.notes;
+      const checkoutProjects = outputs?.['setup-projects']?.checkout_projects;
+      if (
+        (!Array.isArray(checkoutProjects) || checkoutProjects.length === 0) &&
+        typeof routeNotes === 'string' &&
+        routeNotes.trim().length > 0
+      ) {
+        return { text: routeNotes.trim() };
+      }
+      return { text: 'Code exploration did not produce an answer.' };
+  - name: exploration_status
+    description: Outcome of the exploration step
+    value_js: |
+      const result = outputs?.['explore-code'];
+      if (result?.answer?.text) return 'success';
+      const resultText = typeof result?.text === 'string' ? result.text.trim() : '';
+      if (resultText.length > 0) {
+        if (/timed out/i.test(resultText)) return 'timeout';
+        return 'failed';
       }
-      return null;
+      const routeNotes = outputs?.['setup-projects']?.routing_decision?.notes;
+      const checkoutProjects = outputs?.['setup-projects']?.checkout_projects;
+      if (
+        (!Array.isArray(checkoutProjects) || checkoutProjects.length === 0) &&
+        typeof routeNotes === 'string' &&
+        routeNotes.trim().length > 0
+      ) {
+        return 'no_projects';
+      }
+      return 'failed';
+  - name: exploration_error
+    description: Timeout or failure detail when exploration did not return a real answer
+    value_js: |
+      const result = outputs?.['explore-code'];
+      if (result?.answer?.text) return '';
+      const resultText = typeof result?.text === 'string' ? result.text.trim() : '';
+      if (resultText.length > 0) return resultText;
+      const routeNotes = outputs?.['setup-projects']?.routing_decision?.notes;
+      const checkoutProjects = outputs?.['setup-projects']?.checkout_projects;
+      if (
+        (!Array.isArray(checkoutProjects) || checkoutProjects.length === 0) &&
+        typeof routeNotes === 'string' &&
+        routeNotes.trim().length > 0
+      ) {
+        return routeNotes.trim();
+      }
+      return 'Code exploration did not produce an answer.';
   - name: references
     description: Code/doc references from exploration
@@ -174,11 +215,20 @@ outputs:
       const result = outputs?.['explore-code'];
       const confidence = result?.confidence;
       const reason = result?.confidence_reason;
-      if (typeof reason === 'string') return reason;
+      if (typeof reason === 'string' && reason.trim().length > 0) return reason;
       if (confidence === 'high') return '';
+      const resultText = typeof result?.text === 'string' ? result.text.trim() : '';
+      if (resultText.length > 0) return resultText;
       const routeNotes = outputs?.['setup-projects']?.routing_decision?.notes;
-      if (typeof routeNotes === 'string' && routeNotes.trim().length > 0) return routeNotes;
-      return 'No confidence explanation was provided by explore-code.';
+      const checkoutProjects = outputs?.['setup-projects']?.checkout_projects;
+      if (
+        (!Array.isArray(checkoutProjects) || checkoutProjects.length === 0) &&
+        typeof routeNotes === 'string' &&
+        routeNotes.trim().length > 0
+      ) {
+        return routeNotes.trim();
+      }
+      return 'Code exploration did not produce an answer.';
   - name: projects_explored
     description: Which project IDs were checked out
@@ -261,7 +311,7 @@ steps:
       skip_code_context: true
       enableDelegate: true
       enableExecutePlan: false
-      max_iterations: 50
+      max_iterations: 100
       prompt_type: code-explorer
       allowBash: true
       bashConfig:
@@ -415,8 +465,17 @@ steps:
       - Each delegate should answer ONE specific question (not "look at the code")
       - Run multiple delegates in PARALLEL for different hypotheses or components
       - Ask delegates to return specific file paths and line numbers
+      - Do NOT delegate or re-search the same question twice in one investigation
+      - If a delegate returns enough evidence for the current claim, stop and use it
       Relay complete data from tools — do not summarize or compress tool output.
+      Investigation scope:
+      - Stop once you have enough evidence to answer the question accurately
+      - If this is an implementation handoff for engineer, optimize for the minimum
+        sufficient handoff: repo, branch/ref, target files, relevant tests, and the
+        key evidence explaining why those files matter
+      - Prefer one search followed by targeted extract over repeated broad searches
       </instructions>
       {% if inputs.exploration_prompt %}
@@ -468,6 +527,13 @@ steps:
       implementation, then consult docs to confirm semantics. When multiple projects
       are involved, trace data and config flow across them.
+      Efficiency rules for this investigation:
+      - Reuse evidence already found in earlier tool results
+      - If the question is narrow and the relevant files are already identified,
+        stop exploring and answer
+      - If the next consumer is engineer, avoid broad code archaeology once the
+        implementation target and validation path are clear
       Synthesize a single answer:
       - Ground everything in code/docs evidence
       - End with a "## References" section with clickable GitHub links:

package/dist/defaults/engineer.yaml CHANGED Viewed

@@ -338,19 +338,25 @@ steps:
       <delegation>
       Use the delegate tool for parallel work, plan validation, and build discovery.
-      MANDATORY FIRST STEP — delegate "Discover build system" for each repository:
+      FIRST decide whether delegation is needed.
+      Delegate "Discover build system" ONLY when the exact commands are not already
+      available in the provided context, code-explorer output, project metadata, or
+      recent tool results.
       - Check: Makefile, package.json (scripts), Cargo.toml, go.mod, pyproject.toml
       - Check: CI config (.github/workflows/, .gitlab-ci.yml, Jenkinsfile)
       - Check: README for build/test/lint instructions
       - Return the EXACT commands for: build, test, lint/format, and any pre-commit hooks
       - Example output: "build: make, test: make test, lint: gofmt -l . && golangci-lint run"
-      Use these commands throughout the session. Do NOT guess — use what the delegate found.
+      - Reuse these commands throughout the session. Do NOT rediscover them once known.
-      MANDATORY BEFORE IMPLEMENTATION — delegate "Plan validation":
+      Delegate "Plan validation" ONLY when the task is broad, high-risk, multi-repo,
+      or the implementation path is still unclear after reviewing existing context.
       - Describe: files to change, approach, patterns to follow
       - Ask the delegate to verify: do these files exist? Are there existing tests?
         Are there related utilities or patterns to reuse? Any API contracts to respect?
-      - Wait for the response before writing code
+      - Skip this delegate for narrow single-repo changes when code-explorer or direct
+        inspection already identified the target files, branch, and validation path.
       Also delegate for:
       - Multi-repo changes (one delegate per repo, in parallel)
@@ -361,9 +367,12 @@ steps:
       - Sequential dependent work (step B needs step A's output)
       - Simple single-file edits (fewer than 5 iterations)
       - Git operations (commit, push, PR) — always do these yourself
+      - Questions you already delegated once in this session
       Delegates have fewer iterations and no access to your conversation.
       Provide all necessary context in the delegate prompt.
+      If a delegate returns empty output, times out, or repeats information already
+      known, do NOT call the same delegate again. Fall back to direct tools.
       </delegation>
       <git-workflow>
@@ -372,12 +381,17 @@ steps:
       Before your final response, verify:
       □ Build passes (using exact commands from "Discover build system" delegate)
-      □ Tests pass (run the full test suite, not just your new tests)
+      □ Tests pass (start with the narrowest relevant tests; run broader suites only
+        when required by repo policy, when the change is cross-cutting, or when focused
+        tests indicate wider impact)
       □ Lint/format passes (if the project has a linter)
       □ git add <files>
       □ git commit -m "descriptive message"
       □ git push -u origin <branch-name>
       □ gh pr create (for new PRs) or update existing PR
+        - For a new branch: ALWAYS push first, then use `gh pr create --head <branch-name>`
+        - If PR creation fails, inspect stderr, fix the missing prerequisite, and retry once
+        - Do NOT repeat the same `gh pr create` command after the same error
       No PR URL = failed task. Report errors honestly, never claim false success.
       If build/test/lint fails, fix the issue before committing. If you cannot fix it,
@@ -397,19 +411,23 @@ steps:
       <efficiency>
       - Use context data directly — don't re-read files or re-run searches for
         information already provided by code-explorer.
+      - If code-explorer already identified the repo, branch, files, tests, or exact
+        commands, treat that as the default source of truth unless a tool result proves
+        it wrong.
       - If a project has <setup> commands listed, run them FIRST (in the project's
         directory) before any other work. These are prerequisites (e.g., `npm install`,
         `make deps`, database migrations).
-      - Use tasks to track multi-step work. Create these tasks at minimum:
-        1. "Run setup commands" — execute <setup> commands for each project (if any)
-        2. "Discover build system" (delegate) — find exact build/test/lint commands
-        3. "Plan validation" (delegate) — verify approach before coding
-        4. "Implement changes" — the actual code/file changes
-        5. "Verify build" — run build, test, and lint commands; fix any failures
-        6. "Create pull request" — git branch, add, commit, push, gh pr create
-        Mark in_progress/completed as you go. Do NOT skip "Verify build".
-      - If a bash command fails, try a different approach. Don't retry the same
-        command or get stuck in loops.
+      - Use tasks to track real phases of work, not every obvious micro-step.
+      - For narrow single-repo changes, keep the task list minimal:
+        1. "Implement changes"
+        2. "Verify build"
+        3. "Create pull request"
+      - Add "Run setup commands", "Discover build system", or "Plan validation"
+        ONLY when you actually need to perform those steps.
+      - Mark in_progress/completed as you go. Do NOT skip "Verify build".
+      - If a bash command fails, diagnose the cause before retrying.
+      - Do NOT repeat the same logical action after the same error unless you changed
+        a prerequisite (for example: push before re-running `gh pr create`).
       </efficiency>
       {% assign has_trace = inputs.trace_id | size %}
       {% assign has_slack_user = inputs.slack_user_id | size %}

package/dist/defaults/skills/code-explorer.yaml CHANGED Viewed

@@ -29,6 +29,11 @@ knowledge: |
   - If confidence "high", trust the answer — do NOT re-call with rephrased question
   - Only call again for a genuinely DIFFERENT aspect of the codebase
   - If confidence "medium" or "low", check confidence_reason for what to refine
+  - If `exploration_status` is `timeout`, `failed`, or `no_projects`, do NOT re-call
+    with a paraphrase of the same question. Report the failure honestly and only retry
+    if you can narrow the question or change the scope.
+  - If `references` is empty and confidence is low, treat that as "not answered yet",
+    not as a usable code answer.
   ## Usage Instructions
   1. Call the `code-explorer` tool with the user's question — do NOT try to answer code questions yourself

package/dist/docs/commands.md CHANGED Viewed

@@ -148,35 +148,78 @@ visor mcp-server --transport http --config defaults/code-review.yaml \
 #### `visor tasks`
-Monitor and manage A2A agent tasks.
+Monitor, inspect, and evaluate agent tasks. Requires `task_tracking: true` (or `--task-tracking` CLI flag).
 ```bash
 visor tasks [command] [options]
 ```
 **Subcommands:**
-- `list` (default) — List tasks with optional filters
+- `list` (default) — List tasks (interactive TUI in TTY, table otherwise)
+- `show <task-id>` — Show full task details including response and evaluation
+- `trace <task-id>` — Show execution trace tree (YAML-formatted span hierarchy)
+- `evaluate <task-id>` — Evaluate task quality with LLM judge
 - `stats` — Queue summary statistics
 - `cancel <task-id>` — Cancel a running task
-- `help` — Show usage
+- `purge` — Delete old completed/failed tasks
-**Options:**
-- `--state <state>` — Filter by state: `submitted`, `working`, `completed`, `failed`, `canceled`
+Task IDs support prefix matching — use the first 8 characters.
+**List options:**
+- `--all` — Show all tasks including completed/failed history
+- `--state <state>` — Filter: `submitted`, `working`, `completed`, `failed`, `canceled`
+- `--search <text>` — Full-text search on task input
 - `--agent <workflow-id>` — Filter by workflow
-- `--limit <n>` — Number of tasks to show (default: 20)
-- `--output <format>` — Output format: `table` (default), `json`, `markdown`
-- `--watch` — Live refresh every 2 seconds
+- `--instance <id>` — Filter by visor instance
+- `--limit <n>` — Tasks per page (default: 20)
+- `--page <n>` — Page number
+- `--output <format>` — Output: `table`, `json`, `markdown` (disables TUI)
+- `--tui` — Force interactive TUI mode
+- `--watch` — Auto-refresh every 2 seconds
+**Trace options:**
+- `--full` — Show full output without truncation
+- `--output <format>` — Output: `tree` (default), `json`
+**Evaluate options:**
+- `--model <model>` — LLM model for evaluation (default: from config or env)
+- `--provider <provider>` — AI provider: `google`, `openai`, `anthropic`
+- `--last <n>` — Batch evaluate last N tasks
+- `--state <state>` — Filter for batch mode (default: `completed`)
+- `--prompt <text>` — Custom evaluation system prompt
+- `--output <format>` — Output: `table`, `json`
+**Purge options:**
+- `--age <duration>` — Maximum age, e.g. `24h`, `7d`, `30d` (default: `7d`)
 **Examples:**
 ```bash
-visor tasks                                  # List all tasks
-visor tasks list --state working             # Show only working tasks
-visor tasks list --agent security-review     # Tasks for a specific workflow
-visor tasks list --output json               # JSON output
-visor tasks list --watch                     # Live monitoring
+# Browsing tasks
+visor tasks                                  # Interactive TUI browser
+visor tasks --output table                   # Plain table output
+visor tasks --all                            # Include completed/failed history
+visor tasks --state failed                   # Show only failed tasks
+visor tasks --search "auth middleware"        # Search by input text
+# Inspecting individual tasks
+visor tasks show abc123                      # Task details with response
+visor tasks show abc123 --output json        # Full JSON with evaluation data
+# Execution traces
+visor tasks trace abc123                     # Compact trace tree
+visor tasks trace abc123 --full              # Full trace with untruncated outputs
+# Quality evaluation
+visor tasks evaluate abc123                  # Evaluate a single task
+visor tasks evaluate abc123 --output json    # Evaluation as JSON
+visor tasks evaluate --last 10               # Batch evaluate last 10 tasks
+visor tasks evaluate --last 5 --model gpt-4o # Use specific model
+# Administration
 visor tasks stats                            # Queue summary
 visor tasks stats --output json              # Stats as JSON
-visor tasks cancel abc123                    # Cancel a task
+visor tasks cancel abc123                    # Cancel a running task
+visor tasks purge --age 30d                  # Delete tasks older than 30 days
 ```
 ### Common CLI Options

package/dist/docs/configuration.md CHANGED Viewed

@@ -430,6 +430,8 @@ The following global configuration options are available and documented in detai
 | `sandbox` | Default sandbox name for all steps | [Sandbox Engines](./sandbox-engines.md) |
 | `sandboxes` | Named sandbox definitions (Docker, Bubblewrap, Seatbelt) | [Sandbox Engines](./sandbox-engines.md) |
 | `workspace` | Workspace isolation configuration | [Workspace Isolation RFC](./rfc/workspace-isolation.md) |
+| `task_tracking` | Enable cross-frontend task tracking (`true`/`false`) | [Observability](./observability.md) |
+| `task_evaluate` | Auto-evaluate completed tasks with LLM judge (`true` or object) | [Observability](./observability.md) |
 Example combining several options:

package/dist/docs/guides/graceful-restart.md ADDED Viewed

@@ -0,0 +1,178 @@
+# Graceful Restart
+Visor supports zero-disruption restarts via `SIGUSR1`. When triggered, the old process stops accepting new work, a new process spawns and begins accepting requests, and the old process waits for all in-flight work to complete before exiting. Both processes run in parallel during the transition.
+## How It Works
+```
+SIGUSR1 received by old process
+  → Stop listening on all ports (free ports instantly)
+  → Spawn new process with same args/env
+  → New process starts, binds ports, sends IPC "ready" signal
+  → Old process drains: waits for ALL in-flight work to complete
+  → Old process runs cleanup callbacks
+  → Old process exits
+```
+**Key behavior:** By default, the old process runs **indefinitely** until all in-flight work completes. There is no timeout — active conversations, tool calls, and webhook handlers are never interrupted. You can optionally set a hard timeout via configuration.
+## Usage
+### Trigger a Restart
+```bash
+# Find the Visor PID
+pgrep -f visor
+# Send SIGUSR1
+kill -USR1 <pid>
+```
+### Kubernetes / Docker
+```bash
+# Kubernetes
+kubectl exec -n visor deploy/visor -- kill -USR1 1
+# Docker
+docker kill --signal=USR1 visor
+```
+### systemd
+```ini
+[Service]
+ExecReload=/bin/kill -USR1 $MAINPID
+```
+Then reload with:
+```bash
+systemctl reload visor
+```
+## Configuration
+Add `graceful_restart` to your `.visor.yaml`:
+```yaml
+graceful_restart:
+  # Maximum time to wait for in-flight work to complete (milliseconds).
+  # 0 = unlimited (default). Old process waits as long as needed.
+  drain_timeout_ms: 0
+  # Maximum time to wait for the new process to start and signal readiness.
+  # Default: 15000 (15 seconds).
+  child_ready_timeout_ms: 15000
+  # Send "bot is restarting" messages to active conversations.
+  # Default: true.
+  notify_users: true
+  # Override the auto-detected spawn command.
+  # Leave empty to auto-detect (recommended).
+  restart_command: ""
+```
+## Auto-Detection of Spawn Method
+Visor automatically detects how it was invoked and spawns the new process accordingly:
+| Invocation | Spawn behavior |
+|---|---|
+| `npx -y @probelabs/visor@latest --slack` | Re-runs `npx -y @probelabs/visor@latest` + original args (fetches latest version) |
+| `node dist/index.js --slack` | Re-runs `node dist/index.js` + same args (picks up updated binary on disk) |
+| `./dist/index.js --slack` | Re-runs with `process.execPath` + same argv |
+| Custom (`restart_command` set) | Runs the configured command + original Visor args |
+The `VISOR_RESTART_GENERATION` environment variable is incremented on each restart, letting you track restart generations in logs.
+## Graceful Restart vs Config Reload
+Visor supports two complementary mechanisms for applying changes without disruption:
+| Mechanism | Signal | Use case | Process lifecycle |
+|---|---|---|---|
+| **Graceful restart** (`SIGUSR1`) | `kill -USR1` | New code, binary updates, dependency changes | Old process drains, new process spawns |
+| **Hot config reload** (`SIGUSR2` / `--watch`) | `kill -USR2` | Config-only changes (thresholds, checks, routing) | Same process, config reloaded in-place |
+**When to use `--watch`:** If you only need to update `.visor.yaml` (e.g., add a check, change a threshold, adjust routing), use `--watch` to auto-reload on file changes — no restart needed:
+```bash
+visor --slack --config .visor.yaml --watch
+```
+The `--watch` flag monitors the config file for changes and applies them without restarting. This is faster and lighter than a full graceful restart. Use graceful restart (`SIGUSR1`) when you need to pick up new code or binary changes.
+## Signal Reference
+| Signal | Behavior |
+|---|---|
+| `SIGUSR1` | Graceful restart — spawns new process, drains old |
+| `SIGUSR2` | Hot config reload — reloads `.visor.yaml` in-place (also triggered by `--watch`) |
+| `SIGTERM` | Graceful shutdown (stop + exit) |
+| `SIGINT` | Graceful shutdown (stop + exit) |
+## What Gets Drained
+Each runner type handles draining differently:
+| Runner | stopListening | drain |
+|---|---|---|
+| **Slack** | Closes WebSocket, stops scheduler | Waits for all active threads to finish |
+| **MCP Server** | Closes HTTP server, frees port | Waits for all active tool calls to complete |
+| **Telegram** | Stops long-polling | Waits for active chat handlers |
+| **Email** | Stops polling interval | Waits for active email processing |
+| **WhatsApp** | Closes webhook HTTP server | Waits for active request handlers |
+| **Teams** | Closes webhook HTTP server | Waits for active request handlers |
+| **A2A** | Closes HTTP server | Waits for active tasks in queue |
+## Error Handling
+| Scenario | Behavior |
+|---|---|
+| New process fails to start | Restart aborted, old process continues serving |
+| New process doesn't become ready in time | Restart aborted, child killed, old process continues |
+| Drain timeout exceeded (if configured) | Old process force-exits; new process is already running |
+| Double SIGUSR1 | Second signal ignored while restart is in progress |
+| SIGTERM during restart | Standard shutdown handler takes over |
+## Deployment Patterns
+### Blue-Green with SIGUSR1
+1. Deploy new code to disk (e.g., `npm install -g @probelabs/visor@latest`)
+2. Send `SIGUSR1` to the running process
+3. New process picks up updated binary automatically
+4. Old process drains and exits
+### Rolling Restart in Kubernetes
+For Kubernetes deployments with multiple replicas, you can use the built-in rolling update strategy instead of SIGUSR1. However, SIGUSR1 is useful for single-replica deployments or when you want to avoid pod recreation:
+```bash
+# Restart single instance without pod recreation
+kubectl exec -n visor deploy/visor -- kill -USR1 1
+```
+### CI/CD Integration
+```yaml
+# GitHub Actions example
+- name: Deploy and restart
+  run: |
+    ssh deploy@server "cd /opt/visor && git pull && npm ci && npm run build"
+    ssh deploy@server 'kill -USR1 $(cat /var/run/visor.pid)'
+```
+## Monitoring
+Track restarts via:
+- **Logs:** Look for `[GracefulRestart]` log entries
+- **Environment:** `VISOR_RESTART_GENERATION` shows current generation
+- **OTel:** Restart events appear as spans in telemetry traces
+## Limitations
+- **Windows:** `SIGUSR1` is not available on Windows. Use process restart via your service manager instead.
+- **Slack WebSocket:** The WebSocket connection cannot be transferred between processes. The new process opens a fresh Socket Mode connection. Slack automatically routes new events to the new connection.
+- **npx mode:** When running via npx, each restart fetches the latest published version. Pin versions in `restart_command` if you need deterministic restarts.

package/dist/docs/observability.md CHANGED Viewed

@@ -223,6 +223,75 @@ When using `--output json`, full `executionStatistics` object is included with:
 | `totalDuration` | Total execution time in milliseconds |
 | Issue counts | By severity: critical, error, warning, info |
+## Task Tracking & Evaluation
+Task tracking records every workflow execution (CLI, Slack, TUI, Scheduler) in a shared SQLite store, making each execution visible via `visor tasks`.
+### Enabling Task Tracking
+```yaml
+# .visor.yaml
+task_tracking: true
+```
+Or via CLI flag: `visor --task-tracking --slack --config .visor.yaml`
+### Automatic Task Evaluation
+When enabled, every completed task is automatically evaluated by an LLM judge that scores response quality and execution efficiency. Evaluations run asynchronously (non-blocking) after task completion and are stored as task artifacts.
+```yaml
+# Simple — enable with defaults
+task_evaluate: true
+# With configuration
+task_evaluate:
+  enabled: true
+  model: gemini-2.5-flash        # LLM model (default: auto-detect from API keys)
+  provider: google                # google, openai, anthropic
+  prompt: "Custom evaluation..."  # Override default evaluation prompt
+```
+Environment variables (override config):
+- `VISOR_TASK_EVALUATE=true` — enable auto-evaluation
+- `VISOR_EVAL_MODEL` — evaluation model
+- `VISOR_EVAL_PROVIDER` — evaluation provider
+- `VISOR_EVAL_PROMPT` — custom system prompt
+### Execution Traces
+Each task captures an OpenTelemetry trace that records the full execution pipeline: check ordering, AI model calls with token counts, tool calls with result sizes, and delegation chains. View traces with:
+```bash
+visor tasks trace <task-id>          # Compact YAML tree
+visor tasks trace <task-id> --full   # Full untruncated output
+```
+The trace tree shows:
+- **visor.run** — root span with metadata (trace_id, version, source, duration)
+- **Checks** — named steps with type (ai/script/workflow), duration, input context, and output
+- **AI blocks** — LLM calls with model, token counts, and intent
+- **Tool calls** — search, extract, listFiles with input queries and result sizes (or "no results")
+- **Delegations** — sub-agent searches with nested AI/tool chains
+Traces are also included in the LLM evaluation prompt, allowing the judge to assess execution efficiency alongside response quality.
+### Evaluation Results
+Evaluations rate tasks on two axes:
+| Axis | Rating | Categories |
+|------|--------|------------|
+| **Response quality** | 1-5 | excellent, good, adequate, poor, off-topic, error |
+| **Execution quality** | 1-5 | efficient, adequate, wasteful, error |
+View stored evaluations:
+```bash
+visor tasks show <task-id>               # Includes evaluation inline
+visor tasks show <task-id> --output json # Full evaluation object
+visor tasks evaluate --last 10           # Batch evaluate recent tasks
+```
 ## Related Documentation
 - [Output Formats](./output-formats.md) - Detailed format specifications

package/dist/docs/production-deployment.md CHANGED Viewed

@@ -527,6 +527,23 @@ visor config restore 1 --output restored.yaml
 ## Upgrading
+### Graceful Restart (Zero-Disruption)
+Visor supports zero-disruption restarts via `SIGUSR1`. The old process stops accepting new work, a new process spawns, and the old process waits for all in-flight work to complete before exiting. Both processes run in parallel during the transition.
+```bash
+# Deploy new code, then trigger graceful restart
+kill -USR1 $(pgrep -f visor)
+# Kubernetes
+kubectl exec -n visor deploy/visor -- kill -USR1 1
+# Docker
+docker kill --signal=USR1 visor
+```
+By default, the old process waits **indefinitely** for active conversations and requests to complete. See [Graceful Restart Guide](./guides/graceful-restart.md) for full configuration options.
 ### Rolling Update (Kubernetes)
 ```bash

package/dist/email/polling-runner.d.ts CHANGED Viewed

@@ -50,6 +50,7 @@ export declare class EmailPollingRunner implements Runner {
     private sendConfig?;
     private resendLastSeenId?;
     private hasWebhookSecret;
+    private activeProcessing;
     constructor(engine: StateMachineExecutionEngine, cfg: VisorConfig, opts: EmailPollingConfig);
     /** Get the EmailClient instance (for shared access) */
     getClient(): EmailClient;
@@ -58,6 +59,8 @@ export declare class EmailPollingRunner implements Runner {
     /** Hot-swap config for future requests */
     updateConfig(cfg: VisorConfig): void;
     start(): Promise<void>;
+    stopListening(): Promise<void>;
+    drain(timeoutMs?: number): Promise<void>;
     stop(): Promise<void>;
     private startImapPolling;
     private pollOnce;
@@ -72,6 +75,7 @@ export declare class EmailPollingRunner implements Runner {
         error?: string;
     }>;
     private handleMessage;
+    private handleMessageInner;
     /** Ensure email frontend is in the config for this run */
     private prepareConfigForRun;
     /** Deduplication: track processed messages by Message-ID */