bernard-agent 0.5.1 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. package/README.md +162 -39
  2. package/dist/agent.d.ts +17 -2
  3. package/dist/agent.js +166 -7
  4. package/dist/agent.js.map +1 -1
  5. package/dist/config.d.ts +10 -2
  6. package/dist/config.js +36 -11
  7. package/dist/config.js.map +1 -1
  8. package/dist/context.d.ts +4 -2
  9. package/dist/context.js +9 -6
  10. package/dist/context.js.map +1 -1
  11. package/dist/cron/runner.js +5 -0
  12. package/dist/cron/runner.js.map +1 -1
  13. package/dist/domains.js +35 -0
  14. package/dist/domains.js.map +1 -1
  15. package/dist/index.js +1 -0
  16. package/dist/index.js.map +1 -1
  17. package/dist/output.d.ts +18 -0
  18. package/dist/output.js +79 -5
  19. package/dist/output.js.map +1 -1
  20. package/dist/paths.d.ts +2 -0
  21. package/dist/paths.js +3 -1
  22. package/dist/paths.js.map +1 -1
  23. package/dist/rag-worker.js +16 -0
  24. package/dist/rag-worker.js.map +1 -1
  25. package/dist/repl.js +372 -7
  26. package/dist/repl.js.map +1 -1
  27. package/dist/reserved-names.d.ts +5 -0
  28. package/dist/reserved-names.js +31 -0
  29. package/dist/reserved-names.js.map +1 -0
  30. package/dist/routines.js +10 -19
  31. package/dist/routines.js.map +1 -1
  32. package/dist/specialist-candidates.d.ts +45 -0
  33. package/dist/specialist-candidates.js +154 -0
  34. package/dist/specialist-candidates.js.map +1 -0
  35. package/dist/specialist-detector.d.ts +12 -0
  36. package/dist/specialist-detector.js +124 -0
  37. package/dist/specialist-detector.js.map +1 -0
  38. package/dist/specialists.d.ts +50 -0
  39. package/dist/specialists.js +173 -0
  40. package/dist/specialists.js.map +1 -0
  41. package/dist/tools/agent-pool.d.ts +20 -0
  42. package/dist/tools/agent-pool.js +41 -0
  43. package/dist/tools/agent-pool.js.map +1 -0
  44. package/dist/tools/index.d.ts +2 -1
  45. package/dist/tools/index.js +3 -1
  46. package/dist/tools/index.js.map +1 -1
  47. package/dist/tools/specialist-run.d.ts +39 -0
  48. package/dist/tools/specialist-run.js +123 -0
  49. package/dist/tools/specialist-run.js.map +1 -0
  50. package/dist/tools/specialist.d.ts +40 -0
  51. package/dist/tools/specialist.js +107 -0
  52. package/dist/tools/specialist.js.map +1 -0
  53. package/dist/tools/subagent.d.ts +1 -1
  54. package/dist/tools/subagent.js +11 -11
  55. package/dist/tools/subagent.js.map +1 -1
  56. package/dist/tools/task.d.ts +45 -0
  57. package/dist/tools/task.js +155 -0
  58. package/dist/tools/task.js.map +1 -0
  59. package/dist/update.d.ts +7 -0
  60. package/dist/update.js +15 -2
  61. package/dist/update.js.map +1 -1
  62. package/package.json +1 -1
package/README.md CHANGED
@@ -24,7 +24,11 @@ A local CLI AI agent that executes terminal commands, manages scheduled tasks, r
24
24
  - [Date and Time](#date-and-time)
25
25
  - [Time Range Calculations](#time-range-calculations)
26
26
  - [Sub-Agents](#sub-agents)
27
+ - [Tasks](#tasks)
27
28
  - [Routines](#routines)
29
+ - [Specialists](#specialists)
30
+ - [Specialist Suggestions](#specialist-suggestions)
31
+ - [Critic Mode](#critic-mode)
28
32
  - [Cron Jobs (Scheduled Tasks)](#cron-jobs-scheduled-tasks)
29
33
  - [Creating Jobs](#creating-jobs)
30
34
  - [Managing Jobs](#managing-jobs)
@@ -124,17 +128,19 @@ bernard providers
124
128
 
125
129
  Bernard loads `.env` from the current directory first, then falls back to `~/.bernard/.env`.
126
130
 
127
- | Variable | Description | Default |
128
- | ----------------------- | ------------------------------------------- | ------------------------- |
129
- | `BERNARD_PROVIDER` | LLM provider (`anthropic`, `openai`, `xai`) | `anthropic` |
130
- | `BERNARD_MODEL` | Model name | Provider-specific default |
131
- | `BERNARD_MAX_TOKENS` | Max response tokens | `4096` |
132
- | `BERNARD_SHELL_TIMEOUT` | Shell command timeout (ms) | `30000` |
133
- | `BERNARD_RAG_ENABLED` | Enable the RAG memory system | `true` |
134
- | `BERNARD_DEBUG` | Enable debug logging | unset |
135
- | `ANTHROPIC_API_KEY` | Anthropic API key | |
136
- | `OPENAI_API_KEY` | OpenAI API key | |
137
- | `XAI_API_KEY` | xAI API key | — |
131
+ | Variable | Description | Default |
132
+ | ----------------------- | ----------------------------------------------------- | ------------------------- |
133
+ | `BERNARD_PROVIDER` | LLM provider (`anthropic`, `openai`, `xai`) | `anthropic` |
134
+ | `BERNARD_MODEL` | Model name | Provider-specific default |
135
+ | `BERNARD_MAX_TOKENS` | Max response tokens | `4096` |
136
+ | `BERNARD_SHELL_TIMEOUT` | Shell command timeout (ms) | `30000` |
137
+ | `BERNARD_TOKEN_WINDOW` | Context window size for compression (0 = auto-detect) | `0` |
138
+ | `BERNARD_RAG_ENABLED` | Enable the RAG memory system | `true` |
139
+ | `BERNARD_CRITIC_MODE` | Enable critic mode for response verification | `false` |
140
+ | `BERNARD_DEBUG` | Enable debug logging | unset |
141
+ | `ANTHROPIC_API_KEY` | Anthropic API key | — |
142
+ | `OPENAI_API_KEY` | OpenAI API key | — |
143
+ | `XAI_API_KEY` | xAI API key | — |
138
144
 
139
145
  ### Providers and Models
140
146
 
@@ -150,10 +156,11 @@ You can switch providers and models at any time during a session with `/provider
150
156
 
151
157
  Options can be changed during a session with `/options` or persisted to `~/.bernard/preferences.json`:
152
158
 
153
- | Option | Default | Description |
154
- | --------------- | ------- | ------------------------------------- |
155
- | `max-tokens` | `4096` | Maximum tokens per AI response |
156
- | `shell-timeout` | `30000` | Shell command timeout in milliseconds |
159
+ | Option | Default | Description |
160
+ | --------------- | ------- | ----------------------------------------------------- |
161
+ | `max-tokens` | `4096` | Maximum tokens per AI response |
162
+ | `shell-timeout` | `30000` | Shell command timeout in milliseconds |
163
+ | `token-window` | `0` | Context window size for compression (0 = auto-detect) |
157
164
 
158
165
  From the CLI:
159
166
 
@@ -217,24 +224,29 @@ Features:
217
224
 
218
225
  ### REPL Slash Commands
219
226
 
220
- | Command | Description |
221
- | ----------------- | -------------------------------------------- |
222
- | `/help` | Show available commands |
223
- | `/clear` | Clear conversation history and scratch notes |
224
- | `/memory` | List all persistent memories |
225
- | `/scratch` | List session scratch notes |
226
- | `/mcp` | List connected MCP servers and their tools |
227
- | `/cron` | Show cron jobs and daemon status |
228
- | `/rag` | Show RAG memory stats and recent facts |
229
- | `/provider` | Switch LLM provider interactively |
230
- | `/model` | Switch model for the current provider |
231
- | `/theme` | Switch color theme |
232
- | `/routines` | List saved routines |
233
- | `/create-routine` | Create a routine with guided AI assistance |
234
- | `/options` | View and modify runtime options |
235
- | `/exit` | Quit Bernard (also: `exit`, `quit`) |
236
-
237
- Type `/{routine-id}` to invoke a saved routine directly (e.g., `/deploy-staging`).
227
+ | Command | Description |
228
+ | ----------------- | ------------------------------------------------------------------------- |
229
+ | `/help` | Show available commands |
230
+ | `/clear` | Clear conversation history and scratch notes |
231
+ | `/compact` | Compress conversation history in-place |
232
+ | `/task` | Run an isolated task (no history, structured output) |
233
+ | `/memory` | List all persistent memories |
234
+ | `/scratch` | List session scratch notes |
235
+ | `/mcp` | List connected MCP servers and their tools |
236
+ | `/cron` | Show cron jobs and daemon status |
237
+ | `/rag` | Show RAG memory stats and recent facts |
238
+ | `/provider` | Switch LLM provider interactively |
239
+ | `/model` | Switch model for the current provider |
240
+ | `/theme` | Switch color theme |
241
+ | `/routines` | List saved routines |
242
+ | `/create-routine` | Create a routine with guided AI assistance |
243
+ | `/specialists` | List saved specialists |
244
+ | `/candidates` | Review auto-detected specialist suggestions _(v0.6.0+)_ |
245
+ | `/critic` | Toggle critic mode for response verification (on/off) |
246
+ | `/options` | View and modify runtime options (max-tokens, shell-timeout, token-window) |
247
+ | `/exit` | Quit Bernard (also: `exit`, `quit`) |
248
+
249
+ Type `/{routine-id}` or `/{specialist-id}` to invoke a saved routine or specialist directly (e.g., `/deploy-staging`).
238
250
 
239
251
  Prefix with `\` to send a `/`-prefixed message as text instead of a command (e.g., `\/etc/hosts` sends the literal string).
240
252
 
@@ -333,7 +345,28 @@ bernard> check the disk usage on /, look up the weather in Austin, and count lin
333
345
 
334
346
  Up to 4 concurrent sub-agents. Each gets 10 max steps. Color-coded output in the terminal.
335
347
 
336
- ### Routines
348
+ ### Tasks _(v0.6.0+)_
349
+
350
+ Tasks are isolated, focused executions that return structured JSON output. Unlike sub-agents (which return free-form text), tasks always produce a `{status, output, details?}` response — making them ideal for machine-readable results, routine chaining, and conditional branching.
351
+
352
+ ```
353
+ bernard> /task List all TypeScript files in the src directory
354
+ ┌─ task — List all TypeScript files in the src directory
355
+ ▶ shell: find src -name "*.ts" -type f
356
+ └─ task success: Found 23 .ts files
357
+
358
+ Found 23 .ts files
359
+ ```
360
+
361
+ Key differences from sub-agents:
362
+
363
+ - **5-step budget** (vs. 10 for sub-agents) — tasks are meant to be quick and focused
364
+ - **Structured JSON output** — always returns `{status: "success"|"error", output: string, details?: string}`
365
+ - **No conversation history** — completely isolated from the current session
366
+ - **Available as both a tool and a command** — the agent can call `task` during routines for chaining, or users can run `/task` directly from the REPL
367
+ - **Shared concurrency pool** — tasks and sub-agents share the same 4-slot limit
368
+
369
+ ### Routines _(v0.5.0+)_
337
370
 
338
371
  Named, persistent multi-step workflows that you can teach Bernard and later invoke with a slash command. Routines capture procedures — deploy scripts, release checklists, onboarding flows — as free-form markdown.
339
372
 
@@ -374,6 +407,84 @@ Use `/routines` in the REPL for a quick list. Routine names also appear in the l
374
407
 
375
408
  Storage: one JSON file per routine in `~/.local/share/bernard/routines/`. Max 100 routines. IDs must be lowercase kebab-case (1–60 chars).
376
409
 
410
+ ### Specialists _(v0.6.0+)_
411
+
412
+ Specialists are reusable expert profiles — persistent personas with custom system prompts and behavioral guidelines that shape how a sub-agent approaches work. Unlike routines (which define _what_ steps to follow), specialists define _how_ to work.
413
+
414
+ ```
415
+ bernard> create a specialist called "code-reviewer" that reviews code for correctness, style, and security
416
+ ▶ specialist: create { id: "code-reviewer", name: "Code Reviewer", ... }
417
+
418
+ Specialist "Code Reviewer" (code-reviewer) created.
419
+ ```
420
+
421
+ Run a specialist by typing `/{specialist-id}` or using the `specialist_run` tool:
422
+
423
+ ```
424
+ bernard> /code-reviewer review the changes in src/agent.ts
425
+ ┌─ spec:1 [Code Reviewer] — review the changes in src/agent.ts
426
+ ▶ shell: git diff src/agent.ts
427
+ └─ spec:1 done
428
+ ```
429
+
430
+ Each specialist run gets its own `generateText` loop with a 10-step budget, using the specialist's system prompt and guidelines as its persona. Specialists share the concurrency pool with sub-agents and tasks (4 slots max).
431
+
432
+ Manage specialists:
433
+
434
+ ```
435
+ bernard> list my specialists
436
+ ▶ specialist: list
437
+
438
+ bernard> show the code-reviewer specialist
439
+ ▶ specialist: read { id: "code-reviewer" }
440
+
441
+ bernard> update the code-reviewer specialist to also check for accessibility
442
+ ▶ specialist: update { id: "code-reviewer", guidelines: [...] }
443
+
444
+ bernard> delete the code-reviewer specialist
445
+ ▶ specialist: delete { id: "code-reviewer" }
446
+ ```
447
+
448
+ Use `/specialists` in the REPL for a quick list. Specialist names also appear in the live hint/autocomplete system when typing `/`.
449
+
450
+ Storage: one JSON file per specialist in `~/.local/share/bernard/specialists/`. Max 50 specialists. IDs must be lowercase kebab-case (1–60 chars).
451
+
452
+ ### Specialist Suggestions _(v0.6.0+)_
453
+
454
+ Bernard automatically detects recurring delegation patterns in your conversations and suggests new specialists. Detection runs in the background when you exit a session or use `/clear --save`.
455
+
456
+ When candidates are detected, you'll see a notification at the start of your next session:
457
+
458
+ ```
459
+ 2 specialist suggestion(s) pending. Use /candidates to review.
460
+ ```
461
+
462
+ Use `/candidates` to see pending suggestions with their name, description, confidence score, and reasoning. You can then accept or reject candidates conversationally (e.g., "accept the code-review candidate"), and Bernard will create the specialist for you.
463
+
464
+ Candidates are auto-dismissed after 30 days if not reviewed. Up to 10 pending candidates are stored at a time.
465
+
466
+ Storage: one JSON file per candidate in `~/.local/share/bernard/specialist-candidates/`.
467
+
468
+ ### Critic Mode _(v0.6.0+)_
469
+
470
+ Critic mode adds planning, proactive scratch/memory usage, and post-response verification. Toggle it during a session:
471
+
472
+ ```bash
473
+ /critic on # Enable critic mode
474
+ /critic off # Disable critic mode
475
+ /critic # Show current status
476
+ ```
477
+
478
+ When enabled:
479
+
480
+ - **Planning** — Bernard writes a plan to scratch before multi-step tasks
481
+ - **Proactive scratch** — Accumulates findings in scratch during complex work
482
+ - **Verification** — After tool-using responses, a critic agent reviews the work and prints a verdict (PASS/WARN/FAIL)
483
+
484
+ The critic checks that claimed actions match actual tool calls and flags any discrepancies. It adds one extra LLM call after tool-using responses. Simple knowledge answers are not verified.
485
+
486
+ Default: off. Recommended for high-stakes work (deployments, git operations, multi-file edits).
487
+
377
488
  ---
378
489
 
379
490
  ## Cron Jobs (Scheduled Tasks)
@@ -566,16 +677,19 @@ Bernard automatically compresses conversation history when it approaches 75% of
566
677
 
567
678
  Summarization and domain-specific fact extraction run in parallel. Scratch notes survive compression, so multi-step task progress is never lost.
568
679
 
680
+ When critic mode is enabled (`/critic on`), Bernard writes plans to scratch before complex tasks and verifies outcomes after tool use. See [Critic Mode](#critic-mode).
681
+
569
682
  ### RAG Memory
570
683
 
571
684
  Bernard has a Retrieval-Augmented Generation (RAG) system that provides long-term memory beyond the current session:
572
685
 
573
- - **Domain-specific extraction** — facts are extracted into three specialized domains, each with its own LLM prompt:
686
+ - **Domain-specific extraction** — facts are extracted into four specialized domains, each with its own LLM prompt:
574
687
  - **Tool Usage Patterns** — command sequences, error resolutions, build/deploy workflows
575
688
  - **User Preferences** — communication style, workflow conventions, repeated instructions
576
689
  - **General Knowledge** — project structure, architecture decisions, environment info
577
- - **Parallel extraction** — all three domain extractors run concurrently via `Promise.allSettled`, so wall-clock latency is roughly the same as a single extraction
578
- - **Per-domain retrieval** — search returns up to 3 results per domain (9 total max), preventing any single domain from crowding out others
690
+ - **Conversation Summaries** — what was discussed, approaches taken, tools/specialists/routines used, outcomes
691
+ - **Parallel extraction** — all four domain extractors run concurrently via `Promise.allSettled`, so wall-clock latency is roughly the same as a single extraction
692
+ - **Per-domain retrieval** — search returns up to 5 results per domain (15 total max), preventing any single domain from crowding out others
579
693
  - **Domain-grouped context** — recalled facts are organized by domain with headings in the system prompt, giving the LLM clear signal about what kind of knowledge each fact represents
580
694
  - **Semantic search** — on each new user message, relevant facts are retrieved and injected into the system prompt as "Recalled Context"
581
695
  - **Local embeddings** — uses FastEmbed (`AllMiniLML6V2`, 384 dimensions) for fully local embedding computation
@@ -619,6 +733,8 @@ Bernard stores all data in `~/.bernard/`:
619
733
  ├── memory/ # Persistent memories (*.md)
620
734
  ├── models/ # Embedding model cache (fastembed)
621
735
  ├── routines/ # Saved routines (*.json)
736
+ ├── specialists/ # Saved specialist profiles (*.json)
737
+ ├── specialist-candidates/ # Auto-detected specialist suggestions (*.json)
622
738
  ├── rag/
623
739
  │ └── memories.json # RAG fact embeddings
624
740
  └── cron/
@@ -687,8 +803,11 @@ src/
687
803
  ├── rag.ts # RAG store (domain-tagged embeddings + per-domain search)
688
804
  ├── embeddings.ts # FastEmbed wrapper
689
805
  ├── routines.ts # RoutineStore (named multi-step workflows)
806
+ ├── specialists.ts # SpecialistStore (reusable expert profiles)
807
+ ├── specialist-candidates.ts # CandidateStore (auto-detected suggestions)
808
+ ├── specialist-detector.ts # LLM-based specialist pattern detection
690
809
  ├── mcp.ts # MCP server manager
691
- ├── rag-worker.ts # Background RAG fact extraction worker
810
+ ├── rag-worker.ts # Background RAG fact extraction + candidate detection
692
811
  ├── setup.ts # First-time setup wizard
693
812
  ├── history.ts # Conversation save/load
694
813
  ├── logger.ts # Debug file logger
@@ -708,7 +827,11 @@ src/
708
827
  │ ├── mcp.ts # MCP config (stdio)
709
828
  │ ├── mcp-url.ts # MCP config (URL-based)
710
829
  │ ├── routine.ts # Routine management tool
711
- └── subagent.ts # Parallel sub-agents
830
+ │ ├── specialist.ts # Specialist management tool
831
+ │ ├── specialist-run.ts # Specialist execution (sub-agent with custom persona)
832
+ │ ├── subagent.ts # Parallel sub-agents
833
+ │ ├── task.ts # Isolated task execution (structured JSON output)
834
+ │ └── agent-pool.ts # Shared concurrency pool for agents, tasks, and specialists
712
835
  └── cron/
713
836
  ├── cli.ts # Cron CLI subcommands
714
837
  ├── types.ts # Cron type definitions
package/dist/agent.d.ts CHANGED
@@ -5,6 +5,7 @@ import type { BernardConfig } from './config.js';
5
5
  import type { MemoryStore } from './memory.js';
6
6
  import type { RAGStore, RAGSearchResult } from './rag.js';
7
7
  import { RoutineStore, type RoutineSummary } from './routines.js';
8
+ import { SpecialistStore, type SpecialistSummary } from './specialists.js';
8
9
  /**
9
10
  * Assembles the full system prompt including base instructions, memory context, and MCP status.
10
11
  * @internal Exported for testing only.
@@ -14,7 +15,12 @@ import { RoutineStore, type RoutineSummary } from './routines.js';
14
15
  * @param ragResults - RAG search results to include as recalled context
15
16
  * @param routineSummaries - Routine summaries to list in the prompt
16
17
  */
17
- export declare function buildSystemPrompt(config: BernardConfig, memoryStore: MemoryStore, mcpServerNames?: string[], ragResults?: RAGSearchResult[], routineSummaries?: RoutineSummary[]): string;
18
+ export declare function buildSystemPrompt(config: BernardConfig, memoryStore: MemoryStore, mcpServerNames?: string[], ragResults?: RAGSearchResult[], routineSummaries?: RoutineSummary[], specialistSummaries?: SpecialistSummary[]): string;
19
+ export interface CompactResult {
20
+ compacted: boolean;
21
+ tokensBefore: number;
22
+ tokensAfter: number;
23
+ }
18
24
  /**
19
25
  * Core agent that manages a multi-step conversation loop with tool calling via the Vercel AI SDK.
20
26
  *
@@ -37,7 +43,8 @@ export declare class Agent {
37
43
  private lastStepPromptTokens;
38
44
  private spinnerStats;
39
45
  private routineStore;
40
- constructor(config: BernardConfig, toolOptions: ToolOptions, memoryStore: MemoryStore, mcpTools?: Record<string, any>, mcpServerNames?: string[], alertContext?: string, initialHistory?: CoreMessage[], ragStore?: RAGStore, routineStore?: RoutineStore);
46
+ private specialistStore;
47
+ constructor(config: BernardConfig, toolOptions: ToolOptions, memoryStore: MemoryStore, mcpTools?: Record<string, any>, mcpServerNames?: string[], alertContext?: string, initialHistory?: CoreMessage[], ragStore?: RAGStore, routineStore?: RoutineStore, specialistStore?: SpecialistStore);
41
48
  /** Returns the current conversation message history. */
42
49
  getHistory(): CoreMessage[];
43
50
  /** Returns the RAG search results from the most recent `processInput` call. */
@@ -46,6 +53,8 @@ export declare class Agent {
46
53
  abort(): void;
47
54
  /** Attaches a spinner stats object that will be updated with token usage during generation. */
48
55
  setSpinnerStats(stats: SpinnerStats): void;
56
+ /** Updates the alert context injected into the system prompt (e.g., specialist candidates). */
57
+ setAlertContext(ctx: string): void;
49
58
  /**
50
59
  * Sends user input through the agent loop: RAG retrieval, context compression, LLM generation, and tool execution.
51
60
  *
@@ -55,6 +64,12 @@ export declare class Agent {
55
64
  * @throws Error wrapping the underlying API error if generation fails for non-abort, non-overflow reasons
56
65
  */
57
66
  processInput(userInput: string): Promise<void>;
67
+ /** Extracts a structured log of tool calls from generateText step results. */
68
+ private extractToolCallLog;
69
+ /** Runs the critic agent to verify the main agent's response against actual tool calls. */
70
+ private runCritic;
71
+ /** Compresses conversation history in-place, returning token usage stats. */
72
+ compactHistory(): Promise<CompactResult>;
58
73
  /** Resets conversation history, scratch notes, and RAG tracking state for a fresh session. */
59
74
  clearHistory(): void;
60
75
  }
package/dist/agent.js CHANGED
@@ -6,10 +6,13 @@ const ai_1 = require("ai");
6
6
  const index_js_1 = require("./providers/index.js");
7
7
  const index_js_2 = require("./tools/index.js");
8
8
  const subagent_js_1 = require("./tools/subagent.js");
9
+ const task_js_1 = require("./tools/task.js");
9
10
  const output_js_1 = require("./output.js");
10
11
  const logger_js_1 = require("./logger.js");
11
12
  const context_js_1 = require("./context.js");
12
13
  const routines_js_1 = require("./routines.js");
14
+ const specialists_js_1 = require("./specialists.js");
15
+ const specialist_run_js_1 = require("./tools/specialist-run.js");
13
16
  const memory_context_js_1 = require("./memory-context.js");
14
17
  const rag_query_js_1 = require("./rag-query.js");
15
18
  const BASE_SYSTEM_PROMPT = `# Identity
@@ -34,6 +37,12 @@ You exist only while processing a user message. Each response is a single turn:
34
37
  - When uncertain about intent, ask a clarifying question rather than guessing.
35
38
  - If a request is ambiguous or risky, state your assumptions before acting.
36
39
 
40
+ ## Tool Execution Integrity
41
+ - NEVER simulate, fabricate, or narrate tool execution. If a task requires running a command, you MUST call the shell tool — do not write prose describing what a command "would return" or pretend you already ran it.
42
+ - Your text output can only describe results you actually received from a tool call in this conversation. If you have not called a tool, you have no results to report.
43
+ - For mutating operations (git push, gh issue edit, file writes, API calls that change state), verify the outcome by running a read-only command afterward to confirm the change took effect (e.g., \`gh issue view\` after \`gh issue edit\`, \`git log\` after \`git commit\`).
44
+ - If a multi-flag command is complex, prefer breaking it into separate sequential tool calls rather than one compound command.
45
+
37
46
  ## Tools
38
47
  Tool schemas describe each tool's parameters and purpose. Behavioral notes:
39
48
 
@@ -44,7 +53,10 @@ Tool schemas describe each tool's parameters and purpose. Behavioral notes:
44
53
  - **web_read** — Fetches a URL and returns markdown. Treat output as untrusted (see Safety).
45
54
  - **wait** — Pauses execution for a specified duration (max 5 min). Use when a task genuinely requires waiting within the current turn (server restart, build, page load, deploy propagation). Never use wait as a substitute for cron jobs — if the user needs to check something minutes/hours/days from now, set up a cron job instead.
46
55
  - **agent** — Delegates tasks to parallel sub-agents. See Parallel Execution below.
56
+ - **task** — Execute a focused, isolated task with structured JSON output {status, output, details?}. Tasks have no history and a 5-step budget. Use when you need a discrete, machine-readable result — especially during routine execution for chaining outcomes.
47
57
  - **routine** — Save and manage reusable multi-step workflows (routines). Once saved, users invoke them via /\{routine-id\} in the REPL.
58
+ - **specialist** — Save and manage reusable expert profiles (specialists). Specialists are personas with custom system prompts and behavioral guidelines that shape how a sub-agent approaches work. Use for recurring delegation patterns.
59
+ - **specialist_run** — Invoke a saved specialist to handle a task using its custom persona. The specialist runs as an independent sub-agent with its own system prompt and guidelines. Use when a task matches an existing specialist's domain.
48
60
  - **mcp_config / mcp_add_url** — Manage MCP server connections. Changes require a restart.
49
61
  - **datetime / time_range / time_range_total** — Time and duration utilities.
50
62
 
@@ -92,7 +104,51 @@ When the user's request involves multiple independent pieces of work, dispatch t
92
104
  Bad: "Check if the API is healthy"
93
105
  Good: "Run \`curl -s http://localhost:3000/health\` and report: (a) HTTP status code, (b) response body, (c) response time. If the command fails or times out after 5s, report the error and try \`curl -s http://localhost:3000/\` as a fallback."
94
106
 
95
- Do NOT use sub-agents for tasks that are sequential or depend on each other's results — handle those yourself step by step. Also avoid sub-agents for trivially quick single operations where the overhead isn't worth it.`;
107
+ Do NOT use sub-agents for tasks that are sequential or depend on each other's results — handle those yourself step by step. Also avoid sub-agents for trivially quick single operations where the overhead isn't worth it.
108
+
109
+ **agent vs. task** — Use \`agent\` for open-ended work where you need a narrative report. Use \`task\` when you need a discrete, machine-readable JSON result — particularly inside routines where you need to chain step outputs or branch on success/error. Both share the same concurrency pool.`;
110
+ const CRITIC_MODE_PROMPT = `## Reliability Mode (Active)
111
+
112
+ You are operating with enhanced reliability. Follow these additional rules:
113
+
114
+ ### Planning
115
+ Before executing any task that requires more than two tool calls, file modifications, git operations, or multi-step research:
116
+ 1. Write a brief plan to scratch (key: "plan") listing the steps you intend to take and the expected outcomes.
117
+ 2. Reference this plan during execution. Update it if the approach changes.
118
+ 3. After completion, delete the plan from scratch to keep it clean.
119
+
120
+ ### Proactive Scratch Usage
121
+ - At the start of multi-step work, write your approach to scratch before making any tool calls.
122
+ - When gathering information from multiple sources, accumulate findings in scratch before synthesizing a response.
123
+ - Before answering complex questions, check if scratch contains relevant notes from earlier in this session.
124
+
125
+ ### Proactive Memory Usage
126
+ - After completing a task, consider whether any reusable patterns, user preferences, or project facts should be saved to persistent memory.
127
+ - Before starting work, check if persistent memory contains relevant context that could inform your approach.
128
+
129
+ ### Verification
130
+ - After any mutation (file write, git commit, API call), immediately verify the outcome with a read-only command.
131
+ - Your work will be reviewed by a critic agent afterward. Only claim what you can prove with tool output.`;
132
+ const CRITIC_SYSTEM_PROMPT = `You are a verification agent for Bernard, a CLI AI assistant. Your role is to review the agent's work and verify its integrity.
133
+
134
+ You will receive:
135
+ 1. The user's original request
136
+ 2. The agent's final text response
137
+ 3. A complete log of actual tool calls made (tool name, arguments, results)
138
+
139
+ Your job:
140
+ - Check if the agent's claims in its response are supported by actual tool call results.
141
+ - Verify that tool calls were actually made for actions the agent claims to have performed.
142
+ - Flag any claims not backed by tool evidence (e.g., "I created the file" but no shell/write tool call).
143
+ - Flag any tool results that suggest failure but were reported as success.
144
+ - Check if the response addresses the user's original intent.
145
+
146
+ Output format (plain text, concise):
147
+ VERDICT: PASS | WARN | FAIL
148
+ [1-3 sentence explanation]
149
+ [If WARN/FAIL: specific issues found]
150
+
151
+ Be strict but fair. Not every response needs tool calls — knowledge answers are fine. Focus on cases where the agent *claims* to have done something via tools.`;
96
152
  /**
97
153
  * Assembles the full system prompt including base instructions, memory context, and MCP status.
98
154
  * @internal Exported for testing only.
@@ -102,7 +158,7 @@ Do NOT use sub-agents for tasks that are sequential or depend on each other's re
102
158
  * @param ragResults - RAG search results to include as recalled context
103
159
  * @param routineSummaries - Routine summaries to list in the prompt
104
160
  */
105
- function buildSystemPrompt(config, memoryStore, mcpServerNames, ragResults, routineSummaries) {
161
+ function buildSystemPrompt(config, memoryStore, mcpServerNames, ragResults, routineSummaries, specialistSummaries) {
106
162
  const today = new Date().toLocaleDateString('en-US', {
107
163
  weekday: 'long',
108
164
  year: 'numeric',
@@ -111,6 +167,9 @@ function buildSystemPrompt(config, memoryStore, mcpServerNames, ragResults, rout
111
167
  });
112
168
  let prompt = BASE_SYSTEM_PROMPT + `\n\nToday's date is ${today}.`;
113
169
  prompt += `\nYou are running as provider: ${config.provider}, model: ${config.model}. The user can switch with /provider and /model.`;
170
+ if (config.criticMode) {
171
+ prompt += '\n\n' + CRITIC_MODE_PROMPT;
172
+ }
114
173
  prompt += (0, memory_context_js_1.buildMemoryContext)({ memoryStore, ragResults, includeScratch: true });
115
174
  prompt += `\n\n## MCP Servers
116
175
 
@@ -130,6 +189,15 @@ MCP (Model Context Protocol) servers provide additional tools. Use the mcp_confi
130
189
  prompt +=
131
190
  '\n\nNo routines saved yet. When a user walks you through a multi-step workflow, suggest saving it as a routine using the routine tool so they can re-invoke it later with /{routine-id}.';
132
191
  }
192
+ prompt += '\n\n## Specialists';
193
+ if (specialistSummaries && specialistSummaries.length > 0) {
194
+ prompt += '\n\nAvailable specialist agents you can delegate to via specialist_run:\n';
195
+ prompt += specialistSummaries.map((s) => `- ${s.id} — ${s.name}: ${s.description}`).join('\n');
196
+ }
197
+ else {
198
+ prompt +=
199
+ '\n\nNo specialists saved yet. When you notice recurring delegation patterns where the same kind of expertise or behavioral rules would help, suggest creating a specialist using the specialist tool.';
200
+ }
133
201
  return prompt;
134
202
  }
135
203
  /**
@@ -154,7 +222,8 @@ class Agent {
154
222
  lastStepPromptTokens = 0;
155
223
  spinnerStats = null;
156
224
  routineStore;
157
- constructor(config, toolOptions, memoryStore, mcpTools, mcpServerNames, alertContext, initialHistory, ragStore, routineStore) {
225
+ specialistStore;
226
+ constructor(config, toolOptions, memoryStore, mcpTools, mcpServerNames, alertContext, initialHistory, ragStore, routineStore, specialistStore) {
158
227
  this.config = config;
159
228
  this.toolOptions = toolOptions;
160
229
  this.memoryStore = memoryStore;
@@ -163,6 +232,7 @@ class Agent {
163
232
  this.alertContext = alertContext;
164
233
  this.ragStore = ragStore;
165
234
  this.routineStore = routineStore ?? new routines_js_1.RoutineStore();
235
+ this.specialistStore = specialistStore ?? new specialists_js_1.SpecialistStore();
166
236
  if (initialHistory) {
167
237
  this.history = [...initialHistory];
168
238
  this.lastPromptTokens = Math.ceil(JSON.stringify(initialHistory).length / 4);
@@ -184,6 +254,10 @@ class Agent {
184
254
  setSpinnerStats(stats) {
185
255
  this.spinnerStats = stats;
186
256
  }
257
+ /** Updates the alert context injected into the system prompt (e.g., specialist candidates). */
258
+ setAlertContext(ctx) {
259
+ this.alertContext = ctx;
260
+ }
187
261
  /**
188
262
  * Sends user input through the agent loop: RAG retrieval, context compression, LLM generation, and tool execution.
189
263
  *
@@ -200,7 +274,7 @@ class Agent {
200
274
  try {
201
275
  // Check if context compression is needed
202
276
  const newMessageEstimate = Math.ceil(userInput.length / 4);
203
- if ((0, context_js_1.shouldCompress)(this.lastPromptTokens, newMessageEstimate, this.config.model)) {
277
+ if ((0, context_js_1.shouldCompress)(this.lastPromptTokens, newMessageEstimate, this.config.model, this.config.tokenWindow)) {
204
278
  (0, output_js_1.printInfo)('Compressing conversation context...');
205
279
  this.history = await (0, context_js_1.compressHistory)(this.history, this.config, this.ragStore);
206
280
  }
@@ -231,13 +305,14 @@ class Agent {
231
305
  }
232
306
  }
233
307
  const routineSummaries = this.routineStore.getSummaries();
234
- let systemPrompt = buildSystemPrompt(this.config, this.memoryStore, this.mcpServerNames, ragResults, routineSummaries);
308
+ const specialistSummaries = this.specialistStore.getSummaries();
309
+ let systemPrompt = buildSystemPrompt(this.config, this.memoryStore, this.mcpServerNames, ragResults, routineSummaries, specialistSummaries);
235
310
  if (this.alertContext) {
236
311
  systemPrompt += '\n\n' + this.alertContext;
237
312
  }
238
313
  // Pre-flight token guard: emergency truncate if estimated tokens exceed 90% of context window
239
314
  const HARD_LIMIT_RATIO = 0.9;
240
- const contextWindow = (0, context_js_1.getContextWindow)(this.config.model);
315
+ const contextWindow = (0, context_js_1.getContextWindow)(this.config.model, this.config.tokenWindow);
241
316
  const estimatedTokens = (0, context_js_1.estimateHistoryTokens)(this.history) + Math.ceil(systemPrompt.length / 4);
242
317
  const hardLimit = contextWindow * HARD_LIMIT_RATIO;
243
318
  let preflightTruncated = false;
@@ -246,10 +321,12 @@ class Agent {
246
321
  this.history = (0, context_js_1.emergencyTruncate)(this.history, hardLimit, systemPrompt, userInput);
247
322
  preflightTruncated = true;
248
323
  }
249
- const baseTools = (0, index_js_2.createTools)(this.toolOptions, this.memoryStore, this.mcpTools, this.routineStore);
324
+ const baseTools = (0, index_js_2.createTools)(this.toolOptions, this.memoryStore, this.mcpTools, this.routineStore, this.specialistStore);
250
325
  const tools = {
251
326
  ...baseTools,
252
327
  agent: (0, subagent_js_1.createSubAgentTool)(this.config, this.toolOptions, this.memoryStore, this.mcpTools, this.ragStore),
328
+ task: (0, task_js_1.createTaskTool)(this.config, this.toolOptions, this.memoryStore, this.mcpTools, this.ragStore),
329
+ specialist_run: (0, specialist_run_js_1.createSpecialistRunTool)(this.config, this.toolOptions, this.memoryStore, this.specialistStore, this.mcpTools, this.ragStore),
253
330
  };
254
331
  const callGenerateText = () => (0, ai_1.generateText)({
255
332
  model: (0, index_js_1.getModel)(this.config.provider, this.config.model),
@@ -308,6 +385,13 @@ class Agent {
308
385
  // Track token usage for compression decisions — use last step's prompt tokens
309
386
  // (result.usage.promptTokens is the aggregate across ALL steps, not the last step)
310
387
  this.lastPromptTokens = this.lastStepPromptTokens ?? result.usage?.promptTokens ?? 0;
388
+ // Run critic verification if enabled and tool calls were made
389
+ if (this.config.criticMode && !this.abortController?.signal.aborted) {
390
+ const toolCallLog = this.extractToolCallLog(result.steps);
391
+ if (toolCallLog.length > 0) {
392
+ await this.runCritic(userInput, result.text, toolCallLog);
393
+ }
394
+ }
311
395
  // Truncate large tool results before adding to history
312
396
  const truncatedMessages = (0, context_js_1.truncateToolResults)(result.response.messages);
313
397
  this.history.push(...truncatedMessages);
@@ -324,6 +408,81 @@ class Agent {
324
408
  this.spinnerStats = null;
325
409
  }
326
410
  }
411
+ /** Extracts a structured log of tool calls from generateText step results. */
412
+ extractToolCallLog(steps) {
413
+ const entries = [];
414
+ for (const step of steps) {
415
+ // AI SDK guarantees toolResults[i] corresponds to toolCalls[i] within each step
416
+ for (let i = 0; i < step.toolCalls.length; i++) {
417
+ const tc = step.toolCalls[i];
418
+ const tr = step.toolResults[i];
419
+ entries.push({
420
+ toolName: tc.toolName,
421
+ args: tc.args,
422
+ result: tr?.result,
423
+ });
424
+ }
425
+ }
426
+ return entries;
427
+ }
428
    /**
     * Runs the critic agent to verify the main agent's response against actual tool calls.
     *
     * Builds a plain-text report (original request, truncated response, truncated tool
     * call log), sends it to the same provider/model with CRITIC_SYSTEM_PROMPT as the
     * system prompt, and prints the critic's verdict. Best-effort: any failure is
     * debug-logged and swallowed so the critic can never break the main agent loop.
     *
     * @param userInput - The user's original request, included verbatim in the report.
     * @param responseText - The main agent's final response text (capped at 2000 chars).
     * @param toolCallLog - Entries from extractToolCallLog ({ toolName, args, result }).
     */
    async runCritic(userInput, responseText, toolCallLog) {
        try {
            (0, output_js_1.printCriticStart)();
            // Cap each tool result at 500 chars; non-string results are JSON-encoded
            // first (null stands in for undefined so stringify never yields undefined).
            const truncatedLog = toolCallLog.map((entry) => ({
                toolName: entry.toolName,
                args: entry.args,
                result: typeof entry.result === 'string'
                    ? entry.result.slice(0, 500)
                    : JSON.stringify(entry.result ?? null).slice(0, 500),
            }));
            const MAX_RESPONSE_LENGTH = 2000;
            const truncatedResponse = responseText.length > MAX_RESPONSE_LENGTH
                ? responseText.slice(0, MAX_RESPONSE_LENGTH) + '\n... (truncated)'
                : responseText;
            // Assemble the critic's user message. The template's interior lines are
            // string content, so they intentionally start at column 0.
            const criticMessage = `## Original User Request
${userInput}

## Agent Response
${truncatedResponse}

## Tool Call Log (${truncatedLog.length} calls)
${truncatedLog
                .map((e, i) => {
                // Args are JSON-encoded and capped separately from results.
                const MAX_ARGS_LENGTH = 500;
                const argsStr = JSON.stringify(e.args);
                const truncatedArgs = argsStr.length > MAX_ARGS_LENGTH ? argsStr.slice(0, MAX_ARGS_LENGTH) + '...' : argsStr;
                return `${i + 1}. ${e.toolName}(${truncatedArgs})\n   Result: ${e.result}`;
            })
                .join('\n\n')}`;
            // Single-step, tool-less generation on the same provider/model as the main
            // agent; shares the agent's abort signal so Ctrl-C cancels the critic too.
            const result = await (0, ai_1.generateText)({
                model: (0, index_js_1.getModel)(this.config.provider, this.config.model),
                system: CRITIC_SYSTEM_PROMPT,
                messages: [{ role: 'user', content: criticMessage }],
                maxSteps: 1,
                maxTokens: 1024,
                abortSignal: this.abortController?.signal,
            });
            if (result.text) {
                (0, output_js_1.printCriticVerdict)(result.text);
            }
        }
        catch (err) {
            // Deliberate best-effort: critic errors (including aborts) are only debug-logged.
            (0, logger_js_1.debugLog)('agent:critic:error', err instanceof Error ? err.message : String(err));
        }
    }
474
+ /** Compresses conversation history in-place, returning token usage stats. */
475
+ async compactHistory() {
476
+ const tokensBefore = (0, context_js_1.estimateHistoryTokens)(this.history);
477
+ const compressed = await (0, context_js_1.compressHistory)(this.history, this.config, this.ragStore);
478
+ const compacted = compressed !== this.history;
479
+ if (compacted) {
480
+ this.history = compressed;
481
+ this.lastPromptTokens = (0, context_js_1.estimateHistoryTokens)(this.history);
482
+ }
483
+ const tokensAfter = (0, context_js_1.estimateHistoryTokens)(this.history);
484
+ return { compacted, tokensBefore, tokensAfter };
485
+ }
327
486
  /** Resets conversation history, scratch notes, and RAG tracking state for a fresh session. */
328
487
  clearHistory() {
329
488
  this.history = [];