bernard-agent 0.7.0 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +96 -50
- package/dist/agent.d.ts +7 -4
- package/dist/agent.js +88 -127
- package/dist/agent.js.map +1 -1
- package/dist/config.d.ts +13 -1
- package/dist/config.js +66 -4
- package/dist/config.js.map +1 -1
- package/dist/critic.d.ts +32 -0
- package/dist/critic.js +118 -0
- package/dist/critic.js.map +1 -0
- package/dist/cron/runner.js +54 -34
- package/dist/cron/runner.js.map +1 -1
- package/dist/output.d.ts +6 -4
- package/dist/output.js +31 -10
- package/dist/output.js.map +1 -1
- package/dist/overlap-checker.d.ts +61 -0
- package/dist/overlap-checker.js +106 -0
- package/dist/overlap-checker.js.map +1 -0
- package/dist/pac.d.ts +37 -0
- package/dist/pac.js +69 -0
- package/dist/pac.js.map +1 -0
- package/dist/rag-query.js +2 -1
- package/dist/rag-query.js.map +1 -1
- package/dist/rag-worker.js +4 -3
- package/dist/rag-worker.js.map +1 -1
- package/dist/repl.js +160 -4
- package/dist/repl.js.map +1 -1
- package/dist/specialist-candidates.d.ts +13 -0
- package/dist/specialist-candidates.js.map +1 -1
- package/dist/specialist-detector.d.ts +20 -5
- package/dist/specialist-detector.js +72 -9
- package/dist/specialist-detector.js.map +1 -1
- package/dist/specialist-matcher.d.ts +5 -0
- package/dist/specialist-matcher.js +1 -0
- package/dist/specialist-matcher.js.map +1 -1
- package/dist/tools/datetime.d.ts +6 -0
- package/dist/tools/datetime.js +38 -1
- package/dist/tools/datetime.js.map +1 -1
- package/dist/tools/mcp-url.js +1 -1
- package/dist/tools/mcp-url.js.map +1 -1
- package/dist/tools/specialist-run.js +37 -12
- package/dist/tools/specialist-run.js.map +1 -1
- package/dist/tools/subagent.js +37 -12
- package/dist/tools/subagent.js.map +1 -1
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -129,19 +129,22 @@ bernard providers
|
|
|
129
129
|
|
|
130
130
|
Bernard loads `.env` from the current directory first, then falls back to `~/.bernard/.env`.
|
|
131
131
|
|
|
132
|
-
| Variable
|
|
133
|
-
|
|
|
134
|
-
| `BERNARD_PROVIDER`
|
|
135
|
-
| `BERNARD_MODEL`
|
|
136
|
-
| `BERNARD_MAX_TOKENS`
|
|
137
|
-
| `BERNARD_SHELL_TIMEOUT`
|
|
138
|
-
| `BERNARD_TOKEN_WINDOW`
|
|
139
|
-
| `
|
|
140
|
-
| `
|
|
141
|
-
| `
|
|
142
|
-
| `
|
|
143
|
-
| `
|
|
144
|
-
| `
|
|
132
|
+
| Variable | Description | Default |
|
|
133
|
+
| --------------------------------- | -------------------------------------------------------- | ------------------------- |
|
|
134
|
+
| `BERNARD_PROVIDER` | LLM provider (`anthropic`, `openai`, `xai`) | `anthropic` |
|
|
135
|
+
| `BERNARD_MODEL` | Model name | Provider-specific default |
|
|
136
|
+
| `BERNARD_MAX_TOKENS` | Max response tokens | `4096` |
|
|
137
|
+
| `BERNARD_SHELL_TIMEOUT` | Shell command timeout (ms) | `30000` |
|
|
138
|
+
| `BERNARD_TOKEN_WINDOW` | Context window size for compression (0 = auto-detect) | `0` |
|
|
139
|
+
| `BERNARD_MAX_STEPS` | Max agent loop iterations per request | `25` |
|
|
140
|
+
| `BERNARD_RAG_ENABLED` | Enable the RAG memory system | `true` |
|
|
141
|
+
| `BERNARD_CRITIC_MODE` | Enable critic mode for response verification | `false` |
|
|
142
|
+
| `BERNARD_AUTO_CREATE_SPECIALISTS` | Auto-create specialists above confidence threshold | `false` |
|
|
143
|
+
| `BERNARD_AUTO_CREATE_THRESHOLD` | Confidence threshold for auto-creating specialists (0-1) | `0.8` |
|
|
144
|
+
| `BERNARD_DEBUG` | Enable debug logging | unset |
|
|
145
|
+
| `ANTHROPIC_API_KEY` | Anthropic API key | — |
|
|
146
|
+
| `OPENAI_API_KEY` | OpenAI API key | — |
|
|
147
|
+
| `XAI_API_KEY` | xAI API key | — |
|
|
145
148
|
|
|
146
149
|
### Providers and Models
|
|
147
150
|
|
|
@@ -157,11 +160,12 @@ You can switch providers and models at any time during a session with `/provider
|
|
|
157
160
|
|
|
158
161
|
Options can be changed during a session with `/options` or persisted to `~/.bernard/preferences.json`:
|
|
159
162
|
|
|
160
|
-
| Option | Default | Description
|
|
161
|
-
| --------------- | ------- |
|
|
162
|
-
| `max-tokens` | `4096` | Maximum tokens per AI response
|
|
163
|
-
| `
|
|
164
|
-
| `
|
|
163
|
+
| Option | Default | Description |
|
|
164
|
+
| --------------- | ------- | ------------------------------------------------------------ |
|
|
165
|
+
| `max-tokens` | `4096` | Maximum tokens per AI response |
|
|
166
|
+
| `max-steps` | `25` | Maximum agent loop iterations per request (tool call chains) |
|
|
167
|
+
| `shell-timeout` | `30000` | Shell command timeout in milliseconds |
|
|
168
|
+
| `token-window` | `0` | Context window size for compression (0 = auto-detect) |
|
|
165
169
|
|
|
166
170
|
From the CLI:
|
|
167
171
|
|
|
@@ -225,28 +229,30 @@ Features:
|
|
|
225
229
|
|
|
226
230
|
### REPL Slash Commands
|
|
227
231
|
|
|
228
|
-
| Command | Description
|
|
229
|
-
| ----------------- |
|
|
230
|
-
| `/help` | Show available commands
|
|
231
|
-
| `/clear` | Clear conversation history and scratch notes
|
|
232
|
-
| `/compact` | Compress conversation history in-place
|
|
233
|
-
| `/task` | Run an isolated task (no history, structured output)
|
|
234
|
-
| `/memory` | List all persistent memories
|
|
235
|
-
| `/scratch` | List session scratch notes
|
|
236
|
-
| `/mcp` | List connected MCP servers and their tools
|
|
237
|
-
| `/cron` | Show cron jobs and daemon status
|
|
238
|
-
| `/rag` | Show RAG memory stats and recent facts
|
|
239
|
-
| `/provider` | Switch LLM provider interactively
|
|
240
|
-
| `/model` | Switch model for the current provider
|
|
241
|
-
| `/theme` | Switch color theme
|
|
242
|
-
| `/routines` | List saved routines
|
|
243
|
-
| `/create-routine` | Create a routine with guided AI assistance
|
|
244
|
-
| `/create-task` | Create a task routine (`task-` prefixed) with guided AI assistance
|
|
245
|
-
| `/specialists` | List saved specialists
|
|
246
|
-
| `/candidates` | Review auto-detected specialist suggestions _(v0.6.0+)_
|
|
247
|
-
| `/critic` | Toggle critic mode for response verification (on/off)
|
|
248
|
-
| `/options`
|
|
249
|
-
| `/
|
|
232
|
+
| Command | Description |
|
|
233
|
+
| ----------------- | ------------------------------------------------------------------------------------ |
|
|
234
|
+
| `/help` | Show available commands |
|
|
235
|
+
| `/clear` | Clear conversation history and scratch notes |
|
|
236
|
+
| `/compact` | Compress conversation history in-place |
|
|
237
|
+
| `/task` | Run an isolated task (no history, structured output) |
|
|
238
|
+
| `/memory` | List all persistent memories |
|
|
239
|
+
| `/scratch` | List session scratch notes |
|
|
240
|
+
| `/mcp` | List connected MCP servers and their tools |
|
|
241
|
+
| `/cron` | Show cron jobs and daemon status |
|
|
242
|
+
| `/rag` | Show RAG memory stats and recent facts |
|
|
243
|
+
| `/provider` | Switch LLM provider interactively |
|
|
244
|
+
| `/model` | Switch model for the current provider |
|
|
245
|
+
| `/theme` | Switch color theme |
|
|
246
|
+
| `/routines` | List saved routines |
|
|
247
|
+
| `/create-routine` | Create a routine with guided AI assistance |
|
|
248
|
+
| `/create-task` | Create a task routine (`task-` prefixed) with guided AI assistance |
|
|
249
|
+
| `/specialists` | List saved specialists |
|
|
250
|
+
| `/candidates` | Review auto-detected specialist suggestions _(v0.6.0+)_ |
|
|
251
|
+
| `/critic` | Toggle critic mode for response verification (on/off) |
|
|
252
|
+
| `/agent-options` | Configure auto-creation for specialist agents |
|
|
253
|
+
| `/options` | View and modify runtime options (max-tokens, max-steps, shell-timeout, token-window) |
|
|
254
|
+
| `/debug` | Print a diagnostic report for troubleshooting (no secrets leaked) |
|
|
255
|
+
| `/exit` | Quit Bernard (also: `exit`, `quit`) |
|
|
250
256
|
|
|
251
257
|
Type `/{routine-id}` or `/{specialist-id}` to invoke a saved routine or specialist directly (e.g., `/deploy-staging`).
|
|
252
258
|
|
|
@@ -361,7 +367,7 @@ bernard> check the disk usage on /, look up the weather in Austin, and count lin
|
|
|
361
367
|
...
|
|
362
368
|
```
|
|
363
369
|
|
|
364
|
-
Up to 4 concurrent sub-agents. Each gets
|
|
370
|
+
Up to 4 concurrent sub-agents. Each gets 50% of the main agent's step budget (e.g. 13 steps when `max-steps` is 25). Color-coded output in the terminal. Sub-agents accept per-invocation provider/model overrides to use a different LLM than the main session.
|
|
365
371
|
|
|
366
372
|
### Tasks _(v0.6.0+)_
|
|
367
373
|
|
|
@@ -481,6 +487,19 @@ When candidates are detected, you'll see a notification at the start of your nex
|
|
|
481
487
|
|
|
482
488
|
Use `/candidates` to see pending suggestions with their name, description, confidence score, and reasoning. You can then accept or reject candidates conversationally (e.g., "accept the code-review candidate"), and Bernard will create the specialist for you.
|
|
483
489
|
|
|
490
|
+
**Overlap detection** — Before suggesting a new specialist, Bernard computes a token-based similarity score against all existing specialists and pending candidates. If the overlap exceeds 60%, the candidate is suppressed. When a candidate partially overlaps with an existing specialist, Bernard may suggest enhancing the existing specialist instead.
|
|
491
|
+
|
|
492
|
+
**Auto-creation** — You can enable automatic specialist creation for high-confidence candidates:
|
|
493
|
+
|
|
494
|
+
```bash
|
|
495
|
+
/agent-options auto-create on # Enable auto-creation
|
|
496
|
+
/agent-options auto-create off # Disable auto-creation
|
|
497
|
+
/agent-options threshold 0.85 # Set confidence threshold (0-1)
|
|
498
|
+
/agent-options # Show current settings
|
|
499
|
+
```
|
|
500
|
+
|
|
501
|
+
Or via environment variables: `BERNARD_AUTO_CREATE_SPECIALISTS=true` and `BERNARD_AUTO_CREATE_THRESHOLD=0.85`.
|
|
502
|
+
|
|
484
503
|
Candidates are auto-dismissed after 30 days if not reviewed. Up to 10 pending candidates are stored at a time.
|
|
485
504
|
|
|
486
505
|
Storage: one JSON file per candidate in `~/.local/share/bernard/specialist-candidates/`.
|
|
@@ -503,6 +522,12 @@ When enabled:
|
|
|
503
522
|
|
|
504
523
|
The critic checks that claimed actions match actual tool calls and flags any discrepancies. It adds one extra LLM call after tool-using responses. Simple knowledge answers are not verified.
|
|
505
524
|
|
|
525
|
+
**PAC System (Plan-Act-Critic)** — When critic mode is enabled, sub-agents and specialists also get critic verification via a reusable PAC loop. The PAC loop runs the critic after each sub-agent/specialist execution, and if the critic finds issues, it retries the task with feedback (up to 2 retries). This applies to:
|
|
526
|
+
|
|
527
|
+
- Sub-agents (`agent` tool)
|
|
528
|
+
- Specialist runs (`specialist_run` tool)
|
|
529
|
+
- Cron job executions (daemon mode)
|
|
530
|
+
|
|
506
531
|
Default: off. Recommended for high-stakes work (deployments, git operations, multi-file edits).
|
|
507
532
|
|
|
508
533
|
---
|
|
@@ -697,6 +722,8 @@ Bernard automatically compresses conversation history when it approaches 75% of
|
|
|
697
722
|
|
|
698
723
|
Summarization and domain-specific fact extraction run in parallel. Scratch notes survive compression, so multi-step task progress is never lost.
|
|
699
724
|
|
|
725
|
+
**Auto-continue on truncation:** If a response hits the `max-tokens` limit and is cut off, Bernard automatically continues where it left off (up to 3 continuations). After completing, it shows a recommended `max-tokens` value based on actual usage. If the response is still incomplete after 3 continuations, a warning is shown with instructions to increase the limit via `/options max-tokens <value>`.
|
|
726
|
+
|
|
700
727
|
When critic mode is enabled (`/critic on`), Bernard writes plans to scratch before complex tasks and verifies outcomes after tool use. See [Critic Mode](#critic-mode).
|
|
701
728
|
|
|
702
729
|
### RAG Memory
|
|
@@ -741,17 +768,17 @@ Storage: `~/.bernard/conversation-history.json`
|
|
|
741
768
|
|
|
742
769
|
## File Structure
|
|
743
770
|
|
|
744
|
-
Bernard
|
|
771
|
+
Bernard follows the [XDG Base Directory Specification](https://specifications.freedesktop.org/basedir/latest/), splitting files across four standard directories:
|
|
745
772
|
|
|
746
773
|
```
|
|
747
|
-
~/.bernard/
|
|
748
|
-
├── keys.json # API keys (mode 0600)
|
|
774
|
+
~/.config/bernard/ # Config (XDG_CONFIG_HOME)
|
|
749
775
|
├── preferences.json # Provider, model, options
|
|
776
|
+
├── keys.json # API keys (mode 0600)
|
|
750
777
|
├── .env # Fallback environment config
|
|
751
|
-
|
|
752
|
-
|
|
778
|
+
└── mcp.json # MCP server configuration
|
|
779
|
+
|
|
780
|
+
~/.local/share/bernard/ # Data (XDG_DATA_HOME)
|
|
753
781
|
├── memory/ # Persistent memories (*.md)
|
|
754
|
-
├── models/ # Embedding model cache (fastembed)
|
|
755
782
|
├── routines/ # Saved routines (*.json)
|
|
756
783
|
├── specialists/ # Saved specialist profiles (*.json)
|
|
757
784
|
├── specialist-candidates/ # Auto-detected specialist suggestions (*.json)
|
|
@@ -759,12 +786,22 @@ Bernard stores all data in `~/.bernard/`:
|
|
|
759
786
|
│ └── memories.json # RAG fact embeddings
|
|
760
787
|
└── cron/
|
|
761
788
|
├── jobs.json # Scheduled jobs
|
|
762
|
-
├── daemon.pid # Daemon process ID
|
|
763
|
-
├── daemon.log # Daemon output (rotates at 1MB)
|
|
764
789
|
├── logs/ # Per-job execution logs
|
|
765
790
|
└── alerts/ # Cron alert files
|
|
791
|
+
|
|
792
|
+
~/.cache/bernard/ # Cache (XDG_CACHE_HOME)
|
|
793
|
+
├── models/ # Embedding model cache (fastembed)
|
|
794
|
+
└── update-check.json # Update check state
|
|
795
|
+
|
|
796
|
+
~/.local/state/bernard/ # State (XDG_STATE_HOME)
|
|
797
|
+
├── conversation-history.json # Last session (for --resume)
|
|
798
|
+
├── logs/ # Debug log files (*.jsonl)
|
|
799
|
+
├── cron-daemon.pid # Daemon process ID
|
|
800
|
+
└── cron-daemon.log # Daemon output (rotates at 1MB)
|
|
766
801
|
```
|
|
767
802
|
|
|
803
|
+
Override all directories with a single flat path: `BERNARD_HOME=/path`. On first run, files are auto-migrated from legacy `~/.bernard/` to XDG locations.
|
|
804
|
+
|
|
768
805
|
---
|
|
769
806
|
|
|
770
807
|
## Development
|
|
@@ -796,6 +833,10 @@ BERNARD_DEBUG=1 bernard
|
|
|
796
833
|
|
|
797
834
|
Logs are written to `.logs/YYYY-MM-DD.log` in JSON format, covering agent processing, RAG operations, context compression, tool execution, and MCP operations.
|
|
798
835
|
|
|
836
|
+
### Diagnostic Report
|
|
837
|
+
|
|
838
|
+
Use `/debug` in the REPL to print a diagnostic report useful for troubleshooting. The report includes runtime info (Bernard version, Node.js version, OS), LLM configuration, API key status (configured/not set — keys are never shown), MCP server status, RAG/memory/cron state, conversation stats, active settings, and file paths. No secrets are included in the output.
|
|
839
|
+
|
|
799
840
|
### Adding a New Provider
|
|
800
841
|
|
|
801
842
|
1. Install the AI SDK provider package (e.g., `npm install @ai-sdk/google`)
|
|
@@ -815,6 +856,7 @@ src/
|
|
|
815
856
|
├── repl.ts # Interactive REPL loop
|
|
816
857
|
├── agent.ts # Agent class (generateText loop)
|
|
817
858
|
├── config.ts # Config loading and validation
|
|
859
|
+
├── critic.ts # Critic agent for response verification
|
|
818
860
|
├── output.ts # Terminal formatting (Chalk)
|
|
819
861
|
├── theme.ts # Color theme definitions and switching
|
|
820
862
|
├── memory.ts # MemoryStore (persistent + scratch)
|
|
@@ -826,7 +868,11 @@ src/
|
|
|
826
868
|
├── specialists.ts # SpecialistStore (reusable expert profiles)
|
|
827
869
|
├── specialist-candidates.ts # CandidateStore (auto-detected suggestions)
|
|
828
870
|
├── specialist-detector.ts # LLM-based specialist pattern detection
|
|
871
|
+
├── specialist-matcher.ts # Keyword scorer for specialist auto-dispatch
|
|
829
872
|
├── mcp.ts # MCP server manager
|
|
873
|
+
├── overlap-checker.ts # Token-based Jaccard overlap for specialist dedup
|
|
874
|
+
├── pac.ts # Plan-Act-Critic loop wrapper
|
|
875
|
+
├── paths.ts # Centralized XDG file path resolution
|
|
830
876
|
├── rag-worker.ts # Background RAG fact extraction + candidate detection
|
|
831
877
|
├── setup.ts # First-time setup wizard
|
|
832
878
|
├── history.ts # Conversation save/load
|
|
@@ -883,7 +929,7 @@ Found a bug? Please [open an issue](https://github.com/phillt/bernard/issues/new
|
|
|
883
929
|
|
|
884
930
|
- Steps to reproduce the problem
|
|
885
931
|
- Expected vs. actual behavior
|
|
886
|
-
- Your environment
|
|
932
|
+
- Your environment — run `/debug` in the REPL and paste the output
|
|
887
933
|
- Any relevant logs (run with `BERNARD_DEBUG=1` for verbose output)
|
|
888
934
|
|
|
889
935
|
## Third-Party Licenses
|
package/dist/agent.d.ts
CHANGED
|
@@ -49,6 +49,8 @@ export declare class Agent {
|
|
|
49
49
|
private routineStore;
|
|
50
50
|
private specialistStore;
|
|
51
51
|
private candidateStore?;
|
|
52
|
+
private stepLimitHitCount;
|
|
53
|
+
private lastStepLimitHit;
|
|
52
54
|
constructor(config: BernardConfig, toolOptions: ToolOptions, memoryStore: MemoryStore, mcpTools?: Record<string, any>, mcpServerNames?: string[], alertContext?: string, initialHistory?: CoreMessage[], ragStore?: RAGStore, routineStore?: RoutineStore, specialistStore?: SpecialistStore, candidateStore?: CandidateStoreReader);
|
|
53
55
|
/** Returns the current conversation message history. */
|
|
54
56
|
getHistory(): CoreMessage[];
|
|
@@ -56,6 +58,11 @@ export declare class Agent {
|
|
|
56
58
|
getLastRAGResults(): RAGSearchResult[];
|
|
57
59
|
/** Cancels the in-flight LLM request, if any. Safe to call when no request is active. */
|
|
58
60
|
abort(): void;
|
|
61
|
+
/** Returns step limit hit info from last processInput, or null if limit wasn't hit. */
|
|
62
|
+
getStepLimitHit(): {
|
|
63
|
+
currentLimit: number;
|
|
64
|
+
hitCount: number;
|
|
65
|
+
} | null;
|
|
59
66
|
/** Attaches a spinner stats object that will be updated with token usage during generation. */
|
|
60
67
|
setSpinnerStats(stats: SpinnerStats): void;
|
|
61
68
|
/** Updates the alert context injected into the system prompt (e.g., specialist candidates). */
|
|
@@ -69,10 +76,6 @@ export declare class Agent {
|
|
|
69
76
|
* @throws Error wrapping the underlying API error if generation fails for non-abort, non-overflow reasons
|
|
70
77
|
*/
|
|
71
78
|
processInput(userInput: string): Promise<void>;
|
|
72
|
-
/** Extracts a structured log of tool calls from generateText step results. */
|
|
73
|
-
private extractToolCallLog;
|
|
74
|
-
/** Runs the critic agent to verify the main agent's response against actual tool calls. */
|
|
75
|
-
private runCritic;
|
|
76
79
|
/** Compresses conversation history in-place, returning token usage stats. */
|
|
77
80
|
compactHistory(): Promise<CompactResult>;
|
|
78
81
|
/** Resets conversation history, scratch notes, and RAG tracking state for a fresh session. */
|
package/dist/agent.js
CHANGED
|
@@ -9,6 +9,7 @@ const subagent_js_1 = require("./tools/subagent.js");
|
|
|
9
9
|
const task_js_1 = require("./tools/task.js");
|
|
10
10
|
const output_js_1 = require("./output.js");
|
|
11
11
|
const logger_js_1 = require("./logger.js");
|
|
12
|
+
const critic_js_1 = require("./critic.js");
|
|
12
13
|
const context_js_1 = require("./context.js");
|
|
13
14
|
const routines_js_1 = require("./routines.js");
|
|
14
15
|
const specialists_js_1 = require("./specialists.js");
|
|
@@ -16,6 +17,7 @@ const specialist_run_js_1 = require("./tools/specialist-run.js");
|
|
|
16
17
|
const specialist_matcher_js_1 = require("./specialist-matcher.js");
|
|
17
18
|
const memory_context_js_1 = require("./memory-context.js");
|
|
18
19
|
const rag_query_js_1 = require("./rag-query.js");
|
|
20
|
+
const datetime_js_1 = require("./tools/datetime.js");
|
|
19
21
|
const BASE_SYSTEM_PROMPT = `# Identity
|
|
20
22
|
|
|
21
23
|
You are Bernard, a local CLI AI agent with direct shell access, persistent memory, and a suite of tools for system tasks, web reading, and scheduling.
|
|
@@ -38,6 +40,14 @@ You exist only while processing a user message. Each response is a single turn:
|
|
|
38
40
|
- When uncertain about intent, ask a clarifying question rather than guessing.
|
|
39
41
|
- If a request is ambiguous or risky, state your assumptions before acting.
|
|
40
42
|
|
|
43
|
+
## Planning
|
|
44
|
+
Before executing any task that requires more than two tool calls:
|
|
45
|
+
1. Briefly outline your plan in your response text — what steps you intend to take and in what order.
|
|
46
|
+
2. Execute the plan step by step. If the approach needs to change, state the revised plan before continuing.
|
|
47
|
+
3. After completion, summarize what was done and the outcome.
|
|
48
|
+
|
|
49
|
+
This makes your reasoning visible and reduces errors on multi-step tasks. For simple tasks (1-2 tool calls), skip the plan and act directly.
|
|
50
|
+
|
|
41
51
|
## Tool Execution Integrity
|
|
42
52
|
- NEVER simulate, fabricate, or narrate tool execution. If a task requires running a command, you MUST call the shell tool — do not write prose describing what a command "would return" or pretend you already ran it.
|
|
43
53
|
- Your text output can only describe results you actually received from a tool call in this conversation. If you have not called a tool, you have no results to report.
|
|
@@ -115,10 +125,10 @@ const CRITIC_MODE_PROMPT = `## Reliability Mode (Active)
|
|
|
115
125
|
|
|
116
126
|
You are operating with enhanced reliability. Follow these additional rules:
|
|
117
127
|
|
|
118
|
-
### Planning
|
|
119
|
-
|
|
120
|
-
1. Write
|
|
121
|
-
2. Reference
|
|
128
|
+
### Enhanced Planning (Scratch-Based)
|
|
129
|
+
In addition to stating your plan in text, persist it to scratch for reliability:
|
|
130
|
+
1. Write your plan to scratch (key: "plan") listing steps and expected outcomes.
|
|
131
|
+
2. Reference and update the scratch plan during execution.
|
|
122
132
|
3. After completion, delete the plan from scratch to keep it clean.
|
|
123
133
|
|
|
124
134
|
### Proactive Scratch Usage
|
|
@@ -133,32 +143,6 @@ Before executing any task that requires more than two tool calls, file modificat
|
|
|
133
143
|
### Verification
|
|
134
144
|
- After any mutation (file write, git commit, API call), immediately verify the outcome with a read-only command.
|
|
135
145
|
- Your work will be reviewed by a critic agent afterward. Only claim what you can prove with tool output.`;
|
|
136
|
-
const CRITIC_TOTAL_RESULT_BUDGET = 8000;
|
|
137
|
-
const CRITIC_MIN_RESULT_CHARS = 500;
|
|
138
|
-
const CRITIC_MAX_RESPONSE_LENGTH = 4000;
|
|
139
|
-
const CRITIC_MAX_ARGS_LENGTH = 1000;
|
|
140
|
-
const CRITIC_SYSTEM_PROMPT = `You are a verification agent for Bernard, a CLI AI assistant. Your role is to review the agent's work and verify its integrity.
|
|
141
|
-
|
|
142
|
-
You will receive:
|
|
143
|
-
1. The user's original request
|
|
144
|
-
2. The agent's final text response
|
|
145
|
-
3. A log of actual tool calls made (tool name, arguments, results) — note that tool results, arguments, and the agent response may be truncated for context efficiency
|
|
146
|
-
|
|
147
|
-
Your job:
|
|
148
|
-
- Check if the agent's claims in its response are supported by actual tool call results.
|
|
149
|
-
- Verify that tool calls were actually made for actions the agent claims to have performed.
|
|
150
|
-
- Flag any claims not backed by tool evidence (e.g., "I created the file" but no shell/write tool call).
|
|
151
|
-
- Flag any tool results that suggest failure but were reported as success.
|
|
152
|
-
- Tool results and the agent response may be truncated for context efficiency. If a tool result appears cut off, do not treat the missing portion as evidence of failure. Only flag FAIL when there is positive evidence of failure (e.g., an error message visible in the output), not merely the absence of success confirmation in truncated output.
|
|
153
|
-
- Check if the response addresses the user's original intent.
|
|
154
|
-
|
|
155
|
-
Output format (plain text, concise):
|
|
156
|
-
VERDICT: PASS | WARN | FAIL
|
|
157
|
-
[1-3 sentence explanation]
|
|
158
|
-
[If WARN/FAIL: specific issues found]
|
|
159
|
-
|
|
160
|
-
Be strict but fair. Not every response needs tool calls — knowledge answers are fine. Focus on cases where the agent *claims* to have done something via tools.`;
|
|
161
|
-
const CRITIC_MAX_RETRIES = 2;
|
|
162
146
|
/**
|
|
163
147
|
* Assembles the full system prompt including base instructions, memory context, and MCP status.
|
|
164
148
|
* @internal Exported for testing only.
|
|
@@ -171,13 +155,7 @@ const CRITIC_MAX_RETRIES = 2;
|
|
|
171
155
|
* @param specialistMatches - Pre-computed specialist match results for the current input
|
|
172
156
|
*/
|
|
173
157
|
function buildSystemPrompt(config, memoryStore, mcpServerNames, ragResults, routineSummaries, specialistSummaries, specialistMatches) {
|
|
174
|
-
|
|
175
|
-
weekday: 'long',
|
|
176
|
-
year: 'numeric',
|
|
177
|
-
month: 'long',
|
|
178
|
-
day: 'numeric',
|
|
179
|
-
});
|
|
180
|
-
let prompt = BASE_SYSTEM_PROMPT + `\n\nToday's date is ${today}.`;
|
|
158
|
+
let prompt = BASE_SYSTEM_PROMPT + `\n\nCurrent date and time: ${(0, datetime_js_1.formatCurrentDateTime)()}.`;
|
|
181
159
|
prompt += `\nYou are running as provider: ${config.provider}, model: ${config.model}. The user can switch with /provider and /model.`;
|
|
182
160
|
if (config.criticMode) {
|
|
183
161
|
prompt += '\n\n' + CRITIC_MODE_PROMPT;
|
|
@@ -268,6 +246,8 @@ class Agent {
|
|
|
268
246
|
routineStore;
|
|
269
247
|
specialistStore;
|
|
270
248
|
candidateStore;
|
|
249
|
+
stepLimitHitCount = 0;
|
|
250
|
+
lastStepLimitHit = false;
|
|
271
251
|
constructor(config, toolOptions, memoryStore, mcpTools, mcpServerNames, alertContext, initialHistory, ragStore, routineStore, specialistStore, candidateStore) {
|
|
272
252
|
this.config = config;
|
|
273
253
|
this.toolOptions = toolOptions;
|
|
@@ -296,6 +276,12 @@ class Agent {
|
|
|
296
276
|
abort() {
|
|
297
277
|
this.abortController?.abort();
|
|
298
278
|
}
|
|
279
|
+
/** Returns step limit hit info from last processInput, or null if limit wasn't hit. */
|
|
280
|
+
getStepLimitHit() {
|
|
281
|
+
if (!this.lastStepLimitHit)
|
|
282
|
+
return null;
|
|
283
|
+
return { currentLimit: this.config.maxSteps, hitCount: this.stepLimitHitCount };
|
|
284
|
+
}
|
|
299
285
|
/** Attaches a spinner stats object that will be updated with token usage during generation. */
|
|
300
286
|
setSpinnerStats(stats) {
|
|
301
287
|
this.spinnerStats = stats;
|
|
@@ -313,13 +299,15 @@ class Agent {
|
|
|
313
299
|
* @throws Error wrapping the underlying API error if generation fails for non-abort, non-overflow reasons
|
|
314
300
|
*/
|
|
315
301
|
async processInput(userInput) {
|
|
316
|
-
this.
|
|
302
|
+
this.lastStepLimitHit = false;
|
|
303
|
+
const timestamped = (0, datetime_js_1.timestampUserMessage)(userInput);
|
|
304
|
+
this.history.push({ role: 'user', content: timestamped });
|
|
317
305
|
this.abortController = new AbortController();
|
|
318
306
|
this.lastStepPromptTokens = 0;
|
|
319
307
|
this.lastRAGResults = [];
|
|
320
308
|
try {
|
|
321
309
|
// Check if context compression is needed
|
|
322
|
-
const newMessageEstimate = Math.ceil(
|
|
310
|
+
const newMessageEstimate = Math.ceil(timestamped.length / 4);
|
|
323
311
|
if ((0, context_js_1.shouldCompress)(this.lastPromptTokens, newMessageEstimate, this.config.model, this.config.tokenWindow)) {
|
|
324
312
|
(0, output_js_1.printInfo)('Compressing conversation context...');
|
|
325
313
|
this.history = await (0, context_js_1.compressHistory)(this.history, this.config, this.ragStore);
|
|
@@ -378,7 +366,7 @@ class Agent {
|
|
|
378
366
|
const callGenerateText = (messages) => (0, ai_1.generateText)({
|
|
379
367
|
model: (0, index_js_1.getModel)(this.config.provider, this.config.model),
|
|
380
368
|
tools,
|
|
381
|
-
maxSteps:
|
|
369
|
+
maxSteps: this.config.maxSteps,
|
|
382
370
|
maxTokens: this.config.maxTokens,
|
|
383
371
|
system: systemPrompt,
|
|
384
372
|
messages: messages ?? this.history,
|
|
@@ -429,25 +417,74 @@ class Agent {
|
|
|
429
417
|
throw apiErr;
|
|
430
418
|
}
|
|
431
419
|
}
|
|
420
|
+
// Auto-continue when the model hit the maxTokens limit mid-response
|
|
421
|
+
const MAX_CONTINUATIONS = 3;
|
|
422
|
+
let continuations = 0;
|
|
423
|
+
let continuationTokens = 0;
|
|
424
|
+
while (result.finishReason === 'length' && continuations < MAX_CONTINUATIONS) {
|
|
425
|
+
if (this.abortController?.signal.aborted)
|
|
426
|
+
break;
|
|
427
|
+
continuationTokens += result.usage?.completionTokens ?? 0;
|
|
428
|
+
continuations++;
|
|
429
|
+
(0, output_js_1.printWarning)(`Response truncated (hit ${this.config.maxTokens} token limit). Auto-continuing... (${continuations}/${MAX_CONTINUATIONS})`);
|
|
430
|
+
// Append partial response to history so continuation has context
|
|
431
|
+
const partialMessages = (0, context_js_1.truncateToolResults)(result.response.messages);
|
|
432
|
+
this.history.push(...partialMessages);
|
|
433
|
+
this.history.push({
|
|
434
|
+
role: 'user',
|
|
435
|
+
content: '[Your previous response was cut off. Please continue exactly where you left off.]',
|
|
436
|
+
});
|
|
437
|
+
// Restart spinner for the continuation call
|
|
438
|
+
if (this.spinnerStats) {
|
|
439
|
+
(0, output_js_1.startSpinner)(() => (0, output_js_1.buildSpinnerMessage)(this.spinnerStats));
|
|
440
|
+
}
|
|
441
|
+
result = await callGenerateText();
|
|
442
|
+
}
|
|
443
|
+
if (continuations > 0) {
|
|
444
|
+
const totalCompletionTokens = continuationTokens + (result.usage?.completionTokens ?? 0);
|
|
445
|
+
const recommended = Math.ceil((totalCompletionTokens * 1.25) / 1024) * 1024;
|
|
446
|
+
if (result.finishReason === 'length') {
|
|
447
|
+
(0, output_js_1.printWarning)(`Response still incomplete after ${MAX_CONTINUATIONS} continuations. ` +
|
|
448
|
+
`Increase the token limit: /options max-tokens ${recommended}`);
|
|
449
|
+
}
|
|
450
|
+
else {
|
|
451
|
+
(0, output_js_1.printInfo)(`Tip: Response needed ~${totalCompletionTokens} tokens (limit: ${this.config.maxTokens}). ` +
|
|
452
|
+
`To avoid future truncation: /options max-tokens ${recommended}`);
|
|
453
|
+
}
|
|
454
|
+
}
|
|
455
|
+
// Detect maxSteps exhaustion
|
|
456
|
+
if (result.finishReason === 'tool-calls' && result.steps.length >= this.config.maxSteps) {
|
|
457
|
+
this.lastStepLimitHit = true;
|
|
458
|
+
this.stepLimitHitCount++;
|
|
459
|
+
const msg = this.stepLimitHitCount >= 2
|
|
460
|
+
? `Stopped at loop limit of ${this.config.maxSteps}. Use /options max-steps to adjust permanently.`
|
|
461
|
+
: `Stopped at loop limit of ${this.config.maxSteps}.`;
|
|
462
|
+
(0, output_js_1.printWarning)(msg);
|
|
463
|
+
}
|
|
432
464
|
// Run critic verification if enabled and tool calls were made
|
|
433
|
-
if (this.config.criticMode &&
|
|
434
|
-
|
|
435
|
-
|
|
465
|
+
if (this.config.criticMode &&
|
|
466
|
+
!this.abortController?.signal.aborted &&
|
|
467
|
+
!this.lastStepLimitHit) {
|
|
468
|
+
let toolLog = (0, critic_js_1.extractToolCallLog)(result.steps);
|
|
469
|
+
if (toolLog.length > 0) {
|
|
436
470
|
let retryCount = 0;
|
|
437
|
-
while (retryCount <= CRITIC_MAX_RETRIES) {
|
|
471
|
+
while (retryCount <= critic_js_1.CRITIC_MAX_RETRIES) {
|
|
438
472
|
if (this.abortController?.signal.aborted)
|
|
439
473
|
break;
|
|
440
|
-
const criticResult = await
|
|
474
|
+
const criticResult = await (0, critic_js_1.runCritic)(this.config, userInput, result.text, toolLog, {
|
|
475
|
+
isRetry: retryCount > 0,
|
|
476
|
+
abortSignal: this.abortController?.signal,
|
|
477
|
+
});
|
|
441
478
|
// null (error) or PASS — stop looping
|
|
442
479
|
if (!criticResult || criticResult.verdict === 'PASS')
|
|
443
480
|
break;
|
|
444
481
|
// Exhausted retries — warn and stop
|
|
445
|
-
if (retryCount >= CRITIC_MAX_RETRIES) {
|
|
482
|
+
if (retryCount >= critic_js_1.CRITIC_MAX_RETRIES) {
|
|
446
483
|
(0, output_js_1.printInfo)('Critic still unsatisfied after maximum retries.');
|
|
447
484
|
break;
|
|
448
485
|
}
|
|
449
486
|
retryCount++;
|
|
450
|
-
(0, output_js_1.printCriticRetry)(retryCount, CRITIC_MAX_RETRIES);
|
|
487
|
+
(0, output_js_1.printCriticRetry)(retryCount, critic_js_1.CRITIC_MAX_RETRIES);
|
|
451
488
|
// Push current attempt's messages + critic feedback into history before retrying
|
|
452
489
|
try {
|
|
453
490
|
const truncatedResultMessages = (0, context_js_1.truncateToolResults)(result.response.messages);
|
|
@@ -457,9 +494,9 @@ class Agent {
|
|
|
457
494
|
content: `The critic agent reviewed your work and found issues:\n\nVERDICT: ${criticResult.verdict}\n${criticResult.explanation}\n\nPlease address these issues and try again.`,
|
|
458
495
|
});
|
|
459
496
|
result = await callGenerateText();
|
|
460
|
-
|
|
497
|
+
toolLog = (0, critic_js_1.extractToolCallLog)(result.steps);
|
|
461
498
|
// If no tool calls in retry, nothing more to verify
|
|
462
|
-
if (
|
|
499
|
+
if (toolLog.length === 0)
|
|
463
500
|
break;
|
|
464
501
|
}
|
|
465
502
|
catch (retryErr) {
|
|
@@ -488,84 +525,6 @@ class Agent {
|
|
|
488
525
|
this.spinnerStats = null;
|
|
489
526
|
}
|
|
490
527
|
}
|
|
491
|
-
/** Extracts a structured log of tool calls from generateText step results. */
|
|
492
|
-
extractToolCallLog(steps) {
|
|
493
|
-
const entries = [];
|
|
494
|
-
for (const step of steps) {
|
|
495
|
-
// AI SDK guarantees toolResults[i] corresponds to toolCalls[i] within each step
|
|
496
|
-
for (let i = 0; i < step.toolCalls.length; i++) {
|
|
497
|
-
const tc = step.toolCalls[i];
|
|
498
|
-
const tr = step.toolResults[i];
|
|
499
|
-
entries.push({
|
|
500
|
-
toolName: tc.toolName,
|
|
501
|
-
args: tc.args,
|
|
502
|
-
result: tr?.result,
|
|
503
|
-
});
|
|
504
|
-
}
|
|
505
|
-
}
|
|
506
|
-
return entries;
|
|
507
|
-
}
|
|
508
|
-
/** Runs the critic agent to verify the main agent's response against actual tool calls. */
|
|
509
|
-
async runCritic(userInput, responseText, toolCallLog, isRetry = false) {
|
|
510
|
-
try {
|
|
511
|
-
if (isRetry) {
|
|
512
|
-
(0, output_js_1.printCriticReVerify)();
|
|
513
|
-
}
|
|
514
|
-
else {
|
|
515
|
-
(0, output_js_1.printCriticStart)();
|
|
516
|
-
}
|
|
517
|
-
const perResultLimit = Math.max(CRITIC_MIN_RESULT_CHARS, Math.floor(CRITIC_TOTAL_RESULT_BUDGET / toolCallLog.length));
|
|
518
|
-
const truncatedLog = toolCallLog.map((entry) => {
|
|
519
|
-
const raw = typeof entry.result === 'string' ? entry.result : JSON.stringify(entry.result ?? null);
|
|
520
|
-
const truncated = raw.length > perResultLimit ? raw.slice(0, perResultLimit) + '...' : raw;
|
|
521
|
-
return {
|
|
522
|
-
toolName: entry.toolName,
|
|
523
|
-
args: entry.args,
|
|
524
|
-
result: truncated,
|
|
525
|
-
};
|
|
526
|
-
});
|
|
527
|
-
const truncatedResponse = responseText.length > CRITIC_MAX_RESPONSE_LENGTH
|
|
528
|
-
? responseText.slice(0, CRITIC_MAX_RESPONSE_LENGTH) + '\n... (truncated)'
|
|
529
|
-
: responseText;
|
|
530
|
-
const criticMessage = `## Original User Request
|
|
531
|
-
${userInput}
|
|
532
|
-
|
|
533
|
-
## Agent Response
|
|
534
|
-
${truncatedResponse}
|
|
535
|
-
|
|
536
|
-
## Tool Call Log (${truncatedLog.length} calls)
|
|
537
|
-
${truncatedLog
|
|
538
|
-
.map((e, i) => {
|
|
539
|
-
const argsStr = JSON.stringify(e.args);
|
|
540
|
-
const truncatedArgs = argsStr.length > CRITIC_MAX_ARGS_LENGTH
|
|
541
|
-
? argsStr.slice(0, CRITIC_MAX_ARGS_LENGTH) + '...'
|
|
542
|
-
: argsStr;
|
|
543
|
-
return `${i + 1}. ${e.toolName}(${truncatedArgs})\n Result: ${e.result}`;
|
|
544
|
-
})
|
|
545
|
-
.join('\n\n')}`;
|
|
546
|
-
const result = await (0, ai_1.generateText)({
|
|
547
|
-
model: (0, index_js_1.getModel)(this.config.provider, this.config.model),
|
|
548
|
-
system: CRITIC_SYSTEM_PROMPT,
|
|
549
|
-
messages: [{ role: 'user', content: criticMessage }],
|
|
550
|
-
maxSteps: 1,
|
|
551
|
-
maxTokens: 1024,
|
|
552
|
-
abortSignal: this.abortController?.signal,
|
|
553
|
-
});
|
|
554
|
-
if (result.text) {
|
|
555
|
-
const parsed = (0, output_js_1.parseCriticVerdict)(result.text);
|
|
556
|
-
(0, output_js_1.printCriticVerdict)(result.text);
|
|
557
|
-
return {
|
|
558
|
-
verdict: parsed.verdict,
|
|
559
|
-
explanation: parsed.explanation,
|
|
560
|
-
};
|
|
561
|
-
}
|
|
562
|
-
return null;
|
|
563
|
-
}
|
|
564
|
-
catch (err) {
|
|
565
|
-
(0, logger_js_1.debugLog)('agent:critic:error', err instanceof Error ? err.message : String(err));
|
|
566
|
-
return null;
|
|
567
|
-
}
|
|
568
|
-
}
|
|
569
528
|
/** Compresses conversation history in-place, returning token usage stats. */
|
|
570
529
|
async compactHistory() {
|
|
571
530
|
const tokensBefore = (0, context_js_1.estimateHistoryTokens)(this.history);
|
|
@@ -584,6 +543,8 @@ ${truncatedLog
|
|
|
584
543
|
this.memoryStore.clearScratch();
|
|
585
544
|
this.previousRAGFacts = new Set();
|
|
586
545
|
this.lastRAGResults = [];
|
|
546
|
+
this.stepLimitHitCount = 0;
|
|
547
|
+
this.lastStepLimitHit = false;
|
|
587
548
|
}
|
|
588
549
|
}
|
|
589
550
|
exports.Agent = Agent;
|