bernard-agent 0.7.0 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. package/README.md +96 -50
  2. package/dist/agent.d.ts +7 -4
  3. package/dist/agent.js +88 -127
  4. package/dist/agent.js.map +1 -1
  5. package/dist/config.d.ts +13 -1
  6. package/dist/config.js +66 -4
  7. package/dist/config.js.map +1 -1
  8. package/dist/critic.d.ts +32 -0
  9. package/dist/critic.js +118 -0
  10. package/dist/critic.js.map +1 -0
  11. package/dist/cron/runner.js +54 -34
  12. package/dist/cron/runner.js.map +1 -1
  13. package/dist/output.d.ts +6 -4
  14. package/dist/output.js +31 -10
  15. package/dist/output.js.map +1 -1
  16. package/dist/overlap-checker.d.ts +61 -0
  17. package/dist/overlap-checker.js +106 -0
  18. package/dist/overlap-checker.js.map +1 -0
  19. package/dist/pac.d.ts +37 -0
  20. package/dist/pac.js +69 -0
  21. package/dist/pac.js.map +1 -0
  22. package/dist/rag-query.js +2 -1
  23. package/dist/rag-query.js.map +1 -1
  24. package/dist/rag-worker.js +4 -3
  25. package/dist/rag-worker.js.map +1 -1
  26. package/dist/repl.js +160 -4
  27. package/dist/repl.js.map +1 -1
  28. package/dist/specialist-candidates.d.ts +13 -0
  29. package/dist/specialist-candidates.js.map +1 -1
  30. package/dist/specialist-detector.d.ts +20 -5
  31. package/dist/specialist-detector.js +72 -9
  32. package/dist/specialist-detector.js.map +1 -1
  33. package/dist/specialist-matcher.d.ts +5 -0
  34. package/dist/specialist-matcher.js +1 -0
  35. package/dist/specialist-matcher.js.map +1 -1
  36. package/dist/tools/datetime.d.ts +6 -0
  37. package/dist/tools/datetime.js +38 -1
  38. package/dist/tools/datetime.js.map +1 -1
  39. package/dist/tools/mcp-url.js +1 -1
  40. package/dist/tools/mcp-url.js.map +1 -1
  41. package/dist/tools/specialist-run.js +37 -12
  42. package/dist/tools/specialist-run.js.map +1 -1
  43. package/dist/tools/subagent.js +37 -12
  44. package/dist/tools/subagent.js.map +1 -1
  45. package/package.json +1 -1
package/README.md CHANGED
@@ -129,19 +129,22 @@ bernard providers
129
129
 
130
130
  Bernard loads `.env` from the current directory first, then falls back to `~/.bernard/.env`.
131
131
 
132
- | Variable | Description | Default |
133
- | ----------------------- | ----------------------------------------------------- | ------------------------- |
134
- | `BERNARD_PROVIDER` | LLM provider (`anthropic`, `openai`, `xai`) | `anthropic` |
135
- | `BERNARD_MODEL` | Model name | Provider-specific default |
136
- | `BERNARD_MAX_TOKENS` | Max response tokens | `4096` |
137
- | `BERNARD_SHELL_TIMEOUT` | Shell command timeout (ms) | `30000` |
138
- | `BERNARD_TOKEN_WINDOW` | Context window size for compression (0 = auto-detect) | `0` |
139
- | `BERNARD_RAG_ENABLED` | Enable the RAG memory system | `true` |
140
- | `BERNARD_CRITIC_MODE` | Enable critic mode for response verification | `false` |
141
- | `BERNARD_DEBUG` | Enable debug logging | unset |
142
- | `ANTHROPIC_API_KEY` | Anthropic API key | |
143
- | `OPENAI_API_KEY` | OpenAI API key | |
144
- | `XAI_API_KEY` | xAI API key | |
132
+ | Variable | Description | Default |
133
+ | --------------------------------- | -------------------------------------------------------- | ------------------------- |
134
+ | `BERNARD_PROVIDER` | LLM provider (`anthropic`, `openai`, `xai`) | `anthropic` |
135
+ | `BERNARD_MODEL` | Model name | Provider-specific default |
136
+ | `BERNARD_MAX_TOKENS` | Max response tokens | `4096` |
137
+ | `BERNARD_SHELL_TIMEOUT` | Shell command timeout (ms) | `30000` |
138
+ | `BERNARD_TOKEN_WINDOW` | Context window size for compression (0 = auto-detect) | `0` |
139
+ | `BERNARD_MAX_STEPS` | Max agent loop iterations per request | `25` |
140
+ | `BERNARD_RAG_ENABLED` | Enable the RAG memory system | `true` |
141
+ | `BERNARD_CRITIC_MODE` | Enable critic mode for response verification | `false` |
142
+ | `BERNARD_AUTO_CREATE_SPECIALISTS` | Auto-create specialists above confidence threshold | `false` |
143
+ | `BERNARD_AUTO_CREATE_THRESHOLD` | Confidence threshold for auto-creating specialists (0-1) | `0.8` |
144
+ | `BERNARD_DEBUG` | Enable debug logging | unset |
145
+ | `ANTHROPIC_API_KEY` | Anthropic API key | — |
146
+ | `OPENAI_API_KEY` | OpenAI API key | — |
147
+ | `XAI_API_KEY` | xAI API key | — |
145
148
 
146
149
  ### Providers and Models
147
150
 
@@ -157,11 +160,12 @@ You can switch providers and models at any time during a session with `/provider
157
160
 
158
161
  Options can be changed during a session with `/options` or persisted to `~/.bernard/preferences.json`:
159
162
 
160
- | Option | Default | Description |
161
- | --------------- | ------- | ----------------------------------------------------- |
162
- | `max-tokens` | `4096` | Maximum tokens per AI response |
163
- | `shell-timeout` | `30000` | Shell command timeout in milliseconds |
164
- | `token-window` | `0` | Context window size for compression (0 = auto-detect) |
163
+ | Option | Default | Description |
164
+ | --------------- | ------- | ------------------------------------------------------------ |
165
+ | `max-tokens` | `4096` | Maximum tokens per AI response |
166
+ | `max-steps` | `25` | Maximum agent loop iterations per request (tool call chains) |
167
+ | `shell-timeout` | `30000` | Shell command timeout in milliseconds |
168
+ | `token-window` | `0` | Context window size for compression (0 = auto-detect) |
165
169
 
166
170
  From the CLI:
167
171
 
@@ -225,28 +229,30 @@ Features:
225
229
 
226
230
  ### REPL Slash Commands
227
231
 
228
- | Command | Description |
229
- | ----------------- | ------------------------------------------------------------------------- |
230
- | `/help` | Show available commands |
231
- | `/clear` | Clear conversation history and scratch notes |
232
- | `/compact` | Compress conversation history in-place |
233
- | `/task` | Run an isolated task (no history, structured output) |
234
- | `/memory` | List all persistent memories |
235
- | `/scratch` | List session scratch notes |
236
- | `/mcp` | List connected MCP servers and their tools |
237
- | `/cron` | Show cron jobs and daemon status |
238
- | `/rag` | Show RAG memory stats and recent facts |
239
- | `/provider` | Switch LLM provider interactively |
240
- | `/model` | Switch model for the current provider |
241
- | `/theme` | Switch color theme |
242
- | `/routines` | List saved routines |
243
- | `/create-routine` | Create a routine with guided AI assistance |
244
- | `/create-task` | Create a task routine (`task-` prefixed) with guided AI assistance |
245
- | `/specialists` | List saved specialists |
246
- | `/candidates` | Review auto-detected specialist suggestions _(v0.6.0+)_ |
247
- | `/critic` | Toggle critic mode for response verification (on/off) |
248
- | `/options` | View and modify runtime options (max-tokens, shell-timeout, token-window) |
249
- | `/exit` | Quit Bernard (also: `exit`, `quit`) |
232
+ | Command | Description |
233
+ | ----------------- | ------------------------------------------------------------------------------------ |
234
+ | `/help` | Show available commands |
235
+ | `/clear` | Clear conversation history and scratch notes |
236
+ | `/compact` | Compress conversation history in-place |
237
+ | `/task` | Run an isolated task (no history, structured output) |
238
+ | `/memory` | List all persistent memories |
239
+ | `/scratch` | List session scratch notes |
240
+ | `/mcp` | List connected MCP servers and their tools |
241
+ | `/cron` | Show cron jobs and daemon status |
242
+ | `/rag` | Show RAG memory stats and recent facts |
243
+ | `/provider` | Switch LLM provider interactively |
244
+ | `/model` | Switch model for the current provider |
245
+ | `/theme` | Switch color theme |
246
+ | `/routines` | List saved routines |
247
+ | `/create-routine` | Create a routine with guided AI assistance |
248
+ | `/create-task` | Create a task routine (`task-` prefixed) with guided AI assistance |
249
+ | `/specialists` | List saved specialists |
250
+ | `/candidates` | Review auto-detected specialist suggestions _(v0.6.0+)_ |
251
+ | `/critic` | Toggle critic mode for response verification (on/off) |
252
+ | `/agent-options` | Configure auto-creation for specialist agents |
253
+ | `/options` | View and modify runtime options (max-tokens, max-steps, shell-timeout, token-window) |
254
+ | `/debug` | Print a diagnostic report for troubleshooting (no secrets leaked) |
255
+ | `/exit` | Quit Bernard (also: `exit`, `quit`) |
250
256
 
251
257
  Type `/{routine-id}` or `/{specialist-id}` to invoke a saved routine or specialist directly (e.g., `/deploy-staging`).
252
258
 
@@ -361,7 +367,7 @@ bernard> check the disk usage on /, look up the weather in Austin, and count lin
361
367
  ...
362
368
  ```
363
369
 
364
- Up to 4 concurrent sub-agents. Each gets 10 max steps. Color-coded output in the terminal. Sub-agents accept per-invocation provider/model overrides to use a different LLM than the main session.
370
+ Up to 4 concurrent sub-agents. Each gets 50% of the main agent's step budget (e.g. 13 steps when `max-steps` is 25). Color-coded output in the terminal. Sub-agents accept per-invocation provider/model overrides to use a different LLM than the main session.
365
371
 
366
372
  ### Tasks _(v0.6.0+)_
367
373
 
@@ -481,6 +487,19 @@ When candidates are detected, you'll see a notification at the start of your nex
481
487
 
482
488
  Use `/candidates` to see pending suggestions with their name, description, confidence score, and reasoning. You can then accept or reject candidates conversationally (e.g., "accept the code-review candidate"), and Bernard will create the specialist for you.
483
489
 
490
+ **Overlap detection** — Before suggesting a new specialist, Bernard computes a token-based similarity score against all existing specialists and pending candidates. If the overlap exceeds 60%, the candidate is suppressed. When a candidate partially overlaps with an existing specialist, Bernard may suggest enhancing the existing specialist instead.
491
+
492
+ **Auto-creation** — You can enable automatic specialist creation for high-confidence candidates:
493
+
494
+ ```bash
495
+ /agent-options auto-create on # Enable auto-creation
496
+ /agent-options auto-create off # Disable auto-creation
497
+ /agent-options threshold 0.85 # Set confidence threshold (0-1)
498
+ /agent-options # Show current settings
499
+ ```
500
+
501
+ Or via environment variables: `BERNARD_AUTO_CREATE_SPECIALISTS=true` and `BERNARD_AUTO_CREATE_THRESHOLD=0.85`.
502
+
484
503
  Candidates are auto-dismissed after 30 days if not reviewed. Up to 10 pending candidates are stored at a time.
485
504
 
486
505
  Storage: one JSON file per candidate in `~/.local/share/bernard/specialist-candidates/`.
@@ -503,6 +522,12 @@ When enabled:
503
522
 
504
523
  The critic checks that claimed actions match actual tool calls and flags any discrepancies. It adds one extra LLM call after tool-using responses. Simple knowledge answers are not verified.
505
524
 
525
+ **PAC System (Plan-Act-Critic)** — When critic mode is enabled, sub-agents and specialists also get critic verification via a reusable PAC loop. The PAC loop runs the critic after each sub-agent/specialist execution, and if the critic finds issues, it retries the task with feedback (up to 2 retries). This applies to:
526
+
527
+ - Sub-agents (`agent` tool)
528
+ - Specialist runs (`specialist_run` tool)
529
+ - Cron job executions (daemon mode)
530
+
506
531
  Default: off. Recommended for high-stakes work (deployments, git operations, multi-file edits).
507
532
 
508
533
  ---
@@ -697,6 +722,8 @@ Bernard automatically compresses conversation history when it approaches 75% of
697
722
 
698
723
  Summarization and domain-specific fact extraction run in parallel. Scratch notes survive compression, so multi-step task progress is never lost.
699
724
 
725
+ **Auto-continue on truncation:** If a response hits the `max-tokens` limit and is cut off, Bernard automatically continues where it left off (up to 3 continuations). After completing, it shows a recommended `max-tokens` value based on actual usage. If the response is still incomplete after 3 continuations, a warning is shown with instructions to increase the limit via `/options max-tokens <value>`.
726
+
700
727
  When critic mode is enabled (`/critic on`), Bernard writes plans to scratch before complex tasks and verifies outcomes after tool use. See [Critic Mode](#critic-mode).
701
728
 
702
729
  ### RAG Memory
@@ -741,17 +768,17 @@ Storage: `~/.bernard/conversation-history.json`
741
768
 
742
769
  ## File Structure
743
770
 
744
- Bernard stores all data in `~/.bernard/`:
771
+ Bernard follows the [XDG Base Directory Specification](https://specifications.freedesktop.org/basedir/latest/), splitting files across four standard directories:
745
772
 
746
773
  ```
747
- ~/.bernard/
748
- ├── keys.json # API keys (mode 0600)
774
+ ~/.config/bernard/ # Config (XDG_CONFIG_HOME)
749
775
  ├── preferences.json # Provider, model, options
776
+ ├── keys.json # API keys (mode 0600)
750
777
  ├── .env # Fallback environment config
751
- ├── mcp.json # MCP server configuration
752
- ├── conversation-history.json # Last session (for --resume)
778
+ └── mcp.json # MCP server configuration
779
+
780
+ ~/.local/share/bernard/ # Data (XDG_DATA_HOME)
753
781
  ├── memory/ # Persistent memories (*.md)
754
- ├── models/ # Embedding model cache (fastembed)
755
782
  ├── routines/ # Saved routines (*.json)
756
783
  ├── specialists/ # Saved specialist profiles (*.json)
757
784
  ├── specialist-candidates/ # Auto-detected specialist suggestions (*.json)
@@ -759,12 +786,22 @@ Bernard stores all data in `~/.bernard/`:
759
786
  │ └── memories.json # RAG fact embeddings
760
787
  └── cron/
761
788
  ├── jobs.json # Scheduled jobs
762
- ├── daemon.pid # Daemon process ID
763
- ├── daemon.log # Daemon output (rotates at 1MB)
764
789
  ├── logs/ # Per-job execution logs
765
790
  └── alerts/ # Cron alert files
791
+
792
+ ~/.cache/bernard/ # Cache (XDG_CACHE_HOME)
793
+ ├── models/ # Embedding model cache (fastembed)
794
+ └── update-check.json # Update check state
795
+
796
+ ~/.local/state/bernard/ # State (XDG_STATE_HOME)
797
+ ├── conversation-history.json # Last session (for --resume)
798
+ ├── logs/ # Debug log files (*.jsonl)
799
+ ├── cron-daemon.pid # Daemon process ID
800
+ └── cron-daemon.log # Daemon output (rotates at 1MB)
766
801
  ```
767
802
 
803
+ Override all directories with a single flat path: `BERNARD_HOME=/path`. On first run, files are auto-migrated from legacy `~/.bernard/` to XDG locations.
804
+
768
805
  ---
769
806
 
770
807
  ## Development
@@ -796,6 +833,10 @@ BERNARD_DEBUG=1 bernard
796
833
 
797
834
  Logs are written to `.logs/YYYY-MM-DD.log` in JSON format, covering agent processing, RAG operations, context compression, tool execution, and MCP operations.
798
835
 
836
+ ### Diagnostic Report
837
+
838
+ Use `/debug` in the REPL to print a diagnostic report useful for troubleshooting. The report includes runtime info (Bernard version, Node.js version, OS), LLM configuration, API key status (configured/not set — keys are never shown), MCP server status, RAG/memory/cron state, conversation stats, active settings, and file paths. No secrets are included in the output.
839
+
799
840
  ### Adding a New Provider
800
841
 
801
842
  1. Install the AI SDK provider package (e.g., `npm install @ai-sdk/google`)
@@ -815,6 +856,7 @@ src/
815
856
  ├── repl.ts # Interactive REPL loop
816
857
  ├── agent.ts # Agent class (generateText loop)
817
858
  ├── config.ts # Config loading and validation
859
+ ├── critic.ts # Critic agent for response verification
818
860
  ├── output.ts # Terminal formatting (Chalk)
819
861
  ├── theme.ts # Color theme definitions and switching
820
862
  ├── memory.ts # MemoryStore (persistent + scratch)
@@ -826,7 +868,11 @@ src/
826
868
  ├── specialists.ts # SpecialistStore (reusable expert profiles)
827
869
  ├── specialist-candidates.ts # CandidateStore (auto-detected suggestions)
828
870
  ├── specialist-detector.ts # LLM-based specialist pattern detection
871
+ ├── specialist-matcher.ts # Keyword scorer for specialist auto-dispatch
829
872
  ├── mcp.ts # MCP server manager
873
+ ├── overlap-checker.ts # Token-based Jaccard overlap for specialist dedup
874
+ ├── pac.ts # Plan-Act-Critic loop wrapper
875
+ ├── paths.ts # Centralized XDG file path resolution
830
876
  ├── rag-worker.ts # Background RAG fact extraction + candidate detection
831
877
  ├── setup.ts # First-time setup wizard
832
878
  ├── history.ts # Conversation save/load
@@ -883,7 +929,7 @@ Found a bug? Please [open an issue](https://github.com/phillt/bernard/issues/new
883
929
 
884
930
  - Steps to reproduce the problem
885
931
  - Expected vs. actual behavior
886
- - Your environment (OS, Node version, Bernard version, provider/model)
932
+ - Your environment: run `/debug` in the REPL and paste the output
887
933
  - Any relevant logs (run with `BERNARD_DEBUG=1` for verbose output)
888
934
 
889
935
  ## Third-Party Licenses
package/dist/agent.d.ts CHANGED
@@ -49,6 +49,8 @@ export declare class Agent {
49
49
  private routineStore;
50
50
  private specialistStore;
51
51
  private candidateStore?;
52
+ private stepLimitHitCount;
53
+ private lastStepLimitHit;
52
54
  constructor(config: BernardConfig, toolOptions: ToolOptions, memoryStore: MemoryStore, mcpTools?: Record<string, any>, mcpServerNames?: string[], alertContext?: string, initialHistory?: CoreMessage[], ragStore?: RAGStore, routineStore?: RoutineStore, specialistStore?: SpecialistStore, candidateStore?: CandidateStoreReader);
53
55
  /** Returns the current conversation message history. */
54
56
  getHistory(): CoreMessage[];
@@ -56,6 +58,11 @@ export declare class Agent {
56
58
  getLastRAGResults(): RAGSearchResult[];
57
59
  /** Cancels the in-flight LLM request, if any. Safe to call when no request is active. */
58
60
  abort(): void;
61
+ /** Returns step limit hit info from last processInput, or null if limit wasn't hit. */
62
+ getStepLimitHit(): {
63
+ currentLimit: number;
64
+ hitCount: number;
65
+ } | null;
59
66
  /** Attaches a spinner stats object that will be updated with token usage during generation. */
60
67
  setSpinnerStats(stats: SpinnerStats): void;
61
68
  /** Updates the alert context injected into the system prompt (e.g., specialist candidates). */
@@ -69,10 +76,6 @@ export declare class Agent {
69
76
  * @throws Error wrapping the underlying API error if generation fails for non-abort, non-overflow reasons
70
77
  */
71
78
  processInput(userInput: string): Promise<void>;
72
- /** Extracts a structured log of tool calls from generateText step results. */
73
- private extractToolCallLog;
74
- /** Runs the critic agent to verify the main agent's response against actual tool calls. */
75
- private runCritic;
76
79
  /** Compresses conversation history in-place, returning token usage stats. */
77
80
  compactHistory(): Promise<CompactResult>;
78
81
  /** Resets conversation history, scratch notes, and RAG tracking state for a fresh session. */
package/dist/agent.js CHANGED
@@ -9,6 +9,7 @@ const subagent_js_1 = require("./tools/subagent.js");
9
9
  const task_js_1 = require("./tools/task.js");
10
10
  const output_js_1 = require("./output.js");
11
11
  const logger_js_1 = require("./logger.js");
12
+ const critic_js_1 = require("./critic.js");
12
13
  const context_js_1 = require("./context.js");
13
14
  const routines_js_1 = require("./routines.js");
14
15
  const specialists_js_1 = require("./specialists.js");
@@ -16,6 +17,7 @@ const specialist_run_js_1 = require("./tools/specialist-run.js");
16
17
  const specialist_matcher_js_1 = require("./specialist-matcher.js");
17
18
  const memory_context_js_1 = require("./memory-context.js");
18
19
  const rag_query_js_1 = require("./rag-query.js");
20
+ const datetime_js_1 = require("./tools/datetime.js");
19
21
  const BASE_SYSTEM_PROMPT = `# Identity
20
22
 
21
23
  You are Bernard, a local CLI AI agent with direct shell access, persistent memory, and a suite of tools for system tasks, web reading, and scheduling.
@@ -38,6 +40,14 @@ You exist only while processing a user message. Each response is a single turn:
38
40
  - When uncertain about intent, ask a clarifying question rather than guessing.
39
41
  - If a request is ambiguous or risky, state your assumptions before acting.
40
42
 
43
+ ## Planning
44
+ Before executing any task that requires more than two tool calls:
45
+ 1. Briefly outline your plan in your response text — what steps you intend to take and in what order.
46
+ 2. Execute the plan step by step. If the approach needs to change, state the revised plan before continuing.
47
+ 3. After completion, summarize what was done and the outcome.
48
+
49
+ This makes your reasoning visible and reduces errors on multi-step tasks. For simple tasks (1-2 tool calls), skip the plan and act directly.
50
+
41
51
  ## Tool Execution Integrity
42
52
  - NEVER simulate, fabricate, or narrate tool execution. If a task requires running a command, you MUST call the shell tool — do not write prose describing what a command "would return" or pretend you already ran it.
43
53
  - Your text output can only describe results you actually received from a tool call in this conversation. If you have not called a tool, you have no results to report.
@@ -115,10 +125,10 @@ const CRITIC_MODE_PROMPT = `## Reliability Mode (Active)
115
125
 
116
126
  You are operating with enhanced reliability. Follow these additional rules:
117
127
 
118
- ### Planning
119
- Before executing any task that requires more than two tool calls, file modifications, git operations, or multi-step research:
120
- 1. Write a brief plan to scratch (key: "plan") listing the steps you intend to take and the expected outcomes.
121
- 2. Reference this plan during execution. Update it if the approach changes.
128
+ ### Enhanced Planning (Scratch-Based)
129
+ In addition to stating your plan in text, persist it to scratch for reliability:
130
+ 1. Write your plan to scratch (key: "plan") listing steps and expected outcomes.
131
+ 2. Reference and update the scratch plan during execution.
122
132
  3. After completion, delete the plan from scratch to keep it clean.
123
133
 
124
134
  ### Proactive Scratch Usage
@@ -133,32 +143,6 @@ Before executing any task that requires more than two tool calls, file modificat
133
143
  ### Verification
134
144
  - After any mutation (file write, git commit, API call), immediately verify the outcome with a read-only command.
135
145
  - Your work will be reviewed by a critic agent afterward. Only claim what you can prove with tool output.`;
136
- const CRITIC_TOTAL_RESULT_BUDGET = 8000;
137
- const CRITIC_MIN_RESULT_CHARS = 500;
138
- const CRITIC_MAX_RESPONSE_LENGTH = 4000;
139
- const CRITIC_MAX_ARGS_LENGTH = 1000;
140
- const CRITIC_SYSTEM_PROMPT = `You are a verification agent for Bernard, a CLI AI assistant. Your role is to review the agent's work and verify its integrity.
141
-
142
- You will receive:
143
- 1. The user's original request
144
- 2. The agent's final text response
145
- 3. A log of actual tool calls made (tool name, arguments, results) — note that tool results, arguments, and the agent response may be truncated for context efficiency
146
-
147
- Your job:
148
- - Check if the agent's claims in its response are supported by actual tool call results.
149
- - Verify that tool calls were actually made for actions the agent claims to have performed.
150
- - Flag any claims not backed by tool evidence (e.g., "I created the file" but no shell/write tool call).
151
- - Flag any tool results that suggest failure but were reported as success.
152
- - Tool results and the agent response may be truncated for context efficiency. If a tool result appears cut off, do not treat the missing portion as evidence of failure. Only flag FAIL when there is positive evidence of failure (e.g., an error message visible in the output), not merely the absence of success confirmation in truncated output.
153
- - Check if the response addresses the user's original intent.
154
-
155
- Output format (plain text, concise):
156
- VERDICT: PASS | WARN | FAIL
157
- [1-3 sentence explanation]
158
- [If WARN/FAIL: specific issues found]
159
-
160
- Be strict but fair. Not every response needs tool calls — knowledge answers are fine. Focus on cases where the agent *claims* to have done something via tools.`;
161
- const CRITIC_MAX_RETRIES = 2;
162
146
  /**
163
147
  * Assembles the full system prompt including base instructions, memory context, and MCP status.
164
148
  * @internal Exported for testing only.
@@ -171,13 +155,7 @@ const CRITIC_MAX_RETRIES = 2;
171
155
  * @param specialistMatches - Pre-computed specialist match results for the current input
172
156
  */
173
157
  function buildSystemPrompt(config, memoryStore, mcpServerNames, ragResults, routineSummaries, specialistSummaries, specialistMatches) {
174
- const today = new Date().toLocaleDateString('en-US', {
175
- weekday: 'long',
176
- year: 'numeric',
177
- month: 'long',
178
- day: 'numeric',
179
- });
180
- let prompt = BASE_SYSTEM_PROMPT + `\n\nToday's date is ${today}.`;
158
+ let prompt = BASE_SYSTEM_PROMPT + `\n\nCurrent date and time: ${(0, datetime_js_1.formatCurrentDateTime)()}.`;
181
159
  prompt += `\nYou are running as provider: ${config.provider}, model: ${config.model}. The user can switch with /provider and /model.`;
182
160
  if (config.criticMode) {
183
161
  prompt += '\n\n' + CRITIC_MODE_PROMPT;
@@ -268,6 +246,8 @@ class Agent {
268
246
  routineStore;
269
247
  specialistStore;
270
248
  candidateStore;
249
+ stepLimitHitCount = 0;
250
+ lastStepLimitHit = false;
271
251
  constructor(config, toolOptions, memoryStore, mcpTools, mcpServerNames, alertContext, initialHistory, ragStore, routineStore, specialistStore, candidateStore) {
272
252
  this.config = config;
273
253
  this.toolOptions = toolOptions;
@@ -296,6 +276,12 @@ class Agent {
296
276
  abort() {
297
277
  this.abortController?.abort();
298
278
  }
279
+ /** Returns step limit hit info from last processInput, or null if limit wasn't hit. */
280
+ getStepLimitHit() {
281
+ if (!this.lastStepLimitHit)
282
+ return null;
283
+ return { currentLimit: this.config.maxSteps, hitCount: this.stepLimitHitCount };
284
+ }
299
285
  /** Attaches a spinner stats object that will be updated with token usage during generation. */
300
286
  setSpinnerStats(stats) {
301
287
  this.spinnerStats = stats;
@@ -313,13 +299,15 @@ class Agent {
313
299
  * @throws Error wrapping the underlying API error if generation fails for non-abort, non-overflow reasons
314
300
  */
315
301
  async processInput(userInput) {
316
- this.history.push({ role: 'user', content: userInput });
302
+ this.lastStepLimitHit = false;
303
+ const timestamped = (0, datetime_js_1.timestampUserMessage)(userInput);
304
+ this.history.push({ role: 'user', content: timestamped });
317
305
  this.abortController = new AbortController();
318
306
  this.lastStepPromptTokens = 0;
319
307
  this.lastRAGResults = [];
320
308
  try {
321
309
  // Check if context compression is needed
322
- const newMessageEstimate = Math.ceil(userInput.length / 4);
310
+ const newMessageEstimate = Math.ceil(timestamped.length / 4);
323
311
  if ((0, context_js_1.shouldCompress)(this.lastPromptTokens, newMessageEstimate, this.config.model, this.config.tokenWindow)) {
324
312
  (0, output_js_1.printInfo)('Compressing conversation context...');
325
313
  this.history = await (0, context_js_1.compressHistory)(this.history, this.config, this.ragStore);
@@ -378,7 +366,7 @@ class Agent {
378
366
  const callGenerateText = (messages) => (0, ai_1.generateText)({
379
367
  model: (0, index_js_1.getModel)(this.config.provider, this.config.model),
380
368
  tools,
381
- maxSteps: 20,
369
+ maxSteps: this.config.maxSteps,
382
370
  maxTokens: this.config.maxTokens,
383
371
  system: systemPrompt,
384
372
  messages: messages ?? this.history,
@@ -429,25 +417,74 @@ class Agent {
429
417
  throw apiErr;
430
418
  }
431
419
  }
420
+ // Auto-continue when the model hit the maxTokens limit mid-response
421
+ const MAX_CONTINUATIONS = 3;
422
+ let continuations = 0;
423
+ let continuationTokens = 0;
424
+ while (result.finishReason === 'length' && continuations < MAX_CONTINUATIONS) {
425
+ if (this.abortController?.signal.aborted)
426
+ break;
427
+ continuationTokens += result.usage?.completionTokens ?? 0;
428
+ continuations++;
429
+ (0, output_js_1.printWarning)(`Response truncated (hit ${this.config.maxTokens} token limit). Auto-continuing... (${continuations}/${MAX_CONTINUATIONS})`);
430
+ // Append partial response to history so continuation has context
431
+ const partialMessages = (0, context_js_1.truncateToolResults)(result.response.messages);
432
+ this.history.push(...partialMessages);
433
+ this.history.push({
434
+ role: 'user',
435
+ content: '[Your previous response was cut off. Please continue exactly where you left off.]',
436
+ });
437
+ // Restart spinner for the continuation call
438
+ if (this.spinnerStats) {
439
+ (0, output_js_1.startSpinner)(() => (0, output_js_1.buildSpinnerMessage)(this.spinnerStats));
440
+ }
441
+ result = await callGenerateText();
442
+ }
443
+ if (continuations > 0) {
444
+ const totalCompletionTokens = continuationTokens + (result.usage?.completionTokens ?? 0);
445
+ const recommended = Math.ceil((totalCompletionTokens * 1.25) / 1024) * 1024;
446
+ if (result.finishReason === 'length') {
447
+ (0, output_js_1.printWarning)(`Response still incomplete after ${MAX_CONTINUATIONS} continuations. ` +
448
+ `Increase the token limit: /options max-tokens ${recommended}`);
449
+ }
450
+ else {
451
+ (0, output_js_1.printInfo)(`Tip: Response needed ~${totalCompletionTokens} tokens (limit: ${this.config.maxTokens}). ` +
452
+ `To avoid future truncation: /options max-tokens ${recommended}`);
453
+ }
454
+ }
455
+ // Detect maxSteps exhaustion
456
+ if (result.finishReason === 'tool-calls' && result.steps.length >= this.config.maxSteps) {
457
+ this.lastStepLimitHit = true;
458
+ this.stepLimitHitCount++;
459
+ const msg = this.stepLimitHitCount >= 2
460
+ ? `Stopped at loop limit of ${this.config.maxSteps}. Use /options max-steps to adjust permanently.`
461
+ : `Stopped at loop limit of ${this.config.maxSteps}.`;
462
+ (0, output_js_1.printWarning)(msg);
463
+ }
432
464
  // Run critic verification if enabled and tool calls were made
433
- if (this.config.criticMode && !this.abortController?.signal.aborted) {
434
- let toolCallLog = this.extractToolCallLog(result.steps);
435
- if (toolCallLog.length > 0) {
465
+ if (this.config.criticMode &&
466
+ !this.abortController?.signal.aborted &&
467
+ !this.lastStepLimitHit) {
468
+ let toolLog = (0, critic_js_1.extractToolCallLog)(result.steps);
469
+ if (toolLog.length > 0) {
436
470
  let retryCount = 0;
437
- while (retryCount <= CRITIC_MAX_RETRIES) {
471
+ while (retryCount <= critic_js_1.CRITIC_MAX_RETRIES) {
438
472
  if (this.abortController?.signal.aborted)
439
473
  break;
440
- const criticResult = await this.runCritic(userInput, result.text, toolCallLog, retryCount > 0);
474
+ const criticResult = await (0, critic_js_1.runCritic)(this.config, userInput, result.text, toolLog, {
475
+ isRetry: retryCount > 0,
476
+ abortSignal: this.abortController?.signal,
477
+ });
441
478
  // null (error) or PASS — stop looping
442
479
  if (!criticResult || criticResult.verdict === 'PASS')
443
480
  break;
444
481
  // Exhausted retries — warn and stop
445
- if (retryCount >= CRITIC_MAX_RETRIES) {
482
+ if (retryCount >= critic_js_1.CRITIC_MAX_RETRIES) {
446
483
  (0, output_js_1.printInfo)('Critic still unsatisfied after maximum retries.');
447
484
  break;
448
485
  }
449
486
  retryCount++;
450
- (0, output_js_1.printCriticRetry)(retryCount, CRITIC_MAX_RETRIES);
487
+ (0, output_js_1.printCriticRetry)(retryCount, critic_js_1.CRITIC_MAX_RETRIES);
451
488
  // Push current attempt's messages + critic feedback into history before retrying
452
489
  try {
453
490
  const truncatedResultMessages = (0, context_js_1.truncateToolResults)(result.response.messages);
@@ -457,9 +494,9 @@ class Agent {
457
494
  content: `The critic agent reviewed your work and found issues:\n\nVERDICT: ${criticResult.verdict}\n${criticResult.explanation}\n\nPlease address these issues and try again.`,
458
495
  });
459
496
  result = await callGenerateText();
460
- toolCallLog = this.extractToolCallLog(result.steps);
497
+ toolLog = (0, critic_js_1.extractToolCallLog)(result.steps);
461
498
  // If no tool calls in retry, nothing more to verify
462
- if (toolCallLog.length === 0)
499
+ if (toolLog.length === 0)
463
500
  break;
464
501
  }
465
502
  catch (retryErr) {
@@ -488,84 +525,6 @@ class Agent {
488
525
  this.spinnerStats = null;
489
526
  }
490
527
  }
491
- /** Extracts a structured log of tool calls from generateText step results. */
492
- extractToolCallLog(steps) {
493
- const entries = [];
494
- for (const step of steps) {
495
- // AI SDK guarantees toolResults[i] corresponds to toolCalls[i] within each step
496
- for (let i = 0; i < step.toolCalls.length; i++) {
497
- const tc = step.toolCalls[i];
498
- const tr = step.toolResults[i];
499
- entries.push({
500
- toolName: tc.toolName,
501
- args: tc.args,
502
- result: tr?.result,
503
- });
504
- }
505
- }
506
- return entries;
507
- }
508
- /** Runs the critic agent to verify the main agent's response against actual tool calls. */
509
- async runCritic(userInput, responseText, toolCallLog, isRetry = false) {
510
- try {
511
- if (isRetry) {
512
- (0, output_js_1.printCriticReVerify)();
513
- }
514
- else {
515
- (0, output_js_1.printCriticStart)();
516
- }
517
- const perResultLimit = Math.max(CRITIC_MIN_RESULT_CHARS, Math.floor(CRITIC_TOTAL_RESULT_BUDGET / toolCallLog.length));
518
- const truncatedLog = toolCallLog.map((entry) => {
519
- const raw = typeof entry.result === 'string' ? entry.result : JSON.stringify(entry.result ?? null);
520
- const truncated = raw.length > perResultLimit ? raw.slice(0, perResultLimit) + '...' : raw;
521
- return {
522
- toolName: entry.toolName,
523
- args: entry.args,
524
- result: truncated,
525
- };
526
- });
527
- const truncatedResponse = responseText.length > CRITIC_MAX_RESPONSE_LENGTH
528
- ? responseText.slice(0, CRITIC_MAX_RESPONSE_LENGTH) + '\n... (truncated)'
529
- : responseText;
530
- const criticMessage = `## Original User Request
531
- ${userInput}
532
-
533
- ## Agent Response
534
- ${truncatedResponse}
535
-
536
- ## Tool Call Log (${truncatedLog.length} calls)
537
- ${truncatedLog
538
- .map((e, i) => {
539
- const argsStr = JSON.stringify(e.args);
540
- const truncatedArgs = argsStr.length > CRITIC_MAX_ARGS_LENGTH
541
- ? argsStr.slice(0, CRITIC_MAX_ARGS_LENGTH) + '...'
542
- : argsStr;
543
- return `${i + 1}. ${e.toolName}(${truncatedArgs})\n Result: ${e.result}`;
544
- })
545
- .join('\n\n')}`;
546
- const result = await (0, ai_1.generateText)({
547
- model: (0, index_js_1.getModel)(this.config.provider, this.config.model),
548
- system: CRITIC_SYSTEM_PROMPT,
549
- messages: [{ role: 'user', content: criticMessage }],
550
- maxSteps: 1,
551
- maxTokens: 1024,
552
- abortSignal: this.abortController?.signal,
553
- });
554
- if (result.text) {
555
- const parsed = (0, output_js_1.parseCriticVerdict)(result.text);
556
- (0, output_js_1.printCriticVerdict)(result.text);
557
- return {
558
- verdict: parsed.verdict,
559
- explanation: parsed.explanation,
560
- };
561
- }
562
- return null;
563
- }
564
- catch (err) {
565
- (0, logger_js_1.debugLog)('agent:critic:error', err instanceof Error ? err.message : String(err));
566
- return null;
567
- }
568
- }
569
528
  /** Compresses conversation history in-place, returning token usage stats. */
570
529
  async compactHistory() {
571
530
  const tokensBefore = (0, context_js_1.estimateHistoryTokens)(this.history);
@@ -584,6 +543,8 @@ ${truncatedLog
584
543
  this.memoryStore.clearScratch();
585
544
  this.previousRAGFacts = new Set();
586
545
  this.lastRAGResults = [];
546
+ this.stepLimitHitCount = 0;
547
+ this.lastStepLimitHit = false;
587
548
  }
588
549
  }
589
550
  exports.Agent = Agent;