@bluecopa/harness 0.1.0-snapshot.59 → 0.1.0-snapshot.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (81)
  1. package/README.md +117 -212
  2. package/package.json +1 -2
  3. package/src/agent/create-agent.ts +17 -49
  4. package/src/agent/types.ts +2 -23
  5. package/src/interfaces/tool-provider.ts +0 -2
  6. package/src/loop/vercel-agent-loop.ts +18 -115
  7. package/src/skills/skill-router.ts +6 -12
  8. package/tests/integration/agent-skill-default-from-sandbox.spec.ts +2 -3
  9. package/tests/unit/structured-messages.spec.ts +1 -1
  10. package/vitest.config.ts +1 -1
  11. package/src/arc/agent-runner.ts +0 -697
  12. package/src/arc/arc-loop.ts +0 -818
  13. package/src/arc/arc-types.ts +0 -115
  14. package/src/arc/bridge-tools.ts +0 -170
  15. package/src/arc/bridged-tool-provider.ts +0 -80
  16. package/src/arc/consolidation.ts +0 -118
  17. package/src/arc/context-window.ts +0 -267
  18. package/src/arc/create-arc-agent.ts +0 -99
  19. package/src/arc/debug.ts +0 -62
  20. package/src/arc/episode-compressor.ts +0 -225
  21. package/src/arc/memory-manager.ts +0 -245
  22. package/src/arc/message-convert.ts +0 -111
  23. package/src/arc/object-store/fs-object-store.ts +0 -60
  24. package/src/arc/object-store/memory-object-store.ts +0 -41
  25. package/src/arc/object-store/object-store.ts +0 -12
  26. package/src/arc/profile-builder.ts +0 -157
  27. package/src/arc/resilience/bulkhead.ts +0 -110
  28. package/src/arc/resilience/circuit-breaker.ts +0 -112
  29. package/src/arc/resilience/fallback.ts +0 -27
  30. package/src/arc/resilience/index.ts +0 -21
  31. package/src/arc/resilience/pipeline.ts +0 -103
  32. package/src/arc/resilience/retry.ts +0 -90
  33. package/src/arc/resilience/timeout.ts +0 -60
  34. package/src/arc/resilience/types.ts +0 -71
  35. package/src/arc/sig.ts +0 -115
  36. package/src/arc/skill-resolver.ts +0 -78
  37. package/src/arc/stores/episode-store.ts +0 -120
  38. package/src/arc/stores/long-term-store.ts +0 -86
  39. package/src/arc/stores/rxdb-setup.ts +0 -113
  40. package/src/arc/stores/session-memo-store.ts +0 -58
  41. package/src/arc/tools.ts +0 -67
  42. package/src/arc/types.ts +0 -333
  43. package/src/arc/utils.ts +0 -37
  44. package/testing/index.ts +0 -22
  45. package/testing/scenario-replay.ts +0 -209
  46. package/testing/scenario-types.ts +0 -38
  47. package/testing/scripted-llm.ts +0 -230
  48. package/tests/arc/channel.test.ts +0 -170
  49. package/tests/arc/context-window.test.ts +0 -396
  50. package/tests/arc/e2e.test.ts +0 -353
  51. package/tests/arc/error-paths.test.ts +0 -402
  52. package/tests/arc/live-integration.test.ts +0 -357
  53. package/tests/arc/memory-manager.test.ts +0 -384
  54. package/tests/arc/process-interleaving.test.ts +0 -432
  55. package/tests/arc/process-profiles.test.ts +0 -364
  56. package/tests/arc/resilience-integration.test.ts +0 -381
  57. package/tests/arc/resilience.test.ts +0 -575
  58. package/tests/arc/scenario-driven.test.ts +0 -297
  59. package/tests/arc/tool-dispatch.test.ts +0 -340
  60. package/tests/arc/wasm-pbt.test.ts +0 -104
  61. package/verify/Cargo.lock +0 -637
  62. package/verify/Cargo.toml +0 -24
  63. package/verify/src/lib.rs +0 -5
  64. package/verify/src/main.rs +0 -165
  65. package/verify/src/model/context.rs +0 -100
  66. package/verify/src/model/mod.rs +0 -6
  67. package/verify/src/model/orchestrator.rs +0 -371
  68. package/verify/src/model/process.rs +0 -140
  69. package/verify/src/model/types.rs +0 -273
  70. package/verify/src/properties/liveness.rs +0 -32
  71. package/verify/src/properties/mod.rs +0 -4
  72. package/verify/src/properties/safety.rs +0 -78
  73. package/verify/src/trace/event.rs +0 -155
  74. package/verify/src/trace/mod.rs +0 -2
  75. package/verify/src/trace/validator.rs +0 -367
  76. package/verify/src/wasm/mod.rs +0 -3
  77. package/verify/src/wasm/scenario_generator.rs +0 -400
  78. package/verify/src/wasm/types.rs +0 -104
  79. package/verify/src/wasm/wasm_validator.rs +0 -107
  80. package/verify/tests/model_check.rs +0 -49
  81. package/verify/tests/trace_validation.rs +0 -147
package/README.md CHANGED
@@ -2,17 +2,9 @@
2
2
 
3
3
  Provider-agnostic TypeScript agent framework with Claude-code-compatible tool semantics.
4
4
 
5
- Published on npm as **`@bluecopa/harness`**.
5
+ The harness provides the core loop that drives an AI agent: send messages to an LLM, execute the tool calls it returns, feed results back, and repeat until the LLM produces a final text response.
6
6
 
7
- Two execution modes: a simple single-agent loop (`createAgent` + `VercelAgentLoop`) and a process-based orchestrator (`ArcLoop`) that dispatches parallel processes with context management, memory, and resilience.
8
-
9
- ## Install
10
-
11
- ```bash
12
- pnpm add @bluecopa/harness
13
- ```
14
-
15
- ## Development
7
+ ## Quickstart
16
8
 
17
9
  ```bash
18
10
  pnpm install
@@ -21,11 +13,9 @@ pnpm test
21
13
 
22
14
  ## Architecture
23
15
 
24
- ### Single-Agent Loop
25
-
26
16
  ```
27
17
  ┌──────────────┐ ┌──────────────┐ ┌──────────────────┐
28
- │ createAgent │────►│ AgentLoop │────►│ LLM (Claude) │
18
+ │ createAgent │────▶│ AgentLoop │────▶│ LLM (Claude) │
29
19
  │ (turn loop) │ │ (nextAction)│ │ │
30
20
  └──────┬───────┘ └──────────────┘ └──────────────────┘
31
21
  │ │
@@ -37,82 +27,20 @@ pnpm test
37
27
  └──────────────┘
38
28
  ```
39
29
 
40
- ### ArcLoop Orchestrator
41
-
42
- ```
43
- Orchestrator (ArcLoop Opus 4.6 by default)
44
- │ tools: Thread, Check, Cancel, Remember, ReadEpisode
45
-
46
- │ Turn 1 (parallel):
47
- ├──► Process 0 ("read auth", model=fast) ─┐
48
- ├──► Process 1 ("read routes", model=fast) ─┼──► Episodes
49
- ├──► Process 2 ("read tests", model=fast) ─┘
50
-
51
- │ Turn 2 (dispatch dependent work):
52
- ├──► Thread("fix bug", context=[ep0,ep1,ep2]) ──► Episode
53
-
54
- │ Turn 3 (parallel):
55
- ├──► Thread("run tests", context=[ep3]) ─┐
56
- ├──► Thread("update docs", context=[ep3]) ─┘
57
-
58
- └──► Final text response
59
- ```
60
-
61
- Full architecture doc: [`docs/arc.md`](../docs/arc.md)
62
-
63
- ---
64
-
65
- ## ToolProvider
66
-
67
- The contract for tool execution. All agent modes use this interface.
68
-
69
- ```typescript
70
- interface ToolProvider {
71
- bash(command: string, options?: BashOptions): Promise<ToolResult>;
72
- readFile(path: string, options?: ReadOptions): Promise<ToolResult>;
73
- writeFile(path: string, content: string): Promise<ToolResult>;
74
- editFile(path: string, oldText: string, newText: string): Promise<ToolResult>;
75
- glob(pattern: string, options?: GlobOptions): Promise<ToolResult>;
76
- grep(pattern: string, path?: string, options?: GrepOptions): Promise<ToolResult>;
77
- webFetch?(options: WebFetchOptions): Promise<ToolResult>;
78
- webSearch?(query: string): Promise<ToolResult>;
79
- capabilities(): ToolProviderCapabilities;
80
- }
81
-
82
- interface ToolResult {
83
- success: boolean;
84
- output: string;
85
- error?: string;
86
- }
87
- ```
88
-
89
- Built-in implementations:
30
+ 1. `createAgent` drives a deterministic step loop
31
+ 2. Each step calls `loop.nextAction(messages)` to get the LLM's decision
32
+ 3. If it's a tool call, the harness executes it via `ToolProvider` and appends the result
33
+ 4. If it's a final action, the loop ends and returns the result
90
34
 
91
- | Provider | Description |
92
- |----------|-------------|
93
- | `LocalToolProvider` | Runs tools on the local filesystem |
94
- | `E2BToolProvider` | Routes tools to a sandbox VM via `ControlPlaneE2BExecutor` |
95
- | `CompositeToolProvider` | Combines multiple providers (e.g. local filesystem + sandbox bash) |
35
+ ## Using with the sandbox
96
36
 
97
- ## SandboxProvider
37
+ The most common setup connects the harness to a running sandbox service via `ControlPlaneE2BExecutor`:
98
38
 
99
- Higher-level sandbox operations beyond basic tool calls:
100
-
101
- ```typescript
102
- interface SandboxProvider {
103
- exec(command: string, options?: SandboxExecOptions): Promise<SandboxExecResult>;
104
- readSandboxFile(path: string): Promise<SandboxFileBlob>;
105
- writeSandboxFile(path: string, content: SandboxFileBlob): Promise<void>;
106
- }
107
- ```
108
-
109
- Used by `SkillManager` for executing skill scripts in isolated VMs.
110
-
111
- ## Connecting to a Sandbox
112
-
113
- ```typescript
114
- import { ControlPlaneE2BExecutor } from './src/providers/control-plane-e2b-executor';
39
+ ```ts
40
+ import { createAgent } from './src/agent/create-agent';
115
41
  import { E2BToolProvider } from './src/providers/e2b-tool-provider';
42
+ import { ControlPlaneE2BExecutor } from './src/providers/control-plane-e2b-executor';
43
+ import { VercelAgentLoop } from './src/loop/vercel-agent-loop';
116
44
 
117
45
  // Connect to sandbox service
118
46
  const executor = new ControlPlaneE2BExecutor({
@@ -122,174 +50,155 @@ const executor = new ControlPlaneE2BExecutor({
122
50
  });
123
51
  await executor.initialize(); // creates a Firecracker VM
124
52
 
125
- const toolProvider = new E2BToolProvider(executor);
53
+ // Build and run the agent
54
+ const agent = createAgent({
55
+ toolProvider: new E2BToolProvider(executor),
56
+ loop: new VercelAgentLoop(), // needs ANTHROPIC_API_KEY
57
+ });
126
58
 
127
- // ... use with createAgent or ArcLoop
59
+ const result = await agent.run('create a bar chart of sales data');
60
+ console.log(result.output); // LLM's final response
61
+ console.log(result.steps); // number of tool steps
128
62
 
129
- await executor.destroy(); // tears down the VM
63
+ await executor.destroy(); // tears down the VM
130
64
  ```
131
65
 
132
- From environment variables: `ControlPlaneE2BExecutor.fromEnv()` reads `SAMYX_BASE_URL` and `SAMYX_API_KEY`.
66
+ For a complete working example, see [`examples/chat-assistant/src/chat.ts`](../examples/chat-assistant/src/chat.ts).
67
+
68
+ ### From environment variables
133
69
 
134
- ---
70
+ `ControlPlaneE2BExecutor.fromEnv()` reads `SAMYX_BASE_URL` and `SAMYX_API_KEY` automatically:
135
71
 
136
- ## Single-Agent Mode (`createAgent`)
72
+ ```ts
73
+ const executor = ControlPlaneE2BExecutor.fromEnv();
74
+ ```
137
75
 
138
- For simple tasks that don't need orchestration:
76
+ ## Using locally (no sandbox)
139
77
 
140
- ```typescript
78
+ For development without a sandbox service, use `LocalToolProvider` which runs tools on the local machine:
79
+
80
+ ```ts
141
81
  import { createAgent } from './src/agent/create-agent';
142
82
  import { LocalToolProvider } from './src/providers/local-tool-provider';
143
83
 
144
84
  const agent = createAgent({
145
85
  toolProvider: new LocalToolProvider(process.cwd()),
146
- loop: new VercelAgentLoop(), // needs ANTHROPIC_API_KEY
86
+ loop: new VercelAgentLoop(),
147
87
  });
148
88
 
149
89
  const result = await agent.run('list all TypeScript files');
150
- console.log(result.output);
151
90
  ```
152
91
 
153
- ### Configuration
92
+ ## Key modules
93
+
94
+ ### Agent creation (`src/agent/create-agent.ts`)
154
95
 
155
- | Option | Type | Default | Description |
156
- |--------|------|---------|-------------|
157
- | `toolProvider` | `ToolProvider` | required | Executes tool calls |
158
- | `loop` | `AgentLoop` | `VercelAgentLoop` | LLM decision loop |
159
- | `sandboxProvider` | `SandboxProvider` | — | Higher-level sandbox operations |
160
- | `maxSteps` | `number` | 30 | Max tool steps per run |
161
- | `telemetry` | `HarnessTelemetry` | — | OpenTelemetry-style tracing |
162
- | `skillIndexPath` | `string` | — | Path to skill index JSON for routing |
96
+ `createAgent(options)` returns an agent with a `.run(prompt, options?)` method. Options:
163
97
 
164
- ### VercelAgentLoop
98
+ | Option | Type | Description |
99
+ |--------|------|-------------|
100
+ | `toolProvider` | `ToolProvider` | Required. Executes tool calls |
101
+ | `loop` | `AgentLoop` | LLM decision loop (default: `VercelAgentLoop`) |
102
+ | `sandboxProvider` | `SandboxProvider` | Optional. Higher-level sandbox ops (file download, exec with env) |
103
+ | `maxSteps` | `number` | Max tool steps per run (default: 30) |
104
+ | `telemetry` | `HarnessTelemetry` | Optional. OpenTelemetry-style tracing |
105
+ | `skillIndexPath` | `string` | Optional. Path to skill index JSON |
165
106
 
166
- Calls Claude via the Vercel AI SDK. Supports parallel tool calls and configurable system prompt.
107
+ ### Agent loop (`src/loop/vercel-agent-loop.ts`)
167
108
 
168
- ```typescript
109
+ `VercelAgentLoop` calls Claude via the Vercel AI SDK. It supports:
110
+ - Parallel tool calls (returns `ToolBatchAction` when the LLM requests multiple tools at once)
111
+ - Configurable system prompt
112
+ - Model selection via `HARNESS_MODEL` env var (default: `claude-sonnet-4-5`)
113
+
114
+ ```ts
169
115
  const loop = new VercelAgentLoop({
170
116
  systemPrompt: 'You are a helpful coding assistant.',
171
- model: 'claude-sonnet-4-5', // or HARNESS_MODEL env var
172
117
  });
173
118
  ```
174
119
 
175
- ### LCMToolLoop
120
+ ### Tool provider (`src/interfaces/tool-provider.ts`)
176
121
 
177
- Wraps another loop to add Lossless Context Management and optional REPL orchestration:
122
+ The contract for tool execution:
178
123
 
179
- ```typescript
180
- import { LCMToolLoop } from './src/loop/lcm-tool-loop';
181
- import { VercelAgentLoop } from './src/loop/vercel-agent-loop';
124
+ ```ts
125
+ interface ToolProvider {
126
+ bash(command: string, options?: BashOptions): Promise<ToolResult>;
127
+ readFile(path: string, options?: ReadOptions): Promise<ToolResult>;
128
+ writeFile(path: string, content: string): Promise<ToolResult>;
129
+ editFile(path: string, oldText: string, newText: string): Promise<ToolResult>;
130
+ glob(pattern: string, options?: GlobOptions): Promise<ToolResult>;
131
+ grep(pattern: string, path?: string, options?: GrepOptions): Promise<ToolResult>;
132
+ webFetch?(options: WebFetchOptions): Promise<ToolResult>;
133
+ webSearch?(query: string): Promise<ToolResult>;
134
+ capabilities(): ToolProviderCapabilities;
135
+ }
182
136
 
183
- const loop = new LCMToolLoop({
184
- innerLoop: new VercelAgentLoop(),
185
- toolProvider: mySandboxProvider,
186
- enableRepl: true, // default: true
187
- bridgeDir: '/var/run/bridge',
188
- onActivity: (entry) => console.log(entry),
189
- onLlmRequest: async (prompt) => callLLM(prompt),
190
- onWebFetchRequest: async (url) => fetch(url),
191
- });
137
+ interface ToolResult {
138
+ success: boolean;
139
+ output: string;
140
+ error?: string;
141
+ }
192
142
  ```
193
143
 
194
- **Standard mode**: Lossless context trimming — the LLM always sees a coherent, budget-fitting view of the full conversation.
195
-
196
- **REPL mode**: When the LLM returns a Bash action with the REPL marker, the loop writes a Python script into the sandbox, injects the bridge module, runs the script, and polls for sub-requests (LLM, web_fetch, ask_user) that the harness fulfills.
197
-
198
- ---
144
+ Built-in implementations:
199
145
 
200
- ## ArcLoop (Orchestrator Mode)
146
+ | Provider | Description |
147
+ |----------|-------------|
148
+ | `LocalToolProvider` | Runs tools on the local filesystem |
149
+ | `E2BToolProvider` | Routes tools to an E2B-compatible executor over HTTP |
150
+ | `CompositeToolProvider` | Combines multiple providers (e.g. sandbox + web) |
201
151
 
202
- For complex tasks that benefit from parallel processes, context management, and memory:
152
+ ### Action types (`src/agent/types.ts`)
203
153
 
204
- ```typescript
205
- import { createArcAgent } from './src/arc/create-arc-agent';
154
+ The LLM returns one of these action types each turn:
206
155
 
207
- const agent = await createArcAgent({
208
- toolProvider: myToolProvider,
209
- episodeStore: myEpisodeStore, // required
210
- sessionMemoStore: mySessionMemoStore, // required
211
- longTermStore: myLongTermStore, // required
212
- taskId: 'task-1',
213
- sessionId: 'session-1',
214
- });
156
+ ```ts
157
+ // Single tool call
158
+ interface ToolCallAction {
159
+ type: 'tool';
160
+ name: 'Bash' | 'Read' | 'Write' | 'Edit' | 'Glob' | 'Grep' | ...;
161
+ args: Record<string, unknown>;
162
+ }
215
163
 
216
- // Streaming
217
- for await (const event of agent.stream(messages, signal)) {
218
- if (event.type === 'text_delta') process.stdout.write(event.text);
219
- if (event.type === 'process_dispatched') console.log(` → ${event.action}`);
220
- if (event.type === 'done') console.log(`Done in ${event.stats.durationMs}ms`);
164
+ // Multiple independent tool calls (executed in parallel)
165
+ interface ToolBatchAction {
166
+ type: 'tool_batch';
167
+ calls: ToolCallAction[];
221
168
  }
222
169
 
223
- // Non-streaming
224
- const result = await agent.run(messages, signal);
170
+ // Final text response (ends the loop)
171
+ interface FinalAction {
172
+ type: 'final';
173
+ content: string;
174
+ }
225
175
  ```
226
176
 
227
- ### ArcLoopConfig
228
-
229
- | Option | Type | Default | Description |
230
- |--------|------|---------|-------------|
231
- | `model` | `string` | `'claude-opus-4-6'` | Orchestrator model (ID or tier name) |
232
- | `modelMap` | `Record<ModelTier, string>` | haiku/sonnet/opus | Maps fast/medium/strong to model IDs |
233
- | `apiKey` | `string` | — | Anthropic API key |
234
- | `systemPrompt` | `string` | built-in | Custom orchestrator system prompt |
235
- | `maxTurns` | `number` | 30 | Max orchestrator turns |
236
- | `processTimeout` | `number` | 120_000 | Per-process timeout (ms) |
237
- | `processMaxSteps` | `number` | 20 | Per-process max tool steps |
238
- | `contextWindowSize` | `number` | 200_000 | Context window in tokens |
239
- | `outputReserve` | `number` | 20_000 | Tokens reserved for output |
240
- | `autoMemory` | `boolean` | true | Auto-detect patterns from episodes |
241
- | `episodeStore` | `EpisodeStore` | required | Stores episode summaries + traces |
242
- | `sessionMemoStore` | `SessionMemoStore` | required | Stores session memos |
243
- | `longTermStore` | `LongTermStore` | required | Stores long-term memories |
244
- | `taskId` | `string` | required | Task identifier |
245
- | `sessionId` | `string` | required | Session identifier |
246
- | `toolProvider` | `ToolProvider` | required | Tool execution |
247
- | `processTools` | `Record<string, AnyTool>` | builtinTools | Tools available inside processes |
248
- | `extraOrchestratorTools` | `Record<string, AnyTool>` | — | Custom orchestrator tools |
249
- | `onOrchestratorTool` | `function` | — | Handler for custom orchestrator tools |
250
- | `resilience` | `ResiliencePolicy` | — | Composable resilience pipeline |
251
- | `traceWriter` | `function` | — | Callback for trace event emission |
252
-
253
- ### Resilience
254
-
255
- ```typescript
256
- import { resilience } from './src/arc/resilience';
257
-
258
- const pipeline = resilience()
259
- .retry({ maxRetries: 2, baseDelay: 1000 })
260
- .timeout({ durationMs: 30_000 })
261
- .circuitBreaker({ failureThreshold: 5 })
262
- .build();
263
-
264
- const agent = await createArcAgent({
265
- // ...config
266
- resilience: pipeline,
267
- });
268
- ```
177
+ ### LCM tool loop (`src/loop/lcm-tool-loop.ts`)
269
178
 
270
- ### Trace Emission
179
+ `LCMToolLoop` wraps another loop to add LCM-based tool routing, REPL script execution, and bridge-based tool dispatch. Used in the chat-assistant example.
271
180
 
272
- ```typescript
273
- const traces: TraceEvent[] = [];
274
- const agent = await createArcAgent({
275
- // ...config
276
- traceWriter: (event) => traces.push(event),
277
- });
181
+ ### Sandbox provider (`src/interfaces/sandbox-provider.ts`)
182
+
183
+ Higher-level sandbox operations beyond basic tool calls:
184
+
185
+ ```ts
186
+ interface SandboxProvider {
187
+ exec(command: string, options?: SandboxExecOptions): Promise<SandboxExecResult>;
188
+ readSandboxFile(path: string): Promise<SandboxFileBlob>;
189
+ writeSandboxFile(path: string, content: SandboxFileBlob): Promise<void>;
190
+ }
278
191
  ```
279
192
 
280
- Traces can be validated against the formal model: `cd verify && cargo run -- trace file.ndjson`
193
+ ### Observability (`src/observability/otel.ts`)
281
194
 
282
- ---
195
+ `HarnessTelemetry` provides OpenTelemetry-style spans and metrics for agent runs.
283
196
 
284
- ## Package Layout
197
+ ## Package layout
285
198
 
286
199
  ```
287
200
  src/
288
201
  ├── agent/ # createAgent, step executor, types
289
- ├── arc/ # ArcLoop orchestrator, processes, memory, resilience
290
- │ ├── resilience/ # Retry, circuit breaker, timeout, bulkhead, fallback
291
- │ ├── stores/ # RxDB + in-memory store implementations
292
- │ └── object-store/ # Pluggable cloud sync (fs, memory)
293
202
  ├── interfaces/ # ToolProvider, SandboxProvider, AgentLoop contracts
294
203
  ├── loop/ # VercelAgentLoop, LCMToolLoop
295
204
  ├── providers/ # LocalToolProvider, E2BToolProvider, ControlPlaneE2BExecutor
@@ -297,20 +206,16 @@ src/
297
206
  ├── hooks/ # Pre/post tool call hooks
298
207
  ├── permissions/ # Tool permission checks
299
208
  ├── sessions/ # Session persistence
300
- ├── subagents/ # Subagent spawning
209
+ ├── subagents/ # Subagent spawning and task tools
301
210
  ├── skills/ # Skill index, routing, and management
302
211
  ├── optimization/ # Benchmark runner
303
212
  └── observability/ # OpenTelemetry integration
304
-
305
- verify/ # Rust formal verification (Stateright model checker)
306
- testing/ # Adversarial scenario replay harness
307
- tests/ # Vitest test suite
308
213
  ```
309
214
 
310
215
  ## Documentation
311
216
 
312
- - [Arc architecture](../docs/arc.md) — process model, context window, memory, resilience, verification
313
- - [Testing](../docs/testing.md) — test layers, running tests, writing new tests
314
- - [Sandbox setup](../docs/PUBLIC_SANDBOX.md) — deploying the sandbox service
315
- - [Release process](../docs/RELEASE.md) — versioning and publishing
316
- - [Example](../examples/chat-assistant/src/chat.ts) — complete working chat assistant
217
+ - Provider guide: `docs/guides/providers.md`
218
+ - Skills guide: `docs/guides/skills.md`
219
+ - Observability guide: `docs/guides/observability.md`
220
+ - Release process: `../docs/RELEASE.md`
221
+ - Full example: [`../examples/chat-assistant/src/chat.ts`](../examples/chat-assistant/src/chat.ts)
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@bluecopa/harness",
3
- "version": "0.1.0-snapshot.59",
3
+ "version": "0.1.0-snapshot.6",
4
4
  "description": "Provider-agnostic TypeScript agent framework",
5
5
  "license": "UNLICENSED",
6
6
  "scripts": {
@@ -10,7 +10,6 @@
10
10
  "dependencies": {
11
11
  "@ai-sdk/anthropic": "^3.0.48",
12
12
  "ai": "^6.0.101",
13
- "rxdb": "^15.39.0",
14
13
  "zod": "^4.1.11"
15
14
  },
16
15
  "devDependencies": {
package/src/agent/create-agent.ts CHANGED
@@ -7,14 +7,12 @@ import type { HarnessTelemetry } from '../observability/otel';
7
7
  import { HookRunner } from '../hooks/hook-runner';
8
8
  import { PermissionManager } from '../permissions/permission-manager';
9
9
  import { VercelAgentLoop } from '../loop/vercel-agent-loop';
10
- export type { SystemPromptBlock, VercelAgentLoopConfig } from '../loop/vercel-agent-loop';
11
- export type { PrepareStepContext, PrepareStepResult } from './types';
12
10
  import { SkillManager } from '../skills/skill-manager';
13
11
  import { SkillRouter } from '../skills/skill-router';
14
12
  import type { SkillSummary } from '../skills/skill-types';
15
13
  import { SingleFlightStepExecutor } from './step-executor';
16
- import type { AgentAction, AgentLoop, AgentMessage, AgentRunResult, AgentStreamEvent, StepUsage, ToolCallAction, ToolBatchAction, ToolCallInfo, ToolResultInfo } from './types';
17
- export type { AgentAction, AgentLoop, AgentMessage, AgentRunResult, AgentStreamEvent, StepUsage, ToolCallAction, ToolBatchAction, ToolCallInfo, ToolResultInfo };
14
+ import type { AgentAction, AgentLoop, AgentMessage, AgentRunResult, AgentStreamEvent, ToolCallAction, ToolBatchAction, ToolCallInfo, ToolResultInfo } from './types';
15
+ export type { AgentAction, AgentLoop, AgentMessage, AgentRunResult, AgentStreamEvent, ToolCallAction, ToolBatchAction, ToolCallInfo, ToolResultInfo };
18
16
  export { HookRunner } from '../hooks/hook-runner';
19
17
  export { PermissionManager } from '../permissions/permission-manager';
20
18
  export type { PermissionMode, PermissionResolver, PermissionRequest } from '../permissions/permission-manager';
@@ -39,8 +37,6 @@ export interface AgentRuntime {
39
37
  /** Custom tool executor. Called for every tool action. Return null to fall through to built-in dispatch.
40
38
  * When hookRunner/permissionManager are provided on the runtime, they are automatically applied before/after this callback — no manual wiring needed. */
41
39
  executeToolAction?: (action: ToolCallAction) => Promise<ToolResult | null>;
42
- /** Progress callback fired before/after each tool call during run(). */
43
- onToolProgress?: (event: { type: 'tool_start'; name: string; args: Record<string, unknown> } | { type: 'tool_end'; name: string; success: boolean; durationMs: number }) => void;
44
40
  }
45
41
 
46
42
  /**
@@ -222,21 +218,9 @@ function toStreamResult(r: ToolResult): { success: boolean; output: string; erro
222
218
  return base;
223
219
  }
224
220
 
225
- /** Build the text the LLM sees for a tool result.
226
- * Success: prefer modelOutput (compact) over raw output.
227
- * Failure: prefer modelOutput (structured fix guidance) → error → output → generic fallback.
228
- * This ensures custom tools can feed actionable error feedback to the model via modelOutput
229
- * so the agent can self-correct instead of stopping with "unknown failure". */
230
- function resultTextForLLM(result: ToolResult): string {
231
- if (result.success) return result.modelOutput ?? result.output;
232
- return result.modelOutput ?? result.error ?? result.output ?? 'unknown failure';
233
- }
234
-
235
- /** Format content string for LLM context. Uses modelOutput (compact summary) when available. */
221
+ /** Format a display-friendly content string for tool results (used in content field). */
236
222
  function formatToolResultContent(call: ToolCallAction, result: ToolResult): string {
237
- const content = result.success
238
- ? resultTextForLLM(result)
239
- : `ERROR: ${resultTextForLLM(result)}`;
223
+ const content = result.success ? result.output : `ERROR: ${result.error ?? 'unknown failure'}`;
240
224
  switch (call.name) {
241
225
  case 'Write':
242
226
  return `Write(${call.args.path}): ${result.success ? 'ok' : content}`;
@@ -531,11 +515,6 @@ export function createAgent(runtime: AgentRuntime) {
531
515
  ? { nextAction: runtime.nextAction }
532
516
  : new VercelAgentLoop());
533
517
 
534
- /** Read lastUsage from the loop if it's a VercelAgentLoop. */
535
- function getLoopUsage(): StepUsage | undefined {
536
- return loop instanceof VercelAgentLoop ? loop.lastUsage : undefined;
537
- }
538
-
539
518
  async function resolveSkillContext(prompt: string): Promise<string> {
540
519
  if (!skillManager || !skillIndexPath) return '';
541
520
 
@@ -617,18 +596,14 @@ export function createAgent(runtime: AgentRuntime) {
617
596
 
618
597
  // Execute valid calls via batch (sequential sandbox ops) or parallel fallback
619
598
  if (validCalls.length > 0) {
620
- for (const c of validCalls) runtime.onToolProgress?.({ type: 'tool_start', name: c.name, args: c.args });
621
- const batchStart = Date.now();
622
599
  const results = await executeBatch(validCalls, runtime.toolProvider, runtime);
623
- const batchMs = Date.now() - batchStart;
624
600
  for (let i = 0; i < validCalls.length; i++) {
625
601
  const call = validCalls[i]!;
626
602
  const r = results[i]!;
627
- runtime.onToolProgress?.({ type: 'tool_end', name: call.name, success: r.success, durationMs: batchMs });
628
603
  if (!r.success) {
629
604
  recordAgentError(runtime.telemetry);
630
605
  }
631
- const resultText = r.success ? resultTextForLLM(r) : `ERROR: ${resultTextForLLM(r)}`;
606
+ const resultText = r.success ? r.output : `ERROR: ${r.error ?? 'unknown failure'}`;
632
607
  messages.push({
633
608
  role: 'tool',
634
609
  content: formatToolResultContent(call, r),
@@ -684,8 +659,6 @@ export function createAgent(runtime: AgentRuntime) {
684
659
  } else {
685
660
  consecutiveInvalid = 0;
686
661
  }
687
- runtime.onToolProgress?.({ type: 'tool_start', name: action.name, args: action.args });
688
- const singleStart = Date.now();
689
662
  const result = validationError
690
663
  ? ({ success: false, output: '', error: validationError } as ToolResult)
691
664
  : await executor.run(async () => {
@@ -699,11 +672,10 @@ export function createAgent(runtime: AgentRuntime) {
699
672
  };
700
673
  }
701
674
  });
702
- runtime.onToolProgress?.({ type: 'tool_end', name: action.name, success: result.success, durationMs: Date.now() - singleStart });
703
675
  if (!result.success) {
704
676
  recordAgentError(runtime.telemetry);
705
677
  }
706
- const singleResultText = result.success ? resultTextForLLM(result) : `ERROR: ${resultTextForLLM(result)}`;
678
+ const singleResultText = result.success ? result.output : `ERROR: ${result.error ?? 'unknown failure'}`;
707
679
  messages.push({
708
680
  role: 'tool',
709
681
  content: formatToolResultContent(action, result),
@@ -746,7 +718,8 @@ export function createAgent(runtime: AgentRuntime) {
746
718
  if (event.type === 'text_delta') {
747
719
  finalText += event.text;
748
720
  yield event;
749
- } else if (event.type === 'tool_start') {
721
+ }
722
+ if (event.type === 'tool_start') {
750
723
  pendingTools.push({
751
724
  type: 'tool',
752
725
  name: event.name,
@@ -754,18 +727,13 @@ export function createAgent(runtime: AgentRuntime) {
754
727
  ...(event.toolCallId != null ? { toolCallId: event.toolCallId } : {}),
755
728
  });
756
729
  yield event;
757
- } else {
758
- // Forward all other events (tool_end, step_start, step_end, done)
759
- // from self-managing loops like ArcLoop
760
- yield event;
761
- if (event.type === 'done') return;
762
730
  }
763
731
  }
764
732
 
765
733
  // If no tools → final response
766
734
  if (pendingTools.length === 0) {
767
735
  messages.push({ role: 'assistant', content: finalText });
768
- { const u = getLoopUsage(); yield u ? { type: 'step_end' as const, step, usage: u } : { type: 'step_end' as const, step }; }
736
+ yield { type: 'step_end', step };
769
737
  yield { type: 'done', output: finalText, steps: step };
770
738
  return;
771
739
  }
@@ -791,7 +759,7 @@ export function createAgent(runtime: AgentRuntime) {
791
759
  if (action.type === 'final') {
792
760
  yield { type: 'text_delta', text: action.content };
793
761
  messages.push({ role: 'assistant', content: action.content });
794
- { const u = getLoopUsage(); yield u ? { type: 'step_end' as const, step, usage: u } : { type: 'step_end' as const, step }; }
762
+ yield { type: 'step_end', step };
795
763
  yield { type: 'done', output: action.content, steps: step };
796
764
  return;
797
765
  }
@@ -803,7 +771,7 @@ export function createAgent(runtime: AgentRuntime) {
803
771
  try {
804
772
  const r = await executeTool(runtime.toolProvider, call, runtime);
805
773
  yield { type: 'tool_end', name: call.name, result: toStreamResult(r) };
806
- const rText = r.success ? resultTextForLLM(r) : `ERROR: ${resultTextForLLM(r)}`;
774
+ const rText = r.success ? r.output : `ERROR: ${r.error ?? 'unknown failure'}`;
807
775
  messages.push({
808
776
  role: 'tool',
809
777
  content: formatToolResultContent(call, r),
@@ -825,7 +793,7 @@ export function createAgent(runtime: AgentRuntime) {
825
793
  try {
826
794
  const r = await executeTool(runtime.toolProvider, action, runtime);
827
795
  yield { type: 'tool_end', name: action.name, result: toStreamResult(r) };
828
- const rText = r.success ? resultTextForLLM(r) : `ERROR: ${resultTextForLLM(r)}`;
796
+ const rText = r.success ? r.output : `ERROR: ${r.error ?? 'unknown failure'}`;
829
797
  messages.push({
830
798
  role: 'tool',
831
799
  content: formatToolResultContent(action, r),
@@ -841,7 +809,7 @@ export function createAgent(runtime: AgentRuntime) {
841
809
  });
842
810
  }
843
811
  }
844
- { const u = getLoopUsage(); yield u ? { type: 'step_end' as const, step, usage: u } : { type: 'step_end' as const, step }; }
812
+ yield { type: 'step_end', step };
845
813
  continue;
846
814
  }
847
815
 
@@ -851,7 +819,7 @@ export function createAgent(runtime: AgentRuntime) {
851
819
  const call = pendingTools[i]!;
852
820
  const r = results[i]!;
853
821
  yield { type: 'tool_end', name: call.name, result: toStreamResult(r) };
854
- const rText = r.success ? resultTextForLLM(r) : `ERROR: ${resultTextForLLM(r)}`;
822
+ const rText = r.success ? r.output : `ERROR: ${r.error ?? 'unknown failure'}`;
855
823
  messages.push({
856
824
  role: 'tool',
857
825
  content: formatToolResultContent(call, r),
@@ -869,7 +837,7 @@ export function createAgent(runtime: AgentRuntime) {
869
837
 
870
838
  if (action.type === 'final') {
871
839
  messages.push({ role: 'assistant', content: action.content });
872
- { const u = getLoopUsage(); yield u ? { type: 'step_end' as const, step, usage: u } : { type: 'step_end' as const, step }; }
840
+ yield { type: 'step_end', step };
873
841
  yield { type: 'done', output: action.content, steps: step };
874
842
  return;
875
843
  }
@@ -893,7 +861,7 @@ export function createAgent(runtime: AgentRuntime) {
893
861
  const call = calls[i]!;
894
862
  const r = results[i]!;
895
863
  yield { type: 'tool_end', name: call.name, result: toStreamResult(r) };
896
- const rText = r.success ? resultTextForLLM(r) : `ERROR: ${resultTextForLLM(r)}`;
864
+ const rText = r.success ? r.output : `ERROR: ${r.error ?? 'unknown failure'}`;
897
865
  messages.push({
898
866
  role: 'tool',
899
867
  content: formatToolResultContent(call, r),
@@ -907,7 +875,7 @@ export function createAgent(runtime: AgentRuntime) {
907
875
  }
908
876
  }
909
877
 
910
- { const u = getLoopUsage(); yield u ? { type: 'step_end' as const, step, usage: u } : { type: 'step_end' as const, step }; }
878
+ yield { type: 'step_end', step };
911
879
  }
912
880
 
913
881
  yield { type: 'done', output: 'ERROR: max steps exceeded', steps: maxSteps };