@bluecopa/harness 0.1.0-snapshot.49 → 0.1.0-snapshot.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88)
  1. package/README.md +117 -212
  2. package/package.json +1 -2
  3. package/src/agent/create-agent.ts +27 -30
  4. package/src/agent/types.ts +24 -20
  5. package/src/interfaces/hooks.ts +1 -2
  6. package/src/interfaces/tool-provider.ts +2 -0
  7. package/src/loop/context-store.ts +9 -12
  8. package/src/loop/vercel-agent-loop.ts +118 -44
  9. package/src/skills/skill-router.ts +6 -12
  10. package/tests/integration/agent-skill-default-from-sandbox.spec.ts +2 -3
  11. package/tests/unit/structured-messages.spec.ts +1 -1
  12. package/vitest.config.ts +1 -1
  13. package/src/arc/agent-runner.ts +0 -947
  14. package/src/arc/arc-loop.ts +0 -845
  15. package/src/arc/arc-types.ts +0 -115
  16. package/src/arc/bridge-tools.ts +0 -170
  17. package/src/arc/bridged-tool-provider.ts +0 -80
  18. package/src/arc/consolidation.ts +0 -118
  19. package/src/arc/context-window.ts +0 -267
  20. package/src/arc/create-arc-agent.ts +0 -99
  21. package/src/arc/debug.ts +0 -62
  22. package/src/arc/episode-compressor.ts +0 -225
  23. package/src/arc/memory-manager.ts +0 -245
  24. package/src/arc/message-convert.ts +0 -123
  25. package/src/arc/multi-model.ts +0 -70
  26. package/src/arc/object-store/fs-object-store.ts +0 -60
  27. package/src/arc/object-store/memory-object-store.ts +0 -41
  28. package/src/arc/object-store/object-store.ts +0 -12
  29. package/src/arc/profile-builder.ts +0 -172
  30. package/src/arc/resilience/bulkhead.ts +0 -110
  31. package/src/arc/resilience/circuit-breaker.ts +0 -112
  32. package/src/arc/resilience/fallback.ts +0 -27
  33. package/src/arc/resilience/index.ts +0 -21
  34. package/src/arc/resilience/pipeline.ts +0 -103
  35. package/src/arc/resilience/retry.ts +0 -90
  36. package/src/arc/resilience/timeout.ts +0 -60
  37. package/src/arc/resilience/types.ts +0 -71
  38. package/src/arc/result-pager.ts +0 -77
  39. package/src/arc/sig.ts +0 -115
  40. package/src/arc/skill-resolver.ts +0 -81
  41. package/src/arc/stores/episode-store.ts +0 -120
  42. package/src/arc/stores/long-term-store.ts +0 -86
  43. package/src/arc/stores/rxdb-setup.ts +0 -113
  44. package/src/arc/stores/session-memo-store.ts +0 -58
  45. package/src/arc/tools.ts +0 -67
  46. package/src/arc/types.ts +0 -363
  47. package/src/arc/utils.ts +0 -37
  48. package/src/hooks/middleware.ts +0 -95
  49. package/testing/index.ts +0 -22
  50. package/testing/scenario-replay.ts +0 -209
  51. package/testing/scenario-types.ts +0 -38
  52. package/testing/scripted-llm.ts +0 -230
  53. package/tests/arc/channel.test.ts +0 -170
  54. package/tests/arc/context-window.test.ts +0 -396
  55. package/tests/arc/e2e.test.ts +0 -353
  56. package/tests/arc/error-paths.test.ts +0 -402
  57. package/tests/arc/live-integration.test.ts +0 -357
  58. package/tests/arc/memory-manager.test.ts +0 -384
  59. package/tests/arc/middleware.test.ts +0 -113
  60. package/tests/arc/process-interleaving.test.ts +0 -432
  61. package/tests/arc/process-profiles.test.ts +0 -366
  62. package/tests/arc/resilience-integration.test.ts +0 -381
  63. package/tests/arc/resilience.test.ts +0 -575
  64. package/tests/arc/result-paging.test.ts +0 -392
  65. package/tests/arc/scenario-driven.test.ts +0 -297
  66. package/tests/arc/tool-dispatch.test.ts +0 -340
  67. package/tests/arc/wasm-pbt.test.ts +0 -104
  68. package/verify/Cargo.lock +0 -637
  69. package/verify/Cargo.toml +0 -24
  70. package/verify/src/lib.rs +0 -5
  71. package/verify/src/main.rs +0 -165
  72. package/verify/src/model/context.rs +0 -100
  73. package/verify/src/model/mod.rs +0 -6
  74. package/verify/src/model/orchestrator.rs +0 -371
  75. package/verify/src/model/process.rs +0 -140
  76. package/verify/src/model/types.rs +0 -273
  77. package/verify/src/properties/liveness.rs +0 -32
  78. package/verify/src/properties/mod.rs +0 -4
  79. package/verify/src/properties/safety.rs +0 -78
  80. package/verify/src/trace/event.rs +0 -155
  81. package/verify/src/trace/mod.rs +0 -2
  82. package/verify/src/trace/validator.rs +0 -367
  83. package/verify/src/wasm/mod.rs +0 -3
  84. package/verify/src/wasm/scenario_generator.rs +0 -400
  85. package/verify/src/wasm/types.rs +0 -104
  86. package/verify/src/wasm/wasm_validator.rs +0 -107
  87. package/verify/tests/model_check.rs +0 -49
  88. package/verify/tests/trace_validation.rs +0 -147
package/README.md CHANGED
@@ -2,17 +2,9 @@
2
2
 
3
3
  Provider-agnostic TypeScript agent framework with Claude-code-compatible tool semantics.
4
4
 
5
- Published on npm as **`@bluecopa/harness`**.
5
+ The harness provides the core loop that drives an AI agent: send messages to an LLM, execute the tool calls it returns, feed results back, and repeat until the LLM produces a final text response.
6
6
 
7
- Two execution modes: a simple single-agent loop (`createAgent` + `VercelAgentLoop`) and a process-based orchestrator (`ArcLoop`) that dispatches parallel processes with context management, memory, and resilience.
8
-
9
- ## Install
10
-
11
- ```bash
12
- pnpm add @bluecopa/harness
13
- ```
14
-
15
- ## Development
7
+ ## Quickstart
16
8
 
17
9
  ```bash
18
10
  pnpm install
@@ -21,11 +13,9 @@ pnpm test
21
13
 
22
14
  ## Architecture
23
15
 
24
- ### Single-Agent Loop
25
-
26
16
  ```
27
17
  ┌──────────────┐ ┌──────────────┐ ┌──────────────────┐
28
- │ createAgent │────►│ AgentLoop │────►│ LLM (Claude) │
18
+ │ createAgent │────▶│ AgentLoop │────▶│ LLM (Claude) │
29
19
  │ (turn loop) │ │ (nextAction)│ │ │
30
20
  └──────┬───────┘ └──────────────┘ └──────────────────┘
31
21
  │ │
@@ -37,82 +27,20 @@ pnpm test
37
27
  └──────────────┘
38
28
  ```
39
29
 
40
- ### ArcLoop Orchestrator
41
-
42
- ```
43
- Orchestrator (ArcLoop — Opus 4.6 by default)
44
- │ tools: Thread, Check, Cancel, Remember, ReadEpisode
45
-
46
- │ Turn 1 (parallel):
47
- ├──► Process 0 ("read auth", model=fast) ─┐
48
- ├──► Process 1 ("read routes", model=fast) ─┼──► Episodes
49
- ├──► Process 2 ("read tests", model=fast) ─┘
50
-
51
- │ Turn 2 (dispatch dependent work):
52
- ├──► Thread("fix bug", context=[ep0,ep1,ep2]) ──► Episode
53
-
54
- │ Turn 3 (parallel):
55
- ├──► Thread("run tests", context=[ep3]) ─┐
56
- ├──► Thread("update docs", context=[ep3]) ─┘
57
-
58
- └──► Final text response
59
- ```
60
-
61
- Full architecture doc: [`docs/arc.md`](../docs/arc.md)
62
-
63
- ---
64
-
65
- ## ToolProvider
66
-
67
- The contract for tool execution. All agent modes use this interface.
68
-
69
- ```typescript
70
- interface ToolProvider {
71
- bash(command: string, options?: BashOptions): Promise<ToolResult>;
72
- readFile(path: string, options?: ReadOptions): Promise<ToolResult>;
73
- writeFile(path: string, content: string): Promise<ToolResult>;
74
- editFile(path: string, oldText: string, newText: string): Promise<ToolResult>;
75
- glob(pattern: string, options?: GlobOptions): Promise<ToolResult>;
76
- grep(pattern: string, path?: string, options?: GrepOptions): Promise<ToolResult>;
77
- webFetch?(options: WebFetchOptions): Promise<ToolResult>;
78
- webSearch?(query: string): Promise<ToolResult>;
79
- capabilities(): ToolProviderCapabilities;
80
- }
81
-
82
- interface ToolResult {
83
- success: boolean;
84
- output: string;
85
- error?: string;
86
- }
87
- ```
88
-
89
- Built-in implementations:
30
+ 1. `createAgent` drives a deterministic step loop
31
+ 2. Each step calls `loop.nextAction(messages)` to get the LLM's decision
32
+ 3. If it's a tool call, the harness executes it via `ToolProvider` and appends the result
33
+ 4. If it's a final action, the loop ends and returns the result
90
34
 
91
- | Provider | Description |
92
- |----------|-------------|
93
- | `LocalToolProvider` | Runs tools on the local filesystem |
94
- | `E2BToolProvider` | Routes tools to a sandbox VM via `ControlPlaneE2BExecutor` |
95
- | `CompositeToolProvider` | Combines multiple providers (e.g. local filesystem + sandbox bash) |
35
+ ## Using with the sandbox
96
36
 
97
- ## SandboxProvider
37
+ The most common setup connects the harness to a running sandbox service via `ControlPlaneE2BExecutor`:
98
38
 
99
- Higher-level sandbox operations beyond basic tool calls:
100
-
101
- ```typescript
102
- interface SandboxProvider {
103
- exec(command: string, options?: SandboxExecOptions): Promise<SandboxExecResult>;
104
- readSandboxFile(path: string): Promise<SandboxFileBlob>;
105
- writeSandboxFile(path: string, content: SandboxFileBlob): Promise<void>;
106
- }
107
- ```
108
-
109
- Used by `SkillManager` for executing skill scripts in isolated VMs.
110
-
111
- ## Connecting to a Sandbox
112
-
113
- ```typescript
114
- import { ControlPlaneE2BExecutor } from './src/providers/control-plane-e2b-executor';
39
+ ```ts
40
+ import { createAgent } from './src/agent/create-agent';
115
41
  import { E2BToolProvider } from './src/providers/e2b-tool-provider';
42
+ import { ControlPlaneE2BExecutor } from './src/providers/control-plane-e2b-executor';
43
+ import { VercelAgentLoop } from './src/loop/vercel-agent-loop';
116
44
 
117
45
  // Connect to sandbox service
118
46
  const executor = new ControlPlaneE2BExecutor({
@@ -122,174 +50,155 @@ const executor = new ControlPlaneE2BExecutor({
122
50
  });
123
51
  await executor.initialize(); // creates a Firecracker VM
124
52
 
125
- const toolProvider = new E2BToolProvider(executor);
53
+ // Build and run the agent
54
+ const agent = createAgent({
55
+ toolProvider: new E2BToolProvider(executor),
56
+ loop: new VercelAgentLoop(), // needs ANTHROPIC_API_KEY
57
+ });
126
58
 
127
- // ... use with createAgent or ArcLoop
59
+ const result = await agent.run('create a bar chart of sales data');
60
+ console.log(result.output); // LLM's final response
61
+ console.log(result.steps); // number of tool steps
128
62
 
129
- await executor.destroy(); // tears down the VM
63
+ await executor.destroy(); // tears down the VM
130
64
  ```
131
65
 
132
- From environment variables: `ControlPlaneE2BExecutor.fromEnv()` reads `SAMYX_BASE_URL` and `SAMYX_API_KEY`.
66
+ For a complete working example, see [`examples/chat-assistant/src/chat.ts`](../examples/chat-assistant/src/chat.ts).
67
+
68
+ ### From environment variables
133
69
 
134
- ---
70
+ `ControlPlaneE2BExecutor.fromEnv()` reads `SAMYX_BASE_URL` and `SAMYX_API_KEY` automatically:
135
71
 
136
- ## Single-Agent Mode (`createAgent`)
72
+ ```ts
73
+ const executor = ControlPlaneE2BExecutor.fromEnv();
74
+ ```
137
75
 
138
- For simple tasks that don't need orchestration:
76
+ ## Using locally (no sandbox)
139
77
 
140
- ```typescript
78
+ For development without a sandbox service, use `LocalToolProvider` which runs tools on the local machine:
79
+
80
+ ```ts
141
81
  import { createAgent } from './src/agent/create-agent';
142
82
  import { LocalToolProvider } from './src/providers/local-tool-provider';
143
83
 
144
84
  const agent = createAgent({
145
85
  toolProvider: new LocalToolProvider(process.cwd()),
146
- loop: new VercelAgentLoop(), // needs ANTHROPIC_API_KEY
86
+ loop: new VercelAgentLoop(),
147
87
  });
148
88
 
149
89
  const result = await agent.run('list all TypeScript files');
150
- console.log(result.output);
151
90
  ```
152
91
 
153
- ### Configuration
92
+ ## Key modules
93
+
94
+ ### Agent creation (`src/agent/create-agent.ts`)
154
95
 
155
- | Option | Type | Default | Description |
156
- |--------|------|---------|-------------|
157
- | `toolProvider` | `ToolProvider` | required | Executes tool calls |
158
- | `loop` | `AgentLoop` | `VercelAgentLoop` | LLM decision loop |
159
- | `sandboxProvider` | `SandboxProvider` | — | Higher-level sandbox operations |
160
- | `maxSteps` | `number` | 30 | Max tool steps per run |
161
- | `telemetry` | `HarnessTelemetry` | — | OpenTelemetry-style tracing |
162
- | `skillIndexPath` | `string` | — | Path to skill index JSON for routing |
96
+ `createAgent(options)` returns an agent with a `.run(prompt, options?)` method. Options:
163
97
 
164
- ### VercelAgentLoop
98
+ | Option | Type | Description |
99
+ |--------|------|-------------|
100
+ | `toolProvider` | `ToolProvider` | Required. Executes tool calls |
101
+ | `loop` | `AgentLoop` | LLM decision loop (default: `VercelAgentLoop`) |
102
+ | `sandboxProvider` | `SandboxProvider` | Optional. Higher-level sandbox ops (file download, exec with env) |
103
+ | `maxSteps` | `number` | Max tool steps per run (default: 30) |
104
+ | `telemetry` | `HarnessTelemetry` | Optional. OpenTelemetry-style tracing |
105
+ | `skillIndexPath` | `string` | Optional. Path to skill index JSON |
165
106
 
166
- Calls Claude via the Vercel AI SDK. Supports parallel tool calls and configurable system prompt.
107
+ ### Agent loop (`src/loop/vercel-agent-loop.ts`)
167
108
 
168
- ```typescript
109
+ `VercelAgentLoop` calls Claude via the Vercel AI SDK. It supports:
110
+ - Parallel tool calls (returns `ToolBatchAction` when the LLM requests multiple tools at once)
111
+ - Configurable system prompt
112
+ - Model selection via `HARNESS_MODEL` env var (default: `claude-sonnet-4-5`)
113
+
114
+ ```ts
169
115
  const loop = new VercelAgentLoop({
170
116
  systemPrompt: 'You are a helpful coding assistant.',
171
- model: 'claude-sonnet-4-5', // or HARNESS_MODEL env var
172
117
  });
173
118
  ```
174
119
 
175
- ### LCMToolLoop
120
+ ### Tool provider (`src/interfaces/tool-provider.ts`)
176
121
 
177
- Wraps another loop to add Lossless Context Management and optional REPL orchestration:
122
+ The contract for tool execution:
178
123
 
179
- ```typescript
180
- import { LCMToolLoop } from './src/loop/lcm-tool-loop';
181
- import { VercelAgentLoop } from './src/loop/vercel-agent-loop';
124
+ ```ts
125
+ interface ToolProvider {
126
+ bash(command: string, options?: BashOptions): Promise<ToolResult>;
127
+ readFile(path: string, options?: ReadOptions): Promise<ToolResult>;
128
+ writeFile(path: string, content: string): Promise<ToolResult>;
129
+ editFile(path: string, oldText: string, newText: string): Promise<ToolResult>;
130
+ glob(pattern: string, options?: GlobOptions): Promise<ToolResult>;
131
+ grep(pattern: string, path?: string, options?: GrepOptions): Promise<ToolResult>;
132
+ webFetch?(options: WebFetchOptions): Promise<ToolResult>;
133
+ webSearch?(query: string): Promise<ToolResult>;
134
+ capabilities(): ToolProviderCapabilities;
135
+ }
182
136
 
183
- const loop = new LCMToolLoop({
184
- innerLoop: new VercelAgentLoop(),
185
- toolProvider: mySandboxProvider,
186
- enableRepl: true, // default: true
187
- bridgeDir: '/var/run/bridge',
188
- onActivity: (entry) => console.log(entry),
189
- onLlmRequest: async (prompt) => callLLM(prompt),
190
- onWebFetchRequest: async (url) => fetch(url),
191
- });
137
+ interface ToolResult {
138
+ success: boolean;
139
+ output: string;
140
+ error?: string;
141
+ }
192
142
  ```
193
143
 
194
- **Standard mode**: Lossless context trimming — the LLM always sees a coherent, budget-fitting view of the full conversation.
195
-
196
- **REPL mode**: When the LLM returns a Bash action with the REPL marker, the loop writes a Python script into the sandbox, injects the bridge module, runs the script, and polls for sub-requests (LLM, web_fetch, ask_user) that the harness fulfills.
197
-
198
- ---
144
+ Built-in implementations:
199
145
 
200
- ## ArcLoop (Orchestrator Mode)
146
+ | Provider | Description |
147
+ |----------|-------------|
148
+ | `LocalToolProvider` | Runs tools on the local filesystem |
149
+ | `E2BToolProvider` | Routes tools to an E2B-compatible executor over HTTP |
150
+ | `CompositeToolProvider` | Combines multiple providers (e.g. sandbox + web) |
201
151
 
202
- For complex tasks that benefit from parallel processes, context management, and memory:
152
+ ### Action types (`src/agent/types.ts`)
203
153
 
204
- ```typescript
205
- import { createArcAgent } from './src/arc/create-arc-agent';
154
+ The LLM returns one of these action types each turn:
206
155
 
207
- const agent = await createArcAgent({
208
- toolProvider: myToolProvider,
209
- episodeStore: myEpisodeStore, // required
210
- sessionMemoStore: mySessionMemoStore, // required
211
- longTermStore: myLongTermStore, // required
212
- taskId: 'task-1',
213
- sessionId: 'session-1',
214
- });
156
+ ```ts
157
+ // Single tool call
158
+ interface ToolCallAction {
159
+ type: 'tool';
160
+ name: 'Bash' | 'Read' | 'Write' | 'Edit' | 'Glob' | 'Grep' | ...;
161
+ args: Record<string, unknown>;
162
+ }
215
163
 
216
- // Streaming
217
- for await (const event of agent.stream(messages, signal)) {
218
- if (event.type === 'text_delta') process.stdout.write(event.text);
219
- if (event.type === 'process_dispatched') console.log(` → ${event.action}`);
220
- if (event.type === 'done') console.log(`Done in ${event.stats.durationMs}ms`);
164
+ // Multiple independent tool calls (executed in parallel)
165
+ interface ToolBatchAction {
166
+ type: 'tool_batch';
167
+ calls: ToolCallAction[];
221
168
  }
222
169
 
223
- // Non-streaming
224
- const result = await agent.run(messages, signal);
170
+ // Final text response (ends the loop)
171
+ interface FinalAction {
172
+ type: 'final';
173
+ content: string;
174
+ }
225
175
  ```
226
176
 
227
- ### ArcLoopConfig
228
-
229
- | Option | Type | Default | Description |
230
- |--------|------|---------|-------------|
231
- | `model` | `string` | `'claude-opus-4-6'` | Orchestrator model (ID or tier name) |
232
- | `modelMap` | `Record<ModelTier, string>` | haiku/sonnet/opus | Maps fast/medium/strong to model IDs |
233
- | `apiKey` | `string` | — | Anthropic API key |
234
- | `systemPrompt` | `string` | built-in | Custom orchestrator system prompt |
235
- | `maxTurns` | `number` | 30 | Max orchestrator turns |
236
- | `processTimeout` | `number` | 120_000 | Per-process timeout (ms) |
237
- | `processMaxSteps` | `number` | 20 | Per-process max tool steps |
238
- | `contextWindowSize` | `number` | 200_000 | Context window in tokens |
239
- | `outputReserve` | `number` | 20_000 | Tokens reserved for output |
240
- | `autoMemory` | `boolean` | true | Auto-detect patterns from episodes |
241
- | `episodeStore` | `EpisodeStore` | required | Stores episode summaries + traces |
242
- | `sessionMemoStore` | `SessionMemoStore` | required | Stores session memos |
243
- | `longTermStore` | `LongTermStore` | required | Stores long-term memories |
244
- | `taskId` | `string` | required | Task identifier |
245
- | `sessionId` | `string` | required | Session identifier |
246
- | `toolProvider` | `ToolProvider` | required | Tool execution |
247
- | `processTools` | `Record<string, AnyTool>` | builtinTools | Tools available inside processes |
248
- | `extraOrchestratorTools` | `Record<string, AnyTool>` | — | Custom orchestrator tools |
249
- | `onOrchestratorTool` | `function` | — | Handler for custom orchestrator tools |
250
- | `resilience` | `ResiliencePolicy` | — | Composable resilience pipeline |
251
- | `traceWriter` | `function` | — | Callback for trace event emission |
252
-
253
- ### Resilience
254
-
255
- ```typescript
256
- import { resilience } from './src/arc/resilience';
257
-
258
- const pipeline = resilience()
259
- .retry({ maxRetries: 2, baseDelay: 1000 })
260
- .timeout({ durationMs: 30_000 })
261
- .circuitBreaker({ failureThreshold: 5 })
262
- .build();
263
-
264
- const agent = await createArcAgent({
265
- // ...config
266
- resilience: pipeline,
267
- });
268
- ```
177
+ ### LCM tool loop (`src/loop/lcm-tool-loop.ts`)
269
178
 
270
- ### Trace Emission
179
+ `LCMToolLoop` wraps another loop to add LCM-based tool routing, REPL script execution, and bridge-based tool dispatch. Used in the chat-assistant example.
271
180
 
272
- ```typescript
273
- const traces: TraceEvent[] = [];
274
- const agent = await createArcAgent({
275
- // ...config
276
- traceWriter: (event) => traces.push(event),
277
- });
181
+ ### Sandbox provider (`src/interfaces/sandbox-provider.ts`)
182
+
183
+ Higher-level sandbox operations beyond basic tool calls:
184
+
185
+ ```ts
186
+ interface SandboxProvider {
187
+ exec(command: string, options?: SandboxExecOptions): Promise<SandboxExecResult>;
188
+ readSandboxFile(path: string): Promise<SandboxFileBlob>;
189
+ writeSandboxFile(path: string, content: SandboxFileBlob): Promise<void>;
190
+ }
278
191
  ```
279
192
 
280
- Traces can be validated against the formal model: `cd verify && cargo run -- trace file.ndjson`
193
+ ### Observability (`src/observability/otel.ts`)
281
194
 
282
- ---
195
+ `HarnessTelemetry` provides OpenTelemetry-style spans and metrics for agent runs.
283
196
 
284
- ## Package Layout
197
+ ## Package layout
285
198
 
286
199
  ```
287
200
  src/
288
201
  ├── agent/ # createAgent, step executor, types
289
- ├── arc/ # ArcLoop orchestrator, processes, memory, resilience
290
- │ ├── resilience/ # Retry, circuit breaker, timeout, bulkhead, fallback
291
- │ ├── stores/ # RxDB + in-memory store implementations
292
- │ └── object-store/ # Pluggable cloud sync (fs, memory)
293
202
  ├── interfaces/ # ToolProvider, SandboxProvider, AgentLoop contracts
294
203
  ├── loop/ # VercelAgentLoop, LCMToolLoop
295
204
  ├── providers/ # LocalToolProvider, E2BToolProvider, ControlPlaneE2BExecutor
@@ -297,20 +206,16 @@ src/
297
206
  ├── hooks/ # Pre/post tool call hooks
298
207
  ├── permissions/ # Tool permission checks
299
208
  ├── sessions/ # Session persistence
300
- ├── subagents/ # Subagent spawning
209
+ ├── subagents/ # Subagent spawning and task tools
301
210
  ├── skills/ # Skill index, routing, and management
302
211
  ├── optimization/ # Benchmark runner
303
212
  └── observability/ # OpenTelemetry integration
304
-
305
- verify/ # Rust formal verification (Stateright model checker)
306
- testing/ # Adversarial scenario replay harness
307
- tests/ # Vitest test suite
308
213
  ```
309
214
 
310
215
  ## Documentation
311
216
 
312
- - [Arc architecture](../docs/arc.md) — process model, context window, memory, resilience, verification
313
- - [Testing](../docs/testing.md) — test layers, running tests, writing new tests
314
- - [Sandbox setup](../docs/PUBLIC_SANDBOX.md) — deploying the sandbox service
315
- - [Release process](../docs/RELEASE.md) — versioning and publishing
316
- - [Example](../examples/chat-assistant/src/chat.ts) — complete working chat assistant
217
+ - Provider guide: `docs/guides/providers.md`
218
+ - Skills guide: `docs/guides/skills.md`
219
+ - Observability guide: `docs/guides/observability.md`
220
+ - Release process: `../docs/RELEASE.md`
221
+ - Full example: [`../examples/chat-assistant/src/chat.ts`](../examples/chat-assistant/src/chat.ts)
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@bluecopa/harness",
3
- "version": "0.1.0-snapshot.49",
3
+ "version": "0.1.0-snapshot.5",
4
4
  "description": "Provider-agnostic TypeScript agent framework",
5
5
  "license": "UNLICENSED",
6
6
  "scripts": {
@@ -10,7 +10,6 @@
10
10
  "dependencies": {
11
11
  "@ai-sdk/anthropic": "^3.0.48",
12
12
  "ai": "^6.0.101",
13
- "rxdb": "^15.39.0",
14
13
  "zod": "^4.1.11"
15
14
  },
16
15
  "devDependencies": {
@@ -7,12 +7,14 @@ import type { HarnessTelemetry } from '../observability/otel';
7
7
  import { HookRunner } from '../hooks/hook-runner';
8
8
  import { PermissionManager } from '../permissions/permission-manager';
9
9
  import { VercelAgentLoop } from '../loop/vercel-agent-loop';
10
+ export type { SystemPromptBlock, VercelAgentLoopConfig } from '../loop/vercel-agent-loop';
11
+ export type { PrepareStepContext, PrepareStepResult } from './types';
10
12
  import { SkillManager } from '../skills/skill-manager';
11
13
  import { SkillRouter } from '../skills/skill-router';
12
14
  import type { SkillSummary } from '../skills/skill-types';
13
15
  import { SingleFlightStepExecutor } from './step-executor';
14
- import type { AgentAction, AgentLoop, AgentMessage, AgentRunResult, AgentStreamEvent, ToolCallAction, ToolBatchAction, ToolCallInfo, ToolResultInfo } from './types';
15
- export type { AgentAction, AgentLoop, AgentMessage, AgentRunResult, AgentStreamEvent, ToolCallAction, ToolBatchAction, ToolCallInfo, ToolResultInfo };
16
+ import type { AgentAction, AgentLoop, AgentMessage, AgentRunResult, AgentStreamEvent, StepUsage, ToolCallAction, ToolBatchAction, ToolCallInfo, ToolResultInfo } from './types';
17
+ export type { AgentAction, AgentLoop, AgentMessage, AgentRunResult, AgentStreamEvent, StepUsage, ToolCallAction, ToolBatchAction, ToolCallInfo, ToolResultInfo };
16
18
  export { HookRunner } from '../hooks/hook-runner';
17
19
  export { PermissionManager } from '../permissions/permission-manager';
18
20
  export type { PermissionMode, PermissionResolver, PermissionRequest } from '../permissions/permission-manager';
@@ -37,8 +39,6 @@ export interface AgentRuntime {
37
39
  /** Custom tool executor. Called for every tool action. Return null to fall through to built-in dispatch.
38
40
  * When hookRunner/permissionManager are provided on the runtime, they are automatically applied before/after this callback — no manual wiring needed. */
39
41
  executeToolAction?: (action: ToolCallAction) => Promise<ToolResult | null>;
40
- /** Progress callback fired before/after each tool call during run(). */
41
- onToolProgress?: (event: { type: 'tool_start'; name: string; args: Record<string, unknown> } | { type: 'tool_end'; name: string; success: boolean; durationMs: number }) => void;
42
42
  }
43
43
 
44
44
  /**
@@ -220,9 +220,12 @@ function toStreamResult(r: ToolResult): { success: boolean; output: string; erro
220
220
  return base;
221
221
  }
222
222
 
223
- /** Format a display-friendly content string for tool results (used in content field). */
223
+ /** Format content string for LLM context. Uses modelOutput (compact summary) when available. */
224
224
  function formatToolResultContent(call: ToolCallAction, result: ToolResult): string {
225
- const content = result.success ? result.output : `ERROR: ${result.error ?? 'unknown failure'}`;
225
+ // Use modelOutput for LLM context when available — keeps context compact
226
+ const content = result.success
227
+ ? (result.modelOutput ?? result.output)
228
+ : `ERROR: ${result.error ?? 'unknown failure'}`;
226
229
  switch (call.name) {
227
230
  case 'Write':
228
231
  return `Write(${call.args.path}): ${result.success ? 'ok' : content}`;
@@ -517,6 +520,11 @@ export function createAgent(runtime: AgentRuntime) {
517
520
  ? { nextAction: runtime.nextAction }
518
521
  : new VercelAgentLoop());
519
522
 
523
+ /** Read lastUsage from the loop if it's a VercelAgentLoop. */
524
+ function getLoopUsage(): StepUsage | undefined {
525
+ return loop instanceof VercelAgentLoop ? loop.lastUsage : undefined;
526
+ }
527
+
520
528
  async function resolveSkillContext(prompt: string): Promise<string> {
521
529
  if (!skillManager || !skillIndexPath) return '';
522
530
 
@@ -598,18 +606,14 @@ export function createAgent(runtime: AgentRuntime) {
598
606
 
599
607
  // Execute valid calls via batch (sequential sandbox ops) or parallel fallback
600
608
  if (validCalls.length > 0) {
601
- for (const c of validCalls) runtime.onToolProgress?.({ type: 'tool_start', name: c.name, args: c.args });
602
- const batchStart = Date.now();
603
609
  const results = await executeBatch(validCalls, runtime.toolProvider, runtime);
604
- const batchMs = Date.now() - batchStart;
605
610
  for (let i = 0; i < validCalls.length; i++) {
606
611
  const call = validCalls[i]!;
607
612
  const r = results[i]!;
608
- runtime.onToolProgress?.({ type: 'tool_end', name: call.name, success: r.success, durationMs: batchMs });
609
613
  if (!r.success) {
610
614
  recordAgentError(runtime.telemetry);
611
615
  }
612
- const resultText = r.success ? r.output : `ERROR: ${r.error ?? 'unknown failure'}`;
616
+ const resultText = r.success ? (r.modelOutput ?? r.output) : `ERROR: ${r.error ?? 'unknown failure'}`;
613
617
  messages.push({
614
618
  role: 'tool',
615
619
  content: formatToolResultContent(call, r),
@@ -665,8 +669,6 @@ export function createAgent(runtime: AgentRuntime) {
665
669
  } else {
666
670
  consecutiveInvalid = 0;
667
671
  }
668
- runtime.onToolProgress?.({ type: 'tool_start', name: action.name, args: action.args });
669
- const singleStart = Date.now();
670
672
  const result = validationError
671
673
  ? ({ success: false, output: '', error: validationError } as ToolResult)
672
674
  : await executor.run(async () => {
@@ -680,11 +682,10 @@ export function createAgent(runtime: AgentRuntime) {
680
682
  };
681
683
  }
682
684
  });
683
- runtime.onToolProgress?.({ type: 'tool_end', name: action.name, success: result.success, durationMs: Date.now() - singleStart });
684
685
  if (!result.success) {
685
686
  recordAgentError(runtime.telemetry);
686
687
  }
687
- const singleResultText = result.success ? result.output : `ERROR: ${result.error ?? 'unknown failure'}`;
688
+ const singleResultText = result.success ? (result.modelOutput ?? result.output) : `ERROR: ${result.error ?? 'unknown failure'}`;
688
689
  messages.push({
689
690
  role: 'tool',
690
691
  content: formatToolResultContent(action, result),
@@ -727,7 +728,8 @@ export function createAgent(runtime: AgentRuntime) {
727
728
  if (event.type === 'text_delta') {
728
729
  finalText += event.text;
729
730
  yield event;
730
- } else if (event.type === 'tool_start') {
731
+ }
732
+ if (event.type === 'tool_start') {
731
733
  pendingTools.push({
732
734
  type: 'tool',
733
735
  name: event.name,
@@ -735,18 +737,13 @@ export function createAgent(runtime: AgentRuntime) {
735
737
  ...(event.toolCallId != null ? { toolCallId: event.toolCallId } : {}),
736
738
  });
737
739
  yield event;
738
- } else {
739
- // Forward all other events (tool_end, step_start, step_end, done)
740
- // from self-managing loops like ArcLoop
741
- yield event;
742
- if (event.type === 'done') return;
743
740
  }
744
741
  }
745
742
 
746
743
  // If no tools → final response
747
744
  if (pendingTools.length === 0) {
748
745
  messages.push({ role: 'assistant', content: finalText });
749
- yield { type: 'step_end', step };
746
+ { const u = getLoopUsage(); yield u ? { type: 'step_end' as const, step, usage: u } : { type: 'step_end' as const, step }; }
750
747
  yield { type: 'done', output: finalText, steps: step };
751
748
  return;
752
749
  }
@@ -772,7 +769,7 @@ export function createAgent(runtime: AgentRuntime) {
772
769
  if (action.type === 'final') {
773
770
  yield { type: 'text_delta', text: action.content };
774
771
  messages.push({ role: 'assistant', content: action.content });
775
- yield { type: 'step_end', step };
772
+ { const u = getLoopUsage(); yield u ? { type: 'step_end' as const, step, usage: u } : { type: 'step_end' as const, step }; }
776
773
  yield { type: 'done', output: action.content, steps: step };
777
774
  return;
778
775
  }
@@ -784,7 +781,7 @@ export function createAgent(runtime: AgentRuntime) {
784
781
  try {
785
782
  const r = await executeTool(runtime.toolProvider, call, runtime);
786
783
  yield { type: 'tool_end', name: call.name, result: toStreamResult(r) };
787
- const rText = r.success ? r.output : `ERROR: ${r.error ?? 'unknown failure'}`;
784
+ const rText = r.success ? (r.modelOutput ?? r.output) : `ERROR: ${r.error ?? 'unknown failure'}`;
788
785
  messages.push({
789
786
  role: 'tool',
790
787
  content: formatToolResultContent(call, r),
@@ -806,7 +803,7 @@ export function createAgent(runtime: AgentRuntime) {
806
803
  try {
807
804
  const r = await executeTool(runtime.toolProvider, action, runtime);
808
805
  yield { type: 'tool_end', name: action.name, result: toStreamResult(r) };
809
- const rText = r.success ? r.output : `ERROR: ${r.error ?? 'unknown failure'}`;
806
+ const rText = r.success ? (r.modelOutput ?? r.output) : `ERROR: ${r.error ?? 'unknown failure'}`;
810
807
  messages.push({
811
808
  role: 'tool',
812
809
  content: formatToolResultContent(action, r),
@@ -822,7 +819,7 @@ export function createAgent(runtime: AgentRuntime) {
822
819
  });
823
820
  }
824
821
  }
825
- yield { type: 'step_end', step };
822
+ { const u = getLoopUsage(); yield u ? { type: 'step_end' as const, step, usage: u } : { type: 'step_end' as const, step }; }
826
823
  continue;
827
824
  }
828
825
 
@@ -832,7 +829,7 @@ export function createAgent(runtime: AgentRuntime) {
832
829
  const call = pendingTools[i]!;
833
830
  const r = results[i]!;
834
831
  yield { type: 'tool_end', name: call.name, result: toStreamResult(r) };
835
- const rText = r.success ? r.output : `ERROR: ${r.error ?? 'unknown failure'}`;
832
+ const rText = r.success ? (r.modelOutput ?? r.output) : `ERROR: ${r.error ?? 'unknown failure'}`;
836
833
  messages.push({
837
834
  role: 'tool',
838
835
  content: formatToolResultContent(call, r),
@@ -850,7 +847,7 @@ export function createAgent(runtime: AgentRuntime) {
850
847
 
851
848
  if (action.type === 'final') {
852
849
  messages.push({ role: 'assistant', content: action.content });
853
- yield { type: 'step_end', step };
850
+ { const u = getLoopUsage(); yield u ? { type: 'step_end' as const, step, usage: u } : { type: 'step_end' as const, step }; }
854
851
  yield { type: 'done', output: action.content, steps: step };
855
852
  return;
856
853
  }
@@ -874,7 +871,7 @@ export function createAgent(runtime: AgentRuntime) {
874
871
  const call = calls[i]!;
875
872
  const r = results[i]!;
876
873
  yield { type: 'tool_end', name: call.name, result: toStreamResult(r) };
877
- const rText = r.success ? r.output : `ERROR: ${r.error ?? 'unknown failure'}`;
874
+ const rText = r.success ? (r.modelOutput ?? r.output) : `ERROR: ${r.error ?? 'unknown failure'}`;
878
875
  messages.push({
879
876
  role: 'tool',
880
877
  content: formatToolResultContent(call, r),
@@ -888,7 +885,7 @@ export function createAgent(runtime: AgentRuntime) {
888
885
  }
889
886
  }
890
887
 
891
- yield { type: 'step_end', step };
888
+ { const u = getLoopUsage(); yield u ? { type: 'step_end' as const, step, usage: u } : { type: 'step_end' as const, step }; }
892
889
  }
893
890
 
894
891
  yield { type: 'done', output: 'ERROR: max steps exceeded', steps: maxSteps };