@bluecopa/harness 0.1.0-snapshot.3 → 0.1.0-snapshot.30

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77)
  1. package/README.md +212 -117
  2. package/package.json +2 -1
  3. package/src/agent/create-agent.ts +15 -2
  4. package/src/agent/types.ts +15 -2
  5. package/src/arc/agent-runner.ts +623 -0
  6. package/src/arc/arc-loop.ts +786 -0
  7. package/src/arc/arc-types.ts +113 -0
  8. package/src/arc/bridge-tools.ts +170 -0
  9. package/src/arc/bridged-tool-provider.ts +80 -0
  10. package/src/arc/consolidation.ts +118 -0
  11. package/src/arc/context-window.ts +265 -0
  12. package/src/arc/create-arc-agent.ts +99 -0
  13. package/src/arc/debug.ts +62 -0
  14. package/src/arc/episode-compressor.ts +218 -0
  15. package/src/arc/memory-manager.ts +245 -0
  16. package/src/arc/message-convert.ts +111 -0
  17. package/src/arc/object-store/fs-object-store.ts +60 -0
  18. package/src/arc/object-store/memory-object-store.ts +41 -0
  19. package/src/arc/object-store/object-store.ts +12 -0
  20. package/src/arc/resilience/bulkhead.ts +110 -0
  21. package/src/arc/resilience/circuit-breaker.ts +112 -0
  22. package/src/arc/resilience/fallback.ts +27 -0
  23. package/src/arc/resilience/index.ts +21 -0
  24. package/src/arc/resilience/pipeline.ts +103 -0
  25. package/src/arc/resilience/retry.ts +90 -0
  26. package/src/arc/resilience/timeout.ts +60 -0
  27. package/src/arc/resilience/types.ts +71 -0
  28. package/src/arc/stores/episode-store.ts +120 -0
  29. package/src/arc/stores/long-term-store.ts +86 -0
  30. package/src/arc/stores/rxdb-setup.ts +112 -0
  31. package/src/arc/stores/session-memo-store.ts +58 -0
  32. package/src/arc/tools.ts +67 -0
  33. package/src/arc/types.ts +263 -0
  34. package/src/arc/utils.ts +19 -0
  35. package/src/loop/context-store.ts +12 -9
  36. package/src/loop/vercel-agent-loop.ts +24 -8
  37. package/testing/index.ts +22 -0
  38. package/testing/scenario-replay.ts +209 -0
  39. package/testing/scenario-types.ts +38 -0
  40. package/testing/scripted-llm.ts +230 -0
  41. package/tests/arc/channel.test.ts +170 -0
  42. package/tests/arc/context-window.test.ts +396 -0
  43. package/tests/arc/e2e.test.ts +353 -0
  44. package/tests/arc/error-paths.test.ts +402 -0
  45. package/tests/arc/live-integration.test.ts +357 -0
  46. package/tests/arc/memory-manager.test.ts +384 -0
  47. package/tests/arc/process-interleaving.test.ts +432 -0
  48. package/tests/arc/process-profiles.test.ts +364 -0
  49. package/tests/arc/resilience-integration.test.ts +381 -0
  50. package/tests/arc/resilience.test.ts +575 -0
  51. package/tests/arc/scenario-driven.test.ts +297 -0
  52. package/tests/arc/tool-dispatch.test.ts +340 -0
  53. package/tests/arc/wasm-pbt.test.ts +104 -0
  54. package/tests/integration/agent-skill-default-from-sandbox.spec.ts +3 -2
  55. package/tests/unit/structured-messages.spec.ts +1 -1
  56. package/verify/Cargo.lock +637 -0
  57. package/verify/Cargo.toml +24 -0
  58. package/verify/src/lib.rs +5 -0
  59. package/verify/src/main.rs +165 -0
  60. package/verify/src/model/context.rs +100 -0
  61. package/verify/src/model/mod.rs +6 -0
  62. package/verify/src/model/orchestrator.rs +371 -0
  63. package/verify/src/model/process.rs +140 -0
  64. package/verify/src/model/types.rs +273 -0
  65. package/verify/src/properties/liveness.rs +32 -0
  66. package/verify/src/properties/mod.rs +4 -0
  67. package/verify/src/properties/safety.rs +78 -0
  68. package/verify/src/trace/event.rs +155 -0
  69. package/verify/src/trace/mod.rs +2 -0
  70. package/verify/src/trace/validator.rs +367 -0
  71. package/verify/src/wasm/mod.rs +3 -0
  72. package/verify/src/wasm/scenario_generator.rs +400 -0
  73. package/verify/src/wasm/types.rs +104 -0
  74. package/verify/src/wasm/wasm_validator.rs +107 -0
  75. package/verify/tests/model_check.rs +49 -0
  76. package/verify/tests/trace_validation.rs +147 -0
  77. package/vitest.config.ts +1 -1
package/README.md CHANGED
@@ -2,9 +2,17 @@
2
2
 
3
3
  Provider-agnostic TypeScript agent framework with Claude-code-compatible tool semantics.
4
4
 
5
- The harness provides the core loop that drives an AI agent: send messages to an LLM, execute the tool calls it returns, feed results back, and repeat until the LLM produces a final text response.
5
+ Published on npm as **`@bluecopa/harness`**.
6
6
 
7
- ## Quickstart
7
+ Two execution modes: a simple single-agent loop (`createAgent` + `VercelAgentLoop`) and a process-based orchestrator (`ArcLoop`) that dispatches parallel processes with context management, memory, and resilience.
8
+
9
+ ## Install
10
+
11
+ ```bash
12
+ pnpm add @bluecopa/harness
13
+ ```
14
+
15
+ ## Development
8
16
 
9
17
  ```bash
10
18
  pnpm install
@@ -13,9 +21,11 @@ pnpm test
13
21
 
14
22
  ## Architecture
15
23
 
24
+ ### Single-Agent Loop
25
+
16
26
  ```
17
27
  ┌──────────────┐ ┌──────────────┐ ┌──────────────────┐
18
- │ createAgent │────▶│ AgentLoop │────▶│ LLM (Claude) │
28
+ │ createAgent │────►│ AgentLoop │────►│ LLM (Claude) │
19
29
  │ (turn loop) │ │ (nextAction)│ │ │
20
30
  └──────┬───────┘ └──────────────┘ └──────────────────┘
21
31
  │ │
@@ -27,20 +37,82 @@ pnpm test
27
37
  └──────────────┘
28
38
  ```
29
39
 
30
- 1. `createAgent` drives a deterministic step loop
31
- 2. Each step calls `loop.nextAction(messages)` to get the LLM's decision
32
- 3. If it's a tool call, the harness executes it via `ToolProvider` and appends the result
33
- 4. If it's a final action, the loop ends and returns the result
40
+ ### ArcLoop Orchestrator
34
41
 
35
- ## Using with the sandbox
42
+ ```
43
+ Orchestrator (ArcLoop — Opus 4.6 by default)
44
+ │ tools: Thread, Check, Cancel, Remember, ReadEpisode
45
+
46
+ │ Turn 1 (parallel):
47
+ ├──► Process 0 ("read auth", model=fast) ─┐
48
+ ├──► Process 1 ("read routes", model=fast) ─┼──► Episodes
49
+ ├──► Process 2 ("read tests", model=fast) ─┘
50
+
51
+ │ Turn 2 (dispatch dependent work):
52
+ ├──► Thread("fix bug", context=[ep0,ep1,ep2]) ──► Episode
53
+
54
+ │ Turn 3 (parallel):
55
+ ├──► Thread("run tests", context=[ep3]) ─┐
56
+ ├──► Thread("update docs", context=[ep3]) ─┘
57
+
58
+ └──► Final text response
59
+ ```
36
60
 
37
- The most common setup connects the harness to a running sandbox service via `ControlPlaneE2BExecutor`:
61
+ Full architecture doc: [`docs/arc.md`](../docs/arc.md)
38
62
 
39
- ```ts
40
- import { createAgent } from './src/agent/create-agent';
41
- import { E2BToolProvider } from './src/providers/e2b-tool-provider';
63
+ ---
64
+
65
+ ## ToolProvider
66
+
67
+ The contract for tool execution. All agent modes use this interface.
68
+
69
+ ```typescript
70
+ interface ToolProvider {
71
+ bash(command: string, options?: BashOptions): Promise<ToolResult>;
72
+ readFile(path: string, options?: ReadOptions): Promise<ToolResult>;
73
+ writeFile(path: string, content: string): Promise<ToolResult>;
74
+ editFile(path: string, oldText: string, newText: string): Promise<ToolResult>;
75
+ glob(pattern: string, options?: GlobOptions): Promise<ToolResult>;
76
+ grep(pattern: string, path?: string, options?: GrepOptions): Promise<ToolResult>;
77
+ webFetch?(options: WebFetchOptions): Promise<ToolResult>;
78
+ webSearch?(query: string): Promise<ToolResult>;
79
+ capabilities(): ToolProviderCapabilities;
80
+ }
81
+
82
+ interface ToolResult {
83
+ success: boolean;
84
+ output: string;
85
+ error?: string;
86
+ }
87
+ ```
88
+
89
+ Built-in implementations:
90
+
91
+ | Provider | Description |
92
+ |----------|-------------|
93
+ | `LocalToolProvider` | Runs tools on the local filesystem |
94
+ | `E2BToolProvider` | Routes tools to a sandbox VM via `ControlPlaneE2BExecutor` |
95
+ | `CompositeToolProvider` | Combines multiple providers (e.g. local filesystem + sandbox bash) |
96
+
97
+ ## SandboxProvider
98
+
99
+ Higher-level sandbox operations beyond basic tool calls:
100
+
101
+ ```typescript
102
+ interface SandboxProvider {
103
+ exec(command: string, options?: SandboxExecOptions): Promise<SandboxExecResult>;
104
+ readSandboxFile(path: string): Promise<SandboxFileBlob>;
105
+ writeSandboxFile(path: string, content: SandboxFileBlob): Promise<void>;
106
+ }
107
+ ```
108
+
109
+ Used by `SkillManager` for executing skill scripts in isolated VMs.
110
+
111
+ ## Connecting to a Sandbox
112
+
113
+ ```typescript
42
114
  import { ControlPlaneE2BExecutor } from './src/providers/control-plane-e2b-executor';
43
- import { VercelAgentLoop } from './src/loop/vercel-agent-loop';
115
+ import { E2BToolProvider } from './src/providers/e2b-tool-provider';
44
116
 
45
117
  // Connect to sandbox service
46
118
  const executor = new ControlPlaneE2BExecutor({
@@ -50,155 +122,174 @@ const executor = new ControlPlaneE2BExecutor({
50
122
  });
51
123
  await executor.initialize(); // creates a Firecracker VM
52
124
 
53
- // Build and run the agent
54
- const agent = createAgent({
55
- toolProvider: new E2BToolProvider(executor),
56
- loop: new VercelAgentLoop(), // needs ANTHROPIC_API_KEY
57
- });
125
+ const toolProvider = new E2BToolProvider(executor);
58
126
 
59
- const result = await agent.run('create a bar chart of sales data');
60
- console.log(result.output); // LLM's final response
61
- console.log(result.steps); // number of tool steps
127
+ // ... use with createAgent or ArcLoop
62
128
 
63
- await executor.destroy(); // tears down the VM
129
+ await executor.destroy(); // tears down the VM
64
130
  ```
65
131
 
66
- For a complete working example, see [`examples/chat-assistant/src/chat.ts`](../examples/chat-assistant/src/chat.ts).
67
-
68
- ### From environment variables
132
+ From environment variables: `ControlPlaneE2BExecutor.fromEnv()` reads `SAMYX_BASE_URL` and `SAMYX_API_KEY`.
69
133
 
70
- `ControlPlaneE2BExecutor.fromEnv()` reads `SAMYX_BASE_URL` and `SAMYX_API_KEY` automatically:
134
+ ---
71
135
 
72
- ```ts
73
- const executor = ControlPlaneE2BExecutor.fromEnv();
74
- ```
136
+ ## Single-Agent Mode (`createAgent`)
75
137
 
76
- ## Using locally (no sandbox)
138
+ For simple tasks that don't need orchestration:
77
139
 
78
- For development without a sandbox service, use `LocalToolProvider` which runs tools on the local machine:
79
-
80
- ```ts
140
+ ```typescript
81
141
  import { createAgent } from './src/agent/create-agent';
82
142
  import { LocalToolProvider } from './src/providers/local-tool-provider';
83
143
 
84
144
  const agent = createAgent({
85
145
  toolProvider: new LocalToolProvider(process.cwd()),
86
- loop: new VercelAgentLoop(),
146
+ loop: new VercelAgentLoop(), // import from './src/loop/vercel-agent-loop'; needs ANTHROPIC_API_KEY
87
147
  });
88
148
 
89
149
  const result = await agent.run('list all TypeScript files');
150
+ console.log(result.output);
90
151
  ```
91
152
 
92
- ## Key modules
93
-
94
- ### Agent creation (`src/agent/create-agent.ts`)
153
+ ### Configuration
95
154
 
96
- `createAgent(options)` returns an agent with a `.run(prompt, options?)` method. Options:
155
+ | Option | Type | Default | Description |
156
+ |--------|------|---------|-------------|
157
+ | `toolProvider` | `ToolProvider` | required | Executes tool calls |
158
+ | `loop` | `AgentLoop` | `VercelAgentLoop` | LLM decision loop |
159
+ | `sandboxProvider` | `SandboxProvider` | — | Higher-level sandbox operations |
160
+ | `maxSteps` | `number` | 30 | Max tool steps per run |
161
+ | `telemetry` | `HarnessTelemetry` | — | OpenTelemetry-style tracing |
162
+ | `skillIndexPath` | `string` | — | Path to skill index JSON for routing |
97
163
 
98
- | Option | Type | Description |
99
- |--------|------|-------------|
100
- | `toolProvider` | `ToolProvider` | Required. Executes tool calls |
101
- | `loop` | `AgentLoop` | LLM decision loop (default: `VercelAgentLoop`) |
102
- | `sandboxProvider` | `SandboxProvider` | Optional. Higher-level sandbox ops (file download, exec with env) |
103
- | `maxSteps` | `number` | Max tool steps per run (default: 30) |
104
- | `telemetry` | `HarnessTelemetry` | Optional. OpenTelemetry-style tracing |
105
- | `skillIndexPath` | `string` | Optional. Path to skill index JSON |
164
+ ### VercelAgentLoop
106
165
 
107
- ### Agent loop (`src/loop/vercel-agent-loop.ts`)
166
+ Calls Claude via the Vercel AI SDK. Supports parallel tool calls and configurable system prompt.
108
167
 
109
- `VercelAgentLoop` calls Claude via the Vercel AI SDK. It supports:
110
- - Parallel tool calls (returns `ToolBatchAction` when the LLM requests multiple tools at once)
111
- - Configurable system prompt
112
- - Model selection via `HARNESS_MODEL` env var (default: `claude-sonnet-4-5`)
113
-
114
- ```ts
168
+ ```typescript
115
169
  const loop = new VercelAgentLoop({
116
170
  systemPrompt: 'You are a helpful coding assistant.',
171
+ model: 'claude-sonnet-4-5', // or HARNESS_MODEL env var
117
172
  });
118
173
  ```
119
174
 
120
- ### Tool provider (`src/interfaces/tool-provider.ts`)
175
+ ### LCMToolLoop
121
176
 
122
- The contract for tool execution:
177
+ Wraps another loop to add Lossless Context Management and optional REPL orchestration:
123
178
 
124
- ```ts
125
- interface ToolProvider {
126
- bash(command: string, options?: BashOptions): Promise<ToolResult>;
127
- readFile(path: string, options?: ReadOptions): Promise<ToolResult>;
128
- writeFile(path: string, content: string): Promise<ToolResult>;
129
- editFile(path: string, oldText: string, newText: string): Promise<ToolResult>;
130
- glob(pattern: string, options?: GlobOptions): Promise<ToolResult>;
131
- grep(pattern: string, path?: string, options?: GrepOptions): Promise<ToolResult>;
132
- webFetch?(options: WebFetchOptions): Promise<ToolResult>;
133
- webSearch?(query: string): Promise<ToolResult>;
134
- capabilities(): ToolProviderCapabilities;
135
- }
179
+ ```typescript
180
+ import { LCMToolLoop } from './src/loop/lcm-tool-loop';
181
+ import { VercelAgentLoop } from './src/loop/vercel-agent-loop';
136
182
 
137
- interface ToolResult {
138
- success: boolean;
139
- output: string;
140
- error?: string;
141
- }
183
+ const loop = new LCMToolLoop({
184
+ innerLoop: new VercelAgentLoop(),
185
+ toolProvider: mySandboxProvider,
186
+ enableRepl: true, // default: true
187
+ bridgeDir: '/var/run/bridge',
188
+ onActivity: (entry) => console.log(entry),
189
+ onLlmRequest: async (prompt) => callLLM(prompt),
190
+ onWebFetchRequest: async (url) => fetch(url),
191
+ });
142
192
  ```
143
193
 
144
- Built-in implementations:
194
+ **Standard mode**: Lossless context trimming — the LLM always sees a coherent, budget-fitting view of the full conversation.
145
195
 
146
- | Provider | Description |
147
- |----------|-------------|
148
- | `LocalToolProvider` | Runs tools on the local filesystem |
149
- | `E2BToolProvider` | Routes tools to an E2B-compatible executor over HTTP |
150
- | `CompositeToolProvider` | Combines multiple providers (e.g. sandbox + web) |
196
+ **REPL mode**: When the LLM returns a Bash action with the REPL marker, the loop writes a Python script into the sandbox, injects the bridge module, runs the script, and polls for sub-requests (LLM, web_fetch, ask_user) that the harness fulfills.
151
197
 
152
- ### Action types (`src/agent/types.ts`)
198
+ ---
153
199
 
154
- The LLM returns one of these action types each turn:
200
+ ## ArcLoop (Orchestrator Mode)
155
201
 
156
- ```ts
157
- // Single tool call
158
- interface ToolCallAction {
159
- type: 'tool';
160
- name: 'Bash' | 'Read' | 'Write' | 'Edit' | 'Glob' | 'Grep' | ...;
161
- args: Record<string, unknown>;
162
- }
202
+ For complex tasks that benefit from parallel processes, context management, and memory:
163
203
 
164
- // Multiple independent tool calls (executed in parallel)
165
- interface ToolBatchAction {
166
- type: 'tool_batch';
167
- calls: ToolCallAction[];
168
- }
204
+ ```typescript
205
+ import { createArcAgent } from './src/arc/create-arc-agent';
169
206
 
170
- // Final text response (ends the loop)
171
- interface FinalAction {
172
- type: 'final';
173
- content: string;
174
- }
175
- ```
207
+ const agent = await createArcAgent({
208
+ toolProvider: myToolProvider,
209
+ episodeStore: myEpisodeStore, // required
210
+ sessionMemoStore: mySessionMemoStore, // required
211
+ longTermStore: myLongTermStore, // required
212
+ taskId: 'task-1',
213
+ sessionId: 'session-1',
214
+ });
176
215
 
177
- ### LCM tool loop (`src/loop/lcm-tool-loop.ts`)
216
+ // Streaming
217
+ for await (const event of agent.stream(messages, signal)) {
218
+ if (event.type === 'text_delta') process.stdout.write(event.text);
219
+ if (event.type === 'process_dispatched') console.log(` → ${event.action}`);
220
+ if (event.type === 'done') console.log(`Done in ${event.stats.durationMs}ms`);
221
+ }
178
222
 
179
- `LCMToolLoop` wraps another loop to add LCM-based tool routing, REPL script execution, and bridge-based tool dispatch. Used in the chat-assistant example.
223
+ // Non-streaming
224
+ const result = await agent.run(messages, signal);
225
+ ```
180
226
 
181
- ### Sandbox provider (`src/interfaces/sandbox-provider.ts`)
227
+ ### ArcLoopConfig
228
+
229
+ | Option | Type | Default | Description |
230
+ |--------|------|---------|-------------|
231
+ | `model` | `string` | `'claude-opus-4-6'` | Orchestrator model (ID or tier name) |
232
+ | `modelMap` | `Record<ModelTier, string>` | haiku/sonnet/opus | Maps fast/medium/strong to model IDs |
233
+ | `apiKey` | `string` | — | Anthropic API key |
234
+ | `systemPrompt` | `string` | built-in | Custom orchestrator system prompt |
235
+ | `maxTurns` | `number` | 30 | Max orchestrator turns |
236
+ | `processTimeout` | `number` | 120_000 | Per-process timeout (ms) |
237
+ | `processMaxSteps` | `number` | 20 | Per-process max tool steps |
238
+ | `contextWindowSize` | `number` | 200_000 | Context window in tokens |
239
+ | `outputReserve` | `number` | 20_000 | Tokens reserved for output |
240
+ | `autoMemory` | `boolean` | true | Auto-detect patterns from episodes |
241
+ | `episodeStore` | `EpisodeStore` | required | Stores episode summaries + traces |
242
+ | `sessionMemoStore` | `SessionMemoStore` | required | Stores session memos |
243
+ | `longTermStore` | `LongTermStore` | required | Stores long-term memories |
244
+ | `taskId` | `string` | required | Task identifier |
245
+ | `sessionId` | `string` | required | Session identifier |
246
+ | `toolProvider` | `ToolProvider` | required | Tool execution |
247
+ | `processTools` | `Record<string, AnyTool>` | builtinTools | Tools available inside processes |
248
+ | `extraOrchestratorTools` | `Record<string, AnyTool>` | — | Custom orchestrator tools |
249
+ | `onOrchestratorTool` | `function` | — | Handler for custom orchestrator tools |
250
+ | `resilience` | `ResiliencePolicy` | — | Composable resilience pipeline |
251
+ | `traceWriter` | `function` | — | Callback for trace event emission |
252
+
253
+ ### Resilience
254
+
255
+ ```typescript
256
+ import { resilience } from './src/arc/resilience';
257
+
258
+ const pipeline = resilience()
259
+ .retry({ maxRetries: 2, baseDelay: 1000 })
260
+ .timeout({ durationMs: 30_000 })
261
+ .circuitBreaker({ failureThreshold: 5 })
262
+ .build();
263
+
264
+ const agent = await createArcAgent({
265
+ // ...config
266
+ resilience: pipeline,
267
+ });
268
+ ```
182
269
 
183
- Higher-level sandbox operations beyond basic tool calls:
270
+ ### Trace Emission
184
271
 
185
- ```ts
186
- interface SandboxProvider {
187
- exec(command: string, options?: SandboxExecOptions): Promise<SandboxExecResult>;
188
- readSandboxFile(path: string): Promise<SandboxFileBlob>;
189
- writeSandboxFile(path: string, content: SandboxFileBlob): Promise<void>;
190
- }
272
+ ```typescript
273
+ const traces: TraceEvent[] = [];
274
+ const agent = await createArcAgent({
275
+ // ...config
276
+ traceWriter: (event) => traces.push(event),
277
+ });
191
278
  ```
192
279
 
193
- ### Observability (`src/observability/otel.ts`)
280
+ Traces can be validated against the formal model: `cd verify && cargo run -- trace file.ndjson`
194
281
 
195
- `HarnessTelemetry` provides OpenTelemetry-style spans and metrics for agent runs.
282
+ ---
196
283
 
197
- ## Package layout
284
+ ## Package Layout
198
285
 
199
286
  ```
200
287
  src/
201
288
  ├── agent/ # createAgent, step executor, types
289
+ ├── arc/ # ArcLoop orchestrator, processes, memory, resilience
290
+ │ ├── resilience/ # Retry, circuit breaker, timeout, bulkhead, fallback
291
+ │ ├── stores/ # RxDB + in-memory store implementations
292
+ │ └── object-store/ # Pluggable cloud sync (fs, memory)
202
293
  ├── interfaces/ # ToolProvider, SandboxProvider, AgentLoop contracts
203
294
  ├── loop/ # VercelAgentLoop, LCMToolLoop
204
295
  ├── providers/ # LocalToolProvider, E2BToolProvider, ControlPlaneE2BExecutor
@@ -206,16 +297,20 @@ src/
206
297
  ├── hooks/ # Pre/post tool call hooks
207
298
  ├── permissions/ # Tool permission checks
208
299
  ├── sessions/ # Session persistence
209
- ├── subagents/ # Subagent spawning and task tools
300
+ ├── subagents/ # Subagent spawning
210
301
  ├── skills/ # Skill index, routing, and management
211
302
  ├── optimization/ # Benchmark runner
212
303
  └── observability/ # OpenTelemetry integration
304
+
305
+ verify/ # Rust formal verification (Stateright model checker)
306
+ testing/ # Adversarial scenario replay harness
307
+ tests/ # Vitest test suite
213
308
  ```
214
309
 
215
310
  ## Documentation
216
311
 
217
- - Provider guide: `docs/guides/providers.md`
218
- - Skills guide: `docs/guides/skills.md`
219
- - Observability guide: `docs/guides/observability.md`
220
- - Release process: `../docs/RELEASE.md`
221
- - Full example: [`../examples/chat-assistant/src/chat.ts`](../examples/chat-assistant/src/chat.ts)
312
+ - [Arc architecture](../docs/arc.md) — process model, context window, memory, resilience, verification
313
+ - [Testing](../docs/testing.md) — test layers, running tests, writing new tests
314
+ - [Sandbox setup](../docs/PUBLIC_SANDBOX.md) — deploying the sandbox service
315
+ - [Release process](../docs/RELEASE.md) — versioning and publishing
316
+ - [Example](../examples/chat-assistant/src/chat.ts) — complete working chat assistant
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@bluecopa/harness",
3
- "version": "0.1.0-snapshot.3",
3
+ "version": "0.1.0-snapshot.30",
4
4
  "description": "Provider-agnostic TypeScript agent framework",
5
5
  "license": "UNLICENSED",
6
6
  "scripts": {
@@ -10,6 +10,7 @@
10
10
  "dependencies": {
11
11
  "@ai-sdk/anthropic": "^3.0.48",
12
12
  "ai": "^6.0.101",
13
+ "rxdb": "^15.39.0",
13
14
  "zod": "^4.1.11"
14
15
  },
15
16
  "devDependencies": {
package/src/agent/create-agent.ts CHANGED
@@ -37,6 +37,8 @@ export interface AgentRuntime {
37
37
  /** Custom tool executor. Called for every tool action. Return null to fall through to built-in dispatch.
38
38
  * When hookRunner/permissionManager are provided on the runtime, they are automatically applied before/after this callback — no manual wiring needed. */
39
39
  executeToolAction?: (action: ToolCallAction) => Promise<ToolResult | null>;
40
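+ /** Progress callback fired before/after each tool call during run(). For batched calls, every tool_end reports the elapsed time of the whole batch, not of the individual call. */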
41
+ onToolProgress?: (event: { type: 'tool_start'; name: string; args: Record<string, unknown> } | { type: 'tool_end'; name: string; success: boolean; durationMs: number }) => void;
40
42
  }
41
43
 
42
44
  /**
@@ -596,10 +598,14 @@ export function createAgent(runtime: AgentRuntime) {
596
598
 
597
599
  // Execute valid calls via batch (sequential sandbox ops) or parallel fallback
598
600
  if (validCalls.length > 0) {
601
+ for (const c of validCalls) runtime.onToolProgress?.({ type: 'tool_start', name: c.name, args: c.args });
602
+ const batchStart = Date.now();
599
603
  const results = await executeBatch(validCalls, runtime.toolProvider, runtime);
604
+ const batchMs = Date.now() - batchStart;
600
605
  for (let i = 0; i < validCalls.length; i++) {
601
606
  const call = validCalls[i]!;
602
607
  const r = results[i]!;
608
+ runtime.onToolProgress?.({ type: 'tool_end', name: call.name, success: r.success, durationMs: batchMs });
603
609
  if (!r.success) {
604
610
  recordAgentError(runtime.telemetry);
605
611
  }
@@ -659,6 +665,8 @@ export function createAgent(runtime: AgentRuntime) {
659
665
  } else {
660
666
  consecutiveInvalid = 0;
661
667
  }
668
+ runtime.onToolProgress?.({ type: 'tool_start', name: action.name, args: action.args });
669
+ const singleStart = Date.now();
662
670
  const result = validationError
663
671
  ? ({ success: false, output: '', error: validationError } as ToolResult)
664
672
  : await executor.run(async () => {
@@ -672,6 +680,7 @@ export function createAgent(runtime: AgentRuntime) {
672
680
  };
673
681
  }
674
682
  });
683
+ runtime.onToolProgress?.({ type: 'tool_end', name: action.name, success: result.success, durationMs: Date.now() - singleStart });
675
684
  if (!result.success) {
676
685
  recordAgentError(runtime.telemetry);
677
686
  }
@@ -718,8 +727,7 @@ export function createAgent(runtime: AgentRuntime) {
718
727
  if (event.type === 'text_delta') {
719
728
  finalText += event.text;
720
729
  yield event;
721
- }
722
- if (event.type === 'tool_start') {
730
+ } else if (event.type === 'tool_start') {
723
731
  pendingTools.push({
724
732
  type: 'tool',
725
733
  name: event.name,
@@ -727,6 +735,11 @@ export function createAgent(runtime: AgentRuntime) {
727
735
  ...(event.toolCallId != null ? { toolCallId: event.toolCallId } : {}),
728
736
  });
729
737
  yield event;
738
+ } else {
739
+ // Forward all other events (tool_end, step_start, step_end, done)
740
+ // from self-managing loops like ArcLoop
741
+ yield event;
742
+ if (event.type === 'done') return;
730
743
  }
731
744
  }
732
745
 
package/src/agent/types.ts CHANGED
@@ -11,13 +11,26 @@ export interface ToolResultInfo {
11
11
  isError?: boolean;
12
12
  }
13
13
 
14
+ export type ContentPart =
15
+ | { type: 'text'; text: string }
16
+ | { type: 'image'; image: Buffer | Uint8Array; mimeType: string };
17
+
14
18
  export interface AgentMessage {
15
19
  role: 'system' | 'user' | 'assistant' | 'tool';
16
- content: string;
20
+ content: string | ContentPart[];
17
21
  toolCalls?: ToolCallInfo[]; // assistant messages: what tools were called
18
22
  toolResults?: ToolResultInfo[]; // tool messages: results keyed by toolCallId
19
23
  }
20
24
 
25
+ /** Extract plain text from content: a string is returned as-is; for ContentPart[], text parts are joined with newlines and image parts are dropped. */
26
+ export function getTextContent(content: string | ContentPart[]): string {
27
+ if (typeof content === 'string') return content;
28
+ return content
29
+ .filter((p): p is Extract<ContentPart, { type: 'text' }> => p.type === 'text')
30
+ .map((p) => p.text)
31
+ .join('\n');
32
+ }
33
+
21
34
  export interface ToolCallAction {
22
35
  type: 'tool';
23
36
  name: string;
@@ -46,7 +59,7 @@ export interface AgentRunResult {
46
59
  export type AgentStreamEvent =
47
60
  | { type: 'text_delta'; text: string }
48
61
  | { type: 'tool_start'; name: string; args: Record<string, unknown>; toolCallId?: string }
49
- | { type: 'tool_end'; name: string; result: { success: boolean; output: string; error?: string } }
62
+ | { type: 'tool_end'; name: string; result: { success: boolean; output: string; error?: string; [key: string]: unknown } }
50
63
  | { type: 'step_start'; step: number }
51
64
  | { type: 'step_end'; step: number }
52
65
  | { type: 'done'; output: string; steps: number };