@bluecopa/harness 0.1.0-snapshot.5 → 0.1.0-snapshot.50

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88)
  1. package/README.md +212 -117
  2. package/package.json +2 -1
  3. package/src/agent/create-agent.ts +30 -27
  4. package/src/agent/types.ts +20 -24
  5. package/src/arc/agent-runner.ts +955 -0
  6. package/src/arc/arc-loop.ts +845 -0
  7. package/src/arc/arc-types.ts +115 -0
  8. package/src/arc/bridge-tools.ts +170 -0
  9. package/src/arc/bridged-tool-provider.ts +80 -0
  10. package/src/arc/consolidation.ts +118 -0
  11. package/src/arc/context-window.ts +267 -0
  12. package/src/arc/create-arc-agent.ts +99 -0
  13. package/src/arc/debug.ts +62 -0
  14. package/src/arc/episode-compressor.ts +225 -0
  15. package/src/arc/memory-manager.ts +245 -0
  16. package/src/arc/message-convert.ts +123 -0
  17. package/src/arc/multi-model.ts +70 -0
  18. package/src/arc/object-store/fs-object-store.ts +60 -0
  19. package/src/arc/object-store/memory-object-store.ts +41 -0
  20. package/src/arc/object-store/object-store.ts +12 -0
  21. package/src/arc/profile-builder.ts +172 -0
  22. package/src/arc/resilience/bulkhead.ts +110 -0
  23. package/src/arc/resilience/circuit-breaker.ts +112 -0
  24. package/src/arc/resilience/fallback.ts +27 -0
  25. package/src/arc/resilience/index.ts +21 -0
  26. package/src/arc/resilience/pipeline.ts +103 -0
  27. package/src/arc/resilience/retry.ts +90 -0
  28. package/src/arc/resilience/timeout.ts +60 -0
  29. package/src/arc/resilience/types.ts +71 -0
  30. package/src/arc/result-pager.ts +77 -0
  31. package/src/arc/sig.ts +115 -0
  32. package/src/arc/skill-resolver.ts +81 -0
  33. package/src/arc/stores/episode-store.ts +120 -0
  34. package/src/arc/stores/long-term-store.ts +86 -0
  35. package/src/arc/stores/rxdb-setup.ts +113 -0
  36. package/src/arc/stores/session-memo-store.ts +58 -0
  37. package/src/arc/tools.ts +67 -0
  38. package/src/arc/types.ts +363 -0
  39. package/src/arc/utils.ts +37 -0
  40. package/src/hooks/middleware.ts +95 -0
  41. package/src/interfaces/hooks.ts +2 -1
  42. package/src/interfaces/tool-provider.ts +0 -2
  43. package/src/loop/context-store.ts +12 -9
  44. package/src/loop/vercel-agent-loop.ts +44 -118
  45. package/src/skills/skill-router.ts +12 -6
  46. package/testing/index.ts +22 -0
  47. package/testing/scenario-replay.ts +209 -0
  48. package/testing/scenario-types.ts +38 -0
  49. package/testing/scripted-llm.ts +230 -0
  50. package/tests/arc/channel.test.ts +170 -0
  51. package/tests/arc/context-window.test.ts +396 -0
  52. package/tests/arc/e2e.test.ts +353 -0
  53. package/tests/arc/error-paths.test.ts +402 -0
  54. package/tests/arc/live-integration.test.ts +357 -0
  55. package/tests/arc/memory-manager.test.ts +384 -0
  56. package/tests/arc/middleware.test.ts +113 -0
  57. package/tests/arc/process-interleaving.test.ts +432 -0
  58. package/tests/arc/process-profiles.test.ts +366 -0
  59. package/tests/arc/resilience-integration.test.ts +381 -0
  60. package/tests/arc/resilience.test.ts +575 -0
  61. package/tests/arc/result-paging.test.ts +392 -0
  62. package/tests/arc/scenario-driven.test.ts +297 -0
  63. package/tests/arc/tool-dispatch.test.ts +340 -0
  64. package/tests/arc/wasm-pbt.test.ts +104 -0
  65. package/tests/integration/agent-skill-default-from-sandbox.spec.ts +3 -2
  66. package/tests/unit/structured-messages.spec.ts +1 -1
  67. package/verify/Cargo.lock +637 -0
  68. package/verify/Cargo.toml +24 -0
  69. package/verify/src/lib.rs +5 -0
  70. package/verify/src/main.rs +165 -0
  71. package/verify/src/model/context.rs +100 -0
  72. package/verify/src/model/mod.rs +6 -0
  73. package/verify/src/model/orchestrator.rs +371 -0
  74. package/verify/src/model/process.rs +140 -0
  75. package/verify/src/model/types.rs +273 -0
  76. package/verify/src/properties/liveness.rs +32 -0
  77. package/verify/src/properties/mod.rs +4 -0
  78. package/verify/src/properties/safety.rs +78 -0
  79. package/verify/src/trace/event.rs +155 -0
  80. package/verify/src/trace/mod.rs +2 -0
  81. package/verify/src/trace/validator.rs +367 -0
  82. package/verify/src/wasm/mod.rs +3 -0
  83. package/verify/src/wasm/scenario_generator.rs +400 -0
  84. package/verify/src/wasm/types.rs +104 -0
  85. package/verify/src/wasm/wasm_validator.rs +107 -0
  86. package/verify/tests/model_check.rs +49 -0
  87. package/verify/tests/trace_validation.rs +147 -0
  88. package/vitest.config.ts +1 -1
package/README.md CHANGED
@@ -2,9 +2,17 @@
2
2
 
3
3
  Provider-agnostic TypeScript agent framework with Claude-code-compatible tool semantics.
4
4
 
5
- The harness provides the core loop that drives an AI agent: send messages to an LLM, execute the tool calls it returns, feed results back, and repeat until the LLM produces a final text response.
5
+ Published on npm as **`@bluecopa/harness`**.
6
6
 
7
- ## Quickstart
7
+ Two execution modes: a simple single-agent loop (`createAgent` + `VercelAgentLoop`) and a process-based orchestrator (`ArcLoop`) that dispatches parallel processes with context management, memory, and resilience.
8
+
9
+ ## Install
10
+
11
+ ```bash
12
+ pnpm add @bluecopa/harness
13
+ ```
14
+
15
+ ## Development
8
16
 
9
17
  ```bash
10
18
  pnpm install
@@ -13,9 +21,11 @@ pnpm test
13
21
 
14
22
  ## Architecture
15
23
 
24
+ ### Single-Agent Loop
25
+
16
26
  ```
17
27
  ┌──────────────┐ ┌──────────────┐ ┌──────────────────┐
18
- │ createAgent │────▶│ AgentLoop │────▶│ LLM (Claude) │
28
+ │ createAgent │────►│ AgentLoop │────►│ LLM (Claude) │
19
29
  │ (turn loop) │ │ (nextAction)│ │ │
20
30
  └──────┬───────┘ └──────────────┘ └──────────────────┘
21
31
  │ │
@@ -27,20 +37,82 @@ pnpm test
27
37
  └──────────────┘
28
38
  ```
29
39
 
30
- 1. `createAgent` drives a deterministic step loop
31
- 2. Each step calls `loop.nextAction(messages)` to get the LLM's decision
32
- 3. If it's a tool call, the harness executes it via `ToolProvider` and appends the result
33
- 4. If it's a final action, the loop ends and returns the result
40
+ ### ArcLoop Orchestrator
34
41
 
35
- ## Using with the sandbox
42
+ ```
43
+ Orchestrator (ArcLoop — Opus 4.6 by default)
44
+ │ tools: Thread, Check, Cancel, Remember, ReadEpisode
45
+
46
+ │ Turn 1 (parallel):
47
+ ├──► Process 0 ("read auth", model=fast) ─┐
48
+ ├──► Process 1 ("read routes", model=fast) ─┼──► Episodes
49
+ ├──► Process 2 ("read tests", model=fast) ─┘
50
+
51
+ │ Turn 2 (dispatch dependent work):
52
+ ├──► Thread("fix bug", context=[ep0,ep1,ep2]) ──► Episode
53
+
54
+ │ Turn 3 (parallel):
55
+ ├──► Thread("run tests", context=[ep3]) ─┐
56
+ ├──► Thread("update docs", context=[ep3]) ─┘
57
+
58
+ └──► Final text response
59
+ ```
36
60
 
37
- The most common setup connects the harness to a running sandbox service via `ControlPlaneE2BExecutor`:
61
+ Full architecture doc: [`docs/arc.md`](../docs/arc.md)
38
62
 
39
- ```ts
40
- import { createAgent } from './src/agent/create-agent';
41
- import { E2BToolProvider } from './src/providers/e2b-tool-provider';
63
+ ---
64
+
65
+ ## ToolProvider
66
+
67
+ The contract for tool execution. All agent modes use this interface.
68
+
69
+ ```typescript
70
+ interface ToolProvider {
71
+ bash(command: string, options?: BashOptions): Promise<ToolResult>;
72
+ readFile(path: string, options?: ReadOptions): Promise<ToolResult>;
73
+ writeFile(path: string, content: string): Promise<ToolResult>;
74
+ editFile(path: string, oldText: string, newText: string): Promise<ToolResult>;
75
+ glob(pattern: string, options?: GlobOptions): Promise<ToolResult>;
76
+ grep(pattern: string, path?: string, options?: GrepOptions): Promise<ToolResult>;
77
+ webFetch?(options: WebFetchOptions): Promise<ToolResult>;
78
+ webSearch?(query: string): Promise<ToolResult>;
79
+ capabilities(): ToolProviderCapabilities;
80
+ }
81
+
82
+ interface ToolResult {
83
+ success: boolean;
84
+ output: string;
85
+ error?: string;
86
+ }
87
+ ```
88
+
89
+ Built-in implementations:
90
+
91
+ | Provider | Description |
92
+ |----------|-------------|
93
+ | `LocalToolProvider` | Runs tools on the local filesystem |
94
+ | `E2BToolProvider` | Routes tools to a sandbox VM via `ControlPlaneE2BExecutor` |
95
+ | `CompositeToolProvider` | Combines multiple providers (e.g. local filesystem + sandbox bash) |
96
+
97
+ ## SandboxProvider
98
+
99
+ Higher-level sandbox operations beyond basic tool calls:
100
+
101
+ ```typescript
102
+ interface SandboxProvider {
103
+ exec(command: string, options?: SandboxExecOptions): Promise<SandboxExecResult>;
104
+ readSandboxFile(path: string): Promise<SandboxFileBlob>;
105
+ writeSandboxFile(path: string, content: SandboxFileBlob): Promise<void>;
106
+ }
107
+ ```
108
+
109
+ Used by `SkillManager` for executing skill scripts in isolated VMs.
110
+
111
+ ## Connecting to a Sandbox
112
+
113
+ ```typescript
42
114
  import { ControlPlaneE2BExecutor } from './src/providers/control-plane-e2b-executor';
43
- import { VercelAgentLoop } from './src/loop/vercel-agent-loop';
115
+ import { E2BToolProvider } from './src/providers/e2b-tool-provider';
44
116
 
45
117
  // Connect to sandbox service
46
118
  const executor = new ControlPlaneE2BExecutor({
@@ -50,155 +122,174 @@ const executor = new ControlPlaneE2BExecutor({
50
122
  });
51
123
  await executor.initialize(); // creates a Firecracker VM
52
124
 
53
- // Build and run the agent
54
- const agent = createAgent({
55
- toolProvider: new E2BToolProvider(executor),
56
- loop: new VercelAgentLoop(), // needs ANTHROPIC_API_KEY
57
- });
125
+ const toolProvider = new E2BToolProvider(executor);
58
126
 
59
- const result = await agent.run('create a bar chart of sales data');
60
- console.log(result.output); // LLM's final response
61
- console.log(result.steps); // number of tool steps
127
+ // ... use with createAgent or ArcLoop
62
128
 
63
- await executor.destroy(); // tears down the VM
129
+ await executor.destroy(); // tears down the VM
64
130
  ```
65
131
 
66
- For a complete working example, see [`examples/chat-assistant/src/chat.ts`](../examples/chat-assistant/src/chat.ts).
67
-
68
- ### From environment variables
132
+ From environment variables: `ControlPlaneE2BExecutor.fromEnv()` reads `SAMYX_BASE_URL` and `SAMYX_API_KEY`.
69
133
 
70
- `ControlPlaneE2BExecutor.fromEnv()` reads `SAMYX_BASE_URL` and `SAMYX_API_KEY` automatically:
134
+ ---
71
135
 
72
- ```ts
73
- const executor = ControlPlaneE2BExecutor.fromEnv();
74
- ```
136
+ ## Single-Agent Mode (`createAgent`)
75
137
 
76
- ## Using locally (no sandbox)
138
+ For simple tasks that don't need orchestration:
77
139
 
78
- For development without a sandbox service, use `LocalToolProvider` which runs tools on the local machine:
79
-
80
- ```ts
140
+ ```typescript
81
141
  import { createAgent } from './src/agent/create-agent';
82
142
  import { LocalToolProvider } from './src/providers/local-tool-provider';
83
143
 
84
144
  const agent = createAgent({
85
145
  toolProvider: new LocalToolProvider(process.cwd()),
86
- loop: new VercelAgentLoop(),
146
+ loop: new VercelAgentLoop(), // needs ANTHROPIC_API_KEY
87
147
  });
88
148
 
89
149
  const result = await agent.run('list all TypeScript files');
150
+ console.log(result.output);
90
151
  ```
91
152
 
92
- ## Key modules
93
-
94
- ### Agent creation (`src/agent/create-agent.ts`)
153
+ ### Configuration
95
154
 
96
- `createAgent(options)` returns an agent with a `.run(prompt, options?)` method. Options:
155
+ | Option | Type | Default | Description |
156
+ |--------|------|---------|-------------|
157
+ | `toolProvider` | `ToolProvider` | required | Executes tool calls |
158
+ | `loop` | `AgentLoop` | `VercelAgentLoop` | LLM decision loop |
159
+ | `sandboxProvider` | `SandboxProvider` | — | Higher-level sandbox operations |
160
+ | `maxSteps` | `number` | 30 | Max tool steps per run |
161
+ | `telemetry` | `HarnessTelemetry` | — | OpenTelemetry-style tracing |
162
+ | `skillIndexPath` | `string` | — | Path to skill index JSON for routing |
97
163
 
98
- | Option | Type | Description |
99
- |--------|------|-------------|
100
- | `toolProvider` | `ToolProvider` | Required. Executes tool calls |
101
- | `loop` | `AgentLoop` | LLM decision loop (default: `VercelAgentLoop`) |
102
- | `sandboxProvider` | `SandboxProvider` | Optional. Higher-level sandbox ops (file download, exec with env) |
103
- | `maxSteps` | `number` | Max tool steps per run (default: 30) |
104
- | `telemetry` | `HarnessTelemetry` | Optional. OpenTelemetry-style tracing |
105
- | `skillIndexPath` | `string` | Optional. Path to skill index JSON |
164
+ ### VercelAgentLoop
106
165
 
107
- ### Agent loop (`src/loop/vercel-agent-loop.ts`)
166
+ Calls Claude via the Vercel AI SDK. Supports parallel tool calls and configurable system prompt.
108
167
 
109
- `VercelAgentLoop` calls Claude via the Vercel AI SDK. It supports:
110
- - Parallel tool calls (returns `ToolBatchAction` when the LLM requests multiple tools at once)
111
- - Configurable system prompt
112
- - Model selection via `HARNESS_MODEL` env var (default: `claude-sonnet-4-5`)
113
-
114
- ```ts
168
+ ```typescript
115
169
  const loop = new VercelAgentLoop({
116
170
  systemPrompt: 'You are a helpful coding assistant.',
171
+ model: 'claude-sonnet-4-5', // or HARNESS_MODEL env var
117
172
  });
118
173
  ```
119
174
 
120
- ### Tool provider (`src/interfaces/tool-provider.ts`)
175
+ ### LCMToolLoop
121
176
 
122
- The contract for tool execution:
177
+ Wraps another loop to add Lossless Context Management and optional REPL orchestration:
123
178
 
124
- ```ts
125
- interface ToolProvider {
126
- bash(command: string, options?: BashOptions): Promise<ToolResult>;
127
- readFile(path: string, options?: ReadOptions): Promise<ToolResult>;
128
- writeFile(path: string, content: string): Promise<ToolResult>;
129
- editFile(path: string, oldText: string, newText: string): Promise<ToolResult>;
130
- glob(pattern: string, options?: GlobOptions): Promise<ToolResult>;
131
- grep(pattern: string, path?: string, options?: GrepOptions): Promise<ToolResult>;
132
- webFetch?(options: WebFetchOptions): Promise<ToolResult>;
133
- webSearch?(query: string): Promise<ToolResult>;
134
- capabilities(): ToolProviderCapabilities;
135
- }
179
+ ```typescript
180
+ import { LCMToolLoop } from './src/loop/lcm-tool-loop';
181
+ import { VercelAgentLoop } from './src/loop/vercel-agent-loop';
136
182
 
137
- interface ToolResult {
138
- success: boolean;
139
- output: string;
140
- error?: string;
141
- }
183
+ const loop = new LCMToolLoop({
184
+ innerLoop: new VercelAgentLoop(),
185
+ toolProvider: mySandboxProvider,
186
+ enableRepl: true, // default: true
187
+ bridgeDir: '/var/run/bridge',
188
+ onActivity: (entry) => console.log(entry),
189
+ onLlmRequest: async (prompt) => callLLM(prompt),
190
+ onWebFetchRequest: async (url) => fetch(url),
191
+ });
142
192
  ```
143
193
 
144
- Built-in implementations:
194
+ **Standard mode**: Lossless context trimming — the LLM always sees a coherent, budget-fitting view of the full conversation.
145
195
 
146
- | Provider | Description |
147
- |----------|-------------|
148
- | `LocalToolProvider` | Runs tools on the local filesystem |
149
- | `E2BToolProvider` | Routes tools to an E2B-compatible executor over HTTP |
150
- | `CompositeToolProvider` | Combines multiple providers (e.g. sandbox + web) |
196
+ **REPL mode**: When the LLM returns a Bash action with the REPL marker, the loop writes a Python script into the sandbox, injects the bridge module, runs the script, and polls for sub-requests (LLM, web_fetch, ask_user) that the harness fulfills.
151
197
 
152
- ### Action types (`src/agent/types.ts`)
198
+ ---
153
199
 
154
- The LLM returns one of these action types each turn:
200
+ ## ArcLoop (Orchestrator Mode)
155
201
 
156
- ```ts
157
- // Single tool call
158
- interface ToolCallAction {
159
- type: 'tool';
160
- name: 'Bash' | 'Read' | 'Write' | 'Edit' | 'Glob' | 'Grep' | ...;
161
- args: Record<string, unknown>;
162
- }
202
+ For complex tasks that benefit from parallel processes, context management, and memory:
163
203
 
164
- // Multiple independent tool calls (executed in parallel)
165
- interface ToolBatchAction {
166
- type: 'tool_batch';
167
- calls: ToolCallAction[];
168
- }
204
+ ```typescript
205
+ import { createArcAgent } from './src/arc/create-arc-agent';
169
206
 
170
- // Final text response (ends the loop)
171
- interface FinalAction {
172
- type: 'final';
173
- content: string;
174
- }
175
- ```
207
+ const agent = await createArcAgent({
208
+ toolProvider: myToolProvider,
209
+ episodeStore: myEpisodeStore, // required
210
+ sessionMemoStore: mySessionMemoStore, // required
211
+ longTermStore: myLongTermStore, // required
212
+ taskId: 'task-1',
213
+ sessionId: 'session-1',
214
+ });
176
215
 
177
- ### LCM tool loop (`src/loop/lcm-tool-loop.ts`)
216
+ // Streaming
217
+ for await (const event of agent.stream(messages, signal)) {
218
+ if (event.type === 'text_delta') process.stdout.write(event.text);
219
+ if (event.type === 'process_dispatched') console.log(` → ${event.action}`);
220
+ if (event.type === 'done') console.log(`Done in ${event.stats.durationMs}ms`);
221
+ }
178
222
 
179
- `LCMToolLoop` wraps another loop to add LCM-based tool routing, REPL script execution, and bridge-based tool dispatch. Used in the chat-assistant example.
223
+ // Non-streaming
224
+ const result = await agent.run(messages, signal);
225
+ ```
180
226
 
181
- ### Sandbox provider (`src/interfaces/sandbox-provider.ts`)
227
+ ### ArcLoopConfig
228
+
229
+ | Option | Type | Default | Description |
230
+ |--------|------|---------|-------------|
231
+ | `model` | `string` | `'claude-opus-4-6'` | Orchestrator model (ID or tier name) |
232
+ | `modelMap` | `Record<ModelTier, string>` | haiku/sonnet/opus | Maps fast/medium/strong to model IDs |
233
+ | `apiKey` | `string` | — | Anthropic API key |
234
+ | `systemPrompt` | `string` | built-in | Custom orchestrator system prompt |
235
+ | `maxTurns` | `number` | 30 | Max orchestrator turns |
236
+ | `processTimeout` | `number` | 120_000 | Per-process timeout (ms) |
237
+ | `processMaxSteps` | `number` | 20 | Per-process max tool steps |
238
+ | `contextWindowSize` | `number` | 200_000 | Context window in tokens |
239
+ | `outputReserve` | `number` | 20_000 | Tokens reserved for output |
240
+ | `autoMemory` | `boolean` | true | Auto-detect patterns from episodes |
241
+ | `episodeStore` | `EpisodeStore` | required | Stores episode summaries + traces |
242
+ | `sessionMemoStore` | `SessionMemoStore` | required | Stores session memos |
243
+ | `longTermStore` | `LongTermStore` | required | Stores long-term memories |
244
+ | `taskId` | `string` | required | Task identifier |
245
+ | `sessionId` | `string` | required | Session identifier |
246
+ | `toolProvider` | `ToolProvider` | required | Tool execution |
247
+ | `processTools` | `Record<string, AnyTool>` | builtinTools | Tools available inside processes |
248
+ | `extraOrchestratorTools` | `Record<string, AnyTool>` | — | Custom orchestrator tools |
249
+ | `onOrchestratorTool` | `function` | — | Handler for custom orchestrator tools |
250
+ | `resilience` | `ResiliencePolicy` | — | Composable resilience pipeline |
251
+ | `traceWriter` | `function` | — | Callback for trace event emission |
252
+
253
+ ### Resilience
254
+
255
+ ```typescript
256
+ import { resilience } from './src/arc/resilience';
257
+
258
+ const pipeline = resilience()
259
+ .retry({ maxRetries: 2, baseDelay: 1000 })
260
+ .timeout({ durationMs: 30_000 })
261
+ .circuitBreaker({ failureThreshold: 5 })
262
+ .build();
263
+
264
+ const agent = await createArcAgent({
265
+ // ...config
266
+ resilience: pipeline,
267
+ });
268
+ ```
182
269
 
183
- Higher-level sandbox operations beyond basic tool calls:
270
+ ### Trace Emission
184
271
 
185
- ```ts
186
- interface SandboxProvider {
187
- exec(command: string, options?: SandboxExecOptions): Promise<SandboxExecResult>;
188
- readSandboxFile(path: string): Promise<SandboxFileBlob>;
189
- writeSandboxFile(path: string, content: SandboxFileBlob): Promise<void>;
190
- }
272
+ ```typescript
273
+ const traces: TraceEvent[] = [];
274
+ const agent = await createArcAgent({
275
+ // ...config
276
+ traceWriter: (event) => traces.push(event),
277
+ });
191
278
  ```
192
279
 
193
- ### Observability (`src/observability/otel.ts`)
280
+ Traces can be validated against the formal model: `cd verify && cargo run -- trace file.ndjson`
194
281
 
195
- `HarnessTelemetry` provides OpenTelemetry-style spans and metrics for agent runs.
282
+ ---
196
283
 
197
- ## Package layout
284
+ ## Package Layout
198
285
 
199
286
  ```
200
287
  src/
201
288
  ├── agent/ # createAgent, step executor, types
289
+ ├── arc/ # ArcLoop orchestrator, processes, memory, resilience
290
+ │ ├── resilience/ # Retry, circuit breaker, timeout, bulkhead, fallback
291
+ │ ├── stores/ # RxDB + in-memory store implementations
292
+ │ └── object-store/ # Pluggable cloud sync (fs, memory)
202
293
  ├── interfaces/ # ToolProvider, SandboxProvider, AgentLoop contracts
203
294
  ├── loop/ # VercelAgentLoop, LCMToolLoop
204
295
  ├── providers/ # LocalToolProvider, E2BToolProvider, ControlPlaneE2BExecutor
@@ -206,16 +297,20 @@ src/
206
297
  ├── hooks/ # Pre/post tool call hooks
207
298
  ├── permissions/ # Tool permission checks
208
299
  ├── sessions/ # Session persistence
209
- ├── subagents/ # Subagent spawning and task tools
300
+ ├── subagents/ # Subagent spawning
210
301
  ├── skills/ # Skill index, routing, and management
211
302
  ├── optimization/ # Benchmark runner
212
303
  └── observability/ # OpenTelemetry integration
304
+
305
+ verify/ # Rust formal verification (Stateright model checker)
306
+ testing/ # Adversarial scenario replay harness
307
+ tests/ # Vitest test suite
213
308
  ```
214
309
 
215
310
  ## Documentation
216
311
 
217
- - Provider guide: `docs/guides/providers.md`
218
- - Skills guide: `docs/guides/skills.md`
219
- - Observability guide: `docs/guides/observability.md`
220
- - Release process: `../docs/RELEASE.md`
221
- - Full example: [`../examples/chat-assistant/src/chat.ts`](../examples/chat-assistant/src/chat.ts)
312
+ - [Arc architecture](../docs/arc.md) — process model, context window, memory, resilience, verification
313
+ - [Testing](../docs/testing.md) — test layers, running tests, writing new tests
314
+ - [Sandbox setup](../docs/PUBLIC_SANDBOX.md) — deploying the sandbox service
315
+ - [Release process](../docs/RELEASE.md) — versioning and publishing
316
+ - [Example](../examples/chat-assistant/src/chat.ts) — complete working chat assistant
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@bluecopa/harness",
3
- "version": "0.1.0-snapshot.5",
3
+ "version": "0.1.0-snapshot.50",
4
4
  "description": "Provider-agnostic TypeScript agent framework",
5
5
  "license": "UNLICENSED",
6
6
  "scripts": {
@@ -10,6 +10,7 @@
10
10
  "dependencies": {
11
11
  "@ai-sdk/anthropic": "^3.0.48",
12
12
  "ai": "^6.0.101",
13
+ "rxdb": "^15.39.0",
13
14
  "zod": "^4.1.11"
14
15
  },
15
16
  "devDependencies": {
package/src/agent/create-agent.ts CHANGED
@@ -7,14 +7,12 @@ import type { HarnessTelemetry } from '../observability/otel';
7
7
  import { HookRunner } from '../hooks/hook-runner';
8
8
  import { PermissionManager } from '../permissions/permission-manager';
9
9
  import { VercelAgentLoop } from '../loop/vercel-agent-loop';
10
- export type { SystemPromptBlock, VercelAgentLoopConfig } from '../loop/vercel-agent-loop';
11
- export type { PrepareStepContext, PrepareStepResult } from './types';
12
10
  import { SkillManager } from '../skills/skill-manager';
13
11
  import { SkillRouter } from '../skills/skill-router';
14
12
  import type { SkillSummary } from '../skills/skill-types';
15
13
  import { SingleFlightStepExecutor } from './step-executor';
16
- import type { AgentAction, AgentLoop, AgentMessage, AgentRunResult, AgentStreamEvent, StepUsage, ToolCallAction, ToolBatchAction, ToolCallInfo, ToolResultInfo } from './types';
17
- export type { AgentAction, AgentLoop, AgentMessage, AgentRunResult, AgentStreamEvent, StepUsage, ToolCallAction, ToolBatchAction, ToolCallInfo, ToolResultInfo };
14
+ import type { AgentAction, AgentLoop, AgentMessage, AgentRunResult, AgentStreamEvent, ToolCallAction, ToolBatchAction, ToolCallInfo, ToolResultInfo } from './types';
15
+ export type { AgentAction, AgentLoop, AgentMessage, AgentRunResult, AgentStreamEvent, ToolCallAction, ToolBatchAction, ToolCallInfo, ToolResultInfo };
18
16
  export { HookRunner } from '../hooks/hook-runner';
19
17
  export { PermissionManager } from '../permissions/permission-manager';
20
18
  export type { PermissionMode, PermissionResolver, PermissionRequest } from '../permissions/permission-manager';
@@ -39,6 +37,8 @@ export interface AgentRuntime {
39
37
  /** Custom tool executor. Called for every tool action. Return null to fall through to built-in dispatch.
40
38
  * When hookRunner/permissionManager are provided on the runtime, they are automatically applied before/after this callback — no manual wiring needed. */
41
39
  executeToolAction?: (action: ToolCallAction) => Promise<ToolResult | null>;
40
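+ /** Progress callback fired before/after each tool call during run(). For batched calls, every tool_start fires before the batch executes and each tool_end reports the whole batch's elapsed time as durationMs. */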
41
+ onToolProgress?: (event: { type: 'tool_start'; name: string; args: Record<string, unknown> } | { type: 'tool_end'; name: string; success: boolean; durationMs: number }) => void;
42
42
  }
43
43
 
44
44
  /**
@@ -220,12 +220,9 @@ function toStreamResult(r: ToolResult): { success: boolean; output: string; erro
220
220
  return base;
221
221
  }
222
222
 
223
- /** Format content string for LLM context. Uses modelOutput (compact summary) when available. */
223
+ /** Format a display-friendly content string for tool results (used in content field). */
224
224
  function formatToolResultContent(call: ToolCallAction, result: ToolResult): string {
225
- // Use modelOutput for LLM context when available keeps context compact
226
- const content = result.success
227
- ? (result.modelOutput ?? result.output)
228
- : `ERROR: ${result.error ?? 'unknown failure'}`;
225
+ const content = result.success ? result.output : `ERROR: ${result.error ?? 'unknown failure'}`;
229
226
  switch (call.name) {
230
227
  case 'Write':
231
228
  return `Write(${call.args.path}): ${result.success ? 'ok' : content}`;
@@ -520,11 +517,6 @@ export function createAgent(runtime: AgentRuntime) {
520
517
  ? { nextAction: runtime.nextAction }
521
518
  : new VercelAgentLoop());
522
519
 
523
- /** Read lastUsage from the loop if it's a VercelAgentLoop. */
524
- function getLoopUsage(): StepUsage | undefined {
525
- return loop instanceof VercelAgentLoop ? loop.lastUsage : undefined;
526
- }
527
-
528
520
  async function resolveSkillContext(prompt: string): Promise<string> {
529
521
  if (!skillManager || !skillIndexPath) return '';
530
522
 
@@ -606,14 +598,18 @@ export function createAgent(runtime: AgentRuntime) {
606
598
 
607
599
  // Execute valid calls via batch (sequential sandbox ops) or parallel fallback
608
600
  if (validCalls.length > 0) {
601
+ for (const c of validCalls) runtime.onToolProgress?.({ type: 'tool_start', name: c.name, args: c.args });
602
+ const batchStart = Date.now();
609
603
  const results = await executeBatch(validCalls, runtime.toolProvider, runtime);
604
+ const batchMs = Date.now() - batchStart;
610
605
  for (let i = 0; i < validCalls.length; i++) {
611
606
  const call = validCalls[i]!;
612
607
  const r = results[i]!;
608
+ runtime.onToolProgress?.({ type: 'tool_end', name: call.name, success: r.success, durationMs: batchMs });
613
609
  if (!r.success) {
614
610
  recordAgentError(runtime.telemetry);
615
611
  }
616
- const resultText = r.success ? (r.modelOutput ?? r.output) : `ERROR: ${r.error ?? 'unknown failure'}`;
612
+ const resultText = r.success ? r.output : `ERROR: ${r.error ?? 'unknown failure'}`;
617
613
  messages.push({
618
614
  role: 'tool',
619
615
  content: formatToolResultContent(call, r),
@@ -669,6 +665,8 @@ export function createAgent(runtime: AgentRuntime) {
669
665
  } else {
670
666
  consecutiveInvalid = 0;
671
667
  }
668
+ runtime.onToolProgress?.({ type: 'tool_start', name: action.name, args: action.args });
669
+ const singleStart = Date.now();
672
670
  const result = validationError
673
671
  ? ({ success: false, output: '', error: validationError } as ToolResult)
674
672
  : await executor.run(async () => {
@@ -682,10 +680,11 @@ export function createAgent(runtime: AgentRuntime) {
682
680
  };
683
681
  }
684
682
  });
683
+ runtime.onToolProgress?.({ type: 'tool_end', name: action.name, success: result.success, durationMs: Date.now() - singleStart });
685
684
  if (!result.success) {
686
685
  recordAgentError(runtime.telemetry);
687
686
  }
688
- const singleResultText = result.success ? (result.modelOutput ?? result.output) : `ERROR: ${result.error ?? 'unknown failure'}`;
687
+ const singleResultText = result.success ? result.output : `ERROR: ${result.error ?? 'unknown failure'}`;
689
688
  messages.push({
690
689
  role: 'tool',
691
690
  content: formatToolResultContent(action, result),
@@ -728,8 +727,7 @@ export function createAgent(runtime: AgentRuntime) {
728
727
  if (event.type === 'text_delta') {
729
728
  finalText += event.text;
730
729
  yield event;
731
- }
732
- if (event.type === 'tool_start') {
730
+ } else if (event.type === 'tool_start') {
733
731
  pendingTools.push({
734
732
  type: 'tool',
735
733
  name: event.name,
@@ -737,13 +735,18 @@ export function createAgent(runtime: AgentRuntime) {
737
735
  ...(event.toolCallId != null ? { toolCallId: event.toolCallId } : {}),
738
736
  });
739
737
  yield event;
738
+ } else {
739
+ // Forward all other events (tool_end, step_start, step_end, done)
740
+ // from self-managing loops like ArcLoop
741
+ yield event;
742
+ if (event.type === 'done') return;
740
743
  }
741
744
  }
742
745
 
743
746
  // If no tools → final response
744
747
  if (pendingTools.length === 0) {
745
748
  messages.push({ role: 'assistant', content: finalText });
746
- { const u = getLoopUsage(); yield u ? { type: 'step_end' as const, step, usage: u } : { type: 'step_end' as const, step }; }
749
+ yield { type: 'step_end', step };
747
750
  yield { type: 'done', output: finalText, steps: step };
748
751
  return;
749
752
  }
@@ -769,7 +772,7 @@ export function createAgent(runtime: AgentRuntime) {
769
772
  if (action.type === 'final') {
770
773
  yield { type: 'text_delta', text: action.content };
771
774
  messages.push({ role: 'assistant', content: action.content });
772
- { const u = getLoopUsage(); yield u ? { type: 'step_end' as const, step, usage: u } : { type: 'step_end' as const, step }; }
775
+ yield { type: 'step_end', step };
773
776
  yield { type: 'done', output: action.content, steps: step };
774
777
  return;
775
778
  }
@@ -781,7 +784,7 @@ export function createAgent(runtime: AgentRuntime) {
781
784
  try {
782
785
  const r = await executeTool(runtime.toolProvider, call, runtime);
783
786
  yield { type: 'tool_end', name: call.name, result: toStreamResult(r) };
784
- const rText = r.success ? (r.modelOutput ?? r.output) : `ERROR: ${r.error ?? 'unknown failure'}`;
787
+ const rText = r.success ? r.output : `ERROR: ${r.error ?? 'unknown failure'}`;
785
788
  messages.push({
786
789
  role: 'tool',
787
790
  content: formatToolResultContent(call, r),
@@ -803,7 +806,7 @@ export function createAgent(runtime: AgentRuntime) {
803
806
  try {
804
807
  const r = await executeTool(runtime.toolProvider, action, runtime);
805
808
  yield { type: 'tool_end', name: action.name, result: toStreamResult(r) };
806
- const rText = r.success ? (r.modelOutput ?? r.output) : `ERROR: ${r.error ?? 'unknown failure'}`;
809
+ const rText = r.success ? r.output : `ERROR: ${r.error ?? 'unknown failure'}`;
807
810
  messages.push({
808
811
  role: 'tool',
809
812
  content: formatToolResultContent(action, r),
@@ -819,7 +822,7 @@ export function createAgent(runtime: AgentRuntime) {
819
822
  });
820
823
  }
821
824
  }
822
- { const u = getLoopUsage(); yield u ? { type: 'step_end' as const, step, usage: u } : { type: 'step_end' as const, step }; }
825
+ yield { type: 'step_end', step };
823
826
  continue;
824
827
  }
825
828
 
@@ -829,7 +832,7 @@ export function createAgent(runtime: AgentRuntime) {
829
832
  const call = pendingTools[i]!;
830
833
  const r = results[i]!;
831
834
  yield { type: 'tool_end', name: call.name, result: toStreamResult(r) };
832
- const rText = r.success ? (r.modelOutput ?? r.output) : `ERROR: ${r.error ?? 'unknown failure'}`;
835
+ const rText = r.success ? r.output : `ERROR: ${r.error ?? 'unknown failure'}`;
833
836
  messages.push({
834
837
  role: 'tool',
835
838
  content: formatToolResultContent(call, r),
@@ -847,7 +850,7 @@ export function createAgent(runtime: AgentRuntime) {
847
850
 
848
851
  if (action.type === 'final') {
849
852
  messages.push({ role: 'assistant', content: action.content });
850
- { const u = getLoopUsage(); yield u ? { type: 'step_end' as const, step, usage: u } : { type: 'step_end' as const, step }; }
853
+ yield { type: 'step_end', step };
851
854
  yield { type: 'done', output: action.content, steps: step };
852
855
  return;
853
856
  }
@@ -871,7 +874,7 @@ export function createAgent(runtime: AgentRuntime) {
871
874
  const call = calls[i]!;
872
875
  const r = results[i]!;
873
876
  yield { type: 'tool_end', name: call.name, result: toStreamResult(r) };
874
- const rText = r.success ? (r.modelOutput ?? r.output) : `ERROR: ${r.error ?? 'unknown failure'}`;
877
+ const rText = r.success ? r.output : `ERROR: ${r.error ?? 'unknown failure'}`;
875
878
  messages.push({
876
879
  role: 'tool',
877
880
  content: formatToolResultContent(call, r),
@@ -885,7 +888,7 @@ export function createAgent(runtime: AgentRuntime) {
885
888
  }
886
889
  }
887
890
 
888
- { const u = getLoopUsage(); yield u ? { type: 'step_end' as const, step, usage: u } : { type: 'step_end' as const, step }; }
891
+ yield { type: 'step_end', step };
889
892
  }
890
893
 
891
894
  yield { type: 'done', output: 'ERROR: max steps exceeded', steps: maxSteps };