@bluecopa/harness 0.1.0-snapshot.7 → 0.1.0-snapshot.70

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (83)
  1. package/README.md +208 -148
  2. package/package.json +1 -1
  3. package/src/agent/create-agent.ts +49 -17
  4. package/src/agent/types.ts +27 -2
  5. package/src/arc/agent-runner.ts +994 -0
  6. package/src/arc/arc-loop.ts +803 -306
  7. package/src/arc/arc-types.ts +5 -99
  8. package/src/arc/consolidation.ts +22 -4
  9. package/src/arc/context-window.ts +267 -0
  10. package/src/arc/create-arc-agent.ts +78 -59
  11. package/src/arc/create-org-arc-agent.ts +60 -0
  12. package/src/arc/episode-compressor.ts +140 -67
  13. package/src/arc/memory-manager.ts +245 -0
  14. package/src/arc/message-convert.ts +123 -0
  15. package/src/arc/multi-model.ts +70 -0
  16. package/src/arc/org-arc-loop.ts +343 -0
  17. package/src/arc/org-arc-runner.ts +287 -0
  18. package/src/arc/org-types.ts +104 -0
  19. package/src/arc/profile-builder.ts +173 -0
  20. package/src/arc/resilience/bulkhead.ts +110 -0
  21. package/src/arc/resilience/circuit-breaker.ts +112 -0
  22. package/src/arc/resilience/fallback.ts +27 -0
  23. package/src/arc/resilience/index.ts +21 -0
  24. package/src/arc/resilience/pipeline.ts +103 -0
  25. package/src/arc/resilience/retry.ts +90 -0
  26. package/src/arc/resilience/timeout.ts +60 -0
  27. package/src/arc/resilience/types.ts +71 -0
  28. package/src/arc/result-pager.ts +77 -0
  29. package/src/arc/sig.ts +115 -0
  30. package/src/arc/skill-resolver.ts +109 -0
  31. package/src/arc/stores/rxdb-setup.ts +1 -0
  32. package/src/arc/tools.ts +67 -0
  33. package/src/arc/types.ts +370 -0
  34. package/src/arc/utils.ts +37 -0
  35. package/src/hooks/middleware.ts +95 -0
  36. package/src/interfaces/hooks.ts +7 -1
  37. package/src/interfaces/tool-provider.ts +2 -0
  38. package/src/loop/vercel-agent-loop.ts +122 -19
  39. package/src/skills/skill-router.ts +12 -6
  40. package/testing/index.ts +22 -0
  41. package/testing/scenario-replay.ts +209 -0
  42. package/testing/scenario-types.ts +38 -0
  43. package/testing/scripted-llm.ts +230 -0
  44. package/tests/arc/channel.test.ts +170 -0
  45. package/tests/arc/context-window.test.ts +396 -0
  46. package/tests/arc/e2e.test.ts +353 -0
  47. package/tests/arc/error-paths.test.ts +402 -0
  48. package/tests/arc/live-integration.test.ts +357 -0
  49. package/tests/arc/memory-manager.test.ts +384 -0
  50. package/tests/arc/middleware.test.ts +113 -0
  51. package/tests/arc/org-arc-loop.test.ts +138 -0
  52. package/tests/arc/process-interleaving.test.ts +432 -0
  53. package/tests/arc/process-profiles.test.ts +366 -0
  54. package/tests/arc/resilience-integration.test.ts +381 -0
  55. package/tests/arc/resilience.test.ts +575 -0
  56. package/tests/arc/result-paging.test.ts +392 -0
  57. package/tests/arc/scenario-driven.test.ts +297 -0
  58. package/tests/arc/tool-dispatch.test.ts +340 -0
  59. package/tests/arc/wasm-pbt.test.ts +104 -0
  60. package/verify/Cargo.lock +637 -0
  61. package/verify/Cargo.toml +24 -0
  62. package/verify/src/lib.rs +5 -0
  63. package/verify/src/main.rs +165 -0
  64. package/verify/src/model/context.rs +100 -0
  65. package/verify/src/model/mod.rs +6 -0
  66. package/verify/src/model/orchestrator.rs +371 -0
  67. package/verify/src/model/process.rs +140 -0
  68. package/verify/src/model/types.rs +273 -0
  69. package/verify/src/properties/liveness.rs +32 -0
  70. package/verify/src/properties/mod.rs +4 -0
  71. package/verify/src/properties/safety.rs +78 -0
  72. package/verify/src/trace/event.rs +155 -0
  73. package/verify/src/trace/mod.rs +2 -0
  74. package/verify/src/trace/validator.rs +367 -0
  75. package/verify/src/wasm/mod.rs +3 -0
  76. package/verify/src/wasm/scenario_generator.rs +400 -0
  77. package/verify/src/wasm/types.rs +104 -0
  78. package/verify/src/wasm/wasm_validator.rs +107 -0
  79. package/verify/tests/model_check.rs +49 -0
  80. package/verify/tests/trace_validation.rs +147 -0
  81. package/vitest.config.ts +1 -1
  82. package/src/arc/thread-executor.ts +0 -354
  83. package/src/arc/thread-tool.ts +0 -26
package/README.md CHANGED
@@ -2,9 +2,17 @@
2
2
 
3
3
  Provider-agnostic TypeScript agent framework with Claude-code-compatible tool semantics.
4
4
 
5
- The harness provides the core loop that drives an AI agent: send messages to an LLM, execute the tool calls it returns, feed results back, and repeat until the LLM produces a final text response.
5
+ Published on npm as **`@bluecopa/harness`**.
6
6
 
7
- ## Quickstart
7
+ Two execution modes: a simple single-agent loop (`createAgent` + `VercelAgentLoop`) and a process-based orchestrator (`ArcLoop`) that dispatches parallel processes with context management, memory, and resilience.
8
+
9
+ ## Install
10
+
11
+ ```bash
12
+ pnpm add @bluecopa/harness
13
+ ```
14
+
15
+ ## Development
8
16
 
9
17
  ```bash
10
18
  pnpm install
@@ -13,9 +21,11 @@ pnpm test
13
21
 
14
22
  ## Architecture
15
23
 
24
+ ### Single-Agent Loop
25
+
16
26
  ```
17
27
  ┌──────────────┐ ┌──────────────┐ ┌──────────────────┐
18
- │ createAgent │────▶│ AgentLoop │────▶│ LLM (Claude) │
28
+ │ createAgent │────►│ AgentLoop │────►│ LLM (Claude) │
19
29
  │ (turn loop) │ │ (nextAction)│ │ │
20
30
  └──────┬───────┘ └──────────────┘ └──────────────────┘
21
31
  │ │
@@ -27,20 +37,82 @@ pnpm test
27
37
  └──────────────┘
28
38
  ```
29
39
 
30
- 1. `createAgent` drives a deterministic step loop
31
- 2. Each step calls `loop.nextAction(messages)` to get the LLM's decision
32
- 3. If it's a tool call, the harness executes it via `ToolProvider` and appends the result
33
- 4. If it's a final action, the loop ends and returns the result
40
+ ### ArcLoop Orchestrator
34
41
 
35
- ## Using with the sandbox
42
+ ```
43
+ Orchestrator (ArcLoop — Opus 4.6 by default)
44
+ │ tools: Thread, Check, Cancel, Remember, ReadEpisode
45
+
46
+ │ Turn 1 (parallel):
47
+ ├──► Process 0 ("read auth", model=fast) ─┐
48
+ ├──► Process 1 ("read routes", model=fast) ─┼──► Episodes
49
+ ├──► Process 2 ("read tests", model=fast) ─┘
50
+
51
+ │ Turn 2 (dispatch dependent work):
52
+ ├──► Thread("fix bug", context=[ep0,ep1,ep2]) ──► Episode
53
+
54
+ │ Turn 3 (parallel):
55
+ ├──► Thread("run tests", context=[ep3]) ─┐
56
+ ├──► Thread("update docs", context=[ep3]) ─┘
57
+
58
+ └──► Final text response
59
+ ```
36
60
 
37
- The most common setup connects the harness to a running sandbox service via `ControlPlaneE2BExecutor`:
61
+ Full architecture doc: [`docs/arc.md`](../docs/arc.md)
38
62
 
39
- ```ts
40
- import { createAgent } from './src/agent/create-agent';
41
- import { E2BToolProvider } from './src/providers/e2b-tool-provider';
63
+ ---
64
+
65
+ ## ToolProvider
66
+
67
+ The contract for tool execution. All agent modes use this interface.
68
+
69
+ ```typescript
70
+ interface ToolProvider {
71
+ bash(command: string, options?: BashOptions): Promise<ToolResult>;
72
+ readFile(path: string, options?: ReadOptions): Promise<ToolResult>;
73
+ writeFile(path: string, content: string): Promise<ToolResult>;
74
+ editFile(path: string, oldText: string, newText: string): Promise<ToolResult>;
75
+ glob(pattern: string, options?: GlobOptions): Promise<ToolResult>;
76
+ grep(pattern: string, path?: string, options?: GrepOptions): Promise<ToolResult>;
77
+ webFetch?(options: WebFetchOptions): Promise<ToolResult>;
78
+ webSearch?(query: string): Promise<ToolResult>;
79
+ capabilities(): ToolProviderCapabilities;
80
+ }
81
+
82
+ interface ToolResult {
83
+ success: boolean;
84
+ output: string;
85
+ error?: string;
86
+ }
87
+ ```
88
+
89
+ Built-in implementations:
90
+
91
+ | Provider | Description |
92
+ |----------|-------------|
93
+ | `LocalToolProvider` | Runs tools on the local filesystem |
94
+ | `E2BToolProvider` | Routes tools to a sandbox VM via `ControlPlaneE2BExecutor` |
95
+ | `CompositeToolProvider` | Combines multiple providers (e.g. local filesystem + sandbox bash) |
96
+
97
+ ## SandboxProvider
98
+
99
+ Higher-level sandbox operations beyond basic tool calls:
100
+
101
+ ```typescript
102
+ interface SandboxProvider {
103
+ exec(command: string, options?: SandboxExecOptions): Promise<SandboxExecResult>;
104
+ readSandboxFile(path: string): Promise<SandboxFileBlob>;
105
+ writeSandboxFile(path: string, content: SandboxFileBlob): Promise<void>;
106
+ }
107
+ ```
108
+
109
+ Used by `SkillManager` for executing skill scripts in isolated VMs.
110
+
111
+ ## Connecting to a Sandbox
112
+
113
+ ```typescript
42
114
  import { ControlPlaneE2BExecutor } from './src/providers/control-plane-e2b-executor';
43
- import { VercelAgentLoop } from './src/loop/vercel-agent-loop';
115
+ import { E2BToolProvider } from './src/providers/e2b-tool-provider';
44
116
 
45
117
  // Connect to sandbox service
46
118
  const executor = new ControlPlaneE2BExecutor({
@@ -50,187 +122,172 @@ const executor = new ControlPlaneE2BExecutor({
50
122
  });
51
123
  await executor.initialize(); // creates a Firecracker VM
52
124
 
53
- // Build and run the agent
54
- const agent = createAgent({
55
- toolProvider: new E2BToolProvider(executor),
56
- loop: new VercelAgentLoop(), // needs ANTHROPIC_API_KEY
57
- });
125
+ const toolProvider = new E2BToolProvider(executor);
58
126
 
59
- const result = await agent.run('create a bar chart of sales data');
60
- console.log(result.output); // LLM's final response
61
- console.log(result.steps); // number of tool steps
127
+ // ... use with createAgent or ArcLoop
62
128
 
63
- await executor.destroy(); // tears down the VM
129
+ await executor.destroy(); // tears down the VM
64
130
  ```
65
131
 
66
- For a complete working example, see [`examples/chat-assistant/src/chat.ts`](../examples/chat-assistant/src/chat.ts).
67
-
68
- ### From environment variables
132
+ From environment variables: `ControlPlaneE2BExecutor.fromEnv()` reads `SAMYX_BASE_URL` and `SAMYX_API_KEY`.
69
133
 
70
- `ControlPlaneE2BExecutor.fromEnv()` reads `SAMYX_BASE_URL` and `SAMYX_API_KEY` automatically:
134
+ ---
71
135
 
72
- ```ts
73
- const executor = ControlPlaneE2BExecutor.fromEnv();
74
- ```
75
-
76
- ## Using locally (no sandbox)
136
+ ## Single-Agent Mode (`createAgent`)
77
137
 
78
- For development without a sandbox service, use `LocalToolProvider` which runs tools on the local machine:
138
+ For simple tasks that don't need orchestration:
79
139
 
80
- ```ts
140
+ ```typescript
81
141
  import { createAgent } from './src/agent/create-agent';
82
142
  import { LocalToolProvider } from './src/providers/local-tool-provider';
83
143
 
84
144
  const agent = createAgent({
85
145
  toolProvider: new LocalToolProvider(process.cwd()),
86
- loop: new VercelAgentLoop(),
146
+ loop: new VercelAgentLoop(), // needs ANTHROPIC_API_KEY
87
147
  });
88
148
 
89
149
  const result = await agent.run('list all TypeScript files');
150
+ console.log(result.output);
90
151
  ```
91
152
 
92
- ## Key modules
93
-
94
- ### Agent creation (`src/agent/create-agent.ts`)
153
+ ### Configuration
95
154
 
96
- `createAgent(options)` returns an agent with a `.run(prompt, options?)` method. Options:
155
+ | Option | Type | Default | Description |
156
+ |--------|------|---------|-------------|
157
+ | `toolProvider` | `ToolProvider` | required | Executes tool calls |
158
+ | `loop` | `AgentLoop` | `VercelAgentLoop` | LLM decision loop |
159
+ | `sandboxProvider` | `SandboxProvider` | — | Higher-level sandbox operations |
160
+ | `maxSteps` | `number` | 30 | Max tool steps per run |
161
+ | `telemetry` | `HarnessTelemetry` | — | OpenTelemetry-style tracing |
162
+ | `skillIndexPath` | `string` | — | Path to skill index JSON for routing |
97
163
 
98
- | Option | Type | Description |
99
- |--------|------|-------------|
100
- | `toolProvider` | `ToolProvider` | Required. Executes tool calls |
101
- | `loop` | `AgentLoop` | LLM decision loop (default: `VercelAgentLoop`) |
102
- | `sandboxProvider` | `SandboxProvider` | Optional. Higher-level sandbox ops (file download, exec with env) |
103
- | `maxSteps` | `number` | Max tool steps per run (default: 30) |
104
- | `telemetry` | `HarnessTelemetry` | Optional. OpenTelemetry-style tracing |
105
- | `skillIndexPath` | `string` | Optional. Path to skill index JSON |
164
+ ### VercelAgentLoop
106
165
 
107
- ### Agent loop (`src/loop/vercel-agent-loop.ts`)
166
+ Calls Claude via the Vercel AI SDK. Supports parallel tool calls and configurable system prompt.
108
167
 
109
- `VercelAgentLoop` calls Claude via the Vercel AI SDK. It supports:
110
- - Parallel tool calls (returns `ToolBatchAction` when the LLM requests multiple tools at once)
111
- - Configurable system prompt
112
- - Model selection via `HARNESS_MODEL` env var (default: `claude-sonnet-4-5`)
113
-
114
- ```ts
168
+ ```typescript
115
169
  const loop = new VercelAgentLoop({
116
170
  systemPrompt: 'You are a helpful coding assistant.',
171
+ model: 'claude-sonnet-4-5', // or HARNESS_MODEL env var
117
172
  });
118
173
  ```
119
174
 
120
- ### Tool provider (`src/interfaces/tool-provider.ts`)
175
+ ### LCMToolLoop
121
176
 
122
- The contract for tool execution:
177
+ Wraps another loop to add Lossless Context Management and optional REPL orchestration:
123
178
 
124
- ```ts
125
- interface ToolProvider {
126
- bash(command: string, options?: BashOptions): Promise<ToolResult>;
127
- readFile(path: string, options?: ReadOptions): Promise<ToolResult>;
128
- writeFile(path: string, content: string): Promise<ToolResult>;
129
- editFile(path: string, oldText: string, newText: string): Promise<ToolResult>;
130
- glob(pattern: string, options?: GlobOptions): Promise<ToolResult>;
131
- grep(pattern: string, path?: string, options?: GrepOptions): Promise<ToolResult>;
132
- webFetch?(options: WebFetchOptions): Promise<ToolResult>;
133
- webSearch?(query: string): Promise<ToolResult>;
134
- capabilities(): ToolProviderCapabilities;
135
- }
179
+ ```typescript
180
+ import { LCMToolLoop } from './src/loop/lcm-tool-loop';
181
+ import { VercelAgentLoop } from './src/loop/vercel-agent-loop';
136
182
 
137
- interface ToolResult {
138
- success: boolean;
139
- output: string;
140
- error?: string;
141
- }
183
+ const loop = new LCMToolLoop({
184
+ innerLoop: new VercelAgentLoop(),
185
+ toolProvider: mySandboxProvider,
186
+ enableRepl: true, // default: true
187
+ bridgeDir: '/var/run/bridge',
188
+ onActivity: (entry) => console.log(entry),
189
+ onLlmRequest: async (prompt) => callLLM(prompt),
190
+ onWebFetchRequest: async (url) => fetch(url),
191
+ });
142
192
  ```
143
193
 
144
- Built-in implementations:
194
+ **Standard mode**: Lossless context trimming — the LLM always sees a coherent, budget-fitting view of the full conversation.
145
195
 
146
- | Provider | Description |
147
- |----------|-------------|
148
- | `LocalToolProvider` | Runs tools on the local filesystem |
149
- | `E2BToolProvider` | Routes tools to an E2B-compatible executor over HTTP |
150
- | `CompositeToolProvider` | Combines multiple providers (e.g. sandbox + web) |
196
+ **REPL mode**: When the LLM returns a Bash action with the REPL marker, the loop writes a Python script into the sandbox, injects the bridge module, runs the script, and polls for sub-requests (LLM, web_fetch, ask_user) that the harness fulfills.
151
197
 
152
- ### Action types (`src/agent/types.ts`)
198
+ ---
153
199
 
154
- The LLM returns one of these action types each turn:
200
+ ## ArcLoop (Orchestrator Mode)
155
201
 
156
- ```ts
157
- // Single tool call
158
- interface ToolCallAction {
159
- type: 'tool';
160
- name: 'Bash' | 'Read' | 'Write' | 'Edit' | 'Glob' | 'Grep' | ...;
161
- args: Record<string, unknown>;
162
- }
202
+ For complex tasks that benefit from parallel processes, context management, and memory:
163
203
 
164
- // Multiple independent tool calls (executed in parallel)
165
- interface ToolBatchAction {
166
- type: 'tool_batch';
167
- calls: ToolCallAction[];
168
- }
169
-
170
- // Final text response (ends the loop)
171
- interface FinalAction {
172
- type: 'final';
173
- content: string;
174
- }
175
- ```
176
-
177
- ### LCM tool loop (`src/loop/lcm-tool-loop.ts`)
178
-
179
- `LCMToolLoop` wraps another loop to add LCM-based tool routing, REPL script execution, and bridge-based tool dispatch. Used in the chat-assistant example.
180
-
181
- ### Sandbox provider (`src/interfaces/sandbox-provider.ts`)
204
+ ```typescript
205
+ import { createArcAgent } from './src/arc/create-arc-agent';
182
206
 
183
- Higher-level sandbox operations beyond basic tool calls:
207
+ const agent = await createArcAgent({
208
+ toolProvider: myToolProvider,
209
+ episodeStore: myEpisodeStore, // required
210
+ sessionMemoStore: mySessionMemoStore, // required
211
+ longTermStore: myLongTermStore, // required
212
+ taskId: 'task-1',
213
+ sessionId: 'session-1',
214
+ });
184
215
 
185
- ```ts
186
- interface SandboxProvider {
187
- exec(command: string, options?: SandboxExecOptions): Promise<SandboxExecResult>;
188
- readSandboxFile(path: string): Promise<SandboxFileBlob>;
189
- writeSandboxFile(path: string, content: SandboxFileBlob): Promise<void>;
216
+ // Streaming
217
+ for await (const event of agent.stream(messages, signal)) {
218
+ if (event.type === 'text_delta') process.stdout.write(event.text);
219
+ if (event.type === 'process_dispatched') console.log(` → ${event.action}`);
220
+ if (event.type === 'done') console.log(`Done in ${event.stats.durationMs}ms`);
190
221
  }
191
- ```
192
-
193
- ### Observability (`src/observability/otel.ts`)
194
-
195
- `HarnessTelemetry` provides OpenTelemetry-style spans and metrics for agent runs.
196
222
 
197
- ### Arc: Orchestrator + Thread Architecture (`src/arc/`)
223
+ // Non-streaming
224
+ const result = await agent.run(messages, signal);
225
+ ```
198
226
 
199
- `ArcLoop` is an `AgentLoop` implementation where an orchestrator LLM dispatches bounded threads via a single `Thread` tool. Threads produce episodes (summary + full trace). The orchestrator only sees summaries, keeping its context small.
227
+ ### ArcLoopConfig
228
+
229
+ | Option | Type | Default | Description |
230
+ |--------|------|---------|-------------|
231
+ | `model` | `string` | `'claude-opus-4-6'` | Orchestrator model (ID or tier name) |
232
+ | `modelMap` | `Record<ModelTier, string>` | haiku/sonnet/opus | Maps fast/medium/strong to model IDs |
233
+ | `apiKey` | `string` | — | Anthropic API key |
234
+ | `systemPrompt` | `string` | built-in | Custom orchestrator system prompt |
235
+ | `maxTurns` | `number` | 30 | Max orchestrator turns |
236
+ | `processTimeout` | `number` | 120_000 | Per-process timeout (ms) |
237
+ | `processMaxSteps` | `number` | 20 | Per-process max tool steps |
238
+ | `contextWindowSize` | `number` | 200_000 | Context window in tokens |
239
+ | `outputReserve` | `number` | 20_000 | Tokens reserved for output |
240
+ | `autoMemory` | `boolean` | true | Auto-detect patterns from episodes |
241
+ | `episodeStore` | `EpisodeStore` | required | Stores episode summaries + traces |
242
+ | `sessionMemoStore` | `SessionMemoStore` | required | Stores session memos |
243
+ | `longTermStore` | `LongTermStore` | required | Stores long-term memories |
244
+ | `taskId` | `string` | required | Task identifier |
245
+ | `sessionId` | `string` | required | Session identifier |
246
+ | `toolProvider` | `ToolProvider` | required | Tool execution |
247
+ | `processTools` | `Record<string, AnyTool>` | builtinTools | Tools available inside processes |
248
+ | `extraOrchestratorTools` | `Record<string, AnyTool>` | — | Custom orchestrator tools |
249
+ | `onOrchestratorTool` | `function` | — | Handler for custom orchestrator tools |
250
+ | `resilience` | `ResiliencePolicy` | — | Composable resilience pipeline |
251
+ | `traceWriter` | `function` | — | Callback for trace event emission |
252
+
253
+ ### Resilience
254
+
255
+ ```typescript
256
+ import { resilience } from './src/arc/resilience';
257
+
258
+ const pipeline = resilience()
259
+ .retry({ maxRetries: 2, baseDelay: 1000 })
260
+ .timeout({ durationMs: 30_000 })
261
+ .circuitBreaker({ failureThreshold: 5 })
262
+ .build();
263
+
264
+ const agent = await createArcAgent({
265
+ // ...config
266
+ resilience: pipeline,
267
+ });
268
+ ```
200
269
 
201
- ```ts
202
- import { createArcAgent } from './src/arc/create-arc-agent';
203
- import { InMemoryEpisodeStore } from './src/arc/stores/episode-store';
204
- import { InMemorySessionMemoStore } from './src/arc/stores/session-memo-store';
205
- import { InMemoryLongTermStore } from './src/arc/stores/long-term-store';
270
+ ### Trace Emission
206
271
 
207
- const agent = createArcAgent({
208
- toolProvider: new LocalToolProvider(process.cwd()),
209
- episodeStore: new InMemoryEpisodeStore(),
210
- sessionMemoStore: new InMemorySessionMemoStore(),
211
- longTermStore: new InMemoryLongTermStore(),
212
- taskId: 'task-1',
213
- sessionId: 'session-1',
272
+ ```typescript
273
+ const traces: TraceEvent[] = [];
274
+ const agent = await createArcAgent({
275
+ // ...config
276
+ traceWriter: (event) => traces.push(event),
214
277
  });
215
-
216
- const result = await agent.run('Fix the authentication bug');
217
278
  ```
218
279
 
219
- Key features:
220
- - **Parallel threads**: orchestrator calls Thread N times in one turn → all run concurrently
221
- - **Four-tier memory**: thread context → episodes → session memos → long-term
222
- - **Per-thread models**: Haiku for reads, Sonnet for implementation
223
- - **Template compression**: zero-LLM-call episode summaries
224
- - **Async consolidation**: non-blocking background distillation
280
+ Traces can be validated against the formal model: `cd verify && cargo run -- trace file.ndjson`
225
281
 
226
- Full architecture doc: [`docs/arc.md`](../docs/arc.md)
282
+ ---
227
283
 
228
- ## Package layout
284
+ ## Package Layout
229
285
 
230
286
  ```
231
287
  src/
232
288
  ├── agent/ # createAgent, step executor, types
233
- ├── arc/ # ArcLoop orchestrator, threads, memory hierarchy
289
+ ├── arc/ # ArcLoop orchestrator, processes, memory, resilience
290
+ │ ├── resilience/ # Retry, circuit breaker, timeout, bulkhead, fallback
234
291
  │ ├── stores/ # RxDB + in-memory store implementations
235
292
  │ └── object-store/ # Pluggable cloud sync (fs, memory)
236
293
  ├── interfaces/ # ToolProvider, SandboxProvider, AgentLoop contracts
@@ -240,17 +297,20 @@ src/
240
297
  ├── hooks/ # Pre/post tool call hooks
241
298
  ├── permissions/ # Tool permission checks
242
299
  ├── sessions/ # Session persistence
243
- ├── subagents/ # Subagent spawning and task tools
300
+ ├── subagents/ # Subagent spawning
244
301
  ├── skills/ # Skill index, routing, and management
245
302
  ├── optimization/ # Benchmark runner
246
303
  └── observability/ # OpenTelemetry integration
304
+
305
+ verify/ # Rust formal verification (Stateright model checker)
306
+ testing/ # Adversarial scenario replay harness
307
+ tests/ # Vitest test suite
247
308
  ```
248
309
 
249
310
  ## Documentation
250
311
 
251
- - **Arc architecture**: [`docs/arc.md`](../docs/arc.md)
252
- - Provider guide: `docs/guides/providers.md`
253
- - Skills guide: `docs/guides/skills.md`
254
- - Observability guide: `docs/guides/observability.md`
255
- - Release process: `../docs/RELEASE.md`
256
- - Full example: [`../examples/chat-assistant/src/chat.ts`](../examples/chat-assistant/src/chat.ts)
312
+ - [Arc architecture](../docs/arc.md) — process model, context window, memory, resilience, verification
313
+ - [Testing](../docs/testing.md) — test layers, running tests, writing new tests
314
+ - [Sandbox setup](../docs/PUBLIC_SANDBOX.md) — deploying the sandbox service
315
+ - [Release process](../docs/RELEASE.md) — versioning and publishing
316
+ - [Example](../examples/chat-assistant/src/chat.ts) — complete working chat assistant
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@bluecopa/harness",
3
- "version": "0.1.0-snapshot.7",
3
+ "version": "0.1.0-snapshot.70",
4
4
  "description": "Provider-agnostic TypeScript agent framework",
5
5
  "license": "UNLICENSED",
6
6
  "scripts": {
package/src/agent/create-agent.ts CHANGED
@@ -7,12 +7,14 @@ import type { HarnessTelemetry } from '../observability/otel';
7
7
  import { HookRunner } from '../hooks/hook-runner';
8
8
  import { PermissionManager } from '../permissions/permission-manager';
9
9
  import { VercelAgentLoop } from '../loop/vercel-agent-loop';
10
+ export type { SystemPromptBlock, VercelAgentLoopConfig } from '../loop/vercel-agent-loop';
11
+ export type { PrepareStepContext, PrepareStepResult } from './types';
10
12
  import { SkillManager } from '../skills/skill-manager';
11
13
  import { SkillRouter } from '../skills/skill-router';
12
14
  import type { SkillSummary } from '../skills/skill-types';
13
15
  import { SingleFlightStepExecutor } from './step-executor';
14
- import type { AgentAction, AgentLoop, AgentMessage, AgentRunResult, AgentStreamEvent, ToolCallAction, ToolBatchAction, ToolCallInfo, ToolResultInfo } from './types';
15
- export type { AgentAction, AgentLoop, AgentMessage, AgentRunResult, AgentStreamEvent, ToolCallAction, ToolBatchAction, ToolCallInfo, ToolResultInfo };
16
+ import type { AgentAction, AgentLoop, AgentMessage, AgentRunResult, AgentStreamEvent, StepUsage, ToolCallAction, ToolBatchAction, ToolCallInfo, ToolResultInfo } from './types';
17
+ export type { AgentAction, AgentLoop, AgentMessage, AgentRunResult, AgentStreamEvent, StepUsage, ToolCallAction, ToolBatchAction, ToolCallInfo, ToolResultInfo };
16
18
  export { HookRunner } from '../hooks/hook-runner';
17
19
  export { PermissionManager } from '../permissions/permission-manager';
18
20
  export type { PermissionMode, PermissionResolver, PermissionRequest } from '../permissions/permission-manager';
@@ -37,6 +39,8 @@ export interface AgentRuntime {
37
39
  /** Custom tool executor. Called for every tool action. Return null to fall through to built-in dispatch.
38
40
  * When hookRunner/permissionManager are provided on the runtime, they are automatically applied before/after this callback — no manual wiring needed. */
39
41
  executeToolAction?: (action: ToolCallAction) => Promise<ToolResult | null>;
42
+ /** Progress callback fired before/after each tool call during run(). */
43
+ onToolProgress?: (event: { type: 'tool_start'; name: string; args: Record<string, unknown> } | { type: 'tool_end'; name: string; success: boolean; durationMs: number }) => void;
40
44
  }
41
45
 
42
46
  /**
@@ -218,9 +222,21 @@ function toStreamResult(r: ToolResult): { success: boolean; output: string; erro
218
222
  return base;
219
223
  }
220
224
 
221
- /** Format a display-friendly content string for tool results (used in content field). */
225
+ /** Build the text the LLM sees for a tool result.
226
+ * Success: prefer modelOutput (compact) over raw output.
227
+ * Failure: prefer modelOutput (structured fix guidance) → error → output → generic fallback.
228
+ * This ensures custom tools can feed actionable error feedback to the model via modelOutput
229
+ * so the agent can self-correct instead of stopping with "unknown failure". */
230
+ function resultTextForLLM(result: ToolResult): string {
231
+ if (result.success) return result.modelOutput ?? result.output;
232
+ return result.modelOutput ?? result.error ?? result.output ?? 'unknown failure';
233
+ }
234
+
235
+ /** Format content string for LLM context. Uses modelOutput (compact summary) when available. */
222
236
  function formatToolResultContent(call: ToolCallAction, result: ToolResult): string {
223
- const content = result.success ? result.output : `ERROR: ${result.error ?? 'unknown failure'}`;
237
+ const content = result.success
238
+ ? resultTextForLLM(result)
239
+ : `ERROR: ${resultTextForLLM(result)}`;
224
240
  switch (call.name) {
225
241
  case 'Write':
226
242
  return `Write(${call.args.path}): ${result.success ? 'ok' : content}`;
@@ -515,6 +531,11 @@ export function createAgent(runtime: AgentRuntime) {
515
531
  ? { nextAction: runtime.nextAction }
516
532
  : new VercelAgentLoop());
517
533
 
534
+ /** Read lastUsage from the loop if it's a VercelAgentLoop. */
535
+ function getLoopUsage(): StepUsage | undefined {
536
+ return loop instanceof VercelAgentLoop ? loop.lastUsage : undefined;
537
+ }
538
+
518
539
  async function resolveSkillContext(prompt: string): Promise<string> {
519
540
  if (!skillManager || !skillIndexPath) return '';
520
541
 
@@ -596,14 +617,18 @@ export function createAgent(runtime: AgentRuntime) {
596
617
 
597
618
  // Execute valid calls via batch (sequential sandbox ops) or parallel fallback
598
619
  if (validCalls.length > 0) {
620
+ for (const c of validCalls) runtime.onToolProgress?.({ type: 'tool_start', name: c.name, args: c.args });
621
+ const batchStart = Date.now();
599
622
  const results = await executeBatch(validCalls, runtime.toolProvider, runtime);
623
+ const batchMs = Date.now() - batchStart;
600
624
  for (let i = 0; i < validCalls.length; i++) {
601
625
  const call = validCalls[i]!;
602
626
  const r = results[i]!;
627
+ runtime.onToolProgress?.({ type: 'tool_end', name: call.name, success: r.success, durationMs: batchMs });
603
628
  if (!r.success) {
604
629
  recordAgentError(runtime.telemetry);
605
630
  }
606
- const resultText = r.success ? r.output : `ERROR: ${r.error ?? 'unknown failure'}`;
631
+ const resultText = r.success ? resultTextForLLM(r) : `ERROR: ${resultTextForLLM(r)}`;
607
632
  messages.push({
608
633
  role: 'tool',
609
634
  content: formatToolResultContent(call, r),
@@ -659,6 +684,8 @@ export function createAgent(runtime: AgentRuntime) {
659
684
  } else {
660
685
  consecutiveInvalid = 0;
661
686
  }
687
+ runtime.onToolProgress?.({ type: 'tool_start', name: action.name, args: action.args });
688
+ const singleStart = Date.now();
662
689
  const result = validationError
663
690
  ? ({ success: false, output: '', error: validationError } as ToolResult)
664
691
  : await executor.run(async () => {
@@ -672,10 +699,11 @@ export function createAgent(runtime: AgentRuntime) {
672
699
  };
673
700
  }
674
701
  });
702
+ runtime.onToolProgress?.({ type: 'tool_end', name: action.name, success: result.success, durationMs: Date.now() - singleStart });
675
703
  if (!result.success) {
676
704
  recordAgentError(runtime.telemetry);
677
705
  }
678
- const singleResultText = result.success ? result.output : `ERROR: ${result.error ?? 'unknown failure'}`;
706
+ const singleResultText = result.success ? resultTextForLLM(result) : `ERROR: ${resultTextForLLM(result)}`;
679
707
  messages.push({
680
708
  role: 'tool',
681
709
  content: formatToolResultContent(action, result),
@@ -718,8 +746,7 @@ export function createAgent(runtime: AgentRuntime) {
718
746
  if (event.type === 'text_delta') {
719
747
  finalText += event.text;
720
748
  yield event;
721
- }
722
- if (event.type === 'tool_start') {
749
+ } else if (event.type === 'tool_start') {
723
750
  pendingTools.push({
724
751
  type: 'tool',
725
752
  name: event.name,
@@ -727,13 +754,18 @@ export function createAgent(runtime: AgentRuntime) {
727
754
  ...(event.toolCallId != null ? { toolCallId: event.toolCallId } : {}),
728
755
  });
729
756
  yield event;
757
+ } else {
758
+ // Forward all other events (tool_end, step_start, step_end, done)
759
+ // from self-managing loops like ArcLoop
760
+ yield event;
761
+ if (event.type === 'done') return;
730
762
  }
731
763
  }
732
764
 
733
765
  // If no tools → final response
734
766
  if (pendingTools.length === 0) {
735
767
  messages.push({ role: 'assistant', content: finalText });
736
- yield { type: 'step_end', step };
768
+ { const u = getLoopUsage(); yield u ? { type: 'step_end' as const, step, usage: u } : { type: 'step_end' as const, step }; }
737
769
  yield { type: 'done', output: finalText, steps: step };
738
770
  return;
739
771
  }
@@ -759,7 +791,7 @@ export function createAgent(runtime: AgentRuntime) {
759
791
  if (action.type === 'final') {
760
792
  yield { type: 'text_delta', text: action.content };
761
793
  messages.push({ role: 'assistant', content: action.content });
762
- yield { type: 'step_end', step };
794
+ { const u = getLoopUsage(); yield u ? { type: 'step_end' as const, step, usage: u } : { type: 'step_end' as const, step }; }
763
795
  yield { type: 'done', output: action.content, steps: step };
764
796
  return;
765
797
  }
@@ -771,7 +803,7 @@ export function createAgent(runtime: AgentRuntime) {
771
803
  try {
772
804
  const r = await executeTool(runtime.toolProvider, call, runtime);
773
805
  yield { type: 'tool_end', name: call.name, result: toStreamResult(r) };
774
- const rText = r.success ? r.output : `ERROR: ${r.error ?? 'unknown failure'}`;
806
+ const rText = r.success ? resultTextForLLM(r) : `ERROR: ${resultTextForLLM(r)}`;
775
807
  messages.push({
776
808
  role: 'tool',
777
809
  content: formatToolResultContent(call, r),
@@ -793,7 +825,7 @@ export function createAgent(runtime: AgentRuntime) {
793
825
  try {
794
826
  const r = await executeTool(runtime.toolProvider, action, runtime);
795
827
  yield { type: 'tool_end', name: action.name, result: toStreamResult(r) };
796
- const rText = r.success ? r.output : `ERROR: ${r.error ?? 'unknown failure'}`;
828
+ const rText = r.success ? resultTextForLLM(r) : `ERROR: ${resultTextForLLM(r)}`;
797
829
  messages.push({
798
830
  role: 'tool',
799
831
  content: formatToolResultContent(action, r),
@@ -809,7 +841,7 @@ export function createAgent(runtime: AgentRuntime) {
809
841
  });
810
842
  }
811
843
  }
812
- yield { type: 'step_end', step };
844
+ { const u = getLoopUsage(); yield u ? { type: 'step_end' as const, step, usage: u } : { type: 'step_end' as const, step }; }
813
845
  continue;
814
846
  }
815
847
 
@@ -819,7 +851,7 @@ export function createAgent(runtime: AgentRuntime) {
819
851
  const call = pendingTools[i]!;
820
852
  const r = results[i]!;
821
853
  yield { type: 'tool_end', name: call.name, result: toStreamResult(r) };
822
- const rText = r.success ? r.output : `ERROR: ${r.error ?? 'unknown failure'}`;
854
+ const rText = r.success ? resultTextForLLM(r) : `ERROR: ${resultTextForLLM(r)}`;
823
855
  messages.push({
824
856
  role: 'tool',
825
857
  content: formatToolResultContent(call, r),
@@ -837,7 +869,7 @@ export function createAgent(runtime: AgentRuntime) {
837
869
 
838
870
  if (action.type === 'final') {
839
871
  messages.push({ role: 'assistant', content: action.content });
840
- yield { type: 'step_end', step };
872
+ { const u = getLoopUsage(); yield u ? { type: 'step_end' as const, step, usage: u } : { type: 'step_end' as const, step }; }
841
873
  yield { type: 'done', output: action.content, steps: step };
842
874
  return;
843
875
  }
@@ -861,7 +893,7 @@ export function createAgent(runtime: AgentRuntime) {
861
893
  const call = calls[i]!;
862
894
  const r = results[i]!;
863
895
  yield { type: 'tool_end', name: call.name, result: toStreamResult(r) };
864
- const rText = r.success ? r.output : `ERROR: ${r.error ?? 'unknown failure'}`;
896
+ const rText = r.success ? resultTextForLLM(r) : `ERROR: ${resultTextForLLM(r)}`;
865
897
  messages.push({
866
898
  role: 'tool',
867
899
  content: formatToolResultContent(call, r),
@@ -875,7 +907,7 @@ export function createAgent(runtime: AgentRuntime) {
875
907
  }
876
908
  }
877
909
 
878
- yield { type: 'step_end', step };
910
+ { const u = getLoopUsage(); yield u ? { type: 'step_end' as const, step, usage: u } : { type: 'step_end' as const, step }; }
879
911
  }
880
912
 
881
913
  yield { type: 'done', output: 'ERROR: max steps exceeded', steps: maxSteps };