npm - @bluecopa/harness - Versions diffs - 0.1.0-snapshot.5 → 0.1.0-snapshot.50 - Mend

@bluecopa/harness 0.1.0-snapshot.5 → 0.1.0-snapshot.50

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (88)

package/README.md +212 -117
package/package.json +2 -1
package/src/agent/create-agent.ts +30 -27
package/src/agent/types.ts +20 -24
package/src/arc/agent-runner.ts +955 -0
package/src/arc/arc-loop.ts +845 -0
package/src/arc/arc-types.ts +115 -0
package/src/arc/bridge-tools.ts +170 -0
package/src/arc/bridged-tool-provider.ts +80 -0
package/src/arc/consolidation.ts +118 -0
package/src/arc/context-window.ts +267 -0
package/src/arc/create-arc-agent.ts +99 -0
package/src/arc/debug.ts +62 -0
package/src/arc/episode-compressor.ts +225 -0
package/src/arc/memory-manager.ts +245 -0
package/src/arc/message-convert.ts +123 -0
package/src/arc/multi-model.ts +70 -0
package/src/arc/object-store/fs-object-store.ts +60 -0
package/src/arc/object-store/memory-object-store.ts +41 -0
package/src/arc/object-store/object-store.ts +12 -0
package/src/arc/profile-builder.ts +172 -0
package/src/arc/resilience/bulkhead.ts +110 -0
package/src/arc/resilience/circuit-breaker.ts +112 -0
package/src/arc/resilience/fallback.ts +27 -0
package/src/arc/resilience/index.ts +21 -0
package/src/arc/resilience/pipeline.ts +103 -0
package/src/arc/resilience/retry.ts +90 -0
package/src/arc/resilience/timeout.ts +60 -0
package/src/arc/resilience/types.ts +71 -0
package/src/arc/result-pager.ts +77 -0
package/src/arc/sig.ts +115 -0
package/src/arc/skill-resolver.ts +81 -0
package/src/arc/stores/episode-store.ts +120 -0
package/src/arc/stores/long-term-store.ts +86 -0
package/src/arc/stores/rxdb-setup.ts +113 -0
package/src/arc/stores/session-memo-store.ts +58 -0
package/src/arc/tools.ts +67 -0
package/src/arc/types.ts +363 -0
package/src/arc/utils.ts +37 -0
package/src/hooks/middleware.ts +95 -0
package/src/interfaces/hooks.ts +2 -1
package/src/interfaces/tool-provider.ts +0 -2
package/src/loop/context-store.ts +12 -9
package/src/loop/vercel-agent-loop.ts +44 -118
package/src/skills/skill-router.ts +12 -6
package/testing/index.ts +22 -0
package/testing/scenario-replay.ts +209 -0
package/testing/scenario-types.ts +38 -0
package/testing/scripted-llm.ts +230 -0
package/tests/arc/channel.test.ts +170 -0
package/tests/arc/context-window.test.ts +396 -0
package/tests/arc/e2e.test.ts +353 -0
package/tests/arc/error-paths.test.ts +402 -0
package/tests/arc/live-integration.test.ts +357 -0
package/tests/arc/memory-manager.test.ts +384 -0
package/tests/arc/middleware.test.ts +113 -0
package/tests/arc/process-interleaving.test.ts +432 -0
package/tests/arc/process-profiles.test.ts +366 -0
package/tests/arc/resilience-integration.test.ts +381 -0
package/tests/arc/resilience.test.ts +575 -0
package/tests/arc/result-paging.test.ts +392 -0
package/tests/arc/scenario-driven.test.ts +297 -0
package/tests/arc/tool-dispatch.test.ts +340 -0
package/tests/arc/wasm-pbt.test.ts +104 -0
package/tests/integration/agent-skill-default-from-sandbox.spec.ts +3 -2
package/tests/unit/structured-messages.spec.ts +1 -1
package/verify/Cargo.lock +637 -0
package/verify/Cargo.toml +24 -0
package/verify/src/lib.rs +5 -0
package/verify/src/main.rs +165 -0
package/verify/src/model/context.rs +100 -0
package/verify/src/model/mod.rs +6 -0
package/verify/src/model/orchestrator.rs +371 -0
package/verify/src/model/process.rs +140 -0
package/verify/src/model/types.rs +273 -0
package/verify/src/properties/liveness.rs +32 -0
package/verify/src/properties/mod.rs +4 -0
package/verify/src/properties/safety.rs +78 -0
package/verify/src/trace/event.rs +155 -0
package/verify/src/trace/mod.rs +2 -0
package/verify/src/trace/validator.rs +367 -0
package/verify/src/wasm/mod.rs +3 -0
package/verify/src/wasm/scenario_generator.rs +400 -0
package/verify/src/wasm/types.rs +104 -0
package/verify/src/wasm/wasm_validator.rs +107 -0
package/verify/tests/model_check.rs +49 -0
package/verify/tests/trace_validation.rs +147 -0
package/vitest.config.ts +1 -1

package/README.md CHANGED Viewed

@@ -2,9 +2,17 @@
 Provider-agnostic TypeScript agent framework with Claude-code-compatible tool semantics.
-The harness provides the core loop that drives an AI agent: send messages to an LLM, execute the tool calls it returns, feed results back, and repeat until the LLM produces a final text response.
+Published on npm as **`@bluecopa/harness`**.
-## Quickstart
+Two execution modes: a simple single-agent loop (`createAgent` + `VercelAgentLoop`) and a process-based orchestrator (`ArcLoop`) that dispatches parallel processes with context management, memory, and resilience.
+## Install
+```bash
+pnpm add @bluecopa/harness
+```
+## Development
 ```bash
 pnpm install
@@ -13,9 +21,11 @@ pnpm test
 ## Architecture
+### Single-Agent Loop
 ```
 ┌──────────────┐     ┌──────────────┐     ┌──────────────────┐
-│  createAgent │────▶│  AgentLoop   │────▶│  LLM (Claude)    │
+│  createAgent │────►│  AgentLoop   │────►│  LLM (Claude)    │
 │  (turn loop) │     │  (nextAction)│     │                  │
 └──────┬───────┘     └──────────────┘     └──────────────────┘
        │                                           │
@@ -27,20 +37,82 @@ pnpm test
 └──────────────┘
 ```
-1. `createAgent` drives a deterministic step loop
-2. Each step calls `loop.nextAction(messages)` to get the LLM's decision
-3. If it's a tool call, the harness executes it via `ToolProvider` and appends the result
-4. If it's a final action, the loop ends and returns the result
+### ArcLoop Orchestrator
-## Using with the sandbox
+```
+Orchestrator (ArcLoop — Opus 4.6 by default)
+  │  tools: Thread, Check, Cancel, Remember, ReadEpisode
+  │
+  │  Turn 1 (parallel):
+  ├──► Process 0 ("read auth", model=fast)        ─┐
+  ├──► Process 1 ("read routes", model=fast)       ─┼──► Episodes
+  ├──► Process 2 ("read tests", model=fast)        ─┘
+  │
+  │  Turn 2 (dispatch dependent work):
+  ├──► Thread("fix bug", context=[ep0,ep1,ep2])    ──► Episode
+  │
+  │  Turn 3 (parallel):
+  ├──► Thread("run tests", context=[ep3])          ─┐
+  ├──► Thread("update docs", context=[ep3])        ─┘
+  │
+  └──► Final text response
+```
-The most common setup connects the harness to a running sandbox service via `ControlPlaneE2BExecutor`:
+Full architecture doc: [`docs/arc.md`](../docs/arc.md)
-```ts
-import { createAgent } from './src/agent/create-agent';
-import { E2BToolProvider } from './src/providers/e2b-tool-provider';
+---
+## ToolProvider
+The contract for tool execution. All agent modes use this interface.
+```typescript
+interface ToolProvider {
+  bash(command: string, options?: BashOptions): Promise<ToolResult>;
+  readFile(path: string, options?: ReadOptions): Promise<ToolResult>;
+  writeFile(path: string, content: string): Promise<ToolResult>;
+  editFile(path: string, oldText: string, newText: string): Promise<ToolResult>;
+  glob(pattern: string, options?: GlobOptions): Promise<ToolResult>;
+  grep(pattern: string, path?: string, options?: GrepOptions): Promise<ToolResult>;
+  webFetch?(options: WebFetchOptions): Promise<ToolResult>;
+  webSearch?(query: string): Promise<ToolResult>;
+  capabilities(): ToolProviderCapabilities;
+}
+interface ToolResult {
+  success: boolean;
+  output: string;
+  error?: string;
+}
+```
+Built-in implementations:
+| Provider | Description |
+|----------|-------------|
+| `LocalToolProvider` | Runs tools on the local filesystem |
+| `E2BToolProvider` | Routes tools to a sandbox VM via `ControlPlaneE2BExecutor` |
+| `CompositeToolProvider` | Combines multiple providers (e.g. local filesystem + sandbox bash) |
+## SandboxProvider
+Higher-level sandbox operations beyond basic tool calls:
+```typescript
+interface SandboxProvider {
+  exec(command: string, options?: SandboxExecOptions): Promise<SandboxExecResult>;
+  readSandboxFile(path: string): Promise<SandboxFileBlob>;
+  writeSandboxFile(path: string, content: SandboxFileBlob): Promise<void>;
+}
+```
+Used by `SkillManager` for executing skill scripts in isolated VMs.
+## Connecting to a Sandbox
+```typescript
 import { ControlPlaneE2BExecutor } from './src/providers/control-plane-e2b-executor';
-import { VercelAgentLoop } from './src/loop/vercel-agent-loop';
+import { E2BToolProvider } from './src/providers/e2b-tool-provider';
 // Connect to sandbox service
 const executor = new ControlPlaneE2BExecutor({
@@ -50,155 +122,174 @@ const executor = new ControlPlaneE2BExecutor({
 });
 await executor.initialize();  // creates a Firecracker VM
-// Build and run the agent
-const agent = createAgent({
-  toolProvider: new E2BToolProvider(executor),
-  loop: new VercelAgentLoop(),  // needs ANTHROPIC_API_KEY
-});
+const toolProvider = new E2BToolProvider(executor);
-const result = await agent.run('create a bar chart of sales data');
-console.log(result.output);   // LLM's final response
-console.log(result.steps);    // number of tool steps
+// ... use with createAgent or ArcLoop
-await executor.destroy();     // tears down the VM
+await executor.destroy();  // tears down the VM
 ```
-For a complete working example, see [`examples/chat-assistant/src/chat.ts`](../examples/chat-assistant/src/chat.ts).
-### From environment variables
+From environment variables: `ControlPlaneE2BExecutor.fromEnv()` reads `SAMYX_BASE_URL` and `SAMYX_API_KEY`.
-`ControlPlaneE2BExecutor.fromEnv()` reads `SAMYX_BASE_URL` and `SAMYX_API_KEY` automatically:
+---
-```ts
-const executor = ControlPlaneE2BExecutor.fromEnv();
-```
+## Single-Agent Mode (`createAgent`)
-## Using locally (no sandbox)
+For simple tasks that don't need orchestration:
-For development without a sandbox service, use `LocalToolProvider` which runs tools on the local machine:
-```ts
+```typescript
 import { createAgent } from './src/agent/create-agent';
 import { LocalToolProvider } from './src/providers/local-tool-provider';
 const agent = createAgent({
   toolProvider: new LocalToolProvider(process.cwd()),
-  loop: new VercelAgentLoop(),
+  loop: new VercelAgentLoop(),  // needs ANTHROPIC_API_KEY
 });
 const result = await agent.run('list all TypeScript files');
+console.log(result.output);
 ```
-## Key modules
-### Agent creation (`src/agent/create-agent.ts`)
+### Configuration
-`createAgent(options)` returns an agent with a `.run(prompt, options?)` method. Options:
+| Option | Type | Default | Description |
+|--------|------|---------|-------------|
+| `toolProvider` | `ToolProvider` | required | Executes tool calls |
+| `loop` | `AgentLoop` | `VercelAgentLoop` | LLM decision loop |
+| `sandboxProvider` | `SandboxProvider` | — | Higher-level sandbox operations |
+| `maxSteps` | `number` | 30 | Max tool steps per run |
+| `telemetry` | `HarnessTelemetry` | — | OpenTelemetry-style tracing |
+| `skillIndexPath` | `string` | — | Path to skill index JSON for routing |
-| Option | Type | Description |
-|--------|------|-------------|
-| `toolProvider` | `ToolProvider` | Required. Executes tool calls |
-| `loop` | `AgentLoop` | LLM decision loop (default: `VercelAgentLoop`) |
-| `sandboxProvider` | `SandboxProvider` | Optional. Higher-level sandbox ops (file download, exec with env) |
-| `maxSteps` | `number` | Max tool steps per run (default: 30) |
-| `telemetry` | `HarnessTelemetry` | Optional. OpenTelemetry-style tracing |
-| `skillIndexPath` | `string` | Optional. Path to skill index JSON |
+### VercelAgentLoop
-### Agent loop (`src/loop/vercel-agent-loop.ts`)
+Calls Claude via the Vercel AI SDK. Supports parallel tool calls and configurable system prompt.
-`VercelAgentLoop` calls Claude via the Vercel AI SDK. It supports:
-- Parallel tool calls (returns `ToolBatchAction` when the LLM requests multiple tools at once)
-- Configurable system prompt
-- Model selection via `HARNESS_MODEL` env var (default: `claude-sonnet-4-5`)
-```ts
+```typescript
 const loop = new VercelAgentLoop({
   systemPrompt: 'You are a helpful coding assistant.',
+  model: 'claude-sonnet-4-5',  // or HARNESS_MODEL env var
 });
 ```
-### Tool provider (`src/interfaces/tool-provider.ts`)
+### LCMToolLoop
-The contract for tool execution:
+Wraps another loop to add Lossless Context Management and optional REPL orchestration:
-```ts
-interface ToolProvider {
-  bash(command: string, options?: BashOptions): Promise<ToolResult>;
-  readFile(path: string, options?: ReadOptions): Promise<ToolResult>;
-  writeFile(path: string, content: string): Promise<ToolResult>;
-  editFile(path: string, oldText: string, newText: string): Promise<ToolResult>;
-  glob(pattern: string, options?: GlobOptions): Promise<ToolResult>;
-  grep(pattern: string, path?: string, options?: GrepOptions): Promise<ToolResult>;
-  webFetch?(options: WebFetchOptions): Promise<ToolResult>;
-  webSearch?(query: string): Promise<ToolResult>;
-  capabilities(): ToolProviderCapabilities;
-}
+```typescript
+import { LCMToolLoop } from './src/loop/lcm-tool-loop';
+import { VercelAgentLoop } from './src/loop/vercel-agent-loop';
-interface ToolResult {
-  success: boolean;
-  output: string;
-  error?: string;
-}
+const loop = new LCMToolLoop({
+  innerLoop: new VercelAgentLoop(),
+  toolProvider: mySandboxProvider,
+  enableRepl: true,           // default: true
+  bridgeDir: '/var/run/bridge',
+  onActivity: (entry) => console.log(entry),
+  onLlmRequest: async (prompt) => callLLM(prompt),
+  onWebFetchRequest: async (url) => fetch(url),
+});
 ```
-Built-in implementations:
+**Standard mode**: Lossless context trimming — the LLM always sees a coherent, budget-fitting view of the full conversation.
-| Provider | Description |
-|----------|-------------|
-| `LocalToolProvider` | Runs tools on the local filesystem |
-| `E2BToolProvider` | Routes tools to an E2B-compatible executor over HTTP |
-| `CompositeToolProvider` | Combines multiple providers (e.g. sandbox + web) |
+**REPL mode**: When the LLM returns a Bash action with the REPL marker, the loop writes a Python script into the sandbox, injects the bridge module, runs the script, and polls for sub-requests (LLM, web_fetch, ask_user) that the harness fulfills.
-### Action types (`src/agent/types.ts`)
+---
-The LLM returns one of these action types each turn:
+## ArcLoop (Orchestrator Mode)
-```ts
-// Single tool call
-interface ToolCallAction {
-  type: 'tool';
-  name: 'Bash' | 'Read' | 'Write' | 'Edit' | 'Glob' | 'Grep' | ...;
-  args: Record<string, unknown>;
-}
+For complex tasks that benefit from parallel processes, context management, and memory:
-// Multiple independent tool calls (executed in parallel)
-interface ToolBatchAction {
-  type: 'tool_batch';
-  calls: ToolCallAction[];
-}
+```typescript
+import { createArcAgent } from './src/arc/create-arc-agent';
-// Final text response (ends the loop)
-interface FinalAction {
-  type: 'final';
-  content: string;
-}
-```
+const agent = await createArcAgent({
+  toolProvider: myToolProvider,
+  episodeStore: myEpisodeStore,       // required
+  sessionMemoStore: mySessionMemoStore, // required
+  longTermStore: myLongTermStore,       // required
+  taskId: 'task-1',
+  sessionId: 'session-1',
+});
-### LCM tool loop (`src/loop/lcm-tool-loop.ts`)
+// Streaming
+for await (const event of agent.stream(messages, signal)) {
+  if (event.type === 'text_delta') process.stdout.write(event.text);
+  if (event.type === 'process_dispatched') console.log(`  → ${event.action}`);
+  if (event.type === 'done') console.log(`Done in ${event.stats.durationMs}ms`);
+}
-`LCMToolLoop` wraps another loop to add LCM-based tool routing, REPL script execution, and bridge-based tool dispatch. Used in the chat-assistant example.
+// Non-streaming
+const result = await agent.run(messages, signal);
+```
-### Sandbox provider (`src/interfaces/sandbox-provider.ts`)
+### ArcLoopConfig
+| Option | Type | Default | Description |
+|--------|------|---------|-------------|
+| `model` | `string` | `'claude-opus-4-6'` | Orchestrator model (ID or tier name) |
+| `modelMap` | `Record<ModelTier, string>` | haiku/sonnet/opus | Maps fast/medium/strong to model IDs |
+| `apiKey` | `string` | — | Anthropic API key |
+| `systemPrompt` | `string` | built-in | Custom orchestrator system prompt |
+| `maxTurns` | `number` | 30 | Max orchestrator turns |
+| `processTimeout` | `number` | 120_000 | Per-process timeout (ms) |
+| `processMaxSteps` | `number` | 20 | Per-process max tool steps |
+| `contextWindowSize` | `number` | 200_000 | Context window in tokens |
+| `outputReserve` | `number` | 20_000 | Tokens reserved for output |
+| `autoMemory` | `boolean` | true | Auto-detect patterns from episodes |
+| `episodeStore` | `EpisodeStore` | required | Stores episode summaries + traces |
+| `sessionMemoStore` | `SessionMemoStore` | required | Stores session memos |
+| `longTermStore` | `LongTermStore` | required | Stores long-term memories |
+| `taskId` | `string` | required | Task identifier |
+| `sessionId` | `string` | required | Session identifier |
+| `toolProvider` | `ToolProvider` | required | Tool execution |
+| `processTools` | `Record<string, AnyTool>` | builtinTools | Tools available inside processes |
+| `extraOrchestratorTools` | `Record<string, AnyTool>` | — | Custom orchestrator tools |
+| `onOrchestratorTool` | `function` | — | Handler for custom orchestrator tools |
+| `resilience` | `ResiliencePolicy` | — | Composable resilience pipeline |
+| `traceWriter` | `function` | — | Callback for trace event emission |
+### Resilience
+```typescript
+import { resilience } from './src/arc/resilience';
+const pipeline = resilience()
+  .retry({ maxRetries: 2, baseDelay: 1000 })
+  .timeout({ durationMs: 30_000 })
+  .circuitBreaker({ failureThreshold: 5 })
+  .build();
+const agent = await createArcAgent({
+  // ...config
+  resilience: pipeline,
+});
+```
-Higher-level sandbox operations beyond basic tool calls:
+### Trace Emission
-```ts
-interface SandboxProvider {
-  exec(command: string, options?: SandboxExecOptions): Promise<SandboxExecResult>;
-  readSandboxFile(path: string): Promise<SandboxFileBlob>;
-  writeSandboxFile(path: string, content: SandboxFileBlob): Promise<void>;
-}
+```typescript
+const traces: TraceEvent[] = [];
+const agent = await createArcAgent({
+  // ...config
+  traceWriter: (event) => traces.push(event),
+});
 ```
-### Observability (`src/observability/otel.ts`)
+Traces can be validated against the formal model: `cd verify && cargo run -- trace file.ndjson`
-`HarnessTelemetry` provides OpenTelemetry-style spans and metrics for agent runs.
+---
-## Package layout
+## Package Layout
 ```
 src/
 ├── agent/          # createAgent, step executor, types
+├── arc/            # ArcLoop orchestrator, processes, memory, resilience
+│   ├── resilience/ # Retry, circuit breaker, timeout, bulkhead, fallback
+│   ├── stores/     # RxDB + in-memory store implementations
+│   └── object-store/ # Pluggable cloud sync (fs, memory)
 ├── interfaces/     # ToolProvider, SandboxProvider, AgentLoop contracts
 ├── loop/           # VercelAgentLoop, LCMToolLoop
 ├── providers/      # LocalToolProvider, E2BToolProvider, ControlPlaneE2BExecutor
@@ -206,16 +297,20 @@ src/
 ├── hooks/          # Pre/post tool call hooks
 ├── permissions/    # Tool permission checks
 ├── sessions/       # Session persistence
-├── subagents/      # Subagent spawning and task tools
+├── subagents/      # Subagent spawning
 ├── skills/         # Skill index, routing, and management
 ├── optimization/   # Benchmark runner
 └── observability/  # OpenTelemetry integration
+verify/             # Rust formal verification (Stateright model checker)
+testing/            # Adversarial scenario replay harness
+tests/              # Vitest test suite
 ```
 ## Documentation
-- Provider guide: `docs/guides/providers.md`
-- Skills guide: `docs/guides/skills.md`
-- Observability guide: `docs/guides/observability.md`
-- Release process: `../docs/RELEASE.md`
-- Full example: [`../examples/chat-assistant/src/chat.ts`](../examples/chat-assistant/src/chat.ts)
+- [Arc architecture](../docs/arc.md) — process model, context window, memory, resilience, verification
+- [Testing](../docs/testing.md) — test layers, running tests, writing new tests
+- [Sandbox setup](../docs/PUBLIC_SANDBOX.md) — deploying the sandbox service
+- [Release process](../docs/RELEASE.md) — versioning and publishing
+- [Example](../examples/chat-assistant/src/chat.ts) — complete working chat assistant

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@bluecopa/harness",
-  "version": "0.1.0-snapshot.5",
+  "version": "0.1.0-snapshot.50",
   "description": "Provider-agnostic TypeScript agent framework",
   "license": "UNLICENSED",
   "scripts": {
@@ -10,6 +10,7 @@
   "dependencies": {
     "@ai-sdk/anthropic": "^3.0.48",
     "ai": "^6.0.101",
+    "rxdb": "^15.39.0",
     "zod": "^4.1.11"
   },
   "devDependencies": {

package/src/agent/create-agent.ts CHANGED Viewed

@@ -7,14 +7,12 @@ import type { HarnessTelemetry } from '../observability/otel';
 import { HookRunner } from '../hooks/hook-runner';
 import { PermissionManager } from '../permissions/permission-manager';
 import { VercelAgentLoop } from '../loop/vercel-agent-loop';
-export type { SystemPromptBlock, VercelAgentLoopConfig } from '../loop/vercel-agent-loop';
-export type { PrepareStepContext, PrepareStepResult } from './types';
 import { SkillManager } from '../skills/skill-manager';
 import { SkillRouter } from '../skills/skill-router';
 import type { SkillSummary } from '../skills/skill-types';
 import { SingleFlightStepExecutor } from './step-executor';
-import type { AgentAction, AgentLoop, AgentMessage, AgentRunResult, AgentStreamEvent, StepUsage, ToolCallAction, ToolBatchAction, ToolCallInfo, ToolResultInfo } from './types';
-export type { AgentAction, AgentLoop, AgentMessage, AgentRunResult, AgentStreamEvent, StepUsage, ToolCallAction, ToolBatchAction, ToolCallInfo, ToolResultInfo };
+import type { AgentAction, AgentLoop, AgentMessage, AgentRunResult, AgentStreamEvent, ToolCallAction, ToolBatchAction, ToolCallInfo, ToolResultInfo } from './types';
+export type { AgentAction, AgentLoop, AgentMessage, AgentRunResult, AgentStreamEvent, ToolCallAction, ToolBatchAction, ToolCallInfo, ToolResultInfo };
 export { HookRunner } from '../hooks/hook-runner';
 export { PermissionManager } from '../permissions/permission-manager';
 export type { PermissionMode, PermissionResolver, PermissionRequest } from '../permissions/permission-manager';
@@ -39,6 +37,8 @@ export interface AgentRuntime {
   /** Custom tool executor. Called for every tool action. Return null to fall through to built-in dispatch.
    *  When hookRunner/permissionManager are provided on the runtime, they are automatically applied before/after this callback — no manual wiring needed. */
   executeToolAction?: (action: ToolCallAction) => Promise<ToolResult | null>;
+  /** Progress callback fired before/after each tool call during run(). */
+  onToolProgress?: (event: { type: 'tool_start'; name: string; args: Record<string, unknown> } | { type: 'tool_end'; name: string; success: boolean; durationMs: number }) => void;
 }
 /**
@@ -220,12 +220,9 @@ function toStreamResult(r: ToolResult): { success: boolean; output: string; erro
   return base;
 }
-/** Format content string for LLM context. Uses modelOutput (compact summary) when available. */
+/** Format a display-friendly content string for tool results (used in content field). */
 function formatToolResultContent(call: ToolCallAction, result: ToolResult): string {
-  // Use modelOutput for LLM context when available — keeps context compact
-  const content = result.success
-    ? (result.modelOutput ?? result.output)
-    : `ERROR: ${result.error ?? 'unknown failure'}`;
+  const content = result.success ? result.output : `ERROR: ${result.error ?? 'unknown failure'}`;
   switch (call.name) {
     case 'Write':
       return `Write(${call.args.path}): ${result.success ? 'ok' : content}`;
@@ -520,11 +517,6 @@ export function createAgent(runtime: AgentRuntime) {
       ? { nextAction: runtime.nextAction }
       : new VercelAgentLoop());
-  /** Read lastUsage from the loop if it's a VercelAgentLoop. */
-  function getLoopUsage(): StepUsage | undefined {
-    return loop instanceof VercelAgentLoop ? loop.lastUsage : undefined;
-  }
   async function resolveSkillContext(prompt: string): Promise<string> {
     if (!skillManager || !skillIndexPath) return '';
@@ -606,14 +598,18 @@ export function createAgent(runtime: AgentRuntime) {
             // Execute valid calls via batch (sequential sandbox ops) or parallel fallback
             if (validCalls.length > 0) {
+              for (const c of validCalls) runtime.onToolProgress?.({ type: 'tool_start', name: c.name, args: c.args });
+              const batchStart = Date.now();
               const results = await executeBatch(validCalls, runtime.toolProvider, runtime);
+              const batchMs = Date.now() - batchStart;
               for (let i = 0; i < validCalls.length; i++) {
                 const call = validCalls[i]!;
                 const r = results[i]!;
+                runtime.onToolProgress?.({ type: 'tool_end', name: call.name, success: r.success, durationMs: batchMs });
                 if (!r.success) {
                   recordAgentError(runtime.telemetry);
                 }
-                const resultText = r.success ? (r.modelOutput ?? r.output) : `ERROR: ${r.error ?? 'unknown failure'}`;
+                const resultText = r.success ? r.output : `ERROR: ${r.error ?? 'unknown failure'}`;
                 messages.push({
                   role: 'tool',
                   content: formatToolResultContent(call, r),
@@ -669,6 +665,8 @@ export function createAgent(runtime: AgentRuntime) {
           } else {
             consecutiveInvalid = 0;
           }
+          runtime.onToolProgress?.({ type: 'tool_start', name: action.name, args: action.args });
+          const singleStart = Date.now();
           const result = validationError
             ? ({ success: false, output: '', error: validationError } as ToolResult)
             : await executor.run(async () => {
@@ -682,10 +680,11 @@ export function createAgent(runtime: AgentRuntime) {
                   };
                 }
               });
+          runtime.onToolProgress?.({ type: 'tool_end', name: action.name, success: result.success, durationMs: Date.now() - singleStart });
           if (!result.success) {
             recordAgentError(runtime.telemetry);
           }
-          const singleResultText = result.success ? (result.modelOutput ?? result.output) : `ERROR: ${result.error ?? 'unknown failure'}`;
+          const singleResultText = result.success ? result.output : `ERROR: ${result.error ?? 'unknown failure'}`;
           messages.push({
             role: 'tool',
             content: formatToolResultContent(action, result),
@@ -728,8 +727,7 @@ export function createAgent(runtime: AgentRuntime) {
             if (event.type === 'text_delta') {
               finalText += event.text;
               yield event;
-            }
-            if (event.type === 'tool_start') {
+            } else if (event.type === 'tool_start') {
               pendingTools.push({
                 type: 'tool',
                 name: event.name,
@@ -737,13 +735,18 @@ export function createAgent(runtime: AgentRuntime) {
                 ...(event.toolCallId != null ? { toolCallId: event.toolCallId } : {}),
               });
               yield event;
+            } else {
+              // Forward all other events (tool_end, step_start, step_end, done)
+              // from self-managing loops like ArcLoop
+              yield event;
+              if (event.type === 'done') return;
             }
           }
           // If no tools → final response
           if (pendingTools.length === 0) {
             messages.push({ role: 'assistant', content: finalText });
-            { const u = getLoopUsage(); yield u ? { type: 'step_end' as const, step, usage: u } : { type: 'step_end' as const, step }; }
+            yield { type: 'step_end', step };
             yield { type: 'done', output: finalText, steps: step };
             return;
           }
@@ -769,7 +772,7 @@ export function createAgent(runtime: AgentRuntime) {
             if (action.type === 'final') {
               yield { type: 'text_delta', text: action.content };
               messages.push({ role: 'assistant', content: action.content });
-              { const u = getLoopUsage(); yield u ? { type: 'step_end' as const, step, usage: u } : { type: 'step_end' as const, step }; }
+              yield { type: 'step_end', step };
               yield { type: 'done', output: action.content, steps: step };
               return;
             }
@@ -781,7 +784,7 @@ export function createAgent(runtime: AgentRuntime) {
                 try {
                   const r = await executeTool(runtime.toolProvider, call, runtime);
                   yield { type: 'tool_end', name: call.name, result: toStreamResult(r) };
-                  const rText = r.success ? (r.modelOutput ?? r.output) : `ERROR: ${r.error ?? 'unknown failure'}`;
+                  const rText = r.success ? r.output : `ERROR: ${r.error ?? 'unknown failure'}`;
                   messages.push({
                     role: 'tool',
                     content: formatToolResultContent(call, r),
@@ -803,7 +806,7 @@ export function createAgent(runtime: AgentRuntime) {
               try {
                 const r = await executeTool(runtime.toolProvider, action, runtime);
                 yield { type: 'tool_end', name: action.name, result: toStreamResult(r) };
-                const rText = r.success ? (r.modelOutput ?? r.output) : `ERROR: ${r.error ?? 'unknown failure'}`;
+                const rText = r.success ? r.output : `ERROR: ${r.error ?? 'unknown failure'}`;
                 messages.push({
                   role: 'tool',
                   content: formatToolResultContent(action, r),
@@ -819,7 +822,7 @@ export function createAgent(runtime: AgentRuntime) {
                 });
               }
             }
-            { const u = getLoopUsage(); yield u ? { type: 'step_end' as const, step, usage: u } : { type: 'step_end' as const, step }; }
+            yield { type: 'step_end', step };
             continue;
           }
@@ -829,7 +832,7 @@ export function createAgent(runtime: AgentRuntime) {
             const call = pendingTools[i]!;
             const r = results[i]!;
             yield { type: 'tool_end', name: call.name, result: toStreamResult(r) };
-            const rText = r.success ? (r.modelOutput ?? r.output) : `ERROR: ${r.error ?? 'unknown failure'}`;
+            const rText = r.success ? r.output : `ERROR: ${r.error ?? 'unknown failure'}`;
             messages.push({
               role: 'tool',
               content: formatToolResultContent(call, r),
@@ -847,7 +850,7 @@ export function createAgent(runtime: AgentRuntime) {
           if (action.type === 'final') {
             messages.push({ role: 'assistant', content: action.content });
-            { const u = getLoopUsage(); yield u ? { type: 'step_end' as const, step, usage: u } : { type: 'step_end' as const, step }; }
+            yield { type: 'step_end', step };
             yield { type: 'done', output: action.content, steps: step };
             return;
           }
@@ -871,7 +874,7 @@ export function createAgent(runtime: AgentRuntime) {
             const call = calls[i]!;
             const r = results[i]!;
             yield { type: 'tool_end', name: call.name, result: toStreamResult(r) };
-            const rText = r.success ? (r.modelOutput ?? r.output) : `ERROR: ${r.error ?? 'unknown failure'}`;
+            const rText = r.success ? r.output : `ERROR: ${r.error ?? 'unknown failure'}`;
             messages.push({
               role: 'tool',
               content: formatToolResultContent(call, r),
@@ -885,7 +888,7 @@ export function createAgent(runtime: AgentRuntime) {
           }
         }
-        { const u = getLoopUsage(); yield u ? { type: 'step_end' as const, step, usage: u } : { type: 'step_end' as const, step }; }
+        yield { type: 'step_end', step };
       }
       yield { type: 'done', output: 'ERROR: max steps exceeded', steps: maxSteps };