@bluecopa/harness 1.0.0 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +212 -117
- package/dist/arc/index.d.ts +796 -0
- package/dist/arc/index.js +2863 -0
- package/dist/arc/index.js.map +1 -0
- package/dist/observability/otel.d.ts +36 -0
- package/dist/observability/otel.js +73 -0
- package/dist/observability/otel.js.map +1 -0
- package/dist/shared-types-DRxnerLT.d.ts +138 -0
- package/dist/skills/index.d.ts +67 -0
- package/dist/skills/index.js +282 -0
- package/dist/skills/index.js.map +1 -0
- package/package.json +26 -2
- package/AGENTS.md +0 -18
- package/docs/guides/observability.md +0 -32
- package/docs/guides/providers.md +0 -51
- package/docs/guides/skills.md +0 -25
- package/docs/security/skill-sandbox-threat-model.md +0 -20
- package/src/agent/create-agent.ts +0 -884
- package/src/agent/create-tools.ts +0 -33
- package/src/agent/step-executor.ts +0 -15
- package/src/agent/types.ts +0 -57
- package/src/context/llm-compaction-strategy.ts +0 -37
- package/src/context/prepare-step.ts +0 -65
- package/src/context/token-tracker.ts +0 -26
- package/src/extracted/manifest.json +0 -10
- package/src/extracted/prompts/compaction.md +0 -5
- package/src/extracted/prompts/system.md +0 -5
- package/src/extracted/tools.json +0 -82
- package/src/hooks/hook-runner.ts +0 -22
- package/src/hooks/tool-wrappers.ts +0 -64
- package/src/interfaces/compaction-strategy.ts +0 -18
- package/src/interfaces/hooks.ts +0 -24
- package/src/interfaces/sandbox-provider.ts +0 -29
- package/src/interfaces/session-store.ts +0 -48
- package/src/interfaces/tool-provider.ts +0 -70
- package/src/loop/bridge.ts +0 -363
- package/src/loop/context-store.ts +0 -207
- package/src/loop/lcm-tool-loop.ts +0 -163
- package/src/loop/vercel-agent-loop.ts +0 -279
- package/src/observability/context.ts +0 -17
- package/src/observability/metrics.ts +0 -27
- package/src/observability/otel.ts +0 -105
- package/src/observability/tracing.ts +0 -13
- package/src/optimization/agent-evaluator.ts +0 -40
- package/src/optimization/config-serializer.ts +0 -16
- package/src/optimization/optimization-runner.ts +0 -39
- package/src/optimization/trace-collector.ts +0 -33
- package/src/permissions/permission-manager.ts +0 -34
- package/src/providers/composite-tool-provider.ts +0 -72
- package/src/providers/control-plane-e2b-executor.ts +0 -218
- package/src/providers/e2b-tool-provider.ts +0 -68
- package/src/providers/local-tool-provider.ts +0 -190
- package/src/providers/skill-sandbox-provider.ts +0 -46
- package/src/sessions/file-session-store.ts +0 -61
- package/src/sessions/in-memory-session-store.ts +0 -39
- package/src/sessions/session-manager.ts +0 -44
- package/src/skills/skill-loader.ts +0 -52
- package/src/skills/skill-manager.ts +0 -175
- package/src/skills/skill-router.ts +0 -99
- package/src/skills/skill-types.ts +0 -26
- package/src/subagents/subagent-manager.ts +0 -22
- package/src/subagents/task-tool.ts +0 -13
- package/tests/integration/agent-loop-basic.spec.ts +0 -56
- package/tests/integration/agent-skill-default-from-sandbox.spec.ts +0 -66
- package/tests/integration/concurrency-single-turn.spec.ts +0 -35
- package/tests/integration/otel-metrics-emission.spec.ts +0 -62
- package/tests/integration/otel-trace-propagation.spec.ts +0 -48
- package/tests/integration/parity-benchmark.spec.ts +0 -45
- package/tests/integration/provider-local-smoke.spec.ts +0 -63
- package/tests/integration/session-resume.spec.ts +0 -30
- package/tests/integration/skill-install-rollback.spec.ts +0 -64
- package/tests/integration/skill-sandbox-file-blob.spec.ts +0 -54
- package/tests/integration/skills-progressive-disclosure.spec.ts +0 -61
- package/tests/integration/streaming-compaction-boundary.spec.ts +0 -43
- package/tests/integration/structured-messages-agent.spec.ts +0 -265
- package/tests/integration/subagent-isolation.spec.ts +0 -24
- package/tests/security/skill-sandbox-isolation.spec.ts +0 -51
- package/tests/unit/create-tools-schema-parity.spec.ts +0 -22
- package/tests/unit/extracted-manifest.spec.ts +0 -41
- package/tests/unit/interfaces-contract.spec.ts +0 -101
- package/tests/unit/structured-messages.spec.ts +0 -176
- package/tests/unit/token-tracker.spec.ts +0 -22
- package/tsconfig.json +0 -14
- package/vitest.config.ts +0 -7
|
@@ -1,163 +0,0 @@
|
|
|
1
|
-
import type { AgentAction, AgentLoop, AgentMessage, AgentStreamEvent } from '../agent/types';
|
|
2
|
-
import type { ToolProvider } from '../interfaces/tool-provider';
|
|
3
|
-
import type { ActivityEntry, BridgeConfig } from './bridge';
|
|
4
|
-
import { LosslessContextStore, type ContextStoreConfig } from './context-store';
|
|
5
|
-
import { SandboxBridge } from './bridge';
|
|
6
|
-
|
|
7
|
-
/**
 * Prefix that marks a Bash command as a REPL script.
 *
 * The inner loop requests REPL mode by returning a Bash action whose
 * `command` begins with this exact text; everything after it is the script.
 */
const REPL_MARKER = '##REPL##\n';
|
|
11
|
-
|
|
12
|
-
/** Options accepted by the LCMToolLoop constructor. */
export interface LCMToolLoopConfig {
  /** Inner loop that drives LLM decisions. Required — there is no default. */
  innerLoop: AgentLoop;

  /** Tool provider used for bridge file I/O and bash execution. */
  toolProvider: ToolProvider;

  /** Tuning options forwarded to the lossless context store. */
  contextStore?: ContextStoreConfig;

  /**
   * Whether REPL mode is enabled. Defaults to `true`; set to `false` for
   * context-management-only operation.
   */
  enableRepl?: boolean;

  /** Bridge directory inside the sandbox. Defaults to `/var/run/bridge`. */
  bridgeDir?: string;

  /**
   * Bridge poll interval in milliseconds. When omitted the bridge's own
   * default is used (documented as 200).
   */
  bridgePollIntervalMs?: number;

  /** Harness callback that fulfills LLM requests issued from the REPL. */
  onLlmRequest?: BridgeConfig['onLlmRequest'];

  /** Harness callback that fulfills web-fetch requests issued from the REPL. */
  onWebFetchRequest?: BridgeConfig['onWebFetchRequest'];

  /** Harness callback that fulfills web-search requests issued from the REPL. */
  onWebSearchRequest?: BridgeConfig['onWebSearchRequest'];

  /** Harness callback that fulfills ask-user requests issued from the REPL. */
  onAskUserRequest?: BridgeConfig['onAskUserRequest'];

  /** Harness callback that fulfills tell-user requests issued from the REPL. */
  onTellUserRequest?: BridgeConfig['onTellUserRequest'];

  /** Harness callback that receives real-time activity log entries from the REPL. */
  onActivity?: (entry: ActivityEntry) => void;
}
|
|
38
|
-
|
|
39
|
-
/**
 * LCM Tool Loop — Lossless Context Management with optional REPL orchestration.
 *
 * Wraps another AgentLoop (for example VercelAgentLoop) and works in two modes:
 *
 * **Standard mode**: every call ingests the full message history into the
 * lossless store, asks the store for a trimmed view, and lets the inner loop
 * pick the next action from that view. The action is returned untouched.
 *
 * **REPL mode**: if REPL is enabled and the inner loop returns a Bash action
 * whose command starts with the REPL marker, the remainder of the command is
 * written into the sandbox as `repl_script.py`, handed to a SandboxBridge that
 * is polled until completion, and the bridge output becomes a `final` action.
 * Sub-requests raised by the script are served through the callbacks supplied
 * in the config.
 */
export class LCMToolLoop implements AgentLoop {
  private readonly store: LosslessContextStore;
  private readonly inner: AgentLoop;
  private readonly tp: ToolProvider;
  private readonly enableRepl: boolean;
  private readonly bridgeDir: string;
  private readonly config: LCMToolLoopConfig;
  /** Flipped to true once a bridge has completed `setup()` for this loop. */
  private bridgeSetupDone = false;

  /**
   * @param config Loop configuration; `enableRepl` falls back to `true` and
   *   `bridgeDir` to `/var/run/bridge` when not supplied.
   */
  constructor(config: LCMToolLoopConfig) {
    this.config = config;
    this.inner = config.innerLoop;
    this.tp = config.toolProvider;
    this.enableRepl = config.enableRepl ?? true;
    this.bridgeDir = config.bridgeDir ?? '/var/run/bridge';
    this.store = new LosslessContextStore(config.contextStore);
  }

  /**
   * Decide the next action for the given conversation.
   *
   * @param messages Full message history; it is ingested into the store and
   *   only the store's trimmed view reaches the inner loop.
   * @returns The inner loop's action, or a `final` action carrying REPL output
   *   when the inner loop asked for a REPL script and REPL mode is enabled.
   */
  async nextAction(messages: AgentMessage[]): Promise<AgentAction> {
    // Feed the complete history to the store, then work from its trimmed view.
    this.store.ingest(messages);
    const trimmed = this.store.getView();

    const snapshot = this.store.stats();
    this.log(`raw=${snapshot.raw} view=${snapshot.view} tokens≈${snapshot.tokensView}`);

    const decided = await this.inner.nextAction(trimmed);

    // REPL scripts are intercepted; every other action passes straight through.
    const wantsRepl = this.enableRepl && this.isReplScript(decided);
    return wantsRepl ? this.executeReplWithBridge(decided) : decided;
  }

  /**
   * Streaming counterpart of `nextAction()`: applies the same context
   * management and forwards the inner loop's stream events.
   *
   * Yields nothing when the inner loop has no `streamAction`; falling back to
   * `nextAction()` is left to the caller (the agent's `stream()`).
   */
  async *streamAction(messages: AgentMessage[]): AsyncGenerator<AgentStreamEvent> {
    this.store.ingest(messages);
    const trimmed = this.store.getView();

    if (!this.inner.streamAction) {
      return;
    }
    yield* this.inner.streamAction(trimmed);
  }

  /** Expose store stats for external logging. */
  getStats() {
    return this.store.stats();
  }

  // ── REPL detection ──

  /** True when `action` is a Bash tool call whose command starts with the REPL marker. */
  private isReplScript(action: AgentAction): boolean {
    if (action.type === 'tool' && action.name === 'Bash') {
      const command = String(action.args.command ?? '');
      return command.startsWith(REPL_MARKER);
    }
    return false;
  }

  // ── REPL execution ──

  /**
   * Run the script embedded in a REPL-marked Bash action through the sandbox bridge.
   *
   * @param action Action produced by the inner loop; non-tool actions are returned as-is.
   * @returns A `final` action whose content is the bridge output, or a fixed
   *   placeholder message when the output is empty.
   */
  private async executeReplWithBridge(action: AgentAction): Promise<AgentAction> {
    if (action.type !== 'tool') {
      return action;
    }

    // Everything after the marker is the script body.
    const scriptBody = String(action.args.command ?? '').slice(REPL_MARKER.length);
    this.log('entering REPL mode');

    // Optional fields are spread in only when defined so the literal satisfies
    // exactOptionalPropertyTypes.
    const {
      bridgePollIntervalMs,
      onLlmRequest,
      onWebFetchRequest,
      onWebSearchRequest,
      onAskUserRequest,
      onTellUserRequest,
      onActivity,
    } = this.config;
    const bridgeCfg: BridgeConfig = {
      toolProvider: this.tp,
      bridgeDir: this.bridgeDir,
      ...(bridgePollIntervalMs != null ? { pollIntervalMs: bridgePollIntervalMs } : {}),
      ...(onLlmRequest ? { onLlmRequest } : {}),
      ...(onWebFetchRequest ? { onWebFetchRequest } : {}),
      ...(onWebSearchRequest ? { onWebSearchRequest } : {}),
      ...(onAskUserRequest ? { onAskUserRequest } : {}),
      ...(onTellUserRequest ? { onTellUserRequest } : {}),
      ...(onActivity ? { onActivity } : {}),
    };
    const bridge = new SandboxBridge(bridgeCfg);

    // setup() runs only for the first REPL script handled by this loop instance.
    if (!this.bridgeSetupDone) {
      await bridge.setup();
      this.bridgeSetupDone = true;
    }

    // Place the script inside the sandbox, then let the bridge run and poll it.
    const scriptPath = `${this.bridgeDir}/repl_script.py`;
    await this.tp.writeFile(scriptPath, scriptBody);

    const replOutput = await bridge.pollUntilComplete(scriptPath);
    this.log('REPL mode complete');

    return {
      type: 'final',
      content: replOutput || 'REPL script completed with no output.',
    };
  }

  /** Write a dimmed diagnostic line to stderr when a Node-style `process.stderr` exists. */
  private log(msg: string): void {
    if (typeof process === 'undefined' || !process.stderr) {
      return;
    }
    process.stderr.write(`\x1b[2m◇ ${msg}\x1b[0m\n`);
  }
}
|
|
@@ -1,279 +0,0 @@
|
|
|
1
|
-
import { generateText, streamText, stepCountIs, tool, type Tool } from 'ai';
|
|
2
|
-
import { anthropic } from '@ai-sdk/anthropic';
|
|
3
|
-
import { z } from 'zod';
|
|
4
|
-
|
|
5
|
-
import type { AgentAction, AgentMessage, AgentLoop, AgentStreamEvent, ToolCallAction, ToolBatchAction } from '../agent/types';
|
|
6
|
-
|
|
7
|
-
/** An AI SDK tool definition with unconstrained input and output types. */
// eslint-disable-next-line @typescript-eslint/no-explicit-any
type AnyTool = Tool<any, any>;
|
|
9
|
-
|
|
10
|
-
// ── Native tool definitions (schema only: no `execute`, the harness runs them) ──

/** Required string field carrying the given description. */
const requiredString = (hint: string) => z.string().describe(hint);
/** Optional string field carrying the given description. */
const optionalString = (hint: string) => z.string().optional().describe(hint);
/** Optional numeric field carrying the given description. */
const optionalNumber = (hint: string) => z.number().optional().describe(hint);

/**
 * Schemas of the built-in tools offered to the model.
 *
 * Combine with custom tools via `{ ...builtinTools, ...myTools }`.
 */
export const builtinTools = {
  Bash: tool({
    description: 'Run a shell command',
    inputSchema: z.object({
      command: requiredString('The shell command to run'),
      cwd: optionalString('Working directory'),
      timeout: optionalNumber('Timeout in milliseconds'),
    }),
  }),

  Read: tool({
    description: 'Read a file from the filesystem',
    inputSchema: z.object({
      path: requiredString('Absolute path to the file'),
    }),
  }),

  Write: tool({
    description: 'Write content to a file (creates or overwrites)',
    inputSchema: z.object({
      path: requiredString('Absolute path to the file'),
      content: requiredString('Content to write'),
    }),
  }),

  Edit: tool({
    description: 'Replace text in a file (exact match)',
    inputSchema: z.object({
      path: requiredString('Absolute path to the file'),
      old_text: requiredString('Exact text to find'),
      new_text: requiredString('Replacement text'),
    }),
  }),

  Glob: tool({
    description: 'Find files matching a glob pattern',
    inputSchema: z.object({
      pattern: requiredString('Glob pattern (e.g. **/*.ts)'),
    }),
  }),

  Grep: tool({
    description: 'Search file contents with a regex pattern',
    inputSchema: z.object({
      pattern: requiredString('Regex pattern to search for'),
      path: optionalString('Directory or file to search in'),
    }),
  }),

  WebFetch: tool({
    description: 'Fetch content from a URL',
    inputSchema: z.object({
      url: requiredString('URL to fetch'),
      selector: optionalString('CSS selector to extract'),
      maxContentLength: optionalNumber('Max content length'),
    }),
  }),

  WebSearch: tool({
    description: 'Search the web',
    inputSchema: z.object({
      query: requiredString('Search query'),
    }),
  }),

  AskUser: tool({
    description: 'Ask the user a question and wait for their response',
    inputSchema: z.object({
      question: requiredString('The question to ask'),
      options: z.array(z.string()).optional().describe('Optional choices'),
    }),
  }),

  TellUser: tool({
    description: 'Display a message to the user (no response expected)',
    inputSchema: z.object({
      message: requiredString('The message to display'),
    }),
  }),

  DownloadRawFile: tool({
    description: 'Download a file from the sandbox to the local machine',
    inputSchema: z.object({
      path: requiredString('Sandbox path to download'),
    }),
  }),
};
|
|
90
|
-
|
|
91
|
-
// ── AgentMessage[] → Vercel AI SDK ModelMessage[] conversion ──

// eslint-disable-next-line @typescript-eslint/no-explicit-any
type ModelMessage = any;

/**
 * Translate harness messages into the message shape passed to the AI SDK.
 *
 * - `system` / `user`: copied with their text content.
 * - `assistant` with tool calls: content becomes a parts array — an optional
 *   leading text part (only when the message has non-empty content) followed
 *   by one `tool-call` part per call. Without tool calls the text is passed
 *   through unchanged.
 * - `tool` with structured results: one `tool-result` part per result, tagged
 *   `error-text` or `text` depending on `isError`. Without structured results
 *   the content is wrapped into a `user` message prefixed with `[Tool result]: `.
 * - Any other role is skipped.
 *
 * @param messages Conversation in harness format.
 * @returns A new array in AI SDK format; the input is not modified.
 */
function toModelMessages(messages: AgentMessage[]): ModelMessage[] {
  const converted: ModelMessage[] = [];

  for (const message of messages) {
    switch (message.role) {
      case 'system':
        converted.push({ role: 'system', content: message.content });
        break;

      case 'user':
        converted.push({ role: 'user', content: message.content });
        break;

      case 'assistant': {
        if (!message.toolCalls?.length) {
          // Plain text reply.
          converted.push({ role: 'assistant', content: message.content });
          break;
        }
        // eslint-disable-next-line @typescript-eslint/no-explicit-any
        const leadingText: any[] = message.content ? [{ type: 'text', text: message.content }] : [];
        // eslint-disable-next-line @typescript-eslint/no-explicit-any
        const callParts: any[] = message.toolCalls.map((call) => ({
          type: 'tool-call',
          toolCallId: call.toolCallId,
          toolName: call.toolName,
          input: call.args,
        }));
        converted.push({ role: 'assistant', content: [...leadingText, ...callParts] });
        break;
      }

      case 'tool': {
        if (!message.toolResults?.length) {
          // No structured fields available: surface the raw text as a user message.
          converted.push({ role: 'user', content: `[Tool result]: ${message.content}` });
          break;
        }
        // eslint-disable-next-line @typescript-eslint/no-explicit-any
        const resultParts: any[] = message.toolResults.map((outcome) => ({
          type: 'tool-result',
          toolCallId: outcome.toolCallId,
          toolName: outcome.toolName,
          output: { type: outcome.isError ? 'error-text' : 'text', value: outcome.result },
        }));
        converted.push({ role: 'tool', content: resultParts });
        break;
      }

      default:
        // Unrecognized roles produce no output message.
        break;
    }
  }

  return converted;
}
|
|
156
|
-
|
|
157
|
-
/** Options accepted by the VercelAgentLoop constructor; every field is optional. */
export interface VercelAgentLoopConfig {
  /**
   * Anthropic model id. Falls back to the `HARNESS_MODEL` environment
   * variable, then to `claude-sonnet-4-5`.
   */
  model?: string;

  /** System prompt sent with every request. A built-in prompt is used when omitted. */
  systemPrompt?: string;

  /**
   * Anthropic API key. When provided, the constructor stores it in
   * `process.env.ANTHROPIC_API_KEY`.
   */
  apiKey?: string;

  /** Custom tool definitions. When provided they replace `builtinTools` for LLM calls. */
  tools?: Record<string, AnyTool>;
}
|
|
164
|
-
|
|
165
|
-
/**
 * AgentLoop implementation that asks an Anthropic model (through the Vercel
 * AI SDK) for exactly one step at a time and translates the reply into
 * harness actions or stream events. Tools are declared without `execute`,
 * so the model only *requests* tool calls; running them is the harness's job.
 */
export class VercelAgentLoop implements AgentLoop {
  private readonly model: string;
  private readonly systemPrompt: string;
  private readonly tools: Record<string, AnyTool>;
  /** Names of the configured tools; calls to any other name are rejected or dropped. */
  private readonly validToolNames: Set<string>;

  /**
   * @param config Optional overrides for model, system prompt, API key and tools.
   */
  constructor(config: VercelAgentLoopConfig = {}) {
    this.model = config.model ?? process.env.HARNESS_MODEL ?? 'claude-sonnet-4-5';
    this.tools = config.tools ?? builtinTools;
    this.validToolNames = new Set(Object.keys(this.tools));
    this.systemPrompt =
      config.systemPrompt ??
      [
        'You are an agent that accomplishes tasks using tools.',
        'You may call multiple independent tools in a single turn.',
        'Use tools when shell or filesystem access is required.',
        'For file-generation tasks, prefer one-shot execution with a single Bash command (heredoc/script + run + verify) instead of repeated Write rewrites.',
        'Avoid rewriting the same file multiple times unless a previous run returned an error that requires a fix.',
        'When the task is fully complete, respond with a brief text summary (no tool call).',
      ].join(' ');

    // NOTE(review): this is a process-wide side effect — the key is published
    // through the environment, so the last constructed loop with an apiKey
    // wins for every loop in the process. Kept as-is because callers may rely
    // on the environment variable being set.
    if (config.apiKey) {
      process.env.ANTHROPIC_API_KEY = config.apiKey;
    }
  }

  /**
   * Ask the model for the next step and convert it into an action.
   *
   * @param messages Conversation so far, in harness format.
   * @returns A single tool call, a `tool_batch` when the model requested
   *   several known tools, or a `final` action (the model's trimmed text,
   *   `Done.` when empty, or an `Unknown tool: …` notice when none of the
   *   requested tools is configured). Calls to unknown tools are dropped when
   *   at least one requested tool is known.
   * @throws Error when `ANTHROPIC_API_KEY` is not set.
   */
  async nextAction(messages: AgentMessage[]): Promise<AgentAction> {
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    const result = await (generateText as any)(this.buildRequest(messages));

    // If the model made tool calls, extract them
    if (result.toolCalls && result.toolCalls.length > 0) {
      const validCalls: ToolCallAction[] = [];
      for (const call of result.toolCalls) {
        const name = call.toolName;
        if (!this.validToolNames.has(name)) continue;

        const toolCallId = (call as { toolCallId?: string }).toolCallId;
        validCalls.push({
          type: 'tool',
          name,
          args: (call as { input: Record<string, unknown> }).input,
          // Only attach the id when the SDK supplied one.
          ...(toolCallId != null ? { toolCallId } : {}),
        });
      }

      if (validCalls.length === 0) {
        return { type: 'final', content: `Unknown tool: ${result.toolCalls[0]!.toolName}` };
      }

      // Single call → backward-compatible ToolCallAction
      if (validCalls.length === 1) {
        return validCalls[0]!;
      }

      // Multiple calls → ToolBatchAction
      return { type: 'tool_batch', calls: validCalls } satisfies ToolBatchAction;
    }

    // No tool call — model responded with text (task complete)
    const text = result.text?.trim();
    return { type: 'final', content: text || 'Done.' };
  }

  /**
   * Streaming variant of `nextAction()`.
   *
   * Emits a `text_delta` event for each text chunk and a `tool_start` event
   * for each completed call to a configured tool. Calls to unknown tools and
   * all other stream parts are ignored.
   *
   * @param messages Conversation so far, in harness format.
   * @throws Error when `ANTHROPIC_API_KEY` is not set (raised when iteration starts).
   */
  async *streamAction(messages: AgentMessage[]): AsyncGenerator<AgentStreamEvent> {
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    const result = (streamText as any)(this.buildRequest(messages));

    for await (const part of result.fullStream) {
      if (part.type === 'text-delta') {
        yield { type: 'text_delta', text: part.text };
        continue;
      }

      // Incremental tool-input parts are not needed: the final `tool-call`
      // part already carries the complete arguments.
      if (part.type !== 'tool-call') continue;

      const name = part.toolName;
      if (!this.validToolNames.has(name)) continue;

      // eslint-disable-next-line @typescript-eslint/no-explicit-any
      const p = part as any;
      // Accept either field name for the parsed arguments.
      const args: Record<string, unknown> = p.args ?? p.input ?? {};
      const toolCallId: string | undefined = p.toolCallId;
      yield { type: 'tool_start', name, args, ...(toolCallId != null ? { toolCallId } : {}) };
    }
  }

  /**
   * Build the options object shared by `generateText` and `streamText`.
   * The request is limited to one model step via `stepCountIs(1)`.
   *
   * @throws Error when `ANTHROPIC_API_KEY` is not set.
   */
  private buildRequest(messages: AgentMessage[]) {
    if (!process.env.ANTHROPIC_API_KEY) {
      throw new Error('ANTHROPIC_API_KEY is required for default VercelAgentLoop');
    }

    return {
      model: anthropic(this.model),
      tools: this.tools,
      toolChoice: 'auto',
      system: this.systemPrompt,
      messages: toModelMessages(messages),
      stopWhen: stepCountIs(1),
    };
  }
}
|
|
@@ -1,17 +0,0 @@
|
|
|
1
|
-
import type { SpanRecord } from './otel';
|
|
2
|
-
|
|
3
|
-
/** Snake-cased identifiers used to correlate records with a trace and, optionally, a run and session. */
export interface CorrelationFields {
  /** Trace identifier of the originating span. */
  trace_id: string;
  /** Identifier of the originating span. */
  span_id: string;
  /** Run identifier, when one is known. */
  run_id?: string | undefined;
  /** Session identifier, when one is known. */
  session_id?: string | undefined;
}
|
|
9
|
-
|
|
10
|
-
/**
 * Build correlation fields from a span plus optional run and session ids.
 *
 * @param span Span whose `traceId` and `spanId` are copied.
 * @param runId Optional run identifier.
 * @param sessionId Optional session identifier.
 * @returns An object that always carries all four keys; `run_id` and
 *   `session_id` are `undefined` when the corresponding argument is omitted.
 */
export function correlationFromSpan(span: SpanRecord, runId?: string, sessionId?: string): CorrelationFields {
  const { traceId, spanId } = span;
  const fields: CorrelationFields = {
    trace_id: traceId,
    span_id: spanId,
    run_id: runId,
    session_id: sessionId,
  };
  return fields;
}
|
|
@@ -1,27 +0,0 @@
|
|
|
1
|
-
import { HarnessTelemetry } from './otel';
|
|
2
|
-
|
|
3
|
-
/**
 * Record one finished tool call: increments `tool_calls_total` and adds the
 * duration to the `tool_call_duration_ms` histogram, both labelled with the
 * tool name and success flag.
 *
 * Does nothing when `telemetry` is missing or disabled.
 *
 * @param telemetry Telemetry sink, if any.
 * @param toolName Name of the tool that ran.
 * @param durationMs Duration value recorded in the histogram.
 * @param success Whether the call succeeded.
 */
export function recordToolCallDuration(
  telemetry: HarnessTelemetry | undefined,
  toolName: string,
  durationMs: number,
  success: boolean
): void {
  if (telemetry != null && telemetry.isEnabled()) {
    // Each metric gets its own label object.
    telemetry.counter('tool_calls_total', 1, { tool: toolName, success });
    telemetry.histogram('tool_call_duration_ms', durationMs, { tool: toolName, success });
  }
}
|
|
13
|
-
|
|
14
|
-
/**
 * Increment the `agent_steps_total` counter by one.
 * Does nothing when `telemetry` is missing or disabled.
 */
export function recordAgentStep(telemetry: HarnessTelemetry | undefined): void {
  if (telemetry != null && telemetry.isEnabled()) {
    telemetry.counter('agent_steps_total', 1);
  }
}
|
|
18
|
-
|
|
19
|
-
/**
 * Increment the `compactions_total` counter by one.
 * Does nothing when `telemetry` is missing or disabled.
 */
export function recordCompaction(telemetry: HarnessTelemetry | undefined): void {
  if (telemetry != null && telemetry.isEnabled()) {
    telemetry.counter('compactions_total', 1);
  }
}
|
|
23
|
-
|
|
24
|
-
/**
 * Increment the `agent_errors_total` counter by one.
 * Does nothing when `telemetry` is missing or disabled.
 */
export function recordAgentError(telemetry: HarnessTelemetry | undefined): void {
  if (telemetry != null && telemetry.isEnabled()) {
    telemetry.counter('agent_errors_total', 1);
  }
}
|
|
@@ -1,105 +0,0 @@
|
|
|
1
|
-
import { AsyncLocalStorage } from 'node:async_hooks';
|
|
2
|
-
import { randomUUID } from 'node:crypto';
|
|
3
|
-
|
|
4
|
-
/** One span as stored in memory by HarnessTelemetry. */
export interface SpanRecord {
  /** Trace id: inherited from the active parent span, otherwise 32 hex characters. */
  traceId: string;
  /** Span id: 16 hex characters. */
  spanId: string;
  /** Id of the span that was active when this one started, if any. */
  parentSpanId?: string | undefined;
  name: string;
  /** Start attributes merged with any attributes passed to `end()`. */
  attributes: Record<string, string | number | boolean>;
  /** `Date.now()` at start. */
  startTime: number;
  /** `Date.now()` at the latest `end()` call; equals `startTime` until then. */
  endTime: number;
}

/** One metric data point as stored in memory by HarnessTelemetry. */
export interface MetricRecord {
  name: string;
  value: number;
  type: 'counter' | 'histogram';
  attributes: Record<string, string | number | boolean>;
}

/** Handle returned by `startSpan()`; call `end()` to close the span. */
export interface SpanHandle {
  traceId: string;
  spanId: string;
  /** Stamp the end time and merge `attributes` into the span's attributes. */
  end(attributes?: Record<string, string | number | boolean>): void;
}

/**
 * In-memory telemetry recorder for spans and metrics.
 *
 * Parent/child relationships between spans are tracked with AsyncLocalStorage:
 * spans started inside a `withSpan()` callback inherit its trace id and use
 * it as their parent. When constructed with `enabled = false`, nothing is
 * recorded by any method.
 */
export class HarnessTelemetry {
  private readonly context = new AsyncLocalStorage<{ traceId: string; spanId: string }>();
  private readonly spans: SpanRecord[] = [];
  private readonly metrics: MetricRecord[] = [];

  /** @param enabled Whether spans and metrics are recorded. Defaults to `true`. */
  constructor(private readonly enabled = true) {}

  /** Whether this instance records anything. */
  isEnabled(): boolean {
    return this.enabled;
  }

  /**
   * Start a span and return a handle for ending it.
   *
   * The span is a child of whatever span is active in the current async
   * context. When telemetry is disabled a handle with fresh ids is still
   * returned, but no span is stored and `end()` is a no-op — consistent with
   * `counter()`, `histogram()` and `withSpan()`.
   *
   * @param name Span name.
   * @param attributes Initial attributes; copied, so later changes to the
   *   caller's object do not affect the record.
   */
  startSpan(name: string, attributes: Record<string, string | number | boolean> = {}): SpanHandle {
    const now = Date.now();
    const parent = this.context.getStore();
    const traceId = parent?.traceId ?? randomUUID().replace(/-/g, '');
    const spanId = randomUUID().replace(/-/g, '').slice(0, 16);

    if (!this.enabled) {
      return {
        traceId,
        spanId,
        end: () => {
          // Disabled telemetry keeps no record, so there is nothing to close.
        },
      };
    }

    const record: SpanRecord = {
      traceId,
      spanId,
      parentSpanId: parent?.spanId,
      name,
      attributes: { ...attributes },
      startTime: now,
      endTime: now,
    };
    this.spans.push(record);

    return {
      traceId,
      spanId,
      end: (extra = {}) => {
        record.endTime = Date.now();
        record.attributes = { ...record.attributes, ...extra };
      },
    };
  }

  /**
   * Run `fn` inside a new span that is active for the duration of the call.
   *
   * The span ends with `success: true` when `fn` resolves, or with
   * `success: false` plus an `error` message when it rejects; the rejection
   * is re-thrown. When telemetry is disabled `fn` is simply invoked.
   *
   * @param name Span name.
   * @param attributes Initial span attributes.
   * @param fn Work to trace.
   * @returns Whatever `fn` resolves to.
   */
  async withSpan<T>(
    name: string,
    attributes: Record<string, string | number | boolean>,
    fn: () => Promise<T>
  ): Promise<T> {
    if (!this.enabled) {
      return fn();
    }

    const span = this.startSpan(name, attributes);
    return this.context.run({ traceId: span.traceId, spanId: span.spanId }, async () => {
      try {
        const result = await fn();
        span.end({ success: true });
        return result;
      } catch (error) {
        span.end({ success: false, error: error instanceof Error ? error.message : 'unknown' });
        throw error;
      }
    });
  }

  /**
   * Record a counter data point. No-op when disabled.
   *
   * @param name Metric name.
   * @param value Increment; defaults to 1.
   * @param attributes Labels; copied so later changes to the caller's object
   *   do not alter the stored record.
   */
  counter(name: string, value = 1, attributes: Record<string, string | number | boolean> = {}): void {
    if (!this.enabled) return;
    this.metrics.push({ name, value, type: 'counter', attributes: { ...attributes } });
  }

  /**
   * Record a histogram data point. No-op when disabled.
   *
   * @param name Metric name.
   * @param value Observed value.
   * @param attributes Labels; copied so later changes to the caller's object
   *   do not alter the stored record.
   */
  histogram(name: string, value: number, attributes: Record<string, string | number | boolean> = {}): void {
    if (!this.enabled) return;
    this.metrics.push({ name, value, type: 'histogram', attributes: { ...attributes } });
  }

  /** Snapshot of recorded spans (a new array holding the same record objects). */
  getSpans(): SpanRecord[] {
    return [...this.spans];
  }

  /** Snapshot of recorded metrics (a new array holding the same record objects). */
  getMetrics(): MetricRecord[] {
    return [...this.metrics];
  }
}
|
|
@@ -1,13 +0,0 @@
|
|
|
1
|
-
import { HarnessTelemetry } from './otel';
|
|
2
|
-
|
|
3
|
-
/**
 * Run `fn`, wrapping it in a telemetry span when telemetry is available and enabled.
 *
 * @param telemetry Telemetry sink, if any.
 * @param name Span name.
 * @param attributes Span attributes.
 * @param fn Work to run.
 * @returns Whatever `fn` resolves to; without usable telemetry `fn` is called directly.
 */
export async function traceStep<T>(
  telemetry: HarnessTelemetry | undefined,
  name: string,
  attributes: Record<string, string | number | boolean>,
  fn: () => Promise<T>
): Promise<T> {
  if (telemetry != null && telemetry.isEnabled()) {
    return telemetry.withSpan(name, attributes, fn);
  }
  return fn();
}
|
|
@@ -1,40 +0,0 @@
|
|
|
1
|
-
import { randomUUID } from 'node:crypto';
|
|
2
|
-
|
|
3
|
-
import type { AgentRunResult } from '../agent/create-agent';
|
|
4
|
-
import { TraceCollector } from './trace-collector';
|
|
5
|
-
|
|
6
|
-
/** A single prompt to evaluate, with an optional substring the output must contain. */
export interface EvaluationTask {
  /** Prompt handed to the agent runner. */
  prompt: string;
  /** When set to a non-empty string, the output must include it to score 1. */
  expectedContains?: string;
}

/** Outcome of evaluating one task. */
export interface EvaluationResult {
  /** 1 on pass, 0 when the expected substring is missing from the output. */
  score: number;
  /** Agent output, as returned by the runner. */
  output: string;
  /** Execution trace produced by the TraceCollector for this evaluation. */
  trace: ReturnType<TraceCollector['toExecutionTrace']>;
}

/** Function that runs the agent on a prompt and resolves with its run result. */
export type AgentRunner = (prompt: string) => Promise<AgentRunResult>;
|
|
18
|
-
|
|
19
|
-
/** Scores agent runs against simple substring expectations while collecting a trace. */
export class AgentEvaluator {
  /** @param runAgent Function used to execute the agent for each task. */
  constructor(private readonly runAgent: AgentRunner) {}

  /**
   * Run the agent on `task.prompt` and score the output.
   *
   * A fresh TraceCollector (keyed by a random UUID) receives a `task.start`
   * event before the run and a `task.finish` event after it.
   *
   * @param task Prompt and optional expected substring.
   * @returns Score 0 when `expectedContains` is a non-empty string absent
   *   from the output, otherwise 1; plus the output and the collected trace.
   */
  async evaluate(task: EvaluationTask): Promise<EvaluationResult> {
    const collector = new TraceCollector(randomUUID());
    collector.add('task.start', { prompt: task.prompt });

    const { output, steps } = await this.runAgent(task.prompt);
    collector.add('task.finish', { output, steps });

    // An unset or empty expectation always passes.
    const passed = !task.expectedContains || output.includes(task.expectedContains);

    return {
      score: passed ? 1 : 0,
      output,
      trace: collector.toExecutionTrace(),
    };
  }
}
|
|
@@ -1,16 +0,0 @@
|
|
|
1
|
-
import { createHash } from 'node:crypto';
|
|
2
|
-
|
|
3
|
-
/** Deterministic JSON serialization and hashing for configuration objects. */
export class ConfigSerializer {
  /**
   * Serialize `config` as 2-space-indented JSON with object keys sorted at
   * every nesting level, so equal configs produce identical text whatever
   * their key insertion order.
   *
   * An array replacer (`Object.keys(config).sort()`) must not be used here:
   * it is an allow-list applied at all depths, so nested properties whose
   * names are not also top-level keys would be dropped from the output.
   *
   * @param config Configuration object to serialize.
   * @returns Canonical JSON text.
   */
  static serialize(config: Record<string, unknown>): string {
    return JSON.stringify(config, ConfigSerializer.sortObjectKeys, 2);
  }

  /**
   * Parse JSON text produced by `serialize()`.
   *
   * @param serialized JSON text.
   * @returns The parsed value, typed as a plain record (not validated).
   * @throws SyntaxError when `serialized` is not valid JSON.
   */
  static deserialize(serialized: string): Record<string, unknown> {
    return JSON.parse(serialized) as Record<string, unknown>;
  }

  /**
   * SHA-256 hex digest of the canonical serialization of `config`.
   *
   * @param config Configuration object to hash.
   * @returns 64-character lowercase hex string.
   */
  static hash(config: Record<string, unknown>): string {
    const serialized = ConfigSerializer.serialize(config);
    return createHash('sha256').update(serialized).digest('hex');
  }

  /**
   * JSON.stringify replacer: returns a copy of each non-array object with its
   * keys in sorted order and leaves every other value untouched.
   * JSON.stringify then recurses into the returned copy, so sorting applies
   * at every level.
   */
  private static sortObjectKeys(_key: string, value: unknown): unknown {
    if (value === null || typeof value !== 'object' || Array.isArray(value)) {
      return value;
    }
    const source = value as Record<string, unknown>;
    // Object.fromEntries defines own properties, so a "__proto__" key is
    // preserved as data instead of changing the copy's prototype.
    return Object.fromEntries(
      Object.keys(source)
        .sort()
        .map((key): [string, unknown] => [key, source[key]])
    );
  }
}
|