@agentv/core 2.0.1 → 2.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.cts CHANGED
@@ -396,7 +396,7 @@ interface ChatMessage {
396
396
  readonly name?: string;
397
397
  }
398
398
  type ChatPrompt = readonly ChatMessage[];
399
- type ProviderKind = 'azure' | 'anthropic' | 'gemini' | 'codex' | 'pi-coding-agent' | 'claude-code' | 'cli' | 'mock' | 'vscode' | 'vscode-insiders';
399
+ type ProviderKind = 'azure' | 'anthropic' | 'gemini' | 'codex' | 'pi-coding-agent' | 'pi-agent-sdk' | 'claude-code' | 'cli' | 'mock' | 'vscode' | 'vscode-insiders';
400
400
  interface ProviderRequest {
401
401
  readonly question: string;
402
402
  readonly systemPrompt?: string;
@@ -805,6 +805,13 @@ interface PiCodingAgentResolvedConfig {
805
805
  readonly logFormat?: 'summary' | 'json';
806
806
  readonly systemPrompt?: string;
807
807
  }
808
+ interface PiAgentSdkResolvedConfig {
809
+ readonly provider?: string;
810
+ readonly model?: string;
811
+ readonly apiKey?: string;
812
+ readonly timeoutMs?: number;
813
+ readonly systemPrompt?: string;
814
+ }
808
815
  interface ClaudeCodeResolvedConfig {
809
816
  readonly executable: string;
810
817
  readonly model?: string;
@@ -863,6 +870,13 @@ type ResolvedTarget = {
863
870
  readonly workers?: number;
864
871
  readonly providerBatching?: boolean;
865
872
  readonly config: PiCodingAgentResolvedConfig;
873
+ } | {
874
+ readonly kind: 'pi-agent-sdk';
875
+ readonly name: string;
876
+ readonly judgeTarget?: string;
877
+ readonly workers?: number;
878
+ readonly providerBatching?: boolean;
879
+ readonly config: PiAgentSdkResolvedConfig;
866
880
  } | {
867
881
  readonly kind: 'claude-code';
868
882
  readonly name: string;
@@ -1211,37 +1225,9 @@ interface GenerateRubricsOptions {
1211
1225
  */
1212
1226
  declare function generateRubrics(options: GenerateRubricsOptions): Promise<readonly RubricItem[]>;
1213
1227
 
1214
- /**
1215
- * Payload received by code judges via stdin.
1216
- * All properties use camelCase for TypeScript ergonomics.
1217
- */
1218
- interface CodeJudgePayload {
1219
- readonly question: string;
1220
- readonly expectedOutcome: string;
1221
- readonly expectedMessages: readonly JsonObject[];
1222
- readonly referenceAnswer?: string;
1223
- readonly candidateAnswer: string;
1224
- readonly outputMessages?: readonly OutputMessage[] | null;
1225
- readonly guidelineFiles: readonly string[];
1226
- readonly inputFiles: readonly string[];
1227
- readonly inputMessages: readonly TestMessage[];
1228
- readonly traceSummary?: TraceSummary | null;
1229
- readonly config?: JsonObject | null;
1230
- }
1231
- /**
1232
- * Parse stdin JSON (snake_case) into typed camelCase object.
1233
- * Use this in TypeScript code judges to get type-safe, idiomatic input.
1234
- */
1235
- declare function parseCodeJudgePayload(payload: string): CodeJudgePayload;
1236
- /**
1237
- * Convenience helper that reads stdin and parses it.
1238
- * Equivalent to: parseCodeJudgePayload(readFileSync(0, 'utf8'))
1239
- */
1240
- declare function readCodeJudgePayload(): CodeJudgePayload;
1241
-
1242
1228
  type AgentKernel = {
1243
1229
  status: string;
1244
1230
  };
1245
1231
  declare function createAgentKernel(): AgentKernel;
1246
1232
 
1247
- export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type ChildEvaluatorResult, type ClaudeCodeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CodeJudgePayload, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EXPLORATION_TOOLS, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorFactory, type EvaluatorKind, type EvaluatorResult, type ExecutionMetrics, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type MockResolvedConfig, type PiCodingAgentResolvedConfig, type ProgressEvent, type PromptInputs, type Provider, type ProviderKind, type ProviderRequest, type ProviderResponse, type ProviderTokenUsage, type ResolvedTarget, type RubricItem, type RunEvalCaseOptions, type RunEvaluationOptions, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetDefinition, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceSummary, type UserTestMessage, type VSCodeResolvedConfig, avgToolDurationMs, buildDirectoryChain, buildPromptInputs, buildSearchRoots, computeTraceSummary, consumeClaudeCodeLogEntries, consumeCodexLogEntries, consumePiLogEntries, createAgentKernel, createProvider, ensureVSCodeSubagents, explorationRatio, extractCodeBlocks, fileExists, findGitRoot, generateRubrics, getHitCount, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isTestMessage, isTestMessageRole, listTargetNames, loadEvalCases, mergeExecutionMetrics, normalizeLineEndings, parseCodeJudgePayload, readCodeJudgePayload, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, runEvalCase, runEvaluation, subscribeToClaudeCodeLogEntries, subscribeToCodexLogEntries, subscribeToPiLogEntries, tokensPerTool };
1233
+ export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type ChildEvaluatorResult, type ClaudeCodeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EXPLORATION_TOOLS, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorFactory, type EvaluatorKind, type EvaluatorResult, type ExecutionMetrics, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type MockResolvedConfig, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type ProgressEvent, type PromptInputs, type Provider, type ProviderKind, type ProviderRequest, type ProviderResponse, type ProviderTokenUsage, type ResolvedTarget, type RubricItem, type RunEvalCaseOptions, type RunEvaluationOptions, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetDefinition, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceSummary, type UserTestMessage, type VSCodeResolvedConfig, avgToolDurationMs, buildDirectoryChain, buildPromptInputs, buildSearchRoots, computeTraceSummary, consumeClaudeCodeLogEntries, consumeCodexLogEntries, consumePiLogEntries, createAgentKernel, createProvider, ensureVSCodeSubagents, explorationRatio, extractCodeBlocks, fileExists, findGitRoot, generateRubrics, getHitCount, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isTestMessage, isTestMessageRole, listTargetNames, loadEvalCases, mergeExecutionMetrics, normalizeLineEndings, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, runEvalCase, runEvaluation, subscribeToClaudeCodeLogEntries, subscribeToCodexLogEntries, subscribeToPiLogEntries, tokensPerTool };
package/dist/index.d.ts CHANGED
@@ -396,7 +396,7 @@ interface ChatMessage {
396
396
  readonly name?: string;
397
397
  }
398
398
  type ChatPrompt = readonly ChatMessage[];
399
- type ProviderKind = 'azure' | 'anthropic' | 'gemini' | 'codex' | 'pi-coding-agent' | 'claude-code' | 'cli' | 'mock' | 'vscode' | 'vscode-insiders';
399
+ type ProviderKind = 'azure' | 'anthropic' | 'gemini' | 'codex' | 'pi-coding-agent' | 'pi-agent-sdk' | 'claude-code' | 'cli' | 'mock' | 'vscode' | 'vscode-insiders';
400
400
  interface ProviderRequest {
401
401
  readonly question: string;
402
402
  readonly systemPrompt?: string;
@@ -805,6 +805,13 @@ interface PiCodingAgentResolvedConfig {
805
805
  readonly logFormat?: 'summary' | 'json';
806
806
  readonly systemPrompt?: string;
807
807
  }
808
+ interface PiAgentSdkResolvedConfig {
809
+ readonly provider?: string;
810
+ readonly model?: string;
811
+ readonly apiKey?: string;
812
+ readonly timeoutMs?: number;
813
+ readonly systemPrompt?: string;
814
+ }
808
815
  interface ClaudeCodeResolvedConfig {
809
816
  readonly executable: string;
810
817
  readonly model?: string;
@@ -863,6 +870,13 @@ type ResolvedTarget = {
863
870
  readonly workers?: number;
864
871
  readonly providerBatching?: boolean;
865
872
  readonly config: PiCodingAgentResolvedConfig;
873
+ } | {
874
+ readonly kind: 'pi-agent-sdk';
875
+ readonly name: string;
876
+ readonly judgeTarget?: string;
877
+ readonly workers?: number;
878
+ readonly providerBatching?: boolean;
879
+ readonly config: PiAgentSdkResolvedConfig;
866
880
  } | {
867
881
  readonly kind: 'claude-code';
868
882
  readonly name: string;
@@ -1211,37 +1225,9 @@ interface GenerateRubricsOptions {
1211
1225
  */
1212
1226
  declare function generateRubrics(options: GenerateRubricsOptions): Promise<readonly RubricItem[]>;
1213
1227
 
1214
- /**
1215
- * Payload received by code judges via stdin.
1216
- * All properties use camelCase for TypeScript ergonomics.
1217
- */
1218
- interface CodeJudgePayload {
1219
- readonly question: string;
1220
- readonly expectedOutcome: string;
1221
- readonly expectedMessages: readonly JsonObject[];
1222
- readonly referenceAnswer?: string;
1223
- readonly candidateAnswer: string;
1224
- readonly outputMessages?: readonly OutputMessage[] | null;
1225
- readonly guidelineFiles: readonly string[];
1226
- readonly inputFiles: readonly string[];
1227
- readonly inputMessages: readonly TestMessage[];
1228
- readonly traceSummary?: TraceSummary | null;
1229
- readonly config?: JsonObject | null;
1230
- }
1231
- /**
1232
- * Parse stdin JSON (snake_case) into typed camelCase object.
1233
- * Use this in TypeScript code judges to get type-safe, idiomatic input.
1234
- */
1235
- declare function parseCodeJudgePayload(payload: string): CodeJudgePayload;
1236
- /**
1237
- * Convenience helper that reads stdin and parses it.
1238
- * Equivalent to: parseCodeJudgePayload(readFileSync(0, 'utf8'))
1239
- */
1240
- declare function readCodeJudgePayload(): CodeJudgePayload;
1241
-
1242
1228
  type AgentKernel = {
1243
1229
  status: string;
1244
1230
  };
1245
1231
  declare function createAgentKernel(): AgentKernel;
1246
1232
 
1247
- export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type ChildEvaluatorResult, type ClaudeCodeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CodeJudgePayload, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EXPLORATION_TOOLS, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorFactory, type EvaluatorKind, type EvaluatorResult, type ExecutionMetrics, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type MockResolvedConfig, type PiCodingAgentResolvedConfig, type ProgressEvent, type PromptInputs, type Provider, type ProviderKind, type ProviderRequest, type ProviderResponse, type ProviderTokenUsage, type ResolvedTarget, type RubricItem, type RunEvalCaseOptions, type RunEvaluationOptions, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetDefinition, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceSummary, type UserTestMessage, type VSCodeResolvedConfig, avgToolDurationMs, buildDirectoryChain, buildPromptInputs, buildSearchRoots, computeTraceSummary, consumeClaudeCodeLogEntries, consumeCodexLogEntries, consumePiLogEntries, createAgentKernel, createProvider, ensureVSCodeSubagents, explorationRatio, extractCodeBlocks, fileExists, findGitRoot, generateRubrics, getHitCount, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isTestMessage, isTestMessageRole, listTargetNames, loadEvalCases, mergeExecutionMetrics, normalizeLineEndings, parseCodeJudgePayload, readCodeJudgePayload, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, runEvalCase, runEvaluation, subscribeToClaudeCodeLogEntries, subscribeToCodexLogEntries, subscribeToPiLogEntries, tokensPerTool };
1233
+ export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type ChildEvaluatorResult, type ClaudeCodeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EXPLORATION_TOOLS, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorFactory, type EvaluatorKind, type EvaluatorResult, type ExecutionMetrics, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type MockResolvedConfig, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type ProgressEvent, type PromptInputs, type Provider, type ProviderKind, type ProviderRequest, type ProviderResponse, type ProviderTokenUsage, type ResolvedTarget, type RubricItem, type RunEvalCaseOptions, type RunEvaluationOptions, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetDefinition, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceSummary, type UserTestMessage, type VSCodeResolvedConfig, avgToolDurationMs, buildDirectoryChain, buildPromptInputs, buildSearchRoots, computeTraceSummary, consumeClaudeCodeLogEntries, consumeCodexLogEntries, consumePiLogEntries, createAgentKernel, createProvider, ensureVSCodeSubagents, explorationRatio, extractCodeBlocks, fileExists, findGitRoot, generateRubrics, getHitCount, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isTestMessage, isTestMessageRole, listTargetNames, loadEvalCases, mergeExecutionMetrics, normalizeLineEndings, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, runEvalCase, runEvaluation, subscribeToClaudeCodeLogEntries, subscribeToCodexLogEntries, subscribeToPiLogEntries, tokensPerTool };
package/dist/index.js CHANGED
@@ -10,7 +10,7 @@ import {
10
10
  readTextFile,
11
11
  resolveFileReference,
12
12
  resolveTargetDefinition
13
- } from "./chunk-IBTKEEOT.js";
13
+ } from "./chunk-KDEP4I7G.js";
14
14
 
15
15
  // src/evaluation/types.ts
16
16
  var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
@@ -4084,6 +4084,167 @@ var MockProvider = class {
4084
4084
  }
4085
4085
  };
4086
4086
 
4087
+ // src/evaluation/providers/pi-agent-sdk.ts
4088
+ var piAgentModule = null;
4089
+ var piAiModule = null;
4090
+ async function loadPiModules() {
4091
+ if (!piAgentModule || !piAiModule) {
4092
+ try {
4093
+ [piAgentModule, piAiModule] = await Promise.all([
4094
+ import("@mariozechner/pi-agent"),
4095
+ import("@mariozechner/pi-ai")
4096
+ ]);
4097
+ } catch (error) {
4098
+ throw new Error(
4099
+ `Failed to load pi-agent-sdk dependencies. Please install them:
4100
+ npm install @mariozechner/pi-agent @mariozechner/pi-ai
4101
+
4102
+ Original error: ${error instanceof Error ? error.message : String(error)}`
4103
+ );
4104
+ }
4105
+ }
4106
+ return {
4107
+ Agent: piAgentModule.Agent,
4108
+ ProviderTransport: piAgentModule.ProviderTransport,
4109
+ getModel: piAiModule.getModel,
4110
+ getEnvApiKey: piAiModule.getEnvApiKey
4111
+ };
4112
+ }
4113
+ var PiAgentSdkProvider = class {
4114
+ id;
4115
+ kind = "pi-agent-sdk";
4116
+ targetName;
4117
+ supportsBatch = false;
4118
+ config;
4119
+ constructor(targetName, config) {
4120
+ this.id = `pi-agent-sdk:${targetName}`;
4121
+ this.targetName = targetName;
4122
+ this.config = config;
4123
+ }
4124
+ async invoke(request) {
4125
+ if (request.signal?.aborted) {
4126
+ throw new Error("Pi agent SDK request was aborted before execution");
4127
+ }
4128
+ const { Agent, ProviderTransport, getModel, getEnvApiKey } = await loadPiModules();
4129
+ const startTime = Date.now();
4130
+ const providerName = this.config.provider ?? "anthropic";
4131
+ const modelId = this.config.model ?? "claude-sonnet-4-20250514";
4132
+ const model = getModel(providerName, modelId);
4133
+ const systemPrompt = this.config.systemPrompt ?? "Answer directly and concisely.";
4134
+ const transport = new ProviderTransport({
4135
+ getApiKey: async (provider) => {
4136
+ return this.config.apiKey ?? getEnvApiKey(provider) ?? void 0;
4137
+ }
4138
+ });
4139
+ const agent = new Agent({
4140
+ initialState: {
4141
+ systemPrompt,
4142
+ model,
4143
+ tools: [],
4144
+ // No tools for simple Q&A
4145
+ messages: []
4146
+ },
4147
+ transport
4148
+ });
4149
+ const outputMessages = [];
4150
+ let finalAssistantContent = "";
4151
+ const unsubscribe = agent.subscribe((event) => {
4152
+ if (event.type === "message_end") {
4153
+ const msg = event.message;
4154
+ if (msg.role === "assistant") {
4155
+ const content = extractTextContent2(msg.content);
4156
+ if (content) {
4157
+ finalAssistantContent = content;
4158
+ }
4159
+ }
4160
+ }
4161
+ });
4162
+ try {
4163
+ const timeoutMs = this.config.timeoutMs ?? 12e4;
4164
+ const timeoutPromise = new Promise((_, reject) => {
4165
+ setTimeout(
4166
+ () => reject(new Error(`Pi agent SDK timed out after ${timeoutMs}ms`)),
4167
+ timeoutMs
4168
+ );
4169
+ });
4170
+ await Promise.race([agent.prompt(request.question), timeoutPromise]);
4171
+ await agent.waitForIdle();
4172
+ const agentMessages = agent.state.messages;
4173
+ for (const msg of agentMessages) {
4174
+ outputMessages.push(convertAgentMessage(msg));
4175
+ }
4176
+ const durationMs = Date.now() - startTime;
4177
+ return {
4178
+ raw: {
4179
+ messages: agentMessages,
4180
+ systemPrompt,
4181
+ model: this.config.model,
4182
+ provider: this.config.provider
4183
+ },
4184
+ outputMessages,
4185
+ durationMs
4186
+ };
4187
+ } finally {
4188
+ unsubscribe();
4189
+ }
4190
+ }
4191
+ };
4192
+ function extractTextContent2(content) {
4193
+ if (typeof content === "string") {
4194
+ return content;
4195
+ }
4196
+ if (!Array.isArray(content)) {
4197
+ return void 0;
4198
+ }
4199
+ const textParts = [];
4200
+ for (const part of content) {
4201
+ if (!part || typeof part !== "object") {
4202
+ continue;
4203
+ }
4204
+ const p = part;
4205
+ if (p.type === "text" && typeof p.text === "string") {
4206
+ textParts.push(p.text);
4207
+ }
4208
+ }
4209
+ return textParts.length > 0 ? textParts.join("\n") : void 0;
4210
+ }
4211
+ function convertAgentMessage(message) {
4212
+ if (!message || typeof message !== "object") {
4213
+ return { role: "unknown", content: String(message) };
4214
+ }
4215
+ const msg = message;
4216
+ const role = typeof msg.role === "string" ? msg.role : "unknown";
4217
+ const content = extractTextContent2(msg.content);
4218
+ const toolCalls = extractToolCalls2(msg.content);
4219
+ const timestamp = typeof msg.timestamp === "number" ? new Date(msg.timestamp).toISOString() : typeof msg.timestamp === "string" ? msg.timestamp : void 0;
4220
+ return {
4221
+ role,
4222
+ content,
4223
+ toolCalls: toolCalls.length > 0 ? toolCalls : void 0,
4224
+ timestamp
4225
+ };
4226
+ }
4227
+ function extractToolCalls2(content) {
4228
+ if (!Array.isArray(content)) {
4229
+ return [];
4230
+ }
4231
+ const toolCalls = [];
4232
+ for (const part of content) {
4233
+ if (!part || typeof part !== "object") {
4234
+ continue;
4235
+ }
4236
+ const p = part;
4237
+ if (p.type === "tool_use" && typeof p.name === "string") {
4238
+ toolCalls.push({
4239
+ tool: p.name,
4240
+ input: p.input,
4241
+ id: typeof p.id === "string" ? p.id : void 0
4242
+ });
4243
+ }
4244
+ }
4245
+ return toolCalls;
4246
+ }
4247
+
4087
4248
  // src/evaluation/providers/pi-coding-agent.ts
4088
4249
  import { spawn as spawn3 } from "node:child_process";
4089
4250
  import { randomUUID as randomUUID3 } from "node:crypto";
@@ -4599,8 +4760,8 @@ function convertPiMessage(message) {
4599
4760
  if (typeof role !== "string") {
4600
4761
  return void 0;
4601
4762
  }
4602
- const content = extractTextContent2(msg.content);
4603
- const toolCalls = extractToolCalls2(msg.content);
4763
+ const content = extractTextContent3(msg.content);
4764
+ const toolCalls = extractToolCalls3(msg.content);
4604
4765
  const timestamp = typeof msg.timestamp === "number" ? new Date(msg.timestamp).toISOString() : typeof msg.timestamp === "string" ? msg.timestamp : void 0;
4605
4766
  const metadata = {};
4606
4767
  if (msg.api) metadata.api = msg.api;
@@ -4616,7 +4777,7 @@ function convertPiMessage(message) {
4616
4777
  metadata: Object.keys(metadata).length > 0 ? metadata : void 0
4617
4778
  };
4618
4779
  }
4619
- function extractTextContent2(content) {
4780
+ function extractTextContent3(content) {
4620
4781
  if (typeof content === "string") {
4621
4782
  return content;
4622
4783
  }
@@ -4635,7 +4796,7 @@ function extractTextContent2(content) {
4635
4796
  }
4636
4797
  return textParts.length > 0 ? textParts.join("\n") : void 0;
4637
4798
  }
4638
- function extractToolCalls2(content) {
4799
+ function extractToolCalls3(content) {
4639
4800
  if (!Array.isArray(content)) {
4640
4801
  return [];
4641
4802
  }
@@ -5130,6 +5291,8 @@ function createProvider(target) {
5130
5291
  return new CodexProvider(target.name, target.config);
5131
5292
  case "pi-coding-agent":
5132
5293
  return new PiCodingAgentProvider(target.name, target.config);
5294
+ case "pi-agent-sdk":
5295
+ return new PiAgentSdkProvider(target.name, target.config);
5133
5296
  case "claude-code":
5134
5297
  return new ClaudeCodeProvider(target.name, target.config);
5135
5298
  case "mock":
@@ -5297,12 +5460,6 @@ function toSnakeCase(str) {
5297
5460
  }
5298
5461
  return str.replace(/[A-Z]/g, (letter) => `_${letter.toLowerCase()}`);
5299
5462
  }
5300
- function toCamelCase(str) {
5301
- if (/^[A-Z]/.test(str)) {
5302
- return str;
5303
- }
5304
- return str.replace(/_([a-z0-9])/g, (_, letter) => letter.toUpperCase());
5305
- }
5306
5463
  function toSnakeCaseDeep(obj) {
5307
5464
  if (obj === null || obj === void 0) {
5308
5465
  return obj;
@@ -5320,23 +5477,6 @@ function toSnakeCaseDeep(obj) {
5320
5477
  }
5321
5478
  return obj;
5322
5479
  }
5323
- function toCamelCaseDeep(obj) {
5324
- if (obj === null || obj === void 0) {
5325
- return obj;
5326
- }
5327
- if (Array.isArray(obj)) {
5328
- return obj.map((item) => toCamelCaseDeep(item));
5329
- }
5330
- if (typeof obj === "object") {
5331
- const result = {};
5332
- for (const [key, value] of Object.entries(obj)) {
5333
- const camelKey = toCamelCase(key);
5334
- result[camelKey] = toCamelCaseDeep(value);
5335
- }
5336
- return result;
5337
- }
5338
- return obj;
5339
- }
5340
5480
 
5341
5481
  // src/evaluation/evaluators.ts
5342
5482
  var DEFAULT_EVALUATOR_TEMPLATE = `You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.
@@ -8081,17 +8221,6 @@ function buildPrompt(expectedOutcome, question, referenceAnswer) {
8081
8221
  return parts.join("\n");
8082
8222
  }
8083
8223
 
8084
- // src/evaluation/code-judge-sdk.ts
8085
- import { readFileSync } from "node:fs";
8086
- function parseCodeJudgePayload(payload) {
8087
- const parsed = JSON.parse(payload);
8088
- return toCamelCaseDeep(parsed);
8089
- }
8090
- function readCodeJudgePayload() {
8091
- const stdin = readFileSync(0, "utf8");
8092
- return parseCodeJudgePayload(stdin);
8093
- }
8094
-
8095
8224
  // src/index.ts
8096
8225
  function createAgentKernel() {
8097
8226
  return { status: "stub" };
@@ -8134,8 +8263,6 @@ export {
8134
8263
  loadEvalCases,
8135
8264
  mergeExecutionMetrics,
8136
8265
  normalizeLineEndings,
8137
- parseCodeJudgePayload,
8138
- readCodeJudgePayload,
8139
8266
  readJsonFile,
8140
8267
  readTargetDefinitions,
8141
8268
  readTestSuiteMetadata,