@agentv/core 0.7.5 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.cts CHANGED
@@ -132,6 +132,7 @@ interface EvaluationResult {
132
132
  readonly raw_request?: JsonObject;
133
133
  readonly evaluator_raw_request?: JsonObject;
134
134
  readonly evaluator_results?: readonly EvaluatorResult[];
135
+ readonly error?: string;
135
136
  }
136
137
  interface EvaluatorResult {
137
138
  readonly name: string;
@@ -252,11 +253,81 @@ type EnvLookup = Readonly<Record<string, string | undefined>>;
252
253
  interface TargetDefinition {
253
254
  readonly name: string;
254
255
  readonly provider: ProviderKind | string;
255
- readonly settings?: Record<string, unknown> | undefined;
256
256
  readonly judge_target?: string | undefined;
257
257
  readonly workers?: number | undefined;
258
+ readonly provider_batching?: boolean | undefined;
259
+ readonly providerBatching?: boolean | undefined;
260
+ readonly endpoint?: string | unknown | undefined;
261
+ readonly resource?: string | unknown | undefined;
262
+ readonly resourceName?: string | unknown | undefined;
263
+ readonly api_key?: string | unknown | undefined;
264
+ readonly apiKey?: string | unknown | undefined;
265
+ readonly deployment?: string | unknown | undefined;
266
+ readonly deploymentName?: string | unknown | undefined;
267
+ readonly model?: string | unknown | undefined;
268
+ readonly version?: string | unknown | undefined;
269
+ readonly api_version?: string | unknown | undefined;
270
+ readonly variant?: string | unknown | undefined;
271
+ readonly thinking_budget?: number | unknown | undefined;
272
+ readonly thinkingBudget?: number | unknown | undefined;
273
+ readonly temperature?: number | unknown | undefined;
274
+ readonly max_output_tokens?: number | unknown | undefined;
275
+ readonly maxTokens?: number | unknown | undefined;
276
+ readonly executable?: string | unknown | undefined;
277
+ readonly command?: string | unknown | undefined;
278
+ readonly binary?: string | unknown | undefined;
279
+ readonly args?: unknown | undefined;
280
+ readonly arguments?: unknown | undefined;
281
+ readonly cwd?: string | unknown | undefined;
282
+ readonly timeout_seconds?: number | unknown | undefined;
283
+ readonly timeoutSeconds?: number | unknown | undefined;
284
+ readonly log_dir?: string | unknown | undefined;
285
+ readonly logDir?: string | unknown | undefined;
286
+ readonly log_directory?: string | unknown | undefined;
287
+ readonly logDirectory?: string | unknown | undefined;
288
+ readonly log_format?: string | unknown | undefined;
289
+ readonly logFormat?: string | unknown | undefined;
290
+ readonly log_output_format?: string | unknown | undefined;
291
+ readonly logOutputFormat?: string | unknown | undefined;
292
+ readonly response?: string | unknown | undefined;
293
+ readonly delayMs?: number | unknown | undefined;
294
+ readonly delayMinMs?: number | unknown | undefined;
295
+ readonly delayMaxMs?: number | unknown | undefined;
296
+ readonly vscode_cmd?: string | unknown | undefined;
297
+ readonly wait?: boolean | unknown | undefined;
298
+ readonly dry_run?: boolean | unknown | undefined;
299
+ readonly dryRun?: boolean | unknown | undefined;
300
+ readonly subagent_root?: string | unknown | undefined;
301
+ readonly subagentRoot?: string | unknown | undefined;
302
+ readonly workspace_template?: string | unknown | undefined;
303
+ readonly workspaceTemplate?: string | unknown | undefined;
304
+ readonly command_template?: string | unknown | undefined;
305
+ readonly commandTemplate?: string | unknown | undefined;
306
+ readonly files_format?: string | unknown | undefined;
307
+ readonly filesFormat?: string | unknown | undefined;
308
+ readonly attachments_format?: string | unknown | undefined;
309
+ readonly attachmentsFormat?: string | unknown | undefined;
310
+ readonly env?: unknown | undefined;
311
+ readonly healthcheck?: unknown | undefined;
312
+ readonly max_retries?: number | unknown | undefined;
313
+ readonly maxRetries?: number | unknown | undefined;
314
+ readonly retry_initial_delay_ms?: number | unknown | undefined;
315
+ readonly retryInitialDelayMs?: number | unknown | undefined;
316
+ readonly retry_max_delay_ms?: number | unknown | undefined;
317
+ readonly retryMaxDelayMs?: number | unknown | undefined;
318
+ readonly retry_backoff_factor?: number | unknown | undefined;
319
+ readonly retryBackoffFactor?: number | unknown | undefined;
320
+ readonly retry_status_codes?: unknown | undefined;
321
+ readonly retryStatusCodes?: unknown | undefined;
258
322
  }
259
323
 
324
+ interface RetryConfig {
325
+ readonly maxRetries?: number;
326
+ readonly initialDelayMs?: number;
327
+ readonly maxDelayMs?: number;
328
+ readonly backoffFactor?: number;
329
+ readonly retryableStatusCodes?: readonly number[];
330
+ }
260
331
  interface AzureResolvedConfig {
261
332
  readonly resourceName: string;
262
333
  readonly deploymentName: string;
@@ -264,6 +335,7 @@ interface AzureResolvedConfig {
264
335
  readonly version?: string;
265
336
  readonly temperature?: number;
266
337
  readonly maxOutputTokens?: number;
338
+ readonly retry?: RetryConfig;
267
339
  }
268
340
  interface AnthropicResolvedConfig {
269
341
  readonly apiKey: string;
@@ -271,12 +343,14 @@ interface AnthropicResolvedConfig {
271
343
  readonly temperature?: number;
272
344
  readonly maxOutputTokens?: number;
273
345
  readonly thinkingBudget?: number;
346
+ readonly retry?: RetryConfig;
274
347
  }
275
348
  interface GeminiResolvedConfig {
276
349
  readonly apiKey: string;
277
350
  readonly model: string;
278
351
  readonly temperature?: number;
279
352
  readonly maxOutputTokens?: number;
353
+ readonly retry?: RetryConfig;
280
354
  }
281
355
  interface CodexResolvedConfig {
282
356
  readonly executable: string;
@@ -313,7 +387,6 @@ interface CliResolvedConfig {
313
387
  readonly commandTemplate: string;
314
388
  readonly filesFormat?: string;
315
389
  readonly cwd?: string;
316
- readonly env?: Record<string, string>;
317
390
  readonly timeoutMs?: number;
318
391
  readonly healthcheck?: CliHealthcheck;
319
392
  }
package/dist/index.d.ts CHANGED
@@ -132,6 +132,7 @@ interface EvaluationResult {
132
132
  readonly raw_request?: JsonObject;
133
133
  readonly evaluator_raw_request?: JsonObject;
134
134
  readonly evaluator_results?: readonly EvaluatorResult[];
135
+ readonly error?: string;
135
136
  }
136
137
  interface EvaluatorResult {
137
138
  readonly name: string;
@@ -252,11 +253,81 @@ type EnvLookup = Readonly<Record<string, string | undefined>>;
252
253
  interface TargetDefinition {
253
254
  readonly name: string;
254
255
  readonly provider: ProviderKind | string;
255
- readonly settings?: Record<string, unknown> | undefined;
256
256
  readonly judge_target?: string | undefined;
257
257
  readonly workers?: number | undefined;
258
+ readonly provider_batching?: boolean | undefined;
259
+ readonly providerBatching?: boolean | undefined;
260
+ readonly endpoint?: string | unknown | undefined;
261
+ readonly resource?: string | unknown | undefined;
262
+ readonly resourceName?: string | unknown | undefined;
263
+ readonly api_key?: string | unknown | undefined;
264
+ readonly apiKey?: string | unknown | undefined;
265
+ readonly deployment?: string | unknown | undefined;
266
+ readonly deploymentName?: string | unknown | undefined;
267
+ readonly model?: string | unknown | undefined;
268
+ readonly version?: string | unknown | undefined;
269
+ readonly api_version?: string | unknown | undefined;
270
+ readonly variant?: string | unknown | undefined;
271
+ readonly thinking_budget?: number | unknown | undefined;
272
+ readonly thinkingBudget?: number | unknown | undefined;
273
+ readonly temperature?: number | unknown | undefined;
274
+ readonly max_output_tokens?: number | unknown | undefined;
275
+ readonly maxTokens?: number | unknown | undefined;
276
+ readonly executable?: string | unknown | undefined;
277
+ readonly command?: string | unknown | undefined;
278
+ readonly binary?: string | unknown | undefined;
279
+ readonly args?: unknown | undefined;
280
+ readonly arguments?: unknown | undefined;
281
+ readonly cwd?: string | unknown | undefined;
282
+ readonly timeout_seconds?: number | unknown | undefined;
283
+ readonly timeoutSeconds?: number | unknown | undefined;
284
+ readonly log_dir?: string | unknown | undefined;
285
+ readonly logDir?: string | unknown | undefined;
286
+ readonly log_directory?: string | unknown | undefined;
287
+ readonly logDirectory?: string | unknown | undefined;
288
+ readonly log_format?: string | unknown | undefined;
289
+ readonly logFormat?: string | unknown | undefined;
290
+ readonly log_output_format?: string | unknown | undefined;
291
+ readonly logOutputFormat?: string | unknown | undefined;
292
+ readonly response?: string | unknown | undefined;
293
+ readonly delayMs?: number | unknown | undefined;
294
+ readonly delayMinMs?: number | unknown | undefined;
295
+ readonly delayMaxMs?: number | unknown | undefined;
296
+ readonly vscode_cmd?: string | unknown | undefined;
297
+ readonly wait?: boolean | unknown | undefined;
298
+ readonly dry_run?: boolean | unknown | undefined;
299
+ readonly dryRun?: boolean | unknown | undefined;
300
+ readonly subagent_root?: string | unknown | undefined;
301
+ readonly subagentRoot?: string | unknown | undefined;
302
+ readonly workspace_template?: string | unknown | undefined;
303
+ readonly workspaceTemplate?: string | unknown | undefined;
304
+ readonly command_template?: string | unknown | undefined;
305
+ readonly commandTemplate?: string | unknown | undefined;
306
+ readonly files_format?: string | unknown | undefined;
307
+ readonly filesFormat?: string | unknown | undefined;
308
+ readonly attachments_format?: string | unknown | undefined;
309
+ readonly attachmentsFormat?: string | unknown | undefined;
310
+ readonly env?: unknown | undefined;
311
+ readonly healthcheck?: unknown | undefined;
312
+ readonly max_retries?: number | unknown | undefined;
313
+ readonly maxRetries?: number | unknown | undefined;
314
+ readonly retry_initial_delay_ms?: number | unknown | undefined;
315
+ readonly retryInitialDelayMs?: number | unknown | undefined;
316
+ readonly retry_max_delay_ms?: number | unknown | undefined;
317
+ readonly retryMaxDelayMs?: number | unknown | undefined;
318
+ readonly retry_backoff_factor?: number | unknown | undefined;
319
+ readonly retryBackoffFactor?: number | unknown | undefined;
320
+ readonly retry_status_codes?: unknown | undefined;
321
+ readonly retryStatusCodes?: unknown | undefined;
258
322
  }
259
323
 
324
+ interface RetryConfig {
325
+ readonly maxRetries?: number;
326
+ readonly initialDelayMs?: number;
327
+ readonly maxDelayMs?: number;
328
+ readonly backoffFactor?: number;
329
+ readonly retryableStatusCodes?: readonly number[];
330
+ }
260
331
  interface AzureResolvedConfig {
261
332
  readonly resourceName: string;
262
333
  readonly deploymentName: string;
@@ -264,6 +335,7 @@ interface AzureResolvedConfig {
264
335
  readonly version?: string;
265
336
  readonly temperature?: number;
266
337
  readonly maxOutputTokens?: number;
338
+ readonly retry?: RetryConfig;
267
339
  }
268
340
  interface AnthropicResolvedConfig {
269
341
  readonly apiKey: string;
@@ -271,12 +343,14 @@ interface AnthropicResolvedConfig {
271
343
  readonly temperature?: number;
272
344
  readonly maxOutputTokens?: number;
273
345
  readonly thinkingBudget?: number;
346
+ readonly retry?: RetryConfig;
274
347
  }
275
348
  interface GeminiResolvedConfig {
276
349
  readonly apiKey: string;
277
350
  readonly model: string;
278
351
  readonly temperature?: number;
279
352
  readonly maxOutputTokens?: number;
353
+ readonly retry?: RetryConfig;
280
354
  }
281
355
  interface CodexResolvedConfig {
282
356
  readonly executable: string;
@@ -313,7 +387,6 @@ interface CliResolvedConfig {
313
387
  readonly commandTemplate: string;
314
388
  readonly filesFormat?: string;
315
389
  readonly cwd?: string;
316
- readonly env?: Record<string, string>;
317
390
  readonly timeoutMs?: number;
318
391
  readonly healthcheck?: CliHealthcheck;
319
392
  }
package/dist/index.js CHANGED
@@ -9,7 +9,7 @@ import {
9
9
  readTextFile,
10
10
  resolveFileReference,
11
11
  resolveTargetDefinition
12
- } from "./chunk-7XM7HYRS.js";
12
+ } from "./chunk-SNTZFB24.js";
13
13
 
14
14
  // src/evaluation/types.ts
15
15
  var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
@@ -662,6 +662,67 @@ function ensureChatResponse(result) {
662
662
  }
663
663
  return result;
664
664
  }
665
/**
 * Reports whether a failed call should be retried.
 *
 * Checks, in order:
 *   1. a numeric `status` property: retryable only if listed in `retryableStatusCodes`;
 *   2. a string `message` containing `HTTP <3 digits>`: same status-code check;
 *   3. a `name` of "AxAIServiceNetworkError": always retryable.
 * Anything else (including non-object values) is not retryable.
 *
 * @param error - The value that was thrown; may be of any type.
 * @param retryableStatusCodes - HTTP status codes that should trigger a retry.
 * @returns `true` when the error should be retried.
 */
function isRetryableError(error, retryableStatusCodes) {
  if (!error || typeof error !== "object") {
    return false;
  }
  // An explicit numeric status is authoritative: the message and name are not consulted.
  if ("status" in error && typeof error.status === "number") {
    return retryableStatusCodes.includes(error.status);
  }
  if ("message" in error && typeof error.message === "string") {
    const match = error.message.match(/HTTP (\d{3})/);
    if (match) {
      const status = Number.parseInt(match[1], 10);
      return retryableStatusCodes.includes(status);
    }
  }
  if ("name" in error && error.name === "AxAIServiceNetworkError") {
    return true;
  }
  return false;
}

/**
 * Computes the wait before the next retry: exponential backoff
 * (`initialDelayMs * backoffFactor ** attempt`) with +/-25% random jitter.
 *
 * The result never exceeds `config.maxDelayMs`. The clamp is applied both
 * before and after the jitter so the configured maximum is a true upper bound.
 *
 * @param attempt - Zero-based index of the attempt that just failed.
 * @param config - Object with `initialDelayMs`, `maxDelayMs` and `backoffFactor`.
 * @returns Delay in milliseconds.
 */
function calculateRetryDelay(attempt, config) {
  const backoff = Math.min(
    config.maxDelayMs,
    config.initialDelayMs * config.backoffFactor ** attempt
  );
  const jittered = backoff * (0.75 + Math.random() * 0.5);
  return Math.min(config.maxDelayMs, jittered);
}

/**
 * Waits for `ms` milliseconds, or until `signal` aborts, whichever comes first.
 *
 * The promise always resolves (never rejects) with `undefined`; callers that
 * care about cancellation must inspect `signal.aborted` afterwards.
 *
 * @param ms - Delay in milliseconds.
 * @param signal - Optional abort signal that cuts the wait short.
 */
async function sleep(ms, signal) {
  return new Promise((resolve) => {
    const done = () => resolve(undefined);
    if (!signal) {
      setTimeout(done, ms);
      return;
    }
    if (signal.aborted) {
      done();
      return;
    }
    const onAbort = () => {
      // Clear the timer so a long backoff does not keep the process alive.
      clearTimeout(timer);
      done();
    };
    const timer = setTimeout(() => {
      signal.removeEventListener("abort", onAbort);
      done();
    }, ms);
    signal.addEventListener("abort", onAbort, { once: true });
  });
}

/**
 * Runs `fn`, retrying retryable failures with exponential backoff.
 *
 * Defaults when a field of `retryConfig` is missing: 3 retries, 1000 ms
 * initial delay, 60000 ms maximum delay, backoff factor 2, and status codes
 * 500, 408, 429, 502, 503, 504.
 *
 * `fn` is always invoked at least once: a negative `maxRetries` is treated
 * as 0, a fractional value is rounded down, and NaN falls back to the default.
 *
 * @param fn - Async operation to execute.
 * @param retryConfig - Optional overrides for the retry policy.
 * @param signal - Optional abort signal, checked before every attempt and
 *   observed during backoff waits.
 * @returns The value `fn` resolves with.
 * @throws Error with message `Request aborted: ...` when `signal` is aborted.
 * @throws The error thrown by `fn` when it is not retryable, or the last
 *   error once all retries are exhausted.
 */
async function withRetry(fn, retryConfig, signal) {
  const requestedRetries = retryConfig?.maxRetries ?? 3;
  const config = {
    maxRetries: Number.isNaN(requestedRetries) ? 3 : Math.max(0, Math.floor(requestedRetries)),
    initialDelayMs: retryConfig?.initialDelayMs ?? 1e3,
    maxDelayMs: retryConfig?.maxDelayMs ?? 6e4,
    backoffFactor: retryConfig?.backoffFactor ?? 2,
    retryableStatusCodes: retryConfig?.retryableStatusCodes ?? [500, 408, 429, 502, 503, 504]
  };
  const throwIfAborted = () => {
    if (signal?.aborted) {
      throw new Error(`Request aborted: ${signal.reason ?? "Unknown reason"}`);
    }
  };
  let lastError;
  for (let attempt = 0; attempt <= config.maxRetries; attempt++) {
    throwIfAborted();
    try {
      return await fn();
    } catch (error) {
      lastError = error;
      // Out of retries: fall through to rethrow the last error below.
      if (attempt >= config.maxRetries) {
        break;
      }
      if (!isRetryableError(error, config.retryableStatusCodes)) {
        throw error;
      }
      await sleep(calculateRetryDelay(attempt, config), signal);
      throwIfAborted();
    }
  }
  throw lastError;
}
665
726
  var AzureProvider = class {
666
727
  constructor(targetName, config) {
667
728
  this.config = config;
@@ -671,6 +732,7 @@ var AzureProvider = class {
671
732
  temperature: config.temperature,
672
733
  maxOutputTokens: config.maxOutputTokens
673
734
  };
735
+ this.retryConfig = config.retry;
674
736
  this.ai = AxAI.create({
675
737
  name: "azure-openai",
676
738
  apiKey: config.apiKey,
@@ -687,16 +749,21 @@ var AzureProvider = class {
687
749
  targetName;
688
750
  ai;
689
751
  defaults;
752
+ retryConfig;
690
753
  async invoke(request) {
691
754
  const chatPrompt = buildChatPrompt(request);
692
755
  const modelConfig = extractModelConfig(request, this.defaults);
693
- const response = await this.ai.chat(
694
- {
695
- chatPrompt,
696
- model: this.config.deploymentName,
697
- ...modelConfig ? { modelConfig } : {}
698
- },
699
- request.signal ? { abortSignal: request.signal } : void 0
756
+ const response = await withRetry(
757
+ async () => await this.ai.chat(
758
+ {
759
+ chatPrompt,
760
+ model: this.config.deploymentName,
761
+ ...modelConfig ? { modelConfig } : {}
762
+ },
763
+ request.signal ? { abortSignal: request.signal } : void 0
764
+ ),
765
+ this.retryConfig,
766
+ request.signal
700
767
  );
701
768
  return mapResponse(ensureChatResponse(response));
702
769
  }
@@ -714,6 +781,7 @@ var AnthropicProvider = class {
714
781
  maxOutputTokens: config.maxOutputTokens,
715
782
  thinkingBudget: config.thinkingBudget
716
783
  };
784
+ this.retryConfig = config.retry;
717
785
  this.ai = AxAI.create({
718
786
  name: "anthropic",
719
787
  apiKey: config.apiKey
@@ -724,16 +792,21 @@ var AnthropicProvider = class {
724
792
  targetName;
725
793
  ai;
726
794
  defaults;
795
+ retryConfig;
727
796
  async invoke(request) {
728
797
  const chatPrompt = buildChatPrompt(request);
729
798
  const modelConfig = extractModelConfig(request, this.defaults);
730
- const response = await this.ai.chat(
731
- {
732
- chatPrompt,
733
- model: this.config.model,
734
- ...modelConfig ? { modelConfig } : {}
735
- },
736
- request.signal ? { abortSignal: request.signal } : void 0
799
+ const response = await withRetry(
800
+ async () => await this.ai.chat(
801
+ {
802
+ chatPrompt,
803
+ model: this.config.model,
804
+ ...modelConfig ? { modelConfig } : {}
805
+ },
806
+ request.signal ? { abortSignal: request.signal } : void 0
807
+ ),
808
+ this.retryConfig,
809
+ request.signal
737
810
  );
738
811
  return mapResponse(ensureChatResponse(response));
739
812
  }
@@ -750,6 +823,7 @@ var GeminiProvider = class {
750
823
  temperature: config.temperature,
751
824
  maxOutputTokens: config.maxOutputTokens
752
825
  };
826
+ this.retryConfig = config.retry;
753
827
  this.ai = AxAI.create({
754
828
  name: "google-gemini",
755
829
  apiKey: config.apiKey
@@ -760,16 +834,21 @@ var GeminiProvider = class {
760
834
  targetName;
761
835
  ai;
762
836
  defaults;
837
+ retryConfig;
763
838
  async invoke(request) {
764
839
  const chatPrompt = buildChatPrompt(request);
765
840
  const modelConfig = extractModelConfig(request, this.defaults);
766
- const response = await this.ai.chat(
767
- {
768
- chatPrompt,
769
- model: this.config.model,
770
- ...modelConfig ? { modelConfig } : {}
771
- },
772
- request.signal ? { abortSignal: request.signal } : void 0
841
+ const response = await withRetry(
842
+ async () => await this.ai.chat(
843
+ {
844
+ chatPrompt,
845
+ model: this.config.model,
846
+ ...modelConfig ? { modelConfig } : {}
847
+ },
848
+ request.signal ? { abortSignal: request.signal } : void 0
849
+ ),
850
+ this.retryConfig,
851
+ request.signal
773
852
  );
774
853
  return mapResponse(ensureChatResponse(response));
775
854
  }
@@ -839,10 +918,9 @@ var CliProvider = class {
839
918
  const outputFilePath = generateOutputFilePath(request.evalCaseId);
840
919
  const templateValues = buildTemplateValues(request, this.config, outputFilePath);
841
920
  const renderedCommand = renderTemplate(this.config.commandTemplate, templateValues);
842
- const env = this.config.env ? { ...process.env, ...this.config.env } : process.env;
843
921
  const result = await this.runCommand(renderedCommand, {
844
922
  cwd: this.config.cwd,
845
- env,
923
+ env: process.env,
846
924
  timeoutMs: this.config.timeoutMs,
847
925
  signal: request.signal
848
926
  });
@@ -931,10 +1009,9 @@ var CliProvider = class {
931
1009
  generateOutputFilePath("healthcheck")
932
1010
  )
933
1011
  );
934
- const env = this.config.env ? { ...process.env, ...this.config.env } : process.env;
935
1012
  const result = await this.runCommand(renderedCommand, {
936
1013
  cwd: healthcheck.cwd ?? this.config.cwd,
937
- env,
1014
+ env: process.env,
938
1015
  timeoutMs,
939
1016
  signal
940
1017
  });
@@ -2167,20 +2244,13 @@ function assertTargetDefinition(value, index, filePath) {
2167
2244
  }
2168
2245
  const name = value.name;
2169
2246
  const provider = value.provider;
2170
- const settings = value.settings;
2171
- const judgeTarget = value.judge_target;
2172
2247
  if (typeof name !== "string" || name.trim().length === 0) {
2173
2248
  throw new Error(`targets.yaml entry at index ${index} in ${filePath} is missing a valid 'name'`);
2174
2249
  }
2175
2250
  if (typeof provider !== "string" || provider.trim().length === 0) {
2176
2251
  throw new Error(`targets.yaml entry '${name}' in ${filePath} is missing a valid 'provider'`);
2177
2252
  }
2178
- return {
2179
- name,
2180
- provider,
2181
- settings: isRecord(settings) ? settings : void 0,
2182
- judge_target: typeof judgeTarget === "string" ? judgeTarget : void 0
2183
- };
2253
+ return value;
2184
2254
  }
2185
2255
  async function fileExists3(filePath) {
2186
2256
  try {
@@ -2821,10 +2891,11 @@ async function runEvaluation(options) {
2821
2891
  await onProgress({
2822
2892
  workerId,
2823
2893
  evalId: evalCase.id,
2824
- status: "completed",
2894
+ status: result.error ? "failed" : "completed",
2825
2895
  startedAt: 0,
2826
2896
  // Not used for completed status
2827
- completedAt: Date.now()
2897
+ completedAt: Date.now(),
2898
+ error: result.error
2828
2899
  });
2829
2900
  }
2830
2901
  if (onResult) {
@@ -3362,7 +3433,8 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
3362
3433
  target: targetName,
3363
3434
  timestamp: timestamp.toISOString(),
3364
3435
  raw_aspects: [],
3365
- raw_request: rawRequest
3436
+ raw_request: rawRequest,
3437
+ error: message
3366
3438
  };
3367
3439
  }
3368
3440
  function createCacheKey(provider, target, evalCase, promptInputs) {