@wix/evalforge-evaluator 0.88.0 → 0.90.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/build/index.js CHANGED
@@ -24,7 +24,7 @@ var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__ge
24
24
  ));
25
25
 
26
26
  // src/index.ts
27
- var import_evalforge_types6 = require("@wix/evalforge-types");
27
+ var import_evalforge_types8 = require("@wix/evalforge-types");
28
28
 
29
29
  // src/config.ts
30
30
  function loadConfig() {
@@ -249,6 +249,12 @@ function applyParamsToAssertion(assertion, params) {
249
249
  }
250
250
  };
251
251
  }
252
+ if (assertion.type === "time_limit" && params.maxDurationMs !== void 0) {
253
+ return {
254
+ ...assertion,
255
+ maxDurationMs: params.maxDurationMs
256
+ };
257
+ }
252
258
  if (assertion.type === "skill_was_called" && params.skillNames !== void 0) {
253
259
  return {
254
260
  ...assertion,
@@ -274,6 +280,12 @@ function resolveSystemAssertion(assertionId, params) {
274
280
  expectedExitCode: params?.expectedExitCode ?? void 0
275
281
  };
276
282
  break;
283
+ case "time_limit":
284
+ baseAssertion = {
285
+ type: "time_limit",
286
+ maxDurationMs: params?.maxDurationMs ?? 3e5
287
+ };
288
+ break;
277
289
  case "llm_judge":
278
290
  baseAssertion = {
279
291
  type: "llm_judge",
@@ -416,7 +428,7 @@ async function fetchEvaluationData(api, projectId2, evalRunId2) {
416
428
  }
417
429
 
418
430
  // src/run-scenario/index.ts
419
- var import_evalforge_types4 = require("@wix/evalforge-types");
431
+ var import_evalforge_types6 = require("@wix/evalforge-types");
420
432
  var import_eval_assertions = require("@wix/eval-assertions");
421
433
 
422
434
  // src/run-scenario/environment.ts
@@ -546,7 +558,7 @@ var import_crypto2 = require("crypto");
546
558
  // src/run-scenario/agents/registry.ts
547
559
  var AgentAdapterRegistry = class {
548
560
  /**
549
- * Map of command strings to their registered adapters.
561
+ * Map of run commands to their registered adapters.
550
562
  * Multiple commands can map to the same adapter.
551
563
  */
552
564
  adapters = /* @__PURE__ */ new Map();
@@ -575,9 +587,9 @@ var AgentAdapterRegistry = class {
575
587
  }
576
588
  }
577
589
  /**
578
- * Get an adapter by command string.
590
+ * Get an adapter by run command.
579
591
  *
580
- * @param runCommand - The command string to look up (e.g., 'claude', 'cursor')
592
+ * @param runCommand - The run command to look up
581
593
  * @returns The registered adapter, or undefined if not found
582
594
  */
583
595
  get(runCommand) {
@@ -586,7 +598,7 @@ var AgentAdapterRegistry = class {
586
598
  /**
587
599
  * Check if a command has a registered adapter.
588
600
  *
589
- * @param runCommand - The command string to check
601
+ * @param runCommand - The run command to check
590
602
  * @returns True if an adapter is registered for this command
591
603
  */
592
604
  has(runCommand) {
@@ -603,7 +615,7 @@ var AgentAdapterRegistry = class {
603
615
  /**
604
616
  * Get all supported commands.
605
617
  *
606
- * @returns Array of all registered command strings
618
+ * @returns Array of all registered run commands
607
619
  */
608
620
  getSupportedCommands() {
609
621
  return Array.from(this.adapters.keys());
@@ -653,6 +665,9 @@ function getAdapter(runCommand) {
653
665
  return adapter;
654
666
  }
655
667
 
668
+ // src/run-scenario/agents/claude-code/claude-code-adapter.ts
669
+ var import_evalforge_types4 = require("@wix/evalforge-types");
670
+
656
671
  // src/run-scenario/agents/claude-code/execute.ts
657
672
  var import_evalforge_types3 = require("@wix/evalforge-types");
658
673
  var import_crypto = require("crypto");
@@ -1669,7 +1684,7 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
1669
1684
  var ClaudeCodeAdapter = class {
1670
1685
  id = "claude-code";
1671
1686
  name = "Claude Code";
1672
- supportedCommands = ["claude"];
1687
+ supportedCommands = [import_evalforge_types4.AgentRunCommand.CLAUDE];
1673
1688
  /**
1674
1689
  * Execute a skill using the Claude Code SDK.
1675
1690
  *
@@ -2450,7 +2465,8 @@ function extractTemplateFiles(before, after) {
2450
2465
  }
2451
2466
 
2452
2467
  // src/run-scenario/run-agent-with-context.ts
2453
- var DEFAULT_AGENT_COMMAND = "claude";
2468
+ var import_evalforge_types5 = require("@wix/evalforge-types");
2469
+ var DEFAULT_AGENT_COMMAND = import_evalforge_types5.AgentRunCommand.CLAUDE;
2454
2470
  async function runAgentWithContext(config, evalRunId2, scenario, evalData, workDir) {
2455
2471
  const skillsGroupId = evalData.evalRun.skillsGroupId;
2456
2472
  if (!skillsGroupId) {
@@ -2534,10 +2550,11 @@ async function runScenario(config, evalRunId2, scenario, evalData, template, res
2534
2550
  fileDiffs: partialResult.fileDiffs?.map((d) => ({
2535
2551
  path: d.path,
2536
2552
  status: templateFilesMap.get(d.path)
2537
- }))
2553
+ })),
2554
+ durationMs: partialResult.duration
2538
2555
  };
2539
2556
  const { "x-wix-ai-gateway-stream": _stream, ...judgeHeaders } = config.aiGatewayHeaders;
2540
- const defaultJudgeModel = import_evalforge_types4.DEFAULT_JUDGE_MODEL;
2557
+ const defaultJudgeModel = import_evalforge_types6.DEFAULT_JUDGE_MODEL;
2541
2558
  const assertionContext = {
2542
2559
  workDir,
2543
2560
  defaultJudgeModel,
@@ -2552,10 +2569,10 @@ async function runScenario(config, evalRunId2, scenario, evalData, template, res
2552
2569
  assertionContext
2553
2570
  ) : [];
2554
2571
  const passed = assertionResults.filter(
2555
- (r) => r.status === import_evalforge_types4.AssertionResultStatus.PASSED
2572
+ (r) => r.status === import_evalforge_types6.AssertionResultStatus.PASSED
2556
2573
  ).length;
2557
2574
  const failed = assertionResults.filter(
2558
- (r) => r.status === import_evalforge_types4.AssertionResultStatus.FAILED
2575
+ (r) => r.status === import_evalforge_types6.AssertionResultStatus.FAILED
2559
2576
  ).length;
2560
2577
  const total = assertionResults.length;
2561
2578
  const passRate = total > 0 ? Math.round(passed / total * 100) : 100;
@@ -2569,7 +2586,7 @@ async function runScenario(config, evalRunId2, scenario, evalData, template, res
2569
2586
  }
2570
2587
 
2571
2588
  // src/error-reporter.ts
2572
- var import_evalforge_types5 = require("@wix/evalforge-types");
2589
+ var import_evalforge_types7 = require("@wix/evalforge-types");
2573
2590
  function formatError(error, phase, context) {
2574
2591
  const timestamp = (/* @__PURE__ */ new Date()).toISOString();
2575
2592
  if (error instanceof Error) {
@@ -2818,7 +2835,7 @@ async function runEvaluation(projectId2, evalRunId2) {
2818
2835
  };
2819
2836
  try {
2820
2837
  await api.updateEvalRun(projectId2, evalRunId2, {
2821
- status: import_evalforge_types6.EvalStatus.COMPLETED,
2838
+ status: import_evalforge_types8.EvalStatus.COMPLETED,
2822
2839
  completedAt: (/* @__PURE__ */ new Date()).toISOString()
2823
2840
  });
2824
2841
  } catch (updateErr) {
@@ -2859,7 +2876,7 @@ runEvaluation(projectId, evalRunId).then(() => {
2859
2876
  authToken: config.authToken
2860
2877
  });
2861
2878
  await api.updateEvalRun(projectId, evalRunId, {
2862
- status: import_evalforge_types6.EvalStatus.FAILED,
2879
+ status: import_evalforge_types8.EvalStatus.FAILED,
2863
2880
  completedAt: (/* @__PURE__ */ new Date()).toISOString(),
2864
2881
  jobError,
2865
2882
  jobStatus: "FAILED"
@@ -2882,7 +2899,7 @@ runEvaluation(projectId, evalRunId).then(() => {
2882
2899
  authToken
2883
2900
  });
2884
2901
  await api.updateEvalRun(projectId, evalRunId, {
2885
- status: import_evalforge_types6.EvalStatus.FAILED,
2902
+ status: import_evalforge_types8.EvalStatus.FAILED,
2886
2903
  completedAt: (/* @__PURE__ */ new Date()).toISOString(),
2887
2904
  jobError: `Config load failed, then: ${jobError}`,
2888
2905
  jobStatus: "FAILED"