@wix/evalforge-evaluator 0.87.0 → 0.89.0

This diff shows the changes between publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
package/build/index.js CHANGED
@@ -24,7 +24,7 @@ var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__ge
24
24
  ));
25
25
 
26
26
  // src/index.ts
27
- var import_evalforge_types6 = require("@wix/evalforge-types");
27
+ var import_evalforge_types8 = require("@wix/evalforge-types");
28
28
 
29
29
  // src/config.ts
30
30
  function loadConfig() {
@@ -233,7 +233,21 @@ function applyParamsToAssertion(assertion, params) {
233
233
  );
234
234
  }
235
235
  }
236
- return { ...assertion, prompt, systemPrompt };
236
+ return {
237
+ ...assertion,
238
+ prompt,
239
+ systemPrompt,
240
+ ...params.model !== void 0 && { model: params.model },
241
+ ...params.maxTokens !== void 0 && {
242
+ maxTokens: params.maxTokens
243
+ },
244
+ ...params.temperature !== void 0 && {
245
+ temperature: params.temperature
246
+ },
247
+ ...params.minScore !== void 0 && {
248
+ minScore: params.minScore
249
+ }
250
+ };
237
251
  }
238
252
  if (assertion.type === "skill_was_called" && params.skillNames !== void 0) {
239
253
  return {
@@ -265,7 +279,10 @@ function resolveSystemAssertion(assertionId, params) {
265
279
  type: "llm_judge",
266
280
  prompt: params?.prompt ?? "",
267
281
  systemPrompt: params?.systemPrompt,
268
- minScore: params?.minScore
282
+ minScore: params?.minScore,
283
+ model: params?.model,
284
+ maxTokens: params?.maxTokens,
285
+ temperature: params?.temperature
269
286
  };
270
287
  break;
271
288
  default:
@@ -399,7 +416,7 @@ async function fetchEvaluationData(api, projectId2, evalRunId2) {
399
416
  }
400
417
 
401
418
  // src/run-scenario/index.ts
402
- var import_evalforge_types4 = require("@wix/evalforge-types");
419
+ var import_evalforge_types6 = require("@wix/evalforge-types");
403
420
  var import_eval_assertions = require("@wix/eval-assertions");
404
421
 
405
422
  // src/run-scenario/environment.ts
@@ -529,7 +546,7 @@ var import_crypto2 = require("crypto");
529
546
  // src/run-scenario/agents/registry.ts
530
547
  var AgentAdapterRegistry = class {
531
548
  /**
532
- * Map of command strings to their registered adapters.
549
+ * Map of run commands to their registered adapters.
533
550
  * Multiple commands can map to the same adapter.
534
551
  */
535
552
  adapters = /* @__PURE__ */ new Map();
@@ -558,9 +575,9 @@ var AgentAdapterRegistry = class {
558
575
  }
559
576
  }
560
577
  /**
561
- * Get an adapter by command string.
578
+ * Get an adapter by run command.
562
579
  *
563
- * @param runCommand - The command string to look up (e.g., 'claude', 'cursor')
580
+ * @param runCommand - The run command to look up
564
581
  * @returns The registered adapter, or undefined if not found
565
582
  */
566
583
  get(runCommand) {
@@ -569,7 +586,7 @@ var AgentAdapterRegistry = class {
569
586
  /**
570
587
  * Check if a command has a registered adapter.
571
588
  *
572
- * @param runCommand - The command string to check
589
+ * @param runCommand - The run command to check
573
590
  * @returns True if an adapter is registered for this command
574
591
  */
575
592
  has(runCommand) {
@@ -586,7 +603,7 @@ var AgentAdapterRegistry = class {
586
603
  /**
587
604
  * Get all supported commands.
588
605
  *
589
- * @returns Array of all registered command strings
606
+ * @returns Array of all registered run commands
590
607
  */
591
608
  getSupportedCommands() {
592
609
  return Array.from(this.adapters.keys());
@@ -636,6 +653,9 @@ function getAdapter(runCommand) {
636
653
  return adapter;
637
654
  }
638
655
 
656
+ // src/run-scenario/agents/claude-code/claude-code-adapter.ts
657
+ var import_evalforge_types4 = require("@wix/evalforge-types");
658
+
639
659
  // src/run-scenario/agents/claude-code/execute.ts
640
660
  var import_evalforge_types3 = require("@wix/evalforge-types");
641
661
  var import_crypto = require("crypto");
@@ -1652,7 +1672,7 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
1652
1672
  var ClaudeCodeAdapter = class {
1653
1673
  id = "claude-code";
1654
1674
  name = "Claude Code";
1655
- supportedCommands = ["claude"];
1675
+ supportedCommands = [import_evalforge_types4.AgentRunCommand.CLAUDE];
1656
1676
  /**
1657
1677
  * Execute a skill using the Claude Code SDK.
1658
1678
  *
@@ -2433,7 +2453,8 @@ function extractTemplateFiles(before, after) {
2433
2453
  }
2434
2454
 
2435
2455
  // src/run-scenario/run-agent-with-context.ts
2436
- var DEFAULT_AGENT_COMMAND = "claude";
2456
+ var import_evalforge_types5 = require("@wix/evalforge-types");
2457
+ var DEFAULT_AGENT_COMMAND = import_evalforge_types5.AgentRunCommand.CLAUDE;
2437
2458
  async function runAgentWithContext(config, evalRunId2, scenario, evalData, workDir) {
2438
2459
  const skillsGroupId = evalData.evalRun.skillsGroupId;
2439
2460
  if (!skillsGroupId) {
@@ -2520,7 +2541,7 @@ async function runScenario(config, evalRunId2, scenario, evalData, template, res
2520
2541
  }))
2521
2542
  };
2522
2543
  const { "x-wix-ai-gateway-stream": _stream, ...judgeHeaders } = config.aiGatewayHeaders;
2523
- const defaultJudgeModel = import_evalforge_types4.AVAILABLE_MODEL_IDS[0];
2544
+ const defaultJudgeModel = import_evalforge_types6.DEFAULT_JUDGE_MODEL;
2524
2545
  const assertionContext = {
2525
2546
  workDir,
2526
2547
  defaultJudgeModel,
@@ -2535,10 +2556,10 @@ async function runScenario(config, evalRunId2, scenario, evalData, template, res
2535
2556
  assertionContext
2536
2557
  ) : [];
2537
2558
  const passed = assertionResults.filter(
2538
- (r) => r.status === import_evalforge_types4.AssertionResultStatus.PASSED
2559
+ (r) => r.status === import_evalforge_types6.AssertionResultStatus.PASSED
2539
2560
  ).length;
2540
2561
  const failed = assertionResults.filter(
2541
- (r) => r.status === import_evalforge_types4.AssertionResultStatus.FAILED
2562
+ (r) => r.status === import_evalforge_types6.AssertionResultStatus.FAILED
2542
2563
  ).length;
2543
2564
  const total = assertionResults.length;
2544
2565
  const passRate = total > 0 ? Math.round(passed / total * 100) : 100;
@@ -2552,7 +2573,7 @@ async function runScenario(config, evalRunId2, scenario, evalData, template, res
2552
2573
  }
2553
2574
 
2554
2575
  // src/error-reporter.ts
2555
- var import_evalforge_types5 = require("@wix/evalforge-types");
2576
+ var import_evalforge_types7 = require("@wix/evalforge-types");
2556
2577
  function formatError(error, phase, context) {
2557
2578
  const timestamp = (/* @__PURE__ */ new Date()).toISOString();
2558
2579
  if (error instanceof Error) {
@@ -2801,7 +2822,7 @@ async function runEvaluation(projectId2, evalRunId2) {
2801
2822
  };
2802
2823
  try {
2803
2824
  await api.updateEvalRun(projectId2, evalRunId2, {
2804
- status: import_evalforge_types6.EvalStatus.COMPLETED,
2825
+ status: import_evalforge_types8.EvalStatus.COMPLETED,
2805
2826
  completedAt: (/* @__PURE__ */ new Date()).toISOString()
2806
2827
  });
2807
2828
  } catch (updateErr) {
@@ -2842,7 +2863,7 @@ runEvaluation(projectId, evalRunId).then(() => {
2842
2863
  authToken: config.authToken
2843
2864
  });
2844
2865
  await api.updateEvalRun(projectId, evalRunId, {
2845
- status: import_evalforge_types6.EvalStatus.FAILED,
2866
+ status: import_evalforge_types8.EvalStatus.FAILED,
2846
2867
  completedAt: (/* @__PURE__ */ new Date()).toISOString(),
2847
2868
  jobError,
2848
2869
  jobStatus: "FAILED"
@@ -2865,7 +2886,7 @@ runEvaluation(projectId, evalRunId).then(() => {
2865
2886
  authToken
2866
2887
  });
2867
2888
  await api.updateEvalRun(projectId, evalRunId, {
2868
- status: import_evalforge_types6.EvalStatus.FAILED,
2889
+ status: import_evalforge_types8.EvalStatus.FAILED,
2869
2890
  completedAt: (/* @__PURE__ */ new Date()).toISOString(),
2870
2891
  jobError: `Config load failed, then: ${jobError}`,
2871
2892
  jobStatus: "FAILED"