@wix/evalforge-evaluator 0.88.0 → 0.89.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/index.js +20 -16
- package/build/index.js.map +4 -4
- package/build/index.mjs +11 -7
- package/build/index.mjs.map +4 -4
- package/build/types/run-scenario/agents/claude-code/claude-code-adapter.d.ts +2 -1
- package/build/types/run-scenario/agents/registry.d.ts +14 -14
- package/package.json +4 -4
package/build/index.js
CHANGED
|
@@ -24,7 +24,7 @@ var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__ge
|
|
|
24
24
|
));
|
|
25
25
|
|
|
26
26
|
// src/index.ts
|
|
27
|
-
var
|
|
27
|
+
var import_evalforge_types8 = require("@wix/evalforge-types");
|
|
28
28
|
|
|
29
29
|
// src/config.ts
|
|
30
30
|
function loadConfig() {
|
|
@@ -416,7 +416,7 @@ async function fetchEvaluationData(api, projectId2, evalRunId2) {
|
|
|
416
416
|
}
|
|
417
417
|
|
|
418
418
|
// src/run-scenario/index.ts
|
|
419
|
-
var
|
|
419
|
+
var import_evalforge_types6 = require("@wix/evalforge-types");
|
|
420
420
|
var import_eval_assertions = require("@wix/eval-assertions");
|
|
421
421
|
|
|
422
422
|
// src/run-scenario/environment.ts
|
|
@@ -546,7 +546,7 @@ var import_crypto2 = require("crypto");
|
|
|
546
546
|
// src/run-scenario/agents/registry.ts
|
|
547
547
|
var AgentAdapterRegistry = class {
|
|
548
548
|
/**
|
|
549
|
-
* Map of
|
|
549
|
+
* Map of run commands to their registered adapters.
|
|
550
550
|
* Multiple commands can map to the same adapter.
|
|
551
551
|
*/
|
|
552
552
|
adapters = /* @__PURE__ */ new Map();
|
|
@@ -575,9 +575,9 @@ var AgentAdapterRegistry = class {
|
|
|
575
575
|
}
|
|
576
576
|
}
|
|
577
577
|
/**
|
|
578
|
-
* Get an adapter by command
|
|
578
|
+
* Get an adapter by run command.
|
|
579
579
|
*
|
|
580
|
-
* @param runCommand - The command
|
|
580
|
+
* @param runCommand - The run command to look up
|
|
581
581
|
* @returns The registered adapter, or undefined if not found
|
|
582
582
|
*/
|
|
583
583
|
get(runCommand) {
|
|
@@ -586,7 +586,7 @@ var AgentAdapterRegistry = class {
|
|
|
586
586
|
/**
|
|
587
587
|
* Check if a command has a registered adapter.
|
|
588
588
|
*
|
|
589
|
-
* @param runCommand - The command
|
|
589
|
+
* @param runCommand - The run command to check
|
|
590
590
|
* @returns True if an adapter is registered for this command
|
|
591
591
|
*/
|
|
592
592
|
has(runCommand) {
|
|
@@ -603,7 +603,7 @@ var AgentAdapterRegistry = class {
|
|
|
603
603
|
/**
|
|
604
604
|
* Get all supported commands.
|
|
605
605
|
*
|
|
606
|
-
* @returns Array of all registered
|
|
606
|
+
* @returns Array of all registered run commands
|
|
607
607
|
*/
|
|
608
608
|
getSupportedCommands() {
|
|
609
609
|
return Array.from(this.adapters.keys());
|
|
@@ -653,6 +653,9 @@ function getAdapter(runCommand) {
|
|
|
653
653
|
return adapter;
|
|
654
654
|
}
|
|
655
655
|
|
|
656
|
+
// src/run-scenario/agents/claude-code/claude-code-adapter.ts
|
|
657
|
+
var import_evalforge_types4 = require("@wix/evalforge-types");
|
|
658
|
+
|
|
656
659
|
// src/run-scenario/agents/claude-code/execute.ts
|
|
657
660
|
var import_evalforge_types3 = require("@wix/evalforge-types");
|
|
658
661
|
var import_crypto = require("crypto");
|
|
@@ -1669,7 +1672,7 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
|
|
|
1669
1672
|
var ClaudeCodeAdapter = class {
|
|
1670
1673
|
id = "claude-code";
|
|
1671
1674
|
name = "Claude Code";
|
|
1672
|
-
supportedCommands = [
|
|
1675
|
+
supportedCommands = [import_evalforge_types4.AgentRunCommand.CLAUDE];
|
|
1673
1676
|
/**
|
|
1674
1677
|
* Execute a skill using the Claude Code SDK.
|
|
1675
1678
|
*
|
|
@@ -2450,7 +2453,8 @@ function extractTemplateFiles(before, after) {
|
|
|
2450
2453
|
}
|
|
2451
2454
|
|
|
2452
2455
|
// src/run-scenario/run-agent-with-context.ts
|
|
2453
|
-
var
|
|
2456
|
+
var import_evalforge_types5 = require("@wix/evalforge-types");
|
|
2457
|
+
var DEFAULT_AGENT_COMMAND = import_evalforge_types5.AgentRunCommand.CLAUDE;
|
|
2454
2458
|
async function runAgentWithContext(config, evalRunId2, scenario, evalData, workDir) {
|
|
2455
2459
|
const skillsGroupId = evalData.evalRun.skillsGroupId;
|
|
2456
2460
|
if (!skillsGroupId) {
|
|
@@ -2537,7 +2541,7 @@ async function runScenario(config, evalRunId2, scenario, evalData, template, res
|
|
|
2537
2541
|
}))
|
|
2538
2542
|
};
|
|
2539
2543
|
const { "x-wix-ai-gateway-stream": _stream, ...judgeHeaders } = config.aiGatewayHeaders;
|
|
2540
|
-
const defaultJudgeModel =
|
|
2544
|
+
const defaultJudgeModel = import_evalforge_types6.DEFAULT_JUDGE_MODEL;
|
|
2541
2545
|
const assertionContext = {
|
|
2542
2546
|
workDir,
|
|
2543
2547
|
defaultJudgeModel,
|
|
@@ -2552,10 +2556,10 @@ async function runScenario(config, evalRunId2, scenario, evalData, template, res
|
|
|
2552
2556
|
assertionContext
|
|
2553
2557
|
) : [];
|
|
2554
2558
|
const passed = assertionResults.filter(
|
|
2555
|
-
(r) => r.status ===
|
|
2559
|
+
(r) => r.status === import_evalforge_types6.AssertionResultStatus.PASSED
|
|
2556
2560
|
).length;
|
|
2557
2561
|
const failed = assertionResults.filter(
|
|
2558
|
-
(r) => r.status ===
|
|
2562
|
+
(r) => r.status === import_evalforge_types6.AssertionResultStatus.FAILED
|
|
2559
2563
|
).length;
|
|
2560
2564
|
const total = assertionResults.length;
|
|
2561
2565
|
const passRate = total > 0 ? Math.round(passed / total * 100) : 100;
|
|
@@ -2569,7 +2573,7 @@ async function runScenario(config, evalRunId2, scenario, evalData, template, res
|
|
|
2569
2573
|
}
|
|
2570
2574
|
|
|
2571
2575
|
// src/error-reporter.ts
|
|
2572
|
-
var
|
|
2576
|
+
var import_evalforge_types7 = require("@wix/evalforge-types");
|
|
2573
2577
|
function formatError(error, phase, context) {
|
|
2574
2578
|
const timestamp = (/* @__PURE__ */ new Date()).toISOString();
|
|
2575
2579
|
if (error instanceof Error) {
|
|
@@ -2818,7 +2822,7 @@ async function runEvaluation(projectId2, evalRunId2) {
|
|
|
2818
2822
|
};
|
|
2819
2823
|
try {
|
|
2820
2824
|
await api.updateEvalRun(projectId2, evalRunId2, {
|
|
2821
|
-
status:
|
|
2825
|
+
status: import_evalforge_types8.EvalStatus.COMPLETED,
|
|
2822
2826
|
completedAt: (/* @__PURE__ */ new Date()).toISOString()
|
|
2823
2827
|
});
|
|
2824
2828
|
} catch (updateErr) {
|
|
@@ -2859,7 +2863,7 @@ runEvaluation(projectId, evalRunId).then(() => {
|
|
|
2859
2863
|
authToken: config.authToken
|
|
2860
2864
|
});
|
|
2861
2865
|
await api.updateEvalRun(projectId, evalRunId, {
|
|
2862
|
-
status:
|
|
2866
|
+
status: import_evalforge_types8.EvalStatus.FAILED,
|
|
2863
2867
|
completedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
2864
2868
|
jobError,
|
|
2865
2869
|
jobStatus: "FAILED"
|
|
@@ -2882,7 +2886,7 @@ runEvaluation(projectId, evalRunId).then(() => {
|
|
|
2882
2886
|
authToken
|
|
2883
2887
|
});
|
|
2884
2888
|
await api.updateEvalRun(projectId, evalRunId, {
|
|
2885
|
-
status:
|
|
2889
|
+
status: import_evalforge_types8.EvalStatus.FAILED,
|
|
2886
2890
|
completedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
2887
2891
|
jobError: `Config load failed, then: ${jobError}`,
|
|
2888
2892
|
jobStatus: "FAILED"
|