@wix/evalforge-evaluator 0.88.0 → 0.90.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/index.js +34 -17
- package/build/index.js.map +4 -4
- package/build/index.mjs +25 -8
- package/build/index.mjs.map +4 -4
- package/build/types/run-scenario/agents/claude-code/claude-code-adapter.d.ts +2 -1
- package/build/types/run-scenario/agents/registry.d.ts +14 -14
- package/package.json +5 -5
package/build/index.js
CHANGED
|
@@ -24,7 +24,7 @@ var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__ge
|
|
|
24
24
|
));
|
|
25
25
|
|
|
26
26
|
// src/index.ts
|
|
27
|
-
var
|
|
27
|
+
var import_evalforge_types8 = require("@wix/evalforge-types");
|
|
28
28
|
|
|
29
29
|
// src/config.ts
|
|
30
30
|
function loadConfig() {
|
|
@@ -249,6 +249,12 @@ function applyParamsToAssertion(assertion, params) {
|
|
|
249
249
|
}
|
|
250
250
|
};
|
|
251
251
|
}
|
|
252
|
+
if (assertion.type === "time_limit" && params.maxDurationMs !== void 0) {
|
|
253
|
+
return {
|
|
254
|
+
...assertion,
|
|
255
|
+
maxDurationMs: params.maxDurationMs
|
|
256
|
+
};
|
|
257
|
+
}
|
|
252
258
|
if (assertion.type === "skill_was_called" && params.skillNames !== void 0) {
|
|
253
259
|
return {
|
|
254
260
|
...assertion,
|
|
@@ -274,6 +280,12 @@ function resolveSystemAssertion(assertionId, params) {
|
|
|
274
280
|
expectedExitCode: params?.expectedExitCode ?? void 0
|
|
275
281
|
};
|
|
276
282
|
break;
|
|
283
|
+
case "time_limit":
|
|
284
|
+
baseAssertion = {
|
|
285
|
+
type: "time_limit",
|
|
286
|
+
maxDurationMs: params?.maxDurationMs ?? 3e5
|
|
287
|
+
};
|
|
288
|
+
break;
|
|
277
289
|
case "llm_judge":
|
|
278
290
|
baseAssertion = {
|
|
279
291
|
type: "llm_judge",
|
|
@@ -416,7 +428,7 @@ async function fetchEvaluationData(api, projectId2, evalRunId2) {
|
|
|
416
428
|
}
|
|
417
429
|
|
|
418
430
|
// src/run-scenario/index.ts
|
|
419
|
-
var
|
|
431
|
+
var import_evalforge_types6 = require("@wix/evalforge-types");
|
|
420
432
|
var import_eval_assertions = require("@wix/eval-assertions");
|
|
421
433
|
|
|
422
434
|
// src/run-scenario/environment.ts
|
|
@@ -546,7 +558,7 @@ var import_crypto2 = require("crypto");
|
|
|
546
558
|
// src/run-scenario/agents/registry.ts
|
|
547
559
|
var AgentAdapterRegistry = class {
|
|
548
560
|
/**
|
|
549
|
-
* Map of
|
|
561
|
+
* Map of run commands to their registered adapters.
|
|
550
562
|
* Multiple commands can map to the same adapter.
|
|
551
563
|
*/
|
|
552
564
|
adapters = /* @__PURE__ */ new Map();
|
|
@@ -575,9 +587,9 @@ var AgentAdapterRegistry = class {
|
|
|
575
587
|
}
|
|
576
588
|
}
|
|
577
589
|
/**
|
|
578
|
-
* Get an adapter by command
|
|
590
|
+
* Get an adapter by run command.
|
|
579
591
|
*
|
|
580
|
-
* @param runCommand - The command
|
|
592
|
+
* @param runCommand - The run command to look up
|
|
581
593
|
* @returns The registered adapter, or undefined if not found
|
|
582
594
|
*/
|
|
583
595
|
get(runCommand) {
|
|
@@ -586,7 +598,7 @@ var AgentAdapterRegistry = class {
|
|
|
586
598
|
/**
|
|
587
599
|
* Check if a command has a registered adapter.
|
|
588
600
|
*
|
|
589
|
-
* @param runCommand - The command
|
|
601
|
+
* @param runCommand - The run command to check
|
|
590
602
|
* @returns True if an adapter is registered for this command
|
|
591
603
|
*/
|
|
592
604
|
has(runCommand) {
|
|
@@ -603,7 +615,7 @@ var AgentAdapterRegistry = class {
|
|
|
603
615
|
/**
|
|
604
616
|
* Get all supported commands.
|
|
605
617
|
*
|
|
606
|
-
* @returns Array of all registered
|
|
618
|
+
* @returns Array of all registered run commands
|
|
607
619
|
*/
|
|
608
620
|
getSupportedCommands() {
|
|
609
621
|
return Array.from(this.adapters.keys());
|
|
@@ -653,6 +665,9 @@ function getAdapter(runCommand) {
|
|
|
653
665
|
return adapter;
|
|
654
666
|
}
|
|
655
667
|
|
|
668
|
+
// src/run-scenario/agents/claude-code/claude-code-adapter.ts
|
|
669
|
+
var import_evalforge_types4 = require("@wix/evalforge-types");
|
|
670
|
+
|
|
656
671
|
// src/run-scenario/agents/claude-code/execute.ts
|
|
657
672
|
var import_evalforge_types3 = require("@wix/evalforge-types");
|
|
658
673
|
var import_crypto = require("crypto");
|
|
@@ -1669,7 +1684,7 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
|
|
|
1669
1684
|
var ClaudeCodeAdapter = class {
|
|
1670
1685
|
id = "claude-code";
|
|
1671
1686
|
name = "Claude Code";
|
|
1672
|
-
supportedCommands = [
|
|
1687
|
+
supportedCommands = [import_evalforge_types4.AgentRunCommand.CLAUDE];
|
|
1673
1688
|
/**
|
|
1674
1689
|
* Execute a skill using the Claude Code SDK.
|
|
1675
1690
|
*
|
|
@@ -2450,7 +2465,8 @@ function extractTemplateFiles(before, after) {
|
|
|
2450
2465
|
}
|
|
2451
2466
|
|
|
2452
2467
|
// src/run-scenario/run-agent-with-context.ts
|
|
2453
|
-
var
|
|
2468
|
+
var import_evalforge_types5 = require("@wix/evalforge-types");
|
|
2469
|
+
var DEFAULT_AGENT_COMMAND = import_evalforge_types5.AgentRunCommand.CLAUDE;
|
|
2454
2470
|
async function runAgentWithContext(config, evalRunId2, scenario, evalData, workDir) {
|
|
2455
2471
|
const skillsGroupId = evalData.evalRun.skillsGroupId;
|
|
2456
2472
|
if (!skillsGroupId) {
|
|
@@ -2534,10 +2550,11 @@ async function runScenario(config, evalRunId2, scenario, evalData, template, res
|
|
|
2534
2550
|
fileDiffs: partialResult.fileDiffs?.map((d) => ({
|
|
2535
2551
|
path: d.path,
|
|
2536
2552
|
status: templateFilesMap.get(d.path)
|
|
2537
|
-
}))
|
|
2553
|
+
})),
|
|
2554
|
+
durationMs: partialResult.duration
|
|
2538
2555
|
};
|
|
2539
2556
|
const { "x-wix-ai-gateway-stream": _stream, ...judgeHeaders } = config.aiGatewayHeaders;
|
|
2540
|
-
const defaultJudgeModel =
|
|
2557
|
+
const defaultJudgeModel = import_evalforge_types6.DEFAULT_JUDGE_MODEL;
|
|
2541
2558
|
const assertionContext = {
|
|
2542
2559
|
workDir,
|
|
2543
2560
|
defaultJudgeModel,
|
|
@@ -2552,10 +2569,10 @@ async function runScenario(config, evalRunId2, scenario, evalData, template, res
|
|
|
2552
2569
|
assertionContext
|
|
2553
2570
|
) : [];
|
|
2554
2571
|
const passed = assertionResults.filter(
|
|
2555
|
-
(r) => r.status ===
|
|
2572
|
+
(r) => r.status === import_evalforge_types6.AssertionResultStatus.PASSED
|
|
2556
2573
|
).length;
|
|
2557
2574
|
const failed = assertionResults.filter(
|
|
2558
|
-
(r) => r.status ===
|
|
2575
|
+
(r) => r.status === import_evalforge_types6.AssertionResultStatus.FAILED
|
|
2559
2576
|
).length;
|
|
2560
2577
|
const total = assertionResults.length;
|
|
2561
2578
|
const passRate = total > 0 ? Math.round(passed / total * 100) : 100;
|
|
@@ -2569,7 +2586,7 @@ async function runScenario(config, evalRunId2, scenario, evalData, template, res
|
|
|
2569
2586
|
}
|
|
2570
2587
|
|
|
2571
2588
|
// src/error-reporter.ts
|
|
2572
|
-
var
|
|
2589
|
+
var import_evalforge_types7 = require("@wix/evalforge-types");
|
|
2573
2590
|
function formatError(error, phase, context) {
|
|
2574
2591
|
const timestamp = (/* @__PURE__ */ new Date()).toISOString();
|
|
2575
2592
|
if (error instanceof Error) {
|
|
@@ -2818,7 +2835,7 @@ async function runEvaluation(projectId2, evalRunId2) {
|
|
|
2818
2835
|
};
|
|
2819
2836
|
try {
|
|
2820
2837
|
await api.updateEvalRun(projectId2, evalRunId2, {
|
|
2821
|
-
status:
|
|
2838
|
+
status: import_evalforge_types8.EvalStatus.COMPLETED,
|
|
2822
2839
|
completedAt: (/* @__PURE__ */ new Date()).toISOString()
|
|
2823
2840
|
});
|
|
2824
2841
|
} catch (updateErr) {
|
|
@@ -2859,7 +2876,7 @@ runEvaluation(projectId, evalRunId).then(() => {
|
|
|
2859
2876
|
authToken: config.authToken
|
|
2860
2877
|
});
|
|
2861
2878
|
await api.updateEvalRun(projectId, evalRunId, {
|
|
2862
|
-
status:
|
|
2879
|
+
status: import_evalforge_types8.EvalStatus.FAILED,
|
|
2863
2880
|
completedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
2864
2881
|
jobError,
|
|
2865
2882
|
jobStatus: "FAILED"
|
|
@@ -2882,7 +2899,7 @@ runEvaluation(projectId, evalRunId).then(() => {
|
|
|
2882
2899
|
authToken
|
|
2883
2900
|
});
|
|
2884
2901
|
await api.updateEvalRun(projectId, evalRunId, {
|
|
2885
|
-
status:
|
|
2902
|
+
status: import_evalforge_types8.EvalStatus.FAILED,
|
|
2886
2903
|
completedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
2887
2904
|
jobError: `Config load failed, then: ${jobError}`,
|
|
2888
2905
|
jobStatus: "FAILED"
|