@wix/evalforge-evaluator 0.87.0 → 0.89.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/index.js +39 -18
- package/build/index.js.map +4 -4
- package/build/index.mjs +32 -11
- package/build/index.mjs.map +4 -4
- package/build/types/run-scenario/agents/claude-code/claude-code-adapter.d.ts +2 -1
- package/build/types/run-scenario/agents/registry.d.ts +14 -14
- package/package.json +4 -4
package/build/index.js
CHANGED
|
@@ -24,7 +24,7 @@ var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__ge
|
|
|
24
24
|
));
|
|
25
25
|
|
|
26
26
|
// src/index.ts
|
|
27
|
-
var
|
|
27
|
+
var import_evalforge_types8 = require("@wix/evalforge-types");
|
|
28
28
|
|
|
29
29
|
// src/config.ts
|
|
30
30
|
function loadConfig() {
|
|
@@ -233,7 +233,21 @@ function applyParamsToAssertion(assertion, params) {
|
|
|
233
233
|
);
|
|
234
234
|
}
|
|
235
235
|
}
|
|
236
|
-
return {
|
|
236
|
+
return {
|
|
237
|
+
...assertion,
|
|
238
|
+
prompt,
|
|
239
|
+
systemPrompt,
|
|
240
|
+
...params.model !== void 0 && { model: params.model },
|
|
241
|
+
...params.maxTokens !== void 0 && {
|
|
242
|
+
maxTokens: params.maxTokens
|
|
243
|
+
},
|
|
244
|
+
...params.temperature !== void 0 && {
|
|
245
|
+
temperature: params.temperature
|
|
246
|
+
},
|
|
247
|
+
...params.minScore !== void 0 && {
|
|
248
|
+
minScore: params.minScore
|
|
249
|
+
}
|
|
250
|
+
};
|
|
237
251
|
}
|
|
238
252
|
if (assertion.type === "skill_was_called" && params.skillNames !== void 0) {
|
|
239
253
|
return {
|
|
@@ -265,7 +279,10 @@ function resolveSystemAssertion(assertionId, params) {
|
|
|
265
279
|
type: "llm_judge",
|
|
266
280
|
prompt: params?.prompt ?? "",
|
|
267
281
|
systemPrompt: params?.systemPrompt,
|
|
268
|
-
minScore: params?.minScore
|
|
282
|
+
minScore: params?.minScore,
|
|
283
|
+
model: params?.model,
|
|
284
|
+
maxTokens: params?.maxTokens,
|
|
285
|
+
temperature: params?.temperature
|
|
269
286
|
};
|
|
270
287
|
break;
|
|
271
288
|
default:
|
|
@@ -399,7 +416,7 @@ async function fetchEvaluationData(api, projectId2, evalRunId2) {
|
|
|
399
416
|
}
|
|
400
417
|
|
|
401
418
|
// src/run-scenario/index.ts
|
|
402
|
-
var
|
|
419
|
+
var import_evalforge_types6 = require("@wix/evalforge-types");
|
|
403
420
|
var import_eval_assertions = require("@wix/eval-assertions");
|
|
404
421
|
|
|
405
422
|
// src/run-scenario/environment.ts
|
|
@@ -529,7 +546,7 @@ var import_crypto2 = require("crypto");
|
|
|
529
546
|
// src/run-scenario/agents/registry.ts
|
|
530
547
|
var AgentAdapterRegistry = class {
|
|
531
548
|
/**
|
|
532
|
-
* Map of
|
|
549
|
+
* Map of run commands to their registered adapters.
|
|
533
550
|
* Multiple commands can map to the same adapter.
|
|
534
551
|
*/
|
|
535
552
|
adapters = /* @__PURE__ */ new Map();
|
|
@@ -558,9 +575,9 @@ var AgentAdapterRegistry = class {
|
|
|
558
575
|
}
|
|
559
576
|
}
|
|
560
577
|
/**
|
|
561
|
-
* Get an adapter by command
|
|
578
|
+
* Get an adapter by run command.
|
|
562
579
|
*
|
|
563
|
-
* @param runCommand - The command
|
|
580
|
+
* @param runCommand - The run command to look up
|
|
564
581
|
* @returns The registered adapter, or undefined if not found
|
|
565
582
|
*/
|
|
566
583
|
get(runCommand) {
|
|
@@ -569,7 +586,7 @@ var AgentAdapterRegistry = class {
|
|
|
569
586
|
/**
|
|
570
587
|
* Check if a command has a registered adapter.
|
|
571
588
|
*
|
|
572
|
-
* @param runCommand - The command
|
|
589
|
+
* @param runCommand - The run command to check
|
|
573
590
|
* @returns True if an adapter is registered for this command
|
|
574
591
|
*/
|
|
575
592
|
has(runCommand) {
|
|
@@ -586,7 +603,7 @@ var AgentAdapterRegistry = class {
|
|
|
586
603
|
/**
|
|
587
604
|
* Get all supported commands.
|
|
588
605
|
*
|
|
589
|
-
* @returns Array of all registered
|
|
606
|
+
* @returns Array of all registered run commands
|
|
590
607
|
*/
|
|
591
608
|
getSupportedCommands() {
|
|
592
609
|
return Array.from(this.adapters.keys());
|
|
@@ -636,6 +653,9 @@ function getAdapter(runCommand) {
|
|
|
636
653
|
return adapter;
|
|
637
654
|
}
|
|
638
655
|
|
|
656
|
+
// src/run-scenario/agents/claude-code/claude-code-adapter.ts
|
|
657
|
+
var import_evalforge_types4 = require("@wix/evalforge-types");
|
|
658
|
+
|
|
639
659
|
// src/run-scenario/agents/claude-code/execute.ts
|
|
640
660
|
var import_evalforge_types3 = require("@wix/evalforge-types");
|
|
641
661
|
var import_crypto = require("crypto");
|
|
@@ -1652,7 +1672,7 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
|
|
|
1652
1672
|
var ClaudeCodeAdapter = class {
|
|
1653
1673
|
id = "claude-code";
|
|
1654
1674
|
name = "Claude Code";
|
|
1655
|
-
supportedCommands = [
|
|
1675
|
+
supportedCommands = [import_evalforge_types4.AgentRunCommand.CLAUDE];
|
|
1656
1676
|
/**
|
|
1657
1677
|
* Execute a skill using the Claude Code SDK.
|
|
1658
1678
|
*
|
|
@@ -2433,7 +2453,8 @@ function extractTemplateFiles(before, after) {
|
|
|
2433
2453
|
}
|
|
2434
2454
|
|
|
2435
2455
|
// src/run-scenario/run-agent-with-context.ts
|
|
2436
|
-
var
|
|
2456
|
+
var import_evalforge_types5 = require("@wix/evalforge-types");
|
|
2457
|
+
var DEFAULT_AGENT_COMMAND = import_evalforge_types5.AgentRunCommand.CLAUDE;
|
|
2437
2458
|
async function runAgentWithContext(config, evalRunId2, scenario, evalData, workDir) {
|
|
2438
2459
|
const skillsGroupId = evalData.evalRun.skillsGroupId;
|
|
2439
2460
|
if (!skillsGroupId) {
|
|
@@ -2520,7 +2541,7 @@ async function runScenario(config, evalRunId2, scenario, evalData, template, res
|
|
|
2520
2541
|
}))
|
|
2521
2542
|
};
|
|
2522
2543
|
const { "x-wix-ai-gateway-stream": _stream, ...judgeHeaders } = config.aiGatewayHeaders;
|
|
2523
|
-
const defaultJudgeModel =
|
|
2544
|
+
const defaultJudgeModel = import_evalforge_types6.DEFAULT_JUDGE_MODEL;
|
|
2524
2545
|
const assertionContext = {
|
|
2525
2546
|
workDir,
|
|
2526
2547
|
defaultJudgeModel,
|
|
@@ -2535,10 +2556,10 @@ async function runScenario(config, evalRunId2, scenario, evalData, template, res
|
|
|
2535
2556
|
assertionContext
|
|
2536
2557
|
) : [];
|
|
2537
2558
|
const passed = assertionResults.filter(
|
|
2538
|
-
(r) => r.status ===
|
|
2559
|
+
(r) => r.status === import_evalforge_types6.AssertionResultStatus.PASSED
|
|
2539
2560
|
).length;
|
|
2540
2561
|
const failed = assertionResults.filter(
|
|
2541
|
-
(r) => r.status ===
|
|
2562
|
+
(r) => r.status === import_evalforge_types6.AssertionResultStatus.FAILED
|
|
2542
2563
|
).length;
|
|
2543
2564
|
const total = assertionResults.length;
|
|
2544
2565
|
const passRate = total > 0 ? Math.round(passed / total * 100) : 100;
|
|
@@ -2552,7 +2573,7 @@ async function runScenario(config, evalRunId2, scenario, evalData, template, res
|
|
|
2552
2573
|
}
|
|
2553
2574
|
|
|
2554
2575
|
// src/error-reporter.ts
|
|
2555
|
-
var
|
|
2576
|
+
var import_evalforge_types7 = require("@wix/evalforge-types");
|
|
2556
2577
|
function formatError(error, phase, context) {
|
|
2557
2578
|
const timestamp = (/* @__PURE__ */ new Date()).toISOString();
|
|
2558
2579
|
if (error instanceof Error) {
|
|
@@ -2801,7 +2822,7 @@ async function runEvaluation(projectId2, evalRunId2) {
|
|
|
2801
2822
|
};
|
|
2802
2823
|
try {
|
|
2803
2824
|
await api.updateEvalRun(projectId2, evalRunId2, {
|
|
2804
|
-
status:
|
|
2825
|
+
status: import_evalforge_types8.EvalStatus.COMPLETED,
|
|
2805
2826
|
completedAt: (/* @__PURE__ */ new Date()).toISOString()
|
|
2806
2827
|
});
|
|
2807
2828
|
} catch (updateErr) {
|
|
@@ -2842,7 +2863,7 @@ runEvaluation(projectId, evalRunId).then(() => {
|
|
|
2842
2863
|
authToken: config.authToken
|
|
2843
2864
|
});
|
|
2844
2865
|
await api.updateEvalRun(projectId, evalRunId, {
|
|
2845
|
-
status:
|
|
2866
|
+
status: import_evalforge_types8.EvalStatus.FAILED,
|
|
2846
2867
|
completedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
2847
2868
|
jobError,
|
|
2848
2869
|
jobStatus: "FAILED"
|
|
@@ -2865,7 +2886,7 @@ runEvaluation(projectId, evalRunId).then(() => {
|
|
|
2865
2886
|
authToken
|
|
2866
2887
|
});
|
|
2867
2888
|
await api.updateEvalRun(projectId, evalRunId, {
|
|
2868
|
-
status:
|
|
2889
|
+
status: import_evalforge_types8.EvalStatus.FAILED,
|
|
2869
2890
|
completedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
2870
2891
|
jobError: `Config load failed, then: ${jobError}`,
|
|
2871
2892
|
jobStatus: "FAILED"
|