@wix/evalforge-evaluator 0.87.0 → 0.89.0
This diff shows the changes between two publicly available versions of this package, each of which has been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in their respective public registries.
- package/build/index.js +39 -18
- package/build/index.js.map +4 -4
- package/build/index.mjs +32 -11
- package/build/index.mjs.map +4 -4
- package/build/types/run-scenario/agents/claude-code/claude-code-adapter.d.ts +2 -1
- package/build/types/run-scenario/agents/registry.d.ts +14 -14
- package/package.json +4 -4
package/build/index.mjs
CHANGED
|
@@ -213,7 +213,21 @@ function applyParamsToAssertion(assertion, params) {
|
|
|
213
213
|
);
|
|
214
214
|
}
|
|
215
215
|
}
|
|
216
|
-
return {
|
|
216
|
+
return {
|
|
217
|
+
...assertion,
|
|
218
|
+
prompt,
|
|
219
|
+
systemPrompt,
|
|
220
|
+
...params.model !== void 0 && { model: params.model },
|
|
221
|
+
...params.maxTokens !== void 0 && {
|
|
222
|
+
maxTokens: params.maxTokens
|
|
223
|
+
},
|
|
224
|
+
...params.temperature !== void 0 && {
|
|
225
|
+
temperature: params.temperature
|
|
226
|
+
},
|
|
227
|
+
...params.minScore !== void 0 && {
|
|
228
|
+
minScore: params.minScore
|
|
229
|
+
}
|
|
230
|
+
};
|
|
217
231
|
}
|
|
218
232
|
if (assertion.type === "skill_was_called" && params.skillNames !== void 0) {
|
|
219
233
|
return {
|
|
@@ -245,7 +259,10 @@ function resolveSystemAssertion(assertionId, params) {
|
|
|
245
259
|
type: "llm_judge",
|
|
246
260
|
prompt: params?.prompt ?? "",
|
|
247
261
|
systemPrompt: params?.systemPrompt,
|
|
248
|
-
minScore: params?.minScore
|
|
262
|
+
minScore: params?.minScore,
|
|
263
|
+
model: params?.model,
|
|
264
|
+
maxTokens: params?.maxTokens,
|
|
265
|
+
temperature: params?.temperature
|
|
249
266
|
};
|
|
250
267
|
break;
|
|
251
268
|
default:
|
|
@@ -381,7 +398,7 @@ async function fetchEvaluationData(api, projectId2, evalRunId2) {
|
|
|
381
398
|
// src/run-scenario/index.ts
|
|
382
399
|
import {
|
|
383
400
|
AssertionResultStatus,
|
|
384
|
-
|
|
401
|
+
DEFAULT_JUDGE_MODEL
|
|
385
402
|
} from "@wix/evalforge-types";
|
|
386
403
|
import {
|
|
387
404
|
evaluateAssertions as evaluateAssertionsBase
|
|
@@ -514,7 +531,7 @@ import { randomUUID as randomUUID2 } from "crypto";
|
|
|
514
531
|
// src/run-scenario/agents/registry.ts
|
|
515
532
|
var AgentAdapterRegistry = class {
|
|
516
533
|
/**
|
|
517
|
-
* Map of
|
|
534
|
+
* Map of run commands to their registered adapters.
|
|
518
535
|
* Multiple commands can map to the same adapter.
|
|
519
536
|
*/
|
|
520
537
|
adapters = /* @__PURE__ */ new Map();
|
|
@@ -543,9 +560,9 @@ var AgentAdapterRegistry = class {
|
|
|
543
560
|
}
|
|
544
561
|
}
|
|
545
562
|
/**
|
|
546
|
-
* Get an adapter by command
|
|
563
|
+
* Get an adapter by run command.
|
|
547
564
|
*
|
|
548
|
-
* @param runCommand - The command
|
|
565
|
+
* @param runCommand - The run command to look up
|
|
549
566
|
* @returns The registered adapter, or undefined if not found
|
|
550
567
|
*/
|
|
551
568
|
get(runCommand) {
|
|
@@ -554,7 +571,7 @@ var AgentAdapterRegistry = class {
|
|
|
554
571
|
/**
|
|
555
572
|
* Check if a command has a registered adapter.
|
|
556
573
|
*
|
|
557
|
-
* @param runCommand - The command
|
|
574
|
+
* @param runCommand - The run command to check
|
|
558
575
|
* @returns True if an adapter is registered for this command
|
|
559
576
|
*/
|
|
560
577
|
has(runCommand) {
|
|
@@ -571,7 +588,7 @@ var AgentAdapterRegistry = class {
|
|
|
571
588
|
/**
|
|
572
589
|
* Get all supported commands.
|
|
573
590
|
*
|
|
574
|
-
* @returns Array of all registered
|
|
591
|
+
* @returns Array of all registered run commands
|
|
575
592
|
*/
|
|
576
593
|
getSupportedCommands() {
|
|
577
594
|
return Array.from(this.adapters.keys());
|
|
@@ -621,6 +638,9 @@ function getAdapter(runCommand) {
|
|
|
621
638
|
return adapter;
|
|
622
639
|
}
|
|
623
640
|
|
|
641
|
+
// src/run-scenario/agents/claude-code/claude-code-adapter.ts
|
|
642
|
+
import { AgentRunCommand } from "@wix/evalforge-types";
|
|
643
|
+
|
|
624
644
|
// src/run-scenario/agents/claude-code/execute.ts
|
|
625
645
|
import {
|
|
626
646
|
ClaudeModel,
|
|
@@ -1642,7 +1662,7 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
|
|
|
1642
1662
|
var ClaudeCodeAdapter = class {
|
|
1643
1663
|
id = "claude-code";
|
|
1644
1664
|
name = "Claude Code";
|
|
1645
|
-
supportedCommands = [
|
|
1665
|
+
supportedCommands = [AgentRunCommand.CLAUDE];
|
|
1646
1666
|
/**
|
|
1647
1667
|
* Execute a skill using the Claude Code SDK.
|
|
1648
1668
|
*
|
|
@@ -2423,7 +2443,8 @@ function extractTemplateFiles(before, after) {
|
|
|
2423
2443
|
}
|
|
2424
2444
|
|
|
2425
2445
|
// src/run-scenario/run-agent-with-context.ts
|
|
2426
|
-
|
|
2446
|
+
import { AgentRunCommand as AgentRunCommand2 } from "@wix/evalforge-types";
|
|
2447
|
+
var DEFAULT_AGENT_COMMAND = AgentRunCommand2.CLAUDE;
|
|
2427
2448
|
async function runAgentWithContext(config, evalRunId2, scenario, evalData, workDir) {
|
|
2428
2449
|
const skillsGroupId = evalData.evalRun.skillsGroupId;
|
|
2429
2450
|
if (!skillsGroupId) {
|
|
@@ -2510,7 +2531,7 @@ async function runScenario(config, evalRunId2, scenario, evalData, template, res
|
|
|
2510
2531
|
}))
|
|
2511
2532
|
};
|
|
2512
2533
|
const { "x-wix-ai-gateway-stream": _stream, ...judgeHeaders } = config.aiGatewayHeaders;
|
|
2513
|
-
const defaultJudgeModel =
|
|
2534
|
+
const defaultJudgeModel = DEFAULT_JUDGE_MODEL;
|
|
2514
2535
|
const assertionContext = {
|
|
2515
2536
|
workDir,
|
|
2516
2537
|
defaultJudgeModel,
|