@wix/evalforge-evaluator 0.98.0 → 0.100.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/index.js +237 -151
- package/build/index.js.map +4 -4
- package/build/index.mjs +233 -147
- package/build/index.mjs.map +4 -4
- package/build/types/fetch-evaluation-data.d.ts +2 -2
- package/build/types/resolve-placeholders.d.ts +26 -0
- package/build/types/run-scenario/agents/claude-code/resolve-mcp-placeholders.d.ts +31 -0
- package/build/types/run-scenario/agents/claude-code/write-mcp.d.ts +3 -0
- package/build/types/run-scenario/agents/claude-code/write-skills.d.ts +3 -1
- package/build/types/run-scenario/agents/registry.d.ts +32 -63
- package/build/types/run-scenario/index.d.ts +1 -1
- package/build/types/run-scenario/run-agent-with-context.d.ts +3 -3
- package/build/types/run-scenario/utils/write-files.d.ts +6 -0
- package/package.json +4 -4
package/build/index.js
CHANGED
|
@@ -207,6 +207,53 @@ function createApiClient(serverUrl, options = "") {
|
|
|
207
207
|
|
|
208
208
|
// src/fetch-evaluation-data.ts
|
|
209
209
|
var import_evalforge_types = require("@wix/evalforge-types");
|
|
210
|
+
|
|
211
|
+
// src/resolve-placeholders.ts
|
|
212
|
+
// Matches `{{ key }}` placeholders; capture group 1 is the raw (untrimmed) key.
var PLACEHOLDER_PATTERN = /\{\{([^}]+)\}\}/g;

/**
 * Collect the distinct placeholder keys referenced anywhere inside `value`.
 *
 * @param {*} value - A string, array, or plain object to scan (other types are ignored).
 * @returns {string[]} Trimmed placeholder keys, de-duplicated, in first-seen order.
 */
function findPlaceholders(value) {
  const keys = /* @__PURE__ */ new Set();
  collectPlaceholders(value, keys);
  return [...keys];
}

/**
 * Recursively walk `value` and add every trimmed placeholder key to `keys`.
 *
 * @param {*} value - String, array, or object to scan; anything else is skipped.
 * @param {Set<string>} keys - Accumulator that receives the keys found.
 */
function collectPlaceholders(value, keys) {
  if (typeof value === "string") {
    for (const match of value.matchAll(PLACEHOLDER_PATTERN)) {
      keys.add(match[1].trim());
    }
  } else if (Array.isArray(value)) {
    for (const item of value) {
      collectPlaceholders(item, keys);
    }
  } else if (typeof value === "object" && value !== null) {
    for (const val of Object.values(value)) {
      collectPlaceholders(val, keys);
    }
  }
}

/**
 * Return a copy of `value` with every `{{key}}` placeholder replaced by
 * `placeholders[key]`. Placeholders without a matching entry are left as-is.
 *
 * @param {*} value - String, array, or object to resolve; other types are returned unchanged.
 * @param {Object} placeholders - Map of placeholder key to replacement value.
 * @returns {*} A new string/array/object with placeholders substituted.
 */
function resolveValue(value, placeholders) {
  if (typeof value === "string") {
    return value.replace(PLACEHOLDER_PATTERN, (match, key) => {
      const trimmed = key.trim();
      // Own-property check only: `in` would also match inherited names such as
      // "constructor" or "toString" and substitute Object.prototype members.
      return Object.hasOwn(placeholders, trimmed) ? placeholders[trimmed] : match;
    });
  }
  if (Array.isArray(value)) {
    return value.map((item) => resolveValue(item, placeholders));
  }
  if (typeof value === "object" && value !== null) {
    // Object.fromEntries defines own data properties, so an own "__proto__"
    // key (e.g. from JSON.parse) is copied instead of rewriting the prototype.
    return Object.fromEntries(
      Object.entries(value).map(([k, v]) => [k, resolveValue(v, placeholders)])
    );
  }
  return value;
}

/**
 * Resolve placeholders in a single string.
 *
 * @param {string} text - Text that may contain `{{key}}` placeholders.
 * @param {Object} placeholders - Map of placeholder key to replacement value.
 * @returns {string} The text with known placeholders substituted.
 */
function resolvePlaceholdersInString(text, placeholders) {
  return resolveValue(text, placeholders);
}
|
|
255
|
+
|
|
256
|
+
// src/fetch-evaluation-data.ts
|
|
210
257
|
function parseSkillNamesFromParams(value) {
|
|
211
258
|
if (typeof value !== "string") {
|
|
212
259
|
return [];
|
|
@@ -222,13 +269,11 @@ function applyParamsToAssertion(assertion, params) {
|
|
|
222
269
|
return assertion;
|
|
223
270
|
}
|
|
224
271
|
if (assertion.type === "llm_judge") {
|
|
225
|
-
|
|
272
|
+
const stringParams = {};
|
|
226
273
|
for (const [key, value] of Object.entries(params)) {
|
|
227
|
-
|
|
228
|
-
const escapedPlaceholder = placeholder.replace(/[{}]/g, "\\$&");
|
|
229
|
-
const replacement = String(value ?? "");
|
|
230
|
-
prompt = prompt.replace(new RegExp(escapedPlaceholder, "g"), replacement);
|
|
274
|
+
stringParams[key] = String(value ?? "");
|
|
231
275
|
}
|
|
276
|
+
const prompt = resolvePlaceholdersInString(assertion.prompt, stringParams);
|
|
232
277
|
return {
|
|
233
278
|
...assertion,
|
|
234
279
|
prompt,
|
|
@@ -337,9 +382,9 @@ async function fetchEvaluationData(api, projectId2, evalRunId2) {
|
|
|
337
382
|
const scenarios = await Promise.all(
|
|
338
383
|
evalRun.scenarioIds.map((id) => api.getScenario(projectId2, id))
|
|
339
384
|
);
|
|
340
|
-
let
|
|
385
|
+
let agent = null;
|
|
341
386
|
if (evalRun.agentId) {
|
|
342
|
-
|
|
387
|
+
agent = await api.getAgent(projectId2, evalRun.agentId);
|
|
343
388
|
}
|
|
344
389
|
let skills = [];
|
|
345
390
|
let skillsGroup = null;
|
|
@@ -440,7 +485,7 @@ async function fetchEvaluationData(api, projectId2, evalRunId2) {
|
|
|
440
485
|
const skillsGroupName = skillsGroup?.name ?? "";
|
|
441
486
|
return {
|
|
442
487
|
evalRun,
|
|
443
|
-
|
|
488
|
+
agent,
|
|
444
489
|
skills,
|
|
445
490
|
skillsGroup,
|
|
446
491
|
skillsGroupName,
|
|
@@ -459,56 +504,18 @@ var import_eval_assertions = require("@wix/eval-assertions");
|
|
|
459
504
|
var import_fs = require("fs");
|
|
460
505
|
var import_os = require("os");
|
|
461
506
|
var import_path2 = __toESM(require("path"));
|
|
462
|
-
var
|
|
507
|
+
var import_evalforge_github_client = require("@wix/evalforge-github-client");
|
|
463
508
|
|
|
464
|
-
// src/run-scenario/
|
|
509
|
+
// src/run-scenario/utils/write-files.ts
|
|
465
510
|
var import_promises = require("fs/promises");
|
|
466
511
|
var import_path = require("path");
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
await Promise.all(
|
|
470
|
-
skills.map((skill) => writeSkillToFilesystem(cwd, skill, fetchFn))
|
|
471
|
-
);
|
|
472
|
-
}
|
|
473
|
-
async function writeSkillToFilesystem(cwd, skill, fetchFn = import_evalforge_github_client.fetchGitHubFolder) {
|
|
474
|
-
const skillName = skill.name;
|
|
475
|
-
const skillDir = (0, import_path.join)(cwd, ".claude", "skills", skillName);
|
|
476
|
-
await (0, import_promises.mkdir)(skillDir, { recursive: true });
|
|
477
|
-
const version = skill.latestVersion;
|
|
478
|
-
if (version?.files && version.files.length > 0) {
|
|
479
|
-
await writeSkillFiles(skillDir, version.files);
|
|
480
|
-
console.log(
|
|
481
|
-
`[Skill] ${skillName}: wrote ${version.files.length} file(s) from snapshot`
|
|
482
|
-
);
|
|
483
|
-
} else if (skill.source) {
|
|
484
|
-
try {
|
|
485
|
-
const files = await fetchFn(skill.source, {
|
|
486
|
-
userAgent: "EvalForge-Evaluator"
|
|
487
|
-
});
|
|
488
|
-
await writeSkillFiles(skillDir, files);
|
|
489
|
-
console.log(
|
|
490
|
-
`[Skill] ${skillName}: wrote ${files.length} file(s) from GitHub (live)`
|
|
491
|
-
);
|
|
492
|
-
} catch (error) {
|
|
493
|
-
const message = error instanceof Error ? error.message : "Unknown error";
|
|
494
|
-
console.error(
|
|
495
|
-
`[Skill] ${skillName}: GitHub fetch failed: ${message}, no files to fall back to`
|
|
496
|
-
);
|
|
497
|
-
throw new Error(
|
|
498
|
-
`Failed to write skill ${skillName} to filesystem: ${message}`
|
|
499
|
-
);
|
|
500
|
-
}
|
|
501
|
-
} else {
|
|
502
|
-
throw new Error(`Skill ${skillName} has no files and no source configured`);
|
|
503
|
-
}
|
|
504
|
-
}
|
|
505
|
-
async function writeSkillFiles(skillDir, files) {
|
|
506
|
-
const resolvedBase = (0, import_path.resolve)(skillDir);
|
|
512
|
+
async function writeFilesToDirectory(targetDir, files) {
|
|
513
|
+
const resolvedBase = (0, import_path.resolve)(targetDir);
|
|
507
514
|
for (const file of files) {
|
|
508
|
-
const filePath = (0, import_path.resolve)(
|
|
515
|
+
const filePath = (0, import_path.resolve)(targetDir, file.path);
|
|
509
516
|
if (!filePath.startsWith(resolvedBase + import_path.sep) && filePath !== resolvedBase) {
|
|
510
517
|
throw new Error(
|
|
511
|
-
`Path traversal detected
|
|
518
|
+
`Path traversal detected: "${file.path}" resolves outside target directory`
|
|
512
519
|
);
|
|
513
520
|
}
|
|
514
521
|
await (0, import_promises.mkdir)((0, import_path.dirname)(filePath), { recursive: true });
|
|
@@ -524,10 +531,10 @@ async function fetchAndWriteTemplateFiles(template, workDir) {
|
|
|
524
531
|
);
|
|
525
532
|
return;
|
|
526
533
|
}
|
|
527
|
-
const files = await (0,
|
|
534
|
+
const files = await (0, import_evalforge_github_client.fetchGitHubFolder)(template.source, {
|
|
528
535
|
userAgent: "EvalForge-Evaluator"
|
|
529
536
|
});
|
|
530
|
-
await
|
|
537
|
+
await writeFilesToDirectory(workDir, files);
|
|
531
538
|
}
|
|
532
539
|
function writeWixEnvFile(workDir) {
|
|
533
540
|
const configPath = import_path2.default.join(workDir, "wix.config.json");
|
|
@@ -581,86 +588,76 @@ var import_crypto2 = require("crypto");
|
|
|
581
588
|
|
|
582
589
|
// src/run-scenario/agents/registry.ts
|
|
583
590
|
/**
 * Registry of agent adapters, addressable by CLI command or by adapter ID.
 */
var AgentAdapterRegistry = class {
  /** Map of CLI commands to their registered adapters. */
  commandMap = /* @__PURE__ */ new Map();
  /** Map of adapter IDs to their registered adapters. */
  idMap = /* @__PURE__ */ new Map();
  /** Set of all registered adapter instances (for getAll). */
  registeredAdapters = /* @__PURE__ */ new Set();
  /**
   * Register an agent adapter.
   *
   * The adapter is registered by its ID and for all commands in its supportedCommands array.
   * If a command or ID is already registered, it will be overwritten with a warning.
   * When a different adapter instance already holds the same ID, that instance is
   * fully unregistered first so the ID map, command map, and adapter set stay in sync.
   *
   * @param adapter - The adapter to register (reads `id` and `supportedCommands`)
   */
  register(adapter) {
    const previous = this.idMap.get(adapter.id);
    if (previous && previous !== adapter) {
      console.warn(
        `[AgentAdapterRegistry] Adapter ID "${adapter.id}" already registered. Overwriting with the new adapter.`
      );
      // Evict the stale instance; otherwise it would linger in getAll() and
      // a later unregister(id) would remove the wrong adapter.
      this.unregister(adapter.id);
    }
    this.registeredAdapters.add(adapter);
    this.idMap.set(adapter.id, adapter);
    for (const command of adapter.supportedCommands) {
      if (this.commandMap.has(command)) {
        const existing = this.commandMap.get(command);
        console.warn(
          `[AgentAdapterRegistry] Command "${command}" already registered by adapter "${existing.id}". Overwriting with adapter "${adapter.id}".`
        );
      }
      this.commandMap.set(command, adapter);
    }
  }
  /** Get an adapter by CLI command. */
  getByCommand(command) {
    return this.commandMap.get(command);
  }
  /** Get an adapter by adapter ID. */
  getById(adapterId) {
    return this.idMap.get(adapterId);
  }
  /**
   * Unified lookup: tries CLI command first, then adapter ID.
   * Use this when the identifier could be either a command or an adapter ID.
   */
  resolve(identifier) {
    return this.commandMap.get(identifier) ?? this.idMap.get(identifier);
  }
  /** Check if a command or adapter ID has a registered adapter. */
  has(identifier) {
    return this.commandMap.has(identifier) || this.idMap.has(identifier);
  }
  /** Get all registered adapters. */
  getAll() {
    return Array.from(this.registeredAdapters);
  }
  /** Get all supported CLI commands. */
  getSupportedCommands() {
    return Array.from(this.commandMap.keys());
  }
  /** Get all registered adapter IDs. */
  getAdapterIds() {
    return Array.from(this.idMap.keys());
  }
  /**
   * Unregister an adapter by its ID.
   * Removes the adapter and all its command mappings.
   *
   * @param adapterId - The ID of the adapter to remove
   * @returns True if the adapter was found and removed
   */
  unregister(adapterId) {
    let found = false;
    for (const adapter of this.registeredAdapters) {
      if (adapter.id === adapterId) {
        this.registeredAdapters.delete(adapter);
        this.idMap.delete(adapterId);
        found = true;
        for (const command of adapter.supportedCommands) {
          // Only drop commands still owned by this adapter; another adapter
          // may have taken the command over since registration.
          if (this.commandMap.get(command) === adapter) {
            this.commandMap.delete(command);
          }
        }
        break;
      }
    }
    return found;
  }
  /** Clear all registered adapters. Primarily useful for testing. */
  clear() {
    this.commandMap.clear();
    this.idMap.clear();
    this.registeredAdapters.clear();
  }
};
|
|
680
675
|
var defaultRegistry = new AgentAdapterRegistry();
|
|
681
|
-
function getAdapter(
|
|
682
|
-
const adapter = defaultRegistry.
|
|
676
|
+
function getAdapter(identifier) {
|
|
677
|
+
const adapter = defaultRegistry.resolve(identifier);
|
|
683
678
|
if (!adapter) {
|
|
684
|
-
const
|
|
679
|
+
const commands = defaultRegistry.getSupportedCommands();
|
|
680
|
+
const ids = defaultRegistry.getAdapterIds();
|
|
685
681
|
throw new Error(
|
|
686
|
-
`No agent adapter registered for
|
|
682
|
+
`No agent adapter registered for "${identifier}". Supported commands: ${commands.length > 0 ? commands.join(", ") : "(none)"}. Registered adapters: ${ids.length > 0 ? ids.join(", ") : "(none)"}`
|
|
687
683
|
);
|
|
688
684
|
}
|
|
689
685
|
return adapter;
|
|
@@ -694,12 +690,97 @@ var import_evalforge_types4 = require("@wix/evalforge-types");
|
|
|
694
690
|
|
|
695
691
|
// src/run-scenario/agents/claude-code/execute.ts
|
|
696
692
|
var import_evalforge_types3 = require("@wix/evalforge-types");
|
|
697
|
-
var import_crypto = require("crypto");
|
|
698
693
|
|
|
699
|
-
// src/run-scenario/agents/claude-code/write-
|
|
694
|
+
// src/run-scenario/agents/claude-code/write-skills.ts
|
|
700
695
|
var import_promises2 = require("fs/promises");
|
|
701
696
|
var import_path3 = require("path");
|
|
697
|
+
var import_evalforge_github_client2 = require("@wix/evalforge-github-client");
|
|
698
|
+
/**
 * Write every skill in `skills` under `cwd`, starting all writes before awaiting any.
 *
 * @param {string} cwd - Working directory passed through to writeSkillToFilesystem.
 * @param {Array} skills - Skills to materialise.
 * @param {Function} [fetchFn] - Folder fetcher forwarded to each per-skill write.
 * @returns {Promise<void>} Resolves once all per-skill writes have completed.
 */
async function writeSkillsToFilesystem(cwd, skills, fetchFn = import_evalforge_github_client2.fetchGitHubFolder) {
  const writeOne = (skill) => writeSkillToFilesystem(cwd, skill, fetchFn);
  const pendingWrites = skills.map(writeOne);
  await Promise.all(pendingWrites);
}
|
|
703
|
+
/**
 * Write one skill's files into `<cwd>/.claude/skills/<skill.name>`.
 *
 * Files come from `skill.latestVersion.files` when that list is non-empty;
 * otherwise they are fetched via `fetchFn(skill.source, …)`.
 *
 * @param {string} cwd - Working directory that receives the `.claude/skills` tree.
 * @param {Object} skill - Skill record; reads `name`, `latestVersion`, and `source`.
 * @param {Function} [fetchFn] - Called as `fetchFn(skill.source, { userAgent })`; resolves to the file list.
 * @returns {Promise<void>}
 * @throws {Error} If the skill name resolves outside the skills directory, if the
 *   fetch/write fails, or if the skill has neither files nor a source.
 */
async function writeSkillToFilesystem(cwd, skill, fetchFn = import_evalforge_github_client2.fetchGitHubFolder) {
  const skillName = skill.name;
  const skillsRoot = (0, import_path3.resolve)(cwd, ".claude", "skills");
  const skillDir = (0, import_path3.join)(cwd, ".claude", "skills", skillName);
  // The skill name is external data: refuse names (e.g. "../../x") that would
  // place the skill directory outside `.claude/skills`. Same containment rule
  // as the per-file check performed when the files are written.
  const resolvedSkillDir = (0, import_path3.resolve)(skillDir);
  if (!resolvedSkillDir.startsWith(skillsRoot + import_path3.sep) && resolvedSkillDir !== skillsRoot) {
    throw new Error(
      `Path traversal detected: skill name "${skillName}" resolves outside the skills directory`
    );
  }
  await (0, import_promises2.mkdir)(skillDir, { recursive: true });
  const version = skill.latestVersion;
  if (version?.files && version.files.length > 0) {
    await writeFilesToDirectory(skillDir, version.files);
    console.log(
      `[Skill] ${skillName}: wrote ${version.files.length} file(s) from snapshot`
    );
  } else if (skill.source) {
    try {
      const files = await fetchFn(skill.source, {
        userAgent: "EvalForge-Evaluator"
      });
      await writeFilesToDirectory(skillDir, files);
      console.log(
        `[Skill] ${skillName}: wrote ${files.length} file(s) from GitHub (live)`
      );
    } catch (error) {
      const message = error instanceof Error ? error.message : "Unknown error";
      console.error(
        `[Skill] ${skillName}: GitHub fetch failed: ${message}, no files to fall back to`
      );
      // Keep the original error attached so the stack/root cause is not lost.
      throw new Error(
        `Failed to write skill ${skillName} to filesystem: ${message}`,
        { cause: error }
      );
    }
  } else {
    throw new Error(`Skill ${skillName} has no files and no source configured`);
  }
}
|
|
735
|
+
|
|
736
|
+
// src/run-scenario/agents/claude-code/execute.ts
|
|
737
|
+
var import_crypto = require("crypto");
|
|
738
|
+
|
|
739
|
+
// src/run-scenario/agents/claude-code/write-mcp.ts
|
|
740
|
+
var import_promises4 = require("fs/promises");
|
|
741
|
+
var import_path5 = require("path");
|
|
702
742
|
var import_evalforge_types2 = require("@wix/evalforge-types");
|
|
743
|
+
|
|
744
|
+
// src/run-scenario/agents/claude-code/resolve-mcp-placeholders.ts
|
|
745
|
+
var import_promises3 = require("fs/promises");
|
|
746
|
+
var import_path4 = require("path");
|
|
747
|
+
var import_os2 = require("os");
|
|
748
|
+
var WIX_AUTH_FILE = (0, import_path4.join)((0, import_os2.homedir)(), ".wix", "auth", "api-key.json");
|
|
749
|
+
/**
 * Load placeholder values from the Wix auth JSON file.
 *
 * Never throws: any read/parse problem is logged as a warning and yields `{}`.
 *
 * @param {string} [authFilePath] - Path of the auth JSON file (defaults to WIX_AUTH_FILE).
 * @returns {Promise<Object>} `{ "wix-auth-token", "wix-auth-user-id" }` when both
 *   `token` and `userInfo.userId` are present in the file, otherwise `{}`.
 */
async function loadWixAuthPlaceholders(authFilePath = WIX_AUTH_FILE) {
  try {
    const content = await (0, import_promises3.readFile)(authFilePath, "utf-8");
    const auth = JSON.parse(content);
    // `auth?.` guards against a file whose JSON payload is literally `null`.
    if (!auth?.token || !auth.userInfo?.userId) {
      // Say so explicitly: the file exists but is unusable, which is a
      // different problem from the file being absent.
      console.warn(
        `[MCP] Wix auth file ${authFilePath} is missing "token" or "userInfo.userId"`
      );
      return {};
    }
    return {
      "wix-auth-token": auth.token,
      "wix-auth-user-id": auth.userInfo.userId
    };
  } catch (err) {
    const message = err instanceof Error ? err.message : String(err);
    console.warn(
      `[MCP] Could not load Wix auth file: ${message}`
    );
    return {};
  }
}
|
|
767
|
+
/**
 * Substitute `{{placeholder}}` tokens in an MCP servers config using values
 * loaded from the Wix auth file.
 *
 * @param {Object} mcpServers - MCP servers config to scan and resolve.
 * @param {string} [authFilePath] - Optional auth file path forwarded to loadWixAuthPlaceholders.
 * @returns {Promise<Object>} The input itself when it contains no placeholders,
 *   otherwise a resolved copy.
 * @throws {Error} If any placeholder has no value available.
 */
async function resolveMcpPlaceholders(mcpServers, authFilePath) {
  const needed = findPlaceholders(mcpServers);
  if (needed.length === 0) {
    // Nothing to resolve: skip reading the auth file entirely.
    return mcpServers;
  }
  const placeholders = await loadWixAuthPlaceholders(authFilePath);
  // Own-property check: `in` would treat inherited names such as
  // "constructor" as resolved and let them slip past this validation.
  const unresolved = needed.filter((key) => !Object.hasOwn(placeholders, key));
  if (unresolved.length > 0) {
    throw new Error(
      `MCP config contains unresolvable placeholders: ${unresolved.map((k) => `{{${k}}}`).join(", ")}. Ensure ~/.wix/auth/api-key.json exists (run \`npx @wix/cli login\`).`
    );
  }
  console.log(`[MCP] Resolved ${needed.length} placeholder(s)`);
  return resolveValue(mcpServers, placeholders);
}
|
|
782
|
+
|
|
783
|
+
// src/run-scenario/agents/claude-code/write-mcp.ts
|
|
703
784
|
async function writeMcpToFilesystem(cwd, mcps) {
|
|
704
785
|
if (mcps.length === 0) return;
|
|
705
786
|
const mcpServers = {};
|
|
@@ -714,19 +795,20 @@ async function writeMcpToFilesystem(cwd, mcps) {
|
|
|
714
795
|
mcpServers[key] = value;
|
|
715
796
|
}
|
|
716
797
|
}
|
|
798
|
+
const resolvedServers = await resolveMcpPlaceholders(mcpServers);
|
|
717
799
|
const content = JSON.stringify(
|
|
718
|
-
{ [import_evalforge_types2.MCP_SERVERS_JSON_KEY]:
|
|
800
|
+
{ [import_evalforge_types2.MCP_SERVERS_JSON_KEY]: resolvedServers },
|
|
719
801
|
null,
|
|
720
802
|
2
|
|
721
803
|
);
|
|
722
|
-
const filePath = (0,
|
|
723
|
-
await (0,
|
|
804
|
+
const filePath = (0, import_path5.join)(cwd, ".mcp.json");
|
|
805
|
+
await (0, import_promises4.writeFile)(filePath, content, "utf8");
|
|
724
806
|
console.log(`[MCP] Written to ${filePath}`);
|
|
725
807
|
}
|
|
726
808
|
|
|
727
809
|
// src/run-scenario/agents/claude-code/write-sub-agents.ts
|
|
728
|
-
var
|
|
729
|
-
var
|
|
810
|
+
var import_promises5 = require("fs/promises");
|
|
811
|
+
var import_path6 = require("path");
|
|
730
812
|
var AGENTS_DIR = ".claude/agents";
|
|
731
813
|
function toAgentFilename(name, index, nameCount) {
|
|
732
814
|
const base = (name || "").toLowerCase().replace(/\s+/g, "-").replace(/[^a-z0-9-]/g, "").replace(/^-+|-+$/g, "") || `sub-agent-${index}`;
|
|
@@ -736,20 +818,20 @@ function toAgentFilename(name, index, nameCount) {
|
|
|
736
818
|
}
|
|
737
819
|
/**
 * Write each sub-agent's markdown to `<cwd>/<AGENTS_DIR>/<filename>.md`.
 * Does nothing when `subAgents` is empty. Files are written one at a time, in order.
 *
 * @param {string} cwd - Directory under which the agents folder is created.
 * @param {Array} subAgents - Sub-agents; reads `name` and `subAgentMd` from each.
 * @returns {Promise<void>}
 */
async function writeSubAgentsToFilesystem(cwd, subAgents) {
  if (subAgents.length === 0) {
    return;
  }
  const agentsDir = (0, import_path6.join)(cwd, AGENTS_DIR);
  await (0, import_promises5.mkdir)(agentsDir, { recursive: true });
  // Shared across iterations so toAgentFilename can track names already used.
  const nameCount = /* @__PURE__ */ new Map();
  for (let index = 0; index < subAgents.length; index += 1) {
    const subAgent = subAgents[index];
    const filename = toAgentFilename(subAgent.name, index, nameCount);
    const targetPath = (0, import_path6.join)(agentsDir, `${filename}.md`);
    await (0, import_promises5.writeFile)(targetPath, subAgent.subAgentMd, "utf8");
  }
  console.log(`[SubAgents] Written to ${agentsDir}`);
}
|
|
749
831
|
|
|
750
832
|
// src/run-scenario/agents/claude-code/write-rules.ts
|
|
751
|
-
var
|
|
752
|
-
var
|
|
833
|
+
var import_promises6 = require("fs/promises");
|
|
834
|
+
var import_path7 = require("path");
|
|
753
835
|
var CURSOR_RULES_DIR = ".cursor/rules";
|
|
754
836
|
function toRuleFilename(name, index, nameCount) {
|
|
755
837
|
const base = (name || "").toLowerCase().replace(/\s+/g, "-").replace(/[^a-z0-9-]/g, "").replace(/^-+|-+$/g, "") || `rule-${index}`;
|
|
@@ -760,13 +842,13 @@ function toRuleFilename(name, index, nameCount) {
|
|
|
760
842
|
/**
 * Append `content` to the file at `filePath`, separated from any existing
 * content by one blank line. Creates the file when it does not exist.
 *
 * @param {string} filePath - File to append to.
 * @param {string} content - Text to append.
 * @returns {Promise<void>}
 * @throws Rethrows any read error other than "file not found", and any write error.
 */
async function appendToFile(filePath, content) {
  let existing = "";
  try {
    existing = await (0, import_promises6.readFile)(filePath, "utf8");
  } catch (err) {
    // Only a missing file means "start from empty". Any other failure would
    // otherwise cause the existing content to be silently overwritten below.
    if (err?.code !== "ENOENT") {
      throw err;
    }
  }
  const merged = existing ? `${existing.trimEnd()}

${content}` : content;
  await (0, import_promises6.writeFile)(filePath, merged, "utf8");
}
|
|
771
853
|
async function writeRulesToFilesystem(cwd, rules) {
|
|
772
854
|
if (rules.length === 0) return;
|
|
@@ -775,21 +857,21 @@ async function writeRulesToFilesystem(cwd, rules) {
|
|
|
775
857
|
for (const [i, rule] of rules.entries()) {
|
|
776
858
|
switch (rule.ruleType) {
|
|
777
859
|
case "claude-md": {
|
|
778
|
-
await appendToFile((0,
|
|
860
|
+
await appendToFile((0, import_path7.join)(cwd, "CLAUDE.md"), rule.content);
|
|
779
861
|
break;
|
|
780
862
|
}
|
|
781
863
|
case "agents-md": {
|
|
782
|
-
await appendToFile((0,
|
|
864
|
+
await appendToFile((0, import_path7.join)(cwd, "AGENTS.md"), rule.content);
|
|
783
865
|
break;
|
|
784
866
|
}
|
|
785
867
|
case "cursor-rule": {
|
|
786
868
|
if (!hasCursorRules) {
|
|
787
|
-
await (0,
|
|
869
|
+
await (0, import_promises6.mkdir)((0, import_path7.join)(cwd, CURSOR_RULES_DIR), { recursive: true });
|
|
788
870
|
hasCursorRules = true;
|
|
789
871
|
}
|
|
790
872
|
const filename = toRuleFilename(rule.name, i, nameCount);
|
|
791
|
-
const filePath = (0,
|
|
792
|
-
await (0,
|
|
873
|
+
const filePath = (0, import_path7.join)(cwd, CURSOR_RULES_DIR, `${filename}.md`);
|
|
874
|
+
await (0, import_promises6.writeFile)(filePath, rule.content, "utf8");
|
|
793
875
|
break;
|
|
794
876
|
}
|
|
795
877
|
}
|
|
@@ -1824,7 +1906,7 @@ defaultRegistry.register(claudeCodeAdapter);
|
|
|
1824
1906
|
|
|
1825
1907
|
// src/run-scenario/file-diff.ts
|
|
1826
1908
|
var import_fs2 = require("fs");
|
|
1827
|
-
var
|
|
1909
|
+
var import_path8 = require("path");
|
|
1828
1910
|
|
|
1829
1911
|
// ../../node_modules/diff/lib/index.mjs
|
|
1830
1912
|
function Diff() {
|
|
@@ -2000,7 +2082,7 @@ Diff.prototype = {
|
|
|
2000
2082
|
tokenize: function tokenize(value) {
|
|
2001
2083
|
return Array.from(value);
|
|
2002
2084
|
},
|
|
2003
|
-
join: function
|
|
2085
|
+
join: function join6(chars) {
|
|
2004
2086
|
return chars.join("");
|
|
2005
2087
|
},
|
|
2006
2088
|
postProcess: function postProcess(changeObjects) {
|
|
@@ -2440,8 +2522,8 @@ function snapshotDirectory(dir, baseDir) {
|
|
|
2440
2522
|
}
|
|
2441
2523
|
const entries = (0, import_fs2.readdirSync)(dir, { withFileTypes: true });
|
|
2442
2524
|
for (const entry of entries) {
|
|
2443
|
-
const fullPath = (0,
|
|
2444
|
-
const relativePath = (0,
|
|
2525
|
+
const fullPath = (0, import_path8.join)(dir, entry.name);
|
|
2526
|
+
const relativePath = (0, import_path8.relative)(base, fullPath);
|
|
2445
2527
|
if (shouldIgnore(entry.name)) {
|
|
2446
2528
|
continue;
|
|
2447
2529
|
}
|
|
@@ -2553,14 +2635,17 @@ var import_evalforge_types5 = require("@wix/evalforge-types");
|
|
|
2553
2635
|
var DEFAULT_AGENT_COMMAND = import_evalforge_types5.AgentRunCommand.CLAUDE;
|
|
2554
2636
|
async function runAgentWithContext(config, evalRunId2, scenario, evalData, workDir) {
|
|
2555
2637
|
const skillsGroupId = evalData.evalRun.skillsGroupId;
|
|
2638
|
+
const agent = evalData.agent ?? void 0;
|
|
2639
|
+
const isSDK = agent?.agentType === import_evalforge_types5.AgentType.SDK;
|
|
2556
2640
|
if (!skillsGroupId) {
|
|
2557
2641
|
throw new Error(`Eval run ${evalData.evalRun.id} has no skillsGroupId`);
|
|
2558
2642
|
}
|
|
2559
|
-
const
|
|
2560
|
-
const
|
|
2561
|
-
const adapter = getAdapter(runCommand);
|
|
2643
|
+
const identifier = isSDK ? agent.id : agent?.runCommand ?? DEFAULT_AGENT_COMMAND;
|
|
2644
|
+
const adapter = getAdapter(identifier);
|
|
2562
2645
|
const startedAt = (/* @__PURE__ */ new Date()).toISOString();
|
|
2563
2646
|
const beforeSnapshot = workDir ? snapshotDirectory(workDir) : {};
|
|
2647
|
+
const targetId = skillsGroupId ?? agent?.id ?? evalData.evalRun.id;
|
|
2648
|
+
const targetName = evalData.skillsGroupName || agent?.name || "";
|
|
2564
2649
|
const executionContext = {
|
|
2565
2650
|
skills: evalData.skills,
|
|
2566
2651
|
scenario,
|
|
@@ -2572,8 +2657,8 @@ async function runAgentWithContext(config, evalRunId2, scenario, evalData, workD
|
|
|
2572
2657
|
evalRunId: evalRunId2,
|
|
2573
2658
|
scenarioId: scenario.id,
|
|
2574
2659
|
scenarioName: scenario.name,
|
|
2575
|
-
targetId
|
|
2576
|
-
targetName
|
|
2660
|
+
targetId,
|
|
2661
|
+
targetName,
|
|
2577
2662
|
tracePushUrl: config.tracePushUrl,
|
|
2578
2663
|
routeHeader: config.routeHeader,
|
|
2579
2664
|
authToken: config.authToken
|
|
@@ -2590,8 +2675,8 @@ async function runAgentWithContext(config, evalRunId2, scenario, evalData, workD
|
|
|
2590
2675
|
const templateFiles = workDir ? extractTemplateFiles(beforeSnapshot, afterSnapshot) : void 0;
|
|
2591
2676
|
return {
|
|
2592
2677
|
id: (0, import_crypto2.randomUUID)(),
|
|
2593
|
-
targetId
|
|
2594
|
-
targetName
|
|
2678
|
+
targetId,
|
|
2679
|
+
targetName,
|
|
2595
2680
|
scenarioId: scenario.id,
|
|
2596
2681
|
scenarioName: scenario.name,
|
|
2597
2682
|
modelConfig: agent?.modelConfig,
|
|
@@ -2607,11 +2692,11 @@ async function runAgentWithContext(config, evalRunId2, scenario, evalData, workD
|
|
|
2607
2692
|
|
|
2608
2693
|
// src/run-scenario/index.ts
|
|
2609
2694
|
async function runScenario(config, evalRunId2, scenario, evalData, template, resolvedAssertions) {
|
|
2610
|
-
const
|
|
2695
|
+
const targetId = evalData.evalRun.skillsGroupId ?? evalData.agent?.id ?? evalData.evalRun.id;
|
|
2611
2696
|
const workDir = await prepareWorkingDirectory(
|
|
2612
2697
|
config,
|
|
2613
2698
|
evalRunId2,
|
|
2614
|
-
|
|
2699
|
+
targetId,
|
|
2615
2700
|
scenario.id,
|
|
2616
2701
|
template
|
|
2617
2702
|
);
|
|
@@ -2639,7 +2724,8 @@ async function runScenario(config, evalRunId2, scenario, evalData, template, res
|
|
|
2639
2724
|
})),
|
|
2640
2725
|
durationMs: partialResult.duration
|
|
2641
2726
|
};
|
|
2642
|
-
const { "x-wix-ai-gateway-stream":
|
|
2727
|
+
const { "x-wix-ai-gateway-stream": _ignored, ...judgeHeaders } = config.aiGatewayHeaders;
|
|
2728
|
+
void _ignored;
|
|
2643
2729
|
const defaultJudgeModel = import_evalforge_types6.DEFAULT_JUDGE_MODEL;
|
|
2644
2730
|
const assertionContext = {
|
|
2645
2731
|
workDir,
|
|
@@ -2816,7 +2902,7 @@ async function runEvaluation(projectId2, evalRunId2) {
|
|
|
2816
2902
|
scenarioItemCount: evalData.scenarioItems.length,
|
|
2817
2903
|
scenarios: evalData.scenarioItems.map((s) => s.scenario.name),
|
|
2818
2904
|
skillsCount: evalData.skills.length,
|
|
2819
|
-
|
|
2905
|
+
hasAgent: !!evalData.agent,
|
|
2820
2906
|
timestamp: Date.now()
|
|
2821
2907
|
})
|
|
2822
2908
|
);
|
|
@@ -2834,14 +2920,14 @@ async function runEvaluation(projectId2, evalRunId2) {
|
|
|
2834
2920
|
`[${ExecutionPhase.FETCH_EVAL_RUN}] Failed to fetch evaluation data: ${errorMsg}`
|
|
2835
2921
|
);
|
|
2836
2922
|
}
|
|
2837
|
-
const {
|
|
2923
|
+
const { agent, skills, scenarioItems } = evalData;
|
|
2838
2924
|
state.currentPhase = ExecutionPhase.VALIDATION;
|
|
2839
2925
|
state.currentContext = {
|
|
2840
2926
|
projectId: projectId2,
|
|
2841
2927
|
evalRunId: evalRunId2,
|
|
2842
2928
|
scenarioCount: scenarioItems.length,
|
|
2843
2929
|
skillCount: skills.length,
|
|
2844
|
-
hasAgent: !!
|
|
2930
|
+
hasAgent: !!agent,
|
|
2845
2931
|
agentId: evalData.evalRun.agentId,
|
|
2846
2932
|
skillsGroupId: evalData.evalRun.skillsGroupId
|
|
2847
2933
|
};
|
|
@@ -2850,9 +2936,9 @@ async function runEvaluation(projectId2, evalRunId2) {
|
|
|
2850
2936
|
`[${ExecutionPhase.VALIDATION}] Eval run has no skills: set skillsGroupId and ensure the group has skills. (skillsGroupId: ${evalData.evalRun.skillsGroupId || "not set"})`
|
|
2851
2937
|
);
|
|
2852
2938
|
}
|
|
2853
|
-
if (scenarioItems.length > 0 && skills.length > 0 && !
|
|
2939
|
+
if (scenarioItems.length > 0 && skills.length > 0 && !agent) {
|
|
2854
2940
|
throw new Error(
|
|
2855
|
-
`[${ExecutionPhase.VALIDATION}] Eval run has no
|
|
2941
|
+
`[${ExecutionPhase.VALIDATION}] Eval run has no agent: set agentId for skill-based runs. (agentId: ${evalData.evalRun.agentId || "not set"})`
|
|
2856
2942
|
);
|
|
2857
2943
|
}
|
|
2858
2944
|
let completedScenarios = 0;
|
|
@@ -2866,8 +2952,8 @@ async function runEvaluation(projectId2, evalRunId2) {
|
|
|
2866
2952
|
scenarioName: scenario.name,
|
|
2867
2953
|
skillsGroupId: evalData.evalRun.skillsGroupId,
|
|
2868
2954
|
skillsGroupName: evalData.skillsGroupName,
|
|
2869
|
-
agentId:
|
|
2870
|
-
agentName:
|
|
2955
|
+
agentId: agent?.id,
|
|
2956
|
+
agentName: agent?.name,
|
|
2871
2957
|
progress: `${completedScenarios + 1}/${totalScenarios}`
|
|
2872
2958
|
};
|
|
2873
2959
|
const skillNames = evalData.skills.map((s) => s.name).join(", ");
|
|
@@ -2875,7 +2961,7 @@ async function runEvaluation(projectId2, evalRunId2) {
|
|
|
2875
2961
|
"[Evaluator] Running scenario with skills group:",
|
|
2876
2962
|
evalData.skillsGroupName,
|
|
2877
2963
|
skillNames ? `(${skillNames})` : "",
|
|
2878
|
-
|
|
2964
|
+
agent ? `with agent: ${agent.name}` : "",
|
|
2879
2965
|
`(${completedScenarios + 1}/${totalScenarios})`
|
|
2880
2966
|
);
|
|
2881
2967
|
try {
|