@wix/evalforge-evaluator 0.98.0 → 0.100.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/index.js +237 -151
- package/build/index.js.map +4 -4
- package/build/index.mjs +233 -147
- package/build/index.mjs.map +4 -4
- package/build/types/fetch-evaluation-data.d.ts +2 -2
- package/build/types/resolve-placeholders.d.ts +26 -0
- package/build/types/run-scenario/agents/claude-code/resolve-mcp-placeholders.d.ts +31 -0
- package/build/types/run-scenario/agents/claude-code/write-mcp.d.ts +3 -0
- package/build/types/run-scenario/agents/claude-code/write-skills.d.ts +3 -1
- package/build/types/run-scenario/agents/registry.d.ts +32 -63
- package/build/types/run-scenario/index.d.ts +1 -1
- package/build/types/run-scenario/run-agent-with-context.d.ts +3 -3
- package/build/types/run-scenario/utils/write-files.d.ts +6 -0
- package/package.json +4 -4
package/build/index.mjs
CHANGED
|
@@ -187,6 +187,53 @@ import {
|
|
|
187
187
|
isSystemAssertionId,
|
|
188
188
|
SYSTEM_ASSERTIONS
|
|
189
189
|
} from "@wix/evalforge-types";
|
|
190
|
+
|
|
191
|
+
// src/resolve-placeholders.ts
|
|
192
|
+
var PLACEHOLDER_PATTERN = /\{\{([^}]+)\}\}/g;
|
|
193
|
+
function findPlaceholders(value) {
|
|
194
|
+
const keys = /* @__PURE__ */ new Set();
|
|
195
|
+
collectPlaceholders(value, keys);
|
|
196
|
+
return [...keys];
|
|
197
|
+
}
|
|
198
|
+
function collectPlaceholders(value, keys) {
|
|
199
|
+
if (typeof value === "string") {
|
|
200
|
+
for (const match of value.matchAll(PLACEHOLDER_PATTERN)) {
|
|
201
|
+
keys.add(match[1].trim());
|
|
202
|
+
}
|
|
203
|
+
} else if (Array.isArray(value)) {
|
|
204
|
+
for (const item of value) {
|
|
205
|
+
collectPlaceholders(item, keys);
|
|
206
|
+
}
|
|
207
|
+
} else if (typeof value === "object" && value !== null) {
|
|
208
|
+
for (const val of Object.values(value)) {
|
|
209
|
+
collectPlaceholders(val, keys);
|
|
210
|
+
}
|
|
211
|
+
}
|
|
212
|
+
}
|
|
213
|
+
function resolveValue(value, placeholders) {
|
|
214
|
+
if (typeof value === "string") {
|
|
215
|
+
return value.replace(PLACEHOLDER_PATTERN, (match, key) => {
|
|
216
|
+
const trimmed = key.trim();
|
|
217
|
+
return trimmed in placeholders ? placeholders[trimmed] : match;
|
|
218
|
+
});
|
|
219
|
+
}
|
|
220
|
+
if (Array.isArray(value)) {
|
|
221
|
+
return value.map((item) => resolveValue(item, placeholders));
|
|
222
|
+
}
|
|
223
|
+
if (typeof value === "object" && value !== null) {
|
|
224
|
+
const result = {};
|
|
225
|
+
for (const [k, v] of Object.entries(value)) {
|
|
226
|
+
result[k] = resolveValue(v, placeholders);
|
|
227
|
+
}
|
|
228
|
+
return result;
|
|
229
|
+
}
|
|
230
|
+
return value;
|
|
231
|
+
}
|
|
232
|
+
function resolvePlaceholdersInString(text, placeholders) {
|
|
233
|
+
return resolveValue(text, placeholders);
|
|
234
|
+
}
|
|
235
|
+
|
|
236
|
+
// src/fetch-evaluation-data.ts
|
|
190
237
|
function parseSkillNamesFromParams(value) {
|
|
191
238
|
if (typeof value !== "string") {
|
|
192
239
|
return [];
|
|
@@ -202,13 +249,11 @@ function applyParamsToAssertion(assertion, params) {
|
|
|
202
249
|
return assertion;
|
|
203
250
|
}
|
|
204
251
|
if (assertion.type === "llm_judge") {
|
|
205
|
-
|
|
252
|
+
const stringParams = {};
|
|
206
253
|
for (const [key, value] of Object.entries(params)) {
|
|
207
|
-
|
|
208
|
-
const escapedPlaceholder = placeholder.replace(/[{}]/g, "\\$&");
|
|
209
|
-
const replacement = String(value ?? "");
|
|
210
|
-
prompt = prompt.replace(new RegExp(escapedPlaceholder, "g"), replacement);
|
|
254
|
+
stringParams[key] = String(value ?? "");
|
|
211
255
|
}
|
|
256
|
+
const prompt = resolvePlaceholdersInString(assertion.prompt, stringParams);
|
|
212
257
|
return {
|
|
213
258
|
...assertion,
|
|
214
259
|
prompt,
|
|
@@ -317,9 +362,9 @@ async function fetchEvaluationData(api, projectId2, evalRunId2) {
|
|
|
317
362
|
const scenarios = await Promise.all(
|
|
318
363
|
evalRun.scenarioIds.map((id) => api.getScenario(projectId2, id))
|
|
319
364
|
);
|
|
320
|
-
let
|
|
365
|
+
let agent = null;
|
|
321
366
|
if (evalRun.agentId) {
|
|
322
|
-
|
|
367
|
+
agent = await api.getAgent(projectId2, evalRun.agentId);
|
|
323
368
|
}
|
|
324
369
|
let skills = [];
|
|
325
370
|
let skillsGroup = null;
|
|
@@ -420,7 +465,7 @@ async function fetchEvaluationData(api, projectId2, evalRunId2) {
|
|
|
420
465
|
const skillsGroupName = skillsGroup?.name ?? "";
|
|
421
466
|
return {
|
|
422
467
|
evalRun,
|
|
423
|
-
|
|
468
|
+
agent,
|
|
424
469
|
skills,
|
|
425
470
|
skillsGroup,
|
|
426
471
|
skillsGroupName,
|
|
@@ -444,56 +489,18 @@ import {
|
|
|
444
489
|
import { mkdirSync, existsSync, rmSync, readFileSync, writeFileSync } from "fs";
|
|
445
490
|
import { tmpdir } from "os";
|
|
446
491
|
import path from "path";
|
|
447
|
-
import { fetchGitHubFolder
|
|
492
|
+
import { fetchGitHubFolder } from "@wix/evalforge-github-client";
|
|
448
493
|
|
|
449
|
-
// src/run-scenario/
|
|
494
|
+
// src/run-scenario/utils/write-files.ts
|
|
450
495
|
import { mkdir, writeFile } from "fs/promises";
|
|
451
|
-
import { dirname,
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
await Promise.all(
|
|
455
|
-
skills.map((skill) => writeSkillToFilesystem(cwd, skill, fetchFn))
|
|
456
|
-
);
|
|
457
|
-
}
|
|
458
|
-
async function writeSkillToFilesystem(cwd, skill, fetchFn = fetchGitHubFolder) {
|
|
459
|
-
const skillName = skill.name;
|
|
460
|
-
const skillDir = join(cwd, ".claude", "skills", skillName);
|
|
461
|
-
await mkdir(skillDir, { recursive: true });
|
|
462
|
-
const version = skill.latestVersion;
|
|
463
|
-
if (version?.files && version.files.length > 0) {
|
|
464
|
-
await writeSkillFiles(skillDir, version.files);
|
|
465
|
-
console.log(
|
|
466
|
-
`[Skill] ${skillName}: wrote ${version.files.length} file(s) from snapshot`
|
|
467
|
-
);
|
|
468
|
-
} else if (skill.source) {
|
|
469
|
-
try {
|
|
470
|
-
const files = await fetchFn(skill.source, {
|
|
471
|
-
userAgent: "EvalForge-Evaluator"
|
|
472
|
-
});
|
|
473
|
-
await writeSkillFiles(skillDir, files);
|
|
474
|
-
console.log(
|
|
475
|
-
`[Skill] ${skillName}: wrote ${files.length} file(s) from GitHub (live)`
|
|
476
|
-
);
|
|
477
|
-
} catch (error) {
|
|
478
|
-
const message = error instanceof Error ? error.message : "Unknown error";
|
|
479
|
-
console.error(
|
|
480
|
-
`[Skill] ${skillName}: GitHub fetch failed: ${message}, no files to fall back to`
|
|
481
|
-
);
|
|
482
|
-
throw new Error(
|
|
483
|
-
`Failed to write skill ${skillName} to filesystem: ${message}`
|
|
484
|
-
);
|
|
485
|
-
}
|
|
486
|
-
} else {
|
|
487
|
-
throw new Error(`Skill ${skillName} has no files and no source configured`);
|
|
488
|
-
}
|
|
489
|
-
}
|
|
490
|
-
async function writeSkillFiles(skillDir, files) {
|
|
491
|
-
const resolvedBase = resolve(skillDir);
|
|
496
|
+
import { dirname, resolve, sep } from "path";
|
|
497
|
+
async function writeFilesToDirectory(targetDir, files) {
|
|
498
|
+
const resolvedBase = resolve(targetDir);
|
|
492
499
|
for (const file of files) {
|
|
493
|
-
const filePath = resolve(
|
|
500
|
+
const filePath = resolve(targetDir, file.path);
|
|
494
501
|
if (!filePath.startsWith(resolvedBase + sep) && filePath !== resolvedBase) {
|
|
495
502
|
throw new Error(
|
|
496
|
-
`Path traversal detected
|
|
503
|
+
`Path traversal detected: "${file.path}" resolves outside target directory`
|
|
497
504
|
);
|
|
498
505
|
}
|
|
499
506
|
await mkdir(dirname(filePath), { recursive: true });
|
|
@@ -509,10 +516,10 @@ async function fetchAndWriteTemplateFiles(template, workDir) {
|
|
|
509
516
|
);
|
|
510
517
|
return;
|
|
511
518
|
}
|
|
512
|
-
const files = await
|
|
519
|
+
const files = await fetchGitHubFolder(template.source, {
|
|
513
520
|
userAgent: "EvalForge-Evaluator"
|
|
514
521
|
});
|
|
515
|
-
await
|
|
522
|
+
await writeFilesToDirectory(workDir, files);
|
|
516
523
|
}
|
|
517
524
|
function writeWixEnvFile(workDir) {
|
|
518
525
|
const configPath = path.join(workDir, "wix.config.json");
|
|
@@ -566,86 +573,76 @@ import { randomUUID as randomUUID2 } from "crypto";
|
|
|
566
573
|
|
|
567
574
|
// src/run-scenario/agents/registry.ts
|
|
568
575
|
var AgentAdapterRegistry = class {
|
|
569
|
-
/**
|
|
570
|
-
|
|
571
|
-
|
|
572
|
-
|
|
573
|
-
|
|
574
|
-
/**
|
|
575
|
-
* Set of all registered adapter instances (for getAll).
|
|
576
|
-
*/
|
|
576
|
+
/** Map of CLI commands to their registered adapters. */
|
|
577
|
+
commandMap = /* @__PURE__ */ new Map();
|
|
578
|
+
/** Map of adapter IDs to their registered adapters. */
|
|
579
|
+
idMap = /* @__PURE__ */ new Map();
|
|
580
|
+
/** Set of all registered adapter instances (for getAll). */
|
|
577
581
|
registeredAdapters = /* @__PURE__ */ new Set();
|
|
578
582
|
/**
|
|
579
583
|
* Register an agent adapter.
|
|
580
584
|
*
|
|
581
|
-
* The adapter
|
|
582
|
-
* If a command is already registered, it will be overwritten with a warning.
|
|
583
|
-
*
|
|
584
|
-
* @param adapter - The adapter to register
|
|
585
|
+
* The adapter is registered by its ID and for all commands in its supportedCommands array.
|
|
586
|
+
* If a command or ID is already registered, it will be overwritten with a warning.
|
|
585
587
|
*/
|
|
586
588
|
register(adapter) {
|
|
587
589
|
this.registeredAdapters.add(adapter);
|
|
590
|
+
this.idMap.set(adapter.id, adapter);
|
|
588
591
|
for (const command of adapter.supportedCommands) {
|
|
589
|
-
if (this.
|
|
590
|
-
const existing = this.
|
|
592
|
+
if (this.commandMap.has(command)) {
|
|
593
|
+
const existing = this.commandMap.get(command);
|
|
591
594
|
console.warn(
|
|
592
595
|
`[AgentAdapterRegistry] Command "${command}" already registered by adapter "${existing.id}". Overwriting with adapter "${adapter.id}".`
|
|
593
596
|
);
|
|
594
597
|
}
|
|
595
|
-
this.
|
|
598
|
+
this.commandMap.set(command, adapter);
|
|
596
599
|
}
|
|
597
600
|
}
|
|
598
|
-
/**
|
|
599
|
-
|
|
600
|
-
|
|
601
|
-
* @param runCommand - The run command to look up
|
|
602
|
-
* @returns The registered adapter, or undefined if not found
|
|
603
|
-
*/
|
|
604
|
-
get(runCommand) {
|
|
605
|
-
return this.adapters.get(runCommand);
|
|
601
|
+
/** Get an adapter by CLI command. */
|
|
602
|
+
getByCommand(command) {
|
|
603
|
+
return this.commandMap.get(command);
|
|
606
604
|
}
|
|
607
|
-
/**
|
|
608
|
-
|
|
609
|
-
|
|
610
|
-
* @param runCommand - The run command to check
|
|
611
|
-
* @returns True if an adapter is registered for this command
|
|
612
|
-
*/
|
|
613
|
-
has(runCommand) {
|
|
614
|
-
return this.adapters.has(runCommand);
|
|
605
|
+
/** Get an adapter by adapter ID. */
|
|
606
|
+
getById(adapterId) {
|
|
607
|
+
return this.idMap.get(adapterId);
|
|
615
608
|
}
|
|
616
609
|
/**
|
|
617
|
-
*
|
|
618
|
-
*
|
|
619
|
-
* @returns Array of all unique registered adapters
|
|
610
|
+
* Unified lookup: tries CLI command first, then adapter ID.
|
|
611
|
+
* Use this when the identifier could be either a command or an adapter ID.
|
|
620
612
|
*/
|
|
613
|
+
resolve(identifier) {
|
|
614
|
+
return this.commandMap.get(identifier) ?? this.idMap.get(identifier);
|
|
615
|
+
}
|
|
616
|
+
/** Check if a command or adapter ID has a registered adapter. */
|
|
617
|
+
has(identifier) {
|
|
618
|
+
return this.commandMap.has(identifier) || this.idMap.has(identifier);
|
|
619
|
+
}
|
|
620
|
+
/** Get all registered adapters. */
|
|
621
621
|
getAll() {
|
|
622
622
|
return Array.from(this.registeredAdapters);
|
|
623
623
|
}
|
|
624
|
-
/**
|
|
625
|
-
* Get all supported commands.
|
|
626
|
-
*
|
|
627
|
-
* @returns Array of all registered run commands
|
|
628
|
-
*/
|
|
624
|
+
/** Get all supported CLI commands. */
|
|
629
625
|
getSupportedCommands() {
|
|
630
|
-
return Array.from(this.
|
|
626
|
+
return Array.from(this.commandMap.keys());
|
|
627
|
+
}
|
|
628
|
+
/** Get all registered adapter IDs. */
|
|
629
|
+
getAdapterIds() {
|
|
630
|
+
return Array.from(this.idMap.keys());
|
|
631
631
|
}
|
|
632
632
|
/**
|
|
633
633
|
* Unregister an adapter by its ID.
|
|
634
|
-
*
|
|
635
634
|
* Removes the adapter and all its command mappings.
|
|
636
|
-
*
|
|
637
|
-
* @param adapterId - The ID of the adapter to remove
|
|
638
|
-
* @returns True if the adapter was found and removed
|
|
639
635
|
*/
|
|
640
636
|
unregister(adapterId) {
|
|
641
637
|
let found = false;
|
|
642
638
|
for (const adapter of this.registeredAdapters) {
|
|
643
639
|
if (adapter.id === adapterId) {
|
|
644
640
|
this.registeredAdapters.delete(adapter);
|
|
641
|
+
this.idMap.delete(adapterId);
|
|
645
642
|
found = true;
|
|
646
643
|
for (const command of adapter.supportedCommands) {
|
|
647
|
-
if (this.
|
|
648
|
-
this.
|
|
644
|
+
if (this.commandMap.get(command) === adapter) {
|
|
645
|
+
this.commandMap.delete(command);
|
|
649
646
|
}
|
|
650
647
|
}
|
|
651
648
|
break;
|
|
@@ -653,22 +650,21 @@ var AgentAdapterRegistry = class {
|
|
|
653
650
|
}
|
|
654
651
|
return found;
|
|
655
652
|
}
|
|
656
|
-
/**
|
|
657
|
-
* Clear all registered adapters.
|
|
658
|
-
* Primarily useful for testing.
|
|
659
|
-
*/
|
|
653
|
+
/** Clear all registered adapters. Primarily useful for testing. */
|
|
660
654
|
clear() {
|
|
661
|
-
this.
|
|
655
|
+
this.commandMap.clear();
|
|
656
|
+
this.idMap.clear();
|
|
662
657
|
this.registeredAdapters.clear();
|
|
663
658
|
}
|
|
664
659
|
};
|
|
665
660
|
var defaultRegistry = new AgentAdapterRegistry();
|
|
666
|
-
function getAdapter(
|
|
667
|
-
const adapter = defaultRegistry.
|
|
661
|
+
function getAdapter(identifier) {
|
|
662
|
+
const adapter = defaultRegistry.resolve(identifier);
|
|
668
663
|
if (!adapter) {
|
|
669
|
-
const
|
|
664
|
+
const commands = defaultRegistry.getSupportedCommands();
|
|
665
|
+
const ids = defaultRegistry.getAdapterIds();
|
|
670
666
|
throw new Error(
|
|
671
|
-
`No agent adapter registered for
|
|
667
|
+
`No agent adapter registered for "${identifier}". Supported commands: ${commands.length > 0 ? commands.join(", ") : "(none)"}. Registered adapters: ${ids.length > 0 ? ids.join(", ") : "(none)"}`
|
|
672
668
|
);
|
|
673
669
|
}
|
|
674
670
|
return adapter;
|
|
@@ -685,12 +681,97 @@ import {
|
|
|
685
681
|
LiveTraceEventType,
|
|
686
682
|
TRACE_EVENT_PREFIX
|
|
687
683
|
} from "@wix/evalforge-types";
|
|
684
|
+
|
|
685
|
+
// src/run-scenario/agents/claude-code/write-skills.ts
|
|
686
|
+
import { mkdir as mkdir2 } from "fs/promises";
|
|
687
|
+
import { join } from "path";
|
|
688
|
+
import { fetchGitHubFolder as fetchGitHubFolder2 } from "@wix/evalforge-github-client";
|
|
689
|
+
async function writeSkillsToFilesystem(cwd, skills, fetchFn = fetchGitHubFolder2) {
|
|
690
|
+
await Promise.all(
|
|
691
|
+
skills.map((skill) => writeSkillToFilesystem(cwd, skill, fetchFn))
|
|
692
|
+
);
|
|
693
|
+
}
|
|
694
|
+
async function writeSkillToFilesystem(cwd, skill, fetchFn = fetchGitHubFolder2) {
|
|
695
|
+
const skillName = skill.name;
|
|
696
|
+
const skillDir = join(cwd, ".claude", "skills", skillName);
|
|
697
|
+
await mkdir2(skillDir, { recursive: true });
|
|
698
|
+
const version = skill.latestVersion;
|
|
699
|
+
if (version?.files && version.files.length > 0) {
|
|
700
|
+
await writeFilesToDirectory(skillDir, version.files);
|
|
701
|
+
console.log(
|
|
702
|
+
`[Skill] ${skillName}: wrote ${version.files.length} file(s) from snapshot`
|
|
703
|
+
);
|
|
704
|
+
} else if (skill.source) {
|
|
705
|
+
try {
|
|
706
|
+
const files = await fetchFn(skill.source, {
|
|
707
|
+
userAgent: "EvalForge-Evaluator"
|
|
708
|
+
});
|
|
709
|
+
await writeFilesToDirectory(skillDir, files);
|
|
710
|
+
console.log(
|
|
711
|
+
`[Skill] ${skillName}: wrote ${files.length} file(s) from GitHub (live)`
|
|
712
|
+
);
|
|
713
|
+
} catch (error) {
|
|
714
|
+
const message = error instanceof Error ? error.message : "Unknown error";
|
|
715
|
+
console.error(
|
|
716
|
+
`[Skill] ${skillName}: GitHub fetch failed: ${message}, no files to fall back to`
|
|
717
|
+
);
|
|
718
|
+
throw new Error(
|
|
719
|
+
`Failed to write skill ${skillName} to filesystem: ${message}`
|
|
720
|
+
);
|
|
721
|
+
}
|
|
722
|
+
} else {
|
|
723
|
+
throw new Error(`Skill ${skillName} has no files and no source configured`);
|
|
724
|
+
}
|
|
725
|
+
}
|
|
726
|
+
|
|
727
|
+
// src/run-scenario/agents/claude-code/execute.ts
|
|
688
728
|
import { randomUUID } from "crypto";
|
|
689
729
|
|
|
690
730
|
// src/run-scenario/agents/claude-code/write-mcp.ts
|
|
691
731
|
import { writeFile as writeFile2 } from "fs/promises";
|
|
692
|
-
import { join as
|
|
732
|
+
import { join as join3 } from "path";
|
|
693
733
|
import { MCP_SERVERS_JSON_KEY } from "@wix/evalforge-types";
|
|
734
|
+
|
|
735
|
+
// src/run-scenario/agents/claude-code/resolve-mcp-placeholders.ts
|
|
736
|
+
import { readFile } from "fs/promises";
|
|
737
|
+
import { join as join2 } from "path";
|
|
738
|
+
import { homedir } from "os";
|
|
739
|
+
var WIX_AUTH_FILE = join2(homedir(), ".wix", "auth", "api-key.json");
|
|
740
|
+
async function loadWixAuthPlaceholders(authFilePath = WIX_AUTH_FILE) {
|
|
741
|
+
try {
|
|
742
|
+
const content = await readFile(authFilePath, "utf-8");
|
|
743
|
+
const auth = JSON.parse(content);
|
|
744
|
+
if (!auth.token || !auth.userInfo?.userId) {
|
|
745
|
+
return {};
|
|
746
|
+
}
|
|
747
|
+
return {
|
|
748
|
+
"wix-auth-token": auth.token,
|
|
749
|
+
"wix-auth-user-id": auth.userInfo.userId
|
|
750
|
+
};
|
|
751
|
+
} catch (err) {
|
|
752
|
+
console.warn(
|
|
753
|
+
`[MCP] Could not load Wix auth file: ${err.message}`
|
|
754
|
+
);
|
|
755
|
+
return {};
|
|
756
|
+
}
|
|
757
|
+
}
|
|
758
|
+
async function resolveMcpPlaceholders(mcpServers, authFilePath) {
|
|
759
|
+
const needed = findPlaceholders(mcpServers);
|
|
760
|
+
if (needed.length === 0) {
|
|
761
|
+
return mcpServers;
|
|
762
|
+
}
|
|
763
|
+
const placeholders = await loadWixAuthPlaceholders(authFilePath);
|
|
764
|
+
const unresolved = needed.filter((key) => !(key in placeholders));
|
|
765
|
+
if (unresolved.length > 0) {
|
|
766
|
+
throw new Error(
|
|
767
|
+
`MCP config contains unresolvable placeholders: ${unresolved.map((k) => `{{${k}}}`).join(", ")}. Ensure ~/.wix/auth/api-key.json exists (run \`npx @wix/cli login\`).`
|
|
768
|
+
);
|
|
769
|
+
}
|
|
770
|
+
console.log(`[MCP] Resolved ${needed.length} placeholder(s)`);
|
|
771
|
+
return resolveValue(mcpServers, placeholders);
|
|
772
|
+
}
|
|
773
|
+
|
|
774
|
+
// src/run-scenario/agents/claude-code/write-mcp.ts
|
|
694
775
|
async function writeMcpToFilesystem(cwd, mcps) {
|
|
695
776
|
if (mcps.length === 0) return;
|
|
696
777
|
const mcpServers = {};
|
|
@@ -705,19 +786,20 @@ async function writeMcpToFilesystem(cwd, mcps) {
|
|
|
705
786
|
mcpServers[key] = value;
|
|
706
787
|
}
|
|
707
788
|
}
|
|
789
|
+
const resolvedServers = await resolveMcpPlaceholders(mcpServers);
|
|
708
790
|
const content = JSON.stringify(
|
|
709
|
-
{ [MCP_SERVERS_JSON_KEY]:
|
|
791
|
+
{ [MCP_SERVERS_JSON_KEY]: resolvedServers },
|
|
710
792
|
null,
|
|
711
793
|
2
|
|
712
794
|
);
|
|
713
|
-
const filePath =
|
|
795
|
+
const filePath = join3(cwd, ".mcp.json");
|
|
714
796
|
await writeFile2(filePath, content, "utf8");
|
|
715
797
|
console.log(`[MCP] Written to ${filePath}`);
|
|
716
798
|
}
|
|
717
799
|
|
|
718
800
|
// src/run-scenario/agents/claude-code/write-sub-agents.ts
|
|
719
|
-
import { mkdir as
|
|
720
|
-
import { join as
|
|
801
|
+
import { mkdir as mkdir3, writeFile as writeFile3 } from "fs/promises";
|
|
802
|
+
import { join as join4 } from "path";
|
|
721
803
|
var AGENTS_DIR = ".claude/agents";
|
|
722
804
|
function toAgentFilename(name, index, nameCount) {
|
|
723
805
|
const base = (name || "").toLowerCase().replace(/\s+/g, "-").replace(/[^a-z0-9-]/g, "").replace(/^-+|-+$/g, "") || `sub-agent-${index}`;
|
|
@@ -727,20 +809,20 @@ function toAgentFilename(name, index, nameCount) {
|
|
|
727
809
|
}
|
|
728
810
|
async function writeSubAgentsToFilesystem(cwd, subAgents) {
|
|
729
811
|
if (subAgents.length === 0) return;
|
|
730
|
-
const agentsDir =
|
|
731
|
-
await
|
|
812
|
+
const agentsDir = join4(cwd, AGENTS_DIR);
|
|
813
|
+
await mkdir3(agentsDir, { recursive: true });
|
|
732
814
|
const nameCount = /* @__PURE__ */ new Map();
|
|
733
815
|
for (const [i, agent] of subAgents.entries()) {
|
|
734
816
|
const filename = toAgentFilename(agent.name, i, nameCount);
|
|
735
|
-
const filePath =
|
|
817
|
+
const filePath = join4(agentsDir, `${filename}.md`);
|
|
736
818
|
await writeFile3(filePath, agent.subAgentMd, "utf8");
|
|
737
819
|
}
|
|
738
820
|
console.log(`[SubAgents] Written to ${agentsDir}`);
|
|
739
821
|
}
|
|
740
822
|
|
|
741
823
|
// src/run-scenario/agents/claude-code/write-rules.ts
|
|
742
|
-
import { mkdir as
|
|
743
|
-
import { join as
|
|
824
|
+
import { mkdir as mkdir4, writeFile as writeFile4, readFile as readFile2 } from "fs/promises";
|
|
825
|
+
import { join as join5 } from "path";
|
|
744
826
|
var CURSOR_RULES_DIR = ".cursor/rules";
|
|
745
827
|
function toRuleFilename(name, index, nameCount) {
|
|
746
828
|
const base = (name || "").toLowerCase().replace(/\s+/g, "-").replace(/[^a-z0-9-]/g, "").replace(/^-+|-+$/g, "") || `rule-${index}`;
|
|
@@ -751,7 +833,7 @@ function toRuleFilename(name, index, nameCount) {
|
|
|
751
833
|
async function appendToFile(filePath, content) {
|
|
752
834
|
let existing = "";
|
|
753
835
|
try {
|
|
754
|
-
existing = await
|
|
836
|
+
existing = await readFile2(filePath, "utf8");
|
|
755
837
|
} catch {
|
|
756
838
|
}
|
|
757
839
|
const merged = existing ? `${existing.trimEnd()}
|
|
@@ -766,20 +848,20 @@ async function writeRulesToFilesystem(cwd, rules) {
|
|
|
766
848
|
for (const [i, rule] of rules.entries()) {
|
|
767
849
|
switch (rule.ruleType) {
|
|
768
850
|
case "claude-md": {
|
|
769
|
-
await appendToFile(
|
|
851
|
+
await appendToFile(join5(cwd, "CLAUDE.md"), rule.content);
|
|
770
852
|
break;
|
|
771
853
|
}
|
|
772
854
|
case "agents-md": {
|
|
773
|
-
await appendToFile(
|
|
855
|
+
await appendToFile(join5(cwd, "AGENTS.md"), rule.content);
|
|
774
856
|
break;
|
|
775
857
|
}
|
|
776
858
|
case "cursor-rule": {
|
|
777
859
|
if (!hasCursorRules) {
|
|
778
|
-
await
|
|
860
|
+
await mkdir4(join5(cwd, CURSOR_RULES_DIR), { recursive: true });
|
|
779
861
|
hasCursorRules = true;
|
|
780
862
|
}
|
|
781
863
|
const filename = toRuleFilename(rule.name, i, nameCount);
|
|
782
|
-
const filePath =
|
|
864
|
+
const filePath = join5(cwd, CURSOR_RULES_DIR, `${filename}.md`);
|
|
783
865
|
await writeFile4(filePath, rule.content, "utf8");
|
|
784
866
|
break;
|
|
785
867
|
}
|
|
@@ -1815,7 +1897,7 @@ defaultRegistry.register(claudeCodeAdapter);
|
|
|
1815
1897
|
|
|
1816
1898
|
// src/run-scenario/file-diff.ts
|
|
1817
1899
|
import { readdirSync, readFileSync as readFileSync2, statSync, existsSync as existsSync2 } from "fs";
|
|
1818
|
-
import { join as
|
|
1900
|
+
import { join as join7, relative } from "path";
|
|
1819
1901
|
|
|
1820
1902
|
// ../../node_modules/diff/lib/index.mjs
|
|
1821
1903
|
function Diff() {
|
|
@@ -1991,7 +2073,7 @@ Diff.prototype = {
|
|
|
1991
2073
|
tokenize: function tokenize(value) {
|
|
1992
2074
|
return Array.from(value);
|
|
1993
2075
|
},
|
|
1994
|
-
join: function
|
|
2076
|
+
join: function join6(chars) {
|
|
1995
2077
|
return chars.join("");
|
|
1996
2078
|
},
|
|
1997
2079
|
postProcess: function postProcess(changeObjects) {
|
|
@@ -2431,7 +2513,7 @@ function snapshotDirectory(dir, baseDir) {
|
|
|
2431
2513
|
}
|
|
2432
2514
|
const entries = readdirSync(dir, { withFileTypes: true });
|
|
2433
2515
|
for (const entry of entries) {
|
|
2434
|
-
const fullPath =
|
|
2516
|
+
const fullPath = join7(dir, entry.name);
|
|
2435
2517
|
const relativePath = relative(base, fullPath);
|
|
2436
2518
|
if (shouldIgnore(entry.name)) {
|
|
2437
2519
|
continue;
|
|
@@ -2540,18 +2622,21 @@ function extractTemplateFiles(before, after) {
|
|
|
2540
2622
|
}
|
|
2541
2623
|
|
|
2542
2624
|
// src/run-scenario/run-agent-with-context.ts
|
|
2543
|
-
import { AgentRunCommand as AgentRunCommand2 } from "@wix/evalforge-types";
|
|
2625
|
+
import { AgentRunCommand as AgentRunCommand2, AgentType } from "@wix/evalforge-types";
|
|
2544
2626
|
var DEFAULT_AGENT_COMMAND = AgentRunCommand2.CLAUDE;
|
|
2545
2627
|
async function runAgentWithContext(config, evalRunId2, scenario, evalData, workDir) {
|
|
2546
2628
|
const skillsGroupId = evalData.evalRun.skillsGroupId;
|
|
2629
|
+
const agent = evalData.agent ?? void 0;
|
|
2630
|
+
const isSDK = agent?.agentType === AgentType.SDK;
|
|
2547
2631
|
if (!skillsGroupId) {
|
|
2548
2632
|
throw new Error(`Eval run ${evalData.evalRun.id} has no skillsGroupId`);
|
|
2549
2633
|
}
|
|
2550
|
-
const
|
|
2551
|
-
const
|
|
2552
|
-
const adapter = getAdapter(runCommand);
|
|
2634
|
+
const identifier = isSDK ? agent.id : agent?.runCommand ?? DEFAULT_AGENT_COMMAND;
|
|
2635
|
+
const adapter = getAdapter(identifier);
|
|
2553
2636
|
const startedAt = (/* @__PURE__ */ new Date()).toISOString();
|
|
2554
2637
|
const beforeSnapshot = workDir ? snapshotDirectory(workDir) : {};
|
|
2638
|
+
const targetId = skillsGroupId ?? agent?.id ?? evalData.evalRun.id;
|
|
2639
|
+
const targetName = evalData.skillsGroupName || agent?.name || "";
|
|
2555
2640
|
const executionContext = {
|
|
2556
2641
|
skills: evalData.skills,
|
|
2557
2642
|
scenario,
|
|
@@ -2563,8 +2648,8 @@ async function runAgentWithContext(config, evalRunId2, scenario, evalData, workD
|
|
|
2563
2648
|
evalRunId: evalRunId2,
|
|
2564
2649
|
scenarioId: scenario.id,
|
|
2565
2650
|
scenarioName: scenario.name,
|
|
2566
|
-
targetId
|
|
2567
|
-
targetName
|
|
2651
|
+
targetId,
|
|
2652
|
+
targetName,
|
|
2568
2653
|
tracePushUrl: config.tracePushUrl,
|
|
2569
2654
|
routeHeader: config.routeHeader,
|
|
2570
2655
|
authToken: config.authToken
|
|
@@ -2581,8 +2666,8 @@ async function runAgentWithContext(config, evalRunId2, scenario, evalData, workD
|
|
|
2581
2666
|
const templateFiles = workDir ? extractTemplateFiles(beforeSnapshot, afterSnapshot) : void 0;
|
|
2582
2667
|
return {
|
|
2583
2668
|
id: randomUUID2(),
|
|
2584
|
-
targetId
|
|
2585
|
-
targetName
|
|
2669
|
+
targetId,
|
|
2670
|
+
targetName,
|
|
2586
2671
|
scenarioId: scenario.id,
|
|
2587
2672
|
scenarioName: scenario.name,
|
|
2588
2673
|
modelConfig: agent?.modelConfig,
|
|
@@ -2598,11 +2683,11 @@ async function runAgentWithContext(config, evalRunId2, scenario, evalData, workD
|
|
|
2598
2683
|
|
|
2599
2684
|
// src/run-scenario/index.ts
|
|
2600
2685
|
async function runScenario(config, evalRunId2, scenario, evalData, template, resolvedAssertions) {
|
|
2601
|
-
const
|
|
2686
|
+
const targetId = evalData.evalRun.skillsGroupId ?? evalData.agent?.id ?? evalData.evalRun.id;
|
|
2602
2687
|
const workDir = await prepareWorkingDirectory(
|
|
2603
2688
|
config,
|
|
2604
2689
|
evalRunId2,
|
|
2605
|
-
|
|
2690
|
+
targetId,
|
|
2606
2691
|
scenario.id,
|
|
2607
2692
|
template
|
|
2608
2693
|
);
|
|
@@ -2630,7 +2715,8 @@ async function runScenario(config, evalRunId2, scenario, evalData, template, res
|
|
|
2630
2715
|
})),
|
|
2631
2716
|
durationMs: partialResult.duration
|
|
2632
2717
|
};
|
|
2633
|
-
const { "x-wix-ai-gateway-stream":
|
|
2718
|
+
const { "x-wix-ai-gateway-stream": _ignored, ...judgeHeaders } = config.aiGatewayHeaders;
|
|
2719
|
+
void _ignored;
|
|
2634
2720
|
const defaultJudgeModel = DEFAULT_JUDGE_MODEL;
|
|
2635
2721
|
const assertionContext = {
|
|
2636
2722
|
workDir,
|
|
@@ -2807,7 +2893,7 @@ async function runEvaluation(projectId2, evalRunId2) {
|
|
|
2807
2893
|
scenarioItemCount: evalData.scenarioItems.length,
|
|
2808
2894
|
scenarios: evalData.scenarioItems.map((s) => s.scenario.name),
|
|
2809
2895
|
skillsCount: evalData.skills.length,
|
|
2810
|
-
|
|
2896
|
+
hasAgent: !!evalData.agent,
|
|
2811
2897
|
timestamp: Date.now()
|
|
2812
2898
|
})
|
|
2813
2899
|
);
|
|
@@ -2825,14 +2911,14 @@ async function runEvaluation(projectId2, evalRunId2) {
|
|
|
2825
2911
|
`[${ExecutionPhase.FETCH_EVAL_RUN}] Failed to fetch evaluation data: ${errorMsg}`
|
|
2826
2912
|
);
|
|
2827
2913
|
}
|
|
2828
|
-
const {
|
|
2914
|
+
const { agent, skills, scenarioItems } = evalData;
|
|
2829
2915
|
state.currentPhase = ExecutionPhase.VALIDATION;
|
|
2830
2916
|
state.currentContext = {
|
|
2831
2917
|
projectId: projectId2,
|
|
2832
2918
|
evalRunId: evalRunId2,
|
|
2833
2919
|
scenarioCount: scenarioItems.length,
|
|
2834
2920
|
skillCount: skills.length,
|
|
2835
|
-
hasAgent: !!
|
|
2921
|
+
hasAgent: !!agent,
|
|
2836
2922
|
agentId: evalData.evalRun.agentId,
|
|
2837
2923
|
skillsGroupId: evalData.evalRun.skillsGroupId
|
|
2838
2924
|
};
|
|
@@ -2841,9 +2927,9 @@ async function runEvaluation(projectId2, evalRunId2) {
|
|
|
2841
2927
|
`[${ExecutionPhase.VALIDATION}] Eval run has no skills: set skillsGroupId and ensure the group has skills. (skillsGroupId: ${evalData.evalRun.skillsGroupId || "not set"})`
|
|
2842
2928
|
);
|
|
2843
2929
|
}
|
|
2844
|
-
if (scenarioItems.length > 0 && skills.length > 0 && !
|
|
2930
|
+
if (scenarioItems.length > 0 && skills.length > 0 && !agent) {
|
|
2845
2931
|
throw new Error(
|
|
2846
|
-
`[${ExecutionPhase.VALIDATION}] Eval run has no
|
|
2932
|
+
`[${ExecutionPhase.VALIDATION}] Eval run has no agent: set agentId for skill-based runs. (agentId: ${evalData.evalRun.agentId || "not set"})`
|
|
2847
2933
|
);
|
|
2848
2934
|
}
|
|
2849
2935
|
let completedScenarios = 0;
|
|
@@ -2857,8 +2943,8 @@ async function runEvaluation(projectId2, evalRunId2) {
|
|
|
2857
2943
|
scenarioName: scenario.name,
|
|
2858
2944
|
skillsGroupId: evalData.evalRun.skillsGroupId,
|
|
2859
2945
|
skillsGroupName: evalData.skillsGroupName,
|
|
2860
|
-
agentId:
|
|
2861
|
-
agentName:
|
|
2946
|
+
agentId: agent?.id,
|
|
2947
|
+
agentName: agent?.name,
|
|
2862
2948
|
progress: `${completedScenarios + 1}/${totalScenarios}`
|
|
2863
2949
|
};
|
|
2864
2950
|
const skillNames = evalData.skills.map((s) => s.name).join(", ");
|
|
@@ -2866,7 +2952,7 @@ async function runEvaluation(projectId2, evalRunId2) {
|
|
|
2866
2952
|
"[Evaluator] Running scenario with skills group:",
|
|
2867
2953
|
evalData.skillsGroupName,
|
|
2868
2954
|
skillNames ? `(${skillNames})` : "",
|
|
2869
|
-
|
|
2955
|
+
agent ? `with agent: ${agent.name}` : "",
|
|
2870
2956
|
`(${completedScenarios + 1}/${totalScenarios})`
|
|
2871
2957
|
);
|
|
2872
2958
|
try {
|