@wix/evalforge-evaluator 0.99.0 → 0.100.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/index.js +147 -148
- package/build/index.js.map +4 -4
- package/build/index.mjs +127 -128
- package/build/index.mjs.map +4 -4
- package/build/types/fetch-evaluation-data.d.ts +2 -2
- package/build/types/run-scenario/agents/claude-code/write-skills.d.ts +3 -1
- package/build/types/run-scenario/agents/registry.d.ts +32 -63
- package/build/types/run-scenario/index.d.ts +1 -1
- package/build/types/run-scenario/run-agent-with-context.d.ts +3 -3
- package/build/types/run-scenario/utils/write-files.d.ts +6 -0
- package/package.json +4 -4
package/build/index.js
CHANGED
|
@@ -382,9 +382,9 @@ async function fetchEvaluationData(api, projectId2, evalRunId2) {
|
|
|
382
382
|
const scenarios = await Promise.all(
|
|
383
383
|
evalRun.scenarioIds.map((id) => api.getScenario(projectId2, id))
|
|
384
384
|
);
|
|
385
|
-
let
|
|
385
|
+
let agent = null;
|
|
386
386
|
if (evalRun.agentId) {
|
|
387
|
-
|
|
387
|
+
agent = await api.getAgent(projectId2, evalRun.agentId);
|
|
388
388
|
}
|
|
389
389
|
let skills = [];
|
|
390
390
|
let skillsGroup = null;
|
|
@@ -485,7 +485,7 @@ async function fetchEvaluationData(api, projectId2, evalRunId2) {
|
|
|
485
485
|
const skillsGroupName = skillsGroup?.name ?? "";
|
|
486
486
|
return {
|
|
487
487
|
evalRun,
|
|
488
|
-
|
|
488
|
+
agent,
|
|
489
489
|
skills,
|
|
490
490
|
skillsGroup,
|
|
491
491
|
skillsGroupName,
|
|
@@ -504,56 +504,18 @@ var import_eval_assertions = require("@wix/eval-assertions");
|
|
|
504
504
|
var import_fs = require("fs");
|
|
505
505
|
var import_os = require("os");
|
|
506
506
|
var import_path2 = __toESM(require("path"));
|
|
507
|
-
var
|
|
507
|
+
var import_evalforge_github_client = require("@wix/evalforge-github-client");
|
|
508
508
|
|
|
509
|
-
// src/run-scenario/
|
|
509
|
+
// src/run-scenario/utils/write-files.ts
|
|
510
510
|
var import_promises = require("fs/promises");
|
|
511
511
|
var import_path = require("path");
|
|
512
|
-
|
|
513
|
-
|
|
514
|
-
await Promise.all(
|
|
515
|
-
skills.map((skill) => writeSkillToFilesystem(cwd, skill, fetchFn))
|
|
516
|
-
);
|
|
517
|
-
}
|
|
518
|
-
async function writeSkillToFilesystem(cwd, skill, fetchFn = import_evalforge_github_client.fetchGitHubFolder) {
|
|
519
|
-
const skillName = skill.name;
|
|
520
|
-
const skillDir = (0, import_path.join)(cwd, ".claude", "skills", skillName);
|
|
521
|
-
await (0, import_promises.mkdir)(skillDir, { recursive: true });
|
|
522
|
-
const version = skill.latestVersion;
|
|
523
|
-
if (version?.files && version.files.length > 0) {
|
|
524
|
-
await writeSkillFiles(skillDir, version.files);
|
|
525
|
-
console.log(
|
|
526
|
-
`[Skill] ${skillName}: wrote ${version.files.length} file(s) from snapshot`
|
|
527
|
-
);
|
|
528
|
-
} else if (skill.source) {
|
|
529
|
-
try {
|
|
530
|
-
const files = await fetchFn(skill.source, {
|
|
531
|
-
userAgent: "EvalForge-Evaluator"
|
|
532
|
-
});
|
|
533
|
-
await writeSkillFiles(skillDir, files);
|
|
534
|
-
console.log(
|
|
535
|
-
`[Skill] ${skillName}: wrote ${files.length} file(s) from GitHub (live)`
|
|
536
|
-
);
|
|
537
|
-
} catch (error) {
|
|
538
|
-
const message = error instanceof Error ? error.message : "Unknown error";
|
|
539
|
-
console.error(
|
|
540
|
-
`[Skill] ${skillName}: GitHub fetch failed: ${message}, no files to fall back to`
|
|
541
|
-
);
|
|
542
|
-
throw new Error(
|
|
543
|
-
`Failed to write skill ${skillName} to filesystem: ${message}`
|
|
544
|
-
);
|
|
545
|
-
}
|
|
546
|
-
} else {
|
|
547
|
-
throw new Error(`Skill ${skillName} has no files and no source configured`);
|
|
548
|
-
}
|
|
549
|
-
}
|
|
550
|
-
async function writeSkillFiles(skillDir, files) {
|
|
551
|
-
const resolvedBase = (0, import_path.resolve)(skillDir);
|
|
512
|
+
async function writeFilesToDirectory(targetDir, files) {
|
|
513
|
+
const resolvedBase = (0, import_path.resolve)(targetDir);
|
|
552
514
|
for (const file of files) {
|
|
553
|
-
const filePath = (0, import_path.resolve)(
|
|
515
|
+
const filePath = (0, import_path.resolve)(targetDir, file.path);
|
|
554
516
|
if (!filePath.startsWith(resolvedBase + import_path.sep) && filePath !== resolvedBase) {
|
|
555
517
|
throw new Error(
|
|
556
|
-
`Path traversal detected
|
|
518
|
+
`Path traversal detected: "${file.path}" resolves outside target directory`
|
|
557
519
|
);
|
|
558
520
|
}
|
|
559
521
|
await (0, import_promises.mkdir)((0, import_path.dirname)(filePath), { recursive: true });
|
|
@@ -569,10 +531,10 @@ async function fetchAndWriteTemplateFiles(template, workDir) {
|
|
|
569
531
|
);
|
|
570
532
|
return;
|
|
571
533
|
}
|
|
572
|
-
const files = await (0,
|
|
534
|
+
const files = await (0, import_evalforge_github_client.fetchGitHubFolder)(template.source, {
|
|
573
535
|
userAgent: "EvalForge-Evaluator"
|
|
574
536
|
});
|
|
575
|
-
await
|
|
537
|
+
await writeFilesToDirectory(workDir, files);
|
|
576
538
|
}
|
|
577
539
|
function writeWixEnvFile(workDir) {
|
|
578
540
|
const configPath = import_path2.default.join(workDir, "wix.config.json");
|
|
@@ -626,86 +588,76 @@ var import_crypto2 = require("crypto");
|
|
|
626
588
|
|
|
627
589
|
// src/run-scenario/agents/registry.ts
|
|
628
590
|
var AgentAdapterRegistry = class {
|
|
629
|
-
/**
|
|
630
|
-
|
|
631
|
-
|
|
632
|
-
|
|
633
|
-
|
|
634
|
-
/**
|
|
635
|
-
* Set of all registered adapter instances (for getAll).
|
|
636
|
-
*/
|
|
591
|
+
/** Map of CLI commands to their registered adapters. */
|
|
592
|
+
commandMap = /* @__PURE__ */ new Map();
|
|
593
|
+
/** Map of adapter IDs to their registered adapters. */
|
|
594
|
+
idMap = /* @__PURE__ */ new Map();
|
|
595
|
+
/** Set of all registered adapter instances (for getAll). */
|
|
637
596
|
registeredAdapters = /* @__PURE__ */ new Set();
|
|
638
597
|
/**
|
|
639
598
|
* Register an agent adapter.
|
|
640
599
|
*
|
|
641
|
-
* The adapter
|
|
642
|
-
* If a command is already registered, it will be overwritten with a warning.
|
|
643
|
-
*
|
|
644
|
-
* @param adapter - The adapter to register
|
|
600
|
+
* The adapter is registered by its ID and for all commands in its supportedCommands array.
|
|
601
|
+
* If a command or ID is already registered, it will be overwritten with a warning.
|
|
645
602
|
*/
|
|
646
603
|
register(adapter) {
|
|
647
604
|
this.registeredAdapters.add(adapter);
|
|
605
|
+
this.idMap.set(adapter.id, adapter);
|
|
648
606
|
for (const command of adapter.supportedCommands) {
|
|
649
|
-
if (this.
|
|
650
|
-
const existing = this.
|
|
607
|
+
if (this.commandMap.has(command)) {
|
|
608
|
+
const existing = this.commandMap.get(command);
|
|
651
609
|
console.warn(
|
|
652
610
|
`[AgentAdapterRegistry] Command "${command}" already registered by adapter "${existing.id}". Overwriting with adapter "${adapter.id}".`
|
|
653
611
|
);
|
|
654
612
|
}
|
|
655
|
-
this.
|
|
613
|
+
this.commandMap.set(command, adapter);
|
|
656
614
|
}
|
|
657
615
|
}
|
|
658
|
-
/**
|
|
659
|
-
|
|
660
|
-
|
|
661
|
-
* @param runCommand - The run command to look up
|
|
662
|
-
* @returns The registered adapter, or undefined if not found
|
|
663
|
-
*/
|
|
664
|
-
get(runCommand) {
|
|
665
|
-
return this.adapters.get(runCommand);
|
|
616
|
+
/** Get an adapter by CLI command. */
|
|
617
|
+
getByCommand(command) {
|
|
618
|
+
return this.commandMap.get(command);
|
|
666
619
|
}
|
|
667
|
-
/**
|
|
668
|
-
|
|
669
|
-
|
|
670
|
-
* @param runCommand - The run command to check
|
|
671
|
-
* @returns True if an adapter is registered for this command
|
|
672
|
-
*/
|
|
673
|
-
has(runCommand) {
|
|
674
|
-
return this.adapters.has(runCommand);
|
|
620
|
+
/** Get an adapter by adapter ID. */
|
|
621
|
+
getById(adapterId) {
|
|
622
|
+
return this.idMap.get(adapterId);
|
|
675
623
|
}
|
|
676
624
|
/**
|
|
677
|
-
*
|
|
678
|
-
*
|
|
679
|
-
* @returns Array of all unique registered adapters
|
|
625
|
+
* Unified lookup: tries CLI command first, then adapter ID.
|
|
626
|
+
* Use this when the identifier could be either a command or an adapter ID.
|
|
680
627
|
*/
|
|
628
|
+
resolve(identifier) {
|
|
629
|
+
return this.commandMap.get(identifier) ?? this.idMap.get(identifier);
|
|
630
|
+
}
|
|
631
|
+
/** Check if a command or adapter ID has a registered adapter. */
|
|
632
|
+
has(identifier) {
|
|
633
|
+
return this.commandMap.has(identifier) || this.idMap.has(identifier);
|
|
634
|
+
}
|
|
635
|
+
/** Get all registered adapters. */
|
|
681
636
|
getAll() {
|
|
682
637
|
return Array.from(this.registeredAdapters);
|
|
683
638
|
}
|
|
684
|
-
/**
|
|
685
|
-
* Get all supported commands.
|
|
686
|
-
*
|
|
687
|
-
* @returns Array of all registered run commands
|
|
688
|
-
*/
|
|
639
|
+
/** Get all supported CLI commands. */
|
|
689
640
|
getSupportedCommands() {
|
|
690
|
-
return Array.from(this.
|
|
641
|
+
return Array.from(this.commandMap.keys());
|
|
642
|
+
}
|
|
643
|
+
/** Get all registered adapter IDs. */
|
|
644
|
+
getAdapterIds() {
|
|
645
|
+
return Array.from(this.idMap.keys());
|
|
691
646
|
}
|
|
692
647
|
/**
|
|
693
648
|
* Unregister an adapter by its ID.
|
|
694
|
-
*
|
|
695
649
|
* Removes the adapter and all its command mappings.
|
|
696
|
-
*
|
|
697
|
-
* @param adapterId - The ID of the adapter to remove
|
|
698
|
-
* @returns True if the adapter was found and removed
|
|
699
650
|
*/
|
|
700
651
|
unregister(adapterId) {
|
|
701
652
|
let found = false;
|
|
702
653
|
for (const adapter of this.registeredAdapters) {
|
|
703
654
|
if (adapter.id === adapterId) {
|
|
704
655
|
this.registeredAdapters.delete(adapter);
|
|
656
|
+
this.idMap.delete(adapterId);
|
|
705
657
|
found = true;
|
|
706
658
|
for (const command of adapter.supportedCommands) {
|
|
707
|
-
if (this.
|
|
708
|
-
this.
|
|
659
|
+
if (this.commandMap.get(command) === adapter) {
|
|
660
|
+
this.commandMap.delete(command);
|
|
709
661
|
}
|
|
710
662
|
}
|
|
711
663
|
break;
|
|
@@ -713,22 +665,21 @@ var AgentAdapterRegistry = class {
|
|
|
713
665
|
}
|
|
714
666
|
return found;
|
|
715
667
|
}
|
|
716
|
-
/**
|
|
717
|
-
* Clear all registered adapters.
|
|
718
|
-
* Primarily useful for testing.
|
|
719
|
-
*/
|
|
668
|
+
/** Clear all registered adapters. Primarily useful for testing. */
|
|
720
669
|
clear() {
|
|
721
|
-
this.
|
|
670
|
+
this.commandMap.clear();
|
|
671
|
+
this.idMap.clear();
|
|
722
672
|
this.registeredAdapters.clear();
|
|
723
673
|
}
|
|
724
674
|
};
|
|
725
675
|
var defaultRegistry = new AgentAdapterRegistry();
|
|
726
|
-
function getAdapter(
|
|
727
|
-
const adapter = defaultRegistry.
|
|
676
|
+
function getAdapter(identifier) {
|
|
677
|
+
const adapter = defaultRegistry.resolve(identifier);
|
|
728
678
|
if (!adapter) {
|
|
729
|
-
const
|
|
679
|
+
const commands = defaultRegistry.getSupportedCommands();
|
|
680
|
+
const ids = defaultRegistry.getAdapterIds();
|
|
730
681
|
throw new Error(
|
|
731
|
-
`No agent adapter registered for
|
|
682
|
+
`No agent adapter registered for "${identifier}". Supported commands: ${commands.length > 0 ? commands.join(", ") : "(none)"}. Registered adapters: ${ids.length > 0 ? ids.join(", ") : "(none)"}`
|
|
732
683
|
);
|
|
733
684
|
}
|
|
734
685
|
return adapter;
|
|
@@ -739,21 +690,65 @@ var import_evalforge_types4 = require("@wix/evalforge-types");
|
|
|
739
690
|
|
|
740
691
|
// src/run-scenario/agents/claude-code/execute.ts
|
|
741
692
|
var import_evalforge_types3 = require("@wix/evalforge-types");
|
|
693
|
+
|
|
694
|
+
// src/run-scenario/agents/claude-code/write-skills.ts
|
|
695
|
+
var import_promises2 = require("fs/promises");
|
|
696
|
+
var import_path3 = require("path");
|
|
697
|
+
var import_evalforge_github_client2 = require("@wix/evalforge-github-client");
|
|
698
|
+
async function writeSkillsToFilesystem(cwd, skills, fetchFn = import_evalforge_github_client2.fetchGitHubFolder) {
|
|
699
|
+
await Promise.all(
|
|
700
|
+
skills.map((skill) => writeSkillToFilesystem(cwd, skill, fetchFn))
|
|
701
|
+
);
|
|
702
|
+
}
|
|
703
|
+
async function writeSkillToFilesystem(cwd, skill, fetchFn = import_evalforge_github_client2.fetchGitHubFolder) {
|
|
704
|
+
const skillName = skill.name;
|
|
705
|
+
const skillDir = (0, import_path3.join)(cwd, ".claude", "skills", skillName);
|
|
706
|
+
await (0, import_promises2.mkdir)(skillDir, { recursive: true });
|
|
707
|
+
const version = skill.latestVersion;
|
|
708
|
+
if (version?.files && version.files.length > 0) {
|
|
709
|
+
await writeFilesToDirectory(skillDir, version.files);
|
|
710
|
+
console.log(
|
|
711
|
+
`[Skill] ${skillName}: wrote ${version.files.length} file(s) from snapshot`
|
|
712
|
+
);
|
|
713
|
+
} else if (skill.source) {
|
|
714
|
+
try {
|
|
715
|
+
const files = await fetchFn(skill.source, {
|
|
716
|
+
userAgent: "EvalForge-Evaluator"
|
|
717
|
+
});
|
|
718
|
+
await writeFilesToDirectory(skillDir, files);
|
|
719
|
+
console.log(
|
|
720
|
+
`[Skill] ${skillName}: wrote ${files.length} file(s) from GitHub (live)`
|
|
721
|
+
);
|
|
722
|
+
} catch (error) {
|
|
723
|
+
const message = error instanceof Error ? error.message : "Unknown error";
|
|
724
|
+
console.error(
|
|
725
|
+
`[Skill] ${skillName}: GitHub fetch failed: ${message}, no files to fall back to`
|
|
726
|
+
);
|
|
727
|
+
throw new Error(
|
|
728
|
+
`Failed to write skill ${skillName} to filesystem: ${message}`
|
|
729
|
+
);
|
|
730
|
+
}
|
|
731
|
+
} else {
|
|
732
|
+
throw new Error(`Skill ${skillName} has no files and no source configured`);
|
|
733
|
+
}
|
|
734
|
+
}
|
|
735
|
+
|
|
736
|
+
// src/run-scenario/agents/claude-code/execute.ts
|
|
742
737
|
var import_crypto = require("crypto");
|
|
743
738
|
|
|
744
739
|
// src/run-scenario/agents/claude-code/write-mcp.ts
|
|
745
|
-
var
|
|
746
|
-
var
|
|
740
|
+
var import_promises4 = require("fs/promises");
|
|
741
|
+
var import_path5 = require("path");
|
|
747
742
|
var import_evalforge_types2 = require("@wix/evalforge-types");
|
|
748
743
|
|
|
749
744
|
// src/run-scenario/agents/claude-code/resolve-mcp-placeholders.ts
|
|
750
|
-
var
|
|
751
|
-
var
|
|
745
|
+
var import_promises3 = require("fs/promises");
|
|
746
|
+
var import_path4 = require("path");
|
|
752
747
|
var import_os2 = require("os");
|
|
753
|
-
var WIX_AUTH_FILE = (0,
|
|
748
|
+
var WIX_AUTH_FILE = (0, import_path4.join)((0, import_os2.homedir)(), ".wix", "auth", "api-key.json");
|
|
754
749
|
async function loadWixAuthPlaceholders(authFilePath = WIX_AUTH_FILE) {
|
|
755
750
|
try {
|
|
756
|
-
const content = await (0,
|
|
751
|
+
const content = await (0, import_promises3.readFile)(authFilePath, "utf-8");
|
|
757
752
|
const auth = JSON.parse(content);
|
|
758
753
|
if (!auth.token || !auth.userInfo?.userId) {
|
|
759
754
|
return {};
|
|
@@ -806,14 +801,14 @@ async function writeMcpToFilesystem(cwd, mcps) {
|
|
|
806
801
|
null,
|
|
807
802
|
2
|
|
808
803
|
);
|
|
809
|
-
const filePath = (0,
|
|
810
|
-
await (0,
|
|
804
|
+
const filePath = (0, import_path5.join)(cwd, ".mcp.json");
|
|
805
|
+
await (0, import_promises4.writeFile)(filePath, content, "utf8");
|
|
811
806
|
console.log(`[MCP] Written to ${filePath}`);
|
|
812
807
|
}
|
|
813
808
|
|
|
814
809
|
// src/run-scenario/agents/claude-code/write-sub-agents.ts
|
|
815
|
-
var
|
|
816
|
-
var
|
|
810
|
+
var import_promises5 = require("fs/promises");
|
|
811
|
+
var import_path6 = require("path");
|
|
817
812
|
var AGENTS_DIR = ".claude/agents";
|
|
818
813
|
function toAgentFilename(name, index, nameCount) {
|
|
819
814
|
const base = (name || "").toLowerCase().replace(/\s+/g, "-").replace(/[^a-z0-9-]/g, "").replace(/^-+|-+$/g, "") || `sub-agent-${index}`;
|
|
@@ -823,20 +818,20 @@ function toAgentFilename(name, index, nameCount) {
|
|
|
823
818
|
}
|
|
824
819
|
async function writeSubAgentsToFilesystem(cwd, subAgents) {
|
|
825
820
|
if (subAgents.length === 0) return;
|
|
826
|
-
const agentsDir = (0,
|
|
827
|
-
await (0,
|
|
821
|
+
const agentsDir = (0, import_path6.join)(cwd, AGENTS_DIR);
|
|
822
|
+
await (0, import_promises5.mkdir)(agentsDir, { recursive: true });
|
|
828
823
|
const nameCount = /* @__PURE__ */ new Map();
|
|
829
824
|
for (const [i, agent] of subAgents.entries()) {
|
|
830
825
|
const filename = toAgentFilename(agent.name, i, nameCount);
|
|
831
|
-
const filePath = (0,
|
|
832
|
-
await (0,
|
|
826
|
+
const filePath = (0, import_path6.join)(agentsDir, `${filename}.md`);
|
|
827
|
+
await (0, import_promises5.writeFile)(filePath, agent.subAgentMd, "utf8");
|
|
833
828
|
}
|
|
834
829
|
console.log(`[SubAgents] Written to ${agentsDir}`);
|
|
835
830
|
}
|
|
836
831
|
|
|
837
832
|
// src/run-scenario/agents/claude-code/write-rules.ts
|
|
838
|
-
var
|
|
839
|
-
var
|
|
833
|
+
var import_promises6 = require("fs/promises");
|
|
834
|
+
var import_path7 = require("path");
|
|
840
835
|
var CURSOR_RULES_DIR = ".cursor/rules";
|
|
841
836
|
function toRuleFilename(name, index, nameCount) {
|
|
842
837
|
const base = (name || "").toLowerCase().replace(/\s+/g, "-").replace(/[^a-z0-9-]/g, "").replace(/^-+|-+$/g, "") || `rule-${index}`;
|
|
@@ -847,13 +842,13 @@ function toRuleFilename(name, index, nameCount) {
|
|
|
847
842
|
async function appendToFile(filePath, content) {
|
|
848
843
|
let existing = "";
|
|
849
844
|
try {
|
|
850
|
-
existing = await (0,
|
|
845
|
+
existing = await (0, import_promises6.readFile)(filePath, "utf8");
|
|
851
846
|
} catch {
|
|
852
847
|
}
|
|
853
848
|
const merged = existing ? `${existing.trimEnd()}
|
|
854
849
|
|
|
855
850
|
${content}` : content;
|
|
856
|
-
await (0,
|
|
851
|
+
await (0, import_promises6.writeFile)(filePath, merged, "utf8");
|
|
857
852
|
}
|
|
858
853
|
async function writeRulesToFilesystem(cwd, rules) {
|
|
859
854
|
if (rules.length === 0) return;
|
|
@@ -862,21 +857,21 @@ async function writeRulesToFilesystem(cwd, rules) {
|
|
|
862
857
|
for (const [i, rule] of rules.entries()) {
|
|
863
858
|
switch (rule.ruleType) {
|
|
864
859
|
case "claude-md": {
|
|
865
|
-
await appendToFile((0,
|
|
860
|
+
await appendToFile((0, import_path7.join)(cwd, "CLAUDE.md"), rule.content);
|
|
866
861
|
break;
|
|
867
862
|
}
|
|
868
863
|
case "agents-md": {
|
|
869
|
-
await appendToFile((0,
|
|
864
|
+
await appendToFile((0, import_path7.join)(cwd, "AGENTS.md"), rule.content);
|
|
870
865
|
break;
|
|
871
866
|
}
|
|
872
867
|
case "cursor-rule": {
|
|
873
868
|
if (!hasCursorRules) {
|
|
874
|
-
await (0,
|
|
869
|
+
await (0, import_promises6.mkdir)((0, import_path7.join)(cwd, CURSOR_RULES_DIR), { recursive: true });
|
|
875
870
|
hasCursorRules = true;
|
|
876
871
|
}
|
|
877
872
|
const filename = toRuleFilename(rule.name, i, nameCount);
|
|
878
|
-
const filePath = (0,
|
|
879
|
-
await (0,
|
|
873
|
+
const filePath = (0, import_path7.join)(cwd, CURSOR_RULES_DIR, `${filename}.md`);
|
|
874
|
+
await (0, import_promises6.writeFile)(filePath, rule.content, "utf8");
|
|
880
875
|
break;
|
|
881
876
|
}
|
|
882
877
|
}
|
|
@@ -1911,7 +1906,7 @@ defaultRegistry.register(claudeCodeAdapter);
|
|
|
1911
1906
|
|
|
1912
1907
|
// src/run-scenario/file-diff.ts
|
|
1913
1908
|
var import_fs2 = require("fs");
|
|
1914
|
-
var
|
|
1909
|
+
var import_path8 = require("path");
|
|
1915
1910
|
|
|
1916
1911
|
// ../../node_modules/diff/lib/index.mjs
|
|
1917
1912
|
function Diff() {
|
|
@@ -2527,8 +2522,8 @@ function snapshotDirectory(dir, baseDir) {
|
|
|
2527
2522
|
}
|
|
2528
2523
|
const entries = (0, import_fs2.readdirSync)(dir, { withFileTypes: true });
|
|
2529
2524
|
for (const entry of entries) {
|
|
2530
|
-
const fullPath = (0,
|
|
2531
|
-
const relativePath = (0,
|
|
2525
|
+
const fullPath = (0, import_path8.join)(dir, entry.name);
|
|
2526
|
+
const relativePath = (0, import_path8.relative)(base, fullPath);
|
|
2532
2527
|
if (shouldIgnore(entry.name)) {
|
|
2533
2528
|
continue;
|
|
2534
2529
|
}
|
|
@@ -2640,14 +2635,17 @@ var import_evalforge_types5 = require("@wix/evalforge-types");
|
|
|
2640
2635
|
var DEFAULT_AGENT_COMMAND = import_evalforge_types5.AgentRunCommand.CLAUDE;
|
|
2641
2636
|
async function runAgentWithContext(config, evalRunId2, scenario, evalData, workDir) {
|
|
2642
2637
|
const skillsGroupId = evalData.evalRun.skillsGroupId;
|
|
2638
|
+
const agent = evalData.agent ?? void 0;
|
|
2639
|
+
const isSDK = agent?.agentType === import_evalforge_types5.AgentType.SDK;
|
|
2643
2640
|
if (!skillsGroupId) {
|
|
2644
2641
|
throw new Error(`Eval run ${evalData.evalRun.id} has no skillsGroupId`);
|
|
2645
2642
|
}
|
|
2646
|
-
const
|
|
2647
|
-
const
|
|
2648
|
-
const adapter = getAdapter(runCommand);
|
|
2643
|
+
const identifier = isSDK ? agent.id : agent?.runCommand ?? DEFAULT_AGENT_COMMAND;
|
|
2644
|
+
const adapter = getAdapter(identifier);
|
|
2649
2645
|
const startedAt = (/* @__PURE__ */ new Date()).toISOString();
|
|
2650
2646
|
const beforeSnapshot = workDir ? snapshotDirectory(workDir) : {};
|
|
2647
|
+
const targetId = skillsGroupId ?? agent?.id ?? evalData.evalRun.id;
|
|
2648
|
+
const targetName = evalData.skillsGroupName || agent?.name || "";
|
|
2651
2649
|
const executionContext = {
|
|
2652
2650
|
skills: evalData.skills,
|
|
2653
2651
|
scenario,
|
|
@@ -2659,8 +2657,8 @@ async function runAgentWithContext(config, evalRunId2, scenario, evalData, workD
|
|
|
2659
2657
|
evalRunId: evalRunId2,
|
|
2660
2658
|
scenarioId: scenario.id,
|
|
2661
2659
|
scenarioName: scenario.name,
|
|
2662
|
-
targetId
|
|
2663
|
-
targetName
|
|
2660
|
+
targetId,
|
|
2661
|
+
targetName,
|
|
2664
2662
|
tracePushUrl: config.tracePushUrl,
|
|
2665
2663
|
routeHeader: config.routeHeader,
|
|
2666
2664
|
authToken: config.authToken
|
|
@@ -2677,8 +2675,8 @@ async function runAgentWithContext(config, evalRunId2, scenario, evalData, workD
|
|
|
2677
2675
|
const templateFiles = workDir ? extractTemplateFiles(beforeSnapshot, afterSnapshot) : void 0;
|
|
2678
2676
|
return {
|
|
2679
2677
|
id: (0, import_crypto2.randomUUID)(),
|
|
2680
|
-
targetId
|
|
2681
|
-
targetName
|
|
2678
|
+
targetId,
|
|
2679
|
+
targetName,
|
|
2682
2680
|
scenarioId: scenario.id,
|
|
2683
2681
|
scenarioName: scenario.name,
|
|
2684
2682
|
modelConfig: agent?.modelConfig,
|
|
@@ -2694,11 +2692,11 @@ async function runAgentWithContext(config, evalRunId2, scenario, evalData, workD
|
|
|
2694
2692
|
|
|
2695
2693
|
// src/run-scenario/index.ts
|
|
2696
2694
|
async function runScenario(config, evalRunId2, scenario, evalData, template, resolvedAssertions) {
|
|
2697
|
-
const
|
|
2695
|
+
const targetId = evalData.evalRun.skillsGroupId ?? evalData.agent?.id ?? evalData.evalRun.id;
|
|
2698
2696
|
const workDir = await prepareWorkingDirectory(
|
|
2699
2697
|
config,
|
|
2700
2698
|
evalRunId2,
|
|
2701
|
-
|
|
2699
|
+
targetId,
|
|
2702
2700
|
scenario.id,
|
|
2703
2701
|
template
|
|
2704
2702
|
);
|
|
@@ -2726,7 +2724,8 @@ async function runScenario(config, evalRunId2, scenario, evalData, template, res
|
|
|
2726
2724
|
})),
|
|
2727
2725
|
durationMs: partialResult.duration
|
|
2728
2726
|
};
|
|
2729
|
-
const { "x-wix-ai-gateway-stream":
|
|
2727
|
+
const { "x-wix-ai-gateway-stream": _ignored, ...judgeHeaders } = config.aiGatewayHeaders;
|
|
2728
|
+
void _ignored;
|
|
2730
2729
|
const defaultJudgeModel = import_evalforge_types6.DEFAULT_JUDGE_MODEL;
|
|
2731
2730
|
const assertionContext = {
|
|
2732
2731
|
workDir,
|
|
@@ -2903,7 +2902,7 @@ async function runEvaluation(projectId2, evalRunId2) {
|
|
|
2903
2902
|
scenarioItemCount: evalData.scenarioItems.length,
|
|
2904
2903
|
scenarios: evalData.scenarioItems.map((s) => s.scenario.name),
|
|
2905
2904
|
skillsCount: evalData.skills.length,
|
|
2906
|
-
|
|
2905
|
+
hasAgent: !!evalData.agent,
|
|
2907
2906
|
timestamp: Date.now()
|
|
2908
2907
|
})
|
|
2909
2908
|
);
|
|
@@ -2921,14 +2920,14 @@ async function runEvaluation(projectId2, evalRunId2) {
|
|
|
2921
2920
|
`[${ExecutionPhase.FETCH_EVAL_RUN}] Failed to fetch evaluation data: ${errorMsg}`
|
|
2922
2921
|
);
|
|
2923
2922
|
}
|
|
2924
|
-
const {
|
|
2923
|
+
const { agent, skills, scenarioItems } = evalData;
|
|
2925
2924
|
state.currentPhase = ExecutionPhase.VALIDATION;
|
|
2926
2925
|
state.currentContext = {
|
|
2927
2926
|
projectId: projectId2,
|
|
2928
2927
|
evalRunId: evalRunId2,
|
|
2929
2928
|
scenarioCount: scenarioItems.length,
|
|
2930
2929
|
skillCount: skills.length,
|
|
2931
|
-
hasAgent: !!
|
|
2930
|
+
hasAgent: !!agent,
|
|
2932
2931
|
agentId: evalData.evalRun.agentId,
|
|
2933
2932
|
skillsGroupId: evalData.evalRun.skillsGroupId
|
|
2934
2933
|
};
|
|
@@ -2937,9 +2936,9 @@ async function runEvaluation(projectId2, evalRunId2) {
|
|
|
2937
2936
|
`[${ExecutionPhase.VALIDATION}] Eval run has no skills: set skillsGroupId and ensure the group has skills. (skillsGroupId: ${evalData.evalRun.skillsGroupId || "not set"})`
|
|
2938
2937
|
);
|
|
2939
2938
|
}
|
|
2940
|
-
if (scenarioItems.length > 0 && skills.length > 0 && !
|
|
2939
|
+
if (scenarioItems.length > 0 && skills.length > 0 && !agent) {
|
|
2941
2940
|
throw new Error(
|
|
2942
|
-
`[${ExecutionPhase.VALIDATION}] Eval run has no
|
|
2941
|
+
`[${ExecutionPhase.VALIDATION}] Eval run has no agent: set agentId for skill-based runs. (agentId: ${evalData.evalRun.agentId || "not set"})`
|
|
2943
2942
|
);
|
|
2944
2943
|
}
|
|
2945
2944
|
let completedScenarios = 0;
|
|
@@ -2953,8 +2952,8 @@ async function runEvaluation(projectId2, evalRunId2) {
|
|
|
2953
2952
|
scenarioName: scenario.name,
|
|
2954
2953
|
skillsGroupId: evalData.evalRun.skillsGroupId,
|
|
2955
2954
|
skillsGroupName: evalData.skillsGroupName,
|
|
2956
|
-
agentId:
|
|
2957
|
-
agentName:
|
|
2955
|
+
agentId: agent?.id,
|
|
2956
|
+
agentName: agent?.name,
|
|
2958
2957
|
progress: `${completedScenarios + 1}/${totalScenarios}`
|
|
2959
2958
|
};
|
|
2960
2959
|
const skillNames = evalData.skills.map((s) => s.name).join(", ");
|
|
@@ -2962,7 +2961,7 @@ async function runEvaluation(projectId2, evalRunId2) {
|
|
|
2962
2961
|
"[Evaluator] Running scenario with skills group:",
|
|
2963
2962
|
evalData.skillsGroupName,
|
|
2964
2963
|
skillNames ? `(${skillNames})` : "",
|
|
2965
|
-
|
|
2964
|
+
agent ? `with agent: ${agent.name}` : "",
|
|
2966
2965
|
`(${completedScenarios + 1}/${totalScenarios})`
|
|
2967
2966
|
);
|
|
2968
2967
|
try {
|