@wix/evalforge-evaluator 0.91.0 → 0.92.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -1
- package/build/index.js +75 -9
- package/build/index.js.map +4 -4
- package/build/index.mjs +74 -8
- package/build/index.mjs.map +4 -4
- package/build/types/api-client.d.ts +2 -1
- package/build/types/fetch-evaluation-data.d.ts +2 -1
- package/build/types/run-scenario/agents/claude-code/types.d.ts +3 -1
- package/build/types/run-scenario/agents/claude-code/write-rules.d.ts +13 -0
- package/package.json +4 -4
package/README.md
CHANGED
|
@@ -9,12 +9,13 @@ evaluator <project-id> <eval-run-id>
|
|
|
9
9
|
```
|
|
10
10
|
|
|
11
11
|
1. **Load configuration** from environment variables (server URL, AI Gateway credentials, etc.)
|
|
12
|
-
2. **Fetch evaluation data** from the backend API — eval run, scenarios, agent config, skills, MCPs, sub-agents, and templates
|
|
12
|
+
2. **Fetch evaluation data** from the backend API — eval run, scenarios, agent config, skills, MCPs, sub-agents, rules, and templates
|
|
13
13
|
3. **For each scenario:**
|
|
14
14
|
- Prepare a working directory (download and extract template)
|
|
15
15
|
- Write skills to `.claude/skills/<name>/SKILL.md`
|
|
16
16
|
- Write MCPs to `.mcp.json`
|
|
17
17
|
- Write sub-agents to `.claude/agents/<name>.md`
|
|
18
|
+
- Write rules to `CLAUDE.md`, `AGENTS.md`, or `.cursor/rules/<name>.md` based on rule type
|
|
18
19
|
- Launch the Claude Code agent with the scenario's trigger prompt via `@anthropic-ai/claude-agent-sdk`
|
|
19
20
|
- Stream trace events back to the backend
|
|
20
21
|
- Run assertions on the agent's output
|
package/build/index.js
CHANGED
|
@@ -182,6 +182,9 @@ function createApiClient(serverUrl, options = "") {
|
|
|
182
182
|
getSubAgent(projectId2, id) {
|
|
183
183
|
return fetchJson(`/projects/${projectId2}/sub-agents/${id}`);
|
|
184
184
|
},
|
|
185
|
+
getRule(projectId2, id) {
|
|
186
|
+
return fetchJson(`/projects/${projectId2}/rules/${id}`);
|
|
187
|
+
},
|
|
185
188
|
getAssertion(projectId2, id) {
|
|
186
189
|
return fetchJson(`/projects/${projectId2}/assertions/${id}`);
|
|
187
190
|
},
|
|
@@ -378,6 +381,12 @@ async function fetchEvaluationData(api, projectId2, evalRunId2) {
|
|
|
378
381
|
evalRun.subAgentIds.map((id) => api.getSubAgent(projectId2, id))
|
|
379
382
|
);
|
|
380
383
|
}
|
|
384
|
+
let rules = [];
|
|
385
|
+
if (evalRun.ruleIds && evalRun.ruleIds.length > 0) {
|
|
386
|
+
rules = await Promise.all(
|
|
387
|
+
evalRun.ruleIds.map((id) => api.getRule(projectId2, id))
|
|
388
|
+
);
|
|
389
|
+
}
|
|
381
390
|
const templateIds = [
|
|
382
391
|
...new Set(
|
|
383
392
|
scenarios.map((s) => s.templateId).filter((id) => !!id)
|
|
@@ -429,6 +438,7 @@ async function fetchEvaluationData(api, projectId2, evalRunId2) {
|
|
|
429
438
|
skillsGroupName,
|
|
430
439
|
mcps,
|
|
431
440
|
subAgents,
|
|
441
|
+
rules,
|
|
432
442
|
scenarioItems
|
|
433
443
|
};
|
|
434
444
|
}
|
|
@@ -729,6 +739,56 @@ async function writeSubAgentsToFilesystem(cwd, subAgents) {
|
|
|
729
739
|
console.log(`[SubAgents] Written to ${agentsDir}`);
|
|
730
740
|
}
|
|
731
741
|
|
|
742
|
+
// src/run-scenario/agents/claude-code/write-rules.ts
|
|
743
|
+
var import_promises4 = require("fs/promises");
|
|
744
|
+
var import_path5 = require("path");
|
|
745
|
+
var CURSOR_RULES_DIR = ".cursor/rules";
|
|
746
|
+
/**
 * Derive a unique, filesystem-safe base filename (without extension) for a rule.
 *
 * The name is lower-cased, whitespace runs become "-", every character outside
 * [a-z0-9-] is dropped and leading/trailing dashes are trimmed. When nothing is
 * left (or `name` is not a string) the fallback `rule-<index>` is used.
 *
 * @param {string | undefined | null} name - Human-readable rule name.
 * @param {number} index - Position of the rule in the list; used for the fallback name.
 * @param {Map<string, number>} nameCount - Bookkeeping shared across calls. It is
 *   mutated: every filename handed out is recorded here so it is never returned twice.
 * @returns {string} A filename that has not been returned before for this `nameCount`.
 */
function toRuleFilename(name, index, nameCount) {
  // A non-string name is treated like a missing one instead of throwing on .toLowerCase().
  const rawName = typeof name === "string" ? name : "";
  const slug = rawName
    .toLowerCase()
    .replace(/\s+/g, "-")
    .replace(/[^a-z0-9-]/g, "")
    .replace(/^-+|-+$/g, "");
  const base = slug || `rule-${index}`;

  let suffix = nameCount.get(base) ?? 0;
  if (suffix === 0) {
    // First use of this name: hand it out unchanged.
    nameCount.set(base, 1);
    return base;
  }

  // The plain name is taken. Probe "-2", "-3", ... and skip any candidate that
  // was already handed out (e.g. a rule literally named "foo-2"), so two rules
  // can never end up writing to the same file.
  let filename;
  do {
    suffix += 1;
    filename = `${base}-${suffix}`;
  } while (nameCount.has(filename));

  nameCount.set(base, suffix);
  // Reserve the generated name so a later rule whose own slug equals it gets a suffix too.
  nameCount.set(filename, 1);
  return filename;
}
|
|
752
|
+
/**
 * Append `content` to the UTF-8 text file at `filePath`, creating the file
 * when it does not exist yet.
 *
 * Existing text has its trailing whitespace trimmed and is separated from the
 * new content by exactly one blank line. If the file is missing, empty or
 * contains only whitespace, the result is just `content`.
 *
 * @param {string} filePath - File to extend.
 * @param {string} content - Text to add at the end of the file.
 * @returns {Promise<void>}
 * @throws Re-throws any read error other than "file not found", and any write error.
 */
async function appendToFile(filePath, content) {
  let existing = "";
  try {
    existing = await (0, import_promises4.readFile)(filePath, "utf8");
  } catch (err) {
    // Only a missing file means "start from scratch". Any other failure
    // (permissions, I/O error, ...) must surface; otherwise the write below
    // would replace a file we merely failed to read.
    if (err?.code !== "ENOENT") {
      throw err;
    }
  }
  // Decide on the trimmed text so a whitespace-only file does not leave
  // stray blank lines in front of the new content.
  const head = existing.trimEnd();
  const merged = head ? `${head}\n\n${content}` : content;
  await (0, import_promises4.writeFile)(filePath, merged, "utf8");
}
|
|
763
|
+
/**
 * Materialise evaluation rules inside the scenario working directory.
 *
 * Dispatch is by `rule.ruleType`:
 *  - "claude-md"   -> content is appended to `<cwd>/CLAUDE.md`
 *  - "agents-md"   -> content is appended to `<cwd>/AGENTS.md`
 *  - "cursor-rule" -> content is written to `<cwd>/.cursor/rules/<name>.md`,
 *                     where `<name>` comes from `toRuleFilename`
 * Rules with any other type are skipped with a warning.
 *
 * Rules are processed sequentially and in order, so several rules targeting
 * the same file are appended in the order they appear in `rules`.
 *
 * @param {string} cwd - Working directory the agent will run in.
 * @param {Array<{ name?: string, ruleType: string, content: string }>} rules - Rules to write.
 * @returns {Promise<void>}
 */
async function writeRulesToFilesystem(cwd, rules) {
  if (rules.length === 0) return;
  const nameCount = /* @__PURE__ */ new Map();
  let hasCursorRules = false;
  // Count what was actually written so the summary log stays truthful when
  // some rules are skipped.
  let writtenCount = 0;
  for (const [i, rule] of rules.entries()) {
    switch (rule.ruleType) {
      case "claude-md": {
        await appendToFile((0, import_path5.join)(cwd, "CLAUDE.md"), rule.content);
        writtenCount += 1;
        break;
      }
      case "agents-md": {
        await appendToFile((0, import_path5.join)(cwd, "AGENTS.md"), rule.content);
        writtenCount += 1;
        break;
      }
      case "cursor-rule": {
        // Create the rules directory lazily, once, on the first cursor rule.
        if (!hasCursorRules) {
          await (0, import_promises4.mkdir)((0, import_path5.join)(cwd, CURSOR_RULES_DIR), { recursive: true });
          hasCursorRules = true;
        }
        const filename = toRuleFilename(rule.name, i, nameCount);
        const filePath = (0, import_path5.join)(cwd, CURSOR_RULES_DIR, `${filename}.md`);
        await (0, import_promises4.writeFile)(filePath, rule.content, "utf8");
        writtenCount += 1;
        break;
      }
      default: {
        // Previously such rules were dropped silently yet still counted as written.
        console.warn(
          `[Rules] Skipping rule "${rule.name}" with unsupported type "${rule.ruleType}"`
        );
      }
    }
  }
  console.log(`[Rules] Written ${writtenCount} rule(s) to ${cwd}`);
}
|
|
791
|
+
|
|
732
792
|
// src/run-scenario/agents/claude-code/execute.ts
|
|
733
793
|
var DEFAULT_MODEL = import_evalforge_types3.ClaudeModel.CLAUDE_4_5_SONNET_1_0;
|
|
734
794
|
function emitTraceEvent(event, tracePushUrl, routeHeader, authToken) {
|
|
@@ -943,10 +1003,10 @@ async function executeWithClaudeCode(skills, scenario, options) {
|
|
|
943
1003
|
}
|
|
944
1004
|
const startTime = /* @__PURE__ */ new Date();
|
|
945
1005
|
const allMessages = [];
|
|
946
|
-
const { mkdir: mkdirAsync, writeFile:
|
|
1006
|
+
const { mkdir: mkdirAsync, writeFile: writeFile5 } = await import("fs/promises");
|
|
947
1007
|
const claudeDir = `${options.cwd}/.claude`;
|
|
948
1008
|
await mkdirAsync(claudeDir, { recursive: true });
|
|
949
|
-
await
|
|
1009
|
+
await writeFile5(`${claudeDir}/settings.json`, "{}", {
|
|
950
1010
|
flag: "wx"
|
|
951
1011
|
}).catch(() => {
|
|
952
1012
|
});
|
|
@@ -956,6 +1016,9 @@ async function executeWithClaudeCode(skills, scenario, options) {
|
|
|
956
1016
|
if (options.subAgents && options.subAgents.length > 0) {
|
|
957
1017
|
await writeSubAgentsToFilesystem(options.cwd, options.subAgents);
|
|
958
1018
|
}
|
|
1019
|
+
if (options.rules && options.rules.length > 0) {
|
|
1020
|
+
await writeRulesToFilesystem(options.cwd, options.rules);
|
|
1021
|
+
}
|
|
959
1022
|
console.error(
|
|
960
1023
|
"[DEBUG-H4] writeSkillsToFilesystem START",
|
|
961
1024
|
JSON.stringify({
|
|
@@ -1707,7 +1770,8 @@ var ClaudeCodeAdapter = class {
|
|
|
1707
1770
|
aiGatewayHeaders,
|
|
1708
1771
|
traceContext,
|
|
1709
1772
|
mcps,
|
|
1710
|
-
subAgents
|
|
1773
|
+
subAgents,
|
|
1774
|
+
rules
|
|
1711
1775
|
} = context;
|
|
1712
1776
|
const modelForSdk = modelConfig?.model;
|
|
1713
1777
|
const options = {
|
|
@@ -1719,7 +1783,8 @@ var ClaudeCodeAdapter = class {
|
|
|
1719
1783
|
aiGatewayHeaders,
|
|
1720
1784
|
traceContext,
|
|
1721
1785
|
mcps,
|
|
1722
|
-
subAgents
|
|
1786
|
+
subAgents,
|
|
1787
|
+
rules
|
|
1723
1788
|
};
|
|
1724
1789
|
const { result, llmTrace } = await executeWithClaudeCode(
|
|
1725
1790
|
skills,
|
|
@@ -1746,7 +1811,7 @@ defaultRegistry.register(claudeCodeAdapter);
|
|
|
1746
1811
|
|
|
1747
1812
|
// src/run-scenario/file-diff.ts
|
|
1748
1813
|
var import_fs2 = require("fs");
|
|
1749
|
-
var
|
|
1814
|
+
var import_path6 = require("path");
|
|
1750
1815
|
|
|
1751
1816
|
// ../../node_modules/diff/lib/index.mjs
|
|
1752
1817
|
function Diff() {
|
|
@@ -1922,7 +1987,7 @@ Diff.prototype = {
|
|
|
1922
1987
|
tokenize: function tokenize(value) {
|
|
1923
1988
|
return Array.from(value);
|
|
1924
1989
|
},
|
|
1925
|
-
join: function
|
|
1990
|
+
join: function join5(chars) {
|
|
1926
1991
|
return chars.join("");
|
|
1927
1992
|
},
|
|
1928
1993
|
postProcess: function postProcess(changeObjects) {
|
|
@@ -2362,8 +2427,8 @@ function snapshotDirectory(dir, baseDir) {
|
|
|
2362
2427
|
}
|
|
2363
2428
|
const entries = (0, import_fs2.readdirSync)(dir, { withFileTypes: true });
|
|
2364
2429
|
for (const entry of entries) {
|
|
2365
|
-
const fullPath = (0,
|
|
2366
|
-
const relativePath = (0,
|
|
2430
|
+
const fullPath = (0, import_path6.join)(dir, entry.name);
|
|
2431
|
+
const relativePath = (0, import_path6.relative)(base, fullPath);
|
|
2367
2432
|
if (shouldIgnore(entry.name)) {
|
|
2368
2433
|
continue;
|
|
2369
2434
|
}
|
|
@@ -2501,7 +2566,8 @@ async function runAgentWithContext(config, evalRunId2, scenario, evalData, workD
|
|
|
2501
2566
|
authToken: config.authToken
|
|
2502
2567
|
},
|
|
2503
2568
|
mcps: evalData.mcps.length > 0 ? evalData.mcps : void 0,
|
|
2504
|
-
subAgents: evalData.subAgents.length > 0 ? evalData.subAgents : void 0
|
|
2569
|
+
subAgents: evalData.subAgents.length > 0 ? evalData.subAgents : void 0,
|
|
2570
|
+
rules: evalData.rules?.length > 0 ? evalData.rules : void 0
|
|
2505
2571
|
};
|
|
2506
2572
|
const { outputText, durationMs, llmTrace } = await adapter.execute(executionContext);
|
|
2507
2573
|
const completedAt = (/* @__PURE__ */ new Date()).toISOString();
|