@wix/evalforge-evaluator 0.90.0 → 0.92.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -1
- package/build/index.js +81 -9
- package/build/index.js.map +4 -4
- package/build/index.mjs +80 -8
- package/build/index.mjs.map +4 -4
- package/build/types/api-client.d.ts +2 -1
- package/build/types/fetch-evaluation-data.d.ts +2 -1
- package/build/types/run-scenario/agents/claude-code/types.d.ts +3 -1
- package/build/types/run-scenario/agents/claude-code/write-rules.d.ts +13 -0
- package/package.json +5 -5
package/README.md
CHANGED
|
@@ -9,12 +9,13 @@ evaluator <project-id> <eval-run-id>
|
|
|
9
9
|
```
|
|
10
10
|
|
|
11
11
|
1. **Load configuration** from environment variables (server URL, AI Gateway credentials, etc.)
|
|
12
|
-
2. **Fetch evaluation data** from the backend API — eval run, scenarios, agent config, skills, MCPs, sub-agents, and templates
|
|
12
|
+
2. **Fetch evaluation data** from the backend API — eval run, scenarios, agent config, skills, MCPs, sub-agents, rules, and templates
|
|
13
13
|
3. **For each scenario:**
|
|
14
14
|
- Prepare a working directory (download and extract template)
|
|
15
15
|
- Write skills to `.claude/skills/<name>/SKILL.md`
|
|
16
16
|
- Write MCPs to `.mcp.json`
|
|
17
17
|
- Write sub-agents to `.claude/agents/<name>.md`
|
|
18
|
+
- Write rules to `CLAUDE.md`, `AGENTS.md`, or `.cursor/rules/<name>.md` based on rule type
|
|
18
19
|
- Launch the Claude Code agent with the scenario's trigger prompt via `@anthropic-ai/claude-agent-sdk`
|
|
19
20
|
- Stream trace events back to the backend
|
|
20
21
|
- Run assertions on the agent's output
|
package/build/index.js
CHANGED
|
@@ -182,6 +182,9 @@ function createApiClient(serverUrl, options = "") {
|
|
|
182
182
|
getSubAgent(projectId2, id) {
|
|
183
183
|
return fetchJson(`/projects/${projectId2}/sub-agents/${id}`);
|
|
184
184
|
},
|
|
185
|
+
getRule(projectId2, id) {
|
|
186
|
+
return fetchJson(`/projects/${projectId2}/rules/${id}`);
|
|
187
|
+
},
|
|
185
188
|
getAssertion(projectId2, id) {
|
|
186
189
|
return fetchJson(`/projects/${projectId2}/assertions/${id}`);
|
|
187
190
|
},
|
|
@@ -286,6 +289,12 @@ function resolveSystemAssertion(assertionId, params) {
|
|
|
286
289
|
maxDurationMs: params?.maxDurationMs ?? 3e5
|
|
287
290
|
};
|
|
288
291
|
break;
|
|
292
|
+
case "cost":
|
|
293
|
+
baseAssertion = {
|
|
294
|
+
type: "cost",
|
|
295
|
+
maxCostUsd: params?.maxCostUsd ?? 1
|
|
296
|
+
};
|
|
297
|
+
break;
|
|
289
298
|
case "llm_judge":
|
|
290
299
|
baseAssertion = {
|
|
291
300
|
type: "llm_judge",
|
|
@@ -372,6 +381,12 @@ async function fetchEvaluationData(api, projectId2, evalRunId2) {
|
|
|
372
381
|
evalRun.subAgentIds.map((id) => api.getSubAgent(projectId2, id))
|
|
373
382
|
);
|
|
374
383
|
}
|
|
384
|
+
let rules = [];
|
|
385
|
+
if (evalRun.ruleIds && evalRun.ruleIds.length > 0) {
|
|
386
|
+
rules = await Promise.all(
|
|
387
|
+
evalRun.ruleIds.map((id) => api.getRule(projectId2, id))
|
|
388
|
+
);
|
|
389
|
+
}
|
|
375
390
|
const templateIds = [
|
|
376
391
|
...new Set(
|
|
377
392
|
scenarios.map((s) => s.templateId).filter((id) => !!id)
|
|
@@ -423,6 +438,7 @@ async function fetchEvaluationData(api, projectId2, evalRunId2) {
|
|
|
423
438
|
skillsGroupName,
|
|
424
439
|
mcps,
|
|
425
440
|
subAgents,
|
|
441
|
+
rules,
|
|
426
442
|
scenarioItems
|
|
427
443
|
};
|
|
428
444
|
}
|
|
@@ -723,6 +739,56 @@ async function writeSubAgentsToFilesystem(cwd, subAgents) {
|
|
|
723
739
|
console.log(`[SubAgents] Written to ${agentsDir}`);
|
|
724
740
|
}
|
|
725
741
|
|
|
742
|
+
// src/run-scenario/agents/claude-code/write-rules.ts
|
|
743
|
+
var import_promises4 = require("fs/promises");
|
|
744
|
+
var import_path5 = require("path");
|
|
745
|
+
var CURSOR_RULES_DIR = ".cursor/rules";
|
|
746
|
+
/**
 * Derive a unique, filesystem-safe base filename (without extension) for a rule.
 *
 * The rule name is lower-cased, whitespace runs become "-", every character
 * outside [a-z0-9-] is dropped, and leading/trailing dashes are trimmed. When
 * nothing survives (or the name is empty/nullish) `rule-<index>` is used.
 *
 * Repeated slugs get a numeric suffix: "foo", "foo-2", "foo-3", ...
 * Every name returned is also recorded in `nameCount`, so a generated suffix
 * can never collide with another rule whose own slug happens to look like
 * "<base>-<n>" (e.g. rules "foo", "foo", "foo-2").
 *
 * @param {string} name - Human-readable rule name; may be empty or nullish.
 * @param {number} index - Position of the rule, used only for the fallback name.
 * @param {Map<string, number>} nameCount - Registry shared across calls for one
 *   batch of rules; mutated by this function.
 * @returns {string} A filename stem not returned before for this `nameCount`.
 */
function toRuleFilename(name, index, nameCount) {
  const slug = (name || "")
    .toLowerCase()
    .replace(/\s+/g, "-")
    .replace(/[^a-z0-9-]/g, "")
    .replace(/^-+|-+$/g, "");
  const base = slug || `rule-${index}`;

  const seen = nameCount.get(base) ?? 0;
  if (seen === 0) {
    nameCount.set(base, 1);
    return base;
  }

  // Skip over suffixes that are already taken, either by an earlier duplicate
  // or by a rule whose own slug equals the suffixed form.
  let suffix = seen + 1;
  let candidate = `${base}-${suffix}`;
  while (nameCount.has(candidate)) {
    suffix += 1;
    candidate = `${base}-${suffix}`;
  }

  nameCount.set(base, suffix);
  // Reserve the generated name so a later rule with this exact slug is renamed.
  nameCount.set(candidate, 1);
  return candidate;
}
|
|
752
|
+
/**
 * Append `content` to the UTF-8 text file at `filePath`, creating it if needed.
 *
 * When the file already has content, trailing whitespace of that content is
 * trimmed and the new content is placed after exactly one blank line. When the
 * file is missing, empty, or whitespace-only, `content` is written as-is.
 *
 * @param {string} filePath - Path of the file to extend.
 * @param {string} content - Text to append.
 * @returns {Promise<void>}
 * @throws Rethrows any read error other than ENOENT, and any write error.
 */
async function appendToFile(filePath, content) {
  let existing = "";
  try {
    existing = await (0, import_promises4.readFile)(filePath, "utf8");
  } catch (err) {
    // Only "file does not exist" means we start from scratch. Swallowing other
    // failures would make the write below clobber content we failed to read.
    if (err?.code !== "ENOENT") {
      throw err;
    }
  }
  const head = existing.trimEnd();
  const merged = head ? `${head}\n\n${content}` : content;
  await (0, import_promises4.writeFile)(filePath, merged, "utf8");
}
|
|
763
|
+
/**
 * Write evaluation rules into the scenario working directory.
 *
 * Dispatches on `rule.ruleType`:
 *  - "claude-md"   -> content is appended to `<cwd>/CLAUDE.md`
 *  - "agents-md"   -> content is appended to `<cwd>/AGENTS.md`
 *  - "cursor-rule" -> content is written to `<cwd>/.cursor/rules/<slug>.md`,
 *                     where the slug comes from `toRuleFilename`
 * Rules with any other type are skipped with a warning.
 *
 * Rules are processed sequentially so appended content keeps the input order.
 *
 * @param {string} cwd - Working directory that receives the rule files.
 * @param {Array<{ruleType: string, name?: string, content: string}>} rules
 *   Rules to write; an empty or nullish list is a no-op.
 * @returns {Promise<void>}
 */
async function writeRulesToFilesystem(cwd, rules) {
  if (!rules || rules.length === 0) return;

  const nameCount = /* @__PURE__ */ new Map();
  let hasCursorRules = false;
  let written = 0;

  for (const [i, rule] of rules.entries()) {
    switch (rule.ruleType) {
      case "claude-md": {
        await appendToFile((0, import_path5.join)(cwd, "CLAUDE.md"), rule.content);
        written += 1;
        break;
      }
      case "agents-md": {
        await appendToFile((0, import_path5.join)(cwd, "AGENTS.md"), rule.content);
        written += 1;
        break;
      }
      case "cursor-rule": {
        // Create the rules directory lazily, once, on the first cursor rule.
        if (!hasCursorRules) {
          await (0, import_promises4.mkdir)((0, import_path5.join)(cwd, CURSOR_RULES_DIR), { recursive: true });
          hasCursorRules = true;
        }
        const filename = toRuleFilename(rule.name, i, nameCount);
        const filePath = (0, import_path5.join)(cwd, CURSOR_RULES_DIR, `${filename}.md`);
        await (0, import_promises4.writeFile)(filePath, rule.content, "utf8");
        written += 1;
        break;
      }
      default: {
        // Surface unknown types instead of dropping them silently.
        console.warn(
          `[Rules] Skipping rule "${rule.name ?? `#${i}`}" with unsupported type "${rule.ruleType}"`
        );
      }
    }
  }

  // Report what was actually written, which may be fewer than rules.length.
  console.log(`[Rules] Written ${written} rule(s) to ${cwd}`);
}
|
|
791
|
+
|
|
726
792
|
// src/run-scenario/agents/claude-code/execute.ts
|
|
727
793
|
var DEFAULT_MODEL = import_evalforge_types3.ClaudeModel.CLAUDE_4_5_SONNET_1_0;
|
|
728
794
|
function emitTraceEvent(event, tracePushUrl, routeHeader, authToken) {
|
|
@@ -937,10 +1003,10 @@ async function executeWithClaudeCode(skills, scenario, options) {
|
|
|
937
1003
|
}
|
|
938
1004
|
const startTime = /* @__PURE__ */ new Date();
|
|
939
1005
|
const allMessages = [];
|
|
940
|
-
const { mkdir: mkdirAsync, writeFile:
|
|
1006
|
+
const { mkdir: mkdirAsync, writeFile: writeFile5 } = await import("fs/promises");
|
|
941
1007
|
const claudeDir = `${options.cwd}/.claude`;
|
|
942
1008
|
await mkdirAsync(claudeDir, { recursive: true });
|
|
943
|
-
await
|
|
1009
|
+
await writeFile5(`${claudeDir}/settings.json`, "{}", {
|
|
944
1010
|
flag: "wx"
|
|
945
1011
|
}).catch(() => {
|
|
946
1012
|
});
|
|
@@ -950,6 +1016,9 @@ async function executeWithClaudeCode(skills, scenario, options) {
|
|
|
950
1016
|
if (options.subAgents && options.subAgents.length > 0) {
|
|
951
1017
|
await writeSubAgentsToFilesystem(options.cwd, options.subAgents);
|
|
952
1018
|
}
|
|
1019
|
+
if (options.rules && options.rules.length > 0) {
|
|
1020
|
+
await writeRulesToFilesystem(options.cwd, options.rules);
|
|
1021
|
+
}
|
|
953
1022
|
console.error(
|
|
954
1023
|
"[DEBUG-H4] writeSkillsToFilesystem START",
|
|
955
1024
|
JSON.stringify({
|
|
@@ -1701,7 +1770,8 @@ var ClaudeCodeAdapter = class {
|
|
|
1701
1770
|
aiGatewayHeaders,
|
|
1702
1771
|
traceContext,
|
|
1703
1772
|
mcps,
|
|
1704
|
-
subAgents
|
|
1773
|
+
subAgents,
|
|
1774
|
+
rules
|
|
1705
1775
|
} = context;
|
|
1706
1776
|
const modelForSdk = modelConfig?.model;
|
|
1707
1777
|
const options = {
|
|
@@ -1713,7 +1783,8 @@ var ClaudeCodeAdapter = class {
|
|
|
1713
1783
|
aiGatewayHeaders,
|
|
1714
1784
|
traceContext,
|
|
1715
1785
|
mcps,
|
|
1716
|
-
subAgents
|
|
1786
|
+
subAgents,
|
|
1787
|
+
rules
|
|
1717
1788
|
};
|
|
1718
1789
|
const { result, llmTrace } = await executeWithClaudeCode(
|
|
1719
1790
|
skills,
|
|
@@ -1740,7 +1811,7 @@ defaultRegistry.register(claudeCodeAdapter);
|
|
|
1740
1811
|
|
|
1741
1812
|
// src/run-scenario/file-diff.ts
|
|
1742
1813
|
var import_fs2 = require("fs");
|
|
1743
|
-
var
|
|
1814
|
+
var import_path6 = require("path");
|
|
1744
1815
|
|
|
1745
1816
|
// ../../node_modules/diff/lib/index.mjs
|
|
1746
1817
|
function Diff() {
|
|
@@ -1916,7 +1987,7 @@ Diff.prototype = {
|
|
|
1916
1987
|
tokenize: function tokenize(value) {
|
|
1917
1988
|
return Array.from(value);
|
|
1918
1989
|
},
|
|
1919
|
-
join: function
|
|
1990
|
+
join: function join5(chars) {
|
|
1920
1991
|
return chars.join("");
|
|
1921
1992
|
},
|
|
1922
1993
|
postProcess: function postProcess(changeObjects) {
|
|
@@ -2356,8 +2427,8 @@ function snapshotDirectory(dir, baseDir) {
|
|
|
2356
2427
|
}
|
|
2357
2428
|
const entries = (0, import_fs2.readdirSync)(dir, { withFileTypes: true });
|
|
2358
2429
|
for (const entry of entries) {
|
|
2359
|
-
const fullPath = (0,
|
|
2360
|
-
const relativePath = (0,
|
|
2430
|
+
const fullPath = (0, import_path6.join)(dir, entry.name);
|
|
2431
|
+
const relativePath = (0, import_path6.relative)(base, fullPath);
|
|
2361
2432
|
if (shouldIgnore(entry.name)) {
|
|
2362
2433
|
continue;
|
|
2363
2434
|
}
|
|
@@ -2495,7 +2566,8 @@ async function runAgentWithContext(config, evalRunId2, scenario, evalData, workD
|
|
|
2495
2566
|
authToken: config.authToken
|
|
2496
2567
|
},
|
|
2497
2568
|
mcps: evalData.mcps.length > 0 ? evalData.mcps : void 0,
|
|
2498
|
-
subAgents: evalData.subAgents.length > 0 ? evalData.subAgents : void 0
|
|
2569
|
+
subAgents: evalData.subAgents.length > 0 ? evalData.subAgents : void 0,
|
|
2570
|
+
rules: evalData.rules?.length > 0 ? evalData.rules : void 0
|
|
2499
2571
|
};
|
|
2500
2572
|
const { outputText, durationMs, llmTrace } = await adapter.execute(executionContext);
|
|
2501
2573
|
const completedAt = (/* @__PURE__ */ new Date()).toISOString();
|