@wix/evalforge-evaluator 0.90.0 → 0.92.0

This diff shows the changes between publicly available versions of this package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -9,12 +9,13 @@ evaluator <project-id> <eval-run-id>
9
9
  ```
10
10
 
11
11
  1. **Load configuration** from environment variables (server URL, AI Gateway credentials, etc.)
12
- 2. **Fetch evaluation data** from the backend API — eval run, scenarios, agent config, skills, MCPs, sub-agents, and templates
12
+ 2. **Fetch evaluation data** from the backend API — eval run, scenarios, agent config, skills, MCPs, sub-agents, rules, and templates
13
13
  3. **For each scenario:**
14
14
  - Prepare a working directory (download and extract template)
15
15
  - Write skills to `.claude/skills/<name>/SKILL.md`
16
16
  - Write MCPs to `.mcp.json`
17
17
  - Write sub-agents to `.claude/agents/<name>.md`
18
+ - Write rules to `CLAUDE.md`, `AGENTS.md`, or `.cursor/rules/<name>.md` based on rule type
18
19
  - Launch the Claude Code agent with the scenario's trigger prompt via `@anthropic-ai/claude-agent-sdk`
19
20
  - Stream trace events back to the backend
20
21
  - Run assertions on the agent's output
package/build/index.js CHANGED
@@ -182,6 +182,9 @@ function createApiClient(serverUrl, options = "") {
182
182
  getSubAgent(projectId2, id) {
183
183
  return fetchJson(`/projects/${projectId2}/sub-agents/${id}`);
184
184
  },
185
+ getRule(projectId2, id) {
186
+ return fetchJson(`/projects/${projectId2}/rules/${id}`);
187
+ },
185
188
  getAssertion(projectId2, id) {
186
189
  return fetchJson(`/projects/${projectId2}/assertions/${id}`);
187
190
  },
@@ -286,6 +289,12 @@ function resolveSystemAssertion(assertionId, params) {
286
289
  maxDurationMs: params?.maxDurationMs ?? 3e5
287
290
  };
288
291
  break;
292
+ case "cost":
293
+ baseAssertion = {
294
+ type: "cost",
295
+ maxCostUsd: params?.maxCostUsd ?? 1
296
+ };
297
+ break;
289
298
  case "llm_judge":
290
299
  baseAssertion = {
291
300
  type: "llm_judge",
@@ -372,6 +381,12 @@ async function fetchEvaluationData(api, projectId2, evalRunId2) {
372
381
  evalRun.subAgentIds.map((id) => api.getSubAgent(projectId2, id))
373
382
  );
374
383
  }
384
+ let rules = [];
385
+ if (evalRun.ruleIds && evalRun.ruleIds.length > 0) {
386
+ rules = await Promise.all(
387
+ evalRun.ruleIds.map((id) => api.getRule(projectId2, id))
388
+ );
389
+ }
375
390
  const templateIds = [
376
391
  ...new Set(
377
392
  scenarios.map((s) => s.templateId).filter((id) => !!id)
@@ -423,6 +438,7 @@ async function fetchEvaluationData(api, projectId2, evalRunId2) {
423
438
  skillsGroupName,
424
439
  mcps,
425
440
  subAgents,
441
+ rules,
426
442
  scenarioItems
427
443
  };
428
444
  }
@@ -723,6 +739,56 @@ async function writeSubAgentsToFilesystem(cwd, subAgents) {
723
739
  console.log(`[SubAgents] Written to ${agentsDir}`);
724
740
  }
725
741
 
742
+ // src/run-scenario/agents/claude-code/write-rules.ts
743
+ var import_promises4 = require("fs/promises");
744
+ var import_path5 = require("path");
745
+ var CURSOR_RULES_DIR = ".cursor/rules";
746
+ function toRuleFilename(name, index, nameCount) {
747
+ const base = (name || "").toLowerCase().replace(/\s+/g, "-").replace(/[^a-z0-9-]/g, "").replace(/^-+|-+$/g, "") || `rule-${index}`;
748
+ const count = nameCount.get(base) ?? 0;
749
+ nameCount.set(base, count + 1);
750
+ return count === 0 ? base : `${base}-${count + 1}`;
751
+ }
752
+ async function appendToFile(filePath, content) {
753
+ let existing = "";
754
+ try {
755
+ existing = await (0, import_promises4.readFile)(filePath, "utf8");
756
+ } catch {
757
+ }
758
+ const merged = existing ? `${existing.trimEnd()}
759
+
760
+ ${content}` : content;
761
+ await (0, import_promises4.writeFile)(filePath, merged, "utf8");
762
+ }
763
+ async function writeRulesToFilesystem(cwd, rules) {
764
+ if (rules.length === 0) return;
765
+ const nameCount = /* @__PURE__ */ new Map();
766
+ let hasCursorRules = false;
767
+ for (const [i, rule] of rules.entries()) {
768
+ switch (rule.ruleType) {
769
+ case "claude-md": {
770
+ await appendToFile((0, import_path5.join)(cwd, "CLAUDE.md"), rule.content);
771
+ break;
772
+ }
773
+ case "agents-md": {
774
+ await appendToFile((0, import_path5.join)(cwd, "AGENTS.md"), rule.content);
775
+ break;
776
+ }
777
+ case "cursor-rule": {
778
+ if (!hasCursorRules) {
779
+ await (0, import_promises4.mkdir)((0, import_path5.join)(cwd, CURSOR_RULES_DIR), { recursive: true });
780
+ hasCursorRules = true;
781
+ }
782
+ const filename = toRuleFilename(rule.name, i, nameCount);
783
+ const filePath = (0, import_path5.join)(cwd, CURSOR_RULES_DIR, `${filename}.md`);
784
+ await (0, import_promises4.writeFile)(filePath, rule.content, "utf8");
785
+ break;
786
+ }
787
+ }
788
+ }
789
+ console.log(`[Rules] Written ${rules.length} rule(s) to ${cwd}`);
790
+ }
791
+
726
792
  // src/run-scenario/agents/claude-code/execute.ts
727
793
  var DEFAULT_MODEL = import_evalforge_types3.ClaudeModel.CLAUDE_4_5_SONNET_1_0;
728
794
  function emitTraceEvent(event, tracePushUrl, routeHeader, authToken) {
@@ -937,10 +1003,10 @@ async function executeWithClaudeCode(skills, scenario, options) {
937
1003
  }
938
1004
  const startTime = /* @__PURE__ */ new Date();
939
1005
  const allMessages = [];
940
- const { mkdir: mkdirAsync, writeFile: writeFile4 } = await import("fs/promises");
1006
+ const { mkdir: mkdirAsync, writeFile: writeFile5 } = await import("fs/promises");
941
1007
  const claudeDir = `${options.cwd}/.claude`;
942
1008
  await mkdirAsync(claudeDir, { recursive: true });
943
- await writeFile4(`${claudeDir}/settings.json`, "{}", {
1009
+ await writeFile5(`${claudeDir}/settings.json`, "{}", {
944
1010
  flag: "wx"
945
1011
  }).catch(() => {
946
1012
  });
@@ -950,6 +1016,9 @@ async function executeWithClaudeCode(skills, scenario, options) {
950
1016
  if (options.subAgents && options.subAgents.length > 0) {
951
1017
  await writeSubAgentsToFilesystem(options.cwd, options.subAgents);
952
1018
  }
1019
+ if (options.rules && options.rules.length > 0) {
1020
+ await writeRulesToFilesystem(options.cwd, options.rules);
1021
+ }
953
1022
  console.error(
954
1023
  "[DEBUG-H4] writeSkillsToFilesystem START",
955
1024
  JSON.stringify({
@@ -1701,7 +1770,8 @@ var ClaudeCodeAdapter = class {
1701
1770
  aiGatewayHeaders,
1702
1771
  traceContext,
1703
1772
  mcps,
1704
- subAgents
1773
+ subAgents,
1774
+ rules
1705
1775
  } = context;
1706
1776
  const modelForSdk = modelConfig?.model;
1707
1777
  const options = {
@@ -1713,7 +1783,8 @@ var ClaudeCodeAdapter = class {
1713
1783
  aiGatewayHeaders,
1714
1784
  traceContext,
1715
1785
  mcps,
1716
- subAgents
1786
+ subAgents,
1787
+ rules
1717
1788
  };
1718
1789
  const { result, llmTrace } = await executeWithClaudeCode(
1719
1790
  skills,
@@ -1740,7 +1811,7 @@ defaultRegistry.register(claudeCodeAdapter);
1740
1811
 
1741
1812
  // src/run-scenario/file-diff.ts
1742
1813
  var import_fs2 = require("fs");
1743
- var import_path5 = require("path");
1814
+ var import_path6 = require("path");
1744
1815
 
1745
1816
  // ../../node_modules/diff/lib/index.mjs
1746
1817
  function Diff() {
@@ -1916,7 +1987,7 @@ Diff.prototype = {
1916
1987
  tokenize: function tokenize(value) {
1917
1988
  return Array.from(value);
1918
1989
  },
1919
- join: function join4(chars) {
1990
+ join: function join5(chars) {
1920
1991
  return chars.join("");
1921
1992
  },
1922
1993
  postProcess: function postProcess(changeObjects) {
@@ -2356,8 +2427,8 @@ function snapshotDirectory(dir, baseDir) {
2356
2427
  }
2357
2428
  const entries = (0, import_fs2.readdirSync)(dir, { withFileTypes: true });
2358
2429
  for (const entry of entries) {
2359
- const fullPath = (0, import_path5.join)(dir, entry.name);
2360
- const relativePath = (0, import_path5.relative)(base, fullPath);
2430
+ const fullPath = (0, import_path6.join)(dir, entry.name);
2431
+ const relativePath = (0, import_path6.relative)(base, fullPath);
2361
2432
  if (shouldIgnore(entry.name)) {
2362
2433
  continue;
2363
2434
  }
@@ -2495,7 +2566,8 @@ async function runAgentWithContext(config, evalRunId2, scenario, evalData, workD
2495
2566
  authToken: config.authToken
2496
2567
  },
2497
2568
  mcps: evalData.mcps.length > 0 ? evalData.mcps : void 0,
2498
- subAgents: evalData.subAgents.length > 0 ? evalData.subAgents : void 0
2569
+ subAgents: evalData.subAgents.length > 0 ? evalData.subAgents : void 0,
2570
+ rules: evalData.rules?.length > 0 ? evalData.rules : void 0
2499
2571
  };
2500
2572
  const { outputText, durationMs, llmTrace } = await adapter.execute(executionContext);
2501
2573
  const completedAt = (/* @__PURE__ */ new Date()).toISOString();