npm - @forwardimpact/libeval - Versions diffs - 0.1.57 → 0.1.58 - Mend

@forwardimpact/libeval 0.1.57 → 0.1.58

This diff shows the differences between the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (10)

package/bin/fit-benchmark.js +8 -6
package/bin/fit-eval.js +6 -8
package/package.json +2 -2
package/src/agent-runner.js +3 -1
package/src/commands/benchmark-run.js +7 -3
package/src/commands/discuss.js +3 -2
package/src/commands/facilitate.js +3 -2
package/src/commands/run.js +2 -1
package/src/commands/supervise.js +3 -2
package/src/discusser.js +3 -2

package/bin/fit-benchmark.js CHANGED Viewed

@@ -10,6 +10,10 @@ import { createLogger } from "@forwardimpact/libtelemetry";
 import { runBenchmarkRunCommand } from "../src/commands/benchmark-run.js";
 import { runBenchmarkInvariantsCommand } from "../src/commands/benchmark-invariants.js";
 import { runBenchmarkReportCommand } from "../src/commands/benchmark-report.js";
+import {
+  BENCHMARK_AGENT_MODEL,
+  LEAD_MODEL,
+} from "@forwardimpact/libutil/models";
 export const definition = {
   name: "fit-benchmark",
@@ -38,17 +42,15 @@ export const definition = {
         },
         "agent-model": {
           type: "string",
-          description:
-            "Claude model for the agent-under-test (default: claude-sonnet-4-6)",
+          description: `Claude model for the agent-under-test (default: ${BENCHMARK_AGENT_MODEL})`,
         },
         "lead-model": {
           type: "string",
-          description:
-            "Claude model for the lead role (default: claude-opus-4-7)",
+          description: `Claude model for the lead role (default: ${LEAD_MODEL})`,
         },
         "judge-model": {
           type: "string",
-          description: "Claude model for the judge (default: claude-opus-4-7)",
+          description: `Claude model for the judge (default: ${LEAD_MODEL})`,
         },
         "agent-profile": {
           type: "string",
@@ -126,7 +128,7 @@ export const definition = {
   },
   examples: [
     "fit-benchmark run --family=./families/coding",
-    "fit-benchmark run --family=./families/coding --runs=10 --agent-model=claude-sonnet-4-6",
+    `fit-benchmark run --family=./families/coding --runs=10 --agent-model=${BENCHMARK_AGENT_MODEL}`,
     "fit-benchmark invariants --family=./families/coding --task=todo-api --workdir=./benchmark-runs/runs/todo-api/0",
     "fit-benchmark report --format=text",
     "fit-benchmark report --input=./runs/today --k=1,3,5 --format=text",

package/bin/fit-eval.js CHANGED Viewed

@@ -13,6 +13,7 @@ import { runSuperviseCommand } from "../src/commands/supervise.js";
 import { runFacilitateCommand } from "../src/commands/facilitate.js";
 import { runDiscussCommand } from "../src/commands/discuss.js";
 import { runCallbackCommand } from "../src/commands/callback.js";
+import { AGENT_MODEL, LEAD_MODEL } from "@forwardimpact/libutil/models";
 const LEAD_OPTIONS = {
   "lead-profile": {
@@ -21,8 +22,7 @@ const LEAD_OPTIONS = {
   },
   "lead-model": {
     type: "string",
-    description:
-      "Claude model for the lead role (default: claude-opus-4-7[1m])",
+    description: `Claude model for the lead role (default: ${LEAD_MODEL})`,
   },
 };
@@ -64,8 +64,7 @@ const definition = {
         ...TASK_INPUT_OPTIONS,
         "agent-model": {
           type: "string",
-          description:
-            "Claude model for the agent (default: claude-opus-4-7[1m])",
+          description: `Claude model for the agent (default: ${AGENT_MODEL})`,
         },
         "max-turns": {
           type: "string",
@@ -102,8 +101,7 @@ const definition = {
         ...TASK_INPUT_OPTIONS,
         "agent-model": {
           type: "string",
-          description:
-            "Claude model for the agent (default: claude-opus-4-7[1m])",
+          description: `Claude model for the agent (default: ${AGENT_MODEL})`,
         },
         ...LEAD_OPTIONS,
         "max-turns": {
@@ -147,7 +145,7 @@ const definition = {
         ...TASK_INPUT_OPTIONS,
         "agent-model": {
           type: "string",
-          description: "Claude model for agents (default: claude-opus-4-7[1m])",
+          description: `Claude model for agents (default: ${AGENT_MODEL})`,
         },
         ...LEAD_OPTIONS,
         "max-turns": {
@@ -184,7 +182,7 @@ const definition = {
         ...TASK_INPUT_OPTIONS,
         "agent-model": {
           type: "string",
-          description: "Claude model for agents (default: claude-opus-4-7[1m])",
+          description: `Claude model for agents (default: ${AGENT_MODEL})`,
         },
         ...LEAD_OPTIONS,
         "max-turns": {

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@forwardimpact/libeval",
-  "version": "0.1.57",
+  "version": "0.1.58",
   "description": "Agent evaluation framework — prove whether agent changes improved outcomes with reproducible evidence.",
   "keywords": [
     "eval",
@@ -51,7 +51,7 @@
     "test": "bun test test/*.test.js"
   },
   "dependencies": {
-    "@anthropic-ai/claude-agent-sdk": "0.2.112",
+    "@anthropic-ai/claude-agent-sdk": "0.3.170",
     "@forwardimpact/libcli": "^0.1.0",
     "@forwardimpact/libconfig": "^0.1.0",
     "@forwardimpact/libpreflight": "^0.1.0",

package/src/agent-runner.js CHANGED Viewed

@@ -6,6 +6,8 @@
  * Follows OO+DI: constructor injection, factory function, tests bypass factory.
  */
+import { AGENT_MODEL } from "@forwardimpact/libutil/models";
 const DEFAULT_ALLOWED_TOOLS = ["Bash", "Read", "Glob", "Grep", "Write", "Edit"];
 // fit-eval and kata-action run headless in CI/CD with no human to answer
@@ -43,7 +45,7 @@ export class AgentRunner {
     this.query = deps.query;
     this.output = deps.output;
     this.redactor = deps.redactor;
-    this.model = deps.model ?? "claude-opus-4-7[1m]";
+    this.model = deps.model ?? AGENT_MODEL;
     this.maxTurns = deps.maxTurns ?? 50;
     this.allowedTools = deps.allowedTools ?? DEFAULT_ALLOWED_TOOLS;
     this.onLine = deps.onLine ?? null;

package/src/commands/benchmark-run.js CHANGED Viewed

@@ -8,6 +8,10 @@ import { resolve } from "node:path";
 import { createConfig } from "@forwardimpact/libconfig";
 import { createBenchmarkRunner } from "../benchmark/runner.js";
+import {
+  BENCHMARK_AGENT_MODEL,
+  LEAD_MODEL,
+} from "@forwardimpact/libutil/models";
 /**
  * @param {import("@forwardimpact/libcli").InvocationContext} ctx
@@ -54,9 +58,9 @@ function parseRunOptions(values) {
     family,
     runs,
     output: resolve(output),
-    agentModel: values["agent-model"] ?? "claude-sonnet-4-6",
-    supervisorModel: values["lead-model"] ?? "claude-opus-4-7",
-    judgeModel: values["judge-model"] ?? "claude-opus-4-7",
+    agentModel: values["agent-model"] ?? BENCHMARK_AGENT_MODEL,
+    supervisorModel: values["lead-model"] ?? LEAD_MODEL,
+    judgeModel: values["judge-model"] ?? LEAD_MODEL,
     profiles: {
       agent: values["agent-profile"] ?? null,
       judge: values["judge-profile"] ?? null,

package/src/commands/discuss.js CHANGED Viewed

@@ -4,6 +4,7 @@ import { createDiscusser } from "../discusser.js";
 import { createRedactor } from "../redaction.js";
 import { createTeeWriter } from "../tee-writer.js";
 import { resolveTaskContent } from "./task-input.js";
+import { AGENT_MODEL, LEAD_MODEL } from "@forwardimpact/libutil/models";
 function parseAgentProfiles(raw, cwd, maxTurns) {
   if (!raw) return [];
@@ -52,8 +53,8 @@ export function parseDiscussOptions(values, runtime) {
     taskAmend,
     agentConfigs,
     leadProfile: values["lead-profile"] ?? undefined,
-    leadModel: values["lead-model"] ?? "claude-opus-4-7[1m]",
-    agentModel: values["agent-model"] ?? "claude-opus-4-7[1m]",
+    leadModel: values["lead-model"] ?? LEAD_MODEL,
+    agentModel: values["agent-model"] ?? AGENT_MODEL,
     maxTurns,
     maxLeadTurns,
     outputPath: values.output,

package/src/commands/facilitate.js CHANGED Viewed

@@ -4,6 +4,7 @@ import { createFacilitator } from "../facilitator.js";
 import { createRedactor } from "../redaction.js";
 import { createTeeWriter } from "../tee-writer.js";
 import { resolveTaskContent } from "./task-input.js";
+import { AGENT_MODEL, LEAD_MODEL } from "@forwardimpact/libutil/models";
 /**
  * Parse comma-separated agent profile names into structured configs.
@@ -50,8 +51,8 @@ export function parseFacilitateOptions(values, runtime) {
     taskAmend,
     agentConfigs,
     facilitatorCwd: resolve(values["facilitator-cwd"] ?? "."),
-    agentModel: values["agent-model"] ?? "claude-opus-4-7[1m]",
-    facilitatorModel: values["lead-model"] ?? "claude-opus-4-7[1m]",
+    agentModel: values["agent-model"] ?? AGENT_MODEL,
+    facilitatorModel: values["lead-model"] ?? LEAD_MODEL,
     maxTurns,
     outputPath: values.output,
     facilitatorProfile: values["lead-profile"] ?? undefined,

package/src/commands/run.js CHANGED Viewed

@@ -8,6 +8,7 @@ import { createTeeWriter } from "../tee-writer.js";
 import { SequenceCounter } from "../sequence-counter.js";
 import { resolveTaskContent } from "./task-input.js";
 import { createServiceConfig } from "@forwardimpact/libconfig";
+import { AGENT_MODEL } from "@forwardimpact/libutil/models";
 /**
  * Parse and validate run command options from parsed values.
@@ -26,7 +27,7 @@ function parseRunOptions(values, runtime) {
     taskContent,
     taskAmend,
     cwd: resolve(values.cwd ?? "."),
-    agentModel: values["agent-model"] ?? "claude-opus-4-7[1m]",
+    agentModel: values["agent-model"] ?? AGENT_MODEL,
     maxTurns: maxTurnsRaw === "0" ? 0 : parseInt(maxTurnsRaw, 10),
     outputPath: values.output,
     agentProfile: values["agent-profile"] ?? undefined,

package/src/commands/supervise.js CHANGED Viewed

@@ -5,6 +5,7 @@ import { createRedactor } from "../redaction.js";
 import { createTeeWriter } from "../tee-writer.js";
 import { resolveTaskContent } from "./task-input.js";
 import { createServiceConfig } from "@forwardimpact/libconfig";
+import { AGENT_MODEL, LEAD_MODEL } from "@forwardimpact/libutil/models";
 /**
  * Parse all supervise flags from parsed values into an options object.
@@ -30,8 +31,8 @@ export async function parseSuperviseOptions(values, runtime) {
     taskAmend,
     supervisorCwd: resolve(values["supervisor-cwd"] ?? "."),
     agentCwd,
-    agentModel: values["agent-model"] ?? "claude-opus-4-7[1m]",
-    supervisorModel: values["lead-model"] ?? "claude-opus-4-7[1m]",
+    agentModel: values["agent-model"] ?? AGENT_MODEL,
+    supervisorModel: values["lead-model"] ?? LEAD_MODEL,
     maxTurns: (() => {
       const raw = values["max-turns"] ?? "200";
       return raw === "0" ? 0 : parseInt(raw, 10);

package/src/discusser.js CHANGED Viewed

@@ -29,6 +29,7 @@ import {
   DISCUSS_AGENT_SYSTEM_PROMPT,
 } from "./discuss-tools.js";
 import { OrchestrationLoop } from "./orchestration-loop.js";
+import { AGENT_MODEL, LEAD_MODEL } from "@forwardimpact/libutil/models";
 /** System prompt for the discuss-mode lead. L0 mechanics only per COALIGNED. */
 export const DISCUSS_SYSTEM_PROMPT =
@@ -314,7 +315,7 @@ export function createDiscusser({
       cwd: config.cwd ?? resolvedLeadCwd,
       query,
       output: devNull,
-      model: agentModel ?? "claude-opus-4-7[1m]",
+      model: agentModel ?? AGENT_MODEL,
       maxTurns: config.maxTurns ?? 50,
       allowedTools: config.allowedTools,
       onLine: (line) => discusser.loop.emitLine(config.name, line),
@@ -347,7 +348,7 @@ export function createDiscusser({
     cwd: resolvedLeadCwd,
     query,
     output: devNull,
-    model: leadModel ?? "claude-opus-4-7[1m]",
+    model: leadModel ?? LEAD_MODEL,
     maxTurns: maxTurns ?? 80,
     allowedTools: ["Read", "Glob", "Grep"],
     disallowedTools: defaultDisallowed,