npm - @agentv/core - Versions diffs - 4.6.0 → 4.7.0 - Mend

@agentv/core 4.6.0 → 4.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14)

package/dist/{chunk-AIQ5FO4G.js → chunk-75RFVESM.js} +273 -125
package/dist/chunk-75RFVESM.js.map +1 -0
package/dist/evaluation/validation/index.cjs +110 -95
package/dist/evaluation/validation/index.cjs.map +1 -1
package/dist/evaluation/validation/index.js +30 -72
package/dist/evaluation/validation/index.js.map +1 -1
package/dist/index.cjs +1488 -517
package/dist/index.cjs.map +1 -1
package/dist/index.d.cts +291 -74
package/dist/index.d.ts +291 -74
package/dist/index.js +1187 -369
package/dist/index.js.map +1 -1
package/package.json +1 -1
package/dist/chunk-AIQ5FO4G.js.map +0 -1

package/dist/index.js (changed)

@@ -1,5 +1,6 @@
 import {
   COMMON_TARGET_SETTINGS,
+  LLM_GRADER_CAPABLE_KINDS,
   TEST_MESSAGE_ROLES,
   buildDirectoryChain,
   buildSearchRoots,
@@ -21,9 +22,10 @@ import {
   normalizeLineEndings,
   readJsonFile,
   readTextFile,
+  resolveDelegatedTargetDefinition,
   resolveFileReference,
   resolveTargetDefinition
-} from "./chunk-AIQ5FO4G.js";
+} from "./chunk-75RFVESM.js";
 import {
   AgentvProvider
 } from "./chunk-PRNXHNLF.js";
@@ -855,8 +857,13 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
     const negate = rawEvaluator.negate === true ? true : void 0;
     if (isCustomType) {
       const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
-      const required2 = parseRequired(rawEvaluator.required);
-      const knownProps2 = /* @__PURE__ */ new Set(["name", "type", "weight", "required", "negate"]);
+      const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
+        rawEvaluator.required,
+        rawEvaluator.min_score,
+        name,
+        evalId
+      );
+      const knownProps2 = /* @__PURE__ */ new Set(["name", "type", "weight", "required", "min_score", "negate"]);
       const config2 = {};
       for (const [key, value] of Object.entries(rawEvaluator)) {
         if (!knownProps2.has(key) && value !== void 0) {
@@ -868,6 +875,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
         type: customTypeName,
         ...weight2 !== void 0 ? { weight: weight2 } : {},
         ...required2 !== void 0 ? { required: required2 } : {},
+        ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
         ...negate !== void 0 ? { negate } : {},
         ...Object.keys(config2).length > 0 ? { config: config2 } : {}
       });
@@ -937,7 +945,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
           );
         }
       }
-      const required2 = parseRequired(rawEvaluator.required);
+      const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
+        rawEvaluator.required,
+        rawEvaluator.min_score,
+        name,
+        evalId
+      );
       const knownProps2 = /* @__PURE__ */ new Set([
         "name",
         "type",
@@ -963,6 +976,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
         resolvedCwd,
         ...weight2 !== void 0 ? { weight: weight2 } : {},
         ...required2 !== void 0 ? { required: required2 } : {},
+        ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
         ...negate !== void 0 ? { negate } : {},
         ...Object.keys(config2).length > 0 ? { config: config2 } : {},
         ...targetConfig !== void 0 ? { target: targetConfig } : {}
@@ -1091,7 +1105,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
         };
       }
       const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
-      const required2 = parseRequired(rawEvaluator.required);
+      const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
+        rawEvaluator.required,
+        rawEvaluator.min_score,
+        name,
+        evalId
+      );
       evaluators.push({
         name,
         type: "composite",
@@ -1099,6 +1118,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
         aggregator,
         ...weight2 !== void 0 ? { weight: weight2 } : {},
         ...required2 !== void 0 ? { required: required2 } : {},
+        ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
         ...negate !== void 0 ? { negate } : {}
       });
       continue;
@@ -1209,7 +1229,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
         continue;
       }
       const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
-      const required2 = parseRequired(rawEvaluator.required);
+      const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
+        rawEvaluator.required,
+        rawEvaluator.min_score,
+        name,
+        evalId
+      );
       const config2 = {
         name,
         type: "tool-trajectory",
@@ -1218,6 +1243,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
         ...expected ? { expected } : {},
         ...weight2 !== void 0 ? { weight: weight2 } : {},
         ...required2 !== void 0 ? { required: required2 } : {},
+        ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
         ...negate !== void 0 ? { negate } : {},
         ...argsMatch2 !== void 0 ? { argsMatch: argsMatch2 } : {}
       };
@@ -1280,7 +1306,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
       const aggregation = asString(rawEvaluator.aggregation);
       const validAggregation = isValidFieldAggregationType(aggregation) ? aggregation : void 0;
       const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
-      const required2 = parseRequired(rawEvaluator.required);
+      const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
+        rawEvaluator.required,
+        rawEvaluator.min_score,
+        name,
+        evalId
+      );
       evaluators.push({
         name,
         type: "field-accuracy",
@@ -1288,6 +1319,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
         ...validAggregation ? { aggregation: validAggregation } : {},
         ...weight2 !== void 0 ? { weight: weight2 } : {},
         ...required2 !== void 0 ? { required: required2 } : {},
+        ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
         ...negate !== void 0 ? { negate } : {}
       });
       continue;
@@ -1301,13 +1333,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
         continue;
       }
       const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
-      const required2 = parseRequired(rawEvaluator.required);
+      const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
+        rawEvaluator.required,
+        rawEvaluator.min_score,
+        name,
+        evalId
+      );
       evaluators.push({
         name,
         type: "latency",
         threshold,
         ...weight2 !== void 0 ? { weight: weight2 } : {},
         ...required2 !== void 0 ? { required: required2 } : {},
+        ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
         ...negate !== void 0 ? { negate } : {}
       });
       continue;
@@ -1321,13 +1359,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
         continue;
       }
       const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
-      const required2 = parseRequired(rawEvaluator.required);
+      const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
+        rawEvaluator.required,
+        rawEvaluator.min_score,
+        name,
+        evalId
+      );
       evaluators.push({
         name,
         type: "cost",
         budget,
         ...weight2 !== void 0 ? { weight: weight2 } : {},
         ...required2 !== void 0 ? { required: required2 } : {},
+        ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
         ...negate !== void 0 ? { negate } : {}
       });
       continue;
@@ -1359,13 +1403,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
         continue;
       }
       const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
-      const required2 = parseRequired(rawEvaluator.required);
+      const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
+        rawEvaluator.required,
+        rawEvaluator.min_score,
+        name,
+        evalId
+      );
       evaluators.push({
         name,
         type: "token-usage",
         ...validLimits,
         ...weight2 !== void 0 ? { weight: weight2 } : {},
         ...required2 !== void 0 ? { required: required2 } : {},
+        ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
         ...negate !== void 0 ? { negate } : {}
       });
       continue;
@@ -1411,13 +1461,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
         continue;
       }
       const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
-      const required2 = parseRequired(rawEvaluator.required);
+      const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
+        rawEvaluator.required,
+        rawEvaluator.min_score,
+        name,
+        evalId
+      );
       evaluators.push({
         name,
         type: "execution-metrics",
         ...validThresholds,
         ...weight2 !== void 0 ? { weight: weight2 } : {},
         ...required2 !== void 0 ? { required: required2 } : {},
+        ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
         ...negate !== void 0 ? { negate } : {}
       });
       continue;
@@ -1431,7 +1487,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
       const rawShouldTrigger = rawEvaluator.should_trigger;
       const shouldTrigger = typeof rawShouldTrigger === "boolean" ? rawShouldTrigger : void 0;
       const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
-      const required2 = parseRequired(rawEvaluator.required);
+      const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
+        rawEvaluator.required,
+        rawEvaluator.min_score,
+        name,
+        evalId
+      );
       evaluators.push({
         name,
         type: "skill-trigger",
@@ -1439,6 +1500,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
         ...shouldTrigger !== void 0 ? { should_trigger: shouldTrigger } : {},
         ...weight2 !== void 0 ? { weight: weight2 } : {},
         ...required2 !== void 0 ? { required: required2 } : {},
+        ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
         ...negate !== void 0 ? { negate } : {}
       });
       continue;
@@ -1450,13 +1512,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
         continue;
       }
       const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
-      const required2 = parseRequired(rawEvaluator.required);
+      const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
+        rawEvaluator.required,
+        rawEvaluator.min_score,
+        name,
+        evalId
+      );
       evaluators.push({
         name,
         type: "contains",
         value,
         ...weight2 !== void 0 ? { weight: weight2 } : {},
         ...required2 !== void 0 ? { required: required2 } : {},
+        ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
         ...negate !== void 0 ? { negate } : {}
       });
       continue;
@@ -1470,13 +1538,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
         continue;
       }
       const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
-      const required2 = parseRequired(rawEvaluator.required);
+      const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
+        rawEvaluator.required,
+        rawEvaluator.min_score,
+        name,
+        evalId
+      );
       evaluators.push({
         name,
         type: typeValue,
         value,
         ...weight2 !== void 0 ? { weight: weight2 } : {},
         ...required2 !== void 0 ? { required: required2 } : {},
+        ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
         ...negate !== void 0 ? { negate } : {}
       });
       continue;
@@ -1488,13 +1562,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
         continue;
       }
       const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
-      const required2 = parseRequired(rawEvaluator.required);
+      const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
+        rawEvaluator.required,
+        rawEvaluator.min_score,
+        name,
+        evalId
+      );
       evaluators.push({
         name,
         type: "icontains",
         value,
         ...weight2 !== void 0 ? { weight: weight2 } : {},
         ...required2 !== void 0 ? { required: required2 } : {},
+        ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
         ...negate !== void 0 ? { negate } : {}
       });
       continue;
@@ -1508,13 +1588,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
         continue;
       }
       const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
-      const required2 = parseRequired(rawEvaluator.required);
+      const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
+        rawEvaluator.required,
+        rawEvaluator.min_score,
+        name,
+        evalId
+      );
       evaluators.push({
         name,
         type: typeValue,
         value,
         ...weight2 !== void 0 ? { weight: weight2 } : {},
         ...required2 !== void 0 ? { required: required2 } : {},
+        ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
         ...negate !== void 0 ? { negate } : {}
       });
       continue;
@@ -1526,13 +1612,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
         continue;
       }
       const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
-      const required2 = parseRequired(rawEvaluator.required);
+      const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
+        rawEvaluator.required,
+        rawEvaluator.min_score,
+        name,
+        evalId
+      );
       evaluators.push({
         name,
         type: typeValue,
         value,
         ...weight2 !== void 0 ? { weight: weight2 } : {},
         ...required2 !== void 0 ? { required: required2 } : {},
+        ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
         ...negate !== void 0 ? { negate } : {}
       });
       continue;
@@ -1545,7 +1637,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
       }
       const flags = asString(rawEvaluator.flags);
       const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
-      const required2 = parseRequired(rawEvaluator.required);
+      const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
+        rawEvaluator.required,
+        rawEvaluator.min_score,
+        name,
+        evalId
+      );
       evaluators.push({
         name,
         type: "regex",
@@ -1553,18 +1650,25 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
         ...flags !== void 0 ? { flags } : {},
         ...weight2 !== void 0 ? { weight: weight2 } : {},
         ...required2 !== void 0 ? { required: required2 } : {},
+        ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
         ...negate !== void 0 ? { negate } : {}
       });
       continue;
     }
     if (typeValue === "is-json") {
       const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
-      const required2 = parseRequired(rawEvaluator.required);
+      const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
+        rawEvaluator.required,
+        rawEvaluator.min_score,
+        name,
+        evalId
+      );
       evaluators.push({
         name,
         type: "is-json",
         ...weight2 !== void 0 ? { weight: weight2 } : {},
         ...required2 !== void 0 ? { required: required2 } : {},
+        ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
         ...negate !== void 0 ? { negate } : {}
       });
       continue;
@@ -1576,13 +1680,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
         continue;
       }
       const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
-      const required2 = parseRequired(rawEvaluator.required);
+      const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
+        rawEvaluator.required,
+        rawEvaluator.min_score,
+        name,
+        evalId
+      );
       evaluators.push({
         name,
         type: "equals",
         value,
         ...weight2 !== void 0 ? { weight: weight2 } : {},
         ...required2 !== void 0 ? { required: required2 } : {},
+        ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
         ...negate !== void 0 ? { negate } : {}
       });
       continue;
@@ -1618,7 +1728,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
         continue;
       }
       const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
-      const required2 = parseRequired(rawEvaluator.required);
+      const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
+        rawEvaluator.required,
+        rawEvaluator.min_score,
+        name,
+        evalId
+      );
       evaluators.push({
         name,
         type: "llm-grader",
@@ -1626,6 +1741,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
         ...graderTargetName ? { target: graderTargetName } : {},
         ...weight2 !== void 0 ? { weight: weight2 } : {},
         ...required2 !== void 0 ? { required: required2 } : {},
+        ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
         ...negate !== void 0 ? { negate } : {}
       });
       continue;
@@ -1695,7 +1811,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
         continue;
       }
       const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
-      const required2 = parseRequired(rawEvaluator.required);
+      const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
+        rawEvaluator.required,
+        rawEvaluator.min_score,
+        name,
+        evalId
+      );
       evaluators.push({
         name,
         type: "llm-grader",
@@ -1703,12 +1824,18 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
         ...graderTargetName ? { target: graderTargetName } : {},
         ...weight2 !== void 0 ? { weight: weight2 } : {},
         ...required2 !== void 0 ? { required: required2 } : {},
+        ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
         ...negate !== void 0 ? { negate } : {}
       });
       continue;
     }
     const weight = validateWeight(rawEvaluator.weight, name, evalId);
-    const required = parseRequired(rawEvaluator.required);
+    const { required, min_score } = parseRequiredAndMinScore(
+      rawEvaluator.required,
+      rawEvaluator.min_score,
+      name,
+      evalId
+    );
     const knownProps = /* @__PURE__ */ new Set([
       "name",
       "type",
@@ -1719,6 +1846,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
       "weight",
       "config",
       "required",
+      "min_score",
       "negate",
       "max_steps",
       "maxSteps",
@@ -1748,6 +1876,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
       ...graderTargetName ? { target: graderTargetName } : {},
       ...weight !== void 0 ? { weight } : {},
       ...required !== void 0 ? { required } : {},
+      ...min_score !== void 0 ? { min_score } : {},
       ...negate !== void 0 ? { negate } : {},
       ...finalConfig ? { config: finalConfig } : {},
       ...llmMaxSteps !== void 0 ? { max_steps: llmMaxSteps } : {},
@@ -1879,10 +2008,23 @@ ${detailBlock}${ANSI_RESET4}`);
     console.warn(`${ANSI_YELLOW3}Warning: ${message}${ANSI_RESET4}`);
   }
 }
-function parseRequired(value) {
-  if (value === true) return true;
-  if (typeof value === "number" && value > 0 && value <= 1) return value;
-  return void 0;
+function parseRequiredAndMinScore(rawRequired, rawMinScore, evaluatorName, evalId) {
+  const result = {};
+  if (typeof rawMinScore === "number" && rawMinScore > 0 && rawMinScore <= 1) {
+    result.min_score = rawMinScore;
+  }
+  if (rawRequired === true) {
+    result.required = true;
+  } else if (typeof rawRequired === "number" && rawRequired > 0 && rawRequired <= 1) {
+    if (result.min_score === void 0) {
+      result.min_score = rawRequired;
+    }
+    result.required = rawRequired;
+    logWarning2(
+      `Evaluator '${evaluatorName}' in '${evalId}': 'required: ${rawRequired}' is deprecated. Use 'required: true' + 'min_score: ${rawRequired}' instead.`
+    );
+  }
+  return result;
 }
 function validateWeight(rawWeight, evaluatorName, evalId) {
   if (rawWeight === void 0) {
@@ -1925,16 +2067,30 @@ function parseRubricItems(rawRubrics, evaluatorName, evalId) {
     const id = asString(rawRubric.id) ?? `rubric-${index + 1}`;
     const expectedOutcome = asString(rawRubric.outcome) ?? "";
     const weight = typeof rawRubric.weight === "number" ? rawRubric.weight : 1;
+    let minScore;
     let requiredMinScore;
     let required;
-    if (typeof rawRubric.required_min_score === "number") {
-      const minScore = rawRubric.required_min_score;
-      if (!Number.isInteger(minScore) || minScore < 0 || minScore > 10) {
+    if (typeof rawRubric.min_score === "number") {
+      const ms = rawRubric.min_score;
+      if (ms <= 0 || ms > 1) {
+        throw new Error(
+          `Invalid min_score for rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': must be in (0, 1] (got ${ms})`
+        );
+      }
+      minScore = ms;
+      requiredMinScore = Math.round(ms * 10);
+    } else if (typeof rawRubric.required_min_score === "number") {
+      const rms = rawRubric.required_min_score;
+      if (!Number.isInteger(rms) || rms < 0 || rms > 10) {
         throw new Error(
-          `Invalid required_min_score for rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': must be an integer 0-10 (got ${minScore})`
+          `Invalid required_min_score for rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': must be an integer 0-10 (got ${rms})`
         );
       }
-      requiredMinScore = minScore;
+      requiredMinScore = rms;
+      minScore = rms / 10;
+      logWarning2(
+        `Rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': 'required_min_score: ${rms}' is deprecated. Use 'min_score: ${rms / 10}' (0-1 scale) instead.`
+      );
     }
     if (typeof rawRubric.required === "boolean") {
       required = rawRubric.required;
@@ -1954,6 +2110,7 @@ function parseRubricItems(rawRubrics, evaluatorName, evalId) {
         weight,
         ...expectedOutcome.length > 0 ? { outcome: expectedOutcome } : {},
         ...required !== void 0 ? { required } : {},
+        ...minScore !== void 0 ? { min_score: minScore } : {},
         ...requiredMinScore !== void 0 ? { required_min_score: requiredMinScore } : {},
         score_ranges: scoreRanges
       });
@@ -1970,6 +2127,7 @@ function parseRubricItems(rawRubrics, evaluatorName, evalId) {
         weight,
         // Default to required: true if not specified (backward compatibility)
         required: required ?? true,
+        ...minScore !== void 0 ? { min_score: minScore } : {},
         ...requiredMinScore !== void 0 ? { required_min_score: requiredMinScore } : {}
       });
     }
@@ -2098,12 +2256,22 @@ function parseInlineRubrics(rawRubrics) {
       id: asString(rubric.id) ?? `rubric-${index + 1}`,
       weight: typeof rubric.weight === "number" ? rubric.weight : 1
     };
+    let inlineMinScore;
+    let inlineRequiredMinScore;
+    if (typeof rubric.min_score === "number") {
+      inlineMinScore = rubric.min_score;
+      inlineRequiredMinScore = Math.round(inlineMinScore * 10);
+    } else if (typeof rubric.required_min_score === "number") {
+      inlineRequiredMinScore = rubric.required_min_score;
+      inlineMinScore = inlineRequiredMinScore / 10;
+    }
     if (scoreRanges && scoreRanges.length > 0) {
       return {
         ...baseRubric,
         ...expectedOutcome.length > 0 ? { outcome: expectedOutcome } : {},
         ...typeof rubric.required === "boolean" ? { required: rubric.required } : {},
-        ...typeof rubric.required_min_score === "number" ? { required_min_score: rubric.required_min_score } : {},
+        ...inlineMinScore !== void 0 ? { min_score: inlineMinScore } : {},
+        ...inlineRequiredMinScore !== void 0 ? { required_min_score: inlineRequiredMinScore } : {},
         score_ranges: scoreRanges
       };
     }
@@ -2111,7 +2279,8 @@ function parseInlineRubrics(rawRubrics) {
       ...baseRubric,
       outcome: expectedOutcome,
       required: typeof rubric.required === "boolean" ? rubric.required : true,
-      ...typeof rubric.required_min_score === "number" ? { required_min_score: rubric.required_min_score } : {}
+      ...inlineMinScore !== void 0 ? { min_score: inlineMinScore } : {},
+      ...inlineRequiredMinScore !== void 0 ? { required_min_score: inlineRequiredMinScore } : {}
     };
   }).filter((r) => r.outcome && r.outcome.length > 0 || "score_ranges" in r && r.score_ranges);
   if (rubricItems.length === 0) {
@@ -2511,6 +2680,9 @@ function resolveExpectedMessages(raw) {
 var ANSI_YELLOW5 = "\x1B[33m";
 var ANSI_RED2 = "\x1B[31m";
 var ANSI_RESET6 = "\x1B[0m";
+function matchesFilter(id, filter) {
+  return typeof filter === "string" ? micromatch.isMatch(id, filter) : filter.some((pattern) => micromatch.isMatch(id, pattern));
+}
 function detectFormat(filePath) {
   const ext = path6.extname(filePath).toLowerCase();
   if (ext === ".jsonl") return "jsonl";
@@ -2578,40 +2750,40 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
   const sidecar = await loadSidecarMetadata(absoluteTestPath, verbose);
   const rawFile = await readFile5(absoluteTestPath, "utf8");
   const rawCases = parseJsonlContent(rawFile, evalFilePath);
-  const fallbackEvalSet = path6.basename(absoluteTestPath, ".jsonl") || "eval";
-  const evalSetName = sidecar.name && sidecar.name.trim().length > 0 ? sidecar.name : fallbackEvalSet;
+  const fallbackSuiteName = path6.basename(absoluteTestPath, ".jsonl") || "eval";
+  const suiteName = sidecar.name && sidecar.name.trim().length > 0 ? sidecar.name : fallbackSuiteName;
   const globalEvaluator = coerceEvaluator(sidecar.evaluator, "sidecar") ?? "llm-grader";
   const globalExecution = sidecar.execution;
   if (verbose) {
     console.log(`
-[JSONL Dataset: ${evalFilePath}]`);
+[JSONL Suite: ${evalFilePath}]`);
     console.log(`  Cases: ${rawCases.length}`);
-    console.log(`  Eval set: ${evalSetName}`);
+    console.log(`  Suite: ${suiteName}`);
     if (sidecar.description) {
       console.log(`  Description: ${sidecar.description}`);
     }
   }
   const results = [];
   for (let lineIndex = 0; lineIndex < rawCases.length; lineIndex++) {
-    const evalcase = rawCases[lineIndex];
+    const testCaseConfig = rawCases[lineIndex];
     const lineNumber = lineIndex + 1;
-    const id = asString4(evalcase.id);
-    if (filterPattern && (!id || !micromatch.isMatch(id, filterPattern))) {
+    const id = asString4(testCaseConfig.id);
+    if (filterPattern && (!id || !matchesFilter(id, filterPattern))) {
       continue;
     }
-    const conversationId = asString4(evalcase.conversation_id);
-    let outcome = asString4(evalcase.criteria);
-    if (!outcome && evalcase.expected_outcome !== void 0) {
-      outcome = asString4(evalcase.expected_outcome);
+    const conversationId = asString4(testCaseConfig.conversation_id);
+    let outcome = asString4(testCaseConfig.criteria);
+    if (!outcome && testCaseConfig.expected_outcome !== void 0) {
+      outcome = asString4(testCaseConfig.expected_outcome);
       if (outcome) {
         logWarning4(
-          `Test '${asString4(evalcase.id) ?? "unknown"}': 'expected_outcome' is deprecated. Use 'criteria' instead.`
+          `Test '${asString4(testCaseConfig.id) ?? "unknown"}': 'expected_outcome' is deprecated. Use 'criteria' instead.`
         );
       }
     }
-    const rawInputMessages = resolveInputMessages(evalcase);
-    const expectedMessages = resolveExpectedMessages(evalcase) ?? [];
-    const hasEvaluationSpec = !!outcome || expectedMessages.length > 0 || evalcase.assert !== void 0;
+    const rawInputMessages = resolveInputMessages(testCaseConfig);
+    const expectedMessages = resolveExpectedMessages(testCaseConfig) ?? [];
+    const hasEvaluationSpec = !!outcome || expectedMessages.length > 0 || testCaseConfig.assert !== void 0;
     if (!id || !hasEvaluationSpec || !rawInputMessages || rawInputMessages.length === 0) {
       logError2(
         `Skipping incomplete test at line ${lineNumber}: ${id ?? "unknown"}. Missing required fields: id, input, and at least one of criteria/expected_output/assert`
@@ -2648,18 +2820,23 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
       }
     }
     const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
-    const caseExecution = isJsonObject(evalcase.execution) ? evalcase.execution : void 0;
+    const caseExecution = isJsonObject(testCaseConfig.execution) ? testCaseConfig.execution : void 0;
     const mergedExecution = caseExecution ?? globalExecution;
-    const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
+    const testCaseEvaluatorKind = coerceEvaluator(testCaseConfig.evaluator, id) ?? globalEvaluator;
     let evaluators;
     try {
-      evaluators = await parseEvaluators(evalcase, mergedExecution, searchRoots, id ?? "unknown");
+      evaluators = await parseEvaluators(
+        testCaseConfig,
+        mergedExecution,
+        searchRoots,
+        id ?? "unknown"
+      );
     } catch (error) {
       const message = error instanceof Error ? error.message : String(error);
       logError2(`Skipping test '${id}' at line ${lineNumber}: ${message}`);
       continue;
     }
-    const inlineRubrics = evalcase.rubrics;
+    const inlineRubrics = testCaseConfig.rubrics;
     if (inlineRubrics !== void 0 && Array.isArray(inlineRubrics)) {
       const rubricEvaluator = parseInlineRubrics(inlineRubrics);
       if (rubricEvaluator) {
@@ -2670,7 +2847,7 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
     const userFilePaths = collectResolvedInputFilePaths(inputMessages);
     const testCase = {
       id,
-      dataset: evalSetName,
+      suite: suiteName,
       conversation_id: conversationId,
       question,
       input: inputMessages,
@@ -2678,7 +2855,7 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
       reference_answer: referenceAnswer,
       file_paths: userFilePaths,
       criteria: outcome ?? "",
-      evaluator: evalCaseEvaluatorKind,
+      evaluator: testCaseEvaluatorKind,
       assertions: evaluators
     };
     results.push(testCase);
@@ -2861,6 +3038,9 @@ function buildChatPromptFromSegments(options) {
 var ANSI_YELLOW6 = "\x1B[33m";
 var ANSI_RED3 = "\x1B[31m";
 var ANSI_RESET7 = "\x1B[0m";
+// Reports whether `id` matches `filter`.
+// A string `filter` is tested directly with micromatch2.isMatch; any other
+// value is treated as a list of patterns (it must expose `.some`) and the id
+// matches when at least one pattern matches.
+function matchesFilter2(id, filter) {
+  return typeof filter === "string" ? micromatch2.isMatch(id, filter) : filter.some((pattern) => micromatch2.isMatch(id, pattern));
+}
 function resolveTests(suite) {
   if (suite.tests !== void 0) return suite.tests;
   if (suite.eval_cases !== void 0) {
@@ -2940,18 +3120,18 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
     throw new Error(`Invalid test file format: ${evalFilePath}`);
   }
   const suite = interpolated;
-  const evalSetNameFromSuite = asString5(suite.name)?.trim();
-  const fallbackEvalSet = path7.basename(absoluteTestPath).replace(/\.eval\.ya?ml$/i, "").replace(/\.ya?ml$/i, "") || "eval";
-  const evalSetName = evalSetNameFromSuite && evalSetNameFromSuite.length > 0 ? evalSetNameFromSuite : fallbackEvalSet;
-  const rawTestcases = resolveTests(suite);
+  const suiteNameFromFile = asString5(suite.name)?.trim();
+  const fallbackSuiteName = path7.basename(absoluteTestPath).replace(/\.eval\.ya?ml$/i, "").replace(/\.ya?ml$/i, "") || "eval";
+  const suiteName = suiteNameFromFile && suiteNameFromFile.length > 0 ? suiteNameFromFile : fallbackSuiteName;
+  const rawTestCases = resolveTests(suite);
   const globalEvaluator = coerceEvaluator(suite.evaluator, "global") ?? "llm-grader";
   const evalFileDir = path7.dirname(absoluteTestPath);
-  let expandedTestcases;
-  if (typeof rawTestcases === "string") {
-    const externalPath = path7.resolve(evalFileDir, rawTestcases);
-    expandedTestcases = await loadCasesFromFile(externalPath);
-  } else if (Array.isArray(rawTestcases)) {
-    expandedTestcases = await expandFileReferences(rawTestcases, evalFileDir);
+  let expandedTestCases;
+  if (typeof rawTestCases === "string") {
+    const externalPath = path7.resolve(evalFileDir, rawTestCases);
+    expandedTestCases = await loadCasesFromFile(externalPath);
+  } else if (Array.isArray(rawTestCases)) {
+    expandedTestCases = await expandFileReferences(rawTestCases, evalFileDir);
   } else {
     throw new Error(`Invalid test file format: ${evalFilePath} - missing 'tests' field`);
   }
@@ -2966,32 +3146,33 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
   }
   const globalExecution = suiteAssertions !== void 0 ? { ...rawGlobalExecution ?? {}, assertions: suiteAssertions } : rawGlobalExecution;
   const results = [];
-  for (const rawEvalcase of expandedTestcases) {
-    if (!isJsonObject(rawEvalcase)) {
+  for (const rawTestCase of expandedTestCases) {
+    if (!isJsonObject(rawTestCase)) {
       logWarning5("Skipping invalid test entry (expected object)");
       continue;
     }
-    const evalcase = rawEvalcase;
-    const id = asString5(evalcase.id);
-    if (filterPattern && (!id || !micromatch2.isMatch(id, filterPattern))) {
+    const testCaseConfig = rawTestCase;
+    const id = asString5(testCaseConfig.id);
+    if (filterPattern && (!id || !matchesFilter2(id, filterPattern))) {
       continue;
     }
-    const conversationId = asString5(evalcase.conversation_id);
-    let outcome = asString5(evalcase.criteria);
-    if (!outcome && evalcase.expected_outcome !== void 0) {
-      outcome = asString5(evalcase.expected_outcome);
+    const conversationId = asString5(testCaseConfig.conversation_id);
+    let outcome = asString5(testCaseConfig.criteria);
+    if (!outcome && testCaseConfig.expected_outcome !== void 0) {
+      outcome = asString5(testCaseConfig.expected_outcome);
       if (outcome) {
         logWarning5(
-          `Test '${asString5(evalcase.id) ?? "unknown"}': 'expected_outcome' is deprecated. Use 'criteria' instead.`
+          `Test '${asString5(testCaseConfig.id) ?? "unknown"}': 'expected_outcome' is deprecated. Use 'criteria' instead.`
         );
       }
     }
-    const caseExecution = isJsonObject(evalcase.execution) ? evalcase.execution : void 0;
+    const caseExecution = isJsonObject(testCaseConfig.execution) ? testCaseConfig.execution : void 0;
     const skipDefaults = caseExecution?.skip_defaults === true;
+    const caseThreshold = typeof caseExecution?.threshold === "number" && caseExecution.threshold >= 0 && caseExecution.threshold <= 1 ? caseExecution.threshold : void 0;
     const effectiveSuiteInputFiles = suiteInputFiles && !skipDefaults ? suiteInputFiles : void 0;
-    const testInputMessages = resolveInputMessages(evalcase, effectiveSuiteInputFiles);
-    const expectedMessages = resolveExpectedMessages(evalcase) ?? [];
-    const hasEvaluationSpec = !!outcome || expectedMessages.length > 0 || evalcase.assertions !== void 0 || evalcase.assert !== void 0;
+    const testInputMessages = resolveInputMessages(testCaseConfig, effectiveSuiteInputFiles);
+    const expectedMessages = resolveExpectedMessages(testCaseConfig) ?? [];
+    const hasEvaluationSpec = !!outcome || expectedMessages.length > 0 || testCaseConfig.assertions !== void 0 || testCaseConfig.assert !== void 0;
     if (!id || !hasEvaluationSpec || !testInputMessages || testInputMessages.length === 0) {
       logError3(
         `Skipping incomplete test: ${id ?? "unknown"}. Missing required fields: id, input, and at least one of criteria/expected_output/assertions`
@@ -3038,16 +3219,21 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
       }
     }
     const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
-    const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
+    const testCaseEvaluatorKind = coerceEvaluator(testCaseConfig.evaluator, id) ?? globalEvaluator;
     let evaluators;
     try {
-      evaluators = await parseEvaluators(evalcase, globalExecution, searchRoots, id ?? "unknown");
+      evaluators = await parseEvaluators(
+        testCaseConfig,
+        globalExecution,
+        searchRoots,
+        id ?? "unknown"
+      );
     } catch (error) {
       const message = error instanceof Error ? error.message : String(error);
       logError3(`Skipping test '${id}': ${message}`);
       continue;
     }
-    const inlineRubrics = evalcase.rubrics;
+    const inlineRubrics = testCaseConfig.rubrics;
     if (inlineRubrics !== void 0 && Array.isArray(inlineRubrics)) {
       const rubricEvaluator = parseInlineRubrics(inlineRubrics);
       if (rubricEvaluator) {
@@ -3056,13 +3242,13 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
     }
     warnUnconsumedCriteria(outcome, evaluators, id ?? "unknown");
     const userFilePaths = collectResolvedInputFilePaths(inputMessages);
-    const caseWorkspace = await resolveWorkspaceConfig(evalcase.workspace, evalFileDir);
+    const caseWorkspace = await resolveWorkspaceConfig(testCaseConfig.workspace, evalFileDir);
     const mergedWorkspace = mergeWorkspaceConfigs(suiteWorkspace, caseWorkspace);
-    const metadata = isJsonObject(evalcase.metadata) ? evalcase.metadata : void 0;
-    const caseTargets = extractTargetsFromTestCase(evalcase);
+    const metadata = isJsonObject(testCaseConfig.metadata) ? testCaseConfig.metadata : void 0;
+    const caseTargets = extractTargetsFromTestCase(testCaseConfig);
     const testCase = {
       id,
-      dataset: evalSetName,
+      suite: suiteName,
       category: options?.category,
       conversation_id: conversationId,
       question,
@@ -3071,11 +3257,12 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
       reference_answer: referenceAnswer,
       file_paths: userFilePaths,
       criteria: outcome ?? "",
-      evaluator: evalCaseEvaluatorKind,
+      evaluator: testCaseEvaluatorKind,
       assertions: evaluators,
       workspace: mergedWorkspace,
       metadata,
-      targets: caseTargets
+      targets: caseTargets,
+      ...caseThreshold !== void 0 ? { threshold: caseThreshold } : {}
     };
     results.push(testCase);
   }
@@ -3619,7 +3806,7 @@ var AzureProvider = class {
     };
     this.retryConfig = config.retry;
     const azure = createAzure(buildAzureOptions(config));
-    this.model = azure.chat(config.deploymentName);
+    this.model = config.apiFormat === "responses" ? azure(config.deploymentName) : azure.chat(config.deploymentName);
   }
   id;
   kind = "azure";
@@ -3745,7 +3932,9 @@ function buildAzureOptions(config) {
   const options = {
     apiKey: config.apiKey,
     apiVersion: config.version,
-    useDeploymentBasedUrls: true
+    // Chat completions still use deployment-scoped Azure URLs for compatibility
+    // with existing deployments. Responses API should use the SDK's v1 path.
+    useDeploymentBasedUrls: config.apiFormat !== "responses"
   };
   const baseURL = normalizeAzureBaseUrl(config.resourceName);
   if (baseURL) {
@@ -5216,15 +5405,16 @@ var CliProvider = class {
       outputFilePath
     );
     const renderedCommand = renderTemplate(this.config.command, templateValues);
+    const effectiveCwd = requests[0]?.cwd ?? this.config.cwd;
     if (this.verbose) {
       console.log(
-        `[cli-provider:${this.targetName}] (batch size=${requests.length}) cwd=${this.config.cwd ?? ""} command=${renderedCommand}`
+        `[cli-provider:${this.targetName}] (batch size=${requests.length}) cwd=${effectiveCwd ?? ""} command=${renderedCommand}`
       );
     }
     try {
       const startTime = Date.now();
       const result = await this.runCommand(renderedCommand, {
-        cwd: this.config.cwd,
+        cwd: effectiveCwd,
         env: process.env,
         timeoutMs: this.config.timeoutMs,
         signal: controller.signal
@@ -5257,7 +5447,7 @@ var CliProvider = class {
               command: renderedCommand,
               stderr: result.stderr,
               exitCode: result.exitCode ?? 0,
-              cwd: this.config.cwd,
+              cwd: effectiveCwd,
               outputFile: outputFilePath
             }
           };
@@ -5275,7 +5465,7 @@ var CliProvider = class {
               command: renderedCommand,
               stderr: result.stderr,
               exitCode: result.exitCode ?? 0,
-              cwd: this.config.cwd,
+              cwd: effectiveCwd,
               outputFile: outputFilePath,
               error: errorMessage
             }
@@ -5290,7 +5480,7 @@ var CliProvider = class {
             command: renderedCommand,
             stderr: result.stderr,
             exitCode: result.exitCode ?? 0,
-            cwd: this.config.cwd,
+            cwd: effectiveCwd,
             outputFile: outputFilePath,
             recordId: evalCaseId
           }
@@ -7240,9 +7430,9 @@ var MockProvider = class {
 };
 // src/evaluation/providers/pi-cli.ts
-import { spawn as spawn3 } from "node:child_process";
+import { execSync, spawn as spawn3 } from "node:child_process";
 import { randomUUID as randomUUID7 } from "node:crypto";
-import { createWriteStream as createWriteStream5 } from "node:fs";
+import { accessSync, createWriteStream as createWriteStream5, readFileSync as readFileSync2 } from "node:fs";
 import { mkdir as mkdir6, mkdtemp, rm, writeFile } from "node:fs/promises";
 import { tmpdir } from "node:os";
 import path19 from "node:path";
@@ -7300,6 +7490,75 @@ function subscribeToPiLogEntries(listener) {
   };
 }
+// src/evaluation/providers/pi-provider-aliases.ts
+// Lower-cased subprovider name -> provider id returned by resolveSubprovider
+// when no base-URL-specific alias applies.
+var SUBPROVIDER_ALIASES = {
+  azure: "azure-openai-responses"
+};
+// Aliases that resolveSubprovider consults first when it is called with
+// hasBaseUrl = true.
+var SUBPROVIDER_ALIASES_WITH_BASE_URL = {
+  // Azure v1 endpoints are OpenAI-compatible; use the standard client
+  // to avoid AzureOpenAI adding api-version query params.
+  azure: "openai-responses"
+};
+// Lower-cased provider name -> name of the environment variable that carries
+// the API key (read by resolveEnvKeyName).
+var ENV_KEY_MAP = {
+  google: "GEMINI_API_KEY",
+  gemini: "GEMINI_API_KEY",
+  anthropic: "ANTHROPIC_API_KEY",
+  openai: "OPENAI_API_KEY",
+  groq: "GROQ_API_KEY",
+  xai: "XAI_API_KEY",
+  openrouter: "OPENROUTER_API_KEY",
+  azure: "AZURE_OPENAI_API_KEY"
+};
+// Lower-cased provider name -> name of the environment variable that carries
+// the base URL (read by resolveEnvBaseUrlName). Providers not listed here have
+// no base-URL variable.
+var ENV_BASE_URL_MAP = {
+  openai: "OPENAI_BASE_URL",
+  azure: "AZURE_OPENAI_BASE_URL",
+  openrouter: "OPENROUTER_BASE_URL"
+};
+// Maps a configured subprovider name to the provider id to use.
+//   name       - subprovider name as configured; compared case-insensitively.
+//   hasBaseUrl - when true, SUBPROVIDER_ALIASES_WITH_BASE_URL takes precedence.
+// Returns the alias if one exists, otherwise `name` exactly as passed in
+// (original casing, not the lower-cased form).
+// NOTE(review): the tables are plain objects, so an inherited key such as
+// "constructor" would resolve to a non-string value — confirm names are
+// validated before reaching this function.
+function resolveSubprovider(name, hasBaseUrl = false) {
+  const lower = name.toLowerCase();
+  if (hasBaseUrl) {
+    const alias = SUBPROVIDER_ALIASES_WITH_BASE_URL[lower];
+    if (alias) return alias;
+  }
+  return SUBPROVIDER_ALIASES[lower] ?? name;
+}
+// Maps a subprovider name to the value passed to the Pi CLI `--provider` flag.
+// "azure" (any casing) becomes "azure-openai-responses"; every other name is
+// returned unchanged, including its original casing.
+// NOTE(review): PiCliProvider.buildEnv uses this result as a key into a table
+// with lower-case keys, so a mixed-case name (e.g. "OpenAI") would miss —
+// confirm whether the result should be lower-cased.
+function resolveCliProvider(name) {
+  const lower = name.toLowerCase();
+  if (lower === "azure") return "azure-openai-responses";
+  return name;
+}
+// Returns the name of the environment variable that should hold the API key
+// for `provider` (case-insensitive), or undefined when ENV_KEY_MAP has no entry.
+// Azure with a base URL is special-cased to OPENAI_API_KEY, mirroring the
+// "openai-responses" alias chosen by resolveSubprovider in that situation.
+function resolveEnvKeyName(provider, hasBaseUrl = false) {
+  const lower = provider.toLowerCase();
+  if (hasBaseUrl && lower === "azure") return "OPENAI_API_KEY";
+  return ENV_KEY_MAP[lower];
+}
+// Returns the name of the environment variable that should hold the base URL
+// for `provider` (case-insensitive), or undefined when ENV_BASE_URL_MAP has no
+// entry. Azure with a base URL is special-cased to OPENAI_BASE_URL.
+function resolveEnvBaseUrlName(provider, hasBaseUrl = false) {
+  const lower = provider.toLowerCase();
+  if (hasBaseUrl && lower === "azure") return "OPENAI_BASE_URL";
+  return ENV_BASE_URL_MAP[lower];
+}
+// Derives a resource name from `baseUrl`.
+// For an http(s) URL, returns the characters after the scheme up to the first
+// "." or "/" (i.e. the first host label). Input without an http(s) scheme is
+// returned unchanged, so a bare resource name passes straight through.
+function extractAzureResourceName(baseUrl) {
+  const urlMatch = baseUrl.match(/^https?:\/\/([^./]+)/);
+  if (urlMatch) return urlMatch[1];
+  return baseUrl;
+}
+// Normalizes an Azure base URL (or bare resource name) so that it ends in
+// "/openai/v1".
+//   - Surrounding whitespace and trailing slashes are removed first.
+//   - An empty result is returned as-is (empty string).
+//   - A value without an http(s) scheme is treated as a resource name and
+//     expanded to https://<name>.openai.azure.com/openai/v1.
+//   - A URL already ending in /openai/v1 is returned unchanged; one ending in
+//     /openai gets "/v1" appended; anything else gets "/openai/v1" appended.
+function normalizeAzureSdkBaseUrl(baseUrl) {
+  const trimmed = baseUrl.trim().replace(/\/+$/, "");
+  if (!trimmed) {
+    return trimmed;
+  }
+  if (!/^https?:\/\//i.test(trimmed)) {
+    return `https://${trimmed}.openai.azure.com/openai/v1`;
+  }
+  if (/\/openai\/v1$/i.test(trimmed)) {
+    return trimmed;
+  }
+  if (/\/openai$/i.test(trimmed)) {
+    return `${trimmed}/v1`;
+  }
+  return `${trimmed}/openai/v1`;
+}
 // src/evaluation/providers/pi-utils.ts
 function extractPiTextContent(content) {
   if (typeof content === "string") {
@@ -7458,12 +7717,12 @@ var PiCliProvider = class {
   buildPiArgs(prompt, inputFiles) {
     const args = [];
     if (this.config.subprovider) {
-      args.push("--provider", this.config.subprovider);
+      args.push("--provider", resolveCliProvider(this.config.subprovider));
     }
     if (this.config.model) {
       args.push("--model", this.config.model);
     }
-    if (this.config.apiKey) {
+    if (this.config.apiKey && this.config.subprovider?.toLowerCase() !== "azure") {
       args.push("--api-key", this.config.apiKey);
     }
     args.push("--mode", "json");
@@ -7515,35 +7774,35 @@ ${prompt}` : prompt;
   }
   buildEnv() {
     const env = { ...process.env };
-    if (this.config.apiKey) {
-      const provider = this.config.subprovider?.toLowerCase() ?? "google";
-      const ENV_KEY_MAP = {
-        google: "GEMINI_API_KEY",
-        gemini: "GEMINI_API_KEY",
-        anthropic: "ANTHROPIC_API_KEY",
-        openai: "OPENAI_API_KEY",
-        groq: "GROQ_API_KEY",
-        xai: "XAI_API_KEY",
-        openrouter: "OPENROUTER_API_KEY"
-      };
-      const envKey = ENV_KEY_MAP[provider];
-      if (envKey) {
-        env[envKey] = this.config.apiKey;
+    const provider = this.config.subprovider?.toLowerCase() ?? "google";
+    if (provider === "azure") {
+      if (this.config.apiKey) {
+        env.AZURE_OPENAI_API_KEY = this.config.apiKey;
+      }
+      if (this.config.baseUrl) {
+        env.AZURE_OPENAI_RESOURCE_NAME = extractAzureResourceName(this.config.baseUrl);
+      }
+    } else {
+      if (this.config.apiKey) {
+        const envKey = resolveEnvKeyName(provider);
+        if (envKey) {
+          env[envKey] = this.config.apiKey;
+        }
       }
     }
     if (this.config.subprovider) {
-      const provider = this.config.subprovider.toLowerCase();
+      const resolvedProvider = resolveCliProvider(this.config.subprovider);
       const PROVIDER_OWN_PREFIXES = {
         openrouter: ["OPENROUTER_"],
         anthropic: ["ANTHROPIC_"],
         openai: ["OPENAI_"],
-        azure: ["AZURE_OPENAI_"],
+        "azure-openai-responses": ["AZURE_OPENAI_"],
         google: ["GEMINI_", "GOOGLE_GENERATIVE_AI_"],
         gemini: ["GEMINI_", "GOOGLE_GENERATIVE_AI_"],
         groq: ["GROQ_"],
         xai: ["XAI_"]
       };
-      const ownPrefixes = PROVIDER_OWN_PREFIXES[provider] ?? [];
+      const ownPrefixes = PROVIDER_OWN_PREFIXES[resolvedProvider] ?? [];
       const allOtherPrefixes = Object.entries(PROVIDER_OWN_PREFIXES).filter(([key]) => key !== provider).flatMap(([, prefixes]) => prefixes);
       for (const key of Object.keys(env)) {
         if (allOtherPrefixes.some((prefix) => key.startsWith(prefix)) && !ownPrefixes.some((prefix) => key.startsWith(prefix))) {
@@ -7834,6 +8093,24 @@ function extractMessages(events) {
       }
     }
   }
+  if (messages) {
+    for (let i = messages.length - 1; i >= 0; i--) {
+      if (messages[i].role === "assistant" && !messages[i].content) {
+        for (let j = events.length - 1; j >= 0; j--) {
+          const evt = events[j];
+          if (!evt || evt.type !== "message_end") continue;
+          const msg = evt.message;
+          if (msg?.role !== "assistant") continue;
+          const text = extractPiTextContent(msg.content);
+          if (text) {
+            messages[i] = { ...messages[i], content: text };
+            break;
+          }
+        }
+        break;
+      }
+    }
+  }
   const eventToolCalls = extractToolCallsFromEvents(events);
   if (eventToolCalls.length > 0) {
     injectEventToolCalls(messages, eventToolCalls);
@@ -8018,17 +8295,43 @@ function formatTimeoutSuffix3(timeoutMs) {
   if (!timeoutMs || timeoutMs <= 0) return "";
   return ` after ${Math.ceil(timeoutMs / 1e3)}s`;
 }
+// Resolves how to launch `executable` on Windows without going through a shell.
+// Returns a pair [command, prefixArgs]:
+//   - On non-Windows platforms, or when the name ends in .js/.exe, the input is
+//     returned unchanged with no prefix args.
+//   - Otherwise the executable is located with `where`, its .cmd file is read,
+//     and if that file references a .js script that exists on disk the result
+//     is ["node", [scriptPath]].
+//   - Any failure along the way falls back to [executable, []].
+// NOTE(review): `executable` is interpolated into a shell command line below —
+// confirm it only ever comes from trusted configuration.
+function resolveWindowsCmd(executable) {
+  if (process.platform !== "win32") return [executable, []];
+  const lower = executable.toLowerCase();
+  if (lower.endsWith(".js") || lower.endsWith(".exe")) return [executable, []];
+  let fullPath;
+  try {
+    // `where` may print several matches; only the first line is used.
+    fullPath = execSync(`where ${executable}`, { encoding: "utf-8" }).trim().split(/\r?\n/)[0].trim();
+  } catch {
+    return [executable, []];
+  }
+  // NOTE(review): this suffix check is case-sensitive; a path reported as
+  // ".CMD" would get a second ".cmd" appended — confirm whether that matters.
+  const cmdPath = fullPath.endsWith(".cmd") ? fullPath : `${fullPath}.cmd`;
+  try {
+    const content = readFileSync2(cmdPath, "utf-8");
+    // Looks for `%_prog%` (optionally quoted) followed by a quoted path ending
+    // in .js — presumably the layout of npm-generated .cmd shims; confirm.
+    const match = content.match(/"?%_prog%"?\s+"([^"]+\.js)"/);
+    if (match) {
+      // %dp0% in the shim is replaced with the directory containing the .cmd.
+      const dp0 = path19.dirname(path19.resolve(cmdPath));
+      const scriptPath = match[1].replace(/%dp0%[/\\]?/gi, `${dp0}${path19.sep}`);
+      try {
+        accessSync(scriptPath);
+        return ["node", [scriptPath]];
+      } catch {
+        // Script not accessible: fall through to the unchanged executable.
+      }
+    }
+  } catch {
+    // .cmd file could not be read: fall through to the unchanged executable.
+  }
+  return [executable, []];
+}
 async function defaultPiRunner(options) {
   return await new Promise((resolve, reject) => {
     const parts = options.executable.split(/\s+/);
-    const executable = parts[0];
-    const executableArgs = parts.slice(1);
+    const [resolvedExe, prefixArgs] = resolveWindowsCmd(parts[0]);
+    const executableArgs = [...prefixArgs, ...parts.slice(1)];
     const allArgs = [...executableArgs, ...options.args];
-    const child = spawn3(executable, allArgs, {
+    const child = spawn3(resolvedExe, allArgs, {
       cwd: options.cwd,
       env: options.env,
-      stdio: ["pipe", "pipe", "pipe"],
-      shell: false
+      stdio: ["pipe", "pipe", "pipe"]
     });
     let stdout = "";
     let stderr = "";
@@ -8083,13 +8386,43 @@ async function defaultPiRunner(options) {
 }
 // src/evaluation/providers/pi-coding-agent.ts
-import { execSync } from "node:child_process";
+import { execSync as execSync2 } from "node:child_process";
 import { randomUUID as randomUUID8 } from "node:crypto";
-import { accessSync, createWriteStream as createWriteStream6 } from "node:fs";
+import { accessSync as accessSync2, createWriteStream as createWriteStream6, mkdirSync } from "node:fs";
 import { mkdir as mkdir7 } from "node:fs/promises";
-import path20 from "node:path";
+import path21 from "node:path";
 import { createInterface } from "node:readline";
-import { fileURLToPath as fileURLToPath3 } from "node:url";
+import { fileURLToPath as fileURLToPath3, pathToFileURL } from "node:url";
+// src/paths.ts
+import os2 from "node:os";
+import path20 from "node:path";
+// Set once the AGENTV_HOME override has been announced, so the warning below
+// is printed at most once per process.
+var logged = false;
+// Returns the AgentV home directory.
+// Uses the AGENTV_HOME environment variable when it is set to a non-empty
+// value other than the literal string "undefined"; otherwise falls back to
+// ".agentv" under the current user's home directory.
+function getAgentvHome() {
+  const envHome = process.env.AGENTV_HOME;
+  if (envHome && envHome !== "undefined") {
+    if (!logged) {
+      logged = true;
+      console.warn(`Using AGENTV_HOME: ${envHome}`);
+    }
+    return envHome;
+  }
+  return path20.join(os2.homedir(), ".agentv");
+}
+// Returns the "workspaces" directory under the AgentV home directory.
+function getWorkspacesRoot() {
+  return path20.join(getAgentvHome(), "workspaces");
+}
+// Returns the "subagents" directory under the AgentV home directory.
+function getSubagentsRoot() {
+  return path20.join(getAgentvHome(), "subagents");
+}
+// Returns the "trace-state" directory under the AgentV home directory.
+function getTraceStateRoot() {
+  return path20.join(getAgentvHome(), "trace-state");
+}
+// Returns the "workspace-pool" directory under the AgentV home directory.
+function getWorkspacePoolRoot() {
+  return path20.join(getAgentvHome(), "workspace-pool");
+}
+// src/evaluation/providers/pi-coding-agent.ts
 var piCodingAgentModule = null;
 var piAiModule = null;
 var loadingPromise = null;
@@ -8107,46 +8440,126 @@ async function promptInstall() {
     rl.close();
   }
 }
-function findAgentvRoot() {
-  const thisFile = fileURLToPath3(import.meta.url);
-  let dir = path20.dirname(thisFile);
-  for (let i = 0; i < 10; i++) {
+// Returns the directory used for the managed Pi SDK install:
+// <AgentV home>/deps/pi-sdk. The directory is not created here.
+function findManagedSdkInstallRoot() {
+  return path21.join(getAgentvHome(), "deps", "pi-sdk");
+}
+// Runs `npm root -g` and returns its trimmed output, i.e. the global
+// node_modules directory. Returns undefined when the command fails or prints
+// nothing. stderr is discarded so a missing npm produces no console noise.
+function resolveGlobalNpmRoot() {
+  try {
+    const root = execSync2("npm root -g", {
+      encoding: "utf-8",
+      stdio: ["ignore", "pipe", "ignore"]
+    }).trim();
+    return root.length > 0 ? root : void 0;
+  } catch {
+    return void 0;
+  }
+}
+// Builds the path <globalNpmRoot>/<moduleName>/dist/index.js. The module name
+// is split on "/" so a scoped name ("@scope/pkg") becomes nested path segments.
+function buildGlobalModuleEntry(moduleName, globalNpmRoot) {
+  return path21.join(globalNpmRoot, ...moduleName.split("/"), "dist", "index.js");
+}
+function findAccessiblePath(paths) {
+  for (const candidate of paths) {
     try {
-      const pkg = path20.join(dir, "package.json");
-      accessSync(pkg);
-      return dir;
+      accessSync2(candidate);
+      return candidate;
     } catch {
-      const parent = path20.dirname(dir);
-      if (parent === dir) break;
-      dir = parent;
     }
   }
-  return path20.dirname(thisFile);
+  return void 0;
 }
-async function doLoadSdkModules() {
+async function tryImportLocalSdkModules() {
   try {
     [piCodingAgentModule, piAiModule] = await Promise.all([
       import("@mariozechner/pi-coding-agent"),
       import("@mariozechner/pi-ai")
     ]);
+    return true;
   } catch {
-    if (await promptInstall()) {
-      const installDir = findAgentvRoot();
-      console.error(`Installing @mariozechner/pi-coding-agent into ${installDir}...`);
-      execSync("bun add @mariozechner/pi-coding-agent", {
-        cwd: installDir,
-        stdio: "inherit"
-      });
-      [piCodingAgentModule, piAiModule] = await Promise.all([
-        import("@mariozechner/pi-coding-agent"),
-        import("@mariozechner/pi-ai")
-      ]);
-    } else {
-      throw new Error(
-        "pi-coding-agent SDK is not installed. Install it with:\n  bun add @mariozechner/pi-coding-agent"
-      );
+    return false;
+  }
+}
+// Attempts to load the Pi SDK from the managed install directory.
+// Looks for pi-coding-agent's dist/index.js under <managed root>/node_modules,
+// and for pi-ai either at the top level or nested inside pi-coding-agent's own
+// node_modules. On success the module-level piCodingAgentModule / piAiModule
+// variables are assigned and true is returned. Returns false when either entry
+// file is missing or when the dynamic import fails (the error is discarded).
+async function tryImportManagedSdkModules() {
+  const managedRoot = findManagedSdkInstallRoot();
+  const piCodingAgentEntry = findAccessiblePath([
+    path21.join(managedRoot, "node_modules", "@mariozechner", "pi-coding-agent", "dist", "index.js")
+  ]);
+  const piAiEntry = findAccessiblePath([
+    path21.join(managedRoot, "node_modules", "@mariozechner", "pi-ai", "dist", "index.js"),
+    path21.join(
+      managedRoot,
+      "node_modules",
+      "@mariozechner",
+      "pi-coding-agent",
+      "node_modules",
+      "@mariozechner",
+      "pi-ai",
+      "dist",
+      "index.js"
+    )
+  ]);
+  if (!piCodingAgentEntry || !piAiEntry) return false;
+  try {
+    // Absolute paths are converted to file:// URLs before being imported.
+    [piCodingAgentModule, piAiModule] = await Promise.all([
+      import(pathToFileURL(piCodingAgentEntry).href),
+      import(pathToFileURL(piAiEntry).href)
+    ]);
+    return true;
+  } catch {
+    return false;
+  }
+}
+// Attempts to load the Pi SDK from the global npm install location.
+// Returns false immediately when the global npm root cannot be determined.
+// Otherwise looks for pi-coding-agent's dist/index.js there, and for pi-ai
+// either alongside it or nested inside pi-coding-agent's own node_modules.
+// On success the module-level piCodingAgentModule / piAiModule variables are
+// assigned and true is returned. Returns false when an entry file is missing
+// or when the dynamic import fails (the error is discarded).
+async function tryImportGlobalSdkModules() {
+  const globalNpmRoot = resolveGlobalNpmRoot();
+  if (!globalNpmRoot) return false;
+  const piCodingAgentEntry = findAccessiblePath([
+    buildGlobalModuleEntry("@mariozechner/pi-coding-agent", globalNpmRoot)
+  ]);
+  const piAiEntry = findAccessiblePath([
+    buildGlobalModuleEntry("@mariozechner/pi-ai", globalNpmRoot),
+    path21.join(
+      globalNpmRoot,
+      "@mariozechner",
+      "pi-coding-agent",
+      "node_modules",
+      "@mariozechner",
+      "pi-ai",
+      "dist",
+      "index.js"
+    )
+  ]);
+  if (!piCodingAgentEntry || !piAiEntry) return false;
+  try {
+    // Absolute paths are converted to file:// URLs before being imported.
+    [piCodingAgentModule, piAiModule] = await Promise.all([
+      import(pathToFileURL(piCodingAgentEntry).href),
+      import(pathToFileURL(piAiEntry).href)
+    ]);
+    return true;
+  } catch {
+    return false;
+  }
+}
+// Installs @mariozechner/pi-coding-agent into `installDir` using npm.
+// Announces the install on stderr, creates the directory (and parents) if
+// needed, then runs npm synchronously with its output attached to this
+// process's stdio. --no-save / --no-package-lock keep npm from writing
+// package.json or a lockfile into the directory.
+// Errors from mkdirSync or from the npm command are not caught here.
+function installSdkModules(installDir) {
+  console.error(`Installing @mariozechner/pi-coding-agent into ${installDir} via npm...`);
+  mkdirSync(installDir, { recursive: true });
+  execSync2("npm install --no-save --no-package-lock @mariozechner/pi-coding-agent", {
+    cwd: installDir,
+    stdio: "inherit"
+  });
+}
+// Loads the Pi SDK modules, trying each source in order and stopping at the
+// first that succeeds: a normal package import, the managed install directory,
+// then the global npm root. If none works, the user is prompted; on acceptance
+// the SDK is installed into the managed directory and the managed import is
+// retried. Throws an Error when the SDK still cannot be loaded.
+// NOTE(review): the error text says "not installed" even when the install ran
+// but the follow-up import failed — confirm whether a distinct message is
+// wanted for that case.
+async function doLoadSdkModules() {
+  if (await tryImportLocalSdkModules() || await tryImportManagedSdkModules() || await tryImportGlobalSdkModules()) {
+    return;
+  }
+  if (await promptInstall()) {
+    const installDir = findManagedSdkInstallRoot();
+    installSdkModules(installDir);
+    if (await tryImportManagedSdkModules()) {
+      return;
     }
   }
+  throw new Error(
+    "pi-coding-agent SDK is not installed. Install it with:\n  npm install @mariozechner/pi-coding-agent"
+  );
 }
 async function loadSdkModules() {
   if (!piCodingAgentModule || !piAiModule) {
@@ -8174,7 +8587,9 @@ async function loadSdkModules() {
     codingTools: piSdk.codingTools,
     toolMap,
     SessionManager: piSdk.SessionManager,
-    getModel: piAi.getModel
+    getModel: piAi.getModel,
+    // biome-ignore lint/suspicious/noExplicitAny: registerBuiltInApiProviders exists at runtime but not in type defs
+    registerBuiltInApiProviders: piAi.registerBuiltInApiProviders
   };
 }
 var PiCodingAgentProvider = class {
@@ -8196,17 +8611,35 @@ var PiCodingAgentProvider = class {
     const startTime = (/* @__PURE__ */ new Date()).toISOString();
     const startMs = Date.now();
     const sdk = await loadSdkModules();
+    sdk.registerBuiltInApiProviders();
     const logger = await this.createStreamLogger(request).catch(() => void 0);
     try {
       const cwd = this.resolveCwd(request.cwd);
-      const providerName = this.config.subprovider ?? "google";
+      const rawProvider = this.config.subprovider ?? "google";
+      const normalizedBaseUrl = this.normalizeSdkBaseUrl(rawProvider, this.config.baseUrl);
+      const hasBaseUrl = !!normalizedBaseUrl;
+      const providerName = resolveSubprovider(rawProvider, hasBaseUrl);
       const modelId = this.config.model ?? "gemini-2.5-flash";
-      this.setApiKeyEnv(providerName);
-      const model = sdk.getModel(providerName, modelId);
+      this.setApiKeyEnv(rawProvider, hasBaseUrl);
+      this.setBaseUrlEnv(rawProvider, normalizedBaseUrl, hasBaseUrl);
+      let model = sdk.getModel(providerName, modelId);
+      if (model && normalizedBaseUrl) {
+        model = { ...model, baseUrl: normalizedBaseUrl };
+      }
       if (!model) {
-        throw new Error(
-          `pi-coding-agent: getModel('${providerName}', '${modelId}') returned undefined. The model '${modelId}' is not registered for provider '${providerName}' in pi-ai. Check that subprovider and model are correct in your target config.`
-        );
+        const envProvider = providerName.replace(/-responses$/, "");
+        model = {
+          id: modelId,
+          name: modelId,
+          api: providerName,
+          provider: envProvider,
+          baseUrl: normalizedBaseUrl ?? "",
+          reasoning: false,
+          input: ["text"],
+          cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
+          contextWindow: 128e3,
+          maxTokens: 16384
+        };
       }
       const tools = this.resolveTools(sdk);
       const { session } = await sdk.createAgentSession({
@@ -8359,28 +8792,35 @@ ${fileList}`;
     }
   }
   /** Maps config apiKey to the provider-specific env var the SDK reads. */
-  setApiKeyEnv(providerName) {
+  setApiKeyEnv(providerName, hasBaseUrl = false) {
     if (!this.config.apiKey) return;
-    const ENV_KEY_MAP = {
-      google: "GEMINI_API_KEY",
-      gemini: "GEMINI_API_KEY",
-      anthropic: "ANTHROPIC_API_KEY",
-      openai: "OPENAI_API_KEY",
-      groq: "GROQ_API_KEY",
-      xai: "XAI_API_KEY",
-      openrouter: "OPENROUTER_API_KEY"
-    };
-    const envKey = ENV_KEY_MAP[providerName.toLowerCase()];
+    const envKey = resolveEnvKeyName(providerName, hasBaseUrl);
     if (envKey) {
       process.env[envKey] = this.config.apiKey;
     }
   }
+  /**
+   * Maps config baseUrl to the provider-specific env var the SDK reads.
+   *
+   * The URL is normalized via normalizeSdkBaseUrl; nothing happens when the
+   * result is empty or when the provider has no base-URL variable. Otherwise
+   * the value is written to process.env, so it is visible process-wide.
+   *
+   * @param providerName Raw subprovider name (case-insensitive).
+   * @param baseUrl Base URL to publish; defaults to this.config.baseUrl.
+   * @param hasBaseUrl Forwarded to resolveEnvBaseUrlName to select the variable name.
+   */
+  setBaseUrlEnv(providerName, baseUrl = this.config.baseUrl, hasBaseUrl = false) {
+    const normalizedBaseUrl = this.normalizeSdkBaseUrl(providerName, baseUrl);
+    if (!normalizedBaseUrl) return;
+    const envKey = resolveEnvBaseUrlName(providerName, hasBaseUrl);
+    if (envKey) {
+      process.env[envKey] = normalizedBaseUrl;
+    }
+  }
+  /**
+   * Returns the base URL in the form the SDK should receive.
+   * A missing/empty baseUrl yields undefined. For the "azure" provider
+   * (case-insensitive) the URL is passed through normalizeAzureSdkBaseUrl;
+   * for every other provider it is returned unchanged.
+   */
+  normalizeSdkBaseUrl(providerName, baseUrl) {
+    if (!baseUrl) return void 0;
+    if (providerName.toLowerCase() === "azure") {
+      return normalizeAzureSdkBaseUrl(baseUrl);
+    }
+    return baseUrl;
+  }
   resolveCwd(cwdOverride) {
     if (cwdOverride) {
-      return path20.resolve(cwdOverride);
+      return path21.resolve(cwdOverride);
     }
     if (this.config.cwd) {
-      return path20.resolve(this.config.cwd);
+      return path21.resolve(this.config.cwd);
     }
     return process.cwd();
   }
@@ -8399,9 +8839,9 @@ ${fileList}`;
   }
   resolveLogDirectory() {
     if (this.config.logDir) {
-      return path20.resolve(this.config.logDir);
+      return path21.resolve(this.config.logDir);
     }
-    return path20.join(process.cwd(), ".agentv", "logs", "pi-coding-agent");
+    return path21.join(process.cwd(), ".agentv", "logs", "pi-coding-agent");
   }
   async createStreamLogger(request) {
     const logDir = this.resolveLogDirectory();
@@ -8415,7 +8855,7 @@ ${fileList}`;
       console.warn(`Skipping Pi stream logging (could not create ${logDir}): ${message}`);
       return void 0;
     }
-    const filePath = path20.join(logDir, buildLogFilename6(request, this.targetName));
+    const filePath = path21.join(logDir, buildLogFilename6(request, this.targetName));
     try {
       const logger = await PiStreamLogger2.create({
         filePath,
@@ -8640,7 +9080,7 @@ import path30 from "node:path";
 // src/evaluation/providers/vscode/utils/fs.ts
 import { constants as constants2 } from "node:fs";
 import { access as access2, mkdir as mkdir8, readdir as readdir2, rm as rm2, stat as stat2 } from "node:fs/promises";
-import path21 from "node:path";
+import path22 from "node:path";
 async function pathExists(target) {
   try {
     await access2(target, constants2.F_OK);
@@ -8656,7 +9096,7 @@ async function readDirEntries(target) {
   const entries = await readdir2(target, { withFileTypes: true });
   return entries.map((entry) => ({
     name: entry.name,
-    absolutePath: path21.join(target, entry.name),
+    absolutePath: path22.join(target, entry.name),
     isDirectory: entry.isDirectory()
   }));
 }
@@ -8671,9 +9111,9 @@ async function removeIfExists(target) {
 }
 // src/evaluation/providers/vscode/utils/path.ts
-import path22 from "node:path";
+import path23 from "node:path";
 function pathToFileUri2(filePath) {
-  const absolutePath = path22.isAbsolute(filePath) ? filePath : path22.resolve(filePath);
+  const absolutePath = path23.isAbsolute(filePath) ? filePath : path23.resolve(filePath);
   const normalizedPath = absolutePath.replace(/\\/g, "/");
   if (/^[a-zA-Z]:\//.test(normalizedPath)) {
     return `file:///${normalizedPath}`;
@@ -8682,7 +9122,7 @@ function pathToFileUri2(filePath) {
 }
 // src/evaluation/providers/vscode/dispatch/promptBuilder.ts
-import path23 from "node:path";
+import path24 from "node:path";
 // src/evaluation/providers/vscode/utils/template.ts
 function renderTemplate2(content, variables) {
@@ -8774,8 +9214,8 @@ function createBatchRequestPrompt(userQuery, responseFileTmp, responseFileFinal,
   });
 }
 function createBatchOrchestratorPrompt(requestFiles, responseFiles, templateContent) {
-  const requestLines = requestFiles.map((file, index) => `${index + 1}. messages/${path23.basename(file)}`).join("\n");
-  const responseList = responseFiles.map((file) => `"${path23.basename(file)}"`).join(", ");
+  const requestLines = requestFiles.map((file, index) => `${index + 1}. messages/${path24.basename(file)}`).join("\n");
+  const responseList = responseFiles.map((file) => `"${path24.basename(file)}"`).join(", ");
   return renderTemplate2(templateContent, {
     requestFiles: requestLines,
     responseList
@@ -8784,7 +9224,7 @@ function createBatchOrchestratorPrompt(requestFiles, responseFiles, templateCont
 // src/evaluation/providers/vscode/dispatch/responseWaiter.ts
 import { readFile as readFile9 } from "node:fs/promises";
-import path24 from "node:path";
+import path25 from "node:path";
 // src/evaluation/providers/vscode/utils/time.ts
 function sleep2(ms) {
@@ -8843,7 +9283,7 @@ async function waitForResponseOutput(responseFileFinal, pollInterval = 1e3, sile
 }
 async function waitForBatchResponses(responseFilesFinal, pollInterval = 1e3, silent = false, timeoutMs = DEFAULT_TIMEOUT_MS) {
   if (!silent) {
-    const fileList = responseFilesFinal.map((file) => path24.basename(file)).join(", ");
+    const fileList = responseFilesFinal.map((file) => path25.basename(file)).join(", ");
     console.error(`waiting for ${responseFilesFinal.length} batch response(s): ${fileList}`);
   }
   const deadline = Date.now() + timeoutMs;
@@ -8852,7 +9292,7 @@ async function waitForBatchResponses(responseFilesFinal, pollInterval = 1e3, sil
     while (pending.size > 0) {
       if (Date.now() >= deadline) {
         if (!silent) {
-          const remaining = [...pending].map((f) => path24.basename(f)).join(", ");
+          const remaining = [...pending].map((f) => path25.basename(f)).join(", ");
           console.error(
             `error: timed out after ${Math.round(timeoutMs / 1e3)}s waiting for batch responses. Still pending: ${remaining}`
           );
@@ -8908,36 +9348,6 @@ import { promisify as promisify2 } from "node:util";
 // src/evaluation/providers/vscode/dispatch/constants.ts
 import path26 from "node:path";
-// src/paths.ts
-import os2 from "node:os";
-import path25 from "node:path";
-var logged = false;
-function getAgentvHome() {
-  const envHome = process.env.AGENTV_HOME;
-  if (envHome && envHome !== "undefined") {
-    if (!logged) {
-      logged = true;
-      console.warn(`Using AGENTV_HOME: ${envHome}`);
-    }
-    return envHome;
-  }
-  return path25.join(os2.homedir(), ".agentv");
-}
-function getWorkspacesRoot() {
-  return path25.join(getAgentvHome(), "workspaces");
-}
-function getSubagentsRoot() {
-  return path25.join(getAgentvHome(), "subagents");
-}
-function getTraceStateRoot() {
-  return path25.join(getAgentvHome(), "trace-state");
-}
-function getWorkspacePoolRoot() {
-  return path25.join(getAgentvHome(), "workspace-pool");
-}
-// src/evaluation/providers/vscode/dispatch/constants.ts
 var DEFAULT_LOCK_NAME = "subagent.lock";
 var DEFAULT_ALIVE_FILENAME = ".alive";
 function getDefaultSubagentRoot(vscodeCmd = "code") {
@@ -10194,9 +10604,10 @@ function resolveAndCreateProvider(definition, env = process.env) {
 }
 // src/evaluation/evaluators/scoring.ts
-var PASS_THRESHOLD = 0.8;
-function scoreToVerdict(score) {
-  return score >= PASS_THRESHOLD ? "pass" : "fail";
+// Pass/fail cut-off used when a caller supplies no threshold.
+var DEFAULT_THRESHOLD = 0.8;
+// Alias of DEFAULT_THRESHOLD, kept under the previous name.
+var PASS_THRESHOLD = DEFAULT_THRESHOLD;
+/**
+ * Map a numeric score to a verdict string.
+ * @param {number} score - Score to classify.
+ * @param {number} [threshold=DEFAULT_THRESHOLD] - Inclusive minimum score for "pass".
+ * @returns {"pass"|"fail"} "pass" when score >= threshold, otherwise "fail".
+ */
+function scoreToVerdict(score, threshold = DEFAULT_THRESHOLD) {
+  return score >= threshold ? "pass" : "fail";
 }
 function clampScore(value) {
   if (Number.isNaN(value) || !Number.isFinite(value)) {
@@ -10385,13 +10796,13 @@ async function execFileWithStdinNode(argv, stdinPayload, options) {
 async function execShellWithStdin(command, stdinPayload, options = {}) {
   const { mkdir: mkdir16, readFile: readFile15, rm: rm6, writeFile: writeFile9 } = await import("node:fs/promises");
   const { tmpdir: tmpdir3 } = await import("node:os");
-  const path49 = await import("node:path");
+  const path50 = await import("node:path");
   const { randomUUID: randomUUID10 } = await import("node:crypto");
-  const dir = path49.join(tmpdir3(), `agentv-exec-${randomUUID10()}`);
+  const dir = path50.join(tmpdir3(), `agentv-exec-${randomUUID10()}`);
   await mkdir16(dir, { recursive: true });
-  const stdinPath = path49.join(dir, "stdin.txt");
-  const stdoutPath = path49.join(dir, "stdout.txt");
-  const stderrPath = path49.join(dir, "stderr.txt");
+  const stdinPath = path50.join(dir, "stdin.txt");
+  const stdoutPath = path50.join(dir, "stdout.txt");
+  const stderrPath = path50.join(dir, "stderr.txt");
   await writeFile9(stdinPath, stdinPayload, "utf8");
   const wrappedCommand = process.platform === "win32" ? `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}` : `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}`;
   const { spawn: spawn5 } = await import("node:child_process");
@@ -11589,7 +12000,7 @@ ${outputSchema}`;
     parts.push("[[ ## scoring_criteria ## ]]");
     for (const rubric of rubrics) {
       const weightLabel = rubric.weight !== 1 ? ` (weight: ${rubric.weight})` : "";
-      const minScoreLabel = rubric.required_min_score !== void 0 ? ` [REQUIRED: min score ${rubric.required_min_score}]` : "";
+      const minScoreLabel = rubric.min_score !== void 0 ? ` [REQUIRED: min score ${rubric.min_score}]` : rubric.required_min_score !== void 0 ? ` [REQUIRED: min score ${rubric.required_min_score}]` : "";
       parts.push("", `### Criterion: ${rubric.id}${weightLabel}${minScoreLabel}`);
       if (rubric.outcome) {
         parts.push(`Description: ${rubric.outcome}`);
@@ -11643,54 +12054,106 @@ ${outputSchema}`;
   async runWithRetry(options) {
+    // Requests a grader response up to three times and validates each one against `schema`.
+    // Returns { data, providerResponse, tokenUsage } for the first response that validates.
+    // If a parse failure on a non-empty response is still recorded after the loop, one extra
+    // "structure-fix" call is made; if nothing validates, an Error is thrown.
     const { context, graderProvider, systemPrompt, userPrompt, schema, images } = options;
     let lastError;
+    // Latest non-empty response returned by the grader (undefined when the latest one was empty).
     let lastInvalidResponse;
+    // Set by the most recent parse failure: true only when that response had non-empty text.
     let shouldAttemptStructureFix = false;
     for (let attempt = 1; attempt <= 3; attempt++) {
       try {
-        const model = graderProvider.asLanguageModel?.();
-        if (model) {
-          const modelOptions = {
-            ...this.maxOutputTokens ? { maxTokens: this.maxOutputTokens } : {},
-            ...typeof this.temperature === "number" ? { temperature: this.temperature } : {}
-          };
-          const hasImages = images && images.length > 0;
-          const result = hasImages ? await generateText2({
-            model,
-            system: systemPrompt,
-            messages: [
-              {
-                role: "user",
-                content: [
-                  { type: "text", text: userPrompt },
-                  ...toAiSdkImageParts(images)
-                ]
-              }
-            ],
-            ...modelOptions
-          }) : await generateText2({
-            model,
-            system: systemPrompt,
-            prompt: userPrompt,
-            ...modelOptions
-          });
-          const data2 = schema.parse(parseJsonFromText(result.text));
-          const rawUsage = result.usage;
-          const tokenUsage = rawUsage?.inputTokens != null && rawUsage?.outputTokens != null ? { input: rawUsage.inputTokens, output: rawUsage.outputTokens } : void 0;
-          return { data: data2, tokenUsage };
+        const result = await this.generateStructuredResponse({
+          context,
+          graderProvider,
+          systemPrompt,
+          userPrompt,
+          images
+        });
+        // A whitespace-only response gives the repair prompt nothing to work with.
+        const canRepairResponse = result.text.trim().length > 0;
+        lastInvalidResponse = canRepairResponse ? result : void 0;
+        let data;
+        try {
+          data = schema.parse(parseJsonFromText(result.text));
+        } catch (e) {
+          // Parse/validation failure: remember the error and move on to the next attempt.
+          lastError = e instanceof Error ? e : new Error(String(e));
+          shouldAttemptStructureFix = canRepairResponse;
+          continue;
         }
-        const response = await graderProvider.invoke({
-          question: userPrompt,
+        return {
+          data,
+          providerResponse: result.providerResponse,
+          tokenUsage: result.tokenUsage
+        };
+      } catch (e) {
+        // The grader call itself failed; the repair flag and recorded response are left as they were.
+        lastError = e instanceof Error ? e : new Error(String(e));
+      }
+    }
+    if (shouldAttemptStructureFix && lastInvalidResponse) {
+      try {
+        // Single repair call: same system prompt, a repair user prompt, and no images.
+        const repaired = await this.generateStructuredResponse({
+          context,
+          graderProvider,
           systemPrompt,
-          evalCaseId: context.evalCase.id,
-          attempt: context.attempt,
-          maxOutputTokens: this.maxOutputTokens,
-          temperature: this.temperature
+          userPrompt: buildStructureRepairPrompt({
+            validationError: lastError?.message ?? "Schema validation failed",
+            invalidResponse: lastInvalidResponse.text
+          })
         });
-        const data = schema.parse(parseJsonFromText(extractLastAssistantContent(response.output)));
-        return { data, providerResponse: response, tokenUsage: response.tokenUsage };
+        const data = schema.parse(parseJsonFromText(repaired.text));
+        // NOTE(review): only the recorded response's usage is added to the repair call's usage;
+        // usage from the other attempts is not counted - confirm this is intended.
+        return {
+          data,
+          providerResponse: repaired.providerResponse,
+          tokenUsage: sumTokenUsage(lastInvalidResponse.tokenUsage, repaired.tokenUsage)
+        };
       } catch (e) {
         lastError = e instanceof Error ? e : new Error(String(e));
       }
     }
-    throw new Error(`Failed to parse evaluator response after 3 attempts: ${lastError?.message}`);
+    // NOTE(review): this message mentions a structure-fix attempt even when the repair branch
+    // above was skipped.
+    throw new Error(
+      `Failed to parse evaluator response after 3 attempts and 1 structure-fix attempt: ${lastError?.message}`
+    );
+  }
+  /**
+   * Perform one grader call and return its raw text without parsing or validating it.
+   *
+   * When `graderProvider.asLanguageModel` exists and returns a model, the call goes through
+   * `generateText2`; otherwise it falls back to `graderProvider.invoke`.
+   *
+   * @param options - `{ context, graderProvider, systemPrompt, userPrompt, images }`.
+   * @returns `{ text, tokenUsage }` on the `generateText2` path, or
+   *   `{ text, providerResponse, tokenUsage }` on the `invoke` path.
+   */
+  async generateStructuredResponse(options) {
+    const { context, graderProvider, systemPrompt, userPrompt, images } = options;
+    const model = graderProvider.asLanguageModel?.();
+    if (model) {
+      // NOTE(review): the limit is forwarded as `maxTokens` while usage is read as
+      // `inputTokens`/`outputTokens` - confirm `maxTokens` is the option name generateText2 expects.
+      const modelOptions = {
+        ...this.maxOutputTokens ? { maxTokens: this.maxOutputTokens } : {},
+        ...typeof this.temperature === "number" ? { temperature: this.temperature } : {}
+      };
+      // With images, the prompt is sent as a single user message made of text plus image parts.
+      const hasImages = images && images.length > 0;
+      const result = hasImages ? await generateText2({
+        model,
+        system: systemPrompt,
+        messages: [
+          {
+            role: "user",
+            content: [
+              { type: "text", text: userPrompt },
+              ...toAiSdkImageParts(images)
+            ]
+          }
+        ],
+        ...modelOptions
+      }) : await generateText2({
+        model,
+        system: systemPrompt,
+        prompt: userPrompt,
+        ...modelOptions
+      });
+      // Usage is reported only when both counters are present.
+      const rawUsage = result.usage;
+      const tokenUsage = rawUsage?.inputTokens != null && rawUsage?.outputTokens != null ? { input: rawUsage.inputTokens, output: rawUsage.outputTokens } : void 0;
+      return { text: result.text, tokenUsage };
+    }
+    // Fallback path; `images` is not forwarded to invoke().
+    const response = await graderProvider.invoke({
+      question: userPrompt,
+      systemPrompt,
+      evalCaseId: context.evalCase.id,
+      attempt: context.attempt,
+      maxOutputTokens: this.maxOutputTokens,
+      temperature: this.temperature
+    });
+    return {
+      text: extractLastAssistantContent(response.output),
+      providerResponse: response,
+      tokenUsage: response.tokenUsage
+    };
   }
 };
 function buildOutputSchema() {
@@ -11710,6 +12173,29 @@ function buildOutputSchema() {
     "}"
   ].join("\n");
 }
+/**
+ * Compose the user prompt for a structure-repair call.
+ *
+ * @param options - `{ validationError, invalidResponse }`: the validation message and the
+ *   response text that failed validation.
+ * @returns {string} Newline-joined prompt: three instruction lines, then the validation error
+ *   and the invalid response, each under its own heading.
+ */
+function buildStructureRepairPrompt(options) {
+  const instructionLines = [
+    "The following evaluation response has useful grading content but invalid JSON structure.",
+    "Repair it to satisfy the schema in the system prompt.",
+    "Preserve the evaluation meaning, do not re-grade the answer, and return only a single JSON object."
+  ];
+  const promptLines = [
+    ...instructionLines,
+    "",
+    "Validation error:",
+    options.validationError,
+    "",
+    "Invalid response:",
+    options.invalidResponse
+  ];
+  return promptLines.join("\n");
+}
+/**
+ * Add two optional `{ input, output }` token-usage records.
+ *
+ * @param first - Usage record, or a falsy value when unavailable.
+ * @param second - Usage record, or a falsy value when unavailable.
+ * @returns `undefined` when both records are missing; otherwise the field-wise sum, with a
+ *   missing record or missing field counted as 0.
+ */
+function sumTokenUsage(first, second) {
+  const hasAnyUsage = Boolean(first) || Boolean(second);
+  if (!hasAnyUsage) {
+    return void 0;
+  }
+  const inputTotal = (first?.input ?? 0) + (second?.input ?? 0);
+  const outputTotal = (first?.output ?? 0) + (second?.output ?? 0);
+  return { input: inputTotal, output: outputTotal };
+}
 function buildRubricOutputSchema() {
   return `You are an expert evaluator. Evaluate the candidate answer against each rubric item.
 You must return a valid JSON object matching this schema:
@@ -11809,19 +12295,21 @@ function calculateScoreRangeResult(result, rubrics) {
     rawScores[rubric.id] = rawScore;
     totalWeight += rubric.weight;
     weightedScoreSum += normalizedScore * rubric.weight;
-    let requiredMinScore;
-    if (rubric.required_min_score !== void 0) {
-      requiredMinScore = rubric.required_min_score;
+    let minScoreThreshold;
+    if (rubric.min_score !== void 0) {
+      minScoreThreshold = rubric.min_score;
+    } else if (rubric.required_min_score !== void 0) {
+      minScoreThreshold = rubric.required_min_score / 10;
     } else if (rubric.required === true) {
-      requiredMinScore = 10;
+      minScoreThreshold = 1;
     }
     const matchingRange = rubric.score_ranges?.find(
       (r) => rawScore >= r.score_range[0] && rawScore <= r.score_range[1]
     );
     const rangeDescription = matchingRange?.outcome ?? "";
     const criterionLabel = rubric.outcome ?? rubric.id;
-    const passed = !(requiredMinScore !== void 0 && rawScore < requiredMinScore) && rawScore >= 7;
-    if (requiredMinScore !== void 0 && rawScore < requiredMinScore) {
+    const passed = !(minScoreThreshold !== void 0 && normalizedScore < minScoreThreshold) && rawScore >= 7;
+    if (minScoreThreshold !== void 0 && normalizedScore < minScoreThreshold) {
       failedRequired = true;
     }
     assertions.push({
@@ -11898,11 +12386,11 @@ function createFilesystemTools(workspacePath) {
       execute: async (input) => {
         try {
           const resolved = resolveSandboxed(workspacePath, input.path);
-          const stat10 = await fs2.stat(resolved);
-          if (stat10.isDirectory()) {
+          const stat11 = await fs2.stat(resolved);
+          if (stat11.isDirectory()) {
             return { error: `'${input.path}' is a directory, not a file` };
           }
-          const buffer = Buffer.alloc(Math.min(stat10.size, MAX_FILE_SIZE));
+          const buffer = Buffer.alloc(Math.min(stat11.size, MAX_FILE_SIZE));
           const fd = await fs2.open(resolved, "r");
           try {
             await fd.read(buffer, 0, buffer.length, 0);
@@ -11910,8 +12398,8 @@ function createFilesystemTools(workspacePath) {
             await fd.close();
           }
           const content = buffer.toString("utf-8");
-          const truncated = stat10.size > MAX_FILE_SIZE;
-          return { content, truncated, size: stat10.size };
+          const truncated = stat11.size > MAX_FILE_SIZE;
+          return { content, truncated, size: stat11.size };
         } catch (error) {
           return { error: error instanceof Error ? error.message : String(error) };
         }
@@ -11962,8 +12450,8 @@ async function searchDirectory(dirPath, workspacePath, regex, matches) {
       const ext = path35.extname(entry.name).toLowerCase();
       if (BINARY_EXTENSIONS.has(ext)) continue;
       try {
-        const stat10 = await fs2.stat(fullPath);
-        if (stat10.size > MAX_FILE_SIZE) continue;
+        const stat11 = await fs2.stat(fullPath);
+        if (stat11.size > MAX_FILE_SIZE) continue;
         const content = await fs2.readFile(fullPath, "utf-8");
         const lines = content.split("\n");
         for (let i = 0; i < lines.length; i++) {
@@ -12604,115 +13092,115 @@ var FieldAccuracyEvaluator = class {
    * Evaluate a single field against the expected value.
    */
   evaluateField(fieldConfig, candidateData, expectedData) {
-    const { path: path49, match, required = true, weight = 1 } = fieldConfig;
-    const candidateValue = resolvePath(candidateData, path49);
-    const expectedValue = resolvePath(expectedData, path49);
+    const { path: path50, match, required = true, weight = 1 } = fieldConfig;
+    const candidateValue = resolvePath(candidateData, path50);
+    const expectedValue = resolvePath(expectedData, path50);
     if (expectedValue === void 0) {
       return {
-        path: path49,
+        path: path50,
         score: 1,
         // No expected value means no comparison needed
         weight,
         hit: true,
-        message: `${path49}: no expected value`
+        message: `${path50}: no expected value`
       };
     }
     if (candidateValue === void 0) {
       if (required) {
         return {
-          path: path49,
+          path: path50,
           score: 0,
           weight,
           hit: false,
-          message: `${path49} (required, missing)`
+          message: `${path50} (required, missing)`
         };
       }
       return {
-        path: path49,
+        path: path50,
         score: 1,
         // Don't penalize missing optional fields
         weight: 0,
         // Zero weight means it won't affect the score
         hit: true,
-        message: `${path49}: optional field missing`
+        message: `${path50}: optional field missing`
       };
     }
     switch (match) {
       case "exact":
-        return this.compareExact(path49, candidateValue, expectedValue, weight);
+        return this.compareExact(path50, candidateValue, expectedValue, weight);
       case "numeric_tolerance":
         return this.compareNumericTolerance(
-          path49,
+          path50,
           candidateValue,
           expectedValue,
           fieldConfig,
           weight
         );
       case "date":
-        return this.compareDate(path49, candidateValue, expectedValue, fieldConfig, weight);
+        return this.compareDate(path50, candidateValue, expectedValue, fieldConfig, weight);
       default:
         return {
-          path: path49,
+          path: path50,
           score: 0,
           weight,
           hit: false,
-          message: `${path49}: unknown match type "${match}"`
+          message: `${path50}: unknown match type "${match}"`
         };
     }
   }
   /**
    * Exact equality comparison.
    */
-  compareExact(path49, candidateValue, expectedValue, weight) {
+  compareExact(path50, candidateValue, expectedValue, weight) {
     if (deepEqual(candidateValue, expectedValue)) {
       return {
-        path: path49,
+        path: path50,
         score: 1,
         weight,
         hit: true,
-        message: path49
+        message: path50
       };
     }
     if (typeof candidateValue !== typeof expectedValue) {
       return {
-        path: path49,
+        path: path50,
         score: 0,
         weight,
         hit: false,
-        message: `${path49} (type mismatch: got ${typeof candidateValue}, expected ${typeof expectedValue})`
+        message: `${path50} (type mismatch: got ${typeof candidateValue}, expected ${typeof expectedValue})`
       };
     }
     return {
-      path: path49,
+      path: path50,
       score: 0,
       weight,
       hit: false,
-      message: `${path49} (value mismatch)`
+      message: `${path50} (value mismatch)`
     };
   }
   /**
    * Numeric comparison with absolute or relative tolerance.
    */
-  compareNumericTolerance(path49, candidateValue, expectedValue, fieldConfig, weight) {
+  compareNumericTolerance(path50, candidateValue, expectedValue, fieldConfig, weight) {
     const { tolerance = 0, relative = false } = fieldConfig;
     const candidateNum = toNumber(candidateValue);
     const expectedNum = toNumber(expectedValue);
     if (candidateNum === null || expectedNum === null) {
       return {
-        path: path49,
+        path: path50,
         score: 0,
         weight,
         hit: false,
-        message: `${path49} (non-numeric value)`
+        message: `${path50} (non-numeric value)`
       };
     }
     if (!Number.isFinite(candidateNum) || !Number.isFinite(expectedNum)) {
       return {
-        path: path49,
+        path: path50,
         score: 0,
         weight,
         hit: false,
-        message: `${path49} (invalid numeric value)`
+        message: `${path50} (invalid numeric value)`
       };
     }
     const diff = Math.abs(candidateNum - expectedNum);
@@ -12725,61 +13213,61 @@ var FieldAccuracyEvaluator = class {
     }
     if (withinTolerance) {
       return {
-        path: path49,
+        path: path50,
         score: 1,
         weight,
         hit: true,
-        message: `${path49} (within tolerance: diff=${diff.toFixed(2)})`
+        message: `${path50} (within tolerance: diff=${diff.toFixed(2)})`
       };
     }
     return {
-      path: path49,
+      path: path50,
       score: 0,
       weight,
       hit: false,
-      message: `${path49} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
+      message: `${path50} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
     };
   }
   /**
    * Date comparison with format normalization.
    */
-  compareDate(path49, candidateValue, expectedValue, fieldConfig, weight) {
+  compareDate(path50, candidateValue, expectedValue, fieldConfig, weight) {
     const formats = fieldConfig.formats ?? DEFAULT_DATE_FORMATS;
     const candidateDate = parseDate(String(candidateValue), formats);
     const expectedDate = parseDate(String(expectedValue), formats);
     if (candidateDate === null) {
       return {
-        path: path49,
+        path: path50,
         score: 0,
         weight,
         hit: false,
-        message: `${path49} (unparseable candidate date)`
+        message: `${path50} (unparseable candidate date)`
       };
     }
     if (expectedDate === null) {
       return {
-        path: path49,
+        path: path50,
         score: 0,
         weight,
         hit: false,
-        message: `${path49} (unparseable expected date)`
+        message: `${path50} (unparseable expected date)`
       };
     }
     if (candidateDate.getFullYear() === expectedDate.getFullYear() && candidateDate.getMonth() === expectedDate.getMonth() && candidateDate.getDate() === expectedDate.getDate()) {
       return {
-        path: path49,
+        path: path50,
         score: 1,
         weight,
         hit: true,
-        message: path49
+        message: path50
       };
     }
     return {
-      path: path49,
+      path: path50,
       score: 0,
       weight,
       hit: false,
-      message: `${path49} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
+      message: `${path50} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
     };
   }
   /**
@@ -12812,11 +13300,11 @@ var FieldAccuracyEvaluator = class {
     };
   }
 };
-function resolvePath(obj, path49) {
-  if (!path49 || !obj) {
+function resolvePath(obj, path50) {
+  if (!path50 || !obj) {
     return void 0;
   }
-  const parts = path49.split(/\.|\[|\]/).filter((p) => p.length > 0);
+  const parts = path50.split(/\.|\[|\]/).filter((p) => p.length > 0);
   let current = obj;
   for (const part of parts) {
     if (current === null || current === void 0) {
@@ -13308,8 +13796,8 @@ var TokenUsageEvaluator = class {
 };
 // src/evaluation/evaluators/tool-trajectory.ts
-function getNestedValue(obj, path49) {
-  const parts = path49.split(".");
+function getNestedValue(obj, path50) {
+  const parts = path50.split(".");
   let current = obj;
   for (const part of parts) {
     if (current === null || current === void 0 || typeof current !== "object") {
@@ -15081,7 +15569,7 @@ var WorkspacePoolManager = class {
   }
   /**
    * Reset an existing slot for reuse:
-   * 1. Reset repos (git reset --hard {ref} && git clean -fd per repo)
+   * 1. Reset repos (fetch from origin when resolve=remote, then git reset --hard && git clean per repo)
    * 2. Re-copy template files (skip repo directories)
    */
   async resetSlot(slotPath, templatePath, repos, poolReset = "fast") {
@@ -15094,7 +15582,17 @@ var WorkspacePoolManager = class {
         continue;
       }
       const ref = repo.checkout?.ref ?? "HEAD";
-      await git(["reset", "--hard", ref], { cwd: repoDir });
+      const resolve = repo.checkout?.resolve ?? "remote";
+      if (resolve === "remote") {
+        const fetchArgs = ["fetch", "origin", ref];
+        if (repo.clone?.depth) {
+          fetchArgs.splice(1, 0, "--depth", String(repo.clone.depth));
+        }
+        await git(fetchArgs, { cwd: repoDir });
+        await git(["reset", "--hard", "FETCH_HEAD"], { cwd: repoDir });
+      } else {
+        await git(["reset", "--hard", ref], { cwd: repoDir });
+      }
       const cleanFlag = poolReset === "strict" ? "-fdx" : "-fd";
       await git(["clean", cleanFlag], { cwd: repoDir });
     }
@@ -15391,7 +15889,7 @@ async function executeWorkspaceScript(config, context, failureMode = "fatal") {
 }
 // src/evaluation/orchestrator.ts
-function classifyQualityStatus(score, threshold = PASS_THRESHOLD) {
+function classifyQualityStatus(score, threshold = DEFAULT_THRESHOLD) {
+  // Inclusive comparison: a score equal to the threshold is "ok"; anything lower is "quality_failure".
   return score >= threshold ? "ok" : "quality_failure";
 }
 function buildSkippedEvaluatorError(scores) {
@@ -15483,7 +15981,7 @@ async function runEvaluation(options) {
   const filteredEvalCases = filterEvalCases(evalCases, filter);
   if (filteredEvalCases.length === 0) {
     if (filter) {
-      throw new Error(`No tests matched filter '${filter}' in ${evalFilePath}`);
+      throw new Error(`No tests matched filter '${formatFilter(filter)}' in ${evalFilePath}`);
     }
     return [];
   }
@@ -15509,20 +16007,10 @@ async function runEvaluation(options) {
     if (resolvedTargetsByName.has(name)) {
       return resolvedTargetsByName.get(name);
     }
-    let definition = targetDefinitions.get(name);
+    const definition = resolveDelegatedTargetDefinition(name, targetDefinitions, envLookup);
     if (!definition) {
       return void 0;
     }
-    for (let depth = 0; depth < 5; depth++) {
-      const useTarget = definition.use_target;
-      if (typeof useTarget !== "string" || useTarget.trim().length === 0) break;
-      const envMatch = useTarget.trim().match(/^\$\{\{\s*([A-Z0-9_]+)\s*\}\}$/i);
-      const resolvedName = envMatch ? envLookup[envMatch[1]] ?? "" : useTarget.trim();
-      if (resolvedName.length === 0) break;
-      const next = targetDefinitions.get(resolvedName);
-      if (!next) break;
-      definition = next;
-    }
     const resolved = resolveTargetDefinition(definition, envLookup, evalFilePath);
     resolvedTargetsByName.set(name, resolved);
     return resolved;
@@ -15545,6 +16033,9 @@ async function runEvaluation(options) {
     const graderName = targetContext.graderTarget ?? targetContext.name;
     const resolvedGrader = resolveTargetByName(graderName);
     if (!resolvedGrader) {
+      if (!LLM_GRADER_CAPABLE_KINDS.includes(targetContext.kind)) {
+        return void 0;
+      }
       return getOrCreateProvider(targetContext);
     }
     return getOrCreateProvider(resolvedGrader);
@@ -15875,7 +16366,7 @@ async function runEvaluation(options) {
           const budgetResult = {
             timestamp: (now ?? (() => /* @__PURE__ */ new Date()))().toISOString(),
             testId: evalCase.id,
-            dataset: evalCase.dataset,
+            suite: evalCase.suite,
             category: evalCase.category,
             score: 0,
             assertions: [],
@@ -15912,7 +16403,7 @@ async function runEvaluation(options) {
           const haltResult = {
             timestamp: (now ?? (() => /* @__PURE__ */ new Date()))().toISOString(),
             testId: evalCase.id,
-            dataset: evalCase.dataset,
+            suite: evalCase.suite,
             category: evalCase.category,
             score: 0,
             assertions: [],
@@ -16224,7 +16715,7 @@ async function runBatchEvaluation(options) {
         targetResolver,
         availableTargets,
         verbose,
-        threshold: batchThreshold
+        threshold: evalCase.threshold ?? batchThreshold
       });
       if (providerError) {
         result = {
@@ -16686,8 +17177,9 @@ async function runEvalCase(options) {
       fileChanges,
       workspacePath,
       verbose,
-      threshold: caseThreshold
+      threshold: evalCase.threshold ?? caseThreshold
     });
+    const effectiveThreshold = evalCase.threshold ?? caseThreshold;
     const totalDurationMs = Date.now() - caseStartMs;
     const graderTokens = aggregateEvaluatorTokenUsage(result.scores);
     const evalRunTokenUsage = tokenUsage || graderTokens ? {
@@ -16701,7 +17193,7 @@ async function runEvalCase(options) {
       ...evalRunTokenUsage ? { tokenUsage: evalRunTokenUsage } : {}
     };
     const skippedEvaluatorError = buildSkippedEvaluatorError(result.scores);
-    const executionStatus = providerError || skippedEvaluatorError ? "execution_error" : classifyQualityStatus(result.score, caseThreshold);
+    const executionStatus = providerError || skippedEvaluatorError ? "execution_error" : classifyQualityStatus(result.score, effectiveThreshold);
     const targetUsedField = targetUsed ? { targetUsed } : {};
     const finalResult = providerError ? {
       ...result,
@@ -16902,7 +17394,8 @@ async function evaluateCandidate(options) {
     targetResolver,
     availableTargets,
     fileChanges,
-    workspacePath
+    workspacePath,
+    threshold: evalThreshold
   });
   const completedAt = nowFn();
   let agentRequest;
@@ -16933,7 +17426,7 @@ async function evaluateCandidate(options) {
   return {
     timestamp: completedAt.toISOString(),
     testId: evalCase.id,
-    dataset: evalCase.dataset,
+    suite: evalCase.suite,
     category: evalCase.category,
     conversationId: evalCase.conversation_id,
     score: score.score,
@@ -16976,7 +17469,8 @@ async function runEvaluatorsForCase(options) {
     targetResolver,
     availableTargets,
     fileChanges,
-    workspacePath
+    workspacePath,
+    threshold
   } = options;
   if (evalCase.assertions && evalCase.assertions.length > 0) {
     return runEvaluatorList({
@@ -17002,7 +17496,8 @@ async function runEvaluatorsForCase(options) {
       targetResolver,
       availableTargets,
       fileChanges,
-      workspacePath
+      workspacePath,
+      threshold
     });
   }
   const evaluatorKind = evalCase.evaluator ?? "llm-grader";
@@ -17104,7 +17599,8 @@ async function runEvaluatorList(options) {
         name: evaluatorConfig.name,
         type: evaluatorConfig.type,
         weight,
-        ...evaluatorConfig.required !== void 0 ? { required: evaluatorConfig.required } : {}
+        ...evaluatorConfig.required !== void 0 ? { required: evaluatorConfig.required } : {},
+        ...evaluatorConfig.min_score !== void 0 ? { min_score: evaluatorConfig.min_score } : {}
       });
       scores.push({
         name: evaluatorConfig.name,
@@ -17139,7 +17635,8 @@ async function runEvaluatorList(options) {
         name: evaluatorConfig.name ?? "unknown",
         type: evaluatorConfig.type ?? "llm-grader",
         weight,
-        ...evaluatorConfig.required !== void 0 ? { required: evaluatorConfig.required } : {}
+        ...evaluatorConfig.required !== void 0 ? { required: evaluatorConfig.required } : {},
+        ...evaluatorConfig.min_score !== void 0 ? { min_score: evaluatorConfig.min_score } : {}
       });
       scores.push({
         name: evaluatorConfig.name ?? "unknown",
@@ -17173,9 +17670,10 @@ async function runEvaluatorList(options) {
       }
     }
   }
+  const effectiveThreshold = options.threshold ?? DEFAULT_THRESHOLD;
   const hasRequiredFailure = scored.some((entry) => {
     if (!entry.required) return false;
-    const minScore = typeof entry.required === "number" ? entry.required : PASS_THRESHOLD;
+    const minScore = entry.min_score ?? (typeof entry.required === "number" ? entry.required : effectiveThreshold);
     return entry.score.score < minScore;
   });
   const scorable = scored.filter((entry) => entry.score.verdict !== "skip");
@@ -17186,17 +17684,23 @@ async function runEvaluatorList(options) {
   const expectedAspectCount = assertions.length || 1;
   const score = {
     score: aggregateScore,
-    verdict: scoreToVerdict(aggregateScore),
+    verdict: scoreToVerdict(aggregateScore, effectiveThreshold),
     assertions,
     expectedAspectCount
   };
   return { score, scores };
 }
+function formatFilter(filter) {
+  return typeof filter === "string" ? filter : filter.join(", ");
+}
+function matchesFilter3(id, filter) {
+  return typeof filter === "string" ? micromatch3.isMatch(id, filter) : filter.some((pattern) => micromatch3.isMatch(id, pattern));
+}
 function filterEvalCases(evalCases, filter) {
   if (!filter) {
     return evalCases;
   }
-  return evalCases.filter((evalCase) => micromatch3.isMatch(evalCase.id, filter));
+  return evalCases.filter((evalCase) => matchesFilter3(evalCase.id, filter));
 }
 function buildEvaluatorRegistry(overrides, resolveGraderProvider) {
   const llmGrader = overrides?.["llm-grader"] ?? new LlmGraderEvaluator({
@@ -17283,7 +17787,7 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
   return {
     timestamp: timestamp.toISOString(),
     testId: evalCase.id,
-    dataset: evalCase.dataset,
+    suite: evalCase.suite,
     category: evalCase.category,
     conversationId: evalCase.conversation_id,
     score: 0,
@@ -17555,6 +18059,7 @@ async function evaluate(config) {
     verbose: config.verbose,
     maxConcurrency: config.workers ?? 3,
     filter: config.filter,
+    threshold: config.threshold,
     evalCases,
     onResult: async (result) => {
       collectedResults.push(result);
@@ -17565,19 +18070,19 @@ async function evaluate(config) {
   const durationMs = Date.now() - startTime;
   return {
     results: allResults,
-    summary: computeSummary(allResults, durationMs)
+    summary: computeSummary(allResults, durationMs, config.threshold)
   };
 }
 function mapAssertionType(type) {
   return type.replace(/_/g, "-");
 }
-function computeSummary(results, durationMs) {
+function computeSummary(results, durationMs, threshold = DEFAULT_THRESHOLD) {
   const total = results.length;
   let passed = 0;
   let scoreSum = 0;
   for (const r of results) {
     scoreSum += r.score;
-    if (r.score >= PASS_THRESHOLD) {
+    if (r.score >= threshold) {
       passed++;
     }
   }
@@ -17608,7 +18113,7 @@ async function discoverDefaultTarget(repoRoot) {
   return null;
 }
 async function loadEnvHierarchy(repoRoot, startPath) {
-  const { readFileSync: readFileSync3 } = await import("node:fs");
+  const { readFileSync: readFileSync4 } = await import("node:fs");
   const chain = buildDirectoryChain(startPath, repoRoot);
   const envFiles = [];
   for (const dir of chain) {
@@ -17617,7 +18122,7 @@ async function loadEnvHierarchy(repoRoot, startPath) {
   }
   for (let i = 0; i < envFiles.length; i++) {
     try {
-      const content = readFileSync3(envFiles[i], "utf8");
+      const content = readFileSync4(envFiles[i], "utf8");
       for (const line of content.split("\n")) {
         const trimmed = line.trim();
         if (!trimmed || trimmed.startsWith("#")) continue;
@@ -17690,7 +18195,7 @@ var CONFIG_FILE_NAMES = [
 ];
 async function loadTsConfig(projectRoot) {
   const { existsSync: existsSync7 } = await import("node:fs");
-  const { pathToFileURL } = await import("node:url");
+  const { pathToFileURL: pathToFileURL2 } = await import("node:url");
   const { join: join2 } = await import("node:path");
   for (const fileName of CONFIG_FILE_NAMES) {
     const filePath = join2(projectRoot, fileName);
@@ -17698,7 +18203,7 @@ async function loadTsConfig(projectRoot) {
       continue;
     }
     try {
-      const fileUrl = pathToFileURL(filePath).href;
+      const fileUrl = pathToFileURL2(filePath).href;
       const mod = await import(fileUrl);
       const config = mod.default ?? mod;
       return AgentVConfigSchema.parse(config);
@@ -17832,7 +18337,7 @@ function shouldSkipCacheForTemperature(targetConfig) {
 }
 // src/projects.ts
-import { existsSync as existsSync6, mkdirSync, readFileSync as readFileSync2, readdirSync as readdirSync3, statSync as statSync2, writeFileSync } from "node:fs";
+import { existsSync as existsSync6, mkdirSync as mkdirSync2, readFileSync as readFileSync3, readdirSync as readdirSync3, statSync as statSync2, writeFileSync } from "node:fs";
 import path47 from "node:path";
 import { parse as parseYaml3, stringify as stringifyYaml } from "yaml";
 function getProjectsRegistryPath() {
@@ -17844,7 +18349,7 @@ function loadProjectRegistry() {
     return { projects: [] };
   }
   try {
-    const raw = readFileSync2(registryPath, "utf-8");
+    const raw = readFileSync3(registryPath, "utf-8");
     const parsed = parseYaml3(raw);
     if (!parsed || !Array.isArray(parsed.projects)) {
       return { projects: [] };
@@ -17858,7 +18363,7 @@ function saveProjectRegistry(registry) {
   const registryPath = getProjectsRegistryPath();
   const dir = path47.dirname(registryPath);
   if (!existsSync6(dir)) {
-    mkdirSync(dir, { recursive: true });
+    mkdirSync2(dir, { recursive: true });
   }
   writeFileSync(registryPath, stringifyYaml(registry), "utf-8");
 }
@@ -18124,7 +18629,7 @@ var OtelTraceExporter = class {
         rootSpan.setAttribute("gen_ai.system", "agentv");
         rootSpan.setAttribute("agentv.test_id", result.testId);
         rootSpan.setAttribute("agentv.target", result.target);
-        if (result.dataset) rootSpan.setAttribute("agentv.dataset", result.dataset);
+        if (result.suite) rootSpan.setAttribute("agentv.suite", result.suite);
         rootSpan.setAttribute("agentv.score", result.score);
         if (captureContent && result.output.length > 0) {
           const lastMsg = result.output[result.output.length - 1];
@@ -18333,7 +18838,7 @@ var OtelStreamingObserver = class {
     this.rootSpan.setAttribute("gen_ai.system", "agentv");
     this.rootSpan.setAttribute("agentv.test_id", testId);
     this.rootSpan.setAttribute("agentv.target", target);
-    if (evalSet) this.rootSpan.setAttribute("agentv.dataset", evalSet);
+    if (evalSet) this.rootSpan.setAttribute("agentv.suite", evalSet);
     this.rootCtx = this.api.trace.setSpan(this.api.context.active(), this.rootSpan);
   }
   /** Create and immediately export a tool span */
@@ -18682,11 +19187,241 @@ function extractToolResultContent(content) {
   return parts.length > 0 ? parts.join("") : void 0;
 }
-// src/import/session-discovery.ts
+// src/import/codex-parser.ts
+function parseCodexSession(jsonl) {
+  const messages = [];
+  let sessionId = "";
+  let cwd;
+  let model;
+  let version;
+  let startTimestamp;
+  let endTimestamp;
+  const pendingCalls = /* @__PURE__ */ new Map();
+  const lines = jsonl.split("\n").filter((l) => l.trim().length > 0);
+  for (const line of lines) {
+    let entry;
+    try {
+      entry = JSON.parse(line);
+    } catch {
+      continue;
+    }
+    if (!entry.type) continue;
+    if (entry.timestamp) {
+      if (!startTimestamp) startTimestamp = entry.timestamp;
+      endTimestamp = entry.timestamp;
+    }
+    const payload = entry.payload ?? {};
+    switch (entry.type) {
+      case "session_meta": {
+        sessionId = String(payload.id ?? "");
+        cwd = payload.cwd ? String(payload.cwd) : void 0;
+        version = payload.cli_version ? String(payload.cli_version) : void 0;
+        if (payload.model && !model) {
+          model = String(payload.model);
+        }
+        break;
+      }
+      case "turn_context": {
+        if (payload.model && !model) {
+          model = String(payload.model);
+        }
+        if (payload.cwd && !cwd) {
+          cwd = String(payload.cwd);
+        }
+        break;
+      }
+      case "response_item": {
+        const itemType = String(payload.type ?? "");
+        const role = String(payload.role ?? "");
+        switch (itemType) {
+          case "message": {
+            if (role === "developer") break;
+            const content = extractResponseItemContent(payload.content);
+            if (role === "user" && content) {
+              messages.push({ role: "user", content });
+            } else if (role === "assistant" && content) {
+              messages.push({ role: "assistant", content });
+            }
+            break;
+          }
+          case "function_call": {
+            const toolName = String(payload.name ?? "");
+            const callId = String(payload.call_id ?? "");
+            let input;
+            if (typeof payload.arguments === "string") {
+              try {
+                input = JSON.parse(payload.arguments);
+              } catch {
+                input = payload.arguments;
+              }
+            } else {
+              input = payload.arguments;
+            }
+            const toolCall = { tool: toolName, input, id: callId };
+            const msgIdx = messages.length;
+            messages.push({
+              role: "assistant",
+              toolCalls: [toolCall]
+            });
+            if (callId) {
+              pendingCalls.set(callId, { msgIdx, toolIdx: 0 });
+            }
+            break;
+          }
+          case "custom_tool_call": {
+            const toolName = String(payload.name ?? "");
+            const callId = String(payload.call_id ?? "");
+            let input;
+            if (typeof payload.arguments === "string") {
+              try {
+                input = JSON.parse(payload.arguments);
+              } catch {
+                input = payload.arguments;
+              }
+            } else {
+              input = payload.arguments;
+            }
+            const toolCall = { tool: toolName, input, id: callId };
+            const msgIdx = messages.length;
+            messages.push({
+              role: "assistant",
+              toolCalls: [toolCall]
+            });
+            if (callId) {
+              pendingCalls.set(callId, { msgIdx, toolIdx: 0 });
+            }
+            break;
+          }
+          case "function_call_output":
+          case "custom_tool_call_output": {
+            const callId = String(payload.call_id ?? "");
+            const pending = pendingCalls.get(callId);
+            if (pending) {
+              const existingMsg = messages[pending.msgIdx];
+              const existingCalls = [...existingMsg.toolCalls ?? []];
+              existingCalls[pending.toolIdx] = {
+                ...existingCalls[pending.toolIdx],
+                output: payload.output
+              };
+              messages[pending.msgIdx] = { ...existingMsg, toolCalls: existingCalls };
+              pendingCalls.delete(callId);
+            }
+            break;
+          }
+          // Skip reasoning blocks (thinking tokens)
+          case "reasoning":
+            break;
+        }
+        break;
+      }
+    }
+  }
+  let durationMs;
+  if (startTimestamp && endTimestamp) {
+    durationMs = new Date(endTimestamp).getTime() - new Date(startTimestamp).getTime();
+  }
+  const source = {
+    provider: "codex",
+    sessionId,
+    cwd,
+    startedAt: startTimestamp,
+    model,
+    version
+  };
+  return {
+    messages,
+    source,
+    // Codex rollout files don't include token counts (only rate limit info)
+    tokenUsage: void 0,
+    durationMs,
+    costUsd: null
+  };
+}
+function extractResponseItemContent(content) {
+  if (typeof content === "string") return content;
+  if (!Array.isArray(content)) return void 0;
+  const parts = [];
+  for (const block of content) {
+    if (typeof block === "object" && block !== null) {
+      const b = block;
+      if (typeof b.text === "string") {
+        parts.push(b.text);
+      }
+    }
+  }
+  return parts.length > 0 ? parts.join("") : void 0;
+}
+// src/import/codex-session-discovery.ts
 import { readdir as readdir8, stat as stat9 } from "node:fs/promises";
 import { homedir as homedir3 } from "node:os";
 import path48 from "node:path";
-var DEFAULT_PROJECTS_DIR = () => path48.join(homedir3(), ".claude", "projects");
+var DEFAULT_SESSIONS_DIR = () => path48.join(homedir3(), ".codex", "sessions");
+async function discoverCodexSessions(opts) {
+  const sessionsDir = opts?.sessionsDir ?? DEFAULT_SESSIONS_DIR();
+  const limit = opts?.latest ? 1 : opts?.limit ?? 10;
+  const sessions = [];
+  let yearDirs;
+  try {
+    yearDirs = await readdir8(sessionsDir);
+  } catch {
+    return [];
+  }
+  for (const year of yearDirs) {
+    const yearPath = path48.join(sessionsDir, year);
+    let monthDirs;
+    try {
+      monthDirs = await readdir8(yearPath);
+    } catch {
+      continue;
+    }
+    for (const month of monthDirs) {
+      const monthPath = path48.join(yearPath, month);
+      let dayDirs;
+      try {
+        dayDirs = await readdir8(monthPath);
+      } catch {
+        continue;
+      }
+      for (const day of dayDirs) {
+        if (opts?.date) {
+          const dirDate = `${year}-${month}-${day}`;
+          if (dirDate !== opts.date) continue;
+        }
+        const dayPath = path48.join(monthPath, day);
+        let files;
+        try {
+          files = await readdir8(dayPath);
+        } catch {
+          continue;
+        }
+        for (const file of files) {
+          if (!file.startsWith("rollout-") || !file.endsWith(".jsonl")) continue;
+          const filePath = path48.join(dayPath, file);
+          const nameWithoutExt = file.replace(/\.jsonl$/, "");
+          const parts = nameWithoutExt.split("-");
+          const sessionId = parts.length >= 6 ? parts.slice(-5).join("-") : nameWithoutExt;
+          let updatedAt;
+          try {
+            const fileStat = await stat9(filePath);
+            updatedAt = fileStat.mtime;
+          } catch {
+            updatedAt = /* @__PURE__ */ new Date(0);
+          }
+          sessions.push({ sessionId, filePath, filename: file, updatedAt });
+        }
+      }
+    }
+  }
+  sessions.sort((a, b) => b.updatedAt.getTime() - a.updatedAt.getTime());
+  return sessions.slice(0, limit);
+}
+// src/import/session-discovery.ts
+import { readdir as readdir9, stat as stat10 } from "node:fs/promises";
+import { homedir as homedir4 } from "node:os";
+import path49 from "node:path";
+var DEFAULT_PROJECTS_DIR = () => path49.join(homedir4(), ".claude", "projects");
 function encodeProjectPath(projectPath) {
   return projectPath.replace(/\//g, "-");
 }
@@ -18695,7 +19430,7 @@ async function discoverClaudeSessions(opts) {
   const limit = opts?.latest ? 1 : opts?.limit ?? 10;
   let projectDirs;
   try {
-    projectDirs = await readdir8(projectsDir);
+    projectDirs = await readdir9(projectsDir);
   } catch {
     return [];
   }
@@ -18705,10 +19440,10 @@ async function discoverClaudeSessions(opts) {
   }
   const sessions = [];
   for (const projectDir of projectDirs) {
-    const dirPath = path48.join(projectsDir, projectDir);
+    const dirPath = path49.join(projectsDir, projectDir);
     let entries;
     try {
-      entries = await readdir8(dirPath);
+      entries = await readdir9(dirPath);
     } catch {
       continue;
     }
@@ -18716,10 +19451,10 @@ async function discoverClaudeSessions(opts) {
       if (!entry.endsWith(".jsonl")) continue;
       const sessionId = entry.replace(/\.jsonl$/, "");
       if (opts?.sessionId && sessionId !== opts.sessionId) continue;
-      const filePath = path48.join(dirPath, entry);
+      const filePath = path49.join(dirPath, entry);
       let updatedAt;
       try {
-        const fileStat = await stat9(filePath);
+        const fileStat = await stat10(filePath);
         updatedAt = fileStat.mtime;
       } catch {
         updatedAt = /* @__PURE__ */ new Date(0);
@@ -18738,10 +19473,85 @@ async function discoverClaudeSessions(opts) {
 // src/import/types.ts
 import { readFile as readFile14 } from "node:fs/promises";
+function toTranscriptJsonLine(entry) {
+  const firstUserMessage = entry.messages.find((m) => m.role === "user");
+  const input = typeof firstUserMessage?.content === "string" ? firstUserMessage.content : "";
+  return {
+    input,
+    output: entry.messages,
+    token_usage: entry.tokenUsage ? {
+      input: entry.tokenUsage.input,
+      output: entry.tokenUsage.output,
+      cached: entry.tokenUsage.cached
+    } : void 0,
+    duration_ms: entry.durationMs,
+    cost_usd: entry.costUsd,
+    source: {
+      provider: entry.source.provider,
+      session_id: entry.source.sessionId,
+      model: entry.source.model,
+      timestamp: entry.source.startedAt,
+      git_branch: entry.source.gitBranch,
+      cwd: entry.source.cwd ?? entry.source.projectPath,
+      version: entry.source.version
+    }
+  };
+}
+async function readTranscriptJsonl(filePath) {
+  const text = await readFile14(filePath, "utf8");
+  return text.split("\n").filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
+}
 async function readTranscriptFile(filePath) {
   return readFile14(filePath, "utf8");
 }
+// src/import/transcript-provider.ts
+var TranscriptProvider = class _TranscriptProvider {
+  id;
+  kind = "transcript";
+  targetName;
+  lines;
+  cursor = 0;
+  constructor(targetName, lines) {
+    this.targetName = targetName;
+    this.id = `transcript:${targetName}`;
+    this.lines = lines;
+  }
+  /**
+   * Create a TranscriptProvider from a JSONL file path.
+   */
+  static async fromFile(filePath) {
+    const lines = await readTranscriptJsonl(filePath);
+    if (lines.length === 0) {
+      throw new Error(`Transcript file is empty: ${filePath}`);
+    }
+    const providerName = lines[0].source.provider ?? "transcript";
+    return new _TranscriptProvider(providerName, lines);
+  }
+  get lineCount() {
+    return this.lines.length;
+  }
+  async invoke(_request) {
+    if (this.cursor >= this.lines.length) {
+      throw new Error(
+        `Transcript exhausted: ${this.lines.length} line(s) available but ${this.cursor + 1} invocations attempted. Each transcript line maps to one test case.`
+      );
+    }
+    const line = this.lines[this.cursor++];
+    return {
+      output: line.output,
+      tokenUsage: line.token_usage ? {
+        input: line.token_usage.input,
+        output: line.token_usage.output,
+        cached: line.token_usage.cached
+      } : void 0,
+      durationMs: line.duration_ms,
+      costUsd: line.cost_usd ?? void 0,
+      startTime: line.source.timestamp
+    };
+  }
+};
 // src/index.ts
 function createAgentKernel() {
   return { status: "stub" };
@@ -18755,6 +19565,7 @@ export {
   DEFAULT_EVALUATOR_TEMPLATE,
   DEFAULT_EVAL_PATTERNS,
   DEFAULT_EXPLORATION_TOOLS,
+  DEFAULT_THRESHOLD,
   DeterministicAssertionEvaluator,
   EvaluatorRegistry,
   ExecutionMetricsEvaluator,
@@ -18776,6 +19587,7 @@ export {
   TemplateNotFoundError,
   TokenUsageEvaluator,
   ToolTrajectoryEvaluator,
+  TranscriptProvider,
   WorkspaceCreationError,
   WorkspacePoolManager,
   addProject,
@@ -18812,6 +19624,7 @@ export {
   detectFormat,
   discoverAssertions,
   discoverClaudeSessions,
+  discoverCodexSessions,
   discoverCopilotSessions,
   discoverGraders,
   discoverGraders as discoverJudges,
@@ -18872,6 +19685,8 @@ export {
   normalizeLineEndings,
   parseAgentSkillsEvals,
   parseClaudeSession,
+  parseCodexSession,
+  parseCopilotEvents,
   parseJsonFromText,
   parseJsonSafe,
   readJsonFile,
@@ -18879,8 +19694,10 @@ export {
   readTestSuiteMetadata,
   readTextFile,
   readTranscriptFile,
+  readTranscriptJsonl,
   removeProject,
   resolveAndCreateProvider,
+  resolveDelegatedTargetDefinition,
   resolveFileReference,
   resolveTargetDefinition,
   resolveWorkspaceTemplate,
@@ -18910,6 +19727,7 @@ export {
   substituteVariables,
   toCamelCaseDeep,
   toSnakeCaseDeep,
+  toTranscriptJsonLine,
   tokensPerTool,
   touchProject,
   transpileEvalYaml,