npm - @sanity/ailf - Versions diffs - 0.5.0 → 2.0.0 - Mend

@sanity/ailf 0.5.0 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (377) hide show

package/dist/_vendor/ailf-core/schemas/pipeline.d.ts CHANGED Viewed

@@ -47,7 +47,7 @@ export type WeightProfile = z.infer<typeof WeightProfileSchema>;
  */
 export declare const RubricConfigSchema: z.ZodObject<{
     footer: z.ZodString;
-    "mode-profiles": z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodRecord<z.ZodString, z.ZodString>>>;
+    "mode-profiles": z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodRecord<z.ZodString, z.ZodUnion<readonly [z.ZodString, z.ZodRecord<z.ZodString, z.ZodString>]>>>>;
     profiles: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodRecord<z.ZodString, z.ZodNumber>>>;
     templates: z.ZodRecord<z.ZodString, z.ZodObject<{
         criteria_label: z.ZodOptional<z.ZodNullable<z.ZodString>>;
@@ -67,19 +67,18 @@ export declare const FeatureSchema: z.ZodObject<{
     id: z.ZodString;
     name: z.ZodString;
     priority: z.ZodEnum<{
+        critical: "critical";
         high: "high";
-        low: "low";
         medium: "medium";
-        critical: "critical";
+        low: "low";
     }>;
     sections: z.ZodArray<z.ZodString>;
     status: z.ZodEnum<{
         covered: "covered";
-        "out-of-scope": "out-of-scope";
-        planned: "planned";
         uncovered: "uncovered";
+        planned: "planned";
+        "out-of-scope": "out-of-scope";
     }>;
-    taskCount: z.ZodOptional<z.ZodNumber>;
 }, z.core.$strip>;
 /** Inferred TypeScript type for a product feature. */
 export type Feature = z.infer<typeof FeatureSchema>;
@@ -92,19 +91,18 @@ export declare const FeatureRegistrySchema: z.ZodObject<{
         id: z.ZodString;
         name: z.ZodString;
         priority: z.ZodEnum<{
+            critical: "critical";
             high: "high";
-            low: "low";
             medium: "medium";
-            critical: "critical";
+            low: "low";
         }>;
         sections: z.ZodArray<z.ZodString>;
         status: z.ZodEnum<{
             covered: "covered";
-            "out-of-scope": "out-of-scope";
-            planned: "planned";
             uncovered: "uncovered";
+            planned: "planned";
+            "out-of-scope": "out-of-scope";
         }>;
-        taskCount: z.ZodOptional<z.ZodNumber>;
     }, z.core.$strip>>;
 }, z.core.$strip>;
 /** Inferred TypeScript type for the feature registry. */
@@ -440,14 +438,11 @@ export declare const TaskFileSchema: z.ZodArray<z.ZodUnion<readonly [z.ZodObject
 export type TaskFile = z.infer<typeof TaskFileSchema>;
 /**
  * Schema for per-dimension threshold values.
+ * Uses a dynamic record to support all evaluation modes, not just literacy.
  * Keys use kebab-case to match YAML convention; the threshold engine
  * normalizes to camelCase for comparison against FeatureScore fields.
  */
-export declare const ThresholdDimensionsSchema: z.ZodObject<{
-    "code-correctness": z.ZodOptional<z.ZodNumber>;
-    "doc-coverage": z.ZodOptional<z.ZodNumber>;
-    "task-completion": z.ZodOptional<z.ZodNumber>;
-}, z.core.$strip>;
+export declare const ThresholdDimensionsSchema: z.ZodRecord<z.ZodString, z.ZodNumber>;
 /** Inferred TypeScript type for threshold dimension overrides. */
 export type ThresholdDimensions = z.infer<typeof ThresholdDimensionsSchema>;
 /**
@@ -457,11 +452,7 @@ export type ThresholdDimensions = z.infer<typeof ThresholdDimensionsSchema>;
 export declare const ThresholdDefaultsSchema: z.ZodObject<{
     ceiling: z.ZodOptional<z.ZodNumber>;
     composite: z.ZodNumber;
-    dimensions: z.ZodOptional<z.ZodObject<{
-        "code-correctness": z.ZodOptional<z.ZodNumber>;
-        "doc-coverage": z.ZodOptional<z.ZodNumber>;
-        "task-completion": z.ZodOptional<z.ZodNumber>;
-    }, z.core.$strip>>;
+    dimensions: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodNumber>>;
     "doc-lift": z.ZodOptional<z.ZodNumber>;
 }, z.core.$strip>;
 /** Inferred TypeScript type for threshold defaults. */
@@ -501,21 +492,13 @@ export declare const ThresholdConfigSchema: z.ZodObject<{
     areas: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodObject<{
         ceiling: z.ZodOptional<z.ZodOptional<z.ZodNumber>>;
         composite: z.ZodOptional<z.ZodNumber>;
-        dimensions: z.ZodOptional<z.ZodOptional<z.ZodObject<{
-            "code-correctness": z.ZodOptional<z.ZodNumber>;
-            "doc-coverage": z.ZodOptional<z.ZodNumber>;
-            "task-completion": z.ZodOptional<z.ZodNumber>;
-        }, z.core.$strip>>>;
+        dimensions: z.ZodOptional<z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodNumber>>>;
         "doc-lift": z.ZodOptional<z.ZodOptional<z.ZodNumber>>;
     }, z.core.$strip>>>;
     defaults: z.ZodObject<{
         ceiling: z.ZodOptional<z.ZodNumber>;
         composite: z.ZodNumber;
-        dimensions: z.ZodOptional<z.ZodObject<{
-            "code-correctness": z.ZodOptional<z.ZodNumber>;
-            "doc-coverage": z.ZodOptional<z.ZodNumber>;
-            "task-completion": z.ZodOptional<z.ZodNumber>;
-        }, z.core.$strip>>;
+        dimensions: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodNumber>>;
         "doc-lift": z.ZodOptional<z.ZodNumber>;
     }, z.core.$strip>;
     regression: z.ZodOptional<z.ZodObject<{

package/dist/_vendor/ailf-core/schemas/pipeline.js CHANGED Viewed

@@ -43,10 +43,22 @@ const WeightProfileSchema = z
     return Math.abs(sum - 1.0) < 0.001;
 }, { message: "profile weights must sum to 1.0" });
 /**
- * Mode-to-profile bindings — maps (mode, variant) pairs to profile names.
- * Example: { baseline: { gold: "default", baseline: "output-only" } }
+ * Mode-to-profile bindings — maps (mode, perspective) pairs to profile names.
+ *
+ * Flat form (most modes):
+ *   { "mcp-server": { gold: "mcp-behavior" } }
+ *
+ * Nested form (literacy mode with variant sub-keys):
+ *   { literacy: { baseline: { gold: "default", baseline: "output-only" }, agentic: { gold: "default" } } }
+ *
+ * The nested form adds a variant level between mode and perspective,
+ * allowing a single canonical mode to host multiple scoring variants.
  */
-const ModeProfilesSchema = z.record(z.string(), z.record(z.string(), z.string()));
+const ModeProfileEntrySchema = z.union([
+    z.string(),
+    z.record(z.string(), z.string()),
+]);
+const ModeProfilesSchema = z.record(z.string(), z.record(z.string(), ModeProfileEntrySchema));
 /**
  * Schema for the full config/rubrics.yaml config file.
  *
@@ -96,7 +108,6 @@ export const FeatureSchema = z.object({
     priority: z.enum(["critical", "high", "medium", "low"]),
     sections: z.array(z.string().min(1)).min(1),
     status: z.enum(["covered", "uncovered", "planned", "out-of-scope"]),
-    taskCount: z.number().int().min(0).optional(),
 });
 /**
  * Schema for the full config/features.yaml config file.
@@ -277,14 +288,11 @@ export const TaskFileSchema = z
 // ---------------------------------------------------------------------------
 /**
  * Schema for per-dimension threshold values.
+ * Uses a dynamic record to support all evaluation modes, not just literacy.
  * Keys use kebab-case to match YAML convention; the threshold engine
  * normalizes to camelCase for comparison against FeatureScore fields.
  */
-export const ThresholdDimensionsSchema = z.object({
-    "code-correctness": z.number().min(0).max(100).optional(),
-    "doc-coverage": z.number().min(0).max(100).optional(),
-    "task-completion": z.number().min(0).max(100).optional(),
-});
+export const ThresholdDimensionsSchema = z.record(z.string(), z.number().min(0).max(100));
 /**
  * Schema for threshold defaults (and per-area overrides).
  * All fields are optional in per-area overrides; defaults must have composite.

package/dist/_vendor/ailf-core/schemas/schedules.d.ts CHANGED Viewed

@@ -18,10 +18,15 @@ export declare const ScheduleEntrySchema: z.ZodObject<{
     cron: z.ZodString;
     enabled: z.ZodDefault<z.ZodBoolean>;
     mode: z.ZodDefault<z.ZodEnum<{
-        agentic: "agentic";
+        custom: "custom";
+        literacy: "literacy";
+        "mcp-server": "mcp-server";
+        "agent-harness": "agent-harness";
+        "knowledge-probe": "knowledge-probe";
         baseline: "baseline";
-        full: "full";
+        agentic: "agentic";
         observed: "observed";
+        full: "full";
     }>>;
     name: z.ZodString;
     publish: z.ZodDefault<z.ZodBoolean>;
@@ -53,10 +58,15 @@ export declare const SchedulesFileSchema: z.ZodObject<{
         cron: z.ZodString;
         enabled: z.ZodDefault<z.ZodBoolean>;
         mode: z.ZodDefault<z.ZodEnum<{
-            agentic: "agentic";
+            custom: "custom";
+            literacy: "literacy";
+            "mcp-server": "mcp-server";
+            "agent-harness": "agent-harness";
+            "knowledge-probe": "knowledge-probe";
             baseline: "baseline";
-            full: "full";
+            agentic: "agentic";
             observed: "observed";
+            full: "full";
         }>>;
         name: z.ZodString;
         publish: z.ZodDefault<z.ZodBoolean>;

package/dist/_vendor/ailf-core/schemas/schedules.js CHANGED Viewed

@@ -11,6 +11,7 @@
  * @see docs/design-docs/report-store/implementation.md — Phase 5
  */
 import { z } from "zod";
+import { RAW_EVAL_MODES } from "../../ailf-shared/index.js";
 // ---------------------------------------------------------------------------
 // Cron expression validation
 // ---------------------------------------------------------------------------
@@ -34,8 +35,11 @@ export const ScheduleEntrySchema = z.object({
     cron: CronSchema,
     /** Whether this schedule is active */
     enabled: z.boolean().default(true),
-    /** Evaluation mode */
-    mode: z.enum(["agentic", "baseline", "full", "observed"]).default("baseline"),
+    /**
+     * Evaluation mode — accepts both canonical and legacy names.
+     * Legacy names must pass through normalizeMode() before entering typed pipeline code.
+     */
+    mode: z.enum(RAW_EVAL_MODES).default("baseline"),
     /** Human-readable schedule name (used as report tag) */
     name: z
         .string()

package/dist/_vendor/ailf-core/schemas/sinks.d.ts CHANGED Viewed

@@ -17,10 +17,10 @@
 import { z } from "zod";
 /** All supported sink types as a Zod union. */
 export declare const SinkTypeSchema: z.ZodEnum<{
-    webhook: "webhook";
     bigquery: "bigquery";
     "github-comment": "github-comment";
     slack: "slack";
+    webhook: "webhook";
 }>;
 /** Supported sink type string literal union. */
 export type SinkType = z.infer<typeof SinkTypeSchema>;

package/dist/_vendor/ailf-core/services/comparison-formatters.js CHANGED Viewed

@@ -25,12 +25,21 @@ export function formatComparisonMarkdown(report) {
     lines.push("");
     lines.push(`**Overall: ${Math.round(report.baseline.overall.avgScore)} → ${Math.round(report.experiment.overall.avgScore)}** (${overallIcon} ${deltaStr(overall)})`);
     lines.push("");
-    // Per-area table
-    lines.push("| Feature | Baseline | Current | Delta | Task | Code | Docs |");
-    lines.push("|---------|----------|---------|-------|------|------|------|");
+    // Derive dimension columns from the first area's keys (all areas share the
+    // same scoring profile, so the key set is uniform).
+    const dimKeys = report.areas.length > 0
+        ? Object.keys(report.areas[0].dimensions)
+        : Object.keys(report.deltas.perDimension);
+    // Per-area table — columns are dynamic
+    const dimHeaders = dimKeys.map(kebabToTitleCase);
+    const headerRow = ["Feature", "Baseline", "Current", "Delta", ...dimHeaders];
+    const separatorRow = headerRow.map(() => "------");
+    lines.push(`| ${headerRow.join(" | ")} |`);
+    lines.push(`|${separatorRow.join("|")}|`);
     for (const a of report.areas) {
         const icon = changeIcon(a.change);
-        lines.push(`| ${a.area} | ${a.baseline} | ${a.experiment} | ${icon} ${deltaStr(a.delta)} | ${deltaStr(a.dimensions.taskCompletion.delta)} | ${deltaStr(a.dimensions.codeCorrectness.delta)} | ${deltaStr(a.dimensions.docCoverage.delta)} |`);
+        const dimCells = dimKeys.map((k) => deltaStr(a.dimensions[k]?.delta ?? 0));
+        lines.push(`| ${a.area} | ${a.baseline} | ${a.experiment} | ${icon} ${deltaStr(a.delta)} | ${dimCells.join(" | ")} |`);
     }
     lines.push("");
     // Summary
@@ -55,9 +64,9 @@ export function formatComparisonMarkdown(report) {
     const dim = report.deltas.perDimension;
     lines.push("| Dimension | Delta |");
     lines.push("|-----------|-------|");
-    lines.push(`| Task Completion | ${deltaStr(dim.taskCompletion)} |`);
-    lines.push(`| Code Correctness | ${deltaStr(dim.codeCorrectness)} |`);
-    lines.push(`| Doc Coverage | ${deltaStr(dim.docCoverage)} |`);
+    for (const k of Object.keys(dim)) {
+        lines.push(`| ${kebabToTitleCase(k)} | ${deltaStr(dim[k])} |`);
+    }
     lines.push(`| Doc Lift | ${deltaStr(report.deltas.docLift)} |`);
     if (report.deltas.cost !== undefined) {
         const costStr = report.deltas.cost > 0
@@ -91,29 +100,51 @@ export function formatComparisonTable(report) {
             : "unchanged");
     lines.push(`  Overall: ${Math.round(report.baseline.overall.avgScore)} → ${Math.round(report.experiment.overall.avgScore)} (${overallIcon} ${deltaStr(overall)})`);
     lines.push("");
-    // Per-dimension averages
+    // Per-dimension averages — derived dynamically from the report
     const dim = report.deltas.perDimension;
+    const dimKeys = report.areas.length > 0
+        ? Object.keys(report.areas[0].dimensions)
+        : Object.keys(dim);
     lines.push("  Dimension averages:");
-    lines.push(`    Task Completion:  ${deltaStr(dim.taskCompletion)}`);
-    lines.push(`    Code Correctness: ${deltaStr(dim.codeCorrectness)}`);
-    lines.push(`    Doc Coverage:     ${deltaStr(dim.docCoverage)}`);
-    lines.push(`    Doc Lift:         ${deltaStr(report.deltas.docLift)}`);
+    // Pad labels to the longest dimension label for alignment
+    const dimLabels = dimKeys.map(kebabToTitleCase);
+    // +1 for the colon appended to each label
+    const maxLabelLen = Math.max(...dimLabels.map((l) => l.length + 1), "Doc Lift:".length);
+    for (let i = 0; i < dimKeys.length; i++) {
+        lines.push(`    ${(dimLabels[i] + ":").padEnd(maxLabelLen)} ${deltaStr(dim[dimKeys[i]] ?? 0)}`);
+    }
+    lines.push(`    ${"Doc Lift:".padEnd(maxLabelLen)} ${deltaStr(report.deltas.docLift)}`);
     if (report.deltas.cost !== undefined) {
-        lines.push(`    Cost:             ${report.deltas.cost > 0 ? "+" : ""}$${report.deltas.cost.toFixed(4)}`);
+        lines.push(`    ${"Cost:".padEnd(maxLabelLen)} ${report.deltas.cost > 0 ? "+" : ""}$${report.deltas.cost.toFixed(4)}`);
     }
     lines.push("");
-    // Per-area table
+    // Per-area table — columns are dynamic
     lines.push("-".repeat(80));
     lines.push("PER-AREA BREAKDOWN");
     lines.push("-".repeat(80));
     lines.push("");
-    const h = "| Feature Area        | Baseline | Experiment | Delta | Task | Code | Docs |";
-    const sep = "|---------------------|----------|------------|-------|------|------|------|";
-    lines.push(h);
-    lines.push(sep);
+    const dimHeaders = dimKeys.map(kebabToTitleCase);
+    const colWidths = dimHeaders.map((h) => Math.max(h.length, 4));
+    const hCols = [
+        "Feature Area".padEnd(19),
+        "Baseline".padStart(8),
+        "Experiment".padStart(10),
+        "Delta".padStart(5),
+        ...dimHeaders.map((h, i) => h.padStart(colWidths[i])),
+    ];
+    const sepCols = [
+        "-".repeat(21),
+        "-".repeat(10),
+        "-".repeat(12),
+        "-".repeat(7),
+        ...colWidths.map((w) => "-".repeat(w + 2)),
+    ];
+    lines.push(`| ${hCols.join(" | ")} |`);
+    lines.push(`|${sepCols.join("|")}|`);
     for (const a of report.areas) {
         const icon = changeIcon(a.change);
-        lines.push(`| ${icon} ${a.area.padEnd(17)} | ${String(a.baseline).padStart(8)} | ${String(a.experiment).padStart(10)} | ${deltaStr(a.delta).padStart(5)} | ${deltaStr(a.dimensions.taskCompletion.delta).padStart(4)} | ${deltaStr(a.dimensions.codeCorrectness.delta).padStart(4)} | ${deltaStr(a.dimensions.docCoverage.delta).padStart(4)} |`);
+        const dimCells = dimKeys.map((k, i) => deltaStr(a.dimensions[k]?.delta ?? 0).padStart(colWidths[i]));
+        lines.push(`| ${icon} ${a.area.padEnd(17)} | ${String(a.baseline).padStart(8)} | ${String(a.experiment).padStart(10)} | ${deltaStr(a.delta).padStart(5)} | ${dimCells.join(" | ")} |`);
     }
     lines.push("");
     // Classification summary
@@ -187,3 +218,10 @@ function deltaStr(d) {
         return `${Math.round(d)}`;
     return "0";
 }
+/**
+ * Convert a kebab-case dimension name to title case
+ * (e.g. 'task-completion' → 'Task Completion').
+ *
+ * Only hyphens act as word separators, and only the first character of each
+ * segment is upper-cased; the remainder of the segment is left untouched.
+ * A camelCase key such as 'taskCompletion' therefore comes back as
+ * 'TaskCompletion' rather than 'Task Completion'.
+ *
+ * @param name - Dimension key to format; expected to be a string, since
+ *   `split` is called on it directly.
+ * @returns The capitalized segments joined with single spaces.
+ */
+function kebabToTitleCase(name) {
+    return name
+        .split("-")
+        .map((w) => w.charAt(0).toUpperCase() + w.slice(1))
+        .join(" ");
+}

package/dist/_vendor/ailf-core/services/index.d.ts CHANGED Viewed

@@ -7,6 +7,7 @@
  * Extracted from packages/eval/src/lib/ during the Ports & Adapters
  * migration (Phase 4e).
  */
-export { classifyRubric, detectFeatureArea, extractUrlMetadata, mergeScores, parseRubricScore, } from "./scoring.js";
+export { classifyRubric, detectFeatureArea, extractDimensions, extractUrlMetadata, mergeScores, parseRubricScore, } from "./scoring.js";
 export { formatComparisonMarkdown, formatComparisonTable, } from "./comparison-formatters.js";
+export { aggregateAreas, aggregateDimensions, computeEnsembleScore, computeTaskScore, normalizeScore, type AggregationStrategy, type AreaScore, type AssertionScore, type DimensionScore, type EnsembleGradingConfig, type GraderTransitionConfig, type TaskScore, type TaskScoreOptions, } from "./scoring-engine.js";
 export { extractModelName, extractProvider, mergeConfig, modelMatchesMode, } from "./config-helpers.js";

package/dist/_vendor/ailf-core/services/index.js CHANGED Viewed

@@ -7,6 +7,7 @@
  * Extracted from packages/eval/src/lib/ during the Ports & Adapters
  * migration (Phase 4e).
  */
-export { classifyRubric, detectFeatureArea, extractUrlMetadata, mergeScores, parseRubricScore, } from "./scoring.js";
+export { classifyRubric, detectFeatureArea, extractDimensions, extractUrlMetadata, mergeScores, parseRubricScore, } from "./scoring.js";
 export { formatComparisonMarkdown, formatComparisonTable, } from "./comparison-formatters.js";
+export { aggregateAreas, aggregateDimensions, computeEnsembleScore, computeTaskScore, normalizeScore, } from "./scoring-engine.js";
 export { extractModelName, extractProvider, mergeConfig, modelMatchesMode, } from "./config-helpers.js";

package/dist/_vendor/ailf-core/services/scoring-engine.d.ts ADDED Viewed

@@ -0,0 +1,153 @@
+/**
+ * 4-tier scoring engine — unified scoring across all evaluation modes.
+ *
+ * Tier 1: Assertion-level (atomic pass/fail + optional numeric score)
+ * Tier 2: Dimension-level (aggregated per scoring dimension)
+ * Tier 3: Task-level (weighted composite of dimensions)
+ * Tier 4: Suite/Area-level (aggregated across tasks)
+ *
+ * This engine is mode-agnostic — it works for literacy, MCP server,
+ * agent harness, knowledge probe, and custom modes.
+ *
+ * @see docs/design-docs/architecture-overhaul/scoring-rubrics-assertions.md
+ */
+/**
+ * The result of a single assertion evaluation.
+ *
+ * This is the Tier 1 (assertion-level) record of the scoring engine.
+ */
+export interface AssertionScore {
+    /** Whether the assertion passed */
+    pass: boolean;
+    /** Numeric score in [0, 1], null if not applicable */
+    score: number | null;
+    /** Human-readable explanation */
+    reason: string;
+    /** Assertion type that produced this result */
+    assertionType: string;
+    /** Dimension this assertion contributes to */
+    dimension: string;
+    /** Wall-clock grading time in ms */
+    latencyMs: number;
+    /** Weight of this assertion (1.0 if unspecified) */
+    weight: number;
+}
+/**
+ * Aggregation strategy for dimension scoring.
+ *
+ * Recorded on each DimensionScore (`aggregation`) and accepted by
+ * aggregateDimensions() as `options.defaultAggregation`.
+ */
+export type AggregationStrategy = "max" | "mean" | "min" | "weighted-mean";
+/**
+ * Aggregated score for a scoring dimension.
+ *
+ * This is the Tier 2 (dimension-level) record of the scoring engine and the
+ * element type returned by aggregateDimensions().
+ */
+export interface DimensionScore {
+    /** Dimension identifier (e.g., "code-correctness") */
+    dimensionId: string;
+    /** Human-readable label */
+    label: string;
+    /** Aggregated score in [0, 1] */
+    score: number;
+    /** How many assertions contributed */
+    assertionCount: number;
+    /** How many assertions passed */
+    passCount: number;
+    /** Aggregation method used */
+    aggregation: AggregationStrategy;
+    /** Individual assertion results */
+    assertions: AssertionScore[];
+}
+/**
+ * Aggregate assertion scores into dimension scores.
+ *
+ * Groups assertions by dimension, then applies the configured aggregation
+ * strategy (default: weighted-mean).
+ *
+ * @param assertions - Tier 1 assertion results to roll up.
+ * @param options - Optional overrides.
+ *   `defaultAggregation` selects the strategy to apply.
+ *   NOTE(review): `dimensionLabels` presumably maps a dimension id to the
+ *   human-readable `label` placed on the result — confirm against the
+ *   implementation, which is not visible in this declaration file.
+ * @returns The Tier 2 dimension scores.
+ */
+export declare function aggregateDimensions(assertions: AssertionScore[], options?: {
+    defaultAggregation?: AggregationStrategy;
+    dimensionLabels?: Record<string, string>;
+}): DimensionScore[];
+/**
+ * Weighted composite score for a task.
+ *
+ * This is the Tier 3 (task-level) record of the scoring engine; it is the
+ * return type of computeTaskScore() and the input element type of
+ * aggregateAreas().
+ */
+export interface TaskScore {
+    /** Task identifier */
+    taskId: string;
+    /** Feature area (e.g., "groq", "studio"). When absent, aggregateAreas() falls back to taskId prefix. */
+    area?: string;
+    /** Weighted composite score in [0, 1] */
+    score: number;
+    /** Per-dimension breakdown */
+    dimensions: DimensionScore[];
+    /** Weight configuration used */
+    weights: Record<string, number>;
+    /** Source of weights (default profile, task override, etc.) */
+    weightSource: string;
+    /** Whether the task met its quality threshold */
+    passesThreshold: boolean;
+    /** The threshold compared against */
+    threshold: number;
+    /** Warnings about potential misconfiguration (e.g., no dimensions matched weights) */
+    warnings?: string[];
+}
+/**
+ * Options for computing a task score.
+ *
+ * Passed as the `options` argument of computeTaskScore(). Defaults applied to
+ * the optional fields are not visible in this declaration file.
+ */
+export interface TaskScoreOptions {
+    /** Task identifier */
+    taskId: string;
+    /** Feature area (e.g., "groq", "studio"). Falls back to taskId prefix if omitted. */
+    area?: string;
+    /** Dimension weights (must sum to ~1.0) */
+    weights: Record<string, number>;
+    /** Where the weights came from (for traceability) */
+    weightSource?: string;
+    /** Quality threshold (0-1) for pass/fail gate */
+    threshold?: number;
+}
+/**
+ * Compute a weighted task score from dimension scores.
+ *
+ * @param dimensions - Tier 2 dimension scores for the task.
+ * @param options - Task identity, dimension weights, and the optional
+ *   threshold / weight-source metadata (see TaskScoreOptions).
+ * @returns The Tier 3 task score.
+ */
+export declare function computeTaskScore(dimensions: DimensionScore[], options: TaskScoreOptions): TaskScore;
+/**
+ * Aggregated score across tasks in a feature area.
+ *
+ * This is the Tier 4 (suite/area-level) record of the scoring engine and the
+ * element type returned by aggregateAreas().
+ */
+export interface AreaScore {
+    /** Area identifier (e.g., "groq", "studio") */
+    areaId: string;
+    /** Mean task score */
+    score: number;
+    /** Number of tasks evaluated */
+    taskCount: number;
+    /** Number of tasks passing threshold */
+    passingTaskCount: number;
+    /** Per-task breakdown */
+    tasks: TaskScore[];
+    /** Trend vs previous evaluation */
+    delta: number | null;
+}
+/**
+ * Aggregate task scores into area scores.
+ *
+ * @param tasks - Tier 3 task scores to roll up.
+ * @param previousScores - Optional scores from an earlier evaluation.
+ *   NOTE(review): presumably keyed by area id and used to fill
+ *   `AreaScore.delta` — confirm against the implementation, which is not
+ *   visible in this declaration file.
+ * @returns The Tier 4 area scores.
+ */
+export declare function aggregateAreas(tasks: TaskScore[], previousScores?: Record<string, number>): AreaScore[];
+/**
+ * Normalize an assertion score to [0, 1] range.
+ *
+ * Different assertion types produce scores in different ranges:
+ * - Boolean (contains, equals, regex): 0 or 1
+ * - LLM rubric: 0-100 (needs /100)
+ * - similar: 0-1 (already normalized)
+ * - javascript/python: user-defined (assumed 0-1)
+ *
+ * @param rawScore - Score as reported by the assertion, in that assertion
+ *   type's native range.
+ * @param assertionType - Name of the assertion type that produced `rawScore`.
+ * @returns The score expressed in the [0, 1] range.
+ */
+export declare function normalizeScore(rawScore: number, assertionType: string): number;
+/**
+ * Grader transition configuration for gradual migration.
+ *
+ * NOTE(review): the `new_` field carries a trailing underscore — confirm
+ * this matches the key used by whatever produces this config.
+ */
+export interface GraderTransitionConfig {
+    /** Current (old) grader model */
+    old: string;
+    /** New grader model to transition to */
+    new_: string;
+    /** ISO date after which old grader is retired */
+    expiration: string;
+    /** Whether to run both graders in parallel */
+    parallel: boolean;
+}
+/**
+ * Ensemble grading configuration.
+ *
+ * The `aggregation` values are the same set accepted by
+ * computeEnsembleScore(). They are not the AggregationStrategy union:
+ * "median" is available here, "min" and "weighted-mean" are not.
+ */
+export interface EnsembleGradingConfig {
+    /** Whether ensemble grading is enabled */
+    enabled: boolean;
+    /** Grader models to use */
+    models: string[];
+    /** Aggregation strategy for ensemble scores */
+    aggregation: "max" | "mean" | "median";
+}
+/**
+ * Compute ensemble score from multiple grader outputs.
+ *
+ * @param scores - One numeric score per grader output.
+ * @param aggregation - How to combine the scores. The default applied when
+ *   omitted is not visible in this declaration file.
+ * @returns An object with the combined `score` and an `agreement` value.
+ *   NOTE(review): `agreement` is presumably a measure of how closely the
+ *   graders agree; its range and formula should be confirmed against the
+ *   implementation.
+ */
+export declare function computeEnsembleScore(scores: number[], aggregation?: "max" | "mean" | "median"): {
+    score: number;
+    agreement: number;
+};