npm - @sanity/ailf - Versions diffs - 0.5.0 → 2.0.0 - Mend

@sanity/ailf 0.5.0 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (377) hide show

package/dist/_vendor/ailf-core/services/scoring-engine.js ADDED Viewed

@@ -0,0 +1,237 @@
+/**
+ * 4-tier scoring engine — unified scoring across all evaluation modes.
+ *
+ * Tier 1: Assertion-level (atomic pass/fail + optional numeric score)
+ * Tier 2: Dimension-level (aggregated per scoring dimension)
+ * Tier 3: Task-level (weighted composite of dimensions)
+ * Tier 4: Suite/Area-level (aggregated across tasks)
+ *
+ * This engine is mode-agnostic — it works for literacy, MCP server,
+ * agent harness, knowledge probe, and custom modes.
+ *
+ * @see docs/design-docs/architecture-overhaul/scoring-rubrics-assertions.md
+ */
+/**
+ * Roll assertion results up into one summary per scoring dimension.
+ *
+ * Assertions are bucketed by their `dimension`; a falsy dimension lands in
+ * the "uncategorized" bucket. Every bucket is scored with the strategy in
+ * `options.defaultAggregation` ("weighted-mean" when omitted) and labelled
+ * from `options.dimensionLabels`, falling back to the dimension id itself.
+ * The summaries are returned sorted by dimension id.
+ */
+export function aggregateDimensions(assertions, options) {
+    const strategy = options?.defaultAggregation ?? "weighted-mean";
+    const labelFor = options?.dimensionLabels ?? {};
+    // Bucket the assertions, preserving their original order inside a bucket
+    const byDimension = new Map();
+    for (const assertion of assertions) {
+        const key = assertion.dimension || "uncategorized";
+        if (!byDimension.has(key)) {
+            byDimension.set(key, []);
+        }
+        byDimension.get(key).push(assertion);
+    }
+    // One summary record per bucket, in insertion order
+    const summaries = Array.from(byDimension, ([dimensionId, members]) => {
+        const score = aggregateScores(members, strategy);
+        return {
+            dimensionId,
+            label: labelFor[dimensionId] ?? dimensionId,
+            score,
+            assertionCount: members.length,
+            passCount: members.filter((member) => member.pass).length,
+            aggregation: strategy,
+            assertions: members,
+        };
+    });
+    summaries.sort((left, right) => left.dimensionId.localeCompare(right.dimensionId));
+    return summaries;
+}
+/**
+ * Compute a weighted task score from dimension scores.
+ *
+ * Each dimension contributes `dim.score * weight`, where the weight is
+ * looked up in `options.weights` by `dimensionId` (missing key → 0).
+ * Dimensions whose weight is 0 are skipped outright, so an unweighted
+ * dimension can never influence the result. When the applied weights are
+ * positive but do not sum to 1 (tolerance 0.001) the sum is divided by the
+ * total weight. When the total weight is 0 the score is 0 and, if any
+ * dimensions were supplied, a warning is attached.
+ *
+ * @param dimensions - Dimension summaries; `dimensionId` and `score` are read.
+ * @param options - `weights` and `taskId` are required; `threshold`
+ *   (default 0.5), `area` and `weightSource` (default "default") are optional.
+ * @returns The task score record. `area` is present only when
+ *   `options.area` is truthy, and `warnings` only when at least one
+ *   warning was produced.
+ */
+export function computeTaskScore(dimensions, options) {
+    const { weights, taskId } = options;
+    const threshold = options.threshold ?? 0.5;
+    const warnings = [];
+    // Weighted sum
+    let score = 0;
+    let totalWeight = 0;
+    for (const dim of dimensions) {
+        const weight = weights[dim.dimensionId] ?? 0;
+        // `NaN * 0` is NaN, so a zero-weight dimension with an unusable score
+        // would otherwise poison the whole sum despite carrying no weight.
+        if (weight === 0)
+            continue;
+        score += dim.score * weight;
+        totalWeight += weight;
+    }
+    // Warn when no dimensions match any weight key — likely misconfiguration
+    if (totalWeight === 0 && dimensions.length > 0) {
+        const dimIds = dimensions.map((d) => d.dimensionId).join(", ");
+        const weightKeys = Object.keys(weights).join(", ");
+        warnings.push(`Task "${taskId}": no dimensions matched weight keys. ` +
+            `Dimensions: [${dimIds}], weights: [${weightKeys}]. Score will be 0.`);
+    }
+    // Normalize if weights don't sum to 1 (guard against NaN when totalWeight is 0)
+    if (totalWeight > 0 && Math.abs(totalWeight - 1.0) > 0.001) {
+        score = score / totalWeight;
+    }
+    else if (totalWeight === 0) {
+        score = 0;
+    }
+    return {
+        taskId,
+        ...(options.area ? { area: options.area } : {}),
+        score,
+        dimensions,
+        weights,
+        weightSource: options.weightSource ?? "default",
+        passesThreshold: score >= threshold,
+        threshold,
+        ...(warnings.length > 0 ? { warnings } : {}),
+    };
+}
+/**
+ * Roll task scores up into one summary per area.
+ *
+ * A task's area is its `area` field when that is not null/undefined,
+ * otherwise whatever `extractArea(task.taskId)` yields. An area's score is
+ * the plain average of its tasks' scores. `delta` is the difference from
+ * `previousScores[areaId]`, or null when no previous score is available.
+ * The summaries are returned sorted by area id.
+ */
+export function aggregateAreas(tasks, previousScores) {
+    // Bucket tasks by area, keeping input order inside each bucket
+    const tasksByArea = new Map();
+    for (const task of tasks) {
+        const key = task.area ?? extractArea(task.taskId);
+        const bucket = tasksByArea.get(key);
+        if (bucket === undefined) {
+            tasksByArea.set(key, [task]);
+        }
+        else {
+            bucket.push(task);
+        }
+    }
+    const summaries = [];
+    tasksByArea.forEach((members, areaId) => {
+        let total = 0;
+        let passing = 0;
+        for (const member of members) {
+            total += member.score;
+            if (member.passesThreshold) {
+                passing += 1;
+            }
+        }
+        const score = members.length > 0 ? total / members.length : 0;
+        const baseline = previousScores?.[areaId] ?? null;
+        let delta = null;
+        if (baseline !== null) {
+            delta = score - baseline;
+        }
+        summaries.push({
+            areaId,
+            score,
+            taskCount: members.length,
+            passingTaskCount: passing,
+            tasks: members,
+            delta,
+        });
+    });
+    return summaries.sort((left, right) => left.areaId.localeCompare(right.areaId));
+}
+// ---------------------------------------------------------------------------
+// Score normalization
+// ---------------------------------------------------------------------------
+/**
+ * Normalize an assertion score to the [0, 1] range.
+ *
+ * How the raw score is read depends on the assertion type:
+ * - LLM-graded ("g-eval", "llm-rubric", "model-graded-closedqa",
+ *   "model-graded-factuality"): a value above 1 is treated as being on a
+ *   0-100 scale and divided by 100; a value of 1 or below is taken as
+ *   already being on a 0-1 scale. A raw 1 is therefore read as a perfect
+ *   score, never as 1/100.
+ * - "similar": already 0-1, only clamped.
+ * - Boolean ("contains", "contains-all", "contains-any", "equals",
+ *   "is-json", "regex"): any positive value becomes 1, everything else 0.
+ * - Any other type (custom javascript/python assertions): clamped.
+ *
+ * Every branch clamps, so out-of-range grader output such as -5 or 150
+ * cannot leak past this function. NaN is reported as 0.
+ *
+ * @param rawScore - Score as produced by the assertion.
+ * @param assertionType - Assertion type name selecting the rule above.
+ * @returns The score mapped into [0, 1].
+ */
+export function normalizeScore(rawScore, assertionType) {
+    // NaN survives Math.min/Math.max, so settle it up front — 0 is what the
+    // boolean branch already yields for NaN.
+    if (Number.isNaN(rawScore)) {
+        return 0;
+    }
+    const clamp = (value) => Math.max(0, Math.min(1, value));
+    switch (assertionType) {
+        case "g-eval":
+        case "llm-rubric":
+        case "model-graded-closedqa":
+        case "model-graded-factuality":
+            // LLM rubrics typically return 0-100
+            return clamp(rawScore > 1 ? rawScore / 100 : rawScore);
+        case "contains":
+        case "contains-all":
+        case "contains-any":
+        case "equals":
+        case "is-json":
+        case "regex":
+            // Boolean assertions: 0 or 1
+            return rawScore > 0 ? 1 : 0;
+        case "similar":
+        default:
+            // Similarity and custom assertions: clamp to [0, 1]
+            return clamp(rawScore);
+    }
+}
+/**
+ * Compute an ensemble score from multiple grader outputs.
+ *
+ * Supported aggregations are "mean" (the default), "median", "min" and
+ * "max". Any other value falls back to the mean, so `score` is always a
+ * number rather than `undefined`.
+ *
+ * `agreement` is `1 - stdDev`, floored at 0, where `stdDev` is the
+ * population standard deviation of the scores. The deviation is not
+ * rescaled: when every score lies in [0, 1] it cannot exceed 0.5, so
+ * agreement then stays within [0.5, 1].
+ *
+ * @param scores - Grader scores. An empty list yields
+ *   `{ score: 0, agreement: 0 }`; a single score yields that score with
+ *   agreement 1, whatever the aggregation.
+ * @param aggregation - Aggregation name, "mean" when omitted.
+ * @returns `{ score, agreement }`.
+ */
+export function computeEnsembleScore(scores, aggregation = "mean") {
+    if (scores.length === 0)
+        return { score: 0, agreement: 0 };
+    if (scores.length === 1)
+        return { score: scores[0], agreement: 1 };
+    // Needed for the agreement figure in every case, so compute it once
+    const mean = scores.reduce((a, b) => a + b, 0) / scores.length;
+    let score;
+    switch (aggregation) {
+        case "median": {
+            // Sort a copy — the caller's array must not be reordered
+            const sorted = [...scores].sort((a, b) => a - b);
+            const mid = Math.floor(sorted.length / 2);
+            score =
+                sorted.length % 2 === 0
+                    ? (sorted[mid - 1] + sorted[mid]) / 2
+                    : sorted[mid];
+            break;
+        }
+        case "min":
+            score = Math.min(...scores);
+            break;
+        case "max":
+            score = Math.max(...scores);
+            break;
+        case "mean":
+        default:
+            // Unknown aggregation names fall back to the documented default
+            score = mean;
+            break;
+    }
+    // Agreement: 1 - population standard deviation (see doc comment)
+    const variance = scores.reduce((sum, s) => sum + (s - mean) ** 2, 0) / scores.length;
+    const stdDev = Math.sqrt(variance);
+    const agreement = Math.max(0, 1 - stdDev);
+    return { score, agreement };
+}
+// ---------------------------------------------------------------------------
+// Helpers
+// ---------------------------------------------------------------------------
+/**
+ * Collapse a group of assertions into a single number.
+ *
+ * Only assertions carrying a finite numeric `score` take part. When none
+ * does, the result is the group's pass rate (0 for an empty group).
+ *
+ * Strategies: "mean", "weighted-mean", "min", "max". An unrecognised
+ * strategy is treated as "weighted-mean" so the function always returns a
+ * number. In a weighted mean an assertion without a finite `weight`
+ * counts with weight 1, and a total weight of 0 degrades to the plain mean.
+ *
+ * @param assertions - Assertion results; `score`, `weight` and `pass` are read.
+ * @param strategy - Aggregation strategy name.
+ * @returns The aggregated score.
+ */
+function aggregateScores(assertions, strategy) {
+    // `score` may be null, undefined or NaN for unscored assertions — any of
+    // those would turn the sums below into NaN, so keep finite numbers only.
+    const scored = assertions.filter((a) => Number.isFinite(a.score));
+    if (scored.length === 0) {
+        // Fall back to pass rate
+        return assertions.length > 0
+            ? assertions.filter((a) => a.pass).length / assertions.length
+            : 0;
+    }
+    const plainMean = () => scored.reduce((sum, a) => sum + a.score, 0) / scored.length;
+    switch (strategy) {
+        case "mean":
+            return plainMean();
+        case "min":
+            return Math.min(...scored.map((a) => a.score));
+        case "max":
+            return Math.max(...scored.map((a) => a.score));
+        case "weighted-mean":
+        default: {
+            const weightOf = (a) => (Number.isFinite(a.weight) ? a.weight : 1);
+            const totalWeight = scored.reduce((sum, a) => sum + weightOf(a), 0);
+            if (totalWeight === 0) {
+                return plainMean();
+            }
+            return (scored.reduce((sum, a) => sum + a.score * weightOf(a), 0) / totalWeight);
+        }
+    }
+}
+/**
+ * Derive an area name from a task ID.
+ *
+ * The area is everything before the first hyphen
+ * (e.g., "groq-blog-queries" → "groq"); when that prefix is empty the
+ * area is "general". Multi-word areas are not recovered:
+ * "content-lake-queries" gives "content", not "content-lake".
+ *
+ * TODO: Use explicit area metadata from task definitions instead of parsing taskId.
+ */
+function extractArea(taskId) {
+    const hyphenAt = taskId.indexOf("-");
+    const prefix = hyphenAt === -1 ? taskId : taskId.slice(0, hyphenAt);
+    if (prefix === "") {
+        return "general";
+    }
+    return prefix;
+}

package/dist/_vendor/ailf-core/services/scoring.d.ts CHANGED Viewed

@@ -9,13 +9,26 @@
  */
 import type { FeatureScore } from "../types/index.js";
 import type { ActualScoreEntry, ComponentResult, TestResult, UrlMetadata } from "../types/scoring-input.js";
+/**
+ * Extract dimension names from a scoring profile's weight map.
+ *
+ * Scoring profiles (defined in config/rubrics.ts) map dimension names
+ * to numeric weights. This function returns those dimension names so
+ * callers can work with dynamic dimensions instead of hardcoded ones.
+ *
+ * @param profile - Weight map keyed by dimension name.
+ * @returns The keys of `profile`, as produced by `Object.keys(profile)`.
+ */
+export declare function extractDimensions(profile: Record<string, number>): string[];
 /**
  * Classify a grading component into a scoring dimension.
  *
  * Prefers structured metadata (Approach 5) over heuristic string matching.
- * Returns null if the component doesn't map to a known dimension.
+ * Returns the dimension as a kebab-case string, or null if the component
+ * doesn't map to any dimension.
+ *
+ * Returns `string | null` so non-literacy scoring profiles (MCP, agent,
+ * knowledge-probe) can define arbitrary dimension names in metadata
+ * without requiring changes here.
+ *
+ * Note: a truthy `metadata.dimension` is returned unchanged, so it is
+ * kebab-case only if the metadata author wrote it that way. Only the
+ * heuristic fallback ("task-completion", "code-correctness",
+ * "doc-coverage") is guaranteed to be kebab-case.
  */
-export declare function classifyRubric(component: ComponentResult): "codeCorrectness" | "docCoverage" | "taskCompletion" | null;
+export declare function classifyRubric(component: ComponentResult): string | null;
 /**
  * Detect the feature area from a test description string.
  *

package/dist/_vendor/ailf-core/services/scoring.js CHANGED Viewed

@@ -8,40 +8,50 @@
  * the Ports & Adapters migration (Phase 4e).
  */
 // ---------------------------------------------------------------------------
+// Dimension extraction
+// ---------------------------------------------------------------------------
+/**
+ * List the dimension names a scoring profile defines.
+ *
+ * A scoring profile (see config/rubrics.ts) is a map from dimension name
+ * to numeric weight; its keys are the dimension names. Callers use this
+ * to stay independent of any hardcoded dimension list.
+ */
+export function extractDimensions(profile) {
+    const dimensionNames = Object.keys(profile);
+    return dimensionNames;
+}
+// ---------------------------------------------------------------------------
 // Rubric classification
 // ---------------------------------------------------------------------------
 /**
  * Classify a grading component into a scoring dimension.
  *
  * Prefers structured metadata (Approach 5) over heuristic string matching.
- * Returns null if the component doesn't map to a known dimension.
+ * Returns the dimension as a kebab-case string, or null if the component
+ * doesn't map to any dimension.
+ *
+ * Returns `string | null` so non-literacy scoring profiles (MCP, agent,
+ * knowledge-probe) can define arbitrary dimension names in metadata
+ * without requiring changes here.
+ *
+ * @param component - Grading component; only `assertion.metadata.dimension`
+ *   and `assertion.value` are read.
+ * @returns `metadata.dimension` unchanged when it is truthy (so it is
+ *   kebab-case only if the metadata says so); otherwise "task-completion",
+ *   "code-correctness" or "doc-coverage" when the lower-cased
+ *   `assertion.value` contains the matching phrase ("hallucinate" also
+ *   maps to "doc-coverage"); otherwise null.
  */
 export function classifyRubric(component) {
-    // Prefer structured metadata (Approach 5) over heuristic matching
+    // Prefer structured metadata — any dimension name is valid, enabling
+    // non-literacy profiles to pass through names like 'input-validation'
     const metadata = component.assertion?.metadata;
     if (metadata?.dimension) {
-        switch (metadata.dimension) {
-            case "code-correctness":
-                return "codeCorrectness";
-            case "doc-coverage":
-                return "docCoverage";
-            case "task-completion":
-                return "taskCompletion";
-            default:
-                return null;
-        }
+        return metadata.dimension;
     }
     // Fallback: heuristic name matching (for backward compatibility)
     const value = (component.assertion?.value ?? "").toLowerCase();
     if (value.includes("task completion")) {
-        return "taskCompletion";
+        return "task-completion";
     }
     if (value.includes("code correctness")) {
-        return "codeCorrectness";
+        return "code-correctness";
     }
     if (value.includes("documentation coverage") ||
         value.includes("hallucinate")) {
-        return "docCoverage";
+        return "doc-coverage";
     }
     return null;
 }

package/dist/_vendor/ailf-core/types/branded-ids.d.ts ADDED Viewed

@@ -0,0 +1,137 @@
+/**
+ * Branded ID types — nominal typing for entity identifiers.
+ *
+ * All entity IDs use branded types to prevent accidental misuse.
+ * A `TaskId` cannot be passed where a `RunId` is expected, even
+ * though both are strings at runtime.
+ *
+ * Constructor functions validate format and return `Result<T, E>` —
+ * parse-don't-validate at the boundary, then pass branded values
+ * through the pipeline.
+ *
+ * The `Brand` utility and `Result` type are defined here as the
+ * foundation. Existing branded types in the codebase (`ReportId`,
+ * `ISOTimestamp`) use inline branding — those will be migrated to
+ * use this utility in Phase 7.
+ *
+ * @see docs/design-docs/architecture-overhaul/domain-model.md (canonical)
+ * @see docs/design-docs/parse-dont-validate.md (design principle)
+ */
+/** Unique symbol for nominal type branding */
+declare const __brand: unique symbol;
+/**
+ * Brand a base type `T` with a nominal tag `B`.
+ *
+ * At runtime, branded values are identical to their base type.
+ * At compile time, `Brand<string, "TaskId">` is incompatible with
+ * `Brand<string, "RunId">` — preventing accidental ID swaps.
+ */
+export type Brand<T, B extends string> = T & {
+    readonly [__brand]: B;
+};
+/** Unique identifier for an evaluation task */
+export type TaskId = Brand<string, "TaskId">;
+/** URL-safe slug for a task (derived from title) */
+export type TaskSlug = Brand<string, "TaskSlug">;
+/** Unique identifier for an evaluation suite */
+export type SuiteId = Brand<string, "SuiteId">;
+/** Unique identifier for an evaluation run */
+export type RunId = Brand<string, "RunId">;
+/** Content-addressable fingerprint for a run's inputs */
+export type RunFingerprint = Brand<string, "RunFingerprint">;
+/** Unique identifier for a single task × provider result */
+export type ResultId = Brand<string, "ResultId">;
+/** Unique identifier for a trace (observability record) */
+export type TraceId = Brand<string, "TraceId">;
+/**
+ * Unique identifier for a published report (UUID v7).
+ *
+ * Note: An existing `ReportId` branded type is defined in
+ * `packages/core/src/types/index.ts` using inline branding.
+ * This definition uses the `Brand` utility for consistency.
+ * Phase 7 will unify them.
+ */
+export type NewReportId = Brand<string, "ReportId">;
+/** Unique identifier for a provider (LLM, MCP server, agent harness) */
+export type ProviderId = Brand<string, "ProviderId">;
+/** Unique identifier for a prompt template */
+export type PromptId = Brand<string, "PromptId">;
+/** Unique identifier for a rubric scoring template */
+export type RubricId = Brand<string, "RubricId">;
+/** Unique identifier for a fixture (test data) */
+export type FixtureId = Brand<string, "FixtureId">;
+/** Unique identifier for a build artifact */
+export type ArtifactId = Brand<string, "ArtifactId">;
+/**
+ * A success result containing a value.
+ */
+export interface Ok<T> {
+    readonly ok: true;
+    readonly value: T;
+}
+/**
+ * A failure result containing an error.
+ */
+export interface Err<E> {
+    readonly ok: false;
+    readonly error: E;
+}
+/** Discriminated union for parse results — parse-don't-validate pattern */
+export type Result<T, E> = Ok<T> | Err<E>;
+/** Construct a success result */
+export declare function ok<T>(value: T): Ok<T>;
+/** Construct a failure result */
+export declare function err<E>(error: E): Err<E>;
+/** Error returned when an ID string fails format validation */
+export interface IdValidationError {
+    /** Error code identifying the specific validation failure */
+    code: string;
+    /** The raw input that failed validation */
+    raw: string;
+    /** Human-readable error message */
+    message: string;
+}
+/**
+ * Parse a raw string into a `TaskId`.
+ *
+ * Valid format: lowercase alphanumeric + hyphens, 1–128 characters,
+ * starting with a letter or digit (the implementation tests
+ * `/^[a-z0-9][a-z0-9-]{0,127}$/`).
+ * Examples: `"groq-projection-basics"`, `"mcp-server-tools-list"`
+ *
+ * @param raw - Candidate identifier.
+ * @returns `ok(raw)` when the format matches; otherwise an `err` whose
+ *   `code` is `"INVALID_TASK_ID"`.
+ */
+export declare function taskId(raw: string): Result<TaskId, IdValidationError>;
+/**
+ * Parse a raw string into a `RunId`.
+ *
+ * Valid format: `run_` prefix followed by at least 8 alphanumeric
+ * characters, upper or lower case (`/^run_[a-zA-Z0-9]{8,}$/`).
+ *
+ * @param raw - Candidate identifier.
+ * @returns `ok(raw)` when the format matches; otherwise an `err` whose
+ *   `code` is `"INVALID_RUN_ID"`.
+ */
+export declare function runId(raw: string): Result<RunId, IdValidationError>;
+/**
+ * Parse a raw string into a `SuiteId`.
+ *
+ * Valid format: `suite_` prefix followed by at least 4 alphanumeric
+ * characters, upper or lower case (`/^suite_[a-zA-Z0-9]{4,}$/`).
+ *
+ * @param raw - Candidate identifier.
+ * @returns `ok(raw)` when the format matches; otherwise an `err` whose
+ *   `code` is `"INVALID_SUITE_ID"`.
+ */
+export declare function suiteId(raw: string): Result<SuiteId, IdValidationError>;
+/**
+ * Parse a raw string into a `ResultId`.
+ *
+ * Valid format: `res_` prefix followed by at least 8 alphanumeric
+ * characters, upper or lower case (`/^res_[a-zA-Z0-9]{8,}$/`).
+ *
+ * @param raw - Candidate identifier.
+ * @returns `ok(raw)` when the format matches; otherwise an `err` whose
+ *   `code` is `"INVALID_RESULT_ID"`.
+ */
+export declare function resultId(raw: string): Result<ResultId, IdValidationError>;
+/**
+ * Parse a raw string into a `TraceId`.
+ *
+ * Valid format: `trace_` prefix followed by at least 8 alphanumeric
+ * characters, upper or lower case (`/^trace_[a-zA-Z0-9]{8,}$/`).
+ *
+ * @param raw - Candidate identifier.
+ * @returns `ok(raw)` when the format matches; otherwise an `err` whose
+ *   `code` is `"INVALID_TRACE_ID"`.
+ */
+export declare function traceId(raw: string): Result<TraceId, IdValidationError>;
+/**
+ * Parse a raw string into a `ProviderId`.
+ *
+ * Valid format: 1–128 characters drawn from letters, digits, colons,
+ * dots, underscores and hyphens, starting with a letter or digit
+ * (`/^[a-zA-Z0-9][a-zA-Z0-9:._-]{0,127}$/`). Colon-separated segments
+ * such as `"openai:chat:gpt-4o"` are the intended shape, but the pattern
+ * does not require a colon.
+ *
+ * @param raw - Candidate identifier.
+ * @returns `ok(raw)` when the format matches; otherwise an `err` whose
+ *   `code` is `"INVALID_PROVIDER_ID"`.
+ */
+export declare function providerId(raw: string): Result<ProviderId, IdValidationError>;
+/**
+ * Parse a raw string into a `FixtureId`.
+ *
+ * Valid format: lowercase alphanumeric + hyphens, 1–128 characters,
+ * starting with a letter or digit (`/^[a-z0-9][a-z0-9-]{0,127}$/`).
+ *
+ * @param raw - Candidate identifier.
+ * @returns `ok(raw)` when the format matches; otherwise an `err` whose
+ *   `code` is `"INVALID_FIXTURE_ID"`.
+ */
+export declare function fixtureId(raw: string): Result<FixtureId, IdValidationError>;
+export {};

package/dist/_vendor/ailf-core/types/branded-ids.js ADDED Viewed

@@ -0,0 +1,136 @@
+/**
+ * Branded ID types — nominal typing for entity identifiers.
+ *
+ * All entity IDs use branded types to prevent accidental misuse.
+ * A `TaskId` cannot be passed where a `RunId` is expected, even
+ * though both are strings at runtime.
+ *
+ * Constructor functions validate format and return `Result<T, E>` —
+ * parse-don't-validate at the boundary, then pass branded values
+ * through the pipeline.
+ *
+ * The `Brand` utility and `Result` type are defined here as the
+ * foundation. Existing branded types in the codebase (`ReportId`,
+ * `ISOTimestamp`) use inline branding — those will be migrated to
+ * use this utility in Phase 7.
+ *
+ * @see docs/design-docs/architecture-overhaul/domain-model.md (canonical)
+ * @see docs/design-docs/parse-dont-validate.md (design principle)
+ */
+/**
+ * Construct a success result.
+ *
+ * @param value - The parsed value.
+ * @returns `{ ok: true, value }`.
+ */
+export function ok(value) {
+    return { ok: true, value };
+}
+/**
+ * Construct a failure result.
+ *
+ * @param error - The error payload.
+ * @returns `{ ok: false, error }`.
+ */
+export function err(error) {
+    return { ok: false, error };
+}
+// ---------------------------------------------------------------------------
+// Constructor functions — parse-don't-validate
+// ---------------------------------------------------------------------------
+// One accepted-format pattern per ID kind. None carries the `g` or `y`
+// flag, so `.test()` keeps no state between calls.
+const TASK_ID_PATTERN = /^[a-z0-9][a-z0-9-]{0,127}$/;
+const RUN_ID_PATTERN = /^run_[a-zA-Z0-9]{8,}$/;
+const SUITE_ID_PATTERN = /^suite_[a-zA-Z0-9]{4,}$/;
+const RESULT_ID_PATTERN = /^res_[a-zA-Z0-9]{8,}$/;
+const TRACE_ID_PATTERN = /^trace_[a-zA-Z0-9]{8,}$/;
+const PROVIDER_ID_PATTERN = /^[a-zA-Z0-9][a-zA-Z0-9:._-]{0,127}$/;
+const FIXTURE_ID_PATTERN = /^[a-z0-9][a-z0-9-]{0,127}$/;
+/**
+ * Shared body of every ID constructor below.
+ *
+ * These functions sit at the input boundary, so a non-string argument is
+ * reported as an ordinary validation failure instead of surfacing as a
+ * TypeError from a string method.
+ *
+ * @param raw - Candidate identifier (expected to be a string).
+ * @param pattern - Pattern the whole string must match.
+ * @param code - Error code to report on failure.
+ * @param describe - Builds the failure message from the printable input.
+ * @returns `ok(raw)` on a match, otherwise `err({ code, raw, message })`.
+ */
+function parseId(raw, pattern, code, describe) {
+    if (typeof raw === "string" && pattern.test(raw)) {
+        return ok(raw);
+    }
+    // For string input this is the input itself; for anything else it is
+    // the printable form, keeping `error.raw` a string as declared.
+    const shown = String(raw);
+    return err({ code, raw: shown, message: describe(shown) });
+}
+/**
+ * Parse a raw string into a `TaskId`.
+ *
+ * Valid format: lowercase alphanumeric + hyphens, 1–128 characters,
+ * starting with a letter or digit.
+ * Examples: `"groq-projection-basics"`, `"mcp-server-tools-list"`
+ */
+export function taskId(raw) {
+    return parseId(raw, TASK_ID_PATTERN, "INVALID_TASK_ID", (shown) => `Invalid TaskId "${shown}": must be 1–128 lowercase alphanumeric characters or hyphens, starting with a letter or digit`);
+}
+/**
+ * Parse a raw string into a `RunId`.
+ *
+ * Valid format: `run_` prefix followed by at least 8 alphanumeric characters.
+ */
+export function runId(raw) {
+    return parseId(raw, RUN_ID_PATTERN, "INVALID_RUN_ID", (shown) => `Invalid RunId "${shown}": must match run_[a-zA-Z0-9]{8,}`);
+}
+/**
+ * Parse a raw string into a `SuiteId`.
+ *
+ * Valid format: `suite_` prefix followed by at least 4 alphanumeric characters.
+ */
+export function suiteId(raw) {
+    return parseId(raw, SUITE_ID_PATTERN, "INVALID_SUITE_ID", (shown) => `Invalid SuiteId "${shown}": must match suite_[a-zA-Z0-9]{4,}`);
+}
+/**
+ * Parse a raw string into a `ResultId`.
+ *
+ * Valid format: `res_` prefix followed by at least 8 alphanumeric characters.
+ */
+export function resultId(raw) {
+    return parseId(raw, RESULT_ID_PATTERN, "INVALID_RESULT_ID", (shown) => `Invalid ResultId "${shown}": must match res_[a-zA-Z0-9]{8,}`);
+}
+/**
+ * Parse a raw string into a `TraceId`.
+ *
+ * Valid format: `trace_` prefix followed by at least 8 alphanumeric characters.
+ */
+export function traceId(raw) {
+    return parseId(raw, TRACE_ID_PATTERN, "INVALID_TRACE_ID", (shown) => `Invalid TraceId "${shown}": must match trace_[a-zA-Z0-9]{8,}`);
+}
+/**
+ * Parse a raw string into a `ProviderId`.
+ *
+ * Valid format: 1–128 letters, digits, colons, dots, underscores or
+ * hyphens, starting with a letter or digit. Colon-separated segments
+ * (e.g., `"openai:chat:gpt-4o"`) are the intended shape but a colon is
+ * not required.
+ */
+export function providerId(raw) {
+    return parseId(raw, PROVIDER_ID_PATTERN, "INVALID_PROVIDER_ID", (shown) => `Invalid ProviderId "${shown}": must be 1–128 alphanumeric characters, colons, dots, underscores, or hyphens`);
+}
+/**
+ * Parse a raw string into a `FixtureId`.
+ *
+ * Valid format: lowercase alphanumeric + hyphens, 1–128 characters,
+ * starting with a letter or digit.
+ */
+export function fixtureId(raw) {
+    return parseId(raw, FIXTURE_ID_PATTERN, "INVALID_FIXTURE_ID", (shown) => `Invalid FixtureId "${shown}": must be 1–128 lowercase alphanumeric characters or hyphens`);
+}