npm - @sanity/ailf - Versions diffs - 0.2.0 → 0.3.1 - Mend

@sanity/ailf 0.2.0 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (34)

package/config/models.yaml +3 -2
package/dist/_vendor/ailf-core/types/index.d.ts +53 -0
package/dist/composition-root.js +7 -2
package/dist/orchestration/pipeline-orchestrator.js +27 -2
package/dist/orchestration/step-runner.js +8 -0
package/dist/orchestration/steps/calculate-scores-step.js +4 -0
package/dist/orchestration/steps/generate-configs-step.js +1 -0
package/dist/orchestration/steps/grader-consistency-step.js +1 -0
package/dist/orchestration/steps/mirror-repo-tasks-step.js +2 -1
package/dist/pipeline/calculate-scores.d.ts +5 -0
package/dist/pipeline/calculate-scores.js +219 -146
package/dist/pipeline/coverage-audit.d.ts +2 -1
package/dist/pipeline/coverage-audit.js +5 -3
package/dist/pipeline/expand-tasks.d.ts +2 -1
package/dist/pipeline/expand-tasks.js +33 -2
package/dist/pipeline/generate-configs.d.ts +3 -1
package/dist/pipeline/generate-configs.js +47 -28
package/dist/pipeline/grader-api.d.ts +2 -1
package/dist/pipeline/grader-api.js +11 -9
package/dist/pipeline/grader-compare-runner.d.ts +3 -0
package/dist/pipeline/grader-compare-runner.js +21 -19
package/dist/pipeline/grader-consistency-runner.d.ts +3 -0
package/dist/pipeline/grader-consistency-runner.js +16 -14
package/dist/pipeline/grader-sensitivity-runner.d.ts +3 -0
package/dist/pipeline/grader-sensitivity-runner.js +18 -16
package/dist/pipeline/grader-validate-runner.d.ts +3 -0
package/dist/pipeline/grader-validate-runner.js +16 -14
package/dist/pipeline/mirror-repo-tasks.d.ts +3 -1
package/dist/pipeline/mirror-repo-tasks.js +8 -6
package/dist/pipeline/provenance.d.ts +3 -0
package/dist/pipeline/provenance.js +25 -3
package/dist/sources.d.ts +2 -1
package/dist/sources.js +28 -1
package/package.json +3 -3

package/config/models.yaml CHANGED Viewed

@@ -46,10 +46,11 @@ models:
       max_tokens: 4096
     modes: [baseline, observed, agentic-naive, agentic-optimized]
   - id: openai:chat:gpt-5.4
-    label: GPT 5.4 (high)
+    label: GPT 5.4
     config:
-      reasoning_effort: "high"
+      reasoning_effort: "medium"
       max_output_tokens: 4096
+      maxRetries: 1
     modes: [baseline, observed, agentic-naive, agentic-optimized]
   # ── Anthropic ───────────────────────────────────────────────

package/dist/_vendor/ailf-core/types/index.d.ts CHANGED Viewed

@@ -462,6 +462,12 @@ export interface PipelineState {
      * below threshold — this is informational, not a hard failure.
      */
     belowCritical?: string[];
+    /**
+     * Test execution summary. Set by CalculateScoresStep which reads
+     * the eval results and counts passed/failed/errored tests.
+     * Consumed by the orchestrator for the enriched PipelineResult.
+     */
+    testSummary?: TestSummary;
 }
 /**
  * Release auto-scope metadata — which tasks are affected by a content
@@ -484,7 +490,48 @@ export interface ReleaseAutoScope {
     };
 }
 /** Result of a full pipeline run */
+/** Classified failure reason for a pipeline run. */
+export type PipelineFailureReason = {
+    type: "validation";
+    message: string;
+} | {
+    type: "missing-results";
+    message: string;
+} | {
+    type: "step-failed";
+    step: string;
+    message: string;
+} | {
+    type: "all-tests-errored";
+    message: string;
+};
+/** Summary of test execution outcomes. */
+export interface TestSummary {
+    /** Total test cases executed */
+    total: number;
+    /** Tests that passed grading */
+    passed: number;
+    /** Tests that failed grading */
+    failed: number;
+    /** Tests that errored (API timeout, malformed response, etc.) */
+    errored: number;
+    /** Details of errored tests (model, task, error message) */
+    errors?: {
+        model: string;
+        task: string;
+        error: string;
+    }[];
+}
+/** Token usage and estimated cost for a pipeline run. */
+export interface PipelineUsage {
+    totalTokens: number;
+    evalTokens: number;
+    graderTokens: number;
+    estimatedCostUsd?: number;
+}
 export interface PipelineResult {
+    /** Feature areas that scored below the critical threshold. Informational — not a failure. */
+    belowCritical?: string[];
     /** Cache hit/miss statistics for this run */
     cache?: {
         hits: number;
@@ -494,6 +541,8 @@ export interface PipelineResult {
     };
     /** Total duration in milliseconds */
     durationMs: number;
+    /** Classified failure reason (when success is false). */
+    failureReason?: PipelineFailureReason;
     /** @deprecated Use `promptfooUrls` — kept for backward compatibility */
     promptfooUrl?: string;
     /** Per-mode Promptfoo share URLs (one per sub-eval that produced a shareable link) */
@@ -502,6 +551,10 @@ export interface PipelineResult {
     steps: Record<string, StepResult>;
     /** Overall success (all non-skipped steps succeeded) */
     success: boolean;
+    /** Summary of test execution outcomes. */
+    testSummary?: TestSummary;
+    /** Token usage and estimated cost. */
+    usage?: PipelineUsage;
     /** Validation issues found (if any) */
     validation: ValidationResult;
 }

package/dist/composition-root.js CHANGED Viewed

@@ -63,9 +63,14 @@ export function createAppContext(config) {
 function createLogger() {
     if (process.env.AILF_LOG_FORMAT === "json")
         return new JsonLogger();
-    if (process.env.AILF_QUIET === "1")
+    if (process.env.AILF_LOG_LEVEL === "quiet" ||
+        process.env.AILF_QUIET === "1") {
         return new QuietLogger();
-    return new ConsoleLogger({ verbose: process.env.AILF_VERBOSE === "1" });
+    }
+    return new ConsoleLogger({
+        verbose: process.env.AILF_LOG_LEVEL === "verbose" ||
+            process.env.AILF_VERBOSE === "1",
+    });
 }
 function createCache(config) {
     const local = new FilesystemCache(config.rootDir);

package/dist/orchestration/pipeline-orchestrator.js CHANGED Viewed

@@ -77,12 +77,20 @@ export async function orchestratePipeline(ctx, steps) {
     const pipelineStart = Date.now();
     const hasJob = !!ctx.config.jobId;
     ctx.logger.section("ai-literacy-framework — Evaluation Pipeline");
+    ctx.logger.debug(`Pipeline starting with ${steps.length} steps`, {
+        steps: steps.map((s) => s.name),
+        mode: ctx.config.mode,
+        source: ctx.config.source,
+        noCache: ctx.config.noCache,
+        jobId: ctx.config.jobId,
+    });
     // Report initial running status
     if (hasJob) {
         await reportJobProgress(ctx, steps[0]?.name ?? "init", 0, steps.length, "running");
     }
     for (let i = 0; i < steps.length; i++) {
         const step = steps[i];
+        ctx.logger.debug(`Starting step ${i + 1}/${steps.length}: ${step.name}`);
         ctx.logger.section(step.name);
         // Report current step progress
         if (hasJob) {
@@ -97,16 +105,23 @@ export async function orchestratePipeline(ctx, steps) {
         // Fail fast on required step failure
         if (result.status === "failed" && !step.optional) {
             ctx.logger.error(`Pipeline aborted: ${step.name} failed`);
+            const failedError = result.status === "failed" ? result.error : `${step.name} failed`;
             // Report failure to job store
             if (hasJob) {
-                const errorMsg = result.status === "failed" ? result.error : `${step.name} failed`;
                 await reportJobProgress(ctx, step.name, i + 1, steps.length, "failed", {
-                    message: errorMsg,
+                    message: failedError,
                     step: step.name,
                 });
             }
             return {
+                belowCritical: state.belowCritical,
                 durationMs: Date.now() - pipelineStart,
+                failureReason: {
+                    type: "step-failed",
+                    step: step.name,
+                    message: failedError,
+                },
+                promptfooUrls: state.promptfooUrls,
                 steps: results,
                 success: false,
                 validation,
@@ -120,6 +135,13 @@ export async function orchestratePipeline(ctx, steps) {
     const durationMs = Date.now() - pipelineStart;
     ctx.logger.section("Pipeline Complete");
     ctx.logger.info(`All steps completed in ${durationMs}ms`);
+    ctx.logger.debug("Pipeline state at completion", {
+        belowCritical: state.belowCritical,
+        reportId: state.reportId,
+        remoteCacheHits: state.remoteCacheHits
+            ? [...state.remoteCacheHits]
+            : undefined,
+    });
     // Report completion to job store (with reportId from state if available)
     if (hasJob) {
         try {
@@ -145,9 +167,12 @@ export async function orchestratePipeline(ctx, steps) {
         }
     }
     return {
+        belowCritical: state.belowCritical,
         durationMs,
+        promptfooUrls: state.promptfooUrls,
         steps: results,
         success: true,
+        testSummary: state.testSummary,
         validation,
     };
 }

package/dist/orchestration/step-runner.js CHANGED Viewed

@@ -36,10 +36,13 @@ export async function runStep(step, ctx, state = {}) {
     if (canCache) {
         try {
             const inputs = step.cacheInputs(ctx);
+            ctx.logger.debug(`[${step.name}] Cache inputs: ${inputs.length} files`);
             const key = await ctx.cache.computeKey(inputs);
             cacheKey = key;
+            ctx.logger.debug(`[${step.name}] Cache key: ${key}`);
             const cached = await ctx.cache.lookup(step.name, key);
             if (cached.hit) {
+                ctx.logger.debug(`[${step.name}] Cache HIT — skipping execution`);
                 const result = {
                     durationMs: Date.now() - start,
                     status: "success",
@@ -48,11 +51,16 @@ export async function runStep(step, ctx, state = {}) {
                 ctx.logger.step(step.name, result);
                 return result;
             }
+            ctx.logger.debug(`[${step.name}] Cache MISS — executing`);
         }
         catch {
             // Cache lookup failure is non-fatal — proceed to execute
+            ctx.logger.debug(`[${step.name}] Cache lookup failed — proceeding`);
         }
     }
+    else {
+        ctx.logger.debug(`[${step.name}] Cache skipped (${!ctx.cache ? "no cache adapter" : ctx.config.noCache ? "--no-cache" : "no cacheInputs"})`);
+    }
     // 3. Execute
     try {
         const result = await step.execute(ctx, state);

package/dist/orchestration/steps/calculate-scores-step.js CHANGED Viewed

@@ -67,6 +67,7 @@ export class CalculateScoresStep {
         try {
             const result = calculateAndWriteScores({
                 allowedOrigins: ctx.config.allowedOrigins,
+                logger: ctx.logger,
                 mode: ctx.config.mode,
                 resolvedSource,
                 resultsPath: primaryMode !== "baseline"
@@ -77,6 +78,9 @@ export class CalculateScoresStep {
                 source: ctx.config.source,
             });
             belowCritical = result.belowCritical;
+            if (result.testSummary) {
+                state.testSummary = result.testSummary;
+            }
         }
         catch (err) {
             return {

package/dist/orchestration/steps/generate-configs-step.js CHANGED Viewed

@@ -65,6 +65,7 @@ export class GenerateConfigsStep {
                         tags: ctx.config.tags,
                     }
                     : undefined,
+                logger: ctx.logger,
                 resolvedSource,
                 rootDir: ctx.config.rootDir,
                 searchMode: ctx.config.searchMode,

package/dist/orchestration/steps/grader-consistency-step.js CHANGED Viewed

@@ -34,6 +34,7 @@ export class GraderConsistencyStep {
         }
         try {
             await runGraderConsistency({
+                logger: ctx.logger,
                 replications,
                 resultsPath: resolve(ctx.config.rootDir, resultsFile),
                 rootDir: ctx.config.rootDir,

package/dist/orchestration/steps/mirror-repo-tasks-step.js CHANGED Viewed

@@ -59,8 +59,9 @@ export class MirrorRepoTasksStep {
             // Run the mirror
             const result = await mirrorRepoTasks({
                 client,
-                tasks: repoTasks,
                 git,
+                logger: ctx.logger,
+                tasks: repoTasks,
             });
             // Log results
             if (result.areasCreated.length > 0) {

package/dist/pipeline/calculate-scores.d.ts CHANGED Viewed

@@ -1,3 +1,4 @@
+import type { Logger, TestSummary } from "../_vendor/ailf-core/index.d.ts";
 import { type ResolvedSourceConfig } from "../sources.js";
 import { type ActualScoreEntry, type ComponentResult } from "../_vendor/ailf-core/index.d.ts";
 import type { GraderJudgment, PerModelEntry } from "./types.js";
@@ -86,6 +87,8 @@ export declare function scoreAgenticResults(resultsPath: string, weights: Record
 export interface CalculateScoresOptions {
     /** Allowed origins for source isolation reporting */
     allowedOrigins?: string[];
+    /** Logger instance (defaults to ConsoleLogger if not provided) */
+    logger?: Logger;
     /** Evaluation mode (controls which result files are read) */
     mode?: string;
     /** Pre-resolved source config (skips loadSource() call) */
@@ -103,5 +106,7 @@ export interface CalculateScoresOptions {
 export interface CalculateScoresResult {
     /** Feature areas that scored below the critical threshold (40). */
     belowCritical: string[];
+    /** Summary of test execution outcomes (total, passed, failed, errored). */
+    testSummary?: TestSummary;
 }
 export declare function calculateAndWriteScores(options: CalculateScoresOptions): CalculateScoresResult;