npm - @sanity/ailf-studio - Versions diffs - 1.2.1 → 1.3.0 - Mend

@sanity/ailf-studio 1.2.1 → 1.3.0

This diff shows the changes between two publicly available versions of a package as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Files changed (3) hide show

package/dist/index.d.ts CHANGED Viewed

@@ -434,6 +434,14 @@ interface ScoreItem {
     /** Ceiling score — gold-standard docs injected */
     ceilingScore?: number;
 }
+/** Reference to an artifact stored in an external object store (GCS). */
+interface ArtifactRef {
+    store: "gcs";
+    bucket: string;
+    path: string;
+    bytes?: number;
+    entryCount?: number;
+}
 /** A single gap/recommendation from gap analysis */
 interface RecommendationGap {
     affectedTaskIds: string[];
@@ -451,12 +459,41 @@ interface RecommendationsData {
     generatedAt: string;
     totalPotentialLift: number;
 }
+/**
+ * Per-test result stored in reports for drill-down and audit.
+ * Mirrors StoredTestResult from @sanity/ailf-core. See D0029.
+ */
+interface StoredTestResultData {
+    area: string;
+    canonicalDocs?: DocumentRef[];
+    compositeScore?: number;
+    cost?: number;
+    dimensions: {
+        dimension: string;
+        reason: string;
+        score: number;
+    }[];
+    latencyMs?: number;
+    modelId: string;
+    outputFailure?: boolean;
+    responseOutput: string;
+    responseOutputTruncated?: boolean;
+    taskId: string;
+    tokenUsage?: {
+        completion: number;
+        prompt: number;
+        total: number;
+    };
+    variant: "baseline" | "gold";
+}
 /** A single low-scoring grader judgment stored in reports */
 interface JudgmentData {
     /** Docs the task expected the model to use */
     canonicalDocs?: DocumentRef[];
     dimension: string;
     modelId: string;
+    /** True when the model failed to produce output (empty response, API error, refusal) */
+    outputFailure?: boolean;
     reason: string;
     score: number;
     taskId: string;
@@ -497,6 +534,13 @@ interface PerModelData {
 interface SummaryData {
     /** Per-feature agent behavior data (only present when agentic mode ran) */
     agentBehavior?: FeatureAgentBehaviorData[] | null;
+    /** External artifact references — present when pipeline uploads to GCS (D0030) */
+    artifacts?: {
+        testOutputs?: ArtifactRef;
+        renderedPrompts?: ArtifactRef;
+        rawResults?: ArtifactRef;
+        traces?: ArtifactRef;
+    };
     belowCritical: string[];
     /** All Sanity documents used across the entire evaluation */
     documentManifest?: DocumentRef[];
@@ -521,6 +565,8 @@ interface SummaryData {
     /** Gap analysis recommendations (when gap analysis was run) */
     recommendations: null | RecommendationsData;
     scores: ScoreItem[];
+    /** Per-test results with model output and metadata (D0029) */
+    testResults?: StoredTestResultData[] | null;
     timestamp: string;
 }
 /** Shape returned by scoreTimelineQuery */
@@ -732,7 +778,7 @@ declare const scoreTimelineQuery: string;
  *
  * Used by: ReportDetail view
  */
-declare const reportDetailQuery = "\n  *[_type == \"ailf.report\" && reportId == $reportId][0] {\n    _id,\n    reportId,\n    completedAt,\n    durationMs,\n    tag,\n    title,\n    provenance,\n    summary,\n    comparison\n  }\n";
+declare const reportDetailQuery = "\n  *[_type == \"ailf.report\" && reportId == $reportId][0] {\n    _id,\n    reportId,\n    completedAt,\n    durationMs,\n    tag,\n    title,\n    provenance,\n    summary,\n    \"comparison\": comparison {\n      areas,\n      deltas,\n      generatedAt,\n      improved,\n      mismatched,\n      noiseThreshold,\n      noiseThresholdEmpirical,\n      notEvaluated,\n      regressed,\n      unchanged\n    }\n  }\n";
 /**
  * Find all reports that evaluated a specific Sanity document or perspective.
  *