@sanity/ailf-studio 1.2.1 → 1.3.0

This diff shows the changes between publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -434,6 +434,14 @@ interface ScoreItem {
434
434
  /** Ceiling score — gold-standard docs injected */
435
435
  ceilingScore?: number;
436
436
  }
437
+ /** Reference to an artifact stored in an external object store (GCS). */
438
+ interface ArtifactRef {
439
+ store: "gcs";
440
+ bucket: string;
441
+ path: string;
442
+ bytes?: number;
443
+ entryCount?: number;
444
+ }
437
445
  /** A single gap/recommendation from gap analysis */
438
446
  interface RecommendationGap {
439
447
  affectedTaskIds: string[];
@@ -451,12 +459,41 @@ interface RecommendationsData {
451
459
  generatedAt: string;
452
460
  totalPotentialLift: number;
453
461
  }
462
+ /**
463
+ * Per-test result stored in reports for drill-down and audit.
464
+ * Mirrors StoredTestResult from @sanity/ailf-core. See D0029.
465
+ */
466
+ interface StoredTestResultData {
467
+ area: string;
468
+ canonicalDocs?: DocumentRef[];
469
+ compositeScore?: number;
470
+ cost?: number;
471
+ dimensions: {
472
+ dimension: string;
473
+ reason: string;
474
+ score: number;
475
+ }[];
476
+ latencyMs?: number;
477
+ modelId: string;
478
+ outputFailure?: boolean;
479
+ responseOutput: string;
480
+ responseOutputTruncated?: boolean;
481
+ taskId: string;
482
+ tokenUsage?: {
483
+ completion: number;
484
+ prompt: number;
485
+ total: number;
486
+ };
487
+ variant: "baseline" | "gold";
488
+ }
454
489
  /** A single low-scoring grader judgment stored in reports */
455
490
  interface JudgmentData {
456
491
  /** Docs the task expected the model to use */
457
492
  canonicalDocs?: DocumentRef[];
458
493
  dimension: string;
459
494
  modelId: string;
495
+ /** True when the model failed to produce output (empty response, API error, refusal) */
496
+ outputFailure?: boolean;
460
497
  reason: string;
461
498
  score: number;
462
499
  taskId: string;
@@ -497,6 +534,13 @@ interface PerModelData {
497
534
  interface SummaryData {
498
535
  /** Per-feature agent behavior data (only present when agentic mode ran) */
499
536
  agentBehavior?: FeatureAgentBehaviorData[] | null;
537
+ /** External artifact references — present when pipeline uploads to GCS (D0030) */
538
+ artifacts?: {
539
+ testOutputs?: ArtifactRef;
540
+ renderedPrompts?: ArtifactRef;
541
+ rawResults?: ArtifactRef;
542
+ traces?: ArtifactRef;
543
+ };
500
544
  belowCritical: string[];
501
545
  /** All Sanity documents used across the entire evaluation */
502
546
  documentManifest?: DocumentRef[];
@@ -521,6 +565,8 @@ interface SummaryData {
521
565
  /** Gap analysis recommendations (when gap analysis was run) */
522
566
  recommendations: null | RecommendationsData;
523
567
  scores: ScoreItem[];
568
+ /** Per-test results with model output and metadata (D0029) */
569
+ testResults?: StoredTestResultData[] | null;
524
570
  timestamp: string;
525
571
  }
526
572
  /** Shape returned by scoreTimelineQuery */
@@ -732,7 +778,7 @@ declare const scoreTimelineQuery: string;
732
778
  *
733
779
  * Used by: ReportDetail view
734
780
  */
735
- declare const reportDetailQuery = "\n *[_type == \"ailf.report\" && reportId == $reportId][0] {\n _id,\n reportId,\n completedAt,\n durationMs,\n tag,\n title,\n provenance,\n summary,\n comparison\n }\n";
781
+ declare const reportDetailQuery = "\n *[_type == \"ailf.report\" && reportId == $reportId][0] {\n _id,\n reportId,\n completedAt,\n durationMs,\n tag,\n title,\n provenance,\n summary,\n \"comparison\": comparison {\n areas,\n deltas,\n generatedAt,\n improved,\n mismatched,\n noiseThreshold,\n noiseThresholdEmpirical,\n notEvaluated,\n regressed,\n unchanged\n }\n }\n";
736
782
  /**
737
783
  * Find all reports that evaluated a specific Sanity document or perspective.
738
784
  *