@plaited/agent-eval-harness 0.7.0 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -777,3 +777,154 @@ export const ComparisonReportSchema = z.object({
777
777
 
778
778
  /** Comparison report type */
779
779
  export type ComparisonReport = z.infer<typeof ComparisonReportSchema>
780
+
781
+ // ============================================================================
782
+ // Trials Comparison Report Schemas
783
+ // ============================================================================
784
+
785
+ /**
786
+ * Capability metrics for trials comparison (passAtK-based).
787
+ *
788
+ * @remarks
789
+ * Measures whether the agent CAN solve the task (at least once in K tries).
790
+ * Higher passAtK means the agent has the capability to solve the task.
791
+ */
792
+ export const TrialsCapabilityMetricsSchema = z.object({
793
+ /** Average passAtK across all prompts */
794
+ avgPassAtK: z.number(),
795
+ /** Median passAtK */
796
+ medianPassAtK: z.number(),
797
+ /** 25th percentile passAtK */
798
+ p25PassAtK: z.number(),
799
+ /** 75th percentile passAtK */
800
+ p75PassAtK: z.number(),
801
+ })
802
+
803
+ /** Trials capability metrics type */
804
+ export type TrialsCapabilityMetrics = z.infer<typeof TrialsCapabilityMetricsSchema>
805
+
806
+ /**
807
+ * Reliability metrics for trials comparison (passExpK-based).
808
+ *
809
+ * @remarks
810
+ * Measures whether the agent CONSISTENTLY solves the task (all K tries).
811
+ * Higher passExpK means the agent reliably solves the task every time.
812
+ */
813
+ export const TrialsReliabilityMetricsSchema = z.object({
814
+ /** Average passExpK across all prompts */
815
+ avgPassExpK: z.number(),
816
+ /** Median passExpK */
817
+ medianPassExpK: z.number(),
818
+ /** 25th percentile passExpK */
819
+ p25PassExpK: z.number(),
820
+ /** 75th percentile passExpK */
821
+ p75PassExpK: z.number(),
822
+ })
823
+
824
+ /** Trials reliability metrics type */
825
+ export type TrialsReliabilityMetrics = z.infer<typeof TrialsReliabilityMetricsSchema>
826
+
827
+ /**
828
+ * Flakiness metrics for trials comparison.
829
+ *
830
+ * @remarks
831
+ * Flakiness = passAtK - passExpK, measuring the gap between capability and reliability.
832
+ * A high flakiness score means the agent can sometimes solve the task but not consistently.
833
+ */
834
+ export const TrialsFlakinessMetricsSchema = z.object({
835
+ /** Average flakiness across all prompts */
836
+ avgFlakiness: z.number(),
837
+ /** Median flakiness */
838
+ medianFlakiness: z.number(),
839
+ /** Number of prompts with non-zero flakiness */
840
+ flakyPromptCount: z.number(),
841
+ /** Top flaky prompts by flakiness score */
842
+ topFlakyPrompts: z.array(
843
+ z.object({
844
+ /** Prompt identifier */
845
+ id: z.string(),
846
+ /** Flakiness score (passAtK - passExpK) */
847
+ flakiness: z.number(),
848
+ }),
849
+ ),
850
+ })
851
+
852
+ /** Trials flakiness metrics type */
853
+ export type TrialsFlakinessMetrics = z.infer<typeof TrialsFlakinessMetricsSchema>
854
+
855
+ /**
856
+ * Per-prompt metrics for trials comparison drill-down.
857
+ */
858
+ export const TrialsPromptComparisonSchema = z.object({
859
+ /** Prompt identifier */
860
+ id: z.string(),
861
+ /** Run label of the capability winner, or null when the runs tie */
862
+ capabilityWinner: z.string().nullable(),
863
+ /** Run label of the reliability winner, or null when the runs tie */
864
+ reliabilityWinner: z.string().nullable(),
865
+ /** passAtK by run label */
866
+ passAtK: z.record(z.string(), z.number()),
867
+ /** passExpK by run label */
868
+ passExpK: z.record(z.string(), z.number()),
869
+ /** Flakiness by run label */
870
+ flakiness: z.record(z.string(), z.number()),
871
+ })
872
+
873
+ /** Trials prompt comparison type */
874
+ export type TrialsPromptComparison = z.infer<typeof TrialsPromptComparisonSchema>
875
+
876
+ /**
877
+ * Metadata for trials comparison report.
878
+ */
879
+ export const TrialsComparisonMetaSchema = z.object({
880
+ /** Timestamp of when the report was generated; expected to be an ISO string, but validated only as a string */
881
+ generatedAt: z.string(),
882
+ /** Run labels included in comparison */
883
+ runs: z.array(z.string()),
884
+ /** Total prompts compared */
885
+ promptCount: z.number(),
886
+ /** Number of trials per prompt (k value) */
887
+ trialsPerPrompt: z.number(),
888
+ /** Input format discriminator; always the literal 'trials' for this report type */
889
+ inputFormat: z.literal('trials'),
890
+ })
891
+
892
+ /** Trials comparison meta type */
893
+ export type TrialsComparisonMeta = z.infer<typeof TrialsComparisonMetaSchema>
894
+
895
+ /**
896
+ * Trials comparison report schema.
897
+ *
898
+ * @remarks
899
+ * Aggregates trials comparison output across capability, reliability, and flakiness dimensions.
900
+ * Used when comparing TrialResult JSONL files instead of CaptureResult files.
901
+ *
902
+ * Key metrics:
903
+ * - Capability: passAtK — can the agent solve this at least once?
904
+ * - Reliability: passExpK — does the agent solve this consistently?
905
+ * - Flakiness: (passAtK - passExpK) — how inconsistent is the agent?
906
+ */
907
+ export const TrialsComparisonReportSchema = z.object({
908
+ /** Report metadata */
909
+ meta: TrialsComparisonMetaSchema,
910
+ /** Capability metrics by run label */
911
+ capability: z.record(z.string(), TrialsCapabilityMetricsSchema),
912
+ /** Reliability metrics by run label */
913
+ reliability: z.record(z.string(), TrialsReliabilityMetricsSchema),
914
+ /** Flakiness metrics by run label */
915
+ flakiness: z.record(z.string(), TrialsFlakinessMetricsSchema),
916
+ /** Head-to-head comparison details */
917
+ headToHead: z.object({
918
+ /** Pairwise wins by capability */
919
+ capability: z.array(PairwiseComparisonSchema),
920
+ /** Pairwise wins by reliability */
921
+ reliability: z.array(PairwiseComparisonSchema),
922
+ /** Pairwise wins by overall weighted score */
923
+ overall: z.array(PairwiseComparisonSchema),
924
+ }),
925
+ /** Per-prompt breakdown for drill-down (optional, can be large) */
926
+ perPrompt: z.array(TrialsPromptComparisonSchema).optional(),
927
+ })
928
+
929
+ /** Trials comparison report type */
930
+ export type TrialsComparisonReport = z.infer<typeof TrialsComparisonReportSchema>
package/src/schemas.ts CHANGED
@@ -104,6 +104,19 @@ export {
104
104
  TrialEntrySchema,
105
105
  type TrialResult,
106
106
  TrialResultSchema,
107
+ // Trials comparison report types
108
+ type TrialsCapabilityMetrics,
109
+ TrialsCapabilityMetricsSchema,
110
+ type TrialsComparisonMeta,
111
+ TrialsComparisonMetaSchema,
112
+ type TrialsComparisonReport,
113
+ TrialsComparisonReportSchema,
114
+ type TrialsFlakinessMetrics,
115
+ TrialsFlakinessMetricsSchema,
116
+ type TrialsPromptComparison,
117
+ TrialsPromptComparisonSchema,
118
+ type TrialsReliabilityMetrics,
119
+ TrialsReliabilityMetricsSchema,
107
120
  type ValidationResult,
108
121
  ValidationResultSchema,
109
122
  } from './schemas/schemas.ts'