inspect-ai 0.3.63__py3-none-any.whl → 0.3.65__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (182)
  1. inspect_ai/_cli/cache.py +8 -7
  2. inspect_ai/_cli/common.py +0 -12
  3. inspect_ai/_cli/eval.py +32 -4
  4. inspect_ai/_cli/info.py +1 -0
  5. inspect_ai/_cli/list.py +1 -1
  6. inspect_ai/_cli/log.py +2 -0
  7. inspect_ai/_cli/sandbox.py +4 -1
  8. inspect_ai/_cli/score.py +181 -32
  9. inspect_ai/_cli/trace.py +2 -0
  10. inspect_ai/_cli/view.py +4 -2
  11. inspect_ai/_display/core/config.py +7 -1
  12. inspect_ai/_display/core/progress.py +1 -1
  13. inspect_ai/_display/textual/app.py +8 -4
  14. inspect_ai/_display/textual/widgets/samples.py +6 -5
  15. inspect_ai/_display/textual/widgets/sandbox.py +6 -0
  16. inspect_ai/_eval/__init__.py +0 -0
  17. inspect_ai/_eval/eval.py +100 -97
  18. inspect_ai/_eval/evalset.py +69 -69
  19. inspect_ai/_eval/loader.py +122 -12
  20. inspect_ai/_eval/registry.py +1 -1
  21. inspect_ai/_eval/run.py +14 -0
  22. inspect_ai/_eval/score.py +125 -36
  23. inspect_ai/_eval/task/log.py +105 -4
  24. inspect_ai/_eval/task/results.py +92 -38
  25. inspect_ai/_eval/task/run.py +6 -2
  26. inspect_ai/_eval/task/sandbox.py +35 -2
  27. inspect_ai/_eval/task/task.py +49 -46
  28. inspect_ai/_util/__init__.py +0 -0
  29. inspect_ai/_util/constants.py +1 -1
  30. inspect_ai/_util/content.py +8 -0
  31. inspect_ai/_util/error.py +2 -0
  32. inspect_ai/_util/file.py +15 -1
  33. inspect_ai/_util/logger.py +4 -2
  34. inspect_ai/_util/registry.py +7 -1
  35. inspect_ai/_view/view.py +1 -2
  36. inspect_ai/_view/www/App.css +8 -3
  37. inspect_ai/_view/www/README.md +1 -1
  38. inspect_ai/_view/www/dist/assets/index.css +66 -38
  39. inspect_ai/_view/www/dist/assets/index.js +525 -523
  40. inspect_ai/_view/www/log-schema.json +86 -73
  41. inspect_ai/_view/www/package.json +1 -1
  42. inspect_ai/_view/www/src/App.tsx +1 -0
  43. inspect_ai/_view/www/src/components/AnsiDisplay.tsx +1 -1
  44. inspect_ai/_view/www/src/components/JsonPanel.tsx +1 -1
  45. inspect_ai/_view/www/src/components/LargeModal.tsx +39 -49
  46. inspect_ai/_view/www/src/components/NavPills.tsx +3 -1
  47. inspect_ai/_view/www/src/components/TabSet.tsx +19 -4
  48. inspect_ai/_view/www/src/logfile/remoteLogFile.ts +0 -1
  49. inspect_ai/_view/www/src/metadata/MetaDataGrid.tsx +1 -1
  50. inspect_ai/_view/www/src/metadata/MetaDataView.tsx +1 -1
  51. inspect_ai/_view/www/src/metadata/RenderedContent.tsx +6 -13
  52. inspect_ai/_view/www/src/plan/PlanDetailView.tsx +17 -2
  53. inspect_ai/_view/www/src/plan/SolverDetailView.tsx +1 -1
  54. inspect_ai/_view/www/src/samples/SampleDisplay.tsx +14 -5
  55. inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +4 -2
  56. inspect_ai/_view/www/src/samples/SamplesTools.tsx +16 -24
  57. inspect_ai/_view/www/src/samples/chat/ChatMessage.tsx +1 -1
  58. inspect_ai/_view/www/src/samples/chat/ChatView.tsx +1 -0
  59. inspect_ai/_view/www/src/samples/chat/MessageContent.tsx +27 -13
  60. inspect_ai/_view/www/src/samples/chat/MessageContents.tsx +19 -17
  61. inspect_ai/_view/www/src/samples/chat/tools/ToolCallView.tsx +12 -10
  62. inspect_ai/_view/www/src/samples/chat/tools/ToolInput.tsx +56 -66
  63. inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.tsx +12 -5
  64. inspect_ai/_view/www/src/samples/chat/tools/tool.ts +21 -36
  65. inspect_ai/_view/www/src/samples/descriptor/samplesDescriptor.tsx +3 -1
  66. inspect_ai/_view/www/src/samples/sample-tools/SelectScorer.tsx +27 -25
  67. inspect_ai/_view/www/src/samples/sample-tools/SortFilter.tsx +5 -1
  68. inspect_ai/_view/www/src/samples/scores/SampleScoreView.module.css +13 -13
  69. inspect_ai/_view/www/src/samples/transcript/InfoEventView.tsx +1 -1
  70. inspect_ai/_view/www/src/samples/transcript/ModelEventView.tsx +2 -2
  71. inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.tsx +9 -5
  72. inspect_ai/_view/www/src/samples/transcript/ScoreEventView.tsx +1 -1
  73. inspect_ai/_view/www/src/samples/transcript/ToolEventView.tsx +5 -4
  74. inspect_ai/_view/www/src/samples/transcript/event/EventNavs.tsx +1 -0
  75. inspect_ai/_view/www/src/samples/transcript/event/EventPanel.tsx +1 -0
  76. inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.tsx +17 -6
  77. inspect_ai/_view/www/src/samples/transcript/state/StateEventView.tsx +14 -19
  78. inspect_ai/_view/www/src/types/log.d.ts +107 -19
  79. inspect_ai/_view/www/src/usage/ModelTokenTable.tsx +7 -1
  80. inspect_ai/_view/www/src/usage/ModelUsagePanel.tsx +5 -3
  81. inspect_ai/_view/www/src/workspace/WorkSpaceView.tsx +25 -27
  82. inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.tsx +12 -11
  83. inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.module.css +25 -2
  84. inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.tsx +60 -36
  85. inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.tsx +4 -0
  86. inspect_ai/_view/www/src/workspace/sidebar/SidebarScoreView.tsx +6 -4
  87. inspect_ai/_view/www/src/workspace/sidebar/SidebarScoresView.tsx +16 -14
  88. inspect_ai/_view/www/src/workspace/tabs/InfoTab.tsx +9 -19
  89. inspect_ai/_view/www/src/workspace/utils.ts +34 -0
  90. inspect_ai/approval/_approval.py +2 -0
  91. inspect_ai/approval/_approver.py +4 -4
  92. inspect_ai/approval/_auto.py +1 -1
  93. inspect_ai/approval/_human/approver.py +3 -0
  94. inspect_ai/approval/_policy.py +5 -0
  95. inspect_ai/approval/_registry.py +2 -2
  96. inspect_ai/dataset/_dataset.py +36 -45
  97. inspect_ai/dataset/_sources/__init__.py +0 -0
  98. inspect_ai/dataset/_sources/csv.py +13 -13
  99. inspect_ai/dataset/_sources/hf.py +29 -29
  100. inspect_ai/dataset/_sources/json.py +10 -10
  101. inspect_ai/log/__init__.py +2 -0
  102. inspect_ai/log/_convert.py +3 -3
  103. inspect_ai/log/_file.py +24 -9
  104. inspect_ai/log/_log.py +98 -7
  105. inspect_ai/log/_message.py +3 -1
  106. inspect_ai/log/_recorders/file.py +4 -0
  107. inspect_ai/log/_recorders/recorder.py +3 -0
  108. inspect_ai/log/_transcript.py +19 -8
  109. inspect_ai/model/__init__.py +2 -0
  110. inspect_ai/model/_cache.py +39 -21
  111. inspect_ai/model/_call_tools.py +2 -2
  112. inspect_ai/model/_chat_message.py +14 -4
  113. inspect_ai/model/_generate_config.py +1 -1
  114. inspect_ai/model/_model.py +31 -24
  115. inspect_ai/model/_model_output.py +14 -1
  116. inspect_ai/model/_openai.py +10 -18
  117. inspect_ai/model/_providers/google.py +9 -5
  118. inspect_ai/model/_providers/openai.py +5 -9
  119. inspect_ai/model/_providers/openrouter.py +1 -1
  120. inspect_ai/scorer/__init__.py +6 -1
  121. inspect_ai/scorer/_answer.py +1 -1
  122. inspect_ai/scorer/_classification.py +4 -0
  123. inspect_ai/scorer/_match.py +4 -5
  124. inspect_ai/scorer/_metric.py +87 -28
  125. inspect_ai/scorer/_metrics/__init__.py +3 -3
  126. inspect_ai/scorer/_metrics/accuracy.py +8 -10
  127. inspect_ai/scorer/_metrics/mean.py +3 -17
  128. inspect_ai/scorer/_metrics/std.py +111 -30
  129. inspect_ai/scorer/_model.py +12 -12
  130. inspect_ai/scorer/_pattern.py +3 -3
  131. inspect_ai/scorer/_reducer/reducer.py +36 -21
  132. inspect_ai/scorer/_reducer/registry.py +2 -2
  133. inspect_ai/scorer/_reducer/types.py +7 -1
  134. inspect_ai/scorer/_score.py +11 -1
  135. inspect_ai/scorer/_scorer.py +110 -16
  136. inspect_ai/solver/__init__.py +1 -1
  137. inspect_ai/solver/_basic_agent.py +19 -22
  138. inspect_ai/solver/_bridge/__init__.py +0 -3
  139. inspect_ai/solver/_bridge/bridge.py +3 -3
  140. inspect_ai/solver/_chain.py +1 -2
  141. inspect_ai/solver/_critique.py +3 -3
  142. inspect_ai/solver/_fork.py +2 -2
  143. inspect_ai/solver/_human_agent/__init__.py +0 -0
  144. inspect_ai/solver/_human_agent/agent.py +5 -8
  145. inspect_ai/solver/_human_agent/commands/clock.py +14 -10
  146. inspect_ai/solver/_human_agent/commands/note.py +1 -1
  147. inspect_ai/solver/_human_agent/commands/score.py +0 -11
  148. inspect_ai/solver/_multiple_choice.py +15 -18
  149. inspect_ai/solver/_prompt.py +7 -7
  150. inspect_ai/solver/_solver.py +53 -52
  151. inspect_ai/solver/_task_state.py +80 -69
  152. inspect_ai/solver/_use_tools.py +9 -9
  153. inspect_ai/tool/__init__.py +2 -1
  154. inspect_ai/tool/_tool.py +43 -14
  155. inspect_ai/tool/_tool_call.py +6 -2
  156. inspect_ai/tool/_tool_choice.py +3 -1
  157. inspect_ai/tool/_tool_def.py +10 -8
  158. inspect_ai/tool/_tool_params.py +24 -0
  159. inspect_ai/tool/_tool_with.py +7 -7
  160. inspect_ai/tool/_tools/__init__.py +0 -0
  161. inspect_ai/tool/_tools/_computer/_common.py +2 -2
  162. inspect_ai/tool/_tools/_computer/_computer.py +11 -0
  163. inspect_ai/tool/_tools/_execute.py +15 -9
  164. inspect_ai/tool/_tools/_web_browser/_resources/README.md +2 -2
  165. inspect_ai/tool/_tools/_web_browser/_web_browser.py +5 -3
  166. inspect_ai/tool/_tools/_web_search.py +7 -5
  167. inspect_ai/util/_concurrency.py +3 -3
  168. inspect_ai/util/_panel.py +2 -0
  169. inspect_ai/util/_resource.py +12 -12
  170. inspect_ai/util/_sandbox/docker/compose.py +23 -20
  171. inspect_ai/util/_sandbox/docker/config.py +2 -1
  172. inspect_ai/util/_sandbox/docker/docker.py +10 -1
  173. inspect_ai/util/_sandbox/docker/service.py +100 -0
  174. inspect_ai/util/_sandbox/environment.py +99 -96
  175. inspect_ai/util/_subprocess.py +5 -3
  176. inspect_ai/util/_subtask.py +15 -16
  177. {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/LICENSE +1 -1
  178. {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/METADATA +10 -6
  179. {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/RECORD +182 -175
  180. {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/WHEEL +0 -0
  181. {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/entry_points.txt +0 -0
  182. {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/top_level.txt +0 -0
@@ -112,6 +112,7 @@ export type Input =
112
112
  | ChatMessageAssistant
113
113
  | ChatMessageTool
114
114
  )[];
115
+ export type Role = "system";
115
116
  export type Content =
116
117
  | string
117
118
  | (ContentText | ContentImage | ContentAudio | ContentVideo)[];
@@ -127,18 +128,17 @@ export type Type4 = "video";
127
128
  export type Video = string;
128
129
  export type Format1 = "mp4" | "mpeg" | "mov";
129
130
  export type Source = ("input" | "generate") | null;
130
- export type Role = "system";
131
+ export type Role1 = "user";
131
132
  export type Content1 =
132
133
  | string
133
134
  | (ContentText | ContentImage | ContentAudio | ContentVideo)[];
134
135
  export type Source1 = ("input" | "generate") | null;
135
- export type Role1 = "user";
136
136
  export type ToolCallId = string[] | null;
137
+ export type Role2 = "assistant";
137
138
  export type Content2 =
138
139
  | string
139
140
  | (ContentText | ContentImage | ContentAudio | ContentVideo)[];
140
141
  export type Source2 = ("input" | "generate") | null;
141
- export type Role2 = "assistant";
142
142
  export type ToolCalls = ToolCall[] | null;
143
143
  export type Id1 = string;
144
144
  export type Function = string;
@@ -148,11 +148,11 @@ export type Title = string | null;
148
148
  export type Format2 = "text" | "markdown";
149
149
  export type Content3 = string;
150
150
  export type Reasoning = string | null;
151
+ export type Role3 = "tool";
151
152
  export type Content4 =
152
153
  | string
153
154
  | (ContentText | ContentImage | ContentAudio | ContentVideo)[];
154
155
  export type Source3 = ("input" | "generate") | null;
155
- export type Role3 = "tool";
156
156
  export type ToolCallId1 = string | null;
157
157
  export type Function1 = string | null;
158
158
  export type Type6 =
@@ -315,6 +315,7 @@ export type Timestamp8 = string;
315
315
  export type Pending8 = boolean | null;
316
316
  export type Event8 = "score";
317
317
  export type Target2 = string | string[] | null;
318
+ export type Intermediate = boolean;
318
319
  export type Timestamp9 = string;
319
320
  export type Pending9 = boolean | null;
320
321
  export type Event9 = "error";
@@ -339,6 +340,7 @@ export type Lineno = number;
339
340
  export type Timestamp11 = string;
340
341
  export type Pending11 = boolean | null;
341
342
  export type Event11 = "info";
343
+ export type Source4 = string | null;
342
344
  export type Timestamp12 = string;
343
345
  export type Pending12 = boolean | null;
344
346
  export type Event12 = "step";
@@ -424,6 +426,9 @@ export type SampleId1 = string | number | null;
424
426
  export type Samples2 = EvalSampleScore[];
425
427
  export type Location1 = string;
426
428
 
429
+ /**
430
+ * Evaluation log.
431
+ */
427
432
  export interface EvalLog {
428
433
  version?: Version;
429
434
  status?: Status;
@@ -436,6 +441,9 @@ export interface EvalLog {
436
441
  reductions?: Reductions;
437
442
  location?: Location1;
438
443
  }
444
+ /**
445
+ * Eval target and configuration.
446
+ */
439
447
  export interface EvalSpec {
440
448
  run_id: RunId;
441
449
  created: Created;
@@ -460,6 +468,9 @@ export interface EvalSpec {
460
468
  }
461
469
  export interface TaskAttribs {}
462
470
  export interface TaskArgs {}
471
+ /**
472
+ * Dataset used for evaluation.
473
+ */
463
474
  export interface EvalDataset {
464
475
  name: Name;
465
476
  location: Location;
@@ -468,6 +479,9 @@ export interface EvalDataset {
468
479
  shuffled: Shuffled;
469
480
  }
470
481
  export interface ModelArgs {}
482
+ /**
483
+ * Configuration used for evaluation.
484
+ */
471
485
  export interface EvalConfig {
472
486
  limit: Limit;
473
487
  sample_id: SampleId;
@@ -513,6 +527,9 @@ export interface ApproverPolicyConfig {
513
527
  params: Params;
514
528
  }
515
529
  export interface Params {}
530
+ /**
531
+ * Git revision for evaluation.
532
+ */
516
533
  export interface EvalRevision {
517
534
  type: Type;
518
535
  origin: Origin;
@@ -521,19 +538,25 @@ export interface EvalRevision {
521
538
  export interface Packages {
522
539
  [k: string]: string;
523
540
  }
541
+ /**
542
+ * Plan (solvers) used in evaluation.
543
+ */
524
544
  export interface EvalPlan {
525
545
  name: Name2;
526
546
  steps: Steps;
527
547
  finish: EvalPlanStep | null;
528
548
  config: GenerateConfig;
529
549
  }
550
+ /**
551
+ * Solver step.
552
+ */
530
553
  export interface EvalPlanStep {
531
554
  solver: Solver1;
532
555
  params: Params1;
533
556
  }
534
557
  export interface Params1 {}
535
558
  /**
536
- * Base class for model generation configs.
559
+ * Model generation options.
537
560
  */
538
561
  export interface GenerateConfig {
539
562
  max_retries: MaxRetries;
@@ -560,12 +583,18 @@ export interface GenerateConfig {
560
583
  reasoning_effort: ReasoningEffort;
561
584
  reasoning_history: ReasoningHistory;
562
585
  }
586
+ /**
587
+ * Scoring results from evaluation.
588
+ */
563
589
  export interface EvalResults {
564
590
  total_samples: TotalSamples;
565
591
  completed_samples: CompletedSamples;
566
592
  scores: Scores;
567
593
  metadata: Metadata3;
568
594
  }
595
+ /**
596
+ * Score for evaluation task.
597
+ */
569
598
  export interface EvalScore {
570
599
  name: Name3;
571
600
  scorer: Scorer;
@@ -578,13 +607,19 @@ export interface Params2 {}
578
607
  export interface Metrics {
579
608
  [k: string]: EvalMetric;
580
609
  }
610
+ /**
611
+ * Metric for evaluation score.
612
+ */
581
613
  export interface EvalMetric {
582
614
  name: Name4;
583
615
  value: Value;
584
- options: Options;
616
+ params: Params3;
585
617
  metadata: Metadata1;
586
618
  }
587
- export interface Options {}
619
+ export interface Params3 {}
620
+ /**
621
+ * Timing and usage statistics.
622
+ */
588
623
  export interface EvalStats {
589
624
  started_at: StartedAt;
590
625
  completed_at: CompletedAt;
@@ -593,6 +628,9 @@ export interface EvalStats {
593
628
  export interface ModelUsage {
594
629
  [k: string]: ModelUsage1;
595
630
  }
631
+ /**
632
+ * Token usage for completion.
633
+ */
596
634
  export interface ModelUsage1 {
597
635
  input_tokens: InputTokens;
598
636
  output_tokens: OutputTokens;
@@ -600,11 +638,17 @@ export interface ModelUsage1 {
600
638
  input_tokens_cache_write: InputTokensCacheWrite;
601
639
  input_tokens_cache_read: InputTokensCacheRead;
602
640
  }
641
+ /**
642
+ * Eval error details.
643
+ */
603
644
  export interface EvalError {
604
645
  message: Message;
605
646
  traceback: Traceback;
606
647
  traceback_ansi: TracebackAnsi;
607
648
  }
649
+ /**
650
+ * Sample from evaluation task.
651
+ */
608
652
  export interface EvalSample {
609
653
  id: Id;
610
654
  epoch: Epoch;
@@ -625,40 +669,61 @@ export interface EvalSample {
625
669
  attachments: Attachments;
626
670
  limit: EvalSampleLimit | null;
627
671
  }
672
+ /**
673
+ * System chat message.
674
+ */
628
675
  export interface ChatMessageSystem {
676
+ role: Role;
629
677
  content: Content;
630
678
  source: Source;
631
- role: Role;
632
679
  }
680
+ /**
681
+ * Text content.
682
+ */
633
683
  export interface ContentText {
634
684
  type: Type1;
635
685
  text: Text;
636
686
  }
687
+ /**
688
+ * Image content.
689
+ */
637
690
  export interface ContentImage {
638
691
  type: Type2;
639
692
  image: Image;
640
693
  detail: Detail;
641
694
  }
695
+ /**
696
+ * Audio content.
697
+ */
642
698
  export interface ContentAudio {
643
699
  type: Type3;
644
700
  audio: Audio;
645
701
  format: Format;
646
702
  }
703
+ /**
704
+ * Video content.
705
+ */
647
706
  export interface ContentVideo {
648
707
  type: Type4;
649
708
  video: Video;
650
709
  format: Format1;
651
710
  }
711
+ /**
712
+ * User chat message.
713
+ */
652
714
  export interface ChatMessageUser {
715
+ role: Role1;
653
716
  content: Content1;
654
717
  source: Source1;
655
- role: Role1;
656
718
  tool_call_id: ToolCallId;
657
719
  }
720
+ /**
721
+ * Assistant chat message.
722
+ */
658
723
  export interface ChatMessageAssistant {
724
+ role: Role2;
659
725
  content: Content2;
660
726
  source: Source2;
661
- role: Role2;
662
727
  tool_calls: ToolCalls;
663
728
  reasoning: Reasoning;
664
729
  }
@@ -679,10 +744,13 @@ export interface ToolCallContent {
679
744
  format: Format2;
680
745
  content: Content3;
681
746
  }
747
+ /**
748
+ * Tool chat message.
749
+ */
682
750
  export interface ChatMessageTool {
751
+ role: Role3;
683
752
  content: Content4;
684
753
  source: Source3;
685
- role: Role3;
686
754
  tool_call_id: ToolCallId1;
687
755
  function: Function1;
688
756
  error: ToolCallError | null;
@@ -691,6 +759,9 @@ export interface ToolCallError {
691
759
  type: Type6;
692
760
  message: Message1;
693
761
  }
762
+ /**
763
+ * Output from model generation.
764
+ */
694
765
  export interface ModelOutput {
695
766
  model: Model1;
696
767
  choices: Choices1;
@@ -699,6 +770,9 @@ export interface ModelOutput {
699
770
  metadata: Metadata4;
700
771
  error: Error;
701
772
  }
773
+ /**
774
+ * Choice generated for completion.
775
+ */
702
776
  export interface ChatCompletionChoice {
703
777
  message: ChatMessageAssistant;
704
778
  stop_reason: StopReason;
@@ -729,12 +803,6 @@ export interface TopLogprob {
729
803
  }
730
804
  /**
731
805
  * Score generated by a scorer.
732
- *
733
- * Args:
734
- * value (Value): Score value.
735
- * answer (str | None): Answer extracted from model output (optional).
736
- * explanation (str | None): Explanation of score (optional).
737
- * metadata (dict[str,Any]): Additional metadata related to the score.
738
806
  */
739
807
  export interface Score {
740
808
  value: Value1;
@@ -754,6 +822,9 @@ export interface SampleInitEvent {
754
822
  sample: Sample;
755
823
  state: JsonValue;
756
824
  }
825
+ /**
826
+ * Sample for an evaluation task.
827
+ */
757
828
  export interface Sample {
758
829
  input: Input1;
759
830
  choices: Choices2;
@@ -888,7 +959,7 @@ export interface ToolFunction {
888
959
  name: Name6;
889
960
  }
890
961
  /**
891
- * Base class for model generation configs.
962
+ * Model generation options.
892
963
  */
893
964
  export interface GenerateConfig1 {
894
965
  max_retries: MaxRetries;
@@ -984,7 +1055,10 @@ export interface InputEvent {
984
1055
  input_ansi: InputAnsi;
985
1056
  }
986
1057
  /**
987
- * Event with sample score.
1058
+ * Event with score.
1059
+ *
1060
+ * Can be the final score for a `Sample`, or can be an intermediate score
1061
+ * resulting from a call to `score`.
988
1062
  */
989
1063
  export interface ScoreEvent {
990
1064
  timestamp: Timestamp8;
@@ -992,6 +1066,7 @@ export interface ScoreEvent {
992
1066
  event: Event8;
993
1067
  score: Score;
994
1068
  target: Target2;
1069
+ intermediate: Intermediate;
995
1070
  }
996
1071
  /**
997
1072
  * Event with sample error.
@@ -1011,6 +1086,9 @@ export interface LoggerEvent {
1011
1086
  event: Event10;
1012
1087
  message: LoggingMessage;
1013
1088
  }
1089
+ /**
1090
+ * Message written to Python log.
1091
+ */
1014
1092
  export interface LoggingMessage {
1015
1093
  name: Name7;
1016
1094
  level: Level;
@@ -1027,6 +1105,7 @@ export interface InfoEvent {
1027
1105
  timestamp: Timestamp11;
1028
1106
  pending: Pending11;
1029
1107
  event: Event11;
1108
+ source: Source4;
1030
1109
  data: JsonValue;
1031
1110
  }
1032
1111
  /**
@@ -1063,15 +1142,24 @@ export interface ModelUsage2 {
1063
1142
  export interface Attachments {
1064
1143
  [k: string]: string;
1065
1144
  }
1145
+ /**
1146
+ * Limit encountered by sample.
1147
+ */
1066
1148
  export interface EvalSampleLimit {
1067
1149
  type: Type13;
1068
1150
  limit: Limit2;
1069
1151
  }
1152
+ /**
1153
+ * Score reductions.
1154
+ */
1070
1155
  export interface EvalSampleReductions {
1071
1156
  scorer: Scorer1;
1072
1157
  reducer: Reducer1;
1073
1158
  samples: Samples2;
1074
1159
  }
1160
+ /**
1161
+ * Score and sample_id scored.
1162
+ */
1075
1163
  export interface EvalSampleScore {
1076
1164
  value: Value2;
1077
1165
  answer: Answer1;
@@ -14,7 +14,13 @@ export const ModelTokenTable: React.FC<ModelTokenTable> = ({
14
14
  <TokenHeader />
15
15
  <tbody>
16
16
  {Object.keys(model_usage).map((key) => {
17
- return <TokenRow model={key} usage={model_usage[key]} />;
17
+ return (
18
+ <TokenRow
19
+ key={key}
20
+ model={`${key}-token-row`}
21
+ usage={model_usage[key]}
22
+ />
23
+ );
18
24
  })}
19
25
  </tbody>
20
26
  </TokenTable>
@@ -68,12 +68,14 @@ export const ModelUsagePanel: React.FC<ModelUsageProps> = ({ usage }) => {
68
68
 
69
69
  return (
70
70
  <div className={clsx("text-size-small", styles.wrapper)}>
71
- {rows.map((row) => {
71
+ {rows.map((row, idx) => {
72
72
  if (row.label === "---") {
73
- return <div className={styles.separator}></div>;
73
+ return (
74
+ <div key={`$usage-sep-${idx}`} className={styles.separator}></div>
75
+ );
74
76
  } else {
75
77
  return (
76
- <Fragment>
78
+ <Fragment key={`$usage-row-${idx}`}>
77
79
  <div
78
80
  className={clsx(
79
81
  "text-style-label",
@@ -79,32 +79,6 @@ export const WorkSpaceView: React.FC<WorkSpaceViewProps> = ({
79
79
  [setSelectedTab],
80
80
  );
81
81
 
82
- // Compute tab panels anytime the tabs change
83
- const tabPanels = useMemo(() => {
84
- return Object.keys(tabs).map((key) => {
85
- const tab = tabs[key];
86
- return (
87
- <TabPanel
88
- id={tab.id}
89
- title={tab.label}
90
- onSelected={onSelected}
91
- selected={selectedTab === tab.id}
92
- scrollable={!!tab.scrollable}
93
- scrollRef={tab.scrollRef}
94
- scrollPosition={workspaceTabScrollPositionRef.current?.[tab.id]}
95
- setScrollPosition={useCallback(
96
- (position: number) => {
97
- onScroll(tab.id, position);
98
- },
99
- [onScroll],
100
- )}
101
- >
102
- {tab.content()}
103
- </TabPanel>
104
- );
105
- });
106
- }, [tabs, selectedTab]);
107
-
108
82
  if (evalSpec === undefined) {
109
83
  return <EmptyPanel />;
110
84
  } else {
@@ -150,7 +124,31 @@ export const WorkSpaceView: React.FC<WorkSpaceViewProps> = ({
150
124
  tabControlsClassName={clsx(styles.tabs, "text-size-smaller")}
151
125
  tabPanelsClassName={clsx(styles.tabPanels)}
152
126
  >
153
- {tabPanels}
127
+ {Object.keys(tabs).map((key) => {
128
+ const tab = tabs[key];
129
+ return (
130
+ <TabPanel
131
+ key={tab.id}
132
+ id={tab.id}
133
+ title={tab.label}
134
+ onSelected={onSelected}
135
+ selected={selectedTab === tab.id}
136
+ scrollable={!!tab.scrollable}
137
+ scrollRef={tab.scrollRef}
138
+ scrollPosition={
139
+ workspaceTabScrollPositionRef.current?.[tab.id]
140
+ }
141
+ setScrollPosition={useCallback(
142
+ (position: number) => {
143
+ onScroll(tab.id, position);
144
+ },
145
+ [onScroll],
146
+ )}
147
+ >
148
+ {tab.content()}
149
+ </TabPanel>
150
+ );
151
+ })}
154
152
  </TabSet>
155
153
  </div>
156
154
  </div>
@@ -30,16 +30,6 @@ export const PrimaryBar: React.FC<PrimaryBarProps> = ({
30
30
  evalSpec,
31
31
  setOffcanvas,
32
32
  }) => {
33
- let statusPanel;
34
- if (status === "success") {
35
- statusPanel = <ResultsPanel results={evalResults} />;
36
- } else if (status === "cancelled") {
37
- statusPanel = <CancelledPanel sampleCount={samples?.length || 0} />;
38
- } else if (status === "started") {
39
- statusPanel = <RunningPanel sampleCount={samples?.length || 0} />;
40
- } else if (status === "error") {
41
- statusPanel = <ErroredPanel sampleCount={samples?.length || 0} />;
42
- }
43
33
  const logFileName = file ? filename(file) : "";
44
34
 
45
35
  const handleToggle = useCallback(() => {
@@ -103,7 +93,18 @@ export const PrimaryBar: React.FC<PrimaryBarProps> = ({
103
93
  </div>
104
94
  </div>
105
95
  <div className={clsx(styles.taskStatus, "navbar-text")}>
106
- {statusPanel}
96
+ {status === "success" ? (
97
+ <ResultsPanel results={evalResults} />
98
+ ) : undefined}
99
+ {status === "cancelled" ? (
100
+ <CancelledPanel sampleCount={samples?.length || 0} />
101
+ ) : undefined}
102
+ {status === "started" ? (
103
+ <RunningPanel sampleCount={samples?.length || 0} />
104
+ ) : undefined}
105
+ {status === "error" ? (
106
+ <ErroredPanel sampleCount={samples?.length || 0} />
107
+ ) : undefined}
107
108
  </div>
108
109
  <div id="task-created" style={{ display: "none" }}>
109
110
  {evalSpec?.created}
@@ -14,13 +14,13 @@
14
14
  flex-direction: row;
15
15
  flex-wrap: wrap;
16
16
  justify-content: end;
17
- height: 100%;
18
17
  align-items: center;
19
18
  margin-top: 0.2rem;
20
19
  padding-bottom: 0.4rem;
21
20
  row-gap: 1em;
22
21
  max-height: 15em;
23
22
  overflow: scroll;
23
+ align-items: baseline;
24
24
  }
25
25
 
26
26
  .verticalMetricReducer {
@@ -39,14 +39,26 @@
39
39
  }
40
40
 
41
41
  .verticalMetricValue {
42
- font-size: var(--inspect-font-size-larger);
43
42
  font-weight: 500;
44
43
  text-align: center;
45
44
  }
46
45
 
46
+ .multiScorer {
47
+ padding-left: 0;
48
+ height: 100%;
49
+ display: flex;
50
+ flex-direction: column;
51
+ padding: 0.5em 1em;
52
+ }
53
+
54
+ .multiScorerIndent {
55
+ padding-left: 1.5em;
56
+ }
57
+
47
58
  .multiScorerReducer {
48
59
  text-align: center;
49
60
  margin-bottom: -0.3rem;
61
+ margin-top: 0.2em;
50
62
  }
51
63
 
52
64
  .multiScorerLabel {
@@ -58,10 +70,21 @@
58
70
  .multiScorerValue {
59
71
  display: grid;
60
72
  grid-template-columns: auto auto;
73
+ grid-auto-rows: auto;
61
74
  grid-column-gap: 0.3rem;
62
75
  grid-row-gap: 0;
76
+ padding-top: 0.3em;
63
77
  }
64
78
 
65
79
  .multiScorerValueContent {
66
80
  font-weight: 600;
81
+ text-align: center;
82
+ }
83
+
84
+ .multiScoreMetricGrid {
85
+ display: grid;
86
+ grid-template-rows: auto auto;
87
+ column-gap: 1em;
88
+ padding: 0 0.2em;
89
+ justify-content: center;
67
90
  }