inspect-ai 0.3.68__py3-none-any.whl → 0.3.70__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82)
  1. inspect_ai/_cli/eval.py +13 -1
  2. inspect_ai/_display/plain/display.py +9 -11
  3. inspect_ai/_display/textual/app.py +5 -5
  4. inspect_ai/_display/textual/widgets/samples.py +47 -18
  5. inspect_ai/_display/textual/widgets/transcript.py +25 -12
  6. inspect_ai/_eval/eval.py +14 -2
  7. inspect_ai/_eval/evalset.py +6 -1
  8. inspect_ai/_eval/run.py +6 -0
  9. inspect_ai/_eval/task/run.py +44 -15
  10. inspect_ai/_eval/task/task.py +26 -3
  11. inspect_ai/_util/interrupt.py +15 -0
  12. inspect_ai/_util/logger.py +23 -0
  13. inspect_ai/_util/rich.py +7 -8
  14. inspect_ai/_util/text.py +301 -1
  15. inspect_ai/_util/transcript.py +10 -2
  16. inspect_ai/_util/working.py +46 -0
  17. inspect_ai/_view/www/dist/assets/index.css +56 -12
  18. inspect_ai/_view/www/dist/assets/index.js +905 -751
  19. inspect_ai/_view/www/log-schema.json +337 -2
  20. inspect_ai/_view/www/node_modules/flatted/python/flatted.py +149 -0
  21. inspect_ai/_view/www/node_modules/flatted/python/test.py +63 -0
  22. inspect_ai/_view/www/src/appearance/icons.ts +3 -1
  23. inspect_ai/_view/www/src/metadata/RenderedContent.tsx +0 -1
  24. inspect_ai/_view/www/src/samples/SampleDisplay.module.css +9 -1
  25. inspect_ai/_view/www/src/samples/SampleDisplay.tsx +28 -1
  26. inspect_ai/_view/www/src/samples/SampleSummaryView.module.css +4 -0
  27. inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +23 -2
  28. inspect_ai/_view/www/src/samples/descriptor/score/ObjectScoreDescriptor.tsx +1 -1
  29. inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.tsx +4 -0
  30. inspect_ai/_view/www/src/samples/transcript/SandboxEventView.module.css +32 -0
  31. inspect_ai/_view/www/src/samples/transcript/SandboxEventView.tsx +152 -0
  32. inspect_ai/_view/www/src/samples/transcript/StepEventView.tsx +9 -2
  33. inspect_ai/_view/www/src/samples/transcript/TranscriptView.tsx +19 -1
  34. inspect_ai/_view/www/src/samples/transcript/event/EventPanel.tsx +6 -3
  35. inspect_ai/_view/www/src/samples/transcript/types.ts +3 -1
  36. inspect_ai/_view/www/src/types/log.d.ts +188 -108
  37. inspect_ai/_view/www/src/utils/format.ts +7 -4
  38. inspect_ai/_view/www/src/workspace/WorkSpaceView.tsx +9 -6
  39. inspect_ai/log/__init__.py +2 -0
  40. inspect_ai/log/_condense.py +1 -0
  41. inspect_ai/log/_log.py +72 -12
  42. inspect_ai/log/_samples.py +5 -5
  43. inspect_ai/log/_transcript.py +31 -1
  44. inspect_ai/model/_call_tools.py +1 -1
  45. inspect_ai/model/_conversation.py +1 -1
  46. inspect_ai/model/_model.py +35 -16
  47. inspect_ai/model/_model_call.py +10 -3
  48. inspect_ai/model/_providers/anthropic.py +13 -2
  49. inspect_ai/model/_providers/bedrock.py +7 -0
  50. inspect_ai/model/_providers/cloudflare.py +20 -7
  51. inspect_ai/model/_providers/google.py +358 -302
  52. inspect_ai/model/_providers/groq.py +57 -23
  53. inspect_ai/model/_providers/hf.py +6 -0
  54. inspect_ai/model/_providers/mistral.py +81 -52
  55. inspect_ai/model/_providers/openai.py +9 -0
  56. inspect_ai/model/_providers/providers.py +6 -6
  57. inspect_ai/model/_providers/util/tracker.py +92 -0
  58. inspect_ai/model/_providers/vllm.py +13 -5
  59. inspect_ai/solver/_basic_agent.py +1 -3
  60. inspect_ai/solver/_bridge/patch.py +0 -2
  61. inspect_ai/solver/_limit.py +4 -4
  62. inspect_ai/solver/_plan.py +3 -3
  63. inspect_ai/solver/_solver.py +3 -0
  64. inspect_ai/solver/_task_state.py +10 -1
  65. inspect_ai/tool/_tools/_web_search.py +3 -3
  66. inspect_ai/util/_concurrency.py +14 -8
  67. inspect_ai/util/_sandbox/context.py +15 -0
  68. inspect_ai/util/_sandbox/docker/cleanup.py +8 -3
  69. inspect_ai/util/_sandbox/docker/compose.py +5 -9
  70. inspect_ai/util/_sandbox/docker/docker.py +20 -6
  71. inspect_ai/util/_sandbox/docker/util.py +10 -1
  72. inspect_ai/util/_sandbox/environment.py +32 -1
  73. inspect_ai/util/_sandbox/events.py +149 -0
  74. inspect_ai/util/_sandbox/local.py +3 -3
  75. inspect_ai/util/_sandbox/self_check.py +2 -1
  76. inspect_ai/util/_subprocess.py +4 -1
  77. {inspect_ai-0.3.68.dist-info → inspect_ai-0.3.70.dist-info}/METADATA +5 -5
  78. {inspect_ai-0.3.68.dist-info → inspect_ai-0.3.70.dist-info}/RECORD +82 -74
  79. {inspect_ai-0.3.68.dist-info → inspect_ai-0.3.70.dist-info}/LICENSE +0 -0
  80. {inspect_ai-0.3.68.dist-info → inspect_ai-0.3.70.dist-info}/WHEEL +0 -0
  81. {inspect_ai-0.3.68.dist-info → inspect_ai-0.3.70.dist-info}/entry_points.txt +0 -0
  82. {inspect_ai-0.3.68.dist-info → inspect_ai-0.3.70.dist-info}/top_level.txt +0 -0
@@ -39,6 +39,7 @@ export type FailOnError = boolean | number | null;
39
39
  export type MessageLimit = number | null;
40
40
  export type TokenLimit = number | null;
41
41
  export type TimeLimit = number | null;
42
+ export type WorkingLimit = number | null;
42
43
  export type MaxSamples = number | null;
43
44
  export type MaxTasks = number | null;
44
45
  export type MaxSubprocesses = number | null;
@@ -52,7 +53,30 @@ export type Type = "git";
52
53
  export type Origin = string;
53
54
  export type Commit = string;
54
55
  export type Metadata = {} | null;
56
+ export type Scorers = EvalScorer[] | null;
55
57
  export type Name2 = string;
58
+ export type Options = {} | null;
59
+ export type Metrics =
60
+ | (
61
+ | EvalMetricDefinition
62
+ | {
63
+ [k: string]: EvalMetricDefinition[];
64
+ }
65
+ )[]
66
+ | {
67
+ [k: string]: EvalMetricDefinition[];
68
+ }
69
+ | null;
70
+ export type Name3 = string;
71
+ export type Options1 = {} | null;
72
+ export type Metadata1 = {} | null;
73
+ export type Metrics1 =
74
+ | EvalMetricDefinition[]
75
+ | {
76
+ [k: string]: EvalMetricDefinition[];
77
+ }
78
+ | null;
79
+ export type Name4 = string;
56
80
  export type Solver1 = string;
57
81
  export type Steps = EvalPlanStep[];
58
82
  export type MaxRetries = number | null;
@@ -82,15 +106,15 @@ export type ReasoningEffort = ("low" | "medium" | "high") | null;
82
106
  export type ReasoningHistory = boolean | null;
83
107
  export type TotalSamples = number;
84
108
  export type CompletedSamples = number;
85
- export type Name3 = string;
109
+ export type Name5 = string;
86
110
  export type Scorer = string;
87
111
  export type Reducer = string | null;
88
- export type Name4 = string;
112
+ export type Name6 = string;
89
113
  export type Value = number;
90
- export type Metadata1 = {} | null;
91
114
  export type Metadata2 = {} | null;
92
- export type Scores = EvalScore[];
93
115
  export type Metadata3 = {} | null;
116
+ export type Scores = EvalScore[];
117
+ export type Metadata4 = {} | null;
94
118
  export type StartedAt = string;
95
119
  export type CompletedAt = string;
96
120
  export type InputTokens = number;
@@ -194,7 +218,7 @@ export type Bytes1 = number[] | null;
194
218
  export type Content5 = Logprob[];
195
219
  export type Choices1 = ChatCompletionChoice[];
196
220
  export type Time = number | null;
197
- export type Metadata4 = {} | null;
221
+ export type Metadata5 = {} | null;
198
222
  export type Error = string | null;
199
223
  export type Scores1 = {
200
224
  [k: string]: Score;
@@ -209,7 +233,7 @@ export type Value1 =
209
233
  };
210
234
  export type Answer = string | null;
211
235
  export type Explanation = string | null;
212
- export type Metadata5 = {} | null;
236
+ export type Metadata6 = {} | null;
213
237
  export type Timestamp = string;
214
238
  export type Pending = boolean | null;
215
239
  export type Event = "sample_init";
@@ -224,7 +248,7 @@ export type Input1 =
224
248
  export type Choices2 = string[] | null;
225
249
  export type Target1 = string | string[];
226
250
  export type Id2 = number | string | null;
227
- export type Metadata7 = {} | null;
251
+ export type Metadata8 = {} | null;
228
252
  export type Files1 = {
229
253
  [k: string]: string;
230
254
  } | null;
@@ -233,31 +257,49 @@ export type JsonValue = unknown;
233
257
  export type Timestamp1 = string;
234
258
  export type Pending1 = boolean | null;
235
259
  export type Event1 = "sample_limit";
236
- export type Type7 = "message" | "time" | "token" | "operator" | "custom";
260
+ export type Type7 =
261
+ | "message"
262
+ | "time"
263
+ | "working"
264
+ | "token"
265
+ | "operator"
266
+ | "custom";
237
267
  export type Message2 = string;
238
268
  export type Limit1 = number | null;
239
269
  export type Timestamp2 = string;
240
270
  export type Pending2 = boolean | null;
241
- export type Event2 = "state";
271
+ export type Event2 = "sandbox";
272
+ export type Action = "exec" | "read_file" | "write_file";
273
+ export type Cmd = string | null;
274
+ export type Options2 = {
275
+ [k: string]: JsonValue;
276
+ } | null;
277
+ export type File = string | null;
278
+ export type Input2 = string | null;
279
+ export type Result = number | null;
280
+ export type Output = string | null;
281
+ export type Timestamp3 = string;
282
+ export type Pending3 = boolean | null;
283
+ export type Event3 = "state";
242
284
  export type Op = "remove" | "add" | "replace" | "move" | "test" | "copy";
243
285
  export type Path = string;
244
286
  export type From = string | null;
245
287
  export type Changes = JsonChange[];
246
- export type Timestamp3 = string;
247
- export type Pending3 = boolean | null;
248
- export type Event3 = "store";
249
- export type Changes1 = JsonChange[];
250
288
  export type Timestamp4 = string;
251
289
  export type Pending4 = boolean | null;
252
- export type Event4 = "model";
290
+ export type Event4 = "store";
291
+ export type Changes1 = JsonChange[];
292
+ export type Timestamp5 = string;
293
+ export type Pending5 = boolean | null;
294
+ export type Event5 = "model";
253
295
  export type Model2 = string;
254
- export type Input2 = (
296
+ export type Input3 = (
255
297
  | ChatMessageSystem
256
298
  | ChatMessageUser
257
299
  | ChatMessageAssistant
258
300
  | ChatMessageTool
259
301
  )[];
260
- export type Name5 = string;
302
+ export type Name7 = string;
261
303
  export type Description = string;
262
304
  export type Type8 = "object";
263
305
  export type Type9 =
@@ -275,16 +317,17 @@ export type Required1 = string[];
275
317
  export type Additionalproperties1 = boolean;
276
318
  export type Tools1 = ToolInfo[];
277
319
  export type ToolChoice = ("auto" | "any" | "none") | ToolFunction;
278
- export type Name6 = string;
320
+ export type Name8 = string;
279
321
  export type Error1 = string | null;
280
322
  export type Cache = ("read" | "write") | null;
281
- export type Timestamp5 = string;
282
- export type Pending5 = boolean | null;
283
- export type Event5 = "tool";
323
+ export type Time1 = number | null;
324
+ export type Timestamp6 = string;
325
+ export type Pending6 = boolean | null;
326
+ export type Event6 = "tool";
284
327
  export type Type10 = "function";
285
328
  export type Id3 = string;
286
329
  export type Function2 = string;
287
- export type Result =
330
+ export type Result1 =
288
331
  | string
289
332
  | number
290
333
  | boolean
@@ -294,9 +337,9 @@ export type Result =
294
337
  | ContentVideo
295
338
  | (ContentText | ContentImage | ContentAudio | ContentVideo)[];
296
339
  export type Truncated = [unknown, unknown] | null;
297
- export type Timestamp6 = string;
298
- export type Pending6 = boolean | null;
299
- export type Event6 = "approval";
340
+ export type Timestamp7 = string;
341
+ export type Pending7 = boolean | null;
342
+ export type Event7 = "approval";
300
343
  export type Message3 = string;
301
344
  export type Approver = string;
302
345
  export type Decision =
@@ -306,23 +349,23 @@ export type Decision =
306
349
  | "escalate"
307
350
  | "terminate";
308
351
  export type Explanation1 = string | null;
309
- export type Timestamp7 = string;
310
- export type Pending7 = boolean | null;
311
- export type Event7 = "input";
312
- export type Input3 = string;
313
- export type InputAnsi = string;
314
352
  export type Timestamp8 = string;
315
353
  export type Pending8 = boolean | null;
316
- export type Event8 = "score";
317
- export type Target2 = string | string[] | null;
318
- export type Intermediate = boolean;
354
+ export type Event8 = "input";
355
+ export type Input4 = string;
356
+ export type InputAnsi = string;
319
357
  export type Timestamp9 = string;
320
358
  export type Pending9 = boolean | null;
321
- export type Event9 = "error";
359
+ export type Event9 = "score";
360
+ export type Target2 = string | string[] | null;
361
+ export type Intermediate = boolean;
322
362
  export type Timestamp10 = string;
323
363
  export type Pending10 = boolean | null;
324
- export type Event10 = "logger";
325
- export type Name7 = string | null;
364
+ export type Event10 = "error";
365
+ export type Timestamp11 = string;
366
+ export type Pending11 = boolean | null;
367
+ export type Event11 = "logger";
368
+ export type Name9 = string | null;
326
369
  export type Level =
327
370
  | "debug"
328
371
  | "trace"
@@ -337,24 +380,25 @@ export type Created1 = number;
337
380
  export type Filename = string;
338
381
  export type Module = string;
339
382
  export type Lineno = number;
340
- export type Timestamp11 = string;
341
- export type Pending11 = boolean | null;
342
- export type Event11 = "info";
343
- export type Source4 = string | null;
344
383
  export type Timestamp12 = string;
345
384
  export type Pending12 = boolean | null;
346
- export type Event12 = "step";
347
- export type Action = "begin" | "end";
348
- export type Type11 = string | null;
349
- export type Name8 = string;
385
+ export type Event12 = "info";
386
+ export type Source4 = string | null;
350
387
  export type Timestamp13 = string;
351
388
  export type Pending13 = boolean | null;
352
- export type Event13 = "subtask";
353
- export type Name9 = string;
389
+ export type Event13 = "step";
390
+ export type Action1 = "begin" | "end";
391
+ export type Type11 = string | null;
392
+ export type Name10 = string;
393
+ export type Timestamp14 = string;
394
+ export type Pending14 = boolean | null;
395
+ export type Event14 = "subtask";
396
+ export type Name11 = string;
354
397
  export type Type12 = string | null;
355
398
  export type Events2 = (
356
399
  | SampleInitEvent
357
400
  | SampleLimitEvent
401
+ | SandboxEvent
358
402
  | StateEvent
359
403
  | StoreEvent
360
404
  | ModelEvent
@@ -371,6 +415,7 @@ export type Events2 = (
371
415
  export type Events1 = (
372
416
  | SampleInitEvent
373
417
  | SampleLimitEvent
418
+ | SandboxEvent
374
419
  | StateEvent
375
420
  | StoreEvent
376
421
  | ModelEvent
@@ -387,6 +432,7 @@ export type Events1 = (
387
432
  export type Events = (
388
433
  | SampleInitEvent
389
434
  | SampleLimitEvent
435
+ | SandboxEvent
390
436
  | StateEvent
391
437
  | StoreEvent
392
438
  | ModelEvent
@@ -400,9 +446,12 @@ export type Events = (
400
446
  | StepEvent
401
447
  | SubtaskEvent
402
448
  )[];
449
+ export type TotalTime = number | null;
450
+ export type WorkingTime = number | null;
403
451
  export type Type13 =
404
452
  | "context"
405
453
  | "time"
454
+ | "working"
406
455
  | "message"
407
456
  | "token"
408
457
  | "operator"
@@ -421,7 +470,7 @@ export type Value2 =
421
470
  };
422
471
  export type Answer1 = string | null;
423
472
  export type Explanation2 = string | null;
424
- export type Metadata8 = {} | null;
473
+ export type Metadata9 = {} | null;
425
474
  export type SampleId1 = string | number | null;
426
475
  export type Samples2 = EvalSampleScore[];
427
476
  export type Location1 = string;
@@ -465,6 +514,8 @@ export interface EvalSpec {
465
514
  revision: EvalRevision | null;
466
515
  packages: Packages;
467
516
  metadata: Metadata;
517
+ scorers: Scorers;
518
+ metrics: Metrics1;
468
519
  }
469
520
  export interface TaskAttribs {}
470
521
  export interface TaskArgs {}
@@ -492,6 +543,7 @@ export interface EvalConfig {
492
543
  message_limit: MessageLimit;
493
544
  token_limit: TokenLimit;
494
545
  time_limit: TimeLimit;
546
+ working_limit: WorkingLimit;
495
547
  max_samples: MaxSamples;
496
548
  max_tasks: MaxTasks;
497
549
  max_subprocesses: MaxSubprocesses;
@@ -538,11 +590,21 @@ export interface EvalRevision {
538
590
  export interface Packages {
539
591
  [k: string]: string;
540
592
  }
593
+ export interface EvalScorer {
594
+ name: Name2;
595
+ options: Options;
596
+ metrics: Metrics;
597
+ metadata: Metadata1;
598
+ }
599
+ export interface EvalMetricDefinition {
600
+ name: Name3;
601
+ options: Options1;
602
+ }
541
603
  /**
542
604
  * Plan (solvers) used in evaluation.
543
605
  */
544
606
  export interface EvalPlan {
545
- name: Name2;
607
+ name: Name4;
546
608
  steps: Steps;
547
609
  finish: EvalPlanStep | null;
548
610
  config: GenerateConfig;
@@ -590,31 +652,31 @@ export interface EvalResults {
590
652
  total_samples: TotalSamples;
591
653
  completed_samples: CompletedSamples;
592
654
  scores: Scores;
593
- metadata: Metadata3;
655
+ metadata: Metadata4;
594
656
  }
595
657
  /**
596
658
  * Score for evaluation task.
597
659
  */
598
660
  export interface EvalScore {
599
- name: Name3;
661
+ name: Name5;
600
662
  scorer: Scorer;
601
663
  reducer: Reducer;
602
664
  params: Params2;
603
- metrics: Metrics;
604
- metadata: Metadata2;
665
+ metrics: Metrics2;
666
+ metadata: Metadata3;
605
667
  }
606
668
  export interface Params2 {}
607
- export interface Metrics {
669
+ export interface Metrics2 {
608
670
  [k: string]: EvalMetric;
609
671
  }
610
672
  /**
611
673
  * Metric for evaluation score.
612
674
  */
613
675
  export interface EvalMetric {
614
- name: Name4;
676
+ name: Name6;
615
677
  value: Value;
616
678
  params: Params3;
617
- metadata: Metadata1;
679
+ metadata: Metadata2;
618
680
  }
619
681
  export interface Params3 {}
620
682
  /**
@@ -661,10 +723,12 @@ export interface EvalSample {
661
723
  messages: Messages;
662
724
  output: ModelOutput;
663
725
  scores: Scores1;
664
- metadata: Metadata6;
726
+ metadata: Metadata7;
665
727
  store: Store;
666
728
  events: Events;
667
729
  model_usage: ModelUsage2;
730
+ total_time: TotalTime;
731
+ working_time: WorkingTime;
668
732
  error: EvalError | null;
669
733
  attachments: Attachments;
670
734
  limit: EvalSampleLimit | null;
@@ -767,7 +831,7 @@ export interface ModelOutput {
767
831
  choices: Choices1;
768
832
  usage: ModelUsage1 | null;
769
833
  time: Time;
770
- metadata: Metadata4;
834
+ metadata: Metadata5;
771
835
  error: Error;
772
836
  }
773
837
  /**
@@ -808,9 +872,9 @@ export interface Score {
808
872
  value: Value1;
809
873
  answer: Answer;
810
874
  explanation: Explanation;
811
- metadata: Metadata5;
875
+ metadata: Metadata6;
812
876
  }
813
- export interface Metadata6 {}
877
+ export interface Metadata7 {}
814
878
  export interface Store {}
815
879
  /**
816
880
  * Beginning of processing a Sample.
@@ -830,7 +894,7 @@ export interface Sample {
830
894
  choices: Choices2;
831
895
  target: Target1;
832
896
  id: Id2;
833
- metadata: Metadata7;
897
+ metadata: Metadata8;
834
898
  sandbox: SandboxEnvironmentSpec | null;
835
899
  files: Files1;
836
900
  setup: Setup1;
@@ -847,12 +911,27 @@ export interface SampleLimitEvent {
847
911
  limit: Limit1;
848
912
  }
849
913
  /**
850
- * Change to the current `TaskState`
914
+ * Sandbox execution or I/O
851
915
  */
852
- export interface StateEvent {
916
+ export interface SandboxEvent {
853
917
  timestamp: Timestamp2;
854
918
  pending: Pending2;
855
919
  event: Event2;
920
+ action: Action;
921
+ cmd: Cmd;
922
+ options: Options2;
923
+ file: File;
924
+ input: Input2;
925
+ result: Result;
926
+ output: Output;
927
+ }
928
+ /**
929
+ * Change to the current `TaskState`
930
+ */
931
+ export interface StateEvent {
932
+ timestamp: Timestamp3;
933
+ pending: Pending3;
934
+ event: Event3;
856
935
  changes: Changes;
857
936
  }
858
937
  /**
@@ -873,20 +952,20 @@ export interface JsonChange {
873
952
  * Change to data within the current `Store`.
874
953
  */
875
954
  export interface StoreEvent {
876
- timestamp: Timestamp3;
877
- pending: Pending3;
878
- event: Event3;
955
+ timestamp: Timestamp4;
956
+ pending: Pending4;
957
+ event: Event4;
879
958
  changes: Changes1;
880
959
  }
881
960
  /**
882
961
  * Call to a language model.
883
962
  */
884
963
  export interface ModelEvent {
885
- timestamp: Timestamp4;
886
- pending: Pending4;
887
- event: Event4;
964
+ timestamp: Timestamp5;
965
+ pending: Pending5;
966
+ event: Event5;
888
967
  model: Model2;
889
- input: Input2;
968
+ input: Input3;
890
969
  tools: Tools1;
891
970
  tool_choice: ToolChoice;
892
971
  config: GenerateConfig1;
@@ -922,7 +1001,7 @@ export interface ModelEvent {
922
1001
  * ```
923
1002
  */
924
1003
  export interface ToolInfo {
925
- name: Name5;
1004
+ name: Name7;
926
1005
  description: Description;
927
1006
  parameters: ToolParams;
928
1007
  }
@@ -956,7 +1035,7 @@ export interface Default {
956
1035
  [k: string]: unknown;
957
1036
  }
958
1037
  export interface ToolFunction {
959
- name: Name6;
1038
+ name: Name8;
960
1039
  }
961
1040
  /**
962
1041
  * Model generation options.
@@ -992,6 +1071,7 @@ export interface GenerateConfig1 {
992
1071
  export interface ModelCall {
993
1072
  request: Request;
994
1073
  response: Response;
1074
+ time: Time1;
995
1075
  }
996
1076
  export interface Request {
997
1077
  [k: string]: JsonValue;
@@ -1003,15 +1083,15 @@ export interface Response {
1003
1083
  * Call to a tool.
1004
1084
  */
1005
1085
  export interface ToolEvent {
1006
- timestamp: Timestamp5;
1007
- pending: Pending5;
1008
- event: Event5;
1086
+ timestamp: Timestamp6;
1087
+ pending: Pending6;
1088
+ event: Event6;
1009
1089
  type: Type10;
1010
1090
  id: Id3;
1011
1091
  function: Function2;
1012
1092
  arguments: Arguments1;
1013
1093
  view: ToolCallContent | null;
1014
- result: Result;
1094
+ result: Result1;
1015
1095
  truncated: Truncated;
1016
1096
  error: ToolCallError | null;
1017
1097
  events: Events1;
@@ -1023,9 +1103,9 @@ export interface Arguments1 {
1023
1103
  * Tool approval.
1024
1104
  */
1025
1105
  export interface ApprovalEvent {
1026
- timestamp: Timestamp6;
1027
- pending: Pending6;
1028
- event: Event6;
1106
+ timestamp: Timestamp7;
1107
+ pending: Pending7;
1108
+ event: Event7;
1029
1109
  message: Message3;
1030
1110
  call: ToolCall;
1031
1111
  view: ToolCallView | null;
@@ -1048,10 +1128,10 @@ export interface ToolCallView {
1048
1128
  * Input screen interaction.
1049
1129
  */
1050
1130
  export interface InputEvent {
1051
- timestamp: Timestamp7;
1052
- pending: Pending7;
1053
- event: Event7;
1054
- input: Input3;
1131
+ timestamp: Timestamp8;
1132
+ pending: Pending8;
1133
+ event: Event8;
1134
+ input: Input4;
1055
1135
  input_ansi: InputAnsi;
1056
1136
  }
1057
1137
  /**
@@ -1061,9 +1141,9 @@ export interface InputEvent {
1061
1141
  * resulting from a call to `score`.
1062
1142
  */
1063
1143
  export interface ScoreEvent {
1064
- timestamp: Timestamp8;
1065
- pending: Pending8;
1066
- event: Event8;
1144
+ timestamp: Timestamp9;
1145
+ pending: Pending9;
1146
+ event: Event9;
1067
1147
  score: Score;
1068
1148
  target: Target2;
1069
1149
  intermediate: Intermediate;
@@ -1072,25 +1152,25 @@ export interface ScoreEvent {
1072
1152
  * Event with sample error.
1073
1153
  */
1074
1154
  export interface ErrorEvent {
1075
- timestamp: Timestamp9;
1076
- pending: Pending9;
1077
- event: Event9;
1155
+ timestamp: Timestamp10;
1156
+ pending: Pending10;
1157
+ event: Event10;
1078
1158
  error: EvalError;
1079
1159
  }
1080
1160
  /**
1081
1161
  * Log message recorded with Python logger.
1082
1162
  */
1083
1163
  export interface LoggerEvent {
1084
- timestamp: Timestamp10;
1085
- pending: Pending10;
1086
- event: Event10;
1164
+ timestamp: Timestamp11;
1165
+ pending: Pending11;
1166
+ event: Event11;
1087
1167
  message: LoggingMessage;
1088
1168
  }
1089
1169
  /**
1090
1170
  * Message written to Python log.
1091
1171
  */
1092
1172
  export interface LoggingMessage {
1093
- name: Name7;
1173
+ name: Name9;
1094
1174
  level: Level;
1095
1175
  message: Message4;
1096
1176
  created: Created1;
@@ -1102,9 +1182,9 @@ export interface LoggingMessage {
1102
1182
  * Event with custom info/data.
1103
1183
  */
1104
1184
  export interface InfoEvent {
1105
- timestamp: Timestamp11;
1106
- pending: Pending11;
1107
- event: Event11;
1185
+ timestamp: Timestamp12;
1186
+ pending: Pending12;
1187
+ event: Event12;
1108
1188
  source: Source4;
1109
1189
  data: JsonValue;
1110
1190
  }
@@ -1112,28 +1192,28 @@ export interface InfoEvent {
1112
1192
  * Step within current sample or subtask.
1113
1193
  */
1114
1194
  export interface StepEvent {
1115
- timestamp: Timestamp12;
1116
- pending: Pending12;
1117
- event: Event12;
1118
- action: Action;
1195
+ timestamp: Timestamp13;
1196
+ pending: Pending13;
1197
+ event: Event13;
1198
+ action: Action1;
1119
1199
  type: Type11;
1120
- name: Name8;
1200
+ name: Name10;
1121
1201
  }
1122
1202
  /**
1123
1203
  * Subtask spawned.
1124
1204
  */
1125
1205
  export interface SubtaskEvent {
1126
- timestamp: Timestamp13;
1127
- pending: Pending13;
1128
- event: Event13;
1129
- name: Name9;
1206
+ timestamp: Timestamp14;
1207
+ pending: Pending14;
1208
+ event: Event14;
1209
+ name: Name11;
1130
1210
  type: Type12;
1131
- input: Input4;
1132
- result: Result1;
1211
+ input: Input5;
1212
+ result: Result2;
1133
1213
  events: Events2;
1134
1214
  }
1135
- export interface Input4 {}
1136
- export interface Result1 {
1215
+ export interface Input5 {}
1216
+ export interface Result2 {
1137
1217
  [k: string]: unknown;
1138
1218
  }
1139
1219
  export interface ModelUsage2 {
@@ -1164,6 +1244,6 @@ export interface EvalSampleScore {
1164
1244
  value: Value2;
1165
1245
  answer: Answer1;
1166
1246
  explanation: Explanation2;
1167
- metadata: Metadata8;
1247
+ metadata: Metadata9;
1168
1248
  sample_id: SampleId1;
1169
1249
  }
@@ -77,7 +77,7 @@ export const formatDataset = (
77
77
  */
78
78
  export const formatTime = (seconds: number): string => {
79
79
  if (seconds < 60) {
80
- return `${seconds} sec`;
80
+ return `${formatPrettyDecimal(seconds, 1)} sec`;
81
81
  } else if (seconds < 60 * 60) {
82
82
  return `${Math.floor(seconds / 60)} min ${seconds % 60} sec`;
83
83
  } else if (seconds < 60 * 60 * 24) {
@@ -97,15 +97,18 @@ export const formatTime = (seconds: number): string => {
97
97
  /**
98
98
  * Formats a number to a string with specific decimal places for prettiness.
99
99
  */
100
- export function formatPrettyDecimal(num: number): string {
100
+ export function formatPrettyDecimal(
101
+ num: number,
102
+ maxDecimals: number = 3,
103
+ ): string {
101
104
  const numDecimalPlaces = num.toString().includes(".")
102
105
  ? num.toString().split(".")[1].length
103
106
  : 0;
104
107
 
105
108
  if (numDecimalPlaces === 0) {
106
109
  return num.toFixed(1);
107
- } else if (numDecimalPlaces > 3) {
108
- return num.toFixed(3);
110
+ } else if (numDecimalPlaces > maxDecimals) {
111
+ return num.toFixed(maxDecimals);
109
112
  } else {
110
113
  return num.toString();
111
114
  }
@@ -78,6 +78,12 @@ export const WorkSpaceView: React.FC<WorkSpaceViewProps> = ({
78
78
  },
79
79
  [setSelectedTab],
80
80
  );
81
+ const handleScroll = useCallback(
82
+ (tabid: string, position: number) => {
83
+ onScroll(tabid, position);
84
+ },
85
+ [onScroll],
86
+ );
81
87
 
82
88
  if (evalSpec === undefined) {
83
89
  return <EmptyPanel />;
@@ -138,12 +144,9 @@ export const WorkSpaceView: React.FC<WorkSpaceViewProps> = ({
138
144
  scrollPosition={
139
145
  workspaceTabScrollPositionRef.current?.[tab.id]
140
146
  }
141
- setScrollPosition={useCallback(
142
- (position: number) => {
143
- onScroll(tab.id, position);
144
- },
145
- [onScroll],
146
- )}
147
+ setScrollPosition={(position: number) => {
148
+ handleScroll(tab.id, position);
149
+ }}
147
150
  >
148
151
  {tab.content()}
149
152
  </TabPanel>