inspect-ai 0.3.63__py3-none-any.whl → 0.3.65__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (182) hide show
  1. inspect_ai/_cli/cache.py +8 -7
  2. inspect_ai/_cli/common.py +0 -12
  3. inspect_ai/_cli/eval.py +32 -4
  4. inspect_ai/_cli/info.py +1 -0
  5. inspect_ai/_cli/list.py +1 -1
  6. inspect_ai/_cli/log.py +2 -0
  7. inspect_ai/_cli/sandbox.py +4 -1
  8. inspect_ai/_cli/score.py +181 -32
  9. inspect_ai/_cli/trace.py +2 -0
  10. inspect_ai/_cli/view.py +4 -2
  11. inspect_ai/_display/core/config.py +7 -1
  12. inspect_ai/_display/core/progress.py +1 -1
  13. inspect_ai/_display/textual/app.py +8 -4
  14. inspect_ai/_display/textual/widgets/samples.py +6 -5
  15. inspect_ai/_display/textual/widgets/sandbox.py +6 -0
  16. inspect_ai/_eval/__init__.py +0 -0
  17. inspect_ai/_eval/eval.py +100 -97
  18. inspect_ai/_eval/evalset.py +69 -69
  19. inspect_ai/_eval/loader.py +122 -12
  20. inspect_ai/_eval/registry.py +1 -1
  21. inspect_ai/_eval/run.py +14 -0
  22. inspect_ai/_eval/score.py +125 -36
  23. inspect_ai/_eval/task/log.py +105 -4
  24. inspect_ai/_eval/task/results.py +92 -38
  25. inspect_ai/_eval/task/run.py +6 -2
  26. inspect_ai/_eval/task/sandbox.py +35 -2
  27. inspect_ai/_eval/task/task.py +49 -46
  28. inspect_ai/_util/__init__.py +0 -0
  29. inspect_ai/_util/constants.py +1 -1
  30. inspect_ai/_util/content.py +8 -0
  31. inspect_ai/_util/error.py +2 -0
  32. inspect_ai/_util/file.py +15 -1
  33. inspect_ai/_util/logger.py +4 -2
  34. inspect_ai/_util/registry.py +7 -1
  35. inspect_ai/_view/view.py +1 -2
  36. inspect_ai/_view/www/App.css +8 -3
  37. inspect_ai/_view/www/README.md +1 -1
  38. inspect_ai/_view/www/dist/assets/index.css +66 -38
  39. inspect_ai/_view/www/dist/assets/index.js +525 -523
  40. inspect_ai/_view/www/log-schema.json +86 -73
  41. inspect_ai/_view/www/package.json +1 -1
  42. inspect_ai/_view/www/src/App.tsx +1 -0
  43. inspect_ai/_view/www/src/components/AnsiDisplay.tsx +1 -1
  44. inspect_ai/_view/www/src/components/JsonPanel.tsx +1 -1
  45. inspect_ai/_view/www/src/components/LargeModal.tsx +39 -49
  46. inspect_ai/_view/www/src/components/NavPills.tsx +3 -1
  47. inspect_ai/_view/www/src/components/TabSet.tsx +19 -4
  48. inspect_ai/_view/www/src/logfile/remoteLogFile.ts +0 -1
  49. inspect_ai/_view/www/src/metadata/MetaDataGrid.tsx +1 -1
  50. inspect_ai/_view/www/src/metadata/MetaDataView.tsx +1 -1
  51. inspect_ai/_view/www/src/metadata/RenderedContent.tsx +6 -13
  52. inspect_ai/_view/www/src/plan/PlanDetailView.tsx +17 -2
  53. inspect_ai/_view/www/src/plan/SolverDetailView.tsx +1 -1
  54. inspect_ai/_view/www/src/samples/SampleDisplay.tsx +14 -5
  55. inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +4 -2
  56. inspect_ai/_view/www/src/samples/SamplesTools.tsx +16 -24
  57. inspect_ai/_view/www/src/samples/chat/ChatMessage.tsx +1 -1
  58. inspect_ai/_view/www/src/samples/chat/ChatView.tsx +1 -0
  59. inspect_ai/_view/www/src/samples/chat/MessageContent.tsx +27 -13
  60. inspect_ai/_view/www/src/samples/chat/MessageContents.tsx +19 -17
  61. inspect_ai/_view/www/src/samples/chat/tools/ToolCallView.tsx +12 -10
  62. inspect_ai/_view/www/src/samples/chat/tools/ToolInput.tsx +56 -66
  63. inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.tsx +12 -5
  64. inspect_ai/_view/www/src/samples/chat/tools/tool.ts +21 -36
  65. inspect_ai/_view/www/src/samples/descriptor/samplesDescriptor.tsx +3 -1
  66. inspect_ai/_view/www/src/samples/sample-tools/SelectScorer.tsx +27 -25
  67. inspect_ai/_view/www/src/samples/sample-tools/SortFilter.tsx +5 -1
  68. inspect_ai/_view/www/src/samples/scores/SampleScoreView.module.css +13 -13
  69. inspect_ai/_view/www/src/samples/transcript/InfoEventView.tsx +1 -1
  70. inspect_ai/_view/www/src/samples/transcript/ModelEventView.tsx +2 -2
  71. inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.tsx +9 -5
  72. inspect_ai/_view/www/src/samples/transcript/ScoreEventView.tsx +1 -1
  73. inspect_ai/_view/www/src/samples/transcript/ToolEventView.tsx +5 -4
  74. inspect_ai/_view/www/src/samples/transcript/event/EventNavs.tsx +1 -0
  75. inspect_ai/_view/www/src/samples/transcript/event/EventPanel.tsx +1 -0
  76. inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.tsx +17 -6
  77. inspect_ai/_view/www/src/samples/transcript/state/StateEventView.tsx +14 -19
  78. inspect_ai/_view/www/src/types/log.d.ts +107 -19
  79. inspect_ai/_view/www/src/usage/ModelTokenTable.tsx +7 -1
  80. inspect_ai/_view/www/src/usage/ModelUsagePanel.tsx +5 -3
  81. inspect_ai/_view/www/src/workspace/WorkSpaceView.tsx +25 -27
  82. inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.tsx +12 -11
  83. inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.module.css +25 -2
  84. inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.tsx +60 -36
  85. inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.tsx +4 -0
  86. inspect_ai/_view/www/src/workspace/sidebar/SidebarScoreView.tsx +6 -4
  87. inspect_ai/_view/www/src/workspace/sidebar/SidebarScoresView.tsx +16 -14
  88. inspect_ai/_view/www/src/workspace/tabs/InfoTab.tsx +9 -19
  89. inspect_ai/_view/www/src/workspace/utils.ts +34 -0
  90. inspect_ai/approval/_approval.py +2 -0
  91. inspect_ai/approval/_approver.py +4 -4
  92. inspect_ai/approval/_auto.py +1 -1
  93. inspect_ai/approval/_human/approver.py +3 -0
  94. inspect_ai/approval/_policy.py +5 -0
  95. inspect_ai/approval/_registry.py +2 -2
  96. inspect_ai/dataset/_dataset.py +36 -45
  97. inspect_ai/dataset/_sources/__init__.py +0 -0
  98. inspect_ai/dataset/_sources/csv.py +13 -13
  99. inspect_ai/dataset/_sources/hf.py +29 -29
  100. inspect_ai/dataset/_sources/json.py +10 -10
  101. inspect_ai/log/__init__.py +2 -0
  102. inspect_ai/log/_convert.py +3 -3
  103. inspect_ai/log/_file.py +24 -9
  104. inspect_ai/log/_log.py +98 -7
  105. inspect_ai/log/_message.py +3 -1
  106. inspect_ai/log/_recorders/file.py +4 -0
  107. inspect_ai/log/_recorders/recorder.py +3 -0
  108. inspect_ai/log/_transcript.py +19 -8
  109. inspect_ai/model/__init__.py +2 -0
  110. inspect_ai/model/_cache.py +39 -21
  111. inspect_ai/model/_call_tools.py +2 -2
  112. inspect_ai/model/_chat_message.py +14 -4
  113. inspect_ai/model/_generate_config.py +1 -1
  114. inspect_ai/model/_model.py +31 -24
  115. inspect_ai/model/_model_output.py +14 -1
  116. inspect_ai/model/_openai.py +10 -18
  117. inspect_ai/model/_providers/google.py +9 -5
  118. inspect_ai/model/_providers/openai.py +5 -9
  119. inspect_ai/model/_providers/openrouter.py +1 -1
  120. inspect_ai/scorer/__init__.py +6 -1
  121. inspect_ai/scorer/_answer.py +1 -1
  122. inspect_ai/scorer/_classification.py +4 -0
  123. inspect_ai/scorer/_match.py +4 -5
  124. inspect_ai/scorer/_metric.py +87 -28
  125. inspect_ai/scorer/_metrics/__init__.py +3 -3
  126. inspect_ai/scorer/_metrics/accuracy.py +8 -10
  127. inspect_ai/scorer/_metrics/mean.py +3 -17
  128. inspect_ai/scorer/_metrics/std.py +111 -30
  129. inspect_ai/scorer/_model.py +12 -12
  130. inspect_ai/scorer/_pattern.py +3 -3
  131. inspect_ai/scorer/_reducer/reducer.py +36 -21
  132. inspect_ai/scorer/_reducer/registry.py +2 -2
  133. inspect_ai/scorer/_reducer/types.py +7 -1
  134. inspect_ai/scorer/_score.py +11 -1
  135. inspect_ai/scorer/_scorer.py +110 -16
  136. inspect_ai/solver/__init__.py +1 -1
  137. inspect_ai/solver/_basic_agent.py +19 -22
  138. inspect_ai/solver/_bridge/__init__.py +0 -3
  139. inspect_ai/solver/_bridge/bridge.py +3 -3
  140. inspect_ai/solver/_chain.py +1 -2
  141. inspect_ai/solver/_critique.py +3 -3
  142. inspect_ai/solver/_fork.py +2 -2
  143. inspect_ai/solver/_human_agent/__init__.py +0 -0
  144. inspect_ai/solver/_human_agent/agent.py +5 -8
  145. inspect_ai/solver/_human_agent/commands/clock.py +14 -10
  146. inspect_ai/solver/_human_agent/commands/note.py +1 -1
  147. inspect_ai/solver/_human_agent/commands/score.py +0 -11
  148. inspect_ai/solver/_multiple_choice.py +15 -18
  149. inspect_ai/solver/_prompt.py +7 -7
  150. inspect_ai/solver/_solver.py +53 -52
  151. inspect_ai/solver/_task_state.py +80 -69
  152. inspect_ai/solver/_use_tools.py +9 -9
  153. inspect_ai/tool/__init__.py +2 -1
  154. inspect_ai/tool/_tool.py +43 -14
  155. inspect_ai/tool/_tool_call.py +6 -2
  156. inspect_ai/tool/_tool_choice.py +3 -1
  157. inspect_ai/tool/_tool_def.py +10 -8
  158. inspect_ai/tool/_tool_params.py +24 -0
  159. inspect_ai/tool/_tool_with.py +7 -7
  160. inspect_ai/tool/_tools/__init__.py +0 -0
  161. inspect_ai/tool/_tools/_computer/_common.py +2 -2
  162. inspect_ai/tool/_tools/_computer/_computer.py +11 -0
  163. inspect_ai/tool/_tools/_execute.py +15 -9
  164. inspect_ai/tool/_tools/_web_browser/_resources/README.md +2 -2
  165. inspect_ai/tool/_tools/_web_browser/_web_browser.py +5 -3
  166. inspect_ai/tool/_tools/_web_search.py +7 -5
  167. inspect_ai/util/_concurrency.py +3 -3
  168. inspect_ai/util/_panel.py +2 -0
  169. inspect_ai/util/_resource.py +12 -12
  170. inspect_ai/util/_sandbox/docker/compose.py +23 -20
  171. inspect_ai/util/_sandbox/docker/config.py +2 -1
  172. inspect_ai/util/_sandbox/docker/docker.py +10 -1
  173. inspect_ai/util/_sandbox/docker/service.py +100 -0
  174. inspect_ai/util/_sandbox/environment.py +99 -96
  175. inspect_ai/util/_subprocess.py +5 -3
  176. inspect_ai/util/_subtask.py +15 -16
  177. {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/LICENSE +1 -1
  178. {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/METADATA +10 -6
  179. {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/RECORD +182 -175
  180. {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/WHEEL +0 -0
  181. {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/entry_points.txt +0 -0
  182. {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/top_level.txt +0 -0
@@ -157,6 +157,7 @@
157
157
  "type": "object"
158
158
  },
159
159
  "ChatCompletionChoice": {
160
+ "description": "Choice generated for completion.",
160
161
  "properties": {
161
162
  "message": {
162
163
  "$ref": "#/$defs/ChatMessageAssistant"
@@ -196,7 +197,14 @@
196
197
  "additionalProperties": false
197
198
  },
198
199
  "ChatMessageAssistant": {
200
+ "description": "Assistant chat message.",
199
201
  "properties": {
202
+ "role": {
203
+ "const": "assistant",
204
+ "default": "assistant",
205
+ "title": "Role",
206
+ "type": "string"
207
+ },
200
208
  "content": {
201
209
  "anyOf": [
202
210
  {
@@ -240,12 +248,6 @@
240
248
  "default": null,
241
249
  "title": "Source"
242
250
  },
243
- "role": {
244
- "const": "assistant",
245
- "default": "assistant",
246
- "title": "Role",
247
- "type": "string"
248
- },
249
251
  "tool_calls": {
250
252
  "anyOf": [
251
253
  {
@@ -275,9 +277,9 @@
275
277
  }
276
278
  },
277
279
  "required": [
280
+ "role",
278
281
  "content",
279
282
  "source",
280
- "role",
281
283
  "tool_calls",
282
284
  "reasoning"
283
285
  ],
@@ -286,7 +288,14 @@
286
288
  "additionalProperties": false
287
289
  },
288
290
  "ChatMessageSystem": {
291
+ "description": "System chat message.",
289
292
  "properties": {
293
+ "role": {
294
+ "const": "system",
295
+ "default": "system",
296
+ "title": "Role",
297
+ "type": "string"
298
+ },
290
299
  "content": {
291
300
  "anyOf": [
292
301
  {
@@ -329,25 +338,26 @@
329
338
  ],
330
339
  "default": null,
331
340
  "title": "Source"
332
- },
333
- "role": {
334
- "const": "system",
335
- "default": "system",
336
- "title": "Role",
337
- "type": "string"
338
341
  }
339
342
  },
340
343
  "required": [
344
+ "role",
341
345
  "content",
342
- "source",
343
- "role"
346
+ "source"
344
347
  ],
345
348
  "title": "ChatMessageSystem",
346
349
  "type": "object",
347
350
  "additionalProperties": false
348
351
  },
349
352
  "ChatMessageTool": {
353
+ "description": "Tool chat message.",
350
354
  "properties": {
355
+ "role": {
356
+ "const": "tool",
357
+ "default": "tool",
358
+ "title": "Role",
359
+ "type": "string"
360
+ },
351
361
  "content": {
352
362
  "anyOf": [
353
363
  {
@@ -391,12 +401,6 @@
391
401
  "default": null,
392
402
  "title": "Source"
393
403
  },
394
- "role": {
395
- "const": "tool",
396
- "default": "tool",
397
- "title": "Role",
398
- "type": "string"
399
- },
400
404
  "tool_call_id": {
401
405
  "anyOf": [
402
406
  {
@@ -434,9 +438,9 @@
434
438
  }
435
439
  },
436
440
  "required": [
441
+ "role",
437
442
  "content",
438
443
  "source",
439
- "role",
440
444
  "tool_call_id",
441
445
  "function",
442
446
  "error"
@@ -446,7 +450,14 @@
446
450
  "additionalProperties": false
447
451
  },
448
452
  "ChatMessageUser": {
453
+ "description": "User chat message.",
449
454
  "properties": {
455
+ "role": {
456
+ "const": "user",
457
+ "default": "user",
458
+ "title": "Role",
459
+ "type": "string"
460
+ },
450
461
  "content": {
451
462
  "anyOf": [
452
463
  {
@@ -490,12 +501,6 @@
490
501
  "default": null,
491
502
  "title": "Source"
492
503
  },
493
- "role": {
494
- "const": "user",
495
- "default": "user",
496
- "title": "Role",
497
- "type": "string"
498
- },
499
504
  "tool_call_id": {
500
505
  "anyOf": [
501
506
  {
@@ -513,9 +518,9 @@
513
518
  }
514
519
  },
515
520
  "required": [
521
+ "role",
516
522
  "content",
517
523
  "source",
518
- "role",
519
524
  "tool_call_id"
520
525
  ],
521
526
  "title": "ChatMessageUser",
@@ -523,6 +528,7 @@
523
528
  "additionalProperties": false
524
529
  },
525
530
  "ContentAudio": {
531
+ "description": "Audio content.",
526
532
  "properties": {
527
533
  "type": {
528
534
  "const": "audio",
@@ -553,6 +559,7 @@
553
559
  "additionalProperties": false
554
560
  },
555
561
  "ContentImage": {
562
+ "description": "Image content.",
556
563
  "properties": {
557
564
  "type": {
558
565
  "const": "image",
@@ -585,6 +592,7 @@
585
592
  "additionalProperties": false
586
593
  },
587
594
  "ContentText": {
595
+ "description": "Text content.",
588
596
  "properties": {
589
597
  "type": {
590
598
  "const": "text",
@@ -606,6 +614,7 @@
606
614
  "additionalProperties": false
607
615
  },
608
616
  "ContentVideo": {
617
+ "description": "Video content.",
609
618
  "properties": {
610
619
  "type": {
611
620
  "const": "video",
@@ -677,6 +686,7 @@
677
686
  "additionalProperties": false
678
687
  },
679
688
  "EvalConfig": {
689
+ "description": "Configuration used for evaluation.",
680
690
  "properties": {
681
691
  "limit": {
682
692
  "anyOf": [
@@ -954,6 +964,7 @@
954
964
  "additionalProperties": false
955
965
  },
956
966
  "EvalDataset": {
967
+ "description": "Dataset used for evaluation.",
957
968
  "properties": {
958
969
  "name": {
959
970
  "anyOf": [
@@ -1038,6 +1049,7 @@
1038
1049
  "additionalProperties": false
1039
1050
  },
1040
1051
  "EvalError": {
1052
+ "description": "Eval error details.",
1041
1053
  "properties": {
1042
1054
  "message": {
1043
1055
  "title": "Message",
@@ -1062,6 +1074,7 @@
1062
1074
  "additionalProperties": false
1063
1075
  },
1064
1076
  "EvalMetric": {
1077
+ "description": "Metric for evaluation score.",
1065
1078
  "properties": {
1066
1079
  "name": {
1067
1080
  "title": "Name",
@@ -1078,8 +1091,8 @@
1078
1091
  ],
1079
1092
  "title": "Value"
1080
1093
  },
1081
- "options": {
1082
- "title": "Options",
1094
+ "params": {
1095
+ "title": "Params",
1083
1096
  "type": "object"
1084
1097
  },
1085
1098
  "metadata": {
@@ -1098,7 +1111,7 @@
1098
1111
  "required": [
1099
1112
  "name",
1100
1113
  "value",
1101
- "options",
1114
+ "params",
1102
1115
  "metadata"
1103
1116
  ],
1104
1117
  "title": "EvalMetric",
@@ -1106,6 +1119,7 @@
1106
1119
  "additionalProperties": false
1107
1120
  },
1108
1121
  "EvalPlan": {
1122
+ "description": "Plan (solvers) used in evaluation.",
1109
1123
  "properties": {
1110
1124
  "name": {
1111
1125
  "default": "plan",
@@ -1171,6 +1185,7 @@
1171
1185
  "additionalProperties": false
1172
1186
  },
1173
1187
  "EvalPlanStep": {
1188
+ "description": "Solver step.",
1174
1189
  "properties": {
1175
1190
  "solver": {
1176
1191
  "title": "Solver",
@@ -1190,6 +1205,7 @@
1190
1205
  "additionalProperties": false
1191
1206
  },
1192
1207
  "EvalResults": {
1208
+ "description": "Scoring results from evaluation.",
1193
1209
  "properties": {
1194
1210
  "total_samples": {
1195
1211
  "default": 0,
@@ -1233,6 +1249,7 @@
1233
1249
  "additionalProperties": false
1234
1250
  },
1235
1251
  "EvalRevision": {
1252
+ "description": "Git revision for evaluation.",
1236
1253
  "properties": {
1237
1254
  "type": {
1238
1255
  "const": "git",
@@ -1258,6 +1275,7 @@
1258
1275
  "additionalProperties": false
1259
1276
  },
1260
1277
  "EvalSample": {
1278
+ "description": "Sample from evaluation task.",
1261
1279
  "properties": {
1262
1280
  "id": {
1263
1281
  "anyOf": [
@@ -1526,6 +1544,7 @@
1526
1544
  "additionalProperties": false
1527
1545
  },
1528
1546
  "EvalSampleLimit": {
1547
+ "description": "Limit encontered by sample.",
1529
1548
  "properties": {
1530
1549
  "type": {
1531
1550
  "enum": [
@@ -1553,6 +1572,7 @@
1553
1572
  "additionalProperties": false
1554
1573
  },
1555
1574
  "EvalSampleReductions": {
1575
+ "description": "Score reductions.",
1556
1576
  "properties": {
1557
1577
  "scorer": {
1558
1578
  "title": "Scorer",
@@ -1588,6 +1608,7 @@
1588
1608
  "additionalProperties": false
1589
1609
  },
1590
1610
  "EvalSampleScore": {
1611
+ "description": "Score and sample_id scored.",
1591
1612
  "properties": {
1592
1613
  "value": {
1593
1614
  "anyOf": [
@@ -1711,6 +1732,7 @@
1711
1732
  "additionalProperties": false
1712
1733
  },
1713
1734
  "EvalScore": {
1735
+ "description": "Score for evaluation task.",
1714
1736
  "properties": {
1715
1737
  "name": {
1716
1738
  "title": "Name",
@@ -1769,6 +1791,7 @@
1769
1791
  "additionalProperties": false
1770
1792
  },
1771
1793
  "EvalSpec": {
1794
+ "description": "Eval target and configuration.",
1772
1795
  "properties": {
1773
1796
  "run_id": {
1774
1797
  "title": "Run Id",
@@ -1945,6 +1968,7 @@
1945
1968
  "additionalProperties": false
1946
1969
  },
1947
1970
  "EvalStats": {
1971
+ "description": "Timing and usage statistics.",
1948
1972
  "properties": {
1949
1973
  "started_at": {
1950
1974
  "title": "Started At",
@@ -1972,7 +1996,7 @@
1972
1996
  "additionalProperties": false
1973
1997
  },
1974
1998
  "GenerateConfig": {
1975
- "description": "Base class for model generation configs.",
1999
+ "description": "Model generation options.",
1976
2000
  "properties": {
1977
2001
  "max_retries": {
1978
2002
  "anyOf": [
@@ -2321,6 +2345,18 @@
2321
2345
  "title": "Event",
2322
2346
  "type": "string"
2323
2347
  },
2348
+ "source": {
2349
+ "anyOf": [
2350
+ {
2351
+ "type": "string"
2352
+ },
2353
+ {
2354
+ "type": "null"
2355
+ }
2356
+ ],
2357
+ "default": null,
2358
+ "title": "Source"
2359
+ },
2324
2360
  "data": {
2325
2361
  "$ref": "#/$defs/JsonValue"
2326
2362
  }
@@ -2329,6 +2365,7 @@
2329
2365
  "timestamp",
2330
2366
  "pending",
2331
2367
  "event",
2368
+ "source",
2332
2369
  "data"
2333
2370
  ],
2334
2371
  "title": "InfoEvent",
@@ -2474,6 +2511,7 @@
2474
2511
  "additionalProperties": false
2475
2512
  },
2476
2513
  "LoggingMessage": {
2514
+ "description": "Message written to Python log.",
2477
2515
  "properties": {
2478
2516
  "name": {
2479
2517
  "anyOf": [
@@ -2772,6 +2810,7 @@
2772
2810
  "additionalProperties": false
2773
2811
  },
2774
2812
  "ModelOutput": {
2813
+ "description": "Output from model generation.",
2775
2814
  "properties": {
2776
2815
  "model": {
2777
2816
  "title": "Model",
@@ -2846,6 +2885,7 @@
2846
2885
  "additionalProperties": false
2847
2886
  },
2848
2887
  "ModelUsage": {
2888
+ "description": "Token usage for completion.",
2849
2889
  "properties": {
2850
2890
  "input_tokens": {
2851
2891
  "default": 0,
@@ -2899,6 +2939,7 @@
2899
2939
  "additionalProperties": false
2900
2940
  },
2901
2941
  "Sample": {
2942
+ "description": "Sample for an evaluation task.",
2902
2943
  "properties": {
2903
2944
  "input": {
2904
2945
  "anyOf": [
@@ -3173,7 +3214,7 @@
3173
3214
  "type": "array"
3174
3215
  },
3175
3216
  "Score": {
3176
- "description": "Score generated by a scorer.\n\nArgs:\n value (Value): Score value.\n answer (str | None): Answer extracted from model output (optional).\n explanation (str | None): Explanation of score (optional).\n metadata (dict[str,Any]): Additional metadata related to the score.",
3217
+ "description": "Score generated by a scorer.",
3177
3218
  "properties": {
3178
3219
  "value": {
3179
3220
  "anyOf": [
@@ -3281,7 +3322,7 @@
3281
3322
  "additionalProperties": false
3282
3323
  },
3283
3324
  "ScoreEvent": {
3284
- "description": "Event with sample score.",
3325
+ "description": "Event with score.\n\nCan be the final score for a `Sample`, or can be an intermediate score\nresulting from a call to `score`.",
3285
3326
  "properties": {
3286
3327
  "timestamp": {
3287
3328
  "format": "date-time",
@@ -3326,6 +3367,11 @@
3326
3367
  ],
3327
3368
  "default": null,
3328
3369
  "title": "Target"
3370
+ },
3371
+ "intermediate": {
3372
+ "default": false,
3373
+ "title": "Intermediate",
3374
+ "type": "boolean"
3329
3375
  }
3330
3376
  },
3331
3377
  "required": [
@@ -3333,7 +3379,8 @@
3333
3379
  "pending",
3334
3380
  "event",
3335
3381
  "score",
3336
- "target"
3382
+ "target",
3383
+ "intermediate"
3337
3384
  ],
3338
3385
  "title": "ScoreEvent",
3339
3386
  "type": "object",
@@ -4223,6 +4270,7 @@
4223
4270
  "additionalProperties": false
4224
4271
  }
4225
4272
  },
4273
+ "description": "Evaluation log.",
4226
4274
  "properties": {
4227
4275
  "version": {
4228
4276
  "default": 2,
@@ -4244,37 +4292,7 @@
4244
4292
  "$ref": "#/$defs/EvalSpec"
4245
4293
  },
4246
4294
  "plan": {
4247
- "$ref": "#/$defs/EvalPlan",
4248
- "default": {
4249
- "name": "plan",
4250
- "steps": [],
4251
- "finish": null,
4252
- "config": {
4253
- "best_of": null,
4254
- "cache_prompt": null,
4255
- "frequency_penalty": null,
4256
- "internal_tools": null,
4257
- "logit_bias": null,
4258
- "logprobs": null,
4259
- "max_connections": null,
4260
- "max_retries": null,
4261
- "max_tokens": null,
4262
- "max_tool_output": null,
4263
- "num_choices": null,
4264
- "parallel_tool_calls": null,
4265
- "presence_penalty": null,
4266
- "reasoning_effort": null,
4267
- "reasoning_history": null,
4268
- "seed": null,
4269
- "stop_seqs": null,
4270
- "system_message": null,
4271
- "temperature": null,
4272
- "timeout": null,
4273
- "top_k": null,
4274
- "top_logprobs": null,
4275
- "top_p": null
4276
- }
4277
- }
4295
+ "$ref": "#/$defs/EvalPlan"
4278
4296
  },
4279
4297
  "results": {
4280
4298
  "anyOf": [
@@ -4288,12 +4306,7 @@
4288
4306
  "default": null
4289
4307
  },
4290
4308
  "stats": {
4291
- "$ref": "#/$defs/EvalStats",
4292
- "default": {
4293
- "started_at": "",
4294
- "completed_at": "",
4295
- "model_usage": {}
4296
- }
4309
+ "$ref": "#/$defs/EvalStats"
4297
4310
  },
4298
4311
  "error": {
4299
4312
  "anyOf": [
@@ -8,7 +8,7 @@
8
8
  "scripts": {
9
9
  "build": "vite build",
10
10
  "watch": "vite build --watch",
11
- "dev-watch": "vite build --mode development --watch",
11
+ "dev-watch": "NODE_ENV=development vite build --mode development --watch",
12
12
  "dev": "vite",
13
13
  "prettier:check": "prettier --check src",
14
14
  "prettier:write": "prettier --write src",
@@ -990,6 +990,7 @@ const defaultScorers = (log: EvalSummary): Array<ScorerInfo> => {
990
990
  }, [] as Array<ScorerInfo>);
991
991
  } else if (log.sampleSummaries && log.sampleSummaries.length > 0) {
992
992
  const scores = log.sampleSummaries[0].scores;
993
+
993
994
  if (scores !== null) {
994
995
  return Object.keys(scores).map((key) => {
995
996
  return {
@@ -1,6 +1,6 @@
1
1
  import { ANSIColor, ANSIOutput, ANSIOutputRun, ANSIStyle } from "ansi-output";
2
2
  import clsx from "clsx";
3
- import "./ANSIDisplay.css";
3
+ import "./AnsiDisplay.css";
4
4
 
5
5
  interface ANSIDisplayProps {
6
6
  output: string;
@@ -1,7 +1,7 @@
1
1
  import clsx from "clsx";
2
2
  import { highlightElement } from "prismjs";
3
3
  import React, { useEffect, useMemo, useRef } from "react";
4
- import "./JSONPanel.css";
4
+ import "./JsonPanel.css";
5
5
 
6
6
  const kPrismRenderMaxSize = 250000;
7
7
 
@@ -78,54 +78,6 @@ export const LargeModal: React.FC<LargeModalProps> = ({
78
78
  [setInitialScrollPosition],
79
79
  );
80
80
 
81
- // Capture header elements
82
- const headerEls = [];
83
- // The title
84
- headerEls.push(
85
- <div className={clsx("modal-title", "text-size-smaller", styles.title)}>
86
- {title || ""}
87
- </div>,
88
- );
89
-
90
- // A centered text element with tools to the left and right
91
- if (detail) {
92
- headerEls.push(
93
- <div className={styles.detail}>
94
- {detailTools?.left
95
- ? detailTools.left.map((tool) => {
96
- return <TitleTool {...tool} />;
97
- })
98
- : ""}
99
- <div className={clsx("text-size-smaller", styles.detailText)}>
100
- <div>{detail}</div>
101
- </div>
102
-
103
- {detailTools?.right
104
- ? detailTools.right.map((tool) => {
105
- return <TitleTool {...tool} />;
106
- })
107
- : ""}
108
- </div>,
109
- );
110
- }
111
-
112
- // The close 'x'
113
- headerEls.push(
114
- <button
115
- type="button"
116
- className={clsx(
117
- "btn",
118
- "btn-close-large-dialog",
119
- "text-size-larger",
120
- styles.close,
121
- )}
122
- onClick={onHide}
123
- aria-label="Close"
124
- >
125
- <HtmlEntity html={"&times;"} />
126
- </button>,
127
- );
128
-
129
81
  return (
130
82
  <div
131
83
  id={id}
@@ -147,7 +99,45 @@ export const LargeModal: React.FC<LargeModalProps> = ({
147
99
  role="document"
148
100
  >
149
101
  <div className={clsx("modal-content", styles.content)}>
150
- <div className={clsx("modal-header", styles.header)}>{headerEls}</div>
102
+ <div className={clsx("modal-header", styles.header)}>
103
+ <div
104
+ className={clsx("modal-title", "text-size-smaller", styles.title)}
105
+ >
106
+ {title || ""}
107
+ </div>
108
+
109
+ {detail ? (
110
+ <div className={styles.detail}>
111
+ {detailTools?.left
112
+ ? detailTools.left.map((tool, idx) => {
113
+ return <TitleTool key={`tool-left-${idx}`} {...tool} />;
114
+ })
115
+ : ""}
116
+ <div className={clsx("text-size-smaller", styles.detailText)}>
117
+ <div>{detail}</div>
118
+ </div>
119
+
120
+ {detailTools?.right
121
+ ? detailTools.right.map((tool, idx) => {
122
+ return <TitleTool key={`tool-right-${idx}`} {...tool} />;
123
+ })
124
+ : ""}
125
+ </div>
126
+ ) : undefined}
127
+ <button
128
+ type="button"
129
+ className={clsx(
130
+ "btn",
131
+ "btn-close-large-dialog",
132
+ "text-size-larger",
133
+ styles.close,
134
+ )}
135
+ onClick={onHide}
136
+ aria-label="Close"
137
+ >
138
+ <HtmlEntity html={"&times;"} />
139
+ </button>
140
+ </div>
151
141
  <ProgressBar animating={showProgress} />
152
142
  <div className={"modal-body"} ref={scrollRef} onScroll={onScroll}>
153
143
  {children}
@@ -26,6 +26,7 @@ export const NavPills: React.FC<NavPillsProps> = ({ children }) => {
26
26
  : `Tab ${idx}`;
27
27
  return (
28
28
  <NavPill
29
+ key={`nav-pill-contents-${idx}`}
29
30
  title={title}
30
31
  activeItem={activeItem}
31
32
  setActiveItem={setActiveItem}
@@ -34,9 +35,10 @@ export const NavPills: React.FC<NavPillsProps> = ({ children }) => {
34
35
  });
35
36
 
36
37
  // Wrap each of the children in a 'body' to control its visibility
37
- const navBodies = children.map((child) => {
38
+ const navBodies = children.map((child, idx) => {
38
39
  return (
39
40
  <div
41
+ key={`nav-pill-container-${idx}`}
40
42
  className={
41
43
  child["props"]?.title === activeItem ? styles.visible : styles.hidden
42
44
  }
@@ -1,6 +1,8 @@
1
1
  import clsx from "clsx";
2
2
  import {
3
+ Children,
3
4
  Fragment,
5
+ isValidElement,
4
6
  MouseEvent,
5
7
  ReactElement,
6
8
  useCallback,
@@ -46,10 +48,7 @@ export const TabSet: React.FC<TabSetProps> = ({
46
48
  tools,
47
49
  children,
48
50
  }) => {
49
- const validTabs: ReactElement<TabPanelProps>[] = Array.isArray(children)
50
- ? (children.filter(Boolean) as ReactElement<TabPanelProps>[])
51
- : [children];
52
-
51
+ const validTabs = flattenChildren(children);
53
52
  if (validTabs.length === 0) return null;
54
53
 
55
54
  return (
@@ -198,3 +197,19 @@ const TabTools: React.FC<{ tools?: React.ReactNode }> = ({ tools }) => (
198
197
  // Utility functions
199
198
  const computeTabId = (id: string, index: number) => `${id}-${index}`;
200
199
  const computeTabContentsId = (id: string) => `${id}-contents`;
200
+
201
+ const flattenChildren = (
202
+ children: React.ReactNode,
203
+ ): ReactElement<TabPanelProps>[] => {
204
+ return Children.toArray(children).flatMap((child) => {
205
+ if (isValidElement(child)) {
206
+ const element = child as React.ReactElement<any>;
207
+
208
+ if (element.type === Fragment) {
209
+ return flattenChildren(element.props.children);
210
+ }
211
+ return element;
212
+ }
213
+ return [];
214
+ });
215
+ };
@@ -101,7 +101,6 @@ export const openRemoteLogFile = async (
101
101
  if (remoteZipFile.centralDirectory.has(sampleFile)) {
102
102
  return (await readJSONFile(sampleFile, MAX_BYTES)) as EvalSample;
103
103
  } else {
104
- console.log({ dir: remoteZipFile.centralDirectory });
105
104
  throw new Error(
106
105
  `Unable to read sample file ${sampleFile} - it is not present in the manifest.`,
107
106
  );