inspect-ai 0.3.63__py3-none-any.whl → 0.3.65__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/_cli/cache.py +8 -7
- inspect_ai/_cli/common.py +0 -12
- inspect_ai/_cli/eval.py +32 -4
- inspect_ai/_cli/info.py +1 -0
- inspect_ai/_cli/list.py +1 -1
- inspect_ai/_cli/log.py +2 -0
- inspect_ai/_cli/sandbox.py +4 -1
- inspect_ai/_cli/score.py +181 -32
- inspect_ai/_cli/trace.py +2 -0
- inspect_ai/_cli/view.py +4 -2
- inspect_ai/_display/core/config.py +7 -1
- inspect_ai/_display/core/progress.py +1 -1
- inspect_ai/_display/textual/app.py +8 -4
- inspect_ai/_display/textual/widgets/samples.py +6 -5
- inspect_ai/_display/textual/widgets/sandbox.py +6 -0
- inspect_ai/_eval/__init__.py +0 -0
- inspect_ai/_eval/eval.py +100 -97
- inspect_ai/_eval/evalset.py +69 -69
- inspect_ai/_eval/loader.py +122 -12
- inspect_ai/_eval/registry.py +1 -1
- inspect_ai/_eval/run.py +14 -0
- inspect_ai/_eval/score.py +125 -36
- inspect_ai/_eval/task/log.py +105 -4
- inspect_ai/_eval/task/results.py +92 -38
- inspect_ai/_eval/task/run.py +6 -2
- inspect_ai/_eval/task/sandbox.py +35 -2
- inspect_ai/_eval/task/task.py +49 -46
- inspect_ai/_util/__init__.py +0 -0
- inspect_ai/_util/constants.py +1 -1
- inspect_ai/_util/content.py +8 -0
- inspect_ai/_util/error.py +2 -0
- inspect_ai/_util/file.py +15 -1
- inspect_ai/_util/logger.py +4 -2
- inspect_ai/_util/registry.py +7 -1
- inspect_ai/_view/view.py +1 -2
- inspect_ai/_view/www/App.css +8 -3
- inspect_ai/_view/www/README.md +1 -1
- inspect_ai/_view/www/dist/assets/index.css +66 -38
- inspect_ai/_view/www/dist/assets/index.js +525 -523
- inspect_ai/_view/www/log-schema.json +86 -73
- inspect_ai/_view/www/package.json +1 -1
- inspect_ai/_view/www/src/App.tsx +1 -0
- inspect_ai/_view/www/src/components/AnsiDisplay.tsx +1 -1
- inspect_ai/_view/www/src/components/JsonPanel.tsx +1 -1
- inspect_ai/_view/www/src/components/LargeModal.tsx +39 -49
- inspect_ai/_view/www/src/components/NavPills.tsx +3 -1
- inspect_ai/_view/www/src/components/TabSet.tsx +19 -4
- inspect_ai/_view/www/src/logfile/remoteLogFile.ts +0 -1
- inspect_ai/_view/www/src/metadata/MetaDataGrid.tsx +1 -1
- inspect_ai/_view/www/src/metadata/MetaDataView.tsx +1 -1
- inspect_ai/_view/www/src/metadata/RenderedContent.tsx +6 -13
- inspect_ai/_view/www/src/plan/PlanDetailView.tsx +17 -2
- inspect_ai/_view/www/src/plan/SolverDetailView.tsx +1 -1
- inspect_ai/_view/www/src/samples/SampleDisplay.tsx +14 -5
- inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +4 -2
- inspect_ai/_view/www/src/samples/SamplesTools.tsx +16 -24
- inspect_ai/_view/www/src/samples/chat/ChatMessage.tsx +1 -1
- inspect_ai/_view/www/src/samples/chat/ChatView.tsx +1 -0
- inspect_ai/_view/www/src/samples/chat/MessageContent.tsx +27 -13
- inspect_ai/_view/www/src/samples/chat/MessageContents.tsx +19 -17
- inspect_ai/_view/www/src/samples/chat/tools/ToolCallView.tsx +12 -10
- inspect_ai/_view/www/src/samples/chat/tools/ToolInput.tsx +56 -66
- inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.tsx +12 -5
- inspect_ai/_view/www/src/samples/chat/tools/tool.ts +21 -36
- inspect_ai/_view/www/src/samples/descriptor/samplesDescriptor.tsx +3 -1
- inspect_ai/_view/www/src/samples/sample-tools/SelectScorer.tsx +27 -25
- inspect_ai/_view/www/src/samples/sample-tools/SortFilter.tsx +5 -1
- inspect_ai/_view/www/src/samples/scores/SampleScoreView.module.css +13 -13
- inspect_ai/_view/www/src/samples/transcript/InfoEventView.tsx +1 -1
- inspect_ai/_view/www/src/samples/transcript/ModelEventView.tsx +2 -2
- inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.tsx +9 -5
- inspect_ai/_view/www/src/samples/transcript/ScoreEventView.tsx +1 -1
- inspect_ai/_view/www/src/samples/transcript/ToolEventView.tsx +5 -4
- inspect_ai/_view/www/src/samples/transcript/event/EventNavs.tsx +1 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventPanel.tsx +1 -0
- inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.tsx +17 -6
- inspect_ai/_view/www/src/samples/transcript/state/StateEventView.tsx +14 -19
- inspect_ai/_view/www/src/types/log.d.ts +107 -19
- inspect_ai/_view/www/src/usage/ModelTokenTable.tsx +7 -1
- inspect_ai/_view/www/src/usage/ModelUsagePanel.tsx +5 -3
- inspect_ai/_view/www/src/workspace/WorkSpaceView.tsx +25 -27
- inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.tsx +12 -11
- inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.module.css +25 -2
- inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.tsx +60 -36
- inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.tsx +4 -0
- inspect_ai/_view/www/src/workspace/sidebar/SidebarScoreView.tsx +6 -4
- inspect_ai/_view/www/src/workspace/sidebar/SidebarScoresView.tsx +16 -14
- inspect_ai/_view/www/src/workspace/tabs/InfoTab.tsx +9 -19
- inspect_ai/_view/www/src/workspace/utils.ts +34 -0
- inspect_ai/approval/_approval.py +2 -0
- inspect_ai/approval/_approver.py +4 -4
- inspect_ai/approval/_auto.py +1 -1
- inspect_ai/approval/_human/approver.py +3 -0
- inspect_ai/approval/_policy.py +5 -0
- inspect_ai/approval/_registry.py +2 -2
- inspect_ai/dataset/_dataset.py +36 -45
- inspect_ai/dataset/_sources/__init__.py +0 -0
- inspect_ai/dataset/_sources/csv.py +13 -13
- inspect_ai/dataset/_sources/hf.py +29 -29
- inspect_ai/dataset/_sources/json.py +10 -10
- inspect_ai/log/__init__.py +2 -0
- inspect_ai/log/_convert.py +3 -3
- inspect_ai/log/_file.py +24 -9
- inspect_ai/log/_log.py +98 -7
- inspect_ai/log/_message.py +3 -1
- inspect_ai/log/_recorders/file.py +4 -0
- inspect_ai/log/_recorders/recorder.py +3 -0
- inspect_ai/log/_transcript.py +19 -8
- inspect_ai/model/__init__.py +2 -0
- inspect_ai/model/_cache.py +39 -21
- inspect_ai/model/_call_tools.py +2 -2
- inspect_ai/model/_chat_message.py +14 -4
- inspect_ai/model/_generate_config.py +1 -1
- inspect_ai/model/_model.py +31 -24
- inspect_ai/model/_model_output.py +14 -1
- inspect_ai/model/_openai.py +10 -18
- inspect_ai/model/_providers/google.py +9 -5
- inspect_ai/model/_providers/openai.py +5 -9
- inspect_ai/model/_providers/openrouter.py +1 -1
- inspect_ai/scorer/__init__.py +6 -1
- inspect_ai/scorer/_answer.py +1 -1
- inspect_ai/scorer/_classification.py +4 -0
- inspect_ai/scorer/_match.py +4 -5
- inspect_ai/scorer/_metric.py +87 -28
- inspect_ai/scorer/_metrics/__init__.py +3 -3
- inspect_ai/scorer/_metrics/accuracy.py +8 -10
- inspect_ai/scorer/_metrics/mean.py +3 -17
- inspect_ai/scorer/_metrics/std.py +111 -30
- inspect_ai/scorer/_model.py +12 -12
- inspect_ai/scorer/_pattern.py +3 -3
- inspect_ai/scorer/_reducer/reducer.py +36 -21
- inspect_ai/scorer/_reducer/registry.py +2 -2
- inspect_ai/scorer/_reducer/types.py +7 -1
- inspect_ai/scorer/_score.py +11 -1
- inspect_ai/scorer/_scorer.py +110 -16
- inspect_ai/solver/__init__.py +1 -1
- inspect_ai/solver/_basic_agent.py +19 -22
- inspect_ai/solver/_bridge/__init__.py +0 -3
- inspect_ai/solver/_bridge/bridge.py +3 -3
- inspect_ai/solver/_chain.py +1 -2
- inspect_ai/solver/_critique.py +3 -3
- inspect_ai/solver/_fork.py +2 -2
- inspect_ai/solver/_human_agent/__init__.py +0 -0
- inspect_ai/solver/_human_agent/agent.py +5 -8
- inspect_ai/solver/_human_agent/commands/clock.py +14 -10
- inspect_ai/solver/_human_agent/commands/note.py +1 -1
- inspect_ai/solver/_human_agent/commands/score.py +0 -11
- inspect_ai/solver/_multiple_choice.py +15 -18
- inspect_ai/solver/_prompt.py +7 -7
- inspect_ai/solver/_solver.py +53 -52
- inspect_ai/solver/_task_state.py +80 -69
- inspect_ai/solver/_use_tools.py +9 -9
- inspect_ai/tool/__init__.py +2 -1
- inspect_ai/tool/_tool.py +43 -14
- inspect_ai/tool/_tool_call.py +6 -2
- inspect_ai/tool/_tool_choice.py +3 -1
- inspect_ai/tool/_tool_def.py +10 -8
- inspect_ai/tool/_tool_params.py +24 -0
- inspect_ai/tool/_tool_with.py +7 -7
- inspect_ai/tool/_tools/__init__.py +0 -0
- inspect_ai/tool/_tools/_computer/_common.py +2 -2
- inspect_ai/tool/_tools/_computer/_computer.py +11 -0
- inspect_ai/tool/_tools/_execute.py +15 -9
- inspect_ai/tool/_tools/_web_browser/_resources/README.md +2 -2
- inspect_ai/tool/_tools/_web_browser/_web_browser.py +5 -3
- inspect_ai/tool/_tools/_web_search.py +7 -5
- inspect_ai/util/_concurrency.py +3 -3
- inspect_ai/util/_panel.py +2 -0
- inspect_ai/util/_resource.py +12 -12
- inspect_ai/util/_sandbox/docker/compose.py +23 -20
- inspect_ai/util/_sandbox/docker/config.py +2 -1
- inspect_ai/util/_sandbox/docker/docker.py +10 -1
- inspect_ai/util/_sandbox/docker/service.py +100 -0
- inspect_ai/util/_sandbox/environment.py +99 -96
- inspect_ai/util/_subprocess.py +5 -3
- inspect_ai/util/_subtask.py +15 -16
- {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/LICENSE +1 -1
- {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/METADATA +10 -6
- {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/RECORD +182 -175
- {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/WHEEL +0 -0
- {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/top_level.txt +0 -0
@@ -157,6 +157,7 @@
|
|
157
157
|
"type": "object"
|
158
158
|
},
|
159
159
|
"ChatCompletionChoice": {
|
160
|
+
"description": "Choice generated for completion.",
|
160
161
|
"properties": {
|
161
162
|
"message": {
|
162
163
|
"$ref": "#/$defs/ChatMessageAssistant"
|
@@ -196,7 +197,14 @@
|
|
196
197
|
"additionalProperties": false
|
197
198
|
},
|
198
199
|
"ChatMessageAssistant": {
|
200
|
+
"description": "Assistant chat message.",
|
199
201
|
"properties": {
|
202
|
+
"role": {
|
203
|
+
"const": "assistant",
|
204
|
+
"default": "assistant",
|
205
|
+
"title": "Role",
|
206
|
+
"type": "string"
|
207
|
+
},
|
200
208
|
"content": {
|
201
209
|
"anyOf": [
|
202
210
|
{
|
@@ -240,12 +248,6 @@
|
|
240
248
|
"default": null,
|
241
249
|
"title": "Source"
|
242
250
|
},
|
243
|
-
"role": {
|
244
|
-
"const": "assistant",
|
245
|
-
"default": "assistant",
|
246
|
-
"title": "Role",
|
247
|
-
"type": "string"
|
248
|
-
},
|
249
251
|
"tool_calls": {
|
250
252
|
"anyOf": [
|
251
253
|
{
|
@@ -275,9 +277,9 @@
|
|
275
277
|
}
|
276
278
|
},
|
277
279
|
"required": [
|
280
|
+
"role",
|
278
281
|
"content",
|
279
282
|
"source",
|
280
|
-
"role",
|
281
283
|
"tool_calls",
|
282
284
|
"reasoning"
|
283
285
|
],
|
@@ -286,7 +288,14 @@
|
|
286
288
|
"additionalProperties": false
|
287
289
|
},
|
288
290
|
"ChatMessageSystem": {
|
291
|
+
"description": "System chat message.",
|
289
292
|
"properties": {
|
293
|
+
"role": {
|
294
|
+
"const": "system",
|
295
|
+
"default": "system",
|
296
|
+
"title": "Role",
|
297
|
+
"type": "string"
|
298
|
+
},
|
290
299
|
"content": {
|
291
300
|
"anyOf": [
|
292
301
|
{
|
@@ -329,25 +338,26 @@
|
|
329
338
|
],
|
330
339
|
"default": null,
|
331
340
|
"title": "Source"
|
332
|
-
},
|
333
|
-
"role": {
|
334
|
-
"const": "system",
|
335
|
-
"default": "system",
|
336
|
-
"title": "Role",
|
337
|
-
"type": "string"
|
338
341
|
}
|
339
342
|
},
|
340
343
|
"required": [
|
344
|
+
"role",
|
341
345
|
"content",
|
342
|
-
"source"
|
343
|
-
"role"
|
346
|
+
"source"
|
344
347
|
],
|
345
348
|
"title": "ChatMessageSystem",
|
346
349
|
"type": "object",
|
347
350
|
"additionalProperties": false
|
348
351
|
},
|
349
352
|
"ChatMessageTool": {
|
353
|
+
"description": "Tool chat message.",
|
350
354
|
"properties": {
|
355
|
+
"role": {
|
356
|
+
"const": "tool",
|
357
|
+
"default": "tool",
|
358
|
+
"title": "Role",
|
359
|
+
"type": "string"
|
360
|
+
},
|
351
361
|
"content": {
|
352
362
|
"anyOf": [
|
353
363
|
{
|
@@ -391,12 +401,6 @@
|
|
391
401
|
"default": null,
|
392
402
|
"title": "Source"
|
393
403
|
},
|
394
|
-
"role": {
|
395
|
-
"const": "tool",
|
396
|
-
"default": "tool",
|
397
|
-
"title": "Role",
|
398
|
-
"type": "string"
|
399
|
-
},
|
400
404
|
"tool_call_id": {
|
401
405
|
"anyOf": [
|
402
406
|
{
|
@@ -434,9 +438,9 @@
|
|
434
438
|
}
|
435
439
|
},
|
436
440
|
"required": [
|
441
|
+
"role",
|
437
442
|
"content",
|
438
443
|
"source",
|
439
|
-
"role",
|
440
444
|
"tool_call_id",
|
441
445
|
"function",
|
442
446
|
"error"
|
@@ -446,7 +450,14 @@
|
|
446
450
|
"additionalProperties": false
|
447
451
|
},
|
448
452
|
"ChatMessageUser": {
|
453
|
+
"description": "User chat message.",
|
449
454
|
"properties": {
|
455
|
+
"role": {
|
456
|
+
"const": "user",
|
457
|
+
"default": "user",
|
458
|
+
"title": "Role",
|
459
|
+
"type": "string"
|
460
|
+
},
|
450
461
|
"content": {
|
451
462
|
"anyOf": [
|
452
463
|
{
|
@@ -490,12 +501,6 @@
|
|
490
501
|
"default": null,
|
491
502
|
"title": "Source"
|
492
503
|
},
|
493
|
-
"role": {
|
494
|
-
"const": "user",
|
495
|
-
"default": "user",
|
496
|
-
"title": "Role",
|
497
|
-
"type": "string"
|
498
|
-
},
|
499
504
|
"tool_call_id": {
|
500
505
|
"anyOf": [
|
501
506
|
{
|
@@ -513,9 +518,9 @@
|
|
513
518
|
}
|
514
519
|
},
|
515
520
|
"required": [
|
521
|
+
"role",
|
516
522
|
"content",
|
517
523
|
"source",
|
518
|
-
"role",
|
519
524
|
"tool_call_id"
|
520
525
|
],
|
521
526
|
"title": "ChatMessageUser",
|
@@ -523,6 +528,7 @@
|
|
523
528
|
"additionalProperties": false
|
524
529
|
},
|
525
530
|
"ContentAudio": {
|
531
|
+
"description": "Audio content.",
|
526
532
|
"properties": {
|
527
533
|
"type": {
|
528
534
|
"const": "audio",
|
@@ -553,6 +559,7 @@
|
|
553
559
|
"additionalProperties": false
|
554
560
|
},
|
555
561
|
"ContentImage": {
|
562
|
+
"description": "Image content.",
|
556
563
|
"properties": {
|
557
564
|
"type": {
|
558
565
|
"const": "image",
|
@@ -585,6 +592,7 @@
|
|
585
592
|
"additionalProperties": false
|
586
593
|
},
|
587
594
|
"ContentText": {
|
595
|
+
"description": "Text content.",
|
588
596
|
"properties": {
|
589
597
|
"type": {
|
590
598
|
"const": "text",
|
@@ -606,6 +614,7 @@
|
|
606
614
|
"additionalProperties": false
|
607
615
|
},
|
608
616
|
"ContentVideo": {
|
617
|
+
"description": "Video content.",
|
609
618
|
"properties": {
|
610
619
|
"type": {
|
611
620
|
"const": "video",
|
@@ -677,6 +686,7 @@
|
|
677
686
|
"additionalProperties": false
|
678
687
|
},
|
679
688
|
"EvalConfig": {
|
689
|
+
"description": "Configuration used for evaluation.",
|
680
690
|
"properties": {
|
681
691
|
"limit": {
|
682
692
|
"anyOf": [
|
@@ -954,6 +964,7 @@
|
|
954
964
|
"additionalProperties": false
|
955
965
|
},
|
956
966
|
"EvalDataset": {
|
967
|
+
"description": "Dataset used for evaluation.",
|
957
968
|
"properties": {
|
958
969
|
"name": {
|
959
970
|
"anyOf": [
|
@@ -1038,6 +1049,7 @@
|
|
1038
1049
|
"additionalProperties": false
|
1039
1050
|
},
|
1040
1051
|
"EvalError": {
|
1052
|
+
"description": "Eval error details.",
|
1041
1053
|
"properties": {
|
1042
1054
|
"message": {
|
1043
1055
|
"title": "Message",
|
@@ -1062,6 +1074,7 @@
|
|
1062
1074
|
"additionalProperties": false
|
1063
1075
|
},
|
1064
1076
|
"EvalMetric": {
|
1077
|
+
"description": "Metric for evaluation score.",
|
1065
1078
|
"properties": {
|
1066
1079
|
"name": {
|
1067
1080
|
"title": "Name",
|
@@ -1078,8 +1091,8 @@
|
|
1078
1091
|
],
|
1079
1092
|
"title": "Value"
|
1080
1093
|
},
|
1081
|
-
"
|
1082
|
-
"title": "
|
1094
|
+
"params": {
|
1095
|
+
"title": "Params",
|
1083
1096
|
"type": "object"
|
1084
1097
|
},
|
1085
1098
|
"metadata": {
|
@@ -1098,7 +1111,7 @@
|
|
1098
1111
|
"required": [
|
1099
1112
|
"name",
|
1100
1113
|
"value",
|
1101
|
-
"
|
1114
|
+
"params",
|
1102
1115
|
"metadata"
|
1103
1116
|
],
|
1104
1117
|
"title": "EvalMetric",
|
@@ -1106,6 +1119,7 @@
|
|
1106
1119
|
"additionalProperties": false
|
1107
1120
|
},
|
1108
1121
|
"EvalPlan": {
|
1122
|
+
"description": "Plan (solvers) used in evaluation.",
|
1109
1123
|
"properties": {
|
1110
1124
|
"name": {
|
1111
1125
|
"default": "plan",
|
@@ -1171,6 +1185,7 @@
|
|
1171
1185
|
"additionalProperties": false
|
1172
1186
|
},
|
1173
1187
|
"EvalPlanStep": {
|
1188
|
+
"description": "Solver step.",
|
1174
1189
|
"properties": {
|
1175
1190
|
"solver": {
|
1176
1191
|
"title": "Solver",
|
@@ -1190,6 +1205,7 @@
|
|
1190
1205
|
"additionalProperties": false
|
1191
1206
|
},
|
1192
1207
|
"EvalResults": {
|
1208
|
+
"description": "Scoring results from evaluation.",
|
1193
1209
|
"properties": {
|
1194
1210
|
"total_samples": {
|
1195
1211
|
"default": 0,
|
@@ -1233,6 +1249,7 @@
|
|
1233
1249
|
"additionalProperties": false
|
1234
1250
|
},
|
1235
1251
|
"EvalRevision": {
|
1252
|
+
"description": "Git revision for evaluation.",
|
1236
1253
|
"properties": {
|
1237
1254
|
"type": {
|
1238
1255
|
"const": "git",
|
@@ -1258,6 +1275,7 @@
|
|
1258
1275
|
"additionalProperties": false
|
1259
1276
|
},
|
1260
1277
|
"EvalSample": {
|
1278
|
+
"description": "Sample from evaluation task.",
|
1261
1279
|
"properties": {
|
1262
1280
|
"id": {
|
1263
1281
|
"anyOf": [
|
@@ -1526,6 +1544,7 @@
|
|
1526
1544
|
"additionalProperties": false
|
1527
1545
|
},
|
1528
1546
|
"EvalSampleLimit": {
|
1547
|
+
"description": "Limit encontered by sample.",
|
1529
1548
|
"properties": {
|
1530
1549
|
"type": {
|
1531
1550
|
"enum": [
|
@@ -1553,6 +1572,7 @@
|
|
1553
1572
|
"additionalProperties": false
|
1554
1573
|
},
|
1555
1574
|
"EvalSampleReductions": {
|
1575
|
+
"description": "Score reductions.",
|
1556
1576
|
"properties": {
|
1557
1577
|
"scorer": {
|
1558
1578
|
"title": "Scorer",
|
@@ -1588,6 +1608,7 @@
|
|
1588
1608
|
"additionalProperties": false
|
1589
1609
|
},
|
1590
1610
|
"EvalSampleScore": {
|
1611
|
+
"description": "Score and sample_id scored.",
|
1591
1612
|
"properties": {
|
1592
1613
|
"value": {
|
1593
1614
|
"anyOf": [
|
@@ -1711,6 +1732,7 @@
|
|
1711
1732
|
"additionalProperties": false
|
1712
1733
|
},
|
1713
1734
|
"EvalScore": {
|
1735
|
+
"description": "Score for evaluation task.",
|
1714
1736
|
"properties": {
|
1715
1737
|
"name": {
|
1716
1738
|
"title": "Name",
|
@@ -1769,6 +1791,7 @@
|
|
1769
1791
|
"additionalProperties": false
|
1770
1792
|
},
|
1771
1793
|
"EvalSpec": {
|
1794
|
+
"description": "Eval target and configuration.",
|
1772
1795
|
"properties": {
|
1773
1796
|
"run_id": {
|
1774
1797
|
"title": "Run Id",
|
@@ -1945,6 +1968,7 @@
|
|
1945
1968
|
"additionalProperties": false
|
1946
1969
|
},
|
1947
1970
|
"EvalStats": {
|
1971
|
+
"description": "Timing and usage statistics.",
|
1948
1972
|
"properties": {
|
1949
1973
|
"started_at": {
|
1950
1974
|
"title": "Started At",
|
@@ -1972,7 +1996,7 @@
|
|
1972
1996
|
"additionalProperties": false
|
1973
1997
|
},
|
1974
1998
|
"GenerateConfig": {
|
1975
|
-
"description": "
|
1999
|
+
"description": "Model generation options.",
|
1976
2000
|
"properties": {
|
1977
2001
|
"max_retries": {
|
1978
2002
|
"anyOf": [
|
@@ -2321,6 +2345,18 @@
|
|
2321
2345
|
"title": "Event",
|
2322
2346
|
"type": "string"
|
2323
2347
|
},
|
2348
|
+
"source": {
|
2349
|
+
"anyOf": [
|
2350
|
+
{
|
2351
|
+
"type": "string"
|
2352
|
+
},
|
2353
|
+
{
|
2354
|
+
"type": "null"
|
2355
|
+
}
|
2356
|
+
],
|
2357
|
+
"default": null,
|
2358
|
+
"title": "Source"
|
2359
|
+
},
|
2324
2360
|
"data": {
|
2325
2361
|
"$ref": "#/$defs/JsonValue"
|
2326
2362
|
}
|
@@ -2329,6 +2365,7 @@
|
|
2329
2365
|
"timestamp",
|
2330
2366
|
"pending",
|
2331
2367
|
"event",
|
2368
|
+
"source",
|
2332
2369
|
"data"
|
2333
2370
|
],
|
2334
2371
|
"title": "InfoEvent",
|
@@ -2474,6 +2511,7 @@
|
|
2474
2511
|
"additionalProperties": false
|
2475
2512
|
},
|
2476
2513
|
"LoggingMessage": {
|
2514
|
+
"description": "Message written to Python log.",
|
2477
2515
|
"properties": {
|
2478
2516
|
"name": {
|
2479
2517
|
"anyOf": [
|
@@ -2772,6 +2810,7 @@
|
|
2772
2810
|
"additionalProperties": false
|
2773
2811
|
},
|
2774
2812
|
"ModelOutput": {
|
2813
|
+
"description": "Output from model generation.",
|
2775
2814
|
"properties": {
|
2776
2815
|
"model": {
|
2777
2816
|
"title": "Model",
|
@@ -2846,6 +2885,7 @@
|
|
2846
2885
|
"additionalProperties": false
|
2847
2886
|
},
|
2848
2887
|
"ModelUsage": {
|
2888
|
+
"description": "Token usage for completion.",
|
2849
2889
|
"properties": {
|
2850
2890
|
"input_tokens": {
|
2851
2891
|
"default": 0,
|
@@ -2899,6 +2939,7 @@
|
|
2899
2939
|
"additionalProperties": false
|
2900
2940
|
},
|
2901
2941
|
"Sample": {
|
2942
|
+
"description": "Sample for an evaluation task.",
|
2902
2943
|
"properties": {
|
2903
2944
|
"input": {
|
2904
2945
|
"anyOf": [
|
@@ -3173,7 +3214,7 @@
|
|
3173
3214
|
"type": "array"
|
3174
3215
|
},
|
3175
3216
|
"Score": {
|
3176
|
-
"description": "Score generated by a scorer
|
3217
|
+
"description": "Score generated by a scorer.",
|
3177
3218
|
"properties": {
|
3178
3219
|
"value": {
|
3179
3220
|
"anyOf": [
|
@@ -3281,7 +3322,7 @@
|
|
3281
3322
|
"additionalProperties": false
|
3282
3323
|
},
|
3283
3324
|
"ScoreEvent": {
|
3284
|
-
"description": "Event with
|
3325
|
+
"description": "Event with score.\n\nCan be the final score for a `Sample`, or can be an intermediate score\nresulting from a call to `score`.",
|
3285
3326
|
"properties": {
|
3286
3327
|
"timestamp": {
|
3287
3328
|
"format": "date-time",
|
@@ -3326,6 +3367,11 @@
|
|
3326
3367
|
],
|
3327
3368
|
"default": null,
|
3328
3369
|
"title": "Target"
|
3370
|
+
},
|
3371
|
+
"intermediate": {
|
3372
|
+
"default": false,
|
3373
|
+
"title": "Intermediate",
|
3374
|
+
"type": "boolean"
|
3329
3375
|
}
|
3330
3376
|
},
|
3331
3377
|
"required": [
|
@@ -3333,7 +3379,8 @@
|
|
3333
3379
|
"pending",
|
3334
3380
|
"event",
|
3335
3381
|
"score",
|
3336
|
-
"target"
|
3382
|
+
"target",
|
3383
|
+
"intermediate"
|
3337
3384
|
],
|
3338
3385
|
"title": "ScoreEvent",
|
3339
3386
|
"type": "object",
|
@@ -4223,6 +4270,7 @@
|
|
4223
4270
|
"additionalProperties": false
|
4224
4271
|
}
|
4225
4272
|
},
|
4273
|
+
"description": "Evaluation log.",
|
4226
4274
|
"properties": {
|
4227
4275
|
"version": {
|
4228
4276
|
"default": 2,
|
@@ -4244,37 +4292,7 @@
|
|
4244
4292
|
"$ref": "#/$defs/EvalSpec"
|
4245
4293
|
},
|
4246
4294
|
"plan": {
|
4247
|
-
"$ref": "#/$defs/EvalPlan"
|
4248
|
-
"default": {
|
4249
|
-
"name": "plan",
|
4250
|
-
"steps": [],
|
4251
|
-
"finish": null,
|
4252
|
-
"config": {
|
4253
|
-
"best_of": null,
|
4254
|
-
"cache_prompt": null,
|
4255
|
-
"frequency_penalty": null,
|
4256
|
-
"internal_tools": null,
|
4257
|
-
"logit_bias": null,
|
4258
|
-
"logprobs": null,
|
4259
|
-
"max_connections": null,
|
4260
|
-
"max_retries": null,
|
4261
|
-
"max_tokens": null,
|
4262
|
-
"max_tool_output": null,
|
4263
|
-
"num_choices": null,
|
4264
|
-
"parallel_tool_calls": null,
|
4265
|
-
"presence_penalty": null,
|
4266
|
-
"reasoning_effort": null,
|
4267
|
-
"reasoning_history": null,
|
4268
|
-
"seed": null,
|
4269
|
-
"stop_seqs": null,
|
4270
|
-
"system_message": null,
|
4271
|
-
"temperature": null,
|
4272
|
-
"timeout": null,
|
4273
|
-
"top_k": null,
|
4274
|
-
"top_logprobs": null,
|
4275
|
-
"top_p": null
|
4276
|
-
}
|
4277
|
-
}
|
4295
|
+
"$ref": "#/$defs/EvalPlan"
|
4278
4296
|
},
|
4279
4297
|
"results": {
|
4280
4298
|
"anyOf": [
|
@@ -4288,12 +4306,7 @@
|
|
4288
4306
|
"default": null
|
4289
4307
|
},
|
4290
4308
|
"stats": {
|
4291
|
-
"$ref": "#/$defs/EvalStats"
|
4292
|
-
"default": {
|
4293
|
-
"started_at": "",
|
4294
|
-
"completed_at": "",
|
4295
|
-
"model_usage": {}
|
4296
|
-
}
|
4309
|
+
"$ref": "#/$defs/EvalStats"
|
4297
4310
|
},
|
4298
4311
|
"error": {
|
4299
4312
|
"anyOf": [
|
@@ -8,7 +8,7 @@
|
|
8
8
|
"scripts": {
|
9
9
|
"build": "vite build",
|
10
10
|
"watch": "vite build --watch",
|
11
|
-
"dev-watch": "vite build --mode development --watch",
|
11
|
+
"dev-watch": "NODE_ENV=development vite build --mode development --watch",
|
12
12
|
"dev": "vite",
|
13
13
|
"prettier:check": "prettier --check src",
|
14
14
|
"prettier:write": "prettier --write src",
|
inspect_ai/_view/www/src/App.tsx
CHANGED
@@ -990,6 +990,7 @@ const defaultScorers = (log: EvalSummary): Array<ScorerInfo> => {
|
|
990
990
|
}, [] as Array<ScorerInfo>);
|
991
991
|
} else if (log.sampleSummaries && log.sampleSummaries.length > 0) {
|
992
992
|
const scores = log.sampleSummaries[0].scores;
|
993
|
+
|
993
994
|
if (scores !== null) {
|
994
995
|
return Object.keys(scores).map((key) => {
|
995
996
|
return {
|
@@ -78,54 +78,6 @@ export const LargeModal: React.FC<LargeModalProps> = ({
|
|
78
78
|
[setInitialScrollPosition],
|
79
79
|
);
|
80
80
|
|
81
|
-
// Capture header elements
|
82
|
-
const headerEls = [];
|
83
|
-
// The title
|
84
|
-
headerEls.push(
|
85
|
-
<div className={clsx("modal-title", "text-size-smaller", styles.title)}>
|
86
|
-
{title || ""}
|
87
|
-
</div>,
|
88
|
-
);
|
89
|
-
|
90
|
-
// A centered text element with tools to the left and right
|
91
|
-
if (detail) {
|
92
|
-
headerEls.push(
|
93
|
-
<div className={styles.detail}>
|
94
|
-
{detailTools?.left
|
95
|
-
? detailTools.left.map((tool) => {
|
96
|
-
return <TitleTool {...tool} />;
|
97
|
-
})
|
98
|
-
: ""}
|
99
|
-
<div className={clsx("text-size-smaller", styles.detailText)}>
|
100
|
-
<div>{detail}</div>
|
101
|
-
</div>
|
102
|
-
|
103
|
-
{detailTools?.right
|
104
|
-
? detailTools.right.map((tool) => {
|
105
|
-
return <TitleTool {...tool} />;
|
106
|
-
})
|
107
|
-
: ""}
|
108
|
-
</div>,
|
109
|
-
);
|
110
|
-
}
|
111
|
-
|
112
|
-
// The close 'x'
|
113
|
-
headerEls.push(
|
114
|
-
<button
|
115
|
-
type="button"
|
116
|
-
className={clsx(
|
117
|
-
"btn",
|
118
|
-
"btn-close-large-dialog",
|
119
|
-
"text-size-larger",
|
120
|
-
styles.close,
|
121
|
-
)}
|
122
|
-
onClick={onHide}
|
123
|
-
aria-label="Close"
|
124
|
-
>
|
125
|
-
<HtmlEntity html={"×"} />
|
126
|
-
</button>,
|
127
|
-
);
|
128
|
-
|
129
81
|
return (
|
130
82
|
<div
|
131
83
|
id={id}
|
@@ -147,7 +99,45 @@ export const LargeModal: React.FC<LargeModalProps> = ({
|
|
147
99
|
role="document"
|
148
100
|
>
|
149
101
|
<div className={clsx("modal-content", styles.content)}>
|
150
|
-
<div className={clsx("modal-header", styles.header)}>
|
102
|
+
<div className={clsx("modal-header", styles.header)}>
|
103
|
+
<div
|
104
|
+
className={clsx("modal-title", "text-size-smaller", styles.title)}
|
105
|
+
>
|
106
|
+
{title || ""}
|
107
|
+
</div>
|
108
|
+
|
109
|
+
{detail ? (
|
110
|
+
<div className={styles.detail}>
|
111
|
+
{detailTools?.left
|
112
|
+
? detailTools.left.map((tool, idx) => {
|
113
|
+
return <TitleTool key={`tool-left-${idx}`} {...tool} />;
|
114
|
+
})
|
115
|
+
: ""}
|
116
|
+
<div className={clsx("text-size-smaller", styles.detailText)}>
|
117
|
+
<div>{detail}</div>
|
118
|
+
</div>
|
119
|
+
|
120
|
+
{detailTools?.right
|
121
|
+
? detailTools.right.map((tool, idx) => {
|
122
|
+
return <TitleTool key={`tool-right-${idx}`} {...tool} />;
|
123
|
+
})
|
124
|
+
: ""}
|
125
|
+
</div>
|
126
|
+
) : undefined}
|
127
|
+
<button
|
128
|
+
type="button"
|
129
|
+
className={clsx(
|
130
|
+
"btn",
|
131
|
+
"btn-close-large-dialog",
|
132
|
+
"text-size-larger",
|
133
|
+
styles.close,
|
134
|
+
)}
|
135
|
+
onClick={onHide}
|
136
|
+
aria-label="Close"
|
137
|
+
>
|
138
|
+
<HtmlEntity html={"×"} />
|
139
|
+
</button>
|
140
|
+
</div>
|
151
141
|
<ProgressBar animating={showProgress} />
|
152
142
|
<div className={"modal-body"} ref={scrollRef} onScroll={onScroll}>
|
153
143
|
{children}
|
@@ -26,6 +26,7 @@ export const NavPills: React.FC<NavPillsProps> = ({ children }) => {
|
|
26
26
|
: `Tab ${idx}`;
|
27
27
|
return (
|
28
28
|
<NavPill
|
29
|
+
key={`nav-pill-contents-${idx}`}
|
29
30
|
title={title}
|
30
31
|
activeItem={activeItem}
|
31
32
|
setActiveItem={setActiveItem}
|
@@ -34,9 +35,10 @@ export const NavPills: React.FC<NavPillsProps> = ({ children }) => {
|
|
34
35
|
});
|
35
36
|
|
36
37
|
// Wrap each of the children in a 'body' to control its visibility
|
37
|
-
const navBodies = children.map((child) => {
|
38
|
+
const navBodies = children.map((child, idx) => {
|
38
39
|
return (
|
39
40
|
<div
|
41
|
+
key={`nav-pill-container-${idx}`}
|
40
42
|
className={
|
41
43
|
child["props"]?.title === activeItem ? styles.visible : styles.hidden
|
42
44
|
}
|
@@ -1,6 +1,8 @@
|
|
1
1
|
import clsx from "clsx";
|
2
2
|
import {
|
3
|
+
Children,
|
3
4
|
Fragment,
|
5
|
+
isValidElement,
|
4
6
|
MouseEvent,
|
5
7
|
ReactElement,
|
6
8
|
useCallback,
|
@@ -46,10 +48,7 @@ export const TabSet: React.FC<TabSetProps> = ({
|
|
46
48
|
tools,
|
47
49
|
children,
|
48
50
|
}) => {
|
49
|
-
const validTabs = Array.isArray(children)
|
50
|
-
? (children.filter(Boolean) as ReactElement<TabPanelProps>[])
|
51
|
-
: [children];
|
52
|
-
|
51
|
+
const validTabs = flattenChildren(children);
|
53
52
|
if (validTabs.length === 0) return null;
|
54
53
|
|
55
54
|
return (
|
@@ -198,3 +197,19 @@ const TabTools: React.FC<{ tools?: React.ReactNode }> = ({ tools }) => (
|
|
198
197
|
// Utility functions
|
199
198
|
const computeTabId = (id: string, index: number) => `${id}-${index}`;
|
200
199
|
const computeTabContentsId = (id: string) => `${id}-contents`;
|
200
|
+
|
201
|
+
const flattenChildren = (
|
202
|
+
children: React.ReactNode,
|
203
|
+
): ReactElement<TabPanelProps>[] => {
|
204
|
+
return Children.toArray(children).flatMap((child) => {
|
205
|
+
if (isValidElement(child)) {
|
206
|
+
const element = child as React.ReactElement<any>;
|
207
|
+
|
208
|
+
if (element.type === Fragment) {
|
209
|
+
return flattenChildren(element.props.children);
|
210
|
+
}
|
211
|
+
return element;
|
212
|
+
}
|
213
|
+
return [];
|
214
|
+
});
|
215
|
+
};
|
@@ -101,7 +101,6 @@ export const openRemoteLogFile = async (
|
|
101
101
|
if (remoteZipFile.centralDirectory.has(sampleFile)) {
|
102
102
|
return (await readJSONFile(sampleFile, MAX_BYTES)) as EvalSample;
|
103
103
|
} else {
|
104
|
-
console.log({ dir: remoteZipFile.centralDirectory });
|
105
104
|
throw new Error(
|
106
105
|
`Unable to read sample file ${sampleFile} - it is not present in the manifest.`,
|
107
106
|
);
|