inspect-ai 0.3.92__py3-none-any.whl → 0.3.94__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/_cli/eval.py +27 -0
- inspect_ai/_display/textual/widgets/samples.py +3 -3
- inspect_ai/_display/textual/widgets/transcript.py +3 -29
- inspect_ai/_eval/eval.py +19 -2
- inspect_ai/_eval/evalset.py +4 -1
- inspect_ai/_eval/run.py +41 -0
- inspect_ai/_eval/task/generate.py +38 -44
- inspect_ai/_eval/task/log.py +26 -28
- inspect_ai/_eval/task/run.py +23 -27
- inspect_ai/_util/answer.py +26 -0
- inspect_ai/_util/constants.py +0 -1
- inspect_ai/_util/local_server.py +398 -0
- inspect_ai/_util/working.py +10 -4
- inspect_ai/_view/www/dist/assets/index.css +173 -159
- inspect_ai/_view/www/dist/assets/index.js +1417 -1142
- inspect_ai/_view/www/log-schema.json +379 -3
- inspect_ai/_view/www/package.json +1 -1
- inspect_ai/_view/www/src/@types/log.d.ts +93 -14
- inspect_ai/_view/www/src/app/content/MetaDataGrid.tsx +2 -2
- inspect_ai/_view/www/src/app/content/MetaDataView.module.css +1 -1
- inspect_ai/_view/www/src/app/content/MetadataGrid.module.css +1 -1
- inspect_ai/_view/www/src/app/content/RenderedContent.tsx +1 -1
- inspect_ai/_view/www/src/app/log-view/LogView.tsx +11 -0
- inspect_ai/_view/www/src/app/log-view/tabs/InfoTab.tsx +2 -9
- inspect_ai/_view/www/src/app/log-view/tabs/ModelsTab.tsx +51 -0
- inspect_ai/_view/www/src/app/log-view/tabs/TaskTab.module.css +6 -0
- inspect_ai/_view/www/src/app/log-view/tabs/TaskTab.tsx +143 -0
- inspect_ai/_view/www/src/app/plan/ModelCard.tsx +1 -2
- inspect_ai/_view/www/src/app/plan/PlanCard.tsx +29 -7
- inspect_ai/_view/www/src/app/plan/PlanDetailView.module.css +1 -1
- inspect_ai/_view/www/src/app/plan/PlanDetailView.tsx +1 -198
- inspect_ai/_view/www/src/app/samples/descriptor/score/NumericScoreDescriptor.tsx +2 -1
- inspect_ai/_view/www/src/app/samples/transcript/SandboxEventView.module.css +2 -1
- inspect_ai/_view/www/src/app/samples/transcript/SpanEventView.tsx +174 -0
- inspect_ai/_view/www/src/app/samples/transcript/ToolEventView.tsx +8 -8
- inspect_ai/_view/www/src/app/samples/transcript/TranscriptView.tsx +12 -2
- inspect_ai/_view/www/src/app/samples/transcript/TranscriptVirtualListComponent.module.css +1 -1
- inspect_ai/_view/www/src/app/samples/transcript/event/EventPanel.tsx +0 -3
- inspect_ai/_view/www/src/app/samples/transcript/transform/fixups.ts +87 -25
- inspect_ai/_view/www/src/app/samples/transcript/transform/treeify.ts +229 -17
- inspect_ai/_view/www/src/app/samples/transcript/transform/utils.ts +11 -0
- inspect_ai/_view/www/src/app/samples/transcript/types.ts +5 -1
- inspect_ai/_view/www/src/app/usage/ModelUsagePanel.tsx +3 -2
- inspect_ai/_view/www/src/app/usage/TokenTable.module.css +4 -1
- inspect_ai/_view/www/src/app/usage/TokenTable.tsx +2 -2
- inspect_ai/_view/www/src/app/usage/UsageCard.module.css +8 -3
- inspect_ai/_view/www/src/app/usage/UsageCard.tsx +1 -35
- inspect_ai/_view/www/src/components/Card.css +0 -1
- inspect_ai/_view/www/src/constants.ts +2 -0
- inspect_ai/_view/www/src/utils/numeric.ts +17 -0
- inspect_ai/agent/_agent.py +3 -3
- inspect_ai/agent/_as_solver.py +22 -12
- inspect_ai/agent/_as_tool.py +20 -6
- inspect_ai/agent/_handoff.py +12 -1
- inspect_ai/agent/_react.py +4 -3
- inspect_ai/agent/_run.py +16 -3
- inspect_ai/agent/_types.py +9 -0
- inspect_ai/dataset/_dataset.py +6 -3
- inspect_ai/log/__init__.py +14 -0
- inspect_ai/log/_convert.py +4 -9
- inspect_ai/log/_file.py +56 -0
- inspect_ai/log/_log.py +99 -0
- inspect_ai/log/_recorders/__init__.py +2 -0
- inspect_ai/log/_recorders/buffer/database.py +12 -11
- inspect_ai/log/_recorders/buffer/filestore.py +2 -2
- inspect_ai/log/_recorders/buffer/types.py +2 -2
- inspect_ai/log/_recorders/eval.py +20 -65
- inspect_ai/log/_recorders/file.py +28 -6
- inspect_ai/log/_recorders/recorder.py +7 -0
- inspect_ai/log/_recorders/types.py +1 -23
- inspect_ai/log/_samples.py +14 -25
- inspect_ai/log/_transcript.py +84 -36
- inspect_ai/log/_tree.py +118 -0
- inspect_ai/log/_util.py +52 -0
- inspect_ai/model/__init__.py +5 -1
- inspect_ai/model/_call_tools.py +72 -44
- inspect_ai/model/_generate_config.py +14 -8
- inspect_ai/model/_model.py +66 -88
- inspect_ai/model/_model_output.py +25 -0
- inspect_ai/model/_openai.py +2 -0
- inspect_ai/model/_providers/anthropic.py +13 -23
- inspect_ai/model/_providers/hf.py +27 -1
- inspect_ai/model/_providers/openai_o1.py +8 -2
- inspect_ai/model/_providers/providers.py +18 -4
- inspect_ai/model/_providers/sglang.py +247 -0
- inspect_ai/model/_providers/vllm.py +211 -400
- inspect_ai/scorer/_choice.py +1 -2
- inspect_ai/solver/__init__.py +7 -2
- inspect_ai/solver/_basic_agent.py +3 -10
- inspect_ai/solver/_chain.py +1 -1
- inspect_ai/solver/_fork.py +1 -1
- inspect_ai/solver/_multiple_choice.py +5 -22
- inspect_ai/solver/_plan.py +2 -2
- inspect_ai/solver/_task_state.py +26 -88
- inspect_ai/solver/_transcript.py +6 -7
- inspect_ai/tool/_json_rpc_helpers.py +45 -17
- inspect_ai/tool/_mcp/_mcp.py +8 -5
- inspect_ai/tool/_mcp/_sandbox.py +8 -2
- inspect_ai/tool/_mcp/server.py +3 -1
- inspect_ai/tool/_tool_call.py +4 -1
- inspect_ai/tool/_tool_support_helpers.py +51 -12
- inspect_ai/tool/_tools/_bash_session.py +190 -68
- inspect_ai/tool/_tools/_computer/_computer.py +25 -1
- inspect_ai/tool/_tools/_execute.py +4 -1
- inspect_ai/tool/_tools/_text_editor.py +4 -3
- inspect_ai/tool/_tools/_web_browser/_web_browser.py +10 -3
- inspect_ai/util/__init__.py +16 -0
- inspect_ai/util/_anyio.py +11 -0
- inspect_ai/util/_collect.py +50 -0
- inspect_ai/util/_limit.py +393 -0
- inspect_ai/util/_limited_conversation.py +57 -0
- inspect_ai/util/_span.py +58 -0
- inspect_ai/util/_subtask.py +27 -42
- {inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/METADATA +1 -1
- {inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/RECORD +120 -134
- {inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/WHEEL +1 -1
- inspect_ai/_display/core/group.py +0 -79
- inspect_ai/solver/_limit.py +0 -39
- inspect_ai/tool/_tools/_computer/_resources/Dockerfile +0 -102
- inspect_ai/tool/_tools/_computer/_resources/README.md +0 -30
- inspect_ai/tool/_tools/_computer/_resources/entrypoint/entrypoint.sh +0 -18
- inspect_ai/tool/_tools/_computer/_resources/entrypoint/novnc_startup.sh +0 -20
- inspect_ai/tool/_tools/_computer/_resources/entrypoint/x11vnc_startup.sh +0 -48
- inspect_ai/tool/_tools/_computer/_resources/entrypoint/xfce_startup.sh +0 -13
- inspect_ai/tool/_tools/_computer/_resources/entrypoint/xvfb_startup.sh +0 -48
- inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/Code/User/globalStorage/state.vscdb +0 -0
- inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/Code/User/settings.json +0 -9
- inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-panel.xml +0 -61
- inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-screensaver.xml +0 -10
- inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfwm4.xml +0 -91
- inspect_ai/tool/_tools/_computer/_resources/image_home_dir/Desktop/Firefox Web Browser.desktop +0 -10
- inspect_ai/tool/_tools/_computer/_resources/image_home_dir/Desktop/Terminal.desktop +0 -10
- inspect_ai/tool/_tools/_computer/_resources/image_home_dir/Desktop/Visual Studio Code.desktop +0 -10
- inspect_ai/tool/_tools/_computer/_resources/tool/.pylintrc +0 -8
- inspect_ai/tool/_tools/_computer/_resources/tool/.vscode/settings.json +0 -12
- inspect_ai/tool/_tools/_computer/_resources/tool/_args.py +0 -78
- inspect_ai/tool/_tools/_computer/_resources/tool/_constants.py +0 -22
- inspect_ai/tool/_tools/_computer/_resources/tool/_logger.py +0 -22
- inspect_ai/tool/_tools/_computer/_resources/tool/_run.py +0 -42
- inspect_ai/tool/_tools/_computer/_resources/tool/_tool_result.py +0 -33
- inspect_ai/tool/_tools/_computer/_resources/tool/_x11_client.py +0 -341
- inspect_ai/tool/_tools/_computer/_resources/tool/computer_tool.py +0 -141
- inspect_ai/tool/_tools/_computer/_resources/tool/pyproject.toml +0 -65
- inspect_ai/tool/_tools/_computer/_resources/tool/requirements.txt +0 -0
- inspect_ai/tool/_tools/_computer/test_args.py +0 -151
- /inspect_ai/{tool/_tools/_computer/_resources/tool/__init__.py → _view/www/src/app/log-view/tabs/ModelsTab.module.css} +0 -0
- {inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/licenses/LICENSE +0 -0
- {inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/top_level.txt +0 -0
@@ -263,9 +263,10 @@ export type Type9 =
|
|
263
263
|
| "permission"
|
264
264
|
| "file_not_found"
|
265
265
|
| "is_a_directory"
|
266
|
-
| "output_limit"
|
266
|
+
| "limit"
|
267
267
|
| "approval"
|
268
|
-
| "unknown";
|
268
|
+
| "unknown"
|
269
|
+
| "output_limit";
|
269
270
|
export type Message1 = string;
|
270
271
|
export type Choices = string[] | null;
|
271
272
|
export type Target = string | string[];
|
@@ -315,6 +316,7 @@ export type Explanation = string | null;
|
|
315
316
|
export type Metadata6 = {
|
316
317
|
[k: string]: unknown;
|
317
318
|
} | null;
|
319
|
+
export type SpanId = string | null;
|
318
320
|
export type Timestamp = string;
|
319
321
|
export type WorkingStart = number;
|
320
322
|
export type Pending = boolean | null;
|
@@ -338,6 +340,7 @@ export type Files1 = {
|
|
338
340
|
} | null;
|
339
341
|
export type Setup1 = string | null;
|
340
342
|
export type JsonValue = unknown;
|
343
|
+
export type SpanId1 = string | null;
|
341
344
|
export type Timestamp1 = string;
|
342
345
|
export type WorkingStart1 = number;
|
343
346
|
export type Pending1 = boolean | null;
|
@@ -351,6 +354,7 @@ export type Type10 =
|
|
351
354
|
| "custom";
|
352
355
|
export type Message2 = string;
|
353
356
|
export type Limit1 = number | null;
|
357
|
+
export type SpanId2 = string | null;
|
354
358
|
export type Timestamp2 = string;
|
355
359
|
export type WorkingStart2 = number;
|
356
360
|
export type Pending2 = boolean | null;
|
@@ -365,6 +369,7 @@ export type Input2 = string | null;
|
|
365
369
|
export type Result = number | null;
|
366
370
|
export type Output = string | null;
|
367
371
|
export type Completed = string | null;
|
372
|
+
export type SpanId3 = string | null;
|
368
373
|
export type Timestamp3 = string;
|
369
374
|
export type WorkingStart3 = number;
|
370
375
|
export type Pending3 = boolean | null;
|
@@ -373,11 +378,13 @@ export type Op = "remove" | "add" | "replace" | "move" | "test" | "copy";
|
|
373
378
|
export type Path = string;
|
374
379
|
export type From = string | null;
|
375
380
|
export type Changes = JsonChange[];
|
381
|
+
export type SpanId4 = string | null;
|
376
382
|
export type Timestamp4 = string;
|
377
383
|
export type WorkingStart4 = number;
|
378
384
|
export type Pending4 = boolean | null;
|
379
385
|
export type Event4 = "store";
|
380
386
|
export type Changes1 = JsonChange[];
|
387
|
+
export type SpanId5 = string | null;
|
381
388
|
export type Timestamp5 = string;
|
382
389
|
export type WorkingStart5 = number;
|
383
390
|
export type Pending5 = boolean | null;
|
@@ -398,11 +405,13 @@ export type Additionalproperties1 = boolean;
|
|
398
405
|
export type Tools1 = ToolInfo[];
|
399
406
|
export type ToolChoice = ("auto" | "any" | "none") | ToolFunction;
|
400
407
|
export type Name9 = string;
|
408
|
+
export type Retries = number | null;
|
401
409
|
export type Error1 = string | null;
|
402
410
|
export type Cache = ("read" | "write") | null;
|
403
411
|
export type Time1 = number | null;
|
404
412
|
export type Completed1 = string | null;
|
405
413
|
export type WorkingTime = number | null;
|
414
|
+
export type SpanId6 = string | null;
|
406
415
|
export type Timestamp6 = string;
|
407
416
|
export type WorkingStart6 = number;
|
408
417
|
export type Pending6 = boolean | null;
|
@@ -427,6 +436,7 @@ export type Result1 =
|
|
427
436
|
| ContentVideo
|
428
437
|
)[];
|
429
438
|
export type Truncated = [unknown, unknown] | null;
|
439
|
+
export type SpanId7 = string | null;
|
430
440
|
export type Timestamp7 = string;
|
431
441
|
export type WorkingStart7 = number;
|
432
442
|
export type Pending7 = boolean | null;
|
@@ -440,22 +450,26 @@ export type Decision =
|
|
440
450
|
| "escalate"
|
441
451
|
| "terminate";
|
442
452
|
export type Explanation1 = string | null;
|
453
|
+
export type SpanId8 = string | null;
|
443
454
|
export type Timestamp8 = string;
|
444
455
|
export type WorkingStart8 = number;
|
445
456
|
export type Pending8 = boolean | null;
|
446
457
|
export type Event8 = "input";
|
447
458
|
export type Input4 = string;
|
448
459
|
export type InputAnsi = string;
|
460
|
+
export type SpanId9 = string | null;
|
449
461
|
export type Timestamp9 = string;
|
450
462
|
export type WorkingStart9 = number;
|
451
463
|
export type Pending9 = boolean | null;
|
452
464
|
export type Event9 = "score";
|
453
465
|
export type Target2 = string | string[] | null;
|
454
466
|
export type Intermediate = boolean;
|
467
|
+
export type SpanId10 = string | null;
|
455
468
|
export type Timestamp10 = string;
|
456
469
|
export type WorkingStart10 = number;
|
457
470
|
export type Pending10 = boolean | null;
|
458
471
|
export type Event10 = "error";
|
472
|
+
export type SpanId11 = string | null;
|
459
473
|
export type Timestamp11 = string;
|
460
474
|
export type WorkingStart11 = number;
|
461
475
|
export type Pending11 = boolean | null;
|
@@ -475,24 +489,42 @@ export type Created1 = number;
|
|
475
489
|
export type Filename = string;
|
476
490
|
export type Module = string;
|
477
491
|
export type Lineno = number;
|
492
|
+
export type SpanId12 = string | null;
|
478
493
|
export type Timestamp12 = string;
|
479
494
|
export type WorkingStart12 = number;
|
480
495
|
export type Pending12 = boolean | null;
|
481
496
|
export type Event12 = "info";
|
482
497
|
export type Source4 = string | null;
|
498
|
+
export type SpanId13 = string | null;
|
483
499
|
export type Timestamp13 = string;
|
484
500
|
export type WorkingStart13 = number;
|
485
501
|
export type Pending13 = boolean | null;
|
486
|
-
export type Event13 = "step";
|
487
|
-
export type Action1 = "begin" | "end";
|
502
|
+
export type Event13 = "span_begin";
|
503
|
+
export type Id8 = string;
|
504
|
+
export type ParentId = string | null;
|
488
505
|
export type Type13 = string | null;
|
489
506
|
export type Name11 = string;
|
507
|
+
export type SpanId14 = string | null;
|
490
508
|
export type Timestamp14 = string;
|
491
509
|
export type WorkingStart14 = number;
|
492
510
|
export type Pending14 = boolean | null;
|
493
|
-
export type Event14 = "subtask";
|
494
|
-
export type Name12 = string;
|
511
|
+
export type Event14 = "span_end";
|
512
|
+
export type Id9 = string;
|
513
|
+
export type SpanId15 = string | null;
|
514
|
+
export type Timestamp15 = string;
|
515
|
+
export type WorkingStart15 = number;
|
516
|
+
export type Pending15 = boolean | null;
|
517
|
+
export type Event15 = "step";
|
518
|
+
export type Action1 = "begin" | "end";
|
495
519
|
export type Type14 = string | null;
|
520
|
+
export type Name12 = string;
|
521
|
+
export type SpanId16 = string | null;
|
522
|
+
export type Timestamp16 = string;
|
523
|
+
export type WorkingStart16 = number;
|
524
|
+
export type Pending16 = boolean | null;
|
525
|
+
export type Event16 = "subtask";
|
526
|
+
export type Name13 = string;
|
527
|
+
export type Type15 = string | null;
|
496
528
|
export type Events2 = (
|
497
529
|
| SampleInitEvent
|
498
530
|
| SampleLimitEvent
|
@@ -507,6 +539,8 @@ export type Events2 = (
|
|
507
539
|
| ErrorEvent
|
508
540
|
| LoggerEvent
|
509
541
|
| InfoEvent
|
542
|
+
| SpanBeginEvent
|
543
|
+
| SpanEndEvent
|
510
544
|
| StepEvent
|
511
545
|
| SubtaskEvent
|
512
546
|
)[];
|
@@ -526,6 +560,8 @@ export type Events1 = (
|
|
526
560
|
| ErrorEvent
|
527
561
|
| LoggerEvent
|
528
562
|
| InfoEvent
|
563
|
+
| SpanBeginEvent
|
564
|
+
| SpanEndEvent
|
529
565
|
| StepEvent
|
530
566
|
| SubtaskEvent
|
531
567
|
)[];
|
@@ -547,6 +583,8 @@ export type Events = (
|
|
547
583
|
| ErrorEvent
|
548
584
|
| LoggerEvent
|
549
585
|
| InfoEvent
|
586
|
+
| SpanBeginEvent
|
587
|
+
| SpanEndEvent
|
550
588
|
| StepEvent
|
551
589
|
| SubtaskEvent
|
552
590
|
)[];
|
@@ -554,7 +592,7 @@ export type TotalTime = number | null;
|
|
554
592
|
export type WorkingTime3 = number | null;
|
555
593
|
export type Uuid = string | null;
|
556
594
|
export type ErrorRetries = EvalError[] | null;
|
557
|
-
export type Type15 =
|
595
|
+
export type Type16 =
|
558
596
|
| "context"
|
559
597
|
| "time"
|
560
598
|
| "working"
|
@@ -1120,6 +1158,7 @@ export interface Store {
|
|
1120
1158
|
* Beginning of processing a Sample.
|
1121
1159
|
*/
|
1122
1160
|
export interface SampleInitEvent {
|
1161
|
+
span_id: SpanId;
|
1123
1162
|
timestamp: Timestamp;
|
1124
1163
|
working_start: WorkingStart;
|
1125
1164
|
pending: Pending;
|
@@ -1144,6 +1183,7 @@ export interface Sample {
|
|
1144
1183
|
* The sample was unable to finish processing due to a limit
|
1145
1184
|
*/
|
1146
1185
|
export interface SampleLimitEvent {
|
1186
|
+
span_id: SpanId1;
|
1147
1187
|
timestamp: Timestamp1;
|
1148
1188
|
working_start: WorkingStart1;
|
1149
1189
|
pending: Pending1;
|
@@ -1156,6 +1196,7 @@ export interface SampleLimitEvent {
|
|
1156
1196
|
* Sandbox execution or I/O
|
1157
1197
|
*/
|
1158
1198
|
export interface SandboxEvent {
|
1199
|
+
span_id: SpanId2;
|
1159
1200
|
timestamp: Timestamp2;
|
1160
1201
|
working_start: WorkingStart2;
|
1161
1202
|
pending: Pending2;
|
@@ -1173,6 +1214,7 @@ export interface SandboxEvent {
|
|
1173
1214
|
* Change to the current `TaskState`
|
1174
1215
|
*/
|
1175
1216
|
export interface StateEvent {
|
1217
|
+
span_id: SpanId3;
|
1176
1218
|
timestamp: Timestamp3;
|
1177
1219
|
working_start: WorkingStart3;
|
1178
1220
|
pending: Pending3;
|
@@ -1197,6 +1239,7 @@ export interface JsonChange {
|
|
1197
1239
|
* Change to data within the current `Store`.
|
1198
1240
|
*/
|
1199
1241
|
export interface StoreEvent {
|
1242
|
+
span_id: SpanId4;
|
1200
1243
|
timestamp: Timestamp4;
|
1201
1244
|
working_start: WorkingStart4;
|
1202
1245
|
pending: Pending4;
|
@@ -1207,6 +1250,7 @@ export interface StoreEvent {
|
|
1207
1250
|
* Call to a language model.
|
1208
1251
|
*/
|
1209
1252
|
export interface ModelEvent {
|
1253
|
+
span_id: SpanId5;
|
1210
1254
|
timestamp: Timestamp5;
|
1211
1255
|
working_start: WorkingStart5;
|
1212
1256
|
pending: Pending5;
|
@@ -1218,6 +1262,7 @@ export interface ModelEvent {
|
|
1218
1262
|
tool_choice: ToolChoice;
|
1219
1263
|
config: GenerateConfig;
|
1220
1264
|
output: ModelOutput;
|
1265
|
+
retries: Retries;
|
1221
1266
|
error: Error1;
|
1222
1267
|
cache: Cache;
|
1223
1268
|
call: ModelCall | null;
|
@@ -1288,6 +1333,7 @@ export interface Response {
|
|
1288
1333
|
* Call to a tool.
|
1289
1334
|
*/
|
1290
1335
|
export interface ToolEvent {
|
1336
|
+
span_id: SpanId6;
|
1291
1337
|
timestamp: Timestamp6;
|
1292
1338
|
working_start: WorkingStart6;
|
1293
1339
|
pending: Pending6;
|
@@ -1314,6 +1360,7 @@ export interface Arguments1 {
|
|
1314
1360
|
* Tool approval.
|
1315
1361
|
*/
|
1316
1362
|
export interface ApprovalEvent {
|
1363
|
+
span_id: SpanId7;
|
1317
1364
|
timestamp: Timestamp7;
|
1318
1365
|
working_start: WorkingStart7;
|
1319
1366
|
pending: Pending7;
|
@@ -1340,6 +1387,7 @@ export interface ToolCallView {
|
|
1340
1387
|
* Input screen interaction.
|
1341
1388
|
*/
|
1342
1389
|
export interface InputEvent {
|
1390
|
+
span_id: SpanId8;
|
1343
1391
|
timestamp: Timestamp8;
|
1344
1392
|
working_start: WorkingStart8;
|
1345
1393
|
pending: Pending8;
|
@@ -1354,6 +1402,7 @@ export interface InputEvent {
|
|
1354
1402
|
* resulting from a call to `score`.
|
1355
1403
|
*/
|
1356
1404
|
export interface ScoreEvent {
|
1405
|
+
span_id: SpanId9;
|
1357
1406
|
timestamp: Timestamp9;
|
1358
1407
|
working_start: WorkingStart9;
|
1359
1408
|
pending: Pending9;
|
@@ -1366,6 +1415,7 @@ export interface ScoreEvent {
|
|
1366
1415
|
* Event with sample error.
|
1367
1416
|
*/
|
1368
1417
|
export interface ErrorEvent {
|
1418
|
+
span_id: SpanId10;
|
1369
1419
|
timestamp: Timestamp10;
|
1370
1420
|
working_start: WorkingStart10;
|
1371
1421
|
pending: Pending10;
|
@@ -1376,6 +1426,7 @@ export interface ErrorEvent {
|
|
1376
1426
|
* Log message recorded with Python logger.
|
1377
1427
|
*/
|
1378
1428
|
export interface LoggerEvent {
|
1429
|
+
span_id: SpanId11;
|
1379
1430
|
timestamp: Timestamp11;
|
1380
1431
|
working_start: WorkingStart11;
|
1381
1432
|
pending: Pending11;
|
@@ -1398,6 +1449,7 @@ export interface LoggingMessage {
|
|
1398
1449
|
* Event with custom info/data.
|
1399
1450
|
*/
|
1400
1451
|
export interface InfoEvent {
|
1452
|
+
span_id: SpanId12;
|
1401
1453
|
timestamp: Timestamp12;
|
1402
1454
|
working_start: WorkingStart12;
|
1403
1455
|
pending: Pending12;
|
@@ -1406,27 +1458,54 @@ export interface InfoEvent {
|
|
1406
1458
|
data: JsonValue;
|
1407
1459
|
}
|
1408
1460
|
/**
|
1409
|
-
* Step within current sample or subtask.
|
1461
|
+
* Mark the beginning of a transcript span.
|
1410
1462
|
*/
|
1411
|
-
export interface StepEvent {
|
1463
|
+
export interface SpanBeginEvent {
|
1464
|
+
span_id: SpanId13;
|
1412
1465
|
timestamp: Timestamp13;
|
1413
1466
|
working_start: WorkingStart13;
|
1414
1467
|
pending: Pending13;
|
1415
1468
|
event: Event13;
|
1416
|
-
action: Action1;
|
1469
|
+
id: Id8;
|
1470
|
+
parent_id: ParentId;
|
1417
1471
|
type: Type13;
|
1418
1472
|
name: Name11;
|
1419
1473
|
}
|
1420
1474
|
/**
|
1421
|
-
* Subtask spawned.
|
1475
|
+
* Mark the end of a transcript span.
|
1422
1476
|
*/
|
1423
|
-
export interface SubtaskEvent {
|
1477
|
+
export interface SpanEndEvent {
|
1478
|
+
span_id: SpanId14;
|
1424
1479
|
timestamp: Timestamp14;
|
1425
1480
|
working_start: WorkingStart14;
|
1426
1481
|
pending: Pending14;
|
1427
1482
|
event: Event14;
|
1428
|
-
name: Name12;
|
1483
|
+
id: Id9;
|
1484
|
+
}
|
1485
|
+
/**
|
1486
|
+
* Step within current sample or subtask.
|
1487
|
+
*/
|
1488
|
+
export interface StepEvent {
|
1489
|
+
span_id: SpanId15;
|
1490
|
+
timestamp: Timestamp15;
|
1491
|
+
working_start: WorkingStart15;
|
1492
|
+
pending: Pending15;
|
1493
|
+
event: Event15;
|
1494
|
+
action: Action1;
|
1429
1495
|
type: Type14;
|
1496
|
+
name: Name12;
|
1497
|
+
}
|
1498
|
+
/**
|
1499
|
+
* Subtask spawned.
|
1500
|
+
*/
|
1501
|
+
export interface SubtaskEvent {
|
1502
|
+
span_id: SpanId16;
|
1503
|
+
timestamp: Timestamp16;
|
1504
|
+
working_start: WorkingStart16;
|
1505
|
+
pending: Pending16;
|
1506
|
+
event: Event16;
|
1507
|
+
name: Name13;
|
1508
|
+
type: Type15;
|
1430
1509
|
input: Input5;
|
1431
1510
|
result: Result2;
|
1432
1511
|
events: Events2;
|
@@ -1449,7 +1528,7 @@ export interface Attachments {
|
|
1449
1528
|
* Limit encontered by sample.
|
1450
1529
|
*/
|
1451
1530
|
export interface EvalSampleLimit {
|
1452
|
-
type: Type15;
|
1531
|
+
type: Type16;
|
1453
1532
|
limit: Limit2;
|
1454
1533
|
}
|
1455
1534
|
/**
|
@@ -41,13 +41,13 @@ export const MetaDataGrid: FC<MetadataGridProps> = ({
|
|
41
41
|
styles.cell,
|
42
42
|
"text-style-label",
|
43
43
|
"text-style-secondary",
|
44
|
-
"text-size-
|
44
|
+
"text-size-smaller",
|
45
45
|
)}
|
46
46
|
>
|
47
47
|
{entry.name}
|
48
48
|
</div>
|
49
49
|
<div
|
50
|
-
className={clsx(styles.value, `${baseId}-value`, "text-size-
|
50
|
+
className={clsx(styles.value, `${baseId}-value`, "text-size-smaller")}
|
51
51
|
>
|
52
52
|
<RenderedContent id={id} entry={entry} />
|
53
53
|
</div>
|
@@ -54,7 +54,7 @@ export const RenderedContent: FC<RenderedContentProps> = ({
|
|
54
54
|
if (typeof entry.value === "object") {
|
55
55
|
return JSON.stringify(entry.value);
|
56
56
|
}
|
57
|
-
return String(entry.value);
|
57
|
+
return String(entry.value).trim();
|
58
58
|
} catch (e) {
|
59
59
|
return "[Unable to display value]";
|
60
60
|
}
|
@@ -17,7 +17,9 @@ import { useLogNavigation } from "../routing/navigationHooks";
|
|
17
17
|
import styles from "./LogView.module.css";
|
18
18
|
import { useInfoTabConfig } from "./tabs/InfoTab";
|
19
19
|
import { useJsonTabConfig } from "./tabs/JsonTab";
|
20
|
+
import { useModelsTab } from "./tabs/ModelsTab";
|
20
21
|
import { useSamplesTabConfig } from "./tabs/SamplesTab";
|
22
|
+
import { useTaskTabConfig } from "./tabs/TaskTab";
|
21
23
|
import { TabDescriptor } from "./types";
|
22
24
|
|
23
25
|
export const LogView: FC = () => {
|
@@ -45,7 +47,14 @@ export const LogView: FC = () => {
|
|
45
47
|
selectedLogSummary?.plan,
|
46
48
|
selectedLogSummary?.error,
|
47
49
|
selectedLogSummary?.results,
|
50
|
+
);
|
51
|
+
|
52
|
+
const taskTabConfig = useTaskTabConfig(evalSpec, selectedLogSummary?.stats);
|
53
|
+
|
54
|
+
const modelsTabConfig = useModelsTab(
|
55
|
+
evalSpec,
|
48
56
|
selectedLogSummary?.stats,
|
57
|
+
selectedLogSummary?.status,
|
49
58
|
);
|
50
59
|
|
51
60
|
const jsonTabConfig = useJsonTabConfig(
|
@@ -60,6 +69,8 @@ export const LogView: FC = () => {
|
|
60
69
|
|
61
70
|
const tabs: Record<string, TabDescriptor<any>> = {
|
62
71
|
...(samplesTabConfig ? { samples: samplesTabConfig } : {}),
|
72
|
+
task: taskTabConfig,
|
73
|
+
model: modelsTabConfig,
|
63
74
|
config: configTabConfig,
|
64
75
|
json: jsonTabConfig,
|
65
76
|
};
|
@@ -6,14 +6,12 @@ import {
|
|
6
6
|
EvalSpec,
|
7
7
|
EvalStats,
|
8
8
|
} from "../../../@types/log";
|
9
|
-
import { UsageCard } from "../../usage/UsageCard";
|
10
|
-
import { TaskErrorCard } from "../error/TaskErrorPanel";
|
11
9
|
import { SampleSummary } from "../../../client/api/types";
|
12
10
|
import { MessageBand } from "../../../components/MessageBand";
|
13
|
-
import { ModelCard } from "../../plan/ModelCard";
|
14
11
|
import { kLogViewInfoTabId } from "../../../constants";
|
15
12
|
import { useTotalSampleCount } from "../../../state/hooks";
|
16
13
|
import { PlanCard } from "../../plan/PlanCard";
|
14
|
+
import { TaskErrorCard } from "../error/TaskErrorPanel";
|
17
15
|
|
18
16
|
// Individual hook for Info tab
|
19
17
|
export const useInfoTabConfig = (
|
@@ -21,7 +19,6 @@ export const useInfoTabConfig = (
|
|
21
19
|
evalPlan: EvalPlan | undefined,
|
22
20
|
evalError: EvalError | undefined | null,
|
23
21
|
evalResults: EvalResults | undefined | null,
|
24
|
-
evalStats: EvalStats | undefined,
|
25
22
|
) => {
|
26
23
|
const totalSampleCount = useTotalSampleCount();
|
27
24
|
return useMemo(() => {
|
@@ -35,11 +32,10 @@ export const useInfoTabConfig = (
|
|
35
32
|
evalPlan,
|
36
33
|
evalError,
|
37
34
|
evalResults,
|
38
|
-
evalStats,
|
39
35
|
sampleCount: totalSampleCount,
|
40
36
|
},
|
41
37
|
};
|
42
|
-
}, [evalSpec, evalPlan, evalError, evalResults,
|
38
|
+
}, [evalSpec, evalPlan, evalError, evalResults, totalSampleCount]);
|
43
39
|
};
|
44
40
|
|
45
41
|
interface PlanTabProps {
|
@@ -57,7 +53,6 @@ export const InfoTab: FC<PlanTabProps> = ({
|
|
57
53
|
evalSpec,
|
58
54
|
evalPlan,
|
59
55
|
evalResults,
|
60
|
-
evalStats,
|
61
56
|
evalStatus,
|
62
57
|
evalError,
|
63
58
|
sampleCount,
|
@@ -85,8 +80,6 @@ export const InfoTab: FC<PlanTabProps> = ({
|
|
85
80
|
evalPlan={evalPlan}
|
86
81
|
scores={evalResults?.scores}
|
87
82
|
/>
|
88
|
-
{evalSpec ? <ModelCard evalSpec={evalSpec} /> : undefined}
|
89
|
-
{evalStatus !== "started" ? <UsageCard stats={evalStats} /> : undefined}
|
90
83
|
{evalStatus === "error" && evalError ? (
|
91
84
|
<TaskErrorCard error={evalError} />
|
92
85
|
) : undefined}
|
@@ -0,0 +1,51 @@
|
|
1
|
+
import { FC, useMemo } from "react";
|
2
|
+
import { EvalSpec, EvalStats, Status } from "../../../@types/log";
|
3
|
+
import { kLogViewModelsTabId } from "../../../constants";
|
4
|
+
import { ModelCard } from "../../plan/ModelCard";
|
5
|
+
import { UsageCard } from "../../usage/UsageCard";
|
6
|
+
|
7
|
+
// Individual hook for Info tab
|
8
|
+
export const useModelsTab = (
|
9
|
+
evalSpec: EvalSpec | undefined,
|
10
|
+
evalStats: EvalStats | undefined,
|
11
|
+
evalStatus?: Status,
|
12
|
+
) => {
|
13
|
+
return useMemo(() => {
|
14
|
+
return {
|
15
|
+
id: kLogViewModelsTabId,
|
16
|
+
label: "Models",
|
17
|
+
scrollable: true,
|
18
|
+
component: ModelTab,
|
19
|
+
componentProps: {
|
20
|
+
evalSpec,
|
21
|
+
evalStats,
|
22
|
+
evalStatus,
|
23
|
+
},
|
24
|
+
};
|
25
|
+
}, [evalSpec, evalStats]);
|
26
|
+
};
|
27
|
+
|
28
|
+
interface ModelTabProps {
|
29
|
+
evalSpec?: EvalSpec;
|
30
|
+
evalStats?: EvalStats;
|
31
|
+
evalStatus?: Status;
|
32
|
+
}
|
33
|
+
|
34
|
+
export const ModelTab: FC<ModelTabProps> = ({
|
35
|
+
evalSpec,
|
36
|
+
evalStats,
|
37
|
+
evalStatus,
|
38
|
+
}) => {
|
39
|
+
return (
|
40
|
+
<div style={{ width: "100%" }}>
|
41
|
+
<div style={{ padding: "0.5em 1em 0 1em", width: "100%" }}>
|
42
|
+
{evalSpec ? <ModelCard evalSpec={evalSpec} /> : undefined}
|
43
|
+
{evalStatus !== "started" &&
|
44
|
+
evalStats?.model_usage &&
|
45
|
+
Object.keys(evalStats.model_usage).length > 0 && (
|
46
|
+
<UsageCard stats={evalStats} />
|
47
|
+
)}
|
48
|
+
</div>
|
49
|
+
</div>
|
50
|
+
);
|
51
|
+
};
|
@@ -0,0 +1,143 @@
|
|
1
|
+
import clsx from "clsx";
|
2
|
+
import { FC, useMemo } from "react";
|
3
|
+
import { EvalSpec, EvalStats } from "../../../@types/log";
|
4
|
+
import { Card, CardBody, CardHeader } from "../../../components/Card";
|
5
|
+
import { kLogViewTaskTabId } from "../../../constants";
|
6
|
+
import { formatDuration, toTitleCase } from "../../../utils/format";
|
7
|
+
import { ghCommitUrl } from "../../../utils/git";
|
8
|
+
import { MetaDataView } from "../../content/MetaDataView";
|
9
|
+
|
10
|
+
import styles from "./TaskTab.module.css";
|
11
|
+
|
12
|
+
// Individual hook for Info tab
|
13
|
+
export const useTaskTabConfig = (
|
14
|
+
evalSpec: EvalSpec | undefined,
|
15
|
+
evalStats?: EvalStats,
|
16
|
+
) => {
|
17
|
+
return useMemo(() => {
|
18
|
+
return {
|
19
|
+
id: kLogViewTaskTabId,
|
20
|
+
label: "Task",
|
21
|
+
scrollable: true,
|
22
|
+
component: TaskTab,
|
23
|
+
componentProps: {
|
24
|
+
evalSpec,
|
25
|
+
evalStats,
|
26
|
+
},
|
27
|
+
};
|
28
|
+
}, [evalSpec, evalStats]);
|
29
|
+
};
|
30
|
+
|
31
|
+
interface TaskTabProps {
|
32
|
+
evalSpec?: EvalSpec;
|
33
|
+
evalStats?: EvalStats;
|
34
|
+
}
|
35
|
+
|
36
|
+
export const TaskTab: FC<TaskTabProps> = ({ evalSpec, evalStats }) => {
|
37
|
+
const config: Record<string, unknown> = {};
|
38
|
+
Object.entries(evalSpec?.config || {}).forEach((entry) => {
|
39
|
+
const key = entry[0];
|
40
|
+
const value = entry[1];
|
41
|
+
config[key] = value;
|
42
|
+
});
|
43
|
+
|
44
|
+
const revision = evalSpec?.revision;
|
45
|
+
const packages = evalSpec?.packages;
|
46
|
+
|
47
|
+
const taskInformation: Record<string, unknown> = {
|
48
|
+
["Task ID"]: evalSpec?.task_id,
|
49
|
+
["Run ID"]: evalSpec?.run_id,
|
50
|
+
};
|
51
|
+
|
52
|
+
if (revision) {
|
53
|
+
taskInformation[
|
54
|
+
`${revision.type ? `${toTitleCase(revision.type)} ` : ""}Revision`
|
55
|
+
] = {
|
56
|
+
_html: (
|
57
|
+
<a href={ghCommitUrl(revision.origin, revision.commit)}>
|
58
|
+
{revision.commit}
|
59
|
+
</a>
|
60
|
+
),
|
61
|
+
};
|
62
|
+
}
|
63
|
+
if (packages) {
|
64
|
+
const names = Object.keys(packages).map((key) => {
|
65
|
+
return `${key} ${packages[key]}`;
|
66
|
+
});
|
67
|
+
|
68
|
+
if (names.length === 1) {
|
69
|
+
taskInformation["Inspect"] = names[0];
|
70
|
+
} else {
|
71
|
+
taskInformation["Inspect"] = names;
|
72
|
+
}
|
73
|
+
}
|
74
|
+
if (evalSpec?.tags) {
|
75
|
+
taskInformation["tags"] = evalSpec?.tags.join(", ");
|
76
|
+
}
|
77
|
+
|
78
|
+
if (evalSpec?.sandbox) {
|
79
|
+
if (Array.isArray(evalSpec?.sandbox)) {
|
80
|
+
taskInformation["sandbox"] = evalSpec.sandbox[0];
|
81
|
+
if (evalSpec.sandbox[1]) {
|
82
|
+
taskInformation["sandbox_config"] = evalSpec.sandbox[1];
|
83
|
+
}
|
84
|
+
} else {
|
85
|
+
taskInformation["sandbox"] = evalSpec?.sandbox.type;
|
86
|
+
taskInformation["sandbox_config"] = evalSpec?.sandbox.config;
|
87
|
+
}
|
88
|
+
}
|
89
|
+
|
90
|
+
const totalDuration = formatDuration(
|
91
|
+
new Date(evalStats?.started_at || 0),
|
92
|
+
new Date(evalStats?.completed_at || 0),
|
93
|
+
);
|
94
|
+
|
95
|
+
const task_args = evalSpec?.task_args || {};
|
96
|
+
|
97
|
+
return (
|
98
|
+
<div style={{ width: "100%" }}>
|
99
|
+
<div style={{ padding: "0.5em 1em 0 1em", width: "100%" }}>
|
100
|
+
<Card>
|
101
|
+
<CardHeader label="Task Info" />
|
102
|
+
<CardBody id={"task-card-config"}>
|
103
|
+
<div className={clsx(styles.grid)}>
|
104
|
+
<MetaDataView
|
105
|
+
key={`plan-md-task`}
|
106
|
+
className={"text-size-small"}
|
107
|
+
entries={taskInformation}
|
108
|
+
tableOptions="sm"
|
109
|
+
/>
|
110
|
+
|
111
|
+
<MetaDataView
|
112
|
+
entries={{
|
113
|
+
["Start"]: new Date(
|
114
|
+
evalStats?.started_at || 0,
|
115
|
+
).toLocaleString(),
|
116
|
+
["End"]: new Date(
|
117
|
+
evalStats?.completed_at || 0,
|
118
|
+
).toLocaleString(),
|
119
|
+
["Duration"]: totalDuration,
|
120
|
+
}}
|
121
|
+
tableOptions="sm"
|
122
|
+
/>
|
123
|
+
</div>
|
124
|
+
</CardBody>
|
125
|
+
</Card>
|
126
|
+
|
127
|
+
{Object.keys(task_args).length > 0 && (
|
128
|
+
<Card>
|
129
|
+
<CardHeader label="Task Args" />
|
130
|
+
<CardBody id={"task-card-config"}>
|
131
|
+
<MetaDataView
|
132
|
+
key={`plan-md-task-args`}
|
133
|
+
className={"text-size-small"}
|
134
|
+
entries={task_args as Record<string, unknown>}
|
135
|
+
tableOptions="sm"
|
136
|
+
/>
|
137
|
+
</CardBody>
|
138
|
+
</Card>
|
139
|
+
)}
|
140
|
+
</div>
|
141
|
+
</div>
|
142
|
+
);
|
143
|
+
};
|