inspect-ai 0.3.92__py3-none-any.whl → 0.3.94__py3-none-any.whl

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (149)
  1. inspect_ai/_cli/eval.py +27 -0
  2. inspect_ai/_display/textual/widgets/samples.py +3 -3
  3. inspect_ai/_display/textual/widgets/transcript.py +3 -29
  4. inspect_ai/_eval/eval.py +19 -2
  5. inspect_ai/_eval/evalset.py +4 -1
  6. inspect_ai/_eval/run.py +41 -0
  7. inspect_ai/_eval/task/generate.py +38 -44
  8. inspect_ai/_eval/task/log.py +26 -28
  9. inspect_ai/_eval/task/run.py +23 -27
  10. inspect_ai/_util/answer.py +26 -0
  11. inspect_ai/_util/constants.py +0 -1
  12. inspect_ai/_util/local_server.py +398 -0
  13. inspect_ai/_util/working.py +10 -4
  14. inspect_ai/_view/www/dist/assets/index.css +173 -159
  15. inspect_ai/_view/www/dist/assets/index.js +1417 -1142
  16. inspect_ai/_view/www/log-schema.json +379 -3
  17. inspect_ai/_view/www/package.json +1 -1
  18. inspect_ai/_view/www/src/@types/log.d.ts +93 -14
  19. inspect_ai/_view/www/src/app/content/MetaDataGrid.tsx +2 -2
  20. inspect_ai/_view/www/src/app/content/MetaDataView.module.css +1 -1
  21. inspect_ai/_view/www/src/app/content/MetadataGrid.module.css +1 -1
  22. inspect_ai/_view/www/src/app/content/RenderedContent.tsx +1 -1
  23. inspect_ai/_view/www/src/app/log-view/LogView.tsx +11 -0
  24. inspect_ai/_view/www/src/app/log-view/tabs/InfoTab.tsx +2 -9
  25. inspect_ai/_view/www/src/app/log-view/tabs/ModelsTab.tsx +51 -0
  26. inspect_ai/_view/www/src/app/log-view/tabs/TaskTab.module.css +6 -0
  27. inspect_ai/_view/www/src/app/log-view/tabs/TaskTab.tsx +143 -0
  28. inspect_ai/_view/www/src/app/plan/ModelCard.tsx +1 -2
  29. inspect_ai/_view/www/src/app/plan/PlanCard.tsx +29 -7
  30. inspect_ai/_view/www/src/app/plan/PlanDetailView.module.css +1 -1
  31. inspect_ai/_view/www/src/app/plan/PlanDetailView.tsx +1 -198
  32. inspect_ai/_view/www/src/app/samples/descriptor/score/NumericScoreDescriptor.tsx +2 -1
  33. inspect_ai/_view/www/src/app/samples/transcript/SandboxEventView.module.css +2 -1
  34. inspect_ai/_view/www/src/app/samples/transcript/SpanEventView.tsx +174 -0
  35. inspect_ai/_view/www/src/app/samples/transcript/ToolEventView.tsx +8 -8
  36. inspect_ai/_view/www/src/app/samples/transcript/TranscriptView.tsx +12 -2
  37. inspect_ai/_view/www/src/app/samples/transcript/TranscriptVirtualListComponent.module.css +1 -1
  38. inspect_ai/_view/www/src/app/samples/transcript/event/EventPanel.tsx +0 -3
  39. inspect_ai/_view/www/src/app/samples/transcript/transform/fixups.ts +87 -25
  40. inspect_ai/_view/www/src/app/samples/transcript/transform/treeify.ts +229 -17
  41. inspect_ai/_view/www/src/app/samples/transcript/transform/utils.ts +11 -0
  42. inspect_ai/_view/www/src/app/samples/transcript/types.ts +5 -1
  43. inspect_ai/_view/www/src/app/usage/ModelUsagePanel.tsx +3 -2
  44. inspect_ai/_view/www/src/app/usage/TokenTable.module.css +4 -1
  45. inspect_ai/_view/www/src/app/usage/TokenTable.tsx +2 -2
  46. inspect_ai/_view/www/src/app/usage/UsageCard.module.css +8 -3
  47. inspect_ai/_view/www/src/app/usage/UsageCard.tsx +1 -35
  48. inspect_ai/_view/www/src/components/Card.css +0 -1
  49. inspect_ai/_view/www/src/constants.ts +2 -0
  50. inspect_ai/_view/www/src/utils/numeric.ts +17 -0
  51. inspect_ai/agent/_agent.py +3 -3
  52. inspect_ai/agent/_as_solver.py +22 -12
  53. inspect_ai/agent/_as_tool.py +20 -6
  54. inspect_ai/agent/_handoff.py +12 -1
  55. inspect_ai/agent/_react.py +4 -3
  56. inspect_ai/agent/_run.py +16 -3
  57. inspect_ai/agent/_types.py +9 -0
  58. inspect_ai/dataset/_dataset.py +6 -3
  59. inspect_ai/log/__init__.py +14 -0
  60. inspect_ai/log/_convert.py +4 -9
  61. inspect_ai/log/_file.py +56 -0
  62. inspect_ai/log/_log.py +99 -0
  63. inspect_ai/log/_recorders/__init__.py +2 -0
  64. inspect_ai/log/_recorders/buffer/database.py +12 -11
  65. inspect_ai/log/_recorders/buffer/filestore.py +2 -2
  66. inspect_ai/log/_recorders/buffer/types.py +2 -2
  67. inspect_ai/log/_recorders/eval.py +20 -65
  68. inspect_ai/log/_recorders/file.py +28 -6
  69. inspect_ai/log/_recorders/recorder.py +7 -0
  70. inspect_ai/log/_recorders/types.py +1 -23
  71. inspect_ai/log/_samples.py +14 -25
  72. inspect_ai/log/_transcript.py +84 -36
  73. inspect_ai/log/_tree.py +118 -0
  74. inspect_ai/log/_util.py +52 -0
  75. inspect_ai/model/__init__.py +5 -1
  76. inspect_ai/model/_call_tools.py +72 -44
  77. inspect_ai/model/_generate_config.py +14 -8
  78. inspect_ai/model/_model.py +66 -88
  79. inspect_ai/model/_model_output.py +25 -0
  80. inspect_ai/model/_openai.py +2 -0
  81. inspect_ai/model/_providers/anthropic.py +13 -23
  82. inspect_ai/model/_providers/hf.py +27 -1
  83. inspect_ai/model/_providers/openai_o1.py +8 -2
  84. inspect_ai/model/_providers/providers.py +18 -4
  85. inspect_ai/model/_providers/sglang.py +247 -0
  86. inspect_ai/model/_providers/vllm.py +211 -400
  87. inspect_ai/scorer/_choice.py +1 -2
  88. inspect_ai/solver/__init__.py +7 -2
  89. inspect_ai/solver/_basic_agent.py +3 -10
  90. inspect_ai/solver/_chain.py +1 -1
  91. inspect_ai/solver/_fork.py +1 -1
  92. inspect_ai/solver/_multiple_choice.py +5 -22
  93. inspect_ai/solver/_plan.py +2 -2
  94. inspect_ai/solver/_task_state.py +26 -88
  95. inspect_ai/solver/_transcript.py +6 -7
  96. inspect_ai/tool/_json_rpc_helpers.py +45 -17
  97. inspect_ai/tool/_mcp/_mcp.py +8 -5
  98. inspect_ai/tool/_mcp/_sandbox.py +8 -2
  99. inspect_ai/tool/_mcp/server.py +3 -1
  100. inspect_ai/tool/_tool_call.py +4 -1
  101. inspect_ai/tool/_tool_support_helpers.py +51 -12
  102. inspect_ai/tool/_tools/_bash_session.py +190 -68
  103. inspect_ai/tool/_tools/_computer/_computer.py +25 -1
  104. inspect_ai/tool/_tools/_execute.py +4 -1
  105. inspect_ai/tool/_tools/_text_editor.py +4 -3
  106. inspect_ai/tool/_tools/_web_browser/_web_browser.py +10 -3
  107. inspect_ai/util/__init__.py +16 -0
  108. inspect_ai/util/_anyio.py +11 -0
  109. inspect_ai/util/_collect.py +50 -0
  110. inspect_ai/util/_limit.py +393 -0
  111. inspect_ai/util/_limited_conversation.py +57 -0
  112. inspect_ai/util/_span.py +58 -0
  113. inspect_ai/util/_subtask.py +27 -42
  114. {inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/METADATA +1 -1
  115. {inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/RECORD +120 -134
  116. {inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/WHEEL +1 -1
  117. inspect_ai/_display/core/group.py +0 -79
  118. inspect_ai/solver/_limit.py +0 -39
  119. inspect_ai/tool/_tools/_computer/_resources/Dockerfile +0 -102
  120. inspect_ai/tool/_tools/_computer/_resources/README.md +0 -30
  121. inspect_ai/tool/_tools/_computer/_resources/entrypoint/entrypoint.sh +0 -18
  122. inspect_ai/tool/_tools/_computer/_resources/entrypoint/novnc_startup.sh +0 -20
  123. inspect_ai/tool/_tools/_computer/_resources/entrypoint/x11vnc_startup.sh +0 -48
  124. inspect_ai/tool/_tools/_computer/_resources/entrypoint/xfce_startup.sh +0 -13
  125. inspect_ai/tool/_tools/_computer/_resources/entrypoint/xvfb_startup.sh +0 -48
  126. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/Code/User/globalStorage/state.vscdb +0 -0
  127. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/Code/User/settings.json +0 -9
  128. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-panel.xml +0 -61
  129. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-screensaver.xml +0 -10
  130. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfwm4.xml +0 -91
  131. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/Desktop/Firefox Web Browser.desktop +0 -10
  132. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/Desktop/Terminal.desktop +0 -10
  133. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/Desktop/Visual Studio Code.desktop +0 -10
  134. inspect_ai/tool/_tools/_computer/_resources/tool/.pylintrc +0 -8
  135. inspect_ai/tool/_tools/_computer/_resources/tool/.vscode/settings.json +0 -12
  136. inspect_ai/tool/_tools/_computer/_resources/tool/_args.py +0 -78
  137. inspect_ai/tool/_tools/_computer/_resources/tool/_constants.py +0 -22
  138. inspect_ai/tool/_tools/_computer/_resources/tool/_logger.py +0 -22
  139. inspect_ai/tool/_tools/_computer/_resources/tool/_run.py +0 -42
  140. inspect_ai/tool/_tools/_computer/_resources/tool/_tool_result.py +0 -33
  141. inspect_ai/tool/_tools/_computer/_resources/tool/_x11_client.py +0 -341
  142. inspect_ai/tool/_tools/_computer/_resources/tool/computer_tool.py +0 -141
  143. inspect_ai/tool/_tools/_computer/_resources/tool/pyproject.toml +0 -65
  144. inspect_ai/tool/_tools/_computer/_resources/tool/requirements.txt +0 -0
  145. inspect_ai/tool/_tools/_computer/test_args.py +0 -151
  146. /inspect_ai/{tool/_tools/_computer/_resources/tool/__init__.py → _view/www/src/app/log-view/tabs/ModelsTab.module.css} +0 -0
  147. {inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/entry_points.txt +0 -0
  148. {inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/licenses/LICENSE +0 -0
  149. {inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/top_level.txt +0 -0
@@ -263,9 +263,10 @@ export type Type9 =
263
263
  | "permission"
264
264
  | "file_not_found"
265
265
  | "is_a_directory"
266
- | "output_limit"
266
+ | "limit"
267
267
  | "approval"
268
- | "unknown";
268
+ | "unknown"
269
+ | "output_limit";
269
270
  export type Message1 = string;
270
271
  export type Choices = string[] | null;
271
272
  export type Target = string | string[];
@@ -315,6 +316,7 @@ export type Explanation = string | null;
315
316
  export type Metadata6 = {
316
317
  [k: string]: unknown;
317
318
  } | null;
319
+ export type SpanId = string | null;
318
320
  export type Timestamp = string;
319
321
  export type WorkingStart = number;
320
322
  export type Pending = boolean | null;
@@ -338,6 +340,7 @@ export type Files1 = {
338
340
  } | null;
339
341
  export type Setup1 = string | null;
340
342
  export type JsonValue = unknown;
343
+ export type SpanId1 = string | null;
341
344
  export type Timestamp1 = string;
342
345
  export type WorkingStart1 = number;
343
346
  export type Pending1 = boolean | null;
@@ -351,6 +354,7 @@ export type Type10 =
351
354
  | "custom";
352
355
  export type Message2 = string;
353
356
  export type Limit1 = number | null;
357
+ export type SpanId2 = string | null;
354
358
  export type Timestamp2 = string;
355
359
  export type WorkingStart2 = number;
356
360
  export type Pending2 = boolean | null;
@@ -365,6 +369,7 @@ export type Input2 = string | null;
365
369
  export type Result = number | null;
366
370
  export type Output = string | null;
367
371
  export type Completed = string | null;
372
+ export type SpanId3 = string | null;
368
373
  export type Timestamp3 = string;
369
374
  export type WorkingStart3 = number;
370
375
  export type Pending3 = boolean | null;
@@ -373,11 +378,13 @@ export type Op = "remove" | "add" | "replace" | "move" | "test" | "copy";
373
378
  export type Path = string;
374
379
  export type From = string | null;
375
380
  export type Changes = JsonChange[];
381
+ export type SpanId4 = string | null;
376
382
  export type Timestamp4 = string;
377
383
  export type WorkingStart4 = number;
378
384
  export type Pending4 = boolean | null;
379
385
  export type Event4 = "store";
380
386
  export type Changes1 = JsonChange[];
387
+ export type SpanId5 = string | null;
381
388
  export type Timestamp5 = string;
382
389
  export type WorkingStart5 = number;
383
390
  export type Pending5 = boolean | null;
@@ -398,11 +405,13 @@ export type Additionalproperties1 = boolean;
398
405
  export type Tools1 = ToolInfo[];
399
406
  export type ToolChoice = ("auto" | "any" | "none") | ToolFunction;
400
407
  export type Name9 = string;
408
+ export type Retries = number | null;
401
409
  export type Error1 = string | null;
402
410
  export type Cache = ("read" | "write") | null;
403
411
  export type Time1 = number | null;
404
412
  export type Completed1 = string | null;
405
413
  export type WorkingTime = number | null;
414
+ export type SpanId6 = string | null;
406
415
  export type Timestamp6 = string;
407
416
  export type WorkingStart6 = number;
408
417
  export type Pending6 = boolean | null;
@@ -427,6 +436,7 @@ export type Result1 =
427
436
  | ContentVideo
428
437
  )[];
429
438
  export type Truncated = [unknown, unknown] | null;
439
+ export type SpanId7 = string | null;
430
440
  export type Timestamp7 = string;
431
441
  export type WorkingStart7 = number;
432
442
  export type Pending7 = boolean | null;
@@ -440,22 +450,26 @@ export type Decision =
440
450
  | "escalate"
441
451
  | "terminate";
442
452
  export type Explanation1 = string | null;
453
+ export type SpanId8 = string | null;
443
454
  export type Timestamp8 = string;
444
455
  export type WorkingStart8 = number;
445
456
  export type Pending8 = boolean | null;
446
457
  export type Event8 = "input";
447
458
  export type Input4 = string;
448
459
  export type InputAnsi = string;
460
+ export type SpanId9 = string | null;
449
461
  export type Timestamp9 = string;
450
462
  export type WorkingStart9 = number;
451
463
  export type Pending9 = boolean | null;
452
464
  export type Event9 = "score";
453
465
  export type Target2 = string | string[] | null;
454
466
  export type Intermediate = boolean;
467
+ export type SpanId10 = string | null;
455
468
  export type Timestamp10 = string;
456
469
  export type WorkingStart10 = number;
457
470
  export type Pending10 = boolean | null;
458
471
  export type Event10 = "error";
472
+ export type SpanId11 = string | null;
459
473
  export type Timestamp11 = string;
460
474
  export type WorkingStart11 = number;
461
475
  export type Pending11 = boolean | null;
@@ -475,24 +489,42 @@ export type Created1 = number;
475
489
  export type Filename = string;
476
490
  export type Module = string;
477
491
  export type Lineno = number;
492
+ export type SpanId12 = string | null;
478
493
  export type Timestamp12 = string;
479
494
  export type WorkingStart12 = number;
480
495
  export type Pending12 = boolean | null;
481
496
  export type Event12 = "info";
482
497
  export type Source4 = string | null;
498
+ export type SpanId13 = string | null;
483
499
  export type Timestamp13 = string;
484
500
  export type WorkingStart13 = number;
485
501
  export type Pending13 = boolean | null;
486
- export type Event13 = "step";
487
- export type Action1 = "begin" | "end";
502
+ export type Event13 = "span_begin";
503
+ export type Id8 = string;
504
+ export type ParentId = string | null;
488
505
  export type Type13 = string | null;
489
506
  export type Name11 = string;
507
+ export type SpanId14 = string | null;
490
508
  export type Timestamp14 = string;
491
509
  export type WorkingStart14 = number;
492
510
  export type Pending14 = boolean | null;
493
- export type Event14 = "subtask";
494
- export type Name12 = string;
511
+ export type Event14 = "span_end";
512
+ export type Id9 = string;
513
+ export type SpanId15 = string | null;
514
+ export type Timestamp15 = string;
515
+ export type WorkingStart15 = number;
516
+ export type Pending15 = boolean | null;
517
+ export type Event15 = "step";
518
+ export type Action1 = "begin" | "end";
495
519
  export type Type14 = string | null;
520
+ export type Name12 = string;
521
+ export type SpanId16 = string | null;
522
+ export type Timestamp16 = string;
523
+ export type WorkingStart16 = number;
524
+ export type Pending16 = boolean | null;
525
+ export type Event16 = "subtask";
526
+ export type Name13 = string;
527
+ export type Type15 = string | null;
496
528
  export type Events2 = (
497
529
  | SampleInitEvent
498
530
  | SampleLimitEvent
@@ -507,6 +539,8 @@ export type Events2 = (
507
539
  | ErrorEvent
508
540
  | LoggerEvent
509
541
  | InfoEvent
542
+ | SpanBeginEvent
543
+ | SpanEndEvent
510
544
  | StepEvent
511
545
  | SubtaskEvent
512
546
  )[];
@@ -526,6 +560,8 @@ export type Events1 = (
526
560
  | ErrorEvent
527
561
  | LoggerEvent
528
562
  | InfoEvent
563
+ | SpanBeginEvent
564
+ | SpanEndEvent
529
565
  | StepEvent
530
566
  | SubtaskEvent
531
567
  )[];
@@ -547,6 +583,8 @@ export type Events = (
547
583
  | ErrorEvent
548
584
  | LoggerEvent
549
585
  | InfoEvent
586
+ | SpanBeginEvent
587
+ | SpanEndEvent
550
588
  | StepEvent
551
589
  | SubtaskEvent
552
590
  )[];
@@ -554,7 +592,7 @@ export type TotalTime = number | null;
554
592
  export type WorkingTime3 = number | null;
555
593
  export type Uuid = string | null;
556
594
  export type ErrorRetries = EvalError[] | null;
557
- export type Type15 =
595
+ export type Type16 =
558
596
  | "context"
559
597
  | "time"
560
598
  | "working"
@@ -1120,6 +1158,7 @@ export interface Store {
1120
1158
  * Beginning of processing a Sample.
1121
1159
  */
1122
1160
  export interface SampleInitEvent {
1161
+ span_id: SpanId;
1123
1162
  timestamp: Timestamp;
1124
1163
  working_start: WorkingStart;
1125
1164
  pending: Pending;
@@ -1144,6 +1183,7 @@ export interface Sample {
1144
1183
  * The sample was unable to finish processing due to a limit
1145
1184
  */
1146
1185
  export interface SampleLimitEvent {
1186
+ span_id: SpanId1;
1147
1187
  timestamp: Timestamp1;
1148
1188
  working_start: WorkingStart1;
1149
1189
  pending: Pending1;
@@ -1156,6 +1196,7 @@ export interface SampleLimitEvent {
1156
1196
  * Sandbox execution or I/O
1157
1197
  */
1158
1198
  export interface SandboxEvent {
1199
+ span_id: SpanId2;
1159
1200
  timestamp: Timestamp2;
1160
1201
  working_start: WorkingStart2;
1161
1202
  pending: Pending2;
@@ -1173,6 +1214,7 @@ export interface SandboxEvent {
1173
1214
  * Change to the current `TaskState`
1174
1215
  */
1175
1216
  export interface StateEvent {
1217
+ span_id: SpanId3;
1176
1218
  timestamp: Timestamp3;
1177
1219
  working_start: WorkingStart3;
1178
1220
  pending: Pending3;
@@ -1197,6 +1239,7 @@ export interface JsonChange {
1197
1239
  * Change to data within the current `Store`.
1198
1240
  */
1199
1241
  export interface StoreEvent {
1242
+ span_id: SpanId4;
1200
1243
  timestamp: Timestamp4;
1201
1244
  working_start: WorkingStart4;
1202
1245
  pending: Pending4;
@@ -1207,6 +1250,7 @@ export interface StoreEvent {
1207
1250
  * Call to a language model.
1208
1251
  */
1209
1252
  export interface ModelEvent {
1253
+ span_id: SpanId5;
1210
1254
  timestamp: Timestamp5;
1211
1255
  working_start: WorkingStart5;
1212
1256
  pending: Pending5;
@@ -1218,6 +1262,7 @@ export interface ModelEvent {
1218
1262
  tool_choice: ToolChoice;
1219
1263
  config: GenerateConfig;
1220
1264
  output: ModelOutput;
1265
+ retries: Retries;
1221
1266
  error: Error1;
1222
1267
  cache: Cache;
1223
1268
  call: ModelCall | null;
@@ -1288,6 +1333,7 @@ export interface Response {
1288
1333
  * Call to a tool.
1289
1334
  */
1290
1335
  export interface ToolEvent {
1336
+ span_id: SpanId6;
1291
1337
  timestamp: Timestamp6;
1292
1338
  working_start: WorkingStart6;
1293
1339
  pending: Pending6;
@@ -1314,6 +1360,7 @@ export interface Arguments1 {
1314
1360
  * Tool approval.
1315
1361
  */
1316
1362
  export interface ApprovalEvent {
1363
+ span_id: SpanId7;
1317
1364
  timestamp: Timestamp7;
1318
1365
  working_start: WorkingStart7;
1319
1366
  pending: Pending7;
@@ -1340,6 +1387,7 @@ export interface ToolCallView {
1340
1387
  * Input screen interaction.
1341
1388
  */
1342
1389
  export interface InputEvent {
1390
+ span_id: SpanId8;
1343
1391
  timestamp: Timestamp8;
1344
1392
  working_start: WorkingStart8;
1345
1393
  pending: Pending8;
@@ -1354,6 +1402,7 @@ export interface InputEvent {
1354
1402
  * resulting from a call to `score`.
1355
1403
  */
1356
1404
  export interface ScoreEvent {
1405
+ span_id: SpanId9;
1357
1406
  timestamp: Timestamp9;
1358
1407
  working_start: WorkingStart9;
1359
1408
  pending: Pending9;
@@ -1366,6 +1415,7 @@ export interface ScoreEvent {
1366
1415
  * Event with sample error.
1367
1416
  */
1368
1417
  export interface ErrorEvent {
1418
+ span_id: SpanId10;
1369
1419
  timestamp: Timestamp10;
1370
1420
  working_start: WorkingStart10;
1371
1421
  pending: Pending10;
@@ -1376,6 +1426,7 @@ export interface ErrorEvent {
1376
1426
  * Log message recorded with Python logger.
1377
1427
  */
1378
1428
  export interface LoggerEvent {
1429
+ span_id: SpanId11;
1379
1430
  timestamp: Timestamp11;
1380
1431
  working_start: WorkingStart11;
1381
1432
  pending: Pending11;
@@ -1398,6 +1449,7 @@ export interface LoggingMessage {
1398
1449
  * Event with custom info/data.
1399
1450
  */
1400
1451
  export interface InfoEvent {
1452
+ span_id: SpanId12;
1401
1453
  timestamp: Timestamp12;
1402
1454
  working_start: WorkingStart12;
1403
1455
  pending: Pending12;
@@ -1406,27 +1458,54 @@ export interface InfoEvent {
1406
1458
  data: JsonValue;
1407
1459
  }
1408
1460
  /**
1409
- * Step within current sample or subtask.
1461
+ * Mark the beginning of a transcript span.
1410
1462
  */
1411
- export interface StepEvent {
1463
+ export interface SpanBeginEvent {
1464
+ span_id: SpanId13;
1412
1465
  timestamp: Timestamp13;
1413
1466
  working_start: WorkingStart13;
1414
1467
  pending: Pending13;
1415
1468
  event: Event13;
1416
- action: Action1;
1469
+ id: Id8;
1470
+ parent_id: ParentId;
1417
1471
  type: Type13;
1418
1472
  name: Name11;
1419
1473
  }
1420
1474
  /**
1421
- * Subtask spawned.
1475
+ * Mark the end of a transcript span.
1422
1476
  */
1423
- export interface SubtaskEvent {
1477
+ export interface SpanEndEvent {
1478
+ span_id: SpanId14;
1424
1479
  timestamp: Timestamp14;
1425
1480
  working_start: WorkingStart14;
1426
1481
  pending: Pending14;
1427
1482
  event: Event14;
1428
- name: Name12;
1483
+ id: Id9;
1484
+ }
1485
+ /**
1486
+ * Step within current sample or subtask.
1487
+ */
1488
+ export interface StepEvent {
1489
+ span_id: SpanId15;
1490
+ timestamp: Timestamp15;
1491
+ working_start: WorkingStart15;
1492
+ pending: Pending15;
1493
+ event: Event15;
1494
+ action: Action1;
1429
1495
  type: Type14;
1496
+ name: Name12;
1497
+ }
1498
+ /**
1499
+ * Subtask spawned.
1500
+ */
1501
+ export interface SubtaskEvent {
1502
+ span_id: SpanId16;
1503
+ timestamp: Timestamp16;
1504
+ working_start: WorkingStart16;
1505
+ pending: Pending16;
1506
+ event: Event16;
1507
+ name: Name13;
1508
+ type: Type15;
1430
1509
  input: Input5;
1431
1510
  result: Result2;
1432
1511
  events: Events2;
@@ -1449,7 +1528,7 @@ export interface Attachments {
1449
1528
  * Limit encontered by sample.
1450
1529
  */
1451
1530
  export interface EvalSampleLimit {
1452
- type: Type15;
1531
+ type: Type16;
1453
1532
  limit: Limit2;
1454
1533
  }
1455
1534
  /**
@@ -41,13 +41,13 @@ export const MetaDataGrid: FC<MetadataGridProps> = ({
41
41
  styles.cell,
42
42
  "text-style-label",
43
43
  "text-style-secondary",
44
- "text-size-small",
44
+ "text-size-smaller",
45
45
  )}
46
46
  >
47
47
  {entry.name}
48
48
  </div>
49
49
  <div
50
- className={clsx(styles.value, `${baseId}-value`, "text-size-small")}
50
+ className={clsx(styles.value, `${baseId}-value`, "text-size-smaller")}
51
51
  >
52
52
  <RenderedContent id={id} entry={entry} />
53
53
  </div>
@@ -9,7 +9,7 @@
9
9
  }
10
10
 
11
11
  .cell {
12
- padding: 0em 0.5em 0.3em 0em !important;
12
+ padding: 0em 0.7em 0.3em 0em !important;
13
13
  }
14
14
 
15
15
  .compact .cell {
@@ -1,7 +1,7 @@
1
1
  .grid {
2
2
  display: grid;
3
3
  grid-template-columns: max-content auto;
4
- column-gap: 1em;
4
+ column-gap: 0.5em;
5
5
  row-gap: 0.2em;
6
6
  }
7
7
 
@@ -54,7 +54,7 @@ export const RenderedContent: FC<RenderedContentProps> = ({
54
54
  if (typeof entry.value === "object") {
55
55
  return JSON.stringify(entry.value);
56
56
  }
57
- return String(entry.value);
57
+ return String(entry.value).trim();
58
58
  } catch (e) {
59
59
  return "[Unable to display value]";
60
60
  }
@@ -17,7 +17,9 @@ import { useLogNavigation } from "../routing/navigationHooks";
17
17
  import styles from "./LogView.module.css";
18
18
  import { useInfoTabConfig } from "./tabs/InfoTab";
19
19
  import { useJsonTabConfig } from "./tabs/JsonTab";
20
+ import { useModelsTab } from "./tabs/ModelsTab";
20
21
  import { useSamplesTabConfig } from "./tabs/SamplesTab";
22
+ import { useTaskTabConfig } from "./tabs/TaskTab";
21
23
  import { TabDescriptor } from "./types";
22
24
 
23
25
  export const LogView: FC = () => {
@@ -45,7 +47,14 @@ export const LogView: FC = () => {
45
47
  selectedLogSummary?.plan,
46
48
  selectedLogSummary?.error,
47
49
  selectedLogSummary?.results,
50
+ );
51
+
52
+ const taskTabConfig = useTaskTabConfig(evalSpec, selectedLogSummary?.stats);
53
+
54
+ const modelsTabConfig = useModelsTab(
55
+ evalSpec,
48
56
  selectedLogSummary?.stats,
57
+ selectedLogSummary?.status,
49
58
  );
50
59
 
51
60
  const jsonTabConfig = useJsonTabConfig(
@@ -60,6 +69,8 @@ export const LogView: FC = () => {
60
69
 
61
70
  const tabs: Record<string, TabDescriptor<any>> = {
62
71
  ...(samplesTabConfig ? { samples: samplesTabConfig } : {}),
72
+ task: taskTabConfig,
73
+ model: modelsTabConfig,
63
74
  config: configTabConfig,
64
75
  json: jsonTabConfig,
65
76
  };
@@ -6,14 +6,12 @@ import {
6
6
  EvalSpec,
7
7
  EvalStats,
8
8
  } from "../../../@types/log";
9
- import { UsageCard } from "../../usage/UsageCard";
10
- import { TaskErrorCard } from "../error/TaskErrorPanel";
11
9
  import { SampleSummary } from "../../../client/api/types";
12
10
  import { MessageBand } from "../../../components/MessageBand";
13
- import { ModelCard } from "../../plan/ModelCard";
14
11
  import { kLogViewInfoTabId } from "../../../constants";
15
12
  import { useTotalSampleCount } from "../../../state/hooks";
16
13
  import { PlanCard } from "../../plan/PlanCard";
14
+ import { TaskErrorCard } from "../error/TaskErrorPanel";
17
15
 
18
16
  // Individual hook for Info tab
19
17
  export const useInfoTabConfig = (
@@ -21,7 +19,6 @@ export const useInfoTabConfig = (
21
19
  evalPlan: EvalPlan | undefined,
22
20
  evalError: EvalError | undefined | null,
23
21
  evalResults: EvalResults | undefined | null,
24
- evalStats: EvalStats | undefined,
25
22
  ) => {
26
23
  const totalSampleCount = useTotalSampleCount();
27
24
  return useMemo(() => {
@@ -35,11 +32,10 @@ export const useInfoTabConfig = (
35
32
  evalPlan,
36
33
  evalError,
37
34
  evalResults,
38
- evalStats,
39
35
  sampleCount: totalSampleCount,
40
36
  },
41
37
  };
42
- }, [evalSpec, evalPlan, evalError, evalResults, evalStats, totalSampleCount]);
38
+ }, [evalSpec, evalPlan, evalError, evalResults, totalSampleCount]);
43
39
  };
44
40
 
45
41
  interface PlanTabProps {
@@ -57,7 +53,6 @@ export const InfoTab: FC<PlanTabProps> = ({
57
53
  evalSpec,
58
54
  evalPlan,
59
55
  evalResults,
60
- evalStats,
61
56
  evalStatus,
62
57
  evalError,
63
58
  sampleCount,
@@ -85,8 +80,6 @@ export const InfoTab: FC<PlanTabProps> = ({
85
80
  evalPlan={evalPlan}
86
81
  scores={evalResults?.scores}
87
82
  />
88
- {evalSpec ? <ModelCard evalSpec={evalSpec} /> : undefined}
89
- {evalStatus !== "started" ? <UsageCard stats={evalStats} /> : undefined}
90
83
  {evalStatus === "error" && evalError ? (
91
84
  <TaskErrorCard error={evalError} />
92
85
  ) : undefined}
@@ -0,0 +1,51 @@
1
+ import { FC, useMemo } from "react";
2
+ import { EvalSpec, EvalStats, Status } from "../../../@types/log";
3
+ import { kLogViewModelsTabId } from "../../../constants";
4
+ import { ModelCard } from "../../plan/ModelCard";
5
+ import { UsageCard } from "../../usage/UsageCard";
6
+
7
+ // Individual hook for Info tab
8
+ export const useModelsTab = (
9
+ evalSpec: EvalSpec | undefined,
10
+ evalStats: EvalStats | undefined,
11
+ evalStatus?: Status,
12
+ ) => {
13
+ return useMemo(() => {
14
+ return {
15
+ id: kLogViewModelsTabId,
16
+ label: "Models",
17
+ scrollable: true,
18
+ component: ModelTab,
19
+ componentProps: {
20
+ evalSpec,
21
+ evalStats,
22
+ evalStatus,
23
+ },
24
+ };
25
+ }, [evalSpec, evalStats]);
26
+ };
27
+
28
+ interface ModelTabProps {
29
+ evalSpec?: EvalSpec;
30
+ evalStats?: EvalStats;
31
+ evalStatus?: Status;
32
+ }
33
+
34
+ export const ModelTab: FC<ModelTabProps> = ({
35
+ evalSpec,
36
+ evalStats,
37
+ evalStatus,
38
+ }) => {
39
+ return (
40
+ <div style={{ width: "100%" }}>
41
+ <div style={{ padding: "0.5em 1em 0 1em", width: "100%" }}>
42
+ {evalSpec ? <ModelCard evalSpec={evalSpec} /> : undefined}
43
+ {evalStatus !== "started" &&
44
+ evalStats?.model_usage &&
45
+ Object.keys(evalStats.model_usage).length > 0 && (
46
+ <UsageCard stats={evalStats} />
47
+ )}
48
+ </div>
49
+ </div>
50
+ );
51
+ };
@@ -0,0 +1,6 @@
1
+ .grid {
2
+ display: grid;
3
+ grid-template-columns: fit-content(50%) fit-content(50%);
4
+ column-gap: 3em;
5
+ align-items: flex-start;
6
+ }
@@ -0,0 +1,143 @@
1
+ import clsx from "clsx";
2
+ import { FC, useMemo } from "react";
3
+ import { EvalSpec, EvalStats } from "../../../@types/log";
4
+ import { Card, CardBody, CardHeader } from "../../../components/Card";
5
+ import { kLogViewTaskTabId } from "../../../constants";
6
+ import { formatDuration, toTitleCase } from "../../../utils/format";
7
+ import { ghCommitUrl } from "../../../utils/git";
8
+ import { MetaDataView } from "../../content/MetaDataView";
9
+
10
+ import styles from "./TaskTab.module.css";
11
+
12
+ // Individual hook for Info tab
13
+ export const useTaskTabConfig = (
14
+ evalSpec: EvalSpec | undefined,
15
+ evalStats?: EvalStats,
16
+ ) => {
17
+ return useMemo(() => {
18
+ return {
19
+ id: kLogViewTaskTabId,
20
+ label: "Task",
21
+ scrollable: true,
22
+ component: TaskTab,
23
+ componentProps: {
24
+ evalSpec,
25
+ evalStats,
26
+ },
27
+ };
28
+ }, [evalSpec, evalStats]);
29
+ };
30
+
31
+ interface TaskTabProps {
32
+ evalSpec?: EvalSpec;
33
+ evalStats?: EvalStats;
34
+ }
35
+
36
+ export const TaskTab: FC<TaskTabProps> = ({ evalSpec, evalStats }) => {
37
+ const config: Record<string, unknown> = {};
38
+ Object.entries(evalSpec?.config || {}).forEach((entry) => {
39
+ const key = entry[0];
40
+ const value = entry[1];
41
+ config[key] = value;
42
+ });
43
+
44
+ const revision = evalSpec?.revision;
45
+ const packages = evalSpec?.packages;
46
+
47
+ const taskInformation: Record<string, unknown> = {
48
+ ["Task ID"]: evalSpec?.task_id,
49
+ ["Run ID"]: evalSpec?.run_id,
50
+ };
51
+
52
+ if (revision) {
53
+ taskInformation[
54
+ `${revision.type ? `${toTitleCase(revision.type)} ` : ""}Revision`
55
+ ] = {
56
+ _html: (
57
+ <a href={ghCommitUrl(revision.origin, revision.commit)}>
58
+ {revision.commit}
59
+ </a>
60
+ ),
61
+ };
62
+ }
63
+ if (packages) {
64
+ const names = Object.keys(packages).map((key) => {
65
+ return `${key} ${packages[key]}`;
66
+ });
67
+
68
+ if (names.length === 1) {
69
+ taskInformation["Inspect"] = names[0];
70
+ } else {
71
+ taskInformation["Inspect"] = names;
72
+ }
73
+ }
74
+ if (evalSpec?.tags) {
75
+ taskInformation["tags"] = evalSpec?.tags.join(", ");
76
+ }
77
+
78
+ if (evalSpec?.sandbox) {
79
+ if (Array.isArray(evalSpec?.sandbox)) {
80
+ taskInformation["sandbox"] = evalSpec.sandbox[0];
81
+ if (evalSpec.sandbox[1]) {
82
+ taskInformation["sandbox_config"] = evalSpec.sandbox[1];
83
+ }
84
+ } else {
85
+ taskInformation["sandbox"] = evalSpec?.sandbox.type;
86
+ taskInformation["sandbox_config"] = evalSpec?.sandbox.config;
87
+ }
88
+ }
89
+
90
+ const totalDuration = formatDuration(
91
+ new Date(evalStats?.started_at || 0),
92
+ new Date(evalStats?.completed_at || 0),
93
+ );
94
+
95
+ const task_args = evalSpec?.task_args || {};
96
+
97
+ return (
98
+ <div style={{ width: "100%" }}>
99
+ <div style={{ padding: "0.5em 1em 0 1em", width: "100%" }}>
100
+ <Card>
101
+ <CardHeader label="Task Info" />
102
+ <CardBody id={"task-card-config"}>
103
+ <div className={clsx(styles.grid)}>
104
+ <MetaDataView
105
+ key={`plan-md-task`}
106
+ className={"text-size-small"}
107
+ entries={taskInformation}
108
+ tableOptions="sm"
109
+ />
110
+
111
+ <MetaDataView
112
+ entries={{
113
+ ["Start"]: new Date(
114
+ evalStats?.started_at || 0,
115
+ ).toLocaleString(),
116
+ ["End"]: new Date(
117
+ evalStats?.completed_at || 0,
118
+ ).toLocaleString(),
119
+ ["Duration"]: totalDuration,
120
+ }}
121
+ tableOptions="sm"
122
+ />
123
+ </div>
124
+ </CardBody>
125
+ </Card>
126
+
127
+ {Object.keys(task_args).length > 0 && (
128
+ <Card>
129
+ <CardHeader label="Task Args" />
130
+ <CardBody id={"task-card-config"}>
131
+ <MetaDataView
132
+ key={`plan-md-task-args`}
133
+ className={"text-size-small"}
134
+ entries={task_args as Record<string, unknown>}
135
+ tableOptions="sm"
136
+ />
137
+ </CardBody>
138
+ </Card>
139
+ )}
140
+ </div>
141
+ </div>
142
+ );
143
+ };