inspect-ai 0.3.98__py3-none-any.whl → 0.3.100__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/__init__.py +2 -0
- inspect_ai/_cli/log.py +1 -1
- inspect_ai/_display/core/config.py +11 -5
- inspect_ai/_display/core/panel.py +66 -2
- inspect_ai/_display/core/textual.py +5 -2
- inspect_ai/_display/plain/display.py +1 -0
- inspect_ai/_display/rich/display.py +2 -2
- inspect_ai/_display/textual/widgets/transcript.py +41 -1
- inspect_ai/_eval/run.py +12 -4
- inspect_ai/_eval/score.py +2 -4
- inspect_ai/_eval/task/log.py +1 -1
- inspect_ai/_eval/task/run.py +59 -81
- inspect_ai/_eval/task/task.py +1 -1
- inspect_ai/_util/_async.py +1 -1
- inspect_ai/_util/content.py +11 -6
- inspect_ai/_util/interrupt.py +2 -2
- inspect_ai/_util/text.py +7 -0
- inspect_ai/_util/working.py +8 -37
- inspect_ai/_view/__init__.py +0 -0
- inspect_ai/_view/schema.py +3 -1
- inspect_ai/_view/view.py +14 -0
- inspect_ai/_view/www/CLAUDE.md +15 -0
- inspect_ai/_view/www/dist/assets/index.css +273 -169
- inspect_ai/_view/www/dist/assets/index.js +20079 -17019
- inspect_ai/_view/www/log-schema.json +122 -8
- inspect_ai/_view/www/package.json +5 -1
- inspect_ai/_view/www/src/@types/log.d.ts +20 -2
- inspect_ai/_view/www/src/app/App.tsx +1 -15
- inspect_ai/_view/www/src/app/appearance/icons.ts +4 -1
- inspect_ai/_view/www/src/app/content/MetaDataGrid.tsx +24 -6
- inspect_ai/_view/www/src/app/content/MetadataGrid.module.css +0 -5
- inspect_ai/_view/www/src/app/content/RenderedContent.tsx +221 -205
- inspect_ai/_view/www/src/app/log-view/LogViewContainer.tsx +2 -1
- inspect_ai/_view/www/src/app/log-view/tabs/SamplesTab.tsx +5 -0
- inspect_ai/_view/www/src/app/routing/url.ts +84 -4
- inspect_ai/_view/www/src/app/samples/InlineSampleDisplay.module.css +0 -5
- inspect_ai/_view/www/src/app/samples/SampleDialog.module.css +1 -1
- inspect_ai/_view/www/src/app/samples/SampleDisplay.module.css +7 -0
- inspect_ai/_view/www/src/app/samples/SampleDisplay.tsx +26 -19
- inspect_ai/_view/www/src/app/samples/SampleSummaryView.module.css +1 -2
- inspect_ai/_view/www/src/app/samples/chat/ChatMessage.tsx +8 -6
- inspect_ai/_view/www/src/app/samples/chat/ChatMessageRow.tsx +0 -4
- inspect_ai/_view/www/src/app/samples/chat/ChatViewVirtualList.tsx +3 -2
- inspect_ai/_view/www/src/app/samples/chat/MessageContent.tsx +2 -0
- inspect_ai/_view/www/src/app/samples/chat/MessageContents.tsx +2 -0
- inspect_ai/_view/www/src/app/samples/chat/messages.ts +1 -0
- inspect_ai/_view/www/src/app/samples/chat/tools/ToolCallView.tsx +1 -0
- inspect_ai/_view/www/src/app/samples/list/SampleRow.tsx +1 -1
- inspect_ai/_view/www/src/app/samples/scores/SampleScoresGrid.module.css +2 -2
- inspect_ai/_view/www/src/app/samples/transcript/ErrorEventView.tsx +2 -3
- inspect_ai/_view/www/src/app/samples/transcript/InfoEventView.tsx +1 -1
- inspect_ai/_view/www/src/app/samples/transcript/InputEventView.tsx +1 -2
- inspect_ai/_view/www/src/app/samples/transcript/ModelEventView.module.css +1 -1
- inspect_ai/_view/www/src/app/samples/transcript/ModelEventView.tsx +1 -1
- inspect_ai/_view/www/src/app/samples/transcript/SampleInitEventView.tsx +1 -1
- inspect_ai/_view/www/src/app/samples/transcript/SampleLimitEventView.tsx +3 -2
- inspect_ai/_view/www/src/app/samples/transcript/SandboxEventView.tsx +4 -5
- inspect_ai/_view/www/src/app/samples/transcript/ScoreEventView.tsx +1 -1
- inspect_ai/_view/www/src/app/samples/transcript/SpanEventView.tsx +1 -2
- inspect_ai/_view/www/src/app/samples/transcript/StepEventView.tsx +1 -3
- inspect_ai/_view/www/src/app/samples/transcript/SubtaskEventView.tsx +1 -2
- inspect_ai/_view/www/src/app/samples/transcript/ToolEventView.tsx +3 -4
- inspect_ai/_view/www/src/app/samples/transcript/TranscriptPanel.module.css +42 -0
- inspect_ai/_view/www/src/app/samples/transcript/TranscriptPanel.tsx +77 -0
- inspect_ai/_view/www/src/app/samples/transcript/TranscriptVirtualList.tsx +27 -71
- inspect_ai/_view/www/src/app/samples/transcript/TranscriptVirtualListComponent.module.css +13 -3
- inspect_ai/_view/www/src/app/samples/transcript/TranscriptVirtualListComponent.tsx +27 -2
- inspect_ai/_view/www/src/app/samples/transcript/event/EventPanel.module.css +1 -0
- inspect_ai/_view/www/src/app/samples/transcript/event/EventPanel.tsx +21 -22
- inspect_ai/_view/www/src/app/samples/transcript/outline/OutlineRow.module.css +45 -0
- inspect_ai/_view/www/src/app/samples/transcript/outline/OutlineRow.tsx +223 -0
- inspect_ai/_view/www/src/app/samples/transcript/outline/TranscriptOutline.module.css +10 -0
- inspect_ai/_view/www/src/app/samples/transcript/outline/TranscriptOutline.tsx +258 -0
- inspect_ai/_view/www/src/app/samples/transcript/outline/tree-visitors.ts +187 -0
- inspect_ai/_view/www/src/app/samples/transcript/state/StateEventRenderers.tsx +8 -1
- inspect_ai/_view/www/src/app/samples/transcript/state/StateEventView.tsx +3 -4
- inspect_ai/_view/www/src/app/samples/transcript/transform/hooks.ts +78 -0
- inspect_ai/_view/www/src/app/samples/transcript/transform/treeify.ts +340 -135
- inspect_ai/_view/www/src/app/samples/transcript/transform/utils.ts +3 -0
- inspect_ai/_view/www/src/app/samples/transcript/types.ts +2 -0
- inspect_ai/_view/www/src/app/types.ts +5 -1
- inspect_ai/_view/www/src/client/api/api-browser.ts +2 -2
- inspect_ai/_view/www/src/components/LiveVirtualList.tsx +6 -1
- inspect_ai/_view/www/src/components/MarkdownDiv.tsx +1 -1
- inspect_ai/_view/www/src/components/PopOver.tsx +422 -0
- inspect_ai/_view/www/src/components/PulsingDots.module.css +9 -9
- inspect_ai/_view/www/src/components/PulsingDots.tsx +4 -1
- inspect_ai/_view/www/src/components/StickyScroll.tsx +183 -0
- inspect_ai/_view/www/src/components/TabSet.tsx +4 -0
- inspect_ai/_view/www/src/state/hooks.ts +52 -2
- inspect_ai/_view/www/src/state/logSlice.ts +4 -3
- inspect_ai/_view/www/src/state/samplePolling.ts +8 -0
- inspect_ai/_view/www/src/state/sampleSlice.ts +53 -9
- inspect_ai/_view/www/src/state/scrolling.ts +152 -0
- inspect_ai/_view/www/src/utils/attachments.ts +7 -0
- inspect_ai/_view/www/src/utils/python.ts +18 -0
- inspect_ai/_view/www/yarn.lock +269 -6
- inspect_ai/agent/_react.py +12 -7
- inspect_ai/agent/_run.py +46 -11
- inspect_ai/analysis/beta/_dataframe/samples/table.py +19 -18
- inspect_ai/log/_bundle.py +5 -3
- inspect_ai/log/_log.py +3 -3
- inspect_ai/log/_recorders/file.py +2 -9
- inspect_ai/log/_transcript.py +1 -1
- inspect_ai/model/_call_tools.py +6 -2
- inspect_ai/model/_openai.py +1 -1
- inspect_ai/model/_openai_responses.py +78 -39
- inspect_ai/model/_openai_web_search.py +31 -0
- inspect_ai/model/_providers/anthropic.py +3 -6
- inspect_ai/model/_providers/azureai.py +72 -3
- inspect_ai/model/_providers/openai.py +2 -1
- inspect_ai/model/_providers/providers.py +1 -1
- inspect_ai/scorer/_metric.py +1 -2
- inspect_ai/solver/_task_state.py +2 -2
- inspect_ai/tool/_tool.py +6 -2
- inspect_ai/tool/_tool_def.py +27 -4
- inspect_ai/tool/_tool_info.py +2 -0
- inspect_ai/tool/_tools/_web_search/_google.py +15 -4
- inspect_ai/tool/_tools/_web_search/_tavily.py +35 -12
- inspect_ai/tool/_tools/_web_search/_web_search.py +214 -45
- inspect_ai/util/__init__.py +6 -0
- inspect_ai/util/_json.py +3 -0
- inspect_ai/util/_limit.py +374 -141
- inspect_ai/util/_sandbox/docker/compose.py +20 -11
- inspect_ai/util/_span.py +1 -1
- {inspect_ai-0.3.98.dist-info → inspect_ai-0.3.100.dist-info}/METADATA +3 -3
- {inspect_ai-0.3.98.dist-info → inspect_ai-0.3.100.dist-info}/RECORD +131 -117
- {inspect_ai-0.3.98.dist-info → inspect_ai-0.3.100.dist-info}/WHEEL +1 -1
- {inspect_ai-0.3.98.dist-info → inspect_ai-0.3.100.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.98.dist-info → inspect_ai-0.3.100.dist-info}/licenses/LICENSE +0 -0
- {inspect_ai-0.3.98.dist-info → inspect_ai-0.3.100.dist-info}/top_level.txt +0 -0
@@ -656,6 +656,17 @@
|
|
656
656
|
"ContentAudio": {
|
657
657
|
"description": "Audio content.",
|
658
658
|
"properties": {
|
659
|
+
"internal": {
|
660
|
+
"anyOf": [
|
661
|
+
{
|
662
|
+
"$ref": "#/$defs/JsonValue"
|
663
|
+
},
|
664
|
+
{
|
665
|
+
"type": "null"
|
666
|
+
}
|
667
|
+
],
|
668
|
+
"default": null
|
669
|
+
},
|
659
670
|
"type": {
|
660
671
|
"const": "audio",
|
661
672
|
"default": "audio",
|
@@ -676,6 +687,7 @@
|
|
676
687
|
}
|
677
688
|
},
|
678
689
|
"required": [
|
690
|
+
"internal",
|
679
691
|
"type",
|
680
692
|
"audio",
|
681
693
|
"format"
|
@@ -687,6 +699,17 @@
|
|
687
699
|
"ContentImage": {
|
688
700
|
"description": "Image content.",
|
689
701
|
"properties": {
|
702
|
+
"internal": {
|
703
|
+
"anyOf": [
|
704
|
+
{
|
705
|
+
"$ref": "#/$defs/JsonValue"
|
706
|
+
},
|
707
|
+
{
|
708
|
+
"type": "null"
|
709
|
+
}
|
710
|
+
],
|
711
|
+
"default": null
|
712
|
+
},
|
690
713
|
"type": {
|
691
714
|
"const": "image",
|
692
715
|
"default": "image",
|
@@ -709,6 +732,7 @@
|
|
709
732
|
}
|
710
733
|
},
|
711
734
|
"required": [
|
735
|
+
"internal",
|
712
736
|
"type",
|
713
737
|
"image",
|
714
738
|
"detail"
|
@@ -720,6 +744,17 @@
|
|
720
744
|
"ContentReasoning": {
|
721
745
|
"description": "Reasoning content.\n\nSee the specification for [thinking blocks](https://docs.anthropic.com/en/docs/build-with-claude/extended-thinking#understanding-thinking-blocks) for Claude models.",
|
722
746
|
"properties": {
|
747
|
+
"internal": {
|
748
|
+
"anyOf": [
|
749
|
+
{
|
750
|
+
"$ref": "#/$defs/JsonValue"
|
751
|
+
},
|
752
|
+
{
|
753
|
+
"type": "null"
|
754
|
+
}
|
755
|
+
],
|
756
|
+
"default": null
|
757
|
+
},
|
723
758
|
"type": {
|
724
759
|
"const": "reasoning",
|
725
760
|
"default": "reasoning",
|
@@ -749,6 +784,7 @@
|
|
749
784
|
}
|
750
785
|
},
|
751
786
|
"required": [
|
787
|
+
"internal",
|
752
788
|
"type",
|
753
789
|
"reasoning",
|
754
790
|
"signature",
|
@@ -761,6 +797,17 @@
|
|
761
797
|
"ContentText": {
|
762
798
|
"description": "Text content.",
|
763
799
|
"properties": {
|
800
|
+
"internal": {
|
801
|
+
"anyOf": [
|
802
|
+
{
|
803
|
+
"$ref": "#/$defs/JsonValue"
|
804
|
+
},
|
805
|
+
{
|
806
|
+
"type": "null"
|
807
|
+
}
|
808
|
+
],
|
809
|
+
"default": null
|
810
|
+
},
|
764
811
|
"type": {
|
765
812
|
"const": "text",
|
766
813
|
"default": "text",
|
@@ -785,6 +832,7 @@
|
|
785
832
|
}
|
786
833
|
},
|
787
834
|
"required": [
|
835
|
+
"internal",
|
788
836
|
"type",
|
789
837
|
"text",
|
790
838
|
"refusal"
|
@@ -796,6 +844,17 @@
|
|
796
844
|
"ContentVideo": {
|
797
845
|
"description": "Video content.",
|
798
846
|
"properties": {
|
847
|
+
"internal": {
|
848
|
+
"anyOf": [
|
849
|
+
{
|
850
|
+
"$ref": "#/$defs/JsonValue"
|
851
|
+
},
|
852
|
+
{
|
853
|
+
"type": "null"
|
854
|
+
}
|
855
|
+
],
|
856
|
+
"default": null
|
857
|
+
},
|
799
858
|
"type": {
|
800
859
|
"const": "video",
|
801
860
|
"default": "video",
|
@@ -817,6 +876,7 @@
|
|
817
876
|
}
|
818
877
|
},
|
819
878
|
"required": [
|
879
|
+
"internal",
|
820
880
|
"type",
|
821
881
|
"video",
|
822
882
|
"format"
|
@@ -1136,6 +1196,18 @@
|
|
1136
1196
|
"default": null,
|
1137
1197
|
"title": "Log Samples"
|
1138
1198
|
},
|
1199
|
+
"log_realtime": {
|
1200
|
+
"anyOf": [
|
1201
|
+
{
|
1202
|
+
"type": "boolean"
|
1203
|
+
},
|
1204
|
+
{
|
1205
|
+
"type": "null"
|
1206
|
+
}
|
1207
|
+
],
|
1208
|
+
"default": null,
|
1209
|
+
"title": "Log Realtime"
|
1210
|
+
},
|
1139
1211
|
"log_images": {
|
1140
1212
|
"anyOf": [
|
1141
1213
|
{
|
@@ -1205,6 +1277,7 @@
|
|
1205
1277
|
"max_sandboxes",
|
1206
1278
|
"sandbox_cleanup",
|
1207
1279
|
"log_samples",
|
1280
|
+
"log_realtime",
|
1208
1281
|
"log_images",
|
1209
1282
|
"log_buffer",
|
1210
1283
|
"log_shared",
|
@@ -1502,7 +1575,8 @@
|
|
1502
1575
|
"reasoning_tokens": null,
|
1503
1576
|
"reasoning_summary": null,
|
1504
1577
|
"reasoning_history": null,
|
1505
|
-
"response_schema": null
|
1578
|
+
"response_schema": null,
|
1579
|
+
"extra_body": null
|
1506
1580
|
}
|
1507
1581
|
}
|
1508
1582
|
},
|
@@ -1944,7 +2018,7 @@
|
|
1944
2018
|
"additionalProperties": false
|
1945
2019
|
},
|
1946
2020
|
"EvalSampleLimit": {
|
1947
|
-
"description": "Limit
|
2021
|
+
"description": "Limit encountered by sample.",
|
1948
2022
|
"properties": {
|
1949
2023
|
"type": {
|
1950
2024
|
"enum": [
|
@@ -1961,7 +2035,7 @@
|
|
1961
2035
|
},
|
1962
2036
|
"limit": {
|
1963
2037
|
"title": "Limit",
|
1964
|
-
"type": "integer"
|
2038
|
+
"type": "number"
|
1965
2039
|
}
|
1966
2040
|
},
|
1967
2041
|
"required": [
|
@@ -2277,6 +2351,10 @@
|
|
2277
2351
|
"EvalSpec": {
|
2278
2352
|
"description": "Eval target and configuration.",
|
2279
2353
|
"properties": {
|
2354
|
+
"eval_id": {
|
2355
|
+
"title": "Eval Id",
|
2356
|
+
"type": "string"
|
2357
|
+
},
|
2280
2358
|
"run_id": {
|
2281
2359
|
"title": "Run Id",
|
2282
2360
|
"type": "string"
|
@@ -2294,9 +2372,16 @@
|
|
2294
2372
|
"type": "string"
|
2295
2373
|
},
|
2296
2374
|
"task_version": {
|
2375
|
+
"anyOf": [
|
2376
|
+
{
|
2377
|
+
"type": "integer"
|
2378
|
+
},
|
2379
|
+
{
|
2380
|
+
"type": "string"
|
2381
|
+
}
|
2382
|
+
],
|
2297
2383
|
"default": 0,
|
2298
|
-
"title": "Task Version"
|
2299
|
-
"type": "integer"
|
2384
|
+
"title": "Task Version"
|
2300
2385
|
},
|
2301
2386
|
"task_file": {
|
2302
2387
|
"anyOf": [
|
@@ -2500,6 +2585,7 @@
|
|
2500
2585
|
}
|
2501
2586
|
},
|
2502
2587
|
"required": [
|
2588
|
+
"eval_id",
|
2503
2589
|
"run_id",
|
2504
2590
|
"created",
|
2505
2591
|
"task",
|
@@ -2897,6 +2983,19 @@
|
|
2897
2983
|
}
|
2898
2984
|
],
|
2899
2985
|
"default": null
|
2986
|
+
},
|
2987
|
+
"extra_body": {
|
2988
|
+
"anyOf": [
|
2989
|
+
{
|
2990
|
+
"additionalProperties": true,
|
2991
|
+
"type": "object"
|
2992
|
+
},
|
2993
|
+
{
|
2994
|
+
"type": "null"
|
2995
|
+
}
|
2996
|
+
],
|
2997
|
+
"default": null,
|
2998
|
+
"title": "Extra Body"
|
2900
2999
|
}
|
2901
3000
|
},
|
2902
3001
|
"title": "GenerateConfig",
|
@@ -2927,7 +3026,8 @@
|
|
2927
3026
|
"reasoning_tokens",
|
2928
3027
|
"reasoning_summary",
|
2929
3028
|
"reasoning_history",
|
2930
|
-
"response_schema"
|
3029
|
+
"response_schema",
|
3030
|
+
"extra_body"
|
2931
3031
|
],
|
2932
3032
|
"additionalProperties": false
|
2933
3033
|
},
|
@@ -4163,7 +4263,7 @@
|
|
4163
4263
|
"limit": {
|
4164
4264
|
"anyOf": [
|
4165
4265
|
{
|
4166
|
-
"type": "integer"
|
4266
|
+
"type": "number"
|
4167
4267
|
},
|
4168
4268
|
{
|
4169
4269
|
"type": "null"
|
@@ -5592,12 +5692,26 @@
|
|
5592
5692
|
},
|
5593
5693
|
"parameters": {
|
5594
5694
|
"$ref": "#/$defs/ToolParams"
|
5695
|
+
},
|
5696
|
+
"options": {
|
5697
|
+
"anyOf": [
|
5698
|
+
{
|
5699
|
+
"additionalProperties": true,
|
5700
|
+
"type": "object"
|
5701
|
+
},
|
5702
|
+
{
|
5703
|
+
"type": "null"
|
5704
|
+
}
|
5705
|
+
],
|
5706
|
+
"default": null,
|
5707
|
+
"title": "Options"
|
5595
5708
|
}
|
5596
5709
|
},
|
5597
5710
|
"required": [
|
5598
5711
|
"name",
|
5599
5712
|
"description",
|
5600
|
-
"parameters"
|
5713
|
+
"parameters",
|
5714
|
+
"options"
|
5601
5715
|
],
|
5602
5716
|
"title": "ToolInfo",
|
5603
5717
|
"type": "object",
|
@@ -62,6 +62,8 @@
|
|
62
62
|
"@codemirror/lint": "^6.8.5",
|
63
63
|
"@codemirror/state": "^6.5.2",
|
64
64
|
"@lezer/highlight": "^1.2.1",
|
65
|
+
"@mui/material": "^7.1.0",
|
66
|
+
"@mui/x-tree-view": "^8.3.0",
|
65
67
|
"@popperjs/core": "^2.11.8",
|
66
68
|
"ansi-output": "^0.0.9",
|
67
69
|
"asciinema-player": "^3.9.0",
|
@@ -86,8 +88,10 @@
|
|
86
88
|
"prismjs": "^1.30.0",
|
87
89
|
"react": "^19.0.0",
|
88
90
|
"react-dom": "^19.0.0",
|
91
|
+
"react-popper": "^2.3.0",
|
89
92
|
"react-router-dom": "^7.5.3",
|
90
93
|
"react-virtuoso": "^4.12.6",
|
91
|
-
"zustand": "^5.0.
|
94
|
+
"zustand": "^5.0.5",
|
95
|
+
"use-resize-observer": "^9.1.0"
|
92
96
|
}
|
93
97
|
}
|
@@ -7,11 +7,12 @@
|
|
7
7
|
|
8
8
|
export type Version = number;
|
9
9
|
export type Status = "started" | "success" | "cancelled" | "error";
|
10
|
+
export type EvalId = string;
|
10
11
|
export type RunId = string;
|
11
12
|
export type Created = string;
|
12
13
|
export type Task = string;
|
13
14
|
export type TaskId = string;
|
14
|
-
export type TaskVersion = number;
|
15
|
+
export type TaskVersion = number | string;
|
15
16
|
export type TaskFile = string | null;
|
16
17
|
export type TaskRegistryName = string | null;
|
17
18
|
export type Solver = string | null;
|
@@ -68,6 +69,9 @@ export type Anyof = JSONSchema[] | null;
|
|
68
69
|
export type Required = string[] | null;
|
69
70
|
export type Description1 = string | null;
|
70
71
|
export type Strict = boolean | null;
|
72
|
+
export type ExtraBody = {
|
73
|
+
[k: string]: unknown;
|
74
|
+
} | null;
|
71
75
|
export type ModelBaseUrl = string | null;
|
72
76
|
export type ModelRoles = {
|
73
77
|
[k: string]: EvalModelConfig;
|
@@ -99,6 +103,7 @@ export type MaxSubprocesses = number | null;
|
|
99
103
|
export type MaxSandboxes = number | null;
|
100
104
|
export type SandboxCleanup = boolean | null;
|
101
105
|
export type LogSamples = boolean | null;
|
106
|
+
export type LogRealtime = boolean | null;
|
102
107
|
export type LogImages = boolean | null;
|
103
108
|
export type LogBuffer = number | null;
|
104
109
|
export type LogShared = number | null;
|
@@ -402,6 +407,9 @@ export type Description2 = string;
|
|
402
407
|
export type Type11 = "object";
|
403
408
|
export type Required1 = string[];
|
404
409
|
export type Additionalproperties1 = boolean;
|
410
|
+
export type Options3 = {
|
411
|
+
[k: string]: unknown;
|
412
|
+
} | null;
|
405
413
|
export type Tools1 = ToolInfo[];
|
406
414
|
export type ToolChoice = ("auto" | "any" | "none") | ToolFunction;
|
407
415
|
export type Name9 = string;
|
@@ -640,6 +648,7 @@ export interface EvalLog {
|
|
640
648
|
* Eval target and configuration.
|
641
649
|
*/
|
642
650
|
export interface EvalSpec {
|
651
|
+
eval_id: EvalId;
|
643
652
|
run_id: RunId;
|
644
653
|
created: Created;
|
645
654
|
task: Task;
|
@@ -722,6 +731,7 @@ export interface GenerateConfig {
|
|
722
731
|
reasoning_summary: ReasoningSummary;
|
723
732
|
reasoning_history: ReasoningHistory;
|
724
733
|
response_schema: ResponseSchema | null;
|
734
|
+
extra_body: ExtraBody;
|
725
735
|
}
|
726
736
|
/**
|
727
737
|
* Schema for model response when using Structured Output.
|
@@ -786,6 +796,7 @@ export interface EvalConfig {
|
|
786
796
|
max_sandboxes: MaxSandboxes;
|
787
797
|
sandbox_cleanup: SandboxCleanup;
|
788
798
|
log_samples: LogSamples;
|
799
|
+
log_realtime: LogRealtime;
|
789
800
|
log_images: LogImages;
|
790
801
|
log_buffer: LogBuffer;
|
791
802
|
log_shared: LogShared;
|
@@ -888,6 +899,7 @@ export interface GenerateConfig1 {
|
|
888
899
|
reasoning_summary: ReasoningSummary;
|
889
900
|
reasoning_history: ReasoningHistory;
|
890
901
|
response_schema: ResponseSchema | null;
|
902
|
+
extra_body: ExtraBody;
|
891
903
|
}
|
892
904
|
/**
|
893
905
|
* Scoring results from evaluation.
|
@@ -998,6 +1010,7 @@ export interface ChatMessageSystem {
|
|
998
1010
|
* Text content.
|
999
1011
|
*/
|
1000
1012
|
export interface ContentText {
|
1013
|
+
internal: unknown;
|
1001
1014
|
type: Type3;
|
1002
1015
|
text: Text;
|
1003
1016
|
refusal: Refusal;
|
@@ -1008,6 +1021,7 @@ export interface ContentText {
|
|
1008
1021
|
* See the specification for [thinking blocks](https://docs.anthropic.com/en/docs/build-with-claude/extended-thinking#understanding-thinking-blocks) for Claude models.
|
1009
1022
|
*/
|
1010
1023
|
export interface ContentReasoning {
|
1024
|
+
internal: unknown;
|
1011
1025
|
type: Type4;
|
1012
1026
|
reasoning: Reasoning;
|
1013
1027
|
signature: Signature;
|
@@ -1017,6 +1031,7 @@ export interface ContentReasoning {
|
|
1017
1031
|
* Image content.
|
1018
1032
|
*/
|
1019
1033
|
export interface ContentImage {
|
1034
|
+
internal: unknown;
|
1020
1035
|
type: Type5;
|
1021
1036
|
image: Image;
|
1022
1037
|
detail: Detail;
|
@@ -1025,6 +1040,7 @@ export interface ContentImage {
|
|
1025
1040
|
* Audio content.
|
1026
1041
|
*/
|
1027
1042
|
export interface ContentAudio {
|
1043
|
+
internal: unknown;
|
1028
1044
|
type: Type6;
|
1029
1045
|
audio: Audio;
|
1030
1046
|
format: Format1;
|
@@ -1033,6 +1049,7 @@ export interface ContentAudio {
|
|
1033
1049
|
* Video content.
|
1034
1050
|
*/
|
1035
1051
|
export interface ContentVideo {
|
1052
|
+
internal: unknown;
|
1036
1053
|
type: Type7;
|
1037
1054
|
video: Video;
|
1038
1055
|
format: Format2;
|
@@ -1299,6 +1316,7 @@ export interface ToolInfo {
|
|
1299
1316
|
name: Name8;
|
1300
1317
|
description: Description2;
|
1301
1318
|
parameters: ToolParams;
|
1319
|
+
options: Options3;
|
1302
1320
|
}
|
1303
1321
|
/**
|
1304
1322
|
* Description of tool parameters object in JSON Schema format.
|
@@ -1525,7 +1543,7 @@ export interface Attachments {
|
|
1525
1543
|
[k: string]: string;
|
1526
1544
|
}
|
1527
1545
|
/**
|
1528
|
-
* Limit
|
1546
|
+
* Limit encountered by sample.
|
1529
1547
|
*/
|
1530
1548
|
export interface EvalSampleLimit {
|
1531
1549
|
type: Type16;
|
@@ -14,7 +14,7 @@ import "./App.css";
|
|
14
14
|
|
15
15
|
import ClipboardJS from "clipboard";
|
16
16
|
import { FC, useCallback, useEffect } from "react";
|
17
|
-
import { RouterProvider, useParams } from "react-router-dom";
|
17
|
+
import { RouterProvider } from "react-router-dom";
|
18
18
|
import { ClientAPI, HostMessage } from "../client/api/types.ts";
|
19
19
|
import { useStore } from "../state/store.ts";
|
20
20
|
import { AppRouter } from "./routing/AppRouter.tsx";
|
@@ -32,7 +32,6 @@ export const App: FC<AppProps> = ({ api }) => {
|
|
32
32
|
|
33
33
|
const logs = useStore((state) => state.logs.logs);
|
34
34
|
const selectedLogFile = useStore((state) => state.logs.selectedLogFile);
|
35
|
-
|
36
35
|
const loadedLogFile = useStore((state) => state.log.loadedLog);
|
37
36
|
const selectedLogSummary = useStore((state) => state.log.selectedLogSummary);
|
38
37
|
|
@@ -46,15 +45,6 @@ export const App: FC<AppProps> = ({ api }) => {
|
|
46
45
|
const loadLog = useStore((state) => state.logActions.loadLog);
|
47
46
|
const pollLog = useStore((state) => state.logActions.pollLog);
|
48
47
|
|
49
|
-
const { sampleId } = useParams<{
|
50
|
-
logPath?: string;
|
51
|
-
tabId?: string;
|
52
|
-
sampleId?: string;
|
53
|
-
epoch?: string;
|
54
|
-
sampleTabId?: string;
|
55
|
-
}>();
|
56
|
-
const selectSample = useStore((state) => state.logActions.selectSample);
|
57
|
-
|
58
48
|
// Load a specific log
|
59
49
|
useEffect(() => {
|
60
50
|
const loadSpecificLog = async () => {
|
@@ -66,10 +56,6 @@ export const App: FC<AppProps> = ({ api }) => {
|
|
66
56
|
// Then load the log
|
67
57
|
await loadLog(selectedLogFile);
|
68
58
|
|
69
|
-
if (!sampleId) {
|
70
|
-
selectSample(0);
|
71
|
-
}
|
72
|
-
|
73
59
|
// Finally set loading to false
|
74
60
|
setAppStatus({ loading: false, error: undefined });
|
75
61
|
} catch (e) {
|
@@ -9,6 +9,7 @@ const loggingIcons: Record<string, string> = {
|
|
9
9
|
};
|
10
10
|
|
11
11
|
export const ApplicationIcons = {
|
12
|
+
agent: "bi bi-grid", // bi bi-x-diamond
|
12
13
|
approve: "bi bi-shield",
|
13
14
|
approvals: {
|
14
15
|
approve: "bi bi-shield-check",
|
@@ -56,7 +57,7 @@ export const ApplicationIcons = {
|
|
56
57
|
json: "bi bi-filetype-json",
|
57
58
|
limits: {
|
58
59
|
messages: "bi bi-chat-right-text",
|
59
|
-
custom: "bi bi-
|
60
|
+
custom: "bi bi-exclamation-triangle",
|
60
61
|
operator: "bi bi-person-workspace",
|
61
62
|
tokens: "bi bi-list",
|
62
63
|
time: "bi bi-clock",
|
@@ -91,6 +92,7 @@ export const ApplicationIcons = {
|
|
91
92
|
sandbox: "bi bi-box-seam",
|
92
93
|
scorer: "bi bi-calculator",
|
93
94
|
search: "bi bi-search",
|
95
|
+
sidebar: "bi bi-list",
|
94
96
|
solvers: {
|
95
97
|
default: "bi bi-arrow-return-right",
|
96
98
|
generate: "bi bi-share",
|
@@ -106,5 +108,6 @@ export const ApplicationIcons = {
|
|
106
108
|
open: "bi bi-caret-down-fill",
|
107
109
|
closed: "bi bi-caret-right-fill",
|
108
110
|
},
|
111
|
+
turns: "bi bi-chat-left-text", // bi bi-repeat
|
109
112
|
usage: "bi bi-stopwatch",
|
110
113
|
};
|
@@ -7,6 +7,7 @@ interface MetadataGridProps {
|
|
7
7
|
id?: string;
|
8
8
|
className?: string | string[];
|
9
9
|
style?: CSSProperties;
|
10
|
+
size?: "mini" | "small";
|
10
11
|
entries: Record<string, unknown>;
|
11
12
|
plain?: boolean;
|
12
13
|
}
|
@@ -18,10 +19,13 @@ export const MetaDataGrid: FC<MetadataGridProps> = ({
|
|
18
19
|
id,
|
19
20
|
entries,
|
20
21
|
className,
|
22
|
+
size,
|
21
23
|
style,
|
22
24
|
plain,
|
23
25
|
}) => {
|
24
26
|
const baseId = "metadata-grid";
|
27
|
+
const fontStyle =
|
28
|
+
size === "mini" ? "text-size-smallest" : "text-size-smaller";
|
25
29
|
|
26
30
|
const entryEls = entryRecords(entries).map((entry, index) => {
|
27
31
|
const id = `${baseId}-value-${index}`;
|
@@ -41,15 +45,29 @@ export const MetaDataGrid: FC<MetadataGridProps> = ({
|
|
41
45
|
styles.cell,
|
42
46
|
"text-style-label",
|
43
47
|
"text-style-secondary",
|
44
|
-
|
48
|
+
fontStyle,
|
45
49
|
)}
|
46
50
|
>
|
47
|
-
{entry.name}
|
51
|
+
{entry?.name}
|
48
52
|
</div>
|
49
|
-
<div
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
+
<div className={clsx(styles.value, `${baseId}-value`, fontStyle)}>
|
54
|
+
{entry && (
|
55
|
+
<RenderedContent
|
56
|
+
id={id}
|
57
|
+
entry={entry}
|
58
|
+
renderObject={(obj: any) => {
|
59
|
+
return (
|
60
|
+
<MetaDataGrid
|
61
|
+
id={id}
|
62
|
+
className={clsx(styles.nested)}
|
63
|
+
entries={obj}
|
64
|
+
size={size}
|
65
|
+
plain={plain}
|
66
|
+
/>
|
67
|
+
);
|
68
|
+
}}
|
69
|
+
/>
|
70
|
+
)}
|
53
71
|
</div>
|
54
72
|
</Fragment>
|
55
73
|
);
|