inspect-ai 0.3.82__py3-none-any.whl → 0.3.83__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/__init__.py +2 -1
- inspect_ai/_display/textual/app.py +14 -3
- inspect_ai/_display/textual/display.py +4 -0
- inspect_ai/_display/textual/widgets/samples.py +9 -3
- inspect_ai/_display/textual/widgets/task_detail.py +3 -4
- inspect_ai/_display/textual/widgets/tasks.py +17 -1
- inspect_ai/_display/textual/widgets/vscode.py +44 -0
- inspect_ai/_eval/eval.py +36 -24
- inspect_ai/_eval/evalset.py +17 -18
- inspect_ai/_eval/loader.py +34 -11
- inspect_ai/_eval/run.py +8 -13
- inspect_ai/_eval/score.py +13 -3
- inspect_ai/_eval/task/generate.py +8 -9
- inspect_ai/_eval/task/log.py +2 -0
- inspect_ai/_eval/task/task.py +23 -9
- inspect_ai/_util/file.py +13 -0
- inspect_ai/_util/json.py +2 -1
- inspect_ai/_util/registry.py +1 -0
- inspect_ai/_util/vscode.py +37 -0
- inspect_ai/_view/www/App.css +6 -0
- inspect_ai/_view/www/dist/assets/index.css +304 -128
- inspect_ai/_view/www/dist/assets/index.js +47495 -27519
- inspect_ai/_view/www/log-schema.json +124 -31
- inspect_ai/_view/www/package.json +3 -0
- inspect_ai/_view/www/src/App.tsx +12 -0
- inspect_ai/_view/www/src/appearance/icons.ts +1 -0
- inspect_ai/_view/www/src/components/Card.tsx +6 -4
- inspect_ai/_view/www/src/components/LinkButton.module.css +16 -0
- inspect_ai/_view/www/src/components/LinkButton.tsx +33 -0
- inspect_ai/_view/www/src/components/LiveVirtualList.tsx +1 -1
- inspect_ai/_view/www/src/components/MarkdownDiv.tsx +113 -23
- inspect_ai/_view/www/src/components/Modal.module.css +38 -0
- inspect_ai/_view/www/src/components/Modal.tsx +77 -0
- inspect_ai/_view/www/src/plan/DetailStep.module.css +4 -0
- inspect_ai/_view/www/src/plan/DetailStep.tsx +6 -3
- inspect_ai/_view/www/src/plan/SolverDetailView.module.css +2 -1
- inspect_ai/_view/www/src/samples/InlineSampleDisplay.tsx +7 -0
- inspect_ai/_view/www/src/samples/SampleDialog.tsx +7 -0
- inspect_ai/_view/www/src/samples/SampleDisplay.tsx +11 -34
- inspect_ai/_view/www/src/samples/SampleSummaryView.module.css +6 -0
- inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +2 -2
- inspect_ai/_view/www/src/samples/SamplesTools.tsx +12 -0
- inspect_ai/_view/www/src/samples/chat/MessageContent.tsx +2 -0
- inspect_ai/_view/www/src/samples/chat/MessageContents.tsx +2 -0
- inspect_ai/_view/www/src/samples/chat/messages.ts +3 -1
- inspect_ai/_view/www/src/samples/chat/tools/ToolCallView.tsx +1 -0
- inspect_ai/_view/www/src/samples/descriptor/samplesDescriptor.tsx +9 -3
- inspect_ai/_view/www/src/samples/descriptor/score/BooleanScoreDescriptor.module.css +3 -3
- inspect_ai/_view/www/src/samples/descriptor/score/BooleanScoreDescriptor.tsx +1 -1
- inspect_ai/_view/www/src/samples/descriptor/score/ObjectScoreDescriptor.module.css +4 -4
- inspect_ai/_view/www/src/samples/descriptor/score/ObjectScoreDescriptor.tsx +10 -11
- inspect_ai/_view/www/src/samples/list/SampleFooter.module.css +2 -1
- inspect_ai/_view/www/src/samples/list/SampleFooter.tsx +7 -1
- inspect_ai/_view/www/src/samples/list/SampleList.tsx +25 -8
- inspect_ai/_view/www/src/samples/list/SampleRow.tsx +1 -1
- inspect_ai/_view/www/src/samples/scores/SampleScores.tsx +11 -22
- inspect_ai/_view/www/src/samples/scores/SampleScoresGrid.module.css +38 -0
- inspect_ai/_view/www/src/samples/scores/SampleScoresGrid.tsx +118 -0
- inspect_ai/_view/www/src/samples/scores/{SampleScoreView.module.css → SampleScoresView.module.css} +10 -1
- inspect_ai/_view/www/src/samples/scores/SampleScoresView.tsx +78 -0
- inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.tsx +3 -3
- inspect_ai/_view/www/src/samples/transcript/ToolEventView.tsx +25 -4
- inspect_ai/_view/www/src/samples/transcript/event/EventPanel.tsx +29 -2
- inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.tsx +0 -1
- inspect_ai/_view/www/src/state/hooks.ts +5 -3
- inspect_ai/_view/www/src/state/logPolling.ts +5 -1
- inspect_ai/_view/www/src/state/logSlice.ts +10 -0
- inspect_ai/_view/www/src/state/samplePolling.ts +4 -1
- inspect_ai/_view/www/src/state/sampleSlice.ts +13 -0
- inspect_ai/_view/www/src/types/log.d.ts +34 -26
- inspect_ai/_view/www/src/types/markdown-it-katex.d.ts +21 -0
- inspect_ai/_view/www/src/utils/json-worker.ts +79 -12
- inspect_ai/_view/www/src/workspace/WorkSpace.tsx +18 -16
- inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.module.css +16 -0
- inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.tsx +68 -71
- inspect_ai/_view/www/src/workspace/navbar/ScoreGrid.module.css +35 -0
- inspect_ai/_view/www/src/workspace/navbar/ScoreGrid.tsx +117 -0
- inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.tsx +1 -1
- inspect_ai/_view/www/src/workspace/sidebar/Sidebar.module.css +3 -2
- inspect_ai/_view/www/src/workspace/tabs/SamplesTab.tsx +18 -0
- inspect_ai/_view/www/yarn.lock +94 -1
- inspect_ai/agent/__init__.py +36 -0
- inspect_ai/agent/_agent.py +268 -0
- inspect_ai/agent/_as_solver.py +72 -0
- inspect_ai/agent/_as_tool.py +122 -0
- inspect_ai/{solver → agent}/_bridge/bridge.py +23 -37
- inspect_ai/{solver → agent}/_bridge/patch.py +9 -8
- inspect_ai/agent/_filter.py +46 -0
- inspect_ai/agent/_handoff.py +93 -0
- inspect_ai/{solver/_human_agent → agent/_human}/agent.py +11 -12
- inspect_ai/{solver/_human_agent → agent/_human}/commands/__init__.py +2 -3
- inspect_ai/{solver/_human_agent → agent/_human}/commands/clock.py +3 -1
- inspect_ai/{solver/_human_agent → agent/_human}/commands/score.py +5 -5
- inspect_ai/{solver/_human_agent → agent/_human}/install.py +6 -3
- inspect_ai/{solver/_human_agent → agent/_human}/service.py +7 -3
- inspect_ai/{solver/_human_agent → agent/_human}/state.py +5 -5
- inspect_ai/agent/_react.py +241 -0
- inspect_ai/agent/_run.py +36 -0
- inspect_ai/agent/_types.py +81 -0
- inspect_ai/log/_log.py +11 -2
- inspect_ai/log/_transcript.py +13 -9
- inspect_ai/model/__init__.py +7 -1
- inspect_ai/model/_call_tools.py +256 -52
- inspect_ai/model/_chat_message.py +7 -4
- inspect_ai/model/_conversation.py +13 -62
- inspect_ai/model/_display.py +85 -0
- inspect_ai/model/_model.py +113 -14
- inspect_ai/model/_model_output.py +14 -9
- inspect_ai/model/_openai.py +16 -4
- inspect_ai/model/_openai_computer_use.py +162 -0
- inspect_ai/model/_openai_responses.py +319 -165
- inspect_ai/model/_providers/anthropic.py +20 -21
- inspect_ai/model/_providers/azureai.py +24 -13
- inspect_ai/model/_providers/bedrock.py +1 -7
- inspect_ai/model/_providers/cloudflare.py +3 -3
- inspect_ai/model/_providers/goodfire.py +2 -6
- inspect_ai/model/_providers/google.py +11 -10
- inspect_ai/model/_providers/groq.py +6 -3
- inspect_ai/model/_providers/hf.py +7 -3
- inspect_ai/model/_providers/mistral.py +7 -10
- inspect_ai/model/_providers/openai.py +47 -17
- inspect_ai/model/_providers/openai_o1.py +11 -4
- inspect_ai/model/_providers/openai_responses.py +12 -14
- inspect_ai/model/_providers/providers.py +2 -2
- inspect_ai/model/_providers/together.py +12 -2
- inspect_ai/model/_providers/util/chatapi.py +7 -2
- inspect_ai/model/_providers/util/hf_handler.py +4 -2
- inspect_ai/model/_providers/util/llama31.py +4 -2
- inspect_ai/model/_providers/vertex.py +11 -9
- inspect_ai/model/_providers/vllm.py +4 -4
- inspect_ai/scorer/__init__.py +2 -0
- inspect_ai/scorer/_metrics/__init__.py +2 -0
- inspect_ai/scorer/_metrics/grouped.py +84 -0
- inspect_ai/scorer/_score.py +26 -6
- inspect_ai/solver/__init__.py +2 -2
- inspect_ai/solver/_basic_agent.py +22 -9
- inspect_ai/solver/_bridge.py +31 -0
- inspect_ai/solver/_chain.py +20 -12
- inspect_ai/solver/_fork.py +5 -1
- inspect_ai/solver/_human_agent.py +52 -0
- inspect_ai/solver/_prompt.py +3 -1
- inspect_ai/solver/_run.py +59 -0
- inspect_ai/solver/_solver.py +14 -4
- inspect_ai/solver/_task_state.py +5 -3
- inspect_ai/tool/_tool_call.py +15 -8
- inspect_ai/tool/_tool_def.py +17 -12
- inspect_ai/tool/_tool_support_helpers.py +2 -2
- inspect_ai/tool/_tool_with.py +14 -11
- inspect_ai/tool/_tools/_bash_session.py +11 -2
- inspect_ai/tool/_tools/_computer/_common.py +18 -2
- inspect_ai/tool/_tools/_computer/_computer.py +18 -2
- inspect_ai/tool/_tools/_computer/_resources/tool/_constants.py +2 -0
- inspect_ai/tool/_tools/_computer/_resources/tool/_x11_client.py +17 -0
- inspect_ai/tool/_tools/_think.py +1 -1
- inspect_ai/tool/_tools/_web_browser/_web_browser.py +100 -61
- inspect_ai/util/__init__.py +2 -0
- inspect_ai/util/_anyio.py +27 -0
- inspect_ai/util/_sandbox/__init__.py +2 -1
- inspect_ai/util/_sandbox/context.py +32 -7
- inspect_ai/util/_sandbox/docker/cleanup.py +4 -0
- inspect_ai/util/_sandbox/docker/compose.py +2 -2
- inspect_ai/util/_sandbox/docker/docker.py +12 -1
- inspect_ai/util/_store_model.py +30 -7
- inspect_ai/util/_subprocess.py +13 -3
- {inspect_ai-0.3.82.dist-info → inspect_ai-0.3.83.dist-info}/METADATA +1 -1
- {inspect_ai-0.3.82.dist-info → inspect_ai-0.3.83.dist-info}/RECORD +179 -153
- inspect_ai/_view/www/src/samples/scores/SampleScoreView.tsx +0 -167
- /inspect_ai/{solver → agent}/_bridge/__init__.py +0 -0
- /inspect_ai/{solver/_human_agent → agent/_human}/__init__.py +0 -0
- /inspect_ai/{solver/_human_agent → agent/_human}/commands/command.py +0 -0
- /inspect_ai/{solver/_human_agent → agent/_human}/commands/instructions.py +0 -0
- /inspect_ai/{solver/_human_agent → agent/_human}/commands/note.py +0 -0
- /inspect_ai/{solver/_human_agent → agent/_human}/commands/status.py +0 -0
- /inspect_ai/{solver/_human_agent → agent/_human}/commands/submit.py +0 -0
- /inspect_ai/{solver/_human_agent → agent/_human}/panel.py +0 -0
- /inspect_ai/{solver/_human_agent → agent/_human}/view.py +0 -0
- {inspect_ai-0.3.82.dist-info → inspect_ai-0.3.83.dist-info}/WHEEL +0 -0
- {inspect_ai-0.3.82.dist-info → inspect_ai-0.3.83.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.82.dist-info → inspect_ai-0.3.83.dist-info}/licenses/LICENSE +0 -0
- {inspect_ai-0.3.82.dist-info → inspect_ai-0.3.83.dist-info}/top_level.txt +0 -0
@@ -13,6 +13,7 @@ export type Task = string;
|
|
13
13
|
export type TaskId = string;
|
14
14
|
export type TaskVersion = number;
|
15
15
|
export type TaskFile = string | null;
|
16
|
+
export type TaskRegistryName = string | null;
|
16
17
|
export type Solver = string | null;
|
17
18
|
export type SolverArgs = {} | null;
|
18
19
|
export type Tags = string[] | null;
|
@@ -161,6 +162,7 @@ export type Content =
|
|
161
162
|
)[];
|
162
163
|
export type Type3 = "text";
|
163
164
|
export type Text = string;
|
165
|
+
export type Refusal = boolean | null;
|
164
166
|
export type Type4 = "reasoning";
|
165
167
|
export type Reasoning = string;
|
166
168
|
export type Signature = string | null;
|
@@ -204,12 +206,11 @@ export type Role2 = "assistant";
|
|
204
206
|
export type ToolCalls = ToolCall[] | null;
|
205
207
|
export type Id4 = string;
|
206
208
|
export type Function = string;
|
207
|
-
export type Type8 = string;
|
208
|
-
export type InternalName = string | null;
|
209
209
|
export type ParseError = string | null;
|
210
210
|
export type Title = string | null;
|
211
211
|
export type Format2 = "text" | "markdown";
|
212
212
|
export type Content3 = string;
|
213
|
+
export type Model1 = string | null;
|
213
214
|
export type Id5 = string | null;
|
214
215
|
export type Content4 =
|
215
216
|
| string
|
@@ -224,8 +225,7 @@ export type Source3 = ("input" | "generate") | null;
|
|
224
225
|
export type Role3 = "tool";
|
225
226
|
export type ToolCallId1 = string | null;
|
226
227
|
export type Function1 = string | null;
|
227
|
-
export type
|
228
|
-
export type Type9 =
|
228
|
+
export type Type8 =
|
229
229
|
| "parsing"
|
230
230
|
| "timeout"
|
231
231
|
| "unicode_decode"
|
@@ -246,7 +246,7 @@ export type Messages = (
|
|
246
246
|
| ChatMessageAssistant
|
247
247
|
| ChatMessageTool
|
248
248
|
)[];
|
249
|
-
export type
|
249
|
+
export type Model2 = string;
|
250
250
|
export type StopReason =
|
251
251
|
| "stop"
|
252
252
|
| "max_tokens"
|
@@ -305,7 +305,7 @@ export type Timestamp1 = string;
|
|
305
305
|
export type WorkingStart1 = number;
|
306
306
|
export type Pending1 = boolean | null;
|
307
307
|
export type Event1 = "sample_limit";
|
308
|
-
export type
|
308
|
+
export type Type9 =
|
309
309
|
| "message"
|
310
310
|
| "time"
|
311
311
|
| "working"
|
@@ -345,7 +345,7 @@ export type Timestamp5 = string;
|
|
345
345
|
export type WorkingStart5 = number;
|
346
346
|
export type Pending5 = boolean | null;
|
347
347
|
export type Event5 = "model";
|
348
|
-
export type
|
348
|
+
export type Model3 = string;
|
349
349
|
export type Input3 = (
|
350
350
|
| ChatMessageSystem
|
351
351
|
| ChatMessageUser
|
@@ -354,7 +354,7 @@ export type Input3 = (
|
|
354
354
|
)[];
|
355
355
|
export type Name8 = string;
|
356
356
|
export type Description2 = string;
|
357
|
-
export type
|
357
|
+
export type Type10 = "object";
|
358
358
|
export type Required1 = string[];
|
359
359
|
export type Additionalproperties1 = boolean;
|
360
360
|
export type Tools1 = ToolInfo[];
|
@@ -369,10 +369,9 @@ export type Timestamp6 = string;
|
|
369
369
|
export type WorkingStart6 = number;
|
370
370
|
export type Pending6 = boolean | null;
|
371
371
|
export type Event6 = "tool";
|
372
|
-
export type
|
372
|
+
export type Type11 = "function";
|
373
373
|
export type Id7 = string;
|
374
374
|
export type Function2 = string;
|
375
|
-
export type InternalName2 = string | null;
|
376
375
|
export type Result1 =
|
377
376
|
| string
|
378
377
|
| number
|
@@ -448,14 +447,14 @@ export type WorkingStart13 = number;
|
|
448
447
|
export type Pending13 = boolean | null;
|
449
448
|
export type Event13 = "step";
|
450
449
|
export type Action1 = "begin" | "end";
|
451
|
-
export type
|
450
|
+
export type Type12 = string | null;
|
452
451
|
export type Name11 = string;
|
453
452
|
export type Timestamp14 = string;
|
454
453
|
export type WorkingStart14 = number;
|
455
454
|
export type Pending14 = boolean | null;
|
456
455
|
export type Event14 = "subtask";
|
457
456
|
export type Name12 = string;
|
458
|
-
export type
|
457
|
+
export type Type13 = string | null;
|
459
458
|
export type Events2 = (
|
460
459
|
| SampleInitEvent
|
461
460
|
| SampleLimitEvent
|
@@ -494,6 +493,8 @@ export type Events1 = (
|
|
494
493
|
)[];
|
495
494
|
export type Completed3 = string | null;
|
496
495
|
export type WorkingTime2 = number | null;
|
496
|
+
export type Agent = string | null;
|
497
|
+
export type Failed = boolean | null;
|
497
498
|
export type Events = (
|
498
499
|
| SampleInitEvent
|
499
500
|
| SampleLimitEvent
|
@@ -514,7 +515,7 @@ export type Events = (
|
|
514
515
|
export type TotalTime = number | null;
|
515
516
|
export type WorkingTime3 = number | null;
|
516
517
|
export type Uuid = string | null;
|
517
|
-
export type
|
518
|
+
export type Type14 =
|
518
519
|
| "context"
|
519
520
|
| "time"
|
520
521
|
| "working"
|
@@ -566,6 +567,7 @@ export interface EvalSpec {
|
|
566
567
|
task_id: TaskId;
|
567
568
|
task_version: TaskVersion;
|
568
569
|
task_file: TaskFile;
|
570
|
+
task_registry_name: TaskRegistryName;
|
569
571
|
task_attribs: TaskAttribs;
|
570
572
|
task_args: TaskArgs;
|
571
573
|
solver: Solver;
|
@@ -847,6 +849,7 @@ export interface ChatMessageSystem {
|
|
847
849
|
id: Id1;
|
848
850
|
content: Content;
|
849
851
|
source: Source;
|
852
|
+
internal: unknown;
|
850
853
|
role: Role;
|
851
854
|
}
|
852
855
|
/**
|
@@ -855,6 +858,7 @@ export interface ChatMessageSystem {
|
|
855
858
|
export interface ContentText {
|
856
859
|
type: Type3;
|
857
860
|
text: Text;
|
861
|
+
refusal: Refusal;
|
858
862
|
}
|
859
863
|
/**
|
860
864
|
* Reasoning content.
|
@@ -898,6 +902,7 @@ export interface ChatMessageUser {
|
|
898
902
|
id: Id2;
|
899
903
|
content: Content1;
|
900
904
|
source: Source1;
|
905
|
+
internal: unknown;
|
901
906
|
role: Role1;
|
902
907
|
tool_call_id: ToolCallId;
|
903
908
|
}
|
@@ -908,15 +913,16 @@ export interface ChatMessageAssistant {
|
|
908
913
|
id: Id3;
|
909
914
|
content: Content2;
|
910
915
|
source: Source2;
|
916
|
+
internal: unknown;
|
911
917
|
role: Role2;
|
912
918
|
tool_calls: ToolCalls;
|
919
|
+
model: Model1;
|
913
920
|
}
|
914
921
|
export interface ToolCall {
|
915
922
|
id: Id4;
|
916
923
|
function: Function;
|
917
924
|
arguments: Arguments;
|
918
|
-
|
919
|
-
internal_name: InternalName;
|
925
|
+
internal: unknown;
|
920
926
|
parse_error: ParseError;
|
921
927
|
view: ToolCallContent | null;
|
922
928
|
}
|
@@ -936,21 +942,21 @@ export interface ChatMessageTool {
|
|
936
942
|
id: Id5;
|
937
943
|
content: Content4;
|
938
944
|
source: Source3;
|
945
|
+
internal: unknown;
|
939
946
|
role: Role3;
|
940
947
|
tool_call_id: ToolCallId1;
|
941
948
|
function: Function1;
|
942
|
-
internal_name: InternalName1;
|
943
949
|
error: ToolCallError | null;
|
944
950
|
}
|
945
951
|
export interface ToolCallError {
|
946
|
-
type:
|
952
|
+
type: Type8;
|
947
953
|
message: Message1;
|
948
954
|
}
|
949
955
|
/**
|
950
956
|
* Output from model generation.
|
951
957
|
*/
|
952
958
|
export interface ModelOutput {
|
953
|
-
model:
|
959
|
+
model: Model2;
|
954
960
|
choices: Choices1;
|
955
961
|
usage: ModelUsage1 | null;
|
956
962
|
time: Time;
|
@@ -1031,7 +1037,7 @@ export interface SampleLimitEvent {
|
|
1031
1037
|
working_start: WorkingStart1;
|
1032
1038
|
pending: Pending1;
|
1033
1039
|
event: Event1;
|
1034
|
-
type:
|
1040
|
+
type: Type9;
|
1035
1041
|
message: Message2;
|
1036
1042
|
limit: Limit1;
|
1037
1043
|
}
|
@@ -1094,7 +1100,7 @@ export interface ModelEvent {
|
|
1094
1100
|
working_start: WorkingStart5;
|
1095
1101
|
pending: Pending5;
|
1096
1102
|
event: Event5;
|
1097
|
-
model:
|
1103
|
+
model: Model3;
|
1098
1104
|
input: Input3;
|
1099
1105
|
tools: Tools1;
|
1100
1106
|
tool_choice: ToolChoice;
|
@@ -1141,7 +1147,7 @@ export interface ToolInfo {
|
|
1141
1147
|
* Description of tool parameters object in JSON Schema format.
|
1142
1148
|
*/
|
1143
1149
|
export interface ToolParams {
|
1144
|
-
type:
|
1150
|
+
type: Type10;
|
1145
1151
|
properties: Properties1;
|
1146
1152
|
required: Required1;
|
1147
1153
|
additionalProperties: Additionalproperties1;
|
@@ -1204,11 +1210,11 @@ export interface ToolEvent {
|
|
1204
1210
|
working_start: WorkingStart6;
|
1205
1211
|
pending: Pending6;
|
1206
1212
|
event: Event6;
|
1207
|
-
type:
|
1213
|
+
type: Type11;
|
1208
1214
|
id: Id7;
|
1209
1215
|
function: Function2;
|
1210
1216
|
arguments: Arguments1;
|
1211
|
-
|
1217
|
+
internal: unknown;
|
1212
1218
|
view: ToolCallContent | null;
|
1213
1219
|
result: Result1;
|
1214
1220
|
truncated: Truncated;
|
@@ -1216,6 +1222,8 @@ export interface ToolEvent {
|
|
1216
1222
|
events: Events1;
|
1217
1223
|
completed: Completed3;
|
1218
1224
|
working_time: WorkingTime2;
|
1225
|
+
agent: Agent;
|
1226
|
+
failed: Failed;
|
1219
1227
|
}
|
1220
1228
|
export interface Arguments1 {
|
1221
1229
|
[k: string]: JsonValue;
|
@@ -1324,7 +1332,7 @@ export interface StepEvent {
|
|
1324
1332
|
pending: Pending13;
|
1325
1333
|
event: Event13;
|
1326
1334
|
action: Action1;
|
1327
|
-
type:
|
1335
|
+
type: Type12;
|
1328
1336
|
name: Name11;
|
1329
1337
|
}
|
1330
1338
|
/**
|
@@ -1336,7 +1344,7 @@ export interface SubtaskEvent {
|
|
1336
1344
|
pending: Pending14;
|
1337
1345
|
event: Event14;
|
1338
1346
|
name: Name12;
|
1339
|
-
type:
|
1347
|
+
type: Type13;
|
1340
1348
|
input: Input5;
|
1341
1349
|
result: Result2;
|
1342
1350
|
events: Events2;
|
@@ -1357,7 +1365,7 @@ export interface Attachments {
|
|
1357
1365
|
* Limit encontered by sample.
|
1358
1366
|
*/
|
1359
1367
|
export interface EvalSampleLimit {
|
1360
|
-
type:
|
1368
|
+
type: Type14;
|
1361
1369
|
limit: Limit2;
|
1362
1370
|
}
|
1363
1371
|
/**
|
@@ -0,0 +1,21 @@
|
|
1
|
+
declare module "markdown-it-katex" {
|
2
|
+
import MarkdownIt from "markdown-it";
|
3
|
+
|
4
|
+
interface KatexOptions {
|
5
|
+
throwOnError?: boolean;
|
6
|
+
errorColor?: string;
|
7
|
+
macros?: Record<string, string>;
|
8
|
+
fleqn?: boolean;
|
9
|
+
trust?: boolean;
|
10
|
+
output?: "html" | "htmlAndMathml" | "mathml";
|
11
|
+
minRuleThickness?: number;
|
12
|
+
colorIsTextColor?: boolean;
|
13
|
+
maxSize?: number;
|
14
|
+
maxExpand?: number;
|
15
|
+
strict?: boolean | string | Function;
|
16
|
+
}
|
17
|
+
|
18
|
+
const markdownItKatex: (md: MarkdownIt, options?: KatexOptions) => void;
|
19
|
+
|
20
|
+
export default markdownItKatex;
|
21
|
+
}
|
@@ -1,43 +1,110 @@
|
|
1
1
|
export const asyncJsonParse = async (text: string): Promise<any> => {
|
2
|
+
// Encode the input text
|
2
3
|
const encoder = new TextEncoder();
|
3
4
|
const encodedText = encoder.encode(text);
|
5
|
+
|
6
|
+
// Create a worker from the inline script
|
4
7
|
const blob = new Blob([kWorkerCode], { type: "application/javascript" });
|
5
8
|
const blobURL = URL.createObjectURL(blob);
|
6
9
|
const worker = new Worker(blobURL);
|
10
|
+
|
7
11
|
try {
|
8
12
|
const result = new Promise((resolve, reject) => {
|
9
13
|
worker.onmessage = function (e) {
|
10
14
|
if (e.data.success) {
|
11
|
-
|
15
|
+
if (e.data.serialized) {
|
16
|
+
// Deserialize the result if it was sent as a transferable
|
17
|
+
const decoder = new TextDecoder();
|
18
|
+
const resultString = decoder.decode(e.data.result);
|
19
|
+
resolve(JSON.parse(resultString));
|
20
|
+
} else {
|
21
|
+
resolve(e.data.result);
|
22
|
+
}
|
12
23
|
} else {
|
13
|
-
|
24
|
+
const error = new Error(e.data.error);
|
25
|
+
if (e.data.stack) {
|
26
|
+
error.stack = e.data.stack;
|
27
|
+
}
|
28
|
+
reject(error);
|
14
29
|
}
|
15
30
|
};
|
31
|
+
|
16
32
|
worker.onerror = function (error) {
|
17
|
-
reject(new Error(error.message));
|
33
|
+
reject(new Error(`Worker error: ${error.message}`));
|
18
34
|
};
|
19
35
|
});
|
20
|
-
|
21
|
-
|
22
|
-
|
36
|
+
|
37
|
+
// Transfer the encoded text buffer to the worker
|
38
|
+
worker.postMessage(
|
39
|
+
{
|
40
|
+
scriptContent: kJson5ScriptBase64,
|
41
|
+
encodedText,
|
42
|
+
},
|
43
|
+
[encodedText.buffer],
|
44
|
+
);
|
45
|
+
|
23
46
|
return await result;
|
24
47
|
} finally {
|
48
|
+
// Clean up resources
|
25
49
|
worker.terminate();
|
26
50
|
URL.revokeObjectURL(blobURL);
|
27
51
|
}
|
28
52
|
};
|
29
53
|
|
30
54
|
const kWorkerCode = `
|
55
|
+
// Store the JSON5 parser once loaded
|
56
|
+
let JSON5 = null;
|
57
|
+
|
31
58
|
self.onmessage = function (e) {
|
32
|
-
|
33
|
-
|
34
|
-
const decoder = new TextDecoder();
|
35
|
-
const text = decoder.decode(encodedText);
|
59
|
+
const { encodedText, scriptContent } = e.data;
|
60
|
+
|
36
61
|
try {
|
62
|
+
// Only load the JSON5 script if we haven't done so yet
|
63
|
+
if (!JSON5) {
|
64
|
+
const script = atob(scriptContent);
|
65
|
+
|
66
|
+
new Function(script)();
|
67
|
+
// Verify it was loaded properly
|
68
|
+
if (typeof self.JSON5 !== 'object' || typeof self.JSON5.parse !== 'function') {
|
69
|
+
throw new Error('Failed to initialize JSON5 parser');
|
70
|
+
}
|
71
|
+
JSON5 = self.JSON5;
|
72
|
+
}
|
73
|
+
|
74
|
+
// Decode the text using TextDecoder
|
75
|
+
const decoder = new TextDecoder();
|
76
|
+
const text = decoder.decode(encodedText);
|
77
|
+
|
78
|
+
// Parse with JSON5
|
37
79
|
const result = JSON5.parse(text);
|
38
|
-
|
80
|
+
|
81
|
+
if (result && typeof result === 'object' &&
|
82
|
+
(Array.isArray(result) ? result.length > 10000 : Object.keys(result).length > 10000)) {
|
83
|
+
|
84
|
+
// Large result, use transferrable object
|
85
|
+
const resultString = JSON.stringify(result);
|
86
|
+
const encoder = new TextEncoder();
|
87
|
+
const serialized = encoder.encode(resultString);
|
88
|
+
|
89
|
+
postMessage({
|
90
|
+
success: true,
|
91
|
+
serialized: true,
|
92
|
+
result: serialized
|
93
|
+
}, [serialized.buffer]);
|
94
|
+
} else {
|
95
|
+
// Small results, send directly
|
96
|
+
postMessage({
|
97
|
+
success: true,
|
98
|
+
serialized: false,
|
99
|
+
result: result
|
100
|
+
});
|
101
|
+
}
|
39
102
|
} catch (err) {
|
40
|
-
postMessage({
|
103
|
+
postMessage({
|
104
|
+
success: false,
|
105
|
+
error: err.message,
|
106
|
+
stack: err.stack || ''
|
107
|
+
});
|
41
108
|
}
|
42
109
|
};`;
|
43
110
|
|
@@ -1,6 +1,6 @@
|
|
1
1
|
import { ApplicationIcons } from "../appearance/icons";
|
2
2
|
import { ToolButton } from "../components/ToolButton";
|
3
|
-
import { SampleTools } from "../samples/SamplesTools";
|
3
|
+
import { SampleTools, ScoreFilterTools } from "../samples/SamplesTools";
|
4
4
|
import { JsonTab } from "./tabs/JsonTab";
|
5
5
|
import { SamplesTab } from "./tabs/SamplesTab";
|
6
6
|
|
@@ -131,22 +131,24 @@ export const useSamplesTabConfig = (
|
|
131
131
|
running: evalStatus === "started",
|
132
132
|
},
|
133
133
|
tools: () =>
|
134
|
-
|
134
|
+
!samplesDescriptor
|
135
135
|
? undefined
|
136
|
-
:
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
136
|
+
: totalSampleCount === 1
|
137
|
+
? [<ScoreFilterTools />]
|
138
|
+
: [
|
139
|
+
<SampleTools
|
140
|
+
samples={sampleSummaries || []}
|
141
|
+
key="sample-tools"
|
142
|
+
/>,
|
143
|
+
evalStatus === "started" && !streamSamples && (
|
144
|
+
<ToolButton
|
145
|
+
key="refresh"
|
146
|
+
label="Refresh"
|
147
|
+
icon={ApplicationIcons.refresh}
|
148
|
+
onClick={refreshLog}
|
149
|
+
/>
|
150
|
+
),
|
151
|
+
],
|
150
152
|
};
|
151
153
|
}, [
|
152
154
|
evalStatus,
|
@@ -87,3 +87,19 @@
|
|
87
87
|
padding: 0 0.2em;
|
88
88
|
justify-content: center;
|
89
89
|
}
|
90
|
+
|
91
|
+
.moreButton {
|
92
|
+
margin-top: 0.5em;
|
93
|
+
margin-bottom: 0.5em;
|
94
|
+
padding-right: 0;
|
95
|
+
}
|
96
|
+
|
97
|
+
.metricsSummary {
|
98
|
+
display: flex;
|
99
|
+
flex-direction: column;
|
100
|
+
align-items: flex-end;
|
101
|
+
}
|
102
|
+
|
103
|
+
.modalScores {
|
104
|
+
padding-bottom: 4em;
|
105
|
+
}
|
@@ -1,10 +1,14 @@
|
|
1
1
|
import clsx from "clsx";
|
2
2
|
import { FC } from "react";
|
3
3
|
import { RunningMetric } from "../../api/types";
|
4
|
+
import { LinkButton } from "../../components/LinkButton";
|
5
|
+
import { Modal } from "../../components/Modal";
|
6
|
+
import { useProperty } from "../../state/hooks";
|
4
7
|
import { Scores } from "../../types/log";
|
5
8
|
import { formatPrettyDecimal } from "../../utils/format";
|
6
9
|
import { metricDisplayName } from "../utils";
|
7
10
|
import styles from "./ResultsPanel.module.css";
|
11
|
+
import { ScoreGrid } from "./ScoreGrid";
|
8
12
|
|
9
13
|
export interface ResultsMetric {
|
10
14
|
name: string;
|
@@ -82,6 +86,14 @@ interface ResultsPanelProps {
|
|
82
86
|
}
|
83
87
|
|
84
88
|
export const ResultsPanel: FC<ResultsPanelProps> = ({ scorers }) => {
|
89
|
+
const [showing, setShowing] = useProperty(
|
90
|
+
"results-panel-metrics",
|
91
|
+
"modal-showing",
|
92
|
+
{
|
93
|
+
defaultValue: false,
|
94
|
+
},
|
95
|
+
);
|
96
|
+
|
85
97
|
if (!scorers || scorers.length === 0) {
|
86
98
|
return undefined;
|
87
99
|
}
|
@@ -107,23 +119,69 @@ export const ResultsPanel: FC<ResultsPanelProps> = ({ scorers }) => {
|
|
107
119
|
);
|
108
120
|
} else {
|
109
121
|
const showReducer = scorers.findIndex((score) => !!score.reducer) !== -1;
|
122
|
+
const grouped = groupMetrics(scorers);
|
123
|
+
|
124
|
+
// Try to select metrics with a group size 5 or less, if possible
|
125
|
+
let primaryResults = grouped[0];
|
126
|
+
if (primaryResults.length > 5) {
|
127
|
+
const shorterResults = grouped.find((g) => {
|
128
|
+
return g.length <= 5;
|
129
|
+
});
|
130
|
+
if (shorterResults) {
|
131
|
+
primaryResults = shorterResults;
|
132
|
+
}
|
133
|
+
}
|
134
|
+
|
110
135
|
return (
|
111
|
-
<div className={styles.
|
112
|
-
{
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
136
|
+
<div className={clsx(styles.metricsSummary)}>
|
137
|
+
<ScoreGrid scoreGroups={[primaryResults]} showReducer={showReducer} />
|
138
|
+
{grouped.length > 1 ? (
|
139
|
+
<>
|
140
|
+
<Modal
|
141
|
+
id="results-metrics"
|
142
|
+
showing={showing}
|
143
|
+
setShowing={setShowing}
|
144
|
+
title={"Scoring Detail"}
|
145
|
+
>
|
146
|
+
<ScoreGrid
|
147
|
+
scoreGroups={grouped}
|
148
|
+
showReducer={showReducer}
|
149
|
+
className={styles.modalScores}
|
150
|
+
striped={false}
|
151
|
+
/>
|
152
|
+
</Modal>
|
153
|
+
<LinkButton
|
154
|
+
className={styles.moreButton}
|
155
|
+
text={"All scoring..."}
|
156
|
+
onClick={() => {
|
157
|
+
setShowing(true);
|
158
|
+
}}
|
119
159
|
/>
|
120
|
-
|
121
|
-
|
160
|
+
</>
|
161
|
+
) : undefined}
|
122
162
|
</div>
|
123
163
|
);
|
124
164
|
}
|
125
165
|
};
|
126
166
|
|
167
|
+
const metricsKey = (metrics: ResultsMetric[]): string => {
|
168
|
+
const metricKey = metrics.map((m) => m.name).join("");
|
169
|
+
return metricKey;
|
170
|
+
};
|
171
|
+
|
172
|
+
const groupMetrics = (scorers: ResultsScorer[]): ResultsScorer[][] => {
|
173
|
+
const results: Record<string, ResultsScorer[]> = {};
|
174
|
+
scorers.forEach((scorer) => {
|
175
|
+
if (scorer.metrics.length > 0) {
|
176
|
+
const key = metricsKey(scorer.metrics);
|
177
|
+
results[key] = results[key] || [];
|
178
|
+
|
179
|
+
results[key].push(scorer);
|
180
|
+
}
|
181
|
+
});
|
182
|
+
return Object.values(results);
|
183
|
+
};
|
184
|
+
|
127
185
|
interface VerticalMetricProps {
|
128
186
|
metric: ResultsMetric;
|
129
187
|
reducer?: string;
|
@@ -177,64 +235,3 @@ const VerticalMetric: FC<VerticalMetricProps> = ({
|
|
177
235
|
</div>
|
178
236
|
);
|
179
237
|
};
|
180
|
-
|
181
|
-
interface MultiScorerMetricProps {
|
182
|
-
scorer: ResultsScorer;
|
183
|
-
isFirst: boolean;
|
184
|
-
showReducer: boolean;
|
185
|
-
}
|
186
|
-
|
187
|
-
const MultiScorerMetric: FC<MultiScorerMetricProps> = ({
|
188
|
-
scorer,
|
189
|
-
isFirst,
|
190
|
-
showReducer,
|
191
|
-
}) => {
|
192
|
-
const titleFontClz = "text-size-base";
|
193
|
-
const reducerFontClz = "text-size-smaller";
|
194
|
-
const valueFontClz = "text-size-base";
|
195
|
-
|
196
|
-
return (
|
197
|
-
<div
|
198
|
-
className={clsx(
|
199
|
-
styles.multiScorer,
|
200
|
-
isFirst ? styles.multiScorerIndent : undefined,
|
201
|
-
)}
|
202
|
-
>
|
203
|
-
<div
|
204
|
-
className={clsx(
|
205
|
-
titleFontClz,
|
206
|
-
"text-style-label",
|
207
|
-
"text-style-secondary",
|
208
|
-
"multi-score-label",
|
209
|
-
styles.multiScorerLabel,
|
210
|
-
)}
|
211
|
-
>
|
212
|
-
{scorer.scorer}
|
213
|
-
</div>
|
214
|
-
{showReducer ? (
|
215
|
-
<div
|
216
|
-
className={clsx(
|
217
|
-
reducerFontClz,
|
218
|
-
"text-style-label",
|
219
|
-
"text-style-secondary",
|
220
|
-
styles.multiScorerReducer,
|
221
|
-
)}
|
222
|
-
>
|
223
|
-
{scorer.reducer || "default"}
|
224
|
-
</div>
|
225
|
-
) : undefined}
|
226
|
-
<div className={clsx(valueFontClz, styles.multiScorerValue)}>
|
227
|
-
{scorer.metrics.map((metric) => {
|
228
|
-
return (
|
229
|
-
<div className={styles.multiScoreMetricGrid} key={metric.name}>
|
230
|
-
<div>{metricDisplayName(metric)}</div>
|
231
|
-
<div className={styles.multiScorerValueContent}>
|
232
|
-
{metric.value ? formatPrettyDecimal(metric.value) : undefined}
|
233
|
-
</div>
|
234
|
-
</div>
|
235
|
-
);
|
236
|
-
})}
|
237
|
-
</div>
|
238
|
-
</div>
|
239
|
-
);
|
240
|
-
};
|
@@ -0,0 +1,35 @@
|
|
1
|
+
.table {
|
2
|
+
margin-bottom: 0;
|
3
|
+
}
|
4
|
+
|
5
|
+
.scorer,
|
6
|
+
.value {
|
7
|
+
padding-top: 0.2em !important;
|
8
|
+
padding-bottom: 0.2em !important;
|
9
|
+
}
|
10
|
+
|
11
|
+
.label,
|
12
|
+
.value {
|
13
|
+
text-align: center;
|
14
|
+
padding-left: 1em;
|
15
|
+
padding-right: 1em;
|
16
|
+
}
|
17
|
+
|
18
|
+
.label {
|
19
|
+
font-weight: 400;
|
20
|
+
padding-left: 1em;
|
21
|
+
padding-right: 1em;
|
22
|
+
}
|
23
|
+
|
24
|
+
.scorer {
|
25
|
+
font-weight: 400;
|
26
|
+
}
|
27
|
+
|
28
|
+
.groupSeparator {
|
29
|
+
padding-top: 2em;
|
30
|
+
border-bottom: hidden;
|
31
|
+
}
|
32
|
+
|
33
|
+
.tableBody {
|
34
|
+
border-top-color: var(--bs-light-border-subtle);
|
35
|
+
}
|