inspect-ai 0.3.82__py3-none-any.whl → 0.3.83__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (180)
  1. inspect_ai/__init__.py +2 -1
  2. inspect_ai/_display/textual/app.py +14 -3
  3. inspect_ai/_display/textual/display.py +4 -0
  4. inspect_ai/_display/textual/widgets/samples.py +9 -3
  5. inspect_ai/_display/textual/widgets/task_detail.py +3 -4
  6. inspect_ai/_display/textual/widgets/tasks.py +17 -1
  7. inspect_ai/_display/textual/widgets/vscode.py +44 -0
  8. inspect_ai/_eval/eval.py +36 -24
  9. inspect_ai/_eval/evalset.py +17 -18
  10. inspect_ai/_eval/loader.py +34 -11
  11. inspect_ai/_eval/run.py +8 -13
  12. inspect_ai/_eval/score.py +13 -3
  13. inspect_ai/_eval/task/generate.py +8 -9
  14. inspect_ai/_eval/task/log.py +2 -0
  15. inspect_ai/_eval/task/task.py +23 -9
  16. inspect_ai/_util/file.py +13 -0
  17. inspect_ai/_util/json.py +2 -1
  18. inspect_ai/_util/registry.py +1 -0
  19. inspect_ai/_util/vscode.py +37 -0
  20. inspect_ai/_view/www/App.css +6 -0
  21. inspect_ai/_view/www/dist/assets/index.css +304 -128
  22. inspect_ai/_view/www/dist/assets/index.js +47495 -27519
  23. inspect_ai/_view/www/log-schema.json +124 -31
  24. inspect_ai/_view/www/package.json +3 -0
  25. inspect_ai/_view/www/src/App.tsx +12 -0
  26. inspect_ai/_view/www/src/appearance/icons.ts +1 -0
  27. inspect_ai/_view/www/src/components/Card.tsx +6 -4
  28. inspect_ai/_view/www/src/components/LinkButton.module.css +16 -0
  29. inspect_ai/_view/www/src/components/LinkButton.tsx +33 -0
  30. inspect_ai/_view/www/src/components/LiveVirtualList.tsx +1 -1
  31. inspect_ai/_view/www/src/components/MarkdownDiv.tsx +113 -23
  32. inspect_ai/_view/www/src/components/Modal.module.css +38 -0
  33. inspect_ai/_view/www/src/components/Modal.tsx +77 -0
  34. inspect_ai/_view/www/src/plan/DetailStep.module.css +4 -0
  35. inspect_ai/_view/www/src/plan/DetailStep.tsx +6 -3
  36. inspect_ai/_view/www/src/plan/SolverDetailView.module.css +2 -1
  37. inspect_ai/_view/www/src/samples/InlineSampleDisplay.tsx +7 -0
  38. inspect_ai/_view/www/src/samples/SampleDialog.tsx +7 -0
  39. inspect_ai/_view/www/src/samples/SampleDisplay.tsx +11 -34
  40. inspect_ai/_view/www/src/samples/SampleSummaryView.module.css +6 -0
  41. inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +2 -2
  42. inspect_ai/_view/www/src/samples/SamplesTools.tsx +12 -0
  43. inspect_ai/_view/www/src/samples/chat/MessageContent.tsx +2 -0
  44. inspect_ai/_view/www/src/samples/chat/MessageContents.tsx +2 -0
  45. inspect_ai/_view/www/src/samples/chat/messages.ts +3 -1
  46. inspect_ai/_view/www/src/samples/chat/tools/ToolCallView.tsx +1 -0
  47. inspect_ai/_view/www/src/samples/descriptor/samplesDescriptor.tsx +9 -3
  48. inspect_ai/_view/www/src/samples/descriptor/score/BooleanScoreDescriptor.module.css +3 -3
  49. inspect_ai/_view/www/src/samples/descriptor/score/BooleanScoreDescriptor.tsx +1 -1
  50. inspect_ai/_view/www/src/samples/descriptor/score/ObjectScoreDescriptor.module.css +4 -4
  51. inspect_ai/_view/www/src/samples/descriptor/score/ObjectScoreDescriptor.tsx +10 -11
  52. inspect_ai/_view/www/src/samples/list/SampleFooter.module.css +2 -1
  53. inspect_ai/_view/www/src/samples/list/SampleFooter.tsx +7 -1
  54. inspect_ai/_view/www/src/samples/list/SampleList.tsx +25 -8
  55. inspect_ai/_view/www/src/samples/list/SampleRow.tsx +1 -1
  56. inspect_ai/_view/www/src/samples/scores/SampleScores.tsx +11 -22
  57. inspect_ai/_view/www/src/samples/scores/SampleScoresGrid.module.css +38 -0
  58. inspect_ai/_view/www/src/samples/scores/SampleScoresGrid.tsx +118 -0
  59. inspect_ai/_view/www/src/samples/scores/{SampleScoreView.module.css → SampleScoresView.module.css} +10 -1
  60. inspect_ai/_view/www/src/samples/scores/SampleScoresView.tsx +78 -0
  61. inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.tsx +3 -3
  62. inspect_ai/_view/www/src/samples/transcript/ToolEventView.tsx +25 -4
  63. inspect_ai/_view/www/src/samples/transcript/event/EventPanel.tsx +29 -2
  64. inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.tsx +0 -1
  65. inspect_ai/_view/www/src/state/hooks.ts +5 -3
  66. inspect_ai/_view/www/src/state/logPolling.ts +5 -1
  67. inspect_ai/_view/www/src/state/logSlice.ts +10 -0
  68. inspect_ai/_view/www/src/state/samplePolling.ts +4 -1
  69. inspect_ai/_view/www/src/state/sampleSlice.ts +13 -0
  70. inspect_ai/_view/www/src/types/log.d.ts +34 -26
  71. inspect_ai/_view/www/src/types/markdown-it-katex.d.ts +21 -0
  72. inspect_ai/_view/www/src/utils/json-worker.ts +79 -12
  73. inspect_ai/_view/www/src/workspace/WorkSpace.tsx +18 -16
  74. inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.module.css +16 -0
  75. inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.tsx +68 -71
  76. inspect_ai/_view/www/src/workspace/navbar/ScoreGrid.module.css +35 -0
  77. inspect_ai/_view/www/src/workspace/navbar/ScoreGrid.tsx +117 -0
  78. inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.tsx +1 -1
  79. inspect_ai/_view/www/src/workspace/sidebar/Sidebar.module.css +3 -2
  80. inspect_ai/_view/www/src/workspace/tabs/SamplesTab.tsx +18 -0
  81. inspect_ai/_view/www/yarn.lock +94 -1
  82. inspect_ai/agent/__init__.py +36 -0
  83. inspect_ai/agent/_agent.py +268 -0
  84. inspect_ai/agent/_as_solver.py +72 -0
  85. inspect_ai/agent/_as_tool.py +122 -0
  86. inspect_ai/{solver → agent}/_bridge/bridge.py +23 -37
  87. inspect_ai/{solver → agent}/_bridge/patch.py +9 -8
  88. inspect_ai/agent/_filter.py +46 -0
  89. inspect_ai/agent/_handoff.py +93 -0
  90. inspect_ai/{solver/_human_agent → agent/_human}/agent.py +11 -12
  91. inspect_ai/{solver/_human_agent → agent/_human}/commands/__init__.py +2 -3
  92. inspect_ai/{solver/_human_agent → agent/_human}/commands/clock.py +3 -1
  93. inspect_ai/{solver/_human_agent → agent/_human}/commands/score.py +5 -5
  94. inspect_ai/{solver/_human_agent → agent/_human}/install.py +6 -3
  95. inspect_ai/{solver/_human_agent → agent/_human}/service.py +7 -3
  96. inspect_ai/{solver/_human_agent → agent/_human}/state.py +5 -5
  97. inspect_ai/agent/_react.py +241 -0
  98. inspect_ai/agent/_run.py +36 -0
  99. inspect_ai/agent/_types.py +81 -0
  100. inspect_ai/log/_log.py +11 -2
  101. inspect_ai/log/_transcript.py +13 -9
  102. inspect_ai/model/__init__.py +7 -1
  103. inspect_ai/model/_call_tools.py +256 -52
  104. inspect_ai/model/_chat_message.py +7 -4
  105. inspect_ai/model/_conversation.py +13 -62
  106. inspect_ai/model/_display.py +85 -0
  107. inspect_ai/model/_model.py +113 -14
  108. inspect_ai/model/_model_output.py +14 -9
  109. inspect_ai/model/_openai.py +16 -4
  110. inspect_ai/model/_openai_computer_use.py +162 -0
  111. inspect_ai/model/_openai_responses.py +319 -165
  112. inspect_ai/model/_providers/anthropic.py +20 -21
  113. inspect_ai/model/_providers/azureai.py +24 -13
  114. inspect_ai/model/_providers/bedrock.py +1 -7
  115. inspect_ai/model/_providers/cloudflare.py +3 -3
  116. inspect_ai/model/_providers/goodfire.py +2 -6
  117. inspect_ai/model/_providers/google.py +11 -10
  118. inspect_ai/model/_providers/groq.py +6 -3
  119. inspect_ai/model/_providers/hf.py +7 -3
  120. inspect_ai/model/_providers/mistral.py +7 -10
  121. inspect_ai/model/_providers/openai.py +47 -17
  122. inspect_ai/model/_providers/openai_o1.py +11 -4
  123. inspect_ai/model/_providers/openai_responses.py +12 -14
  124. inspect_ai/model/_providers/providers.py +2 -2
  125. inspect_ai/model/_providers/together.py +12 -2
  126. inspect_ai/model/_providers/util/chatapi.py +7 -2
  127. inspect_ai/model/_providers/util/hf_handler.py +4 -2
  128. inspect_ai/model/_providers/util/llama31.py +4 -2
  129. inspect_ai/model/_providers/vertex.py +11 -9
  130. inspect_ai/model/_providers/vllm.py +4 -4
  131. inspect_ai/scorer/__init__.py +2 -0
  132. inspect_ai/scorer/_metrics/__init__.py +2 -0
  133. inspect_ai/scorer/_metrics/grouped.py +84 -0
  134. inspect_ai/scorer/_score.py +26 -6
  135. inspect_ai/solver/__init__.py +2 -2
  136. inspect_ai/solver/_basic_agent.py +22 -9
  137. inspect_ai/solver/_bridge.py +31 -0
  138. inspect_ai/solver/_chain.py +20 -12
  139. inspect_ai/solver/_fork.py +5 -1
  140. inspect_ai/solver/_human_agent.py +52 -0
  141. inspect_ai/solver/_prompt.py +3 -1
  142. inspect_ai/solver/_run.py +59 -0
  143. inspect_ai/solver/_solver.py +14 -4
  144. inspect_ai/solver/_task_state.py +5 -3
  145. inspect_ai/tool/_tool_call.py +15 -8
  146. inspect_ai/tool/_tool_def.py +17 -12
  147. inspect_ai/tool/_tool_support_helpers.py +2 -2
  148. inspect_ai/tool/_tool_with.py +14 -11
  149. inspect_ai/tool/_tools/_bash_session.py +11 -2
  150. inspect_ai/tool/_tools/_computer/_common.py +18 -2
  151. inspect_ai/tool/_tools/_computer/_computer.py +18 -2
  152. inspect_ai/tool/_tools/_computer/_resources/tool/_constants.py +2 -0
  153. inspect_ai/tool/_tools/_computer/_resources/tool/_x11_client.py +17 -0
  154. inspect_ai/tool/_tools/_think.py +1 -1
  155. inspect_ai/tool/_tools/_web_browser/_web_browser.py +100 -61
  156. inspect_ai/util/__init__.py +2 -0
  157. inspect_ai/util/_anyio.py +27 -0
  158. inspect_ai/util/_sandbox/__init__.py +2 -1
  159. inspect_ai/util/_sandbox/context.py +32 -7
  160. inspect_ai/util/_sandbox/docker/cleanup.py +4 -0
  161. inspect_ai/util/_sandbox/docker/compose.py +2 -2
  162. inspect_ai/util/_sandbox/docker/docker.py +12 -1
  163. inspect_ai/util/_store_model.py +30 -7
  164. inspect_ai/util/_subprocess.py +13 -3
  165. {inspect_ai-0.3.82.dist-info → inspect_ai-0.3.83.dist-info}/METADATA +1 -1
  166. {inspect_ai-0.3.82.dist-info → inspect_ai-0.3.83.dist-info}/RECORD +179 -153
  167. inspect_ai/_view/www/src/samples/scores/SampleScoreView.tsx +0 -167
  168. /inspect_ai/{solver → agent}/_bridge/__init__.py +0 -0
  169. /inspect_ai/{solver/_human_agent → agent/_human}/__init__.py +0 -0
  170. /inspect_ai/{solver/_human_agent → agent/_human}/commands/command.py +0 -0
  171. /inspect_ai/{solver/_human_agent → agent/_human}/commands/instructions.py +0 -0
  172. /inspect_ai/{solver/_human_agent → agent/_human}/commands/note.py +0 -0
  173. /inspect_ai/{solver/_human_agent → agent/_human}/commands/status.py +0 -0
  174. /inspect_ai/{solver/_human_agent → agent/_human}/commands/submit.py +0 -0
  175. /inspect_ai/{solver/_human_agent → agent/_human}/panel.py +0 -0
  176. /inspect_ai/{solver/_human_agent → agent/_human}/view.py +0 -0
  177. {inspect_ai-0.3.82.dist-info → inspect_ai-0.3.83.dist-info}/WHEEL +0 -0
  178. {inspect_ai-0.3.82.dist-info → inspect_ai-0.3.83.dist-info}/entry_points.txt +0 -0
  179. {inspect_ai-0.3.82.dist-info → inspect_ai-0.3.83.dist-info}/licenses/LICENSE +0 -0
  180. {inspect_ai-0.3.82.dist-info → inspect_ai-0.3.83.dist-info}/top_level.txt +0 -0
@@ -13,6 +13,7 @@ export type Task = string;
13
13
  export type TaskId = string;
14
14
  export type TaskVersion = number;
15
15
  export type TaskFile = string | null;
16
+ export type TaskRegistryName = string | null;
16
17
  export type Solver = string | null;
17
18
  export type SolverArgs = {} | null;
18
19
  export type Tags = string[] | null;
@@ -161,6 +162,7 @@ export type Content =
161
162
  )[];
162
163
  export type Type3 = "text";
163
164
  export type Text = string;
165
+ export type Refusal = boolean | null;
164
166
  export type Type4 = "reasoning";
165
167
  export type Reasoning = string;
166
168
  export type Signature = string | null;
@@ -204,12 +206,11 @@ export type Role2 = "assistant";
204
206
  export type ToolCalls = ToolCall[] | null;
205
207
  export type Id4 = string;
206
208
  export type Function = string;
207
- export type Type8 = string;
208
- export type InternalName = string | null;
209
209
  export type ParseError = string | null;
210
210
  export type Title = string | null;
211
211
  export type Format2 = "text" | "markdown";
212
212
  export type Content3 = string;
213
+ export type Model1 = string | null;
213
214
  export type Id5 = string | null;
214
215
  export type Content4 =
215
216
  | string
@@ -224,8 +225,7 @@ export type Source3 = ("input" | "generate") | null;
224
225
  export type Role3 = "tool";
225
226
  export type ToolCallId1 = string | null;
226
227
  export type Function1 = string | null;
227
- export type InternalName1 = string | null;
228
- export type Type9 =
228
+ export type Type8 =
229
229
  | "parsing"
230
230
  | "timeout"
231
231
  | "unicode_decode"
@@ -246,7 +246,7 @@ export type Messages = (
246
246
  | ChatMessageAssistant
247
247
  | ChatMessageTool
248
248
  )[];
249
- export type Model1 = string;
249
+ export type Model2 = string;
250
250
  export type StopReason =
251
251
  | "stop"
252
252
  | "max_tokens"
@@ -305,7 +305,7 @@ export type Timestamp1 = string;
305
305
  export type WorkingStart1 = number;
306
306
  export type Pending1 = boolean | null;
307
307
  export type Event1 = "sample_limit";
308
- export type Type10 =
308
+ export type Type9 =
309
309
  | "message"
310
310
  | "time"
311
311
  | "working"
@@ -345,7 +345,7 @@ export type Timestamp5 = string;
345
345
  export type WorkingStart5 = number;
346
346
  export type Pending5 = boolean | null;
347
347
  export type Event5 = "model";
348
- export type Model2 = string;
348
+ export type Model3 = string;
349
349
  export type Input3 = (
350
350
  | ChatMessageSystem
351
351
  | ChatMessageUser
@@ -354,7 +354,7 @@ export type Input3 = (
354
354
  )[];
355
355
  export type Name8 = string;
356
356
  export type Description2 = string;
357
- export type Type11 = "object";
357
+ export type Type10 = "object";
358
358
  export type Required1 = string[];
359
359
  export type Additionalproperties1 = boolean;
360
360
  export type Tools1 = ToolInfo[];
@@ -369,10 +369,9 @@ export type Timestamp6 = string;
369
369
  export type WorkingStart6 = number;
370
370
  export type Pending6 = boolean | null;
371
371
  export type Event6 = "tool";
372
- export type Type12 = "function";
372
+ export type Type11 = "function";
373
373
  export type Id7 = string;
374
374
  export type Function2 = string;
375
- export type InternalName2 = string | null;
376
375
  export type Result1 =
377
376
  | string
378
377
  | number
@@ -448,14 +447,14 @@ export type WorkingStart13 = number;
448
447
  export type Pending13 = boolean | null;
449
448
  export type Event13 = "step";
450
449
  export type Action1 = "begin" | "end";
451
- export type Type13 = string | null;
450
+ export type Type12 = string | null;
452
451
  export type Name11 = string;
453
452
  export type Timestamp14 = string;
454
453
  export type WorkingStart14 = number;
455
454
  export type Pending14 = boolean | null;
456
455
  export type Event14 = "subtask";
457
456
  export type Name12 = string;
458
- export type Type14 = string | null;
457
+ export type Type13 = string | null;
459
458
  export type Events2 = (
460
459
  | SampleInitEvent
461
460
  | SampleLimitEvent
@@ -494,6 +493,8 @@ export type Events1 = (
494
493
  )[];
495
494
  export type Completed3 = string | null;
496
495
  export type WorkingTime2 = number | null;
496
+ export type Agent = string | null;
497
+ export type Failed = boolean | null;
497
498
  export type Events = (
498
499
  | SampleInitEvent
499
500
  | SampleLimitEvent
@@ -514,7 +515,7 @@ export type Events = (
514
515
  export type TotalTime = number | null;
515
516
  export type WorkingTime3 = number | null;
516
517
  export type Uuid = string | null;
517
- export type Type15 =
518
+ export type Type14 =
518
519
  | "context"
519
520
  | "time"
520
521
  | "working"
@@ -566,6 +567,7 @@ export interface EvalSpec {
566
567
  task_id: TaskId;
567
568
  task_version: TaskVersion;
568
569
  task_file: TaskFile;
570
+ task_registry_name: TaskRegistryName;
569
571
  task_attribs: TaskAttribs;
570
572
  task_args: TaskArgs;
571
573
  solver: Solver;
@@ -847,6 +849,7 @@ export interface ChatMessageSystem {
847
849
  id: Id1;
848
850
  content: Content;
849
851
  source: Source;
852
+ internal: unknown;
850
853
  role: Role;
851
854
  }
852
855
  /**
@@ -855,6 +858,7 @@ export interface ChatMessageSystem {
855
858
  export interface ContentText {
856
859
  type: Type3;
857
860
  text: Text;
861
+ refusal: Refusal;
858
862
  }
859
863
  /**
860
864
  * Reasoning content.
@@ -898,6 +902,7 @@ export interface ChatMessageUser {
898
902
  id: Id2;
899
903
  content: Content1;
900
904
  source: Source1;
905
+ internal: unknown;
901
906
  role: Role1;
902
907
  tool_call_id: ToolCallId;
903
908
  }
@@ -908,15 +913,16 @@ export interface ChatMessageAssistant {
908
913
  id: Id3;
909
914
  content: Content2;
910
915
  source: Source2;
916
+ internal: unknown;
911
917
  role: Role2;
912
918
  tool_calls: ToolCalls;
919
+ model: Model1;
913
920
  }
914
921
  export interface ToolCall {
915
922
  id: Id4;
916
923
  function: Function;
917
924
  arguments: Arguments;
918
- type: Type8;
919
- internal_name: InternalName;
925
+ internal: unknown;
920
926
  parse_error: ParseError;
921
927
  view: ToolCallContent | null;
922
928
  }
@@ -936,21 +942,21 @@ export interface ChatMessageTool {
936
942
  id: Id5;
937
943
  content: Content4;
938
944
  source: Source3;
945
+ internal: unknown;
939
946
  role: Role3;
940
947
  tool_call_id: ToolCallId1;
941
948
  function: Function1;
942
- internal_name: InternalName1;
943
949
  error: ToolCallError | null;
944
950
  }
945
951
  export interface ToolCallError {
946
- type: Type9;
952
+ type: Type8;
947
953
  message: Message1;
948
954
  }
949
955
  /**
950
956
  * Output from model generation.
951
957
  */
952
958
  export interface ModelOutput {
953
- model: Model1;
959
+ model: Model2;
954
960
  choices: Choices1;
955
961
  usage: ModelUsage1 | null;
956
962
  time: Time;
@@ -1031,7 +1037,7 @@ export interface SampleLimitEvent {
1031
1037
  working_start: WorkingStart1;
1032
1038
  pending: Pending1;
1033
1039
  event: Event1;
1034
- type: Type10;
1040
+ type: Type9;
1035
1041
  message: Message2;
1036
1042
  limit: Limit1;
1037
1043
  }
@@ -1094,7 +1100,7 @@ export interface ModelEvent {
1094
1100
  working_start: WorkingStart5;
1095
1101
  pending: Pending5;
1096
1102
  event: Event5;
1097
- model: Model2;
1103
+ model: Model3;
1098
1104
  input: Input3;
1099
1105
  tools: Tools1;
1100
1106
  tool_choice: ToolChoice;
@@ -1141,7 +1147,7 @@ export interface ToolInfo {
1141
1147
  * Description of tool parameters object in JSON Schema format.
1142
1148
  */
1143
1149
  export interface ToolParams {
1144
- type: Type11;
1150
+ type: Type10;
1145
1151
  properties: Properties1;
1146
1152
  required: Required1;
1147
1153
  additionalProperties: Additionalproperties1;
@@ -1204,11 +1210,11 @@ export interface ToolEvent {
1204
1210
  working_start: WorkingStart6;
1205
1211
  pending: Pending6;
1206
1212
  event: Event6;
1207
- type: Type12;
1213
+ type: Type11;
1208
1214
  id: Id7;
1209
1215
  function: Function2;
1210
1216
  arguments: Arguments1;
1211
- internal_name: InternalName2;
1217
+ internal: unknown;
1212
1218
  view: ToolCallContent | null;
1213
1219
  result: Result1;
1214
1220
  truncated: Truncated;
@@ -1216,6 +1222,8 @@ export interface ToolEvent {
1216
1222
  events: Events1;
1217
1223
  completed: Completed3;
1218
1224
  working_time: WorkingTime2;
1225
+ agent: Agent;
1226
+ failed: Failed;
1219
1227
  }
1220
1228
  export interface Arguments1 {
1221
1229
  [k: string]: JsonValue;
@@ -1324,7 +1332,7 @@ export interface StepEvent {
1324
1332
  pending: Pending13;
1325
1333
  event: Event13;
1326
1334
  action: Action1;
1327
- type: Type13;
1335
+ type: Type12;
1328
1336
  name: Name11;
1329
1337
  }
1330
1338
  /**
@@ -1336,7 +1344,7 @@ export interface SubtaskEvent {
1336
1344
  pending: Pending14;
1337
1345
  event: Event14;
1338
1346
  name: Name12;
1339
- type: Type14;
1347
+ type: Type13;
1340
1348
  input: Input5;
1341
1349
  result: Result2;
1342
1350
  events: Events2;
@@ -1357,7 +1365,7 @@ export interface Attachments {
1357
1365
  * Limit encontered by sample.
1358
1366
  */
1359
1367
  export interface EvalSampleLimit {
1360
- type: Type15;
1368
+ type: Type14;
1361
1369
  limit: Limit2;
1362
1370
  }
1363
1371
  /**
@@ -0,0 +1,21 @@
1
+ declare module "markdown-it-katex" {
2
+ import MarkdownIt from "markdown-it";
3
+
4
+ interface KatexOptions {
5
+ throwOnError?: boolean;
6
+ errorColor?: string;
7
+ macros?: Record<string, string>;
8
+ fleqn?: boolean;
9
+ trust?: boolean;
10
+ output?: "html" | "htmlAndMathml" | "mathml";
11
+ minRuleThickness?: number;
12
+ colorIsTextColor?: boolean;
13
+ maxSize?: number;
14
+ maxExpand?: number;
15
+ strict?: boolean | string | Function;
16
+ }
17
+
18
+ const markdownItKatex: (md: MarkdownIt, options?: KatexOptions) => void;
19
+
20
+ export default markdownItKatex;
21
+ }
@@ -1,43 +1,110 @@
1
1
  export const asyncJsonParse = async (text: string): Promise<any> => {
2
+ // Encode the input text
2
3
  const encoder = new TextEncoder();
3
4
  const encodedText = encoder.encode(text);
5
+
6
+ // Create a worker from the inline script
4
7
  const blob = new Blob([kWorkerCode], { type: "application/javascript" });
5
8
  const blobURL = URL.createObjectURL(blob);
6
9
  const worker = new Worker(blobURL);
10
+
7
11
  try {
8
12
  const result = new Promise((resolve, reject) => {
9
13
  worker.onmessage = function (e) {
10
14
  if (e.data.success) {
11
- resolve(e.data.result);
15
+ if (e.data.serialized) {
16
+ // Deserialize the result if it was sent as a transferable
17
+ const decoder = new TextDecoder();
18
+ const resultString = decoder.decode(e.data.result);
19
+ resolve(JSON.parse(resultString));
20
+ } else {
21
+ resolve(e.data.result);
22
+ }
12
23
  } else {
13
- reject(new Error(e.data.error));
24
+ const error = new Error(e.data.error);
25
+ if (e.data.stack) {
26
+ error.stack = e.data.stack;
27
+ }
28
+ reject(error);
14
29
  }
15
30
  };
31
+
16
32
  worker.onerror = function (error) {
17
- reject(new Error(error.message));
33
+ reject(new Error(`Worker error: ${error.message}`));
18
34
  };
19
35
  });
20
- worker.postMessage({ scriptContent: kJson5ScriptBase64, encodedText }, [
21
- encodedText.buffer,
22
- ]);
36
+
37
+ // Transfer the encoded text buffer to the worker
38
+ worker.postMessage(
39
+ {
40
+ scriptContent: kJson5ScriptBase64,
41
+ encodedText,
42
+ },
43
+ [encodedText.buffer],
44
+ );
45
+
23
46
  return await result;
24
47
  } finally {
48
+ // Clean up resources
25
49
  worker.terminate();
26
50
  URL.revokeObjectURL(blobURL);
27
51
  }
28
52
  };
29
53
 
30
54
  const kWorkerCode = `
55
+ // Store the JSON5 parser once loaded
56
+ let JSON5 = null;
57
+
31
58
  self.onmessage = function (e) {
32
- eval(atob(e.data.scriptContent));
33
- const { encodedText } = e.data;
34
- const decoder = new TextDecoder();
35
- const text = decoder.decode(encodedText);
59
+ const { encodedText, scriptContent } = e.data;
60
+
36
61
  try {
62
+ // Only load the JSON5 script if we haven't done so yet
63
+ if (!JSON5) {
64
+ const script = atob(scriptContent);
65
+
66
+ new Function(script)();
67
+ // Verify it was loaded properly
68
+ if (typeof self.JSON5 !== 'object' || typeof self.JSON5.parse !== 'function') {
69
+ throw new Error('Failed to initialize JSON5 parser');
70
+ }
71
+ JSON5 = self.JSON5;
72
+ }
73
+
74
+ // Decode the text using TextDecoder
75
+ const decoder = new TextDecoder();
76
+ const text = decoder.decode(encodedText);
77
+
78
+ // Parse with JSON5
37
79
  const result = JSON5.parse(text);
38
- postMessage({ success: true, result });
80
+
81
+ if (result && typeof result === 'object' &&
82
+ (Array.isArray(result) ? result.length > 10000 : Object.keys(result).length > 10000)) {
83
+
84
+ // Large result, use transferrable object
85
+ const resultString = JSON.stringify(result);
86
+ const encoder = new TextEncoder();
87
+ const serialized = encoder.encode(resultString);
88
+
89
+ postMessage({
90
+ success: true,
91
+ serialized: true,
92
+ result: serialized
93
+ }, [serialized.buffer]);
94
+ } else {
95
+ // Small results, send directly
96
+ postMessage({
97
+ success: true,
98
+ serialized: false,
99
+ result: result
100
+ });
101
+ }
39
102
  } catch (err) {
40
- postMessage({ success: false, error: err.message });
103
+ postMessage({
104
+ success: false,
105
+ error: err.message,
106
+ stack: err.stack || ''
107
+ });
41
108
  }
42
109
  };`;
43
110
 
@@ -1,6 +1,6 @@
1
1
  import { ApplicationIcons } from "../appearance/icons";
2
2
  import { ToolButton } from "../components/ToolButton";
3
- import { SampleTools } from "../samples/SamplesTools";
3
+ import { SampleTools, ScoreFilterTools } from "../samples/SamplesTools";
4
4
  import { JsonTab } from "./tabs/JsonTab";
5
5
  import { SamplesTab } from "./tabs/SamplesTab";
6
6
 
@@ -131,22 +131,24 @@ export const useSamplesTabConfig = (
131
131
  running: evalStatus === "started",
132
132
  },
133
133
  tools: () =>
134
- totalSampleCount === 1 || !samplesDescriptor
134
+ !samplesDescriptor
135
135
  ? undefined
136
- : [
137
- <SampleTools
138
- samples={sampleSummaries || []}
139
- key="sample-tools"
140
- />,
141
- evalStatus === "started" && !streamSamples && (
142
- <ToolButton
143
- key="refresh"
144
- label="Refresh"
145
- icon={ApplicationIcons.refresh}
146
- onClick={refreshLog}
147
- />
148
- ),
149
- ].filter(Boolean),
136
+ : totalSampleCount === 1
137
+ ? [<ScoreFilterTools />]
138
+ : [
139
+ <SampleTools
140
+ samples={sampleSummaries || []}
141
+ key="sample-tools"
142
+ />,
143
+ evalStatus === "started" && !streamSamples && (
144
+ <ToolButton
145
+ key="refresh"
146
+ label="Refresh"
147
+ icon={ApplicationIcons.refresh}
148
+ onClick={refreshLog}
149
+ />
150
+ ),
151
+ ],
150
152
  };
151
153
  }, [
152
154
  evalStatus,
@@ -87,3 +87,19 @@
87
87
  padding: 0 0.2em;
88
88
  justify-content: center;
89
89
  }
90
+
91
+ .moreButton {
92
+ margin-top: 0.5em;
93
+ margin-bottom: 0.5em;
94
+ padding-right: 0;
95
+ }
96
+
97
+ .metricsSummary {
98
+ display: flex;
99
+ flex-direction: column;
100
+ align-items: flex-end;
101
+ }
102
+
103
+ .modalScores {
104
+ padding-bottom: 4em;
105
+ }
@@ -1,10 +1,14 @@
1
1
  import clsx from "clsx";
2
2
  import { FC } from "react";
3
3
  import { RunningMetric } from "../../api/types";
4
+ import { LinkButton } from "../../components/LinkButton";
5
+ import { Modal } from "../../components/Modal";
6
+ import { useProperty } from "../../state/hooks";
4
7
  import { Scores } from "../../types/log";
5
8
  import { formatPrettyDecimal } from "../../utils/format";
6
9
  import { metricDisplayName } from "../utils";
7
10
  import styles from "./ResultsPanel.module.css";
11
+ import { ScoreGrid } from "./ScoreGrid";
8
12
 
9
13
  export interface ResultsMetric {
10
14
  name: string;
@@ -82,6 +86,14 @@ interface ResultsPanelProps {
82
86
  }
83
87
 
84
88
  export const ResultsPanel: FC<ResultsPanelProps> = ({ scorers }) => {
89
+ const [showing, setShowing] = useProperty(
90
+ "results-panel-metrics",
91
+ "modal-showing",
92
+ {
93
+ defaultValue: false,
94
+ },
95
+ );
96
+
85
97
  if (!scorers || scorers.length === 0) {
86
98
  return undefined;
87
99
  }
@@ -107,23 +119,69 @@ export const ResultsPanel: FC<ResultsPanelProps> = ({ scorers }) => {
107
119
  );
108
120
  } else {
109
121
  const showReducer = scorers.findIndex((score) => !!score.reducer) !== -1;
122
+ const grouped = groupMetrics(scorers);
123
+
124
+ // Try to select metrics with a group size 5 or less, if possible
125
+ let primaryResults = grouped[0];
126
+ if (primaryResults.length > 5) {
127
+ const shorterResults = grouped.find((g) => {
128
+ return g.length <= 5;
129
+ });
130
+ if (shorterResults) {
131
+ primaryResults = shorterResults;
132
+ }
133
+ }
134
+
110
135
  return (
111
- <div className={styles.multiMetricsRows}>
112
- {scorers.map((scorer, index) => {
113
- return (
114
- <MultiScorerMetric
115
- key={`multi-metric-${index}`}
116
- scorer={scorer}
117
- isFirst={index === 0}
118
- showReducer={showReducer}
136
+ <div className={clsx(styles.metricsSummary)}>
137
+ <ScoreGrid scoreGroups={[primaryResults]} showReducer={showReducer} />
138
+ {grouped.length > 1 ? (
139
+ <>
140
+ <Modal
141
+ id="results-metrics"
142
+ showing={showing}
143
+ setShowing={setShowing}
144
+ title={"Scoring Detail"}
145
+ >
146
+ <ScoreGrid
147
+ scoreGroups={grouped}
148
+ showReducer={showReducer}
149
+ className={styles.modalScores}
150
+ striped={false}
151
+ />
152
+ </Modal>
153
+ <LinkButton
154
+ className={styles.moreButton}
155
+ text={"All scoring..."}
156
+ onClick={() => {
157
+ setShowing(true);
158
+ }}
119
159
  />
120
- );
121
- })}
160
+ </>
161
+ ) : undefined}
122
162
  </div>
123
163
  );
124
164
  }
125
165
  };
126
166
 
167
+ const metricsKey = (metrics: ResultsMetric[]): string => {
168
+ const metricKey = metrics.map((m) => m.name).join("");
169
+ return metricKey;
170
+ };
171
+
172
+ const groupMetrics = (scorers: ResultsScorer[]): ResultsScorer[][] => {
173
+ const results: Record<string, ResultsScorer[]> = {};
174
+ scorers.forEach((scorer) => {
175
+ if (scorer.metrics.length > 0) {
176
+ const key = metricsKey(scorer.metrics);
177
+ results[key] = results[key] || [];
178
+
179
+ results[key].push(scorer);
180
+ }
181
+ });
182
+ return Object.values(results);
183
+ };
184
+
127
185
  interface VerticalMetricProps {
128
186
  metric: ResultsMetric;
129
187
  reducer?: string;
@@ -177,64 +235,3 @@ const VerticalMetric: FC<VerticalMetricProps> = ({
177
235
  </div>
178
236
  );
179
237
  };
180
-
181
- interface MultiScorerMetricProps {
182
- scorer: ResultsScorer;
183
- isFirst: boolean;
184
- showReducer: boolean;
185
- }
186
-
187
- const MultiScorerMetric: FC<MultiScorerMetricProps> = ({
188
- scorer,
189
- isFirst,
190
- showReducer,
191
- }) => {
192
- const titleFontClz = "text-size-base";
193
- const reducerFontClz = "text-size-smaller";
194
- const valueFontClz = "text-size-base";
195
-
196
- return (
197
- <div
198
- className={clsx(
199
- styles.multiScorer,
200
- isFirst ? styles.multiScorerIndent : undefined,
201
- )}
202
- >
203
- <div
204
- className={clsx(
205
- titleFontClz,
206
- "text-style-label",
207
- "text-style-secondary",
208
- "multi-score-label",
209
- styles.multiScorerLabel,
210
- )}
211
- >
212
- {scorer.scorer}
213
- </div>
214
- {showReducer ? (
215
- <div
216
- className={clsx(
217
- reducerFontClz,
218
- "text-style-label",
219
- "text-style-secondary",
220
- styles.multiScorerReducer,
221
- )}
222
- >
223
- {scorer.reducer || "default"}
224
- </div>
225
- ) : undefined}
226
- <div className={clsx(valueFontClz, styles.multiScorerValue)}>
227
- {scorer.metrics.map((metric) => {
228
- return (
229
- <div className={styles.multiScoreMetricGrid} key={metric.name}>
230
- <div>{metricDisplayName(metric)}</div>
231
- <div className={styles.multiScorerValueContent}>
232
- {metric.value ? formatPrettyDecimal(metric.value) : undefined}
233
- </div>
234
- </div>
235
- );
236
- })}
237
- </div>
238
- </div>
239
- );
240
- };
@@ -0,0 +1,35 @@
1
+ .table {
2
+ margin-bottom: 0;
3
+ }
4
+
5
+ .scorer,
6
+ .value {
7
+ padding-top: 0.2em !important;
8
+ padding-bottom: 0.2em !important;
9
+ }
10
+
11
+ .label,
12
+ .value {
13
+ text-align: center;
14
+ padding-left: 1em;
15
+ padding-right: 1em;
16
+ }
17
+
18
+ .label {
19
+ font-weight: 400;
20
+ padding-left: 1em;
21
+ padding-right: 1em;
22
+ }
23
+
24
+ .scorer {
25
+ font-weight: 400;
26
+ }
27
+
28
+ .groupSeparator {
29
+ padding-top: 2em;
30
+ border-bottom: hidden;
31
+ }
32
+
33
+ .tableBody {
34
+ border-top-color: var(--bs-light-border-subtle);
35
+ }