PyPI - inspect-ai - Versions diffs - 0.3.82__py3-none-any.whl → 0.3.83__py3-none-any.whl - Mend

inspect-ai 0.3.82__py3-none-any.whl → 0.3.83__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (180) hide show

inspect_ai/__init__.py +2 -1
inspect_ai/_display/textual/app.py +14 -3
inspect_ai/_display/textual/display.py +4 -0
inspect_ai/_display/textual/widgets/samples.py +9 -3
inspect_ai/_display/textual/widgets/task_detail.py +3 -4
inspect_ai/_display/textual/widgets/tasks.py +17 -1
inspect_ai/_display/textual/widgets/vscode.py +44 -0
inspect_ai/_eval/eval.py +36 -24
inspect_ai/_eval/evalset.py +17 -18
inspect_ai/_eval/loader.py +34 -11
inspect_ai/_eval/run.py +8 -13
inspect_ai/_eval/score.py +13 -3
inspect_ai/_eval/task/generate.py +8 -9
inspect_ai/_eval/task/log.py +2 -0
inspect_ai/_eval/task/task.py +23 -9
inspect_ai/_util/file.py +13 -0
inspect_ai/_util/json.py +2 -1
inspect_ai/_util/registry.py +1 -0
inspect_ai/_util/vscode.py +37 -0
inspect_ai/_view/www/App.css +6 -0
inspect_ai/_view/www/dist/assets/index.css +304 -128
inspect_ai/_view/www/dist/assets/index.js +47495 -27519
inspect_ai/_view/www/log-schema.json +124 -31
inspect_ai/_view/www/package.json +3 -0
inspect_ai/_view/www/src/App.tsx +12 -0
inspect_ai/_view/www/src/appearance/icons.ts +1 -0
inspect_ai/_view/www/src/components/Card.tsx +6 -4
inspect_ai/_view/www/src/components/LinkButton.module.css +16 -0
inspect_ai/_view/www/src/components/LinkButton.tsx +33 -0
inspect_ai/_view/www/src/components/LiveVirtualList.tsx +1 -1
inspect_ai/_view/www/src/components/MarkdownDiv.tsx +113 -23
inspect_ai/_view/www/src/components/Modal.module.css +38 -0
inspect_ai/_view/www/src/components/Modal.tsx +77 -0
inspect_ai/_view/www/src/plan/DetailStep.module.css +4 -0
inspect_ai/_view/www/src/plan/DetailStep.tsx +6 -3
inspect_ai/_view/www/src/plan/SolverDetailView.module.css +2 -1
inspect_ai/_view/www/src/samples/InlineSampleDisplay.tsx +7 -0
inspect_ai/_view/www/src/samples/SampleDialog.tsx +7 -0
inspect_ai/_view/www/src/samples/SampleDisplay.tsx +11 -34
inspect_ai/_view/www/src/samples/SampleSummaryView.module.css +6 -0
inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +2 -2
inspect_ai/_view/www/src/samples/SamplesTools.tsx +12 -0
inspect_ai/_view/www/src/samples/chat/MessageContent.tsx +2 -0
inspect_ai/_view/www/src/samples/chat/MessageContents.tsx +2 -0
inspect_ai/_view/www/src/samples/chat/messages.ts +3 -1
inspect_ai/_view/www/src/samples/chat/tools/ToolCallView.tsx +1 -0
inspect_ai/_view/www/src/samples/descriptor/samplesDescriptor.tsx +9 -3
inspect_ai/_view/www/src/samples/descriptor/score/BooleanScoreDescriptor.module.css +3 -3
inspect_ai/_view/www/src/samples/descriptor/score/BooleanScoreDescriptor.tsx +1 -1
inspect_ai/_view/www/src/samples/descriptor/score/ObjectScoreDescriptor.module.css +4 -4
inspect_ai/_view/www/src/samples/descriptor/score/ObjectScoreDescriptor.tsx +10 -11
inspect_ai/_view/www/src/samples/list/SampleFooter.module.css +2 -1
inspect_ai/_view/www/src/samples/list/SampleFooter.tsx +7 -1
inspect_ai/_view/www/src/samples/list/SampleList.tsx +25 -8
inspect_ai/_view/www/src/samples/list/SampleRow.tsx +1 -1
inspect_ai/_view/www/src/samples/scores/SampleScores.tsx +11 -22
inspect_ai/_view/www/src/samples/scores/SampleScoresGrid.module.css +38 -0
inspect_ai/_view/www/src/samples/scores/SampleScoresGrid.tsx +118 -0
inspect_ai/_view/www/src/samples/scores/{SampleScoreView.module.css → SampleScoresView.module.css} +10 -1
inspect_ai/_view/www/src/samples/scores/SampleScoresView.tsx +78 -0
inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.tsx +3 -3
inspect_ai/_view/www/src/samples/transcript/ToolEventView.tsx +25 -4
inspect_ai/_view/www/src/samples/transcript/event/EventPanel.tsx +29 -2
inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.tsx +0 -1
inspect_ai/_view/www/src/state/hooks.ts +5 -3
inspect_ai/_view/www/src/state/logPolling.ts +5 -1
inspect_ai/_view/www/src/state/logSlice.ts +10 -0
inspect_ai/_view/www/src/state/samplePolling.ts +4 -1
inspect_ai/_view/www/src/state/sampleSlice.ts +13 -0
inspect_ai/_view/www/src/types/log.d.ts +34 -26
inspect_ai/_view/www/src/types/markdown-it-katex.d.ts +21 -0
inspect_ai/_view/www/src/utils/json-worker.ts +79 -12
inspect_ai/_view/www/src/workspace/WorkSpace.tsx +18 -16
inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.module.css +16 -0
inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.tsx +68 -71
inspect_ai/_view/www/src/workspace/navbar/ScoreGrid.module.css +35 -0
inspect_ai/_view/www/src/workspace/navbar/ScoreGrid.tsx +117 -0
inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.tsx +1 -1
inspect_ai/_view/www/src/workspace/sidebar/Sidebar.module.css +3 -2
inspect_ai/_view/www/src/workspace/tabs/SamplesTab.tsx +18 -0
inspect_ai/_view/www/yarn.lock +94 -1
inspect_ai/agent/__init__.py +36 -0
inspect_ai/agent/_agent.py +268 -0
inspect_ai/agent/_as_solver.py +72 -0
inspect_ai/agent/_as_tool.py +122 -0
inspect_ai/{solver → agent}/_bridge/bridge.py +23 -37
inspect_ai/{solver → agent}/_bridge/patch.py +9 -8
inspect_ai/agent/_filter.py +46 -0
inspect_ai/agent/_handoff.py +93 -0
inspect_ai/{solver/_human_agent → agent/_human}/agent.py +11 -12
inspect_ai/{solver/_human_agent → agent/_human}/commands/__init__.py +2 -3
inspect_ai/{solver/_human_agent → agent/_human}/commands/clock.py +3 -1
inspect_ai/{solver/_human_agent → agent/_human}/commands/score.py +5 -5
inspect_ai/{solver/_human_agent → agent/_human}/install.py +6 -3
inspect_ai/{solver/_human_agent → agent/_human}/service.py +7 -3
inspect_ai/{solver/_human_agent → agent/_human}/state.py +5 -5
inspect_ai/agent/_react.py +241 -0
inspect_ai/agent/_run.py +36 -0
inspect_ai/agent/_types.py +81 -0
inspect_ai/log/_log.py +11 -2
inspect_ai/log/_transcript.py +13 -9
inspect_ai/model/__init__.py +7 -1
inspect_ai/model/_call_tools.py +256 -52
inspect_ai/model/_chat_message.py +7 -4
inspect_ai/model/_conversation.py +13 -62
inspect_ai/model/_display.py +85 -0
inspect_ai/model/_model.py +113 -14
inspect_ai/model/_model_output.py +14 -9
inspect_ai/model/_openai.py +16 -4
inspect_ai/model/_openai_computer_use.py +162 -0
inspect_ai/model/_openai_responses.py +319 -165
inspect_ai/model/_providers/anthropic.py +20 -21
inspect_ai/model/_providers/azureai.py +24 -13
inspect_ai/model/_providers/bedrock.py +1 -7
inspect_ai/model/_providers/cloudflare.py +3 -3
inspect_ai/model/_providers/goodfire.py +2 -6
inspect_ai/model/_providers/google.py +11 -10
inspect_ai/model/_providers/groq.py +6 -3
inspect_ai/model/_providers/hf.py +7 -3
inspect_ai/model/_providers/mistral.py +7 -10
inspect_ai/model/_providers/openai.py +47 -17
inspect_ai/model/_providers/openai_o1.py +11 -4
inspect_ai/model/_providers/openai_responses.py +12 -14
inspect_ai/model/_providers/providers.py +2 -2
inspect_ai/model/_providers/together.py +12 -2
inspect_ai/model/_providers/util/chatapi.py +7 -2
inspect_ai/model/_providers/util/hf_handler.py +4 -2
inspect_ai/model/_providers/util/llama31.py +4 -2
inspect_ai/model/_providers/vertex.py +11 -9
inspect_ai/model/_providers/vllm.py +4 -4
inspect_ai/scorer/__init__.py +2 -0
inspect_ai/scorer/_metrics/__init__.py +2 -0
inspect_ai/scorer/_metrics/grouped.py +84 -0
inspect_ai/scorer/_score.py +26 -6
inspect_ai/solver/__init__.py +2 -2
inspect_ai/solver/_basic_agent.py +22 -9
inspect_ai/solver/_bridge.py +31 -0
inspect_ai/solver/_chain.py +20 -12
inspect_ai/solver/_fork.py +5 -1
inspect_ai/solver/_human_agent.py +52 -0
inspect_ai/solver/_prompt.py +3 -1
inspect_ai/solver/_run.py +59 -0
inspect_ai/solver/_solver.py +14 -4
inspect_ai/solver/_task_state.py +5 -3
inspect_ai/tool/_tool_call.py +15 -8
inspect_ai/tool/_tool_def.py +17 -12
inspect_ai/tool/_tool_support_helpers.py +2 -2
inspect_ai/tool/_tool_with.py +14 -11
inspect_ai/tool/_tools/_bash_session.py +11 -2
inspect_ai/tool/_tools/_computer/_common.py +18 -2
inspect_ai/tool/_tools/_computer/_computer.py +18 -2
inspect_ai/tool/_tools/_computer/_resources/tool/_constants.py +2 -0
inspect_ai/tool/_tools/_computer/_resources/tool/_x11_client.py +17 -0
inspect_ai/tool/_tools/_think.py +1 -1
inspect_ai/tool/_tools/_web_browser/_web_browser.py +100 -61
inspect_ai/util/__init__.py +2 -0
inspect_ai/util/_anyio.py +27 -0
inspect_ai/util/_sandbox/__init__.py +2 -1
inspect_ai/util/_sandbox/context.py +32 -7
inspect_ai/util/_sandbox/docker/cleanup.py +4 -0
inspect_ai/util/_sandbox/docker/compose.py +2 -2
inspect_ai/util/_sandbox/docker/docker.py +12 -1
inspect_ai/util/_store_model.py +30 -7
inspect_ai/util/_subprocess.py +13 -3
{inspect_ai-0.3.82.dist-info → inspect_ai-0.3.83.dist-info}/METADATA +1 -1
{inspect_ai-0.3.82.dist-info → inspect_ai-0.3.83.dist-info}/RECORD +179 -153
inspect_ai/_view/www/src/samples/scores/SampleScoreView.tsx +0 -167
/inspect_ai/{solver → agent}/_bridge/__init__.py +0 -0
/inspect_ai/{solver/_human_agent → agent/_human}/__init__.py +0 -0
/inspect_ai/{solver/_human_agent → agent/_human}/commands/command.py +0 -0
/inspect_ai/{solver/_human_agent → agent/_human}/commands/instructions.py +0 -0
/inspect_ai/{solver/_human_agent → agent/_human}/commands/note.py +0 -0
/inspect_ai/{solver/_human_agent → agent/_human}/commands/status.py +0 -0
/inspect_ai/{solver/_human_agent → agent/_human}/commands/submit.py +0 -0
/inspect_ai/{solver/_human_agent → agent/_human}/panel.py +0 -0
/inspect_ai/{solver/_human_agent → agent/_human}/view.py +0 -0
{inspect_ai-0.3.82.dist-info → inspect_ai-0.3.83.dist-info}/WHEEL +0 -0
{inspect_ai-0.3.82.dist-info → inspect_ai-0.3.83.dist-info}/entry_points.txt +0 -0
{inspect_ai-0.3.82.dist-info → inspect_ai-0.3.83.dist-info}/licenses/LICENSE +0 -0
{inspect_ai-0.3.82.dist-info → inspect_ai-0.3.83.dist-info}/top_level.txt +0 -0

inspect_ai/_view/www/src/types/log.d.ts CHANGED Viewed

@@ -13,6 +13,7 @@ export type Task = string;
 export type TaskId = string;
 export type TaskVersion = number;
 export type TaskFile = string | null;
+export type TaskRegistryName = string | null;
 export type Solver = string | null;
 export type SolverArgs = {} | null;
 export type Tags = string[] | null;
@@ -161,6 +162,7 @@ export type Content =
     )[];
 export type Type3 = "text";
 export type Text = string;
+export type Refusal = boolean | null;
 export type Type4 = "reasoning";
 export type Reasoning = string;
 export type Signature = string | null;
@@ -204,12 +206,11 @@ export type Role2 = "assistant";
 export type ToolCalls = ToolCall[] | null;
 export type Id4 = string;
 export type Function = string;
-export type Type8 = string;
-export type InternalName = string | null;
 export type ParseError = string | null;
 export type Title = string | null;
 export type Format2 = "text" | "markdown";
 export type Content3 = string;
+export type Model1 = string | null;
 export type Id5 = string | null;
 export type Content4 =
   | string
@@ -224,8 +225,7 @@ export type Source3 = ("input" | "generate") | null;
 export type Role3 = "tool";
 export type ToolCallId1 = string | null;
 export type Function1 = string | null;
-export type InternalName1 = string | null;
-export type Type9 =
+export type Type8 =
   | "parsing"
   | "timeout"
   | "unicode_decode"
@@ -246,7 +246,7 @@ export type Messages = (
   | ChatMessageAssistant
   | ChatMessageTool
 )[];
-export type Model1 = string;
+export type Model2 = string;
 export type StopReason =
   | "stop"
   | "max_tokens"
@@ -305,7 +305,7 @@ export type Timestamp1 = string;
 export type WorkingStart1 = number;
 export type Pending1 = boolean | null;
 export type Event1 = "sample_limit";
-export type Type10 =
+export type Type9 =
   | "message"
   | "time"
   | "working"
@@ -345,7 +345,7 @@ export type Timestamp5 = string;
 export type WorkingStart5 = number;
 export type Pending5 = boolean | null;
 export type Event5 = "model";
-export type Model2 = string;
+export type Model3 = string;
 export type Input3 = (
   | ChatMessageSystem
   | ChatMessageUser
@@ -354,7 +354,7 @@ export type Input3 = (
 )[];
 export type Name8 = string;
 export type Description2 = string;
-export type Type11 = "object";
+export type Type10 = "object";
 export type Required1 = string[];
 export type Additionalproperties1 = boolean;
 export type Tools1 = ToolInfo[];
@@ -369,10 +369,9 @@ export type Timestamp6 = string;
 export type WorkingStart6 = number;
 export type Pending6 = boolean | null;
 export type Event6 = "tool";
-export type Type12 = "function";
+export type Type11 = "function";
 export type Id7 = string;
 export type Function2 = string;
-export type InternalName2 = string | null;
 export type Result1 =
   | string
   | number
@@ -448,14 +447,14 @@ export type WorkingStart13 = number;
 export type Pending13 = boolean | null;
 export type Event13 = "step";
 export type Action1 = "begin" | "end";
-export type Type13 = string | null;
+export type Type12 = string | null;
 export type Name11 = string;
 export type Timestamp14 = string;
 export type WorkingStart14 = number;
 export type Pending14 = boolean | null;
 export type Event14 = "subtask";
 export type Name12 = string;
-export type Type14 = string | null;
+export type Type13 = string | null;
 export type Events2 = (
   | SampleInitEvent
   | SampleLimitEvent
@@ -494,6 +493,8 @@ export type Events1 = (
 )[];
 export type Completed3 = string | null;
 export type WorkingTime2 = number | null;
+export type Agent = string | null;
+export type Failed = boolean | null;
 export type Events = (
   | SampleInitEvent
   | SampleLimitEvent
@@ -514,7 +515,7 @@ export type Events = (
 export type TotalTime = number | null;
 export type WorkingTime3 = number | null;
 export type Uuid = string | null;
-export type Type15 =
+export type Type14 =
   | "context"
   | "time"
   | "working"
@@ -566,6 +567,7 @@ export interface EvalSpec {
   task_id: TaskId;
   task_version: TaskVersion;
   task_file: TaskFile;
+  task_registry_name: TaskRegistryName;
   task_attribs: TaskAttribs;
   task_args: TaskArgs;
   solver: Solver;
@@ -847,6 +849,7 @@ export interface ChatMessageSystem {
   id: Id1;
   content: Content;
   source: Source;
+  internal: unknown;
   role: Role;
 }
 /**
@@ -855,6 +858,7 @@ export interface ChatMessageSystem {
 export interface ContentText {
   type: Type3;
   text: Text;
+  refusal: Refusal;
 }
 /**
  * Reasoning content.
@@ -898,6 +902,7 @@ export interface ChatMessageUser {
   id: Id2;
   content: Content1;
   source: Source1;
+  internal: unknown;
   role: Role1;
   tool_call_id: ToolCallId;
 }
@@ -908,15 +913,16 @@ export interface ChatMessageAssistant {
   id: Id3;
   content: Content2;
   source: Source2;
+  internal: unknown;
   role: Role2;
   tool_calls: ToolCalls;
+  model: Model1;
 }
 export interface ToolCall {
   id: Id4;
   function: Function;
   arguments: Arguments;
-  type: Type8;
-  internal_name: InternalName;
+  internal: unknown;
   parse_error: ParseError;
   view: ToolCallContent | null;
 }
@@ -936,21 +942,21 @@ export interface ChatMessageTool {
   id: Id5;
   content: Content4;
   source: Source3;
+  internal: unknown;
   role: Role3;
   tool_call_id: ToolCallId1;
   function: Function1;
-  internal_name: InternalName1;
   error: ToolCallError | null;
 }
 export interface ToolCallError {
-  type: Type9;
+  type: Type8;
   message: Message1;
 }
 /**
  * Output from model generation.
  */
 export interface ModelOutput {
-  model: Model1;
+  model: Model2;
   choices: Choices1;
   usage: ModelUsage1 | null;
   time: Time;
@@ -1031,7 +1037,7 @@ export interface SampleLimitEvent {
   working_start: WorkingStart1;
   pending: Pending1;
   event: Event1;
-  type: Type10;
+  type: Type9;
   message: Message2;
   limit: Limit1;
 }
@@ -1094,7 +1100,7 @@ export interface ModelEvent {
   working_start: WorkingStart5;
   pending: Pending5;
   event: Event5;
-  model: Model2;
+  model: Model3;
   input: Input3;
   tools: Tools1;
   tool_choice: ToolChoice;
@@ -1141,7 +1147,7 @@ export interface ToolInfo {
  * Description of tool parameters object in JSON Schema format.
  */
 export interface ToolParams {
-  type: Type11;
+  type: Type10;
   properties: Properties1;
   required: Required1;
   additionalProperties: Additionalproperties1;
@@ -1204,11 +1210,11 @@ export interface ToolEvent {
   working_start: WorkingStart6;
   pending: Pending6;
   event: Event6;
-  type: Type12;
+  type: Type11;
   id: Id7;
   function: Function2;
   arguments: Arguments1;
-  internal_name: InternalName2;
+  internal: unknown;
   view: ToolCallContent | null;
   result: Result1;
   truncated: Truncated;
@@ -1216,6 +1222,8 @@ export interface ToolEvent {
   events: Events1;
   completed: Completed3;
   working_time: WorkingTime2;
+  agent: Agent;
+  failed: Failed;
 }
 export interface Arguments1 {
   [k: string]: JsonValue;
@@ -1324,7 +1332,7 @@ export interface StepEvent {
   pending: Pending13;
   event: Event13;
   action: Action1;
-  type: Type13;
+  type: Type12;
   name: Name11;
 }
 /**
@@ -1336,7 +1344,7 @@ export interface SubtaskEvent {
   pending: Pending14;
   event: Event14;
   name: Name12;
-  type: Type14;
+  type: Type13;
   input: Input5;
   result: Result2;
   events: Events2;
@@ -1357,7 +1365,7 @@ export interface Attachments {
  * Limit encontered by sample.
  */
 export interface EvalSampleLimit {
-  type: Type15;
+  type: Type14;
   limit: Limit2;
 }
 /**

inspect_ai/_view/www/src/types/markdown-it-katex.d.ts ADDED Viewed

@@ -0,0 +1,21 @@
+declare module "markdown-it-katex" {
+  import MarkdownIt from "markdown-it";
+  interface KatexOptions {
+    throwOnError?: boolean;
+    errorColor?: string;
+    macros?: Record<string, string>;
+    fleqn?: boolean;
+    trust?: boolean;
+    output?: "html" | "htmlAndMathml" | "mathml";
+    minRuleThickness?: number;
+    colorIsTextColor?: boolean;
+    maxSize?: number;
+    maxExpand?: number;
+    strict?: boolean | string | Function;
+  }
+  const markdownItKatex: (md: MarkdownIt, options?: KatexOptions) => void;
+  export default markdownItKatex;
+}

inspect_ai/_view/www/src/utils/json-worker.ts CHANGED Viewed

@@ -1,43 +1,110 @@
 export const asyncJsonParse = async (text: string): Promise<any> => {
+  // Encode the input text
   const encoder = new TextEncoder();
   const encodedText = encoder.encode(text);
+  // Create a worker from the inline script
   const blob = new Blob([kWorkerCode], { type: "application/javascript" });
   const blobURL = URL.createObjectURL(blob);
   const worker = new Worker(blobURL);
   try {
     const result = new Promise((resolve, reject) => {
       worker.onmessage = function (e) {
         if (e.data.success) {
-          resolve(e.data.result);
+          if (e.data.serialized) {
+            // Deserialize the result if it was sent as a transferable
+            const decoder = new TextDecoder();
+            const resultString = decoder.decode(e.data.result);
+            resolve(JSON.parse(resultString));
+          } else {
+            resolve(e.data.result);
+          }
         } else {
-          reject(new Error(e.data.error));
+          const error = new Error(e.data.error);
+          if (e.data.stack) {
+            error.stack = e.data.stack;
+          }
+          reject(error);
         }
       };
       worker.onerror = function (error) {
-        reject(new Error(error.message));
+        reject(new Error(`Worker error: ${error.message}`));
       };
     });
-    worker.postMessage({ scriptContent: kJson5ScriptBase64, encodedText }, [
-      encodedText.buffer,
-    ]);
+    // Transfer the encoded text buffer to the worker
+    worker.postMessage(
+      {
+        scriptContent: kJson5ScriptBase64,
+        encodedText,
+      },
+      [encodedText.buffer],
+    );
     return await result;
   } finally {
+    // Clean up resources
     worker.terminate();
     URL.revokeObjectURL(blobURL);
   }
 };
 const kWorkerCode = `
+// Store the JSON5 parser once loaded
+let JSON5 = null;
 self.onmessage = function (e) {
-  eval(atob(e.data.scriptContent));
-  const { encodedText } = e.data;
-  const decoder = new TextDecoder();
-  const text = decoder.decode(encodedText);
+  const { encodedText, scriptContent } = e.data;
   try {
+    // Only load the JSON5 script if we haven't done so yet
+    if (!JSON5) {
+      const script = atob(scriptContent);
+      new Function(script)();
+      // Verify it was loaded properly
+      if (typeof self.JSON5 !== 'object' || typeof self.JSON5.parse !== 'function') {
+        throw new Error('Failed to initialize JSON5 parser');
+      }
+      JSON5 = self.JSON5;
+    }
+    // Decode the text using TextDecoder
+    const decoder = new TextDecoder();
+    const text = decoder.decode(encodedText);
+    // Parse with JSON5
     const result = JSON5.parse(text);
-    postMessage({ success: true, result });
+    if (result && typeof result === 'object' &&
+        (Array.isArray(result) ? result.length > 10000 : Object.keys(result).length > 10000)) {
+      // Large result, use transferrable object
+      const resultString = JSON.stringify(result);
+      const encoder = new TextEncoder();
+      const serialized = encoder.encode(resultString);
+      postMessage({
+        success: true,
+        serialized: true,
+        result: serialized
+      }, [serialized.buffer]);
+    } else {
+      // Small results, send directly
+      postMessage({
+        success: true,
+        serialized: false,
+        result: result
+      });
+    }
   } catch (err) {
-    postMessage({ success: false, error: err.message });
+    postMessage({
+      success: false,
+      error: err.message,
+      stack: err.stack || ''
+    });
   }
 };`;

inspect_ai/_view/www/src/workspace/WorkSpace.tsx CHANGED Viewed

@@ -1,6 +1,6 @@
 import { ApplicationIcons } from "../appearance/icons";
 import { ToolButton } from "../components/ToolButton";
-import { SampleTools } from "../samples/SamplesTools";
+import { SampleTools, ScoreFilterTools } from "../samples/SamplesTools";
 import { JsonTab } from "./tabs/JsonTab";
 import { SamplesTab } from "./tabs/SamplesTab";
@@ -131,22 +131,24 @@ export const useSamplesTabConfig = (
         running: evalStatus === "started",
       },
       tools: () =>
-        totalSampleCount === 1 || !samplesDescriptor
+        !samplesDescriptor
           ? undefined
-          : [
-              <SampleTools
-                samples={sampleSummaries || []}
-                key="sample-tools"
-              />,
-              evalStatus === "started" && !streamSamples && (
-                <ToolButton
-                  key="refresh"
-                  label="Refresh"
-                  icon={ApplicationIcons.refresh}
-                  onClick={refreshLog}
-                />
-              ),
-            ].filter(Boolean),
+          : totalSampleCount === 1
+            ? [<ScoreFilterTools />]
+            : [
+                <SampleTools
+                  samples={sampleSummaries || []}
+                  key="sample-tools"
+                />,
+                evalStatus === "started" && !streamSamples && (
+                  <ToolButton
+                    key="refresh"
+                    label="Refresh"
+                    icon={ApplicationIcons.refresh}
+                    onClick={refreshLog}
+                  />
+                ),
+              ],
     };
   }, [
     evalStatus,

inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.module.css CHANGED Viewed

@@ -87,3 +87,19 @@
   padding: 0 0.2em;
   justify-content: center;
 }
+.moreButton {
+  margin-top: 0.5em;
+  margin-bottom: 0.5em;
+  padding-right: 0;
+}
+.metricsSummary {
+  display: flex;
+  flex-direction: column;
+  align-items: flex-end;
+}
+.modalScores {
+  padding-bottom: 4em;
+}

inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.tsx CHANGED Viewed

@@ -1,10 +1,14 @@
 import clsx from "clsx";
 import { FC } from "react";
 import { RunningMetric } from "../../api/types";
+import { LinkButton } from "../../components/LinkButton";
+import { Modal } from "../../components/Modal";
+import { useProperty } from "../../state/hooks";
 import { Scores } from "../../types/log";
 import { formatPrettyDecimal } from "../../utils/format";
 import { metricDisplayName } from "../utils";
 import styles from "./ResultsPanel.module.css";
+import { ScoreGrid } from "./ScoreGrid";
 export interface ResultsMetric {
   name: string;
@@ -82,6 +86,14 @@ interface ResultsPanelProps {
 }
 export const ResultsPanel: FC<ResultsPanelProps> = ({ scorers }) => {
+  const [showing, setShowing] = useProperty(
+    "results-panel-metrics",
+    "modal-showing",
+    {
+      defaultValue: false,
+    },
+  );
   if (!scorers || scorers.length === 0) {
     return undefined;
   }
@@ -107,23 +119,69 @@ export const ResultsPanel: FC<ResultsPanelProps> = ({ scorers }) => {
     );
   } else {
     const showReducer = scorers.findIndex((score) => !!score.reducer) !== -1;
+    const grouped = groupMetrics(scorers);
+    // Try to select metrics with a group size 5 or less, if possible
+    let primaryResults = grouped[0];
+    if (primaryResults.length > 5) {
+      const shorterResults = grouped.find((g) => {
+        return g.length <= 5;
+      });
+      if (shorterResults) {
+        primaryResults = shorterResults;
+      }
+    }
     return (
-      <div className={styles.multiMetricsRows}>
-        {scorers.map((scorer, index) => {
-          return (
-            <MultiScorerMetric
-              key={`multi-metric-${index}`}
-              scorer={scorer}
-              isFirst={index === 0}
-              showReducer={showReducer}
+      <div className={clsx(styles.metricsSummary)}>
+        <ScoreGrid scoreGroups={[primaryResults]} showReducer={showReducer} />
+        {grouped.length > 1 ? (
+          <>
+            <Modal
+              id="results-metrics"
+              showing={showing}
+              setShowing={setShowing}
+              title={"Scoring Detail"}
+            >
+              <ScoreGrid
+                scoreGroups={grouped}
+                showReducer={showReducer}
+                className={styles.modalScores}
+                striped={false}
+              />
+            </Modal>
+            <LinkButton
+              className={styles.moreButton}
+              text={"All scoring..."}
+              onClick={() => {
+                setShowing(true);
+              }}
             />
-          );
-        })}
+          </>
+        ) : undefined}
       </div>
     );
   }
 };
+const metricsKey = (metrics: ResultsMetric[]): string => {
+  const metricKey = metrics.map((m) => m.name).join("");
+  return metricKey;
+};
+const groupMetrics = (scorers: ResultsScorer[]): ResultsScorer[][] => {
+  const results: Record<string, ResultsScorer[]> = {};
+  scorers.forEach((scorer) => {
+    if (scorer.metrics.length > 0) {
+      const key = metricsKey(scorer.metrics);
+      results[key] = results[key] || [];
+      results[key].push(scorer);
+    }
+  });
+  return Object.values(results);
+};
 interface VerticalMetricProps {
   metric: ResultsMetric;
   reducer?: string;
@@ -177,64 +235,3 @@ const VerticalMetric: FC<VerticalMetricProps> = ({
     </div>
   );
 };
-interface MultiScorerMetricProps {
-  scorer: ResultsScorer;
-  isFirst: boolean;
-  showReducer: boolean;
-}
-const MultiScorerMetric: FC<MultiScorerMetricProps> = ({
-  scorer,
-  isFirst,
-  showReducer,
-}) => {
-  const titleFontClz = "text-size-base";
-  const reducerFontClz = "text-size-smaller";
-  const valueFontClz = "text-size-base";
-  return (
-    <div
-      className={clsx(
-        styles.multiScorer,
-        isFirst ? styles.multiScorerIndent : undefined,
-      )}
-    >
-      <div
-        className={clsx(
-          titleFontClz,
-          "text-style-label",
-          "text-style-secondary",
-          "multi-score-label",
-          styles.multiScorerLabel,
-        )}
-      >
-        {scorer.scorer}
-      </div>
-      {showReducer ? (
-        <div
-          className={clsx(
-            reducerFontClz,
-            "text-style-label",
-            "text-style-secondary",
-            styles.multiScorerReducer,
-          )}
-        >
-          {scorer.reducer || "default"}
-        </div>
-      ) : undefined}
-      <div className={clsx(valueFontClz, styles.multiScorerValue)}>
-        {scorer.metrics.map((metric) => {
-          return (
-            <div className={styles.multiScoreMetricGrid} key={metric.name}>
-              <div>{metricDisplayName(metric)}</div>
-              <div className={styles.multiScorerValueContent}>
-                {metric.value ? formatPrettyDecimal(metric.value) : undefined}
-              </div>
-            </div>
-          );
-        })}
-      </div>
-    </div>
-  );
-};

inspect_ai/_view/www/src/workspace/navbar/ScoreGrid.module.css ADDED Viewed

@@ -0,0 +1,35 @@
+.table {
+  margin-bottom: 0;
+}
+.scorer,
+.value {
+  padding-top: 0.2em !important;
+  padding-bottom: 0.2em !important;
+}
+.label,
+.value {
+  text-align: center;
+  padding-left: 1em;
+  padding-right: 1em;
+}
+.label {
+  font-weight: 400;
+  padding-left: 1em;
+  padding-right: 1em;
+}
+.scorer {
+  font-weight: 400;
+}
+.groupSeparator {
+  padding-top: 2em;
+  border-bottom: hidden;
+}
+.tableBody {
+  border-top-color: var(--bs-light-border-subtle);
+}

inspect-ai 0.3.82__py3-none-any.whl → 0.3.83__py3-none-any.whl

inspect-ai 0.3.82__py3-none-any.whl → 0.3.83__py3-none-any.whl