inspect-ai 0.3.81__py3-none-any.whl → 0.3.83__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/__init__.py +2 -1
- inspect_ai/_cli/eval.py +35 -2
- inspect_ai/_cli/util.py +44 -1
- inspect_ai/_display/core/config.py +1 -1
- inspect_ai/_display/core/display.py +13 -4
- inspect_ai/_display/core/results.py +1 -1
- inspect_ai/_display/textual/app.py +14 -3
- inspect_ai/_display/textual/display.py +4 -0
- inspect_ai/_display/textual/widgets/samples.py +9 -3
- inspect_ai/_display/textual/widgets/task_detail.py +8 -8
- inspect_ai/_display/textual/widgets/tasks.py +17 -1
- inspect_ai/_display/textual/widgets/vscode.py +44 -0
- inspect_ai/_eval/eval.py +74 -25
- inspect_ai/_eval/evalset.py +22 -18
- inspect_ai/_eval/loader.py +34 -11
- inspect_ai/_eval/run.py +13 -15
- inspect_ai/_eval/score.py +13 -3
- inspect_ai/_eval/task/generate.py +8 -9
- inspect_ai/_eval/task/log.py +55 -6
- inspect_ai/_eval/task/run.py +51 -10
- inspect_ai/_eval/task/task.py +23 -9
- inspect_ai/_util/constants.py +2 -0
- inspect_ai/_util/file.py +30 -1
- inspect_ai/_util/json.py +37 -1
- inspect_ai/_util/registry.py +1 -0
- inspect_ai/_util/vscode.py +37 -0
- inspect_ai/_view/server.py +113 -1
- inspect_ai/_view/www/App.css +7 -1
- inspect_ai/_view/www/dist/assets/index.css +813 -415
- inspect_ai/_view/www/dist/assets/index.js +54475 -32003
- inspect_ai/_view/www/eslint.config.mjs +1 -1
- inspect_ai/_view/www/log-schema.json +137 -31
- inspect_ai/_view/www/node_modules/flatted/python/flatted.py +149 -0
- inspect_ai/_view/www/package.json +11 -2
- inspect_ai/_view/www/src/App.tsx +161 -853
- inspect_ai/_view/www/src/api/api-browser.ts +176 -5
- inspect_ai/_view/www/src/api/api-vscode.ts +75 -1
- inspect_ai/_view/www/src/api/client-api.ts +66 -10
- inspect_ai/_view/www/src/api/jsonrpc.ts +2 -0
- inspect_ai/_view/www/src/api/types.ts +107 -2
- inspect_ai/_view/www/src/appearance/icons.ts +2 -0
- inspect_ai/_view/www/src/components/AsciinemaPlayer.tsx +3 -3
- inspect_ai/_view/www/src/components/Card.tsx +6 -4
- inspect_ai/_view/www/src/components/DownloadPanel.tsx +2 -2
- inspect_ai/_view/www/src/components/ExpandablePanel.tsx +56 -61
- inspect_ai/_view/www/src/components/FindBand.tsx +17 -9
- inspect_ai/_view/www/src/components/HumanBaselineView.tsx +1 -1
- inspect_ai/_view/www/src/components/JsonPanel.tsx +14 -24
- inspect_ai/_view/www/src/components/LargeModal.tsx +2 -35
- inspect_ai/_view/www/src/components/LightboxCarousel.tsx +27 -11
- inspect_ai/_view/www/src/components/LinkButton.module.css +16 -0
- inspect_ai/_view/www/src/components/LinkButton.tsx +33 -0
- inspect_ai/_view/www/src/components/LiveVirtualList.module.css +11 -0
- inspect_ai/_view/www/src/components/LiveVirtualList.tsx +177 -0
- inspect_ai/_view/www/src/components/MarkdownDiv.tsx +116 -26
- inspect_ai/_view/www/src/components/MessageBand.tsx +14 -9
- inspect_ai/_view/www/src/components/Modal.module.css +38 -0
- inspect_ai/_view/www/src/components/Modal.tsx +77 -0
- inspect_ai/_view/www/src/components/MorePopOver.tsx +3 -3
- inspect_ai/_view/www/src/components/NavPills.tsx +20 -8
- inspect_ai/_view/www/src/components/NoContentsPanel.module.css +12 -0
- inspect_ai/_view/www/src/components/NoContentsPanel.tsx +20 -0
- inspect_ai/_view/www/src/components/ProgressBar.module.css +5 -4
- inspect_ai/_view/www/src/components/ProgressBar.tsx +3 -2
- inspect_ai/_view/www/src/components/PulsingDots.module.css +81 -0
- inspect_ai/_view/www/src/components/PulsingDots.tsx +45 -0
- inspect_ai/_view/www/src/components/TabSet.tsx +4 -37
- inspect_ai/_view/www/src/components/ToolButton.tsx +3 -4
- inspect_ai/_view/www/src/index.tsx +26 -94
- inspect_ai/_view/www/src/logfile/remoteLogFile.ts +9 -1
- inspect_ai/_view/www/src/logfile/remoteZipFile.ts +30 -4
- inspect_ai/_view/www/src/metadata/RenderedContent.tsx +4 -6
- inspect_ai/_view/www/src/plan/DetailStep.module.css +4 -0
- inspect_ai/_view/www/src/plan/DetailStep.tsx +6 -3
- inspect_ai/_view/www/src/plan/ScorerDetailView.tsx +1 -1
- inspect_ai/_view/www/src/plan/SolverDetailView.module.css +2 -1
- inspect_ai/_view/www/src/samples/InlineSampleDisplay.module.css +9 -1
- inspect_ai/_view/www/src/samples/InlineSampleDisplay.tsx +74 -28
- inspect_ai/_view/www/src/samples/SampleDialog.tsx +58 -22
- inspect_ai/_view/www/src/samples/SampleDisplay.module.css +4 -0
- inspect_ai/_view/www/src/samples/SampleDisplay.tsx +135 -104
- inspect_ai/_view/www/src/samples/SampleSummaryView.module.css +10 -0
- inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +83 -36
- inspect_ai/_view/www/src/samples/SamplesTools.tsx +35 -30
- inspect_ai/_view/www/src/samples/chat/ChatMessage.tsx +2 -1
- inspect_ai/_view/www/src/samples/chat/ChatMessageRenderer.tsx +1 -1
- inspect_ai/_view/www/src/samples/chat/ChatViewVirtualList.tsx +45 -53
- inspect_ai/_view/www/src/samples/chat/MessageContent.tsx +6 -1
- inspect_ai/_view/www/src/samples/chat/MessageContents.tsx +5 -0
- inspect_ai/_view/www/src/samples/chat/messages.ts +36 -0
- inspect_ai/_view/www/src/samples/chat/tools/ToolCallView.module.css +3 -0
- inspect_ai/_view/www/src/samples/chat/tools/ToolCallView.tsx +11 -1
- inspect_ai/_view/www/src/samples/chat/tools/ToolInput.tsx +22 -46
- inspect_ai/_view/www/src/samples/descriptor/samplesDescriptor.tsx +34 -20
- inspect_ai/_view/www/src/samples/descriptor/score/BooleanScoreDescriptor.module.css +3 -3
- inspect_ai/_view/www/src/samples/descriptor/score/BooleanScoreDescriptor.tsx +1 -1
- inspect_ai/_view/www/src/samples/descriptor/score/ObjectScoreDescriptor.module.css +4 -4
- inspect_ai/_view/www/src/samples/descriptor/score/ObjectScoreDescriptor.tsx +10 -10
- inspect_ai/_view/www/src/samples/descriptor/types.ts +6 -5
- inspect_ai/_view/www/src/samples/list/SampleFooter.module.css +22 -3
- inspect_ai/_view/www/src/samples/list/SampleFooter.tsx +27 -2
- inspect_ai/_view/www/src/samples/list/SampleList.tsx +122 -85
- inspect_ai/_view/www/src/samples/list/SampleRow.module.css +6 -0
- inspect_ai/_view/www/src/samples/list/SampleRow.tsx +28 -15
- inspect_ai/_view/www/src/samples/sample-tools/SelectScorer.tsx +29 -18
- inspect_ai/_view/www/src/samples/sample-tools/SortFilter.tsx +28 -28
- inspect_ai/_view/www/src/samples/sample-tools/sample-filter/SampleFilter.tsx +19 -9
- inspect_ai/_view/www/src/samples/sampleDataAdapter.ts +33 -0
- inspect_ai/_view/www/src/samples/sampleLimit.ts +2 -2
- inspect_ai/_view/www/src/samples/scores/SampleScores.tsx +12 -27
- inspect_ai/_view/www/src/samples/scores/SampleScoresGrid.module.css +38 -0
- inspect_ai/_view/www/src/samples/scores/SampleScoresGrid.tsx +118 -0
- inspect_ai/_view/www/src/samples/scores/{SampleScoreView.module.css → SampleScoresView.module.css} +10 -1
- inspect_ai/_view/www/src/samples/scores/SampleScoresView.tsx +78 -0
- inspect_ai/_view/www/src/samples/transcript/ErrorEventView.tsx +0 -13
- inspect_ai/_view/www/src/samples/transcript/InfoEventView.tsx +0 -13
- inspect_ai/_view/www/src/samples/transcript/InputEventView.tsx +0 -13
- inspect_ai/_view/www/src/samples/transcript/ModelEventView.module.css +4 -0
- inspect_ai/_view/www/src/samples/transcript/ModelEventView.tsx +10 -24
- inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.tsx +0 -13
- inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.tsx +4 -22
- inspect_ai/_view/www/src/samples/transcript/SandboxEventView.tsx +15 -24
- inspect_ai/_view/www/src/samples/transcript/ScoreEventView.tsx +0 -13
- inspect_ai/_view/www/src/samples/transcript/StepEventView.tsx +6 -28
- inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.tsx +24 -34
- inspect_ai/_view/www/src/samples/transcript/ToolEventView.module.css +4 -0
- inspect_ai/_view/www/src/samples/transcript/ToolEventView.tsx +33 -17
- inspect_ai/_view/www/src/samples/transcript/TranscriptView.tsx +197 -338
- inspect_ai/_view/www/src/samples/transcript/TranscriptVirtualListComponent.module.css +16 -0
- inspect_ai/_view/www/src/samples/transcript/TranscriptVirtualListComponent.tsx +44 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventNav.tsx +7 -4
- inspect_ai/_view/www/src/samples/transcript/event/EventPanel.tsx +81 -60
- inspect_ai/_view/www/src/samples/transcript/event/EventProgressPanel.module.css +23 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventProgressPanel.tsx +27 -0
- inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.tsx +29 -1
- inspect_ai/_view/www/src/samples/transcript/state/StateEventView.tsx +102 -72
- inspect_ai/_view/www/src/scoring/utils.ts +87 -0
- inspect_ai/_view/www/src/state/appSlice.ts +244 -0
- inspect_ai/_view/www/src/state/hooks.ts +399 -0
- inspect_ai/_view/www/src/state/logPolling.ts +200 -0
- inspect_ai/_view/www/src/state/logSlice.ts +224 -0
- inspect_ai/_view/www/src/state/logsPolling.ts +118 -0
- inspect_ai/_view/www/src/state/logsSlice.ts +181 -0
- inspect_ai/_view/www/src/state/samplePolling.ts +314 -0
- inspect_ai/_view/www/src/state/sampleSlice.ts +140 -0
- inspect_ai/_view/www/src/state/sampleUtils.ts +21 -0
- inspect_ai/_view/www/src/state/scrolling.ts +206 -0
- inspect_ai/_view/www/src/state/store.ts +168 -0
- inspect_ai/_view/www/src/state/store_filter.ts +84 -0
- inspect_ai/_view/www/src/state/utils.ts +23 -0
- inspect_ai/_view/www/src/storage/index.ts +26 -0
- inspect_ai/_view/www/src/types/log.d.ts +36 -26
- inspect_ai/_view/www/src/types/markdown-it-katex.d.ts +21 -0
- inspect_ai/_view/www/src/types.ts +94 -32
- inspect_ai/_view/www/src/utils/attachments.ts +58 -23
- inspect_ai/_view/www/src/utils/json-worker.ts +79 -12
- inspect_ai/_view/www/src/utils/logger.ts +52 -0
- inspect_ai/_view/www/src/utils/polling.ts +100 -0
- inspect_ai/_view/www/src/utils/react.ts +30 -0
- inspect_ai/_view/www/src/utils/vscode.ts +1 -1
- inspect_ai/_view/www/src/workspace/WorkSpace.tsx +184 -217
- inspect_ai/_view/www/src/workspace/WorkSpaceView.tsx +11 -53
- inspect_ai/_view/www/src/workspace/navbar/Navbar.tsx +8 -18
- inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.module.css +1 -0
- inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.tsx +40 -22
- inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.module.css +16 -1
- inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.tsx +159 -103
- inspect_ai/_view/www/src/workspace/navbar/RunningStatusPanel.module.css +32 -0
- inspect_ai/_view/www/src/workspace/navbar/RunningStatusPanel.tsx +32 -0
- inspect_ai/_view/www/src/workspace/navbar/ScoreGrid.module.css +35 -0
- inspect_ai/_view/www/src/workspace/navbar/ScoreGrid.tsx +117 -0
- inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.tsx +12 -14
- inspect_ai/_view/www/src/workspace/navbar/StatusPanel.tsx +6 -2
- inspect_ai/_view/www/src/workspace/sidebar/LogDirectoryTitleView.tsx +4 -4
- inspect_ai/_view/www/src/workspace/sidebar/Sidebar.module.css +3 -2
- inspect_ai/_view/www/src/workspace/sidebar/Sidebar.tsx +28 -13
- inspect_ai/_view/www/src/workspace/tabs/InfoTab.tsx +5 -10
- inspect_ai/_view/www/src/workspace/tabs/JsonTab.tsx +4 -4
- inspect_ai/_view/www/src/workspace/tabs/RunningNoSamples.module.css +22 -0
- inspect_ai/_view/www/src/workspace/tabs/RunningNoSamples.tsx +19 -0
- inspect_ai/_view/www/src/workspace/tabs/SamplesTab.tsx +128 -115
- inspect_ai/_view/www/src/workspace/tabs/grouping.ts +37 -5
- inspect_ai/_view/www/src/workspace/tabs/types.ts +4 -0
- inspect_ai/_view/www/src/workspace/types.ts +4 -3
- inspect_ai/_view/www/src/workspace/utils.ts +4 -4
- inspect_ai/_view/www/vite.config.js +6 -0
- inspect_ai/_view/www/yarn.lock +464 -355
- inspect_ai/agent/__init__.py +36 -0
- inspect_ai/agent/_agent.py +268 -0
- inspect_ai/agent/_as_solver.py +72 -0
- inspect_ai/agent/_as_tool.py +122 -0
- inspect_ai/{solver → agent}/_bridge/bridge.py +23 -37
- inspect_ai/{solver → agent}/_bridge/patch.py +9 -8
- inspect_ai/agent/_filter.py +46 -0
- inspect_ai/agent/_handoff.py +93 -0
- inspect_ai/{solver/_human_agent → agent/_human}/agent.py +11 -12
- inspect_ai/{solver/_human_agent → agent/_human}/commands/__init__.py +2 -3
- inspect_ai/{solver/_human_agent → agent/_human}/commands/clock.py +3 -1
- inspect_ai/{solver/_human_agent → agent/_human}/commands/score.py +5 -5
- inspect_ai/{solver/_human_agent → agent/_human}/install.py +6 -3
- inspect_ai/{solver/_human_agent → agent/_human}/service.py +7 -3
- inspect_ai/{solver/_human_agent → agent/_human}/state.py +5 -5
- inspect_ai/agent/_react.py +241 -0
- inspect_ai/agent/_run.py +36 -0
- inspect_ai/agent/_types.py +81 -0
- inspect_ai/log/_condense.py +26 -0
- inspect_ai/log/_log.py +17 -5
- inspect_ai/log/_recorders/buffer/__init__.py +14 -0
- inspect_ai/log/_recorders/buffer/buffer.py +30 -0
- inspect_ai/log/_recorders/buffer/database.py +685 -0
- inspect_ai/log/_recorders/buffer/filestore.py +259 -0
- inspect_ai/log/_recorders/buffer/types.py +84 -0
- inspect_ai/log/_recorders/eval.py +2 -11
- inspect_ai/log/_recorders/types.py +30 -0
- inspect_ai/log/_transcript.py +32 -2
- inspect_ai/model/__init__.py +7 -1
- inspect_ai/model/_call_tools.py +257 -52
- inspect_ai/model/_chat_message.py +7 -4
- inspect_ai/model/_conversation.py +13 -62
- inspect_ai/model/_display.py +85 -0
- inspect_ai/model/_generate_config.py +2 -2
- inspect_ai/model/_model.py +114 -14
- inspect_ai/model/_model_output.py +14 -9
- inspect_ai/model/_openai.py +16 -4
- inspect_ai/model/_openai_computer_use.py +162 -0
- inspect_ai/model/_openai_responses.py +319 -165
- inspect_ai/model/_providers/anthropic.py +20 -21
- inspect_ai/model/_providers/azureai.py +24 -13
- inspect_ai/model/_providers/bedrock.py +1 -7
- inspect_ai/model/_providers/cloudflare.py +3 -3
- inspect_ai/model/_providers/goodfire.py +2 -6
- inspect_ai/model/_providers/google.py +11 -10
- inspect_ai/model/_providers/groq.py +6 -3
- inspect_ai/model/_providers/hf.py +7 -3
- inspect_ai/model/_providers/mistral.py +7 -10
- inspect_ai/model/_providers/openai.py +47 -17
- inspect_ai/model/_providers/openai_o1.py +11 -4
- inspect_ai/model/_providers/openai_responses.py +12 -14
- inspect_ai/model/_providers/providers.py +2 -2
- inspect_ai/model/_providers/together.py +12 -2
- inspect_ai/model/_providers/util/chatapi.py +7 -2
- inspect_ai/model/_providers/util/hf_handler.py +4 -2
- inspect_ai/model/_providers/util/llama31.py +4 -2
- inspect_ai/model/_providers/vertex.py +11 -9
- inspect_ai/model/_providers/vllm.py +4 -4
- inspect_ai/scorer/__init__.py +2 -0
- inspect_ai/scorer/_metrics/__init__.py +2 -0
- inspect_ai/scorer/_metrics/grouped.py +84 -0
- inspect_ai/scorer/_score.py +26 -6
- inspect_ai/solver/__init__.py +2 -2
- inspect_ai/solver/_basic_agent.py +22 -9
- inspect_ai/solver/_bridge.py +31 -0
- inspect_ai/solver/_chain.py +20 -12
- inspect_ai/solver/_fork.py +5 -1
- inspect_ai/solver/_human_agent.py +52 -0
- inspect_ai/solver/_prompt.py +3 -1
- inspect_ai/solver/_run.py +59 -0
- inspect_ai/solver/_solver.py +14 -4
- inspect_ai/solver/_task_state.py +5 -3
- inspect_ai/tool/_tool_call.py +15 -8
- inspect_ai/tool/_tool_def.py +17 -12
- inspect_ai/tool/_tool_support_helpers.py +4 -4
- inspect_ai/tool/_tool_with.py +14 -11
- inspect_ai/tool/_tools/_bash_session.py +11 -2
- inspect_ai/tool/_tools/_computer/_common.py +18 -2
- inspect_ai/tool/_tools/_computer/_computer.py +18 -2
- inspect_ai/tool/_tools/_computer/_resources/tool/_constants.py +2 -0
- inspect_ai/tool/_tools/_computer/_resources/tool/_x11_client.py +17 -0
- inspect_ai/tool/_tools/_think.py +1 -1
- inspect_ai/tool/_tools/_web_browser/_web_browser.py +103 -62
- inspect_ai/util/__init__.py +2 -0
- inspect_ai/util/_anyio.py +27 -0
- inspect_ai/util/_sandbox/__init__.py +2 -1
- inspect_ai/util/_sandbox/context.py +32 -7
- inspect_ai/util/_sandbox/docker/cleanup.py +4 -0
- inspect_ai/util/_sandbox/docker/compose.py +2 -2
- inspect_ai/util/_sandbox/docker/docker.py +12 -1
- inspect_ai/util/_store_model.py +30 -7
- inspect_ai/util/_subprocess.py +13 -3
- inspect_ai/util/_subtask.py +1 -0
- {inspect_ai-0.3.81.dist-info → inspect_ai-0.3.83.dist-info}/METADATA +1 -1
- {inspect_ai-0.3.81.dist-info → inspect_ai-0.3.83.dist-info}/RECORD +295 -229
- inspect_ai/_view/www/src/samples/scores/SampleScoreView.tsx +0 -169
- inspect_ai/_view/www/src/samples/transcript/SampleTranscript.tsx +0 -22
- /inspect_ai/{solver → agent}/_bridge/__init__.py +0 -0
- /inspect_ai/{solver/_human_agent → agent/_human}/__init__.py +0 -0
- /inspect_ai/{solver/_human_agent → agent/_human}/commands/command.py +0 -0
- /inspect_ai/{solver/_human_agent → agent/_human}/commands/instructions.py +0 -0
- /inspect_ai/{solver/_human_agent → agent/_human}/commands/note.py +0 -0
- /inspect_ai/{solver/_human_agent → agent/_human}/commands/status.py +0 -0
- /inspect_ai/{solver/_human_agent → agent/_human}/commands/submit.py +0 -0
- /inspect_ai/{solver/_human_agent → agent/_human}/panel.py +0 -0
- /inspect_ai/{solver/_human_agent → agent/_human}/view.py +0 -0
- {inspect_ai-0.3.81.dist-info → inspect_ai-0.3.83.dist-info}/WHEEL +0 -0
- {inspect_ai-0.3.81.dist-info → inspect_ai-0.3.83.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.81.dist-info → inspect_ai-0.3.83.dist-info}/licenses/LICENSE +0 -0
- {inspect_ai-0.3.81.dist-info → inspect_ai-0.3.83.dist-info}/top_level.txt +0 -0
inspect_ai/__init__.py
CHANGED
@@ -10,7 +10,8 @@ from inspect_ai._eval.score import score, score_async
|
|
10
10
|
from inspect_ai._eval.task import Epochs, Task, TaskInfo, task_with
|
11
11
|
from inspect_ai._eval.task.tasks import Tasks
|
12
12
|
from inspect_ai._util.constants import PKG_NAME
|
13
|
-
from inspect_ai.
|
13
|
+
from inspect_ai.agent._human.agent import human_cli
|
14
|
+
from inspect_ai.solver._human_agent import human_agent
|
14
15
|
|
15
16
|
__version__ = importlib_version(PKG_NAME)
|
16
17
|
|
inspect_ai/_cli/eval.py
CHANGED
@@ -10,6 +10,7 @@ from inspect_ai._util.constants import (
|
|
10
10
|
ALL_LOG_LEVELS,
|
11
11
|
DEFAULT_EPOCHS,
|
12
12
|
DEFAULT_LOG_LEVEL_TRANSCRIPT,
|
13
|
+
DEFAULT_LOG_SHARED,
|
13
14
|
DEFAULT_MAX_CONNECTIONS,
|
14
15
|
)
|
15
16
|
from inspect_ai._util.file import filesystem
|
@@ -25,7 +26,12 @@ from .common import (
|
|
25
26
|
common_options,
|
26
27
|
process_common_options,
|
27
28
|
)
|
28
|
-
from .util import
|
29
|
+
from .util import (
|
30
|
+
int_or_bool_flag_callback,
|
31
|
+
parse_cli_args,
|
32
|
+
parse_cli_config,
|
33
|
+
parse_sandbox,
|
34
|
+
)
|
29
35
|
|
30
36
|
MAX_SAMPLES_HELP = "Maximum number of samples to run in parallel (default is running all samples in parallel)"
|
31
37
|
MAX_TASKS_HELP = "Maximum number of tasks to run in parallel (default is 1)"
|
@@ -41,6 +47,7 @@ LOG_IMAGES_HELP = (
|
|
41
47
|
"Include base64 encoded versions of filename or URL based images in the log file."
|
42
48
|
)
|
43
49
|
LOG_BUFFER_HELP = "Number of samples to buffer before writing log file. If not specified, an appropriate default for the format and filesystem is chosen (10 for most all cases, 100 for JSON logs on remote filesystems)."
|
50
|
+
LOG_SHARED_HELP = "Sync sample events to log directory so that users on other systems can see log updates in realtime (defaults to no syncing). If enabled will sync every 10 seconds (or pass a value to sync every `n` seconds)."
|
44
51
|
NO_SCORE_HELP = (
|
45
52
|
"Do not score model output (use the inspect score command to score output later)"
|
46
53
|
)
|
@@ -266,6 +273,15 @@ def eval_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
|
|
266
273
|
@click.option(
|
267
274
|
"--log-buffer", type=int, help=LOG_BUFFER_HELP, envvar="INSPECT_EVAL_LOG_BUFFER"
|
268
275
|
)
|
276
|
+
@click.option(
|
277
|
+
"--log-shared",
|
278
|
+
is_flag=False,
|
279
|
+
flag_value="true",
|
280
|
+
default=None,
|
281
|
+
callback=int_or_bool_flag_callback(DEFAULT_LOG_SHARED),
|
282
|
+
help=LOG_SHARED_HELP,
|
283
|
+
envvar=["INSPECT_LOG_SHARED", "INSPECT_EVAL_LOG_SHARED"],
|
284
|
+
)
|
269
285
|
@click.option(
|
270
286
|
"--no-score",
|
271
287
|
type=bool,
|
@@ -396,7 +412,7 @@ def eval_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
|
|
396
412
|
@click.option(
|
397
413
|
"--reasoning-effort",
|
398
414
|
type=click.Choice(["low", "medium", "high"]),
|
399
|
-
help="Constrains effort on reasoning for reasoning models. Open AI o-series models only.",
|
415
|
+
help="Constrains effort on reasoning for reasoning models (defaults to `medium`). Open AI o-series models only.",
|
400
416
|
envvar="INSPECT_EVAL_REASONING_EFFORT",
|
401
417
|
)
|
402
418
|
@click.option(
|
@@ -503,6 +519,7 @@ def eval_command(
|
|
503
519
|
no_log_samples: bool | None,
|
504
520
|
log_images: bool | None,
|
505
521
|
log_buffer: int | None,
|
522
|
+
log_shared: int | None,
|
506
523
|
no_score: bool | None,
|
507
524
|
no_score_display: bool | None,
|
508
525
|
log_format: Literal["eval", "json"] | None,
|
@@ -556,6 +573,7 @@ def eval_command(
|
|
556
573
|
no_log_samples=no_log_samples,
|
557
574
|
log_images=log_images,
|
558
575
|
log_buffer=log_buffer,
|
576
|
+
log_shared=log_shared,
|
559
577
|
no_score=no_score,
|
560
578
|
no_score_display=no_score_display,
|
561
579
|
is_eval_set=False,
|
@@ -670,6 +688,7 @@ def eval_set_command(
|
|
670
688
|
no_log_samples: bool | None,
|
671
689
|
log_images: bool | None,
|
672
690
|
log_buffer: int | None,
|
691
|
+
log_shared: int | None,
|
673
692
|
no_score: bool | None,
|
674
693
|
no_score_display: bool | None,
|
675
694
|
bundle_dir: str | None,
|
@@ -728,6 +747,7 @@ def eval_set_command(
|
|
728
747
|
no_log_samples=no_log_samples,
|
729
748
|
log_images=log_images,
|
730
749
|
log_buffer=log_buffer,
|
750
|
+
log_shared=log_shared,
|
731
751
|
no_score=no_score,
|
732
752
|
no_score_display=no_score_display,
|
733
753
|
is_eval_set=True,
|
@@ -783,6 +803,7 @@ def eval_exec(
|
|
783
803
|
no_log_samples: bool | None,
|
784
804
|
log_images: bool | None,
|
785
805
|
log_buffer: int | None,
|
806
|
+
log_shared: int | None,
|
786
807
|
no_score: bool | None,
|
787
808
|
no_score_display: bool | None,
|
788
809
|
is_eval_set: bool = False,
|
@@ -865,6 +886,7 @@ def eval_exec(
|
|
865
886
|
log_samples=log_samples,
|
866
887
|
log_images=log_images,
|
867
888
|
log_buffer=log_buffer,
|
889
|
+
log_shared=log_shared,
|
868
890
|
score=score,
|
869
891
|
score_display=score_display,
|
870
892
|
)
|
@@ -1004,6 +1026,15 @@ def parse_comma_separated(value: str | None) -> list[str] | None:
|
|
1004
1026
|
@click.option(
|
1005
1027
|
"--log-buffer", type=int, help=LOG_BUFFER_HELP, envvar="INSPECT_EVAL_LOG_BUFFER"
|
1006
1028
|
)
|
1029
|
+
@click.option(
|
1030
|
+
"--log-shared",
|
1031
|
+
is_flag=False,
|
1032
|
+
flag_value="true",
|
1033
|
+
default=None,
|
1034
|
+
callback=int_or_bool_flag_callback(DEFAULT_LOG_SHARED),
|
1035
|
+
help=LOG_SHARED_HELP,
|
1036
|
+
envvar=["INSPECT_LOG_SHARED", "INSPECT_EVAL_LOG_SHARED"],
|
1037
|
+
)
|
1007
1038
|
@click.option(
|
1008
1039
|
"--no-score",
|
1009
1040
|
type=bool,
|
@@ -1052,6 +1083,7 @@ def eval_retry_command(
|
|
1052
1083
|
no_log_samples: bool | None,
|
1053
1084
|
log_images: bool | None,
|
1054
1085
|
log_buffer: int | None,
|
1086
|
+
log_shared: int | None,
|
1055
1087
|
no_score: bool | None,
|
1056
1088
|
no_score_display: bool | None,
|
1057
1089
|
max_connections: int | None,
|
@@ -1099,6 +1131,7 @@ def eval_retry_command(
|
|
1099
1131
|
log_samples=log_samples,
|
1100
1132
|
log_images=log_images,
|
1101
1133
|
log_buffer=log_buffer,
|
1134
|
+
log_shared=log_shared,
|
1102
1135
|
score=score,
|
1103
1136
|
score_display=score_display,
|
1104
1137
|
max_retries=max_retries,
|
inspect_ai/_cli/util.py
CHANGED
@@ -1,11 +1,54 @@
|
|
1
|
-
from typing import Any
|
1
|
+
from typing import Any, Callable
|
2
2
|
|
3
|
+
import click
|
3
4
|
import yaml
|
4
5
|
|
5
6
|
from inspect_ai._util.config import resolve_args
|
6
7
|
from inspect_ai.util._sandbox.environment import SandboxEnvironmentSpec
|
7
8
|
|
8
9
|
|
10
|
+
def int_or_bool_flag_callback(
|
11
|
+
true_value: int, false_value: int = 0
|
12
|
+
) -> Callable[[click.Context, click.Parameter, Any], int]:
|
13
|
+
def callback(ctx: click.Context, param: click.Parameter, value: Any) -> int:
|
14
|
+
"""Callback to parse an option that can be either a boolean flag or an integer.
|
15
|
+
|
16
|
+
Desired behavior:
|
17
|
+
- Not specified at all -> false_value
|
18
|
+
- Specified with no value -> true_value
|
19
|
+
- Specified with "true"/"false" -> true_value or false_value respectively
|
20
|
+
- Specified with an integer -> that integer
|
21
|
+
"""
|
22
|
+
# 1. If this parameter was never given on the command line,
|
23
|
+
# then we return false_value (0 unless overridden).
|
24
|
+
source = ctx.get_parameter_source(param.name) if param.name else ""
|
25
|
+
if source == click.core.ParameterSource.DEFAULT:
|
26
|
+
# Means the user did NOT specify the flag at all
|
27
|
+
return false_value
|
28
|
+
|
29
|
+
# 2. The user did specify the flag. If value is None,
|
30
|
+
# that means they used the flag with no argument, e.g. --my-flag
|
31
|
+
if value is None:
|
32
|
+
return true_value
|
33
|
+
|
34
|
+
# 3. If there is a value, try to parse booleans or an integer.
|
35
|
+
lower_val = value.lower()
|
36
|
+
if lower_val in ("true", "yes", "1"):
|
37
|
+
return true_value
|
38
|
+
elif lower_val in ("false", "no", "0"):
|
39
|
+
return false_value
|
40
|
+
else:
|
41
|
+
# 4. Otherwise, assume it is an integer
|
42
|
+
try:
|
43
|
+
return int(value)
|
44
|
+
except ValueError:
|
45
|
+
raise click.BadParameter(
|
46
|
+
f"Expected 'true', 'false', or an integer for --{param.name}. Got: {value}"
|
47
|
+
)
|
48
|
+
|
49
|
+
return callback
|
50
|
+
|
51
|
+
|
9
52
|
def parse_cli_config(
|
10
53
|
args: tuple[str] | list[str] | None, config: str | None
|
11
54
|
) -> dict[str, Any]:
|
@@ -36,7 +36,7 @@ def task_config(
|
|
36
36
|
value = value if isinstance(value, list) else [value]
|
37
37
|
value = [str(v) for v in value]
|
38
38
|
config_print.append(f"{name}: {','.join(value)}")
|
39
|
-
elif name not in ["limit", "model", "response_schema"]:
|
39
|
+
elif name not in ["limit", "model", "response_schema", "log_shared"]:
|
40
40
|
if isinstance(value, list):
|
41
41
|
value = ",".join([str(v) for v in value])
|
42
42
|
if isinstance(value, str):
|
@@ -15,6 +15,7 @@ from typing import (
|
|
15
15
|
)
|
16
16
|
|
17
17
|
import rich
|
18
|
+
from pydantic import BaseModel, Field, field_validator
|
18
19
|
from rich.console import Console
|
19
20
|
|
20
21
|
from inspect_ai.log import EvalConfig, EvalResults, EvalStats
|
@@ -104,12 +105,20 @@ class TaskScreen(contextlib.AbstractContextManager["TaskScreen"]):
|
|
104
105
|
raise NotImplementedError("input_panel not implemented by current display")
|
105
106
|
|
106
107
|
|
107
|
-
|
108
|
-
class TaskDisplayMetric:
|
108
|
+
class TaskDisplayMetric(BaseModel):
|
109
109
|
scorer: str
|
110
110
|
name: str
|
111
|
-
value: float | int
|
112
|
-
reducer: str | None
|
111
|
+
value: float | int | None = Field(default=None)
|
112
|
+
reducer: str | None = Field(default=None)
|
113
|
+
|
114
|
+
@field_validator("value", mode="before")
|
115
|
+
@classmethod
|
116
|
+
def handle_null_value(cls, v: Any) -> Union[float, int, None]:
|
117
|
+
if v is None:
|
118
|
+
return None
|
119
|
+
if isinstance(v, float | int):
|
120
|
+
return v
|
121
|
+
raise ValueError(f"Expected float, int, or None, got {type(v)}")
|
113
122
|
|
114
123
|
|
115
124
|
@runtime_checkable
|
@@ -180,7 +180,7 @@ def task_metric(metrics: list[TaskDisplayMetric], width: int | None = None) -> s
|
|
180
180
|
)
|
181
181
|
|
182
182
|
metric = metrics[0]
|
183
|
-
if np.isnan(metric.value):
|
183
|
+
if metric.value is None or np.isnan(metric.value):
|
184
184
|
value = " n/a"
|
185
185
|
else:
|
186
186
|
value = f"{metric.value:.2f}"
|
@@ -58,10 +58,12 @@ class TaskScreenResult(Generic[TR]):
|
|
58
58
|
value: TR | BaseException,
|
59
59
|
tasks: list[TaskWithResult],
|
60
60
|
output: list[str],
|
61
|
+
warnings: list[str],
|
61
62
|
) -> None:
|
62
63
|
self.value = value
|
63
64
|
self.tasks = tasks
|
64
65
|
self.output = output
|
66
|
+
self.warnings = warnings
|
65
67
|
|
66
68
|
|
67
69
|
class TaskScreenApp(App[TR]):
|
@@ -86,6 +88,7 @@ class TaskScreenApp(App[TR]):
|
|
86
88
|
self._worker: Worker[TR] | None = None
|
87
89
|
self._error: BaseException | None = None
|
88
90
|
self._output: list[str] = []
|
91
|
+
self._warnings: list[str] = []
|
89
92
|
|
90
93
|
# task screen
|
91
94
|
self._total_tasks = 0
|
@@ -120,7 +123,12 @@ class TaskScreenApp(App[TR]):
|
|
120
123
|
value = CancelledError()
|
121
124
|
|
122
125
|
# return result w/ output
|
123
|
-
return TaskScreenResult(
|
126
|
+
return TaskScreenResult(
|
127
|
+
value=value,
|
128
|
+
tasks=self._app_tasks,
|
129
|
+
output=self._output,
|
130
|
+
warnings=self._warnings,
|
131
|
+
)
|
124
132
|
|
125
133
|
async def on_load(self) -> None:
|
126
134
|
# events used to synchronise loading
|
@@ -349,8 +357,11 @@ class TaskScreenApp(App[TR]):
|
|
349
357
|
if text.endswith("\n"):
|
350
358
|
text = text[:-1]
|
351
359
|
|
352
|
-
# track output (for printing at the end)
|
353
|
-
|
360
|
+
# track output and warnings (for printing at the end)
|
361
|
+
if "WARNING" in text:
|
362
|
+
self._warnings.append(text)
|
363
|
+
else:
|
364
|
+
self._output.append(text)
|
354
365
|
|
355
366
|
# write to console view
|
356
367
|
self.query_one(ConsoleView).write_ansi(text)
|
@@ -42,6 +42,10 @@ class TextualDisplay(Display):
|
|
42
42
|
# print tasks
|
43
43
|
rich.print(tasks_results(result.tasks))
|
44
44
|
|
45
|
+
# print warnings
|
46
|
+
if result.warnings:
|
47
|
+
print("\n".join(result.warnings))
|
48
|
+
|
45
49
|
# raise error as required
|
46
50
|
if isinstance(result.value, BaseException):
|
47
51
|
raise result.value
|
@@ -17,7 +17,7 @@ from textual.widgets import (
|
|
17
17
|
OptionList,
|
18
18
|
Static,
|
19
19
|
)
|
20
|
-
from textual.widgets.option_list import Option
|
20
|
+
from textual.widgets.option_list import Option, OptionDoesNotExist
|
21
21
|
|
22
22
|
from inspect_ai._display.textual.widgets.port_mappings import get_url
|
23
23
|
from inspect_ai._util.format import format_progress_time
|
@@ -124,7 +124,7 @@ class SamplesList(OptionList):
|
|
124
124
|
def set_samples(self, samples: list[ActiveSample]) -> None:
|
125
125
|
# check for a highlighted sample (make sure we don't remove it)
|
126
126
|
highlighted_id = (
|
127
|
-
self.
|
127
|
+
self.get_id_at_index(self.highlighted)
|
128
128
|
if self.highlighted is not None
|
129
129
|
else None
|
130
130
|
)
|
@@ -179,12 +179,18 @@ class SamplesList(OptionList):
|
|
179
179
|
self.scroll_to_highlight()
|
180
180
|
|
181
181
|
def sample_for_highlighted(self, highlighted: int) -> ActiveSample | None:
|
182
|
-
highlighted_id = self.
|
182
|
+
highlighted_id = self.get_id_at_index(highlighted)
|
183
183
|
if highlighted_id is not None:
|
184
184
|
return sample_for_id(self.samples, highlighted_id)
|
185
185
|
else:
|
186
186
|
return None
|
187
187
|
|
188
|
+
def get_id_at_index(self, index: int) -> str | None:
|
189
|
+
try:
|
190
|
+
return self.get_option_at_index(index).id
|
191
|
+
except OptionDoesNotExist:
|
192
|
+
return None
|
193
|
+
|
188
194
|
|
189
195
|
class SampleVNC(Horizontal):
|
190
196
|
DEFAULT_CSS = """
|
@@ -14,7 +14,7 @@ from inspect_ai._display.core.display import TaskDisplayMetric
|
|
14
14
|
@dataclass
|
15
15
|
class TaskMetric:
|
16
16
|
name: str
|
17
|
-
value: float
|
17
|
+
value: float | int | None
|
18
18
|
|
19
19
|
|
20
20
|
class TaskDetail(Widget):
|
@@ -221,21 +221,21 @@ class TaskMetrics(Widget):
|
|
221
221
|
self.recompute_grid()
|
222
222
|
|
223
223
|
def on_mount(self) -> None:
|
224
|
-
self.recompute_grid()
|
224
|
+
self.recompute_grid(True)
|
225
225
|
|
226
|
-
def recompute_grid(self) -> None:
|
227
|
-
if not self.is_mounted:
|
226
|
+
def recompute_grid(self, force: bool = False) -> None:
|
227
|
+
if not self.is_mounted and not force:
|
228
228
|
return
|
229
|
-
|
230
229
|
grid = self.query_one(f"#{self.grid_id()}")
|
231
230
|
|
232
231
|
grid.remove_children()
|
233
232
|
for metric in self.metrics:
|
234
233
|
# Add the value static but keep it around
|
235
234
|
# for future updates
|
236
|
-
|
237
|
-
self.
|
238
|
-
|
235
|
+
if metric.value is not None:
|
236
|
+
self.value_widgets[metric.name] = Static(
|
237
|
+
self._metric_value(metric.value), markup=False
|
238
|
+
)
|
239
239
|
|
240
240
|
grid.mount(Static(metric.name, markup=False))
|
241
241
|
grid.mount(self.value_widgets[metric.name])
|
@@ -17,6 +17,11 @@ from inspect_ai._display.core.results import task_metric
|
|
17
17
|
from inspect_ai._display.textual.widgets.clock import Clock
|
18
18
|
from inspect_ai._display.textual.widgets.task_detail import TaskDetail
|
19
19
|
from inspect_ai._display.textual.widgets.toggle import Toggle
|
20
|
+
from inspect_ai._display.textual.widgets.vscode import conditional_vscode_link
|
21
|
+
from inspect_ai._util.file import to_uri
|
22
|
+
from inspect_ai._util.vscode import (
|
23
|
+
VSCodeCommand,
|
24
|
+
)
|
20
25
|
|
21
26
|
from ...core.display import (
|
22
27
|
Progress,
|
@@ -151,7 +156,7 @@ class TaskProgressView(Widget):
|
|
151
156
|
height: auto;
|
152
157
|
width: 1fr;
|
153
158
|
layout: grid;
|
154
|
-
grid-size:
|
159
|
+
grid-size: 9 2;
|
155
160
|
grid-columns: auto auto auto auto 1fr auto auto auto;
|
156
161
|
grid-rows: auto auto;
|
157
162
|
grid-gutter: 0 1;
|
@@ -200,6 +205,15 @@ class TaskProgressView(Widget):
|
|
200
205
|
|
201
206
|
self.sample_count_width: int = sample_count_width
|
202
207
|
self.display_metrics = display_metrics
|
208
|
+
self.view_log_link = conditional_vscode_link(
|
209
|
+
"[View Log]",
|
210
|
+
VSCodeCommand(
|
211
|
+
command="inspect.openLogViewer",
|
212
|
+
args=[to_uri(task.profile.log_location)]
|
213
|
+
if task.profile.log_location
|
214
|
+
else [],
|
215
|
+
),
|
216
|
+
)
|
203
217
|
|
204
218
|
metrics: reactive[list[TaskDisplayMetric] | None] = reactive(None)
|
205
219
|
metrics_width: reactive[int | None] = reactive(None)
|
@@ -222,6 +236,8 @@ class TaskProgressView(Widget):
|
|
222
236
|
yield self.count_display
|
223
237
|
yield self.metrics_display
|
224
238
|
yield Clock()
|
239
|
+
yield self.view_log_link
|
240
|
+
|
225
241
|
yield self.task_detail
|
226
242
|
|
227
243
|
@on(Toggle.Toggled)
|
@@ -0,0 +1,44 @@
|
|
1
|
+
from textual.widget import Widget
|
2
|
+
from textual.widgets import Link, Static
|
3
|
+
|
4
|
+
from inspect_ai._util.vscode import (
|
5
|
+
VSCodeCommand,
|
6
|
+
can_execute_vscode_command,
|
7
|
+
execute_vscode_commands,
|
8
|
+
)
|
9
|
+
|
10
|
+
|
11
|
+
def conditional_vscode_link(text: str, command: VSCodeCommand) -> Widget:
|
12
|
+
if can_execute_vscode_command(command.command):
|
13
|
+
vscode_link = VSCodeLink(text)
|
14
|
+
vscode_link.commands = [command]
|
15
|
+
return vscode_link
|
16
|
+
else:
|
17
|
+
return Static()
|
18
|
+
|
19
|
+
|
20
|
+
class VSCodeLink(Link):
|
21
|
+
def __init__(
|
22
|
+
self,
|
23
|
+
text: str,
|
24
|
+
*,
|
25
|
+
url: str | None = None,
|
26
|
+
tooltip: str | None = None,
|
27
|
+
name: str | None = None,
|
28
|
+
id: str | None = None,
|
29
|
+
classes: str | None = None,
|
30
|
+
disabled: bool = False,
|
31
|
+
) -> None:
|
32
|
+
super().__init__(
|
33
|
+
text,
|
34
|
+
url=url,
|
35
|
+
tooltip=tooltip,
|
36
|
+
name=name,
|
37
|
+
id=id,
|
38
|
+
classes=classes,
|
39
|
+
disabled=disabled,
|
40
|
+
)
|
41
|
+
self.commands: list[VSCodeCommand] = []
|
42
|
+
|
43
|
+
def on_click(self) -> None:
|
44
|
+
execute_vscode_commands(self.commands)
|