inspect-ai 0.3.63__py3-none-any.whl → 0.3.65__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/_cli/cache.py +8 -7
- inspect_ai/_cli/common.py +0 -12
- inspect_ai/_cli/eval.py +32 -4
- inspect_ai/_cli/info.py +1 -0
- inspect_ai/_cli/list.py +1 -1
- inspect_ai/_cli/log.py +2 -0
- inspect_ai/_cli/sandbox.py +4 -1
- inspect_ai/_cli/score.py +181 -32
- inspect_ai/_cli/trace.py +2 -0
- inspect_ai/_cli/view.py +4 -2
- inspect_ai/_display/core/config.py +7 -1
- inspect_ai/_display/core/progress.py +1 -1
- inspect_ai/_display/textual/app.py +8 -4
- inspect_ai/_display/textual/widgets/samples.py +6 -5
- inspect_ai/_display/textual/widgets/sandbox.py +6 -0
- inspect_ai/_eval/__init__.py +0 -0
- inspect_ai/_eval/eval.py +100 -97
- inspect_ai/_eval/evalset.py +69 -69
- inspect_ai/_eval/loader.py +122 -12
- inspect_ai/_eval/registry.py +1 -1
- inspect_ai/_eval/run.py +14 -0
- inspect_ai/_eval/score.py +125 -36
- inspect_ai/_eval/task/log.py +105 -4
- inspect_ai/_eval/task/results.py +92 -38
- inspect_ai/_eval/task/run.py +6 -2
- inspect_ai/_eval/task/sandbox.py +35 -2
- inspect_ai/_eval/task/task.py +49 -46
- inspect_ai/_util/__init__.py +0 -0
- inspect_ai/_util/constants.py +1 -1
- inspect_ai/_util/content.py +8 -0
- inspect_ai/_util/error.py +2 -0
- inspect_ai/_util/file.py +15 -1
- inspect_ai/_util/logger.py +4 -2
- inspect_ai/_util/registry.py +7 -1
- inspect_ai/_view/view.py +1 -2
- inspect_ai/_view/www/App.css +8 -3
- inspect_ai/_view/www/README.md +1 -1
- inspect_ai/_view/www/dist/assets/index.css +66 -38
- inspect_ai/_view/www/dist/assets/index.js +525 -523
- inspect_ai/_view/www/log-schema.json +86 -73
- inspect_ai/_view/www/package.json +1 -1
- inspect_ai/_view/www/src/App.tsx +1 -0
- inspect_ai/_view/www/src/components/AnsiDisplay.tsx +1 -1
- inspect_ai/_view/www/src/components/JsonPanel.tsx +1 -1
- inspect_ai/_view/www/src/components/LargeModal.tsx +39 -49
- inspect_ai/_view/www/src/components/NavPills.tsx +3 -1
- inspect_ai/_view/www/src/components/TabSet.tsx +19 -4
- inspect_ai/_view/www/src/logfile/remoteLogFile.ts +0 -1
- inspect_ai/_view/www/src/metadata/MetaDataGrid.tsx +1 -1
- inspect_ai/_view/www/src/metadata/MetaDataView.tsx +1 -1
- inspect_ai/_view/www/src/metadata/RenderedContent.tsx +6 -13
- inspect_ai/_view/www/src/plan/PlanDetailView.tsx +17 -2
- inspect_ai/_view/www/src/plan/SolverDetailView.tsx +1 -1
- inspect_ai/_view/www/src/samples/SampleDisplay.tsx +14 -5
- inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +4 -2
- inspect_ai/_view/www/src/samples/SamplesTools.tsx +16 -24
- inspect_ai/_view/www/src/samples/chat/ChatMessage.tsx +1 -1
- inspect_ai/_view/www/src/samples/chat/ChatView.tsx +1 -0
- inspect_ai/_view/www/src/samples/chat/MessageContent.tsx +27 -13
- inspect_ai/_view/www/src/samples/chat/MessageContents.tsx +19 -17
- inspect_ai/_view/www/src/samples/chat/tools/ToolCallView.tsx +12 -10
- inspect_ai/_view/www/src/samples/chat/tools/ToolInput.tsx +56 -66
- inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.tsx +12 -5
- inspect_ai/_view/www/src/samples/chat/tools/tool.ts +21 -36
- inspect_ai/_view/www/src/samples/descriptor/samplesDescriptor.tsx +3 -1
- inspect_ai/_view/www/src/samples/sample-tools/SelectScorer.tsx +27 -25
- inspect_ai/_view/www/src/samples/sample-tools/SortFilter.tsx +5 -1
- inspect_ai/_view/www/src/samples/scores/SampleScoreView.module.css +13 -13
- inspect_ai/_view/www/src/samples/transcript/InfoEventView.tsx +1 -1
- inspect_ai/_view/www/src/samples/transcript/ModelEventView.tsx +2 -2
- inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.tsx +9 -5
- inspect_ai/_view/www/src/samples/transcript/ScoreEventView.tsx +1 -1
- inspect_ai/_view/www/src/samples/transcript/ToolEventView.tsx +5 -4
- inspect_ai/_view/www/src/samples/transcript/event/EventNavs.tsx +1 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventPanel.tsx +1 -0
- inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.tsx +17 -6
- inspect_ai/_view/www/src/samples/transcript/state/StateEventView.tsx +14 -19
- inspect_ai/_view/www/src/types/log.d.ts +107 -19
- inspect_ai/_view/www/src/usage/ModelTokenTable.tsx +7 -1
- inspect_ai/_view/www/src/usage/ModelUsagePanel.tsx +5 -3
- inspect_ai/_view/www/src/workspace/WorkSpaceView.tsx +25 -27
- inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.tsx +12 -11
- inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.module.css +25 -2
- inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.tsx +60 -36
- inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.tsx +4 -0
- inspect_ai/_view/www/src/workspace/sidebar/SidebarScoreView.tsx +6 -4
- inspect_ai/_view/www/src/workspace/sidebar/SidebarScoresView.tsx +16 -14
- inspect_ai/_view/www/src/workspace/tabs/InfoTab.tsx +9 -19
- inspect_ai/_view/www/src/workspace/utils.ts +34 -0
- inspect_ai/approval/_approval.py +2 -0
- inspect_ai/approval/_approver.py +4 -4
- inspect_ai/approval/_auto.py +1 -1
- inspect_ai/approval/_human/approver.py +3 -0
- inspect_ai/approval/_policy.py +5 -0
- inspect_ai/approval/_registry.py +2 -2
- inspect_ai/dataset/_dataset.py +36 -45
- inspect_ai/dataset/_sources/__init__.py +0 -0
- inspect_ai/dataset/_sources/csv.py +13 -13
- inspect_ai/dataset/_sources/hf.py +29 -29
- inspect_ai/dataset/_sources/json.py +10 -10
- inspect_ai/log/__init__.py +2 -0
- inspect_ai/log/_convert.py +3 -3
- inspect_ai/log/_file.py +24 -9
- inspect_ai/log/_log.py +98 -7
- inspect_ai/log/_message.py +3 -1
- inspect_ai/log/_recorders/file.py +4 -0
- inspect_ai/log/_recorders/recorder.py +3 -0
- inspect_ai/log/_transcript.py +19 -8
- inspect_ai/model/__init__.py +2 -0
- inspect_ai/model/_cache.py +39 -21
- inspect_ai/model/_call_tools.py +2 -2
- inspect_ai/model/_chat_message.py +14 -4
- inspect_ai/model/_generate_config.py +1 -1
- inspect_ai/model/_model.py +31 -24
- inspect_ai/model/_model_output.py +14 -1
- inspect_ai/model/_openai.py +10 -18
- inspect_ai/model/_providers/google.py +9 -5
- inspect_ai/model/_providers/openai.py +5 -9
- inspect_ai/model/_providers/openrouter.py +1 -1
- inspect_ai/scorer/__init__.py +6 -1
- inspect_ai/scorer/_answer.py +1 -1
- inspect_ai/scorer/_classification.py +4 -0
- inspect_ai/scorer/_match.py +4 -5
- inspect_ai/scorer/_metric.py +87 -28
- inspect_ai/scorer/_metrics/__init__.py +3 -3
- inspect_ai/scorer/_metrics/accuracy.py +8 -10
- inspect_ai/scorer/_metrics/mean.py +3 -17
- inspect_ai/scorer/_metrics/std.py +111 -30
- inspect_ai/scorer/_model.py +12 -12
- inspect_ai/scorer/_pattern.py +3 -3
- inspect_ai/scorer/_reducer/reducer.py +36 -21
- inspect_ai/scorer/_reducer/registry.py +2 -2
- inspect_ai/scorer/_reducer/types.py +7 -1
- inspect_ai/scorer/_score.py +11 -1
- inspect_ai/scorer/_scorer.py +110 -16
- inspect_ai/solver/__init__.py +1 -1
- inspect_ai/solver/_basic_agent.py +19 -22
- inspect_ai/solver/_bridge/__init__.py +0 -3
- inspect_ai/solver/_bridge/bridge.py +3 -3
- inspect_ai/solver/_chain.py +1 -2
- inspect_ai/solver/_critique.py +3 -3
- inspect_ai/solver/_fork.py +2 -2
- inspect_ai/solver/_human_agent/__init__.py +0 -0
- inspect_ai/solver/_human_agent/agent.py +5 -8
- inspect_ai/solver/_human_agent/commands/clock.py +14 -10
- inspect_ai/solver/_human_agent/commands/note.py +1 -1
- inspect_ai/solver/_human_agent/commands/score.py +0 -11
- inspect_ai/solver/_multiple_choice.py +15 -18
- inspect_ai/solver/_prompt.py +7 -7
- inspect_ai/solver/_solver.py +53 -52
- inspect_ai/solver/_task_state.py +80 -69
- inspect_ai/solver/_use_tools.py +9 -9
- inspect_ai/tool/__init__.py +2 -1
- inspect_ai/tool/_tool.py +43 -14
- inspect_ai/tool/_tool_call.py +6 -2
- inspect_ai/tool/_tool_choice.py +3 -1
- inspect_ai/tool/_tool_def.py +10 -8
- inspect_ai/tool/_tool_params.py +24 -0
- inspect_ai/tool/_tool_with.py +7 -7
- inspect_ai/tool/_tools/__init__.py +0 -0
- inspect_ai/tool/_tools/_computer/_common.py +2 -2
- inspect_ai/tool/_tools/_computer/_computer.py +11 -0
- inspect_ai/tool/_tools/_execute.py +15 -9
- inspect_ai/tool/_tools/_web_browser/_resources/README.md +2 -2
- inspect_ai/tool/_tools/_web_browser/_web_browser.py +5 -3
- inspect_ai/tool/_tools/_web_search.py +7 -5
- inspect_ai/util/_concurrency.py +3 -3
- inspect_ai/util/_panel.py +2 -0
- inspect_ai/util/_resource.py +12 -12
- inspect_ai/util/_sandbox/docker/compose.py +23 -20
- inspect_ai/util/_sandbox/docker/config.py +2 -1
- inspect_ai/util/_sandbox/docker/docker.py +10 -1
- inspect_ai/util/_sandbox/docker/service.py +100 -0
- inspect_ai/util/_sandbox/environment.py +99 -96
- inspect_ai/util/_subprocess.py +5 -3
- inspect_ai/util/_subtask.py +15 -16
- {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/LICENSE +1 -1
- {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/METADATA +10 -6
- {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/RECORD +182 -175
- {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/WHEEL +0 -0
- {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/top_level.txt +0 -0
inspect_ai/util/_subprocess.py
CHANGED
@@ -20,6 +20,8 @@ T = TypeVar("T", str, bytes)
|
|
20
20
|
|
21
21
|
@dataclass
|
22
22
|
class ExecResult(Generic[T]):
|
23
|
+
"""Execution result from call to `subprocess()`."""
|
24
|
+
|
23
25
|
success: bool
|
24
26
|
"""Did the process exit with success."""
|
25
27
|
|
@@ -85,11 +87,11 @@ async def subprocess(
|
|
85
87
|
cwd (str | Path | None): Switch to directory for execution.
|
86
88
|
env (dict[str, str]): Additional environment variables.
|
87
89
|
capture_output (bool): Capture stderr and stdout into ExecResult
|
88
|
-
|
90
|
+
(if False, then output is redirected to parent stderr/stdout)
|
89
91
|
output_limit (int | None): Stop reading output if it exceeds
|
90
|
-
|
92
|
+
the specified limit (in bytes).
|
91
93
|
timeout (int | None): Timeout. If the timeout expires then
|
92
|
-
|
94
|
+
a `TimeoutError` will be raised.
|
93
95
|
|
94
96
|
Returns:
|
95
97
|
Subprocess result (text or binary depending on `text` param)
|
inspect_ai/util/_subtask.py
CHANGED
@@ -27,21 +27,21 @@ logger = getLogger(__name__)
|
|
27
27
|
|
28
28
|
@runtime_checkable
|
29
29
|
class Subtask(Protocol):
|
30
|
-
"""Subtask with distinct `Store` and `Transcript`.
|
31
|
-
|
32
|
-
Args:
|
33
|
-
*args (Any): Arguments for the subtask.
|
34
|
-
**kwargs (Any): Keyword arguments for the subtask.
|
35
|
-
|
36
|
-
Returns:
|
37
|
-
Result of subtask.
|
38
|
-
"""
|
39
|
-
|
40
30
|
async def __call__(
|
41
31
|
self,
|
42
32
|
*args: Any,
|
43
33
|
**kwargs: Any,
|
44
|
-
) -> Any:
|
34
|
+
) -> Any:
|
35
|
+
"""Subtask with distinct `Store` and `Transcript`.
|
36
|
+
|
37
|
+
Args:
|
38
|
+
*args (Any): Arguments for the subtask.
|
39
|
+
**kwargs (Any): Keyword arguments for the subtask.
|
40
|
+
|
41
|
+
Returns:
|
42
|
+
Result of subtask.
|
43
|
+
"""
|
44
|
+
...
|
45
45
|
|
46
46
|
|
47
47
|
@overload
|
@@ -71,11 +71,10 @@ def subtask(
|
|
71
71
|
r"""Decorator for subtasks.
|
72
72
|
|
73
73
|
Args:
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
input (dict[str, Any] | None): Input to log for subtask
|
74
|
+
name: Name for subtask (defaults to function name)
|
75
|
+
store: Store to use for subtask
|
76
|
+
type: Type to use for subtask
|
77
|
+
input: Input to log for subtask
|
79
78
|
|
80
79
|
Returns:
|
81
80
|
Function which runs the Subtask, providing an isolated
|
@@ -1,8 +1,8 @@
|
|
1
1
|
Metadata-Version: 2.2
|
2
2
|
Name: inspect_ai
|
3
|
-
Version: 0.3.63
|
3
|
+
Version: 0.3.65
|
4
4
|
Summary: Framework for large language model evaluations
|
5
|
-
Author: UK AI Safety Institute
|
5
|
+
Author: UK AI Security Institute
|
6
6
|
License: MIT License
|
7
7
|
Project-URL: Documentation, https://inspect.ai-safety-institute.org.uk/
|
8
8
|
Project-URL: Source Code, https://github.com/UKGovernmentBEIS/inspect_ai
|
@@ -55,6 +55,7 @@ Requires-Dist: azure-ai-inference; extra == "dev"
|
|
55
55
|
Requires-Dist: google-cloud-aiplatform; extra == "dev"
|
56
56
|
Requires-Dist: google-generativeai; extra == "dev"
|
57
57
|
Requires-Dist: goodfire; extra == "dev"
|
58
|
+
Requires-Dist: griffe; extra == "dev"
|
58
59
|
Requires-Dist: groq; extra == "dev"
|
59
60
|
Requires-Dist: ipython; extra == "dev"
|
60
61
|
Requires-Dist: mistralai; extra == "dev"
|
@@ -69,8 +70,9 @@ Requires-Dist: pytest-asyncio; extra == "dev"
|
|
69
70
|
Requires-Dist: pytest-cov; extra == "dev"
|
70
71
|
Requires-Dist: pytest-dotenv; extra == "dev"
|
71
72
|
Requires-Dist: pytest-xdist; extra == "dev"
|
72
|
-
Requires-Dist: ruff==0.9.
|
73
|
+
Requires-Dist: ruff==0.9.5; extra == "dev"
|
73
74
|
Requires-Dist: textual-dev>=0.86.2; extra == "dev"
|
75
|
+
Requires-Dist: types-Markdown; extra == "dev"
|
74
76
|
Requires-Dist: types-PyYAML; extra == "dev"
|
75
77
|
Requires-Dist: types-beautifulsoup4; extra == "dev"
|
76
78
|
Requires-Dist: types-aioboto3; extra == "dev"
|
@@ -82,15 +84,17 @@ Requires-Dist: types-protobuf; extra == "dev"
|
|
82
84
|
Requires-Dist: types-psutil; extra == "dev"
|
83
85
|
Requires-Dist: types-python-dateutil; extra == "dev"
|
84
86
|
Provides-Extra: doc
|
85
|
-
Requires-Dist: quarto-cli; extra == "doc"
|
87
|
+
Requires-Dist: quarto-cli==1.5.57; extra == "doc"
|
86
88
|
Requires-Dist: jupyter; extra == "doc"
|
89
|
+
Requires-Dist: panflute; extra == "doc"
|
90
|
+
Requires-Dist: markdown; extra == "doc"
|
87
91
|
Provides-Extra: dist
|
88
92
|
Requires-Dist: twine; extra == "dist"
|
89
93
|
Requires-Dist: build; extra == "dist"
|
90
94
|
|
91
|
-
[<img width="295" src="https://inspect.ai-safety-institute.org.uk/images/aisi-logo.
|
95
|
+
[<img width="295" src="https://inspect.ai-safety-institute.org.uk/images/aisi-logo.svg" />](https://aisi.gov.uk/)
|
92
96
|
|
93
|
-
Welcome to Inspect, a framework for large language model evaluations created by the [UK AI Safety Institute](https://aisi.gov.uk/).
|
97
|
+
Welcome to Inspect, a framework for large language model evaluations created by the [UK AI Security Institute](https://aisi.gov.uk/).
|
94
98
|
|
95
99
|
Inspect provides many built-in components, including facilities for prompt engineering, tool usage, multi-turn dialog, and model graded evaluations. Extensions to Inspect (e.g. to support new elicitation and scoring techniques) can be provided by other Python packages.
|
96
100
|
|