inspect-ai 0.3.49__py3-none-any.whl → 0.3.51__py3-none-any.whl

This diff compares the contents of two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
Files changed (93):
  1. inspect_ai/_cli/info.py +2 -2
  2. inspect_ai/_cli/log.py +2 -2
  3. inspect_ai/_cli/score.py +2 -2
  4. inspect_ai/_display/core/display.py +19 -0
  5. inspect_ai/_display/core/panel.py +37 -7
  6. inspect_ai/_display/core/progress.py +29 -2
  7. inspect_ai/_display/core/results.py +79 -40
  8. inspect_ai/_display/core/textual.py +21 -0
  9. inspect_ai/_display/rich/display.py +28 -8
  10. inspect_ai/_display/textual/app.py +107 -1
  11. inspect_ai/_display/textual/display.py +1 -1
  12. inspect_ai/_display/textual/widgets/samples.py +132 -91
  13. inspect_ai/_display/textual/widgets/task_detail.py +236 -0
  14. inspect_ai/_display/textual/widgets/tasks.py +74 -6
  15. inspect_ai/_display/textual/widgets/toggle.py +32 -0
  16. inspect_ai/_eval/context.py +2 -0
  17. inspect_ai/_eval/eval.py +4 -3
  18. inspect_ai/_eval/loader.py +1 -1
  19. inspect_ai/_eval/run.py +35 -2
  20. inspect_ai/_eval/task/log.py +13 -11
  21. inspect_ai/_eval/task/results.py +12 -3
  22. inspect_ai/_eval/task/run.py +139 -36
  23. inspect_ai/_eval/task/sandbox.py +2 -1
  24. inspect_ai/_util/_async.py +30 -1
  25. inspect_ai/_util/file.py +31 -4
  26. inspect_ai/_util/html.py +3 -0
  27. inspect_ai/_util/logger.py +6 -5
  28. inspect_ai/_util/platform.py +5 -6
  29. inspect_ai/_util/registry.py +1 -1
  30. inspect_ai/_view/server.py +9 -9
  31. inspect_ai/_view/www/App.css +2 -2
  32. inspect_ai/_view/www/dist/assets/index.css +2 -2
  33. inspect_ai/_view/www/dist/assets/index.js +352 -294
  34. inspect_ai/_view/www/log-schema.json +13 -0
  35. inspect_ai/_view/www/package.json +1 -0
  36. inspect_ai/_view/www/src/components/MessageBand.mjs +1 -1
  37. inspect_ai/_view/www/src/components/Tools.mjs +16 -13
  38. inspect_ai/_view/www/src/samples/SampleDisplay.mjs +1 -3
  39. inspect_ai/_view/www/src/samples/SampleScoreView.mjs +52 -77
  40. inspect_ai/_view/www/src/samples/SamplesDescriptor.mjs +38 -13
  41. inspect_ai/_view/www/src/samples/transcript/ModelEventView.mjs +15 -2
  42. inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.mjs +4 -2
  43. inspect_ai/_view/www/src/types/log.d.ts +2 -0
  44. inspect_ai/_view/www/src/workspace/WorkSpace.mjs +2 -0
  45. inspect_ai/_view/www/yarn.lock +9 -4
  46. inspect_ai/approval/__init__.py +1 -1
  47. inspect_ai/approval/_human/approver.py +35 -0
  48. inspect_ai/approval/_human/console.py +62 -0
  49. inspect_ai/approval/_human/manager.py +108 -0
  50. inspect_ai/approval/_human/panel.py +233 -0
  51. inspect_ai/approval/_human/util.py +51 -0
  52. inspect_ai/dataset/_sources/hf.py +2 -2
  53. inspect_ai/dataset/_sources/util.py +1 -1
  54. inspect_ai/log/_file.py +106 -36
  55. inspect_ai/log/_recorders/eval.py +226 -158
  56. inspect_ai/log/_recorders/file.py +9 -6
  57. inspect_ai/log/_recorders/json.py +35 -12
  58. inspect_ai/log/_recorders/recorder.py +15 -15
  59. inspect_ai/log/_samples.py +52 -0
  60. inspect_ai/model/_model.py +14 -0
  61. inspect_ai/model/_model_output.py +4 -0
  62. inspect_ai/model/_providers/azureai.py +1 -1
  63. inspect_ai/model/_providers/hf.py +106 -4
  64. inspect_ai/model/_providers/util/__init__.py +2 -0
  65. inspect_ai/model/_providers/util/hf_handler.py +200 -0
  66. inspect_ai/scorer/_common.py +1 -1
  67. inspect_ai/solver/_plan.py +0 -8
  68. inspect_ai/solver/_task_state.py +18 -1
  69. inspect_ai/solver/_use_tools.py +9 -1
  70. inspect_ai/tool/_tool_def.py +2 -2
  71. inspect_ai/tool/_tool_info.py +14 -2
  72. inspect_ai/tool/_tool_params.py +2 -1
  73. inspect_ai/tool/_tools/_execute.py +1 -1
  74. inspect_ai/tool/_tools/_web_browser/_web_browser.py +6 -0
  75. inspect_ai/util/__init__.py +5 -6
  76. inspect_ai/util/_panel.py +91 -0
  77. inspect_ai/util/_sandbox/__init__.py +2 -6
  78. inspect_ai/util/_sandbox/context.py +4 -3
  79. inspect_ai/util/_sandbox/docker/compose.py +12 -2
  80. inspect_ai/util/_sandbox/docker/docker.py +19 -9
  81. inspect_ai/util/_sandbox/docker/util.py +10 -2
  82. inspect_ai/util/_sandbox/environment.py +47 -41
  83. inspect_ai/util/_sandbox/local.py +15 -10
  84. inspect_ai/util/_subprocess.py +43 -3
  85. {inspect_ai-0.3.49.dist-info → inspect_ai-0.3.51.dist-info}/METADATA +2 -2
  86. {inspect_ai-0.3.49.dist-info → inspect_ai-0.3.51.dist-info}/RECORD +90 -82
  87. inspect_ai/_view/www/node_modules/flatted/python/flatted.py +0 -149
  88. inspect_ai/_view/www/node_modules/flatted/python/test.py +0 -63
  89. inspect_ai/approval/_human.py +0 -123
  90. {inspect_ai-0.3.49.dist-info → inspect_ai-0.3.51.dist-info}/LICENSE +0 -0
  91. {inspect_ai-0.3.49.dist-info → inspect_ai-0.3.51.dist-info}/WHEEL +0 -0
  92. {inspect_ai-0.3.49.dist-info → inspect_ai-0.3.51.dist-info}/entry_points.txt +0 -0
  93. {inspect_ai-0.3.49.dist-info → inspect_ai-0.3.51.dist-info}/top_level.txt +0 -0
@@ -1,3 +1,5 @@
1
+ from __future__ import annotations
2
+
1
3
  import abc
2
4
  from dataclasses import dataclass, field
3
5
  from typing import Awaitable, Callable, Literal, NamedTuple, Union, overload
@@ -6,49 +8,37 @@ from pydantic import BaseModel, Field
6
8
 
7
9
  from .._subprocess import ExecResult
8
10
 
9
- TaskInit = Callable[[str, str | None], Awaitable[None]]
10
- TaskCleanup = Callable[[str, str | None, bool], Awaitable[None]]
11
+ TaskInit = Callable[[str, Union["SandboxEnvironmentConfigType", None]], Awaitable[None]]
12
+ TaskCleanup = Callable[
13
+ [str, Union["SandboxEnvironmentConfigType", None], bool], Awaitable[None]
14
+ ]
11
15
 
12
16
  SampleInit = Callable[
13
- [str, str | None, dict[str, str]], Awaitable[dict[str, "SandboxEnvironment"]]
17
+ [str, Union["SandboxEnvironmentConfigType", None], dict[str, str]],
18
+ Awaitable[dict[str, "SandboxEnvironment"]],
14
19
  ]
15
20
  SampleCleanup = Callable[
16
- [str, str | None, dict[str, "SandboxEnvironment"], bool], Awaitable[None]
21
+ [
22
+ str,
23
+ Union["SandboxEnvironmentConfigType", None],
24
+ dict[str, "SandboxEnvironment"],
25
+ bool,
26
+ ],
27
+ Awaitable[None],
17
28
  ]
18
29
 
19
30
 
20
- class SandboxConnectionBase(BaseModel):
31
+ class SandboxConnection(BaseModel):
32
+ """Information required to connect to sandbox."""
33
+
21
34
  command: str
22
35
  """Shell command to connect to sandbox."""
23
36
 
24
- working_dir: str
25
- """Agent working directory."""
26
-
27
-
28
- class SandboxConnectionLocal(SandboxConnectionBase):
29
- type: Literal["local"] = Field(default="local")
30
-
31
-
32
- class SandboxConnectionContainer(SandboxConnectionBase):
33
- type: Literal["container"] = Field(default="container")
34
- """Sandbox login type."""
35
-
36
- container: str
37
- """Container name."""
38
-
39
-
40
- class SandboxConnectionSSH(SandboxConnectionBase):
41
- type: Literal["ssh"] = Field(default="ssh")
42
- """Sandbox login type."""
43
-
44
- destination: str
45
- """SSH destination server."""
37
+ vscode_command: list[str] | None = Field(default=None)
38
+ """Optional vscode command (+args) to connect to sandbox."""
46
39
 
47
-
48
- SandboxConnection = Union[
49
- SandboxConnectionContainer, SandboxConnectionLocal, SandboxConnectionSSH
50
- ]
51
- """Information required to connect to sandbox."""
40
+ container: str | None = Field(default=None)
41
+ """Optional container name (will not apply to all sandboxes)."""
52
42
 
53
43
 
54
44
  class SandboxEnvironment(abc.ABC):
@@ -64,24 +54,29 @@ class SandboxEnvironment(abc.ABC):
64
54
  return []
65
55
 
66
56
  @classmethod
67
- async def task_init(cls, task_name: str, config: str | None) -> None:
57
+ async def task_init(
58
+ cls, task_name: str, config: SandboxEnvironmentConfigType | None
59
+ ) -> None:
68
60
  """Called at task startup initialize resources.
69
61
 
70
62
  Args:
71
63
  task_name (str): Name of task using the sandbox environment.
72
- config (str): Implementation defined configuration file (optional).
64
+ config (SandboxEnvironmentConfigType): Implementation defined configuration (optional).
73
65
  """
74
66
  pass
75
67
 
76
68
  @classmethod
77
69
  async def sample_init(
78
- cls, task_name: str, config: str | None, metadata: dict[str, str]
70
+ cls,
71
+ task_name: str,
72
+ config: SandboxEnvironmentConfigType | None,
73
+ metadata: dict[str, str],
79
74
  ) -> dict[str, "SandboxEnvironment"]:
80
75
  """Initialize sandbox environments for a sample.
81
76
 
82
77
  Args:
83
78
  task_name (str): Name of task using the sandbox environment.
84
- config (str): Implementation defined configuration file (optional).
79
+ config (SandboxEnvironmentConfigType): Implementation defined configuration (optional).
85
80
  metadata (dict[str,str]): Sample `metadata` field
86
81
 
87
82
  Returns:
@@ -96,7 +91,7 @@ class SandboxEnvironment(abc.ABC):
96
91
  async def sample_cleanup(
97
92
  cls,
98
93
  task_name: str,
99
- config: str | None,
94
+ config: SandboxEnvironmentConfigType | None,
100
95
  environments: dict[str, "SandboxEnvironment"],
101
96
  interrupted: bool,
102
97
  ) -> None:
@@ -104,7 +99,7 @@ class SandboxEnvironment(abc.ABC):
104
99
 
105
100
  Args:
106
101
  task_name (str): Name of task using the sandbox environment.
107
- config (str): Implementation defined configuration file (optional).
102
+ config (SandboxEnvironmentConfigType): Implementation defined configuration (optional).
108
103
  environments (dict[str,SandboxEnvironment]): Sandbox environments created for this sample.
109
104
  interrupted (bool): Was the task interrupted by an error or cancellation
110
105
  """
@@ -112,13 +107,13 @@ class SandboxEnvironment(abc.ABC):
112
107
 
113
108
  @classmethod
114
109
  async def task_cleanup(
115
- cls, task_name: str, config: str | None, cleanup: bool
110
+ cls, task_name: str, config: SandboxEnvironmentConfigType | None, cleanup: bool
116
111
  ) -> None:
117
112
  """Called at task exit as a last chance to cleanup resources.
118
113
 
119
114
  Args:
120
115
  task_name (str): Name of task using the sandbox environment.
121
- config (str): Implementation defined configuration file (optional).
116
+ config (SandboxEnvironmentConfigType): Implementation defined configuration (optional).
122
117
  cleanup (bool): Whether to actually cleanup environment resources
123
118
  (False if `--no-sandbox-cleanup` was specified)
124
119
  """
@@ -227,6 +222,15 @@ class SandboxEnvironment(abc.ABC):
227
222
  ...
228
223
 
229
224
  async def connection(self) -> SandboxConnection:
225
+ """Information required to connect to sandbox environment.
226
+
227
+ Returns:
228
+ SandboxConnection: connection information
229
+
230
+ Raises:
231
+ NotImplementedError: For sandboxes that don't provide connections
232
+ ConnectionError: If sandbox is not currently running.
233
+ """
230
234
  raise NotImplementedError("connection not implemented")
231
235
 
232
236
 
@@ -248,8 +252,10 @@ class SandboxEnvironmentSpec(NamedTuple):
248
252
  """Specification of a SandboxEnvironment."""
249
253
 
250
254
  type: str
251
- config: str | None = None
255
+ config: SandboxEnvironmentConfigType | None = None
256
+
252
257
 
258
+ SandboxEnvironmentConfigType = BaseModel | str
253
259
 
254
260
  SandboxEnvironmentType = SandboxEnvironmentSpec | str | tuple[str, str]
255
261
  """SandboxEnvironmentSpec and str and tuple shorthands for it.
@@ -7,8 +7,15 @@ import aiofiles
7
7
  from typing_extensions import override
8
8
 
9
9
  from .._subprocess import ExecResult, subprocess
10
- from .environment import SandboxConnection, SandboxConnectionLocal, SandboxEnvironment
11
- from .limits import verify_exec_result_size, verify_read_file_size
10
+ from .environment import (
11
+ SandboxEnvironment,
12
+ SandboxEnvironmentConfigType,
13
+ )
14
+ from .limits import (
15
+ SandboxEnvironmentLimits,
16
+ verify_exec_result_size,
17
+ verify_read_file_size,
18
+ )
12
19
  from .registry import sandboxenv
13
20
 
14
21
 
@@ -17,7 +24,10 @@ class LocalSandboxEnvironment(SandboxEnvironment):
17
24
  @override
18
25
  @classmethod
19
26
  async def sample_init(
20
- cls, task_name: str, config: str | None, metadata: dict[str, str]
27
+ cls,
28
+ task_name: str,
29
+ config: SandboxEnvironmentConfigType | None,
30
+ metadata: dict[str, str],
21
31
  ) -> dict[str, SandboxEnvironment]:
22
32
  return {"default": LocalSandboxEnvironment()}
23
33
 
@@ -26,7 +36,7 @@ class LocalSandboxEnvironment(SandboxEnvironment):
26
36
  async def sample_cleanup(
27
37
  cls,
28
38
  task_name: str,
29
- config: str | None,
39
+ config: SandboxEnvironmentConfigType | None,
30
40
  environments: dict[str, SandboxEnvironment],
31
41
  interrupted: bool,
32
42
  ) -> None:
@@ -63,6 +73,7 @@ class LocalSandboxEnvironment(SandboxEnvironment):
63
73
  cwd=final_cwd,
64
74
  env=env,
65
75
  timeout=timeout,
76
+ output_limit=SandboxEnvironmentLimits.MAX_EXEC_OUTPUT_SIZE,
66
77
  )
67
78
  verify_exec_result_size(result)
68
79
  return result
@@ -97,12 +108,6 @@ class LocalSandboxEnvironment(SandboxEnvironment):
97
108
  async with aiofiles.open(file, "rb") as f:
98
109
  return await f.read()
99
110
 
100
- @override
101
- async def connection(self) -> SandboxConnection:
102
- return SandboxConnectionLocal(
103
- command="/bin/bash --login", working_dir=self.directory.name
104
- )
105
-
106
111
  def _resolve_file(self, file: str) -> str:
107
112
  path = Path(file)
108
113
  if path.is_absolute():
@@ -39,6 +39,7 @@ async def subprocess(
39
39
  cwd: str | Path | None = None,
40
40
  env: dict[str, str] = {},
41
41
  capture_output: bool = True,
42
+ output_limit: int | None = None,
42
43
  timeout: int | None = None,
43
44
  ) -> ExecResult[str]: ...
44
45
 
@@ -51,6 +52,7 @@ async def subprocess(
51
52
  cwd: str | Path | None = None,
52
53
  env: dict[str, str] = {},
53
54
  capture_output: bool = True,
55
+ output_limit: int | None = None,
54
56
  timeout: int | None = None,
55
57
  ) -> ExecResult[bytes]: ...
56
58
 
@@ -62,6 +64,7 @@ async def subprocess(
62
64
  cwd: str | Path | None = None,
63
65
  env: dict[str, str] = {},
64
66
  capture_output: bool = True,
67
+ output_limit: int | None = None,
65
68
  timeout: int | None = None,
66
69
  ) -> Union[ExecResult[str], ExecResult[bytes]]:
67
70
  """Execute and wait for a subprocess.
@@ -80,6 +83,8 @@ async def subprocess(
80
83
  env (dict[str, str]): Additional environment variables.
81
84
  capture_output (bool): Capture stderr and stdout into ExecResult
82
85
  (if False, then output is redirected to parent stderr/stdout)
86
+ output_limit (int | None): Stop reading output if it exceeds
87
+ the specified limit (in bytes).
83
88
  timeout (int | None): Timeout. If the timeout expires then
84
89
  a `TimeoutError` will be raised.
85
90
 
@@ -119,10 +124,45 @@ async def subprocess(
119
124
  # yield the proc
120
125
  yield proc
121
126
 
127
+ # write stdin if specified
128
+ if proc.stdin is not None:
129
+ if input is not None:
130
+ proc.stdin.write(input)
131
+ await proc.stdin.drain()
132
+ proc.stdin.close()
133
+ await proc.stdin.wait_closed()
134
+
135
+ # read streams incrementally so we can check output limits
136
+ async def read_stream(stream: asyncio.StreamReader | None) -> bytes:
137
+ # return early for no stream
138
+ if stream is None:
139
+ return bytes()
140
+
141
+ # read 8k at a time
142
+ output = bytearray()
143
+ while True:
144
+ # read chunk and terminate if we are done
145
+ chunk = await stream.read(8192)
146
+ if not chunk:
147
+ break
148
+
149
+ # append to output
150
+ output.extend(chunk)
151
+
152
+ # stop if we have a limit and we have exceeded it
153
+ if output_limit is not None and len(output) > output_limit:
154
+ proc.kill()
155
+ break
156
+
157
+ # return stream output
158
+ return bytes(output)
159
+
122
160
  # wait for it to execute and yield result
123
- stdout, stderr = await proc.communicate(input=input)
124
- success = proc.returncode == 0
125
- returncode = proc.returncode if proc.returncode is not None else 1
161
+ stdout, stderr = await asyncio.gather(
162
+ read_stream(proc.stdout), read_stream(proc.stderr)
163
+ )
164
+ returncode = await proc.wait()
165
+ success = returncode == 0
126
166
  if text:
127
167
  yield ExecResult[str](
128
168
  success=success,
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: inspect_ai
3
- Version: 0.3.49
3
+ Version: 0.3.51
4
4
  Summary: Framework for large language model evaluations
5
5
  Author: UK AI Safety Institute
6
6
  License: MIT License
@@ -68,7 +68,7 @@ Requires-Dist: pytest-asyncio; extra == "dev"
68
68
  Requires-Dist: pytest-cov; extra == "dev"
69
69
  Requires-Dist: pytest-dotenv; extra == "dev"
70
70
  Requires-Dist: pytest-xdist; extra == "dev"
71
- Requires-Dist: ruff==0.8.1; extra == "dev"
71
+ Requires-Dist: ruff==0.8.2; extra == "dev"
72
72
  Requires-Dist: textual-dev>=0.86.2; extra == "dev"
73
73
  Requires-Dist: types-PyYAML; extra == "dev"
74
74
  Requires-Dist: types-aiofiles; extra == "dev"