inspect-ai 0.3.55__py3-none-any.whl → 0.3.57__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/__init__.py +1 -0
- inspect_ai/_cli/common.py +1 -1
- inspect_ai/_cli/trace.py +33 -20
- inspect_ai/_display/core/active.py +1 -1
- inspect_ai/_display/core/display.py +1 -1
- inspect_ai/_display/core/footer.py +1 -1
- inspect_ai/_display/core/panel.py +1 -1
- inspect_ai/_display/core/progress.py +0 -6
- inspect_ai/_display/core/rich.py +1 -1
- inspect_ai/_display/rich/display.py +2 -2
- inspect_ai/_display/textual/app.py +15 -17
- inspect_ai/_display/textual/widgets/clock.py +3 -3
- inspect_ai/_display/textual/widgets/samples.py +6 -13
- inspect_ai/_eval/context.py +9 -1
- inspect_ai/_eval/run.py +16 -11
- inspect_ai/_eval/score.py +4 -10
- inspect_ai/_eval/task/results.py +5 -4
- inspect_ai/_eval/task/run.py +6 -12
- inspect_ai/_eval/task/task.py +10 -0
- inspect_ai/_util/ansi.py +31 -0
- inspect_ai/_util/datetime.py +1 -1
- inspect_ai/_util/deprecation.py +1 -1
- inspect_ai/_util/format.py +7 -0
- inspect_ai/_util/json.py +11 -1
- inspect_ai/_util/logger.py +14 -13
- inspect_ai/_util/throttle.py +10 -1
- inspect_ai/_util/trace.py +79 -47
- inspect_ai/_util/transcript.py +37 -4
- inspect_ai/_util/vscode.py +51 -0
- inspect_ai/_view/notify.py +2 -1
- inspect_ai/_view/www/.prettierrc.js +12 -0
- inspect_ai/_view/www/App.css +22 -1
- inspect_ai/_view/www/dist/assets/index.css +2374 -2
- inspect_ai/_view/www/dist/assets/index.js +29752 -24492
- inspect_ai/_view/www/log-schema.json +262 -215
- inspect_ai/_view/www/package.json +1 -0
- inspect_ai/_view/www/src/App.mjs +19 -9
- inspect_ai/_view/www/src/Types.mjs +0 -1
- inspect_ai/_view/www/src/api/Types.mjs +15 -4
- inspect_ai/_view/www/src/api/api-http.mjs +2 -0
- inspect_ai/_view/www/src/appearance/Icons.mjs +2 -0
- inspect_ai/_view/www/src/components/AsciiCinemaPlayer.mjs +74 -0
- inspect_ai/_view/www/src/components/CopyButton.mjs +0 -1
- inspect_ai/_view/www/src/components/ExpandablePanel.mjs +2 -2
- inspect_ai/_view/www/src/components/FindBand.mjs +5 -4
- inspect_ai/_view/www/src/components/HumanBaselineView.mjs +168 -0
- inspect_ai/_view/www/src/components/LargeModal.mjs +1 -1
- inspect_ai/_view/www/src/components/LightboxCarousel.mjs +217 -0
- inspect_ai/_view/www/src/components/MessageContent.mjs +1 -1
- inspect_ai/_view/www/src/components/TabSet.mjs +1 -1
- inspect_ai/_view/www/src/components/Tools.mjs +28 -5
- inspect_ai/_view/www/src/components/VirtualList.mjs +15 -17
- inspect_ai/_view/www/src/log/remoteLogFile.mjs +2 -1
- inspect_ai/_view/www/src/navbar/Navbar.mjs +44 -32
- inspect_ai/_view/www/src/samples/SampleDisplay.mjs +1 -2
- inspect_ai/_view/www/src/samples/SampleList.mjs +35 -4
- inspect_ai/_view/www/src/samples/SampleScoreView.mjs +13 -2
- inspect_ai/_view/www/src/samples/SampleScores.mjs +11 -2
- inspect_ai/_view/www/src/samples/SamplesDescriptor.mjs +238 -178
- inspect_ai/_view/www/src/samples/SamplesTab.mjs +4 -2
- inspect_ai/_view/www/src/samples/tools/SampleFilter.mjs +5 -5
- inspect_ai/_view/www/src/samples/tools/SelectScorer.mjs +7 -0
- inspect_ai/_view/www/src/samples/tools/SortFilter.mjs +3 -3
- inspect_ai/_view/www/src/samples/transcript/ModelEventView.mjs +3 -2
- inspect_ai/_view/www/src/samples/transcript/ToolEventView.mjs +1 -1
- inspect_ai/_view/www/src/samples/transcript/TranscriptView.mjs +1 -0
- inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.mjs +56 -0
- inspect_ai/_view/www/src/samples/transcript/state/StateEventView.mjs +17 -5
- inspect_ai/_view/www/src/types/asciicinema-player.d.ts +26 -0
- inspect_ai/_view/www/src/types/log.d.ts +28 -20
- inspect_ai/_view/www/src/workspace/WorkSpace.mjs +1 -1
- inspect_ai/_view/www/yarn.lock +44 -0
- inspect_ai/approval/_apply.py +4 -0
- inspect_ai/approval/_human/panel.py +5 -8
- inspect_ai/dataset/_dataset.py +51 -10
- inspect_ai/dataset/_util.py +31 -3
- inspect_ai/log/__init__.py +2 -0
- inspect_ai/log/_log.py +30 -2
- inspect_ai/log/_recorders/eval.py +2 -0
- inspect_ai/model/_call_tools.py +31 -7
- inspect_ai/model/_chat_message.py +3 -0
- inspect_ai/model/_model.py +42 -1
- inspect_ai/model/_providers/anthropic.py +4 -0
- inspect_ai/model/_providers/google.py +24 -6
- inspect_ai/model/_providers/openai.py +17 -3
- inspect_ai/model/_providers/openai_o1.py +10 -12
- inspect_ai/model/_render.py +9 -2
- inspect_ai/scorer/_metric.py +12 -1
- inspect_ai/solver/__init__.py +2 -0
- inspect_ai/solver/_human_agent/agent.py +83 -0
- inspect_ai/solver/_human_agent/commands/__init__.py +36 -0
- inspect_ai/solver/_human_agent/commands/clock.py +70 -0
- inspect_ai/solver/_human_agent/commands/command.py +59 -0
- inspect_ai/solver/_human_agent/commands/instructions.py +74 -0
- inspect_ai/solver/_human_agent/commands/note.py +42 -0
- inspect_ai/solver/_human_agent/commands/score.py +80 -0
- inspect_ai/solver/_human_agent/commands/status.py +62 -0
- inspect_ai/solver/_human_agent/commands/submit.py +151 -0
- inspect_ai/solver/_human_agent/install.py +222 -0
- inspect_ai/solver/_human_agent/panel.py +252 -0
- inspect_ai/solver/_human_agent/service.py +45 -0
- inspect_ai/solver/_human_agent/state.py +55 -0
- inspect_ai/solver/_human_agent/view.py +24 -0
- inspect_ai/solver/_task_state.py +28 -2
- inspect_ai/tool/_tool.py +10 -2
- inspect_ai/tool/_tool_info.py +2 -1
- inspect_ai/tool/_tools/_web_browser/_resources/dm_env_servicer.py +9 -9
- inspect_ai/tool/_tools/_web_browser/_web_browser.py +16 -13
- inspect_ai/util/__init__.py +12 -4
- inspect_ai/{_util/display.py → util/_display.py} +6 -0
- inspect_ai/util/_panel.py +31 -9
- inspect_ai/util/_sandbox/__init__.py +0 -3
- inspect_ai/util/_sandbox/context.py +5 -1
- inspect_ai/util/_sandbox/docker/compose.py +17 -13
- inspect_ai/util/_sandbox/docker/docker.py +9 -6
- inspect_ai/util/_sandbox/docker/internal.py +1 -1
- inspect_ai/util/_sandbox/docker/util.py +3 -2
- inspect_ai/util/_sandbox/environment.py +6 -5
- inspect_ai/util/_sandbox/local.py +1 -1
- inspect_ai/util/_sandbox/self_check.py +18 -18
- inspect_ai/util/_sandbox/service.py +22 -7
- inspect_ai/util/_store.py +7 -8
- inspect_ai/util/_store_model.py +110 -0
- inspect_ai/util/_subprocess.py +3 -3
- inspect_ai/util/_throttle.py +32 -0
- {inspect_ai-0.3.55.dist-info → inspect_ai-0.3.57.dist-info}/METADATA +3 -3
- {inspect_ai-0.3.55.dist-info → inspect_ai-0.3.57.dist-info}/RECORD +131 -108
- {inspect_ai-0.3.55.dist-info → inspect_ai-0.3.57.dist-info}/WHEEL +1 -1
- {inspect_ai-0.3.55.dist-info → inspect_ai-0.3.57.dist-info}/LICENSE +0 -0
- {inspect_ai-0.3.55.dist-info → inspect_ai-0.3.57.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.55.dist-info → inspect_ai-0.3.57.dist-info}/top_level.txt +0 -0
inspect_ai/util/_sandbox/service.py
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
import asyncio
|
2
2
|
import json
|
3
|
+
from logging import getLogger
|
3
4
|
from pathlib import PurePosixPath
|
4
5
|
from textwrap import dedent
|
5
6
|
from typing import (
|
@@ -14,9 +15,12 @@ from inspect_ai.util._subprocess import ExecResult
|
|
14
15
|
|
15
16
|
from .environment import SandboxEnvironment
|
16
17
|
|
18
|
+
logger = getLogger(__name__)
|
19
|
+
|
20
|
+
|
17
21
|
REQUESTS_DIR = "requests"
|
18
22
|
RESPONSES_DIR = "responses"
|
19
|
-
SERVICES_DIR = "/tmp/
|
23
|
+
SERVICES_DIR = "/var/tmp/sandbox-services"
|
20
24
|
|
21
25
|
ID = "id"
|
22
26
|
METHOD = "method"
|
@@ -70,7 +74,7 @@ class SandboxService:
|
|
70
74
|
|
71
75
|
```python
|
72
76
|
import sys
|
73
|
-
sys.path.append("/tmp/
|
77
|
+
sys.path.append("/var/tmp/sandbox-services/foo")
|
74
78
|
import foo
|
75
79
|
```
|
76
80
|
|
@@ -79,7 +83,7 @@ class SandboxService:
|
|
79
83
|
```python
|
80
84
|
import importlib.util
|
81
85
|
spec = importlib.util.spec_from_file_location(
|
82
|
-
"foo", "/tmp/
|
86
|
+
"foo", "/var/tmp/sandbox-services/foo/foo.py"
|
83
87
|
)
|
84
88
|
foo = importlib.util.module_from_spec(spec)
|
85
89
|
spec.loader.exec_module(foo)
|
@@ -150,8 +154,14 @@ class SandboxService:
|
|
150
154
|
f"Error reading request for service {self._name}: '{read_request}' ({result.stderr})"
|
151
155
|
)
|
152
156
|
|
153
|
-
# parse request
|
154
|
-
|
157
|
+
# parse request (a decode error can occur if it's incomplete, so bypass this)
|
158
|
+
try:
|
159
|
+
request_data = json.loads(result.stdout)
|
160
|
+
except json.JSONDecodeError:
|
161
|
+
logger.warning(
|
162
|
+
f"JSON decoding error reading service request: {result.stdout}"
|
163
|
+
)
|
164
|
+
return None
|
155
165
|
if not isinstance(request_data, dict):
|
156
166
|
raise TypeError(f"Service request is not a dict (type={request_data})")
|
157
167
|
|
@@ -275,7 +285,7 @@ class SandboxService:
|
|
275
285
|
return request_id
|
276
286
|
|
277
287
|
def _read_{self._name}_response(request_id: str) -> tuple[bool, Any]:
|
278
|
-
from json import load
|
288
|
+
from json import JSONDecodeError, load
|
279
289
|
from pathlib import Path
|
280
290
|
|
281
291
|
responses_dir = Path("{SERVICES_DIR}", "{self._name}", "{RESPONSES_DIR}")
|
@@ -283,7 +293,12 @@ class SandboxService:
|
|
283
293
|
if response_path.exists():
|
284
294
|
# read and remove the file
|
285
295
|
with open(response_path, "r") as f:
|
286
|
-
|
296
|
+
# it's possible the file is still being written so
|
297
|
+
# just catch and wait for another retry if this occurs
|
298
|
+
try:
|
299
|
+
response = load(f)
|
300
|
+
except JSONDecodeError:
|
301
|
+
return False, None
|
287
302
|
response_path.unlink()
|
288
303
|
|
289
304
|
# raise error if we have one
|
inspect_ai/util/_store.py
CHANGED
@@ -34,18 +34,14 @@ class Store:
|
|
34
34
|
inheriting from Pydantic `BaseModel`)
|
35
35
|
"""
|
36
36
|
|
37
|
-
def __init__(self) -> None:
|
38
|
-
self._data
|
37
|
+
def __init__(self, data: dict[str, Any] | None = None) -> None:
|
38
|
+
self._data = deepcopy(data) if data else {}
|
39
39
|
|
40
40
|
@overload
|
41
|
-
def get(self, key: str, default: None = None) -> Any:
|
42
|
-
return self._data.get(key, default)
|
41
|
+
def get(self, key: str, default: None = None) -> Any: ...
|
43
42
|
|
44
43
|
@overload
|
45
|
-
def get(self, key: str, default: VT) -> VT:
|
46
|
-
if key not in self._data.keys():
|
47
|
-
self._data[key] = default
|
48
|
-
return cast(VT, self._data.get(key, default))
|
44
|
+
def get(self, key: str, default: VT) -> VT: ...
|
49
45
|
|
50
46
|
def get(self, key: str, default: VT | None = None) -> VT | Any:
|
51
47
|
"""Get a value from the store.
|
@@ -60,6 +56,9 @@ class Store:
|
|
60
56
|
Returns:
|
61
57
|
Value if it exists, otherwise default.
|
62
58
|
"""
|
59
|
+
if default is not None:
|
60
|
+
if key not in self._data.keys():
|
61
|
+
self._data[key] = default
|
63
62
|
return cast(VT, self._data.get(key, default))
|
64
63
|
|
65
64
|
def set(self, key: str, value: Any) -> None:
|
inspect_ai/util/_store_model.py
ADDED
@@ -0,0 +1,110 @@
|
|
1
|
+
from typing import Any, Type, TypeVar
|
2
|
+
|
3
|
+
from pydantic import BaseModel, ConfigDict, Field
|
4
|
+
|
5
|
+
from ._store import Store, store
|
6
|
+
|
7
|
+
|
8
|
+
class StoreModel(BaseModel):
|
9
|
+
"""Store-backed Pydantic BaseModel.
|
10
|
+
|
11
|
+
The model is initialised from a Store, so that Store should
|
12
|
+
either already satisfy the validation constraints of the model
|
13
|
+
OR you should provide Field(default=) annotations for all of
|
14
|
+
your model fields (the latter approach is recommended).
|
15
|
+
"""
|
16
|
+
|
17
|
+
store: Store = Field(exclude=True, default_factory=store)
|
18
|
+
|
19
|
+
def model_post_init(self, __context: Any) -> None:
|
20
|
+
for name in self.model_fields.keys():
|
21
|
+
if name == "store":
|
22
|
+
continue
|
23
|
+
# if it's in the store, then have our dict reflect that
|
24
|
+
ns_name = self._ns_name(name)
|
25
|
+
if ns_name in self.store:
|
26
|
+
self.__dict__[name] = self.store.get(ns_name)
|
27
|
+
# if it's not in the store, then reflect dict into store
|
28
|
+
elif name in self.__dict__.keys():
|
29
|
+
self.store.set(ns_name, self.__dict__[name])
|
30
|
+
|
31
|
+
def __getattribute__(self, name: str) -> Any:
|
32
|
+
# sidestep dunders and pydantic fields
|
33
|
+
if name.startswith("__") or name.startswith("model_"):
|
34
|
+
return object.__getattribute__(self, name)
|
35
|
+
# handle model_fields (except 'store') by reading the store
|
36
|
+
elif name in object.__getattribute__(self, "model_fields") and name != "store":
|
37
|
+
store_key = self._ns_name(name)
|
38
|
+
if store_key in self.store:
|
39
|
+
return self.store.get(store_key)
|
40
|
+
else:
|
41
|
+
return object.__getattribute__(self, name)
|
42
|
+
# default to super
|
43
|
+
else:
|
44
|
+
return super().__getattribute__(name)
|
45
|
+
|
46
|
+
def __setattr__(self, name: str, value: Any) -> None:
|
47
|
+
if name in self.model_fields:
|
48
|
+
# validate with the new value (can throw ValidationError)
|
49
|
+
temp_data = self.store._data.copy()
|
50
|
+
temp_data[self._ns_name(name)] = value
|
51
|
+
self._validate_store(temp_data)
|
52
|
+
|
53
|
+
# update the store and sync the underlying __dict__
|
54
|
+
self.store.set(self._ns_name(name), value)
|
55
|
+
self.__dict__[name] = value
|
56
|
+
else:
|
57
|
+
super().__setattr__(name, value)
|
58
|
+
|
59
|
+
def model_dump(self, *args: Any, **kwargs: Any) -> dict[str, Any]:
|
60
|
+
self._sync_model() # in case store was updated behind our back
|
61
|
+
return super().model_dump(*args, **kwargs)
|
62
|
+
|
63
|
+
def model_dump_json(self, *args: Any, **kwargs: Any) -> str:
|
64
|
+
self._sync_model() # in case store was updated behind our back
|
65
|
+
return super().model_dump_json(*args, **kwargs)
|
66
|
+
|
67
|
+
def _sync_model(self) -> None:
|
68
|
+
self._validate_store()
|
69
|
+
for field_name in self.model_fields.keys():
|
70
|
+
if field_name == "store":
|
71
|
+
continue
|
72
|
+
store_value = self.store.get(self._ns_name(field_name))
|
73
|
+
self.__dict__[field_name] = store_value
|
74
|
+
|
75
|
+
def _validate_store(self, data: dict[str, Any] | None = None) -> None:
|
76
|
+
# validate store or custom dict
|
77
|
+
data = data if data is not None else self.store._data
|
78
|
+
|
79
|
+
# pick out keys to validate
|
80
|
+
validate: dict[str, Any] = {}
|
81
|
+
for k, v in data.items():
|
82
|
+
if k.startswith(f"{self.__class__.__name__}:"):
|
83
|
+
unprefixed = self._un_ns_name(k)
|
84
|
+
validate[unprefixed] = v
|
85
|
+
|
86
|
+
# perform validation
|
87
|
+
self.__class__.model_validate(validate)
|
88
|
+
|
89
|
+
def _ns_name(self, name: str) -> str:
|
90
|
+
return f"{self.__class__.__name__}:{name}"
|
91
|
+
|
92
|
+
def _un_ns_name(self, name: str) -> str:
|
93
|
+
return name.replace(f"{self.__class__.__name__}:", "", 1)
|
94
|
+
|
95
|
+
model_config = ConfigDict(arbitrary_types_allowed=True)
|
96
|
+
|
97
|
+
|
98
|
+
SMT = TypeVar("SMT", bound=StoreModel)
|
99
|
+
|
100
|
+
|
101
|
+
def store_as(model_cls: Type[SMT]) -> SMT:
|
102
|
+
"""Get a Pydantic model interface to the store.
|
103
|
+
|
104
|
+
Args:
|
105
|
+
model_cls: Pydantic model type (must derive from StoreModel)
|
106
|
+
|
107
|
+
Returns:
|
108
|
+
StoreModel: Instance of model_cls bound to current Store.
|
109
|
+
"""
|
110
|
+
return model_cls(store=store())
|
inspect_ai/util/_subprocess.py
CHANGED
@@ -101,9 +101,9 @@ async def subprocess(
|
|
101
101
|
input = input.encode() if isinstance(input, str) else input
|
102
102
|
|
103
103
|
# function to run command (we may or may not run it w/ concurrency)
|
104
|
-
async def run_command() ->
|
105
|
-
|
106
|
-
|
104
|
+
async def run_command() -> AsyncGenerator[
|
105
|
+
Union[Process, ExecResult[str], ExecResult[bytes]], None
|
106
|
+
]:
|
107
107
|
if isinstance(args, str):
|
108
108
|
proc = await asyncio.create_subprocess_shell(
|
109
109
|
args,
|
inspect_ai/util/_throttle.py
ADDED
@@ -0,0 +1,32 @@
|
|
1
|
+
import time
|
2
|
+
from functools import wraps
|
3
|
+
from typing import Any, Callable
|
4
|
+
|
5
|
+
|
6
|
+
def throttle(seconds: float) -> Callable[..., Any]:
|
7
|
+
"""Throttle a function to ensure it is called no more than every n seconds.
|
8
|
+
|
9
|
+
Args:
|
10
|
+
seconds (float): Throttle time.
|
11
|
+
|
12
|
+
Returns:
|
13
|
+
Callable: Throttled function.
|
14
|
+
"""
|
15
|
+
|
16
|
+
def decorator(func: Callable[..., Any]) -> Callable[..., Any]:
|
17
|
+
last_called: float = 0
|
18
|
+
last_result: Any = None
|
19
|
+
|
20
|
+
@wraps(func)
|
21
|
+
def wrapped(*args: Any, **kwargs: Any) -> Any:
|
22
|
+
nonlocal last_called
|
23
|
+
nonlocal last_result
|
24
|
+
current_time = time.time()
|
25
|
+
if current_time - last_called >= seconds:
|
26
|
+
last_result = func(*args, **kwargs)
|
27
|
+
last_called = current_time
|
28
|
+
return last_result
|
29
|
+
|
30
|
+
return wrapped
|
31
|
+
|
32
|
+
return decorator
|
{inspect_ai-0.3.55.dist-info → inspect_ai-0.3.57.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
|
|
1
|
-
Metadata-Version: 2.
|
1
|
+
Metadata-Version: 2.2
|
2
2
|
Name: inspect_ai
|
3
|
-
Version: 0.3.55
|
3
|
+
Version: 0.3.57
|
4
4
|
Summary: Framework for large language model evaluations
|
5
5
|
Author: UK AI Safety Institute
|
6
6
|
License: MIT License
|
@@ -67,7 +67,7 @@ Requires-Dist: pytest-asyncio; extra == "dev"
|
|
67
67
|
Requires-Dist: pytest-cov; extra == "dev"
|
68
68
|
Requires-Dist: pytest-dotenv; extra == "dev"
|
69
69
|
Requires-Dist: pytest-xdist; extra == "dev"
|
70
|
-
Requires-Dist: ruff==0.
|
70
|
+
Requires-Dist: ruff==0.9.0; extra == "dev"
|
71
71
|
Requires-Dist: textual-dev>=0.86.2; extra == "dev"
|
72
72
|
Requires-Dist: types-PyYAML; extra == "dev"
|
73
73
|
Requires-Dist: types-beautifulsoup4; extra == "dev"
|