inspect-ai 0.3.56__py3-none-any.whl → 0.3.57__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/_display/core/panel.py +1 -1
- inspect_ai/_eval/run.py +16 -11
- inspect_ai/_util/datetime.py +1 -1
- inspect_ai/_util/deprecation.py +1 -1
- inspect_ai/_util/json.py +11 -1
- inspect_ai/_util/logger.py +2 -1
- inspect_ai/_util/trace.py +39 -3
- inspect_ai/_util/transcript.py +36 -7
- inspect_ai/_view/www/.prettierrc.js +12 -0
- inspect_ai/_view/www/dist/assets/index.js +286 -224
- inspect_ai/_view/www/log-schema.json +124 -125
- inspect_ai/_view/www/src/App.mjs +18 -9
- inspect_ai/_view/www/src/Types.mjs +0 -1
- inspect_ai/_view/www/src/api/Types.mjs +15 -4
- inspect_ai/_view/www/src/api/api-http.mjs +2 -0
- inspect_ai/_view/www/src/components/ExpandablePanel.mjs +2 -2
- inspect_ai/_view/www/src/components/FindBand.mjs +5 -4
- inspect_ai/_view/www/src/components/LargeModal.mjs +1 -1
- inspect_ai/_view/www/src/components/MessageContent.mjs +1 -1
- inspect_ai/_view/www/src/components/TabSet.mjs +1 -1
- inspect_ai/_view/www/src/components/Tools.mjs +18 -3
- inspect_ai/_view/www/src/components/VirtualList.mjs +15 -17
- inspect_ai/_view/www/src/log/remoteLogFile.mjs +2 -1
- inspect_ai/_view/www/src/navbar/Navbar.mjs +44 -32
- inspect_ai/_view/www/src/samples/SampleDisplay.mjs +1 -2
- inspect_ai/_view/www/src/samples/SampleList.mjs +35 -4
- inspect_ai/_view/www/src/samples/SampleScoreView.mjs +13 -2
- inspect_ai/_view/www/src/samples/SampleScores.mjs +11 -2
- inspect_ai/_view/www/src/samples/SamplesDescriptor.mjs +238 -178
- inspect_ai/_view/www/src/samples/SamplesTab.mjs +4 -2
- inspect_ai/_view/www/src/samples/tools/SampleFilter.mjs +5 -5
- inspect_ai/_view/www/src/samples/tools/SelectScorer.mjs +7 -0
- inspect_ai/_view/www/src/samples/tools/SortFilter.mjs +3 -3
- inspect_ai/_view/www/src/samples/transcript/ToolEventView.mjs +1 -1
- inspect_ai/_view/www/src/types/log.d.ts +2 -8
- inspect_ai/_view/www/src/workspace/WorkSpace.mjs +1 -1
- inspect_ai/log/_log.py +25 -0
- inspect_ai/log/_recorders/eval.py +2 -0
- inspect_ai/model/_call_tools.py +27 -5
- inspect_ai/model/_providers/google.py +24 -6
- inspect_ai/model/_providers/openai.py +17 -3
- inspect_ai/model/_providers/openai_o1.py +10 -12
- inspect_ai/tool/_tool_info.py +2 -1
- inspect_ai/tool/_tools/_web_browser/_resources/dm_env_servicer.py +9 -9
- inspect_ai/tool/_tools/_web_browser/_web_browser.py +3 -3
- inspect_ai/util/__init__.py +4 -0
- inspect_ai/util/_sandbox/docker/compose.py +1 -3
- inspect_ai/util/_sandbox/docker/util.py +2 -1
- inspect_ai/util/_sandbox/self_check.py +18 -18
- inspect_ai/util/_store.py +2 -2
- inspect_ai/util/_subprocess.py +3 -3
- {inspect_ai-0.3.56.dist-info → inspect_ai-0.3.57.dist-info}/METADATA +3 -3
- {inspect_ai-0.3.56.dist-info → inspect_ai-0.3.57.dist-info}/RECORD +57 -56
- {inspect_ai-0.3.56.dist-info → inspect_ai-0.3.57.dist-info}/WHEEL +1 -1
- {inspect_ai-0.3.56.dist-info → inspect_ai-0.3.57.dist-info}/LICENSE +0 -0
- {inspect_ai-0.3.56.dist-info → inspect_ai-0.3.57.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.56.dist-info → inspect_ai-0.3.57.dist-info}/top_level.txt +0 -0
@@ -112,7 +112,7 @@ def tasks_title(completed: int, total: int) -> str:
|
|
112
112
|
def task_title(profile: TaskProfile, show_model: bool) -> str:
|
113
113
|
eval_epochs = profile.eval_config.epochs or 1
|
114
114
|
epochs = f" x {profile.eval_config.epochs}" if eval_epochs > 1 else ""
|
115
|
-
samples = f"{profile.samples//eval_epochs:,}{epochs} sample{'s' if profile.samples != 1 else ''}"
|
115
|
+
samples = f"{profile.samples // eval_epochs:,}{epochs} sample{'s' if profile.samples != 1 else ''}"
|
116
116
|
title = f"{registry_unqualified_name(profile.name)} ({samples})"
|
117
117
|
if show_model:
|
118
118
|
title = f"{title}: {profile.model}"
|
inspect_ai/_eval/run.py
CHANGED
@@ -42,7 +42,7 @@ from .task.log import TaskLogger
|
|
42
42
|
from .task.run import TaskRunOptions, task_run
|
43
43
|
from .task.rundir import task_run_dir_switching
|
44
44
|
from .task.sandbox import TaskSandboxEnvironment, resolve_sandbox_for_task
|
45
|
-
from .task.util import task_run_dir
|
45
|
+
from .task.util import slice_dataset, task_run_dir
|
46
46
|
|
47
47
|
log = logging.getLogger(__name__)
|
48
48
|
|
@@ -70,12 +70,23 @@ async def eval_run(
|
|
70
70
|
# get cwd before switching to task dir
|
71
71
|
eval_wd = os.getcwd()
|
72
72
|
|
73
|
+
# ensure sample ids
|
74
|
+
for resolved_task in tasks:
|
75
|
+
# add sample ids to dataset if they aren't there (start at 1 not 0)
|
76
|
+
task = resolved_task.task
|
77
|
+
for id, sample in enumerate(task.dataset):
|
78
|
+
if sample.id is None:
|
79
|
+
sample.id = id + 1
|
80
|
+
|
81
|
+
# Ensure sample ids are unique
|
82
|
+
ensure_unique_ids(task.dataset)
|
83
|
+
|
73
84
|
# run startup pass for the sandbox environments
|
74
85
|
shutdown_sandbox_environments: Callable[[], Awaitable[None]] | None = None
|
75
86
|
if has_sandbox:
|
76
87
|
cleanup = eval_config.sandbox_cleanup is not False
|
77
88
|
shutdown_sandbox_environments = await startup_sandbox_environments(
|
78
|
-
resolve_sandbox_environment(eval_sandbox), tasks, cleanup
|
89
|
+
resolve_sandbox_environment(eval_sandbox), tasks, eval_config, cleanup
|
79
90
|
)
|
80
91
|
|
81
92
|
# resolve solver and solver spec
|
@@ -146,14 +157,6 @@ async def eval_run(
|
|
146
157
|
else:
|
147
158
|
task.fail_on_error = task_eval_config.fail_on_error
|
148
159
|
|
149
|
-
# add sample ids to dataset if they aren't there (start at 1 not 0)
|
150
|
-
for id, sample in enumerate(task.dataset):
|
151
|
-
if sample.id is None:
|
152
|
-
sample.id = id + 1
|
153
|
-
|
154
|
-
# Ensure sample ids are unique
|
155
|
-
ensure_unique_ids(task.dataset)
|
156
|
-
|
157
160
|
# create and track the logger
|
158
161
|
logger = TaskLogger(
|
159
162
|
task_name=task.name,
|
@@ -340,13 +343,15 @@ async def run_multiple(tasks: list[TaskRunOptions], parallel: int) -> list[EvalL
|
|
340
343
|
async def startup_sandbox_environments(
|
341
344
|
eval_sandbox: SandboxEnvironmentSpec | None,
|
342
345
|
tasks: list[ResolvedTask],
|
346
|
+
config: EvalConfig,
|
343
347
|
cleanup: bool,
|
344
348
|
) -> Callable[[], Awaitable[None]]:
|
345
349
|
# find unique sandboxenvs
|
346
350
|
sandboxenvs: Set[TaskSandboxEnvironment] = set()
|
347
351
|
for task in tasks:
|
348
352
|
# resolve each sample and add to sandboxenvs
|
349
|
-
|
353
|
+
dataset = slice_dataset(task.task.dataset, config.limit, config.sample_id)
|
354
|
+
for sample in dataset:
|
350
355
|
sandbox = resolve_sandbox_for_task(eval_sandbox, task.task, sample)
|
351
356
|
if sandbox is not None and sandbox not in sandboxenvs:
|
352
357
|
sandboxenvs.add(sandbox)
|
inspect_ai/_util/datetime.py
CHANGED
@@ -4,7 +4,7 @@ from typing import Literal
|
|
4
4
|
|
5
5
|
def iso_now(
|
6
6
|
timespec: Literal[
|
7
|
-
"auto", "hours", "minutes", "seconds", "milliseconds" "microseconds"
|
7
|
+
"auto", "hours", "minutes", "seconds", "milliseconds", "microseconds"
|
8
8
|
] = "seconds",
|
9
9
|
) -> str:
|
10
10
|
return datetime.now().astimezone().isoformat(timespec=timespec)
|
inspect_ai/_util/deprecation.py
CHANGED
@@ -174,7 +174,7 @@ def default_deprecation_msg(
|
|
174
174
|
|
175
175
|
_qual = getattr(obj, "__qualname__", "") or ""
|
176
176
|
if _qual.endswith(".__init__") or _qual.endswith(".__new__"):
|
177
|
-
_obj = f
|
177
|
+
_obj = f" class ({_qual.rsplit('.', 1)[0]})"
|
178
178
|
elif _qual and _obj:
|
179
179
|
_obj += f" ({_qual})"
|
180
180
|
|
inspect_ai/_util/json.py
CHANGED
@@ -103,10 +103,20 @@ def json_changes(
|
|
103
103
|
paths = json_change.path.split("/")[1:]
|
104
104
|
replaced = before
|
105
105
|
for path in paths:
|
106
|
-
|
106
|
+
decoded_path = decode_json_pointer_segment(path)
|
107
|
+
index: Any = (
|
108
|
+
int(decoded_path) if decoded_path.isnumeric() else decoded_path
|
109
|
+
)
|
107
110
|
replaced = replaced[index]
|
108
111
|
json_change.replaced = replaced
|
109
112
|
changes.append(json_change)
|
110
113
|
return changes
|
111
114
|
else:
|
112
115
|
return None
|
116
|
+
|
117
|
+
|
118
|
+
def decode_json_pointer_segment(segment: str) -> str:
|
119
|
+
"""Decode a single JSON Pointer segment."""
|
120
|
+
# JSON Pointers encode ~ and / because they are special characters
|
121
|
+
# this decodes these values (https://www.rfc-editor.org/rfc/rfc6901)
|
122
|
+
return segment.replace("~1", "/").replace("~0", "~")
|
inspect_ai/_util/logger.py
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
import atexit
|
2
2
|
import os
|
3
|
+
import re
|
3
4
|
from logging import (
|
4
5
|
DEBUG,
|
5
6
|
INFO,
|
@@ -182,7 +183,7 @@ def notify_logger_record(record: LogRecord, write: bool) -> None:
|
|
182
183
|
if write:
|
183
184
|
transcript()._event(LoggerEvent(message=LoggingMessage.from_log_record(record)))
|
184
185
|
global _rate_limit_count
|
185
|
-
if (record.levelno <= INFO and "
|
186
|
+
if (record.levelno <= INFO and re.search(r"\b429\b", record.getMessage())) or (
|
186
187
|
record.levelno == DEBUG
|
187
188
|
# See https://boto3.amazonaws.com/v1/documentation/api/latest/guide/retries.html#validating-retry-attempts
|
188
189
|
# for boto retry logic / log messages (this is tracking standard or adaptive retries)
|
inspect_ai/_util/trace.py
CHANGED
@@ -33,6 +33,22 @@ def inspect_trace_file() -> Path:
|
|
33
33
|
def trace_action(
|
34
34
|
logger: Logger, action: str, message: str, *args: Any, **kwargs: Any
|
35
35
|
) -> Generator[None, None, None]:
|
36
|
+
"""Trace a long running or potentially unreliable action.
|
37
|
+
|
38
|
+
Trace actions for which you want to collect data on the resolution
|
39
|
+
(e.g. succeeded, cancelled, failed, timed out, etc.) and duration of.
|
40
|
+
|
41
|
+
Traces are written to the `TRACE` log level (which is just below
|
42
|
+
`HTTP` and `INFO`). List and read trace logs with `inspect trace list`
|
43
|
+
and related commands (see `inspect trace --help` for details).
|
44
|
+
|
45
|
+
Args:
|
46
|
+
logger (Logger): Logger to use for tracing (e.g. from `getLogger(__name__)`)
|
47
|
+
action (str): Name of action to trace (e.g. 'Model', 'Subprocess', etc.)
|
48
|
+
message (str): Message describing action (can be a format string w/ args or kwargs)
|
49
|
+
*args (Any): Positional arguments for `message` format string.
|
50
|
+
**kwargs (Any): Named args for `message` format string.
|
51
|
+
"""
|
36
52
|
trace_id = uuid()
|
37
53
|
start_monotonic = time.monotonic()
|
38
54
|
start_wall = time.time()
|
@@ -117,6 +133,19 @@ def trace_action(
|
|
117
133
|
def trace_message(
|
118
134
|
logger: Logger, category: str, message: str, *args: Any, **kwargs: Any
|
119
135
|
) -> None:
|
136
|
+
"""Log a message using the TRACE log level.
|
137
|
+
|
138
|
+
The `TRACE` log level is just below `HTTP` and `INFO`. List and
|
139
|
+
read trace logs with `inspect trace list` and related commands
|
140
|
+
(see `inspect trace --help` for details).
|
141
|
+
|
142
|
+
Args:
|
143
|
+
logger (Logger): Logger to use for tracing (e.g. from `getLogger(__name__)`)
|
144
|
+
category (str): Category of trace message.
|
145
|
+
message (str): Trace message (can be a format string w/ args or kwargs)
|
146
|
+
*args (Any): Positional arguments for `message` format string.
|
147
|
+
**kwargs (Any): Named args for `message` format string.
|
148
|
+
"""
|
120
149
|
logger.log(TRACE, f"[{category}] {message}", *args, **kwargs)
|
121
150
|
|
122
151
|
|
@@ -250,9 +279,16 @@ def read_trace_file(file: Path) -> list[TraceRecord]:
|
|
250
279
|
|
251
280
|
|
252
281
|
def rotate_trace_files() -> None:
|
253
|
-
|
254
|
-
|
255
|
-
|
282
|
+
# if multiple inspect processes start up at once they
|
283
|
+
# will all be attempting to rotate at the same time,
|
284
|
+
# which can lead to FileNotFoundError -- ignore these
|
285
|
+
# errors if they occur
|
286
|
+
try:
|
287
|
+
rotate_files = list_trace_files()[10:]
|
288
|
+
for file in rotate_files:
|
289
|
+
file.file.unlink(missing_ok=True)
|
290
|
+
except FileNotFoundError:
|
291
|
+
pass
|
256
292
|
|
257
293
|
|
258
294
|
def compress_trace_log(log_handler: FileHandler) -> Callable[[], None]:
|
inspect_ai/_util/transcript.py
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
import html
|
2
|
+
import re
|
2
3
|
from typing import Any
|
3
4
|
|
4
5
|
from rich.align import AlignMethod
|
@@ -19,13 +20,43 @@ def transcript_code_theme() -> str:
|
|
19
20
|
def transcript_markdown(content: str, *, escape: bool = False) -> Markdown:
|
20
21
|
code_theme = transcript_code_theme()
|
21
22
|
return Markdown(
|
22
|
-
|
23
|
+
html_escape_markdown(content) if escape else content,
|
23
24
|
code_theme=code_theme,
|
24
25
|
inline_code_lexer="python",
|
25
26
|
inline_code_theme=code_theme,
|
26
27
|
)
|
27
28
|
|
28
29
|
|
30
|
+
def html_escape_markdown(content: str) -> str:
|
31
|
+
"""Escape markdown lines that aren't in a code block."""
|
32
|
+
codeblock_pattern = re.compile("`{3,}")
|
33
|
+
current_codeblock = ""
|
34
|
+
escaped: list[str] = []
|
35
|
+
lines = content.splitlines()
|
36
|
+
for line in lines:
|
37
|
+
# look for matching end of codeblock
|
38
|
+
if current_codeblock:
|
39
|
+
if current_codeblock in line:
|
40
|
+
current_codeblock = ""
|
41
|
+
escaped.append(line)
|
42
|
+
continue
|
43
|
+
|
44
|
+
# look for beginning of codeblock
|
45
|
+
match = codeblock_pattern.search(line)
|
46
|
+
if match:
|
47
|
+
current_codeblock = match[0]
|
48
|
+
escaped.append(line)
|
49
|
+
continue
|
50
|
+
|
51
|
+
# escape if we are not in a codeblock
|
52
|
+
if current_codeblock:
|
53
|
+
escaped.append(line)
|
54
|
+
else:
|
55
|
+
escaped.append(html.escape(line, quote=False))
|
56
|
+
|
57
|
+
return "\n".join(escaped)
|
58
|
+
|
59
|
+
|
29
60
|
def set_transcript_markdown_options(markdown: Markdown) -> None:
|
30
61
|
code_theme = transcript_code_theme()
|
31
62
|
markdown.code_theme = code_theme
|
@@ -89,12 +120,10 @@ def transcript_function(function: str, arguments: dict[str, Any]) -> RenderableT
|
|
89
120
|
return transcript_markdown("```python\n" + call + "\n```\n")
|
90
121
|
|
91
122
|
|
92
|
-
DOUBLE_LINE = Box(
|
93
|
-
" ══ \n" " \n" " \n" " \n" " \n" " \n" " \n" " \n"
|
94
|
-
)
|
123
|
+
DOUBLE_LINE = Box(" ══ \n \n \n \n \n \n \n \n")
|
95
124
|
|
96
|
-
LINE = Box(" ── \n
|
125
|
+
LINE = Box(" ── \n \n \n \n \n \n \n \n")
|
97
126
|
|
98
|
-
DOTTED = Box(" ·· \n
|
127
|
+
DOTTED = Box(" ·· \n \n \n \n \n \n \n \n")
|
99
128
|
|
100
|
-
NOBORDER = Box(" \n
|
129
|
+
NOBORDER = Box(" \n \n \n \n \n \n \n \n")
|
@@ -0,0 +1,12 @@
|
|
1
|
+
// Do not remove this file even if the config is empty!
|
2
|
+
// VSCode's "Format Document" will respect this config and use the default
|
3
|
+
// settings, which is what we want. Without prettierrc, VSCode falls back to
|
4
|
+
// users settings, which could be different.
|
5
|
+
|
6
|
+
/**
|
7
|
+
* @see https://prettier.io/docs/en/configuration.html
|
8
|
+
* @type {import("prettier").Config}
|
9
|
+
*/
|
10
|
+
const config = {};
|
11
|
+
|
12
|
+
export default config;
|