inspect-ai 0.3.99__py3-none-any.whl → 0.3.100__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/_display/core/config.py +11 -5
- inspect_ai/_display/core/panel.py +66 -2
- inspect_ai/_display/core/textual.py +5 -2
- inspect_ai/_display/plain/display.py +1 -0
- inspect_ai/_display/rich/display.py +2 -2
- inspect_ai/_display/textual/widgets/transcript.py +37 -9
- inspect_ai/_eval/score.py +2 -4
- inspect_ai/_eval/task/run.py +59 -81
- inspect_ai/_util/content.py +11 -6
- inspect_ai/_util/interrupt.py +2 -2
- inspect_ai/_util/text.py +7 -0
- inspect_ai/_util/working.py +8 -37
- inspect_ai/_view/__init__.py +0 -0
- inspect_ai/_view/schema.py +2 -1
- inspect_ai/_view/www/CLAUDE.md +15 -0
- inspect_ai/_view/www/dist/assets/index.css +263 -159
- inspect_ai/_view/www/dist/assets/index.js +22153 -19093
- inspect_ai/_view/www/log-schema.json +77 -3
- inspect_ai/_view/www/package.json +5 -1
- inspect_ai/_view/www/src/@types/log.d.ts +9 -0
- inspect_ai/_view/www/src/app/App.tsx +1 -15
- inspect_ai/_view/www/src/app/appearance/icons.ts +4 -1
- inspect_ai/_view/www/src/app/content/MetaDataGrid.tsx +24 -6
- inspect_ai/_view/www/src/app/content/MetadataGrid.module.css +0 -5
- inspect_ai/_view/www/src/app/content/RenderedContent.tsx +220 -205
- inspect_ai/_view/www/src/app/log-view/LogViewContainer.tsx +2 -1
- inspect_ai/_view/www/src/app/log-view/tabs/SamplesTab.tsx +5 -0
- inspect_ai/_view/www/src/app/routing/url.ts +84 -4
- inspect_ai/_view/www/src/app/samples/InlineSampleDisplay.module.css +0 -5
- inspect_ai/_view/www/src/app/samples/SampleDialog.module.css +1 -1
- inspect_ai/_view/www/src/app/samples/SampleDisplay.module.css +7 -0
- inspect_ai/_view/www/src/app/samples/SampleDisplay.tsx +24 -17
- inspect_ai/_view/www/src/app/samples/SampleSummaryView.module.css +1 -2
- inspect_ai/_view/www/src/app/samples/chat/ChatMessage.tsx +8 -6
- inspect_ai/_view/www/src/app/samples/chat/ChatMessageRow.tsx +0 -4
- inspect_ai/_view/www/src/app/samples/chat/ChatViewVirtualList.tsx +3 -2
- inspect_ai/_view/www/src/app/samples/chat/MessageContent.tsx +2 -0
- inspect_ai/_view/www/src/app/samples/chat/MessageContents.tsx +2 -0
- inspect_ai/_view/www/src/app/samples/chat/messages.ts +1 -0
- inspect_ai/_view/www/src/app/samples/chat/tools/ToolCallView.tsx +1 -0
- inspect_ai/_view/www/src/app/samples/list/SampleRow.tsx +1 -1
- inspect_ai/_view/www/src/app/samples/transcript/ErrorEventView.tsx +1 -2
- inspect_ai/_view/www/src/app/samples/transcript/InfoEventView.tsx +1 -1
- inspect_ai/_view/www/src/app/samples/transcript/InputEventView.tsx +1 -2
- inspect_ai/_view/www/src/app/samples/transcript/ModelEventView.module.css +1 -1
- inspect_ai/_view/www/src/app/samples/transcript/ModelEventView.tsx +1 -1
- inspect_ai/_view/www/src/app/samples/transcript/SampleInitEventView.tsx +1 -1
- inspect_ai/_view/www/src/app/samples/transcript/SampleLimitEventView.tsx +3 -2
- inspect_ai/_view/www/src/app/samples/transcript/SandboxEventView.tsx +4 -5
- inspect_ai/_view/www/src/app/samples/transcript/ScoreEventView.tsx +1 -1
- inspect_ai/_view/www/src/app/samples/transcript/SpanEventView.tsx +1 -2
- inspect_ai/_view/www/src/app/samples/transcript/StepEventView.tsx +1 -3
- inspect_ai/_view/www/src/app/samples/transcript/SubtaskEventView.tsx +1 -2
- inspect_ai/_view/www/src/app/samples/transcript/ToolEventView.tsx +3 -4
- inspect_ai/_view/www/src/app/samples/transcript/TranscriptPanel.module.css +42 -0
- inspect_ai/_view/www/src/app/samples/transcript/TranscriptPanel.tsx +77 -0
- inspect_ai/_view/www/src/app/samples/transcript/TranscriptVirtualList.tsx +27 -71
- inspect_ai/_view/www/src/app/samples/transcript/TranscriptVirtualListComponent.module.css +13 -3
- inspect_ai/_view/www/src/app/samples/transcript/TranscriptVirtualListComponent.tsx +27 -2
- inspect_ai/_view/www/src/app/samples/transcript/event/EventPanel.module.css +1 -0
- inspect_ai/_view/www/src/app/samples/transcript/event/EventPanel.tsx +21 -22
- inspect_ai/_view/www/src/app/samples/transcript/outline/OutlineRow.module.css +45 -0
- inspect_ai/_view/www/src/app/samples/transcript/outline/OutlineRow.tsx +223 -0
- inspect_ai/_view/www/src/app/samples/transcript/outline/TranscriptOutline.module.css +10 -0
- inspect_ai/_view/www/src/app/samples/transcript/outline/TranscriptOutline.tsx +258 -0
- inspect_ai/_view/www/src/app/samples/transcript/outline/tree-visitors.ts +187 -0
- inspect_ai/_view/www/src/app/samples/transcript/state/StateEventRenderers.tsx +8 -1
- inspect_ai/_view/www/src/app/samples/transcript/state/StateEventView.tsx +3 -4
- inspect_ai/_view/www/src/app/samples/transcript/transform/hooks.ts +78 -0
- inspect_ai/_view/www/src/app/samples/transcript/transform/treeify.ts +340 -135
- inspect_ai/_view/www/src/app/samples/transcript/transform/utils.ts +3 -0
- inspect_ai/_view/www/src/app/samples/transcript/types.ts +2 -0
- inspect_ai/_view/www/src/app/types.ts +5 -1
- inspect_ai/_view/www/src/client/api/api-browser.ts +2 -2
- inspect_ai/_view/www/src/components/LiveVirtualList.tsx +6 -1
- inspect_ai/_view/www/src/components/MarkdownDiv.tsx +1 -1
- inspect_ai/_view/www/src/components/PopOver.tsx +422 -0
- inspect_ai/_view/www/src/components/PulsingDots.module.css +9 -9
- inspect_ai/_view/www/src/components/PulsingDots.tsx +4 -1
- inspect_ai/_view/www/src/components/StickyScroll.tsx +183 -0
- inspect_ai/_view/www/src/components/TabSet.tsx +4 -0
- inspect_ai/_view/www/src/state/hooks.ts +52 -2
- inspect_ai/_view/www/src/state/logSlice.ts +4 -3
- inspect_ai/_view/www/src/state/samplePolling.ts +8 -0
- inspect_ai/_view/www/src/state/sampleSlice.ts +53 -9
- inspect_ai/_view/www/src/state/scrolling.ts +152 -0
- inspect_ai/_view/www/src/utils/attachments.ts +7 -0
- inspect_ai/_view/www/src/utils/python.ts +18 -0
- inspect_ai/_view/www/yarn.lock +269 -6
- inspect_ai/agent/_react.py +12 -7
- inspect_ai/agent/_run.py +2 -3
- inspect_ai/analysis/beta/_dataframe/samples/table.py +19 -18
- inspect_ai/log/_log.py +1 -1
- inspect_ai/log/_recorders/file.py +2 -9
- inspect_ai/log/_transcript.py +1 -1
- inspect_ai/model/_call_tools.py +6 -2
- inspect_ai/model/_openai.py +1 -1
- inspect_ai/model/_openai_responses.py +78 -39
- inspect_ai/model/_openai_web_search.py +31 -0
- inspect_ai/model/_providers/azureai.py +72 -3
- inspect_ai/model/_providers/openai.py +2 -1
- inspect_ai/scorer/_metric.py +1 -2
- inspect_ai/solver/_task_state.py +2 -2
- inspect_ai/tool/_tool.py +6 -2
- inspect_ai/tool/_tool_def.py +27 -4
- inspect_ai/tool/_tool_info.py +2 -0
- inspect_ai/tool/_tools/_web_search/_google.py +15 -4
- inspect_ai/tool/_tools/_web_search/_tavily.py +35 -12
- inspect_ai/tool/_tools/_web_search/_web_search.py +214 -45
- inspect_ai/util/__init__.py +4 -0
- inspect_ai/util/_json.py +3 -0
- inspect_ai/util/_limit.py +230 -20
- inspect_ai/util/_sandbox/docker/compose.py +20 -11
- inspect_ai/util/_span.py +1 -1
- {inspect_ai-0.3.99.dist-info → inspect_ai-0.3.100.dist-info}/METADATA +3 -3
- {inspect_ai-0.3.99.dist-info → inspect_ai-0.3.100.dist-info}/RECORD +120 -106
- {inspect_ai-0.3.99.dist-info → inspect_ai-0.3.100.dist-info}/WHEEL +1 -1
- {inspect_ai-0.3.99.dist-info → inspect_ai-0.3.100.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.99.dist-info → inspect_ai-0.3.100.dist-info}/licenses/LICENSE +0 -0
- {inspect_ai-0.3.99.dist-info → inspect_ai-0.3.100.dist-info}/top_level.txt +0 -0
inspect_ai/_display/core/config.py
CHANGED
@@ -1,4 +1,8 @@
+from rich.console import RenderableType
+from rich.text import Text
+
 from inspect_ai._util.registry import is_model_dict, is_registry_dict
+from inspect_ai._util.text import truncate_text
 from inspect_ai.log._log import eval_config_defaults

 from .display import TaskProfile
@@ -6,7 +10,7 @@ from .display import TaskProfile

 def task_config(
     profile: TaskProfile, generate_config: bool = True, style: str = ""
-) -> str:
+) -> RenderableType:
     # merge config
     # wind params back for display
     task_args = dict(profile.task_args)
@@ -39,15 +43,17 @@ def task_config(
         elif name not in ["limit", "model", "response_schema", "log_shared"]:
             if isinstance(value, list):
                 value = ",".join([str(v) for v in value])
+            elif isinstance(value, dict):
+                value = "{...}"
             if isinstance(value, str):
+                value = truncate_text(value, 50)
                 value = value.replace("[", "\\[")
             config_print.append(f"{name}: {value}")
     values = ", ".join(config_print)
     if values:
-
-
-
-        return values
+        values_text = Text(values, style=style)
+        values_text.truncate(500, overflow="ellipsis")
+        return values_text
     else:
         return ""

inspect_ai/_display/core/panel.py
CHANGED
@@ -9,6 +9,7 @@ from rich.text import Text
 from inspect_ai._util.constants import CONSOLE_DISPLAY_WIDTH
 from inspect_ai._util.path import cwd_relative_path
 from inspect_ai._util.registry import registry_unqualified_name
+from inspect_ai.util._display import display_type

 from .display import TaskProfile
 from .rich import is_vscode_notebook, rich_theme
@@ -24,7 +25,13 @@ def task_panel(
     | None,
     footer: RenderableType | tuple[RenderableType, RenderableType] | None,
     log_location: str | None,
-) -> Panel:
+) -> RenderableType:
+    # dispatch to plain handler if we are in plain mode
+    if display_type() == "plain":
+        return task_panel_plain(
+            profile, show_model, body, subtitle, footer, log_location
+        )
+
     # rendering context
     theme = rich_theme()
     console = rich.get_console()
@@ -93,7 +100,7 @@ task_panel(
     # create panel w/ title
     panel = Panel(
         root,
-        title=
+        title=task_panel_title(profile, show_model),
         title_align="left",
         width=width,
         expand=True,
@@ -101,6 +108,63 @@
     return panel


+def task_panel_plain(
+    profile: TaskProfile,
+    show_model: bool,
+    body: RenderableType,
+    subtitle: RenderableType
+    | str
+    | Tuple[RenderableType | str, RenderableType | str]
+    | None,
+    footer: RenderableType | tuple[RenderableType, RenderableType] | None,
+    log_location: str | None,
+) -> RenderableType:
+    # delimiter text
+    delimeter = "---------------------------------------------------------"
+
+    # root table for output
+    table = Table.grid(expand=False)
+    table.add_column()
+    table.add_row(delimeter)
+
+    # title and subtitle
+    table.add_row(task_panel_title(profile, show_model))
+    if isinstance(subtitle, tuple):
+        subtitle = subtitle[0]
+    table.add_row(subtitle)
+
+    # task info
+    if body:
+        table.add_row(body)
+
+    # footer
+    if isinstance(footer, tuple):
+        footer = footer[0]
+    if footer:
+        table.add_row(footer)
+
+    # log location
+    if log_location:
+        # Print a cwd relative path
+        try:
+            log_location_relative = cwd_relative_path(log_location, walk_up=True)
+        except ValueError:
+            log_location_relative = log_location
+        table.add_row(f"Log: {log_location_relative}")
+
+    table.add_row(delimeter)
+    table.add_row("")
+
+    return table
+
+
+def task_panel_title(profile: TaskProfile, show_model: bool) -> str:
+    theme = rich_theme()
+    return (
+        f"[bold][{theme.meta}]{task_title(profile, show_model)}[/{theme.meta}][/bold]"
+    )
+
+
 def to_renderable(item: RenderableType | str, style: str = "") -> RenderableType:
     if isinstance(item, str):
         return Text.from_markup(item, style=style)
inspect_ai/_display/core/textual.py
CHANGED
@@ -8,8 +8,6 @@ logger = getLogger(__name__)
 # force mouse support for textual -- this works around an issue where
 # mouse events are disabled after a reload of the vs code ide, see:
 # https://github.com/Textualize/textual/issues/5380
-# ansi codes for enabling mouse support are idempotent so it is fine
-# to do this even in cases where mouse support is already enabled.
 # we try/catch since we aren't 100% sure there aren't cases where doing
 # this won't raise and we'd rather not fail hard in in these case
 def textual_enable_mouse_support(driver: Driver) -> None:
@@ -17,5 +15,10 @@ def textual_enable_mouse_support(driver: Driver) -> None:
     if enable_mouse_support:
         try:
             enable_mouse_support()
+            # Re-enable SGR-Pixels format if it was previously enabled.
+            # See #1943.
+            enable_mouse_pixels = getattr(driver, "_enable_mouse_pixels", None)
+            if enable_mouse_pixels and getattr(driver, "_mouse_pixels", False):
+                enable_mouse_pixels()
         except Exception as ex:
             logger.warning(f"Error enabling mouse support: {ex}")
inspect_ai/_display/rich/display.py
CHANGED
@@ -341,8 +341,6 @@ def tasks_live_status(

     # get config
     config = task_config(tasks[0].profile, generate_config=False, style=theme.light)
-    if config:
-        config += "\n"

     # build footer table
     footer_table = Table.grid(expand=True)
@@ -356,6 +354,8 @@ def tasks_live_status(
     layout_table = Table.grid(expand=True)
     layout_table.add_column()
     layout_table.add_row(config)
+    if config:
+        layout_table.add_row("")
     layout_table.add_row(progress)
     layout_table.add_row(footer_table)

inspect_ai/_display/textual/widgets/transcript.py
CHANGED
@@ -105,9 +105,28 @@ class TranscriptView(ScrollableContainer):
         self, events: Sequence[Event], limit: int = 10
     ) -> list[Widget]:
         widgets: list[Widget] = []
-
-        #
-
+
+        # filter the events to the <limit> most recent
+        filtered_events = events
+        if len(events) > limit:
+            filtered_events = filtered_events[-limit:]
+
+        # find the sample init event
+        sample_init: SampleInitEvent | None = None
+        for event in events:
+            if isinstance(event, SampleInitEvent):
+                sample_init = event
+                break
+
+        # add the sample init event if it isn't already in the event list
+        if sample_init and sample_init not in filtered_events:
+            filtered_events = [sample_init] + list(filtered_events)
+
+        # compute how many events we filtered out
+        filtered_count = len(events) - len(filtered_events)
+        showed_filtered_count = False
+
+        for event in filtered_events:
            display = render_event(event)
            if display:
                for d in display:
@@ -123,14 +142,23 @@ class TranscriptView(ScrollableContainer):
                        set_transcript_markdown_options(d.content)
                        widgets.append(Static(d.content, markup=False))
                        widgets.append(Static(Text(" ")))
-                       widget_count += 1

-
-
-
+            if not showed_filtered_count and filtered_count > 0:
+                showed_filtered_count = True
+
+                widgets.append(
+                    Static(
+                        transcript_separator(
+                            f"{filtered_count} events..."
+                            if filtered_count > 1
+                            else "1 event...",
+                            self.app.current_theme.primary,
+                        )
+                    )
+                )
+                widgets.append(Static(Text(" ")))

-
-        return widgets[::-1]
+        return widgets


 class EventDisplay(NamedTuple):
inspect_ai/_eval/score.py
CHANGED
@@ -1,7 +1,7 @@
 import functools
 from copy import deepcopy
 from pathlib import Path
-from typing import Any, Callable, Literal, cast
+from typing import Any, Callable, Literal

 import anyio

@@ -270,9 +270,7 @@ def metrics_from_log(log: EvalLog) -> list[Metric] | dict[str, list[Metric]] | None:


 def metric_from_log(metric: EvalMetricDefinition) -> Metric:
-    return cast(
-        Metric, registry_create("metric", metric.name, **(metric.options or {}))
-    )
+    return registry_create("metric", metric.name, **(metric.options or {}))


 def reducers_from_log(log: EvalLog) -> list[ScoreReducer] | None:
inspect_ai/_eval/task/run.py
CHANGED
@@ -35,11 +35,7 @@ from inspect_ai._util.registry import (
     registry_log_name,
     registry_unqualified_name,
 )
-from inspect_ai._util.working import (
-    end_sample_working_limit,
-    init_sample_working_limit,
-    sample_waiting_time,
-)
+from inspect_ai._util.working import init_sample_working_time, sample_waiting_time
 from inspect_ai._view.notify import view_notify_eval
 from inspect_ai.dataset import Dataset, Sample
 from inspect_ai.log import (
@@ -90,6 +86,8 @@ from inspect_ai.solver._fork import set_task_generate
 from inspect_ai.solver._solver import Solver
 from inspect_ai.solver._task_state import sample_state, set_sample_state, state_jsonable
 from inspect_ai.util._limit import LimitExceededError
+from inspect_ai.util._limit import time_limit as create_time_limit
+from inspect_ai.util._limit import working_limit as create_working_limit
 from inspect_ai.util._sandbox.context import sandbox_connections
 from inspect_ai.util._sandbox.environment import SandboxEnvironmentSpec
 from inspect_ai.util._span import span
@@ -635,10 +633,6 @@ async def task_run_sample(
     )

     async with sandboxenv_cm:
-        timeout_cm: (
-            contextlib._GeneratorContextManager[anyio.CancelScope]
-            | contextlib.nullcontext[None]
-        ) = contextlib.nullcontext()
        try:
            # update active sample wth sandboxes now that we are initialised
            # (ensure that we still exit init context in presence of sandbox error)
@@ -647,19 +641,17 @@ async def task_run_sample(
            finally:
                await init_span.__aexit__(None, None, None)

-           # initialise timeout context manager
-           timeout_cm = (
-               anyio.fail_after(time_limit)
-               if time_limit is not None
-               else contextlib.nullcontext()
-           )
-
            # record start time
            start_time = time.monotonic()
-
-
-           # run sample w/ optional
-           with
+           init_sample_working_time(start_time)
+
+           # run sample w/ optional limits
+           with (
+               state._token_limit,
+               state._message_limit,
+               create_time_limit(time_limit),
+               create_working_limit(working_limit),
+           ):
                # mark started
                active.started = datetime.now().timestamp()

@@ -675,24 +667,15 @@ async def task_run_sample(
                )

                # set progress for plan then run it
-
-
-               # disable sample working limit after execution
-               end_sample_working_limit()
+               async with span("solvers"):
+                   state = await plan(state, generate)

            except TimeoutError:
-
-
-
-
-
-                       limit=time_limit,
-                   )
-               )
-               else:
-                   py_logger.warning(
-                       "Unexpected timeout error reached top of sample stack. Are you handling TimeoutError when applying timeouts?"
-                   )
+               # Scoped time limits manifest themselves as LimitExceededError, not
+               # TimeoutError.
+               py_logger.warning(
+                   "Unexpected timeout error reached top of sample stack. Are you handling TimeoutError when applying timeouts?"
+               )

            # capture most recent state for scoring
            state = sample_state() or state
@@ -737,54 +720,59 @@ async def task_run_sample(
            # the cause of the timeout is a hung container and scoring requires
            # interacting with the container). as a middle ground we use half
            # of the original timeout value for scoring.
-           if time_limit
-               timeout_cm = anyio.fail_after(time_limit / 2)
+           scoring_time_limit = time_limit / 2 if time_limit else None

            set_sample_state(state)

            # scoring
            try:
                # timeout during scoring will result in an ordinary sample error
-               with
+               with create_time_limit(scoring_time_limit):
                    if error is None:
-
-
-
-
-                       async with span(name=scorer_name, type="scorer"):
-                           score_result = (
-                               await scorer(state, Target(sample.target))
-                               if scorer
-                               else None
+                       async with span(name="scorers"):
+                           for scorer in scorers or []:
+                               scorer_name = unique_scorer_name(
+                                   scorer, list(results.keys())
                                )
-
-
-
-
-
-
+                               async with span(name=scorer_name, type="scorer"):
+                                   score_result = (
+                                       await scorer(state, Target(sample.target))
+                                       if scorer
+                                       else None
+                                   )
+                                   if score_result is not None:
+                                       sample_score = SampleScore(
+                                           score=score_result,
+                                           sample_id=sample.id,
+                                           sample_metadata=sample.metadata,
+                                           scorer=registry_unqualified_name(
+                                               scorer
+                                           ),
+                                       )
+                                       transcript()._event(
+                                           ScoreEvent(
+                                               score=score_result,
+                                               target=sample.target,
+                                           )
+                                       )
+                                       results[scorer_name] = sample_score
+
+                           # add scores returned by solvers
+                           if state.scores is not None:
+                               for name, score in state.scores.items():
+                                   results[name] = SampleScore(
+                                       score=score,
+                                       sample_id=state.sample_id,
+                                       sample_metadata=state.metadata,
                                    )
                                    transcript()._event(
                                        ScoreEvent(
-                                           score=
+                                           score=score, target=sample.target
                                        )
                                    )
-                       results[scorer_name] = sample_score
-
-                       # add scores returned by solvers
-                       if state.scores is not None:
-                           for name, score in state.scores.items():
-                               results[name] = SampleScore(
-                                   score=score,
-                                   sample_id=state.sample_id,
-                                   sample_metadata=state.metadata,
-                               )
-                               transcript()._event(
-                                   ScoreEvent(score=score, target=sample.target)
-                               )

-
-
+                           # propagate results into scores
+                           state.scores = {k: v.score for k, v in results.items()}

            except anyio.get_cancelled_exc_class():
                if active.interrupt_action:
@@ -798,17 +786,7 @@ async def task_run_sample(
                raise

            except BaseException as ex:
-               #
-               if isinstance(ex, TimeoutError):
-                   transcript()._event(
-                       SampleLimitEvent(
-                           type="time",
-                           message=f"Unable to score sample due to exceeded time limit ({time_limit:,} seconds)",
-                           limit=time_limit,
-                       )
-                   )
-
-               # handle error (this will throw if we've exceeded the limit)
+               # handle error
                error, raise_error = handle_error(ex)

            except Exception as ex:
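The sample runner now composes its limits from the scoped context managers in inspect_ai.util._limit instead of an ad-hoc anyio.fail_after timeout. A minimal sketch of that pattern, assuming the same time_limit/working_limit managers and LimitExceededError imported above (the awaited work is a stand-in, not part of the diff):

    import anyio

    from inspect_ai.util._limit import LimitExceededError, time_limit, working_limit


    async def main() -> None:
        # both managers accept None (meaning "no limit"); a violated limit is
        # signalled by LimitExceededError rather than TimeoutError
        try:
            with time_limit(30), working_limit(20):
                await anyio.sleep(1)  # stand-in for real solver/scorer work
        except LimitExceededError as ex:
            print(f"sample stopped by limit: {ex}")


    if __name__ == "__main__":
        anyio.run(main)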
inspect_ai/_util/content.py
CHANGED
@@ -1,9 +1,14 @@
 from typing import Literal, Union

-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, JsonValue


-class ContentText(BaseModel):
+class ContentBase(BaseModel):
+    internal: JsonValue | None = Field(default=None)
+    """Model provider specific payload - typically used to aid transformation back to model types."""
+
+
+class ContentText(ContentBase):
     """Text content."""

     type: Literal["text"] = Field(default="text")
@@ -16,7 +21,7 @@ class ContentText(BaseModel):
     """Was this a refusal message?"""


-class ContentReasoning(BaseModel):
+class ContentReasoning(ContentBase):
     """Reasoning content.

     See the specification for [thinking blocks](https://docs.anthropic.com/en/docs/build-with-claude/extended-thinking#understanding-thinking-blocks) for Claude models.
@@ -35,7 +40,7 @@ class ContentReasoning(BaseModel):
     """Indicates that the explicit content of this reasoning block has been redacted."""


-class ContentImage(BaseModel):
+class ContentImage(ContentBase):
     """Image content."""

     type: Literal["image"] = Field(default="image")
@@ -51,7 +56,7 @@ class ContentImage(BaseModel):
     """


-class ContentAudio(BaseModel):
+class ContentAudio(ContentBase):
     """Audio content."""

     type: Literal["audio"] = Field(default="audio")
@@ -64,7 +69,7 @@ class ContentAudio(BaseModel):
     """Format of audio data ('mp3' or 'wav')"""


-class ContentVideo(BaseModel):
+class ContentVideo(ContentBase):
     """Video content."""

     type: Literal["video"] = Field(default="video")
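Every content type now inherits an optional internal field from ContentBase. A hedged sketch of how a provider adapter might use it (the payload shown is hypothetical, and text is assumed to be ContentText's existing content field):

    from inspect_ai._util.content import ContentText

    # stash a provider-specific payload alongside the text so the content can be
    # mapped back to the provider's native block type later
    content = ContentText(
        text="The answer is 42.",
        internal={"provider_block_id": "abc123"},  # hypothetical provider payload
    )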
inspect_ai/_util/interrupt.py
CHANGED
@@ -1,6 +1,6 @@
 import anyio

-from .working import check_sample_working_limit
+from inspect_ai.util._limit import check_working_limit


 def check_sample_interrupt() -> None:
@@ -12,4 +12,4 @@ def check_sample_interrupt() -> None:
         raise anyio.get_cancelled_exc_class()

     # check for working_limit
-    check_sample_working_limit()
+    check_working_limit()
inspect_ai/_util/text.py
CHANGED
@@ -1,12 +1,19 @@
 import random
 import re
 import string
+import textwrap
 from logging import getLogger
 from typing import List, NamedTuple

 logger = getLogger(__name__)


+def truncate_text(text: str, max_length: int) -> str:
+    if len(text) <= max_length:
+        return text
+    return textwrap.shorten(text, width=max_length, placeholder="...")
+
+
 def strip_punctuation(s: str) -> str:
     return s.strip(string.whitespace + string.punctuation)

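The new internal helper returns short strings unchanged and otherwise shortens on a word boundary to at most max_length characters with a trailing ellipsis (via textwrap.shorten). A quick usage sketch:

    from inspect_ai._util.text import truncate_text

    truncate_text("short", 50)                     # returned unchanged
    truncate_text("one two three four five", 15)   # e.g. "one two..."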
inspect_ai/_util/working.py
CHANGED
@@ -1,19 +1,14 @@
 import time
 from contextvars import ContextVar

-from inspect_ai.util._limit import LimitExceededError
+from inspect_ai.util._limit import check_working_limit, record_waiting_time


-def init_sample_working_limit(start_time: float, working_limit: float | None) -> None:
-    _sample_working_limit.set(working_limit)
+def init_sample_working_time(start_time: float) -> None:
     _sample_start_time.set(start_time)
     _sample_waiting_time.set(0)


-def end_sample_working_limit() -> None:
-    _sample_working_limit.set(None)
-
-
 def sample_waiting_time() -> float:
     return _sample_waiting_time.get()

@@ -23,37 +18,13 @@ def sample_working_time() -> float:


 def report_sample_waiting_time(waiting_time: float) -> None:
+    # record and check for scoped limits
+    record_waiting_time(waiting_time)
+    check_working_limit()
+
+    # record sample-level limits
     _sample_waiting_time.set(_sample_waiting_time.get() + waiting_time)
-
-
-
-def check_sample_working_limit() -> None:
-    from inspect_ai.log._transcript import SampleLimitEvent, transcript
-
-    # no check if we don't have a limit
-    working_limit = _sample_working_limit.get()
-    if working_limit is None:
-        return
-
-    # are we over the limit?
-    running_time = time.monotonic() - _sample_start_time.get()
-    working_time = running_time - sample_waiting_time()
-    if working_time > working_limit:
-        message = f"Exceeded working time limit ({working_limit:,} seconds)"
-        transcript()._event(
-            SampleLimitEvent(type="working", limit=int(working_limit), message=message)
-        )
-        raise LimitExceededError(
-            type="working",
-            value=int(working_time),
-            limit=int(working_limit),
-            message=message,
-        )
-
-
-_sample_working_limit: ContextVar[float | None] = ContextVar(
-    "sample_working_limit", default=None
-)
+

 _sample_start_time: ContextVar[float] = ContextVar("sample_start_time", default=0)


inspect_ai/_view/__init__.py
File without changes
inspect_ai/_view/schema.py
CHANGED
@@ -45,9 +45,10 @@ def sync_view_schema() -> None:
             "false",
         ],
         cwd=WWW_DIR,
+        check=True,
     )

-    subprocess.run(["yarn", "prettier:write"], cwd=types_path.parent)
+    subprocess.run(["yarn", "prettier:write"], cwd=types_path.parent, check=True)

     shutil.copyfile(types_path, vs_code_types_path)

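Adding check=True makes the schema sync fail loudly: subprocess.run raises CalledProcessError on a non-zero exit instead of silently continuing. For example:

    import subprocess

    try:
        subprocess.run(["yarn", "prettier:write"], check=True)
    except subprocess.CalledProcessError as ex:
        print(f"prettier failed with exit code {ex.returncode}")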
inspect_ai/_view/www/CLAUDE.md
ADDED
@@ -0,0 +1,15 @@
+# CLAUDE.md
+
+This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
+
+## Build/Lint/Test Commands
+
+- Run type checker, linter, and tests: `yarn check-all`
+- Run type checker: `yarn tsc`
+- Run linter: `yarn lint`
+- Run tests: `yarn test`
+
+## Other Information
+
+- The code in this project is typescript, learn more about the configuration by inspecting package.json.
+- Respect existing code patterns when modifying files. Run linting before committing changes.