inspect-ai 0.3.98__py3-none-any.whl → 0.3.99__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/__init__.py +2 -0
- inspect_ai/_cli/log.py +1 -1
- inspect_ai/_display/textual/widgets/transcript.py +15 -3
- inspect_ai/_eval/run.py +12 -4
- inspect_ai/_eval/task/log.py +1 -1
- inspect_ai/_eval/task/task.py +1 -1
- inspect_ai/_util/_async.py +1 -1
- inspect_ai/_view/schema.py +1 -0
- inspect_ai/_view/view.py +14 -0
- inspect_ai/_view/www/dist/assets/index.css +10 -10
- inspect_ai/_view/www/dist/assets/index.js +10 -10
- inspect_ai/_view/www/log-schema.json +45 -5
- inspect_ai/_view/www/src/@types/log.d.ts +11 -2
- inspect_ai/_view/www/src/app/content/RenderedContent.tsx +2 -1
- inspect_ai/_view/www/src/app/samples/SampleDisplay.tsx +2 -2
- inspect_ai/_view/www/src/app/samples/scores/SampleScoresGrid.module.css +2 -2
- inspect_ai/_view/www/src/app/samples/transcript/ErrorEventView.tsx +1 -1
- inspect_ai/agent/_run.py +44 -8
- inspect_ai/log/_bundle.py +5 -3
- inspect_ai/log/_log.py +2 -2
- inspect_ai/model/_providers/anthropic.py +3 -6
- inspect_ai/model/_providers/providers.py +1 -1
- inspect_ai/util/__init__.py +2 -0
- inspect_ai/util/_limit.py +160 -137
- {inspect_ai-0.3.98.dist-info → inspect_ai-0.3.99.dist-info}/METADATA +1 -1
- {inspect_ai-0.3.98.dist-info → inspect_ai-0.3.99.dist-info}/RECORD +30 -30
- {inspect_ai-0.3.98.dist-info → inspect_ai-0.3.99.dist-info}/WHEEL +1 -1
- {inspect_ai-0.3.98.dist-info → inspect_ai-0.3.99.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.98.dist-info → inspect_ai-0.3.99.dist-info}/licenses/LICENSE +0 -0
- {inspect_ai-0.3.98.dist-info → inspect_ai-0.3.99.dist-info}/top_level.txt +0 -0
inspect_ai/__init__.py
CHANGED
```diff
@@ -10,6 +10,7 @@ from inspect_ai._eval.score import score, score_async
 from inspect_ai._eval.task import Epochs, Task, TaskInfo, task_with
 from inspect_ai._eval.task.tasks import Tasks
 from inspect_ai._util.constants import PKG_NAME
+from inspect_ai._view.view import view
 from inspect_ai.agent._human.agent import human_cli
 from inspect_ai.solver._human_agent import human_agent
 
@@ -32,4 +33,5 @@ __all__ = [
     "TaskInfo",
     "task",
     "task_with",
+    "view",
 ]
```
inspect_ai/_cli/log.py
CHANGED
```diff
@@ -199,6 +199,6 @@ def view_resource(file: str) -> str:
 
 
 def view_type_resource(file: str) -> str:
-    resource = PKG_PATH / "_view" / "www" / "src" / "types" / file
+    resource = PKG_PATH / "_view" / "www" / "src" / "@types" / file
     with open(resource, "r", encoding="utf-8") as f:
         return f.read()
```
inspect_ai/_display/textual/widgets/transcript.py
CHANGED
```diff
@@ -84,6 +84,7 @@ class TranscriptView(ScrollableContainer):
             scroll_to_end = (
                 new_sample or abs(self.scroll_y - self.max_scroll_y) <= 20
             )
+
             async with self.batch():
                 await self.remove_children()
                 await self.mount_all(
@@ -100,9 +101,13 @@ class TranscriptView(ScrollableContainer):
         else:
             self._pending_sample = sample
 
-    def _widgets_for_events(
+    def _widgets_for_events(
+        self, events: Sequence[Event], limit: int = 10
+    ) -> list[Widget]:
         widgets: list[Widget] = []
-
+        widget_count = 0
+        # reverse the events so that the N most recents events are displayed
+        for event in events[::-1]:
             display = render_event(event)
             if display:
                 for d in display:
@@ -118,7 +123,14 @@ class TranscriptView(ScrollableContainer):
                         set_transcript_markdown_options(d.content)
                         widgets.append(Static(d.content, markup=False))
                         widgets.append(Static(Text(" ")))
-
+                        widget_count += 1
+
+            # only render the N most recent events
+            if widget_count >= limit:
+                break
+
+        # reverse the list since we added the events in reverse order
+        return widgets[::-1]
 
 
 class EventDisplay(NamedTuple):
```
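The effect of the new `limit` parameter: events are walked newest-first, rendering stops once `limit` widgets have been produced, and the list is then flipped back into chronological order. A standalone sketch of the same pattern (plain strings stand in for Textual widgets; names are illustrative):

```python
from typing import Sequence

def most_recent_rendered(events: Sequence[str], limit: int = 10) -> list[str]:
    rendered: list[str] = []
    # walk newest-first so we can stop as soon as `limit` items are rendered
    for event in events[::-1]:
        rendered.append(f"[{event}]")
        if len(rendered) >= limit:
            break
    # we appended in reverse order, so flip back to chronological order
    return rendered[::-1]

assert most_recent_rendered([str(i) for i in range(100)], limit=3) == ["[97]", "[98]", "[99]"]
```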
inspect_ai/_eval/run.py
CHANGED
```diff
@@ -298,10 +298,13 @@ async def run_multiple(tasks: list[TaskRunOptions], parallel: int) -> list[EvalL
 
     # setup pending tasks, queue, and results
     pending_tasks = tasks.copy()
-    results: list[EvalLog] = []
+    results: list[tuple[int, EvalLog]] = []
     tasks_completed = 0
     total_tasks = len(tasks)
 
+    # Create a mapping from task to its original index
+    task_to_original_index = {id(task): i for i, task in enumerate(tasks)}
+
     # produce/consume tasks
     send_channel, receive_channel = anyio.create_memory_object_stream[TaskRunOptions](
         parallel * 2
@@ -322,7 +325,7 @@ async def run_multiple(tasks: list[TaskRunOptions], parallel: int) -> list[EvalL
             # among those models, pick one with the least usage
             model = min(models_with_pending, key=lambda m: model_counts[m])
 
-            # now we know there
+            # now we know there's at least one pending task for this model so it's safe to pick it
             next_task = next(t for t in pending_tasks if str(t.model) == model)
             pending_tasks.remove(next_task)
             model_counts[str(next_task.model)] += 1
@@ -339,6 +342,8 @@ async def run_multiple(tasks: list[TaskRunOptions], parallel: int) -> list[EvalL
         nonlocal tasks_completed
         async for task_options in receive_channel:
             result: EvalLog | None = None
+            # Get the original index of this task
+            original_index = task_to_original_index[id(task_options)]
 
             # run the task
             try:
@@ -354,11 +359,13 @@ async def run_multiple(tasks: list[TaskRunOptions], parallel: int) -> list[EvalL
                 # see: https://docs.python.org/3/faq/programming.html#why-do-lambdas-defined-in-a-loop-with-different-values-all-return-the-same-result
                 def create_task_runner(
                     options: TaskRunOptions = task_options,
+                    idx: int = original_index,
                 ) -> Callable[[], Awaitable[None]]:
                     async def run_task() -> None:
                         nonlocal result
                         result = await task_run(options)
-
+                        # Store result with its original index
+                        results.append((idx, result))
 
                     return run_task
 
@@ -426,7 +433,8 @@ async def run_multiple(tasks: list[TaskRunOptions], parallel: int) -> list[EvalL
 
     clear_task_screen()
 
-    return
+    # Sort results by original index and return just the values
+    return [r for _, r in sorted(results)]
 
 
 def resolve_task_sample_ids(
```
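The fix here is ordering: workers append results as tasks finish, which with `parallel > 1` is not submission order. Tagging each task with its original index (keyed by `id()`) and sorting on return restores input order. A minimal sketch of the same pattern outside of anyio (the `work` coroutine is a placeholder):

```python
import asyncio

async def work(x: int) -> int:
    # later items finish first, so completion order != submission order
    await asyncio.sleep(0.01 * (4 - x))
    return x * 10

async def run_all(items: list[int]) -> list[int]:
    results: list[tuple[int, int]] = []

    async def runner(idx: int, item: int) -> None:
        # workers append (original_index, result) in completion order
        results.append((idx, await work(item)))

    await asyncio.gather(*(runner(i, item) for i, item in enumerate(items)))
    # sort by the original index, then drop it
    return [r for _, r in sorted(results)]

assert asyncio.run(run_all([0, 1, 2, 3])) == [0, 10, 20, 30]
```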
inspect_ai/_eval/task/log.py
CHANGED
inspect_ai/_eval/task/task.py
CHANGED
inspect_ai/_util/_async.py
CHANGED
```diff
@@ -136,7 +136,7 @@ def current_async_backend() -> Literal["asyncio", "trio"] | None:
 
 
 def configured_async_backend() -> Literal["asyncio", "trio"]:
-    backend = os.environ.get("INSPECT_ASYNC_BACKEND", "asyncio").lower()
+    backend = os.environ.get("INSPECT_ASYNC_BACKEND", "asyncio").lower() or "asyncio"
    return _validate_backend(backend)
 
 
```
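This small fix matters because `os.environ.get` applies its default only when the variable is entirely absent: `INSPECT_ASYNC_BACKEND=""` (set but empty) previously yielded an empty backend name. A quick demonstration:

```python
import os

os.environ["INSPECT_ASYNC_BACKEND"] = ""  # set, but empty

# the .get() default only kicks in when the variable is absent
assert os.environ.get("INSPECT_ASYNC_BACKEND", "asyncio").lower() == ""

# the `or` fallback also covers the empty-string case
assert (os.environ.get("INSPECT_ASYNC_BACKEND", "asyncio").lower() or "asyncio") == "asyncio"
```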
inspect_ai/_view/schema.py
CHANGED
inspect_ai/_view/view.py
CHANGED
```diff
@@ -30,6 +30,20 @@ def view(
     log_level: str | None = None,
     fs_options: dict[str, Any] = {},
 ) -> None:
+    """Run the Inspect View server.
+
+    Args:
+       log_dir: Directory to view logs from.
+       recursive: Recursively list files in `log_dir`.
+       host: Tcp/ip host (defaults to "127.0.0.1").
+       port: Tcp/ip port (defaults to 7575).
+       authorization: Validate requests by checking for this authorization header.
+       log_level: Level for logging to the console: "debug", "http", "sandbox",
+          "info", "warning", "error", or "critical" (defaults to "warning").
+       fs_options: Additional arguments to pass through to the filesystem provider
+          (e.g. `S3FileSystem`). Use `{"anon": True }` if you are accessing a
+          public S3 bucket with no credentials.
+    """
     init_dotenv()
     init_logger(log_level)
 
```
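With `view` now exported from the top-level package (see the `__init__.py` change above), the documented arguments can be used directly. A short usage sketch (`view()` blocks while the server runs, and the S3 bucket name is illustrative):

```python
from inspect_ai import view

# serve logs from a local directory on the default 127.0.0.1:7575
view(log_dir="./logs")

# or: view logs in a public S3 bucket, no credentials required
# view(log_dir="s3://my-eval-logs", fs_options={"anon": True})
```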
inspect_ai/_view/www/dist/assets/index.css
CHANGED
```diff
@@ -17166,41 +17166,41 @@ thead th {
   flex-direction: column;
   padding-top: 0.1em;
 }
-.
+._container_181fj_1 {
   display: grid;
   grid-template-columns:
-    minmax(
-
+    minmax(0, max-content) minmax(0, max-content) minmax(0, max-content)
+    5fr;
   column-gap: 0.75em;
 }
 
-.
+._container_181fj_1 ._cell_181fj_9 {
   margin-bottom: 0.5em;
 }
 
-.
+._fullWidth_181fj_13 {
   grid-column: 1 / -1;
 }
 
-.
+._heading_181fj_17 {
   font-weight: 600;
 }
 
-.
+._padded_181fj_21 {
   padding-bottom: 3em;
 }
 
-.
+._separator_181fj_25 {
   height: 1px;
   background-color: var(--bs-light-border-subtle);
 }
 
-.
+._separatorPadded_181fj_30 {
   margin-top: 0.5em;
   margin-bottom: 0.5em;
 }
 
-.
+._headerSep_181fj_35 {
   margin-top: 0.1em;
   margin-bottom: 0.2em;
 }
```
inspect_ai/_view/www/dist/assets/index.js
CHANGED
```diff
@@ -39446,7 +39446,7 @@ Please change the parent <Route path="${parentPath}"> to <Route path="${parentPa
       const rendered = entry2.value.trim();
       if (options2.renderString === "markdown") {
         return {
-          rendered
+          rendered: /* @__PURE__ */ jsxRuntimeExports.jsx(MarkdownDiv, { markdown: rendered })
         };
       } else {
         return {
@@ -51898,12 +51898,12 @@ self.onmessage = function (e) {
   );
   return scorerDescriptor == null ? void 0 : scorerDescriptor.render(scoreData.value);
 };
-const container$6 = "
-const cell$1 = "
-const fullWidth = "
-const separator$2 = "
-const separatorPadded = "
-const headerSep = "
+const container$6 = "_container_181fj_1";
+const cell$1 = "_cell_181fj_9";
+const fullWidth = "_fullWidth_181fj_13";
+const separator$2 = "_separator_181fj_25";
+const separatorPadded = "_separatorPadded_181fj_30";
+const headerSep = "_headerSep_181fj_35";
 const styles$x = {
   container: container$6,
   cell: cell$1,
@@ -52473,7 +52473,7 @@ self.onmessage = function (e) {
         {
           output: event.error.traceback_ansi,
           style: {
-            fontSize: "clamp(0.
+            fontSize: "clamp(0.3rem, 1.1vw, 0.8rem)",
             margin: "0.5em 0"
           }
         }
@@ -61749,7 +61749,7 @@ ${events}
           output: sample2.error.traceback_ansi,
           className: clsx("text-size-small", styles$A.ansi),
           style: {
-            fontSize: "clamp(0.
+            fontSize: "clamp(0.3rem, 1.1vw, 0.8rem)",
             margin: "0.5em 0"
           }
         }
@@ -61764,7 +61764,7 @@ ${events}
           output: retry.traceback_ansi,
           className: clsx("text-size-small", styles$A.ansi),
           style: {
-            fontSize: "clamp(0.
+            fontSize: "clamp(0.3rem, 1.1vw, 0.8rem)",
             margin: "0.5em 0"
           }
         }
```
|
@@ -1136,6 +1136,18 @@
|
|
1136
1136
|
"default": null,
|
1137
1137
|
"title": "Log Samples"
|
1138
1138
|
},
|
1139
|
+
"log_realtime": {
|
1140
|
+
"anyOf": [
|
1141
|
+
{
|
1142
|
+
"type": "boolean"
|
1143
|
+
},
|
1144
|
+
{
|
1145
|
+
"type": "null"
|
1146
|
+
}
|
1147
|
+
],
|
1148
|
+
"default": null,
|
1149
|
+
"title": "Log Realtime"
|
1150
|
+
},
|
1139
1151
|
"log_images": {
|
1140
1152
|
"anyOf": [
|
1141
1153
|
{
|
@@ -1205,6 +1217,7 @@
|
|
1205
1217
|
"max_sandboxes",
|
1206
1218
|
"sandbox_cleanup",
|
1207
1219
|
"log_samples",
|
1220
|
+
"log_realtime",
|
1208
1221
|
"log_images",
|
1209
1222
|
"log_buffer",
|
1210
1223
|
"log_shared",
|
@@ -1502,7 +1515,8 @@
|
|
1502
1515
|
"reasoning_tokens": null,
|
1503
1516
|
"reasoning_summary": null,
|
1504
1517
|
"reasoning_history": null,
|
1505
|
-
"response_schema": null
|
1518
|
+
"response_schema": null,
|
1519
|
+
"extra_body": null
|
1506
1520
|
}
|
1507
1521
|
}
|
1508
1522
|
},
|
@@ -1944,7 +1958,7 @@
|
|
1944
1958
|
"additionalProperties": false
|
1945
1959
|
},
|
1946
1960
|
"EvalSampleLimit": {
|
1947
|
-
"description": "Limit
|
1961
|
+
"description": "Limit encountered by sample.",
|
1948
1962
|
"properties": {
|
1949
1963
|
"type": {
|
1950
1964
|
"enum": [
|
@@ -2277,6 +2291,10 @@
|
|
2277
2291
|
"EvalSpec": {
|
2278
2292
|
"description": "Eval target and configuration.",
|
2279
2293
|
"properties": {
|
2294
|
+
"eval_id": {
|
2295
|
+
"title": "Eval Id",
|
2296
|
+
"type": "string"
|
2297
|
+
},
|
2280
2298
|
"run_id": {
|
2281
2299
|
"title": "Run Id",
|
2282
2300
|
"type": "string"
|
@@ -2294,9 +2312,16 @@
|
|
2294
2312
|
"type": "string"
|
2295
2313
|
},
|
2296
2314
|
"task_version": {
|
2315
|
+
"anyOf": [
|
2316
|
+
{
|
2317
|
+
"type": "integer"
|
2318
|
+
},
|
2319
|
+
{
|
2320
|
+
"type": "string"
|
2321
|
+
}
|
2322
|
+
],
|
2297
2323
|
"default": 0,
|
2298
|
-
"title": "Task Version"
|
2299
|
-
"type": "integer"
|
2324
|
+
"title": "Task Version"
|
2300
2325
|
},
|
2301
2326
|
"task_file": {
|
2302
2327
|
"anyOf": [
|
@@ -2500,6 +2525,7 @@
|
|
2500
2525
|
}
|
2501
2526
|
},
|
2502
2527
|
"required": [
|
2528
|
+
"eval_id",
|
2503
2529
|
"run_id",
|
2504
2530
|
"created",
|
2505
2531
|
"task",
|
@@ -2897,6 +2923,19 @@
|
|
2897
2923
|
}
|
2898
2924
|
],
|
2899
2925
|
"default": null
|
2926
|
+
},
|
2927
|
+
"extra_body": {
|
2928
|
+
"anyOf": [
|
2929
|
+
{
|
2930
|
+
"additionalProperties": true,
|
2931
|
+
"type": "object"
|
2932
|
+
},
|
2933
|
+
{
|
2934
|
+
"type": "null"
|
2935
|
+
}
|
2936
|
+
],
|
2937
|
+
"default": null,
|
2938
|
+
"title": "Extra Body"
|
2900
2939
|
}
|
2901
2940
|
},
|
2902
2941
|
"title": "GenerateConfig",
|
@@ -2927,7 +2966,8 @@
|
|
2927
2966
|
"reasoning_tokens",
|
2928
2967
|
"reasoning_summary",
|
2929
2968
|
"reasoning_history",
|
2930
|
-
"response_schema"
|
2969
|
+
"response_schema",
|
2970
|
+
"extra_body"
|
2931
2971
|
],
|
2932
2972
|
"additionalProperties": false
|
2933
2973
|
},
|
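The schema additions mirror a new nullable free-form `extra_body` field on the Python `GenerateConfig` model (also visible in the TypeScript typings below): per the schema it defaults to `null` and accepts arbitrary keys. A hedged sketch of setting it from Python (the payload key is illustrative, not part of inspect_ai):

```python
from inspect_ai.model import GenerateConfig

# provider-specific request-body parameters, passed through as-is
config = GenerateConfig(extra_body={"safe_prompt": True})
assert config.extra_body == {"safe_prompt": True}
```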
inspect_ai/_view/www/src/@types/log.d.ts
CHANGED
```diff
@@ -7,11 +7,12 @@
 
 export type Version = number;
 export type Status = "started" | "success" | "cancelled" | "error";
+export type EvalId = string;
 export type RunId = string;
 export type Created = string;
 export type Task = string;
 export type TaskId = string;
-export type TaskVersion = number;
+export type TaskVersion = number | string;
 export type TaskFile = string | null;
 export type TaskRegistryName = string | null;
 export type Solver = string | null;
@@ -68,6 +69,9 @@ export type Anyof = JSONSchema[] | null;
 export type Required = string[] | null;
 export type Description1 = string | null;
 export type Strict = boolean | null;
+export type ExtraBody = {
+  [k: string]: unknown;
+} | null;
 export type ModelBaseUrl = string | null;
 export type ModelRoles = {
   [k: string]: EvalModelConfig;
@@ -99,6 +103,7 @@ export type MaxSubprocesses = number | null;
 export type MaxSandboxes = number | null;
 export type SandboxCleanup = boolean | null;
 export type LogSamples = boolean | null;
+export type LogRealtime = boolean | null;
 export type LogImages = boolean | null;
 export type LogBuffer = number | null;
 export type LogShared = number | null;
@@ -640,6 +645,7 @@ export interface EvalLog {
  * Eval target and configuration.
  */
 export interface EvalSpec {
+  eval_id: EvalId;
   run_id: RunId;
   created: Created;
   task: Task;
@@ -722,6 +728,7 @@ export interface GenerateConfig {
   reasoning_summary: ReasoningSummary;
   reasoning_history: ReasoningHistory;
   response_schema: ResponseSchema | null;
+  extra_body: ExtraBody;
 }
 /**
  * Schema for model response when using Structured Output.
@@ -786,6 +793,7 @@ export interface EvalConfig {
   max_sandboxes: MaxSandboxes;
   sandbox_cleanup: SandboxCleanup;
   log_samples: LogSamples;
+  log_realtime: LogRealtime;
   log_images: LogImages;
   log_buffer: LogBuffer;
   log_shared: LogShared;
@@ -888,6 +896,7 @@ export interface GenerateConfig1 {
   reasoning_summary: ReasoningSummary;
   reasoning_history: ReasoningHistory;
   response_schema: ResponseSchema | null;
+  extra_body: ExtraBody;
 }
 /**
  * Scoring results from evaluation.
@@ -1525,7 +1534,7 @@ export interface Attachments {
   [k: string]: string;
 }
 /**
- * Limit
+ * Limit encountered by sample.
 */
 export interface EvalSampleLimit {
   type: Type16;
```
inspect_ai/_view/www/src/app/content/RenderedContent.tsx
CHANGED
```diff
@@ -9,6 +9,7 @@ import { MetaDataView } from "./MetaDataView";
 import clsx from "clsx";
 import { FC, Fragment, isValidElement, JSX, ReactNode } from "react";
 import JSONPanel from "../../components/JsonPanel";
+import { MarkdownDiv } from "../../components/MarkdownDiv";
 import { isJson } from "../../utils/json";
 import styles from "./RenderedContent.module.css";
 import { Buckets, ContentRenderer, RenderOptions } from "./types";
@@ -142,7 +143,7 @@ const contentRenderers: Record<string, ContentRenderer> = {
       const rendered = entry.value.trim();
       if (options.renderString === "markdown") {
         return {
-          rendered: rendered
+          rendered: <MarkdownDiv markdown={rendered} />,
         };
       } else {
         return {
```
inspect_ai/_view/www/src/app/samples/SampleDisplay.tsx
CHANGED
```diff
@@ -275,7 +275,7 @@ export const SampleDisplay: FC<SampleDisplayProps> = ({ id, scrollRef }) => {
           output={sample.error.traceback_ansi}
           className={clsx("text-size-small", styles.ansi)}
           style={{
-            fontSize: "clamp(0.
+            fontSize: "clamp(0.3rem, 1.1vw, 0.8rem)",
             margin: "0.5em 0",
           }}
         />
@@ -291,7 +291,7 @@ export const SampleDisplay: FC<SampleDisplayProps> = ({ id, scrollRef }) => {
           output={retry.traceback_ansi}
           className={clsx("text-size-small", styles.ansi)}
           style={{
-            fontSize: "clamp(0.
+            fontSize: "clamp(0.3rem, 1.1vw, 0.8rem)",
             margin: "0.5em 0",
           }}
         />
```
inspect_ai/_view/www/src/app/samples/transcript/ErrorEventView.tsx
CHANGED
```diff
@@ -32,7 +32,7 @@ export const ErrorEventView: FC<ErrorEventViewProps> = ({
       <ANSIDisplay
         output={event.error.traceback_ansi}
         style={{
-          fontSize: "clamp(0.
+          fontSize: "clamp(0.3rem, 1.1vw, 0.8rem)",
           margin: "0.5em 0",
         }}
       />
```
inspect_ai/agent/_run.py
CHANGED
```diff
@@ -1,20 +1,43 @@
 from copy import copy
-from typing import Any
+from typing import Any, overload
 
 from inspect_ai._util.registry import registry_unqualified_name
 from inspect_ai.model._chat_message import ChatMessage, ChatMessageUser
-from inspect_ai.util._limit import Limit, apply_limits
+from inspect_ai.util._limit import Limit, LimitExceededError, apply_limits
 from inspect_ai.util._span import span
 
 from ._agent import Agent, AgentState
 
 
+@overload
 async def run(
     agent: Agent,
     input: str | list[ChatMessage] | AgentState,
     limits: list[Limit] = [],
+    *,
+    name: str | None = None,
     **agent_kwargs: Any,
-) -> AgentState:
+) -> tuple[AgentState, LimitExceededError | None]: ...
+
+
+@overload
+async def run(
+    agent: Agent,
+    input: str | list[ChatMessage] | AgentState,
+    *,
+    name: str | None = None,
+    **agent_kwargs: Any,
+) -> AgentState: ...
+
+
+async def run(
+    agent: Agent,
+    input: str | list[ChatMessage] | AgentState,
+    limits: list[Limit] = [],
+    *,
+    name: str | None = None,
+    **agent_kwargs: Any,
+) -> AgentState | tuple[AgentState, LimitExceededError | None]:
     """Run an agent.
 
     The input messages(s) will be copied prior to running so are
@@ -26,10 +49,16 @@ async def run(
         limits: List of limits to apply to the agent. Should a limit be
             exceeded, a LimitExceededError is raised which the caller may
             handle as appropriate.
+        name: Optional display name for the transcript entry. If not provided, the
+            agent's name as defined in the registry will be used.
         **agent_kwargs: Additional arguments to pass to agent.
 
     Returns:
-        AgentState: Messages and generated output.
+        AgentState: Messages and generated output. This is all that is returned if no
+            limits are supplied.
+        LimitExceededError | None: If a non-empty limits list is supplied, a tuple is
+            returned. If a limit was exceeded, the second value in the tuple is the
+            exception instance. If no limit was exceeded, the second element is None.
     """
     # copy input so we don't mutate it in place
     input = copy(input)
@@ -52,9 +81,16 @@ async def run(
     # create state
     state = AgentState(messages=input_messages)
 
-    # run the agent with limits
-    with apply_limits(limits):
+    # run the agent with limits, catching errors which are a direct result of our limits
+    with apply_limits(limits, catch_errors=True) as limit_scope:
         # run the agent
-        agent_name = registry_unqualified_name(agent)
+        agent_name = name or registry_unqualified_name(agent)
         async with span(name=agent_name, type="agent"):
-
+            state = await agent(state, **agent_kwargs)
+        if limits:
+            return state, None
+        else:
+            return state
+
+    # execution reaches this point iff one of "our" limits was exceeded
+    return state, limit_scope.limit_error
```
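The overloads encode a change in calling convention: with no `limits` argument, `run()` still returns a bare `AgentState`; with a non-empty `limits` list it now returns a `(state, error)` tuple rather than letting the `LimitExceededError` propagate. A usage sketch based on the signatures above (`my_agent` is a placeholder; `message_limit` comes from `inspect_ai.util`):

```python
from inspect_ai.agent import Agent, AgentState, run
from inspect_ai.util import message_limit

async def demo(my_agent: Agent) -> AgentState:
    # no limits: returns AgentState, as before
    state = await run(my_agent, "Write me a haiku.")

    # with limits: returns (AgentState, LimitExceededError | None) instead of raising
    state, limit_error = await run(
        my_agent, "Write me a haiku.", limits=[message_limit(10)]
    )
    if limit_error is not None:
        print(f"agent stopped by a {limit_error.type} limit")
    return state
```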
inspect_ai/log/_bundle.py
CHANGED
```diff
@@ -146,7 +146,7 @@ def copy_log_files(
     log_fs = filesystem(log_dir, fs_options)
     if log_fs.exists(log_dir):
         eval_logs = log_files_from_ls(
-            log_fs.ls(log_dir, recursive=True), ["json", "eval"],
+            log_fs.ls(log_dir, recursive=True), ["json", "eval"], False
         )
         if len(eval_logs) == 0:
             raise PrerequisiteError(
@@ -201,8 +201,10 @@ def move_output(
             output_fs.mkdir(dir_path)
             tick()
 
-        # Copy the files
-        for working_file in
+        # Copy the files, preserving relative mtime ordering
+        for _, working_file in sorted(
+            (os.stat(os.path.join(root, f)).st_mtime, f) for f in files
+        ):
             target_path = (
                 os.path.join(relative_dir, working_file)
                 if relative_dir != "."
```
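The bundling change is a decorate-sort-undecorate over file modification times, so files land in the output in the same relative mtime order they had in the working directory. The same pattern in isolation (hypothetical `root` and `files`):

```python
import os

def in_mtime_order(root: str, files: list[str]) -> list[str]:
    # pair each file with its mtime, sort by that key, then drop the key
    keyed = ((os.stat(os.path.join(root, f)).st_mtime, f) for f in files)
    return [f for _, f in sorted(keyed)]
```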
inspect_ai/log/_log.py
CHANGED
```diff
@@ -158,7 +158,7 @@ class EvalConfig(BaseModel):
 
 
 class EvalSampleLimit(BaseModel):
-    """Limit
+    """Limit encountered by sample."""
 
     type: Literal[
         "context", "time", "working", "message", "token", "operator", "custom"
@@ -694,7 +694,7 @@ class EvalSpec(BaseModel):
     task_id: str = Field(default_factory=str)
     """Unique task id."""
 
-    task_version: int = Field(default=0)
+    task_version: int | str = Field(default=0)
     """Task version."""
 
     task_file: str | None = Field(default=None)
```