inspect-ai 0.3.51__py3-none-any.whl → 0.3.53__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/_cli/eval.py +44 -2
- inspect_ai/_display/core/config.py +4 -0
- inspect_ai/_display/core/panel.py +1 -1
- inspect_ai/_display/core/progress.py +9 -3
- inspect_ai/_display/core/results.py +8 -4
- inspect_ai/_display/textual/widgets/task_detail.py +45 -13
- inspect_ai/_display/textual/widgets/tasks.py +86 -5
- inspect_ai/_display/textual/widgets/transcript.py +4 -17
- inspect_ai/_eval/eval.py +29 -1
- inspect_ai/_eval/evalset.py +7 -0
- inspect_ai/_eval/registry.py +2 -2
- inspect_ai/_eval/task/log.py +6 -1
- inspect_ai/_eval/task/results.py +22 -4
- inspect_ai/_eval/task/run.py +18 -12
- inspect_ai/_eval/task/sandbox.py +72 -43
- inspect_ai/_eval/task/task.py +4 -0
- inspect_ai/_eval/task/util.py +17 -6
- inspect_ai/_util/logger.py +10 -2
- inspect_ai/_util/samples.py +7 -0
- inspect_ai/_util/transcript.py +8 -0
- inspect_ai/_view/www/App.css +13 -0
- inspect_ai/_view/www/dist/assets/index.css +13 -0
- inspect_ai/_view/www/dist/assets/index.js +105 -55
- inspect_ai/_view/www/src/App.mjs +31 -6
- inspect_ai/_view/www/src/Types.mjs +6 -0
- inspect_ai/_view/www/src/components/JsonPanel.mjs +11 -17
- inspect_ai/_view/www/src/components/MessageContent.mjs +9 -2
- inspect_ai/_view/www/src/components/Tools.mjs +46 -18
- inspect_ai/_view/www/src/navbar/Navbar.mjs +12 -0
- inspect_ai/_view/www/src/samples/SampleDisplay.mjs +18 -5
- inspect_ai/_view/www/src/samples/SampleList.mjs +2 -2
- inspect_ai/_view/www/src/samples/transcript/ToolEventView.mjs +2 -2
- inspect_ai/log/_log.py +6 -0
- inspect_ai/log/_recorders/eval.py +8 -7
- inspect_ai/model/_call_tools.py +2 -6
- inspect_ai/model/_generate_config.py +6 -0
- inspect_ai/model/_model.py +18 -4
- inspect_ai/model/_providers/azureai.py +22 -2
- inspect_ai/model/_providers/bedrock.py +17 -1
- inspect_ai/model/_providers/hf.py +1 -1
- inspect_ai/model/_providers/openai.py +32 -8
- inspect_ai/model/_providers/providers.py +1 -1
- inspect_ai/model/_providers/vllm.py +1 -1
- inspect_ai/model/_render.py +7 -6
- inspect_ai/model/_trace.py +1 -1
- inspect_ai/solver/_basic_agent.py +8 -1
- inspect_ai/tool/_tool_transcript.py +28 -0
- inspect_ai/util/_sandbox/context.py +1 -2
- inspect_ai/util/_sandbox/docker/config.py +8 -10
- inspect_ai/util/_sandbox/docker/docker.py +9 -5
- inspect_ai/util/_sandbox/docker/util.py +3 -3
- inspect_ai/util/_sandbox/environment.py +7 -2
- inspect_ai/util/_sandbox/limits.py +1 -1
- inspect_ai/util/_sandbox/local.py +8 -9
- {inspect_ai-0.3.51.dist-info → inspect_ai-0.3.53.dist-info}/METADATA +2 -4
- {inspect_ai-0.3.51.dist-info → inspect_ai-0.3.53.dist-info}/RECORD +60 -59
- {inspect_ai-0.3.51.dist-info → inspect_ai-0.3.53.dist-info}/LICENSE +0 -0
- {inspect_ai-0.3.51.dist-info → inspect_ai-0.3.53.dist-info}/WHEEL +0 -0
- {inspect_ai-0.3.51.dist-info → inspect_ai-0.3.53.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.51.dist-info → inspect_ai-0.3.53.dist-info}/top_level.txt +0 -0
inspect_ai/_cli/eval.py
CHANGED
@@ -12,7 +12,7 @@ from inspect_ai._util.constants import (
     DEFAULT_MAX_RETRIES,
 )
 from inspect_ai._util.file import filesystem
-from inspect_ai._util.samples import parse_samples_limit
+from inspect_ai._util.samples import parse_sample_id, parse_samples_limit
 from inspect_ai.log._file import log_file_info
 from inspect_ai.model import GenerateConfigArgs
 from inspect_ai.scorer._reducer import create_reducers
@@ -30,6 +30,7 @@ MAX_TASKS_HELP = "Maximum number of tasks to run in parallel (default is 1)"
 MAX_SUBPROCESSES_HELP = (
     "Maximum number of subprocesses to run in parallel (default is os.cpu_count())"
 )
+MAX_SANDBOXES_HELP = "Maximum number of sandboxes (per-provider) to run in parallel."
 NO_SANDBOX_CLEANUP_HELP = "Do not cleanup sandbox environments after task completes"
 FAIL_ON_ERROR_HELP = "Threshold of sample errors to tolerage (by default, evals fail when any error occurs). Value between 0 to 1 to set a proportion; value greater than 1 to set a count."
 NO_LOG_SAMPLES_HELP = "Do not include samples in the log file."
@@ -144,6 +145,12 @@ def eval_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
     help="Limit samples to evaluate e.g. 10 or 10-20",
     envvar="INSPECT_EVAL_LIMIT",
 )
+@click.option(
+    "--sample-id",
+    type=str,
+    help="Evaluate specific sample(s) (comma separated list of ids)",
+    envvar="INSPECT_EVAL_SAMPLE_ID",
+)
 @click.option(
     "--epochs",
     type=int,
@@ -186,6 +193,12 @@ def eval_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
     help=MAX_SUBPROCESSES_HELP,
     envvar="INSPECT_EVAL_MAX_SUBPROCESSES",
 )
+@click.option(
+    "--max-sandboxes",
+    type=int,
+    help=MAX_SANDBOXES_HELP,
+    envvar="INSPECT_EVAL_MAX_SANDBOXES",
+)
 @click.option(
     "--message-limit",
     type=int,
@@ -355,6 +368,12 @@ def eval_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
     help='Cache prompt prefix (Anthropic only). Defaults to "auto", which will enable caching for requests with tools.',
     envvar="INSPECT_EVAL_CACHE_PROMPT",
 )
+@click.option(
+    "--reasoning-effort",
+    type=click.Choice(["low", "medium", "high"]),
+    help="Constrains effort on reasoning for reasoning models. Open AI o1 models only.",
+    envvar="INSPECT_EVAL_REASONING_EFFORT",
+)
 @click.option(
     "--log-format",
     type=click.Choice(["eval", "json"], case_sensitive=False),
@@ -391,6 +410,7 @@ def eval_command(
     epochs: int | None,
     epochs_reducer: str | None,
     limit: str | None,
+    sample_id: str | None,
     max_retries: int | None,
     timeout: int | None,
     max_connections: int | None,
@@ -412,12 +432,14 @@
     parallel_tool_calls: bool | None,
     max_tool_output: int | None,
     cache_prompt: str | None,
+    reasoning_effort: str | None,
     message_limit: int | None,
     token_limit: int | None,
     time_limit: int | None,
     max_samples: int | None,
     max_tasks: int | None,
     max_subprocesses: int | None,
+    max_sandboxes: int | None,
     fail_on_error: bool | float | None,
     no_fail_on_error: bool | None,
     no_log_samples: bool | None,
@@ -458,12 +480,14 @@
         epochs=epochs,
         epochs_reducer=epochs_reducer,
         limit=limit,
+        sample_id=sample_id,
         message_limit=message_limit,
         token_limit=token_limit,
         time_limit=time_limit,
         max_samples=max_samples,
         max_tasks=max_tasks,
         max_subprocesses=max_subprocesses,
+        max_sandboxes=max_sandboxes,
         fail_on_error=fail_on_error,
         no_fail_on_error=no_fail_on_error,
         debug_errors=common["debug_errors"],
@@ -543,6 +567,7 @@ def eval_set_command(
     epochs: int | None,
     epochs_reducer: str | None,
     limit: str | None,
+    sample_id: str | None,
     max_retries: int | None,
     timeout: int | None,
     max_connections: int | None,
@@ -564,12 +589,14 @@
     parallel_tool_calls: bool | None,
     max_tool_output: int | None,
     cache_prompt: str | None,
+    reasoning_effort: str | None,
     message_limit: int | None,
     token_limit: int | None,
     time_limit: int | None,
     max_samples: int | None,
     max_tasks: int | None,
     max_subprocesses: int | None,
+    max_sandboxes: int | None,
     fail_on_error: bool | float | None,
     no_fail_on_error: bool | None,
     no_log_samples: bool | None,
@@ -612,12 +639,14 @@
         epochs=epochs,
         epochs_reducer=epochs_reducer,
         limit=limit,
+        sample_id=sample_id,
         message_limit=message_limit,
         token_limit=token_limit,
         time_limit=time_limit,
         max_samples=max_samples,
         max_tasks=max_tasks,
         max_subprocesses=max_subprocesses,
+        max_sandboxes=max_sandboxes,
         fail_on_error=fail_on_error,
         no_fail_on_error=no_fail_on_error,
         debug_errors=common["debug_errors"],
@@ -662,12 +691,14 @@ def eval_exec(
     epochs: int | None,
     epochs_reducer: str | None,
     limit: str | None,
+    sample_id: str | None,
    message_limit: int | None,
     token_limit: int | None,
     time_limit: int | None,
     max_samples: int | None,
     max_tasks: int | None,
     max_subprocesses: int | None,
+    max_sandboxes: int | None,
     fail_on_error: bool | float | None,
     no_fail_on_error: bool | None,
     debug_errors: bool | None,
@@ -699,8 +730,9 @@
         else None
     )

-    # resolve range
+    # resolve range and sample id
     eval_limit = parse_samples_limit(limit)
+    eval_sample_id = parse_sample_id(sample_id)

     # resolve fail_on_error
     if no_fail_on_error is True:
@@ -734,6 +766,7 @@
         log_dir=log_dir,
         log_format=log_format,
         limit=eval_limit,
+        sample_id=eval_sample_id,
         epochs=eval_epochs,
         fail_on_error=fail_on_error,
         debug_errors=debug_errors,
@@ -743,6 +776,7 @@
         max_samples=max_samples,
         max_tasks=max_tasks,
         max_subprocesses=max_subprocesses,
+        max_sandboxes=max_sandboxes,
         log_samples=log_samples,
         log_images=log_images,
         log_buffer=log_buffer,
@@ -821,6 +855,12 @@ def parse_comma_separated(value: str | None) -> list[str] | None:
     help=MAX_SUBPROCESSES_HELP,
     envvar="INSPECT_EVAL_MAX_SUBPROCESSES",
 )
+@click.option(
+    "--max-sandboxes",
+    type=int,
+    help=MAX_SANDBOXES_HELP,
+    envvar="INSPECT_EVAL_MAX_SANDBOXES",
+)
 @click.option(
     "--no-sandbox-cleanup",
     type=bool,
@@ -891,6 +931,7 @@ def eval_retry_command(
     max_samples: int | None,
     max_tasks: int | None,
     max_subprocesses: int | None,
+    max_sandboxes: int | None,
     no_sandbox_cleanup: bool | None,
     trace: bool | None,
     fail_on_error: bool | float | None,
@@ -934,6 +975,7 @@
         max_samples=max_samples,
         max_tasks=max_tasks,
         max_subprocesses=max_subprocesses,
+        max_sandboxes=max_sandboxes,
         sandbox_cleanup=sandbox_cleanup,
         trace=trace,
         fail_on_error=fail_on_error,
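The new `--sample-id`, `--max-sandboxes`, and `--reasoning-effort` options are threaded through `eval_command`, `eval_set_command`, and `eval_exec` above. A minimal sketch of how the first two might be exercised from the Python API, assuming `inspect_ai.eval()` in 0.3.53 accepts the same `sample_id` and `max_sandboxes` keywords that `eval_exec` forwards here; the task file and sample ids are hypothetical placeholders:

```python
# Sketch only: exercising the new options from Python rather than the CLI.
# Assumes inspect_ai.eval() accepts sample_id and max_sandboxes keyword
# arguments (as the eval_exec() call above suggests); "my_task.py" and the
# sample ids below are hypothetical placeholders.
from inspect_ai import eval

logs = eval(
    "my_task.py",
    sample_id=["sample-1", "sample-2"],  # evaluate only these samples
    max_sandboxes=10,                    # cap concurrent sandboxes per provider
)
```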
inspect_ai/_display/core/config.py
CHANGED
@@ -24,6 +24,10 @@ def task_config(
             config_print.append(
                 f"{name}: {','.join([approver['name'] for approver in value['approvers']])}"
             )
+        elif name == "sample_id":
+            value = value if isinstance(value, list) else [value]
+            value = [str(v) for v in value]
+            config_print.append(f"{name}: {','.join(value)}")
         elif name not in ["limit", "model"]:
             config_print.append(f"{name}: {value}")
     values = ", ".join(config_print)
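The new branch normalizes `sample_id` to a list of strings before joining, so a single id and a list of ids render the same way in the task config header. A standalone sketch of that behavior (the helper name is local to this example):

```python
# Standalone illustration of the sample_id formatting added above.
def format_config_entry(name: str, value: object) -> str:
    if name == "sample_id":
        values = value if isinstance(value, list) else [value]
        return f"{name}: {','.join(str(v) for v in values)}"
    return f"{name}: {value}"

assert format_config_entry("sample_id", 7) == "sample_id: 7"
assert format_config_entry("sample_id", ["a", "b"]) == "sample_id: a,b"
```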
inspect_ai/_display/core/panel.py
CHANGED
@@ -112,7 +112,7 @@ def tasks_title(completed: int, total: int) -> str:
 def task_title(profile: TaskProfile, show_model: bool) -> str:
     eval_epochs = profile.eval_config.epochs or 1
     epochs = f" x {profile.eval_config.epochs}" if eval_epochs > 1 else ""
-    samples = f"{profile.samples//eval_epochs:,}{epochs} sample{'s' if profile.samples
+    samples = f"{profile.samples//eval_epochs:,}{epochs} sample{'s' if profile.samples != 1 else ''}"
     title = f"{registry_unqualified_name(profile.name)} ({samples})"
     if show_model:
         title = f"{title}: {profile.model}"
inspect_ai/_display/core/progress.py
CHANGED
@@ -130,9 +130,15 @@ def progress_time(time: float) -> str:
     return f"{hours:2.0f}:{minutes:02.0f}:{seconds:02.0f}"


-def progress_count(complete: int, total: int) -> str:
-    # Pad the display to keep it stable
+def progress_count(complete: int, total: int, width: int | None = None) -> str:
+    # Pad the display to keep it stable as the
+    # complete metrics
     total_str = f"{total:,}"
     complete_str = f"{complete:,}"
     padding = max(0, len(total_str) - len(complete_str))
-
+    padded = " " * padding + f"[{complete_str}/{total_str}]"
+
+    # If a width has ben specified, pad up to this width as well
+    if width is not None:
+        padded = padded.rjust(width)
+    return padded
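The new `width` parameter lets callers right-align the `[complete/total]` counter against a shared column width. Reproducing the changed function locally (for illustration only) shows the effect:

```python
# Local copy of progress_count() as changed above, for illustration only.
def progress_count(complete: int, total: int, width: int | None = None) -> str:
    total_str = f"{total:,}"
    complete_str = f"{complete:,}"
    padding = max(0, len(total_str) - len(complete_str))
    padded = " " * padding + f"[{complete_str}/{total_str}]"
    if width is not None:
        padded = padded.rjust(width)
    return padded

print(progress_count(7, 1000))      # '    [7/1,000]'
print(progress_count(7, 1000, 15))  # same string right-aligned to 15 characters
```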
inspect_ai/_display/core/results.py
CHANGED
@@ -166,7 +166,7 @@ def task_interrupted(profile: TaskProfile, samples_completed: int) -> Renderable
     return message


-def task_metric(metrics: list[TaskDisplayMetric]) -> str:
+def task_metric(metrics: list[TaskDisplayMetric], width: int | None = None) -> str:
     reducer_names: Set[str] = {
         metric.reducer for metric in metrics if metric.reducer is not None
     }
@@ -180,10 +180,14 @@ def task_metric(metrics: list[TaskDisplayMetric]) -> str:
     else:
         value = f"{metric.value:.2f}"

-    if show_reducer:
-
+    if show_reducer and metric.reducer is not None:
+        metric_str = f"{metric.name}/{metric.reducer}: {value}"
     else:
-
+        metric_str = f"{metric.name}: {value}"
+
+    if width is not None:
+        metric_str = metric_str.rjust(width)
+    return metric_str


 def task_metrics(scores: list[EvalScore]) -> str:
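When more than one reducer is in play, `task_metric` now qualifies the label with the reducer name and can right-align it to a shared width. A small stand-in showing the label shapes (the dataclass below is only a stand-in for `TaskDisplayMetric`):

```python
# Illustration of the label formats produced by task_metric() above.
from dataclasses import dataclass


@dataclass
class Metric:  # stand-in for TaskDisplayMetric (name, value, reducer)
    name: str
    value: float
    reducer: str | None


def metric_label(m: Metric, show_reducer: bool, width: int | None = None) -> str:
    value = f"{m.value:.2f}"
    if show_reducer and m.reducer is not None:
        label = f"{m.name}/{m.reducer}: {value}"
    else:
        label = f"{m.name}: {value}"
    return label.rjust(width) if width is not None else label


print(metric_label(Metric("accuracy", 0.8542, "mean"), show_reducer=True))
# accuracy/mean: 0.85
print(metric_label(Metric("accuracy", 0.8542, None), show_reducer=False, width=25))
# '           accuracy: 0.85'
```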
inspect_ai/_display/textual/widgets/task_detail.py
CHANGED
@@ -63,6 +63,9 @@ class TaskDetail(Widget):
     def update_metrics(self, metrics: list[TaskDisplayMetric]) -> None:
         # Group by reducer then scorer within reducers
         self.metrics = metrics
+
+        # clear the existing computed reducers
+        self.by_reducer = {}
         for metric in metrics:
             reducer_group = (
                 self.by_reducer[metric.reducer]
@@ -117,6 +120,7 @@ class TaskDetail(Widget):
         for remove in to_remove:
             task_metric = self.existing_metrics[remove]
             task_metric.remove()
+            del self.existing_metrics[remove]

         # add or update widgets with metrics
         for reducer, scorers in self.by_reducer.items():
@@ -187,24 +191,52 @@ class TaskMetrics(Widget):
         self.grid: Grid = Grid()
         self.value_widgets: dict[str, Static] = {}

+    def grid_id(self) -> str:
+        return f"{self.id}-grid"
+
     def compose(self) -> ComposeResult:
-        #
+        # Yield the title and base grid
         yield Center(self._title())
-
-        for metric in self.metrics:
-            # Add the value static but keep it around
-            # for future updates
-            self.value_widgets[metric.name] = Static(
-                self._metric_value(metric.value)
-            )
-
-            yield Static(metric.name)
-            yield self.value_widgets[metric.name]
+        yield Grid(id=self.grid_id())

     def update(self, metrics: list[TaskMetric]) -> None:
+        self.metrics = metrics
+
+        # We assume that generally the initial metric names will
+        # always match future updates (so we can just update values in line)
+        # but if an unrecognized metric appears on the scene, just
+        # recompute the whole grid
+        need_recompute = False
         for metric in metrics:
-            widget = self.value_widgets
-            widget
+            widget = self.value_widgets.get(metric.name)
+            if widget:
+                # Just update the values themselves
+                widget.update(content=f"{metric.value:,.3f}")
+            else:
+                # Don't have a widget for this, recompute the whole grid
+                need_recompute = True
+                break
+
+        if need_recompute:
+            self.recompute_grid()
+
+    def on_mount(self) -> None:
+        self.recompute_grid()
+
+    def recompute_grid(self) -> None:
+        if not self.is_mounted:
+            return
+
+        grid = self.query_one(f"#{self.grid_id()}")
+
+        grid.remove_children()
+        for metric in self.metrics:
+            # Add the value static but keep it around
+            # for future updates
+            self.value_widgets[metric.name] = Static(self._metric_value(metric.value))
+
+            grid.mount(Static(metric.name))
+            grid.mount(self.value_widgets[metric.name])

     def _title(self) -> Widget:
         if self.scorer is None:
inspect_ai/_display/textual/widgets/tasks.py
CHANGED
@@ -36,6 +36,9 @@ from ...core.progress import (
     progress_model_name,
 )

+MAX_METRIC_WIDTH = 25
+MAX_COUNT_WIDTH = 15
+

 class TasksView(Container):
     DEFAULT_CSS = """
@@ -68,6 +71,7 @@ class TasksView(Container):
         super().__init__()
         self.description_width = MAX_DESCRIPTION_WIDTH
         self.model_name_width = MAX_MODEL_NAME_WIDTH
+        self.sample_count_width = 0

     def init_tasks(self, tasks: list[TaskSpec]) -> None:
         # clear existing tasks
@@ -80,15 +84,41 @@ class TasksView(Container):
         self.model_name_width = min(
             max([len(str(task.model)) for task in tasks]), MAX_MODEL_NAME_WIDTH
         )
+        self.update_progress_widths()

     def add_task(self, task: TaskWithResult) -> TaskDisplay:
+        self.update_count_width(task.profile.samples)
         task_display = TaskProgressView(
-            task, self.description_width, self.model_name_width
+            task, self.description_width, self.model_name_width, self.sample_count_width
         )
         self.tasks.mount(task_display)
         self.tasks.scroll_to_widget(task_display)
+        self.update_progress_widths()
+
         return task_display

+    def update_count_width(self, samples: int) -> None:
+        sample_count_str = progress_count(samples, samples, self.sample_count_width)
+        self.sample_count_width = min(
+            max(self.sample_count_width, len(sample_count_str)), MAX_COUNT_WIDTH
+        )
+
+    def update_progress_widths(self) -> None:
+        progress_views = self.tasks.query_children(TaskProgressView)
+        metrics_size = 0
+        for progress_view in progress_views:
+            metrics_size = max(
+                metrics_size,
+                progress_view.metrics_width
+                if progress_view.metrics_width is not None
+                else 0,
+            )
+        metrics_size = min(metrics_size, MAX_METRIC_WIDTH)
+
+        for progress_view in progress_views:
+            progress_view.update_metrics_width(metrics_size)
+            progress_view.update_count_width(self.sample_count_width)
+
     def compose(self) -> ComposeResult:
         yield Static(id="tasks-config")
         yield Static(id="tasks-targets")
@@ -139,13 +169,18 @@ class TaskProgressView(Widget):
     """

     def __init__(
-        self,
+        self,
+        task: TaskWithResult,
+        description_width: int,
+        model_name_width: int,
+        sample_count_width: int,
     ) -> None:
         super().__init__()
         self.t = task

         self.description_width = description_width
         self.model_name_width = model_name_width
+
         self.progress_bar = ProgressBar(total=task.profile.steps, show_eta=False)
         self.count_display = Static()
         self.metrics_display = Static(id="task-metrics")
@@ -154,6 +189,14 @@ class TaskProgressView(Widget):
         self.toggle = Toggle()
         self.task_detail = TaskDetail(id="task-detail", classes="hidden")

+        self.sample_count_width: int = sample_count_width
+
+    metrics: reactive[list[TaskDisplayMetric] | None] = reactive(None)
+    metrics_width: reactive[int | None] = reactive(None)
+    sample_count_width: reactive[int] = reactive(0)
+    samples_complete: reactive[int] = reactive(0)
+    samples_total: reactive[int] = reactive(0)
+
     def compose(self) -> ComposeResult:
         yield self.toggle
         yield TaskStatusIcon()
@@ -191,13 +234,51 @@ class TaskProgressView(Widget):
         self.task_progress.complete()

     def sample_complete(self, complete: int, total: int) -> None:
-        self.
+        self.samples_complete = complete
+        self.samples_total = total

     def update_metrics(self, metrics: list[TaskDisplayMetric]) -> None:
-
-
+        self.metrics = metrics
+
+    def update_metrics_width(self, width: int) -> None:
+        self.metrics_width = width
+
+    def update_count_width(self, width: int) -> None:
+        self.sample_count_width = width
+
+    def _watch_sample_count_width(self, width: int) -> None:
+        self.refresh_count()
+
+    def _watch_samples_complete(self, complete: int) -> None:
+        self.refresh_count()
+
+    def _watch_samples_total(self, total: int) -> None:
+        self.refresh_count()
+
+    def _watch_metrics_width(self, width: int) -> None:
+        self.update_metrics_label()
+
+    def _watch_metrics(self, metrics: list[TaskDisplayMetric] | None) -> None:
+        if metrics is not None and len(metrics) > 0:
+            # update label
+            self.update_metrics_label()
+
+            # update details
             self.task_detail.update_metrics(metrics)

+    def refresh_count(self) -> None:
+        progress_label = progress_count(
+            self.samples_complete, self.samples_total, self.sample_count_width
+        )
+        self.count_display.update(progress_label)
+
+    def update_metrics_label(self) -> None:
+        # compute the label (with a min size)
+        if self.metrics is not None:
+            metric_label = task_metric(self.metrics, self.metrics_width)
+            self.metrics_width = len(metric_label)
+            self.metrics_display.update(metric_label)
+

 class TaskStatusIcon(Static):
     result: reactive[TaskResult | None] = reactive(None)
inspect_ai/_display/textual/widgets/transcript.py
CHANGED
@@ -10,10 +10,10 @@ from textual.widget import Widget
 from textual.widgets import Static

 from inspect_ai._util.content import ContentText
-from inspect_ai._util.format import format_function_call
 from inspect_ai._util.rich import lines_display
 from inspect_ai._util.transcript import (
     set_transcript_markdown_options,
+    transcript_function,
     transcript_markdown,
     transcript_separator,
 )
@@ -36,6 +36,7 @@ from inspect_ai.log._transcript import (
 from inspect_ai.model._chat_message import ChatMessage, ChatMessageUser
 from inspect_ai.model._render import messages_preceding_assistant
 from inspect_ai.tool._tool import ToolResult
+from inspect_ai.tool._tool_transcript import transcript_tool_call


 class TranscriptView(ScrollableContainer):
@@ -195,16 +196,7 @@ def render_tool_event(event: ToolEvent) -> list[EventDisplay]:
         display.extend(render_event(e) or [])

     # render the call
-    content
-    if event.view:
-        if event.view.title:
-            content.append(Text.from_markup(f"[bold]{event.view.title}[/bold]\n"))
-        if event.view.format == "markdown":
-            content.append(transcript_markdown(event.view.content))
-        else:
-            content.append(event.view.content)
-    else:
-        content.append(render_function_call(event.function, event.arguments))
+    content = transcript_tool_call(event)

     # render the output
     if isinstance(event.result, list):
@@ -266,7 +258,7 @@ def render_subtask_event(event: SubtaskEvent) -> list[EventDisplay]:
     for e in event.events:
         display.extend(render_event(e) or [])

-    content: list[RenderableType] = [
+    content: list[RenderableType] = [transcript_function(event.name, event.input)]
     if event.result:
         content.append(Text())
         if isinstance(event.result, str | int | float | bool | None):
@@ -309,11 +301,6 @@ def render_error_event(event: ErrorEvent) -> EventDisplay:
     return EventDisplay("error", event.error.traceback.strip())


-def render_function_call(function: str, arguments: dict[str, Any]) -> RenderableType:
-    call = format_function_call(function, arguments)
-    return transcript_markdown("```python\n" + call + "\n```\n")
-
-
 def render_as_json(json: Any) -> RenderableType:
     return transcript_markdown(
         "```json\n"