inspect-ai 0.3.52__py3-none-any.whl → 0.3.54__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to a public registry. It is provided for informational purposes only and reflects the package versions exactly as they appear in that registry.
- inspect_ai/_cli/eval.py +55 -1
- inspect_ai/_cli/main.py +2 -0
- inspect_ai/_cli/trace.py +244 -0
- inspect_ai/_display/core/progress.py +9 -3
- inspect_ai/_display/core/results.py +8 -4
- inspect_ai/_display/textual/app.py +5 -1
- inspect_ai/_display/textual/widgets/task_detail.py +3 -0
- inspect_ai/_display/textual/widgets/tasks.py +97 -6
- inspect_ai/_eval/eval.py +33 -0
- inspect_ai/_eval/evalset.py +4 -0
- inspect_ai/_eval/registry.py +2 -2
- inspect_ai/_eval/task/images.py +4 -14
- inspect_ai/_eval/task/results.py +22 -4
- inspect_ai/_eval/task/run.py +40 -20
- inspect_ai/_eval/task/sandbox.py +72 -43
- inspect_ai/_eval/task/task.py +4 -0
- inspect_ai/_eval/task/util.py +2 -0
- inspect_ai/_util/constants.py +3 -3
- inspect_ai/_util/display.py +1 -0
- inspect_ai/_util/logger.py +34 -8
- inspect_ai/_util/trace.py +275 -0
- inspect_ai/_view/www/App.css +13 -0
- inspect_ai/_view/www/dist/assets/index.css +13 -0
- inspect_ai/_view/www/dist/assets/index.js +80 -43
- inspect_ai/_view/www/src/App.mjs +31 -6
- inspect_ai/_view/www/src/Types.mjs +6 -0
- inspect_ai/_view/www/src/components/JsonPanel.mjs +11 -17
- inspect_ai/_view/www/src/components/MessageContent.mjs +9 -2
- inspect_ai/_view/www/src/components/Tools.mjs +46 -18
- inspect_ai/_view/www/src/navbar/Navbar.mjs +12 -0
- inspect_ai/_view/www/src/samples/SampleList.mjs +2 -2
- inspect_ai/_view/www/src/samples/transcript/ToolEventView.mjs +2 -2
- inspect_ai/log/_log.py +6 -0
- inspect_ai/log/_message.py +2 -2
- inspect_ai/log/_recorders/eval.py +8 -18
- inspect_ai/log/_recorders/json.py +19 -17
- inspect_ai/model/_cache.py +22 -16
- inspect_ai/model/_call_tools.py +9 -1
- inspect_ai/model/_generate_config.py +8 -2
- inspect_ai/model/_model.py +11 -12
- inspect_ai/model/_providers/azureai.py +1 -1
- inspect_ai/model/_providers/bedrock.py +18 -2
- inspect_ai/model/_providers/hf.py +1 -1
- inspect_ai/model/_providers/openai.py +32 -8
- inspect_ai/model/_providers/providers.py +1 -1
- inspect_ai/model/_providers/vllm.py +1 -1
- inspect_ai/tool/_tools/_web_browser/_web_browser.py +1 -1
- inspect_ai/util/_sandbox/context.py +7 -3
- inspect_ai/util/_sandbox/docker/compose.py +58 -19
- inspect_ai/util/_sandbox/docker/config.py +8 -10
- inspect_ai/util/_sandbox/docker/docker.py +20 -16
- inspect_ai/util/_sandbox/docker/util.py +3 -9
- inspect_ai/util/_sandbox/environment.py +7 -2
- inspect_ai/util/_sandbox/limits.py +1 -1
- inspect_ai/util/_sandbox/local.py +8 -9
- inspect_ai/util/_sandbox/service.py +17 -7
- inspect_ai/util/_subprocess.py +6 -1
- inspect_ai/util/_subtask.py +8 -2
- {inspect_ai-0.3.52.dist-info → inspect_ai-0.3.54.dist-info}/METADATA +6 -8
- {inspect_ai-0.3.52.dist-info → inspect_ai-0.3.54.dist-info}/RECORD +64 -62
- {inspect_ai-0.3.52.dist-info → inspect_ai-0.3.54.dist-info}/LICENSE +0 -0
- {inspect_ai-0.3.52.dist-info → inspect_ai-0.3.54.dist-info}/WHEEL +0 -0
- {inspect_ai-0.3.52.dist-info → inspect_ai-0.3.54.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.52.dist-info → inspect_ai-0.3.54.dist-info}/top_level.txt +0 -0
inspect_ai/_cli/eval.py
CHANGED
@@ -30,6 +30,7 @@ MAX_TASKS_HELP = "Maximum number of tasks to run in parallel (default is 1)"
 MAX_SUBPROCESSES_HELP = (
     "Maximum number of subprocesses to run in parallel (default is os.cpu_count())"
 )
+MAX_SANDBOXES_HELP = "Maximum number of sandboxes (per-provider) to run in parallel."
 NO_SANDBOX_CLEANUP_HELP = "Do not cleanup sandbox environments after task completes"
 FAIL_ON_ERROR_HELP = "Threshold of sample errors to tolerage (by default, evals fail when any error occurs). Value between 0 to 1 to set a proportion; value greater than 1 to set a count."
 NO_LOG_SAMPLES_HELP = "Do not include samples in the log file."
@@ -41,6 +42,7 @@ LOG_BUFFER_HELP = "Number of samples to buffer before writing log file. If not s
 NO_SCORE_HELP = (
     "Do not score model output (use the inspect score command to score output later)"
 )
+NO_SCORE_DISPLAY = "Do not display scoring metrics in realtime."
 MAX_CONNECTIONS_HELP = f"Maximum number of concurrent connections to Model API (defaults to {DEFAULT_MAX_CONNECTIONS})"
 MAX_RETRIES_HELP = (
     f"Maximum number of times to retry request (defaults to {DEFAULT_MAX_RETRIES})"
@@ -192,6 +194,12 @@ def eval_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
     help=MAX_SUBPROCESSES_HELP,
     envvar="INSPECT_EVAL_MAX_SUBPROCESSES",
 )
+@click.option(
+    "--max-sandboxes",
+    type=int,
+    help=MAX_SANDBOXES_HELP,
+    envvar="INSPECT_EVAL_MAX_SANDBOXES",
+)
 @click.option(
     "--message-limit",
     type=int,
@@ -250,6 +258,13 @@ def eval_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
     help=NO_SCORE_HELP,
     envvar="INSPECT_EVAL_NO_SCORE",
 )
+@click.option(
+    "--no-score-display",
+    type=bool,
+    is_flag=True,
+    help=NO_SCORE_HELP,
+    envvar="INSPECT_EVAL_SCORE_DISPLAY",
+)
 @click.option(
     "--max-tokens",
     type=int,
@@ -332,7 +347,7 @@ def eval_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
     "--logprobs",
     type=bool,
     is_flag=True,
-    help="Return log probabilities of the output tokens. OpenAI, Google, Grok, TogetherAI, Huggingface, vLLM only.",
+    help="Return log probabilities of the output tokens. OpenAI, Google, Grok, TogetherAI, Huggingface, llama-cpp-python, and vLLM only.",
     envvar="INSPECT_EVAL_LOGPROBS",
 )
 @click.option(
@@ -361,6 +376,12 @@ def eval_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
     help='Cache prompt prefix (Anthropic only). Defaults to "auto", which will enable caching for requests with tools.',
     envvar="INSPECT_EVAL_CACHE_PROMPT",
 )
+@click.option(
+    "--reasoning-effort",
+    type=click.Choice(["low", "medium", "high"]),
+    help="Constrains effort on reasoning for reasoning models. Open AI o1 models only.",
+    envvar="INSPECT_EVAL_REASONING_EFFORT",
+)
 @click.option(
     "--log-format",
     type=click.Choice(["eval", "json"], case_sensitive=False),
@@ -419,18 +440,21 @@ def eval_command(
     parallel_tool_calls: bool | None,
     max_tool_output: int | None,
     cache_prompt: str | None,
+    reasoning_effort: str | None,
     message_limit: int | None,
     token_limit: int | None,
     time_limit: int | None,
     max_samples: int | None,
     max_tasks: int | None,
     max_subprocesses: int | None,
+    max_sandboxes: int | None,
     fail_on_error: bool | float | None,
     no_fail_on_error: bool | None,
     no_log_samples: bool | None,
     log_images: bool | None,
     log_buffer: int | None,
     no_score: bool | None,
+    no_score_display: bool | None,
     log_format: Literal["eval", "json"] | None,
     **common: Unpack[CommonOptions],
 ) -> None:
@@ -472,6 +496,7 @@ def eval_command(
         max_samples=max_samples,
         max_tasks=max_tasks,
         max_subprocesses=max_subprocesses,
+        max_sandboxes=max_sandboxes,
         fail_on_error=fail_on_error,
         no_fail_on_error=no_fail_on_error,
         debug_errors=common["debug_errors"],
@@ -479,6 +504,7 @@ def eval_command(
         log_images=log_images,
         log_buffer=log_buffer,
         no_score=no_score,
+        no_score_display=no_score_display,
         is_eval_set=False,
         **config,
     )
@@ -573,18 +599,21 @@ def eval_set_command(
     parallel_tool_calls: bool | None,
     max_tool_output: int | None,
     cache_prompt: str | None,
+    reasoning_effort: str | None,
     message_limit: int | None,
     token_limit: int | None,
     time_limit: int | None,
     max_samples: int | None,
     max_tasks: int | None,
     max_subprocesses: int | None,
+    max_sandboxes: int | None,
     fail_on_error: bool | float | None,
     no_fail_on_error: bool | None,
     no_log_samples: bool | None,
     log_images: bool | None,
     log_buffer: int | None,
     no_score: bool | None,
+    no_score_display: bool | None,
     bundle_dir: str | None,
     bundle_overwrite: bool | None,
     log_format: Literal["eval", "json"] | None,
@@ -628,6 +657,7 @@ def eval_set_command(
         max_samples=max_samples,
         max_tasks=max_tasks,
         max_subprocesses=max_subprocesses,
+        max_sandboxes=max_sandboxes,
         fail_on_error=fail_on_error,
         no_fail_on_error=no_fail_on_error,
         debug_errors=common["debug_errors"],
@@ -635,6 +665,7 @@ def eval_set_command(
         log_images=log_images,
         log_buffer=log_buffer,
         no_score=no_score,
+        no_score_display=no_score_display,
         is_eval_set=True,
         retry_attempts=retry_attempts,
         retry_wait=retry_wait,
@@ -679,6 +710,7 @@ def eval_exec(
     max_samples: int | None,
     max_tasks: int | None,
     max_subprocesses: int | None,
+    max_sandboxes: int | None,
     fail_on_error: bool | float | None,
     no_fail_on_error: bool | None,
     debug_errors: bool | None,
@@ -686,6 +718,7 @@ def eval_exec(
     log_images: bool | None,
     log_buffer: int | None,
     no_score: bool | None,
+    no_score_display: bool | None,
     is_eval_set: bool = False,
     retry_attempts: int | None = None,
     retry_wait: int | None = None,
@@ -726,6 +759,7 @@ def eval_exec(
     log_images = False if log_images is False else None
     trace = True if trace else None
     score = False if no_score else True
+    score_display = False if no_score_display else None

     # build params
     params: dict[str, Any] = (
@@ -756,10 +790,12 @@ def eval_exec(
             max_samples=max_samples,
             max_tasks=max_tasks,
             max_subprocesses=max_subprocesses,
+            max_sandboxes=max_sandboxes,
             log_samples=log_samples,
             log_images=log_images,
             log_buffer=log_buffer,
             score=score,
+            score_display=score_display,
         )
         | kwargs
     )
@@ -834,6 +870,12 @@ def parse_comma_separated(value: str | None) -> list[str] | None:
     help=MAX_SUBPROCESSES_HELP,
     envvar="INSPECT_EVAL_MAX_SUBPROCESSES",
 )
+@click.option(
+    "--max-sandboxes",
+    type=int,
+    help=MAX_SANDBOXES_HELP,
+    envvar="INSPECT_EVAL_MAX_SANDBOXES",
+)
 @click.option(
     "--no-sandbox-cleanup",
     type=bool,
@@ -888,6 +930,13 @@ def parse_comma_separated(value: str | None) -> list[str] | None:
     help=NO_SCORE_HELP,
     envvar="INSPECT_EVAL_SCORE",
 )
+@click.option(
+    "--no-score-display",
+    type=bool,
+    is_flag=True,
+    help=NO_SCORE_HELP,
+    envvar="INSPECT_EVAL_SCORE_DISPLAY",
+)
 @click.option(
     "--max-connections",
     type=int,
@@ -904,6 +953,7 @@ def eval_retry_command(
     max_samples: int | None,
     max_tasks: int | None,
     max_subprocesses: int | None,
+    max_sandboxes: int | None,
     no_sandbox_cleanup: bool | None,
     trace: bool | None,
     fail_on_error: bool | float | None,
@@ -912,6 +962,7 @@ def eval_retry_command(
     log_images: bool | None,
     log_buffer: int | None,
     no_score: bool | None,
+    no_score_display: bool | None,
     max_connections: int | None,
     max_retries: int | None,
     timeout: int | None,
@@ -926,6 +977,7 @@ def eval_retry_command(
     log_samples = False if no_log_samples else None
     log_images = False if log_images is False else None
     score = False if no_score else True
+    score_display = False if no_score_display else None

     # resolve fail_on_error
     if no_fail_on_error is True:
@@ -947,6 +999,7 @@ def eval_retry_command(
         max_samples=max_samples,
         max_tasks=max_tasks,
         max_subprocesses=max_subprocesses,
+        max_sandboxes=max_sandboxes,
         sandbox_cleanup=sandbox_cleanup,
         trace=trace,
         fail_on_error=fail_on_error,
@@ -955,6 +1008,7 @@ def eval_retry_command(
         log_images=log_images,
         log_buffer=log_buffer,
         score=score,
+        score_display=score_display,
         max_retries=max_retries,
         timeout=timeout,
         max_connections=max_connections,
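Taken together, the eval.py changes surface three new options (--max-sandboxes, --no-score-display, --reasoning-effort) and thread the corresponding values through to the underlying eval call. A minimal sketch of equivalent Python usage, assuming eval() accepts the same keyword names the CLI passes through (max_sandboxes, score_display) and forwards reasoning_effort to the model's generation config:

from inspect_ai import Task, eval
from inspect_ai.dataset import Sample

# trivial single-sample task, purely for illustration
task = Task(dataset=[Sample(input="Say hello.")])

eval(
    task,
    model="openai/o1",
    max_sandboxes=20,        # assumed kwarg, mirrors --max-sandboxes
    score_display=False,     # assumed kwarg, mirrors --no-score-display
    reasoning_effort="low",  # assumed kwarg, mirrors --reasoning-effort (o1 models only)
)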
inspect_ai/_cli/main.py
CHANGED
@@ -11,6 +11,7 @@ from .list import list_command
 from .log import log_command
 from .sandbox import sandbox_command
 from .score import score_command
+from .trace import trace_command
 from .view import view_command


@@ -46,6 +47,7 @@ inspect.add_command(log_command)
 inspect.add_command(score_command)
 inspect.add_command(view_command)
 inspect.add_command(sandbox_command)
+inspect.add_command(trace_command)


 def main() -> None:
inspect_ai/_cli/trace.py
ADDED
@@ -0,0 +1,244 @@
+import os
+import shlex
+import time
+from datetime import datetime
+from json import dumps
+from pathlib import Path
+from typing import Callable, cast
+
+import click
+from pydantic_core import to_json
+from rich import print as r_print
+from rich.console import Console, RenderableType
+from rich.table import Column, Table
+
+from inspect_ai._util.error import PrerequisiteError
+from inspect_ai._util.logger import TRACE_FILE_NAME
+from inspect_ai._util.trace import ActionTraceRecord, inspect_trace_dir, read_trace_file
+
+
+@click.group("trace")
+def trace_command() -> None:
+    """List and read execution traces.
+
+    Inspect includes a TRACE log-level which is right below the HTTP and INFO log levels (so not written to the console by default). However, TRACE logs are always recorded to a separate file, and the last 10 TRACE logs are preserved. The 'trace' command provides ways to list and read these traces.
+    """
+    return None
+
+
+@trace_command.command("list")
+@click.option(
+    "--json",
+    type=bool,
+    is_flag=True,
+    default=False,
+    help="Output listing as JSON",
+)
+def list_command(json: bool) -> None:
+    """List all trace files."""
+    trace_dir = inspect_trace_dir()
+    trace_files: list[dict[str, float | str]] = [
+        {"mtime": f.lstat().st_mtime, "file": f.absolute().as_posix()}
+        for f in trace_dir.iterdir()
+        if f.is_file()
+    ]
+    trace_files.sort(key=lambda f: cast(float, f["mtime"]), reverse=True)
+    if json:
+        print(dumps(trace_files, indent=2))
+    else:
+        table = Table(box=None, show_header=True, pad_edge=False)
+        table.add_column("Time")
+        table.add_column("Trace File")
+        for file in trace_files:
+            mtime = datetime.fromtimestamp(cast(float, file["mtime"])).astimezone()
+            table.add_row(
+                mtime.strftime("%d-%b %H:%M:%S %Z"), shlex.quote(str(file["file"]))
+            )
+        r_print(table)
+
+
+@trace_command.command("dump")
+@click.argument("trace-file", type=str, required=False, default=TRACE_FILE_NAME)
+def read_command(trace_file: str) -> None:
+    """Dump a trace file to stdout (as a JSON array of log records)."""
+    trace_file_path = resolve_trace_file_path(trace_file)
+
+    traces = read_trace_file(trace_file_path)
+    print(
+        to_json(traces, indent=2, exclude_none=True, fallback=lambda _: None).decode()
+    )
+
+
+@trace_command.command("anomalies")
+@click.argument("trace-file", type=str, required=False, default=TRACE_FILE_NAME)
+@click.option(
+    "--all",
+    is_flag=True,
+    default=False,
+    help="Show all anomolies including errors and timeouts (by default only still running and cancelled actions are shown).",
+)
+def anomolies_command(trace_file: str, all: bool) -> None:
+    """Look for anomalies in a trace file (never completed or cancelled actions)."""
+    trace_file_path = resolve_trace_file_path(trace_file)
+    traces = read_trace_file(trace_file_path)
+
+    # Track started actions
+    running_actions: dict[str, ActionTraceRecord] = {}
+    canceled_actions: dict[str, ActionTraceRecord] = {}
+    error_actions: dict[str, ActionTraceRecord] = {}
+    timeout_actions: dict[str, ActionTraceRecord] = {}
+
+    def action_started(trace: ActionTraceRecord) -> None:
+        running_actions[trace.trace_id] = trace
+
+    def action_completed(trace: ActionTraceRecord) -> ActionTraceRecord:
+        start_trace = running_actions.get(trace.trace_id)
+        if start_trace:
+            del running_actions[trace.trace_id]
+            return start_trace
+        else:
+            raise RuntimeError(f"Expected {trace.trace_id} in action dictionary.")
+
+    def action_failed(trace: ActionTraceRecord) -> None:
+        if all:
+            error_actions[start_trace.trace_id] = trace
+
+    def action_canceled(trace: ActionTraceRecord) -> None:
+        canceled_actions[start_trace.trace_id] = trace
+
+    def action_timeout(trace: ActionTraceRecord) -> None:
+        if all:
+            timeout_actions[start_trace.trace_id] = trace
+
+    for trace in traces:
+        if isinstance(trace, ActionTraceRecord):
+            match trace.event:
+                case "enter":
+                    action_started(trace)
+                case "exit":
+                    action_completed(trace)
+                case "cancel":
+                    start_trace = action_completed(trace)
+                    trace.start_time = start_trace.start_time
+                    action_canceled(trace)
+                case "error":
+                    start_trace = action_completed(trace)
+                    trace.start_time = start_trace.start_time
+                    action_failed(trace)
+                case "timeout":
+                    start_trace = action_completed(trace)
+                    trace.start_time = start_trace.start_time
+                    action_timeout(trace)
+                case _:
+                    print(f"Unknown event type: {trace.event}")
+
+    # do we have any traces?
+    if (
+        len(running_actions)
+        + len(canceled_actions)
+        + len(error_actions)
+        + len(timeout_actions)
+        == 0
+    ):
+        print(f"TRACE: {shlex.quote(trace_file_path.as_posix())}\n")
+        if all:
+            print("No anomalies found in trace log.")
+        else:
+            print(
+                "No running or cancelled actions found in trace log (pass --all to see errors and timeouts)."
+            )
+        return
+
+    with open(os.devnull, "w") as f:
+        # generate output
+        console = Console(record=True, file=f)
+
+        def print_fn(o: RenderableType) -> None:
+            console.print(o, highlight=False)
+
+        print_fn(f"[bold]TRACE: {shlex.quote(trace_file_path.as_posix())}[bold]")
+
+        _print_bucket(print_fn, "Running Actions", running_actions)
+        _print_bucket(print_fn, "Cancelled Actions", canceled_actions)
+        _print_bucket(print_fn, "Error Actions", error_actions)
+        _print_bucket(print_fn, "Timeout Actions", timeout_actions)
+
+        # print
+        print(console.export_text(styles=True).strip())
+
+
+def _print_bucket(
+    print_fn: Callable[[RenderableType], None],
+    label: str,
+    bucket: dict[str, ActionTraceRecord],
+) -> None:
+    if len(bucket) > 0:
+        # Sort the items in chronological order of when
+        # they finished so the first finished item is at the top
+        sorted_actions = sorted(
+            bucket.values(),
+            key=lambda record: (record.start_time or 0) + (record.duration or 0),
+            reverse=True,
+        )
+
+        # create table
+        table = Table(
+            Column(""),
+            Column("", justify="right"),
+            Column(""),
+            Column("", width=22),
+            box=None,
+            title=label,
+            title_justify="left",
+            title_style="bold",
+            pad_edge=False,
+            padding=(0, 1),
+        )
+
+        for action in sorted_actions:
+            # Compute duration (use the event duration or time since started)
+            duration = (
+                action.duration
+                if action.duration is not None
+                else time.time() - action.start_time
+                if action.start_time is not None
+                else 0.0
+            )
+
+            # The event start time
+            start_time = formatTime(action.start_time) if action.start_time else "None"
+
+            # Event detail
+            detail = (
+                f"{action.detail or action.message} {action.error}"
+                if action.event == "error"
+                else (action.detail or action.message)
+            )
+
+            table.add_row(
+                action.action,
+                f"{round(duration, 2):.2f}s".rjust(8),
+                f" {detail}",
+                start_time,
+            )
+
+        print_fn("")
+        print_fn(table)
+
+
+def resolve_trace_file_path(trace_file: str) -> Path:
+    trace_file_path = Path(trace_file)
+    if not trace_file_path.is_absolute():
+        trace_file_path = inspect_trace_dir() / trace_file_path
+
+    if not trace_file_path.exists():
+        raise PrerequisiteError(
+            f"The specified trace file '{trace_file_path}' does not exist."
+        )
+
+    return trace_file_path
+
+
+def formatTime(timestamp: float) -> str:
+    dt = datetime.fromtimestamp(timestamp).astimezone()
+    return dt.strftime("%H:%M:%S %Z")
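The new trace command group (registered in main.py above) is a thin CLI over helpers in the also-new inspect_ai._util.trace module. Beyond `inspect trace list`, `inspect trace dump`, and `inspect trace anomalies`, those helpers can be driven directly; a small sketch, assuming inspect_trace_dir() and read_trace_file() behave exactly as used in the command code above (note they live under a private _util package, so they are internal APIs):

from inspect_ai._util.trace import ActionTraceRecord, inspect_trace_dir, read_trace_file

# pick the most recently modified trace file, mirroring `inspect trace list`
trace_files = sorted(
    (f for f in inspect_trace_dir().iterdir() if f.is_file()),
    key=lambda f: f.lstat().st_mtime,
    reverse=True,
)
if trace_files:
    for record in read_trace_file(trace_files[0]):
        # action records carry the enter/exit/cancel/error/timeout lifecycle
        if isinstance(record, ActionTraceRecord):
            print(record.action, record.event, record.message)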
inspect_ai/_display/core/progress.py
CHANGED
@@ -130,9 +130,15 @@ def progress_time(time: float) -> str:
     return f"{hours:2.0f}:{minutes:02.0f}:{seconds:02.0f}"


-def progress_count(complete: int, total: int) -> str:
-    # Pad the display to keep it stable
+def progress_count(complete: int, total: int, width: int | None = None) -> str:
+    # Pad the display to keep it stable as the
+    # complete metrics
     total_str = f"{total:,}"
     complete_str = f"{complete:,}"
     padding = max(0, len(total_str) - len(complete_str))
-    return " " * padding + f"[{complete_str}/{total_str}]"
+    padded = " " * padding + f"[{complete_str}/{total_str}]"
+
+    # If a width has ben specified, pad up to this width as well
+    if width is not None:
+        padded = padded.rjust(width)
+    return padded
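For reference, a quick sketch of the padded output the widened progress_count produces (values chosen arbitrarily; the import path is the internal module shown in the file list above):

from inspect_ai._display.core.progress import progress_count

progress_count(7, 1000)            # "    [7/1,000]"  (left-padded to match the total)
progress_count(7, 1000, width=16)  # "       [7/1,000]"  (then right-justified to 16 chars)
progress_count(950, 1000)          # "  [950/1,000]"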
inspect_ai/_display/core/results.py
CHANGED
@@ -166,7 +166,7 @@ def task_interrupted(profile: TaskProfile, samples_completed: int) -> Renderable
     return message


-def task_metric(metrics: list[TaskDisplayMetric]) -> str:
+def task_metric(metrics: list[TaskDisplayMetric], width: int | None = None) -> str:
     reducer_names: Set[str] = {
         metric.reducer for metric in metrics if metric.reducer is not None
     }
@@ -180,10 +180,14 @@ def task_metric(metrics: list[TaskDisplayMetric]) -> str:
     else:
         value = f"{metric.value:.2f}"

-    if show_reducer:
-        return f"{metric.name}/{metric.reducer}: {value}"
+    if show_reducer and metric.reducer is not None:
+        metric_str = f"{metric.name}/{metric.reducer}: {value}"
     else:
-        return f"{metric.name}: {value}"
+        metric_str = f"{metric.name}: {value}"
+
+    if width is not None:
+        metric_str = metric_str.rjust(width)
+    return metric_str


 def task_metrics(scores: list[EvalScore]) -> str:
inspect_ai/_display/textual/app.py
CHANGED
@@ -197,7 +197,11 @@ class TaskScreenApp(App[TR]):

         # add task
         try:
-            yield self.query_one(TasksView).add_task(task)
+            task_view = self.query_one(TasksView)
+            task_view.set_display_metrics(
+                profile.eval_config.score_display is not False
+            )
+            yield task_view.add_task(task)
         finally:
             pass
