inspect-ai 0.3.102__py3-none-any.whl → 0.3.104__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (112)
  1. inspect_ai/_cli/common.py +2 -1
  2. inspect_ai/_cli/eval.py +2 -1
  3. inspect_ai/_display/core/active.py +3 -0
  4. inspect_ai/_display/core/config.py +1 -0
  5. inspect_ai/_display/core/panel.py +21 -13
  6. inspect_ai/_display/core/results.py +3 -7
  7. inspect_ai/_display/core/rich.py +3 -5
  8. inspect_ai/_display/log/__init__.py +0 -0
  9. inspect_ai/_display/log/display.py +173 -0
  10. inspect_ai/_display/plain/display.py +2 -2
  11. inspect_ai/_display/rich/display.py +2 -4
  12. inspect_ai/_display/textual/app.py +1 -6
  13. inspect_ai/_display/textual/widgets/task_detail.py +3 -14
  14. inspect_ai/_display/textual/widgets/tasks.py +1 -1
  15. inspect_ai/_eval/eval.py +14 -2
  16. inspect_ai/_eval/evalset.py +3 -2
  17. inspect_ai/_eval/registry.py +6 -1
  18. inspect_ai/_eval/run.py +7 -1
  19. inspect_ai/_eval/task/constants.py +1 -0
  20. inspect_ai/_eval/task/log.py +5 -1
  21. inspect_ai/_eval/task/run.py +1 -1
  22. inspect_ai/_util/citation.py +88 -0
  23. inspect_ai/_util/content.py +24 -2
  24. inspect_ai/_util/json.py +17 -2
  25. inspect_ai/_util/registry.py +19 -4
  26. inspect_ai/_view/schema.py +0 -6
  27. inspect_ai/_view/www/dist/assets/index.css +82 -24
  28. inspect_ai/_view/www/dist/assets/index.js +10124 -9808
  29. inspect_ai/_view/www/log-schema.json +418 -1
  30. inspect_ai/_view/www/node_modules/flatted/python/flatted.py +149 -0
  31. inspect_ai/_view/www/node_modules/katex/src/fonts/generate_fonts.py +58 -0
  32. inspect_ai/_view/www/node_modules/katex/src/metrics/extract_tfms.py +114 -0
  33. inspect_ai/_view/www/node_modules/katex/src/metrics/extract_ttfs.py +122 -0
  34. inspect_ai/_view/www/node_modules/katex/src/metrics/format_json.py +28 -0
  35. inspect_ai/_view/www/node_modules/katex/src/metrics/parse_tfm.py +211 -0
  36. inspect_ai/_view/www/package.json +2 -2
  37. inspect_ai/_view/www/src/@types/log.d.ts +140 -39
  38. inspect_ai/_view/www/src/app/content/RecordTree.tsx +13 -0
  39. inspect_ai/_view/www/src/app/log-view/LogView.tsx +1 -1
  40. inspect_ai/_view/www/src/app/routing/logNavigation.ts +31 -0
  41. inspect_ai/_view/www/src/app/routing/{navigationHooks.ts → sampleNavigation.ts} +39 -86
  42. inspect_ai/_view/www/src/app/samples/SampleDialog.tsx +1 -1
  43. inspect_ai/_view/www/src/app/samples/SampleDisplay.tsx +1 -1
  44. inspect_ai/_view/www/src/app/samples/chat/MessageCitations.module.css +16 -0
  45. inspect_ai/_view/www/src/app/samples/chat/MessageCitations.tsx +63 -0
  46. inspect_ai/_view/www/src/app/samples/chat/MessageContent.module.css +6 -0
  47. inspect_ai/_view/www/src/app/samples/chat/MessageContent.tsx +174 -25
  48. inspect_ai/_view/www/src/app/samples/chat/MessageContents.tsx +21 -3
  49. inspect_ai/_view/www/src/app/samples/chat/content-data/ContentDataView.module.css +7 -0
  50. inspect_ai/_view/www/src/app/samples/chat/content-data/ContentDataView.tsx +111 -0
  51. inspect_ai/_view/www/src/app/samples/chat/content-data/WebSearch.module.css +10 -0
  52. inspect_ai/_view/www/src/app/samples/chat/content-data/WebSearch.tsx +14 -0
  53. inspect_ai/_view/www/src/app/samples/chat/content-data/WebSearchResults.module.css +19 -0
  54. inspect_ai/_view/www/src/app/samples/chat/content-data/WebSearchResults.tsx +49 -0
  55. inspect_ai/_view/www/src/app/samples/chat/messages.ts +7 -1
  56. inspect_ai/_view/www/src/app/samples/chat/tools/ToolCallView.tsx +12 -2
  57. inspect_ai/_view/www/src/app/samples/chat/types.ts +4 -0
  58. inspect_ai/_view/www/src/app/samples/list/SampleList.tsx +1 -1
  59. inspect_ai/_view/www/src/app/samples/sampleLimit.ts +2 -2
  60. inspect_ai/_view/www/src/app/samples/transcript/ModelEventView.tsx +1 -1
  61. inspect_ai/_view/www/src/app/samples/transcript/SampleLimitEventView.tsx +4 -4
  62. inspect_ai/_view/www/src/app/samples/transcript/outline/TranscriptOutline.tsx +1 -1
  63. inspect_ai/_view/www/src/components/MarkdownDiv.tsx +15 -2
  64. inspect_ai/_view/www/src/tests/README.md +2 -2
  65. inspect_ai/_view/www/src/utils/git.ts +3 -1
  66. inspect_ai/_view/www/src/utils/html.ts +6 -0
  67. inspect_ai/agent/_handoff.py +3 -3
  68. inspect_ai/log/_condense.py +5 -0
  69. inspect_ai/log/_file.py +4 -1
  70. inspect_ai/log/_log.py +9 -4
  71. inspect_ai/log/_recorders/eval.py +4 -3
  72. inspect_ai/log/_recorders/json.py +5 -2
  73. inspect_ai/log/_recorders/recorder.py +1 -0
  74. inspect_ai/log/_util.py +2 -0
  75. inspect_ai/model/__init__.py +14 -0
  76. inspect_ai/model/_call_tools.py +13 -4
  77. inspect_ai/model/_chat_message.py +3 -0
  78. inspect_ai/model/_openai_responses.py +80 -34
  79. inspect_ai/model/_providers/_anthropic_citations.py +158 -0
  80. inspect_ai/model/_providers/_google_citations.py +100 -0
  81. inspect_ai/model/_providers/anthropic.py +196 -34
  82. inspect_ai/model/_providers/google.py +94 -22
  83. inspect_ai/model/_providers/mistral.py +20 -7
  84. inspect_ai/model/_providers/openai.py +11 -10
  85. inspect_ai/model/_providers/openai_compatible.py +3 -2
  86. inspect_ai/model/_providers/openai_responses.py +2 -5
  87. inspect_ai/model/_providers/perplexity.py +123 -0
  88. inspect_ai/model/_providers/providers.py +13 -2
  89. inspect_ai/model/_providers/vertex.py +3 -0
  90. inspect_ai/model/_trim.py +5 -0
  91. inspect_ai/tool/__init__.py +14 -0
  92. inspect_ai/tool/_mcp/_mcp.py +5 -2
  93. inspect_ai/tool/_mcp/sampling.py +19 -3
  94. inspect_ai/tool/_mcp/server.py +1 -1
  95. inspect_ai/tool/_tool.py +10 -1
  96. inspect_ai/tool/_tools/_web_search/_base_http_provider.py +104 -0
  97. inspect_ai/tool/_tools/_web_search/_exa.py +78 -0
  98. inspect_ai/tool/_tools/_web_search/_google.py +22 -25
  99. inspect_ai/tool/_tools/_web_search/_tavily.py +47 -65
  100. inspect_ai/tool/_tools/_web_search/_web_search.py +83 -36
  101. inspect_ai/tool/_tools/_web_search/_web_search_provider.py +7 -0
  102. inspect_ai/util/_display.py +11 -2
  103. inspect_ai/util/_sandbox/docker/compose.py +2 -2
  104. inspect_ai/util/_span.py +12 -1
  105. {inspect_ai-0.3.102.dist-info → inspect_ai-0.3.104.dist-info}/METADATA +2 -2
  106. {inspect_ai-0.3.102.dist-info → inspect_ai-0.3.104.dist-info}/RECORD +112 -88
  107. /inspect_ai/model/{_openai_computer_use.py → _providers/_openai_computer_use.py} +0 -0
  108. /inspect_ai/model/{_openai_web_search.py → _providers/_openai_web_search.py} +0 -0
  109. {inspect_ai-0.3.102.dist-info → inspect_ai-0.3.104.dist-info}/WHEEL +0 -0
  110. {inspect_ai-0.3.102.dist-info → inspect_ai-0.3.104.dist-info}/entry_points.txt +0 -0
  111. {inspect_ai-0.3.102.dist-info → inspect_ai-0.3.104.dist-info}/licenses/LICENSE +0 -0
  112. {inspect_ai-0.3.102.dist-info → inspect_ai-0.3.104.dist-info}/top_level.txt +0 -0
inspect_ai/_cli/common.py CHANGED
@@ -60,7 +60,8 @@ def common_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
     @click.option(
         "--display",
         type=click.Choice(
-            ["full", "conversation", "rich", "plain", "none"], case_sensitive=False
+            ["full", "conversation", "rich", "plain", "log", "none"],
+            case_sensitive=False,
         ),
         default=DEFAULT_DISPLAY,
         envvar="INSPECT_DISPLAY",
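The new "log" display choice routes progress and status output through Python's logging module instead of an interactive console (see the LogDisplay implementation added below). A minimal sketch of enabling it via the INSPECT_DISPLAY environment variable declared above; the task file path is hypothetical:

import logging
import os

# select the new "log" display mode before running an eval
os.environ["INSPECT_DISPLAY"] = "log"

# LogDisplay emits status lines via logging.info, so surface INFO-level records
logging.basicConfig(level=logging.INFO)

from inspect_ai import eval

eval("my_task.py")  # hypothetical task file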
inspect_ai/_cli/eval.py CHANGED
@@ -641,7 +641,7 @@ def eval_command(
 @click.option(
     "--retry-connections",
     type=float,
-    help="Reduce max_connections at this rate with each retry (defaults to 0.5)",
+    help="Reduce max_connections at this rate with each retry (defaults to 1.0, which results in no reduction).",
     envvar="INSPECT_EVAL_RETRY_CONNECTIONS",
 )
 @click.option(
@@ -966,6 +966,7 @@ def eval_exec(
         success, _ = eval_set(**params)
         return success
     else:
+        params["log_header_only"] = True  # cli invocation doesn't need full log
         eval(**params)
         return True
 
inspect_ai/_display/core/active.py CHANGED
@@ -5,6 +5,7 @@ import rich
 
 from inspect_ai.util._display import display_type
 
+from ..log.display import LogDisplay
 from ..plain.display import PlainDisplay
 from ..rich.display import RichDisplay
 from ..textual.display import TextualDisplay
@@ -24,6 +25,8 @@ def display() -> Display:
         and not rich.get_console().is_jupyter
     ):
         _active_display = TextualDisplay()
+    elif display_type() == "log":
+        _active_display = LogDisplay()
     else:
         _active_display = RichDisplay()
 
inspect_ai/_display/core/config.py CHANGED
@@ -30,6 +30,7 @@ def task_config(
     config = dict(profile.generate_config.model_dump(exclude_none=True)) | config
     if profile.tags:
         config["tags"] = ",".join(profile.tags)
+    config["dataset"] = profile.dataset
     config_print: list[str] = []
     for name, value in config.items():
         if name == "approval" and isinstance(value, dict):
inspect_ai/_display/core/panel.py CHANGED
@@ -1,7 +1,7 @@
 from typing import Tuple
 
 import rich
-from rich.console import RenderableType
+from rich.console import Group, RenderableType
 from rich.panel import Panel
 from rich.table import Table
 from rich.text import Text
@@ -9,7 +9,7 @@ from rich.text import Text
 from inspect_ai._util.constants import CONSOLE_DISPLAY_WIDTH
 from inspect_ai._util.path import cwd_relative_path
 from inspect_ai._util.registry import registry_unqualified_name
-from inspect_ai.util._display import display_type
+from inspect_ai.util._display import display_type_plain
 
 from .display import TaskProfile
 from .rich import is_vscode_notebook, rich_theme
@@ -27,7 +27,7 @@
     log_location: str | None,
 ) -> RenderableType:
     # dispatch to plain handler if we are in plain mode
-    if display_type() == "plain":
+    if display_type_plain():
         return task_panel_plain(
             profile, show_model, body, subtitle, footer, log_location
         )
@@ -89,23 +89,31 @@
            log_location_relative = log_location
 
        root = Table.grid(expand=True)
-       root.add_column()
+       root.add_column(overflow="fold")
        root.add_row(table)
        root.add_row()
        root.add_row(
            f"[bold][{theme.light}]Log:[/{theme.light}][/bold] "
            + f"[{theme.link}]{log_location_relative}[/{theme.link}]"
        )
+       root.add_row()
 
-       # create panel w/ title
-       panel = Panel(
-           root,
-           title=task_panel_title(profile, show_model),
-           title_align="left",
-           width=width,
-           expand=True,
-       )
-       return panel
+       panel = Panel(
+           task_panel_title(profile, show_model),
+           padding=(0, 0),
+           width=width,
+           height=3,
+           expand=True,
+       )
+       return Group(panel, root)
+    else:
+       return Panel(
+           root,
+           title=task_panel_title(profile, show_model),
+           title_align="left",
+           width=width,
+           expand=True,
+       )
 
 
 def task_panel_plain(
inspect_ai/_display/core/results.py CHANGED
@@ -18,7 +18,7 @@ from .display import (
     TaskSuccess,
     TaskWithResult,
 )
-from .panel import task_panel, task_targets
+from .panel import task_panel
 from .rich import rich_theme
 
 
@@ -41,8 +41,6 @@ def task_result_cancelled(
 ) -> RenderableType:
     # The contents of the panel
     config = task_config(profile)
-    targets = task_targets(profile)
-    subtitle = config, targets
     body = task_stats(cancelled.stats)
 
     # The panel
@@ -50,7 +48,7 @@ def task_result_cancelled(
         profile=profile,
         show_model=True,
         body=body,
-        subtitle=subtitle,
+        subtitle=config,
         footer=task_interrupted(profile, cancelled.samples_completed),
         log_location=profile.log_location,
     )
@@ -76,8 +74,6 @@ def task_results(profile: TaskProfile, success: TaskSuccess) -> RenderableType:
 def task_result_summary(profile: TaskProfile, success: TaskSuccess) -> RenderableType:
     # The contents of the panel
     config = task_config(profile)
-    targets = task_targets(profile)
-    subtitle = config, targets
     body = task_stats(success.stats)
 
     # the panel
@@ -85,7 +81,7 @@ def task_result_summary(profile: TaskProfile, success: TaskSuccess) -> RenderableType:
         profile=profile,
         show_model=True,
         body=body,
-        subtitle=subtitle,
+        subtitle=config,
         footer=task_results(profile, success),
         log_location=profile.log_location,
     )
inspect_ai/_display/core/rich.py CHANGED
@@ -11,7 +11,7 @@ from typing_extensions import override
 
 from inspect_ai._util.platform import is_running_in_jupyterlab, is_running_in_vscode
 from inspect_ai._util.transcript import transcript_code_theme
-from inspect_ai.util._display import display_type
+from inspect_ai.util._display import display_type, display_type_plain
 
 
 def is_vscode_notebook(console: Console) -> bool:
@@ -20,15 +20,13 @@ def is_vscode_notebook(console: Console) -> bool:
 
 def rich_no_color() -> bool:
     return (
-        display_type() == "plain"
-        or not is_running_in_vscode()
-        or is_running_in_jupyterlab()
+        display_type_plain() or not is_running_in_vscode() or is_running_in_jupyterlab()
     )
 
 
 def rich_initialise() -> None:
     # reflect ansi prefs
-    if display_type() == "plain":
+    if display_type_plain():
         rich.reconfigure(no_color=True, force_terminal=False, force_interactive=False)
     elif rich_no_color():
         rich.reconfigure(no_color=True)
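These call sites switch from checking display_type() == "plain" to the new display_type_plain() helper (added in inspect_ai/util/_display.py, +11 -2 in the file list). A plausible sketch of what such a helper does, assuming it simply treats the new "log" mode as plain, non-interactive output; the body shown here is illustrative, not the actual implementation:

def display_type_plain() -> bool:
    # assumed behavior: both "plain" and the new "log" display modes want
    # plain (non-ANSI, non-interactive) console output; display_type() is
    # the existing accessor defined in the same module
    return display_type() in ("plain", "log")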
inspect_ai/_display/log/__init__.py ADDED (new empty file, no content to show)
inspect_ai/_display/log/display.py ADDED
@@ -0,0 +1,173 @@
+import contextlib
+import logging
+from typing import AsyncIterator, Callable, Coroutine, Iterator
+
+import anyio
+from rich.console import Console
+
+from inspect_ai._util._async import configured_async_backend, run_coroutine
+from inspect_ai._util.platform import running_in_notebook
+
+from ...util import throttle
+from ...util._concurrency import concurrency_status_display
+from ..core.display import (
+    TR,
+    Display,
+    Progress,
+    TaskDisplay,
+    TaskDisplayMetric,
+    TaskProfile,
+    TaskResult,
+    TaskScreen,
+    TaskSpec,
+    TaskWithResult,
+)
+from ..core.footer import task_http_retries_str
+from ..core.results import task_metric, tasks_results
+
+
+class LogDisplay(Display):
+    def __init__(self) -> None:
+        self.total_tasks: int = 0
+        self.tasks: list[TaskWithResult] = []
+        self.parallel = False
+
+    def print(self, message: str) -> None:
+        logging.info(message, stacklevel=2)
+
+    @contextlib.contextmanager
+    def progress(self, total: int) -> Iterator[Progress]:
+        yield LogProgress(total)
+
+    def run_task_app(self, main: Callable[[], Coroutine[None, None, TR]]) -> TR:
+        if running_in_notebook():
+            return run_coroutine(main())
+        else:
+            return anyio.run(main, backend=configured_async_backend())
+
+    @contextlib.contextmanager
+    def suspend_task_app(self) -> Iterator[None]:
+        yield
+
+    @contextlib.asynccontextmanager
+    async def task_screen(
+        self, tasks: list[TaskSpec], parallel: bool
+    ) -> AsyncIterator[TaskScreen]:
+        self.total_tasks = len(tasks)
+        self.tasks = []
+        self.parallel = parallel
+        try:
+            logging.info(f"Running {self.total_tasks} tasks...", stacklevel=3)
+            yield TaskScreen()
+        finally:
+            # Log final results
+            if self.tasks:
+                self._log_results()
+
+    @contextlib.contextmanager
+    def task(self, profile: TaskProfile) -> Iterator[TaskDisplay]:
+        # Create and yield task display
+        task = TaskWithResult(profile, None)
+        self.tasks.append(task)
+        yield LogTaskDisplay(task)
+        self._log_status()
+
+    def display_counter(self, caption: str, value: str) -> None:
+        logging.info(f"{caption}: {value}", stacklevel=2)
+
+    def _log_status(self) -> None:
+        """Log status updates for all tasks"""
+        completed_tasks = sum(1 for task in self.tasks if task.result is not None)
+        total_tasks = len(self.tasks)
+        logging.info(f"{completed_tasks}/{total_tasks} tasks complete", stacklevel=4)
+
+    def _log_results(self) -> None:
+        """Log final results"""
+        results = tasks_results(self.tasks)
+        console = Console(width=120)
+        console.log(results, _stack_offset=4)
+
+
+class LogProgress(Progress):
+    def __init__(self, total: int):
+        self.total = total
+        self.current = 0
+
+    def update(self, n: int = 1) -> None:
+        self.current += n
+
+    def complete(self) -> None:
+        self.current = self.total
+
+
+class LogTaskDisplay(TaskDisplay):
+    def __init__(self, task: TaskWithResult):
+        self.task = task
+        self.progress_display: LogProgress | None = None
+        self.samples_complete = 0
+        self.samples_total = 0
+        self.current_metrics: list[TaskDisplayMetric] | None = None
+
+    @contextlib.contextmanager
+    def progress(self) -> Iterator[Progress]:
+        self.progress_display = LogProgress(self.task.profile.steps)
+        yield self.progress_display
+
+    @throttle(5)
+    def _log_status_throttled(self, stacklevel: int) -> None:
+        self._log_status(stacklevel=stacklevel + 2)
+
+    def _log_status(self, stacklevel: int) -> None:
+        """Log status updates"""
+        status_parts: list[str] = []
+
+        # Add task name and model
+        status_parts.append(f"Task: {self.task.profile.name}")
+        status_parts.append(f"Model: {self.task.profile.model}")
+
+        # Add step progress
+        if self.progress_display:
+            progress_percent = int(
+                self.progress_display.current / self.progress_display.total * 100
+            )
+            status_parts.append(
+                f"Steps: {self.progress_display.current}/{self.progress_display.total} {progress_percent}%"
+            )
+
+        # Add sample progress
+        status_parts.append(f"Samples: {self.samples_complete}/{self.samples_total}")
+
+        # Add metrics
+        if self.current_metrics:
+            metric_str = task_metric(self.current_metrics)
+            status_parts.append(metric_str)
+
+        # Add resource usage
+        resources_dict: dict[str, str] = {}
+        for model, resource in concurrency_status_display().items():
+            resources_dict[model] = f"{resource[0]}/{resource[1]}"
+        resources = ", ".join(
+            [f"{key}: {value}" for key, value in resources_dict.items()]
+        )
+        status_parts.append(resources)
+
+        # Add rate limits
+        rate_limits = task_http_retries_str()
+        if rate_limits:
+            status_parts.append(rate_limits)
+
+        # Print on new line
+        logging.info(", ".join(status_parts), stacklevel=stacklevel)
+
+    def sample_complete(self, complete: int, total: int) -> None:
+        self.samples_complete = complete
+        self.samples_total = total
+        self._log_status_throttled(stacklevel=3)
+
+    def update_metrics(self, metrics: list[TaskDisplayMetric]) -> None:
+        self.current_metrics = metrics
+        self._log_status_throttled(stacklevel=3)
+
+    def complete(self, result: TaskResult) -> None:
+        self.task.result = result
+        self._log_status(stacklevel=3)
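LogTaskDisplay rate-limits its status lines with @throttle(5), so frequent sample-progress and metric callbacks collapse into at most one log record every few seconds. A minimal sketch of that pattern, written as a standalone decorator purely for illustration (the real throttle is imported from inspect_ai's util package and may differ):

import time
from functools import wraps
from typing import Any, Callable

def throttle(seconds: float) -> Callable[[Callable[..., Any]], Callable[..., Any]]:
    # illustrative throttle: silently drop calls made within `seconds` of the last one
    def decorator(fn: Callable[..., Any]) -> Callable[..., Any]:
        last_called = 0.0

        @wraps(fn)
        def wrapper(*args: Any, **kwargs: Any) -> None:
            nonlocal last_called
            now = time.monotonic()
            if now - last_called >= seconds:
                last_called = now
                fn(*args, **kwargs)

        return wrapper

    return decorator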
inspect_ai/_display/plain/display.py CHANGED
@@ -25,7 +25,7 @@ from ..core.display import (
     TaskWithResult,
 )
 from ..core.footer import task_http_retries_str
-from ..core.panel import task_panel, task_targets
+from ..core.panel import task_panel
 from ..core.results import task_metric, tasks_results
 
 
@@ -79,7 +79,7 @@ class PlainDisplay(Display):
            profile=profile,
            show_model=True,
            body="",  # Empty body since we haven't started yet
-           subtitle=(task_config(profile), task_targets(profile)),
+           subtitle=task_config(profile),
            footer=None,
            log_location=None,
        )
inspect_ai/_display/rich/display.py CHANGED
@@ -32,7 +32,7 @@ from ..core.display import (
     TaskWithResult,
 )
 from ..core.footer import task_footer
-from ..core.panel import task_panel, task_targets, task_title, tasks_title
+from ..core.panel import task_panel, task_title, tasks_title
 from ..core.progress import (
     RichProgress,
     progress_description,
@@ -311,15 +311,13 @@ def task_live_status(
 
     # the panel contents
     config = task_config(tasks[0].profile, style=theme.light)
-    targets = task_targets(tasks[0].profile)
-    subtitle = config, targets
 
     # the panel
     return task_panel(
         profile=tasks[0].profile,
         show_model=len(tasks) == 1,
         body=Group("", progress),
-        subtitle=subtitle,
+        subtitle=config,
         footer=task_footer(counters, theme.light),
         log_location=None,
     )
inspect_ai/_display/textual/app.py CHANGED
@@ -42,7 +42,7 @@ from ..core.display import (
     TaskWithResult,
 )
 from ..core.footer import task_footer
-from ..core.panel import task_targets, task_title, tasks_title
+from ..core.panel import task_title, tasks_title
 from ..core.rich import record_console_input, rich_initialise, rich_theme
 from .theme import inspect_dark, inspect_light
 from .widgets.console import ConsoleView
@@ -296,13 +296,8 @@ class TaskScreenApp(App[TR]):
             tasks.config = task_config(
                 self._tasks[0].profile, generate_config=not self._parallel
             )
-            if not self._parallel:
-                tasks.targets = task_targets(self._tasks[0].profile)
-            else:
-                tasks.targets = " \n "
         else:
             tasks.config = ""
-            tasks.targets = ""
 
     def update_samples(self) -> None:
         samples_view = self.query_one(SamplesView)
inspect_ai/_display/textual/widgets/task_detail.py CHANGED
@@ -30,6 +30,8 @@ class TaskDetail(Widget):
         width: 100%;
         height: auto;
         grid-gutter: 1 3;
+        grid-size-columns: 3;
+        grid-columns: 1fr 1fr 1fr;
     }
     """
 
@@ -92,20 +94,6 @@ class TaskDetail(Widget):
         if len(self.by_reducer) == 0:
             return
 
-        # Compute the row and column count
-        row_count = len(self.by_reducer)
-        col_count = len(next(iter(self.by_reducer.values())))
-
-        # If this can fit in a single row, make it fit
-        # otherwise place each reducer on their own row
-        self.grid.styles.grid_columns = "auto"
-        if row_count * col_count < 4:
-            self.grid.styles.grid_size_columns = row_count * col_count
-            self.grid.styles.grid_size_rows = 1
-        else:
-            self.grid.styles.grid_size_columns = col_count
-            self.grid.styles.grid_size_rows = row_count
-
         # In order to reduce flashing the below tracks use of widgets
         # and updates them when possible (removing and adding them as needed)
         # Makes keys for tracking Task Metric widgets
@@ -142,6 +130,7 @@ class TaskMetrics(Widget):
     TaskMetrics {
         width: auto;
        height: auto;
+        border: solid $foreground 20%;
     }
     TaskMetrics Grid {
        width: auto;
inspect_ai/_display/textual/widgets/tasks.py CHANGED
@@ -174,7 +174,7 @@ class TaskProgressView(Widget):
        color:$text-secondary;
    }
    #task-detail {
-       column-span: 8;
+       column-span: 9;
    }
    .hidden {
        display: none;
inspect_ai/_eval/eval.py CHANGED
@@ -105,6 +105,7 @@ def eval(
     log_images: bool | None = None,
     log_buffer: int | None = None,
     log_shared: bool | int | None = None,
+    log_header_only: bool | None = None,
     score: bool = True,
     score_display: bool | None = None,
     **kwargs: Unpack[GenerateConfigArgs],
@@ -181,6 +182,8 @@ def eval(
         log_shared: Sync sample events to log directory so that users on other systems
             can see log updates in realtime (defaults to no syncing). Specify `True`
             to sync every 10 seconds, otherwise an integer to sync every `n` seconds.
+        log_header_only: If `True`, the function should return only log headers rather
+            than full logs with samples (defaults to `False`).
         score: Score output (defaults to True)
         score_display: Show scoring metrics in realtime (defaults to True)
         **kwargs: Model generation options.
@@ -234,6 +237,7 @@ def eval(
         log_images=log_images,
         log_buffer=log_buffer,
         log_shared=log_shared,
+        log_header_only=log_header_only,
         score=score,
         score_display=score_display,
         **kwargs,
@@ -288,6 +292,7 @@ async def eval_async(
     log_images: bool | None = None,
     log_buffer: int | None = None,
     log_shared: bool | int | None = None,
+    log_header_only: bool | None = None,
     score: bool = True,
     score_display: bool | None = None,
     **kwargs: Unpack[GenerateConfigArgs],
@@ -344,7 +349,9 @@ async def eval_async(
         log_buffer: Number of samples to buffer before writing log file.
             If not specified, an appropriate default for the format and filesystem is
             chosen (10 for most all cases, 100 for JSON logs on remote filesystems).
-        log_shared: Indicate that the log directory is shared, which results in additional syncing of realtime log data for Inspect View.
+        log_shared: Indicate that the log directory is shared, which results in additional
+            syncing of realtime log data for Inspect View.
+        log_header_only: If `True`, the function should return only log headers rather than full logs with samples (defaults to `False`).
         score: Score output (defaults to True)
         score_display: Show scoring metrics in realtime (defaults to True)
         **kwargs: Model generation options.
@@ -432,6 +439,9 @@ async def eval_async(
     # resolve log_shared
     log_shared = DEFAULT_LOG_SHARED if log_shared is True else log_shared
 
+    # resolve header only
+    log_header_only = log_header_only is True
+
     # validate that --log-shared can't use used with 'json' format
     if log_shared and log_format == JSON_LOG_FORMAT:
         raise PrerequisiteError(
@@ -507,6 +517,7 @@ async def eval_async(
             eval_config=eval_config,
             eval_sandbox=sandbox,
             recorder=recorder,
+            header_only=log_header_only,
             epochs_reducer=epochs_reducer,
             solver=solver,
             tags=tags,
@@ -532,6 +543,7 @@ async def eval_async(
             eval_config=eval_config,
             eval_sandbox=sandbox,
             recorder=recorder,
+            header_only=log_header_only,
             epochs_reducer=epochs_reducer,
             solver=solver,
             tags=tags,
@@ -800,7 +812,7 @@ async def eval_retry_async(
     model_roles = model_roles_config_to_model_roles(eval_log.eval.model_roles)
 
     # collect the rest of the params we need for the eval
-    task_args = eval_log.eval.task_args
+    task_args = eval_log.eval.task_args_passed
     tags = eval_log.eval.tags
     limit = eval_log.eval.config.limit
     sample_id = eval_log.eval.config.sample_id
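The new log_header_only option trades a smaller return value for a follow-up read when samples are needed. A minimal sketch, assuming a hypothetical task file; loading the full log afterwards with read_eval_log is shown for contrast:

from inspect_ai import eval
from inspect_ai.log import read_eval_log

# returns EvalLog headers without samples, keeping memory use low
logs = eval("my_task.py", log_header_only=True)  # hypothetical task file

# load the full log (including samples) later, only if actually needed
full_log = read_eval_log(logs[0].location)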
inspect_ai/_eval/evalset.py CHANGED
@@ -114,7 +114,7 @@ def eval_set(
             (defaults to 30, resulting in waits of 30, 60, 120, 240, etc.). Wait time
             per-retry will in no case by longer than 1 hour.
         retry_connections: Reduce max_connections at this rate with each retry
-            (defaults to 0.5)
+            (defaults to 1.0, which results in no reduction).
         retry_cleanup: Cleanup failed log files after retries
             (defaults to True)
         model: Model(s) for evaluation. If not specified use the value of the INSPECT_EVAL_MODEL
@@ -235,6 +235,7 @@ def eval_set(
         log_images=log_images,
         log_buffer=log_buffer,
         log_shared=log_shared,
+        log_header_only=True,
         score=score,
         **kwargs,
     )
@@ -274,7 +275,7 @@ def eval_set(
     fs.mkdir(log_dir, exist_ok=True)
 
     # resolve some parameters
-    retry_connections = retry_connections or 0.5
+    retry_connections = retry_connections or 1.0
     retry_cleanup = retry_cleanup is not False
     max_connections = starting_max_connections(models, GenerateConfig(**kwargs))
     max_tasks = max_tasks if max_tasks is not None else max(len(models), 4)
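Since the default for retry_connections changes from 0.5 to 1.0 (no reduction), callers who relied on the old back-off behavior now need to pass it explicitly. A minimal sketch with a hypothetical task file and log directory:

from inspect_ai import eval_set

success, logs = eval_set(
    tasks="my_task.py",      # hypothetical task file
    log_dir="logs/my-eval",  # hypothetical log directory
    retry_connections=0.5,   # restore the pre-0.3.104 halving of max_connections per retry
)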
inspect_ai/_eval/registry.py CHANGED
@@ -8,6 +8,7 @@ from inspect_ai._util.error import PrerequisiteError
 from inspect_ai._util.package import get_installed_package_name
 from inspect_ai._util.registry import (
     RegistryInfo,
+    extract_named_params,
     registry_add,
     registry_create,
     registry_info,
@@ -17,7 +18,7 @@ from inspect_ai._util.registry import (
 )
 
 from .task import Task
-from .task.constants import TASK_FILE_ATTR, TASK_RUN_DIR_ATTR
+from .task.constants import TASK_ALL_PARAMS_ATTR, TASK_FILE_ATTR, TASK_RUN_DIR_ATTR
 
 MODEL_PARAM = "model"
 
@@ -133,6 +134,10 @@ def task(*args: Any, name: str | None = None, **attribs: Any) -> Any:
                 **w_kwargs,
             )
 
+            # extract all task parameters including defaults
+            named_params = extract_named_params(task_type, True, *w_args, **w_kwargs)
+            setattr(task_instance, TASK_ALL_PARAMS_ATTR, named_params)
+
             # if its not from an installed package then it is a "local"
             # module import, so set its task file and run dir
             if get_installed_package_name(task_type) is None:
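With extract_named_params, the @task decorator now records every task parameter, including unspecified defaults, on the task instance (TASK_ALL_PARAMS_ATTR); run.py below writes that full set to task_args and keeps only explicitly supplied values in task_args_passed. A minimal sketch of a task whose default would now appear in the log header; the task itself is hypothetical:

from inspect_ai import Task, task
from inspect_ai.dataset import Sample
from inspect_ai.scorer import exact
from inspect_ai.solver import generate

@task
def arithmetic(difficulty: str = "easy") -> Task:  # hypothetical task
    return Task(
        dataset=[Sample(input="What is 2+2?", target="4")],
        solver=generate(),
        scorer=exact(),
    )

# Even when invoked with no arguments, {"difficulty": "easy"} is captured via
# TASK_ALL_PARAMS_ATTR and recorded as task_args in the eval log, while
# task_args_passed records only what the caller explicitly supplied.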
inspect_ai/_eval/run.py CHANGED
@@ -3,6 +3,7 @@ import os
 import sys
 from typing import Any, Awaitable, Callable, Set, cast
 
+from inspect_ai._eval.task.constants import TASK_ALL_PARAMS_ATTR
 from inspect_ai._eval.task.task import Task
 from inspect_ai._util.environ import environ_vars
 from inspect_ai._util.trace import trace_action
@@ -63,6 +64,7 @@ async def eval_run(
     eval_config: EvalConfig,
     eval_sandbox: SandboxEnvironmentType | None,
     recorder: Recorder,
+    header_only: bool,
     epochs_reducer: list[ScoreReducer] | None = None,
     solver: Solver | SolverSpec | None = None,
     tags: list[str] | None = None,
@@ -207,11 +209,15 @@
                 metrics=eval_metrics,
                 sandbox=resolved_task.sandbox,
                 task_attribs=task.attribs,
-                task_args=resolved_task.task_args,
+                task_args=getattr(
+                    task, TASK_ALL_PARAMS_ATTR, resolved_task.task_args
+                ),
+                task_args_passed=resolved_task.task_args,
                 model_args=resolved_task.model.model_args,
                 eval_config=task_eval_config,
                 metadata=((metadata or {}) | (task.metadata or {})) or None,
                 recorder=recorder,
+                header_only=header_only,
             )
             await logger.init()
 
inspect_ai/_eval/task/constants.py CHANGED
@@ -1,2 +1,3 @@
 TASK_FILE_ATTR = "__task_file__"
 TASK_RUN_DIR_ATTR = "__task_run_dir__"
+TASK_ALL_PARAMS_ATTR = "__task_all_params__"