inspect-ai 0.3.57__py3-none-any.whl → 0.3.59__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (161)
  1. inspect_ai/__init__.py +2 -1
  2. inspect_ai/_cli/common.py +7 -3
  3. inspect_ai/_cli/eval.py +17 -2
  4. inspect_ai/_cli/trace.py +21 -2
  5. inspect_ai/_display/core/active.py +4 -3
  6. inspect_ai/_display/core/config.py +3 -3
  7. inspect_ai/_display/core/panel.py +7 -3
  8. inspect_ai/_display/plain/__init__.py +0 -0
  9. inspect_ai/_display/plain/display.py +203 -0
  10. inspect_ai/_display/rich/display.py +4 -9
  11. inspect_ai/_display/textual/app.py +4 -1
  12. inspect_ai/_display/textual/widgets/port_mappings.py +110 -0
  13. inspect_ai/_display/textual/widgets/samples.py +119 -16
  14. inspect_ai/_display/textual/widgets/sandbox.py +37 -0
  15. inspect_ai/_eval/eval.py +32 -20
  16. inspect_ai/_eval/evalset.py +7 -5
  17. inspect_ai/_eval/score.py +1 -0
  18. inspect_ai/_eval/task/__init__.py +2 -2
  19. inspect_ai/_eval/task/images.py +40 -25
  20. inspect_ai/_eval/task/results.py +50 -22
  21. inspect_ai/_eval/task/run.py +180 -124
  22. inspect_ai/_eval/task/sandbox.py +10 -5
  23. inspect_ai/_eval/task/task.py +140 -25
  24. inspect_ai/_util/constants.py +2 -0
  25. inspect_ai/_util/content.py +23 -1
  26. inspect_ai/_util/images.py +20 -17
  27. inspect_ai/_util/kvstore.py +73 -0
  28. inspect_ai/_util/notgiven.py +18 -0
  29. inspect_ai/_util/port_names.py +61 -0
  30. inspect_ai/_util/text.py +23 -0
  31. inspect_ai/_util/thread.py +5 -0
  32. inspect_ai/_view/www/App.css +31 -1
  33. inspect_ai/_view/www/dist/assets/index.css +31 -1
  34. inspect_ai/_view/www/dist/assets/index.js +25375 -1846
  35. inspect_ai/_view/www/log-schema.json +129 -15
  36. inspect_ai/_view/www/package.json +2 -0
  37. inspect_ai/_view/www/src/App.mjs +8 -10
  38. inspect_ai/_view/www/src/Types.mjs +0 -1
  39. inspect_ai/_view/www/src/components/ChatView.mjs +133 -43
  40. inspect_ai/_view/www/src/components/ExpandablePanel.mjs +0 -4
  41. inspect_ai/_view/www/src/components/LargeModal.mjs +19 -20
  42. inspect_ai/_view/www/src/components/MessageBand.mjs +2 -2
  43. inspect_ai/_view/www/src/components/MessageContent.mjs +43 -1
  44. inspect_ai/_view/www/src/components/TabSet.mjs +3 -1
  45. inspect_ai/_view/www/src/components/VirtualList.mjs +266 -84
  46. inspect_ai/_view/www/src/index.js +75 -2
  47. inspect_ai/_view/www/src/navbar/Navbar.mjs +3 -0
  48. inspect_ai/_view/www/src/navbar/SecondaryBar.mjs +18 -9
  49. inspect_ai/_view/www/src/samples/SampleDialog.mjs +5 -1
  50. inspect_ai/_view/www/src/samples/SampleDisplay.mjs +23 -15
  51. inspect_ai/_view/www/src/samples/SampleList.mjs +18 -48
  52. inspect_ai/_view/www/src/samples/SampleTranscript.mjs +8 -3
  53. inspect_ai/_view/www/src/samples/SamplesDescriptor.mjs +29 -13
  54. inspect_ai/_view/www/src/samples/SamplesTab.mjs +4 -1
  55. inspect_ai/_view/www/src/samples/SamplesTools.mjs +8 -8
  56. inspect_ai/_view/www/src/samples/tools/SampleFilter.mjs +712 -89
  57. inspect_ai/_view/www/src/samples/tools/filters.mjs +260 -87
  58. inspect_ai/_view/www/src/samples/transcript/ErrorEventView.mjs +24 -2
  59. inspect_ai/_view/www/src/samples/transcript/EventPanel.mjs +29 -24
  60. inspect_ai/_view/www/src/samples/transcript/EventRow.mjs +1 -1
  61. inspect_ai/_view/www/src/samples/transcript/InfoEventView.mjs +24 -2
  62. inspect_ai/_view/www/src/samples/transcript/InputEventView.mjs +24 -2
  63. inspect_ai/_view/www/src/samples/transcript/ModelEventView.mjs +31 -10
  64. inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.mjs +24 -2
  65. inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.mjs +23 -2
  66. inspect_ai/_view/www/src/samples/transcript/ScoreEventView.mjs +24 -2
  67. inspect_ai/_view/www/src/samples/transcript/StepEventView.mjs +33 -3
  68. inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.mjs +25 -2
  69. inspect_ai/_view/www/src/samples/transcript/ToolEventView.mjs +25 -2
  70. inspect_ai/_view/www/src/samples/transcript/TranscriptView.mjs +193 -11
  71. inspect_ai/_view/www/src/samples/transcript/Types.mjs +10 -0
  72. inspect_ai/_view/www/src/samples/transcript/state/StateEventView.mjs +26 -2
  73. inspect_ai/_view/www/src/types/log.d.ts +62 -27
  74. inspect_ai/_view/www/src/utils/Format.mjs +10 -3
  75. inspect_ai/_view/www/src/utils/Json.mjs +12 -6
  76. inspect_ai/_view/www/src/workspace/WorkSpace.mjs +10 -4
  77. inspect_ai/_view/www/vite.config.js +7 -0
  78. inspect_ai/_view/www/yarn.lock +116 -0
  79. inspect_ai/approval/_human/__init__.py +0 -0
  80. inspect_ai/approval/_human/util.py +2 -2
  81. inspect_ai/approval/_policy.py +12 -6
  82. inspect_ai/dataset/_sources/csv.py +2 -1
  83. inspect_ai/dataset/_sources/json.py +2 -1
  84. inspect_ai/dataset/_sources/util.py +15 -7
  85. inspect_ai/log/_condense.py +11 -1
  86. inspect_ai/log/_log.py +3 -6
  87. inspect_ai/log/_recorders/eval.py +19 -8
  88. inspect_ai/log/_samples.py +26 -5
  89. inspect_ai/log/_transcript.py +32 -2
  90. inspect_ai/model/__init__.py +10 -2
  91. inspect_ai/model/_call_tools.py +59 -12
  92. inspect_ai/model/_chat_message.py +2 -4
  93. inspect_ai/model/_conversation.py +61 -0
  94. inspect_ai/model/_generate_config.py +10 -4
  95. inspect_ai/model/_model.py +117 -18
  96. inspect_ai/model/_model_output.py +7 -2
  97. inspect_ai/model/_providers/anthropic.py +109 -51
  98. inspect_ai/model/_providers/azureai.py +26 -24
  99. inspect_ai/model/_providers/bedrock.py +43 -44
  100. inspect_ai/model/_providers/google.py +121 -58
  101. inspect_ai/model/_providers/groq.py +7 -5
  102. inspect_ai/model/_providers/hf.py +11 -6
  103. inspect_ai/model/_providers/mistral.py +17 -20
  104. inspect_ai/model/_providers/openai.py +32 -21
  105. inspect_ai/model/_providers/openai_o1.py +9 -8
  106. inspect_ai/model/_providers/providers.py +1 -1
  107. inspect_ai/model/_providers/together.py +8 -8
  108. inspect_ai/model/_providers/vertex.py +18 -8
  109. inspect_ai/scorer/__init__.py +13 -2
  110. inspect_ai/scorer/_metrics/__init__.py +2 -2
  111. inspect_ai/scorer/_metrics/std.py +3 -3
  112. inspect_ai/scorer/_reducer/reducer.py +1 -1
  113. inspect_ai/scorer/_scorer.py +2 -2
  114. inspect_ai/solver/__init__.py +2 -5
  115. inspect_ai/solver/_prompt.py +35 -5
  116. inspect_ai/solver/_task_state.py +80 -38
  117. inspect_ai/tool/__init__.py +11 -1
  118. inspect_ai/tool/_tool.py +21 -3
  119. inspect_ai/tool/_tool_call.py +10 -0
  120. inspect_ai/tool/_tool_def.py +16 -5
  121. inspect_ai/tool/_tool_with.py +21 -4
  122. inspect_ai/tool/beta/__init__.py +5 -0
  123. inspect_ai/tool/beta/_computer/__init__.py +3 -0
  124. inspect_ai/tool/beta/_computer/_common.py +133 -0
  125. inspect_ai/tool/beta/_computer/_computer.py +155 -0
  126. inspect_ai/tool/beta/_computer/_computer_split.py +198 -0
  127. inspect_ai/tool/beta/_computer/_resources/Dockerfile +100 -0
  128. inspect_ai/tool/beta/_computer/_resources/README.md +30 -0
  129. inspect_ai/tool/beta/_computer/_resources/entrypoint/entrypoint.sh +18 -0
  130. inspect_ai/tool/beta/_computer/_resources/entrypoint/novnc_startup.sh +20 -0
  131. inspect_ai/tool/beta/_computer/_resources/entrypoint/x11vnc_startup.sh +48 -0
  132. inspect_ai/tool/beta/_computer/_resources/entrypoint/xfce_startup.sh +13 -0
  133. inspect_ai/tool/beta/_computer/_resources/entrypoint/xvfb_startup.sh +48 -0
  134. inspect_ai/tool/beta/_computer/_resources/image_home_dir/Desktop/Firefox Web Browser.desktop +10 -0
  135. inspect_ai/tool/beta/_computer/_resources/image_home_dir/Desktop/Visual Studio Code.desktop +10 -0
  136. inspect_ai/tool/beta/_computer/_resources/image_home_dir/Desktop/XPaint.desktop +10 -0
  137. inspect_ai/tool/beta/_computer/_resources/tool/__init__.py +0 -0
  138. inspect_ai/tool/beta/_computer/_resources/tool/_logger.py +22 -0
  139. inspect_ai/tool/beta/_computer/_resources/tool/_run.py +42 -0
  140. inspect_ai/tool/beta/_computer/_resources/tool/_tool_result.py +33 -0
  141. inspect_ai/tool/beta/_computer/_resources/tool/_x11_client.py +262 -0
  142. inspect_ai/tool/beta/_computer/_resources/tool/computer_tool.py +85 -0
  143. inspect_ai/tool/beta/_computer/_resources/tool/requirements.txt +0 -0
  144. inspect_ai/util/__init__.py +2 -3
  145. inspect_ai/util/{_trace.py → _conversation.py} +3 -17
  146. inspect_ai/util/_display.py +14 -4
  147. inspect_ai/util/_limit.py +26 -0
  148. inspect_ai/util/_sandbox/context.py +12 -13
  149. inspect_ai/util/_sandbox/docker/compose.py +24 -11
  150. inspect_ai/util/_sandbox/docker/docker.py +84 -14
  151. inspect_ai/util/_sandbox/docker/internal.py +3 -1
  152. inspect_ai/util/_sandbox/environment.py +27 -1
  153. inspect_ai/util/_sandbox/local.py +1 -0
  154. {inspect_ai-0.3.57.dist-info → inspect_ai-0.3.59.dist-info}/METADATA +2 -2
  155. {inspect_ai-0.3.57.dist-info → inspect_ai-0.3.59.dist-info}/RECORD +159 -128
  156. inspect_ai/_view/www/src/samples/transcript/TranscriptState.mjs +0 -70
  157. inspect_ai/model/_trace.py +0 -48
  158. {inspect_ai-0.3.57.dist-info → inspect_ai-0.3.59.dist-info}/LICENSE +0 -0
  159. {inspect_ai-0.3.57.dist-info → inspect_ai-0.3.59.dist-info}/WHEEL +0 -0
  160. {inspect_ai-0.3.57.dist-info → inspect_ai-0.3.59.dist-info}/entry_points.txt +0 -0
  161. {inspect_ai-0.3.57.dist-info → inspect_ai-0.3.59.dist-info}/top_level.txt +0 -0
@@ -5,28 +5,28 @@ from rich.console import RenderableType
5
5
  from rich.table import Table
6
6
  from rich.text import Text
7
7
  from textual.app import ComposeResult
8
- from textual.containers import (
9
- Horizontal,
10
- HorizontalGroup,
11
- Vertical,
12
- VerticalGroup,
13
- )
8
+ from textual.containers import Horizontal, HorizontalGroup, Vertical, VerticalGroup
14
9
  from textual.reactive import reactive
15
10
  from textual.widget import Widget
16
11
  from textual.widgets import (
17
12
  Button,
18
13
  Collapsible,
14
+ Link,
19
15
  LoadingIndicator,
20
16
  OptionList,
21
17
  Static,
22
18
  )
23
19
  from textual.widgets.option_list import Option, Separator
24
20
 
21
+ from inspect_ai._display.textual.widgets.port_mappings import get_url
25
22
  from inspect_ai._util.format import format_progress_time
23
+ from inspect_ai._util.port_names import get_service_by_port
26
24
  from inspect_ai._util.registry import registry_unqualified_name
27
25
  from inspect_ai.log._samples import ActiveSample
26
+ from inspect_ai.log._transcript import ToolEvent
28
27
 
29
28
  from .clock import Clock
29
+ from .sandbox import SandboxView
30
30
  from .transcript import TranscriptView
31
31
 
32
32
 
@@ -73,6 +73,7 @@ class SamplesView(Widget):
73
73
 
74
74
  async def set_highlighted_sample(self, highlighted: int | None) -> None:
75
75
  sample_info = self.query_one(SampleInfo)
76
+ sample_vnc = self.query_one(SampleVNC)
76
77
  transcript_view = self.query_one(TranscriptView)
77
78
  sample_toolbar = self.query_one(SampleToolbar)
78
79
  if highlighted is not None:
@@ -82,12 +83,14 @@ class SamplesView(Widget):
82
83
  transcript_view.display = True
83
84
  sample_toolbar.display = True
84
85
  await sample_info.sync_sample(sample)
86
+ await sample_vnc.sync_sample(sample)
85
87
  await transcript_view.sync_sample(sample)
86
88
  await sample_toolbar.sync_sample(sample)
87
89
  return
88
90
 
89
91
  # otherwise hide ui
90
92
  sample_info.display = False
93
+ sample_vnc.display = False
91
94
  transcript_view.display = False
92
95
  sample_toolbar.display = False
93
96
 
@@ -181,10 +184,59 @@ class SamplesList(OptionList):
181
184
  return None
182
185
 
183
186
 
184
- class SampleInfo(Horizontal):
187
+ class SampleVNC(Horizontal):
188
+ DEFAULT_CSS = """
189
+ SampleVNC {
190
+ layout: grid;
191
+ grid-size: 2 1;
192
+ grid-columns: auto 1fr;
193
+ }
194
+ SampleVNC Static {
195
+ color: $secondary;
196
+ }
197
+ SampleVNC Link {
198
+ color: $accent;
199
+ }
200
+ """
201
+
202
+ def __init__(self) -> None:
203
+ super().__init__()
204
+ self._sample: ActiveSample | None = None
205
+
206
+ def compose(self) -> ComposeResult:
207
+ yield Static("VNC: ")
208
+ yield Link("")
209
+
210
+ async def sync_sample(self, sample: ActiveSample) -> None:
211
+ if sample == self._sample:
212
+ return
213
+
214
+ # default to hidden (show if we find a vnc connection)
215
+ self.display = False
216
+
217
+ # is there a vnc connection? if so populate
218
+ for connection in [c for c in sample.sandboxes.values() if c.ports]:
219
+ for port in connection.ports or []:
220
+ service = get_service_by_port(port.container_port, port.protocol)
221
+ if service == "noVNC" and port.mappings:
222
+ host_mappings = port.mappings
223
+ link = self.query_one(Link)
224
+ vnc_url = get_url(host_mappings[0].host_port, service)
225
+ if vnc_url:
226
+ link.text = vnc_url
227
+ link.url = link.text
228
+ self.display = True
229
+ break
230
+
231
+
232
+ class SampleInfo(Vertical):
185
233
  DEFAULT_CSS = """
186
234
  SampleInfo {
187
235
  color: $text-muted;
236
+ layout: grid;
237
+ grid-size: 1 2;
238
+ grid-rows: auto 1;
239
+ grid-gutter: 1;
188
240
  }
189
241
  SampleInfo Collapsible {
190
242
  padding: 0;
@@ -217,11 +269,13 @@ class SampleInfo(Horizontal):
217
269
  def __init__(self) -> None:
218
270
  super().__init__()
219
271
  self._sample: ActiveSample | None = None
272
+ self._sandbox_count: int | None = None
220
273
 
221
274
  def compose(self) -> ComposeResult:
222
275
  with Collapsible(title=""):
223
276
  yield SampleLimits()
224
277
  yield SandboxesView()
278
+ yield SampleVNC()
225
279
 
226
280
  async def sync_sample(self, sample: ActiveSample | None) -> None:
227
281
  if sample is None:
@@ -232,12 +286,14 @@ class SampleInfo(Horizontal):
232
286
  limits = self.query_one(SampleLimits)
233
287
  await limits.sync_sample(sample)
234
288
 
289
+ new_sandbox_count = len(sample.sandboxes)
235
290
  # bail if we've already processed this sample
236
- if self._sample == sample:
291
+ if self._sample == sample and self._sandbox_count == new_sandbox_count:
237
292
  return
238
293
 
239
294
  # set sample
240
295
  self._sample = sample
296
+ self._sandbox_count = new_sandbox_count
241
297
 
242
298
  # update UI
243
299
  self.display = True
@@ -245,6 +301,7 @@ class SampleInfo(Horizontal):
245
301
  self.query_one(Collapsible).title = title
246
302
  sandboxes = self.query_one(SandboxesView)
247
303
  await sandboxes.sync_sample(sample)
304
+ await self.query_one(SampleVNC).sync_sample(sample)
248
305
 
249
306
 
250
307
  class SampleLimits(Widget):
@@ -294,6 +351,9 @@ class SandboxesView(Vertical):
294
351
  background: transparent;
295
352
  height: auto;
296
353
  }
354
+ #sandboxes-list {
355
+ height: auto;
356
+ }
297
357
  SandboxesView Static {
298
358
  background: transparent;
299
359
  }
@@ -311,16 +371,24 @@ class SandboxesView(Vertical):
311
371
 
312
372
  async def sync_sample(self, sample: ActiveSample) -> None:
313
373
  if len(sample.sandboxes) > 0:
374
+ multiple_sandboxes = len(sample.sandboxes) > 1
314
375
  self.display = True
315
376
  sandboxes_caption = cast(Static, self.query_one("#sandboxes-caption"))
316
- sandboxes_caption.update("[bold]sandbox containers:[/bold]")
377
+ sandboxes_caption.update(
378
+ f"[bold]sandbox container{'s' if multiple_sandboxes else ''}:[/bold]"
379
+ )
317
380
 
318
381
  sandboxes_list = self.query_one("#sandboxes-list")
319
382
  await sandboxes_list.remove_children()
383
+
320
384
  await sandboxes_list.mount_all(
321
- [Static(sandbox.command) for sandbox in sample.sandboxes.values()]
385
+ [
386
+ SandboxView(connection, name if multiple_sandboxes else None)
387
+ for name, connection in sample.sandboxes.items()
388
+ ]
322
389
  )
323
- sandboxes_list.mount(
390
+
391
+ await sandboxes_list.mount(
324
392
  Static(
325
393
  "[italic]Hold down Alt (or Option) to select text for copying[/italic]",
326
394
  classes="clipboard-message",
@@ -332,16 +400,29 @@ class SandboxesView(Vertical):
332
400
 
333
401
 
334
402
  class SampleToolbar(Horizontal):
403
+ STATUS_GROUP = "status_group"
404
+ TIMEOUT_TOOL_CALL = "timeout_tool_call"
335
405
  CANCEL_SCORE_OUTPUT = "cancel_score_output"
336
406
  CANCEL_RAISE_ERROR = "cancel_raise_error"
337
407
  PENDING_STATUS = "pending_status"
338
408
  PENDING_CAPTION = "pending_caption"
339
409
 
340
410
  DEFAULT_CSS = f"""
411
+ SampleToolbar {{
412
+ grid-size: 5 1;
413
+ grid-columns: auto auto 1fr auto auto;
414
+ }}
415
+ SampleToolbar #{STATUS_GROUP} {{
416
+ min-width: 20;
417
+ }}
341
418
  SampleToolbar Button {{
342
419
  margin-bottom: 1;
343
420
  margin-right: 2;
344
- min-width: 20;
421
+ min-width: 18;
422
+ }}
423
+ SampleToolbar #{TIMEOUT_TOOL_CALL} {{
424
+ color: $secondary-darken-3;
425
+ min-width: 16;
345
426
  }}
346
427
  SampleToolbar #{CANCEL_SCORE_OUTPUT} {{
347
428
  color: $primary-darken-3;
@@ -356,9 +437,16 @@ class SampleToolbar(Horizontal):
356
437
  self.sample: ActiveSample | None = None
357
438
 
358
439
  def compose(self) -> ComposeResult:
359
- with VerticalGroup(id=self.PENDING_STATUS):
360
- yield Static("Executing...", id=self.PENDING_CAPTION)
361
- yield HorizontalGroup(EventLoadingIndicator(), Clock())
440
+ with HorizontalGroup(id=self.STATUS_GROUP):
441
+ with VerticalGroup(id=self.PENDING_STATUS):
442
+ yield Static("Executing...", id=self.PENDING_CAPTION)
443
+ yield HorizontalGroup(EventLoadingIndicator(), Clock())
444
+ yield Button(
445
+ Text("Timeout Tool"),
446
+ id=self.TIMEOUT_TOOL_CALL,
447
+ tooltip="Cancel the tool call and report a timeout to the model.",
448
+ )
449
+ yield Horizontal()
362
450
  yield Button(
363
451
  Text("Cancel (Score)"),
364
452
  id=self.CANCEL_SCORE_OUTPUT,
@@ -372,12 +460,21 @@ class SampleToolbar(Horizontal):
372
460
 
373
461
  def on_mount(self) -> None:
374
462
  self.query_one("#" + self.PENDING_STATUS).visible = False
463
+ self.query_one("#" + self.TIMEOUT_TOOL_CALL).display = False
375
464
  self.query_one("#" + self.CANCEL_SCORE_OUTPUT).display = False
376
465
  self.query_one("#" + self.CANCEL_RAISE_ERROR).display = False
377
466
 
378
467
  def on_button_pressed(self, event: Button.Pressed) -> None:
379
468
  if self.sample:
380
- if event.button.id == self.CANCEL_SCORE_OUTPUT:
469
+ if event.button.id == self.TIMEOUT_TOOL_CALL:
470
+ last_event = (
471
+ self.sample.transcript.events[-1]
472
+ if self.sample.transcript.events
473
+ else None
474
+ )
475
+ if isinstance(last_event, ToolEvent):
476
+ last_event.cancel()
477
+ elif event.button.id == self.CANCEL_SCORE_OUTPUT:
381
478
  self.sample.interrupt("score")
382
479
  elif event.button.id == self.CANCEL_RAISE_ERROR:
383
480
  self.sample.interrupt("error")
@@ -389,6 +486,7 @@ class SampleToolbar(Horizontal):
389
486
  self.sample = sample
390
487
 
391
488
  pending_status = self.query_one("#" + self.PENDING_STATUS)
489
+ timeout_tool = self.query_one("#" + self.TIMEOUT_TOOL_CALL)
392
490
  clock = self.query_one(Clock)
393
491
  cancel_score_output = cast(
394
492
  Button, self.query_one("#" + self.CANCEL_SCORE_OUTPUT)
@@ -419,14 +517,19 @@ class SampleToolbar(Horizontal):
419
517
  pending_caption.update(
420
518
  Text.from_markup(f"[italic]{pending_caption_text}[/italic]")
421
519
  )
520
+
521
+ timeout_tool.display = isinstance(last_event, ToolEvent)
522
+
422
523
  clock.start(last_event.timestamp.timestamp())
423
524
  else:
424
525
  pending_status.visible = False
526
+ timeout_tool.display = False
425
527
  clock.stop()
426
528
 
427
529
  else:
428
530
  self.display = False
429
531
  pending_status.visible = False
532
+ timeout_tool.display = False
430
533
  clock.stop()
431
534
 
432
535
 
@@ -0,0 +1,37 @@
1
+ from textual.app import ComposeResult
2
+ from textual.containers import Horizontal, Vertical
3
+ from textual.widgets import Static
4
+
5
+ from inspect_ai.util._sandbox.environment import SandboxConnection
6
+
7
+ from .port_mappings import PortMappingsView
8
+
9
+
10
+ class SandboxView(Vertical):
11
+ DEFAULT_CSS = """
12
+ .indent {
13
+ width: 2;
14
+ }
15
+ .no_indent {
16
+ width: 0;
17
+ }
18
+ """
19
+
20
+ def __init__(
21
+ self,
22
+ connection: SandboxConnection,
23
+ name: str | None, # if None, no header or indent
24
+ ) -> None:
25
+ super().__init__()
26
+ self.sandbox_name = name
27
+ self.connection = connection
28
+
29
+ def compose(self) -> ComposeResult:
30
+ if self.sandbox_name:
31
+ yield Static(self.sandbox_name)
32
+ with Horizontal():
33
+ yield Static("", classes="indent" if self.sandbox_name else "no_indent")
34
+ with Vertical():
35
+ yield Static(self.connection.command)
36
+ if self.connection.ports:
37
+ yield PortMappingsView(self.connection.ports)
inspect_ai/_eval/eval.py CHANGED
@@ -7,11 +7,12 @@ from shortuuid import uuid
7
7
  from typing_extensions import Unpack
8
8
 
9
9
  from inspect_ai._cli.util import parse_cli_args
10
- from inspect_ai._display.core.active import display
10
+ from inspect_ai._display.core.active import display as task_display
11
11
  from inspect_ai._util.config import resolve_args
12
12
  from inspect_ai._util.constants import DEFAULT_LOG_FORMAT
13
13
  from inspect_ai._util.error import PrerequisiteError
14
14
  from inspect_ai._util.file import absolute_file_path
15
+ from inspect_ai._util.logger import warn_once
15
16
  from inspect_ai._util.platform import platform_init
16
17
  from inspect_ai._util.registry import registry_lookup
17
18
  from inspect_ai.approval._apply import init_tool_approval
@@ -34,7 +35,7 @@ from inspect_ai.scorer._reducer import reducer_log_names
34
35
  from inspect_ai.solver._chain import chain
35
36
  from inspect_ai.solver._solver import Solver, SolverSpec
36
37
  from inspect_ai.util import SandboxEnvironmentType
37
- from inspect_ai.util._trace import init_trace
38
+ from inspect_ai.util._display import DisplayType, display_type, init_display_type
38
39
 
39
40
  from .context import init_eval_context
40
41
  from .loader import ResolvedTask, resolve_tasks
@@ -55,6 +56,7 @@ def eval(
55
56
  solver: Solver | list[Solver] | SolverSpec | None = None,
56
57
  tags: list[str] | None = None,
57
58
  trace: bool | None = None,
59
+ display: DisplayType | None = None,
58
60
  approval: str | list[ApprovalPolicy] | None = None,
59
61
  log_level: str | None = None,
60
62
  log_level_transcript: str | None = None,
@@ -100,7 +102,8 @@ def eval(
100
102
  solver (Solver | list[Solver] | SolverSpec | None): Alternative solver for task(s).
101
103
  Optional (uses task solver by default).
102
104
  tags (list[str] | None): Tags to associate with this evaluation run.
103
- trace: (bool | None): Trace message interactions with evaluated model to terminal.
105
+ trace (bool | None): Trace message interactions with evaluated model to terminal.
106
+ display (DisplayType | None): Task display type (defaults to 'full').
104
107
  approval: (str | list[ApprovalPolicy] | None): Tool use approval policies.
105
108
  Either a path to an approval policy config file or a list of approval policies.
106
109
  Defaults to no approval policy.
@@ -150,9 +153,11 @@ def eval(
150
153
  platform_init()
151
154
 
152
155
  # resolve eval trace
153
- max_tasks, max_samples = init_eval_trace(trace, max_tasks, max_samples, model)
156
+ max_tasks, max_samples = init_eval_display(
157
+ display, trace, max_tasks, max_samples, model
158
+ )
154
159
 
155
- return display().run_task_app(
160
+ return task_display().run_task_app(
156
161
  main=eval_async(
157
162
  tasks=tasks,
158
163
  model=model,
@@ -163,7 +168,6 @@ def eval(
163
168
  sandbox_cleanup=sandbox_cleanup,
164
169
  solver=solver,
165
170
  tags=tags,
166
- trace=trace,
167
171
  approval=approval,
168
172
  log_level=log_level,
169
173
  log_level_transcript=log_level_transcript,
@@ -201,7 +205,6 @@ async def eval_async(
201
205
  sandbox_cleanup: bool | None = None,
202
206
  solver: Solver | list[Solver] | SolverSpec | None = None,
203
207
  tags: list[str] | None = None,
204
- trace: bool | None = None,
205
208
  approval: str | list[ApprovalPolicy] | ApprovalPolicyConfig | None = None,
206
209
  log_level: str | None = None,
207
210
  log_level_transcript: str | None = None,
@@ -247,7 +250,6 @@ async def eval_async(
247
250
  solver (Solver | list[Solver] | SolverSpec | None): Alternative solver for task(s).
248
251
  Optional (uses task solver by default).
249
252
  tags (list[str] | None): Tags to associate with this evaluation run.
250
- trace: (bool | None): Trace message interactions with evaluated model to terminal.
251
253
  approval: (str | list[ApprovalPolicy] | None): Tool use approval policies.
252
254
  Either a path to an approval policy config file or a list of approval policies.
253
255
  Defaults to no approval policy.
@@ -329,8 +331,8 @@ async def eval_async(
329
331
  log.warning("No inspect tasks were found at the specified paths.")
330
332
  return []
331
333
 
332
- # apply trace mode constraints
333
- if trace:
334
+ # apply conversation display constraints
335
+ if display_type() == "conversation":
334
336
  # single task at a time
335
337
  if max_tasks is not None:
336
338
  max_tasks = 1
@@ -371,7 +373,6 @@ async def eval_async(
371
373
  epochs_reducer=reducer_log_names(epochs_reducer)
372
374
  if epochs_reducer
373
375
  else None,
374
- trace=trace,
375
376
  approval=config_from_approval_policies(approval) if approval else None,
376
377
  fail_on_error=fail_on_error,
377
378
  message_limit=message_limit,
@@ -467,6 +468,7 @@ def eval_retry(
467
468
  max_sandboxes: int | None = None,
468
469
  sandbox_cleanup: bool | None = None,
469
470
  trace: bool | None = None,
471
+ display: DisplayType | None = None,
470
472
  fail_on_error: bool | float | None = None,
471
473
  debug_errors: bool | None = None,
472
474
  log_samples: bool | None = None,
@@ -501,6 +503,7 @@ def eval_retry(
501
503
  sandbox_cleanup (bool | None): Cleanup sandbox environments after task completes
502
504
  (defaults to True)
503
505
  trace (bool | None): Trace message interactions with evaluated model to terminal.
506
+ display (DisplayType | None): Task display type (defaults to 'full').
504
507
  fail_on_error (bool | float | None): `True` to fail on first sample error
505
508
  (default); `False` to never fail on sample errors; Value between 0 and 1
506
509
  to fail if a proportion of total samples fails. Value greater than 1 to fail
@@ -529,9 +532,9 @@ def eval_retry(
529
532
  platform_init()
530
533
 
531
534
  # resolve eval trace
532
- max_tasks, max_samples = init_eval_trace(trace, max_tasks, max_samples)
535
+ max_tasks, max_samples = init_eval_display(display, trace, max_tasks, max_samples)
533
536
 
534
- return display().run_task_app(
537
+ return task_display().run_task_app(
535
538
  main=eval_retry_async(
536
539
  tasks=tasks,
537
540
  log_level=log_level,
@@ -800,9 +803,8 @@ def eval_init(
800
803
 
801
804
  # resolve tasks (set active model to resolve uses of the
802
805
  # 'default' model in tools, solvers, and scorers)
803
- from inspect_ai._display.core.active import display
804
806
 
805
- with display().suspend_task_app():
807
+ with task_display().suspend_task_app():
806
808
  resolved_tasks: list[ResolvedTask] = []
807
809
  for m in models:
808
810
  init_active_model(m, generate_config)
@@ -816,17 +818,27 @@ def eval_init(
816
818
  return models, approval, resolved_tasks
817
819
 
818
820
 
819
- def init_eval_trace(
821
+ def init_eval_display(
822
+ display: DisplayType | None,
820
823
  trace: bool | None,
821
824
  max_tasks: int | None,
822
825
  max_samples: int | None,
823
826
  model: Any = None,
824
827
  ) -> tuple[int | None, int | None]:
825
- # init trace setting
826
- init_trace(trace)
827
-
828
- # adapt task/samples as required
828
+ # propagate any trace value to display_type
829
829
  if trace:
830
+ warn_once(
831
+ log,
832
+ "WARNING: The --trace flag is deprecated (use --display=conversation instead)",
833
+ )
834
+ display = "conversation"
835
+
836
+ # apply default and init
837
+ display = display or display_type()
838
+ init_display_type(display)
839
+
840
+ # adapt task/samples as required if we are in conversation mode
841
+ if display_type() == "conversation":
830
842
  # single task at a time
831
843
  if max_tasks is not None:
832
844
  max_tasks = 1
@@ -33,7 +33,7 @@ from inspect_ai.model import (
33
33
  )
34
34
  from inspect_ai.model._generate_config import GenerateConfig
35
35
  from inspect_ai.solver._solver import Solver, SolverSpec
36
- from inspect_ai.util import SandboxEnvironmentType
36
+ from inspect_ai.util import DisplayType, SandboxEnvironmentType
37
37
 
38
38
  from .eval import eval, eval_init
39
39
  from .loader import ResolvedTask, resolve_task_args
@@ -59,6 +59,7 @@ def eval_set(
59
59
  solver: Solver | list[Solver] | SolverSpec | None = None,
60
60
  tags: list[str] | None = None,
61
61
  trace: bool | None = None,
62
+ display: DisplayType | None = None,
62
63
  approval: str | list[ApprovalPolicy] | None = None,
63
64
  score: bool = True,
64
65
  log_level: str | None = None,
@@ -116,6 +117,7 @@ def eval_set(
116
117
  evaluating task(s). Optional (uses task solver by default).
117
118
  tags (list[str] | None): Tags to associate with this evaluation run.
118
119
  trace: (bool | None): Trace message interactions with evaluated model to terminal.
120
+ display (DisplayType | None): Task display type (defaults to 'full').
119
121
  approval: (str | list[ApprovalPolicy] | None): Tool use approval policies.
120
122
  Either a path to an approval policy config file or a list of approval policies.
121
123
  Defaults to no approval policy.
@@ -180,6 +182,7 @@ def eval_set(
180
182
  solver=solver,
181
183
  tags=tags,
182
184
  trace=trace,
185
+ display=display,
183
186
  approval=approval,
184
187
  log_level=log_level,
185
188
  log_level_transcript=log_level_transcript,
@@ -501,9 +504,6 @@ def latest_completed_task_eval_logs(
501
504
  # take the most recent completed log for each id
502
505
  latest_completed_logs: list[Log] = []
503
506
  for id, id_logs in logs_by_id.items():
504
- # filter on completed
505
- id_logs = [id_log for id_log in id_logs if id_log[1].status != "started"]
506
-
507
507
  # continue if there are no target logs
508
508
  if len(id_logs) == 0:
509
509
  continue
@@ -517,11 +517,13 @@ def latest_completed_task_eval_logs(
517
517
  latest_completed_logs.append(id_logs[0])
518
518
 
519
519
  # remove the rest if requested
520
+ # (don't remove 'started' in case it's needed for post-mortem debugging)
520
521
  if cleanup_older:
521
522
  fs = filesystem(id_logs[0][0].name)
522
523
  for id_log in id_logs[1:]:
523
524
  try:
524
- fs.rm(id_log[0].name)
525
+ if id_log.header.status != "started":
526
+ fs.rm(id_log.info.name)
525
527
  except Exception as ex:
526
528
  logger.warning(f"Error attempt to remove '{id_log[0].name}': {ex}")
527
529
 
inspect_ai/_eval/score.py CHANGED
@@ -85,6 +85,7 @@ async def score_async(
85
85
  sample_id=sample.id,
86
86
  epoch=sample.epoch,
87
87
  input=sample.input,
88
+ target=Target(sample.target),
88
89
  choices=sample.choices,
89
90
  messages=sample.messages,
90
91
  output=sample.output,
@@ -1,4 +1,4 @@
1
- from .task import Task, TaskInfo, PreviousTask, Tasks # noqa: I001, F401
1
+ from .task import Task, TaskInfo, PreviousTask, Tasks, task_with # noqa: I001, F401
2
2
  from .epochs import Epochs
3
3
 
4
- __all__ = ["Epochs", "Task", "TaskInfo", "PreviousTask", "Tasks"]
4
+ __all__ = ["Epochs", "Task", "TaskInfo", "PreviousTask", "Tasks", "task_with"]