inspect-ai 0.3.93__py3-none-any.whl → 0.3.95__py3-none-any.whl

This diff compares the contents of two publicly released versions of this package as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (115)
  1. inspect_ai/_display/textual/widgets/samples.py +3 -3
  2. inspect_ai/_display/textual/widgets/transcript.py +3 -29
  3. inspect_ai/_eval/loader.py +1 -1
  4. inspect_ai/_eval/task/run.py +21 -12
  5. inspect_ai/_util/answer.py +26 -0
  6. inspect_ai/_util/constants.py +0 -1
  7. inspect_ai/_util/exception.py +4 -0
  8. inspect_ai/_util/hash.py +39 -0
  9. inspect_ai/_util/local_server.py +51 -21
  10. inspect_ai/_util/path.py +22 -0
  11. inspect_ai/_util/trace.py +1 -1
  12. inspect_ai/_util/working.py +4 -0
  13. inspect_ai/_view/www/dist/assets/index.css +23 -22
  14. inspect_ai/_view/www/dist/assets/index.js +517 -204
  15. inspect_ai/_view/www/log-schema.json +375 -0
  16. inspect_ai/_view/www/package.json +1 -1
  17. inspect_ai/_view/www/src/@types/log.d.ts +90 -12
  18. inspect_ai/_view/www/src/app/log-view/navbar/SecondaryBar.tsx +2 -2
  19. inspect_ai/_view/www/src/app/log-view/tabs/SamplesTab.tsx +1 -4
  20. inspect_ai/_view/www/src/app/samples/SamplesTools.tsx +3 -13
  21. inspect_ai/_view/www/src/app/samples/sample-tools/SelectScorer.tsx +45 -48
  22. inspect_ai/_view/www/src/app/samples/sample-tools/filters.ts +16 -15
  23. inspect_ai/_view/www/src/app/samples/sample-tools/sample-filter/SampleFilter.tsx +47 -75
  24. inspect_ai/_view/www/src/app/samples/sample-tools/sample-filter/completions.ts +9 -9
  25. inspect_ai/_view/www/src/app/samples/transcript/SandboxEventView.module.css +2 -1
  26. inspect_ai/_view/www/src/app/samples/transcript/SpanEventView.tsx +174 -0
  27. inspect_ai/_view/www/src/app/samples/transcript/ToolEventView.tsx +8 -8
  28. inspect_ai/_view/www/src/app/samples/transcript/TranscriptView.tsx +12 -2
  29. inspect_ai/_view/www/src/app/samples/transcript/TranscriptVirtualListComponent.module.css +1 -1
  30. inspect_ai/_view/www/src/app/samples/transcript/event/EventPanel.tsx +0 -3
  31. inspect_ai/_view/www/src/app/samples/transcript/transform/fixups.ts +87 -25
  32. inspect_ai/_view/www/src/app/samples/transcript/transform/treeify.ts +229 -17
  33. inspect_ai/_view/www/src/app/samples/transcript/transform/utils.ts +11 -0
  34. inspect_ai/_view/www/src/app/samples/transcript/types.ts +5 -1
  35. inspect_ai/_view/www/src/app/types.ts +12 -2
  36. inspect_ai/_view/www/src/components/ExpandablePanel.module.css +1 -1
  37. inspect_ai/_view/www/src/components/ExpandablePanel.tsx +5 -5
  38. inspect_ai/_view/www/src/state/hooks.ts +19 -3
  39. inspect_ai/_view/www/src/state/logSlice.ts +23 -5
  40. inspect_ai/_view/www/yarn.lock +9 -9
  41. inspect_ai/agent/_as_solver.py +3 -1
  42. inspect_ai/agent/_as_tool.py +6 -4
  43. inspect_ai/agent/_bridge/patch.py +1 -3
  44. inspect_ai/agent/_handoff.py +5 -1
  45. inspect_ai/agent/_react.py +4 -3
  46. inspect_ai/agent/_run.py +6 -1
  47. inspect_ai/agent/_types.py +9 -0
  48. inspect_ai/analysis/__init__.py +0 -0
  49. inspect_ai/analysis/beta/__init__.py +57 -0
  50. inspect_ai/analysis/beta/_dataframe/__init__.py +0 -0
  51. inspect_ai/analysis/beta/_dataframe/columns.py +145 -0
  52. inspect_ai/analysis/beta/_dataframe/evals/__init__.py +0 -0
  53. inspect_ai/analysis/beta/_dataframe/evals/columns.py +132 -0
  54. inspect_ai/analysis/beta/_dataframe/evals/extract.py +23 -0
  55. inspect_ai/analysis/beta/_dataframe/evals/table.py +140 -0
  56. inspect_ai/analysis/beta/_dataframe/events/__init__.py +0 -0
  57. inspect_ai/analysis/beta/_dataframe/events/columns.py +37 -0
  58. inspect_ai/analysis/beta/_dataframe/events/table.py +14 -0
  59. inspect_ai/analysis/beta/_dataframe/extract.py +54 -0
  60. inspect_ai/analysis/beta/_dataframe/messages/__init__.py +0 -0
  61. inspect_ai/analysis/beta/_dataframe/messages/columns.py +60 -0
  62. inspect_ai/analysis/beta/_dataframe/messages/extract.py +21 -0
  63. inspect_ai/analysis/beta/_dataframe/messages/table.py +87 -0
  64. inspect_ai/analysis/beta/_dataframe/record.py +377 -0
  65. inspect_ai/analysis/beta/_dataframe/samples/__init__.py +0 -0
  66. inspect_ai/analysis/beta/_dataframe/samples/columns.py +73 -0
  67. inspect_ai/analysis/beta/_dataframe/samples/extract.py +82 -0
  68. inspect_ai/analysis/beta/_dataframe/samples/table.py +329 -0
  69. inspect_ai/analysis/beta/_dataframe/util.py +157 -0
  70. inspect_ai/analysis/beta/_dataframe/validate.py +171 -0
  71. inspect_ai/dataset/_dataset.py +6 -3
  72. inspect_ai/log/__init__.py +10 -0
  73. inspect_ai/log/_convert.py +4 -9
  74. inspect_ai/log/_file.py +1 -1
  75. inspect_ai/log/_log.py +21 -1
  76. inspect_ai/log/_samples.py +14 -17
  77. inspect_ai/log/_transcript.py +77 -35
  78. inspect_ai/log/_tree.py +118 -0
  79. inspect_ai/model/_call_tools.py +44 -35
  80. inspect_ai/model/_model.py +51 -44
  81. inspect_ai/model/_openai_responses.py +17 -18
  82. inspect_ai/model/_providers/anthropic.py +30 -5
  83. inspect_ai/model/_providers/hf.py +27 -1
  84. inspect_ai/model/_providers/providers.py +1 -1
  85. inspect_ai/model/_providers/sglang.py +8 -2
  86. inspect_ai/model/_providers/vllm.py +6 -2
  87. inspect_ai/scorer/_choice.py +1 -2
  88. inspect_ai/solver/_chain.py +1 -1
  89. inspect_ai/solver/_fork.py +1 -1
  90. inspect_ai/solver/_multiple_choice.py +9 -23
  91. inspect_ai/solver/_plan.py +2 -2
  92. inspect_ai/solver/_task_state.py +7 -3
  93. inspect_ai/solver/_transcript.py +6 -7
  94. inspect_ai/tool/_mcp/_context.py +3 -5
  95. inspect_ai/tool/_mcp/_mcp.py +6 -5
  96. inspect_ai/tool/_mcp/server.py +1 -1
  97. inspect_ai/tool/_tools/_execute.py +4 -1
  98. inspect_ai/tool/_tools/_think.py +1 -1
  99. inspect_ai/tool/_tools/_web_search/__init__.py +3 -0
  100. inspect_ai/tool/_tools/{_web_search.py → _web_search/_google.py} +56 -103
  101. inspect_ai/tool/_tools/_web_search/_tavily.py +77 -0
  102. inspect_ai/tool/_tools/_web_search/_web_search.py +85 -0
  103. inspect_ai/util/__init__.py +4 -0
  104. inspect_ai/util/_anyio.py +11 -0
  105. inspect_ai/util/_collect.py +50 -0
  106. inspect_ai/util/_sandbox/events.py +3 -2
  107. inspect_ai/util/_span.py +58 -0
  108. inspect_ai/util/_subtask.py +27 -42
  109. {inspect_ai-0.3.93.dist-info → inspect_ai-0.3.95.dist-info}/METADATA +8 -1
  110. {inspect_ai-0.3.93.dist-info → inspect_ai-0.3.95.dist-info}/RECORD +114 -82
  111. {inspect_ai-0.3.93.dist-info → inspect_ai-0.3.95.dist-info}/WHEEL +1 -1
  112. inspect_ai/_display/core/group.py +0 -79
  113. {inspect_ai-0.3.93.dist-info → inspect_ai-0.3.95.dist-info}/entry_points.txt +0 -0
  114. {inspect_ai-0.3.93.dist-info → inspect_ai-0.3.95.dist-info}/licenses/LICENSE +0 -0
  115. {inspect_ai-0.3.93.dist-info → inspect_ai-0.3.95.dist-info}/top_level.txt +0 -0
inspect_ai/_display/textual/widgets/samples.py CHANGED
@@ -591,10 +591,10 @@ class SampleToolbar(Horizontal):
             )
             if isinstance(last_event, ModelEvent):
                 # see if there are retries in play
-                if sample.retry_count > 0:
-                    suffix = "retry" if sample.retry_count == 1 else "retries"
+                if last_event.retries:
+                    suffix = "retry" if last_event.retries == 1 else "retries"
                     pending_caption_text = (
-                        f"Generating ({sample.retry_count:,} {suffix})..."
+                        f"Generating ({last_event.retries:,} {suffix})..."
                     )
                 else:
                     pending_caption_text = "Generating..."
inspect_ai/_display/textual/widgets/transcript.py CHANGED
@@ -30,7 +30,7 @@ from inspect_ai.log._transcript import (
     SampleInitEvent,
     SampleLimitEvent,
     ScoreEvent,
-    StepEvent,
+    SpanBeginEvent,
     SubtaskEvent,
     ToolEvent,
 )
@@ -211,10 +211,6 @@ def render_tool_event(event: ToolEvent) -> list[EventDisplay]:
     # render the call
     content = transcript_tool_call(event)
 
-    # render sub-events
-    if event.events:
-        content.extend(render_sub_events(event.events))
-
     # render the output
     if isinstance(event.result, list):
         result: ToolResult = "\n".join(
@@ -235,23 +231,6 @@ def render_tool_event(event: ToolEvent) -> list[EventDisplay]:
     return [EventDisplay("tool call", Group(*content))]
 
 
-def render_step_event(event: StepEvent) -> EventDisplay:
-    if event.type == "solver":
-        return render_solver_event(event)
-    if event.type == "scorer":
-        return render_scorer_event(event)
-    else:
-        return EventDisplay(step_title(event))
-
-
-def render_solver_event(event: StepEvent) -> EventDisplay:
-    return EventDisplay(step_title(event))
-
-
-def render_scorer_event(event: StepEvent) -> EventDisplay:
-    return EventDisplay(step_title(event))
-
-
 def render_score_event(event: ScoreEvent) -> EventDisplay:
     table = Table(box=None, show_header=False)
     table.add_column("", min_width=10, justify="left")
@@ -272,10 +251,6 @@ def render_subtask_event(event: SubtaskEvent) -> list[EventDisplay]:
     # render header
     content: list[RenderableType] = [transcript_function(event.name, event.input)]
 
-    # render sub-events
-    if event.events:
-        content.extend(render_sub_events(event.events))
-
     if event.result:
         content.append(Text())
         if isinstance(event.result, str | int | float | bool | None):
@@ -345,8 +320,8 @@ def render_message(message: ChatMessage) -> list[RenderableType]:
     return content
 
 
-def step_title(event: StepEvent) -> str:
-    return f"{event.type or 'step'}: {event.name}"
+def span_title(event: SpanBeginEvent) -> str:
+    return f"{event.type or 'span'}: {event.name}"
 
 
 EventRenderer = Callable[[Any], EventDisplay | list[EventDisplay] | None]
@@ -354,7 +329,6 @@ EventRenderer = Callable[[Any], EventDisplay | list[EventDisplay] | None]
 _renderers: list[tuple[Type[Event], EventRenderer]] = [
     (SampleInitEvent, render_sample_init_event),
     (SampleLimitEvent, render_sample_limit_event),
-    (StepEvent, render_step_event),
     (ModelEvent, render_model_event),
     (ToolEvent, render_tool_event),
     (SubtaskEvent, render_subtask_event),
inspect_ai/_eval/loader.py CHANGED
@@ -428,7 +428,7 @@ def solver_from_spec(spec: SolverSpec) -> Solver:
             return as_solver(agent)
         else:
             raise ValueError(
-                f"Unkonwn solver {solver_name} (not registered as a @solver or @agent)"
+                f"Unknown solver {solver_name} (not registered as a @solver or @agent)"
             )
 
     # we do have a solver file
inspect_ai/_eval/task/run.py CHANGED
@@ -24,10 +24,10 @@ from inspect_ai._util._async import tg_collect
 from inspect_ai._util.constants import (
     DEFAULT_EPOCHS,
     DEFAULT_MAX_CONNECTIONS,
-    SAMPLE_SUBTASK,
 )
 from inspect_ai._util.datetime import iso_now
 from inspect_ai._util.error import exception_message
+from inspect_ai._util.exception import TerminateSampleError
 from inspect_ai._util.hooks import send_telemetry
 from inspect_ai._util.json import to_json_str_safe
 from inspect_ai._util.registry import (
@@ -36,6 +36,7 @@ from inspect_ai._util.registry import (
     registry_unqualified_name,
 )
 from inspect_ai._util.working import (
+    end_sample_working_limit,
     init_sample_working_limit,
     sample_waiting_time,
 )
@@ -65,8 +66,8 @@ from inspect_ai.log._transcript import (
     SampleInitEvent,
     SampleLimitEvent,
     ScoreEvent,
-    StepEvent,
     Transcript,
+    init_transcript,
     transcript,
 )
 from inspect_ai.model import (
@@ -91,7 +92,8 @@ from inspect_ai.solver._task_state import sample_state, set_sample_state, state_
 from inspect_ai.util._limit import LimitExceededError
 from inspect_ai.util._sandbox.context import sandbox_connections
 from inspect_ai.util._sandbox.environment import SandboxEnvironmentSpec
-from inspect_ai.util._subtask import init_subtask
+from inspect_ai.util._span import span
+from inspect_ai.util._store import init_subtask_store
 
 from ..context import init_task_context
 from ..task import Task
@@ -558,7 +560,9 @@ async def task_run_sample(
     # initialise subtask and scoring context
     init_sample_model_usage()
     set_sample_state(state)
-    sample_transcript: Transcript = init_subtask(SAMPLE_SUBTASK, state.store)
+    sample_transcript = Transcript()
+    init_transcript(sample_transcript)
+    init_subtask_store(state.store)
     if logger:
         sample_transcript._subscribe(
             lambda event: logger.log_sample_event(sample_id, state.epoch, event)
@@ -617,7 +621,8 @@ async def task_run_sample(
     results: dict[str, SampleScore] = {}
     try:
         # begin init
-        transcript()._event(StepEvent(action="begin", name="init"))
+        init_span = span("init", type="init")
+        await init_span.__aenter__()
 
         # sample init event (remove file bodies as they have content or absolute paths)
         event_sample = sample.model_copy(
@@ -636,10 +641,11 @@ async def task_run_sample(
         ) = contextlib.nullcontext()
         try:
             # update active sample wth sandboxes now that we are initialised
-            active.sandboxes = await sandbox_connections()
-
-            # end init
-            transcript()._event(StepEvent(action="end", name="init"))
+            # (ensure that we still exit init context in presence of sandbox error)
+            try:
+                active.sandboxes = await sandbox_connections()
+            finally:
+                await init_span.__aexit__(None, None, None)
 
             # initialise timeout context manager
             timeout_cm = (
@@ -671,6 +677,9 @@ async def task_run_sample(
                 # set progress for plan then run it
                 state = await plan(state, generate)
 
+                # disable sample working limit after execution
+                end_sample_working_limit()
+
             except TimeoutError:
                 if time_limit is not None:
                     transcript()._event(
@@ -712,7 +721,7 @@ async def task_run_sample(
                 # handle the cancel exception
                 raise
 
-            except LimitExceededError:
+            except (LimitExceededError, TerminateSampleError):
                 # capture most recent state for scoring
                 state = sample_state() or state
 
@@ -742,7 +751,7 @@ async def task_run_sample(
                     scorer_name = unique_scorer_name(
                         scorer, list(results.keys())
                     )
-                    with transcript().step(name=scorer_name, type="scorer"):
+                    async with span(name=scorer_name, type="scorer"):
                         score_result = (
                             await scorer(state, Target(sample.target))
                             if scorer
@@ -922,7 +931,7 @@ async def log_sample(
         input=sample.input,
         choices=sample.choices,
         target=sample.target,
-        metadata=sample.metadata or {},
+        metadata=state.metadata or {},
         sandbox=sample.sandbox,
         files=list(sample.files.keys()) if sample.files else None,
         setup=sample.setup,
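
Note: the `span()` calls above replace the old `StepEvent(action="begin"/"end")` pairs. A minimal sketch of the new API as an async context manager (assuming the `span` export added to `inspect_ai.util` in this release; the phase name is illustrative):

```python
from inspect_ai.util import span


async def run_phase(state, generate):
    # events emitted inside the block are grouped under a named span
    # in the transcript, replacing explicit begin/end StepEvents
    async with span("my-phase", type="solver"):
        state = await generate(state)
    return state
```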
inspect_ai/_util/answer.py ADDED
@@ -0,0 +1,26 @@
+def answer_character(index: int) -> str:
+    r"""
+    Helper to go from array index to char, for example:
+
+    0 -> 'A', 1 -> 'B', etc
+    """
+    if index < 26:
+        return chr(ord("A") + index)
+    else:
+        return str(index - 25)
+
+
+def answer_index(char: str) -> int:
+    r"""
+    Helper to go from char to array index, for example:
+
+    'A' -> 0, 'B' -> 1, etc
+    """
+    if char.isalpha() or char == "," or char == " ":
+        return ord(char.upper()) - ord("A")
+    elif char.isnumeric():
+        return 25 + int(char)
+    else:
+        raise ValueError(
+            f"Unepxected multiple choice answer: {char} (must be a letter or number)"
+        )
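
A quick round trip through the new helpers, following the logic above: indexes 0-25 map to 'A'-'Z', and index 26 onward falls back to the strings '1', '2', and so on (importing from the private `_util` module, for illustration only):

```python
from inspect_ai._util.answer import answer_character, answer_index

assert answer_character(0) == "A"   # first choice
assert answer_character(25) == "Z"  # last alphabetic choice
assert answer_character(26) == "1"  # indexes past 'Z' fall back to numbers
assert answer_index("b") == 1       # case-insensitive via char.upper()
assert answer_index("1") == 26      # numeric answers map past the alphabet
```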
inspect_ai/_util/constants.py CHANGED
@@ -34,7 +34,6 @@ EVAL_LOG_FORMAT = "eval"
 DEFAULT_DISPLAY = "full"
 LOG_SCHEMA_VERSION = 2
 SCORED_SUFFIX = "-scored"
-SAMPLE_SUBTASK = "sample"
 CONSOLE_DISPLAY_WIDTH = 120
 BASE_64_DATA_REMOVED = "<base64-data-removed>"
 SANDBOX_SETUP_TIMEOUT = 300
inspect_ai/_util/exception.py ADDED
@@ -0,0 +1,4 @@
+class TerminateSampleError(RuntimeError):
+    def __init__(self, reason: str) -> None:
+        self.reason = reason
+        super().__init__(reason)
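
As the run.py hunk above shows, `TerminateSampleError` is now caught alongside `LimitExceededError`, so raising it ends the current sample while still scoring its most recent state. A hypothetical sketch (the guard function is illustrative, not part of the package):

```python
from inspect_ai._util.exception import TerminateSampleError


def check_budget(spent_usd: float, budget_usd: float) -> None:
    # hypothetical guard: terminate this sample (not the whole eval)
    # once a cost budget is exhausted; task_run_sample() catches the
    # error and proceeds to scoring with the latest sample state
    if spent_usd > budget_usd:
        raise TerminateSampleError(
            f"budget exhausted: ${spent_usd:.2f} > ${budget_usd:.2f}"
        )
```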
inspect_ai/_util/hash.py CHANGED
@@ -1,3 +1,5 @@
+import hashlib
+
 import mmh3
 
 
@@ -7,3 +9,40 @@ def mm3_hash(message: str) -> str:
 
     # Convert to unsigned integers and then to hexadecimal
     return f"{h1 & 0xFFFFFFFFFFFFFFFF:016x}{h2 & 0xFFFFFFFFFFFFFFFF:016x}"
+
+
+def base57_id_hash(content: str) -> str:
+    """Generate base67 hash for content.
+
+    Hash the content, truncate to 128 bits, and then further truncate to 93 bits,
+    returning a 22-character Base-57-URL string. Collision probability reaches 50%
+    at approximately 70 trillion items.
+    """
+    digest_size = 16  # 128 bits
+    digest = hashlib.blake2s(content.encode(), digest_size=digest_size).digest()
+
+    # Truncate to ~93 bits (log₂57^22 ≈ 128.3)
+    as_int = int.from_bytes(digest, "big")
+    base57_str = to_base57(as_int)
+    if len(base57_str) > 22:
+        return base57_str[-22:]  # Take last 22 chars if longer
+    else:
+        # This is unlikely with a 128-bit input
+        return base57_str.rjust(22, ALPHABET57[0])
+
+
+# shortuuid uses these 57 characters (excluding similar-looking characters like 0/O, 1/I/l, etc.)
+ALPHABET57 = "23456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz"
+
+
+def to_base57(n: int) -> str:
+    if n == 0:
+        return ALPHABET57[0]
+
+    out = []
+    while n:
+        n, rem = divmod(n, 57)
+        out.append(ALPHABET57[rem])
+
+    # reverse and return
+    return "".join(reversed(out))
inspect_ai/_util/local_server.py CHANGED
@@ -62,16 +62,24 @@ def release_port(lock_socket: socket.socket) -> None:
         logger.error(f"Error closing socket: {e}")
 
 
-def execute_shell_command(command: list[str]) -> subprocess.Popen[str]:
+def execute_shell_command(
+    command: list[str], env: Optional[dict[str, str]] = None
+) -> subprocess.Popen[str]:
     """
     Execute a command and return its process handle.
 
     Args:
         command: List of command arguments
+        env: Optional environment variables to pass to the subprocess
 
     Returns:
         A subprocess.Popen object representing the running process
     """
+    # Create a process environment by copying current environment and updating with new values
+    process_env = os.environ.copy()
+    if env:
+        process_env.update(env)
+
     # Create a process that redirects output to pipes so we can capture it
     process = subprocess.Popen(
         command,
@@ -79,6 +87,7 @@ def execute_shell_command(command: list[str]) -> subprocess.Popen[str]:
         stdout=subprocess.PIPE,
         stderr=subprocess.PIPE,
         bufsize=1,  # Line buffered
+        env=process_env,  # Pass the environment variables
     )
 
     # Set up background thread to read and log stdout
@@ -134,7 +143,10 @@ def kill_process_tree(pid: int) -> None:
 
 
 def launch_server_cmd(
-    command: list[str], host: str = "0.0.0.0", port: Optional[int] = None
+    command: list[str],
+    host: str = "0.0.0.0",
+    port: Optional[int] = None,
+    env: Optional[dict[str, str]] = None,
 ) -> Tuple[subprocess.Popen[str], int, list[str]]:
     """
     Launch a server process with the given base command and return the process, port, and full command.
@@ -143,6 +155,7 @@ def launch_server_cmd(
         command: Base command to execute
         host: Host to bind to
         port: Port to bind to. If None, a free port is reserved.
+        env: Optional environment variables to pass to the subprocess
 
     Returns:
         Tuple of (process, port, full_command)
@@ -155,7 +168,7 @@ def launch_server_cmd(
     full_command = command + ["--port", str(port)]
     logger.info(f"Launching server on port {port}")
 
-    process = execute_shell_command(full_command)
+    process = execute_shell_command(full_command, env=env)
 
     if lock_socket is not None:
         process_socket_map[process] = lock_socket
@@ -181,6 +194,7 @@ def wait_for_server(
     base_url: str,
     process: subprocess.Popen[str],
     full_command: Optional[list[str]] = None,
+    env: Optional[dict[str, str]] = None,
    timeout: Optional[int] = None,
    api_key: Optional[str] = None,
 ) -> None:
@@ -191,6 +205,7 @@ def wait_for_server(
         base_url: The base URL of the server
         process: The subprocess running the server
         full_command: The full command used to launch the server
+        env: The environment variables to use for the request
         timeout: Maximum time to wait in seconds. None means wait forever.
         api_key: The API key to use for the request
     """
@@ -198,7 +213,10 @@ def wait_for_server(
     start_time = time.time()
     debug_advice = "Try rerunning with '--log-level debug' to see the full traceback."
     if full_command:
-        debug_advice += f" Alternatively, you can run the following launch command manually to see the full traceback:\n\n{' '.join(full_command)}\n\n"
+        debug_advice += " Alternatively, you can run the following launch command manually to see the full traceback:\n\n"
+        if env:
+            debug_advice += " ".join([f"{k}={v}" for k, v in env.items()]) + " "
+        debug_advice += " ".join(full_command) + "\n\n"
 
     while True:
         # Check for timeout first
@@ -245,6 +263,7 @@ def start_local_server(
     server_type: str = "server",
     timeout: Optional[int] = DEFAULT_TIMEOUT,
     server_args: Optional[dict[str, Any]] = None,
+    env: Optional[dict[str, str]] = None,
 ) -> Tuple[str, subprocess.Popen[str], int]:
     """
     Start a server with the given command and handle potential errors.
@@ -257,6 +276,7 @@ def start_local_server(
         server_type: Type of server being started (for error messages)
         timeout: Maximum time to wait for server to become ready
         server_args: Additional server arguments to pass to the command
+        env: Optional environment variables to pass to the subprocess
     Returns:
         Tuple of (base_url, process, port)
 
@@ -266,15 +286,22 @@ def start_local_server(
     full_command = base_cmd
     server_process = None
 
+    # Initialize environment variables if not provided
+    process_env = {} if env is None else env.copy()
+
     if server_args:
         for key, value in server_args.items():
             # Convert Python style args (underscore) to CLI style (dash)
             cli_key = key.replace("_", "-")
-            full_command.extend([f"--{cli_key}", str(value)])
+            if value == "":
+                # If the value is empty, just add the flag
+                full_command.extend([f"--{cli_key}"])
+            else:
+                full_command.extend([f"--{cli_key}", str(value)])
 
     try:
         server_process, found_port, full_command = launch_server_cmd(
-            full_command, host=host, port=port
+            full_command, host=host, port=port, env=process_env
        )
        base_url = f"http://localhost:{found_port}/v1"
        wait_for_server(
@@ -283,6 +310,7 @@ def start_local_server(
            api_key=api_key,
            timeout=timeout,
            full_command=full_command,
+           env=process_env,
        )
        return base_url, server_process, found_port
     except Exception as e:
@@ -330,17 +358,18 @@ def merge_env_server_args(
 
 def configure_devices(
     server_args: dict[str, Any], parallel_size_param: str = "tensor_parallel_size"
-) -> dict[str, Any]:
-    """Configure device settings and return updated server args.
+) -> tuple[dict[str, Any], dict[str, str]]:
+    """Configure device settings and return updated server args and environment variables.
 
     Args:
         server_args: Dictionary of server arguments
        parallel_size_param: Name of parameter to set with device count if not specified
 
     Returns:
-        Updated server arguments dict
+        Tuple of (updated server arguments dict, environment variables dict)
     """
     result = server_args.copy()
+    env_vars = {}
 
     devices = None
     if "device" in result and "devices" in result:
@@ -350,19 +379,20 @@ def configure_devices(
     elif "device" in result:
         devices = result.pop("device")
 
-    # Convert device list to comma-separated string if needed
-    if isinstance(devices, list):
-        device_str = ",".join(map(str, devices))
-    else:
-        device_str = str(devices)
+    if devices is not None:
+        # Convert device list to comma-separated string if needed
+        if isinstance(devices, list):
+            device_str = ",".join(map(str, devices))
+        else:
+            device_str = str(devices)
 
-    # Set CUDA_VISIBLE_DEVICES environment variable
-    os.environ["CUDA_VISIBLE_DEVICES"] = device_str
+        # Add to env_vars instead of setting os.environ directly
+        env_vars["CUDA_VISIBLE_DEVICES"] = device_str
 
-    device_count = len(device_str.split(","))
+        device_count = len(device_str.split(","))
 
-    # Set parallel size parameter if not explicitly provided
-    if parallel_size_param not in result:
-        result[parallel_size_param] = device_count
+        # Set parallel size parameter if not explicitly provided
+        if parallel_size_param not in result:
+            result[parallel_size_param] = device_count
 
-    return result
+    return result, env_vars
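
Taken together, these changes thread environment variables from `configure_devices` through to the launched subprocess instead of mutating `os.environ`. A sketch of the intended call pattern (the command, model path, and argument values are illustrative, and the exact `start_local_server` parameter order is inferred from the hunks above):

```python
from inspect_ai._util.local_server import configure_devices, start_local_server

server_args = {"device": [0, 1], "dtype": "bfloat16"}

# devices now come back as env vars rather than being written to os.environ
server_args, env_vars = configure_devices(server_args)
assert env_vars == {"CUDA_VISIBLE_DEVICES": "0,1"}
assert server_args["tensor_parallel_size"] == 2  # inferred from device count

base_url, process, port = start_local_server(
    ["python", "-m", "sglang.launch_server", "--model-path", "my/model"],
    host="0.0.0.0",
    port=None,                # reserve a free port
    server_type="sglang",
    server_args=server_args,  # appended as --dtype bfloat16, etc.
    env=env_vars,             # forwarded to subprocess.Popen
)
```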
inspect_ai/_util/path.py CHANGED
@@ -6,6 +6,10 @@ from copy import deepcopy
 from pathlib import PurePath
 from typing import Any, Iterator, overload
 
+from fsspec.implementations.local import LocalFileSystem  # type: ignore
+
+from inspect_ai._util.file import filesystem
+
 
 @contextmanager
 def add_to_path(p: str) -> Iterator[None]:
@@ -98,6 +102,24 @@ def cwd_relative_path(file: str | None, walk_up: bool = False) -> str | None:
     return None
 
 
+def pretty_path(file: str) -> str:
+    fs = filesystem(file)
+    if fs.is_local():
+        file = LocalFileSystem._strip_protocol(file)
+        return cwd_relative_path(file)
+    else:
+        return file
+
+
+def native_path(file: str) -> str:
+    fs = filesystem(file)
+    if fs.is_local():
+        file = LocalFileSystem._strip_protocol(file)
+        return file
+    else:
+        return file
+
+
 # A slightly modified implementation of task_path.relative(d, walk_up=True)
 # since that wasn't introduced until python 3.12
 def relative_walk(from_path: PurePath, to_path: PurePath) -> str:
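
A brief sketch of the two new helpers: both strip the `file://` protocol for local paths, but `pretty_path` additionally relativizes against the working directory, while remote URLs pass through unchanged (example paths are illustrative):

```python
from inspect_ai._util.path import native_path, pretty_path

# local file URL: protocol stripped; pretty_path is also made cwd-relative
native_path("file:///home/user/project/logs/run.eval")
# -> "/home/user/project/logs/run.eval"
pretty_path("file:///home/user/project/logs/run.eval")
# -> "logs/run.eval" (assuming cwd is /home/user/project)

# remote URL: returned unchanged by both helpers
pretty_path("s3://bucket/logs/run.eval")
# -> "s3://bucket/logs/run.eval"
```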
inspect_ai/_util/trace.py CHANGED
@@ -287,7 +287,7 @@ def rotate_trace_files() -> None:
         rotate_files = list_trace_files()[10:]
         for file in rotate_files:
             file.file.unlink(missing_ok=True)
-    except FileNotFoundError:
+    except (FileNotFoundError, OSError):
         pass
 
 
inspect_ai/_util/working.py CHANGED
@@ -10,6 +10,10 @@ def init_sample_working_limit(start_time: float, working_limit: float | None) ->
     _sample_waiting_time.set(0)
 
 
+def end_sample_working_limit() -> None:
+    _sample_working_limit.set(None)
+
+
 def sample_waiting_time() -> float:
     return _sample_waiting_time.get()
 
inspect_ai/_view/www/dist/assets/index.css CHANGED
@@ -15489,34 +15489,34 @@ pre[class*="language-"] {
   padding: 0.1rem 0.6rem;
   border-radius: var(--bs-border-radius);
 }
-._expandableBordered_1wpxz_1 {
+._expandableBordered_59eal_1 {
   border: solid var(--bs-light-border-subtle) 1px;
 }
 
-._expandableTogglable_1wpxz_5 {
+._expandableTogglable_59eal_5 {
   margin-bottom: 1em;
 }
 
-._expandableContents_1wpxz_9 {
+._expandableContents_59eal_9 {
   font-size: var(--inspect-font-size-base);
 }
 
-._expandableCollapsed_1wpxz_13 {
+._expandableCollapsed_59eal_13 {
   overflow: hidden;
 }
 
-._moreToggle_1wpxz_17 {
+._moreToggle_59eal_17 {
   display: flex;
   margin-top: 0;
   position: relative;
-  height: 8px;
+  height: 18px;
 }
 
-._moreToggle_1wpxz_17._bordered_1wpxz_24 {
+._moreToggle_59eal_17._bordered_59eal_24 {
   border-top: solid var(--bs-light-border-subtle) 1px;
 }
 
-._moreToggleContainer_1wpxz_28 {
+._moreToggleContainer_59eal_28 {
   position: absolute;
   top: -1px;
   right: 0;
@@ -15527,7 +15527,7 @@ pre[class*="language-"] {
   margin-right: 0;
 }
 
-._moreToggleButton_1wpxz_39 {
+._moreToggleButton_59eal_39 {
   font-size: var(--inspect-font-size-smaller);
   border: none;
   padding: 0.1rem 0.5rem;
@@ -17342,37 +17342,38 @@ pre[class*="language-"] {
 ._metadata_1a3fk_21 {
   margin: 0.5em 0;
 }
-._contents_iwnfd_1 {
+._contents_1irga_1 {
   margin-top: 0.5em;
 }
 
-._contents_iwnfd_1 > :last-child {
+._contents_1irga_1 > :last-child {
   margin-bottom: 0;
 }
 
-._twoColumn_iwnfd_9 {
+._twoColumn_1irga_9 {
   display: grid;
   grid-template-columns: auto 1fr;
   column-gap: 1.5em;
 }
 
-._exec_iwnfd_15 {
-  margin-top: 0.5em;
+._exec_1irga_15 {
+  margin-top: 0;
 }
 
-._result_iwnfd_19 {
+._result_1irga_19 {
   margin-top: 0.5em;
 }
 
-._fileLabel_iwnfd_23 {
+._fileLabel_1irga_23 {
   margin-top: 0;
   margin-bottom: 0;
 }
 
-._wrapPre_iwnfd_28 {
+._wrapPre_1irga_28 {
   white-space: pre-wrap;
   word-wrap: break-word;
   overflow-wrap: break-word;
+  margin-bottom: 0;
 }
 ._explanation_1ww42_1 {
   display: grid;
@@ -20001,20 +20002,20 @@ span.ap-marker-container:hover span.ap-marker {
   padding-top: 0rem;
   margin-top: -8px;
 }
-._darkenedBg_1sie6_1 {
+._darkenedBg_u9na2_1 {
   background-color: var(--bs-light-bg-subtle);
 }
 
-._normalBg_1sie6_5 {
+._normalBg_u9na2_5 {
   background-color: var(--bs-body-bg);
 }
 
-._node_1sie6_9 {
+._node_u9na2_9 {
   padding-top: 0.7rem;
-  padding-bottom: 0em;
+  padding-bottom: 1px;
 }
 
-._attached_1sie6_14 {
+._attached_u9na2_14 {
   padding-top: 0rem;
   margin-top: -8px;
 }