inspect-ai 0.3.93__py3-none-any.whl → 0.3.94__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. inspect_ai/_display/textual/widgets/samples.py +3 -3
  2. inspect_ai/_display/textual/widgets/transcript.py +3 -29
  3. inspect_ai/_eval/task/run.py +10 -7
  4. inspect_ai/_util/answer.py +26 -0
  5. inspect_ai/_util/constants.py +0 -1
  6. inspect_ai/_util/local_server.py +51 -21
  7. inspect_ai/_view/www/dist/assets/index.css +14 -13
  8. inspect_ai/_view/www/dist/assets/index.js +400 -84
  9. inspect_ai/_view/www/log-schema.json +375 -0
  10. inspect_ai/_view/www/src/@types/log.d.ts +90 -12
  11. inspect_ai/_view/www/src/app/samples/transcript/SandboxEventView.module.css +2 -1
  12. inspect_ai/_view/www/src/app/samples/transcript/SpanEventView.tsx +174 -0
  13. inspect_ai/_view/www/src/app/samples/transcript/ToolEventView.tsx +8 -8
  14. inspect_ai/_view/www/src/app/samples/transcript/TranscriptView.tsx +12 -2
  15. inspect_ai/_view/www/src/app/samples/transcript/TranscriptVirtualListComponent.module.css +1 -1
  16. inspect_ai/_view/www/src/app/samples/transcript/event/EventPanel.tsx +0 -3
  17. inspect_ai/_view/www/src/app/samples/transcript/transform/fixups.ts +87 -25
  18. inspect_ai/_view/www/src/app/samples/transcript/transform/treeify.ts +229 -17
  19. inspect_ai/_view/www/src/app/samples/transcript/transform/utils.ts +11 -0
  20. inspect_ai/_view/www/src/app/samples/transcript/types.ts +5 -1
  21. inspect_ai/agent/_as_solver.py +3 -1
  22. inspect_ai/agent/_as_tool.py +6 -4
  23. inspect_ai/agent/_handoff.py +5 -1
  24. inspect_ai/agent/_react.py +4 -3
  25. inspect_ai/agent/_run.py +6 -1
  26. inspect_ai/agent/_types.py +9 -0
  27. inspect_ai/dataset/_dataset.py +6 -3
  28. inspect_ai/log/__init__.py +10 -0
  29. inspect_ai/log/_convert.py +4 -9
  30. inspect_ai/log/_samples.py +14 -17
  31. inspect_ai/log/_transcript.py +77 -35
  32. inspect_ai/log/_tree.py +118 -0
  33. inspect_ai/model/_call_tools.py +42 -34
  34. inspect_ai/model/_model.py +45 -40
  35. inspect_ai/model/_providers/hf.py +27 -1
  36. inspect_ai/model/_providers/sglang.py +8 -2
  37. inspect_ai/model/_providers/vllm.py +6 -2
  38. inspect_ai/scorer/_choice.py +1 -2
  39. inspect_ai/solver/_chain.py +1 -1
  40. inspect_ai/solver/_fork.py +1 -1
  41. inspect_ai/solver/_multiple_choice.py +5 -22
  42. inspect_ai/solver/_plan.py +2 -2
  43. inspect_ai/solver/_transcript.py +6 -7
  44. inspect_ai/tool/_mcp/_mcp.py +6 -5
  45. inspect_ai/tool/_tools/_execute.py +4 -1
  46. inspect_ai/util/__init__.py +4 -0
  47. inspect_ai/util/_anyio.py +11 -0
  48. inspect_ai/util/_collect.py +50 -0
  49. inspect_ai/util/_span.py +58 -0
  50. inspect_ai/util/_subtask.py +27 -42
  51. {inspect_ai-0.3.93.dist-info → inspect_ai-0.3.94.dist-info}/METADATA +1 -1
  52. {inspect_ai-0.3.93.dist-info → inspect_ai-0.3.94.dist-info}/RECORD +56 -51
  53. {inspect_ai-0.3.93.dist-info → inspect_ai-0.3.94.dist-info}/WHEEL +1 -1
  54. inspect_ai/_display/core/group.py +0 -79
  55. {inspect_ai-0.3.93.dist-info → inspect_ai-0.3.94.dist-info}/entry_points.txt +0 -0
  56. {inspect_ai-0.3.93.dist-info → inspect_ai-0.3.94.dist-info}/licenses/LICENSE +0 -0
  57. {inspect_ai-0.3.93.dist-info → inspect_ai-0.3.94.dist-info}/top_level.txt +0 -0
@@ -591,10 +591,10 @@ class SampleToolbar(Horizontal):
591
591
  )
592
592
  if isinstance(last_event, ModelEvent):
593
593
  # see if there are retries in play
594
- if sample.retry_count > 0:
595
- suffix = "retry" if sample.retry_count == 1 else "retries"
594
+ if last_event.retries:
595
+ suffix = "retry" if last_event.retries == 1 else "retries"
596
596
  pending_caption_text = (
597
- f"Generating ({sample.retry_count:,} {suffix})..."
597
+ f"Generating ({last_event.retries:,} {suffix})..."
598
598
  )
599
599
  else:
600
600
  pending_caption_text = "Generating..."
@@ -30,7 +30,7 @@ from inspect_ai.log._transcript import (
30
30
  SampleInitEvent,
31
31
  SampleLimitEvent,
32
32
  ScoreEvent,
33
- StepEvent,
33
+ SpanBeginEvent,
34
34
  SubtaskEvent,
35
35
  ToolEvent,
36
36
  )
@@ -211,10 +211,6 @@ def render_tool_event(event: ToolEvent) -> list[EventDisplay]:
211
211
  # render the call
212
212
  content = transcript_tool_call(event)
213
213
 
214
- # render sub-events
215
- if event.events:
216
- content.extend(render_sub_events(event.events))
217
-
218
214
  # render the output
219
215
  if isinstance(event.result, list):
220
216
  result: ToolResult = "\n".join(
@@ -235,23 +231,6 @@ def render_tool_event(event: ToolEvent) -> list[EventDisplay]:
235
231
  return [EventDisplay("tool call", Group(*content))]
236
232
 
237
233
 
238
- def render_step_event(event: StepEvent) -> EventDisplay:
239
- if event.type == "solver":
240
- return render_solver_event(event)
241
- if event.type == "scorer":
242
- return render_scorer_event(event)
243
- else:
244
- return EventDisplay(step_title(event))
245
-
246
-
247
- def render_solver_event(event: StepEvent) -> EventDisplay:
248
- return EventDisplay(step_title(event))
249
-
250
-
251
- def render_scorer_event(event: StepEvent) -> EventDisplay:
252
- return EventDisplay(step_title(event))
253
-
254
-
255
234
  def render_score_event(event: ScoreEvent) -> EventDisplay:
256
235
  table = Table(box=None, show_header=False)
257
236
  table.add_column("", min_width=10, justify="left")
@@ -272,10 +251,6 @@ def render_subtask_event(event: SubtaskEvent) -> list[EventDisplay]:
272
251
  # render header
273
252
  content: list[RenderableType] = [transcript_function(event.name, event.input)]
274
253
 
275
- # render sub-events
276
- if event.events:
277
- content.extend(render_sub_events(event.events))
278
-
279
254
  if event.result:
280
255
  content.append(Text())
281
256
  if isinstance(event.result, str | int | float | bool | None):
@@ -345,8 +320,8 @@ def render_message(message: ChatMessage) -> list[RenderableType]:
345
320
  return content
346
321
 
347
322
 
348
- def step_title(event: StepEvent) -> str:
349
- return f"{event.type or 'step'}: {event.name}"
323
+ def span_title(event: SpanBeginEvent) -> str:
324
+ return f"{event.type or 'span'}: {event.name}"
350
325
 
351
326
 
352
327
  EventRenderer = Callable[[Any], EventDisplay | list[EventDisplay] | None]
@@ -354,7 +329,6 @@ EventRenderer = Callable[[Any], EventDisplay | list[EventDisplay] | None]
354
329
  _renderers: list[tuple[Type[Event], EventRenderer]] = [
355
330
  (SampleInitEvent, render_sample_init_event),
356
331
  (SampleLimitEvent, render_sample_limit_event),
357
- (StepEvent, render_step_event),
358
332
  (ModelEvent, render_model_event),
359
333
  (ToolEvent, render_tool_event),
360
334
  (SubtaskEvent, render_subtask_event),
@@ -24,7 +24,6 @@ from inspect_ai._util._async import tg_collect
24
24
  from inspect_ai._util.constants import (
25
25
  DEFAULT_EPOCHS,
26
26
  DEFAULT_MAX_CONNECTIONS,
27
- SAMPLE_SUBTASK,
28
27
  )
29
28
  from inspect_ai._util.datetime import iso_now
30
29
  from inspect_ai._util.error import exception_message
@@ -65,8 +64,8 @@ from inspect_ai.log._transcript import (
65
64
  SampleInitEvent,
66
65
  SampleLimitEvent,
67
66
  ScoreEvent,
68
- StepEvent,
69
67
  Transcript,
68
+ init_transcript,
70
69
  transcript,
71
70
  )
72
71
  from inspect_ai.model import (
@@ -91,7 +90,8 @@ from inspect_ai.solver._task_state import sample_state, set_sample_state, state_
91
90
  from inspect_ai.util._limit import LimitExceededError
92
91
  from inspect_ai.util._sandbox.context import sandbox_connections
93
92
  from inspect_ai.util._sandbox.environment import SandboxEnvironmentSpec
94
- from inspect_ai.util._subtask import init_subtask
93
+ from inspect_ai.util._span import span
94
+ from inspect_ai.util._store import init_subtask_store
95
95
 
96
96
  from ..context import init_task_context
97
97
  from ..task import Task
@@ -558,7 +558,9 @@ async def task_run_sample(
558
558
  # initialise subtask and scoring context
559
559
  init_sample_model_usage()
560
560
  set_sample_state(state)
561
- sample_transcript: Transcript = init_subtask(SAMPLE_SUBTASK, state.store)
561
+ sample_transcript = Transcript()
562
+ init_transcript(sample_transcript)
563
+ init_subtask_store(state.store)
562
564
  if logger:
563
565
  sample_transcript._subscribe(
564
566
  lambda event: logger.log_sample_event(sample_id, state.epoch, event)
@@ -617,7 +619,8 @@ async def task_run_sample(
617
619
  results: dict[str, SampleScore] = {}
618
620
  try:
619
621
  # begin init
620
- transcript()._event(StepEvent(action="begin", name="init"))
622
+ init_span = span("init", type="init")
623
+ await init_span.__aenter__()
621
624
 
622
625
  # sample init event (remove file bodies as they have content or absolute paths)
623
626
  event_sample = sample.model_copy(
@@ -639,7 +642,7 @@ async def task_run_sample(
639
642
  active.sandboxes = await sandbox_connections()
640
643
 
641
644
  # end init
642
- transcript()._event(StepEvent(action="end", name="init"))
645
+ await init_span.__aexit__(None, None, None)
643
646
 
644
647
  # initialise timeout context manager
645
648
  timeout_cm = (
@@ -742,7 +745,7 @@ async def task_run_sample(
742
745
  scorer_name = unique_scorer_name(
743
746
  scorer, list(results.keys())
744
747
  )
745
- with transcript().step(name=scorer_name, type="scorer"):
748
+ async with span(name=scorer_name, type="scorer"):
746
749
  score_result = (
747
750
  await scorer(state, Target(sample.target))
748
751
  if scorer
@@ -0,0 +1,26 @@
1
+ def answer_character(index: int) -> str:
2
+ r"""
3
+ Helper to go from array index to char, for example:
4
+
5
+ 0 -> 'A', 1 -> 'B', etc
6
+ """
7
+ if index < 26:
8
+ return chr(ord("A") + index)
9
+ else:
10
+ return str(index - 25)
11
+
12
+
13
+ def answer_index(char: str) -> int:
14
+ r"""
15
+ Helper to go from char to array index, for example:
16
+
17
+ 'A' -> 0, 'B' -> 1, etc
18
+ """
19
+ if char.isalpha() or char == "," or char == " ":
20
+ return ord(char.upper()) - ord("A")
21
+ elif char.isnumeric():
22
+ return 25 + int(char)
23
+ else:
24
+ raise ValueError(
25
+ f"Unepxected multiple choice answer: {char} (must be a letter or number)"
26
+ )
@@ -34,7 +34,6 @@ EVAL_LOG_FORMAT = "eval"
34
34
  DEFAULT_DISPLAY = "full"
35
35
  LOG_SCHEMA_VERSION = 2
36
36
  SCORED_SUFFIX = "-scored"
37
- SAMPLE_SUBTASK = "sample"
38
37
  CONSOLE_DISPLAY_WIDTH = 120
39
38
  BASE_64_DATA_REMOVED = "<base64-data-removed>"
40
39
  SANDBOX_SETUP_TIMEOUT = 300
@@ -62,16 +62,24 @@ def release_port(lock_socket: socket.socket) -> None:
62
62
  logger.error(f"Error closing socket: {e}")
63
63
 
64
64
 
65
- def execute_shell_command(command: list[str]) -> subprocess.Popen[str]:
65
+ def execute_shell_command(
66
+ command: list[str], env: Optional[dict[str, str]] = None
67
+ ) -> subprocess.Popen[str]:
66
68
  """
67
69
  Execute a command and return its process handle.
68
70
 
69
71
  Args:
70
72
  command: List of command arguments
73
+ env: Optional environment variables to pass to the subprocess
71
74
 
72
75
  Returns:
73
76
  A subprocess.Popen object representing the running process
74
77
  """
78
+ # Create a process environment by copying current environment and updating with new values
79
+ process_env = os.environ.copy()
80
+ if env:
81
+ process_env.update(env)
82
+
75
83
  # Create a process that redirects output to pipes so we can capture it
76
84
  process = subprocess.Popen(
77
85
  command,
@@ -79,6 +87,7 @@ def execute_shell_command(command: list[str]) -> subprocess.Popen[str]:
79
87
  stdout=subprocess.PIPE,
80
88
  stderr=subprocess.PIPE,
81
89
  bufsize=1, # Line buffered
90
+ env=process_env, # Pass the environment variables
82
91
  )
83
92
 
84
93
  # Set up background thread to read and log stdout
@@ -134,7 +143,10 @@ def kill_process_tree(pid: int) -> None:
134
143
 
135
144
 
136
145
  def launch_server_cmd(
137
- command: list[str], host: str = "0.0.0.0", port: Optional[int] = None
146
+ command: list[str],
147
+ host: str = "0.0.0.0",
148
+ port: Optional[int] = None,
149
+ env: Optional[dict[str, str]] = None,
138
150
  ) -> Tuple[subprocess.Popen[str], int, list[str]]:
139
151
  """
140
152
  Launch a server process with the given base command and return the process, port, and full command.
@@ -143,6 +155,7 @@ def launch_server_cmd(
143
155
  command: Base command to execute
144
156
  host: Host to bind to
145
157
  port: Port to bind to. If None, a free port is reserved.
158
+ env: Optional environment variables to pass to the subprocess
146
159
 
147
160
  Returns:
148
161
  Tuple of (process, port, full_command)
@@ -155,7 +168,7 @@ def launch_server_cmd(
155
168
  full_command = command + ["--port", str(port)]
156
169
  logger.info(f"Launching server on port {port}")
157
170
 
158
- process = execute_shell_command(full_command)
171
+ process = execute_shell_command(full_command, env=env)
159
172
 
160
173
  if lock_socket is not None:
161
174
  process_socket_map[process] = lock_socket
@@ -181,6 +194,7 @@ def wait_for_server(
181
194
  base_url: str,
182
195
  process: subprocess.Popen[str],
183
196
  full_command: Optional[list[str]] = None,
197
+ env: Optional[dict[str, str]] = None,
184
198
  timeout: Optional[int] = None,
185
199
  api_key: Optional[str] = None,
186
200
  ) -> None:
@@ -191,6 +205,7 @@ def wait_for_server(
191
205
  base_url: The base URL of the server
192
206
  process: The subprocess running the server
193
207
  full_command: The full command used to launch the server
208
+ env: The environment variables to use for the request
194
209
  timeout: Maximum time to wait in seconds. None means wait forever.
195
210
  api_key: The API key to use for the request
196
211
  """
@@ -198,7 +213,10 @@ def wait_for_server(
198
213
  start_time = time.time()
199
214
  debug_advice = "Try rerunning with '--log-level debug' to see the full traceback."
200
215
  if full_command:
201
- debug_advice += f" Alternatively, you can run the following launch command manually to see the full traceback:\n\n{' '.join(full_command)}\n\n"
216
+ debug_advice += " Alternatively, you can run the following launch command manually to see the full traceback:\n\n"
217
+ if env:
218
+ debug_advice += " ".join([f"{k}={v}" for k, v in env.items()]) + " "
219
+ debug_advice += " ".join(full_command) + "\n\n"
202
220
 
203
221
  while True:
204
222
  # Check for timeout first
@@ -245,6 +263,7 @@ def start_local_server(
245
263
  server_type: str = "server",
246
264
  timeout: Optional[int] = DEFAULT_TIMEOUT,
247
265
  server_args: Optional[dict[str, Any]] = None,
266
+ env: Optional[dict[str, str]] = None,
248
267
  ) -> Tuple[str, subprocess.Popen[str], int]:
249
268
  """
250
269
  Start a server with the given command and handle potential errors.
@@ -257,6 +276,7 @@ def start_local_server(
257
276
  server_type: Type of server being started (for error messages)
258
277
  timeout: Maximum time to wait for server to become ready
259
278
  server_args: Additional server arguments to pass to the command
279
+ env: Optional environment variables to pass to the subprocess
260
280
  Returns:
261
281
  Tuple of (base_url, process, port)
262
282
 
@@ -266,15 +286,22 @@ def start_local_server(
266
286
  full_command = base_cmd
267
287
  server_process = None
268
288
 
289
+ # Initialize environment variables if not provided
290
+ process_env = {} if env is None else env.copy()
291
+
269
292
  if server_args:
270
293
  for key, value in server_args.items():
271
294
  # Convert Python style args (underscore) to CLI style (dash)
272
295
  cli_key = key.replace("_", "-")
273
- full_command.extend([f"--{cli_key}", str(value)])
296
+ if value == "":
297
+ # If the value is empty, just add the flag
298
+ full_command.extend([f"--{cli_key}"])
299
+ else:
300
+ full_command.extend([f"--{cli_key}", str(value)])
274
301
 
275
302
  try:
276
303
  server_process, found_port, full_command = launch_server_cmd(
277
- full_command, host=host, port=port
304
+ full_command, host=host, port=port, env=process_env
278
305
  )
279
306
  base_url = f"http://localhost:{found_port}/v1"
280
307
  wait_for_server(
@@ -283,6 +310,7 @@ def start_local_server(
283
310
  api_key=api_key,
284
311
  timeout=timeout,
285
312
  full_command=full_command,
313
+ env=process_env,
286
314
  )
287
315
  return base_url, server_process, found_port
288
316
  except Exception as e:
@@ -330,17 +358,18 @@ def merge_env_server_args(
330
358
 
331
359
  def configure_devices(
332
360
  server_args: dict[str, Any], parallel_size_param: str = "tensor_parallel_size"
333
- ) -> dict[str, Any]:
334
- """Configure device settings and return updated server args.
361
+ ) -> tuple[dict[str, Any], dict[str, str]]:
362
+ """Configure device settings and return updated server args and environment variables.
335
363
 
336
364
  Args:
337
365
  server_args: Dictionary of server arguments
338
366
  parallel_size_param: Name of parameter to set with device count if not specified
339
367
 
340
368
  Returns:
341
- Updated server arguments dict
369
+ Tuple of (updated server arguments dict, environment variables dict)
342
370
  """
343
371
  result = server_args.copy()
372
+ env_vars = {}
344
373
 
345
374
  devices = None
346
375
  if "device" in result and "devices" in result:
@@ -350,19 +379,20 @@ def configure_devices(
350
379
  elif "device" in result:
351
380
  devices = result.pop("device")
352
381
 
353
- # Convert device list to comma-separated string if needed
354
- if isinstance(devices, list):
355
- device_str = ",".join(map(str, devices))
356
- else:
357
- device_str = str(devices)
382
+ if devices is not None:
383
+ # Convert device list to comma-separated string if needed
384
+ if isinstance(devices, list):
385
+ device_str = ",".join(map(str, devices))
386
+ else:
387
+ device_str = str(devices)
358
388
 
359
- # Set CUDA_VISIBLE_DEVICES environment variable
360
- os.environ["CUDA_VISIBLE_DEVICES"] = device_str
389
+ # Add to env_vars instead of setting os.environ directly
390
+ env_vars["CUDA_VISIBLE_DEVICES"] = device_str
361
391
 
362
- device_count = len(device_str.split(","))
392
+ device_count = len(device_str.split(","))
363
393
 
364
- # Set parallel size parameter if not explicitly provided
365
- if parallel_size_param not in result:
366
- result[parallel_size_param] = device_count
394
+ # Set parallel size parameter if not explicitly provided
395
+ if parallel_size_param not in result:
396
+ result[parallel_size_param] = device_count
367
397
 
368
- return result
398
+ return result, env_vars
@@ -17342,37 +17342,38 @@ pre[class*="language-"] {
17342
17342
  ._metadata_1a3fk_21 {
17343
17343
  margin: 0.5em 0;
17344
17344
  }
17345
- ._contents_iwnfd_1 {
17345
+ ._contents_1irga_1 {
17346
17346
  margin-top: 0.5em;
17347
17347
  }
17348
17348
 
17349
- ._contents_iwnfd_1 > :last-child {
17349
+ ._contents_1irga_1 > :last-child {
17350
17350
  margin-bottom: 0;
17351
17351
  }
17352
17352
 
17353
- ._twoColumn_iwnfd_9 {
17353
+ ._twoColumn_1irga_9 {
17354
17354
  display: grid;
17355
17355
  grid-template-columns: auto 1fr;
17356
17356
  column-gap: 1.5em;
17357
17357
  }
17358
17358
 
17359
- ._exec_iwnfd_15 {
17360
- margin-top: 0.5em;
17359
+ ._exec_1irga_15 {
17360
+ margin-top: 0;
17361
17361
  }
17362
17362
 
17363
- ._result_iwnfd_19 {
17363
+ ._result_1irga_19 {
17364
17364
  margin-top: 0.5em;
17365
17365
  }
17366
17366
 
17367
- ._fileLabel_iwnfd_23 {
17367
+ ._fileLabel_1irga_23 {
17368
17368
  margin-top: 0;
17369
17369
  margin-bottom: 0;
17370
17370
  }
17371
17371
 
17372
- ._wrapPre_iwnfd_28 {
17372
+ ._wrapPre_1irga_28 {
17373
17373
  white-space: pre-wrap;
17374
17374
  word-wrap: break-word;
17375
17375
  overflow-wrap: break-word;
17376
+ margin-bottom: 0;
17376
17377
  }
17377
17378
  ._explanation_1ww42_1 {
17378
17379
  display: grid;
@@ -20001,20 +20002,20 @@ span.ap-marker-container:hover span.ap-marker {
20001
20002
  padding-top: 0rem;
20002
20003
  margin-top: -8px;
20003
20004
  }
20004
- ._darkenedBg_1sie6_1 {
20005
+ ._darkenedBg_u9na2_1 {
20005
20006
  background-color: var(--bs-light-bg-subtle);
20006
20007
  }
20007
20008
 
20008
- ._normalBg_1sie6_5 {
20009
+ ._normalBg_u9na2_5 {
20009
20010
  background-color: var(--bs-body-bg);
20010
20011
  }
20011
20012
 
20012
- ._node_1sie6_9 {
20013
+ ._node_u9na2_9 {
20013
20014
  padding-top: 0.7rem;
20014
- padding-bottom: 0em;
20015
+ padding-bottom: 1px;
20015
20016
  }
20016
20017
 
20017
- ._attached_1sie6_14 {
20018
+ ._attached_u9na2_14 {
20018
20019
  padding-top: 0rem;
20019
20020
  margin-top: -8px;
20020
20021
  }