inspect-ai 0.3.92__py3-none-any.whl → 0.3.94__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (149) hide show
  1. inspect_ai/_cli/eval.py +27 -0
  2. inspect_ai/_display/textual/widgets/samples.py +3 -3
  3. inspect_ai/_display/textual/widgets/transcript.py +3 -29
  4. inspect_ai/_eval/eval.py +19 -2
  5. inspect_ai/_eval/evalset.py +4 -1
  6. inspect_ai/_eval/run.py +41 -0
  7. inspect_ai/_eval/task/generate.py +38 -44
  8. inspect_ai/_eval/task/log.py +26 -28
  9. inspect_ai/_eval/task/run.py +23 -27
  10. inspect_ai/_util/answer.py +26 -0
  11. inspect_ai/_util/constants.py +0 -1
  12. inspect_ai/_util/local_server.py +398 -0
  13. inspect_ai/_util/working.py +10 -4
  14. inspect_ai/_view/www/dist/assets/index.css +173 -159
  15. inspect_ai/_view/www/dist/assets/index.js +1417 -1142
  16. inspect_ai/_view/www/log-schema.json +379 -3
  17. inspect_ai/_view/www/package.json +1 -1
  18. inspect_ai/_view/www/src/@types/log.d.ts +93 -14
  19. inspect_ai/_view/www/src/app/content/MetaDataGrid.tsx +2 -2
  20. inspect_ai/_view/www/src/app/content/MetaDataView.module.css +1 -1
  21. inspect_ai/_view/www/src/app/content/MetadataGrid.module.css +1 -1
  22. inspect_ai/_view/www/src/app/content/RenderedContent.tsx +1 -1
  23. inspect_ai/_view/www/src/app/log-view/LogView.tsx +11 -0
  24. inspect_ai/_view/www/src/app/log-view/tabs/InfoTab.tsx +2 -9
  25. inspect_ai/_view/www/src/app/log-view/tabs/ModelsTab.tsx +51 -0
  26. inspect_ai/_view/www/src/app/log-view/tabs/TaskTab.module.css +6 -0
  27. inspect_ai/_view/www/src/app/log-view/tabs/TaskTab.tsx +143 -0
  28. inspect_ai/_view/www/src/app/plan/ModelCard.tsx +1 -2
  29. inspect_ai/_view/www/src/app/plan/PlanCard.tsx +29 -7
  30. inspect_ai/_view/www/src/app/plan/PlanDetailView.module.css +1 -1
  31. inspect_ai/_view/www/src/app/plan/PlanDetailView.tsx +1 -198
  32. inspect_ai/_view/www/src/app/samples/descriptor/score/NumericScoreDescriptor.tsx +2 -1
  33. inspect_ai/_view/www/src/app/samples/transcript/SandboxEventView.module.css +2 -1
  34. inspect_ai/_view/www/src/app/samples/transcript/SpanEventView.tsx +174 -0
  35. inspect_ai/_view/www/src/app/samples/transcript/ToolEventView.tsx +8 -8
  36. inspect_ai/_view/www/src/app/samples/transcript/TranscriptView.tsx +12 -2
  37. inspect_ai/_view/www/src/app/samples/transcript/TranscriptVirtualListComponent.module.css +1 -1
  38. inspect_ai/_view/www/src/app/samples/transcript/event/EventPanel.tsx +0 -3
  39. inspect_ai/_view/www/src/app/samples/transcript/transform/fixups.ts +87 -25
  40. inspect_ai/_view/www/src/app/samples/transcript/transform/treeify.ts +229 -17
  41. inspect_ai/_view/www/src/app/samples/transcript/transform/utils.ts +11 -0
  42. inspect_ai/_view/www/src/app/samples/transcript/types.ts +5 -1
  43. inspect_ai/_view/www/src/app/usage/ModelUsagePanel.tsx +3 -2
  44. inspect_ai/_view/www/src/app/usage/TokenTable.module.css +4 -1
  45. inspect_ai/_view/www/src/app/usage/TokenTable.tsx +2 -2
  46. inspect_ai/_view/www/src/app/usage/UsageCard.module.css +8 -3
  47. inspect_ai/_view/www/src/app/usage/UsageCard.tsx +1 -35
  48. inspect_ai/_view/www/src/components/Card.css +0 -1
  49. inspect_ai/_view/www/src/constants.ts +2 -0
  50. inspect_ai/_view/www/src/utils/numeric.ts +17 -0
  51. inspect_ai/agent/_agent.py +3 -3
  52. inspect_ai/agent/_as_solver.py +22 -12
  53. inspect_ai/agent/_as_tool.py +20 -6
  54. inspect_ai/agent/_handoff.py +12 -1
  55. inspect_ai/agent/_react.py +4 -3
  56. inspect_ai/agent/_run.py +16 -3
  57. inspect_ai/agent/_types.py +9 -0
  58. inspect_ai/dataset/_dataset.py +6 -3
  59. inspect_ai/log/__init__.py +14 -0
  60. inspect_ai/log/_convert.py +4 -9
  61. inspect_ai/log/_file.py +56 -0
  62. inspect_ai/log/_log.py +99 -0
  63. inspect_ai/log/_recorders/__init__.py +2 -0
  64. inspect_ai/log/_recorders/buffer/database.py +12 -11
  65. inspect_ai/log/_recorders/buffer/filestore.py +2 -2
  66. inspect_ai/log/_recorders/buffer/types.py +2 -2
  67. inspect_ai/log/_recorders/eval.py +20 -65
  68. inspect_ai/log/_recorders/file.py +28 -6
  69. inspect_ai/log/_recorders/recorder.py +7 -0
  70. inspect_ai/log/_recorders/types.py +1 -23
  71. inspect_ai/log/_samples.py +14 -25
  72. inspect_ai/log/_transcript.py +84 -36
  73. inspect_ai/log/_tree.py +118 -0
  74. inspect_ai/log/_util.py +52 -0
  75. inspect_ai/model/__init__.py +5 -1
  76. inspect_ai/model/_call_tools.py +72 -44
  77. inspect_ai/model/_generate_config.py +14 -8
  78. inspect_ai/model/_model.py +66 -88
  79. inspect_ai/model/_model_output.py +25 -0
  80. inspect_ai/model/_openai.py +2 -0
  81. inspect_ai/model/_providers/anthropic.py +13 -23
  82. inspect_ai/model/_providers/hf.py +27 -1
  83. inspect_ai/model/_providers/openai_o1.py +8 -2
  84. inspect_ai/model/_providers/providers.py +18 -4
  85. inspect_ai/model/_providers/sglang.py +247 -0
  86. inspect_ai/model/_providers/vllm.py +211 -400
  87. inspect_ai/scorer/_choice.py +1 -2
  88. inspect_ai/solver/__init__.py +7 -2
  89. inspect_ai/solver/_basic_agent.py +3 -10
  90. inspect_ai/solver/_chain.py +1 -1
  91. inspect_ai/solver/_fork.py +1 -1
  92. inspect_ai/solver/_multiple_choice.py +5 -22
  93. inspect_ai/solver/_plan.py +2 -2
  94. inspect_ai/solver/_task_state.py +26 -88
  95. inspect_ai/solver/_transcript.py +6 -7
  96. inspect_ai/tool/_json_rpc_helpers.py +45 -17
  97. inspect_ai/tool/_mcp/_mcp.py +8 -5
  98. inspect_ai/tool/_mcp/_sandbox.py +8 -2
  99. inspect_ai/tool/_mcp/server.py +3 -1
  100. inspect_ai/tool/_tool_call.py +4 -1
  101. inspect_ai/tool/_tool_support_helpers.py +51 -12
  102. inspect_ai/tool/_tools/_bash_session.py +190 -68
  103. inspect_ai/tool/_tools/_computer/_computer.py +25 -1
  104. inspect_ai/tool/_tools/_execute.py +4 -1
  105. inspect_ai/tool/_tools/_text_editor.py +4 -3
  106. inspect_ai/tool/_tools/_web_browser/_web_browser.py +10 -3
  107. inspect_ai/util/__init__.py +16 -0
  108. inspect_ai/util/_anyio.py +11 -0
  109. inspect_ai/util/_collect.py +50 -0
  110. inspect_ai/util/_limit.py +393 -0
  111. inspect_ai/util/_limited_conversation.py +57 -0
  112. inspect_ai/util/_span.py +58 -0
  113. inspect_ai/util/_subtask.py +27 -42
  114. {inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/METADATA +1 -1
  115. {inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/RECORD +120 -134
  116. {inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/WHEEL +1 -1
  117. inspect_ai/_display/core/group.py +0 -79
  118. inspect_ai/solver/_limit.py +0 -39
  119. inspect_ai/tool/_tools/_computer/_resources/Dockerfile +0 -102
  120. inspect_ai/tool/_tools/_computer/_resources/README.md +0 -30
  121. inspect_ai/tool/_tools/_computer/_resources/entrypoint/entrypoint.sh +0 -18
  122. inspect_ai/tool/_tools/_computer/_resources/entrypoint/novnc_startup.sh +0 -20
  123. inspect_ai/tool/_tools/_computer/_resources/entrypoint/x11vnc_startup.sh +0 -48
  124. inspect_ai/tool/_tools/_computer/_resources/entrypoint/xfce_startup.sh +0 -13
  125. inspect_ai/tool/_tools/_computer/_resources/entrypoint/xvfb_startup.sh +0 -48
  126. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/Code/User/globalStorage/state.vscdb +0 -0
  127. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/Code/User/settings.json +0 -9
  128. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-panel.xml +0 -61
  129. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-screensaver.xml +0 -10
  130. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfwm4.xml +0 -91
  131. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/Desktop/Firefox Web Browser.desktop +0 -10
  132. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/Desktop/Terminal.desktop +0 -10
  133. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/Desktop/Visual Studio Code.desktop +0 -10
  134. inspect_ai/tool/_tools/_computer/_resources/tool/.pylintrc +0 -8
  135. inspect_ai/tool/_tools/_computer/_resources/tool/.vscode/settings.json +0 -12
  136. inspect_ai/tool/_tools/_computer/_resources/tool/_args.py +0 -78
  137. inspect_ai/tool/_tools/_computer/_resources/tool/_constants.py +0 -22
  138. inspect_ai/tool/_tools/_computer/_resources/tool/_logger.py +0 -22
  139. inspect_ai/tool/_tools/_computer/_resources/tool/_run.py +0 -42
  140. inspect_ai/tool/_tools/_computer/_resources/tool/_tool_result.py +0 -33
  141. inspect_ai/tool/_tools/_computer/_resources/tool/_x11_client.py +0 -341
  142. inspect_ai/tool/_tools/_computer/_resources/tool/computer_tool.py +0 -141
  143. inspect_ai/tool/_tools/_computer/_resources/tool/pyproject.toml +0 -65
  144. inspect_ai/tool/_tools/_computer/_resources/tool/requirements.txt +0 -0
  145. inspect_ai/tool/_tools/_computer/test_args.py +0 -151
  146. /inspect_ai/{tool/_tools/_computer/_resources/tool/__init__.py → _view/www/src/app/log-view/tabs/ModelsTab.module.css} +0 -0
  147. {inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/entry_points.txt +0 -0
  148. {inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/licenses/LICENSE +0 -0
  149. {inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/top_level.txt +0 -0
@@ -24,7 +24,6 @@ from inspect_ai._util._async import tg_collect
24
24
  from inspect_ai._util.constants import (
25
25
  DEFAULT_EPOCHS,
26
26
  DEFAULT_MAX_CONNECTIONS,
27
- SAMPLE_SUBTASK,
28
27
  )
29
28
  from inspect_ai._util.datetime import iso_now
30
29
  from inspect_ai._util.error import exception_message
@@ -51,8 +50,12 @@ from inspect_ai.log import (
51
50
  )
52
51
  from inspect_ai.log._condense import condense_sample
53
52
  from inspect_ai.log._file import eval_log_json_str
54
- from inspect_ai.log._log import EvalSampleLimit, EvalSampleReductions, eval_error
55
- from inspect_ai.log._recorders.types import SampleSummary
53
+ from inspect_ai.log._log import (
54
+ EvalSampleLimit,
55
+ EvalSampleReductions,
56
+ EvalSampleSummary,
57
+ eval_error,
58
+ )
56
59
  from inspect_ai.log._samples import (
57
60
  active_sample,
58
61
  )
@@ -61,8 +64,8 @@ from inspect_ai.log._transcript import (
61
64
  SampleInitEvent,
62
65
  SampleLimitEvent,
63
66
  ScoreEvent,
64
- StepEvent,
65
67
  Transcript,
68
+ init_transcript,
66
69
  transcript,
67
70
  )
68
71
  from inspect_ai.model import (
@@ -82,12 +85,13 @@ from inspect_ai.scorer._scorer import unique_scorer_name
82
85
  from inspect_ai.solver import Generate, Plan, TaskState
83
86
  from inspect_ai.solver._chain import Chain, unroll
84
87
  from inspect_ai.solver._fork import set_task_generate
85
- from inspect_ai.solver._limit import SampleLimitExceededError
86
88
  from inspect_ai.solver._solver import Solver
87
89
  from inspect_ai.solver._task_state import sample_state, set_sample_state, state_jsonable
90
+ from inspect_ai.util._limit import LimitExceededError
88
91
  from inspect_ai.util._sandbox.context import sandbox_connections
89
92
  from inspect_ai.util._sandbox.environment import SandboxEnvironmentSpec
90
- from inspect_ai.util._subtask import init_subtask
93
+ from inspect_ai.util._span import span
94
+ from inspect_ai.util._store import init_subtask_store
91
95
 
92
96
  from ..context import init_task_context
93
97
  from ..task import Task
@@ -554,7 +558,9 @@ async def task_run_sample(
554
558
  # initialise subtask and scoring context
555
559
  init_sample_model_usage()
556
560
  set_sample_state(state)
557
- sample_transcript: Transcript = init_subtask(SAMPLE_SUBTASK, state.store)
561
+ sample_transcript = Transcript()
562
+ init_transcript(sample_transcript)
563
+ init_subtask_store(state.store)
558
564
  if logger:
559
565
  sample_transcript._subscribe(
560
566
  lambda event: logger.log_sample_event(sample_id, state.epoch, event)
@@ -613,7 +619,8 @@ async def task_run_sample(
613
619
  results: dict[str, SampleScore] = {}
614
620
  try:
615
621
  # begin init
616
- transcript()._event(StepEvent(action="begin", name="init"))
622
+ init_span = span("init", type="init")
623
+ await init_span.__aenter__()
617
624
 
618
625
  # sample init event (remove file bodies as they have content or absolute paths)
619
626
  event_sample = sample.model_copy(
@@ -635,7 +642,7 @@ async def task_run_sample(
635
642
  active.sandboxes = await sandbox_connections()
636
643
 
637
644
  # end init
638
- transcript()._event(StepEvent(action="end", name="init"))
645
+ await init_span.__aexit__(None, None, None)
639
646
 
640
647
  # initialise timeout context manager
641
648
  timeout_cm = (
@@ -649,17 +656,18 @@ async def task_run_sample(
649
656
  init_sample_working_limit(start_time, working_limit)
650
657
 
651
658
  # run sample w/ optional timeout
652
- with timeout_cm:
659
+ with timeout_cm, state._token_limit, state._message_limit:
653
660
  # mark started
654
661
  active.started = datetime.now().timestamp()
655
662
 
656
663
  if logger is not None:
657
664
  await logger.start_sample(
658
- SampleSummary(
665
+ EvalSampleSummary(
659
666
  id=sample_id,
660
667
  epoch=state.epoch,
661
668
  input=sample.input,
662
669
  target=sample.target,
670
+ metadata=sample.metadata or {},
663
671
  )
664
672
  )
665
673
 
@@ -707,18 +715,9 @@ async def task_run_sample(
707
715
  # handle the cancel exception
708
716
  raise
709
717
 
710
- except SampleLimitExceededError as ex:
711
- # sample limit event
712
- transcript()._event(
713
- SampleLimitEvent(
714
- type=ex.type,
715
- limit=ex.limit,
716
- message=f"Sample completed: {ex.message}",
717
- )
718
- )
719
-
718
+ except LimitExceededError:
720
719
  # capture most recent state for scoring
721
- state = ex.state or sample_state() or state
720
+ state = sample_state() or state
722
721
 
723
722
  except BaseException as ex:
724
723
  error, raise_error = handle_error(ex)
@@ -735,9 +734,6 @@ async def task_run_sample(
735
734
  if time_limit is not None:
736
735
  timeout_cm = anyio.fail_after(time_limit / 2)
737
736
 
738
- # turn off message and token limits
739
- state.message_limit = None
740
- state.token_limit = None
741
737
  set_sample_state(state)
742
738
 
743
739
  # scoring
@@ -749,7 +745,7 @@ async def task_run_sample(
749
745
  scorer_name = unique_scorer_name(
750
746
  scorer, list(results.keys())
751
747
  )
752
- with transcript().step(name=scorer_name, type="scorer"):
748
+ async with span(name=scorer_name, type="scorer"):
753
749
  score_result = (
754
750
  await scorer(state, Target(sample.target))
755
751
  if scorer
@@ -929,7 +925,7 @@ async def log_sample(
929
925
  input=sample.input,
930
926
  choices=sample.choices,
931
927
  target=sample.target,
932
- metadata=state.metadata if state.metadata else {},
928
+ metadata=sample.metadata or {},
933
929
  sandbox=sample.sandbox,
934
930
  files=list(sample.files.keys()) if sample.files else None,
935
931
  setup=sample.setup,
@@ -0,0 +1,26 @@
1
def answer_character(index: int) -> str:
    r"""
    Map a zero-based choice index to its answer label.

    Indexes 0..25 become the letters 'A'..'Z'; once the alphabet is
    exhausted the labels continue numerically (26 -> '1', 27 -> '2', etc).
    """
    letter_count = 26
    if index >= letter_count:
        # out of letters: keep counting with "1", "2", ...
        return str(index - (letter_count - 1))
    return chr(index + ord("A"))
11
+
12
+
13
def answer_index(char: str) -> int:
    r"""
    Map an answer label back to its zero-based choice index.

    Letters are matched case-insensitively ('A'/'a' -> 0, 'B' -> 1, etc)
    and numeric labels continue after 'Z' ('1' -> 26, '2' -> 27, etc).

    Args:
        char: The answer label to convert.

    Returns:
        The array index corresponding to the label.

    Raises:
        ValueError: If `char` is neither a letter nor a number.
    """
    # NOTE(review): "," and " " are accepted on this branch and produce
    # negative indexes (-21 and -33) -- presumably separators from answer
    # lists; confirm with callers before relying on it.
    if char.isalpha() or char == "," or char == " ":
        return ord(char.upper()) - ord("A")
    elif char.isnumeric():
        return 25 + int(char)
    else:
        raise ValueError(
            f"Unexpected multiple choice answer: {char} (must be a letter or number)"
        )
@@ -34,7 +34,6 @@ EVAL_LOG_FORMAT = "eval"
34
34
  DEFAULT_DISPLAY = "full"
35
35
  LOG_SCHEMA_VERSION = 2
36
36
  SCORED_SUFFIX = "-scored"
37
- SAMPLE_SUBTASK = "sample"
38
37
  CONSOLE_DISPLAY_WIDTH = 120
39
38
  BASE_64_DATA_REMOVED = "<base64-data-removed>"
40
39
  SANDBOX_SETUP_TIMEOUT = 300
@@ -0,0 +1,398 @@
1
+ import json
2
+ import logging
3
+ import os
4
+ import random
5
+ import socket
6
+ import subprocess
7
+ import time
8
+ from typing import Any, Dict, Optional, Tuple
9
+
10
+ import httpx
11
+
12
# Set up logger for this module
logger = logging.getLogger(__name__)

# Global dictionary to keep track of process -> reserved port mappings.
# Entries are added by launch_server_cmd (when it reserved the port itself)
# and removed again by terminate_process.
process_socket_map: dict[subprocess.Popen[str], socket.socket] = {}


# Default number of seconds start_local_server waits for a server to be ready
DEFAULT_TIMEOUT = 60 * 10  # fairly conservative default timeout of 10 minutes
20
+
21
+
22
def reserve_port(
    host: str, start: int = 30000, end: int = 40000
) -> Tuple[int, socket.socket]:
    """
    Find a bindable port and keep it reserved with an open socket.

    Ports in ``range(start, end)`` are tried in random order until one binds.

    Args:
        host: Address to bind the reservation socket to
        start: First port number to consider
        end: Upper bound of the port range (exclusive)

    Returns:
        A tuple (port, lock_socket). The socket is left open and bound so
        the port stays held; close it to give the port up.

    Raises:
        RuntimeError: If no port in the range could be bound.
    """
    ports = list(range(start, end))
    random.shuffle(ports)

    for candidate in ports:
        holder = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        holder.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        try:
            holder.bind((host, candidate))
        except socket.error:
            # could not bind this one; discard the socket and move on
            holder.close()
        else:
            return candidate, holder

    raise RuntimeError("No free port available.")
50
+
51
+
52
def release_port(lock_socket: socket.socket) -> None:
    """
    Give up a reserved port by closing the socket that holds it.

    Failures while closing are logged at error level and not propagated.

    Args:
        lock_socket: The reservation socket to close
    """
    try:
        lock_socket.close()
    except Exception as close_error:
        logger.error(f"Error closing socket: {close_error}")
63
+
64
+
65
def execute_shell_command(
    command: list[str], env: Optional[dict[str, str]] = None
) -> subprocess.Popen[str]:
    """
    Spawn `command` as a subprocess and forward its output to the logger.

    The child's stdout is logged line by line at debug level and its stderr
    at info level, each from its own daemon thread.

    Args:
        command: List of command arguments
        env: Optional environment variables layered on top of the current
            process environment for the child

    Returns:
        The subprocess.Popen handle of the started process
    """
    import threading

    # child sees the parent's environment with `env` overriding/adding keys
    child_env = os.environ.copy()
    if env:
        child_env.update(env)

    # pipes (text mode, line buffered) so the output can be captured below
    process = subprocess.Popen(
        command,
        text=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        bufsize=1,
        env=child_env,
    )

    def forward(stream_name: str, emit: Any) -> None:
        # drain one pipe until EOF, logging each line, then close it
        stream = getattr(process, stream_name)
        if stream is None:
            return
        for line in iter(stream.readline, ""):
            if line:
                emit(line.strip())
        stream.close()

    for stream_name, emit in (("stdout", logger.debug), ("stderr", logger.info)):
        threading.Thread(
            target=forward, args=(stream_name, emit), daemon=True
        ).start()

    logger.info(f"Started server with command: {' '.join(command)}")
    return process
119
+
120
+
121
def kill_process_tree(pid: int) -> None:
    """
    Stop a process and its child processes, escalating TERM -> KILL.

    SIGTERM is sent first; after a one second grace period SIGKILL follows
    if the process can still be signalled. Any unexpected failure is logged
    rather than raised.

    Args:
        pid: Process ID to kill
    """
    target = str(pid)

    def signal_tree(sig: str) -> None:
        # children (matched by parent pid) first, then the process itself
        subprocess.run(["pkill", sig, "-P", target], check=False)
        subprocess.run(["kill", sig, target], check=False)

    try:
        signal_tree("-TERM")
        time.sleep(1)

        try:
            os.kill(pid, 0)  # signal 0 only probes whether the pid exists
            signal_tree("-KILL")
        except OSError:
            pass  # nothing left to kill
    except Exception as kill_error:
        logger.error(f"Error killing process tree: {kill_error}")
143
+
144
+
145
def launch_server_cmd(
    command: list[str],
    host: str = "0.0.0.0",
    port: Optional[int] = None,
    env: Optional[dict[str, str]] = None,
) -> Tuple[subprocess.Popen[str], int, list[str]]:
    """
    Launch a server process with the given base command and return the process, port, and full command.

    When no port is given, one is reserved via `reserve_port` and the
    reservation socket is recorded in `process_socket_map` so that
    `terminate_process` can release it later.

    Args:
        command: Base command to execute (not modified; ``--port <port>`` is
            appended to a copy)
        host: Host to bind to
        port: Port to bind to. If None, a free port is reserved.
        env: Optional environment variables to pass to the subprocess

    Returns:
        Tuple of (process, port, full_command)

    Raises:
        RuntimeError: If a port had to be reserved and none was available.
    """
    lock_socket: Optional[socket.socket] = None
    if port is None:
        port, lock_socket = reserve_port(host)

    full_command = command + ["--port", str(port)]
    logger.info(f"Launching server on port {port}")

    try:
        process = execute_shell_command(full_command, env=env)
    except BaseException:
        # The launch failed, so no process will ever be handed to
        # terminate_process for this reservation: release the port here
        # instead of leaking the socket.
        if lock_socket is not None:
            lock_socket.close()
        raise

    if lock_socket is not None:
        process_socket_map[process] = lock_socket

    return process, port, full_command
177
+
178
+
179
def terminate_process(process: subprocess.Popen[str]) -> None:
    """
    Kill the process tree and release any port reserved for it.

    Args:
        process: The process to terminate
    """
    kill_process_tree(process.pid)

    # only processes launched on an auto-reserved port have an entry
    reserved = process_socket_map.pop(process, None)
    if reserved is None:
        return
    release_port(reserved)
191
+
192
+
193
def wait_for_server(
    base_url: str,
    process: subprocess.Popen[str],
    full_command: Optional[list[str]] = None,
    env: Optional[dict[str, str]] = None,
    timeout: Optional[int] = None,
    api_key: Optional[str] = None,
) -> None:
    """
    Block until the server answers ``GET {base_url}/v1/models`` with HTTP 200.

    The endpoint is polled once per second. Non-200 responses and request
    errors are logged at debug level and retried.

    Args:
        base_url: The base URL of the server
        process: The subprocess running the server
        full_command: The full command used to launch the server (only used
            to build the troubleshooting hint in error messages)
        env: Environment variables shown in front of the command in that hint
        timeout: Maximum time to wait in seconds. None means wait forever.
        api_key: The API key sent as a bearer token with each poll

    Raises:
        TimeoutError: If the server is not ready within `timeout` seconds.
        RuntimeError: If the server process exits before becoming ready.
    """
    logger.info(f"Waiting for server at {base_url} to become ready...")
    start_time = time.time()

    # troubleshooting hint appended to every error raised below
    advice_parts = ["Try rerunning with '--log-level debug' to see the full traceback."]
    if full_command:
        advice_parts.append(
            " Alternatively, you can run the following launch command manually to see the full traceback:\n\n"
        )
        if env:
            advice_parts.append(" ".join([f"{k}={v}" for k, v in env.items()]) + " ")
        advice_parts.append(" ".join(full_command) + "\n\n")
    debug_advice = "".join(advice_parts)

    models_url = f"{base_url}/v1/models"
    auth_headers = {"Authorization": f"Bearer {api_key or 'None'}"}

    while True:
        # give up once the overall deadline has passed
        if timeout and time.time() - start_time > timeout:
            error_msg = f"Server did not become ready within timeout period ({timeout} seconds). Try increasing the timeout with '-M timeout=...'. {debug_advice}"
            logger.error(error_msg)
            raise TimeoutError(error_msg)

        # a server that already exited will never become ready
        exit_code = process.poll()
        if exit_code is not None:
            error_msg = f"Server process exited unexpectedly with code {exit_code}. {debug_advice}"
            logger.error(error_msg)
            raise RuntimeError(error_msg)

        try:
            response = httpx.get(
                models_url,
                headers=auth_headers,
                timeout=5.0,  # short timeout for each individual probe
            )
        except httpx.RequestError as request_error:
            # e.g. connection refused while the server is still starting
            logger.debug(f"Server check failed: {request_error}, retrying...")
        else:
            if response.status_code == 200:
                logger.info("Server is ready.")
                return
            logger.debug(
                f"Server check returned status {response.status_code}, retrying..."
            )

        # pause before the next probe
        time.sleep(1)
256
+
257
+
258
def start_local_server(
    base_cmd: list[str],
    host: str,
    port: Optional[int] = None,
    api_key: Optional[str] = None,
    server_type: str = "server",
    timeout: Optional[int] = DEFAULT_TIMEOUT,
    server_args: Optional[dict[str, Any]] = None,
    env: Optional[dict[str, str]] = None,
) -> Tuple[str, subprocess.Popen[str], int]:
    """
    Start a server with the given command and handle potential errors.

    Args:
        base_cmd: List of base command arguments (left unmodified)
        host: Host to bind to
        port: Port to bind to. If None, a free port is reserved.
        api_key: API key to use for server authentication
        server_type: Type of server being started (for error messages)
        timeout: Maximum time to wait for server to become ready
        server_args: Additional server arguments to pass to the command.
            Keys are converted from ``snake_case`` to ``--kebab-case``; an
            empty-string value emits the bare flag.
        env: Optional environment variables to pass to the subprocess

    Returns:
        Tuple of (base_url, process, port)

    Raises:
        RuntimeError: If server fails to start
    """
    # Work on a copy: extending `base_cmd` in place would leak the added
    # flags into the caller's list (and into any later call reusing it).
    full_command = list(base_cmd)
    server_process = None

    # Initialize environment variables if not provided
    process_env = {} if env is None else env.copy()

    if server_args:
        for key, value in server_args.items():
            # Convert Python style args (underscore) to CLI style (dash)
            cli_key = key.replace("_", "-")
            if value == "":
                # If the value is empty, just add the flag
                full_command.append(f"--{cli_key}")
            else:
                full_command.extend([f"--{cli_key}", str(value)])

    try:
        server_process, found_port, full_command = launch_server_cmd(
            full_command, host=host, port=port, env=process_env
        )
        base_url = f"http://localhost:{found_port}/v1"
        # readiness is probed on the root URL; wait_for_server appends /v1/models
        wait_for_server(
            f"http://localhost:{found_port}",
            server_process,
            api_key=api_key,
            timeout=timeout,
            full_command=full_command,
            env=process_env,
        )
        return base_url, server_process, found_port
    except Exception as e:
        # Cleanup any partially started server
        if server_process:
            terminate_process(server_process)

        # Re-raise with more context
        raise RuntimeError(f"Failed to start {server_type} server: {str(e)}") from e
323
+
324
+
325
def merge_env_server_args(
    env_var_name: str,
    provided_args: Dict[str, Any],
    logger: logging.Logger,
) -> Dict[str, Any]:
    """
    Load server arguments from an environment variable and merge them with provided arguments.

    The variable is expected to hold a JSON object. A value that is not
    valid JSON, or that is valid JSON but not an object, is ignored with a
    warning rather than raising.

    Args:
        env_var_name: Name of the environment variable containing JSON server args
        provided_args: Dictionary of server arguments provided by the user
        logger: Logger instance to log messages

    Returns:
        Dictionary of merged server arguments, with provided args taking precedence
    """
    env_server_args: Dict[str, Any] = {}
    server_args_json = os.environ.get(env_var_name)

    if server_args_json:
        try:
            parsed = json.loads(server_args_json)
        except json.JSONDecodeError:
            logger.warning(
                f"Failed to parse {env_var_name} as JSON: {server_args_json}"
            )
        else:
            if isinstance(parsed, dict):
                env_server_args = parsed
                logger.info(
                    f"Loaded server args from environment {env_var_name}: {env_server_args}"
                )
            else:
                # e.g. a JSON list or string: unpacking it with ** below
                # would raise TypeError, so skip it instead
                logger.warning(
                    f"Ignoring {env_var_name}: expected a JSON object but got {type(parsed).__name__}"
                )

    # Merge environment args with provided args (provided args take precedence)
    return {**env_server_args, **provided_args}
357
+
358
+
359
def configure_devices(
    server_args: dict[str, Any], parallel_size_param: str = "tensor_parallel_size"
) -> tuple[dict[str, Any], dict[str, str]]:
    """Translate a `device`/`devices` server arg into environment settings.

    The device entry is removed from a copy of `server_args` and exposed as
    `CUDA_VISIBLE_DEVICES`; the input dictionary itself is left untouched.

    Args:
        server_args: Dictionary of server arguments
        parallel_size_param: Name of parameter to set with device count if not specified

    Returns:
        Tuple of (updated server arguments dict, environment variables dict)

    Raises:
        ValueError: If both `device` and `devices` are present.
    """
    updated = server_args.copy()
    env_vars = {}

    has_device = "device" in updated
    has_devices = "devices" in updated
    if has_device and has_devices:
        raise ValueError("Cannot specify both device and devices in server args")

    if has_devices:
        devices = updated.pop("devices")
    elif has_device:
        devices = updated.pop("device")
    else:
        devices = None

    # nothing requested (or an explicit None): no environment changes
    if devices is None:
        return updated, env_vars

    # lists become a comma-separated string; anything else is stringified
    device_str = (
        ",".join(map(str, devices)) if isinstance(devices, list) else str(devices)
    )
    env_vars["CUDA_VISIBLE_DEVICES"] = device_str

    # default the parallel size to the number of comma-separated entries
    if parallel_size_param not in updated:
        updated[parallel_size_param] = device_str.count(",") + 1

    return updated, env_vars
@@ -1,6 +1,8 @@
1
1
  import time
2
2
  from contextvars import ContextVar
3
3
 
4
+ from inspect_ai.util._limit import LimitExceededError
5
+
4
6
 
5
7
  def init_sample_working_limit(start_time: float, working_limit: float | None) -> None:
6
8
  _sample_working_limit.set(working_limit)
@@ -22,6 +24,8 @@ def report_sample_waiting_time(waiting_time: float) -> None:
22
24
 
23
25
 
24
26
  def check_sample_working_limit() -> None:
27
+ from inspect_ai.log._transcript import SampleLimitEvent, transcript
28
+
25
29
  # no check if we don't have a limit
26
30
  working_limit = _sample_working_limit.get()
27
31
  if working_limit is None:
@@ -31,13 +35,15 @@ def check_sample_working_limit() -> None:
31
35
  running_time = time.monotonic() - _sample_start_time.get()
32
36
  working_time = running_time - sample_waiting_time()
33
37
  if working_time > working_limit:
34
- from inspect_ai.solver._limit import SampleLimitExceededError
35
-
36
- raise SampleLimitExceededError(
38
+ message = f"Exceeded working time limit ({working_limit:,} seconds)"
39
+ transcript()._event(
40
+ SampleLimitEvent(type="working", limit=int(working_limit), message=message)
41
+ )
42
+ raise LimitExceededError(
37
43
  type="working",
38
44
  value=int(working_time),
39
45
  limit=int(working_limit),
40
- message=f"Exceeded working time limit ({working_limit:,} seconds)",
46
+ message=message,
41
47
  )
42
48
 
43
49