inspect-ai 0.3.92__py3-none-any.whl → 0.3.93__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (117) hide show
  1. inspect_ai/_cli/eval.py +27 -0
  2. inspect_ai/_eval/eval.py +19 -2
  3. inspect_ai/_eval/evalset.py +4 -1
  4. inspect_ai/_eval/run.py +41 -0
  5. inspect_ai/_eval/task/generate.py +38 -44
  6. inspect_ai/_eval/task/log.py +26 -28
  7. inspect_ai/_eval/task/run.py +13 -20
  8. inspect_ai/_util/local_server.py +368 -0
  9. inspect_ai/_util/working.py +10 -4
  10. inspect_ai/_view/www/dist/assets/index.css +159 -146
  11. inspect_ai/_view/www/dist/assets/index.js +1020 -1061
  12. inspect_ai/_view/www/log-schema.json +4 -3
  13. inspect_ai/_view/www/package.json +1 -1
  14. inspect_ai/_view/www/src/@types/log.d.ts +3 -2
  15. inspect_ai/_view/www/src/app/content/MetaDataGrid.tsx +2 -2
  16. inspect_ai/_view/www/src/app/content/MetaDataView.module.css +1 -1
  17. inspect_ai/_view/www/src/app/content/MetadataGrid.module.css +1 -1
  18. inspect_ai/_view/www/src/app/content/RenderedContent.tsx +1 -1
  19. inspect_ai/_view/www/src/app/log-view/LogView.tsx +11 -0
  20. inspect_ai/_view/www/src/app/log-view/tabs/InfoTab.tsx +2 -9
  21. inspect_ai/_view/www/src/app/log-view/tabs/ModelsTab.tsx +51 -0
  22. inspect_ai/_view/www/src/app/log-view/tabs/TaskTab.module.css +6 -0
  23. inspect_ai/_view/www/src/app/log-view/tabs/TaskTab.tsx +143 -0
  24. inspect_ai/_view/www/src/app/plan/ModelCard.tsx +1 -2
  25. inspect_ai/_view/www/src/app/plan/PlanCard.tsx +29 -7
  26. inspect_ai/_view/www/src/app/plan/PlanDetailView.module.css +1 -1
  27. inspect_ai/_view/www/src/app/plan/PlanDetailView.tsx +1 -198
  28. inspect_ai/_view/www/src/app/samples/descriptor/score/NumericScoreDescriptor.tsx +2 -1
  29. inspect_ai/_view/www/src/app/usage/ModelUsagePanel.tsx +3 -2
  30. inspect_ai/_view/www/src/app/usage/TokenTable.module.css +4 -1
  31. inspect_ai/_view/www/src/app/usage/TokenTable.tsx +2 -2
  32. inspect_ai/_view/www/src/app/usage/UsageCard.module.css +8 -3
  33. inspect_ai/_view/www/src/app/usage/UsageCard.tsx +1 -35
  34. inspect_ai/_view/www/src/components/Card.css +0 -1
  35. inspect_ai/_view/www/src/constants.ts +2 -0
  36. inspect_ai/_view/www/src/utils/numeric.ts +17 -0
  37. inspect_ai/agent/_agent.py +3 -3
  38. inspect_ai/agent/_as_solver.py +20 -12
  39. inspect_ai/agent/_as_tool.py +15 -3
  40. inspect_ai/agent/_handoff.py +8 -1
  41. inspect_ai/agent/_run.py +11 -3
  42. inspect_ai/log/__init__.py +4 -0
  43. inspect_ai/log/_file.py +56 -0
  44. inspect_ai/log/_log.py +99 -0
  45. inspect_ai/log/_recorders/__init__.py +2 -0
  46. inspect_ai/log/_recorders/buffer/database.py +12 -11
  47. inspect_ai/log/_recorders/buffer/filestore.py +2 -2
  48. inspect_ai/log/_recorders/buffer/types.py +2 -2
  49. inspect_ai/log/_recorders/eval.py +20 -65
  50. inspect_ai/log/_recorders/file.py +28 -6
  51. inspect_ai/log/_recorders/recorder.py +7 -0
  52. inspect_ai/log/_recorders/types.py +1 -23
  53. inspect_ai/log/_samples.py +0 -8
  54. inspect_ai/log/_transcript.py +7 -1
  55. inspect_ai/log/_util.py +52 -0
  56. inspect_ai/model/__init__.py +5 -1
  57. inspect_ai/model/_call_tools.py +32 -12
  58. inspect_ai/model/_generate_config.py +14 -8
  59. inspect_ai/model/_model.py +21 -48
  60. inspect_ai/model/_model_output.py +25 -0
  61. inspect_ai/model/_openai.py +2 -0
  62. inspect_ai/model/_providers/anthropic.py +13 -23
  63. inspect_ai/model/_providers/openai_o1.py +8 -2
  64. inspect_ai/model/_providers/providers.py +18 -4
  65. inspect_ai/model/_providers/sglang.py +241 -0
  66. inspect_ai/model/_providers/vllm.py +207 -400
  67. inspect_ai/solver/__init__.py +7 -2
  68. inspect_ai/solver/_basic_agent.py +3 -10
  69. inspect_ai/solver/_task_state.py +26 -88
  70. inspect_ai/tool/_json_rpc_helpers.py +45 -17
  71. inspect_ai/tool/_mcp/_mcp.py +2 -0
  72. inspect_ai/tool/_mcp/_sandbox.py +8 -2
  73. inspect_ai/tool/_mcp/server.py +3 -1
  74. inspect_ai/tool/_tool_call.py +4 -1
  75. inspect_ai/tool/_tool_support_helpers.py +51 -12
  76. inspect_ai/tool/_tools/_bash_session.py +190 -68
  77. inspect_ai/tool/_tools/_computer/_computer.py +25 -1
  78. inspect_ai/tool/_tools/_text_editor.py +4 -3
  79. inspect_ai/tool/_tools/_web_browser/_web_browser.py +10 -3
  80. inspect_ai/util/__init__.py +12 -0
  81. inspect_ai/util/_limit.py +393 -0
  82. inspect_ai/util/_limited_conversation.py +57 -0
  83. {inspect_ai-0.3.92.dist-info → inspect_ai-0.3.93.dist-info}/METADATA +1 -1
  84. {inspect_ai-0.3.92.dist-info → inspect_ai-0.3.93.dist-info}/RECORD +89 -108
  85. {inspect_ai-0.3.92.dist-info → inspect_ai-0.3.93.dist-info}/WHEEL +1 -1
  86. inspect_ai/solver/_limit.py +0 -39
  87. inspect_ai/tool/_tools/_computer/_resources/Dockerfile +0 -102
  88. inspect_ai/tool/_tools/_computer/_resources/README.md +0 -30
  89. inspect_ai/tool/_tools/_computer/_resources/entrypoint/entrypoint.sh +0 -18
  90. inspect_ai/tool/_tools/_computer/_resources/entrypoint/novnc_startup.sh +0 -20
  91. inspect_ai/tool/_tools/_computer/_resources/entrypoint/x11vnc_startup.sh +0 -48
  92. inspect_ai/tool/_tools/_computer/_resources/entrypoint/xfce_startup.sh +0 -13
  93. inspect_ai/tool/_tools/_computer/_resources/entrypoint/xvfb_startup.sh +0 -48
  94. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/Code/User/globalStorage/state.vscdb +0 -0
  95. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/Code/User/settings.json +0 -9
  96. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-panel.xml +0 -61
  97. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-screensaver.xml +0 -10
  98. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfwm4.xml +0 -91
  99. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/Desktop/Firefox Web Browser.desktop +0 -10
  100. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/Desktop/Terminal.desktop +0 -10
  101. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/Desktop/Visual Studio Code.desktop +0 -10
  102. inspect_ai/tool/_tools/_computer/_resources/tool/.pylintrc +0 -8
  103. inspect_ai/tool/_tools/_computer/_resources/tool/.vscode/settings.json +0 -12
  104. inspect_ai/tool/_tools/_computer/_resources/tool/_args.py +0 -78
  105. inspect_ai/tool/_tools/_computer/_resources/tool/_constants.py +0 -22
  106. inspect_ai/tool/_tools/_computer/_resources/tool/_logger.py +0 -22
  107. inspect_ai/tool/_tools/_computer/_resources/tool/_run.py +0 -42
  108. inspect_ai/tool/_tools/_computer/_resources/tool/_tool_result.py +0 -33
  109. inspect_ai/tool/_tools/_computer/_resources/tool/_x11_client.py +0 -341
  110. inspect_ai/tool/_tools/_computer/_resources/tool/computer_tool.py +0 -141
  111. inspect_ai/tool/_tools/_computer/_resources/tool/pyproject.toml +0 -65
  112. inspect_ai/tool/_tools/_computer/_resources/tool/requirements.txt +0 -0
  113. inspect_ai/tool/_tools/_computer/test_args.py +0 -151
  114. /inspect_ai/{tool/_tools/_computer/_resources/tool/__init__.py → _view/www/src/app/log-view/tabs/ModelsTab.module.css} +0 -0
  115. {inspect_ai-0.3.92.dist-info → inspect_ai-0.3.93.dist-info}/entry_points.txt +0 -0
  116. {inspect_ai-0.3.92.dist-info → inspect_ai-0.3.93.dist-info}/licenses/LICENSE +0 -0
  117. {inspect_ai-0.3.92.dist-info → inspect_ai-0.3.93.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,368 @@
1
import json
import logging
import os
import random
import socket
import subprocess
import time
from typing import Any, Dict, Optional, Tuple

import httpx

# Module-level logger used for server lifecycle messages throughout this file
logger = logging.getLogger(__name__)

# Maps each launched server process to the socket holding its reserved port,
# so terminate_process() can release the port after the server is killed.
process_socket_map: Dict[subprocess.Popen, socket.socket] = {}


DEFAULT_TIMEOUT = 60 * 10  # fairly conservative default timeout of 10 minutes
20
+
21
+
22
def reserve_port(
    host: str, start: int = 30000, end: int = 40000
) -> Tuple[int, socket.socket]:
    """
    Reserve an available port by binding a socket to it.

    Ports in [start, end) are tried in random order; the first successful
    bind wins. The bound socket is returned alongside the port and must be
    kept open by the caller to hold the reservation.

    Args:
        host: Host to bind to
        start: Minimum port number to try
        end: Maximum port number to try

    Returns:
        A tuple (port, lock_socket) where `lock_socket` is kept open to hold the lock.

    Raises:
        RuntimeError: If no port in the range could be bound.
    """
    ports = list(range(start, end))
    random.shuffle(ports)

    for candidate in ports:
        lock_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        lock_socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        try:
            lock_socket.bind((host, candidate))
        except socket.error:
            # Port already taken: discard this socket and try the next one.
            lock_socket.close()
        else:
            return candidate, lock_socket

    raise RuntimeError("No free port available.")
50
+
51
+
52
def release_port(lock_socket: socket.socket) -> None:
    """
    Release a reserved port by closing its lock socket.

    Any error raised while closing is logged rather than propagated.

    Args:
        lock_socket: The socket to close
    """
    try:
        lock_socket.close()
    except Exception as e:
        # Best effort: a failed close should never take down the caller.
        logger.error(f"Error closing socket: {e}")
63
+
64
+
65
def execute_shell_command(command: list[str]) -> subprocess.Popen[str]:
    """
    Execute a command and return its process handle.

    The child's stdout is logged at DEBUG level and its stderr at INFO
    level; both pipes are drained by daemon threads so they never fill
    up and block the child.

    Args:
        command: List of command arguments

    Returns:
        A subprocess.Popen object representing the running process
    """
    import threading

    # Redirect output to pipes so the reader threads below can capture it.
    process = subprocess.Popen(
        command,
        text=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        bufsize=1,  # Line buffered
    )

    def _pump(stream, emit) -> None:
        # Forward each non-empty line from a pipe to the given log method,
        # then close the pipe once the child stops producing output.
        if stream is None:
            return
        for line in iter(stream.readline, ""):
            if line:
                emit(line.strip())
        stream.close()

    threading.Thread(
        target=_pump, args=(process.stdout, logger.debug), daemon=True
    ).start()
    threading.Thread(
        target=_pump, args=(process.stderr, logger.info), daemon=True
    ).start()

    logger.info(f"Started server with command: {' '.join(command)}")
    return process
110
+
111
+
112
def kill_process_tree(pid: int) -> None:
    """
    Kill a process and all its children.

    Sends SIGTERM to the children (via pkill -P) and the parent first,
    waits one second, then escalates to SIGKILL if the parent still
    exists. All failures are logged rather than raised.

    Args:
        pid: Process ID to kill
    """
    try:
        # Send SIGTERM
        subprocess.run(["pkill", "-TERM", "-P", str(pid)], check=False)
        subprocess.run(["kill", "-TERM", str(pid)], check=False)
        time.sleep(1)

        # If process still exists, send SIGKILL
        try:
            os.kill(pid, 0)  # Check if process exists
            subprocess.run(["pkill", "-KILL", "-P", str(pid)], check=False)
            subprocess.run(["kill", "-KILL", str(pid)], check=False)
        except OSError:
            pass  # Process already terminated
            # NOTE: a missing pkill/kill binary also raises OSError
            # (FileNotFoundError) and is deliberately swallowed here.
    except Exception as e:
        logger.error(f"Error killing process tree: {e}")
134
+
135
+
136
def launch_server_cmd(
    command: list[str], host: str = "0.0.0.0", port: Optional[int] = None
) -> Tuple[subprocess.Popen[str], int, list[str]]:
    """
    Launch a server process with the given base command and return the process, port, and full command.

    When no port is given, a free one is reserved and the reservation
    socket is recorded in `process_socket_map` so that
    `terminate_process()` can release it later.

    Args:
        command: Base command to execute
        host: Host to bind to
        port: Port to bind to. If None, a free port is reserved.

    Returns:
        Tuple of (process, port, full_command)
    """
    lock_socket: Optional[socket.socket] = None
    if port is None:
        # No explicit port requested: grab a free one and hold the lock.
        port, lock_socket = reserve_port(host)

    full_command = command + ["--port", str(port)]
    logger.info(f"Launching server on port {port}")

    process = execute_shell_command(full_command)

    # Remember the reservation so terminating the process frees the port.
    if lock_socket is not None:
        process_socket_map[process] = lock_socket

    return process, port, full_command
164
+
165
+
166
def terminate_process(process: subprocess.Popen[str]) -> None:
    """
    Terminate the process and automatically release the reserved port.

    Args:
        process: The process to terminate
    """
    kill_process_tree(process.pid)

    # If we reserved a port for this process, close the lock socket now.
    reserved = process_socket_map.pop(process, None)
    if reserved is not None:
        release_port(reserved)
178
+
179
+
180
def wait_for_server(
    base_url: str,
    process: subprocess.Popen[str],
    full_command: Optional[list[str]] = None,
    timeout: Optional[int] = None,
    api_key: Optional[str] = None,
) -> None:
    """
    Wait for the server to be ready by polling the /v1/models endpoint.

    Polls once per second, treating connection errors and non-200
    responses as "not ready yet". Fails fast if the server process dies.

    Args:
        base_url: The base URL of the server
        process: The subprocess running the server
        full_command: The full command used to launch the server
        timeout: Maximum time to wait in seconds. None means wait forever.
        api_key: The API key to use for the request

    Raises:
        TimeoutError: If the server is not ready within `timeout` seconds.
        RuntimeError: If the server process exits before becoming ready.
    """
    logger.info(f"Waiting for server at {base_url} to become ready...")
    start_time = time.time()
    debug_advice = "Try rerunning with '--log-level debug' to see the full traceback."
    if full_command:
        debug_advice += f" Alternatively, you can run the following launch command manually to see the full traceback:\n\n{' '.join(full_command)}\n\n"

    while True:
        # Check for timeout first. Use an explicit None check so that
        # timeout=0 means "fail immediately" instead of "wait forever"
        # (the original truthiness test silently disabled a zero timeout).
        if timeout is not None and time.time() - start_time > timeout:
            error_msg = f"Server did not become ready within timeout period ({timeout} seconds). Try increasing the timeout with '-M timeout=...'. {debug_advice}"
            logger.error(error_msg)
            raise TimeoutError(error_msg)

        # Check if the process is still alive (poll once and reuse the code)
        exit_code = process.poll()
        if exit_code is not None:
            error_msg = f"Server process exited unexpectedly with code {exit_code}. {debug_advice}"
            logger.error(error_msg)
            raise RuntimeError(error_msg)

        try:
            response = httpx.get(
                f"{base_url}/v1/models",
                headers={"Authorization": f"Bearer {api_key or 'None'}"},
                timeout=5.0,  # Short timeout for individual requests
            )
            if response.status_code == 200:
                logger.info("Server is ready.")
                break

            # Log non-200 status but don't treat as hard error yet
            logger.debug(
                f"Server check returned status {response.status_code}, retrying..."
            )
        except httpx.RequestError as e:
            # Log connection errors but don't treat as hard error yet
            logger.debug(f"Server check failed: {e}, retrying...")

        # Wait before the next poll attempt
        time.sleep(1)
238
+
239
+
240
def start_local_server(
    base_cmd: list[str],
    host: str,
    port: Optional[int] = None,
    api_key: Optional[str] = None,
    server_type: str = "server",
    timeout: Optional[int] = DEFAULT_TIMEOUT,
    server_args: Optional[dict[str, Any]] = None,
) -> Tuple[str, subprocess.Popen[str], int]:
    """
    Start a server with the given command and handle potential errors.

    Args:
        base_cmd: List of base command arguments
        host: Host to bind to
        port: Port to bind to. If None, a free port is reserved.
        api_key: API key to use for server authentication
        server_type: Type of server being started (for error messages)
        timeout: Maximum time to wait for server to become ready
        server_args: Additional server arguments to pass to the command
    Returns:
        Tuple of (base_url, process, port)

    Raises:
        RuntimeError: If server fails to start
    """
    # Copy so that appending CLI flags does not mutate the caller's list
    # (the original extended base_cmd in place, so repeated calls would
    # accumulate duplicate arguments).
    full_command = list(base_cmd)
    server_process = None

    if server_args:
        for key, value in server_args.items():
            # Convert Python style args (underscore) to CLI style (dash)
            cli_key = key.replace("_", "-")
            full_command.extend([f"--{cli_key}", str(value)])

    try:
        server_process, found_port, full_command = launch_server_cmd(
            full_command, host=host, port=port
        )
        base_url = f"http://localhost:{found_port}/v1"
        wait_for_server(
            f"http://localhost:{found_port}",
            server_process,
            api_key=api_key,
            timeout=timeout,
            full_command=full_command,
        )
        return base_url, server_process, found_port
    except Exception as e:
        # Cleanup any partially started server
        if server_process:
            terminate_process(server_process)

        # Re-raise with more context
        raise RuntimeError(f"Failed to start {server_type} server: {str(e)}") from e
295
+
296
+
297
def merge_env_server_args(
    env_var_name: str,
    provided_args: Dict[str, Any],
    logger: logging.Logger,
) -> Dict[str, Any]:
    """
    Load server arguments from an environment variable and merge them with provided arguments.

    The environment variable is expected to hold a JSON object; a parse
    failure is logged and treated as if the variable were unset.

    Args:
        env_var_name: Name of the environment variable containing JSON server args
        provided_args: Dictionary of server arguments provided by the user
        logger: Logger instance to log messages

    Returns:
        Dictionary of merged server arguments, with provided args taking precedence
    """
    env_server_args: Dict[str, Any] = {}
    server_args_json = os.environ.get(env_var_name)

    if server_args_json:
        try:
            env_server_args = json.loads(server_args_json)
        except json.JSONDecodeError:
            logger.warning(
                f"Failed to parse {env_var_name} as JSON: {server_args_json}"
            )
        else:
            logger.info(
                f"Loaded server args from environment {env_var_name}: {env_server_args}"
            )

    # Merge environment args with provided args (provided args take precedence)
    return {**env_server_args, **provided_args}
329
+
330
+
331
def configure_devices(
    server_args: dict[str, Any], parallel_size_param: str = "tensor_parallel_size"
) -> dict[str, Any]:
    """Configure device settings and return updated server args.

    Pops "device"/"devices" from the args, exports CUDA_VISIBLE_DEVICES
    accordingly, and defaults `parallel_size_param` to the device count.

    Args:
        server_args: Dictionary of server arguments
        parallel_size_param: Name of parameter to set with device count if not specified

    Returns:
        Updated server arguments dict (returned unchanged when no device
        key is present)

    Raises:
        ValueError: If both "device" and "devices" are specified.
    """
    result = server_args.copy()

    devices = None
    if "device" in result and "devices" in result:
        raise ValueError("Cannot specify both device and devices in server args")
    elif "devices" in result:
        devices = result.pop("devices")
    elif "device" in result:
        devices = result.pop("device")

    # No device selection requested: leave CUDA_VISIBLE_DEVICES and the
    # parallel size untouched. (The original code fell through here and
    # exported CUDA_VISIBLE_DEVICES="None", which hides every GPU.)
    if devices is None:
        return result

    # Convert device list to comma-separated string if needed
    if isinstance(devices, list):
        device_str = ",".join(map(str, devices))
    else:
        device_str = str(devices)

    # Restrict visible GPUs to the requested devices
    os.environ["CUDA_VISIBLE_DEVICES"] = device_str

    # Set parallel size parameter to the device count if not explicitly provided
    if parallel_size_param not in result:
        result[parallel_size_param] = len(device_str.split(","))

    return result
@@ -1,6 +1,8 @@
1
1
  import time
2
2
  from contextvars import ContextVar
3
3
 
4
+ from inspect_ai.util._limit import LimitExceededError
5
+
4
6
 
5
7
  def init_sample_working_limit(start_time: float, working_limit: float | None) -> None:
6
8
  _sample_working_limit.set(working_limit)
@@ -22,6 +24,8 @@ def report_sample_waiting_time(waiting_time: float) -> None:
22
24
 
23
25
 
24
26
  def check_sample_working_limit() -> None:
27
+ from inspect_ai.log._transcript import SampleLimitEvent, transcript
28
+
25
29
  # no check if we don't have a limit
26
30
  working_limit = _sample_working_limit.get()
27
31
  if working_limit is None:
@@ -31,13 +35,15 @@ def check_sample_working_limit() -> None:
31
35
  running_time = time.monotonic() - _sample_start_time.get()
32
36
  working_time = running_time - sample_waiting_time()
33
37
  if working_time > working_limit:
34
- from inspect_ai.solver._limit import SampleLimitExceededError
35
-
36
- raise SampleLimitExceededError(
38
+ message = f"Exceeded working time limit ({working_limit:,} seconds)"
39
+ transcript()._event(
40
+ SampleLimitEvent(type="working", limit=int(working_limit), message=message)
41
+ )
42
+ raise LimitExceededError(
37
43
  type="working",
38
44
  value=int(working_time),
39
45
  limit=int(working_limit),
40
- message=f"Exceeded working time limit ({working_limit:,} seconds)",
46
+ message=message,
41
47
  )
42
48
 
43
49