inspect-ai 0.3.92__py3-none-any.whl → 0.3.94__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/_cli/eval.py +27 -0
- inspect_ai/_display/textual/widgets/samples.py +3 -3
- inspect_ai/_display/textual/widgets/transcript.py +3 -29
- inspect_ai/_eval/eval.py +19 -2
- inspect_ai/_eval/evalset.py +4 -1
- inspect_ai/_eval/run.py +41 -0
- inspect_ai/_eval/task/generate.py +38 -44
- inspect_ai/_eval/task/log.py +26 -28
- inspect_ai/_eval/task/run.py +23 -27
- inspect_ai/_util/answer.py +26 -0
- inspect_ai/_util/constants.py +0 -1
- inspect_ai/_util/local_server.py +398 -0
- inspect_ai/_util/working.py +10 -4
- inspect_ai/_view/www/dist/assets/index.css +173 -159
- inspect_ai/_view/www/dist/assets/index.js +1417 -1142
- inspect_ai/_view/www/log-schema.json +379 -3
- inspect_ai/_view/www/package.json +1 -1
- inspect_ai/_view/www/src/@types/log.d.ts +93 -14
- inspect_ai/_view/www/src/app/content/MetaDataGrid.tsx +2 -2
- inspect_ai/_view/www/src/app/content/MetaDataView.module.css +1 -1
- inspect_ai/_view/www/src/app/content/MetadataGrid.module.css +1 -1
- inspect_ai/_view/www/src/app/content/RenderedContent.tsx +1 -1
- inspect_ai/_view/www/src/app/log-view/LogView.tsx +11 -0
- inspect_ai/_view/www/src/app/log-view/tabs/InfoTab.tsx +2 -9
- inspect_ai/_view/www/src/app/log-view/tabs/ModelsTab.tsx +51 -0
- inspect_ai/_view/www/src/app/log-view/tabs/TaskTab.module.css +6 -0
- inspect_ai/_view/www/src/app/log-view/tabs/TaskTab.tsx +143 -0
- inspect_ai/_view/www/src/app/plan/ModelCard.tsx +1 -2
- inspect_ai/_view/www/src/app/plan/PlanCard.tsx +29 -7
- inspect_ai/_view/www/src/app/plan/PlanDetailView.module.css +1 -1
- inspect_ai/_view/www/src/app/plan/PlanDetailView.tsx +1 -198
- inspect_ai/_view/www/src/app/samples/descriptor/score/NumericScoreDescriptor.tsx +2 -1
- inspect_ai/_view/www/src/app/samples/transcript/SandboxEventView.module.css +2 -1
- inspect_ai/_view/www/src/app/samples/transcript/SpanEventView.tsx +174 -0
- inspect_ai/_view/www/src/app/samples/transcript/ToolEventView.tsx +8 -8
- inspect_ai/_view/www/src/app/samples/transcript/TranscriptView.tsx +12 -2
- inspect_ai/_view/www/src/app/samples/transcript/TranscriptVirtualListComponent.module.css +1 -1
- inspect_ai/_view/www/src/app/samples/transcript/event/EventPanel.tsx +0 -3
- inspect_ai/_view/www/src/app/samples/transcript/transform/fixups.ts +87 -25
- inspect_ai/_view/www/src/app/samples/transcript/transform/treeify.ts +229 -17
- inspect_ai/_view/www/src/app/samples/transcript/transform/utils.ts +11 -0
- inspect_ai/_view/www/src/app/samples/transcript/types.ts +5 -1
- inspect_ai/_view/www/src/app/usage/ModelUsagePanel.tsx +3 -2
- inspect_ai/_view/www/src/app/usage/TokenTable.module.css +4 -1
- inspect_ai/_view/www/src/app/usage/TokenTable.tsx +2 -2
- inspect_ai/_view/www/src/app/usage/UsageCard.module.css +8 -3
- inspect_ai/_view/www/src/app/usage/UsageCard.tsx +1 -35
- inspect_ai/_view/www/src/components/Card.css +0 -1
- inspect_ai/_view/www/src/constants.ts +2 -0
- inspect_ai/_view/www/src/utils/numeric.ts +17 -0
- inspect_ai/agent/_agent.py +3 -3
- inspect_ai/agent/_as_solver.py +22 -12
- inspect_ai/agent/_as_tool.py +20 -6
- inspect_ai/agent/_handoff.py +12 -1
- inspect_ai/agent/_react.py +4 -3
- inspect_ai/agent/_run.py +16 -3
- inspect_ai/agent/_types.py +9 -0
- inspect_ai/dataset/_dataset.py +6 -3
- inspect_ai/log/__init__.py +14 -0
- inspect_ai/log/_convert.py +4 -9
- inspect_ai/log/_file.py +56 -0
- inspect_ai/log/_log.py +99 -0
- inspect_ai/log/_recorders/__init__.py +2 -0
- inspect_ai/log/_recorders/buffer/database.py +12 -11
- inspect_ai/log/_recorders/buffer/filestore.py +2 -2
- inspect_ai/log/_recorders/buffer/types.py +2 -2
- inspect_ai/log/_recorders/eval.py +20 -65
- inspect_ai/log/_recorders/file.py +28 -6
- inspect_ai/log/_recorders/recorder.py +7 -0
- inspect_ai/log/_recorders/types.py +1 -23
- inspect_ai/log/_samples.py +14 -25
- inspect_ai/log/_transcript.py +84 -36
- inspect_ai/log/_tree.py +118 -0
- inspect_ai/log/_util.py +52 -0
- inspect_ai/model/__init__.py +5 -1
- inspect_ai/model/_call_tools.py +72 -44
- inspect_ai/model/_generate_config.py +14 -8
- inspect_ai/model/_model.py +66 -88
- inspect_ai/model/_model_output.py +25 -0
- inspect_ai/model/_openai.py +2 -0
- inspect_ai/model/_providers/anthropic.py +13 -23
- inspect_ai/model/_providers/hf.py +27 -1
- inspect_ai/model/_providers/openai_o1.py +8 -2
- inspect_ai/model/_providers/providers.py +18 -4
- inspect_ai/model/_providers/sglang.py +247 -0
- inspect_ai/model/_providers/vllm.py +211 -400
- inspect_ai/scorer/_choice.py +1 -2
- inspect_ai/solver/__init__.py +7 -2
- inspect_ai/solver/_basic_agent.py +3 -10
- inspect_ai/solver/_chain.py +1 -1
- inspect_ai/solver/_fork.py +1 -1
- inspect_ai/solver/_multiple_choice.py +5 -22
- inspect_ai/solver/_plan.py +2 -2
- inspect_ai/solver/_task_state.py +26 -88
- inspect_ai/solver/_transcript.py +6 -7
- inspect_ai/tool/_json_rpc_helpers.py +45 -17
- inspect_ai/tool/_mcp/_mcp.py +8 -5
- inspect_ai/tool/_mcp/_sandbox.py +8 -2
- inspect_ai/tool/_mcp/server.py +3 -1
- inspect_ai/tool/_tool_call.py +4 -1
- inspect_ai/tool/_tool_support_helpers.py +51 -12
- inspect_ai/tool/_tools/_bash_session.py +190 -68
- inspect_ai/tool/_tools/_computer/_computer.py +25 -1
- inspect_ai/tool/_tools/_execute.py +4 -1
- inspect_ai/tool/_tools/_text_editor.py +4 -3
- inspect_ai/tool/_tools/_web_browser/_web_browser.py +10 -3
- inspect_ai/util/__init__.py +16 -0
- inspect_ai/util/_anyio.py +11 -0
- inspect_ai/util/_collect.py +50 -0
- inspect_ai/util/_limit.py +393 -0
- inspect_ai/util/_limited_conversation.py +57 -0
- inspect_ai/util/_span.py +58 -0
- inspect_ai/util/_subtask.py +27 -42
- {inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/METADATA +1 -1
- {inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/RECORD +120 -134
- {inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/WHEEL +1 -1
- inspect_ai/_display/core/group.py +0 -79
- inspect_ai/solver/_limit.py +0 -39
- inspect_ai/tool/_tools/_computer/_resources/Dockerfile +0 -102
- inspect_ai/tool/_tools/_computer/_resources/README.md +0 -30
- inspect_ai/tool/_tools/_computer/_resources/entrypoint/entrypoint.sh +0 -18
- inspect_ai/tool/_tools/_computer/_resources/entrypoint/novnc_startup.sh +0 -20
- inspect_ai/tool/_tools/_computer/_resources/entrypoint/x11vnc_startup.sh +0 -48
- inspect_ai/tool/_tools/_computer/_resources/entrypoint/xfce_startup.sh +0 -13
- inspect_ai/tool/_tools/_computer/_resources/entrypoint/xvfb_startup.sh +0 -48
- inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/Code/User/globalStorage/state.vscdb +0 -0
- inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/Code/User/settings.json +0 -9
- inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-panel.xml +0 -61
- inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-screensaver.xml +0 -10
- inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfwm4.xml +0 -91
- inspect_ai/tool/_tools/_computer/_resources/image_home_dir/Desktop/Firefox Web Browser.desktop +0 -10
- inspect_ai/tool/_tools/_computer/_resources/image_home_dir/Desktop/Terminal.desktop +0 -10
- inspect_ai/tool/_tools/_computer/_resources/image_home_dir/Desktop/Visual Studio Code.desktop +0 -10
- inspect_ai/tool/_tools/_computer/_resources/tool/.pylintrc +0 -8
- inspect_ai/tool/_tools/_computer/_resources/tool/.vscode/settings.json +0 -12
- inspect_ai/tool/_tools/_computer/_resources/tool/_args.py +0 -78
- inspect_ai/tool/_tools/_computer/_resources/tool/_constants.py +0 -22
- inspect_ai/tool/_tools/_computer/_resources/tool/_logger.py +0 -22
- inspect_ai/tool/_tools/_computer/_resources/tool/_run.py +0 -42
- inspect_ai/tool/_tools/_computer/_resources/tool/_tool_result.py +0 -33
- inspect_ai/tool/_tools/_computer/_resources/tool/_x11_client.py +0 -341
- inspect_ai/tool/_tools/_computer/_resources/tool/computer_tool.py +0 -141
- inspect_ai/tool/_tools/_computer/_resources/tool/pyproject.toml +0 -65
- inspect_ai/tool/_tools/_computer/_resources/tool/requirements.txt +0 -0
- inspect_ai/tool/_tools/_computer/test_args.py +0 -151
- /inspect_ai/{tool/_tools/_computer/_resources/tool/__init__.py → _view/www/src/app/log-view/tabs/ModelsTab.module.css} +0 -0
- {inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/licenses/LICENSE +0 -0
- {inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/top_level.txt +0 -0
inspect_ai/_eval/task/run.py
CHANGED
@@ -24,7 +24,6 @@ from inspect_ai._util._async import tg_collect
|
|
24
24
|
from inspect_ai._util.constants import (
|
25
25
|
DEFAULT_EPOCHS,
|
26
26
|
DEFAULT_MAX_CONNECTIONS,
|
27
|
-
SAMPLE_SUBTASK,
|
28
27
|
)
|
29
28
|
from inspect_ai._util.datetime import iso_now
|
30
29
|
from inspect_ai._util.error import exception_message
|
@@ -51,8 +50,12 @@ from inspect_ai.log import (
|
|
51
50
|
)
|
52
51
|
from inspect_ai.log._condense import condense_sample
|
53
52
|
from inspect_ai.log._file import eval_log_json_str
|
54
|
-
from inspect_ai.log._log import
|
55
|
-
|
53
|
+
from inspect_ai.log._log import (
|
54
|
+
EvalSampleLimit,
|
55
|
+
EvalSampleReductions,
|
56
|
+
EvalSampleSummary,
|
57
|
+
eval_error,
|
58
|
+
)
|
56
59
|
from inspect_ai.log._samples import (
|
57
60
|
active_sample,
|
58
61
|
)
|
@@ -61,8 +64,8 @@ from inspect_ai.log._transcript import (
|
|
61
64
|
SampleInitEvent,
|
62
65
|
SampleLimitEvent,
|
63
66
|
ScoreEvent,
|
64
|
-
StepEvent,
|
65
67
|
Transcript,
|
68
|
+
init_transcript,
|
66
69
|
transcript,
|
67
70
|
)
|
68
71
|
from inspect_ai.model import (
|
@@ -82,12 +85,13 @@ from inspect_ai.scorer._scorer import unique_scorer_name
|
|
82
85
|
from inspect_ai.solver import Generate, Plan, TaskState
|
83
86
|
from inspect_ai.solver._chain import Chain, unroll
|
84
87
|
from inspect_ai.solver._fork import set_task_generate
|
85
|
-
from inspect_ai.solver._limit import SampleLimitExceededError
|
86
88
|
from inspect_ai.solver._solver import Solver
|
87
89
|
from inspect_ai.solver._task_state import sample_state, set_sample_state, state_jsonable
|
90
|
+
from inspect_ai.util._limit import LimitExceededError
|
88
91
|
from inspect_ai.util._sandbox.context import sandbox_connections
|
89
92
|
from inspect_ai.util._sandbox.environment import SandboxEnvironmentSpec
|
90
|
-
from inspect_ai.util.
|
93
|
+
from inspect_ai.util._span import span
|
94
|
+
from inspect_ai.util._store import init_subtask_store
|
91
95
|
|
92
96
|
from ..context import init_task_context
|
93
97
|
from ..task import Task
|
@@ -554,7 +558,9 @@ async def task_run_sample(
|
|
554
558
|
# initialise subtask and scoring context
|
555
559
|
init_sample_model_usage()
|
556
560
|
set_sample_state(state)
|
557
|
-
sample_transcript
|
561
|
+
sample_transcript = Transcript()
|
562
|
+
init_transcript(sample_transcript)
|
563
|
+
init_subtask_store(state.store)
|
558
564
|
if logger:
|
559
565
|
sample_transcript._subscribe(
|
560
566
|
lambda event: logger.log_sample_event(sample_id, state.epoch, event)
|
@@ -613,7 +619,8 @@ async def task_run_sample(
|
|
613
619
|
results: dict[str, SampleScore] = {}
|
614
620
|
try:
|
615
621
|
# begin init
|
616
|
-
|
622
|
+
init_span = span("init", type="init")
|
623
|
+
await init_span.__aenter__()
|
617
624
|
|
618
625
|
# sample init event (remove file bodies as they have content or absolute paths)
|
619
626
|
event_sample = sample.model_copy(
|
@@ -635,7 +642,7 @@ async def task_run_sample(
|
|
635
642
|
active.sandboxes = await sandbox_connections()
|
636
643
|
|
637
644
|
# end init
|
638
|
-
|
645
|
+
await init_span.__aexit__(None, None, None)
|
639
646
|
|
640
647
|
# initialise timeout context manager
|
641
648
|
timeout_cm = (
|
@@ -649,17 +656,18 @@ async def task_run_sample(
|
|
649
656
|
init_sample_working_limit(start_time, working_limit)
|
650
657
|
|
651
658
|
# run sample w/ optional timeout
|
652
|
-
with timeout_cm:
|
659
|
+
with timeout_cm, state._token_limit, state._message_limit:
|
653
660
|
# mark started
|
654
661
|
active.started = datetime.now().timestamp()
|
655
662
|
|
656
663
|
if logger is not None:
|
657
664
|
await logger.start_sample(
|
658
|
-
|
665
|
+
EvalSampleSummary(
|
659
666
|
id=sample_id,
|
660
667
|
epoch=state.epoch,
|
661
668
|
input=sample.input,
|
662
669
|
target=sample.target,
|
670
|
+
metadata=sample.metadata or {},
|
663
671
|
)
|
664
672
|
)
|
665
673
|
|
@@ -707,18 +715,9 @@ async def task_run_sample(
|
|
707
715
|
# handle the cancel exception
|
708
716
|
raise
|
709
717
|
|
710
|
-
except
|
711
|
-
# sample limit event
|
712
|
-
transcript()._event(
|
713
|
-
SampleLimitEvent(
|
714
|
-
type=ex.type,
|
715
|
-
limit=ex.limit,
|
716
|
-
message=f"Sample completed: {ex.message}",
|
717
|
-
)
|
718
|
-
)
|
719
|
-
|
718
|
+
except LimitExceededError:
|
720
719
|
# capture most recent state for scoring
|
721
|
-
state =
|
720
|
+
state = sample_state() or state
|
722
721
|
|
723
722
|
except BaseException as ex:
|
724
723
|
error, raise_error = handle_error(ex)
|
@@ -735,9 +734,6 @@ async def task_run_sample(
|
|
735
734
|
if time_limit is not None:
|
736
735
|
timeout_cm = anyio.fail_after(time_limit / 2)
|
737
736
|
|
738
|
-
# turn off message and token limits
|
739
|
-
state.message_limit = None
|
740
|
-
state.token_limit = None
|
741
737
|
set_sample_state(state)
|
742
738
|
|
743
739
|
# scoring
|
@@ -749,7 +745,7 @@ async def task_run_sample(
|
|
749
745
|
scorer_name = unique_scorer_name(
|
750
746
|
scorer, list(results.keys())
|
751
747
|
)
|
752
|
-
with
|
748
|
+
async with span(name=scorer_name, type="scorer"):
|
753
749
|
score_result = (
|
754
750
|
await scorer(state, Target(sample.target))
|
755
751
|
if scorer
|
@@ -929,7 +925,7 @@ async def log_sample(
|
|
929
925
|
input=sample.input,
|
930
926
|
choices=sample.choices,
|
931
927
|
target=sample.target,
|
932
|
-
metadata=
|
928
|
+
metadata=sample.metadata or {},
|
933
929
|
sandbox=sample.sandbox,
|
934
930
|
files=list(sample.files.keys()) if sample.files else None,
|
935
931
|
setup=sample.setup,
|
@@ -0,0 +1,26 @@
|
|
1
|
+
def answer_character(index: int) -> str:
    """Map a zero-based answer index to its display character.

    Indexes 0-25 map to the letters 'A'-'Z'; indexes from 26 onward map
    to the numeric strings '1', '2', etc.
    """
    return chr(ord("A") + index) if index < 26 else str(index - 25)
|
11
|
+
|
12
|
+
|
13
|
+
def answer_index(char: str) -> int:
    """Map an answer character back to its zero-based index.

    Letters map to 0-25 ('A' -> 0, 'B' -> 1, ...), case-insensitively;
    digit characters map to indexes 26 and up ('1' -> 26), the inverse of
    answer_character().

    Raises:
        ValueError: If char is not a letter or a number.

    NOTE(review): ',' and ' ' pass the guard below and fall through to the
    letter branch, yielding negative indexes — presumably tolerated by
    callers splitting multi-answer strings; confirm with call sites.
    """
    if char.isalpha() or char == "," or char == " ":
        return ord(char.upper()) - ord("A")
    elif char.isnumeric():
        return 25 + int(char)
    else:
        # Fixed typo in the error message ("Unepxected" -> "Unexpected")
        raise ValueError(
            f"Unexpected multiple choice answer: {char} (must be a letter or number)"
        )
|
inspect_ai/_util/constants.py
CHANGED
@@ -0,0 +1,398 @@
|
|
1
|
+
import json
|
2
|
+
import logging
|
3
|
+
import os
|
4
|
+
import random
|
5
|
+
import socket
|
6
|
+
import subprocess
|
7
|
+
import time
|
8
|
+
from typing import Any, Dict, Optional, Tuple
|
9
|
+
|
10
|
+
import httpx
|
11
|
+
|
12
|
+
# Set up logger for this module
logger = logging.getLogger(__name__)

# Global dictionary to keep track of process -> reserved port mappings.
# Populated by launch_server_cmd() and drained by terminate_process(); the
# stored socket stays open to hold the port reservation for the process.
process_socket_map: dict[subprocess.Popen[str], socket.socket] = {}


DEFAULT_TIMEOUT = 60 * 10  # fairly conservative default timeout of 10 minutes
|
20
|
+
|
21
|
+
|
22
|
+
def reserve_port(
    host: str, start: int = 30000, end: int = 40000
) -> Tuple[int, socket.socket]:
    """
    Reserve an available port by binding a socket to it.

    Ports in [start, end) are tried in random order; the first successful
    bind wins, and the bound socket is handed back so the caller can keep
    it open to hold the reservation.

    Args:
        host: Host to bind to
        start: Minimum port number to try
        end: Maximum port number to try

    Returns:
        A tuple (port, lock_socket) where `lock_socket` is kept open to hold the lock.

    Raises:
        RuntimeError: If no port in the range could be bound.
    """
    ports = list(range(start, end))
    random.shuffle(ports)

    for candidate in ports:
        lock_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        lock_socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        try:
            lock_socket.bind((host, candidate))
        except socket.error:
            # Port is taken; discard this socket and try the next one
            lock_socket.close()
        else:
            return candidate, lock_socket

    raise RuntimeError("No free port available.")
|
50
|
+
|
51
|
+
|
52
|
+
def release_port(lock_socket: socket.socket) -> None:
    """
    Release a reserved port by closing its lock socket.

    Args:
        lock_socket: The socket to close
    """
    try:
        lock_socket.close()
    except Exception as err:
        # Best effort: a failed close should never propagate to the caller
        logger.error(f"Error closing socket: {err}")
|
63
|
+
|
64
|
+
|
65
|
+
def execute_shell_command(
    command: list[str], env: Optional[dict[str, str]] = None
) -> subprocess.Popen[str]:
    """
    Execute a command and return its process handle.

    stdout/stderr are captured by daemon reader threads and forwarded to
    this module's logger (stdout at DEBUG level, stderr at INFO level).

    Args:
        command: List of command arguments
        env: Optional environment variables to pass to the subprocess

    Returns:
        A subprocess.Popen object representing the running process
    """
    # Start from the current environment and layer any overrides on top
    merged_env = os.environ.copy()
    if env:
        merged_env.update(env)

    # Pipe both streams so their output can be captured below
    proc = subprocess.Popen(
        command,
        text=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        bufsize=1,  # Line buffered
        env=merged_env,
    )

    import threading

    def _pump(stream, log_fn) -> None:
        # Drain a pipe line by line, forwarding non-empty lines to the logger
        if stream is None:
            return
        for line in iter(stream.readline, ""):
            if line:
                log_fn(line.strip())
        stream.close()

    # Daemon threads so a stuck pipe never blocks interpreter shutdown
    threading.Thread(target=_pump, args=(proc.stdout, logger.debug), daemon=True).start()
    threading.Thread(target=_pump, args=(proc.stderr, logger.info), daemon=True).start()

    logger.info(f"Started server with command: {' '.join(command)}")
    return proc
|
119
|
+
|
120
|
+
|
121
|
+
def kill_process_tree(pid: int) -> None:
    """
    Kill a process and all of its children.

    Sends SIGTERM to the process and its child group first; if the process
    is still alive one second later, escalates to SIGKILL.

    Args:
        pid: Process ID to kill
    """
    try:
        # Graceful shutdown first: SIGTERM to the children, then the process
        subprocess.run(["pkill", "-TERM", "-P", str(pid)], check=False)
        subprocess.run(["kill", "-TERM", str(pid)], check=False)
        time.sleep(1)

        try:
            os.kill(pid, 0)  # raises OSError if the process is already gone
        except OSError:
            pass  # Process already terminated
        else:
            # Still alive -- escalate to SIGKILL
            subprocess.run(["pkill", "-KILL", "-P", str(pid)], check=False)
            subprocess.run(["kill", "-KILL", str(pid)], check=False)
    except Exception as err:
        logger.error(f"Error killing process tree: {err}")
|
143
|
+
|
144
|
+
|
145
|
+
def launch_server_cmd(
    command: list[str],
    host: str = "0.0.0.0",
    port: Optional[int] = None,
    env: Optional[dict[str, str]] = None,
) -> Tuple[subprocess.Popen[str], int, list[str]]:
    """
    Launch a server process with the given base command and return the process, port, and full command.

    Args:
        command: Base command to execute
        host: Host to bind to
        port: Port to bind to. If None, a free port is reserved.
        env: Optional environment variables to pass to the subprocess

    Returns:
        Tuple of (process, port, full_command)
    """
    lock_socket: Optional[socket.socket] = None
    if port is None:
        # No explicit port: reserve a free one and hold its lock socket
        port, lock_socket = reserve_port(host)

    full_command = command + ["--port", str(port)]
    logger.info(f"Launching server on port {port}")

    process = execute_shell_command(full_command, env=env)

    # Remember the lock socket so terminate_process() can release the port
    if lock_socket is not None:
        process_socket_map[process] = lock_socket

    return process, port, full_command
|
177
|
+
|
178
|
+
|
179
|
+
def terminate_process(process: subprocess.Popen[str]) -> None:
    """
    Terminate the process and automatically release the reserved port.

    Args:
        process: The process to terminate
    """
    kill_process_tree(process.pid)

    # If a port was reserved for this process, release its lock socket now
    reserved = process_socket_map.pop(process, None)
    if reserved is not None:
        release_port(reserved)
|
191
|
+
|
192
|
+
|
193
|
+
def wait_for_server(
    base_url: str,
    process: subprocess.Popen[str],
    full_command: Optional[list[str]] = None,
    env: Optional[dict[str, str]] = None,
    timeout: Optional[int] = None,
    api_key: Optional[str] = None,
) -> None:
    """
    Wait for the server to be ready by polling the /v1/models endpoint.

    Polls once a second until the endpoint answers 200, raising if the
    timeout elapses or the server process exits first.

    Args:
        base_url: The base URL of the server
        process: The subprocess running the server
        full_command: The full command used to launch the server
        env: The environment variables to use for the request
        timeout: Maximum time to wait in seconds. None means wait forever.
        api_key: The API key to use for the request

    Raises:
        TimeoutError: If the server is not ready within `timeout` seconds.
        RuntimeError: If the server process exits before becoming ready.
    """
    logger.info(f"Waiting for server at {base_url} to become ready...")
    start_time = time.time()

    # Advice appended to error messages to help users debug launch failures
    debug_advice = "Try rerunning with '--log-level debug' to see the full traceback."
    if full_command:
        debug_advice += " Alternatively, you can run the following launch command manually to see the full traceback:\n\n"
        if env:
            debug_advice += " ".join([f"{k}={v}" for k, v in env.items()]) + " "
        debug_advice += " ".join(full_command) + "\n\n"

    while True:
        # Check for timeout first
        if timeout and time.time() - start_time > timeout:
            error_msg = f"Server did not become ready within timeout period ({timeout} seconds). Try increasing the timeout with '-M timeout=...'. {debug_advice}"
            logger.error(error_msg)
            raise TimeoutError(error_msg)

        # A dead process will never become ready
        exit_code = process.poll()
        if exit_code is not None:
            error_msg = f"Server process exited unexpectedly with code {exit_code}. {debug_advice}"
            logger.error(error_msg)
            raise RuntimeError(error_msg)

        try:
            response = httpx.get(
                f"{base_url}/v1/models",
                headers={"Authorization": f"Bearer {api_key or 'None'}"},
                timeout=5.0,  # Short timeout for individual requests
            )
            if response.status_code == 200:
                logger.info("Server is ready.")
                break
            # Non-200 is not fatal yet -- the server may still be warming up
            logger.debug(
                f"Server check returned status {response.status_code}, retrying..."
            )
        except httpx.RequestError as err:
            # Connection refused etc. -- the server simply isn't up yet
            logger.debug(f"Server check failed: {err}, retrying...")

        # Wait before the next poll attempt
        time.sleep(1)
|
256
|
+
|
257
|
+
|
258
|
+
def start_local_server(
    base_cmd: list[str],
    host: str,
    port: Optional[int] = None,
    api_key: Optional[str] = None,
    server_type: str = "server",
    timeout: Optional[int] = DEFAULT_TIMEOUT,
    server_args: Optional[dict[str, Any]] = None,
    env: Optional[dict[str, str]] = None,
) -> Tuple[str, subprocess.Popen[str], int]:
    """
    Start a server with the given command and handle potential errors.

    Args:
        base_cmd: List of base command arguments
        host: Host to bind to
        port: Port to bind to. If None, a free port is reserved.
        api_key: API key to use for server authentication
        server_type: Type of server being started (for error messages)
        timeout: Maximum time to wait for server to become ready
        server_args: Additional server arguments to pass to the command
        env: Optional environment variables to pass to the subprocess
    Returns:
        Tuple of (base_url, process, port)

    Raises:
        RuntimeError: If server fails to start
    """
    # Copy base_cmd: the extend() calls below previously mutated the
    # caller's list because full_command aliased base_cmd directly.
    full_command = list(base_cmd)
    server_process = None

    # Initialize environment variables if not provided
    process_env = {} if env is None else env.copy()

    if server_args:
        for key, value in server_args.items():
            # Convert Python style args (underscore) to CLI style (dash)
            cli_key = key.replace("_", "-")
            if value == "":
                # If the value is empty, just add the flag
                full_command.extend([f"--{cli_key}"])
            else:
                full_command.extend([f"--{cli_key}", str(value)])

    try:
        server_process, found_port, full_command = launch_server_cmd(
            full_command, host=host, port=port, env=process_env
        )
        base_url = f"http://localhost:{found_port}/v1"
        wait_for_server(
            f"http://localhost:{found_port}",
            server_process,
            api_key=api_key,
            timeout=timeout,
            full_command=full_command,
            env=process_env,
        )
        return base_url, server_process, found_port
    except Exception as e:
        # Cleanup any partially started server
        if server_process:
            terminate_process(server_process)

        # Re-raise with more context
        raise RuntimeError(f"Failed to start {server_type} server: {str(e)}") from e
|
323
|
+
|
324
|
+
|
325
|
+
def merge_env_server_args(
    env_var_name: str,
    provided_args: Dict[str, Any],
    logger: logging.Logger,
) -> Dict[str, Any]:
    """
    Load server arguments from an environment variable and merge them with provided arguments.

    Args:
        env_var_name: Name of the environment variable containing JSON server args
        provided_args: Dictionary of server arguments provided by the user
        logger: Logger instance to log messages

    Returns:
        Dictionary of merged server arguments, with provided args taking precedence
    """
    raw = os.environ.get(env_var_name)
    env_server_args: Dict[str, Any] = {}

    if raw:
        try:
            env_server_args = json.loads(raw)
        except json.JSONDecodeError:
            # Malformed JSON is ignored (with a warning) rather than fatal
            logger.warning(f"Failed to parse {env_var_name} as JSON: {raw}")
        else:
            logger.info(
                f"Loaded server args from environment {env_var_name}: {env_server_args}"
            )

    # Provided args win over anything loaded from the environment
    return {**env_server_args, **provided_args}
|
357
|
+
|
358
|
+
|
359
|
+
def configure_devices(
    server_args: dict[str, Any], parallel_size_param: str = "tensor_parallel_size"
) -> tuple[dict[str, Any], dict[str, str]]:
    """Configure device settings and return updated server args and environment variables.

    Pops a "device"/"devices" entry (supplying both is an error) out of the
    server args, translates it into a CUDA_VISIBLE_DEVICES environment
    variable, and defaults `parallel_size_param` to the device count.

    Args:
        server_args: Dictionary of server arguments
        parallel_size_param: Name of parameter to set with device count if not specified

    Returns:
        Tuple of (updated server arguments dict, environment variables dict)

    Raises:
        ValueError: If both "device" and "devices" appear in server_args.
    """
    updated = dict(server_args)
    env_vars: dict[str, str] = {}

    if "device" in updated and "devices" in updated:
        raise ValueError("Cannot specify both device and devices in server args")

    # Prefer "devices", falling back to the singular "device" key
    devices = updated.pop("devices", None)
    if devices is None:
        devices = updated.pop("device", None)

    if devices is not None:
        # Normalize a device list into a comma-separated string
        device_str = (
            ",".join(map(str, devices)) if isinstance(devices, list) else str(devices)
        )

        # Return the variable rather than mutating os.environ directly
        env_vars["CUDA_VISIBLE_DEVICES"] = device_str

        # Default the parallelism parameter to the device count
        updated.setdefault(parallel_size_param, len(device_str.split(",")))

    return updated, env_vars
|
inspect_ai/_util/working.py
CHANGED
@@ -1,6 +1,8 @@
|
|
1
1
|
import time
|
2
2
|
from contextvars import ContextVar
|
3
3
|
|
4
|
+
from inspect_ai.util._limit import LimitExceededError
|
5
|
+
|
4
6
|
|
5
7
|
def init_sample_working_limit(start_time: float, working_limit: float | None) -> None:
|
6
8
|
_sample_working_limit.set(working_limit)
|
@@ -22,6 +24,8 @@ def report_sample_waiting_time(waiting_time: float) -> None:
|
|
22
24
|
|
23
25
|
|
24
26
|
def check_sample_working_limit() -> None:
|
27
|
+
from inspect_ai.log._transcript import SampleLimitEvent, transcript
|
28
|
+
|
25
29
|
# no check if we don't have a limit
|
26
30
|
working_limit = _sample_working_limit.get()
|
27
31
|
if working_limit is None:
|
@@ -31,13 +35,15 @@ def check_sample_working_limit() -> None:
|
|
31
35
|
running_time = time.monotonic() - _sample_start_time.get()
|
32
36
|
working_time = running_time - sample_waiting_time()
|
33
37
|
if working_time > working_limit:
|
34
|
-
|
35
|
-
|
36
|
-
|
38
|
+
message = f"Exceeded working time limit ({working_limit:,} seconds)"
|
39
|
+
transcript()._event(
|
40
|
+
SampleLimitEvent(type="working", limit=int(working_limit), message=message)
|
41
|
+
)
|
42
|
+
raise LimitExceededError(
|
37
43
|
type="working",
|
38
44
|
value=int(working_time),
|
39
45
|
limit=int(working_limit),
|
40
|
-
message=
|
46
|
+
message=message,
|
41
47
|
)
|
42
48
|
|
43
49
|
|