inspect-ai 0.3.93__py3-none-any.whl → 0.3.95__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/_display/textual/widgets/samples.py +3 -3
- inspect_ai/_display/textual/widgets/transcript.py +3 -29
- inspect_ai/_eval/loader.py +1 -1
- inspect_ai/_eval/task/run.py +21 -12
- inspect_ai/_util/answer.py +26 -0
- inspect_ai/_util/constants.py +0 -1
- inspect_ai/_util/exception.py +4 -0
- inspect_ai/_util/hash.py +39 -0
- inspect_ai/_util/local_server.py +51 -21
- inspect_ai/_util/path.py +22 -0
- inspect_ai/_util/trace.py +1 -1
- inspect_ai/_util/working.py +4 -0
- inspect_ai/_view/www/dist/assets/index.css +23 -22
- inspect_ai/_view/www/dist/assets/index.js +517 -204
- inspect_ai/_view/www/log-schema.json +375 -0
- inspect_ai/_view/www/package.json +1 -1
- inspect_ai/_view/www/src/@types/log.d.ts +90 -12
- inspect_ai/_view/www/src/app/log-view/navbar/SecondaryBar.tsx +2 -2
- inspect_ai/_view/www/src/app/log-view/tabs/SamplesTab.tsx +1 -4
- inspect_ai/_view/www/src/app/samples/SamplesTools.tsx +3 -13
- inspect_ai/_view/www/src/app/samples/sample-tools/SelectScorer.tsx +45 -48
- inspect_ai/_view/www/src/app/samples/sample-tools/filters.ts +16 -15
- inspect_ai/_view/www/src/app/samples/sample-tools/sample-filter/SampleFilter.tsx +47 -75
- inspect_ai/_view/www/src/app/samples/sample-tools/sample-filter/completions.ts +9 -9
- inspect_ai/_view/www/src/app/samples/transcript/SandboxEventView.module.css +2 -1
- inspect_ai/_view/www/src/app/samples/transcript/SpanEventView.tsx +174 -0
- inspect_ai/_view/www/src/app/samples/transcript/ToolEventView.tsx +8 -8
- inspect_ai/_view/www/src/app/samples/transcript/TranscriptView.tsx +12 -2
- inspect_ai/_view/www/src/app/samples/transcript/TranscriptVirtualListComponent.module.css +1 -1
- inspect_ai/_view/www/src/app/samples/transcript/event/EventPanel.tsx +0 -3
- inspect_ai/_view/www/src/app/samples/transcript/transform/fixups.ts +87 -25
- inspect_ai/_view/www/src/app/samples/transcript/transform/treeify.ts +229 -17
- inspect_ai/_view/www/src/app/samples/transcript/transform/utils.ts +11 -0
- inspect_ai/_view/www/src/app/samples/transcript/types.ts +5 -1
- inspect_ai/_view/www/src/app/types.ts +12 -2
- inspect_ai/_view/www/src/components/ExpandablePanel.module.css +1 -1
- inspect_ai/_view/www/src/components/ExpandablePanel.tsx +5 -5
- inspect_ai/_view/www/src/state/hooks.ts +19 -3
- inspect_ai/_view/www/src/state/logSlice.ts +23 -5
- inspect_ai/_view/www/yarn.lock +9 -9
- inspect_ai/agent/_as_solver.py +3 -1
- inspect_ai/agent/_as_tool.py +6 -4
- inspect_ai/agent/_bridge/patch.py +1 -3
- inspect_ai/agent/_handoff.py +5 -1
- inspect_ai/agent/_react.py +4 -3
- inspect_ai/agent/_run.py +6 -1
- inspect_ai/agent/_types.py +9 -0
- inspect_ai/analysis/__init__.py +0 -0
- inspect_ai/analysis/beta/__init__.py +57 -0
- inspect_ai/analysis/beta/_dataframe/__init__.py +0 -0
- inspect_ai/analysis/beta/_dataframe/columns.py +145 -0
- inspect_ai/analysis/beta/_dataframe/evals/__init__.py +0 -0
- inspect_ai/analysis/beta/_dataframe/evals/columns.py +132 -0
- inspect_ai/analysis/beta/_dataframe/evals/extract.py +23 -0
- inspect_ai/analysis/beta/_dataframe/evals/table.py +140 -0
- inspect_ai/analysis/beta/_dataframe/events/__init__.py +0 -0
- inspect_ai/analysis/beta/_dataframe/events/columns.py +37 -0
- inspect_ai/analysis/beta/_dataframe/events/table.py +14 -0
- inspect_ai/analysis/beta/_dataframe/extract.py +54 -0
- inspect_ai/analysis/beta/_dataframe/messages/__init__.py +0 -0
- inspect_ai/analysis/beta/_dataframe/messages/columns.py +60 -0
- inspect_ai/analysis/beta/_dataframe/messages/extract.py +21 -0
- inspect_ai/analysis/beta/_dataframe/messages/table.py +87 -0
- inspect_ai/analysis/beta/_dataframe/record.py +377 -0
- inspect_ai/analysis/beta/_dataframe/samples/__init__.py +0 -0
- inspect_ai/analysis/beta/_dataframe/samples/columns.py +73 -0
- inspect_ai/analysis/beta/_dataframe/samples/extract.py +82 -0
- inspect_ai/analysis/beta/_dataframe/samples/table.py +329 -0
- inspect_ai/analysis/beta/_dataframe/util.py +157 -0
- inspect_ai/analysis/beta/_dataframe/validate.py +171 -0
- inspect_ai/dataset/_dataset.py +6 -3
- inspect_ai/log/__init__.py +10 -0
- inspect_ai/log/_convert.py +4 -9
- inspect_ai/log/_file.py +1 -1
- inspect_ai/log/_log.py +21 -1
- inspect_ai/log/_samples.py +14 -17
- inspect_ai/log/_transcript.py +77 -35
- inspect_ai/log/_tree.py +118 -0
- inspect_ai/model/_call_tools.py +44 -35
- inspect_ai/model/_model.py +51 -44
- inspect_ai/model/_openai_responses.py +17 -18
- inspect_ai/model/_providers/anthropic.py +30 -5
- inspect_ai/model/_providers/hf.py +27 -1
- inspect_ai/model/_providers/providers.py +1 -1
- inspect_ai/model/_providers/sglang.py +8 -2
- inspect_ai/model/_providers/vllm.py +6 -2
- inspect_ai/scorer/_choice.py +1 -2
- inspect_ai/solver/_chain.py +1 -1
- inspect_ai/solver/_fork.py +1 -1
- inspect_ai/solver/_multiple_choice.py +9 -23
- inspect_ai/solver/_plan.py +2 -2
- inspect_ai/solver/_task_state.py +7 -3
- inspect_ai/solver/_transcript.py +6 -7
- inspect_ai/tool/_mcp/_context.py +3 -5
- inspect_ai/tool/_mcp/_mcp.py +6 -5
- inspect_ai/tool/_mcp/server.py +1 -1
- inspect_ai/tool/_tools/_execute.py +4 -1
- inspect_ai/tool/_tools/_think.py +1 -1
- inspect_ai/tool/_tools/_web_search/__init__.py +3 -0
- inspect_ai/tool/_tools/{_web_search.py → _web_search/_google.py} +56 -103
- inspect_ai/tool/_tools/_web_search/_tavily.py +77 -0
- inspect_ai/tool/_tools/_web_search/_web_search.py +85 -0
- inspect_ai/util/__init__.py +4 -0
- inspect_ai/util/_anyio.py +11 -0
- inspect_ai/util/_collect.py +50 -0
- inspect_ai/util/_sandbox/events.py +3 -2
- inspect_ai/util/_span.py +58 -0
- inspect_ai/util/_subtask.py +27 -42
- {inspect_ai-0.3.93.dist-info → inspect_ai-0.3.95.dist-info}/METADATA +8 -1
- {inspect_ai-0.3.93.dist-info → inspect_ai-0.3.95.dist-info}/RECORD +114 -82
- {inspect_ai-0.3.93.dist-info → inspect_ai-0.3.95.dist-info}/WHEEL +1 -1
- inspect_ai/_display/core/group.py +0 -79
- {inspect_ai-0.3.93.dist-info → inspect_ai-0.3.95.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.93.dist-info → inspect_ai-0.3.95.dist-info}/licenses/LICENSE +0 -0
- {inspect_ai-0.3.93.dist-info → inspect_ai-0.3.95.dist-info}/top_level.txt +0 -0
inspect_ai/_display/textual/widgets/samples.py
CHANGED
@@ -591,10 +591,10 @@ class SampleToolbar(Horizontal):
             )
         if isinstance(last_event, ModelEvent):
             # see if there are retries in play
-            if
-                suffix = "retry" if
+            if last_event.retries:
+                suffix = "retry" if last_event.retries == 1 else "retries"
                 pending_caption_text = (
-                    f"Generating ({
+                    f"Generating ({last_event.retries:,} {suffix})..."
                 )
             else:
                 pending_caption_text = "Generating..."
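The new caption logic above only reports retries when `last_event.retries` is truthy. In isolation, the pluralization behaves like this (a standalone sketch, not the widget's actual code):

    retries = 3
    suffix = "retry" if retries == 1 else "retries"
    caption = f"Generating ({retries:,} {suffix})..."
    assert caption == "Generating (3 retries)..."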
inspect_ai/_display/textual/widgets/transcript.py
CHANGED
@@ -30,7 +30,7 @@ from inspect_ai.log._transcript import (
     SampleInitEvent,
     SampleLimitEvent,
     ScoreEvent,
-    StepEvent,
+    SpanBeginEvent,
     SubtaskEvent,
     ToolEvent,
 )
@@ -211,10 +211,6 @@ def render_tool_event(event: ToolEvent) -> list[EventDisplay]:
     # render the call
     content = transcript_tool_call(event)
 
-    # render sub-events
-    if event.events:
-        content.extend(render_sub_events(event.events))
-
     # render the output
     if isinstance(event.result, list):
         result: ToolResult = "\n".join(
@@ -235,23 +231,6 @@ def render_tool_event(event: ToolEvent) -> list[EventDisplay]:
     return [EventDisplay("tool call", Group(*content))]
 
 
-def render_step_event(event: StepEvent) -> EventDisplay:
-    if event.type == "solver":
-        return render_solver_event(event)
-    if event.type == "scorer":
-        return render_scorer_event(event)
-    else:
-        return EventDisplay(step_title(event))
-
-
-def render_solver_event(event: StepEvent) -> EventDisplay:
-    return EventDisplay(step_title(event))
-
-
-def render_scorer_event(event: StepEvent) -> EventDisplay:
-    return EventDisplay(step_title(event))
-
-
 def render_score_event(event: ScoreEvent) -> EventDisplay:
     table = Table(box=None, show_header=False)
     table.add_column("", min_width=10, justify="left")
@@ -272,10 +251,6 @@ def render_subtask_event(event: SubtaskEvent) -> list[EventDisplay]:
     # render header
     content: list[RenderableType] = [transcript_function(event.name, event.input)]
 
-    # render sub-events
-    if event.events:
-        content.extend(render_sub_events(event.events))
-
     if event.result:
         content.append(Text())
         if isinstance(event.result, str | int | float | bool | None):
@@ -345,8 +320,8 @@ def render_message(message: ChatMessage) -> list[RenderableType]:
     return content
 
 
-def
-    return f"{event.type or '
+def span_title(event: SpanBeginEvent) -> str:
+    return f"{event.type or 'span'}: {event.name}"
 
 
 EventRenderer = Callable[[Any], EventDisplay | list[EventDisplay] | None]
@@ -354,7 +329,6 @@ EventRenderer = Callable[[Any], EventDisplay | list[EventDisplay] | None]
 _renderers: list[tuple[Type[Event], EventRenderer]] = [
     (SampleInitEvent, render_sample_init_event),
     (SampleLimitEvent, render_sample_limit_event),
-    (StepEvent, render_step_event),
     (ModelEvent, render_model_event),
     (ToolEvent, render_tool_event),
     (SubtaskEvent, render_subtask_event),
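With `StepEvent` rendering removed, the `_renderers` table above is the single dispatch point for transcript events. Below is a minimal sketch of how such a (type, renderer) table is typically consumed; the `render_event` wrapper is an assumption for illustration, not the module's actual dispatcher:

    def render_event(event: Event) -> EventDisplay | list[EventDisplay] | None:
        # first matching entry in the table wins
        for event_type, renderer in _renderers:
            if isinstance(event, event_type):
                return renderer(event)
        return None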
inspect_ai/_eval/loader.py
CHANGED
@@ -428,7 +428,7 @@ def solver_from_spec(spec: SolverSpec) -> Solver:
             return as_solver(agent)
         else:
             raise ValueError(
-                f"
+                f"Unknown solver {solver_name} (not registered as a @solver or @agent)"
             )
 
     # we do have a solver file
inspect_ai/_eval/task/run.py
CHANGED
@@ -24,10 +24,10 @@ from inspect_ai._util._async import tg_collect
 from inspect_ai._util.constants import (
     DEFAULT_EPOCHS,
     DEFAULT_MAX_CONNECTIONS,
-    SAMPLE_SUBTASK,
 )
 from inspect_ai._util.datetime import iso_now
 from inspect_ai._util.error import exception_message
+from inspect_ai._util.exception import TerminateSampleError
 from inspect_ai._util.hooks import send_telemetry
 from inspect_ai._util.json import to_json_str_safe
 from inspect_ai._util.registry import (
@@ -36,6 +36,7 @@ from inspect_ai._util.registry import (
     registry_unqualified_name,
 )
 from inspect_ai._util.working import (
+    end_sample_working_limit,
     init_sample_working_limit,
     sample_waiting_time,
 )
@@ -65,8 +66,8 @@ from inspect_ai.log._transcript import (
     SampleInitEvent,
     SampleLimitEvent,
     ScoreEvent,
-    StepEvent,
     Transcript,
+    init_transcript,
     transcript,
 )
 from inspect_ai.model import (
@@ -91,7 +92,8 @@ from inspect_ai.solver._task_state import sample_state, set_sample_state, state_
 from inspect_ai.util._limit import LimitExceededError
 from inspect_ai.util._sandbox.context import sandbox_connections
 from inspect_ai.util._sandbox.environment import SandboxEnvironmentSpec
-from inspect_ai.util.
+from inspect_ai.util._span import span
+from inspect_ai.util._store import init_subtask_store
 
 from ..context import init_task_context
 from ..task import Task
@@ -558,7 +560,9 @@ async def task_run_sample(
     # initialise subtask and scoring context
     init_sample_model_usage()
     set_sample_state(state)
-    sample_transcript
+    sample_transcript = Transcript()
+    init_transcript(sample_transcript)
+    init_subtask_store(state.store)
     if logger:
         sample_transcript._subscribe(
             lambda event: logger.log_sample_event(sample_id, state.epoch, event)
@@ -617,7 +621,8 @@
     results: dict[str, SampleScore] = {}
     try:
         # begin init
-
+        init_span = span("init", type="init")
+        await init_span.__aenter__()
 
         # sample init event (remove file bodies as they have content or absolute paths)
         event_sample = sample.model_copy(
@@ -636,10 +641,11 @@
         ) = contextlib.nullcontext()
         try:
             # update active sample wth sandboxes now that we are initialised
-
-
-
-
+            # (ensure that we still exit init context in presence of sandbox error)
+            try:
+                active.sandboxes = await sandbox_connections()
+            finally:
+                await init_span.__aexit__(None, None, None)
 
             # initialise timeout context manager
             timeout_cm = (
@@ -671,6 +677,9 @@
             # set progress for plan then run it
             state = await plan(state, generate)
 
+            # disable sample working limit after execution
+            end_sample_working_limit()
+
         except TimeoutError:
             if time_limit is not None:
                 transcript()._event(
@@ -712,7 +721,7 @@
             # handle the cancel exception
             raise
 
-        except LimitExceededError:
+        except (LimitExceededError, TerminateSampleError):
             # capture most recent state for scoring
             state = sample_state() or state
 
@@ -742,7 +751,7 @@
                     scorer_name = unique_scorer_name(
                         scorer, list(results.keys())
                     )
-                    with
+                    async with span(name=scorer_name, type="scorer"):
                         score_result = (
                             await scorer(state, Target(sample.target))
                             if scorer
@@ -922,7 +931,7 @@ async def log_sample(
         input=sample.input,
         choices=sample.choices,
         target=sample.target,
-        metadata=
+        metadata=state.metadata or {},
         sandbox=sample.sandbox,
         files=list(sample.files.keys()) if sample.files else None,
         setup=sample.setup,
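The `init_span` edits above drive an async context manager manually, so the span can be entered early in the function and is still guaranteed to close once sandbox connections resolve (or fail). A self-contained sketch of that pattern with demo names, not inspect_ai APIs:

    import asyncio
    import contextlib

    @contextlib.asynccontextmanager
    async def demo_span(name: str):
        print(f"begin {name}")
        try:
            yield
        finally:
            print(f"end {name}")

    async def main() -> None:
        cm = demo_span("init")
        await cm.__aenter__()
        try:
            pass  # work that may raise, e.g. sandbox_connections() above
        finally:
            # exit is guaranteed even if the work raises
            await cm.__aexit__(None, None, None)

    asyncio.run(main())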
inspect_ai/_util/answer.py
ADDED
@@ -0,0 +1,26 @@
+def answer_character(index: int) -> str:
+    r"""
+    Helper to go from array index to char, for example:
+
+    0 -> 'A', 1 -> 'B', etc
+    """
+    if index < 26:
+        return chr(ord("A") + index)
+    else:
+        return str(index - 25)
+
+
+def answer_index(char: str) -> int:
+    r"""
+    Helper to go from char to array index, for example:
+
+    'A' -> 0, 'B' -> 1, etc
+    """
+    if char.isalpha() or char == "," or char == " ":
+        return ord(char.upper()) - ord("A")
+    elif char.isnumeric():
+        return 25 + int(char)
+    else:
+        raise ValueError(
+            f"Unepxected multiple choice answer: {char} (must be a letter or number)"
+        )
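From the definitions above, indices 0-25 map to letters and 26 onward map to numeric strings, with `answer_index` as the inverse:

    assert answer_character(0) == "A"
    assert answer_character(25) == "Z"
    assert answer_character(26) == "1"
    assert answer_index("B") == 1
    assert answer_index("3") == 28  # numeric answers continue after the 26 letters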
inspect_ai/_util/constants.py
CHANGED
inspect_ai/_util/hash.py
CHANGED
@@ -1,3 +1,5 @@
+import hashlib
+
 import mmh3
 
 
@@ -7,3 +9,40 @@ def mm3_hash(message: str) -> str:
 
     # Convert to unsigned integers and then to hexadecimal
     return f"{h1 & 0xFFFFFFFFFFFFFFFF:016x}{h2 & 0xFFFFFFFFFFFFFFFF:016x}"
+
+
+def base57_id_hash(content: str) -> str:
+    """Generate base67 hash for content.
+
+    Hash the content, truncate to 128 bits, and then further truncate to 93 bits,
+    returning a 22-character Base-57-URL string. Collision probability reaches 50%
+    at approximately 70 trillion items.
+    """
+    digest_size = 16  # 128 bits
+    digest = hashlib.blake2s(content.encode(), digest_size=digest_size).digest()
+
+    # Truncate to ~93 bits (log₂57^22 ≈ 128.3)
+    as_int = int.from_bytes(digest, "big")
+    base57_str = to_base57(as_int)
+    if len(base57_str) > 22:
+        return base57_str[-22:]  # Take last 22 chars if longer
+    else:
+        # This is unlikely with a 128-bit input
+        return base57_str.rjust(22, ALPHABET57[0])
+
+
+# shortuuid uses these 57 characters (excluding similar-looking characters like 0/O, 1/I/l, etc.)
+ALPHABET57 = "23456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz"
+
+
+def to_base57(n: int) -> str:
+    if n == 0:
+        return ALPHABET57[0]
+
+    out = []
+    while n:
+        n, rem = divmod(n, 57)
+        out.append(ALPHABET57[rem])
+
+    # reverse and return
+    return "".join(reversed(out))
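A quick check of the new helper: `base57_id_hash` is deterministic and always produces 22 characters drawn from `ALPHABET57` (57^22 is slightly larger than 2^128, so the 128-bit blake2s digest always fits):

    id1 = base57_id_hash("hello world")
    assert id1 == base57_id_hash("hello world")
    assert len(id1) == 22 and all(c in ALPHABET57 for c in id1)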
inspect_ai/_util/local_server.py
CHANGED
@@ -62,16 +62,24 @@ def release_port(lock_socket: socket.socket) -> None:
         logger.error(f"Error closing socket: {e}")
 
 
-def execute_shell_command(command: list[str]) -> subprocess.Popen[str]:
+def execute_shell_command(
+    command: list[str], env: Optional[dict[str, str]] = None
+) -> subprocess.Popen[str]:
     """
     Execute a command and return its process handle.
 
     Args:
        command: List of command arguments
+        env: Optional environment variables to pass to the subprocess
 
     Returns:
        A subprocess.Popen object representing the running process
     """
+    # Create a process environment by copying current environment and updating with new values
+    process_env = os.environ.copy()
+    if env:
+        process_env.update(env)
+
     # Create a process that redirects output to pipes so we can capture it
     process = subprocess.Popen(
         command,
@@ -79,6 +87,7 @@ def execute_shell_command(command: list[str]) -> subprocess.Popen[str]:
         stdout=subprocess.PIPE,
         stderr=subprocess.PIPE,
         bufsize=1, # Line buffered
+        env=process_env, # Pass the environment variables
     )
 
     # Set up background thread to read and log stdout
@@ -134,7 +143,10 @@ def kill_process_tree(pid: int) -> None:
 
 
 def launch_server_cmd(
-    command: list[str],
+    command: list[str],
+    host: str = "0.0.0.0",
+    port: Optional[int] = None,
+    env: Optional[dict[str, str]] = None,
 ) -> Tuple[subprocess.Popen[str], int, list[str]]:
     """
     Launch a server process with the given base command and return the process, port, and full command.
@@ -143,6 +155,7 @@ def launch_server_cmd(
        command: Base command to execute
        host: Host to bind to
        port: Port to bind to. If None, a free port is reserved.
+        env: Optional environment variables to pass to the subprocess
 
     Returns:
        Tuple of (process, port, full_command)
@@ -155,7 +168,7 @@ def launch_server_cmd(
     full_command = command + ["--port", str(port)]
     logger.info(f"Launching server on port {port}")
 
-    process = execute_shell_command(full_command)
+    process = execute_shell_command(full_command, env=env)
 
     if lock_socket is not None:
         process_socket_map[process] = lock_socket
@@ -181,6 +194,7 @@ def wait_for_server(
     base_url: str,
     process: subprocess.Popen[str],
     full_command: Optional[list[str]] = None,
+    env: Optional[dict[str, str]] = None,
     timeout: Optional[int] = None,
     api_key: Optional[str] = None,
 ) -> None:
@@ -191,6 +205,7 @@ def wait_for_server(
        base_url: The base URL of the server
        process: The subprocess running the server
        full_command: The full command used to launch the server
+        env: The environment variables to use for the request
        timeout: Maximum time to wait in seconds. None means wait forever.
        api_key: The API key to use for the request
     """
@@ -198,7 +213,10 @@
     start_time = time.time()
     debug_advice = "Try rerunning with '--log-level debug' to see the full traceback."
     if full_command:
-        debug_advice +=
+        debug_advice += " Alternatively, you can run the following launch command manually to see the full traceback:\n\n"
+        if env:
+            debug_advice += " ".join([f"{k}={v}" for k, v in env.items()]) + " "
+        debug_advice += " ".join(full_command) + "\n\n"
 
     while True:
         # Check for timeout first
@@ -245,6 +263,7 @@ def start_local_server(
     server_type: str = "server",
     timeout: Optional[int] = DEFAULT_TIMEOUT,
     server_args: Optional[dict[str, Any]] = None,
+    env: Optional[dict[str, str]] = None,
 ) -> Tuple[str, subprocess.Popen[str], int]:
     """
     Start a server with the given command and handle potential errors.
@@ -257,6 +276,7 @@
        server_type: Type of server being started (for error messages)
        timeout: Maximum time to wait for server to become ready
        server_args: Additional server arguments to pass to the command
+        env: Optional environment variables to pass to the subprocess
     Returns:
        Tuple of (base_url, process, port)
 
@@ -266,15 +286,22 @@
     full_command = base_cmd
     server_process = None
 
+    # Initialize environment variables if not provided
+    process_env = {} if env is None else env.copy()
+
     if server_args:
         for key, value in server_args.items():
             # Convert Python style args (underscore) to CLI style (dash)
             cli_key = key.replace("_", "-")
-
+            if value == "":
+                # If the value is empty, just add the flag
+                full_command.extend([f"--{cli_key}"])
+            else:
+                full_command.extend([f"--{cli_key}", str(value)])
 
     try:
         server_process, found_port, full_command = launch_server_cmd(
-            full_command, host=host, port=port
+            full_command, host=host, port=port, env=process_env
         )
         base_url = f"http://localhost:{found_port}/v1"
         wait_for_server(
@@ -283,6 +310,7 @@
             api_key=api_key,
             timeout=timeout,
             full_command=full_command,
+            env=process_env,
         )
         return base_url, server_process, found_port
     except Exception as e:
@@ -330,17 +358,18 @@ def merge_env_server_args(
 
 def configure_devices(
     server_args: dict[str, Any], parallel_size_param: str = "tensor_parallel_size"
-) -> dict[str, Any]:
-    """Configure device settings and return updated server args.
+) -> tuple[dict[str, Any], dict[str, str]]:
+    """Configure device settings and return updated server args and environment variables.
 
     Args:
        server_args: Dictionary of server arguments
        parallel_size_param: Name of parameter to set with device count if not specified
 
     Returns:
-
+        Tuple of (updated server arguments dict, environment variables dict)
     """
     result = server_args.copy()
+    env_vars = {}
 
     devices = None
     if "device" in result and "devices" in result:
@@ -350,19 +379,20 @@
     elif "device" in result:
         devices = result.pop("device")
 
-
-
-
-
-
+    if devices is not None:
+        # Convert device list to comma-separated string if needed
+        if isinstance(devices, list):
+            device_str = ",".join(map(str, devices))
+        else:
+            device_str = str(devices)
 
-
-
+        # Add to env_vars instead of setting os.environ directly
+        env_vars["CUDA_VISIBLE_DEVICES"] = device_str
 
-
+        device_count = len(device_str.split(","))
 
-
-
-
+        # Set parallel size parameter if not explicitly provided
+        if parallel_size_param not in result:
+            result[parallel_size_param] = device_count
 
-    return result
+    return result, env_vars
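With this change `configure_devices` no longer mutates `os.environ`; it hands the variables back so callers can thread them through the new `env` parameters on `launch_server_cmd`/`start_local_server`. Expected behavior, per the code above (values illustrative):

    server_args, env_vars = configure_devices({"device": [0, 1]})
    assert server_args == {"tensor_parallel_size": 2}
    assert env_vars == {"CUDA_VISIBLE_DEVICES": "0,1"}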
inspect_ai/_util/path.py
CHANGED
@@ -6,6 +6,10 @@ from copy import deepcopy
 from pathlib import PurePath
 from typing import Any, Iterator, overload
 
+from fsspec.implementations.local import LocalFileSystem # type: ignore
+
+from inspect_ai._util.file import filesystem
+
 
 @contextmanager
 def add_to_path(p: str) -> Iterator[None]:
@@ -98,6 +102,24 @@ def cwd_relative_path(file: str | None, walk_up: bool = False) -> str | None:
     return None
 
 
+def pretty_path(file: str) -> str:
+    fs = filesystem(file)
+    if fs.is_local():
+        file = LocalFileSystem._strip_protocol(file)
+        return cwd_relative_path(file)
+    else:
+        return file
+
+
+def native_path(file: str) -> str:
+    fs = filesystem(file)
+    if fs.is_local():
+        file = LocalFileSystem._strip_protocol(file)
+        return file
+    else:
+        return file
+
+
 # A slightly modified implementation of task_path.relative(d, walk_up=True)
 # since that wasn't introduced until python 3.12
 def relative_walk(from_path: PurePath, to_path: PurePath) -> str:
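Both helpers pass remote URLs through unchanged and strip the `file://` protocol for local files; `pretty_path` additionally rewrites local files relative to the working directory. Illustrative behavior (paths assumed):

    # with cwd /home/user/project:
    pretty_path("file:///home/user/project/logs/run.eval")  # -> "logs/run.eval"
    pretty_path("s3://bucket/logs/run.eval")                # -> "s3://bucket/logs/run.eval"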
inspect_ai/_util/trace.py
CHANGED
inspect_ai/_util/working.py
CHANGED
@@ -10,6 +10,10 @@ def init_sample_working_limit(start_time: float, working_limit: float | None) ->
     _sample_waiting_time.set(0)
 
 
+def end_sample_working_limit() -> None:
+    _sample_working_limit.set(None)
+
+
 def sample_waiting_time() -> float:
     return _sample_waiting_time.get()
 
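`end_sample_working_limit` presumably clears a module-level ContextVar, which is why it can disable enforcement mid-sample without affecting other concurrently running samples. A toy model of that mechanism (the ContextVar declaration is an assumption mirroring the module's naming):

    from contextvars import ContextVar

    _sample_working_limit: ContextVar[float | None] = ContextVar(
        "sample_working_limit", default=None
    )

    def end_sample_working_limit() -> None:
        # a None limit means working time is no longer enforced
        _sample_working_limit.set(None)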
inspect_ai/_view/www/dist/assets/index.css
CHANGED
@@ -15489,34 +15489,34 @@ pre[class*="language-"] {
   padding: 0.1rem 0.6rem;
   border-radius: var(--bs-border-radius);
 }
-.
+._expandableBordered_59eal_1 {
   border: solid var(--bs-light-border-subtle) 1px;
 }
 
-.
+._expandableTogglable_59eal_5 {
   margin-bottom: 1em;
 }
 
-.
+._expandableContents_59eal_9 {
   font-size: var(--inspect-font-size-base);
 }
 
-.
+._expandableCollapsed_59eal_13 {
   overflow: hidden;
 }
 
-.
+._moreToggle_59eal_17 {
   display: flex;
   margin-top: 0;
   position: relative;
-  height:
+  height: 18px;
 }
 
-.
+._moreToggle_59eal_17._bordered_59eal_24 {
   border-top: solid var(--bs-light-border-subtle) 1px;
 }
 
-.
+._moreToggleContainer_59eal_28 {
   position: absolute;
   top: -1px;
   right: 0;
@@ -15527,7 +15527,7 @@ pre[class*="language-"] {
   margin-right: 0;
 }
 
-.
+._moreToggleButton_59eal_39 {
   font-size: var(--inspect-font-size-smaller);
   border: none;
   padding: 0.1rem 0.5rem;
@@ -17342,37 +17342,38 @@ pre[class*="language-"] {
 ._metadata_1a3fk_21 {
   margin: 0.5em 0;
 }
-.
+._contents_1irga_1 {
   margin-top: 0.5em;
 }
 
-.
+._contents_1irga_1 > :last-child {
   margin-bottom: 0;
 }
 
-.
+._twoColumn_1irga_9 {
   display: grid;
   grid-template-columns: auto 1fr;
   column-gap: 1.5em;
 }
 
-.
-  margin-top: 0
+._exec_1irga_15 {
+  margin-top: 0;
 }
 
-.
+._result_1irga_19 {
   margin-top: 0.5em;
 }
 
-.
+._fileLabel_1irga_23 {
   margin-top: 0;
   margin-bottom: 0;
 }
 
-.
+._wrapPre_1irga_28 {
   white-space: pre-wrap;
   word-wrap: break-word;
   overflow-wrap: break-word;
+  margin-bottom: 0;
 }
 ._explanation_1ww42_1 {
   display: grid;
@@ -20001,20 +20002,20 @@ span.ap-marker-container:hover span.ap-marker {
   padding-top: 0rem;
   margin-top: -8px;
 }
-.
+._darkenedBg_u9na2_1 {
   background-color: var(--bs-light-bg-subtle);
 }
 
-.
+._normalBg_u9na2_5 {
   background-color: var(--bs-body-bg);
 }
 
-.
+._node_u9na2_9 {
   padding-top: 0.7rem;
-  padding-bottom:
+  padding-bottom: 1px;
 }
 
-.
+._attached_u9na2_14 {
   padding-top: 0rem;
   margin-top: -8px;
 }
|