inspect-ai 0.3.94__py3-none-any.whl → 0.3.96__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/_eval/loader.py +1 -1
- inspect_ai/_eval/task/run.py +12 -6
- inspect_ai/_util/exception.py +4 -0
- inspect_ai/_util/hash.py +39 -0
- inspect_ai/_util/local_server.py +16 -0
- inspect_ai/_util/path.py +22 -0
- inspect_ai/_util/trace.py +1 -1
- inspect_ai/_util/working.py +4 -0
- inspect_ai/_view/www/dist/assets/index.css +9 -9
- inspect_ai/_view/www/dist/assets/index.js +117 -120
- inspect_ai/_view/www/package.json +1 -1
- inspect_ai/_view/www/src/app/log-view/navbar/SecondaryBar.tsx +2 -2
- inspect_ai/_view/www/src/app/log-view/tabs/SamplesTab.tsx +1 -4
- inspect_ai/_view/www/src/app/samples/SamplesTools.tsx +3 -13
- inspect_ai/_view/www/src/app/samples/sample-tools/SelectScorer.tsx +45 -48
- inspect_ai/_view/www/src/app/samples/sample-tools/filters.ts +16 -15
- inspect_ai/_view/www/src/app/samples/sample-tools/sample-filter/SampleFilter.tsx +47 -75
- inspect_ai/_view/www/src/app/samples/sample-tools/sample-filter/completions.ts +9 -9
- inspect_ai/_view/www/src/app/types.ts +12 -2
- inspect_ai/_view/www/src/components/ExpandablePanel.module.css +1 -1
- inspect_ai/_view/www/src/components/ExpandablePanel.tsx +5 -5
- inspect_ai/_view/www/src/state/hooks.ts +19 -3
- inspect_ai/_view/www/src/state/logSlice.ts +23 -5
- inspect_ai/_view/www/yarn.lock +9 -9
- inspect_ai/agent/_bridge/patch.py +1 -3
- inspect_ai/agent/_types.py +1 -1
- inspect_ai/analysis/__init__.py +0 -0
- inspect_ai/analysis/beta/__init__.py +67 -0
- inspect_ai/analysis/beta/_dataframe/__init__.py +0 -0
- inspect_ai/analysis/beta/_dataframe/columns.py +145 -0
- inspect_ai/analysis/beta/_dataframe/evals/__init__.py +0 -0
- inspect_ai/analysis/beta/_dataframe/evals/columns.py +132 -0
- inspect_ai/analysis/beta/_dataframe/evals/extract.py +23 -0
- inspect_ai/analysis/beta/_dataframe/evals/table.py +177 -0
- inspect_ai/analysis/beta/_dataframe/events/__init__.py +0 -0
- inspect_ai/analysis/beta/_dataframe/events/columns.py +87 -0
- inspect_ai/analysis/beta/_dataframe/events/extract.py +26 -0
- inspect_ai/analysis/beta/_dataframe/events/table.py +100 -0
- inspect_ai/analysis/beta/_dataframe/extract.py +73 -0
- inspect_ai/analysis/beta/_dataframe/messages/__init__.py +0 -0
- inspect_ai/analysis/beta/_dataframe/messages/columns.py +60 -0
- inspect_ai/analysis/beta/_dataframe/messages/extract.py +21 -0
- inspect_ai/analysis/beta/_dataframe/messages/table.py +79 -0
- inspect_ai/analysis/beta/_dataframe/progress.py +26 -0
- inspect_ai/analysis/beta/_dataframe/record.py +377 -0
- inspect_ai/analysis/beta/_dataframe/samples/__init__.py +0 -0
- inspect_ai/analysis/beta/_dataframe/samples/columns.py +77 -0
- inspect_ai/analysis/beta/_dataframe/samples/extract.py +54 -0
- inspect_ai/analysis/beta/_dataframe/samples/table.py +370 -0
- inspect_ai/analysis/beta/_dataframe/util.py +160 -0
- inspect_ai/analysis/beta/_dataframe/validate.py +171 -0
- inspect_ai/log/_file.py +10 -3
- inspect_ai/log/_log.py +21 -1
- inspect_ai/model/_call_tools.py +2 -1
- inspect_ai/model/_model.py +6 -4
- inspect_ai/model/_openai_responses.py +17 -18
- inspect_ai/model/_providers/anthropic.py +30 -5
- inspect_ai/model/_providers/providers.py +1 -1
- inspect_ai/solver/_multiple_choice.py +4 -1
- inspect_ai/solver/_task_state.py +8 -4
- inspect_ai/tool/_mcp/_context.py +3 -5
- inspect_ai/tool/_mcp/_sandbox.py +17 -14
- inspect_ai/tool/_mcp/server.py +1 -1
- inspect_ai/tool/_tools/_think.py +1 -1
- inspect_ai/tool/_tools/_web_search/__init__.py +3 -0
- inspect_ai/tool/_tools/{_web_search.py → _web_search/_google.py} +56 -103
- inspect_ai/tool/_tools/_web_search/_tavily.py +77 -0
- inspect_ai/tool/_tools/_web_search/_web_search.py +85 -0
- inspect_ai/util/_sandbox/events.py +3 -2
- {inspect_ai-0.3.94.dist-info → inspect_ai-0.3.96.dist-info}/METADATA +9 -2
- {inspect_ai-0.3.94.dist-info → inspect_ai-0.3.96.dist-info}/RECORD +75 -46
- {inspect_ai-0.3.94.dist-info → inspect_ai-0.3.96.dist-info}/WHEEL +1 -1
- {inspect_ai-0.3.94.dist-info → inspect_ai-0.3.96.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.94.dist-info → inspect_ai-0.3.96.dist-info}/licenses/LICENSE +0 -0
- {inspect_ai-0.3.94.dist-info → inspect_ai-0.3.96.dist-info}/top_level.txt +0 -0
inspect_ai/_eval/loader.py
CHANGED
@@ -428,7 +428,7 @@ def solver_from_spec(spec: SolverSpec) -> Solver:
|
|
428
428
|
return as_solver(agent)
|
429
429
|
else:
|
430
430
|
raise ValueError(
|
431
|
-
f"
|
431
|
+
f"Unknown solver {solver_name} (not registered as a @solver or @agent)"
|
432
432
|
)
|
433
433
|
|
434
434
|
# we do have a solver file
|
inspect_ai/_eval/task/run.py
CHANGED
@@ -27,6 +27,7 @@ from inspect_ai._util.constants import (
|
|
27
27
|
)
|
28
28
|
from inspect_ai._util.datetime import iso_now
|
29
29
|
from inspect_ai._util.error import exception_message
|
30
|
+
from inspect_ai._util.exception import TerminateSampleError
|
30
31
|
from inspect_ai._util.hooks import send_telemetry
|
31
32
|
from inspect_ai._util.json import to_json_str_safe
|
32
33
|
from inspect_ai._util.registry import (
|
@@ -35,6 +36,7 @@ from inspect_ai._util.registry import (
|
|
35
36
|
registry_unqualified_name,
|
36
37
|
)
|
37
38
|
from inspect_ai._util.working import (
|
39
|
+
end_sample_working_limit,
|
38
40
|
init_sample_working_limit,
|
39
41
|
sample_waiting_time,
|
40
42
|
)
|
@@ -639,10 +641,11 @@ async def task_run_sample(
|
|
639
641
|
) = contextlib.nullcontext()
|
640
642
|
try:
|
641
643
|
# update active sample with sandboxes now that we are initialised
|
642
|
-
|
643
|
-
|
644
|
-
|
645
|
-
|
644
|
+
# (ensure that we still exit init context in presence of sandbox error)
|
645
|
+
try:
|
646
|
+
active.sandboxes = await sandbox_connections()
|
647
|
+
finally:
|
648
|
+
await init_span.__aexit__(None, None, None)
|
646
649
|
|
647
650
|
# initialise timeout context manager
|
648
651
|
timeout_cm = (
|
@@ -674,6 +677,9 @@ async def task_run_sample(
|
|
674
677
|
# set progress for plan then run it
|
675
678
|
state = await plan(state, generate)
|
676
679
|
|
680
|
+
# disable sample working limit after execution
|
681
|
+
end_sample_working_limit()
|
682
|
+
|
677
683
|
except TimeoutError:
|
678
684
|
if time_limit is not None:
|
679
685
|
transcript()._event(
|
@@ -715,7 +721,7 @@ async def task_run_sample(
|
|
715
721
|
# handle the cancel exception
|
716
722
|
raise
|
717
723
|
|
718
|
-
except LimitExceededError:
|
724
|
+
except (LimitExceededError, TerminateSampleError):
|
719
725
|
# capture most recent state for scoring
|
720
726
|
state = sample_state() or state
|
721
727
|
|
@@ -925,7 +931,7 @@ async def log_sample(
|
|
925
931
|
input=sample.input,
|
926
932
|
choices=sample.choices,
|
927
933
|
target=sample.target,
|
928
|
-
metadata=
|
934
|
+
metadata=state.metadata or {},
|
929
935
|
sandbox=sample.sandbox,
|
930
936
|
files=list(sample.files.keys()) if sample.files else None,
|
931
937
|
setup=sample.setup,
|
inspect_ai/_util/hash.py
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
import hashlib
|
2
|
+
|
1
3
|
import mmh3
|
2
4
|
|
3
5
|
|
@@ -7,3 +9,40 @@ def mm3_hash(message: str) -> str:
|
|
7
9
|
|
8
10
|
# Convert to unsigned integers and then to hexadecimal
|
9
11
|
return f"{h1 & 0xFFFFFFFFFFFFFFFF:016x}{h2 & 0xFFFFFFFFFFFFFFFF:016x}"
|
12
|
+
|
13
|
+
|
14
|
+
def base57_id_hash(content: str) -> str:
|
15
|
+
"""Generate base57 hash for content.
|
16
|
+
|
17
|
+
Hash the content with BLAKE2s using a 128-bit digest and encode the digest as a
|
18
|
+
22-character Base-57 string (left-padded when shorter). 22 Base-57 characters
|
19
|
+
hold about 128.3 bits, so the 128-bit digest is not truncated any further.
|
20
|
+
"""
|
21
|
+
digest_size = 16 # 128 bits
|
22
|
+
digest = hashlib.blake2s(content.encode(), digest_size=digest_size).digest()
|
23
|
+
|
24
|
+
# Encode as base 57: 22 characters hold log₂(57^22) ≈ 128.3 bits, so the 128-bit digest always fits
|
25
|
+
as_int = int.from_bytes(digest, "big")
|
26
|
+
base57_str = to_base57(as_int)
|
27
|
+
if len(base57_str) > 22:
|
28
|
+
return base57_str[-22:] # Take last 22 chars if longer
|
29
|
+
else:
|
30
|
+
# This is unlikely with a 128-bit input
|
31
|
+
return base57_str.rjust(22, ALPHABET57[0])
|
32
|
+
|
33
|
+
|
34
|
+
# shortuuid uses these 57 characters (excluding similar-looking characters like 0/O, 1/I/l, etc.)
|
35
|
+
ALPHABET57 = "23456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz"
|
36
|
+
|
37
|
+
|
38
|
+
def to_base57(n: int) -> str:
|
39
|
+
if n == 0:
|
40
|
+
return ALPHABET57[0]
|
41
|
+
|
42
|
+
out = []
|
43
|
+
while n:
|
44
|
+
n, rem = divmod(n, 57)
|
45
|
+
out.append(ALPHABET57[rem])
|
46
|
+
|
47
|
+
# reverse and return
|
48
|
+
return "".join(reversed(out))
|
inspect_ai/_util/local_server.py
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
import json
|
2
2
|
import logging
|
3
3
|
import os
|
4
|
+
import platform
|
4
5
|
import random
|
5
6
|
import socket
|
6
7
|
import subprocess
|
@@ -33,6 +34,21 @@ def reserve_port(
|
|
33
34
|
Returns:
|
34
35
|
A tuple (port, lock_socket) where `lock_socket` is kept open to hold the lock.
|
35
36
|
"""
|
37
|
+
is_macos = platform.system() == "Darwin"
|
38
|
+
|
39
|
+
if is_macos:
|
40
|
+
logger.info(
|
41
|
+
"MacOS system detected. A free binding port will be identified, but not reserved until the server binds to it."
|
42
|
+
)
|
43
|
+
# On macOS, let the OS pick a free port but not open it
|
44
|
+
# This leaves a small race condition window until the port
|
45
|
+
# is actually opened by the llm server
|
46
|
+
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
|
47
|
+
s.bind((host, 0)) # Bind to any free port
|
48
|
+
port = s.getsockname()[1]
|
49
|
+
return port, s
|
50
|
+
|
51
|
+
# Non-macOS behavior: try ports in range
|
36
52
|
candidates = list(range(start, end))
|
37
53
|
random.shuffle(candidates)
|
38
54
|
|
inspect_ai/_util/path.py
CHANGED
@@ -6,6 +6,10 @@ from copy import deepcopy
|
|
6
6
|
from pathlib import PurePath
|
7
7
|
from typing import Any, Iterator, overload
|
8
8
|
|
9
|
+
from fsspec.implementations.local import LocalFileSystem # type: ignore
|
10
|
+
|
11
|
+
from inspect_ai._util.file import filesystem
|
12
|
+
|
9
13
|
|
10
14
|
@contextmanager
|
11
15
|
def add_to_path(p: str) -> Iterator[None]:
|
@@ -98,6 +102,24 @@ def cwd_relative_path(file: str | None, walk_up: bool = False) -> str | None:
|
|
98
102
|
return None
|
99
103
|
|
100
104
|
|
105
|
+
def pretty_path(file: str) -> str:
|
106
|
+
fs = filesystem(file)
|
107
|
+
if fs.is_local():
|
108
|
+
file = LocalFileSystem._strip_protocol(file)
|
109
|
+
return cwd_relative_path(file)
|
110
|
+
else:
|
111
|
+
return file
|
112
|
+
|
113
|
+
|
114
|
+
def native_path(file: str) -> str:
|
115
|
+
fs = filesystem(file)
|
116
|
+
if fs.is_local():
|
117
|
+
file = LocalFileSystem._strip_protocol(file)
|
118
|
+
return file
|
119
|
+
else:
|
120
|
+
return file
|
121
|
+
|
122
|
+
|
101
123
|
# A slightly modified implementation of task_path.relative(d, walk_up=True)
|
102
124
|
# since that wasn't introduced until python 3.12
|
103
125
|
def relative_walk(from_path: PurePath, to_path: PurePath) -> str:
|
inspect_ai/_util/trace.py
CHANGED
inspect_ai/_util/working.py
CHANGED
@@ -10,6 +10,10 @@ def init_sample_working_limit(start_time: float, working_limit: float | None) ->
|
|
10
10
|
_sample_waiting_time.set(0)
|
11
11
|
|
12
12
|
|
13
|
+
def end_sample_working_limit() -> None:
|
14
|
+
_sample_working_limit.set(None)
|
15
|
+
|
16
|
+
|
13
17
|
def sample_waiting_time() -> float:
|
14
18
|
return _sample_waiting_time.get()
|
15
19
|
|
@@ -15489,34 +15489,34 @@ pre[class*="language-"] {
|
|
15489
15489
|
padding: 0.1rem 0.6rem;
|
15490
15490
|
border-radius: var(--bs-border-radius);
|
15491
15491
|
}
|
15492
|
-
.
|
15492
|
+
._expandableBordered_59eal_1 {
|
15493
15493
|
border: solid var(--bs-light-border-subtle) 1px;
|
15494
15494
|
}
|
15495
15495
|
|
15496
|
-
.
|
15496
|
+
._expandableTogglable_59eal_5 {
|
15497
15497
|
margin-bottom: 1em;
|
15498
15498
|
}
|
15499
15499
|
|
15500
|
-
.
|
15500
|
+
._expandableContents_59eal_9 {
|
15501
15501
|
font-size: var(--inspect-font-size-base);
|
15502
15502
|
}
|
15503
15503
|
|
15504
|
-
.
|
15504
|
+
._expandableCollapsed_59eal_13 {
|
15505
15505
|
overflow: hidden;
|
15506
15506
|
}
|
15507
15507
|
|
15508
|
-
.
|
15508
|
+
._moreToggle_59eal_17 {
|
15509
15509
|
display: flex;
|
15510
15510
|
margin-top: 0;
|
15511
15511
|
position: relative;
|
15512
|
-
height:
|
15512
|
+
height: 18px;
|
15513
15513
|
}
|
15514
15514
|
|
15515
|
-
.
|
15515
|
+
._moreToggle_59eal_17._bordered_59eal_24 {
|
15516
15516
|
border-top: solid var(--bs-light-border-subtle) 1px;
|
15517
15517
|
}
|
15518
15518
|
|
15519
|
-
.
|
15519
|
+
._moreToggleContainer_59eal_28 {
|
15520
15520
|
position: absolute;
|
15521
15521
|
top: -1px;
|
15522
15522
|
right: 0;
|
@@ -15527,7 +15527,7 @@ pre[class*="language-"] {
|
|
15527
15527
|
margin-right: 0;
|
15528
15528
|
}
|
15529
15529
|
|
15530
|
-
.
|
15530
|
+
._moreToggleButton_59eal_39 {
|
15531
15531
|
font-size: var(--inspect-font-size-smaller);
|
15532
15532
|
border: none;
|
15533
15533
|
padding: 0.1rem 0.5rem;
|