inspect-ai 0.3.95__py3-none-any.whl → 0.3.96__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/_util/local_server.py +16 -0
- inspect_ai/agent/_types.py +1 -1
- inspect_ai/analysis/beta/__init__.py +11 -1
- inspect_ai/analysis/beta/_dataframe/evals/table.py +65 -28
- inspect_ai/analysis/beta/_dataframe/events/columns.py +50 -0
- inspect_ai/analysis/beta/_dataframe/events/extract.py +26 -0
- inspect_ai/analysis/beta/_dataframe/events/table.py +89 -3
- inspect_ai/analysis/beta/_dataframe/extract.py +44 -25
- inspect_ai/analysis/beta/_dataframe/messages/columns.py +1 -1
- inspect_ai/analysis/beta/_dataframe/messages/table.py +6 -14
- inspect_ai/analysis/beta/_dataframe/progress.py +26 -0
- inspect_ai/analysis/beta/_dataframe/samples/columns.py +7 -3
- inspect_ai/analysis/beta/_dataframe/samples/extract.py +5 -33
- inspect_ai/analysis/beta/_dataframe/samples/table.py +76 -35
- inspect_ai/analysis/beta/_dataframe/util.py +19 -16
- inspect_ai/log/_file.py +9 -2
- inspect_ai/solver/_task_state.py +1 -1
- inspect_ai/tool/_mcp/_sandbox.py +17 -14
- {inspect_ai-0.3.95.dist-info → inspect_ai-0.3.96.dist-info}/METADATA +2 -2
- {inspect_ai-0.3.95.dist-info → inspect_ai-0.3.96.dist-info}/RECORD +24 -22
- {inspect_ai-0.3.95.dist-info → inspect_ai-0.3.96.dist-info}/WHEEL +0 -0
- {inspect_ai-0.3.95.dist-info → inspect_ai-0.3.96.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.95.dist-info → inspect_ai-0.3.96.dist-info}/licenses/LICENSE +0 -0
- {inspect_ai-0.3.95.dist-info → inspect_ai-0.3.96.dist-info}/top_level.txt +0 -0
inspect_ai/_util/local_server.py
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
import json
|
2
2
|
import logging
|
3
3
|
import os
|
4
|
+
import platform
|
4
5
|
import random
|
5
6
|
import socket
|
6
7
|
import subprocess
|
@@ -33,6 +34,21 @@ def reserve_port(
|
|
33
34
|
Returns:
|
34
35
|
A tuple (port, lock_socket) where `lock_socket` is kept open to hold the lock.
|
35
36
|
"""
|
37
|
+
is_macos = platform.system() == "Darwin"
|
38
|
+
|
39
|
+
if is_macos:
|
40
|
+
logger.info(
|
41
|
+
"MacOS system detected. A free binding port will be identified, but not reserved until the server binds to it."
|
42
|
+
)
|
43
|
+
# On macOS, let the OS pick a free port but not open it
|
44
|
+
# It leads to a small race condition window until the port
|
45
|
+
# is actually opened by the llm server
|
46
|
+
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
|
47
|
+
s.bind((host, 0)) # Bind to any free port
|
48
|
+
port = s.getsockname()[1]
|
49
|
+
return port, s
|
50
|
+
|
51
|
+
# Non-macOS behavior: try ports in range
|
36
52
|
candidates = list(range(start, end))
|
37
53
|
random.shuffle(candidates)
|
38
54
|
|
inspect_ai/agent/_types.py
CHANGED
@@ -43,7 +43,7 @@ class AgentPrompt(NamedTuple):
|
|
43
43
|
|
44
44
|
DEFAULT_CONTINUE_PROMPT = """
|
45
45
|
Please proceed to the next step using your best judgement. If you believe you
|
46
|
-
have completed the task, please call the `{submit}()` tool.
|
46
|
+
have completed the task, please call the `{submit}()` tool with your final answer.
|
47
47
|
"""
|
48
48
|
|
49
49
|
|
@@ -15,7 +15,13 @@ from ._dataframe.evals.columns import (
|
|
15
15
|
EvalTask,
|
16
16
|
)
|
17
17
|
from ._dataframe.evals.table import evals_df
|
18
|
-
from ._dataframe.events.columns import
|
18
|
+
from ._dataframe.events.columns import (
|
19
|
+
EventColumn,
|
20
|
+
EventInfo,
|
21
|
+
EventTiming,
|
22
|
+
ModelEventColumns,
|
23
|
+
ToolEventColumns,
|
24
|
+
)
|
19
25
|
from ._dataframe.events.table import events_df
|
20
26
|
from ._dataframe.messages.columns import (
|
21
27
|
MessageColumn,
|
@@ -50,6 +56,10 @@ __all__ = [
|
|
50
56
|
"MessageFilter",
|
51
57
|
"events_df",
|
52
58
|
"EventColumn",
|
59
|
+
"EventInfo",
|
60
|
+
"EventTiming",
|
61
|
+
"ModelEventColumns",
|
62
|
+
"ToolEventColumns",
|
53
63
|
"Column",
|
54
64
|
"ColumnType",
|
55
65
|
"ColumnError",
|
@@ -1,10 +1,11 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
|
-
from typing import TYPE_CHECKING, Literal, overload
|
3
|
+
from typing import TYPE_CHECKING, Callable, Literal, overload
|
4
4
|
|
5
|
-
from inspect_ai._display import display
|
6
5
|
from inspect_ai._util.path import pretty_path
|
6
|
+
from inspect_ai.analysis.beta._dataframe.progress import import_progress
|
7
7
|
from inspect_ai.log._file import (
|
8
|
+
list_eval_logs,
|
8
9
|
read_eval_log,
|
9
10
|
)
|
10
11
|
|
@@ -29,39 +30,32 @@ EVAL_SUFFIX = "_eval"
|
|
29
30
|
|
30
31
|
@overload
|
31
32
|
def evals_df(
|
32
|
-
logs: LogPaths,
|
33
|
+
logs: LogPaths = list_eval_logs(),
|
33
34
|
columns: list[Column] = EvalColumns,
|
34
|
-
recursive: bool = True,
|
35
|
-
reverse: bool = False,
|
36
35
|
strict: Literal[True] = True,
|
37
36
|
) -> "pd.DataFrame": ...
|
38
37
|
|
39
38
|
|
40
39
|
@overload
|
41
40
|
def evals_df(
|
42
|
-
logs: LogPaths,
|
41
|
+
logs: LogPaths = list_eval_logs(),
|
43
42
|
columns: list[Column] = EvalColumns,
|
44
|
-
recursive: bool = True,
|
45
|
-
reverse: bool = False,
|
46
43
|
strict: Literal[False] = False,
|
47
44
|
) -> tuple["pd.DataFrame", ColumnErrors]: ...
|
48
45
|
|
49
46
|
|
50
47
|
def evals_df(
|
51
|
-
logs: LogPaths,
|
48
|
+
logs: LogPaths = list_eval_logs(),
|
52
49
|
columns: list[Column] = EvalColumns,
|
53
|
-
recursive: bool = True,
|
54
|
-
reverse: bool = False,
|
55
50
|
strict: bool = True,
|
56
51
|
) -> "pd.DataFrame" | tuple["pd.DataFrame", ColumnErrors]:
|
57
52
|
"""Read a dataframe containing evals.
|
58
53
|
|
59
54
|
Args:
|
60
55
|
logs: One or more paths to log files or log directories.
|
56
|
+
Defaults to the contents of the currently active log directory
|
57
|
+
(e.g. ./logs or INSPECT_LOG_DIR).
|
61
58
|
columns: Specification for what columns to read from log files.
|
62
|
-
recursive: Include recursive contents of directories (defaults to `True`)
|
63
|
-
reverse: Reverse the order of the dataframe (by default, items
|
64
|
-
are ordered from oldest to newest).
|
65
59
|
strict: Raise import errors immediately. Defaults to `True`.
|
66
60
|
If `False` then a tuple of `DataFrame` and errors is returned.
|
67
61
|
|
@@ -73,7 +67,46 @@ def evals_df(
|
|
73
67
|
verify_prerequisites()
|
74
68
|
|
75
69
|
# resolve logs
|
76
|
-
log_paths = resolve_logs(logs
|
70
|
+
log_paths = resolve_logs(logs)
|
71
|
+
|
72
|
+
with import_progress("reading logs", total=len(log_paths)) as (p, task_id):
|
73
|
+
if strict:
|
74
|
+
evals_table, _ = _read_evals_df(
|
75
|
+
log_paths, columns, True, lambda: p.update(task_id, advance=1)
|
76
|
+
)
|
77
|
+
return evals_table
|
78
|
+
else:
|
79
|
+
evals_table, all_errors, _ = _read_evals_df(
|
80
|
+
log_paths, columns, False, lambda: p.update(task_id, advance=1)
|
81
|
+
)
|
82
|
+
return evals_table, all_errors
|
83
|
+
|
84
|
+
|
85
|
+
@overload
|
86
|
+
def _read_evals_df(
|
87
|
+
log_paths: list[str],
|
88
|
+
columns: list[Column],
|
89
|
+
strict: Literal[True],
|
90
|
+
progress: Callable[[], None],
|
91
|
+
) -> tuple["pd.DataFrame", int]: ...
|
92
|
+
|
93
|
+
|
94
|
+
@overload
|
95
|
+
def _read_evals_df(
|
96
|
+
log_paths: list[str],
|
97
|
+
columns: list[Column],
|
98
|
+
strict: Literal[False],
|
99
|
+
progress: Callable[[], None],
|
100
|
+
) -> tuple["pd.DataFrame", ColumnErrors, int]: ...
|
101
|
+
|
102
|
+
|
103
|
+
def _read_evals_df(
|
104
|
+
log_paths: list[str],
|
105
|
+
columns: list[Column],
|
106
|
+
strict: bool,
|
107
|
+
progress: Callable[[], None],
|
108
|
+
) -> tuple["pd.DataFrame", int] | tuple["pd.DataFrame", ColumnErrors, int]:
|
109
|
+
verify_prerequisites()
|
77
110
|
|
78
111
|
# resolve duplicate columns
|
79
112
|
columns = resolve_duplicate_columns(columns)
|
@@ -85,27 +118,31 @@ def evals_df(
|
|
85
118
|
ensure_eval_id(columns)
|
86
119
|
|
87
120
|
# read logs
|
121
|
+
total_samples = 0
|
88
122
|
records: list[dict[str, ColumnType]] = []
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
123
|
+
for log_path in log_paths:
|
124
|
+
log = read_eval_log(log_path, header_only=True)
|
125
|
+
if strict:
|
126
|
+
record = import_record(log, columns, strict=True)
|
127
|
+
else:
|
128
|
+
record, errors = import_record(log, columns, strict=False)
|
129
|
+
all_errors[pretty_path(log_path)] = errors
|
130
|
+
records.append(record)
|
131
|
+
total_samples += (
|
132
|
+
len(log.eval.dataset.sample_ids)
|
133
|
+
if log.eval.dataset.sample_ids is not None
|
134
|
+
else (log.eval.dataset.samples or 100)
|
135
|
+
)
|
136
|
+
progress()
|
100
137
|
|
101
138
|
# return table (+errors if strict=False)
|
102
139
|
evals_table = records_to_pandas(records)
|
103
140
|
evals_table = reorder_evals_df_columns(evals_table, columns)
|
104
141
|
|
105
142
|
if strict:
|
106
|
-
return evals_table
|
143
|
+
return evals_table, total_samples
|
107
144
|
else:
|
108
|
-
return evals_table, all_errors
|
145
|
+
return evals_table, all_errors, total_samples
|
109
146
|
|
110
147
|
|
111
148
|
def ensure_eval_id(columns: list[Column]) -> None:
|
@@ -1,3 +1,4 @@
|
|
1
|
+
from datetime import datetime
|
1
2
|
from typing import Any, Callable, Mapping, Type
|
2
3
|
|
3
4
|
from jsonpath_ng import JSONPath # type: ignore
|
@@ -7,6 +8,12 @@ from typing_extensions import override
|
|
7
8
|
from inspect_ai.log._transcript import Event
|
8
9
|
|
9
10
|
from ..columns import Column, ColumnType
|
11
|
+
from .extract import (
|
12
|
+
completion_as_str,
|
13
|
+
model_event_input_as_str,
|
14
|
+
tool_choice_as_str,
|
15
|
+
tool_view_as_str,
|
16
|
+
)
|
10
17
|
|
11
18
|
|
12
19
|
class EventColumn(Column):
|
@@ -35,3 +42,46 @@ class EventColumn(Column):
|
|
35
42
|
@override
|
36
43
|
def path_schema(self) -> Mapping[str, Any] | None:
|
37
44
|
return None
|
45
|
+
|
46
|
+
|
47
|
+
EventInfo: list[Column] = [
|
48
|
+
EventColumn("event", path="event"),
|
49
|
+
EventColumn("span_id", path="span_id"),
|
50
|
+
]
|
51
|
+
"""Event basic information columns."""
|
52
|
+
|
53
|
+
EventTiming: list[Column] = [
|
54
|
+
EventColumn("timestamp", path="timestamp", type=datetime),
|
55
|
+
EventColumn("completed", path="completed", type=datetime),
|
56
|
+
EventColumn("working_start", path="working_start"),
|
57
|
+
EventColumn("working_time", path="working_time"),
|
58
|
+
]
|
59
|
+
"""Event timing columns."""
|
60
|
+
|
61
|
+
ModelEventColumns: list[Column] = [
|
62
|
+
EventColumn("model_event_model", path="model"),
|
63
|
+
EventColumn("model_event_role", path="role"),
|
64
|
+
EventColumn("model_event_input", path=model_event_input_as_str),
|
65
|
+
EventColumn("model_event_tools", path="tools"),
|
66
|
+
EventColumn("model_event_tool_choice", path=tool_choice_as_str),
|
67
|
+
EventColumn("model_event_config", path="config"),
|
68
|
+
EventColumn("model_event_usage", path="output.usage"),
|
69
|
+
EventColumn("model_event_time", path="output.time"),
|
70
|
+
EventColumn("model_event_completion", path=completion_as_str),
|
71
|
+
EventColumn("model_event_retries", path="retries"),
|
72
|
+
EventColumn("model_event_error", path="error"),
|
73
|
+
EventColumn("model_event_cache", path="cache"),
|
74
|
+
EventColumn("model_event_call", path="call"),
|
75
|
+
]
|
76
|
+
"""Model event columns."""
|
77
|
+
|
78
|
+
ToolEventColumns: list[Column] = [
|
79
|
+
EventColumn("tool_event_function", path="function"),
|
80
|
+
EventColumn("tool_event_arguments", path="arguments"),
|
81
|
+
EventColumn("tool_event_view", path=tool_view_as_str),
|
82
|
+
EventColumn("tool_event_result", path="result"),
|
83
|
+
EventColumn("tool_event_truncated", path="truncated"),
|
84
|
+
EventColumn("tool_event_error_type", path="error.type"),
|
85
|
+
EventColumn("tool_event_error_message", path="error.message"),
|
86
|
+
]
|
87
|
+
"""Tool event columns."""
|
@@ -0,0 +1,26 @@
|
|
1
|
+
from inspect_ai.log._transcript import ModelEvent, ToolEvent
|
2
|
+
|
3
|
+
from ..extract import messages_as_str
|
4
|
+
|
5
|
+
|
6
|
+
def model_event_input_as_str(event: ModelEvent) -> str:
|
7
|
+
return messages_as_str(event.input)
|
8
|
+
|
9
|
+
|
10
|
+
def tool_choice_as_str(event: ModelEvent) -> str:
|
11
|
+
if isinstance(event.tool_choice, str):
|
12
|
+
return event.tool_choice
|
13
|
+
else:
|
14
|
+
return event.tool_choice.name
|
15
|
+
|
16
|
+
|
17
|
+
def completion_as_str(event: ModelEvent) -> str:
|
18
|
+
return event.output.completion
|
19
|
+
|
20
|
+
|
21
|
+
def tool_view_as_str(event: ToolEvent) -> str | None:
|
22
|
+
if event.view is not None:
|
23
|
+
title = f"{event.view.title}\n\n" if event.view.title is not None else ""
|
24
|
+
return f"{title}{event.view.content}"
|
25
|
+
else:
|
26
|
+
return None
|
@@ -1,14 +1,100 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
|
-
from typing import TYPE_CHECKING
|
3
|
+
from typing import TYPE_CHECKING, Callable, Literal, TypeAlias
|
4
|
+
|
5
|
+
from inspect_ai.analysis.beta._dataframe.events.columns import EventInfo
|
6
|
+
from inspect_ai.log._file import list_eval_logs
|
7
|
+
from inspect_ai.log._transcript import Event
|
4
8
|
|
5
9
|
if TYPE_CHECKING:
|
6
10
|
import pandas as pd
|
7
11
|
|
12
|
+
from typing_extensions import overload
|
13
|
+
|
14
|
+
from ..columns import Column, ColumnErrors
|
15
|
+
from ..samples.table import EventsDetail, _read_samples_df
|
8
16
|
from ..util import LogPaths, verify_prerequisites
|
9
17
|
|
18
|
+
EventFilter: TypeAlias = (
|
19
|
+
list[
|
20
|
+
Literal[
|
21
|
+
"sample_init",
|
22
|
+
"sample_limit",
|
23
|
+
"sandbox",
|
24
|
+
"state",
|
25
|
+
"store",
|
26
|
+
"model",
|
27
|
+
"tool",
|
28
|
+
"sandbox",
|
29
|
+
"approval",
|
30
|
+
"input",
|
31
|
+
"score",
|
32
|
+
"error",
|
33
|
+
"logger",
|
34
|
+
"info",
|
35
|
+
"span_begin",
|
36
|
+
"span_end",
|
37
|
+
"subtask",
|
38
|
+
]
|
39
|
+
]
|
40
|
+
| Callable[[Event], bool]
|
41
|
+
)
|
42
|
+
"""Filter for `events_df()` rows."""
|
43
|
+
|
44
|
+
|
45
|
+
@overload
|
46
|
+
def events_df(
|
47
|
+
logs: LogPaths = list_eval_logs(),
|
48
|
+
columns: list[Column] = EventInfo,
|
49
|
+
filter: EventFilter | None = None,
|
50
|
+
strict: Literal[True] = True,
|
51
|
+
) -> "pd.DataFrame": ...
|
52
|
+
|
10
53
|
|
11
|
-
|
54
|
+
@overload
|
55
|
+
def events_df(
|
56
|
+
logs: LogPaths = list_eval_logs(),
|
57
|
+
columns: list[Column] = EventInfo,
|
58
|
+
filter: EventFilter | None = None,
|
59
|
+
strict: Literal[False] = False,
|
60
|
+
) -> tuple["pd.DataFrame", ColumnErrors]: ...
|
61
|
+
|
62
|
+
|
63
|
+
def events_df(
|
64
|
+
logs: LogPaths = list_eval_logs(),
|
65
|
+
columns: list[Column] = EventInfo,
|
66
|
+
filter: EventFilter | None = None,
|
67
|
+
strict: bool = True,
|
68
|
+
) -> "pd.DataFrame" | tuple["pd.DataFrame", ColumnErrors]:
|
69
|
+
"""Read a dataframe containing events from a set of evals.
|
70
|
+
|
71
|
+
Args:
|
72
|
+
logs: One or more paths to log files or log directories.
|
73
|
+
Defaults to the contents of the currently active log directory
|
74
|
+
(e.g. ./logs or INSPECT_LOG_DIR).
|
75
|
+
columns: Specification for what columns to read from log files.
|
76
|
+
filter: List of event types to include or callable that performs the filter.
|
77
|
+
strict: Raise import errors immediately. Defaults to `True`.
|
78
|
+
If `False` then a tuple of `DataFrame` and errors is returned.
|
79
|
+
|
80
|
+
Returns:
|
81
|
+
For `strict`, a Pandas `DataFrame` with information for the specified logs.
|
82
|
+
For `strict=False`, a tuple of Pandas `DataFrame` and a dictionary of errors
|
83
|
+
encountered (by log file) during import.
|
84
|
+
"""
|
12
85
|
verify_prerequisites()
|
13
86
|
|
14
|
-
|
87
|
+
# resolve filter/detail
|
88
|
+
if filter is None:
|
89
|
+
detail = EventsDetail(filter=lambda e: True)
|
90
|
+
elif callable(filter):
|
91
|
+
detail = EventsDetail(filter=filter)
|
92
|
+
else:
|
93
|
+
detail = EventsDetail(filter=lambda e: e.event in filter)
|
94
|
+
|
95
|
+
return _read_samples_df(
|
96
|
+
logs=logs,
|
97
|
+
columns=columns,
|
98
|
+
strict=strict,
|
99
|
+
detail=detail,
|
100
|
+
)
|
@@ -5,11 +5,16 @@ from typing import Any, cast
|
|
5
5
|
import shortuuid
|
6
6
|
from pydantic import BaseModel, JsonValue
|
7
7
|
|
8
|
-
from inspect_ai.
|
8
|
+
from inspect_ai.model._chat_message import (
|
9
|
+
ChatMessage,
|
10
|
+
ChatMessageAssistant,
|
11
|
+
ChatMessageTool,
|
12
|
+
ChatMessageUser,
|
13
|
+
)
|
9
14
|
|
10
15
|
|
11
16
|
def model_to_record(model: BaseModel) -> dict[str, JsonValue]:
|
12
|
-
return cast(dict[str, JsonValue],
|
17
|
+
return cast(dict[str, JsonValue], model.model_dump(mode="json", exclude_none=True))
|
13
18
|
|
14
19
|
|
15
20
|
def list_as_str(x: JsonValue) -> str:
|
@@ -21,34 +26,48 @@ def score_values(x: JsonValue) -> dict[str, JsonValue]:
|
|
21
26
|
return {k: v["value"] for k, v in scores.items()}
|
22
27
|
|
23
28
|
|
24
|
-
def
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
+
def auto_id(base: str, index: str) -> str:
|
30
|
+
seed = f"{base}_{index}"
|
31
|
+
hash_bytes = hashlib.md5(seed.encode("utf-8")).digest()
|
32
|
+
long_uuid = uuid.UUID(bytes=hash_bytes)
|
33
|
+
return shortuuid.encode(long_uuid)
|
29
34
|
|
30
35
|
|
31
|
-
def messages_as_str(
|
32
|
-
if isinstance(
|
33
|
-
messages =
|
34
|
-
|
35
|
-
else:
|
36
|
-
raise ValueError(f"Unexpected type for messages: {type(x)}")
|
36
|
+
def messages_as_str(messages: str | list[ChatMessage]) -> str:
|
37
|
+
if isinstance(messages, str):
|
38
|
+
messages = [ChatMessageUser(content=messages)]
|
39
|
+
return "\n\n".join([message_as_str(message) for message in messages])
|
37
40
|
|
38
41
|
|
39
|
-
def message_as_str(message:
|
40
|
-
|
42
|
+
def message_as_str(message: ChatMessage) -> str:
|
43
|
+
transcript: list[str] = []
|
44
|
+
role = message.role
|
45
|
+
content = message.text.strip() if message.text else ""
|
41
46
|
|
47
|
+
# assistant messages with tool calls
|
48
|
+
if isinstance(message, ChatMessageAssistant) and message.tool_calls is not None:
|
49
|
+
entry = f"{role}:\n{content}\n"
|
42
50
|
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
else:
|
47
|
-
return "\n".join([c["text"] if c["type"] == "text" else "" for c in content])
|
51
|
+
for tool in message.tool_calls:
|
52
|
+
func_name = tool.function
|
53
|
+
args = tool.arguments
|
48
54
|
|
55
|
+
if isinstance(args, dict):
|
56
|
+
args_text = "\n".join(f"{k}: {v}" for k, v in args.items())
|
57
|
+
entry += f"\nTool Call: {func_name}\nArguments:\n{args_text}"
|
58
|
+
else:
|
59
|
+
entry += f"\nTool Call: {func_name}\nArguments: {args}"
|
49
60
|
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
61
|
+
transcript.append(entry)
|
62
|
+
|
63
|
+
# tool responses with errors
|
64
|
+
elif isinstance(message, ChatMessageTool) and message.error is not None:
|
65
|
+
func_name = message.function or "unknown"
|
66
|
+
entry = f"{role}:\n{content}\n\nError in tool call '{func_name}':\n{message.error.message}\n"
|
67
|
+
transcript.append(entry)
|
68
|
+
|
69
|
+
# normal messages
|
70
|
+
else:
|
71
|
+
transcript.append(f"{role}:\n{content}\n")
|
72
|
+
|
73
|
+
return "\n".join(transcript)
|
@@ -43,8 +43,8 @@ class MessageColumn(Column):
|
|
43
43
|
|
44
44
|
MessageContent: list[Column] = [
|
45
45
|
MessageColumn("role", path="role", required=True),
|
46
|
-
MessageColumn("content", path=message_text),
|
47
46
|
MessageColumn("source", path="source"),
|
47
|
+
MessageColumn("content", path=message_text),
|
48
48
|
]
|
49
49
|
"""Message content columns."""
|
50
50
|
|
@@ -2,6 +2,7 @@ from __future__ import annotations
|
|
2
2
|
|
3
3
|
from typing import TYPE_CHECKING, Callable, Literal, TypeAlias
|
4
4
|
|
5
|
+
from inspect_ai.log._file import list_eval_logs
|
5
6
|
from inspect_ai.model._chat_message import ChatMessage
|
6
7
|
|
7
8
|
if TYPE_CHECKING:
|
@@ -22,43 +23,36 @@ MessageFilter: TypeAlias = (
|
|
22
23
|
|
23
24
|
@overload
|
24
25
|
def messages_df(
|
25
|
-
logs: LogPaths,
|
26
|
+
logs: LogPaths = list_eval_logs(),
|
26
27
|
columns: list[Column] = MessageColumns,
|
27
28
|
filter: MessageFilter | None = None,
|
28
|
-
recursive: bool = True,
|
29
|
-
reverse: bool = False,
|
30
29
|
strict: Literal[True] = True,
|
31
30
|
) -> "pd.DataFrame": ...
|
32
31
|
|
33
32
|
|
34
33
|
@overload
|
35
34
|
def messages_df(
|
36
|
-
logs: LogPaths,
|
35
|
+
logs: LogPaths = list_eval_logs(),
|
37
36
|
columns: list[Column] = MessageColumns,
|
38
37
|
filter: MessageFilter | None = None,
|
39
|
-
recursive: bool = True,
|
40
|
-
reverse: bool = False,
|
41
38
|
strict: Literal[False] = False,
|
42
39
|
) -> tuple["pd.DataFrame", ColumnErrors]: ...
|
43
40
|
|
44
41
|
|
45
42
|
def messages_df(
|
46
|
-
logs: LogPaths,
|
43
|
+
logs: LogPaths = list_eval_logs(),
|
47
44
|
columns: list[Column] = MessageColumns,
|
48
45
|
filter: MessageFilter | None = None,
|
49
|
-
recursive: bool = True,
|
50
|
-
reverse: bool = False,
|
51
46
|
strict: bool = True,
|
52
47
|
) -> "pd.DataFrame" | tuple["pd.DataFrame", ColumnErrors]:
|
53
48
|
"""Read a dataframe containing messages from a set of evals.
|
54
49
|
|
55
50
|
Args:
|
56
51
|
logs: One or more paths to log files or log directories.
|
52
|
+
Defaults to the contents of the currently active log directory
|
53
|
+
(e.g. ./logs or INSPECT_LOG_DIR).
|
57
54
|
columns: Specification for what columns to read from log files.
|
58
55
|
filter: List of message role types to include or callable that performs the filter.
|
59
|
-
recursive: Include recursive contents of directories (defaults to `True`)
|
60
|
-
reverse: Reverse the order of the dataframe (by default, items
|
61
|
-
are ordered from oldest to newest).
|
62
56
|
strict: Raise import errors immediately. Defaults to `True`.
|
63
57
|
If `False` then a tuple of `DataFrame` and errors is returned.
|
64
58
|
|
@@ -80,8 +74,6 @@ def messages_df(
|
|
80
74
|
return _read_samples_df(
|
81
75
|
logs=logs,
|
82
76
|
columns=columns,
|
83
|
-
recursive=recursive,
|
84
|
-
reverse=reverse,
|
85
77
|
strict=strict,
|
86
78
|
detail=detail,
|
87
79
|
)
|
@@ -0,0 +1,26 @@
|
|
1
|
+
from contextlib import contextmanager
|
2
|
+
from typing import Iterator
|
3
|
+
|
4
|
+
from rich.progress import (
|
5
|
+
BarColumn,
|
6
|
+
Progress,
|
7
|
+
TaskID,
|
8
|
+
TaskProgressColumn,
|
9
|
+
TextColumn,
|
10
|
+
TimeElapsedColumn,
|
11
|
+
)
|
12
|
+
|
13
|
+
|
14
|
+
@contextmanager
|
15
|
+
def import_progress(
|
16
|
+
description: str, total: float | None
|
17
|
+
) -> Iterator[tuple[Progress, TaskID]]:
|
18
|
+
with Progress(
|
19
|
+
TextColumn("[progress.description]{task.description:<18}"),
|
20
|
+
BarColumn(),
|
21
|
+
TaskProgressColumn(),
|
22
|
+
TimeElapsedColumn(),
|
23
|
+
transient=True,
|
24
|
+
) as progress:
|
25
|
+
task_id = progress.add_task(description, total=total)
|
26
|
+
yield progress, task_id
|
@@ -7,9 +7,13 @@ from typing_extensions import override
|
|
7
7
|
from inspect_ai.log._log import EvalSample, EvalSampleSummary
|
8
8
|
|
9
9
|
from ..columns import Column, ColumnType
|
10
|
-
from ..extract import
|
10
|
+
from ..extract import list_as_str, score_values
|
11
11
|
from ..validate import resolved_schema
|
12
|
-
from .extract import
|
12
|
+
from .extract import (
|
13
|
+
sample_input_as_str,
|
14
|
+
sample_messages_as_str,
|
15
|
+
sample_path_requires_full,
|
16
|
+
)
|
13
17
|
|
14
18
|
|
15
19
|
class SampleColumn(Column):
|
@@ -54,7 +58,7 @@ class SampleColumn(Column):
|
|
54
58
|
SampleSummary: list[Column] = [
|
55
59
|
SampleColumn("id", path="id", required=True, type=str),
|
56
60
|
SampleColumn("epoch", path="epoch", required=True),
|
57
|
-
SampleColumn("input", path=
|
61
|
+
SampleColumn("input", path=sample_input_as_str, required=True),
|
58
62
|
SampleColumn("target", path="target", required=True, value=list_as_str),
|
59
63
|
SampleColumn("metadata_*", path="metadata"),
|
60
64
|
SampleColumn("score_*", path="scores", value=score_values),
|
@@ -3,45 +3,17 @@ from typing import Callable
|
|
3
3
|
from jsonpath_ng import JSONPath # type: ignore
|
4
4
|
from pydantic import JsonValue
|
5
5
|
|
6
|
-
from inspect_ai.analysis.beta._dataframe.extract import auto_id
|
7
6
|
from inspect_ai.log._log import EvalSample, EvalSampleSummary
|
8
|
-
from inspect_ai.model._chat_message import ChatMessageAssistant, ChatMessageTool
|
9
7
|
|
8
|
+
from ..extract import auto_id, messages_as_str
|
10
9
|
|
11
|
-
def sample_messages_as_str(sample: EvalSample) -> str:
|
12
|
-
# format each message for the transcript
|
13
|
-
transcript: list[str] = []
|
14
|
-
for msg in sample.messages:
|
15
|
-
role = msg.role
|
16
|
-
content = msg.text.strip() if msg.text else ""
|
17
|
-
|
18
|
-
# assistant messages with tool calls
|
19
|
-
if isinstance(msg, ChatMessageAssistant) and msg.tool_calls is not None:
|
20
|
-
entry = f"{role}:\n{content}\n"
|
21
|
-
|
22
|
-
for tool in msg.tool_calls:
|
23
|
-
func_name = tool.function
|
24
|
-
args = tool.arguments
|
25
10
|
|
26
|
-
|
27
|
-
|
28
|
-
entry += f"\nTool Call: {func_name}\nArguments:\n{args_text}"
|
29
|
-
else:
|
30
|
-
entry += f"\nTool Call: {func_name}\nArguments: {args}"
|
11
|
+
def sample_input_as_str(sample: EvalSample) -> str:
|
12
|
+
return messages_as_str(sample.input)
|
31
13
|
|
32
|
-
transcript.append(entry)
|
33
14
|
|
34
|
-
|
35
|
-
|
36
|
-
func_name = msg.function or "unknown"
|
37
|
-
entry = f"{role}:\n{content}\n\nError in tool call '{func_name}':\n{msg.error.message}\n"
|
38
|
-
transcript.append(entry)
|
39
|
-
|
40
|
-
# normal messages
|
41
|
-
else:
|
42
|
-
transcript.append(f"{role}:\n{content}\n")
|
43
|
-
|
44
|
-
return "\n".join(transcript)
|
15
|
+
def sample_messages_as_str(sample: EvalSample) -> str:
|
16
|
+
return messages_as_str(sample.messages)
|
45
17
|
|
46
18
|
|
47
19
|
def sample_path_requires_full(
|
@@ -1,6 +1,7 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
3
|
from dataclasses import dataclass
|
4
|
+
from functools import lru_cache
|
4
5
|
from typing import (
|
5
6
|
TYPE_CHECKING,
|
6
7
|
Callable,
|
@@ -9,21 +10,24 @@ from typing import (
|
|
9
10
|
overload,
|
10
11
|
)
|
11
12
|
|
12
|
-
from inspect_ai.
|
13
|
+
from inspect_ai._util.hash import mm3_hash
|
13
14
|
from inspect_ai._util.path import pretty_path
|
14
|
-
from inspect_ai.analysis.beta._dataframe.
|
15
|
-
from inspect_ai.analysis.beta._dataframe.messages.columns import MessageColumn
|
15
|
+
from inspect_ai.analysis.beta._dataframe.progress import import_progress
|
16
16
|
from inspect_ai.log._file import (
|
17
|
+
list_eval_logs,
|
17
18
|
read_eval_log_sample_summaries,
|
18
19
|
read_eval_log_samples,
|
19
20
|
)
|
20
21
|
from inspect_ai.log._log import EvalSample, EvalSampleSummary
|
21
|
-
from inspect_ai.log._transcript import
|
22
|
+
from inspect_ai.log._transcript import Event
|
22
23
|
from inspect_ai.model._chat_message import ChatMessage
|
23
24
|
|
24
25
|
from ..columns import Column, ColumnErrors, ColumnType
|
25
26
|
from ..evals.columns import EvalColumn
|
26
|
-
from ..evals.table import EVAL_ID, EVAL_SUFFIX,
|
27
|
+
from ..evals.table import EVAL_ID, EVAL_SUFFIX, _read_evals_df, ensure_eval_id
|
28
|
+
from ..events.columns import EventColumn
|
29
|
+
from ..extract import message_as_str
|
30
|
+
from ..messages.columns import MessageColumn
|
27
31
|
from ..record import import_record, resolve_duplicate_columns
|
28
32
|
from ..util import (
|
29
33
|
LogPaths,
|
@@ -46,39 +50,32 @@ SAMPLE_SUFFIX = "_sample"
|
|
46
50
|
|
47
51
|
@overload
|
48
52
|
def samples_df(
|
49
|
-
logs: LogPaths,
|
53
|
+
logs: LogPaths = list_eval_logs(),
|
50
54
|
columns: list[Column] = SampleSummary,
|
51
|
-
recursive: bool = True,
|
52
|
-
reverse: bool = False,
|
53
55
|
strict: Literal[True] = True,
|
54
56
|
) -> "pd.DataFrame": ...
|
55
57
|
|
56
58
|
|
57
59
|
@overload
|
58
60
|
def samples_df(
|
59
|
-
logs: LogPaths,
|
61
|
+
logs: LogPaths = list_eval_logs(),
|
60
62
|
columns: list[Column] = SampleSummary,
|
61
|
-
recursive: bool = True,
|
62
|
-
reverse: bool = False,
|
63
63
|
strict: Literal[False] = False,
|
64
64
|
) -> tuple["pd.DataFrame", ColumnErrors]: ...
|
65
65
|
|
66
66
|
|
67
67
|
def samples_df(
|
68
|
-
logs: LogPaths,
|
68
|
+
logs: LogPaths = list_eval_logs(),
|
69
69
|
columns: list[Column] = SampleSummary,
|
70
|
-
recursive: bool = True,
|
71
|
-
reverse: bool = False,
|
72
70
|
strict: bool = True,
|
73
71
|
) -> "pd.DataFrame" | tuple["pd.DataFrame", ColumnErrors]:
|
74
72
|
"""Read a dataframe containing samples from a set of evals.
|
75
73
|
|
76
74
|
Args:
|
77
75
|
logs: One or more paths to log files or log directories.
|
76
|
+
Defaults to the contents of the currently active log directory
|
77
|
+
(e.g. ./logs or INSPECT_LOG_DIR).
|
78
78
|
columns: Specification for what columns to read from log files.
|
79
|
-
recursive: Include recursive contents of directories (defaults to `True`)
|
80
|
-
reverse: Reverse the order of the dataframe (by default, items
|
81
|
-
are ordered from oldest to newest).
|
82
79
|
strict: Raise import errors immediately. Defaults to `True`.
|
83
80
|
If `False` then a tuple of `DataFrame` and errors is returned.
|
84
81
|
|
@@ -87,9 +84,7 @@ def samples_df(
|
|
87
84
|
For `strict=False`, a tuple of Pandas `DataFrame` and a dictionary of errors
|
88
85
|
encountered (by log file) during import.
|
89
86
|
"""
|
90
|
-
return _read_samples_df(
|
91
|
-
logs, columns, recursive=recursive, reverse=reverse, strict=strict
|
92
|
-
)
|
87
|
+
return _read_samples_df(logs, columns, strict=strict)
|
93
88
|
|
94
89
|
|
95
90
|
@dataclass
|
@@ -101,24 +96,22 @@ class MessagesDetail:
|
|
101
96
|
|
102
97
|
@dataclass
|
103
98
|
class EventsDetail:
|
104
|
-
name: str = "
|
99
|
+
name: str = "event"
|
105
100
|
col_type = EventColumn
|
106
|
-
filter: Callable[[
|
101
|
+
filter: Callable[[Event], bool] = lambda e: True
|
107
102
|
|
108
103
|
|
109
104
|
def _read_samples_df(
|
110
105
|
logs: LogPaths,
|
111
106
|
columns: list[Column],
|
112
107
|
*,
|
113
|
-
recursive: bool = True,
|
114
|
-
reverse: bool = False,
|
115
108
|
strict: bool = True,
|
116
109
|
detail: MessagesDetail | EventsDetail | None = None,
|
117
110
|
) -> "pd.DataFrame" | tuple["pd.DataFrame", ColumnErrors]:
|
118
111
|
verify_prerequisites()
|
119
112
|
|
120
113
|
# resolve logs
|
121
|
-
logs = resolve_logs(logs
|
114
|
+
logs = resolve_logs(logs)
|
122
115
|
|
123
116
|
# split columns by type
|
124
117
|
columns_eval: list[Column] = []
|
@@ -150,12 +143,31 @@ def _read_samples_df(
|
|
150
143
|
# make sure eval_id is present
|
151
144
|
ensure_eval_id(columns_eval)
|
152
145
|
|
153
|
-
#
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
146
|
+
# determine how we will allocate progress
|
147
|
+
with import_progress("scanning logs", total=len(logs)) as (
|
148
|
+
p,
|
149
|
+
task_id,
|
150
|
+
):
|
151
|
+
|
152
|
+
def progress() -> None:
|
153
|
+
p.update(task_id, advance=1)
|
154
|
+
|
155
|
+
# read samples from each log
|
156
|
+
sample_records: list[dict[str, ColumnType]] = []
|
157
|
+
detail_records: list[dict[str, ColumnType]] = []
|
158
|
+
all_errors = ColumnErrors()
|
159
|
+
|
160
|
+
# read logs and note total samples
|
161
|
+
evals_table, total_samples = _read_evals_df(
|
162
|
+
logs, columns=columns_eval, strict=True, progress=progress
|
163
|
+
)
|
164
|
+
|
165
|
+
# update progress now that we know the total samples
|
166
|
+
entity = detail.name if detail else "sample"
|
167
|
+
p.reset(
|
168
|
+
task_id, description=f"reading {entity}s", completed=0, total=total_samples
|
169
|
+
)
|
170
|
+
|
159
171
|
# read samples
|
160
172
|
for eval_id, log in zip(evals_table[EVAL_ID].to_list(), logs):
|
161
173
|
# get a generator for the samples (might require reading the full log
|
@@ -191,9 +203,9 @@ def _read_samples_df(
|
|
191
203
|
# filter detail records
|
192
204
|
assert isinstance(sample, EvalSample)
|
193
205
|
if isinstance(detail, MessagesDetail):
|
194
|
-
detail_items: list[ChatMessage] | list[Event] =
|
195
|
-
|
196
|
-
|
206
|
+
detail_items: list[ChatMessage] | list[Event] = (
|
207
|
+
sample_messages_from_events(sample.events, detail.filter)
|
208
|
+
)
|
197
209
|
elif isinstance(detail, EventsDetail):
|
198
210
|
detail_items = [e for e in sample.events if detail.filter(e)]
|
199
211
|
else:
|
@@ -226,7 +238,7 @@ def _read_samples_df(
|
|
226
238
|
|
227
239
|
# record sample record
|
228
240
|
sample_records.append(record)
|
229
|
-
|
241
|
+
progress()
|
230
242
|
|
231
243
|
# normalize records and produce samples table
|
232
244
|
samples_table = records_to_pandas(sample_records)
|
@@ -262,6 +274,35 @@ def _read_samples_df(
|
|
262
274
|
return samples_table, all_errors
|
263
275
|
|
264
276
|
|
277
|
+
def sample_messages_from_events(
|
278
|
+
events: list[Event], filter: Callable[[ChatMessage], bool]
|
279
|
+
) -> list[ChatMessage]:
|
280
|
+
# don't yield the same event twice
|
281
|
+
ids: set[str] = set()
|
282
|
+
|
283
|
+
# we need to look at the full input to every model event and add
|
284
|
+
# messages we haven't seen before
|
285
|
+
messages: list[ChatMessage] = []
|
286
|
+
for event in events:
|
287
|
+
if event.event == "model":
|
288
|
+
event_messages = event.input + (
|
289
|
+
[event.output.message] if not event.output.empty else []
|
290
|
+
)
|
291
|
+
for message in event_messages:
|
292
|
+
id = message.id or message_hash(message_as_str(message))
|
293
|
+
if id not in ids:
|
294
|
+
messages.append(message)
|
295
|
+
ids.add(id)
|
296
|
+
|
297
|
+
# then apply the filter
|
298
|
+
return [message for message in messages if filter(message)]
|
299
|
+
|
300
|
+
|
301
|
+
@lru_cache(maxsize=100)
|
302
|
+
def message_hash(message: str) -> str:
|
303
|
+
return mm3_hash(message)
|
304
|
+
|
305
|
+
|
265
306
|
def reorder_samples_df_columns(
|
266
307
|
df: "pd.DataFrame",
|
267
308
|
eval_columns: list[Column],
|
@@ -9,7 +9,7 @@ from typing import TYPE_CHECKING, Sequence, TypeAlias
|
|
9
9
|
from inspect_ai._util.error import pip_dependency_error
|
10
10
|
from inspect_ai._util.file import FileInfo, filesystem
|
11
11
|
from inspect_ai._util.version import verify_required_version
|
12
|
-
from inspect_ai.log._file import log_files_from_ls
|
12
|
+
from inspect_ai.log._file import EvalLogInfo, log_files_from_ls
|
13
13
|
|
14
14
|
if TYPE_CHECKING:
|
15
15
|
import pandas as pd
|
@@ -17,7 +17,9 @@ if TYPE_CHECKING:
|
|
17
17
|
|
18
18
|
from .columns import ColumnType
|
19
19
|
|
20
|
-
LogPaths: TypeAlias =
|
20
|
+
LogPaths: TypeAlias = (
|
21
|
+
PathLike[str] | str | EvalLogInfo | Sequence[PathLike[str] | str | EvalLogInfo]
|
22
|
+
)
|
21
23
|
|
22
24
|
|
23
25
|
def verify_prerequisites() -> None:
|
@@ -41,30 +43,31 @@ def verify_prerequisites() -> None:
|
|
41
43
|
verify_required_version("inspect_ai.analysis", "pyarrow", "10.0.1")
|
42
44
|
|
43
45
|
|
44
|
-
def resolve_logs(logs: LogPaths
|
46
|
+
def resolve_logs(logs: LogPaths) -> list[str]:
|
45
47
|
# normalize to list of str
|
46
|
-
logs = [logs] if isinstance(logs, str | PathLike) else logs
|
47
|
-
|
48
|
+
logs = [logs] if isinstance(logs, str | PathLike | EvalLogInfo) else logs
|
49
|
+
logs_str = [
|
50
|
+
Path(log).as_posix()
|
51
|
+
if isinstance(log, PathLike)
|
52
|
+
else log.name
|
53
|
+
if isinstance(log, EvalLogInfo)
|
54
|
+
else log
|
55
|
+
for log in logs
|
56
|
+
]
|
48
57
|
|
49
58
|
# expand directories
|
50
59
|
log_paths: list[FileInfo] = []
|
51
|
-
for
|
52
|
-
|
53
|
-
|
54
|
-
fs = filesystem(log)
|
55
|
-
info = fs.info(log)
|
60
|
+
for log_str in logs_str:
|
61
|
+
fs = filesystem(log_str)
|
62
|
+
info = fs.info(log_str)
|
56
63
|
if info.type == "directory":
|
57
64
|
log_paths.extend(
|
58
|
-
[
|
59
|
-
fi
|
60
|
-
for fi in fs.ls(info.name, recursive=recursive)
|
61
|
-
if fi.type == "file"
|
62
|
-
]
|
65
|
+
[fi for fi in fs.ls(info.name, recursive=True) if fi.type == "file"]
|
63
66
|
)
|
64
67
|
else:
|
65
68
|
log_paths.append(info)
|
66
69
|
|
67
|
-
log_files = log_files_from_ls(log_paths,
|
70
|
+
log_files = log_files_from_ls(log_paths, sort=False)
|
68
71
|
return [log_file.name for log_file in log_files]
|
69
72
|
|
70
73
|
|
inspect_ai/log/_file.py
CHANGED
@@ -526,12 +526,19 @@ def log_files_from_ls(
|
|
526
526
|
ls: list[FileInfo],
|
527
527
|
formats: list[Literal["eval", "json"]] | None = None,
|
528
528
|
descending: bool = True,
|
529
|
+
sort: bool = True,
|
529
530
|
) -> list[EvalLogInfo]:
|
530
531
|
extensions = [f".{format}" for format in (formats or ALL_LOG_FORMATS)]
|
531
532
|
return [
|
532
533
|
log_file_info(file)
|
533
|
-
for file in
|
534
|
-
|
534
|
+
for file in (
|
535
|
+
sorted(
|
536
|
+
ls,
|
537
|
+
key=lambda file: (file.mtime if file.mtime else 0),
|
538
|
+
reverse=descending,
|
539
|
+
)
|
540
|
+
if sort
|
541
|
+
else ls
|
535
542
|
)
|
536
543
|
if file.type == "file" and is_log_file(file.name, extensions)
|
537
544
|
]
|
inspect_ai/solver/_task_state.py
CHANGED
@@ -138,7 +138,7 @@ class TaskState:
|
|
138
138
|
The `TaskState` represents the internal state of the `Task` being run for a single `Sample`.
|
139
139
|
|
140
140
|
The `TaskState` is passed to and returned from each solver during a sample's
|
141
|
-
evaluation. It allows us to
|
141
|
+
evaluation. It allows us to maintain the manipulated message history, the tools
|
142
142
|
available to the model, the final output of the model, and whether the task
|
143
143
|
is completed or has hit a limit.
|
144
144
|
"""
|
inspect_ai/tool/_mcp/_sandbox.py
CHANGED
@@ -5,6 +5,7 @@ from typing import TextIO
|
|
5
5
|
import anyio
|
6
6
|
from anyio.streams.memory import MemoryObjectReceiveStream, MemoryObjectSendStream
|
7
7
|
from mcp import JSONRPCRequest, StdioServerParameters
|
8
|
+
from mcp.shared.message import SessionMessage
|
8
9
|
from mcp.types import JSONRPCMessage, JSONRPCNotification
|
9
10
|
|
10
11
|
from inspect_ai.tool._tool_support_helpers import (
|
@@ -36,12 +37,12 @@ async def sandbox_client( # type: ignore
|
|
36
37
|
)
|
37
38
|
|
38
39
|
# read_stream is remote process's stdout
|
39
|
-
read_stream: MemoryObjectReceiveStream[
|
40
|
-
read_stream_writer: MemoryObjectSendStream[
|
40
|
+
read_stream: MemoryObjectReceiveStream[SessionMessage | Exception]
|
41
|
+
read_stream_writer: MemoryObjectSendStream[SessionMessage | Exception]
|
41
42
|
|
42
43
|
# write_stream is remote process's stdin
|
43
|
-
write_stream: MemoryObjectSendStream[
|
44
|
-
write_stream_reader: MemoryObjectReceiveStream[
|
44
|
+
write_stream: MemoryObjectSendStream[SessionMessage]
|
45
|
+
write_stream_reader: MemoryObjectReceiveStream[SessionMessage]
|
45
46
|
|
46
47
|
read_stream_writer, read_stream = anyio.create_memory_object_stream(0)
|
47
48
|
write_stream, write_stream_reader = anyio.create_memory_object_stream(0)
|
@@ -64,18 +65,20 @@ async def sandbox_client( # type: ignore
|
|
64
65
|
async with write_stream_reader:
|
65
66
|
# This reads messages until the stream is closed
|
66
67
|
async for message in write_stream_reader:
|
67
|
-
root = message.root
|
68
|
+
root = message.message.root
|
68
69
|
if isinstance(root, JSONRPCRequest):
|
69
70
|
await read_stream_writer.send(
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
71
|
+
SessionMessage(
|
72
|
+
message=await exec_model_request(
|
73
|
+
sandbox=sandbox_environment,
|
74
|
+
method="mcp_send_request",
|
75
|
+
params={
|
76
|
+
"session_id": session_id,
|
77
|
+
"request": root.model_dump(),
|
78
|
+
},
|
79
|
+
result_type=JSONRPCMessage,
|
80
|
+
timeout=timeout,
|
81
|
+
)
|
79
82
|
)
|
80
83
|
)
|
81
84
|
elif isinstance(root, JSONRPCNotification):
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: inspect_ai
|
3
|
-
Version: 0.3.
|
3
|
+
Version: 0.3.96
|
4
4
|
Summary: Framework for large language model evaluations
|
5
5
|
Author: UK AI Security Institute
|
6
6
|
License: MIT License
|
@@ -23,7 +23,7 @@ License-File: LICENSE
|
|
23
23
|
Requires-Dist: aiohttp>=3.9.0
|
24
24
|
Requires-Dist: anyio>=4.8.0
|
25
25
|
Requires-Dist: beautifulsoup4
|
26
|
-
Requires-Dist: click
|
26
|
+
Requires-Dist: click<8.2.0,>=8.1.3
|
27
27
|
Requires-Dist: debugpy
|
28
28
|
Requires-Dist: docstring-parser>=0.16
|
29
29
|
Requires-Dist: exceptiongroup>=1.0.2; python_version < "3.11"
|
@@ -98,7 +98,7 @@ inspect_ai/_util/interrupt.py,sha256=T30e5YaKSNmnO695p0lK0dquUWFq6dNNtdAFPmWGwME
|
|
98
98
|
inspect_ai/_util/json.py,sha256=LiHF4XPrcuCBpnBKYCIX2AkvmsYuPieQ6HNdSlUMVvU,3653
|
99
99
|
inspect_ai/_util/kvstore.py,sha256=z2IXLWP4QqqGqsq5_MbYjBQPcEJqfWK4IyZXgV-kppA,2398
|
100
100
|
inspect_ai/_util/list.py,sha256=6_5r5jI5RKK34kCmIqqVQ5hYG-G8v0F5H7L-DmQQ2E4,279
|
101
|
-
inspect_ai/_util/local_server.py,sha256=
|
101
|
+
inspect_ai/_util/local_server.py,sha256=T54l-csb2qmQDvZ7zNYVq6_j0BuW5FZSBKT9GfXNc6w,13787
|
102
102
|
inspect_ai/_util/logger.py,sha256=XpGyoe8V7FIhNU1rnjTjwR07LVbshA9rRZn33sOitig,6230
|
103
103
|
inspect_ai/_util/notebook.py,sha256=Mgz3J4uBh-MqVBRmpiJqDHRpn2hd7HIOBeJBwLG-bbk,2998
|
104
104
|
inspect_ai/_util/notgiven.py,sha256=zkn6AYflKLf8YlnwTAMxPLQ-4LyIVmKpGcNcXf-Ssng,457
|
@@ -485,7 +485,7 @@ inspect_ai/agent/_filter.py,sha256=qnT0HbT4edpDi0MwXY3Q3It2pzNRkTRXZDOqfCwMY6M,1
|
|
485
485
|
inspect_ai/agent/_handoff.py,sha256=NY29zJWxZyB9YtIi9TtD7ydvULEY-Q8wfdedMDD1bjA,3729
|
486
486
|
inspect_ai/agent/_react.py,sha256=oTHY-ZMXkCNMBwn161G_Ov-svgKqAfzOp7FryJg9imE,14078
|
487
487
|
inspect_ai/agent/_run.py,sha256=9KAfguMPn9czothbFk_ng5xRtvIWeOjNvHuvERWENMU,1875
|
488
|
-
inspect_ai/agent/_types.py,sha256=
|
488
|
+
inspect_ai/agent/_types.py,sha256=FALCBDziC2CrEy18wBzBbIcQlZs5bCPilyqQ4RXizDc,4227
|
489
489
|
inspect_ai/agent/_bridge/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
490
490
|
inspect_ai/agent/_bridge/bridge.py,sha256=Qk1z54vSZvFZMmFMOvopwY6rhFxHmJwOipZ_yVsbryU,3465
|
491
491
|
inspect_ai/agent/_bridge/patch.py,sha256=IFpgL7WImh5RnAz7fAr574krVqa_Gm9A_eZ7leW983s,7061
|
@@ -505,28 +505,30 @@ inspect_ai/agent/_human/commands/score.py,sha256=6DyKiYHU7w-tKxHH5cZ0rXgFY7NWc4k
|
|
505
505
|
inspect_ai/agent/_human/commands/status.py,sha256=uUO5M4skWDp29OS8sqVKAqZw0OcM3MSesBYQNbRypJ0,1934
|
506
506
|
inspect_ai/agent/_human/commands/submit.py,sha256=D2p1M2ApvAcaVZhbP3fFofG9ZsPVvmxivSLIF5xQxtA,6524
|
507
507
|
inspect_ai/analysis/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
508
|
-
inspect_ai/analysis/beta/__init__.py,sha256=
|
508
|
+
inspect_ai/analysis/beta/__init__.py,sha256=iz72c_fRBhtXmfBUPH_cGnnFpH-SD9DEULTb0-pNY-8,1413
|
509
509
|
inspect_ai/analysis/beta/_dataframe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
510
510
|
inspect_ai/analysis/beta/_dataframe/columns.py,sha256=feUqCpm9kxieoKPwXT8EwF8DTcwxG4JCCjCGO5XNcJc,4454
|
511
|
-
inspect_ai/analysis/beta/_dataframe/extract.py,sha256=
|
511
|
+
inspect_ai/analysis/beta/_dataframe/extract.py,sha256=MnRUwtJ0ATS-94qi8nzqZ5hdV2ZQ5rw_kBZ_FCxxdLg,2288
|
512
|
+
inspect_ai/analysis/beta/_dataframe/progress.py,sha256=YUUi8U-4BIklDmPbuVCeIQ6DkpQMC0tJHrGrZdOLIno,626
|
512
513
|
inspect_ai/analysis/beta/_dataframe/record.py,sha256=rT3k9LuMvogw2kbFoCIHhNYb_p8QqER_FY2J9W0f1kY,12690
|
513
|
-
inspect_ai/analysis/beta/_dataframe/util.py,sha256=
|
514
|
+
inspect_ai/analysis/beta/_dataframe/util.py,sha256=OGfBa2P3i3a1PQQP7Q5Y-uaUms1gYuaE83kvnfhrYXA,4964
|
514
515
|
inspect_ai/analysis/beta/_dataframe/validate.py,sha256=_UBn_fosgppF3Y5wCCtF8-cnCVM61XdOK6Lm91jMgH0,6213
|
515
516
|
inspect_ai/analysis/beta/_dataframe/evals/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
516
517
|
inspect_ai/analysis/beta/_dataframe/evals/columns.py,sha256=ZMR1AByGmHWGmn3qoWefF7pDNnL4mMMlzDlwkUECm5I,4725
|
517
518
|
inspect_ai/analysis/beta/_dataframe/evals/extract.py,sha256=XUHFWveTcAFWYTPFgsOIKB9jZT0o4v_7ElVZGJ-SAf8,586
|
518
|
-
inspect_ai/analysis/beta/_dataframe/evals/table.py,sha256=
|
519
|
+
inspect_ai/analysis/beta/_dataframe/evals/table.py,sha256=oxSJg762WPIjTln5P04nC_h-KDmAEblROyMUgePPqak,5077
|
519
520
|
inspect_ai/analysis/beta/_dataframe/events/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
520
|
-
inspect_ai/analysis/beta/_dataframe/events/columns.py,sha256=
|
521
|
-
inspect_ai/analysis/beta/_dataframe/events/
|
521
|
+
inspect_ai/analysis/beta/_dataframe/events/columns.py,sha256=VH6U0zXiBEK_4dXskh1OhksYjAY7KvpZWMTv9w0bBbA,2912
|
522
|
+
inspect_ai/analysis/beta/_dataframe/events/extract.py,sha256=XxCMslBjzbI_q74bG47w5f9ncBzqJxMXSrCBJ3g23NE,705
|
523
|
+
inspect_ai/analysis/beta/_dataframe/events/table.py,sha256=KDZbhCgwevfwAHdSGIhUIvzBKqJWFzRe6OalxJpDRE8,2869
|
522
524
|
inspect_ai/analysis/beta/_dataframe/messages/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
523
|
-
inspect_ai/analysis/beta/_dataframe/messages/columns.py,sha256=
|
525
|
+
inspect_ai/analysis/beta/_dataframe/messages/columns.py,sha256=T8dbyGsg6ut7G0xbnfxVAaJK43EmfvDnCbIhIvrmbB8,1728
|
524
526
|
inspect_ai/analysis/beta/_dataframe/messages/extract.py,sha256=B7st9zoXSIj_sXm9-h_fLaRtb3ybIgXcOk41IfOxhGA,660
|
525
|
-
inspect_ai/analysis/beta/_dataframe/messages/table.py,sha256=
|
527
|
+
inspect_ai/analysis/beta/_dataframe/messages/table.py,sha256=pAESqFx9WzAyuQCsjrzD0ShbJT1yFf7Con6cu10etbs,2519
|
526
528
|
inspect_ai/analysis/beta/_dataframe/samples/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
527
|
-
inspect_ai/analysis/beta/_dataframe/samples/columns.py,sha256=
|
528
|
-
inspect_ai/analysis/beta/_dataframe/samples/extract.py,sha256=
|
529
|
-
inspect_ai/analysis/beta/_dataframe/samples/table.py,sha256=
|
529
|
+
inspect_ai/analysis/beta/_dataframe/samples/columns.py,sha256=Ffi734379rSwrkDth3wyMGVIsrepp8fjXKFVWUS-CQw,2493
|
530
|
+
inspect_ai/analysis/beta/_dataframe/samples/extract.py,sha256=WkalxZbV4Fwx1hCJIdC3D6JeE51tPBNkufjQ762eWEQ,1404
|
531
|
+
inspect_ai/analysis/beta/_dataframe/samples/table.py,sha256=c9CMdrcCcZECcvNn1jsZj_oh1RirX9aSOLJxFJ9HnY4,12252
|
530
532
|
inspect_ai/approval/__init__.py,sha256=Bqq4GFljOqKaIUkuCvhlFv89TfJpvbuO_R0jVyjb8VI,379
|
531
533
|
inspect_ai/approval/_apply.py,sha256=v9v9XfvBt203TbvdB5aJbHR_SqC23xcEjBPpESbXKg8,2146
|
532
534
|
inspect_ai/approval/_approval.py,sha256=twQcEvfU3-hPdsG785ak8OvRMOzMa00-UQAdz9Mh8Fo,863
|
@@ -560,7 +562,7 @@ inspect_ai/log/__init__.py,sha256=PZsopxfD0ipS6g_5CMipbttrxI1R1fy10Si0zs4lO38,25
|
|
560
562
|
inspect_ai/log/_bundle.py,sha256=5Uy-s64_SFokZ7WRzti9mD7yoKrd2sOzdvqKyahoiC4,8045
|
561
563
|
inspect_ai/log/_condense.py,sha256=OedMphK5Q2YPuY1cnoAM7tGsyVIU6Kwrv3oIeb3dFmY,10881
|
562
564
|
inspect_ai/log/_convert.py,sha256=afEOHkaQtCkTWdwyFweGTEzLq0VVdhTjhr0IgVX5W7I,3324
|
563
|
-
inspect_ai/log/_file.py,sha256=
|
565
|
+
inspect_ai/log/_file.py,sha256=PPYVC1TbvGpWjUyke_in84fNQQ-U-ybZdMV2jbX0ugU,19503
|
564
566
|
inspect_ai/log/_log.py,sha256=2WiLyUPygrq4CyzRoDCT5-lqRzh-HMkt-pHEfuEt0sE,29994
|
565
567
|
inspect_ai/log/_message.py,sha256=QofM_JZF_x3k_5ta1uQzoN_VnMoUhXFnqWurIn9FXOY,1999
|
566
568
|
inspect_ai/log/_model.py,sha256=8tEhFZc1tBFgA6A_spXTqTBdvbzZP5t7ul7DiloHRWk,1698
|
@@ -661,7 +663,7 @@ inspect_ai/solver/_plan.py,sha256=lpbjIbBpiPzud7jaHqA81ZFFO0gjt_4EW0blzG4DquA,72
|
|
661
663
|
inspect_ai/solver/_prompt.py,sha256=n2gkRUMSRKViDBL4WtepNoMx7zidIkQgOHLGllP6WVo,4955
|
662
664
|
inspect_ai/solver/_run.py,sha256=k-IYoFpyNq8-HTFgQck4Akvs3OtopiL4qRWj8_yLhvY,1763
|
663
665
|
inspect_ai/solver/_solver.py,sha256=UJ2CvmJr74n65x4xipZTxNzGfvUyuTHnnRSY0QqNo5I,9563
|
664
|
-
inspect_ai/solver/_task_state.py,sha256=
|
666
|
+
inspect_ai/solver/_task_state.py,sha256=DMbaDuXOViZlCez5pEv3Y9czIPk61qyfycNjSIgprOI,14704
|
665
667
|
inspect_ai/solver/_transcript.py,sha256=kdnkR8243NXlIvcDpZ4nb1XKT7pBYHLk5V26MtwP2EU,1047
|
666
668
|
inspect_ai/solver/_use_tools.py,sha256=VmhCjKpkWgifOS20toBcK2bFDmyPqfxkBvcHs_-nv58,2235
|
667
669
|
inspect_ai/solver/_util.py,sha256=pthrf-CzC6FnQYSUFLXTYM4wFEJptZrh5POTmV-Jtow,446
|
@@ -681,7 +683,7 @@ inspect_ai/tool/beta.py,sha256=KQYntN2MLiIHp4Gf4GXv3QO3aYHBBaP-npkluTT-aDM,153
|
|
681
683
|
inspect_ai/tool/_mcp/__init__.py,sha256=vqtlBle1T_jlRQPvLKJbLgW5h_I0Ee33nDBI-rCtIeA,314
|
682
684
|
inspect_ai/tool/_mcp/_context.py,sha256=tKQuBZ5ooRvDEW0ffACejdjKi7f8VFfYRn5uaMZGDPw,405
|
683
685
|
inspect_ai/tool/_mcp/_mcp.py,sha256=gNTlNTzMRU5L-h4_EGPqosbPLumSdIh3_25ofrGodqs,10599
|
684
|
-
inspect_ai/tool/_mcp/_sandbox.py,sha256=
|
686
|
+
inspect_ai/tool/_mcp/_sandbox.py,sha256=tW3-kqUrtKlbPEUtyIP2Ywh7FhakCQA9dyeabmLnPuU,4444
|
685
687
|
inspect_ai/tool/_mcp/_types.py,sha256=RT9ZRugYR3ArKe54_fuYxeenlWa_os0_DYadVIJEHlM,769
|
686
688
|
inspect_ai/tool/_mcp/connection.py,sha256=c1VRVtN90f2KptKCXlQ6fAX2Bxx8HXu3_ZvYmt_35dw,1901
|
687
689
|
inspect_ai/tool/_mcp/sampling.py,sha256=YDfrYj6GAec4R3JkQpUc_fPROQUpRARvbUPq7FVKSQ0,4001
|
@@ -737,9 +739,9 @@ inspect_ai/util/_sandbox/docker/internal.py,sha256=c8X8TLrBPOvsfnq5TkMlb_bzTALyc
|
|
737
739
|
inspect_ai/util/_sandbox/docker/prereqs.py,sha256=0j6_OauBBnVlpBleADcZavIAAQZy4WewVjbRn9c0stg,3355
|
738
740
|
inspect_ai/util/_sandbox/docker/service.py,sha256=hhHIWH1VDFLwehdGd19aUBD_VKfDO3GCPxpw1HSwVQk,2437
|
739
741
|
inspect_ai/util/_sandbox/docker/util.py,sha256=EeInihCNXgUWxaqZ4dNOJd719kXL2_jr63QCoXn68vA,3154
|
740
|
-
inspect_ai-0.3.
|
741
|
-
inspect_ai-0.3.
|
742
|
-
inspect_ai-0.3.
|
743
|
-
inspect_ai-0.3.
|
744
|
-
inspect_ai-0.3.
|
745
|
-
inspect_ai-0.3.
|
742
|
+
inspect_ai-0.3.96.dist-info/licenses/LICENSE,sha256=xZPCr8gTiFIerrA_DRpLAbw-UUftnLFsHxKeW-NTtq8,1081
|
743
|
+
inspect_ai-0.3.96.dist-info/METADATA,sha256=GHfPnN-m8cQUeqysFWpqn5z0tfuMt7JCeTNN1tYJYOA,5438
|
744
|
+
inspect_ai-0.3.96.dist-info/WHEEL,sha256=DnLRTWE75wApRYVsjgc6wsVswC54sMSJhAEd4xhDpBk,91
|
745
|
+
inspect_ai-0.3.96.dist-info/entry_points.txt,sha256=WGGLmzTzDWLzYfiyovSY6oEKuf-gqzSDNOb5V-hk3fM,54
|
746
|
+
inspect_ai-0.3.96.dist-info/top_level.txt,sha256=Tp3za30CHXJEKLk8xLe9qGsW4pBzJpEIOMHOHNCXiVo,11
|
747
|
+
inspect_ai-0.3.96.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|