inspect-ai 0.3.95__py3-none-any.whl → 0.3.97__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/_eval/eval.py +10 -2
- inspect_ai/_eval/task/util.py +32 -3
- inspect_ai/_util/local_server.py +16 -0
- inspect_ai/_util/registry.py +7 -0
- inspect_ai/_util/timer.py +13 -0
- inspect_ai/_view/www/dist/assets/index.css +275 -195
- inspect_ai/_view/www/dist/assets/index.js +8568 -7376
- inspect_ai/_view/www/src/app/App.css +1 -0
- inspect_ai/_view/www/src/app/App.tsx +27 -10
- inspect_ai/_view/www/src/app/appearance/icons.ts +5 -0
- inspect_ai/_view/www/src/app/content/RecordTree.module.css +22 -0
- inspect_ai/_view/www/src/app/content/RecordTree.tsx +370 -0
- inspect_ai/_view/www/src/app/content/RenderedContent.module.css +5 -0
- inspect_ai/_view/www/src/app/content/RenderedContent.tsx +32 -19
- inspect_ai/_view/www/src/app/content/record_processors/store.ts +101 -0
- inspect_ai/_view/www/src/app/content/record_processors/types.ts +3 -0
- inspect_ai/_view/www/src/app/content/types.ts +5 -0
- inspect_ai/_view/www/src/app/log-view/LogView.tsx +1 -0
- inspect_ai/_view/www/src/app/log-view/LogViewContainer.tsx +35 -28
- inspect_ai/_view/www/src/app/log-view/LogViewLayout.tsx +1 -8
- inspect_ai/_view/www/src/app/log-view/navbar/PrimaryBar.tsx +2 -4
- inspect_ai/_view/www/src/app/log-view/navbar/ResultsPanel.tsx +13 -3
- inspect_ai/_view/www/src/app/log-view/navbar/ScoreGrid.module.css +15 -0
- inspect_ai/_view/www/src/app/log-view/navbar/ScoreGrid.tsx +14 -10
- inspect_ai/_view/www/src/app/log-view/tabs/InfoTab.tsx +9 -3
- inspect_ai/_view/www/src/app/log-view/tabs/JsonTab.tsx +1 -3
- inspect_ai/_view/www/src/app/log-view/tabs/SamplesTab.tsx +8 -2
- inspect_ai/_view/www/src/app/log-view/types.ts +1 -0
- inspect_ai/_view/www/src/app/plan/ModelCard.module.css +7 -0
- inspect_ai/_view/www/src/app/plan/ModelCard.tsx +5 -2
- inspect_ai/_view/www/src/app/plan/PlanCard.tsx +13 -8
- inspect_ai/_view/www/src/app/routing/navigationHooks.ts +63 -8
- inspect_ai/_view/www/src/app/routing/url.ts +45 -0
- inspect_ai/_view/www/src/app/samples/InlineSampleDisplay.module.css +2 -1
- inspect_ai/_view/www/src/app/samples/InlineSampleDisplay.tsx +15 -8
- inspect_ai/_view/www/src/app/samples/SampleDialog.module.css +3 -0
- inspect_ai/_view/www/src/app/samples/SampleDialog.tsx +16 -5
- inspect_ai/_view/www/src/app/samples/SampleDisplay.module.css +9 -1
- inspect_ai/_view/www/src/app/samples/SampleDisplay.tsx +68 -31
- inspect_ai/_view/www/src/app/samples/chat/ChatMessage.module.css +12 -7
- inspect_ai/_view/www/src/app/samples/chat/ChatMessage.tsx +17 -5
- inspect_ai/_view/www/src/app/samples/chat/ChatMessageRow.module.css +9 -0
- inspect_ai/_view/www/src/app/samples/chat/ChatMessageRow.tsx +48 -18
- inspect_ai/_view/www/src/app/samples/chat/ChatView.tsx +0 -1
- inspect_ai/_view/www/src/app/samples/chat/ChatViewVirtualList.module.css +4 -0
- inspect_ai/_view/www/src/app/samples/chat/ChatViewVirtualList.tsx +41 -1
- inspect_ai/_view/www/src/app/samples/chat/messages.ts +7 -0
- inspect_ai/_view/www/src/app/samples/chat/tools/ToolCallView.module.css +0 -3
- inspect_ai/_view/www/src/app/samples/chat/tools/ToolCallView.tsx +1 -1
- inspect_ai/_view/www/src/app/samples/chat/tools/ToolInput.module.css +1 -1
- inspect_ai/_view/www/src/app/samples/chat/tools/ToolOutput.module.css +1 -1
- inspect_ai/_view/www/src/app/samples/descriptor/score/NumericScoreDescriptor.tsx +5 -1
- inspect_ai/_view/www/src/app/samples/descriptor/score/PassFailScoreDescriptor.tsx +11 -6
- inspect_ai/_view/www/src/app/samples/list/SampleList.tsx +7 -0
- inspect_ai/_view/www/src/app/samples/list/SampleRow.tsx +5 -18
- inspect_ai/_view/www/src/app/samples/sample-tools/SortFilter.tsx +1 -1
- inspect_ai/_view/www/src/app/samples/scores/SampleScoresGrid.tsx +18 -5
- inspect_ai/_view/www/src/app/samples/scores/SampleScoresView.module.css +0 -6
- inspect_ai/_view/www/src/app/samples/scores/SampleScoresView.tsx +4 -1
- inspect_ai/_view/www/src/app/samples/transcript/ApprovalEventView.tsx +4 -2
- inspect_ai/_view/www/src/app/samples/transcript/ErrorEventView.tsx +6 -4
- inspect_ai/_view/www/src/app/samples/transcript/InfoEventView.module.css +1 -1
- inspect_ai/_view/www/src/app/samples/transcript/InfoEventView.tsx +13 -6
- inspect_ai/_view/www/src/app/samples/transcript/InputEventView.tsx +6 -4
- inspect_ai/_view/www/src/app/samples/transcript/LoggerEventView.tsx +4 -2
- inspect_ai/_view/www/src/app/samples/transcript/ModelEventView.tsx +11 -8
- inspect_ai/_view/www/src/app/samples/transcript/SampleInitEventView.tsx +14 -8
- inspect_ai/_view/www/src/app/samples/transcript/SampleLimitEventView.tsx +13 -8
- inspect_ai/_view/www/src/app/samples/transcript/SandboxEventView.tsx +25 -16
- inspect_ai/_view/www/src/app/samples/transcript/ScoreEventView.tsx +7 -5
- inspect_ai/_view/www/src/app/samples/transcript/SpanEventView.tsx +11 -28
- inspect_ai/_view/www/src/app/samples/transcript/StepEventView.tsx +12 -20
- inspect_ai/_view/www/src/app/samples/transcript/SubtaskEventView.tsx +12 -31
- inspect_ai/_view/www/src/app/samples/transcript/ToolEventView.tsx +25 -29
- inspect_ai/_view/www/src/app/samples/transcript/TranscriptVirtualList.tsx +297 -0
- inspect_ai/_view/www/src/app/samples/transcript/TranscriptVirtualListComponent.module.css +0 -8
- inspect_ai/_view/www/src/app/samples/transcript/TranscriptVirtualListComponent.tsx +43 -25
- inspect_ai/_view/www/src/app/samples/transcript/event/EventPanel.module.css +43 -0
- inspect_ai/_view/www/src/app/samples/transcript/event/EventPanel.tsx +109 -43
- inspect_ai/_view/www/src/app/samples/transcript/state/StateEventView.tsx +19 -8
- inspect_ai/_view/www/src/app/samples/transcript/transform/treeify.ts +128 -60
- inspect_ai/_view/www/src/app/samples/transcript/transform/utils.ts +14 -4
- inspect_ai/_view/www/src/app/samples/transcript/types.ts +6 -4
- inspect_ai/_view/www/src/app/types.ts +12 -1
- inspect_ai/_view/www/src/components/Card.css +6 -3
- inspect_ai/_view/www/src/components/Card.tsx +15 -2
- inspect_ai/_view/www/src/components/CopyButton.tsx +4 -6
- inspect_ai/_view/www/src/components/ExpandablePanel.module.css +20 -14
- inspect_ai/_view/www/src/components/ExpandablePanel.tsx +17 -22
- inspect_ai/_view/www/src/components/LargeModal.tsx +5 -1
- inspect_ai/_view/www/src/components/LiveVirtualList.tsx +25 -1
- inspect_ai/_view/www/src/components/MarkdownDiv.css +4 -0
- inspect_ai/_view/www/src/components/MarkdownDiv.tsx +2 -2
- inspect_ai/_view/www/src/components/TabSet.module.css +6 -1
- inspect_ai/_view/www/src/components/TabSet.tsx +8 -2
- inspect_ai/_view/www/src/state/hooks.ts +83 -13
- inspect_ai/_view/www/src/state/logPolling.ts +2 -2
- inspect_ai/_view/www/src/state/logSlice.ts +1 -2
- inspect_ai/_view/www/src/state/logsSlice.ts +9 -9
- inspect_ai/_view/www/src/state/samplePolling.ts +1 -1
- inspect_ai/_view/www/src/state/sampleSlice.ts +134 -7
- inspect_ai/_view/www/src/state/scoring.ts +1 -1
- inspect_ai/_view/www/src/state/scrolling.ts +39 -6
- inspect_ai/_view/www/src/state/store.ts +5 -0
- inspect_ai/_view/www/src/state/store_filter.ts +47 -44
- inspect_ai/_view/www/src/utils/debugging.ts +95 -0
- inspect_ai/_view/www/src/utils/format.ts +2 -2
- inspect_ai/_view/www/src/utils/json.ts +29 -0
- inspect_ai/agent/__init__.py +2 -1
- inspect_ai/agent/_agent.py +12 -0
- inspect_ai/agent/_react.py +184 -48
- inspect_ai/agent/_types.py +15 -2
- inspect_ai/analysis/beta/__init__.py +11 -3
- inspect_ai/analysis/beta/_dataframe/columns.py +11 -16
- inspect_ai/analysis/beta/_dataframe/evals/table.py +101 -39
- inspect_ai/analysis/beta/_dataframe/events/columns.py +50 -0
- inspect_ai/analysis/beta/_dataframe/events/extract.py +26 -0
- inspect_ai/analysis/beta/_dataframe/events/table.py +77 -3
- inspect_ai/analysis/beta/_dataframe/extract.py +44 -25
- inspect_ai/analysis/beta/_dataframe/messages/columns.py +1 -1
- inspect_ai/analysis/beta/_dataframe/messages/table.py +30 -29
- inspect_ai/analysis/beta/_dataframe/progress.py +56 -0
- inspect_ai/analysis/beta/_dataframe/record.py +13 -9
- inspect_ai/analysis/beta/_dataframe/samples/columns.py +8 -4
- inspect_ai/analysis/beta/_dataframe/samples/extract.py +5 -33
- inspect_ai/analysis/beta/_dataframe/samples/table.py +211 -60
- inspect_ai/analysis/beta/_dataframe/util.py +33 -28
- inspect_ai/log/_file.py +9 -2
- inspect_ai/model/_call_tools.py +1 -1
- inspect_ai/model/_providers/anthropic.py +18 -5
- inspect_ai/model/_providers/azureai.py +7 -2
- inspect_ai/model/_providers/util/llama31.py +3 -3
- inspect_ai/solver/_task_state.py +1 -1
- inspect_ai/tool/_mcp/_sandbox.py +17 -14
- {inspect_ai-0.3.95.dist-info → inspect_ai-0.3.97.dist-info}/METADATA +2 -2
- {inspect_ai-0.3.95.dist-info → inspect_ai-0.3.97.dist-info}/RECORD +140 -133
- {inspect_ai-0.3.95.dist-info → inspect_ai-0.3.97.dist-info}/WHEEL +1 -1
- inspect_ai/_view/www/src/app/samples/transcript/TranscriptView.module.css +0 -48
- inspect_ai/_view/www/src/app/samples/transcript/TranscriptView.tsx +0 -276
- {inspect_ai-0.3.95.dist-info → inspect_ai-0.3.97.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.95.dist-info → inspect_ai-0.3.97.dist-info}/licenses/LICENSE +0 -0
- {inspect_ai-0.3.95.dist-info → inspect_ai-0.3.97.dist-info}/top_level.txt +0 -0
@@ -1,3 +1,4 @@
|
|
1
|
+
from datetime import datetime
|
1
2
|
from typing import Any, Callable, Mapping, Type
|
2
3
|
|
3
4
|
from jsonpath_ng import JSONPath # type: ignore
|
@@ -7,6 +8,12 @@ from typing_extensions import override
|
|
7
8
|
from inspect_ai.log._transcript import Event
|
8
9
|
|
9
10
|
from ..columns import Column, ColumnType
|
11
|
+
from .extract import (
|
12
|
+
completion_as_str,
|
13
|
+
model_event_input_as_str,
|
14
|
+
tool_choice_as_str,
|
15
|
+
tool_view_as_str,
|
16
|
+
)
|
10
17
|
|
11
18
|
|
12
19
|
class EventColumn(Column):
|
@@ -35,3 +42,46 @@ class EventColumn(Column):
|
|
35
42
|
@override
|
36
43
|
def path_schema(self) -> Mapping[str, Any] | None:
|
37
44
|
return None
|
45
|
+
|
46
|
+
|
47
|
+
EventInfo: list[Column] = [
|
48
|
+
EventColumn("event", path="event"),
|
49
|
+
EventColumn("span_id", path="span_id"),
|
50
|
+
]
|
51
|
+
"""Event basic information columns."""
|
52
|
+
|
53
|
+
EventTiming: list[Column] = [
|
54
|
+
EventColumn("timestamp", path="timestamp", type=datetime),
|
55
|
+
EventColumn("completed", path="completed", type=datetime),
|
56
|
+
EventColumn("working_start", path="working_start"),
|
57
|
+
EventColumn("working_time", path="working_time"),
|
58
|
+
]
|
59
|
+
"""Event timing columns."""
|
60
|
+
|
61
|
+
ModelEventColumns: list[Column] = [
|
62
|
+
EventColumn("model_event_model", path="model"),
|
63
|
+
EventColumn("model_event_role", path="role"),
|
64
|
+
EventColumn("model_event_input", path=model_event_input_as_str),
|
65
|
+
EventColumn("model_event_tools", path="tools"),
|
66
|
+
EventColumn("model_event_tool_choice", path=tool_choice_as_str),
|
67
|
+
EventColumn("model_event_config", path="config"),
|
68
|
+
EventColumn("model_event_usage", path="output.usage"),
|
69
|
+
EventColumn("model_event_time", path="output.time"),
|
70
|
+
EventColumn("model_event_completion", path=completion_as_str),
|
71
|
+
EventColumn("model_event_retries", path="retries"),
|
72
|
+
EventColumn("model_event_error", path="error"),
|
73
|
+
EventColumn("model_event_cache", path="cache"),
|
74
|
+
EventColumn("model_event_call", path="call"),
|
75
|
+
]
|
76
|
+
"""Model event columns."""
|
77
|
+
|
78
|
+
ToolEventColumns: list[Column] = [
|
79
|
+
EventColumn("tool_event_function", path="function"),
|
80
|
+
EventColumn("tool_event_arguments", path="arguments"),
|
81
|
+
EventColumn("tool_event_view", path=tool_view_as_str),
|
82
|
+
EventColumn("tool_event_result", path="result"),
|
83
|
+
EventColumn("tool_event_truncated", path="truncated"),
|
84
|
+
EventColumn("tool_event_error_type", path="error.type"),
|
85
|
+
EventColumn("tool_event_error_message", path="error.message"),
|
86
|
+
]
|
87
|
+
"""Tool event columns."""
|
@@ -0,0 +1,26 @@
|
|
1
|
+
from inspect_ai.log._transcript import ModelEvent, ToolEvent
|
2
|
+
|
3
|
+
from ..extract import messages_as_str
|
4
|
+
|
5
|
+
|
6
|
+
def model_event_input_as_str(event: ModelEvent) -> str:
|
7
|
+
return messages_as_str(event.input)
|
8
|
+
|
9
|
+
|
10
|
+
def tool_choice_as_str(event: ModelEvent) -> str:
|
11
|
+
if isinstance(event.tool_choice, str):
|
12
|
+
return event.tool_choice
|
13
|
+
else:
|
14
|
+
return event.tool_choice.name
|
15
|
+
|
16
|
+
|
17
|
+
def completion_as_str(event: ModelEvent) -> str:
|
18
|
+
return event.output.completion
|
19
|
+
|
20
|
+
|
21
|
+
def tool_view_as_str(event: ToolEvent) -> str | None:
|
22
|
+
if event.view is not None:
|
23
|
+
title = f"{event.view.title}\n\n" if event.view.title is not None else ""
|
24
|
+
return f"{title}{event.view.content}"
|
25
|
+
else:
|
26
|
+
return None
|
@@ -1,14 +1,88 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
|
-
from typing import TYPE_CHECKING
|
3
|
+
from typing import TYPE_CHECKING, Callable, Literal, Sequence, TypeAlias
|
4
|
+
|
5
|
+
from inspect_ai.analysis.beta._dataframe.events.columns import EventInfo
|
6
|
+
from inspect_ai.log._file import list_eval_logs
|
7
|
+
from inspect_ai.log._transcript import Event
|
4
8
|
|
5
9
|
if TYPE_CHECKING:
|
6
10
|
import pandas as pd
|
7
11
|
|
12
|
+
from typing_extensions import overload
|
13
|
+
|
14
|
+
from ..columns import Column, ColumnError
|
15
|
+
from ..samples.table import EventsDetail, _read_samples_df
|
8
16
|
from ..util import LogPaths, verify_prerequisites
|
9
17
|
|
18
|
+
EventFilter: TypeAlias = Callable[[Event], bool]
|
19
|
+
"""Filter for `events_df()` rows."""
|
20
|
+
|
21
|
+
|
22
|
+
@overload
|
23
|
+
def events_df(
|
24
|
+
logs: LogPaths = list_eval_logs(),
|
25
|
+
columns: Sequence[Column] = EventInfo,
|
26
|
+
filter: EventFilter | None = None,
|
27
|
+
strict: Literal[True] = True,
|
28
|
+
parallel: bool | int = False,
|
29
|
+
quiet: bool = False,
|
30
|
+
) -> "pd.DataFrame": ...
|
31
|
+
|
10
32
|
|
11
|
-
|
33
|
+
@overload
|
34
|
+
def events_df(
|
35
|
+
logs: LogPaths = list_eval_logs(),
|
36
|
+
columns: Sequence[Column] = EventInfo,
|
37
|
+
filter: EventFilter | None = None,
|
38
|
+
strict: Literal[False] = False,
|
39
|
+
parallel: bool | int = False,
|
40
|
+
quiet: bool = False,
|
41
|
+
) -> tuple["pd.DataFrame", list[ColumnError]]: ...
|
42
|
+
|
43
|
+
|
44
|
+
def events_df(
|
45
|
+
logs: LogPaths = list_eval_logs(),
|
46
|
+
columns: Sequence[Column] = EventInfo,
|
47
|
+
filter: EventFilter | None = None,
|
48
|
+
strict: bool = True,
|
49
|
+
parallel: bool | int = False,
|
50
|
+
quiet: bool = False,
|
51
|
+
) -> "pd.DataFrame" | tuple["pd.DataFrame", list[ColumnError]]:
|
52
|
+
"""Read a dataframe containing events from a set of evals.
|
53
|
+
|
54
|
+
Args:
|
55
|
+
logs: One or more paths to log files or log directories.
|
56
|
+
Defaults to the contents of the currently active log directory
|
57
|
+
(e.g. ./logs or INSPECT_LOG_DIR).
|
58
|
+
columns: Specification for what columns to read from log files.
|
59
|
+
filter: Callable that filters event types.
|
60
|
+
strict: Raise import errors immediately. Defaults to `True`.
|
61
|
+
If `False` then a tuple of `DataFrame` and errors is returned.
|
62
|
+
parallel: If `True`, use `ProcessPoolExecutor` to read logs in parallel
|
63
|
+
(with workers based on `mp.cpu_count()`, capped at 8). If `int`, read
|
64
|
+
in parallel with the specified number of workers. If `False` (the default)
|
65
|
+
do not read in parallel.
|
66
|
+
quiet: If `True` do not print any output or progress (defaults to `False`).
|
67
|
+
|
68
|
+
Returns:
|
69
|
+
For `strict`, a Pandas `DataFrame` with information for the specified logs.
|
70
|
+
For `strict=False`, a tuple of Pandas `DataFrame` and a dictionary of errors
|
71
|
+
encountered (by log file) during import.
|
72
|
+
"""
|
12
73
|
verify_prerequisites()
|
13
74
|
|
14
|
-
|
75
|
+
# resolve filter/detail
|
76
|
+
if callable(filter):
|
77
|
+
detail = EventsDetail(filter=filter)
|
78
|
+
else:
|
79
|
+
detail = EventsDetail()
|
80
|
+
|
81
|
+
return _read_samples_df(
|
82
|
+
logs=logs,
|
83
|
+
columns=columns,
|
84
|
+
strict=strict,
|
85
|
+
detail=detail,
|
86
|
+
progress=not quiet,
|
87
|
+
parallel=parallel,
|
88
|
+
)
|
@@ -5,11 +5,16 @@ from typing import Any, cast
|
|
5
5
|
import shortuuid
|
6
6
|
from pydantic import BaseModel, JsonValue
|
7
7
|
|
8
|
-
from inspect_ai.
|
8
|
+
from inspect_ai.model._chat_message import (
|
9
|
+
ChatMessage,
|
10
|
+
ChatMessageAssistant,
|
11
|
+
ChatMessageTool,
|
12
|
+
ChatMessageUser,
|
13
|
+
)
|
9
14
|
|
10
15
|
|
11
16
|
def model_to_record(model: BaseModel) -> dict[str, JsonValue]:
|
12
|
-
return cast(dict[str, JsonValue],
|
17
|
+
return cast(dict[str, JsonValue], model.model_dump(mode="json", exclude_none=True))
|
13
18
|
|
14
19
|
|
15
20
|
def list_as_str(x: JsonValue) -> str:
|
@@ -21,34 +26,48 @@ def score_values(x: JsonValue) -> dict[str, JsonValue]:
|
|
21
26
|
return {k: v["value"] for k, v in scores.items()}
|
22
27
|
|
23
28
|
|
24
|
-
def
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
+
def auto_id(base: str, index: str) -> str:
|
30
|
+
seed = f"{base}_{index}"
|
31
|
+
hash_bytes = hashlib.md5(seed.encode("utf-8")).digest()
|
32
|
+
long_uuid = uuid.UUID(bytes=hash_bytes)
|
33
|
+
return shortuuid.encode(long_uuid)
|
29
34
|
|
30
35
|
|
31
|
-
def messages_as_str(
|
32
|
-
if isinstance(
|
33
|
-
messages =
|
34
|
-
|
35
|
-
else:
|
36
|
-
raise ValueError(f"Unexpected type for messages: {type(x)}")
|
36
|
+
def messages_as_str(messages: str | list[ChatMessage]) -> str:
|
37
|
+
if isinstance(messages, str):
|
38
|
+
messages = [ChatMessageUser(content=messages)]
|
39
|
+
return "\n\n".join([message_as_str(message) for message in messages])
|
37
40
|
|
38
41
|
|
39
|
-
def message_as_str(message:
|
40
|
-
|
42
|
+
def message_as_str(message: ChatMessage) -> str:
|
43
|
+
transcript: list[str] = []
|
44
|
+
role = message.role
|
45
|
+
content = message.text.strip() if message.text else ""
|
41
46
|
|
47
|
+
# assistant messages with tool calls
|
48
|
+
if isinstance(message, ChatMessageAssistant) and message.tool_calls is not None:
|
49
|
+
entry = f"{role}:\n{content}\n"
|
42
50
|
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
else:
|
47
|
-
return "\n".join([c["text"] if c["type"] == "text" else "" for c in content])
|
51
|
+
for tool in message.tool_calls:
|
52
|
+
func_name = tool.function
|
53
|
+
args = tool.arguments
|
48
54
|
|
55
|
+
if isinstance(args, dict):
|
56
|
+
args_text = "\n".join(f"{k}: {v}" for k, v in args.items())
|
57
|
+
entry += f"\nTool Call: {func_name}\nArguments:\n{args_text}"
|
58
|
+
else:
|
59
|
+
entry += f"\nTool Call: {func_name}\nArguments: {args}"
|
49
60
|
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
61
|
+
transcript.append(entry)
|
62
|
+
|
63
|
+
# tool responses with errors
|
64
|
+
elif isinstance(message, ChatMessageTool) and message.error is not None:
|
65
|
+
func_name = message.function or "unknown"
|
66
|
+
entry = f"{role}:\n{content}\n\nError in tool call '{func_name}':\n{message.error.message}\n"
|
67
|
+
transcript.append(entry)
|
68
|
+
|
69
|
+
# normal messages
|
70
|
+
else:
|
71
|
+
transcript.append(f"{role}:\n{content}\n")
|
72
|
+
|
73
|
+
return "\n".join(transcript)
|
@@ -43,8 +43,8 @@ class MessageColumn(Column):
|
|
43
43
|
|
44
44
|
MessageContent: list[Column] = [
|
45
45
|
MessageColumn("role", path="role", required=True),
|
46
|
-
MessageColumn("content", path=message_text),
|
47
46
|
MessageColumn("source", path="source"),
|
47
|
+
MessageColumn("content", path=message_text),
|
48
48
|
]
|
49
49
|
"""Message content columns."""
|
50
50
|
|
@@ -1,7 +1,8 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
|
-
from typing import TYPE_CHECKING, Callable, Literal, TypeAlias
|
3
|
+
from typing import TYPE_CHECKING, Callable, Literal, Sequence, TypeAlias
|
4
4
|
|
5
|
+
from inspect_ai.log._file import list_eval_logs
|
5
6
|
from inspect_ai.model._chat_message import ChatMessage
|
6
7
|
|
7
8
|
if TYPE_CHECKING:
|
@@ -9,58 +10,60 @@ if TYPE_CHECKING:
|
|
9
10
|
|
10
11
|
from typing_extensions import overload
|
11
12
|
|
12
|
-
from ..columns import Column,
|
13
|
+
from ..columns import Column, ColumnError
|
13
14
|
from ..samples.table import MessagesDetail, _read_samples_df
|
14
15
|
from ..util import LogPaths, verify_prerequisites
|
15
16
|
from .columns import MessageColumns
|
16
17
|
|
17
|
-
MessageFilter: TypeAlias =
|
18
|
-
list[Literal["system", "user", "assistant", "tool"]] | Callable[[ChatMessage], bool]
|
19
|
-
)
|
18
|
+
MessageFilter: TypeAlias = Callable[[ChatMessage], bool]
|
20
19
|
"""Filter for `messages_df()` rows."""
|
21
20
|
|
22
21
|
|
23
22
|
@overload
|
24
23
|
def messages_df(
|
25
|
-
logs: LogPaths,
|
26
|
-
columns:
|
24
|
+
logs: LogPaths = list_eval_logs(),
|
25
|
+
columns: Sequence[Column] = MessageColumns,
|
27
26
|
filter: MessageFilter | None = None,
|
28
|
-
recursive: bool = True,
|
29
|
-
reverse: bool = False,
|
30
27
|
strict: Literal[True] = True,
|
28
|
+
parallel: bool | int = False,
|
29
|
+
quiet: bool = False,
|
31
30
|
) -> "pd.DataFrame": ...
|
32
31
|
|
33
32
|
|
34
33
|
@overload
|
35
34
|
def messages_df(
|
36
|
-
logs: LogPaths,
|
37
|
-
columns:
|
35
|
+
logs: LogPaths = list_eval_logs(),
|
36
|
+
columns: Sequence[Column] = MessageColumns,
|
38
37
|
filter: MessageFilter | None = None,
|
39
|
-
recursive: bool = True,
|
40
|
-
reverse: bool = False,
|
41
38
|
strict: Literal[False] = False,
|
42
|
-
|
39
|
+
parallel: bool | int = False,
|
40
|
+
quiet: bool = False,
|
41
|
+
) -> tuple["pd.DataFrame", list[ColumnError]]: ...
|
43
42
|
|
44
43
|
|
45
44
|
def messages_df(
|
46
|
-
logs: LogPaths,
|
47
|
-
columns:
|
45
|
+
logs: LogPaths = list_eval_logs(),
|
46
|
+
columns: Sequence[Column] = MessageColumns,
|
48
47
|
filter: MessageFilter | None = None,
|
49
|
-
recursive: bool = True,
|
50
|
-
reverse: bool = False,
|
51
48
|
strict: bool = True,
|
52
|
-
|
49
|
+
parallel: bool | int = False,
|
50
|
+
quiet: bool = False,
|
51
|
+
) -> "pd.DataFrame" | tuple["pd.DataFrame", list[ColumnError]]:
|
53
52
|
"""Read a dataframe containing messages from a set of evals.
|
54
53
|
|
55
54
|
Args:
|
56
55
|
logs: One or more paths to log files or log directories.
|
56
|
+
Defaults to the contents of the currently active log directory
|
57
|
+
(e.g. ./logs or INSPECT_LOG_DIR).
|
57
58
|
columns: Specification for what columns to read from log files.
|
58
|
-
filter:
|
59
|
-
recursive: Include recursive contents of directories (defaults to `True`)
|
60
|
-
reverse: Reverse the order of the dataframe (by default, items
|
61
|
-
are ordered from oldest to newest).
|
59
|
+
filter: Callable that filters messages
|
62
60
|
strict: Raise import errors immediately. Defaults to `True`.
|
63
61
|
If `False` then a tuple of `DataFrame` and errors is returned.
|
62
|
+
parallel: If `True`, use `ProcessPoolExecutor` to read logs in parallel
|
63
|
+
(with workers based on `mp.cpu_count()`, capped at 8). If `int`, read
|
64
|
+
in parallel with the specified number of workers. If `False` (the default)
|
65
|
+
do not read in parallel.
|
66
|
+
quiet: If `True` do not print any output or progress (defaults to `False`).
|
64
67
|
|
65
68
|
Returns:
|
66
69
|
For `strict`, a Pandas `DataFrame` with information for the specified logs.
|
@@ -70,18 +73,16 @@ def messages_df(
|
|
70
73
|
verify_prerequisites()
|
71
74
|
|
72
75
|
# resolve filter/detail
|
73
|
-
if filter
|
74
|
-
detail = MessagesDetail(filter=lambda m: True)
|
75
|
-
elif callable(filter):
|
76
|
+
if callable(filter):
|
76
77
|
detail = MessagesDetail(filter=filter)
|
77
78
|
else:
|
78
|
-
detail = MessagesDetail(
|
79
|
+
detail = MessagesDetail()
|
79
80
|
|
80
81
|
return _read_samples_df(
|
81
82
|
logs=logs,
|
82
83
|
columns=columns,
|
83
|
-
recursive=recursive,
|
84
|
-
reverse=reverse,
|
85
84
|
strict=strict,
|
86
85
|
detail=detail,
|
86
|
+
parallel=parallel,
|
87
|
+
progress=not quiet,
|
87
88
|
)
|
@@ -0,0 +1,56 @@
|
|
1
|
+
from contextlib import contextmanager
|
2
|
+
from typing import Iterator, Protocol
|
3
|
+
|
4
|
+
from rich.progress import (
|
5
|
+
BarColumn,
|
6
|
+
Progress,
|
7
|
+
TaskID,
|
8
|
+
TaskProgressColumn,
|
9
|
+
TextColumn,
|
10
|
+
TimeElapsedColumn,
|
11
|
+
)
|
12
|
+
|
13
|
+
|
14
|
+
class ImportProgress(Protocol):
|
15
|
+
def update(self) -> None: ...
|
16
|
+
def reset(self, description: str, completed: int, total: int) -> None: ...
|
17
|
+
|
18
|
+
|
19
|
+
class NoProgress(ImportProgress):
|
20
|
+
def update(self) -> None:
|
21
|
+
pass
|
22
|
+
|
23
|
+
def reset(self, description: str, completed: int, total: int) -> None:
|
24
|
+
pass
|
25
|
+
|
26
|
+
|
27
|
+
class RichImportProgress(ImportProgress):
|
28
|
+
def __init__(self, progress: Progress, task_id: TaskID) -> None:
|
29
|
+
self._progress = progress
|
30
|
+
self._task_id = task_id
|
31
|
+
|
32
|
+
def update(self) -> None:
|
33
|
+
self._progress.update(self._task_id, advance=1)
|
34
|
+
|
35
|
+
def reset(self, description: str, completed: int, total: int) -> None:
|
36
|
+
self._progress.reset(
|
37
|
+
self._task_id, description=description, completed=completed, total=total
|
38
|
+
)
|
39
|
+
|
40
|
+
|
41
|
+
@contextmanager
|
42
|
+
def no_progress() -> Iterator[ImportProgress]:
|
43
|
+
yield NoProgress()
|
44
|
+
|
45
|
+
|
46
|
+
@contextmanager
|
47
|
+
def import_progress(description: str, total: float | None) -> Iterator[ImportProgress]:
|
48
|
+
with Progress(
|
49
|
+
TextColumn("[progress.description]{task.description:<18}"),
|
50
|
+
BarColumn(),
|
51
|
+
TaskProgressColumn(),
|
52
|
+
TimeElapsedColumn(),
|
53
|
+
transient=True,
|
54
|
+
) as progress:
|
55
|
+
task_id = progress.add_task(description, total=total)
|
56
|
+
yield RichImportProgress(progress, task_id)
|
@@ -1,6 +1,6 @@
|
|
1
1
|
import json
|
2
2
|
from datetime import date, datetime, time, timezone
|
3
|
-
from typing import Any, Callable, Literal, Type, cast, overload
|
3
|
+
from typing import Any, Callable, Literal, Sequence, Type, cast, overload
|
4
4
|
|
5
5
|
import yaml
|
6
6
|
from jsonpath_ng import JSONPath # type: ignore
|
@@ -20,38 +20,41 @@ from .extract import model_to_record
|
|
20
20
|
|
21
21
|
@overload
|
22
22
|
def import_record(
|
23
|
+
log: EvalLog,
|
23
24
|
record: EvalLog
|
24
25
|
| EvalSampleSummary
|
25
26
|
| EvalSample
|
26
27
|
| ChatMessage
|
27
28
|
| Event
|
28
29
|
| dict[str, JsonValue],
|
29
|
-
columns:
|
30
|
+
columns: Sequence[Column],
|
30
31
|
strict: Literal[True] = True,
|
31
32
|
) -> dict[str, ColumnType]: ...
|
32
33
|
|
33
34
|
|
34
35
|
@overload
|
35
36
|
def import_record(
|
37
|
+
log: EvalLog,
|
36
38
|
record: EvalLog
|
37
39
|
| EvalSampleSummary
|
38
40
|
| EvalSample
|
39
41
|
| ChatMessage
|
40
42
|
| Event
|
41
43
|
| dict[str, JsonValue],
|
42
|
-
columns:
|
44
|
+
columns: Sequence[Column],
|
43
45
|
strict: Literal[False],
|
44
46
|
) -> tuple[dict[str, ColumnType], list[ColumnError]]: ...
|
45
47
|
|
46
48
|
|
47
49
|
def import_record(
|
50
|
+
log: EvalLog,
|
48
51
|
record: EvalLog
|
49
52
|
| EvalSampleSummary
|
50
53
|
| EvalSample
|
51
54
|
| ChatMessage
|
52
55
|
| Event
|
53
56
|
| dict[str, JsonValue],
|
54
|
-
columns:
|
57
|
+
columns: Sequence[Column],
|
55
58
|
strict: bool = True,
|
56
59
|
) -> dict[str, ColumnType] | tuple[dict[str, ColumnType], list[ColumnError]]:
|
57
60
|
# resolve the record BaseModel into a dict (and optionally a summary dict).
|
@@ -80,7 +83,7 @@ def import_record(
|
|
80
83
|
try:
|
81
84
|
result[name] = _resolve_value(value, column.type)
|
82
85
|
except ValueError as ex:
|
83
|
-
error = ColumnError(name, path=column.path,
|
86
|
+
error = ColumnError(name, path=column.path, error=ex, log=log)
|
84
87
|
if strict:
|
85
88
|
raise ValueError(str(error))
|
86
89
|
else:
|
@@ -90,10 +93,10 @@ def import_record(
|
|
90
93
|
def field_not_found(
|
91
94
|
name: str, path: JSONPath | None, required_type: str | None = None
|
92
95
|
) -> None:
|
93
|
-
|
96
|
+
ex = ValueError(
|
94
97
|
f"field not of type {required_type}" if required_type else "field not found"
|
95
98
|
)
|
96
|
-
error = ColumnError(name, path=path,
|
99
|
+
error = ColumnError(name, path=path, error=ex, log=log)
|
97
100
|
if strict:
|
98
101
|
raise ValueError(str(error))
|
99
102
|
else:
|
@@ -157,7 +160,8 @@ def import_record(
|
|
157
160
|
error = ColumnError(
|
158
161
|
column.name,
|
159
162
|
path=str(column.path) if column.path else None,
|
160
|
-
|
163
|
+
error=ex,
|
164
|
+
log=log,
|
161
165
|
)
|
162
166
|
if strict:
|
163
167
|
raise ValueError(str(error))
|
@@ -190,7 +194,7 @@ def import_record(
|
|
190
194
|
return result, errors
|
191
195
|
|
192
196
|
|
193
|
-
def resolve_duplicate_columns(columns:
|
197
|
+
def resolve_duplicate_columns(columns: Sequence[Column]) -> list[Column]:
|
194
198
|
"""Remove duplicate columns (with the later columns winning)"""
|
195
199
|
seen = set[str]()
|
196
200
|
deduped: list[Column] = []
|
@@ -7,9 +7,13 @@ from typing_extensions import override
|
|
7
7
|
from inspect_ai.log._log import EvalSample, EvalSampleSummary
|
8
8
|
|
9
9
|
from ..columns import Column, ColumnType
|
10
|
-
from ..extract import
|
10
|
+
from ..extract import list_as_str, score_values
|
11
11
|
from ..validate import resolved_schema
|
12
|
-
from .extract import
|
12
|
+
from .extract import (
|
13
|
+
sample_input_as_str,
|
14
|
+
sample_messages_as_str,
|
15
|
+
sample_path_requires_full,
|
16
|
+
)
|
13
17
|
|
14
18
|
|
15
19
|
class SampleColumn(Column):
|
@@ -54,14 +58,14 @@ class SampleColumn(Column):
|
|
54
58
|
SampleSummary: list[Column] = [
|
55
59
|
SampleColumn("id", path="id", required=True, type=str),
|
56
60
|
SampleColumn("epoch", path="epoch", required=True),
|
57
|
-
SampleColumn("input", path=
|
61
|
+
SampleColumn("input", path=sample_input_as_str, required=True),
|
58
62
|
SampleColumn("target", path="target", required=True, value=list_as_str),
|
59
63
|
SampleColumn("metadata_*", path="metadata"),
|
60
64
|
SampleColumn("score_*", path="scores", value=score_values),
|
61
65
|
SampleColumn("model_usage", path="model_usage"),
|
62
66
|
SampleColumn("total_time", path="total_time"),
|
63
67
|
SampleColumn("working_time", path="total_time"),
|
64
|
-
SampleColumn("error", path="error"),
|
68
|
+
SampleColumn("error", path="error", default=""),
|
65
69
|
SampleColumn("limit", path="limit"),
|
66
70
|
SampleColumn("retries", path="retries"),
|
67
71
|
]
|
@@ -3,45 +3,17 @@ from typing import Callable
|
|
3
3
|
from jsonpath_ng import JSONPath # type: ignore
|
4
4
|
from pydantic import JsonValue
|
5
5
|
|
6
|
-
from inspect_ai.analysis.beta._dataframe.extract import auto_id
|
7
6
|
from inspect_ai.log._log import EvalSample, EvalSampleSummary
|
8
|
-
from inspect_ai.model._chat_message import ChatMessageAssistant, ChatMessageTool
|
9
7
|
|
8
|
+
from ..extract import auto_id, messages_as_str
|
10
9
|
|
11
|
-
def sample_messages_as_str(sample: EvalSample) -> str:
|
12
|
-
# format each message for the transcript
|
13
|
-
transcript: list[str] = []
|
14
|
-
for msg in sample.messages:
|
15
|
-
role = msg.role
|
16
|
-
content = msg.text.strip() if msg.text else ""
|
17
|
-
|
18
|
-
# assistant messages with tool calls
|
19
|
-
if isinstance(msg, ChatMessageAssistant) and msg.tool_calls is not None:
|
20
|
-
entry = f"{role}:\n{content}\n"
|
21
|
-
|
22
|
-
for tool in msg.tool_calls:
|
23
|
-
func_name = tool.function
|
24
|
-
args = tool.arguments
|
25
10
|
|
26
|
-
|
27
|
-
|
28
|
-
entry += f"\nTool Call: {func_name}\nArguments:\n{args_text}"
|
29
|
-
else:
|
30
|
-
entry += f"\nTool Call: {func_name}\nArguments: {args}"
|
11
|
+
def sample_input_as_str(sample: EvalSample) -> str:
|
12
|
+
return messages_as_str(sample.input)
|
31
13
|
|
32
|
-
transcript.append(entry)
|
33
14
|
|
34
|
-
|
35
|
-
|
36
|
-
func_name = msg.function or "unknown"
|
37
|
-
entry = f"{role}:\n{content}\n\nError in tool call '{func_name}':\n{msg.error.message}\n"
|
38
|
-
transcript.append(entry)
|
39
|
-
|
40
|
-
# normal messages
|
41
|
-
else:
|
42
|
-
transcript.append(f"{role}:\n{content}\n")
|
43
|
-
|
44
|
-
return "\n".join(transcript)
|
15
|
+
def sample_messages_as_str(sample: EvalSample) -> str:
|
16
|
+
return messages_as_str(sample.messages)
|
45
17
|
|
46
18
|
|
47
19
|
def sample_path_requires_full(
|