inspect-ai 0.3.94__py3-none-any.whl → 0.3.96__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/_eval/loader.py +1 -1
- inspect_ai/_eval/task/run.py +12 -6
- inspect_ai/_util/exception.py +4 -0
- inspect_ai/_util/hash.py +39 -0
- inspect_ai/_util/local_server.py +16 -0
- inspect_ai/_util/path.py +22 -0
- inspect_ai/_util/trace.py +1 -1
- inspect_ai/_util/working.py +4 -0
- inspect_ai/_view/www/dist/assets/index.css +9 -9
- inspect_ai/_view/www/dist/assets/index.js +117 -120
- inspect_ai/_view/www/package.json +1 -1
- inspect_ai/_view/www/src/app/log-view/navbar/SecondaryBar.tsx +2 -2
- inspect_ai/_view/www/src/app/log-view/tabs/SamplesTab.tsx +1 -4
- inspect_ai/_view/www/src/app/samples/SamplesTools.tsx +3 -13
- inspect_ai/_view/www/src/app/samples/sample-tools/SelectScorer.tsx +45 -48
- inspect_ai/_view/www/src/app/samples/sample-tools/filters.ts +16 -15
- inspect_ai/_view/www/src/app/samples/sample-tools/sample-filter/SampleFilter.tsx +47 -75
- inspect_ai/_view/www/src/app/samples/sample-tools/sample-filter/completions.ts +9 -9
- inspect_ai/_view/www/src/app/types.ts +12 -2
- inspect_ai/_view/www/src/components/ExpandablePanel.module.css +1 -1
- inspect_ai/_view/www/src/components/ExpandablePanel.tsx +5 -5
- inspect_ai/_view/www/src/state/hooks.ts +19 -3
- inspect_ai/_view/www/src/state/logSlice.ts +23 -5
- inspect_ai/_view/www/yarn.lock +9 -9
- inspect_ai/agent/_bridge/patch.py +1 -3
- inspect_ai/agent/_types.py +1 -1
- inspect_ai/analysis/__init__.py +0 -0
- inspect_ai/analysis/beta/__init__.py +67 -0
- inspect_ai/analysis/beta/_dataframe/__init__.py +0 -0
- inspect_ai/analysis/beta/_dataframe/columns.py +145 -0
- inspect_ai/analysis/beta/_dataframe/evals/__init__.py +0 -0
- inspect_ai/analysis/beta/_dataframe/evals/columns.py +132 -0
- inspect_ai/analysis/beta/_dataframe/evals/extract.py +23 -0
- inspect_ai/analysis/beta/_dataframe/evals/table.py +177 -0
- inspect_ai/analysis/beta/_dataframe/events/__init__.py +0 -0
- inspect_ai/analysis/beta/_dataframe/events/columns.py +87 -0
- inspect_ai/analysis/beta/_dataframe/events/extract.py +26 -0
- inspect_ai/analysis/beta/_dataframe/events/table.py +100 -0
- inspect_ai/analysis/beta/_dataframe/extract.py +73 -0
- inspect_ai/analysis/beta/_dataframe/messages/__init__.py +0 -0
- inspect_ai/analysis/beta/_dataframe/messages/columns.py +60 -0
- inspect_ai/analysis/beta/_dataframe/messages/extract.py +21 -0
- inspect_ai/analysis/beta/_dataframe/messages/table.py +79 -0
- inspect_ai/analysis/beta/_dataframe/progress.py +26 -0
- inspect_ai/analysis/beta/_dataframe/record.py +377 -0
- inspect_ai/analysis/beta/_dataframe/samples/__init__.py +0 -0
- inspect_ai/analysis/beta/_dataframe/samples/columns.py +77 -0
- inspect_ai/analysis/beta/_dataframe/samples/extract.py +54 -0
- inspect_ai/analysis/beta/_dataframe/samples/table.py +370 -0
- inspect_ai/analysis/beta/_dataframe/util.py +160 -0
- inspect_ai/analysis/beta/_dataframe/validate.py +171 -0
- inspect_ai/log/_file.py +10 -3
- inspect_ai/log/_log.py +21 -1
- inspect_ai/model/_call_tools.py +2 -1
- inspect_ai/model/_model.py +6 -4
- inspect_ai/model/_openai_responses.py +17 -18
- inspect_ai/model/_providers/anthropic.py +30 -5
- inspect_ai/model/_providers/providers.py +1 -1
- inspect_ai/solver/_multiple_choice.py +4 -1
- inspect_ai/solver/_task_state.py +8 -4
- inspect_ai/tool/_mcp/_context.py +3 -5
- inspect_ai/tool/_mcp/_sandbox.py +17 -14
- inspect_ai/tool/_mcp/server.py +1 -1
- inspect_ai/tool/_tools/_think.py +1 -1
- inspect_ai/tool/_tools/_web_search/__init__.py +3 -0
- inspect_ai/tool/_tools/{_web_search.py → _web_search/_google.py} +56 -103
- inspect_ai/tool/_tools/_web_search/_tavily.py +77 -0
- inspect_ai/tool/_tools/_web_search/_web_search.py +85 -0
- inspect_ai/util/_sandbox/events.py +3 -2
- {inspect_ai-0.3.94.dist-info → inspect_ai-0.3.96.dist-info}/METADATA +9 -2
- {inspect_ai-0.3.94.dist-info → inspect_ai-0.3.96.dist-info}/RECORD +75 -46
- {inspect_ai-0.3.94.dist-info → inspect_ai-0.3.96.dist-info}/WHEEL +1 -1
- {inspect_ai-0.3.94.dist-info → inspect_ai-0.3.96.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.94.dist-info → inspect_ai-0.3.96.dist-info}/licenses/LICENSE +0 -0
- {inspect_ai-0.3.94.dist-info → inspect_ai-0.3.96.dist-info}/top_level.txt +0 -0
@@ -1,4 +1,4 @@
|
|
1
|
-
import {
|
1
|
+
import { FilterError, LogState, ScoreLabel } from "../app/types";
|
2
2
|
import { EvalSummary, PendingSamples } from "../client/api/types";
|
3
3
|
import { kDefaultSort, kLogViewInfoTabId } from "../constants";
|
4
4
|
import { createLogger } from "../utils/logger";
|
@@ -23,7 +23,13 @@ export interface LogSlice {
|
|
23
23
|
setPendingSampleSummaries: (samples: PendingSamples) => void;
|
24
24
|
|
25
25
|
// Set filter criteria
|
26
|
-
setFilter: (filter:
|
26
|
+
setFilter: (filter: string) => void;
|
27
|
+
|
28
|
+
// Set the filter error
|
29
|
+
setFilterError: (error: FilterError) => void;
|
30
|
+
|
31
|
+
// Clear the filter error
|
32
|
+
clearFilterError: () => void;
|
27
33
|
|
28
34
|
// Set epoch filter
|
29
35
|
setEpoch: (epoch: string) => void;
|
@@ -60,7 +66,9 @@ const initialState = {
|
|
60
66
|
loadedLog: undefined,
|
61
67
|
|
62
68
|
// Filter state
|
63
|
-
filter:
|
69
|
+
filter: "",
|
70
|
+
filterError: undefined,
|
71
|
+
|
64
72
|
epoch: "all",
|
65
73
|
sort: kDefaultSort,
|
66
74
|
score: undefined,
|
@@ -110,10 +118,19 @@ export const createLogSlice = (
|
|
110
118
|
state.log.pendingSampleSummaries = pendingSampleSummaries;
|
111
119
|
}),
|
112
120
|
|
113
|
-
setFilter: (filter:
|
121
|
+
setFilter: (filter: string) =>
|
114
122
|
set((state) => {
|
115
123
|
state.log.filter = filter;
|
116
124
|
}),
|
125
|
+
setFilterError: (error: FilterError) =>
|
126
|
+
set((state) => {
|
127
|
+
state.log.filterError = error;
|
128
|
+
}),
|
129
|
+
clearFilterError: () => {
|
130
|
+
set((state) => {
|
131
|
+
state.log.filterError = undefined;
|
132
|
+
});
|
133
|
+
},
|
117
134
|
setEpoch: (epoch: string) =>
|
118
135
|
set((state) => {
|
119
136
|
state.log.epoch = epoch;
|
@@ -132,7 +149,8 @@ export const createLogSlice = (
|
|
132
149
|
}),
|
133
150
|
resetFiltering: () =>
|
134
151
|
set((state) => {
|
135
|
-
state.log.filter =
|
152
|
+
state.log.filter = "";
|
153
|
+
state.log.filterError = undefined;
|
136
154
|
state.log.epoch = "all";
|
137
155
|
state.log.sort = kDefaultSort;
|
138
156
|
state.log.score = undefined;
|
inspect_ai/_view/www/yarn.lock
CHANGED
@@ -4254,17 +4254,17 @@ react-refresh@^0.17.0:
|
|
4254
4254
|
resolved "https://registry.yarnpkg.com/react-refresh/-/react-refresh-0.17.0.tgz#b7e579c3657f23d04eccbe4ad2e58a8ed51e7e53"
|
4255
4255
|
integrity sha512-z6F7K9bV85EfseRCp2bzrpyQ0Gkw1uLoCel9XBVWPg/TjRj94SkJzUTGfOa4bs7iJvBWtQG0Wq7wnI0syw3EBQ==
|
4256
4256
|
|
4257
|
-
react-router-dom@^7.5.
|
4258
|
-
version "7.5.
|
4259
|
-
resolved "https://registry.yarnpkg.com/react-router-dom/-/react-router-dom-7.5.
|
4260
|
-
integrity sha512-
|
4257
|
+
react-router-dom@^7.5.3:
|
4258
|
+
version "7.5.3"
|
4259
|
+
resolved "https://registry.yarnpkg.com/react-router-dom/-/react-router-dom-7.5.3.tgz#496e9f6d90f731703c7772668b41747028e0a2d5"
|
4260
|
+
integrity sha512-cK0jSaTyW4jV9SRKAItMIQfWZ/D6WEZafgHuuCb9g+SjhLolY78qc+De4w/Cz9ybjvLzShAmaIMEXt8iF1Cm+A==
|
4261
4261
|
dependencies:
|
4262
|
-
react-router "7.5.
|
4262
|
+
react-router "7.5.3"
|
4263
4263
|
|
4264
|
-
react-router@7.5.
|
4265
|
-
version "7.5.
|
4266
|
-
resolved "https://registry.yarnpkg.com/react-router/-/react-router-7.5.
|
4267
|
-
integrity sha512
|
4264
|
+
react-router@7.5.3:
|
4265
|
+
version "7.5.3"
|
4266
|
+
resolved "https://registry.yarnpkg.com/react-router/-/react-router-7.5.3.tgz#9e5420832af8c3690740c1797d4fa54613fea06d"
|
4267
|
+
integrity sha512-3iUDM4/fZCQ89SXlDa+Ph3MevBrozBAI655OAfWQlTm9nBR0IKlrmNwFow5lPHttbwvITZfkeeeZFP6zt3F7pw==
|
4268
4268
|
dependencies:
|
4269
4269
|
cookie "^1.0.1"
|
4270
4270
|
set-cookie-parser "^2.6.0"
|
@@ -3,7 +3,7 @@ import re
|
|
3
3
|
from contextvars import ContextVar
|
4
4
|
from functools import wraps
|
5
5
|
from time import time
|
6
|
-
from typing import Any, AsyncGenerator,
|
6
|
+
from typing import Any, AsyncGenerator, Type, cast
|
7
7
|
|
8
8
|
from openai._base_client import AsyncAPIClient, _AsyncStreamT
|
9
9
|
from openai._models import FinalRequestOptions
|
@@ -65,7 +65,6 @@ def init_openai_request_patch() -> None:
|
|
65
65
|
*,
|
66
66
|
stream: bool = False,
|
67
67
|
stream_cls: type[_AsyncStreamT] | None = None,
|
68
|
-
remaining_retries: Optional[int] = None,
|
69
68
|
) -> Any:
|
70
69
|
# we have patched the underlying request method so now need to figure out when to
|
71
70
|
# patch and when to stand down
|
@@ -88,7 +87,6 @@ def init_openai_request_patch() -> None:
|
|
88
87
|
options,
|
89
88
|
stream=stream,
|
90
89
|
stream_cls=stream_cls,
|
91
|
-
remaining_retries=remaining_retries,
|
92
90
|
)
|
93
91
|
|
94
92
|
setattr(AsyncAPIClient, "request", patched_request)
|
inspect_ai/agent/_types.py
CHANGED
@@ -43,7 +43,7 @@ class AgentPrompt(NamedTuple):
|
|
43
43
|
|
44
44
|
DEFAULT_CONTINUE_PROMPT = """
|
45
45
|
Please proceed to the next step using your best judgement. If you believe you
|
46
|
-
have completed the task, please call the `{submit}()` tool.
|
46
|
+
have completed the task, please call the `{submit}()` tool with your final answer.
|
47
47
|
"""
|
48
48
|
|
49
49
|
|
File without changes
|
@@ -0,0 +1,67 @@
|
|
1
|
+
from ._dataframe.columns import (
|
2
|
+
Column,
|
3
|
+
ColumnError,
|
4
|
+
ColumnErrors,
|
5
|
+
ColumnType,
|
6
|
+
)
|
7
|
+
from ._dataframe.evals.columns import (
|
8
|
+
EvalColumn,
|
9
|
+
EvalColumns,
|
10
|
+
EvalConfig,
|
11
|
+
EvalInfo,
|
12
|
+
EvalModel,
|
13
|
+
EvalResults,
|
14
|
+
EvalScores,
|
15
|
+
EvalTask,
|
16
|
+
)
|
17
|
+
from ._dataframe.evals.table import evals_df
|
18
|
+
from ._dataframe.events.columns import (
|
19
|
+
EventColumn,
|
20
|
+
EventInfo,
|
21
|
+
EventTiming,
|
22
|
+
ModelEventColumns,
|
23
|
+
ToolEventColumns,
|
24
|
+
)
|
25
|
+
from ._dataframe.events.table import events_df
|
26
|
+
from ._dataframe.messages.columns import (
|
27
|
+
MessageColumn,
|
28
|
+
MessageColumns,
|
29
|
+
MessageContent,
|
30
|
+
MessageToolCalls,
|
31
|
+
)
|
32
|
+
from ._dataframe.messages.table import MessageFilter, messages_df
|
33
|
+
from ._dataframe.samples.columns import SampleColumn, SampleMessages, SampleSummary
|
34
|
+
from ._dataframe.samples.table import samples_df
|
35
|
+
|
36
|
+
# Public API of this package, grouped by the dataframe each name belongs to.
# Each name is listed exactly once ("EvalColumns" used to appear twice).
# NOTE(review): `EvalDataset` is defined with the other eval column groups in
# `_dataframe/evals/columns.py` but is neither imported nor exported here --
# confirm whether that omission is intentional.
__all__ = [
    # evals
    "evals_df",
    "EvalColumn",
    "EvalColumns",
    "EvalInfo",
    "EvalTask",
    "EvalModel",
    "EvalConfig",
    "EvalResults",
    "EvalScores",
    # samples
    "samples_df",
    "SampleColumn",
    "SampleSummary",
    "SampleMessages",
    # messages
    "messages_df",
    "MessageColumn",
    "MessageContent",
    "MessageToolCalls",
    "MessageColumns",
    "MessageFilter",
    # events
    "events_df",
    "EventColumn",
    "EventInfo",
    "EventTiming",
    "ModelEventColumns",
    "ToolEventColumns",
    # shared column types
    "Column",
    "ColumnType",
    "ColumnError",
    "ColumnErrors",
]
|
File without changes
|
@@ -0,0 +1,145 @@
|
|
1
|
+
import abc
|
2
|
+
from dataclasses import KW_ONLY, dataclass
|
3
|
+
from datetime import date, datetime, time
|
4
|
+
from typing import Any, Callable, Mapping, Type, TypeAlias
|
5
|
+
|
6
|
+
from jsonpath_ng import JSONPath # type: ignore
|
7
|
+
from jsonpath_ng.ext import parse # type: ignore
|
8
|
+
from pydantic import JsonValue
|
9
|
+
|
10
|
+
from .validate import jsonpath_in_schema
|
11
|
+
|
12
|
+
ColumnType: TypeAlias = int | float | bool | str | date | time | datetime | None
|
13
|
+
"""Valid types for columns.
|
14
|
+
|
15
|
+
Values of `list` and `dict` are converted into column values as JSON `str`.
|
16
|
+
"""
|
17
|
+
|
18
|
+
|
19
|
+
class Column(abc.ABC):
    """
    Describes how one dataframe column is read from a log.

    The value is located with a [JSONPath](https://github.com/h2non/jsonpath-ng)
    expression (subclasses may alternatively accept an extraction function).

    Columns are optional unless `required=True` is passed. Optional columns
    that are not found are extracted as `None`; supply `default` to yield a
    different value instead.

    `type` acts as a validation check and as a request to coerce the data to
    that type. Coercion from `str` happens after the string is interpreted as
    YAML (e.g. `"true"` -> `True`).

    `value` is an extra hook for transforming the value read from the log
    before it becomes a column value (e.g. list to a comma-separated string).
    """

    def __init__(
        self,
        name: str,
        *,
        path: str | JSONPath | None,
        required: bool = False,
        default: JsonValue | None = None,
        type: Type[ColumnType] | None = None,
        value: Callable[[JsonValue], JsonValue] | None = None,
    ) -> None:
        self._name = name
        # kept as given; a `str` path is only parsed when `path` is first read
        self._path: str | JSONPath | None = path
        self._required = required
        self._default = default
        self._type = type
        self._value = value
        # tri-state cache for `validate_path()`: None means "not checked yet"
        self._validated: bool | None = None

    @property
    def name(self) -> str:
        """Column name."""
        return self._name

    @property
    def path(self) -> JSONPath | None:
        """Path to column in `EvalLog` (a `str` path is parsed once, then cached)."""
        if isinstance(self._path, str):
            parsed: JSONPath = parse(self._path)
            self._path = parsed
        return self._path

    @property
    def required(self) -> bool:
        """Is the column required? (error is raised if required columns aren't found)."""
        return self._required

    @property
    def default(self) -> JsonValue | None:
        """Default value for column when it is read from the log as `None`."""
        return self._default

    @property
    def type(self) -> Type[ColumnType] | None:
        """Column type (import will attempt to coerce to the specified type)."""
        return self._type

    def value(self, x: JsonValue) -> JsonValue:
        """Convert an extracted value into a column value.

        Applies the `value` callable passed to the constructor, or returns
        `x` unchanged when none was provided.

        Params:
           x: Value to convert.

        Returns:
           Converted value.
        """
        return self._value(x) if self._value else x

    def validate_path(self) -> bool:
        """Check the column's path against `path_schema()`.

        Columns without a path always validate, as do columns whose
        `path_schema()` is empty or `None`. The outcome is computed once and
        cached on the instance.
        """
        path = self.path
        if path is None:
            return True

        if self._validated is None:
            schema = self.path_schema()
            self._validated = jsonpath_in_schema(path, schema) if schema else True
        return self._validated

    @abc.abstractmethod
    def path_schema(self) -> Mapping[str, Any] | None: ...
|
111
|
+
|
112
|
+
|
113
|
+
@dataclass
|
114
|
+
class ColumnError:
|
115
|
+
"""Error which occurred parsing a column."""
|
116
|
+
|
117
|
+
column: str
|
118
|
+
"""Target column name."""
|
119
|
+
|
120
|
+
_: KW_ONLY
|
121
|
+
|
122
|
+
path: str | None
|
123
|
+
"""Path to select column value. """
|
124
|
+
|
125
|
+
message: str
|
126
|
+
"""Error message."""
|
127
|
+
|
128
|
+
def __str__(self) -> str:
|
129
|
+
msg = f"Error reading column '{self.column}'"
|
130
|
+
if self.path:
|
131
|
+
msg = f"{msg} from path '{self.path}'"
|
132
|
+
return f"{msg}: {self.message}"
|
133
|
+
|
134
|
+
|
135
|
+
class ColumnErrors(dict[str, list[ColumnError]]):
    """Dictionary of column errors keyed by log file."""

    def __str__(self) -> str:
        # Leading empty entry makes the report start with a newline; each
        # file section is the file name, one " - " line per error, then a
        # blank line.
        report: list[str] = [""]
        for file, errors in self.items():
            report.append(file)
            report.extend(f" - {error}" for error in errors)
            report.append("")
        return "\n".join(report)
|
File without changes
|
@@ -0,0 +1,132 @@
|
|
1
|
+
from datetime import datetime
|
2
|
+
from typing import Any, Callable, Mapping, Type
|
3
|
+
|
4
|
+
from jsonpath_ng import JSONPath # type: ignore
|
5
|
+
from pydantic import JsonValue
|
6
|
+
from typing_extensions import override
|
7
|
+
|
8
|
+
from inspect_ai.log._log import EvalLog
|
9
|
+
|
10
|
+
from ..columns import Column, ColumnType
|
11
|
+
from ..extract import list_as_str
|
12
|
+
from ..validate import resolved_schema
|
13
|
+
from .extract import eval_log_location, eval_log_scores_dict
|
14
|
+
|
15
|
+
|
16
|
+
class EvalColumn(Column):
    """Column which maps to `EvalLog`."""

    # JSON schema used by `path_schema()` to validate paths for eval columns
    schema = resolved_schema(EvalLog)

    def __init__(
        self,
        name: str,
        *,
        path: str | JSONPath | Callable[[EvalLog], JsonValue],
        required: bool = False,
        default: JsonValue | None = None,
        type: Type[ColumnType] | None = None,
        value: Callable[[JsonValue], JsonValue] | None = None,
    ) -> None:
        # A callable `path` is an extraction function rather than a JSONPath:
        # the base class then receives no path and the function is stored
        # separately in `_extract_eval`.
        super().__init__(
            name=name,
            path=None if callable(path) else path,
            required=required,
            default=default,
            type=type,
            value=value,
        )
        self._extract_eval = path if callable(path) else None

    @override
    def path_schema(self) -> Mapping[str, Any]:
        return self.schema
|
44
|
+
|
45
|
+
|
46
|
+
EvalId: list[Column] = [EvalColumn(name="eval_id", path="eval.eval_id", required=True)]
"""Eval id column."""
|
50
|
+
|
51
|
+
EvalInfo: list[Column] = [
    # identifiers
    EvalColumn(name="run_id", path="eval.run_id", required=True),
    EvalColumn(name="task_id", path="eval.task_id", required=True),
    # `log` is computed by a function rather than read from a JSONPath
    EvalColumn(name="log", path=eval_log_location),
    EvalColumn(name="created", path="eval.created", required=True, type=datetime),
    # tags are passed through `list_as_str`, with "" as the default
    EvalColumn(name="tags", path="eval.tags", default="", value=list_as_str),
    # source revision
    EvalColumn(name="git_origin", path="eval.revision.origin"),
    EvalColumn(name="git_commit", path="eval.revision.commit"),
    # environment / user metadata
    EvalColumn(name="packages", path="eval.packages"),
    EvalColumn(name="metadata", path="eval.metadata"),
]
"""Eval basic information columns."""
|
63
|
+
|
64
|
+
EvalTask: list[Column] = [
    # task identity
    EvalColumn(name="task_name", path="eval.task", required=True),
    EvalColumn(name="task_version", path="eval.task_version", required=True),
    EvalColumn(name="task_file", path="eval.task_file"),
    EvalColumn(name="task_attribs", path="eval.task_attribs"),
    # wildcard column name over the task arguments
    EvalColumn(name="task_arg_*", path="eval.task_args"),
    # solver
    EvalColumn(name="solver", path="eval.solver"),
    EvalColumn(name="solver_args", path="eval.solver_args"),
    # sandbox
    EvalColumn(name="sandbox_type", path="eval.sandbox.type"),
    EvalColumn(name="sandbox_config", path="eval.sandbox.config"),
]
"""Eval task configuration columns."""
|
76
|
+
|
77
|
+
EvalModel: list[Column] = [
    EvalColumn("model", path="eval.model", required=True),
    EvalColumn("model_base_url", path="eval.model_base_url"),
    # `model_args` must read its own field; pointing it at
    # `eval.model_base_url` would just duplicate the previous column.
    EvalColumn("model_args", path="eval.model_args"),
    EvalColumn("model_generate_config", path="eval.model_generate_config"),
    EvalColumn("model_roles", path="eval.model_roles"),
]
"""Eval model columns."""
|
85
|
+
|
86
|
+
# Every dataset column is named `dataset_<field>` and reads `eval.dataset.<field>`.
EvalDataset: list[Column] = [
    EvalColumn(f"dataset_{field}", path=f"eval.dataset.{field}")
    for field in ("name", "location", "samples", "sample_ids", "shuffled")
]
"""Eval dataset columns."""
|
94
|
+
|
95
|
+
# Every config column shares its name with the `eval.config` field it reads.
EvalConfig: list[Column] = [
    EvalColumn(field, path=f"eval.config.{field}")
    for field in (
        "epochs",
        "epochs_reducer",
        "approval",
        "message_limit",
        "token_limit",
        "time_limit",
        "working_limit",
    )
]
"""Eval configuration columns."""
|
105
|
+
|
106
|
+
EvalResults: list[Column] = [
    # overall outcome
    EvalColumn(name="status", path="status", required=True),
    EvalColumn(name="error_message", path="error.message"),
    EvalColumn(name="error_traceback", path="error.traceback"),
    # sample counts
    EvalColumn(name="total_samples", path="results.total_samples"),
    EvalColumn(name="completed_samples", path="results.completed_samples"),
    # "headline" score columns all select from the first entry of `results.scores`
    EvalColumn(name="score_headline_name", path="results.scores[0].scorer"),
    EvalColumn(name="score_headline_metric", path="results.scores[0].metrics.*.name"),
    EvalColumn(name="score_headline_value", path="results.scores[0].metrics.*.value"),
]
"""Eval results columns."""
|
117
|
+
|
118
|
+
# Wildcard column name whose value comes from the `eval_log_scores_dict` function.
EvalScores: list[Column] = [EvalColumn(name="score_*_*", path=eval_log_scores_dict)]
"""Eval scores (one score/metric per column)."""
|
122
|
+
|
123
|
+
# A new list holding every column group, in display order.
EvalColumns: list[Column] = [
    *EvalInfo,
    *EvalTask,
    *EvalModel,
    *EvalDataset,
    *EvalConfig,
    *EvalResults,
    *EvalScores,
]
"""Default columns to import for `evals_df()`."""
|
@@ -0,0 +1,23 @@
|
|
1
|
+
from inspect_ai._util.path import native_path
|
2
|
+
from inspect_ai.log._log import EvalLog
|
3
|
+
|
4
|
+
|
5
|
+
def eval_log_location(log: EvalLog) -> str:
    """Return the log's `location` after conversion by `native_path()`."""
    location = log.location
    return native_path(location)
|
7
|
+
|
8
|
+
|
9
|
+
def eval_log_scores_dict(
    log: EvalLog,
) -> list[dict[str, dict[str, int | float]]] | None:
    """Collect the metric values of every score in a log.

    Returns:
       `None` when the log has no results. Otherwise a list with one
       single-key dict per entry in `log.results.scores`, mapping the score
       name to a `{metric name: metric value}` dict.
    """
    results = log.results
    if results is None:
        return None

    scores: list[dict[str, dict[str, int | float]]] = []
    for score in results.scores:
        metric_values: dict[str, int | float] = {}
        for metric in score.metrics.values():
            metric_values[metric.name] = metric.value
        scores.append({score.name: metric_values})
    return scores
|
@@ -0,0 +1,177 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
from typing import TYPE_CHECKING, Callable, Literal, overload
|
4
|
+
|
5
|
+
from inspect_ai._util.path import pretty_path
|
6
|
+
from inspect_ai.analysis.beta._dataframe.progress import import_progress
|
7
|
+
from inspect_ai.log._file import (
|
8
|
+
list_eval_logs,
|
9
|
+
read_eval_log,
|
10
|
+
)
|
11
|
+
|
12
|
+
from ..columns import Column, ColumnErrors, ColumnType
|
13
|
+
from ..record import import_record, resolve_duplicate_columns
|
14
|
+
from ..util import (
|
15
|
+
LogPaths,
|
16
|
+
add_unreferenced_columns,
|
17
|
+
records_to_pandas,
|
18
|
+
resolve_columns,
|
19
|
+
resolve_logs,
|
20
|
+
verify_prerequisites,
|
21
|
+
)
|
22
|
+
from .columns import EvalColumns, EvalId
|
23
|
+
|
24
|
+
if TYPE_CHECKING:
|
25
|
+
import pandas as pd
|
26
|
+
|
27
|
+
EVAL_ID = "eval_id"
|
28
|
+
EVAL_SUFFIX = "_eval"
|
29
|
+
|
30
|
+
|
31
|
+
@overload
def evals_df(
    logs: LogPaths | None = None,
    columns: list[Column] = EvalColumns,
    strict: Literal[True] = True,
) -> "pd.DataFrame": ...


@overload
def evals_df(
    logs: LogPaths | None = None,
    columns: list[Column] = EvalColumns,
    strict: Literal[False] = False,
) -> tuple["pd.DataFrame", ColumnErrors]: ...


def evals_df(
    logs: LogPaths | None = None,
    columns: list[Column] = EvalColumns,
    strict: bool = True,
) -> "pd.DataFrame" | tuple["pd.DataFrame", ColumnErrors]:
    """Read a dataframe containing evals.

    Args:
       logs: One or more paths to log files or log directories.
          Defaults to the contents of the currently active log directory
          (e.g. ./logs or INSPECT_LOG_DIR), listed at call time.
       columns: Specification for what columns to read from log files.
       strict: Raise import errors immediately. Defaults to `True`.
          If `False` then a tuple of `DataFrame` and errors is returned.

    Returns:
       For `strict`, a Pandas `DataFrame` with information for the specified logs.
       For `strict=False`, a tuple of Pandas `DataFrame` and a dictionary of errors
       encountered (by log file) during import.
    """
    verify_prerequisites()

    # Resolve the default here rather than in the signature: a default of
    # `list_eval_logs()` is evaluated once when the module is imported, which
    # lists the log directory on import and then reuses that stale listing
    # for every later call.
    if logs is None:
        logs = list_eval_logs()

    # resolve logs
    log_paths = resolve_logs(logs)

    with import_progress("reading logs", total=len(log_paths)) as (p, task_id):
        if strict:
            evals_table, _ = _read_evals_df(
                log_paths, columns, True, lambda: p.update(task_id, advance=1)
            )
            return evals_table
        else:
            evals_table, all_errors, _ = _read_evals_df(
                log_paths, columns, False, lambda: p.update(task_id, advance=1)
            )
            return evals_table, all_errors
|
83
|
+
|
84
|
+
|
85
|
+
@overload
def _read_evals_df(
    log_paths: list[str],
    columns: list[Column],
    strict: Literal[True],
    progress: Callable[[], None],
) -> tuple["pd.DataFrame", int]: ...


@overload
def _read_evals_df(
    log_paths: list[str],
    columns: list[Column],
    strict: Literal[False],
    progress: Callable[[], None],
) -> tuple["pd.DataFrame", ColumnErrors, int]: ...


def _read_evals_df(
    log_paths: list[str],
    columns: list[Column],
    strict: bool,
    progress: Callable[[], None],
) -> tuple["pd.DataFrame", int] | tuple["pd.DataFrame", ColumnErrors, int]:
    """Import one record per log header and assemble them into a dataframe.

    Args:
       log_paths: Log files to read (headers only).
       columns: Columns to import from each log.
       strict: Passed through to `import_record()`; when `False` the errors
          it returns are collected per log file.
       progress: Callback invoked once after each log is processed.

    Returns:
       `(table, total_samples)` when `strict`, otherwise
       `(table, errors, total_samples)`.
    """
    verify_prerequisites()

    # de-duplicate the requested columns, then make sure eval_id is present
    columns = resolve_duplicate_columns(columns)
    all_errors = ColumnErrors()  # only populated for strict=False
    ensure_eval_id(columns)

    total_samples = 0
    records: list[dict[str, ColumnType]] = []
    for log_path in log_paths:
        log = read_eval_log(log_path, header_only=True)

        if not strict:
            record, errors = import_record(log, columns, strict=False)
            all_errors[pretty_path(log_path)] = errors
        else:
            record = import_record(log, columns, strict=True)
        records.append(record)

        # Sample count for this log: number of sample ids when recorded,
        # else the dataset's `samples` value, else 100 when that is
        # missing or zero.
        dataset = log.eval.dataset
        if dataset.sample_ids is not None:
            total_samples += len(dataset.sample_ids)
        else:
            total_samples += dataset.samples or 100

        progress()

    # build the table and put its columns into the canonical order
    evals_table = reorder_evals_df_columns(records_to_pandas(records), columns)

    if not strict:
        return evals_table, all_errors, total_samples
    return evals_table, total_samples
|
146
|
+
|
147
|
+
|
148
|
+
def ensure_eval_id(columns: list[Column]) -> None:
    """Append the `EvalId` columns to `columns` (in place) if none is named `eval_id`."""
    has_eval_id = False
    for column in columns:
        if column.name == EVAL_ID:
            has_eval_id = True
            break

    if not has_eval_id:
        columns.extend(EvalId)
|
151
|
+
|
152
|
+
|
153
|
+
def reorder_evals_df_columns(
    df: "pd.DataFrame", eval_columns: list[Column]
) -> "pd.DataFrame":
    """Return `df` with its columns re-ordered to follow `eval_columns`.

    `eval_id` comes first when present. Then, for each entry of
    `eval_columns` in order, come the names `resolve_columns()` yields for
    it. Last come the names added by `add_unreferenced_columns()`.
    """
    actual_columns = list(df.columns)

    # eval_id leads the ordering when the frame has it
    ordered_columns: list[str] = [EVAL_ID] if EVAL_ID in actual_columns else []

    # eval columns (eval_id itself was handled above)
    for col in eval_columns:
        if col.name != EVAL_ID:
            resolved = resolve_columns(
                col.name, EVAL_SUFFIX, actual_columns, ordered_columns
            )
            ordered_columns.extend(resolved)

    # add any unreferenced columns
    ordered_columns = add_unreferenced_columns(actual_columns, ordered_columns)

    # reorder the DataFrame
    return df[ordered_columns]
|
File without changes
|