inspect-ai 0.3.63__py3-none-any.whl → 0.3.65__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/_cli/cache.py +8 -7
- inspect_ai/_cli/common.py +0 -12
- inspect_ai/_cli/eval.py +32 -4
- inspect_ai/_cli/info.py +1 -0
- inspect_ai/_cli/list.py +1 -1
- inspect_ai/_cli/log.py +2 -0
- inspect_ai/_cli/sandbox.py +4 -1
- inspect_ai/_cli/score.py +181 -32
- inspect_ai/_cli/trace.py +2 -0
- inspect_ai/_cli/view.py +4 -2
- inspect_ai/_display/core/config.py +7 -1
- inspect_ai/_display/core/progress.py +1 -1
- inspect_ai/_display/textual/app.py +8 -4
- inspect_ai/_display/textual/widgets/samples.py +6 -5
- inspect_ai/_display/textual/widgets/sandbox.py +6 -0
- inspect_ai/_eval/__init__.py +0 -0
- inspect_ai/_eval/eval.py +100 -97
- inspect_ai/_eval/evalset.py +69 -69
- inspect_ai/_eval/loader.py +122 -12
- inspect_ai/_eval/registry.py +1 -1
- inspect_ai/_eval/run.py +14 -0
- inspect_ai/_eval/score.py +125 -36
- inspect_ai/_eval/task/log.py +105 -4
- inspect_ai/_eval/task/results.py +92 -38
- inspect_ai/_eval/task/run.py +6 -2
- inspect_ai/_eval/task/sandbox.py +35 -2
- inspect_ai/_eval/task/task.py +49 -46
- inspect_ai/_util/__init__.py +0 -0
- inspect_ai/_util/constants.py +1 -1
- inspect_ai/_util/content.py +8 -0
- inspect_ai/_util/error.py +2 -0
- inspect_ai/_util/file.py +15 -1
- inspect_ai/_util/logger.py +4 -2
- inspect_ai/_util/registry.py +7 -1
- inspect_ai/_view/view.py +1 -2
- inspect_ai/_view/www/App.css +8 -3
- inspect_ai/_view/www/README.md +1 -1
- inspect_ai/_view/www/dist/assets/index.css +66 -38
- inspect_ai/_view/www/dist/assets/index.js +525 -523
- inspect_ai/_view/www/log-schema.json +86 -73
- inspect_ai/_view/www/package.json +1 -1
- inspect_ai/_view/www/src/App.tsx +1 -0
- inspect_ai/_view/www/src/components/AnsiDisplay.tsx +1 -1
- inspect_ai/_view/www/src/components/JsonPanel.tsx +1 -1
- inspect_ai/_view/www/src/components/LargeModal.tsx +39 -49
- inspect_ai/_view/www/src/components/NavPills.tsx +3 -1
- inspect_ai/_view/www/src/components/TabSet.tsx +19 -4
- inspect_ai/_view/www/src/logfile/remoteLogFile.ts +0 -1
- inspect_ai/_view/www/src/metadata/MetaDataGrid.tsx +1 -1
- inspect_ai/_view/www/src/metadata/MetaDataView.tsx +1 -1
- inspect_ai/_view/www/src/metadata/RenderedContent.tsx +6 -13
- inspect_ai/_view/www/src/plan/PlanDetailView.tsx +17 -2
- inspect_ai/_view/www/src/plan/SolverDetailView.tsx +1 -1
- inspect_ai/_view/www/src/samples/SampleDisplay.tsx +14 -5
- inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +4 -2
- inspect_ai/_view/www/src/samples/SamplesTools.tsx +16 -24
- inspect_ai/_view/www/src/samples/chat/ChatMessage.tsx +1 -1
- inspect_ai/_view/www/src/samples/chat/ChatView.tsx +1 -0
- inspect_ai/_view/www/src/samples/chat/MessageContent.tsx +27 -13
- inspect_ai/_view/www/src/samples/chat/MessageContents.tsx +19 -17
- inspect_ai/_view/www/src/samples/chat/tools/ToolCallView.tsx +12 -10
- inspect_ai/_view/www/src/samples/chat/tools/ToolInput.tsx +56 -66
- inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.tsx +12 -5
- inspect_ai/_view/www/src/samples/chat/tools/tool.ts +21 -36
- inspect_ai/_view/www/src/samples/descriptor/samplesDescriptor.tsx +3 -1
- inspect_ai/_view/www/src/samples/sample-tools/SelectScorer.tsx +27 -25
- inspect_ai/_view/www/src/samples/sample-tools/SortFilter.tsx +5 -1
- inspect_ai/_view/www/src/samples/scores/SampleScoreView.module.css +13 -13
- inspect_ai/_view/www/src/samples/transcript/InfoEventView.tsx +1 -1
- inspect_ai/_view/www/src/samples/transcript/ModelEventView.tsx +2 -2
- inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.tsx +9 -5
- inspect_ai/_view/www/src/samples/transcript/ScoreEventView.tsx +1 -1
- inspect_ai/_view/www/src/samples/transcript/ToolEventView.tsx +5 -4
- inspect_ai/_view/www/src/samples/transcript/event/EventNavs.tsx +1 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventPanel.tsx +1 -0
- inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.tsx +17 -6
- inspect_ai/_view/www/src/samples/transcript/state/StateEventView.tsx +14 -19
- inspect_ai/_view/www/src/types/log.d.ts +107 -19
- inspect_ai/_view/www/src/usage/ModelTokenTable.tsx +7 -1
- inspect_ai/_view/www/src/usage/ModelUsagePanel.tsx +5 -3
- inspect_ai/_view/www/src/workspace/WorkSpaceView.tsx +25 -27
- inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.tsx +12 -11
- inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.module.css +25 -2
- inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.tsx +60 -36
- inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.tsx +4 -0
- inspect_ai/_view/www/src/workspace/sidebar/SidebarScoreView.tsx +6 -4
- inspect_ai/_view/www/src/workspace/sidebar/SidebarScoresView.tsx +16 -14
- inspect_ai/_view/www/src/workspace/tabs/InfoTab.tsx +9 -19
- inspect_ai/_view/www/src/workspace/utils.ts +34 -0
- inspect_ai/approval/_approval.py +2 -0
- inspect_ai/approval/_approver.py +4 -4
- inspect_ai/approval/_auto.py +1 -1
- inspect_ai/approval/_human/approver.py +3 -0
- inspect_ai/approval/_policy.py +5 -0
- inspect_ai/approval/_registry.py +2 -2
- inspect_ai/dataset/_dataset.py +36 -45
- inspect_ai/dataset/_sources/__init__.py +0 -0
- inspect_ai/dataset/_sources/csv.py +13 -13
- inspect_ai/dataset/_sources/hf.py +29 -29
- inspect_ai/dataset/_sources/json.py +10 -10
- inspect_ai/log/__init__.py +2 -0
- inspect_ai/log/_convert.py +3 -3
- inspect_ai/log/_file.py +24 -9
- inspect_ai/log/_log.py +98 -7
- inspect_ai/log/_message.py +3 -1
- inspect_ai/log/_recorders/file.py +4 -0
- inspect_ai/log/_recorders/recorder.py +3 -0
- inspect_ai/log/_transcript.py +19 -8
- inspect_ai/model/__init__.py +2 -0
- inspect_ai/model/_cache.py +39 -21
- inspect_ai/model/_call_tools.py +2 -2
- inspect_ai/model/_chat_message.py +14 -4
- inspect_ai/model/_generate_config.py +1 -1
- inspect_ai/model/_model.py +31 -24
- inspect_ai/model/_model_output.py +14 -1
- inspect_ai/model/_openai.py +10 -18
- inspect_ai/model/_providers/google.py +9 -5
- inspect_ai/model/_providers/openai.py +5 -9
- inspect_ai/model/_providers/openrouter.py +1 -1
- inspect_ai/scorer/__init__.py +6 -1
- inspect_ai/scorer/_answer.py +1 -1
- inspect_ai/scorer/_classification.py +4 -0
- inspect_ai/scorer/_match.py +4 -5
- inspect_ai/scorer/_metric.py +87 -28
- inspect_ai/scorer/_metrics/__init__.py +3 -3
- inspect_ai/scorer/_metrics/accuracy.py +8 -10
- inspect_ai/scorer/_metrics/mean.py +3 -17
- inspect_ai/scorer/_metrics/std.py +111 -30
- inspect_ai/scorer/_model.py +12 -12
- inspect_ai/scorer/_pattern.py +3 -3
- inspect_ai/scorer/_reducer/reducer.py +36 -21
- inspect_ai/scorer/_reducer/registry.py +2 -2
- inspect_ai/scorer/_reducer/types.py +7 -1
- inspect_ai/scorer/_score.py +11 -1
- inspect_ai/scorer/_scorer.py +110 -16
- inspect_ai/solver/__init__.py +1 -1
- inspect_ai/solver/_basic_agent.py +19 -22
- inspect_ai/solver/_bridge/__init__.py +0 -3
- inspect_ai/solver/_bridge/bridge.py +3 -3
- inspect_ai/solver/_chain.py +1 -2
- inspect_ai/solver/_critique.py +3 -3
- inspect_ai/solver/_fork.py +2 -2
- inspect_ai/solver/_human_agent/__init__.py +0 -0
- inspect_ai/solver/_human_agent/agent.py +5 -8
- inspect_ai/solver/_human_agent/commands/clock.py +14 -10
- inspect_ai/solver/_human_agent/commands/note.py +1 -1
- inspect_ai/solver/_human_agent/commands/score.py +0 -11
- inspect_ai/solver/_multiple_choice.py +15 -18
- inspect_ai/solver/_prompt.py +7 -7
- inspect_ai/solver/_solver.py +53 -52
- inspect_ai/solver/_task_state.py +80 -69
- inspect_ai/solver/_use_tools.py +9 -9
- inspect_ai/tool/__init__.py +2 -1
- inspect_ai/tool/_tool.py +43 -14
- inspect_ai/tool/_tool_call.py +6 -2
- inspect_ai/tool/_tool_choice.py +3 -1
- inspect_ai/tool/_tool_def.py +10 -8
- inspect_ai/tool/_tool_params.py +24 -0
- inspect_ai/tool/_tool_with.py +7 -7
- inspect_ai/tool/_tools/__init__.py +0 -0
- inspect_ai/tool/_tools/_computer/_common.py +2 -2
- inspect_ai/tool/_tools/_computer/_computer.py +11 -0
- inspect_ai/tool/_tools/_execute.py +15 -9
- inspect_ai/tool/_tools/_web_browser/_resources/README.md +2 -2
- inspect_ai/tool/_tools/_web_browser/_web_browser.py +5 -3
- inspect_ai/tool/_tools/_web_search.py +7 -5
- inspect_ai/util/_concurrency.py +3 -3
- inspect_ai/util/_panel.py +2 -0
- inspect_ai/util/_resource.py +12 -12
- inspect_ai/util/_sandbox/docker/compose.py +23 -20
- inspect_ai/util/_sandbox/docker/config.py +2 -1
- inspect_ai/util/_sandbox/docker/docker.py +10 -1
- inspect_ai/util/_sandbox/docker/service.py +100 -0
- inspect_ai/util/_sandbox/environment.py +99 -96
- inspect_ai/util/_subprocess.py +5 -3
- inspect_ai/util/_subtask.py +15 -16
- {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/LICENSE +1 -1
- {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/METADATA +10 -6
- {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/RECORD +182 -175
- {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/WHEEL +0 -0
- {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/top_level.txt +0 -0
@@ -1,6 +1,7 @@
|
|
1
1
|
import clsx from "clsx";
|
2
2
|
import { EvalMetric, EvalResults, EvalScore, Reducer } from "../../types/log";
|
3
3
|
import { formatPrettyDecimal } from "../../utils/format";
|
4
|
+
import { metricDisplayName } from "../utils";
|
4
5
|
import styles from "./ResultsPanel.module.css";
|
5
6
|
|
6
7
|
interface ResultsPanelProps {
|
@@ -23,7 +24,7 @@ export const ResultsPanel: React.FC<ResultsPanelProps> = ({ results }) => {
|
|
23
24
|
metric: {
|
24
25
|
name: key,
|
25
26
|
value: score.metrics[key].value,
|
26
|
-
|
27
|
+
params: score.metrics[key].params,
|
27
28
|
metadata: {},
|
28
29
|
},
|
29
30
|
};
|
@@ -31,18 +32,35 @@ export const ResultsPanel: React.FC<ResultsPanelProps> = ({ results }) => {
|
|
31
32
|
});
|
32
33
|
|
33
34
|
const metrics = Object.values(scorers)[0];
|
35
|
+
const showReducer = metrics && metrics.length > 0 && !!metrics[0].reducer;
|
34
36
|
return (
|
35
37
|
<div className={styles.simpleMetricsRows}>
|
36
38
|
{metrics.map((metric, i) => {
|
37
|
-
return
|
39
|
+
return (
|
40
|
+
<VerticalMetric
|
41
|
+
key={`simple-metric-${i}`}
|
42
|
+
metricSummary={metric}
|
43
|
+
isFirst={i === 0}
|
44
|
+
showReducer={showReducer}
|
45
|
+
/>
|
46
|
+
);
|
38
47
|
})}
|
39
48
|
</div>
|
40
49
|
);
|
41
50
|
} else {
|
51
|
+
const showReducer =
|
52
|
+
results?.scores.findIndex((score) => !!score.reducer) !== -1;
|
42
53
|
return (
|
43
54
|
<div className={styles.multiMetricsRows}>
|
44
55
|
{results?.scores?.map((score, index) => {
|
45
|
-
return
|
56
|
+
return (
|
57
|
+
<MultiScorerMetric
|
58
|
+
key={`multi-metric-${index}`}
|
59
|
+
scorer={score}
|
60
|
+
isFirst={index === 0}
|
61
|
+
showReducer={showReducer}
|
62
|
+
/>
|
63
|
+
);
|
46
64
|
})}
|
47
65
|
</div>
|
48
66
|
);
|
@@ -52,6 +70,7 @@ export const ResultsPanel: React.FC<ResultsPanelProps> = ({ results }) => {
|
|
52
70
|
interface VerticalMetricProps {
|
53
71
|
metricSummary: MetricSummary;
|
54
72
|
isFirst: boolean;
|
73
|
+
showReducer: boolean;
|
55
74
|
}
|
56
75
|
|
57
76
|
/** Renders a Vertical Metric
|
@@ -59,21 +78,8 @@ interface VerticalMetricProps {
|
|
59
78
|
const VerticalMetric: React.FC<VerticalMetricProps> = ({
|
60
79
|
metricSummary,
|
61
80
|
isFirst,
|
81
|
+
showReducer,
|
62
82
|
}) => {
|
63
|
-
const reducer_component = metricSummary.reducer ? (
|
64
|
-
<div
|
65
|
-
className={clsx(
|
66
|
-
"text-style-label",
|
67
|
-
"text-style-secondary",
|
68
|
-
styles.verticalMetricReducer,
|
69
|
-
)}
|
70
|
-
>
|
71
|
-
{metricSummary.reducer}
|
72
|
-
</div>
|
73
|
-
) : (
|
74
|
-
""
|
75
|
-
);
|
76
|
-
|
77
83
|
return (
|
78
84
|
<div style={{ paddingLeft: isFirst ? "0" : "1em" }}>
|
79
85
|
<div
|
@@ -84,11 +90,26 @@ const VerticalMetric: React.FC<VerticalMetricProps> = ({
|
|
84
90
|
styles.verticalMetricName,
|
85
91
|
)}
|
86
92
|
>
|
87
|
-
{metricSummary.metric
|
93
|
+
{metricDisplayName(metricSummary.metric)}
|
88
94
|
</div>
|
89
|
-
{
|
95
|
+
{showReducer ? (
|
96
|
+
<div
|
97
|
+
className={clsx(
|
98
|
+
"text-style-label",
|
99
|
+
"text-style-secondary",
|
100
|
+
styles.verticalMetricReducer,
|
101
|
+
)}
|
102
|
+
>
|
103
|
+
{metricSummary.reducer || "default"}
|
104
|
+
</div>
|
105
|
+
) : undefined}
|
106
|
+
|
90
107
|
<div
|
91
|
-
className={clsx(
|
108
|
+
className={clsx(
|
109
|
+
"vertical-metric-value",
|
110
|
+
"text-size-largest",
|
111
|
+
styles.verticalMetricValue,
|
112
|
+
)}
|
92
113
|
>
|
93
114
|
{formatPrettyDecimal(metricSummary.metric.value)}
|
94
115
|
</div>
|
@@ -99,33 +120,25 @@ const VerticalMetric: React.FC<VerticalMetricProps> = ({
|
|
99
120
|
interface MultiScorerMetricProps {
|
100
121
|
scorer: EvalScore;
|
101
122
|
isFirst: boolean;
|
123
|
+
showReducer: boolean;
|
102
124
|
}
|
103
125
|
|
104
126
|
const MultiScorerMetric: React.FC<MultiScorerMetricProps> = ({
|
105
127
|
scorer,
|
106
128
|
isFirst,
|
129
|
+
showReducer,
|
107
130
|
}) => {
|
108
131
|
const titleFontClz = "text-size-base";
|
109
132
|
const reducerFontClz = "text-size-smaller";
|
110
133
|
const valueFontClz = "text-size-base";
|
111
134
|
|
112
|
-
|
135
|
+
return (
|
113
136
|
<div
|
114
137
|
className={clsx(
|
115
|
-
|
116
|
-
|
117
|
-
"text-style-secondary",
|
118
|
-
styles.multiScorerReducer,
|
138
|
+
styles.multiScorer,
|
139
|
+
isFirst ? styles.multiScorerIndent : undefined,
|
119
140
|
)}
|
120
141
|
>
|
121
|
-
{scorer.reducer}
|
122
|
-
</div>
|
123
|
-
) : (
|
124
|
-
""
|
125
|
-
);
|
126
|
-
|
127
|
-
return (
|
128
|
-
<div style={{ paddingLeft: isFirst ? "0" : "1.5em" }}>
|
129
142
|
<div
|
130
143
|
className={clsx(
|
131
144
|
titleFontClz,
|
@@ -137,13 +150,24 @@ const MultiScorerMetric: React.FC<MultiScorerMetricProps> = ({
|
|
137
150
|
>
|
138
151
|
{scorer.name}
|
139
152
|
</div>
|
140
|
-
{
|
153
|
+
{showReducer ? (
|
154
|
+
<div
|
155
|
+
className={clsx(
|
156
|
+
reducerFontClz,
|
157
|
+
"text-style-label",
|
158
|
+
"text-style-secondary",
|
159
|
+
styles.multiScorerReducer,
|
160
|
+
)}
|
161
|
+
>
|
162
|
+
{scorer.reducer || "default"}
|
163
|
+
</div>
|
164
|
+
) : undefined}
|
141
165
|
<div className={clsx(valueFontClz, styles.multiScorerValue)}>
|
142
166
|
{Object.keys(scorer.metrics).map((key) => {
|
143
167
|
const metric = scorer.metrics[key];
|
144
168
|
return (
|
145
|
-
<div>
|
146
|
-
<div>{metric
|
169
|
+
<div className={styles.multiScoreMetricGrid} key={key}>
|
170
|
+
<div>{metricDisplayName(metric)}</div>
|
147
171
|
<div className={styles.multiScorerValueContent}>
|
148
172
|
{formatPrettyDecimal(metric.value)}
|
149
173
|
</div>
|
@@ -53,6 +53,7 @@ export const SecondaryBar: React.FC<SecondaryBarProps> = ({
|
|
53
53
|
size: "minmax(12%, auto)",
|
54
54
|
value: (
|
55
55
|
<LabeledValue
|
56
|
+
key="sb-dataset"
|
56
57
|
label="Dataset"
|
57
58
|
className={(styles.staticCol, "text-size-small")}
|
58
59
|
>
|
@@ -71,6 +72,7 @@ export const SecondaryBar: React.FC<SecondaryBarProps> = ({
|
|
71
72
|
size: "minmax(12%, auto)",
|
72
73
|
value: (
|
73
74
|
<LabeledValue
|
75
|
+
key="sb-scorer"
|
74
76
|
label={label}
|
75
77
|
className={clsx(
|
76
78
|
styles.staticCol,
|
@@ -88,6 +90,7 @@ export const SecondaryBar: React.FC<SecondaryBarProps> = ({
|
|
88
90
|
size: "minmax(12%, auto)",
|
89
91
|
value: (
|
90
92
|
<LabeledValue
|
93
|
+
key="sb-params"
|
91
94
|
label="Config"
|
92
95
|
className={clsx(styles.justifyRight, "text-size-small")}
|
93
96
|
>
|
@@ -106,6 +109,7 @@ export const SecondaryBar: React.FC<SecondaryBarProps> = ({
|
|
106
109
|
size: "minmax(12%, auto)",
|
107
110
|
value: (
|
108
111
|
<LabeledValue
|
112
|
+
key="sb-duration"
|
109
113
|
label="Duration"
|
110
114
|
className={clsx(styles.justifyRight, "text-size-small")}
|
111
115
|
>
|
@@ -2,17 +2,19 @@ import clsx from "clsx";
|
|
2
2
|
import { EvalScore } from "../../types/log";
|
3
3
|
import { formatPrettyDecimal } from "../../utils/format";
|
4
4
|
|
5
|
+
import { metricDisplayName } from "../utils";
|
5
6
|
import styles from "./SidebarScoreView.module.css";
|
6
7
|
interface SidebarScoreProps {
|
7
8
|
scorer: EvalScore;
|
8
9
|
}
|
9
10
|
|
10
11
|
export const SidebarScoreView: React.FC<SidebarScoreProps> = ({ scorer }) => {
|
12
|
+
const showReducer = !!scorer.reducer;
|
11
13
|
return (
|
12
14
|
<div className={styles.container}>
|
13
15
|
{Object.keys(scorer.metrics).map((metric) => {
|
14
16
|
return (
|
15
|
-
<div className={styles.metric}>
|
17
|
+
<div className={styles.metric} key={metric}>
|
16
18
|
<div
|
17
19
|
className={clsx(
|
18
20
|
"text-style-secondary",
|
@@ -21,11 +23,11 @@ export const SidebarScoreView: React.FC<SidebarScoreProps> = ({ scorer }) => {
|
|
21
23
|
styles.metricName,
|
22
24
|
)}
|
23
25
|
>
|
24
|
-
{scorer.metrics[metric]
|
26
|
+
{metricDisplayName(scorer.metrics[metric])}
|
25
27
|
</div>
|
26
|
-
{
|
28
|
+
{showReducer ? (
|
27
29
|
<div className={clsx("text-size-small", styles.metricReducer)}>
|
28
|
-
|
30
|
+
{scorer.reducer || "default"}
|
29
31
|
</div>
|
30
32
|
) : (
|
31
33
|
""
|
@@ -2,6 +2,7 @@ import clsx from "clsx";
|
|
2
2
|
import { Fragment } from "react";
|
3
3
|
import { Scores } from "../../types/log";
|
4
4
|
import { formatPrettyDecimal } from "../../utils/format";
|
5
|
+
import { metricDisplayName } from "../utils";
|
5
6
|
import styles from "./SidebarScoresView.module.css";
|
6
7
|
|
7
8
|
interface SidebarScoresProps {
|
@@ -9,26 +10,34 @@ interface SidebarScoresProps {
|
|
9
10
|
}
|
10
11
|
|
11
12
|
export const SidebarScoresView: React.FC<SidebarScoresProps> = ({ scores }) => {
|
13
|
+
const showReducer = scores.findIndex((score) => !!score.reducer) !== -1;
|
12
14
|
return (
|
13
15
|
<div className={styles.container}>
|
14
|
-
{scores.map((score) => {
|
16
|
+
{scores.map((score, idx) => {
|
15
17
|
const name = score.name;
|
16
18
|
const reducer = score.reducer;
|
17
19
|
return (
|
18
|
-
<div className={styles.scoreWrapper}>
|
20
|
+
<div className={styles.scoreWrapper} key={`scorer-${name}-${idx}`}>
|
19
21
|
<div
|
20
22
|
className={clsx(
|
21
23
|
"text-style-secondary",
|
22
|
-
"text-label",
|
24
|
+
"text-style-label",
|
23
25
|
"text-size-small",
|
24
26
|
styles.metricName,
|
25
27
|
)}
|
26
28
|
>
|
27
29
|
{name}
|
28
30
|
</div>
|
29
|
-
{
|
30
|
-
<div
|
31
|
-
{
|
31
|
+
{showReducer ? (
|
32
|
+
<div
|
33
|
+
className={clsx(
|
34
|
+
"text-size-small",
|
35
|
+
"text-style-label",
|
36
|
+
"text-style-secondary",
|
37
|
+
styles.metricReducer,
|
38
|
+
)}
|
39
|
+
>
|
40
|
+
{reducer || "default"}
|
32
41
|
</div>
|
33
42
|
) : (
|
34
43
|
""
|
@@ -38,14 +47,7 @@ export const SidebarScoresView: React.FC<SidebarScoresProps> = ({ scores }) => {
|
|
38
47
|
const metric = score.metrics[key];
|
39
48
|
return (
|
40
49
|
<Fragment key={key}>
|
41
|
-
<div
|
42
|
-
className={clsx(
|
43
|
-
"text-style-secondary",
|
44
|
-
"text-style-label",
|
45
|
-
)}
|
46
|
-
>
|
47
|
-
{metric.name}
|
48
|
-
</div>
|
50
|
+
<div className={clsx()}>{metricDisplayName(metric)}</div>
|
49
51
|
<div className={styles.metricValue}>
|
50
52
|
{formatPrettyDecimal(metric.value)}
|
51
53
|
</div>
|
@@ -36,24 +36,6 @@ export const InfoTab: React.FC<PlanTabProps> = ({
|
|
36
36
|
setHidden(false);
|
37
37
|
}, [evalSpec, evalPlan, evalResults, evalStats, samples]);
|
38
38
|
|
39
|
-
const infoCards = [];
|
40
|
-
infoCards.push([
|
41
|
-
<PlanCard
|
42
|
-
evalSpec={evalSpec}
|
43
|
-
evalPlan={evalPlan}
|
44
|
-
scores={evalResults?.scores}
|
45
|
-
/>,
|
46
|
-
]);
|
47
|
-
|
48
|
-
if (evalStatus !== "started") {
|
49
|
-
infoCards.push(<UsageCard stats={evalStats} />);
|
50
|
-
}
|
51
|
-
|
52
|
-
// If there is error or progress, includes those within info
|
53
|
-
if (evalStatus === "error" && evalError) {
|
54
|
-
infoCards.unshift(<TaskErrorCard error={evalError} />);
|
55
|
-
}
|
56
|
-
|
57
39
|
const showWarning =
|
58
40
|
(!samples || samples.length === 0) &&
|
59
41
|
evalStatus === "success" &&
|
@@ -73,7 +55,15 @@ export const InfoTab: React.FC<PlanTabProps> = ({
|
|
73
55
|
""
|
74
56
|
)}
|
75
57
|
<div style={{ padding: "0.5em 1em 0 1em", width: "100%" }}>
|
76
|
-
|
58
|
+
<PlanCard
|
59
|
+
evalSpec={evalSpec}
|
60
|
+
evalPlan={evalPlan}
|
61
|
+
scores={evalResults?.scores}
|
62
|
+
/>
|
63
|
+
{evalStatus !== "started" ? <UsageCard stats={evalStats} /> : undefined}
|
64
|
+
{evalStatus === "error" && evalError ? (
|
65
|
+
<TaskErrorCard error={evalError} />
|
66
|
+
) : undefined}
|
77
67
|
</div>
|
78
68
|
</div>
|
79
69
|
);
|
@@ -0,0 +1,34 @@
|
|
1
|
+
import { EvalMetric } from "../types/log";
|
2
|
+
|
3
|
+
export const metricDisplayName = (metric: EvalMetric): string => {
|
4
|
+
let modifier = undefined;
|
5
|
+
for (const metricModifier of metricModifiers) {
|
6
|
+
modifier = metricModifier(metric);
|
7
|
+
if (modifier) {
|
8
|
+
break;
|
9
|
+
}
|
10
|
+
}
|
11
|
+
const metricName = !modifier ? metric.name : `${metric.name}[${modifier}]`;
|
12
|
+
|
13
|
+
return metricName;
|
14
|
+
};
|
15
|
+
|
16
|
+
type MetricModifier = (metric: EvalMetric) => string | undefined;
|
17
|
+
|
18
|
+
const clusterMetricModifier: MetricModifier = (
|
19
|
+
metric: EvalMetric,
|
20
|
+
): string | undefined => {
|
21
|
+
if (metric.name !== "stderr") {
|
22
|
+
return undefined;
|
23
|
+
}
|
24
|
+
|
25
|
+
const clusterValue = ((metric.params || {}) as Record<string, unknown>)[
|
26
|
+
"cluster"
|
27
|
+
];
|
28
|
+
if (clusterValue === undefined || typeof clusterValue !== "string") {
|
29
|
+
return undefined;
|
30
|
+
}
|
31
|
+
return clusterValue;
|
32
|
+
};
|
33
|
+
|
34
|
+
const metricModifiers: MetricModifier[] = [clusterMetricModifier];
|
inspect_ai/approval/_approval.py
CHANGED
inspect_ai/approval/_approver.py
CHANGED
@@ -20,10 +20,10 @@ class Approver(Protocol):
|
|
20
20
|
Approve or reject a tool call.
|
21
21
|
|
22
22
|
Args:
|
23
|
-
message
|
24
|
-
call
|
25
|
-
view
|
26
|
-
state
|
23
|
+
message: Message generated by the model along with the tool call.
|
24
|
+
call: The tool call to be approved.
|
25
|
+
view: Custom rendering of tool context and call.
|
26
|
+
state: The current task state, if available.
|
27
27
|
|
28
28
|
Returns:
|
29
29
|
Approval: An Approval object containing the decision and explanation.
|
inspect_ai/approval/_auto.py
CHANGED
@@ -11,7 +11,7 @@ def auto_approver(decision: ApprovalDecision = "approve") -> Approver:
|
|
11
11
|
"""Automatically apply a decision to tool calls.
|
12
12
|
|
13
13
|
Args:
|
14
|
-
decision
|
14
|
+
decision: Decision to apply.
|
15
15
|
|
16
16
|
Returns:
|
17
17
|
Approver: Auto approver.
|
inspect_ai/approval/_policy.py
CHANGED
@@ -20,8 +20,13 @@ from ._call import call_approver, record_approval
|
|
20
20
|
|
21
21
|
@dataclass
|
22
22
|
class ApprovalPolicy:
|
23
|
+
"""Policy mapping approvers to tools."""
|
24
|
+
|
23
25
|
approver: Approver
|
26
|
+
"""Approver for policy."""
|
27
|
+
|
24
28
|
tools: str | list[str]
|
29
|
+
"""Tools to use this approver for (can be full tool names or globs)."""
|
25
30
|
|
26
31
|
|
27
32
|
def policy_approver(policies: str | list[ApprovalPolicy]) -> Approver:
|
inspect_ai/approval/_registry.py
CHANGED
@@ -31,11 +31,11 @@ def approver(*args: Any, name: str | None = None, **attribs: Any) -> Any:
|
|
31
31
|
Args:
|
32
32
|
*args: Function returning `Approver` targeted by
|
33
33
|
plain approver decorator without attributes (e.g. `@approver`)
|
34
|
-
name
|
34
|
+
name:
|
35
35
|
Optional name for approver. If the decorator has no name
|
36
36
|
argument then the name of the function
|
37
37
|
will be used to automatically assign a name.
|
38
|
-
**attribs:
|
38
|
+
**attribs: Additional approver attributes.
|
39
39
|
|
40
40
|
Returns:
|
41
41
|
Approver with registry attributes.
|
inspect_ai/dataset/_dataset.py
CHANGED
@@ -27,6 +27,8 @@ MT = TypeVar("MT", bound=BaseModel)
|
|
27
27
|
|
28
28
|
|
29
29
|
class Sample(BaseModel):
|
30
|
+
r"""Sample for an evaluation task."""
|
31
|
+
|
30
32
|
def __init__(
|
31
33
|
self,
|
32
34
|
input: str | list[ChatMessage],
|
@@ -38,22 +40,22 @@ class Sample(BaseModel):
|
|
38
40
|
files: dict[str, str] | None = None,
|
39
41
|
setup: str | None = None,
|
40
42
|
) -> None:
|
41
|
-
r"""
|
43
|
+
r"""Create a Sample.
|
42
44
|
|
43
45
|
Args:
|
44
|
-
input
|
45
|
-
choices
|
46
|
-
|
47
|
-
target
|
46
|
+
input: The input to be submitted to the model.
|
47
|
+
choices: Optional. List of available answer choices
|
48
|
+
(used only for multiple-choice evals).
|
49
|
+
target: Optional. Ideal target output. May be a literal value
|
48
50
|
or narrative text to be used by a model grader.
|
49
|
-
id
|
50
|
-
metadata
|
51
|
-
|
52
|
-
|
53
|
-
files
|
54
|
-
|
55
|
-
setup
|
56
|
-
|
51
|
+
id: Optional. Unique identifier for sample.
|
52
|
+
metadata: Optional. Arbitrary metadata associated with the sample.
|
53
|
+
sandbox (SandboxEnvironmentType | None): Sandbox environment type (or optionally a str or tuple with a shorthand spec)
|
54
|
+
sandbox: Optional. Sandbox specification for this sample.
|
55
|
+
files: Optional. Files that go along with the sample (copied to
|
56
|
+
SandboxEnvironment). Files can be paths, inline text, or inline binary (base64 encoded data URL).
|
57
|
+
setup: Optional. Setup script to run for sample (run
|
58
|
+
within default SandboxEnvironment).
|
57
59
|
"""
|
58
60
|
super().__init__(
|
59
61
|
input=input,
|
@@ -144,14 +146,6 @@ class Dataset(Sequence[Sample], abc.ABC):
|
|
144
146
|
@abc.abstractmethod
|
145
147
|
def shuffled(self) -> bool: ...
|
146
148
|
|
147
|
-
@abc.abstractmethod
|
148
|
-
def shuffle_choices(self, seed: int | None = None) -> None:
|
149
|
-
"""Shuffle the order of the choices with each sample.
|
150
|
-
|
151
|
-
Args:
|
152
|
-
seed: (int | None): Random seed for shuffling (optional).
|
153
|
-
"""
|
154
|
-
|
155
149
|
@overload
|
156
150
|
def __getitem__(self, index: int) -> Sample: ...
|
157
151
|
|
@@ -164,14 +158,6 @@ class Dataset(Sequence[Sample], abc.ABC):
|
|
164
158
|
@abc.abstractmethod
|
165
159
|
def __len__(self) -> int: ...
|
166
160
|
|
167
|
-
@abc.abstractmethod
|
168
|
-
def shuffle(self, seed: int | None = None) -> None:
|
169
|
-
"""Shuffle the order of the dataset (in place).
|
170
|
-
|
171
|
-
Args:
|
172
|
-
seed: (int | None): Random seed for shuffling (optional).
|
173
|
-
"""
|
174
|
-
|
175
161
|
@abc.abstractmethod
|
176
162
|
def sort(
|
177
163
|
self,
|
@@ -185,8 +171,8 @@ class Dataset(Sequence[Sample], abc.ABC):
|
|
185
171
|
The key function defaults to measuring the length of the sample's input field.
|
186
172
|
|
187
173
|
Args:
|
188
|
-
reverse
|
189
|
-
key
|
174
|
+
reverse: If `True`, sort in descending order. Defaults to False.
|
175
|
+
key: a callable mapping each item to a numeric value (optional, defaults to sample_input_len).
|
190
176
|
"""
|
191
177
|
|
192
178
|
@abc.abstractmethod
|
@@ -196,28 +182,33 @@ class Dataset(Sequence[Sample], abc.ABC):
|
|
196
182
|
"""Filter the dataset using a predicate.
|
197
183
|
|
198
184
|
Args:
|
199
|
-
predicate
|
200
|
-
name
|
185
|
+
predicate: Filtering function.
|
186
|
+
name: Name for filtered dataset (optional).
|
201
187
|
|
202
188
|
Returns:
|
203
189
|
Filtered dataset.
|
204
190
|
"""
|
205
191
|
|
192
|
+
@abc.abstractmethod
|
193
|
+
def shuffle(self, seed: int | None = None) -> None:
|
194
|
+
"""Shuffle the order of the dataset (in place).
|
195
|
+
|
196
|
+
Args:
|
197
|
+
seed: Random seed for shuffling (optional).
|
198
|
+
"""
|
199
|
+
|
200
|
+
@abc.abstractmethod
|
201
|
+
def shuffle_choices(self, seed: int | None = None) -> None:
|
202
|
+
"""Shuffle the order of the choices with each sample.
|
203
|
+
|
204
|
+
Args:
|
205
|
+
seed: Random seed for shuffling (optional).
|
206
|
+
"""
|
207
|
+
|
206
208
|
|
207
209
|
@dataclass
|
208
210
|
class FieldSpec:
|
209
|
-
r"""Specification for mapping data source fields to sample fields.
|
210
|
-
|
211
|
-
Args:
|
212
|
-
input (str): Name of the field containing the sample input.
|
213
|
-
target (str): Name of the field containing the sample target.
|
214
|
-
choices (str): Optional. Name of field containing the list of answer choices.
|
215
|
-
id (str): Optional. Unique identifier for the sample.
|
216
|
-
metadata (list[str] | None): List of additional field names that should be read as metadata.
|
217
|
-
sandbox (str): Optional. Sandbox type along with optional config file
|
218
|
-
files (str): Optional. Files that go along with the sample.
|
219
|
-
setup (str): Optional. Setup script to run for sample .
|
220
|
-
"""
|
211
|
+
r"""Specification for mapping data source fields to sample fields."""
|
221
212
|
|
222
213
|
input: str = field(default="input")
|
223
214
|
"""Name of the field containing the sample input."""
|
File without changes
|
@@ -35,30 +35,30 @@ def csv_dataset(
|
|
35
35
|
r"""Read dataset from CSV file.
|
36
36
|
|
37
37
|
Args:
|
38
|
-
csv_file
|
38
|
+
csv_file: Path to CSV file. Can be a local filesystem path,
|
39
39
|
a path to an S3 bucket (e.g. "s3://my-bucket"), or an HTTPS URL.
|
40
40
|
Use `fs_options` to pass arguments through to the `S3FileSystem` constructor.
|
41
|
-
sample_fields
|
41
|
+
sample_fields: Method of mapping underlying
|
42
42
|
fields in the data source to Sample objects. Pass `None` if the data is already
|
43
43
|
stored in `Sample` form (i.e. has "input" and "target" columns.); Pass a
|
44
44
|
`FieldSpec` to specify mapping fields by name; Pass a `RecordToSample` to
|
45
45
|
handle mapping with a custom function that returns one or more samples.
|
46
|
-
auto_id
|
47
|
-
shuffle
|
48
|
-
seed:
|
49
|
-
shuffle_choices:
|
50
|
-
limit
|
51
|
-
dialect
|
52
|
-
encoding
|
53
|
-
name
|
46
|
+
auto_id: Assign an auto-incrementing ID for each sample.
|
47
|
+
shuffle: Randomly shuffle the dataset order.
|
48
|
+
seed: Seed used for random shuffle.
|
49
|
+
shuffle_choices: Whether to shuffle the choices. If an int is passed, this will be used as the seed when shuffling.
|
50
|
+
limit: Limit the number of records to read.
|
51
|
+
dialect: CSV dialect ("unix", "excel" or "excel-tab"). Defaults to "unix". See https://docs.python.org/3/library/csv.html#dialects-and-formatting-parameters for more details
|
52
|
+
encoding: Text encoding for file (defaults to "utf-8").
|
53
|
+
name: Optional name for dataset (for logging). If not specified,
|
54
54
|
defaults to the stem of the filename
|
55
|
-
fs_options
|
55
|
+
fs_options: Optional. Additional arguments to pass through
|
56
56
|
to the filesystem provider (e.g. `S3FileSystem`). Use `{"anon": True }`
|
57
57
|
if you are accessing a public S3 bucket with no credentials.
|
58
|
-
fieldnames
|
58
|
+
fieldnames: Optional. A list of fieldnames to use for the CSV.
|
59
59
|
If None, the values in the first row of the file will be used as the fieldnames.
|
60
60
|
Useful for files without a header.
|
61
|
-
delimiter
|
61
|
+
delimiter: Optional. The delimiter to use when parsing the file. Defaults to ",".
|
62
62
|
|
63
63
|
Returns:
|
64
64
|
Dataset read from CSV file.
|