inspect-ai 0.3.63__py3-none-any.whl → 0.3.65__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (182) hide show
  1. inspect_ai/_cli/cache.py +8 -7
  2. inspect_ai/_cli/common.py +0 -12
  3. inspect_ai/_cli/eval.py +32 -4
  4. inspect_ai/_cli/info.py +1 -0
  5. inspect_ai/_cli/list.py +1 -1
  6. inspect_ai/_cli/log.py +2 -0
  7. inspect_ai/_cli/sandbox.py +4 -1
  8. inspect_ai/_cli/score.py +181 -32
  9. inspect_ai/_cli/trace.py +2 -0
  10. inspect_ai/_cli/view.py +4 -2
  11. inspect_ai/_display/core/config.py +7 -1
  12. inspect_ai/_display/core/progress.py +1 -1
  13. inspect_ai/_display/textual/app.py +8 -4
  14. inspect_ai/_display/textual/widgets/samples.py +6 -5
  15. inspect_ai/_display/textual/widgets/sandbox.py +6 -0
  16. inspect_ai/_eval/__init__.py +0 -0
  17. inspect_ai/_eval/eval.py +100 -97
  18. inspect_ai/_eval/evalset.py +69 -69
  19. inspect_ai/_eval/loader.py +122 -12
  20. inspect_ai/_eval/registry.py +1 -1
  21. inspect_ai/_eval/run.py +14 -0
  22. inspect_ai/_eval/score.py +125 -36
  23. inspect_ai/_eval/task/log.py +105 -4
  24. inspect_ai/_eval/task/results.py +92 -38
  25. inspect_ai/_eval/task/run.py +6 -2
  26. inspect_ai/_eval/task/sandbox.py +35 -2
  27. inspect_ai/_eval/task/task.py +49 -46
  28. inspect_ai/_util/__init__.py +0 -0
  29. inspect_ai/_util/constants.py +1 -1
  30. inspect_ai/_util/content.py +8 -0
  31. inspect_ai/_util/error.py +2 -0
  32. inspect_ai/_util/file.py +15 -1
  33. inspect_ai/_util/logger.py +4 -2
  34. inspect_ai/_util/registry.py +7 -1
  35. inspect_ai/_view/view.py +1 -2
  36. inspect_ai/_view/www/App.css +8 -3
  37. inspect_ai/_view/www/README.md +1 -1
  38. inspect_ai/_view/www/dist/assets/index.css +66 -38
  39. inspect_ai/_view/www/dist/assets/index.js +525 -523
  40. inspect_ai/_view/www/log-schema.json +86 -73
  41. inspect_ai/_view/www/package.json +1 -1
  42. inspect_ai/_view/www/src/App.tsx +1 -0
  43. inspect_ai/_view/www/src/components/AnsiDisplay.tsx +1 -1
  44. inspect_ai/_view/www/src/components/JsonPanel.tsx +1 -1
  45. inspect_ai/_view/www/src/components/LargeModal.tsx +39 -49
  46. inspect_ai/_view/www/src/components/NavPills.tsx +3 -1
  47. inspect_ai/_view/www/src/components/TabSet.tsx +19 -4
  48. inspect_ai/_view/www/src/logfile/remoteLogFile.ts +0 -1
  49. inspect_ai/_view/www/src/metadata/MetaDataGrid.tsx +1 -1
  50. inspect_ai/_view/www/src/metadata/MetaDataView.tsx +1 -1
  51. inspect_ai/_view/www/src/metadata/RenderedContent.tsx +6 -13
  52. inspect_ai/_view/www/src/plan/PlanDetailView.tsx +17 -2
  53. inspect_ai/_view/www/src/plan/SolverDetailView.tsx +1 -1
  54. inspect_ai/_view/www/src/samples/SampleDisplay.tsx +14 -5
  55. inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +4 -2
  56. inspect_ai/_view/www/src/samples/SamplesTools.tsx +16 -24
  57. inspect_ai/_view/www/src/samples/chat/ChatMessage.tsx +1 -1
  58. inspect_ai/_view/www/src/samples/chat/ChatView.tsx +1 -0
  59. inspect_ai/_view/www/src/samples/chat/MessageContent.tsx +27 -13
  60. inspect_ai/_view/www/src/samples/chat/MessageContents.tsx +19 -17
  61. inspect_ai/_view/www/src/samples/chat/tools/ToolCallView.tsx +12 -10
  62. inspect_ai/_view/www/src/samples/chat/tools/ToolInput.tsx +56 -66
  63. inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.tsx +12 -5
  64. inspect_ai/_view/www/src/samples/chat/tools/tool.ts +21 -36
  65. inspect_ai/_view/www/src/samples/descriptor/samplesDescriptor.tsx +3 -1
  66. inspect_ai/_view/www/src/samples/sample-tools/SelectScorer.tsx +27 -25
  67. inspect_ai/_view/www/src/samples/sample-tools/SortFilter.tsx +5 -1
  68. inspect_ai/_view/www/src/samples/scores/SampleScoreView.module.css +13 -13
  69. inspect_ai/_view/www/src/samples/transcript/InfoEventView.tsx +1 -1
  70. inspect_ai/_view/www/src/samples/transcript/ModelEventView.tsx +2 -2
  71. inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.tsx +9 -5
  72. inspect_ai/_view/www/src/samples/transcript/ScoreEventView.tsx +1 -1
  73. inspect_ai/_view/www/src/samples/transcript/ToolEventView.tsx +5 -4
  74. inspect_ai/_view/www/src/samples/transcript/event/EventNavs.tsx +1 -0
  75. inspect_ai/_view/www/src/samples/transcript/event/EventPanel.tsx +1 -0
  76. inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.tsx +17 -6
  77. inspect_ai/_view/www/src/samples/transcript/state/StateEventView.tsx +14 -19
  78. inspect_ai/_view/www/src/types/log.d.ts +107 -19
  79. inspect_ai/_view/www/src/usage/ModelTokenTable.tsx +7 -1
  80. inspect_ai/_view/www/src/usage/ModelUsagePanel.tsx +5 -3
  81. inspect_ai/_view/www/src/workspace/WorkSpaceView.tsx +25 -27
  82. inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.tsx +12 -11
  83. inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.module.css +25 -2
  84. inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.tsx +60 -36
  85. inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.tsx +4 -0
  86. inspect_ai/_view/www/src/workspace/sidebar/SidebarScoreView.tsx +6 -4
  87. inspect_ai/_view/www/src/workspace/sidebar/SidebarScoresView.tsx +16 -14
  88. inspect_ai/_view/www/src/workspace/tabs/InfoTab.tsx +9 -19
  89. inspect_ai/_view/www/src/workspace/utils.ts +34 -0
  90. inspect_ai/approval/_approval.py +2 -0
  91. inspect_ai/approval/_approver.py +4 -4
  92. inspect_ai/approval/_auto.py +1 -1
  93. inspect_ai/approval/_human/approver.py +3 -0
  94. inspect_ai/approval/_policy.py +5 -0
  95. inspect_ai/approval/_registry.py +2 -2
  96. inspect_ai/dataset/_dataset.py +36 -45
  97. inspect_ai/dataset/_sources/__init__.py +0 -0
  98. inspect_ai/dataset/_sources/csv.py +13 -13
  99. inspect_ai/dataset/_sources/hf.py +29 -29
  100. inspect_ai/dataset/_sources/json.py +10 -10
  101. inspect_ai/log/__init__.py +2 -0
  102. inspect_ai/log/_convert.py +3 -3
  103. inspect_ai/log/_file.py +24 -9
  104. inspect_ai/log/_log.py +98 -7
  105. inspect_ai/log/_message.py +3 -1
  106. inspect_ai/log/_recorders/file.py +4 -0
  107. inspect_ai/log/_recorders/recorder.py +3 -0
  108. inspect_ai/log/_transcript.py +19 -8
  109. inspect_ai/model/__init__.py +2 -0
  110. inspect_ai/model/_cache.py +39 -21
  111. inspect_ai/model/_call_tools.py +2 -2
  112. inspect_ai/model/_chat_message.py +14 -4
  113. inspect_ai/model/_generate_config.py +1 -1
  114. inspect_ai/model/_model.py +31 -24
  115. inspect_ai/model/_model_output.py +14 -1
  116. inspect_ai/model/_openai.py +10 -18
  117. inspect_ai/model/_providers/google.py +9 -5
  118. inspect_ai/model/_providers/openai.py +5 -9
  119. inspect_ai/model/_providers/openrouter.py +1 -1
  120. inspect_ai/scorer/__init__.py +6 -1
  121. inspect_ai/scorer/_answer.py +1 -1
  122. inspect_ai/scorer/_classification.py +4 -0
  123. inspect_ai/scorer/_match.py +4 -5
  124. inspect_ai/scorer/_metric.py +87 -28
  125. inspect_ai/scorer/_metrics/__init__.py +3 -3
  126. inspect_ai/scorer/_metrics/accuracy.py +8 -10
  127. inspect_ai/scorer/_metrics/mean.py +3 -17
  128. inspect_ai/scorer/_metrics/std.py +111 -30
  129. inspect_ai/scorer/_model.py +12 -12
  130. inspect_ai/scorer/_pattern.py +3 -3
  131. inspect_ai/scorer/_reducer/reducer.py +36 -21
  132. inspect_ai/scorer/_reducer/registry.py +2 -2
  133. inspect_ai/scorer/_reducer/types.py +7 -1
  134. inspect_ai/scorer/_score.py +11 -1
  135. inspect_ai/scorer/_scorer.py +110 -16
  136. inspect_ai/solver/__init__.py +1 -1
  137. inspect_ai/solver/_basic_agent.py +19 -22
  138. inspect_ai/solver/_bridge/__init__.py +0 -3
  139. inspect_ai/solver/_bridge/bridge.py +3 -3
  140. inspect_ai/solver/_chain.py +1 -2
  141. inspect_ai/solver/_critique.py +3 -3
  142. inspect_ai/solver/_fork.py +2 -2
  143. inspect_ai/solver/_human_agent/__init__.py +0 -0
  144. inspect_ai/solver/_human_agent/agent.py +5 -8
  145. inspect_ai/solver/_human_agent/commands/clock.py +14 -10
  146. inspect_ai/solver/_human_agent/commands/note.py +1 -1
  147. inspect_ai/solver/_human_agent/commands/score.py +0 -11
  148. inspect_ai/solver/_multiple_choice.py +15 -18
  149. inspect_ai/solver/_prompt.py +7 -7
  150. inspect_ai/solver/_solver.py +53 -52
  151. inspect_ai/solver/_task_state.py +80 -69
  152. inspect_ai/solver/_use_tools.py +9 -9
  153. inspect_ai/tool/__init__.py +2 -1
  154. inspect_ai/tool/_tool.py +43 -14
  155. inspect_ai/tool/_tool_call.py +6 -2
  156. inspect_ai/tool/_tool_choice.py +3 -1
  157. inspect_ai/tool/_tool_def.py +10 -8
  158. inspect_ai/tool/_tool_params.py +24 -0
  159. inspect_ai/tool/_tool_with.py +7 -7
  160. inspect_ai/tool/_tools/__init__.py +0 -0
  161. inspect_ai/tool/_tools/_computer/_common.py +2 -2
  162. inspect_ai/tool/_tools/_computer/_computer.py +11 -0
  163. inspect_ai/tool/_tools/_execute.py +15 -9
  164. inspect_ai/tool/_tools/_web_browser/_resources/README.md +2 -2
  165. inspect_ai/tool/_tools/_web_browser/_web_browser.py +5 -3
  166. inspect_ai/tool/_tools/_web_search.py +7 -5
  167. inspect_ai/util/_concurrency.py +3 -3
  168. inspect_ai/util/_panel.py +2 -0
  169. inspect_ai/util/_resource.py +12 -12
  170. inspect_ai/util/_sandbox/docker/compose.py +23 -20
  171. inspect_ai/util/_sandbox/docker/config.py +2 -1
  172. inspect_ai/util/_sandbox/docker/docker.py +10 -1
  173. inspect_ai/util/_sandbox/docker/service.py +100 -0
  174. inspect_ai/util/_sandbox/environment.py +99 -96
  175. inspect_ai/util/_subprocess.py +5 -3
  176. inspect_ai/util/_subtask.py +15 -16
  177. {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/LICENSE +1 -1
  178. {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/METADATA +10 -6
  179. {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/RECORD +182 -175
  180. {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/WHEEL +0 -0
  181. {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/entry_points.txt +0 -0
  182. {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/top_level.txt +0 -0
@@ -1,6 +1,7 @@
1
1
  import clsx from "clsx";
2
2
  import { EvalMetric, EvalResults, EvalScore, Reducer } from "../../types/log";
3
3
  import { formatPrettyDecimal } from "../../utils/format";
4
+ import { metricDisplayName } from "../utils";
4
5
  import styles from "./ResultsPanel.module.css";
5
6
 
6
7
  interface ResultsPanelProps {
@@ -23,7 +24,7 @@ export const ResultsPanel: React.FC<ResultsPanelProps> = ({ results }) => {
23
24
  metric: {
24
25
  name: key,
25
26
  value: score.metrics[key].value,
26
- options: {},
27
+ params: score.metrics[key].params,
27
28
  metadata: {},
28
29
  },
29
30
  };
@@ -31,18 +32,35 @@ export const ResultsPanel: React.FC<ResultsPanelProps> = ({ results }) => {
31
32
  });
32
33
 
33
34
  const metrics = Object.values(scorers)[0];
35
+ const showReducer = metrics && metrics.length > 0 && !!metrics[0].reducer;
34
36
  return (
35
37
  <div className={styles.simpleMetricsRows}>
36
38
  {metrics.map((metric, i) => {
37
- return <VerticalMetric metricSummary={metric} isFirst={i === 0} />;
39
+ return (
40
+ <VerticalMetric
41
+ key={`simple-metric-${i}`}
42
+ metricSummary={metric}
43
+ isFirst={i === 0}
44
+ showReducer={showReducer}
45
+ />
46
+ );
38
47
  })}
39
48
  </div>
40
49
  );
41
50
  } else {
51
+ const showReducer =
52
+ results?.scores.findIndex((score) => !!score.reducer) !== -1;
42
53
  return (
43
54
  <div className={styles.multiMetricsRows}>
44
55
  {results?.scores?.map((score, index) => {
45
- return <MultiScorerMetric scorer={score} isFirst={index === 0} />;
56
+ return (
57
+ <MultiScorerMetric
58
+ key={`multi-metric-${index}`}
59
+ scorer={score}
60
+ isFirst={index === 0}
61
+ showReducer={showReducer}
62
+ />
63
+ );
46
64
  })}
47
65
  </div>
48
66
  );
@@ -52,6 +70,7 @@ export const ResultsPanel: React.FC<ResultsPanelProps> = ({ results }) => {
52
70
  interface VerticalMetricProps {
53
71
  metricSummary: MetricSummary;
54
72
  isFirst: boolean;
73
+ showReducer: boolean;
55
74
  }
56
75
 
57
76
  /** Renders a Vertical Metric
@@ -59,21 +78,8 @@ interface VerticalMetricProps {
59
78
  const VerticalMetric: React.FC<VerticalMetricProps> = ({
60
79
  metricSummary,
61
80
  isFirst,
81
+ showReducer,
62
82
  }) => {
63
- const reducer_component = metricSummary.reducer ? (
64
- <div
65
- className={clsx(
66
- "text-style-label",
67
- "text-style-secondary",
68
- styles.verticalMetricReducer,
69
- )}
70
- >
71
- {metricSummary.reducer}
72
- </div>
73
- ) : (
74
- ""
75
- );
76
-
77
83
  return (
78
84
  <div style={{ paddingLeft: isFirst ? "0" : "1em" }}>
79
85
  <div
@@ -84,11 +90,26 @@ const VerticalMetric: React.FC<VerticalMetricProps> = ({
84
90
  styles.verticalMetricName,
85
91
  )}
86
92
  >
87
- {metricSummary.metric.name}
93
+ {metricDisplayName(metricSummary.metric)}
88
94
  </div>
89
- {reducer_component}
95
+ {showReducer ? (
96
+ <div
97
+ className={clsx(
98
+ "text-style-label",
99
+ "text-style-secondary",
100
+ styles.verticalMetricReducer,
101
+ )}
102
+ >
103
+ {metricSummary.reducer || "default"}
104
+ </div>
105
+ ) : undefined}
106
+
90
107
  <div
91
- className={clsx("vertical-metric-value", styles.verticalMetricValue)}
108
+ className={clsx(
109
+ "vertical-metric-value",
110
+ "text-size-largest",
111
+ styles.verticalMetricValue,
112
+ )}
92
113
  >
93
114
  {formatPrettyDecimal(metricSummary.metric.value)}
94
115
  </div>
@@ -99,33 +120,25 @@ const VerticalMetric: React.FC<VerticalMetricProps> = ({
99
120
  interface MultiScorerMetricProps {
100
121
  scorer: EvalScore;
101
122
  isFirst: boolean;
123
+ showReducer: boolean;
102
124
  }
103
125
 
104
126
  const MultiScorerMetric: React.FC<MultiScorerMetricProps> = ({
105
127
  scorer,
106
128
  isFirst,
129
+ showReducer,
107
130
  }) => {
108
131
  const titleFontClz = "text-size-base";
109
132
  const reducerFontClz = "text-size-smaller";
110
133
  const valueFontClz = "text-size-base";
111
134
 
112
- const reducer_component = scorer.reducer ? (
135
+ return (
113
136
  <div
114
137
  className={clsx(
115
- reducerFontClz,
116
- "text-style-label",
117
- "text-style-secondary",
118
- styles.multiScorerReducer,
138
+ styles.multiScorer,
139
+ isFirst ? styles.multiScorerIndent : undefined,
119
140
  )}
120
141
  >
121
- {scorer.reducer}
122
- </div>
123
- ) : (
124
- ""
125
- );
126
-
127
- return (
128
- <div style={{ paddingLeft: isFirst ? "0" : "1.5em" }}>
129
142
  <div
130
143
  className={clsx(
131
144
  titleFontClz,
@@ -137,13 +150,24 @@ const MultiScorerMetric: React.FC<MultiScorerMetricProps> = ({
137
150
  >
138
151
  {scorer.name}
139
152
  </div>
140
- {reducer_component}
153
+ {showReducer ? (
154
+ <div
155
+ className={clsx(
156
+ reducerFontClz,
157
+ "text-style-label",
158
+ "text-style-secondary",
159
+ styles.multiScorerReducer,
160
+ )}
161
+ >
162
+ {scorer.reducer || "default"}
163
+ </div>
164
+ ) : undefined}
141
165
  <div className={clsx(valueFontClz, styles.multiScorerValue)}>
142
166
  {Object.keys(scorer.metrics).map((key) => {
143
167
  const metric = scorer.metrics[key];
144
168
  return (
145
- <div>
146
- <div>{metric.name}</div>
169
+ <div className={styles.multiScoreMetricGrid} key={key}>
170
+ <div>{metricDisplayName(metric)}</div>
147
171
  <div className={styles.multiScorerValueContent}>
148
172
  {formatPrettyDecimal(metric.value)}
149
173
  </div>
@@ -53,6 +53,7 @@ export const SecondaryBar: React.FC<SecondaryBarProps> = ({
53
53
  size: "minmax(12%, auto)",
54
54
  value: (
55
55
  <LabeledValue
56
+ key="sb-dataset"
56
57
  label="Dataset"
57
58
  className={(styles.staticCol, "text-size-small")}
58
59
  >
@@ -71,6 +72,7 @@ export const SecondaryBar: React.FC<SecondaryBarProps> = ({
71
72
  size: "minmax(12%, auto)",
72
73
  value: (
73
74
  <LabeledValue
75
+ key="sb-scorer"
74
76
  label={label}
75
77
  className={clsx(
76
78
  styles.staticCol,
@@ -88,6 +90,7 @@ export const SecondaryBar: React.FC<SecondaryBarProps> = ({
88
90
  size: "minmax(12%, auto)",
89
91
  value: (
90
92
  <LabeledValue
93
+ key="sb-params"
91
94
  label="Config"
92
95
  className={clsx(styles.justifyRight, "text-size-small")}
93
96
  >
@@ -106,6 +109,7 @@ export const SecondaryBar: React.FC<SecondaryBarProps> = ({
106
109
  size: "minmax(12%, auto)",
107
110
  value: (
108
111
  <LabeledValue
112
+ key="sb-duration"
109
113
  label="Duration"
110
114
  className={clsx(styles.justifyRight, "text-size-small")}
111
115
  >
@@ -2,17 +2,19 @@ import clsx from "clsx";
2
2
  import { EvalScore } from "../../types/log";
3
3
  import { formatPrettyDecimal } from "../../utils/format";
4
4
 
5
+ import { metricDisplayName } from "../utils";
5
6
  import styles from "./SidebarScoreView.module.css";
6
7
  interface SidebarScoreProps {
7
8
  scorer: EvalScore;
8
9
  }
9
10
 
10
11
  export const SidebarScoreView: React.FC<SidebarScoreProps> = ({ scorer }) => {
12
+ const showReducer = !!scorer.reducer;
11
13
  return (
12
14
  <div className={styles.container}>
13
15
  {Object.keys(scorer.metrics).map((metric) => {
14
16
  return (
15
- <div className={styles.metric}>
17
+ <div className={styles.metric} key={metric}>
16
18
  <div
17
19
  className={clsx(
18
20
  "text-style-secondary",
@@ -21,11 +23,11 @@ export const SidebarScoreView: React.FC<SidebarScoreProps> = ({ scorer }) => {
21
23
  styles.metricName,
22
24
  )}
23
25
  >
24
- {scorer.metrics[metric].name}
26
+ {metricDisplayName(scorer.metrics[metric])}
25
27
  </div>
26
- {scorer.reducer ? (
28
+ {showReducer ? (
27
29
  <div className={clsx("text-size-small", styles.metricReducer)}>
28
- ${scorer.reducer}
30
+ {scorer.reducer || "default"}
29
31
  </div>
30
32
  ) : (
31
33
  ""
@@ -2,6 +2,7 @@ import clsx from "clsx";
2
2
  import { Fragment } from "react";
3
3
  import { Scores } from "../../types/log";
4
4
  import { formatPrettyDecimal } from "../../utils/format";
5
+ import { metricDisplayName } from "../utils";
5
6
  import styles from "./SidebarScoresView.module.css";
6
7
 
7
8
  interface SidebarScoresProps {
@@ -9,26 +10,34 @@ interface SidebarScoresProps {
9
10
  }
10
11
 
11
12
  export const SidebarScoresView: React.FC<SidebarScoresProps> = ({ scores }) => {
13
+ const showReducer = scores.findIndex((score) => !!score.reducer) !== -1;
12
14
  return (
13
15
  <div className={styles.container}>
14
- {scores.map((score) => {
16
+ {scores.map((score, idx) => {
15
17
  const name = score.name;
16
18
  const reducer = score.reducer;
17
19
  return (
18
- <div className={styles.scoreWrapper}>
20
+ <div className={styles.scoreWrapper} key={`scorer-${name}-${idx}`}>
19
21
  <div
20
22
  className={clsx(
21
23
  "text-style-secondary",
22
- "text-label",
24
+ "text-style-label",
23
25
  "text-size-small",
24
26
  styles.metricName,
25
27
  )}
26
28
  >
27
29
  {name}
28
30
  </div>
29
- {reducer ? (
30
- <div className={clsx("text-size-small", styles.metricReducer)}>
31
- {reducer}
31
+ {showReducer ? (
32
+ <div
33
+ className={clsx(
34
+ "text-size-small",
35
+ "text-style-label",
36
+ "text-style-secondary",
37
+ styles.metricReducer,
38
+ )}
39
+ >
40
+ {reducer || "default"}
32
41
  </div>
33
42
  ) : (
34
43
  ""
@@ -38,14 +47,7 @@ export const SidebarScoresView: React.FC<SidebarScoresProps> = ({ scores }) => {
38
47
  const metric = score.metrics[key];
39
48
  return (
40
49
  <Fragment key={key}>
41
- <div
42
- className={clsx(
43
- "text-style-secondary",
44
- "text-style-label",
45
- )}
46
- >
47
- {metric.name}
48
- </div>
50
+ <div className={clsx()}>{metricDisplayName(metric)}</div>
49
51
  <div className={styles.metricValue}>
50
52
  {formatPrettyDecimal(metric.value)}
51
53
  </div>
@@ -36,24 +36,6 @@ export const InfoTab: React.FC<PlanTabProps> = ({
36
36
  setHidden(false);
37
37
  }, [evalSpec, evalPlan, evalResults, evalStats, samples]);
38
38
 
39
- const infoCards = [];
40
- infoCards.push([
41
- <PlanCard
42
- evalSpec={evalSpec}
43
- evalPlan={evalPlan}
44
- scores={evalResults?.scores}
45
- />,
46
- ]);
47
-
48
- if (evalStatus !== "started") {
49
- infoCards.push(<UsageCard stats={evalStats} />);
50
- }
51
-
52
- // If there is error or progress, includes those within info
53
- if (evalStatus === "error" && evalError) {
54
- infoCards.unshift(<TaskErrorCard error={evalError} />);
55
- }
56
-
57
39
  const showWarning =
58
40
  (!samples || samples.length === 0) &&
59
41
  evalStatus === "success" &&
@@ -73,7 +55,15 @@ export const InfoTab: React.FC<PlanTabProps> = ({
73
55
  ""
74
56
  )}
75
57
  <div style={{ padding: "0.5em 1em 0 1em", width: "100%" }}>
76
- {infoCards}
58
+ <PlanCard
59
+ evalSpec={evalSpec}
60
+ evalPlan={evalPlan}
61
+ scores={evalResults?.scores}
62
+ />
63
+ {evalStatus !== "started" ? <UsageCard stats={evalStats} /> : undefined}
64
+ {evalStatus === "error" && evalError ? (
65
+ <TaskErrorCard error={evalError} />
66
+ ) : undefined}
77
67
  </div>
78
68
  </div>
79
69
  );
@@ -0,0 +1,34 @@
1
+ import { EvalMetric } from "../types/log";
2
+
3
+ export const metricDisplayName = (metric: EvalMetric): string => {
4
+ let modifier = undefined;
5
+ for (const metricModifier of metricModifiers) {
6
+ modifier = metricModifier(metric);
7
+ if (modifier) {
8
+ break;
9
+ }
10
+ }
11
+ const metricName = !modifier ? metric.name : `${metric.name}[${modifier}]`;
12
+
13
+ return metricName;
14
+ };
15
+
16
+ type MetricModifier = (metric: EvalMetric) => string | undefined;
17
+
18
+ const clusterMetricModifier: MetricModifier = (
19
+ metric: EvalMetric,
20
+ ): string | undefined => {
21
+ if (metric.name !== "stderr") {
22
+ return undefined;
23
+ }
24
+
25
+ const clusterValue = ((metric.params || {}) as Record<string, unknown>)[
26
+ "cluster"
27
+ ];
28
+ if (clusterValue === undefined || typeof clusterValue !== "string") {
29
+ return undefined;
30
+ }
31
+ return clusterValue;
32
+ };
33
+
34
+ const metricModifiers: MetricModifier[] = [clusterMetricModifier];
@@ -17,6 +17,8 @@ Possible values:
17
17
 
18
18
 
19
19
  class Approval(BaseModel):
20
+ """Approval details (decision, explanation, etc.)"""
21
+
20
22
  decision: ApprovalDecision
21
23
  """Approval decision."""
22
24
 
@@ -20,10 +20,10 @@ class Approver(Protocol):
20
20
  Approve or reject a tool call.
21
21
 
22
22
  Args:
23
- message (str): Message genreated by the model along with the tool call.
24
- call (ToolCall): The tool call to be approved.
25
- view (ToolCallView): Custom rendering of tool context and call.
26
- state (state | None): The current task state, if available.
23
+ message: Message generated by the model along with the tool call.
24
+ call: The tool call to be approved.
25
+ view: Custom rendering of tool context and call.
26
+ state: The current task state, if available.
27
27
 
28
28
  Returns:
29
29
  Approval: An Approval object containing the decision and explanation.
@@ -11,7 +11,7 @@ def auto_approver(decision: ApprovalDecision = "approve") -> Approver:
11
11
  """Automatically apply a decision to tool calls.
12
12
 
13
13
  Args:
14
- decision (ApprovalDecision): Decision to apply.
14
+ decision: Decision to apply.
15
15
 
16
16
  Returns:
17
17
  Approver: Auto approver.
@@ -14,6 +14,9 @@ def human_approver(
14
14
  ) -> Approver:
15
15
  """Interactive human approver.
16
16
 
17
+ Args:
18
+ choices: Choices to present to human.
19
+
17
20
  Returns:
18
21
  Approver: Interactive human approver.
19
22
  """
@@ -20,8 +20,13 @@ from ._call import call_approver, record_approval
20
20
 
21
21
  @dataclass
22
22
  class ApprovalPolicy:
23
+ """Policy mapping approvers to tools."""
24
+
23
25
  approver: Approver
26
+ """Approver for policy."""
27
+
24
28
  tools: str | list[str]
29
+ """Tools to use this approver for (can be full tool names or globs)."""
25
30
 
26
31
 
27
32
  def policy_approver(policies: str | list[ApprovalPolicy]) -> Approver:
@@ -31,11 +31,11 @@ def approver(*args: Any, name: str | None = None, **attribs: Any) -> Any:
31
31
  Args:
32
32
  *args: Function returning `Approver` targeted by
33
33
  plain approver decorator without attributes (e.g. `@approver`)
34
- name (str | None):
34
+ name:
35
35
  Optional name for approver. If the decorator has no name
36
36
  argument then the name of the function
37
37
  will be used to automatically assign a name.
38
- **attribs: (dict[str,Any]): Additional approver attributes.
38
+ **attribs: Additional approver attributes.
39
39
 
40
40
  Returns:
41
41
  Approver with registry attributes.
@@ -27,6 +27,8 @@ MT = TypeVar("MT", bound=BaseModel)
27
27
 
28
28
 
29
29
  class Sample(BaseModel):
30
+ r"""Sample for an evaluation task."""
31
+
30
32
  def __init__(
31
33
  self,
32
34
  input: str | list[ChatMessage],
@@ -38,22 +40,22 @@ class Sample(BaseModel):
38
40
  files: dict[str, str] | None = None,
39
41
  setup: str | None = None,
40
42
  ) -> None:
41
- r"""Sample to be used in an evaluation task.
43
+ r"""Create a Sample.
42
44
 
43
45
  Args:
44
- input (str | list[ChatMessage]): The input to be submitted to the model.
45
- choices (list[str] | None): Optional. List of available answer choices
46
- (used only for multiple-choice evals).
47
- target (str | list[str]): Optional. Ideal target output. May be a literal value
46
+ input: The input to be submitted to the model.
47
+ choices: Optional. List of available answer choices
48
+ (used only for multiple-choice evals).
49
+ target: Optional. Ideal target output. May be a literal value
48
50
  or narrative text to be used by a model grader.
49
- id (int | str | None): Optional. Unique identifier for sample.
50
- metadata (dict[str,Any] | None): Optional. Arbitrary metadata associated with the sample.
51
- sandbox (SandboxEnvironmentType | None): Sandbox environment type
52
- (or optionally a str or tuple with a shorthand spec)
53
- files (dict[str, str] | None): Optional. Files that go along with the sample (copied to
54
- SandboxEnvironment). Files can be paths, inline text, or inline binary (base64 encoded data URL).
55
- setup (str | None): Optional. Setup script to run for sample (run
56
- within default SandboxEnvironment).
51
+ id: Optional. Unique identifier for sample.
52
+ metadata: Optional. Arbitrary metadata associated with the sample.
53
+ sandbox (SandboxEnvironmentType | None): Sandbox environment type (or optionally a str or tuple with a shorthand spec)
54
+ sandbox: Optional. Sandbox specification for this sample.
55
+ files: Optional. Files that go along with the sample (copied to
56
+ SandboxEnvironment). Files can be paths, inline text, or inline binary (base64 encoded data URL).
57
+ setup: Optional. Setup script to run for sample (run
58
+ within default SandboxEnvironment).
57
59
  """
58
60
  super().__init__(
59
61
  input=input,
@@ -144,14 +146,6 @@ class Dataset(Sequence[Sample], abc.ABC):
144
146
  @abc.abstractmethod
145
147
  def shuffled(self) -> bool: ...
146
148
 
147
- @abc.abstractmethod
148
- def shuffle_choices(self, seed: int | None = None) -> None:
149
- """Shuffle the order of the choices with each sample.
150
-
151
- Args:
152
- seed: (int | None): Random seed for shuffling (optional).
153
- """
154
-
155
149
  @overload
156
150
  def __getitem__(self, index: int) -> Sample: ...
157
151
 
@@ -164,14 +158,6 @@ class Dataset(Sequence[Sample], abc.ABC):
164
158
  @abc.abstractmethod
165
159
  def __len__(self) -> int: ...
166
160
 
167
- @abc.abstractmethod
168
- def shuffle(self, seed: int | None = None) -> None:
169
- """Shuffle the order of the dataset (in place).
170
-
171
- Args:
172
- seed: (int | None): Random seed for shuffling (optional).
173
- """
174
-
175
161
  @abc.abstractmethod
176
162
  def sort(
177
163
  self,
@@ -185,8 +171,8 @@ class Dataset(Sequence[Sample], abc.ABC):
185
171
  The key function defaults to measuring the length of the sample's input field.
186
172
 
187
173
  Args:
188
- reverse (bool): if true, sort in descending order. Defaults to False.
189
- key (Callable[[Any], Any]): a callable mapping each item to a numeric value (optional, defaults to sample_input_len).
174
+ reverse: If `True`, sort in descending order. Defaults to False.
175
+ key: a callable mapping each item to a numeric value (optional, defaults to sample_input_len).
190
176
  """
191
177
 
192
178
  @abc.abstractmethod
@@ -196,28 +182,33 @@ class Dataset(Sequence[Sample], abc.ABC):
196
182
  """Filter the dataset using a predicate.
197
183
 
198
184
  Args:
199
- predicate (Callable[[Sample], bool]): Filtering function.
200
- name (str | None): Name for filtered dataset (optional).
185
+ predicate: Filtering function.
186
+ name: Name for filtered dataset (optional).
201
187
 
202
188
  Returns:
203
189
  Filtered dataset.
204
190
  """
205
191
 
192
+ @abc.abstractmethod
193
+ def shuffle(self, seed: int | None = None) -> None:
194
+ """Shuffle the order of the dataset (in place).
195
+
196
+ Args:
197
+ seed: Random seed for shuffling (optional).
198
+ """
199
+
200
+ @abc.abstractmethod
201
+ def shuffle_choices(self, seed: int | None = None) -> None:
202
+ """Shuffle the order of the choices with each sample.
203
+
204
+ Args:
205
+ seed: Random seed for shuffling (optional).
206
+ """
207
+
206
208
 
207
209
  @dataclass
208
210
  class FieldSpec:
209
- r"""Specification for mapping data source fields to sample fields.
210
-
211
- Args:
212
- input (str): Name of the field containing the sample input.
213
- target (str): Name of the field containing the sample target.
214
- choices (str): Optional. Name of field containing the list of answer choices.
215
- id (str): Optional. Unique identifier for the sample.
216
- metadata (list[str] | None): List of additional field names that should be read as metadata.
217
- sandbox (str): Optional. Sandbox type along with optional config file
218
- files (str): Optional. Files that go along with the sample.
219
- setup (str): Optional. Setup script to run for sample .
220
- """
211
+ r"""Specification for mapping data source fields to sample fields."""
221
212
 
222
213
  input: str = field(default="input")
223
214
  """Name of the field containing the sample input."""
File without changes
@@ -35,30 +35,30 @@ def csv_dataset(
35
35
  r"""Read dataset from CSV file.
36
36
 
37
37
  Args:
38
- csv_file (str): Path to CSV file. Can be a local filesystem path,
38
+ csv_file: Path to CSV file. Can be a local filesystem path,
39
39
  a path to an S3 bucket (e.g. "s3://my-bucket"), or an HTTPS URL.
40
40
  Use `fs_options` to pass arguments through to the `S3FileSystem` constructor.
41
- sample_fields (FieldSpec | RecordToSample): Method of mapping underlying
41
+ sample_fields: Method of mapping underlying
42
42
  fields in the data source to Sample objects. Pass `None` if the data is already
43
43
  stored in `Sample` form (i.e. has "input" and "target" columns.); Pass a
44
44
  `FieldSpec` to specify mapping fields by name; Pass a `RecordToSample` to
45
45
  handle mapping with a custom function that returns one or more samples.
46
- auto_id (bool): Assign an auto-incrementing ID for each sample.
47
- shuffle (bool): Randomly shuffle the dataset order.
48
- seed: (int | None): Seed used for random shuffle.
49
- shuffle_choices: (bool | int | None): Whether to shuffle the choices. If an int is passed, this will be used as the seed when shuffling.
50
- limit (int | None): Limit the number of records to read.
51
- dialect (str): CSV dialect ("unix", "excel" or"excel-tab"). Defaults to "unix". See https://docs.python.org/3/library/csv.html#dialects-and-formatting-parameters for more details
52
- encoding (str): Text encoding for file (defaults to "utf-8").
53
- name (str): Optional name for dataset (for logging). If not specified,
46
+ auto_id: Assign an auto-incrementing ID for each sample.
47
+ shuffle: Randomly shuffle the dataset order.
48
+ seed: Seed used for random shuffle.
49
+ shuffle_choices: Whether to shuffle the choices. If an int is passed, this will be used as the seed when shuffling.
50
+ limit: Limit the number of records to read.
51
+ dialect: CSV dialect ("unix", "excel" or "excel-tab"). Defaults to "unix". See https://docs.python.org/3/library/csv.html#dialects-and-formatting-parameters for more details
52
+ encoding: Text encoding for file (defaults to "utf-8").
53
+ name: Optional name for dataset (for logging). If not specified,
54
54
  defaults to the stem of the filename
55
- fs_options (dict[str, Any]): Optional. Additional arguments to pass through
55
+ fs_options: Optional. Additional arguments to pass through
56
56
  to the filesystem provider (e.g. `S3FileSystem`). Use `{"anon": True }`
57
57
  if you are accessing a public S3 bucket with no credentials.
58
- fieldnames (list[str] | None): Optional. A list of fieldnames to use for the CSV.
58
+ fieldnames: Optional. A list of fieldnames to use for the CSV.
59
59
  If None, the values in the first row of the file will be used as the fieldnames.
60
60
  Useful for files without a header.
61
- delimiter (str): Optional. The delimiter to use when parsing the file. Defaults to ",".
61
+ delimiter: Optional. The delimiter to use when parsing the file. Defaults to ",".
62
62
 
63
63
  Returns:
64
64
  Dataset read from CSV file.