inspect-ai 0.3.56__py3-none-any.whl → 0.3.57__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57) hide show
  1. inspect_ai/_display/core/panel.py +1 -1
  2. inspect_ai/_eval/run.py +16 -11
  3. inspect_ai/_util/datetime.py +1 -1
  4. inspect_ai/_util/deprecation.py +1 -1
  5. inspect_ai/_util/json.py +11 -1
  6. inspect_ai/_util/logger.py +2 -1
  7. inspect_ai/_util/trace.py +39 -3
  8. inspect_ai/_util/transcript.py +36 -7
  9. inspect_ai/_view/www/.prettierrc.js +12 -0
  10. inspect_ai/_view/www/dist/assets/index.js +286 -224
  11. inspect_ai/_view/www/log-schema.json +124 -125
  12. inspect_ai/_view/www/src/App.mjs +18 -9
  13. inspect_ai/_view/www/src/Types.mjs +0 -1
  14. inspect_ai/_view/www/src/api/Types.mjs +15 -4
  15. inspect_ai/_view/www/src/api/api-http.mjs +2 -0
  16. inspect_ai/_view/www/src/components/ExpandablePanel.mjs +2 -2
  17. inspect_ai/_view/www/src/components/FindBand.mjs +5 -4
  18. inspect_ai/_view/www/src/components/LargeModal.mjs +1 -1
  19. inspect_ai/_view/www/src/components/MessageContent.mjs +1 -1
  20. inspect_ai/_view/www/src/components/TabSet.mjs +1 -1
  21. inspect_ai/_view/www/src/components/Tools.mjs +18 -3
  22. inspect_ai/_view/www/src/components/VirtualList.mjs +15 -17
  23. inspect_ai/_view/www/src/log/remoteLogFile.mjs +2 -1
  24. inspect_ai/_view/www/src/navbar/Navbar.mjs +44 -32
  25. inspect_ai/_view/www/src/samples/SampleDisplay.mjs +1 -2
  26. inspect_ai/_view/www/src/samples/SampleList.mjs +35 -4
  27. inspect_ai/_view/www/src/samples/SampleScoreView.mjs +13 -2
  28. inspect_ai/_view/www/src/samples/SampleScores.mjs +11 -2
  29. inspect_ai/_view/www/src/samples/SamplesDescriptor.mjs +238 -178
  30. inspect_ai/_view/www/src/samples/SamplesTab.mjs +4 -2
  31. inspect_ai/_view/www/src/samples/tools/SampleFilter.mjs +5 -5
  32. inspect_ai/_view/www/src/samples/tools/SelectScorer.mjs +7 -0
  33. inspect_ai/_view/www/src/samples/tools/SortFilter.mjs +3 -3
  34. inspect_ai/_view/www/src/samples/transcript/ToolEventView.mjs +1 -1
  35. inspect_ai/_view/www/src/types/log.d.ts +2 -8
  36. inspect_ai/_view/www/src/workspace/WorkSpace.mjs +1 -1
  37. inspect_ai/log/_log.py +25 -0
  38. inspect_ai/log/_recorders/eval.py +2 -0
  39. inspect_ai/model/_call_tools.py +27 -5
  40. inspect_ai/model/_providers/google.py +24 -6
  41. inspect_ai/model/_providers/openai.py +17 -3
  42. inspect_ai/model/_providers/openai_o1.py +10 -12
  43. inspect_ai/tool/_tool_info.py +2 -1
  44. inspect_ai/tool/_tools/_web_browser/_resources/dm_env_servicer.py +9 -9
  45. inspect_ai/tool/_tools/_web_browser/_web_browser.py +3 -3
  46. inspect_ai/util/__init__.py +4 -0
  47. inspect_ai/util/_sandbox/docker/compose.py +1 -3
  48. inspect_ai/util/_sandbox/docker/util.py +2 -1
  49. inspect_ai/util/_sandbox/self_check.py +18 -18
  50. inspect_ai/util/_store.py +2 -2
  51. inspect_ai/util/_subprocess.py +3 -3
  52. {inspect_ai-0.3.56.dist-info → inspect_ai-0.3.57.dist-info}/METADATA +3 -3
  53. {inspect_ai-0.3.56.dist-info → inspect_ai-0.3.57.dist-info}/RECORD +57 -56
  54. {inspect_ai-0.3.56.dist-info → inspect_ai-0.3.57.dist-info}/WHEEL +1 -1
  55. {inspect_ai-0.3.56.dist-info → inspect_ai-0.3.57.dist-info}/LICENSE +0 -0
  56. {inspect_ai-0.3.56.dist-info → inspect_ai-0.3.57.dist-info}/entry_points.txt +0 -0
  57. {inspect_ai-0.3.56.dist-info → inspect_ai-0.3.57.dist-info}/top_level.txt +0 -0
@@ -1482,7 +1482,7 @@
1482
1482
  },
1483
1483
  "samples": {
1484
1484
  "items": {
1485
- "$ref": "#/$defs/SampleScore"
1485
+ "$ref": "#/$defs/EvalSampleScore"
1486
1486
  },
1487
1487
  "title": "Samples",
1488
1488
  "type": "array"
@@ -1497,6 +1497,129 @@
1497
1497
  "type": "object",
1498
1498
  "additionalProperties": false
1499
1499
  },
1500
+ "EvalSampleScore": {
1501
+ "properties": {
1502
+ "value": {
1503
+ "anyOf": [
1504
+ {
1505
+ "type": "string"
1506
+ },
1507
+ {
1508
+ "type": "integer"
1509
+ },
1510
+ {
1511
+ "type": "number"
1512
+ },
1513
+ {
1514
+ "type": "boolean"
1515
+ },
1516
+ {
1517
+ "items": {
1518
+ "anyOf": [
1519
+ {
1520
+ "type": "string"
1521
+ },
1522
+ {
1523
+ "type": "integer"
1524
+ },
1525
+ {
1526
+ "type": "number"
1527
+ },
1528
+ {
1529
+ "type": "boolean"
1530
+ }
1531
+ ]
1532
+ },
1533
+ "type": "array"
1534
+ },
1535
+ {
1536
+ "additionalProperties": {
1537
+ "anyOf": [
1538
+ {
1539
+ "type": "string"
1540
+ },
1541
+ {
1542
+ "type": "integer"
1543
+ },
1544
+ {
1545
+ "type": "number"
1546
+ },
1547
+ {
1548
+ "type": "boolean"
1549
+ },
1550
+ {
1551
+ "type": "null"
1552
+ }
1553
+ ]
1554
+ },
1555
+ "type": "object"
1556
+ }
1557
+ ],
1558
+ "title": "Value"
1559
+ },
1560
+ "answer": {
1561
+ "anyOf": [
1562
+ {
1563
+ "type": "string"
1564
+ },
1565
+ {
1566
+ "type": "null"
1567
+ }
1568
+ ],
1569
+ "default": null,
1570
+ "title": "Answer"
1571
+ },
1572
+ "explanation": {
1573
+ "anyOf": [
1574
+ {
1575
+ "type": "string"
1576
+ },
1577
+ {
1578
+ "type": "null"
1579
+ }
1580
+ ],
1581
+ "default": null,
1582
+ "title": "Explanation"
1583
+ },
1584
+ "metadata": {
1585
+ "anyOf": [
1586
+ {
1587
+ "type": "object"
1588
+ },
1589
+ {
1590
+ "type": "null"
1591
+ }
1592
+ ],
1593
+ "default": null,
1594
+ "title": "Metadata"
1595
+ },
1596
+ "sample_id": {
1597
+ "anyOf": [
1598
+ {
1599
+ "type": "string"
1600
+ },
1601
+ {
1602
+ "type": "integer"
1603
+ },
1604
+ {
1605
+ "type": "null"
1606
+ }
1607
+ ],
1608
+ "default": null,
1609
+ "title": "Sample Id"
1610
+ }
1611
+ },
1612
+ "required": [
1613
+ "value",
1614
+ "answer",
1615
+ "explanation",
1616
+ "metadata",
1617
+ "sample_id"
1618
+ ],
1619
+ "title": "EvalSampleScore",
1620
+ "type": "object",
1621
+ "additionalProperties": false
1622
+ },
1500
1623
  "EvalScore": {
1501
1624
  "properties": {
1502
1625
  "name": {
@@ -2905,130 +3028,6 @@
2905
3028
  "type": "object",
2906
3029
  "additionalProperties": false
2907
3030
  },
2908
- "SampleScore": {
2909
- "description": "Score for a Sample\n\nArgs:\n sample_id: (str | int | None) Unique id of a sample",
2910
- "properties": {
2911
- "value": {
2912
- "anyOf": [
2913
- {
2914
- "type": "string"
2915
- },
2916
- {
2917
- "type": "integer"
2918
- },
2919
- {
2920
- "type": "number"
2921
- },
2922
- {
2923
- "type": "boolean"
2924
- },
2925
- {
2926
- "items": {
2927
- "anyOf": [
2928
- {
2929
- "type": "string"
2930
- },
2931
- {
2932
- "type": "integer"
2933
- },
2934
- {
2935
- "type": "number"
2936
- },
2937
- {
2938
- "type": "boolean"
2939
- }
2940
- ]
2941
- },
2942
- "type": "array"
2943
- },
2944
- {
2945
- "additionalProperties": {
2946
- "anyOf": [
2947
- {
2948
- "type": "string"
2949
- },
2950
- {
2951
- "type": "integer"
2952
- },
2953
- {
2954
- "type": "number"
2955
- },
2956
- {
2957
- "type": "boolean"
2958
- },
2959
- {
2960
- "type": "null"
2961
- }
2962
- ]
2963
- },
2964
- "type": "object"
2965
- }
2966
- ],
2967
- "title": "Value"
2968
- },
2969
- "answer": {
2970
- "anyOf": [
2971
- {
2972
- "type": "string"
2973
- },
2974
- {
2975
- "type": "null"
2976
- }
2977
- ],
2978
- "default": null,
2979
- "title": "Answer"
2980
- },
2981
- "explanation": {
2982
- "anyOf": [
2983
- {
2984
- "type": "string"
2985
- },
2986
- {
2987
- "type": "null"
2988
- }
2989
- ],
2990
- "default": null,
2991
- "title": "Explanation"
2992
- },
2993
- "metadata": {
2994
- "anyOf": [
2995
- {
2996
- "type": "object"
2997
- },
2998
- {
2999
- "type": "null"
3000
- }
3001
- ],
3002
- "default": null,
3003
- "title": "Metadata"
3004
- },
3005
- "sample_id": {
3006
- "anyOf": [
3007
- {
3008
- "type": "string"
3009
- },
3010
- {
3011
- "type": "integer"
3012
- },
3013
- {
3014
- "type": "null"
3015
- }
3016
- ],
3017
- "default": null,
3018
- "title": "Sample Id"
3019
- }
3020
- },
3021
- "required": [
3022
- "value",
3023
- "answer",
3024
- "explanation",
3025
- "metadata",
3026
- "sample_id"
3027
- ],
3028
- "title": "SampleScore",
3029
- "type": "object",
3030
- "additionalProperties": false
3031
- },
3032
3031
  "SandboxEnvironmentSpec": {
3033
3032
  "maxItems": 2,
3034
3033
  "minItems": 1,
@@ -32,7 +32,10 @@ import { FindBand } from "./components/FindBand.mjs";
32
32
  import { isVscode } from "./utils/Html.mjs";
33
33
  import { getVscodeApi } from "./utils/vscode.mjs";
34
34
  import { kDefaultSort } from "./constants.mjs";
35
- import { createsSamplesDescriptor } from "./samples/SamplesDescriptor.mjs";
35
+ import {
36
+ createEvalDescriptor,
37
+ createSamplesDescriptor,
38
+ } from "./samples/SamplesDescriptor.mjs";
36
39
  import { byEpoch, bySample, sortSamples } from "./samples/tools/SortFilter.mjs";
37
40
  import { resolveAttachments } from "./utils/attachments.mjs";
38
41
  import { filterFnForType } from "./samples/tools/filters.mjs";
@@ -76,7 +79,7 @@ export function App({
76
79
  initialState?.headersLoading || false,
77
80
  );
78
81
 
79
- // Selected Log
82
+ /** @type {[import("./Types.mjs").CurrentLog, function(import("./Types.mjs").CurrentLog): void]} */
80
83
  const [selectedLog, setSelectedLog] = useState(
81
84
  initialState?.selectedLog || {
82
85
  contents: undefined,
@@ -95,6 +98,7 @@ export function App({
95
98
  ? initialState.selectedSampleIndex
96
99
  : -1,
97
100
  );
101
+ /** @type {[import("./types/log").EvalSample, function(import("./types/log").EvalSample): void]} */
98
102
  const [selectedSample, setSelectedSample] = useState(
99
103
  initialState?.selectedSample,
100
104
  );
@@ -326,7 +330,7 @@ export function App({
326
330
 
327
331
  // Set the grouping
328
332
  let grouping = "none";
329
- if (samplesDescriptor?.epochs > 1) {
333
+ if (samplesDescriptor?.evalDescriptor?.epochs > 1) {
330
334
  if (byEpoch(sort) || epoch !== "all") {
331
335
  grouping = "epoch";
332
336
  } else if (bySample(sort)) {
@@ -339,14 +343,17 @@ export function App({
339
343
  setGroupByOrder(order);
340
344
  }, [selectedLog, filter, sort, epoch]);
341
345
 
342
- const samplesDescriptor = useMemo(() => {
343
- return createsSamplesDescriptor(
346
+ const evalDescriptor = useMemo(() => {
347
+ return createEvalDescriptor(
344
348
  scores,
345
349
  selectedLog.contents?.sampleSummaries,
346
350
  selectedLog.contents?.eval?.config?.epochs || 1,
347
- score,
348
351
  );
349
- }, [selectedLog, scores, score]);
352
+ }, [selectedLog, scores]);
353
+
354
+ const samplesDescriptor = useMemo(() => {
355
+ return createSamplesDescriptor(evalDescriptor, score);
356
+ }, [evalDescriptor, score]);
350
357
 
351
358
  const refreshSampleTab = useCallback(
352
359
  (sample) => {
@@ -513,9 +520,11 @@ export function App({
513
520
  // Reset the workspace tab
514
521
  const hasSamples =
515
522
  !!log.sampleSummaries && log.sampleSummaries.length > 0;
516
- const showSamples = log.status !== "error" && hasSamples;
523
+ const showSamples = hasSamples;
517
524
  setSelectedWorkspaceTab(
518
- showSamples ? kEvalWorkspaceTabId : kInfoWorkspaceTabId,
525
+ log.status !== "error" && hasSamples
526
+ ? kEvalWorkspaceTabId
527
+ : kInfoWorkspaceTabId,
519
528
  );
520
529
 
521
530
  // Select the default scorer to use
@@ -8,7 +8,6 @@
8
8
  * @typedef {Object} CurrentLog
9
9
  * @property {string} name
10
10
  * @property {import("./api/Types.mjs").EvalSummary} contents
11
- * @property {string} raw
12
11
  */
13
12
 
14
13
  /**
@@ -30,15 +30,26 @@
30
30
  * @property { import("../types/log").Input } input
31
31
  * @property { import("../types/log").Target } target
32
32
  * @property { import("../types/log").Scores1 } scores
33
+ * @property { string } [error]
33
34
  * @property { import("../types/log").Type11 } [limit]
34
35
  */
35
36
 
36
37
  /**
37
- * @typedef {Object} Capabilities
38
- * @property {boolean} downloadFiles - Indicates if file downloads are supported.
39
- * @property {boolean} webWorkers - Indicates if web workers are supported.
40
- *
38
+ * Fields shared by EvalSample and SampleSummary.
39
+ * Contains only fields that are copied verbatim in src/inspect_ai/log/_recorders/eval.py.
40
+ *
41
+ * @typedef {Object} BasicSampleData
42
+ * @property { number | string } id
43
+ * @property { number } epoch
44
+ * @property { import("../types/log").Target } target
45
+ * @property { import("../types/log").Scores1 } scores
46
+ */
41
47
 
48
+ /**
49
+ * @typedef {Object} Capabilities
50
+ * @property {boolean} downloadFiles - Indicates if file downloads are supported.
51
+ * @property {boolean} webWorkers - Indicates if web workers are supported.
52
+ */
42
53
 
43
54
  /**
44
55
  * @typedef {Object} LogViewAPI
@@ -56,6 +56,7 @@ function simpleHttpAPI(logInfo) {
56
56
  });
57
57
  return Promise.resolve({
58
58
  files: logs,
59
+ log_dir,
59
60
  });
60
61
  } else if (log_file) {
61
62
  // Check the cache
@@ -76,6 +77,7 @@ function simpleHttpAPI(logInfo) {
76
77
 
77
78
  return {
78
79
  files: [result],
80
+ log_dir,
79
81
  };
80
82
  } else {
81
83
  // No log.json could be found, and there isn't a log file,
@@ -14,8 +14,8 @@ export const ExpandablePanel = ({
14
14
  const [collapsed, setCollapsed] = useState(collapse);
15
15
  const [showToggle, setShowToggle] = useState(false);
16
16
 
17
- const contentsRef = useRef();
18
- const observerRef = useRef();
17
+ const contentsRef = useRef(/** @type {HTMLElement|null} */ (null));
18
+ const observerRef = useRef(/** @type {IntersectionObserver|null} */ (null));
19
19
 
20
20
  // Ensure that when content changes, we reset the collapse state.
21
21
  useEffect(() => {
@@ -4,7 +4,7 @@ import { ApplicationIcons } from "../appearance/Icons.mjs";
4
4
  import { FontSize } from "../appearance/Fonts.mjs";
5
5
 
6
6
  export const FindBand = ({ hideBand }) => {
7
- const searchBoxRef = useRef();
7
+ const searchBoxRef = useRef(/** @type {HTMLInputElement|null} */ (null));
8
8
  useEffect(() => {
9
9
  searchBoxRef.current.focus();
10
10
  }, []);
@@ -31,13 +31,14 @@ export const FindBand = ({ hideBand }) => {
31
31
  };
32
32
 
33
33
  // capture what is focused
34
- const focusedElement = document.activeElement;
34
+ const focusedElement = /** @type {HTMLElement} */ (document.activeElement);
35
+ // @ts-expect-error: `Window.find` is non-standard
35
36
  const result = window.find(term, false, !!back, false, false, true, false);
36
37
  const noResultEl = window.document.getElementById(
37
38
  "inspect-find-no-results",
38
39
  );
39
40
  if (result) {
40
- noResultEl.style.opacity = 0;
41
+ noResultEl.style.opacity = "0";
41
42
  const selection = window.getSelection();
42
43
  if (selection.rangeCount > 0) {
43
44
  // See if the parent is an expandable panel and expand it
@@ -58,7 +59,7 @@ export const FindBand = ({ hideBand }) => {
58
59
  }, 100);
59
60
  }
60
61
  } else {
61
- noResultEl.style.opacity = 1;
62
+ noResultEl.style.opacity = "1";
62
63
  }
63
64
 
64
65
  // Return focus to the previously focused element
@@ -31,7 +31,7 @@ export const LargeModal = (props) => {
31
31
 
32
32
  // Support restoring the scroll position
33
33
  // but only do this for the first time that the children are set
34
- const scrollRef = useRef();
34
+ const scrollRef = useRef(/** @type {HTMLElement|null} */ (null));
35
35
  useEffect(() => {
36
36
  if (scrollRef.current) {
37
37
  setTimeout(() => {
@@ -52,7 +52,7 @@ const messageRenderers = {
52
52
  return html`<img
53
53
  src="${content.image}"
54
54
  style=${{
55
- maxWidth: "400px",
55
+ maxWidth: "800px",
56
56
  border: "solid var(--bs-border-color) 1px",
57
57
  }}
58
58
  />`;
@@ -44,7 +44,7 @@ export const TabPanel = ({
44
44
  children,
45
45
  }) => {
46
46
  const tabContentsId = computeTabContentsId(id, index);
47
- const tabContentsRef = useRef();
47
+ const tabContentsRef = useRef(/** @type {HTMLElement|null} */ (null));
48
48
  useEffect(() => {
49
49
  setTimeout(() => {
50
50
  if (
@@ -65,7 +65,19 @@ export const ToolCallView = ({
65
65
  }) => {
66
66
  // don't collapse if output includes an image
67
67
  function isContentImage(value) {
68
- return value && typeof value === "object" && value.type === "image";
68
+ if (value && typeof value === "object") {
69
+ if (value.type === "image") {
70
+ return true;
71
+ } else if (value.type === "tool") {
72
+ if (
73
+ Array.isArray(value.content) &&
74
+ value.content.some(isContentImage)
75
+ ) {
76
+ return true;
77
+ }
78
+ }
79
+ }
80
+ return false;
69
81
  }
70
82
  const collapse = Array.isArray(output)
71
83
  ? output.every((item) => !isContentImage(item))
@@ -152,10 +164,13 @@ export const ToolInput = ({ type, contents, view, style }) => {
152
164
  }
153
165
 
154
166
  if (view) {
155
- const toolInputRef = useRef(/** @type {HTMLElement|null} */ (null));
167
+ const toolInputRef = useRef(
168
+ /** @type {import("preact").Component & { base: Element }} */ (null),
169
+ );
156
170
  useEffect(() => {
157
171
  // Sniff around for code in the view that could be text highlighted
158
172
  if (toolInputRef.current) {
173
+ // @ts-expect-error: TS doesn't know that `HTMLCollection` is iterable.
159
174
  for (const child of toolInputRef.current.base.children) {
160
175
  if (child.tagName === "PRE") {
161
176
  const childChild = child.firstElementChild;
@@ -241,7 +256,7 @@ export const ToolOutput = ({ output, style }) => {
241
256
  html`<img
242
257
  src="${out.image}"
243
258
  style=${{
244
- maxWidth: "100%",
259
+ maxWidth: "800px",
245
260
  border: "solid var(--bs-border-color) 1px",
246
261
  ...style,
247
262
  }}
@@ -10,32 +10,30 @@ const STYLE_CONTENT =
10
10
  "position:absolute; top:0; left:0; height:100%; width:100%; overflow:visible;";
11
11
 
12
12
  export class VirtualList extends Component {
13
+ /** @type {HTMLElement} */ base;
14
+
13
15
  constructor(props) {
14
16
  super(props);
15
17
  this.state = {
16
18
  height: 0,
17
19
  offset: 0,
18
20
  };
19
- this.resize = this.resize.bind(this);
20
- this.handleScroll = throttle(this.handleScroll.bind(this), 100);
21
+ this.resize = () => {
22
+ if (this.state.height !== this.base.offsetHeight) {
23
+ this.setState({ height: this.base.offsetHeight });
24
+ }
25
+ };
26
+ this.handleScroll = throttle(() => {
27
+ if (this.base) {
28
+ this.setState({ offset: this.base.scrollTop });
29
+ }
30
+ if (this.props.sync) {
31
+ this.forceUpdate();
32
+ }
33
+ }, 100);
21
34
  this.containerRef = createRef();
22
35
  }
23
36
 
24
- resize() {
25
- if (this.state.height !== this.base.offsetHeight) {
26
- this.setState({ height: this.base.offsetHeight });
27
- }
28
- }
29
-
30
- handleScroll() {
31
- if (this.base) {
32
- this.setState({ offset: this.base.scrollTop });
33
- }
34
- if (this.props.sync) {
35
- this.forceUpdate();
36
- }
37
- }
38
-
39
37
  componentDidUpdate() {
40
38
  this.resize();
41
39
  }
@@ -6,7 +6,8 @@ import {
6
6
  openRemoteZipFile,
7
7
  } from "../utils/remoteZipFile.mjs";
8
8
 
9
- const MAX_BYTES = 12582912;
9
+ // don't try to load samples greater than 50mb
10
+ const MAX_BYTES = 50 * 1024 * 1024;
10
11
 
11
12
  /**
12
13
  * @typedef {Object} SampleEntry