inspect-ai 0.3.95__py3-none-any.whl → 0.3.97__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their respective public registries, and is provided for informational purposes only.
Files changed (142)
  1. inspect_ai/_eval/eval.py +10 -2
  2. inspect_ai/_eval/task/util.py +32 -3
  3. inspect_ai/_util/local_server.py +16 -0
  4. inspect_ai/_util/registry.py +7 -0
  5. inspect_ai/_util/timer.py +13 -0
  6. inspect_ai/_view/www/dist/assets/index.css +275 -195
  7. inspect_ai/_view/www/dist/assets/index.js +8568 -7376
  8. inspect_ai/_view/www/src/app/App.css +1 -0
  9. inspect_ai/_view/www/src/app/App.tsx +27 -10
  10. inspect_ai/_view/www/src/app/appearance/icons.ts +5 -0
  11. inspect_ai/_view/www/src/app/content/RecordTree.module.css +22 -0
  12. inspect_ai/_view/www/src/app/content/RecordTree.tsx +370 -0
  13. inspect_ai/_view/www/src/app/content/RenderedContent.module.css +5 -0
  14. inspect_ai/_view/www/src/app/content/RenderedContent.tsx +32 -19
  15. inspect_ai/_view/www/src/app/content/record_processors/store.ts +101 -0
  16. inspect_ai/_view/www/src/app/content/record_processors/types.ts +3 -0
  17. inspect_ai/_view/www/src/app/content/types.ts +5 -0
  18. inspect_ai/_view/www/src/app/log-view/LogView.tsx +1 -0
  19. inspect_ai/_view/www/src/app/log-view/LogViewContainer.tsx +35 -28
  20. inspect_ai/_view/www/src/app/log-view/LogViewLayout.tsx +1 -8
  21. inspect_ai/_view/www/src/app/log-view/navbar/PrimaryBar.tsx +2 -4
  22. inspect_ai/_view/www/src/app/log-view/navbar/ResultsPanel.tsx +13 -3
  23. inspect_ai/_view/www/src/app/log-view/navbar/ScoreGrid.module.css +15 -0
  24. inspect_ai/_view/www/src/app/log-view/navbar/ScoreGrid.tsx +14 -10
  25. inspect_ai/_view/www/src/app/log-view/tabs/InfoTab.tsx +9 -3
  26. inspect_ai/_view/www/src/app/log-view/tabs/JsonTab.tsx +1 -3
  27. inspect_ai/_view/www/src/app/log-view/tabs/SamplesTab.tsx +8 -2
  28. inspect_ai/_view/www/src/app/log-view/types.ts +1 -0
  29. inspect_ai/_view/www/src/app/plan/ModelCard.module.css +7 -0
  30. inspect_ai/_view/www/src/app/plan/ModelCard.tsx +5 -2
  31. inspect_ai/_view/www/src/app/plan/PlanCard.tsx +13 -8
  32. inspect_ai/_view/www/src/app/routing/navigationHooks.ts +63 -8
  33. inspect_ai/_view/www/src/app/routing/url.ts +45 -0
  34. inspect_ai/_view/www/src/app/samples/InlineSampleDisplay.module.css +2 -1
  35. inspect_ai/_view/www/src/app/samples/InlineSampleDisplay.tsx +15 -8
  36. inspect_ai/_view/www/src/app/samples/SampleDialog.module.css +3 -0
  37. inspect_ai/_view/www/src/app/samples/SampleDialog.tsx +16 -5
  38. inspect_ai/_view/www/src/app/samples/SampleDisplay.module.css +9 -1
  39. inspect_ai/_view/www/src/app/samples/SampleDisplay.tsx +68 -31
  40. inspect_ai/_view/www/src/app/samples/chat/ChatMessage.module.css +12 -7
  41. inspect_ai/_view/www/src/app/samples/chat/ChatMessage.tsx +17 -5
  42. inspect_ai/_view/www/src/app/samples/chat/ChatMessageRow.module.css +9 -0
  43. inspect_ai/_view/www/src/app/samples/chat/ChatMessageRow.tsx +48 -18
  44. inspect_ai/_view/www/src/app/samples/chat/ChatView.tsx +0 -1
  45. inspect_ai/_view/www/src/app/samples/chat/ChatViewVirtualList.module.css +4 -0
  46. inspect_ai/_view/www/src/app/samples/chat/ChatViewVirtualList.tsx +41 -1
  47. inspect_ai/_view/www/src/app/samples/chat/messages.ts +7 -0
  48. inspect_ai/_view/www/src/app/samples/chat/tools/ToolCallView.module.css +0 -3
  49. inspect_ai/_view/www/src/app/samples/chat/tools/ToolCallView.tsx +1 -1
  50. inspect_ai/_view/www/src/app/samples/chat/tools/ToolInput.module.css +1 -1
  51. inspect_ai/_view/www/src/app/samples/chat/tools/ToolOutput.module.css +1 -1
  52. inspect_ai/_view/www/src/app/samples/descriptor/score/NumericScoreDescriptor.tsx +5 -1
  53. inspect_ai/_view/www/src/app/samples/descriptor/score/PassFailScoreDescriptor.tsx +11 -6
  54. inspect_ai/_view/www/src/app/samples/list/SampleList.tsx +7 -0
  55. inspect_ai/_view/www/src/app/samples/list/SampleRow.tsx +5 -18
  56. inspect_ai/_view/www/src/app/samples/sample-tools/SortFilter.tsx +1 -1
  57. inspect_ai/_view/www/src/app/samples/scores/SampleScoresGrid.tsx +18 -5
  58. inspect_ai/_view/www/src/app/samples/scores/SampleScoresView.module.css +0 -6
  59. inspect_ai/_view/www/src/app/samples/scores/SampleScoresView.tsx +4 -1
  60. inspect_ai/_view/www/src/app/samples/transcript/ApprovalEventView.tsx +4 -2
  61. inspect_ai/_view/www/src/app/samples/transcript/ErrorEventView.tsx +6 -4
  62. inspect_ai/_view/www/src/app/samples/transcript/InfoEventView.module.css +1 -1
  63. inspect_ai/_view/www/src/app/samples/transcript/InfoEventView.tsx +13 -6
  64. inspect_ai/_view/www/src/app/samples/transcript/InputEventView.tsx +6 -4
  65. inspect_ai/_view/www/src/app/samples/transcript/LoggerEventView.tsx +4 -2
  66. inspect_ai/_view/www/src/app/samples/transcript/ModelEventView.tsx +11 -8
  67. inspect_ai/_view/www/src/app/samples/transcript/SampleInitEventView.tsx +14 -8
  68. inspect_ai/_view/www/src/app/samples/transcript/SampleLimitEventView.tsx +13 -8
  69. inspect_ai/_view/www/src/app/samples/transcript/SandboxEventView.tsx +25 -16
  70. inspect_ai/_view/www/src/app/samples/transcript/ScoreEventView.tsx +7 -5
  71. inspect_ai/_view/www/src/app/samples/transcript/SpanEventView.tsx +11 -28
  72. inspect_ai/_view/www/src/app/samples/transcript/StepEventView.tsx +12 -20
  73. inspect_ai/_view/www/src/app/samples/transcript/SubtaskEventView.tsx +12 -31
  74. inspect_ai/_view/www/src/app/samples/transcript/ToolEventView.tsx +25 -29
  75. inspect_ai/_view/www/src/app/samples/transcript/TranscriptVirtualList.tsx +297 -0
  76. inspect_ai/_view/www/src/app/samples/transcript/TranscriptVirtualListComponent.module.css +0 -8
  77. inspect_ai/_view/www/src/app/samples/transcript/TranscriptVirtualListComponent.tsx +43 -25
  78. inspect_ai/_view/www/src/app/samples/transcript/event/EventPanel.module.css +43 -0
  79. inspect_ai/_view/www/src/app/samples/transcript/event/EventPanel.tsx +109 -43
  80. inspect_ai/_view/www/src/app/samples/transcript/state/StateEventView.tsx +19 -8
  81. inspect_ai/_view/www/src/app/samples/transcript/transform/treeify.ts +128 -60
  82. inspect_ai/_view/www/src/app/samples/transcript/transform/utils.ts +14 -4
  83. inspect_ai/_view/www/src/app/samples/transcript/types.ts +6 -4
  84. inspect_ai/_view/www/src/app/types.ts +12 -1
  85. inspect_ai/_view/www/src/components/Card.css +6 -3
  86. inspect_ai/_view/www/src/components/Card.tsx +15 -2
  87. inspect_ai/_view/www/src/components/CopyButton.tsx +4 -6
  88. inspect_ai/_view/www/src/components/ExpandablePanel.module.css +20 -14
  89. inspect_ai/_view/www/src/components/ExpandablePanel.tsx +17 -22
  90. inspect_ai/_view/www/src/components/LargeModal.tsx +5 -1
  91. inspect_ai/_view/www/src/components/LiveVirtualList.tsx +25 -1
  92. inspect_ai/_view/www/src/components/MarkdownDiv.css +4 -0
  93. inspect_ai/_view/www/src/components/MarkdownDiv.tsx +2 -2
  94. inspect_ai/_view/www/src/components/TabSet.module.css +6 -1
  95. inspect_ai/_view/www/src/components/TabSet.tsx +8 -2
  96. inspect_ai/_view/www/src/state/hooks.ts +83 -13
  97. inspect_ai/_view/www/src/state/logPolling.ts +2 -2
  98. inspect_ai/_view/www/src/state/logSlice.ts +1 -2
  99. inspect_ai/_view/www/src/state/logsSlice.ts +9 -9
  100. inspect_ai/_view/www/src/state/samplePolling.ts +1 -1
  101. inspect_ai/_view/www/src/state/sampleSlice.ts +134 -7
  102. inspect_ai/_view/www/src/state/scoring.ts +1 -1
  103. inspect_ai/_view/www/src/state/scrolling.ts +39 -6
  104. inspect_ai/_view/www/src/state/store.ts +5 -0
  105. inspect_ai/_view/www/src/state/store_filter.ts +47 -44
  106. inspect_ai/_view/www/src/utils/debugging.ts +95 -0
  107. inspect_ai/_view/www/src/utils/format.ts +2 -2
  108. inspect_ai/_view/www/src/utils/json.ts +29 -0
  109. inspect_ai/agent/__init__.py +2 -1
  110. inspect_ai/agent/_agent.py +12 -0
  111. inspect_ai/agent/_react.py +184 -48
  112. inspect_ai/agent/_types.py +15 -2
  113. inspect_ai/analysis/beta/__init__.py +11 -3
  114. inspect_ai/analysis/beta/_dataframe/columns.py +11 -16
  115. inspect_ai/analysis/beta/_dataframe/evals/table.py +101 -39
  116. inspect_ai/analysis/beta/_dataframe/events/columns.py +50 -0
  117. inspect_ai/analysis/beta/_dataframe/events/extract.py +26 -0
  118. inspect_ai/analysis/beta/_dataframe/events/table.py +77 -3
  119. inspect_ai/analysis/beta/_dataframe/extract.py +44 -25
  120. inspect_ai/analysis/beta/_dataframe/messages/columns.py +1 -1
  121. inspect_ai/analysis/beta/_dataframe/messages/table.py +30 -29
  122. inspect_ai/analysis/beta/_dataframe/progress.py +56 -0
  123. inspect_ai/analysis/beta/_dataframe/record.py +13 -9
  124. inspect_ai/analysis/beta/_dataframe/samples/columns.py +8 -4
  125. inspect_ai/analysis/beta/_dataframe/samples/extract.py +5 -33
  126. inspect_ai/analysis/beta/_dataframe/samples/table.py +211 -60
  127. inspect_ai/analysis/beta/_dataframe/util.py +33 -28
  128. inspect_ai/log/_file.py +9 -2
  129. inspect_ai/model/_call_tools.py +1 -1
  130. inspect_ai/model/_providers/anthropic.py +18 -5
  131. inspect_ai/model/_providers/azureai.py +7 -2
  132. inspect_ai/model/_providers/util/llama31.py +3 -3
  133. inspect_ai/solver/_task_state.py +1 -1
  134. inspect_ai/tool/_mcp/_sandbox.py +17 -14
  135. {inspect_ai-0.3.95.dist-info → inspect_ai-0.3.97.dist-info}/METADATA +2 -2
  136. {inspect_ai-0.3.95.dist-info → inspect_ai-0.3.97.dist-info}/RECORD +140 -133
  137. {inspect_ai-0.3.95.dist-info → inspect_ai-0.3.97.dist-info}/WHEEL +1 -1
  138. inspect_ai/_view/www/src/app/samples/transcript/TranscriptView.module.css +0 -48
  139. inspect_ai/_view/www/src/app/samples/transcript/TranscriptView.tsx +0 -276
  140. {inspect_ai-0.3.95.dist-info → inspect_ai-0.3.97.dist-info}/entry_points.txt +0 -0
  141. {inspect_ai-0.3.95.dist-info → inspect_ai-0.3.97.dist-info}/licenses/LICENSE +0 -0
  142. {inspect_ai-0.3.95.dist-info → inspect_ai-0.3.97.dist-info}/top_level.txt +0 -0

inspect_ai/analysis/beta/_dataframe/samples/table.py CHANGED
@@ -1,29 +1,37 @@
 from __future__ import annotations
 
+import multiprocessing as mp
+from concurrent.futures import ProcessPoolExecutor, as_completed
 from dataclasses import dataclass
+from functools import lru_cache
+from itertools import chain
 from typing import (
     TYPE_CHECKING,
     Callable,
     Generator,
     Literal,
+    Sequence,
+    cast,
     overload,
 )
 
-from inspect_ai._display import display
-from inspect_ai._util.path import pretty_path
-from inspect_ai.analysis.beta._dataframe.events.columns import EventColumn
-from inspect_ai.analysis.beta._dataframe.messages.columns import MessageColumn
+from inspect_ai._util.hash import mm3_hash
+from inspect_ai.analysis.beta._dataframe.progress import import_progress, no_progress
 from inspect_ai.log._file import (
+    list_eval_logs,
     read_eval_log_sample_summaries,
     read_eval_log_samples,
 )
 from inspect_ai.log._log import EvalSample, EvalSampleSummary
-from inspect_ai.log._transcript import BaseEvent, Event
+from inspect_ai.log._transcript import Event
 from inspect_ai.model._chat_message import ChatMessage
 
-from ..columns import Column, ColumnErrors, ColumnType
+from ..columns import Column, ColumnError, ColumnType
 from ..evals.columns import EvalColumn
-from ..evals.table import EVAL_ID, EVAL_SUFFIX, ensure_eval_id, evals_df
+from ..evals.table import EVAL_ID, EVAL_SUFFIX, _read_evals_df, ensure_eval_id
+from ..events.columns import EventColumn
+from ..extract import message_as_str
+from ..messages.columns import MessageColumn
 from ..record import import_record, resolve_duplicate_columns
 from ..util import (
     LogPaths,
@@ -46,49 +54,55 @@ SAMPLE_SUFFIX = "_sample"
 
 @overload
 def samples_df(
-    logs: LogPaths,
-    columns: list[Column] = SampleSummary,
-    recursive: bool = True,
-    reverse: bool = False,
+    logs: LogPaths = list_eval_logs(),
+    columns: Sequence[Column] = SampleSummary,
     strict: Literal[True] = True,
+    parallel: bool | int = False,
+    quiet: bool = False,
 ) -> "pd.DataFrame": ...
 
 
 @overload
 def samples_df(
-    logs: LogPaths,
-    columns: list[Column] = SampleSummary,
-    recursive: bool = True,
-    reverse: bool = False,
+    logs: LogPaths = list_eval_logs(),
+    columns: Sequence[Column] = SampleSummary,
     strict: Literal[False] = False,
-) -> tuple["pd.DataFrame", ColumnErrors]: ...
+    parallel: bool | int = False,
+    quiet: bool = False,
+) -> tuple["pd.DataFrame", list[ColumnError]]: ...
 
 
 def samples_df(
-    logs: LogPaths,
-    columns: list[Column] = SampleSummary,
-    recursive: bool = True,
-    reverse: bool = False,
+    logs: LogPaths = list_eval_logs(),
+    columns: Sequence[Column] = SampleSummary,
     strict: bool = True,
-) -> "pd.DataFrame" | tuple["pd.DataFrame", ColumnErrors]:
+    parallel: bool | int = False,
+    quiet: bool = False,
+) -> "pd.DataFrame" | tuple["pd.DataFrame", list[ColumnError]]:
     """Read a dataframe containing samples from a set of evals.
 
     Args:
        logs: One or more paths to log files or log directories.
+          Defaults to the contents of the currently active log directory
+          (e.g. ./logs or INSPECT_LOG_DIR).
        columns: Specification for what columns to read from log files.
-       recursive: Include recursive contents of directories (defaults to `True`)
-       reverse: Reverse the order of the dataframe (by default, items
-          are ordered from oldest to newest).
        strict: Raise import errors immediately. Defaults to `True`.
          If `False` then a tuple of `DataFrame` and errors is returned.
+       parallel: If `True`, use `ProcessPoolExecutor` to read logs in parallel
+          (with workers based on `mp.cpu_count()`, capped at 8). If `int`, read
+          in parallel with the specified number of workers. If `False` (the default)
+          do not read in parallel.
+       quiet: If `True` do not print any output or progress (defaults to `False`).
 
     Returns:
        For `strict`, a Pandas `DataFrame` with information for the specified logs.
        For `strict=False`, a tuple of Pandas `DataFrame` and a dictionary of errors
       encountered (by log file) during import.
     """
+    verify_prerequisites()
+
     return _read_samples_df(
-        logs, columns, recursive=recursive, reverse=reverse, strict=strict
+        logs, columns, strict=strict, progress=not quiet, parallel=parallel
     )
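
A minimal usage sketch of the new signature (the `./logs` path is illustrative, not part of this diff):

```python
# sketch: samples_df() with the new parallel/quiet parameters
from inspect_ai.analysis.beta import samples_df

# reads from the active log directory (./logs or INSPECT_LOG_DIR) by default,
# fanning reads out across worker processes and suppressing progress output
df = samples_df(parallel=True, quiet=True)

# strict=False returns errors alongside the dataframe instead of raising
df, errors = samples_df("./logs", strict=False)
```
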
@@ -96,30 +110,108 @@ def samples_df(
 class MessagesDetail:
     name: str = "message"
     col_type = MessageColumn
-    filter: Callable[[ChatMessage], bool] = lambda m: True
+    filter: Callable[[ChatMessage], bool] | None = None
 
 
 @dataclass
 class EventsDetail:
-    name: str = "message"
+    name: str = "event"
     col_type = EventColumn
-    filter: Callable[[BaseEvent], bool] = lambda e: True
+    filter: Callable[[Event], bool] | None = None
 
 
 def _read_samples_df(
     logs: LogPaths,
-    columns: list[Column],
+    columns: Sequence[Column],
     *,
-    recursive: bool = True,
-    reverse: bool = False,
     strict: bool = True,
     detail: MessagesDetail | EventsDetail | None = None,
-) -> "pd.DataFrame" | tuple["pd.DataFrame", ColumnErrors]:
-    verify_prerequisites()
+    progress: bool = True,
+    parallel: bool | int = False,
+) -> "pd.DataFrame" | tuple["pd.DataFrame", list[ColumnError]]:
+    import pandas as pd
 
     # resolve logs
-    logs = resolve_logs(logs, recursive=recursive, reverse=reverse)
+    logs = resolve_logs(logs)
+
+    if parallel:
+        # resolve number of workers (cap at 8 as eventually we run into disk/memory contention)
+        if parallel is True:
+            parallel = max(min(mp.cpu_count(), 8), 2)
+
+        # flatted out list of logs
+        logs = resolve_logs(logs)
+
+        # establish progress
+        entity = detail.name if detail else "sample"
+        progress_cm = (
+            import_progress(f"reading {entity}s", total=len(logs))
+            if progress
+            else no_progress()
+        )
 
+        # run the parallel reads (setup arrays for holding results in order)
+        df_results: list[pd.DataFrame | None] = [None] * len(logs)
+        error_results: list[list[ColumnError] | None] = [None] * len(logs)
+        executor = ProcessPoolExecutor(max_workers=parallel)
+        try:
+            with progress_cm as p:
+                futures = {
+                    executor.submit(
+                        _read_samples_df_serial,  # type: ignore[arg-type]
+                        logs=[log],
+                        columns=columns,
+                        strict=strict,
+                        detail=detail,
+                        progress=False,
+                    ): idx
+                    for idx, log in enumerate(logs)
+                }
+                for fut in as_completed(futures):
+                    idx = futures[fut]
+                    if strict:
+                        df_results[idx] = cast(pd.DataFrame, fut.result())
+                    else:
+                        df, errs = cast(
+                            tuple[pd.DataFrame, list[ColumnError]], fut.result()
+                        )
+                        df_results[idx] = df
+                        error_results[idx] = errs
+                    p.update()
+        finally:
+            executor.shutdown(wait=False, cancel_futures=True)
+
+        # recombine df
+        df = pd.concat(df_results, ignore_index=True)
+        subset = f"{detail.name}_id" if detail else SAMPLE_ID
+        df.drop_duplicates(subset=subset, ignore_index=True, inplace=True)
+
+        # recombine errors
+        errors: list[ColumnError] = list(
+            chain.from_iterable(e for e in error_results if e)
+        )
+
+        # return as required
+        if strict:
+            return df
+        else:
+            return df, errors
+
+    # non-parallel
+    else:
+        return _read_samples_df_serial(
+            logs=logs, columns=columns, strict=strict, detail=detail, progress=progress
+        )
+
+
+def _read_samples_df_serial(
+    logs: list[str],
+    columns: Sequence[Column],
+    *,
+    strict: bool = True,
+    detail: MessagesDetail | EventsDetail | None = None,
+    progress: bool = True,
+) -> "pd.DataFrame" | tuple["pd.DataFrame", list[ColumnError]]:
     # split columns by type
     columns_eval: list[Column] = []
     columns_sample: list[Column] = []
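
The parallel branch above is a standard fan-out/fan-in: results are slotted into a preallocated list keyed by input index, so completion order never reorders the output. A self-contained sketch of the same pattern, independent of inspect_ai:

```python
# order-preserving process fan-out, mirroring the pattern used above
from concurrent.futures import ProcessPoolExecutor, as_completed


def work(n: int) -> int:
    return n * n


def map_ordered(items: list[int], workers: int) -> list[int | None]:
    # slots indexed by input position
    results: list[int | None] = [None] * len(items)
    with ProcessPoolExecutor(max_workers=workers) as executor:
        futures = {executor.submit(work, item): idx for idx, item in enumerate(items)}
        for fut in as_completed(futures):
            results[futures[fut]] = fut.result()  # completion order doesn't matter
    return results


if __name__ == "__main__":
    print(map_ordered(list(range(8)), workers=4))  # [0, 1, 4, 9, ...]
```
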
@@ -148,33 +240,56 @@ def _read_samples_df(
     )
 
     # make sure eval_id is present
-    ensure_eval_id(columns_eval)
-
-    # read samples from each log
-    sample_records: list[dict[str, ColumnType]] = []
-    detail_records: list[dict[str, ColumnType]] = []
-    all_errors = ColumnErrors()
-    evals_table = evals_df(logs, columns=columns_eval)
-    with display().progress(total=len(evals_table)) as p:
+    columns_eval = list(ensure_eval_id(columns_eval))
+
+    # establish progress
+    progress_cm = (
+        import_progress("scanning logs", total=len(logs)) if progress else no_progress()
+    )
+
+    # determine how we will allocate progress
+    with progress_cm as p:
+        # read samples from each log
+        sample_records: list[dict[str, ColumnType]] = []
+        detail_records: list[dict[str, ColumnType]] = []
+        all_errors: list[ColumnError] = []
+
+        # read logs and note total samples
+        evals_table, eval_logs, total_samples = _read_evals_df(
+            logs, columns=columns_eval, strict=True, progress=p.update
+        )
+
+        # update progress now that we know the total samples
+        entity = detail.name if detail else "sample"
+        p.reset(description=f"reading {entity}s", completed=0, total=total_samples)
+
         # read samples
-        for eval_id, log in zip(evals_table[EVAL_ID].to_list(), logs):
+        for eval_id, eval_log in zip(evals_table[EVAL_ID].to_list(), eval_logs):
             # get a generator for the samples (might require reading the full log
             # or might be fine to just read the summaries)
             if require_full_samples:
                 samples: Generator[EvalSample | EvalSampleSummary, None, None] = (
                     read_eval_log_samples(
-                        log, all_samples_required=False, resolve_attachments=True
+                        eval_log.location,
+                        all_samples_required=False,
+                        resolve_attachments=True,
                     )
                 )
             else:
-                samples = (summary for summary in read_eval_log_sample_summaries(log))
+                samples = (
+                    summary
+                    for summary in read_eval_log_sample_summaries(eval_log.location)
+                )
             for sample in samples:
                 if strict:
-                    record = import_record(sample, columns_sample, strict=True)
+                    record = import_record(
+                        eval_log, sample, columns_sample, strict=True
+                    )
                 else:
-                    record, errors = import_record(sample, columns_sample, strict=False)
-                    error_key = f"{pretty_path(log)} [{sample.id}, {sample.epoch}]"
-                    all_errors[error_key] = errors
+                    record, errors = import_record(
+                        eval_log, sample, columns_sample, strict=False
+                    )
+                    all_errors.extend(errors)
 
                 # inject ids
                 sample_id = sample.uuid or auto_sample_id(eval_id, sample)
@@ -191,11 +306,15 @@
                 # filter detail records
                 assert isinstance(sample, EvalSample)
                 if isinstance(detail, MessagesDetail):
-                    detail_items: list[ChatMessage] | list[Event] = [
-                        m for m in sample.messages if detail.filter(m)
-                    ]
+                    detail_items: list[ChatMessage] | list[Event] = (
+                        sample_messages_from_events(sample.events, detail.filter)
+                    )
                 elif isinstance(detail, EventsDetail):
-                    detail_items = [e for e in sample.events if detail.filter(e)]
+                    detail_items = [
+                        e
+                        for e in sample.events
+                        if detail.filter is None or detail.filter(e)
+                    ]
                 else:
                     detail_items = []
 
@@ -203,16 +322,13 @@
                 for index, item in enumerate(detail_items):
                     if strict:
                         detail_record = import_record(
-                            item, columns_detail, strict=True
+                            eval_log, item, columns_detail, strict=True
                         )
                     else:
                         detail_record, errors = import_record(
-                            item, columns_detail, strict=False
-                        )
-                        error_key = (
-                            f"{pretty_path(log)} [{sample.id}, {sample.epoch}]"
+                            eval_log, item, columns_detail, strict=False
                         )
-                        all_errors[error_key] = errors
+                        all_errors.extend(errors)
 
                 # inject ids
                 detail_id = detail_record.get(
@@ -226,14 +342,20 @@
 
             # record sample record
             sample_records.append(record)
-        p.update()
+                p.update()
 
     # normalize records and produce samples table
     samples_table = records_to_pandas(sample_records)
+    samples_table.drop_duplicates(
+        "sample_id", keep="first", inplace=True, ignore_index=True
+    )
 
     # if we have detail records then join them into the samples table
     if detail is not None:
         details_table = records_to_pandas(detail_records)
+        details_table.drop_duplicates(
+            f"{detail.name}_id", keep="first", inplace=True, ignore_index=True
+        )
         samples_table = details_table.merge(
             samples_table,
             on=SAMPLE_ID,
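
The new drop_duplicates calls guard against the same sample (or detail row) appearing twice, e.g. when a log is listed more than once. A tiny illustration of the `keep="first"` behavior (data is illustrative):

```python
# duplicate sample_id rows collapse to the first occurrence
import pandas as pd

df = pd.DataFrame({"sample_id": ["s1", "s1", "s2"], "score": [1, 1, 0]})
df.drop_duplicates("sample_id", keep="first", inplace=True, ignore_index=True)
print(df)  # rows for s1 and s2 only
```
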
@@ -262,6 +384,35 @@
         return samples_table, all_errors
 
 
+def sample_messages_from_events(
+    events: list[Event], filter: Callable[[ChatMessage], bool] | None
+) -> list[ChatMessage]:
+    # don't yield the same event twice
+    ids: set[str] = set()
+
+    # we need to look at the full input to every model event and add
+    # messages we haven't seen before
+    messages: list[ChatMessage] = []
+    for event in events:
+        if event.event == "model":
+            event_messages = event.input + (
+                [event.output.message] if not event.output.empty else []
+            )
+            for message in event_messages:
+                id = message.id or message_hash(message_as_str(message))
+                if id not in ids:
+                    messages.append(message)
+                    ids.add(id)
+
+    # then apply the filter
+    return [message for message in messages if filter is None or filter(message)]
+
+
+@lru_cache(maxsize=100)
+def message_hash(message: str) -> str:
+    return mm3_hash(message)
+
+
 def reorder_samples_df_columns(
     df: "pd.DataFrame",
     eval_columns: list[Column],
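
The helper dedupes by message id when present, falling back to a hash of the rendered message. The same idea in a standalone sketch (hashlib stands in here for inspect_ai's internal mm3_hash):

```python
# id-or-content-hash dedup, as in sample_messages_from_events above
import hashlib


def content_hash(text: str) -> str:
    return hashlib.md5(text.encode()).hexdigest()


seen: set[str] = set()
unique: list[dict] = []
for message in [
    {"id": "m1", "content": "hello"},
    {"id": None, "content": "hi"},
    {"id": None, "content": "hi"},  # dropped: same content hash
]:
    key = message["id"] or content_hash(message["content"])
    if key not in seen:
        seen.add(key)
        unique.append(message)
print(len(unique))  # 2
```
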

inspect_ai/analysis/beta/_dataframe/util.py CHANGED
@@ -9,7 +9,7 @@ from typing import TYPE_CHECKING, Sequence, TypeAlias
 from inspect_ai._util.error import pip_dependency_error
 from inspect_ai._util.file import FileInfo, filesystem
 from inspect_ai._util.version import verify_required_version
-from inspect_ai.log._file import log_files_from_ls
+from inspect_ai.log._file import EvalLogInfo, log_files_from_ls
 
 if TYPE_CHECKING:
     import pandas as pd
@@ -17,7 +17,9 @@ if TYPE_CHECKING:
 
 from .columns import ColumnType
 
-LogPaths: TypeAlias = PathLike[str] | str | Sequence[PathLike[str] | str]
+LogPaths: TypeAlias = (
+    PathLike[str] | str | EvalLogInfo | Sequence[PathLike[str] | str | EvalLogInfo]
+)
 
 
 def verify_prerequisites() -> None:
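
With EvalLogInfo admitted into LogPaths, the records returned by list_eval_logs() can be passed straight through; a sketch (the directory is illustrative):

```python
# pass EvalLogInfo records directly now that LogPaths accepts them
from inspect_ai.log import list_eval_logs
from inspect_ai.analysis.beta import samples_df

logs = list_eval_logs("./logs")  # list[EvalLogInfo]
df = samples_df(logs=logs[:5])   # no manual .name extraction needed
```
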
@@ -37,34 +39,35 @@ def verify_prerequisites() -> None:
         raise pip_dependency_error("inspect_ai.analysis", required_packages)
 
     # enforce version constraints
-    verify_required_version("inspect_ai.analysis", "pandas", "2.0.0")
+    verify_required_version("inspect_ai.analysis", "pandas", "2.1.0")
     verify_required_version("inspect_ai.analysis", "pyarrow", "10.0.1")
 
 
-def resolve_logs(logs: LogPaths, recursive: bool, reverse: bool) -> list[str]:
+def resolve_logs(logs: LogPaths) -> list[str]:
     # normalize to list of str
-    logs = [logs] if isinstance(logs, str | PathLike) else logs
-    logs = [Path(log).as_posix() if isinstance(log, PathLike) else log for log in logs]
+    logs = [logs] if isinstance(logs, str | PathLike | EvalLogInfo) else logs
+    logs_str = [
+        Path(log).as_posix()
+        if isinstance(log, PathLike)
+        else log.name
+        if isinstance(log, EvalLogInfo)
+        else log
+        for log in logs
+    ]
 
     # expand directories
     log_paths: list[FileInfo] = []
-    for log in logs:
-        if isinstance(log, PathLike):
-            log = Path(log).as_posix()
-        fs = filesystem(log)
-        info = fs.info(log)
+    for log_str in logs_str:
+        fs = filesystem(log_str)
+        info = fs.info(log_str)
         if info.type == "directory":
             log_paths.extend(
-                [
-                    fi
-                    for fi in fs.ls(info.name, recursive=recursive)
-                    if fi.type == "file"
-                ]
+                [fi for fi in fs.ls(info.name, recursive=True) if fi.type == "file"]
             )
         else:
            log_paths.append(info)
 
-    log_files = log_files_from_ls(log_paths, descending=reverse)
+    log_files = log_files_from_ls(log_paths, sort=False)
     return [log_file.name for log_file in log_files]
 
@@ -138,20 +141,22 @@ def add_unreferenced_columns(
 def records_to_pandas(records: list[dict[str, ColumnType]]) -> "pd.DataFrame":
     import pyarrow as pa
 
+    # create arrow table
     records = normalize_records(records)
-    table = pa.Table.from_pylist(records).to_pandas(types_mapper=arrow_types_mapper)
-    return table
+    table = pa.Table.from_pylist(records)
 
+    # convert arrow to pandas
+    df = table.to_pandas(types_mapper=arrow_types_mapper)
 
-def arrow_types_mapper(
-    arrow_type: "pa.DataType",
-) -> "pd.api.extensions.ExtensionDtype" | None:
+    # swap numpy-backed nullable columns for arrow-backed equivalents
+    # df = df.convert_dtypes(dtype_backend="pyarrow")
+    return df
+
+
+def arrow_types_mapper(arrow_type: pa.DataType) -> pd.ArrowDtype:
     import pandas as pd
     import pyarrow as pa
 
-    # convert str => str
-    if pa.types.is_string(arrow_type):
-        return pd.StringDtype()
-    # default conversion for other types
-    else:
-        return None
+    if pa.types.is_null(arrow_type):
+        arrow_type = pa.string()
+    return pd.ArrowDtype(arrow_type)
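
The mapper now hands every column an Arrow-backed dtype (pd.ArrowDtype) rather than special-casing strings, consistent with the pandas floor moving to 2.1.0 elsewhere in this diff. A runnable sketch of the hook (data is illustrative):

```python
# types_mapper hook with pd.ArrowDtype
import pandas as pd
import pyarrow as pa


def arrow_types_mapper(arrow_type: pa.DataType) -> pd.ArrowDtype:
    if pa.types.is_null(arrow_type):  # all-null columns arrive as pa.null()
        arrow_type = pa.string()
    return pd.ArrowDtype(arrow_type)


table = pa.Table.from_pylist([{"id": "a", "x": 1.5}, {"id": "b", "x": None}])
df = table.to_pandas(types_mapper=arrow_types_mapper)
print(df.dtypes)  # string[pyarrow], double[pyarrow]
```
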
inspect_ai/log/_file.py CHANGED
@@ -526,12 +526,19 @@ def log_files_from_ls(
     ls: list[FileInfo],
     formats: list[Literal["eval", "json"]] | None = None,
     descending: bool = True,
+    sort: bool = True,
 ) -> list[EvalLogInfo]:
     extensions = [f".{format}" for format in (formats or ALL_LOG_FORMATS)]
     return [
         log_file_info(file)
-        for file in sorted(
-            ls, key=lambda file: (file.mtime if file.mtime else 0), reverse=descending
+        for file in (
+            sorted(
+                ls,
+                key=lambda file: (file.mtime if file.mtime else 0),
+                reverse=descending,
+            )
+            if sort
+            else ls
         )
         if file.type == "file" and is_log_file(file.name, extensions)
     ]

inspect_ai/model/_call_tools.py CHANGED
@@ -303,7 +303,7 @@ async def execute_tools(
             )
             result_messages.append(tool_message)
             display_conversation_message(tool_message)
-        else:
+        elif result is not None:
             for message in result.messages:
                 result_messages.append(message)
                 display_conversation_message(message)

inspect_ai/model/_providers/anthropic.py CHANGED
@@ -276,13 +276,25 @@ class AnthropicAPI(ModelAPI):
         params = dict(model=self.service_model_name(), max_tokens=max_tokens)
         headers: dict[str, str] = {}
         betas: list[str] = []
-        # some params not compatible with thinking models
-        if not self.is_using_thinking(config):
-            if config.temperature is not None:
+
+        # temperature not compatible with extended thinking
+        THINKING_WARNING = "anthropic models do not support the '{parameter}' parameter when using extended thinking."
+        if config.temperature is not None:
+            if self.is_using_thinking(config):
+                warn_once(logger, THINKING_WARNING.format(parameter="temperature"))
+            else:
                 params["temperature"] = config.temperature
-            if config.top_p is not None:
+        # top_p not compatible with extended thinking
+        if config.top_p is not None:
+            if self.is_using_thinking(config):
+                warn_once(logger, THINKING_WARNING.format(parameter="top_p"))
+            else:
                 params["top_p"] = config.top_p
-            if config.top_k is not None:
+        # top_k not compatible with extended thinking
+        if config.top_k is not None:
+            if self.is_using_thinking(config):
+                warn_once(logger, THINKING_WARNING.format(parameter="top_k"))
+            else:
                 params["top_k"] = config.top_k
 
         # some thinking-only stuff
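
The restructuring turns a silent drop into an explicit warning when temperature/top_p/top_k are set alongside extended thinking. A generic sketch of that warn-instead-of-drop pattern (warn_once and is_using_thinking are inspect_ai internals; plain logging stands in here):

```python
# warn-instead-of-silently-drop, as in the Anthropic provider change above
import logging

logger = logging.getLogger(__name__)
THINKING_WARNING = (
    "anthropic models do not support the '{parameter}' parameter "
    "when using extended thinking."
)


def apply_param(params: dict, name: str, value: float | None, thinking: bool) -> None:
    if value is None:
        return
    if thinking:
        logger.warning(THINKING_WARNING.format(parameter=name))  # surfaced, not set
    else:
        params[name] = value


params: dict = {}
apply_param(params, "temperature", 0.7, thinking=True)  # warns
apply_param(params, "top_p", 0.9, thinking=False)       # sets
print(params)  # {'top_p': 0.9}
```
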
@@ -346,6 +358,7 @@
         # for "overloaded_error" so we check for it explicitly
         if (
             isinstance(ex.body, dict)
+            and isinstance(ex.body.get("error", {}), dict)
             and ex.body.get("error", {}).get("type", "") == "overloaded_error"
         ):
             return True

inspect_ai/model/_providers/azureai.py CHANGED
@@ -138,6 +138,7 @@ class AzureAIAPI(ModelAPI):
     ) -> ModelOutput | tuple[ModelOutput | Exception, ModelCall]:
         # emulate tools (auto for llama, opt-in for others)
         if self.emulate_tools is None and self.is_llama():
+            self.emulate_tools = True
             handler: ChatAPIHandler | None = Llama31Handler(self.model_name)
         elif self.emulate_tools:
             handler = Llama31Handler(self.model_name)
@@ -151,10 +152,14 @@
         # prepare request
         request = dict(
             messages=await chat_request_messages(input, handler),
-            tools=chat_tools(tools) if len(tools) > 0 else None,
-            tool_choice=chat_tool_choice(tool_choice) if len(tools) > 0 else None,
             **self.completion_params(config),
         )
+        # newer versions of vllm reject requests with tools or tool_choice if the
+        # server hasn't been started explicitly with the --tool-call-parser and
+        # --enable-auto-tool-choice flags
+        if (not self.emulate_tools) and len(tools) > 0:
+            request["tools"] = chat_tools(tools)
+            request["tool_choice"] = chat_tool_choice(tool_choice)
 
         # create client (note the client needs to be created and closed
         # with each call so it can be cleaned up and not end up on another

inspect_ai/model/_providers/util/llama31.py CHANGED
@@ -79,7 +79,7 @@ class Llama31Handler(ChatAPIHandler):
         prompt that asks the model to use the <tool_call>...</tool_call> syntax)
         """
         # extract tool calls
-        tool_call_regex = rf"<{TOOL_CALL}>((?:.|\n)*?)</{TOOL_CALL}>"
+        tool_call_regex = rf"<{TOOL_CALL}s?>((?:.|\n)*?)</{TOOL_CALL}s?>"
         tool_calls_content: list[str] = re.findall(tool_call_regex, response)
 
         # if there are tool calls proceed with parsing
@@ -93,7 +93,7 @@
         ]
 
         # find other content that exists outside tool calls
-        tool_call_content_regex = rf"<{TOOL_CALL}>(?:.|\n)*?</{TOOL_CALL}>"
+        tool_call_content_regex = rf"<{TOOL_CALL}s?>(?:.|\n)*?</{TOOL_CALL}s?>"
         other_content = re.split(tool_call_content_regex, response, flags=re.DOTALL)
         other_content = [
             str(content).strip()
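
The added `s?` makes both patterns tolerate the `<tool_calls>` spelling that some models emit. A quick check with illustrative responses:

```python
import re

TOOL_CALL = "tool_call"
tool_call_regex = rf"<{TOOL_CALL}s?>((?:.|\n)*?)</{TOOL_CALL}s?>"

for response in (
    '<tool_call>{"name": "add", "arguments": {"x": 1}}</tool_call>',
    '<tool_calls>{"name": "add", "arguments": {}}</tool_calls>',
):
    print(re.findall(tool_call_regex, response))  # both variants match
```
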
@@ -164,7 +164,7 @@ def parse_tool_call_content(content: str, tools: list[ToolInfo]) -> ToolCall:
     # see if we can get the fields (if not report error)
     name = tool_call_data.get("name", None)
     arguments = tool_call_data.get("arguments", None)
-    if not name or not arguments:
+    if not name or (arguments is None):
         raise ValueError(
             "Required 'name' and 'arguments' not provided in JSON dictionary."
         )
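
The old truthiness test rejected legitimate calls whose arguments were an empty dict; only a genuinely missing key should fail now. An illustration (values are made up):

```python
# why the check changed: {} is falsy, so `not arguments` rejected no-arg calls
tool_call_data = {"name": "list_files", "arguments": {}}
name = tool_call_data.get("name", None)
arguments = tool_call_data.get("arguments", None)

old_rejects = not name or not arguments        # True: {} is falsy
new_rejects = not name or (arguments is None)  # False: {} is present
print(old_rejects, new_rejects)                # True False
```
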

inspect_ai/solver/_task_state.py CHANGED
@@ -138,7 +138,7 @@ class TaskState:
     The `TaskState` represents the internal state of the `Task` being run for a single `Sample`.
 
     The `TaskState` is passed to and returned from each solver during a sample's
-    evaluation. It allows us to manipulated the message history, the tools
+    evaluation. It allows us to manipulate the message history, the tools
     available to the model, the final output of the model, and whether the task
     is completed or has hit a limit.
     """