inspect-ai 0.3.51__py3-none-any.whl → 0.3.52__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
inspect_ai/_cli/eval.py CHANGED
@@ -12,7 +12,7 @@ from inspect_ai._util.constants import (
12
12
  DEFAULT_MAX_RETRIES,
13
13
  )
14
14
  from inspect_ai._util.file import filesystem
15
- from inspect_ai._util.samples import parse_samples_limit
15
+ from inspect_ai._util.samples import parse_sample_id, parse_samples_limit
16
16
  from inspect_ai.log._file import log_file_info
17
17
  from inspect_ai.model import GenerateConfigArgs
18
18
  from inspect_ai.scorer._reducer import create_reducers
@@ -144,6 +144,12 @@ def eval_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
144
144
  help="Limit samples to evaluate e.g. 10 or 10-20",
145
145
  envvar="INSPECT_EVAL_LIMIT",
146
146
  )
147
+ @click.option(
148
+ "--sample-id",
149
+ type=str,
150
+ help="Evaluate specific sample(s) (comma separated list of ids)",
151
+ envvar="INSPECT_EVAL_SAMPLE_ID",
152
+ )
147
153
  @click.option(
148
154
  "--epochs",
149
155
  type=int,
@@ -391,6 +397,7 @@ def eval_command(
391
397
  epochs: int | None,
392
398
  epochs_reducer: str | None,
393
399
  limit: str | None,
400
+ sample_id: str | None,
394
401
  max_retries: int | None,
395
402
  timeout: int | None,
396
403
  max_connections: int | None,
@@ -458,6 +465,7 @@ def eval_command(
458
465
  epochs=epochs,
459
466
  epochs_reducer=epochs_reducer,
460
467
  limit=limit,
468
+ sample_id=sample_id,
461
469
  message_limit=message_limit,
462
470
  token_limit=token_limit,
463
471
  time_limit=time_limit,
@@ -543,6 +551,7 @@ def eval_set_command(
543
551
  epochs: int | None,
544
552
  epochs_reducer: str | None,
545
553
  limit: str | None,
554
+ sample_id: str | None,
546
555
  max_retries: int | None,
547
556
  timeout: int | None,
548
557
  max_connections: int | None,
@@ -612,6 +621,7 @@ def eval_set_command(
612
621
  epochs=epochs,
613
622
  epochs_reducer=epochs_reducer,
614
623
  limit=limit,
624
+ sample_id=sample_id,
615
625
  message_limit=message_limit,
616
626
  token_limit=token_limit,
617
627
  time_limit=time_limit,
@@ -662,6 +672,7 @@ def eval_exec(
662
672
  epochs: int | None,
663
673
  epochs_reducer: str | None,
664
674
  limit: str | None,
675
+ sample_id: str | None,
665
676
  message_limit: int | None,
666
677
  token_limit: int | None,
667
678
  time_limit: int | None,
@@ -699,8 +710,9 @@ def eval_exec(
699
710
  else None
700
711
  )
701
712
 
702
- # resolve range
713
+ # resolve range and sample id
703
714
  eval_limit = parse_samples_limit(limit)
715
+ eval_sample_id = parse_sample_id(sample_id)
704
716
 
705
717
  # resolve fail_on_error
706
718
  if no_fail_on_error is True:
@@ -734,6 +746,7 @@ def eval_exec(
734
746
  log_dir=log_dir,
735
747
  log_format=log_format,
736
748
  limit=eval_limit,
749
+ sample_id=eval_sample_id,
737
750
  epochs=eval_epochs,
738
751
  fail_on_error=fail_on_error,
739
752
  debug_errors=debug_errors,
@@ -24,6 +24,10 @@ def task_config(
24
24
  config_print.append(
25
25
  f"{name}: {','.join([approver['name'] for approver in value['approvers']])}"
26
26
  )
27
+ elif name == "sample_id":
28
+ value = value if isinstance(value, list) else [value]
29
+ value = [str(v) for v in value]
30
+ config_print.append(f"{name}: {','.join(value)}")
27
31
  elif name not in ["limit", "model"]:
28
32
  config_print.append(f"{name}: {value}")
29
33
  values = ", ".join(config_print)
@@ -112,7 +112,7 @@ def tasks_title(completed: int, total: int) -> str:
112
112
  def task_title(profile: TaskProfile, show_model: bool) -> str:
113
113
  eval_epochs = profile.eval_config.epochs or 1
114
114
  epochs = f" x {profile.eval_config.epochs}" if eval_epochs > 1 else ""
115
- samples = f"{profile.samples//eval_epochs:,}{epochs} sample{'s' if profile.samples > 1 else ''}"
115
+ samples = f"{profile.samples//eval_epochs:,}{epochs} sample{'s' if profile.samples != 1 else ''}"
116
116
  title = f"{registry_unqualified_name(profile.name)} ({samples})"
117
117
  if show_model:
118
118
  title = f"{title}: {profile.model}"
@@ -63,6 +63,9 @@ class TaskDetail(Widget):
63
63
  def update_metrics(self, metrics: list[TaskDisplayMetric]) -> None:
64
64
  # Group by reducer then scorer within reducers
65
65
  self.metrics = metrics
66
+
67
+ # clear the existing computed reducers
68
+ self.by_reducer = {}
66
69
  for metric in metrics:
67
70
  reducer_group = (
68
71
  self.by_reducer[metric.reducer]
@@ -117,6 +120,7 @@ class TaskDetail(Widget):
117
120
  for remove in to_remove:
118
121
  task_metric = self.existing_metrics[remove]
119
122
  task_metric.remove()
123
+ del self.existing_metrics[remove]
120
124
 
121
125
  # add or update widgets with metrics
122
126
  for reducer, scorers in self.by_reducer.items():
@@ -187,24 +191,49 @@ class TaskMetrics(Widget):
187
191
  self.grid: Grid = Grid()
188
192
  self.value_widgets: dict[str, Static] = {}
189
193
 
194
+ def grid_id(self) -> str:
195
+ return f"{self.id}-grid"
196
+
190
197
  def compose(self) -> ComposeResult:
191
- # Just yield a single DataTable widget
198
+ # Yield the title and base grid
192
199
  yield Center(self._title())
193
- with Grid():
194
- for metric in self.metrics:
195
- # Add the value static but keep it around
196
- # for future updates
197
- self.value_widgets[metric.name] = Static(
198
- self._metric_value(metric.value)
199
- )
200
-
201
- yield Static(metric.name)
202
- yield self.value_widgets[metric.name]
200
+ yield Grid(id=self.grid_id())
203
201
 
204
202
  def update(self, metrics: list[TaskMetric]) -> None:
203
+ self.metrics = metrics
204
+
205
+ # We assume that generally the initial metric names will
206
+ # always match future updates (so we can just update values in line)
207
+ # but if an unrecognized metric appears on the scene, just
208
+ # recompute the whole grid
209
+ need_recompute = False
205
210
  for metric in metrics:
206
- widget = self.value_widgets[metric.name]
207
- widget.update(content=f"{metric.value:,.3f}")
211
+ widget = self.value_widgets.get(metric.name)
212
+ if widget:
213
+ # Just update the values themselves
214
+ widget.update(content=f"{metric.value:,.3f}")
215
+ else:
216
+ # Don't have a widget for this, recompute the whole grid
217
+ need_recompute = True
218
+ break
219
+
220
+ if need_recompute:
221
+ self.recompute_grid()
222
+
223
+ def on_mount(self) -> None:
224
+ self.recompute_grid()
225
+
226
+ def recompute_grid(self) -> None:
227
+ grid = self.query_one(f"#{self.grid_id()}")
228
+
229
+ grid.remove_children()
230
+ for metric in self.metrics:
231
+ # Add the value static but keep it around
232
+ # for future updates
233
+ self.value_widgets[metric.name] = Static(self._metric_value(metric.value))
234
+
235
+ grid.mount(Static(metric.name))
236
+ grid.mount(self.value_widgets[metric.name])
208
237
 
209
238
  def _title(self) -> Widget:
210
239
  if self.scorer is None:
@@ -10,10 +10,10 @@ from textual.widget import Widget
10
10
  from textual.widgets import Static
11
11
 
12
12
  from inspect_ai._util.content import ContentText
13
- from inspect_ai._util.format import format_function_call
14
13
  from inspect_ai._util.rich import lines_display
15
14
  from inspect_ai._util.transcript import (
16
15
  set_transcript_markdown_options,
16
+ transcript_function,
17
17
  transcript_markdown,
18
18
  transcript_separator,
19
19
  )
@@ -36,6 +36,7 @@ from inspect_ai.log._transcript import (
36
36
  from inspect_ai.model._chat_message import ChatMessage, ChatMessageUser
37
37
  from inspect_ai.model._render import messages_preceding_assistant
38
38
  from inspect_ai.tool._tool import ToolResult
39
+ from inspect_ai.tool._tool_transcript import transcript_tool_call
39
40
 
40
41
 
41
42
  class TranscriptView(ScrollableContainer):
@@ -195,16 +196,7 @@ def render_tool_event(event: ToolEvent) -> list[EventDisplay]:
195
196
  display.extend(render_event(e) or [])
196
197
 
197
198
  # render the call
198
- content: list[RenderableType] = []
199
- if event.view:
200
- if event.view.title:
201
- content.append(Text.from_markup(f"[bold]{event.view.title}[/bold]\n"))
202
- if event.view.format == "markdown":
203
- content.append(transcript_markdown(event.view.content))
204
- else:
205
- content.append(event.view.content)
206
- else:
207
- content.append(render_function_call(event.function, event.arguments))
199
+ content = transcript_tool_call(event)
208
200
 
209
201
  # render the output
210
202
  if isinstance(event.result, list):
@@ -266,7 +258,7 @@ def render_subtask_event(event: SubtaskEvent) -> list[EventDisplay]:
266
258
  for e in event.events:
267
259
  display.extend(render_event(e) or [])
268
260
 
269
- content: list[RenderableType] = [render_function_call(event.name, event.input)]
261
+ content: list[RenderableType] = [transcript_function(event.name, event.input)]
270
262
  if event.result:
271
263
  content.append(Text())
272
264
  if isinstance(event.result, str | int | float | bool | None):
@@ -309,11 +301,6 @@ def render_error_event(event: ErrorEvent) -> EventDisplay:
309
301
  return EventDisplay("error", event.error.traceback.strip())
310
302
 
311
303
 
312
- def render_function_call(function: str, arguments: dict[str, Any]) -> RenderableType:
313
- call = format_function_call(function, arguments)
314
- return transcript_markdown("```python\n" + call + "\n```\n")
315
-
316
-
317
304
  def render_as_json(json: Any) -> RenderableType:
318
305
  return transcript_markdown(
319
306
  "```json\n"
inspect_ai/_eval/eval.py CHANGED
@@ -61,6 +61,7 @@ def eval(
61
61
  log_dir: str | None = None,
62
62
  log_format: Literal["eval", "json"] | None = None,
63
63
  limit: int | tuple[int, int] | None = None,
64
+ sample_id: str | int | list[str | int] | None = None,
64
65
  epochs: int | Epochs | None = None,
65
66
  fail_on_error: bool | float | None = None,
66
67
  debug_errors: bool | None = None,
@@ -110,6 +111,7 @@ def eval(
110
111
  to "eval", the native high-performance format).
111
112
  limit (int | tuple[int, int] | None): Limit evaluated samples
112
113
  (defaults to all samples).
114
+ sample_id (str | int | list[str | int] | None): Evaluate specific sample(s) from the dataset.
113
115
  epochs (int | Epochs | None): Epochs to repeat samples for and optional score
114
116
  reducer function(s) used to combine sample scores (defaults to "mean")
115
117
  fail_on_error (bool | float | None): `True` to fail on first sample error
@@ -163,6 +165,7 @@ def eval(
163
165
  log_dir=log_dir,
164
166
  log_format=log_format,
165
167
  limit=limit,
168
+ sample_id=sample_id,
166
169
  epochs=epochs,
167
170
  fail_on_error=fail_on_error,
168
171
  debug_errors=debug_errors,
@@ -198,6 +201,7 @@ async def eval_async(
198
201
  log_dir: str | None = None,
199
202
  log_format: Literal["eval", "json"] | None = None,
200
203
  limit: int | tuple[int, int] | None = None,
204
+ sample_id: str | int | list[str | int] | None = None,
201
205
  epochs: int | Epochs | None = None,
202
206
  fail_on_error: bool | float | None = None,
203
207
  debug_errors: bool | None = None,
@@ -245,8 +249,9 @@ async def eval_async(
245
249
  (defaults to file log in ./logs directory).
246
250
  log_format (Literal["eval", "json"] | None): Format for writing log files (defaults
247
251
  to "eval", the native high-performance format).
248
- limit (int | tuple[int, int] | None): Limit evaluated samples
252
+ limit (int | tuple[int, int] | None): Limit evaluated samples
249
253
  (defaults to all samples).
254
+ sample_id (str | int | list[str | int] | None): Evaluate specific sample(s) from the dataset.
250
255
  epochs (int | Epochs | None): Epochs to repeat samples for and optional score
251
256
  reducer function(s) used to combine sample scores (defaults to "mean")
252
257
  fail_on_error (bool | float | None): `True` to fail on first sample error
@@ -335,6 +340,10 @@ async def eval_async(
335
340
  # resolve solver
336
341
  solver = chain(solver) if isinstance(solver, list) else solver
337
342
 
343
+ # ensure consistency of limit and sample_id
344
+ if sample_id is not None and limit is not None:
345
+ raise ValueError("You cannot specify both sample_id and limit.")
346
+
338
347
  # resolve epochs
339
348
  if isinstance(epochs, int):
340
349
  epochs = Epochs(epochs)
@@ -345,6 +354,7 @@ async def eval_async(
345
354
  epochs_reducer = epochs.reducer if epochs else None
346
355
  eval_config = EvalConfig(
347
356
  limit=limit,
357
+ sample_id=sample_id,
348
358
  epochs=epochs.epochs if epochs else None,
349
359
  epochs_reducer=reducer_log_names(epochs_reducer)
350
360
  if epochs_reducer
@@ -642,6 +652,7 @@ async def eval_retry_async(
642
652
  task_args = eval_log.eval.task_args
643
653
  tags = eval_log.eval.tags
644
654
  limit = eval_log.eval.config.limit
655
+ sample_id = eval_log.eval.config.sample_id
645
656
  epochs = (
646
657
  Epochs(eval_log.eval.config.epochs, eval_log.eval.config.epochs_reducer)
647
658
  if eval_log.eval.config.epochs
@@ -699,6 +710,7 @@ async def eval_retry_async(
699
710
  log_dir=log_dir,
700
711
  log_format=log_format,
701
712
  limit=limit,
713
+ sample_id=sample_id,
702
714
  epochs=epochs,
703
715
  fail_on_error=fail_on_error,
704
716
  debug_errors=debug_errors,
@@ -65,6 +65,7 @@ def eval_set(
65
65
  log_level_transcript: str | None = None,
66
66
  log_format: Literal["eval", "json"] | None = None,
67
67
  limit: int | tuple[int, int] | None = None,
68
+ sample_id: str | int | list[str | int] | None = None,
68
69
  epochs: int | Epochs | None = None,
69
70
  fail_on_error: bool | float | None = None,
70
71
  debug_errors: bool | None = None,
@@ -125,6 +126,7 @@ def eval_set(
125
126
  log files (defaults to "eval", the native high-performance format).
126
127
  limit (int | tuple[int, int] | None): Limit evaluated samples
127
128
  (defaults to all samples).
129
+ sample_id (str | int | list[str | int] | None): Evaluate specific sample(s) from the dataset.
128
130
  epochs (int | Epochs | None): Epochs to repeat samples for and optional score
129
131
  reducer function(s) used to combine sample scores (defaults to "mean")
130
132
  fail_on_error (bool | float | None): `True` to fail on first sample error
@@ -181,6 +183,7 @@ def eval_set(
181
183
  log_dir=log_dir,
182
184
  log_format=log_format,
183
185
  limit=limit,
186
+ sample_id=sample_id,
184
187
  epochs=epochs,
185
188
  fail_on_error=fail_on_error,
186
189
  debug_errors=debug_errors,
@@ -83,7 +83,12 @@ class TaskLogger:
83
83
  # ensure that the dataset has sample ids and record them
84
84
  sample_ids = cast(
85
85
  list[int | str],
86
- [sample.id for sample in slice_dataset(dataset, eval_config.limit)],
86
+ [
87
+ sample.id
88
+ for sample in slice_dataset(
89
+ dataset, eval_config.limit, eval_config.sample_id
90
+ )
91
+ ],
87
92
  )
88
93
 
89
94
  # create eval spec
@@ -162,6 +162,7 @@ async def task_run(options: TaskRunOptions) -> EvalLog:
162
162
  dataset=task.dataset,
163
163
  model_name=model_name,
164
164
  limit=config.limit,
165
+ sample_id=config.sample_id,
165
166
  epochs=epochs,
166
167
  log_images=log_images,
167
168
  message_limit=config.message_limit,
@@ -748,13 +749,14 @@ async def resolve_dataset(
748
749
  dataset: Dataset,
749
750
  model_name: ModelName,
750
751
  limit: int | tuple[int, int] | None,
752
+ sample_id: str | int | list[str | int] | None,
751
753
  epochs: int,
752
754
  log_images: bool,
753
755
  message_limit: int | None,
754
756
  token_limit: int | None,
755
757
  ) -> tuple[Dataset, list[Sample], list[TaskState]]:
756
- # apply limit to dataset
757
- dataset = slice_dataset(dataset, limit)
758
+ # slice dataset
759
+ dataset = slice_dataset(dataset, limit, sample_id)
758
760
 
759
761
  # apply epochs (deepcopy the samples so they remain independent)
760
762
  samples: list[Sample] = []
@@ -39,10 +39,19 @@ def task_file(task: Task, relative: bool = False) -> str | None:
39
39
  def slice_dataset(
40
40
  dataset: Dataset,
41
41
  limit: int | tuple[int, int] | None,
42
+ sample_id: str | int | list[str | int] | None,
42
43
  ) -> Dataset:
43
- dataset_limit = (
44
- slice(0, len(dataset))
45
- if limit is None
46
- else (slice(*limit) if isinstance(limit, tuple) else slice(0, limit))
47
- )
48
- return dataset[dataset_limit]
44
+ def normalise(id: str | int | None) -> str:
45
+ return id if isinstance(id, str) else str(id).zfill(20)
46
+
47
+ if sample_id is not None:
48
+ sample_id = sample_id if isinstance(sample_id, list) else [sample_id]
49
+ sample_id = [normalise(id) for id in sample_id]
50
+ return dataset.filter(lambda sample: normalise(sample.id) in sample_id)
51
+ else:
52
+ dataset_limit = (
53
+ slice(0, len(dataset))
54
+ if limit is None
55
+ else (slice(*limit) if isinstance(limit, tuple) else slice(0, limit))
56
+ )
57
+ return dataset[dataset_limit]
@@ -1,5 +1,6 @@
1
1
  import os
2
2
  from logging import (
3
+ DEBUG,
3
4
  INFO,
4
5
  WARNING,
5
6
  FileHandler,
@@ -129,7 +130,7 @@ def init_logger(
129
130
  # init logging handler on demand
130
131
  global _logHandler
131
132
  if not _logHandler:
132
- _logHandler = LogHandler(min(HTTP, levelno), transcript_levelno)
133
+ _logHandler = LogHandler(min(DEBUG, levelno), transcript_levelno)
133
134
  getLogger().addHandler(_logHandler)
134
135
 
135
136
  # establish default capture level
@@ -139,6 +140,7 @@ def init_logger(
139
140
  getLogger().setLevel(capture_level)
140
141
  getLogger(PKG_NAME).setLevel(capture_level)
141
142
  getLogger("httpx").setLevel(capture_level)
143
+ getLogger("botocore").setLevel(DEBUG)
142
144
 
143
145
  # set the levelno on the global handler
144
146
  _logHandler.display_level = levelno
@@ -154,7 +156,13 @@ def notify_logger_record(record: LogRecord, write: bool) -> None:
154
156
  if write:
155
157
  transcript()._event(LoggerEvent(message=LoggingMessage.from_log_record(record)))
156
158
  global _rate_limit_count
157
- if record.levelno <= INFO and "429" in record.getMessage():
159
+ if (record.levelno <= INFO and "429" in record.getMessage()) or (
160
+ record.levelno == DEBUG
161
+ # See https://boto3.amazonaws.com/v1/documentation/api/latest/guide/retries.html#validating-retry-attempts
162
+ # for boto retry logic / log messages (this is tracking standard or adapative retries)
163
+ and "botocore.retries.standard" in record.name
164
+ and "Retry needed, retrying request after delay of:" in record.getMessage()
165
+ ):
158
166
  _rate_limit_count = _rate_limit_count + 1
159
167
 
160
168
 
@@ -7,3 +7,10 @@ def parse_samples_limit(limit: str | None) -> int | tuple[int, int] | None:
7
7
  return (limit_split[0] - 1, limit_split[1])
8
8
  else:
9
9
  return None
10
+
11
+
12
+ def parse_sample_id(sample_id: str | None) -> list[str] | None:
13
+ if sample_id is not None:
14
+ return [id.strip() for id in sample_id.split(",")]
15
+ else:
16
+ return None
@@ -1,4 +1,5 @@
1
1
  import html
2
+ from typing import Any
2
3
 
3
4
  from rich.align import AlignMethod
4
5
  from rich.box import ROUNDED, Box
@@ -8,6 +9,8 @@ from rich.panel import Panel
8
9
  from rich.rule import Rule
9
10
  from rich.text import Text
10
11
 
12
+ from .format import format_function_call
13
+
11
14
 
12
15
  def transcript_code_theme() -> str:
13
16
  return "github-dark"
@@ -81,6 +84,11 @@ def transcript_separator(title: str, color: str) -> RenderableType:
81
84
  return Rule(title=title, style=f"{color} bold", align="center", end="\n\n")
82
85
 
83
86
 
87
+ def transcript_function(function: str, arguments: dict[str, Any]) -> RenderableType:
88
+ call = format_function_call(function, arguments)
89
+ return transcript_markdown("```python\n" + call + "\n```\n")
90
+
91
+
84
92
  LINE = Box(" ── \n" " \n" " \n" " \n" " \n" " \n" " \n" " \n")
85
93
 
86
94
  DOTTED = Box(" ·· \n" " \n" " \n" " \n" " \n" " \n" " \n" " \n")
@@ -20225,7 +20225,7 @@ const metadataViewsForSample = (id, sample) => {
20225
20225
  }
20226
20226
  return sampleMetadatas;
20227
20227
  };
20228
- const SampleSummary = ({ id, sample, style, sampleDescriptor }) => {
20228
+ const SampleSummary = ({ parent_id, sample, style, sampleDescriptor }) => {
20229
20229
  const input = (sampleDescriptor == null ? void 0 : sampleDescriptor.messageShape.normalized.input) > 0 ? Math.max(0.15, sampleDescriptor.messageShape.normalized.input) : 0;
20230
20230
  const target = (sampleDescriptor == null ? void 0 : sampleDescriptor.messageShape.normalized.target) > 0 ? Math.max(0.15, sampleDescriptor.messageShape.normalized.target) : 0;
20231
20231
  const answer = (sampleDescriptor == null ? void 0 : sampleDescriptor.messageShape.normalized.answer) > 0 ? Math.max(0.15, sampleDescriptor.messageShape.normalized.answer) : 0;
@@ -20246,7 +20246,7 @@ const SampleSummary = ({ id, sample, style, sampleDescriptor }) => {
20246
20246
  const columns = [];
20247
20247
  columns.push({
20248
20248
  label: "Id",
20249
- value: id,
20249
+ value: sample.id,
20250
20250
  size: `${idSize}em`
20251
20251
  });
20252
20252
  columns.push({
@@ -20299,7 +20299,7 @@ const SampleSummary = ({ id, sample, style, sampleDescriptor }) => {
20299
20299
  });
20300
20300
  return m$1`
20301
20301
  <div
20302
- id=${`sample-heading-${id}`}
20302
+ id=${`sample-heading-${parent_id}`}
20303
20303
  style=${{
20304
20304
  display: "grid",
20305
20305
  gridTemplateColumns: `${columns.map((col) => {
@@ -25449,22 +25449,35 @@ const createsSamplesDescriptor = (scorers, samples, epochs, selectedScore) => {
25449
25449
  if (!sample || !sample.scores) {
25450
25450
  return [];
25451
25451
  }
25452
- scorers.map((score2) => {
25452
+ const scoreNames = scorers.map((score2) => {
25453
25453
  return score2.name;
25454
25454
  });
25455
25455
  const sampleScorer = sample.scores[scorer];
25456
25456
  const scoreVal = sampleScorer.value;
25457
25457
  if (typeof scoreVal === "object") {
25458
25458
  const names = Object.keys(scoreVal);
25459
- const scores = names.map((name) => {
25460
- return {
25461
- name,
25462
- rendered: () => {
25463
- return scoreDescriptor.render(scoreVal[name]);
25459
+ if (names.find((name) => {
25460
+ return scoreNames.includes(name);
25461
+ })) {
25462
+ const scores = names.map((name) => {
25463
+ return {
25464
+ name,
25465
+ rendered: () => {
25466
+ return scoreDescriptor.render(scoreVal[name]);
25467
+ }
25468
+ };
25469
+ });
25470
+ return scores;
25471
+ } else {
25472
+ return [
25473
+ {
25474
+ name: scorer,
25475
+ rendered: () => {
25476
+ return scoreDescriptor.render(scoreVal);
25477
+ }
25464
25478
  }
25465
- };
25466
- });
25467
- return scores;
25479
+ ];
25480
+ }
25468
25481
  } else {
25469
25482
  return [
25470
25483
  {
@@ -350,7 +350,17 @@ const metadataViewsForSample = (id, sample) => {
350
350
  return sampleMetadatas;
351
351
  };
352
352
 
353
- const SampleSummary = ({ id, sample, style, sampleDescriptor }) => {
353
+ /**
354
+ * Component to display a sample with relevant context and visibility control.
355
+ *
356
+ * @param {Object} props - The properties passed to the component.
357
+ * @param {string} props.parent_id - The id of the parent component
358
+ * @param {import("../types/log").EvalSample} [props.sample] - the sample
359
+ * @param {Object} [props.style] - Inline styles for the table element.
360
+ * @param {import("../samples/SamplesDescriptor.mjs").SamplesDescriptor} props.sampleDescriptor - the sample descriptor
361
+ * @returns {import("preact").JSX.Element} The SampleSummary component.
362
+ */
363
+ const SampleSummary = ({ parent_id, sample, style, sampleDescriptor }) => {
354
364
  const input =
355
365
  sampleDescriptor?.messageShape.normalized.input > 0
356
366
  ? Math.max(0.15, sampleDescriptor.messageShape.normalized.input)
@@ -386,7 +396,7 @@ const SampleSummary = ({ id, sample, style, sampleDescriptor }) => {
386
396
  const columns = [];
387
397
  columns.push({
388
398
  label: "Id",
389
- value: id,
399
+ value: sample.id,
390
400
  size: `${idSize}em`,
391
401
  });
392
402
 
@@ -412,7 +422,8 @@ const SampleSummary = ({ id, sample, style, sampleDescriptor }) => {
412
422
 
413
423
  const fullAnswer =
414
424
  sample && sampleDescriptor
415
- ? sampleDescriptor.selectedScorer(sample).answer()
425
+ ? // @ts-ignore
426
+ sampleDescriptor.selectedScorer(sample).answer()
416
427
  : undefined;
417
428
  if (fullAnswer) {
418
429
  columns.push({
@@ -445,14 +456,16 @@ const SampleSummary = ({ id, sample, style, sampleDescriptor }) => {
445
456
  message=${sample.error.message}
446
457
  style=${{ marginTop: "0.4rem" }}
447
458
  />`
448
- : sampleDescriptor?.selectedScore(sample).render(),
459
+ : // TODO: Cleanup once the PR lands which makes sample / sample summary share common interface
460
+ // @ts-ignore
461
+ sampleDescriptor?.selectedScore(sample).render(),
449
462
  size: "minmax(2em, auto)",
450
463
  center: true,
451
464
  });
452
465
 
453
466
  return html`
454
467
  <div
455
- id=${`sample-heading-${id}`}
468
+ id=${`sample-heading-${parent_id}`}
456
469
  style=${{
457
470
  display: "grid",
458
471
  gridTemplateColumns: `${columns
inspect_ai/log/_log.py CHANGED
@@ -37,6 +37,9 @@ class EvalConfig(BaseModel):
37
37
  limit: int | tuple[int, int] | None = Field(default=None)
38
38
  """Sample limit (number of samples or range of samples)."""
39
39
 
40
+ sample_id: str | int | list[str | int] | None = Field(default=None)
41
+ """Evaluate specific sample(s)."""
42
+
40
43
  epochs: int | None = Field(default=None)
41
44
  """Number of epochs to run samples over."""
42
45
 
@@ -68,10 +68,6 @@ async def call_tools(
68
68
  # create a transcript for this call
69
69
  init_transcript(Transcript(name=call.function))
70
70
 
71
- # Amend the tool call with a custom view
72
- view = tool_call_view(call, tdefs)
73
- call.view = view
74
-
75
71
  result: Any = ""
76
72
  tool_error: ToolCallError | None = None
77
73
  try:
@@ -142,7 +138,7 @@ async def call_tools(
142
138
  arguments=call.arguments,
143
139
  result=content,
144
140
  truncated=truncated,
145
- view=view,
141
+ view=call.view,
146
142
  error=tool_error,
147
143
  events=list(transcript().events),
148
144
  )
@@ -163,7 +159,7 @@ async def call_tools(
163
159
  id=call.id,
164
160
  function=call.function,
165
161
  arguments=call.arguments,
166
- view=tool_call_view(call, tdefs),
162
+ view=call.view,
167
163
  pending=True,
168
164
  )
169
165
  transcript()._event(event)
@@ -31,11 +31,11 @@ from inspect_ai._util.registry import (
31
31
  )
32
32
  from inspect_ai._util.retry import log_rate_limit_retry
33
33
  from inspect_ai.tool import Tool, ToolChoice, ToolFunction, ToolInfo
34
- from inspect_ai.tool._tool_def import ToolDef
34
+ from inspect_ai.tool._tool_def import ToolDef, tool_defs
35
35
  from inspect_ai.util import concurrency
36
36
 
37
37
  from ._cache import CacheEntry, CachePolicy, cache_fetch, cache_store
38
- from ._call_tools import disable_parallel_tools, tools_info
38
+ from ._call_tools import disable_parallel_tools, tool_call_view, tools_info
39
39
  from ._chat_message import (
40
40
  ChatMessage,
41
41
  ChatMessageAssistant,
@@ -248,7 +248,7 @@ class Model:
248
248
  async with self._connection_concurrency(config):
249
249
  return await self._generate(
250
250
  input=input,
251
- tools=tools_info(tools),
251
+ tools=tools,
252
252
  tool_choice=tool_choice,
253
253
  config=config,
254
254
  cache=cache,
@@ -257,7 +257,10 @@ class Model:
257
257
  async def _generate(
258
258
  self,
259
259
  input: list[ChatMessage],
260
- tools: list[ToolInfo],
260
+ tools: list[Tool]
261
+ | list[ToolDef]
262
+ | list[ToolInfo]
263
+ | list[Tool | ToolDef | ToolInfo],
261
264
  tool_choice: ToolChoice | None,
262
265
  config: GenerateConfig,
263
266
  cache: bool | CachePolicy = False,
@@ -265,6 +268,12 @@ class Model:
265
268
  # default to 'auto' for tool_choice (same as underlying model apis)
266
269
  tool_choice = tool_choice if tool_choice else "auto"
267
270
 
271
+ # extract tool defs if we can
272
+ tdefs = tool_defs([tool for tool in tools if not isinstance(tool, ToolInfo)])
273
+
274
+ # resolve all tools into tool_info
275
+ tools = tools_info(tools)
276
+
268
277
  # if we have a specific tool selected then filter out the others
269
278
  if isinstance(tool_choice, ToolFunction):
270
279
  tools = [tool for tool in tools if tool.name == tool_choice.name]
@@ -374,6 +383,11 @@ class Model:
374
383
  # update output with time elapsed
375
384
  output.time = time_elapsed
376
385
 
386
+ # add views to tool calls
387
+ for choice in output.choices:
388
+ for tool_call in choice.message.tool_calls or []:
389
+ tool_call.view = tool_call_view(tool_call, tdefs)
390
+
377
391
  # complete the transcript event
378
392
  complete(output, call)
379
393
 
@@ -89,6 +89,19 @@ class AzureAIAPI(ModelAPI):
89
89
  config=config,
90
90
  )
91
91
 
92
+ # collect known model_args (then delete them so we can pass the rest on)
93
+ def collect_model_arg(name: str) -> Any | None:
94
+ nonlocal model_args
95
+ value = model_args.get(name, None)
96
+ if value:
97
+ model_args.pop(name)
98
+ return value
99
+
100
+ emulate_tools = collect_model_arg("emulate_tools")
101
+ self.emulate_tools = (
102
+ not not emulate_tools if emulate_tools is not None else None
103
+ )
104
+
92
105
  # resolve api_key
93
106
  if not self.api_key:
94
107
  self.api_key = os.environ.get(
@@ -118,8 +131,15 @@ class AzureAIAPI(ModelAPI):
118
131
  tool_choice: ToolChoice,
119
132
  config: GenerateConfig,
120
133
  ) -> ModelOutput | tuple[ModelOutput, ModelCall]:
121
- # if its llama then do fake tool calls
122
- handler: ChatAPIHandler | None = Llama31Handler() if self.is_llama() else None
134
+ # emulate tools (auto for llama, opt-in for others)
135
+ if self.emulate_tools is None and self.is_llama():
136
+ handler: ChatAPIHandler | None = Llama31Handler()
137
+ elif self.emulate_tools:
138
+ handler = Llama31Handler()
139
+ else:
140
+ handler = None
141
+
142
+ # resolve input
123
143
  if handler:
124
144
  input = handler.input_with_tools(input, tools)
125
145
 
@@ -1,8 +1,7 @@
1
1
  from rich.console import RenderableType
2
2
 
3
- from inspect_ai._util.format import format_function_call
4
- from inspect_ai._util.transcript import transcript_markdown
5
3
  from inspect_ai.tool._tool_call import ToolCall
4
+ from inspect_ai.tool._tool_transcript import transcript_tool_call
6
5
 
7
6
  from ._chat_message import ChatMessage, ChatMessageAssistant, ChatMessageTool
8
7
 
@@ -17,8 +16,10 @@ def messages_preceding_assistant(messages: list[ChatMessage]) -> list[ChatMessag
17
16
  return list(reversed(preceding))
18
17
 
19
18
 
20
- def render_tool_calls(tool_calls: list[ToolCall]) -> RenderableType:
21
- formatted_calls: list[str] = []
19
+ def render_tool_calls(tool_calls: list[ToolCall]) -> list[RenderableType]:
20
+ formatted_calls: list[RenderableType] = []
21
+
22
22
  for call in tool_calls:
23
- formatted_calls.append(format_function_call(call.function, call.arguments))
24
- return transcript_markdown("```python\n" + "\n\n".join(formatted_calls) + "\n```\n")
23
+ formatted_calls.extend(transcript_tool_call(call))
24
+
25
+ return formatted_calls
@@ -42,7 +42,7 @@ def trace_assistant_message(
42
42
  # print tool calls
43
43
  if message.tool_calls:
44
44
  content.append(Text())
45
- content.append(render_tool_calls(message.tool_calls))
45
+ content.extend(render_tool_calls(message.tool_calls))
46
46
 
47
47
  # print the assistant message
48
48
  trace_panel(title="Assistant", content=content)
@@ -54,6 +54,7 @@ def basic_agent(
54
54
  max_attempts: int = 1,
55
55
  message_limit: int | None = None,
56
56
  token_limit: int | None = None,
57
+ max_tool_output: int | None = None,
57
58
  score_value: ValueToFloat | None = None,
58
59
  incorrect_message: str
59
60
  | Callable[[TaskState, list[Score]], str] = DEFAULT_INCORRECT_MESSAGE,
@@ -87,6 +88,8 @@ def basic_agent(
87
88
  If not specified, will use limit_messages defined for the task. If there is none
88
89
  defined for the task, 50 will be used as a default.
89
90
  token_limit (int | None): Limit on tokens used in sample before terminating agent.
91
+ max_tool_output (int | None): Maximum output length (in bytes).
92
+ Defaults to max_tool_output from active GenerateConfig.
90
93
  score_value (ValueToFloat): Function used to extract float from scores (defaults
91
94
  to standard value_to_float())
92
95
  incorrect_message (str | Callable[[TaskState, list[Score]], str]): User message reply for an
@@ -182,7 +185,9 @@ def basic_agent(
182
185
  # resolve tools calls (if any)
183
186
  if state.output.message.tool_calls:
184
187
  # call tool functions
185
- tool_results = await call_tools(state.output.message, state.tools)
188
+ tool_results = await call_tools(
189
+ state.output.message, state.tools, max_output=max_tool_output
190
+ )
186
191
  state.messages.extend(tool_results)
187
192
 
188
193
  # was an answer submitted?
@@ -194,11 +199,13 @@ def basic_agent(
194
199
  # exit if we are at max_attempts
195
200
  attempts += 1
196
201
  if attempts >= max_attempts:
202
+ state.completed = True
197
203
  break
198
204
 
199
205
  # exit if the submission is successful
200
206
  answer_scores = await score(state)
201
207
  if score_value_fn(answer_scores[0].value) == 1.0:
208
+ state.completed = True
202
209
  break
203
210
 
204
211
  # otherwise notify the model that it was incorrect and continue
@@ -0,0 +1,28 @@
1
+ from pydantic import JsonValue
2
+ from rich.console import RenderableType
3
+ from rich.text import Text
4
+ from typing_extensions import Protocol
5
+
6
+ from inspect_ai._util.transcript import transcript_function, transcript_markdown
7
+
8
+ from ._tool_call import ToolCallContent
9
+
10
+
11
+ class TranscriptToolCall(Protocol):
12
+ function: str
13
+ arguments: dict[str, JsonValue]
14
+ view: ToolCallContent | None
15
+
16
+
17
+ def transcript_tool_call(call: TranscriptToolCall) -> list[RenderableType]:
18
+ content: list[RenderableType] = []
19
+ if call.view:
20
+ if call.view.title:
21
+ content.append(Text.from_markup(f"[bold]{call.view.title}[/bold]\n"))
22
+ if call.view.format == "markdown":
23
+ content.append(transcript_markdown(call.view.content))
24
+ else:
25
+ content.append(call.view.content)
26
+ else:
27
+ content.append(transcript_function(call.function, call.arguments))
28
+ return content
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: inspect_ai
3
- Version: 0.3.51
3
+ Version: 0.3.52
4
4
  Summary: Framework for large language model evaluations
5
5
  Author: UK AI Safety Institute
6
6
  License: MIT License
@@ -68,7 +68,7 @@ Requires-Dist: pytest-asyncio; extra == "dev"
68
68
  Requires-Dist: pytest-cov; extra == "dev"
69
69
  Requires-Dist: pytest-dotenv; extra == "dev"
70
70
  Requires-Dist: pytest-xdist; extra == "dev"
71
- Requires-Dist: ruff==0.8.2; extra == "dev"
71
+ Requires-Dist: ruff==0.8.3; extra == "dev"
72
72
  Requires-Dist: textual-dev>=0.86.2; extra == "dev"
73
73
  Requires-Dist: types-PyYAML; extra == "dev"
74
74
  Requires-Dist: types-aiofiles; extra == "dev"
@@ -3,7 +3,7 @@ inspect_ai/__main__.py,sha256=oWX4YwDZDg3GS3-IG0yPGoSEOfSzWihELg7QmrUlxjM,67
3
3
  inspect_ai/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
4
  inspect_ai/_cli/cache.py,sha256=nOX9ysB3rZ-V8b_ryTpkgpoAynNlq4Op_fIqAIl4EVg,3910
5
5
  inspect_ai/_cli/common.py,sha256=f6OGE63OoN5Y4tk0i8pllfSdmhGlf8teZY8NCvWexGY,3573
6
- inspect_ai/_cli/eval.py,sha256=N0n2WZSUyoKhi-1sahmN5xP25lmjrjziPslU04lM16Y,29503
6
+ inspect_ai/_cli/eval.py,sha256=5rxtpJw-px4ugIJDaZOJCrSWNGc9Hs9DgDyvhJLv0Wo,29940
7
7
  inspect_ai/_cli/info.py,sha256=d5W7VA5buinGcsdQjWqlsMM6iSNNGRUHQrr4JS2k8nk,1749
8
8
  inspect_ai/_cli/list.py,sha256=GreVEhJRpagiCpzVc3FSGhcdpTq9B8Jh--mfgs4ueFQ,2454
9
9
  inspect_ai/_cli/log.py,sha256=boSzHZkiabhnYWHLRVsZVENCryG-MDaNHIIbpSp0Mcs,5729
@@ -14,11 +14,11 @@ inspect_ai/_cli/util.py,sha256=rOyKR5p08-04IwJdcjakNXD1Gm-dGFtzaTTx7hyArPE,1402
14
14
  inspect_ai/_cli/view.py,sha256=9UstZ5a1upMZlVKfQCK8L_HvTR1WgLpBp3zpSgTyrLo,2786
15
15
  inspect_ai/_display/__init__.py,sha256=t9Xj8FbxvdBNsalnr16U0r3jSTFX9w4yXcUJwb06_6k,405
16
16
  inspect_ai/_display/core/active.py,sha256=ZXH9KzuXr9ViAGcAZRWtegbO_YqApvgBjr_7TGAxDFQ,1219
17
- inspect_ai/_display/core/config.py,sha256=1wdzgzFmMsPOtk6_AVH3xY4qp5y-44NexTSxxHWbX8c,1458
17
+ inspect_ai/_display/core/config.py,sha256=zvF8jW3NhDGhVSCAqjXsblNjvKGUtMUdn2YMor3cFC4,1664
18
18
  inspect_ai/_display/core/display.py,sha256=3Tu6XO5v1oMhG9ZgmMUz11PncwmO_bK5W2q67IhQkl0,3064
19
19
  inspect_ai/_display/core/footer.py,sha256=2e5kimg8drlJgdhHYuHUiP-B8U6LtYLyORWB4a1kXx8,805
20
20
  inspect_ai/_display/core/group.py,sha256=z8CIwQ-8Mm9adQ8JDuMjw94ih9GfymU5s-1qnbKoEPs,2871
21
- inspect_ai/_display/core/panel.py,sha256=dGwhkYl7JOXsug-Spd0oLaOBNZyXJEq1TTnaWpq6fQA,3753
21
+ inspect_ai/_display/core/panel.py,sha256=G2sb1uayFmFb0qiU37OziRXaxZF2t3APXadPbQcsZqY,3754
22
22
  inspect_ai/_display/core/progress.py,sha256=lqpMSjpvVwewqcLH5U5PzgCjFHniiZfjz30WXFpThR4,4176
23
23
  inspect_ai/_display/core/results.py,sha256=slA7cRUmGCenofc7rcfCx8dcwx5I6QRERbmJl2W0P8s,7192
24
24
  inspect_ai/_display/core/rich.py,sha256=ve4y1O_P9E1py6s4jGq1lHa7t6IlF5Brgue2-xikJOg,2729
@@ -33,14 +33,14 @@ inspect_ai/_display/textual/widgets/clock.py,sha256=8NijI2cqgTY1DJF8-nvK_QUJXxcn
33
33
  inspect_ai/_display/textual/widgets/console.py,sha256=lp5lbT9erPjxE1NWzvuJ5Bj8mN2ZZSBTgKQWHinMKgA,1590
34
34
  inspect_ai/_display/textual/widgets/footer.py,sha256=_4Dlzp9kriUVELBEk6HQzgiwLigHe5mH0dZqWr6sGi4,1078
35
35
  inspect_ai/_display/textual/widgets/samples.py,sha256=KPL2UMnKQe2zpFdVnTGOaIgebKLEdSXeZ59j2Yn1js0,14767
36
- inspect_ai/_display/textual/widgets/task_detail.py,sha256=vF16J-AOaTAJ8hZB-_d6DbIF66ynA2E9bH8quIrMIao,7301
36
+ inspect_ai/_display/textual/widgets/task_detail.py,sha256=AC_GxBxphTO6b7XiWA9Xf6ZoJH4ndMCJ4PxahfKr4QQ,8237
37
37
  inspect_ai/_display/textual/widgets/tasks.py,sha256=vtPEP93rSlUdfJGo4pIkX3rtdrjhc_AG3qkBBRhdAVs,8018
38
38
  inspect_ai/_display/textual/widgets/titlebar.py,sha256=Gh_vnsco_1lStPb34TXM9MZJffjy83-1ekoRzUQF_6w,2144
39
39
  inspect_ai/_display/textual/widgets/toggle.py,sha256=ToYs-S4n90yuxWcAW2OTg6AbRf0GhSz61XxfhE6XZ3Y,895
40
- inspect_ai/_display/textual/widgets/transcript.py,sha256=qYbs_gQwQ64-yPkybWcFpjDqufkicDMejOK_oKn09Ww,11561
40
+ inspect_ai/_display/textual/widgets/transcript.py,sha256=rRTM9aSNHau89mSmlH6Ni9qIckYS9_Han0RtuO6QZmk,10999
41
41
  inspect_ai/_eval/context.py,sha256=YdbaFJfzYEa-iF1fP8BEpB3l6ZnlOJZ8ntPfIrhdacQ,1080
42
- inspect_ai/_eval/eval.py,sha256=DPqjOT0cbMPnl1a93QMZxhM8-z324HlRsz2gc-xjh8o,34422
43
- inspect_ai/_eval/evalset.py,sha256=qScAjunjDVn3EJD8Wox84c6NGXn1WbVu3MxmQIN89es,26646
42
+ inspect_ai/_eval/eval.py,sha256=SbgF53XwvzzuP-qRKcow4rMahKknzcgvo6x1KGz-dYM,35076
43
+ inspect_ai/_eval/evalset.py,sha256=G7nuwRkHfraKaKKmOst6oCMxjAtNO41pEGd6NSqk19g,26839
44
44
  inspect_ai/_eval/list.py,sha256=VbZ-2EI6MqrXvCN7VTz21TQSoU5K5_Q0hqhxmj5A_m0,3744
45
45
  inspect_ai/_eval/loader.py,sha256=d_7PcajwAF8sG-sMeYOpkrSt5TcUz_tewgWocIIIlKQ,16267
46
46
  inspect_ai/_eval/registry.py,sha256=j-HBhwgg-3GsOqEZHmtALZnBqgJLSQWuEzwe0hsfug4,5422
@@ -52,13 +52,13 @@ inspect_ai/_eval/task/epochs.py,sha256=Ci7T6CQniSOTChv5Im2dCdSDrP-5hq19rV6iJ2uBc
52
52
  inspect_ai/_eval/task/error.py,sha256=gJnd8X7LHpPz5zcOq_az6ONZICGJ0_VpSz9yhF0yRyY,1233
53
53
  inspect_ai/_eval/task/generate.py,sha256=Edm-_6Wp1mkb7XpGkfTAqobWPfjJeWB0sb8-76UjNrc,1999
54
54
  inspect_ai/_eval/task/images.py,sha256=gnhOK97qFBnZYFisv1lgD4hLTaK83cHcJ5D9zA-NoLE,3524
55
- inspect_ai/_eval/task/log.py,sha256=kJI7XyyrkGBU9qfCacNwSM7LExaqMxgk28qig12AHUs,5961
55
+ inspect_ai/_eval/task/log.py,sha256=wxA7rc-YGY7eri_IgJXIBPo4i6hy6j7-8FJlEO0TxRg,6068
56
56
  inspect_ai/_eval/task/results.py,sha256=eRT_nBEZrUEOA4tw0n2Zy04K9g_OyihXoXuyR1sODUk,13724
57
- inspect_ai/_eval/task/run.py,sha256=bnu6o5QemLsRtsJdsOQAWDAsqo5GWhXI7p-SOPMyUSE,31085
57
+ inspect_ai/_eval/task/run.py,sha256=EpNrTru4u103aHRy63H9BBIOCWc5aFLn3hciaGDMBcM,31178
58
58
  inspect_ai/_eval/task/rundir.py,sha256=QXetLfqi1lRo-PcIuu7maQpVO57c2ebnsjfZk0lsAFk,2001
59
59
  inspect_ai/_eval/task/sandbox.py,sha256=k1QmGZAvObwQLCJmtmpSUqOODNHbBU3Kpevbn6q-4yM,4536
60
60
  inspect_ai/_eval/task/task.py,sha256=6TKyKrhjlTTZvG5bWAhqQM4gKO7x4yedrfhtnO_1ZW4,7813
61
- inspect_ai/_eval/task/util.py,sha256=qpgZnLWJrijfIZKo63KGkpUxV4r2Odel_muycOhtTcg,1321
61
+ inspect_ai/_eval/task/util.py,sha256=9OEE4k-jESyTOpOEcHk_4cvCZrd1sUJ_00fI6Ubt6o8,1763
62
62
  inspect_ai/_util/_async.py,sha256=K5lVPKwl25JkLkcXfb0m3aZ-RJ4O3fog5HQm5EqbjM4,981
63
63
  inspect_ai/_util/appdirs.py,sha256=lhURbDS9xT2YBzWOe0jjxsdK4ZdiVAv_WwXQC83V_jw,563
64
64
  inspect_ai/_util/config.py,sha256=nuWVZbShE8IPnotDfRJx0uBZJxwbV36M0qKVYsQDEEI,848
@@ -84,7 +84,7 @@ inspect_ai/_util/http.py,sha256=c4yvH48ZkETZ7sNDuNzBR0NUS4r-6WzCaolW9my13ns,3628
84
84
  inspect_ai/_util/images.py,sha256=pCaL_GwWGQsnawu4WmbubY7eHZzgqXvHrmK5-GJikn8,1245
85
85
  inspect_ai/_util/json.py,sha256=1mgVURVVPo34hQcGmX6u25hlbRdRY4hmNHXZGVwYaKc,3342
86
86
  inspect_ai/_util/list.py,sha256=6_5r5jI5RKK34kCmIqqVQ5hYG-G8v0F5H7L-DmQQ2E4,279
87
- inspect_ai/_util/logger.py,sha256=i5vl-ahfGk9pzXJPXljeCswqYLu6p9Hq2XJ64aYLfX0,5707
87
+ inspect_ai/_util/logger.py,sha256=erX7YLYwrtZ1Z1SI2sun70f40zn7EU38DlUKB5Sgt9E,6155
88
88
  inspect_ai/_util/notebook.py,sha256=Mgz3J4uBh-MqVBRmpiJqDHRpn2hd7HIOBeJBwLG-bbk,2998
89
89
  inspect_ai/_util/package.py,sha256=2ntItRYaYBaVWI5eDaB4FdpI1IUBiBWNRxq7FChvk1I,2729
90
90
  inspect_ai/_util/path.py,sha256=fLfKXOXuQstwVfl2i1Gzxix8TZLxLqtZIHqyqK3MnvY,3795
@@ -93,12 +93,12 @@ inspect_ai/_util/platform.py,sha256=knsijYYaif5rgkGuYRwr_p7DlsD1VD-yfbt08dKOrGM,
93
93
  inspect_ai/_util/registry.py,sha256=yoajCn16miznXpFiFNOT-1TTrS5ZuZbly5mt7T4UJbM,11689
94
94
  inspect_ai/_util/retry.py,sha256=fAupOVgGJ0ImnmtXmCHBKRa3AQC7jDA-Zf_zilSCRl0,1899
95
95
  inspect_ai/_util/rich.py,sha256=sNWEsGlGmkkZZLo4AcEv-_yJI1bI0HcpZVt7wNJFsXg,692
96
- inspect_ai/_util/samples.py,sha256=UfjZUnHwKnkIMVw14jkaEnNtSQxbU2AUFhEZfkckh_w,322
96
+ inspect_ai/_util/samples.py,sha256=uobAN2i1U-6YBxCBvaW6z1-xFufQIuFXHnnnK-oTDKc,507
97
97
  inspect_ai/_util/terminal.py,sha256=I4NDy7Ln5YSCzxbd0O9OPslEHQMBVKZfqJl3TOCegTg,4166
98
98
  inspect_ai/_util/text.py,sha256=1Q5tNqB-61yXXo_bQxzxJCEXCMU7aVs3qpXQir2xKu0,3174
99
99
  inspect_ai/_util/throttle.py,sha256=Yoz-qnGULR88QFrJfeo4TQgfQ9AHsVNVKqdDlxgSipc,663
100
100
  inspect_ai/_util/timeouts.py,sha256=-iC1LjpNqB6Hx-i36MfSrLy819RVhKNo4KXjZDuskZQ,5193
101
- inspect_ai/_util/transcript.py,sha256=SqGCfsH5mCk41MndwRK46BspBq5djcK3dTPx4INp23Q,2474
101
+ inspect_ai/_util/transcript.py,sha256=KG6Vz57fTYDinrscd-iT8K2JfYPZ7SJrpedrlLKK9Lo,2744
102
102
  inspect_ai/_util/url.py,sha256=Ci9I1T7hSRuA1CvgQmN9TWTSzUPO4ILMkd_8maZlO6k,622
103
103
  inspect_ai/_util/version.py,sha256=PRh1HSHD_EgRW_VYIPquOPj-gL9DM164ePQ9LL3d9lU,739
104
104
  inspect_ai/_view/notify.py,sha256=6lI42tz4Ui7ThvwsJgBamRcCczSTWGXGWtciP3M8VaA,1394
@@ -123,7 +123,7 @@ inspect_ai/_view/www/yarn.lock,sha256=UsPRVYxWj1teJWLag7l8znvHprucro96dR3xLkmGZw
123
123
  inspect_ai/_view/www/dist/index.html,sha256=ErXXXs-OUDOAZexWCmn_u-frxXROpg4kBt6Yvjj0a0U,998
124
124
  inspect_ai/_view/www/dist/assets/favicon.svg,sha256=b9AHYZaO2zBzeKH6G4PwXZMGGW_UxY0omKHam-c9MAs,1508
125
125
  inspect_ai/_view/www/dist/assets/index.css,sha256=KEUcz_Eijk8JxCiFDRpkc8X1fWm6xLq2gmkKXOXZDQk,813685
126
- inspect_ai/_view/www/dist/assets/index.js,sha256=OSaMvUDbT66h9MLMMesBdZgVnAvRuGGsjBVzhC_dpzc,923612
126
+ inspect_ai/_view/www/dist/assets/index.js,sha256=UuWrNHF1GZdpQ8M_5_HEY7wjsTzlVLeZy7O-rbTUXhM,924009
127
127
  inspect_ai/_view/www/src/App.mjs,sha256=7uCBYgClLj804nK_5Nt0imusq7PqYTuaW7WwxYlKDkc,28679
128
128
  inspect_ai/_view/www/src/Register.mjs,sha256=jpIqov7fyyLPW9Ouhc9KOy__o8s-P5h-VF9S1RDsIDU,116
129
129
  inspect_ai/_view/www/src/Types.mjs,sha256=-GHM7V0ememrEteUSvA1-sh7eucDLgVpOgJ5pEDYVfs,623
@@ -183,7 +183,7 @@ inspect_ai/_view/www/src/navbar/Navbar.mjs,sha256=y9WZXlvVk7aubkE5IhF190IuF7FG6q
183
183
  inspect_ai/_view/www/src/navbar/SecondaryBar.mjs,sha256=5Sn8jh_rdD0s1CT1qW3fsl-_MX-woYuDDSFDJWkVlmc,4424
184
184
  inspect_ai/_view/www/src/plan/PlanCard.mjs,sha256=2JGBPFiGTJmf9DR9IuwgcrACO_3mPFpap2mwokCzwTk,10326
185
185
  inspect_ai/_view/www/src/samples/SampleDialog.mjs,sha256=5cBb2UIRe7Z6BfFAJ4Q1djDJSZZdia-2E0rC6X2a5m0,3709
186
- inspect_ai/_view/www/src/samples/SampleDisplay.mjs,sha256=RoktTmo_lq4G7NGqE-UmlUW4fcvempMRBKE0WsURFHw,15031
186
+ inspect_ai/_view/www/src/samples/SampleDisplay.mjs,sha256=wSn0vPR5KGwywK7AKefy1zEDp2tZpSckRvrB8vIOBL4,15756
187
187
  inspect_ai/_view/www/src/samples/SampleError.mjs,sha256=48in4mqQpe2KxQK9__3rBfK5mEKIqPrIscWeCW1r_Hk,2674
188
188
  inspect_ai/_view/www/src/samples/SampleLimit.mjs,sha256=_IT5kKng8L3A26fHjG0vANAn2y73Nb6j5A_90osQz2E,627
189
189
  inspect_ai/_view/www/src/samples/SampleList.mjs,sha256=39dNnMlIay8qTFC_jBU4kz-VSDUrK8TXEd4RQ8BUaqs,11149
@@ -274,7 +274,7 @@ inspect_ai/log/_bundle.py,sha256=5Uy-s64_SFokZ7WRzti9mD7yoKrd2sOzdvqKyahoiC4,804
274
274
  inspect_ai/log/_condense.py,sha256=zWSvdq3bQ6Pe8yiOWHVj5Gc--he9oogQ3SI_RXJ5z3o,9568
275
275
  inspect_ai/log/_convert.py,sha256=62cwEY5zhz76GaTB5cwkRUgWQz5kehxQ82o7ItBcX2U,3461
276
276
  inspect_ai/log/_file.py,sha256=RqadF_vNbhgx7PxghVmg-1WiJkpz2vwfq3AJ1C_umOM,17685
277
- inspect_ai/log/_log.py,sha256=natE9pn0ykJ9cmXkug3y65Ra2-EdyrKSiQ99K3u80tg,18620
277
+ inspect_ai/log/_log.py,sha256=jmjS2k75i4iftPsYjjrUH6UBPeyKZMX97TgRqrfWckk,18732
278
278
  inspect_ai/log/_message.py,sha256=VCKvYcTOH6M9AvYykqx8D7fgXnvti0yDS7vY5To3zOI,1927
279
279
  inspect_ai/log/_retry.py,sha256=e7a2hjl3Ncl8b8sU7CsDpvK8DV0b1uSRLeokRX1mt34,2109
280
280
  inspect_ai/log/_samples.py,sha256=b-5-wT1rD5KQTfVqsYlrQPSabj9glmHM9SmZg9mDk-c,3814
@@ -287,18 +287,18 @@ inspect_ai/log/_recorders/json.py,sha256=edsglc71m-RUqddv5D_N4g24XsGBVOK6CoVXPZ_
287
287
  inspect_ai/log/_recorders/recorder.py,sha256=yvW_D99QxUQmnF5EiGsWIVetBXdssMUcsq5us9oRzx4,1551
288
288
  inspect_ai/model/__init__.py,sha256=mBs6hmdWu6KhTQCXFh_NHrCw9oQ9-mn6wDLTDqjguN8,2028
289
289
  inspect_ai/model/_cache.py,sha256=VlMQGPgFxJGPXktqWy_wMpA3iJWmvA8siSdmX71MEHQ,13517
290
- inspect_ai/model/_call_tools.py,sha256=ptYAPtlS4Wh6YbFHVVqx2B7u-gFciVNPWKjcdky74A8,13611
290
+ inspect_ai/model/_call_tools.py,sha256=IxykjNqUAxBpPSJvW_0GuJRJX8jvkjlu51T63sNIhHI,13468
291
291
  inspect_ai/model/_chat_message.py,sha256=Zc2hHMLbWONuhLx-XYPOekDD20mF4uTU62WRTsry3c4,4355
292
292
  inspect_ai/model/_generate_config.py,sha256=0bk3FKFrwgEBkjrdtwzHOPDBIJG9VIzTdgh2xrvTaZI,8360
293
293
  inspect_ai/model/_image.py,sha256=kpO2Bn_-c-dK80HuPOPH1eSNmcoc39kofwf4yTTiTFE,477
294
- inspect_ai/model/_model.py,sha256=ZUvOhHUYutJ5QwBrhOCYt58O52k8WHFXYBp17IjRvF0,28662
294
+ inspect_ai/model/_model.py,sha256=pI7nwBio0Kx4dYz5GSV1o5h31gSI-61dOBo5Zp-qAH8,29184
295
295
  inspect_ai/model/_model_call.py,sha256=r6ObHZwm7jy1qX2qxvAP6iPV3BhdGThL-VH-QEawQhA,2017
296
296
  inspect_ai/model/_model_output.py,sha256=iv0bk6U5jRnhqeULIMAVrC57j2Be-EjOhe7kvxAzCcg,6537
297
297
  inspect_ai/model/_registry.py,sha256=Cr2y32EqLnOqLbSWoXHVK4ivTTzCUhJuACxoTyPt8kY,2032
298
- inspect_ai/model/_render.py,sha256=aawzq3KZMXbM5xwJXQRgE3V3fRO0ymFQ8J4Abqvl0nc,921
299
- inspect_ai/model/_trace.py,sha256=uw_RraVxWPWVpBuUcXwAl9ZgLuw0RG1FwjWznGlhNfw,1601
298
+ inspect_ai/model/_render.py,sha256=bGtGdFFWvNdeBjs60Junk0I8xVnBOj4Oe3a-86BjPtc,802
299
+ inspect_ai/model/_trace.py,sha256=Zr4cZGICQO85L0tRqW5oehuvPFk1EO5NBUtaJtLKBwk,1601
300
300
  inspect_ai/model/_providers/anthropic.py,sha256=lgWQDhr4d7EjPJYwj021YRc8VZAtPRmTwzJthVyMqGg,20801
301
- inspect_ai/model/_providers/azureai.py,sha256=TBnavnhwyi-RUjWHQkyCDkdABeXKsK3nIRZ-i9U_SIY,13267
301
+ inspect_ai/model/_providers/azureai.py,sha256=ZTNmFd0PePFjJjGA5o3JYud5qdfJvrLDp3ah7XOkvp4,13927
302
302
  inspect_ai/model/_providers/bedrock.py,sha256=AVej7e6OoYGW-C2jR8H4jBn3aKwTj5RwPOMzVxO_2XE,22702
303
303
  inspect_ai/model/_providers/cloudflare.py,sha256=h6ubjf0kxyMM7Aj2tm68tWa-2R7RAXNGp1O6KMvi0Gw,4143
304
304
  inspect_ai/model/_providers/google.py,sha256=WA1eRIjMwJoOrnaPq1TiodIIGLBURPaBBRwQaOHM4VU,20281
@@ -342,7 +342,7 @@ inspect_ai/scorer/_reducer/reducer.py,sha256=dRFIW9_gi30i64g-AZWxDTHla7mJfqyF0dJ
342
342
  inspect_ai/scorer/_reducer/registry.py,sha256=J2tvuuxf4jBC09_SCBZg99Qb2qQUWG8STEsw7ASWpXQ,5388
343
343
  inspect_ai/scorer/_reducer/types.py,sha256=uimvzIBRK7x1Dof77gsHYe9PU3hekB1opm9DTAa4sL4,340
344
344
  inspect_ai/solver/__init__.py,sha256=LLemW4Szs7uJuJFjp0KLlVS1V7M-2_PpkYS_17GGtaE,3234
345
- inspect_ai/solver/_basic_agent.py,sha256=cQUzpYJmhEEa7mgLl8ZY3yHJFwLapLT3UuPacalRD78,9241
345
+ inspect_ai/solver/_basic_agent.py,sha256=vvC0v4g-7hLzGCsmb_I0xa_Z3RCLwp4ak7Qk-EVl7F8,9594
346
346
  inspect_ai/solver/_chain.py,sha256=F-2ZHE2KOlDAIgH1_Q23whUMH5TjYGvCHhcOgbRxe7I,2234
347
347
  inspect_ai/solver/_critique.py,sha256=ddO8J7VqSEsT6kofawpn3PrcUpLjLUMFmJi0hocDZpI,3504
348
348
  inspect_ai/solver/_fork.py,sha256=Ge1PwpCHjeZhm2CLAWKss2uFuQd9BGzVinLOW6UOnfE,2880
@@ -362,6 +362,7 @@ inspect_ai/tool/_tool_def.py,sha256=OQo6jhtDfrj2uDDKeoT4g8Ju1r8uroK6DqQkpyUikEA,
362
362
  inspect_ai/tool/_tool_description.py,sha256=SZTQzehReNNKwQ0iUL6v4pPfEptgf3UOP4J888JV18M,524
363
363
  inspect_ai/tool/_tool_info.py,sha256=zoAUkA99VbgSc5bLPGwkYRT5u8rzS9NjrrxHR24A214,7865
364
364
  inspect_ai/tool/_tool_params.py,sha256=oLYlxcyKtIbMxZh5yowSynNrBR5sWj4nrdel6pFSIbc,1158
365
+ inspect_ai/tool/_tool_transcript.py,sha256=rMibJoBN5Nn41RwInqk45h9RDPxZGu81saDf4SkpqTs,904
365
366
  inspect_ai/tool/_tool_with.py,sha256=iZYVhuZSL0Q9PFKz-ob_923E77WzuQ2U1Qo4DfdWuBo,1881
366
367
  inspect_ai/tool/_tools/_execute.py,sha256=DkFlvUTvI595H1zH5IKArhbyBo8YZWqq9tvoUMdvlaw,2823
367
368
  inspect_ai/tool/_tools/_web_search.py,sha256=YqZ3E65ssdq1X2NSH9Mqt5INXdPVQOdKa3PbKi7XjAY,7828
@@ -406,9 +407,9 @@ inspect_ai/util/_sandbox/docker/docker.py,sha256=eb8yJQbG-3ZWT2zvzhAd7l3ejrdxyv7
406
407
  inspect_ai/util/_sandbox/docker/internal.py,sha256=pwK3xl-fx_5FVmVmvautE8R7op3XCjgiA-1JqlRcHII,1311
407
408
  inspect_ai/util/_sandbox/docker/prereqs.py,sha256=LKWt2T5CJ8hZ25SFDGX9FTGXFAtHzvQoolOffjQKlm8,3341
408
409
  inspect_ai/util/_sandbox/docker/util.py,sha256=VyB9_Aab8vmgs6dJkAyt3oO_jejRPAppyaibZdetayc,2836
409
- inspect_ai-0.3.51.dist-info/LICENSE,sha256=aYPffOl9TwBXDQ8g33Jh6AsBhobb3A76qNm7r2HZsps,1079
410
- inspect_ai-0.3.51.dist-info/METADATA,sha256=gyhqUPXr7sIz01JUJ4O8Rcv0Mo6WPeUJydphTrsNfMk,4564
411
- inspect_ai-0.3.51.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
412
- inspect_ai-0.3.51.dist-info/entry_points.txt,sha256=WGGLmzTzDWLzYfiyovSY6oEKuf-gqzSDNOb5V-hk3fM,54
413
- inspect_ai-0.3.51.dist-info/top_level.txt,sha256=Tp3za30CHXJEKLk8xLe9qGsW4pBzJpEIOMHOHNCXiVo,11
414
- inspect_ai-0.3.51.dist-info/RECORD,,
410
+ inspect_ai-0.3.52.dist-info/LICENSE,sha256=aYPffOl9TwBXDQ8g33Jh6AsBhobb3A76qNm7r2HZsps,1079
411
+ inspect_ai-0.3.52.dist-info/METADATA,sha256=zUCf5WJGaR7f7rNipmHuDY7HKygcKAJICduIbX_Dyso,4564
412
+ inspect_ai-0.3.52.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
413
+ inspect_ai-0.3.52.dist-info/entry_points.txt,sha256=WGGLmzTzDWLzYfiyovSY6oEKuf-gqzSDNOb5V-hk3fM,54
414
+ inspect_ai-0.3.52.dist-info/top_level.txt,sha256=Tp3za30CHXJEKLk8xLe9qGsW4pBzJpEIOMHOHNCXiVo,11
415
+ inspect_ai-0.3.52.dist-info/RECORD,,