inspect-ai 0.3.76__py3-none-any.whl → 0.3.78__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
inspect_ai/_cli/eval.py CHANGED
@@ -115,6 +115,13 @@ def eval_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
         help="Tags to associate with this evaluation run.",
         envvar="INSPECT_EVAL_TAGS",
     )
+    @click.option(
+        "--metadata",
+        multiple=True,
+        type=str,
+        help="Metadata to associate with this evaluation run (more than one --metadata argument can be specified).",
+        envvar="INSPECT_EVAL_METADATA",
+    )
     @click.option(
         "--trace",
         type=bool,
@@ -449,6 +456,7 @@ def eval_command(
     s: tuple[str] | None,
     solver_config: str | None,
     tags: str | None,
+    metadata: tuple[str] | None,
     trace: bool | None,
     approval: str | None,
     sandbox: str | None,
@@ -525,6 +533,7 @@ def eval_command(
         s=s,
         solver_config=solver_config,
         tags=tags,
+        metadata=metadata,
         trace=trace,
         approval=approval,
         sandbox=sandbox,
@@ -616,6 +625,7 @@ def eval_set_command(
     s: tuple[str] | None,
     solver_config: str | None,
     tags: str | None,
+    metadata: tuple[str] | None,
     sandbox: str | None,
     no_sandbox_cleanup: bool | None,
     epochs: int | None,
@@ -695,6 +705,7 @@ def eval_set_command(
         s=s,
         solver_config=solver_config,
         tags=tags,
+        metadata=metadata,
         trace=trace,
         approval=approval,
         sandbox=sandbox,
@@ -749,6 +760,7 @@ def eval_exec(
     s: tuple[str] | None,
     solver_config: str | None,
     tags: str | None,
+    metadata: tuple[str] | None,
     trace: bool | None,
     approval: str | None,
     sandbox: str | None,
@@ -790,6 +802,9 @@ def eval_exec(
     # parse tags
     eval_tags = parse_comma_separated(tags)
 
+    # parse metadata
+    eval_metadata = parse_cli_args(metadata)
+
     # resolve epochs
     eval_epochs = (
         Epochs(epochs, create_reducers(parse_comma_separated(epochs_reducer)))
@@ -825,6 +840,7 @@ def eval_exec(
         task_args=task_args,
         solver=SolverSpec(solver, solver_args) if solver else None,
         tags=eval_tags,
+        metadata=eval_metadata,
         trace=trace,
         approval=approval,
         sandbox=parse_sandbox(sandbox),
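
The new --metadata option follows the KEY=VALUE style of the other repeated CLI arguments: each occurrence is collected into a tuple, parsed into a dict by parse_cli_args, and forwarded to the eval as run-level metadata. An illustrative invocation (task file, model, and keys are hypothetical):

    inspect eval ctf.py --model openai/gpt-4o \
        --metadata experiment=baseline --metadata seed=7

The INSPECT_EVAL_METADATA environment variable declared via envvar above provides the same option without command-line flags.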
@@ -131,9 +131,14 @@ def task_stats(stats: EvalStats) -> RenderableType:
         else:
             input_tokens = f"[bold]I: [/bold]{usage.input_tokens:,}"
 
+        if usage.reasoning_tokens is not None:
+            reasoning_tokens = f", [bold]R: [/bold]{usage.reasoning_tokens:,}"
+        else:
+            reasoning_tokens = ""
+
         table.add_row(
             Text(model, style="bold"),
-            f" {usage.total_tokens:,} tokens [{input_tokens}, [bold]O: [/bold]{usage.output_tokens:,}]",
+            f" {usage.total_tokens:,} tokens [{input_tokens}, [bold]O: [/bold]{usage.output_tokens:,}{reasoning_tokens}]",
             style=theme.light,
         )
 
inspect_ai/_eval/eval.py CHANGED
@@ -68,6 +68,7 @@ def eval(
     sandbox_cleanup: bool | None = None,
     solver: Solver | list[Solver] | SolverSpec | None = None,
     tags: list[str] | None = None,
+    metadata: dict[str, Any] | None = None,
     trace: bool | None = None,
     display: DisplayType | None = None,
     approval: str | list[ApprovalPolicy] | None = None,
@@ -116,6 +117,7 @@ def eval(
         solver: Alternative solver for task(s).
             Optional (uses task solver by default).
         tags: Tags to associate with this evaluation run.
+        metadata: Metadata to associate with this evaluation run.
         trace: Trace message interactions with evaluated model to terminal.
         display: Task display type (defaults to 'full').
         approval: Tool use approval policies.
@@ -186,6 +188,7 @@ def eval(
         sandbox_cleanup=sandbox_cleanup,
         solver=solver,
         tags=tags,
+        metadata=metadata,
         approval=approval,
         log_level=log_level,
         log_level_transcript=log_level_transcript,
@@ -235,6 +238,7 @@ async def eval_async(
     sandbox_cleanup: bool | None = None,
     solver: Solver | list[Solver] | SolverSpec | None = None,
     tags: list[str] | None = None,
+    metadata: dict[str, Any] | None = None,
     approval: str | list[ApprovalPolicy] | ApprovalPolicyConfig | None = None,
     log_level: str | None = None,
     log_level_transcript: str | None = None,
@@ -274,7 +278,8 @@ async def eval_async(
         sandbox: Sandbox environment type (or optionally a str or tuple with a shorthand spec)
         sandbox_cleanup: Cleanup sandbox environments after task completes (defaults to True)
         solver: Alternative solver for task(s). Optional (uses task solver by default).
-        tags (list[str] | None): Tags to associate with this evaluation run.
+        tags: Tags to associate with this evaluation run.
+        metadata: Metadata to associate with this evaluation run.
         approval: Tool use approval policies.
             Either a path to an approval policy config file or a list of approval policies.
             Defaults to no approval policy.
@@ -449,6 +454,7 @@ async def eval_async(
             epochs_reducer=epochs_reducer,
             solver=solver,
             tags=tags,
+            metadata=metadata,
             score=score,
             debug_errors=debug_errors is True,
             **kwargs,
@@ -473,6 +479,7 @@ async def eval_async(
             epochs_reducer=epochs_reducer,
             solver=solver,
             tags=tags,
+            metadata=metadata,
             score=score,
             **kwargs,
         )
@@ -68,6 +68,7 @@ def eval_set(
     sandbox_cleanup: bool | None = None,
     solver: Solver | list[Solver] | SolverSpec | None = None,
     tags: list[str] | None = None,
+    metadata: dict[str, Any] | None = None,
     trace: bool | None = None,
     display: DisplayType | None = None,
     approval: str | list[ApprovalPolicy] | None = None,
@@ -127,6 +128,7 @@ def eval_set(
         solver: Alternative solver(s) for
             evaluating task(s). ptional (uses task solver by default).
         tags: Tags to associate with this evaluation run.
+        metadata: Metadata to associate with this evaluation run.
         trace: Trace message interactions with evaluated model to terminal.
         display: Task display type (defaults to 'full').
         approval: Tool use approval policies.
@@ -193,6 +195,7 @@ def eval_set(
         sandbox_cleanup=sandbox_cleanup,
         solver=solver,
         tags=tags,
+        metadata=metadata,
         trace=trace,
         display=display,
         approval=approval,
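
The same metadata dictionary is now a first-class argument of eval(), eval_async(), and eval_set(), alongside tags. A minimal sketch of the Python-API equivalent of the CLI flag above (task name, model, and metadata values are illustrative):

    from inspect_ai import eval

    logs = eval(
        "ctf.py",                # any task path or registry name
        model="openai/gpt-4o",
        tags=["nightly"],
        metadata={"experiment": "baseline", "seed": 7},  # recorded with the eval run
    )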
inspect_ai/_eval/run.py CHANGED
@@ -2,7 +2,7 @@ import functools
 import logging
 import os
 import sys
-from typing import Awaitable, Callable, Set, cast
+from typing import Any, Awaitable, Callable, Set, cast
 
 from inspect_ai._eval.task.task import Task
 from inspect_ai._util.trace import trace_action
@@ -68,6 +68,7 @@ async def eval_run(
     epochs_reducer: list[ScoreReducer] | None = None,
     solver: Solver | SolverSpec | None = None,
     tags: list[str] | None = None,
+    metadata: dict[str, Any] | None = None,
     debug_errors: bool = False,
     score: bool = True,
     **kwargs: Unpack[GenerateConfigArgs],
@@ -205,7 +206,7 @@ async def eval_run(
             task_args=resolved_task.task_args,
             model_args=resolved_task.model.model_args,
             eval_config=task_eval_config,
-            metadata=task.metadata,
+            metadata=((metadata or {}) | (task.metadata or {})) or None,
             recorder=recorder,
         )
         await logger.init()
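
Note the merge in eval_run(): run-level metadata supplied to eval() is combined with task-level metadata via dict union, so the task's entries win on key collisions, and the result collapses back to None when both sides are empty. A small worked example of that expression (values illustrative):

    run_metadata = {"experiment": "baseline", "owner": "ci"}
    task_metadata = {"owner": "task-author"}

    merged = ((run_metadata or {}) | (task_metadata or {})) or None
    # merged == {"experiment": "baseline", "owner": "task-author"}

    empty = ((None or {}) | (None or {})) or None
    # empty is None -- nothing is recorded when neither source provides metadata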
@@ -12,6 +12,9 @@ class ContentText(BaseModel):
     text: str
     """Text content."""
 
+    refusal: bool | None = Field(default=None)
+    """Was this a refusal message?"""
+
 
 class ContentReasoning(BaseModel):
     """Reasoning content.
@@ -21577,7 +21577,7 @@ var require_assets = __commonJS({
           className: clsx(
             "source-code",
             "sourceCode",
-            `language-${highlightLanguage}`,
+            highlightLanguage ? `language-${highlightLanguage}` : void 0,
             styles$10.outputCode
           ),
           children: formattedContent
@@ -21613,6 +21613,22 @@ var require_assets = __commonJS({
       }
       const collapse = Array.isArray(output2) ? output2.every((item2) => !isContentImage(item2)) : !isContentImage(output2);
       const normalizedContent = reactExports.useMemo(() => normalizeContent$1(output2), [output2]);
+      const hasContent = normalizedContent.find((c2) => {
+        if (c2.type === "tool") {
+          for (const t2 of c2.content) {
+            if (t2.type === "text") {
+              if (t2.text) {
+                return true;
+              }
+            } else {
+              return true;
+            }
+          }
+          return false;
+        } else {
+          return true;
+        }
+      });
       const contents2 = mode !== "compact" ? input2 : input2 || functionCall;
       return /* @__PURE__ */ jsxRuntimeExports.jsxs("div", { children: [
         mode !== "compact" && (!view || view.title) ? /* @__PURE__ */ jsxRuntimeExports.jsx(ToolTitle, { title: (view == null ? void 0 : view.title) || functionCall }) : "",
@@ -21625,7 +21641,7 @@ var require_assets = __commonJS({
             toolCallView: view
           }
         ),
-        /* @__PURE__ */ jsxRuntimeExports.jsx(ExpandablePanel, { collapse, border: true, lines: 15, children: /* @__PURE__ */ jsxRuntimeExports.jsx(MessageContent, { contents: normalizedContent }) })
+        hasContent ? /* @__PURE__ */ jsxRuntimeExports.jsx(ExpandablePanel, { collapse, border: true, lines: 15, children: /* @__PURE__ */ jsxRuntimeExports.jsx(MessageContent, { contents: normalizedContent }) }) : void 0
       ] }) })
     ] });
   };
@@ -83,8 +83,24 @@ export const ToolCallView: FC<ToolCallViewProps> = ({
     : !isContentImage(output);
   const normalizedContent = useMemo(() => normalizeContent(output), [output]);
 
-  const contents = mode !== "compact" ? input : input || functionCall;
+  const hasContent = normalizedContent.find((c) => {
+    if (c.type === "tool") {
+      for (const t of c.content) {
+        if (t.type === "text") {
+          if (t.text) {
+            return true;
+          }
+        } else {
+          return true;
+        }
+      }
+      return false;
+    } else {
+      return true;
+    }
+  });
 
+  const contents = mode !== "compact" ? input : input || functionCall;
   return (
     <div>
       {mode !== "compact" && (!view || view.title) ? (
@@ -99,9 +115,11 @@ export const ToolCallView: FC<ToolCallViewProps> = ({
         contents={contents}
         toolCallView={view}
       />
-      <ExpandablePanel collapse={collapse} border={true} lines={15}>
-        <MessageContent contents={normalizedContent} />
-      </ExpandablePanel>
+      {hasContent ? (
+        <ExpandablePanel collapse={collapse} border={true} lines={15}>
+          <MessageContent contents={normalizedContent} />
+        </ExpandablePanel>
+      ) : undefined}
       </div>
     </div>
   </div>
@@ -64,7 +64,7 @@ export const ToolInput: FC<ToolInputProps> = memo((props) => {
       className={clsx(
         "source-code",
         "sourceCode",
-        `language-${highlightLanguage}`,
+        highlightLanguage ? `language-${highlightLanguage}` : undefined,
         styles.outputCode,
       )}
     >
@@ -1,7 +1,9 @@
 import json
 import re
+from copy import copy
 from typing import Literal
 
+from openai import BadRequestError, OpenAIError
 from openai.types.chat import (
     ChatCompletion,
     ChatCompletionAssistantMessageParam,
@@ -26,7 +28,9 @@ from openai.types.chat.chat_completion import Choice, ChoiceLogprobs
 from openai.types.chat.chat_completion_message_tool_call import Function
 from openai.types.completion_usage import CompletionUsage
 from openai.types.shared_params.function_definition import FunctionDefinition
+from pydantic import JsonValue
 
+from inspect_ai._util.constants import BASE_64_DATA_REMOVED
 from inspect_ai._util.content import (
     Content,
     ContentAudio,
@@ -48,7 +52,16 @@ from ._chat_message import (
     ChatMessageTool,
     ChatMessageUser,
 )
-from ._model_output import ModelUsage, StopReason, as_stop_reason
+from ._model_output import ModelOutput, ModelUsage, StopReason, as_stop_reason
+
+
+class OpenAIResponseError(OpenAIError):
+    def __init__(self, code: str, message: str) -> None:
+        self.code = code
+        self.message = message
+
+    def __str__(self) -> str:
+        return f"{self.code}: {self.message}"
 
 
 def is_o_series(name: str) -> bool:
@@ -58,6 +71,10 @@ def is_o_series(name: str) -> bool:
     return not is_gpt(name) and bool(re.search(r"o\d+", name))
 
 
+def is_o1_pro(name: str) -> bool:
+    return "o1-pro" in name
+
+
 def is_o1_mini(name: str) -> bool:
     return "o1-mini" in name
 
@@ -320,6 +337,7 @@ def chat_messages_from_openai(
             chat_messages.append(ChatMessageUser(content=content))
         elif message["role"] == "assistant":
             # resolve content
+            refusal: Literal[True] | None = None
             asst_content = message.get("content", None)
             if isinstance(asst_content, str):
                 result = parse_content_with_reasoning(asst_content)
@@ -336,6 +354,8 @@ def chat_messages_from_openai(
                 content = asst_content
             elif asst_content is None:
                 content = message.get("refusal", None) or ""
+                if content:
+                    refusal = True
             else:
                 content = []
                 for ac in asst_content:
@@ -348,7 +368,7 @@ def chat_messages_from_openai(
                 )
             if reasoning is not None:
                 if isinstance(content, str):
-                    content = [ContentText(text=content)]
+                    content = [ContentText(text=content, refusal=refusal)]
                 else:
                     content.insert(0, ContentReasoning(reasoning=str(reasoning)))
 
@@ -437,7 +457,7 @@ def content_from_openai(
             )
         ]
     elif content["type"] == "refusal":
-        return [ContentText(text=content["refusal"])]
+        return [ContentText(text=content["refusal"], refusal=True)]
     else:
         content_type = content["type"]
         raise ValueError(f"Unexpected content type '{content_type}' in message.")
@@ -455,8 +475,10 @@ def chat_message_assistant_from_openai(
     if reasoning is not None:
         content: str | list[Content] = [
             ContentReasoning(reasoning=str(reasoning)),
-            ContentText(text=msg_content),
+            ContentText(text=msg_content, refusal=True if refusal else None),
         ]
+    elif refusal is not None:
+        content = [ContentText(text=msg_content, refusal=True)]
     else:
         content = msg_content
 
@@ -484,3 +506,44 @@ def chat_choices_from_openai(
         )
         for choice in choices
     ]
+
+
+def openai_handle_bad_request(
+    model_name: str, e: BadRequestError
+) -> ModelOutput | Exception:
+    # extract message
+    if isinstance(e.body, dict) and "message" in e.body.keys():
+        content = str(e.body.get("message"))
+    else:
+        content = e.message
+
+    # narrow stop_reason
+    stop_reason: StopReason | None = None
+    if e.code == "context_length_exceeded":
+        stop_reason = "model_length"
+    elif (
+        e.code == "invalid_prompt"  # seems to happen for o1/o3
+        or e.code == "content_policy_violation"  # seems to happen for vision
+        or e.code == "content_filter"  # seems to happen on azure
+    ):
+        stop_reason = "content_filter"
+
+    if stop_reason:
+        return ModelOutput.from_content(
+            model=model_name, content=content, stop_reason=stop_reason
+        )
+    else:
+        return e
+
+
+def openai_media_filter(key: JsonValue | None, value: JsonValue) -> JsonValue:
+    # remove images from raw api call
+    if key == "image_url" and isinstance(value, dict) and "url" in value:
+        url = str(value.get("url"))
+        if url.startswith("data:"):
+            value = copy(value)
+            value.update(url=BASE_64_DATA_REMOVED)
+    elif key == "input_audio" and isinstance(value, dict) and "data" in value:
+        value = copy(value)
+        value.update(data=BASE_64_DATA_REMOVED)
+    return value
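
openai_handle_bad_request converts selected BadRequestError codes into a normal ModelOutput instead of surfacing an exception: context_length_exceeded maps to the model_length stop reason; invalid_prompt, content_policy_violation, and content_filter map to content_filter; any other code is returned as the original error. openai_media_filter is a per-key/value filter that redacts base64 payloads from the raw API record. A rough sketch of calling the filter directly (the sample payloads are illustrative; in practice the filter is applied while serializing the raw request/response):

    image_url = {"url": "data:image/png;base64,iVBORw0KGgoAAAANS...", "detail": "auto"}

    filtered = openai_media_filter("image_url", image_url)
    # filtered["url"] == BASE_64_DATA_REMOVED; the input dict is left untouched
    # because the filter copies the value before updating it

    audio = openai_media_filter("input_audio", {"data": "UklGRi4AAABXQVZF...", "format": "wav"})
    # audio["data"] == BASE_64_DATA_REMOVED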