inspect-ai 0.3.76__py3-none-any.whl → 0.3.77__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/_cli/eval.py +16 -0
- inspect_ai/_display/core/results.py +6 -1
- inspect_ai/_eval/eval.py +8 -1
- inspect_ai/_eval/evalset.py +3 -0
- inspect_ai/_eval/run.py +3 -2
- inspect_ai/_util/content.py +3 -0
- inspect_ai/_view/www/dist/assets/index.js +18 -2
- inspect_ai/_view/www/src/samples/chat/tools/ToolCallView.tsx +22 -4
- inspect_ai/_view/www/src/samples/chat/tools/ToolInput.tsx +1 -1
- inspect_ai/model/_openai.py +67 -4
- inspect_ai/model/_openai_responses.py +277 -0
- inspect_ai/model/_providers/anthropic.py +1 -0
- inspect_ai/model/_providers/azureai.py +2 -2
- inspect_ai/model/_providers/mistral.py +29 -13
- inspect_ai/model/_providers/openai.py +53 -49
- inspect_ai/model/_providers/openai_responses.py +177 -0
- inspect_ai/model/_providers/openrouter.py +52 -2
- inspect_ai/model/_providers/providers.py +1 -1
- inspect_ai/tool/__init__.py +2 -0
- inspect_ai/tool/_tool.py +23 -3
- inspect_ai/tool/_tools/_think.py +48 -0
- {inspect_ai-0.3.76.dist-info → inspect_ai-0.3.77.dist-info}/METADATA +1 -1
- {inspect_ai-0.3.76.dist-info → inspect_ai-0.3.77.dist-info}/RECORD +27 -25
- {inspect_ai-0.3.76.dist-info → inspect_ai-0.3.77.dist-info}/WHEEL +1 -1
- inspect_ai/model/_image.py +0 -15
- {inspect_ai-0.3.76.dist-info → inspect_ai-0.3.77.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.76.dist-info → inspect_ai-0.3.77.dist-info}/licenses/LICENSE +0 -0
- {inspect_ai-0.3.76.dist-info → inspect_ai-0.3.77.dist-info}/top_level.txt +0 -0
inspect_ai/_cli/eval.py
CHANGED
@@ -115,6 +115,13 @@ def eval_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
         help="Tags to associate with this evaluation run.",
         envvar="INSPECT_EVAL_TAGS",
     )
+    @click.option(
+        "--metadata",
+        multiple=True,
+        type=str,
+        help="Metadata to associate with this evaluation run (more than one --metadata argument can be specified).",
+        envvar="INSPECT_EVAL_METADATA",
+    )
     @click.option(
         "--trace",
         type=bool,
@@ -449,6 +456,7 @@ def eval_command(
     s: tuple[str] | None,
     solver_config: str | None,
     tags: str | None,
+    metadata: tuple[str] | None,
     trace: bool | None,
     approval: str | None,
     sandbox: str | None,
@@ -525,6 +533,7 @@ def eval_command(
         s=s,
         solver_config=solver_config,
         tags=tags,
+        metadata=metadata,
         trace=trace,
         approval=approval,
         sandbox=sandbox,
@@ -616,6 +625,7 @@ def eval_set_command(
     s: tuple[str] | None,
     solver_config: str | None,
     tags: str | None,
+    metadata: tuple[str] | None,
     sandbox: str | None,
     no_sandbox_cleanup: bool | None,
     epochs: int | None,
@@ -695,6 +705,7 @@ def eval_set_command(
         s=s,
         solver_config=solver_config,
         tags=tags,
+        metadata=metadata,
         trace=trace,
         approval=approval,
         sandbox=sandbox,
@@ -749,6 +760,7 @@ def eval_exec(
     s: tuple[str] | None,
     solver_config: str | None,
     tags: str | None,
+    metadata: tuple[str] | None,
     trace: bool | None,
     approval: str | None,
     sandbox: str | None,
@@ -790,6 +802,9 @@ def eval_exec(
     # parse tags
     eval_tags = parse_comma_separated(tags)
 
+    # parse metadata
+    eval_metadata = parse_cli_args(metadata)
+
     # resolve epochs
     eval_epochs = (
         Epochs(epochs, create_reducers(parse_comma_separated(epochs_reducer)))
@@ -825,6 +840,7 @@ def eval_exec(
         task_args=task_args,
         solver=SolverSpec(solver, solver_args) if solver else None,
         tags=eval_tags,
+        metadata=eval_metadata,
         trace=trace,
         approval=approval,
         sandbox=parse_sandbox(sandbox),
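
The new --metadata option repeats as NAME=value pairs, which eval_exec folds into a dict via parse_cli_args. As a rough illustration of that parsing (a minimal hypothetical stand-in, not inspect's actual helper, which may also coerce value types):

    # Hypothetical stand-in for parse_cli_args: collect repeated
    # "NAME=value" CLI arguments into a dict (values kept as strings).
    def parse_metadata_args(args: tuple[str, ...] | None) -> dict[str, str] | None:
        if not args:
            return None
        parsed: dict[str, str] = {}
        for arg in args:
            key, sep, value = arg.partition("=")
            if not sep:
                raise ValueError(f"metadata argument must be NAME=value: {arg}")
            parsed[key.strip()] = value.strip()
        return parsed

    # e.g. inspect eval ctf.py --metadata experiment=sweep-1 --metadata owner=alice
    assert parse_metadata_args(("experiment=sweep-1", "owner=alice")) == {
        "experiment": "sweep-1",
        "owner": "alice",
    }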
inspect_ai/_display/core/results.py
CHANGED
@@ -131,9 +131,14 @@ def task_stats(stats: EvalStats) -> RenderableType:
         else:
             input_tokens = f"[bold]I: [/bold]{usage.input_tokens:,}"
 
+        if usage.reasoning_tokens is not None:
+            reasoning_tokens = f", [bold]R: [/bold]{usage.reasoning_tokens:,}"
+        else:
+            reasoning_tokens = ""
+
         table.add_row(
             Text(model, style="bold"),
-            f" {usage.total_tokens:,} tokens [{input_tokens}, [bold]O: [/bold]{usage.output_tokens:,}]",
+            f" {usage.total_tokens:,} tokens [{input_tokens}, [bold]O: [/bold]{usage.output_tokens:,}{reasoning_tokens}]",
             style=theme.light,
         )
 
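
With reasoning tokens reported, the stats row gains an R: segment after O:. Checking the f-string above with sample numbers (values illustrative):

    # Reproduce the Rich markup assembled by task_stats.
    input_tokens = f"[bold]I: [/bold]{1000:,}"
    reasoning_tokens = f", [bold]R: [/bold]{128:,}"  # "" when usage.reasoning_tokens is None
    row = f" {1362:,} tokens [{input_tokens}, [bold]O: [/bold]{362:,}{reasoning_tokens}]"
    assert row == " 1,362 tokens [[bold]I: [/bold]1,000, [bold]O: [/bold]362, [bold]R: [/bold]128]"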
inspect_ai/_eval/eval.py
CHANGED
@@ -68,6 +68,7 @@ def eval(
     sandbox_cleanup: bool | None = None,
     solver: Solver | list[Solver] | SolverSpec | None = None,
     tags: list[str] | None = None,
+    metadata: dict[str, Any] | None = None,
     trace: bool | None = None,
     display: DisplayType | None = None,
     approval: str | list[ApprovalPolicy] | None = None,
@@ -116,6 +117,7 @@ def eval(
         solver: Alternative solver for task(s).
             Optional (uses task solver by default).
         tags: Tags to associate with this evaluation run.
+        metadata: Metadata to associate with this evaluation run.
         trace: Trace message interactions with evaluated model to terminal.
         display: Task display type (defaults to 'full').
         approval: Tool use approval policies.
@@ -186,6 +188,7 @@ def eval(
             sandbox_cleanup=sandbox_cleanup,
             solver=solver,
             tags=tags,
+            metadata=metadata,
             approval=approval,
             log_level=log_level,
             log_level_transcript=log_level_transcript,
@@ -235,6 +238,7 @@ async def eval_async(
     sandbox_cleanup: bool | None = None,
     solver: Solver | list[Solver] | SolverSpec | None = None,
     tags: list[str] | None = None,
+    metadata: dict[str, Any] | None = None,
     approval: str | list[ApprovalPolicy] | ApprovalPolicyConfig | None = None,
     log_level: str | None = None,
     log_level_transcript: str | None = None,
@@ -274,7 +278,8 @@ async def eval_async(
         sandbox: Sandbox environment type (or optionally a str or tuple with a shorthand spec)
         sandbox_cleanup: Cleanup sandbox environments after task completes (defaults to True)
         solver: Alternative solver for task(s). Optional (uses task solver by default).
-        tags
+        tags: Tags to associate with this evaluation run.
+        metadata: Metadata to associate with this evaluation run.
         approval: Tool use approval policies.
             Either a path to an approval policy config file or a list of approval policies.
             Defaults to no approval policy.
@@ -449,6 +454,7 @@ async def eval_async(
             epochs_reducer=epochs_reducer,
             solver=solver,
             tags=tags,
+            metadata=metadata,
             score=score,
             debug_errors=debug_errors is True,
             **kwargs,
@@ -473,6 +479,7 @@ async def eval_async(
             epochs_reducer=epochs_reducer,
             solver=solver,
             tags=tags,
+            metadata=metadata,
             score=score,
             **kwargs,
         )
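
From the Python API the new parameter is a plain dict recorded with the run. A usage sketch (task path and model are illustrative, not from this diff):

    from inspect_ai import eval

    # Attach run-level metadata alongside tags.
    logs = eval(
        "security_guide.py",
        model="openai/gpt-4o",
        tags=["nightly"],
        metadata={"experiment": "sweep-1", "commit": "abc123"},
    )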
inspect_ai/_eval/evalset.py
CHANGED
@@ -68,6 +68,7 @@ def eval_set(
     sandbox_cleanup: bool | None = None,
     solver: Solver | list[Solver] | SolverSpec | None = None,
     tags: list[str] | None = None,
+    metadata: dict[str, Any] | None = None,
     trace: bool | None = None,
     display: DisplayType | None = None,
     approval: str | list[ApprovalPolicy] | None = None,
@@ -127,6 +128,7 @@ def eval_set(
         solver: Alternative solver(s) for
             evaluating task(s). ptional (uses task solver by default).
         tags: Tags to associate with this evaluation run.
+        metadata: Metadata to associate with this evaluation run.
         trace: Trace message interactions with evaluated model to terminal.
         display: Task display type (defaults to 'full').
         approval: Tool use approval policies.
@@ -193,6 +195,7 @@ def eval_set(
         sandbox_cleanup=sandbox_cleanup,
         solver=solver,
         tags=tags,
+        metadata=metadata,
         trace=trace,
         display=display,
         approval=approval,
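
eval_set accepts the same parameter; a sketch assuming its usual required log_dir (values illustrative):

    from inspect_ai import eval_set

    success, logs = eval_set(
        ["security_guide.py"],
        log_dir="logs/sweep-1",
        metadata={"experiment": "sweep-1"},
    )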
inspect_ai/_eval/run.py
CHANGED
@@ -2,7 +2,7 @@ import functools
 import logging
 import os
 import sys
-from typing import Awaitable, Callable, Set, cast
+from typing import Any, Awaitable, Callable, Set, cast
 
 from inspect_ai._eval.task.task import Task
 from inspect_ai._util.trace import trace_action
@@ -68,6 +68,7 @@ async def eval_run(
     epochs_reducer: list[ScoreReducer] | None = None,
     solver: Solver | SolverSpec | None = None,
     tags: list[str] | None = None,
+    metadata: dict[str, Any] | None = None,
     debug_errors: bool = False,
     score: bool = True,
     **kwargs: Unpack[GenerateConfigArgs],
@@ -205,7 +206,7 @@ async def eval_run(
         task_args=resolved_task.task_args,
         model_args=resolved_task.model.model_args,
         eval_config=task_eval_config,
-        metadata=task.metadata,
+        metadata=((metadata or {}) | (task.metadata or {})) or None,
         recorder=recorder,
     )
     await logger.init()
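
The merge expression above gives task-level metadata precedence over run-level metadata and collapses to None when both are absent. Worked through with sample dicts:

    # Same expression as in eval_run, with illustrative values.
    metadata = {"experiment": "sweep-1", "owner": "alice"}  # eval-level
    task_metadata = {"owner": "bob"}  # task-level

    merged = ((metadata or {}) | (task_metadata or {})) or None
    assert merged == {"experiment": "sweep-1", "owner": "bob"}  # task wins on conflicts

    # both empty -> None rather than {}
    assert (((None or {}) | (None or {})) or None) is None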
inspect_ai/_view/www/dist/assets/index.js
CHANGED
@@ -21577,7 +21577,7 @@ var require_assets = __commonJS({
           className: clsx(
             "source-code",
             "sourceCode",
-            `language-${highlightLanguage}`,
+            highlightLanguage ? `language-${highlightLanguage}` : void 0,
             styles$10.outputCode
           ),
           children: formattedContent
@@ -21613,6 +21613,22 @@ var require_assets = __commonJS({
         }
         const collapse = Array.isArray(output2) ? output2.every((item2) => !isContentImage(item2)) : !isContentImage(output2);
         const normalizedContent = reactExports.useMemo(() => normalizeContent$1(output2), [output2]);
+        const hasContent = normalizedContent.find((c2) => {
+          if (c2.type === "tool") {
+            for (const t2 of c2.content) {
+              if (t2.type === "text") {
+                if (t2.text) {
+                  return true;
+                }
+              } else {
+                return true;
+              }
+            }
+            return false;
+          } else {
+            return true;
+          }
+        });
         const contents2 = mode !== "compact" ? input2 : input2 || functionCall;
         return /* @__PURE__ */ jsxRuntimeExports.jsxs("div", { children: [
           mode !== "compact" && (!view || view.title) ? /* @__PURE__ */ jsxRuntimeExports.jsx(ToolTitle, { title: (view == null ? void 0 : view.title) || functionCall }) : "",
@@ -21625,7 +21641,7 @@ var require_assets = __commonJS({
             toolCallView: view
           }
         ),
-        /* @__PURE__ */ jsxRuntimeExports.jsx(ExpandablePanel, { collapse, border: true, lines: 15, children: /* @__PURE__ */ jsxRuntimeExports.jsx(MessageContent, { contents: normalizedContent }) })
+        hasContent ? /* @__PURE__ */ jsxRuntimeExports.jsx(ExpandablePanel, { collapse, border: true, lines: 15, children: /* @__PURE__ */ jsxRuntimeExports.jsx(MessageContent, { contents: normalizedContent }) }) : void 0
       ] }) })
     ] });
   };
inspect_ai/_view/www/src/samples/chat/tools/ToolCallView.tsx
CHANGED
@@ -83,8 +83,24 @@ export const ToolCallView: FC<ToolCallViewProps> = ({
     : !isContentImage(output);
   const normalizedContent = useMemo(() => normalizeContent(output), [output]);
 
-  const contents = mode !== "compact" ? input : input || functionCall;
+  const hasContent = normalizedContent.find((c) => {
+    if (c.type === "tool") {
+      for (const t of c.content) {
+        if (t.type === "text") {
+          if (t.text) {
+            return true;
+          }
+        } else {
+          return true;
+        }
+      }
+      return false;
+    } else {
+      return true;
+    }
+  });
 
+  const contents = mode !== "compact" ? input : input || functionCall;
   return (
     <div>
       {mode !== "compact" && (!view || view.title) ? (
@@ -99,9 +115,11 @@ export const ToolCallView: FC<ToolCallViewProps> = ({
           contents={contents}
           toolCallView={view}
         />
-        <ExpandablePanel collapse={collapse} border={true} lines={15}>
-          <MessageContent contents={normalizedContent} />
-        </ExpandablePanel>
+        {hasContent ? (
+          <ExpandablePanel collapse={collapse} border={true} lines={15}>
+            <MessageContent contents={normalizedContent} />
+          </ExpandablePanel>
+        ) : undefined}
       </div>
     </div>
   </div>
inspect_ai/model/_openai.py
CHANGED
@@ -1,7 +1,9 @@
 import json
 import re
+from copy import copy
 from typing import Literal
 
+from openai import BadRequestError, OpenAIError
 from openai.types.chat import (
     ChatCompletion,
     ChatCompletionAssistantMessageParam,
@@ -26,7 +28,9 @@ from openai.types.chat.chat_completion import Choice, ChoiceLogprobs
 from openai.types.chat.chat_completion_message_tool_call import Function
 from openai.types.completion_usage import CompletionUsage
 from openai.types.shared_params.function_definition import FunctionDefinition
+from pydantic import JsonValue
 
+from inspect_ai._util.constants import BASE_64_DATA_REMOVED
 from inspect_ai._util.content import (
     Content,
     ContentAudio,
@@ -48,7 +52,16 @@ from ._chat_message import (
     ChatMessageTool,
     ChatMessageUser,
 )
-from ._model_output import ModelUsage, StopReason, as_stop_reason
+from ._model_output import ModelOutput, ModelUsage, StopReason, as_stop_reason
+
+
+class OpenAIResponseError(OpenAIError):
+    def __init__(self, code: str, message: str) -> None:
+        self.code = code
+        self.message = message
+
+    def __str__(self) -> str:
+        return f"{self.code}: {self.message}"
 
 
 def is_o_series(name: str) -> bool:
@@ -58,6 +71,10 @@ def is_o_series(name: str) -> bool:
     return not is_gpt(name) and bool(re.search(r"o\d+", name))
 
 
+def is_o1_pro(name: str) -> bool:
+    return "o1-pro" in name
+
+
 def is_o1_mini(name: str) -> bool:
     return "o1-mini" in name
 
@@ -320,6 +337,7 @@ def chat_messages_from_openai(
             chat_messages.append(ChatMessageUser(content=content))
         elif message["role"] == "assistant":
             # resolve content
+            refusal: Literal[True] | None = None
            asst_content = message.get("content", None)
             if isinstance(asst_content, str):
                 result = parse_content_with_reasoning(asst_content)
@@ -336,6 +354,8 @@
                 content = asst_content
             elif asst_content is None:
                 content = message.get("refusal", None) or ""
+                if content:
+                    refusal = True
             else:
                 content = []
                 for ac in asst_content:
@@ -348,7 +368,7 @@
                 )
             if reasoning is not None:
                 if isinstance(content, str):
-                    content = [ContentText(text=content)]
+                    content = [ContentText(text=content, refusal=refusal)]
                 else:
                     content.insert(0, ContentReasoning(reasoning=str(reasoning)))
 
@@ -437,7 +457,7 @@ def content_from_openai(
             )
         ]
     elif content["type"] == "refusal":
-        return [ContentText(text=content["refusal"])]
+        return [ContentText(text=content["refusal"], refusal=True)]
     else:
         content_type = content["type"]
         raise ValueError(f"Unexpected content type '{content_type}' in message.")
@@ -455,8 +475,10 @@ def chat_message_assistant_from_openai(
     if reasoning is not None:
         content: str | list[Content] = [
             ContentReasoning(reasoning=str(reasoning)),
-            ContentText(text=msg_content),
+            ContentText(text=msg_content, refusal=True if refusal else None),
         ]
+    elif refusal is not None:
+        content = [ContentText(text=msg_content, refusal=True)]
     else:
         content = msg_content
 
@@ -484,3 +506,44 @@ def chat_choices_from_openai(
         )
         for choice in choices
     ]
+
+
+def openai_handle_bad_request(
+    model_name: str, e: BadRequestError
+) -> ModelOutput | Exception:
+    # extract message
+    if isinstance(e.body, dict) and "message" in e.body.keys():
+        content = str(e.body.get("message"))
+    else:
+        content = e.message
+
+    # narrow stop_reason
+    stop_reason: StopReason | None = None
+    if e.code == "context_length_exceeded":
+        stop_reason = "model_length"
+    elif (
+        e.code == "invalid_prompt"  # seems to happen for o1/o3
+        or e.code == "content_policy_violation"  # seems to happen for vision
+        or e.code == "content_filter"  # seems to happen on azure
+    ):
+        stop_reason = "content_filter"
+
+    if stop_reason:
+        return ModelOutput.from_content(
+            model=model_name, content=content, stop_reason=stop_reason
+        )
+    else:
+        return e
+
+
+def openai_media_filter(key: JsonValue | None, value: JsonValue) -> JsonValue:
+    # remove images from raw api call
+    if key == "image_url" and isinstance(value, dict) and "url" in value:
+        url = str(value.get("url"))
+        if url.startswith("data:"):
+            value = copy(value)
+            value.update(url=BASE_64_DATA_REMOVED)
+    elif key == "input_audio" and isinstance(value, dict) and "data" in value:
+        value = copy(value)
+        value.update(data=BASE_64_DATA_REMOVED)
+    return value
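
openai_handle_bad_request is designed to be called from a provider's exception handler so that length and content-filter errors surface as ModelOutput with an appropriate stop_reason rather than raising. A sketch of that call pattern (the wrapper below is illustrative, not from this diff):

    from openai import AsyncOpenAI, BadRequestError

    async def generate_or_refuse(client: AsyncOpenAI, model_name: str, **request):
        try:
            return await client.chat.completions.create(model=model_name, **request)
        except BadRequestError as ex:
            handled = openai_handle_bad_request(model_name, ex)
            if isinstance(handled, Exception):
                raise handled
            # ModelOutput with stop_reason "model_length" or "content_filter"
            return handled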