inspect-ai 0.3.61__py3-none-any.whl → 0.3.62__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
- inspect_ai/_cli/eval.py +13 -0
- inspect_ai/_cli/view.py +4 -0
- inspect_ai/_display/textual/widgets/transcript.py +15 -9
- inspect_ai/_eval/task/error.py +10 -14
- inspect_ai/_eval/task/run.py +10 -8
- inspect_ai/_util/transcript.py +11 -0
- inspect_ai/_view/www/dist/assets/index.css +1 -0
- inspect_ai/_view/www/dist/assets/index.js +100 -94
- inspect_ai/_view/www/log-schema.json +35 -19
- inspect_ai/_view/www/src/components/ChatView.mjs +23 -0
- inspect_ai/_view/www/src/types/log.d.ts +6 -4
- inspect_ai/log/_recorders/eval.py +1 -1
- inspect_ai/model/_chat_message.py +27 -0
- inspect_ai/model/_conversation.py +10 -3
- inspect_ai/model/_generate_config.py +6 -0
- inspect_ai/model/_model.py +74 -0
- inspect_ai/model/_openai.py +33 -1
- inspect_ai/model/_providers/anthropic.py +12 -0
- inspect_ai/model/_providers/groq.py +4 -0
- inspect_ai/model/_providers/openai.py +21 -9
- inspect_ai/model/_providers/providers.py +1 -1
- inspect_ai/model/_reasoning.py +17 -0
- inspect_ai/solver/_basic_agent.py +19 -9
- inspect_ai/tool/beta/_computer/_resources/Dockerfile +4 -0
- inspect_ai/tool/beta/_computer/_resources/image_home_dir/.config/Code/User/globalStorage/state.vscdb +0 -0
- inspect_ai/tool/beta/_computer/_resources/image_home_dir/.config/Code/User/settings.json +3 -0
- inspect_ai/tool/beta/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-panel.xml +61 -0
- inspect_ai/tool/beta/_computer/_resources/image_home_dir/Desktop/Terminal.desktop +10 -0
- {inspect_ai-0.3.61.dist-info → inspect_ai-0.3.62.dist-info}/METADATA +1 -1
- {inspect_ai-0.3.61.dist-info → inspect_ai-0.3.62.dist-info}/RECORD +34 -29
- {inspect_ai-0.3.61.dist-info → inspect_ai-0.3.62.dist-info}/LICENSE +0 -0
- {inspect_ai-0.3.61.dist-info → inspect_ai-0.3.62.dist-info}/WHEEL +0 -0
- {inspect_ai-0.3.61.dist-info → inspect_ai-0.3.62.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.61.dist-info → inspect_ai-0.3.62.dist-info}/top_level.txt +0 -0
inspect_ai/_cli/eval.py
CHANGED
@@ -385,6 +385,14 @@ def eval_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
     help="Constrains effort on reasoning for reasoning models. Open AI o1 models only.",
     envvar="INSPECT_EVAL_REASONING_EFFORT",
 )
+@click.option(
+    "--reasoning-history/--no-reasoning-history",
+    type=bool,
+    is_flag=True,
+    default=True,
+    help="Include reasoning in chat message history sent to generate.",
+    envvar="INSPECT_EVAL_REASONING_HISTORY",
+)
 @click.option(
     "--log-format",
     type=click.Choice(["eval", "json"], case_sensitive=False),
@@ -444,6 +452,7 @@ def eval_command(
     max_tool_output: int | None,
     cache_prompt: str | None,
     reasoning_effort: str | None,
+    reasoning_history: bool | None,
     message_limit: int | None,
     token_limit: int | None,
     time_limit: int | None,
@@ -603,6 +612,7 @@ def eval_set_command(
     max_tool_output: int | None,
     cache_prompt: str | None,
     reasoning_effort: str | None,
+    reasoning_history: bool | None,
     message_limit: int | None,
     token_limit: int | None,
     time_limit: int | None,
@@ -841,6 +851,9 @@ def config_from_locals(locals: dict[str, Any]) -> GenerateConfigArgs:
             if key == "internal_tools":
                 if value is not False:
                     value = None
+            if key == "reasoning_history":
+                if value is not False:
+                    value = None
             config[key] = value  # type: ignore
     return config
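The new --reasoning-history flag is also threaded through to GenerateConfig (see inspect_ai/model/_generate_config.py in the file list above), so the same option can be set from the Python API. A minimal sketch, with a made-up task file and model:

# Sketch only: "ctf.py" and the model name are placeholder values;
# reasoning_history is the GenerateConfig option added in this release.
from inspect_ai import eval

# CLI equivalent: inspect eval ctf.py --model openai/o1 --no-reasoning-history
eval("ctf.py", model="openai/o1", reasoning_history=False)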
inspect_ai/_cli/view.py
CHANGED
@@ -63,6 +63,10 @@ def start(
     INSPECT_VIEW_AUTHORIZATION_TOKEN = "INSPECT_VIEW_AUTHORIZATION_TOKEN"
     authorization = os.environ.get(INSPECT_VIEW_AUTHORIZATION_TOKEN, None)
     if authorization:
+        # this indicates we are in vscode -- we want to set the log level to HTTP
+        # in vscode, updated versions of the extension do this but we set it
+        # manually here as a temporary bridge for running against older versions
+        common["log_level"] = "HTTP"
         del os.environ[INSPECT_VIEW_AUTHORIZATION_TOKEN]
         os.unsetenv(INSPECT_VIEW_AUTHORIZATION_TOKEN)
inspect_ai/_display/textual/widgets/transcript.py
CHANGED
@@ -15,6 +15,7 @@ from inspect_ai._util.transcript import (
     set_transcript_markdown_options,
     transcript_function,
     transcript_markdown,
+    transcript_reasoning,
     transcript_separator,
 )
 from inspect_ai.log._samples import ActiveSample
@@ -33,7 +34,11 @@ from inspect_ai.log._transcript import (
     SubtaskEvent,
     ToolEvent,
 )
-from inspect_ai.model._chat_message import ChatMessage, ChatMessageUser
+from inspect_ai.model._chat_message import (
+    ChatMessage,
+    ChatMessageAssistant,
+    ChatMessageUser,
+)
 from inspect_ai.model._render import messages_preceding_assistant
 from inspect_ai.tool._tool import ToolResult
 from inspect_ai.tool._tool_transcript import transcript_tool_call
@@ -171,8 +176,8 @@ def render_model_event(event: ModelEvent) -> EventDisplay:
     # content
     content: list[RenderableType] = []
 
-    def append_message(message: ChatMessage, text: str | None = None) -> None:
-        content.extend(render_message(message, text))
+    def append_message(message: ChatMessage) -> None:
+        content.extend(render_message(message))
 
     # render preceding messages
     preceding = messages_preceding_assistant(event.input)
@@ -309,16 +314,17 @@ def render_as_json(json: Any) -> RenderableType:
     )
 
 
-def render_message(
-    message: ChatMessage, text: str | None = None
-) -> list[RenderableType]:
+def render_message(message: ChatMessage) -> list[RenderableType]:
     content: list[RenderableType] = [
         Text(message.role.capitalize(), style="bold"),
         Text(),
     ]
-
-    if text := text or message.text:
-        content.extend([transcript_markdown(text.strip(), escape=True)])
+
+    if isinstance(message, ChatMessageAssistant) and message.reasoning:
+        content.extend(transcript_reasoning(message.reasoning))
+
+    if message.text:
+        content.extend([transcript_markdown(message.text.strip(), escape=True)])
     return content
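For illustration, a sketch of the new rendering path: assistant messages that carry the newly added reasoning field now emit a <think> block ahead of the message text. The message content here is made up, and both imports reach into internal modules shown in this diff:

# Made-up message; render_message and ChatMessageAssistant.reasoning are
# the internal APIs patched in the hunks above.
from rich.console import Console
from inspect_ai._display.textual.widgets.transcript import render_message
from inspect_ai.model._chat_message import ChatMessageAssistant

message = ChatMessageAssistant(
    content="The answer is 42.",
    reasoning="The prompt references Douglas Adams, so answer accordingly.",
)
Console().print(*render_message(message))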
inspect_ai/_eval/task/error.py
CHANGED
@@ -8,28 +8,24 @@ class SampleErrorHandler:
         self.fail_on_error = True if fail_on_error is None else fail_on_error
         self.total_samples = float(total_samples)
 
-    def __call__(self, ex: BaseException) -> EvalError:
+    def __call__(self, ex: BaseException) -> tuple[EvalError, BaseException | None]:
         # increment error count
         self.error_count += 1
 
         # create error (we may return it)
-        def sample_error() -> EvalError:
-            return eval_error(ex, type(ex), ex, ex.__traceback__)
+        def sample_error(
+            *, raise_error: bool
+        ) -> tuple[EvalError, BaseException | None]:
+            return eval_error(
+                ex, type(ex), ex, ex.__traceback__
+            ), ex if raise_error else None
 
         # check against limits
         if isinstance(self.fail_on_error, bool):
-            if self.fail_on_error:
-                raise ex
-            else:
-                return sample_error()
+            return sample_error(raise_error=self.fail_on_error)
         else:
             if self.fail_on_error < 1:
                 max_errors = self.fail_on_error * self.total_samples
-                if self.error_count >= max_errors:
-                    raise ex
-                else:
-                    return sample_error()
-            elif self.error_count >= self.fail_on_error:
-                raise ex
+                return sample_error(raise_error=self.error_count >= max_errors)
             else:
-                return sample_error()
+                return sample_error(raise_error=self.error_count >= self.fail_on_error)
inspect_ai/_eval/task/run.py
CHANGED
@@ -496,7 +496,7 @@ async def task_run_sample(
     logger: TaskLogger | None,
     log_images: bool,
     sample_source: EvalSampleSource | None,
-    sample_error: Callable[[BaseException], EvalError],
+    sample_error: SampleErrorHandler,
     sample_complete: Callable[[dict[str, SampleScore]], None],
     fails_on_error: bool,
     time_limit: int | None,
@@ -548,12 +548,12 @@ async def task_run_sample(
     )
 
     # helper to handle exceptions (will throw if we've exceeded the limit)
-    def handle_error(ex: BaseException) -> EvalError:
+    def handle_error(ex: BaseException) -> tuple[EvalError, BaseException | None]:
         err = sample_error(ex)
         py_logger.warning(
             f"Sample error (id: {sample.id}, epoch: {state.epoch}): {exception_message(ex)})"
         )
-        transcript()._event(ErrorEvent(error=err))
+        transcript()._event(ErrorEvent(error=err[0]))
         return err
 
     # solver loop
@@ -572,6 +572,7 @@ async def task_run_sample(
         ) as active,
     ):
         error: EvalError | None = None
+        raise_error: BaseException | None = None
         results: dict[str, SampleScore] = {}
         try:
             async with sandboxenv_cm:
@@ -640,7 +641,7 @@ async def task_run_sample(
                             state = sample_state() or state
                         case "error":
                             # default error handling
-                            error = handle_error(ex)
+                            error, raise_error = handle_error(ex)
 
                 else:
                     raise
@@ -660,7 +661,7 @@ async def task_run_sample(
             state.completed = True
 
         except BaseException as ex:
-            error = handle_error(ex)
+            error, raise_error = handle_error(ex)
 
         # set timeout for scoring. if the original timeout was hit we still
         # want to provide opportunity for scoring, but we don't necessarily
@@ -737,11 +738,10 @@ async def task_run_sample(
             )
 
             # handle error (this will throw if we've exceeded the limit)
-            error = handle_error(ex)
+            error, raise_error = handle_error(ex)
 
-    # handle sandboxenv init errors
     except Exception as ex:
-        error = handle_error(ex)
+        error, raise_error = handle_error(ex)
 
     # complete the sample
     progress(SAMPLE_TOTAL_PROGRESS_UNITS)
@@ -772,6 +772,8 @@ async def task_run_sample(
     if results is not None:
         sample_complete(results)
         return results
+    elif raise_error:
+        raise raise_error
     else:
         return None
inspect_ai/_util/transcript.py
CHANGED
@@ -111,6 +111,17 @@ def transcript_panel(
     )
 
 
+def transcript_reasoning(reasoning: str) -> list[RenderableType]:
+    content: list[RenderableType] = []
+    content.append(
+        transcript_markdown(
+            f"**<think>**  \n{reasoning}  \n**</think>**\n\n", escape=True
+        )
+    )
+    content.append(Text())
+    return content
+
+
 def transcript_separator(title: str, color: str) -> RenderableType:
     return Rule(title=title, style=f"{color} bold", align="center", end="\n\n")
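A quick usage sketch of the new helper (the reasoning text is made up): it returns rich renderables that print the reasoning wrapped in a markdown <think> block.

# Made-up reasoning text; transcript_reasoning is the helper added above.
from rich.console import Console
from inspect_ai._util.transcript import transcript_reasoning

Console().print(*transcript_reasoning("Inspect the tool output before answering."))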
inspect_ai/_view/www/dist/assets/index.css
CHANGED
@@ -15735,6 +15735,7 @@ pre.ap-terminal.ap-cursor-on .ap-line .ap-cursor.ap-inverse {
 }
 pre.ap-terminal:not(.ap-blink) .ap-line .ap-blink {
   color: transparent;
+  border-color: transparent;
 }
 pre.ap-terminal .ap-bright {
   font-weight: bold;