inspect-ai 0.3.60__py3-none-any.whl → 0.3.62__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/_cli/eval.py +13 -1
- inspect_ai/_cli/view.py +4 -0
- inspect_ai/_display/textual/widgets/transcript.py +15 -9
- inspect_ai/_eval/task/error.py +10 -14
- inspect_ai/_eval/task/generate.py +41 -35
- inspect_ai/_eval/task/run.py +20 -12
- inspect_ai/_util/hooks.py +17 -7
- inspect_ai/_util/transcript.py +11 -0
- inspect_ai/_view/www/dist/assets/index.css +1 -0
- inspect_ai/_view/www/dist/assets/index.js +100 -94
- inspect_ai/_view/www/log-schema.json +35 -19
- inspect_ai/_view/www/package.json +1 -1
- inspect_ai/_view/www/src/components/ChatView.mjs +23 -0
- inspect_ai/_view/www/src/types/log.d.ts +6 -4
- inspect_ai/log/_recorders/eval.py +1 -1
- inspect_ai/model/_chat_message.py +29 -2
- inspect_ai/model/_conversation.py +10 -3
- inspect_ai/model/_generate_config.py +6 -0
- inspect_ai/model/_model.py +164 -25
- inspect_ai/model/_openai.py +33 -1
- inspect_ai/model/_providers/anthropic.py +12 -3
- inspect_ai/model/_providers/groq.py +4 -0
- inspect_ai/model/_providers/openai.py +21 -9
- inspect_ai/model/_providers/providers.py +1 -1
- inspect_ai/model/_reasoning.py +17 -0
- inspect_ai/solver/__init__.py +2 -0
- inspect_ai/solver/_basic_agent.py +78 -58
- inspect_ai/{util → solver}/_limit.py +13 -0
- inspect_ai/solver/_task_state.py +37 -7
- inspect_ai/tool/_tools/_web_browser/_web_browser.py +3 -1
- inspect_ai/tool/beta/_computer/_resources/Dockerfile +5 -3
- inspect_ai/tool/beta/_computer/_resources/entrypoint/x11vnc_startup.sh +1 -1
- inspect_ai/tool/beta/_computer/_resources/image_home_dir/.config/Code/User/globalStorage/state.vscdb +0 -0
- inspect_ai/tool/beta/_computer/_resources/image_home_dir/.config/Code/User/settings.json +3 -0
- inspect_ai/tool/beta/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-panel.xml +61 -0
- inspect_ai/tool/beta/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-screensaver.xml +10 -0
- inspect_ai/tool/beta/_computer/_resources/image_home_dir/Desktop/Terminal.desktop +10 -0
- inspect_ai/util/__init__.py +0 -2
- inspect_ai/util/_sandbox/self_check.py +51 -28
- {inspect_ai-0.3.60.dist-info → inspect_ai-0.3.62.dist-info}/METADATA +2 -2
- {inspect_ai-0.3.60.dist-info → inspect_ai-0.3.62.dist-info}/RECORD +45 -40
- inspect_ai/tool/beta/_computer/_resources/image_home_dir/Desktop/XPaint.desktop +0 -10
- {inspect_ai-0.3.60.dist-info → inspect_ai-0.3.62.dist-info}/LICENSE +0 -0
- {inspect_ai-0.3.60.dist-info → inspect_ai-0.3.62.dist-info}/WHEEL +0 -0
- {inspect_ai-0.3.60.dist-info → inspect_ai-0.3.62.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.60.dist-info → inspect_ai-0.3.62.dist-info}/top_level.txt +0 -0
inspect_ai/_cli/eval.py
CHANGED
@@ -385,6 +385,14 @@ def eval_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
         help="Constrains effort on reasoning for reasoning models. Open AI o1 models only.",
         envvar="INSPECT_EVAL_REASONING_EFFORT",
     )
+    @click.option(
+        "--reasoning-history/--no-reasoning-history",
+        type=bool,
+        is_flag=True,
+        default=True,
+        help="Include reasoning in chat message history sent to generate.",
+        envvar="INSPECT_EVAL_REASONING_HISTORY",
+    )
     @click.option(
         "--log-format",
         type=click.Choice(["eval", "json"], case_sensitive=False),
@@ -444,6 +452,7 @@ def eval_command(
     max_tool_output: int | None,
     cache_prompt: str | None,
     reasoning_effort: str | None,
+    reasoning_history: bool | None,
     message_limit: int | None,
     token_limit: int | None,
     time_limit: int | None,
@@ -592,7 +601,6 @@ def eval_set_command(
     logit_bias: str | None,
     seed: int | None,
     stop_seqs: str | None,
-    suffix: str | None,
     temperature: float | None,
     top_p: float | None,
     top_k: int | None,
@@ -604,6 +612,7 @@ def eval_set_command(
    max_tool_output: int | None,
    cache_prompt: str | None,
    reasoning_effort: str | None,
+   reasoning_history: bool | None,
    message_limit: int | None,
    token_limit: int | None,
    time_limit: int | None,
@@ -842,6 +851,9 @@ def config_from_locals(locals: dict[str, Any]) -> GenerateConfigArgs:
         if key == "internal_tools":
             if value is not False:
                 value = None
+        if key == "reasoning_history":
+            if value is not False:
+                value = None
         config[key] = value  # type: ignore
     return config

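The new `--reasoning-history/--no-reasoning-history` flag maps onto a `reasoning_history` generation option (see `_generate_config.py` in the file list above). A minimal sketch of setting it from the Python API, assuming the field is exposed on `GenerateConfig` as the CLI mapping suggests:

```python
# Sketch only: reasoning_history is assumed to be a GenerateConfig field in
# 0.3.62, mirroring the --no-reasoning-history CLI flag added above.
from inspect_ai.model import GenerateConfig

config = GenerateConfig(
    reasoning_effort="high",    # existing option (OpenAI o1 models)
    reasoning_history=False,    # new: omit reasoning from subsequent generate calls
)
```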
inspect_ai/_cli/view.py
CHANGED
@@ -63,6 +63,10 @@ def start(
     INSPECT_VIEW_AUTHORIZATION_TOKEN = "INSPECT_VIEW_AUTHORIZATION_TOKEN"
     authorization = os.environ.get(INSPECT_VIEW_AUTHORIZATION_TOKEN, None)
     if authorization:
+        # this indicates we are in vscode -- we want to set the log level to HTTP
+        # in vscode, updated versions of the extension do this but we set it
+        # manually here as a temporary bridge for running against older versions
+        common["log_level"] = "HTTP"
         del os.environ[INSPECT_VIEW_AUTHORIZATION_TOKEN]
         os.unsetenv(INSPECT_VIEW_AUTHORIZATION_TOKEN)

inspect_ai/_display/textual/widgets/transcript.py
CHANGED
@@ -15,6 +15,7 @@ from inspect_ai._util.transcript import (
     set_transcript_markdown_options,
     transcript_function,
     transcript_markdown,
+    transcript_reasoning,
     transcript_separator,
 )
 from inspect_ai.log._samples import ActiveSample
@@ -33,7 +34,11 @@ from inspect_ai.log._transcript import (
     SubtaskEvent,
     ToolEvent,
 )
-from inspect_ai.model._chat_message import ChatMessage, ChatMessageUser
+from inspect_ai.model._chat_message import (
+    ChatMessage,
+    ChatMessageAssistant,
+    ChatMessageUser,
+)
 from inspect_ai.model._render import messages_preceding_assistant
 from inspect_ai.tool._tool import ToolResult
 from inspect_ai.tool._tool_transcript import transcript_tool_call
@@ -171,8 +176,8 @@ def render_model_event(event: ModelEvent) -> EventDisplay:
     # content
     content: list[RenderableType] = []

-    def append_message(message: ChatMessage, text: str | None = None) -> None:
-        content.extend(render_message(message, text))
+    def append_message(message: ChatMessage) -> None:
+        content.extend(render_message(message))

     # render preceding messages
     preceding = messages_preceding_assistant(event.input)
@@ -309,16 +314,17 @@ def render_as_json(json: Any) -> RenderableType:
     )


-def render_message(
-    message: ChatMessage, text: str | None = None
-) -> list[RenderableType]:
+def render_message(message: ChatMessage) -> list[RenderableType]:
     content: list[RenderableType] = [
         Text(message.role.capitalize(), style="bold"),
         Text(),
     ]
-    text = text or message.text
-    if text:
-        content.extend([transcript_markdown(text.strip(), escape=True)])
+
+    if isinstance(message, ChatMessageAssistant) and message.reasoning:
+        content.extend(transcript_reasoning(message.reasoning))
+
+    if message.text:
+        content.extend([transcript_markdown(message.text.strip(), escape=True)])
     return content

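The widget changes assume assistant messages can now carry a `reasoning` field (`ChatMessageAssistant` gains it in `_chat_message.py` per the file list). A small illustrative sketch of the kind of message `render_message` now handles; the field name follows this diff and is not a stability guarantee:

```python
# Illustrative only: a ChatMessageAssistant carrying reasoning, which
# render_message now renders (via transcript_reasoning) ahead of the text.
from inspect_ai.model import ChatMessageAssistant

message = ChatMessageAssistant(
    content="The answer is 42.",
    reasoning="Recall the question about life, the universe, and everything...",
)
```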
inspect_ai/_eval/task/error.py
CHANGED
@@ -8,28 +8,24 @@ class SampleErrorHandler:
         self.fail_on_error = True if fail_on_error is None else fail_on_error
         self.total_samples = float(total_samples)

-    def __call__(self, ex: BaseException) -> EvalError:
+    def __call__(self, ex: BaseException) -> tuple[EvalError, BaseException | None]:
         # increment error count
         self.error_count += 1

         # create error (we may return it)
-        def sample_error() -> EvalError:
-            return eval_error(ex, type(ex), ex, ex.__traceback__)
+        def sample_error(
+            *, raise_error: bool
+        ) -> tuple[EvalError, BaseException | None]:
+            return eval_error(
+                ex, type(ex), ex, ex.__traceback__
+            ), ex if raise_error else None

         # check against limits
         if isinstance(self.fail_on_error, bool):
-            if self.fail_on_error:
-                raise ex
-            else:
-                return sample_error()
+            return sample_error(raise_error=self.fail_on_error)
         else:
             if self.fail_on_error < 1:
                 max_errors = self.fail_on_error * self.total_samples
-                if self.error_count >= max_errors:
-                    raise ex
-                else:
-                    return sample_error()
-            elif self.error_count >= self.fail_on_error:
-                raise ex
+                return sample_error(raise_error=self.error_count >= max_errors)
             else:
-                return sample_error()
+                return sample_error(raise_error=self.error_count >= self.fail_on_error)
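`SampleErrorHandler` now returns the error together with an optional exception to re-raise, instead of raising immediately. A toy sketch of the new contract (the handler below is hypothetical; `EvalError` fields assumed from `inspect_ai.log`):

```python
# Hypothetical sketch of the (EvalError, exception-or-None) contract; callers
# defer the raise, as task_run_sample now does.
from inspect_ai.log import EvalError

def toy_handler(ex: BaseException, fail: bool) -> tuple[EvalError, BaseException | None]:
    error = EvalError(message=str(ex), traceback="", traceback_ansi="")
    return error, (ex if fail else None)

error, raise_error = toy_handler(ValueError("sample failed"), fail=False)
if raise_error is not None:
    raise raise_error  # only raised once the sample has been completed/logged
```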
inspect_ai/_eval/task/generate.py
CHANGED
@@ -8,6 +8,7 @@ from inspect_ai.model import (
 )
 from inspect_ai.model._cache import epoch
 from inspect_ai.solver import TaskState
+from inspect_ai.solver._limit import SampleLimitExceededError
 from inspect_ai.tool import ToolFunction


@@ -21,45 +22,50 @@ async def task_generate(
     # track tool_choice (revert to "auto" after first forced call of a tool)
     tool_choice = state.tool_choice

-    while True:
-        # If we don't update the epoch here as we go, it's entirely possible
-        # we'd cache the same response for every single epoch, which would
-        # completely defeat the point!
-        epoch.set(state.epoch)
+    try:
+        while True:
+            # If we don't update the epoch here as we go, it's entirely possible
+            # we'd cache the same response for every single epoch, which would
+            # completely defeat the point!
+            epoch.set(state.epoch)

-        # call the model
-        state.output = await model.generate(
-            input=state.messages,
-            tools=state.tools,
-            tool_choice=tool_choice,
-            config=config,
-            cache=cache,
-        )
+            # call the model
+            state.output = await model.generate(
+                input=state.messages,
+                tools=state.tools,
+                tool_choice=tool_choice,
+                config=config,
+                cache=cache,
+            )

-        # append the assistant message
-        message = state.output.message
-        state.messages.append(message)
+            # append the assistant message
+            message = state.output.message
+            state.messages.append(message)

-        # check for completed
-        if state.completed:
-            return state
+            # check for completed
+            if state.completed:
+                return state

-        # resolve tool calls if necessary
-        if tool_calls != "none" and message.tool_calls:
-            # call tools and append messages to state
-            state.messages.extend(
-                await call_tools(message, state.tools, config.max_tool_output)
-            )
+            # resolve tool calls if necessary
+            if tool_calls != "none" and message.tool_calls:
+                # call tools and append messages to state
+                state.messages.extend(
+                    await call_tools(message, state.tools, config.max_tool_output)
+                )

-            # check for completed or only executing a single tool call
-            if state.completed or tool_calls == "single":
-                return state
+                # check for completed or only executing a single tool call
+                if state.completed or tool_calls == "single":
+                    return state
+
+                # if a tool_call was forced set tool_choice to 'auto'
+                # (otherwise it will get forced over and over again)
+                if isinstance(tool_choice, ToolFunction):
+                    tool_choice = "auto"

-            # if a tool_call was forced set tool_choice to 'auto'
-            # (otherwise it will get forced over and over again)
-            if isinstance(tool_choice, ToolFunction):
-                tool_choice = "auto"
+            # no tool calls or not resolving tool calls, we are done!
+            else:
+                return state

-        # no tool calls or not resolving tool calls, we are done!
-        else:
-            return state
+    # propagate current state along with sample limit exceeded
+    except SampleLimitExceededError as ex:
+        raise ex.with_state(state)
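With the limit error now carrying the `TaskState` reached so far, callers can recover the partial conversation. A hedged sketch of consuming that (import path and `state` attribute per this diff; the wrapper itself is hypothetical):

```python
# Sketch: recover the partial TaskState when a sample limit interrupts generate.
from inspect_ai.solver import Generate, TaskState
from inspect_ai.solver._limit import SampleLimitExceededError

async def generate_with_partial_state(state: TaskState, generate: Generate) -> TaskState:
    try:
        return await generate(state)
    except SampleLimitExceededError as ex:
        # ex.state is attached by task_generate via with_state()
        return ex.state or state
```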
inspect_ai/_eval/task/run.py
CHANGED
@@ -75,9 +75,9 @@ from inspect_ai.scorer._scorer import unique_scorer_name
 from inspect_ai.solver import Generate, Plan, TaskState
 from inspect_ai.solver._chain import Chain, unroll
 from inspect_ai.solver._fork import set_task_generate
+from inspect_ai.solver._limit import SampleLimitExceededError
 from inspect_ai.solver._solver import Solver
 from inspect_ai.solver._task_state import sample_state, set_sample_state, state_jsonable
-from inspect_ai.util._limit import SampleLimitExceededError
 from inspect_ai.util._sandbox.context import sandbox_connections
 from inspect_ai.util._sandbox.environment import SandboxEnvironmentSpec
 from inspect_ai.util._subtask import init_subtask
@@ -402,7 +402,13 @@ async def task_run(options: TaskRunOptions) -> EvalLog:
             view_notify_eval(logger.location)

         try:
-            await send_telemetry("eval_log", eval_log_json_str(eval_log))
+            if (
+                await send_telemetry("eval_log_location", eval_log.location)
+                == "not_handled"
+            ):
+                # Converting the eval log to JSON is expensive. Only do so if
+                # eval_log_location was not handled.
+                await send_telemetry("eval_log", eval_log_json_str(eval_log))
         except Exception as ex:
             py_logger.warning(
                 f"Error occurred sending telemetry: {exception_message(ex)}"
@@ -490,7 +496,7 @@ async def task_run_sample(
     logger: TaskLogger | None,
     log_images: bool,
     sample_source: EvalSampleSource | None,
-    sample_error: Callable[[BaseException], EvalError],
+    sample_error: SampleErrorHandler,
     sample_complete: Callable[[dict[str, SampleScore]], None],
     fails_on_error: bool,
     time_limit: int | None,
@@ -542,12 +548,12 @@ async def task_run_sample(
         )

     # helper to handle exceptions (will throw if we've exceeded the limit)
-    def handle_error(ex: BaseException) -> EvalError:
+    def handle_error(ex: BaseException) -> tuple[EvalError, BaseException | None]:
         err = sample_error(ex)
         py_logger.warning(
             f"Sample error (id: {sample.id}, epoch: {state.epoch}): {exception_message(ex)})"
         )
-        transcript()._event(ErrorEvent(error=err))
+        transcript()._event(ErrorEvent(error=err[0]))
         return err

     # solver loop
@@ -566,6 +572,7 @@ async def task_run_sample(
         ) as active,
     ):
         error: EvalError | None = None
+        raise_error: BaseException | None = None
         results: dict[str, SampleScore] = {}
         try:
             async with sandboxenv_cm:
@@ -634,7 +641,7 @@ async def task_run_sample(
                                 state = sample_state() or state
                             case "error":
                                 # default error handling
-                                error = handle_error(ex)
+                                error, raise_error = handle_error(ex)

                     else:
                         raise
@@ -650,11 +657,11 @@ async def task_run_sample(
                     )

                 # capture most recent state for scoring
-                state = sample_state() or state
+                state = ex.state or sample_state() or state
                 state.completed = True

             except BaseException as ex:
-                error = handle_error(ex)
+                error, raise_error = handle_error(ex)

             # set timeout for scoring. if the original timeout was hit we still
             # want to provide opportunity for scoring, but we don't necessarily
@@ -731,11 +738,10 @@ async def task_run_sample(
                 )

                 # handle error (this will throw if we've exceeded the limit)
-                error = handle_error(ex)
+                error, raise_error = handle_error(ex)

-            except Exception as ex:
-                # handle error (this will throw if we've exceeded the limit)
-                error = handle_error(ex)
+            except Exception as ex:
+                error, raise_error = handle_error(ex)

             # complete the sample
             progress(SAMPLE_TOTAL_PROGRESS_UNITS)
@@ -766,6 +772,8 @@ async def task_run_sample(
         if results is not None:
             sample_complete(results)
             return results
+        elif raise_error:
+            raise raise_error
         else:
             return None

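The telemetry change prefers the cheap `eval_log_location` event and only serializes the full log when nothing handled it. A small sketch of that fallback using the updated `send_telemetry` return values (private module path per this diff; the wrapper and `to_json` callback are hypothetical stand-ins for `eval_log_json_str`):

```python
# Sketch of the location-first fallback; to_json stands in for the expensive
# full-log serialization used in task_run above.
from typing import Callable

from inspect_ai._util.hooks import send_telemetry

async def notify_eval_complete(log_location: str, to_json: Callable[[], str]) -> None:
    if await send_telemetry("eval_log_location", log_location) == "not_handled":
        # only pay the JSON serialization cost when a subscriber needs it
        await send_telemetry("eval_log", to_json())
```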
inspect_ai/_util/hooks.py
CHANGED
@@ -17,19 +17,29 @@ from .error import PrerequisiteError
 #
 # Telemetry can be optionally enabled by setting an INSPECT_TELEMETRY
 # environment variable that points to a function in a package which
-# conforms to the TelemetrySend signature below.
+# conforms to the TelemetrySend signature below. A return value of True
+# indicates that the telemetry event was handled.

-# There are currently two types of telemetry sent:
-# - model_usage (JSON string of the model usage)
-# - eval_log (JSON string of the eval log)
+# There are currently three types of telemetry sent:
+# - model_usage (JSON string of the model usage)
+# - eval_log_location (file path or URL string of the eval log)
+# - eval_log (JSON string of the eval log)
+#   [only sent if eval_log_location unhandled]
+# The eval_log_location type is preferred over eval_log as it means we can take
+# advantage of the .eval format and avoid loading the whole log into memory.

-TelemetrySend = Callable[[str, str], Awaitable[None]]
+TelemetrySend = Callable[[str, str], Awaitable[bool]]


-async def send_telemetry(type: Literal["model_usage", "eval_log"], json: str) -> None:
+async def send_telemetry(
+    type: Literal["model_usage", "eval_log", "eval_log_location"], json: str
+) -> Literal["handled", "not_handled", "no_subscribers"]:
     global _send_telemetry
     if _send_telemetry:
-        await _send_telemetry(type, json)
+        if await _send_telemetry(type, json):
+            return "handled"
+        return "not_handled"
+    return "no_subscribers"


 _send_telemetry: TelemetrySend | None = None
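Under the updated `TelemetrySend` signature, a subscriber signals that it handled an event by returning `True`. A minimal sketch of such a handler (the wiring via the `INSPECT_TELEMETRY` environment variable is per the comment above; the body is illustrative only):

```python
# Illustrative subscriber: handle the lightweight eval_log_location event and
# decline everything else so inspect can fall back (e.g. to sending eval_log).
handled_locations: list[str] = []

async def send_telemetry(type: str, json: str) -> bool:
    if type == "eval_log_location":
        handled_locations.append(json)
        return True
    return False
```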
inspect_ai/_util/transcript.py
CHANGED
@@ -111,6 +111,17 @@ def transcript_panel(
     )


+def transcript_reasoning(reasoning: str) -> list[RenderableType]:
+    content: list[RenderableType] = []
+    content.append(
+        transcript_markdown(
+            f"**<think>** \n{reasoning} \n**</think>**\n\n", escape=True
+        )
+    )
+    content.append(Text())
+    return content
+
+
 def transcript_separator(title: str, color: str) -> RenderableType:
     return Rule(title=title, style=f"{color} bold", align="center", end="\n\n")

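A short usage sketch of the new helper: it returns rich renderables that wrap the reasoning in `<think>` markers and can be printed directly with a rich `Console` (private import path per this diff):

```python
from rich.console import Console

from inspect_ai._util.transcript import transcript_reasoning

console = Console()
for renderable in transcript_reasoning("First consider the constraints, then..."):
    console.print(renderable)
```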
inspect_ai/_view/www/dist/assets/index.css
CHANGED
@@ -15735,6 +15735,7 @@ pre.ap-terminal.ap-cursor-on .ap-line .ap-cursor.ap-inverse {
 }
 pre.ap-terminal:not(.ap-blink) .ap-line .ap-blink {
   color: transparent;
+  border-color: transparent;
 }
 pre.ap-terminal .ap-bright {
   font-weight: bold;