inspect-ai 0.3.60__py3-none-any.whl → 0.3.62__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. inspect_ai/_cli/eval.py +13 -1
  2. inspect_ai/_cli/view.py +4 -0
  3. inspect_ai/_display/textual/widgets/transcript.py +15 -9
  4. inspect_ai/_eval/task/error.py +10 -14
  5. inspect_ai/_eval/task/generate.py +41 -35
  6. inspect_ai/_eval/task/run.py +20 -12
  7. inspect_ai/_util/hooks.py +17 -7
  8. inspect_ai/_util/transcript.py +11 -0
  9. inspect_ai/_view/www/dist/assets/index.css +1 -0
  10. inspect_ai/_view/www/dist/assets/index.js +100 -94
  11. inspect_ai/_view/www/log-schema.json +35 -19
  12. inspect_ai/_view/www/package.json +1 -1
  13. inspect_ai/_view/www/src/components/ChatView.mjs +23 -0
  14. inspect_ai/_view/www/src/types/log.d.ts +6 -4
  15. inspect_ai/log/_recorders/eval.py +1 -1
  16. inspect_ai/model/_chat_message.py +29 -2
  17. inspect_ai/model/_conversation.py +10 -3
  18. inspect_ai/model/_generate_config.py +6 -0
  19. inspect_ai/model/_model.py +164 -25
  20. inspect_ai/model/_openai.py +33 -1
  21. inspect_ai/model/_providers/anthropic.py +12 -3
  22. inspect_ai/model/_providers/groq.py +4 -0
  23. inspect_ai/model/_providers/openai.py +21 -9
  24. inspect_ai/model/_providers/providers.py +1 -1
  25. inspect_ai/model/_reasoning.py +17 -0
  26. inspect_ai/solver/__init__.py +2 -0
  27. inspect_ai/solver/_basic_agent.py +78 -58
  28. inspect_ai/{util → solver}/_limit.py +13 -0
  29. inspect_ai/solver/_task_state.py +37 -7
  30. inspect_ai/tool/_tools/_web_browser/_web_browser.py +3 -1
  31. inspect_ai/tool/beta/_computer/_resources/Dockerfile +5 -3
  32. inspect_ai/tool/beta/_computer/_resources/entrypoint/x11vnc_startup.sh +1 -1
  33. inspect_ai/tool/beta/_computer/_resources/image_home_dir/.config/Code/User/globalStorage/state.vscdb +0 -0
  34. inspect_ai/tool/beta/_computer/_resources/image_home_dir/.config/Code/User/settings.json +3 -0
  35. inspect_ai/tool/beta/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-panel.xml +61 -0
  36. inspect_ai/tool/beta/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-screensaver.xml +10 -0
  37. inspect_ai/tool/beta/_computer/_resources/image_home_dir/Desktop/Terminal.desktop +10 -0
  38. inspect_ai/util/__init__.py +0 -2
  39. inspect_ai/util/_sandbox/self_check.py +51 -28
  40. {inspect_ai-0.3.60.dist-info → inspect_ai-0.3.62.dist-info}/METADATA +2 -2
  41. {inspect_ai-0.3.60.dist-info → inspect_ai-0.3.62.dist-info}/RECORD +45 -40
  42. inspect_ai/tool/beta/_computer/_resources/image_home_dir/Desktop/XPaint.desktop +0 -10
  43. {inspect_ai-0.3.60.dist-info → inspect_ai-0.3.62.dist-info}/LICENSE +0 -0
  44. {inspect_ai-0.3.60.dist-info → inspect_ai-0.3.62.dist-info}/WHEEL +0 -0
  45. {inspect_ai-0.3.60.dist-info → inspect_ai-0.3.62.dist-info}/entry_points.txt +0 -0
  46. {inspect_ai-0.3.60.dist-info → inspect_ai-0.3.62.dist-info}/top_level.txt +0 -0
inspect_ai/_cli/eval.py CHANGED
@@ -385,6 +385,14 @@ def eval_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
         help="Constrains effort on reasoning for reasoning models. Open AI o1 models only.",
         envvar="INSPECT_EVAL_REASONING_EFFORT",
     )
+    @click.option(
+        "--reasoning-history/--no-reasoning-history",
+        type=bool,
+        is_flag=True,
+        default=True,
+        help="Include reasoning in chat message history sent to generate.",
+        envvar="INSPECT_EVAL_REASONING_HISTORY",
+    )
     @click.option(
         "--log-format",
         type=click.Choice(["eval", "json"], case_sensitive=False),
@@ -444,6 +452,7 @@ def eval_command(
     max_tool_output: int | None,
     cache_prompt: str | None,
     reasoning_effort: str | None,
+    reasoning_history: bool | None,
     message_limit: int | None,
     token_limit: int | None,
     time_limit: int | None,
@@ -592,7 +601,6 @@ def eval_set_command(
     logit_bias: str | None,
     seed: int | None,
     stop_seqs: str | None,
-    suffix: str | None,
     temperature: float | None,
     top_p: float | None,
     top_k: int | None,
@@ -604,6 +612,7 @@ def eval_set_command(
     max_tool_output: int | None,
     cache_prompt: str | None,
     reasoning_effort: str | None,
+    reasoning_history: bool | None,
     message_limit: int | None,
     token_limit: int | None,
     time_limit: int | None,
@@ -842,6 +851,9 @@ def config_from_locals(locals: dict[str, Any]) -> GenerateConfigArgs:
         if key == "internal_tools":
            if value is not False:
                value = None
+        if key == "reasoning_history":
+            if value is not False:
+                value = None
         config[key] = value  # type: ignore
     return config

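The tri-state handling added to config_from_locals mirrors the existing internal_tools logic: a flag that defaults to True is only recorded in GenerateConfigArgs when the user explicitly disabled it, so an unset flag never overrides task or model defaults. A minimal sketch of that normalization (a hypothetical standalone helper, not the packaged function):

# sketch only: an explicit False survives, True/None collapse to None so the
# key does not override defaults set elsewhere
def normalize_default_true_flag(value: bool | None) -> bool | None:
    return value if value is False else None

assert normalize_default_true_flag(True) is None
assert normalize_default_true_flag(None) is None
assert normalize_default_true_flag(False) is False
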
inspect_ai/_cli/view.py CHANGED
@@ -63,6 +63,10 @@ def start(
     INSPECT_VIEW_AUTHORIZATION_TOKEN = "INSPECT_VIEW_AUTHORIZATION_TOKEN"
     authorization = os.environ.get(INSPECT_VIEW_AUTHORIZATION_TOKEN, None)
     if authorization:
+        # this indicates we are in vscode -- we want to set the log level to HTTP
+        # in vscode, updated versions of the extension do this but we set it
+        # manually here as a temporary bridge for running against older versions
+        common["log_level"] = "HTTP"
         del os.environ[INSPECT_VIEW_AUTHORIZATION_TOKEN]
         os.unsetenv(INSPECT_VIEW_AUTHORIZATION_TOKEN)

inspect_ai/_display/textual/widgets/transcript.py CHANGED
@@ -15,6 +15,7 @@ from inspect_ai._util.transcript import (
     set_transcript_markdown_options,
     transcript_function,
     transcript_markdown,
+    transcript_reasoning,
     transcript_separator,
 )
 from inspect_ai.log._samples import ActiveSample
@@ -33,7 +34,11 @@ from inspect_ai.log._transcript import (
     SubtaskEvent,
     ToolEvent,
 )
-from inspect_ai.model._chat_message import ChatMessage, ChatMessageUser
+from inspect_ai.model._chat_message import (
+    ChatMessage,
+    ChatMessageAssistant,
+    ChatMessageUser,
+)
 from inspect_ai.model._render import messages_preceding_assistant
 from inspect_ai.tool._tool import ToolResult
 from inspect_ai.tool._tool_transcript import transcript_tool_call
@@ -171,8 +176,8 @@ def render_model_event(event: ModelEvent) -> EventDisplay:
     # content
     content: list[RenderableType] = []

-    def append_message(message: ChatMessage, text: str | None = None) -> None:
-        content.extend(render_message(message, text))
+    def append_message(message: ChatMessage) -> None:
+        content.extend(render_message(message))

     # render preceding messages
     preceding = messages_preceding_assistant(event.input)
@@ -309,16 +314,17 @@ def render_as_json(json: Any) -> RenderableType:
     )


-def render_message(
-    message: ChatMessage, text: str | None = None
-) -> list[RenderableType]:
+def render_message(message: ChatMessage) -> list[RenderableType]:
     content: list[RenderableType] = [
         Text(message.role.capitalize(), style="bold"),
         Text(),
     ]
-    text = text or message.text
-    if text:
-        content.extend([transcript_markdown(text.strip(), escape=True)])
+
+    if isinstance(message, ChatMessageAssistant) and message.reasoning:
+        content.extend(transcript_reasoning(message.reasoning))
+
+    if message.text:
+        content.extend([transcript_markdown(message.text.strip(), escape=True)])
     return content

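The widget now reads the reasoning attribute on assistant messages (added to ChatMessageAssistant elsewhere in this release) and renders it ahead of the message text. A minimal sketch of the data it consumes, assuming the attribute is a plain optional string settable at construction:

from inspect_ai.model import ChatMessageAssistant

# sketch only: an assistant message carrying a reasoning trace, the shape
# that render_message() above branches on
msg = ChatMessageAssistant(
    content="The answer is 42.",
    reasoning="Work backwards from the tool output to confirm the total...",
)
assert msg.reasoning  # rendered as a <think> block before msg.text
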
inspect_ai/_eval/task/error.py CHANGED
@@ -8,28 +8,24 @@ class SampleErrorHandler:
         self.fail_on_error = True if fail_on_error is None else fail_on_error
         self.total_samples = float(total_samples)

-    def __call__(self, ex: BaseException) -> EvalError:
+    def __call__(self, ex: BaseException) -> tuple[EvalError, BaseException | None]:
         # increment error count
         self.error_count += 1

         # create error (we may return it)
-        def sample_error() -> EvalError:
-            return eval_error(ex, type(ex), ex, ex.__traceback__)
+        def sample_error(
+            *, raise_error: bool
+        ) -> tuple[EvalError, BaseException | None]:
+            return eval_error(
+                ex, type(ex), ex, ex.__traceback__
+            ), ex if raise_error else None

         # check against limits
         if isinstance(self.fail_on_error, bool):
-            if self.fail_on_error:
-                raise ex
-            else:
-                return sample_error()
+            return sample_error(raise_error=self.fail_on_error)
         else:
             if self.fail_on_error < 1:
                 max_errors = self.fail_on_error * self.total_samples
-                if self.error_count >= max_errors:
-                    raise ex
-                else:
-                    return sample_error()
-            elif self.error_count >= self.fail_on_error:
-                raise ex
+                return sample_error(raise_error=self.error_count >= max_errors)
             else:
-                return sample_error()
+                return sample_error(raise_error=self.error_count >= self.fail_on_error)
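With this change SampleErrorHandler no longer raises when the error budget is exhausted; it always returns the EvalError together with the exception the caller should re-raise (or None). A minimal sketch of the new calling convention, assuming the constructor arguments implied above (fail_on_error, total_samples):

from inspect_ai._eval.task.error import SampleErrorHandler

# sketch: fail once 10% of 100 samples have errored (constructor args assumed)
handler = SampleErrorHandler(0.1, 100)

try:
    raise RuntimeError("sample blew up")
except RuntimeError as ex:
    error, raise_error = handler(ex)  # never raises itself any more
    if raise_error is not None:       # budget exceeded: propagate upward
        raise raise_error
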
inspect_ai/_eval/task/generate.py CHANGED
@@ -8,6 +8,7 @@ from inspect_ai.model import (
 )
 from inspect_ai.model._cache import epoch
 from inspect_ai.solver import TaskState
+from inspect_ai.solver._limit import SampleLimitExceededError
 from inspect_ai.tool import ToolFunction


@@ -21,45 +22,50 @@ async def task_generate(
     # track tool_choice (revert to "auto" after first forced call of a tool)
     tool_choice = state.tool_choice

-    while True:
-        # If we don't update the epoch here as we go, it's entirely possible
-        # we'd cache the same response for every single epoch, which would
-        # completely defeat the point!
-        epoch.set(state.epoch)
+    try:
+        while True:
+            # If we don't update the epoch here as we go, it's entirely possible
+            # we'd cache the same response for every single epoch, which would
+            # completely defeat the point!
+            epoch.set(state.epoch)

-        # call the model
-        state.output = await model.generate(
-            input=state.messages,
-            tools=state.tools,
-            tool_choice=tool_choice,
-            config=config,
-            cache=cache,
-        )
+            # call the model
+            state.output = await model.generate(
+                input=state.messages,
+                tools=state.tools,
+                tool_choice=tool_choice,
+                config=config,
+                cache=cache,
+            )

-        # append the assistant message
-        message = state.output.message
-        state.messages.append(message)
+            # append the assistant message
+            message = state.output.message
+            state.messages.append(message)

-        # check for completed
-        if state.completed:
-            return state
+            # check for completed
+            if state.completed:
+                return state

-        # resolve tool calls if necessary
-        if tool_calls != "none" and message.tool_calls:
-            # call tools and append messages to state
-            state.messages.extend(
-                await call_tools(message, state.tools, config.max_tool_output)
-            )
+            # resolve tool calls if necessary
+            if tool_calls != "none" and message.tool_calls:
+                # call tools and append messages to state
+                state.messages.extend(
+                    await call_tools(message, state.tools, config.max_tool_output)
+                )

-            # check for completed or only executing a single tool call
-            if state.completed or tool_calls == "single":
-                return state
+                # check for completed or only executing a single tool call
+                if state.completed or tool_calls == "single":
+                    return state
+
+                # if a tool_call was forced set tool_choice to 'auto'
+                # (otherwise it will get forced over and over again)
+                if isinstance(tool_choice, ToolFunction):
+                    tool_choice = "auto"

-            # if a tool_call was forced set tool_choice to 'auto'
-            # (otherwise it will get forced over and over again)
-            if isinstance(tool_choice, ToolFunction):
-                tool_choice = "auto"
+            # no tool calls or not resolving tool calls, we are done!
+            else:
+                return state

-        # no tool calls or not resolving tool calls, we are done!
-        else:
-            return state
+    # propagate current state along with sample limit exceeded
+    except SampleLimitExceededError as ex:
+        raise ex.with_state(state)
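task_generate now converts a SampleLimitExceededError into one that carries the in-progress state (via with_state), so the runner can still score whatever the sample produced before hitting the limit. A minimal sketch of the catching side, mirroring the task/run.py change below (the wrapper function shown is hypothetical):

from inspect_ai.solver._limit import SampleLimitExceededError

async def run_until_limit(solve, state):
    # hypothetical caller: recover the partially-completed state attached by
    # with_state() and mark it complete so scoring can still run
    try:
        return await solve(state)
    except SampleLimitExceededError as ex:
        state = ex.state or state
        state.completed = True
        return state
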
inspect_ai/_eval/task/run.py CHANGED
@@ -75,9 +75,9 @@ from inspect_ai.scorer._scorer import unique_scorer_name
 from inspect_ai.solver import Generate, Plan, TaskState
 from inspect_ai.solver._chain import Chain, unroll
 from inspect_ai.solver._fork import set_task_generate
+from inspect_ai.solver._limit import SampleLimitExceededError
 from inspect_ai.solver._solver import Solver
 from inspect_ai.solver._task_state import sample_state, set_sample_state, state_jsonable
-from inspect_ai.util._limit import SampleLimitExceededError
 from inspect_ai.util._sandbox.context import sandbox_connections
 from inspect_ai.util._sandbox.environment import SandboxEnvironmentSpec
 from inspect_ai.util._subtask import init_subtask
@@ -402,7 +402,13 @@ async def task_run(options: TaskRunOptions) -> EvalLog:
         view_notify_eval(logger.location)

         try:
-            await send_telemetry("eval_log", eval_log_json_str(eval_log))
+            if (
+                await send_telemetry("eval_log_location", eval_log.location)
+                == "not_handled"
+            ):
+                # Converting the eval log to JSON is expensive. Only do so if
+                # eval_log_location was not handled.
+                await send_telemetry("eval_log", eval_log_json_str(eval_log))
         except Exception as ex:
             py_logger.warning(
                 f"Error occurred sending telemetry: {exception_message(ex)}"
@@ -490,7 +496,7 @@ async def task_run_sample(
     logger: TaskLogger | None,
     log_images: bool,
     sample_source: EvalSampleSource | None,
-    sample_error: Callable[[BaseException], EvalError],
+    sample_error: SampleErrorHandler,
     sample_complete: Callable[[dict[str, SampleScore]], None],
     fails_on_error: bool,
     time_limit: int | None,
@@ -542,12 +548,12 @@ async def task_run_sample(
     )

     # helper to handle exceptions (will throw if we've exceeded the limit)
-    def handle_error(ex: BaseException) -> EvalError:
+    def handle_error(ex: BaseException) -> tuple[EvalError, BaseException | None]:
         err = sample_error(ex)
         py_logger.warning(
             f"Sample error (id: {sample.id}, epoch: {state.epoch}): {exception_message(ex)})"
         )
-        transcript()._event(ErrorEvent(error=err))
+        transcript()._event(ErrorEvent(error=err[0]))
         return err

     # solver loop
@@ -566,6 +572,7 @@ async def task_run_sample(
         ) as active,
     ):
         error: EvalError | None = None
+        raise_error: BaseException | None = None
         results: dict[str, SampleScore] = {}
         try:
             async with sandboxenv_cm:
@@ -634,7 +641,7 @@ async def task_run_sample(
                             state = sample_state() or state
                         case "error":
                             # default error handling
-                            error = handle_error(ex)
+                            error, raise_error = handle_error(ex)

                     else:
                         raise
@@ -650,11 +657,11 @@ async def task_run_sample(
                 )

                 # capture most recent state for scoring
-                state = sample_state() or state
+                state = ex.state or sample_state() or state
                 state.completed = True

             except BaseException as ex:
-                error = handle_error(ex)
+                error, raise_error = handle_error(ex)

             # set timeout for scoring. if the original timeout was hit we still
             # want to provide opportunity for scoring, but we don't necessarily
@@ -731,11 +738,10 @@ async def task_run_sample(
                 )

                 # handle error (this will throw if we've exceeded the limit)
-                error = handle_error(ex)
+                error, raise_error = handle_error(ex)

-        # handle sandboxenv init errors
-        except BaseException as ex:
-            error = handle_error(ex)
+        except Exception as ex:
+            error, raise_error = handle_error(ex)

     # complete the sample
     progress(SAMPLE_TOTAL_PROGRESS_UNITS)
@@ -766,6 +772,8 @@ async def task_run_sample(
     if results is not None:
         sample_complete(results)
         return results
+    elif raise_error:
+        raise raise_error
     else:
         return None

inspect_ai/_util/hooks.py CHANGED
@@ -17,19 +17,29 @@ from .error import PrerequisiteError
 #
 # Telemetry can be optionally enabled by setting an INSPECT_TELEMETRY
 # environment variable that points to a function in a package which
-# conforms to the TelemetrySend signature below.
+# conforms to the TelemetrySend signature below. A return value of True
+# indicates that the telemetry event was handled.

-# There are currently two types of telemetry sent:
-# - model_usage (type ModelUsage)
-# - eval_log (type EvalLog)
+# There are currently three types of telemetry sent:
+# - model_usage (JSON string of the model usage)
+# - eval_log_location (file path or URL string of the eval log)
+# - eval_log (JSON string of the eval log)
+#   [only sent if eval_log_location unhandled]
+# The eval_log_location type is preferred over eval_log as it means we can take
+# advantage of the .eval format and avoid loading the whole log into memory.

-TelemetrySend = Callable[[str, str], Awaitable[None]]
+TelemetrySend = Callable[[str, str], Awaitable[bool]]


-async def send_telemetry(type: Literal["model_usage", "eval_log"], json: str) -> None:
+async def send_telemetry(
+    type: Literal["model_usage", "eval_log", "eval_log_location"], json: str
+) -> Literal["handled", "not_handled", "no_subscribers"]:
     global _send_telemetry
     if _send_telemetry:
-        await _send_telemetry(type, json)
+        if await _send_telemetry(type, json):
+            return "handled"
+        return "not_handled"
+    return "no_subscribers"


 _send_telemetry: TelemetrySend | None = None
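Under the updated contract a telemetry subscriber returns a bool: True means the event was handled, and for eval_log_location that lets Inspect skip serializing the full log. A minimal sketch of a conforming subscriber (the function name is arbitrary; it is whatever INSPECT_TELEMETRY points at):

# sketch of a subscriber matching TelemetrySend = Callable[[str, str], Awaitable[bool]]
async def on_telemetry(type: str, json: str) -> bool:
    if type == "eval_log_location":
        print(f"eval log available at {json}")  # json is a path/URL string here
        return True   # handled: the expensive "eval_log" payload is skipped
    return False      # not handled: Inspect falls back to sending "eval_log"
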
inspect_ai/_util/transcript.py CHANGED
@@ -111,6 +111,17 @@ def transcript_panel(
     )


+def transcript_reasoning(reasoning: str) -> list[RenderableType]:
+    content: list[RenderableType] = []
+    content.append(
+        transcript_markdown(
+            f"**<think>** \n{reasoning} \n**</think>**\n\n", escape=True
+        )
+    )
+    content.append(Text())
+    return content
+
+
 def transcript_separator(title: str, color: str) -> RenderableType:
     return Rule(title=title, style=f"{color} bold", align="center", end="\n\n")

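transcript_reasoning returns plain rich renderables, so it can be previewed outside the textual widget. A small usage sketch (rich console assumed, reasoning text invented):

from rich.console import Console
from inspect_ai._util.transcript import transcript_reasoning

# sketch: render a reasoning trace the same way the transcript widget does
console = Console()
for renderable in transcript_reasoning("First check the tool output, then compare totals."):
    console.print(renderable)
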
inspect_ai/_view/www/dist/assets/index.css CHANGED
@@ -15735,6 +15735,7 @@ pre.ap-terminal.ap-cursor-on .ap-line .ap-cursor.ap-inverse {
 }
 pre.ap-terminal:not(.ap-blink) .ap-line .ap-blink {
   color: transparent;
+  border-color: transparent;
 }
 pre.ap-terminal .ap-bright {
   font-weight: bold;