inspect-ai 0.3.61__py3-none-any.whl → 0.3.62__py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (34)
  1. inspect_ai/_cli/eval.py +13 -0
  2. inspect_ai/_cli/view.py +4 -0
  3. inspect_ai/_display/textual/widgets/transcript.py +15 -9
  4. inspect_ai/_eval/task/error.py +10 -14
  5. inspect_ai/_eval/task/run.py +10 -8
  6. inspect_ai/_util/transcript.py +11 -0
  7. inspect_ai/_view/www/dist/assets/index.css +1 -0
  8. inspect_ai/_view/www/dist/assets/index.js +100 -94
  9. inspect_ai/_view/www/log-schema.json +35 -19
  10. inspect_ai/_view/www/src/components/ChatView.mjs +23 -0
  11. inspect_ai/_view/www/src/types/log.d.ts +6 -4
  12. inspect_ai/log/_recorders/eval.py +1 -1
  13. inspect_ai/model/_chat_message.py +27 -0
  14. inspect_ai/model/_conversation.py +10 -3
  15. inspect_ai/model/_generate_config.py +6 -0
  16. inspect_ai/model/_model.py +74 -0
  17. inspect_ai/model/_openai.py +33 -1
  18. inspect_ai/model/_providers/anthropic.py +12 -0
  19. inspect_ai/model/_providers/groq.py +4 -0
  20. inspect_ai/model/_providers/openai.py +21 -9
  21. inspect_ai/model/_providers/providers.py +1 -1
  22. inspect_ai/model/_reasoning.py +17 -0
  23. inspect_ai/solver/_basic_agent.py +19 -9
  24. inspect_ai/tool/beta/_computer/_resources/Dockerfile +4 -0
  25. inspect_ai/tool/beta/_computer/_resources/image_home_dir/.config/Code/User/globalStorage/state.vscdb +0 -0
  26. inspect_ai/tool/beta/_computer/_resources/image_home_dir/.config/Code/User/settings.json +3 -0
  27. inspect_ai/tool/beta/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-panel.xml +61 -0
  28. inspect_ai/tool/beta/_computer/_resources/image_home_dir/Desktop/Terminal.desktop +10 -0
  29. {inspect_ai-0.3.61.dist-info → inspect_ai-0.3.62.dist-info}/METADATA +1 -1
  30. {inspect_ai-0.3.61.dist-info → inspect_ai-0.3.62.dist-info}/RECORD +34 -29
  31. {inspect_ai-0.3.61.dist-info → inspect_ai-0.3.62.dist-info}/LICENSE +0 -0
  32. {inspect_ai-0.3.61.dist-info → inspect_ai-0.3.62.dist-info}/WHEEL +0 -0
  33. {inspect_ai-0.3.61.dist-info → inspect_ai-0.3.62.dist-info}/entry_points.txt +0 -0
  34. {inspect_ai-0.3.61.dist-info → inspect_ai-0.3.62.dist-info}/top_level.txt +0 -0
inspect_ai/_cli/eval.py CHANGED
@@ -385,6 +385,14 @@ def eval_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
         help="Constrains effort on reasoning for reasoning models. Open AI o1 models only.",
         envvar="INSPECT_EVAL_REASONING_EFFORT",
     )
+    @click.option(
+        "--reasoning-history/--no-reasoning-history",
+        type=bool,
+        is_flag=True,
+        default=True,
+        help="Include reasoning in chat message history sent to generate.",
+        envvar="INSPECT_EVAL_REASONING_HISTORY",
+    )
     @click.option(
         "--log-format",
         type=click.Choice(["eval", "json"], case_sensitive=False),
@@ -444,6 +452,7 @@ def eval_command(
     max_tool_output: int | None,
     cache_prompt: str | None,
     reasoning_effort: str | None,
+    reasoning_history: bool | None,
     message_limit: int | None,
     token_limit: int | None,
     time_limit: int | None,
@@ -603,6 +612,7 @@ def eval_set_command(
     max_tool_output: int | None,
     cache_prompt: str | None,
     reasoning_effort: str | None,
+    reasoning_history: bool | None,
     message_limit: int | None,
     token_limit: int | None,
     time_limit: int | None,
@@ -841,6 +851,9 @@ def config_from_locals(locals: dict[str, Any]) -> GenerateConfigArgs:
             if key == "internal_tools":
                 if value is not False:
                     value = None
+            if key == "reasoning_history":
+                if value is not False:
+                    value = None
             config[key] = value  # type: ignore
     return config
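The files-changed list above shows inspect_ai/model/_generate_config.py gaining a matching reasoning_history option, so the flag can presumably also be set programmatically. A minimal sketch, assuming reasoning_history is accepted as a generate config argument to eval():

    # Sketch only: assumes reasoning_history is a GenerateConfig field,
    # consistent with the _generate_config.py change listed in this diff.
    from inspect_ai import eval

    # the programmatic equivalent of --no-reasoning-history: omit prior
    # reasoning from the chat history sent to generate
    eval("task.py", model="openai/o1", reasoning_history=False)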
inspect_ai/_cli/view.py CHANGED
@@ -63,6 +63,10 @@ def start(
     INSPECT_VIEW_AUTHORIZATION_TOKEN = "INSPECT_VIEW_AUTHORIZATION_TOKEN"
     authorization = os.environ.get(INSPECT_VIEW_AUTHORIZATION_TOKEN, None)
     if authorization:
+        # this indicates we are in vscode -- we want to set the log level to HTTP
+        # in vscode, updated versions of the extension do this but we set it
+        # manually here as a temporary bridge for running against older versions
+        common["log_level"] = "HTTP"
         del os.environ[INSPECT_VIEW_AUTHORIZATION_TOKEN]
         os.unsetenv(INSPECT_VIEW_AUTHORIZATION_TOKEN)
 
inspect_ai/_display/textual/widgets/transcript.py CHANGED
@@ -15,6 +15,7 @@ from inspect_ai._util.transcript import (
     set_transcript_markdown_options,
     transcript_function,
     transcript_markdown,
+    transcript_reasoning,
     transcript_separator,
 )
 from inspect_ai.log._samples import ActiveSample
@@ -33,7 +34,11 @@ from inspect_ai.log._transcript import (
     SubtaskEvent,
     ToolEvent,
 )
-from inspect_ai.model._chat_message import ChatMessage, ChatMessageUser
+from inspect_ai.model._chat_message import (
+    ChatMessage,
+    ChatMessageAssistant,
+    ChatMessageUser,
+)
 from inspect_ai.model._render import messages_preceding_assistant
 from inspect_ai.tool._tool import ToolResult
 from inspect_ai.tool._tool_transcript import transcript_tool_call
@@ -171,8 +176,8 @@ def render_model_event(event: ModelEvent) -> EventDisplay:
     # content
     content: list[RenderableType] = []
 
-    def append_message(message: ChatMessage, text: str | None = None) -> None:
-        content.extend(render_message(message, text))
+    def append_message(message: ChatMessage) -> None:
+        content.extend(render_message(message))
 
     # render preceding messages
     preceding = messages_preceding_assistant(event.input)
@@ -309,16 +314,17 @@ def render_as_json(json: Any) -> RenderableType:
     )
 
 
-def render_message(
-    message: ChatMessage, text: str | None = None
-) -> list[RenderableType]:
+def render_message(message: ChatMessage) -> list[RenderableType]:
     content: list[RenderableType] = [
         Text(message.role.capitalize(), style="bold"),
         Text(),
     ]
-    text = text or message.text
-    if text:
-        content.extend([transcript_markdown(text.strip(), escape=True)])
+
+    if isinstance(message, ChatMessageAssistant) and message.reasoning:
+        content.extend(transcript_reasoning(message.reasoning))
+
+    if message.text:
+        content.extend([transcript_markdown(message.text.strip(), escape=True)])
     return content
 
 
inspect_ai/_eval/task/error.py CHANGED
@@ -8,28 +8,24 @@ class SampleErrorHandler:
         self.fail_on_error = True if fail_on_error is None else fail_on_error
         self.total_samples = float(total_samples)
 
-    def __call__(self, ex: BaseException) -> EvalError:
+    def __call__(self, ex: BaseException) -> tuple[EvalError, BaseException | None]:
         # increment error count
         self.error_count += 1
 
         # create error (we may return it)
-        def sample_error() -> EvalError:
-            return eval_error(ex, type(ex), ex, ex.__traceback__)
+        def sample_error(
+            *, raise_error: bool
+        ) -> tuple[EvalError, BaseException | None]:
+            return eval_error(
+                ex, type(ex), ex, ex.__traceback__
+            ), ex if raise_error else None
 
         # check against limits
         if isinstance(self.fail_on_error, bool):
-            if self.fail_on_error:
-                raise ex
-            else:
-                return sample_error()
+            return sample_error(raise_error=self.fail_on_error)
         else:
             if self.fail_on_error < 1:
                 max_errors = self.fail_on_error * self.total_samples
-                if self.error_count >= max_errors:
-                    raise ex
-                else:
-                    return sample_error()
-            elif self.error_count >= self.fail_on_error:
-                raise ex
+                return sample_error(raise_error=self.error_count >= max_errors)
             else:
-                return sample_error()
+                return sample_error(raise_error=self.error_count >= self.fail_on_error)
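Net effect of this change: SampleErrorHandler no longer raises from inside __call__; it returns the EvalError paired with an optional exception that the caller re-raises after sample bookkeeping completes. A minimal caller sketch (constructor arguments inferred from the __init__ body above; run_sample and record_error are hypothetical):

    # With fail_on_error=0.1 and 50 samples, errors are tolerated until
    # the error count reaches 0.1 * 50 = 5; from the fifth error onward
    # the handler returns a non-None exception to re-raise.
    handler = SampleErrorHandler(fail_on_error=0.1, total_samples=50)

    try:
        run_sample()  # hypothetical sample execution
    except BaseException as ex:
        error, raise_error = handler(ex)
        record_error(error)  # hypothetical bookkeeping with the EvalError
        if raise_error is not None:
            raise raise_error  # propagate only once the limit is hit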
inspect_ai/_eval/task/run.py CHANGED
@@ -496,7 +496,7 @@ async def task_run_sample(
     logger: TaskLogger | None,
     log_images: bool,
     sample_source: EvalSampleSource | None,
-    sample_error: Callable[[BaseException], EvalError],
+    sample_error: SampleErrorHandler,
     sample_complete: Callable[[dict[str, SampleScore]], None],
     fails_on_error: bool,
     time_limit: int | None,
@@ -548,12 +548,12 @@ async def task_run_sample(
     )
 
     # helper to handle exceptions (will throw if we've exceeded the limit)
-    def handle_error(ex: BaseException) -> EvalError:
+    def handle_error(ex: BaseException) -> tuple[EvalError, BaseException | None]:
         err = sample_error(ex)
         py_logger.warning(
             f"Sample error (id: {sample.id}, epoch: {state.epoch}): {exception_message(ex)})"
         )
-        transcript()._event(ErrorEvent(error=err))
+        transcript()._event(ErrorEvent(error=err[0]))
         return err
 
     # solver loop
@@ -572,6 +572,7 @@ async def task_run_sample(
         ) as active,
     ):
         error: EvalError | None = None
+        raise_error: BaseException | None = None
         results: dict[str, SampleScore] = {}
         try:
             async with sandboxenv_cm:
@@ -640,7 +641,7 @@ async def task_run_sample(
                             state = sample_state() or state
                         case "error":
                             # default error handling
-                            error = handle_error(ex)
+                            error, raise_error = handle_error(ex)
 
                 else:
                     raise
@@ -660,7 +661,7 @@ async def task_run_sample(
             state.completed = True
 
         except BaseException as ex:
-            error = handle_error(ex)
+            error, raise_error = handle_error(ex)
 
         # set timeout for scoring. if the original timeout was hit we still
         # want to provide opportunity for scoring, but we don't necessarily
@@ -737,11 +738,10 @@ async def task_run_sample(
             )
 
             # handle error (this will throw if we've exceeded the limit)
-            error = handle_error(ex)
+            error, raise_error = handle_error(ex)
 
-        # handle sandboxenv init errors
         except Exception as ex:
-            error = handle_error(ex)
+            error, raise_error = handle_error(ex)
 
     # complete the sample
     progress(SAMPLE_TOTAL_PROGRESS_UNITS)
@@ -772,6 +772,8 @@ async def task_run_sample(
     if results is not None:
         sample_complete(results)
         return results
+    elif raise_error:
+        raise raise_error
     else:
         return None
 
inspect_ai/_util/transcript.py CHANGED
@@ -111,6 +111,17 @@ def transcript_panel(
     )
 
 
+def transcript_reasoning(reasoning: str) -> list[RenderableType]:
+    content: list[RenderableType] = []
+    content.append(
+        transcript_markdown(
+            f"**<think>**  \n{reasoning}  \n**</think>**\n\n", escape=True
+        )
+    )
+    content.append(Text())
+    return content
+
+
 def transcript_separator(title: str, color: str) -> RenderableType:
     return Rule(title=title, style=f"{color} bold", align="center", end="\n\n")
 
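transcript_reasoning() renders the reasoning text as a bolded <think> block of transcript markdown followed by a blank line. A small usage sketch (the helper and its import path are as added above; printing the renderables via rich's Console is an assumption of this example):

    # Sketch only: render a reasoning block to the terminal with rich.
    from rich.console import Console

    from inspect_ai._util.transcript import transcript_reasoning

    console = Console()
    for renderable in transcript_reasoning("First, inspect the headers..."):
        console.print(renderable)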
inspect_ai/_view/www/dist/assets/index.css CHANGED
@@ -15735,6 +15735,7 @@ pre.ap-terminal.ap-cursor-on .ap-line .ap-cursor.ap-inverse {
 }
 pre.ap-terminal:not(.ap-blink) .ap-line .ap-blink {
   color: transparent;
+  border-color: transparent;
 }
 pre.ap-terminal .ap-bright {
   font-weight: bold;