inspect-ai 0.3.74__py3-none-any.whl → 0.3.76__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (115)
  1. inspect_ai/__init__.py +3 -2
  2. inspect_ai/_cli/cache.py +1 -1
  3. inspect_ai/_cli/common.py +15 -0
  4. inspect_ai/_cli/eval.py +4 -5
  5. inspect_ai/_cli/log.py +1 -1
  6. inspect_ai/_cli/sandbox.py +1 -1
  7. inspect_ai/_cli/trace.py +1 -1
  8. inspect_ai/_cli/view.py +1 -1
  9. inspect_ai/_display/core/config.py +3 -1
  10. inspect_ai/_eval/eval.py +55 -61
  11. inspect_ai/_eval/evalset.py +64 -154
  12. inspect_ai/_eval/loader.py +27 -54
  13. inspect_ai/_eval/registry.py +4 -15
  14. inspect_ai/_eval/run.py +7 -4
  15. inspect_ai/_eval/task/__init__.py +8 -2
  16. inspect_ai/_eval/task/log.py +9 -1
  17. inspect_ai/_eval/task/resolved.py +35 -0
  18. inspect_ai/_eval/task/run.py +4 -0
  19. inspect_ai/_eval/task/task.py +50 -69
  20. inspect_ai/_eval/task/tasks.py +30 -0
  21. inspect_ai/_util/constants.py +3 -0
  22. inspect_ai/_util/dotenv.py +17 -0
  23. inspect_ai/_util/logger.py +3 -0
  24. inspect_ai/_util/registry.py +43 -2
  25. inspect_ai/_view/server.py +28 -10
  26. inspect_ai/_view/www/dist/assets/index.css +32 -19
  27. inspect_ai/_view/www/dist/assets/index.js +17682 -29989
  28. inspect_ai/_view/www/log-schema.json +79 -9
  29. inspect_ai/_view/www/package.json +2 -2
  30. inspect_ai/_view/www/src/appearance/styles.ts +6 -5
  31. inspect_ai/_view/www/src/components/AnsiDisplay.tsx +2 -2
  32. inspect_ai/_view/www/src/constants.ts +3 -0
  33. inspect_ai/_view/www/src/logfile/remoteZipFile.ts +141 -20
  34. inspect_ai/_view/www/src/plan/PlanDetailView.tsx +2 -1
  35. inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +1 -1
  36. inspect_ai/_view/www/src/samples/chat/tools/tool.ts +7 -5
  37. inspect_ai/_view/www/src/samples/descriptor/score/CategoricalScoreDescriptor.tsx +1 -1
  38. inspect_ai/_view/www/src/samples/descriptor/score/NumericScoreDescriptor.tsx +2 -2
  39. inspect_ai/_view/www/src/samples/error/FlatSampleErrorView.module.css +1 -0
  40. inspect_ai/_view/www/src/samples/error/FlatSampleErrorView.tsx +3 -1
  41. inspect_ai/_view/www/src/samples/sample-tools/SortFilter.tsx +1 -1
  42. inspect_ai/_view/www/src/samples/sample-tools/sample-filter/SampleFilter.tsx +5 -2
  43. inspect_ai/_view/www/src/samples/transcript/ModelEventView.module.css +2 -2
  44. inspect_ai/_view/www/src/samples/transcript/state/StateEventView.tsx +5 -1
  45. inspect_ai/_view/www/src/types/log.d.ts +11 -5
  46. inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.tsx +17 -12
  47. inspect_ai/_view/www/src/workspace/sidebar/SidebarLogEntry.tsx +2 -1
  48. inspect_ai/_view/www/yarn.lock +12 -5
  49. inspect_ai/log/_log.py +10 -1
  50. inspect_ai/log/_recorders/eval.py +27 -8
  51. inspect_ai/log/_recorders/json.py +10 -2
  52. inspect_ai/log/_transcript.py +13 -4
  53. inspect_ai/model/_call_tools.py +13 -4
  54. inspect_ai/model/_chat_message.py +15 -1
  55. inspect_ai/model/_model.py +30 -12
  56. inspect_ai/model/_model_output.py +6 -1
  57. inspect_ai/model/_openai.py +11 -6
  58. inspect_ai/model/_providers/anthropic.py +167 -77
  59. inspect_ai/model/_providers/google.py +6 -2
  60. inspect_ai/model/_providers/none.py +31 -0
  61. inspect_ai/model/_providers/openai.py +11 -8
  62. inspect_ai/model/_providers/providers.py +7 -0
  63. inspect_ai/model/_providers/vertex.py +5 -2
  64. inspect_ai/solver/_bridge/bridge.py +1 -1
  65. inspect_ai/solver/_chain.py +7 -6
  66. inspect_ai/tool/__init__.py +4 -0
  67. inspect_ai/tool/_tool_call.py +5 -2
  68. inspect_ai/tool/_tool_support_helpers.py +200 -0
  69. inspect_ai/tool/_tools/_bash_session.py +119 -0
  70. inspect_ai/tool/_tools/_computer/_computer.py +1 -1
  71. inspect_ai/tool/_tools/_text_editor.py +121 -0
  72. inspect_ai/tool/_tools/_web_browser/_back_compat.py +150 -0
  73. inspect_ai/tool/_tools/_web_browser/_web_browser.py +75 -130
  74. inspect_ai/tool/_tools/_web_search.py +2 -2
  75. inspect_ai/util/_json.py +28 -0
  76. inspect_ai/util/_sandbox/context.py +18 -8
  77. inspect_ai/util/_sandbox/docker/config.py +1 -1
  78. inspect_ai/util/_sandbox/docker/internal.py +3 -3
  79. inspect_ai/util/_sandbox/environment.py +17 -2
  80. {inspect_ai-0.3.74.dist-info → inspect_ai-0.3.76.dist-info}/METADATA +8 -5
  81. {inspect_ai-0.3.74.dist-info → inspect_ai-0.3.76.dist-info}/RECORD +85 -108
  82. {inspect_ai-0.3.74.dist-info → inspect_ai-0.3.76.dist-info}/WHEEL +1 -1
  83. inspect_ai/tool/_tools/_web_browser/_resources/.pylintrc +0 -8
  84. inspect_ai/tool/_tools/_web_browser/_resources/.vscode/launch.json +0 -24
  85. inspect_ai/tool/_tools/_web_browser/_resources/.vscode/settings.json +0 -25
  86. inspect_ai/tool/_tools/_web_browser/_resources/Dockerfile +0 -22
  87. inspect_ai/tool/_tools/_web_browser/_resources/README.md +0 -63
  88. inspect_ai/tool/_tools/_web_browser/_resources/accessibility_tree.py +0 -71
  89. inspect_ai/tool/_tools/_web_browser/_resources/accessibility_tree_node.py +0 -323
  90. inspect_ai/tool/_tools/_web_browser/_resources/cdp/__init__.py +0 -5
  91. inspect_ai/tool/_tools/_web_browser/_resources/cdp/a11y.py +0 -279
  92. inspect_ai/tool/_tools/_web_browser/_resources/cdp/dom.py +0 -9
  93. inspect_ai/tool/_tools/_web_browser/_resources/cdp/dom_snapshot.py +0 -293
  94. inspect_ai/tool/_tools/_web_browser/_resources/cdp/page.py +0 -94
  95. inspect_ai/tool/_tools/_web_browser/_resources/constants.py +0 -2
  96. inspect_ai/tool/_tools/_web_browser/_resources/images/usage_diagram.svg +0 -2
  97. inspect_ai/tool/_tools/_web_browser/_resources/mock_environment.py +0 -45
  98. inspect_ai/tool/_tools/_web_browser/_resources/playwright_browser.py +0 -50
  99. inspect_ai/tool/_tools/_web_browser/_resources/playwright_crawler.py +0 -48
  100. inspect_ai/tool/_tools/_web_browser/_resources/playwright_page_crawler.py +0 -280
  101. inspect_ai/tool/_tools/_web_browser/_resources/pyproject.toml +0 -65
  102. inspect_ai/tool/_tools/_web_browser/_resources/rectangle.py +0 -64
  103. inspect_ai/tool/_tools/_web_browser/_resources/rpc_client_helpers.py +0 -146
  104. inspect_ai/tool/_tools/_web_browser/_resources/scale_factor.py +0 -64
  105. inspect_ai/tool/_tools/_web_browser/_resources/test_accessibility_tree_node.py +0 -180
  106. inspect_ai/tool/_tools/_web_browser/_resources/test_playwright_crawler.py +0 -99
  107. inspect_ai/tool/_tools/_web_browser/_resources/test_rectangle.py +0 -15
  108. inspect_ai/tool/_tools/_web_browser/_resources/test_web_client.py +0 -44
  109. inspect_ai/tool/_tools/_web_browser/_resources/web_browser_rpc_types.py +0 -39
  110. inspect_ai/tool/_tools/_web_browser/_resources/web_client.py +0 -214
  111. inspect_ai/tool/_tools/_web_browser/_resources/web_client_new_session.py +0 -35
  112. inspect_ai/tool/_tools/_web_browser/_resources/web_server.py +0 -192
  113. {inspect_ai-0.3.74.dist-info → inspect_ai-0.3.76.dist-info}/entry_points.txt +0 -0
  114. {inspect_ai-0.3.74.dist-info → inspect_ai-0.3.76.dist-info/licenses}/LICENSE +0 -0
  115. {inspect_ai-0.3.74.dist-info → inspect_ai-0.3.76.dist-info}/top_level.txt +0 -0
@@ -52,19 +52,22 @@ from ._model_output import ModelUsage, StopReason, as_stop_reason
52
52
 
53
53
 
54
54
  def is_o_series(name: str) -> bool:
55
- return bool(re.match(r"(^|.*\/)o\d+", name))
55
+ if bool(re.match(r"^o\d+", name)):
56
+ return True
57
+ else:
58
+ return not is_gpt(name) and bool(re.search(r"o\d+", name))
56
59
 
57
60
 
58
61
  def is_o1_mini(name: str) -> bool:
59
- return name.startswith("o1-mini")
62
+ return "o1-mini" in name
60
63
 
61
64
 
62
65
  def is_o1_preview(name: str) -> bool:
63
- return name.startswith("o1-preview")
66
+ return "o1-preview" in name
64
67
 
65
68
 
66
69
  def is_gpt(name: str) -> bool:
67
- return name.startswith("gpt")
70
+ return "gpt" in name
68
71
 
69
72
 
70
73
  def openai_chat_tool_call(tool_call: ToolCall) -> ChatCompletionMessageToolCall:
@@ -80,12 +83,13 @@ def openai_chat_tool_call(tool_call: ToolCall) -> ChatCompletionMessageToolCall:
80
83
  def openai_chat_tool_call_param(
81
84
  tool_call: ToolCall,
82
85
  ) -> ChatCompletionMessageToolCallParam:
86
+ assert tool_call.type == "function", f"Unexpected tool call type {tool_call.type}"
83
87
  return ChatCompletionMessageToolCallParam(
84
88
  id=tool_call.id,
85
89
  function=dict(
86
90
  name=tool_call.function, arguments=json.dumps(tool_call.arguments)
87
91
  ),
88
- type=tool_call.type,
92
+ type="function", # Type narrowing couldn't figure it out
89
93
  )
90
94
 
91
95
 
@@ -108,7 +112,8 @@ async def openai_chat_completion_part(
108
112
  image_url=dict(url=image_url, detail=detail),
109
113
  )
110
114
  elif content.type == "audio":
111
- audio_data = await file_as_data_uri(content.audio)
115
+ audio_data_uri = await file_as_data_uri(content.audio)
116
+ audio_data = audio_data_uri.split("base64,")[1]
112
117
 
113
118
  return ChatCompletionContentPartInputAudioParam(
114
119
  type="input_audio", input_audio=dict(data=audio_data, format=content.format)
@@ -1,23 +1,12 @@
1
1
  import functools
2
2
  import os
3
3
  import re
4
- import sys
5
4
  from copy import copy
6
5
  from logging import getLogger
7
- from typing import Any, Literal, Optional, Tuple, TypedDict, cast
6
+ from typing import Any, Literal, NamedTuple, Optional, Tuple, cast
8
7
 
9
8
  import httpcore
10
9
  import httpx
11
-
12
- from inspect_ai._util.http import is_retryable_http_status
13
-
14
- from .util.hooks import HttpxHooks
15
-
16
- if sys.version_info >= (3, 11):
17
- from typing import NotRequired
18
- else:
19
- from typing_extensions import NotRequired
20
-
21
10
  from anthropic import (
22
11
  APIConnectionError,
23
12
  APIStatusError,
@@ -39,19 +28,19 @@ from anthropic.types import (
39
28
  TextBlockParam,
40
29
  ThinkingBlock,
41
30
  ThinkingBlockParam,
31
+ ToolBash20250124Param,
42
32
  ToolParam,
43
33
  ToolResultBlockParam,
34
+ ToolTextEditor20250124Param,
44
35
  ToolUseBlock,
45
36
  ToolUseBlockParam,
46
37
  message_create_params,
47
38
  )
39
+ from anthropic.types.beta import BetaToolComputerUse20250124Param
48
40
  from pydantic import JsonValue
49
41
  from typing_extensions import override
50
42
 
51
- from inspect_ai._util.constants import (
52
- BASE_64_DATA_REMOVED,
53
- NO_CONTENT,
54
- )
43
+ from inspect_ai._util.constants import BASE_64_DATA_REMOVED, NO_CONTENT
55
44
  from inspect_ai._util.content import (
56
45
  Content,
57
46
  ContentImage,
@@ -59,6 +48,7 @@ from inspect_ai._util.content import (
59
48
  ContentText,
60
49
  )
61
50
  from inspect_ai._util.error import exception_message
51
+ from inspect_ai._util.http import is_retryable_http_status
62
52
  from inspect_ai._util.images import file_as_data_uri
63
53
  from inspect_ai._util.logger import warn_once
64
54
  from inspect_ai._util.url import data_uri_mime_type, data_uri_to_base64
@@ -70,11 +60,14 @@ from .._model import ModelAPI
70
60
  from .._model_call import ModelCall
71
61
  from .._model_output import ChatCompletionChoice, ModelOutput, ModelUsage, StopReason
72
62
  from .util import environment_prerequisite_error, model_base_url
63
+ from .util.hooks import HttpxHooks
73
64
 
74
65
  logger = getLogger(__name__)
75
66
 
76
67
  ANTHROPIC_API_KEY = "ANTHROPIC_API_KEY"
77
68
 
69
+ INTERNAL_COMPUTER_TOOL_NAME = "computer"
70
+
78
71
 
79
72
  class AnthropicAPI(ModelAPI):
80
73
  def __init__(
@@ -93,7 +86,7 @@ class AnthropicAPI(ModelAPI):
93
86
  else:
94
87
  self.service = None
95
88
 
96
- # collect gemerate model_args (then delete them so we can pass the rest on)
89
+ # collect generate model_args (then delete them so we can pass the rest on)
97
90
  def collect_model_arg(name: str) -> Any | None:
98
91
  nonlocal model_args
99
92
  value = model_args.get(name, None)
@@ -193,14 +186,11 @@ class AnthropicAPI(ModelAPI):
193
186
 
194
187
  # generate
195
188
  try:
196
- (
197
- system_param,
198
- tools_param,
199
- messages,
200
- computer_use,
201
- ) = await self.resolve_chat_input(input, tools, config)
189
+ system_param, tools_param, messages = await self.resolve_chat_input(
190
+ input, tools, config
191
+ )
202
192
 
203
- # prepare request params (assembed this way so we can log the raw model call)
193
+ # prepare request params (assembled this way so we can log the raw model call)
204
194
  request = dict(messages=messages)
205
195
 
206
196
  # system messages and tools
@@ -218,7 +208,13 @@ class AnthropicAPI(ModelAPI):
218
208
 
219
209
  # extra headers (for time tracker and computer use)
220
210
  extra_headers = headers | {HttpxHooks.REQUEST_ID_HEADER: request_id}
221
- if computer_use:
211
+ if any(
212
+ tool.get("type", None) == "computer_20250124" for tool in tools_param
213
+ ):
214
+ # From: https://docs.anthropic.com/en/docs/agents-and-tools/computer-use#claude-3-7-sonnet-beta-flag
215
+ # Note: The Bash (bash_20250124) and Text Editor (text_editor_20250124)
216
+ # tools are generally available for Claude 3.5 Sonnet (new) as well and
217
+ # can be used without the computer use beta header.
222
218
  betas.append("computer-use-2025-01-24")
223
219
  if len(betas) > 0:
224
220
  extra_headers["anthropic-beta"] = ",".join(betas)
@@ -240,7 +236,9 @@ class AnthropicAPI(ModelAPI):
240
236
  response = message.model_dump()
241
237
 
242
238
  # extract output
243
- output = model_output_from_message(message, tools)
239
+ output = await model_output_from_message(
240
+ self.client, self.model_name, message, tools
241
+ )
244
242
 
245
243
  # return output and call
246
244
  return output, model_call()
@@ -403,9 +401,7 @@ class AnthropicAPI(ModelAPI):
403
401
  input: list[ChatMessage],
404
402
  tools: list[ToolInfo],
405
403
  config: GenerateConfig,
406
- ) -> Tuple[
407
- list[TextBlockParam] | None, list["ToolParamDef"], list[MessageParam], bool
408
- ]:
404
+ ) -> Tuple[list[TextBlockParam] | None, list["ToolParamDef"], list[MessageParam]]:
409
405
  # extract system message
410
406
  system_messages, messages = split_system_messages(input, config)
411
407
 
@@ -418,7 +414,7 @@ class AnthropicAPI(ModelAPI):
418
414
  )
419
415
 
420
416
  # tools
421
- tools_params, computer_use = self.tool_params_for_tools(tools, config)
417
+ tools_params = [self.tool_param_for_tool_info(tool, config) for tool in tools]
422
418
 
423
419
  # system messages
424
420
  if len(system_messages) > 0:
@@ -468,40 +464,35 @@ class AnthropicAPI(ModelAPI):
468
464
  add_cache_control(cast(dict[str, Any], content[-1]))
469
465
 
470
466
  # return chat input
471
- return system_param, tools_params, message_params, computer_use
472
-
473
- def tool_params_for_tools(
474
- self, tools: list[ToolInfo], config: GenerateConfig
475
- ) -> tuple[list["ToolParamDef"], bool]:
476
- # tool params and computer_use bit to return
477
- tool_params: list["ToolParamDef"] = []
478
- computer_use = False
479
-
480
- # for each tool, check if it has a native computer use implementation and use that
481
- # when available (noting that we need to set the computer use request header)
482
- for tool in tools:
483
- computer_use_tool = (
467
+ return system_param, tools_params, message_params
468
+
469
+ def tool_param_for_tool_info(
470
+ self, tool: ToolInfo, config: GenerateConfig
471
+ ) -> "ToolParamDef":
472
+ # Use a native tool implementation when available. Otherwise, use the
473
+ # standard tool implementation
474
+ return self.maybe_native_tool_param(tool, config) or ToolParam(
475
+ name=tool.name,
476
+ description=tool.description,
477
+ input_schema=tool.parameters.model_dump(exclude_none=True),
478
+ )
479
+
480
+ def maybe_native_tool_param(
481
+ self, tool: ToolInfo, config: GenerateConfig
482
+ ) -> Optional["ToolParamDef"]:
483
+ return (
484
+ (
484
485
  self.computer_use_tool_param(tool)
485
- if config.internal_tools is not False
486
- else None
486
+ or self.text_editor_tool_param(tool)
487
+ or self.bash_tool_param(tool)
487
488
  )
488
- if computer_use_tool:
489
- tool_params.append(computer_use_tool)
490
- computer_use = True
491
- else:
492
- tool_params.append(
493
- ToolParam(
494
- name=tool.name,
495
- description=tool.description,
496
- input_schema=tool.parameters.model_dump(exclude_none=True),
497
- )
498
- )
499
-
500
- return tool_params, computer_use
489
+ if config.internal_tools is not False
490
+ else None
491
+ )
501
492
 
502
493
  def computer_use_tool_param(
503
494
  self, tool: ToolInfo
504
- ) -> Optional["ComputerUseToolParam"]:
495
+ ) -> Optional[BetaToolComputerUse20250124Param]:
505
496
  # check for compatible 'computer' tool
506
497
  if tool.name == "computer" and (
507
498
  sorted(tool.parameters.properties.keys())
@@ -523,7 +514,7 @@ class AnthropicAPI(ModelAPI):
523
514
  "Use of Anthropic's native computer use support is not enabled in Claude 3.5. Please use 3.7 or later to leverage the native support.",
524
515
  )
525
516
  return None
526
- return ComputerUseToolParam(
517
+ return BetaToolComputerUse20250124Param(
527
518
  type="computer_20250124",
528
519
  name="computer",
529
520
  # Note: The dimensions passed here for display_width_px and display_height_px should
@@ -540,23 +531,58 @@ class AnthropicAPI(ModelAPI):
540
531
  else:
541
532
  return None
542
533
 
534
+ def text_editor_tool_param(
535
+ self, tool: ToolInfo
536
+ ) -> Optional[ToolTextEditor20250124Param]:
537
+ # check for compatible 'text editor' tool
538
+ if tool.name == "text_editor" and (
539
+ sorted(tool.parameters.properties.keys())
540
+ == sorted(
541
+ [
542
+ "command",
543
+ "file_text",
544
+ "insert_line",
545
+ "new_str",
546
+ "old_str",
547
+ "path",
548
+ "view_range",
549
+ ]
550
+ )
551
+ ):
552
+ return ToolTextEditor20250124Param(
553
+ type="text_editor_20250124", name="str_replace_editor"
554
+ )
555
+ # not a text_editor tool
556
+ else:
557
+ return None
543
558
 
544
- # native anthropic tool definitions for computer use beta
545
- # https://docs.anthropic.com/en/docs/build-with-claude/computer-use
546
- class ComputerUseToolParam(TypedDict):
547
- type: str
548
- name: str
549
- display_width_px: NotRequired[int]
550
- display_height_px: NotRequired[int]
551
- display_number: NotRequired[int]
559
+ def bash_tool_param(self, tool: ToolInfo) -> Optional[ToolBash20250124Param]:
560
+ # check for compatible 'bash' tool
561
+ if tool.name == "bash_session" and (
562
+ sorted(tool.parameters.properties.keys()) == sorted(["command", "restart"])
563
+ ):
564
+ return ToolBash20250124Param(type="bash_20250124", name="bash")
565
+ # not a bash tool
566
+ else:
567
+ return None
552
568
 
553
569
 
554
- # tools can be either a stock tool param or a special computer use tool param
555
- ToolParamDef = ToolParam | ComputerUseToolParam
570
+ # tools can be either a stock tool param or a special Anthropic native use tool param
571
+ ToolParamDef = (
572
+ ToolParam
573
+ | BetaToolComputerUse20250124Param
574
+ | ToolTextEditor20250124Param
575
+ | ToolBash20250124Param
576
+ )
556
577
 
557
578
 
558
579
  def add_cache_control(
559
- param: TextBlockParam | ToolParam | ComputerUseToolParam | dict[str, Any],
580
+ param: TextBlockParam
581
+ | ToolParam
582
+ | BetaToolComputerUse20250124Param
583
+ | ToolTextEditor20250124Param
584
+ | ToolBash20250124Param
585
+ | dict[str, Any],
560
586
  ) -> None:
561
587
  cast(dict[str, Any], param)["cache_control"] = {"type": "ephemeral"}
562
588
 
@@ -565,10 +591,10 @@ def consecutive_user_message_reducer(
565
591
  messages: list[MessageParam],
566
592
  message: MessageParam,
567
593
  ) -> list[MessageParam]:
568
- return consective_message_reducer(messages, message, "user")
594
+ return consecutive_message_reducer(messages, message, "user")
569
595
 
570
596
 
571
- def consective_message_reducer(
597
+ def consecutive_message_reducer(
572
598
  messages: list[MessageParam],
573
599
  message: MessageParam,
574
600
  role: Literal["user", "assistant"],
@@ -581,6 +607,7 @@ def consective_message_reducer(
581
607
 
582
608
 
583
609
  def combine_messages(a: MessageParam, b: MessageParam) -> MessageParam:
610
+ # TODO: Fix this code as it currently drops interesting properties when combining
584
611
  role = a["role"]
585
612
  a_content = a["content"]
586
613
  b_content = b["content"]
@@ -700,7 +727,7 @@ async def message_param(message: ChatMessage) -> MessageParam:
700
727
  ToolUseBlockParam(
701
728
  type="tool_use",
702
729
  id=tool_call.id,
703
- name=tool_call.function,
730
+ name=tool_call.internal_name or tool_call.function,
704
731
  input=tool_call.arguments,
705
732
  )
706
733
  )
@@ -724,9 +751,15 @@ async def message_param(message: ChatMessage) -> MessageParam:
724
751
  )
725
752
 
726
753
 
727
- def model_output_from_message(message: Message, tools: list[ToolInfo]) -> ModelOutput:
754
+ async def model_output_from_message(
755
+ client: AsyncAnthropic | AsyncAnthropicBedrock | AsyncAnthropicVertex,
756
+ model: str,
757
+ message: Message,
758
+ tools: list[ToolInfo],
759
+ ) -> ModelOutput:
728
760
  # extract content and tool calls
729
761
  content: list[Content] = []
762
+ reasoning_tokens = 0
730
763
  tool_calls: list[ToolCall] | None = None
731
764
 
732
765
  for content_block in message.content:
@@ -741,11 +774,13 @@ def model_output_from_message(message: Message, tools: list[ToolInfo]) -> ModelO
741
774
  content.append(ContentText(type="text", text=content_text))
742
775
  elif isinstance(content_block, ToolUseBlock):
743
776
  tool_calls = tool_calls or []
777
+ info = maybe_mapped_call_info(content_block.name, tools)
744
778
  tool_calls.append(
745
779
  ToolCall(
746
- type="function",
780
+ type=info.internal_type,
747
781
  id=content_block.id,
748
- function=content_block.name,
782
+ function=info.inspect_name,
783
+ internal_name=info.internal_name,
749
784
  arguments=content_block.model_dump().get("input", {}),
750
785
  )
751
786
  )
@@ -754,6 +789,9 @@ def model_output_from_message(message: Message, tools: list[ToolInfo]) -> ModelO
754
789
  ContentReasoning(reasoning=content_block.data, redacted=True)
755
790
  )
756
791
  elif isinstance(content_block, ThinkingBlock):
792
+ reasoning_tokens += await count_tokens(
793
+ client, model, content_block.thinking
794
+ )
757
795
  content.append(
758
796
  ContentReasoning(
759
797
  reasoning=content_block.thinking, signature=content_block.signature
@@ -787,7 +825,39 @@ def model_output_from_message(message: Message, tools: list[ToolInfo]) -> ModelO
787
825
  total_tokens=total_tokens,
788
826
  input_tokens_cache_write=input_tokens_cache_write,
789
827
  input_tokens_cache_read=input_tokens_cache_read,
828
+ reasoning_tokens=reasoning_tokens if reasoning_tokens > 0 else None,
829
+ ),
830
+ )
831
+
832
+
833
+ class CallInfo(NamedTuple):
834
+ internal_name: str | None
835
+ internal_type: str
836
+ inspect_name: str
837
+
838
+
839
+ def maybe_mapped_call_info(tool_called: str, tools: list[ToolInfo]) -> CallInfo:
840
+ """
841
+ Return call info - potentially transformed by native tool mappings.
842
+
843
+ Anthropic prescribes names for their native tools - `computer`, `bash`, and
844
+ `str_replace_editor`. For a variety of reasons, Inspect's tool names do not
845
+ necessarily conform to internal names. Anthropic also provides specific tool
846
+ types for these built-in tools.
847
+ """
848
+ mappings = (
849
+ (INTERNAL_COMPUTER_TOOL_NAME, "computer_20250124", "computer"),
850
+ ("str_replace_editor", "text_editor_20250124", "text_editor"),
851
+ ("bash", "bash_20250124", "bash_session"),
852
+ )
853
+
854
+ return next(
855
+ (
856
+ CallInfo(entry[0], entry[1], entry[2])
857
+ for entry in mappings
858
+ if entry[0] == tool_called and any(tool.name == entry[2] for tool in tools)
790
859
  ),
860
+ CallInfo(None, "function", tool_called),
791
861
  )
792
862
 
793
863
 
@@ -852,6 +922,26 @@ async def message_param_content(
852
922
  )
853
923
 
854
924
 
925
+ async def count_tokens(
926
+ client: AsyncAnthropic | AsyncAnthropicBedrock | AsyncAnthropicVertex,
927
+ model: str,
928
+ text: str,
929
+ ) -> int:
930
+ try:
931
+ response = await client.messages.count_tokens(
932
+ model=model,
933
+ messages=[{"role": "user", "content": text}],
934
+ )
935
+ return response.input_tokens
936
+ except Exception as e:
937
+ logger.warning(
938
+ f"Error counting tokens (falling back to estimated tokens): {str(e)}"
939
+ )
940
+ words = text.split()
941
+ estimated_tokens = int(len(words) * 1.3)
942
+ return estimated_tokens
943
+
944
+
855
945
  def model_call_filter(key: JsonValue | None, value: JsonValue) -> JsonValue:
856
946
  # remove base64 encoded images
857
947
  if (
@@ -267,8 +267,12 @@ class GoogleGenAIAPI(ModelAPI):
267
267
  import requests # type: ignore
268
268
 
269
269
  # standard http errors
270
- if isinstance(ex, APIError):
271
- return is_retryable_http_status(ex.status)
270
+ if (
271
+ isinstance(ex, APIError)
272
+ and isinstance(ex.status, str)
273
+ and ex.status.isdigit()
274
+ ):
275
+ return is_retryable_http_status(int(ex.status))
272
276
 
273
277
  # low-level requests exceptions
274
278
  elif isinstance(ex, requests.exceptions.RequestException):
@@ -0,0 +1,31 @@
1
+ from inspect_ai._util.error import PrerequisiteError
2
+ from inspect_ai.tool import ToolChoice, ToolInfo
3
+
4
+ from .._chat_message import ChatMessage
5
+ from .._generate_config import GenerateConfig
6
+ from .._model import ModelAPI
7
+ from .._model_output import ModelOutput
8
+
9
+
10
+ class NoModel(ModelAPI):
11
+ """A sentinel model type indicating there is no model specified."""
12
+
13
+ def __init__(
14
+ self,
15
+ model_name: str = "none",
16
+ base_url: str | None = None,
17
+ api_key: str | None = None,
18
+ config: GenerateConfig = GenerateConfig(),
19
+ ) -> None:
20
+ super().__init__(model_name, base_url, api_key, [], config)
21
+
22
+ async def generate(
23
+ self,
24
+ input: list[ChatMessage],
25
+ tools: list[ToolInfo],
26
+ tool_choice: ToolChoice,
27
+ config: GenerateConfig,
28
+ ) -> ModelOutput:
29
+ raise PrerequisiteError(
30
+ "No model specified (and no INSPECT_EVAL_MODEL defined)"
31
+ )
@@ -67,6 +67,16 @@ class OpenAIAPI(ModelAPI):
67
67
  config: GenerateConfig = GenerateConfig(),
68
68
  **model_args: Any,
69
69
  ) -> None:
70
+ # extract azure service prefix from model name (other providers
71
+ # that subclass from us like together expect to have the qualifier
72
+ # in the model name e.g. google/gemma-2b-it)
73
+ parts = model_name.split("/")
74
+ if parts[0] == "azure" and len(parts) > 1:
75
+ self.service: str | None = parts[0]
76
+ model_name = "/".join(parts[1:])
77
+ else:
78
+ self.service = None
79
+
70
80
  # call super
71
81
  super().__init__(
72
82
  model_name=model_name,
@@ -76,14 +86,6 @@ class OpenAIAPI(ModelAPI):
76
86
  config=config,
77
87
  )
78
88
 
79
- # extract any service prefix from model name
80
- parts = model_name.split("/")
81
- if len(parts) > 1:
82
- self.service: str | None = parts[0]
83
- model_name = "/".join(parts[1:])
84
- else:
85
- self.service = None
86
-
87
89
  # resolve api_key
88
90
  if not self.api_key:
89
91
  self.api_key = os.environ.get(
@@ -322,6 +324,7 @@ class OpenAIAPI(ModelAPI):
322
324
  config.reasoning_effort is not None
323
325
  and not self.is_gpt()
324
326
  and not self.is_o1_mini()
327
+ and not self.is_o1_preview()
325
328
  ):
326
329
  params["reasoning_effort"] = config.reasoning_effort
327
330
  if config.response_schema is not None:
@@ -250,6 +250,13 @@ def mockllm() -> type[ModelAPI]:
250
250
  return MockLLM
251
251
 
252
252
 
253
+ @modelapi(name="none")
254
+ def none() -> type[ModelAPI]:
255
+ from .none import NoModel
256
+
257
+ return NoModel
258
+
259
+
253
260
  @modelapi("goodfire")
254
261
  def goodfire() -> type[ModelAPI]:
255
262
  """Get the Goodfire API provider."""
@@ -34,8 +34,8 @@ from inspect_ai._util.content import (
34
34
  Content,
35
35
  ContentAudio,
36
36
  ContentImage,
37
+ ContentReasoning,
37
38
  ContentText,
38
- ContentVideo,
39
39
  )
40
40
  from inspect_ai._util.http import is_retryable_http_status
41
41
  from inspect_ai._util.images import file_as_data
@@ -336,10 +336,13 @@ async def content_part(content: Content | str) -> Part:
336
336
  elif isinstance(content, ContentImage):
337
337
  image_bytes, mime_type = await file_as_data(content.image)
338
338
  return Part.from_image(image=Image.from_bytes(data=image_bytes))
339
+ elif isinstance(content, ContentReasoning):
340
+ return Part.from_text(content.reasoning or NO_CONTENT)
339
341
  else:
340
342
  if isinstance(content, ContentAudio):
341
343
  file = content.audio
342
- elif isinstance(content, ContentVideo):
344
+ else:
345
+ # it's ContentVideo
343
346
  file = content.video
344
347
  file_bytes, mime_type = await file_as_data(file)
345
348
  return Part.from_data(file_bytes, mime_type)
@@ -17,7 +17,7 @@ from .._task_state import TaskState
17
17
  def bridge(agent: Callable[[dict[str, Any]], Awaitable[dict[str, Any]]]) -> Solver:
18
18
  """Bridge an external agent into an Inspect Solver.
19
19
 
20
- See documentation at <https://inspect.ai-safety-institute.org.uk/agent-bridge.html>
20
+ See documentation at <https://inspect.aisi.org.uk/agent-bridge.html>
21
21
 
22
22
  Args:
23
23
  agent: Callable which takes a sample `dict` and returns a result `dict`.
@@ -2,10 +2,11 @@ from typing import Sequence, overload
2
2
 
3
3
  from typing_extensions import override
4
4
 
5
- from ._solver import Generate, Solver
5
+ from ._solver import Generate, Solver, solver
6
6
  from ._task_state import TaskState
7
7
 
8
8
 
9
+ @solver
9
10
  def chain(*solvers: Solver | list[Solver]) -> Solver:
10
11
  """Compose a solver from multiple other solvers.
11
12
 
@@ -22,8 +23,8 @@ def chain(*solvers: Solver | list[Solver]) -> Solver:
22
23
  """
23
24
  # flatten lists and chains
24
25
  all_solvers: list[Solver] = []
25
- for solver in solvers:
26
- all_solvers.extend(unroll(solver))
26
+ for s in solvers:
27
+ all_solvers.extend(unroll(s))
27
28
 
28
29
  return Chain(all_solvers)
29
30
 
@@ -72,9 +73,9 @@ class Chain(Sequence[Solver], Solver):
72
73
  ) -> TaskState:
73
74
  from ._transcript import solver_transcript
74
75
 
75
- for solver in self._solvers:
76
- with solver_transcript(solver, state) as st:
77
- state = await solver(state, generate)
76
+ for slv in self._solvers:
77
+ with solver_transcript(slv, state) as st:
78
+ state = await slv(state, generate)
78
79
  st.complete(state)
79
80
  if state.completed:
80
81
  break
@@ -22,17 +22,21 @@ from ._tool_def import ToolDef
22
22
  from ._tool_info import ToolInfo
23
23
  from ._tool_params import ToolParam, ToolParams
24
24
  from ._tool_with import tool_with
25
+ from ._tools._bash_session import bash_session
25
26
  from ._tools._computer import computer
26
27
  from ._tools._execute import bash, python
28
+ from ._tools._text_editor import text_editor
27
29
  from ._tools._web_browser import web_browser
28
30
  from ._tools._web_search import web_search
29
31
 
30
32
  __all__ = [
31
33
  "bash",
34
+ "bash_session",
32
35
  "computer",
33
36
  "python",
34
37
  "web_browser",
35
38
  "web_search",
39
+ "text_editor",
36
40
  "tool",
37
41
  "tool_with",
38
42
  "Tool",