inspect-ai 0.3.94__py3-none-any.whl → 0.3.95__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70)
  1. inspect_ai/_eval/loader.py +1 -1
  2. inspect_ai/_eval/task/run.py +12 -6
  3. inspect_ai/_util/exception.py +4 -0
  4. inspect_ai/_util/hash.py +39 -0
  5. inspect_ai/_util/path.py +22 -0
  6. inspect_ai/_util/trace.py +1 -1
  7. inspect_ai/_util/working.py +4 -0
  8. inspect_ai/_view/www/dist/assets/index.css +9 -9
  9. inspect_ai/_view/www/dist/assets/index.js +117 -120
  10. inspect_ai/_view/www/package.json +1 -1
  11. inspect_ai/_view/www/src/app/log-view/navbar/SecondaryBar.tsx +2 -2
  12. inspect_ai/_view/www/src/app/log-view/tabs/SamplesTab.tsx +1 -4
  13. inspect_ai/_view/www/src/app/samples/SamplesTools.tsx +3 -13
  14. inspect_ai/_view/www/src/app/samples/sample-tools/SelectScorer.tsx +45 -48
  15. inspect_ai/_view/www/src/app/samples/sample-tools/filters.ts +16 -15
  16. inspect_ai/_view/www/src/app/samples/sample-tools/sample-filter/SampleFilter.tsx +47 -75
  17. inspect_ai/_view/www/src/app/samples/sample-tools/sample-filter/completions.ts +9 -9
  18. inspect_ai/_view/www/src/app/types.ts +12 -2
  19. inspect_ai/_view/www/src/components/ExpandablePanel.module.css +1 -1
  20. inspect_ai/_view/www/src/components/ExpandablePanel.tsx +5 -5
  21. inspect_ai/_view/www/src/state/hooks.ts +19 -3
  22. inspect_ai/_view/www/src/state/logSlice.ts +23 -5
  23. inspect_ai/_view/www/yarn.lock +9 -9
  24. inspect_ai/agent/_bridge/patch.py +1 -3
  25. inspect_ai/analysis/__init__.py +0 -0
  26. inspect_ai/analysis/beta/__init__.py +57 -0
  27. inspect_ai/analysis/beta/_dataframe/__init__.py +0 -0
  28. inspect_ai/analysis/beta/_dataframe/columns.py +145 -0
  29. inspect_ai/analysis/beta/_dataframe/evals/__init__.py +0 -0
  30. inspect_ai/analysis/beta/_dataframe/evals/columns.py +132 -0
  31. inspect_ai/analysis/beta/_dataframe/evals/extract.py +23 -0
  32. inspect_ai/analysis/beta/_dataframe/evals/table.py +140 -0
  33. inspect_ai/analysis/beta/_dataframe/events/__init__.py +0 -0
  34. inspect_ai/analysis/beta/_dataframe/events/columns.py +37 -0
  35. inspect_ai/analysis/beta/_dataframe/events/table.py +14 -0
  36. inspect_ai/analysis/beta/_dataframe/extract.py +54 -0
  37. inspect_ai/analysis/beta/_dataframe/messages/__init__.py +0 -0
  38. inspect_ai/analysis/beta/_dataframe/messages/columns.py +60 -0
  39. inspect_ai/analysis/beta/_dataframe/messages/extract.py +21 -0
  40. inspect_ai/analysis/beta/_dataframe/messages/table.py +87 -0
  41. inspect_ai/analysis/beta/_dataframe/record.py +377 -0
  42. inspect_ai/analysis/beta/_dataframe/samples/__init__.py +0 -0
  43. inspect_ai/analysis/beta/_dataframe/samples/columns.py +73 -0
  44. inspect_ai/analysis/beta/_dataframe/samples/extract.py +82 -0
  45. inspect_ai/analysis/beta/_dataframe/samples/table.py +329 -0
  46. inspect_ai/analysis/beta/_dataframe/util.py +157 -0
  47. inspect_ai/analysis/beta/_dataframe/validate.py +171 -0
  48. inspect_ai/log/_file.py +1 -1
  49. inspect_ai/log/_log.py +21 -1
  50. inspect_ai/model/_call_tools.py +2 -1
  51. inspect_ai/model/_model.py +6 -4
  52. inspect_ai/model/_openai_responses.py +17 -18
  53. inspect_ai/model/_providers/anthropic.py +30 -5
  54. inspect_ai/model/_providers/providers.py +1 -1
  55. inspect_ai/solver/_multiple_choice.py +4 -1
  56. inspect_ai/solver/_task_state.py +7 -3
  57. inspect_ai/tool/_mcp/_context.py +3 -5
  58. inspect_ai/tool/_mcp/server.py +1 -1
  59. inspect_ai/tool/_tools/_think.py +1 -1
  60. inspect_ai/tool/_tools/_web_search/__init__.py +3 -0
  61. inspect_ai/tool/_tools/{_web_search.py → _web_search/_google.py} +56 -103
  62. inspect_ai/tool/_tools/_web_search/_tavily.py +77 -0
  63. inspect_ai/tool/_tools/_web_search/_web_search.py +85 -0
  64. inspect_ai/util/_sandbox/events.py +3 -2
  65. {inspect_ai-0.3.94.dist-info → inspect_ai-0.3.95.dist-info}/METADATA +8 -1
  66. {inspect_ai-0.3.94.dist-info → inspect_ai-0.3.95.dist-info}/RECORD +70 -43
  67. {inspect_ai-0.3.94.dist-info → inspect_ai-0.3.95.dist-info}/WHEEL +1 -1
  68. {inspect_ai-0.3.94.dist-info → inspect_ai-0.3.95.dist-info}/entry_points.txt +0 -0
  69. {inspect_ai-0.3.94.dist-info → inspect_ai-0.3.95.dist-info}/licenses/LICENSE +0 -0
  70. {inspect_ai-0.3.94.dist-info → inspect_ai-0.3.95.dist-info}/top_level.txt +0 -0
inspect_ai/analysis/beta/_dataframe/validate.py ADDED
@@ -0,0 +1,171 @@
+from __future__ import annotations
+
+from logging import getLogger
+from typing import Any, Iterator, Mapping, Type
+
+import jsonref  # type: ignore
+from jsonpath_ng import Fields, Index, JSONPath, Slice, Where, WhereNot  # type: ignore
+from jsonpath_ng.ext.filter import Filter  # type: ignore
+from pydantic import BaseModel
+
+logger = getLogger(__name__)
+
+Schema = Mapping[str, Any]
+
+
+def resolved_schema(model: Type[BaseModel]) -> Schema:
+    schema_dict = model.model_json_schema()
+    base = "file:///memory/inspect_schema.json"
+    schema: Schema = jsonref.replace_refs(
+        schema_dict, base_uri=base, jsonschema=True, proxies=False
+    )
+    return schema
+
+
+def jsonpath_in_schema(expr: JSONPath, schema: Schema) -> bool:
+    # don't validate unsupported constructs
+    if find_unsupported(expr):
+        return True
+
+    def descend(sch: Schema, tok: str | int | None) -> list[Schema]:
+        # First, branch through anyOf/oneOf/allOf
+        outs: list[Schema] = []
+        for branch in _expand_union(sch):
+            outs.extend(descend_concrete(branch, tok))
+        return outs
+
+    def descend_concrete(sch: Schema, tok: str | int | None) -> list[Schema]:
+        # totally open object – accept any child
+        if sch == {}:
+            return [{}]  # stay alive, accept any key
+
+        outs: list[Schema] = []
+
+        def open_dict(node: Schema) -> None:
+            """Append the schema that governs unknown keys.
+
+            - None / missing -> open object -> {}
+            - True -> open object -> {}
+            - Mapping -> that mapping (could be {} or a real subschema)
+            - False -> closed object -> (do nothing)
+            """
+            if "additionalProperties" not in node:
+                if not node.get("properties"):
+                    outs.append({})
+            else:
+                ap = node["additionalProperties"]
+                if ap is True:
+                    outs.append({})
+                elif isinstance(ap, Mapping):  # {} or {...}
+                    outs.append(ap)
+                # ap is False -> closed dict -> ignore
+
+        # Wildcard -----------------------------------------------------------
+        if tok is None:
+            if "properties" in sch:
+                outs.extend(sch["properties"].values())
+            if "object" in _types(sch):
+                open_dict(sch)
+            if "array" in _types(sch) and "items" in sch:
+                outs.extend(_normalize_items(sch["items"]))
+            return outs
+
+        # Property access ----------------------------------------------------
+        if isinstance(tok, str):
+            if "properties" in sch and tok in sch["properties"]:
+                outs.append(sch["properties"][tok])
+            elif "additionalProperties" in sch:  # PRESENCE, not truthiness
+                open_dict(sch)
+            elif "object" in _types(sch):
+                open_dict(sch)
+
+        # Array index --------------------------------------------------------
+        else:  # tok is int or None from an Index node
+            if "array" in _types(sch) and "items" in sch:
+                outs.extend(_normalize_items(sch["items"], index=tok))
+
+        return outs
+
+    def _types(sch: Schema) -> set[str]:
+        t = sch.get("type")
+        return set(t) if isinstance(t, list) else {t} if t else set()
+
+    def _normalize_items(items: Any, index: int | None = None) -> list[Schema]:
+        if isinstance(items, list):
+            if index is None:  # wildcard/slice
+                return items
+            if 0 <= index < len(items):
+                return [items[index]]
+            return []
+        if isinstance(items, Mapping):
+            return [items]
+        return []
+
+    states = [schema]
+    for tok in iter_tokens(expr):
+        next_states: list[Schema] = []
+        for st in states:
+            next_states.extend(descend(st, tok))
+        if not next_states:  # nothing matched this segment
+            return False
+        states = next_states
+    return True  # every segment found at least one schema
+
+
+def iter_tokens(node: JSONPath) -> Iterator[str | int | None]:
+    """Linearise a jsonpath-ng AST into a stream of tokens we care about."""
+    if hasattr(node, "left"):  # Child, Descendants, etc.
+        yield from iter_tokens(node.left)
+        yield from iter_tokens(node.right)
+    elif isinstance(node, Fields):
+        yield from node.fields  # e.g. ["foo"]
+    elif isinstance(node, Index):
+        yield node.index  # 0 / -1 / None for wildcard
+    elif isinstance(node, Slice):
+        yield None  # treat any slice as wildcard
+
+
+COMBINATORS = ("anyOf", "oneOf", "allOf")
+
+
+def _expand_union(sch: Schema) -> list[Schema]:
+    """Return sch itself or the list of subschemas if it is a combinator."""
+    for key in COMBINATORS:
+        if key in sch:
+            subs: list[Schema] = []
+            for sub in sch[key]:
+                # a sub-schema might itself be an anyOf/oneOf/allOf
+                subs.extend(_expand_union(sub))
+            return subs
+    return [sch]
+
+
+UNSUPPORTED: tuple[type[JSONPath], ...] = (
+    Filter,  # [?foo > 0]
+    Where,  # .foo[(@.bar < 42)]
+    WhereNot,
+    Slice,  # [1:5] (wildcard “[*]” is Index/None, not Slice)
+)
+
+
+def find_unsupported(node: JSONPath) -> list[type[JSONPath]]:
+    """Return a list of node types present in `node` that we do not validate."""
+    bad: list[type[JSONPath]] = []
+    stack: list[JSONPath] = [node]
+    while stack:
+        n = stack.pop()
+        if isinstance(n, UNSUPPORTED):
+            bad.append(type(n))
+        # Drill into children (jsonpath-ng uses .left / .right / .child attributes)
+        for attr in ("left", "right", "child", "expression"):
+            stack.extend(
+                [getattr(n, attr)]
+                if hasattr(n, attr) and isinstance(getattr(n, attr), JSONPath)
+                else []
+            )
+        # handle containers like Fields(fields=[...]) and Index(index=[...])
+        if hasattr(n, "__dict__"):
+            for v in n.__dict__.values():
+                if isinstance(v, list):
+                    stack.extend(x for x in v if isinstance(x, JSONPath))
+    return bad
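A quick way to read these helpers together: `resolved_schema` inlines a pydantic model's `$ref`s, and `jsonpath_in_schema` walks a parsed JSONPath against the result. A minimal sketch, assuming the new module is importable from the path shown in the file list; the `Sample` model and paths below are hypothetical, not from the package:

```python
from inspect_ai.analysis.beta._dataframe.validate import (  # assumed import path
    jsonpath_in_schema,
    resolved_schema,
)
from jsonpath_ng import parse
from pydantic import BaseModel

class Sample(BaseModel):  # hypothetical model for illustration
    id: str
    scores: dict[str, float]

schema = resolved_schema(Sample)
assert jsonpath_in_schema(parse("id"), schema)                 # declared property
assert jsonpath_in_schema(parse("scores.accuracy"), schema)    # open dict of floats
assert not jsonpath_in_schema(parse("missing_field"), schema)  # no such path
```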
inspect_ai/log/_file.py CHANGED
@@ -524,7 +524,7 @@ def manifest_eval_log_name(info: EvalLogInfo, log_dir: str, sep: str) -> str:
 
 def log_files_from_ls(
     ls: list[FileInfo],
-    formats: list[Literal["eval", "json"]] | None,
+    formats: list[Literal["eval", "json"]] | None = None,
     descending: bool = True,
 ) -> list[EvalLogInfo]:
     extensions = [f".{format}" for format in (formats or ALL_LOG_FORMATS)]
inspect_ai/log/_log.py CHANGED
@@ -17,9 +17,11 @@ from pydantic import (
 )
 from rich.console import Console, RenderableType
 from rich.traceback import Traceback
+from shortuuid import uuid
 
-from inspect_ai._util.constants import CONSOLE_DISPLAY_WIDTH, PKG_NAME
+from inspect_ai._util.constants import CONSOLE_DISPLAY_WIDTH, DESERIALIZING, PKG_NAME
 from inspect_ai._util.error import EvalError, exception_message
+from inspect_ai._util.hash import base57_id_hash
 from inspect_ai._util.logger import warn_once
 from inspect_ai.approval._policy import ApprovalPolicyConfig
 from inspect_ai.dataset._dataset import MT, metadata_as
@@ -677,6 +679,9 @@ class EvalModelConfig(BaseModel):
 class EvalSpec(BaseModel):
     """Eval target and configuration."""
 
+    eval_id: str = Field(default_factory=str)
+    """Globally unique id for eval."""
+
     run_id: str = Field(default_factory=str)
     """Unique run id"""
 
@@ -757,6 +762,21 @@ class EvalSpec(BaseModel):
     # allow field model_args
     model_config = ConfigDict(protected_namespaces=())
 
+    def model_post_init(self, __context: Any) -> None:
+        # check if deserializing
+        is_deserializing = isinstance(__context, dict) and __context.get(
+            DESERIALIZING, False
+        )
+
+        # Generate eval_id if needed
+        if self.eval_id == "":
+            if is_deserializing:
+                # we want the eval_id to be stable across reads of the eval log so we compose it
+                # as a hash that matches the size/appearance of shortuuid-based uuids
+                self.eval_id = base57_id_hash(self.run_id + self.task_id + self.created)
+            else:
+                self.eval_id = uuid()
+
     @model_validator(mode="before")
     @classmethod
     def read_sandbox_spec(
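The `model_post_init` above either mints a fresh shortuuid or, when deserializing, derives a deterministic id so repeated reads of the same log agree. A sketch of the idea (a hypothetical stand-in, not the package's `base57_id_hash`): hash the identifying fields and render the digest in shortuuid's 57-character alphabet.

```python
# Hypothetical stand-in for inspect_ai._util.hash.base57_id_hash: a
# sha256 digest rendered in shortuuid's alphabet (no 0/O/1/l/I), sized
# like a 22-character shortuuid.
import hashlib

ALPHABET = "23456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz"  # 57 chars

def stable_id(text: str, length: int = 22) -> str:
    n = int.from_bytes(hashlib.sha256(text.encode()).digest(), "big")
    out: list[str] = []
    while len(out) < length:
        n, rem = divmod(n, len(ALPHABET))
        out.append(ALPHABET[rem])
    return "".join(reversed(out))

# same (run_id, task_id, created) inputs -> same id on every read
a = stable_id("run_abc" + "task_def" + "2025-05-07T10:00:00")
b = stable_id("run_abc" + "task_def" + "2025-05-07T10:00:00")
assert a == b and len(a) == 22
```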
inspect_ai/model/_call_tools.py CHANGED
@@ -39,6 +39,7 @@ from inspect_ai._util.content import (
     ContentText,
     ContentVideo,
 )
+from inspect_ai._util.exception import TerminateSampleError
 from inspect_ai._util.format import format_function_call
 from inspect_ai._util.logger import warn_once
 from inspect_ai._util.registry import registry_unqualified_name
@@ -376,7 +377,7 @@ async def call_tool(
             transcript()._event(
                 SampleLimitEvent(type="operator", limit=1, message=message)
             )
-            raise LimitExceededError("operator", value=1, limit=1, message=message)
+            raise TerminateSampleError(message)
         else:
             raise ToolApprovalError(approval.explanation if approval else None)
     if approval and approval.modified:
inspect_ai/model/_model.py CHANGED
@@ -1237,9 +1237,10 @@ def tool_result_images_as_user_message(
 
     Tool responses will have images replaced with "Image content is included below.", and the new user message will contain the images.
     """
-    init_accum: ImagesAccumulator = ([], [], [])
     chat_messages, user_message_content, tool_call_ids = functools.reduce(
-        tool_result_images_reducer, messages, init_accum
+        tool_result_images_reducer,
+        messages,
+        (list[ChatMessage](), list[Content](), list[str]()),
     )
     # if the last message was a tool result, we may need to flush the pending stuff here
     return maybe_adding_user_message(chat_messages, user_message_content, tool_call_ids)
@@ -1265,9 +1266,10 @@ def tool_result_images_reducer(
         and isinstance(message.content, list)
         and any([isinstance(c, ContentImage) for c in message.content])
     ):
-        init_accum: ImageContentAccumulator = ([], [])
        new_user_message_content, edited_tool_message_content = functools.reduce(
-            tool_result_image_content_reducer, message.content, init_accum
+            tool_result_image_content_reducer,
+            message.content,
+            (list[Content](), list[Content]()),
        )
 
        return (
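Both hunks replace a separately annotated `init_accum` with inline typed empty lists. A standalone sketch of the pattern (the names below are hypothetical, not library code): `list[str]()` constructs an ordinary empty list at runtime while carrying the element type for type checkers, so the `functools.reduce` seed needs no extra variable.

```python
import functools

def split_by_length(
    acc: tuple[list[str], list[str]], word: str
) -> tuple[list[str], list[str]]:
    short, long = acc
    (short if len(word) <= 4 else long).append(word)
    return (short, long)

short, long = functools.reduce(
    split_by_length,
    ["tool", "message", "image", "id"],
    (list[str](), list[str]()),  # typed empty accumulators, as in the diff
)
assert short == ["tool", "id"] and long == ["message", "image"]
```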
inspect_ai/model/_openai_responses.py CHANGED
@@ -184,24 +184,23 @@ def openai_responses_chat_choices(
 # │ │ ┌───────────────────┐ │ │ │ │ ┌───────────────────┐ │ │ │ │ ┌───────────────────┐ │ │
 # │ │ │ type: "reasoning" │ │ │ │ │ │ ContentText │ │ │ │ │ │ type: "reasoning" │ │ │
 # │ │ │ id: "rs_bbbbbb" │ │ │ │ │ │ text: "" │ │ │ │ │ │ id: "rs_bbbbbb" │ │ │
-# │ │ │ summary: [] │ │ │ │ │ └───────────────────┘ │ │ │ │ │ summary: [] │ │ │
-# │ │ └───────────────────┘ │ │ │ │ ┌───────────────────┐ │ │ │ │ ┌───────────────────┐ │ │
-# │ │ ┌───────────────────┐ │ │ │ │ │ ContentText │ │ │ │ │ │ type: "message" │ │ │
-# │ │ │ type: "message" │ │ │ │ │ text: "text1" │ │ │ │ │ id: "msg_ccccccc" │ │ │
-# │ │ │ id: "msg_ccccccc" │ │ │ │ │ └───────────────────┘ │ │ │ │ │ role: "assistant" │ │ │
-# │ │ │ role: "assistant" │ │ │--->│┌───────────────────┐│--->│ │ │ ┌───────────────┐ │ │ │
-# │ │ │ ┌───────────────┐ │ │ │ │ │ ContentText │ │ │ │ │ │ Content │ │ │ │
-# │ │ │ │ Content │ │ │ │ │ text: "text2" │ │ │ │ │ │ │ ┌───────────┐ │ │ │ │
-# │ │ │ │ ┌───────────┐ │ │ │ │ │ └───────────────────────┘ │ │ │ │ │ │"text1" │ │ │ │ │
-# │ │ │ │ │"text1" │ │ │ │ │ │ ┌───────────────────────┐ │ │ │ │ │ └───────────┘ │ │ │ │
-# │ │ │ │ └───────────┘ │ │ │ │ │ │ internal │ │ │ │ │ │ ┌───────────┐ │ │ │ │
-# │ │ │ │ ┌───────────┐ │ │ │ │ │ │ ┌───────────────────┐ │ │ │ │ │ │ "text2" │ │ │ │
-# │ │ │ "text2" │ │ │ │ │ │ │ │ reasoning_id: │ │ │ │ │ │ └───────────┘ │ │ │
-# │ │ └───────────┘ │ │ │ │ │ "rs_bbbbbb" │ │ │ │ └───────────────┘ │ │
-# │ └───────────────┘ │ │ │ │ │ └───────────────────┘ │ │ │ └───────────────────┘
-# │ │ └───────────────────┘ │ │ ┌───────────────────┐ │ └───────────────────────┘ │
-# └───────────────────────┘ │ │ output_msg_id: │ │ │ └───────────────────────────┘
-# └───────────────────────────┘ │ │ │ "msg_ccccccc" │ │ │
+# │ │ │ summary: [] │ │ │ │ │ ├───────────────────┤ │ │ │ │ │ summary: [] │ │ │
+# │ │ ├───────────────────┤ │ │ │ │ ContentText │ │ │ │ ├───────────────────┤ │ │
+# │ │ type: "message" │ │ │ │ │ text: "text1" │ │ │ │ │ │ type: "message" │ │ │
+# │ │ │ id: "msg_ccccccc" │ │ │ │ │ ├───────────────────┤ │ │ │ │ │ id: "msg_ccccccc" │ │ │
+# │ │ │ role: "assistant" │ │ │ │ │ ContentText │ │ │ │ │ role: "assistant" │ │ │
+# │ │ │ ┌───────────────┐ │ │ ->text: "text2" │ │ │ -> │ │ │ ┌───────────────┐ │ │ │
+# │ │ │ Content │ │ │ │ │ └───────────────────┘ │ │ │ │ │ │ Content │ │ │ │
+# │ │ │ │ ┌───────────┐ │ │ │ │ │ └───────────────────────┘ │ │ │ │ │ ┌───────────┐ │ │ │ │
+# │ │ │ │ │"text1" │ │ │ │ │ │ ┌───────────────────────┐ │ │ │ │ │ │"text1" │ │ │ │ │
+# │ │ │ │ ├───────────┤ │ │ │ │ │ internal │ │ │ │ ├───────────┤ │ │ │ │
+# │ │ │ │ │"text2" │ │ │ │ │ │ │ ┌───────────────────┐ │ │ │ │ │ │ │"text2" │ │ │ │ │
+# │ │ │ │ └───────────┘ │ │ │ │ │ │ reasoning_id: │ │ │ │ │ │ └───────────┘ │ │ │ │
+# │ │ │ └───────────────┘ │ │ │ │ │ │ "rs_bbbbbb" │ │ │ │ │ │ └───────────────┘ │ │ │
+# │ │ └───────────────────┘ │ │ │ │ └───────────────────┘ │ │ │ │ └───────────────────┘ │ │
+# │ └───────────────────────┘ │ │ │ ┌───────────────────┐ │ │ │ └───────────────────────┘
+# └───────────────────────────┘ │ │ │ output_msg_id: │ │ │ └───────────────────────────┘
+# │ │ │ "msg_ccccccc" │ │ │
 # │ │ └───────────────────┘ │ │
 # │ └───────────────────────┘ │
 # └───────────────────────────┘
inspect_ai/model/_providers/anthropic.py CHANGED
@@ -33,7 +33,10 @@ from anthropic.types import (
     ToolUseBlockParam,
     message_create_params,
 )
-from anthropic.types.beta import BetaToolComputerUse20250124Param
+from anthropic.types.beta import (
+    BetaToolComputerUse20250124Param,
+    BetaToolTextEditor20241022Param,
+)
 from pydantic import JsonValue
 from typing_extensions import override
 
@@ -218,6 +221,8 @@ class AnthropicAPI(ModelAPI):
             # tools are generally available for Claude 3.5 Sonnet (new) as well and
             # can be used without the computer use beta header.
             betas.append("computer-use-2025-01-24")
+        if any("20241022" in str(tool.get("type", "")) for tool in tools_param):
+            betas.append("computer-use-2024-10-22")
         if len(betas) > 0:
             extra_headers["anthropic-beta"] = ",".join(betas)
 
@@ -337,6 +342,15 @@ class AnthropicAPI(ModelAPI):
     @override
     def should_retry(self, ex: Exception) -> bool:
         if isinstance(ex, APIStatusError):
+            # for unknown reasons, anthropic does not always set status_code == 529
+            # for "overloaded_error" so we check for it explicitly
+            if (
+                isinstance(ex.body, dict)
+                and ex.body.get("error", {}).get("type", "") == "overloaded_error"
+            ):
+                return True
+
+            # standard http status code checking
             return is_retryable_http_status(ex.status_code)
         elif httpx_should_retry(ex):
             return True
@@ -545,7 +559,7 @@ class AnthropicAPI(ModelAPI):
 
     def text_editor_tool_param(
         self, tool: ToolInfo
-    ) -> Optional[ToolTextEditor20250124Param]:
+    ) -> ToolTextEditor20250124Param | BetaToolTextEditor20241022Param | None:
         # check for compatible 'text editor' tool
         if tool.name == "text_editor" and (
             sorted(tool.parameters.properties.keys())
@@ -561,8 +575,14 @@ class AnthropicAPI(ModelAPI):
                 ]
             )
         ):
-            return ToolTextEditor20250124Param(
-                type="text_editor_20250124", name="str_replace_editor"
+            return (
+                BetaToolTextEditor20241022Param(
+                    type="text_editor_20241022", name="str_replace_editor"
+                )
+                if self.is_claude_3_5()
+                else ToolTextEditor20250124Param(
+                    type="text_editor_20250124", name="str_replace_editor"
+                )
             )
         # not a text_editor tool
         else:
@@ -571,7 +591,10 @@ class AnthropicAPI(ModelAPI):
 
 # tools can be either a stock tool param or a special Anthropic native use tool param
 ToolParamDef = (
-    ToolParam | BetaToolComputerUse20250124Param | ToolTextEditor20250124Param
+    ToolParam
+    | BetaToolComputerUse20250124Param
+    | ToolTextEditor20250124Param
+    | BetaToolTextEditor20241022Param
 )
 
 
@@ -580,6 +603,7 @@ def add_cache_control(
     | ToolParam
     | BetaToolComputerUse20250124Param
     | ToolTextEditor20250124Param
+    | BetaToolTextEditor20241022Param
     | dict[str, Any],
 ) -> None:
     cast(dict[str, Any], param)["cache_control"] = {"type": "ephemeral"}
@@ -844,6 +868,7 @@ def _names_for_tool_call(
     """
     mappings = (
         (INTERNAL_COMPUTER_TOOL_NAME, "computer_20250124", "computer"),
+        ("str_replace_editor", "text_editor_20241022", "text_editor"),
         ("str_replace_editor", "text_editor_20250124", "text_editor"),
         ("bash", "bash_20250124", "bash_session"),
     )
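A runnable restatement of the retry predicate added to `should_retry` above, free of the SDK (`is_overloaded` is a hypothetical helper, not part of the package): an `overloaded_error` in the response body triggers a retry even when the HTTP status code is not 529.

```python
def is_overloaded(body: object) -> bool:
    # mirrors the check in the diff: only dict bodies are inspected
    return (
        isinstance(body, dict)
        and body.get("error", {}).get("type", "") == "overloaded_error"
    )

assert is_overloaded({"error": {"type": "overloaded_error"}})
assert not is_overloaded({"error": {"type": "invalid_request_error"}})
assert not is_overloaded("Overloaded")  # non-dict bodies are ignored
```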
inspect_ai/model/_providers/providers.py CHANGED
@@ -281,7 +281,7 @@ def none() -> type[ModelAPI]:
 def validate_openai_client(feature: str) -> None:
     FEATURE = feature
     PACKAGE = "openai"
-    MIN_VERSION = "1.75.0"
+    MIN_VERSION = "1.78.0"
 
     # verify we have the package
     try:
inspect_ai/solver/_multiple_choice.py CHANGED
@@ -200,6 +200,7 @@ def multiple_choice(
     template: str | None = None,
     cot: bool = False,
     multiple_correct: bool = False,
+    max_tokens: int | None = None,
     **kwargs: Unpack[DeprecatedArgs],
 ) -> Solver:
     """Multiple choice question solver. Formats a multiple choice question prompt, then calls `generate()`.
@@ -226,6 +227,8 @@ def multiple_choice(
             squares? A) 3, B) 4, C) 9" has multiple correct answers, B and C. Leave
             as `False` if there's exactly one correct answer from the choices
             available. NOTE: this has no effect if you provide a custom template.
+        max_tokens: Default `None`. Controls the number of tokens generated through the call
+            to generate().
         **kwargs (Any): Deprecated arguments for backward compatibility.
 
     #### Shuffling
@@ -282,7 +285,7 @@ def multiple_choice(
            template=str(template),
        )
 
-        state = await generate(state)
+        state = await generate(state, max_tokens=max_tokens)
 
        answers = parse_answers(state)
        if answers and answers.group(1):
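A hedged usage sketch for the new parameter: the cap applies only to this solver's `generate()` call, leaving the model's default limit in place elsewhere.

```python
from inspect_ai.solver import multiple_choice

# cap generation for the multiple choice turn at 1024 tokens
solver = multiple_choice(cot=True, max_tokens=1024)
```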
inspect_ai/solver/_task_state.py CHANGED
@@ -204,13 +204,17 @@ class TaskState:
         Convenience function for accessing the initial input from the `Sample` as a string.
 
         If the `input` is a `list[ChatMessage]`, this will return the text from
-        the first chat message
+        the last chat message
         """
         if isinstance(self._input, str):
             return self._input
         else:
             input = next(
-                (message.text for message in self._input if message.role == "user"),
+                (
+                    message.text
+                    for message in reversed(self._input)
+                    if message.role == "user"
+                ),
                 None,
             )
             if input:
@@ -231,7 +235,7 @@ class TaskState:
         write access to the user chat prompt. Raises an
         exception if there is no user prompt
         """
-        prompt = next((m for m in self.messages if m.role == "user"), None)
+        prompt = next((m for m in reversed(self.messages) if m.role == "user"), None)
         if prompt:
             return prompt
         else:
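A plain-data illustration (not the library's `ChatMessage` types) of the semantic change in both hunks: the accessors now pick the last user message rather than the first.

```python
messages = [
    {"role": "user", "content": "original question"},
    {"role": "assistant", "content": "clarifying answer"},
    {"role": "user", "content": "follow-up question"},
]
# reversed() + next() returns the most recent user message
prompt = next((m for m in reversed(messages) if m["role"] == "user"), None)
assert prompt is not None and prompt["content"] == "follow-up question"
```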
inspect_ai/tool/_mcp/_context.py CHANGED
@@ -2,13 +2,11 @@ from contextlib import _AsyncGeneratorContextManager
 from typing import TypeAlias
 
 from anyio.streams.memory import MemoryObjectReceiveStream, MemoryObjectSendStream
-from mcp.types import (
-    JSONRPCMessage,
-)
+from mcp.shared.message import SessionMessage
 
 MCPServerContext: TypeAlias = _AsyncGeneratorContextManager[
     tuple[
-        MemoryObjectReceiveStream[JSONRPCMessage | Exception],
-        MemoryObjectSendStream[JSONRPCMessage],
+        MemoryObjectReceiveStream[SessionMessage | Exception],
+        MemoryObjectSendStream[SessionMessage],
     ],
 ]
inspect_ai/tool/_mcp/server.py CHANGED
@@ -102,7 +102,7 @@ def mcp_server_sandbox(
 def verfify_mcp_package() -> None:
     FEATURE = "MCP tools"
     PACKAGE = "mcp"
-    MIN_VERSION = "1.6.0"
+    MIN_VERSION = "1.8.0"
 
     # verify we have the package
     try:
inspect_ai/tool/_tools/_think.py CHANGED
@@ -41,7 +41,7 @@ def think(
 def think_tool_viewer() -> ToolCallViewer:
     def viewer(tool_call: ToolCall) -> ToolCallView:
         call = ToolCallContent(
-            format="markdown", content=tool_call.arguments["thought"]
+            format="markdown", content=tool_call.arguments.get("thought", "")
         )
         return ToolCallView(call=call)
 
inspect_ai/tool/_tools/_web_search/__init__.py ADDED
@@ -0,0 +1,3 @@
+from ._web_search import web_search
+
+__all__ = ["web_search"]
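With the tool now split into a package (`_google.py`, `_tavily.py`, `_web_search.py`), the public import path is unchanged. A hedged usage sketch; the task and dataset below are illustrative, not from the package:

```python
from inspect_ai import Task, task
from inspect_ai.dataset import Sample
from inspect_ai.solver import generate, use_tools
from inspect_ai.tool import web_search  # re-exported via the new package __init__

@task
def research() -> Task:
    return Task(
        dataset=[Sample(input="What is new in inspect_ai 0.3.95?")],
        solver=[use_tools(web_search()), generate()],
    )
```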