inspect-ai 0.3.60__py3-none-any.whl → 0.3.62__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/_cli/eval.py +13 -1
- inspect_ai/_cli/view.py +4 -0
- inspect_ai/_display/textual/widgets/transcript.py +15 -9
- inspect_ai/_eval/task/error.py +10 -14
- inspect_ai/_eval/task/generate.py +41 -35
- inspect_ai/_eval/task/run.py +20 -12
- inspect_ai/_util/hooks.py +17 -7
- inspect_ai/_util/transcript.py +11 -0
- inspect_ai/_view/www/dist/assets/index.css +1 -0
- inspect_ai/_view/www/dist/assets/index.js +100 -94
- inspect_ai/_view/www/log-schema.json +35 -19
- inspect_ai/_view/www/package.json +1 -1
- inspect_ai/_view/www/src/components/ChatView.mjs +23 -0
- inspect_ai/_view/www/src/types/log.d.ts +6 -4
- inspect_ai/log/_recorders/eval.py +1 -1
- inspect_ai/model/_chat_message.py +29 -2
- inspect_ai/model/_conversation.py +10 -3
- inspect_ai/model/_generate_config.py +6 -0
- inspect_ai/model/_model.py +164 -25
- inspect_ai/model/_openai.py +33 -1
- inspect_ai/model/_providers/anthropic.py +12 -3
- inspect_ai/model/_providers/groq.py +4 -0
- inspect_ai/model/_providers/openai.py +21 -9
- inspect_ai/model/_providers/providers.py +1 -1
- inspect_ai/model/_reasoning.py +17 -0
- inspect_ai/solver/__init__.py +2 -0
- inspect_ai/solver/_basic_agent.py +78 -58
- inspect_ai/{util → solver}/_limit.py +13 -0
- inspect_ai/solver/_task_state.py +37 -7
- inspect_ai/tool/_tools/_web_browser/_web_browser.py +3 -1
- inspect_ai/tool/beta/_computer/_resources/Dockerfile +5 -3
- inspect_ai/tool/beta/_computer/_resources/entrypoint/x11vnc_startup.sh +1 -1
- inspect_ai/tool/beta/_computer/_resources/image_home_dir/.config/Code/User/globalStorage/state.vscdb +0 -0
- inspect_ai/tool/beta/_computer/_resources/image_home_dir/.config/Code/User/settings.json +3 -0
- inspect_ai/tool/beta/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-panel.xml +61 -0
- inspect_ai/tool/beta/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-screensaver.xml +10 -0
- inspect_ai/tool/beta/_computer/_resources/image_home_dir/Desktop/Terminal.desktop +10 -0
- inspect_ai/util/__init__.py +0 -2
- inspect_ai/util/_sandbox/self_check.py +51 -28
- {inspect_ai-0.3.60.dist-info → inspect_ai-0.3.62.dist-info}/METADATA +2 -2
- {inspect_ai-0.3.60.dist-info → inspect_ai-0.3.62.dist-info}/RECORD +45 -40
- inspect_ai/tool/beta/_computer/_resources/image_home_dir/Desktop/XPaint.desktop +0 -10
- {inspect_ai-0.3.60.dist-info → inspect_ai-0.3.62.dist-info}/LICENSE +0 -0
- {inspect_ai-0.3.60.dist-info → inspect_ai-0.3.62.dist-info}/WHEEL +0 -0
- {inspect_ai-0.3.60.dist-info → inspect_ai-0.3.62.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.60.dist-info → inspect_ai-0.3.62.dist-info}/top_level.txt +0 -0
@@ -12,6 +12,7 @@ else:
|
|
12
12
|
|
13
13
|
from anthropic import (
|
14
14
|
APIConnectionError,
|
15
|
+
APIStatusError,
|
15
16
|
AsyncAnthropic,
|
16
17
|
AsyncAnthropicBedrock,
|
17
18
|
AsyncAnthropicVertex,
|
@@ -215,6 +216,17 @@ class AnthropicAPI(ModelAPI):
|
|
215
216
|
# return output and call
|
216
217
|
return output, model_call()
|
217
218
|
|
219
|
+
except APIStatusError as ex:
|
220
|
+
if ex.status_code == 413:
|
221
|
+
return ModelOutput.from_content(
|
222
|
+
model=self.model_name,
|
223
|
+
content=ex.message,
|
224
|
+
stop_reason="model_length",
|
225
|
+
error=ex.message,
|
226
|
+
), model_call()
|
227
|
+
else:
|
228
|
+
raise ex
|
229
|
+
|
218
230
|
except BadRequestError as ex:
|
219
231
|
return self.handle_bad_request(ex), model_call()
|
220
232
|
|
@@ -291,9 +303,6 @@ class AnthropicAPI(ModelAPI):
|
|
291
303
|
elif "content filtering" in error:
|
292
304
|
content = "Sorry, but I am unable to help with that request."
|
293
305
|
stop_reason = "content_filter"
|
294
|
-
else:
|
295
|
-
content = error
|
296
|
-
stop_reason = "unknown"
|
297
306
|
|
298
307
|
if content and stop_reason:
|
299
308
|
return ModelOutput.from_content(
|
@@ -294,8 +294,12 @@ def chat_tool_calls(message: Any, tools: list[ToolInfo]) -> Optional[List[ToolCa
|
|
294
294
|
|
295
295
|
|
296
296
|
def chat_message_assistant(message: Any, tools: list[ToolInfo]) -> ChatMessageAssistant:
|
297
|
+
reasoning = getattr(message, "reasoning", None)
|
298
|
+
if reasoning is not None:
|
299
|
+
reasoning = str(reasoning)
|
297
300
|
return ChatMessageAssistant(
|
298
301
|
content=message.content or "",
|
299
302
|
source="generate",
|
300
303
|
tool_calls=chat_tool_calls(message, tools),
|
304
|
+
reasoning=reasoning,
|
301
305
|
)
|
@@ -35,10 +35,12 @@ from .._model_output import (
|
|
35
35
|
StopReason,
|
36
36
|
)
|
37
37
|
from .._openai import (
|
38
|
-
|
38
|
+
is_gpt,
|
39
39
|
is_o1_full,
|
40
40
|
is_o1_mini,
|
41
41
|
is_o1_preview,
|
42
|
+
is_o3,
|
43
|
+
is_o_series,
|
42
44
|
openai_chat_messages,
|
43
45
|
openai_chat_tool_choice,
|
44
46
|
openai_chat_tools,
|
@@ -140,8 +142,8 @@ class OpenAIAPI(ModelAPI):
|
|
140
142
|
def is_azure(self) -> bool:
|
141
143
|
return self.service == "azure"
|
142
144
|
|
143
|
-
def
|
144
|
-
return
|
145
|
+
def is_o_series(self) -> bool:
|
146
|
+
return is_o_series(self.model_name)
|
145
147
|
|
146
148
|
def is_o1_full(self) -> bool:
|
147
149
|
return is_o1_full(self.model_name)
|
@@ -149,9 +151,15 @@ class OpenAIAPI(ModelAPI):
|
|
149
151
|
def is_o1_mini(self) -> bool:
|
150
152
|
return is_o1_mini(self.model_name)
|
151
153
|
|
154
|
+
def is_o3(self) -> bool:
|
155
|
+
return is_o3(self.model_name)
|
156
|
+
|
152
157
|
def is_o1_preview(self) -> bool:
|
153
158
|
return is_o1_preview(self.model_name)
|
154
159
|
|
160
|
+
def is_gpt(self) -> bool:
|
161
|
+
return is_gpt(self.model_name)
|
162
|
+
|
155
163
|
async def generate(
|
156
164
|
self,
|
157
165
|
input: list[ChatMessage],
|
@@ -258,7 +266,7 @@ class OpenAIAPI(ModelAPI):
|
|
258
266
|
model=self.model_name,
|
259
267
|
)
|
260
268
|
if config.max_tokens is not None:
|
261
|
-
if self.
|
269
|
+
if self.is_o_series():
|
262
270
|
params["max_completion_tokens"] = config.max_tokens
|
263
271
|
else:
|
264
272
|
params["max_tokens"] = config.max_tokens
|
@@ -273,10 +281,10 @@ class OpenAIAPI(ModelAPI):
|
|
273
281
|
if config.seed is not None:
|
274
282
|
params["seed"] = config.seed
|
275
283
|
if config.temperature is not None:
|
276
|
-
if self.
|
284
|
+
if self.is_o_series():
|
277
285
|
warn_once(
|
278
286
|
logger,
|
279
|
-
"
|
287
|
+
"o series models do not support the 'temperature' parameter (temperature is always 1).",
|
280
288
|
)
|
281
289
|
else:
|
282
290
|
params["temperature"] = config.temperature
|
@@ -293,9 +301,9 @@ class OpenAIAPI(ModelAPI):
|
|
293
301
|
params["logprobs"] = config.logprobs
|
294
302
|
if config.top_logprobs is not None:
|
295
303
|
params["top_logprobs"] = config.top_logprobs
|
296
|
-
if tools and config.parallel_tool_calls is not None and not self.
|
304
|
+
if tools and config.parallel_tool_calls is not None and not self.is_o_series():
|
297
305
|
params["parallel_tool_calls"] = config.parallel_tool_calls
|
298
|
-
if config.reasoning_effort is not None and self.
|
306
|
+
if config.reasoning_effort is not None and not self.is_gpt():
|
299
307
|
params["reasoning_effort"] = config.reasoning_effort
|
300
308
|
|
301
309
|
return params
|
@@ -312,7 +320,11 @@ class OpenAIAPI(ModelAPI):
|
|
312
320
|
stop_reason: StopReason | None = None
|
313
321
|
if e.code == "context_length_exceeded":
|
314
322
|
stop_reason = "model_length"
|
315
|
-
elif
|
323
|
+
elif (
|
324
|
+
e.code == "invalid_prompt" # seems to happen for o1/o3
|
325
|
+
or e.code == "content_policy_violation" # seems to happen for vision
|
326
|
+
or e.code == "content_filter" # seems to happen on azure
|
327
|
+
):
|
316
328
|
stop_reason = "content_filter"
|
317
329
|
|
318
330
|
if stop_reason:
|
@@ -0,0 +1,17 @@
|
|
1
|
+
import re
|
2
|
+
from typing import NamedTuple
|
3
|
+
|
4
|
+
|
5
|
+
class ContentWithReasoning(NamedTuple):
|
6
|
+
content: str
|
7
|
+
reasoning: str
|
8
|
+
|
9
|
+
|
10
|
+
def parse_content_with_reasoning(content: str) -> ContentWithReasoning | None:
|
11
|
+
match = re.match(r"\s*<think>(.*?)</think>(.*)", content, re.DOTALL)
|
12
|
+
if match:
|
13
|
+
return ContentWithReasoning(
|
14
|
+
content=match.group(2).strip(), reasoning=match.group(1).strip()
|
15
|
+
)
|
16
|
+
else:
|
17
|
+
return None
|
inspect_ai/solver/__init__.py
CHANGED
@@ -6,6 +6,7 @@ from ._chain import chain
|
|
6
6
|
from ._critique import self_critique
|
7
7
|
from ._fork import fork
|
8
8
|
from ._human_agent.agent import human_agent
|
9
|
+
from ._limit import SampleLimitExceededError
|
9
10
|
from ._multiple_choice import MultipleChoiceTemplate, multiple_choice
|
10
11
|
from ._plan import Plan, plan
|
11
12
|
from ._prompt import chain_of_thought, prompt_template, system_message, user_message
|
@@ -37,6 +38,7 @@ __all__ = [
|
|
37
38
|
"TaskState",
|
38
39
|
"Generate",
|
39
40
|
"MultipleChoiceTemplate",
|
41
|
+
"SampleLimitExceededError",
|
40
42
|
]
|
41
43
|
|
42
44
|
|
@@ -1,8 +1,9 @@
|
|
1
1
|
from logging import getLogger
|
2
|
-
from typing import Callable, cast
|
2
|
+
from typing import Awaitable, Callable, cast
|
3
3
|
|
4
4
|
from typing_extensions import TypedDict, Unpack
|
5
5
|
|
6
|
+
from inspect_ai._util._async import is_callable_coroutine
|
6
7
|
from inspect_ai.model._cache import CachePolicy
|
7
8
|
from inspect_ai.model._call_tools import call_tools
|
8
9
|
from inspect_ai.model._chat_message import ChatMessageTool, ChatMessageUser
|
@@ -13,6 +14,7 @@ from inspect_ai.solver._chain import chain
|
|
13
14
|
from inspect_ai.tool._tool import Tool, ToolResult, tool
|
14
15
|
from inspect_ai.tool._tool_with import tool_with
|
15
16
|
|
17
|
+
from ._limit import SampleLimitExceededError
|
16
18
|
from ._prompt import system_message
|
17
19
|
from ._solver import Generate, Solver, solver
|
18
20
|
from ._task_state import TaskState
|
@@ -57,7 +59,9 @@ def basic_agent(
|
|
57
59
|
max_tool_output: int | None = None,
|
58
60
|
score_value: ValueToFloat | None = None,
|
59
61
|
incorrect_message: str
|
60
|
-
| Callable[
|
62
|
+
| Callable[
|
63
|
+
[TaskState, list[Score]], str | Awaitable[str]
|
64
|
+
] = DEFAULT_INCORRECT_MESSAGE,
|
61
65
|
continue_message: str = DEFAULT_CONTINUE_MESSAGE,
|
62
66
|
submit_name: str = DEFAULT_SUBMIT_NAME,
|
63
67
|
submit_description: str = DEFAULT_SUBMIT_DESCRIPTION,
|
@@ -92,8 +96,9 @@ def basic_agent(
|
|
92
96
|
Defaults to max_tool_output from active GenerateConfig.
|
93
97
|
score_value (ValueToFloat): Function used to extract float from scores (defaults
|
94
98
|
to standard value_to_float())
|
95
|
-
incorrect_message (str | Callable[[TaskState, list[Score]], str]):
|
96
|
-
incorrect submission from the model. Alternatively,
|
99
|
+
incorrect_message (str | Callable[[TaskState, list[Score]], str | Awaitable[str]]):
|
100
|
+
User message reply for an incorrect submission from the model. Alternatively,
|
101
|
+
a function which returns a message (function may optionally be async)
|
97
102
|
continue_message (str): User message to urge the model to continue when it
|
98
103
|
doesn't make a tool call.
|
99
104
|
submit_name (str): Name for tool used to make submissions
|
@@ -167,61 +172,76 @@ def basic_agent(
|
|
167
172
|
# track attempts
|
168
173
|
attempts = 0
|
169
174
|
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
state.messages.append(state.output.message)
|
177
|
-
|
178
|
-
# check for context window overflow
|
179
|
-
if state.output.stop_reason == "model_length":
|
180
|
-
from inspect_ai.log._transcript import transcript
|
181
|
-
|
182
|
-
transcript().info("Agent terminated: model context window exceeded")
|
183
|
-
break
|
184
|
-
|
185
|
-
# resolve tools calls (if any)
|
186
|
-
if state.output.message.tool_calls:
|
187
|
-
# call tool functions
|
188
|
-
tool_results = await call_tools(
|
189
|
-
state.output.message, state.tools, max_output=max_tool_output
|
175
|
+
try:
|
176
|
+
# main loop (state.completed checks message_limit and token_limit)
|
177
|
+
while not state.completed:
|
178
|
+
# generate output and append assistant message
|
179
|
+
state.output = await get_model().generate(
|
180
|
+
input=state.messages, tools=state.tools, cache=cache
|
190
181
|
)
|
191
|
-
state.messages.
|
192
|
-
|
193
|
-
#
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
182
|
+
state.messages.append(state.output.message)
|
183
|
+
|
184
|
+
# check for context window overflow
|
185
|
+
if state.output.stop_reason == "model_length":
|
186
|
+
from inspect_ai.log._transcript import transcript
|
187
|
+
|
188
|
+
transcript().info(
|
189
|
+
"Agent terminated: model context window exceeded"
|
190
|
+
)
|
191
|
+
break
|
192
|
+
|
193
|
+
# resolve tools calls (if any)
|
194
|
+
if state.output.message.tool_calls:
|
195
|
+
# call tool functions
|
196
|
+
tool_results = await call_tools(
|
197
|
+
state.output.message,
|
198
|
+
state.tools,
|
199
|
+
max_output=max_tool_output,
|
200
|
+
)
|
201
|
+
state.messages.extend(tool_results)
|
202
|
+
|
203
|
+
# was an answer submitted?
|
204
|
+
answer = submission(tool_results)
|
205
|
+
if answer:
|
206
|
+
# set the output to the answer for scoring
|
207
|
+
state.output.completion = answer
|
208
|
+
|
209
|
+
# exit if we are at max_attempts
|
210
|
+
attempts += 1
|
211
|
+
if attempts >= max_attempts:
|
212
|
+
state.completed = True
|
213
|
+
break
|
214
|
+
|
215
|
+
# exit if the submission is successful
|
216
|
+
answer_scores = await score(state)
|
217
|
+
if score_value_fn(answer_scores[0].value) == 1.0:
|
218
|
+
state.completed = True
|
219
|
+
break
|
220
|
+
|
221
|
+
# otherwise notify the model that it was incorrect and continue
|
222
|
+
else:
|
223
|
+
if is_callable_coroutine(incorrect_message):
|
224
|
+
response_message: str = await incorrect_message(
|
225
|
+
state, answer_scores
|
226
|
+
) # type: ignore[misc,operator]
|
227
|
+
elif callable(incorrect_message):
|
228
|
+
response_message = cast(
|
229
|
+
str, incorrect_message(state, answer_scores)
|
230
|
+
)
|
231
|
+
else:
|
232
|
+
response_message = incorrect_message
|
233
|
+
|
234
|
+
state.messages.append(
|
235
|
+
ChatMessageUser(content=response_message)
|
236
|
+
)
|
237
|
+
|
238
|
+
# no tool calls, urge the model to continue
|
239
|
+
else:
|
240
|
+
state.messages.append(ChatMessageUser(content=continue_message))
|
241
|
+
|
242
|
+
# propagate current state along with sample limit exceeded
|
243
|
+
except SampleLimitExceededError as ex:
|
244
|
+
raise ex.with_state(state)
|
225
245
|
|
226
246
|
return state
|
227
247
|
|
@@ -1,5 +1,7 @@
|
|
1
1
|
from typing import Literal
|
2
2
|
|
3
|
+
from ._task_state import TaskState
|
4
|
+
|
3
5
|
|
4
6
|
class SampleLimitExceededError(Exception):
|
5
7
|
"""Exception raised when a sample limit is exceeded.
|
@@ -18,9 +20,20 @@ class SampleLimitExceededError(Exception):
|
|
18
20
|
value: int,
|
19
21
|
limit: int,
|
20
22
|
message: str | None = None,
|
23
|
+
state: TaskState | None = None,
|
21
24
|
) -> None:
|
22
25
|
self.type = type
|
23
26
|
self.value = value
|
24
27
|
self.limit = limit
|
25
28
|
self.message = f"Exceeded {type} limit: {limit:,}"
|
29
|
+
self.state = state
|
26
30
|
super().__init__(message)
|
31
|
+
|
32
|
+
def with_state(self, state: TaskState) -> "SampleLimitExceededError":
|
33
|
+
return SampleLimitExceededError(
|
34
|
+
self.type,
|
35
|
+
value=self.value,
|
36
|
+
limit=self.limit,
|
37
|
+
message=self.message,
|
38
|
+
state=state,
|
39
|
+
)
|
inspect_ai/solver/_task_state.py
CHANGED
@@ -22,7 +22,6 @@ from inspect_ai.scorer._metric import Score
|
|
22
22
|
from inspect_ai.scorer._target import Target
|
23
23
|
from inspect_ai.tool import Tool, ToolChoice
|
24
24
|
from inspect_ai.tool._tool_def import ToolDef
|
25
|
-
from inspect_ai.util._limit import SampleLimitExceededError
|
26
25
|
from inspect_ai.util._store import Store, store_jsonable
|
27
26
|
from inspect_ai.util._store_model import SMT
|
28
27
|
|
@@ -173,7 +172,7 @@ class TaskState:
|
|
173
172
|
self.metadata = metadata
|
174
173
|
"""Metadata from the `Sample` for this `TaskState`"""
|
175
174
|
|
176
|
-
self._messages: list[ChatMessage] = ChatMessageList(messages)
|
175
|
+
self._messages: list[ChatMessage] = ChatMessageList(messages, self)
|
177
176
|
"""
|
178
177
|
Chat conversation history for sample.
|
179
178
|
|
@@ -272,7 +271,7 @@ class TaskState:
|
|
272
271
|
@messages.setter
|
273
272
|
def messages(self, messages: list[ChatMessage]) -> None:
|
274
273
|
"""Set messages in chat history."""
|
275
|
-
self._messages = ChatMessageList(messages)
|
274
|
+
self._messages = ChatMessageList(messages, self)
|
276
275
|
|
277
276
|
@property
|
278
277
|
def max_messages(self) -> int | None:
|
@@ -319,8 +318,32 @@ class TaskState:
|
|
319
318
|
|
320
319
|
@property
|
321
320
|
def completed(self) -> bool:
|
322
|
-
"""Is the task completed.
|
323
|
-
|
321
|
+
"""Is the task completed.
|
322
|
+
|
323
|
+
Additionally, checks message and token limits and raises if they are exceeded.
|
324
|
+
"""
|
325
|
+
from inspect_ai.log._samples import set_active_sample_total_messages
|
326
|
+
|
327
|
+
from ._limit import SampleLimitExceededError
|
328
|
+
|
329
|
+
# update messages
|
330
|
+
set_active_sample_total_messages(len(self.messages))
|
331
|
+
|
332
|
+
if self._completed:
|
333
|
+
return True
|
334
|
+
elif self.message_limit and len(self.messages) >= self.message_limit:
|
335
|
+
raise SampleLimitExceededError(
|
336
|
+
"message",
|
337
|
+
value=len(self.messages),
|
338
|
+
limit=self.message_limit,
|
339
|
+
state=self,
|
340
|
+
)
|
341
|
+
elif self.token_limit and self.token_usage >= self.token_limit:
|
342
|
+
raise SampleLimitExceededError(
|
343
|
+
"token", value=self.token_usage, limit=self.token_limit, state=self
|
344
|
+
)
|
345
|
+
else:
|
346
|
+
return self._completed
|
324
347
|
|
325
348
|
@completed.setter
|
326
349
|
def completed(self, completed: bool) -> None:
|
@@ -403,7 +426,8 @@ def sample_jsonable(sample: Sample) -> dict[str, Any]:
|
|
403
426
|
|
404
427
|
|
405
428
|
class ChatMessageList(list[ChatMessage]):
|
406
|
-
def __init__(self, iterable: Iterable[ChatMessage]):
|
429
|
+
def __init__(self, iterable: Iterable[ChatMessage], parent_state: TaskState):
|
430
|
+
self.parent_state = parent_state
|
407
431
|
items, length = self._iterable_length(iterable)
|
408
432
|
self._check_size(length)
|
409
433
|
super().__init__(items)
|
@@ -411,12 +435,18 @@ class ChatMessageList(list[ChatMessage]):
|
|
411
435
|
def _check_size(self, additional_items: int = 1) -> None:
|
412
436
|
from inspect_ai.log._samples import active_sample_message_limit
|
413
437
|
|
438
|
+
from ._limit import SampleLimitExceededError
|
439
|
+
|
414
440
|
messages_limit = active_sample_message_limit()
|
415
441
|
if messages_limit is not None:
|
416
442
|
messages = len(self) + additional_items
|
417
443
|
if messages > messages_limit:
|
418
444
|
raise SampleLimitExceededError(
|
419
|
-
"message",
|
445
|
+
"message",
|
446
|
+
value=messages,
|
447
|
+
limit=messages_limit,
|
448
|
+
message=None,
|
449
|
+
state=self.parent_state,
|
420
450
|
)
|
421
451
|
|
422
452
|
def append(self, item: ChatMessage) -> None:
|
@@ -345,7 +345,9 @@ async def web_browser_cmd(cmd: str, *args: str) -> str:
|
|
345
345
|
if sandbox_env:
|
346
346
|
store = store_as(WebBrowserStore)
|
347
347
|
if not store.session_id:
|
348
|
-
result = await sandbox_env.exec(
|
348
|
+
result = await sandbox_env.exec(
|
349
|
+
["python3", WEB_CLIENT_NEW_SESSION], timeout=180
|
350
|
+
)
|
349
351
|
|
350
352
|
if not result.success:
|
351
353
|
raise RuntimeError(
|
@@ -33,8 +33,6 @@ RUN apt-get update && \
|
|
33
33
|
|
34
34
|
# Userland apt-get'able apps
|
35
35
|
RUN apt-get install -y --no-install-recommends \
|
36
|
-
# A simple image viewer.
|
37
|
-
xpaint \
|
38
36
|
# A calculator application.
|
39
37
|
galculator && \
|
40
38
|
apt-get clean
|
@@ -62,6 +60,10 @@ RUN apt-get install -y \
|
|
62
60
|
# configure noVNC
|
63
61
|
RUN ln -s /usr/share/novnc/vnc.html /usr/share/novnc/index.html
|
64
62
|
|
63
|
+
# configure python alias
|
64
|
+
RUN ln -s /usr/bin/python3 /usr/bin/python
|
65
|
+
|
66
|
+
|
65
67
|
# We copy requirements.txt by itself so that changes to the scripts will be in a later layer
|
66
68
|
# and we only pip install if requirements.txt changes
|
67
69
|
COPY tool/requirements.txt /opt/inspect/tool/requirements.txt
|
@@ -78,7 +80,7 @@ RUN useradd -m -s /bin/bash -d $HOME $USERNAME
|
|
78
80
|
RUN echo "${USERNAME} ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers
|
79
81
|
USER ${USERNAME}
|
80
82
|
WORKDIR $HOME
|
81
|
-
|
83
|
+
ADD --chown=$USERNAME:$USERNAME image_home_dir/ $HOME
|
82
84
|
|
83
85
|
# configure Firefox to skip all 'first run' UI
|
84
86
|
RUN mkdir -p $HOME/.mozilla/firefox-esr/profile.default && \
|
inspect_ai/tool/beta/_computer/_resources/image_home_dir/.config/Code/User/globalStorage/state.vscdb
ADDED
Binary file
|
@@ -0,0 +1,61 @@
|
|
1
|
+
<?xml version="1.0" encoding="UTF-8"?>
|
2
|
+
|
3
|
+
<channel name="xfce4-panel" version="1.0">
|
4
|
+
<property name="configver" type="int" value="2"/>
|
5
|
+
<property name="panels" type="array">
|
6
|
+
<value type="int" value="1"/>
|
7
|
+
<property name="dark-mode" type="bool" value="true"/>
|
8
|
+
<property name="panel-1" type="empty">
|
9
|
+
<property name="position" type="string" value="p=6;x=0;y=0"/>
|
10
|
+
<property name="length" type="uint" value="100"/>
|
11
|
+
<property name="position-locked" type="bool" value="true"/>
|
12
|
+
<property name="icon-size" type="uint" value="16"/>
|
13
|
+
<property name="size" type="uint" value="26"/>
|
14
|
+
<property name="plugin-ids" type="array">
|
15
|
+
<value type="int" value="1"/>
|
16
|
+
<value type="int" value="2"/>
|
17
|
+
<value type="int" value="3"/>
|
18
|
+
<value type="int" value="4"/>
|
19
|
+
<value type="int" value="5"/>
|
20
|
+
<value type="int" value="6"/>
|
21
|
+
<value type="int" value="8"/>
|
22
|
+
<value type="int" value="10"/>
|
23
|
+
<value type="int" value="11"/>
|
24
|
+
<value type="int" value="12"/>
|
25
|
+
<value type="int" value="13"/>
|
26
|
+
<value type="int" value="14"/>
|
27
|
+
</property>
|
28
|
+
</property>
|
29
|
+
</property>
|
30
|
+
<property name="plugins" type="empty">
|
31
|
+
<property name="plugin-1" type="string" value="applicationsmenu"/>
|
32
|
+
<property name="plugin-2" type="string" value="tasklist">
|
33
|
+
<property name="grouping" type="uint" value="1"/>
|
34
|
+
</property>
|
35
|
+
<property name="plugin-3" type="string" value="separator">
|
36
|
+
<property name="expand" type="bool" value="true"/>
|
37
|
+
<property name="style" type="uint" value="0"/>
|
38
|
+
</property>
|
39
|
+
<property name="plugin-4" type="string" value="pager"/>
|
40
|
+
<property name="plugin-5" type="string" value="separator">
|
41
|
+
<property name="style" type="uint" value="0"/>
|
42
|
+
</property>
|
43
|
+
<property name="plugin-6" type="string" value="systray">
|
44
|
+
<property name="square-icons" type="bool" value="true"/>
|
45
|
+
</property>
|
46
|
+
<property name="plugin-8" type="string" value="pulseaudio">
|
47
|
+
<property name="enable-keyboard-shortcuts" type="bool" value="true"/>
|
48
|
+
<property name="show-notifications" type="bool" value="true"/>
|
49
|
+
</property>
|
50
|
+
<property name="plugin-9" type="string" value="power-manager-plugin"/>
|
51
|
+
<property name="plugin-10" type="string" value="notification-plugin"/>
|
52
|
+
<property name="plugin-11" type="string" value="separator">
|
53
|
+
<property name="style" type="uint" value="0"/>
|
54
|
+
</property>
|
55
|
+
<property name="plugin-12" type="string" value="clock"/>
|
56
|
+
<property name="plugin-13" type="string" value="separator">
|
57
|
+
<property name="style" type="uint" value="0"/>
|
58
|
+
</property>
|
59
|
+
<property name="plugin-14" type="string" value="actions"/>
|
60
|
+
</property>
|
61
|
+
</channel>
|
@@ -0,0 +1,10 @@
|
|
1
|
+
<?xml version="1.0" encoding="UTF-8"?>
|
2
|
+
|
3
|
+
<channel name="xfce4-screensaver" version="1.0">
|
4
|
+
<property name="saver" type="empty">
|
5
|
+
<property name="mode" type="int" value="0" />
|
6
|
+
</property>
|
7
|
+
<property name="lock" type="empty">
|
8
|
+
<property name="enabled" type="bool" value="false" />
|
9
|
+
</property>
|
10
|
+
</channel>
|
inspect_ai/util/__init__.py
CHANGED
@@ -3,7 +3,6 @@ from inspect_ai._util.trace import trace_action, trace_message
|
|
3
3
|
from ._concurrency import concurrency
|
4
4
|
from ._console import input_screen
|
5
5
|
from ._display import DisplayType, display_type
|
6
|
-
from ._limit import SampleLimitExceededError
|
7
6
|
from ._panel import InputPanel, input_panel
|
8
7
|
from ._resource import resource
|
9
8
|
from ._sandbox import (
|
@@ -37,7 +36,6 @@ __all__ = [
|
|
37
36
|
"input_panel",
|
38
37
|
"input_screen",
|
39
38
|
"OutputLimitExceededError",
|
40
|
-
"SampleLimitExceededError",
|
41
39
|
"resource",
|
42
40
|
"subprocess",
|
43
41
|
"SandboxEnvironment",
|