inspect-ai 0.3.59-py3-none-any.whl → 0.3.61-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/_cli/eval.py +0 -8
- inspect_ai/_display/textual/widgets/samples.py +1 -1
- inspect_ai/_eval/eval.py +10 -1
- inspect_ai/_eval/loader.py +79 -19
- inspect_ai/_eval/registry.py +6 -0
- inspect_ai/_eval/score.py +2 -1
- inspect_ai/_eval/task/generate.py +41 -35
- inspect_ai/_eval/task/results.py +6 -5
- inspect_ai/_eval/task/run.py +21 -15
- inspect_ai/_util/hooks.py +17 -7
- inspect_ai/_view/www/dist/assets/index.js +262 -303
- inspect_ai/_view/www/package.json +1 -1
- inspect_ai/_view/www/src/App.mjs +6 -6
- inspect_ai/_view/www/src/Types.mjs +1 -1
- inspect_ai/_view/www/src/api/Types.ts +133 -0
- inspect_ai/_view/www/src/api/{api-browser.mjs → api-browser.ts} +25 -13
- inspect_ai/_view/www/src/api/api-http.ts +219 -0
- inspect_ai/_view/www/src/api/api-shared.ts +47 -0
- inspect_ai/_view/www/src/api/{api-vscode.mjs → api-vscode.ts} +22 -19
- inspect_ai/_view/www/src/api/{client-api.mjs → client-api.ts} +93 -53
- inspect_ai/_view/www/src/api/index.ts +51 -0
- inspect_ai/_view/www/src/api/jsonrpc.ts +225 -0
- inspect_ai/_view/www/src/components/DownloadButton.mjs +1 -1
- inspect_ai/_view/www/src/index.js +2 -2
- inspect_ai/_view/www/src/log/{remoteLogFile.mjs → remoteLogFile.ts} +62 -46
- inspect_ai/_view/www/src/navbar/Navbar.mjs +1 -1
- inspect_ai/_view/www/src/navbar/SecondaryBar.mjs +1 -1
- inspect_ai/_view/www/src/samples/SampleList.mjs +1 -1
- inspect_ai/_view/www/src/samples/SampleScores.mjs +1 -1
- inspect_ai/_view/www/src/samples/SamplesDescriptor.mjs +14 -14
- inspect_ai/_view/www/src/samples/SamplesTab.mjs +10 -10
- inspect_ai/_view/www/src/samples/tools/SortFilter.mjs +2 -2
- inspect_ai/_view/www/src/utils/{Json.mjs → json-worker.ts} +1 -3
- inspect_ai/_view/www/src/utils/vscode.ts +36 -0
- inspect_ai/_view/www/src/workspace/WorkSpace.mjs +1 -1
- inspect_ai/approval/_human/manager.py +1 -1
- inspect_ai/model/_call_tools.py +55 -0
- inspect_ai/model/_chat_message.py +2 -2
- inspect_ai/model/_conversation.py +1 -4
- inspect_ai/model/_generate_config.py +2 -8
- inspect_ai/model/_model.py +90 -25
- inspect_ai/model/_model_output.py +15 -0
- inspect_ai/model/_openai.py +383 -0
- inspect_ai/model/_providers/anthropic.py +52 -14
- inspect_ai/model/_providers/azureai.py +1 -1
- inspect_ai/model/_providers/goodfire.py +248 -0
- inspect_ai/model/_providers/groq.py +7 -3
- inspect_ai/model/_providers/hf.py +6 -0
- inspect_ai/model/_providers/mistral.py +2 -1
- inspect_ai/model/_providers/openai.py +36 -202
- inspect_ai/model/_providers/openai_o1.py +2 -4
- inspect_ai/model/_providers/providers.py +22 -0
- inspect_ai/model/_providers/together.py +4 -4
- inspect_ai/model/_providers/util/__init__.py +2 -3
- inspect_ai/model/_providers/util/hf_handler.py +1 -1
- inspect_ai/model/_providers/util/llama31.py +1 -1
- inspect_ai/model/_providers/util/util.py +0 -76
- inspect_ai/scorer/_metric.py +3 -0
- inspect_ai/scorer/_scorer.py +2 -1
- inspect_ai/solver/__init__.py +4 -0
- inspect_ai/solver/_basic_agent.py +65 -55
- inspect_ai/solver/_bridge/__init__.py +3 -0
- inspect_ai/solver/_bridge/bridge.py +100 -0
- inspect_ai/solver/_bridge/patch.py +170 -0
- inspect_ai/{util → solver}/_limit.py +13 -0
- inspect_ai/solver/_solver.py +6 -0
- inspect_ai/solver/_task_state.py +37 -7
- inspect_ai/tool/_tools/_web_browser/_web_browser.py +3 -1
- inspect_ai/tool/beta/_computer/_resources/Dockerfile +1 -3
- inspect_ai/tool/beta/_computer/_resources/entrypoint/x11vnc_startup.sh +1 -1
- inspect_ai/tool/beta/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-screensaver.xml +10 -0
- inspect_ai/util/__init__.py +0 -2
- inspect_ai/util/_display.py +5 -0
- inspect_ai/util/_sandbox/docker/prereqs.py +1 -1
- inspect_ai/util/_sandbox/self_check.py +51 -28
- {inspect_ai-0.3.59.dist-info → inspect_ai-0.3.61.dist-info}/METADATA +3 -2
- {inspect_ai-0.3.59.dist-info → inspect_ai-0.3.61.dist-info}/RECORD +81 -76
- inspect_ai/_view/www/src/api/Types.mjs +0 -117
- inspect_ai/_view/www/src/api/api-http.mjs +0 -300
- inspect_ai/_view/www/src/api/api-shared.mjs +0 -10
- inspect_ai/_view/www/src/api/index.mjs +0 -49
- inspect_ai/_view/www/src/api/jsonrpc.mjs +0 -208
- inspect_ai/_view/www/src/utils/vscode.mjs +0 -16
- inspect_ai/tool/beta/_computer/_resources/image_home_dir/Desktop/XPaint.desktop +0 -10
- {inspect_ai-0.3.59.dist-info → inspect_ai-0.3.61.dist-info}/LICENSE +0 -0
- {inspect_ai-0.3.59.dist-info → inspect_ai-0.3.61.dist-info}/WHEEL +0 -0
- {inspect_ai-0.3.59.dist-info → inspect_ai-0.3.61.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.59.dist-info → inspect_ai-0.3.61.dist-info}/top_level.txt +0 -0
inspect_ai/_cli/eval.py
CHANGED
@@ -314,12 +314,6 @@ def eval_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
     help="Sequences where the API will stop generating further tokens. The returned text will not contain the stop sequence.",
     envvar="INSPECT_EVAL_STOP_SEQS",
 )
-@click.option(
-    "--suffix",
-    type=str,
-    help="The suffix that comes after a completion of inserted text. OpenAI only.",
-    envvar="INSPECT_EVAL_SUFFIX",
-)
 @click.option(
     "--temperature",
     type=float,
@@ -439,7 +433,6 @@ def eval_command(
     logit_bias: str | None,
     seed: int | None,
     stop_seqs: str | None,
-    suffix: str | None,
     temperature: float | None,
     top_p: float | None,
     top_k: int | None,
@@ -599,7 +592,6 @@ def eval_set_command(
     logit_bias: str | None,
     seed: int | None,
     stop_seqs: str | None,
-    suffix: str | None,
     temperature: float | None,
     top_p: float | None,
     top_k: int | None,
inspect_ai/_eval/eval.py
CHANGED
@@ -35,7 +35,12 @@ from inspect_ai.scorer._reducer import reducer_log_names
 from inspect_ai.solver._chain import chain
 from inspect_ai.solver._solver import Solver, SolverSpec
 from inspect_ai.util import SandboxEnvironmentType
-from inspect_ai.util._display import …
+from inspect_ai.util._display import (
+    DisplayType,
+    display_type,
+    display_type_initialized,
+    init_display_type,
+)

 from .context import init_eval_context
 from .loader import ResolvedTask, resolve_tasks
@@ -306,6 +311,10 @@ async def eval_async(

     _eval_async_running = True

+    # if we are called outside of eval() then set display type to "plain"
+    if not display_type_initialized():
+        init_display_type("plain")
+
     # resolve model and task args
     model_args = resolve_args(model_args)
     task_args = resolve_args(task_args)
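With this change, calling `eval_async()` directly (rather than through `eval()`) now falls back to plain display output unless a display type was already initialized (e.g. by the CLI). A minimal sketch of that usage, assuming the public `eval_async` export and a hypothetical `arithmetic` task defined elsewhere with `@task`:

```python
# Sketch: running an eval from an existing asyncio program. Because eval_async()
# is invoked outside of eval(), display now defaults to "plain".
import asyncio

from inspect_ai import eval_async

from my_tasks import arithmetic  # hypothetical @task-decorated function


async def main() -> None:
    logs = await eval_async(arithmetic(), model="openai/gpt-4o-mini", limit=10)
    print(logs[0].status)


asyncio.run(main())
```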
inspect_ai/_eval/loader.py
CHANGED
@@ -1,5 +1,6 @@
 import ast
 import contextlib
+import inspect
 import os
 from dataclasses import dataclass, field
 from importlib.machinery import SourceFileLoader
@@ -9,11 +10,13 @@ from pathlib import Path
 from types import ModuleType
 from typing import Any, Callable, cast

+from typing_extensions import overload
+
 from inspect_ai._eval.task.util import task_file, task_run_dir
 from inspect_ai._util.decorator import parse_decorators
 from inspect_ai._util.error import PrerequisiteError
 from inspect_ai._util.logger import warn_once
-from inspect_ai._util.path import chdir_python
+from inspect_ai._util.path import chdir_python, cwd_relative_path
 from inspect_ai._util.registry import (
     RegistryInfo,
     is_registry_object,
@@ -23,6 +26,7 @@ from inspect_ai._util.registry import (
     registry_params,
 )
 from inspect_ai.model import Model, ModelName
+from inspect_ai.solver._bridge import bridge
 from inspect_ai.solver._solver import Solver, SolverSpec
 from inspect_ai.util import SandboxEnvironmentSpec, SandboxEnvironmentType
 from inspect_ai.util._sandbox.environment import resolve_sandbox_environment
@@ -334,6 +338,16 @@ def split_spec(spec: str) -> tuple[str, str | None]:
     return spec, None


+@overload
+def load_module(
+    module_path: Path, filter: Callable[[str], bool]
+) -> ModuleType | None: ...
+
+
+@overload
+def load_module(module_path: Path, filter: None = None) -> ModuleType: ...
+
+
 def load_module(
     module_path: Path, filter: Callable[[str], bool] | None = None
 ) -> ModuleType | None:
@@ -425,28 +439,74 @@ def solver_from_spec(spec: SolverSpec) -> Solver:
         else contextlib.nullcontext()
     )

+    # pretty solver name for error messages
+    pretty_solver_file = (
+        cwd_relative_path(solver_file.as_posix()) if solver_file else None
+    )
+
     with create_cm:
-        # if …
-        if solver_file is …
-        …
+        # if there is no solver file then just create from the registry by name
+        if solver_file is None:
+            if solver_name is None:
+                raise ValueError(f"Unable to resolve solver name from {spec.solver}")
+            return cast(Solver, registry_create("solver", solver_name, **spec.args))

-        …
+        # we do have a solver file
+        else:
+            # load the module and parse decorators
+            solver_module = load_module(solver_file)
+            decorators = parse_decorators(solver_file, "solver")
+
+            # if there is no solver_name see if we can discover it
             if solver_name is None:
-                …
+                if len(decorators) == 1:
+                    # decorator based solver
+                    solver_name = decorators[0][0]
+                elif len(decorators) == 0:
+                    # see if we can find an agent based solver
+                    functions = [
+                        function
+                        for function in inspect.getmembers(
+                            solver_module, inspect.isfunction
+                        )
+                        if function[1].__module__ == solver_module.__name__
+                    ]
+                    agent_functions = [
+                        function
+                        for function in functions
+                        if "agent" in function[0] and not function[0].startswith("_")
+                    ]
+                    if len(agent_functions) == 1:
+                        # agent based solver
+                        solver_name = agent_functions[0][0]
+
+                    elif len(agent_functions) == 0:
+                        raise PrerequisiteError(
+                            f"The source file {pretty_solver_file} does not contain any @solver functions or agent functions."
+                        )
+                    else:
+                        raise PrerequisiteError(
+                            f"The source file {pretty_solver_file} has more than one agent function (qualify which agent using e.g. '{solver_file.name}@agent_fn')"
+                        )
+                else:
                     raise PrerequisiteError(
-                        f"The source file { …
+                        f"The source file {pretty_solver_file} has more than one @solver function (qualify which solver using e.g. '{solver_file.name}@solver_fn')"
                     )
-                if len(solvers) > 1:
-                    raise PrerequisiteError(
-                        f"The source file {solver_file.as_posix()} has more than one @solver function (qualify which solver using file.py@solver)"
-                    )
-                solver_name = solvers[0][0]

-            …
+            # create decorator based solvers using the registry
+            if any(solver[0] == solver_name for solver in decorators):
+                return cast(Solver, registry_create("solver", solver_name, **spec.args))

-            …
+            # create agent based solvers by calling the function and wrapping it in bridge()
+            else:
+                agent_fn = getattr(solver_module, solver_name, None)
+                if inspect.isfunction(agent_fn):
+                    return bridge(agent_fn(**spec.args))
+                elif agent_fn is not None:
+                    raise PrerequisiteError(
+                        f"The object {solver_name} in file {pretty_solver_file} is not a Python function."
+                    )
+                else:
+                    raise PrerequisiteError(
+                        f"The function {solver_name} was not found in file {pretty_solver_file}."
+                    )
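With these changes, `--solver` can point at a source file that contains no `@solver` function at all: if the file defines exactly one function whose name contains "agent", the loader calls it with the solver args and wraps the result in `bridge()`. A sketch of such a file, under the assumption that the bridged agent receives a dict whose `input` field holds OpenAI-style messages and returns a dict with an `output` string (the exact contract is defined by `inspect_ai.solver.bridge()`):

```python
# my_agent.py -- hypothetical agent file usable via e.g.:
#   inspect eval task.py --solver my_agent.py -S temperature=0.2
from typing import Any


def my_agent(temperature: float = 0.0):
    async def run(sample: dict[str, Any]) -> dict[str, Any]:
        # assumption: sample["input"] is a list of OpenAI-style chat messages
        last_user = next(
            (m["content"] for m in reversed(sample["input"]) if m["role"] == "user"),
            "",
        )
        # assumption: the bridge expects a dict with an "output" string
        return {"output": f"echo: {last_user}"}

    return run
```

Because `my_agent` is the only function in the module whose name contains "agent", `solver_from_spec()` resolves it by name, calls `my_agent(**spec.args)`, and bridges the returned callable into a standard `Solver`.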
inspect_ai/_eval/registry.py
CHANGED
@@ -1,6 +1,7 @@
 import inspect
 import logging
 from copy import deepcopy
+from functools import wraps
 from pathlib import Path
 from typing import Any, Callable, TypeVar, cast, overload

@@ -125,6 +126,7 @@ def task(*args: Any, name: str | None = None, **attribs: Any) -> Any:
     params = list(inspect.signature(task_type).parameters.keys())

     # Create and return the wrapper function
+    @wraps(task_type)
     def wrapper(*w_args: Any, **w_kwargs: Any) -> Task:
         # Create the task
         task_instance = task_type(*w_args, **w_kwargs)
@@ -154,6 +156,10 @@ def task(*args: Any, name: str | None = None, **attribs: Any) -> Any:
         # Return the task instance
         return task_instance

+    # functools.wraps overrides the return type annotation of the inner function, so
+    # we explicitly set it again
+    wrapper.__annotations__["return"] = Task
+
     # Register the task and return the wrapper
     return task_register(
         task=cast(TaskType, wrapper), name=task_name, attribs=attribs, params=params
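With `@wraps(task_type)` applied, the wrapper returned by `@task` now carries the decorated function's metadata (name, docstring, signature) rather than appearing as a generic `wrapper`, and the return annotation is explicitly pinned back to `Task`. A small illustration with a hypothetical task:

```python
# Sketch: @task-decorated functions keep their own metadata after wrapping.
from inspect_ai import Task, task
from inspect_ai.dataset import Sample


@task
def arithmetic() -> Task:
    """Tiny arithmetic eval."""
    return Task(dataset=[Sample(input="What is 1+1?", target="2")])


print(arithmetic.__name__)  # "arithmetic" rather than "wrapper"
print(arithmetic.__doc__)   # "Tiny arithmetic eval."
```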
inspect_ai/_eval/score.py
CHANGED
@@ -5,7 +5,7 @@ from typing import Callable, cast
 from inspect_ai._display import display
 from inspect_ai._util.path import chdir_python
 from inspect_ai._util.platform import platform_init
-from inspect_ai._util.registry import registry_create
+from inspect_ai._util.registry import registry_create, registry_unqualified_name
 from inspect_ai.log import (
     EvalLog,
     EvalMetric,
@@ -185,6 +185,7 @@ async def run_score_task(
     results[scorer_name] = SampleScore(
         score=result,
         sample_id=state.sample_id,
+        scorer=registry_unqualified_name(scorer),
     )

     progress()
inspect_ai/_eval/task/generate.py
CHANGED
@@ -8,6 +8,7 @@ from inspect_ai.model import (
 )
 from inspect_ai.model._cache import epoch
 from inspect_ai.solver import TaskState
+from inspect_ai.solver._limit import SampleLimitExceededError
 from inspect_ai.tool import ToolFunction


@@ -21,45 +22,50 @@ async def task_generate(
     # track tool_choice (revert to "auto" after first forced call of a tool)
     tool_choice = state.tool_choice

-    …
+    try:
+        while True:
+            # If we don't update the epoch here as we go, it's entirely possible
+            # we'd cache the same response for every single epoch, which would
+            # completely defeat the point!
+            epoch.set(state.epoch)

-    …
+            # call the model
+            state.output = await model.generate(
+                input=state.messages,
+                tools=state.tools,
+                tool_choice=tool_choice,
+                config=config,
+                cache=cache,
+            )

-    …
+            # append the assistant message
+            message = state.output.message
+            state.messages.append(message)

-    …
+            # check for completed
+            if state.completed:
+                return state

-    …
+            # resolve tool calls if necessary
+            if tool_calls != "none" and message.tool_calls:
+                # call tools and append messages to state
+                state.messages.extend(
+                    await call_tools(message, state.tools, config.max_tool_output)
+                )

-    …
+                # check for completed or only executing a single tool call
+                if state.completed or tool_calls == "single":
+                    return state
+
+                # if a tool_call was forced set tool_choice to 'auto'
+                # (otherwise it will get forced over and over again)
+                if isinstance(tool_choice, ToolFunction):
+                    tool_choice = "auto"

-            # …
-            …
-            tool_choice = "auto"
+            # no tool calls or not resolving tool calls, we are done!
+            else:
+                return state

-    …
+    # propagate current state along with sample limit exceeded
+    except SampleLimitExceededError as ex:
+        raise ex.with_state(state)
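`task_generate()` now catches `SampleLimitExceededError` and re-raises it with the current `TaskState` attached via `ex.with_state(state)`, so a sample that hits a message, token, or time limit can still be scored on its partial progress. A custom solver can follow the same pattern; this sketch imports the error from the private module path shown in the diff:

```python
# Sketch: re-raise limit errors with the latest state attached (pattern assumed
# from the diff above; the import path is the private one it uses).
from inspect_ai.solver import Generate, Solver, TaskState, solver
from inspect_ai.solver._limit import SampleLimitExceededError


@solver
def generate_with_note() -> Solver:
    async def solve(state: TaskState, generate: Generate) -> TaskState:
        try:
            state = await generate(state)
            state.metadata["note"] = "completed within limits"
            return state
        except SampleLimitExceededError as ex:
            # propagate the most recent state so scoring can still see it
            raise ex.with_state(state)

    return solve
```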
inspect_ai/_eval/task/results.py
CHANGED
@@ -65,11 +65,12 @@ def eval_results(
     # extract scorers info from scorers then create scorers info for any
     # scores not already accounted for by a scorer name
     scorers_info = [ScorerInfo.from_scorer(scorer) for scorer in (scorers or [])]
-    scorer_names = …
-    for …
-    …
+    scorer_names = {info.name for info in scorers_info}
+    for sample_scores in scores:
+        for name, sample_score in sample_scores.items():
+            if sample_score.scorer is None and name not in scorer_names:
+                scorers_info.append(ScorerInfo.from_name(name))
+                scorer_names.add(name)

     # record scorer
     if len(scorers_info) > 0:
inspect_ai/_eval/task/run.py
CHANGED
@@ -27,8 +27,12 @@ from inspect_ai._util.constants import (
 from inspect_ai._util.datetime import iso_now
 from inspect_ai._util.error import exception_message
 from inspect_ai._util.hooks import send_telemetry
-from inspect_ai._util.registry import …
-…
+from inspect_ai._util.registry import (
+    is_registry_object,
+    registry_log_name,
+    registry_unqualified_name,
+)
+from inspect_ai._util.timeouts import Timeout, timeout
 from inspect_ai._view.notify import view_notify_eval
 from inspect_ai.dataset import Dataset, Sample
 from inspect_ai.log import (
@@ -71,9 +75,9 @@ from inspect_ai.scorer._scorer import unique_scorer_name
 from inspect_ai.solver import Generate, Plan, TaskState
 from inspect_ai.solver._chain import Chain, unroll
 from inspect_ai.solver._fork import set_task_generate
+from inspect_ai.solver._limit import SampleLimitExceededError
 from inspect_ai.solver._solver import Solver
 from inspect_ai.solver._task_state import sample_state, set_sample_state, state_jsonable
-from inspect_ai.util._limit import SampleLimitExceededError
 from inspect_ai.util._sandbox.context import sandbox_connections
 from inspect_ai.util._sandbox.environment import SandboxEnvironmentSpec
 from inspect_ai.util._subtask import init_subtask
@@ -398,7 +402,13 @@ async def task_run(options: TaskRunOptions) -> EvalLog:
         view_notify_eval(logger.location)

         try:
-            …
+            if (
+                await send_telemetry("eval_log_location", eval_log.location)
+                == "not_handled"
+            ):
+                # Converting the eval log to JSON is expensive. Only do so if
+                # eval_log_location was not handled.
+                await send_telemetry("eval_log", eval_log_json_str(eval_log))
         except Exception as ex:
             py_logger.warning(
                 f"Error occurred sending telemetry: {exception_message(ex)}"
@@ -646,26 +656,21 @@ async def task_run_sample(
            )

            # capture most recent state for scoring
-            state = sample_state() or state
+            state = ex.state or sample_state() or state
            state.completed = True

        except BaseException as ex:
            error = handle_error(ex)

-        # set timeout for scoring. if the original timeout was
-        # …
-        # timeout time. if the original timeout was hit we still want
-        # to provide an opportunity for scoring, but we don't necessarily
+        # set timeout for scoring. if the original timeout was hit we still
+        # want to provide opportunity for scoring, but we don't necessarily
        # want to wait the full timeout again (especially in the case where
        # the cause of the timeout is a hung container and scoring requires
        # interacting with the container). as a middle ground we use half
        # of the original timeout value for scoring.
        if isinstance(timeout_cm, Timeout):
-            …
-        else:
-            assert time_limit
-            timeout_cm = timeout(time_limit / 2)
+            assert time_limit
+            timeout_cm = timeout(time_limit / 2)

        # turn off sample limits
        set_active_sample_token_limit(None)
@@ -690,6 +695,7 @@ async def task_run_sample(
                sample_score = SampleScore(
                    score=score_result,
                    sample_id=sample.id,
+                    scorer=registry_unqualified_name(scorer),
                )
                transcript()._event(
                    ScoreEvent(
@@ -734,7 +740,7 @@ async def task_run_sample(
            error = handle_error(ex)

    # handle sandboxenv init errors
-    except …
+    except Exception as ex:
        error = handle_error(ex)

    # complete the sample
inspect_ai/_util/hooks.py
CHANGED
@@ -17,19 +17,29 @@ from .error import PrerequisiteError
 #
 # Telemetry can be optionally enabled by setting an INSPECT_TELEMETRY
 # environment variable that points to a function in a package which
-# conforms to the TelemetrySend signature below.
+# conforms to the TelemetrySend signature below. A return value of True
+# indicates that the telemetry event was handled.

-# There are currently …
-# - model_usage
-# - …
+# There are currently three types of telemetry sent:
+# - model_usage (JSON string of the model usage)
+# - eval_log_location (file path or URL string of the eval log)
+# - eval_log (JSON string of the eval log)
+#   [only sent if eval_log_location unhandled]
+# The eval_log_location type is preferred over eval_log as it means we can take
+# advantage of the .eval format and avoid loading the whole log into memory.

-TelemetrySend = Callable[[str, str], Awaitable[…]]
+TelemetrySend = Callable[[str, str], Awaitable[bool]]


-async def send_telemetry(…
+async def send_telemetry(
+    type: Literal["model_usage", "eval_log", "eval_log_location"], json: str
+) -> Literal["handled", "not_handled", "no_subscribers"]:
     global _send_telemetry
     if _send_telemetry:
-        await _send_telemetry(type, json)
+        if await _send_telemetry(type, json):
+            return "handled"
+        return "not_handled"
+    return "no_subscribers"


 _send_telemetry: TelemetrySend | None = None
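A telemetry subscriber (the function that the INSPECT_TELEMETRY environment variable points to) now returns a bool indicating whether it handled the event, and `send_telemetry()` reports "handled", "not_handled", or "no_subscribers"; as the task/run.py change above shows, the expensive `eval_log` JSON is only produced when `eval_log_location` goes unhandled. A sketch of a conforming subscriber (module and function names are hypothetical):

```python
# Sketch of a TelemetrySend-conforming subscriber, e.g. mypkg/telemetry.py,
# referenced by INSPECT_TELEMETRY (the exact env var value format is an assumption).
import logging

logger = logging.getLogger(__name__)


async def send(type: str, json: str) -> bool:
    if type == "eval_log_location":
        # Record just the log path/URL. Returning True marks the event as
        # handled, so inspect_ai skips sending the full "eval_log" JSON.
        logger.info("eval log written to %s", json)
        return True
    # Returning False leaves the event unhandled.
    return False
```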
|