inspect-ai 0.3.59__py3-none-any.whl → 0.3.60__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/_cli/eval.py +0 -7
- inspect_ai/_display/textual/widgets/samples.py +1 -1
- inspect_ai/_eval/eval.py +10 -1
- inspect_ai/_eval/loader.py +79 -19
- inspect_ai/_eval/registry.py +6 -0
- inspect_ai/_eval/score.py +2 -1
- inspect_ai/_eval/task/results.py +6 -5
- inspect_ai/_eval/task/run.py +11 -11
- inspect_ai/_view/www/dist/assets/index.js +262 -303
- inspect_ai/_view/www/src/App.mjs +6 -6
- inspect_ai/_view/www/src/Types.mjs +1 -1
- inspect_ai/_view/www/src/api/Types.ts +133 -0
- inspect_ai/_view/www/src/api/{api-browser.mjs → api-browser.ts} +25 -13
- inspect_ai/_view/www/src/api/api-http.ts +219 -0
- inspect_ai/_view/www/src/api/api-shared.ts +47 -0
- inspect_ai/_view/www/src/api/{api-vscode.mjs → api-vscode.ts} +22 -19
- inspect_ai/_view/www/src/api/{client-api.mjs → client-api.ts} +93 -53
- inspect_ai/_view/www/src/api/index.ts +51 -0
- inspect_ai/_view/www/src/api/jsonrpc.ts +225 -0
- inspect_ai/_view/www/src/components/DownloadButton.mjs +1 -1
- inspect_ai/_view/www/src/index.js +2 -2
- inspect_ai/_view/www/src/log/{remoteLogFile.mjs → remoteLogFile.ts} +62 -46
- inspect_ai/_view/www/src/navbar/Navbar.mjs +1 -1
- inspect_ai/_view/www/src/navbar/SecondaryBar.mjs +1 -1
- inspect_ai/_view/www/src/samples/SampleList.mjs +1 -1
- inspect_ai/_view/www/src/samples/SampleScores.mjs +1 -1
- inspect_ai/_view/www/src/samples/SamplesDescriptor.mjs +14 -14
- inspect_ai/_view/www/src/samples/SamplesTab.mjs +10 -10
- inspect_ai/_view/www/src/samples/tools/SortFilter.mjs +2 -2
- inspect_ai/_view/www/src/utils/{Json.mjs → json-worker.ts} +1 -3
- inspect_ai/_view/www/src/utils/vscode.ts +36 -0
- inspect_ai/_view/www/src/workspace/WorkSpace.mjs +1 -1
- inspect_ai/approval/_human/manager.py +1 -1
- inspect_ai/model/_call_tools.py +55 -0
- inspect_ai/model/_conversation.py +1 -4
- inspect_ai/model/_generate_config.py +2 -8
- inspect_ai/model/_model_output.py +15 -0
- inspect_ai/model/_openai.py +383 -0
- inspect_ai/model/_providers/anthropic.py +52 -11
- inspect_ai/model/_providers/azureai.py +1 -1
- inspect_ai/model/_providers/goodfire.py +248 -0
- inspect_ai/model/_providers/groq.py +7 -3
- inspect_ai/model/_providers/hf.py +6 -0
- inspect_ai/model/_providers/mistral.py +2 -1
- inspect_ai/model/_providers/openai.py +36 -202
- inspect_ai/model/_providers/openai_o1.py +2 -4
- inspect_ai/model/_providers/providers.py +22 -0
- inspect_ai/model/_providers/together.py +4 -4
- inspect_ai/model/_providers/util/__init__.py +2 -3
- inspect_ai/model/_providers/util/hf_handler.py +1 -1
- inspect_ai/model/_providers/util/llama31.py +1 -1
- inspect_ai/model/_providers/util/util.py +0 -76
- inspect_ai/scorer/_metric.py +3 -0
- inspect_ai/scorer/_scorer.py +2 -1
- inspect_ai/solver/__init__.py +2 -0
- inspect_ai/solver/_basic_agent.py +1 -1
- inspect_ai/solver/_bridge/__init__.py +3 -0
- inspect_ai/solver/_bridge/bridge.py +100 -0
- inspect_ai/solver/_bridge/patch.py +170 -0
- inspect_ai/solver/_solver.py +6 -0
- inspect_ai/util/_display.py +5 -0
- inspect_ai/util/_sandbox/docker/prereqs.py +1 -1
- {inspect_ai-0.3.59.dist-info → inspect_ai-0.3.60.dist-info}/METADATA +3 -2
- {inspect_ai-0.3.59.dist-info → inspect_ai-0.3.60.dist-info}/RECORD +68 -63
- inspect_ai/_view/www/src/api/Types.mjs +0 -117
- inspect_ai/_view/www/src/api/api-http.mjs +0 -300
- inspect_ai/_view/www/src/api/api-shared.mjs +0 -10
- inspect_ai/_view/www/src/api/index.mjs +0 -49
- inspect_ai/_view/www/src/api/jsonrpc.mjs +0 -208
- inspect_ai/_view/www/src/utils/vscode.mjs +0 -16
- {inspect_ai-0.3.59.dist-info → inspect_ai-0.3.60.dist-info}/LICENSE +0 -0
- {inspect_ai-0.3.59.dist-info → inspect_ai-0.3.60.dist-info}/WHEEL +0 -0
- {inspect_ai-0.3.59.dist-info → inspect_ai-0.3.60.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.59.dist-info → inspect_ai-0.3.60.dist-info}/top_level.txt +0 -0
inspect_ai/_cli/eval.py
CHANGED
@@ -314,12 +314,6 @@ def eval_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
|
|
314
314
|
help="Sequences where the API will stop generating further tokens. The returned text will not contain the stop sequence.",
|
315
315
|
envvar="INSPECT_EVAL_STOP_SEQS",
|
316
316
|
)
|
317
|
-
@click.option(
|
318
|
-
"--suffix",
|
319
|
-
type=str,
|
320
|
-
help="The suffix that comes after a completion of inserted text. OpenAI only.",
|
321
|
-
envvar="INSPECT_EVAL_SUFFIX",
|
322
|
-
)
|
323
317
|
@click.option(
|
324
318
|
"--temperature",
|
325
319
|
type=float,
|
@@ -439,7 +433,6 @@ def eval_command(
|
|
439
433
|
logit_bias: str | None,
|
440
434
|
seed: int | None,
|
441
435
|
stop_seqs: str | None,
|
442
|
-
suffix: str | None,
|
443
436
|
temperature: float | None,
|
444
437
|
top_p: float | None,
|
445
438
|
top_k: int | None,
|
inspect_ai/_eval/eval.py
CHANGED
@@ -35,7 +35,12 @@ from inspect_ai.scorer._reducer import reducer_log_names
|
|
35
35
|
from inspect_ai.solver._chain import chain
|
36
36
|
from inspect_ai.solver._solver import Solver, SolverSpec
|
37
37
|
from inspect_ai.util import SandboxEnvironmentType
|
38
|
-
from inspect_ai.util._display import
|
38
|
+
from inspect_ai.util._display import (
|
39
|
+
DisplayType,
|
40
|
+
display_type,
|
41
|
+
display_type_initialized,
|
42
|
+
init_display_type,
|
43
|
+
)
|
39
44
|
|
40
45
|
from .context import init_eval_context
|
41
46
|
from .loader import ResolvedTask, resolve_tasks
|
@@ -306,6 +311,10 @@ async def eval_async(
|
|
306
311
|
|
307
312
|
_eval_async_running = True
|
308
313
|
|
314
|
+
# if we are called outside of eval() then set display type to "plain"
|
315
|
+
if not display_type_initialized():
|
316
|
+
init_display_type("plain")
|
317
|
+
|
309
318
|
# resolve model and task args
|
310
319
|
model_args = resolve_args(model_args)
|
311
320
|
task_args = resolve_args(task_args)
|
inspect_ai/_eval/loader.py
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
import ast
|
2
2
|
import contextlib
|
3
|
+
import inspect
|
3
4
|
import os
|
4
5
|
from dataclasses import dataclass, field
|
5
6
|
from importlib.machinery import SourceFileLoader
|
@@ -9,11 +10,13 @@ from pathlib import Path
|
|
9
10
|
from types import ModuleType
|
10
11
|
from typing import Any, Callable, cast
|
11
12
|
|
13
|
+
from typing_extensions import overload
|
14
|
+
|
12
15
|
from inspect_ai._eval.task.util import task_file, task_run_dir
|
13
16
|
from inspect_ai._util.decorator import parse_decorators
|
14
17
|
from inspect_ai._util.error import PrerequisiteError
|
15
18
|
from inspect_ai._util.logger import warn_once
|
16
|
-
from inspect_ai._util.path import chdir_python
|
19
|
+
from inspect_ai._util.path import chdir_python, cwd_relative_path
|
17
20
|
from inspect_ai._util.registry import (
|
18
21
|
RegistryInfo,
|
19
22
|
is_registry_object,
|
@@ -23,6 +26,7 @@ from inspect_ai._util.registry import (
|
|
23
26
|
registry_params,
|
24
27
|
)
|
25
28
|
from inspect_ai.model import Model, ModelName
|
29
|
+
from inspect_ai.solver._bridge import bridge
|
26
30
|
from inspect_ai.solver._solver import Solver, SolverSpec
|
27
31
|
from inspect_ai.util import SandboxEnvironmentSpec, SandboxEnvironmentType
|
28
32
|
from inspect_ai.util._sandbox.environment import resolve_sandbox_environment
|
@@ -334,6 +338,16 @@ def split_spec(spec: str) -> tuple[str, str | None]:
|
|
334
338
|
return spec, None
|
335
339
|
|
336
340
|
|
341
|
+
@overload
|
342
|
+
def load_module(
|
343
|
+
module_path: Path, filter: Callable[[str], bool]
|
344
|
+
) -> ModuleType | None: ...
|
345
|
+
|
346
|
+
|
347
|
+
@overload
|
348
|
+
def load_module(module_path: Path, filter: None = None) -> ModuleType: ...
|
349
|
+
|
350
|
+
|
337
351
|
def load_module(
|
338
352
|
module_path: Path, filter: Callable[[str], bool] | None = None
|
339
353
|
) -> ModuleType | None:
|
@@ -425,28 +439,74 @@ def solver_from_spec(spec: SolverSpec) -> Solver:
|
|
425
439
|
else contextlib.nullcontext()
|
426
440
|
)
|
427
441
|
|
442
|
+
# pretty solver name for error messages
|
443
|
+
pretty_solver_file = (
|
444
|
+
cwd_relative_path(solver_file.as_posix()) if solver_file else None
|
445
|
+
)
|
446
|
+
|
428
447
|
with create_cm:
|
429
|
-
# if
|
430
|
-
if solver_file is
|
431
|
-
|
432
|
-
|
448
|
+
# if there is no solver file then just create from the registry by name
|
449
|
+
if solver_file is None:
|
450
|
+
if solver_name is None:
|
451
|
+
raise ValueError(f"Unable to resolve solver name from {spec.solver}")
|
452
|
+
return cast(Solver, registry_create("solver", solver_name, **spec.args))
|
433
453
|
|
434
|
-
|
454
|
+
# we do have a solver file
|
455
|
+
else:
|
456
|
+
# load the module and parse decorators
|
457
|
+
solver_module = load_module(solver_file)
|
458
|
+
decorators = parse_decorators(solver_file, "solver")
|
459
|
+
|
460
|
+
# if there is no solver_name see if we can discover it
|
435
461
|
if solver_name is None:
|
436
|
-
|
437
|
-
|
462
|
+
if len(decorators) == 1:
|
463
|
+
# decorator based solver
|
464
|
+
solver_name = decorators[0][0]
|
465
|
+
elif len(decorators) == 0:
|
466
|
+
# see if we can find an agent based solver
|
467
|
+
functions = [
|
468
|
+
function
|
469
|
+
for function in inspect.getmembers(
|
470
|
+
solver_module, inspect.isfunction
|
471
|
+
)
|
472
|
+
if function[1].__module__ == solver_module.__name__
|
473
|
+
]
|
474
|
+
agent_functions = [
|
475
|
+
function
|
476
|
+
for function in functions
|
477
|
+
if "agent" in function[0] and not function[0].startswith("_")
|
478
|
+
]
|
479
|
+
if len(agent_functions) == 1:
|
480
|
+
# agent based solver
|
481
|
+
solver_name = agent_functions[0][0]
|
482
|
+
|
483
|
+
elif len(agent_functions) == 0:
|
484
|
+
raise PrerequisiteError(
|
485
|
+
f"The source file {pretty_solver_file} does not contain any @solver functions or agent functions."
|
486
|
+
)
|
487
|
+
else:
|
488
|
+
raise PrerequisiteError(
|
489
|
+
f"The source file {pretty_solver_file} has more than one agent function (qualify which agent using e.g. '{solver_file.name}@agent_fn')"
|
490
|
+
)
|
491
|
+
else:
|
438
492
|
raise PrerequisiteError(
|
439
|
-
f"The source file {
|
493
|
+
f"The source file {pretty_solver_file} has more than one @solver function (qualify which solver using e.g. '{solver_file.name}y@solver_fn')"
|
440
494
|
)
|
441
|
-
if len(solvers) > 1:
|
442
|
-
raise PrerequisiteError(
|
443
|
-
f"The source file {solver_file.as_posix()} has more than one @solver function (qualify which solver using file.py@solver)"
|
444
|
-
)
|
445
|
-
solver_name = solvers[0][0]
|
446
495
|
|
447
|
-
|
448
|
-
|
449
|
-
|
496
|
+
# create decorator based solvers using the registry
|
497
|
+
if any(solver[0] == solver_name for solver in decorators):
|
498
|
+
return cast(Solver, registry_create("solver", solver_name, **spec.args))
|
450
499
|
|
451
|
-
|
452
|
-
|
500
|
+
# create agent based solvers by calling the function and wrapping it in bridge()
|
501
|
+
else:
|
502
|
+
agent_fn = getattr(solver_module, solver_name, None)
|
503
|
+
if inspect.isfunction(agent_fn):
|
504
|
+
return bridge(agent_fn(**spec.args))
|
505
|
+
elif agent_fn is not None:
|
506
|
+
raise PrerequisiteError(
|
507
|
+
f"The object {solver_name} in file {pretty_solver_file} is not a Python function."
|
508
|
+
)
|
509
|
+
else:
|
510
|
+
raise PrerequisiteError(
|
511
|
+
f"The function {solver_name} was not found in file {pretty_solver_file}."
|
512
|
+
)
|
inspect_ai/_eval/registry.py
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
import inspect
|
2
2
|
import logging
|
3
3
|
from copy import deepcopy
|
4
|
+
from functools import wraps
|
4
5
|
from pathlib import Path
|
5
6
|
from typing import Any, Callable, TypeVar, cast, overload
|
6
7
|
|
@@ -125,6 +126,7 @@ def task(*args: Any, name: str | None = None, **attribs: Any) -> Any:
|
|
125
126
|
params = list(inspect.signature(task_type).parameters.keys())
|
126
127
|
|
127
128
|
# Create and return the wrapper function
|
129
|
+
@wraps(task_type)
|
128
130
|
def wrapper(*w_args: Any, **w_kwargs: Any) -> Task:
|
129
131
|
# Create the task
|
130
132
|
task_instance = task_type(*w_args, **w_kwargs)
|
@@ -154,6 +156,10 @@ def task(*args: Any, name: str | None = None, **attribs: Any) -> Any:
|
|
154
156
|
# Return the task instance
|
155
157
|
return task_instance
|
156
158
|
|
159
|
+
# functools.wraps overrides the return type annotation of the inner function, so
|
160
|
+
# we explicitly set it again
|
161
|
+
wrapper.__annotations__["return"] = Task
|
162
|
+
|
157
163
|
# Register the task and return the wrapper
|
158
164
|
return task_register(
|
159
165
|
task=cast(TaskType, wrapper), name=task_name, attribs=attribs, params=params
|
inspect_ai/_eval/score.py
CHANGED
@@ -5,7 +5,7 @@ from typing import Callable, cast
|
|
5
5
|
from inspect_ai._display import display
|
6
6
|
from inspect_ai._util.path import chdir_python
|
7
7
|
from inspect_ai._util.platform import platform_init
|
8
|
-
from inspect_ai._util.registry import registry_create
|
8
|
+
from inspect_ai._util.registry import registry_create, registry_unqualified_name
|
9
9
|
from inspect_ai.log import (
|
10
10
|
EvalLog,
|
11
11
|
EvalMetric,
|
@@ -185,6 +185,7 @@ async def run_score_task(
|
|
185
185
|
results[scorer_name] = SampleScore(
|
186
186
|
score=result,
|
187
187
|
sample_id=state.sample_id,
|
188
|
+
scorer=registry_unqualified_name(scorer),
|
188
189
|
)
|
189
190
|
|
190
191
|
progress()
|
inspect_ai/_eval/task/results.py
CHANGED
@@ -65,11 +65,12 @@ def eval_results(
|
|
65
65
|
# extract scorers info from scorers then create scorers info for any
|
66
66
|
# scores not already accounted for by a scorer name
|
67
67
|
scorers_info = [ScorerInfo.from_scorer(scorer) for scorer in (scorers or [])]
|
68
|
-
scorer_names =
|
69
|
-
for
|
70
|
-
|
71
|
-
|
72
|
-
|
68
|
+
scorer_names = {info.name for info in scorers_info}
|
69
|
+
for sample_scores in scores:
|
70
|
+
for name, sample_score in sample_scores.items():
|
71
|
+
if sample_score.scorer is None and name not in scorer_names:
|
72
|
+
scorers_info.append(ScorerInfo.from_name(name))
|
73
|
+
scorer_names.add(name)
|
73
74
|
|
74
75
|
# record scorer
|
75
76
|
if len(scorers_info) > 0:
|
inspect_ai/_eval/task/run.py
CHANGED
@@ -27,8 +27,12 @@ from inspect_ai._util.constants import (
|
|
27
27
|
from inspect_ai._util.datetime import iso_now
|
28
28
|
from inspect_ai._util.error import exception_message
|
29
29
|
from inspect_ai._util.hooks import send_telemetry
|
30
|
-
from inspect_ai._util.registry import
|
31
|
-
|
30
|
+
from inspect_ai._util.registry import (
|
31
|
+
is_registry_object,
|
32
|
+
registry_log_name,
|
33
|
+
registry_unqualified_name,
|
34
|
+
)
|
35
|
+
from inspect_ai._util.timeouts import Timeout, timeout
|
32
36
|
from inspect_ai._view.notify import view_notify_eval
|
33
37
|
from inspect_ai.dataset import Dataset, Sample
|
34
38
|
from inspect_ai.log import (
|
@@ -652,20 +656,15 @@ async def task_run_sample(
|
|
652
656
|
except BaseException as ex:
|
653
657
|
error = handle_error(ex)
|
654
658
|
|
655
|
-
# set timeout for scoring. if the original timeout was
|
656
|
-
#
|
657
|
-
# timeout time. if the original timeout was hit we still want
|
658
|
-
# to provide an opportunity for scoring, but we don't necessarily
|
659
|
+
# set timeout for scoring. if the original timeout was hit we still
|
660
|
+
# want to provide opportunity for scoring, but we don't necessarily
|
659
661
|
# want to wait the full timeout again (especially in the case where
|
660
662
|
# the cause of the timeout is a hung container and scoring requires
|
661
663
|
# interacting with the container). as a middle ground we use half
|
662
664
|
# of the original timeout value for scoring.
|
663
665
|
if isinstance(timeout_cm, Timeout):
|
664
|
-
|
665
|
-
|
666
|
-
else:
|
667
|
-
assert time_limit
|
668
|
-
timeout_cm = timeout(time_limit / 2)
|
666
|
+
assert time_limit
|
667
|
+
timeout_cm = timeout(time_limit / 2)
|
669
668
|
|
670
669
|
# turn off sample limits
|
671
670
|
set_active_sample_token_limit(None)
|
@@ -690,6 +689,7 @@ async def task_run_sample(
|
|
690
689
|
sample_score = SampleScore(
|
691
690
|
score=score_result,
|
692
691
|
sample_id=sample.id,
|
692
|
+
scorer=registry_unqualified_name(scorer),
|
693
693
|
)
|
694
694
|
transcript()._event(
|
695
695
|
ScoreEvent(
|