inspect-ai 0.3.88__py3-none-any.whl → 0.3.90__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/_cli/eval.py +16 -0
- inspect_ai/_cli/score.py +1 -12
- inspect_ai/_cli/util.py +4 -2
- inspect_ai/_display/core/footer.py +2 -2
- inspect_ai/_display/plain/display.py +2 -2
- inspect_ai/_eval/context.py +7 -1
- inspect_ai/_eval/eval.py +51 -27
- inspect_ai/_eval/evalset.py +27 -10
- inspect_ai/_eval/loader.py +7 -8
- inspect_ai/_eval/run.py +23 -31
- inspect_ai/_eval/score.py +18 -1
- inspect_ai/_eval/task/log.py +5 -13
- inspect_ai/_eval/task/resolved.py +1 -0
- inspect_ai/_eval/task/run.py +231 -256
- inspect_ai/_eval/task/task.py +25 -2
- inspect_ai/_eval/task/util.py +1 -8
- inspect_ai/_util/constants.py +1 -0
- inspect_ai/_util/json.py +8 -3
- inspect_ai/_util/registry.py +30 -13
- inspect_ai/_view/www/App.css +5 -0
- inspect_ai/_view/www/dist/assets/index.css +71 -36
- inspect_ai/_view/www/dist/assets/index.js +573 -475
- inspect_ai/_view/www/log-schema.json +66 -0
- inspect_ai/_view/www/src/metadata/MetaDataView.module.css +1 -1
- inspect_ai/_view/www/src/metadata/MetaDataView.tsx +13 -8
- inspect_ai/_view/www/src/metadata/RenderedContent.tsx +3 -0
- inspect_ai/_view/www/src/plan/ModelCard.module.css +16 -0
- inspect_ai/_view/www/src/plan/ModelCard.tsx +93 -0
- inspect_ai/_view/www/src/samples/chat/ChatMessage.tsx +2 -2
- inspect_ai/_view/www/src/samples/chat/tools/ToolInput.module.css +2 -2
- inspect_ai/_view/www/src/samples/transcript/ModelEventView.tsx +5 -1
- inspect_ai/_view/www/src/samples/transcript/StepEventView.tsx +12 -6
- inspect_ai/_view/www/src/samples/transcript/TranscriptView.module.css +0 -2
- inspect_ai/_view/www/src/samples/transcript/state/StateEventView.tsx +6 -29
- inspect_ai/_view/www/src/types/log.d.ts +24 -6
- inspect_ai/_view/www/src/workspace/navbar/ModelRolesView.module.css +16 -0
- inspect_ai/_view/www/src/workspace/navbar/ModelRolesView.tsx +43 -0
- inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.module.css +1 -1
- inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.tsx +5 -0
- inspect_ai/_view/www/src/workspace/tabs/InfoTab.tsx +2 -0
- inspect_ai/agent/_agent.py +12 -0
- inspect_ai/agent/_as_tool.py +1 -1
- inspect_ai/agent/_bridge/bridge.py +9 -2
- inspect_ai/agent/_react.py +142 -74
- inspect_ai/agent/_run.py +13 -2
- inspect_ai/agent/_types.py +6 -0
- inspect_ai/approval/_apply.py +6 -7
- inspect_ai/approval/_approver.py +3 -3
- inspect_ai/approval/_auto.py +2 -2
- inspect_ai/approval/_call.py +20 -4
- inspect_ai/approval/_human/approver.py +3 -3
- inspect_ai/approval/_human/manager.py +2 -2
- inspect_ai/approval/_human/panel.py +3 -3
- inspect_ai/approval/_policy.py +3 -3
- inspect_ai/log/__init__.py +2 -0
- inspect_ai/log/_log.py +23 -2
- inspect_ai/log/_model.py +58 -0
- inspect_ai/log/_recorders/file.py +14 -3
- inspect_ai/log/_transcript.py +3 -0
- inspect_ai/model/__init__.py +2 -0
- inspect_ai/model/_call_tools.py +4 -1
- inspect_ai/model/_model.py +49 -3
- inspect_ai/model/_openai.py +151 -21
- inspect_ai/model/_providers/anthropic.py +20 -12
- inspect_ai/model/_providers/bedrock.py +3 -3
- inspect_ai/model/_providers/cloudflare.py +29 -108
- inspect_ai/model/_providers/google.py +21 -10
- inspect_ai/model/_providers/grok.py +23 -17
- inspect_ai/model/_providers/groq.py +61 -37
- inspect_ai/model/_providers/llama_cpp_python.py +8 -9
- inspect_ai/model/_providers/mistral.py +8 -3
- inspect_ai/model/_providers/ollama.py +8 -9
- inspect_ai/model/_providers/openai.py +53 -157
- inspect_ai/model/_providers/openai_compatible.py +195 -0
- inspect_ai/model/_providers/openrouter.py +4 -15
- inspect_ai/model/_providers/providers.py +11 -0
- inspect_ai/model/_providers/together.py +25 -23
- inspect_ai/model/_trim.py +83 -0
- inspect_ai/solver/_plan.py +5 -3
- inspect_ai/tool/_tool_def.py +8 -2
- inspect_ai/util/__init__.py +3 -0
- inspect_ai/util/_concurrency.py +15 -2
- {inspect_ai-0.3.88.dist-info → inspect_ai-0.3.90.dist-info}/METADATA +1 -1
- {inspect_ai-0.3.88.dist-info → inspect_ai-0.3.90.dist-info}/RECORD +88 -83
- {inspect_ai-0.3.88.dist-info → inspect_ai-0.3.90.dist-info}/WHEEL +1 -1
- inspect_ai/_eval/task/rundir.py +0 -78
- inspect_ai/_view/www/node_modules/flatted/python/flatted.py +0 -149
- {inspect_ai-0.3.88.dist-info → inspect_ai-0.3.90.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.88.dist-info → inspect_ai-0.3.90.dist-info}/licenses/LICENSE +0 -0
- {inspect_ai-0.3.88.dist-info → inspect_ai-0.3.90.dist-info}/top_level.txt +0 -0
inspect_ai/_eval/score.py
CHANGED
@@ -6,6 +6,7 @@ from typing import Any, Callable, Literal, cast
|
|
6
6
|
import anyio
|
7
7
|
|
8
8
|
from inspect_ai._display import display
|
9
|
+
from inspect_ai._eval.context import init_task_context
|
9
10
|
from inspect_ai._eval.loader import scorer_from_spec
|
10
11
|
from inspect_ai._util._async import configured_async_backend, run_coroutine, tg_collect
|
11
12
|
from inspect_ai._util.platform import platform_init, running_in_notebook
|
@@ -14,7 +15,9 @@ from inspect_ai.log import (
|
|
14
15
|
EvalLog,
|
15
16
|
)
|
16
17
|
from inspect_ai.log._log import EvalMetricDefinition
|
18
|
+
from inspect_ai.log._model import model_roles_config_to_model_roles
|
17
19
|
from inspect_ai.model import ModelName
|
20
|
+
from inspect_ai.model._model import get_model
|
18
21
|
from inspect_ai.scorer import Metric, Scorer, Target
|
19
22
|
from inspect_ai.scorer._metric import SampleScore
|
20
23
|
from inspect_ai.scorer._reducer import (
|
@@ -122,7 +125,7 @@ async def score_async(
|
|
122
125
|
scores: list[dict[str, SampleScore]] = await tg_collect(
|
123
126
|
[
|
124
127
|
functools.partial(
|
125
|
-
run_score_task, state, Target(sample.target), scorers, progress
|
128
|
+
run_score_task, log, state, Target(sample.target), scorers, progress
|
126
129
|
)
|
127
130
|
for (sample, state) in zip(log.samples, states)
|
128
131
|
]
|
@@ -218,11 +221,25 @@ async def task_score(
|
|
218
221
|
|
219
222
|
|
220
223
|
async def run_score_task(
|
224
|
+
log: EvalLog,
|
221
225
|
state: TaskState,
|
222
226
|
target: Target,
|
223
227
|
scorers: list[Scorer],
|
224
228
|
progress: Callable[..., None],
|
225
229
|
) -> dict[str, SampleScore]:
|
230
|
+
# get the model then initialize the async context
|
231
|
+
model = get_model(
|
232
|
+
model=log.eval.model,
|
233
|
+
config=log.plan.config.merge(log.eval.model_generate_config),
|
234
|
+
**log.eval.model_args,
|
235
|
+
)
|
236
|
+
|
237
|
+
# get the model roles
|
238
|
+
model_roles = model_roles_config_to_model_roles(log.eval.model_roles)
|
239
|
+
|
240
|
+
# initialize active model
|
241
|
+
init_task_context(model, model_roles)
|
242
|
+
|
226
243
|
results: dict[str, SampleScore] = {}
|
227
244
|
for scorer in scorers:
|
228
245
|
result = await scorer(state, target)
|
inspect_ai/_eval/task/log.py
CHANGED
@@ -1,6 +1,5 @@
|
|
1
1
|
from importlib import metadata as importlib_metadata
|
2
|
-
from inspect import isgenerator
|
3
|
-
from typing import Any, Iterator, Literal, cast
|
2
|
+
from typing import Any, Literal, cast
|
4
3
|
|
5
4
|
from shortuuid import uuid
|
6
5
|
|
@@ -34,6 +33,7 @@ from inspect_ai.log._log import (
|
|
34
33
|
EvalScorer,
|
35
34
|
eval_config_defaults,
|
36
35
|
)
|
36
|
+
from inspect_ai.log._model import model_args_for_log, model_roles_to_model_roles_config
|
37
37
|
from inspect_ai.log._recorders import Recorder
|
38
38
|
from inspect_ai.log._recorders.buffer import SampleBufferDatabase
|
39
39
|
from inspect_ai.log._recorders.types import SampleEvent, SampleSummary
|
@@ -63,6 +63,7 @@ class TaskLogger:
|
|
63
63
|
solver: SolverSpec | None,
|
64
64
|
tags: list[str] | None,
|
65
65
|
model: Model,
|
66
|
+
model_roles: dict[str, Model] | None,
|
66
67
|
dataset: Dataset,
|
67
68
|
scorer: list[ScorerSpec] | None,
|
68
69
|
metrics: list[MetricSpec] | dict[str, list[MetricSpec]] | None,
|
@@ -84,17 +85,7 @@ class TaskLogger:
|
|
84
85
|
packages = {PKG_NAME: importlib_metadata.version(PKG_NAME)}
|
85
86
|
|
86
87
|
# redact authentication oriented model_args
|
87
|
-
model_args = model_args
|
88
|
-
if "api_key" in model_args:
|
89
|
-
del model_args["api_key"]
|
90
|
-
model_args = {k: v for k, v in model_args.items() if not k.startswith("aws_")}
|
91
|
-
|
92
|
-
# don't try to serialise generators
|
93
|
-
model_args = {
|
94
|
-
k: v
|
95
|
-
for k, v in model_args.items()
|
96
|
-
if not isgenerator(v) and not isinstance(v, Iterator)
|
97
|
-
}
|
88
|
+
model_args = model_args_for_log(model_args)
|
98
89
|
|
99
90
|
# cwd_relative_path for sandbox config
|
100
91
|
if sandbox and isinstance(sandbox.config, str):
|
@@ -141,6 +132,7 @@ class TaskLogger:
|
|
141
132
|
model=str(ModelName(model)),
|
142
133
|
model_generate_config=model.config,
|
143
134
|
model_base_url=model.api.base_url,
|
135
|
+
model_roles=model_roles_to_model_roles_config(model_roles),
|
144
136
|
dataset=EvalDataset(
|
145
137
|
name=dataset.name,
|
146
138
|
location=cwd_relative_path(dataset.location),
|