inspect-ai 0.3.88__py3-none-any.whl → 0.3.90__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90)
  1. inspect_ai/_cli/eval.py +16 -0
  2. inspect_ai/_cli/score.py +1 -12
  3. inspect_ai/_cli/util.py +4 -2
  4. inspect_ai/_display/core/footer.py +2 -2
  5. inspect_ai/_display/plain/display.py +2 -2
  6. inspect_ai/_eval/context.py +7 -1
  7. inspect_ai/_eval/eval.py +51 -27
  8. inspect_ai/_eval/evalset.py +27 -10
  9. inspect_ai/_eval/loader.py +7 -8
  10. inspect_ai/_eval/run.py +23 -31
  11. inspect_ai/_eval/score.py +18 -1
  12. inspect_ai/_eval/task/log.py +5 -13
  13. inspect_ai/_eval/task/resolved.py +1 -0
  14. inspect_ai/_eval/task/run.py +231 -256
  15. inspect_ai/_eval/task/task.py +25 -2
  16. inspect_ai/_eval/task/util.py +1 -8
  17. inspect_ai/_util/constants.py +1 -0
  18. inspect_ai/_util/json.py +8 -3
  19. inspect_ai/_util/registry.py +30 -13
  20. inspect_ai/_view/www/App.css +5 -0
  21. inspect_ai/_view/www/dist/assets/index.css +71 -36
  22. inspect_ai/_view/www/dist/assets/index.js +573 -475
  23. inspect_ai/_view/www/log-schema.json +66 -0
  24. inspect_ai/_view/www/src/metadata/MetaDataView.module.css +1 -1
  25. inspect_ai/_view/www/src/metadata/MetaDataView.tsx +13 -8
  26. inspect_ai/_view/www/src/metadata/RenderedContent.tsx +3 -0
  27. inspect_ai/_view/www/src/plan/ModelCard.module.css +16 -0
  28. inspect_ai/_view/www/src/plan/ModelCard.tsx +93 -0
  29. inspect_ai/_view/www/src/samples/chat/ChatMessage.tsx +2 -2
  30. inspect_ai/_view/www/src/samples/chat/tools/ToolInput.module.css +2 -2
  31. inspect_ai/_view/www/src/samples/transcript/ModelEventView.tsx +5 -1
  32. inspect_ai/_view/www/src/samples/transcript/StepEventView.tsx +12 -6
  33. inspect_ai/_view/www/src/samples/transcript/TranscriptView.module.css +0 -2
  34. inspect_ai/_view/www/src/samples/transcript/state/StateEventView.tsx +6 -29
  35. inspect_ai/_view/www/src/types/log.d.ts +24 -6
  36. inspect_ai/_view/www/src/workspace/navbar/ModelRolesView.module.css +16 -0
  37. inspect_ai/_view/www/src/workspace/navbar/ModelRolesView.tsx +43 -0
  38. inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.module.css +1 -1
  39. inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.tsx +5 -0
  40. inspect_ai/_view/www/src/workspace/tabs/InfoTab.tsx +2 -0
  41. inspect_ai/agent/_agent.py +12 -0
  42. inspect_ai/agent/_as_tool.py +1 -1
  43. inspect_ai/agent/_bridge/bridge.py +9 -2
  44. inspect_ai/agent/_react.py +142 -74
  45. inspect_ai/agent/_run.py +13 -2
  46. inspect_ai/agent/_types.py +6 -0
  47. inspect_ai/approval/_apply.py +6 -7
  48. inspect_ai/approval/_approver.py +3 -3
  49. inspect_ai/approval/_auto.py +2 -2
  50. inspect_ai/approval/_call.py +20 -4
  51. inspect_ai/approval/_human/approver.py +3 -3
  52. inspect_ai/approval/_human/manager.py +2 -2
  53. inspect_ai/approval/_human/panel.py +3 -3
  54. inspect_ai/approval/_policy.py +3 -3
  55. inspect_ai/log/__init__.py +2 -0
  56. inspect_ai/log/_log.py +23 -2
  57. inspect_ai/log/_model.py +58 -0
  58. inspect_ai/log/_recorders/file.py +14 -3
  59. inspect_ai/log/_transcript.py +3 -0
  60. inspect_ai/model/__init__.py +2 -0
  61. inspect_ai/model/_call_tools.py +4 -1
  62. inspect_ai/model/_model.py +49 -3
  63. inspect_ai/model/_openai.py +151 -21
  64. inspect_ai/model/_providers/anthropic.py +20 -12
  65. inspect_ai/model/_providers/bedrock.py +3 -3
  66. inspect_ai/model/_providers/cloudflare.py +29 -108
  67. inspect_ai/model/_providers/google.py +21 -10
  68. inspect_ai/model/_providers/grok.py +23 -17
  69. inspect_ai/model/_providers/groq.py +61 -37
  70. inspect_ai/model/_providers/llama_cpp_python.py +8 -9
  71. inspect_ai/model/_providers/mistral.py +8 -3
  72. inspect_ai/model/_providers/ollama.py +8 -9
  73. inspect_ai/model/_providers/openai.py +53 -157
  74. inspect_ai/model/_providers/openai_compatible.py +195 -0
  75. inspect_ai/model/_providers/openrouter.py +4 -15
  76. inspect_ai/model/_providers/providers.py +11 -0
  77. inspect_ai/model/_providers/together.py +25 -23
  78. inspect_ai/model/_trim.py +83 -0
  79. inspect_ai/solver/_plan.py +5 -3
  80. inspect_ai/tool/_tool_def.py +8 -2
  81. inspect_ai/util/__init__.py +3 -0
  82. inspect_ai/util/_concurrency.py +15 -2
  83. {inspect_ai-0.3.88.dist-info → inspect_ai-0.3.90.dist-info}/METADATA +1 -1
  84. {inspect_ai-0.3.88.dist-info → inspect_ai-0.3.90.dist-info}/RECORD +88 -83
  85. {inspect_ai-0.3.88.dist-info → inspect_ai-0.3.90.dist-info}/WHEEL +1 -1
  86. inspect_ai/_eval/task/rundir.py +0 -78
  87. inspect_ai/_view/www/node_modules/flatted/python/flatted.py +0 -149
  88. {inspect_ai-0.3.88.dist-info → inspect_ai-0.3.90.dist-info}/entry_points.txt +0 -0
  89. {inspect_ai-0.3.88.dist-info → inspect_ai-0.3.90.dist-info}/licenses/LICENSE +0 -0
  90. {inspect_ai-0.3.88.dist-info → inspect_ai-0.3.90.dist-info}/top_level.txt +0 -0
inspect_ai/_eval/score.py CHANGED
@@ -6,6 +6,7 @@ from typing import Any, Callable, Literal, cast
6
6
  import anyio
7
7
 
8
8
  from inspect_ai._display import display
9
+ from inspect_ai._eval.context import init_task_context
9
10
  from inspect_ai._eval.loader import scorer_from_spec
10
11
  from inspect_ai._util._async import configured_async_backend, run_coroutine, tg_collect
11
12
  from inspect_ai._util.platform import platform_init, running_in_notebook
@@ -14,7 +15,9 @@ from inspect_ai.log import (
14
15
  EvalLog,
15
16
  )
16
17
  from inspect_ai.log._log import EvalMetricDefinition
18
+ from inspect_ai.log._model import model_roles_config_to_model_roles
17
19
  from inspect_ai.model import ModelName
20
+ from inspect_ai.model._model import get_model
18
21
  from inspect_ai.scorer import Metric, Scorer, Target
19
22
  from inspect_ai.scorer._metric import SampleScore
20
23
  from inspect_ai.scorer._reducer import (
@@ -122,7 +125,7 @@ async def score_async(
122
125
  scores: list[dict[str, SampleScore]] = await tg_collect(
123
126
  [
124
127
  functools.partial(
125
- run_score_task, state, Target(sample.target), scorers, progress
128
+ run_score_task, log, state, Target(sample.target), scorers, progress
126
129
  )
127
130
  for (sample, state) in zip(log.samples, states)
128
131
  ]
@@ -218,11 +221,25 @@ async def task_score(
218
221
 
219
222
 
220
223
  async def run_score_task(
224
+ log: EvalLog,
221
225
  state: TaskState,
222
226
  target: Target,
223
227
  scorers: list[Scorer],
224
228
  progress: Callable[..., None],
225
229
  ) -> dict[str, SampleScore]:
230
+ # get the model then initialize the async context
231
+ model = get_model(
232
+ model=log.eval.model,
233
+ config=log.plan.config.merge(log.eval.model_generate_config),
234
+ **log.eval.model_args,
235
+ )
236
+
237
+ # get the model roles
238
+ model_roles = model_roles_config_to_model_roles(log.eval.model_roles)
239
+
240
+ # initialize active model
241
+ init_task_context(model, model_roles)
242
+
226
243
  results: dict[str, SampleScore] = {}
227
244
  for scorer in scorers:
228
245
  result = await scorer(state, target)
inspect_ai/_eval/task/log.py CHANGED
@@ -1,6 +1,5 @@
1
1
  from importlib import metadata as importlib_metadata
2
- from inspect import isgenerator
3
- from typing import Any, Iterator, Literal, cast
2
+ from typing import Any, Literal, cast
4
3
 
5
4
  from shortuuid import uuid
6
5
 
@@ -34,6 +33,7 @@ from inspect_ai.log._log import (
34
33
  EvalScorer,
35
34
  eval_config_defaults,
36
35
  )
36
+ from inspect_ai.log._model import model_args_for_log, model_roles_to_model_roles_config
37
37
  from inspect_ai.log._recorders import Recorder
38
38
  from inspect_ai.log._recorders.buffer import SampleBufferDatabase
39
39
  from inspect_ai.log._recorders.types import SampleEvent, SampleSummary
@@ -63,6 +63,7 @@ class TaskLogger:
63
63
  solver: SolverSpec | None,
64
64
  tags: list[str] | None,
65
65
  model: Model,
66
+ model_roles: dict[str, Model] | None,
66
67
  dataset: Dataset,
67
68
  scorer: list[ScorerSpec] | None,
68
69
  metrics: list[MetricSpec] | dict[str, list[MetricSpec]] | None,
@@ -84,17 +85,7 @@ class TaskLogger:
84
85
  packages = {PKG_NAME: importlib_metadata.version(PKG_NAME)}
85
86
 
86
87
  # redact authentication oriented model_args
87
- model_args = model_args.copy()
88
- if "api_key" in model_args:
89
- del model_args["api_key"]
90
- model_args = {k: v for k, v in model_args.items() if not k.startswith("aws_")}
91
-
92
- # don't try to serialise generators
93
- model_args = {
94
- k: v
95
- for k, v in model_args.items()
96
- if not isgenerator(v) and not isinstance(v, Iterator)
97
- }
88
+ model_args = model_args_for_log(model_args)
98
89
 
99
90
  # cwd_relative_path for sandbox config
100
91
  if sandbox and isinstance(sandbox.config, str):
@@ -141,6 +132,7 @@ class TaskLogger:
141
132
  model=str(ModelName(model)),
142
133
  model_generate_config=model.config,
143
134
  model_base_url=model.api.base_url,
135
+ model_roles=model_roles_to_model_roles_config(model_roles),
144
136
  dataset=EvalDataset(
145
137
  name=dataset.name,
146
138
  location=cwd_relative_path(dataset.location),
inspect_ai/_eval/task/resolved.py CHANGED
@@ -13,6 +13,7 @@ class ResolvedTask:
13
13
  task_args: dict[str, Any]
14
14
  task_file: str | None
15
15
  model: Model
16
+ model_roles: dict[str, Model] | None
16
17
  sandbox: SandboxEnvironmentSpec | None
17
18
  sequence: int
18
19
  id: str | None = field(default=None)