inspect-ai 0.3.59__py3-none-any.whl → 0.3.61__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88)
  1. inspect_ai/_cli/eval.py +0 -8
  2. inspect_ai/_display/textual/widgets/samples.py +1 -1
  3. inspect_ai/_eval/eval.py +10 -1
  4. inspect_ai/_eval/loader.py +79 -19
  5. inspect_ai/_eval/registry.py +6 -0
  6. inspect_ai/_eval/score.py +2 -1
  7. inspect_ai/_eval/task/generate.py +41 -35
  8. inspect_ai/_eval/task/results.py +6 -5
  9. inspect_ai/_eval/task/run.py +21 -15
  10. inspect_ai/_util/hooks.py +17 -7
  11. inspect_ai/_view/www/dist/assets/index.js +262 -303
  12. inspect_ai/_view/www/package.json +1 -1
  13. inspect_ai/_view/www/src/App.mjs +6 -6
  14. inspect_ai/_view/www/src/Types.mjs +1 -1
  15. inspect_ai/_view/www/src/api/Types.ts +133 -0
  16. inspect_ai/_view/www/src/api/{api-browser.mjs → api-browser.ts} +25 -13
  17. inspect_ai/_view/www/src/api/api-http.ts +219 -0
  18. inspect_ai/_view/www/src/api/api-shared.ts +47 -0
  19. inspect_ai/_view/www/src/api/{api-vscode.mjs → api-vscode.ts} +22 -19
  20. inspect_ai/_view/www/src/api/{client-api.mjs → client-api.ts} +93 -53
  21. inspect_ai/_view/www/src/api/index.ts +51 -0
  22. inspect_ai/_view/www/src/api/jsonrpc.ts +225 -0
  23. inspect_ai/_view/www/src/components/DownloadButton.mjs +1 -1
  24. inspect_ai/_view/www/src/index.js +2 -2
  25. inspect_ai/_view/www/src/log/{remoteLogFile.mjs → remoteLogFile.ts} +62 -46
  26. inspect_ai/_view/www/src/navbar/Navbar.mjs +1 -1
  27. inspect_ai/_view/www/src/navbar/SecondaryBar.mjs +1 -1
  28. inspect_ai/_view/www/src/samples/SampleList.mjs +1 -1
  29. inspect_ai/_view/www/src/samples/SampleScores.mjs +1 -1
  30. inspect_ai/_view/www/src/samples/SamplesDescriptor.mjs +14 -14
  31. inspect_ai/_view/www/src/samples/SamplesTab.mjs +10 -10
  32. inspect_ai/_view/www/src/samples/tools/SortFilter.mjs +2 -2
  33. inspect_ai/_view/www/src/utils/{Json.mjs → json-worker.ts} +1 -3
  34. inspect_ai/_view/www/src/utils/vscode.ts +36 -0
  35. inspect_ai/_view/www/src/workspace/WorkSpace.mjs +1 -1
  36. inspect_ai/approval/_human/manager.py +1 -1
  37. inspect_ai/model/_call_tools.py +55 -0
  38. inspect_ai/model/_chat_message.py +2 -2
  39. inspect_ai/model/_conversation.py +1 -4
  40. inspect_ai/model/_generate_config.py +2 -8
  41. inspect_ai/model/_model.py +90 -25
  42. inspect_ai/model/_model_output.py +15 -0
  43. inspect_ai/model/_openai.py +383 -0
  44. inspect_ai/model/_providers/anthropic.py +52 -14
  45. inspect_ai/model/_providers/azureai.py +1 -1
  46. inspect_ai/model/_providers/goodfire.py +248 -0
  47. inspect_ai/model/_providers/groq.py +7 -3
  48. inspect_ai/model/_providers/hf.py +6 -0
  49. inspect_ai/model/_providers/mistral.py +2 -1
  50. inspect_ai/model/_providers/openai.py +36 -202
  51. inspect_ai/model/_providers/openai_o1.py +2 -4
  52. inspect_ai/model/_providers/providers.py +22 -0
  53. inspect_ai/model/_providers/together.py +4 -4
  54. inspect_ai/model/_providers/util/__init__.py +2 -3
  55. inspect_ai/model/_providers/util/hf_handler.py +1 -1
  56. inspect_ai/model/_providers/util/llama31.py +1 -1
  57. inspect_ai/model/_providers/util/util.py +0 -76
  58. inspect_ai/scorer/_metric.py +3 -0
  59. inspect_ai/scorer/_scorer.py +2 -1
  60. inspect_ai/solver/__init__.py +4 -0
  61. inspect_ai/solver/_basic_agent.py +65 -55
  62. inspect_ai/solver/_bridge/__init__.py +3 -0
  63. inspect_ai/solver/_bridge/bridge.py +100 -0
  64. inspect_ai/solver/_bridge/patch.py +170 -0
  65. inspect_ai/{util → solver}/_limit.py +13 -0
  66. inspect_ai/solver/_solver.py +6 -0
  67. inspect_ai/solver/_task_state.py +37 -7
  68. inspect_ai/tool/_tools/_web_browser/_web_browser.py +3 -1
  69. inspect_ai/tool/beta/_computer/_resources/Dockerfile +1 -3
  70. inspect_ai/tool/beta/_computer/_resources/entrypoint/x11vnc_startup.sh +1 -1
  71. inspect_ai/tool/beta/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-screensaver.xml +10 -0
  72. inspect_ai/util/__init__.py +0 -2
  73. inspect_ai/util/_display.py +5 -0
  74. inspect_ai/util/_sandbox/docker/prereqs.py +1 -1
  75. inspect_ai/util/_sandbox/self_check.py +51 -28
  76. {inspect_ai-0.3.59.dist-info → inspect_ai-0.3.61.dist-info}/METADATA +3 -2
  77. {inspect_ai-0.3.59.dist-info → inspect_ai-0.3.61.dist-info}/RECORD +81 -76
  78. inspect_ai/_view/www/src/api/Types.mjs +0 -117
  79. inspect_ai/_view/www/src/api/api-http.mjs +0 -300
  80. inspect_ai/_view/www/src/api/api-shared.mjs +0 -10
  81. inspect_ai/_view/www/src/api/index.mjs +0 -49
  82. inspect_ai/_view/www/src/api/jsonrpc.mjs +0 -208
  83. inspect_ai/_view/www/src/utils/vscode.mjs +0 -16
  84. inspect_ai/tool/beta/_computer/_resources/image_home_dir/Desktop/XPaint.desktop +0 -10
  85. {inspect_ai-0.3.59.dist-info → inspect_ai-0.3.61.dist-info}/LICENSE +0 -0
  86. {inspect_ai-0.3.59.dist-info → inspect_ai-0.3.61.dist-info}/WHEEL +0 -0
  87. {inspect_ai-0.3.59.dist-info → inspect_ai-0.3.61.dist-info}/entry_points.txt +0 -0
  88. {inspect_ai-0.3.59.dist-info → inspect_ai-0.3.61.dist-info}/top_level.txt +0 -0
inspect_ai/_cli/eval.py CHANGED
@@ -314,12 +314,6 @@ def eval_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
     help="Sequences where the API will stop generating further tokens. The returned text will not contain the stop sequence.",
     envvar="INSPECT_EVAL_STOP_SEQS",
 )
-@click.option(
-    "--suffix",
-    type=str,
-    help="The suffix that comes after a completion of inserted text. OpenAI only.",
-    envvar="INSPECT_EVAL_SUFFIX",
-)
 @click.option(
     "--temperature",
     type=float,
@@ -439,7 +433,6 @@ def eval_command(
     logit_bias: str | None,
     seed: int | None,
     stop_seqs: str | None,
-    suffix: str | None,
     temperature: float | None,
     top_p: float | None,
     top_k: int | None,
@@ -599,7 +592,6 @@ def eval_set_command(
     logit_bias: str | None,
     seed: int | None,
    stop_seqs: str | None,
-    suffix: str | None,
     temperature: float | None,
     top_p: float | None,
     top_k: int | None,
inspect_ai/_display/textual/widgets/samples.py CHANGED
@@ -413,7 +413,7 @@ class SampleToolbar(Horizontal):
        grid-columns: auto auto 1fr auto auto;
    }}
    SampleToolbar #{STATUS_GROUP} {{
-       min-width: 20;
+       width: 22;
    }}
    SampleToolbar Button {{
        margin-bottom: 1;
inspect_ai/_eval/eval.py CHANGED
@@ -35,7 +35,12 @@ from inspect_ai.scorer._reducer import reducer_log_names
 from inspect_ai.solver._chain import chain
 from inspect_ai.solver._solver import Solver, SolverSpec
 from inspect_ai.util import SandboxEnvironmentType
-from inspect_ai.util._display import DisplayType, display_type, init_display_type
+from inspect_ai.util._display import (
+    DisplayType,
+    display_type,
+    display_type_initialized,
+    init_display_type,
+)
 
 from .context import init_eval_context
 from .loader import ResolvedTask, resolve_tasks
@@ -306,6 +311,10 @@ async def eval_async(
 
     _eval_async_running = True
 
+    # if we are called outside of eval() then set display type to "plain"
+    if not display_type_initialized():
+        init_display_type("plain")
+
     # resolve model and task args
     model_args = resolve_args(model_args)
     task_args = resolve_args(task_args)
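
Note: the new display_type_initialized() guard means that programmatic callers of eval_async() fall back to "plain" display output unless a display was already initialized (e.g. by eval() or the CLI). A minimal sketch of such a call, assuming a registered task named "my_task" and a placeholder model string (both hypothetical, for illustration only):

import asyncio

from inspect_ai import eval_async

# Called outside of eval()/the CLI, so display defaults to "plain".
# "my_task" and the model name are placeholders, not from the diff.
logs = asyncio.run(eval_async("my_task", model="openai/gpt-4o-mini"))
print(logs[0].status)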
inspect_ai/_eval/loader.py CHANGED
@@ -1,5 +1,6 @@
 import ast
 import contextlib
+import inspect
 import os
 from dataclasses import dataclass, field
 from importlib.machinery import SourceFileLoader
@@ -9,11 +10,13 @@ from pathlib import Path
 from types import ModuleType
 from typing import Any, Callable, cast
 
+from typing_extensions import overload
+
 from inspect_ai._eval.task.util import task_file, task_run_dir
 from inspect_ai._util.decorator import parse_decorators
 from inspect_ai._util.error import PrerequisiteError
 from inspect_ai._util.logger import warn_once
-from inspect_ai._util.path import chdir_python
+from inspect_ai._util.path import chdir_python, cwd_relative_path
 from inspect_ai._util.registry import (
     RegistryInfo,
     is_registry_object,
@@ -23,6 +26,7 @@ from inspect_ai._util.registry import (
     registry_params,
 )
 from inspect_ai.model import Model, ModelName
+from inspect_ai.solver._bridge import bridge
 from inspect_ai.solver._solver import Solver, SolverSpec
 from inspect_ai.util import SandboxEnvironmentSpec, SandboxEnvironmentType
 from inspect_ai.util._sandbox.environment import resolve_sandbox_environment
@@ -334,6 +338,16 @@ def split_spec(spec: str) -> tuple[str, str | None]:
     return spec, None
 
 
+@overload
+def load_module(
+    module_path: Path, filter: Callable[[str], bool]
+) -> ModuleType | None: ...
+
+
+@overload
+def load_module(module_path: Path, filter: None = None) -> ModuleType: ...
+
+
 def load_module(
     module_path: Path, filter: Callable[[str], bool] | None = None
 ) -> ModuleType | None:
@@ -425,28 +439,74 @@ def solver_from_spec(spec: SolverSpec) -> Solver:
         else contextlib.nullcontext()
     )
 
+    # pretty solver name for error messages
+    pretty_solver_file = (
+        cwd_relative_path(solver_file.as_posix()) if solver_file else None
+    )
+
     with create_cm:
-        # if we have a file then we need to load it and (if required) determine the solver name
-        if solver_file is not None:
-            # load the module so that registry_create works
-            load_module(solver_file)
+        # if there is no solver file then just create from the registry by name
+        if solver_file is None:
+            if solver_name is None:
+                raise ValueError(f"Unable to resolve solver name from {spec.solver}")
+            return cast(Solver, registry_create("solver", solver_name, **spec.args))
 
-            # if there is no solver_name we need to discover the first @solver
+        # we do have a solver file
+        else:
+            # load the module and parse decorators
+            solver_module = load_module(solver_file)
+            decorators = parse_decorators(solver_file, "solver")
+
+            # if there is no solver_name see if we can discover it
             if solver_name is None:
-                solvers = parse_decorators(solver_file, "solver")
-                if len(solvers) == 0:
+                if len(decorators) == 1:
+                    # decorator based solver
+                    solver_name = decorators[0][0]
+                elif len(decorators) == 0:
+                    # see if we can find an agent based solver
+                    functions = [
+                        function
+                        for function in inspect.getmembers(
+                            solver_module, inspect.isfunction
+                        )
+                        if function[1].__module__ == solver_module.__name__
+                    ]
+                    agent_functions = [
+                        function
+                        for function in functions
+                        if "agent" in function[0] and not function[0].startswith("_")
+                    ]
+                    if len(agent_functions) == 1:
+                        # agent based solver
+                        solver_name = agent_functions[0][0]
+
+                    elif len(agent_functions) == 0:
+                        raise PrerequisiteError(
+                            f"The source file {pretty_solver_file} does not contain any @solver functions or agent functions."
+                        )
+                    else:
+                        raise PrerequisiteError(
+                            f"The source file {pretty_solver_file} has more than one agent function (qualify which agent using e.g. '{solver_file.name}@agent_fn')"
+                        )
+                else:
                     raise PrerequisiteError(
-                        f"The source file {solver_file.as_posix()} does not contain any @solver functions."
+                        f"The source file {pretty_solver_file} has more than one @solver function (qualify which solver using e.g. '{solver_file.name}@solver_fn')"
                    )
-                if len(solvers) > 1:
-                    raise PrerequisiteError(
-                        f"The source file {solver_file.as_posix()} has more than one @solver function (qualify which solver using file.py@solver)"
-                    )
-                solver_name = solvers[0][0]
 
-        # make mypy happy and catch unexpected branching
-        if solver_name is None:
-            raise ValueError(f"Unable to resolve solver name from {spec.solver}")
+            # create decorator based solvers using the registry
+            if any(solver[0] == solver_name for solver in decorators):
+                return cast(Solver, registry_create("solver", solver_name, **spec.args))
 
-        solver = cast(Solver, registry_create("solver", solver_name, **spec.args))
-        return solver
+            # create agent based solvers by calling the function and wrapping it in bridge()
+            else:
+                agent_fn = getattr(solver_module, solver_name, None)
+                if inspect.isfunction(agent_fn):
+                    return bridge(agent_fn(**spec.args))
+                elif agent_fn is not None:
+                    raise PrerequisiteError(
+                        f"The object {solver_name} in file {pretty_solver_file} is not a Python function."
+                    )
+                else:
+                    raise PrerequisiteError(
+                        f"The function {solver_name} was not found in file {pretty_solver_file}."
+                    )
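
Note: to illustrate the new discovery path above — when a solver file contains no @solver decorators, solver_from_spec() looks for a single function whose name contains "agent", calls it with spec.args, and wraps the result in bridge(). A hedged sketch of such a file (the dict-in/dict-out agent protocol shown here, with an "input" message list and an "output" string, is an assumption about what bridge() expects, and the file/function names are hypothetical):

# my_agent.py -- resolved via e.g. --solver my_agent.py
from typing import Any


def web_search_agent(temperature: float = 0.0):
    """Single agent function: name contains "agent" and is not prefixed with "_"."""

    async def run(sample: dict[str, Any]) -> dict[str, Any]:
        # assumed protocol: sample["input"] is a list of chat messages and the
        # return value carries the completion under "output"
        question = sample["input"][-1]["content"]
        return {"output": f"(t={temperature}) answering: {question}"}

    return run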
inspect_ai/_eval/registry.py CHANGED
@@ -1,6 +1,7 @@
 import inspect
 import logging
 from copy import deepcopy
+from functools import wraps
 from pathlib import Path
 from typing import Any, Callable, TypeVar, cast, overload
 
@@ -125,6 +126,7 @@ def task(*args: Any, name: str | None = None, **attribs: Any) -> Any:
     params = list(inspect.signature(task_type).parameters.keys())
 
     # Create and return the wrapper function
+    @wraps(task_type)
     def wrapper(*w_args: Any, **w_kwargs: Any) -> Task:
         # Create the task
         task_instance = task_type(*w_args, **w_kwargs)
@@ -154,6 +156,10 @@ def task(*args: Any, name: str | None = None, **attribs: Any) -> Any:
         # Return the task instance
         return task_instance
 
+    # functools.wraps overrides the return type annotation of the inner function, so
+    # we explicitly set it again
+    wrapper.__annotations__["return"] = Task
+
     # Register the task and return the wrapper
     return task_register(
         task=cast(TaskType, wrapper), name=task_name, attribs=attribs, params=params
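
Note: the explicit wrapper.__annotations__["return"] = Task assignment works around standard functools behavior — wraps() copies the wrapped function's __annotations__ onto the wrapper, clobbering the wrapper's own "-> Task" annotation. A small standalone illustration (names here are hypothetical, not from the package):

from functools import wraps


def original() -> int:
    return 1


@wraps(original)
def wrapper() -> str:
    return "1"


# wraps() assigned original's annotations, so the wrapper now reports -> int
print(wrapper.__annotations__)  # {'return': <class 'int'>}

# resetting it explicitly, as the registry code does for Task
# (note: wraps() assigns the same dict object, so this also touches original's
# annotations -- harmless in the registry case, where both return Task)
wrapper.__annotations__["return"] = str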
inspect_ai/_eval/score.py CHANGED
@@ -5,7 +5,7 @@ from typing import Callable, cast
 from inspect_ai._display import display
 from inspect_ai._util.path import chdir_python
 from inspect_ai._util.platform import platform_init
-from inspect_ai._util.registry import registry_create
+from inspect_ai._util.registry import registry_create, registry_unqualified_name
 from inspect_ai.log import (
     EvalLog,
     EvalMetric,
@@ -185,6 +185,7 @@ async def run_score_task(
         results[scorer_name] = SampleScore(
             score=result,
             sample_id=state.sample_id,
+            scorer=registry_unqualified_name(scorer),
         )
 
         progress()
inspect_ai/_eval/task/generate.py CHANGED
@@ -8,6 +8,7 @@ from inspect_ai.model import (
 )
 from inspect_ai.model._cache import epoch
 from inspect_ai.solver import TaskState
+from inspect_ai.solver._limit import SampleLimitExceededError
 from inspect_ai.tool import ToolFunction
 
 
@@ -21,45 +22,50 @@ async def task_generate(
     # track tool_choice (revert to "auto" after first forced call of a tool)
     tool_choice = state.tool_choice
 
-    while True:
-        # If we don't update the epoch here as we go, it's entirely possible
-        # we'd cache the same response for every single epoch, which would
-        # completely defeat the point!
-        epoch.set(state.epoch)
+    try:
+        while True:
+            # If we don't update the epoch here as we go, it's entirely possible
+            # we'd cache the same response for every single epoch, which would
+            # completely defeat the point!
+            epoch.set(state.epoch)
 
-        # call the model
-        state.output = await model.generate(
-            input=state.messages,
-            tools=state.tools,
-            tool_choice=tool_choice,
-            config=config,
-            cache=cache,
-        )
+            # call the model
+            state.output = await model.generate(
+                input=state.messages,
+                tools=state.tools,
+                tool_choice=tool_choice,
+                config=config,
+                cache=cache,
+            )
 
-        # append the assistant message
-        message = state.output.message
-        state.messages.append(message)
+            # append the assistant message
+            message = state.output.message
+            state.messages.append(message)
 
-        # check for completed
-        if state.completed:
-            return state
+            # check for completed
+            if state.completed:
+                return state
 
-        # resolve tool calls if necessary
-        if tool_calls != "none" and message.tool_calls:
-            # call tools and append messages to state
-            state.messages.extend(
-                await call_tools(message, state.tools, config.max_tool_output)
-            )
+            # resolve tool calls if necessary
+            if tool_calls != "none" and message.tool_calls:
+                # call tools and append messages to state
+                state.messages.extend(
+                    await call_tools(message, state.tools, config.max_tool_output)
+                )
 
-            # check for completed or only executing a single tool call
-            if state.completed or tool_calls == "single":
-                return state
+                # check for completed or only executing a single tool call
+                if state.completed or tool_calls == "single":
+                    return state
+
+                # if a tool_call was forced set tool_choice to 'auto'
+                # (otherwise it will get forced over and over again)
+                if isinstance(tool_choice, ToolFunction):
+                    tool_choice = "auto"
 
-            # if a tool_call was forced set tool_choice to 'auto'
-            # (otherwise it will get forced over and over again)
-            if isinstance(tool_choice, ToolFunction):
-                tool_choice = "auto"
+            # no tool calls or not resolving tool calls, we are done!
+            else:
+                return state
 
-        # no tool calls or not resolving tool calls, we are done!
-        else:
-            return state
+    # propagate current state along with sample limit exceeded
+    except SampleLimitExceededError as ex:
+        raise ex.with_state(state)
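
Note: the try/except added here relies on SampleLimitExceededError carrying the most recent TaskState so that task_run_sample() (see run.py below) can still score the partial state. A rough sketch of that shape, offered as an assumption about the class in inspect_ai/solver/_limit.py rather than a copy of it:

# assumed shape, for illustration only
class SampleLimitExceededError(Exception):
    def __init__(self, type: str, value: int, limit: int, state=None) -> None:
        self.type = type    # e.g. "message", "token", "time"
        self.value = value  # observed value
        self.limit = limit  # configured limit
        self.state = state  # most recent TaskState, if attached
        super().__init__(f"sample {type} limit exceeded ({value} > {limit})")

    def with_state(self, state) -> "SampleLimitExceededError":
        # return a copy that carries the state at the point the limit was hit
        return SampleLimitExceededError(self.type, self.value, self.limit, state)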
inspect_ai/_eval/task/results.py CHANGED
@@ -65,11 +65,12 @@ def eval_results(
     # extract scorers info from scorers then create scorers info for any
     # scores not already accounted for by a scorer name
     scorers_info = [ScorerInfo.from_scorer(scorer) for scorer in (scorers or [])]
-    scorer_names = [info.name for info in scorers_info]
-    for name in set(key for sample_scores in scores for key in sample_scores):
-        if name not in scorer_names:
-            scorers_info.append(ScorerInfo.from_name(name))
-            scorer_names.append(name)
+    scorer_names = {info.name for info in scorers_info}
+    for sample_scores in scores:
+        for name, sample_score in sample_scores.items():
+            if sample_score.scorer is None and name not in scorer_names:
+                scorers_info.append(ScorerInfo.from_name(name))
+                scorer_names.add(name)
 
     # record scorer
     if len(scorers_info) > 0:
inspect_ai/_eval/task/run.py CHANGED
@@ -27,8 +27,12 @@ from inspect_ai._util.constants import (
 from inspect_ai._util.datetime import iso_now
 from inspect_ai._util.error import exception_message
 from inspect_ai._util.hooks import send_telemetry
-from inspect_ai._util.registry import is_registry_object, registry_log_name
-from inspect_ai._util.timeouts import Timeout, timeout, timeout_at
+from inspect_ai._util.registry import (
+    is_registry_object,
+    registry_log_name,
+    registry_unqualified_name,
+)
+from inspect_ai._util.timeouts import Timeout, timeout
 from inspect_ai._view.notify import view_notify_eval
 from inspect_ai.dataset import Dataset, Sample
 from inspect_ai.log import (
@@ -71,9 +75,9 @@ from inspect_ai.scorer._scorer import unique_scorer_name
 from inspect_ai.solver import Generate, Plan, TaskState
 from inspect_ai.solver._chain import Chain, unroll
 from inspect_ai.solver._fork import set_task_generate
+from inspect_ai.solver._limit import SampleLimitExceededError
 from inspect_ai.solver._solver import Solver
 from inspect_ai.solver._task_state import sample_state, set_sample_state, state_jsonable
-from inspect_ai.util._limit import SampleLimitExceededError
 from inspect_ai.util._sandbox.context import sandbox_connections
 from inspect_ai.util._sandbox.environment import SandboxEnvironmentSpec
 from inspect_ai.util._subtask import init_subtask
@@ -398,7 +402,13 @@ async def task_run(options: TaskRunOptions) -> EvalLog:
         view_notify_eval(logger.location)
 
         try:
-            await send_telemetry("eval_log", eval_log_json_str(eval_log))
+            if (
+                await send_telemetry("eval_log_location", eval_log.location)
+                == "not_handled"
+            ):
+                # Converting the eval log to JSON is expensive. Only do so if
+                # eval_log_location was not handled.
+                await send_telemetry("eval_log", eval_log_json_str(eval_log))
         except Exception as ex:
             py_logger.warning(
                 f"Error occurred sending telemetry: {exception_message(ex)}"
@@ -646,26 +656,21 @@ async def task_run_sample(
             )
 
             # capture most recent state for scoring
-            state = sample_state() or state
+            state = ex.state or sample_state() or state
             state.completed = True
 
         except BaseException as ex:
            error = handle_error(ex)
 
-        # set timeout for scoring. if the original timeout was never hit
-        # then just create a new timeout_cm targeting the original
-        # timeout time. if the original timeout was hit we still want
-        # to provide an opportunity for scoring, but we don't necessarily
+        # set timeout for scoring. if the original timeout was hit we still
+        # want to provide opportunity for scoring, but we don't necessarily
         # want to wait the full timeout again (especially in the case where
        # the cause of the timeout is a hung container and scoring requires
        # interacting with the container). as a middle ground we use half
        # of the original timeout value for scoring.
        if isinstance(timeout_cm, Timeout):
-            if not timeout_cm.expired():
-                timeout_cm = timeout_at(timeout_cm.when())
-            else:
-                assert time_limit
-                timeout_cm = timeout(time_limit / 2)
+            assert time_limit
+            timeout_cm = timeout(time_limit / 2)
 
        # turn off sample limits
        set_active_sample_token_limit(None)
@@ -690,6 +695,7 @@ async def task_run_sample(
                     sample_score = SampleScore(
                         score=score_result,
                         sample_id=sample.id,
+                        scorer=registry_unqualified_name(scorer),
                     )
                     transcript()._event(
                         ScoreEvent(
@@ -734,7 +740,7 @@ async def task_run_sample(
                error = handle_error(ex)
 
    # handle sandboxenv init errors
-    except BaseException as ex:
+    except Exception as ex:
        error = handle_error(ex)
 
    # complete the sample
inspect_ai/_util/hooks.py CHANGED
@@ -17,19 +17,29 @@ from .error import PrerequisiteError
 #
 # Telemetry can be optionally enabled by setting an INSPECT_TELEMETRY
 # environment variable that points to a function in a package which
-# conforms to the TelemetrySend signature below.
+# conforms to the TelemetrySend signature below. A return value of True
+# indicates that the telemetry event was handled.
 
-# There are currently two types of telemetry sent:
-#   - model_usage (type ModelUsage)
-#   - eval_log (type EvalLog)
+# There are currently three types of telemetry sent:
+#   - model_usage (JSON string of the model usage)
+#   - eval_log_location (file path or URL string of the eval log)
+#   - eval_log (JSON string of the eval log)
+#     [only sent if eval_log_location unhandled]
+# The eval_log_location type is preferred over eval_log as it means we can take
+# advantage of the .eval format and avoid loading the whole log into memory.
 
-TelemetrySend = Callable[[str, str], Awaitable[None]]
+TelemetrySend = Callable[[str, str], Awaitable[bool]]
 
 
-async def send_telemetry(type: Literal["model_usage", "eval_log"], json: str) -> None:
+async def send_telemetry(
+    type: Literal["model_usage", "eval_log", "eval_log_location"], json: str
+) -> Literal["handled", "not_handled", "no_subscribers"]:
     global _send_telemetry
     if _send_telemetry:
-        await _send_telemetry(type, json)
+        if await _send_telemetry(type, json):
+            return "handled"
+        return "not_handled"
+    return "no_subscribers"
 
 
 _send_telemetry: TelemetrySend | None = None
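
Note: a handler conforming to the updated TelemetrySend signature might look like the sketch below. The module path and helper functions are hypothetical; per the comments above, INSPECT_TELEMETRY would point at the function, and returning True marks the event as handled (so the expensive "eval_log" payload is skipped when "eval_log_location" suffices).

# mypkg/telemetry.py (hypothetical), referenced via INSPECT_TELEMETRY
async def send(type: str, json: str) -> bool:
    if type == "eval_log_location":
        store_log_location(json)  # hypothetical helper: record the path/URL only
        return True               # handled, so "eval_log" will not be sent
    elif type == "model_usage":
        store_model_usage(json)   # hypothetical helper
        return True
    return False                  # not handled; caller may fall back to "eval_log"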