inspect-ai 0.3.57__py3-none-any.whl → 0.3.59__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (161)
  1. inspect_ai/__init__.py +2 -1
  2. inspect_ai/_cli/common.py +7 -3
  3. inspect_ai/_cli/eval.py +17 -2
  4. inspect_ai/_cli/trace.py +21 -2
  5. inspect_ai/_display/core/active.py +4 -3
  6. inspect_ai/_display/core/config.py +3 -3
  7. inspect_ai/_display/core/panel.py +7 -3
  8. inspect_ai/_display/plain/__init__.py +0 -0
  9. inspect_ai/_display/plain/display.py +203 -0
  10. inspect_ai/_display/rich/display.py +4 -9
  11. inspect_ai/_display/textual/app.py +4 -1
  12. inspect_ai/_display/textual/widgets/port_mappings.py +110 -0
  13. inspect_ai/_display/textual/widgets/samples.py +119 -16
  14. inspect_ai/_display/textual/widgets/sandbox.py +37 -0
  15. inspect_ai/_eval/eval.py +32 -20
  16. inspect_ai/_eval/evalset.py +7 -5
  17. inspect_ai/_eval/score.py +1 -0
  18. inspect_ai/_eval/task/__init__.py +2 -2
  19. inspect_ai/_eval/task/images.py +40 -25
  20. inspect_ai/_eval/task/results.py +50 -22
  21. inspect_ai/_eval/task/run.py +180 -124
  22. inspect_ai/_eval/task/sandbox.py +10 -5
  23. inspect_ai/_eval/task/task.py +140 -25
  24. inspect_ai/_util/constants.py +2 -0
  25. inspect_ai/_util/content.py +23 -1
  26. inspect_ai/_util/images.py +20 -17
  27. inspect_ai/_util/kvstore.py +73 -0
  28. inspect_ai/_util/notgiven.py +18 -0
  29. inspect_ai/_util/port_names.py +61 -0
  30. inspect_ai/_util/text.py +23 -0
  31. inspect_ai/_util/thread.py +5 -0
  32. inspect_ai/_view/www/App.css +31 -1
  33. inspect_ai/_view/www/dist/assets/index.css +31 -1
  34. inspect_ai/_view/www/dist/assets/index.js +25375 -1846
  35. inspect_ai/_view/www/log-schema.json +129 -15
  36. inspect_ai/_view/www/package.json +2 -0
  37. inspect_ai/_view/www/src/App.mjs +8 -10
  38. inspect_ai/_view/www/src/Types.mjs +0 -1
  39. inspect_ai/_view/www/src/components/ChatView.mjs +133 -43
  40. inspect_ai/_view/www/src/components/ExpandablePanel.mjs +0 -4
  41. inspect_ai/_view/www/src/components/LargeModal.mjs +19 -20
  42. inspect_ai/_view/www/src/components/MessageBand.mjs +2 -2
  43. inspect_ai/_view/www/src/components/MessageContent.mjs +43 -1
  44. inspect_ai/_view/www/src/components/TabSet.mjs +3 -1
  45. inspect_ai/_view/www/src/components/VirtualList.mjs +266 -84
  46. inspect_ai/_view/www/src/index.js +75 -2
  47. inspect_ai/_view/www/src/navbar/Navbar.mjs +3 -0
  48. inspect_ai/_view/www/src/navbar/SecondaryBar.mjs +18 -9
  49. inspect_ai/_view/www/src/samples/SampleDialog.mjs +5 -1
  50. inspect_ai/_view/www/src/samples/SampleDisplay.mjs +23 -15
  51. inspect_ai/_view/www/src/samples/SampleList.mjs +18 -48
  52. inspect_ai/_view/www/src/samples/SampleTranscript.mjs +8 -3
  53. inspect_ai/_view/www/src/samples/SamplesDescriptor.mjs +29 -13
  54. inspect_ai/_view/www/src/samples/SamplesTab.mjs +4 -1
  55. inspect_ai/_view/www/src/samples/SamplesTools.mjs +8 -8
  56. inspect_ai/_view/www/src/samples/tools/SampleFilter.mjs +712 -89
  57. inspect_ai/_view/www/src/samples/tools/filters.mjs +260 -87
  58. inspect_ai/_view/www/src/samples/transcript/ErrorEventView.mjs +24 -2
  59. inspect_ai/_view/www/src/samples/transcript/EventPanel.mjs +29 -24
  60. inspect_ai/_view/www/src/samples/transcript/EventRow.mjs +1 -1
  61. inspect_ai/_view/www/src/samples/transcript/InfoEventView.mjs +24 -2
  62. inspect_ai/_view/www/src/samples/transcript/InputEventView.mjs +24 -2
  63. inspect_ai/_view/www/src/samples/transcript/ModelEventView.mjs +31 -10
  64. inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.mjs +24 -2
  65. inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.mjs +23 -2
  66. inspect_ai/_view/www/src/samples/transcript/ScoreEventView.mjs +24 -2
  67. inspect_ai/_view/www/src/samples/transcript/StepEventView.mjs +33 -3
  68. inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.mjs +25 -2
  69. inspect_ai/_view/www/src/samples/transcript/ToolEventView.mjs +25 -2
  70. inspect_ai/_view/www/src/samples/transcript/TranscriptView.mjs +193 -11
  71. inspect_ai/_view/www/src/samples/transcript/Types.mjs +10 -0
  72. inspect_ai/_view/www/src/samples/transcript/state/StateEventView.mjs +26 -2
  73. inspect_ai/_view/www/src/types/log.d.ts +62 -27
  74. inspect_ai/_view/www/src/utils/Format.mjs +10 -3
  75. inspect_ai/_view/www/src/utils/Json.mjs +12 -6
  76. inspect_ai/_view/www/src/workspace/WorkSpace.mjs +10 -4
  77. inspect_ai/_view/www/vite.config.js +7 -0
  78. inspect_ai/_view/www/yarn.lock +116 -0
  79. inspect_ai/approval/_human/__init__.py +0 -0
  80. inspect_ai/approval/_human/util.py +2 -2
  81. inspect_ai/approval/_policy.py +12 -6
  82. inspect_ai/dataset/_sources/csv.py +2 -1
  83. inspect_ai/dataset/_sources/json.py +2 -1
  84. inspect_ai/dataset/_sources/util.py +15 -7
  85. inspect_ai/log/_condense.py +11 -1
  86. inspect_ai/log/_log.py +3 -6
  87. inspect_ai/log/_recorders/eval.py +19 -8
  88. inspect_ai/log/_samples.py +26 -5
  89. inspect_ai/log/_transcript.py +32 -2
  90. inspect_ai/model/__init__.py +10 -2
  91. inspect_ai/model/_call_tools.py +59 -12
  92. inspect_ai/model/_chat_message.py +2 -4
  93. inspect_ai/model/_conversation.py +61 -0
  94. inspect_ai/model/_generate_config.py +10 -4
  95. inspect_ai/model/_model.py +117 -18
  96. inspect_ai/model/_model_output.py +7 -2
  97. inspect_ai/model/_providers/anthropic.py +109 -51
  98. inspect_ai/model/_providers/azureai.py +26 -24
  99. inspect_ai/model/_providers/bedrock.py +43 -44
  100. inspect_ai/model/_providers/google.py +121 -58
  101. inspect_ai/model/_providers/groq.py +7 -5
  102. inspect_ai/model/_providers/hf.py +11 -6
  103. inspect_ai/model/_providers/mistral.py +17 -20
  104. inspect_ai/model/_providers/openai.py +32 -21
  105. inspect_ai/model/_providers/openai_o1.py +9 -8
  106. inspect_ai/model/_providers/providers.py +1 -1
  107. inspect_ai/model/_providers/together.py +8 -8
  108. inspect_ai/model/_providers/vertex.py +18 -8
  109. inspect_ai/scorer/__init__.py +13 -2
  110. inspect_ai/scorer/_metrics/__init__.py +2 -2
  111. inspect_ai/scorer/_metrics/std.py +3 -3
  112. inspect_ai/scorer/_reducer/reducer.py +1 -1
  113. inspect_ai/scorer/_scorer.py +2 -2
  114. inspect_ai/solver/__init__.py +2 -5
  115. inspect_ai/solver/_prompt.py +35 -5
  116. inspect_ai/solver/_task_state.py +80 -38
  117. inspect_ai/tool/__init__.py +11 -1
  118. inspect_ai/tool/_tool.py +21 -3
  119. inspect_ai/tool/_tool_call.py +10 -0
  120. inspect_ai/tool/_tool_def.py +16 -5
  121. inspect_ai/tool/_tool_with.py +21 -4
  122. inspect_ai/tool/beta/__init__.py +5 -0
  123. inspect_ai/tool/beta/_computer/__init__.py +3 -0
  124. inspect_ai/tool/beta/_computer/_common.py +133 -0
  125. inspect_ai/tool/beta/_computer/_computer.py +155 -0
  126. inspect_ai/tool/beta/_computer/_computer_split.py +198 -0
  127. inspect_ai/tool/beta/_computer/_resources/Dockerfile +100 -0
  128. inspect_ai/tool/beta/_computer/_resources/README.md +30 -0
  129. inspect_ai/tool/beta/_computer/_resources/entrypoint/entrypoint.sh +18 -0
  130. inspect_ai/tool/beta/_computer/_resources/entrypoint/novnc_startup.sh +20 -0
  131. inspect_ai/tool/beta/_computer/_resources/entrypoint/x11vnc_startup.sh +48 -0
  132. inspect_ai/tool/beta/_computer/_resources/entrypoint/xfce_startup.sh +13 -0
  133. inspect_ai/tool/beta/_computer/_resources/entrypoint/xvfb_startup.sh +48 -0
  134. inspect_ai/tool/beta/_computer/_resources/image_home_dir/Desktop/Firefox Web Browser.desktop +10 -0
  135. inspect_ai/tool/beta/_computer/_resources/image_home_dir/Desktop/Visual Studio Code.desktop +10 -0
  136. inspect_ai/tool/beta/_computer/_resources/image_home_dir/Desktop/XPaint.desktop +10 -0
  137. inspect_ai/tool/beta/_computer/_resources/tool/__init__.py +0 -0
  138. inspect_ai/tool/beta/_computer/_resources/tool/_logger.py +22 -0
  139. inspect_ai/tool/beta/_computer/_resources/tool/_run.py +42 -0
  140. inspect_ai/tool/beta/_computer/_resources/tool/_tool_result.py +33 -0
  141. inspect_ai/tool/beta/_computer/_resources/tool/_x11_client.py +262 -0
  142. inspect_ai/tool/beta/_computer/_resources/tool/computer_tool.py +85 -0
  143. inspect_ai/tool/beta/_computer/_resources/tool/requirements.txt +0 -0
  144. inspect_ai/util/__init__.py +2 -3
  145. inspect_ai/util/{_trace.py → _conversation.py} +3 -17
  146. inspect_ai/util/_display.py +14 -4
  147. inspect_ai/util/_limit.py +26 -0
  148. inspect_ai/util/_sandbox/context.py +12 -13
  149. inspect_ai/util/_sandbox/docker/compose.py +24 -11
  150. inspect_ai/util/_sandbox/docker/docker.py +84 -14
  151. inspect_ai/util/_sandbox/docker/internal.py +3 -1
  152. inspect_ai/util/_sandbox/environment.py +27 -1
  153. inspect_ai/util/_sandbox/local.py +1 -0
  154. {inspect_ai-0.3.57.dist-info → inspect_ai-0.3.59.dist-info}/METADATA +2 -2
  155. {inspect_ai-0.3.57.dist-info → inspect_ai-0.3.59.dist-info}/RECORD +159 -128
  156. inspect_ai/_view/www/src/samples/transcript/TranscriptState.mjs +0 -70
  157. inspect_ai/model/_trace.py +0 -48
  158. {inspect_ai-0.3.57.dist-info → inspect_ai-0.3.59.dist-info}/LICENSE +0 -0
  159. {inspect_ai-0.3.57.dist-info → inspect_ai-0.3.59.dist-info}/WHEEL +0 -0
  160. {inspect_ai-0.3.57.dist-info → inspect_ai-0.3.59.dist-info}/entry_points.txt +0 -0
  161. {inspect_ai-0.3.57.dist-info → inspect_ai-0.3.59.dist-info}/top_level.txt +0 -0
inspect_ai/_eval/task/run.py

@@ -4,6 +4,7 @@ import sys
 import time
 from copy import deepcopy
 from dataclasses import dataclass, field
+from datetime import datetime
 from logging import getLogger
 from pathlib import PurePath
 from typing import Callable, Literal
@@ -26,10 +27,7 @@ from inspect_ai._util.constants import (
 from inspect_ai._util.datetime import iso_now
 from inspect_ai._util.error import exception_message
 from inspect_ai._util.hooks import send_telemetry
-from inspect_ai._util.registry import (
-    is_registry_object,
-    registry_log_name,
-)
+from inspect_ai._util.registry import is_registry_object, registry_log_name
 from inspect_ai._util.timeouts import Timeout, timeout, timeout_at
 from inspect_ai._view.notify import view_notify_eval
 from inspect_ai.dataset import Dataset, Sample
@@ -44,7 +42,11 @@ from inspect_ai.log import (
 from inspect_ai.log._condense import condense_sample
 from inspect_ai.log._file import eval_log_json_str
 from inspect_ai.log._log import EvalSampleLimit, EvalSampleReductions, eval_error
-from inspect_ai.log._samples import active_sample
+from inspect_ai.log._samples import (
+    active_sample,
+    set_active_sample_message_limit,
+    set_active_sample_token_limit,
+)
 from inspect_ai.log._transcript import (
     ErrorEvent,
     SampleInitEvent,
@@ -71,6 +73,8 @@ from inspect_ai.solver._chain import Chain, unroll
 from inspect_ai.solver._fork import set_task_generate
 from inspect_ai.solver._solver import Solver
 from inspect_ai.solver._task_state import sample_state, set_sample_state, state_jsonable
+from inspect_ai.util._limit import SampleLimitExceededError
+from inspect_ai.util._sandbox.context import sandbox_connections
 from inspect_ai.util._sandbox.environment import SandboxEnvironmentSpec
 from inspect_ai.util._subtask import init_subtask
 
@@ -79,10 +83,10 @@ from ..task import Task
 from .error import SampleErrorHandler
 from .generate import task_generate
 from .images import (
-    sample_without_base64_images,
-    samples_with_base64_images,
-    state_without_base64_images,
-    states_with_base64_images,
+    sample_without_base64_content,
+    samples_with_base64_content,
+    state_without_base64_content,
+    states_with_base64_content,
 )
 from .log import TaskLogger, collect_eval_data, log_start
 from .results import eval_results
@@ -533,21 +537,18 @@ async def task_run_sample(
         else contextlib.nullcontext()
     )
 
-    # use timeout if provided
-    timeout_cm = (
-        timeout(time_limit) if time_limit is not None else contextlib.nullcontext()
-    )
-
     # helper to handle exceptions (will throw if we've exceeded the limit)
     def handle_error(ex: BaseException) -> EvalError:
         err = sample_error(ex)
+        py_logger.warning(
+            f"Sample error (id: {sample.id}, epoch: {state.epoch}): {exception_message(ex)})"
+        )
         transcript()._event(ErrorEvent(error=err))
         return err
 
     # solver loop
     async with (
         semaphore_cm,
-        sandboxenv_cm,
         active_sample(
             task=task_name,
             model=str(state.model),
@@ -561,125 +562,179 @@ async def task_run_sample(
         ) as active,
     ):
         error: EvalError | None = None
+        results: dict[str, SampleScore] = {}
         try:
-            async with timeout_cm:
-                # sample init event (remove file bodies as they have content or absolute paths)
-                event_sample = sample.model_copy(
-                    update=dict(files={k: "" for k in sample.files.keys()})
-                    if sample.files
-                    else None
-                )
-                transcript()._event(
-                    SampleInitEvent(sample=event_sample, state=state_jsonable(state))
-                )
+            async with sandboxenv_cm:
+                try:
+                    # update active sample wth sandboxes now that we are initialised
+                    active.sandboxes = await sandbox_connections()
+
+                    # initialise timeout context manager
+                    timeout_cm = (
+                        timeout(time_limit)
+                        if time_limit is not None
+                        else contextlib.nullcontext()
+                    )
 
-                # set progress for plan then run it
-                state = await plan(state, generate)
+                    # run sample w/ optional timeout
+                    async with timeout_cm:
+                        # mark started
+                        active.started = datetime.now().timestamp()
 
-        except TimeoutError:
-            if time_limit is not None:
-                transcript()._event(
-                    SampleLimitEvent(
-                        type="time",
-                        message=f"Sample completed: exceeded time limit ({time_limit:,} seconds)",
-                        limit=time_limit,
-                    )
-                )
-            else:
-                py_logger.warning(
-                    "Unexpected timeout error reached top of sample stack. Are you handling TimeoutError when applying timeouts?"
-                )
+                        # sample init event (remove file bodies as they have content or absolute paths)
+                        event_sample = sample.model_copy(
+                            update=dict(files={k: "" for k in sample.files.keys()})
+                            if sample.files
+                            else None
+                        )
+                        transcript()._event(
+                            SampleInitEvent(
+                                sample=event_sample, state=state_jsonable(state)
+                            )
+                        )
 
-            # capture most recent state for scoring
-            state = sample_state() or state
+                        # set progress for plan then run it
+                        state = await plan(state, generate)
 
-        except asyncio.CancelledError as ex:
-            if active.interrupt_action:
-                # record eve t
-                transcript()._event(
-                    SampleLimitEvent(
-                        type="operator",
-                        message="Sample completed: interrupted by operator",
-                    )
-                )
+                except TimeoutError:
+                    if time_limit is not None:
+                        transcript()._event(
+                            SampleLimitEvent(
+                                type="time",
+                                message=f"Sample completed: exceeded time limit ({time_limit:,} seconds)",
+                                limit=time_limit,
+                            )
+                        )
+                    else:
+                        py_logger.warning(
+                            "Unexpected timeout error reached top of sample stack. Are you handling TimeoutError when applying timeouts?"
+                        )
 
-                # handle the action
-                match active.interrupt_action:
-                    case "score":
-                        # continue to scoring (capture the most recent state)
-                        state = sample_state() or state
-                    case "error":
-                        # default error handling
-                        error = handle_error(ex)
+                    # capture most recent state for scoring
+                    state = sample_state() or state
 
-            else:
-                raise
+                except asyncio.CancelledError as ex:
+                    if active.interrupt_action:
+                        # record eve t
+                        transcript()._event(
+                            SampleLimitEvent(
+                                type="operator",
+                                message="Sample completed: interrupted by operator",
+                            )
+                        )
 
-        except BaseException as ex:
-            error = handle_error(ex)
+                        # handle the action
+                        match active.interrupt_action:
+                            case "score":
+                                # continue to scoring (capture the most recent state)
+                                state = sample_state() or state
+                            case "error":
+                                # default error handling
+                                error = handle_error(ex)
+
+                    else:
+                        raise
+
+                except SampleLimitExceededError as ex:
+                    # sample limit event
+                    transcript()._event(
+                        SampleLimitEvent(
+                            type=ex.type,
+                            limit=ex.limit,
+                            message=f"Sample completed: {ex.message}",
+                        )
+                    )
 
-        # set timeout for scoring. if the original timeout was never hit
-        # then just create a new timeout_cm targeting the original
-        # timeout time. if the original timeout was hit we still want
-        # to provide an opportunity for scoring, but we don't necessarily
-        # want to wait the full timeout again (especially in the case where
-        # the cause of the timeout is a hung container and scoring requires
-        # interacting with the container). as a middle ground we use half
-        # of the original timeout value for scoring.
-        if isinstance(timeout_cm, Timeout):
-            if not timeout_cm.expired():
-                timeout_cm = timeout_at(timeout_cm.when())
-            else:
-                assert time_limit
-                timeout_cm = timeout(time_limit / 2)
+                    # capture most recent state for scoring
+                    state = sample_state() or state
+                    state.completed = True
+
+                except BaseException as ex:
+                    error = handle_error(ex)
+
+                # set timeout for scoring. if the original timeout was never hit
+                # then just create a new timeout_cm targeting the original
+                # timeout time. if the original timeout was hit we still want
+                # to provide an opportunity for scoring, but we don't necessarily
+                # want to wait the full timeout again (especially in the case where
+                # the cause of the timeout is a hung container and scoring requires
+                # interacting with the container). as a middle ground we use half
+                # of the original timeout value for scoring.
+                if isinstance(timeout_cm, Timeout):
+                    if not timeout_cm.expired():
+                        timeout_cm = timeout_at(timeout_cm.when())
+                    else:
+                        assert time_limit
+                        timeout_cm = timeout(time_limit / 2)
+
+                # turn off sample limits
+                set_active_sample_token_limit(None)
+                set_active_sample_message_limit(None)
+
+                # scoring
+                try:
+                    # timeout during scoring will result in an ordinary sample error
+                    async with timeout_cm:
+                        if error is None:
+                            for scorer in scorers or []:
+                                scorer_name = unique_scorer_name(
+                                    scorer, list(results.keys())
+                                )
+                                with transcript().step(name=scorer_name, type="scorer"):
+                                    score_result = (
+                                        await scorer(state, Target(sample.target))
+                                        if scorer
+                                        else None
+                                    )
+                                    if score_result is not None:
+                                        sample_score = SampleScore(
+                                            score=score_result,
+                                            sample_id=sample.id,
+                                        )
+                                        transcript()._event(
+                                            ScoreEvent(
+                                                score=score_result, target=sample.target
+                                            )
+                                        )
+                                        results[scorer_name] = sample_score
+
+                        # add scores returned by solvers
+                        if state.scores is not None:
+                            for name, score in state.scores.items():
+                                results[name] = SampleScore(
+                                    score=score, sample_id=state.sample_id
+                                )
+
+                        # propagate results into scores
+                        state.scores = {k: v.score for k, v in results.items()}
+
+                except asyncio.CancelledError:
+                    if active.interrupt_action:
+                        transcript()._event(
+                            SampleLimitEvent(
+                                type="operator",
+                                message="Unable to score sample due to operator interruption",
+                            )
+                        )
 
-        # scoring
-        try:
-            # timeout during scoring will result in an ordinary sample error
-            async with timeout_cm:
-                results: dict[str, SampleScore] = {}
-                if scorers and error is None:
-                    for scorer in scorers:
-                        scorer_name = unique_scorer_name(scorer, list(results.keys()))
-                        with transcript().step(name=scorer_name, type="scorer"):
-                            score_result = (
-                                await scorer(state, Target(sample.target))
-                                if scorer
-                                else None
+                    raise
+
+                except BaseException as ex:
+                    # note timeout
+                    if isinstance(ex, TimeoutError):
+                        transcript()._event(
+                            SampleLimitEvent(
+                                type="time",
+                                message=f"Unable to score sample due to exceeded time limit ({time_limit:,} seconds)",
+                                limit=time_limit,
                             )
-                            if score_result is not None:
-                                sample_score = SampleScore(
-                                    score=score_result,
-                                    sample_id=sample.id,
-                                )
-                                transcript()._event(
-                                    ScoreEvent(score=score_result, target=sample.target)
-                                )
-                                results[scorer_name] = sample_score
-
-        except asyncio.CancelledError:
-            if active.interrupt_action:
-                transcript()._event(
-                    SampleLimitEvent(
-                        type="operator",
-                        message="Unable to score sample due to operator interruption",
-                    )
-                )
+                        )
 
-            raise
+                    # handle error (this will throw if we've exceeded the limit)
+                    error = handle_error(ex)
 
+            # handle sandboxenv init errors
         except BaseException as ex:
-            # note timeout
-            if isinstance(ex, TimeoutError):
-                transcript()._event(
-                    SampleLimitEvent(
-                        type="time",
-                        message=f"Unable to score sample due to exceeded time limit ({time_limit:,} seconds)",
-                        limit=time_limit,
-                    )
-                )
-
-            # handle error (this will throw if we've exceeded the limit)
             error = handle_error(ex)
 
         # complete the sample
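
For context, a hedged sketch (not part of this diff) of raising the new `SampleLimitExceededError` so that the handler above records a `SampleLimitEvent`. The constructor keywords are assumptions inferred from the `ex.type` / `ex.limit` / `ex.message` accesses in the hunk; `inspect_ai/util/_limit.py` itself (+26 lines) is not shown here.

```python
# Hedged sketch: a limit check that surfaces as a SampleLimitEvent via the
# handler in task_run_sample above. Constructor keywords are assumed.
from inspect_ai.util._limit import SampleLimitExceededError

def check_message_limit(count: int, limit: int | None) -> None:
    if limit is not None and count >= limit:
        raise SampleLimitExceededError(
            type="message",  # mirrored into SampleLimitEvent.type by the handler
            value=count,     # assumed field: the value that crossed the limit
            limit=limit,
            message=f"message limit exceeded ({limit:,} messages)",
        )
```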
@@ -689,12 +744,12 @@ async def task_run_sample(
     if logger is not None:
         # if we are logging images then be sure to base64 images injected by solvers
         if log_images:
-            state = (await states_with_base64_images([state]))[0]
+            state = (await states_with_base64_content([state]))[0]
 
         # otherwise ensure there are no base64 images in sample or messages
         else:
-            sample = sample_without_base64_images(sample)
-            state = state_without_base64_images(state)
+            sample = sample_without_base64_content(sample)
+            state = state_without_base64_content(state)
 
         # log the sample
         await log_sample(
@@ -784,7 +839,7 @@ async def resolve_dataset(
 
     # if we are logging images then resolve sample images here
     if log_images:
-        samples = await samples_with_base64_images(samples)
+        samples = await samples_with_base64_content(samples)
 
     # prime the eval tasks (deep copy so they share no state w/ sample)
     sample_epochs: list[int] = []
@@ -797,6 +852,7 @@ async def resolve_dataset(
             epoch=epoch,
             model=model_name,
             input=sample.input,
+            target=Target(sample.target),
             choices=sample.choices,
             messages=sample_messages(sample),
             message_limit=message_limit,
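
With `target=Target(sample.target)` now passed into `TaskState` (see also `inspect_ai/solver/_task_state.py`, +80 −38 in the file list), solvers can read the sample target directly. A hedged usage sketch; the solver below is hypothetical and `Target.text` behavior is assumed from the scorer API.

```python
# Hedged sketch: reading the newly populated target from TaskState in a solver.
from inspect_ai.solver import Generate, TaskState, solver

@solver
def print_target():  # hypothetical solver, for illustration only
    async def solve(state: TaskState, generate: Generate) -> TaskState:
        # state.target is constructed as Target(sample.target) per the hunk above
        print(f"sample target: {state.target.text}")
        return await generate(state)

    return solve
```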
inspect_ai/_eval/task/sandbox.py

@@ -4,11 +4,13 @@ import contextlib
 from random import random
 from typing import AsyncGenerator, Callable, NamedTuple, cast
 
+import httpx
+
 from inspect_ai._eval.task.task import Task
 from inspect_ai._eval.task.util import task_run_dir
 from inspect_ai._util.file import file, filesystem
 from inspect_ai._util.registry import registry_unqualified_name
-from inspect_ai._util.url import data_uri_to_base64, is_data_uri
+from inspect_ai._util.url import data_uri_to_base64, is_data_uri, is_http_url
 from inspect_ai.dataset import Sample
 from inspect_ai.util._concurrency import concurrency
 from inspect_ai.util._sandbox.context import (
@@ -65,12 +67,12 @@ async def sandboxenv_context(
     files: dict[str, bytes] = {}
     if sample.files:
         for path, contents in sample.files.items():
-            files[path] = read_sandboxenv_file(contents)
+            files[path] = await read_sandboxenv_file(contents)
 
     # read setup script from sample (add bash shebang if necessary)
     setup: bytes | None = None
     if sample.setup:
-        setup = read_sandboxenv_file(sample.setup)
+        setup = await read_sandboxenv_file(sample.setup)
         setup_str = setup.decode(encoding="utf-8")
         if not setup_str.strip().startswith("#!"):
             setup_str = f"#!/usr/bin/env bash\n\n{setup_str}"
@@ -108,13 +110,16 @@ async def sandboxenv_context(
         )
 
 
-def read_sandboxenv_file(contents: str) -> bytes:
+async def read_sandboxenv_file(contents: str) -> bytes:
     if is_data_uri(contents):
         contents_base64 = data_uri_to_base64(contents)
         file_bytes = base64.b64decode(contents_base64)
+    elif is_http_url(contents):
+        client = httpx.AsyncClient()
+        file_bytes = (await client.get(contents, follow_redirects=True)).content
     else:
         # try to read as a file (if it doesn't exist or has a path not cool w/
-        # the fileystem then we fall back to contents)
+        # the filesystem then we fall back to contents)
         try:
             fs = filesystem(contents)
             if fs.exists(contents):
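
`read_sandboxenv_file` is now async and resolves sample file contents in order: data URI, then HTTP(S) URL (fetched via `httpx` with redirects followed), then filesystem path, falling back to literal contents. A hedged illustration of what a sample's `files` mapping can now contain; all values below are hypothetical.

```python
# Hypothetical sample files mapping exercising each branch of
# read_sandboxenv_file (resolution order per the hunk above).
files = {
    "blob.bin": "data:application/octet-stream;base64,aGVsbG8=",  # decoded from data URI
    "fetched.txt": "https://example.com/setup/fetched.txt",       # downloaded with httpx
    "local.cfg": "./assets/local.cfg",                            # read from the filesystem if it exists
    "inline.txt": "plain text used verbatim",                     # fallback: literal contents
}
```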
inspect_ai/_eval/task/task.py

@@ -1,3 +1,4 @@
+from copy import deepcopy
 from dataclasses import dataclass
 from logging import getLogger
 from typing import Any, Callable, Sequence, cast
@@ -6,6 +7,7 @@ from pydantic import BaseModel
 from typing_extensions import TypedDict, Unpack
 
 from inspect_ai._util.logger import warn_once
+from inspect_ai._util.notgiven import NOT_GIVEN, NotGiven
 from inspect_ai._util.registry import is_registry_object, registry_info
 from inspect_ai.approval._policy import ApprovalPolicy, approval_policies_from_config
 from inspect_ai.dataset import Dataset, MemoryDataset, Sample
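
`NOT_GIVEN`/`NotGiven` come from the new `inspect_ai/_util/notgiven.py` (+18 lines, contents not shown in this diff). A minimal sketch of the conventional sentinel it presumably implements, which lets `task_with` below distinguish "argument not passed" from an explicit `None`:

```python
# Minimal sketch of a NotGiven sentinel, assuming the common pattern
# (e.g. the openai client's); the actual notgiven.py may differ.
from typing import Literal

class NotGiven:
    """Sentinel distinguishing omitted arguments from explicit None."""

    def __bool__(self) -> Literal[False]:
        return False

    def __repr__(self) -> str:
        return "NOT_GIVEN"

NOT_GIVEN = NotGiven()
```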
@@ -115,35 +117,15 @@ class Task:
                 f"DEPRECATED: the '{arg}' parameter is deprecated (please use the '{newarg}' parameter instead)",
             )
 
-        # resolve epochs / epochs_reducer
-        if isinstance(epochs, int):
-            epochs = Epochs(epochs)
-        if epochs is not None and epochs.epochs < 1:
-            raise ValueError("epochs must be a positive integer.")
-
-        # resolve dataset (provide empty sample to bootstrap tasks w/o samples,
-        # which could occur for testing or for an interactive mode eval)
-        dataset = dataset or [Sample(input="prompt")]
-        self.dataset: Dataset = (
-            dataset if isinstance(dataset, Dataset) else MemoryDataset(list(dataset))
-        )
+        self.dataset = resolve_dataset(dataset)
         self.setup = setup
-        self.solver = chain(solver) if isinstance(solver, list) else solver
-        self.scorer = (
-            scorer
-            if isinstance(scorer, list)
-            else [scorer]
-            if scorer is not None
-            else None
-        )
+        self.solver = resolve_solver(solver)
+        self.scorer = resolve_scorer(scorer)
         self.metrics = metrics
         self.config = config
         self.sandbox = resolve_sandbox_environment(sandbox)
-        self.approval = (
-            approval_policies_from_config(approval)
-            if isinstance(approval, str)
-            else approval
-        )
+        self.approval = resolve_approval(approval)
+        epochs = resolve_epochs(epochs)
         self.epochs = epochs.epochs if epochs else None
         self.epochs_reducer = epochs.reducer if epochs else None
         self.fail_on_error = fail_on_error
@@ -171,6 +153,106 @@ class Task:
         return dict()
 
 
+def task_with(
+    task: Task,
+    *,
+    dataset: Dataset | Sequence[Sample] | None | NotGiven = NOT_GIVEN,
+    setup: Solver | list[Solver] | None | NotGiven = NOT_GIVEN,
+    solver: Solver | list[Solver] | NotGiven = NOT_GIVEN,
+    scorer: Scorer | list[Scorer] | None | NotGiven = NOT_GIVEN,
+    metrics: list[Metric] | dict[str, list[Metric]] | None | NotGiven = NOT_GIVEN,
+    config: GenerateConfig | NotGiven = NOT_GIVEN,
+    sandbox: SandboxEnvironmentType | None | NotGiven = NOT_GIVEN,
+    approval: str | list[ApprovalPolicy] | None | NotGiven = NOT_GIVEN,
+    epochs: int | Epochs | None | NotGiven = NOT_GIVEN,
+    fail_on_error: bool | float | None | NotGiven = NOT_GIVEN,
+    message_limit: int | None | NotGiven = NOT_GIVEN,
+    token_limit: int | None | NotGiven = NOT_GIVEN,
+    time_limit: int | None | NotGiven = NOT_GIVEN,
+    name: str | None | NotGiven = NOT_GIVEN,
+    version: int | NotGiven = NOT_GIVEN,
+    metadata: dict[str, Any] | None | NotGiven = NOT_GIVEN,
+) -> Task:
+    """Task adapted with alternate values for one or more options.
+
+    Args:
+        task (Task): Task to adapt (it is deep copied prior to mutating options)
+        dataset (Dataset | Sequence[Sample]): Dataset to evaluate
+        setup: (Solver | list[Solver] | None): Setup step (always run
+            even when the main `solver` is replaced).
+        solver: (Solver | list[Solver]): Solver or list of solvers.
+            Defaults to generate(), a normal call to the model.
+        scorer: (Scorer | list[Scorer] | None): Scorer used to evaluate model output.
+        metrics (list[Metric] | dict[str, list[Metric]] | None):
+            Alternative metrics (overrides the metrics provided by the specified scorer).
+        config (GenerateConfig): Model generation config.
+        sandbox (SandboxEnvironmentType | None): Sandbox environment type
+            (or optionally a str or tuple with a shorthand spec)
+        approval: (str | list[ApprovalPolicy] | None): Tool use approval policies.
+            Either a path to an approval policy config file or a list of approval policies.
+            Defaults to no approval policy.
+        epochs (int | Epochs | None): Epochs to repeat samples for and optional score
+            reducer function(s) used to combine sample scores (defaults to "mean")
+        fail_on_error (bool | float | None): `True` to fail on first sample error
+            (default); `False` to never fail on sample errors; Value between 0 and 1
+            to fail if a proportion of total samples fails. Value greater than 1 to fail
+            eval if a count of samples fails.
+        message_limit (int | None): Limit on total messages used for each sample.
+        token_limit (int | None): Limit on total tokens used for each sample.
+        time_limit (int | None): Limit on time (in seconds) for execution of each sample.
+        name: (str | None): Task name. If not specified is automatically
+            determined based on the name of the task directory (or "task")
+            if its anonymous task (e.g. created in a notebook and passed to
+            eval() directly)
+        version: (int): Version of task (to distinguish evolutions
+            of the task spec or breaking changes to it)
+        metadata: (dict[str, Any] | None): Additional metadata to associate with the task.
+
+    Returns:
+        Task: Task adapted with alternate options.
+    """
+    # deep copy the task
+    task = deepcopy(task)
+
+    if not isinstance(dataset, NotGiven):
+        task.dataset = resolve_dataset(dataset)
+    if not isinstance(setup, NotGiven):
+        task.setup = setup
+    if not isinstance(solver, NotGiven):
+        task.solver = resolve_solver(solver)
+    if not isinstance(scorer, NotGiven):
+        task.scorer = resolve_scorer(scorer)
+    if not isinstance(metrics, NotGiven):
+        task.metrics = metrics
+    if not isinstance(config, NotGiven):
+        task.config = config
+    if not isinstance(sandbox, NotGiven):
+        task.sandbox = resolve_sandbox_environment(sandbox)
+    if not isinstance(approval, NotGiven):
+        task.approval = resolve_approval(approval)
+    if not isinstance(epochs, NotGiven):
+        epochs = resolve_epochs(epochs)
+        task.epochs = epochs.epochs if epochs else None
+        task.epochs_reducer = epochs.reducer if epochs else None
+    if not isinstance(fail_on_error, NotGiven):
+        task.fail_on_error = fail_on_error
+    if not isinstance(message_limit, NotGiven):
+        task.message_limit = message_limit
+    if not isinstance(token_limit, NotGiven):
+        task.token_limit = token_limit
+    if not isinstance(time_limit, NotGiven):
+        task.time_limit = time_limit
+    if not isinstance(version, NotGiven):
+        task.version = version
+    if not isinstance(name, NotGiven):
+        task._name = name
+    if not isinstance(metadata, NotGiven):
+        task.metadata = metadata
+
+    # return modified task
+    return task
+
+
 class TaskInfo(BaseModel):
     """Task information (file, name, and attributes)."""
 
@@ -225,3 +307,36 @@ classes, and task instances (a single task or list of tasks
 can be specified). None is a request to read a task out
 of the current working directory.
 """
+
+
+def resolve_approval(
+    approval: str | list[ApprovalPolicy] | None,
+) -> list[ApprovalPolicy] | None:
+    return (
+        approval_policies_from_config(approval)
+        if isinstance(approval, str)
+        else approval
+    )
+
+
+def resolve_epochs(epochs: int | Epochs | None) -> Epochs | None:
+    if isinstance(epochs, int):
+        epochs = Epochs(epochs)
+    if epochs is not None and epochs.epochs < 1:
+        raise ValueError("epochs must be a positive integer.")
+    return epochs
+
+
+def resolve_dataset(dataset: Dataset | Sequence[Sample] | None) -> Dataset:
+    dataset = dataset or [Sample(input="prompt")]
+    return dataset if isinstance(dataset, Dataset) else MemoryDataset(list(dataset))
+
+
+def resolve_solver(solver: Solver | list[Solver]) -> Solver:
+    return chain(solver) if isinstance(solver, list) else solver
+
+
+def resolve_scorer(scorer: Scorer | list[Scorer] | None) -> list[Scorer] | None:
+    return (
+        scorer if isinstance(scorer, list) else [scorer] if scorer is not None else None
+    )
inspect_ai/_util/constants.py

@@ -36,3 +36,5 @@ SCORED_SUFFIX = "-scored"
 SAMPLE_SUBTASK = "sample"
 CONSOLE_DISPLAY_WIDTH = 120
 BASE_64_DATA_REMOVED = "<base64-data-removed>"
+SANDBOX_SETUP_TIMEOUT = 300
+NO_CONTENT = "(no content)"