inspect-ai 0.3.57__py3-none-any.whl → 0.3.59__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/__init__.py +2 -1
- inspect_ai/_cli/common.py +7 -3
- inspect_ai/_cli/eval.py +17 -2
- inspect_ai/_cli/trace.py +21 -2
- inspect_ai/_display/core/active.py +4 -3
- inspect_ai/_display/core/config.py +3 -3
- inspect_ai/_display/core/panel.py +7 -3
- inspect_ai/_display/plain/__init__.py +0 -0
- inspect_ai/_display/plain/display.py +203 -0
- inspect_ai/_display/rich/display.py +4 -9
- inspect_ai/_display/textual/app.py +4 -1
- inspect_ai/_display/textual/widgets/port_mappings.py +110 -0
- inspect_ai/_display/textual/widgets/samples.py +119 -16
- inspect_ai/_display/textual/widgets/sandbox.py +37 -0
- inspect_ai/_eval/eval.py +32 -20
- inspect_ai/_eval/evalset.py +7 -5
- inspect_ai/_eval/score.py +1 -0
- inspect_ai/_eval/task/__init__.py +2 -2
- inspect_ai/_eval/task/images.py +40 -25
- inspect_ai/_eval/task/results.py +50 -22
- inspect_ai/_eval/task/run.py +180 -124
- inspect_ai/_eval/task/sandbox.py +10 -5
- inspect_ai/_eval/task/task.py +140 -25
- inspect_ai/_util/constants.py +2 -0
- inspect_ai/_util/content.py +23 -1
- inspect_ai/_util/images.py +20 -17
- inspect_ai/_util/kvstore.py +73 -0
- inspect_ai/_util/notgiven.py +18 -0
- inspect_ai/_util/port_names.py +61 -0
- inspect_ai/_util/text.py +23 -0
- inspect_ai/_util/thread.py +5 -0
- inspect_ai/_view/www/App.css +31 -1
- inspect_ai/_view/www/dist/assets/index.css +31 -1
- inspect_ai/_view/www/dist/assets/index.js +25375 -1846
- inspect_ai/_view/www/log-schema.json +129 -15
- inspect_ai/_view/www/package.json +2 -0
- inspect_ai/_view/www/src/App.mjs +8 -10
- inspect_ai/_view/www/src/Types.mjs +0 -1
- inspect_ai/_view/www/src/components/ChatView.mjs +133 -43
- inspect_ai/_view/www/src/components/ExpandablePanel.mjs +0 -4
- inspect_ai/_view/www/src/components/LargeModal.mjs +19 -20
- inspect_ai/_view/www/src/components/MessageBand.mjs +2 -2
- inspect_ai/_view/www/src/components/MessageContent.mjs +43 -1
- inspect_ai/_view/www/src/components/TabSet.mjs +3 -1
- inspect_ai/_view/www/src/components/VirtualList.mjs +266 -84
- inspect_ai/_view/www/src/index.js +75 -2
- inspect_ai/_view/www/src/navbar/Navbar.mjs +3 -0
- inspect_ai/_view/www/src/navbar/SecondaryBar.mjs +18 -9
- inspect_ai/_view/www/src/samples/SampleDialog.mjs +5 -1
- inspect_ai/_view/www/src/samples/SampleDisplay.mjs +23 -15
- inspect_ai/_view/www/src/samples/SampleList.mjs +18 -48
- inspect_ai/_view/www/src/samples/SampleTranscript.mjs +8 -3
- inspect_ai/_view/www/src/samples/SamplesDescriptor.mjs +29 -13
- inspect_ai/_view/www/src/samples/SamplesTab.mjs +4 -1
- inspect_ai/_view/www/src/samples/SamplesTools.mjs +8 -8
- inspect_ai/_view/www/src/samples/tools/SampleFilter.mjs +712 -89
- inspect_ai/_view/www/src/samples/tools/filters.mjs +260 -87
- inspect_ai/_view/www/src/samples/transcript/ErrorEventView.mjs +24 -2
- inspect_ai/_view/www/src/samples/transcript/EventPanel.mjs +29 -24
- inspect_ai/_view/www/src/samples/transcript/EventRow.mjs +1 -1
- inspect_ai/_view/www/src/samples/transcript/InfoEventView.mjs +24 -2
- inspect_ai/_view/www/src/samples/transcript/InputEventView.mjs +24 -2
- inspect_ai/_view/www/src/samples/transcript/ModelEventView.mjs +31 -10
- inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.mjs +24 -2
- inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.mjs +23 -2
- inspect_ai/_view/www/src/samples/transcript/ScoreEventView.mjs +24 -2
- inspect_ai/_view/www/src/samples/transcript/StepEventView.mjs +33 -3
- inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.mjs +25 -2
- inspect_ai/_view/www/src/samples/transcript/ToolEventView.mjs +25 -2
- inspect_ai/_view/www/src/samples/transcript/TranscriptView.mjs +193 -11
- inspect_ai/_view/www/src/samples/transcript/Types.mjs +10 -0
- inspect_ai/_view/www/src/samples/transcript/state/StateEventView.mjs +26 -2
- inspect_ai/_view/www/src/types/log.d.ts +62 -27
- inspect_ai/_view/www/src/utils/Format.mjs +10 -3
- inspect_ai/_view/www/src/utils/Json.mjs +12 -6
- inspect_ai/_view/www/src/workspace/WorkSpace.mjs +10 -4
- inspect_ai/_view/www/vite.config.js +7 -0
- inspect_ai/_view/www/yarn.lock +116 -0
- inspect_ai/approval/_human/__init__.py +0 -0
- inspect_ai/approval/_human/util.py +2 -2
- inspect_ai/approval/_policy.py +12 -6
- inspect_ai/dataset/_sources/csv.py +2 -1
- inspect_ai/dataset/_sources/json.py +2 -1
- inspect_ai/dataset/_sources/util.py +15 -7
- inspect_ai/log/_condense.py +11 -1
- inspect_ai/log/_log.py +3 -6
- inspect_ai/log/_recorders/eval.py +19 -8
- inspect_ai/log/_samples.py +26 -5
- inspect_ai/log/_transcript.py +32 -2
- inspect_ai/model/__init__.py +10 -2
- inspect_ai/model/_call_tools.py +59 -12
- inspect_ai/model/_chat_message.py +2 -4
- inspect_ai/model/_conversation.py +61 -0
- inspect_ai/model/_generate_config.py +10 -4
- inspect_ai/model/_model.py +117 -18
- inspect_ai/model/_model_output.py +7 -2
- inspect_ai/model/_providers/anthropic.py +109 -51
- inspect_ai/model/_providers/azureai.py +26 -24
- inspect_ai/model/_providers/bedrock.py +43 -44
- inspect_ai/model/_providers/google.py +121 -58
- inspect_ai/model/_providers/groq.py +7 -5
- inspect_ai/model/_providers/hf.py +11 -6
- inspect_ai/model/_providers/mistral.py +17 -20
- inspect_ai/model/_providers/openai.py +32 -21
- inspect_ai/model/_providers/openai_o1.py +9 -8
- inspect_ai/model/_providers/providers.py +1 -1
- inspect_ai/model/_providers/together.py +8 -8
- inspect_ai/model/_providers/vertex.py +18 -8
- inspect_ai/scorer/__init__.py +13 -2
- inspect_ai/scorer/_metrics/__init__.py +2 -2
- inspect_ai/scorer/_metrics/std.py +3 -3
- inspect_ai/scorer/_reducer/reducer.py +1 -1
- inspect_ai/scorer/_scorer.py +2 -2
- inspect_ai/solver/__init__.py +2 -5
- inspect_ai/solver/_prompt.py +35 -5
- inspect_ai/solver/_task_state.py +80 -38
- inspect_ai/tool/__init__.py +11 -1
- inspect_ai/tool/_tool.py +21 -3
- inspect_ai/tool/_tool_call.py +10 -0
- inspect_ai/tool/_tool_def.py +16 -5
- inspect_ai/tool/_tool_with.py +21 -4
- inspect_ai/tool/beta/__init__.py +5 -0
- inspect_ai/tool/beta/_computer/__init__.py +3 -0
- inspect_ai/tool/beta/_computer/_common.py +133 -0
- inspect_ai/tool/beta/_computer/_computer.py +155 -0
- inspect_ai/tool/beta/_computer/_computer_split.py +198 -0
- inspect_ai/tool/beta/_computer/_resources/Dockerfile +100 -0
- inspect_ai/tool/beta/_computer/_resources/README.md +30 -0
- inspect_ai/tool/beta/_computer/_resources/entrypoint/entrypoint.sh +18 -0
- inspect_ai/tool/beta/_computer/_resources/entrypoint/novnc_startup.sh +20 -0
- inspect_ai/tool/beta/_computer/_resources/entrypoint/x11vnc_startup.sh +48 -0
- inspect_ai/tool/beta/_computer/_resources/entrypoint/xfce_startup.sh +13 -0
- inspect_ai/tool/beta/_computer/_resources/entrypoint/xvfb_startup.sh +48 -0
- inspect_ai/tool/beta/_computer/_resources/image_home_dir/Desktop/Firefox Web Browser.desktop +10 -0
- inspect_ai/tool/beta/_computer/_resources/image_home_dir/Desktop/Visual Studio Code.desktop +10 -0
- inspect_ai/tool/beta/_computer/_resources/image_home_dir/Desktop/XPaint.desktop +10 -0
- inspect_ai/tool/beta/_computer/_resources/tool/__init__.py +0 -0
- inspect_ai/tool/beta/_computer/_resources/tool/_logger.py +22 -0
- inspect_ai/tool/beta/_computer/_resources/tool/_run.py +42 -0
- inspect_ai/tool/beta/_computer/_resources/tool/_tool_result.py +33 -0
- inspect_ai/tool/beta/_computer/_resources/tool/_x11_client.py +262 -0
- inspect_ai/tool/beta/_computer/_resources/tool/computer_tool.py +85 -0
- inspect_ai/tool/beta/_computer/_resources/tool/requirements.txt +0 -0
- inspect_ai/util/__init__.py +2 -3
- inspect_ai/util/{_trace.py → _conversation.py} +3 -17
- inspect_ai/util/_display.py +14 -4
- inspect_ai/util/_limit.py +26 -0
- inspect_ai/util/_sandbox/context.py +12 -13
- inspect_ai/util/_sandbox/docker/compose.py +24 -11
- inspect_ai/util/_sandbox/docker/docker.py +84 -14
- inspect_ai/util/_sandbox/docker/internal.py +3 -1
- inspect_ai/util/_sandbox/environment.py +27 -1
- inspect_ai/util/_sandbox/local.py +1 -0
- {inspect_ai-0.3.57.dist-info → inspect_ai-0.3.59.dist-info}/METADATA +2 -2
- {inspect_ai-0.3.57.dist-info → inspect_ai-0.3.59.dist-info}/RECORD +159 -128
- inspect_ai/_view/www/src/samples/transcript/TranscriptState.mjs +0 -70
- inspect_ai/model/_trace.py +0 -48
- {inspect_ai-0.3.57.dist-info → inspect_ai-0.3.59.dist-info}/LICENSE +0 -0
- {inspect_ai-0.3.57.dist-info → inspect_ai-0.3.59.dist-info}/WHEEL +0 -0
- {inspect_ai-0.3.57.dist-info → inspect_ai-0.3.59.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.57.dist-info → inspect_ai-0.3.59.dist-info}/top_level.txt +0 -0
inspect_ai/_eval/task/run.py
CHANGED
(Removed lines whose text the diff viewer did not preserve are omitted from the reconstruction below; added lines are complete.)

```diff
@@ -4,6 +4,7 @@ import sys
 import time
 from copy import deepcopy
 from dataclasses import dataclass, field
+from datetime import datetime
 from logging import getLogger
 from pathlib import PurePath
 from typing import Callable, Literal
@@ -26,10 +27,7 @@ from inspect_ai._util.constants import (
 from inspect_ai._util.datetime import iso_now
 from inspect_ai._util.error import exception_message
 from inspect_ai._util.hooks import send_telemetry
-from inspect_ai._util.registry import (
-    is_registry_object,
-    registry_log_name,
-)
+from inspect_ai._util.registry import is_registry_object, registry_log_name
 from inspect_ai._util.timeouts import Timeout, timeout, timeout_at
 from inspect_ai._view.notify import view_notify_eval
 from inspect_ai.dataset import Dataset, Sample
@@ -44,7 +42,11 @@ from inspect_ai.log import (
 from inspect_ai.log._condense import condense_sample
 from inspect_ai.log._file import eval_log_json_str
 from inspect_ai.log._log import EvalSampleLimit, EvalSampleReductions, eval_error
-from inspect_ai.log._samples import active_sample
+from inspect_ai.log._samples import (
+    active_sample,
+    set_active_sample_message_limit,
+    set_active_sample_token_limit,
+)
 from inspect_ai.log._transcript import (
     ErrorEvent,
     SampleInitEvent,
@@ -71,6 +73,8 @@ from inspect_ai.solver._chain import Chain, unroll
 from inspect_ai.solver._fork import set_task_generate
 from inspect_ai.solver._solver import Solver
 from inspect_ai.solver._task_state import sample_state, set_sample_state, state_jsonable
+from inspect_ai.util._limit import SampleLimitExceededError
+from inspect_ai.util._sandbox.context import sandbox_connections
 from inspect_ai.util._sandbox.environment import SandboxEnvironmentSpec
 from inspect_ai.util._subtask import init_subtask
 
@@ -79,10 +83,10 @@ from ..task import Task
 from .error import SampleErrorHandler
 from .generate import task_generate
 from .images import (
-    sample_without_base64_images,
-    samples_with_base64_images,
-    state_without_base64_images,
-    states_with_base64_images,
+    sample_without_base64_content,
+    samples_with_base64_content,
+    state_without_base64_content,
+    states_with_base64_content,
 )
 from .log import TaskLogger, collect_eval_data, log_start
 from .results import eval_results
@@ -533,21 +537,18 @@ async def task_run_sample(
         else contextlib.nullcontext()
     )
 
-    # use timeout if provided
-    timeout_cm = (
-        timeout(time_limit) if time_limit is not None else contextlib.nullcontext()
-    )
-
     # helper to handle exceptions (will throw if we've exceeded the limit)
     def handle_error(ex: BaseException) -> EvalError:
         err = sample_error(ex)
+        py_logger.warning(
+            f"Sample error (id: {sample.id}, epoch: {state.epoch}): {exception_message(ex)})"
+        )
         transcript()._event(ErrorEvent(error=err))
         return err
 
     # solver loop
     async with (
         semaphore_cm,
-        sandboxenv_cm,
         active_sample(
             task=task_name,
             model=str(state.model),
@@ -561,125 +562,179 @@ async def task_run_sample(
         ) as active,
     ):
         error: EvalError | None = None
+        results: dict[str, SampleScore] = {}
         try:
-            async with timeout_cm:
+            async with sandboxenv_cm:
+                try:
+                    # update active sample wth sandboxes now that we are initialised
+                    active.sandboxes = await sandbox_connections()
+
+                    # initialise timeout context manager
+                    timeout_cm = (
+                        timeout(time_limit)
+                        if time_limit is not None
+                        else contextlib.nullcontext()
+                    )
 
+                    # run sample w/ optional timeout
+                    async with timeout_cm:
+                        # mark started
+                        active.started = datetime.now().timestamp()
 
-                        "Unexpected timeout error reached top of sample stack. Are you handling TimeoutError when applying timeouts?"
-                    )
+                        # sample init event (remove file bodies as they have content or absolute paths)
+                        event_sample = sample.model_copy(
+                            update=dict(files={k: "" for k in sample.files.keys()})
+                            if sample.files
+                            else None
+                        )
+                        transcript()._event(
+                            SampleInitEvent(
+                                sample=event_sample, state=state_jsonable(state)
+                            )
+                        )
 
+                        # set progress for plan then run it
+                        state = await plan(state, generate)
 
+                except TimeoutError:
+                    if time_limit is not None:
+                        transcript()._event(
+                            SampleLimitEvent(
+                                type="time",
+                                message=f"Sample completed: exceeded time limit ({time_limit:,} seconds)",
+                                limit=time_limit,
+                            )
+                        )
+                    else:
+                        py_logger.warning(
+                            "Unexpected timeout error reached top of sample stack. Are you handling TimeoutError when applying timeouts?"
+                        )
 
-                    case "score":
-                        # continue to scoring (capture the most recent state)
-                        state = sample_state() or state
-                    case "error":
-                        # default error handling
-                        error = handle_error(ex)
+                    # capture most recent state for scoring
+                    state = sample_state() or state
 
+                except asyncio.CancelledError as ex:
+                    if active.interrupt_action:
+                        # record event
+                        transcript()._event(
+                            SampleLimitEvent(
+                                type="operator",
+                                message="Sample completed: interrupted by operator",
+                            )
+                        )
 
+                        # handle the action
+                        match active.interrupt_action:
+                            case "score":
+                                # continue to scoring (capture the most recent state)
+                                state = sample_state() or state
+                            case "error":
+                                # default error handling
+                                error = handle_error(ex)
+
+                    else:
+                        raise
+
+                except SampleLimitExceededError as ex:
+                    # sample limit event
+                    transcript()._event(
+                        SampleLimitEvent(
+                            type=ex.type,
+                            limit=ex.limit,
+                            message=f"Sample completed: {ex.message}",
+                        )
+                    )
 
+                    # capture most recent state for scoring
+                    state = sample_state() or state
+                    state.completed = True
+
+                except BaseException as ex:
+                    error = handle_error(ex)
+
+                # set timeout for scoring. if the original timeout was never hit
+                # then just create a new timeout_cm targeting the original
+                # timeout time. if the original timeout was hit we still want
+                # to provide an opportunity for scoring, but we don't necessarily
+                # want to wait the full timeout again (especially in the case where
+                # the cause of the timeout is a hung container and scoring requires
+                # interacting with the container). as a middle ground we use half
+                # of the original timeout value for scoring.
+                if isinstance(timeout_cm, Timeout):
+                    if not timeout_cm.expired():
+                        timeout_cm = timeout_at(timeout_cm.when())
+                    else:
+                        assert time_limit
+                        timeout_cm = timeout(time_limit / 2)
+
+                # turn off sample limits
+                set_active_sample_token_limit(None)
+                set_active_sample_message_limit(None)
+
+                # scoring
+                try:
+                    # timeout during scoring will result in an ordinary sample error
+                    async with timeout_cm:
+                        if error is None:
+                            for scorer in scorers or []:
+                                scorer_name = unique_scorer_name(
+                                    scorer, list(results.keys())
+                                )
+                                with transcript().step(name=scorer_name, type="scorer"):
+                                    score_result = (
+                                        await scorer(state, Target(sample.target))
+                                        if scorer
+                                        else None
+                                    )
+                                    if score_result is not None:
+                                        sample_score = SampleScore(
+                                            score=score_result,
+                                            sample_id=sample.id,
+                                        )
+                                        transcript()._event(
+                                            ScoreEvent(
+                                                score=score_result, target=sample.target
+                                            )
+                                        )
+                                        results[scorer_name] = sample_score
+
+                            # add scores returned by solvers
+                            if state.scores is not None:
+                                for name, score in state.scores.items():
+                                    results[name] = SampleScore(
+                                        score=score, sample_id=state.sample_id
+                                    )
+
+                            # propagate results into scores
+                            state.scores = {k: v.score for k, v in results.items()}
+
+                except asyncio.CancelledError:
+                    if active.interrupt_action:
+                        transcript()._event(
+                            SampleLimitEvent(
+                                type="operator",
+                                message="Unable to score sample due to operator interruption",
+                            )
+                        )
 
-                        await scorer(state, Target(sample.target))
-                        if scorer
-                        else None
+                    raise
+
+                except BaseException as ex:
+                    # note timeout
+                    if isinstance(ex, TimeoutError):
+                        transcript()._event(
+                            SampleLimitEvent(
+                                type="time",
+                                message=f"Unable to score sample due to exceeded time limit ({time_limit:,} seconds)",
+                                limit=time_limit,
                             )
-
-                    sample_score = SampleScore(
-                        score=score_result,
-                        sample_id=sample.id,
-                    )
-                    transcript()._event(
-                        ScoreEvent(score=score_result, target=sample.target)
-                    )
-                    results[scorer_name] = sample_score
-
-        except asyncio.CancelledError:
-            if active.interrupt_action:
-                transcript()._event(
-                    SampleLimitEvent(
-                        type="operator",
-                        message="Unable to score sample due to operator interruption",
-                    )
-                )
+                        )
 
+                    # handle error (this will throw if we've exceeded the limit)
+                    error = handle_error(ex)
 
+            # handle sandboxenv init errors
         except BaseException as ex:
-            # note timeout
-            if isinstance(ex, TimeoutError):
-                transcript()._event(
-                    SampleLimitEvent(
-                        type="time",
-                        message=f"Unable to score sample due to exceeded time limit ({time_limit:,} seconds)",
-                        limit=time_limit,
-                    )
-                )
-
-            # handle error (this will throw if we've exceeded the limit)
             error = handle_error(ex)
 
     # complete the sample
@@ -689,12 +744,12 @@ async def task_run_sample(
     if logger is not None:
         # if we are logging images then be sure to base64 images injected by solvers
        if log_images:
-            state = (await states_with_base64_images([state]))[0]
+            state = (await states_with_base64_content([state]))[0]
 
        # otherwise ensure there are no base64 images in sample or messages
        else:
-            sample = sample_without_base64_images(sample)
-            state = state_without_base64_images(state)
+            sample = sample_without_base64_content(sample)
+            state = state_without_base64_content(state)
 
        # log the sample
        await log_sample(
@@ -784,7 +839,7 @@ async def resolve_dataset(
 
     # if we are logging images then resolve sample images here
     if log_images:
-        samples = await samples_with_base64_images(samples)
+        samples = await samples_with_base64_content(samples)
 
     # prime the eval tasks (deep copy so they share no state w/ sample)
     sample_epochs: list[int] = []
@@ -797,6 +852,7 @@ async def resolve_dataset(
             epoch=epoch,
             model=model_name,
             input=sample.input,
+            target=Target(sample.target),
             choices=sample.choices,
             messages=sample_messages(sample),
             message_limit=message_limit,
```
inspect_ai/_eval/task/sandbox.py
CHANGED
```diff
@@ -4,11 +4,13 @@ import contextlib
 from random import random
 from typing import AsyncGenerator, Callable, NamedTuple, cast
 
+import httpx
+
 from inspect_ai._eval.task.task import Task
 from inspect_ai._eval.task.util import task_run_dir
 from inspect_ai._util.file import file, filesystem
 from inspect_ai._util.registry import registry_unqualified_name
-from inspect_ai._util.url import data_uri_to_base64, is_data_uri
+from inspect_ai._util.url import data_uri_to_base64, is_data_uri, is_http_url
 from inspect_ai.dataset import Sample
 from inspect_ai.util._concurrency import concurrency
 from inspect_ai.util._sandbox.context import (
@@ -65,12 +67,12 @@ async def sandboxenv_context(
     files: dict[str, bytes] = {}
     if sample.files:
         for path, contents in sample.files.items():
-            files[path] = read_sandboxenv_file(contents)
+            files[path] = await read_sandboxenv_file(contents)
 
     # read setup script from sample (add bash shebang if necessary)
     setup: bytes | None = None
     if sample.setup:
-        setup = read_sandboxenv_file(sample.setup)
+        setup = await read_sandboxenv_file(sample.setup)
         setup_str = setup.decode(encoding="utf-8")
         if not setup_str.strip().startswith("#!"):
             setup_str = f"#!/usr/bin/env bash\n\n{setup_str}"
@@ -108,13 +110,16 @@ async def sandboxenv_context(
     )
 
 
-def read_sandboxenv_file(contents: str) -> bytes:
+async def read_sandboxenv_file(contents: str) -> bytes:
     if is_data_uri(contents):
         contents_base64 = data_uri_to_base64(contents)
         file_bytes = base64.b64decode(contents_base64)
+    elif is_http_url(contents):
+        client = httpx.AsyncClient()
+        file_bytes = (await client.get(contents, follow_redirects=True)).content
     else:
         # try to read as a file (if it doesn't exist or has a path not cool w/
-        # the
+        # the filesystem then we fall back to contents)
         try:
             fs = filesystem(contents)
             if fs.exists(contents):
```
inspect_ai/_eval/task/task.py
CHANGED
```diff
@@ -1,3 +1,4 @@
+from copy import deepcopy
 from dataclasses import dataclass
 from logging import getLogger
 from typing import Any, Callable, Sequence, cast
@@ -6,6 +7,7 @@ from pydantic import BaseModel
 from typing_extensions import TypedDict, Unpack
 
 from inspect_ai._util.logger import warn_once
+from inspect_ai._util.notgiven import NOT_GIVEN, NotGiven
 from inspect_ai._util.registry import is_registry_object, registry_info
 from inspect_ai.approval._policy import ApprovalPolicy, approval_policies_from_config
 from inspect_ai.dataset import Dataset, MemoryDataset, Sample
@@ -115,35 +117,15 @@ class Task:
                 f"DEPRECATED: the '{arg}' parameter is deprecated (please use the '{newarg}' parameter instead)",
             )
 
-
-        if isinstance(epochs, int):
-            epochs = Epochs(epochs)
-        if epochs is not None and epochs.epochs < 1:
-            raise ValueError("epochs must be a positive integer.")
-
-        # resolve dataset (provide empty sample to bootstrap tasks w/o samples,
-        # which could occur for testing or for an interactive mode eval)
-        dataset = dataset or [Sample(input="prompt")]
-        self.dataset: Dataset = (
-            dataset if isinstance(dataset, Dataset) else MemoryDataset(list(dataset))
-        )
+        self.dataset = resolve_dataset(dataset)
         self.setup = setup
-        self.solver = chain(solver) if isinstance(solver, list) else solver
-        self.scorer = (
-            scorer
-            if isinstance(scorer, list)
-            else [scorer]
-            if scorer is not None
-            else None
-        )
+        self.solver = resolve_solver(solver)
+        self.scorer = resolve_scorer(scorer)
         self.metrics = metrics
         self.config = config
         self.sandbox = resolve_sandbox_environment(sandbox)
-        self.approval = (
-            approval_policies_from_config(approval)
-            if isinstance(approval, str)
-            else approval
-        )
+        self.approval = resolve_approval(approval)
+        epochs = resolve_epochs(epochs)
         self.epochs = epochs.epochs if epochs else None
         self.epochs_reducer = epochs.reducer if epochs else None
         self.fail_on_error = fail_on_error
@@ -171,6 +153,106 @@ class Task:
         return dict()
 
 
+def task_with(
+    task: Task,
+    *,
+    dataset: Dataset | Sequence[Sample] | None | NotGiven = NOT_GIVEN,
+    setup: Solver | list[Solver] | None | NotGiven = NOT_GIVEN,
+    solver: Solver | list[Solver] | NotGiven = NOT_GIVEN,
+    scorer: Scorer | list[Scorer] | None | NotGiven = NOT_GIVEN,
+    metrics: list[Metric] | dict[str, list[Metric]] | None | NotGiven = NOT_GIVEN,
+    config: GenerateConfig | NotGiven = NOT_GIVEN,
+    sandbox: SandboxEnvironmentType | None | NotGiven = NOT_GIVEN,
+    approval: str | list[ApprovalPolicy] | None | NotGiven = NOT_GIVEN,
+    epochs: int | Epochs | None | NotGiven = NOT_GIVEN,
+    fail_on_error: bool | float | None | NotGiven = NOT_GIVEN,
+    message_limit: int | None | NotGiven = NOT_GIVEN,
+    token_limit: int | None | NotGiven = NOT_GIVEN,
+    time_limit: int | None | NotGiven = NOT_GIVEN,
+    name: str | None | NotGiven = NOT_GIVEN,
+    version: int | NotGiven = NOT_GIVEN,
+    metadata: dict[str, Any] | None | NotGiven = NOT_GIVEN,
+) -> Task:
+    """Task adapted with alternate values for one or more options.
+
+    Args:
+        task (Task): Task to adapt (it is deep copied prior to mutating options)
+        dataset (Dataset | Sequence[Sample]): Dataset to evaluate
+        setup: (Solver | list[Solver] | None): Setup step (always run
+            even when the main `solver` is replaced).
+        solver: (Solver | list[Solver]): Solver or list of solvers.
+            Defaults to generate(), a normal call to the model.
+        scorer: (Scorer | list[Scorer] | None): Scorer used to evaluate model output.
+        metrics (list[Metric] | dict[str, list[Metric]] | None):
+            Alternative metrics (overrides the metrics provided by the specified scorer).
+        config (GenerateConfig): Model generation config.
+        sandbox (SandboxEnvironmentType | None): Sandbox environment type
+            (or optionally a str or tuple with a shorthand spec)
+        approval: (str | list[ApprovalPolicy] | None): Tool use approval policies.
+            Either a path to an approval policy config file or a list of approval policies.
+            Defaults to no approval policy.
+        epochs (int | Epochs | None): Epochs to repeat samples for and optional score
+            reducer function(s) used to combine sample scores (defaults to "mean")
+        fail_on_error (bool | float | None): `True` to fail on first sample error
+            (default); `False` to never fail on sample errors; Value between 0 and 1
+            to fail if a proportion of total samples fails. Value greater than 1 to fail
+            eval if a count of samples fails.
+        message_limit (int | None): Limit on total messages used for each sample.
+        token_limit (int | None): Limit on total tokens used for each sample.
+        time_limit (int | None): Limit on time (in seconds) for execution of each sample.
+        name: (str | None): Task name. If not specified is automatically
+            determined based on the name of the task directory (or "task")
+            if its anonymous task (e.g. created in a notebook and passed to
+            eval() directly)
+        version: (int): Version of task (to distinguish evolutions
+            of the task spec or breaking changes to it)
+        metadata: (dict[str, Any] | None): Additional metadata to associate with the task.
+
+    Returns:
+        Task: Task adapted with alternate options.
+    """
+    # deep copy the task
+    task = deepcopy(task)
+
+    if not isinstance(dataset, NotGiven):
+        task.dataset = resolve_dataset(dataset)
+    if not isinstance(setup, NotGiven):
+        task.setup = setup
+    if not isinstance(solver, NotGiven):
+        task.solver = resolve_solver(solver)
+    if not isinstance(scorer, NotGiven):
+        task.scorer = resolve_scorer(scorer)
+    if not isinstance(metrics, NotGiven):
+        task.metrics = metrics
+    if not isinstance(config, NotGiven):
+        task.config = config
+    if not isinstance(sandbox, NotGiven):
+        task.sandbox = resolve_sandbox_environment(sandbox)
+    if not isinstance(approval, NotGiven):
+        task.approval = resolve_approval(approval)
+    if not isinstance(epochs, NotGiven):
+        epochs = resolve_epochs(epochs)
+        task.epochs = epochs.epochs if epochs else None
+        task.epochs_reducer = epochs.reducer if epochs else None
+    if not isinstance(fail_on_error, NotGiven):
+        task.fail_on_error = fail_on_error
+    if not isinstance(message_limit, NotGiven):
+        task.message_limit = message_limit
+    if not isinstance(token_limit, NotGiven):
+        task.token_limit = token_limit
+    if not isinstance(time_limit, NotGiven):
+        task.time_limit = time_limit
+    if not isinstance(version, NotGiven):
+        task.version = version
+    if not isinstance(name, NotGiven):
+        task._name = name
+    if not isinstance(metadata, NotGiven):
+        task.metadata = metadata
+
+    # return modified task
+    return task
+
+
 class TaskInfo(BaseModel):
     """Task information (file, name, and attributes)."""
 
@@ -225,3 +307,36 @@ classes, and task instances (a single task or list of tasks
 can be specified). None is a request to read a task out
 of the current working directory.
 """
+
+
+def resolve_approval(
+    approval: str | list[ApprovalPolicy] | None,
+) -> list[ApprovalPolicy] | None:
+    return (
+        approval_policies_from_config(approval)
+        if isinstance(approval, str)
+        else approval
+    )
+
+
+def resolve_epochs(epochs: int | Epochs | None) -> Epochs | None:
+    if isinstance(epochs, int):
+        epochs = Epochs(epochs)
+    if epochs is not None and epochs.epochs < 1:
+        raise ValueError("epochs must be a positive integer.")
+    return epochs
+
+
+def resolve_dataset(dataset: Dataset | Sequence[Sample] | None) -> Dataset:
+    dataset = dataset or [Sample(input="prompt")]
+    return dataset if isinstance(dataset, Dataset) else MemoryDataset(list(dataset))
+
+
+def resolve_solver(solver: Solver | list[Solver]) -> Solver:
+    return chain(solver) if isinstance(solver, list) else solver
+
+
+def resolve_scorer(scorer: Scorer | list[Scorer] | None) -> list[Scorer] | None:
+    return (
+        scorer if isinstance(scorer, list) else [scorer] if scorer is not None else None
+    )
```