inspect-ai 0.3.104__py3-none-any.whl → 0.3.105__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/_eval/evalset.py +1 -1
- inspect_ai/_eval/task/run.py +64 -38
- inspect_ai/_view/server.py +17 -0
- inspect_ai/_view/www/dist/assets/index.css +33 -29
- inspect_ai/_view/www/dist/assets/index.js +559 -247
- inspect_ai/_view/www/src/app/samples/chat/ChatMessage.module.css +4 -0
- inspect_ai/_view/www/src/app/samples/chat/ChatMessage.tsx +17 -0
- inspect_ai/_view/www/src/app/samples/sample-tools/filters.ts +26 -0
- inspect_ai/_view/www/src/app/samples/sample-tools/sample-filter/SampleFilter.tsx +14 -3
- inspect_ai/_view/www/src/app/samples/sample-tools/sample-filter/completions.ts +359 -7
- inspect_ai/_view/www/src/app/samples/sample-tools/sample-filter/language.ts +6 -0
- inspect_ai/_view/www/src/app/samples/transcript/outline/OutlineRow.tsx +1 -1
- inspect_ai/_view/www/src/client/api/api-browser.ts +25 -0
- inspect_ai/_view/www/src/client/api/api-http.ts +3 -0
- inspect_ai/_view/www/src/client/api/api-vscode.ts +6 -0
- inspect_ai/_view/www/src/client/api/client-api.ts +3 -0
- inspect_ai/_view/www/src/client/api/jsonrpc.ts +1 -0
- inspect_ai/_view/www/src/client/api/types.ts +3 -0
- inspect_ai/_view/www/src/state/samplePolling.ts +17 -1
- inspect_ai/agent/_handoff.py +5 -2
- inspect_ai/agent/_react.py +5 -5
- inspect_ai/dataset/_dataset.py +1 -1
- inspect_ai/log/_samples.py +5 -0
- inspect_ai/model/_call_tools.py +4 -4
- inspect_ai/model/_providers/anthropic.py +23 -2
- inspect_ai/model/_providers/google.py +5 -1
- inspect_ai/util/__init__.py +8 -0
- inspect_ai/util/_background.py +64 -0
- inspect_ai/util/_limit.py +72 -5
- inspect_ai/util/_sandbox/__init__.py +2 -0
- inspect_ai/util/_sandbox/service.py +28 -7
- inspect_ai/util/_subprocess.py +51 -38
- {inspect_ai-0.3.104.dist-info → inspect_ai-0.3.105.dist-info}/METADATA +1 -1
- {inspect_ai-0.3.104.dist-info → inspect_ai-0.3.105.dist-info}/RECORD +38 -37
- {inspect_ai-0.3.104.dist-info → inspect_ai-0.3.105.dist-info}/WHEEL +0 -0
- {inspect_ai-0.3.104.dist-info → inspect_ai-0.3.105.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.104.dist-info → inspect_ai-0.3.105.dist-info}/licenses/LICENSE +0 -0
- {inspect_ai-0.3.104.dist-info → inspect_ai-0.3.105.dist-info}/top_level.txt +0 -0
inspect_ai/_eval/evalset.py
CHANGED
@@ -578,7 +578,7 @@ def task_identifier(task: ResolvedTask | EvalLog) -> str:
|
|
578
578
|
else:
|
579
579
|
task_file = task.eval.task_file or ""
|
580
580
|
task_name = task.eval.task
|
581
|
-
task_args = task.eval.
|
581
|
+
task_args = task.eval.task_args_passed
|
582
582
|
model = str(task.eval.model)
|
583
583
|
model_roles = task.eval.model_roles or {}
|
584
584
|
|
inspect_ai/_eval/task/run.py
CHANGED
@@ -10,6 +10,7 @@ from pathlib import PurePath
|
|
10
10
|
from typing import Callable, Literal
|
11
11
|
|
12
12
|
import anyio
|
13
|
+
from anyio.abc import TaskGroup
|
13
14
|
from typing_extensions import Unpack
|
14
15
|
|
15
16
|
from inspect_ai._display import (
|
@@ -306,37 +307,57 @@ async def task_run(options: TaskRunOptions) -> EvalLog:
|
|
306
307
|
task.metrics,
|
307
308
|
)
|
308
309
|
|
310
|
+
async def run_sample(
|
311
|
+
sample: Sample, state: TaskState
|
312
|
+
) -> dict[str, SampleScore] | None:
|
313
|
+
result: dict[str, SampleScore] | None = None
|
314
|
+
|
315
|
+
async def run(tg: TaskGroup) -> None:
|
316
|
+
try:
|
317
|
+
nonlocal result
|
318
|
+
result = await task_run_sample(
|
319
|
+
tg=tg,
|
320
|
+
task_name=task.name,
|
321
|
+
log_location=profile.log_location,
|
322
|
+
sample=sample,
|
323
|
+
state=state,
|
324
|
+
sandbox=sandbox,
|
325
|
+
max_sandboxes=config.max_sandboxes,
|
326
|
+
sandbox_cleanup=sandbox_cleanup,
|
327
|
+
plan=plan,
|
328
|
+
scorers=scorers,
|
329
|
+
generate=generate,
|
330
|
+
progress=progress,
|
331
|
+
logger=logger if log_samples else None,
|
332
|
+
log_images=log_images,
|
333
|
+
sample_source=sample_source,
|
334
|
+
sample_error=sample_error_handler,
|
335
|
+
sample_complete=sample_complete,
|
336
|
+
fails_on_error=(
|
337
|
+
config.fail_on_error is None
|
338
|
+
or config.fail_on_error is True
|
339
|
+
),
|
340
|
+
retry_on_error=config.retry_on_error or 0,
|
341
|
+
error_retries=[],
|
342
|
+
time_limit=config.time_limit,
|
343
|
+
working_limit=config.working_limit,
|
344
|
+
semaphore=sample_semaphore,
|
345
|
+
)
|
346
|
+
finally:
|
347
|
+
tg.cancel_scope.cancel()
|
348
|
+
|
349
|
+
async with anyio.create_task_group() as tg:
|
350
|
+
tg.start_soon(run, tg)
|
351
|
+
|
352
|
+
return result
|
353
|
+
|
309
354
|
sample_results = await tg_collect(
|
310
355
|
[
|
311
|
-
functools.partial(
|
312
|
-
|
313
|
-
|
314
|
-
|
315
|
-
sample=sample,
|
316
|
-
state=state,
|
317
|
-
sandbox=sandbox,
|
318
|
-
max_sandboxes=config.max_sandboxes,
|
319
|
-
sandbox_cleanup=sandbox_cleanup,
|
320
|
-
plan=plan,
|
321
|
-
scorers=scorers,
|
322
|
-
generate=generate,
|
323
|
-
progress=progress,
|
324
|
-
logger=logger if log_samples else None,
|
325
|
-
log_images=log_images,
|
326
|
-
sample_source=sample_source,
|
327
|
-
sample_error=sample_error_handler,
|
328
|
-
sample_complete=sample_complete,
|
329
|
-
fails_on_error=(
|
330
|
-
config.fail_on_error is None
|
331
|
-
or config.fail_on_error is True
|
332
|
-
),
|
333
|
-
retry_on_error=config.retry_on_error or 0,
|
334
|
-
error_retries=[],
|
335
|
-
time_limit=config.time_limit,
|
336
|
-
working_limit=config.working_limit,
|
337
|
-
semaphore=sample_semaphore,
|
356
|
+
functools.partial(run_sample, sample, state)
|
357
|
+
for (sample, state) in zip(
|
358
|
+
samples,
|
359
|
+
states,
|
338
360
|
)
|
339
|
-
for (sample, state) in zip(samples, states)
|
340
361
|
]
|
341
362
|
)
|
342
363
|
|
@@ -492,6 +513,7 @@ def update_metrics_display_fn(
|
|
492
513
|
|
493
514
|
async def task_run_sample(
|
494
515
|
*,
|
516
|
+
tg: TaskGroup,
|
495
517
|
task_name: str,
|
496
518
|
log_location: str,
|
497
519
|
sample: Sample,
|
@@ -611,12 +633,14 @@ async def task_run_sample(
|
|
611
633
|
working_limit=working_limit,
|
612
634
|
fails_on_error=fails_on_error or (retry_on_error > 0),
|
613
635
|
transcript=sample_transcript,
|
636
|
+
tg=tg,
|
614
637
|
) as active,
|
615
638
|
):
|
616
639
|
start_time: float | None = None
|
617
640
|
error: EvalError | None = None
|
618
641
|
raise_error: BaseException | None = None
|
619
642
|
results: dict[str, SampleScore] = {}
|
643
|
+
limit: EvalSampleLimit | None = None
|
620
644
|
try:
|
621
645
|
# begin init
|
622
646
|
init_span = span("init", type="init")
|
@@ -704,9 +728,17 @@ async def task_run_sample(
|
|
704
728
|
# handle the cancel exception
|
705
729
|
raise
|
706
730
|
|
707
|
-
except
|
731
|
+
except LimitExceededError as ex:
|
732
|
+
# capture most recent state for scoring
|
733
|
+
state = sample_state() or state
|
734
|
+
limit = EvalSampleLimit(
|
735
|
+
type=ex.type, limit=ex.limit if ex.limit is not None else -1
|
736
|
+
)
|
737
|
+
|
738
|
+
except TerminateSampleError:
|
708
739
|
# capture most recent state for scoring
|
709
740
|
state = sample_state() or state
|
741
|
+
limit = EvalSampleLimit(type="operator", limit=1)
|
710
742
|
|
711
743
|
except BaseException as ex:
|
712
744
|
error, raise_error = handle_error(ex)
|
@@ -815,6 +847,7 @@ async def task_run_sample(
|
|
815
847
|
state=state,
|
816
848
|
scores=results,
|
817
849
|
error=error,
|
850
|
+
limit=limit,
|
818
851
|
error_retries=error_retries,
|
819
852
|
log_images=log_images,
|
820
853
|
)
|
@@ -854,6 +887,7 @@ async def task_run_sample(
|
|
854
887
|
time_limit=time_limit,
|
855
888
|
working_limit=working_limit,
|
856
889
|
semaphore=semaphore,
|
890
|
+
tg=tg,
|
857
891
|
)
|
858
892
|
|
859
893
|
# no error
|
@@ -879,6 +913,7 @@ async def log_sample(
|
|
879
913
|
state: TaskState,
|
880
914
|
scores: dict[str, SampleScore],
|
881
915
|
error: EvalError | None,
|
916
|
+
limit: EvalSampleLimit | None,
|
882
917
|
error_retries: list[EvalError],
|
883
918
|
log_images: bool,
|
884
919
|
) -> None:
|
@@ -894,15 +929,6 @@ async def log_sample(
|
|
894
929
|
# compute total time if we can
|
895
930
|
total_time = time.monotonic() - start_time if start_time is not None else None
|
896
931
|
|
897
|
-
# if a limit was hit, note that in the Eval Sample
|
898
|
-
limit = None
|
899
|
-
for e in transcript().events:
|
900
|
-
if e.event == "sample_limit":
|
901
|
-
limit = EvalSampleLimit(
|
902
|
-
type=e.type, limit=e.limit if e.limit is not None else -1
|
903
|
-
)
|
904
|
-
break
|
905
|
-
|
906
932
|
eval_sample = EvalSample(
|
907
933
|
id=id,
|
908
934
|
epoch=state.epoch,
|
inspect_ai/_view/server.py
CHANGED
@@ -155,6 +155,23 @@ def view_server(
|
|
155
155
|
body=samples.model_dump_json(), headers={"ETag": samples.etag}
|
156
156
|
)
|
157
157
|
|
158
|
+
@routes.get("/api/log-message")
|
159
|
+
async def api_log_message(request: web.Request) -> web.Response:
|
160
|
+
# log file requested
|
161
|
+
file = query_param_required("log_file", request, str)
|
162
|
+
|
163
|
+
file = urllib.parse.unquote(file)
|
164
|
+
validate_log_file_request(file)
|
165
|
+
|
166
|
+
# message to log
|
167
|
+
message = query_param_required("message", request, str)
|
168
|
+
|
169
|
+
# log the message
|
170
|
+
logger.warning(f"[CLIENT MESSAGE] ({file}): {message}")
|
171
|
+
|
172
|
+
# respond
|
173
|
+
return web.Response(status=204)
|
174
|
+
|
158
175
|
@routes.get("/api/pending-sample-data")
|
159
176
|
async def api_sample_events(request: web.Request) -> web.Response:
|
160
177
|
# log file requested
|
@@ -15655,18 +15655,40 @@ pre[class*="language-"] {
|
|
15655
15655
|
background-color: var(--bs-light-border-subtle);
|
15656
15656
|
margin-top: -1px;
|
15657
15657
|
}
|
15658
|
-
.
|
15658
|
+
._keyPairContainer_1ltuo_1 {
|
15659
|
+
display: grid;
|
15660
|
+
grid-template-columns: max-content auto;
|
15661
|
+
column-gap: 0.5em;
|
15662
|
+
padding-top: 4px;
|
15663
|
+
padding-bottom: 4px;
|
15664
|
+
border-bottom: solid 1px var(--bs-border-color);
|
15665
|
+
}
|
15666
|
+
|
15667
|
+
._key_1ltuo_1 {
|
15668
|
+
display: grid;
|
15669
|
+
grid-template-columns: 1em auto;
|
15670
|
+
cursor: pointer;
|
15671
|
+
}
|
15672
|
+
|
15673
|
+
._pre_1ltuo_16 {
|
15674
|
+
margin-bottom: 0;
|
15675
|
+
}
|
15676
|
+
|
15677
|
+
._treeIcon_1ltuo_20 {
|
15678
|
+
margin-top: -3px;
|
15679
|
+
}
|
15680
|
+
._message_1ivu3_1 {
|
15659
15681
|
font-weight: 300;
|
15660
15682
|
margin-left: 0;
|
15661
15683
|
margin-right: 0;
|
15662
15684
|
white-space: normal;
|
15663
15685
|
}
|
15664
15686
|
|
15665
|
-
.
|
15687
|
+
._systemRole_1ivu3_8 {
|
15666
15688
|
opacity: 0.7;
|
15667
15689
|
}
|
15668
15690
|
|
15669
|
-
.
|
15691
|
+
._messageGrid_1ivu3_12 {
|
15670
15692
|
display: grid;
|
15671
15693
|
grid-template-columns: max-content max-content max-content;
|
15672
15694
|
column-gap: 0.3em;
|
@@ -15674,24 +15696,28 @@ pre[class*="language-"] {
|
|
15674
15696
|
margin-bottom: 0.3em;
|
15675
15697
|
}
|
15676
15698
|
|
15677
|
-
.
|
15699
|
+
._messageContents_1ivu3_20 {
|
15678
15700
|
margin-left: 0;
|
15679
15701
|
padding-bottom: 0;
|
15680
15702
|
}
|
15681
15703
|
|
15682
|
-
.
|
15704
|
+
._messageContents_1ivu3_20._indented_1ivu3_25 {
|
15683
15705
|
margin-left: 0rem;
|
15684
15706
|
}
|
15685
15707
|
|
15686
|
-
.
|
15708
|
+
._copyLink_1ivu3_29 {
|
15687
15709
|
opacity: 0;
|
15688
15710
|
padding-left: 0;
|
15689
15711
|
padding-right: 2em;
|
15690
15712
|
}
|
15691
15713
|
|
15692
|
-
.
|
15714
|
+
._copyLink_1ivu3_29:hover {
|
15693
15715
|
opacity: 1;
|
15694
15716
|
}
|
15717
|
+
|
15718
|
+
._metadataLabel_1ivu3_39 {
|
15719
|
+
padding-top: 1em;
|
15720
|
+
}
|
15695
15721
|
._webSearch_1376z_1 {
|
15696
15722
|
display: grid;
|
15697
15723
|
grid-template-columns: max-content 1fr;
|
@@ -15702,28 +15728,6 @@ pre[class*="language-"] {
|
|
15702
15728
|
._query_1376z_8 {
|
15703
15729
|
font-family: var(--bs-font-monospace);
|
15704
15730
|
}
|
15705
|
-
._keyPairContainer_1ltuo_1 {
|
15706
|
-
display: grid;
|
15707
|
-
grid-template-columns: max-content auto;
|
15708
|
-
column-gap: 0.5em;
|
15709
|
-
padding-top: 4px;
|
15710
|
-
padding-bottom: 4px;
|
15711
|
-
border-bottom: solid 1px var(--bs-border-color);
|
15712
|
-
}
|
15713
|
-
|
15714
|
-
._key_1ltuo_1 {
|
15715
|
-
display: grid;
|
15716
|
-
grid-template-columns: 1em auto;
|
15717
|
-
cursor: pointer;
|
15718
|
-
}
|
15719
|
-
|
15720
|
-
._pre_1ltuo_16 {
|
15721
|
-
margin-bottom: 0;
|
15722
|
-
}
|
15723
|
-
|
15724
|
-
._treeIcon_1ltuo_20 {
|
15725
|
-
margin-top: -3px;
|
15726
|
-
}
|
15727
15731
|
._contentData_1sd1z_1 {
|
15728
15732
|
border: solid var(--bs-light-border-subtle) 1px;
|
15729
15733
|
padding: 0.5em;
|