inspect-ai 0.3.68__py3-none-any.whl → 0.3.70__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/_cli/eval.py +13 -1
- inspect_ai/_display/plain/display.py +9 -11
- inspect_ai/_display/textual/app.py +5 -5
- inspect_ai/_display/textual/widgets/samples.py +47 -18
- inspect_ai/_display/textual/widgets/transcript.py +25 -12
- inspect_ai/_eval/eval.py +14 -2
- inspect_ai/_eval/evalset.py +6 -1
- inspect_ai/_eval/run.py +6 -0
- inspect_ai/_eval/task/run.py +44 -15
- inspect_ai/_eval/task/task.py +26 -3
- inspect_ai/_util/interrupt.py +15 -0
- inspect_ai/_util/logger.py +23 -0
- inspect_ai/_util/rich.py +7 -8
- inspect_ai/_util/text.py +301 -1
- inspect_ai/_util/transcript.py +10 -2
- inspect_ai/_util/working.py +46 -0
- inspect_ai/_view/www/dist/assets/index.css +56 -12
- inspect_ai/_view/www/dist/assets/index.js +905 -751
- inspect_ai/_view/www/log-schema.json +337 -2
- inspect_ai/_view/www/node_modules/flatted/python/flatted.py +149 -0
- inspect_ai/_view/www/node_modules/flatted/python/test.py +63 -0
- inspect_ai/_view/www/src/appearance/icons.ts +3 -1
- inspect_ai/_view/www/src/metadata/RenderedContent.tsx +0 -1
- inspect_ai/_view/www/src/samples/SampleDisplay.module.css +9 -1
- inspect_ai/_view/www/src/samples/SampleDisplay.tsx +28 -1
- inspect_ai/_view/www/src/samples/SampleSummaryView.module.css +4 -0
- inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +23 -2
- inspect_ai/_view/www/src/samples/descriptor/score/ObjectScoreDescriptor.tsx +1 -1
- inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.tsx +4 -0
- inspect_ai/_view/www/src/samples/transcript/SandboxEventView.module.css +32 -0
- inspect_ai/_view/www/src/samples/transcript/SandboxEventView.tsx +152 -0
- inspect_ai/_view/www/src/samples/transcript/StepEventView.tsx +9 -2
- inspect_ai/_view/www/src/samples/transcript/TranscriptView.tsx +19 -1
- inspect_ai/_view/www/src/samples/transcript/event/EventPanel.tsx +6 -3
- inspect_ai/_view/www/src/samples/transcript/types.ts +3 -1
- inspect_ai/_view/www/src/types/log.d.ts +188 -108
- inspect_ai/_view/www/src/utils/format.ts +7 -4
- inspect_ai/_view/www/src/workspace/WorkSpaceView.tsx +9 -6
- inspect_ai/log/__init__.py +2 -0
- inspect_ai/log/_condense.py +1 -0
- inspect_ai/log/_log.py +72 -12
- inspect_ai/log/_samples.py +5 -5
- inspect_ai/log/_transcript.py +31 -1
- inspect_ai/model/_call_tools.py +1 -1
- inspect_ai/model/_conversation.py +1 -1
- inspect_ai/model/_model.py +35 -16
- inspect_ai/model/_model_call.py +10 -3
- inspect_ai/model/_providers/anthropic.py +13 -2
- inspect_ai/model/_providers/bedrock.py +7 -0
- inspect_ai/model/_providers/cloudflare.py +20 -7
- inspect_ai/model/_providers/google.py +358 -302
- inspect_ai/model/_providers/groq.py +57 -23
- inspect_ai/model/_providers/hf.py +6 -0
- inspect_ai/model/_providers/mistral.py +81 -52
- inspect_ai/model/_providers/openai.py +9 -0
- inspect_ai/model/_providers/providers.py +6 -6
- inspect_ai/model/_providers/util/tracker.py +92 -0
- inspect_ai/model/_providers/vllm.py +13 -5
- inspect_ai/solver/_basic_agent.py +1 -3
- inspect_ai/solver/_bridge/patch.py +0 -2
- inspect_ai/solver/_limit.py +4 -4
- inspect_ai/solver/_plan.py +3 -3
- inspect_ai/solver/_solver.py +3 -0
- inspect_ai/solver/_task_state.py +10 -1
- inspect_ai/tool/_tools/_web_search.py +3 -3
- inspect_ai/util/_concurrency.py +14 -8
- inspect_ai/util/_sandbox/context.py +15 -0
- inspect_ai/util/_sandbox/docker/cleanup.py +8 -3
- inspect_ai/util/_sandbox/docker/compose.py +5 -9
- inspect_ai/util/_sandbox/docker/docker.py +20 -6
- inspect_ai/util/_sandbox/docker/util.py +10 -1
- inspect_ai/util/_sandbox/environment.py +32 -1
- inspect_ai/util/_sandbox/events.py +149 -0
- inspect_ai/util/_sandbox/local.py +3 -3
- inspect_ai/util/_sandbox/self_check.py +2 -1
- inspect_ai/util/_subprocess.py +4 -1
- {inspect_ai-0.3.68.dist-info → inspect_ai-0.3.70.dist-info}/METADATA +5 -5
- {inspect_ai-0.3.68.dist-info → inspect_ai-0.3.70.dist-info}/RECORD +82 -74
- {inspect_ai-0.3.68.dist-info → inspect_ai-0.3.70.dist-info}/LICENSE +0 -0
- {inspect_ai-0.3.68.dist-info → inspect_ai-0.3.70.dist-info}/WHEEL +0 -0
- {inspect_ai-0.3.68.dist-info → inspect_ai-0.3.70.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.68.dist-info → inspect_ai-0.3.70.dist-info}/top_level.txt +0 -0
@@ -39,6 +39,7 @@ export type FailOnError = boolean | number | null;
|
|
39
39
|
export type MessageLimit = number | null;
|
40
40
|
export type TokenLimit = number | null;
|
41
41
|
export type TimeLimit = number | null;
|
42
|
+
export type WorkingLimit = number | null;
|
42
43
|
export type MaxSamples = number | null;
|
43
44
|
export type MaxTasks = number | null;
|
44
45
|
export type MaxSubprocesses = number | null;
|
@@ -52,7 +53,30 @@ export type Type = "git";
|
|
52
53
|
export type Origin = string;
|
53
54
|
export type Commit = string;
|
54
55
|
export type Metadata = {} | null;
|
56
|
+
export type Scorers = EvalScorer[] | null;
|
55
57
|
export type Name2 = string;
|
58
|
+
export type Options = {} | null;
|
59
|
+
export type Metrics =
|
60
|
+
| (
|
61
|
+
| EvalMetricDefinition
|
62
|
+
| {
|
63
|
+
[k: string]: EvalMetricDefinition[];
|
64
|
+
}
|
65
|
+
)[]
|
66
|
+
| {
|
67
|
+
[k: string]: EvalMetricDefinition[];
|
68
|
+
}
|
69
|
+
| null;
|
70
|
+
export type Name3 = string;
|
71
|
+
export type Options1 = {} | null;
|
72
|
+
export type Metadata1 = {} | null;
|
73
|
+
export type Metrics1 =
|
74
|
+
| EvalMetricDefinition[]
|
75
|
+
| {
|
76
|
+
[k: string]: EvalMetricDefinition[];
|
77
|
+
}
|
78
|
+
| null;
|
79
|
+
export type Name4 = string;
|
56
80
|
export type Solver1 = string;
|
57
81
|
export type Steps = EvalPlanStep[];
|
58
82
|
export type MaxRetries = number | null;
|
@@ -82,15 +106,15 @@ export type ReasoningEffort = ("low" | "medium" | "high") | null;
|
|
82
106
|
export type ReasoningHistory = boolean | null;
|
83
107
|
export type TotalSamples = number;
|
84
108
|
export type CompletedSamples = number;
|
85
|
-
export type
|
109
|
+
export type Name5 = string;
|
86
110
|
export type Scorer = string;
|
87
111
|
export type Reducer = string | null;
|
88
|
-
export type
|
112
|
+
export type Name6 = string;
|
89
113
|
export type Value = number;
|
90
|
-
export type Metadata1 = {} | null;
|
91
114
|
export type Metadata2 = {} | null;
|
92
|
-
export type Scores = EvalScore[];
|
93
115
|
export type Metadata3 = {} | null;
|
116
|
+
export type Scores = EvalScore[];
|
117
|
+
export type Metadata4 = {} | null;
|
94
118
|
export type StartedAt = string;
|
95
119
|
export type CompletedAt = string;
|
96
120
|
export type InputTokens = number;
|
@@ -194,7 +218,7 @@ export type Bytes1 = number[] | null;
|
|
194
218
|
export type Content5 = Logprob[];
|
195
219
|
export type Choices1 = ChatCompletionChoice[];
|
196
220
|
export type Time = number | null;
|
197
|
-
export type
|
221
|
+
export type Metadata5 = {} | null;
|
198
222
|
export type Error = string | null;
|
199
223
|
export type Scores1 = {
|
200
224
|
[k: string]: Score;
|
@@ -209,7 +233,7 @@ export type Value1 =
|
|
209
233
|
};
|
210
234
|
export type Answer = string | null;
|
211
235
|
export type Explanation = string | null;
|
212
|
-
export type
|
236
|
+
export type Metadata6 = {} | null;
|
213
237
|
export type Timestamp = string;
|
214
238
|
export type Pending = boolean | null;
|
215
239
|
export type Event = "sample_init";
|
@@ -224,7 +248,7 @@ export type Input1 =
|
|
224
248
|
export type Choices2 = string[] | null;
|
225
249
|
export type Target1 = string | string[];
|
226
250
|
export type Id2 = number | string | null;
|
227
|
-
export type
|
251
|
+
export type Metadata8 = {} | null;
|
228
252
|
export type Files1 = {
|
229
253
|
[k: string]: string;
|
230
254
|
} | null;
|
@@ -233,31 +257,49 @@ export type JsonValue = unknown;
|
|
233
257
|
export type Timestamp1 = string;
|
234
258
|
export type Pending1 = boolean | null;
|
235
259
|
export type Event1 = "sample_limit";
|
236
|
-
export type Type7 =
|
260
|
+
export type Type7 =
|
261
|
+
| "message"
|
262
|
+
| "time"
|
263
|
+
| "working"
|
264
|
+
| "token"
|
265
|
+
| "operator"
|
266
|
+
| "custom";
|
237
267
|
export type Message2 = string;
|
238
268
|
export type Limit1 = number | null;
|
239
269
|
export type Timestamp2 = string;
|
240
270
|
export type Pending2 = boolean | null;
|
241
|
-
export type Event2 = "
|
271
|
+
export type Event2 = "sandbox";
|
272
|
+
export type Action = "exec" | "read_file" | "write_file";
|
273
|
+
export type Cmd = string | null;
|
274
|
+
export type Options2 = {
|
275
|
+
[k: string]: JsonValue;
|
276
|
+
} | null;
|
277
|
+
export type File = string | null;
|
278
|
+
export type Input2 = string | null;
|
279
|
+
export type Result = number | null;
|
280
|
+
export type Output = string | null;
|
281
|
+
export type Timestamp3 = string;
|
282
|
+
export type Pending3 = boolean | null;
|
283
|
+
export type Event3 = "state";
|
242
284
|
export type Op = "remove" | "add" | "replace" | "move" | "test" | "copy";
|
243
285
|
export type Path = string;
|
244
286
|
export type From = string | null;
|
245
287
|
export type Changes = JsonChange[];
|
246
|
-
export type Timestamp3 = string;
|
247
|
-
export type Pending3 = boolean | null;
|
248
|
-
export type Event3 = "store";
|
249
|
-
export type Changes1 = JsonChange[];
|
250
288
|
export type Timestamp4 = string;
|
251
289
|
export type Pending4 = boolean | null;
|
252
|
-
export type Event4 = "
|
290
|
+
export type Event4 = "store";
|
291
|
+
export type Changes1 = JsonChange[];
|
292
|
+
export type Timestamp5 = string;
|
293
|
+
export type Pending5 = boolean | null;
|
294
|
+
export type Event5 = "model";
|
253
295
|
export type Model2 = string;
|
254
|
-
export type
|
296
|
+
export type Input3 = (
|
255
297
|
| ChatMessageSystem
|
256
298
|
| ChatMessageUser
|
257
299
|
| ChatMessageAssistant
|
258
300
|
| ChatMessageTool
|
259
301
|
)[];
|
260
|
-
export type
|
302
|
+
export type Name7 = string;
|
261
303
|
export type Description = string;
|
262
304
|
export type Type8 = "object";
|
263
305
|
export type Type9 =
|
@@ -275,16 +317,17 @@ export type Required1 = string[];
|
|
275
317
|
export type Additionalproperties1 = boolean;
|
276
318
|
export type Tools1 = ToolInfo[];
|
277
319
|
export type ToolChoice = ("auto" | "any" | "none") | ToolFunction;
|
278
|
-
export type
|
320
|
+
export type Name8 = string;
|
279
321
|
export type Error1 = string | null;
|
280
322
|
export type Cache = ("read" | "write") | null;
|
281
|
-
export type
|
282
|
-
export type
|
283
|
-
export type
|
323
|
+
export type Time1 = number | null;
|
324
|
+
export type Timestamp6 = string;
|
325
|
+
export type Pending6 = boolean | null;
|
326
|
+
export type Event6 = "tool";
|
284
327
|
export type Type10 = "function";
|
285
328
|
export type Id3 = string;
|
286
329
|
export type Function2 = string;
|
287
|
-
export type
|
330
|
+
export type Result1 =
|
288
331
|
| string
|
289
332
|
| number
|
290
333
|
| boolean
|
@@ -294,9 +337,9 @@ export type Result =
|
|
294
337
|
| ContentVideo
|
295
338
|
| (ContentText | ContentImage | ContentAudio | ContentVideo)[];
|
296
339
|
export type Truncated = [unknown, unknown] | null;
|
297
|
-
export type
|
298
|
-
export type
|
299
|
-
export type
|
340
|
+
export type Timestamp7 = string;
|
341
|
+
export type Pending7 = boolean | null;
|
342
|
+
export type Event7 = "approval";
|
300
343
|
export type Message3 = string;
|
301
344
|
export type Approver = string;
|
302
345
|
export type Decision =
|
@@ -306,23 +349,23 @@ export type Decision =
|
|
306
349
|
| "escalate"
|
307
350
|
| "terminate";
|
308
351
|
export type Explanation1 = string | null;
|
309
|
-
export type Timestamp7 = string;
|
310
|
-
export type Pending7 = boolean | null;
|
311
|
-
export type Event7 = "input";
|
312
|
-
export type Input3 = string;
|
313
|
-
export type InputAnsi = string;
|
314
352
|
export type Timestamp8 = string;
|
315
353
|
export type Pending8 = boolean | null;
|
316
|
-
export type Event8 = "
|
317
|
-
export type
|
318
|
-
export type
|
354
|
+
export type Event8 = "input";
|
355
|
+
export type Input4 = string;
|
356
|
+
export type InputAnsi = string;
|
319
357
|
export type Timestamp9 = string;
|
320
358
|
export type Pending9 = boolean | null;
|
321
|
-
export type Event9 = "
|
359
|
+
export type Event9 = "score";
|
360
|
+
export type Target2 = string | string[] | null;
|
361
|
+
export type Intermediate = boolean;
|
322
362
|
export type Timestamp10 = string;
|
323
363
|
export type Pending10 = boolean | null;
|
324
|
-
export type Event10 = "
|
325
|
-
export type
|
364
|
+
export type Event10 = "error";
|
365
|
+
export type Timestamp11 = string;
|
366
|
+
export type Pending11 = boolean | null;
|
367
|
+
export type Event11 = "logger";
|
368
|
+
export type Name9 = string | null;
|
326
369
|
export type Level =
|
327
370
|
| "debug"
|
328
371
|
| "trace"
|
@@ -337,24 +380,25 @@ export type Created1 = number;
|
|
337
380
|
export type Filename = string;
|
338
381
|
export type Module = string;
|
339
382
|
export type Lineno = number;
|
340
|
-
export type Timestamp11 = string;
|
341
|
-
export type Pending11 = boolean | null;
|
342
|
-
export type Event11 = "info";
|
343
|
-
export type Source4 = string | null;
|
344
383
|
export type Timestamp12 = string;
|
345
384
|
export type Pending12 = boolean | null;
|
346
|
-
export type Event12 = "
|
347
|
-
export type
|
348
|
-
export type Type11 = string | null;
|
349
|
-
export type Name8 = string;
|
385
|
+
export type Event12 = "info";
|
386
|
+
export type Source4 = string | null;
|
350
387
|
export type Timestamp13 = string;
|
351
388
|
export type Pending13 = boolean | null;
|
352
|
-
export type Event13 = "
|
353
|
-
export type
|
389
|
+
export type Event13 = "step";
|
390
|
+
export type Action1 = "begin" | "end";
|
391
|
+
export type Type11 = string | null;
|
392
|
+
export type Name10 = string;
|
393
|
+
export type Timestamp14 = string;
|
394
|
+
export type Pending14 = boolean | null;
|
395
|
+
export type Event14 = "subtask";
|
396
|
+
export type Name11 = string;
|
354
397
|
export type Type12 = string | null;
|
355
398
|
export type Events2 = (
|
356
399
|
| SampleInitEvent
|
357
400
|
| SampleLimitEvent
|
401
|
+
| SandboxEvent
|
358
402
|
| StateEvent
|
359
403
|
| StoreEvent
|
360
404
|
| ModelEvent
|
@@ -371,6 +415,7 @@ export type Events2 = (
|
|
371
415
|
export type Events1 = (
|
372
416
|
| SampleInitEvent
|
373
417
|
| SampleLimitEvent
|
418
|
+
| SandboxEvent
|
374
419
|
| StateEvent
|
375
420
|
| StoreEvent
|
376
421
|
| ModelEvent
|
@@ -387,6 +432,7 @@ export type Events1 = (
|
|
387
432
|
export type Events = (
|
388
433
|
| SampleInitEvent
|
389
434
|
| SampleLimitEvent
|
435
|
+
| SandboxEvent
|
390
436
|
| StateEvent
|
391
437
|
| StoreEvent
|
392
438
|
| ModelEvent
|
@@ -400,9 +446,12 @@ export type Events = (
|
|
400
446
|
| StepEvent
|
401
447
|
| SubtaskEvent
|
402
448
|
)[];
|
449
|
+
export type TotalTime = number | null;
|
450
|
+
export type WorkingTime = number | null;
|
403
451
|
export type Type13 =
|
404
452
|
| "context"
|
405
453
|
| "time"
|
454
|
+
| "working"
|
406
455
|
| "message"
|
407
456
|
| "token"
|
408
457
|
| "operator"
|
@@ -421,7 +470,7 @@ export type Value2 =
|
|
421
470
|
};
|
422
471
|
export type Answer1 = string | null;
|
423
472
|
export type Explanation2 = string | null;
|
424
|
-
export type
|
473
|
+
export type Metadata9 = {} | null;
|
425
474
|
export type SampleId1 = string | number | null;
|
426
475
|
export type Samples2 = EvalSampleScore[];
|
427
476
|
export type Location1 = string;
|
@@ -465,6 +514,8 @@ export interface EvalSpec {
|
|
465
514
|
revision: EvalRevision | null;
|
466
515
|
packages: Packages;
|
467
516
|
metadata: Metadata;
|
517
|
+
scorers: Scorers;
|
518
|
+
metrics: Metrics1;
|
468
519
|
}
|
469
520
|
export interface TaskAttribs {}
|
470
521
|
export interface TaskArgs {}
|
@@ -492,6 +543,7 @@ export interface EvalConfig {
|
|
492
543
|
message_limit: MessageLimit;
|
493
544
|
token_limit: TokenLimit;
|
494
545
|
time_limit: TimeLimit;
|
546
|
+
working_limit: WorkingLimit;
|
495
547
|
max_samples: MaxSamples;
|
496
548
|
max_tasks: MaxTasks;
|
497
549
|
max_subprocesses: MaxSubprocesses;
|
@@ -538,11 +590,21 @@ export interface EvalRevision {
|
|
538
590
|
export interface Packages {
|
539
591
|
[k: string]: string;
|
540
592
|
}
|
593
|
+
export interface EvalScorer {
|
594
|
+
name: Name2;
|
595
|
+
options: Options;
|
596
|
+
metrics: Metrics;
|
597
|
+
metadata: Metadata1;
|
598
|
+
}
|
599
|
+
export interface EvalMetricDefinition {
|
600
|
+
name: Name3;
|
601
|
+
options: Options1;
|
602
|
+
}
|
541
603
|
/**
|
542
604
|
* Plan (solvers) used in evaluation.
|
543
605
|
*/
|
544
606
|
export interface EvalPlan {
|
545
|
-
name:
|
607
|
+
name: Name4;
|
546
608
|
steps: Steps;
|
547
609
|
finish: EvalPlanStep | null;
|
548
610
|
config: GenerateConfig;
|
@@ -590,31 +652,31 @@ export interface EvalResults {
|
|
590
652
|
total_samples: TotalSamples;
|
591
653
|
completed_samples: CompletedSamples;
|
592
654
|
scores: Scores;
|
593
|
-
metadata:
|
655
|
+
metadata: Metadata4;
|
594
656
|
}
|
595
657
|
/**
|
596
658
|
* Score for evaluation task.
|
597
659
|
*/
|
598
660
|
export interface EvalScore {
|
599
|
-
name:
|
661
|
+
name: Name5;
|
600
662
|
scorer: Scorer;
|
601
663
|
reducer: Reducer;
|
602
664
|
params: Params2;
|
603
|
-
metrics:
|
604
|
-
metadata:
|
665
|
+
metrics: Metrics2;
|
666
|
+
metadata: Metadata3;
|
605
667
|
}
|
606
668
|
export interface Params2 {}
|
607
|
-
export interface
|
669
|
+
export interface Metrics2 {
|
608
670
|
[k: string]: EvalMetric;
|
609
671
|
}
|
610
672
|
/**
|
611
673
|
* Metric for evaluation score.
|
612
674
|
*/
|
613
675
|
export interface EvalMetric {
|
614
|
-
name:
|
676
|
+
name: Name6;
|
615
677
|
value: Value;
|
616
678
|
params: Params3;
|
617
|
-
metadata:
|
679
|
+
metadata: Metadata2;
|
618
680
|
}
|
619
681
|
export interface Params3 {}
|
620
682
|
/**
|
@@ -661,10 +723,12 @@ export interface EvalSample {
|
|
661
723
|
messages: Messages;
|
662
724
|
output: ModelOutput;
|
663
725
|
scores: Scores1;
|
664
|
-
metadata:
|
726
|
+
metadata: Metadata7;
|
665
727
|
store: Store;
|
666
728
|
events: Events;
|
667
729
|
model_usage: ModelUsage2;
|
730
|
+
total_time: TotalTime;
|
731
|
+
working_time: WorkingTime;
|
668
732
|
error: EvalError | null;
|
669
733
|
attachments: Attachments;
|
670
734
|
limit: EvalSampleLimit | null;
|
@@ -767,7 +831,7 @@ export interface ModelOutput {
|
|
767
831
|
choices: Choices1;
|
768
832
|
usage: ModelUsage1 | null;
|
769
833
|
time: Time;
|
770
|
-
metadata:
|
834
|
+
metadata: Metadata5;
|
771
835
|
error: Error;
|
772
836
|
}
|
773
837
|
/**
|
@@ -808,9 +872,9 @@ export interface Score {
|
|
808
872
|
value: Value1;
|
809
873
|
answer: Answer;
|
810
874
|
explanation: Explanation;
|
811
|
-
metadata:
|
875
|
+
metadata: Metadata6;
|
812
876
|
}
|
813
|
-
export interface
|
877
|
+
export interface Metadata7 {}
|
814
878
|
export interface Store {}
|
815
879
|
/**
|
816
880
|
* Beginning of processing a Sample.
|
@@ -830,7 +894,7 @@ export interface Sample {
|
|
830
894
|
choices: Choices2;
|
831
895
|
target: Target1;
|
832
896
|
id: Id2;
|
833
|
-
metadata:
|
897
|
+
metadata: Metadata8;
|
834
898
|
sandbox: SandboxEnvironmentSpec | null;
|
835
899
|
files: Files1;
|
836
900
|
setup: Setup1;
|
@@ -847,12 +911,27 @@ export interface SampleLimitEvent {
|
|
847
911
|
limit: Limit1;
|
848
912
|
}
|
849
913
|
/**
|
850
|
-
*
|
914
|
+
* Sandbox execution or I/O
|
851
915
|
*/
|
852
|
-
export interface
|
916
|
+
export interface SandboxEvent {
|
853
917
|
timestamp: Timestamp2;
|
854
918
|
pending: Pending2;
|
855
919
|
event: Event2;
|
920
|
+
action: Action;
|
921
|
+
cmd: Cmd;
|
922
|
+
options: Options2;
|
923
|
+
file: File;
|
924
|
+
input: Input2;
|
925
|
+
result: Result;
|
926
|
+
output: Output;
|
927
|
+
}
|
928
|
+
/**
|
929
|
+
* Change to the current `TaskState`
|
930
|
+
*/
|
931
|
+
export interface StateEvent {
|
932
|
+
timestamp: Timestamp3;
|
933
|
+
pending: Pending3;
|
934
|
+
event: Event3;
|
856
935
|
changes: Changes;
|
857
936
|
}
|
858
937
|
/**
|
@@ -873,20 +952,20 @@ export interface JsonChange {
|
|
873
952
|
* Change to data within the current `Store`.
|
874
953
|
*/
|
875
954
|
export interface StoreEvent {
|
876
|
-
timestamp:
|
877
|
-
pending:
|
878
|
-
event:
|
955
|
+
timestamp: Timestamp4;
|
956
|
+
pending: Pending4;
|
957
|
+
event: Event4;
|
879
958
|
changes: Changes1;
|
880
959
|
}
|
881
960
|
/**
|
882
961
|
* Call to a language model.
|
883
962
|
*/
|
884
963
|
export interface ModelEvent {
|
885
|
-
timestamp:
|
886
|
-
pending:
|
887
|
-
event:
|
964
|
+
timestamp: Timestamp5;
|
965
|
+
pending: Pending5;
|
966
|
+
event: Event5;
|
888
967
|
model: Model2;
|
889
|
-
input:
|
968
|
+
input: Input3;
|
890
969
|
tools: Tools1;
|
891
970
|
tool_choice: ToolChoice;
|
892
971
|
config: GenerateConfig1;
|
@@ -922,7 +1001,7 @@ export interface ModelEvent {
|
|
922
1001
|
* ```
|
923
1002
|
*/
|
924
1003
|
export interface ToolInfo {
|
925
|
-
name:
|
1004
|
+
name: Name7;
|
926
1005
|
description: Description;
|
927
1006
|
parameters: ToolParams;
|
928
1007
|
}
|
@@ -956,7 +1035,7 @@ export interface Default {
|
|
956
1035
|
[k: string]: unknown;
|
957
1036
|
}
|
958
1037
|
export interface ToolFunction {
|
959
|
-
name:
|
1038
|
+
name: Name8;
|
960
1039
|
}
|
961
1040
|
/**
|
962
1041
|
* Model generation options.
|
@@ -992,6 +1071,7 @@ export interface GenerateConfig1 {
|
|
992
1071
|
export interface ModelCall {
|
993
1072
|
request: Request;
|
994
1073
|
response: Response;
|
1074
|
+
time: Time1;
|
995
1075
|
}
|
996
1076
|
export interface Request {
|
997
1077
|
[k: string]: JsonValue;
|
@@ -1003,15 +1083,15 @@ export interface Response {
|
|
1003
1083
|
* Call to a tool.
|
1004
1084
|
*/
|
1005
1085
|
export interface ToolEvent {
|
1006
|
-
timestamp:
|
1007
|
-
pending:
|
1008
|
-
event:
|
1086
|
+
timestamp: Timestamp6;
|
1087
|
+
pending: Pending6;
|
1088
|
+
event: Event6;
|
1009
1089
|
type: Type10;
|
1010
1090
|
id: Id3;
|
1011
1091
|
function: Function2;
|
1012
1092
|
arguments: Arguments1;
|
1013
1093
|
view: ToolCallContent | null;
|
1014
|
-
result:
|
1094
|
+
result: Result1;
|
1015
1095
|
truncated: Truncated;
|
1016
1096
|
error: ToolCallError | null;
|
1017
1097
|
events: Events1;
|
@@ -1023,9 +1103,9 @@ export interface Arguments1 {
|
|
1023
1103
|
* Tool approval.
|
1024
1104
|
*/
|
1025
1105
|
export interface ApprovalEvent {
|
1026
|
-
timestamp:
|
1027
|
-
pending:
|
1028
|
-
event:
|
1106
|
+
timestamp: Timestamp7;
|
1107
|
+
pending: Pending7;
|
1108
|
+
event: Event7;
|
1029
1109
|
message: Message3;
|
1030
1110
|
call: ToolCall;
|
1031
1111
|
view: ToolCallView | null;
|
@@ -1048,10 +1128,10 @@ export interface ToolCallView {
|
|
1048
1128
|
* Input screen interaction.
|
1049
1129
|
*/
|
1050
1130
|
export interface InputEvent {
|
1051
|
-
timestamp:
|
1052
|
-
pending:
|
1053
|
-
event:
|
1054
|
-
input:
|
1131
|
+
timestamp: Timestamp8;
|
1132
|
+
pending: Pending8;
|
1133
|
+
event: Event8;
|
1134
|
+
input: Input4;
|
1055
1135
|
input_ansi: InputAnsi;
|
1056
1136
|
}
|
1057
1137
|
/**
|
@@ -1061,9 +1141,9 @@ export interface InputEvent {
|
|
1061
1141
|
* resulting from a call to `score`.
|
1062
1142
|
*/
|
1063
1143
|
export interface ScoreEvent {
|
1064
|
-
timestamp:
|
1065
|
-
pending:
|
1066
|
-
event:
|
1144
|
+
timestamp: Timestamp9;
|
1145
|
+
pending: Pending9;
|
1146
|
+
event: Event9;
|
1067
1147
|
score: Score;
|
1068
1148
|
target: Target2;
|
1069
1149
|
intermediate: Intermediate;
|
@@ -1072,25 +1152,25 @@ export interface ScoreEvent {
|
|
1072
1152
|
* Event with sample error.
|
1073
1153
|
*/
|
1074
1154
|
export interface ErrorEvent {
|
1075
|
-
timestamp:
|
1076
|
-
pending:
|
1077
|
-
event:
|
1155
|
+
timestamp: Timestamp10;
|
1156
|
+
pending: Pending10;
|
1157
|
+
event: Event10;
|
1078
1158
|
error: EvalError;
|
1079
1159
|
}
|
1080
1160
|
/**
|
1081
1161
|
* Log message recorded with Python logger.
|
1082
1162
|
*/
|
1083
1163
|
export interface LoggerEvent {
|
1084
|
-
timestamp:
|
1085
|
-
pending:
|
1086
|
-
event:
|
1164
|
+
timestamp: Timestamp11;
|
1165
|
+
pending: Pending11;
|
1166
|
+
event: Event11;
|
1087
1167
|
message: LoggingMessage;
|
1088
1168
|
}
|
1089
1169
|
/**
|
1090
1170
|
* Message written to Python log.
|
1091
1171
|
*/
|
1092
1172
|
export interface LoggingMessage {
|
1093
|
-
name:
|
1173
|
+
name: Name9;
|
1094
1174
|
level: Level;
|
1095
1175
|
message: Message4;
|
1096
1176
|
created: Created1;
|
@@ -1102,9 +1182,9 @@ export interface LoggingMessage {
|
|
1102
1182
|
* Event with custom info/data.
|
1103
1183
|
*/
|
1104
1184
|
export interface InfoEvent {
|
1105
|
-
timestamp:
|
1106
|
-
pending:
|
1107
|
-
event:
|
1185
|
+
timestamp: Timestamp12;
|
1186
|
+
pending: Pending12;
|
1187
|
+
event: Event12;
|
1108
1188
|
source: Source4;
|
1109
1189
|
data: JsonValue;
|
1110
1190
|
}
|
@@ -1112,28 +1192,28 @@ export interface InfoEvent {
|
|
1112
1192
|
* Step within current sample or subtask.
|
1113
1193
|
*/
|
1114
1194
|
export interface StepEvent {
|
1115
|
-
timestamp:
|
1116
|
-
pending:
|
1117
|
-
event:
|
1118
|
-
action:
|
1195
|
+
timestamp: Timestamp13;
|
1196
|
+
pending: Pending13;
|
1197
|
+
event: Event13;
|
1198
|
+
action: Action1;
|
1119
1199
|
type: Type11;
|
1120
|
-
name:
|
1200
|
+
name: Name10;
|
1121
1201
|
}
|
1122
1202
|
/**
|
1123
1203
|
* Subtask spawned.
|
1124
1204
|
*/
|
1125
1205
|
export interface SubtaskEvent {
|
1126
|
-
timestamp:
|
1127
|
-
pending:
|
1128
|
-
event:
|
1129
|
-
name:
|
1206
|
+
timestamp: Timestamp14;
|
1207
|
+
pending: Pending14;
|
1208
|
+
event: Event14;
|
1209
|
+
name: Name11;
|
1130
1210
|
type: Type12;
|
1131
|
-
input:
|
1132
|
-
result:
|
1211
|
+
input: Input5;
|
1212
|
+
result: Result2;
|
1133
1213
|
events: Events2;
|
1134
1214
|
}
|
1135
|
-
export interface
|
1136
|
-
export interface
|
1215
|
+
export interface Input5 {}
|
1216
|
+
export interface Result2 {
|
1137
1217
|
[k: string]: unknown;
|
1138
1218
|
}
|
1139
1219
|
export interface ModelUsage2 {
|
@@ -1164,6 +1244,6 @@ export interface EvalSampleScore {
|
|
1164
1244
|
value: Value2;
|
1165
1245
|
answer: Answer1;
|
1166
1246
|
explanation: Explanation2;
|
1167
|
-
metadata:
|
1247
|
+
metadata: Metadata9;
|
1168
1248
|
sample_id: SampleId1;
|
1169
1249
|
}
|
@@ -77,7 +77,7 @@ export const formatDataset = (
|
|
77
77
|
*/
|
78
78
|
export const formatTime = (seconds: number): string => {
|
79
79
|
if (seconds < 60) {
|
80
|
-
return `${seconds} sec`;
|
80
|
+
return `${formatPrettyDecimal(seconds, 1)} sec`;
|
81
81
|
} else if (seconds < 60 * 60) {
|
82
82
|
return `${Math.floor(seconds / 60)} min ${seconds % 60} sec`;
|
83
83
|
} else if (seconds < 60 * 60 * 24) {
|
@@ -97,15 +97,18 @@ export const formatTime = (seconds: number): string => {
|
|
97
97
|
/**
|
98
98
|
* Formats a number to a string with specific decimal places for prettiness.
|
99
99
|
*/
|
100
|
-
export function formatPrettyDecimal(
|
100
|
+
export function formatPrettyDecimal(
|
101
|
+
num: number,
|
102
|
+
maxDecimals: number = 3,
|
103
|
+
): string {
|
101
104
|
const numDecimalPlaces = num.toString().includes(".")
|
102
105
|
? num.toString().split(".")[1].length
|
103
106
|
: 0;
|
104
107
|
|
105
108
|
if (numDecimalPlaces === 0) {
|
106
109
|
return num.toFixed(1);
|
107
|
-
} else if (numDecimalPlaces >
|
108
|
-
return num.toFixed(
|
110
|
+
} else if (numDecimalPlaces > maxDecimals) {
|
111
|
+
return num.toFixed(maxDecimals);
|
109
112
|
} else {
|
110
113
|
return num.toString();
|
111
114
|
}
|
@@ -78,6 +78,12 @@ export const WorkSpaceView: React.FC<WorkSpaceViewProps> = ({
|
|
78
78
|
},
|
79
79
|
[setSelectedTab],
|
80
80
|
);
|
81
|
+
const handleScroll = useCallback(
|
82
|
+
(tabid: string, position: number) => {
|
83
|
+
onScroll(tabid, position);
|
84
|
+
},
|
85
|
+
[onScroll],
|
86
|
+
);
|
81
87
|
|
82
88
|
if (evalSpec === undefined) {
|
83
89
|
return <EmptyPanel />;
|
@@ -138,12 +144,9 @@ export const WorkSpaceView: React.FC<WorkSpaceViewProps> = ({
|
|
138
144
|
scrollPosition={
|
139
145
|
workspaceTabScrollPositionRef.current?.[tab.id]
|
140
146
|
}
|
141
|
-
setScrollPosition={
|
142
|
-
(position
|
143
|
-
|
144
|
-
},
|
145
|
-
[onScroll],
|
146
|
-
)}
|
147
|
+
setScrollPosition={(position: number) => {
|
148
|
+
handleScroll(tab.id, position);
|
149
|
+
}}
|
147
150
|
>
|
148
151
|
{tab.content()}
|
149
152
|
</TabPanel>
|