inspect-ai 0.3.63__py3-none-any.whl → 0.3.65__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/_cli/cache.py +8 -7
- inspect_ai/_cli/common.py +0 -12
- inspect_ai/_cli/eval.py +32 -4
- inspect_ai/_cli/info.py +1 -0
- inspect_ai/_cli/list.py +1 -1
- inspect_ai/_cli/log.py +2 -0
- inspect_ai/_cli/sandbox.py +4 -1
- inspect_ai/_cli/score.py +181 -32
- inspect_ai/_cli/trace.py +2 -0
- inspect_ai/_cli/view.py +4 -2
- inspect_ai/_display/core/config.py +7 -1
- inspect_ai/_display/core/progress.py +1 -1
- inspect_ai/_display/textual/app.py +8 -4
- inspect_ai/_display/textual/widgets/samples.py +6 -5
- inspect_ai/_display/textual/widgets/sandbox.py +6 -0
- inspect_ai/_eval/__init__.py +0 -0
- inspect_ai/_eval/eval.py +100 -97
- inspect_ai/_eval/evalset.py +69 -69
- inspect_ai/_eval/loader.py +122 -12
- inspect_ai/_eval/registry.py +1 -1
- inspect_ai/_eval/run.py +14 -0
- inspect_ai/_eval/score.py +125 -36
- inspect_ai/_eval/task/log.py +105 -4
- inspect_ai/_eval/task/results.py +92 -38
- inspect_ai/_eval/task/run.py +6 -2
- inspect_ai/_eval/task/sandbox.py +35 -2
- inspect_ai/_eval/task/task.py +49 -46
- inspect_ai/_util/__init__.py +0 -0
- inspect_ai/_util/constants.py +1 -1
- inspect_ai/_util/content.py +8 -0
- inspect_ai/_util/error.py +2 -0
- inspect_ai/_util/file.py +15 -1
- inspect_ai/_util/logger.py +4 -2
- inspect_ai/_util/registry.py +7 -1
- inspect_ai/_view/view.py +1 -2
- inspect_ai/_view/www/App.css +8 -3
- inspect_ai/_view/www/README.md +1 -1
- inspect_ai/_view/www/dist/assets/index.css +66 -38
- inspect_ai/_view/www/dist/assets/index.js +525 -523
- inspect_ai/_view/www/log-schema.json +86 -73
- inspect_ai/_view/www/package.json +1 -1
- inspect_ai/_view/www/src/App.tsx +1 -0
- inspect_ai/_view/www/src/components/AnsiDisplay.tsx +1 -1
- inspect_ai/_view/www/src/components/JsonPanel.tsx +1 -1
- inspect_ai/_view/www/src/components/LargeModal.tsx +39 -49
- inspect_ai/_view/www/src/components/NavPills.tsx +3 -1
- inspect_ai/_view/www/src/components/TabSet.tsx +19 -4
- inspect_ai/_view/www/src/logfile/remoteLogFile.ts +0 -1
- inspect_ai/_view/www/src/metadata/MetaDataGrid.tsx +1 -1
- inspect_ai/_view/www/src/metadata/MetaDataView.tsx +1 -1
- inspect_ai/_view/www/src/metadata/RenderedContent.tsx +6 -13
- inspect_ai/_view/www/src/plan/PlanDetailView.tsx +17 -2
- inspect_ai/_view/www/src/plan/SolverDetailView.tsx +1 -1
- inspect_ai/_view/www/src/samples/SampleDisplay.tsx +14 -5
- inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +4 -2
- inspect_ai/_view/www/src/samples/SamplesTools.tsx +16 -24
- inspect_ai/_view/www/src/samples/chat/ChatMessage.tsx +1 -1
- inspect_ai/_view/www/src/samples/chat/ChatView.tsx +1 -0
- inspect_ai/_view/www/src/samples/chat/MessageContent.tsx +27 -13
- inspect_ai/_view/www/src/samples/chat/MessageContents.tsx +19 -17
- inspect_ai/_view/www/src/samples/chat/tools/ToolCallView.tsx +12 -10
- inspect_ai/_view/www/src/samples/chat/tools/ToolInput.tsx +56 -66
- inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.tsx +12 -5
- inspect_ai/_view/www/src/samples/chat/tools/tool.ts +21 -36
- inspect_ai/_view/www/src/samples/descriptor/samplesDescriptor.tsx +3 -1
- inspect_ai/_view/www/src/samples/sample-tools/SelectScorer.tsx +27 -25
- inspect_ai/_view/www/src/samples/sample-tools/SortFilter.tsx +5 -1
- inspect_ai/_view/www/src/samples/scores/SampleScoreView.module.css +13 -13
- inspect_ai/_view/www/src/samples/transcript/InfoEventView.tsx +1 -1
- inspect_ai/_view/www/src/samples/transcript/ModelEventView.tsx +2 -2
- inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.tsx +9 -5
- inspect_ai/_view/www/src/samples/transcript/ScoreEventView.tsx +1 -1
- inspect_ai/_view/www/src/samples/transcript/ToolEventView.tsx +5 -4
- inspect_ai/_view/www/src/samples/transcript/event/EventNavs.tsx +1 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventPanel.tsx +1 -0
- inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.tsx +17 -6
- inspect_ai/_view/www/src/samples/transcript/state/StateEventView.tsx +14 -19
- inspect_ai/_view/www/src/types/log.d.ts +107 -19
- inspect_ai/_view/www/src/usage/ModelTokenTable.tsx +7 -1
- inspect_ai/_view/www/src/usage/ModelUsagePanel.tsx +5 -3
- inspect_ai/_view/www/src/workspace/WorkSpaceView.tsx +25 -27
- inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.tsx +12 -11
- inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.module.css +25 -2
- inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.tsx +60 -36
- inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.tsx +4 -0
- inspect_ai/_view/www/src/workspace/sidebar/SidebarScoreView.tsx +6 -4
- inspect_ai/_view/www/src/workspace/sidebar/SidebarScoresView.tsx +16 -14
- inspect_ai/_view/www/src/workspace/tabs/InfoTab.tsx +9 -19
- inspect_ai/_view/www/src/workspace/utils.ts +34 -0
- inspect_ai/approval/_approval.py +2 -0
- inspect_ai/approval/_approver.py +4 -4
- inspect_ai/approval/_auto.py +1 -1
- inspect_ai/approval/_human/approver.py +3 -0
- inspect_ai/approval/_policy.py +5 -0
- inspect_ai/approval/_registry.py +2 -2
- inspect_ai/dataset/_dataset.py +36 -45
- inspect_ai/dataset/_sources/__init__.py +0 -0
- inspect_ai/dataset/_sources/csv.py +13 -13
- inspect_ai/dataset/_sources/hf.py +29 -29
- inspect_ai/dataset/_sources/json.py +10 -10
- inspect_ai/log/__init__.py +2 -0
- inspect_ai/log/_convert.py +3 -3
- inspect_ai/log/_file.py +24 -9
- inspect_ai/log/_log.py +98 -7
- inspect_ai/log/_message.py +3 -1
- inspect_ai/log/_recorders/file.py +4 -0
- inspect_ai/log/_recorders/recorder.py +3 -0
- inspect_ai/log/_transcript.py +19 -8
- inspect_ai/model/__init__.py +2 -0
- inspect_ai/model/_cache.py +39 -21
- inspect_ai/model/_call_tools.py +2 -2
- inspect_ai/model/_chat_message.py +14 -4
- inspect_ai/model/_generate_config.py +1 -1
- inspect_ai/model/_model.py +31 -24
- inspect_ai/model/_model_output.py +14 -1
- inspect_ai/model/_openai.py +10 -18
- inspect_ai/model/_providers/google.py +9 -5
- inspect_ai/model/_providers/openai.py +5 -9
- inspect_ai/model/_providers/openrouter.py +1 -1
- inspect_ai/scorer/__init__.py +6 -1
- inspect_ai/scorer/_answer.py +1 -1
- inspect_ai/scorer/_classification.py +4 -0
- inspect_ai/scorer/_match.py +4 -5
- inspect_ai/scorer/_metric.py +87 -28
- inspect_ai/scorer/_metrics/__init__.py +3 -3
- inspect_ai/scorer/_metrics/accuracy.py +8 -10
- inspect_ai/scorer/_metrics/mean.py +3 -17
- inspect_ai/scorer/_metrics/std.py +111 -30
- inspect_ai/scorer/_model.py +12 -12
- inspect_ai/scorer/_pattern.py +3 -3
- inspect_ai/scorer/_reducer/reducer.py +36 -21
- inspect_ai/scorer/_reducer/registry.py +2 -2
- inspect_ai/scorer/_reducer/types.py +7 -1
- inspect_ai/scorer/_score.py +11 -1
- inspect_ai/scorer/_scorer.py +110 -16
- inspect_ai/solver/__init__.py +1 -1
- inspect_ai/solver/_basic_agent.py +19 -22
- inspect_ai/solver/_bridge/__init__.py +0 -3
- inspect_ai/solver/_bridge/bridge.py +3 -3
- inspect_ai/solver/_chain.py +1 -2
- inspect_ai/solver/_critique.py +3 -3
- inspect_ai/solver/_fork.py +2 -2
- inspect_ai/solver/_human_agent/__init__.py +0 -0
- inspect_ai/solver/_human_agent/agent.py +5 -8
- inspect_ai/solver/_human_agent/commands/clock.py +14 -10
- inspect_ai/solver/_human_agent/commands/note.py +1 -1
- inspect_ai/solver/_human_agent/commands/score.py +0 -11
- inspect_ai/solver/_multiple_choice.py +15 -18
- inspect_ai/solver/_prompt.py +7 -7
- inspect_ai/solver/_solver.py +53 -52
- inspect_ai/solver/_task_state.py +80 -69
- inspect_ai/solver/_use_tools.py +9 -9
- inspect_ai/tool/__init__.py +2 -1
- inspect_ai/tool/_tool.py +43 -14
- inspect_ai/tool/_tool_call.py +6 -2
- inspect_ai/tool/_tool_choice.py +3 -1
- inspect_ai/tool/_tool_def.py +10 -8
- inspect_ai/tool/_tool_params.py +24 -0
- inspect_ai/tool/_tool_with.py +7 -7
- inspect_ai/tool/_tools/__init__.py +0 -0
- inspect_ai/tool/_tools/_computer/_common.py +2 -2
- inspect_ai/tool/_tools/_computer/_computer.py +11 -0
- inspect_ai/tool/_tools/_execute.py +15 -9
- inspect_ai/tool/_tools/_web_browser/_resources/README.md +2 -2
- inspect_ai/tool/_tools/_web_browser/_web_browser.py +5 -3
- inspect_ai/tool/_tools/_web_search.py +7 -5
- inspect_ai/util/_concurrency.py +3 -3
- inspect_ai/util/_panel.py +2 -0
- inspect_ai/util/_resource.py +12 -12
- inspect_ai/util/_sandbox/docker/compose.py +23 -20
- inspect_ai/util/_sandbox/docker/config.py +2 -1
- inspect_ai/util/_sandbox/docker/docker.py +10 -1
- inspect_ai/util/_sandbox/docker/service.py +100 -0
- inspect_ai/util/_sandbox/environment.py +99 -96
- inspect_ai/util/_subprocess.py +5 -3
- inspect_ai/util/_subtask.py +15 -16
- {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/LICENSE +1 -1
- {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/METADATA +10 -6
- {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/RECORD +182 -175
- {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/WHEEL +0 -0
- {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.63.dist-info → inspect_ai-0.3.65.dist-info}/top_level.txt +0 -0
@@ -112,6 +112,7 @@ export type Input =
|
|
112
112
|
| ChatMessageAssistant
|
113
113
|
| ChatMessageTool
|
114
114
|
)[];
|
115
|
+
export type Role = "system";
|
115
116
|
export type Content =
|
116
117
|
| string
|
117
118
|
| (ContentText | ContentImage | ContentAudio | ContentVideo)[];
|
@@ -127,18 +128,17 @@ export type Type4 = "video";
|
|
127
128
|
export type Video = string;
|
128
129
|
export type Format1 = "mp4" | "mpeg" | "mov";
|
129
130
|
export type Source = ("input" | "generate") | null;
|
130
|
-
export type Role = "system";
|
131
|
+
export type Role1 = "user";
|
131
132
|
export type Content1 =
|
132
133
|
| string
|
133
134
|
| (ContentText | ContentImage | ContentAudio | ContentVideo)[];
|
134
135
|
export type Source1 = ("input" | "generate") | null;
|
135
|
-
export type Role1 = "user";
|
136
136
|
export type ToolCallId = string[] | null;
|
137
|
+
export type Role2 = "assistant";
|
137
138
|
export type Content2 =
|
138
139
|
| string
|
139
140
|
| (ContentText | ContentImage | ContentAudio | ContentVideo)[];
|
140
141
|
export type Source2 = ("input" | "generate") | null;
|
141
|
-
export type Role2 = "assistant";
|
142
142
|
export type ToolCalls = ToolCall[] | null;
|
143
143
|
export type Id1 = string;
|
144
144
|
export type Function = string;
|
@@ -148,11 +148,11 @@ export type Title = string | null;
|
|
148
148
|
export type Format2 = "text" | "markdown";
|
149
149
|
export type Content3 = string;
|
150
150
|
export type Reasoning = string | null;
|
151
|
+
export type Role3 = "tool";
|
151
152
|
export type Content4 =
|
152
153
|
| string
|
153
154
|
| (ContentText | ContentImage | ContentAudio | ContentVideo)[];
|
154
155
|
export type Source3 = ("input" | "generate") | null;
|
155
|
-
export type Role3 = "tool";
|
156
156
|
export type ToolCallId1 = string | null;
|
157
157
|
export type Function1 = string | null;
|
158
158
|
export type Type6 =
|
@@ -315,6 +315,7 @@ export type Timestamp8 = string;
|
|
315
315
|
export type Pending8 = boolean | null;
|
316
316
|
export type Event8 = "score";
|
317
317
|
export type Target2 = string | string[] | null;
|
318
|
+
export type Intermediate = boolean;
|
318
319
|
export type Timestamp9 = string;
|
319
320
|
export type Pending9 = boolean | null;
|
320
321
|
export type Event9 = "error";
|
@@ -339,6 +340,7 @@ export type Lineno = number;
|
|
339
340
|
export type Timestamp11 = string;
|
340
341
|
export type Pending11 = boolean | null;
|
341
342
|
export type Event11 = "info";
|
343
|
+
export type Source4 = string | null;
|
342
344
|
export type Timestamp12 = string;
|
343
345
|
export type Pending12 = boolean | null;
|
344
346
|
export type Event12 = "step";
|
@@ -424,6 +426,9 @@ export type SampleId1 = string | number | null;
|
|
424
426
|
export type Samples2 = EvalSampleScore[];
|
425
427
|
export type Location1 = string;
|
426
428
|
|
429
|
+
/**
|
430
|
+
* Evaluation log.
|
431
|
+
*/
|
427
432
|
export interface EvalLog {
|
428
433
|
version?: Version;
|
429
434
|
status?: Status;
|
@@ -436,6 +441,9 @@ export interface EvalLog {
|
|
436
441
|
reductions?: Reductions;
|
437
442
|
location?: Location1;
|
438
443
|
}
|
444
|
+
/**
|
445
|
+
* Eval target and configuration.
|
446
|
+
*/
|
439
447
|
export interface EvalSpec {
|
440
448
|
run_id: RunId;
|
441
449
|
created: Created;
|
@@ -460,6 +468,9 @@ export interface EvalSpec {
|
|
460
468
|
}
|
461
469
|
export interface TaskAttribs {}
|
462
470
|
export interface TaskArgs {}
|
471
|
+
/**
|
472
|
+
* Dataset used for evaluation.
|
473
|
+
*/
|
463
474
|
export interface EvalDataset {
|
464
475
|
name: Name;
|
465
476
|
location: Location;
|
@@ -468,6 +479,9 @@ export interface EvalDataset {
|
|
468
479
|
shuffled: Shuffled;
|
469
480
|
}
|
470
481
|
export interface ModelArgs {}
|
482
|
+
/**
|
483
|
+
* Configuration used for evaluation.
|
484
|
+
*/
|
471
485
|
export interface EvalConfig {
|
472
486
|
limit: Limit;
|
473
487
|
sample_id: SampleId;
|
@@ -513,6 +527,9 @@ export interface ApproverPolicyConfig {
|
|
513
527
|
params: Params;
|
514
528
|
}
|
515
529
|
export interface Params {}
|
530
|
+
/**
|
531
|
+
* Git revision for evaluation.
|
532
|
+
*/
|
516
533
|
export interface EvalRevision {
|
517
534
|
type: Type;
|
518
535
|
origin: Origin;
|
@@ -521,19 +538,25 @@ export interface EvalRevision {
|
|
521
538
|
export interface Packages {
|
522
539
|
[k: string]: string;
|
523
540
|
}
|
541
|
+
/**
|
542
|
+
* Plan (solvers) used in evaluation.
|
543
|
+
*/
|
524
544
|
export interface EvalPlan {
|
525
545
|
name: Name2;
|
526
546
|
steps: Steps;
|
527
547
|
finish: EvalPlanStep | null;
|
528
548
|
config: GenerateConfig;
|
529
549
|
}
|
550
|
+
/**
|
551
|
+
* Solver step.
|
552
|
+
*/
|
530
553
|
export interface EvalPlanStep {
|
531
554
|
solver: Solver1;
|
532
555
|
params: Params1;
|
533
556
|
}
|
534
557
|
export interface Params1 {}
|
535
558
|
/**
|
536
|
-
*
|
559
|
+
* Model generation options.
|
537
560
|
*/
|
538
561
|
export interface GenerateConfig {
|
539
562
|
max_retries: MaxRetries;
|
@@ -560,12 +583,18 @@ export interface GenerateConfig {
|
|
560
583
|
reasoning_effort: ReasoningEffort;
|
561
584
|
reasoning_history: ReasoningHistory;
|
562
585
|
}
|
586
|
+
/**
|
587
|
+
* Scoring results from evaluation.
|
588
|
+
*/
|
563
589
|
export interface EvalResults {
|
564
590
|
total_samples: TotalSamples;
|
565
591
|
completed_samples: CompletedSamples;
|
566
592
|
scores: Scores;
|
567
593
|
metadata: Metadata3;
|
568
594
|
}
|
595
|
+
/**
|
596
|
+
* Score for evaluation task.
|
597
|
+
*/
|
569
598
|
export interface EvalScore {
|
570
599
|
name: Name3;
|
571
600
|
scorer: Scorer;
|
@@ -578,13 +607,19 @@ export interface Params2 {}
|
|
578
607
|
export interface Metrics {
|
579
608
|
[k: string]: EvalMetric;
|
580
609
|
}
|
610
|
+
/**
|
611
|
+
* Metric for evaluation score.
|
612
|
+
*/
|
581
613
|
export interface EvalMetric {
|
582
614
|
name: Name4;
|
583
615
|
value: Value;
|
584
|
-
|
616
|
+
params: Params3;
|
585
617
|
metadata: Metadata1;
|
586
618
|
}
|
587
|
-
export interface
|
619
|
+
export interface Params3 {}
|
620
|
+
/**
|
621
|
+
* Timing and usage statistics.
|
622
|
+
*/
|
588
623
|
export interface EvalStats {
|
589
624
|
started_at: StartedAt;
|
590
625
|
completed_at: CompletedAt;
|
@@ -593,6 +628,9 @@ export interface EvalStats {
|
|
593
628
|
export interface ModelUsage {
|
594
629
|
[k: string]: ModelUsage1;
|
595
630
|
}
|
631
|
+
/**
|
632
|
+
* Token usage for completion.
|
633
|
+
*/
|
596
634
|
export interface ModelUsage1 {
|
597
635
|
input_tokens: InputTokens;
|
598
636
|
output_tokens: OutputTokens;
|
@@ -600,11 +638,17 @@ export interface ModelUsage1 {
|
|
600
638
|
input_tokens_cache_write: InputTokensCacheWrite;
|
601
639
|
input_tokens_cache_read: InputTokensCacheRead;
|
602
640
|
}
|
641
|
+
/**
|
642
|
+
* Eval error details.
|
643
|
+
*/
|
603
644
|
export interface EvalError {
|
604
645
|
message: Message;
|
605
646
|
traceback: Traceback;
|
606
647
|
traceback_ansi: TracebackAnsi;
|
607
648
|
}
|
649
|
+
/**
|
650
|
+
* Sample from evaluation task.
|
651
|
+
*/
|
608
652
|
export interface EvalSample {
|
609
653
|
id: Id;
|
610
654
|
epoch: Epoch;
|
@@ -625,40 +669,61 @@ export interface EvalSample {
|
|
625
669
|
attachments: Attachments;
|
626
670
|
limit: EvalSampleLimit | null;
|
627
671
|
}
|
672
|
+
/**
|
673
|
+
* System chat message.
|
674
|
+
*/
|
628
675
|
export interface ChatMessageSystem {
|
676
|
+
role: Role;
|
629
677
|
content: Content;
|
630
678
|
source: Source;
|
631
|
-
role: Role;
|
632
679
|
}
|
680
|
+
/**
|
681
|
+
* Text content.
|
682
|
+
*/
|
633
683
|
export interface ContentText {
|
634
684
|
type: Type1;
|
635
685
|
text: Text;
|
636
686
|
}
|
687
|
+
/**
|
688
|
+
* Image content.
|
689
|
+
*/
|
637
690
|
export interface ContentImage {
|
638
691
|
type: Type2;
|
639
692
|
image: Image;
|
640
693
|
detail: Detail;
|
641
694
|
}
|
695
|
+
/**
|
696
|
+
* Audio content.
|
697
|
+
*/
|
642
698
|
export interface ContentAudio {
|
643
699
|
type: Type3;
|
644
700
|
audio: Audio;
|
645
701
|
format: Format;
|
646
702
|
}
|
703
|
+
/**
|
704
|
+
* Video content.
|
705
|
+
*/
|
647
706
|
export interface ContentVideo {
|
648
707
|
type: Type4;
|
649
708
|
video: Video;
|
650
709
|
format: Format1;
|
651
710
|
}
|
711
|
+
/**
|
712
|
+
* User chat message.
|
713
|
+
*/
|
652
714
|
export interface ChatMessageUser {
|
715
|
+
role: Role1;
|
653
716
|
content: Content1;
|
654
717
|
source: Source1;
|
655
|
-
role: Role1;
|
656
718
|
tool_call_id: ToolCallId;
|
657
719
|
}
|
720
|
+
/**
|
721
|
+
* Assistant chat message.
|
722
|
+
*/
|
658
723
|
export interface ChatMessageAssistant {
|
724
|
+
role: Role2;
|
659
725
|
content: Content2;
|
660
726
|
source: Source2;
|
661
|
-
role: Role2;
|
662
727
|
tool_calls: ToolCalls;
|
663
728
|
reasoning: Reasoning;
|
664
729
|
}
|
@@ -679,10 +744,13 @@ export interface ToolCallContent {
|
|
679
744
|
format: Format2;
|
680
745
|
content: Content3;
|
681
746
|
}
|
747
|
+
/**
|
748
|
+
* Tool chat message.
|
749
|
+
*/
|
682
750
|
export interface ChatMessageTool {
|
751
|
+
role: Role3;
|
683
752
|
content: Content4;
|
684
753
|
source: Source3;
|
685
|
-
role: Role3;
|
686
754
|
tool_call_id: ToolCallId1;
|
687
755
|
function: Function1;
|
688
756
|
error: ToolCallError | null;
|
@@ -691,6 +759,9 @@ export interface ToolCallError {
|
|
691
759
|
type: Type6;
|
692
760
|
message: Message1;
|
693
761
|
}
|
762
|
+
/**
|
763
|
+
* Output from model generation.
|
764
|
+
*/
|
694
765
|
export interface ModelOutput {
|
695
766
|
model: Model1;
|
696
767
|
choices: Choices1;
|
@@ -699,6 +770,9 @@ export interface ModelOutput {
|
|
699
770
|
metadata: Metadata4;
|
700
771
|
error: Error;
|
701
772
|
}
|
773
|
+
/**
|
774
|
+
* Choice generated for completion.
|
775
|
+
*/
|
702
776
|
export interface ChatCompletionChoice {
|
703
777
|
message: ChatMessageAssistant;
|
704
778
|
stop_reason: StopReason;
|
@@ -729,12 +803,6 @@ export interface TopLogprob {
|
|
729
803
|
}
|
730
804
|
/**
|
731
805
|
* Score generated by a scorer.
|
732
|
-
*
|
733
|
-
* Args:
|
734
|
-
* value (Value): Score value.
|
735
|
-
* answer (str | None): Answer extracted from model output (optional).
|
736
|
-
* explanation (str | None): Explanation of score (optional).
|
737
|
-
* metadata (dict[str,Any]): Additional metadata related to the score.
|
738
806
|
*/
|
739
807
|
export interface Score {
|
740
808
|
value: Value1;
|
@@ -754,6 +822,9 @@ export interface SampleInitEvent {
|
|
754
822
|
sample: Sample;
|
755
823
|
state: JsonValue;
|
756
824
|
}
|
825
|
+
/**
|
826
|
+
* Sample for an evaluation task.
|
827
|
+
*/
|
757
828
|
export interface Sample {
|
758
829
|
input: Input1;
|
759
830
|
choices: Choices2;
|
@@ -888,7 +959,7 @@ export interface ToolFunction {
|
|
888
959
|
name: Name6;
|
889
960
|
}
|
890
961
|
/**
|
891
|
-
*
|
962
|
+
* Model generation options.
|
892
963
|
*/
|
893
964
|
export interface GenerateConfig1 {
|
894
965
|
max_retries: MaxRetries;
|
@@ -984,7 +1055,10 @@ export interface InputEvent {
|
|
984
1055
|
input_ansi: InputAnsi;
|
985
1056
|
}
|
986
1057
|
/**
|
987
|
-
* Event with
|
1058
|
+
* Event with score.
|
1059
|
+
*
|
1060
|
+
* Can be the final score for a `Sample`, or can be an intermediate score
|
1061
|
+
* resulting from a call to `score`.
|
988
1062
|
*/
|
989
1063
|
export interface ScoreEvent {
|
990
1064
|
timestamp: Timestamp8;
|
@@ -992,6 +1066,7 @@ export interface ScoreEvent {
|
|
992
1066
|
event: Event8;
|
993
1067
|
score: Score;
|
994
1068
|
target: Target2;
|
1069
|
+
intermediate: Intermediate;
|
995
1070
|
}
|
996
1071
|
/**
|
997
1072
|
* Event with sample error.
|
@@ -1011,6 +1086,9 @@ export interface LoggerEvent {
|
|
1011
1086
|
event: Event10;
|
1012
1087
|
message: LoggingMessage;
|
1013
1088
|
}
|
1089
|
+
/**
|
1090
|
+
* Message written to Python log.
|
1091
|
+
*/
|
1014
1092
|
export interface LoggingMessage {
|
1015
1093
|
name: Name7;
|
1016
1094
|
level: Level;
|
@@ -1027,6 +1105,7 @@ export interface InfoEvent {
|
|
1027
1105
|
timestamp: Timestamp11;
|
1028
1106
|
pending: Pending11;
|
1029
1107
|
event: Event11;
|
1108
|
+
source: Source4;
|
1030
1109
|
data: JsonValue;
|
1031
1110
|
}
|
1032
1111
|
/**
|
@@ -1063,15 +1142,24 @@ export interface ModelUsage2 {
|
|
1063
1142
|
export interface Attachments {
|
1064
1143
|
[k: string]: string;
|
1065
1144
|
}
|
1145
|
+
/**
|
1146
|
+
* Limit encontered by sample.
|
1147
|
+
*/
|
1066
1148
|
export interface EvalSampleLimit {
|
1067
1149
|
type: Type13;
|
1068
1150
|
limit: Limit2;
|
1069
1151
|
}
|
1152
|
+
/**
|
1153
|
+
* Score reductions.
|
1154
|
+
*/
|
1070
1155
|
export interface EvalSampleReductions {
|
1071
1156
|
scorer: Scorer1;
|
1072
1157
|
reducer: Reducer1;
|
1073
1158
|
samples: Samples2;
|
1074
1159
|
}
|
1160
|
+
/**
|
1161
|
+
* Score and sample_id scored.
|
1162
|
+
*/
|
1075
1163
|
export interface EvalSampleScore {
|
1076
1164
|
value: Value2;
|
1077
1165
|
answer: Answer1;
|
@@ -14,7 +14,13 @@ export const ModelTokenTable: React.FC<ModelTokenTable> = ({
|
|
14
14
|
<TokenHeader />
|
15
15
|
<tbody>
|
16
16
|
{Object.keys(model_usage).map((key) => {
|
17
|
-
return
|
17
|
+
return (
|
18
|
+
<TokenRow
|
19
|
+
key={key}
|
20
|
+
model={`${key}-token-row`}
|
21
|
+
usage={model_usage[key]}
|
22
|
+
/>
|
23
|
+
);
|
18
24
|
})}
|
19
25
|
</tbody>
|
20
26
|
</TokenTable>
|
@@ -68,12 +68,14 @@ export const ModelUsagePanel: React.FC<ModelUsageProps> = ({ usage }) => {
|
|
68
68
|
|
69
69
|
return (
|
70
70
|
<div className={clsx("text-size-small", styles.wrapper)}>
|
71
|
-
{rows.map((row) => {
|
71
|
+
{rows.map((row, idx) => {
|
72
72
|
if (row.label === "---") {
|
73
|
-
return <div className={styles.separator}></div>;
|
73
|
+
return (
|
74
|
+
<div key={`$usage-sep-${idx}`} className={styles.separator}></div>
|
75
|
+
);
|
74
76
|
} else {
|
75
77
|
return (
|
76
|
-
<Fragment>
|
78
|
+
<Fragment key={`$usage-row-${idx}`}>
|
77
79
|
<div
|
78
80
|
className={clsx(
|
79
81
|
"text-style-label",
|
@@ -79,32 +79,6 @@ export const WorkSpaceView: React.FC<WorkSpaceViewProps> = ({
|
|
79
79
|
[setSelectedTab],
|
80
80
|
);
|
81
81
|
|
82
|
-
// Compute tab panels anytime the tabs change
|
83
|
-
const tabPanels = useMemo(() => {
|
84
|
-
return Object.keys(tabs).map((key) => {
|
85
|
-
const tab = tabs[key];
|
86
|
-
return (
|
87
|
-
<TabPanel
|
88
|
-
id={tab.id}
|
89
|
-
title={tab.label}
|
90
|
-
onSelected={onSelected}
|
91
|
-
selected={selectedTab === tab.id}
|
92
|
-
scrollable={!!tab.scrollable}
|
93
|
-
scrollRef={tab.scrollRef}
|
94
|
-
scrollPosition={workspaceTabScrollPositionRef.current?.[tab.id]}
|
95
|
-
setScrollPosition={useCallback(
|
96
|
-
(position: number) => {
|
97
|
-
onScroll(tab.id, position);
|
98
|
-
},
|
99
|
-
[onScroll],
|
100
|
-
)}
|
101
|
-
>
|
102
|
-
{tab.content()}
|
103
|
-
</TabPanel>
|
104
|
-
);
|
105
|
-
});
|
106
|
-
}, [tabs, selectedTab]);
|
107
|
-
|
108
82
|
if (evalSpec === undefined) {
|
109
83
|
return <EmptyPanel />;
|
110
84
|
} else {
|
@@ -150,7 +124,31 @@ export const WorkSpaceView: React.FC<WorkSpaceViewProps> = ({
|
|
150
124
|
tabControlsClassName={clsx(styles.tabs, "text-size-smaller")}
|
151
125
|
tabPanelsClassName={clsx(styles.tabPanels)}
|
152
126
|
>
|
153
|
-
{tabPanels}
|
127
|
+
{Object.keys(tabs).map((key) => {
|
128
|
+
const tab = tabs[key];
|
129
|
+
return (
|
130
|
+
<TabPanel
|
131
|
+
key={tab.id}
|
132
|
+
id={tab.id}
|
133
|
+
title={tab.label}
|
134
|
+
onSelected={onSelected}
|
135
|
+
selected={selectedTab === tab.id}
|
136
|
+
scrollable={!!tab.scrollable}
|
137
|
+
scrollRef={tab.scrollRef}
|
138
|
+
scrollPosition={
|
139
|
+
workspaceTabScrollPositionRef.current?.[tab.id]
|
140
|
+
}
|
141
|
+
setScrollPosition={useCallback(
|
142
|
+
(position: number) => {
|
143
|
+
onScroll(tab.id, position);
|
144
|
+
},
|
145
|
+
[onScroll],
|
146
|
+
)}
|
147
|
+
>
|
148
|
+
{tab.content()}
|
149
|
+
</TabPanel>
|
150
|
+
);
|
151
|
+
})}
|
154
152
|
</TabSet>
|
155
153
|
</div>
|
156
154
|
</div>
|
@@ -30,16 +30,6 @@ export const PrimaryBar: React.FC<PrimaryBarProps> = ({
|
|
30
30
|
evalSpec,
|
31
31
|
setOffcanvas,
|
32
32
|
}) => {
|
33
|
-
let statusPanel;
|
34
|
-
if (status === "success") {
|
35
|
-
statusPanel = <ResultsPanel results={evalResults} />;
|
36
|
-
} else if (status === "cancelled") {
|
37
|
-
statusPanel = <CancelledPanel sampleCount={samples?.length || 0} />;
|
38
|
-
} else if (status === "started") {
|
39
|
-
statusPanel = <RunningPanel sampleCount={samples?.length || 0} />;
|
40
|
-
} else if (status === "error") {
|
41
|
-
statusPanel = <ErroredPanel sampleCount={samples?.length || 0} />;
|
42
|
-
}
|
43
33
|
const logFileName = file ? filename(file) : "";
|
44
34
|
|
45
35
|
const handleToggle = useCallback(() => {
|
@@ -103,7 +93,18 @@ export const PrimaryBar: React.FC<PrimaryBarProps> = ({
|
|
103
93
|
</div>
|
104
94
|
</div>
|
105
95
|
<div className={clsx(styles.taskStatus, "navbar-text")}>
|
106
|
-
{statusPanel}
|
96
|
+
{status === "success" ? (
|
97
|
+
<ResultsPanel results={evalResults} />
|
98
|
+
) : undefined}
|
99
|
+
{status === "cancelled" ? (
|
100
|
+
<CancelledPanel sampleCount={samples?.length || 0} />
|
101
|
+
) : undefined}
|
102
|
+
{status === "started" ? (
|
103
|
+
<RunningPanel sampleCount={samples?.length || 0} />
|
104
|
+
) : undefined}
|
105
|
+
{status === "error" ? (
|
106
|
+
<ErroredPanel sampleCount={samples?.length || 0} />
|
107
|
+
) : undefined}
|
107
108
|
</div>
|
108
109
|
<div id="task-created" style={{ display: "none" }}>
|
109
110
|
{evalSpec?.created}
|
@@ -14,13 +14,13 @@
|
|
14
14
|
flex-direction: row;
|
15
15
|
flex-wrap: wrap;
|
16
16
|
justify-content: end;
|
17
|
-
height: 100%;
|
18
17
|
align-items: center;
|
19
18
|
margin-top: 0.2rem;
|
20
19
|
padding-bottom: 0.4rem;
|
21
20
|
row-gap: 1em;
|
22
21
|
max-height: 15em;
|
23
22
|
overflow: scroll;
|
23
|
+
align-items: baseline;
|
24
24
|
}
|
25
25
|
|
26
26
|
.verticalMetricReducer {
|
@@ -39,14 +39,26 @@
|
|
39
39
|
}
|
40
40
|
|
41
41
|
.verticalMetricValue {
|
42
|
-
font-size: var(--inspect-font-size-larger);
|
43
42
|
font-weight: 500;
|
44
43
|
text-align: center;
|
45
44
|
}
|
46
45
|
|
46
|
+
.multiScorer {
|
47
|
+
padding-left: 0;
|
48
|
+
height: 100%;
|
49
|
+
display: flex;
|
50
|
+
flex-direction: column;
|
51
|
+
padding: 0.5em 1em;
|
52
|
+
}
|
53
|
+
|
54
|
+
.multiScorerIndent {
|
55
|
+
padding-left: 1.5em;
|
56
|
+
}
|
57
|
+
|
47
58
|
.multiScorerReducer {
|
48
59
|
text-align: center;
|
49
60
|
margin-bottom: -0.3rem;
|
61
|
+
margin-top: 0.2em;
|
50
62
|
}
|
51
63
|
|
52
64
|
.multiScorerLabel {
|
@@ -58,10 +70,21 @@
|
|
58
70
|
.multiScorerValue {
|
59
71
|
display: grid;
|
60
72
|
grid-template-columns: auto auto;
|
73
|
+
grid-auto-rows: auto;
|
61
74
|
grid-column-gap: 0.3rem;
|
62
75
|
grid-row-gap: 0;
|
76
|
+
padding-top: 0.3em;
|
63
77
|
}
|
64
78
|
|
65
79
|
.multiScorerValueContent {
|
66
80
|
font-weight: 600;
|
81
|
+
text-align: center;
|
82
|
+
}
|
83
|
+
|
84
|
+
.multiScoreMetricGrid {
|
85
|
+
display: grid;
|
86
|
+
grid-template-rows: auto auto;
|
87
|
+
column-gap: 1em;
|
88
|
+
padding: 0 0.2em;
|
89
|
+
justify-content: center;
|
67
90
|
}
|