inspect-ai 0.3.87__py3-none-any.whl → 0.3.89__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/_cli/eval.py +16 -0
- inspect_ai/_cli/score.py +1 -12
- inspect_ai/_cli/util.py +4 -2
- inspect_ai/_display/core/footer.py +2 -2
- inspect_ai/_display/plain/display.py +2 -2
- inspect_ai/_eval/context.py +7 -1
- inspect_ai/_eval/eval.py +51 -27
- inspect_ai/_eval/evalset.py +27 -10
- inspect_ai/_eval/loader.py +7 -8
- inspect_ai/_eval/run.py +23 -31
- inspect_ai/_eval/score.py +18 -1
- inspect_ai/_eval/task/log.py +5 -13
- inspect_ai/_eval/task/resolved.py +1 -0
- inspect_ai/_eval/task/run.py +231 -244
- inspect_ai/_eval/task/task.py +25 -2
- inspect_ai/_eval/task/util.py +1 -8
- inspect_ai/_util/constants.py +1 -0
- inspect_ai/_util/json.py +8 -3
- inspect_ai/_util/registry.py +30 -13
- inspect_ai/_view/www/App.css +5 -0
- inspect_ai/_view/www/dist/assets/index.css +55 -18
- inspect_ai/_view/www/dist/assets/index.js +550 -458
- inspect_ai/_view/www/log-schema.json +84 -1
- inspect_ai/_view/www/src/metadata/MetaDataView.module.css +1 -1
- inspect_ai/_view/www/src/metadata/MetaDataView.tsx +13 -8
- inspect_ai/_view/www/src/metadata/RenderedContent.tsx +3 -0
- inspect_ai/_view/www/src/plan/ModelCard.module.css +16 -0
- inspect_ai/_view/www/src/plan/ModelCard.tsx +93 -0
- inspect_ai/_view/www/src/samples/transcript/ModelEventView.tsx +5 -1
- inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.tsx +3 -3
- inspect_ai/_view/www/src/samples/transcript/state/StateEventView.tsx +6 -29
- inspect_ai/_view/www/src/types/log.d.ts +150 -129
- inspect_ai/_view/www/src/workspace/navbar/ModelRolesView.module.css +16 -0
- inspect_ai/_view/www/src/workspace/navbar/ModelRolesView.tsx +43 -0
- inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.module.css +1 -1
- inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.tsx +5 -0
- inspect_ai/_view/www/src/workspace/tabs/InfoTab.tsx +2 -0
- inspect_ai/agent/_agent.py +12 -0
- inspect_ai/agent/_as_tool.py +1 -1
- inspect_ai/agent/_bridge/bridge.py +9 -2
- inspect_ai/agent/_react.py +142 -74
- inspect_ai/agent/_run.py +13 -2
- inspect_ai/agent/_types.py +6 -0
- inspect_ai/approval/_apply.py +6 -9
- inspect_ai/approval/_approver.py +3 -3
- inspect_ai/approval/_auto.py +2 -2
- inspect_ai/approval/_call.py +20 -4
- inspect_ai/approval/_human/approver.py +3 -3
- inspect_ai/approval/_human/manager.py +2 -2
- inspect_ai/approval/_human/panel.py +3 -3
- inspect_ai/approval/_policy.py +3 -3
- inspect_ai/log/__init__.py +2 -0
- inspect_ai/log/_log.py +23 -2
- inspect_ai/log/_model.py +58 -0
- inspect_ai/log/_recorders/file.py +14 -3
- inspect_ai/log/_transcript.py +3 -0
- inspect_ai/model/__init__.py +2 -0
- inspect_ai/model/_call_tools.py +15 -2
- inspect_ai/model/_model.py +49 -3
- inspect_ai/model/_openai.py +151 -21
- inspect_ai/model/_providers/anthropic.py +25 -14
- inspect_ai/model/_providers/bedrock.py +3 -3
- inspect_ai/model/_providers/cloudflare.py +29 -108
- inspect_ai/model/_providers/google.py +21 -10
- inspect_ai/model/_providers/grok.py +23 -17
- inspect_ai/model/_providers/groq.py +61 -37
- inspect_ai/model/_providers/llama_cpp_python.py +8 -9
- inspect_ai/model/_providers/mistral.py +8 -3
- inspect_ai/model/_providers/ollama.py +8 -9
- inspect_ai/model/_providers/openai.py +53 -157
- inspect_ai/model/_providers/openai_compatible.py +195 -0
- inspect_ai/model/_providers/openrouter.py +4 -15
- inspect_ai/model/_providers/providers.py +11 -0
- inspect_ai/model/_providers/together.py +25 -23
- inspect_ai/model/_trim.py +83 -0
- inspect_ai/solver/_plan.py +5 -3
- inspect_ai/tool/_tool_call.py +3 -0
- inspect_ai/tool/_tool_def.py +8 -2
- inspect_ai/util/__init__.py +3 -0
- inspect_ai/util/_concurrency.py +15 -2
- {inspect_ai-0.3.87.dist-info → inspect_ai-0.3.89.dist-info}/METADATA +1 -1
- {inspect_ai-0.3.87.dist-info → inspect_ai-0.3.89.dist-info}/RECORD +86 -81
- inspect_ai/_eval/task/rundir.py +0 -78
- inspect_ai/_view/www/node_modules/flatted/python/flatted.py +0 -149
- {inspect_ai-0.3.87.dist-info → inspect_ai-0.3.89.dist-info}/WHEEL +0 -0
- {inspect_ai-0.3.87.dist-info → inspect_ai-0.3.89.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.87.dist-info → inspect_ai-0.3.89.dist-info}/licenses/LICENSE +0 -0
- {inspect_ai-0.3.87.dist-info → inspect_ai-0.3.89.dist-info}/top_level.txt +0 -0
@@ -24,12 +24,57 @@ export type SampleIds = (number | string)[] | null;
|
|
24
24
|
export type Shuffled = boolean | null;
|
25
25
|
export type Type = string;
|
26
26
|
export type Model = string;
|
27
|
+
export type MaxRetries = number | null;
|
28
|
+
export type Timeout = number | null;
|
29
|
+
export type MaxConnections = number | null;
|
30
|
+
export type SystemMessage = string | null;
|
31
|
+
export type MaxTokens = number | null;
|
32
|
+
export type TopP = number | null;
|
33
|
+
export type Temperature = number | null;
|
34
|
+
export type StopSeqs = string[] | null;
|
35
|
+
export type BestOf = number | null;
|
36
|
+
export type FrequencyPenalty = number | null;
|
37
|
+
export type PresencePenalty = number | null;
|
38
|
+
export type LogitBias = {
|
39
|
+
[k: string]: number;
|
40
|
+
} | null;
|
41
|
+
export type Seed = number | null;
|
42
|
+
export type TopK = number | null;
|
43
|
+
export type NumChoices = number | null;
|
44
|
+
export type Logprobs = boolean | null;
|
45
|
+
export type TopLogprobs = number | null;
|
46
|
+
export type ParallelToolCalls = boolean | null;
|
47
|
+
export type InternalTools = boolean | null;
|
48
|
+
export type MaxToolOutput = number | null;
|
49
|
+
export type CachePrompt = "auto" | boolean | null;
|
50
|
+
export type ReasoningEffort = ("low" | "medium" | "high") | null;
|
51
|
+
export type ReasoningTokens = number | null;
|
52
|
+
export type ReasoningHistory = ("none" | "all" | "last" | "auto") | null;
|
53
|
+
export type Name1 = string;
|
54
|
+
export type Type1 =
|
55
|
+
| ("string" | "integer" | "number" | "boolean" | "array" | "object" | "null")
|
56
|
+
| null;
|
57
|
+
export type Description = string | null;
|
58
|
+
export type Enum = unknown[] | null;
|
59
|
+
export type Properties = {
|
60
|
+
[k: string]: JSONSchema;
|
61
|
+
} | null;
|
62
|
+
export type Additionalproperties = JSONSchema | boolean | null;
|
63
|
+
export type Anyof = JSONSchema[] | null;
|
64
|
+
export type Required = string[] | null;
|
65
|
+
export type Description1 = string | null;
|
66
|
+
export type Strict = boolean | null;
|
27
67
|
export type ModelBaseUrl = string | null;
|
68
|
+
export type ModelRoles = {
|
69
|
+
[k: string]: EvalModelConfig;
|
70
|
+
} | null;
|
71
|
+
export type Model1 = string;
|
72
|
+
export type BaseUrl = string | null;
|
28
73
|
export type Limit = number | [unknown, unknown] | null;
|
29
74
|
export type SampleId = string | number | (string | number)[] | null;
|
30
75
|
export type Epochs = number | null;
|
31
76
|
export type EpochsReducer = string[] | null;
|
32
|
-
export type Name1 = string;
|
77
|
+
export type Name2 = string;
|
33
78
|
export type Tools = string | string[];
|
34
79
|
export type Approvers = ApproverPolicyConfig[];
|
35
80
|
export type FailOnError = boolean | number | null;
|
@@ -47,12 +92,12 @@ export type LogImages = boolean | null;
|
|
47
92
|
export type LogBuffer = number | null;
|
48
93
|
export type LogShared = number | null;
|
49
94
|
export type ScoreDisplay = boolean | null;
|
50
|
-
export type Type1 = "git";
|
95
|
+
export type Type2 = "git";
|
51
96
|
export type Origin = string;
|
52
97
|
export type Commit = string;
|
53
98
|
export type Metadata = {} | null;
|
54
99
|
export type Scorers = EvalScorer[] | null;
|
55
|
-
export type Name2 = string;
|
100
|
+
export type Name3 = string;
|
56
101
|
export type Options = {} | null;
|
57
102
|
export type Metrics =
|
58
103
|
| (
|
@@ -65,7 +110,7 @@ export type Metrics =
|
|
65
110
|
[k: string]: EvalMetricDefinition[];
|
66
111
|
}
|
67
112
|
| null;
|
68
|
-
export type Name3 = string;
|
113
|
+
export type Name4 = string;
|
69
114
|
export type Options1 = {} | null;
|
70
115
|
export type Metadata1 = {} | null;
|
71
116
|
export type Metrics1 =
|
@@ -74,49 +119,9 @@ export type Metrics1 =
|
|
74
119
|
[k: string]: EvalMetricDefinition[];
|
75
120
|
}
|
76
121
|
| null;
|
77
|
-
export type Name4 = string;
|
122
|
+
export type Name5 = string;
|
78
123
|
export type Solver1 = string;
|
79
124
|
export type Steps = EvalPlanStep[];
|
80
|
-
export type MaxRetries = number | null;
|
81
|
-
export type Timeout = number | null;
|
82
|
-
export type MaxConnections = number | null;
|
83
|
-
export type SystemMessage = string | null;
|
84
|
-
export type MaxTokens = number | null;
|
85
|
-
export type TopP = number | null;
|
86
|
-
export type Temperature = number | null;
|
87
|
-
export type StopSeqs = string[] | null;
|
88
|
-
export type BestOf = number | null;
|
89
|
-
export type FrequencyPenalty = number | null;
|
90
|
-
export type PresencePenalty = number | null;
|
91
|
-
export type LogitBias = {
|
92
|
-
[k: string]: number;
|
93
|
-
} | null;
|
94
|
-
export type Seed = number | null;
|
95
|
-
export type TopK = number | null;
|
96
|
-
export type NumChoices = number | null;
|
97
|
-
export type Logprobs = boolean | null;
|
98
|
-
export type TopLogprobs = number | null;
|
99
|
-
export type ParallelToolCalls = boolean | null;
|
100
|
-
export type InternalTools = boolean | null;
|
101
|
-
export type MaxToolOutput = number | null;
|
102
|
-
export type CachePrompt = "auto" | boolean | null;
|
103
|
-
export type ReasoningEffort = ("low" | "medium" | "high") | null;
|
104
|
-
export type ReasoningTokens = number | null;
|
105
|
-
export type ReasoningHistory = ("none" | "all" | "last" | "auto") | null;
|
106
|
-
export type Name5 = string;
|
107
|
-
export type Type2 =
|
108
|
-
| ("string" | "integer" | "number" | "boolean" | "array" | "object" | "null")
|
109
|
-
| null;
|
110
|
-
export type Description = string | null;
|
111
|
-
export type Enum = unknown[] | null;
|
112
|
-
export type Properties = {
|
113
|
-
[k: string]: JSONSchema;
|
114
|
-
} | null;
|
115
|
-
export type Additionalproperties = JSONSchema | boolean | null;
|
116
|
-
export type Anyof = JSONSchema[] | null;
|
117
|
-
export type Required = string[] | null;
|
118
|
-
export type Description1 = string | null;
|
119
|
-
export type Strict = boolean | null;
|
120
125
|
export type TotalSamples = number;
|
121
126
|
export type CompletedSamples = number;
|
122
127
|
export type Name6 = string;
|
@@ -210,7 +215,8 @@ export type ParseError = string | null;
|
|
210
215
|
export type Title = string | null;
|
211
216
|
export type Format2 = "text" | "markdown";
|
212
217
|
export type Content3 = string;
|
213
|
-
export type Model1 = string | null;
|
218
|
+
export type Type8 = string | null;
|
219
|
+
export type Model2 = string | null;
|
214
220
|
export type Id5 = string | null;
|
215
221
|
export type Content4 =
|
216
222
|
| string
|
@@ -225,7 +231,7 @@ export type Source3 = ("input" | "generate") | null;
|
|
225
231
|
export type Role3 = "tool";
|
226
232
|
export type ToolCallId1 = string | null;
|
227
233
|
export type Function1 = string | null;
|
228
|
-
export type Type8 =
|
234
|
+
export type Type9 =
|
229
235
|
| "parsing"
|
230
236
|
| "timeout"
|
231
237
|
| "unicode_decode"
|
@@ -246,7 +252,7 @@ export type Messages = (
|
|
246
252
|
| ChatMessageAssistant
|
247
253
|
| ChatMessageTool
|
248
254
|
)[];
|
249
|
-
export type Model2 = string;
|
255
|
+
export type Model3 = string;
|
250
256
|
export type StopReason =
|
251
257
|
| "stop"
|
252
258
|
| "max_tokens"
|
@@ -305,7 +311,7 @@ export type Timestamp1 = string;
|
|
305
311
|
export type WorkingStart1 = number;
|
306
312
|
export type Pending1 = boolean | null;
|
307
313
|
export type Event1 = "sample_limit";
|
308
|
-
export type Type9 =
|
314
|
+
export type Type10 =
|
309
315
|
| "message"
|
310
316
|
| "time"
|
311
317
|
| "working"
|
@@ -345,7 +351,8 @@ export type Timestamp5 = string;
|
|
345
351
|
export type WorkingStart5 = number;
|
346
352
|
export type Pending5 = boolean | null;
|
347
353
|
export type Event5 = "model";
|
348
|
-
export type Model3 = string;
|
354
|
+
export type Model4 = string;
|
355
|
+
export type Role4 = string | null;
|
349
356
|
export type Input3 = (
|
350
357
|
| ChatMessageSystem
|
351
358
|
| ChatMessageUser
|
@@ -354,7 +361,7 @@ export type Input3 = (
|
|
354
361
|
)[];
|
355
362
|
export type Name8 = string;
|
356
363
|
export type Description2 = string;
|
357
|
-
export type Type10 = "object";
|
364
|
+
export type Type11 = "object";
|
358
365
|
export type Required1 = string[];
|
359
366
|
export type Additionalproperties1 = boolean;
|
360
367
|
export type Tools1 = ToolInfo[];
|
@@ -369,7 +376,7 @@ export type Timestamp6 = string;
|
|
369
376
|
export type WorkingStart6 = number;
|
370
377
|
export type Pending6 = boolean | null;
|
371
378
|
export type Event6 = "tool";
|
372
|
-
export type Type11 = "function";
|
379
|
+
export type Type12 = "function";
|
373
380
|
export type Id7 = string;
|
374
381
|
export type Function2 = string;
|
375
382
|
export type Result1 =
|
@@ -447,14 +454,14 @@ export type WorkingStart13 = number;
|
|
447
454
|
export type Pending13 = boolean | null;
|
448
455
|
export type Event13 = "step";
|
449
456
|
export type Action1 = "begin" | "end";
|
450
|
-
export type Type12 = string | null;
|
457
|
+
export type Type13 = string | null;
|
451
458
|
export type Name11 = string;
|
452
459
|
export type Timestamp14 = string;
|
453
460
|
export type WorkingStart14 = number;
|
454
461
|
export type Pending14 = boolean | null;
|
455
462
|
export type Event14 = "subtask";
|
456
463
|
export type Name12 = string;
|
457
|
-
export type Type13 = string | null;
|
464
|
+
export type Type14 = string | null;
|
458
465
|
export type Events2 = (
|
459
466
|
| SampleInitEvent
|
460
467
|
| SampleLimitEvent
|
@@ -515,7 +522,7 @@ export type Events = (
|
|
515
522
|
export type TotalTime = number | null;
|
516
523
|
export type WorkingTime3 = number | null;
|
517
524
|
export type Uuid = string | null;
|
518
|
-
export type Type14 =
|
525
|
+
export type Type15 =
|
519
526
|
| "context"
|
520
527
|
| "time"
|
521
528
|
| "working"
|
@@ -576,8 +583,10 @@ export interface EvalSpec {
|
|
576
583
|
dataset: EvalDataset;
|
577
584
|
sandbox: SandboxEnvironmentSpec | null;
|
578
585
|
model: Model;
|
586
|
+
model_generate_config: GenerateConfig;
|
579
587
|
model_base_url: ModelBaseUrl;
|
580
588
|
model_args: ModelArgs;
|
589
|
+
model_roles: ModelRoles;
|
581
590
|
config: EvalConfig;
|
582
591
|
revision: EvalRevision | null;
|
583
592
|
packages: Packages;
|
@@ -607,7 +616,73 @@ export interface SandboxEnvironmentSpec {
|
|
607
616
|
export interface Config {
|
608
617
|
[k: string]: unknown;
|
609
618
|
}
|
619
|
+
/**
|
620
|
+
* Model generation options.
|
621
|
+
*/
|
622
|
+
export interface GenerateConfig {
|
623
|
+
max_retries: MaxRetries;
|
624
|
+
timeout: Timeout;
|
625
|
+
max_connections: MaxConnections;
|
626
|
+
system_message: SystemMessage;
|
627
|
+
max_tokens: MaxTokens;
|
628
|
+
top_p: TopP;
|
629
|
+
temperature: Temperature;
|
630
|
+
stop_seqs: StopSeqs;
|
631
|
+
best_of: BestOf;
|
632
|
+
frequency_penalty: FrequencyPenalty;
|
633
|
+
presence_penalty: PresencePenalty;
|
634
|
+
logit_bias: LogitBias;
|
635
|
+
seed: Seed;
|
636
|
+
top_k: TopK;
|
637
|
+
num_choices: NumChoices;
|
638
|
+
logprobs: Logprobs;
|
639
|
+
top_logprobs: TopLogprobs;
|
640
|
+
parallel_tool_calls: ParallelToolCalls;
|
641
|
+
internal_tools: InternalTools;
|
642
|
+
max_tool_output: MaxToolOutput;
|
643
|
+
cache_prompt: CachePrompt;
|
644
|
+
reasoning_effort: ReasoningEffort;
|
645
|
+
reasoning_tokens: ReasoningTokens;
|
646
|
+
reasoning_history: ReasoningHistory;
|
647
|
+
response_schema: ResponseSchema | null;
|
648
|
+
}
|
649
|
+
/**
|
650
|
+
* Schema for model response when using Structured Output.
|
651
|
+
*/
|
652
|
+
export interface ResponseSchema {
|
653
|
+
name: Name1;
|
654
|
+
json_schema: JSONSchema;
|
655
|
+
description: Description1;
|
656
|
+
strict: Strict;
|
657
|
+
}
|
658
|
+
/**
|
659
|
+
* JSON Schema for type.
|
660
|
+
*/
|
661
|
+
export interface JSONSchema {
|
662
|
+
type: Type1;
|
663
|
+
description: Description;
|
664
|
+
default: Default;
|
665
|
+
enum: Enum;
|
666
|
+
items: JSONSchema | null;
|
667
|
+
properties: Properties;
|
668
|
+
additionalProperties: Additionalproperties;
|
669
|
+
anyOf: Anyof;
|
670
|
+
required: Required;
|
671
|
+
}
|
672
|
+
export interface Default {
|
673
|
+
[k: string]: unknown;
|
674
|
+
}
|
610
675
|
export interface ModelArgs {}
|
676
|
+
/**
|
677
|
+
* Model config.
|
678
|
+
*/
|
679
|
+
export interface EvalModelConfig {
|
680
|
+
model: Model1;
|
681
|
+
config: GenerateConfig;
|
682
|
+
base_url: BaseUrl;
|
683
|
+
args: Args;
|
684
|
+
}
|
685
|
+
export interface Args {}
|
611
686
|
/**
|
612
687
|
* Configuration used for evaluation.
|
613
688
|
*/
|
@@ -653,7 +728,7 @@ export interface ApprovalPolicyConfig {
|
|
653
728
|
* ```
|
654
729
|
*/
|
655
730
|
export interface ApproverPolicyConfig {
|
656
|
-
name: Name1;
|
731
|
+
name: Name2;
|
657
732
|
tools: Tools;
|
658
733
|
params: Params;
|
659
734
|
}
|
@@ -662,7 +737,7 @@ export interface Params {}
|
|
662
737
|
* Git revision for evaluation.
|
663
738
|
*/
|
664
739
|
export interface EvalRevision {
|
665
|
-
type: Type1;
|
740
|
+
type: Type2;
|
666
741
|
origin: Origin;
|
667
742
|
commit: Commit;
|
668
743
|
}
|
@@ -670,23 +745,23 @@ export interface Packages {
|
|
670
745
|
[k: string]: string;
|
671
746
|
}
|
672
747
|
export interface EvalScorer {
|
673
|
-
name: Name2;
|
748
|
+
name: Name3;
|
674
749
|
options: Options;
|
675
750
|
metrics: Metrics;
|
676
751
|
metadata: Metadata1;
|
677
752
|
}
|
678
753
|
export interface EvalMetricDefinition {
|
679
|
-
name: Name3;
|
754
|
+
name: Name4;
|
680
755
|
options: Options1;
|
681
756
|
}
|
682
757
|
/**
|
683
758
|
* Plan (solvers) used in evaluation.
|
684
759
|
*/
|
685
760
|
export interface EvalPlan {
|
686
|
-
name: Name4;
|
761
|
+
name: Name5;
|
687
762
|
steps: Steps;
|
688
763
|
finish: EvalPlanStep | null;
|
689
|
-
config: GenerateConfig;
|
764
|
+
config: GenerateConfig1;
|
690
765
|
}
|
691
766
|
/**
|
692
767
|
* Solver step.
|
@@ -699,7 +774,7 @@ export interface Params1 {}
|
|
699
774
|
/**
|
700
775
|
* Model generation options.
|
701
776
|
*/
|
702
|
-
export interface GenerateConfig {
|
777
|
+
export interface GenerateConfig1 {
|
703
778
|
max_retries: MaxRetries;
|
704
779
|
timeout: Timeout;
|
705
780
|
max_connections: MaxConnections;
|
@@ -726,32 +801,6 @@ export interface GenerateConfig {
|
|
726
801
|
reasoning_history: ReasoningHistory;
|
727
802
|
response_schema: ResponseSchema | null;
|
728
803
|
}
|
729
|
-
/**
|
730
|
-
* Schema for model response when using Structured Output.
|
731
|
-
*/
|
732
|
-
export interface ResponseSchema {
|
733
|
-
name: Name5;
|
734
|
-
json_schema: JSONSchema;
|
735
|
-
description: Description1;
|
736
|
-
strict: Strict;
|
737
|
-
}
|
738
|
-
/**
|
739
|
-
* JSON Schema for type.
|
740
|
-
*/
|
741
|
-
export interface JSONSchema {
|
742
|
-
type: Type2;
|
743
|
-
description: Description;
|
744
|
-
default: Default;
|
745
|
-
enum: Enum;
|
746
|
-
items: JSONSchema | null;
|
747
|
-
properties: Properties;
|
748
|
-
additionalProperties: Additionalproperties;
|
749
|
-
anyOf: Anyof;
|
750
|
-
required: Required;
|
751
|
-
}
|
752
|
-
export interface Default {
|
753
|
-
[k: string]: unknown;
|
754
|
-
}
|
755
804
|
/**
|
756
805
|
* Scoring results from evaluation.
|
757
806
|
*/
|
@@ -916,7 +965,7 @@ export interface ChatMessageAssistant {
|
|
916
965
|
internal: unknown;
|
917
966
|
role: Role2;
|
918
967
|
tool_calls: ToolCalls;
|
919
|
-
model: Model1;
|
968
|
+
model: Model2;
|
920
969
|
}
|
921
970
|
export interface ToolCall {
|
922
971
|
id: Id4;
|
@@ -925,6 +974,7 @@ export interface ToolCall {
|
|
925
974
|
internal: unknown;
|
926
975
|
parse_error: ParseError;
|
927
976
|
view: ToolCallContent | null;
|
977
|
+
type: Type8;
|
928
978
|
}
|
929
979
|
export interface Arguments {}
|
930
980
|
/**
|
@@ -949,14 +999,14 @@ export interface ChatMessageTool {
|
|
949
999
|
error: ToolCallError | null;
|
950
1000
|
}
|
951
1001
|
export interface ToolCallError {
|
952
|
-
type: Type8;
|
1002
|
+
type: Type9;
|
953
1003
|
message: Message1;
|
954
1004
|
}
|
955
1005
|
/**
|
956
1006
|
* Output from model generation.
|
957
1007
|
*/
|
958
1008
|
export interface ModelOutput {
|
959
|
-
model: Model2;
|
1009
|
+
model: Model3;
|
960
1010
|
choices: Choices1;
|
961
1011
|
usage: ModelUsage1 | null;
|
962
1012
|
time: Time;
|
@@ -1037,7 +1087,7 @@ export interface SampleLimitEvent {
|
|
1037
1087
|
working_start: WorkingStart1;
|
1038
1088
|
pending: Pending1;
|
1039
1089
|
event: Event1;
|
1040
|
-
type: Type9;
|
1090
|
+
type: Type10;
|
1041
1091
|
message: Message2;
|
1042
1092
|
limit: Limit1;
|
1043
1093
|
}
|
@@ -1100,11 +1150,12 @@ export interface ModelEvent {
|
|
1100
1150
|
working_start: WorkingStart5;
|
1101
1151
|
pending: Pending5;
|
1102
1152
|
event: Event5;
|
1103
|
-
model: Model3;
|
1153
|
+
model: Model4;
|
1154
|
+
role: Role4;
|
1104
1155
|
input: Input3;
|
1105
1156
|
tools: Tools1;
|
1106
1157
|
tool_choice: ToolChoice;
|
1107
|
-
config: GenerateConfig1;
|
1158
|
+
config: GenerateConfig;
|
1108
1159
|
output: ModelOutput;
|
1109
1160
|
error: Error1;
|
1110
1161
|
cache: Cache;
|
@@ -1147,7 +1198,7 @@ export interface ToolInfo {
|
|
1147
1198
|
* Description of tool parameters object in JSON Schema format.
|
1148
1199
|
*/
|
1149
1200
|
export interface ToolParams {
|
1150
|
-
type: Type10;
|
1201
|
+
type: Type11;
|
1151
1202
|
properties: Properties1;
|
1152
1203
|
required: Required1;
|
1153
1204
|
additionalProperties: Additionalproperties1;
|
@@ -1158,36 +1209,6 @@ export interface Properties1 {
|
|
1158
1209
|
export interface ToolFunction {
|
1159
1210
|
name: Name9;
|
1160
1211
|
}
|
1161
|
-
/**
|
1162
|
-
* Model generation options.
|
1163
|
-
*/
|
1164
|
-
export interface GenerateConfig1 {
|
1165
|
-
max_retries: MaxRetries;
|
1166
|
-
timeout: Timeout;
|
1167
|
-
max_connections: MaxConnections;
|
1168
|
-
system_message: SystemMessage;
|
1169
|
-
max_tokens: MaxTokens;
|
1170
|
-
top_p: TopP;
|
1171
|
-
temperature: Temperature;
|
1172
|
-
stop_seqs: StopSeqs;
|
1173
|
-
best_of: BestOf;
|
1174
|
-
frequency_penalty: FrequencyPenalty;
|
1175
|
-
presence_penalty: PresencePenalty;
|
1176
|
-
logit_bias: LogitBias;
|
1177
|
-
seed: Seed;
|
1178
|
-
top_k: TopK;
|
1179
|
-
num_choices: NumChoices;
|
1180
|
-
logprobs: Logprobs;
|
1181
|
-
top_logprobs: TopLogprobs;
|
1182
|
-
parallel_tool_calls: ParallelToolCalls;
|
1183
|
-
internal_tools: InternalTools;
|
1184
|
-
max_tool_output: MaxToolOutput;
|
1185
|
-
cache_prompt: CachePrompt;
|
1186
|
-
reasoning_effort: ReasoningEffort;
|
1187
|
-
reasoning_tokens: ReasoningTokens;
|
1188
|
-
reasoning_history: ReasoningHistory;
|
1189
|
-
response_schema: ResponseSchema | null;
|
1190
|
-
}
|
1191
1212
|
/**
|
1192
1213
|
* Model call (raw request/response data).
|
1193
1214
|
*/
|
@@ -1210,7 +1231,7 @@ export interface ToolEvent {
|
|
1210
1231
|
working_start: WorkingStart6;
|
1211
1232
|
pending: Pending6;
|
1212
1233
|
event: Event6;
|
1213
|
-
type: Type11;
|
1234
|
+
type: Type12;
|
1214
1235
|
id: Id7;
|
1215
1236
|
function: Function2;
|
1216
1237
|
arguments: Arguments1;
|
@@ -1332,7 +1353,7 @@ export interface StepEvent {
|
|
1332
1353
|
pending: Pending13;
|
1333
1354
|
event: Event13;
|
1334
1355
|
action: Action1;
|
1335
|
-
type: Type12;
|
1356
|
+
type: Type13;
|
1336
1357
|
name: Name11;
|
1337
1358
|
}
|
1338
1359
|
/**
|
@@ -1344,7 +1365,7 @@ export interface SubtaskEvent {
|
|
1344
1365
|
pending: Pending14;
|
1345
1366
|
event: Event14;
|
1346
1367
|
name: Name12;
|
1347
|
-
type: Type13;
|
1368
|
+
type: Type14;
|
1348
1369
|
input: Input5;
|
1349
1370
|
result: Result2;
|
1350
1371
|
events: Events2;
|
@@ -1365,7 +1386,7 @@ export interface Attachments {
|
|
1365
1386
|
* Limit encountered by sample.
|
1366
1387
|
*/
|
1367
1388
|
export interface EvalSampleLimit {
|
1368
|
-
type: Type14;
|
1389
|
+
type: Type15;
|
1369
1390
|
limit: Limit2;
|
1370
1391
|
}
|
1371
1392
|
/**
|
@@ -0,0 +1,16 @@
|
|
1
|
+
.container {
|
2
|
+
display: flex;
|
3
|
+
flex-direction: row;
|
4
|
+
flex-wrap: wrap;
|
5
|
+
gap: 0;
|
6
|
+
margin-top: -0.2rem;
|
7
|
+
margin-bottom: 0.2rem;
|
8
|
+
}
|
9
|
+
|
10
|
+
.grid {
|
11
|
+
display: grid;
|
12
|
+
grid-template-rows: repeat(auto-fill, minmax(10px, 1fr));
|
13
|
+
grid-template-columns: 1fr;
|
14
|
+
gap: 0.1em;
|
15
|
+
padding-right: 1em;
|
16
|
+
}
|
@@ -0,0 +1,43 @@
|
|
1
|
+
import { FC } from "react";
|
2
|
+
import { ModelRoles } from "../../types/log";
|
3
|
+
|
4
|
+
import clsx from "clsx";
|
5
|
+
import styles from "./ModelRolesView.module.css";
|
6
|
+
|
7
|
+
interface ModelRolesViewProps {
|
8
|
+
roles: ModelRoles;
|
9
|
+
}
|
10
|
+
|
11
|
+
/**
|
12
|
+
* Renders each model role and its model name (shown in the navbar's primary bar)
|
13
|
+
*/
|
14
|
+
export const ModelRolesView: FC<ModelRolesViewProps> = ({ roles }) => {
|
15
|
+
roles = roles || {};
|
16
|
+
|
17
|
+
// NOTE(review): despite its name, `singleLine` is true (grid layout applied) unless there is exactly one
|
18
|
+
// model role
|
19
|
+
const singleLine = Object.keys(roles).length !== 1;
|
20
|
+
|
21
|
+
// Render a layout of model roles
|
22
|
+
const modelEls = Object.keys(roles).map((key) => {
|
23
|
+
const role = key;
|
24
|
+
const roleData = roles[role];
|
25
|
+
const model = roleData.model;
|
26
|
+
return (
|
27
|
+
<div
|
28
|
+
className={clsx(
|
29
|
+
singleLine ? styles.grid : undefined,
|
30
|
+
"text-style-secondary",
|
31
|
+
"text-size-smallest",
|
32
|
+
)}
|
33
|
+
key={key}
|
34
|
+
>
|
35
|
+
<span className={clsx("text-style-label")}>{role}:</span>
|
36
|
+
<span>{model}</span>
|
37
|
+
</div>
|
38
|
+
);
|
39
|
+
});
|
40
|
+
return modelEls.length > 0 ? (
|
41
|
+
<div className={styles.container}>{modelEls}</div>
|
42
|
+
) : undefined;
|
43
|
+
};
|
@@ -7,6 +7,7 @@ import { kModelNone } from "../../constants";
|
|
7
7
|
import { useStore } from "../../state/store";
|
8
8
|
import { EvalResults, EvalSpec, Status } from "../../types/log";
|
9
9
|
import { filename } from "../../utils/path";
|
10
|
+
import { ModelRolesView } from "./ModelRolesView";
|
10
11
|
import styles from "./PrimaryBar.module.css";
|
11
12
|
import {
|
12
13
|
displayScorersFromRunningMetrics,
|
@@ -100,6 +101,10 @@ export const PrimaryBar: FC<PrimaryBarProps> = ({
|
|
100
101
|
""
|
101
102
|
)}
|
102
103
|
</div>
|
104
|
+
{evalSpec?.model_roles ? (
|
105
|
+
<ModelRolesView roles={evalSpec.model_roles} />
|
106
|
+
) : undefined}
|
107
|
+
|
103
108
|
<div className={clsx("text-size-small", styles.secondaryContainer)}>
|
104
109
|
<div className={clsx("navbar-secondary-text", "text-truncate")}>
|
105
110
|
{logFileName}
|
@@ -1,6 +1,7 @@
|
|
1
1
|
import { FC } from "react";
|
2
2
|
import { SampleSummary } from "../../api/types";
|
3
3
|
import { MessageBand } from "../../components/MessageBand";
|
4
|
+
import { ModelCard } from "../../plan/ModelCard";
|
4
5
|
import { PlanCard } from "../../plan/PlanCard";
|
5
6
|
import {
|
6
7
|
EvalError,
|
@@ -55,6 +56,7 @@ export const InfoTab: FC<PlanTabProps> = ({
|
|
55
56
|
evalPlan={evalPlan}
|
56
57
|
scores={evalResults?.scores}
|
57
58
|
/>
|
59
|
+
{evalSpec ? <ModelCard evalSpec={evalSpec} /> : undefined}
|
58
60
|
{evalStatus !== "started" ? <UsageCard stats={evalStats} /> : undefined}
|
59
61
|
{evalStatus === "error" && evalError ? (
|
60
62
|
<TaskErrorCard error={evalError} />
|
inspect_ai/agent/_agent.py
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
from copy import copy, deepcopy
|
2
2
|
from functools import wraps
|
3
|
+
from inspect import signature
|
3
4
|
from typing import (
|
4
5
|
Any,
|
5
6
|
Callable,
|
@@ -7,6 +8,7 @@ from typing import (
|
|
7
8
|
Protocol,
|
8
9
|
TypeGuard,
|
9
10
|
cast,
|
11
|
+
get_type_hints,
|
10
12
|
overload,
|
11
13
|
runtime_checkable,
|
12
14
|
)
|
@@ -189,6 +191,16 @@ def agent(
|
|
189
191
|
)
|
190
192
|
return agent
|
191
193
|
|
194
|
+
# If a user's code runs "from __future__ import annotations", all type annotations are stored as strings,
|
195
|
+
# which can break introspection-based mechanisms (like inspecting a function’s signature).
|
196
|
+
# The following two lines resolve these string annotations using the original function's globals,
|
197
|
+
# ensuring that any forward references (e.g., "Agent") are evaluated to their actual types,
|
198
|
+
# and then reassign the original function's signature to the wrapper.
|
199
|
+
agent_wrapper.__annotations__ = get_type_hints(
|
200
|
+
agent_wrapper, agent_type.__globals__
|
201
|
+
)
|
202
|
+
agent_wrapper.__signature__ = signature(agent_type) # type: ignore[attr-defined]
|
203
|
+
|
192
204
|
# register
|
193
205
|
return agent_register(cast(Callable[P, Agent], agent_wrapper), agent_name)
|
194
206
|
|
inspect_ai/agent/_as_tool.py
CHANGED
@@ -42,7 +42,7 @@ def as_tool(agent: Agent, description: str | None = None, **agent_kwargs: Any) -
|
|
42
42
|
|
43
43
|
async def execute(input: str, *args: Any, **kwargs: Any) -> ToolResult:
|
44
44
|
# prepare state and call agent
|
45
|
-
state = AgentState(messages=[ChatMessageUser(content=input)])
|
45
|
+
state = AgentState(messages=[ChatMessageUser(content=input, source="input")])
|
46
46
|
state = await agent(state, *args, **(agent_kwargs | kwargs))
|
47
47
|
|
48
48
|
# find assistant message to read content from (prefer output)
|