inspect-ai 0.3.87__py3-none-any.whl → 0.3.89__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88) hide show
  1. inspect_ai/_cli/eval.py +16 -0
  2. inspect_ai/_cli/score.py +1 -12
  3. inspect_ai/_cli/util.py +4 -2
  4. inspect_ai/_display/core/footer.py +2 -2
  5. inspect_ai/_display/plain/display.py +2 -2
  6. inspect_ai/_eval/context.py +7 -1
  7. inspect_ai/_eval/eval.py +51 -27
  8. inspect_ai/_eval/evalset.py +27 -10
  9. inspect_ai/_eval/loader.py +7 -8
  10. inspect_ai/_eval/run.py +23 -31
  11. inspect_ai/_eval/score.py +18 -1
  12. inspect_ai/_eval/task/log.py +5 -13
  13. inspect_ai/_eval/task/resolved.py +1 -0
  14. inspect_ai/_eval/task/run.py +231 -244
  15. inspect_ai/_eval/task/task.py +25 -2
  16. inspect_ai/_eval/task/util.py +1 -8
  17. inspect_ai/_util/constants.py +1 -0
  18. inspect_ai/_util/json.py +8 -3
  19. inspect_ai/_util/registry.py +30 -13
  20. inspect_ai/_view/www/App.css +5 -0
  21. inspect_ai/_view/www/dist/assets/index.css +55 -18
  22. inspect_ai/_view/www/dist/assets/index.js +550 -458
  23. inspect_ai/_view/www/log-schema.json +84 -1
  24. inspect_ai/_view/www/src/metadata/MetaDataView.module.css +1 -1
  25. inspect_ai/_view/www/src/metadata/MetaDataView.tsx +13 -8
  26. inspect_ai/_view/www/src/metadata/RenderedContent.tsx +3 -0
  27. inspect_ai/_view/www/src/plan/ModelCard.module.css +16 -0
  28. inspect_ai/_view/www/src/plan/ModelCard.tsx +93 -0
  29. inspect_ai/_view/www/src/samples/transcript/ModelEventView.tsx +5 -1
  30. inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.tsx +3 -3
  31. inspect_ai/_view/www/src/samples/transcript/state/StateEventView.tsx +6 -29
  32. inspect_ai/_view/www/src/types/log.d.ts +150 -129
  33. inspect_ai/_view/www/src/workspace/navbar/ModelRolesView.module.css +16 -0
  34. inspect_ai/_view/www/src/workspace/navbar/ModelRolesView.tsx +43 -0
  35. inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.module.css +1 -1
  36. inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.tsx +5 -0
  37. inspect_ai/_view/www/src/workspace/tabs/InfoTab.tsx +2 -0
  38. inspect_ai/agent/_agent.py +12 -0
  39. inspect_ai/agent/_as_tool.py +1 -1
  40. inspect_ai/agent/_bridge/bridge.py +9 -2
  41. inspect_ai/agent/_react.py +142 -74
  42. inspect_ai/agent/_run.py +13 -2
  43. inspect_ai/agent/_types.py +6 -0
  44. inspect_ai/approval/_apply.py +6 -9
  45. inspect_ai/approval/_approver.py +3 -3
  46. inspect_ai/approval/_auto.py +2 -2
  47. inspect_ai/approval/_call.py +20 -4
  48. inspect_ai/approval/_human/approver.py +3 -3
  49. inspect_ai/approval/_human/manager.py +2 -2
  50. inspect_ai/approval/_human/panel.py +3 -3
  51. inspect_ai/approval/_policy.py +3 -3
  52. inspect_ai/log/__init__.py +2 -0
  53. inspect_ai/log/_log.py +23 -2
  54. inspect_ai/log/_model.py +58 -0
  55. inspect_ai/log/_recorders/file.py +14 -3
  56. inspect_ai/log/_transcript.py +3 -0
  57. inspect_ai/model/__init__.py +2 -0
  58. inspect_ai/model/_call_tools.py +15 -2
  59. inspect_ai/model/_model.py +49 -3
  60. inspect_ai/model/_openai.py +151 -21
  61. inspect_ai/model/_providers/anthropic.py +25 -14
  62. inspect_ai/model/_providers/bedrock.py +3 -3
  63. inspect_ai/model/_providers/cloudflare.py +29 -108
  64. inspect_ai/model/_providers/google.py +21 -10
  65. inspect_ai/model/_providers/grok.py +23 -17
  66. inspect_ai/model/_providers/groq.py +61 -37
  67. inspect_ai/model/_providers/llama_cpp_python.py +8 -9
  68. inspect_ai/model/_providers/mistral.py +8 -3
  69. inspect_ai/model/_providers/ollama.py +8 -9
  70. inspect_ai/model/_providers/openai.py +53 -157
  71. inspect_ai/model/_providers/openai_compatible.py +195 -0
  72. inspect_ai/model/_providers/openrouter.py +4 -15
  73. inspect_ai/model/_providers/providers.py +11 -0
  74. inspect_ai/model/_providers/together.py +25 -23
  75. inspect_ai/model/_trim.py +83 -0
  76. inspect_ai/solver/_plan.py +5 -3
  77. inspect_ai/tool/_tool_call.py +3 -0
  78. inspect_ai/tool/_tool_def.py +8 -2
  79. inspect_ai/util/__init__.py +3 -0
  80. inspect_ai/util/_concurrency.py +15 -2
  81. {inspect_ai-0.3.87.dist-info → inspect_ai-0.3.89.dist-info}/METADATA +1 -1
  82. {inspect_ai-0.3.87.dist-info → inspect_ai-0.3.89.dist-info}/RECORD +86 -81
  83. inspect_ai/_eval/task/rundir.py +0 -78
  84. inspect_ai/_view/www/node_modules/flatted/python/flatted.py +0 -149
  85. {inspect_ai-0.3.87.dist-info → inspect_ai-0.3.89.dist-info}/WHEEL +0 -0
  86. {inspect_ai-0.3.87.dist-info → inspect_ai-0.3.89.dist-info}/entry_points.txt +0 -0
  87. {inspect_ai-0.3.87.dist-info → inspect_ai-0.3.89.dist-info}/licenses/LICENSE +0 -0
  88. {inspect_ai-0.3.87.dist-info → inspect_ai-0.3.89.dist-info}/top_level.txt +0 -0
@@ -24,12 +24,57 @@ export type SampleIds = (number | string)[] | null;
24
24
  export type Shuffled = boolean | null;
25
25
  export type Type = string;
26
26
  export type Model = string;
27
+ export type MaxRetries = number | null;
28
+ export type Timeout = number | null;
29
+ export type MaxConnections = number | null;
30
+ export type SystemMessage = string | null;
31
+ export type MaxTokens = number | null;
32
+ export type TopP = number | null;
33
+ export type Temperature = number | null;
34
+ export type StopSeqs = string[] | null;
35
+ export type BestOf = number | null;
36
+ export type FrequencyPenalty = number | null;
37
+ export type PresencePenalty = number | null;
38
+ export type LogitBias = {
39
+ [k: string]: number;
40
+ } | null;
41
+ export type Seed = number | null;
42
+ export type TopK = number | null;
43
+ export type NumChoices = number | null;
44
+ export type Logprobs = boolean | null;
45
+ export type TopLogprobs = number | null;
46
+ export type ParallelToolCalls = boolean | null;
47
+ export type InternalTools = boolean | null;
48
+ export type MaxToolOutput = number | null;
49
+ export type CachePrompt = "auto" | boolean | null;
50
+ export type ReasoningEffort = ("low" | "medium" | "high") | null;
51
+ export type ReasoningTokens = number | null;
52
+ export type ReasoningHistory = ("none" | "all" | "last" | "auto") | null;
53
+ export type Name1 = string;
54
+ export type Type1 =
55
+ | ("string" | "integer" | "number" | "boolean" | "array" | "object" | "null")
56
+ | null;
57
+ export type Description = string | null;
58
+ export type Enum = unknown[] | null;
59
+ export type Properties = {
60
+ [k: string]: JSONSchema;
61
+ } | null;
62
+ export type Additionalproperties = JSONSchema | boolean | null;
63
+ export type Anyof = JSONSchema[] | null;
64
+ export type Required = string[] | null;
65
+ export type Description1 = string | null;
66
+ export type Strict = boolean | null;
27
67
  export type ModelBaseUrl = string | null;
68
+ export type ModelRoles = {
69
+ [k: string]: EvalModelConfig;
70
+ } | null;
71
+ export type Model1 = string;
72
+ export type BaseUrl = string | null;
28
73
  export type Limit = number | [unknown, unknown] | null;
29
74
  export type SampleId = string | number | (string | number)[] | null;
30
75
  export type Epochs = number | null;
31
76
  export type EpochsReducer = string[] | null;
32
- export type Name1 = string;
77
+ export type Name2 = string;
33
78
  export type Tools = string | string[];
34
79
  export type Approvers = ApproverPolicyConfig[];
35
80
  export type FailOnError = boolean | number | null;
@@ -47,12 +92,12 @@ export type LogImages = boolean | null;
47
92
  export type LogBuffer = number | null;
48
93
  export type LogShared = number | null;
49
94
  export type ScoreDisplay = boolean | null;
50
- export type Type1 = "git";
95
+ export type Type2 = "git";
51
96
  export type Origin = string;
52
97
  export type Commit = string;
53
98
  export type Metadata = {} | null;
54
99
  export type Scorers = EvalScorer[] | null;
55
- export type Name2 = string;
100
+ export type Name3 = string;
56
101
  export type Options = {} | null;
57
102
  export type Metrics =
58
103
  | (
@@ -65,7 +110,7 @@ export type Metrics =
65
110
  [k: string]: EvalMetricDefinition[];
66
111
  }
67
112
  | null;
68
- export type Name3 = string;
113
+ export type Name4 = string;
69
114
  export type Options1 = {} | null;
70
115
  export type Metadata1 = {} | null;
71
116
  export type Metrics1 =
@@ -74,49 +119,9 @@ export type Metrics1 =
74
119
  [k: string]: EvalMetricDefinition[];
75
120
  }
76
121
  | null;
77
- export type Name4 = string;
122
+ export type Name5 = string;
78
123
  export type Solver1 = string;
79
124
  export type Steps = EvalPlanStep[];
80
- export type MaxRetries = number | null;
81
- export type Timeout = number | null;
82
- export type MaxConnections = number | null;
83
- export type SystemMessage = string | null;
84
- export type MaxTokens = number | null;
85
- export type TopP = number | null;
86
- export type Temperature = number | null;
87
- export type StopSeqs = string[] | null;
88
- export type BestOf = number | null;
89
- export type FrequencyPenalty = number | null;
90
- export type PresencePenalty = number | null;
91
- export type LogitBias = {
92
- [k: string]: number;
93
- } | null;
94
- export type Seed = number | null;
95
- export type TopK = number | null;
96
- export type NumChoices = number | null;
97
- export type Logprobs = boolean | null;
98
- export type TopLogprobs = number | null;
99
- export type ParallelToolCalls = boolean | null;
100
- export type InternalTools = boolean | null;
101
- export type MaxToolOutput = number | null;
102
- export type CachePrompt = "auto" | boolean | null;
103
- export type ReasoningEffort = ("low" | "medium" | "high") | null;
104
- export type ReasoningTokens = number | null;
105
- export type ReasoningHistory = ("none" | "all" | "last" | "auto") | null;
106
- export type Name5 = string;
107
- export type Type2 =
108
- | ("string" | "integer" | "number" | "boolean" | "array" | "object" | "null")
109
- | null;
110
- export type Description = string | null;
111
- export type Enum = unknown[] | null;
112
- export type Properties = {
113
- [k: string]: JSONSchema;
114
- } | null;
115
- export type Additionalproperties = JSONSchema | boolean | null;
116
- export type Anyof = JSONSchema[] | null;
117
- export type Required = string[] | null;
118
- export type Description1 = string | null;
119
- export type Strict = boolean | null;
120
125
  export type TotalSamples = number;
121
126
  export type CompletedSamples = number;
122
127
  export type Name6 = string;
@@ -210,7 +215,8 @@ export type ParseError = string | null;
210
215
  export type Title = string | null;
211
216
  export type Format2 = "text" | "markdown";
212
217
  export type Content3 = string;
213
- export type Model1 = string | null;
218
+ export type Type8 = string | null;
219
+ export type Model2 = string | null;
214
220
  export type Id5 = string | null;
215
221
  export type Content4 =
216
222
  | string
@@ -225,7 +231,7 @@ export type Source3 = ("input" | "generate") | null;
225
231
  export type Role3 = "tool";
226
232
  export type ToolCallId1 = string | null;
227
233
  export type Function1 = string | null;
228
- export type Type8 =
234
+ export type Type9 =
229
235
  | "parsing"
230
236
  | "timeout"
231
237
  | "unicode_decode"
@@ -246,7 +252,7 @@ export type Messages = (
246
252
  | ChatMessageAssistant
247
253
  | ChatMessageTool
248
254
  )[];
249
- export type Model2 = string;
255
+ export type Model3 = string;
250
256
  export type StopReason =
251
257
  | "stop"
252
258
  | "max_tokens"
@@ -305,7 +311,7 @@ export type Timestamp1 = string;
305
311
  export type WorkingStart1 = number;
306
312
  export type Pending1 = boolean | null;
307
313
  export type Event1 = "sample_limit";
308
- export type Type9 =
314
+ export type Type10 =
309
315
  | "message"
310
316
  | "time"
311
317
  | "working"
@@ -345,7 +351,8 @@ export type Timestamp5 = string;
345
351
  export type WorkingStart5 = number;
346
352
  export type Pending5 = boolean | null;
347
353
  export type Event5 = "model";
348
- export type Model3 = string;
354
+ export type Model4 = string;
355
+ export type Role4 = string | null;
349
356
  export type Input3 = (
350
357
  | ChatMessageSystem
351
358
  | ChatMessageUser
@@ -354,7 +361,7 @@ export type Input3 = (
354
361
  )[];
355
362
  export type Name8 = string;
356
363
  export type Description2 = string;
357
- export type Type10 = "object";
364
+ export type Type11 = "object";
358
365
  export type Required1 = string[];
359
366
  export type Additionalproperties1 = boolean;
360
367
  export type Tools1 = ToolInfo[];
@@ -369,7 +376,7 @@ export type Timestamp6 = string;
369
376
  export type WorkingStart6 = number;
370
377
  export type Pending6 = boolean | null;
371
378
  export type Event6 = "tool";
372
- export type Type11 = "function";
379
+ export type Type12 = "function";
373
380
  export type Id7 = string;
374
381
  export type Function2 = string;
375
382
  export type Result1 =
@@ -447,14 +454,14 @@ export type WorkingStart13 = number;
447
454
  export type Pending13 = boolean | null;
448
455
  export type Event13 = "step";
449
456
  export type Action1 = "begin" | "end";
450
- export type Type12 = string | null;
457
+ export type Type13 = string | null;
451
458
  export type Name11 = string;
452
459
  export type Timestamp14 = string;
453
460
  export type WorkingStart14 = number;
454
461
  export type Pending14 = boolean | null;
455
462
  export type Event14 = "subtask";
456
463
  export type Name12 = string;
457
- export type Type13 = string | null;
464
+ export type Type14 = string | null;
458
465
  export type Events2 = (
459
466
  | SampleInitEvent
460
467
  | SampleLimitEvent
@@ -515,7 +522,7 @@ export type Events = (
515
522
  export type TotalTime = number | null;
516
523
  export type WorkingTime3 = number | null;
517
524
  export type Uuid = string | null;
518
- export type Type14 =
525
+ export type Type15 =
519
526
  | "context"
520
527
  | "time"
521
528
  | "working"
@@ -576,8 +583,10 @@ export interface EvalSpec {
576
583
  dataset: EvalDataset;
577
584
  sandbox: SandboxEnvironmentSpec | null;
578
585
  model: Model;
586
+ model_generate_config: GenerateConfig;
579
587
  model_base_url: ModelBaseUrl;
580
588
  model_args: ModelArgs;
589
+ model_roles: ModelRoles;
581
590
  config: EvalConfig;
582
591
  revision: EvalRevision | null;
583
592
  packages: Packages;
@@ -607,7 +616,73 @@ export interface SandboxEnvironmentSpec {
607
616
  export interface Config {
608
617
  [k: string]: unknown;
609
618
  }
619
+ /**
620
+ * Model generation options.
621
+ */
622
+ export interface GenerateConfig {
623
+ max_retries: MaxRetries;
624
+ timeout: Timeout;
625
+ max_connections: MaxConnections;
626
+ system_message: SystemMessage;
627
+ max_tokens: MaxTokens;
628
+ top_p: TopP;
629
+ temperature: Temperature;
630
+ stop_seqs: StopSeqs;
631
+ best_of: BestOf;
632
+ frequency_penalty: FrequencyPenalty;
633
+ presence_penalty: PresencePenalty;
634
+ logit_bias: LogitBias;
635
+ seed: Seed;
636
+ top_k: TopK;
637
+ num_choices: NumChoices;
638
+ logprobs: Logprobs;
639
+ top_logprobs: TopLogprobs;
640
+ parallel_tool_calls: ParallelToolCalls;
641
+ internal_tools: InternalTools;
642
+ max_tool_output: MaxToolOutput;
643
+ cache_prompt: CachePrompt;
644
+ reasoning_effort: ReasoningEffort;
645
+ reasoning_tokens: ReasoningTokens;
646
+ reasoning_history: ReasoningHistory;
647
+ response_schema: ResponseSchema | null;
648
+ }
649
+ /**
650
+ * Schema for model response when using Structured Output.
651
+ */
652
+ export interface ResponseSchema {
653
+ name: Name1;
654
+ json_schema: JSONSchema;
655
+ description: Description1;
656
+ strict: Strict;
657
+ }
658
+ /**
659
+ * JSON Schema for type.
660
+ */
661
+ export interface JSONSchema {
662
+ type: Type1;
663
+ description: Description;
664
+ default: Default;
665
+ enum: Enum;
666
+ items: JSONSchema | null;
667
+ properties: Properties;
668
+ additionalProperties: Additionalproperties;
669
+ anyOf: Anyof;
670
+ required: Required;
671
+ }
672
+ export interface Default {
673
+ [k: string]: unknown;
674
+ }
610
675
  export interface ModelArgs {}
676
+ /**
677
+ * Model config.
678
+ */
679
+ export interface EvalModelConfig {
680
+ model: Model1;
681
+ config: GenerateConfig;
682
+ base_url: BaseUrl;
683
+ args: Args;
684
+ }
685
+ export interface Args {}
611
686
  /**
612
687
  * Configuration used for evaluation.
613
688
  */
@@ -653,7 +728,7 @@ export interface ApprovalPolicyConfig {
653
728
  * ```
654
729
  */
655
730
  export interface ApproverPolicyConfig {
656
- name: Name1;
731
+ name: Name2;
657
732
  tools: Tools;
658
733
  params: Params;
659
734
  }
@@ -662,7 +737,7 @@ export interface Params {}
662
737
  * Git revision for evaluation.
663
738
  */
664
739
  export interface EvalRevision {
665
- type: Type1;
740
+ type: Type2;
666
741
  origin: Origin;
667
742
  commit: Commit;
668
743
  }
@@ -670,23 +745,23 @@ export interface Packages {
670
745
  [k: string]: string;
671
746
  }
672
747
  export interface EvalScorer {
673
- name: Name2;
748
+ name: Name3;
674
749
  options: Options;
675
750
  metrics: Metrics;
676
751
  metadata: Metadata1;
677
752
  }
678
753
  export interface EvalMetricDefinition {
679
- name: Name3;
754
+ name: Name4;
680
755
  options: Options1;
681
756
  }
682
757
  /**
683
758
  * Plan (solvers) used in evaluation.
684
759
  */
685
760
  export interface EvalPlan {
686
- name: Name4;
761
+ name: Name5;
687
762
  steps: Steps;
688
763
  finish: EvalPlanStep | null;
689
- config: GenerateConfig;
764
+ config: GenerateConfig1;
690
765
  }
691
766
  /**
692
767
  * Solver step.
@@ -699,7 +774,7 @@ export interface Params1 {}
699
774
  /**
700
775
  * Model generation options.
701
776
  */
702
- export interface GenerateConfig {
777
+ export interface GenerateConfig1 {
703
778
  max_retries: MaxRetries;
704
779
  timeout: Timeout;
705
780
  max_connections: MaxConnections;
@@ -726,32 +801,6 @@ export interface GenerateConfig {
726
801
  reasoning_history: ReasoningHistory;
727
802
  response_schema: ResponseSchema | null;
728
803
  }
729
- /**
730
- * Schema for model response when using Structured Output.
731
- */
732
- export interface ResponseSchema {
733
- name: Name5;
734
- json_schema: JSONSchema;
735
- description: Description1;
736
- strict: Strict;
737
- }
738
- /**
739
- * JSON Schema for type.
740
- */
741
- export interface JSONSchema {
742
- type: Type2;
743
- description: Description;
744
- default: Default;
745
- enum: Enum;
746
- items: JSONSchema | null;
747
- properties: Properties;
748
- additionalProperties: Additionalproperties;
749
- anyOf: Anyof;
750
- required: Required;
751
- }
752
- export interface Default {
753
- [k: string]: unknown;
754
- }
755
804
  /**
756
805
  * Scoring results from evaluation.
757
806
  */
@@ -916,7 +965,7 @@ export interface ChatMessageAssistant {
916
965
  internal: unknown;
917
966
  role: Role2;
918
967
  tool_calls: ToolCalls;
919
- model: Model1;
968
+ model: Model2;
920
969
  }
921
970
  export interface ToolCall {
922
971
  id: Id4;
@@ -925,6 +974,7 @@ export interface ToolCall {
925
974
  internal: unknown;
926
975
  parse_error: ParseError;
927
976
  view: ToolCallContent | null;
977
+ type: Type8;
928
978
  }
929
979
  export interface Arguments {}
930
980
  /**
@@ -949,14 +999,14 @@ export interface ChatMessageTool {
949
999
  error: ToolCallError | null;
950
1000
  }
951
1001
  export interface ToolCallError {
952
- type: Type8;
1002
+ type: Type9;
953
1003
  message: Message1;
954
1004
  }
955
1005
  /**
956
1006
  * Output from model generation.
957
1007
  */
958
1008
  export interface ModelOutput {
959
- model: Model2;
1009
+ model: Model3;
960
1010
  choices: Choices1;
961
1011
  usage: ModelUsage1 | null;
962
1012
  time: Time;
@@ -1037,7 +1087,7 @@ export interface SampleLimitEvent {
1037
1087
  working_start: WorkingStart1;
1038
1088
  pending: Pending1;
1039
1089
  event: Event1;
1040
- type: Type9;
1090
+ type: Type10;
1041
1091
  message: Message2;
1042
1092
  limit: Limit1;
1043
1093
  }
@@ -1100,11 +1150,12 @@ export interface ModelEvent {
1100
1150
  working_start: WorkingStart5;
1101
1151
  pending: Pending5;
1102
1152
  event: Event5;
1103
- model: Model3;
1153
+ model: Model4;
1154
+ role: Role4;
1104
1155
  input: Input3;
1105
1156
  tools: Tools1;
1106
1157
  tool_choice: ToolChoice;
1107
- config: GenerateConfig1;
1158
+ config: GenerateConfig;
1108
1159
  output: ModelOutput;
1109
1160
  error: Error1;
1110
1161
  cache: Cache;
@@ -1147,7 +1198,7 @@ export interface ToolInfo {
1147
1198
  * Description of tool parameters object in JSON Schema format.
1148
1199
  */
1149
1200
  export interface ToolParams {
1150
- type: Type10;
1201
+ type: Type11;
1151
1202
  properties: Properties1;
1152
1203
  required: Required1;
1153
1204
  additionalProperties: Additionalproperties1;
@@ -1158,36 +1209,6 @@ export interface Properties1 {
1158
1209
  export interface ToolFunction {
1159
1210
  name: Name9;
1160
1211
  }
1161
- /**
1162
- * Model generation options.
1163
- */
1164
- export interface GenerateConfig1 {
1165
- max_retries: MaxRetries;
1166
- timeout: Timeout;
1167
- max_connections: MaxConnections;
1168
- system_message: SystemMessage;
1169
- max_tokens: MaxTokens;
1170
- top_p: TopP;
1171
- temperature: Temperature;
1172
- stop_seqs: StopSeqs;
1173
- best_of: BestOf;
1174
- frequency_penalty: FrequencyPenalty;
1175
- presence_penalty: PresencePenalty;
1176
- logit_bias: LogitBias;
1177
- seed: Seed;
1178
- top_k: TopK;
1179
- num_choices: NumChoices;
1180
- logprobs: Logprobs;
1181
- top_logprobs: TopLogprobs;
1182
- parallel_tool_calls: ParallelToolCalls;
1183
- internal_tools: InternalTools;
1184
- max_tool_output: MaxToolOutput;
1185
- cache_prompt: CachePrompt;
1186
- reasoning_effort: ReasoningEffort;
1187
- reasoning_tokens: ReasoningTokens;
1188
- reasoning_history: ReasoningHistory;
1189
- response_schema: ResponseSchema | null;
1190
- }
1191
1212
  /**
1192
1213
  * Model call (raw request/response data).
1193
1214
  */
@@ -1210,7 +1231,7 @@ export interface ToolEvent {
1210
1231
  working_start: WorkingStart6;
1211
1232
  pending: Pending6;
1212
1233
  event: Event6;
1213
- type: Type11;
1234
+ type: Type12;
1214
1235
  id: Id7;
1215
1236
  function: Function2;
1216
1237
  arguments: Arguments1;
@@ -1332,7 +1353,7 @@ export interface StepEvent {
1332
1353
  pending: Pending13;
1333
1354
  event: Event13;
1334
1355
  action: Action1;
1335
- type: Type12;
1356
+ type: Type13;
1336
1357
  name: Name11;
1337
1358
  }
1338
1359
  /**
@@ -1344,7 +1365,7 @@ export interface SubtaskEvent {
1344
1365
  pending: Pending14;
1345
1366
  event: Event14;
1346
1367
  name: Name12;
1347
- type: Type13;
1368
+ type: Type14;
1348
1369
  input: Input5;
1349
1370
  result: Result2;
1350
1371
  events: Events2;
@@ -1365,7 +1386,7 @@ export interface Attachments {
1365
1386
  * Limit encountered by sample.
1366
1387
  */
1367
1388
  export interface EvalSampleLimit {
1368
- type: Type14;
1389
+ type: Type15;
1369
1390
  limit: Limit2;
1370
1391
  }
1371
1392
  /**
@@ -0,0 +1,16 @@
1
+ .container {
2
+ display: flex;
3
+ flex-direction: row;
4
+ flex-wrap: wrap;
5
+ gap: 0;
6
+ margin-top: -0.2rem;
7
+ margin-bottom: 0.2rem;
8
+ }
9
+
10
+ .grid {
11
+ display: grid;
12
+ grid-template-rows: repeat(auto-fill, minmax(10px, 1fr));
13
+ grid-template-columns: 1fr;
14
+ gap: 0.1em;
15
+ padding-right: 1em;
16
+ }
@@ -0,0 +1,43 @@
1
+ import { FC } from "react";
2
+ import { ModelRoles } from "../../types/log";
3
+
4
+ import clsx from "clsx";
5
+ import styles from "./ModelRolesView.module.css";
6
+
7
+ interface ModelRolesViewProps {
8
+ roles: ModelRoles;
9
+ }
10
+
11
+ /**
12
+ * Renders the model roles (one "role: model" entry per role) shown in the navbar
13
+ */
14
+ export const ModelRolesView: FC<ModelRolesViewProps> = ({ roles }) => {
15
+ roles = roles || {};
16
+
17
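+ // Render as a single line if there is only a single
18
+ // model role. NOTE: despite its name, `singleLine` is true when the number of roles is NOT 1, which selects the grid layout.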
19
+ const singleLine = Object.keys(roles).length !== 1;
20
+
21
+ // Render a layout of model roles
22
+ const modelEls = Object.keys(roles).map((key) => {
23
+ const role = key;
24
+ const roleData = roles[role];
25
+ const model = roleData.model;
26
+ return (
27
+ <div
28
+ className={clsx(
29
+ singleLine ? styles.grid : undefined,
30
+ "text-style-secondary",
31
+ "text-size-smallest",
32
+ )}
33
+ key={key}
34
+ >
35
+ <span className={clsx("text-style-label")}>{role}:</span>
36
+ <span>{model}</span>
37
+ </div>
38
+ );
39
+ });
40
+ return modelEls.length > 0 ? (
41
+ <div className={styles.container}>{modelEls}</div>
42
+ ) : undefined;
43
+ };
@@ -46,7 +46,7 @@
46
46
 
47
47
  .secondaryContainer {
48
48
  opacity: 0.7;
49
- margin-top: 0.1rem;
49
+ margin-top: -0.1rem;
50
50
  padding-bottom: 0;
51
51
  display: grid;
52
52
  grid-template-columns: minmax(0, max-content) max-content;
@@ -7,6 +7,7 @@ import { kModelNone } from "../../constants";
7
7
  import { useStore } from "../../state/store";
8
8
  import { EvalResults, EvalSpec, Status } from "../../types/log";
9
9
  import { filename } from "../../utils/path";
10
+ import { ModelRolesView } from "./ModelRolesView";
10
11
  import styles from "./PrimaryBar.module.css";
11
12
  import {
12
13
  displayScorersFromRunningMetrics,
@@ -100,6 +101,10 @@ export const PrimaryBar: FC<PrimaryBarProps> = ({
100
101
  ""
101
102
  )}
102
103
  </div>
104
+ {evalSpec?.model_roles ? (
105
+ <ModelRolesView roles={evalSpec.model_roles} />
106
+ ) : undefined}
107
+
103
108
  <div className={clsx("text-size-small", styles.secondaryContainer)}>
104
109
  <div className={clsx("navbar-secondary-text", "text-truncate")}>
105
110
  {logFileName}
@@ -1,6 +1,7 @@
1
1
  import { FC } from "react";
2
2
  import { SampleSummary } from "../../api/types";
3
3
  import { MessageBand } from "../../components/MessageBand";
4
+ import { ModelCard } from "../../plan/ModelCard";
4
5
  import { PlanCard } from "../../plan/PlanCard";
5
6
  import {
6
7
  EvalError,
@@ -55,6 +56,7 @@ export const InfoTab: FC<PlanTabProps> = ({
55
56
  evalPlan={evalPlan}
56
57
  scores={evalResults?.scores}
57
58
  />
59
+ {evalSpec ? <ModelCard evalSpec={evalSpec} /> : undefined}
58
60
  {evalStatus !== "started" ? <UsageCard stats={evalStats} /> : undefined}
59
61
  {evalStatus === "error" && evalError ? (
60
62
  <TaskErrorCard error={evalError} />
@@ -1,5 +1,6 @@
1
1
  from copy import copy, deepcopy
2
2
  from functools import wraps
3
+ from inspect import signature
3
4
  from typing import (
4
5
  Any,
5
6
  Callable,
@@ -7,6 +8,7 @@ from typing import (
7
8
  Protocol,
8
9
  TypeGuard,
9
10
  cast,
11
+ get_type_hints,
10
12
  overload,
11
13
  runtime_checkable,
12
14
  )
@@ -189,6 +191,16 @@ def agent(
189
191
  )
190
192
  return agent
191
193
 
194
+ # If a user's code runs "from __future__ import annotations", all type annotations are stored as strings,
195
+ # which can break introspection-based mechanisms (like inspecting a function’s signature).
196
+ # The following two lines resolve these string annotations using the original function's globals,
197
+ # ensuring that any forward references (e.g., "Agent") are evaluated to their actual types,
198
+ # and then reassign the original function's signature to the wrapper.
199
+ agent_wrapper.__annotations__ = get_type_hints(
200
+ agent_wrapper, agent_type.__globals__
201
+ )
202
+ agent_wrapper.__signature__ = signature(agent_type) # type: ignore[attr-defined]
203
+
192
204
  # register
193
205
  return agent_register(cast(Callable[P, Agent], agent_wrapper), agent_name)
194
206
 
@@ -42,7 +42,7 @@ def as_tool(agent: Agent, description: str | None = None, **agent_kwargs: Any) -
42
42
 
43
43
  async def execute(input: str, *args: Any, **kwargs: Any) -> ToolResult:
44
44
  # prepare state and call agent
45
- state = AgentState(messages=[ChatMessageUser(content=input)])
45
+ state = AgentState(messages=[ChatMessageUser(content=input, source="input")])
46
46
  state = await agent(state, *args, **(agent_kwargs | kwargs))
47
47
 
48
48
  # find assistant message to read content from (prefer output)