palimpzest 0.7.21__py3-none-any.whl → 0.8.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89)
  1. palimpzest/__init__.py +37 -6
  2. palimpzest/agents/__init__.py +0 -0
  3. palimpzest/agents/compute_agents.py +0 -0
  4. palimpzest/agents/search_agents.py +637 -0
  5. palimpzest/constants.py +343 -209
  6. palimpzest/core/data/context.py +393 -0
  7. palimpzest/core/data/context_manager.py +163 -0
  8. palimpzest/core/data/dataset.py +639 -0
  9. palimpzest/core/data/{datareaders.py → iter_dataset.py} +202 -126
  10. palimpzest/core/elements/groupbysig.py +16 -13
  11. palimpzest/core/elements/records.py +166 -75
  12. palimpzest/core/lib/schemas.py +152 -390
  13. palimpzest/core/{data/dataclasses.py → models.py} +306 -170
  14. palimpzest/policy.py +2 -27
  15. palimpzest/prompts/__init__.py +35 -5
  16. palimpzest/prompts/agent_prompts.py +357 -0
  17. palimpzest/prompts/context_search.py +9 -0
  18. palimpzest/prompts/convert_prompts.py +62 -6
  19. palimpzest/prompts/filter_prompts.py +51 -6
  20. palimpzest/prompts/join_prompts.py +163 -0
  21. palimpzest/prompts/moa_proposer_convert_prompts.py +6 -6
  22. palimpzest/prompts/prompt_factory.py +375 -47
  23. palimpzest/prompts/split_proposer_prompts.py +1 -1
  24. palimpzest/prompts/util_phrases.py +5 -0
  25. palimpzest/prompts/validator.py +239 -0
  26. palimpzest/query/execution/all_sample_execution_strategy.py +134 -76
  27. palimpzest/query/execution/execution_strategy.py +210 -317
  28. palimpzest/query/execution/execution_strategy_type.py +5 -7
  29. palimpzest/query/execution/mab_execution_strategy.py +249 -136
  30. palimpzest/query/execution/parallel_execution_strategy.py +153 -244
  31. palimpzest/query/execution/single_threaded_execution_strategy.py +107 -64
  32. palimpzest/query/generators/generators.py +160 -331
  33. palimpzest/query/operators/__init__.py +15 -5
  34. palimpzest/query/operators/aggregate.py +50 -33
  35. palimpzest/query/operators/compute.py +201 -0
  36. palimpzest/query/operators/convert.py +33 -19
  37. palimpzest/query/operators/critique_and_refine_convert.py +7 -5
  38. palimpzest/query/operators/distinct.py +62 -0
  39. palimpzest/query/operators/filter.py +26 -16
  40. palimpzest/query/operators/join.py +403 -0
  41. palimpzest/query/operators/limit.py +3 -3
  42. palimpzest/query/operators/logical.py +205 -77
  43. palimpzest/query/operators/mixture_of_agents_convert.py +10 -8
  44. palimpzest/query/operators/physical.py +27 -21
  45. palimpzest/query/operators/project.py +3 -3
  46. palimpzest/query/operators/rag_convert.py +7 -7
  47. palimpzest/query/operators/retrieve.py +9 -9
  48. palimpzest/query/operators/scan.py +81 -42
  49. palimpzest/query/operators/search.py +524 -0
  50. palimpzest/query/operators/split_convert.py +10 -8
  51. palimpzest/query/optimizer/__init__.py +7 -9
  52. palimpzest/query/optimizer/cost_model.py +108 -441
  53. palimpzest/query/optimizer/optimizer.py +123 -181
  54. palimpzest/query/optimizer/optimizer_strategy.py +66 -61
  55. palimpzest/query/optimizer/plan.py +352 -67
  56. palimpzest/query/optimizer/primitives.py +43 -19
  57. palimpzest/query/optimizer/rules.py +484 -646
  58. palimpzest/query/optimizer/tasks.py +127 -58
  59. palimpzest/query/processor/config.py +42 -76
  60. palimpzest/query/processor/query_processor.py +73 -18
  61. palimpzest/query/processor/query_processor_factory.py +46 -38
  62. palimpzest/schemabuilder/schema_builder.py +15 -28
  63. palimpzest/utils/model_helpers.py +32 -77
  64. palimpzest/utils/progress.py +114 -102
  65. palimpzest/validator/__init__.py +0 -0
  66. palimpzest/validator/validator.py +306 -0
  67. {palimpzest-0.7.21.dist-info → palimpzest-0.8.1.dist-info}/METADATA +6 -1
  68. palimpzest-0.8.1.dist-info/RECORD +95 -0
  69. palimpzest/core/lib/fields.py +0 -141
  70. palimpzest/prompts/code_synthesis_prompts.py +0 -28
  71. palimpzest/query/execution/random_sampling_execution_strategy.py +0 -240
  72. palimpzest/query/generators/api_client_factory.py +0 -30
  73. palimpzest/query/operators/code_synthesis_convert.py +0 -488
  74. palimpzest/query/operators/map.py +0 -130
  75. palimpzest/query/processor/nosentinel_processor.py +0 -33
  76. palimpzest/query/processor/processing_strategy_type.py +0 -28
  77. palimpzest/query/processor/sentinel_processor.py +0 -88
  78. palimpzest/query/processor/streaming_processor.py +0 -149
  79. palimpzest/sets.py +0 -405
  80. palimpzest/utils/datareader_helpers.py +0 -61
  81. palimpzest/utils/demo_helpers.py +0 -75
  82. palimpzest/utils/field_helpers.py +0 -69
  83. palimpzest/utils/generation_helpers.py +0 -69
  84. palimpzest/utils/sandbox.py +0 -183
  85. palimpzest-0.7.21.dist-info/RECORD +0 -95
  86. /palimpzest/core/{elements/index.py → data/index_dataset.py} +0 -0
  87. {palimpzest-0.7.21.dist-info → palimpzest-0.8.1.dist-info}/WHEEL +0 -0
  88. {palimpzest-0.7.21.dist-info → palimpzest-0.8.1.dist-info}/licenses/LICENSE +0 -0
  89. {palimpzest-0.7.21.dist-info → palimpzest-0.8.1.dist-info}/top_level.txt +0 -0
palimpzest/constants.py CHANGED
@@ -10,77 +10,125 @@ class Model(str, Enum):
  which requires invoking an LLM. It does NOT specify whether the model need be executed
  remotely or locally (if applicable).
  """
- LLAMA3_2_3B = "meta-llama/Llama-3.2-3B-Instruct-Turbo"
- LLAMA3_1_8B = "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo"
- LLAMA3_3_70B = "meta-llama/Llama-3.3-70B-Instruct-Turbo"
- LLAMA3_2_90B_V = "meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo"
- MIXTRAL = "mistralai/Mixtral-8x7B-Instruct-v0.1"
- DEEPSEEK_V3 = "deepseek-ai/DeepSeek-V3"
- DEEPSEEK_R1_DISTILL_QWEN_1_5B = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
- GPT_4o = "gpt-4o-2024-08-06"
- GPT_4o_MINI = "gpt-4o-mini-2024-07-18"
+ LLAMA3_2_3B = "together_ai/meta-llama/Llama-3.2-3B-Instruct-Turbo"
+ LLAMA3_1_8B = "together_ai/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo"
+ LLAMA3_3_70B = "together_ai/meta-llama/Llama-3.3-70B-Instruct-Turbo"
+ LLAMA3_2_90B_V = "together_ai/meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo"
+ DEEPSEEK_V3 = "together_ai/deepseek-ai/DeepSeek-V3"
+ DEEPSEEK_R1_DISTILL_QWEN_1_5B = "together_ai/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
+ GPT_4o = "openai/gpt-4o-2024-08-06"
+ GPT_4o_MINI = "openai/gpt-4o-mini-2024-07-18"
+ GPT_4_1 = "openai/gpt-4.1-2025-04-14"
+ GPT_4_1_MINI = "openai/gpt-4.1-mini-2025-04-14"
+ GPT_4_1_NANO = "openai/gpt-4.1-nano-2025-04-14"
+ GPT_5 = "openai/gpt-5-2025-08-07"
+ GPT_5_MINI = "openai/gpt-5-mini-2025-08-07"
+ GPT_5_NANO = "openai/gpt-5-nano-2025-08-07"
+ o4_MINI = "openai/o4-mini-2025-04-16" # noqa: N815
  TEXT_EMBEDDING_3_SMALL = "text-embedding-3-small"
  CLIP_VIT_B_32 = "clip-ViT-B-32"
+ CLAUDE_3_5_SONNET = "anthropic/claude-3-5-sonnet-20241022"
+ CLAUDE_3_7_SONNET = "anthropic/claude-3-7-sonnet-20250219"
+ CLAUDE_3_5_HAIKU = "anthropic/claude-3-5-haiku-20241022"
+ GEMINI_2_0_FLASH = "vertex_ai/gemini-2.0-flash"
+ GEMINI_2_5_FLASH = "vertex_ai/gemini-2.5-flash"
+ GEMINI_2_5_PRO = "vertex_ai/gemini-2.5-pro"
+ GOOGLE_GEMINI_2_5_FLASH = "google/gemini-2.5-flash"
+ GOOGLE_GEMINI_2_5_FLASH_LITE = "google/gemini-2.5-flash-lite"
+ GOOGLE_GEMINI_2_5_PRO = "google/gemini-2.5-pro"
+ LLAMA_4_MAVERICK = "vertex_ai/meta/llama-4-maverick-17b-128e-instruct-maas"
+ GPT_4o_AUDIO_PREVIEW = "openai/gpt-4o-audio-preview"
+ GPT_4o_MINI_AUDIO_PREVIEW = "openai/gpt-4o-mini-audio-preview"
+ VLLM_QWEN_1_5_0_5B_CHAT = "hosted_vllm/qwen/Qwen1.5-0.5B-Chat"
  # o1 = "o1-2024-12-17"

  def __repr__(self):
  return f"{self.name}"

- def is_deepseek_model(self):
- return "deepseek" in self.value.lower()
-
  def is_llama_model(self):
  return "llama" in self.value.lower()

- def is_mixtral_model(self):
- return "mixtral" in self.value.lower()
-
  def is_clip_model(self):
  return "clip" in self.value.lower()

  def is_together_model(self):
- is_llama_model = self.is_llama_model()
- is_mixtral_model = self.is_mixtral_model()
- is_deepseek_model = self.is_deepseek_model()
- is_clip_model = self.is_clip_model()
- return is_llama_model or is_mixtral_model or is_deepseek_model or is_clip_model
-
- def is_gpt_4o_model(self):
- return "gpt-4o" in self.value.lower()
-
- def is_o1_model(self):
- return "o1" in self.value.lower()
+ return "together_ai" in self.value.lower() or self.is_clip_model()

  def is_text_embedding_model(self):
  return "text-embedding" in self.value.lower()

+ def is_o_model(self):
+ return self in [Model.o4_MINI]
+
+ def is_gpt_5_model(self):
+ return self in [Model.GPT_5, Model.GPT_5_MINI, Model.GPT_5_NANO]
+
  def is_openai_model(self):
- is_gpt4_model = self.is_gpt_4o_model()
- is_o1_model = self.is_o1_model()
- is_text_embedding_model = self.is_text_embedding_model()
- return is_gpt4_model or is_o1_model or is_text_embedding_model
+ return "openai" in self.value.lower() or self.is_text_embedding_model()
+
+ def is_anthropic_model(self):
+ return "anthropic" in self.value.lower()
+
+ def is_vertex_model(self):
+ return "vertex_ai" in self.value.lower()
+
+ def is_google_model(self):
+ return "google" in self.value.lower()
+
+ def is_vllm_model(self):
+ return "hosted_vllm" in self.value.lower()

+ def is_reasoning_model(self):
+ reasoning_models = [
+ Model.GPT_5, Model.GPT_5_MINI, Model.GPT_5_NANO, Model.o4_MINI,
+ Model.GEMINI_2_5_PRO, Model.GEMINI_2_5_FLASH,
+ Model.GOOGLE_GEMINI_2_5_PRO, Model.GOOGLE_GEMINI_2_5_FLASH, Model.GOOGLE_GEMINI_2_5_FLASH_LITE,
+ Model.CLAUDE_3_7_SONNET,
+ ]
+ return self in reasoning_models
+
+ def is_text_model(self):
+ non_text_models = [
+ Model.LLAMA3_2_90B_V,
+ Model.CLIP_VIT_B_32, Model.TEXT_EMBEDDING_3_SMALL,
+ Model.GPT_4o_AUDIO_PREVIEW, Model.GPT_4o_MINI_AUDIO_PREVIEW,
+ ]
+ return self not in non_text_models
+
+ # TODO: I think SONNET and HAIKU are vision-capable too
  def is_vision_model(self):
- vision_models = [
- "meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo",
- "gpt-4o-2024-08-06",
- "gpt-4o-mini-2024-07-18",
- "o1-2024-12-17",
+ return self in [
+ Model.LLAMA3_2_90B_V, Model.LLAMA_4_MAVERICK,
+ Model.GPT_4o, Model.GPT_4o_MINI, Model.GPT_4_1, Model.GPT_4_1_MINI, Model.GPT_4_1_NANO, Model.o4_MINI, Model.GPT_5, Model.GPT_5_MINI, Model.GPT_5_NANO,
+ Model.GEMINI_2_0_FLASH, Model.GEMINI_2_5_FLASH, Model.GEMINI_2_5_PRO,
+ Model.GOOGLE_GEMINI_2_5_PRO, Model.GOOGLE_GEMINI_2_5_FLASH, Model.GOOGLE_GEMINI_2_5_FLASH_LITE,
  ]
- return self.value in vision_models

- def is_embedding_model(self):
- is_clip_model = self.is_clip_model()
- is_text_embedding_model = self.is_text_embedding_model()
- return is_clip_model or is_text_embedding_model
+ def is_audio_model(self):
+ return self in [
+ Model.GPT_4o_AUDIO_PREVIEW, Model.GPT_4o_MINI_AUDIO_PREVIEW,
+ Model.GEMINI_2_0_FLASH, Model.GEMINI_2_5_FLASH, Model.GEMINI_2_5_PRO,
+ Model.GOOGLE_GEMINI_2_5_PRO, Model.GOOGLE_GEMINI_2_5_FLASH, Model.GOOGLE_GEMINI_2_5_FLASH_LITE,
+ ]

- class APIClient(str, Enum):
- """
- APIClient describes the API client to be used when invoking an LLM.
- """
+ def is_text_image_multimodal_model(self):
+ return self in [
+ Model.LLAMA_4_MAVERICK,
+ Model.GPT_4o, Model.GPT_4o_MINI, Model.GPT_4_1, Model.GPT_4_1_MINI, Model.GPT_4_1_NANO, Model.o4_MINI, Model.GPT_5, Model.GPT_5_MINI, Model.GPT_5_NANO,
+ Model.GEMINI_2_0_FLASH, Model.GEMINI_2_5_FLASH, Model.GEMINI_2_5_PRO,
+ Model.GOOGLE_GEMINI_2_5_PRO, Model.GOOGLE_GEMINI_2_5_FLASH, Model.GOOGLE_GEMINI_2_5_FLASH_LITE,
+ ]
+
+ def is_text_audio_multimodal_model(self):
+ return self in [
+ Model.GPT_4o_AUDIO_PREVIEW, Model.GPT_4o_MINI_AUDIO_PREVIEW,
+ Model.GEMINI_2_0_FLASH, Model.GEMINI_2_5_FLASH, Model.GEMINI_2_5_PRO,
+ Model.GOOGLE_GEMINI_2_5_PRO, Model.GOOGLE_GEMINI_2_5_FLASH, Model.GOOGLE_GEMINI_2_5_FLASH_LITE,
+ ]
+
+ def is_embedding_model(self):
+ return self in [Model.CLIP_VIT_B_32, Model.TEXT_EMBEDDING_3_SMALL]

- OPENAI = "openai"
- TOGETHER = "together"

  class PromptStrategy(str, Enum):
  """
@@ -90,28 +138,48 @@ class PromptStrategy(str, Enum):

  # Chain-of-Thought Boolean Prompt Strategies
  COT_BOOL = "chain-of-thought-bool"
+ COT_BOOL_NO_REASONING = "chain-of-thought-bool-no-reasoning"
  # COT_BOOL_CRITIC = "chain-of-thought-bool-critic"
  # COT_BOOL_REFINE = "chain-of-thought-bool-refine"

  # Chain-of-Thought Boolean with Image Prompt Strategies
  COT_BOOL_IMAGE = "chain-of-thought-bool-image"
+ COT_BOOL_IMAGE_NO_REASONING = "chain-of-thought-bool-image"
+ COT_BOOL_AUDIO = "chain-of-thought-bool-audio"
+ COT_BOOL_AUDIO_NO_REASONING = "chain-of-thought-bool-audio"
  # COT_BOOL_IMAGE_CRITIC = "chain-of-thought-bool-image-critic"
  # COT_BOOL_IMAGE_REFINE = "chain-of-thought-bool-image-refine"

+ # Chain-of-Thought Join Prompt Strategies
+ COT_JOIN = "chain-of-thought-join"
+ COT_JOIN_NO_REASONING = "chain-of-thought-join-no-reasoning"
+ COT_JOIN_IMAGE = "chain-of-thought-join-image"
+ COT_JOIN_IMAGE_NO_REASONING = "chain-of-thought-join-image-no-reasoning"
+ COT_JOIN_AUDIO = "chain-of-thought-join-audio"
+ COT_JOIN_AUDIO_NO_REASONING = "chain-of-thought-join-audio-no-reasoning"
+
  # Chain-of-Thought Question Answering Prompt Strategies
  COT_QA = "chain-of-thought-question"
+ COT_QA_NO_REASONING = "chain-of-thought-question-no-reasoning"
  COT_QA_CRITIC = "chain-of-thought-question-critic"
  COT_QA_REFINE = "chain-of-thought-question-refine"

  # Chain-of-Thought Question with Image Prompt Strategies
  COT_QA_IMAGE = "chain-of-thought-question-image"
+ COT_QA_IMAGE_NO_REASONING = "chain-of-thought-question-image-no-reasoning"
  COT_QA_IMAGE_CRITIC = "chain-of-thought-question-critic-image"
  COT_QA_IMAGE_REFINE = "chain-of-thought-question-refine-image"

+ # Chain-of-Thought Queestion with Audio Prompt Strategies
+ COT_QA_AUDIO = "chain-of-thought-question-audio"
+ COT_QA_AUDIO_NO_REASONING = "chain-of-thought-question-audio-no-reasoning"
+ # TODO: COT_QA_AUDIO_CRITIC/REFINE
+
  # Mixture-of-Agents Prompt Strategies
  COT_MOA_PROPOSER = "chain-of-thought-mixture-of-agents-proposer"
  COT_MOA_PROPOSER_IMAGE = "chain-of-thought-mixture-of-agents-proposer-image"
  COT_MOA_AGG = "chain-of-thought-mixture-of-agents-aggregation"
+ # TODO: COT_MOA_PROPOSER_AUDIO

  # Split Convert Prompt Strategies
  SPLIT_PROPOSER = "split-proposer"
@@ -120,11 +188,17 @@ class PromptStrategy(str, Enum):
  def is_image_prompt(self):
  return "image" in self.value

+ def is_audio_prompt(self):
+ return "audio" in self.value
+
  def is_bool_prompt(self):
  return "bool" in self.value

+ def is_join_prompt(self):
+ return "join" in self.value
+
  def is_convert_prompt(self):
- return "bool" not in self.value
+ return "bool" not in self.value and "join" not in self.value

  def is_critic_prompt(self):
  return "critic" in self.value
@@ -144,6 +218,9 @@ class PromptStrategy(str, Enum):
  def is_split_merger_prompt(self):
  return "split-merger" in self.value

+ def is_no_reasoning_prompt(self):
+ return "no-reasoning" in self.value
+
  class AggFunc(str, Enum):
  COUNT = "count"
  AVERAGE = "average"
@@ -169,6 +246,7 @@ class PickOutputStrategy(str, Enum):
  ENSEMBLE = "ensemble"


+ AUDIO_EXTENSIONS = [".wav"]
  IMAGE_EXTENSIONS = [".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff"]
  PDF_EXTENSIONS = [".pdf"]
  XLS_EXTENSIONS = [".xls", ".xlsx"]
@@ -210,8 +288,8 @@ NAIVE_BYTES_PER_RECORD = 1024
  # Rough conversion from # of characters --> # of tokens; assumes 1 token ~= 4 chars
  TOKENS_PER_CHARACTER = 0.25

- # Rough estimate of the number of tokens the context is allowed to take up for MIXTRAL and LLAMA3 models
- MIXTRAL_LLAMA_CONTEXT_TOKENS_LIMIT = 6000
+ # Rough estimate of the number of tokens the context is allowed to take up for LLAMA3 models
+ LLAMA_CONTEXT_TOKENS_LIMIT = 6000

  # a naive estimate for the input record size
  NAIVE_EST_SOURCE_RECORD_SIZE_IN_BYTES = 1_000_000
@@ -219,6 +297,9 @@ NAIVE_EST_SOURCE_RECORD_SIZE_IN_BYTES = 1_000_000
  # a naive estimate for filter selectivity
  NAIVE_EST_FILTER_SELECTIVITY = 0.5

+ # a naive estimate for join selectivity
+ NAIVE_EST_JOIN_SELECTIVITY = 0.5
+
  # a naive estimate for the number of input tokens processed per record
  NAIVE_EST_NUM_INPUT_TOKENS = 1000

@@ -246,6 +327,7 @@ LOG_LLM_OUTPUT = False
  # - in the future we should split quality for vision vs. multi-modal vs. text
  # - code quality was computed using HumanEval, but that benchmark is too easy and should be replaced.
  # - https://huggingface.co/spaces/TIGER-Lab/MMLU-Pro
+ # - https://www.vals.ai/benchmarks/mmlu_pro-08-12-2025
  #
  # Cost is presented in terms of USD / token for input tokens and USD / token for
  # generated tokens.
@@ -260,75 +342,74 @@ LLAMA3_2_3B_INSTRUCT_MODEL_CARD = {
  "usd_per_input_token": 0.06 / 1e6,
  "usd_per_output_token": 0.06 / 1e6,
  ##### Time #####
- "seconds_per_output_token": 0.0064,
+ "seconds_per_output_token": 0.0079,
  ##### Agg. Benchmark #####
  "overall": 36.50, # https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct/discussions/13
- ##### Code #####
- "code": 0.0,
  }
  LLAMA3_1_8B_INSTRUCT_MODEL_CARD = {
  ##### Cost in USD #####
  "usd_per_input_token": 0.18 / 1e6,
  "usd_per_output_token": 0.18 / 1e6,
  ##### Time #####
- "seconds_per_output_token": 0.0059,
+ "seconds_per_output_token": 0.0050,
  ##### Agg. Benchmark #####
  "overall": 44.25,
- ##### Code #####
- "code": 72.6,
  }
  LLAMA3_3_70B_INSTRUCT_MODEL_CARD = {
  ##### Cost in USD #####
  "usd_per_input_token": 0.88 / 1e6,
  "usd_per_output_token": 0.88 / 1e6,
  ##### Time #####
- "seconds_per_output_token": 0.0139,
+ "seconds_per_output_token": 0.0122,
  ##### Agg. Benchmark #####
- "overall": 65.92,
- ##### Code #####
- "code": 88.4,
+ "overall": 69.9,
  }
  LLAMA3_2_90B_V_MODEL_CARD = {
  ##### Cost in USD #####
  "usd_per_input_token": 1.2 / 1e6,
  "usd_per_output_token": 1.2 / 1e6,
  ##### Time #####
- "seconds_per_output_token": 0.0222,
+ "seconds_per_output_token": 0.0303,
  ##### Agg. Benchmark #####
  "overall": 65.00, # set to be slightly higher than gpt-4o-mini
  }
- MIXTRAL_8X_7B_MODEL_CARD = {
- ##### Cost in USD #####
- "usd_per_input_token": 0.6 / 1e6,
- "usd_per_output_token": 0.6 / 1e6,
- ##### Time #####
- "seconds_per_output_token": 0.0112,
- ##### Agg. Benchmark #####
- "overall": 43.27,
- ##### Code #####
- "code": 40.0,
- }
  DEEPSEEK_V3_MODEL_CARD = {
  ##### Cost in USD #####
  "usd_per_input_token": 1.25 / 1E6,
  "usd_per_output_token": 1.25 / 1E6,
  ##### Time #####
- "seconds_per_output_token": 0.0769,
+ "seconds_per_output_token": 0.0114,
  ##### Agg. Benchmark #####
- "overall": 75.87,
- ##### Code #####
- "code": 92.0,
+ "overall": 73.8,
  }
  DEEPSEEK_R1_DISTILL_QWEN_1_5B_MODEL_CARD = {
  ##### Cost in USD #####
  "usd_per_input_token": 0.18 / 1E6,
  "usd_per_output_token": 0.18 / 1E6,
  ##### Time #####
- "seconds_per_output_token": 0.0026,
+ "seconds_per_output_token": 0.0050, # NOTE: copied to be same as LLAMA3_1_8B_INSTRUCT_MODEL_CARD; need to update when we have data
  ##### Agg. Benchmark #####
  "overall": 39.90, # https://www.reddit.com/r/LocalLLaMA/comments/1iserf9/deepseek_r1_distilled_models_mmlu_pro_benchmarks/
- ##### Code #####
- "code": 0.0,
+ }
+ GPT_4o_AUDIO_PREVIEW_MODEL_CARD = {
+ # NOTE: COPYING OVERALL AND SECONDS_PER_OUTPUT_TOKEN FROM GPT_4o; need to update when we have audio-specific benchmarks
+ ##### Cost in USD #####
+ "usd_per_audio_input_token": 2.5 / 1e6,
+ "usd_per_output_token": 10.0 / 1e6,
+ ##### Time #####
+ "seconds_per_output_token": 0.0080,
+ ##### Agg. Benchmark #####
+ "overall": 74.1,
+ }
+ GPT_4o_MINI_AUDIO_PREVIEW_MODEL_CARD = {
+ # NOTE: COPYING OVERALL AND SECONDS_PER_OUTPUT_TOKEN FROM GPT_4o; need to update when we have audio-specific benchmarks
+ ##### Cost in USD #####
+ "usd_per_audio_input_token": 0.15 / 1e6,
+ "usd_per_output_token": 0.6 / 1e6,
+ ##### Time #####
+ "seconds_per_output_token": 0.0159,
+ ##### Agg. Benchmark #####
+ "overall": 62.7,
  }
  GPT_4o_MODEL_CARD = {
  # NOTE: it is unclear if the same ($ / token) costs can be applied for vision, or if we have to calculate this ourselves
@@ -336,11 +417,9 @@ GPT_4o_MODEL_CARD = {
  "usd_per_input_token": 2.5 / 1e6,
  "usd_per_output_token": 10.0 / 1e6,
  ##### Time #####
- "seconds_per_output_token": 0.0079,
+ "seconds_per_output_token": 0.0080,
  ##### Agg. Benchmark #####
- "overall": 74.68,
- ##### Code #####
- "code": 90.0,
+ "overall": 74.1,
  }
  GPT_4o_MINI_MODEL_CARD = {
  # NOTE: it is unclear if the same ($ / token) costs can be applied for vision, or if we have to calculate this ourselves
@@ -348,24 +427,90 @@ GPT_4o_MINI_MODEL_CARD = {
  "usd_per_input_token": 0.15 / 1e6,
  "usd_per_output_token": 0.6 / 1e6,
  ##### Time #####
- "seconds_per_output_token": 0.0098,
+ "seconds_per_output_token": 0.0159,
+ ##### Agg. Benchmark #####
+ "overall": 62.7,
+ }
+ GPT_4_1_MODEL_CARD = {
+ # NOTE: it is unclear if the same ($ / token) costs can be applied for vision, or if we have to calculate this ourselves
+ ##### Cost in USD #####
+ "usd_per_input_token": 2.0 / 1e6,
+ "usd_per_output_token": 8.0 / 1e6,
+ ##### Time #####
+ "seconds_per_output_token": 0.0076,
+ ##### Agg. Benchmark #####
+ "overall": 80.5,
+ }
+ GPT_4_1_MINI_MODEL_CARD = {
+ # NOTE: it is unclear if the same ($ / token) costs can be applied for vision, or if we have to calculate this ourselves
+ ##### Cost in USD #####
+ "usd_per_input_token": 0.4 / 1e6,
+ "usd_per_output_token": 1.6 / 1e6,
+ ##### Time #####
+ "seconds_per_output_token": 0.0161,
  ##### Agg. Benchmark #####
- "overall": 63.09,
- ##### Code #####
- "code": 86.0,
+ "overall": 77.2,
  }
- o1_MODEL_CARD = { # noqa: N816
+ GPT_4_1_NANO_MODEL_CARD = {
  # NOTE: it is unclear if the same ($ / token) costs can be applied for vision, or if we have to calculate this ourselves
  ##### Cost in USD #####
- "usd_per_input_token": 15 / 1e6,
- "usd_per_output_token": 60 / 1e6,
+ "usd_per_input_token": 0.1 / 1e6,
+ "usd_per_output_token": 0.4 / 1e6,
  ##### Time #####
- "seconds_per_output_token": 0.0110,
+ "seconds_per_output_token": 0.0060,
  ##### Agg. Benchmark #####
- "overall": 89.30,
- ##### Code #####
- "code": 92.3, # NOTE: just copying MMLU score for now
+ "overall": 62.3,
  }
+ GPT_5_MODEL_CARD = {
+ # NOTE: it is unclear if the same ($ / token) costs can be applied for vision, or if we have to calculate this ourselves
+ ##### Cost in USD #####
+ "usd_per_input_token": 1.25 / 1e6,
+ "usd_per_output_token": 10.0 / 1e6,
+ ##### Time #####
+ "seconds_per_output_token": 0.0060,
+ ##### Agg. Benchmark #####
+ "overall": 87.00,
+ }
+ GPT_5_MINI_MODEL_CARD = {
+ # NOTE: it is unclear if the same ($ / token) costs can be applied for vision, or if we have to calculate this ourselves
+ ##### Cost in USD #####
+ "usd_per_input_token": 0.25 / 1e6,
+ "usd_per_output_token": 2.0 / 1e6,
+ ##### Time #####
+ "seconds_per_output_token": 0.0135,
+ ##### Agg. Benchmark #####
+ "overall": 82.50,
+ }
+ GPT_5_NANO_MODEL_CARD = {
+ # NOTE: it is unclear if the same ($ / token) costs can be applied for vision, or if we have to calculate this ourselves
+ ##### Cost in USD #####
+ "usd_per_input_token": 0.05 / 1e6,
+ "usd_per_output_token": 0.4 / 1e6,
+ ##### Time #####
+ "seconds_per_output_token": 0.0055,
+ ##### Agg. Benchmark #####
+ "overall": 77.9,
+ }
+ o4_MINI_MODEL_CARD = { # noqa: N816
+ # NOTE: it is unclear if the same ($ / token) costs can be applied for vision, or if we have to calculate this ourselves
+ ##### Cost in USD #####
+ "usd_per_input_token": 1.1 / 1e6,
+ "usd_per_output_token": 4.4 / 1e6,
+ ##### Time #####
+ "seconds_per_output_token": 0.0092,
+ ##### Agg. Benchmark #####
+ "overall": 80.6, # using number reported for o3-mini; true number is likely higher
+ }
+ # o1_MODEL_CARD = { # noqa: N816
+ # # NOTE: it is unclear if the same ($ / token) costs can be applied for vision, or if we have to calculate this ourselves
+ # ##### Cost in USD #####
+ # "usd_per_input_token": 15 / 1e6,
+ # "usd_per_output_token": 60 / 1e6,
+ # ##### Time #####
+ # "seconds_per_output_token": 0.0110,
+ # ##### Agg. Benchmark #####
+ # "overall": 83.50,
+ # }
  TEXT_EMBEDDING_3_SMALL_MODEL_CARD = {
  ##### Cost in USD #####
  "usd_per_input_token": 0.02 / 1e6,
@@ -384,7 +529,91 @@ CLIP_VIT_B_32_MODEL_CARD = {
  ##### Agg. Benchmark #####
  "overall": 63.3, # NOTE: ImageNet top-1 accuracy
  }
-
+ CLAUDE_3_5_SONNET_MODEL_CARD = {
+ ##### Cost in USD #####
+ "usd_per_input_token": 3.0 / 1e6,
+ "usd_per_output_token": 15.0 / 1e6,
+ ##### Time #####
+ "seconds_per_output_token": 0.0154,
+ ##### Agg. Benchmark #####
+ "overall": 78.4,
+ }
+ CLAUDE_3_7_SONNET_MODEL_CARD = {
+ ##### Cost in USD #####
+ "usd_per_input_token": 3.0 / 1e6,
+ "usd_per_output_token": 15.0 / 1e6,
+ ##### Time #####
+ "seconds_per_output_token": 0.0156,
+ ##### Agg. Benchmark #####
+ "overall": 80.7,
+ }
+ CLAUDE_3_5_HAIKU_MODEL_CARD = {
+ ##### Cost in USD #####
+ "usd_per_input_token": 0.8 / 1e6,
+ "usd_per_output_token": 4.0 / 1e6,
+ ##### Time #####
+ "seconds_per_output_token": 0.0189,
+ ##### Agg. Benchmark #####
+ "overall": 64.1,
+ }
+ GEMINI_2_0_FLASH_MODEL_CARD = {
+ ##### Cost in USD #####
+ "usd_per_input_token": 0.15 / 1e6,
+ "usd_per_output_token": 0.6 / 1e6,
+ "usd_per_audio_input_token": 1.0 / 1e6,
+ ##### Time #####
+ "seconds_per_output_token": 0.0054,
+ ##### Agg. Benchmark #####
+ "overall": 77.40,
+ }
+ GEMINI_2_5_FLASH_LITE_MODEL_CARD = {
+ ##### Cost in USD #####
+ "usd_per_input_token": 0.1 / 1e6,
+ "usd_per_output_token": 0.4 / 1e6,
+ "usd_per_audio_input_token": 0.3 / 1e6,
+ ##### Time #####
+ "seconds_per_output_token": 0.0034,
+ ##### Agg. Benchmark #####
+ "overall": 79.1, # NOTE: interpolated between gemini 2.5 flash and gemini 2.0 flash
+ }
+ GEMINI_2_5_FLASH_MODEL_CARD = {
+ ##### Cost in USD #####
+ "usd_per_input_token": 0.30 / 1e6,
+ "usd_per_output_token": 2.5 / 1e6,
+ "usd_per_audio_input_token": 1.0 / 1e6,
+ ##### Time #####
+ "seconds_per_output_token": 0.0044,
+ ##### Agg. Benchmark #####
+ "overall": 80.75, # NOTE: interpolated between gemini 2.0 flash and gemini 2.5 pro
+ }
+ GEMINI_2_5_PRO_MODEL_CARD = {
+ ##### Cost in USD #####
+ "usd_per_input_token": 1.25 / 1e6,
+ "usd_per_output_token": 10.0 / 1e6,
+ "usd_per_audio_input_token": 1.25 / 1e6,
+ ##### Time #####
+ "seconds_per_output_token": 0.0072,
+ ##### Agg. Benchmark #####
+ "overall": 84.10,
+ }
+ LLAMA_4_MAVERICK_MODEL_CARD = {
+ ##### Cost in USD #####
+ "usd_per_input_token": 0.35 / 1e6,
+ "usd_per_output_token": 1.15 / 1e6,
+ ##### Time #####
+ "seconds_per_output_token": 0.0122,
+ ##### Agg. Benchmark #####
+ "overall": 79.4,
+ }
+ VLLM_QWEN_1_5_0_5B_CHAT_MODEL_CARD = {
+ ##### Cost in USD #####
+ "usd_per_input_token": 0.0 / 1e6,
+ "usd_per_output_token": 0.0 / 1e6,
+ ##### Time #####
+ "seconds_per_output_token": 0.1000, # TODO: fill-in with a better estimate
+ ##### Agg. Benchmark #####
+ "overall": 30.0, # TODO: fill-in with a better estimate
+ }

  MODEL_CARDS = {
  Model.LLAMA3_2_3B.value: LLAMA3_2_3B_INSTRUCT_MODEL_CARD,
@@ -393,124 +622,29 @@ MODEL_CARDS = {
  Model.LLAMA3_2_90B_V.value: LLAMA3_2_90B_V_MODEL_CARD,
  Model.DEEPSEEK_V3.value: DEEPSEEK_V3_MODEL_CARD,
  Model.DEEPSEEK_R1_DISTILL_QWEN_1_5B.value: DEEPSEEK_R1_DISTILL_QWEN_1_5B_MODEL_CARD,
- Model.MIXTRAL.value: MIXTRAL_8X_7B_MODEL_CARD,
  Model.GPT_4o.value: GPT_4o_MODEL_CARD,
  Model.GPT_4o_MINI.value: GPT_4o_MINI_MODEL_CARD,
+ Model.GPT_4o_AUDIO_PREVIEW.value: GPT_4o_AUDIO_PREVIEW_MODEL_CARD,
+ Model.GPT_4o_MINI_AUDIO_PREVIEW.value: GPT_4o_MINI_AUDIO_PREVIEW_MODEL_CARD,
+ Model.GPT_4_1.value: GPT_4_1_MODEL_CARD,
+ Model.GPT_4_1_MINI.value: GPT_4_1_MINI_MODEL_CARD,
+ Model.GPT_4_1_NANO.value: GPT_4_1_NANO_MODEL_CARD,
+ Model.GPT_5.value: GPT_5_MODEL_CARD,
+ Model.GPT_5_MINI.value: GPT_5_MINI_MODEL_CARD,
+ Model.GPT_5_NANO.value: GPT_5_NANO_MODEL_CARD,
+ Model.o4_MINI.value: o4_MINI_MODEL_CARD,
  # Model.o1.value: o1_MODEL_CARD,
  Model.TEXT_EMBEDDING_3_SMALL.value: TEXT_EMBEDDING_3_SMALL_MODEL_CARD,
  Model.CLIP_VIT_B_32.value: CLIP_VIT_B_32_MODEL_CARD,
+ Model.CLAUDE_3_5_SONNET.value: CLAUDE_3_5_SONNET_MODEL_CARD,
+ Model.CLAUDE_3_7_SONNET.value: CLAUDE_3_7_SONNET_MODEL_CARD,
+ Model.CLAUDE_3_5_HAIKU.value: CLAUDE_3_5_HAIKU_MODEL_CARD,
+ Model.GEMINI_2_0_FLASH.value: GEMINI_2_0_FLASH_MODEL_CARD,
+ Model.GEMINI_2_5_FLASH.value: GEMINI_2_5_FLASH_MODEL_CARD,
+ Model.GEMINI_2_5_PRO.value: GEMINI_2_5_PRO_MODEL_CARD,
+ Model.GOOGLE_GEMINI_2_5_FLASH.value: GEMINI_2_5_FLASH_MODEL_CARD,
+ Model.GOOGLE_GEMINI_2_5_FLASH_LITE.value: GEMINI_2_5_FLASH_LITE_MODEL_CARD,
+ Model.GOOGLE_GEMINI_2_5_PRO.value: GEMINI_2_5_PRO_MODEL_CARD,
+ Model.LLAMA_4_MAVERICK.value: LLAMA_4_MAVERICK_MODEL_CARD,
+ Model.VLLM_QWEN_1_5_0_5B_CHAT.value: VLLM_QWEN_1_5_0_5B_CHAT_MODEL_CARD,
  }
-
-
- ###### DEPRECATED ######
- # # NOTE: seconds_per_output_token is based on `gpt-3.5-turbo-1106`
- # GPT_3_5_MODEL_CARD = {
- # ##### Cost in USD #####
- # "usd_per_input_token": 0.5 / 1E6,
- # "usd_per_output_token": 1.5 / 1E6,
- # ##### Time #####
- # "seconds_per_output_token": 0.0065,
- # ##### Agg. Benchmark #####
- # "overall": 70.0, # 5-shot
- # ##### Commonsense Reasoning #####
- # "reasoning": 84.1,
- # ### "HellaSwag": 85.5,^ # 10-shot
- # ### "WinoGrande": 81.6,^ # 5-shot
- # ### "Arc-e": 85.2,^ # 25-shot
- # ##### World Knowledge #####
- # ##### Reading Comprehension #####
- # ### "DROP": 64.1, # 3-shot
- # ##### Code #####
- # "code": 48.1,
- # ### "HumanEval": 48.1,^ # 0-shot
- # ##### Math #####
- # "math": 57.1,
- # ### "GSM8K": 57.1,^ # 5-shot
- # }
- # # NOTE: the seconds_per_output_token was computed based on a slightly different model ('gpt-4-1106-preview')
- # # and the benchmark statistics were computed based on the GPT-4 Technical Report; these might be
- # # slightly innacurate compared to the real numbers for gpt-4-0125-preview, but we'll use them until
- # # we have something better. (The cost metrics are accurate).
- # GPT_4_MODEL_CARD = {
- # ##### Cost in USD #####
- # "usd_per_input_token": 10 / 1E6,
- # "usd_per_output_token": 30 / 1E6,
- # ##### Time #####
- # "seconds_per_output_token": 0.018,
- # ##### Agg. Benchmark #####
- # "overall": 86.4, # 5-shot
- # ##### Commonsense Reasoning #####
- # "reasoning": 93.0,
- # ### "HellaSwag": 95.3,^ # 10-shot
- # ### "WinoGrande": 87.5,^ # 5-shot
- # ### "Arc-e": 96.3,^ # 25-shot
- # ##### World Knowledge #####
- # ##### Reading Comprehension #####
- # ### "DROP": 80.9, # 3-shot
- # ##### Code #####
- # "code": 67.0,
- # ### "HumanEval": 67.0,^ # 0-shot
- # ##### Math #####
- # "math": 92.0,
- # ### "GSM8K": 92.0,^ # 5-shot
- # }
-
- # # TODO: use cost info in here: https://platform.openai.com/docs/guides/vision/calculating-costs
- # GPT_4V_MODEL_CARD = {
- # ##### Cost in USD #####
- # "usd_per_input_token": 10 / 1E6,
- # "usd_per_output_token": 30 / 1E6,
- # ##### Time #####
- # "seconds_per_output_token": 0.042 / 10.0, # TODO: / 10.0 is a hack; need to figure out why time estimates are so off
- # ##### Agg. Benchmark #####
- # "overall": 86.4,
- # }
-
-
- # GEMINI_1_MODEL_CARD = {
- # ##### Cost in USD #####
- # "usd_per_input_token": 125 / 1E8, # Gemini is free but rate limited for now. Pricing will be updated
- # "usd_per_output_token": 375 / 1E9,
- # ##### Time #####
- # "seconds_per_output_token": 0.042 / 10.0, # TODO:
- # ##### Agg. Benchmark #####
- # "overall": 65.0, # 90.0 TODO: we are using the free version of Gemini which is substantially worse than its paid version; I'm manually revising it's quality below that of Mixtral
- # ##### Commonsense Reasoning #####
- # "reasoning": 80.0, # 87.8, TODO: see note above on overall
- # # "HellaSwag": 87.8, # 10-shot
- # ##### World Knowledge #####
- # ##### Reading Comprehension #####
- # # "DROP": 82.4, # Variable shots ?
- # ##### Code #####
- # "code": 74.4,
- # # "HumanEval": 74.4, # 0-shot (IT)*
- # # "Natural2Code": 74.9, # 0-shot
- # ##### Math #####
- # "math": 94.4,
- # # "GSM8K": 94.4, # maj1@32
- # # "MATH": 53.2, # 4-shot
- # }
-
- # GEMINI_1V_MODEL_CARD = {
- # ##### Cost in USD #####
- # "usd_per_input_token": 25 / 1E6, # Gemini is free but rate limited for now. Pricing will be updated
- # "usd_per_output_token": 375 / 1E9,
- # ##### Time #####
- # "seconds_per_output_token": 0.042, # / 10.0, # TODO:
- # ##### Agg. Benchmark #####
- # "overall": 65.0, # 90.0, TODO: see note above in Gemini_1 model card
- # ##### Commonsense Reasoning #####
- # "reasoning": 80.0, # 87.8, TODO: see note above in Gemini_1 model card
- # # "HellaSwag": 87.8, # 10-shot
- # ##### World Knowledge #####
- # ##### Reading Comprehension #####
- # # "DROP": 82.4, # Variable shots ?
- # ##### Code #####
- # "code": 74.4,
- # # "HumanEval": 74.4, # 0-shot (IT)*
- # # "Natural2Code": 74.9, # 0-shot
- # ##### Math #####
- # "math": 94.4,
- # # "GSM8K": 94.4, # maj1@32
- # # "MATH": 53.2, # 4-shot
- # }