palimpzest 0.7.21__py3-none-any.whl → 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87)
  1. palimpzest/__init__.py +37 -6
  2. palimpzest/agents/__init__.py +0 -0
  3. palimpzest/agents/compute_agents.py +0 -0
  4. palimpzest/agents/search_agents.py +637 -0
  5. palimpzest/constants.py +259 -197
  6. palimpzest/core/data/context.py +393 -0
  7. palimpzest/core/data/context_manager.py +163 -0
  8. palimpzest/core/data/dataset.py +634 -0
  9. palimpzest/core/data/{datareaders.py → iter_dataset.py} +202 -126
  10. palimpzest/core/elements/groupbysig.py +16 -13
  11. palimpzest/core/elements/records.py +166 -75
  12. palimpzest/core/lib/schemas.py +152 -390
  13. palimpzest/core/{data/dataclasses.py → models.py} +306 -170
  14. palimpzest/policy.py +2 -27
  15. palimpzest/prompts/__init__.py +35 -5
  16. palimpzest/prompts/agent_prompts.py +357 -0
  17. palimpzest/prompts/context_search.py +9 -0
  18. palimpzest/prompts/convert_prompts.py +61 -5
  19. palimpzest/prompts/filter_prompts.py +50 -5
  20. palimpzest/prompts/join_prompts.py +163 -0
  21. palimpzest/prompts/moa_proposer_convert_prompts.py +5 -5
  22. palimpzest/prompts/prompt_factory.py +358 -46
  23. palimpzest/prompts/validator.py +239 -0
  24. palimpzest/query/execution/all_sample_execution_strategy.py +134 -76
  25. palimpzest/query/execution/execution_strategy.py +210 -317
  26. palimpzest/query/execution/execution_strategy_type.py +5 -7
  27. palimpzest/query/execution/mab_execution_strategy.py +249 -136
  28. palimpzest/query/execution/parallel_execution_strategy.py +153 -244
  29. palimpzest/query/execution/single_threaded_execution_strategy.py +107 -64
  30. palimpzest/query/generators/generators.py +157 -330
  31. palimpzest/query/operators/__init__.py +15 -5
  32. palimpzest/query/operators/aggregate.py +50 -33
  33. palimpzest/query/operators/compute.py +201 -0
  34. palimpzest/query/operators/convert.py +27 -21
  35. palimpzest/query/operators/critique_and_refine_convert.py +7 -5
  36. palimpzest/query/operators/distinct.py +62 -0
  37. palimpzest/query/operators/filter.py +22 -13
  38. palimpzest/query/operators/join.py +402 -0
  39. palimpzest/query/operators/limit.py +3 -3
  40. palimpzest/query/operators/logical.py +198 -80
  41. palimpzest/query/operators/mixture_of_agents_convert.py +10 -8
  42. palimpzest/query/operators/physical.py +27 -21
  43. palimpzest/query/operators/project.py +3 -3
  44. palimpzest/query/operators/rag_convert.py +7 -7
  45. palimpzest/query/operators/retrieve.py +9 -9
  46. palimpzest/query/operators/scan.py +81 -42
  47. palimpzest/query/operators/search.py +524 -0
  48. palimpzest/query/operators/split_convert.py +10 -8
  49. palimpzest/query/optimizer/__init__.py +7 -9
  50. palimpzest/query/optimizer/cost_model.py +108 -441
  51. palimpzest/query/optimizer/optimizer.py +123 -181
  52. palimpzest/query/optimizer/optimizer_strategy.py +66 -61
  53. palimpzest/query/optimizer/plan.py +352 -67
  54. palimpzest/query/optimizer/primitives.py +43 -19
  55. palimpzest/query/optimizer/rules.py +484 -646
  56. palimpzest/query/optimizer/tasks.py +127 -58
  57. palimpzest/query/processor/config.py +41 -76
  58. palimpzest/query/processor/query_processor.py +73 -18
  59. palimpzest/query/processor/query_processor_factory.py +46 -38
  60. palimpzest/schemabuilder/schema_builder.py +15 -28
  61. palimpzest/utils/model_helpers.py +27 -77
  62. palimpzest/utils/progress.py +114 -102
  63. palimpzest/validator/__init__.py +0 -0
  64. palimpzest/validator/validator.py +306 -0
  65. {palimpzest-0.7.21.dist-info → palimpzest-0.8.0.dist-info}/METADATA +6 -1
  66. palimpzest-0.8.0.dist-info/RECORD +95 -0
  67. palimpzest/core/lib/fields.py +0 -141
  68. palimpzest/prompts/code_synthesis_prompts.py +0 -28
  69. palimpzest/query/execution/random_sampling_execution_strategy.py +0 -240
  70. palimpzest/query/generators/api_client_factory.py +0 -30
  71. palimpzest/query/operators/code_synthesis_convert.py +0 -488
  72. palimpzest/query/operators/map.py +0 -130
  73. palimpzest/query/processor/nosentinel_processor.py +0 -33
  74. palimpzest/query/processor/processing_strategy_type.py +0 -28
  75. palimpzest/query/processor/sentinel_processor.py +0 -88
  76. palimpzest/query/processor/streaming_processor.py +0 -149
  77. palimpzest/sets.py +0 -405
  78. palimpzest/utils/datareader_helpers.py +0 -61
  79. palimpzest/utils/demo_helpers.py +0 -75
  80. palimpzest/utils/field_helpers.py +0 -69
  81. palimpzest/utils/generation_helpers.py +0 -69
  82. palimpzest/utils/sandbox.py +0 -183
  83. palimpzest-0.7.21.dist-info/RECORD +0 -95
  84. /palimpzest/core/{elements/index.py → data/index_dataset.py} +0 -0
  85. {palimpzest-0.7.21.dist-info → palimpzest-0.8.0.dist-info}/WHEEL +0 -0
  86. {palimpzest-0.7.21.dist-info → palimpzest-0.8.0.dist-info}/licenses/LICENSE +0 -0
  87. {palimpzest-0.7.21.dist-info → palimpzest-0.8.0.dist-info}/top_level.txt +0 -0
palimpzest/constants.py CHANGED
@@ -10,77 +10,110 @@ class Model(str, Enum):
     which requires invoking an LLM. It does NOT specify whether the model need be executed
     remotely or locally (if applicable).
     """
-    LLAMA3_2_3B = "meta-llama/Llama-3.2-3B-Instruct-Turbo"
-    LLAMA3_1_8B = "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo"
-    LLAMA3_3_70B = "meta-llama/Llama-3.3-70B-Instruct-Turbo"
-    LLAMA3_2_90B_V = "meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo"
-    MIXTRAL = "mistralai/Mixtral-8x7B-Instruct-v0.1"
-    DEEPSEEK_V3 = "deepseek-ai/DeepSeek-V3"
-    DEEPSEEK_R1_DISTILL_QWEN_1_5B = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
-    GPT_4o = "gpt-4o-2024-08-06"
-    GPT_4o_MINI = "gpt-4o-mini-2024-07-18"
+    LLAMA3_2_3B = "together_ai/meta-llama/Llama-3.2-3B-Instruct-Turbo"
+    LLAMA3_1_8B = "together_ai/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo"
+    LLAMA3_3_70B = "together_ai/meta-llama/Llama-3.3-70B-Instruct-Turbo"
+    LLAMA3_2_90B_V = "together_ai/meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo"
+    DEEPSEEK_V3 = "together_ai/deepseek-ai/DeepSeek-V3"
+    DEEPSEEK_R1_DISTILL_QWEN_1_5B = "together_ai/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
+    GPT_4o = "openai/gpt-4o-2024-08-06"
+    GPT_4o_MINI = "openai/gpt-4o-mini-2024-07-18"
+    GPT_5 = "openai/gpt-5"
+    GPT_5_MINI = "openai/gpt-5-mini"
+    o4_MINI = "openai/o4-mini-2025-04-16" # noqa: N815
     TEXT_EMBEDDING_3_SMALL = "text-embedding-3-small"
     CLIP_VIT_B_32 = "clip-ViT-B-32"
+    CLAUDE_3_5_SONNET = "anthropic/claude-3-5-sonnet-20241022"
+    CLAUDE_3_7_SONNET = "anthropic/claude-3-7-sonnet-20250219"
+    CLAUDE_3_5_HAIKU = "anthropic/claude-3-5-haiku-20241022"
+    GEMINI_2_0_FLASH = "vertex_ai/gemini-2.0-flash"
+    GEMINI_2_5_FLASH = "vertex_ai/gemini-2.5-flash"
+    GEMINI_2_5_PRO = "vertex_ai/gemini-2.5-pro"
+    LLAMA_4_MAVERICK = "vertex_ai/meta/llama-4-maverick-17b-128e-instruct-maas"
+    GPT_4o_AUDIO_PREVIEW = "openai/gpt-4o-audio-preview"
+    GPT_4o_MINI_AUDIO_PREVIEW = "openai/gpt-4o-mini-audio-preview"
+    VLLM_QWEN_1_5_0_5B_CHAT = "hosted_vllm/qwen/Qwen1.5-0.5B-Chat"
     # o1 = "o1-2024-12-17"
 
     def __repr__(self):
         return f"{self.name}"
 
-    def is_deepseek_model(self):
-        return "deepseek" in self.value.lower()
-
     def is_llama_model(self):
         return "llama" in self.value.lower()
 
-    def is_mixtral_model(self):
-        return "mixtral" in self.value.lower()
-
     def is_clip_model(self):
         return "clip" in self.value.lower()
 
     def is_together_model(self):
-        is_llama_model = self.is_llama_model()
-        is_mixtral_model = self.is_mixtral_model()
-        is_deepseek_model = self.is_deepseek_model()
-        is_clip_model = self.is_clip_model()
-        return is_llama_model or is_mixtral_model or is_deepseek_model or is_clip_model
-
-    def is_gpt_4o_model(self):
-        return "gpt-4o" in self.value.lower()
-
-    def is_o1_model(self):
-        return "o1" in self.value.lower()
+        return "together_ai" in self.value.lower() or self.is_clip_model()
 
     def is_text_embedding_model(self):
         return "text-embedding" in self.value.lower()
 
+    def is_o_model(self):
+        return self in [Model.o4_MINI]
+
+    def is_gpt_5_model(self):
+        return self in [Model.GPT_5, Model.GPT_5_MINI]
+
     def is_openai_model(self):
-        is_gpt4_model = self.is_gpt_4o_model()
-        is_o1_model = self.is_o1_model()
-        is_text_embedding_model = self.is_text_embedding_model()
-        return is_gpt4_model or is_o1_model or is_text_embedding_model
+        return "openai" in self.value.lower() or self.is_text_embedding_model()
+
+    def is_anthropic_model(self):
+        return "anthropic" in self.value.lower()
+
+    def is_vertex_model(self):
+        return "vertex_ai" in self.value.lower()
+
+    def is_vllm_model(self):
+        return "hosted_vllm" in self.value.lower()
+
+    def is_reasoning_model(self):
+        reasoning_models = [
+            Model.GPT_5, Model.GPT_5_MINI, Model.o4_MINI,
+            Model.GEMINI_2_5_PRO, Model.GEMINI_2_5_FLASH,
+            Model.CLAUDE_3_7_SONNET,
+        ]
+        return self in reasoning_models
+
+    def is_text_model(self):
+        non_text_models = [
+            Model.LLAMA3_2_90B_V,
+            Model.CLIP_VIT_B_32, Model.TEXT_EMBEDDING_3_SMALL,
+            Model.GPT_4o_AUDIO_PREVIEW, Model.GPT_4o_MINI_AUDIO_PREVIEW,
+        ]
+        return self not in non_text_models
 
+    # TODO: I think SONNET and HAIKU are vision-capable too
     def is_vision_model(self):
-        vision_models = [
-            "meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo",
-            "gpt-4o-2024-08-06",
-            "gpt-4o-mini-2024-07-18",
-            "o1-2024-12-17",
+        return self in [
+            Model.LLAMA3_2_90B_V, Model.LLAMA_4_MAVERICK,
+            Model.GPT_4o, Model.GPT_4o_MINI, Model.o4_MINI, Model.GPT_5, Model.GPT_5_MINI,
+            Model.GEMINI_2_0_FLASH, Model.GEMINI_2_5_FLASH, Model.GEMINI_2_5_PRO,
         ]
-        return self.value in vision_models
 
-    def is_embedding_model(self):
-        is_clip_model = self.is_clip_model()
-        is_text_embedding_model = self.is_text_embedding_model()
-        return is_clip_model or is_text_embedding_model
+    def is_audio_model(self):
+        return self in [
+            Model.GPT_4o_AUDIO_PREVIEW, Model.GPT_4o_MINI_AUDIO_PREVIEW,
+            Model.GEMINI_2_0_FLASH, Model.GEMINI_2_5_FLASH, Model.GEMINI_2_5_PRO,
+        ]
 
-class APIClient(str, Enum):
-    """
-    APIClient describes the API client to be used when invoking an LLM.
-    """
+    def is_text_image_multimodal_model(self):
+        return self in [
+            Model.LLAMA_4_MAVERICK,
+            Model.GPT_4o, Model.GPT_4o_MINI, Model.o4_MINI, Model.GPT_5, Model.GPT_5_MINI,
+            Model.GEMINI_2_0_FLASH, Model.GEMINI_2_5_FLASH, Model.GEMINI_2_5_PRO,
+        ]
+
+    def is_text_audio_multimodal_model(self):
+        return self in [
+            Model.GPT_4o_AUDIO_PREVIEW, Model.GPT_4o_MINI_AUDIO_PREVIEW,
+            Model.GEMINI_2_0_FLASH, Model.GEMINI_2_5_FLASH, Model.GEMINI_2_5_PRO,
+        ]
+
+    def is_embedding_model(self):
+        return self in [Model.CLIP_VIT_B_32, Model.TEXT_EMBEDDING_3_SMALL]
 
-    OPENAI = "openai"
-    TOGETHER = "together"
 
 class PromptStrategy(str, Enum):
     """
@@ -90,28 +123,48 @@ class PromptStrategy(str, Enum):
 
     # Chain-of-Thought Boolean Prompt Strategies
     COT_BOOL = "chain-of-thought-bool"
+    COT_BOOL_NO_REASONING = "chain-of-thought-bool-no-reasoning"
     # COT_BOOL_CRITIC = "chain-of-thought-bool-critic"
     # COT_BOOL_REFINE = "chain-of-thought-bool-refine"
 
     # Chain-of-Thought Boolean with Image Prompt Strategies
     COT_BOOL_IMAGE = "chain-of-thought-bool-image"
+    COT_BOOL_IMAGE_NO_REASONING = "chain-of-thought-bool-image-no-reasoning"
+    COT_BOOL_AUDIO = "chain-of-thought-bool-audio"
+    COT_BOOL_AUDIO_NO_REASONING = "chain-of-thought-bool-audio-no-reasoning"
     # COT_BOOL_IMAGE_CRITIC = "chain-of-thought-bool-image-critic"
     # COT_BOOL_IMAGE_REFINE = "chain-of-thought-bool-image-refine"
 
+    # Chain-of-Thought Join Prompt Strategies
+    COT_JOIN = "chain-of-thought-join"
+    COT_JOIN_NO_REASONING = "chain-of-thought-join-no-reasoning"
+    COT_JOIN_IMAGE = "chain-of-thought-join-image"
+    COT_JOIN_IMAGE_NO_REASONING = "chain-of-thought-join-image-no-reasoning"
+    COT_JOIN_AUDIO = "chain-of-thought-join-audio"
+    COT_JOIN_AUDIO_NO_REASONING = "chain-of-thought-join-audio-no-reasoning"
+
     # Chain-of-Thought Question Answering Prompt Strategies
     COT_QA = "chain-of-thought-question"
+    COT_QA_NO_REASONING = "chain-of-thought-question-no-reasoning"
     COT_QA_CRITIC = "chain-of-thought-question-critic"
     COT_QA_REFINE = "chain-of-thought-question-refine"
 
     # Chain-of-Thought Question with Image Prompt Strategies
     COT_QA_IMAGE = "chain-of-thought-question-image"
+    COT_QA_IMAGE_NO_REASONING = "chain-of-thought-question-image-no-reasoning"
     COT_QA_IMAGE_CRITIC = "chain-of-thought-question-critic-image"
     COT_QA_IMAGE_REFINE = "chain-of-thought-question-refine-image"
 
+    # Chain-of-Thought Question with Audio Prompt Strategies
+    COT_QA_AUDIO = "chain-of-thought-question-audio"
+    COT_QA_AUDIO_NO_REASONING = "chain-of-thought-question-audio-no-reasoning"
+    # TODO: COT_QA_AUDIO_CRITIC/REFINE
+
     # Mixture-of-Agents Prompt Strategies
     COT_MOA_PROPOSER = "chain-of-thought-mixture-of-agents-proposer"
     COT_MOA_PROPOSER_IMAGE = "chain-of-thought-mixture-of-agents-proposer-image"
     COT_MOA_AGG = "chain-of-thought-mixture-of-agents-aggregation"
+    # TODO: COT_MOA_PROPOSER_AUDIO
 
     # Split Convert Prompt Strategies
     SPLIT_PROPOSER = "split-proposer"
@@ -120,11 +173,17 @@ class PromptStrategy(str, Enum):
     def is_image_prompt(self):
         return "image" in self.value
 
+    def is_audio_prompt(self):
+        return "audio" in self.value
+
     def is_bool_prompt(self):
         return "bool" in self.value
 
+    def is_join_prompt(self):
+        return "join" in self.value
+
     def is_convert_prompt(self):
-        return "bool" not in self.value
+        return "bool" not in self.value and "join" not in self.value
 
     def is_critic_prompt(self):
         return "critic" in self.value
@@ -144,6 +203,9 @@ class PromptStrategy(str, Enum):
     def is_split_merger_prompt(self):
         return "split-merger" in self.value
 
+    def is_no_reasoning_prompt(self):
+        return "no-reasoning" in self.value
+
 class AggFunc(str, Enum):
     COUNT = "count"
     AVERAGE = "average"
@@ -169,6 +231,7 @@ class PickOutputStrategy(str, Enum):
     ENSEMBLE = "ensemble"
 
 
+AUDIO_EXTENSIONS = [".wav"]
 IMAGE_EXTENSIONS = [".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff"]
 PDF_EXTENSIONS = [".pdf"]
 XLS_EXTENSIONS = [".xls", ".xlsx"]
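These extension lists presumably let the data-reading code infer a file's modality from its suffix; 0.8.0 adds audio (WAV only, for now). A sketch of the dispatch this enables (classify_file is illustrative, not a package API):

    from pathlib import Path
    from palimpzest.constants import AUDIO_EXTENSIONS, IMAGE_EXTENSIONS, PDF_EXTENSIONS

    def classify_file(path: str) -> str:
        """Illustrative helper: map a file path to a coarse modality."""
        suffix = Path(path).suffix.lower()
        if suffix in AUDIO_EXTENSIONS:
            return "audio"
        if suffix in IMAGE_EXTENSIONS:
            return "image"
        if suffix in PDF_EXTENSIONS:
            return "pdf"
        return "text"

    assert classify_file("meeting.WAV") == "audio"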
@@ -210,8 +273,8 @@ NAIVE_BYTES_PER_RECORD = 1024
 # Rough conversion from # of characters --> # of tokens; assumes 1 token ~= 4 chars
 TOKENS_PER_CHARACTER = 0.25
 
-# Rough estimate of the number of tokens the context is allowed to take up for MIXTRAL and LLAMA3 models
-MIXTRAL_LLAMA_CONTEXT_TOKENS_LIMIT = 6000
+# Rough estimate of the number of tokens the context is allowed to take up for LLAMA3 models
+LLAMA_CONTEXT_TOKENS_LIMIT = 6000
 
 # a naive estimate for the input record size
 NAIVE_EST_SOURCE_RECORD_SIZE_IN_BYTES = 1_000_000
@@ -219,6 +282,9 @@ NAIVE_EST_SOURCE_RECORD_SIZE_IN_BYTES = 1_000_000
 # a naive estimate for filter selectivity
 NAIVE_EST_FILTER_SELECTIVITY = 0.5
 
+# a naive estimate for join selectivity
+NAIVE_EST_JOIN_SELECTIVITY = 0.5
+
 # a naive estimate for the number of input tokens processed per record
 NAIVE_EST_NUM_INPUT_TOKENS = 1000
 
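These naive constants seed the cost model before any execution statistics exist: token counts fall out of character counts at roughly 1 token per 4 characters, and the new NAIVE_EST_JOIN_SELECTIVITY plays the same role for joins that NAIVE_EST_FILTER_SELECTIVITY plays for filters. A back-of-the-envelope sketch (the estimate_* helpers are illustrative, not package APIs):

    from palimpzest.constants import NAIVE_EST_JOIN_SELECTIVITY, TOKENS_PER_CHARACTER

    def estimate_tokens(text: str) -> float:
        # 1 token ~= 4 characters, per TOKENS_PER_CHARACTER = 0.25
        return len(text) * TOKENS_PER_CHARACTER

    def estimate_join_output_cardinality(left_rows: int, right_rows: int) -> float:
        # with no statistics, assume half of all candidate pairs satisfy the join predicate
        return left_rows * right_rows * NAIVE_EST_JOIN_SELECTIVITY

    assert estimate_tokens("a" * 4000) == 1000.0
    assert estimate_join_output_cardinality(100, 100) == 5000.0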
@@ -246,6 +312,7 @@ LOG_LLM_OUTPUT = False
 # - in the future we should split quality for vision vs. multi-modal vs. text
 # - code quality was computed using HumanEval, but that benchmark is too easy and should be replaced.
 # - https://huggingface.co/spaces/TIGER-Lab/MMLU-Pro
+# - https://www.vals.ai/benchmarks/mmlu_pro-08-12-2025
 #
 # Cost is presented in terms of USD / token for input tokens and USD / token for
 # generated tokens.
@@ -263,8 +330,6 @@ LLAMA3_2_3B_INSTRUCT_MODEL_CARD = {
     "seconds_per_output_token": 0.0064,
     ##### Agg. Benchmark #####
     "overall": 36.50, # https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct/discussions/13
-    ##### Code #####
-    "code": 0.0,
 }
 LLAMA3_1_8B_INSTRUCT_MODEL_CARD = {
     ##### Cost in USD #####
@@ -274,8 +339,6 @@ LLAMA3_1_8B_INSTRUCT_MODEL_CARD = {
     "seconds_per_output_token": 0.0059,
     ##### Agg. Benchmark #####
     "overall": 44.25,
-    ##### Code #####
-    "code": 72.6,
 }
 LLAMA3_3_70B_INSTRUCT_MODEL_CARD = {
     ##### Cost in USD #####
@@ -284,9 +347,7 @@ LLAMA3_3_70B_INSTRUCT_MODEL_CARD = {
     ##### Time #####
     "seconds_per_output_token": 0.0139,
     ##### Agg. Benchmark #####
-    "overall": 65.92,
-    ##### Code #####
-    "code": 88.4,
+    "overall": 69.9,
 }
 LLAMA3_2_90B_V_MODEL_CARD = {
     ##### Cost in USD #####
@@ -297,17 +358,6 @@ LLAMA3_2_90B_V_MODEL_CARD = {
     ##### Agg. Benchmark #####
     "overall": 65.00, # set to be slightly higher than gpt-4o-mini
 }
-MIXTRAL_8X_7B_MODEL_CARD = {
-    ##### Cost in USD #####
-    "usd_per_input_token": 0.6 / 1e6,
-    "usd_per_output_token": 0.6 / 1e6,
-    ##### Time #####
-    "seconds_per_output_token": 0.0112,
-    ##### Agg. Benchmark #####
-    "overall": 43.27,
-    ##### Code #####
-    "code": 40.0,
-}
 DEEPSEEK_V3_MODEL_CARD = {
     ##### Cost in USD #####
     "usd_per_input_token": 1.25 / 1E6,
@@ -315,9 +365,7 @@ DEEPSEEK_V3_MODEL_CARD = {
     ##### Time #####
     "seconds_per_output_token": 0.0769,
     ##### Agg. Benchmark #####
-    "overall": 75.87,
-    ##### Code #####
-    "code": 92.0,
+    "overall": 73.8,
 }
 DEEPSEEK_R1_DISTILL_QWEN_1_5B_MODEL_CARD = {
     ##### Cost in USD #####
@@ -327,8 +375,26 @@ DEEPSEEK_R1_DISTILL_QWEN_1_5B_MODEL_CARD = {
     "seconds_per_output_token": 0.0026,
     ##### Agg. Benchmark #####
     "overall": 39.90, # https://www.reddit.com/r/LocalLLaMA/comments/1iserf9/deepseek_r1_distilled_models_mmlu_pro_benchmarks/
-    ##### Code #####
-    "code": 0.0,
+}
+GPT_4o_AUDIO_PREVIEW_MODEL_CARD = {
+    # NOTE: COPYING OVERALL AND SECONDS_PER_OUTPUT_TOKEN FROM GPT_4o; need to update when we have audio-specific benchmarks
+    ##### Cost in USD #####
+    "usd_per_audio_input_token": 2.5 / 1e6,
+    "usd_per_output_token": 10.0 / 1e6,
+    ##### Time #####
+    "seconds_per_output_token": 0.0079,
+    ##### Agg. Benchmark #####
+    "overall": 74.1,
+}
+GPT_4o_MINI_AUDIO_PREVIEW_MODEL_CARD = {
+    # NOTE: COPYING OVERALL AND SECONDS_PER_OUTPUT_TOKEN FROM GPT_4o; need to update when we have audio-specific benchmarks
+    ##### Cost in USD #####
+    "usd_per_audio_input_token": 0.15 / 1e6,
+    "usd_per_output_token": 0.6 / 1e6,
+    ##### Time #####
+    "seconds_per_output_token": 0.0098,
+    ##### Agg. Benchmark #####
+    "overall": 62.7,
 }
 GPT_4o_MODEL_CARD = {
     # NOTE: it is unclear if the same ($ / token) costs can be applied for vision, or if we have to calculate this ourselves
@@ -338,9 +404,7 @@ GPT_4o_MODEL_CARD = {
     ##### Time #####
     "seconds_per_output_token": 0.0079,
     ##### Agg. Benchmark #####
-    "overall": 74.68,
-    ##### Code #####
-    "code": 90.0,
+    "overall": 74.1,
 }
 GPT_4o_MINI_MODEL_CARD = {
     # NOTE: it is unclear if the same ($ / token) costs can be applied for vision, or if we have to calculate this ourselves
@@ -350,9 +414,37 @@ GPT_4o_MINI_MODEL_CARD = {
     ##### Time #####
     "seconds_per_output_token": 0.0098,
     ##### Agg. Benchmark #####
-    "overall": 63.09,
-    ##### Code #####
-    "code": 86.0,
+    "overall": 62.7,
+}
+GPT_5_MODEL_CARD = {
+    # NOTE: it is unclear if the same ($ / token) costs can be applied for vision, or if we have to calculate this ourselves
+    ##### Cost in USD #####
+    "usd_per_input_token": 1.25 / 1e6,
+    "usd_per_output_token": 10.0 / 1e6,
+    ##### Time #####
+    "seconds_per_output_token": 0.0139,
+    ##### Agg. Benchmark #####
+    "overall": 87.00,
+}
+GPT_5_MINI_MODEL_CARD = {
+    # NOTE: it is unclear if the same ($ / token) costs can be applied for vision, or if we have to calculate this ourselves
+    ##### Cost in USD #####
+    "usd_per_input_token": 0.25 / 1e6,
+    "usd_per_output_token": 2.0 / 1e6,
+    ##### Time #####
+    "seconds_per_output_token": 0.0094,
+    ##### Agg. Benchmark #####
+    "overall": 82.50,
+}
+o4_MINI_MODEL_CARD = { # noqa: N816
+    # NOTE: it is unclear if the same ($ / token) costs can be applied for vision, or if we have to calculate this ourselves
+    ##### Cost in USD #####
+    "usd_per_input_token": 1.1 / 1e6,
+    "usd_per_output_token": 4.4 / 1e6,
+    ##### Time #####
+    "seconds_per_output_token": 0.0093,
+    ##### Agg. Benchmark #####
+    "overall": 80.6, # using number reported for o3-mini; true number is likely higher
 }
 o1_MODEL_CARD = { # noqa: N816
     # NOTE: it is unclear if the same ($ / token) costs can be applied for vision, or if we have to calculate this ourselves
@@ -362,9 +454,7 @@ o1_MODEL_CARD = { # noqa: N816
     ##### Time #####
     "seconds_per_output_token": 0.0110,
     ##### Agg. Benchmark #####
-    "overall": 89.30,
-    ##### Code #####
-    "code": 92.3, # NOTE: just copying MMLU score for now
+    "overall": 83.50,
 }
 TEXT_EMBEDDING_3_SMALL_MODEL_CARD = {
     ##### Cost in USD #####
@@ -384,7 +474,81 @@ CLIP_VIT_B_32_MODEL_CARD = {
     ##### Agg. Benchmark #####
     "overall": 63.3, # NOTE: ImageNet top-1 accuracy
 }
-
+CLAUDE_3_5_SONNET_MODEL_CARD = {
+    ##### Cost in USD #####
+    "usd_per_input_token": 3.0 / 1e6,
+    "usd_per_output_token": 15.0 / 1e6,
+    ##### Time #####
+    "seconds_per_output_token": 0.0127,
+    ##### Agg. Benchmark #####
+    "overall": 78.4,
+}
+CLAUDE_3_7_SONNET_MODEL_CARD = {
+    ##### Cost in USD #####
+    "usd_per_input_token": 3.0 / 1e6,
+    "usd_per_output_token": 15.0 / 1e6,
+    ##### Time #####
+    "seconds_per_output_token": 0.0130,
+    ##### Agg. Benchmark #####
+    "overall": 80.7,
+}
+CLAUDE_3_5_HAIKU_MODEL_CARD = {
+    ##### Cost in USD #####
+    "usd_per_input_token": 0.8 / 1e6,
+    "usd_per_output_token": 4.0 / 1e6,
+    ##### Time #####
+    "seconds_per_output_token": 0.0152,
+    ##### Agg. Benchmark #####
+    "overall": 64.1,
+}
+GEMINI_2_0_FLASH_MODEL_CARD = {
+    ##### Cost in USD #####
+    "usd_per_input_token": 0.15 / 1e6,
+    "usd_per_output_token": 0.6 / 1e6,
+    "usd_per_audio_input_token": 1.0 / 1e6,
+    ##### Time #####
+    "seconds_per_output_token": 0.0049,
+    ##### Agg. Benchmark #####
+    "overall": 77.40,
+}
+GEMINI_2_5_FLASH_MODEL_CARD = {
+    ##### Cost in USD #####
+    "usd_per_input_token": 0.30 / 1e6,
+    "usd_per_output_token": 2.5 / 1e6,
+    "usd_per_audio_input_token": 1.0 / 1e6,
+    ##### Time #####
+    "seconds_per_output_token": 0.0039,
+    ##### Agg. Benchmark #####
+    "overall": 80.75, # NOTE: interpolated between gemini 2.0 flash and gemini 2.5 pro
+}
+GEMINI_2_5_PRO_MODEL_CARD = {
+    ##### Cost in USD #####
+    "usd_per_input_token": 1.25 / 1e6,
+    "usd_per_output_token": 10.0 / 1e6,
+    "usd_per_audio_input_token": 1.25 / 1e6,
+    ##### Time #####
+    "seconds_per_output_token": 0.0070,
+    ##### Agg. Benchmark #####
+    "overall": 84.10,
+}
+LLAMA_4_MAVERICK_MODEL_CARD = {
+    ##### Cost in USD #####
+    "usd_per_input_token": 0.35 / 1e6,
+    "usd_per_output_token": 1.15 / 1e6,
+    ##### Time #####
+    "seconds_per_output_token": 0.0058,
+    ##### Agg. Benchmark #####
+    "overall": 79.4,
+}
+VLLM_QWEN_1_5_0_5B_CHAT_MODEL_CARD = {
+    ##### Cost in USD #####
+    "usd_per_input_token": 0.0 / 1e6,
+    "usd_per_output_token": 0.0 / 1e6,
+    ##### Time #####
+    "seconds_per_output_token": 0.1000, # TODO: fill-in with a better estimate
+    ##### Agg. Benchmark #####
+    "overall": 30.0, # TODO: fill-in with a better estimate
+}
 
 MODEL_CARDS = {
     Model.LLAMA3_2_3B.value: LLAMA3_2_3B_INSTRUCT_MODEL_CARD,
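Each card pairs pricing with a quality score (the MMLU-Pro-derived "overall") and a per-output-token latency estimate, which is enough to price a single generation up front; audio cards carry "usd_per_audio_input_token" in place of (or alongside) "usd_per_input_token". A rough sketch under those assumptions (estimate_call is illustrative, not a package API):

    from palimpzest.constants import GPT_5_MINI_MODEL_CARD

    def estimate_call(card: dict, input_tokens: int, output_tokens: int) -> tuple[float, float]:
        """Illustrative: (USD cost, seconds) for one generation under a model card."""
        usd_in = card.get("usd_per_input_token", card.get("usd_per_audio_input_token", 0.0))
        cost = input_tokens * usd_in + output_tokens * card["usd_per_output_token"]
        seconds = output_tokens * card["seconds_per_output_token"]
        return cost, seconds

    cost, seconds = estimate_call(GPT_5_MINI_MODEL_CARD, input_tokens=1000, output_tokens=200)
    # 1000 * 0.25 / 1e6 + 200 * 2.0 / 1e6 = $0.00065; 200 * 0.0094 = 1.88s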
@@ -393,124 +557,22 @@ MODEL_CARDS = {
     Model.LLAMA3_2_90B_V.value: LLAMA3_2_90B_V_MODEL_CARD,
     Model.DEEPSEEK_V3.value: DEEPSEEK_V3_MODEL_CARD,
     Model.DEEPSEEK_R1_DISTILL_QWEN_1_5B.value: DEEPSEEK_R1_DISTILL_QWEN_1_5B_MODEL_CARD,
-    Model.MIXTRAL.value: MIXTRAL_8X_7B_MODEL_CARD,
     Model.GPT_4o.value: GPT_4o_MODEL_CARD,
     Model.GPT_4o_MINI.value: GPT_4o_MINI_MODEL_CARD,
+    Model.GPT_4o_AUDIO_PREVIEW.value: GPT_4o_AUDIO_PREVIEW_MODEL_CARD,
+    Model.GPT_4o_MINI_AUDIO_PREVIEW.value: GPT_4o_MINI_AUDIO_PREVIEW_MODEL_CARD,
+    Model.GPT_5.value: GPT_5_MODEL_CARD,
+    Model.GPT_5_MINI.value: GPT_5_MINI_MODEL_CARD,
+    Model.o4_MINI.value: o4_MINI_MODEL_CARD,
     # Model.o1.value: o1_MODEL_CARD,
     Model.TEXT_EMBEDDING_3_SMALL.value: TEXT_EMBEDDING_3_SMALL_MODEL_CARD,
     Model.CLIP_VIT_B_32.value: CLIP_VIT_B_32_MODEL_CARD,
+    Model.CLAUDE_3_5_SONNET.value: CLAUDE_3_5_SONNET_MODEL_CARD,
+    Model.CLAUDE_3_7_SONNET.value: CLAUDE_3_7_SONNET_MODEL_CARD,
+    Model.CLAUDE_3_5_HAIKU.value: CLAUDE_3_5_HAIKU_MODEL_CARD,
+    Model.GEMINI_2_0_FLASH.value: GEMINI_2_0_FLASH_MODEL_CARD,
+    Model.GEMINI_2_5_FLASH.value: GEMINI_2_5_FLASH_MODEL_CARD,
+    Model.GEMINI_2_5_PRO.value: GEMINI_2_5_PRO_MODEL_CARD,
+    Model.LLAMA_4_MAVERICK.value: LLAMA_4_MAVERICK_MODEL_CARD,
+    Model.VLLM_QWEN_1_5_0_5B_CHAT.value: VLLM_QWEN_1_5_0_5B_CHAT_MODEL_CARD,
 }
-
-
-###### DEPRECATED ######
-# # NOTE: seconds_per_output_token is based on `gpt-3.5-turbo-1106`
-# GPT_3_5_MODEL_CARD = {
-#     ##### Cost in USD #####
-#     "usd_per_input_token": 0.5 / 1E6,
-#     "usd_per_output_token": 1.5 / 1E6,
-#     ##### Time #####
-#     "seconds_per_output_token": 0.0065,
-#     ##### Agg. Benchmark #####
-#     "overall": 70.0, # 5-shot
-#     ##### Commonsense Reasoning #####
-#     "reasoning": 84.1,
-#     ### "HellaSwag": 85.5,^ # 10-shot
-#     ### "WinoGrande": 81.6,^ # 5-shot
-#     ### "Arc-e": 85.2,^ # 25-shot
-#     ##### World Knowledge #####
-#     ##### Reading Comprehension #####
-#     ### "DROP": 64.1, # 3-shot
-#     ##### Code #####
-#     "code": 48.1,
-#     ### "HumanEval": 48.1,^ # 0-shot
-#     ##### Math #####
-#     "math": 57.1,
-#     ### "GSM8K": 57.1,^ # 5-shot
-# }
-# # NOTE: the seconds_per_output_token was computed based on a slightly different model ('gpt-4-1106-preview')
-# # and the benchmark statistics were computed based on the GPT-4 Technical Report; these might be
-# # slightly innacurate compared to the real numbers for gpt-4-0125-preview, but we'll use them until
-# # we have something better. (The cost metrics are accurate).
-# GPT_4_MODEL_CARD = {
-#     ##### Cost in USD #####
-#     "usd_per_input_token": 10 / 1E6,
-#     "usd_per_output_token": 30 / 1E6,
-#     ##### Time #####
-#     "seconds_per_output_token": 0.018,
-#     ##### Agg. Benchmark #####
-#     "overall": 86.4, # 5-shot
-#     ##### Commonsense Reasoning #####
-#     "reasoning": 93.0,
-#     ### "HellaSwag": 95.3,^ # 10-shot
-#     ### "WinoGrande": 87.5,^ # 5-shot
-#     ### "Arc-e": 96.3,^ # 25-shot
-#     ##### World Knowledge #####
-#     ##### Reading Comprehension #####
-#     ### "DROP": 80.9, # 3-shot
-#     ##### Code #####
-#     "code": 67.0,
-#     ### "HumanEval": 67.0,^ # 0-shot
-#     ##### Math #####
-#     "math": 92.0,
-#     ### "GSM8K": 92.0,^ # 5-shot
-# }
-
-# # TODO: use cost info in here: https://platform.openai.com/docs/guides/vision/calculating-costs
-# GPT_4V_MODEL_CARD = {
-#     ##### Cost in USD #####
-#     "usd_per_input_token": 10 / 1E6,
-#     "usd_per_output_token": 30 / 1E6,
-#     ##### Time #####
-#     "seconds_per_output_token": 0.042 / 10.0, # TODO: / 10.0 is a hack; need to figure out why time estimates are so off
-#     ##### Agg. Benchmark #####
-#     "overall": 86.4,
-# }
-
-
-# GEMINI_1_MODEL_CARD = {
-#     ##### Cost in USD #####
-#     "usd_per_input_token": 125 / 1E8, # Gemini is free but rate limited for now. Pricing will be updated
-#     "usd_per_output_token": 375 / 1E9,
-#     ##### Time #####
-#     "seconds_per_output_token": 0.042 / 10.0, # TODO:
-#     ##### Agg. Benchmark #####
-#     "overall": 65.0, # 90.0 TODO: we are using the free version of Gemini which is substantially worse than its paid version; I'm manually revising it's quality below that of Mixtral
-#     ##### Commonsense Reasoning #####
-#     "reasoning": 80.0, # 87.8, TODO: see note above on overall
-#     # "HellaSwag": 87.8, # 10-shot
-#     ##### World Knowledge #####
-#     ##### Reading Comprehension #####
-#     # "DROP": 82.4, # Variable shots ?
-#     ##### Code #####
-#     "code": 74.4,
-#     # "HumanEval": 74.4, # 0-shot (IT)*
-#     # "Natural2Code": 74.9, # 0-shot
-#     ##### Math #####
-#     "math": 94.4,
-#     # "GSM8K": 94.4, # maj1@32
-#     # "MATH": 53.2, # 4-shot
-# }
-
-# GEMINI_1V_MODEL_CARD = {
-#     ##### Cost in USD #####
-#     "usd_per_input_token": 25 / 1E6, # Gemini is free but rate limited for now. Pricing will be updated
-#     "usd_per_output_token": 375 / 1E9,
-#     ##### Time #####
-#     "seconds_per_output_token": 0.042, # / 10.0, # TODO:
-#     ##### Agg. Benchmark #####
-#     "overall": 65.0, # 90.0, TODO: see note above in Gemini_1 model card
-#     ##### Commonsense Reasoning #####
-#     "reasoning": 80.0, # 87.8, TODO: see note above in Gemini_1 model card
-#     # "HellaSwag": 87.8, # 10-shot
-#     ##### World Knowledge #####
-#     ##### Reading Comprehension #####
-#     # "DROP": 82.4, # Variable shots ?
-#     ##### Code #####
-#     "code": 74.4,
-#     # "HumanEval": 74.4, # 0-shot (IT)*
-#     # "Natural2Code": 74.9, # 0-shot
-#     ##### Math #####
-#     "math": 94.4,
-#     # "GSM8K": 94.4, # maj1@32
-#     # "MATH": 53.2, # 4-shot
-# }
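With every current model registered in MODEL_CARDS under its enum value, the optimizer's cost model (or a quick script) can look up any card and rank the fleet, e.g. by benchmark score per dollar of output tokens. An illustrative sketch; note the keys are the enum .value strings, and the free hosted_vllm entry needs a guard against division by zero:

    from palimpzest.constants import MODEL_CARDS, Model

    def quality_per_dollar(model: Model) -> float:
        card = MODEL_CARDS[model.value]
        usd_per_million_output = card["usd_per_output_token"] * 1e6
        return card["overall"] / max(usd_per_million_output, 1e-6)  # guard free local models

    chat_models = [m for m in Model if m.value in MODEL_CARDS and not m.is_embedding_model()]
    for model in sorted(chat_models, key=quality_per_dollar, reverse=True)[:3]:
        print(model.name, round(quality_per_dollar(model), 2))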