palimpzest 0.7.21__py3-none-any.whl → 0.8.0__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries; it is provided for informational purposes only.
- palimpzest/__init__.py +37 -6
- palimpzest/agents/__init__.py +0 -0
- palimpzest/agents/compute_agents.py +0 -0
- palimpzest/agents/search_agents.py +637 -0
- palimpzest/constants.py +259 -197
- palimpzest/core/data/context.py +393 -0
- palimpzest/core/data/context_manager.py +163 -0
- palimpzest/core/data/dataset.py +634 -0
- palimpzest/core/data/{datareaders.py → iter_dataset.py} +202 -126
- palimpzest/core/elements/groupbysig.py +16 -13
- palimpzest/core/elements/records.py +166 -75
- palimpzest/core/lib/schemas.py +152 -390
- palimpzest/core/{data/dataclasses.py → models.py} +306 -170
- palimpzest/policy.py +2 -27
- palimpzest/prompts/__init__.py +35 -5
- palimpzest/prompts/agent_prompts.py +357 -0
- palimpzest/prompts/context_search.py +9 -0
- palimpzest/prompts/convert_prompts.py +61 -5
- palimpzest/prompts/filter_prompts.py +50 -5
- palimpzest/prompts/join_prompts.py +163 -0
- palimpzest/prompts/moa_proposer_convert_prompts.py +5 -5
- palimpzest/prompts/prompt_factory.py +358 -46
- palimpzest/prompts/validator.py +239 -0
- palimpzest/query/execution/all_sample_execution_strategy.py +134 -76
- palimpzest/query/execution/execution_strategy.py +210 -317
- palimpzest/query/execution/execution_strategy_type.py +5 -7
- palimpzest/query/execution/mab_execution_strategy.py +249 -136
- palimpzest/query/execution/parallel_execution_strategy.py +153 -244
- palimpzest/query/execution/single_threaded_execution_strategy.py +107 -64
- palimpzest/query/generators/generators.py +157 -330
- palimpzest/query/operators/__init__.py +15 -5
- palimpzest/query/operators/aggregate.py +50 -33
- palimpzest/query/operators/compute.py +201 -0
- palimpzest/query/operators/convert.py +27 -21
- palimpzest/query/operators/critique_and_refine_convert.py +7 -5
- palimpzest/query/operators/distinct.py +62 -0
- palimpzest/query/operators/filter.py +22 -13
- palimpzest/query/operators/join.py +402 -0
- palimpzest/query/operators/limit.py +3 -3
- palimpzest/query/operators/logical.py +198 -80
- palimpzest/query/operators/mixture_of_agents_convert.py +10 -8
- palimpzest/query/operators/physical.py +27 -21
- palimpzest/query/operators/project.py +3 -3
- palimpzest/query/operators/rag_convert.py +7 -7
- palimpzest/query/operators/retrieve.py +9 -9
- palimpzest/query/operators/scan.py +81 -42
- palimpzest/query/operators/search.py +524 -0
- palimpzest/query/operators/split_convert.py +10 -8
- palimpzest/query/optimizer/__init__.py +7 -9
- palimpzest/query/optimizer/cost_model.py +108 -441
- palimpzest/query/optimizer/optimizer.py +123 -181
- palimpzest/query/optimizer/optimizer_strategy.py +66 -61
- palimpzest/query/optimizer/plan.py +352 -67
- palimpzest/query/optimizer/primitives.py +43 -19
- palimpzest/query/optimizer/rules.py +484 -646
- palimpzest/query/optimizer/tasks.py +127 -58
- palimpzest/query/processor/config.py +41 -76
- palimpzest/query/processor/query_processor.py +73 -18
- palimpzest/query/processor/query_processor_factory.py +46 -38
- palimpzest/schemabuilder/schema_builder.py +15 -28
- palimpzest/utils/model_helpers.py +27 -77
- palimpzest/utils/progress.py +114 -102
- palimpzest/validator/__init__.py +0 -0
- palimpzest/validator/validator.py +306 -0
- {palimpzest-0.7.21.dist-info → palimpzest-0.8.0.dist-info}/METADATA +6 -1
- palimpzest-0.8.0.dist-info/RECORD +95 -0
- palimpzest/core/lib/fields.py +0 -141
- palimpzest/prompts/code_synthesis_prompts.py +0 -28
- palimpzest/query/execution/random_sampling_execution_strategy.py +0 -240
- palimpzest/query/generators/api_client_factory.py +0 -30
- palimpzest/query/operators/code_synthesis_convert.py +0 -488
- palimpzest/query/operators/map.py +0 -130
- palimpzest/query/processor/nosentinel_processor.py +0 -33
- palimpzest/query/processor/processing_strategy_type.py +0 -28
- palimpzest/query/processor/sentinel_processor.py +0 -88
- palimpzest/query/processor/streaming_processor.py +0 -149
- palimpzest/sets.py +0 -405
- palimpzest/utils/datareader_helpers.py +0 -61
- palimpzest/utils/demo_helpers.py +0 -75
- palimpzest/utils/field_helpers.py +0 -69
- palimpzest/utils/generation_helpers.py +0 -69
- palimpzest/utils/sandbox.py +0 -183
- palimpzest-0.7.21.dist-info/RECORD +0 -95
- /palimpzest/core/{elements/index.py → data/index_dataset.py} +0 -0
- {palimpzest-0.7.21.dist-info → palimpzest-0.8.0.dist-info}/WHEEL +0 -0
- {palimpzest-0.7.21.dist-info → palimpzest-0.8.0.dist-info}/licenses/LICENSE +0 -0
- {palimpzest-0.7.21.dist-info → palimpzest-0.8.0.dist-info}/top_level.txt +0 -0
palimpzest/constants.py
CHANGED
@@ -10,77 +10,110 @@ class Model(str, Enum):
     which requires invoking an LLM. It does NOT specify whether the model need be executed
     remotely or locally (if applicable).
     """
-    LLAMA3_2_3B = "meta-llama/Llama-3.2-3B-Instruct-Turbo"
-    LLAMA3_1_8B = "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo"
-    LLAMA3_3_70B = "meta-llama/Llama-3.3-70B-Instruct-Turbo"
-    LLAMA3_2_90B_V = "meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo"
-
-
-
-
-
+    LLAMA3_2_3B = "together_ai/meta-llama/Llama-3.2-3B-Instruct-Turbo"
+    LLAMA3_1_8B = "together_ai/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo"
+    LLAMA3_3_70B = "together_ai/meta-llama/Llama-3.3-70B-Instruct-Turbo"
+    LLAMA3_2_90B_V = "together_ai/meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo"
+    DEEPSEEK_V3 = "together_ai/deepseek-ai/DeepSeek-V3"
+    DEEPSEEK_R1_DISTILL_QWEN_1_5B = "together_ai/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
+    GPT_4o = "openai/gpt-4o-2024-08-06"
+    GPT_4o_MINI = "openai/gpt-4o-mini-2024-07-18"
+    GPT_5 = "openai/gpt-5"
+    GPT_5_MINI = "openai/gpt-5-mini"
+    o4_MINI = "openai/o4-mini-2025-04-16" # noqa: N815
     TEXT_EMBEDDING_3_SMALL = "text-embedding-3-small"
     CLIP_VIT_B_32 = "clip-ViT-B-32"
+    CLAUDE_3_5_SONNET = "anthropic/claude-3-5-sonnet-20241022"
+    CLAUDE_3_7_SONNET = "anthropic/claude-3-7-sonnet-20250219"
+    CLAUDE_3_5_HAIKU = "anthropic/claude-3-5-haiku-20241022"
+    GEMINI_2_0_FLASH = "vertex_ai/gemini-2.0-flash"
+    GEMINI_2_5_FLASH = "vertex_ai/gemini-2.5-flash"
+    GEMINI_2_5_PRO = "vertex_ai/gemini-2.5-pro"
+    LLAMA_4_MAVERICK = "vertex_ai/meta/llama-4-maverick-17b-128e-instruct-maas"
+    GPT_4o_AUDIO_PREVIEW = "openai/gpt-4o-audio-preview"
+    GPT_4o_MINI_AUDIO_PREVIEW = "openai/gpt-4o-mini-audio-preview"
+    VLLM_QWEN_1_5_0_5B_CHAT = "hosted_vllm/qwen/Qwen1.5-0.5B-Chat"
     # o1 = "o1-2024-12-17"

     def __repr__(self):
         return f"{self.name}"

-    def is_deepseek_model(self):
-        return "deepseek" in self.value.lower()
-
     def is_llama_model(self):
         return "llama" in self.value.lower()

-    def is_mixtral_model(self):
-        return "mixtral" in self.value.lower()
-
     def is_clip_model(self):
         return "clip" in self.value.lower()

     def is_together_model(self):
-
-        is_mixtral_model = self.is_mixtral_model()
-        is_deepseek_model = self.is_deepseek_model()
-        is_clip_model = self.is_clip_model()
-        return is_llama_model or is_mixtral_model or is_deepseek_model or is_clip_model
-
-    def is_gpt_4o_model(self):
-        return "gpt-4o" in self.value.lower()
-
-    def is_o1_model(self):
-        return "o1" in self.value.lower()
+        return "together_ai" in self.value.lower() or self.is_clip_model()

     def is_text_embedding_model(self):
         return "text-embedding" in self.value.lower()

+    def is_o_model(self):
+        return self in [Model.o4_MINI]
+
+    def is_gpt_5_model(self):
+        return self in [Model.GPT_5, Model.GPT_5_MINI]
+
     def is_openai_model(self):
-
-
-
-        return
+        return "openai" in self.value.lower() or self.is_text_embedding_model()
+
+    def is_anthropic_model(self):
+        return "anthropic" in self.value.lower()
+
+    def is_vertex_model(self):
+        return "vertex_ai" in self.value.lower()
+
+    def is_vllm_model(self):
+        return "hosted_vllm" in self.value.lower()
+
+    def is_reasoning_model(self):
+        reasoning_models = [
+            Model.GPT_5, Model.GPT_5_MINI, Model.o4_MINI,
+            Model.GEMINI_2_5_PRO, Model.GEMINI_2_5_FLASH,
+            Model.CLAUDE_3_7_SONNET,
+        ]
+        return self in reasoning_models
+
+    def is_text_model(self):
+        non_text_models = [
+            Model.LLAMA3_2_90B_V,
+            Model.CLIP_VIT_B_32, Model.TEXT_EMBEDDING_3_SMALL,
+            Model.GPT_4o_AUDIO_PREVIEW, Model.GPT_4o_MINI_AUDIO_PREVIEW,
+        ]
+        return self not in non_text_models

+    # TODO: I think SONNET and HAIKU are vision-capable too
     def is_vision_model(self):
-
-
-
-
-            "o1-2024-12-17",
+        return self in [
+            Model.LLAMA3_2_90B_V, Model.LLAMA_4_MAVERICK,
+            Model.GPT_4o, Model.GPT_4o_MINI, Model.o4_MINI, Model.GPT_5, Model.GPT_5_MINI,
+            Model.GEMINI_2_0_FLASH, Model.GEMINI_2_5_FLASH, Model.GEMINI_2_5_PRO,
         ]
-        return self.value in vision_models

-    def
-
-
-
+    def is_audio_model(self):
+        return self in [
+            Model.GPT_4o_AUDIO_PREVIEW, Model.GPT_4o_MINI_AUDIO_PREVIEW,
+            Model.GEMINI_2_0_FLASH, Model.GEMINI_2_5_FLASH, Model.GEMINI_2_5_PRO,
+        ]

-
-
-
-
+    def is_text_image_multimodal_model(self):
+        return self in [
+            Model.LLAMA_4_MAVERICK,
+            Model.GPT_4o, Model.GPT_4o_MINI, Model.o4_MINI, Model.GPT_5, Model.GPT_5_MINI,
+            Model.GEMINI_2_0_FLASH, Model.GEMINI_2_5_FLASH, Model.GEMINI_2_5_PRO,
+        ]
+
+    def is_text_audio_multimodal_model(self):
+        return self in [
+            Model.GPT_4o_AUDIO_PREVIEW, Model.GPT_4o_MINI_AUDIO_PREVIEW,
+            Model.GEMINI_2_0_FLASH, Model.GEMINI_2_5_FLASH, Model.GEMINI_2_5_PRO,
+        ]
+
+    def is_embedding_model(self):
+        return self in [Model.CLIP_VIT_B_32, Model.TEXT_EMBEDDING_3_SMALL]

-    OPENAI = "openai"
-    TOGETHER = "together"

 class PromptStrategy(str, Enum):
     """
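The reworked `Model` enum now carries the provider prefix (`openai/`, `together_ai/`, `anthropic/`, `vertex_ai/`, `hosted_vllm/`) directly in each value and replaces per-model checks with capability predicates. As a rough illustration of how these can be combined, here is a minimal sketch; it is not part of the diff and only relies on the values and methods shown above:

```python
# Illustrative sketch (not package code): pick candidate models by capability
# using the predicates added to the 0.8.0 Model enum.
from palimpzest.constants import Model

def candidate_models(need_vision: bool = False, need_audio: bool = False) -> list[Model]:
    """Return chat-capable models that satisfy the requested modalities."""
    selected = []
    for model in Model:
        if model.is_embedding_model():
            continue  # CLIP / text-embedding-3-small are not chat models
        if need_vision and not model.is_vision_model():
            continue
        if need_audio and not model.is_audio_model():
            continue
        selected.append(model)
    return selected

# The provider-prefixed values (e.g. "openai/gpt-4o-2024-08-06") can then be routed
# by provider via is_openai_model(), is_together_model(), is_anthropic_model(), etc.
print([m.value for m in candidate_models(need_vision=True)])
```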
@@ -90,28 +123,48 @@ class PromptStrategy(str, Enum):

     # Chain-of-Thought Boolean Prompt Strategies
     COT_BOOL = "chain-of-thought-bool"
+    COT_BOOL_NO_REASONING = "chain-of-thought-bool-no-reasoning"
     # COT_BOOL_CRITIC = "chain-of-thought-bool-critic"
     # COT_BOOL_REFINE = "chain-of-thought-bool-refine"

     # Chain-of-Thought Boolean with Image Prompt Strategies
     COT_BOOL_IMAGE = "chain-of-thought-bool-image"
+    COT_BOOL_IMAGE_NO_REASONING = "chain-of-thought-bool-image"
+    COT_BOOL_AUDIO = "chain-of-thought-bool-audio"
+    COT_BOOL_AUDIO_NO_REASONING = "chain-of-thought-bool-audio"
     # COT_BOOL_IMAGE_CRITIC = "chain-of-thought-bool-image-critic"
     # COT_BOOL_IMAGE_REFINE = "chain-of-thought-bool-image-refine"

+    # Chain-of-Thought Join Prompt Strategies
+    COT_JOIN = "chain-of-thought-join"
+    COT_JOIN_NO_REASONING = "chain-of-thought-join-no-reasoning"
+    COT_JOIN_IMAGE = "chain-of-thought-join-image"
+    COT_JOIN_IMAGE_NO_REASONING = "chain-of-thought-join-image-no-reasoning"
+    COT_JOIN_AUDIO = "chain-of-thought-join-audio"
+    COT_JOIN_AUDIO_NO_REASONING = "chain-of-thought-join-audio-no-reasoning"
+
     # Chain-of-Thought Question Answering Prompt Strategies
     COT_QA = "chain-of-thought-question"
+    COT_QA_NO_REASONING = "chain-of-thought-question-no-reasoning"
     COT_QA_CRITIC = "chain-of-thought-question-critic"
     COT_QA_REFINE = "chain-of-thought-question-refine"

     # Chain-of-Thought Question with Image Prompt Strategies
     COT_QA_IMAGE = "chain-of-thought-question-image"
+    COT_QA_IMAGE_NO_REASONING = "chain-of-thought-question-image-no-reasoning"
     COT_QA_IMAGE_CRITIC = "chain-of-thought-question-critic-image"
     COT_QA_IMAGE_REFINE = "chain-of-thought-question-refine-image"

+    # Chain-of-Thought Queestion with Audio Prompt Strategies
+    COT_QA_AUDIO = "chain-of-thought-question-audio"
+    COT_QA_AUDIO_NO_REASONING = "chain-of-thought-question-audio-no-reasoning"
+    # TODO: COT_QA_AUDIO_CRITIC/REFINE
+
     # Mixture-of-Agents Prompt Strategies
     COT_MOA_PROPOSER = "chain-of-thought-mixture-of-agents-proposer"
     COT_MOA_PROPOSER_IMAGE = "chain-of-thought-mixture-of-agents-proposer-image"
     COT_MOA_AGG = "chain-of-thought-mixture-of-agents-aggregation"
+    # TODO: COT_MOA_PROPOSER_AUDIO

     # Split Convert Prompt Strategies
     SPLIT_PROPOSER = "split-proposer"
@@ -120,11 +173,17 @@ class PromptStrategy(str, Enum):
     def is_image_prompt(self):
         return "image" in self.value

+    def is_audio_prompt(self):
+        return "audio" in self.value
+
     def is_bool_prompt(self):
         return "bool" in self.value

+    def is_join_prompt(self):
+        return "join" in self.value
+
     def is_convert_prompt(self):
-        return "bool" not in self.value
+        return "bool" not in self.value and "join" not in self.value

     def is_critic_prompt(self):
         return "critic" in self.value
@@ -144,6 +203,9 @@ class PromptStrategy(str, Enum):
     def is_split_merger_prompt(self):
         return "split-merger" in self.value

+    def is_no_reasoning_prompt(self):
+        return "no-reasoning" in self.value
+
 class AggFunc(str, Enum):
     COUNT = "count"
     AVERAGE = "average"
@@ -169,6 +231,7 @@ class PickOutputStrategy(str, Enum):
     ENSEMBLE = "ensemble"


+AUDIO_EXTENSIONS = [".wav"]
 IMAGE_EXTENSIONS = [".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff"]
 PDF_EXTENSIONS = [".pdf"]
 XLS_EXTENSIONS = [".xls", ".xlsx"]
@@ -210,8 +273,8 @@ NAIVE_BYTES_PER_RECORD = 1024
 # Rough conversion from # of characters --> # of tokens; assumes 1 token ~= 4 chars
 TOKENS_PER_CHARACTER = 0.25

-# Rough estimate of the number of tokens the context is allowed to take up for
-
+# Rough estimate of the number of tokens the context is allowed to take up for LLAMA3 models
+LLAMA_CONTEXT_TOKENS_LIMIT = 6000

 # a naive estimate for the input record size
 NAIVE_EST_SOURCE_RECORD_SIZE_IN_BYTES = 1_000_000
@@ -219,6 +282,9 @@ NAIVE_EST_SOURCE_RECORD_SIZE_IN_BYTES = 1_000_000
 # a naive estimate for filter selectivity
 NAIVE_EST_FILTER_SELECTIVITY = 0.5

+# a naive estimate for join selectivity
+NAIVE_EST_JOIN_SELECTIVITY = 0.5
+
 # a naive estimate for the number of input tokens processed per record
 NAIVE_EST_NUM_INPUT_TOKENS = 1000

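These naive constants encode simple rules of thumb: 1 token is treated as roughly 4 characters, and filters and joins are assumed to pass half of their inputs. A hypothetical sketch of the arithmetic they imply follows; the helper functions are illustrative only and are not part of palimpzest:

```python
# Hypothetical sketch: back-of-the-envelope estimates implied by the naive constants above.
TOKENS_PER_CHARACTER = 0.25          # 1 token ~= 4 characters
NAIVE_EST_FILTER_SELECTIVITY = 0.5
NAIVE_EST_JOIN_SELECTIVITY = 0.5
NAIVE_EST_NUM_INPUT_TOKENS = 1000

def naive_est_tokens(text: str) -> float:
    return len(text) * TOKENS_PER_CHARACTER

def naive_est_output_cardinality(num_left: int, num_right: int | None = None) -> float:
    # filter: half of the inputs survive; join: half of the cross product survives
    if num_right is None:
        return num_left * NAIVE_EST_FILTER_SELECTIVITY
    return num_left * num_right * NAIVE_EST_JOIN_SELECTIVITY

# A 4,000-character record is ~1,000 tokens (the NAIVE_EST_NUM_INPUT_TOKENS default),
# and joining 10 x 10 records is naively estimated to yield 50 output records.
assert naive_est_tokens("x" * 4_000) == NAIVE_EST_NUM_INPUT_TOKENS
assert naive_est_output_cardinality(10, 10) == 50
```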
@@ -246,6 +312,7 @@ LOG_LLM_OUTPUT = False
 # - in the future we should split quality for vision vs. multi-modal vs. text
 # - code quality was computed using HumanEval, but that benchmark is too easy and should be replaced.
 # - https://huggingface.co/spaces/TIGER-Lab/MMLU-Pro
+# - https://www.vals.ai/benchmarks/mmlu_pro-08-12-2025
 #
 # Cost is presented in terms of USD / token for input tokens and USD / token for
 # generated tokens.
@@ -263,8 +330,6 @@ LLAMA3_2_3B_INSTRUCT_MODEL_CARD = {
     "seconds_per_output_token": 0.0064,
     ##### Agg. Benchmark #####
     "overall": 36.50, # https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct/discussions/13
-    ##### Code #####
-    "code": 0.0,
 }
 LLAMA3_1_8B_INSTRUCT_MODEL_CARD = {
     ##### Cost in USD #####
@@ -274,8 +339,6 @@ LLAMA3_1_8B_INSTRUCT_MODEL_CARD = {
     "seconds_per_output_token": 0.0059,
     ##### Agg. Benchmark #####
     "overall": 44.25,
-    ##### Code #####
-    "code": 72.6,
 }
 LLAMA3_3_70B_INSTRUCT_MODEL_CARD = {
     ##### Cost in USD #####
@@ -284,9 +347,7 @@ LLAMA3_3_70B_INSTRUCT_MODEL_CARD = {
     ##### Time #####
     "seconds_per_output_token": 0.0139,
     ##### Agg. Benchmark #####
-    "overall":
-    ##### Code #####
-    "code": 88.4,
+    "overall": 69.9,
 }
 LLAMA3_2_90B_V_MODEL_CARD = {
     ##### Cost in USD #####
@@ -297,17 +358,6 @@ LLAMA3_2_90B_V_MODEL_CARD = {
     ##### Agg. Benchmark #####
     "overall": 65.00, # set to be slightly higher than gpt-4o-mini
 }
-MIXTRAL_8X_7B_MODEL_CARD = {
-    ##### Cost in USD #####
-    "usd_per_input_token": 0.6 / 1e6,
-    "usd_per_output_token": 0.6 / 1e6,
-    ##### Time #####
-    "seconds_per_output_token": 0.0112,
-    ##### Agg. Benchmark #####
-    "overall": 43.27,
-    ##### Code #####
-    "code": 40.0,
-}
 DEEPSEEK_V3_MODEL_CARD = {
     ##### Cost in USD #####
     "usd_per_input_token": 1.25 / 1E6,
@@ -315,9 +365,7 @@ DEEPSEEK_V3_MODEL_CARD = {
     ##### Time #####
     "seconds_per_output_token": 0.0769,
     ##### Agg. Benchmark #####
-    "overall":
-    ##### Code #####
-    "code": 92.0,
+    "overall": 73.8,
 }
 DEEPSEEK_R1_DISTILL_QWEN_1_5B_MODEL_CARD = {
     ##### Cost in USD #####
@@ -327,8 +375,26 @@ DEEPSEEK_R1_DISTILL_QWEN_1_5B_MODEL_CARD = {
     "seconds_per_output_token": 0.0026,
     ##### Agg. Benchmark #####
     "overall": 39.90, # https://www.reddit.com/r/LocalLLaMA/comments/1iserf9/deepseek_r1_distilled_models_mmlu_pro_benchmarks/
-
-
+}
+GPT_4o_AUDIO_PREVIEW_MODEL_CARD = {
+    # NOTE: COPYING OVERALL AND SECONDS_PER_OUTPUT_TOKEN FROM GPT_4o; need to update when we have audio-specific benchmarks
+    ##### Cost in USD #####
+    "usd_per_audio_input_token": 2.5 / 1e6,
+    "usd_per_output_token": 10.0 / 1e6,
+    ##### Time #####
+    "seconds_per_output_token": 0.0079,
+    ##### Agg. Benchmark #####
+    "overall": 74.1,
+}
+GPT_4o_MINI_AUDIO_PREVIEW_MODEL_CARD = {
+    # NOTE: COPYING OVERALL AND SECONDS_PER_OUTPUT_TOKEN FROM GPT_4o; need to update when we have audio-specific benchmarks
+    ##### Cost in USD #####
+    "usd_per_audio_input_token": 0.15 / 1e6,
+    "usd_per_output_token": 0.6 / 1e6,
+    ##### Time #####
+    "seconds_per_output_token": 0.0098,
+    ##### Agg. Benchmark #####
+    "overall": 62.7,
 }
 GPT_4o_MODEL_CARD = {
     # NOTE: it is unclear if the same ($ / token) costs can be applied for vision, or if we have to calculate this ourselves
@@ -338,9 +404,7 @@ GPT_4o_MODEL_CARD = {
     ##### Time #####
     "seconds_per_output_token": 0.0079,
     ##### Agg. Benchmark #####
-    "overall": 74.
-    ##### Code #####
-    "code": 90.0,
+    "overall": 74.1,
 }
 GPT_4o_MINI_MODEL_CARD = {
     # NOTE: it is unclear if the same ($ / token) costs can be applied for vision, or if we have to calculate this ourselves
@@ -350,9 +414,37 @@ GPT_4o_MINI_MODEL_CARD = {
     ##### Time #####
     "seconds_per_output_token": 0.0098,
     ##### Agg. Benchmark #####
-    "overall":
-
-
+    "overall": 62.7,
+}
+GPT_5_MODEL_CARD = {
+    # NOTE: it is unclear if the same ($ / token) costs can be applied for vision, or if we have to calculate this ourselves
+    ##### Cost in USD #####
+    "usd_per_input_token": 1.25 / 1e6,
+    "usd_per_output_token": 10.0 / 1e6,
+    ##### Time #####
+    "seconds_per_output_token": 0.0139,
+    ##### Agg. Benchmark #####
+    "overall": 87.00,
+}
+GPT_5_MINI_MODEL_CARD = {
+    # NOTE: it is unclear if the same ($ / token) costs can be applied for vision, or if we have to calculate this ourselves
+    ##### Cost in USD #####
+    "usd_per_input_token": 0.25 / 1e6,
+    "usd_per_output_token": 2.0 / 1e6,
+    ##### Time #####
+    "seconds_per_output_token": 0.0094,
+    ##### Agg. Benchmark #####
+    "overall": 82.50,
+}
+o4_MINI_MODEL_CARD = { # noqa: N816
+    # NOTE: it is unclear if the same ($ / token) costs can be applied for vision, or if we have to calculate this ourselves
+    ##### Cost in USD #####
+    "usd_per_input_token": 1.1 / 1e6,
+    "usd_per_output_token": 4.4 / 1e6,
+    ##### Time #####
+    "seconds_per_output_token": 0.0093,
+    ##### Agg. Benchmark #####
+    "overall": 80.6, # using number reported for o3-mini; true number is likely higher
 }
 o1_MODEL_CARD = { # noqa: N816
     # NOTE: it is unclear if the same ($ / token) costs can be applied for vision, or if we have to calculate this ourselves
@@ -362,9 +454,7 @@ o1_MODEL_CARD = { # noqa: N816
     ##### Time #####
     "seconds_per_output_token": 0.0110,
     ##### Agg. Benchmark #####
-    "overall":
-    ##### Code #####
-    "code": 92.3, # NOTE: just copying MMLU score for now
+    "overall": 83.50,
 }
 TEXT_EMBEDDING_3_SMALL_MODEL_CARD = {
     ##### Cost in USD #####
@@ -384,7 +474,81 @@ CLIP_VIT_B_32_MODEL_CARD = {
     ##### Agg. Benchmark #####
     "overall": 63.3, # NOTE: ImageNet top-1 accuracy
 }
-
+CLAUDE_3_5_SONNET_MODEL_CARD = {
+    ##### Cost in USD #####
+    "usd_per_input_token": 3.0 / 1e6,
+    "usd_per_output_token": 15.0 / 1e6,
+    ##### Time #####
+    "seconds_per_output_token": 0.0127,
+    ##### Agg. Benchmark #####
+    "overall": 78.4,
+}
+CLAUDE_3_7_SONNET_MODEL_CARD = {
+    ##### Cost in USD #####
+    "usd_per_input_token": 3.0 / 1e6,
+    "usd_per_output_token": 15.0 / 1e6,
+    ##### Time #####
+    "seconds_per_output_token": 0.0130,
+    ##### Agg. Benchmark #####
+    "overall": 80.7,
+}
+CLAUDE_3_5_HAIKU_MODEL_CARD = {
+    ##### Cost in USD #####
+    "usd_per_input_token": 0.8 / 1e6,
+    "usd_per_output_token": 4.0 / 1e6,
+    ##### Time #####
+    "seconds_per_output_token": 0.0152,
+    ##### Agg. Benchmark #####
+    "overall": 64.1,
+}
+GEMINI_2_0_FLASH_MODEL_CARD = {
+    ##### Cost in USD #####
+    "usd_per_input_token": 0.15 / 1e6,
+    "usd_per_output_token": 0.6 / 1e6,
+    "usd_per_audio_input_token": 1.0 / 1e6,
+    ##### Time #####
+    "seconds_per_output_token": 0.0049,
+    ##### Agg. Benchmark #####
+    "overall": 77.40,
+}
+GEMINI_2_5_FLASH_MODEL_CARD = {
+    ##### Cost in USD #####
+    "usd_per_input_token": 0.30 / 1e6,
+    "usd_per_output_token": 2.5 / 1e6,
+    "usd_per_audio_input_token": 1.0 / 1e6,
+    ##### Time #####
+    "seconds_per_output_token": 0.0039,
+    ##### Agg. Benchmark #####
+    "overall": 80.75, # NOTE: interpolated between gemini 2.0 flash and gemini 2.5 pro
+}
+GEMINI_2_5_PRO_MODEL_CARD = {
+    ##### Cost in USD #####
+    "usd_per_input_token": 1.25 / 1e6,
+    "usd_per_output_token": 10.0 / 1e6,
+    "usd_per_audio_input_token": 1.25 / 1e6,
+    ##### Time #####
+    "seconds_per_output_token": 0.0070,
+    ##### Agg. Benchmark #####
+    "overall": 84.10,
+}
+LLAMA_4_MAVERICK_MODEL_CARD = {
+    ##### Cost in USD #####
+    "usd_per_input_token": 0.35 / 1e6,
+    "usd_per_output_token": 1.15 / 1e6,
+    ##### Time #####
+    "seconds_per_output_token": 0.0058,
+    ##### Agg. Benchmark #####
+    "overall": 79.4,
+}
+VLLM_QWEN_1_5_0_5B_CHAT_MODEL_CARD = {
+    ##### Cost in USD #####
+    "usd_per_input_token": 0.0 / 1e6,
+    "usd_per_output_token": 0.0 / 1e6,
+    ##### Time #####
+    "seconds_per_output_token": 0.1000, # TODO: fill-in with a better estimate
+    ##### Agg. Benchmark #####
+    "overall": 30.0, # TODO: fill-in with a better estimate
+}

 MODEL_CARDS = {
     Model.LLAMA3_2_3B.value: LLAMA3_2_3B_INSTRUCT_MODEL_CARD,
@@ -393,124 +557,22 @@ MODEL_CARDS = {
     Model.LLAMA3_2_90B_V.value: LLAMA3_2_90B_V_MODEL_CARD,
     Model.DEEPSEEK_V3.value: DEEPSEEK_V3_MODEL_CARD,
     Model.DEEPSEEK_R1_DISTILL_QWEN_1_5B.value: DEEPSEEK_R1_DISTILL_QWEN_1_5B_MODEL_CARD,
-    Model.MIXTRAL.value: MIXTRAL_8X_7B_MODEL_CARD,
     Model.GPT_4o.value: GPT_4o_MODEL_CARD,
     Model.GPT_4o_MINI.value: GPT_4o_MINI_MODEL_CARD,
+    Model.GPT_4o_AUDIO_PREVIEW.value: GPT_4o_AUDIO_PREVIEW_MODEL_CARD,
+    Model.GPT_4o_MINI_AUDIO_PREVIEW.value: GPT_4o_MINI_AUDIO_PREVIEW_MODEL_CARD,
+    Model.GPT_5.value: GPT_5_MODEL_CARD,
+    Model.GPT_5_MINI.value: GPT_5_MINI_MODEL_CARD,
+    Model.o4_MINI.value: o4_MINI_MODEL_CARD,
     # Model.o1.value: o1_MODEL_CARD,
     Model.TEXT_EMBEDDING_3_SMALL.value: TEXT_EMBEDDING_3_SMALL_MODEL_CARD,
     Model.CLIP_VIT_B_32.value: CLIP_VIT_B_32_MODEL_CARD,
+    Model.CLAUDE_3_5_SONNET.value: CLAUDE_3_5_SONNET_MODEL_CARD,
+    Model.CLAUDE_3_7_SONNET.value: CLAUDE_3_7_SONNET_MODEL_CARD,
+    Model.CLAUDE_3_5_HAIKU.value: CLAUDE_3_5_HAIKU_MODEL_CARD,
+    Model.GEMINI_2_0_FLASH.value: GEMINI_2_0_FLASH_MODEL_CARD,
+    Model.GEMINI_2_5_FLASH.value: GEMINI_2_5_FLASH_MODEL_CARD,
+    Model.GEMINI_2_5_PRO.value: GEMINI_2_5_PRO_MODEL_CARD,
+    Model.LLAMA_4_MAVERICK.value: LLAMA_4_MAVERICK_MODEL_CARD,
+    Model.VLLM_QWEN_1_5_0_5B_CHAT.value: VLLM_QWEN_1_5_0_5B_CHAT_MODEL_CARD,
 }
-
-
-###### DEPRECATED ######
-# # NOTE: seconds_per_output_token is based on `gpt-3.5-turbo-1106`
-# GPT_3_5_MODEL_CARD = {
-# ##### Cost in USD #####
-# "usd_per_input_token": 0.5 / 1E6,
-# "usd_per_output_token": 1.5 / 1E6,
-# ##### Time #####
-# "seconds_per_output_token": 0.0065,
-# ##### Agg. Benchmark #####
-# "overall": 70.0, # 5-shot
-# ##### Commonsense Reasoning #####
-# "reasoning": 84.1,
-# ### "HellaSwag": 85.5,^ # 10-shot
-# ### "WinoGrande": 81.6,^ # 5-shot
-# ### "Arc-e": 85.2,^ # 25-shot
-# ##### World Knowledge #####
-# ##### Reading Comprehension #####
-# ### "DROP": 64.1, # 3-shot
-# ##### Code #####
-# "code": 48.1,
-# ### "HumanEval": 48.1,^ # 0-shot
-# ##### Math #####
-# "math": 57.1,
-# ### "GSM8K": 57.1,^ # 5-shot
-# }
-# # NOTE: the seconds_per_output_token was computed based on a slightly different model ('gpt-4-1106-preview')
-# # and the benchmark statistics were computed based on the GPT-4 Technical Report; these might be
-# # slightly innacurate compared to the real numbers for gpt-4-0125-preview, but we'll use them until
-# # we have something better. (The cost metrics are accurate).
-# GPT_4_MODEL_CARD = {
-# ##### Cost in USD #####
-# "usd_per_input_token": 10 / 1E6,
-# "usd_per_output_token": 30 / 1E6,
-# ##### Time #####
-# "seconds_per_output_token": 0.018,
-# ##### Agg. Benchmark #####
-# "overall": 86.4, # 5-shot
-# ##### Commonsense Reasoning #####
-# "reasoning": 93.0,
-# ### "HellaSwag": 95.3,^ # 10-shot
-# ### "WinoGrande": 87.5,^ # 5-shot
-# ### "Arc-e": 96.3,^ # 25-shot
-# ##### World Knowledge #####
-# ##### Reading Comprehension #####
-# ### "DROP": 80.9, # 3-shot
-# ##### Code #####
-# "code": 67.0,
-# ### "HumanEval": 67.0,^ # 0-shot
-# ##### Math #####
-# "math": 92.0,
-# ### "GSM8K": 92.0,^ # 5-shot
-# }
-
-# # TODO: use cost info in here: https://platform.openai.com/docs/guides/vision/calculating-costs
-# GPT_4V_MODEL_CARD = {
-# ##### Cost in USD #####
-# "usd_per_input_token": 10 / 1E6,
-# "usd_per_output_token": 30 / 1E6,
-# ##### Time #####
-# "seconds_per_output_token": 0.042 / 10.0, # TODO: / 10.0 is a hack; need to figure out why time estimates are so off
-# ##### Agg. Benchmark #####
-# "overall": 86.4,
-# }
-
-
-# GEMINI_1_MODEL_CARD = {
-# ##### Cost in USD #####
-# "usd_per_input_token": 125 / 1E8, # Gemini is free but rate limited for now. Pricing will be updated
-# "usd_per_output_token": 375 / 1E9,
-# ##### Time #####
-# "seconds_per_output_token": 0.042 / 10.0, # TODO:
-# ##### Agg. Benchmark #####
-# "overall": 65.0, # 90.0 TODO: we are using the free version of Gemini which is substantially worse than its paid version; I'm manually revising it's quality below that of Mixtral
-# ##### Commonsense Reasoning #####
-# "reasoning": 80.0, # 87.8, TODO: see note above on overall
-# # "HellaSwag": 87.8, # 10-shot
-# ##### World Knowledge #####
-# ##### Reading Comprehension #####
-# # "DROP": 82.4, # Variable shots ?
-# ##### Code #####
-# "code": 74.4,
-# # "HumanEval": 74.4, # 0-shot (IT)*
-# # "Natural2Code": 74.9, # 0-shot
-# ##### Math #####
-# "math": 94.4,
-# # "GSM8K": 94.4, # maj1@32
-# # "MATH": 53.2, # 4-shot
-# }
-
-# GEMINI_1V_MODEL_CARD = {
-# ##### Cost in USD #####
-# "usd_per_input_token": 25 / 1E6, # Gemini is free but rate limited for now. Pricing will be updated
-# "usd_per_output_token": 375 / 1E9,
-# ##### Time #####
-# "seconds_per_output_token": 0.042, # / 10.0, # TODO:
-# ##### Agg. Benchmark #####
-# "overall": 65.0, # 90.0, TODO: see note above in Gemini_1 model card
-# ##### Commonsense Reasoning #####
-# "reasoning": 80.0, # 87.8, TODO: see note above in Gemini_1 model card
-# # "HellaSwag": 87.8, # 10-shot
-# ##### World Knowledge #####
-# ##### Reading Comprehension #####
-# # "DROP": 82.4, # Variable shots ?
-# ##### Code #####
-# "code": 74.4,
-# # "HumanEval": 74.4, # 0-shot (IT)*
-# # "Natural2Code": 74.9, # 0-shot
-# ##### Math #####
-# "math": 94.4,
-# # "GSM8K": 94.4, # maj1@32
-# # "MATH": 53.2, # 4-shot
-# }