palimpzest 0.7.21__py3-none-any.whl → 0.8.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- palimpzest/__init__.py +37 -6
- palimpzest/agents/__init__.py +0 -0
- palimpzest/agents/compute_agents.py +0 -0
- palimpzest/agents/search_agents.py +637 -0
- palimpzest/constants.py +343 -209
- palimpzest/core/data/context.py +393 -0
- palimpzest/core/data/context_manager.py +163 -0
- palimpzest/core/data/dataset.py +639 -0
- palimpzest/core/data/{datareaders.py → iter_dataset.py} +202 -126
- palimpzest/core/elements/groupbysig.py +16 -13
- palimpzest/core/elements/records.py +166 -75
- palimpzest/core/lib/schemas.py +152 -390
- palimpzest/core/{data/dataclasses.py → models.py} +306 -170
- palimpzest/policy.py +2 -27
- palimpzest/prompts/__init__.py +35 -5
- palimpzest/prompts/agent_prompts.py +357 -0
- palimpzest/prompts/context_search.py +9 -0
- palimpzest/prompts/convert_prompts.py +62 -6
- palimpzest/prompts/filter_prompts.py +51 -6
- palimpzest/prompts/join_prompts.py +163 -0
- palimpzest/prompts/moa_proposer_convert_prompts.py +6 -6
- palimpzest/prompts/prompt_factory.py +375 -47
- palimpzest/prompts/split_proposer_prompts.py +1 -1
- palimpzest/prompts/util_phrases.py +5 -0
- palimpzest/prompts/validator.py +239 -0
- palimpzest/query/execution/all_sample_execution_strategy.py +134 -76
- palimpzest/query/execution/execution_strategy.py +210 -317
- palimpzest/query/execution/execution_strategy_type.py +5 -7
- palimpzest/query/execution/mab_execution_strategy.py +249 -136
- palimpzest/query/execution/parallel_execution_strategy.py +153 -244
- palimpzest/query/execution/single_threaded_execution_strategy.py +107 -64
- palimpzest/query/generators/generators.py +160 -331
- palimpzest/query/operators/__init__.py +15 -5
- palimpzest/query/operators/aggregate.py +50 -33
- palimpzest/query/operators/compute.py +201 -0
- palimpzest/query/operators/convert.py +33 -19
- palimpzest/query/operators/critique_and_refine_convert.py +7 -5
- palimpzest/query/operators/distinct.py +62 -0
- palimpzest/query/operators/filter.py +26 -16
- palimpzest/query/operators/join.py +403 -0
- palimpzest/query/operators/limit.py +3 -3
- palimpzest/query/operators/logical.py +205 -77
- palimpzest/query/operators/mixture_of_agents_convert.py +10 -8
- palimpzest/query/operators/physical.py +27 -21
- palimpzest/query/operators/project.py +3 -3
- palimpzest/query/operators/rag_convert.py +7 -7
- palimpzest/query/operators/retrieve.py +9 -9
- palimpzest/query/operators/scan.py +81 -42
- palimpzest/query/operators/search.py +524 -0
- palimpzest/query/operators/split_convert.py +10 -8
- palimpzest/query/optimizer/__init__.py +7 -9
- palimpzest/query/optimizer/cost_model.py +108 -441
- palimpzest/query/optimizer/optimizer.py +123 -181
- palimpzest/query/optimizer/optimizer_strategy.py +66 -61
- palimpzest/query/optimizer/plan.py +352 -67
- palimpzest/query/optimizer/primitives.py +43 -19
- palimpzest/query/optimizer/rules.py +484 -646
- palimpzest/query/optimizer/tasks.py +127 -58
- palimpzest/query/processor/config.py +42 -76
- palimpzest/query/processor/query_processor.py +73 -18
- palimpzest/query/processor/query_processor_factory.py +46 -38
- palimpzest/schemabuilder/schema_builder.py +15 -28
- palimpzest/utils/model_helpers.py +32 -77
- palimpzest/utils/progress.py +114 -102
- palimpzest/validator/__init__.py +0 -0
- palimpzest/validator/validator.py +306 -0
- {palimpzest-0.7.21.dist-info → palimpzest-0.8.1.dist-info}/METADATA +6 -1
- palimpzest-0.8.1.dist-info/RECORD +95 -0
- palimpzest/core/lib/fields.py +0 -141
- palimpzest/prompts/code_synthesis_prompts.py +0 -28
- palimpzest/query/execution/random_sampling_execution_strategy.py +0 -240
- palimpzest/query/generators/api_client_factory.py +0 -30
- palimpzest/query/operators/code_synthesis_convert.py +0 -488
- palimpzest/query/operators/map.py +0 -130
- palimpzest/query/processor/nosentinel_processor.py +0 -33
- palimpzest/query/processor/processing_strategy_type.py +0 -28
- palimpzest/query/processor/sentinel_processor.py +0 -88
- palimpzest/query/processor/streaming_processor.py +0 -149
- palimpzest/sets.py +0 -405
- palimpzest/utils/datareader_helpers.py +0 -61
- palimpzest/utils/demo_helpers.py +0 -75
- palimpzest/utils/field_helpers.py +0 -69
- palimpzest/utils/generation_helpers.py +0 -69
- palimpzest/utils/sandbox.py +0 -183
- palimpzest-0.7.21.dist-info/RECORD +0 -95
- /palimpzest/core/{elements/index.py → data/index_dataset.py} +0 -0
- {palimpzest-0.7.21.dist-info → palimpzest-0.8.1.dist-info}/WHEEL +0 -0
- {palimpzest-0.7.21.dist-info → palimpzest-0.8.1.dist-info}/licenses/LICENSE +0 -0
- {palimpzest-0.7.21.dist-info → palimpzest-0.8.1.dist-info}/top_level.txt +0 -0
palimpzest/constants.py
CHANGED
@@ -10,77 +10,125 @@ class Model(str, Enum):
     which requires invoking an LLM. It does NOT specify whether the model need be executed
     remotely or locally (if applicable).
     """
-    LLAMA3_2_3B = "meta-llama/Llama-3.2-3B-Instruct-Turbo"
-    LLAMA3_1_8B = "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo"
-    LLAMA3_3_70B = "meta-llama/Llama-3.3-70B-Instruct-Turbo"
-    LLAMA3_2_90B_V = "meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo"
-
-
-
-
-
+    LLAMA3_2_3B = "together_ai/meta-llama/Llama-3.2-3B-Instruct-Turbo"
+    LLAMA3_1_8B = "together_ai/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo"
+    LLAMA3_3_70B = "together_ai/meta-llama/Llama-3.3-70B-Instruct-Turbo"
+    LLAMA3_2_90B_V = "together_ai/meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo"
+    DEEPSEEK_V3 = "together_ai/deepseek-ai/DeepSeek-V3"
+    DEEPSEEK_R1_DISTILL_QWEN_1_5B = "together_ai/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
+    GPT_4o = "openai/gpt-4o-2024-08-06"
+    GPT_4o_MINI = "openai/gpt-4o-mini-2024-07-18"
+    GPT_4_1 = "openai/gpt-4.1-2025-04-14"
+    GPT_4_1_MINI = "openai/gpt-4.1-mini-2025-04-14"
+    GPT_4_1_NANO = "openai/gpt-4.1-nano-2025-04-14"
+    GPT_5 = "openai/gpt-5-2025-08-07"
+    GPT_5_MINI = "openai/gpt-5-mini-2025-08-07"
+    GPT_5_NANO = "openai/gpt-5-nano-2025-08-07"
+    o4_MINI = "openai/o4-mini-2025-04-16" # noqa: N815
     TEXT_EMBEDDING_3_SMALL = "text-embedding-3-small"
     CLIP_VIT_B_32 = "clip-ViT-B-32"
+    CLAUDE_3_5_SONNET = "anthropic/claude-3-5-sonnet-20241022"
+    CLAUDE_3_7_SONNET = "anthropic/claude-3-7-sonnet-20250219"
+    CLAUDE_3_5_HAIKU = "anthropic/claude-3-5-haiku-20241022"
+    GEMINI_2_0_FLASH = "vertex_ai/gemini-2.0-flash"
+    GEMINI_2_5_FLASH = "vertex_ai/gemini-2.5-flash"
+    GEMINI_2_5_PRO = "vertex_ai/gemini-2.5-pro"
+    GOOGLE_GEMINI_2_5_FLASH = "google/gemini-2.5-flash"
+    GOOGLE_GEMINI_2_5_FLASH_LITE = "google/gemini-2.5-flash-lite"
+    GOOGLE_GEMINI_2_5_PRO = "google/gemini-2.5-pro"
+    LLAMA_4_MAVERICK = "vertex_ai/meta/llama-4-maverick-17b-128e-instruct-maas"
+    GPT_4o_AUDIO_PREVIEW = "openai/gpt-4o-audio-preview"
+    GPT_4o_MINI_AUDIO_PREVIEW = "openai/gpt-4o-mini-audio-preview"
+    VLLM_QWEN_1_5_0_5B_CHAT = "hosted_vllm/qwen/Qwen1.5-0.5B-Chat"
     # o1 = "o1-2024-12-17"
 
     def __repr__(self):
         return f"{self.name}"
 
-    def is_deepseek_model(self):
-        return "deepseek" in self.value.lower()
-
     def is_llama_model(self):
         return "llama" in self.value.lower()
 
-    def is_mixtral_model(self):
-        return "mixtral" in self.value.lower()
-
     def is_clip_model(self):
         return "clip" in self.value.lower()
 
     def is_together_model(self):
-
-        is_mixtral_model = self.is_mixtral_model()
-        is_deepseek_model = self.is_deepseek_model()
-        is_clip_model = self.is_clip_model()
-        return is_llama_model or is_mixtral_model or is_deepseek_model or is_clip_model
-
-    def is_gpt_4o_model(self):
-        return "gpt-4o" in self.value.lower()
-
-    def is_o1_model(self):
-        return "o1" in self.value.lower()
+        return "together_ai" in self.value.lower() or self.is_clip_model()
 
     def is_text_embedding_model(self):
         return "text-embedding" in self.value.lower()
 
+    def is_o_model(self):
+        return self in [Model.o4_MINI]
+
+    def is_gpt_5_model(self):
+        return self in [Model.GPT_5, Model.GPT_5_MINI, Model.GPT_5_NANO]
+
     def is_openai_model(self):
-
-
-
-        return
+        return "openai" in self.value.lower() or self.is_text_embedding_model()
+
+    def is_anthropic_model(self):
+        return "anthropic" in self.value.lower()
+
+    def is_vertex_model(self):
+        return "vertex_ai" in self.value.lower()
+
+    def is_google_model(self):
+        return "google" in self.value.lower()
+
+    def is_vllm_model(self):
+        return "hosted_vllm" in self.value.lower()
 
+    def is_reasoning_model(self):
+        reasoning_models = [
+            Model.GPT_5, Model.GPT_5_MINI, Model.GPT_5_NANO, Model.o4_MINI,
+            Model.GEMINI_2_5_PRO, Model.GEMINI_2_5_FLASH,
+            Model.GOOGLE_GEMINI_2_5_PRO, Model.GOOGLE_GEMINI_2_5_FLASH, Model.GOOGLE_GEMINI_2_5_FLASH_LITE,
+            Model.CLAUDE_3_7_SONNET,
+        ]
+        return self in reasoning_models
+
+    def is_text_model(self):
+        non_text_models = [
+            Model.LLAMA3_2_90B_V,
+            Model.CLIP_VIT_B_32, Model.TEXT_EMBEDDING_3_SMALL,
+            Model.GPT_4o_AUDIO_PREVIEW, Model.GPT_4o_MINI_AUDIO_PREVIEW,
+        ]
+        return self not in non_text_models
+
+    # TODO: I think SONNET and HAIKU are vision-capable too
     def is_vision_model(self):
-
-
-
-
-
+        return self in [
+            Model.LLAMA3_2_90B_V, Model.LLAMA_4_MAVERICK,
+            Model.GPT_4o, Model.GPT_4o_MINI, Model.GPT_4_1, Model.GPT_4_1_MINI, Model.GPT_4_1_NANO, Model.o4_MINI, Model.GPT_5, Model.GPT_5_MINI, Model.GPT_5_NANO,
+            Model.GEMINI_2_0_FLASH, Model.GEMINI_2_5_FLASH, Model.GEMINI_2_5_PRO,
+            Model.GOOGLE_GEMINI_2_5_PRO, Model.GOOGLE_GEMINI_2_5_FLASH, Model.GOOGLE_GEMINI_2_5_FLASH_LITE,
         ]
-        return self.value in vision_models
 
-    def
-
-
-
+    def is_audio_model(self):
+        return self in [
+            Model.GPT_4o_AUDIO_PREVIEW, Model.GPT_4o_MINI_AUDIO_PREVIEW,
+            Model.GEMINI_2_0_FLASH, Model.GEMINI_2_5_FLASH, Model.GEMINI_2_5_PRO,
+            Model.GOOGLE_GEMINI_2_5_PRO, Model.GOOGLE_GEMINI_2_5_FLASH, Model.GOOGLE_GEMINI_2_5_FLASH_LITE,
+        ]
 
-
-
-
-
+    def is_text_image_multimodal_model(self):
+        return self in [
+            Model.LLAMA_4_MAVERICK,
+            Model.GPT_4o, Model.GPT_4o_MINI, Model.GPT_4_1, Model.GPT_4_1_MINI, Model.GPT_4_1_NANO, Model.o4_MINI, Model.GPT_5, Model.GPT_5_MINI, Model.GPT_5_NANO,
+            Model.GEMINI_2_0_FLASH, Model.GEMINI_2_5_FLASH, Model.GEMINI_2_5_PRO,
+            Model.GOOGLE_GEMINI_2_5_PRO, Model.GOOGLE_GEMINI_2_5_FLASH, Model.GOOGLE_GEMINI_2_5_FLASH_LITE,
+        ]
+
+    def is_text_audio_multimodal_model(self):
+        return self in [
+            Model.GPT_4o_AUDIO_PREVIEW, Model.GPT_4o_MINI_AUDIO_PREVIEW,
+            Model.GEMINI_2_0_FLASH, Model.GEMINI_2_5_FLASH, Model.GEMINI_2_5_PRO,
+            Model.GOOGLE_GEMINI_2_5_PRO, Model.GOOGLE_GEMINI_2_5_FLASH, Model.GOOGLE_GEMINI_2_5_FLASH_LITE,
+        ]
+
+    def is_embedding_model(self):
+        return self in [Model.CLIP_VIT_B_32, Model.TEXT_EMBEDDING_3_SMALL]
 
-    OPENAI = "openai"
-    TOGETHER = "together"
 
 class PromptStrategy(str, Enum):
     """
@@ -90,28 +138,48 @@ class PromptStrategy(str, Enum):
 
     # Chain-of-Thought Boolean Prompt Strategies
    COT_BOOL = "chain-of-thought-bool"
+    COT_BOOL_NO_REASONING = "chain-of-thought-bool-no-reasoning"
     # COT_BOOL_CRITIC = "chain-of-thought-bool-critic"
     # COT_BOOL_REFINE = "chain-of-thought-bool-refine"
 
     # Chain-of-Thought Boolean with Image Prompt Strategies
     COT_BOOL_IMAGE = "chain-of-thought-bool-image"
+    COT_BOOL_IMAGE_NO_REASONING = "chain-of-thought-bool-image"
+    COT_BOOL_AUDIO = "chain-of-thought-bool-audio"
+    COT_BOOL_AUDIO_NO_REASONING = "chain-of-thought-bool-audio"
     # COT_BOOL_IMAGE_CRITIC = "chain-of-thought-bool-image-critic"
     # COT_BOOL_IMAGE_REFINE = "chain-of-thought-bool-image-refine"
 
+    # Chain-of-Thought Join Prompt Strategies
+    COT_JOIN = "chain-of-thought-join"
+    COT_JOIN_NO_REASONING = "chain-of-thought-join-no-reasoning"
+    COT_JOIN_IMAGE = "chain-of-thought-join-image"
+    COT_JOIN_IMAGE_NO_REASONING = "chain-of-thought-join-image-no-reasoning"
+    COT_JOIN_AUDIO = "chain-of-thought-join-audio"
+    COT_JOIN_AUDIO_NO_REASONING = "chain-of-thought-join-audio-no-reasoning"
+
     # Chain-of-Thought Question Answering Prompt Strategies
     COT_QA = "chain-of-thought-question"
+    COT_QA_NO_REASONING = "chain-of-thought-question-no-reasoning"
     COT_QA_CRITIC = "chain-of-thought-question-critic"
     COT_QA_REFINE = "chain-of-thought-question-refine"
 
     # Chain-of-Thought Question with Image Prompt Strategies
     COT_QA_IMAGE = "chain-of-thought-question-image"
+    COT_QA_IMAGE_NO_REASONING = "chain-of-thought-question-image-no-reasoning"
     COT_QA_IMAGE_CRITIC = "chain-of-thought-question-critic-image"
     COT_QA_IMAGE_REFINE = "chain-of-thought-question-refine-image"
 
+    # Chain-of-Thought Queestion with Audio Prompt Strategies
+    COT_QA_AUDIO = "chain-of-thought-question-audio"
+    COT_QA_AUDIO_NO_REASONING = "chain-of-thought-question-audio-no-reasoning"
+    # TODO: COT_QA_AUDIO_CRITIC/REFINE
+
     # Mixture-of-Agents Prompt Strategies
     COT_MOA_PROPOSER = "chain-of-thought-mixture-of-agents-proposer"
     COT_MOA_PROPOSER_IMAGE = "chain-of-thought-mixture-of-agents-proposer-image"
     COT_MOA_AGG = "chain-of-thought-mixture-of-agents-aggregation"
+    # TODO: COT_MOA_PROPOSER_AUDIO
 
     # Split Convert Prompt Strategies
     SPLIT_PROPOSER = "split-proposer"
@@ -120,11 +188,17 @@ class PromptStrategy(str, Enum):
     def is_image_prompt(self):
         return "image" in self.value
 
+    def is_audio_prompt(self):
+        return "audio" in self.value
+
     def is_bool_prompt(self):
         return "bool" in self.value
 
+    def is_join_prompt(self):
+        return "join" in self.value
+
     def is_convert_prompt(self):
-        return "bool" not in self.value
+        return "bool" not in self.value and "join" not in self.value
 
     def is_critic_prompt(self):
         return "critic" in self.value
@@ -144,6 +218,9 @@ class PromptStrategy(str, Enum):
     def is_split_merger_prompt(self):
         return "split-merger" in self.value
 
+    def is_no_reasoning_prompt(self):
+        return "no-reasoning" in self.value
+
 class AggFunc(str, Enum):
     COUNT = "count"
     AVERAGE = "average"
@@ -169,6 +246,7 @@ class PickOutputStrategy(str, Enum):
     ENSEMBLE = "ensemble"
 
 
+AUDIO_EXTENSIONS = [".wav"]
 IMAGE_EXTENSIONS = [".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff"]
 PDF_EXTENSIONS = [".pdf"]
 XLS_EXTENSIONS = [".xls", ".xlsx"]
@@ -210,8 +288,8 @@ NAIVE_BYTES_PER_RECORD = 1024
 # Rough conversion from # of characters --> # of tokens; assumes 1 token ~= 4 chars
 TOKENS_PER_CHARACTER = 0.25
 
-# Rough estimate of the number of tokens the context is allowed to take up for
-
+# Rough estimate of the number of tokens the context is allowed to take up for LLAMA3 models
+LLAMA_CONTEXT_TOKENS_LIMIT = 6000
 
 # a naive estimate for the input record size
 NAIVE_EST_SOURCE_RECORD_SIZE_IN_BYTES = 1_000_000
@@ -219,6 +297,9 @@ NAIVE_EST_SOURCE_RECORD_SIZE_IN_BYTES = 1_000_000
 # a naive estimate for filter selectivity
 NAIVE_EST_FILTER_SELECTIVITY = 0.5
 
+# a naive estimate for join selectivity
+NAIVE_EST_JOIN_SELECTIVITY = 0.5
+
 # a naive estimate for the number of input tokens processed per record
 NAIVE_EST_NUM_INPUT_TOKENS = 1000
 
@@ -246,6 +327,7 @@ LOG_LLM_OUTPUT = False
 # - in the future we should split quality for vision vs. multi-modal vs. text
 # - code quality was computed using HumanEval, but that benchmark is too easy and should be replaced.
 # - https://huggingface.co/spaces/TIGER-Lab/MMLU-Pro
+# - https://www.vals.ai/benchmarks/mmlu_pro-08-12-2025
 #
 # Cost is presented in terms of USD / token for input tokens and USD / token for
 # generated tokens.
@@ -260,75 +342,74 @@ LLAMA3_2_3B_INSTRUCT_MODEL_CARD = {
     "usd_per_input_token": 0.06 / 1e6,
     "usd_per_output_token": 0.06 / 1e6,
     ##### Time #####
-    "seconds_per_output_token": 0.
+    "seconds_per_output_token": 0.0079,
     ##### Agg. Benchmark #####
     "overall": 36.50, # https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct/discussions/13
-    ##### Code #####
-    "code": 0.0,
 }
 LLAMA3_1_8B_INSTRUCT_MODEL_CARD = {
     ##### Cost in USD #####
     "usd_per_input_token": 0.18 / 1e6,
     "usd_per_output_token": 0.18 / 1e6,
     ##### Time #####
-    "seconds_per_output_token": 0.
+    "seconds_per_output_token": 0.0050,
     ##### Agg. Benchmark #####
     "overall": 44.25,
-    ##### Code #####
-    "code": 72.6,
 }
 LLAMA3_3_70B_INSTRUCT_MODEL_CARD = {
     ##### Cost in USD #####
     "usd_per_input_token": 0.88 / 1e6,
     "usd_per_output_token": 0.88 / 1e6,
     ##### Time #####
-    "seconds_per_output_token": 0.
+    "seconds_per_output_token": 0.0122,
     ##### Agg. Benchmark #####
-    "overall":
-    ##### Code #####
-    "code": 88.4,
+    "overall": 69.9,
 }
 LLAMA3_2_90B_V_MODEL_CARD = {
     ##### Cost in USD #####
     "usd_per_input_token": 1.2 / 1e6,
     "usd_per_output_token": 1.2 / 1e6,
     ##### Time #####
-    "seconds_per_output_token": 0.
+    "seconds_per_output_token": 0.0303,
     ##### Agg. Benchmark #####
     "overall": 65.00, # set to be slightly higher than gpt-4o-mini
 }
-MIXTRAL_8X_7B_MODEL_CARD = {
-    ##### Cost in USD #####
-    "usd_per_input_token": 0.6 / 1e6,
-    "usd_per_output_token": 0.6 / 1e6,
-    ##### Time #####
-    "seconds_per_output_token": 0.0112,
-    ##### Agg. Benchmark #####
-    "overall": 43.27,
-    ##### Code #####
-    "code": 40.0,
-}
 DEEPSEEK_V3_MODEL_CARD = {
     ##### Cost in USD #####
     "usd_per_input_token": 1.25 / 1E6,
     "usd_per_output_token": 1.25 / 1E6,
     ##### Time #####
-    "seconds_per_output_token": 0.
+    "seconds_per_output_token": 0.0114,
     ##### Agg. Benchmark #####
-    "overall":
-    ##### Code #####
-    "code": 92.0,
+    "overall": 73.8,
 }
 DEEPSEEK_R1_DISTILL_QWEN_1_5B_MODEL_CARD = {
     ##### Cost in USD #####
     "usd_per_input_token": 0.18 / 1E6,
     "usd_per_output_token": 0.18 / 1E6,
     ##### Time #####
-    "seconds_per_output_token": 0.
+    "seconds_per_output_token": 0.0050, # NOTE: copied to be same as LLAMA3_1_8B_INSTRUCT_MODEL_CARD; need to update when we have data
     ##### Agg. Benchmark #####
     "overall": 39.90, # https://www.reddit.com/r/LocalLLaMA/comments/1iserf9/deepseek_r1_distilled_models_mmlu_pro_benchmarks/
-
-
+}
+GPT_4o_AUDIO_PREVIEW_MODEL_CARD = {
+    # NOTE: COPYING OVERALL AND SECONDS_PER_OUTPUT_TOKEN FROM GPT_4o; need to update when we have audio-specific benchmarks
+    ##### Cost in USD #####
+    "usd_per_audio_input_token": 2.5 / 1e6,
+    "usd_per_output_token": 10.0 / 1e6,
+    ##### Time #####
+    "seconds_per_output_token": 0.0080,
+    ##### Agg. Benchmark #####
+    "overall": 74.1,
+}
+GPT_4o_MINI_AUDIO_PREVIEW_MODEL_CARD = {
+    # NOTE: COPYING OVERALL AND SECONDS_PER_OUTPUT_TOKEN FROM GPT_4o; need to update when we have audio-specific benchmarks
+    ##### Cost in USD #####
+    "usd_per_audio_input_token": 0.15 / 1e6,
+    "usd_per_output_token": 0.6 / 1e6,
+    ##### Time #####
+    "seconds_per_output_token": 0.0159,
+    ##### Agg. Benchmark #####
+    "overall": 62.7,
 }
 GPT_4o_MODEL_CARD = {
     # NOTE: it is unclear if the same ($ / token) costs can be applied for vision, or if we have to calculate this ourselves
@@ -336,11 +417,9 @@ GPT_4o_MODEL_CARD = {
     "usd_per_input_token": 2.5 / 1e6,
     "usd_per_output_token": 10.0 / 1e6,
     ##### Time #####
-    "seconds_per_output_token": 0.
+    "seconds_per_output_token": 0.0080,
     ##### Agg. Benchmark #####
-    "overall": 74.
-    ##### Code #####
-    "code": 90.0,
+    "overall": 74.1,
 }
 GPT_4o_MINI_MODEL_CARD = {
     # NOTE: it is unclear if the same ($ / token) costs can be applied for vision, or if we have to calculate this ourselves
@@ -348,24 +427,90 @@ GPT_4o_MINI_MODEL_CARD = {
     "usd_per_input_token": 0.15 / 1e6,
     "usd_per_output_token": 0.6 / 1e6,
     ##### Time #####
-    "seconds_per_output_token": 0.
+    "seconds_per_output_token": 0.0159,
+    ##### Agg. Benchmark #####
+    "overall": 62.7,
+}
+GPT_4_1_MODEL_CARD = {
+    # NOTE: it is unclear if the same ($ / token) costs can be applied for vision, or if we have to calculate this ourselves
+    ##### Cost in USD #####
+    "usd_per_input_token": 2.0 / 1e6,
+    "usd_per_output_token": 8.0 / 1e6,
+    ##### Time #####
+    "seconds_per_output_token": 0.0076,
+    ##### Agg. Benchmark #####
+    "overall": 80.5,
+}
+GPT_4_1_MINI_MODEL_CARD = {
+    # NOTE: it is unclear if the same ($ / token) costs can be applied for vision, or if we have to calculate this ourselves
+    ##### Cost in USD #####
+    "usd_per_input_token": 0.4 / 1e6,
+    "usd_per_output_token": 1.6 / 1e6,
+    ##### Time #####
+    "seconds_per_output_token": 0.0161,
     ##### Agg. Benchmark #####
-    "overall":
-    ##### Code #####
-    "code": 86.0,
+    "overall": 77.2,
 }
-
+GPT_4_1_NANO_MODEL_CARD = {
     # NOTE: it is unclear if the same ($ / token) costs can be applied for vision, or if we have to calculate this ourselves
     ##### Cost in USD #####
-    "usd_per_input_token":
-    "usd_per_output_token":
+    "usd_per_input_token": 0.1 / 1e6,
+    "usd_per_output_token": 0.4 / 1e6,
     ##### Time #####
-    "seconds_per_output_token": 0.
+    "seconds_per_output_token": 0.0060,
     ##### Agg. Benchmark #####
-    "overall":
-    ##### Code #####
-    "code": 92.3, # NOTE: just copying MMLU score for now
+    "overall": 62.3,
 }
+GPT_5_MODEL_CARD = {
+    # NOTE: it is unclear if the same ($ / token) costs can be applied for vision, or if we have to calculate this ourselves
+    ##### Cost in USD #####
+    "usd_per_input_token": 1.25 / 1e6,
+    "usd_per_output_token": 10.0 / 1e6,
+    ##### Time #####
+    "seconds_per_output_token": 0.0060,
+    ##### Agg. Benchmark #####
+    "overall": 87.00,
+}
+GPT_5_MINI_MODEL_CARD = {
+    # NOTE: it is unclear if the same ($ / token) costs can be applied for vision, or if we have to calculate this ourselves
+    ##### Cost in USD #####
+    "usd_per_input_token": 0.25 / 1e6,
+    "usd_per_output_token": 2.0 / 1e6,
+    ##### Time #####
+    "seconds_per_output_token": 0.0135,
+    ##### Agg. Benchmark #####
+    "overall": 82.50,
+}
+GPT_5_NANO_MODEL_CARD = {
+    # NOTE: it is unclear if the same ($ / token) costs can be applied for vision, or if we have to calculate this ourselves
+    ##### Cost in USD #####
+    "usd_per_input_token": 0.05 / 1e6,
+    "usd_per_output_token": 0.4 / 1e6,
+    ##### Time #####
+    "seconds_per_output_token": 0.0055,
+    ##### Agg. Benchmark #####
+    "overall": 77.9,
+}
+o4_MINI_MODEL_CARD = { # noqa: N816
+    # NOTE: it is unclear if the same ($ / token) costs can be applied for vision, or if we have to calculate this ourselves
+    ##### Cost in USD #####
+    "usd_per_input_token": 1.1 / 1e6,
+    "usd_per_output_token": 4.4 / 1e6,
+    ##### Time #####
+    "seconds_per_output_token": 0.0092,
+    ##### Agg. Benchmark #####
+    "overall": 80.6, # using number reported for o3-mini; true number is likely higher
+}
+# o1_MODEL_CARD = { # noqa: N816
+#     # NOTE: it is unclear if the same ($ / token) costs can be applied for vision, or if we have to calculate this ourselves
+#     ##### Cost in USD #####
+#     "usd_per_input_token": 15 / 1e6,
+#     "usd_per_output_token": 60 / 1e6,
+#     ##### Time #####
+#     "seconds_per_output_token": 0.0110,
+#     ##### Agg. Benchmark #####
+#     "overall": 83.50,
+# }
 TEXT_EMBEDDING_3_SMALL_MODEL_CARD = {
     ##### Cost in USD #####
     "usd_per_input_token": 0.02 / 1e6,
@@ -384,7 +529,91 @@ CLIP_VIT_B_32_MODEL_CARD = {
     ##### Agg. Benchmark #####
     "overall": 63.3, # NOTE: ImageNet top-1 accuracy
 }
-
+CLAUDE_3_5_SONNET_MODEL_CARD = {
+    ##### Cost in USD #####
+    "usd_per_input_token": 3.0 / 1e6,
+    "usd_per_output_token": 15.0 / 1e6,
+    ##### Time #####
+    "seconds_per_output_token": 0.0154,
+    ##### Agg. Benchmark #####
+    "overall": 78.4,
+}
+CLAUDE_3_7_SONNET_MODEL_CARD = {
+    ##### Cost in USD #####
+    "usd_per_input_token": 3.0 / 1e6,
+    "usd_per_output_token": 15.0 / 1e6,
+    ##### Time #####
+    "seconds_per_output_token": 0.0156,
+    ##### Agg. Benchmark #####
+    "overall": 80.7,
+}
+CLAUDE_3_5_HAIKU_MODEL_CARD = {
+    ##### Cost in USD #####
+    "usd_per_input_token": 0.8 / 1e6,
+    "usd_per_output_token": 4.0 / 1e6,
+    ##### Time #####
+    "seconds_per_output_token": 0.0189,
+    ##### Agg. Benchmark #####
+    "overall": 64.1,
+}
+GEMINI_2_0_FLASH_MODEL_CARD = {
+    ##### Cost in USD #####
+    "usd_per_input_token": 0.15 / 1e6,
+    "usd_per_output_token": 0.6 / 1e6,
+    "usd_per_audio_input_token": 1.0 / 1e6,
+    ##### Time #####
+    "seconds_per_output_token": 0.0054,
+    ##### Agg. Benchmark #####
+    "overall": 77.40,
+}
+GEMINI_2_5_FLASH_LITE_MODEL_CARD = {
+    ##### Cost in USD #####
+    "usd_per_input_token": 0.1 / 1e6,
+    "usd_per_output_token": 0.4 / 1e6,
+    "usd_per_audio_input_token": 0.3 / 1e6,
+    ##### Time #####
+    "seconds_per_output_token": 0.0034,
+    ##### Agg. Benchmark #####
+    "overall": 79.1, # NOTE: interpolated between gemini 2.5 flash and gemini 2.0 flash
+}
+GEMINI_2_5_FLASH_MODEL_CARD = {
+    ##### Cost in USD #####
+    "usd_per_input_token": 0.30 / 1e6,
+    "usd_per_output_token": 2.5 / 1e6,
+    "usd_per_audio_input_token": 1.0 / 1e6,
+    ##### Time #####
+    "seconds_per_output_token": 0.0044,
+    ##### Agg. Benchmark #####
+    "overall": 80.75, # NOTE: interpolated between gemini 2.0 flash and gemini 2.5 pro
+}
+GEMINI_2_5_PRO_MODEL_CARD = {
+    ##### Cost in USD #####
+    "usd_per_input_token": 1.25 / 1e6,
+    "usd_per_output_token": 10.0 / 1e6,
+    "usd_per_audio_input_token": 1.25 / 1e6,
+    ##### Time #####
+    "seconds_per_output_token": 0.0072,
+    ##### Agg. Benchmark #####
+    "overall": 84.10,
+}
+LLAMA_4_MAVERICK_MODEL_CARD = {
+    ##### Cost in USD #####
+    "usd_per_input_token": 0.35 / 1e6,
+    "usd_per_output_token": 1.15 / 1e6,
+    ##### Time #####
+    "seconds_per_output_token": 0.0122,
+    ##### Agg. Benchmark #####
+    "overall": 79.4,
+}
+VLLM_QWEN_1_5_0_5B_CHAT_MODEL_CARD = {
+    ##### Cost in USD #####
+    "usd_per_input_token": 0.0 / 1e6,
+    "usd_per_output_token": 0.0 / 1e6,
+    ##### Time #####
+    "seconds_per_output_token": 0.1000, # TODO: fill-in with a better estimate
+    ##### Agg. Benchmark #####
+    "overall": 30.0, # TODO: fill-in with a better estimate
+}
 
 MODEL_CARDS = {
     Model.LLAMA3_2_3B.value: LLAMA3_2_3B_INSTRUCT_MODEL_CARD,
@@ -393,124 +622,29 @@ MODEL_CARDS = {
     Model.LLAMA3_2_90B_V.value: LLAMA3_2_90B_V_MODEL_CARD,
     Model.DEEPSEEK_V3.value: DEEPSEEK_V3_MODEL_CARD,
     Model.DEEPSEEK_R1_DISTILL_QWEN_1_5B.value: DEEPSEEK_R1_DISTILL_QWEN_1_5B_MODEL_CARD,
-    Model.MIXTRAL.value: MIXTRAL_8X_7B_MODEL_CARD,
     Model.GPT_4o.value: GPT_4o_MODEL_CARD,
     Model.GPT_4o_MINI.value: GPT_4o_MINI_MODEL_CARD,
+    Model.GPT_4o_AUDIO_PREVIEW.value: GPT_4o_AUDIO_PREVIEW_MODEL_CARD,
+    Model.GPT_4o_MINI_AUDIO_PREVIEW.value: GPT_4o_MINI_AUDIO_PREVIEW_MODEL_CARD,
+    Model.GPT_4_1.value: GPT_4_1_MODEL_CARD,
+    Model.GPT_4_1_MINI.value: GPT_4_1_MINI_MODEL_CARD,
+    Model.GPT_4_1_NANO.value: GPT_4_1_NANO_MODEL_CARD,
+    Model.GPT_5.value: GPT_5_MODEL_CARD,
+    Model.GPT_5_MINI.value: GPT_5_MINI_MODEL_CARD,
+    Model.GPT_5_NANO.value: GPT_5_NANO_MODEL_CARD,
+    Model.o4_MINI.value: o4_MINI_MODEL_CARD,
     # Model.o1.value: o1_MODEL_CARD,
     Model.TEXT_EMBEDDING_3_SMALL.value: TEXT_EMBEDDING_3_SMALL_MODEL_CARD,
     Model.CLIP_VIT_B_32.value: CLIP_VIT_B_32_MODEL_CARD,
+    Model.CLAUDE_3_5_SONNET.value: CLAUDE_3_5_SONNET_MODEL_CARD,
+    Model.CLAUDE_3_7_SONNET.value: CLAUDE_3_7_SONNET_MODEL_CARD,
+    Model.CLAUDE_3_5_HAIKU.value: CLAUDE_3_5_HAIKU_MODEL_CARD,
+    Model.GEMINI_2_0_FLASH.value: GEMINI_2_0_FLASH_MODEL_CARD,
+    Model.GEMINI_2_5_FLASH.value: GEMINI_2_5_FLASH_MODEL_CARD,
+    Model.GEMINI_2_5_PRO.value: GEMINI_2_5_PRO_MODEL_CARD,
+    Model.GOOGLE_GEMINI_2_5_FLASH.value: GEMINI_2_5_FLASH_MODEL_CARD,
+    Model.GOOGLE_GEMINI_2_5_FLASH_LITE.value: GEMINI_2_5_FLASH_LITE_MODEL_CARD,
+    Model.GOOGLE_GEMINI_2_5_PRO.value: GEMINI_2_5_PRO_MODEL_CARD,
+    Model.LLAMA_4_MAVERICK.value: LLAMA_4_MAVERICK_MODEL_CARD,
+    Model.VLLM_QWEN_1_5_0_5B_CHAT.value: VLLM_QWEN_1_5_0_5B_CHAT_MODEL_CARD,
 }
-
-
-###### DEPRECATED ######
-# # NOTE: seconds_per_output_token is based on `gpt-3.5-turbo-1106`
-# GPT_3_5_MODEL_CARD = {
-#     ##### Cost in USD #####
-#     "usd_per_input_token": 0.5 / 1E6,
-#     "usd_per_output_token": 1.5 / 1E6,
-#     ##### Time #####
-#     "seconds_per_output_token": 0.0065,
-#     ##### Agg. Benchmark #####
-#     "overall": 70.0, # 5-shot
-#     ##### Commonsense Reasoning #####
-#     "reasoning": 84.1,
-#     ### "HellaSwag": 85.5,^ # 10-shot
-#     ### "WinoGrande": 81.6,^ # 5-shot
-#     ### "Arc-e": 85.2,^ # 25-shot
-#     ##### World Knowledge #####
-#     ##### Reading Comprehension #####
-#     ### "DROP": 64.1, # 3-shot
-#     ##### Code #####
-#     "code": 48.1,
-#     ### "HumanEval": 48.1,^ # 0-shot
-#     ##### Math #####
-#     "math": 57.1,
-#     ### "GSM8K": 57.1,^ # 5-shot
-# }
-# # NOTE: the seconds_per_output_token was computed based on a slightly different model ('gpt-4-1106-preview')
-# # and the benchmark statistics were computed based on the GPT-4 Technical Report; these might be
-# # slightly innacurate compared to the real numbers for gpt-4-0125-preview, but we'll use them until
-# # we have something better. (The cost metrics are accurate).
-# GPT_4_MODEL_CARD = {
-#     ##### Cost in USD #####
-#     "usd_per_input_token": 10 / 1E6,
-#     "usd_per_output_token": 30 / 1E6,
-#     ##### Time #####
-#     "seconds_per_output_token": 0.018,
-#     ##### Agg. Benchmark #####
-#     "overall": 86.4, # 5-shot
-#     ##### Commonsense Reasoning #####
-#     "reasoning": 93.0,
-#     ### "HellaSwag": 95.3,^ # 10-shot
-#     ### "WinoGrande": 87.5,^ # 5-shot
-#     ### "Arc-e": 96.3,^ # 25-shot
-#     ##### World Knowledge #####
-#     ##### Reading Comprehension #####
-#     ### "DROP": 80.9, # 3-shot
-#     ##### Code #####
-#     "code": 67.0,
-#     ### "HumanEval": 67.0,^ # 0-shot
-#     ##### Math #####
-#     "math": 92.0,
-#     ### "GSM8K": 92.0,^ # 5-shot
-# }
-
-# # TODO: use cost info in here: https://platform.openai.com/docs/guides/vision/calculating-costs
-# GPT_4V_MODEL_CARD = {
-#     ##### Cost in USD #####
-#     "usd_per_input_token": 10 / 1E6,
-#     "usd_per_output_token": 30 / 1E6,
-#     ##### Time #####
-#     "seconds_per_output_token": 0.042 / 10.0, # TODO: / 10.0 is a hack; need to figure out why time estimates are so off
-#     ##### Agg. Benchmark #####
-#     "overall": 86.4,
-# }
-
-
-# GEMINI_1_MODEL_CARD = {
-#     ##### Cost in USD #####
-#     "usd_per_input_token": 125 / 1E8, # Gemini is free but rate limited for now. Pricing will be updated
-#     "usd_per_output_token": 375 / 1E9,
-#     ##### Time #####
-#     "seconds_per_output_token": 0.042 / 10.0, # TODO:
-#     ##### Agg. Benchmark #####
-#     "overall": 65.0, # 90.0 TODO: we are using the free version of Gemini which is substantially worse than its paid version; I'm manually revising it's quality below that of Mixtral
-#     ##### Commonsense Reasoning #####
-#     "reasoning": 80.0, # 87.8, TODO: see note above on overall
-#     # "HellaSwag": 87.8, # 10-shot
-#     ##### World Knowledge #####
-#     ##### Reading Comprehension #####
-#     # "DROP": 82.4, # Variable shots ?
-#     ##### Code #####
-#     "code": 74.4,
-#     # "HumanEval": 74.4, # 0-shot (IT)*
-#     # "Natural2Code": 74.9, # 0-shot
-#     ##### Math #####
-#     "math": 94.4,
-#     # "GSM8K": 94.4, # maj1@32
-#     # "MATH": 53.2, # 4-shot
-# }
-
-# GEMINI_1V_MODEL_CARD = {
-#     ##### Cost in USD #####
-#     "usd_per_input_token": 25 / 1E6, # Gemini is free but rate limited for now. Pricing will be updated
-#     "usd_per_output_token": 375 / 1E9,
-#     ##### Time #####
-#     "seconds_per_output_token": 0.042, # / 10.0, # TODO:
-#     ##### Agg. Benchmark #####
-#     "overall": 65.0, # 90.0, TODO: see note above in Gemini_1 model card
-#     ##### Commonsense Reasoning #####
-#     "reasoning": 80.0, # 87.8, TODO: see note above in Gemini_1 model card
-#     # "HellaSwag": 87.8, # 10-shot
-#     ##### World Knowledge #####
-#     ##### Reading Comprehension #####
-#     # "DROP": 82.4, # Variable shots ?
-#     ##### Code #####
-#     "code": 74.4,
-#     # "HumanEval": 74.4, # 0-shot (IT)*
-#     # "Natural2Code": 74.9, # 0-shot
-#     ##### Math #####
-#     "math": 94.4,
-#     # "GSM8K": 94.4, # maj1@32
-#     # "MATH": 53.2, # 4-shot
-# }