crfm-helm 0.5.0__py3-none-any.whl → 0.5.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of crfm-helm might be problematic.
- {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/METADATA +19 -5
- {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/RECORD +121 -76
- helm/benchmark/adaptation/adapter_spec.py +32 -31
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/multimodal/multimodal_prompt.py +7 -0
- helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +2 -0
- helm/benchmark/annotation/air_bench_annotator.py +64 -0
- helm/benchmark/annotation/annotator_factory.py +6 -0
- helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +1 -1
- helm/benchmark/annotation/live_qa_annotator.py +84 -0
- helm/benchmark/annotation/medication_qa_annotator.py +81 -0
- helm/benchmark/augmentations/perturbation.py +17 -1
- helm/benchmark/augmentations/test_perturbation.py +30 -0
- helm/benchmark/augmentations/translate_perturbation.py +1 -0
- helm/benchmark/huggingface_registration.py +16 -6
- helm/benchmark/metrics/air_bench_metrics.py +56 -0
- helm/benchmark/metrics/efficiency_metrics.py +9 -2
- helm/benchmark/metrics/evaluate_reference_metrics.py +16 -0
- helm/benchmark/metrics/fin_qa_metrics.py +60 -0
- helm/benchmark/metrics/fin_qa_metrics_helper.py +398 -0
- helm/benchmark/metrics/gpt4v_originality_critique_metrics.py +126 -0
- helm/benchmark/metrics/instruction_following_critique_metrics.py +1 -0
- helm/benchmark/metrics/live_qa_metrics.py +23 -0
- helm/benchmark/metrics/medication_qa_metrics.py +23 -0
- helm/benchmark/metrics/prometheus_vision_critique_metrics.py +185 -0
- helm/benchmark/metrics/reka_vibe_critique_metrics.py +158 -0
- helm/benchmark/metrics/unitxt_metrics.py +20 -10
- helm/benchmark/metrics/vision_language/emd_utils.py +4 -0
- helm/benchmark/metrics/vision_language/image_metrics.py +104 -21
- helm/benchmark/model_metadata_registry.py +5 -1
- helm/benchmark/presentation/schema.py +54 -4
- helm/benchmark/presentation/test_schema.py +11 -0
- helm/benchmark/run.py +16 -2
- helm/benchmark/run_expander.py +112 -63
- helm/benchmark/run_spec_factory.py +15 -10
- helm/benchmark/run_specs/air_bench_run_specs.py +40 -0
- helm/benchmark/run_specs/classic_run_specs.py +15 -11
- helm/benchmark/run_specs/decodingtrust_run_specs.py +3 -1
- helm/benchmark/run_specs/experimental_run_specs.py +33 -0
- helm/benchmark/run_specs/finance_run_specs.py +33 -0
- helm/benchmark/run_specs/vlm_run_specs.py +444 -65
- helm/benchmark/scenarios/air_bench_scenario.py +50 -0
- helm/benchmark/scenarios/ci_mcqa_scenario.py +80 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +8 -2
- helm/benchmark/scenarios/fin_qa_scenario.py +117 -0
- helm/benchmark/scenarios/legalbench_scenario.py +6 -2
- helm/benchmark/scenarios/math_scenario.py +1 -1
- helm/benchmark/scenarios/test_air_bench_scenario.py +27 -0
- helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +83 -0
- helm/benchmark/scenarios/vision_language/bingo_scenario.py +3 -3
- helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +134 -0
- helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +74 -0
- helm/benchmark/scenarios/vision_language/gqa_scenario.py +91 -0
- helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +4 -2
- helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +13 -2
- helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +1 -5
- helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +1 -5
- helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +5 -3
- helm/benchmark/scenarios/vision_language/math_vista_scenario.py +117 -0
- helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +103 -0
- helm/benchmark/scenarios/vision_language/mscoco_captioning_scenario.py +92 -0
- helm/benchmark/scenarios/vision_language/mscoco_categorization_scenario.py +117 -0
- helm/benchmark/scenarios/vision_language/originality_scenario.py +35 -0
- helm/benchmark/scenarios/vision_language/pairs_scenario.py +247 -0
- helm/benchmark/scenarios/vision_language/unicorn_scenario.py +3 -3
- helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +95 -0
- helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +2 -2
- helm/benchmark/scenarios/vision_language/vqa_scenario.py +4 -2
- helm/benchmark/static/schema_air_bench.yaml +3149 -0
- helm/benchmark/static/schema_classic.yaml +3 -59
- helm/benchmark/static/schema_finance.yaml +143 -0
- helm/benchmark/static/schema_image2structure.yaml +447 -0
- helm/benchmark/static/schema_instruction_following.yaml +3 -52
- helm/benchmark/static/schema_lite.yaml +3 -61
- helm/benchmark/static/schema_medical.yaml +255 -0
- helm/benchmark/static/schema_mmlu.yaml +3 -61
- helm/benchmark/static/schema_tables.yaml +200 -0
- helm/benchmark/static/schema_thai.yaml +223 -0
- helm/benchmark/static/schema_unitxt.yaml +3 -61
- helm/benchmark/static/schema_vhelm.yaml +824 -0
- helm/benchmark/static/schema_vhelm_lite.yaml +109 -0
- helm/benchmark/static_build/assets/air-overview-d2e6c49f.png +0 -0
- helm/benchmark/static_build/assets/index-30dbceba.js +10 -0
- helm/benchmark/static_build/assets/index-66b02d40.css +1 -0
- helm/benchmark/static_build/assets/overview-74aea3d8.png +0 -0
- helm/benchmark/static_build/assets/process-flow-bd2eba96.png +0 -0
- helm/benchmark/static_build/index.html +2 -2
- helm/clients/anthropic_client.py +78 -14
- helm/clients/auto_client.py +11 -0
- helm/clients/client.py +24 -7
- helm/clients/cohere_client.py +98 -3
- helm/clients/huggingface_client.py +71 -12
- helm/clients/openai_client.py +11 -5
- helm/clients/reka_client.py +189 -0
- helm/clients/test_client.py +3 -3
- helm/clients/test_huggingface_client.py +19 -3
- helm/clients/test_together_client.py +72 -2
- helm/clients/together_client.py +199 -2
- helm/clients/vertexai_client.py +117 -64
- helm/clients/vision_language/huggingface_vision2seq_client.py +145 -0
- helm/clients/vision_language/huggingface_vlm_client.py +12 -4
- helm/clients/vision_language/idefics_client.py +2 -2
- helm/clients/vision_language/paligemma_client.py +146 -0
- helm/clients/vision_language/palmyra_vision_client.py +84 -0
- helm/clients/yi_client.py +31 -0
- helm/common/critique_request.py +10 -1
- helm/common/images_utils.py +29 -3
- helm/config/model_deployments.yaml +504 -12
- helm/config/model_metadata.yaml +579 -52
- helm/config/tokenizer_configs.yaml +100 -1
- helm/proxy/critique/model_critique_client.py +32 -4
- helm/proxy/services/server_service.py +1 -1
- helm/tokenizers/auto_tokenizer.py +1 -1
- helm/tokenizers/cohere_tokenizer.py +44 -2
- helm/tokenizers/huggingface_tokenizer.py +36 -13
- helm/tokenizers/test_cohere_tokenizer.py +39 -0
- helm/tokenizers/test_huggingface_tokenizer.py +5 -1
- helm/benchmark/static/schema_vlm.yaml +0 -576
- helm/benchmark/static_build/assets/index-5088afcb.css +0 -1
- helm/benchmark/static_build/assets/index-d839df55.js +0 -9
- helm/benchmark/test_model_deployment_definition.py +0 -90
- {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/LICENSE +0 -0
- {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/WHEEL +0 -0
- {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/top_level.txt +0 -0
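
Beyond the Python modules, the largest configuration changes are to helm/config/model_metadata.yaml (shown in full below), helm/config/model_deployments.yaml (+504 lines), and helm/config/tokenizer_configs.yaml (+100 lines). The deployment entries themselves are not reproduced in this listing; as rough orientation, an entry in model_deployments.yaml pairs a model with a serving client and tokenizer, along these lines (a hedged sketch with illustrative values — the deployment name, tokenizer, sequence length, and client class here are assumptions, not copied from the package):

model_deployments:
  - name: together/llama-3-8b-chat            # illustrative deployment name
    model_name: meta/llama-3-8b-chat          # refers to an entry in model_metadata.yaml
    tokenizer_name: meta/llama-3-8b-chat      # refers to an entry in tokenizer_configs.yaml
    max_sequence_length: 8192
    client_spec:
      class_name: "helm.clients.together_client.TogetherClient"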
helm/config/model_metadata.yaml
CHANGED
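Every hunk below edits the top-level models: list in this file. Each list item follows the same schema, which can be read off the added entries; as a representative example, here is one entry added by this release (the BioMistral entry from the diff below, with orientation comments added):

models:
  - name: biomistral/biomistral-7b    # <creator organization>/<model id>
    display_name: BioMistral (7B)
    description: BioMistral 7B is an open-source LLM tailored for the biomedical domain, utilizing Mistral as its foundation model and further pre-trained on PubMed Central.
    creator_organization_name: BioMistral
    access: open                      # open or limited in the entries below
    num_parameters: 7300000000        # omitted for some API-only models
    release_date: 2024-02-15
    tags: [TEXT_MODEL_TAG, PARTIAL_FUNCTIONALITY_TEXT_MODEL_TAG]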
|
@@ -100,6 +100,25 @@ models:
|
|
|
100
100
|
# - j2-large -> j2-light
|
|
101
101
|
|
|
102
102
|
|
|
103
|
+
# AI Singapore
|
|
104
|
+
- name: aisingapore/sea-lion-7b
|
|
105
|
+
display_name: SEA-LION (7B)
|
|
106
|
+
description: SEA-LION is a collection of language models which has been pretrained and instruct-tuned on languages from the Southeast Asia region. It utilizes the MPT architecture and a custom SEABPETokenizer for tokenization.
|
|
107
|
+
creator_organization_name: AI Singapore
|
|
108
|
+
access: open
|
|
109
|
+
num_parameters: 7000000000
|
|
110
|
+
release_date: 2023-02-24
|
|
111
|
+
tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG]
|
|
112
|
+
|
|
113
|
+
- name: aisingapore/sea-lion-7b-instruct
|
|
114
|
+
display_name: SEA-LION Instruct (7B)
|
|
115
|
+
description: SEA-LION is a collection of language models which has been pretrained and instruct-tuned on languages from the Southeast Asia region. It utilizes the MPT architecture and a custom SEABPETokenizer for tokenization.
|
|
116
|
+
creator_organization_name: AI Singapore
|
|
117
|
+
access: open
|
|
118
|
+
num_parameters: 7000000000
|
|
119
|
+
release_date: 2023-02-24
|
|
120
|
+
tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG]
|
|
121
|
+
|
|
103
122
|
|
|
104
123
|
# Aleph Alpha
|
|
105
124
|
# Aleph Alpha's Luminous models: https://docs.aleph-alpha.com/docs/introduction/luminous
|
|
@@ -189,7 +208,7 @@ models:
|
|
|
189
208
|
|
|
190
209
|
# Anthropic
|
|
191
210
|
- name: anthropic/claude-v1.3
|
|
192
|
-
display_name:
|
|
211
|
+
display_name: Claude v1.3
|
|
193
212
|
description: A 52B parameter language model, trained using reinforcement learning from human feedback [paper](https://arxiv.org/pdf/2204.05862.pdf).
|
|
194
213
|
creator_organization_name: Anthropic
|
|
195
214
|
access: limited
|
|
@@ -198,7 +217,7 @@ models:
|
|
|
198
217
|
tags: [ANTHROPIC_CLAUDE_1_MODEL_TAG, TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, ABLATION_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
|
|
199
218
|
|
|
200
219
|
- name: anthropic/claude-instant-v1
|
|
201
|
-
display_name:
|
|
220
|
+
display_name: Claude Instant V1
|
|
202
221
|
description: A lightweight version of Claude, a model trained using reinforcement learning from human feedback ([docs](https://www.anthropic.com/index/introducing-claude)).
|
|
203
222
|
creator_organization_name: Anthropic
|
|
204
223
|
access: limited
|
|
@@ -206,7 +225,7 @@ models:
|
|
|
206
225
|
tags: [ANTHROPIC_CLAUDE_1_MODEL_TAG, TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, ABLATION_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
|
|
207
226
|
|
|
208
227
|
- name: anthropic/claude-instant-1.2
|
|
209
|
-
display_name:
|
|
228
|
+
display_name: Claude Instant 1.2
|
|
210
229
|
description: A lightweight version of Claude, a model trained using reinforcement learning from human feedback ([docs](https://www.anthropic.com/index/introducing-claude)).
|
|
211
230
|
creator_organization_name: Anthropic
|
|
212
231
|
access: limited
|
|
@@ -214,7 +233,7 @@ models:
|
|
|
214
233
|
tags: [ANTHROPIC_CLAUDE_1_MODEL_TAG, TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, ABLATION_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
|
|
215
234
|
|
|
216
235
|
- name: anthropic/claude-2.0
|
|
217
|
-
display_name:
|
|
236
|
+
display_name: Claude 2.0
|
|
218
237
|
description: Claude 2.0 is a general purpose large language model developed by Anthropic. It uses a transformer architecture and is trained via unsupervised learning, RLHF, and Constitutional AI (including both a supervised and Reinforcement Learning (RL) phase). ([model card](https://efficient-manatee.files.svdcdn.com/production/images/Model-Card-Claude-2.pdf))
|
|
219
238
|
creator_organization_name: Anthropic
|
|
220
239
|
access: limited
|
|
@@ -222,7 +241,7 @@ models:
|
|
|
222
241
|
tags: [ANTHROPIC_CLAUDE_2_MODEL_TAG, TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, ABLATION_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
|
|
223
242
|
|
|
224
243
|
- name: anthropic/claude-2.1
|
|
225
|
-
display_name:
|
|
244
|
+
display_name: Claude 2.1
|
|
226
245
|
description: Claude 2.1 is a general purpose large language model developed by Anthropic. It uses a transformer architecture and is trained via unsupervised learning, RLHF, and Constitutional AI (including both a supervised and Reinforcement Learning (RL) phase). ([model card](https://efficient-manatee.files.svdcdn.com/production/images/Model-Card-Claude-2.pdf))
|
|
227
246
|
creator_organization_name: Anthropic
|
|
228
247
|
access: limited
|
|
@@ -231,7 +250,7 @@ models:
|
|
|
231
250
|
|
|
232
251
|
- name: anthropic/claude-3-haiku-20240307
|
|
233
252
|
display_name: Claude 3 Haiku (20240307)
|
|
234
|
-
description: Claude 3 is a a family of models that possess vision and multilingual capabilities. They were trained with various methods such as unsupervised learning and Constitutional AI.
|
|
253
|
+
description: Claude 3 is a a family of models that possess vision and multilingual capabilities. They were trained with various methods such as unsupervised learning and Constitutional AI ([blog](https://www.anthropic.com/news/claude-3-family)).
|
|
235
254
|
creator_organization_name: Anthropic
|
|
236
255
|
access: limited
|
|
237
256
|
release_date: 2024-03-13 # https://www.anthropic.com/news/claude-3-haiku
|
|
@@ -239,7 +258,7 @@ models:
|
|
|
239
258
|
|
|
240
259
|
- name: anthropic/claude-3-sonnet-20240229
|
|
241
260
|
display_name: Claude 3 Sonnet (20240229)
|
|
242
|
-
description: Claude 3 is a a family of models that possess vision and multilingual capabilities. They were trained with various methods such as unsupervised learning and Constitutional AI.
|
|
261
|
+
description: Claude 3 is a a family of models that possess vision and multilingual capabilities. They were trained with various methods such as unsupervised learning and Constitutional AI ([blog](https://www.anthropic.com/news/claude-3-family)).
|
|
243
262
|
creator_organization_name: Anthropic
|
|
244
263
|
access: limited
|
|
245
264
|
release_date: 2024-03-04 # https://www.anthropic.com/news/claude-3-family
|
|
@@ -247,9 +266,9 @@ models:
|
|
|
247
266
|
|
|
248
267
|
- name: anthropic/claude-3-opus-20240229
|
|
249
268
|
display_name: Claude 3 Opus (20240229)
|
|
250
|
-
description: Claude 3 is a a family of models that possess vision and multilingual capabilities. They were trained with various methods such as unsupervised learning and Constitutional AI.
|
|
251
|
-
creator_organization_name: Anthropic
|
|
269
|
+
description: Claude 3 is a a family of models that possess vision and multilingual capabilities. They were trained with various methods such as unsupervised learning and Constitutional AI ([blog](https://www.anthropic.com/news/claude-3-family)).
|
|
252
270
|
access: limited
|
|
271
|
+
creator_organization_name: Anthropic
|
|
253
272
|
release_date: 2024-03-04 # https://www.anthropic.com/news/claude-3-family
|
|
254
273
|
tags: [ANTHROPIC_CLAUDE_3_MODEL_TAG, TEXT_MODEL_TAG, VISION_LANGUAGE_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
|
|
255
274
|
|
|
@@ -327,6 +346,18 @@ models:
|
|
|
327
346
|
release_date: 2023-05-09 # ArXiv submission date
|
|
328
347
|
tags: [CODE_MODEL_TAG]
|
|
329
348
|
|
|
349
|
+
# BioMistral
|
|
350
|
+
|
|
351
|
+
- name: biomistral/biomistral-7b
|
|
352
|
+
display_name: BioMistral (7B)
|
|
353
|
+
description: BioMistral 7B is an open-source LLM tailored for the biomedical domain, utilizing Mistral as its foundation model and further pre-trained on PubMed Central.
|
|
354
|
+
creator_organization_name: BioMistral
|
|
355
|
+
access: open
|
|
356
|
+
num_parameters: 7300000000
|
|
357
|
+
release_date: 2024-02-15
|
|
358
|
+
tags: [TEXT_MODEL_TAG, PARTIAL_FUNCTIONALITY_TEXT_MODEL_TAG]
|
|
359
|
+
|
|
360
|
+
|
|
330
361
|
|
|
331
362
|
|
|
332
363
|
# Cerebras Systems
|
|
@@ -418,7 +449,7 @@ models:
|
|
|
418
449
|
tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG]
|
|
419
450
|
|
|
420
451
|
- name: cohere/command-medium-beta # DEPRECATED
|
|
421
|
-
display_name:
|
|
452
|
+
display_name: Command beta (6.1B)
|
|
422
453
|
description: Cohere Command beta (6.1B parameters) is fine-tuned from the medium model to respond well with instruction-like prompts ([details](https://docs.cohere.ai/docs/command-beta)).
|
|
423
454
|
creator_organization_name: Cohere
|
|
424
455
|
access: limited
|
|
@@ -427,7 +458,7 @@ models:
|
|
|
427
458
|
tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
|
|
428
459
|
|
|
429
460
|
- name: cohere/command-xlarge-beta # DEPRECATED
|
|
430
|
-
display_name:
|
|
461
|
+
display_name: Command beta (52.4B)
|
|
431
462
|
description: Cohere Command beta (52.4B parameters) is fine-tuned from the XL model to respond well with instruction-like prompts ([details](https://docs.cohere.ai/docs/command-beta)).
|
|
432
463
|
creator_organization_name: Cohere
|
|
433
464
|
access: limited
|
|
@@ -436,7 +467,7 @@ models:
|
|
|
436
467
|
tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
|
|
437
468
|
|
|
438
469
|
- name: cohere/command
|
|
439
|
-
display_name:
|
|
470
|
+
display_name: Command
|
|
440
471
|
description: Command is Cohere’s flagship text generation model. It is trained to follow user commands and to be instantly useful in practical business applications. [docs](https://docs.cohere.com/reference/generate) and [changelog](https://docs.cohere.com/changelog)
|
|
441
472
|
creator_organization_name: Cohere
|
|
442
473
|
access: limited
|
|
@@ -444,12 +475,30 @@ models:
|
|
|
444
475
|
tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
|
|
445
476
|
|
|
446
477
|
- name: cohere/command-light
|
|
447
|
-
display_name:
|
|
478
|
+
display_name: Command Light
|
|
448
479
|
description: Command is Cohere’s flagship text generation model. It is trained to follow user commands and to be instantly useful in practical business applications. [docs](https://docs.cohere.com/reference/generate) and [changelog](https://docs.cohere.com/changelog)
|
|
449
480
|
creator_organization_name: Cohere
|
|
450
481
|
access: limited
|
|
451
482
|
release_date: 2023-09-29
|
|
452
|
-
tags: [TEXT_MODEL_TAG,
|
|
483
|
+
tags: [TEXT_MODEL_TAG, PARTIAL_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
|
|
484
|
+
|
|
485
|
+
- name: cohere/command-r
|
|
486
|
+
display_name: Command R
|
|
487
|
+
description: Command R is a multilingual 35B parameter model with a context length of 128K that has been trained with conversational tool use capabilities.
|
|
488
|
+
creator_organization_name: Cohere
|
|
489
|
+
access: open
|
|
490
|
+
num_parameters: 35000000000
|
|
491
|
+
release_date: 2024-03-11
|
|
492
|
+
tags: [TEXT_MODEL_TAG, PARTIAL_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
|
|
493
|
+
|
|
494
|
+
- name: cohere/command-r-plus
|
|
495
|
+
display_name: Command R Plus
|
|
496
|
+
description: Command R+ is a multilingual 104B parameter model with a context length of 128K that has been trained with conversational tool use capabilities.
|
|
497
|
+
creator_organization_name: Cohere
|
|
498
|
+
access: open
|
|
499
|
+
num_parameters: 104000000000
|
|
500
|
+
release_date: 2024-04-04
|
|
501
|
+
tags: [TEXT_MODEL_TAG, PARTIAL_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
|
|
453
502
|
|
|
454
503
|
# Craiyon
|
|
455
504
|
- name: craiyon/dalle-mini
|
|
@@ -534,7 +583,7 @@ models:
|
|
|
534
583
|
access: open
|
|
535
584
|
num_parameters: 132000000000
|
|
536
585
|
release_date: 2024-03-27
|
|
537
|
-
tags: [TEXT_MODEL_TAG, PARTIAL_FUNCTIONALITY_TEXT_MODEL_TAG]
|
|
586
|
+
tags: [TEXT_MODEL_TAG, PARTIAL_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
|
|
538
587
|
|
|
539
588
|
|
|
540
589
|
# DeepMind
|
|
@@ -559,8 +608,8 @@ models:
|
|
|
559
608
|
|
|
560
609
|
# Deepseek
|
|
561
610
|
- name: deepseek-ai/deepseek-llm-67b-chat
|
|
562
|
-
display_name: DeepSeek Chat (67B)
|
|
563
|
-
description: DeepSeek Chat is a open-source language model trained on 2 trillion tokens in both English and Chinese, and fine-tuned supervised fine-tuning (SFT) and Direct Preference Optimization (DPO). ([paper](https://arxiv.org/abs/2401.02954))
|
|
611
|
+
display_name: DeepSeek LLM Chat (67B)
|
|
612
|
+
description: DeepSeek LLM Chat is a open-source language model trained on 2 trillion tokens in both English and Chinese, and fine-tuned supervised fine-tuning (SFT) and Direct Preference Optimization (DPO). ([paper](https://arxiv.org/abs/2401.02954))
|
|
564
613
|
creator_organization_name: DeepSeek
|
|
565
614
|
access: open
|
|
566
615
|
num_parameters: 67000000000
|
|
@@ -624,7 +673,16 @@ models:
|
|
|
624
673
|
release_date: 2023-02-13
|
|
625
674
|
tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG]
|
|
626
675
|
|
|
676
|
+
# EPFL LLM
|
|
627
677
|
|
|
678
|
+
- name: epfl-llm/meditron-7b
|
|
679
|
+
display_name: Meditron (7B)
|
|
680
|
+
description: Meditron-7B is a 7 billion parameter model adapted to the medical domain from Llama-2-7B through continued pretraining on a comprehensively curated medical corpus.
|
|
681
|
+
creator_organization_name: EPFL LLM
|
|
682
|
+
access: open
|
|
683
|
+
num_parameters: 7000000000
|
|
684
|
+
release_date: 2023-11-27
|
|
685
|
+
tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG]
|
|
628
686
|
|
|
629
687
|
# Google
|
|
630
688
|
- name: google/t5-11b
|
|
@@ -670,15 +728,23 @@ models:
|
|
|
670
728
|
creator_organization_name: Google
|
|
671
729
|
access: limited
|
|
672
730
|
release_date: 2023-12-13
|
|
673
|
-
tags: [TEXT_MODEL_TAG, GOOGLE_GEMINI_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG]
|
|
731
|
+
tags: [TEXT_MODEL_TAG, GOOGLE_GEMINI_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
|
|
674
732
|
|
|
675
733
|
- name: google/gemini-1.0-pro-001
|
|
676
|
-
display_name: Gemini 1.0 Pro
|
|
734
|
+
display_name: Gemini 1.0 Pro (001)
|
|
677
735
|
description: Gemini 1.0 Pro is a multimodal model able to reason across text, images, video, audio and code. ([paper](https://arxiv.org/abs/2312.11805))
|
|
678
736
|
creator_organization_name: Google
|
|
679
737
|
access: limited
|
|
680
738
|
release_date: 2023-12-13
|
|
681
|
-
tags: [TEXT_MODEL_TAG, GOOGLE_GEMINI_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG]
|
|
739
|
+
tags: [TEXT_MODEL_TAG, GOOGLE_GEMINI_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
|
|
740
|
+
|
|
741
|
+
- name: google/gemini-1.0-pro-002
|
|
742
|
+
display_name: Gemini 1.0 Pro (002)
|
|
743
|
+
description: Gemini 1.0 Pro is a multimodal model able to reason across text, images, video, audio and code. ([paper](https://arxiv.org/abs/2312.11805))
|
|
744
|
+
creator_organization_name: Google
|
|
745
|
+
access: limited
|
|
746
|
+
release_date: 2024-04-09
|
|
747
|
+
tags: [TEXT_MODEL_TAG, GOOGLE_GEMINI_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
|
|
682
748
|
|
|
683
749
|
# Note: This is aliased to a snapshot of gemini-pro-vision. When possible, please use a versioned snapshot instead.
|
|
684
750
|
- name: google/gemini-pro-vision
|
|
@@ -695,15 +761,79 @@ models:
|
|
|
695
761
|
creator_organization_name: Google
|
|
696
762
|
access: limited
|
|
697
763
|
release_date: 2023-12-13
|
|
698
|
-
tags: [VISION_LANGUAGE_MODEL_TAG, GOOGLE_GEMINI_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG]
|
|
764
|
+
tags: [VISION_LANGUAGE_MODEL_TAG, GOOGLE_GEMINI_MODEL_TAG, GOOGLE_GEMINI_PRO_VISION_V1_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG]
|
|
765
|
+
|
|
766
|
+
- name: google/gemini-1.5-pro-001
|
|
767
|
+
display_name: Gemini 1.5 Pro (001)
|
|
768
|
+
description: Gemini 1.5 Pro is a multimodal mixture-of-experts model capable of recalling and reasoning over fine-grained information from long contexts. This model is accessed through Vertex AI and has all safety thresholds set to `BLOCK_NONE`. ([paper](https://arxiv.org/abs/2403.05530))
|
|
769
|
+
creator_organization_name: Google
|
|
770
|
+
access: limited
|
|
771
|
+
release_date: 2024-05-24
|
|
772
|
+
tags: [TEXT_MODEL_TAG, VISION_LANGUAGE_MODEL_TAG, GOOGLE_GEMINI_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
|
|
773
|
+
|
|
774
|
+
- name: google/gemini-1.5-flash-001
|
|
775
|
+
display_name: Gemini 1.5 Flash (001)
|
|
776
|
+
description: Gemini 1.5 Flash is a multimodal mixture-of-experts model capable of recalling and reasoning over fine-grained information from long contexts. This model is accessed through Vertex AI and has all safety thresholds set to `BLOCK_NONE`. ([paper](https://arxiv.org/abs/2403.05530))
|
|
777
|
+
creator_organization_name: Google
|
|
778
|
+
access: limited
|
|
779
|
+
release_date: 2024-05-24
|
|
780
|
+
tags: [TEXT_MODEL_TAG, VISION_LANGUAGE_MODEL_TAG, GOOGLE_GEMINI_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
|
|
699
781
|
|
|
700
782
|
- name: google/gemini-1.5-pro-preview-0409
|
|
701
|
-
display_name: Gemini 1.5 Pro
|
|
702
|
-
description: Gemini 1.5 Pro is a multimodal mixture-of-experts model capable of recalling and reasoning over fine-grained information from long contexts. ([paper](https://arxiv.org/abs/2403.05530))
|
|
783
|
+
display_name: Gemini 1.5 Pro (0409 preview)
|
|
784
|
+
description: Gemini 1.5 Pro is a multimodal mixture-of-experts model capable of recalling and reasoning over fine-grained information from long contexts. This model is accessed through Vertex AI and has all safety thresholds set to `BLOCK_NONE`. ([paper](https://arxiv.org/abs/2403.05530))
|
|
703
785
|
creator_organization_name: Google
|
|
704
786
|
access: limited
|
|
705
787
|
release_date: 2024-04-10
|
|
706
|
-
tags: [TEXT_MODEL_TAG, GOOGLE_GEMINI_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG]
|
|
788
|
+
tags: [TEXT_MODEL_TAG, VISION_LANGUAGE_MODEL_TAG, GOOGLE_GEMINI_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
|
|
789
|
+
|
|
790
|
+
- name: google/gemini-1.5-pro-preview-0514
|
|
791
|
+
display_name: Gemini 1.5 Pro (0514 preview)
|
|
792
|
+
description: Gemini 1.5 Pro is a multimodal mixture-of-experts model capable of recalling and reasoning over fine-grained information from long contexts. This model is accessed through Vertex AI and has all safety thresholds set to `BLOCK_NONE`. ([paper](https://arxiv.org/abs/2403.05530))
|
|
793
|
+
creator_organization_name: Google
|
|
794
|
+
access: limited
|
|
795
|
+
release_date: 2024-05-14
|
|
796
|
+
tags: [TEXT_MODEL_TAG, VISION_LANGUAGE_MODEL_TAG, GOOGLE_GEMINI_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
|
|
797
|
+
|
|
798
|
+
- name: google/gemini-1.5-flash-preview-0514
|
|
799
|
+
display_name: Gemini 1.5 Flash (0514 preview)
|
|
800
|
+
description: Gemini 1.5 Flash is a smaller Gemini model. It has a 1 million token context window and allows interleaving text, images, audio and video as inputs. This model is accessed through Vertex AI and has all safety thresholds set to `BLOCK_NONE`. ([blog](https://blog.google/technology/developers/gemini-gemma-developer-updates-may-2024/))
|
|
801
|
+
creator_organization_name: Google
|
|
802
|
+
access: limited
|
|
803
|
+
release_date: 2024-05-14
|
|
804
|
+
tags: [TEXT_MODEL_TAG, VISION_LANGUAGE_MODEL_TAG, GOOGLE_GEMINI_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
|
|
805
|
+
|
|
806
|
+
- name: google/gemini-1.5-pro-001-safety-default
|
|
807
|
+
display_name: Gemini 1.5 Pro (001, default safety)
|
|
808
|
+
description: Gemini 1.5 Pro is a multimodal mixture-of-experts model capable of recalling and reasoning over fine-grained information from long contexts. This model is accessed through Vertex AI and uses default safety settings. ([paper](https://arxiv.org/abs/2403.05530))
|
|
809
|
+
creator_organization_name: Google
|
|
810
|
+
access: limited
|
|
811
|
+
release_date: 2024-05-24
|
|
812
|
+
tags: [TEXT_MODEL_TAG, VISION_LANGUAGE_MODEL_TAG, GOOGLE_GEMINI_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
|
|
813
|
+
|
|
814
|
+
- name: google/gemini-1.5-pro-001-safety-block-none
|
|
815
|
+
display_name: Gemini 1.5 Pro (001, BLOCK_NONE safety)
|
|
816
|
+
description: Gemini 1.5 Pro is a multimodal mixture-of-experts model capable of recalling and reasoning over fine-grained information from long contexts. This model is accessed through Vertex AI and has all safety thresholds set to `BLOCK_NONE`. ([paper](https://arxiv.org/abs/2403.05530))
|
|
817
|
+
creator_organization_name: Google
|
|
818
|
+
access: limited
|
|
819
|
+
release_date: 2024-05-24
|
|
820
|
+
tags: [TEXT_MODEL_TAG, VISION_LANGUAGE_MODEL_TAG, GOOGLE_GEMINI_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
|
|
821
|
+
|
|
822
|
+
- name: google/gemini-1.5-flash-001-safety-default
|
|
823
|
+
display_name: Gemini 1.5 Flash (001, default safety)
|
|
824
|
+
description: Gemini 1.5 Flash is a multimodal mixture-of-experts model capable of recalling and reasoning over fine-grained information from long contexts. This model is accessed through Vertex AI and uses default safety settings. ([paper](https://arxiv.org/abs/2403.05530))
|
|
825
|
+
creator_organization_name: Google
|
|
826
|
+
access: limited
|
|
827
|
+
release_date: 2024-05-24
|
|
828
|
+
tags: [TEXT_MODEL_TAG, VISION_LANGUAGE_MODEL_TAG, GOOGLE_GEMINI_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
|
|
829
|
+
|
|
830
|
+
- name: google/gemini-1.5-flash-001-safety-block-none
|
|
831
|
+
display_name: Gemini 1.5 Flash (001, BLOCK_NONE safety)
|
|
832
|
+
description: Gemini 1.5 Flash is a multimodal mixture-of-experts model capable of recalling and reasoning over fine-grained information from long contexts. This model is accessed through Vertex AI and has all safety thresholds set to `BLOCK_NONE`. ([paper](https://arxiv.org/abs/2403.05530))
|
|
833
|
+
creator_organization_name: Google
|
|
834
|
+
access: limited
|
|
835
|
+
release_date: 2024-05-24
|
|
836
|
+
tags: [TEXT_MODEL_TAG, VISION_LANGUAGE_MODEL_TAG, GOOGLE_GEMINI_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
|
|
707
837
|
|
|
708
838
|
- name: google/gemma-2b
|
|
709
839
|
display_name: Gemma (2B)
|
|
@@ -742,6 +872,22 @@ models:
|
|
|
742
872
|
# TODO: Add OUTPUT_FORMAT_INSTRUCTIONS_TAG tag
|
|
743
873
|
tags: [TEXT_MODEL_TAG, GOOGLE_GEMMA_INSTRUCT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
|
|
744
874
|
|
|
875
|
+
- name: google/paligemma-3b-mix-224
|
|
876
|
+
display_name: PaliGemma (3B) Mix 224
|
|
877
|
+
description: PaliGemma is a versatile and lightweight vision-language model (VLM) inspired by PaLI-3 and based on open components such as the SigLIP vision model and the Gemma language model. Pre-trained with 224x224 input images and 128 token input/output text sequences. Finetuned on a mixture of downstream academic datasets. ([blog](https://developers.googleblog.com/en/gemma-family-and-toolkit-expansion-io-2024/))
|
|
878
|
+
creator_organization_name: Google
|
|
879
|
+
access: open
|
|
880
|
+
release_date: 2024-05-12
|
|
881
|
+
tags: [VISION_LANGUAGE_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG]
|
|
882
|
+
|
|
883
|
+
- name: google/paligemma-3b-mix-448
|
|
884
|
+
display_name: PaliGemma (3B) Mix 448
|
|
885
|
+
description: PaliGemma is a versatile and lightweight vision-language model (VLM) inspired by PaLI-3 and based on open components such as the SigLIP vision model and the Gemma language model. Pre-trained with 448x448 input images and 512 token input/output text sequences. Finetuned on a mixture of downstream academic datasets. ([blog](https://developers.googleblog.com/en/gemma-family-and-toolkit-expansion-io-2024/))
|
|
886
|
+
creator_organization_name: Google
|
|
887
|
+
access: open
|
|
888
|
+
release_date: 2024-05-12
|
|
889
|
+
tags: [VISION_LANGUAGE_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG]
|
|
890
|
+
|
|
745
891
|
- name: google/text-bison@001
|
|
746
892
|
display_name: PaLM-2 (Bison)
|
|
747
893
|
description: The best value PaLM model. PaLM 2 (Pathways Language Model) is a Transformer-based model trained using a mixture of objectives that was evaluated on English and multilingual language, and reasoning tasks. ([report](https://arxiv.org/pdf/2305.10403.pdf))
|
|
@@ -798,12 +944,35 @@ models:
|
|
|
798
944
|
release_date: 2023-06-29 # Source: https://cloud.google.com/vertex-ai/docs/generative-ai/model-reference/code-generation#model_versions
|
|
799
945
|
tags: [CODE_MODEL_TAG]
|
|
800
946
|
|
|
947
|
+
- name: google/medlm-medium
|
|
948
|
+
display_name: MedLM (Medium)
|
|
949
|
+
description: MedLM is a family of foundation models fine-tuned for the healthcare industry based on Google Research's medically-tuned large language model, Med-PaLM 2. ([documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/medlm/overview))
|
|
950
|
+
creator_organization_name: Google
|
|
951
|
+
access: limited
|
|
952
|
+
release_date: 2023-12-13
|
|
953
|
+
tags: [TEXT_MODEL_TAG, PARTIAL_FUNCTIONALITY_TEXT_MODEL_TAG]
|
|
801
954
|
|
|
955
|
+
- name: google/medlm-large
|
|
956
|
+
display_name: MedLM (Large)
|
|
957
|
+
description: MedLM is a family of foundation models fine-tuned for the healthcare industry based on Google Research's medically-tuned large language model, Med-PaLM 2. ([documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/medlm/overview))
|
|
958
|
+
creator_organization_name: Google
|
|
959
|
+
access: limited
|
|
960
|
+
release_date: 2023-12-13
|
|
961
|
+
tags: [TEXT_MODEL_TAG, PARTIAL_FUNCTIONALITY_TEXT_MODEL_TAG]
|
|
802
962
|
|
|
803
963
|
# HuggingFace
|
|
964
|
+
- name: HuggingFaceM4/idefics2-8b
|
|
965
|
+
display_name: IDEFICS 2 (8B)
|
|
966
|
+
description: IDEFICS 2 (8B parameters) is an open multimodal model that accepts arbitrary sequences of image and text inputs and produces text outputs. ([blog](https://huggingface.co/blog/idefics2)).
|
|
967
|
+
creator_organization_name: HuggingFace
|
|
968
|
+
access: open
|
|
969
|
+
num_parameters: 8000000000
|
|
970
|
+
release_date: 2024-04-15
|
|
971
|
+
tags: [VISION_LANGUAGE_MODEL_TAG, IDEFICS_MODEL_TAG, FULL_FUNCTIONALITY_VLM_TAG]
|
|
972
|
+
|
|
804
973
|
- name: HuggingFaceM4/idefics-9b
|
|
805
974
|
display_name: IDEFICS (9B)
|
|
806
|
-
description: IDEFICS (9B parameters) is an open-source model based on DeepMind's Flamingo
|
|
975
|
+
description: IDEFICS (9B parameters) is an open-source model based on DeepMind's Flamingo ([blog](https://huggingface.co/blog/idefics)).
|
|
807
976
|
creator_organization_name: HuggingFace
|
|
808
977
|
access: open
|
|
809
978
|
num_parameters: 9000000000
|
|
@@ -811,8 +980,8 @@ models:
|
|
|
811
980
|
tags: [VISION_LANGUAGE_MODEL_TAG, IDEFICS_MODEL_TAG, FULL_FUNCTIONALITY_VLM_TAG]
|
|
812
981
|
|
|
813
982
|
- name: HuggingFaceM4/idefics-9b-instruct
|
|
814
|
-
display_name: IDEFICS
|
|
815
|
-
description: IDEFICS
|
|
983
|
+
display_name: IDEFICS-instruct (9B)
|
|
984
|
+
description: IDEFICS-instruct (9B parameters) is the instruction-tuned version of IDEFICS 9B ([blog](https://huggingface.co/blog/idefics)).
|
|
816
985
|
creator_organization_name: HuggingFace
|
|
817
986
|
access: open
|
|
818
987
|
num_parameters: 9000000000
|
|
@@ -821,7 +990,7 @@ models:
|
|
|
821
990
|
|
|
822
991
|
- name: HuggingFaceM4/idefics-80b
|
|
823
992
|
display_name: IDEFICS (80B)
|
|
824
|
-
description: IDEFICS (80B parameters) is an open-source model based on DeepMind's Flamingo
|
|
993
|
+
description: IDEFICS (80B parameters) is an open-source model based on DeepMind's Flamingo ([blog](https://huggingface.co/blog/idefics)).
|
|
825
994
|
creator_organization_name: HuggingFace
|
|
826
995
|
access: open
|
|
827
996
|
num_parameters: 80000000000
|
|
@@ -829,8 +998,8 @@ models:
|
|
|
829
998
|
tags: [VISION_LANGUAGE_MODEL_TAG, IDEFICS_MODEL_TAG, FULL_FUNCTIONALITY_VLM_TAG]
|
|
830
999
|
|
|
831
1000
|
- name: HuggingFaceM4/idefics-80b-instruct
|
|
832
|
-
display_name: IDEFICS
|
|
833
|
-
description: IDEFICS
|
|
1001
|
+
display_name: IDEFICS-instruct (80B)
|
|
1002
|
+
description: IDEFICS-instruct (80B parameters) is the instruction-tuned version of IDEFICS 80B ([blog](https://huggingface.co/blog/idefics)).
|
|
834
1003
|
creator_organization_name: HuggingFace
|
|
835
1004
|
access: open
|
|
836
1005
|
num_parameters: 80000000000
|
|
@@ -1050,8 +1219,6 @@ models:
|
|
|
1050
1219
|
release_date: 2023-06-22
|
|
1051
1220
|
tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
|
|
1052
1221
|
|
|
1053
|
-
|
|
1054
|
-
|
|
1055
1222
|
# Meta
|
|
1056
1223
|
- name: meta/opt-iml-175b # NOT SUPPORTED
|
|
1057
1224
|
display_name: OPT-IML (175B)
|
|
@@ -1210,6 +1377,44 @@ models:
|
|
|
1210
1377
|
release_date: 2024-04-18
|
|
1211
1378
|
tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG]
|
|
1212
1379
|
|
|
1380
|
+
- name: meta/llama-3-8b-chat
|
|
1381
|
+
display_name: Llama 3 Instruct (8B)
|
|
1382
|
+
description: Llama 3 is a family of language models that have been trained on more than 15 trillion tokens, and use Grouped-Query Attention (GQA) for improved inference scalability. It used SFT, rejection sampling, PPO and DPO for post-training.
|
|
1383
|
+
creator_organization_name: Meta
|
|
1384
|
+
access: open
|
|
1385
|
+
num_parameters: 8000000000
|
|
1386
|
+
release_date: 2024-04-18
|
|
1387
|
+
tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
|
|
1388
|
+
|
|
1389
|
+
- name: meta/llama-3-70b-chat
|
|
1390
|
+
display_name: Llama 3 Instruct (70B)
|
|
1391
|
+
description: Llama 3 is a family of language models that have been trained on more than 15 trillion tokens, and use Grouped-Query Attention (GQA) for improved inference scalability. It used SFT, rejection sampling, PPO and DPO for post-training.
|
|
1392
|
+
creator_organization_name: Meta
|
|
1393
|
+
access: open
|
|
1394
|
+
num_parameters: 70000000000
|
|
1395
|
+
release_date: 2024-04-18
|
|
1396
|
+
tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
|
|
1397
|
+
|
|
1398
|
+
- name: meta/llama-guard-7b
|
|
1399
|
+
display_name: Llama Guard (7B)
|
|
1400
|
+
description: Llama-Guard is a 7B parameter Llama 2-based input-output safeguard model. It can be used for classifying content in both LLM inputs (prompt classification) and in LLM responses (response classification). It acts as an LLM it generates text in its output that indicates whether a given prompt or response is safe/unsafe, and if unsafe based on a policy, it also lists the violating subcategories.
|
|
1401
|
+
creator_organization_name: Meta
|
|
1402
|
+
access: open
|
|
1403
|
+
num_parameters: 7000000000
|
|
1404
|
+
release_date: 2023-12-07
|
|
1405
|
+
tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
|
|
1406
|
+
|
|
1407
|
+
- name: meta/llama-guard-2-8b
|
|
1408
|
+
display_name: Llama Guard 2 (8B)
|
|
1409
|
+
description: Llama Guard 2 is an 8B parameter Llama 3-based LLM safeguard model. Similar to Llama Guard, it can be used for classifying content in both LLM inputs (prompt classification) and in LLM responses (response classification). It acts as an LLM – it generates text in its output that indicates whether a given prompt or response is safe or unsafe, and if unsafe, it also lists the content categories violated.
|
|
1410
|
+
creator_organization_name: Meta
|
|
1411
|
+
access: open
|
|
1412
|
+
num_parameters: 8000000000
|
|
1413
|
+
release_date: 2024-04-18
|
|
1414
|
+
tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
|
|
1415
|
+
|
|
1416
|
+
|
|
1417
|
+
|
|
1213
1418
|
|
|
1214
1419
|
# Microsoft/NVIDIA
|
|
1215
1420
|
- name: microsoft/TNLGv2_530B
|
|
@@ -1247,11 +1452,46 @@ models:
|
|
|
1247
1452
|
num_parameters: 13000000000
|
|
1248
1453
|
release_date: 2023-10-05
|
|
1249
1454
|
tags: [VISION_LANGUAGE_MODEL_TAG, LLAVA_MODEL_TAG, LIMITED_FUNCTIONALITY_VLM_TAG]
|
|
1250
|
-
|
|
1455
|
+
|
|
1456
|
+
- name: uw-madison/llava-v1.6-vicuna-7b-hf
|
|
1457
|
+
display_name: LLaVA 1.6 (7B)
|
|
1458
|
+
description: LLaVa is an open-source chatbot trained by fine-tuning LlamA/Vicuna on GPT-generated multimodal instruction-following data. ([paper](https://arxiv.org/abs/2304.08485))
|
|
1459
|
+
creator_organization_name: Microsoft
|
|
1460
|
+
access: open
|
|
1461
|
+
num_parameters: 7000000000
|
|
1462
|
+
release_date: 2024-01-01
|
|
1463
|
+
tags: [VISION_LANGUAGE_MODEL_TAG, LLAVA_MODEL_TAG, LIMITED_FUNCTIONALITY_VLM_TAG]
|
|
1464
|
+
|
|
1465
|
+
- name: uw-madison/llava-v1.6-vicuna-13b-hf
|
|
1466
|
+
display_name: LLaVA 1.6 (13B)
|
|
1467
|
+
description: LLaVa is an open-source chatbot trained by fine-tuning LlamA/Vicuna on GPT-generated multimodal instruction-following data. ([paper](https://arxiv.org/abs/2304.08485))
|
|
1468
|
+
creator_organization_name: Microsoft
|
|
1469
|
+
access: open
|
|
1470
|
+
num_parameters: 13000000000
|
|
1471
|
+
release_date: 2024-01-01
|
|
1472
|
+
tags: [VISION_LANGUAGE_MODEL_TAG, LLAVA_MODEL_TAG, LIMITED_FUNCTIONALITY_VLM_TAG]
|
|
1473
|
+
|
|
1474
|
+
- name: uw-madison/llava-v1.6-mistral-7b-hf
|
|
1475
|
+
display_name: LLaVA 1.6 + Mistral (7B)
|
|
1476
|
+
description: LLaVa is an open-source chatbot trained by fine-tuning LlamA/Vicuna on GPT-generated multimodal instruction-following data. ([paper](https://arxiv.org/abs/2304.08485))
|
|
1477
|
+
creator_organization_name: Microsoft
|
|
1478
|
+
access: open
|
|
1479
|
+
num_parameters: 7000000000
|
|
1480
|
+
release_date: 2024-01-01
|
|
1481
|
+
tags: [ VISION_LANGUAGE_MODEL_TAG, LLAVA_MODEL_TAG, LIMITED_FUNCTIONALITY_VLM_TAG ]
|
|
1482
|
+
|
|
1483
|
+
- name: uw-madison/llava-v1.6-34b-hf
|
|
1484
|
+
display_name: LLaVA + Nous-Hermes-2-Yi-34B (34B)
|
|
1485
|
+
description: LLaVa is an open-source chatbot trained by fine-tuning LlamA/Vicuna on GPT-generated multimodal instruction-following data. ([paper](https://arxiv.org/abs/2304.08485))
|
|
1486
|
+
creator_organization_name: Microsoft
|
|
1487
|
+
access: open
|
|
1488
|
+
num_parameters: 34000000000
|
|
1489
|
+
release_date: 2024-01-01
|
|
1490
|
+
tags: [VISION_LANGUAGE_MODEL_TAG, LLAVA_MODEL_TAG, LIMITED_FUNCTIONALITY_VLM_TAG]
|
|
1251
1491
|
|
|
1252
1492
|
- name: openflamingo/OpenFlamingo-9B-vitl-mpt7b
|
|
1253
1493
|
display_name: OpenFlamingo (9B)
|
|
1254
|
-
description: OpenFlamingo is an open source implementation of DeepMind's Flamingo models. This 9B-parameter model uses a CLIP ViT-L/14 vision encoder and MPT-7B language model
|
|
1494
|
+
description: OpenFlamingo is an open source implementation of DeepMind's Flamingo models. This 9B-parameter model uses a CLIP ViT-L/14 vision encoder and MPT-7B language model ([paper](https://arxiv.org/abs/2308.01390)).
|
|
1255
1495
|
creator_organization_name: OpenFlamingo
|
|
1256
1496
|
access: open
|
|
1257
1497
|
num_parameters: 9000000000
|
|
@@ -1267,7 +1507,15 @@ models:
|
|
|
1267
1507
|
release_date: 2023-10-05
|
|
1268
1508
|
tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG]
|
|
1269
1509
|
|
|
1270
|
-
|
|
1510
|
+
# KAIST AI
|
|
1511
|
+
- name: kaistai/prometheus-vision-13b-v1.0-hf
|
|
1512
|
+
display_name: LLaVA + Vicuna-v1.5 (13B)
|
|
1513
|
+
description: LLaVa is an open-source chatbot trained by fine-tuning LlamA/Vicuna on GPT-generated multimodal instruction-following data. ([paper](https://arxiv.org/abs/2304.08485))
|
|
1514
|
+
creator_organization_name: KAIST AI
|
|
1515
|
+
access: open
|
|
1516
|
+
num_parameters: 13000000000
|
|
1517
|
+
release_date: 2024-01-01
|
|
1518
|
+
tags: [VISION_LANGUAGE_MODEL_TAG, LLAVA_MODEL_TAG, LIMITED_FUNCTIONALITY_VLM_TAG]
|
|
1271
1519
|
|
|
1272
1520
|
# 01.AI
|
|
1273
1521
|
- name: 01-ai/yi-6b
|
|
@@ -1278,6 +1526,7 @@ models:
|
|
|
1278
1526
|
num_parameters: 6000000000
|
|
1279
1527
|
release_date: 2023-11-02
|
|
1280
1528
|
tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG]
|
|
1529
|
+
|
|
1281
1530
|
- name: 01-ai/yi-34b
|
|
1282
1531
|
display_name: Yi (34B)
|
|
1283
1532
|
description: The Yi models are large language models trained from scratch by developers at 01.AI.
|
|
@@ -1287,6 +1536,39 @@ models:
|
|
|
1287
1536
|
release_date: 2023-11-02
|
|
1288
1537
|
tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG]
|
|
1289
1538
|
|
|
1539
|
+
- name: 01-ai/yi-6b-chat
|
|
1540
|
+
display_name: Yi Chat (6B)
|
|
1541
|
+
description: The Yi models are large language models trained from scratch by developers at 01.AI.
|
|
1542
|
+
creator_organization_name: 01.AI
|
|
1543
|
+
access: open
|
|
1544
|
+
num_parameters: 6000000000
|
|
1545
|
+
release_date: 2023-11-23
|
|
1546
|
+
tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG]
|
|
1547
|
+
|
|
1548
|
+
- name: 01-ai/yi-34b-chat
|
|
1549
|
+
display_name: Yi Chat (34B)
|
|
1550
|
+
description: The Yi models are large language models trained from scratch by developers at 01.AI.
|
|
1551
|
+
creator_organization_name: 01.AI
|
|
1552
|
+
access: open
|
|
1553
|
+
num_parameters: 34000000000
|
|
1554
|
+
release_date: 2023-11-23
|
|
1555
|
+
tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG]
|
|
1556
|
+
|
|
1557
|
+
- name: 01-ai/yi-large
|
|
1558
|
+
display_name: Yi Large
|
|
1559
|
+
description: The Yi models are large language models trained from scratch by developers at 01.AI. ([tweet](https://x.com/01AI_Yi/status/1789894091620458667))
|
|
1560
|
+
creator_organization_name: 01.AI
|
|
1561
|
+
access: limited
|
|
1562
|
+
release_date: 2024-05-12
|
|
1563
|
+
tags: [TEXT_MODEL_TAG, PARTIAL_FUNCTIONALITY_TEXT_MODEL_TAG]
|
|
1564
|
+
|
|
1565
|
+
- name: 01-ai/yi-large-preview
|
|
1566
|
+
display_name: Yi Large (Preview)
|
|
1567
|
+
description: The Yi models are large language models trained from scratch by developers at 01.AI. ([tweet](https://x.com/01AI_Yi/status/1789894091620458667))
|
|
1568
|
+
creator_organization_name: 01.AI
|
|
1569
|
+
access: limited
|
|
1570
|
+
release_date: 2024-05-12
|
|
1571
|
+
tags: [TEXT_MODEL_TAG, PARTIAL_FUNCTIONALITY_TEXT_MODEL_TAG]
|
|
1290
1572
|
|
|
1291
1573
|
# Allen Institute for AI
|
|
1292
1574
|
# OLMo Blog: https://blog.allenai.org/olmo-open-language-model-87ccfc95f580
|
|
@@ -1318,29 +1600,64 @@ models:
|
|
|
1318
1600
|
# TODO: Add instruct tag.
|
|
1319
1601
|
tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG]
|
|
1320
1602
|
|
|
1603
|
+
- name: allenai/olmo-1.7-7b
|
|
1604
|
+
display_name: OLMo 1.7 (7B)
|
|
1605
|
+
description: OLMo is a series of Open Language Models trained on the Dolma dataset. The instruct versions was trained on the Tulu SFT mixture and a cleaned version of the UltraFeedback dataset.
|
|
1606
|
+
creator_organization_name: Allen Institute for AI
|
|
1607
|
+
access: open
|
|
1608
|
+
num_parameters: 7000000000
|
|
1609
|
+
release_date: 2024-04-17
|
|
1610
|
+
tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG]
|
|
1321
1611
|
|
|
1322
1612
|
# Mistral AI
|
|
1323
1613
|
- name: mistralai/mistral-7b-v0.1
|
|
1324
1614
|
display_name: Mistral v0.1 (7B)
|
|
1325
|
-
description: Mistral 7B is a 7.3B parameter transformer model that uses Grouped-Query Attention (GQA) and Sliding-Window Attention (SWA).
|
|
1615
|
+
description: Mistral 7B is a 7.3B parameter transformer model that uses Grouped-Query Attention (GQA) and Sliding-Window Attention (SWA). ([blog post](https://mistral.ai/news/announcing-mistral-7b/))
|
|
1616
|
+
creator_organization_name: Mistral AI
|
|
1617
|
+
access: open
|
|
1618
|
+
num_parameters: 7300000000
|
|
1619
|
+
release_date: 2023-09-27
|
|
1620
|
+
tags: [TEXT_MODEL_TAG, PARTIAL_FUNCTIONALITY_TEXT_MODEL_TAG]
|
|
1621
|
+
|
|
1622
|
+
- name: mistralai/mistral-7b-instruct-v0.1
|
|
1623
|
+
display_name: Mistral Instruct v0.1 (7B)
|
|
1624
|
+
description: Mistral v0.1 Instruct 7B is a 7.3B parameter transformer model that uses Grouped-Query Attention (GQA) and Sliding-Window Attention (SWA). The instruct version was fined-tuned using publicly available conversation datasets. ([blog post](https://mistral.ai/news/announcing-mistral-7b/))
|
|
1326
1625
|
creator_organization_name: Mistral AI
|
|
1327
1626
|
access: open
|
|
1328
1627
|
num_parameters: 7300000000
|
|
1329
1628
|
release_date: 2023-09-27
|
|
1330
1629
|
tags: [TEXT_MODEL_TAG, PARTIAL_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
|
|
1331
1630
|
|
|
1631
|
+
- name: mistralai/mistral-7b-instruct-v0.2
|
|
1632
|
+
display_name: Mistral Instruct v0.2 (7B)
|
|
1633
|
+
description: Mistral v0.2 Instruct 7B is a 7.3B parameter transformer model that uses Grouped-Query Attention (GQA). Compared to v0.1, v0.2 has a 32k context window and no Sliding-Window Attention (SWA). ([blog post](https://mistral.ai/news/la-plateforme/))
|
|
1634
|
+
creator_organization_name: Mistral AI
|
|
1635
|
+
access: open
|
|
1636
|
+
num_parameters: 7300000000
|
|
1637
|
+
release_date: 2024-03-23
|
|
1638
|
+
tags: [TEXT_MODEL_TAG, PARTIAL_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
|
|
1639
|
+
|
|
1640
|
+
- name: mistralai/mistral-7b-instruct-v0.3
|
|
1641
|
+
display_name: Mistral Instruct v0.3 (7B)
|
|
1642
|
+
description: Mistral v0.3 Instruct 7B is a 7.3B parameter transformer model that uses Grouped-Query Attention (GQA). Compared to v0.1, v0.2 has a 32k context window and no Sliding-Window Attention (SWA). ([blog post](https://mistral.ai/news/la-plateforme/))
|
|
1643
|
+
creator_organization_name: Mistral AI
|
|
1644
|
+
access: open
|
|
1645
|
+
num_parameters: 7300000000
|
|
1646
|
+
release_date: 2024-05-22
|
|
1647
|
+
tags: [TEXT_MODEL_TAG, PARTIAL_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
|
|
1648
|
+
|
|
1332
1649
|
- name: mistralai/mixtral-8x7b-32kseqlen
|
|
1333
1650
|
display_name: Mixtral (8x7B 32K seqlen)
|
|
1334
|
-
description:
|
|
1651
|
+
description: Mixtral is a mixture-of-experts model that has 46.7B total parameters but only uses 12.9B parameters per token. ([blog post](https://mistral.ai/news/mixtral-of-experts/), [tweet](https://twitter.com/MistralAI/status/1733150512395038967)).
|
|
1335
1652
|
creator_organization_name: Mistral AI
|
|
1336
1653
|
access: open
|
|
1337
1654
|
num_parameters: 46700000000
|
|
1338
1655
|
release_date: 2023-12-08
|
|
1339
|
-
tags: [TEXT_MODEL_TAG, PARTIAL_FUNCTIONALITY_TEXT_MODEL_TAG
|
|
1656
|
+
tags: [TEXT_MODEL_TAG, PARTIAL_FUNCTIONALITY_TEXT_MODEL_TAG]
|
|
1340
1657
|
|
|
1341
1658
|
- name: mistralai/mixtral-8x7b-instruct-v0.1
|
|
1342
|
-
display_name: Mixtral (8x7B
|
|
1343
|
-
description: Mixtral (8x7B
|
|
1659
|
+
display_name: Mixtral Instruct (8x7B)
|
|
1660
|
+
description: Mixtral Instruct (8x7B) is a version of Mixtral (8x7B) that was optimized through supervised fine-tuning and direct preference optimisation (DPO) for careful instruction following. ([blog post](https://mistral.ai/news/mixtral-of-experts/)).
|
|
1344
1661
|
creator_organization_name: Mistral AI
|
|
1345
1662
|
access: open
|
|
1346
1663
|
num_parameters: 46700000000
|
|
@@ -1350,7 +1667,16 @@ models:
|
|
|
1350
1667
|
|
|
1351
1668
|
- name: mistralai/mixtral-8x22b
|
|
1352
1669
|
display_name: Mixtral (8x22B)
|
|
1353
|
-
description: Mistral AI's mixture-of-experts model ([
|
|
1670
|
+
description: Mistral AI's mixture-of-experts model that uses 39B active parameters out of 141B ([blog post](https://mistral.ai/news/mixtral-8x22b/)).
|
|
1671
|
+
creator_organization_name: Mistral AI
|
|
1672
|
+
access: open
|
|
1673
|
+
num_parameters: 176000000000
|
|
1674
|
+
release_date: 2024-04-10
|
|
1675
|
+
tags: [TEXT_MODEL_TAG, PARTIAL_FUNCTIONALITY_TEXT_MODEL_TAG]
|
|
1676
|
+
|
|
1677
|
+
- name: mistralai/mixtral-8x22b-instruct-v0.1
|
|
1678
|
+
display_name: Mixtral Instruct (8x22B)
|
|
1679
|
+
description: Mistral AI's mixture-of-experts model that uses 39B active parameters out of 141B ([blog post](https://mistral.ai/news/mixtral-8x22b/)).
|
|
1354
1680
|
creator_organization_name: Mistral AI
|
|
1355
1681
|
access: open
|
|
1356
1682
|
num_parameters: 176000000000
|
|
@@ -1641,7 +1967,7 @@ models:
|
|
|
1641
1967
|
tags: [TEXT_MODEL_TAG, OPENAI_CHATGPT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
|
|
1642
1968
|
|
|
1643
1969
|
- name: openai/gpt-3.5-turbo-0125
|
|
1644
|
-
display_name:
|
|
1970
|
+
display_name: GPT-3.5 Turbo (0125)
|
|
1645
1971
|
description: Sibling model of text-davinci-003 that is optimized for chat but works well for traditional completions tasks as well. Snapshot from 2024-01-25.
|
|
1646
1972
|
creator_organization_name: OpenAI
|
|
1647
1973
|
access: limited
|
|
@@ -1720,9 +2046,26 @@ models:
|
|
|
1720
2046
|
release_date: 2024-04-09
|
|
1721
2047
|
tags: [TEXT_MODEL_TAG, OPENAI_CHATGPT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
|
|
1722
2048
|
|
|
2049
|
+
- name: openai/gpt-4o-2024-05-13
|
|
2050
|
+
display_name: GPT-4o (2024-05-13)
|
|
2051
|
+
description: GPT-4o (2024-05-13) is a large multimodal model that accepts as input any combination of text, audio, and image and generates any combination of text, audio, and image outputs.
|
|
2052
|
+
creator_organization_name: OpenAI
|
|
2053
|
+
access: limited
|
|
2054
|
+
release_date: 2024-04-09
|
|
2055
|
+
tags: [TEXT_MODEL_TAG, VISION_LANGUAGE_MODEL_TAG, OPENAI_CHATGPT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
|
|
2056
|
+
|
|
1723
2057
|
- name: openai/gpt-4-vision-preview
|
|
1724
|
-
|
|
1725
|
-
|
|
2058
|
+
# According to https://platform.openai.com/docs/models/gpt-4-turbo-and-gpt-4, this model has pointed gpt-4-1106-vision-preview.
|
|
2059
|
+
display_name: GPT-4V (1106 preview)
|
|
2060
|
+
description: GPT-4V is a large multimodal model that accepts both text and images and is optimized for chat ([model card](https://openai.com/research/gpt-4v-system-card)).
|
|
2061
|
+
creator_organization_name: OpenAI
|
|
2062
|
+
access: limited
|
|
2063
|
+
release_date: 2023-11-06
|
|
2064
|
+
tags: [VISION_LANGUAGE_MODEL_TAG, OPENAI_CHATGPT_MODEL_TAG, FULL_FUNCTIONALITY_VLM_TAG]
|
|
2065
|
+
|
|
2066
|
+
- name: openai/gpt-4-1106-vision-preview
|
|
2067
|
+
display_name: GPT-4V (1106 preview)
|
|
2068
|
+
description: GPT-4V is a large multimodal model that accepts both text and images and is optimized for chat ([model card](https://openai.com/research/gpt-4v-system-card)).
|
|
1726
2069
|
creator_organization_name: OpenAI
|
|
1727
2070
|
access: limited
|
|
1728
2071
|
release_date: 2023-11-06
|
|
@@ -1858,7 +2201,7 @@ models:
|
|
|
1858
2201
|
|
|
1859
2202
|
- name: qwen/qwen-7b
|
|
1860
2203
|
display_name: Qwen
|
|
1861
|
-
description: 7B-parameter version of the large language model series, Qwen (abbr. Tongyi Qianwen), proposed by Aibaba Cloud. Qwen
|
|
2204
|
+
description: 7B-parameter version of the large language model series, Qwen (abbr. Tongyi Qianwen), proposed by Aibaba Cloud. Qwen is a family of transformer models with SwiGLU activation, RoPE, and multi-head attention. ([blog](https://qwenlm.github.io/blog/qwen1.5/))
|
|
1862
2205
|
creator_organization_name: Qwen
|
|
1863
2206
|
access: open
|
|
1864
2207
|
release_date: 2024-02-05
|
|
@@ -1866,7 +2209,7 @@ models:
|
|
|
1866
2209
|
|
|
1867
2210
|
- name: qwen/qwen1.5-7b
|
|
1868
2211
|
display_name: Qwen1.5 (7B)
|
|
1869
|
-
description: 7B-parameter version of the large language model series, Qwen 1.5 (abbr. Tongyi Qianwen), proposed by Aibaba Cloud. Qwen
|
|
2212
|
+
description: 7B-parameter version of the large language model series, Qwen 1.5 (abbr. Tongyi Qianwen), proposed by Aibaba Cloud. Qwen is a family of transformer models with SwiGLU activation, RoPE, and multi-head attention. ([blog](https://qwenlm.github.io/blog/qwen1.5/))
|
|
1870
2213
|
creator_organization_name: Qwen
|
|
1871
2214
|
access: open
|
|
1872
2215
|
release_date: 2024-02-05
|
|
@@ -1874,7 +2217,7 @@ models:
|
|
|
1874
2217
|
|
|
1875
2218
|
- name: qwen/qwen1.5-14b
|
|
1876
2219
|
display_name: Qwen1.5 (14B)
|
|
1877
|
-
description: 14B-parameter version of the large language model series, Qwen 1.5 (abbr. Tongyi Qianwen), proposed by Aibaba Cloud. Qwen
|
|
2220
|
+
description: 14B-parameter version of the large language model series, Qwen 1.5 (abbr. Tongyi Qianwen), proposed by Aibaba Cloud. Qwen is a family of transformer models with SwiGLU activation, RoPE, and multi-head attention. ([blog](https://qwenlm.github.io/blog/qwen1.5/))
|
|
1878
2221
|
creator_organization_name: Qwen
|
|
1879
2222
|
access: open
|
|
1880
2223
|
release_date: 2024-02-05
|
|
@@ -1882,23 +2225,71 @@ models:
 
 - name: qwen/qwen1.5-32b
   display_name: Qwen1.5 (32B)
-  description: 32B-parameter version of the large language model series, Qwen 1.5 (abbr. Tongyi Qianwen), proposed by Aibaba Cloud. Qwen
+  description: 32B-parameter version of the large language model series, Qwen 1.5 (abbr. Tongyi Qianwen), proposed by Alibaba Cloud. Qwen is a family of transformer models with SwiGLU activation, RoPE, and multi-head attention. The 32B version also includes grouped query attention (GQA). ([blog](https://qwenlm.github.io/blog/qwen1.5-32b/))
   creator_organization_name: Qwen
   access: open
-  release_date: 2024-02
+  release_date: 2024-04-02
   tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG]
 
 - name: qwen/qwen1.5-72b
   display_name: Qwen1.5 (72B)
-  description: 72B-parameter version of the large language model series, Qwen 1.5 (abbr. Tongyi Qianwen), proposed by Aibaba Cloud. Qwen
+  description: 72B-parameter version of the large language model series, Qwen 1.5 (abbr. Tongyi Qianwen), proposed by Alibaba Cloud. Qwen is a family of transformer models with SwiGLU activation, RoPE, and multi-head attention. ([blog](https://qwenlm.github.io/blog/qwen1.5/))
   creator_organization_name: Qwen
   access: open
   release_date: 2024-02-05
   tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG]
 
+- name: qwen/qwen1.5-7b-chat
+  display_name: Qwen1.5 Chat (7B)
+  description: 7B-parameter chat version of the large language model series, Qwen 1.5 (abbr. Tongyi Qianwen), proposed by Alibaba Cloud. Qwen is a family of transformer models with SwiGLU activation, RoPE, and multi-head attention. ([blog](https://qwenlm.github.io/blog/qwen1.5/))
+  creator_organization_name: Qwen
+  access: open
+  release_date: 2024-02-05
+  tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+
+- name: qwen/qwen1.5-14b-chat
+  display_name: Qwen1.5 Chat (14B)
+  description: 14B-parameter chat version of the large language model series, Qwen 1.5 (abbr. Tongyi Qianwen), proposed by Alibaba Cloud. Qwen is a family of transformer models with SwiGLU activation, RoPE, and multi-head attention. ([blog](https://qwenlm.github.io/blog/qwen1.5/))
+  creator_organization_name: Qwen
+  access: open
+  release_date: 2024-02-05
+  tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+
+- name: qwen/qwen1.5-32b-chat
+  display_name: Qwen1.5 Chat (32B)
+  description: 32B-parameter chat version of the large language model series, Qwen 1.5 (abbr. Tongyi Qianwen), proposed by Alibaba Cloud. Qwen is a family of transformer models with SwiGLU activation, RoPE, and multi-head attention. The 32B version also includes grouped query attention (GQA). ([blog](https://qwenlm.github.io/blog/qwen1.5-32b/))
+  creator_organization_name: Qwen
+  access: open
+  release_date: 2024-04-02
+  tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+
+- name: qwen/qwen1.5-72b-chat
+  display_name: Qwen1.5 Chat (72B)
+  description: 72B-parameter chat version of the large language model series, Qwen 1.5 (abbr. Tongyi Qianwen), proposed by Alibaba Cloud. Qwen is a family of transformer models with SwiGLU activation, RoPE, and multi-head attention. ([blog](https://qwenlm.github.io/blog/qwen1.5/))
+  creator_organization_name: Qwen
+  access: open
+  release_date: 2024-02-05
+  tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+
+- name: qwen/qwen1.5-110b-chat
+  display_name: Qwen1.5 Chat (110B)
+  description: 110B-parameter chat version of the large language model series, Qwen 1.5 (abbr. Tongyi Qianwen), proposed by Alibaba Cloud. Qwen is a family of transformer models with SwiGLU activation, RoPE, and multi-head attention. The 110B version also includes grouped query attention (GQA). ([blog](https://qwenlm.github.io/blog/qwen1.5-110b/))
+  creator_organization_name: Qwen
+  access: open
+  release_date: 2024-04-25
+  tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+
+- name: qwen/qwen2-72b-instruct
+  display_name: Qwen2 Instruct (72B)
+  description: 72B-parameter chat version of the large language model series, Qwen2. Qwen2 uses grouped query attention (GQA) and supports context lengths of up to 128K tokens. ([blog](https://qwenlm.github.io/blog/qwen2/))
+  creator_organization_name: Qwen
+  access: open
+  release_date: 2024-06-07
+  tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+
 - name: qwen/qwen-vl
   display_name: Qwen-VL
-  description: Visual multimodal version of the large model series ([paper](https://arxiv.org/abs/2308.12966)).
+  description: Visual multimodal version of the Qwen large language model series ([paper](https://arxiv.org/abs/2308.12966)).
   creator_organization_name: Alibaba Cloud
   access: open
   release_date: 2023-08-24
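The 32B and 110B Qwen1.5 entries above and the Qwen2 entry all single out grouped query attention (GQA), in which several query heads share one key/value head, shrinking the KV cache by the ratio of query heads to KV heads. A small NumPy sketch with assumed head counts (8 query heads over 2 KV heads; not Qwen's real configuration), unmasked for brevity:

```python
# Sketch of grouped-query attention (GQA): n_q query heads share
# n_kv < n_q key/value heads. Head counts here are illustrative.
import numpy as np


def gqa(q, k, v, n_q_heads=8, n_kv_heads=2):
    seq, d = q.shape
    hd = d // n_q_heads                 # per-head width
    group = n_q_heads // n_kv_heads     # query heads per KV head
    qh = q.reshape(seq, n_q_heads, hd)
    kh = k.reshape(seq, n_kv_heads, hd)
    vh = v.reshape(seq, n_kv_heads, hd)
    out = np.empty_like(qh)
    for h in range(n_q_heads):
        kv = h // group                 # index of the shared KV head
        scores = qh[:, h] @ kh[:, kv].T / np.sqrt(hd)
        w = np.exp(scores - scores.max(axis=-1, keepdims=True))
        w /= w.sum(axis=-1, keepdims=True)  # row-wise softmax
        out[:, h] = w @ vh[:, kv]
    return out.reshape(seq, d)


rng = np.random.default_rng(2)
q = rng.standard_normal((16, 64))   # 8 query heads of width 8
k = rng.standard_normal((16, 16))   # only 2 KV heads of width 8
v = rng.standard_normal((16, 16))
assert gqa(q, k, v).shape == (16, 64)
```

Because k and v carry only 2 heads instead of 8, the KV cache for this configuration is a quarter the size of the multi-head equivalent.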
@@ -1906,12 +2297,49 @@ models:
 
 - name: qwen/qwen-vl-chat
   display_name: Qwen-VL Chat
-  description: Chat version of
+  description: Chat version of Qwen-VL ([paper](https://arxiv.org/abs/2308.12966)).
   creator_organization_name: Alibaba Cloud
   access: open
   release_date: 2023-08-24
   tags: [VISION_LANGUAGE_MODEL_TAG, FULL_FUNCTIONALITY_VLM_TAG]
 
+# SAIL (Sea AI Lab)
+- name: sail/sailor-7b
+  display_name: Sailor (7B)
+  description: Sailor is a suite of open language models tailored for South-East Asia, focusing on languages such as Indonesian, Thai, Vietnamese, Malay, and Lao. These models were continually pre-trained from Qwen1.5. ([paper](https://arxiv.org/abs/2404.03608))
+  creator_organization_name: SAIL
+  access: open
+  num_parameters: 7000000000
+  release_date: 2024-04-04
+  tags: [TEXT_MODEL_TAG, PARTIAL_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+
+- name: sail/sailor-7b-chat
+  display_name: Sailor Chat (7B)
+  description: Sailor is a suite of open language models tailored for South-East Asia, focusing on languages such as Indonesian, Thai, Vietnamese, Malay, and Lao. These models were continually pre-trained from Qwen1.5. ([paper](https://arxiv.org/abs/2404.03608))
+  creator_organization_name: SAIL
+  access: open
+  num_parameters: 7000000000
+  release_date: 2024-04-04
+  tags: [TEXT_MODEL_TAG, PARTIAL_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+
+- name: sail/sailor-14b
+  display_name: Sailor (14B)
+  description: Sailor is a suite of open language models tailored for South-East Asia, focusing on languages such as Indonesian, Thai, Vietnamese, Malay, and Lao. These models were continually pre-trained from Qwen1.5. ([paper](https://arxiv.org/abs/2404.03608))
+  creator_organization_name: SAIL
+  access: open
+  num_parameters: 14000000000
+  release_date: 2024-04-04
+  tags: [TEXT_MODEL_TAG, PARTIAL_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+
+- name: sail/sailor-14b-chat
+  display_name: Sailor Chat (14B)
+  description: Sailor is a suite of open language models tailored for South-East Asia, focusing on languages such as Indonesian, Thai, Vietnamese, Malay, and Lao. These models were continually pre-trained from Qwen1.5. ([paper](https://arxiv.org/abs/2404.03608))
+  creator_organization_name: SAIL
+  access: open
+  num_parameters: 14000000000
+  release_date: 2024-04-04
+  tags: [TEXT_MODEL_TAG, PARTIAL_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+
 # Salesforce
 - name: salesforce/codegen # NOT SUPPORTED
   display_name: CodeGen (16B)
@@ -1922,6 +2350,34 @@ models:
   release_date: 2022-03-25
   tags: [] # TODO: add tags
 
+# SCB10X
+- name: scb10x/typhoon-v1.5-72b
+  display_name: Typhoon v1.5 (72B)
+  description: Typhoon v1.5 (72B) is a pretrained Thai large language model with 72 billion parameters, based on Qwen1.5-72B. ([blog](https://blog.opentyphoon.ai/typhoon-1-5-release-a9364cb8e8d7))
+  creator_organization_name: SCB10X
+  access: open
+  num_parameters: 72000000000
+  release_date: 2024-05-08
+  tags: [TEXT_MODEL_TAG, PARTIAL_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+
+- name: scb10x/typhoon-v1.5-72b-instruct
+  display_name: Typhoon v1.5 Instruct (72B)
+  description: Typhoon v1.5 Instruct (72B) is a pretrained Thai large language model with 72 billion parameters, based on Qwen1.5-72B. ([blog](https://blog.opentyphoon.ai/typhoon-1-5-release-a9364cb8e8d7))
+  creator_organization_name: SCB10X
+  access: open
+  num_parameters: 72000000000
+  release_date: 2024-05-08
+  tags: [TEXT_MODEL_TAG, PARTIAL_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+
+# Snowflake
+- name: snowflake/snowflake-arctic-instruct
+  display_name: Arctic Instruct
+  description: Arctic combines a 10B dense transformer model with a residual 128x3.66B MoE MLP, resulting in 480B total and 17B active parameters chosen using top-2 gating.
+  creator_organization_name: Snowflake
+  access: open
+  num_parameters: 482000000000
+  release_date: 2024-04-24
+  tags: [TEXT_MODEL_TAG, PARTIAL_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
 
 
 # Stability AI
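The Arctic description's parameter counts are easy to sanity-check: a 10B dense trunk plus 128 experts of roughly 3.66B each comes to about 480B total, and top-2 gating activates the trunk plus two experts, about 17B. A quick check in Python:

```python
# Checking the Arctic description's arithmetic: 10B dense trunk,
# 128 experts of ~3.66B each, top-2 gating.
dense = 10e9
experts, expert_size, top_k = 128, 3.66e9, 2

total = dense + experts * expert_size   # ~478.5e9, i.e. "480B total"
active = dense + top_k * expert_size    # ~17.3e9,  i.e. "17B active"
print(f"total ~ {total / 1e9:.0f}B, active ~ {active / 1e9:.1f}B")
```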
@@ -2188,6 +2644,15 @@ models:
   # Does not support echo
   tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG]
 
+- name: writer/palmyra-vision-003
+  display_name: Palmyra Vision 003
+  description: Palmyra Vision 003 (internal only)
+  creator_organization_name: Writer
+  access: limited
+  num_parameters: 5000000000
+  release_date: 2024-05-24
+  # Does not support echo
+  tags: [VISION_LANGUAGE_MODEL_TAG, LIMITED_FUNCTIONALITY_VLM_TAG]
 
 
 # Yandex
@@ -2199,3 +2664,65 @@ models:
   num_parameters: 100000000000
   release_date: 2022-06-23
   tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, ABLATION_MODEL_TAG]
+
+# Reka
+- name: reka/reka-core
+  display_name: Reka-Core
+  description: Reka-Core
+  creator_organization_name: Reka AI
+  access: limited
+  release_date: 2024-04-18
+  tags: [VISION_LANGUAGE_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG]
+
+- name: reka/reka-core-20240415
+  display_name: Reka-Core-20240415
+  description: Reka-Core-20240415
+  creator_organization_name: Reka AI
+  access: limited
+  release_date: 2024-04-18
+  tags: [VISION_LANGUAGE_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG]
+
+- name: reka/reka-core-20240501
+  display_name: Reka-Core-20240501
+  description: Reka-Core-20240501
+  creator_organization_name: Reka AI
+  access: limited
+  release_date: 2024-05-01
+  tags: [VISION_LANGUAGE_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG]
+
+- name: reka/reka-flash
+  display_name: Reka-Flash (21B)
+  description: Reka-Flash (21B)
+  creator_organization_name: Reka AI
+  access: limited
+  num_parameters: 21000000000
+  release_date: 2024-04-18
+  tags: [VISION_LANGUAGE_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG]
+
+- name: reka/reka-flash-20240226
+  display_name: Reka-Flash-20240226 (21B)
+  description: Reka-Flash-20240226 (21B)
+  creator_organization_name: Reka AI
+  access: limited
+  num_parameters: 21000000000
+  release_date: 2024-04-18
+  tags: [VISION_LANGUAGE_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG]
+
+- name: reka/reka-edge
+  display_name: Reka-Edge (7B)
+  description: Reka-Edge (7B)
+  creator_organization_name: Reka AI
+  access: limited
+  num_parameters: 7000000000
+  release_date: 2024-04-18
+  tags: [VISION_LANGUAGE_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG]
+
+- name: reka/reka-edge-20240208
+  display_name: Reka-Edge-20240208 (7B)
+  description: Reka-Edge-20240208 (7B)
+  creator_organization_name: Reka AI
+  access: limited
+  num_parameters: 7000000000
+  release_date: 2024-04-18
+  tags: [VISION_LANGUAGE_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG]
+
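The tags attached throughout this file (VISION_LANGUAGE_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG, and so on) are what make these entries queryable. A hypothetical filter over entries loaded as in the earlier load_entries sketch; HELM's real registry in helm/benchmark/model_metadata_registry.py exposes its own API.

```python
# Hypothetical tag filter over the ModelEntry list from load_entries()
# above; the path and helper names are illustrative.
def models_with_tags(entries, required_tags):
    required = set(required_tags)
    return [e.name for e in entries if required <= set(e.tags)]


entries = load_entries("model_metadata.yaml")  # illustrative path
vlms = models_with_tags(entries, ["VISION_LANGUAGE_MODEL_TAG"])
chat_vlms = models_with_tags(
    entries, ["VISION_LANGUAGE_MODEL_TAG", "INSTRUCTION_FOLLOWING_MODEL_TAG"]
)
```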