crfm-helm 0.5.0__py3-none-any.whl → 0.5.2__py3-none-any.whl

This diff compares the contents of two publicly released versions of this package, as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release.

Files changed (125)
  1. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/METADATA +19 -5
  2. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/RECORD +121 -76
  3. helm/benchmark/adaptation/adapter_spec.py +32 -31
  4. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -0
  5. helm/benchmark/adaptation/adapters/multimodal/multimodal_prompt.py +7 -0
  6. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +2 -0
  7. helm/benchmark/annotation/air_bench_annotator.py +64 -0
  8. helm/benchmark/annotation/annotator_factory.py +6 -0
  9. helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +1 -1
  10. helm/benchmark/annotation/live_qa_annotator.py +84 -0
  11. helm/benchmark/annotation/medication_qa_annotator.py +81 -0
  12. helm/benchmark/augmentations/perturbation.py +17 -1
  13. helm/benchmark/augmentations/test_perturbation.py +30 -0
  14. helm/benchmark/augmentations/translate_perturbation.py +1 -0
  15. helm/benchmark/huggingface_registration.py +16 -6
  16. helm/benchmark/metrics/air_bench_metrics.py +56 -0
  17. helm/benchmark/metrics/efficiency_metrics.py +9 -2
  18. helm/benchmark/metrics/evaluate_reference_metrics.py +16 -0
  19. helm/benchmark/metrics/fin_qa_metrics.py +60 -0
  20. helm/benchmark/metrics/fin_qa_metrics_helper.py +398 -0
  21. helm/benchmark/metrics/gpt4v_originality_critique_metrics.py +126 -0
  22. helm/benchmark/metrics/instruction_following_critique_metrics.py +1 -0
  23. helm/benchmark/metrics/live_qa_metrics.py +23 -0
  24. helm/benchmark/metrics/medication_qa_metrics.py +23 -0
  25. helm/benchmark/metrics/prometheus_vision_critique_metrics.py +185 -0
  26. helm/benchmark/metrics/reka_vibe_critique_metrics.py +158 -0
  27. helm/benchmark/metrics/unitxt_metrics.py +20 -10
  28. helm/benchmark/metrics/vision_language/emd_utils.py +4 -0
  29. helm/benchmark/metrics/vision_language/image_metrics.py +104 -21
  30. helm/benchmark/model_metadata_registry.py +5 -1
  31. helm/benchmark/presentation/schema.py +54 -4
  32. helm/benchmark/presentation/test_schema.py +11 -0
  33. helm/benchmark/run.py +16 -2
  34. helm/benchmark/run_expander.py +112 -63
  35. helm/benchmark/run_spec_factory.py +15 -10
  36. helm/benchmark/run_specs/air_bench_run_specs.py +40 -0
  37. helm/benchmark/run_specs/classic_run_specs.py +15 -11
  38. helm/benchmark/run_specs/decodingtrust_run_specs.py +3 -1
  39. helm/benchmark/run_specs/experimental_run_specs.py +33 -0
  40. helm/benchmark/run_specs/finance_run_specs.py +33 -0
  41. helm/benchmark/run_specs/vlm_run_specs.py +444 -65
  42. helm/benchmark/scenarios/air_bench_scenario.py +50 -0
  43. helm/benchmark/scenarios/ci_mcqa_scenario.py +80 -0
  44. helm/benchmark/scenarios/entity_data_imputation_scenario.py +8 -2
  45. helm/benchmark/scenarios/fin_qa_scenario.py +117 -0
  46. helm/benchmark/scenarios/legalbench_scenario.py +6 -2
  47. helm/benchmark/scenarios/math_scenario.py +1 -1
  48. helm/benchmark/scenarios/test_air_bench_scenario.py +27 -0
  49. helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +83 -0
  50. helm/benchmark/scenarios/vision_language/bingo_scenario.py +3 -3
  51. helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +134 -0
  52. helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +74 -0
  53. helm/benchmark/scenarios/vision_language/gqa_scenario.py +91 -0
  54. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +4 -2
  55. helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +13 -2
  56. helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +1 -5
  57. helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +1 -5
  58. helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +5 -3
  59. helm/benchmark/scenarios/vision_language/math_vista_scenario.py +117 -0
  60. helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +103 -0
  61. helm/benchmark/scenarios/vision_language/mscoco_captioning_scenario.py +92 -0
  62. helm/benchmark/scenarios/vision_language/mscoco_categorization_scenario.py +117 -0
  63. helm/benchmark/scenarios/vision_language/originality_scenario.py +35 -0
  64. helm/benchmark/scenarios/vision_language/pairs_scenario.py +247 -0
  65. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +3 -3
  66. helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +95 -0
  67. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +2 -2
  68. helm/benchmark/scenarios/vision_language/vqa_scenario.py +4 -2
  69. helm/benchmark/static/schema_air_bench.yaml +3149 -0
  70. helm/benchmark/static/schema_classic.yaml +3 -59
  71. helm/benchmark/static/schema_finance.yaml +143 -0
  72. helm/benchmark/static/schema_image2structure.yaml +447 -0
  73. helm/benchmark/static/schema_instruction_following.yaml +3 -52
  74. helm/benchmark/static/schema_lite.yaml +3 -61
  75. helm/benchmark/static/schema_medical.yaml +255 -0
  76. helm/benchmark/static/schema_mmlu.yaml +3 -61
  77. helm/benchmark/static/schema_tables.yaml +200 -0
  78. helm/benchmark/static/schema_thai.yaml +223 -0
  79. helm/benchmark/static/schema_unitxt.yaml +3 -61
  80. helm/benchmark/static/schema_vhelm.yaml +824 -0
  81. helm/benchmark/static/schema_vhelm_lite.yaml +109 -0
  82. helm/benchmark/static_build/assets/air-overview-d2e6c49f.png +0 -0
  83. helm/benchmark/static_build/assets/index-30dbceba.js +10 -0
  84. helm/benchmark/static_build/assets/index-66b02d40.css +1 -0
  85. helm/benchmark/static_build/assets/overview-74aea3d8.png +0 -0
  86. helm/benchmark/static_build/assets/process-flow-bd2eba96.png +0 -0
  87. helm/benchmark/static_build/index.html +2 -2
  88. helm/clients/anthropic_client.py +78 -14
  89. helm/clients/auto_client.py +11 -0
  90. helm/clients/client.py +24 -7
  91. helm/clients/cohere_client.py +98 -3
  92. helm/clients/huggingface_client.py +71 -12
  93. helm/clients/openai_client.py +11 -5
  94. helm/clients/reka_client.py +189 -0
  95. helm/clients/test_client.py +3 -3
  96. helm/clients/test_huggingface_client.py +19 -3
  97. helm/clients/test_together_client.py +72 -2
  98. helm/clients/together_client.py +199 -2
  99. helm/clients/vertexai_client.py +117 -64
  100. helm/clients/vision_language/huggingface_vision2seq_client.py +145 -0
  101. helm/clients/vision_language/huggingface_vlm_client.py +12 -4
  102. helm/clients/vision_language/idefics_client.py +2 -2
  103. helm/clients/vision_language/paligemma_client.py +146 -0
  104. helm/clients/vision_language/palmyra_vision_client.py +84 -0
  105. helm/clients/yi_client.py +31 -0
  106. helm/common/critique_request.py +10 -1
  107. helm/common/images_utils.py +29 -3
  108. helm/config/model_deployments.yaml +504 -12
  109. helm/config/model_metadata.yaml +579 -52
  110. helm/config/tokenizer_configs.yaml +100 -1
  111. helm/proxy/critique/model_critique_client.py +32 -4
  112. helm/proxy/services/server_service.py +1 -1
  113. helm/tokenizers/auto_tokenizer.py +1 -1
  114. helm/tokenizers/cohere_tokenizer.py +44 -2
  115. helm/tokenizers/huggingface_tokenizer.py +36 -13
  116. helm/tokenizers/test_cohere_tokenizer.py +39 -0
  117. helm/tokenizers/test_huggingface_tokenizer.py +5 -1
  118. helm/benchmark/static/schema_vlm.yaml +0 -576
  119. helm/benchmark/static_build/assets/index-5088afcb.css +0 -1
  120. helm/benchmark/static_build/assets/index-d839df55.js +0 -9
  121. helm/benchmark/test_model_deployment_definition.py +0 -90
  122. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/LICENSE +0 -0
  123. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/WHEEL +0 -0
  124. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/entry_points.txt +0 -0
  125. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/top_level.txt +0 -0
@@ -307,7 +307,7 @@ model_deployments:
 
   - name: cohere/command
     model_name: cohere/command
-    tokenizer_name: cohere/cohere
+    tokenizer_name: cohere/command
     max_sequence_length: 2019 # TODO: verify this
     max_request_length: 2020 # TODO: verify this
     client_spec:
@@ -317,7 +317,7 @@ model_deployments:
 
   - name: cohere/command-light
     model_name: cohere/command-light
-    tokenizer_name: cohere/cohere
+    tokenizer_name: cohere/command-light
    max_sequence_length: 2019 # TODO: verify this
     max_request_length: 2020 # TODO: verify this
     client_spec:
@@ -325,6 +325,25 @@ model_deployments:
     window_service_spec:
       class_name: "helm.benchmark.window_services.cohere_window_service.CohereWindowService"
 
+  - name: cohere/command-r
+    model_name: cohere/command-r
+    tokenizer_name: cohere/command-r
+    max_sequence_length: 128000
+    max_request_length: 128000
+    client_spec:
+      class_name: "helm.clients.cohere_client.CohereChatClient"
+
+  - name: cohere/command-r-plus
+    model_name: cohere/command-r-plus
+    tokenizer_name: cohere/command-r-plus
+    # "We have a known issue where prompts between 112K - 128K in length
+    # result in bad generations."
+    # Source: https://docs.cohere.com/docs/command-r-plus
+    max_sequence_length: 110000
+    max_request_length: 110000
+    client_spec:
+      class_name: "helm.clients.cohere_client.CohereChatClient"
+
   # Craiyon
 
   - name: craiyon/dalle-mini
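
For orientation, every deployment entry in these hunks follows the same shape. The sketch below is assembled only from fields that actually appear in this diff; the placeholder values and the ExampleClient/ExampleWindowService class names are hypothetical, and the cross-references to model_metadata.yaml and tokenizer_configs.yaml are inferred from those files also being touched in this release (+579 -52 and +100 -1 in the file list above):

model_deployments:
  - name: <host>/<deployment-id>           # identifier HELM uses to select this deployment
    model_name: <creator>/<model-id>       # presumably must match an entry in model_metadata.yaml
    tokenizer_name: <creator>/<tokenizer>  # presumably must match an entry in tokenizer_configs.yaml
    max_sequence_length: 2048              # prompt-token budget
    max_request_length: 2049               # optional per-request cap
    client_spec:
      class_name: "helm.clients.example_client.ExampleClient"  # hypothetical class
      args: {}                             # optional extra parameters for the client
    window_service_spec:                   # optional override
      class_name: "helm.benchmark.window_services.example.ExampleWindowService"  # hypothetical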
@@ -352,7 +371,7 @@ model_deployments:
     tokenizer_name: databricks/dbrx-instruct
     max_sequence_length: 32767
     client_spec:
-      class_name: "helm.clients.together_client.TogetherClient"
+      class_name: "helm.clients.together_client.TogetherChatClient"
 
   # DeepFloyd
 
@@ -390,7 +409,7 @@ model_deployments:
     tokenizer_name: deepseek-ai/deepseek-llm-67b-chat
     max_sequence_length: 4095
     client_spec:
-      class_name: "helm.clients.together_client.TogetherClient"
+      class_name: "helm.clients.together_client.TogetherChatClient"
 
   # Gooseai
 
@@ -434,9 +453,17 @@ model_deployments:
     client_spec:
       class_name: "helm.clients.vertexai_client.VertexAIChatClient"
 
+  - name: google/gemini-1.0-pro-002
+    model_name: google/gemini-1.0-pro-002
+    tokenizer_name: google/gemma-2b # Gemini has no tokenizer endpoint, so we approximate by using Gemma's tokenizer.
+    max_sequence_length: 30720
+    max_sequence_and_generated_tokens_length: 32768 # Officially max_sequence_length + 2048
+    client_spec:
+      class_name: "helm.clients.vertexai_client.VertexAIChatClient"
+
   - name: google/gemini-pro-vision
     model_name: google/gemini-pro-vision
-    tokenizer_name: google/gemma-2b # Gemini has no tokenizer endpoint, so we approximate by using Gemma's tokenizer.
+    tokenizer_name: openai/cl100k_base
     max_sequence_length: 12288
     max_sequence_and_generated_tokens_length: 16384 # Officially max_sequence_length + 4096, in practice max_output_tokens <= 2048 for vision models
     client_spec:
@@ -450,6 +477,22 @@ model_deployments:
     client_spec:
       class_name: "helm.clients.vertexai_client.VertexAIChatClient"
 
+  - name: google/gemini-1.5-flash-001
+    model_name: google/gemini-1.5-flash-001
+    tokenizer_name: google/gemma-2b # Gemini has no tokenizer endpoint, so we approximate by using Gemma's tokenizer.
+    max_sequence_length: 1000000 # Source: https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#gemini-models
+    # TODO: Max output tokens: 8192
+    client_spec:
+      class_name: "helm.clients.vertexai_client.VertexAIChatClient"
+
+  - name: google/gemini-1.5-pro-001
+    model_name: google/gemini-1.5-pro-001
+    tokenizer_name: google/gemma-2b # Gemini has no tokenizer endpoint, so we approximate by using Gemma's tokenizer.
+    max_sequence_length: 1000000 # Source: https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#gemini-models
+    # TODO: Max output tokens: 8192
+    client_spec:
+      class_name: "helm.clients.vertexai_client.VertexAIChatClient"
+
   - name: google/gemini-1.5-pro-preview-0409
     model_name: google/gemini-1.5-pro-preview-0409
     tokenizer_name: google/gemma-2b # Gemini has no tokenizer endpoint, so we approximate by using Gemma's tokenizer.
@@ -458,6 +501,63 @@ model_deployments:
     client_spec:
       class_name: "helm.clients.vertexai_client.VertexAIChatClient"
 
+  - name: google/gemini-1.5-pro-preview-0514
+    model_name: google/gemini-1.5-pro-preview-0514
+    tokenizer_name: google/gemma-2b # Gemini has no tokenizer endpoint, so we approximate by using Gemma's tokenizer.
+    max_sequence_length: 1000000 # Source: https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#gemini-models
+    # TODO: Max output tokens: 8192
+    client_spec:
+      class_name: "helm.clients.vertexai_client.VertexAIChatClient"
+
+  - name: google/gemini-1.5-flash-preview-0514
+    model_name: google/gemini-1.5-flash-preview-0514
+    tokenizer_name: google/gemma-2b # Gemini has no tokenizer endpoint, so we approximate by using Gemma's tokenizer.
+    max_sequence_length: 1000000 # Source: https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#gemini-models
+    # TODO: Max output tokens: 8192
+    client_spec:
+      class_name: "helm.clients.vertexai_client.VertexAIChatClient"
+
+  ## Gemini with different safety settings
+  - name: google/gemini-1.5-pro-001-safety-default
+    model_name: google/gemini-1.5-pro-001-safety-default
+    tokenizer_name: google/gemma-2b # Gemini has no tokenizer endpoint, so we approximate by using Gemma's tokenizer.
+    max_sequence_length: 1000000 # Source: https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#gemini-models
+    # TODO: Max output tokens: 8192
+    client_spec:
+      class_name: "helm.clients.vertexai_client.VertexAIChatClient"
+      args:
+        safety_settings_preset: default
+
+  - name: google/gemini-1.5-pro-001-safety-block-none
+    model_name: google/gemini-1.5-pro-001-safety-block-none
+    tokenizer_name: google/gemma-2b # Gemini has no tokenizer endpoint, so we approximate by using Gemma's tokenizer.
+    max_sequence_length: 1000000 # Source: https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#gemini-models
+    # TODO: Max output tokens: 8192
+    client_spec:
+      class_name: "helm.clients.vertexai_client.VertexAIChatClient"
+      args:
+        safety_settings_preset: block_none
+
+  - name: google/gemini-1.5-flash-001-safety-default
+    model_name: google/gemini-1.5-flash-001-safety-default
+    tokenizer_name: google/gemma-2b # Gemini has no tokenizer endpoint, so we approximate by using Gemma's tokenizer.
+    max_sequence_length: 1000000 # Source: https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#gemini-models
+    # TODO: Max output tokens: 8192
+    client_spec:
+      class_name: "helm.clients.vertexai_client.VertexAIChatClient"
+      args:
+        safety_settings_preset: default
+
+  - name: google/gemini-1.5-flash-001-safety-block-none
+    model_name: google/gemini-1.5-flash-001-safety-block-none
+    tokenizer_name: google/gemma-2b # Gemini has no tokenizer endpoint, so we approximate by using Gemma's tokenizer.
+    max_sequence_length: 1000000 # Source: https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#gemini-models
+    # TODO: Max output tokens: 8192
+    client_spec:
+      class_name: "helm.clients.vertexai_client.VertexAIChatClient"
+      args:
+        safety_settings_preset: block_none
+
   ## Gemma
   - name: together/gemma-2b
     model_name: google/gemma-2b
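
The four -safety-* deployments above differ from the plain gemini-1.5-*-001 entries only in their names and the args block. Assuming client_spec.args follows HELM's usual object-spec convention of being forwarded as keyword arguments to the named client class's constructor (an assumption; the mechanism is not shown in this diff), the entire behavioral delta reduces to:

client_spec:
  class_name: "helm.clients.vertexai_client.VertexAIChatClient"
  args:
    safety_settings_preset: default   # or block_none to disable content blocking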
@@ -487,6 +587,42 @@ model_deployments:
     client_spec:
       class_name: "helm.clients.together_client.TogetherClient"
 
+  ## MedLM
+  - name: google/medlm-medium
+    model_name: google/medlm-medium
+    tokenizer_name: google/text-bison@001
+    max_sequence_length: 6000 # Officially 8192
+    max_sequence_and_generated_tokens_length: 7000 # Officially 9216
+    client_spec:
+      class_name: "helm.clients.vertexai_client.VertexAITextClient"
+    window_service_spec:
+      class_name: "helm.benchmark.window_services.no_decoding_window_service.NoDecodingWindowService"
+
+  - name: google/medlm-large
+    model_name: google/medlm-large
+    tokenizer_name: google/text-bison@001
+    max_sequence_length: 6000 # Officially 8192
+    max_sequence_and_generated_tokens_length: 7000 # Officially 9216
+    client_spec:
+      class_name: "helm.clients.vertexai_client.VertexAITextClient"
+    window_service_spec:
+      class_name: "helm.benchmark.window_services.no_decoding_window_service.NoDecodingWindowService"
+
+  ## PaliGemma
+  - name: google/paligemma-3b-mix-224
+    model_name: google/paligemma-3b-mix-224
+    tokenizer_name: google/gemma-2b
+    max_sequence_length: 7167
+    client_spec:
+      class_name: "helm.clients.vision_language.paligemma_client.PaliGemmaClient"
+
+  - name: google/paligemma-3b-mix-448
+    model_name: google/paligemma-3b-mix-448
+    tokenizer_name: google/gemma-2b
+    max_sequence_length: 7167
+    client_spec:
+      class_name: "helm.clients.vision_language.paligemma_client.PaliGemmaClient"
+
   ## PaLM 2
   - name: google/text-bison@001
     model_name: google/text-bison@001
@@ -504,7 +640,7 @@ model_deployments:
     max_sequence_length: 6000 # Officially 8192
     max_sequence_and_generated_tokens_length: 9216
     client_spec:
-      class_name: "helm.proxy.clients.vertexai_client.VertexAITextClient"
+      class_name: "helm.clients.vertexai_client.VertexAITextClient"
     window_service_spec:
       class_name: "helm.benchmark.window_services.no_decoding_window_service.NoDecodingWindowService"
 
@@ -545,7 +681,7 @@ model_deployments:
     max_sequence_length: 6000 # Officially 6144
     max_sequence_and_generated_tokens_length: 7168
     client_spec:
-      class_name: "helm.proxy.clients.vertexai_client.VertexAITextClient"
+      class_name: "helm.clients.vertexai_client.VertexAITextClient"
     window_service_spec:
       class_name: "helm.benchmark.window_services.no_decoding_window_service.NoDecodingWindowService"
 
@@ -561,6 +697,25 @@ model_deployments:
 
   # HuggingFace
 
+  ## AI Singapore
+  - name: huggingface/sea-lion-7b
+    model_name: aisingapore/sea-lion-7b
+    tokenizer_name: aisingapore/sea-lion-7b
+    max_sequence_length: 2048
+    client_spec:
+      class_name: "helm.clients.huggingface_client.HuggingFaceClient"
+      args:
+        trust_remote_code: true
+
+  - name: huggingface/sea-lion-7b-instruct
+    model_name: aisingapore/sea-lion-7b-instruct
+    tokenizer_name: aisingapore/sea-lion-7b
+    max_sequence_length: 2048
+    client_spec:
+      class_name: "helm.clients.huggingface_client.HuggingFaceClient"
+      args:
+        trust_remote_code: true
+
   ## Bigcode
   - name: huggingface/santacoder
     model_name: bigcode/santacoder
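
trust_remote_code: true appears here because the SEA-LION repositories ship custom modeling code; presumably HuggingFaceClient forwards the flag to Hugging Face's from_pretrained (an assumption, not shown in this diff). Under the same assumption, a locally registered deployment of any remote-code model might look like this sketch; every myorg/my-custom-7b identifier below is hypothetical:

# Hypothetical entry in a local prod_env/model_deployments.yaml
model_deployments:
  - name: huggingface/my-custom-7b      # hypothetical deployment name
    model_name: myorg/my-custom-7b      # hypothetical; would also need model metadata
    tokenizer_name: myorg/my-custom-7b  # hypothetical; would also need a tokenizer config
    max_sequence_length: 2048
    client_spec:
      class_name: "helm.clients.huggingface_client.HuggingFaceClient"
      args:
        trust_remote_code: true         # forwarded to from_pretrained (assumption)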
@@ -576,6 +731,15 @@ model_deployments:
     client_spec:
       class_name: "helm.clients.huggingface_client.HuggingFaceClient"
 
+  ## Biomistral
+
+  - name: huggingface/biomistral-7b
+    model_name: biomistral/biomistral-7b
+    tokenizer_name: mistralai/Mistral-7B-v0.1
+    max_sequence_length: 32000
+    client_spec:
+      class_name: "helm.clients.huggingface_client.HuggingFaceClient"
+
   ## Databricks
   - name: huggingface/dolly-v2-3b
     model_name: databricks/dolly-v2-3b
@@ -658,6 +822,15 @@ model_deployments:
     client_spec:
       class_name: "helm.clients.huggingface_client.HuggingFaceClient"
 
+  ## Meditron
+
+  - name: huggingface/meditron-7b
+    model_name: epfl-llm/meditron-7b
+    tokenizer_name: meta-llama/Llama-2-7b-hf
+    max_sequence_length: 4094
+    client_spec:
+      class_name: "helm.clients.huggingface_client.HuggingFaceClient"
+
   ## Meta
   - name: huggingface/opt-175b
     model_name: meta/opt-175b
@@ -709,7 +882,43 @@ model_deployments:
     max_sequence_length: 2048
     client_spec:
       class_name: "helm.clients.vision_language.huggingface_vlm_client.HuggingFaceVLMClient"
-
+
+  - name: huggingface/llava-v1.6-vicuna-7b-hf
+    model_name: uw-madison/llava-v1.6-vicuna-7b-hf
+    tokenizer_name: hf-internal-testing/llama-tokenizer
+    max_sequence_length: 2048
+    client_spec:
+      class_name: "helm.clients.vision_language.huggingface_vlm_client.HuggingFaceVLMClient"
+
+  - name: huggingface/llava-v1.6-vicuna-13b-hf
+    model_name: uw-madison/llava-v1.6-vicuna-13b-hf
+    tokenizer_name: hf-internal-testing/llama-tokenizer
+    max_sequence_length: 2048
+    client_spec:
+      class_name: "helm.clients.vision_language.huggingface_vlm_client.HuggingFaceVLMClient"
+
+  - name: huggingface/llava-v1.6-mistral-7b-hf
+    model_name: uw-madison/llava-v1.6-mistral-7b-hf
+    tokenizer_name: hf-internal-testing/llama-tokenizer
+    max_sequence_length: 2048
+    client_spec:
+      class_name: "helm.clients.vision_language.huggingface_vlm_client.HuggingFaceVLMClient"
+
+  - name: huggingface/llava-v1.6-34b-hf
+    model_name: uw-madison/llava-v1.6-34b-hf
+    tokenizer_name: hf-internal-testing/llama-tokenizer
+    max_sequence_length: 2048
+    client_spec:
+      class_name: "helm.clients.vision_language.huggingface_vlm_client.HuggingFaceVLMClient"
+
+  ## KAIST AI
+  - name: huggingface/prometheus-vision-13b-v1.0-hf
+    model_name: kaistai/prometheus-vision-13b-v1.0-hf
+    tokenizer_name: hf-internal-testing/llama-tokenizer
+    max_sequence_length: 2048
+    client_spec:
+      class_name: "helm.clients.vision_language.huggingface_vlm_client.HuggingFaceVLMClient"
+
   ## OpenFlamingo
   - name: openflamingo/OpenFlamingo-9B-vitl-mpt7b
     model_name: openflamingo/OpenFlamingo-9B-vitl-mpt7b
@@ -783,6 +992,50 @@ model_deployments:
       args:
         pretrained_model_name_or_path: openai-community/gpt2
 
+  ## SAIL (SEA AI Lab)
+  - name: sail/sailor-7b
+    model_name: sail/sailor-7b
+    tokenizer_name: qwen/qwen1.5-7b
+    max_sequence_length: 32768
+    client_spec:
+      class_name: "helm.clients.huggingface_client.HuggingFaceClient"
+
+  - name: sail/sailor-7b-chat
+    model_name: sail/sailor-7b-chat
+    tokenizer_name: qwen/qwen1.5-7b
+    max_sequence_length: 32768
+    client_spec:
+      class_name: "helm.clients.huggingface_client.HuggingFaceClient"
+
+  - name: sail/sailor-14b
+    model_name: sail/sailor-14b
+    tokenizer_name: qwen/qwen1.5-7b
+    max_sequence_length: 32768
+    client_spec:
+      class_name: "helm.clients.huggingface_client.HuggingFaceClient"
+
+  - name: sail/sailor-14b-chat
+    model_name: sail/sailor-14b-chat
+    tokenizer_name: qwen/qwen1.5-7b
+    max_sequence_length: 32768
+    client_spec:
+      class_name: "helm.clients.huggingface_client.HuggingFaceClient"
+
+  ## SCB10X
+  - name: huggingface/typhoon-v1.5-72b
+    model_name: scb10x/typhoon-v1.5-72b
+    tokenizer_name: qwen/qwen1.5-7b
+    max_sequence_length: 32768
+    client_spec:
+      class_name: "helm.clients.huggingface_client.HuggingFaceClient"
+
+  - name: huggingface/typhoon-v1.5-72b-instruct
+    model_name: scb10x/typhoon-v1.5-72b-instruct
+    tokenizer_name: qwen/qwen1.5-7b
+    max_sequence_length: 32768
+    client_spec:
+      class_name: "helm.clients.huggingface_client.HuggingFaceClient"
+
   ## StabilityAI
   - name: huggingface/stablelm-base-alpha-3b
     model_name: stabilityai/stablelm-base-alpha-3b
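
Note that all six SAIL and SCB10X deployments reuse qwen/qwen1.5-7b as their tokenizer rather than registering new ones, which fits Sailor and Typhoon being Qwen1.5 derivatives. A model that does need its own tokenizer (like aisingapore/sea-lion-7b earlier) gets a companion entry in helm/config/tokenizer_configs.yaml, which this release also touches (+100 -1). A sketch of the likely shape; the special-token values are placeholders, not read from the actual file:

tokenizer_configs:
  - name: aisingapore/sea-lion-7b
    tokenizer_spec:
      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
    end_of_text_token: "<|endoftext|>"  # placeholder: check the actual tokenizer
    prefix_token: "<|endoftext|>"       # placeholder: check the actual tokenizer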
@@ -963,6 +1216,15 @@ model_deployments:
       class_name: "helm.benchmark.window_services.image_generation.clip_window_service.CLIPWindowService"
 
   # HuggingFaceM4
+  - name: HuggingFaceM4/idefics2-8b
+    model_name: HuggingFaceM4/idefics2-8b
+    # From https://huggingface.co/docs/transformers/main/en/model_doc/idefics2,
+    # "constructs a IDEFICS2 processor which wraps a LLama tokenizer."
+    tokenizer_name: hf-internal-testing/llama-tokenizer
+    max_sequence_length: 2048
+    client_spec:
+      class_name: "helm.clients.vision_language.huggingface_vision2seq_client.HuggingFaceVision2SeqClient"
+
   - name: HuggingFaceM4/idefics-9b
     model_name: HuggingFaceM4/idefics-9b
     tokenizer_name: HuggingFaceM4/idefics-9b
@@ -1311,6 +1573,13 @@ model_deployments:
     client_spec:
       class_name: "helm.clients.openai_client.OpenAIClient"
 
+  - name: openai/gpt-4o-2024-05-13
+    model_name: openai/gpt-4o-2024-05-13
+    tokenizer_name: openai/o200k_base
+    max_sequence_length: 128000
+    client_spec:
+      class_name: "helm.clients.openai_client.OpenAIClient"
+
   - name: openai/gpt-4-vision-preview
     model_name: openai/gpt-4-vision-preview
     tokenizer_name: openai/cl100k_base
@@ -1320,6 +1589,15 @@ model_deployments:
     client_spec:
       class_name: "helm.clients.openai_client.OpenAIClient"
 
+  - name: openai/gpt-4-1106-vision-preview
+    model_name: openai/gpt-4-1106-vision-preview
+    tokenizer_name: openai/cl100k_base
+    max_sequence_length: 128000 # According to https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo
+    max_request_length: 128001
+    max_sequence_and_generated_tokens_length: 132096
+    client_spec:
+      class_name: "helm.clients.openai_client.OpenAIClient"
+
   ## Codex Models
   # DEPRECATED: Codex models have been shut down on March 23 2023.
 
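The three length fields in the gpt-4-1106-vision-preview entry show how the limits relate. The arithmetic below is taken from the entry itself; the interpretations in the comments are inferred from patterns elsewhere in this diff, not from HELM documentation:

max_sequence_length: 128000                       # the advertised 128K context window
max_request_length: 128001                        # +1 headroom, matching the cohere/command pattern (2019 vs. 2020)
max_sequence_and_generated_tokens_length: 132096  # 128000 + 4096 maximum output tokens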
@@ -1589,6 +1867,42 @@ model_deployments:
       args:
         together_model: meta-llama/Meta-Llama-3-70B
 
+  - name: together/llama-3-8b-chat
+    model_name: meta/llama-3-8b-chat
+    tokenizer_name: meta/llama-3-8b
+    max_sequence_length: 8191
+    client_spec:
+      class_name: "helm.clients.together_client.TogetherChatClient"
+      args:
+        together_model: meta-llama/Llama-3-8b-chat-hf
+
+  - name: together/llama-3-70b-chat
+    model_name: meta/llama-3-70b-chat
+    tokenizer_name: meta/llama-3-8b
+    max_sequence_length: 8191
+    client_spec:
+      class_name: "helm.clients.together_client.TogetherChatClient"
+      args:
+        together_model: meta-llama/Llama-3-70b-chat-hf
+
+  - name: together/llama-guard-7b
+    model_name: meta/llama-guard-7b
+    tokenizer_name: meta-llama/Llama-2-7b-hf
+    max_sequence_length: 2047
+    client_spec:
+      class_name: "helm.clients.together_client.TogetherClient"
+      args:
+        together_model: meta-llama/llama-guard-7b
+
+  - name: together/llama-guard-2-8b
+    model_name: meta/llama-guard-2-8b
+    tokenizer_name: meta/llama-3-8b
+    max_sequence_length: 4094
+    client_spec:
+      class_name: "helm.clients.together_client.TogetherClient"
+      args:
+        together_model: meta-llama/llamaguard-2-8b
+
   # 01.AI
   - name: together/yi-6b
     model_name: 01-ai/yi-6b
@@ -1608,6 +1922,38 @@ model_deployments:
       args:
         together_model: zero-one-ai/Yi-34B
 
+  - name: together/yi-6b-chat
+    model_name: 01-ai/yi-6b-chat
+    tokenizer_name: 01-ai/Yi-6B
+    max_sequence_length: 4095
+    client_spec:
+      class_name: "helm.clients.together_client.TogetherChatClient"
+      args:
+        together_model: zero-one-ai/Yi-6B-Chat
+
+  - name: together/yi-34b-chat
+    model_name: 01-ai/yi-34b-chat
+    tokenizer_name: 01-ai/Yi-6B
+    max_sequence_length: 4095
+    client_spec:
+      class_name: "helm.clients.together_client.TogetherChatClient"
+      args:
+        together_model: zero-one-ai/Yi-34B-Chat
+
+  - name: 01-ai/yi-large
+    model_name: 01-ai/yi-large
+    tokenizer_name: 01-ai/Yi-6B # Actual tokenizer is publicly unavailable, so use a substitute
+    max_sequence_length: 16000
+    client_spec:
+      class_name: "helm.clients.yi_client.YiChatClient"
+
+  - name: 01-ai/yi-large-preview
+    model_name: 01-ai/yi-large-preview
+    tokenizer_name: 01-ai/Yi-6B # Actual tokenizer is publicly unavailable, so use a substitute
+    max_sequence_length: 16000
+    client_spec:
+      class_name: "helm.clients.yi_client.YiChatClient"
+
 
   # Allen Institute for AI
   - name: together/olmo-7b
@@ -1629,8 +1975,16 @@ model_deployments:
     tokenizer_name: allenai/olmo-7b
     max_sequence_length: 2047
     client_spec:
-      class_name: "helm.clients.together_client.TogetherClient"
+      class_name: "helm.clients.together_client.TogetherChatClient"
 
+  - name: huggingface/olmo-1.7-7b
+    model_name: allenai/olmo-1.7-7b
+    tokenizer_name: allenai/OLMo-1.7-7B-hf
+    max_sequence_length: 2048
+    client_spec:
+      class_name: "helm.clients.huggingface_client.HuggingFaceClient"
+      args:
+        pretrained_model_name_or_path: allenai/OLMo-1.7-7B-hf
 
   ## MistralAI
   - name: together/mistral-7b-v0.1
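
The OLMo 1.7 entry illustrates that HELM-facing identifiers can diverge from the Hugging Face repository actually loaded: model_name allenai/olmo-1.7-7b is HELM's id, while args.pretrained_model_name_or_path points the client at the -hf converted checkpoint. Reduced to the relevant keys (the comments are interpretation, not copied from the file):

- name: huggingface/olmo-1.7-7b    # name used to select the deployment
  model_name: allenai/olmo-1.7-7b  # HELM metadata identifier
  client_spec:
    class_name: "helm.clients.huggingface_client.HuggingFaceClient"
    args:
      pretrained_model_name_or_path: allenai/OLMo-1.7-7B-hf  # repository actually downloaded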
@@ -1642,6 +1996,27 @@ model_deployments:
       args:
         together_model: mistralai/Mistral-7B-v0.1
 
+  - name: together/mistral-7b-instruct-v0.1
+    model_name: mistralai/mistral-7b-instruct-v0.1
+    tokenizer_name: mistralai/Mistral-7B-Instruct-v0.1
+    max_sequence_length: 4000
+    client_spec:
+      class_name: "helm.clients.together_client.TogetherChatClient"
+
+  - name: together/mistral-7b-instruct-v0.2
+    model_name: mistralai/mistral-7b-instruct-v0.2
+    tokenizer_name: mistralai/Mistral-7B-Instruct-v0.2
+    max_sequence_length: 32000
+    client_spec:
+      class_name: "helm.clients.together_client.TogetherChatClient"
+
+  - name: together/mistral-7b-instruct-v0.3
+    model_name: mistralai/mistral-7b-instruct-v0.3
+    tokenizer_name: mistralai/Mistral-7B-Instruct-v0.3
+    max_sequence_length: 32000
+    client_spec:
+      class_name: "helm.clients.together_client.TogetherChatClient"
+
   - name: together/mixtral-8x7b-32kseqlen
     model_name: mistralai/mixtral-8x7b-32kseqlen
     tokenizer_name: mistralai/Mistral-7B-v0.1
@@ -1656,7 +2031,7 @@ model_deployments:
     tokenizer_name: mistralai/Mistral-7B-v0.1
     max_sequence_length: 4095 # Subtract 1 token to work around a off-by-one bug in Together's input validation token counting (#2080)
     client_spec:
-      class_name: "helm.clients.together_client.TogetherClient"
+      class_name: "helm.clients.together_client.TogetherChatClient"
 
   - name: together/mixtral-8x22b
     model_name: mistralai/mixtral-8x22b
@@ -1665,6 +2040,22 @@ model_deployments:
     client_spec:
       class_name: "helm.clients.together_client.TogetherClient"
 
+  - name: together/mixtral-8x22b-instruct-v0.1
+    model_name: mistralai/mixtral-8x22b-instruct-v0.1
+    tokenizer_name: mistralai/Mistral-7B-v0.1
+    max_sequence_length: 65535
+    client_spec:
+      class_name: "helm.clients.together_client.TogetherChatClient"
+
+
+  ## Snowflake
+  - name: together/snowflake-arctic-instruct
+    model_name: snowflake/snowflake-arctic-instruct
+    tokenizer_name: snowflake/snowflake-arctic-instruct
+    max_sequence_length: 4000 # Lower than 4096 because of chat tokens
+    client_spec:
+      class_name: "helm.clients.together_client.TogetherChatClient"
+
   ## Stanford
   - name: together/alpaca-7b
     model_name: stanford/alpaca-7b
@@ -1880,12 +2271,21 @@ model_deployments:
     client_spec:
       class_name: "helm.clients.palmyra_client.PalmyraClient"
 
+  - name: writer/palmyra-vision-003
+    model_name: writer/palmyra-vision-003
+    tokenizer_name: writer/gpt2
+    max_sequence_length: 2048
+    max_sequence_and_generated_tokens_length: 2048
+    client_spec:
+      class_name: "helm.clients.vision_language.palmyra_vision_client.PalmyraVisionClient"
+
+
   # Qwen
 
   - name: together/qwen-7b
     model_name: qwen/qwen-7b
     tokenizer_name: qwen/qwen-7b
-    max_sequence_length: 8191
+    max_sequence_length: 32767
     client_spec:
       class_name: "helm.clients.together_client.TogetherClient"
       args:
@@ -1921,12 +2321,54 @@ model_deployments:
   - name: together/qwen1.5-72b
     model_name: qwen/qwen1.5-72b
     tokenizer_name: qwen/qwen1.5-7b
-    max_sequence_length: 4095
+    max_sequence_length: 32767
     client_spec:
       class_name: "helm.clients.together_client.TogetherClient"
       args:
         together_model: Qwen/Qwen1.5-72B
 
+  - name: together/qwen1.5-7b-chat
+    model_name: qwen/qwen1.5-7b-chat
+    tokenizer_name: qwen/qwen1.5-7b
+    max_sequence_length: 32767
+    client_spec:
+      class_name: "helm.clients.together_client.TogetherChatClient"
+
+  - name: together/qwen1.5-14b-chat
+    model_name: qwen/qwen1.5-14b-chat
+    tokenizer_name: qwen/qwen1.5-7b
+    max_sequence_length: 32767
+    client_spec:
+      class_name: "helm.clients.together_client.TogetherChatClient"
+
+  - name: together/qwen1.5-32b-chat
+    model_name: qwen/qwen1.5-32b-chat
+    tokenizer_name: qwen/qwen1.5-7b
+    max_sequence_length: 32767
+    client_spec:
+      class_name: "helm.clients.together_client.TogetherChatClient"
+
+  - name: together/qwen1.5-72b-chat
+    model_name: qwen/qwen1.5-72b-chat
+    tokenizer_name: qwen/qwen1.5-7b
+    max_sequence_length: 32767
+    client_spec:
+      class_name: "helm.clients.together_client.TogetherChatClient"
+
+  - name: together/qwen1.5-110b-chat
+    model_name: qwen/qwen1.5-110b-chat
+    tokenizer_name: qwen/qwen1.5-7b
+    max_sequence_length: 32767
+    client_spec:
+      class_name: "helm.clients.together_client.TogetherChatClient"
+
+  - name: together/qwen2-72b-instruct
+    model_name: qwen/qwen2-72b-instruct
+    tokenizer_name: qwen/qwen2-72b-instruct
+    max_sequence_length: 128000
+    client_spec:
+      class_name: "helm.clients.together_client.TogetherChatClient"
+
   - name: huggingface/qwen-vl
     model_name: qwen/qwen-vl
     tokenizer_name: qwen/qwen-vl
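
The five Qwen1.5 chat entries are identical apart from the model id. If the repetition ever became a maintenance burden, standard YAML anchors and merge keys could factor out the shared fields — purely a sketch, assuming HELM's loader is an ordinary YAML parser; the upstream file does not use anchors, and HELM's schema may not tolerate the extra top-level key:

qwen15-chat-defaults: &qwen15_chat  # hypothetical shared block
  tokenizer_name: qwen/qwen1.5-7b
  max_sequence_length: 32767
  client_spec:
    class_name: "helm.clients.together_client.TogetherChatClient"

model_deployments:
  - name: together/qwen1.5-7b-chat
    model_name: qwen/qwen1.5-7b-chat
    <<: *qwen15_chat                # merge key expands the shared fields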
@@ -1940,3 +2382,53 @@ model_deployments:
     max_sequence_length: 8191
     client_spec:
       class_name: "helm.clients.vision_language.qwen_vlm_client.QwenVLMClient"
+
+  # Reka
+  - name: reka/reka-core
+    model_name: reka/reka-core
+    tokenizer_name: openai/cl100k_base
+    max_sequence_length: 128000
+    client_spec:
+      class_name: "helm.clients.reka_client.RekaClient"
+
+  - name: reka/reka-core-20240415
+    model_name: reka/reka-core-20240415
+    tokenizer_name: openai/cl100k_base
+    max_sequence_length: 128000
+    client_spec:
+      class_name: "helm.clients.reka_client.RekaClient"
+
+  - name: reka/reka-core-20240501
+    model_name: reka/reka-core-20240501
+    tokenizer_name: openai/cl100k_base
+    max_sequence_length: 128000
+    client_spec:
+      class_name: "helm.clients.reka_client.RekaClient"
+
+  - name: reka/reka-flash
+    model_name: reka/reka-flash
+    tokenizer_name: openai/cl100k_base
+    max_sequence_length: 128000
+    client_spec:
+      class_name: "helm.clients.reka_client.RekaClient"
+
+  - name: reka/reka-flash-20240226
+    model_name: reka/reka-flash-20240226
+    tokenizer_name: openai/cl100k_base
+    max_sequence_length: 128000
+    client_spec:
+      class_name: "helm.clients.reka_client.RekaClient"
+
+  - name: reka/reka-edge
+    model_name: reka/reka-edge
+    tokenizer_name: openai/cl100k_base
+    max_sequence_length: 64000
+    client_spec:
+      class_name: "helm.clients.reka_client.RekaClient"
+
+  - name: reka/reka-edge-20240208
+    model_name: reka/reka-edge-20240208
+    tokenizer_name: openai/cl100k_base
+    max_sequence_length: 64000
+    client_spec:
+      class_name: "helm.clients.reka_client.RekaClient"
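
Finally, every model_name referenced above must resolve against helm/config/model_metadata.yaml (+579 -52 in this diff). For one of the new Reka deployments, the companion metadata entry plausibly looks like the sketch below; the field names follow HELM's metadata schema, but every value here is illustrative rather than copied from the actual file:

models:
  - name: reka/reka-core
    display_name: Reka Core                            # illustrative
    creator_organization_name: Reka AI                 # illustrative
    access: limited                                    # illustrative: API-gated model
    release_date: 2024-04-15                           # illustrative
    tags: [TEXT_MODEL_TAG, VISION_LANGUAGE_MODEL_TAG]  # guessed from HELM tag conventions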