crfm-helm 0.5.8__py3-none-any.whl → 0.5.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crfm-helm might be problematic. Click here for more details.

Files changed (121)
  1. {crfm_helm-0.5.8.dist-info → crfm_helm-0.5.9.dist-info}/METADATA +3 -1
  2. {crfm_helm-0.5.8.dist-info → crfm_helm-0.5.9.dist-info}/RECORD +117 -115
  3. helm/benchmark/adaptation/adapter_spec.py +5 -0
  4. helm/benchmark/metrics/bbq_metrics.py +12 -0
  5. helm/benchmark/metrics/evaluate_reference_metrics.py +12 -0
  6. helm/benchmark/metrics/safety_metrics.py +13 -1
  7. helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py +52 -0
  8. helm/benchmark/presentation/run_display.py +13 -3
  9. helm/benchmark/presentation/run_entry.py +2 -2
  10. helm/benchmark/run.py +1 -1
  11. helm/benchmark/run_specs/arabic_run_specs.py +6 -0
  12. helm/benchmark/run_specs/medhelm_run_specs.py +2 -2
  13. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +6 -2
  14. helm/benchmark/scenarios/anthropic_red_team_scenario.py +12 -1
  15. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +24 -54
  16. helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +19 -48
  17. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +22 -61
  18. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +21 -29
  19. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +21 -60
  20. helm/benchmark/scenarios/banking77_scenario.py +21 -0
  21. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  22. helm/benchmark/scenarios/bird_sql_scenario.py +18 -0
  23. helm/benchmark/scenarios/commonsense_scenario.py +7 -1
  24. helm/benchmark/scenarios/czech_bank_qa_scenario.py +18 -0
  25. helm/benchmark/scenarios/fin_qa_scenario.py +20 -0
  26. helm/benchmark/scenarios/financebench_scenario.py +21 -0
  27. helm/benchmark/scenarios/gsm_scenario.py +9 -3
  28. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +12 -1
  29. helm/benchmark/scenarios/harm_bench_scenario.py +12 -1
  30. helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +21 -0
  31. helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +19 -0
  32. helm/benchmark/scenarios/legalbench_scenario.py +6 -7
  33. helm/benchmark/scenarios/math_scenario.py +11 -4
  34. helm/benchmark/scenarios/med_qa_scenario.py +7 -1
  35. helm/benchmark/scenarios/medi_qa_scenario.py +2 -2
  36. helm/benchmark/scenarios/mmlu_scenario.py +8 -2
  37. helm/benchmark/scenarios/narrativeqa_scenario.py +3 -4
  38. helm/benchmark/scenarios/openai_mrcr_scenario.py +15 -0
  39. helm/benchmark/scenarios/ruler_qa_scenarios.py +40 -0
  40. helm/benchmark/scenarios/simple_safety_tests_scenario.py +12 -1
  41. helm/benchmark/scenarios/spider_scenario.py +18 -0
  42. helm/benchmark/scenarios/thai_exam_scenario.py +95 -0
  43. helm/benchmark/scenarios/wmt_14_scenario.py +9 -2
  44. helm/benchmark/static/schema_long_context.yaml +12 -31
  45. helm/benchmark/static_build/assets/audio-table-Dn5NMMeJ.png +0 -0
  46. helm/benchmark/static_build/assets/index-qOFpOyHb.js +10 -0
  47. helm/benchmark/static_build/assets/react-BteFIppM.js +85 -0
  48. helm/benchmark/static_build/assets/recharts-DxuQtTOs.js +97 -0
  49. helm/benchmark/static_build/assets/tremor-DR4fE7ko.js +10 -0
  50. helm/benchmark/static_build/index.html +5 -6
  51. helm/clients/ai21_client.py +2 -0
  52. helm/clients/aleph_alpha_client.py +2 -0
  53. helm/clients/anthropic_client.py +7 -1
  54. helm/clients/audio_language/diva_llama_client.py +2 -0
  55. helm/clients/audio_language/llama_omni_client.py +2 -1
  56. helm/clients/audio_language/qwen2_5_omni_client.py +2 -1
  57. helm/clients/audio_language/qwen2_audiolm_client.py +2 -1
  58. helm/clients/audio_language/qwen_audiolm_client.py +2 -1
  59. helm/clients/bedrock_client.py +2 -0
  60. helm/clients/cohere_client.py +3 -0
  61. helm/clients/google_client.py +2 -0
  62. helm/clients/http_model_client.py +2 -0
  63. helm/clients/huggingface_client.py +2 -1
  64. helm/clients/ibm_client.py +3 -1
  65. helm/clients/image_generation/adobe_vision_client.py +2 -0
  66. helm/clients/image_generation/aleph_alpha_image_generation_client.py +2 -0
  67. helm/clients/image_generation/cogview2_client.py +2 -1
  68. helm/clients/image_generation/dalle2_client.py +2 -0
  69. helm/clients/image_generation/dalle_mini_client.py +2 -1
  70. helm/clients/image_generation/deep_floyd_client.py +2 -0
  71. helm/clients/image_generation/huggingface_diffusers_client.py +2 -1
  72. helm/clients/image_generation/lexica_client.py +2 -0
  73. helm/clients/image_generation/mindalle_client.py +2 -1
  74. helm/clients/image_generation/together_image_generation_client.py +2 -0
  75. helm/clients/megatron_client.py +2 -0
  76. helm/clients/mistral_client.py +2 -0
  77. helm/clients/moderation_api_client.py +2 -0
  78. helm/clients/openai_client.py +5 -1
  79. helm/clients/palmyra_client.py +2 -1
  80. helm/clients/reka_client.py +2 -1
  81. helm/clients/stanfordhealthcare_azure_openai_client.py +2 -2
  82. helm/clients/stanfordhealthcare_http_model_client.py +2 -0
  83. helm/clients/together_client.py +4 -0
  84. helm/clients/vertexai_client.py +4 -0
  85. helm/clients/vision_language/huggingface_vision2seq_client.py +2 -1
  86. helm/clients/vision_language/huggingface_vlm_client.py +2 -0
  87. helm/clients/vision_language/idefics_client.py +2 -1
  88. helm/clients/vision_language/open_flamingo_client.py +2 -1
  89. helm/clients/vision_language/paligemma_client.py +2 -1
  90. helm/clients/vision_language/palmyra_vision_client.py +2 -0
  91. helm/clients/vision_language/qwen2_vlm_client.py +2 -1
  92. helm/clients/vision_language/qwen_vlm_client.py +2 -1
  93. helm/clients/writer_client.py +2 -0
  94. helm/common/hierarchical_logger.py +20 -0
  95. helm/common/optional_dependencies.py +1 -1
  96. helm/common/test_general.py +4 -0
  97. helm/config/model_deployments.yaml +225 -0
  98. helm/config/model_metadata.yaml +232 -7
  99. helm/config/tokenizer_configs.yaml +74 -4
  100. helm/benchmark/static_build/assets/index-671a5e06.js +0 -10
  101. helm/benchmark/static_build/assets/react-f82877fd.js +0 -85
  102. helm/benchmark/static_build/assets/recharts-4037aff0.js +0 -97
  103. helm/benchmark/static_build/assets/tremor-38a10867.js +0 -10
  104. {crfm_helm-0.5.8.dist-info → crfm_helm-0.5.9.dist-info}/WHEEL +0 -0
  105. {crfm_helm-0.5.8.dist-info → crfm_helm-0.5.9.dist-info}/entry_points.txt +0 -0
  106. {crfm_helm-0.5.8.dist-info → crfm_helm-0.5.9.dist-info}/licenses/LICENSE +0 -0
  107. {crfm_helm-0.5.8.dist-info → crfm_helm-0.5.9.dist-info}/top_level.txt +0 -0
  108. /helm/benchmark/static_build/assets/{air-overview-d2e6c49f.png → air-overview-DpBbyagA.png} +0 -0
  109. /helm/benchmark/static_build/assets/{crfm-logo-74391ab8.png → crfm-logo-Du4T1uWZ.png} +0 -0
  110. /helm/benchmark/static_build/assets/{heim-logo-3e5e3aa4.png → heim-logo-BJtQlEbV.png} +0 -0
  111. /helm/benchmark/static_build/assets/{helm-logo-simple-2ed5400b.png → helm-logo-simple-DzOhNN41.png} +0 -0
  112. /helm/benchmark/static_build/assets/{helm-safety-2907a7b6.png → helm-safety-COfndXuS.png} +0 -0
  113. /helm/benchmark/static_build/assets/{helmhero-28e90f4d.png → helmhero-D9TvmJsp.png} +0 -0
  114. /helm/benchmark/static_build/assets/{index-9352595e.css → index-oIeiQW2g.css} +0 -0
  115. /helm/benchmark/static_build/assets/{medhelm-overview-eac29843.png → medhelm-overview-CND0EIsy.png} +0 -0
  116. /helm/benchmark/static_build/assets/{medhelm-v1-overview-3ddfcd65.png → medhelm-v1-overview-Cu2tphBB.png} +0 -0
  117. /helm/benchmark/static_build/assets/{overview-74aea3d8.png → overview-BwypNWnk.png} +0 -0
  118. /helm/benchmark/static_build/assets/{process-flow-bd2eba96.png → process-flow-DWDJC733.png} +0 -0
  119. /helm/benchmark/static_build/assets/{vhelm-aspects-1437d673.png → vhelm-aspects-NiDQofvP.png} +0 -0
  120. /helm/benchmark/static_build/assets/{vhelm-framework-a1ca3f3f.png → vhelm-framework-NxJE4fdA.png} +0 -0
  121. /helm/benchmark/static_build/assets/{vhelm-model-8afb7616.png → vhelm-model-ypCL5Yvq.png} +0 -0
@@ -9,6 +9,7 @@ import requests
9
9
  from retrying import retry
10
10
 
11
11
  from helm.common.cache import CacheConfig
12
+ from helm.common.hierarchical_logger import hexception
12
13
  from helm.common.media_object import IMAGE_TYPE, TEXT_TYPE
13
14
  from helm.common.object_spec import get_class_by_name
14
15
  from helm.common.optional_dependencies import handle_module_not_found_error
@@ -273,6 +274,7 @@ class TogetherClient(CachingClient):
273
274
  try:
274
275
  response, cached = self.cache.get(cache_key, wrap_request_time(do_it_sync))
275
276
  except Exception as error:
277
+ hexception(error)
276
278
  return RequestResult(
277
279
  success=False,
278
280
  cached=False,
@@ -455,6 +457,7 @@ class TogetherChatClient(CachingClient):
455
457
  raw_response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
456
458
  response = ChatCompletionResponse.model_validate(raw_response)
457
459
  except Exception as error:
460
+ hexception(error)
458
461
  return RequestResult(
459
462
  success=False,
460
463
  cached=False,
@@ -562,6 +565,7 @@ class TogetherCompletionClient(CachingClient):
562
565
  raw_response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
563
566
  response = CompletionResponse.model_validate(raw_response)
564
567
  except Exception as error:
568
+ hexception(error)
565
569
  return RequestResult(
566
570
  success=False,
567
571
  cached=False,
@@ -4,6 +4,7 @@ from threading import Lock
4
4
  from typing import Any, Dict, Mapping, Optional, List, Union, cast
5
5
 
6
6
  from helm.common.cache import CacheConfig
7
+ from helm.common.hierarchical_logger import hexception
7
8
  from helm.common.multimodal_request_utils import get_contents_as_bytes
8
9
  from helm.common.media_object import TEXT_TYPE
9
10
  from helm.common.optional_dependencies import handle_module_not_found_error
@@ -152,6 +153,7 @@ class VertexAITextClient(VertexAIClient):
152
153
 
153
154
  response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
154
155
  except (requests.exceptions.RequestException, AssertionError) as e:
156
+ hexception(e)
155
157
  error: str = f"VertexAITextClient error: {e}"
156
158
  return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
157
159
 
@@ -310,6 +312,7 @@ class VertexAIChatClient(VertexAIClient):
310
312
  error_flags=ErrorFlags(is_retriable=False, is_fatal=False),
311
313
  )
312
314
  except (requests.exceptions.RequestException, AssertionError) as e:
315
+ hexception(e)
313
316
  error: str = f"VertexAITextClient error: {e}"
314
317
  return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
315
318
 
@@ -440,6 +443,7 @@ class VertexAIChatClient(VertexAIClient):
440
443
  cache_key = self.make_cache_key_with_safety_settings_preset(raw_cache_key, request)
441
444
  response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
442
445
  except requests.exceptions.RequestException as e:
446
+ hexception(e)
443
447
  error: str = f"Gemini Vision error: {e}"
444
448
  return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
445
449
  except VertexAIContentBlockedError as e:
@@ -8,7 +8,7 @@ import torch
8
8
 
9
9
  from helm.common.cache import CacheConfig
10
10
  from helm.common.gpu_utils import get_torch_device_name, is_cuda_available
11
- from helm.common.hierarchical_logger import hlog, htrack_block
11
+ from helm.common.hierarchical_logger import hexception, hlog, htrack_block
12
12
  from helm.common.media_object import TEXT_TYPE
13
13
  from helm.common.request import Request, RequestResult, GeneratedOutput, Token
14
14
  from helm.common.request import wrap_request_time
@@ -125,6 +125,7 @@ class HuggingFaceVision2SeqClient(CachingClient):
125
125
  )
126
126
  result, cached = self.cache.get(cache_key, wrap_request_time(do_it))
127
127
  except RuntimeError as model_error:
128
+ hexception(model_error)
128
129
  return RequestResult(success=False, cached=False, error=str(model_error), completions=[], embedding=[])
129
130
 
130
131
  for text in result["output"]:
@@ -5,6 +5,7 @@ from transformers import pipeline
5
5
  from transformers.pipelines import ImageToTextPipeline
6
6
 
7
7
  from helm.common.cache import CacheConfig
8
+ from helm.common.hierarchical_logger import hexception
8
9
  from helm.common.images_utils import open_image
9
10
  from helm.common.media_object import TEXT_TYPE
10
11
  from helm.common.optional_dependencies import handle_module_not_found_error
@@ -93,6 +94,7 @@ class HuggingFaceVLMClient(CachingClient):
93
94
  )
94
95
  result, cached = self.cache.get(cache_key, wrap_request_time(do_it))
95
96
  except RuntimeError as e:
97
+ hexception(e)
96
98
  return RequestResult(success=False, cached=False, error=str(e), completions=[], embedding=[])
97
99
 
98
100
  output: str = result["generated_text"]
@@ -8,7 +8,7 @@ from transformers import IdeficsForVisionText2Text, AutoProcessor, IdeficsProces
8
8
  from helm.common.cache import CacheConfig
9
9
  from helm.common.images_utils import open_image
10
10
  from helm.common.gpu_utils import get_torch_device_name
11
- from helm.common.hierarchical_logger import hlog, htrack_block
11
+ from helm.common.hierarchical_logger import hexception, hlog, htrack_block
12
12
  from helm.common.media_object import TEXT_TYPE
13
13
  from helm.common.optional_dependencies import handle_module_not_found_error
14
14
  from helm.common.request import Request, RequestResult, GeneratedOutput, Token
@@ -137,6 +137,7 @@ class IDEFICSClient(CachingClient):
137
137
  )
138
138
  result, cached = self.cache.get(cache_key, wrap_request_time(do_it))
139
139
  except RuntimeError as model_error:
140
+ hexception(model_error)
140
141
  return RequestResult(success=False, cached=False, error=str(model_error), completions=[], embedding=[])
141
142
 
142
143
  for text in result["output"]:
@@ -5,7 +5,7 @@ import torch
5
5
  from huggingface_hub import hf_hub_download
6
6
 
7
7
  from helm.common.cache import CacheConfig
8
- from helm.common.hierarchical_logger import hlog, htrack_block
8
+ from helm.common.hierarchical_logger import hexception, hlog, htrack_block
9
9
  from helm.common.images_utils import open_image
10
10
  from helm.common.gpu_utils import get_torch_device_name
11
11
  from helm.common.media_object import TEXT_TYPE
@@ -131,6 +131,7 @@ class OpenFlamingoClient(CachingClient):
131
131
  )
132
132
  result, cached = self.cache.get(cache_key, wrap_request_time(do_it))
133
133
  except RuntimeError as ex:
134
+ hexception(ex)
134
135
  return RequestResult(success=False, cached=False, error=str(ex), completions=[], embedding=[])
135
136
 
136
137
  completions: List[GeneratedOutput] = []
@@ -8,7 +8,7 @@ from transformers import AutoProcessor, PaliGemmaForConditionalGeneration
8
8
  from helm.common.cache import CacheConfig
9
9
  from helm.common.images_utils import open_image
10
10
  from helm.common.gpu_utils import get_torch_device_name
11
- from helm.common.hierarchical_logger import hlog, htrack_block
11
+ from helm.common.hierarchical_logger import hexception, hlog, htrack_block
12
12
  from helm.common.media_object import TEXT_TYPE
13
13
  from helm.common.optional_dependencies import handle_module_not_found_error
14
14
  from helm.common.request import Request, RequestResult, GeneratedOutput, Token
@@ -126,6 +126,7 @@ class PaliGemmaClient(CachingClient):
126
126
  result, cached = self.cache.get(cache_key, wrap_request_time(do_it))
127
127
  concat_results.append(result)
128
128
  except RuntimeError as model_error:
129
+ hexception(model_error)
129
130
  return RequestResult(success=False, cached=False, error=str(model_error), completions=[], embedding=[])
130
131
 
131
132
  for result in concat_results:
@@ -5,6 +5,7 @@ import requests
5
5
 
6
6
  from helm.common.cache import CacheConfig
7
7
  from helm.common.images_utils import encode_base64
8
+ from helm.common.hierarchical_logger import hexception
8
9
  from helm.common.media_object import TEXT_TYPE
9
10
  from helm.common.request import Request, RequestResult, GeneratedOutput, ErrorFlags
10
11
  from helm.common.request import wrap_request_time
@@ -76,6 +77,7 @@ class PalmyraVisionClient(CachingClient):
76
77
  )
77
78
  result, cached = self.cache.get(cache_key, wrap_request_time(do_it))
78
79
  except PalmyraVisionContentBlockedError as ex:
80
+ hexception(ex)
79
81
  return RequestResult(
80
82
  success=False,
81
83
  cached=False,
@@ -8,7 +8,7 @@ import torch
8
8
 
9
9
  from helm.common.cache import CacheConfig
10
10
  from helm.common.gpu_utils import get_torch_device_name
11
- from helm.common.hierarchical_logger import hlog, htrack_block
11
+ from helm.common.hierarchical_logger import hexception, hlog, htrack_block
12
12
  from helm.common.media_object import TEXT_TYPE
13
13
  from helm.common.request import Request, RequestResult, GeneratedOutput, Token
14
14
  from helm.common.request import wrap_request_time
@@ -157,6 +157,7 @@ class Qwen2VLMClient(CachingClient):
157
157
  )
158
158
  result, cached = self.cache.get(cache_key, wrap_request_time(do_it))
159
159
  except RuntimeError as model_error:
160
+ hexception(model_error)
160
161
  return RequestResult(
161
162
  success=False,
162
163
  cached=False,
@@ -7,7 +7,7 @@ from transformers.generation import GenerationConfig
7
7
 
8
8
  from helm.common.cache import CacheConfig
9
9
  from helm.common.gpu_utils import get_torch_device_name
10
- from helm.common.hierarchical_logger import hlog, htrack_block
10
+ from helm.common.hierarchical_logger import hexception, hlog, htrack_block
11
11
  from helm.common.media_object import TEXT_TYPE
12
12
  from helm.common.request import Request, RequestResult, GeneratedOutput, Token
13
13
  from helm.common.request import wrap_request_time
@@ -139,6 +139,7 @@ class QwenVLMClient(CachingClient):
139
139
  )
140
140
  result, cached = self.cache.get(cache_key, wrap_request_time(do_it))
141
141
  except RuntimeError as model_error:
142
+ hexception(model_error)
142
143
  return RequestResult(
143
144
  success=False, cached=False, error=str(model_error), completions=[], embedding=[]
144
145
  )
@@ -2,6 +2,7 @@ from typing import Any, Dict, List, Mapping, Optional
2
2
 
3
3
  from helm.clients.client import CachingClient
4
4
  from helm.common.cache import CacheConfig
5
+ from helm.common.hierarchical_logger import hexception
5
6
  from helm.common.optional_dependencies import handle_module_not_found_error
6
7
  from helm.common.request import wrap_request_time, Request, RequestResult, GeneratedOutput, Token
7
8
 
@@ -82,6 +83,7 @@ class WriterClient(CachingClient):
82
83
  raw_response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
83
84
  chat_completion: ChatCompletion = ChatCompletion.model_validate(raw_response)
84
85
  except Exception as error:
86
+ hexception(error)
85
87
  return RequestResult(
86
88
  success=False,
87
89
  cached=False,
@@ -64,6 +64,16 @@ class HierarchicalLogger(object):
64
64
  self.logger.warning(self.indent() + str(x), **kwargs)
65
65
  sys.stdout.flush()
66
66
 
67
+ def error(self, x: Any, **kwargs) -> None:
68
+ kwargs["stacklevel"] = kwargs.get("stacklevel", 1) + 1
69
+ self.logger.error(self.indent() + str(x), **kwargs)
70
+ sys.stdout.flush()
71
+
72
+ def exception(self, x: Any, **kwargs) -> None:
73
+ kwargs["stacklevel"] = kwargs.get("stacklevel", 1) + 1
74
+ self.logger.exception(self.indent() + str(x), **kwargs)
75
+ sys.stdout.flush()
76
+
67
77
 
68
78
  def format_time(s: float) -> str:
69
79
  """Return a nice string representation of `s` seconds."""
@@ -96,6 +106,16 @@ def hwarn(x: Any, **kwargs) -> None:
96
106
  singleton.warn(x, **kwargs)
97
107
 
98
108
 
109
+ def herror(x: Any, **kwargs) -> None:
110
+ kwargs["stacklevel"] = kwargs.get("stacklevel", 1) + 1
111
+ singleton.error(x, **kwargs)
112
+
113
+
114
+ def hexception(x: Any, **kwargs) -> None:
115
+ kwargs["stacklevel"] = kwargs.get("stacklevel", 1) + 1
116
+ singleton.exception(x, **kwargs)
117
+
118
+
99
119
  class htrack_block:
100
120
  def __init__(self, x: Any, stacklevel=1) -> None:
101
121
  self._stacklevel = stacklevel + 1
@@ -9,7 +9,7 @@ def handle_module_not_found_error(e: ModuleNotFoundError, suggestions: Optional[
9
9
  # TODO: Ask user to install more specific optional dependencies
10
10
  # e.g. crfm-helm[plots] or crfm-helm[server]
11
11
  suggested_commands = " or ".join(
12
- [f"`pip install crfm-helm[{suggestion}]`" for suggestion in (suggestions or []) + ["all"]]
12
+ [f'`pip install "crfm-helm[{suggestion}]"`' for suggestion in (suggestions or []) + ["all"]]
13
13
  )
14
14
  raise OptionalDependencyNotInstalled(
15
15
  f"Optional dependency {e.name} is not installed. Please run {suggested_commands} to install it."
@@ -1,5 +1,8 @@
1
1
  import shutil
2
2
  import os
3
+
4
+ import pytest
5
+
3
6
  from helm.common.general import (
4
7
  ensure_file_downloaded,
5
8
  format_tags,
@@ -12,6 +15,7 @@ from helm.common.general import (
12
15
 
13
16
 
14
17
  def test_ensure_file_downloaded():
18
+ pytest.skip("Skipping download tests because these downloads are not reliable and may be throttled")
15
19
  ensure_file_downloaded("https://ftp.gnu.org/gnu/tar/tar-1.34.tar.gz", "test-tar", unpack=True, unpack_type="untar")
16
20
  assert os.path.isdir("test-tar")
17
21
  shutil.rmtree("test-tar")
@@ -730,6 +730,13 @@ model_deployments:
730
730
  thinking_budget_tokens: 10000
731
731
  stream: true
732
732
 
733
+ - name: anthropic/claude-sonnet-4-5-20250929
734
+ model_name: anthropic/claude-sonnet-4-5-20250929
735
+ tokenizer_name: anthropic/claude
736
+ max_sequence_length: 200000
737
+ client_spec:
738
+ class_name: "helm.clients.anthropic_client.AnthropicMessagesClient"
739
+
733
740
  - name: anthropic/stanford-online-all-v4-s3
734
741
  deprecated: true # Closed model, not accessible via API
735
742
  model_name: anthropic/stanford-online-all-v4-s3
@@ -861,6 +868,20 @@ model_deployments:
861
868
  parse_thinking: true
862
869
  disable_logprobs: True
863
870
 
871
+ - name: together/deepseek-r1-distill-llama-70b
872
+ model_name: deepseek-ai/deepseek-r1-distill-llama-70b
873
+ tokenizer_name: deepseek-ai/deepseek-r1-distill-llama-70b
874
+ max_sequence_length: 131072
875
+ client_spec:
876
+ class_name: "helm.clients.together_client.TogetherChatClient"
877
+
878
+ - name: together/deepseek-r1-distill-qwen-14b
879
+ model_name: deepseek-ai/deepseek-r1-distill-qwen-14b
880
+ tokenizer_name: deepseek-ai/deepseek-r1-distill-qwen-14b
881
+ max_sequence_length: 131072
882
+ client_spec:
883
+ class_name: "helm.clients.together_client.TogetherChatClient"
884
+
864
885
  # Gooseai
865
886
 
866
887
  # TODO: Migrate these models to use OpenAIClient
@@ -3873,6 +3894,15 @@ model_deployments:
3873
3894
  args:
3874
3895
  parse_thinking: true
3875
3896
 
3897
+ - name: together/qwen3-next-80b-a3b-thinking
3898
+ model_name: qwen/qwen3-next-80b-a3b-thinking
3899
+ tokenizer_name: qwen/qwen3-next-80b-a3b-thinking
3900
+ max_sequence_length: 262144
3901
+ client_spec:
3902
+ class_name: "helm.clients.together_client.TogetherChatClient"
3903
+ args:
3904
+ parse_thinking: true
3905
+
3876
3906
  - name: together/qwen3-235b-a22b-instruct-2507-fp8
3877
3907
  model_name: qwen/qwen3-235b-a22b-instruct-2507-fp8
3878
3908
  tokenizer_name: qwen/qwen3-235b-a22b-instruct-2507-fp8
@@ -4321,6 +4351,201 @@ model_deployments:
4321
4351
  args:
4322
4352
  pretrained_model_name_or_path: deepseek-ai/deepseek-coder-6.7b-instruct
4323
4353
 
4354
+ # AceGPT-v2
4355
+ - name: huggingface/acegpt-v2-8b-chat
4356
+ model_name: freedomintelligence/acegpt-v2-8b-chat
4357
+ tokenizer_name: freedomintelligence/acegpt-v2-8b-chat
4358
+ max_sequence_length: 8192
4359
+ client_spec:
4360
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
4361
+ args:
4362
+ device_map: auto
4363
+
4364
+ - name: huggingface/acegpt-v2-32b-chat
4365
+ model_name: freedomintelligence/acegpt-v2-32b-chat
4366
+ tokenizer_name: freedomintelligence/acegpt-v2-32b-chat
4367
+ max_sequence_length: 32768
4368
+ client_spec:
4369
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
4370
+ args:
4371
+ device_map: auto
4372
+
4373
+ - name: huggingface/acegpt-v2-70b-chat
4374
+ model_name: freedomintelligence/acegpt-v2-70b-chat
4375
+ tokenizer_name: freedomintelligence/acegpt-v2-70b-chat
4376
+ max_sequence_length: 8192
4377
+ client_spec:
4378
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
4379
+ args:
4380
+ device_map: auto
4381
+
4382
+ # ALLaM
4383
+ - name: huggingface/allam-7b-instruct-preview
4384
+ model_name: allam-ai/allam-7b-instruct-preview
4385
+ tokenizer_name: allam-ai/allam-7b-instruct-preview
4386
+ max_sequence_length: 4096
4387
+ client_spec:
4388
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
4389
+ args:
4390
+ device_map: auto
4391
+
4392
+ # SILMA
4393
+ - name: huggingface/silma-9b-instruct-v1.0
4394
+ model_name: silma-ai/silma-9b-instruct-v1.0
4395
+ tokenizer_name: silma-ai/silma-9b-instruct-v1.0
4396
+ max_sequence_length: 8192
4397
+ client_spec:
4398
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
4399
+ args:
4400
+ device_map: auto
4401
+
4402
+ # Jais Family
4403
+ #
4404
+ # NOTE: Jais Family models require `transformers<=4.52.3`.
4405
+ # On more recent versions of transformers, one of the following errors might occur:
4406
+ #
4407
+ # File "/path/to/site-packages/transformers/models/gemma3n/configuration_gemma3n.py", line 31, in <module>
4408
+ # from timm.data import ImageNetInfo, infer_imagenet_subset
4409
+ # ImportError: cannot import name 'ImageNetInfo' from 'timm.data' (/path/to/site-packages/timm/data/__init__.py)
4410
+ #
4411
+ # File "/path/to/.cache/huggingface/modules/transformers_modules/inceptionai/jais-family-590m-chat/90ac4769212b4964c6e81e183140224628228365/modeling_jais.py", line 899, in forward
4412
+ # past_length = past_key_values[0][0].size(-2)
4413
+ # AttributeError: 'NoneType' object has no attribute 'size'
4414
+
4415
+ - name: huggingface/jais-family-590m-chat
4416
+ model_name: inceptionai/jais-family-590m-chat
4417
+ tokenizer_name: inceptionai/jais-family-590m-chat
4418
+ max_sequence_length: 2048
4419
+ client_spec:
4420
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
4421
+ args:
4422
+ trust_remote_code: true
4423
+ revision: 90ac4769212b4964c6e81e183140224628228365
4424
+
4425
+ - name: huggingface/jais-family-1p3b-chat
4426
+ model_name: inceptionai/jais-family-1p3b-chat
4427
+ tokenizer_name: inceptionai/jais-family-590m-chat
4428
+ max_sequence_length: 2048
4429
+ client_spec:
4430
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
4431
+ args:
4432
+ trust_remote_code: true
4433
+ revision: 4b93176e2cb00f369b3bc0a8786e4cf16260c804
4434
+
4435
+ - name: huggingface/jais-family-2p7b-chat
4436
+ model_name: inceptionai/jais-family-2p7b-chat
4437
+ tokenizer_name: inceptionai/jais-family-590m-chat
4438
+ max_sequence_length: 2048
4439
+ client_spec:
4440
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
4441
+ args:
4442
+ trust_remote_code: true
4443
+ revision: b2bf5d1bcd969ce868f66fb1ad8c3480289ea6b2
4444
+
4445
+ - name: huggingface/jais-family-6p7b-chat
4446
+ model_name: inceptionai/jais-family-6p7b-chat
4447
+ tokenizer_name: inceptionai/jais-family-590m-chat
4448
+ max_sequence_length: 2048
4449
+ client_spec:
4450
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
4451
+ args:
4452
+ device_map: auto
4453
+ trust_remote_code: true
4454
+ revision: 683805efe6126c6536feb4aa23317e70222ac94c
4455
+
4456
+ - name: huggingface/jais-family-13b-chat
4457
+ model_name: inceptionai/jais-family-13b-chat
4458
+ tokenizer_name: inceptionai/jais-family-590m-chat
4459
+ max_sequence_length: 2048
4460
+ client_spec:
4461
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
4462
+ args:
4463
+ device_map: auto
4464
+ trust_remote_code: true
4465
+ revision: 0ef8b4f80429609890816d912b331d3b95864707
4466
+
4467
+ - name: huggingface/jais-family-30b-8k-chat
4468
+ model_name: inceptionai/jais-family-30b-8k-chat
4469
+ tokenizer_name: inceptionai/jais-family-590m-chat
4470
+ max_sequence_length: 8192
4471
+ client_spec:
4472
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
4473
+ args:
4474
+ device_map: auto
4475
+ trust_remote_code: true
4476
+ revision: dab185164dd3b79ec9201d7f4cf878ce91ae7e14
4477
+
4478
+ - name: huggingface/jais-family-30b-16k-chat
4479
+ model_name: inceptionai/jais-family-30b-16k-chat
4480
+ tokenizer_name: inceptionai/jais-family-590m-chat
4481
+ max_sequence_length: 16384
4482
+ client_spec:
4483
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
4484
+ args:
4485
+ device_map: auto
4486
+ trust_remote_code: true
4487
+ revision: 369f88eeee4d313155f1b1dca4ebec90f9f9f2a4
4488
+
4489
+ # Jais Adapted
4490
+ - name: huggingface/jais-adapted-7b-chat
4491
+ model_name: inceptionai/jais-adapted-7b-chat
4492
+ tokenizer_name: inceptionai/jais-adapted-7b-chat
4493
+ max_sequence_length: 4096
4494
+ client_spec:
4495
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
4496
+ args:
4497
+ device_map: auto
4498
+
4499
+ - name: huggingface/jais-adapted-13b-chat
4500
+ model_name: inceptionai/jais-adapted-13b-chat
4501
+ tokenizer_name: inceptionai/jais-adapted-7b-chat
4502
+ max_sequence_length: 4096
4503
+ client_spec:
4504
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
4505
+ args:
4506
+ device_map: auto
4507
+
4508
+ - name: huggingface/jais-adapted-70b-chat
4509
+ model_name: inceptionai/jais-adapted-70b-chat
4510
+ tokenizer_name: inceptionai/jais-adapted-7b-chat
4511
+ max_sequence_length: 4096
4512
+ client_spec:
4513
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
4514
+ args:
4515
+ device_map: auto
4516
+
4517
+ - name: huggingface/falcon3-1b-instruct
4518
+ model_name: tiiuae/falcon3-1b-instruct
4519
+ tokenizer_name: tiiuae/falcon3-1b-instruct
4520
+ max_sequence_length: 8192
4521
+ client_spec:
4522
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
4523
+
4524
+ - name: huggingface/falcon3-3b-instruct
4525
+ model_name: tiiuae/falcon3-3b-instruct
4526
+ tokenizer_name: tiiuae/falcon3-1b-instruct
4527
+ max_sequence_length: 32768
4528
+ client_spec:
4529
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
4530
+
4531
+ - name: huggingface/falcon3-7b-instruct
4532
+ model_name: tiiuae/falcon3-7b-instruct
4533
+ tokenizer_name: tiiuae/falcon3-7b-instruct
4534
+ max_sequence_length: 32768
4535
+ client_spec:
4536
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
4537
+ args:
4538
+ device_map: auto
4539
+
4540
+ - name: huggingface/falcon3-10b-instruct
4541
+ model_name: tiiuae/falcon3-10b-instruct
4542
+ tokenizer_name: tiiuae/falcon3-1b-instruct
4543
+ max_sequence_length: 32768
4544
+ client_spec:
4545
+ class_name: "helm.clients.huggingface_client.HuggingFaceClient"
4546
+ args:
4547
+ device_map: auto
4548
+
4324
4549
  # IBM WatsonX
4325
4550
  - name: ibm/llama-3.3-70b-instruct
4326
4551
  model_name: meta/llama-3.3-70b-instruct