crfm-helm 0.5.8__py3-none-any.whl → 0.5.9__py3-none-any.whl
This diff shows the changes between publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.
Potentially problematic release: this version of crfm-helm might be problematic.
- {crfm_helm-0.5.8.dist-info → crfm_helm-0.5.9.dist-info}/METADATA +3 -1
- {crfm_helm-0.5.8.dist-info → crfm_helm-0.5.9.dist-info}/RECORD +117 -115
- helm/benchmark/adaptation/adapter_spec.py +5 -0
- helm/benchmark/metrics/bbq_metrics.py +12 -0
- helm/benchmark/metrics/evaluate_reference_metrics.py +12 -0
- helm/benchmark/metrics/safety_metrics.py +13 -1
- helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py +52 -0
- helm/benchmark/presentation/run_display.py +13 -3
- helm/benchmark/presentation/run_entry.py +2 -2
- helm/benchmark/run.py +1 -1
- helm/benchmark/run_specs/arabic_run_specs.py +6 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +2 -2
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +6 -2
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +12 -1
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +24 -54
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +19 -48
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +22 -61
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +21 -29
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +21 -60
- helm/benchmark/scenarios/banking77_scenario.py +21 -0
- helm/benchmark/scenarios/bbq_scenario.py +1 -1
- helm/benchmark/scenarios/bird_sql_scenario.py +18 -0
- helm/benchmark/scenarios/commonsense_scenario.py +7 -1
- helm/benchmark/scenarios/czech_bank_qa_scenario.py +18 -0
- helm/benchmark/scenarios/fin_qa_scenario.py +20 -0
- helm/benchmark/scenarios/financebench_scenario.py +21 -0
- helm/benchmark/scenarios/gsm_scenario.py +9 -3
- helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +12 -1
- helm/benchmark/scenarios/harm_bench_scenario.py +12 -1
- helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +21 -0
- helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +19 -0
- helm/benchmark/scenarios/legalbench_scenario.py +6 -7
- helm/benchmark/scenarios/math_scenario.py +11 -4
- helm/benchmark/scenarios/med_qa_scenario.py +7 -1
- helm/benchmark/scenarios/medi_qa_scenario.py +2 -2
- helm/benchmark/scenarios/mmlu_scenario.py +8 -2
- helm/benchmark/scenarios/narrativeqa_scenario.py +3 -4
- helm/benchmark/scenarios/openai_mrcr_scenario.py +15 -0
- helm/benchmark/scenarios/ruler_qa_scenarios.py +40 -0
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +12 -1
- helm/benchmark/scenarios/spider_scenario.py +18 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +95 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +9 -2
- helm/benchmark/static/schema_long_context.yaml +12 -31
- helm/benchmark/static_build/assets/audio-table-Dn5NMMeJ.png +0 -0
- helm/benchmark/static_build/assets/index-qOFpOyHb.js +10 -0
- helm/benchmark/static_build/assets/react-BteFIppM.js +85 -0
- helm/benchmark/static_build/assets/recharts-DxuQtTOs.js +97 -0
- helm/benchmark/static_build/assets/tremor-DR4fE7ko.js +10 -0
- helm/benchmark/static_build/index.html +5 -6
- helm/clients/ai21_client.py +2 -0
- helm/clients/aleph_alpha_client.py +2 -0
- helm/clients/anthropic_client.py +7 -1
- helm/clients/audio_language/diva_llama_client.py +2 -0
- helm/clients/audio_language/llama_omni_client.py +2 -1
- helm/clients/audio_language/qwen2_5_omni_client.py +2 -1
- helm/clients/audio_language/qwen2_audiolm_client.py +2 -1
- helm/clients/audio_language/qwen_audiolm_client.py +2 -1
- helm/clients/bedrock_client.py +2 -0
- helm/clients/cohere_client.py +3 -0
- helm/clients/google_client.py +2 -0
- helm/clients/http_model_client.py +2 -0
- helm/clients/huggingface_client.py +2 -1
- helm/clients/ibm_client.py +3 -1
- helm/clients/image_generation/adobe_vision_client.py +2 -0
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +2 -0
- helm/clients/image_generation/cogview2_client.py +2 -1
- helm/clients/image_generation/dalle2_client.py +2 -0
- helm/clients/image_generation/dalle_mini_client.py +2 -1
- helm/clients/image_generation/deep_floyd_client.py +2 -0
- helm/clients/image_generation/huggingface_diffusers_client.py +2 -1
- helm/clients/image_generation/lexica_client.py +2 -0
- helm/clients/image_generation/mindalle_client.py +2 -1
- helm/clients/image_generation/together_image_generation_client.py +2 -0
- helm/clients/megatron_client.py +2 -0
- helm/clients/mistral_client.py +2 -0
- helm/clients/moderation_api_client.py +2 -0
- helm/clients/openai_client.py +5 -1
- helm/clients/palmyra_client.py +2 -1
- helm/clients/reka_client.py +2 -1
- helm/clients/stanfordhealthcare_azure_openai_client.py +2 -2
- helm/clients/stanfordhealthcare_http_model_client.py +2 -0
- helm/clients/together_client.py +4 -0
- helm/clients/vertexai_client.py +4 -0
- helm/clients/vision_language/huggingface_vision2seq_client.py +2 -1
- helm/clients/vision_language/huggingface_vlm_client.py +2 -0
- helm/clients/vision_language/idefics_client.py +2 -1
- helm/clients/vision_language/open_flamingo_client.py +2 -1
- helm/clients/vision_language/paligemma_client.py +2 -1
- helm/clients/vision_language/palmyra_vision_client.py +2 -0
- helm/clients/vision_language/qwen2_vlm_client.py +2 -1
- helm/clients/vision_language/qwen_vlm_client.py +2 -1
- helm/clients/writer_client.py +2 -0
- helm/common/hierarchical_logger.py +20 -0
- helm/common/optional_dependencies.py +1 -1
- helm/common/test_general.py +4 -0
- helm/config/model_deployments.yaml +225 -0
- helm/config/model_metadata.yaml +232 -7
- helm/config/tokenizer_configs.yaml +74 -4
- helm/benchmark/static_build/assets/index-671a5e06.js +0 -10
- helm/benchmark/static_build/assets/react-f82877fd.js +0 -85
- helm/benchmark/static_build/assets/recharts-4037aff0.js +0 -97
- helm/benchmark/static_build/assets/tremor-38a10867.js +0 -10
- {crfm_helm-0.5.8.dist-info → crfm_helm-0.5.9.dist-info}/WHEEL +0 -0
- {crfm_helm-0.5.8.dist-info → crfm_helm-0.5.9.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.8.dist-info → crfm_helm-0.5.9.dist-info}/licenses/LICENSE +0 -0
- {crfm_helm-0.5.8.dist-info → crfm_helm-0.5.9.dist-info}/top_level.txt +0 -0
- /helm/benchmark/static_build/assets/{air-overview-d2e6c49f.png → air-overview-DpBbyagA.png} +0 -0
- /helm/benchmark/static_build/assets/{crfm-logo-74391ab8.png → crfm-logo-Du4T1uWZ.png} +0 -0
- /helm/benchmark/static_build/assets/{heim-logo-3e5e3aa4.png → heim-logo-BJtQlEbV.png} +0 -0
- /helm/benchmark/static_build/assets/{helm-logo-simple-2ed5400b.png → helm-logo-simple-DzOhNN41.png} +0 -0
- /helm/benchmark/static_build/assets/{helm-safety-2907a7b6.png → helm-safety-COfndXuS.png} +0 -0
- /helm/benchmark/static_build/assets/{helmhero-28e90f4d.png → helmhero-D9TvmJsp.png} +0 -0
- /helm/benchmark/static_build/assets/{index-9352595e.css → index-oIeiQW2g.css} +0 -0
- /helm/benchmark/static_build/assets/{medhelm-overview-eac29843.png → medhelm-overview-CND0EIsy.png} +0 -0
- /helm/benchmark/static_build/assets/{medhelm-v1-overview-3ddfcd65.png → medhelm-v1-overview-Cu2tphBB.png} +0 -0
- /helm/benchmark/static_build/assets/{overview-74aea3d8.png → overview-BwypNWnk.png} +0 -0
- /helm/benchmark/static_build/assets/{process-flow-bd2eba96.png → process-flow-DWDJC733.png} +0 -0
- /helm/benchmark/static_build/assets/{vhelm-aspects-1437d673.png → vhelm-aspects-NiDQofvP.png} +0 -0
- /helm/benchmark/static_build/assets/{vhelm-framework-a1ca3f3f.png → vhelm-framework-NxJE4fdA.png} +0 -0
- /helm/benchmark/static_build/assets/{vhelm-model-8afb7616.png → vhelm-model-ypCL5Yvq.png} +0 -0
helm/clients/together_client.py
CHANGED
@@ -9,6 +9,7 @@ import requests
 from retrying import retry
 
 from helm.common.cache import CacheConfig
+from helm.common.hierarchical_logger import hexception
 from helm.common.media_object import IMAGE_TYPE, TEXT_TYPE
 from helm.common.object_spec import get_class_by_name
 from helm.common.optional_dependencies import handle_module_not_found_error
@@ -273,6 +274,7 @@ class TogetherClient(CachingClient):
         try:
             response, cached = self.cache.get(cache_key, wrap_request_time(do_it_sync))
         except Exception as error:
+            hexception(error)
             return RequestResult(
                 success=False,
                 cached=False,
@@ -455,6 +457,7 @@ class TogetherChatClient(CachingClient):
             raw_response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
             response = ChatCompletionResponse.model_validate(raw_response)
         except Exception as error:
+            hexception(error)
             return RequestResult(
                 success=False,
                 cached=False,
@@ -562,6 +565,7 @@ class TogetherCompletionClient(CachingClient):
             raw_response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
             response = CompletionResponse.model_validate(raw_response)
         except Exception as error:
+            hexception(error)
             return RequestResult(
                 success=False,
                 cached=False,
helm/clients/vertexai_client.py
CHANGED
@@ -4,6 +4,7 @@ from threading import Lock
 from typing import Any, Dict, Mapping, Optional, List, Union, cast
 
 from helm.common.cache import CacheConfig
+from helm.common.hierarchical_logger import hexception
 from helm.common.multimodal_request_utils import get_contents_as_bytes
 from helm.common.media_object import TEXT_TYPE
 from helm.common.optional_dependencies import handle_module_not_found_error
@@ -152,6 +153,7 @@ class VertexAITextClient(VertexAIClient):
 
             response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
         except (requests.exceptions.RequestException, AssertionError) as e:
+            hexception(e)
            error: str = f"VertexAITextClient error: {e}"
            return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
 
@@ -310,6 +312,7 @@ class VertexAIChatClient(VertexAIClient):
                error_flags=ErrorFlags(is_retriable=False, is_fatal=False),
            )
         except (requests.exceptions.RequestException, AssertionError) as e:
+            hexception(e)
            error: str = f"VertexAITextClient error: {e}"
            return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
 
@@ -440,6 +443,7 @@ class VertexAIChatClient(VertexAIClient):
            cache_key = self.make_cache_key_with_safety_settings_preset(raw_cache_key, request)
            response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
         except requests.exceptions.RequestException as e:
+            hexception(e)
            error: str = f"Gemini Vision error: {e}"
            return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
         except VertexAIContentBlockedError as e:
helm/clients/vision_language/huggingface_vision2seq_client.py
CHANGED
@@ -8,7 +8,7 @@ import torch
 
 from helm.common.cache import CacheConfig
 from helm.common.gpu_utils import get_torch_device_name, is_cuda_available
-from helm.common.hierarchical_logger import hlog, htrack_block
+from helm.common.hierarchical_logger import hexception, hlog, htrack_block
 from helm.common.media_object import TEXT_TYPE
 from helm.common.request import Request, RequestResult, GeneratedOutput, Token
 from helm.common.request import wrap_request_time
@@ -125,6 +125,7 @@ class HuggingFaceVision2SeqClient(CachingClient):
             )
             result, cached = self.cache.get(cache_key, wrap_request_time(do_it))
         except RuntimeError as model_error:
+            hexception(model_error)
             return RequestResult(success=False, cached=False, error=str(model_error), completions=[], embedding=[])
 
         for text in result["output"]:
|
|
|
5
5
|
from transformers.pipelines import ImageToTextPipeline
|
|
6
6
|
|
|
7
7
|
from helm.common.cache import CacheConfig
|
|
8
|
+
from helm.common.hierarchical_logger import hexception
|
|
8
9
|
from helm.common.images_utils import open_image
|
|
9
10
|
from helm.common.media_object import TEXT_TYPE
|
|
10
11
|
from helm.common.optional_dependencies import handle_module_not_found_error
|
|
@@ -93,6 +94,7 @@ class HuggingFaceVLMClient(CachingClient):
|
|
|
93
94
|
)
|
|
94
95
|
result, cached = self.cache.get(cache_key, wrap_request_time(do_it))
|
|
95
96
|
except RuntimeError as e:
|
|
97
|
+
hexception(e)
|
|
96
98
|
return RequestResult(success=False, cached=False, error=str(e), completions=[], embedding=[])
|
|
97
99
|
|
|
98
100
|
output: str = result["generated_text"]
|
|
helm/clients/vision_language/idefics_client.py
CHANGED
@@ -8,7 +8,7 @@ from transformers import IdeficsForVisionText2Text, AutoProcessor, IdeficsProces
 from helm.common.cache import CacheConfig
 from helm.common.images_utils import open_image
 from helm.common.gpu_utils import get_torch_device_name
-from helm.common.hierarchical_logger import hlog, htrack_block
+from helm.common.hierarchical_logger import hexception, hlog, htrack_block
 from helm.common.media_object import TEXT_TYPE
 from helm.common.optional_dependencies import handle_module_not_found_error
 from helm.common.request import Request, RequestResult, GeneratedOutput, Token
@@ -137,6 +137,7 @@ class IDEFICSClient(CachingClient):
             )
             result, cached = self.cache.get(cache_key, wrap_request_time(do_it))
         except RuntimeError as model_error:
+            hexception(model_error)
             return RequestResult(success=False, cached=False, error=str(model_error), completions=[], embedding=[])
 
         for text in result["output"]:
helm/clients/vision_language/open_flamingo_client.py
CHANGED
@@ -5,7 +5,7 @@ import torch
 from huggingface_hub import hf_hub_download
 
 from helm.common.cache import CacheConfig
-from helm.common.hierarchical_logger import hlog, htrack_block
+from helm.common.hierarchical_logger import hexception, hlog, htrack_block
 from helm.common.images_utils import open_image
 from helm.common.gpu_utils import get_torch_device_name
 from helm.common.media_object import TEXT_TYPE
@@ -131,6 +131,7 @@ class OpenFlamingoClient(CachingClient):
             )
             result, cached = self.cache.get(cache_key, wrap_request_time(do_it))
         except RuntimeError as ex:
+            hexception(ex)
             return RequestResult(success=False, cached=False, error=str(ex), completions=[], embedding=[])
 
         completions: List[GeneratedOutput] = []
helm/clients/vision_language/paligemma_client.py
CHANGED
@@ -8,7 +8,7 @@ from transformers import AutoProcessor, PaliGemmaForConditionalGeneration
 from helm.common.cache import CacheConfig
 from helm.common.images_utils import open_image
 from helm.common.gpu_utils import get_torch_device_name
-from helm.common.hierarchical_logger import hlog, htrack_block
+from helm.common.hierarchical_logger import hexception, hlog, htrack_block
 from helm.common.media_object import TEXT_TYPE
 from helm.common.optional_dependencies import handle_module_not_found_error
 from helm.common.request import Request, RequestResult, GeneratedOutput, Token
@@ -126,6 +126,7 @@ class PaliGemmaClient(CachingClient):
                 result, cached = self.cache.get(cache_key, wrap_request_time(do_it))
                 concat_results.append(result)
         except RuntimeError as model_error:
+            hexception(model_error)
             return RequestResult(success=False, cached=False, error=str(model_error), completions=[], embedding=[])
 
         for result in concat_results:
helm/clients/vision_language/palmyra_vision_client.py
CHANGED
@@ -5,6 +5,7 @@ import requests
 
 from helm.common.cache import CacheConfig
 from helm.common.images_utils import encode_base64
+from helm.common.hierarchical_logger import hexception
 from helm.common.media_object import TEXT_TYPE
 from helm.common.request import Request, RequestResult, GeneratedOutput, ErrorFlags
 from helm.common.request import wrap_request_time
@@ -76,6 +77,7 @@ class PalmyraVisionClient(CachingClient):
             )
             result, cached = self.cache.get(cache_key, wrap_request_time(do_it))
         except PalmyraVisionContentBlockedError as ex:
+            hexception(ex)
             return RequestResult(
                 success=False,
                 cached=False,
helm/clients/vision_language/qwen2_vlm_client.py
CHANGED
@@ -8,7 +8,7 @@ import torch
 
 from helm.common.cache import CacheConfig
 from helm.common.gpu_utils import get_torch_device_name
-from helm.common.hierarchical_logger import hlog, htrack_block
+from helm.common.hierarchical_logger import hexception, hlog, htrack_block
 from helm.common.media_object import TEXT_TYPE
 from helm.common.request import Request, RequestResult, GeneratedOutput, Token
 from helm.common.request import wrap_request_time
@@ -157,6 +157,7 @@ class Qwen2VLMClient(CachingClient):
             )
             result, cached = self.cache.get(cache_key, wrap_request_time(do_it))
         except RuntimeError as model_error:
+            hexception(model_error)
             return RequestResult(
                 success=False,
                 cached=False,
helm/clients/vision_language/qwen_vlm_client.py
CHANGED
@@ -7,7 +7,7 @@ from transformers.generation import GenerationConfig
 
 from helm.common.cache import CacheConfig
 from helm.common.gpu_utils import get_torch_device_name
-from helm.common.hierarchical_logger import hlog, htrack_block
+from helm.common.hierarchical_logger import hexception, hlog, htrack_block
 from helm.common.media_object import TEXT_TYPE
 from helm.common.request import Request, RequestResult, GeneratedOutput, Token
 from helm.common.request import wrap_request_time
@@ -139,6 +139,7 @@ class QwenVLMClient(CachingClient):
             )
             result, cached = self.cache.get(cache_key, wrap_request_time(do_it))
         except RuntimeError as model_error:
+            hexception(model_error)
             return RequestResult(
                 success=False, cached=False, error=str(model_error), completions=[], embedding=[]
             )
helm/clients/writer_client.py
CHANGED
@@ -2,6 +2,7 @@ from typing import Any, Dict, List, Mapping, Optional
 
 from helm.clients.client import CachingClient
 from helm.common.cache import CacheConfig
+from helm.common.hierarchical_logger import hexception
 from helm.common.optional_dependencies import handle_module_not_found_error
 from helm.common.request import wrap_request_time, Request, RequestResult, GeneratedOutput, Token
 
@@ -82,6 +83,7 @@ class WriterClient(CachingClient):
             raw_response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
             chat_completion: ChatCompletion = ChatCompletion.model_validate(raw_response)
         except Exception as error:
+            hexception(error)
             return RequestResult(
                 success=False,
                 cached=False,
helm/common/hierarchical_logger.py
CHANGED
@@ -64,6 +64,16 @@ class HierarchicalLogger(object):
         self.logger.warning(self.indent() + str(x), **kwargs)
         sys.stdout.flush()
 
+    def error(self, x: Any, **kwargs) -> None:
+        kwargs["stacklevel"] = kwargs.get("stacklevel", 1) + 1
+        self.logger.error(self.indent() + str(x), **kwargs)
+        sys.stdout.flush()
+
+    def exception(self, x: Any, **kwargs) -> None:
+        kwargs["stacklevel"] = kwargs.get("stacklevel", 1) + 1
+        self.logger.exception(self.indent() + str(x), **kwargs)
+        sys.stdout.flush()
+
 
 def format_time(s: float) -> str:
     """Return a nice string representation of `s` seconds."""
@@ -96,6 +106,16 @@ def hwarn(x: Any, **kwargs) -> None:
     singleton.warn(x, **kwargs)
 
 
+def herror(x: Any, **kwargs) -> None:
+    kwargs["stacklevel"] = kwargs.get("stacklevel", 1) + 1
+    singleton.error(x, **kwargs)
+
+
+def hexception(x: Any, **kwargs) -> None:
+    kwargs["stacklevel"] = kwargs.get("stacklevel", 1) + 1
+    singleton.exception(x, **kwargs)
+
+
 class htrack_block:
     def __init__(self, x: Any, stacklevel=1) -> None:
         self._stacklevel = stacklevel + 1
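The stacklevel bump in each wrapper keeps the emitted log record pointing at the caller rather than at the wrapper itself: the standard library resolves %(filename)s and %(lineno)d against the frame `stacklevel` levels up the stack, so every layer of indirection adds one. A small self-contained illustration of the same idiom using only stdlib logging (names here are for the demo, not HELM's):

import logging

logging.basicConfig(format="%(filename)s:%(lineno)d %(levelname)s %(message)s")
logger = logging.getLogger("demo")


def my_exception(x, **kwargs):
    # Same idiom as the new hexception(): skip this wrapper frame so the
    # record points at the caller of my_exception().
    kwargs["stacklevel"] = kwargs.get("stacklevel", 1) + 1
    logger.exception(x, **kwargs)


try:
    raise ValueError("boom")
except ValueError as e:
    my_exception(e)  # record shows this line's file:lineno, plus the traceback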
helm/common/optional_dependencies.py
CHANGED
@@ -9,7 +9,7 @@ def handle_module_not_found_error(e: ModuleNotFoundError, suggestions: Optional[
     # TODO: Ask user to install more specific optional dependencies
     # e.g. crfm-helm[plots] or crfm-helm[server]
     suggested_commands = " or ".join(
-        [f
+        [f'`pip install "crfm-helm[{suggestion}]"`' for suggestion in (suggestions or []) + ["all"]]
     )
     raise OptionalDependencyNotInstalled(
         f"Optional dependency {e.name} is not installed. Please run {suggested_commands} to install it."
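(The removed line is truncated in this view.) The replacement quotes the package spec inside each suggested pip command, so it can be pasted into shells such as zsh where unquoted brackets would be globbed. A quick illustration of the message the fixed comprehension builds:

suggestions = ["openai"]
suggested_commands = " or ".join(
    [f'`pip install "crfm-helm[{suggestion}]"`' for suggestion in (suggestions or []) + ["all"]]
)
print(suggested_commands)
# `pip install "crfm-helm[openai]"` or `pip install "crfm-helm[all]"`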
helm/common/test_general.py
CHANGED
@@ -1,5 +1,8 @@
 import shutil
 import os
+
+import pytest
+
 from helm.common.general import (
     ensure_file_downloaded,
     format_tags,
@@ -12,6 +15,7 @@ from helm.common.general import (
 
 
 def test_ensure_file_downloaded():
+    pytest.skip("Skipping download tests because these downloads are not reliable and may be throttled")
     ensure_file_downloaded("https://ftp.gnu.org/gnu/tar/tar-1.34.tar.gz", "test-tar", unpack=True, unpack_type="untar")
     assert os.path.isdir("test-tar")
     shutil.rmtree("test-tar")
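Calling pytest.skip() inside the test body skips at runtime, which keeps the test collected and easy to re-enable; the decorator form would be the usual static alternative, sketched here as an assumption rather than what the release chose:

import pytest


@pytest.mark.skip(reason="downloads are not reliable and may be throttled")
def test_ensure_file_downloaded():
    ...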
helm/config/model_deployments.yaml
CHANGED
@@ -730,6 +730,13 @@ model_deployments:
         thinking_budget_tokens: 10000
         stream: true
 
+  - name: anthropic/claude-sonnet-4-5-20250929
+    model_name: anthropic/claude-sonnet-4-5-20250929
+    tokenizer_name: anthropic/claude
+    max_sequence_length: 200000
+    client_spec:
+      class_name: "helm.clients.anthropic_client.AnthropicMessagesClient"
+
   - name: anthropic/stanford-online-all-v4-s3
     deprecated: true  # Closed model, not accessible via API
     model_name: anthropic/stanford-online-all-v4-s3
@@ -861,6 +868,20 @@ model_deployments:
         parse_thinking: true
         disable_logprobs: True
 
+  - name: together/deepseek-r1-distill-llama-70b
+    model_name: deepseek-ai/deepseek-r1-distill-llama-70b
+    tokenizer_name: deepseek-ai/deepseek-r1-distill-llama-70b
+    max_sequence_length: 131072
+    client_spec:
+      class_name: "helm.clients.together_client.TogetherChatClient"
+
+  - name: together/deepseek-r1-distill-qwen-14b
+    model_name: deepseek-ai/deepseek-r1-distill-qwen-14b
+    tokenizer_name: deepseek-ai/deepseek-r1-distill-qwen-14b
+    max_sequence_length: 131072
+    client_spec:
+      class_name: "helm.clients.together_client.TogetherChatClient"
+
   # Gooseai
 
   # TODO: Migrate these models to use OpenAIClient
@@ -3873,6 +3894,15 @@ model_deployments:
       args:
         parse_thinking: true
 
+  - name: together/qwen3-next-80b-a3b-thinking
+    model_name: qwen/qwen3-next-80b-a3b-thinking
+    tokenizer_name: qwen/qwen3-next-80b-a3b-thinking
+    max_sequence_length: 262144
+    client_spec:
+      class_name: "helm.clients.together_client.TogetherChatClient"
+      args:
+        parse_thinking: true
+
   - name: together/qwen3-235b-a22b-instruct-2507-fp8
     model_name: qwen/qwen3-235b-a22b-instruct-2507-fp8
     tokenizer_name: qwen/qwen3-235b-a22b-instruct-2507-fp8
@@ -4321,6 +4351,201 @@ model_deployments:
       args:
         pretrained_model_name_or_path: deepseek-ai/deepseek-coder-6.7b-instruct
 
+  # AceGPT-v2
+  - name: huggingface/acegpt-v2-8b-chat
+    model_name: freedomintelligence/acegpt-v2-8b-chat
+    tokenizer_name: freedomintelligence/acegpt-v2-8b-chat
+    max_sequence_length: 8192
+    client_spec:
+      class_name: "helm.clients.huggingface_client.HuggingFaceClient"
+      args:
+        device_map: auto
+
+  - name: huggingface/acegpt-v2-32b-chat
+    model_name: freedomintelligence/acegpt-v2-32b-chat
+    tokenizer_name: freedomintelligence/acegpt-v2-32b-chat
+    max_sequence_length: 32768
+    client_spec:
+      class_name: "helm.clients.huggingface_client.HuggingFaceClient"
+      args:
+        device_map: auto
+
+  - name: huggingface/acegpt-v2-70b-chat
+    model_name: freedomintelligence/acegpt-v2-70b-chat
+    tokenizer_name: freedomintelligence/acegpt-v2-70b-chat
+    max_sequence_length: 8192
+    client_spec:
+      class_name: "helm.clients.huggingface_client.HuggingFaceClient"
+      args:
+        device_map: auto
+
+  # ALLaM
+  - name: huggingface/allam-7b-instruct-preview
+    model_name: allam-ai/allam-7b-instruct-preview
+    tokenizer_name: allam-ai/allam-7b-instruct-preview
+    max_sequence_length: 4096
+    client_spec:
+      class_name: "helm.clients.huggingface_client.HuggingFaceClient"
+      args:
+        device_map: auto
+
+  # SILMA
+  - name: huggingface/silma-9b-instruct-v1.0
+    model_name: silma-ai/silma-9b-instruct-v1.0
+    tokenizer_name: silma-ai/silma-9b-instruct-v1.0
+    max_sequence_length: 8192
+    client_spec:
+      class_name: "helm.clients.huggingface_client.HuggingFaceClient"
+      args:
+        device_map: auto
+
+  # Jais Family
+  #
+  # NOTE: Jais Family models require `transformers<=4.52.3`.
+  # On more recent versions of transformers, one of the following errors might occur:
+  #
+  # File "/path/to//site-packages/transformers/models/gemma3n/configuration_gemma3n.py", line 31, in <module>
+  #   from timm.data import ImageNetInfo, infer_imagenet_subset
+  # ImportError: cannot import name 'ImageNetInfo' from 'timm.data' (/path/to/site-packages/timm/data/__init__.py)
+  #
+  # File "/path/to/.cache/huggingface/modules/transformers_modules/inceptionai/jais-family-590m-chat/90ac4769212b4964c6e81e183140224628228365/modeling_jais.py", line 899, in forward
+  #   past_length = past_key_values[0][0].size(-2)
+  # AttributeError: 'NoneType' object has no attribute 'size'
+
+  - name: huggingface/jais-family-590m-chat
+    model_name: inceptionai/jais-family-590m-chat
+    tokenizer_name: inceptionai/jais-family-590m-chat
+    max_sequence_length: 2048
+    client_spec:
+      class_name: "helm.clients.huggingface_client.HuggingFaceClient"
+      args:
+        trust_remote_code: true
+        revision: 90ac4769212b4964c6e81e183140224628228365
+
+  - name: huggingface/jais-family-1p3b-chat
+    model_name: inceptionai/jais-family-1p3b-chat
+    tokenizer_name: inceptionai/jais-family-590m-chat
+    max_sequence_length: 2048
+    client_spec:
+      class_name: "helm.clients.huggingface_client.HuggingFaceClient"
+      args:
+        trust_remote_code: true
+        revision: 4b93176e2cb00f369b3bc0a8786e4cf16260c804
+
+  - name: huggingface/jais-family-2p7b-chat
+    model_name: inceptionai/jais-family-2p7b-chat
+    tokenizer_name: inceptionai/jais-family-590m-chat
+    max_sequence_length: 2048
+    client_spec:
+      class_name: "helm.clients.huggingface_client.HuggingFaceClient"
+      args:
+        trust_remote_code: true
+        revision: b2bf5d1bcd969ce868f66fb1ad8c3480289ea6b2
+
+  - name: huggingface/jais-family-6p7b-chat
+    model_name: inceptionai/jais-family-6p7b-chat
+    tokenizer_name: inceptionai/jais-family-590m-chat
+    max_sequence_length: 2048
+    client_spec:
+      class_name: "helm.clients.huggingface_client.HuggingFaceClient"
+      args:
+        device_map: auto
+        trust_remote_code: true
+        revision: 683805efe6126c6536feb4aa23317e70222ac94c
+
+  - name: huggingface/jais-family-13b-chat
+    model_name: inceptionai/jais-family-13b-chat
+    tokenizer_name: inceptionai/jais-family-590m-chat
+    max_sequence_length: 2048
+    client_spec:
+      class_name: "helm.clients.huggingface_client.HuggingFaceClient"
+      args:
+        device_map: auto
+        trust_remote_code: true
+        revision: 0ef8b4f80429609890816d912b331d3b95864707
+
+  - name: huggingface/jais-family-30b-8k-chat
+    model_name: inceptionai/jais-family-30b-8k-chat
+    tokenizer_name: inceptionai/jais-family-590m-chat
+    max_sequence_length: 8192
+    client_spec:
+      class_name: "helm.clients.huggingface_client.HuggingFaceClient"
+      args:
+        device_map: auto
+        trust_remote_code: true
+        revision: dab185164dd3b79ec9201d7f4cf878ce91ae7e14
+
+  - name: huggingface/jais-family-30b-16k-chat
+    model_name: inceptionai/jais-family-30b-16k-chat
+    tokenizer_name: inceptionai/jais-family-590m-chat
+    max_sequence_length: 16384
+    client_spec:
+      class_name: "helm.clients.huggingface_client.HuggingFaceClient"
+      args:
+        device_map: auto
+        trust_remote_code: true
+        revision: 369f88eeee4d313155f1b1dca4ebec90f9f9f2a4
+
+  # Jais Adapter
+  - name: huggingface/jais-adapted-7b-chat
+    model_name: inceptionai/jais-adapted-7b-chat
+    tokenizer_name: inceptionai/jais-adapted-7b-chat
+    max_sequence_length: 4096
+    client_spec:
+      class_name: "helm.clients.huggingface_client.HuggingFaceClient"
+      args:
+        device_map: auto
+
+  - name: huggingface/jais-adapted-13b-chat
+    model_name: inceptionai/jais-adapted-13b-chat
+    tokenizer_name: inceptionai/jais-adapted-7b-chat
+    max_sequence_length: 4096
+    client_spec:
+      class_name: "helm.clients.huggingface_client.HuggingFaceClient"
+      args:
+        device_map: auto
+
+  - name: huggingface/jais-adapted-70b-chat
+    model_name: inceptionai/jais-adapted-70b-chat
+    tokenizer_name: inceptionai/jais-adapted-7b-chat
+    max_sequence_length: 4096
+    client_spec:
+      class_name: "helm.clients.huggingface_client.HuggingFaceClient"
+      args:
+        device_map: auto
+
+  - name: huggingface/falcon3-1b-instruct
+    model_name: tiiuae/falcon3-1b-instruct
+    tokenizer_name: tiiuae/falcon3-1b-instruct
+    max_sequence_length: 8192
+    client_spec:
+      class_name: "helm.clients.huggingface_client.HuggingFaceClient"
+
+  - name: huggingface/falcon3-3b-instruct
+    model_name: tiiuae/falcon3-3b-instruct
+    tokenizer_name: tiiuae/falcon3-1b-instruct
+    max_sequence_length: 32768
+    client_spec:
+      class_name: "helm.clients.huggingface_client.HuggingFaceClient"
+
+  - name: huggingface/falcon3-7b-instruct
+    model_name: tiiuae/falcon3-7b-instruct
+    tokenizer_name: tiiuae/falcon3-7b-instruct
+    max_sequence_length: 32768
+    client_spec:
+      class_name: "helm.clients.huggingface_client.HuggingFaceClient"
+      args:
+        device_map: auto
+
+  - name: huggingface/falcon3-10b-instruct
+    model_name: tiiuae/falcon3-10b-instruct
+    tokenizer_name: tiiuae/falcon3-1b-instruct
+    max_sequence_length: 32768
+    client_spec:
+      class_name: "helm.clients.huggingface_client.HuggingFaceClient"
+      args:
+        device_map: auto
+
   # IBM WatsonX
   - name: ibm/llama-3.3-70b-instruct
     model_name: meta/llama-3.3-70b-instruct
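Once a deployment is registered here it can be selected by name in a run entry. As a rough sketch (the helm-run flags are real, but the exact run-entry syntax depends on the scenario and HELM version):

helm-run --run-entries "mmlu:subject=philosophy,model_deployment=huggingface/falcon3-7b-instruct" --suite my-suite --max-eval-instances 10

Note that the HuggingFaceClient deployments with device_map: auto run the model locally, which additionally assumes a GPU setup and the accelerate package.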