crfm-helm 0.5.5__py3-none-any.whl → 0.5.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (268)
  1. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/METADATA +74 -53
  2. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/RECORD +262 -182
  3. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +3 -3
  5. helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
  6. helm/benchmark/annotation/air_bench_annotator.py +2 -2
  7. helm/benchmark/annotation/bigcodebench_annotator.py +3 -3
  8. helm/benchmark/annotation/bird_sql_annotator.py +2 -2
  9. helm/benchmark/annotation/chw_care_plan_annotator.py +7 -12
  10. helm/benchmark/annotation/ehr_sql_annotator.py +2 -2
  11. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +7 -7
  12. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  13. helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
  14. helm/benchmark/annotation/model_as_judge.py +12 -16
  15. helm/benchmark/annotation/omni_math_annotator.py +13 -14
  16. helm/benchmark/annotation/wildbench_annotator.py +9 -9
  17. helm/benchmark/executor.py +11 -12
  18. helm/benchmark/metrics/aci_bench_metrics.py +9 -29
  19. helm/benchmark/metrics/bias_word_lists.py +1 -1
  20. helm/benchmark/metrics/chw_care_plan_metrics.py +10 -30
  21. helm/benchmark/metrics/classification_metrics.py +3 -3
  22. helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
  23. helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
  24. helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
  25. helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
  26. helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
  27. helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
  28. helm/benchmark/metrics/comet_metric.py +1 -1
  29. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +2 -2
  30. helm/benchmark/metrics/copyright_metrics.py +1 -1
  31. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
  32. helm/benchmark/metrics/dischargeme_metrics.py +9 -29
  33. helm/benchmark/metrics/efficiency_metrics.py +3 -3
  34. helm/benchmark/metrics/evaluate_reference_metrics.py +1 -1
  35. helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
  36. helm/benchmark/metrics/ifeval_metrics.py +2 -2
  37. helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
  38. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
  39. helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
  40. helm/benchmark/metrics/llm_jury_metrics.py +46 -0
  41. helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
  42. helm/benchmark/metrics/lmkt_metrics.py +47 -0
  43. helm/benchmark/metrics/med_dialog_metrics.py +9 -29
  44. helm/benchmark/metrics/medalign_metrics.py +9 -29
  45. helm/benchmark/metrics/medi_qa_metrics.py +9 -29
  46. helm/benchmark/metrics/medication_qa_metrics.py +10 -30
  47. helm/benchmark/metrics/melt_bias_metric.py +234 -0
  48. helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
  49. helm/benchmark/metrics/melt_metric_specs.py +43 -0
  50. helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
  51. helm/benchmark/metrics/mental_health_metrics.py +9 -29
  52. helm/benchmark/metrics/metric_service.py +11 -11
  53. helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
  54. helm/benchmark/metrics/mimic_rrs_metrics.py +9 -29
  55. helm/benchmark/metrics/mtsamples_procedures_metrics.py +9 -29
  56. helm/benchmark/metrics/mtsamples_replicate_metrics.py +9 -29
  57. helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
  58. helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
  59. helm/benchmark/metrics/starr_patient_instructions_metrics.py +9 -29
  60. helm/benchmark/metrics/summac/model_summac.py +2 -3
  61. helm/benchmark/metrics/summarization_metrics.py +2 -1
  62. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +2 -2
  63. helm/benchmark/metrics/toxicity_metrics.py +2 -2
  64. helm/benchmark/metrics/unitxt_metrics.py +3 -4
  65. helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
  66. helm/benchmark/metrics/vision_language/image_utils.py +2 -2
  67. helm/benchmark/model_deployment_registry.py +16 -26
  68. helm/benchmark/presentation/contamination.py +3 -3
  69. helm/benchmark/presentation/create_plots.py +43 -13
  70. helm/benchmark/presentation/run_display.py +13 -0
  71. helm/benchmark/presentation/schema.py +7 -1
  72. helm/benchmark/presentation/summarize.py +84 -61
  73. helm/benchmark/presentation/test_create_plots.py +4 -1
  74. helm/benchmark/reeval_run.py +3 -4
  75. helm/benchmark/reeval_runner.py +3 -3
  76. helm/benchmark/run.py +84 -73
  77. helm/benchmark/run_expander.py +12 -1
  78. helm/benchmark/run_spec_factory.py +7 -6
  79. helm/benchmark/run_specs/arabic_run_specs.py +73 -0
  80. helm/benchmark/run_specs/audio_run_specs.py +52 -8
  81. helm/benchmark/run_specs/bluex_run_specs.py +40 -0
  82. helm/benchmark/run_specs/classic_run_specs.py +0 -53
  83. helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
  84. helm/benchmark/run_specs/enterprise_run_specs.py +20 -0
  85. helm/benchmark/run_specs/experimental_run_specs.py +31 -1
  86. helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
  87. helm/benchmark/run_specs/heim_run_specs.py +3 -1
  88. helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
  89. helm/benchmark/run_specs/long_context_run_specs.py +114 -15
  90. helm/benchmark/run_specs/medhelm_run_specs.py +146 -41
  91. helm/benchmark/run_specs/melt_run_specs.py +783 -0
  92. helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
  93. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +163 -0
  94. helm/benchmark/run_specs/vlm_run_specs.py +28 -0
  95. helm/benchmark/runner.py +5 -5
  96. helm/benchmark/scenarios/aci_bench_scenario.py +7 -1
  97. helm/benchmark/scenarios/alghafa_scenario.py +126 -0
  98. helm/benchmark/scenarios/arabic_mmlu_scenario.py +78 -0
  99. helm/benchmark/scenarios/aratrust_scenario.py +76 -0
  100. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +3 -1
  101. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +5 -5
  102. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
  103. helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
  104. helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
  105. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +104 -0
  106. helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +99 -0
  107. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +118 -0
  108. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +86 -0
  109. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +117 -0
  110. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +15 -1
  111. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +1 -2
  112. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +2 -2
  113. helm/benchmark/scenarios/bluex_scenario.py +66 -0
  114. helm/benchmark/scenarios/chw_care_plan_scenario.py +14 -13
  115. helm/benchmark/scenarios/clear_scenario.py +11 -7
  116. helm/benchmark/scenarios/cleva_scenario.py +1 -1
  117. helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
  118. helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
  119. helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
  120. helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
  121. helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
  122. helm/benchmark/scenarios/dischargeme_scenario.py +36 -21
  123. helm/benchmark/scenarios/ehr_sql_scenario.py +7 -1
  124. helm/benchmark/scenarios/ehrshot_scenario.py +28 -55
  125. helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
  126. helm/benchmark/scenarios/grammar.py +2 -2
  127. helm/benchmark/scenarios/headqa_scenario.py +6 -1
  128. helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
  129. helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +90 -0
  130. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
  131. helm/benchmark/scenarios/{infinite_bench_sum_scenario.py → infinite_bench_en_sum_scenario.py} +10 -13
  132. helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
  133. helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
  134. helm/benchmark/scenarios/math_scenario.py +21 -20
  135. helm/benchmark/scenarios/med_dialog_scenario.py +6 -1
  136. helm/benchmark/scenarios/medalign_scenario.py +9 -3
  137. helm/benchmark/scenarios/medalign_scenario_helper.py +27 -130
  138. helm/benchmark/scenarios/medbullets_scenario.py +7 -2
  139. helm/benchmark/scenarios/medcalc_bench_scenario.py +4 -2
  140. helm/benchmark/scenarios/medec_scenario.py +6 -1
  141. helm/benchmark/scenarios/medhallu_scenario.py +7 -1
  142. helm/benchmark/scenarios/medi_qa_scenario.py +10 -4
  143. helm/benchmark/scenarios/medication_qa_scenario.py +7 -1
  144. helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
  145. helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
  146. helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
  147. helm/benchmark/scenarios/melt_scenarios.py +793 -0
  148. helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
  149. helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
  150. helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
  151. helm/benchmark/scenarios/mental_health_scenario.py +16 -5
  152. helm/benchmark/scenarios/mimic_bhc_scenario.py +13 -8
  153. helm/benchmark/scenarios/mimic_rrs_scenario.py +17 -8
  154. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +14 -8
  155. helm/benchmark/scenarios/mmlu_pro_scenario.py +1 -1
  156. helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
  157. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +5 -2
  158. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +3 -2
  159. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +11 -5
  160. helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
  161. helm/benchmark/scenarios/pubmed_qa_scenario.py +6 -1
  162. helm/benchmark/scenarios/race_based_med_scenario.py +18 -8
  163. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +2 -2
  164. helm/benchmark/scenarios/ruler_qa_scenarios.py +2 -2
  165. helm/benchmark/scenarios/seahelm_scenario.py +2 -2
  166. helm/benchmark/scenarios/shc_bmt_scenario.py +12 -6
  167. helm/benchmark/scenarios/shc_cdi_scenario.py +11 -6
  168. helm/benchmark/scenarios/shc_conf_scenario.py +12 -6
  169. helm/benchmark/scenarios/shc_ent_scenario.py +11 -6
  170. helm/benchmark/scenarios/shc_gip_scenario.py +13 -5
  171. helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
  172. helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
  173. helm/benchmark/scenarios/shc_ptbm_scenario.py +12 -7
  174. helm/benchmark/scenarios/shc_sei_scenario.py +12 -7
  175. helm/benchmark/scenarios/shc_sequoia_scenario.py +13 -5
  176. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +15 -8
  177. helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
  178. helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
  179. helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
  180. helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
  181. helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
  182. helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
  183. helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
  184. helm/benchmark/scenarios/truthful_qa_scenario.py +2 -1
  185. helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
  186. helm/benchmark/server.py +2 -1
  187. helm/benchmark/slurm_jobs.py +1 -2
  188. helm/benchmark/slurm_runner.py +8 -1
  189. helm/benchmark/static/schema_arabic.yaml +228 -0
  190. helm/benchmark/static/schema_audio.yaml +60 -49
  191. helm/benchmark/static/schema_classic.yaml +0 -17
  192. helm/benchmark/static/schema_enterprise.yaml +21 -0
  193. helm/benchmark/static/schema_long_context.yaml +81 -20
  194. helm/benchmark/static/schema_medhelm.yaml +272 -213
  195. helm/benchmark/static/schema_melt.yaml +1257 -0
  196. helm/benchmark/static/schema_slphelm.yaml +162 -0
  197. helm/benchmark/static/schema_vhelm.yaml +26 -26
  198. helm/benchmark/static/schema_video.yaml +219 -0
  199. helm/benchmark/static_build/assets/index-b9779128.css +1 -0
  200. helm/benchmark/static_build/assets/index-e439d5e1.js +10 -0
  201. helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
  202. helm/benchmark/static_build/assets/{tremor-9cefc3c5.js → tremor-38a10867.js} +1 -1
  203. helm/benchmark/static_build/index.html +4 -4
  204. helm/benchmark/window_services/encoder_decoder_window_service.py +3 -3
  205. helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
  206. helm/benchmark/window_services/test_utils.py +3 -4
  207. helm/benchmark/window_services/tokenizer_service.py +7 -8
  208. helm/clients/anthropic_client.py +69 -29
  209. helm/clients/audio_language/diva_llama_client.py +4 -2
  210. helm/clients/audio_language/qwen2_5_omni_client.py +209 -0
  211. helm/clients/audio_language/qwen2_audiolm_client.py +8 -6
  212. helm/clients/audio_language/qwen_audiolm_client.py +4 -2
  213. helm/clients/audio_language/test.py +62 -0
  214. helm/clients/bedrock_client.py +3 -1
  215. helm/clients/client.py +7 -7
  216. helm/clients/grok_client.py +36 -0
  217. helm/clients/huggingface_client.py +42 -3
  218. helm/clients/huggingface_pipeline_client.py +138 -0
  219. helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
  220. helm/clients/image_generation/dalle_mini/model/modeling.py +1 -1
  221. helm/clients/image_generation/dalle_mini/model/processor.py +1 -1
  222. helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
  223. helm/clients/openai_client.py +102 -55
  224. helm/clients/openai_responses_client.py +176 -0
  225. helm/clients/palmyra_client.py +2 -5
  226. helm/clients/reka_client.py +2 -2
  227. helm/clients/test_huggingface_client.py +3 -3
  228. helm/clients/together_client.py +31 -6
  229. helm/clients/vertexai_client.py +17 -9
  230. helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
  231. helm/clients/vision_language/huggingface_vlm_client.py +2 -2
  232. helm/clients/vision_language/idefics_client.py +6 -2
  233. helm/clients/vision_language/paligemma_client.py +2 -2
  234. helm/clients/vision_language/qwen2_vlm_client.py +66 -53
  235. helm/clients/vision_language/qwen_vlm_client.py +7 -5
  236. helm/clients/vllm_client.py +43 -7
  237. helm/clients/vllm_granite_thinking_client.py +56 -0
  238. helm/clients/writer_client.py +102 -0
  239. helm/common/context.py +80 -0
  240. helm/common/credentials_utils.py +5 -5
  241. helm/common/critique_request.py +0 -1
  242. helm/common/general.py +9 -2
  243. helm/common/hierarchical_logger.py +104 -12
  244. helm/common/local_context.py +140 -0
  245. helm/common/object_spec.py +23 -8
  246. helm/common/remote_context.py +61 -0
  247. helm/common/request.py +8 -0
  248. helm/common/test_logging.py +94 -0
  249. helm/config/model_deployments.yaml +995 -45
  250. helm/config/model_metadata.yaml +780 -59
  251. helm/config/tokenizer_configs.yaml +224 -3
  252. helm/proxy/cli.py +4 -2
  253. helm/proxy/critique/mechanical_turk_utils.py +1 -1
  254. helm/proxy/retry.py +5 -0
  255. helm/proxy/services/server_service.py +21 -85
  256. helm/tokenizers/grok_tokenizer.py +55 -0
  257. helm/tokenizers/huggingface_tokenizer.py +1 -1
  258. helm/tokenizers/test_grok_tokenizer.py +33 -0
  259. helm/benchmark/metrics/numeracy_metrics.py +0 -72
  260. helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
  261. helm/benchmark/scenarios/numeracy_scenario.py +0 -793
  262. helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +0 -46
  263. helm/benchmark/static_build/assets/index-262903c1.js +0 -10
  264. helm/benchmark/static_build/assets/index-42060d71.css +0 -1
  265. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/entry_points.txt +0 -0
  266. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/licenses/LICENSE +0 -0
  267. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/top_level.txt +0 -0
  268. /helm/benchmark/static_build/assets/{medhelm-overview-3ddfcd65.png → medhelm-v1-overview-3ddfcd65.png} +0 -0
helm/clients/audio_language/test.py ADDED
@@ -0,0 +1,62 @@
+import soundfile as sf
+
+from transformers import Qwen2_5OmniModel, Qwen2_5OmniProcessor  # type: ignore
+from qwen_omni_utils import process_mm_info
+
+# default: Load the model on the available device(s)
+model = Qwen2_5OmniModel.from_pretrained("Qwen/Qwen2.5-Omni-7B", torch_dtype="auto", device_map="auto")
+
+# We recommend enabling flash_attention_2 for better acceleration and memory saving.
+# model = Qwen2_5OmniModel.from_pretrained(
+#     "Qwen/Qwen2.5-Omni-7B",
+#     torch_dtype="auto",
+#     device_map="auto",
+#     attn_implementation="flash_attention_2",
+# )
+
+processor = Qwen2_5OmniProcessor.from_pretrained("Qwen/Qwen2.5-Omni-7B")
+
+conversation = [
+    {
+        "role": "system",
+        "content": (
+            "You are Qwen, a virtual human developed by the Qwen Team,"
+            " Alibaba Group, capable of perceiving auditory and visual"
+            " inputs, as well as generating text and speech."
+        ),
+    },
+    {
+        "role": "user",
+        "content": [
+            {"type": "video", "video": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-Omni/draw.mp4"},
+        ],
+    },
+]
+
+# set use audio in video
+USE_AUDIO_IN_VIDEO = True
+
+# Preparation for inference
+text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
+audios, images, videos = process_mm_info(conversation, use_audio_in_video=USE_AUDIO_IN_VIDEO)
+inputs = processor(
+    text=text,
+    audios=audios,
+    images=images,
+    videos=videos,
+    return_tensors="pt",
+    padding=True,
+    use_audio_in_video=USE_AUDIO_IN_VIDEO,
+)
+inputs = inputs.to(model.device).to(model.dtype)
+
+# Inference: Generation of the output text and audio
+text_ids, audio = model.generate(**inputs, use_audio_in_video=USE_AUDIO_IN_VIDEO)
+
+text = processor.batch_decode(text_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
+print(text)
+sf.write(
+    "output.wav",
+    audio.reshape(-1).detach().cpu().numpy(),
+    samplerate=24000,
+)
helm/clients/bedrock_client.py CHANGED
@@ -117,10 +117,12 @@ class BedrockNovaClient(CachingClient):
         tokenizer_name: str,
         assumed_role: Optional[str] = None,
         region: Optional[str] = None,
+        bedrock_model_id: Optional[str] = None,
     ):
         super().__init__(cache_config=cache_config)
         self.tokenizer = tokenizer
         self.tokenizer_name = tokenizer_name
+        self.bedrock_model_id = bedrock_model_id
         self.bedrock_client = get_bedrock_client_v1(
             assumed_role=assumed_role or os.environ.get("BEDROCK_ASSUME_ROLE", None),
             region=region,
@@ -144,7 +146,7 @@ class BedrockNovaClient(CachingClient):
         messages = self._get_messages_from_request(request)

         return {
-            "modelId": model_id,
+            "modelId": self.bedrock_model_id or model_id,
             "inferenceConfig": {
                 "temperature": request.temperature,
                 "maxTokens": request.max_tokens,
helm/clients/client.py CHANGED
@@ -2,7 +2,7 @@ import json
 from abc import ABC, abstractmethod
 from typing import List, Mapping, Optional, cast

-from helm.common.hierarchical_logger import hlog
+from helm.common.hierarchical_logger import hwarn
 from helm.common.media_object import MultimediaObject, TEXT_TYPE
 from helm.common.request import Request, RequestResult, GeneratedOutput, Token
 from helm.common.cache import Cache, CacheConfig
@@ -65,7 +65,7 @@ def truncate_sequence(
     # where max_tokens = 0, so there's nothing to truncate.
     if request.echo_prompt:
         if request.max_tokens != 0:
-            hlog("WARNING: don't know how to handle echo_prompt and max_tokens > 0, not truncating")
+            hwarn("don't know how to handle echo_prompt and max_tokens > 0, not truncating")
         return sequence

     if end_of_text_token:
@@ -90,8 +90,8 @@ def truncate_sequence(
            new_tokens.append(token)

    if len(new_text) < len(sequence.text) and len(new_tokens) == len(sequence.tokens):
-        hlog(
-            f"WARNING: Stripped characters from text ({len(sequence.text)} -> {len(new_text)}), "
+        hwarn(
+            f"Stripped characters from text ({len(sequence.text)} -> {len(new_text)}), "
            f"but wasn't able to strip the tokens"
        )

@@ -99,14 +99,14 @@ def truncate_sequence(
        new_logprob = sum(token.logprob for token in new_tokens)

        if print_warning:
-            hlog(f"WARNING: truncate_sequence needs to strip {json.dumps(stop)}")
+            hwarn(f"truncate_sequence needs to strip {json.dumps(stop)}")

        sequence = GeneratedOutput(text=new_text, logprob=new_logprob, tokens=new_tokens)

    # Truncate based on the max number of tokens.
    if len(sequence.tokens) > request.max_tokens:
        if print_warning:
-            hlog(f"WARNING: truncate_sequence needs to truncate {len(sequence.tokens)} down to {request.max_tokens}")
+            hwarn(f"truncate_sequence needs to truncate {len(sequence.tokens)} down to {request.max_tokens}")
        new_tokens = sequence.tokens[: request.max_tokens]

        # This is imperfect stitching together of tokens, so just to make sure this is okay
@@ -114,7 +114,7 @@
        # Usually, in our benchmark, max_tokens is active when it's 1, so hopefully this isn't an issue.
        new_text = "".join(token.text for token in new_tokens)
        if not sequence.text.startswith(new_text):
-            hlog(f"WARNING: {json.dumps(sequence.text)} does not start with truncated text {json.dumps(new_text)}")
+            hwarn(f"{json.dumps(sequence.text)} does not start with truncated text {json.dumps(new_text)}")

        new_logprob = sum(token.logprob for token in new_tokens)

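Every call site above follows one pattern: `hlog("WARNING: ...")` becomes `hwarn("...")`. A minimal sketch of what that implies `hwarn` does, assuming it only centralizes the prefix; the real helper lives in `helm/common/hierarchical_logger.py` (changed elsewhere in this release, +104 -12) and may do more:

from helm.common.hierarchical_logger import hlog


def hwarn(message: str) -> None:
    # Hypothetical reconstruction: callers previously wrote hlog("WARNING: ...")
    # by hand; hwarn centralizes the prefix so warnings are formatted consistently.
    hlog(f"WARNING: {message}")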
helm/clients/grok_client.py ADDED
@@ -0,0 +1,36 @@
+from typing import Any, Dict, Optional
+
+from helm.clients.openai_client import OpenAIClient
+from helm.common.cache import CacheConfig
+from helm.common.request import Request
+from helm.tokenizers.tokenizer import Tokenizer
+
+
+class GrokChatClient(OpenAIClient):
+
+    BASE_URL = "https://api.x.ai/v1"
+
+    _UNSUPPORTED_ARGUMENTS = ["presence_penalty", "frequency_penalty"]
+
+    def __init__(
+        self,
+        tokenizer: Tokenizer,
+        tokenizer_name: str,
+        cache_config: CacheConfig,
+        api_key: Optional[str] = None,
+    ):
+        super().__init__(
+            tokenizer=tokenizer,
+            tokenizer_name=tokenizer_name,
+            cache_config=cache_config,
+            api_key=api_key,
+            org_id=None,
+            base_url="https://api.x.ai/v1",
+        )
+
+    def _make_chat_raw_request(self, request: Request) -> Dict[str, Any]:
+        raw_request = super()._make_chat_raw_request(request)
+        for unsupported_argument in self._UNSUPPORTED_ARGUMENTS:
+            if unsupported_argument in raw_request:
+                del raw_request[unsupported_argument]
+        return raw_request
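
`GrokChatClient` is just `OpenAIClient` pointed at xAI's OpenAI-compatible endpoint, minus the two sampling penalties the diff strips out. The same shape can be reproduced with the plain `openai` SDK; a sketch, where the model name is an assumption for illustration and not taken from the diff:

from openai import OpenAI

# xAI exposes an OpenAI-compatible API, so only the base_url and key change.
client = OpenAI(api_key="YOUR_XAI_API_KEY", base_url="https://api.x.ai/v1")

# presence_penalty and frequency_penalty are omitted on purpose: per
# _UNSUPPORTED_ARGUMENTS above, the client deletes them before sending.
response = client.chat.completions.create(
    model="grok-2",  # hypothetical model name for illustration
    messages=[{"role": "user", "content": "Hello, Grok."}],
)
print(response.choices[0].message.content)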
helm/clients/huggingface_client.py CHANGED
@@ -8,7 +8,7 @@ from transformers.generation.stopping_criteria import (
 from typing import Any, Dict, List, Optional, TypedDict

 from helm.common.cache import CacheConfig
-from helm.common.hierarchical_logger import htrack_block, hlog
+from helm.common.hierarchical_logger import htrack_block, hlog, hwarn
 from helm.common.optional_dependencies import handle_module_not_found_error
 from helm.common.request import (
     wrap_request_time,
@@ -18,6 +18,7 @@ from helm.common.request import (
     GeneratedOutput,
     Token,
 )
+from helm.proxy.retry import NonRetriableException
 from helm.tokenizers.tokenizer import Tokenizer
 from helm.clients.client import CachingClient, truncate_sequence
 from helm.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer, WrappedPreTrainedTokenizer
@@ -256,6 +257,7 @@ class HuggingFaceClient(CachingClient):
         tokenizer: Tokenizer,
         pretrained_model_name_or_path: Optional[str] = None,
         end_of_text_token: Optional[str] = None,
+        apply_chat_template: Optional[bool] = None,
         **kwargs,
     ):
         super().__init__(cache_config=cache_config)
@@ -266,9 +268,46 @@ class HuggingFaceClient(CachingClient):
                "but instead it is {tokenizer}"
            )
        self._wrapped_tokenizer: WrappedPreTrainedTokenizer = tokenizer.get_wrapped_tokenizer()
-        self._tokenizer = tokenizer
        self._kwargs = _process_huggingface_client_kwargs(kwargs)
        self._end_of_text_token = end_of_text_token
+        # If the user did not explicitly configure whether the model is a chat model with the `apply_chat_template` arg,
+        # auto-infer whether the model is a chat model based on whether the tokenizer has a chat template.
+        # Note: Auto-inference is incorrect for some non-chat models that still have chat templates,
+        # e.g. Qwen2, Qwen 2.5.
+        # For these models, the `apply_chat_template` arg should be explicitly set to False.
+        if apply_chat_template is not None:
+            self._apply_chat_template = apply_chat_template
+        else:
+            with self._wrapped_tokenizer as hf_tokenizer:
+                self._apply_chat_template = bool(hf_tokenizer.chat_template)
+            hwarn(
+                f"Automatically set `apply_chat_template` to {self._apply_chat_template} based on "
+                "whether the tokenizer has a chat template. "
+                "If this is incorrect, please explicitly set `apply_chat_template`."
+            )
+
+    def get_prompt(self, request: Request) -> str:
+        if request.prompt and request.messages:
+            raise NonRetriableException(f"More than one of `prompt` and `messages` was set in request: {request}")
+        # Chat model expects a list of messages as input
+        if self._apply_chat_template:
+            with self._wrapped_tokenizer as tokenizer:
+                if request.messages:
+                    prompt = tokenizer.apply_chat_template(request.messages, tokenize=False, add_generation_prompt=True)
+                    assert isinstance(prompt, str)
+                    return prompt
+                else:
+                    prompt = tokenizer.apply_chat_template(
+                        [{"role": "user", "content": request.prompt}], tokenize=False, add_generation_prompt=True
+                    )
+                    assert isinstance(prompt, str)
+                    return prompt
+        # Base non-chat model expects a string as input
+        else:
+            if request.messages:
+                raise NonRetriableException("Chat messages are not supported by a non-chat model")
+            else:
+                return request.prompt

    def make_request(self, request: Request) -> RequestResult:
        # Embedding not supported for this model
@@ -277,7 +316,7 @@ class HuggingFaceClient(CachingClient):

        raw_request: HuggingFaceRequest = {
            "engine": request.model_engine,
-            "prompt": request.prompt,
+            "prompt": self.get_prompt(request),
            "temperature": 1e-7 if request.temperature == 0 else request.temperature,
            "num_return_sequences": request.num_completions,
            "max_new_tokens": request.max_tokens,
helm/clients/huggingface_pipeline_client.py ADDED
@@ -0,0 +1,138 @@
+from threading import Lock
+from typing import Any, Dict, List, Optional, Union
+
+import transformers
+
+from helm.clients.client import CachingClient
+from helm.common.cache import CacheConfig
+from helm.common.hierarchical_logger import htrack_block, hwarn
+from helm.common.request import GeneratedOutput, Request, RequestResult, wrap_request_time
+from helm.proxy.retry import NonRetriableException
+
+
+_pipelines: Dict[str, transformers.Pipeline] = {}
+_pipelines_lock: Lock = Lock()
+
+
+def _get_pipeline(
+    helm_model_name: str,
+    pipeline_kwargs: Dict[str, Any],
+) -> Any:
+    """
+    Checks if the desired HuggingFace pipeline is cached. Creates the pipeline if it's not cached.
+    Returns the pipeline.
+    """
+    global _pipelines
+    global _pipelines_lock
+    with _pipelines_lock:
+        if helm_model_name not in _pipelines:
+            huggingface_model_name = pipeline_kwargs["model"]
+            with htrack_block(
+                f"Loading HuggingFace model {huggingface_model_name} (kwargs={pipeline_kwargs}) "
+                f"for HELM model {helm_model_name} with transformers.pipeline"
+            ):
+                _pipelines[helm_model_name] = transformers.pipeline(**pipeline_kwargs)
+
+    return _pipelines[helm_model_name]
+
+
+class HuggingFacePipelineClient(CachingClient):
+    def __init__(
+        self,
+        cache_config: CacheConfig,
+        model_name: str,
+        pretrained_model_name_or_path: Optional[str] = None,
+        apply_chat_template: Optional[bool] = None,
+        **kwargs,
+    ):
+        # Include `pretrained_model_name_or_path` parameter so that model deployments can use
+        # the `pretrained_model_name_or_path` arg to override `model_name`
+        super().__init__(cache_config=cache_config)
+        self._helm_model_name = model_name
+        self._pipeline_kwargs = {
+            "model": pretrained_model_name_or_path or self._helm_model_name,
+            "task": "text-generation",
+            **kwargs,
+        }
+        self._pipeline = _get_pipeline(self._helm_model_name, self._pipeline_kwargs)
+        if apply_chat_template is not None:
+            self._apply_chat_template = apply_chat_template
+        else:
+            # If the user did not explicitly configure whether the model is a chat model with the `apply_chat_template` arg,
+            # auto-infer whether the model is a chat model based on whether the tokenizer has a chat template.
+            # Note: Auto-inference is incorrect for some non-chat models that still have chat templates,
+            # e.g. Qwen2, Qwen 2.5.
+            # For these models, the `apply_chat_template` arg should be explicitly set to False.
+            self._apply_chat_template = bool(self._pipeline.tokenizer.chat_template)
+            hwarn(
+                f"Automatically set `apply_chat_template` to {self._apply_chat_template} based on "
+                "whether the tokenizer has a chat template. "
+                "If this is incorrect, please explicitly set `apply_chat_template`."
+            )
+
+    def make_text_inputs(self, request: Request) -> Union[str, List[Dict[str, str]]]:
+        if request.prompt and request.messages:
+            raise NonRetriableException(f"More than one of `prompt` and `messages` was set in request: {request}")
+        # Chat model expects a list of messages as input
+        if self._apply_chat_template:
+            if request.messages:
+                return request.messages
+            else:
+                return [{"role": "user", "content": request.prompt}]
+        # Base non-chat model expects a string as input
+        else:
+            if request.messages:
+                raise NonRetriableException("Chat messages are not supported by a non-chat model")
+            else:
+                return request.prompt
+
+    def make_request(self, request: Request) -> RequestResult:
+        """Make a request"""
+        if request.model != self._helm_model_name:
+            raise NonRetriableException(
+                f"This instance of HuggingFacePipelineClient has loaded model {self._helm_model_name} but the request was for model {request.model}"  # noqa: E501
+            )
+        completions: List[GeneratedOutput] = []
+        do_sample = request.temperature > 0.0
+        raw_request = {
+            "text_inputs": self.make_text_inputs(request),
+            "return_full_text": request.echo_prompt,
+            "temperature": request.temperature if do_sample else None,
+            "num_return_sequences": request.num_completions,
+            "max_new_tokens": request.max_tokens,
+            "top_p": request.top_p,
+            "top_k": request.top_k_per_token if do_sample else None,
+            "do_sample": do_sample,
+            "return_dict_in_generate": True,
+        }
+        if request.stop_sequences:
+            stop_sequence_ids = self._pipeline.tokenizer(
+                request.stop_sequences, return_token_type_ids=False, add_special_tokens=False
+            )
+            if len(stop_sequence_ids.input_ids) == 1 and len(stop_sequence_ids.input_ids[0]) == 1:
+                raw_request["eos_token_id"] = stop_sequence_ids.input_ids[0][0]
+            else:
+                raise NonRetriableException(
+                    "Multiple stop sequences, and stop sequences of multiple tokens, are not yet supported by HuggingFacePipelineClient"  # noqa: E501
+                )
+
+        def do_it() -> Dict[str, Any]:
+            pipeline_outputs = self._pipeline(**raw_request)
+            return {"outputs": pipeline_outputs}
+
+        cache_key = CachingClient.make_cache_key(
+            {"pipeline_kwargs": self._pipeline_kwargs, **raw_request},
+            request,
+        )
+
+        response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
+        for raw_output in response["outputs"]:
+            completions.append(GeneratedOutput(text=raw_output["generated_text"], logprob=0, tokens=[]))
+        return RequestResult(
+            success=True,
+            cached=cached,
+            request_time=response["request_time"],
+            request_datetime=response["request_datetime"],
+            completions=completions,
+            embedding=[],
+        )
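
`make_text_inputs` leans on the fact that `transformers.pipeline("text-generation", ...)` accepts either a plain string or a list of chat messages. A minimal sketch of the two input shapes, with an arbitrary small instruct checkpoint standing in for the deployed model:

import transformers

# Illustrative checkpoint; any text-generation model with a chat template works.
pipe = transformers.pipeline(task="text-generation", model="Qwen/Qwen2.5-0.5B-Instruct")

# Chat-style input: what make_text_inputs returns when _apply_chat_template is True.
chat_output = pipe([{"role": "user", "content": "Say hi."}], max_new_tokens=16)

# Plain-string input: what a base (non-chat) model would receive instead.
text_output = pipe("Say hi.", max_new_tokens=16)

print(chat_output[0]["generated_text"])
print(text_output[0]["generated_text"])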
helm/clients/image_generation/dalle_mini/model/configuration.py CHANGED
@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" DalleBart model configuration """
+"""DalleBart model configuration"""
 import warnings

 from transformers.configuration_utils import PretrainedConfig
helm/clients/image_generation/dalle_mini/model/modeling.py CHANGED
@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" DalleBart model. """
+"""DalleBart model."""

 import math
 from functools import partial
helm/clients/image_generation/dalle_mini/model/processor.py CHANGED
@@ -1,4 +1,4 @@
-""" DalleBart processor """
+"""DalleBart processor"""

 from typing import List

helm/clients/image_generation/dalle_mini/model/tokenizer.py CHANGED
@@ -1,4 +1,4 @@
-""" DalleBart tokenizer """
+"""DalleBart tokenizer"""

 from transformers import BartTokenizerFast
