crfm-helm 0.5.5__py3-none-any.whl → 0.5.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (268)
  1. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/METADATA +74 -53
  2. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/RECORD +262 -182
  3. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +3 -3
  5. helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
  6. helm/benchmark/annotation/air_bench_annotator.py +2 -2
  7. helm/benchmark/annotation/bigcodebench_annotator.py +3 -3
  8. helm/benchmark/annotation/bird_sql_annotator.py +2 -2
  9. helm/benchmark/annotation/chw_care_plan_annotator.py +7 -12
  10. helm/benchmark/annotation/ehr_sql_annotator.py +2 -2
  11. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +7 -7
  12. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  13. helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
  14. helm/benchmark/annotation/model_as_judge.py +12 -16
  15. helm/benchmark/annotation/omni_math_annotator.py +13 -14
  16. helm/benchmark/annotation/wildbench_annotator.py +9 -9
  17. helm/benchmark/executor.py +11 -12
  18. helm/benchmark/metrics/aci_bench_metrics.py +9 -29
  19. helm/benchmark/metrics/bias_word_lists.py +1 -1
  20. helm/benchmark/metrics/chw_care_plan_metrics.py +10 -30
  21. helm/benchmark/metrics/classification_metrics.py +3 -3
  22. helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
  23. helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
  24. helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
  25. helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
  26. helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
  27. helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
  28. helm/benchmark/metrics/comet_metric.py +1 -1
  29. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +2 -2
  30. helm/benchmark/metrics/copyright_metrics.py +1 -1
  31. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
  32. helm/benchmark/metrics/dischargeme_metrics.py +9 -29
  33. helm/benchmark/metrics/efficiency_metrics.py +3 -3
  34. helm/benchmark/metrics/evaluate_reference_metrics.py +1 -1
  35. helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
  36. helm/benchmark/metrics/ifeval_metrics.py +2 -2
  37. helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
  38. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
  39. helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
  40. helm/benchmark/metrics/llm_jury_metrics.py +46 -0
  41. helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
  42. helm/benchmark/metrics/lmkt_metrics.py +47 -0
  43. helm/benchmark/metrics/med_dialog_metrics.py +9 -29
  44. helm/benchmark/metrics/medalign_metrics.py +9 -29
  45. helm/benchmark/metrics/medi_qa_metrics.py +9 -29
  46. helm/benchmark/metrics/medication_qa_metrics.py +10 -30
  47. helm/benchmark/metrics/melt_bias_metric.py +234 -0
  48. helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
  49. helm/benchmark/metrics/melt_metric_specs.py +43 -0
  50. helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
  51. helm/benchmark/metrics/mental_health_metrics.py +9 -29
  52. helm/benchmark/metrics/metric_service.py +11 -11
  53. helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
  54. helm/benchmark/metrics/mimic_rrs_metrics.py +9 -29
  55. helm/benchmark/metrics/mtsamples_procedures_metrics.py +9 -29
  56. helm/benchmark/metrics/mtsamples_replicate_metrics.py +9 -29
  57. helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
  58. helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
  59. helm/benchmark/metrics/starr_patient_instructions_metrics.py +9 -29
  60. helm/benchmark/metrics/summac/model_summac.py +2 -3
  61. helm/benchmark/metrics/summarization_metrics.py +2 -1
  62. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +2 -2
  63. helm/benchmark/metrics/toxicity_metrics.py +2 -2
  64. helm/benchmark/metrics/unitxt_metrics.py +3 -4
  65. helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
  66. helm/benchmark/metrics/vision_language/image_utils.py +2 -2
  67. helm/benchmark/model_deployment_registry.py +16 -26
  68. helm/benchmark/presentation/contamination.py +3 -3
  69. helm/benchmark/presentation/create_plots.py +43 -13
  70. helm/benchmark/presentation/run_display.py +13 -0
  71. helm/benchmark/presentation/schema.py +7 -1
  72. helm/benchmark/presentation/summarize.py +84 -61
  73. helm/benchmark/presentation/test_create_plots.py +4 -1
  74. helm/benchmark/reeval_run.py +3 -4
  75. helm/benchmark/reeval_runner.py +3 -3
  76. helm/benchmark/run.py +84 -73
  77. helm/benchmark/run_expander.py +12 -1
  78. helm/benchmark/run_spec_factory.py +7 -6
  79. helm/benchmark/run_specs/arabic_run_specs.py +73 -0
  80. helm/benchmark/run_specs/audio_run_specs.py +52 -8
  81. helm/benchmark/run_specs/bluex_run_specs.py +40 -0
  82. helm/benchmark/run_specs/classic_run_specs.py +0 -53
  83. helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
  84. helm/benchmark/run_specs/enterprise_run_specs.py +20 -0
  85. helm/benchmark/run_specs/experimental_run_specs.py +31 -1
  86. helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
  87. helm/benchmark/run_specs/heim_run_specs.py +3 -1
  88. helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
  89. helm/benchmark/run_specs/long_context_run_specs.py +114 -15
  90. helm/benchmark/run_specs/medhelm_run_specs.py +146 -41
  91. helm/benchmark/run_specs/melt_run_specs.py +783 -0
  92. helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
  93. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +163 -0
  94. helm/benchmark/run_specs/vlm_run_specs.py +28 -0
  95. helm/benchmark/runner.py +5 -5
  96. helm/benchmark/scenarios/aci_bench_scenario.py +7 -1
  97. helm/benchmark/scenarios/alghafa_scenario.py +126 -0
  98. helm/benchmark/scenarios/arabic_mmlu_scenario.py +78 -0
  99. helm/benchmark/scenarios/aratrust_scenario.py +76 -0
  100. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +3 -1
  101. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +5 -5
  102. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
  103. helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
  104. helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
  105. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +104 -0
  106. helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +99 -0
  107. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +118 -0
  108. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +86 -0
  109. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +117 -0
  110. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +15 -1
  111. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +1 -2
  112. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +2 -2
  113. helm/benchmark/scenarios/bluex_scenario.py +66 -0
  114. helm/benchmark/scenarios/chw_care_plan_scenario.py +14 -13
  115. helm/benchmark/scenarios/clear_scenario.py +11 -7
  116. helm/benchmark/scenarios/cleva_scenario.py +1 -1
  117. helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
  118. helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
  119. helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
  120. helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
  121. helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
  122. helm/benchmark/scenarios/dischargeme_scenario.py +36 -21
  123. helm/benchmark/scenarios/ehr_sql_scenario.py +7 -1
  124. helm/benchmark/scenarios/ehrshot_scenario.py +28 -55
  125. helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
  126. helm/benchmark/scenarios/grammar.py +2 -2
  127. helm/benchmark/scenarios/headqa_scenario.py +6 -1
  128. helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
  129. helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +90 -0
  130. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
  131. helm/benchmark/scenarios/{infinite_bench_sum_scenario.py → infinite_bench_en_sum_scenario.py} +10 -13
  132. helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
  133. helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
  134. helm/benchmark/scenarios/math_scenario.py +21 -20
  135. helm/benchmark/scenarios/med_dialog_scenario.py +6 -1
  136. helm/benchmark/scenarios/medalign_scenario.py +9 -3
  137. helm/benchmark/scenarios/medalign_scenario_helper.py +27 -130
  138. helm/benchmark/scenarios/medbullets_scenario.py +7 -2
  139. helm/benchmark/scenarios/medcalc_bench_scenario.py +4 -2
  140. helm/benchmark/scenarios/medec_scenario.py +6 -1
  141. helm/benchmark/scenarios/medhallu_scenario.py +7 -1
  142. helm/benchmark/scenarios/medi_qa_scenario.py +10 -4
  143. helm/benchmark/scenarios/medication_qa_scenario.py +7 -1
  144. helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
  145. helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
  146. helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
  147. helm/benchmark/scenarios/melt_scenarios.py +793 -0
  148. helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
  149. helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
  150. helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
  151. helm/benchmark/scenarios/mental_health_scenario.py +16 -5
  152. helm/benchmark/scenarios/mimic_bhc_scenario.py +13 -8
  153. helm/benchmark/scenarios/mimic_rrs_scenario.py +17 -8
  154. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +14 -8
  155. helm/benchmark/scenarios/mmlu_pro_scenario.py +1 -1
  156. helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
  157. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +5 -2
  158. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +3 -2
  159. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +11 -5
  160. helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
  161. helm/benchmark/scenarios/pubmed_qa_scenario.py +6 -1
  162. helm/benchmark/scenarios/race_based_med_scenario.py +18 -8
  163. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +2 -2
  164. helm/benchmark/scenarios/ruler_qa_scenarios.py +2 -2
  165. helm/benchmark/scenarios/seahelm_scenario.py +2 -2
  166. helm/benchmark/scenarios/shc_bmt_scenario.py +12 -6
  167. helm/benchmark/scenarios/shc_cdi_scenario.py +11 -6
  168. helm/benchmark/scenarios/shc_conf_scenario.py +12 -6
  169. helm/benchmark/scenarios/shc_ent_scenario.py +11 -6
  170. helm/benchmark/scenarios/shc_gip_scenario.py +13 -5
  171. helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
  172. helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
  173. helm/benchmark/scenarios/shc_ptbm_scenario.py +12 -7
  174. helm/benchmark/scenarios/shc_sei_scenario.py +12 -7
  175. helm/benchmark/scenarios/shc_sequoia_scenario.py +13 -5
  176. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +15 -8
  177. helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
  178. helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
  179. helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
  180. helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
  181. helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
  182. helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
  183. helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
  184. helm/benchmark/scenarios/truthful_qa_scenario.py +2 -1
  185. helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
  186. helm/benchmark/server.py +2 -1
  187. helm/benchmark/slurm_jobs.py +1 -2
  188. helm/benchmark/slurm_runner.py +8 -1
  189. helm/benchmark/static/schema_arabic.yaml +228 -0
  190. helm/benchmark/static/schema_audio.yaml +60 -49
  191. helm/benchmark/static/schema_classic.yaml +0 -17
  192. helm/benchmark/static/schema_enterprise.yaml +21 -0
  193. helm/benchmark/static/schema_long_context.yaml +81 -20
  194. helm/benchmark/static/schema_medhelm.yaml +272 -213
  195. helm/benchmark/static/schema_melt.yaml +1257 -0
  196. helm/benchmark/static/schema_slphelm.yaml +162 -0
  197. helm/benchmark/static/schema_vhelm.yaml +26 -26
  198. helm/benchmark/static/schema_video.yaml +219 -0
  199. helm/benchmark/static_build/assets/index-b9779128.css +1 -0
  200. helm/benchmark/static_build/assets/index-e439d5e1.js +10 -0
  201. helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
  202. helm/benchmark/static_build/assets/{tremor-9cefc3c5.js → tremor-38a10867.js} +1 -1
  203. helm/benchmark/static_build/index.html +4 -4
  204. helm/benchmark/window_services/encoder_decoder_window_service.py +3 -3
  205. helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
  206. helm/benchmark/window_services/test_utils.py +3 -4
  207. helm/benchmark/window_services/tokenizer_service.py +7 -8
  208. helm/clients/anthropic_client.py +69 -29
  209. helm/clients/audio_language/diva_llama_client.py +4 -2
  210. helm/clients/audio_language/qwen2_5_omni_client.py +209 -0
  211. helm/clients/audio_language/qwen2_audiolm_client.py +8 -6
  212. helm/clients/audio_language/qwen_audiolm_client.py +4 -2
  213. helm/clients/audio_language/test.py +62 -0
  214. helm/clients/bedrock_client.py +3 -1
  215. helm/clients/client.py +7 -7
  216. helm/clients/grok_client.py +36 -0
  217. helm/clients/huggingface_client.py +42 -3
  218. helm/clients/huggingface_pipeline_client.py +138 -0
  219. helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
  220. helm/clients/image_generation/dalle_mini/model/modeling.py +1 -1
  221. helm/clients/image_generation/dalle_mini/model/processor.py +1 -1
  222. helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
  223. helm/clients/openai_client.py +102 -55
  224. helm/clients/openai_responses_client.py +176 -0
  225. helm/clients/palmyra_client.py +2 -5
  226. helm/clients/reka_client.py +2 -2
  227. helm/clients/test_huggingface_client.py +3 -3
  228. helm/clients/together_client.py +31 -6
  229. helm/clients/vertexai_client.py +17 -9
  230. helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
  231. helm/clients/vision_language/huggingface_vlm_client.py +2 -2
  232. helm/clients/vision_language/idefics_client.py +6 -2
  233. helm/clients/vision_language/paligemma_client.py +2 -2
  234. helm/clients/vision_language/qwen2_vlm_client.py +66 -53
  235. helm/clients/vision_language/qwen_vlm_client.py +7 -5
  236. helm/clients/vllm_client.py +43 -7
  237. helm/clients/vllm_granite_thinking_client.py +56 -0
  238. helm/clients/writer_client.py +102 -0
  239. helm/common/context.py +80 -0
  240. helm/common/credentials_utils.py +5 -5
  241. helm/common/critique_request.py +0 -1
  242. helm/common/general.py +9 -2
  243. helm/common/hierarchical_logger.py +104 -12
  244. helm/common/local_context.py +140 -0
  245. helm/common/object_spec.py +23 -8
  246. helm/common/remote_context.py +61 -0
  247. helm/common/request.py +8 -0
  248. helm/common/test_logging.py +94 -0
  249. helm/config/model_deployments.yaml +995 -45
  250. helm/config/model_metadata.yaml +780 -59
  251. helm/config/tokenizer_configs.yaml +224 -3
  252. helm/proxy/cli.py +4 -2
  253. helm/proxy/critique/mechanical_turk_utils.py +1 -1
  254. helm/proxy/retry.py +5 -0
  255. helm/proxy/services/server_service.py +21 -85
  256. helm/tokenizers/grok_tokenizer.py +55 -0
  257. helm/tokenizers/huggingface_tokenizer.py +1 -1
  258. helm/tokenizers/test_grok_tokenizer.py +33 -0
  259. helm/benchmark/metrics/numeracy_metrics.py +0 -72
  260. helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
  261. helm/benchmark/scenarios/numeracy_scenario.py +0 -793
  262. helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +0 -46
  263. helm/benchmark/static_build/assets/index-262903c1.js +0 -10
  264. helm/benchmark/static_build/assets/index-42060d71.css +0 -1
  265. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/entry_points.txt +0 -0
  266. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/licenses/LICENSE +0 -0
  267. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/top_level.txt +0 -0
  268. /helm/benchmark/static_build/assets/{medhelm-overview-3ddfcd65.png → medhelm-v1-overview-3ddfcd65.png} +0 -0
helm/clients/audio_language/test.py ADDED
@@ -0,0 +1,62 @@
+import soundfile as sf
+
+from transformers import Qwen2_5OmniModel, Qwen2_5OmniProcessor  # type: ignore
+from qwen_omni_utils import process_mm_info
+
+# default: Load the model on the available device(s)
+model = Qwen2_5OmniModel.from_pretrained("Qwen/Qwen2.5-Omni-7B", torch_dtype="auto", device_map="auto")
+
+# We recommend enabling flash_attention_2 for better acceleration and memory saving.
+# model = Qwen2_5OmniModel.from_pretrained(
+#     "Qwen/Qwen2.5-Omni-7B",
+#     torch_dtype="auto",
+#     device_map="auto",
+#     attn_implementation="flash_attention_2",
+# )
+
+processor = Qwen2_5OmniProcessor.from_pretrained("Qwen/Qwen2.5-Omni-7B")
+
+conversation = [
+    {
+        "role": "system",
+        "content": (
+            "You are Qwen, a virtual human developed by the Qwen Team,"
+            " Alibaba Group, capable of perceiving auditory and visual"
+            " inputs, as well as generating text and speech."
+        ),
+    },
+    {
+        "role": "user",
+        "content": [
+            {"type": "video", "video": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-Omni/draw.mp4"},
+        ],
+    },
+]
+
+# set use audio in video
+USE_AUDIO_IN_VIDEO = True
+
+# Preparation for inference
+text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
+audios, images, videos = process_mm_info(conversation, use_audio_in_video=USE_AUDIO_IN_VIDEO)
+inputs = processor(
+    text=text,
+    audios=audios,
+    images=images,
+    videos=videos,
+    return_tensors="pt",
+    padding=True,
+    use_audio_in_video=USE_AUDIO_IN_VIDEO,
+)
+inputs = inputs.to(model.device).to(model.dtype)
+
+# Inference: Generation of the output text and audio
+text_ids, audio = model.generate(**inputs, use_audio_in_video=USE_AUDIO_IN_VIDEO)
+
+text = processor.batch_decode(text_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
+print(text)
+sf.write(
+    "output.wav",
+    audio.reshape(-1).detach().cpu().numpy(),
+    samplerate=24000,
+)
helm/clients/bedrock_client.py CHANGED
@@ -117,10 +117,12 @@ class BedrockNovaClient(CachingClient):
         tokenizer_name: str,
         assumed_role: Optional[str] = None,
         region: Optional[str] = None,
+        bedrock_model_id: Optional[str] = None,
     ):
         super().__init__(cache_config=cache_config)
         self.tokenizer = tokenizer
         self.tokenizer_name = tokenizer_name
+        self.bedrock_model_id = bedrock_model_id
         self.bedrock_client = get_bedrock_client_v1(
             assumed_role=assumed_role or os.environ.get("BEDROCK_ASSUME_ROLE", None),
             region=region,
@@ -144,7 +146,7 @@ class BedrockNovaClient(CachingClient):
         messages = self._get_messages_from_request(request)

         return {
-            "modelId": model_id,
+            "modelId": self.bedrock_model_id or model_id,
             "inferenceConfig": {
                 "temperature": request.temperature,
                 "maxTokens": request.max_tokens,
helm/clients/client.py CHANGED
@@ -2,7 +2,7 @@ import json
 from abc import ABC, abstractmethod
 from typing import List, Mapping, Optional, cast

-from helm.common.hierarchical_logger import hlog
+from helm.common.hierarchical_logger import hwarn
 from helm.common.media_object import MultimediaObject, TEXT_TYPE
 from helm.common.request import Request, RequestResult, GeneratedOutput, Token
 from helm.common.cache import Cache, CacheConfig
@@ -65,7 +65,7 @@ def truncate_sequence(
     # where max_tokens = 0, so there's nothing to truncate.
     if request.echo_prompt:
         if request.max_tokens != 0:
-            hlog("WARNING: don't know how to handle echo_prompt and max_tokens > 0, not truncating")
+            hwarn("don't know how to handle echo_prompt and max_tokens > 0, not truncating")
         return sequence

     if end_of_text_token:
@@ -90,8 +90,8 @@ def truncate_sequence(
            new_tokens.append(token)

    if len(new_text) < len(sequence.text) and len(new_tokens) == len(sequence.tokens):
-        hlog(
-            f"WARNING: Stripped characters from text ({len(sequence.text)} -> {len(new_text)}), "
+        hwarn(
+            f"Stripped characters from text ({len(sequence.text)} -> {len(new_text)}), "
            f"but wasn't able to strip the tokens"
        )

@@ -99,14 +99,14 @@ def truncate_sequence(
        new_logprob = sum(token.logprob for token in new_tokens)

        if print_warning:
-            hlog(f"WARNING: truncate_sequence needs to strip {json.dumps(stop)}")
+            hwarn(f"truncate_sequence needs to strip {json.dumps(stop)}")

        sequence = GeneratedOutput(text=new_text, logprob=new_logprob, tokens=new_tokens)

    # Truncate based on the max number of tokens.
    if len(sequence.tokens) > request.max_tokens:
        if print_warning:
-            hlog(f"WARNING: truncate_sequence needs to truncate {len(sequence.tokens)} down to {request.max_tokens}")
+            hwarn(f"truncate_sequence needs to truncate {len(sequence.tokens)} down to {request.max_tokens}")
        new_tokens = sequence.tokens[: request.max_tokens]

        # This is imperfect stitching together of tokens, so just to make sure this is okay
@@ -114,7 +114,7 @@
        # Usually, in our benchmark, max_tokens is active when it's 1, so hopefully this isn't an issue.
        new_text = "".join(token.text for token in new_tokens)
        if not sequence.text.startswith(new_text):
-            hlog(f"WARNING: {json.dumps(sequence.text)} does not start with truncated text {json.dumps(new_text)}")
+            hwarn(f"{json.dumps(sequence.text)} does not start with truncated text {json.dumps(new_text)}")

        new_logprob = sum(token.logprob for token in new_tokens)

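Every call site above follows one pattern: `hlog("WARNING: ...")` becomes `hwarn("...")`. A minimal sketch of what that implies `hwarn` does, assuming it only centralizes the prefix; the real helper lives in `helm/common/hierarchical_logger.py` (changed elsewhere in this release, +104 -12) and may do more:

from helm.common.hierarchical_logger import hlog


def hwarn(message: str) -> None:
    # Hypothetical reconstruction: callers previously wrote hlog("WARNING: ...")
    # by hand; hwarn centralizes the prefix so warnings are formatted consistently.
    hlog(f"WARNING: {message}")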
helm/clients/grok_client.py ADDED
@@ -0,0 +1,36 @@
+from typing import Any, Dict, Optional
+
+from helm.clients.openai_client import OpenAIClient
+from helm.common.cache import CacheConfig
+from helm.common.request import Request
+from helm.tokenizers.tokenizer import Tokenizer
+
+
+class GrokChatClient(OpenAIClient):
+
+    BASE_URL = "https://api.x.ai/v1"
+
+    _UNSUPPORTED_ARGUMENTS = ["presence_penalty", "frequency_penalty"]
+
+    def __init__(
+        self,
+        tokenizer: Tokenizer,
+        tokenizer_name: str,
+        cache_config: CacheConfig,
+        api_key: Optional[str] = None,
+    ):
+        super().__init__(
+            tokenizer=tokenizer,
+            tokenizer_name=tokenizer_name,
+            cache_config=cache_config,
+            api_key=api_key,
+            org_id=None,
+            base_url="https://api.x.ai/v1",
+        )
+
+    def _make_chat_raw_request(self, request: Request) -> Dict[str, Any]:
+        raw_request = super()._make_chat_raw_request(request)
+        for unsupported_argument in self._UNSUPPORTED_ARGUMENTS:
+            if unsupported_argument in raw_request:
+                del raw_request[unsupported_argument]
+        return raw_request
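
`GrokChatClient` is just `OpenAIClient` pointed at xAI's OpenAI-compatible endpoint, minus the two sampling penalties the diff strips out. The same shape can be reproduced with the plain `openai` SDK; a sketch, where the model name is an assumption for illustration and not taken from the diff:

from openai import OpenAI

# xAI exposes an OpenAI-compatible API, so only the base_url and key change.
client = OpenAI(api_key="YOUR_XAI_API_KEY", base_url="https://api.x.ai/v1")

# presence_penalty and frequency_penalty are omitted on purpose: per
# _UNSUPPORTED_ARGUMENTS above, the client deletes them before sending.
response = client.chat.completions.create(
    model="grok-2",  # hypothetical model name for illustration
    messages=[{"role": "user", "content": "Hello, Grok."}],
)
print(response.choices[0].message.content)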
helm/clients/huggingface_client.py CHANGED
@@ -8,7 +8,7 @@ from transformers.generation.stopping_criteria import (
 from typing import Any, Dict, List, Optional, TypedDict

 from helm.common.cache import CacheConfig
-from helm.common.hierarchical_logger import htrack_block, hlog
+from helm.common.hierarchical_logger import htrack_block, hlog, hwarn
 from helm.common.optional_dependencies import handle_module_not_found_error
 from helm.common.request import (
     wrap_request_time,
@@ -18,6 +18,7 @@ from helm.common.request import (
     GeneratedOutput,
     Token,
 )
+from helm.proxy.retry import NonRetriableException
 from helm.tokenizers.tokenizer import Tokenizer
 from helm.clients.client import CachingClient, truncate_sequence
 from helm.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer, WrappedPreTrainedTokenizer
@@ -256,6 +257,7 @@ class HuggingFaceClient(CachingClient):
         tokenizer: Tokenizer,
         pretrained_model_name_or_path: Optional[str] = None,
         end_of_text_token: Optional[str] = None,
+        apply_chat_template: Optional[bool] = None,
         **kwargs,
     ):
         super().__init__(cache_config=cache_config)
@@ -266,9 +268,46 @@ class HuggingFaceClient(CachingClient):
                "but instead it is {tokenizer}"
            )
        self._wrapped_tokenizer: WrappedPreTrainedTokenizer = tokenizer.get_wrapped_tokenizer()
-        self._tokenizer = tokenizer
        self._kwargs = _process_huggingface_client_kwargs(kwargs)
        self._end_of_text_token = end_of_text_token
+        # If the user did not explicitly configure whether the model is a chat model with the `apply_chat_template` arg,
+        # auto-infer whether the model is a chat model based on whether the tokenizer has a chat template.
+        # Note: Auto-inference is incorrect for some non-chat models that still have chat templates,
+        # e.g. Qwen2, Qwen 2.5.
+        # For these models, the `apply_chat_template` arg should be explicitly set to False.
+        if apply_chat_template is not None:
+            self._apply_chat_template = apply_chat_template
+        else:
+            with self._wrapped_tokenizer as hf_tokenizer:
+                self._apply_chat_template = bool(hf_tokenizer.chat_template)
+            hwarn(
+                f"Automatically set `apply_chat_template` to {self._apply_chat_template} based on "
+                "whether the tokenizer has a chat template. "
+                "If this is incorrect, please explicitly set `apply_chat_template`."
+            )
+
+    def get_prompt(self, request: Request) -> str:
+        if request.prompt and request.messages:
+            raise NonRetriableException(f"More than one of `prompt` and `messages` was set in request: {request}")
+        # Chat model expects a list of messages as input
+        if self._apply_chat_template:
+            with self._wrapped_tokenizer as tokenizer:
+                if request.messages:
+                    prompt = tokenizer.apply_chat_template(request.messages, tokenize=False, add_generation_prompt=True)
+                    assert isinstance(prompt, str)
+                    return prompt
+                else:
+                    prompt = tokenizer.apply_chat_template(
+                        [{"role": "user", "content": request.prompt}], tokenize=False, add_generation_prompt=True
+                    )
+                    assert isinstance(prompt, str)
+                    return prompt
+        # Base non-chat model expects a string as input
+        else:
+            if request.messages:
+                raise NonRetriableException("Chat messages are not supported by a non-chat model")
+            else:
+                return request.prompt

    def make_request(self, request: Request) -> RequestResult:
        # Embedding not supported for this model
@@ -277,7 +316,7 @@ class HuggingFaceClient(CachingClient):

        raw_request: HuggingFaceRequest = {
            "engine": request.model_engine,
-            "prompt": request.prompt,
+            "prompt": self.get_prompt(request),
            "temperature": 1e-7 if request.temperature == 0 else request.temperature,
            "num_return_sequences": request.num_completions,
            "max_new_tokens": request.max_tokens,
helm/clients/huggingface_pipeline_client.py ADDED
@@ -0,0 +1,138 @@
+from threading import Lock
+from typing import Any, Dict, List, Optional, Union
+
+import transformers
+
+from helm.clients.client import CachingClient
+from helm.common.cache import CacheConfig
+from helm.common.hierarchical_logger import htrack_block, hwarn
+from helm.common.request import GeneratedOutput, Request, RequestResult, wrap_request_time
+from helm.proxy.retry import NonRetriableException
+
+
+_pipelines: Dict[str, transformers.Pipeline] = {}
+_pipelines_lock: Lock = Lock()
+
+
+def _get_pipeline(
+    helm_model_name: str,
+    pipeline_kwargs: Dict[str, Any],
+) -> Any:
+    """
+    Checks if the desired HuggingFace pipeline is cached. Creates the pipeline if it's not cached.
+    Returns the pipeline.
+    """
+    global _pipelines
+    global _pipelines_lock
+    with _pipelines_lock:
+        if helm_model_name not in _pipelines:
+            huggingface_model_name = pipeline_kwargs["model"]
+            with htrack_block(
+                f"Loading HuggingFace model {huggingface_model_name} (kwargs={pipeline_kwargs}) "
+                f"for HELM model {helm_model_name} with transformers.pipeline"
+            ):
+                _pipelines[helm_model_name] = transformers.pipeline(**pipeline_kwargs)
+
+    return _pipelines[helm_model_name]
+
+
+class HuggingFacePipelineClient(CachingClient):
+    def __init__(
+        self,
+        cache_config: CacheConfig,
+        model_name: str,
+        pretrained_model_name_or_path: Optional[str] = None,
+        apply_chat_template: Optional[bool] = None,
+        **kwargs,
+    ):
+        # Include `pretrained_model_name_or_path` parameter so that model deployments can use
+        # the `pretrained_model_name_or_path` arg to override `model_name`
+        super().__init__(cache_config=cache_config)
+        self._helm_model_name = model_name
+        self._pipeline_kwargs = {
+            "model": pretrained_model_name_or_path or self._helm_model_name,
+            "task": "text-generation",
+            **kwargs,
+        }
+        self._pipeline = _get_pipeline(self._helm_model_name, self._pipeline_kwargs)
+        if apply_chat_template is not None:
+            self._apply_chat_template = apply_chat_template
+        else:
+            # If the user did not explicitly configure whether the model is a chat model with the `apply_chat_template` arg,
+            # auto-infer whether the model is a chat model based on whether the tokenizer has a chat template.
+            # Note: Auto-inference is incorrect for some non-chat models that still have chat templates,
+            # e.g. Qwen2, Qwen 2.5.
+            # For these models, the `apply_chat_template` arg should be explicitly set to False.
+            self._apply_chat_template = bool(self._pipeline.tokenizer.chat_template)
+            hwarn(
+                f"Automatically set `apply_chat_template` to {self._apply_chat_template} based on "
+                "whether the tokenizer has a chat template. "
+                "If this is incorrect, please explicitly set `apply_chat_template`."
+            )
+
+    def make_text_inputs(self, request: Request) -> Union[str, List[Dict[str, str]]]:
+        if request.prompt and request.messages:
+            raise NonRetriableException(f"More than one of `prompt` and `messages` was set in request: {request}")
+        # Chat model expects a list of messages as input
+        if self._apply_chat_template:
+            if request.messages:
+                return request.messages
+            else:
+                return [{"role": "user", "content": request.prompt}]
+        # Base non-chat model expects a string as input
+        else:
+            if request.messages:
+                raise NonRetriableException("Chat messages are not supported by a non-chat model")
+            else:
+                return request.prompt
+
+    def make_request(self, request: Request) -> RequestResult:
+        """Make a request"""
+        if request.model != self._helm_model_name:
+            raise NonRetriableException(
+                f"This instance of HuggingFacePipelineClient has loaded model {self._helm_model_name} but the request was for model {request.model}"  # noqa: E501
+            )
+        completions: List[GeneratedOutput] = []
+        do_sample = request.temperature > 0.0
+        raw_request = {
+            "text_inputs": self.make_text_inputs(request),
+            "return_full_text": request.echo_prompt,
+            "temperature": request.temperature if do_sample else None,
+            "num_return_sequences": request.num_completions,
+            "max_new_tokens": request.max_tokens,
+            "top_p": request.top_p,
+            "top_k": request.top_k_per_token if do_sample else None,
+            "do_sample": do_sample,
+            "return_dict_in_generate": True,
+        }
+        if request.stop_sequences:
+            stop_sequence_ids = self._pipeline.tokenizer(
+                request.stop_sequences, return_token_type_ids=False, add_special_tokens=False
+            )
+            if len(stop_sequence_ids.input_ids) == 1 and len(stop_sequence_ids.input_ids[0]) == 1:
+                raw_request["eos_token_id"] = stop_sequence_ids.input_ids[0][0]
+            else:
+                raise NonRetriableException(
+                    "Multiple stop sequences, and stop sequences of multiple tokens, are not yet supported by HuggingFacePipelineClient"  # noqa: E501
+                )
+
+        def do_it() -> Dict[str, Any]:
+            pipeline_outputs = self._pipeline(**raw_request)
+            return {"outputs": pipeline_outputs}
+
+        cache_key = CachingClient.make_cache_key(
+            {"pipeline_kwargs": self._pipeline_kwargs, **raw_request},
+            request,
+        )
+
+        response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
+        for raw_output in response["outputs"]:
+            completions.append(GeneratedOutput(text=raw_output["generated_text"], logprob=0, tokens=[]))
+        return RequestResult(
+            success=True,
+            cached=cached,
+            request_time=response["request_time"],
+            request_datetime=response["request_datetime"],
+            completions=completions,
+            embedding=[],
+        )
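
`make_text_inputs` leans on the fact that `transformers.pipeline("text-generation", ...)` accepts either a plain string or a list of chat messages. A minimal sketch of the two input shapes, with an arbitrary small instruct checkpoint standing in for the deployed model:

import transformers

# Illustrative checkpoint; any text-generation model with a chat template works.
pipe = transformers.pipeline(task="text-generation", model="Qwen/Qwen2.5-0.5B-Instruct")

# Chat-style input: what make_text_inputs returns when _apply_chat_template is True.
chat_output = pipe([{"role": "user", "content": "Say hi."}], max_new_tokens=16)

# Plain-string input: what a base (non-chat) model would receive instead.
text_output = pipe("Say hi.", max_new_tokens=16)

print(chat_output[0]["generated_text"])
print(text_output[0]["generated_text"])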
helm/clients/image_generation/dalle_mini/model/configuration.py CHANGED
@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" DalleBart model configuration """
+"""DalleBart model configuration"""
 import warnings

 from transformers.configuration_utils import PretrainedConfig
helm/clients/image_generation/dalle_mini/model/modeling.py CHANGED
@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" DalleBart model. """
+"""DalleBart model."""

 import math
 from functools import partial
helm/clients/image_generation/dalle_mini/model/processor.py CHANGED
@@ -1,4 +1,4 @@
-""" DalleBart processor """
+"""DalleBart processor"""

 from typing import List

helm/clients/image_generation/dalle_mini/model/tokenizer.py CHANGED
@@ -1,4 +1,4 @@
-""" DalleBart tokenizer """
+"""DalleBart tokenizer"""

 from transformers import BartTokenizerFast
