crfm-helm 0.5.6__py3-none-any.whl → 0.5.8__py3-none-any.whl

This diff shows the changes between two publicly released versions of this package, as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release.

Files changed (311)
  1. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/METADATA +60 -125
  2. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/RECORD +293 -229
  3. helm/benchmark/adaptation/adapter_spec.py +5 -0
  4. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
  5. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
  6. helm/benchmark/annotation/aci_bench_annotator.py +11 -22
  7. helm/benchmark/annotation/air_bench_annotator.py +1 -1
  8. helm/benchmark/annotation/alrage_annotator.py +90 -0
  9. helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
  10. helm/benchmark/annotation/dischargeme_annotator.py +11 -22
  11. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  12. helm/benchmark/annotation/med_dialog_annotator.py +11 -22
  13. helm/benchmark/annotation/medalign_annotator.py +11 -22
  14. helm/benchmark/annotation/medi_qa_annotator.py +11 -22
  15. helm/benchmark/annotation/medication_qa_annotator.py +11 -22
  16. helm/benchmark/annotation/mental_health_annotator.py +11 -22
  17. helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
  18. helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
  19. helm/benchmark/annotation/model_as_judge.py +23 -18
  20. helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
  21. helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
  22. helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
  23. helm/benchmark/metrics/air_bench_metrics.py +3157 -1
  24. helm/benchmark/metrics/alrage_metric.py +35 -0
  25. helm/benchmark/metrics/basic_metrics.py +267 -2
  26. helm/benchmark/metrics/classification_metrics.py +19 -1
  27. helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
  28. helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
  29. helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
  30. helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
  31. helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
  32. helm/benchmark/metrics/comet_metric.py +1 -1
  33. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
  34. helm/benchmark/metrics/copyright_metrics.py +1 -1
  35. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
  36. helm/benchmark/metrics/dry_run_metrics.py +30 -1
  37. helm/benchmark/metrics/efficiency_metrics.py +74 -0
  38. helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
  39. helm/benchmark/metrics/evaluate_reference_metrics.py +300 -1
  40. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
  41. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
  42. helm/benchmark/metrics/ifeval_metrics.py +13 -1
  43. helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
  44. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
  45. helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
  46. helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
  47. helm/benchmark/metrics/language_modeling_metrics.py +13 -1
  48. helm/benchmark/metrics/live_qa_metrics.py +13 -1
  49. helm/benchmark/metrics/llm_jury_metrics.py +13 -1
  50. helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
  51. helm/benchmark/metrics/lmkt_metrics.py +47 -0
  52. helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
  53. helm/benchmark/metrics/medec_metrics.py +25 -2
  54. helm/benchmark/metrics/melt_toxicity_metric.py +1 -1
  55. helm/benchmark/metrics/metric.py +25 -0
  56. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
  57. helm/benchmark/metrics/omni_math_metrics.py +13 -1
  58. helm/benchmark/metrics/seahelm_metrics.py +14 -1
  59. helm/benchmark/metrics/summac/model_summac.py +3 -3
  60. helm/benchmark/metrics/summarization_metrics.py +129 -1
  61. helm/benchmark/metrics/toxicity_metrics.py +31 -1
  62. helm/benchmark/metrics/wildbench_metrics.py +21 -1
  63. helm/benchmark/model_deployment_registry.py +11 -19
  64. helm/benchmark/presentation/create_plots.py +11 -2
  65. helm/benchmark/presentation/schema.py +10 -22
  66. helm/benchmark/presentation/summarize.py +189 -14
  67. helm/benchmark/presentation/taxonomy_info.py +20 -0
  68. helm/benchmark/presentation/test_create_plots.py +4 -1
  69. helm/benchmark/run.py +7 -1
  70. helm/benchmark/run_expander.py +4 -0
  71. helm/benchmark/run_specs/arabic_run_specs.py +191 -0
  72. helm/benchmark/run_specs/bluex_run_specs.py +40 -0
  73. helm/benchmark/run_specs/classic_run_specs.py +2 -55
  74. helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
  75. helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
  76. helm/benchmark/run_specs/heim_run_specs.py +3 -1
  77. helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
  78. helm/benchmark/run_specs/long_context_run_specs.py +48 -1
  79. helm/benchmark/run_specs/medhelm/__init__.py +0 -0
  80. helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
  81. helm/benchmark/run_specs/medhelm_run_specs.py +360 -50
  82. helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
  83. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +5 -11
  84. helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
  85. helm/benchmark/scenarios/air_bench_scenario.py +21 -0
  86. helm/benchmark/scenarios/alghafa_scenario.py +126 -0
  87. helm/benchmark/scenarios/alrage_scenario.py +54 -0
  88. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
  89. helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
  90. helm/benchmark/scenarios/arabic_mmlu_scenario.py +82 -0
  91. helm/benchmark/scenarios/aratrust_scenario.py +95 -0
  92. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
  93. helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
  94. helm/benchmark/scenarios/audio_language/{ultra_suite_asr_classification.py → ultra_suite_asr_classification_scenario.py} +9 -8
  95. helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +99 -0
  96. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +13 -5
  97. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +13 -5
  98. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +13 -5
  99. helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
  100. helm/benchmark/scenarios/bbq_scenario.py +15 -0
  101. helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
  102. helm/benchmark/scenarios/bluex_scenario.py +70 -0
  103. helm/benchmark/scenarios/bold_scenario.py +15 -0
  104. helm/benchmark/scenarios/boolq_scenario.py +20 -0
  105. helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
  106. helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
  107. helm/benchmark/scenarios/clear_scenario.py +23 -0
  108. helm/benchmark/scenarios/cleva_scenario.py +480 -1
  109. helm/benchmark/scenarios/code_scenario.py +28 -0
  110. helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
  111. helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
  112. helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
  113. helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
  114. helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
  115. helm/benchmark/scenarios/commonsense_scenario.py +26 -0
  116. helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
  117. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
  118. helm/benchmark/scenarios/copyright_scenario.py +35 -1
  119. helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
  120. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
  121. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
  122. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
  123. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
  124. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
  125. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
  126. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
  127. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
  128. helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
  129. helm/benchmark/scenarios/disinformation_scenario.py +22 -0
  130. helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
  131. helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
  132. helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
  133. helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
  134. helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
  135. helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
  136. helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
  137. helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
  138. helm/benchmark/scenarios/gpqa_scenario.py +18 -0
  139. helm/benchmark/scenarios/grammar_scenario.py +20 -1
  140. helm/benchmark/scenarios/gsm_scenario.py +15 -0
  141. helm/benchmark/scenarios/headqa_scenario.py +22 -0
  142. helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
  143. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
  144. helm/benchmark/scenarios/ice_scenario.py +21 -1
  145. helm/benchmark/scenarios/ifeval_scenario.py +18 -0
  146. helm/benchmark/scenarios/imdb_scenario.py +15 -0
  147. helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +90 -0
  148. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +1 -1
  149. helm/benchmark/scenarios/koala_scenario.py +21 -1
  150. helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
  151. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
  152. helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
  153. helm/benchmark/scenarios/legal_support_scenario.py +13 -0
  154. helm/benchmark/scenarios/legalbench_scenario.py +20 -0
  155. helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
  156. helm/benchmark/scenarios/lextreme_scenario.py +11 -0
  157. helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
  158. helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
  159. helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
  160. helm/benchmark/scenarios/math_scenario.py +47 -20
  161. helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
  162. helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
  163. helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
  164. helm/benchmark/scenarios/med_qa_scenario.py +14 -0
  165. helm/benchmark/scenarios/medalign_scenario.py +23 -0
  166. helm/benchmark/scenarios/medalign_scenario_helper.py +19 -125
  167. helm/benchmark/scenarios/medbullets_scenario.py +22 -0
  168. helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
  169. helm/benchmark/scenarios/medec_scenario.py +23 -0
  170. helm/benchmark/scenarios/medhallu_scenario.py +23 -0
  171. helm/benchmark/scenarios/medhelm/__init__.py +0 -0
  172. helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
  173. helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
  174. helm/benchmark/scenarios/medi_qa_scenario.py +23 -0
  175. helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
  176. helm/benchmark/scenarios/melt_scenarios.py +2 -2
  177. helm/benchmark/scenarios/mental_health_scenario.py +23 -0
  178. helm/benchmark/scenarios/mimic_bhc_scenario.py +25 -1
  179. helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
  180. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
  181. helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
  182. helm/benchmark/scenarios/mmlu_scenario.py +15 -0
  183. helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
  184. helm/benchmark/scenarios/msmarco_scenario.py +30 -0
  185. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
  186. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
  187. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
  188. helm/benchmark/scenarios/narrativeqa_scenario.py +20 -0
  189. helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
  190. helm/benchmark/scenarios/omni_math_scenario.py +18 -0
  191. helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
  192. helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
  193. helm/benchmark/scenarios/quac_scenario.py +14 -0
  194. helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
  195. helm/benchmark/scenarios/raft_scenario.py +15 -0
  196. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
  197. helm/benchmark/scenarios/scenario.py +31 -0
  198. helm/benchmark/scenarios/seahelm_scenario.py +350 -2
  199. helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
  200. helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
  201. helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
  202. helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
  203. helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
  204. helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
  205. helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
  206. helm/benchmark/scenarios/shc_proxy_scenario.py +22 -0
  207. helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
  208. helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
  209. helm/benchmark/scenarios/situation_prompts.yaml +49 -0
  210. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
  211. helm/benchmark/scenarios/summarization_scenario.py +37 -0
  212. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
  213. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
  214. helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
  215. helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
  216. helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
  217. helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
  218. helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
  219. helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
  220. helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
  221. helm/benchmark/scenarios/the_pile_scenario.py +13 -1
  222. helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
  223. helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
  224. helm/benchmark/scenarios/vicuna_scenario.py +21 -1
  225. helm/benchmark/scenarios/wikifact_scenario.py +20 -0
  226. helm/benchmark/scenarios/wildbench_scenario.py +18 -0
  227. helm/benchmark/scenarios/wmt_14_scenario.py +12 -0
  228. helm/benchmark/slurm_jobs.py +1 -2
  229. helm/benchmark/slurm_runner.py +8 -1
  230. helm/benchmark/static/schema_arabic.yaml +271 -0
  231. helm/benchmark/static/schema_classic.yaml +0 -17
  232. helm/benchmark/static/schema_long_context.yaml +24 -6
  233. helm/benchmark/static/schema_medhelm.yaml +36 -0
  234. helm/benchmark/static/schema_slp.yaml +219 -0
  235. helm/benchmark/static_build/assets/index-671a5e06.js +10 -0
  236. helm/benchmark/static_build/assets/index-9352595e.css +1 -0
  237. helm/benchmark/static_build/index.html +2 -2
  238. helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
  239. helm/clients/audio_language/llama_omni/arguments.py +61 -0
  240. helm/clients/audio_language/llama_omni/constants.py +9 -0
  241. helm/clients/audio_language/llama_omni/conversation.py +213 -0
  242. helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
  243. helm/clients/audio_language/llama_omni/model/builder.py +88 -0
  244. helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
  245. helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
  246. helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
  247. helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
  248. helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
  249. helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
  250. helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
  251. helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
  252. helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
  253. helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
  254. helm/clients/audio_language/llama_omni/preprocess.py +295 -0
  255. helm/clients/audio_language/llama_omni/utils.py +202 -0
  256. helm/clients/audio_language/qwen2_5_omni_client.py +19 -7
  257. helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
  258. helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
  259. helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
  260. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
  261. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
  262. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
  263. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
  264. helm/clients/huggingface_client.py +2 -2
  265. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
  266. helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
  267. helm/clients/openai_client.py +33 -20
  268. helm/clients/openai_responses_client.py +34 -8
  269. helm/clients/openrouter_client.py +31 -0
  270. helm/clients/test_huggingface_client.py +3 -3
  271. helm/clients/test_openrouter_client.py +69 -0
  272. helm/clients/together_client.py +48 -13
  273. helm/clients/vertexai_client.py +19 -11
  274. helm/clients/vllm_client.py +43 -7
  275. helm/clients/vllm_granite_thinking_client.py +56 -0
  276. helm/common/critique_request.py +0 -1
  277. helm/common/hierarchical_logger.py +83 -34
  278. helm/common/object_spec.py +23 -8
  279. helm/common/test_logging.py +94 -0
  280. helm/config/model_deployments.yaml +525 -172
  281. helm/config/model_metadata.yaml +185 -10
  282. helm/config/tokenizer_configs.yaml +100 -2
  283. helm/proxy/cli.py +1 -1
  284. helm/proxy/example_queries.py +8 -8
  285. helm/proxy/retry.py +5 -0
  286. helm/proxy/server.py +2 -1
  287. helm/proxy/static/index.css +4 -0
  288. helm/proxy/static/index.js +7 -1
  289. helm/tokenizers/grok_tokenizer.py +2 -0
  290. helm/benchmark/metrics/aci_bench_metrics.py +0 -14
  291. helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
  292. helm/benchmark/metrics/dischargeme_metrics.py +0 -14
  293. helm/benchmark/metrics/med_dialog_metrics.py +0 -14
  294. helm/benchmark/metrics/medalign_metrics.py +0 -14
  295. helm/benchmark/metrics/medi_qa_metrics.py +0 -14
  296. helm/benchmark/metrics/medication_qa_metrics.py +0 -14
  297. helm/benchmark/metrics/mental_health_metrics.py +0 -14
  298. helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
  299. helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
  300. helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
  301. helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
  302. helm/benchmark/metrics/numeracy_metrics.py +0 -72
  303. helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
  304. helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
  305. helm/benchmark/scenarios/numeracy_scenario.py +0 -794
  306. helm/benchmark/static_build/assets/index-94295e78.js +0 -10
  307. helm/benchmark/static_build/assets/index-b9779128.css +0 -1
  308. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/WHEEL +0 -0
  309. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/entry_points.txt +0 -0
  310. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/licenses/LICENSE +0 -0
  311. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/top_level.txt +0 -0
helm/clients/test_openrouter_client.py
@@ -0,0 +1,69 @@
+import os
+import pytest
+import tempfile
+
+from helm.common.cache import BlackHoleCacheConfig, SqliteCacheConfig
+from helm.common.request import Request
+from helm.clients.openrouter_client import OpenRouterClient
+
+from helm.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer
+
+
+class TestOpenRouterClient:
+    def setup_method(self, method):
+        cache_file = tempfile.NamedTemporaryFile(delete=False)
+        self.cache_path: str = cache_file.name
+        self.tokenizer_name = "mistralai/Mistral-7B-v0.1"
+        self.tokenizer = HuggingFaceTokenizer(
+            cache_config=BlackHoleCacheConfig(),
+            tokenizer_name=self.tokenizer_name,
+        )
+
+    def teardown_method(self, method):
+        os.remove(self.cache_path)
+
+    @pytest.mark.parametrize(
+        "model_name,test_input,expected_model",
+        [
+            (
+                "mistralai/mistral-medium-3.1",
+                Request(
+                    model="mistralai/mistral-medium-3.1",
+                    model_deployment="openrouter/mistral-medium-3.1",
+                ),
+                "mistralai/mistral-medium-3.1",
+            ),
+            (
+                None,
+                Request(model="openai/gpt-oss-20b:free", model_deployment="openrouter/gpt-oss-20b:free"),
+                "openai/gpt-oss-20b:free",
+            ),
+        ],
+    )
+    def test_get_model_for_request(self, model_name, test_input, expected_model):
+        client = OpenRouterClient(
+            tokenizer_name=self.tokenizer_name,
+            tokenizer=self.tokenizer,
+            cache_config=SqliteCacheConfig(self.cache_path),
+            model_name=model_name,
+            api_key="test_key",
+        )
+        assert client._get_model_for_request(test_input) == expected_model
+
+    def test_api_key_env_var(self, monkeypatch):
+        monkeypatch.setenv("OPENROUTER_API_KEY", "test_key")
+        client = OpenRouterClient(
+            tokenizer_name=self.tokenizer_name,
+            tokenizer=self.tokenizer,
+            cache_config=SqliteCacheConfig(self.cache_path),
+        )
+        assert client.api_key == "test_key"
+
+    def test_api_key_argument(self):
+        client = OpenRouterClient(
+            tokenizer_name=self.tokenizer_name,
+            tokenizer=self.tokenizer,
+            cache_config=BlackHoleCacheConfig(),
+            api_key="explicit_key",
+        )
+        assert client.api_key == "explicit_key"
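Note: the two parametrized cases pin down how the client resolves the model to send. With model_name=None, the client falls back to the request's model; the first case passes an explicit model_name, which presumably takes precedence (both names match there, so the tests alone do not prove the override). A minimal self-contained sketch of logic consistent with both cases; the function body and the _FakeRequest stand-in are assumptions, only the method name and expected values come from the diff:

    from dataclasses import dataclass
    from typing import Optional

    @dataclass
    class _FakeRequest:  # stand-in for helm.common.request.Request
        model: str

    def get_model_for_request(client_model_name: Optional[str], request: _FakeRequest) -> str:
        # Prefer the client-level override; otherwise fall back to the request's model.
        return client_model_name or request.model

    assert get_model_for_request(None, _FakeRequest(model="openai/gpt-oss-20b:free")) == "openai/gpt-oss-20b:free"
    assert get_model_for_request("mistralai/mistral-medium-3.1", _FakeRequest(model="x")) == "mistralai/mistral-medium-3.1"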
helm/clients/together_client.py
@@ -25,8 +25,6 @@ except ModuleNotFoundError as e:
 class _RewriteRequestTags:
     """Tags that indicate that the request for the model must be rewritten before sending to Together."""
 
-    # TODO: Convert to StrEnum after upgrading to Python 3.11
-
     ADD_EOS_TOKEN_AS_STOP_SEQUENCE = "ADD_EOS_TOKEN_AS_STOP_SEQUENCE"
     """Indicates that the EOS token should be added as an extra stop sequence.
 
@@ -101,7 +99,20 @@ class JobNotFinishedError(TogetherClientError):
     pass
 
 
-def _parse_thinking(input: str) -> Tuple[str, str]:
+def _parse_thinking_deepseek_r1(input: str) -> Tuple[str, str]:
+    """Return a tuple of thinking text and output text."""
+    match = re.match(r"<think>\n(.*)\n</think>\n{0,2}(.*)", input, re.DOTALL)
+    if match:
+        return (match.group(1), match.group(2))
+
+    match = re.match(r"<think>\n?(.*)", input, re.DOTALL)
+    if match:
+        return (match.group(1), "")
+
+    return (input, "")
+
+
+def _parse_thinking_qwen3(input: str) -> Tuple[str, str]:
     """Return a tuple of thinking text and output text."""
     match = re.match(r"<think>\n(.*)\n</think>\n{0,2}(.*)", input, re.DOTALL)
     if match:
@@ -114,6 +125,31 @@ def _parse_thinking(input: str) -> Tuple[str, str]:
     return (input, "")
 
 
+def _parse_thinking_glm_4_5(input: str) -> Tuple[str, str]:
+    """Return a tuple of thinking text and output text."""
+    match = re.match(r"\n<think>(.*)</think>(.*)", input, re.DOTALL)
+    if match:
+        return (match.group(1), match.group(2))
+
+    match = re.match(r"\n<think>(.*)", input, re.DOTALL)
+    if match:
+        return (match.group(1), "")
+
+    return (input, "")
+
+
+def _parse_thinking(input: str, model_name: str) -> Tuple[str, str]:
+    # TODO: Come up with a more sustainable extensible way of doing this.
+    if "deepseek-r1" in model_name:
+        return _parse_thinking_deepseek_r1(input)
+    elif "qwen3" in model_name:
+        return _parse_thinking_qwen3(input)
+    elif "glm-4.5" in model_name:
+        return _parse_thinking_glm_4_5(input)
+    else:
+        raise Exception(f"No thinking parser available for model {model_name}")
+
+
 class TogetherClient(CachingClient):
     """
     Client for the models where we evaluate offline. Since the queries are handled offline, the `TogetherClient` just
@@ -348,9 +384,8 @@ class TogetherChatClient(CachingClient):
         self._client = Together(api_key=api_key)
         self._together_model = together_model
         self._disable_logprobs = bool(disable_logprobs)
-        # self.output_processor is actually a function, not a class
         self._parse_thinking = bool(parse_thinking)
-
+        # self.output_processor is actually a function, not a class
         self.output_processor: Optional[Callable[[str], str]] = (
             get_class_by_name(output_processor) if output_processor else None
         )
@@ -446,15 +481,15 @@ class TogetherChatClient(CachingClient):
             if self.output_processor:
                 output_text = self.output_processor(output_text)
 
+            thinking: Optional[Thinking] = None
             if self._parse_thinking:
-                thinking_text, output_text = _parse_thinking(output_text)
-                generated_outputs.append(
-                    GeneratedOutput(
-                        text=output_text, logprob=logprob, tokens=tokens, thinking=Thinking(text=thinking_text)
-                    )
-                )
-            else:
-                generated_outputs.append(GeneratedOutput(text=output_text, logprob=logprob, tokens=tokens))
+                thinking_text, output_text = _parse_thinking(output_text, request.model)
+                thinking = Thinking(text=thinking_text)
+            elif hasattr(choice.message, "reasoning_content"):
+                thinking = Thinking(text=choice.message.reasoning_content)
+            generated_outputs.append(
+                GeneratedOutput(text=output_text, logprob=logprob, tokens=tokens, thinking=thinking)
+            )
         return RequestResult(
             success=True,
             cached=cached,
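Taken together, the together_client.py changes split thinking-token parsing out per model family and dispatch on the model name. A minimal runnable sketch of the DeepSeek-R1 branch, reusing the regex from the diff above; the sample completion string is made up:

    import re
    from typing import Tuple

    def parse_thinking_deepseek_r1(text: str) -> Tuple[str, str]:
        # Splits "<think>\n{thinking}\n</think>\n\n{answer}" into (thinking, answer).
        match = re.match(r"<think>\n(.*)\n</think>\n{0,2}(.*)", text, re.DOTALL)
        if match:
            return (match.group(1), match.group(2))
        return (text, "")  # simplified: the real helper also handles an unterminated <think> block

    print(parse_thinking_deepseek_r1("<think>\nAdd 2 and 2.\n</think>\n\n4"))  # ('Add 2 and 2.', '4')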
helm/clients/vertexai_client.py
@@ -1,7 +1,7 @@
 import requests
 from abc import ABC, abstractmethod
 from threading import Lock
-from typing import Any, Dict, Mapping, Optional, List, Union
+from typing import Any, Dict, Mapping, Optional, List, Union, cast
 
 from helm.common.cache import CacheConfig
 from helm.common.multimodal_request_utils import get_contents_as_bytes
@@ -107,7 +107,7 @@ class VertexAITextClient(VertexAIClient):
 
     def make_request(self, request: Request) -> RequestResult:
         """Make a request"""
-        parameters = {
+        parameters: Dict[str, Any] = {
             "temperature": request.temperature,
             "max_output_tokens": request.max_tokens,
             "top_k": request.top_k_per_token,
@@ -207,21 +207,23 @@ class VertexAIChatClient(VertexAIClient):
 
     def make_request(self, request: Request) -> RequestResult:
         """Make a request"""
-        contents = [request.prompt]
+        # mypy is unhappy without this cast
+        contents: Union[List[Union[str, Image, Part]], List[Content]] = cast(
+            List[Union[str, Image, Part]], [request.prompt]
+        )
 
         # For the multimodal case, build up the content with the media objects of `request.multimodal_prompt`
         if request.multimodal_prompt is not None:
             return self._make_multimodal_request(request)
 
         if request.messages is not None:
-            contents = []
             role_mapping = {"user": "user", "assistant": "model"}
-            for msg in request.messages:
-                contents.append(
-                    Content(role=role_mapping.get(msg["role"], "user"), parts=[Part.from_text(msg["content"])])
-                )
+            contents = [
+                Content(role=role_mapping.get(msg["role"], "user"), parts=[Part.from_text(msg["content"])])
+                for msg in request.messages
+            ]
 
-        parameters = {
+        parameters: Dict[str, Any] = {
             "temperature": request.temperature,
             "max_output_tokens": request.max_tokens,
             "top_k": request.top_k_per_token,
@@ -274,8 +276,14 @@ class VertexAIChatClient(VertexAIClient):
             if not candidate.content:
                 raise VertexAIContentBlockedError(f"No content in candidate: {candidate}")
             if not candidate.content.parts:
-                raise VertexAIContentBlockedError(f"No content parts in candidate: {candidate}")
-            predictions.append({"text": candidate.content.text})
+                if candidate.finish_reason == 2:  # MAX_TOKENS
+                    # This means that there is no text output because the maximum number of tokens were
+                    # reached during thinking.
+                    predictions.append({"text": ""})
+                else:
+                    raise VertexAIContentBlockedError(f"No content parts in candidate: {candidate}")
+            else:
+                predictions.append({"text": candidate.content.text})
         # TODO: Extract more information from the response
         return {"predictions": predictions}
 
helm/clients/vllm_client.py
@@ -2,7 +2,7 @@ from typing import Any, Dict, Optional
 
 from helm.common.cache import CacheConfig
 from helm.common.request import Request
-from helm.clients.openai_client import OpenAILegacyCompletionsClient
+from helm.clients.openai_client import OpenAIClient, OpenAILegacyCompletionsClient
 from helm.tokenizers.tokenizer import Tokenizer
 
 
@@ -19,6 +19,8 @@ class VLLMClient(OpenAILegacyCompletionsClient):
         tokenizer_name: str,
         cache_config: CacheConfig,
         base_url: Optional[str] = None,
+        vllm_model_name: Optional[str] = None,
+        **kwargs,
     ):
         super().__init__(
             tokenizer=tokenizer,
@@ -27,18 +29,52 @@ class VLLMClient(OpenAILegacyCompletionsClient):
             api_key="EMPTY",
             org_id=None,
             base_url=base_url,
+            openai_model_name=vllm_model_name,
+            **kwargs,
         )
         self.tokenizer = tokenizer
         self.tokenizer_name = tokenizer_name
-
-    def _get_model_for_request(self, request: Request) -> str:
-        # The `model` parameter for vLLM should be the whole model name including the creator organization,
-        # unlike OpenAI which only uses the model engine.
-        return request.model
+        self.vllm_model_name = vllm_model_name
 
     def _to_raw_completion_request(self, request: Request) -> Dict[str, Any]:
         raw_request = super()._to_raw_completion_request(request)
         # This avoids the error: best_of must be 1 when using greedy sampling
-        if "best_of" in raw_request and raw_request["best_of"] > 1:
+        if (
+            "temperature" in raw_request
+            and raw_request["temperature"] == 0.0
+            and "best_of" in raw_request
+            and raw_request["best_of"] > 1
+        ):
             raw_request["best_of"] = 1
         return raw_request
+
+
+class VLLMChatClient(OpenAIClient):
+    """Sends request to a vLLM server using the OpenAI-compatible API.
+
+    Only uses the Chat Completions API.
+
+    See: https://docs.vllm.ai/en/latest/getting_started/quickstart.html#openai-compatible-server"""
+
+    def __init__(
+        self,
+        tokenizer: Tokenizer,
+        tokenizer_name: str,
+        cache_config: CacheConfig,
+        base_url: Optional[str] = None,
+        vllm_model_name: Optional[str] = None,
+        **kwargs,
+    ):
+        super().__init__(
+            tokenizer=tokenizer,
+            tokenizer_name=tokenizer_name,
+            cache_config=cache_config,
+            api_key="EMPTY",
+            org_id=None,
+            base_url=base_url,
+            openai_model_name=vllm_model_name,
+            **kwargs,
+        )
+        self.tokenizer = tokenizer
+        self.tokenizer_name = tokenizer_name
+        self.vllm_model_name = vllm_model_name
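For orientation, the new VLLMChatClient is constructed like any OpenAI-compatible client but pointed at a local server. A minimal wiring sketch; the base URL and model name are placeholders, and the BlackHoleCacheConfig/HuggingFaceTokenizer usage mirrors the test file earlier in this diff:

    from helm.common.cache import BlackHoleCacheConfig
    from helm.clients.vllm_client import VLLMChatClient
    from helm.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer

    model = "meta-llama/Llama-3.1-8B-Instruct"  # placeholder model name
    client = VLLMChatClient(
        tokenizer=HuggingFaceTokenizer(cache_config=BlackHoleCacheConfig(), tokenizer_name=model),
        tokenizer_name=model,
        cache_config=BlackHoleCacheConfig(),
        base_url="http://localhost:8000/v1",  # vLLM's OpenAI-compatible endpoint
        vllm_model_name=model,  # forwarded to the parent as openai_model_name
    )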
helm/clients/vllm_granite_thinking_client.py
@@ -0,0 +1,56 @@
+from dataclasses import replace
+import re
+from typing import Any, Dict, List, Tuple
+
+from helm.clients.vllm_client import VLLMChatClient
+from helm.common.request import GeneratedOutput, Request, RequestResult, Thinking
+
+
+class VLLMGraniteThinkingClient(VLLMChatClient):
+    """Sends request to a Granite model on vLLM server with thinking enabled.
+
+    From vLLM documentation at
+    https://docs.vllm.ai/en/v0.9.1/features/reasoning_outputs.html
+
+    IBM Granite 3.2 reasoning is disabled by default;
+    to enable it, you must also pass thinking=True in your chat_template_kwargs.
+    """
+
+    def _make_chat_raw_request(self, request: Request) -> Dict[str, Any]:
+        raw_request = super()._make_chat_raw_request(request)
+        raw_request["extra_body"] = {"chat_template_kwargs": {"thinking": True}}
+        return raw_request
+
+    def _parse_thinking(self, input: str) -> Tuple[str, str]:
+        """Return a tuple of thinking text and output text."""
+        match = re.match(r"<think>(.*)</think>\s*<response>(.*)</response>", input, re.DOTALL)
+        if match:
+            return (match.group(1), match.group(2))
+
+        match = re.match(r"<think>(.*)</think>\s*<response>(.*)", input, re.DOTALL)
+        if match:
+            return (match.group(1), match.group(2))
+
+        match = re.match(r"<think>(.*)</think>\s*", input, re.DOTALL)
+        if match:
+            return (match.group(1), "")
+
+        match = re.match(r"<think>(.*)", input, re.DOTALL)
+        if match:
+            return (match.group(1), "")
+
+        return (input, "")
+
+    def _make_chat_request(self, request: Request) -> RequestResult:
+        request_result = super()._make_chat_request(request)
+        modified_completions: List[GeneratedOutput] = []
+        for completion in request_result.completions:
+            thinking, modified_text = self._parse_thinking(completion.text)
+            modified_completions.append(
+                replace(
+                    completion,
+                    text=modified_text,
+                    thinking=Thinking(text=thinking),
+                )
+            )
+        return replace(request_result, completions=modified_completions)
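The Granite parser expects reasoning wrapped in <think> tags followed by a <response> block. A quick standalone check of the first regex branch above on a made-up completion:

    import re

    text = "<think>Weigh the options.</think> <response>Option B.</response>"
    match = re.match(r"<think>(.*)</think>\s*<response>(.*)</response>", text, re.DOTALL)
    assert match is not None
    print(match.group(1))  # Weigh the options.
    print(match.group(2))  # Option B.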
helm/common/critique_request.py
@@ -6,7 +6,6 @@ from helm.common.media_object import MediaObject
 class QuestionType:
     """String enum of question types."""
 
-    # TODO: Make this a StrEnum after upgrading to Python 3.11
     MULTIPLE_CHOICE: str = "multiple_choice"
     CHECKBOX: str = "checkbox"
     FREE_RESPONSE: str = "free_response"
helm/common/hierarchical_logger.py
@@ -1,4 +1,7 @@
 import logging
+import logging.config
+import yaml
+import os
 import sys
 import time
 from typing import Any, Callable, List, Optional
@@ -34,22 +37,31 @@ class HierarchicalLogger(object):
     def indent(self) -> str:
         return " " * len(self.start_times)
 
-    def track_begin(self, x: Any) -> None:
-        self.logger.info(self.indent() + str(x) + " {")
+    def track_begin(self, x: Any, **kwargs) -> None:
+        kwargs["stacklevel"] = kwargs.get("stacklevel", 1) + 1
+        self.logger.info(self.indent() + str(x) + " {", **kwargs)
         sys.stdout.flush()
         self.start_times.append(time.time())
 
-    def track_end(self) -> None:
+    def track_end(self, **kwargs) -> None:
+        kwargs["stacklevel"] = kwargs.get("stacklevel", 1) + 1
         t = time.time() - self.start_times.pop()
-        self.logger.info(self.indent() + "} [%s]" % (format_time(t)))
+        self.logger.info(self.indent() + "} [%s]" % (format_time(t)), **kwargs)
         sys.stdout.flush()
 
-    def log(self, x: Any) -> None:
-        self.logger.info(self.indent() + str(x))
+    def log(self, x: Any, **kwargs) -> None:
+        kwargs["stacklevel"] = kwargs.get("stacklevel", 1) + 1
+        self.logger.info(self.indent() + str(x), **kwargs)
         sys.stdout.flush()
 
-    def warn(self, x: Any) -> None:
-        self.logger.warning(self.indent() + str(x))
+    def debug(self, x: Any, **kwargs) -> None:
+        kwargs["stacklevel"] = kwargs.get("stacklevel", 1) + 1
+        self.logger.debug(self.indent() + str(x), **kwargs)
+        sys.stdout.flush()
+
+    def warn(self, x: Any, **kwargs) -> None:
+        kwargs["stacklevel"] = kwargs.get("stacklevel", 1) + 1
+        self.logger.warning(self.indent() + str(x), **kwargs)
         sys.stdout.flush()
 
 
@@ -69,23 +81,31 @@ singleton = HierarchicalLogger()
 # Exposed public methods
 
 
-def hlog(x: Any) -> None:
-    singleton.log(x)
+def hdebug(x: Any, **kwargs) -> None:
+    kwargs["stacklevel"] = kwargs.get("stacklevel", 1) + 1
+    singleton.debug(x, **kwargs)
+
+
+def hlog(x: Any, **kwargs) -> None:
+    kwargs["stacklevel"] = kwargs.get("stacklevel", 1) + 1
+    singleton.log(x, **kwargs)
 
 
-def hwarn(x: Any) -> None:
-    singleton.warn(x)
+def hwarn(x: Any, **kwargs) -> None:
+    kwargs["stacklevel"] = kwargs.get("stacklevel", 1) + 1
+    singleton.warn(x, **kwargs)
 
 
 class htrack_block:
-    def __init__(self, x: Any) -> None:
+    def __init__(self, x: Any, stacklevel=1) -> None:
+        self._stacklevel = stacklevel + 1
         self.x = x
 
     def __enter__(self) -> None:
-        singleton.track_begin(self.x)
+        singleton.track_begin(self.x, stacklevel=self._stacklevel)
 
     def __exit__(self, tpe: Any, value: Any, callback: Any) -> None:
-        singleton.track_end()
+        singleton.track_end(stacklevel=self._stacklevel)
 
 
 class htrack:
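The stacklevel bookkeeping threaded through these wrappers exists so that log records attribute to the caller of hlog/hwarn rather than to the wrapper frames themselves. A standalone illustration of the mechanism (Python 3.8+; the format string is illustrative, not HELM's):

    import logging
    import sys

    logging.basicConfig(stream=sys.stdout, format="%(filename)s:%(lineno)d %(message)s")

    def hlog(x, **kwargs):
        # Without bumping stacklevel, every record would point at the warning() call below.
        kwargs["stacklevel"] = kwargs.get("stacklevel", 1) + 1
        logging.getLogger().warning(str(x), **kwargs)

    hlog("hello")  # the record reports this line, not the line inside hlog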
@@ -116,34 +136,63 @@ class htrack:
                     description = description.replace("$" + k, str(v))
             else:
                 description = ""
-            with htrack_block(parent + fn.__name__ + description):
+            with htrack_block(parent + fn.__name__ + description, stacklevel=2):
                 return fn(*args, **kwargs)
 
         return wrapper
 
 
-def setup_default_logging():
+def setup_default_logging(config_path: Optional[str] = None):
     """
-    Setup a default logger to STDOUT for HELM via Python logging
-    """
-    formatter = ColoredFormatter(
-        "%(bold_black)s%(asctime)s%(reset)s %(log_color)s%(levelname)-8s%(reset)s %(message)s",
-        datefmt="%Y-%m-%dT%H:%M:%S",
-        reset=True,
-        log_colors={
-            "DEBUG": "cyan",
-            "INFO": "green",
-            "WARNING": "yellow",
-            "ERROR": "red",
-            "CRITICAL": "red,bg_white",
-        },
-        secondary_log_colors={},
-        style="%",
-    )
+    Setup Python logging for HELM
 
+    Priority:
+    1. External config file (YAML or JSON).
+    2. ENV var LOG_LEVEL.
+    3. a default logger to STDOUT
+    """
     logger = logging.getLogger("helm")
-    logger.setLevel(logging.INFO)
     logger.propagate = False
+
+    if config_path and os.path.exists(config_path):
+        with open(config_path, "r") as f:
+            config = yaml.safe_load(f)
+        logging.config.dictConfig(config)
+        hdebug("setup custom HELM logging")
+        return
+
+    log_level = (os.getenv("HELM_LOG_LEVEL") or os.getenv("LOG_LEVEL") or "INFO").upper()
+    try:
+        logger.setLevel(getattr(logging, log_level))
+    except AttributeError:
+        logger.setLevel(logging.INFO)
+
+    # Set formatter
+    formatter: Optional[logging.Formatter] = None
+    if sys.stdout.isatty():
+        try:
+            formatter = ColoredFormatter(
+                "%(bold_black)s%(asctime)s%(reset)s %(log_color)s%(levelname)-8s%(reset)s %(message)s",
+                datefmt="%Y-%m-%dT%H:%M:%S",
+                reset=True,
+                log_colors={
+                    "DEBUG": "cyan",
+                    "INFO": "green",
+                    "WARNING": "yellow",
+                    "ERROR": "red",
+                    "CRITICAL": "red,bg_white",
+                },
+                style="%",
+            )
+        except ImportError:
+            pass
+
+    if formatter is None:
+        # fallback
+        formatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
+
+    # Add default stdout handler
     handler = logging.StreamHandler(sys.stdout)
     handler.setFormatter(formatter)
     logger.addHandler(handler)
+    hdebug("setup default HELM logging")
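Because setup_default_logging now passes the parsed YAML straight to logging.config.dictConfig, a config file is just a standard dictConfig document. A minimal hypothetical example, embedded here as Python for self-containment (the handler and format choices are illustrative, not shipped defaults):

    import logging.config
    import yaml

    config = yaml.safe_load(
        """
        version: 1
        disable_existing_loggers: false
        formatters:
          plain:
            format: "%(asctime)s %(levelname)-8s %(message)s"
        handlers:
          stdout:
            class: logging.StreamHandler
            formatter: plain
            stream: ext://sys.stdout
        loggers:
          helm:
            level: DEBUG
            handlers: [stdout]
            propagate: false
        """
    )
    logging.config.dictConfig(config)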
helm/common/object_spec.py
@@ -55,14 +55,23 @@ def inject_object_spec_args(
     This is loosely based on instance (constant) bindings and provider bindings in Guice dependency injection.
 
     Example:
-
-        class MyClass:
-            def __init__(a: int, b: int, c: int, d: int = 0):
-                pass
-
-        old_object_spec = ObjectSpec(class_name="MyClass", args={"a": 11})
-        new_object_spec = inject_object_spec_args(old_object_spec, {"b": 12}, {"c": lambda: 13})
-        # new_object_spec is now ObjectSpec(class_name="MyClass", args={"a": 11, "b": 12, "c": 13})
+    >>> from helm.common.object_spec import *  # NOQA
+    >>> import sys, types
+    >>> # Given a custom class with hashable arguments
+    >>> class MyClass:
+    ...     def __init__(a: int, b: int, c: int, d: int = 0):
+    ...         pass
+    >>> #
+    >>> # <boilerplate>: make a dummy module for MyClass to make this doctest exectuable
+    >>> sys.modules["my_module"] = type("MyModule", (types.ModuleType,), {"MyClass": MyClass})("my_module")
+    >>> # </boilerplate>
+    >>> #
+    >>> # Define new style and old style object specs
+    >>> old_object_spec = ObjectSpec(class_name="my_module.MyClass", args={"a": 11})
+    >>> new_object_spec = inject_object_spec_args(old_object_spec, {"b": 12}, {"c": lambda: 13})
+    >>> # new_object_spec is now
+    >>> print(new_object_spec)
+    ObjectSpec(class_name='my_module.MyClass', args={'a': 11, 'b': 12, 'c': 13})
     """
     cls = get_class_by_name(spec.class_name)
     init_signature = inspect.signature(cls.__init__)
@@ -93,6 +102,12 @@ def parse_object_spec(description: str) -> ObjectSpec:
         <class_name>:<key>=<value>,<key>=<value>
     Usually, the description is something that's succinct and can be typed on the command-line.
     Here, value defaults to string.
+
+    Example:
+    >>> from helm.common.object_spec import *  # NOQA
+    >>> description = 'mscoco:model=huggingface_stable-diffusion-v1-4'
+    >>> parse_object_spec(description)
+    ObjectSpec(class_name='mscoco', args={'model': 'huggingface_stable-diffusion-v1-4'})
     """
 
     def parse_arg(arg: str) -> Tuple[str, Any]: