crfm-helm 0.5.6__py3-none-any.whl → 0.5.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crfm-helm might be problematic.

Files changed (311)
  1. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/METADATA +60 -125
  2. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/RECORD +293 -229
  3. helm/benchmark/adaptation/adapter_spec.py +5 -0
  4. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
  5. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
  6. helm/benchmark/annotation/aci_bench_annotator.py +11 -22
  7. helm/benchmark/annotation/air_bench_annotator.py +1 -1
  8. helm/benchmark/annotation/alrage_annotator.py +90 -0
  9. helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
  10. helm/benchmark/annotation/dischargeme_annotator.py +11 -22
  11. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  12. helm/benchmark/annotation/med_dialog_annotator.py +11 -22
  13. helm/benchmark/annotation/medalign_annotator.py +11 -22
  14. helm/benchmark/annotation/medi_qa_annotator.py +11 -22
  15. helm/benchmark/annotation/medication_qa_annotator.py +11 -22
  16. helm/benchmark/annotation/mental_health_annotator.py +11 -22
  17. helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
  18. helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
  19. helm/benchmark/annotation/model_as_judge.py +23 -18
  20. helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
  21. helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
  22. helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
  23. helm/benchmark/metrics/air_bench_metrics.py +3157 -1
  24. helm/benchmark/metrics/alrage_metric.py +35 -0
  25. helm/benchmark/metrics/basic_metrics.py +267 -2
  26. helm/benchmark/metrics/classification_metrics.py +19 -1
  27. helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
  28. helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
  29. helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
  30. helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
  31. helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
  32. helm/benchmark/metrics/comet_metric.py +1 -1
  33. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
  34. helm/benchmark/metrics/copyright_metrics.py +1 -1
  35. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
  36. helm/benchmark/metrics/dry_run_metrics.py +30 -1
  37. helm/benchmark/metrics/efficiency_metrics.py +74 -0
  38. helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
  39. helm/benchmark/metrics/evaluate_reference_metrics.py +300 -1
  40. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
  41. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
  42. helm/benchmark/metrics/ifeval_metrics.py +13 -1
  43. helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
  44. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
  45. helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
  46. helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
  47. helm/benchmark/metrics/language_modeling_metrics.py +13 -1
  48. helm/benchmark/metrics/live_qa_metrics.py +13 -1
  49. helm/benchmark/metrics/llm_jury_metrics.py +13 -1
  50. helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
  51. helm/benchmark/metrics/lmkt_metrics.py +47 -0
  52. helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
  53. helm/benchmark/metrics/medec_metrics.py +25 -2
  54. helm/benchmark/metrics/melt_toxicity_metric.py +1 -1
  55. helm/benchmark/metrics/metric.py +25 -0
  56. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
  57. helm/benchmark/metrics/omni_math_metrics.py +13 -1
  58. helm/benchmark/metrics/seahelm_metrics.py +14 -1
  59. helm/benchmark/metrics/summac/model_summac.py +3 -3
  60. helm/benchmark/metrics/summarization_metrics.py +129 -1
  61. helm/benchmark/metrics/toxicity_metrics.py +31 -1
  62. helm/benchmark/metrics/wildbench_metrics.py +21 -1
  63. helm/benchmark/model_deployment_registry.py +11 -19
  64. helm/benchmark/presentation/create_plots.py +11 -2
  65. helm/benchmark/presentation/schema.py +10 -22
  66. helm/benchmark/presentation/summarize.py +189 -14
  67. helm/benchmark/presentation/taxonomy_info.py +20 -0
  68. helm/benchmark/presentation/test_create_plots.py +4 -1
  69. helm/benchmark/run.py +7 -1
  70. helm/benchmark/run_expander.py +4 -0
  71. helm/benchmark/run_specs/arabic_run_specs.py +191 -0
  72. helm/benchmark/run_specs/bluex_run_specs.py +40 -0
  73. helm/benchmark/run_specs/classic_run_specs.py +2 -55
  74. helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
  75. helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
  76. helm/benchmark/run_specs/heim_run_specs.py +3 -1
  77. helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
  78. helm/benchmark/run_specs/long_context_run_specs.py +48 -1
  79. helm/benchmark/run_specs/medhelm/__init__.py +0 -0
  80. helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
  81. helm/benchmark/run_specs/medhelm_run_specs.py +360 -50
  82. helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
  83. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +5 -11
  84. helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
  85. helm/benchmark/scenarios/air_bench_scenario.py +21 -0
  86. helm/benchmark/scenarios/alghafa_scenario.py +126 -0
  87. helm/benchmark/scenarios/alrage_scenario.py +54 -0
  88. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
  89. helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
  90. helm/benchmark/scenarios/arabic_mmlu_scenario.py +82 -0
  91. helm/benchmark/scenarios/aratrust_scenario.py +95 -0
  92. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
  93. helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
  94. helm/benchmark/scenarios/audio_language/{ultra_suite_asr_classification.py → ultra_suite_asr_classification_scenario.py} +9 -8
  95. helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +99 -0
  96. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +13 -5
  97. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +13 -5
  98. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +13 -5
  99. helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
  100. helm/benchmark/scenarios/bbq_scenario.py +15 -0
  101. helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
  102. helm/benchmark/scenarios/bluex_scenario.py +70 -0
  103. helm/benchmark/scenarios/bold_scenario.py +15 -0
  104. helm/benchmark/scenarios/boolq_scenario.py +20 -0
  105. helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
  106. helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
  107. helm/benchmark/scenarios/clear_scenario.py +23 -0
  108. helm/benchmark/scenarios/cleva_scenario.py +480 -1
  109. helm/benchmark/scenarios/code_scenario.py +28 -0
  110. helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
  111. helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
  112. helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
  113. helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
  114. helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
  115. helm/benchmark/scenarios/commonsense_scenario.py +26 -0
  116. helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
  117. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
  118. helm/benchmark/scenarios/copyright_scenario.py +35 -1
  119. helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
  120. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
  121. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
  122. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
  123. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
  124. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
  125. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
  126. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
  127. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
  128. helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
  129. helm/benchmark/scenarios/disinformation_scenario.py +22 -0
  130. helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
  131. helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
  132. helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
  133. helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
  134. helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
  135. helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
  136. helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
  137. helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
  138. helm/benchmark/scenarios/gpqa_scenario.py +18 -0
  139. helm/benchmark/scenarios/grammar_scenario.py +20 -1
  140. helm/benchmark/scenarios/gsm_scenario.py +15 -0
  141. helm/benchmark/scenarios/headqa_scenario.py +22 -0
  142. helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
  143. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
  144. helm/benchmark/scenarios/ice_scenario.py +21 -1
  145. helm/benchmark/scenarios/ifeval_scenario.py +18 -0
  146. helm/benchmark/scenarios/imdb_scenario.py +15 -0
  147. helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +90 -0
  148. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +1 -1
  149. helm/benchmark/scenarios/koala_scenario.py +21 -1
  150. helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
  151. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
  152. helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
  153. helm/benchmark/scenarios/legal_support_scenario.py +13 -0
  154. helm/benchmark/scenarios/legalbench_scenario.py +20 -0
  155. helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
  156. helm/benchmark/scenarios/lextreme_scenario.py +11 -0
  157. helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
  158. helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
  159. helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
  160. helm/benchmark/scenarios/math_scenario.py +47 -20
  161. helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
  162. helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
  163. helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
  164. helm/benchmark/scenarios/med_qa_scenario.py +14 -0
  165. helm/benchmark/scenarios/medalign_scenario.py +23 -0
  166. helm/benchmark/scenarios/medalign_scenario_helper.py +19 -125
  167. helm/benchmark/scenarios/medbullets_scenario.py +22 -0
  168. helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
  169. helm/benchmark/scenarios/medec_scenario.py +23 -0
  170. helm/benchmark/scenarios/medhallu_scenario.py +23 -0
  171. helm/benchmark/scenarios/medhelm/__init__.py +0 -0
  172. helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
  173. helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
  174. helm/benchmark/scenarios/medi_qa_scenario.py +23 -0
  175. helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
  176. helm/benchmark/scenarios/melt_scenarios.py +2 -2
  177. helm/benchmark/scenarios/mental_health_scenario.py +23 -0
  178. helm/benchmark/scenarios/mimic_bhc_scenario.py +25 -1
  179. helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
  180. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
  181. helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
  182. helm/benchmark/scenarios/mmlu_scenario.py +15 -0
  183. helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
  184. helm/benchmark/scenarios/msmarco_scenario.py +30 -0
  185. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
  186. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
  187. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
  188. helm/benchmark/scenarios/narrativeqa_scenario.py +20 -0
  189. helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
  190. helm/benchmark/scenarios/omni_math_scenario.py +18 -0
  191. helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
  192. helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
  193. helm/benchmark/scenarios/quac_scenario.py +14 -0
  194. helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
  195. helm/benchmark/scenarios/raft_scenario.py +15 -0
  196. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
  197. helm/benchmark/scenarios/scenario.py +31 -0
  198. helm/benchmark/scenarios/seahelm_scenario.py +350 -2
  199. helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
  200. helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
  201. helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
  202. helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
  203. helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
  204. helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
  205. helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
  206. helm/benchmark/scenarios/shc_proxy_scenario.py +22 -0
  207. helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
  208. helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
  209. helm/benchmark/scenarios/situation_prompts.yaml +49 -0
  210. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
  211. helm/benchmark/scenarios/summarization_scenario.py +37 -0
  212. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
  213. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
  214. helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
  215. helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
  216. helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
  217. helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
  218. helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
  219. helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
  220. helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
  221. helm/benchmark/scenarios/the_pile_scenario.py +13 -1
  222. helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
  223. helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
  224. helm/benchmark/scenarios/vicuna_scenario.py +21 -1
  225. helm/benchmark/scenarios/wikifact_scenario.py +20 -0
  226. helm/benchmark/scenarios/wildbench_scenario.py +18 -0
  227. helm/benchmark/scenarios/wmt_14_scenario.py +12 -0
  228. helm/benchmark/slurm_jobs.py +1 -2
  229. helm/benchmark/slurm_runner.py +8 -1
  230. helm/benchmark/static/schema_arabic.yaml +271 -0
  231. helm/benchmark/static/schema_classic.yaml +0 -17
  232. helm/benchmark/static/schema_long_context.yaml +24 -6
  233. helm/benchmark/static/schema_medhelm.yaml +36 -0
  234. helm/benchmark/static/schema_slp.yaml +219 -0
  235. helm/benchmark/static_build/assets/index-671a5e06.js +10 -0
  236. helm/benchmark/static_build/assets/index-9352595e.css +1 -0
  237. helm/benchmark/static_build/index.html +2 -2
  238. helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
  239. helm/clients/audio_language/llama_omni/arguments.py +61 -0
  240. helm/clients/audio_language/llama_omni/constants.py +9 -0
  241. helm/clients/audio_language/llama_omni/conversation.py +213 -0
  242. helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
  243. helm/clients/audio_language/llama_omni/model/builder.py +88 -0
  244. helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
  245. helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
  246. helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
  247. helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
  248. helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
  249. helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
  250. helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
  251. helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
  252. helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
  253. helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
  254. helm/clients/audio_language/llama_omni/preprocess.py +295 -0
  255. helm/clients/audio_language/llama_omni/utils.py +202 -0
  256. helm/clients/audio_language/qwen2_5_omni_client.py +19 -7
  257. helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
  258. helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
  259. helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
  260. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
  261. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
  262. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
  263. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
  264. helm/clients/huggingface_client.py +2 -2
  265. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
  266. helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
  267. helm/clients/openai_client.py +33 -20
  268. helm/clients/openai_responses_client.py +34 -8
  269. helm/clients/openrouter_client.py +31 -0
  270. helm/clients/test_huggingface_client.py +3 -3
  271. helm/clients/test_openrouter_client.py +69 -0
  272. helm/clients/together_client.py +48 -13
  273. helm/clients/vertexai_client.py +19 -11
  274. helm/clients/vllm_client.py +43 -7
  275. helm/clients/vllm_granite_thinking_client.py +56 -0
  276. helm/common/critique_request.py +0 -1
  277. helm/common/hierarchical_logger.py +83 -34
  278. helm/common/object_spec.py +23 -8
  279. helm/common/test_logging.py +94 -0
  280. helm/config/model_deployments.yaml +525 -172
  281. helm/config/model_metadata.yaml +185 -10
  282. helm/config/tokenizer_configs.yaml +100 -2
  283. helm/proxy/cli.py +1 -1
  284. helm/proxy/example_queries.py +8 -8
  285. helm/proxy/retry.py +5 -0
  286. helm/proxy/server.py +2 -1
  287. helm/proxy/static/index.css +4 -0
  288. helm/proxy/static/index.js +7 -1
  289. helm/tokenizers/grok_tokenizer.py +2 -0
  290. helm/benchmark/metrics/aci_bench_metrics.py +0 -14
  291. helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
  292. helm/benchmark/metrics/dischargeme_metrics.py +0 -14
  293. helm/benchmark/metrics/med_dialog_metrics.py +0 -14
  294. helm/benchmark/metrics/medalign_metrics.py +0 -14
  295. helm/benchmark/metrics/medi_qa_metrics.py +0 -14
  296. helm/benchmark/metrics/medication_qa_metrics.py +0 -14
  297. helm/benchmark/metrics/mental_health_metrics.py +0 -14
  298. helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
  299. helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
  300. helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
  301. helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
  302. helm/benchmark/metrics/numeracy_metrics.py +0 -72
  303. helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
  304. helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
  305. helm/benchmark/scenarios/numeracy_scenario.py +0 -794
  306. helm/benchmark/static_build/assets/index-94295e78.js +0 -10
  307. helm/benchmark/static_build/assets/index-b9779128.css +0 -1
  308. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/WHEEL +0 -0
  309. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/entry_points.txt +0 -0
  310. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/licenses/LICENSE +0 -0
  311. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/top_level.txt +0 -0
helm/benchmark/run_specs/bluex_run_specs.py (new file)
@@ -0,0 +1,40 @@
+from helm.benchmark.adaptation.adapters.adapter_factory import ADAPT_MULTIPLE_CHOICE_JOINT
+from helm.benchmark.adaptation.common_adapter_specs import get_multiple_choice_adapter_spec
+from helm.benchmark.metrics.common_metric_specs import get_exact_match_metric_specs
+from helm.benchmark.run_spec import RunSpec, run_spec_function
+from helm.benchmark.scenarios.scenario import ScenarioSpec
+
+
+@run_spec_function("bluex")
+def get_bluex_spec() -> RunSpec:
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.bluex_scenario.BLUEXScenario", args={})
+
+    adapter_spec = get_multiple_choice_adapter_spec(
+        method=ADAPT_MULTIPLE_CHOICE_JOINT,
+        instructions="""
+        Escolha a alternativa correta para as questões de vestibulares (responda apenas com a letra).
+        Exemplo de Pergunta com a resposta:
+        Em um romance narrado em primeira pessoa, o narrador participa dos acontecimentos da trama,
+        relatando suas próprias experiências e sentimentos. Qual alternativa apresenta essa característica?
+
+        (A) Narrador onisciente que conhece os pensamentos de todas as personagens.
+        (B) Narrador que descreve os fatos de forma imparcial, sem envolvimento emocional.
+        (C) Narrador-personagem que vivencia e relata os eventos da história.
+        (D) Narrador observador que apenas registra as ações visíveis.
+        (E) Narrador em segunda pessoa que se dirige constantemente ao leitor.
+
+        Resposta correta: C
+
+        A partir disso, responda:
+        """,
+        input_noun="Pergunta",
+        output_noun="Resposta",
+    )
+
+    return RunSpec(
+        name="bluex",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs(),
+        groups=["bluex"],
+    )
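Note (illustrative, not part of the diff): the file above only registers a run spec function, so it can be exercised directly once crfm-helm 0.5.8 is installed. The expected values follow from the code shown; treat this as a sketch rather than a documented entry point.

# Sketch: resolve the new "bluex" run spec and inspect it (assumes crfm-helm 0.5.8).
from helm.benchmark.run_specs.bluex_run_specs import get_bluex_spec

run_spec = get_bluex_spec()
print(run_spec.name)                      # "bluex"
print(run_spec.scenario_spec.class_name)  # "helm.benchmark.scenarios.bluex_scenario.BLUEXScenario"
print(run_spec.adapter_spec.method)       # the ADAPT_MULTIPLE_CHOICE_JOINT method string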
helm/benchmark/run_specs/classic_run_specs.py
@@ -35,7 +35,6 @@ from helm.benchmark.metrics.common_metric_specs import (
     get_f1_metric_specs,
     get_generative_harms_metric_specs,
     get_language_modeling_metric_specs,
-    get_numeracy_metric_specs,
     get_open_ended_generation_metric_specs,
     get_summarization_metric_specs,
     get_basic_generation_metric_specs,
@@ -381,58 +380,6 @@ def get_raft_spec(subset: str) -> RunSpec:
     )
 
 
-@run_spec_function("numeracy")
-def get_numeracy_spec(
-    relation_type: str = "linear", mode: str = "function", seed: str = "0", run_solver: str = "False"
-) -> RunSpec:
-    from helm.benchmark.scenarios.numeracy_scenario import get_numeracy_adapter_spec, RELTYPE_INFO
-
-    run_solver_bool: bool = True if run_solver.lower() == "true" else False
-    del run_solver
-    random_seed = int(seed)
-    scenario_spec = ScenarioSpec(
-        class_name="helm.benchmark.scenarios.numeracy_scenario.NumeracyScenario",
-        args={"seed": random_seed, "relation_type": relation_type, "mode": mode},
-    )
-
-    if mode in ["example", "standard"]:
-        # Test a model's ability to impute datapoints for a given (example or randomly sampled) relation.
-        adapter_args: Dict[str, Any] = {
-            "max_train_instances": 100,
-            "max_eval_instances": 100,
-            "dim": RELTYPE_INFO[relation_type].num_variables + 1,
-        }
-    elif mode == "function":
-        # Test a model's ability to impute datapoints for randomly sampled relations
-        # (resampled for each evaluation point).
-        adapter_args = {
-            "instructions": "",
-            "max_train_instances": 0,  # Turn off general version of `function` mode because it doesn't cleanly
-            # capture a higher-order version of this task / is a little convoluted
-            # for models, currently.
-            # (In the general version, the model sees other relations of the same class,
-            # and needs to impute a datapoint for the last one. Presumably, inferring
-            # the class - eg. the degree of the relation - would help.)
-            "max_eval_instances": 1000,
-            "dim": RELTYPE_INFO[relation_type].num_variables + 1,
-            "instance_prefix": "\n\n",
-        }
-    else:
-        raise ValueError(f"Invalid mode: {mode}")
-
-    adapter_spec = get_numeracy_adapter_spec(**adapter_args)  # Construct the AdapterSpec using a helper function.
-    # `get_numeracy_adapter_spec` is defined in numeracy_scenario.py
-    # because it is used within the scenario to construct the instances themselves.
-
-    return RunSpec(
-        name=f"numeracy:relation_type={relation_type},mode={mode}",
-        scenario_spec=scenario_spec,
-        adapter_spec=adapter_spec,
-        metric_specs=get_numeracy_metric_specs(run_solver_bool),
-        groups=["numeracy"],
-    )
-
-
 @run_spec_function("boolq")
 def get_boolq_spec(only_contrast=False) -> RunSpec:
     scenario_spec = ScenarioSpec(
@@ -806,12 +753,12 @@ def get_xsum_sampled_summarization_spec(temperature: float = 0.3, device: str =
     )
 
     return RunSpec(
-        name=f"summarization_xsum:temperature={temperature},device={device}",
+        name=f"summarization_xsum_sampled:temperature={temperature},device={device}",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=get_summarization_metric_specs({"task": "summarization_xsum_sampled", "device": device})
         + get_generative_harms_metric_specs(),
-        groups=["summarization_xsum"],
+        groups=["summarization_xsum_sampled"],
     )
 
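Note (illustrative): the practical effect of this rename is that the sampled XSum variant no longer shares a run name and results group with the plain XSum spec. Derived from the name= f-strings above:

# Run entry names before (0.5.6) and after (0.5.8) for the sampled-XSum spec:
old_entry = "summarization_xsum:temperature=0.3,device=cpu"
new_entry = "summarization_xsum_sampled:temperature=0.3,device=cpu"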
helm/benchmark/run_specs/codeinsights_run_specs.py (new file)
@@ -0,0 +1,192 @@
+from helm.benchmark.run_spec import RunSpec, run_spec_function
+from helm.benchmark.scenarios.scenario import ScenarioSpec
+from helm.benchmark.adaptation.common_adapter_specs import get_generation_adapter_spec
+from helm.benchmark.metrics.common_metric_specs import get_basic_metric_specs
+from helm.benchmark.metrics.codeinsights_metric_specs import (
+    get_functional_correctness_metric_specs,
+    get_comprehensive_code_evaluation_metric_specs,
+    get_edge_case_metric_specs,
+    get_code_efficiency_metric_specs,
+)
+
+
+@run_spec_function("codeinsights_correct_code")
+def get_codeinsights_correct_code_run_spec(tpr: float = 0.0, num_testcases: int = 1) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.codeinsights_correct_code_scenario.CodeInsightsCorrectCodeScenario",
+        args={"num_testcases": num_testcases},
+    )
+
+    instruction = (
+        "You are a skilled C++ programmer working on a foundational programming course assignment. "
+        "Your task is to write correct, efficient C++ code that solves the given problem. "
+        "Write clean, well-structured code that follows good programming practices. "
+        "Provide ONLY your C++ implementation following the given template, where the answer will replace the {{ STUDENT_ANSWER }} block in the template."
+        "DO NOT reproduce the template part as the generated code would be inserted to the template,"
+        "and make sure the code is compatible with the Unit Test Input"
+        "Ensure your code is correct, efficient, includes any class definition when needed, and handles all edge cases properly."
+    )
+
+    adapter_spec = get_generation_adapter_spec(
+        instructions=instruction,
+        output_noun="Your code",
+        stop_sequences=[],
+        max_tokens=4000,
+        temperature=tpr,
+    )
+
+    return RunSpec(
+        name=f"codeinsights_correct_code:temperature={adapter_spec.temperature},num_testcases={num_testcases}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_functional_correctness_metric_specs() + get_basic_metric_specs([]),
+        groups=["codeinsights", "codeinsights_correct_code"],
+    )
+
+
+@run_spec_function("codeinsights_student_coding")
+def get_codeinsights_student_coding_run_spec(tpr: float = 0.0, num_testcases: int = 1) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.codeinsights_student_coding_scenario.CodeInsightsStudentCodingScenario",
+        args={"num_testcases": num_testcases},
+    )
+
+    instruction = (
+        "You are the same student who wrote the three examples below in your foundational C++ course. "
+        "Mimic exactly your personal coding style, conventions, and level of proficiency—"
+        "do not over‐optimize or introduce unfamiliar patterns. "
+        "Include the same sort of formatting, variable names, and minor imperfections you demonstrated. "
+        "Provide ONLY your C++ implementation following the given template, where the answer will replace the {{ STUDENT_ANSWER }} block in the template."
+        "DO NOT reproduce the template part as the generated code would be inserted to the template,"
+        "and make sure the code is compatible with the Unit Test Input"
+        "Ensure your code includes any class definition when needed."
+    )
+
+    adapter_spec = get_generation_adapter_spec(
+        instructions=instruction,
+        output_noun="Your code",
+        stop_sequences=[],
+        max_tokens=4000,
+        temperature=tpr,
+    )
+
+    return RunSpec(
+        name=f"codeinsights_student_coding:temperature={adapter_spec.temperature},num_testcases={num_testcases}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_comprehensive_code_evaluation_metric_specs() + get_basic_metric_specs([]),
+        groups=["codeinsights", "codeinsights_student_coding"],
+    )
+
+
+@run_spec_function("codeinsights_student_mistake")
+def get_codeinsights_student_mistake_run_spec(tpr: float = 0.0, num_testcases: int = 1) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.codeinsights_student_mistake_scenario.CodeInsightsStudentMistakeScenario",
+        args={"num_testcases": num_testcases},
+    )
+
+    instruction = (
+        "You are a C++ student with a consistent personal style, conventions, and proficiency level.\n"
+        "Your task is to attempt the target problem **but introduce realistic mistake** you would typically make—"
+        "Provide ONLY your C++ implementation following the given template, where the answer will replace the {{ STUDENT_ANSWER }} block in the template."
+        "DO NOT reproduce the template part as the generated code would be inserted to the template,"
+        "and make sure the code is compatible with the Unit Test Input"
+        "Ensure your code is includes any class definition when needed."
+    )
+
+    adapter_spec = get_generation_adapter_spec(
+        instructions=instruction,
+        output_noun="Your code",
+        stop_sequences=[],
+        max_tokens=4000,
+        temperature=tpr,
+    )
+
+    return RunSpec(
+        name=f"codeinsights_student_mistake:temperature={adapter_spec.temperature},num_testcases={num_testcases}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_comprehensive_code_evaluation_metric_specs() + get_basic_metric_specs([]),
+        groups=["codeinsights", "codeinsights_student_mistake"],
+    )
+
+
+@run_spec_function("codeinsights_code_efficiency")
+def get_codeinsights_code_efficiency_run_spec(tpr: float = 0.0, num_testcases: int = 1) -> RunSpec:
+    """
+    Run specification for code efficiency evaluation scenario.
+
+    This scenario evaluates whether LLM-generated code has similar runtime efficiency
+    as the original student code. It focuses on problems where both solutions are
+    functionally correct and measures runtime performance alignment.
+
+    Requires C++ compiler (g++) to be available for actual compilation and execution.
+    """
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.codeinsights_code_efficiency_scenario.CodeInsightsCodeEfficiencyScenario",
+        args={"num_testcases": num_testcases},
+    )
+
+    instruction = (
+        "You are the same student who wrote the three examples below in your foundational C++ course. "
+        "Mimic exactly your personal coding style, conventions, and make sure to generate a correct code. "
+        "Do not over-optimize or introduce unfamiliar patterns. If the code is correct but inefficient, "
+        "imitate the inefficiency. "
+        "If the student writes efficiently, write efficiently too. "
+        "Include the same sort of formatting, variable names, and minor imperfections you demonstrated. "
+        "Provide ONLY your C++ implementation following the given template, where the answer will replace the {{ STUDENT_ANSWER }} block in the template."
+        "DO NOT reproduce the template part as the generated code would be inserted to the template,"
+        "and make sure the code is compatible with the Unit Test Input"
+        "Ensure your code is correct, includes any class definition when needed, and handles all edge cases properly."
+    )
+
+    adapter_spec = get_generation_adapter_spec(
+        instructions=instruction,
+        output_noun="Your code",
+        stop_sequences=[],
+        max_tokens=4000,
+        temperature=tpr,
+    )
+
+    return RunSpec(
+        name=f"codeinsights_code_efficiency:temperature={adapter_spec.temperature},num_testcases={num_testcases}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_code_efficiency_metric_specs(
+            num_runtime_runs=5,  # Run each solution 5 times for averaging
+            timeout_seconds=10,  # 10 second timeout per execution
+        )
+        + get_basic_metric_specs([]),
+        groups=["codeinsights", "codeinsights_code_efficiency"],
+    )
+
+
+@run_spec_function("codeinsights_edge_case")
+def get_codeinsights_edge_case_run_spec(tpr: float = 0.0, num_testcases: int = 1) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.codeinsights_edge_case_scenario.CodeInsightsEdgeCaseScenario",
+        args={"num_testcases": num_testcases},
+    )
+
+    instruction = (
+        "You are a student studying C++ with a consistent personal style, conventions, and proficiency level.\n"
+        "Your task is to identify which test case you would likely to fail for a given question with unit tests.\n"
+        "Respond only with integer of the unittest number\n\n"
+    )
+
+    adapter_spec = get_generation_adapter_spec(
+        instructions=instruction,
+        output_noun="Your code",
+        stop_sequences=[],
+        max_tokens=4000,
+        temperature=tpr,
+    )
+
+    return RunSpec(
+        name=f"codeinsights_edge_case:temperature={adapter_spec.temperature},num_testcases={num_testcases}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_edge_case_metric_specs() + get_basic_metric_specs([]),
+        groups=["codeinsights", "codeinsights_edge_case"],
+    )
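Note (illustrative): each of the five functions above takes a tpr argument that is passed through as the adapter temperature and echoed in the run name. A sketch, assuming crfm-helm 0.5.8 is installed:

# Build the correct-code run spec at temperature 0.2 with 3 test cases.
from helm.benchmark.run_specs.codeinsights_run_specs import get_codeinsights_correct_code_run_spec

run_spec = get_codeinsights_correct_code_run_spec(tpr=0.2, num_testcases=3)
print(run_spec.name)  # "codeinsights_correct_code:temperature=0.2,num_testcases=3"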
helm/benchmark/run_specs/healthqa_br_run_specs.py (new file)
@@ -0,0 +1,40 @@
+from helm.benchmark.adaptation.adapters.adapter_factory import ADAPT_MULTIPLE_CHOICE_JOINT
+from helm.benchmark.adaptation.common_adapter_specs import get_multiple_choice_adapter_spec
+from helm.benchmark.metrics.common_metric_specs import get_exact_match_metric_specs
+from helm.benchmark.run_spec import RunSpec, run_spec_function
+from helm.benchmark.scenarios.scenario import ScenarioSpec
+
+
+@run_spec_function("healthqa_br")
+def get_healthqa_br_spec() -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.healthqa_br_scenario.HEALTHQA_BR_Scenario", args={}
+    )
+
+    adapter_spec = get_multiple_choice_adapter_spec(
+        method=ADAPT_MULTIPLE_CHOICE_JOINT,
+        instructions="""
+        Escolha a alternativa correta para as questões de medicina (responda apenas com a letra).
+        Exemplo de Pergunta com a resposta:
+        Qual dos seguintes órgãos é responsável pela produção da insulina no corpo humano?
+        A) Fígado
+        B) Rins
+        C) Pâncreas
+        D) Baço
+        E) Coração
+
+        Resposta correta: C
+
+        A partir disso, responda:
+        """,
+        input_noun="Pergunta",
+        output_noun="Resposta",
+    )
+
+    return RunSpec(
+        name="healthqa_br",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs(),
+        groups=["healthqa_br"],
+    )
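Note (illustrative): like bluex above, this spec uses the joint multiple-choice adapter with Portuguese nouns, so each instance is rendered roughly as below. The exact template lives in multiple_choice_joint_adapter.py; this sketch only shows the approximate shape implied by input_noun="Pergunta" and output_noun="Resposta".

# Approximate prompt shape for one instance (not verbatim adapter output):
prompt = (
    "Pergunta: <question text>\n"
    "A. <option 1>\n"
    "B. <option 2>\n"
    "C. <option 3>\n"
    "Resposta:"
)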
helm/benchmark/run_specs/heim_run_specs.py
@@ -60,7 +60,9 @@ def get_core_heim_metric_specs() -> List[MetricSpec]:
             class_name="helm.benchmark.metrics.image_generation.fractal_dimension_metric.FractalDimensionMetric",
             args={},
         ),
-        MetricSpec(class_name="helm.benchmark.metrics.image_generation.nsfw_metrics.NSFWMetric", args={}),
+        # Disabled due to keras issue.
+        # See: https://github.com/stanford-crfm/helm/issues/3741#issuecomment-3109478877
+        # MetricSpec(class_name="helm.benchmark.metrics.image_generation.nsfw_metrics.NSFWMetric", args={}),
         MetricSpec(class_name="helm.benchmark.metrics.image_generation.nudity_metrics.NudityMetric", args={}),
         MetricSpec(class_name="helm.benchmark.metrics.image_generation.watermark_metrics.WatermarkMetric", args={}),
     ] + get_basic_metric_specs(names=[])
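Note (illustrative): if a deployment still needs the NSFW metric despite the keras issue, it can be re-added in user code. The MetricSpec import path below is an assumption based on HELM's layout; verify it before use, and only re-enable if your environment avoids the linked issue.

# Sketch: re-enable the disabled NSFW metric in a custom metric-spec list.
from helm.benchmark.metrics.metric import MetricSpec  # assumed import path
from helm.benchmark.run_specs.heim_run_specs import get_core_heim_metric_specs

nsfw_metric_spec = MetricSpec(
    class_name="helm.benchmark.metrics.image_generation.nsfw_metrics.NSFWMetric",
    args={},
)
custom_metric_specs = get_core_heim_metric_specs() + [nsfw_metric_spec]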
helm/benchmark/run_specs/lmkt_run_specs.py (new file)
@@ -0,0 +1,144 @@
+"""Run spec functions for Vietnam WVS cultural alignment evaluation."""
+
+from helm.benchmark.adaptation.common_adapter_specs import (
+    get_generation_adapter_spec,
+)
+from helm.benchmark.metrics.common_metric_specs import (
+    get_exact_match_metric_specs,
+    get_f1_metric_specs,
+    get_open_ended_generation_metric_specs,
+)
+from helm.benchmark.metrics.lmkt_metric_specs import get_semantic_similarity_metric_specs
+from helm.benchmark.run_spec import RunSpec, run_spec_function
+from helm.benchmark.scenarios.scenario import ScenarioSpec
+
+INSTRUCTIONS = {
+    "cultural_value_understanding_wvs": {
+        "en": {
+            "instructions": "Please respond as the {country} persona described below.",
+            "input_noun": "Question",
+            "output_noun": "Answer",
+        },
+        "vi": {
+            "instructions": "Vui lòng trả lời như một người {country} được mô tả bên dưới.",
+            "input_noun": "Câu hỏi",
+            "output_noun": "Trả lời",
+        },
+    },
+    "social_norm_application_normad": {
+        "en": {
+            "instructions": "Please respond as the {country} persona described below.",
+            "input_noun": "Situation",
+            "output_noun": "Response",
+        },
+        "vi": {
+            "instructions": "Vui lòng trả lời như một người {country} được mô tả bên dưới.",
+            "input_noun": "Tình huống",
+            "output_noun": "Phản hồi",
+        },
+    },
+    "social_norm_explanation_normad": {
+        "en": {
+            "instructions": "Please respond as the {country} persona described below.",
+            "input_noun": "Situation",
+            "output_noun": "Explanation",
+        },
+        "vi": {
+            "instructions": "Vui lòng trả lời như một người {country} được mô tả bên dưới.",
+            "input_noun": "Tình huống",
+            "output_noun": "Giải thích",
+        },
+    },
+}
+
+COUNTRIES = {
+    "US": "United States",
+    "VN": "Vietnam",
+}
+
+
+@run_spec_function("cultural_value_understanding_wvs")
+def get_cultural_value_understanding_wvs_spec(language: str, country: str) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.lmkt_scenarios.CulturalValueUnderstandingWVSScenario",
+        args={
+            "language": language,
+            "num_personas": 300,
+            "num_question_variants": 4,
+            "include_few_shot_examples": True,
+        },
+    )
+
+    adapter_spec = get_generation_adapter_spec(
+        instructions=INSTRUCTIONS["cultural_value_understanding_wvs"][language]["instructions"].format(
+            country=COUNTRIES[country]
+        ),
+        input_noun=INSTRUCTIONS["cultural_value_understanding_wvs"][language]["input_noun"],
+        output_noun=INSTRUCTIONS["cultural_value_understanding_wvs"][language]["output_noun"],
+        max_tokens=3,
+        stop_sequences=[],
+    )
+
+    return RunSpec(
+        name="cultural_value_understanding_wvs",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs() + get_f1_metric_specs(),
+        groups=["lmkt", "cultural_value_understanding_wvs"],
+    )
+
+
+@run_spec_function("social_norm_application_normad")
+def get_social_norm_application_normad_spec(language: str, country: str) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.lmkt_scenarios.SocialNormApplicationNormADScenario",
+        args={
+            "language": language,
+        },
+    )
+
+    adapter_spec = get_generation_adapter_spec(
+        instructions=INSTRUCTIONS["social_norm_application_normad"][language]["instructions"].format(
+            country=COUNTRIES[country]
+        ),
+        input_noun=INSTRUCTIONS["social_norm_application_normad"][language]["input_noun"],
+        output_noun=INSTRUCTIONS["social_norm_application_normad"][language]["output_noun"],
+        max_tokens=5,
+        stop_sequences=[],
+    )
+
+    return RunSpec(
+        name="social_norm_application_normad",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs() + get_f1_metric_specs(),
+        groups=["lmkt", "social_norm_application_normad"],
+    )
+
+
+@run_spec_function("social_norm_explanation_normad")
+def get_social_norm_explanation_normad_spec(language: str, country: str) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.lmkt_scenarios.SocialNormExplanationNormADScenario",
+        args={
+            "language": language,
+        },
+    )
+
+    adapter_spec = get_generation_adapter_spec(
+        instructions=INSTRUCTIONS["social_norm_explanation_normad"][language]["instructions"].format(
+            country=COUNTRIES[country]
+        ),
+        input_noun=INSTRUCTIONS["social_norm_explanation_normad"][language]["input_noun"],
+        output_noun=INSTRUCTIONS["social_norm_explanation_normad"][language]["output_noun"],
+        max_tokens=128,
+        stop_sequences=[],
+    )
+
+    return RunSpec(
+        name="social_norm_explanation_normad",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_open_ended_generation_metric_specs() + get_semantic_similarity_metric_specs(),
+        groups=["lmkt", "social_norm_explanation_normad"],
+    )
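Note (illustrative): the run spec functions above simply combine the INSTRUCTIONS and COUNTRIES tables, so the rendered instruction can be previewed directly. The output string follows from the dictionaries in the diff:

# Format the Vietnamese WVS instruction for the Vietnam persona (country code "VN").
template = "Vui lòng trả lời như một người {country} được mô tả bên dưới."
print(template.format(country="Vietnam"))
# -> "Vui lòng trả lời như một người Vietnam được mô tả bên dưới."
# (Per the "en" entry, this means: "Please respond as the Vietnam persona described below.")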
helm/benchmark/run_specs/long_context_run_specs.py
@@ -1,4 +1,9 @@
-from helm.benchmark.adaptation.adapter_spec import ADAPT_CHAT, ADAPT_GENERATION, AdapterSpec
+from helm.benchmark.adaptation.adapter_spec import (
+    ADAPT_CHAT,
+    ADAPT_GENERATION,
+    ADAPT_MULTIPLE_CHOICE_JOINT,
+    AdapterSpec,
+)
 from helm.benchmark.metrics.common_metric_specs import (
     get_exact_match_metric_specs,
     get_open_ended_generation_metric_specs,
@@ -29,6 +34,27 @@ def _get_long_context_generation_adapter_spec(max_tokens: int) -> AdapterSpec:
     )
 
 
+def _get_long_context_multiple_choice_adapter_spec(max_tokens: int) -> AdapterSpec:
+    return AdapterSpec(
+        method=ADAPT_MULTIPLE_CHOICE_JOINT,
+        global_prefix="",
+        global_suffix="",
+        instructions="Read the passage and answer the following question. Respond with only a single letter corresponding to your choice. Do not include a period in your answer.\n\n",  # noqa: E501
+        input_prefix="",
+        input_suffix="\n",
+        reference_prefix="A. ",
+        reference_suffix="\n",
+        output_prefix="\nAnswer the question above based on the passage. Respond with only a single letter corresponding to your choice. Do not include a period in your answer.\n",  # noqa: E501
+        output_suffix="",
+        instance_prefix="",
+        max_train_instances=0,
+        num_outputs=1,
+        temperature=0.0,
+        max_tokens=max_tokens,
+        stop_sequences=[],
+    )
+
+
 @run_spec_function("ruler_hotpotqa")
 def get_ruler_hotpotqa_spec(max_num_words: int = 131072) -> RunSpec:
     scenario_spec = ScenarioSpec(
@@ -96,6 +122,27 @@ def get_infinite_bench_en_qa_spec(max_num_words: int = 131072) -> RunSpec:
     )
 
 
+@run_spec_function("infinite_bench_en_mc")
+def get_infinite_bench_en_mc_spec(max_num_words: int = 131072) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.infinite_bench_en_mc_scenario.InfiniteBenchEnMCScenario",
+        args={
+            "max_num_words": max_num_words,
+        },
+    )
+
+    adapter_spec = _get_long_context_multiple_choice_adapter_spec(max_tokens=40)
+    metric_specs = get_exact_match_metric_specs()
+
+    return RunSpec(
+        name=f"infinite_bench_en_mc:max_num_words={max_num_words}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        groups=["infinite_bench_en_mc"],
+    )
+
+
 @run_spec_function("infinite_bench_en_sum")
 def get_infinite_bench_en_sum_spec(max_num_words: int = 131072) -> RunSpec:
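Note (illustrative): the new infinite_bench_en_mc spec is parameterized by max_num_words and reuses the zero-shot multiple-choice adapter defined above. A sketch, assuming crfm-helm 0.5.8 is installed:

# Build the new long-context MC run spec with a smaller context budget.
from helm.benchmark.run_specs.long_context_run_specs import get_infinite_bench_en_mc_spec

spec = get_infinite_bench_en_mc_spec(max_num_words=65536)
print(spec.name)                              # "infinite_bench_en_mc:max_num_words=65536"
print(spec.adapter_spec.max_tokens)           # 40
print(spec.adapter_spec.max_train_instances)  # 0 (zero-shot)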