PyPI - crfm-helm - Versions diffs - 0.4.0__py3-none-any.whl → 0.5.10__py3-none-any.whl - Mend

crfm-helm 0.4.0__py3-none-any.whl → 0.5.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of crfm-helm might be problematic. Click here for more details.

Files changed (1033)

crfm_helm-0.5.10.dist-info/METADATA +369 -0
crfm_helm-0.5.10.dist-info/RECORD +1008 -0
{crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info}/WHEEL +1 -1
helm/benchmark/adaptation/adapter_spec.py +80 -29
helm/benchmark/adaptation/adapters/adapter.py +2 -2
helm/benchmark/adaptation/adapters/adapter_factory.py +39 -28
helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
helm/benchmark/adaptation/adapters/generation_adapter.py +2 -1
helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +24 -8
helm/benchmark/adaptation/adapters/language_modeling_adapter.py +3 -4
helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +2 -1
helm/benchmark/adaptation/adapters/multimodal/multimodal_prompt.py +7 -0
helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +112 -0
helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +6 -3
helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +3 -1
helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +18 -8
helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
helm/benchmark/adaptation/adapters/test_adapter.py +5 -4
helm/benchmark/adaptation/adapters/test_generation_adapter.py +46 -22
helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +17 -29
helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +138 -16
helm/benchmark/adaptation/common_adapter_specs.py +443 -0
helm/benchmark/adaptation/prompt.py +1 -1
helm/benchmark/adaptation/request_state.py +6 -1
helm/benchmark/adaptation/scenario_state.py +6 -2
helm/benchmark/annotation/aci_bench_annotator.py +84 -0
helm/benchmark/annotation/air_bench_annotator.py +79 -0
helm/benchmark/annotation/alrage_annotator.py +90 -0
helm/benchmark/annotation/annotator.py +48 -0
helm/benchmark/annotation/annotator_factory.py +50 -0
helm/benchmark/annotation/anthropic_red_team_annotator.py +57 -0
helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
helm/benchmark/annotation/bird_sql_annotator.py +58 -0
helm/benchmark/annotation/call_center_annotator.py +258 -0
helm/benchmark/annotation/chw_care_plan_annotator.py +82 -0
helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
helm/benchmark/annotation/dischargeme_annotator.py +96 -0
helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
helm/benchmark/annotation/financebench_annotator.py +79 -0
helm/benchmark/annotation/harm_bench_annotator.py +55 -0
helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
helm/benchmark/annotation/image2struct/image_compiler_annotator.py +93 -0
helm/benchmark/annotation/image2struct/latex_compiler_annotator.py +59 -0
helm/benchmark/annotation/image2struct/lilypond_compiler_annotator.py +86 -0
helm/benchmark/annotation/image2struct/webpage_compiler_annotator.py +132 -0
helm/benchmark/annotation/live_qa_annotator.py +76 -0
helm/benchmark/annotation/med_dialog_annotator.py +88 -0
helm/benchmark/annotation/medalign_annotator.py +89 -0
helm/benchmark/annotation/medi_qa_annotator.py +87 -0
helm/benchmark/annotation/medication_qa_annotator.py +86 -0
helm/benchmark/annotation/mental_health_annotator.py +87 -0
helm/benchmark/annotation/mimic_bhc_annotator.py +89 -0
helm/benchmark/annotation/mimic_rrs_annotator.py +89 -0
helm/benchmark/annotation/model_as_judge.py +309 -0
helm/benchmark/annotation/mtsamples_procedures_annotator.py +87 -0
helm/benchmark/annotation/mtsamples_replicate_annotator.py +90 -0
helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
helm/benchmark/annotation/omni_math_annotator.py +131 -0
helm/benchmark/annotation/simple_safety_tests_annotator.py +50 -0
helm/benchmark/annotation/spider_annotator.py +18 -0
helm/benchmark/annotation/starr_patient_instructions_annotator.py +87 -0
helm/benchmark/annotation/test_annotator_factory.py +26 -0
helm/benchmark/annotation/test_dummy_annotator.py +44 -0
helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
helm/benchmark/annotation/wildbench_annotator.py +119 -0
helm/benchmark/annotation/xstest_annotator.py +100 -0
helm/benchmark/annotation_executor.py +144 -0
helm/benchmark/augmentations/cleva_perturbation.py +9 -8
helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
helm/benchmark/augmentations/data_augmenter.py +0 -2
helm/benchmark/augmentations/dialect_perturbation.py +4 -5
helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
helm/benchmark/augmentations/gender_perturbation.py +3 -3
helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
helm/benchmark/augmentations/person_name_perturbation.py +4 -5
helm/benchmark/augmentations/perturbation.py +26 -4
helm/benchmark/augmentations/perturbation_description.py +1 -1
helm/benchmark/augmentations/space_perturbation.py +2 -2
helm/benchmark/augmentations/suffix_perturbation.py +29 -0
helm/benchmark/augmentations/synonym_perturbation.py +4 -3
helm/benchmark/augmentations/test_perturbation.py +56 -19
helm/benchmark/augmentations/translate_perturbation.py +31 -0
helm/benchmark/augmentations/typos_perturbation.py +2 -2
helm/benchmark/config_registry.py +7 -1
helm/benchmark/data_preprocessor.py +2 -2
helm/benchmark/executor.py +54 -25
helm/benchmark/huggingface_registration.py +28 -10
helm/benchmark/metrics/air_bench_metrics.py +3212 -0
helm/benchmark/metrics/alrage_metric.py +35 -0
helm/benchmark/metrics/annotation_metrics.py +108 -0
helm/benchmark/metrics/basic_metrics.py +437 -667
helm/benchmark/metrics/bbq_metrics.py +17 -6
helm/benchmark/metrics/bias_metrics.py +18 -9
helm/benchmark/metrics/bias_word_lists.py +1 -1
helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
helm/benchmark/metrics/bird_sql_metrics.py +28 -0
helm/benchmark/metrics/classification_metrics.py +107 -22
helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
helm/benchmark/metrics/cleva_harms_metrics.py +12 -11
helm/benchmark/metrics/code_metrics.py +5 -5
helm/benchmark/metrics/code_metrics_helper.py +11 -3
helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
helm/benchmark/metrics/comet_metric.py +125 -0
helm/benchmark/metrics/common_metric_specs.py +174 -0
helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +83 -0
helm/benchmark/metrics/copyright_metrics.py +5 -5
helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
helm/benchmark/metrics/disinformation_metrics.py +8 -114
helm/benchmark/metrics/dry_run_metrics.py +35 -6
helm/benchmark/metrics/efficiency_metrics.py +287 -0
helm/benchmark/metrics/ehr_sql_metrics.py +159 -0
helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
helm/benchmark/metrics/evaluate_reference_metrics.py +831 -0
helm/benchmark/metrics/fin_qa_metrics.py +60 -0
helm/benchmark/metrics/fin_qa_metrics_helper.py +398 -0
helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +115 -0
helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
helm/benchmark/metrics/gpt4v_originality_critique_metrics.py +126 -0
helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +48 -0
helm/benchmark/metrics/ifeval/instructions.py +1574 -0
helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
helm/benchmark/metrics/ifeval_metrics.py +67 -0
helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
helm/benchmark/metrics/image_generation/clip_score_metrics.py +84 -0
helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
helm/benchmark/metrics/image_generation/q16/test_q16.py +20 -0
helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
helm/benchmark/metrics/instruction_following_critique_metrics.py +48 -5
helm/benchmark/metrics/kpi_edgar_metrics.py +142 -0
helm/benchmark/metrics/language_modeling_metrics.py +111 -0
helm/benchmark/metrics/live_qa_metrics.py +35 -0
helm/benchmark/metrics/llm_jury_metrics.py +58 -0
helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
helm/benchmark/metrics/lmkt_metrics.py +47 -0
helm/benchmark/metrics/machine_translation_metrics.py +89 -0
helm/benchmark/metrics/medcalc_bench_metrics.py +137 -0
helm/benchmark/metrics/medec_metrics.py +124 -0
helm/benchmark/metrics/melt_bias_metric.py +234 -0
helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
helm/benchmark/metrics/melt_metric_specs.py +43 -0
helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
helm/benchmark/metrics/metric.py +121 -175
helm/benchmark/metrics/metric_name.py +0 -1
helm/benchmark/metrics/metric_service.py +23 -7
helm/benchmark/metrics/mimiciv_billing_code_metrics.py +127 -0
helm/benchmark/metrics/nltk_helper.py +32 -0
helm/benchmark/metrics/omni_math_metrics.py +44 -0
helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
helm/benchmark/metrics/output_processing_metric.py +60 -0
helm/benchmark/metrics/output_processors.py +15 -0
helm/benchmark/metrics/paraphrase_generation_metrics.py +5 -6
helm/benchmark/metrics/prometheus_vision_critique_metrics.py +185 -0
helm/benchmark/metrics/ranking_metrics.py +5 -5
helm/benchmark/metrics/reference_metric.py +148 -0
helm/benchmark/metrics/reka_vibe_critique_metrics.py +158 -0
helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
helm/benchmark/metrics/safety_metrics.py +91 -0
helm/benchmark/metrics/seahelm_metrics.py +201 -0
helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
helm/benchmark/metrics/spider_metrics.py +7 -0
helm/benchmark/metrics/statistic.py +1 -1
helm/benchmark/metrics/summac/model_summac.py +8 -11
helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
helm/benchmark/metrics/summarization_metrics.py +150 -11
helm/benchmark/metrics/test_bias_metrics.py +5 -1
helm/benchmark/metrics/test_classification_metrics.py +145 -70
helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
helm/benchmark/metrics/{test_basic_metrics.py → test_evaluate_reference_metrics.py} +20 -1
helm/benchmark/metrics/test_metric.py +3 -3
helm/benchmark/metrics/test_statistic.py +2 -2
helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +11 -3
helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +3 -3
helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +7 -7
helm/benchmark/metrics/toxicity_metrics.py +37 -7
helm/benchmark/metrics/toxicity_utils.py +23 -0
helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py +52 -0
helm/benchmark/metrics/unitxt_metrics.py +107 -0
helm/benchmark/metrics/vision_language/__init__.py +0 -0
helm/benchmark/metrics/vision_language/emd_utils.py +347 -0
helm/benchmark/metrics/vision_language/image_metrics.py +537 -0
helm/benchmark/metrics/vision_language/image_utils.py +100 -0
helm/benchmark/metrics/wildbench_metrics.py +54 -0
helm/benchmark/model_deployment_registry.py +69 -5
helm/benchmark/model_metadata_registry.py +58 -2
helm/benchmark/multi_gpu_runner.py +133 -0
helm/benchmark/presentation/contamination.py +3 -3
helm/benchmark/presentation/create_plots.py +51 -20
helm/benchmark/presentation/run_display.py +51 -12
helm/benchmark/presentation/run_entry.py +2 -2
helm/benchmark/presentation/schema.py +83 -66
helm/benchmark/presentation/summarize.py +483 -388
helm/benchmark/presentation/table.py +8 -8
helm/benchmark/presentation/taxonomy_info.py +20 -0
helm/benchmark/presentation/test_contamination.py +2 -2
helm/benchmark/presentation/test_create_plots.py +4 -1
helm/benchmark/presentation/test_run_entry.py +2 -2
helm/benchmark/presentation/test_schema.py +11 -0
helm/benchmark/presentation/test_summarize.py +148 -6
helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
helm/benchmark/reeval_run.py +202 -0
helm/benchmark/reeval_runner.py +355 -0
helm/benchmark/run.py +151 -87
helm/benchmark/run_expander.py +418 -33
helm/benchmark/run_spec.py +93 -0
helm/benchmark/run_spec_factory.py +180 -0
helm/benchmark/run_specs/__init__.py +0 -0
helm/benchmark/run_specs/air_bench_run_specs.py +58 -0
helm/benchmark/run_specs/arabic_run_specs.py +197 -0
helm/benchmark/run_specs/audio_run_specs.py +657 -0
helm/benchmark/run_specs/bluex_run_specs.py +40 -0
helm/benchmark/run_specs/call_center_run_specs.py +201 -0
helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
helm/benchmark/run_specs/classic_run_specs.py +1393 -0
helm/benchmark/run_specs/cleva_run_specs.py +277 -0
helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
helm/benchmark/run_specs/decodingtrust_run_specs.py +316 -0
helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
helm/benchmark/run_specs/enterprise_run_specs.py +280 -0
helm/benchmark/run_specs/experimental_run_specs.py +224 -0
helm/benchmark/run_specs/finance_run_specs.py +114 -0
helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
helm/benchmark/run_specs/heim_run_specs.py +625 -0
helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
helm/benchmark/run_specs/lite_run_specs.py +307 -0
helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
helm/benchmark/run_specs/long_context_run_specs.py +188 -0
helm/benchmark/run_specs/medhelm/__init__.py +0 -0
helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
helm/benchmark/run_specs/medhelm_run_specs.py +1570 -0
helm/benchmark/run_specs/melt_run_specs.py +783 -0
helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
helm/benchmark/run_specs/oab_exams_specs.py +32 -0
helm/benchmark/run_specs/safety_run_specs.py +191 -0
helm/benchmark/run_specs/seahelm_run_specs.py +652 -0
helm/benchmark/run_specs/simple_run_specs.py +104 -0
helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +167 -0
helm/benchmark/run_specs/sql_run_specs.py +54 -0
helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
helm/benchmark/run_specs/unitxt_run_specs.py +51 -0
helm/benchmark/run_specs/vlm_run_specs.py +1057 -0
helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
helm/benchmark/runner.py +63 -62
helm/benchmark/runner_config_registry.py +21 -0
helm/benchmark/scenarios/aci_bench_scenario.py +149 -0
helm/benchmark/scenarios/air_bench_scenario.py +76 -0
helm/benchmark/scenarios/alghafa_scenario.py +126 -0
helm/benchmark/scenarios/alrage_scenario.py +54 -0
helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +27 -3
helm/benchmark/scenarios/anthropic_red_team_scenario.py +82 -0
helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
helm/benchmark/scenarios/arabic_mmlu_scenario.py +82 -0
helm/benchmark/scenarios/aratrust_scenario.py +95 -0
helm/benchmark/scenarios/audio_language/__init__.py +0 -0
helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +130 -0
helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +74 -0
helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +70 -0
helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +79 -0
helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +78 -0
helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +83 -0
helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +105 -0
helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
helm/benchmark/scenarios/babi_qa_scenario.py +16 -1
helm/benchmark/scenarios/banking77_scenario.py +77 -0
helm/benchmark/scenarios/bbq_scenario.py +17 -2
helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
helm/benchmark/scenarios/big_bench_scenario.py +11 -1
helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
helm/benchmark/scenarios/bird_sql_scenario.py +112 -0
helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
helm/benchmark/scenarios/blimp_scenario.py +1 -1
helm/benchmark/scenarios/bluex_scenario.py +70 -0
helm/benchmark/scenarios/bold_scenario.py +18 -3
helm/benchmark/scenarios/boolq_scenario.py +21 -1
helm/benchmark/scenarios/call_center_scenario.py +84 -0
helm/benchmark/scenarios/casehold_scenario.py +79 -0
helm/benchmark/scenarios/chw_care_plan_scenario.py +129 -0
helm/benchmark/scenarios/ci_mcqa_scenario.py +80 -0
helm/benchmark/scenarios/civil_comments_scenario.py +14 -1
helm/benchmark/scenarios/clear_scenario.py +180 -0
helm/benchmark/scenarios/cleva_scenario.py +482 -3
helm/benchmark/scenarios/code_scenario.py +46 -4
helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
helm/benchmark/scenarios/commonsense_scenario.py +33 -1
helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +118 -0
helm/benchmark/scenarios/copyright_scenario.py +35 -1
helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
helm/benchmark/scenarios/cti_to_mitre_scenario.py +261 -0
helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
helm/benchmark/scenarios/czech_bank_qa_scenario.py +148 -0
helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +190 -0
helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +143 -0
helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +98 -0
helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +344 -0
helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +217 -0
helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +571 -0
helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +80 -0
helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +90 -0
helm/benchmark/scenarios/dialogue_scenarios.py +13 -3
helm/benchmark/scenarios/dischargeme_scenario.py +196 -0
helm/benchmark/scenarios/disinformation_scenario.py +32 -1
helm/benchmark/scenarios/dyck_language_scenario.py +25 -1
helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
helm/benchmark/scenarios/ehr_sql_scenario.py +137 -0
helm/benchmark/scenarios/ehrshot_scenario.py +1541 -0
helm/benchmark/scenarios/enem_challenge_scenario.py +77 -0
helm/benchmark/scenarios/entity_data_imputation_scenario.py +33 -3
helm/benchmark/scenarios/entity_matching_scenario.py +26 -2
helm/benchmark/scenarios/ewok_scenario.py +116 -0
helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
helm/benchmark/scenarios/fin_qa_scenario.py +139 -0
helm/benchmark/scenarios/financebench_scenario.py +74 -0
helm/benchmark/scenarios/financial_phrasebank_scenario.py +115 -0
helm/benchmark/scenarios/gold_commodity_news_scenario.py +145 -0
helm/benchmark/scenarios/gpqa_scenario.py +98 -0
helm/benchmark/scenarios/grammar.py +2 -2
helm/benchmark/scenarios/grammar_scenario.py +21 -2
helm/benchmark/scenarios/gsm_scenario.py +31 -1
helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +61 -0
helm/benchmark/scenarios/harm_bench_scenario.py +70 -0
helm/benchmark/scenarios/headqa_scenario.py +158 -0
helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +50 -0
helm/benchmark/scenarios/ice_scenario.py +28 -4
helm/benchmark/scenarios/ifeval_scenario.py +71 -0
helm/benchmark/scenarios/image_generation/__init__.py +0 -0
helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
helm/benchmark/scenarios/imdb_scenario.py +26 -3
helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +111 -0
helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +98 -0
helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
helm/benchmark/scenarios/koala_scenario.py +21 -1
helm/benchmark/scenarios/kpi_edgar_scenario.py +172 -0
helm/benchmark/scenarios/legal_contract_summarization_scenario.py +149 -0
helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
helm/benchmark/scenarios/legal_summarization_scenario.py +61 -1
helm/benchmark/scenarios/legal_support_scenario.py +24 -1
helm/benchmark/scenarios/legalbench_scenario.py +45 -3
helm/benchmark/scenarios/lex_glue_scenario.py +23 -2
helm/benchmark/scenarios/lextreme_scenario.py +22 -1
helm/benchmark/scenarios/live_qa_scenario.py +94 -0
helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
helm/benchmark/scenarios/lsat_qa_scenario.py +15 -1
helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
helm/benchmark/scenarios/math_scenario.py +81 -22
helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
helm/benchmark/scenarios/med_dialog_scenario.py +56 -22
helm/benchmark/scenarios/med_mcqa_scenario.py +24 -1
helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
helm/benchmark/scenarios/med_qa_scenario.py +30 -1
helm/benchmark/scenarios/medalign_scenario.py +117 -0
helm/benchmark/scenarios/medalign_scenario_helper.py +326 -0
helm/benchmark/scenarios/medbullets_scenario.py +167 -0
helm/benchmark/scenarios/medcalc_bench_scenario.py +149 -0
helm/benchmark/scenarios/medec_scenario.py +148 -0
helm/benchmark/scenarios/medhallu_scenario.py +95 -0
helm/benchmark/scenarios/medhelm/__init__.py +0 -0
helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
helm/benchmark/scenarios/medi_qa_scenario.py +134 -0
helm/benchmark/scenarios/medication_qa_scenario.py +96 -0
helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
helm/benchmark/scenarios/melt_scenarios.py +793 -0
helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
helm/benchmark/scenarios/mental_health_scenario.py +146 -0
helm/benchmark/scenarios/mimic_bhc_scenario.py +127 -0
helm/benchmark/scenarios/mimic_rrs_scenario.py +121 -0
helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +99 -0
helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
helm/benchmark/scenarios/mmlu_pro_scenario.py +113 -0
helm/benchmark/scenarios/mmlu_scenario.py +32 -1
helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
helm/benchmark/scenarios/msmarco_scenario.py +31 -1
helm/benchmark/scenarios/mtsamples_procedures_scenario.py +166 -0
helm/benchmark/scenarios/mtsamples_replicate_scenario.py +164 -0
helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +297 -0
helm/benchmark/scenarios/narrativeqa_scenario.py +20 -1
helm/benchmark/scenarios/natural_qa_scenario.py +33 -1
helm/benchmark/scenarios/newsqa_scenario.py +1 -1
helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
helm/benchmark/scenarios/omni_math_scenario.py +71 -0
helm/benchmark/scenarios/open_assistant_scenario.py +33 -2
helm/benchmark/scenarios/openai_mrcr_scenario.py +94 -0
helm/benchmark/scenarios/opinions_qa_scenario.py +1 -5
helm/benchmark/scenarios/pubmed_qa_scenario.py +81 -43
helm/benchmark/scenarios/quac_scenario.py +24 -1
helm/benchmark/scenarios/race_based_med_scenario.py +175 -0
helm/benchmark/scenarios/raft_scenario.py +33 -3
helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
helm/benchmark/scenarios/ruler_qa_scenarios.py +128 -0
helm/benchmark/scenarios/scenario.py +44 -1
helm/benchmark/scenarios/seahelm_scenario.py +2295 -0
helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
helm/benchmark/scenarios/shc_bmt_scenario.py +97 -0
helm/benchmark/scenarios/shc_cdi_scenario.py +95 -0
helm/benchmark/scenarios/shc_conf_scenario.py +99 -0
helm/benchmark/scenarios/shc_ent_scenario.py +98 -0
helm/benchmark/scenarios/shc_gip_scenario.py +94 -0
helm/benchmark/scenarios/shc_privacy_scenario.py +100 -0
helm/benchmark/scenarios/shc_proxy_scenario.py +98 -0
helm/benchmark/scenarios/shc_ptbm_scenario.py +104 -0
helm/benchmark/scenarios/shc_sei_scenario.py +94 -0
helm/benchmark/scenarios/shc_sequoia_scenario.py +98 -0
helm/benchmark/scenarios/simple_safety_tests_scenario.py +44 -0
helm/benchmark/scenarios/simple_scenarios.py +122 -1
helm/benchmark/scenarios/situation_prompts.yaml +49 -0
helm/benchmark/scenarios/spider_scenario.py +109 -0
helm/benchmark/scenarios/starr_patient_instructions_scenario.py +119 -0
helm/benchmark/scenarios/summarization_scenario.py +48 -1
helm/benchmark/scenarios/sumosum_scenario.py +157 -0
helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +24 -1
helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
helm/benchmark/scenarios/test_air_bench_scenario.py +27 -0
helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
helm/benchmark/scenarios/test_commonsense_scenario.py +21 -0
helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
helm/benchmark/scenarios/test_ewok_scenario.py +29 -0
helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
helm/benchmark/scenarios/test_financebench_scenario.py +26 -0
helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
helm/benchmark/scenarios/test_gsm_scenario.py +31 -0
helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
helm/benchmark/scenarios/test_legalbench_scenario.py +30 -0
helm/benchmark/scenarios/test_math_scenario.py +4 -3
helm/benchmark/scenarios/test_med_qa_scenario.py +30 -0
helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
helm/benchmark/scenarios/test_mmlu_scenario.py +33 -0
helm/benchmark/scenarios/test_narrativeqa_scenario.py +73 -0
helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
helm/benchmark/scenarios/test_scenario.py +6 -3
helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
helm/benchmark/scenarios/thai_exam_scenario.py +239 -0
helm/benchmark/scenarios/the_pile_scenario.py +13 -1
helm/benchmark/scenarios/truthful_qa_scenario.py +26 -2
helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
helm/benchmark/scenarios/unitxt_scenario.py +62 -0
helm/benchmark/scenarios/verifiability_judgment_scenario.py +4 -2
helm/benchmark/scenarios/vicuna_scenario.py +22 -2
helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +83 -0
helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +135 -0
helm/benchmark/scenarios/vision_language/exams_v_scenario.py +104 -0
helm/benchmark/scenarios/vision_language/fair_face_scenario.py +136 -0
helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +74 -0
helm/benchmark/scenarios/vision_language/gqa_scenario.py +91 -0
helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +94 -0
helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
helm/benchmark/scenarios/vision_language/image2struct/__init__.py +0 -0
helm/benchmark/scenarios/vision_language/image2struct/chart2csv_scenario.py +55 -0
helm/benchmark/scenarios/vision_language/image2struct/image2struct_scenario.py +225 -0
helm/benchmark/scenarios/vision_language/image2struct/latex_scenario.py +21 -0
helm/benchmark/scenarios/vision_language/image2struct/musicsheet_scenario.py +16 -0
helm/benchmark/scenarios/vision_language/image2struct/utils_latex.py +339 -0
helm/benchmark/scenarios/vision_language/image2struct/webpage/__init__.py +0 -0
helm/benchmark/scenarios/vision_language/image2struct/webpage/driver.py +84 -0
helm/benchmark/scenarios/vision_language/image2struct/webpage/jekyll_server.py +182 -0
helm/benchmark/scenarios/vision_language/image2struct/webpage/utils.py +31 -0
helm/benchmark/scenarios/vision_language/image2struct/webpage_scenario.py +256 -0
helm/benchmark/scenarios/vision_language/math_vista_scenario.py +117 -0
helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +103 -0
helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
helm/benchmark/scenarios/vision_language/mme_scenario.py +148 -0
helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
helm/benchmark/scenarios/vision_language/mscoco_captioning_scenario.py +92 -0
helm/benchmark/scenarios/vision_language/mscoco_categorization_scenario.py +117 -0
helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
helm/benchmark/scenarios/vision_language/originality_scenario.py +35 -0
helm/benchmark/scenarios/vision_language/pairs_scenario.py +247 -0
helm/benchmark/scenarios/vision_language/pope_scenario.py +105 -0
helm/benchmark/scenarios/vision_language/real_world_qa_scenario.py +57 -0
helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +131 -0
helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +98 -0
helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +4 -5
helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
helm/benchmark/scenarios/vision_language/vqa_scenario.py +8 -4
helm/benchmark/scenarios/wikifact_scenario.py +31 -1
helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
helm/benchmark/scenarios/wildbench_scenario.py +101 -0
helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
helm/benchmark/scenarios/wmt_14_scenario.py +33 -2
helm/benchmark/scenarios/xstest_scenario.py +35 -0
helm/benchmark/server.py +32 -2
helm/benchmark/slurm_jobs.py +1 -2
helm/benchmark/slurm_runner.py +78 -50
helm/benchmark/static/schema_air_bench.yaml +3149 -0
helm/benchmark/static/schema_arabic.yaml +271 -0
helm/benchmark/static/schema_audio.yaml +763 -0
helm/benchmark/static/schema_autobencher.yaml +150 -0
helm/benchmark/static/schema_call_center.yaml +269 -0
helm/benchmark/static/schema_capabilities.yaml +254 -0
helm/benchmark/static/schema_classic.yaml +259 -1140
helm/benchmark/static/schema_cleva.yaml +768 -0
helm/benchmark/static/schema_czech_bank.yaml +148 -0
helm/benchmark/static/schema_decodingtrust.yaml +444 -0
helm/benchmark/static/schema_enem_challenge.yaml +146 -0
helm/benchmark/static/schema_enterprise.yaml +319 -0
helm/benchmark/static/schema_ewok.yaml +367 -0
helm/benchmark/static/schema_finance.yaml +191 -0
helm/benchmark/static/schema_heim.yaml +1389 -0
helm/benchmark/static/schema_image2struct.yaml +588 -0
helm/benchmark/static/schema_instruction_following.yaml +161 -0
helm/benchmark/static/schema_legal.yaml +566 -0
helm/benchmark/static/schema_lite.yaml +3 -286
helm/benchmark/static/schema_long_context.yaml +282 -0
helm/benchmark/static/schema_medhelm.yaml +1176 -0
helm/benchmark/static/schema_melt.yaml +1257 -0
helm/benchmark/static/schema_mmlu.yaml +1449 -0
helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
helm/benchmark/static/schema_safety.yaml +283 -0
helm/benchmark/static/schema_seahelm.yaml +723 -0
helm/benchmark/static/schema_slp.yaml +219 -0
helm/benchmark/static/schema_slphelm.yaml +162 -0
helm/benchmark/static/schema_social_audio.yaml +224 -0
helm/benchmark/static/schema_sql.yaml +171 -0
helm/benchmark/static/schema_thai.yaml +244 -0
helm/benchmark/static/schema_torr.yaml +474 -0
helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
helm/benchmark/static/schema_unitxt.yaml +370 -0
helm/benchmark/static/schema_vhelm.yaml +933 -0
helm/benchmark/static/schema_vhelm_lite.yaml +109 -0
helm/benchmark/static/schema_video.yaml +219 -0
helm/benchmark/static_build/assets/air-overview-DpBbyagA.png +0 -0
helm/benchmark/static_build/assets/audio-table-Dn5NMMeJ.png +0 -0
helm/benchmark/static_build/assets/heim-logo-BJtQlEbV.png +0 -0
helm/benchmark/static_build/assets/helm-safety-COfndXuS.png +0 -0
helm/benchmark/static_build/assets/helmhero-D9TvmJsp.png +0 -0
helm/benchmark/static_build/assets/index-oIeiQW2g.css +1 -0
helm/benchmark/static_build/assets/index-qOFpOyHb.js +10 -0
helm/benchmark/static_build/assets/medhelm-overview-CND0EIsy.png +0 -0
helm/benchmark/static_build/assets/medhelm-v1-overview-Cu2tphBB.png +0 -0
helm/benchmark/static_build/assets/overview-BwypNWnk.png +0 -0
helm/benchmark/static_build/assets/process-flow-DWDJC733.png +0 -0
helm/benchmark/static_build/assets/react-BteFIppM.js +85 -0
helm/benchmark/static_build/assets/recharts-DxuQtTOs.js +97 -0
helm/benchmark/static_build/assets/tremor-DR4fE7ko.js +10 -0
helm/benchmark/static_build/assets/vhelm-aspects-NiDQofvP.png +0 -0
helm/benchmark/static_build/assets/vhelm-framework-NxJE4fdA.png +0 -0
helm/benchmark/static_build/assets/vhelm-model-ypCL5Yvq.png +0 -0
helm/benchmark/static_build/config.js +4 -0
helm/benchmark/static_build/index.html +19 -0
helm/benchmark/test_data_preprocessor.py +3 -3
helm/benchmark/test_run_expander.py +1 -1
helm/benchmark/window_services/default_window_service.py +3 -45
helm/benchmark/window_services/encoder_decoder_window_service.py +4 -15
helm/benchmark/window_services/ice_window_service.py +1 -35
helm/benchmark/window_services/image_generation/__init__.py +0 -0
helm/benchmark/window_services/image_generation/clip_window_service.py +13 -0
helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
helm/benchmark/window_services/local_window_service.py +22 -5
helm/benchmark/window_services/test_anthropic_window_service.py +5 -4
helm/benchmark/window_services/test_bloom_window_service.py +5 -4
helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
helm/benchmark/window_services/test_gpt2_window_service.py +9 -4
helm/benchmark/window_services/test_gpt4_window_service.py +10 -4
helm/benchmark/window_services/test_gptj_window_service.py +11 -5
helm/benchmark/window_services/test_gptneox_window_service.py +6 -5
helm/benchmark/window_services/test_openai_window_service.py +18 -12
helm/benchmark/window_services/test_opt_window_service.py +6 -5
helm/benchmark/window_services/test_palmyra_window_service.py +5 -4
helm/benchmark/window_services/test_t0pp_window_service.py +5 -4
helm/benchmark/window_services/test_t511b_window_service.py +5 -4
helm/benchmark/window_services/test_ul2_window_service.py +5 -4
helm/benchmark/window_services/test_utils.py +6 -6
helm/benchmark/window_services/test_yalm_window_service.py +5 -4
helm/benchmark/window_services/tokenizer_service.py +7 -13
helm/benchmark/window_services/window_service.py +42 -0
helm/benchmark/window_services/window_service_factory.py +4 -1
helm/benchmark/window_services/yalm_window_service.py +1 -28
helm/clients/__init__.py +0 -0
helm/{proxy/clients → clients}/ai21_client.py +78 -12
helm/clients/aleph_alpha_client.py +114 -0
helm/{proxy/clients → clients}/anthropic_client.py +304 -21
helm/clients/audio_language/__init__.py +0 -0
helm/clients/audio_language/diva_llama_client.py +122 -0
helm/clients/audio_language/llama_omni/arguments.py +61 -0
helm/clients/audio_language/llama_omni/constants.py +9 -0
helm/clients/audio_language/llama_omni/conversation.py +213 -0
helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
helm/clients/audio_language/llama_omni/model/builder.py +88 -0
helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
helm/clients/audio_language/llama_omni/preprocess.py +295 -0
helm/clients/audio_language/llama_omni/utils.py +202 -0
helm/clients/audio_language/llama_omni_client.py +199 -0
helm/clients/audio_language/qwen2_5_omni_client.py +210 -0
helm/clients/audio_language/qwen2_audiolm_client.py +191 -0
helm/clients/audio_language/qwen_audiolm_client.py +153 -0
helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
helm/clients/audio_language/test.py +62 -0
helm/{proxy/clients → clients}/auto_client.py +72 -31
helm/clients/azure_openai_client.py +55 -0
helm/clients/bedrock_client.py +381 -0
helm/clients/bedrock_utils.py +105 -0
helm/{proxy/clients → clients}/client.py +92 -17
helm/clients/clip_score_client.py +49 -0
helm/clients/clip_scorers/__init__.py +0 -0
helm/clients/clip_scorers/base_clip_scorer.py +18 -0
helm/clients/clip_scorers/clip_scorer.py +50 -0
helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
helm/{proxy/clients → clients}/cohere_client.py +105 -14
helm/clients/dspy_client.py +135 -0
helm/clients/gcs_client.py +82 -0
helm/{proxy/clients → clients}/google_client.py +8 -6
helm/clients/google_translate_client.py +35 -0
helm/clients/grok_client.py +36 -0
helm/{proxy/clients → clients}/http_model_client.py +8 -8
helm/{proxy/clients → clients}/huggingface_client.py +157 -86
helm/clients/huggingface_pipeline_client.py +138 -0
helm/clients/ibm_client.py +269 -0
helm/clients/image_generation/__init__.py +0 -0
helm/clients/image_generation/adobe_vision_client.py +80 -0
helm/clients/image_generation/aleph_alpha_image_generation_client.py +100 -0
helm/clients/image_generation/cogview2/__init__.py +0 -0
helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +99 -0
helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +144 -0
helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
helm/clients/image_generation/cogview2_client.py +192 -0
helm/clients/image_generation/dalle2_client.py +194 -0
helm/clients/image_generation/dalle3_client.py +108 -0
helm/clients/image_generation/dalle_mini/__init__.py +3 -0
helm/clients/image_generation/dalle_mini/data.py +442 -0
helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
helm/clients/image_generation/dalle_mini/model/text.py +251 -0
helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
helm/clients/image_generation/dalle_mini_client.py +191 -0
helm/clients/image_generation/deep_floyd_client.py +80 -0
helm/clients/image_generation/huggingface_diffusers_client.py +250 -0
helm/clients/image_generation/image_generation_client_utils.py +9 -0
helm/clients/image_generation/lexica_client.py +88 -0
helm/clients/image_generation/mindalle/__init__.py +0 -0
helm/clients/image_generation/mindalle/models/__init__.py +216 -0
helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
helm/clients/image_generation/mindalle/utils/config.py +129 -0
helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
helm/clients/image_generation/mindalle/utils/utils.py +89 -0
helm/clients/image_generation/mindalle_client.py +116 -0
helm/clients/image_generation/nudity_check_client.py +64 -0
helm/clients/image_generation/together_image_generation_client.py +113 -0
helm/{proxy/clients → clients}/lit_gpt_client.py +6 -6
helm/{proxy/clients → clients}/megatron_client.py +7 -5
helm/clients/mistral_client.py +180 -0
helm/clients/moderation_api_client.py +111 -0
helm/clients/nvidia_nim_client.py +32 -0
helm/clients/open_lm_client.py +43 -0
helm/clients/openai_client.py +604 -0
helm/clients/openai_responses_client.py +200 -0
helm/clients/openrouter_client.py +31 -0
helm/{proxy/clients → clients}/palmyra_client.py +31 -14
helm/{proxy/clients → clients}/perspective_api_client.py +18 -14
helm/clients/reka_client.py +190 -0
helm/clients/simple_client.py +64 -0
helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
helm/clients/stanfordhealthcare_claude_client.py +31 -0
helm/clients/stanfordhealthcare_google_client.py +43 -0
helm/clients/stanfordhealthcare_http_model_client.py +95 -0
helm/clients/stanfordhealthcare_openai_client.py +62 -0
helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
helm/{proxy/clients → clients}/test_auto_client.py +13 -15
helm/clients/test_client.py +98 -0
helm/{proxy/clients → clients}/test_huggingface_client.py +31 -16
helm/clients/test_openrouter_client.py +69 -0
helm/clients/test_simple_client.py +19 -0
helm/clients/test_together_client.py +184 -0
helm/clients/together_client.py +599 -0
helm/clients/upstage_client.py +23 -0
helm/clients/vertexai_client.py +488 -0
helm/clients/vision_language/__init__.py +0 -0
helm/clients/vision_language/huggingface_vision2seq_client.py +148 -0
helm/clients/vision_language/huggingface_vlm_client.py +114 -0
helm/{proxy/clients → clients}/vision_language/idefics_client.py +61 -51
helm/clients/vision_language/open_flamingo/__init__.py +2 -0
helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
helm/clients/vision_language/open_flamingo_client.py +155 -0
helm/clients/vision_language/paligemma_client.py +147 -0
helm/clients/vision_language/palmyra_vision_client.py +101 -0
helm/clients/vision_language/qwen2_vlm_client.py +189 -0
helm/clients/vision_language/qwen_vlm_client.py +174 -0
helm/clients/vllm_client.py +80 -0
helm/clients/vllm_granite_thinking_client.py +56 -0
helm/clients/writer_client.py +105 -0
helm/clients/yi_client.py +28 -0
helm/common/audio_utils.py +111 -0
helm/common/cache.py +23 -33
helm/common/cache_backend_config.py +47 -0
helm/common/clip_score_request.py +41 -0
helm/common/context.py +80 -0
helm/common/credentials_utils.py +5 -5
helm/common/critique_request.py +10 -2
helm/common/file_caches/__init__.py +0 -0
helm/common/file_caches/file_cache.py +16 -0
helm/common/file_caches/local_file_cache.py +61 -0
helm/common/file_caches/test_local_file_cache.py +25 -0
helm/common/file_upload_request.py +27 -0
helm/common/general.py +10 -3
helm/common/hierarchical_logger.py +124 -12
helm/common/image_generation_parameters.py +25 -0
helm/common/images_utils.py +60 -5
helm/common/key_value_store.py +41 -10
helm/common/local_context.py +140 -0
helm/common/media_object.py +14 -1
helm/common/moderations_api_request.py +71 -0
helm/common/mongo_key_value_store.py +8 -7
helm/common/multimodal_request_utils.py +57 -0
helm/common/nudity_check_request.py +29 -0
helm/common/object_spec.py +23 -8
helm/common/optional_dependencies.py +1 -1
helm/common/reeval_parameters.py +12 -0
helm/common/remote_context.py +61 -0
helm/common/request.py +45 -19
helm/common/response_format.py +18 -0
helm/common/test_cache.py +1 -48
helm/common/test_general.py +10 -0
helm/common/test_logging.py +94 -0
helm/common/test_media_object.py +1 -1
helm/common/tokenization_request.py +1 -10
helm/config/model_deployments.yaml +4713 -1005
helm/config/model_metadata.yaml +4045 -255
helm/config/tokenizer_configs.yaml +1091 -50
helm/proxy/accounts.py +31 -4
helm/proxy/cli.py +6 -4
helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
helm/proxy/critique/mechanical_turk_utils.py +1 -1
helm/proxy/critique/model_critique_client.py +40 -10
helm/proxy/example_queries.py +33 -28
helm/proxy/retry.py +5 -0
helm/proxy/server.py +82 -18
helm/proxy/services/remote_service.py +32 -7
helm/proxy/services/server_service.py +71 -69
helm/proxy/services/service.py +30 -6
helm/proxy/services/test_remote_service.py +6 -5
helm/proxy/services/test_service.py +1 -13
helm/proxy/static/help.html +99 -0
helm/proxy/static/index.css +61 -0
helm/proxy/static/index.html +40 -0
helm/proxy/static/index.js +462 -0
helm/proxy/test_accounts.py +32 -0
helm/proxy/test_retry.py +1 -1
helm/proxy/token_counters/auto_token_counter.py +37 -37
helm/proxy/token_counters/test_auto_token_counter.py +164 -0
helm/proxy/token_counters/token_counter.py +3 -5
helm/tokenizers/__init__.py +0 -0
helm/tokenizers/ai21_tokenizer.py +52 -0
helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +1 -1
helm/{proxy/tokenizers → tokenizers}/auto_tokenizer.py +9 -12
helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +2 -30
helm/tokenizers/cohere_tokenizer.py +50 -0
helm/tokenizers/grok_tokenizer.py +55 -0
helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +4 -4
helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +44 -41
helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +1 -1
helm/tokenizers/simple_tokenizer.py +33 -0
helm/tokenizers/test_ai21_tokenizer.py +48 -0
helm/{proxy/tokenizers → tokenizers}/test_anthropic_tokenizer.py +6 -2
helm/tokenizers/test_cohere_tokenizer.py +39 -0
helm/tokenizers/test_grok_tokenizer.py +33 -0
helm/{proxy/tokenizers → tokenizers}/test_huggingface_tokenizer.py +9 -2
helm/tokenizers/test_simple_tokenizer.py +33 -0
helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +1 -1
helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +1 -1
helm/{proxy/tokenizers → tokenizers}/tokenizer.py +3 -1
helm/{proxy/tokenizers → tokenizers}/vertexai_tokenizer.py +1 -1
helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +8 -6
helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
crfm_helm-0.4.0.dist-info/METADATA +0 -264
crfm_helm-0.4.0.dist-info/RECORD +0 -397
helm/benchmark/data_overlap/data_overlap_spec.py +0 -86
helm/benchmark/data_overlap/export_scenario_text.py +0 -119
helm/benchmark/data_overlap/light_scenario.py +0 -60
helm/benchmark/metrics/numeracy_metrics.py +0 -72
helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
helm/benchmark/run_specs.py +0 -2762
helm/benchmark/scenarios/numeracy_scenario.py +0 -784
helm/benchmark/static/benchmarking.css +0 -156
helm/benchmark/static/benchmarking.js +0 -1705
helm/benchmark/static/config.js +0 -3
helm/benchmark/static/images/helm-logo.png +0 -0
helm/benchmark/static/images/language-model-helm.png +0 -0
helm/benchmark/static/images/organizations/ai21.png +0 -0
helm/benchmark/static/images/organizations/anthropic.png +0 -0
helm/benchmark/static/images/organizations/bigscience.png +0 -0
helm/benchmark/static/images/organizations/cohere.png +0 -0
helm/benchmark/static/images/organizations/eleutherai.png +0 -0
helm/benchmark/static/images/organizations/google.png +0 -0
helm/benchmark/static/images/organizations/meta.png +0 -0
helm/benchmark/static/images/organizations/microsoft.png +0 -0
helm/benchmark/static/images/organizations/nvidia.png +0 -0
helm/benchmark/static/images/organizations/openai.png +0 -0
helm/benchmark/static/images/organizations/together.png +0 -0
helm/benchmark/static/images/organizations/tsinghua-keg.png +0 -0
helm/benchmark/static/images/organizations/yandex.png +0 -0
helm/benchmark/static/images/scenarios-by-metrics.png +0 -0
helm/benchmark/static/images/taxonomy-scenarios.png +0 -0
helm/benchmark/static/index.html +0 -68
helm/benchmark/static/json-urls.js +0 -69
helm/benchmark/static/plot-captions.js +0 -27
helm/benchmark/static/utils.js +0 -285
helm/benchmark/test_model_deployment_definition.py +0 -92
helm/benchmark/test_model_properties.py +0 -1570
helm/benchmark/vlm_run_specs.py +0 -97
helm/benchmark/window_services/ai21_window_service.py +0 -258
helm/benchmark/window_services/cohere_window_service.py +0 -163
helm/benchmark/window_services/flan_t5_window_service.py +0 -29
helm/benchmark/window_services/gpt2_window_service.py +0 -32
helm/benchmark/window_services/huggingface_window_service.py +0 -60
helm/benchmark/window_services/t0pp_window_service.py +0 -35
helm/benchmark/window_services/t511b_window_service.py +0 -30
helm/benchmark/window_services/test_ai21_window_service.py +0 -163
helm/benchmark/window_services/test_cohere_window_service.py +0 -74
helm/benchmark/window_services/test_cohere_window_service_utils.py +0 -8328
helm/benchmark/window_services/test_ice_window_service.py +0 -326
helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
helm/benchmark/window_services/ul2_window_service.py +0 -30
helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
helm/common/cache_utils.py +0 -14
helm/proxy/clients/aleph_alpha_client.py +0 -95
helm/proxy/clients/goose_ai_client.py +0 -99
helm/proxy/clients/microsoft_client.py +0 -180
helm/proxy/clients/openai_client.py +0 -206
helm/proxy/clients/simple_client.py +0 -60
helm/proxy/clients/test_client.py +0 -49
helm/proxy/clients/test_together_client.py +0 -97
helm/proxy/clients/together_client.py +0 -334
helm/proxy/clients/vertexai_client.py +0 -115
helm/proxy/token_counters/ai21_token_counter.py +0 -20
helm/proxy/token_counters/cohere_token_counter.py +0 -13
helm/proxy/token_counters/free_token_counter.py +0 -12
helm/proxy/token_counters/gooseai_token_counter.py +0 -24
helm/proxy/token_counters/openai_token_counter.py +0 -22
helm/proxy/token_counters/test_ai21_token_counter.py +0 -88
helm/proxy/token_counters/test_openai_token_counter.py +0 -81
helm/proxy/tokenizers/ai21_tokenizer.py +0 -60
helm/proxy/tokenizers/anthropic_tokenizer.py +0 -52
helm/proxy/tokenizers/cohere_tokenizer.py +0 -83
helm/proxy/tokenizers/ice_tokenizer.py +0 -30
helm/proxy/tokenizers/simple_tokenizer.py +0 -32
helm/proxy/tokenizers/test_ice_tokenizer.py +0 -57
{crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info}/entry_points.txt +0 -0
{crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info/licenses}/LICENSE +0 -0
{crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info}/top_level.txt +0 -0
/helm/benchmark/{data_overlap → annotation}/__init__.py +0 -0
/helm/{proxy/clients → benchmark/annotation/image2struct}/__init__.py +0 -0
/helm/{proxy/clients/vision_language → benchmark/metrics/ifeval}/__init__.py +0 -0
/helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
/helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
/helm/benchmark/{static/images/crfm-logo.png → static_build/assets/crfm-logo-Du4T1uWZ.png} +0 -0
/helm/benchmark/{static/images/helm-logo-simple.png → static_build/assets/helm-logo-simple-DzOhNN41.png} +0 -0
/helm/{proxy/clients → clients}/ai21_utils.py +0 -0
/helm/{proxy/clients → clients}/cohere_utils.py +0 -0
/helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
/helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
/helm/{benchmark → proxy}/static/general.js +0 -0
/helm/{benchmark → proxy}/static/info-icon.png +0 -0

helm/benchmark/scenarios/ehrshot_scenario.py ADDED Viewed

@@ -0,0 +1,1541 @@
+import multiprocessing
+import os
+import pandas as pd
+import tiktoken
+from functools import partial
+from tqdm import tqdm
+from typing import Any, Dict, List, Optional, Mapping
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
+from helm.common.general import check_file_exists, ensure_directory_exists
+from helm.benchmark.scenarios.scenario import (
+    TEST_SPLIT,
+    Input,
+    Scenario,
+    Instance,
+    CORRECT_TAG,
+    Reference,
+    Output,
+    ScenarioMetadata,
+)
+##################################
+# Config
+##################################
+# Module-level settings for this scenario.
+# The "guo", "lab" and "new" entries share their names with the prefixes of
+# the task keys defined further down ("guo_*", "lab_*", "new_*"); each one
+# holds the same five boolean flags.
+# NOTE(review): the flags presumably select which optional prompt sections
+# (persona, clinical/code definitions, chain-of-thought) are emitted for that
+# task family -- the code that reads CONFIG is outside this chunk; confirm.
+CONFIG: Dict[str, Any] = {
+    "seed": 1,
+    "n_shots": 0,
+    "ehr_converter": "codes_only",
+    "max_labels_per_task": 10_000,
+    "guo": {
+        "is_include_persona": True,
+        "is_include_clinical_def": False,
+        "is_include_code_def": False,
+        "is_use_short_clinical_def": False,
+        "is_include_cot": False,
+    },
+    "lab": {
+        "is_include_persona": True,
+        "is_include_clinical_def": False,
+        "is_include_code_def": False,
+        "is_use_short_clinical_def": False,
+        "is_include_cot": False,
+    },
+    # Only the "new" family turns on the definition-related flags.
+    "new": {
+        "is_include_persona": True,
+        "is_include_clinical_def": True,
+        "is_include_code_def": True,
+        "is_use_short_clinical_def": True,
+        "is_include_cot": False,
+    },
+}
+##################################
+# Task names
+##################################
+# Human-readable name for every task key. Keys are grouped by prefix:
+# "new_*" (six conditions), "lab_*" (five lab findings) and "guo_*"
+# (three hospital-visit outcomes).
+TASK_FULL_NAMES: Dict[str, str] = {
+    "new_acutemi": "Acute Myocardial Infarction",
+    "new_celiac": "Celiac Disease",
+    "new_hyperlipidemia": "Hyperlipidemia",
+    "new_hypertension": "Hypertension",
+    "new_lupus": "Systemic Lupus Erythematosus",
+    "new_pancan": "Pancreatic Cancer",
+    "lab_anemia": "Anemia",
+    "lab_hyperkalemia": "Hyperkalemia",
+    "lab_hypoglycemia": "Hypoglycemia",
+    "lab_hyponatremia": "Hyponatremia",
+    "lab_thrombocytopenia": "Thrombocytopenia",
+    "guo_los": "Length of Stay",
+    "guo_readmission": "30-Day Readmission",
+    "guo_icu": "ICU Transfer",
+}
+TASK_QUESTIONS = {
+    "new_acutemi": "Is this patient likely to receive a first-time diagnosis of \
+    Acute Myocardial Infarction within the next year?",
+    "new_celiac": "Is this patient likely to receive a first-time diagnosis of \
+    Celiac Disease within the next year?",
+    "new_hyperlipidemia": "Is this patient likely to receive a first-time diagnosis \
+    of Hyperlipidemia within the next year?",
+    "new_hypertension": "Is this patient likely to receive a first-time diagnosis \
+    of Hypertension within the next year?",
+    "new_lupus": "Is this patient likely to receive a first-time diagnosis of \
+    Systemic Lupus Erythematosus within the next year?",
+    "new_pancan": "Is this patient likely to receive a first-time diagnosis of \
+    Pancreatic Cancer within the next year?",
+    "lab_anemia": "If a lab test for Anemia is ordered for this patient right now, \
+    will it come back back abnormal? (i.e. <120 g/L)",
+    "lab_hyperkalemia": "If a lab test for Hyperkalemia is ordered for this patient right now, \
+    will it come back back abnormal (i.e. >5.5 mmol/L)?",
+    "lab_hypoglycemia": "If a lab test for Hypoglycemia is ordered for this patient right now, \
+    will it come back back abnormal (i.e. <3.9 mmol/L)?",
+    "lab_hyponatremia": "If a lab test for Hyponatremia is ordered for this patient right now, \
+    will it come back back abnormal (i.e. <135 mmol/L)?",
+    "lab_thrombocytopenia": "If a lab test for Thrombocytopenia is ordered for this patient right now, \
+    will it come back back abnormal? (i.e. <150 109/L)",
+    "guo_los": "If this patient is admitted to the hospital right now, is the patient likely to have a \
+    length of stay of at least 7 days (i.e. a week)?",
+    "guo_readmission": "If this patient is discharged from the hospital right now, is the patient likely \
+    to be readmitted to the hospital within 30 days?",
+    "guo_icu": "If this patient is admitted to the hospital right now, is the patient likely to be \
+    transferred to the ICU at any point during their admission?",
+}
+##################################
+# Task Definitions
+##################################
+# Exact task definitions from the EHRSHOT paper
+TASK_DEFS = {}
+TASK_DEFS["guo_icu"] = (
+    "Predict whether a patient will be transferred to the ICU during a visit to the "
+    "hospital. The prediction time is at 11:59pm on the day of admission, and ICU "
+    "transfers that occur on the same day as admission are ignored."
+)
+TASK_DEFS["guo_los"] = (
+    "Predict whether a patient’s total length of stay during a visit to the hospital "
+    "will be at least 7 days. The prediction time is at 11:59pm on the day of admission, "
+    "and visits that last less than one day (i.e. discharge occurs on the same day of "
+    "admission) are ignored."
+)
+TASK_DEFS["guo_readmission"] = (
+    "Predict whether a patient will be re-admitted to the hospital within 30 days after "
+    "being discharged from a visit. The prediction time is at 11:59pm on the day of "
+    "admission, and admissions where a readmission occurs on the same day as the "
+    "corresponding discharge are ignored."
+)
+# Use the binary classification formulation here
+TASK_DEFS["lab_thrombocytopenia"] = (
+    "Predict whether a thrombocytopenia lab comes back as normal (>=150 109/L) "
+    " or abnormal (<150 109/L). We consider all lab results coded as LOINC/LP393218-5, "
+    "LOINC/LG32892-8, or LOINC/777-3. The prediction time is immediately before the lab "
+    "result is recorded."
+)
+TASK_DEFS["lab_hyperkalemia"] = (
+    "Predict whether a hyperkalemia lab comes back as normal (<=5.5 mmol/L) or "
+    "abnormal (>5.5 mmol/L). We consider all lab results coded as LOINC LG7931-1, "
+    "LOINC/LP386618-5, LOINC/LG109906, LOINC/6298-4, or LOINC/2823-3. The prediction "
+    "time is immediately before the lab result is recorded."
+)
+TASK_DEFS["lab_hypoglycemia"] = (
+    "Predict whether a hypoglycemia lab comes back as normal (>=3.9 mmol/L) or "
+    "abnormal (<3.9 mmol/L). We consider all lab results coded as SNOMED/33747003, "
+    "LOINC/LP4161453, or LOINC/14749-6. The prediction time is immediately before the "
+    "lab result is recorded."
+)
+TASK_DEFS["lab_hyponatremia"] = (
+    "Predict whether a hyponatremia lab comes back as normal (>=135 mmol/L) or "
+    "abnormal (<135 mmol/L). We consider all lab results coded as LOINC/LG11363-5, "
+    "LOINC/2951-2, or LOINC/2947-0. The prediction time is immediately before the lab "
+    "result is recorded."
+)
+TASK_DEFS["lab_anemia"] = (
+    "Predict whether an anemia lab comes back as normal (>=120 g/L) or "
+    "abnormal (<120 g/L). We consider all lab results coded as LOINC/LP392452-1. "
+    "The prediction time is immediately before the lab result is recorded."
+)
+##################################
+# Personas
+##################################
+# Generated from GPT4o using the wikipedia definition and this prompt:
+#
+# Read the following definition of a medical condition and suggest the most
+# likely medical specialists (up to 5) who would diagnosis and treat a patient
+# with this condition. Only list the title and respond with a Python List object.
+#
+# "{CLINICAL_DEFINIION}""
+PERSONAS = {}
+PERSONAS["new_acutemi"] = [
+    "Cardiologist",
+    "Emergency Medicine Physician",
+    "Interventional Cardiologist",
+    "Intensivist",
+    "Primary Care Physician",
+]
+PERSONAS["new_celiac"] = [
+    "Gastroenterologist",
+    "Immunologist",
+    "Endocrinologist",
+    "Pediatrician",
+    "Primary Care Physician",
+]
+PERSONAS["new_hyperlipidemia"] = [
+    "Cardiologist",
+    "Endocrinologist",
+    "Primary Care Physician",
+    "Lipidologist",
+    "Gastroenterologist",
+]
+PERSONAS["new_hypertension"] = [
+    "Primary Care Physician",
+    "Cardiologist",
+    "Nephrologist",
+    "Endocrinologist",
+    "Internist",
+]
+PERSONAS["new_lupus"] = [
+    "Rheumatologist",
+    "Immunologist",
+    "Nephrologist",
+    "Cardiologist",
+    "Dermatologist",
+]
+PERSONAS["new_pancan"] = [
+    "Oncologist",
+    "Gastroenterologist",
+    "Radiologist",
+    "Hepatobiliary Surgeon",
+    "Genetic Counselor",
+]
+# Generated from GPT4o using the wikipedia definition and this prompt:
+#
+# Read the following definition of a medical prediction task, and suggest
+# the most likely medical specialists or professions (up to 5) who would be
+# involved in either predicting, treating, or managing a patient who might
+# have this event occur. Only list the title and respond with a Python List object.
+#
+# "{TASK_DEFINITION}""
+PERSONAS["guo_icu"] = [
+    "Intensivist",
+    "Hospitalist",
+    "Critical Care Nurse",
+    "Emergency Medicine Physician",
+    "Medical Data Scientist",
+]
+PERSONAS["guo_los"] = [
+    "Hospitalist",
+    "Internal Medicine Specialist",
+    "Intensivist",
+    "Discharge Planner",
+    "Clinical Data Scientist",
+]
+PERSONAS["guo_readmission"] = [
+    "Hospitalist",
+    "Internal Medicine Specialist",
+    "Case Manager",
+    "Discharge Planner",
+    "Primary Care Physician",
+]
+PERSONAS["lab_anemia"] = [
+    "Hematologist",
+    "Primary Care Physician",
+    "Internal Medicine Specialist",
+    "Clinical Pathologist",
+    "Nurse Practitioner",
+]
+PERSONAS["lab_hyperkalemia"] = [
+    "Nephrologist",
+    "Endocrinologist",
+    "Cardiologist",
+    "Primary Care Physician",
+    "Clinical Laboratory Scientist",
+]
+PERSONAS["lab_hypoglycemia"] = [
+    "Endocrinologist",
+    "Primary Care Physician",
+    "Diabetologist",
+    "Clinical Laboratory Scientist",
+    "Nurse Practitioner",
+]
+PERSONAS["lab_hyponatremia"] = [
+    "Nephrologist",
+    "Endocrinologist",
+    "Primary Care Physician",
+    "Emergency Medicine Physician",
+    "Clinical Laboratory Scientist",
+]
+PERSONAS["lab_thrombocytopenia"] = [
+    "Hematologist",
+    "Primary Care Physician",
+    "Pathologist",
+    "Oncologist",
+    "Critical Care Specialist",
+]
+##################################
+# Clinical Definitions
+##################################
+# Generated from GPT4o using the Wikipedia definition and this prompt:
+#
+# "Read the following text describing a clinical condition. Provide a short,
+# diagnostic-focused definition that would enable a doctor LLM to review a
+# patient's historical EHR and predict the likelihood of developing the
+# condition in question: {text}"
+CLINICAL_SHORT_DEFS = {}
+CLINICAL_SHORT_DEFS["new_acutemi"] = (
+    "A myocardial infarction (MI), commonly known as a heart attack, occurs when "
+    "blood flow to the heart muscle is blocked, leading to tissue death. Key "
+    "symptoms include chest pain radiating to the left shoulder, arm, or jaw, "
+    "shortness of breath, nausea, and cold sweats. Atypical presentations, "
+    "especially in women and the elderly, include neck pain, fatigue, and "
+    "minimal symptoms. Major risk factors are coronary artery disease, high "
+    "blood pressure, smoking, diabetes, and obesity. Diagnosis involves ECGs and "
+    "blood tests for troponin levels. Immediate treatment aims to restore blood "
+    "flow via percutaneous coronary intervention (PCI) or thrombolysis. Long-term "
+    "management includes lifestyle changes and medications like aspirin and "
+    "statins."
+)
+CLINICAL_SHORT_DEFS["new_celiac"] = (
+    "Celiac disease is a chronic autoimmune disorder primarily affecting the small "
+    "intestine, triggered by an intolerance to gluten (proteins found in wheat, "
+    "rye, and barley). Symptoms range from gastrointestinal issues like chronic "
+    "diarrhea and malabsorption to non-gastrointestinal symptoms or even no "
+    "symptoms. It is linked with other autoimmune diseases such as Type 1 diabetes "
+    "and Hashimoto's thyroiditis. Diagnosis involves blood antibody tests, "
+    "intestinal biopsies, and genetic testing. The only effective treatment is a "
+    "strict lifelong gluten-free diet, which mitigates symptoms and reduces "
+    "complications."
+)
+CLINICAL_SHORT_DEFS["new_hyperlipidemia"] = (
+    "Hyperlipidemia is characterized by abnormally high levels of lipids or "
+    "lipoproteins in the blood, including fats, triglycerides, cholesterol, and "
+    "phospholipids. It can result from genetic factors (primary hyperlipidemia) or "
+    "underlying conditions like diabetes (secondary hyperlipidemia). This "
+    "condition is a modifiable risk factor for cardiovascular disease and may also "
+    "predispose individuals to acute pancreatitis. Diagnosis requires laboratory "
+    "tests to measure lipid levels, and management often involves chronic "
+    "medication to control these levels."
+)
+CLINICAL_SHORT_DEFS["new_hypertension"] = (
+    "Hypertension, or high blood pressure, is a chronic medical condition "
+    "characterized by persistently elevated blood pressure in the arteries, with a "
+    "resting measurement at or above 130/80 mmHg. It is a significant risk factor "
+    "for numerous cardiovascular and systemic diseases, including stroke, coronary "
+    "artery disease, heart failure, atrial fibrillation, and chronic kidney "
+    "disease. Hypertension is divided into primary (essential) hypertension, "
+    "accounting for 90-95% of cases, which is due to nonspecific lifestyle and "
+    "genetic factors, and secondary hypertension, due to identifiable causes like "
+    "chronic kidney disease and endocrine disorders. Identifying hypertension in a "
+    "patient's EHR involves reviewing blood pressure readings, assessing for risk "
+    "factors such as obesity, high salt intake, and smoking, and noting any related "
+    "health conditions or medications."
+)
+CLINICAL_SHORT_DEFS["new_lupus"] = (
+    "Systemic lupus erythematosus (SLE) is an autoimmune disease where the body's "
+    "immune system mistakenly attacks healthy tissues, causing inflammation and "
+    "damage in various organs. Common symptoms include joint pain and swelling, "
+    "fever, chest pain, hair loss, mouth ulcers, swollen lymph nodes, fatigue, and "
+    "a distinctive facial rash. Diagnosis involves a combination of clinical "
+    "symptoms and laboratory tests, particularly the presence of anti-nuclear "
+    "antibodies. Risk factors include genetic predisposition, female sex hormones, "
+    "sunlight exposure, smoking, vitamin D deficiency, and certain infections. "
+    "There is no cure, but treatments such as NSAIDs, corticosteroids, "
+    "immunosuppressants, hydroxychloroquine, and methotrexate can manage symptoms. "
+    "SLE significantly increases the risk of cardiovascular disease and can "
+    "complicate pregnancies."
+)
+CLINICAL_SHORT_DEFS["new_pancan"] = (
+    "Pancreatic cancer, primarily pancreatic adenocarcinoma, arises from the "
+    "pancreas and often goes undetected until advanced stages. Key diagnostic "
+    "indicators include abdominal pain, jaundice, weight loss, and a history of "
+    "smoking, obesity, diabetes, or certain genetic conditions. Screening relies "
+    "on imaging, blood tests, and biopsies, with risk factors including smoking "
+    "(25% of cases) and genetic predispositions (5-10%). Early diagnosis is rare, "
+    "with over half of cases occurring in individuals over 70. Neuroendocrine "
+    "tumors, though less common and aggressive, also originate in the pancreas. "
+    "Treatment options vary by stage and include surgery, radiotherapy, and "
+    "chemotherapy, but the prognosis remains poor, particularly for late-stage "
+    "adenocarcinoma."
+)
+# Definitions are comprised of the first section of each task's Wikpedia page.
+# Retreived July 19, 2024.
+CLINICAL_DEFS = {}
+CLINICAL_DEFS["new_acutemi"] = (
+    "A myocardial infarction (MI), commonly known as a heart attack, occurs when "
+    "blood flow decreases or stops in one of the coronary arteries of the heart, "
+    "causing infarction (tissue death) to the heart muscle. The most common symptom "
+    "is retrosternal chest pain or discomfort that classically radiates to the left "
+    "shoulder, arm, or jaw. The pain may occasionally feel like heartburn. Other "
+    "symptoms may include shortness of breath, nausea, feeling faint, a cold sweat, "
+    "feeling tired, and decreased level of consciousness. About 30% of people have "
+    "atypical symptoms. Women more often present without chest pain and instead have "
+    "neck pain, arm pain or feel tired. Among those over 75 years old, about 5% have "
+    "had an MI with little or no history of symptoms. An MI may cause heart failure, "
+    "an irregular heartbeat, cardiogenic shock or cardiac arrest. Most MIs occur due "
+    "to coronary artery disease. Risk factors include high blood pressure, smoking, "
+    "diabetes, lack of exercise, obesity, high blood cholesterol, poor diet, and "
+    "excessive alcohol intake. The complete blockage of a coronary artery caused by a "
+    "rupture of an atherosclerotic plaque is usually the underlying mechanism of an MI. "
+    "MIs are less commonly caused by coronary artery spasms, which may be due to "
+    "cocaine, significant emotional stress (often known as Takotsubo syndrome or broken "
+    "heart syndrome) and extreme cold, among others. Many tests are helpful to help "
+    "with diagnosis, including electrocardiograms (ECGs), blood tests and coronary "
+    "angiography. An ECG, which is a recording of the heart's electrical activity, may "
+    "confirm an ST elevation MI (STEMI), if ST elevation is present. Commonly used blood "
+    "tests include troponin and less often creatine kinase MB. Treatment of an MI is "
+    "time-critical. Aspirin is an appropriate immediate treatment for a suspected MI. "
+    "Nitroglycerin or opioids may be used to help with chest pain; however, they do not "
+    "improve overall outcomes. Supplemental oxygen is recommended in those with low "
+    "oxygen levels or shortness of breath. In a STEMI, treatments attempt to restore "
+    "blood flow to the heart and include percutaneous coronary intervention (PCI), where "
+    "the arteries are pushed open and may be stented, or thrombolysis, where the blockage "
+    "is removed using medications. People who have a non-ST elevation myocardial infarction "
+    "(NSTEMI) are often managed with the blood thinner heparin, with the additional use of "
+    "PCI in those at high risk. In people with blockages of multiple coronary arteries and "
+    "diabetes, coronary artery bypass surgery (CABG) may be recommended rather than "
+    "angioplasty. After an MI, lifestyle modifications, along with long-term treatment with "
+    "aspirin, beta blockers and statins, are typically recommended. Worldwide, about 15.9 "
+    "million myocardial infarctions occurred in 2015. More than 3 million people had an ST "
+    "elevation MI, and more than 4 million had an NSTEMI. STEMIs occur about twice as often "
+    "in men as women. About one million people have an MI each year in the United States. In "
+    "the developed world, the risk of death in those who have had a STEMI is about 10%. Rates "
+    "of MI for a given age have decreased globally between 1990 and 2010. In 2011, an MI was "
+    "one of the top five most expensive conditions during inpatient hospitalizations in the "
+    "US, with a cost of about $11.5 billion for 612,000 hospital stays."
+)
+CLINICAL_DEFS["new_celiac"] = (
+    "Coeliac disease (British English) or celiac disease (American English) is a "
+    "long-term autoimmune disorder, primarily affecting the small intestine, "
+    "where individuals develop intolerance to gluten, present in foods such as "
+    "wheat, rye and barley. Classic symptoms include gastrointestinal problems "
+    "such as chronic diarrhoea, abdominal distention, malabsorption, loss of "
+    "appetite, and among children failure to grow normally. Non-classic symptoms "
+    "are more common, especially in people older than two years. There may be "
+    "mild or absent gastrointestinal symptoms, a wide number of symptoms involving "
+    "any part of the body, or no obvious symptoms. Coeliac disease was first "
+    "described in childhood; however, it may develop at any age. It i-s associated "
+    "with other autoimmune diseases, such as Type 1 diabetes mellitus and Hashimoto's "
+    "thyroiditis, among others. Coeliac disease is caused by a reaction to gluten, a "
+    "group of various proteins found in wheat and in other grains such as barley and "
+    "rye. Moderate quantities of oats, free of contamination with other gluten-"
+    "containing grains, are usually tolerated. The occurrence of problems may depend "
+    "on the variety of oat. It occurs more often in people who are genetically "
+    "predisposed. Upon exposure to gluten, an abnormal immune response may lead to the "
+    "production of several different autoantibodies that can affect a number of "
+    "different organs. In the small bowel, this causes an inflammatory reaction and "
+    "may produce shortening of the villi lining the small intestine (villous atrophy). "
+    "This affects the absorption of nutrients, frequently leading to anaemia. Diagnosis "
+    "is typically made by a combination of blood antibody tests and intestinal biopsies, "
+    "helped by specific genetic testing. Making the diagnosis is not always "
+    "straightforward. About 10% of the time, the autoantibodies in the blood are "
+    "negative, and many people have only minor intestinal changes with normal villi. "
+    "People may have severe symptoms and they may be investigated for years before a "
+    "diagnosis is achieved. As a result of screening, the diagnosis is increasingly "
+    "being made in people who have no symptoms. Evidence regarding the effects of "
+    "screening, however, is not sufficient to determine its usefulness. While the "
+    "disease is caused by a permanent intolerance to gluten proteins, it is distinct "
+    "from wheat allergy, which is much more rare. The only known effective treatment is "
+    "a strict lifelong gluten-free diet, which leads to recovery of the intestinal "
+    "lining (mucous membrane), improves symptoms, and reduces the risk of developing "
+    "complications in most people. If untreated, it may result in cancers such as "
+    "intestinal lymphoma, and a slightly increased risk of early death. Rates vary "
+    "between different regions of the world, from as few as 1 in 300 to as many as 1 in "
+    "40, with an average of between 1 in 100 and 1 in 170 people. It is estimated that "
+    "80% of cases remain undiagnosed, usually because of minimal or absent "
+    "gastrointestinal complaints and lack of knowledge of symptoms and diagnostic "
+    "criteria. Coeliac disease is slightly more common in women than in men."
+)
+CLINICAL_DEFS["new_hyperlipidemia"] = (
+    "Hyperlipidemia is abnormally high levels of any or all lipids (e.g. fats, "
+    "triglycerides, cholesterol, phospholipids) or lipoproteins in the blood. "
+    "The term hyperlipidemia refers to the laboratory finding itself and is also "
+    "used as an umbrella term covering any of various acquired or genetic "
+    "disorders that result in that finding. Hyperlipidemia represents a subset "
+    "of dyslipidemia and a superset of hypercholesterolemia. Hyperlipidemia is "
+    "usually chronic and requires ongoing medication to control blood lipid "
+    "levels. Lipids (water-insoluble molecules) are transported in a protein "
+    "capsule. The size of that capsule, or lipoprotein, determines its density. "
+    "The lipoprotein density and type of apolipoproteins it contains determines "
+    "the fate of the particle and its influence on metabolism. Hyperlipidemias "
+    "are divided into primary and secondary subtypes. Primary hyperlipidemia is "
+    "usually due to genetic causes (such as a mutation in a receptor protein), "
+    "while secondary hyperlipidemia arises due to other underlying causes such "
+    "as diabetes. Lipid and lipoprotein abnormalities are common in the general "
+    "population and are regarded as modifiable risk factors for cardiovascular "
+    "disease due to their influence on atherosclerosis. In addition, some forms "
+    "may predispose to acute pancreatitis."
+)
+CLINICAL_DEFS["new_hypertension"] = (
+    "Hypertension, also known as high blood pressure, is a long-term medical "
+    "condition in which the blood pressure in the arteries is persistently "
+    "elevated. High blood pressure usually does not cause symptoms itself. It is, "
+    "however, a major risk factor for stroke, coronary artery disease, heart "
+    "failure, atrial fibrillation, peripheral arterial disease, vision loss, "
+    "chronic kidney disease, and dementia. Hypertension is a major cause of "
+    "premature death worldwide. High blood pressure is classified as primary "
+    "(essential) hypertension or secondary hypertension. About 90-95% of cases are "
+    "primary, defined as high blood pressure due to nonspecific lifestyle and "
+    "genetic factors. Lifestyle factors that increase the risk include excess salt "
+    "in the diet, excess body weight, smoking, physical inactivity and alcohol use. "
+    "The remaining 5-10% of cases are categorized as secondary hypertension, defined "
+    "as high blood pressure due to a clearly identifiable cause, such as chronic "
+    "kidney disease, narrowing of the kidney arteries, an endocrine disorder, or the "
+    "use of birth control pills. Blood pressure is classified by two measurements, "
+    "the systolic (first number) and diastolic (second number) pressures. For most "
+    "adults, normal blood pressure at rest is within the range of 100-140 millimeters "
+    "mercury (mmHg) systolic and 60-90 mmHg diastolic. For most adults, high blood "
+    "pressure is present if the resting blood pressure is persistently at or above "
+    "130/80 or 140/90 mmHg. Different numbers apply to children. Ambulatory blood "
+    "pressure monitoring over a 24-hour period appears more accurate than office-based "
+    "blood pressure measurement. Lifestyle changes and medications can lower blood "
+    "pressure and decrease the risk of health complications. Lifestyle changes include "
+    "weight loss, physical exercise, decreased salt intake, reducing alcohol intake, "
+    "and a healthy diet. If lifestyle changes are not sufficient, blood pressure "
+    "medications are used. Up to three medications taken concurrently can control "
+    "blood pressure in 90% of people. The treatment of moderately high arterial blood "
+    "pressure (defined as >160/100 mmHg) with medications is associated with an improved "
+    "life expectancy. The effect of treatment of blood pressure between 130/80 mmHg and "
+    "160/100 mmHg is less clear, with some reviews finding benefit and others finding "
+    "unclear benefit. High blood pressure affects 33% of the population globally. About "
+    "half of all people with high blood pressure do not know that they have it. In 2019, "
+    "high blood pressure was believed to have been a factor in 19% of all deaths (10.4 "
+    "million globally)."
+)
+CLINICAL_DEFS["new_lupus"] = (
+    "Lupus, technically known as systemic lupus erythematosus (SLE), is an autoimmune "
+    "disease in which the body's immune system mistakenly attacks healthy tissue in many "
+    "parts of the body. Symptoms vary among people and may be mild to severe. Common "
+    "symptoms include painful and swollen joints, fever, chest pain, hair loss, mouth "
+    "ulcers, swollen lymph nodes, feeling tired, and a red rash which is most commonly on "
+    "the face. Often there are periods of illness, called flares, and periods of remission "
+    "during which there are few symptoms. The cause of SLE is not clear. It is thought to "
+    "involve a combination of genetics and environmental factors. Among identical twins, "
+    "if one is affected there is a 24% chance the other one will also develop the disease. "
+    "Female sex hormones, sunlight, smoking, vitamin D deficiency, and certain infections "
+    "are also believed to increase a person's risk. The mechanism involves an immune "
+    "response by autoantibodies against a person's own tissues. These are most commonly "
+    "anti-nuclear antibodies and they result in inflammation. Diagnosis can be difficult "
+    "and is based on a combination of symptoms and laboratory tests. There are a number of "
+    "other kinds of lupus erythematosus including discoid lupus erythematosus, neonatal "
+    "lupus, and subacute cutaneous lupus erythematosus. There is no cure for SLE, but there "
+    "are experimental and symptomatic treatments. Treatments may include NSAIDs, "
+    "corticosteroids, immunosuppressants, hydroxychloroquine, and methotrexate. Although "
+    "corticosteroids are rapidly effective, long-term use results in side effects. "
+    "Alternative medicine has not been shown to affect the disease. Men have higher "
+    "mortality. SLE significantly increases the risk of cardiovascular disease, with this "
+    "being the most common cause of death. While women with lupus have higher risk "
+    "pregnancies, most are successful. Rate of SLE varies between countries from 20 to 70 "
+    "per 100,000. Women of childbearing age are affected about nine times more often than "
+    "men. While it most commonly begins between the ages of 15 and 45, a wide range of ages "
+    "can be affected. Those of African, Caribbean, and Chinese descent are at higher risk "
+    "than those of European descent. Rates of disease in the developing world are unclear. "
+    "Lupus is Latin for 'wolf': the disease was so-named in the 13th century as the rash "
+    "was thought to appear like a wolf's bite."
+)
+CLINICAL_DEFS["new_pancan"] = (
+    "Pancreatic cancer arises when cells in the pancreas, a glandular organ behind "
+    "the stomach, begin to multiply out of control and form a mass. These cancerous "
+    "cells have the ability to invade other parts of the body. A number of types of "
+    "pancreatic cancer are known. The most common, pancreatic adenocarcinoma, accounts "
+    "for about 90% of cases, and the term 'pancreatic cancer' is sometimes used to "
+    "refer only to that type. These adenocarcinomas start within the part of the "
+    "pancreas that makes digestive enzymes. Several other types of cancer, which "
+    "collectively represent the majority of the non-adenocarcinomas, can also arise "
+    "from these cells. About 1-2% of cases of pancreatic cancer are neuroendocrine "
+    "tumors, which arise from the hormone-producing cells of the pancreas. These are "
+    "generally less aggressive than pancreatic adenocarcinoma. Signs and symptoms of "
+    "the most-common form of pancreatic cancer may include yellow skin, abdominal or "
+    "back pain, unexplained weight loss, light-colored stools, dark urine, and loss of "
+    "appetite. Usually, no symptoms are seen in the disease's early stages, and "
+    "symptoms that are specific enough to suggest pancreatic cancer typically do not "
+    "develop until the disease has reached an advanced stage. By the time of diagnosis, "
+    "pancreatic cancer has often spread to other parts of the body. Pancreatic cancer "
+    "rarely occurs before the age of 40, and more than half of cases of pancreatic "
+    "adenocarcinoma occur in those over 70. Risk factors for pancreatic cancer include "
+    "tobacco smoking, obesity, diabetes, and certain rare genetic conditions. About 25% "
+    "of cases are linked to smoking, and 5-10% are linked to inherited genes. Pancreatic "
+    "cancer is usually diagnosed by a combination of medical imaging techniques such as "
+    "ultrasound or computed tomography, blood tests, and examination of tissue samples "
+    "(biopsy). The disease is divided into stages, from early (stage I) to late (stage "
+    "IV). Screening the general population has not been found to be effective. The risk "
+    "of developing pancreatic cancer is lower among non-smokers, and people who maintain "
+    "a healthy weight and limit their consumption of red or processed meat; the risk is "
+    "greater for men, smokers, and those with diabetes. There is some evidence that links "
+    "high levels of red meat consumption to increased risk of pancreatic cancer. Smokers' "
+    "risk of developing the disease decreases immediately upon quitting, and almost "
+    "returns to that of the rest of the population after 20 years. Pancreatic cancer can "
+    "be treated with surgery, radiotherapy, chemotherapy, palliative care, or a "
+    "combination of these. Treatment options are partly based on the cancer stage. "
+    "Surgery is the only treatment that can cure pancreatic adenocarcinoma, and may also "
+    "be done to improve quality of life without the potential for cure. Pain management "
+    "and medications to improve digestion are sometimes needed. Early palliative care is "
+    "recommended even for those receiving treatment that aims for a cure. Pancreatic "
+    "cancer is among the most deadly forms of cancer globally, with one of the lowest "
+    "survival rates. In 2015, pancreatic cancers of all types resulted in 411,600 deaths "
+    "globally. Pancreatic cancer is the fifth-most-common cause of death from cancer in "
+    "the United Kingdom, and the third most-common in the United States. The disease "
+    "occurs most often in the developed world, where about 70% of the new cases in 2012 "
+    "originated. Pancreatic adenocarcinoma typically has a very poor prognosis; after "
+    "diagnosis, 25% of people survive one year and 12% live for five years. For cancers "
+    "diagnosed early, the five-year survival rate rises to about 20%. Neuroendocrine "
+    "cancers have better outcomes; at five years from diagnosis, 65% of those diagnosed "
+    "are living, though survival considerably varies depending on the type of tumor."
+)
+##################################
+# Coded Definitions
+##################################
+# Generated from the OMOP Athena vocabulary.
+# Phenotype definitions: task name -> parent concept code -> {"descendants": set of concept codes}.
+CODE_DEFS = {
+    "new_acutemi": {
+        "SNOMED/57054005": {
+            "descendants": {
+                "ICD9CM/410.80",
+                "SNOMED/1204155000",
+                "ICD10CM/I21.4",
+                "SNOMED/401314000",
+                "ICD9CM/410.1",
+                "SNOMED/17531000119105",
+                "ICD9CM/410.90",
+                "SNOMED/836294006",
+                "SNOMED/703251009",
+                "ICD9CM/410.31",
+                "SNOMED/12238151000119107",
+                "SNOMED/44011000000104",
+                "SNOMED/73795002",
+                "SNOMED/836293000",
+                "SNOMED/83351000000106",
+                "SNOMED/23311000119105",
+                "ICD9CM/410.10",
+                "SNOMED/285991000119100",
+                "SNOMED/44821000087100",
+                "SNOMED/868226001",
+                "SNOMED/44841000087109",
+                "SNOMED/1208872002",
+                "SNOMED/703213009",
+                "SNOMED/194803008",
+                "SNOMED/285981000119103",
+                "SNOMED/122701000000102",
+                "ICD9CM/410.2",
+                "SNOMED/195545006",
+                "SNOMED/194808004",
+                "SNOMED/72977004",
+                "SNOMED/282006",
+                "ICD10CM/I21.0",
+                "SNOMED/401303003",
+                "ICD10CM/I21.11",
+                "SNOMED/15712881000119105",
+                "ICD9CM/410.22",
+                "ICD9CM/410.51",
+                "SNOMED/233834004",
+                "SNOMED/45881000000101",
+                "SNOMED/840316004",
+                "ICD9CM/410.3",
+                "SNOMED/15713201000119105",
+                "SNOMED/471851000000100",
+                "SNOMED/70211005",
+                "SNOMED/471711000000109",
+                "SNOMED/54329005",
+                "HCPCS/G8009",
+                "SNOMED/194811003",
+                "SNOMED/233828006",
+                "SNOMED/44811000087108",
+                "SNOMED/76593002",
+                "ICD9CM/410.71",
+                "SNOMED/15712921000119103",
+                "SNOMED/12238111000119106",
+                "SNOMED/233837006",
+                "SNOMED/44001000000101",
+                "SNOMED/233830008",
+                "SNOMED/194798004",
+                "SNOMED/840609007",
+                "SNOMED/703164000",
+                "ICD9CM/410.91",
+                "ICD9CM/410.70",
+                "SNOMED/703165004",
+                "SNOMED/70422006",
+                "HCPCS/G8007",
+                "SNOMED/194807009",
+                "ICD9CM/410.52",
+                "ICD10CM/I21.3",
+                "SNOMED/233836002",
+                "ICD9CM/410.4",
+                "SNOMED/58612006",
+                "SNOMED/583001000000107",
+                "ICD9CM/410.11",
+                "SNOMED/57054005",
+                "SNOMED/846668006",
+                "ICD9CM/410",
+                "SNOMED/1204151009",
+                "ICD9CM/410.02",
+                "SNOMED/233832000",
+                "SNOMED/233829003",
+                "SNOMED/44851000087107",
+                "SNOMED/15962541000119106",
+                "SNOMED/233827001",
+                "SNOMED/64627002",
+                "SNOMED/307140009",
+                "SNOMED/194810002",
+                "SNOMED/194805001",
+                "SNOMED/79009004",
+                "SNOMED/155321002",
+                "ICD9CM/410.40",
+                "ICD9CM/410.32",
+                "SNOMED/1204154001",
+                "SNOMED/840309000",
+                "SNOMED/233835003",
+                "ICD10CM/I21.2",
+                "SNOMED/233831007",
+                "SNOMED/59063002",
+                "ICD9CM/410.30",
+                "ICD10CM/I21.1",
+                "ICD9CM/410.72",
+                "ICD9CM/410.20",
+                "ICD10CM/I21.01",
+                "ICD9CM/410.5",
+                "SNOMED/15990001",
+                "ICD10CM/I21.21",
+                "ICD9CM/410.81",
+                "SNOMED/703212004",
+                "SNOMED/46001000000109",
+                "SNOMED/896689003",
+                "HCPCS/G8010",
+                "SNOMED/1163440003",
+                "ICD9CM/410.00",
+                "ICD10CM/I21.9",
+                "ICD9CM/410.0",
+                "ICD9CM/410.01",
+                "SNOMED/896691006",
+                "SNOMED/868214006",
+                "SNOMED/868224003",
+                "SNOMED/15963181000119104",
+                "ICD9CM/410.41",
+                "SNOMED/623341000000106",
+                "ICD9CM/410.42",
+                "SNOMED/233825009",
+                "ICD9CM/410.12",
+                "SNOMED/868220007",
+                "SNOMED/868225002",
+                "ICD9CM/410.82",
+                "SNOMED/52035003",
+                "ICD9CM/410.21",
+                "HCPCS/G8006",
+                "SNOMED/62695002",
+                "SNOMED/703252002",
+                "SNOMED/15713161000119100",
+                "SNOMED/868217004",
+                "SNOMED/155319007",
+                "SNOMED/304914007",
+                "SNOMED/15713041000119103",
+                "SNOMED/44831000087103",
+                "ICD9CM/410.8",
+                "SNOMED/840312002",
+                "ICD9CM/410.50",
+                "SNOMED/15712841000119100",
+                "SNOMED/1204222000",
+                "SNOMED/194809007",
+                "ICD10CM/I21.19",
+                "ICD9CM/410.92",
+                "SNOMED/65547006",
+                "ICD10CM/I21",
+                "SNOMED/1089451000000100",
+                "ICD9CM/410.7",
+                "SNOMED/840680009",
+                "ICD10CM/I21.29",
+                "SNOMED/846683001",
+                "ICD10CM/I21.02",
+                "SNOMED/896696001",
+                "SNOMED/233826005",
+                "SNOMED/1204152002",
+                "SNOMED/15713001000119100",
+                "SNOMED/836295007",
+                "SNOMED/703253007",
+                "SNOMED/15713121000119105",
+                "SNOMED/15713081000119108",
+                "ICD10CM/I21.09",
+                "SNOMED/70998009",
+                "SNOMED/896697005",
+                "SNOMED/15712961000119108",
+                "ICD9CM/410.9",
+                "SNOMED/233838001",
+                "SNOMED/1089471000000109",
+                "SNOMED/233833005",
+                "SNOMED/412771006",
+            }
+        }
+    },
+    "new_celiac": {
+        "SNOMED/396331005": {
+            "descendants": {
+                "SNOMED/197481005",
+                "SNOMED/45259000",
+                "SNOMED/91867008",
+                "SNOMED/197479008",
+                "ICD9CM/579.0",
+                "SNOMED/266478000",
+                "SNOMED/155842007",
+                "ICD10CM/K90.0",
+                "SNOMED/770593004",
+                "SNOMED/396331005",
+                "SNOMED/197478000",
+                "SNOMED/61715008",
+                "SNOMED/23829007",
+                "SNOMED/1197730009",
+                "SNOMED/396330006",
+                "SNOMED/722386009",
+            }
+        }
+    },
+    "new_hyperlipidemia": {
+        "SNOMED/55822004": {
+            "descendants": {
+                "SNOMED/299465007",
+                "ICD10CM/E78.2",
+                "SNOMED/214021000000106",
+                "SNOMED/15771000119109",
+                "SNOMED/518631000000104",
+                "SNOMED/426161002",
+                "SNOMED/403830007",
+                "SNOMED/154743001",
+                "SNOMED/402473001",
+                "SNOMED/238040008",
+                "SNOMED/701000119103",
+                "SNOMED/773649005",
+                "ICD9CM/272.1",
+                "SNOMED/302870006",
+                "SNOMED/238080004",
+                "SNOMED/389985001",
+                "SNOMED/137931000119102",
+                "ICD10CM/E78.00",
+                "SNOMED/238079002",
+                "SNOMED/238085009",
+                "SNOMED/129590000",
+                "SNOMED/13644009",
+                "SNOMED/238083002",
+                "SNOMED/34171006",
+                "SNOMED/403829002",
+                "SNOMED/238078005",
+                "SNOMED/238081000",
+                "SNOMED/238039006",
+                "SNOMED/403831006",
+                "SNOMED/397915002",
+                "SNOMED/238084008",
+                "SNOMED/267433009",
+                "SNOMED/778111000000106",
+                "ICD10CM/E78.01",
+                "SNOMED/767133009",
+                "SNOMED/267432004",
+                "SNOMED/402786009",
+                "ICD9CM/272.2",
+                "SNOMED/129589009",
+                "SNOMED/238076009",
+                "SNOMED/275598004",
+                "SNOMED/398036000",
+                "SNOMED/190782002",
+                "SNOMED/518591000000104",
+                "SNOMED/55822004",
+                "SNOMED/1571000119104",
+                "SNOMED/402725005",
+                "SNOMED/34528009",
+                "SNOMED/238082007",
+                "SNOMED/114831000119107",
+                "SNOMED/190774002",
+                "SNOMED/238077000",
+                "ICD10CM/E78.5",
+                "SNOMED/403827000",
+                "SNOMED/778121000000100",
+                "SNOMED/48190005",
+                "SNOMED/238087001",
+                "SNOMED/154741004",
+                "SNOMED/238088006",
+                "SNOMED/267434003",
+                "SNOMED/633291000000106",
+                "ICD10CM/E78.3",
+                "SNOMED/34349009",
+                "OMOP Extension/OMOP5166017",
+                "SNOMED/402475008",
+                "SNOMED/402785008",
+                "SNOMED/267435002",
+                "OMOP Extension/OMOP5181809",
+                "SNOMED/238086005",
+                "SNOMED/137941000119106",
+                "ICD9CM/272.3",
+                "SNOMED/403828005",
+                "SNOMED/129591001",
+                "SNOMED/154742006",
+                "SNOMED/633301000000105",
+                "SNOMED/190778004",
+                "SNOMED/518601000000105",
+                "ICD9CM/272.4",
+                "SNOMED/1208738002",
+                "SNOMED/190775001",
+                "ICD10CM/E78.1",
+                "SNOMED/445261005",
+                "ICD9CM/272.0",
+                "SNOMED/890601000000107",
+                "ICD10CM/E78.4",
+                "SNOMED/402726006",
+                "SNOMED/1197489003",
+                "SNOMED/773726000",
+                "SNOMED/402787000",
+                "SNOMED/33513003",
+                "SNOMED/402727002",
+                "ICD10CM/E78.49",
+                "SNOMED/402474007",
+                "ICD10CM/E78.0",
+                "SNOMED/190777009",
+                "SNOMED/31654005",
+                "SNOMED/238089003",
+                "SNOMED/491251000000107",
+            }
+        }
+    },
+    "new_hypertension": {
+        "SNOMED/59621000": {
+            "descendants": {
+                "SNOMED/46481004",
+                "SNOMED/72022006",
+                "ICD9CM/401",
+                "SNOMED/1201005",
+                "ICD10CM/I10",
+                "SNOMED/23717007",
+                "SNOMED/266228004",
+                "SNOMED/71874008",
+                "SNOMED/9901000",
+                "SNOMED/371125006",
+                "SNOMED/194760004",
+                "SNOMED/40511000119107",
+                "SNOMED/63287004",
+                "SNOMED/18416000",
+                "ICD9CM/401.9",
+                "SNOMED/1218009",
+                "SNOMED/78975002",
+                "SNOMED/78808002",
+                "SNOMED/35303009",
+                "SNOMED/155296003",
+                "ICD9CM/401.1",
+                "SNOMED/194758001",
+                "ICD9CM/401.0",
+                "SNOMED/19769006",
+                "SNOMED/429457004",
+                "SNOMED/59621000",
+            }
+        }
+    },
+    "new_lupus": {
+        "SNOMED/55464009": {
+            "descendants": {
+                "SNOMED/156450004",
+                "SNOMED/76521009",
+                "SNOMED/239888002",
+                "SNOMED/36402006",
+                "SNOMED/403487009",
+                "ICD10CM/M32.1",
+                "SNOMED/295111000119108",
+                "SNOMED/25380002",
+                "ICD10CM/M32.0",
+                "SNOMED/196138005",
+                "SNOMED/203784000",
+                "SNOMED/698694005",
+                "SNOMED/309762007",
+                "SNOMED/724781003",
+                "ICD10CM/M32.8",
+                "SNOMED/201435004",
+                "SNOMED/11013005",
+                "SNOMED/201436003",
+                "SNOMED/403488004",
+                "ICD10CM/M32.10",
+                "ICD10CM/M32.12",
+                "SNOMED/95644001",
+                "ICD10CM/M32.14",
+                "SNOMED/77753005",
+                "SNOMED/95408003",
+                "SNOMED/239889005",
+                "SNOMED/52042003",
+                "SNOMED/239886003",
+                "SNOMED/68815009",
+                "SNOMED/73286009",
+                "SNOMED/773333003",
+                "SNOMED/593481000000109",
+                "SNOMED/409421000000100",
+                "SNOMED/54072008",
+                "SNOMED/239890001",
+                "SNOMED/295101000119105",
+                "ICD10CM/M32.13",
+                "SNOMED/4676006",
+                "ICD10CM/M32.19",
+                "SNOMED/758321000000100",
+                "SNOMED/239887007",
+                "SNOMED/197608009",
+                "SNOMED/201438002",
+                "ICD10CM/M32.9",
+                "SNOMED/201437007",
+                "SNOMED/19682006",
+                "SNOMED/55464009",
+                "OMOP Extension/OMOP5166128",
+                "ICD9CM/710.0",
+                "ICD10CM/M32",
+                "SNOMED/201439005",
+                "SNOMED/295121000119101",
+                "SNOMED/403486000",
+            }
+        }
+    },
+    "new_pancan": {
+        "SNOMED/372003004": {
+            "descendants": {
+                "SNOMED/1268532006",
+                "ICD9CM/157.2",
+                "SNOMED/733351008",
+                "ICD9CM/157.3",
+                "ICD9CM/157.9",
+                "ICD10CM/C25",
+                "SNOMED/1268911008",
+                "SNOMED/93715005",
+                "SNOMED/1197286008",
+                "SNOMED/16823941000119108",
+                "ICD10CM/C25.4",
+                "SNOMED/1268698000",
+                "SNOMED/352701000119102",
+                "ICD10CM/C24.1",
+                "SNOMED/1197279000",
+                "SNOMED/1259747009",
+                "SNOMED/1259310006",
+                "SNOMED/94082003",
+                "ICD10CM/C25.9",
+                "SNOMED/1259800005",
+                "SNOMED/314999005",
+                "SNOMED/1259539007",
+                "SNOMED/735735001",
+                "SNOMED/681721000119103",
+                "SNOMED/1259682002",
+                "SNOMED/1197283000",
+                "ICD9CM/157.8",
+                "SNOMED/371967001",
+                "SNOMED/681911000119108",
+                "ICD9CM/157.0",
+                "SNOMED/109849001",
+                "ICD10CM/C25.2",
+                "ICD9CM/157.4",
+                "SNOMED/1268561007",
+                "SNOMED/1651000119109",
+                "SNOMED/93668007",
+                "SNOMED/1259309001",
+                "SNOMED/1268546006",
+                "SNOMED/93938001",
+                "ICD9CM/156.2",
+                "SNOMED/681971000119100",
+                "ICD9CM/157",
+                "ICD10CM/C25.7",
+                "SNOMED/1268563005",
+                "SNOMED/143391000119109",
+                "SNOMED/93823001",
+                "SNOMED/681621000119105",
+                "SNOMED/721718003",
+                "ICD9CM/157.1",
+                "SNOMED/93939009",
+                "ICD10CM/C25.3",
+                "SNOMED/1259311005",
+                "SNOMED/1259700000",
+                "ICD10CM/C25.1",
+                "ICD10CM/C25.0",
+                "SNOMED/93843007",
+                "SNOMED/1259358002",
+                "SNOMED/372003004",
+                "SNOMED/372119009",
+                "SNOMED/681831000119107",
+                "SNOMED/1259415000",
+                "SNOMED/1259799006",
+                "SNOMED/1268542008",
+            }
+        }
+    },
+}
+##################################
+# Action
+##################################
+# Instruction wording without chain-of-thought. The `{question}` placeholder is filled in
+# with str.format (base_prompt passes the task's question).
+ACTION_TMPL = (
+    "Review the patient's EHR history. Based on all available medical evidence in the provided EHR, please "
+    "answer the question: {question}"
+)
+# Chain-of-thought variant: same wording, followed by a request for a numbered list of the
+# steps taken to reach the conclusion.
+ACTION_COT_TMPL = (
+    "Review the patient's EHR history. Based on all available medical evidence in the provided EHR, please "
+    "answer the question: {question} Reflect on the problem and generate a numbered list of the steps "
+    "you took to reach your conclusion."
+)
+def get_task_config(task_name: str) -> Mapping[str, Any]:
+    """Pick the `CONFIG` entry that applies to `task_name`.
+    Task names starting with "guo" or "lab" use the entry of the same name;
+    every other task name uses the "new" entry.
+    """
+    for prefix in ("guo", "lab"):
+        if task_name.startswith(prefix):
+            return CONFIG[prefix]
+    return CONFIG["new"]
+def lumia_prompt(
+    task_name: str, config: Dict[str, Any], examples: List[Dict[str, Any]], timeline: Dict[str, Any]
+) -> str:
+    """Assemble the full prompt text for one patient timeline.
+    The instruction section comes from `base_prompt`, configured through `get_task_config`.
+    The events in `timeline['ehr']` are rendered by the converter named in
+    `config['ehr_converter']`, which must be "codes_and_timestamps" or "codes_only".
+    Raises:
+        ValueError: if `config['ehr_converter']` is neither of the supported names.
+    """
+    # NOTE: `examples` is accepted but not used; no few-shot examples are added to the prompt.
+    # Base template
+    tmpl: Dict[str, Any] = base_prompt(task_name, **get_task_config(task_name))
+    # EHR => String converter, matched by name
+    strategy = config.get("ehr_converter")
+    known_converters = (
+        ("codes_and_timestamps", codes_and_timestamps),
+        ("codes_only", codes_only),
+    )
+    ehr_converter = next((fn for name, fn in known_converters if strategy == name), None)
+    if ehr_converter is None:
+        raise ValueError(f"Invalid `ehr_converter` strategy: {strategy}")
+    # Ask for a letter choice ("A"/"B") instead of a literal "yes"/"no" answer
+    yes_no_wording = 'Then respond with "yes" or "no" as your final output'
+    letter_wording = 'Then respond with "A" for yes or "B" for no as your final output'
+    tmpl["instruction"] = tmpl["instruction"].replace(yes_no_wording, letter_wording)
+    # Prompt
+    instruction = tmpl["instruction"]
+    rendered_task = tmpl["example"].format(ehr=ehr_converter(timeline["ehr"]))
+    prompt: str = f"""
+# Instructions
+{instruction}
+# Your Task
+{rendered_task}"""
+    return prompt
+def get_code_def(task_name: str):
+    """Return every code in this task's definition as one sorted, comma-separated string."""
+    parents = CODE_DEFS[task_name]
+    # Gather the descendants of every parent concept before sorting
+    all_codes = [code for parent in parents for code in parents[parent]["descendants"]]
+    return ", ".join(sorted(all_codes))
+def base_prompt(
+    task_name: str,
+    is_include_persona: bool = True,
+    is_include_clinical_def: bool = True,
+    is_include_code_def: bool = True,
+    is_use_short_clinical_def: bool = True,
+    is_include_cot: bool = True,
+    seed: int = 1,
+    **kwargs,
+) -> Dict[str, Any]:
+    """
+    Build the prompt template for `task_name`, to be populated with a patient's EHR.
+    The instruction text is assembled from up to four sections, in this order:
+        1. Persona (if `is_include_persona`): "You are an expert <role> at Stanford Healthcare, ...
+           You specialize in predicting <task full name>."
+        2. "Clinical Definition: ..." (if `is_include_clinical_def`); the short definition is used
+           when `is_use_short_clinical_def` is set, the long one otherwise.
+        3. "Medical Code Definition: ..." (if `is_include_code_def`), listing the codes returned
+           by `get_code_def`.
+        4. "Instruction: ..." (always), built from the task's question; the chain-of-thought
+           wording is used when `is_include_cot` is set.
+    Every section ends with a newline and sections are separated by a blank line.
+    `seed` and any extra keyword arguments are accepted but not used in this function.
+    Returns a dict with the keys:
+        'instruction': the assembled instruction text
+        'example':     the per-patient template containing the `{ehr}` placeholder
+        'delimiter':   the separator string
+    """
+    sections = []
+    task_full_name = TASK_FULL_NAMES[task_name].lower()
+    # 1. Persona
+    if is_include_persona:
+        role = PERSONAS[task_name][0].lower()
+        sections.append(
+            f"You are an expert {role} at Stanford Healthcare, an academic medical center "
+            f"affiliated with Stanford University. You specialize in predicting {task_full_name}."
+        )
+    # 2. Natural language clinical definition (short or long variant)
+    if is_include_clinical_def:
+        definitions = CLINICAL_SHORT_DEFS if is_use_short_clinical_def else CLINICAL_DEFS
+        sections.append("Clinical Definition: " + definitions[task_name])
+    # 3. Medical ontology phenotype definition (code-based inclusion criteria)
+    if is_include_code_def:
+        sections.append(
+            f"Medical Code Definition: In an electronic health record (EHR), {task_full_name} is denoted "
+            f"by the occurrence of any of the following medical codes: {get_code_def(task_name)}"
+        )
+    # 4. Instruction built from this task's question
+    question: str = TASK_QUESTIONS[task_name]
+    action_tmpl = ACTION_COT_TMPL if is_include_cot else ACTION_TMPL
+    sections.append("Instruction: " + action_tmpl.format(question=question))
+    # Each section gets a trailing newline; a blank line separates consecutive sections
+    return {
+        "instruction": "\n\n".join(sections) + "\n",
+        "example": "Patient EHR:\n{ehr}\n\n",
+        "delimiter": "\n##\n",
+    }
+def codes_and_timestamps(events: List[Dict[str, Any]]) -> str:
+    """Render MEDS events as one "- <YYYY-MM-DD> <code>" line per event.
+    Each event is a dict with a 'time' (formatted via `strftime`) and a 'code'.
+    Lines are joined with newlines; there is no trailing newline.
+    Example:
+        > events = [
+            { 'time' : datetime.datetime(2024, 1, 1), 'code' : 'ICD10CM/C25.9' },
+            { 'time' : datetime.datetime(2024, 1, 2), 'code' : 'ICD10CM/C25.9' },
+        ]
+        > codes_and_timestamps(events)
+        # Output:
+        #   - 2024-01-01 ICD10CM/C25.9
+        #   - 2024-01-02 ICD10CM/C25.9
+    """
+    lines = []
+    for event in events:
+        day = event["time"].strftime("%Y-%m-%d")
+        lines.append(f"- {day} {event['code']}")
+    return "\n".join(lines)
+def codes_only(events: List[Any]) -> str:
+    """Render MEDS events as one "- <code>" line per event.
+    An event may be a dict, in which case its 'code' entry is used, or any other
+    value (e.g. a plain code string), which is used as-is.
+    Lines are joined with newlines; there is no trailing newline.
+    Example:
+        events = [
+            { 'time' : datetime.datetime(2024, 1, 1), 'code' : 'ICD10CM/C25.9' },
+            'ICD10CM/C25.9',
+        ]
+        codes_only(events)
+        # Output:
+        #   - ICD10CM/C25.9
+        #   - ICD10CM/C25.9
+    """
+
+    def _code_of(event: Any) -> Any:
+        # Dict events carry the code under 'code'; anything else is already the code
+        return event["code"] if isinstance(event, dict) else event
+
+    return "\n".join(f"- {_code_of(event)}" for event in events)
+def _process_prior_events_chunk(
+    chunk_data: pd.DataFrame, grouped_a: pd.DataFrame, is_show_tqdm: bool = False
+) -> List[List[str]]:
+    """Collect, for each label row in `chunk_data`, the codes of that subject's prior events.
+    `grouped_a` is the event table grouped by "subject_id" (it is queried through `.groups`
+    and `.get_group`). An event counts as prior when its "time" is at or before the label
+    row's "prediction_time". Subjects without any events yield an empty list.
+    The result has one entry per row of `chunk_data`, in row order.
+    """
+    collected: List[List[str]] = []
+    label_rows = tqdm(
+        chunk_data.iterrows(), total=len(chunk_data), desc="Processing labels", disable=not is_show_tqdm
+    )
+    for _, label_row in label_rows:
+        # Subjects that never appear in the event table have no prior events
+        if label_row["subject_id"] not in grouped_a.groups:
+            collected.append([])
+            continue
+        subject_events = grouped_a.get_group(label_row["subject_id"])
+        if len(subject_events) == 0:
+            collected.append([])
+            continue
+        # Keep only events at or before the prediction time, then collect their codes
+        is_prior = subject_events["time"] <= label_row["prediction_time"]
+        collected.append(list(subject_events[is_prior]["code"]))
+    return collected
+def get_prior_events(df_data: pd.DataFrame, df_labels: pd.DataFrame, n_procs: int = 4) -> List[List[str]]:
+    """
+    For every label, list the codes of the same subject's events in `df_data` whose "time"
+    is at or before the label's "prediction_time".
+    Inputs that are not pandas DataFrames are converted with their `to_pandas()` method.
+    With `n_procs == 1` the labels are processed in this process; otherwise they are split
+    into chunks of 1,000 rows and processed by a pool of `n_procs` worker processes.
+    Returns:
+    --------
+    list of lists
+        One list of codes per label, ordered like `df_labels` sorted by
+        ("subject_id", "prediction_time")
+    """
+    # Convert non-pandas inputs (e.g. Polars DataFrames) to pandas
+    if not isinstance(df_data, pd.DataFrame):
+        df_data = df_data.to_pandas()
+    if not isinstance(df_labels, pd.DataFrame):
+        df_labels = df_labels.to_pandas()
+    # Sort both tables, then group the events by subject for per-subject lookup
+    events_by_subject = df_data.sort_values(["subject_id", "time"]).groupby("subject_id")
+    sorted_labels = df_labels.sort_values(["subject_id", "prediction_time"])
+    if n_procs == 1:
+        return _process_prior_events_chunk(sorted_labels, events_by_subject, is_show_tqdm=True)
+    # Split the sorted labels into fixed-size chunks
+    chunk_size = 1_000
+    chunks = []
+    for start in range(0, len(sorted_labels), chunk_size):
+        chunks.append(sorted_labels.iloc[start : start + chunk_size])
+    # Bind the grouped events so each worker call only needs its chunk
+    worker = partial(_process_prior_events_chunk, grouped_a=events_by_subject)
+    print(f"Processing {len(chunks)} chunks across {n_procs} processes...")
+    with multiprocessing.Pool(n_procs) as pool:
+        per_chunk = list(tqdm(pool.imap(worker, chunks), total=len(chunks), desc="Processing chunks"))
+    # Flatten the per-chunk results, keeping chunk order
+    prior_events: List[List[str]] = []
+    for chunk_result in per_chunk:
+        prior_events.extend(chunk_result)
+    return prior_events
+def count_tokens(text: str, model: str = "gpt-4") -> int:
+    """
+    Return the number of tokens `text` encodes to, using the tiktoken encoding
+    looked up for `model` (GPT-4 by default).
+    """
+    return len(tiktoken.encoding_for_model(model).encode(text))
+class EHRSHOTScenario(Scenario):
+    """
+    From "An EHR Benchmark for Few-Shot Evaluation of Foundation Models" (Wornow et al. 2023),
+    EHRSHOT is a collection of structured data from 6,739 deidentified longitudinal
+    electronic health records (EHRs) sourced from Stanford Medicine. It contains
+    15 unique clinical prediction tasks. We use a subset of 14 of these tasks, namely
+    the binary classification tasks.
+    Citation
+    ```
+    @article{wornow2023ehrshot,
+        title={EHRSHOT: An EHR Benchmark for Few-Shot Evaluation of Foundation Models},
+        author={Michael Wornow and Rahul Thapa and Ethan Steinberg and Jason Fries and Nigam Shah},
+        year={2023},
+        eprint={2307.02028},
+        archivePrefix={arXiv},
+        primaryClass={cs.LG}
+    }
+    ```
+    """
+    name = "ehrshot"
+    # Adjacent string literals are concatenated with nothing in between, so every
+    # fragment except the last must end with a space to keep the words apart.
+    description = (
+        "EHRSHOT is a benchmark designed to evaluate a model's ability to predict future "
+        "clinical events using structured EHR data. Each instance contains a patient's "
+        "historical EHR data and a forward-looking clinical question about whether a particular "
+        "diagnosis, lab result, or hospital event will occur."
+    )
+    tags = []  # TODO
+    # Answer options for the binary prediction tasks
+    POSSIBLE_ANSWER_CHOICES: List[str] = [
+        "yes",
+        "no",
+    ]
+    def __init__(self, subject: str, data_path: str, max_length: Optional[int] = None):
+        super().__init__()
+        self.subject: str = subject  # same as "task" or "labeling_function"
+        self.max_length = max_length
+        self.data_path = data_path
+    def create_benchmark(self, output_path: str, n_procs: int = 4) -> Dict[str, str]:
+        """Loads the MEDS dataset and converts it to prompts"""
+        # Load MEDS EHRSHOT patient timelines
+        data_parquet_path = os.path.join(self.data_path, "data/data.parquet")
+        check_file_exists(
+            data_parquet_path, msg=f"[EHRSHOTScenario] Required parquet data file not found: '{data_parquet_path}'"
+        )
+        splits_parquet_path = os.path.join(self.data_path, "metadata/subject_splits.parquet")
+        check_file_exists(
+            splits_parquet_path, msg=f"[EHRSHOTScenario] Required splits file not found: '{splits_parquet_path}'"
+        )
+        df_data = pd.read_parquet(data_parquet_path)
+        df_splits = pd.read_parquet(splits_parquet_path)
+        # Load MEDS EHRSHOT labels
+        tasks = sorted(os.listdir(os.path.join(self.data_path, "labels")))
+        for t in tasks:
+            path_to_labels: str = os.path.join(self.data_path, "labels", t, "labels.parquet")
+            check_file_exists(
+                path_to_labels, msg=f"[EHRSHOTScenario] Required labels file not found: '{path_to_labels}'"
+            )
+            if t != self.subject or not os.path.exists(path_to_labels):
+                continue
+            df_labels = pd.read_parquet(path_to_labels)
+            # If lab value task, limit to 10k random labels b/c too many in EHRSHOT (upwards of 300k)
+            if self.subject.startswith("lab_"):
+                df_labels = df_labels.sample(n=CONFIG["max_labels_per_task"], random_state=CONFIG["seed"])
+            # Create patient timelines, limited to only events prior to the prediction time of the label
+            timelines_raw: List[List[str]] = get_prior_events(df_data, df_labels, n_procs=n_procs)
+            timelines: List[List[Dict[str, Any]]] = [
+                [{"code": code} for code in timeline] for timeline in timelines_raw
+            ]
+            assert (
+                len(timelines) == df_labels.shape[0]
+            ), f"Expected {df_labels.shape[0]} prior events, got {len(timelines)}"
+        # Add splits
+        df_labels["split"] = df_labels["subject_id"].map(df_splits["split"])
+        # TODO -- Few-shot examples
+        examples: List[Dict[str, Any]] = []
+        n_shots = CONFIG.get("n_shots", 0)
+        for i in range(n_shots if isinstance(n_shots, int) else 0):
+            pass
+        # Create LUMIA-ified prompt for each label
+        print(f"Generating {len(timelines)} prompts...")
+        prompts: List[str] = [lumia_prompt(self.subject, CONFIG, examples, {"ehr": x, "label": 0}) for x in timelines]
+        df_labels["prompt"] = prompts
+        # Save to parquet
+        path_to_output_dir: str = os.path.join(output_path, self.subject)
+        ensure_directory_exists(path_to_output_dir)
+        df_labels.to_parquet(os.path.join(path_to_output_dir, "medhelm_prompts.parquet"))
+        return {"status": "success"}
+    def get_instances(self, output_path: str) -> List[Instance]:
+        path_to_input_csv: str = os.path.join(output_path, self.subject, "medhelm_prompts.parquet")
+        if not os.path.exists(path_to_input_csv):
+            print(f"Creating benchmark from SCRATCH for {self.subject}...")
+            self.create_benchmark(output_path=output_path)  # Create benchmark from scratch
+        # Load data for this task
+        df = pd.read_parquet(path_to_input_csv)
+        # Generate instances
+        instances: List[Instance] = []
+        # df['prompt']=df['prompt'].str.replace('yes','A for yes').str.replace('no','B for no')
+        for prompt, label, split in tqdm(
+            zip(df["prompt"], df["boolean_value"], df["split"]), total=len(df), desc="Generating instances"
+        ):
+            if self.max_length is not None and count_tokens(prompt) > self.max_length:
+                continue
+            label = "yes" if label else "no"
+            # split = TEST_SPLIT if split == "held_out" else (VALID_SPLIT if split == "tuning" else TRAIN_SPLIT)
+            references: List[Reference] = [
+                Reference(Output(text=pred_answer), tags=[CORRECT_TAG] if pred_answer == label else [])
+                for pred_answer in EHRSHOTScenario.POSSIBLE_ANSWER_CHOICES
+            ]
+            instances.append(
+                Instance(
+                    input=Input(text=prompt),  # prompt
+                    references=references,  # `plan` is the the label; `tags` is whether it is correct`
+                    split=TEST_SPLIT,  # the split
+                )
+            )
+        return instances
+    def get_metadata(self):
+        return ScenarioMetadata(
+            name="ehrshot",
+            display_name="EHRSHOT",
+            description="EHRSHOT is a benchmark designed to evaluate a model's ability to predict "
+            "future clinical events using structured EHR code sequences. Each instance "
+            "contains a patient's historical EHR data and a forward-looking clinical "
+            "question about whether a particular diagnosis, lab result, or hospital event "
+            "will occur [(Wornow et al., 2023)](https://arxiv.org/abs/2307.02028).",
+            taxonomy=TaxonomyInfo(
+                task="Classification",
+                what="Predict whether a medical event will occur in the future based " "on EHR codes",
+                when="Future prediction",
+                who="Clinician, Insurer",
+                language="English",
+            ),
+            main_metric="exact_match",
+            main_split="test",
+        )

crfm-helm 0.4.0__py3-none-any.whl → 0.5.10__py3-none-any.whl

Potentially problematic release.

crfm-helm 0.4.0__py3-none-any.whl → 0.5.10__py3-none-any.whl