PyPI - crfm-helm - Versions diffs - 0.4.0__py3-none-any.whl → 0.5.10__py3-none-any.whl - Mend

crfm-helm 0.4.0-py3-none-any.whl → 0.5.10-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of crfm-helm might be problematic. Click here for more details.

Files changed (1033)

crfm_helm-0.5.10.dist-info/METADATA +369 -0
crfm_helm-0.5.10.dist-info/RECORD +1008 -0
{crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info}/WHEEL +1 -1
helm/benchmark/adaptation/adapter_spec.py +80 -29
helm/benchmark/adaptation/adapters/adapter.py +2 -2
helm/benchmark/adaptation/adapters/adapter_factory.py +39 -28
helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
helm/benchmark/adaptation/adapters/generation_adapter.py +2 -1
helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +24 -8
helm/benchmark/adaptation/adapters/language_modeling_adapter.py +3 -4
helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +2 -1
helm/benchmark/adaptation/adapters/multimodal/multimodal_prompt.py +7 -0
helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +112 -0
helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +6 -3
helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +3 -1
helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +18 -8
helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
helm/benchmark/adaptation/adapters/test_adapter.py +5 -4
helm/benchmark/adaptation/adapters/test_generation_adapter.py +46 -22
helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +17 -29
helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +138 -16
helm/benchmark/adaptation/common_adapter_specs.py +443 -0
helm/benchmark/adaptation/prompt.py +1 -1
helm/benchmark/adaptation/request_state.py +6 -1
helm/benchmark/adaptation/scenario_state.py +6 -2
helm/benchmark/annotation/aci_bench_annotator.py +84 -0
helm/benchmark/annotation/air_bench_annotator.py +79 -0
helm/benchmark/annotation/alrage_annotator.py +90 -0
helm/benchmark/annotation/annotator.py +48 -0
helm/benchmark/annotation/annotator_factory.py +50 -0
helm/benchmark/annotation/anthropic_red_team_annotator.py +57 -0
helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
helm/benchmark/annotation/bird_sql_annotator.py +58 -0
helm/benchmark/annotation/call_center_annotator.py +258 -0
helm/benchmark/annotation/chw_care_plan_annotator.py +82 -0
helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
helm/benchmark/annotation/dischargeme_annotator.py +96 -0
helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
helm/benchmark/annotation/financebench_annotator.py +79 -0
helm/benchmark/annotation/harm_bench_annotator.py +55 -0
helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
helm/benchmark/annotation/image2struct/image_compiler_annotator.py +93 -0
helm/benchmark/annotation/image2struct/latex_compiler_annotator.py +59 -0
helm/benchmark/annotation/image2struct/lilypond_compiler_annotator.py +86 -0
helm/benchmark/annotation/image2struct/webpage_compiler_annotator.py +132 -0
helm/benchmark/annotation/live_qa_annotator.py +76 -0
helm/benchmark/annotation/med_dialog_annotator.py +88 -0
helm/benchmark/annotation/medalign_annotator.py +89 -0
helm/benchmark/annotation/medi_qa_annotator.py +87 -0
helm/benchmark/annotation/medication_qa_annotator.py +86 -0
helm/benchmark/annotation/mental_health_annotator.py +87 -0
helm/benchmark/annotation/mimic_bhc_annotator.py +89 -0
helm/benchmark/annotation/mimic_rrs_annotator.py +89 -0
helm/benchmark/annotation/model_as_judge.py +309 -0
helm/benchmark/annotation/mtsamples_procedures_annotator.py +87 -0
helm/benchmark/annotation/mtsamples_replicate_annotator.py +90 -0
helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
helm/benchmark/annotation/omni_math_annotator.py +131 -0
helm/benchmark/annotation/simple_safety_tests_annotator.py +50 -0
helm/benchmark/annotation/spider_annotator.py +18 -0
helm/benchmark/annotation/starr_patient_instructions_annotator.py +87 -0
helm/benchmark/annotation/test_annotator_factory.py +26 -0
helm/benchmark/annotation/test_dummy_annotator.py +44 -0
helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
helm/benchmark/annotation/wildbench_annotator.py +119 -0
helm/benchmark/annotation/xstest_annotator.py +100 -0
helm/benchmark/annotation_executor.py +144 -0
helm/benchmark/augmentations/cleva_perturbation.py +9 -8
helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
helm/benchmark/augmentations/data_augmenter.py +0 -2
helm/benchmark/augmentations/dialect_perturbation.py +4 -5
helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
helm/benchmark/augmentations/gender_perturbation.py +3 -3
helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
helm/benchmark/augmentations/person_name_perturbation.py +4 -5
helm/benchmark/augmentations/perturbation.py +26 -4
helm/benchmark/augmentations/perturbation_description.py +1 -1
helm/benchmark/augmentations/space_perturbation.py +2 -2
helm/benchmark/augmentations/suffix_perturbation.py +29 -0
helm/benchmark/augmentations/synonym_perturbation.py +4 -3
helm/benchmark/augmentations/test_perturbation.py +56 -19
helm/benchmark/augmentations/translate_perturbation.py +31 -0
helm/benchmark/augmentations/typos_perturbation.py +2 -2
helm/benchmark/config_registry.py +7 -1
helm/benchmark/data_preprocessor.py +2 -2
helm/benchmark/executor.py +54 -25
helm/benchmark/huggingface_registration.py +28 -10
helm/benchmark/metrics/air_bench_metrics.py +3212 -0
helm/benchmark/metrics/alrage_metric.py +35 -0
helm/benchmark/metrics/annotation_metrics.py +108 -0
helm/benchmark/metrics/basic_metrics.py +437 -667
helm/benchmark/metrics/bbq_metrics.py +17 -6
helm/benchmark/metrics/bias_metrics.py +18 -9
helm/benchmark/metrics/bias_word_lists.py +1 -1
helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
helm/benchmark/metrics/bird_sql_metrics.py +28 -0
helm/benchmark/metrics/classification_metrics.py +107 -22
helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
helm/benchmark/metrics/cleva_harms_metrics.py +12 -11
helm/benchmark/metrics/code_metrics.py +5 -5
helm/benchmark/metrics/code_metrics_helper.py +11 -3
helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
helm/benchmark/metrics/comet_metric.py +125 -0
helm/benchmark/metrics/common_metric_specs.py +174 -0
helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +83 -0
helm/benchmark/metrics/copyright_metrics.py +5 -5
helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
helm/benchmark/metrics/disinformation_metrics.py +8 -114
helm/benchmark/metrics/dry_run_metrics.py +35 -6
helm/benchmark/metrics/efficiency_metrics.py +287 -0
helm/benchmark/metrics/ehr_sql_metrics.py +159 -0
helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
helm/benchmark/metrics/evaluate_reference_metrics.py +831 -0
helm/benchmark/metrics/fin_qa_metrics.py +60 -0
helm/benchmark/metrics/fin_qa_metrics_helper.py +398 -0
helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +115 -0
helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
helm/benchmark/metrics/gpt4v_originality_critique_metrics.py +126 -0
helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +48 -0
helm/benchmark/metrics/ifeval/instructions.py +1574 -0
helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
helm/benchmark/metrics/ifeval_metrics.py +67 -0
helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
helm/benchmark/metrics/image_generation/clip_score_metrics.py +84 -0
helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
helm/benchmark/metrics/image_generation/q16/test_q16.py +20 -0
helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
helm/benchmark/metrics/instruction_following_critique_metrics.py +48 -5
helm/benchmark/metrics/kpi_edgar_metrics.py +142 -0
helm/benchmark/metrics/language_modeling_metrics.py +111 -0
helm/benchmark/metrics/live_qa_metrics.py +35 -0
helm/benchmark/metrics/llm_jury_metrics.py +58 -0
helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
helm/benchmark/metrics/lmkt_metrics.py +47 -0
helm/benchmark/metrics/machine_translation_metrics.py +89 -0
helm/benchmark/metrics/medcalc_bench_metrics.py +137 -0
helm/benchmark/metrics/medec_metrics.py +124 -0
helm/benchmark/metrics/melt_bias_metric.py +234 -0
helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
helm/benchmark/metrics/melt_metric_specs.py +43 -0
helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
helm/benchmark/metrics/metric.py +121 -175
helm/benchmark/metrics/metric_name.py +0 -1
helm/benchmark/metrics/metric_service.py +23 -7
helm/benchmark/metrics/mimiciv_billing_code_metrics.py +127 -0
helm/benchmark/metrics/nltk_helper.py +32 -0
helm/benchmark/metrics/omni_math_metrics.py +44 -0
helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
helm/benchmark/metrics/output_processing_metric.py +60 -0
helm/benchmark/metrics/output_processors.py +15 -0
helm/benchmark/metrics/paraphrase_generation_metrics.py +5 -6
helm/benchmark/metrics/prometheus_vision_critique_metrics.py +185 -0
helm/benchmark/metrics/ranking_metrics.py +5 -5
helm/benchmark/metrics/reference_metric.py +148 -0
helm/benchmark/metrics/reka_vibe_critique_metrics.py +158 -0
helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
helm/benchmark/metrics/safety_metrics.py +91 -0
helm/benchmark/metrics/seahelm_metrics.py +201 -0
helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
helm/benchmark/metrics/spider_metrics.py +7 -0
helm/benchmark/metrics/statistic.py +1 -1
helm/benchmark/metrics/summac/model_summac.py +8 -11
helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
helm/benchmark/metrics/summarization_metrics.py +150 -11
helm/benchmark/metrics/test_bias_metrics.py +5 -1
helm/benchmark/metrics/test_classification_metrics.py +145 -70
helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
helm/benchmark/metrics/{test_basic_metrics.py → test_evaluate_reference_metrics.py} +20 -1
helm/benchmark/metrics/test_metric.py +3 -3
helm/benchmark/metrics/test_statistic.py +2 -2
helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +11 -3
helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +3 -3
helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +7 -7
helm/benchmark/metrics/toxicity_metrics.py +37 -7
helm/benchmark/metrics/toxicity_utils.py +23 -0
helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py +52 -0
helm/benchmark/metrics/unitxt_metrics.py +107 -0
helm/benchmark/metrics/vision_language/__init__.py +0 -0
helm/benchmark/metrics/vision_language/emd_utils.py +347 -0
helm/benchmark/metrics/vision_language/image_metrics.py +537 -0
helm/benchmark/metrics/vision_language/image_utils.py +100 -0
helm/benchmark/metrics/wildbench_metrics.py +54 -0
helm/benchmark/model_deployment_registry.py +69 -5
helm/benchmark/model_metadata_registry.py +58 -2
helm/benchmark/multi_gpu_runner.py +133 -0
helm/benchmark/presentation/contamination.py +3 -3
helm/benchmark/presentation/create_plots.py +51 -20
helm/benchmark/presentation/run_display.py +51 -12
helm/benchmark/presentation/run_entry.py +2 -2
helm/benchmark/presentation/schema.py +83 -66
helm/benchmark/presentation/summarize.py +483 -388
helm/benchmark/presentation/table.py +8 -8
helm/benchmark/presentation/taxonomy_info.py +20 -0
helm/benchmark/presentation/test_contamination.py +2 -2
helm/benchmark/presentation/test_create_plots.py +4 -1
helm/benchmark/presentation/test_run_entry.py +2 -2
helm/benchmark/presentation/test_schema.py +11 -0
helm/benchmark/presentation/test_summarize.py +148 -6
helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
helm/benchmark/reeval_run.py +202 -0
helm/benchmark/reeval_runner.py +355 -0
helm/benchmark/run.py +151 -87
helm/benchmark/run_expander.py +418 -33
helm/benchmark/run_spec.py +93 -0
helm/benchmark/run_spec_factory.py +180 -0
helm/benchmark/run_specs/__init__.py +0 -0
helm/benchmark/run_specs/air_bench_run_specs.py +58 -0
helm/benchmark/run_specs/arabic_run_specs.py +197 -0
helm/benchmark/run_specs/audio_run_specs.py +657 -0
helm/benchmark/run_specs/bluex_run_specs.py +40 -0
helm/benchmark/run_specs/call_center_run_specs.py +201 -0
helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
helm/benchmark/run_specs/classic_run_specs.py +1393 -0
helm/benchmark/run_specs/cleva_run_specs.py +277 -0
helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
helm/benchmark/run_specs/decodingtrust_run_specs.py +316 -0
helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
helm/benchmark/run_specs/enterprise_run_specs.py +280 -0
helm/benchmark/run_specs/experimental_run_specs.py +224 -0
helm/benchmark/run_specs/finance_run_specs.py +114 -0
helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
helm/benchmark/run_specs/heim_run_specs.py +625 -0
helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
helm/benchmark/run_specs/lite_run_specs.py +307 -0
helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
helm/benchmark/run_specs/long_context_run_specs.py +188 -0
helm/benchmark/run_specs/medhelm/__init__.py +0 -0
helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
helm/benchmark/run_specs/medhelm_run_specs.py +1570 -0
helm/benchmark/run_specs/melt_run_specs.py +783 -0
helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
helm/benchmark/run_specs/oab_exams_specs.py +32 -0
helm/benchmark/run_specs/safety_run_specs.py +191 -0
helm/benchmark/run_specs/seahelm_run_specs.py +652 -0
helm/benchmark/run_specs/simple_run_specs.py +104 -0
helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +167 -0
helm/benchmark/run_specs/sql_run_specs.py +54 -0
helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
helm/benchmark/run_specs/unitxt_run_specs.py +51 -0
helm/benchmark/run_specs/vlm_run_specs.py +1057 -0
helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
helm/benchmark/runner.py +63 -62
helm/benchmark/runner_config_registry.py +21 -0
helm/benchmark/scenarios/aci_bench_scenario.py +149 -0
helm/benchmark/scenarios/air_bench_scenario.py +76 -0
helm/benchmark/scenarios/alghafa_scenario.py +126 -0
helm/benchmark/scenarios/alrage_scenario.py +54 -0
helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +27 -3
helm/benchmark/scenarios/anthropic_red_team_scenario.py +82 -0
helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
helm/benchmark/scenarios/arabic_mmlu_scenario.py +82 -0
helm/benchmark/scenarios/aratrust_scenario.py +95 -0
helm/benchmark/scenarios/audio_language/__init__.py +0 -0
helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +130 -0
helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +74 -0
helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +70 -0
helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +79 -0
helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +78 -0
helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +83 -0
helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +105 -0
helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
helm/benchmark/scenarios/babi_qa_scenario.py +16 -1
helm/benchmark/scenarios/banking77_scenario.py +77 -0
helm/benchmark/scenarios/bbq_scenario.py +17 -2
helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
helm/benchmark/scenarios/big_bench_scenario.py +11 -1
helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
helm/benchmark/scenarios/bird_sql_scenario.py +112 -0
helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
helm/benchmark/scenarios/blimp_scenario.py +1 -1
helm/benchmark/scenarios/bluex_scenario.py +70 -0
helm/benchmark/scenarios/bold_scenario.py +18 -3
helm/benchmark/scenarios/boolq_scenario.py +21 -1
helm/benchmark/scenarios/call_center_scenario.py +84 -0
helm/benchmark/scenarios/casehold_scenario.py +79 -0
helm/benchmark/scenarios/chw_care_plan_scenario.py +129 -0
helm/benchmark/scenarios/ci_mcqa_scenario.py +80 -0
helm/benchmark/scenarios/civil_comments_scenario.py +14 -1
helm/benchmark/scenarios/clear_scenario.py +180 -0
helm/benchmark/scenarios/cleva_scenario.py +482 -3
helm/benchmark/scenarios/code_scenario.py +46 -4
helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
helm/benchmark/scenarios/commonsense_scenario.py +33 -1
helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +118 -0
helm/benchmark/scenarios/copyright_scenario.py +35 -1
helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
helm/benchmark/scenarios/cti_to_mitre_scenario.py +261 -0
helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
helm/benchmark/scenarios/czech_bank_qa_scenario.py +148 -0
helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +190 -0
helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +143 -0
helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +98 -0
helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +344 -0
helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +217 -0
helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +571 -0
helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +80 -0
helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +90 -0
helm/benchmark/scenarios/dialogue_scenarios.py +13 -3
helm/benchmark/scenarios/dischargeme_scenario.py +196 -0
helm/benchmark/scenarios/disinformation_scenario.py +32 -1
helm/benchmark/scenarios/dyck_language_scenario.py +25 -1
helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
helm/benchmark/scenarios/ehr_sql_scenario.py +137 -0
helm/benchmark/scenarios/ehrshot_scenario.py +1541 -0
helm/benchmark/scenarios/enem_challenge_scenario.py +77 -0
helm/benchmark/scenarios/entity_data_imputation_scenario.py +33 -3
helm/benchmark/scenarios/entity_matching_scenario.py +26 -2
helm/benchmark/scenarios/ewok_scenario.py +116 -0
helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
helm/benchmark/scenarios/fin_qa_scenario.py +139 -0
helm/benchmark/scenarios/financebench_scenario.py +74 -0
helm/benchmark/scenarios/financial_phrasebank_scenario.py +115 -0
helm/benchmark/scenarios/gold_commodity_news_scenario.py +145 -0
helm/benchmark/scenarios/gpqa_scenario.py +98 -0
helm/benchmark/scenarios/grammar.py +2 -2
helm/benchmark/scenarios/grammar_scenario.py +21 -2
helm/benchmark/scenarios/gsm_scenario.py +31 -1
helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +61 -0
helm/benchmark/scenarios/harm_bench_scenario.py +70 -0
helm/benchmark/scenarios/headqa_scenario.py +158 -0
helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +50 -0
helm/benchmark/scenarios/ice_scenario.py +28 -4
helm/benchmark/scenarios/ifeval_scenario.py +71 -0
helm/benchmark/scenarios/image_generation/__init__.py +0 -0
helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
helm/benchmark/scenarios/imdb_scenario.py +26 -3
helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +111 -0
helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +98 -0
helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
helm/benchmark/scenarios/koala_scenario.py +21 -1
helm/benchmark/scenarios/kpi_edgar_scenario.py +172 -0
helm/benchmark/scenarios/legal_contract_summarization_scenario.py +149 -0
helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
helm/benchmark/scenarios/legal_summarization_scenario.py +61 -1
helm/benchmark/scenarios/legal_support_scenario.py +24 -1
helm/benchmark/scenarios/legalbench_scenario.py +45 -3
helm/benchmark/scenarios/lex_glue_scenario.py +23 -2
helm/benchmark/scenarios/lextreme_scenario.py +22 -1
helm/benchmark/scenarios/live_qa_scenario.py +94 -0
helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
helm/benchmark/scenarios/lsat_qa_scenario.py +15 -1
helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
helm/benchmark/scenarios/math_scenario.py +81 -22
helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
helm/benchmark/scenarios/med_dialog_scenario.py +56 -22
helm/benchmark/scenarios/med_mcqa_scenario.py +24 -1
helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
helm/benchmark/scenarios/med_qa_scenario.py +30 -1
helm/benchmark/scenarios/medalign_scenario.py +117 -0
helm/benchmark/scenarios/medalign_scenario_helper.py +326 -0
helm/benchmark/scenarios/medbullets_scenario.py +167 -0
helm/benchmark/scenarios/medcalc_bench_scenario.py +149 -0
helm/benchmark/scenarios/medec_scenario.py +148 -0
helm/benchmark/scenarios/medhallu_scenario.py +95 -0
helm/benchmark/scenarios/medhelm/__init__.py +0 -0
helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
helm/benchmark/scenarios/medi_qa_scenario.py +134 -0
helm/benchmark/scenarios/medication_qa_scenario.py +96 -0
helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
helm/benchmark/scenarios/melt_scenarios.py +793 -0
helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
helm/benchmark/scenarios/mental_health_scenario.py +146 -0
helm/benchmark/scenarios/mimic_bhc_scenario.py +127 -0
helm/benchmark/scenarios/mimic_rrs_scenario.py +121 -0
helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +99 -0
helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
helm/benchmark/scenarios/mmlu_pro_scenario.py +113 -0
helm/benchmark/scenarios/mmlu_scenario.py +32 -1
helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
helm/benchmark/scenarios/msmarco_scenario.py +31 -1
helm/benchmark/scenarios/mtsamples_procedures_scenario.py +166 -0
helm/benchmark/scenarios/mtsamples_replicate_scenario.py +164 -0
helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +297 -0
helm/benchmark/scenarios/narrativeqa_scenario.py +20 -1
helm/benchmark/scenarios/natural_qa_scenario.py +33 -1
helm/benchmark/scenarios/newsqa_scenario.py +1 -1
helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
helm/benchmark/scenarios/omni_math_scenario.py +71 -0
helm/benchmark/scenarios/open_assistant_scenario.py +33 -2
helm/benchmark/scenarios/openai_mrcr_scenario.py +94 -0
helm/benchmark/scenarios/opinions_qa_scenario.py +1 -5
helm/benchmark/scenarios/pubmed_qa_scenario.py +81 -43
helm/benchmark/scenarios/quac_scenario.py +24 -1
helm/benchmark/scenarios/race_based_med_scenario.py +175 -0
helm/benchmark/scenarios/raft_scenario.py +33 -3
helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
helm/benchmark/scenarios/ruler_qa_scenarios.py +128 -0
helm/benchmark/scenarios/scenario.py +44 -1
helm/benchmark/scenarios/seahelm_scenario.py +2295 -0
helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
helm/benchmark/scenarios/shc_bmt_scenario.py +97 -0
helm/benchmark/scenarios/shc_cdi_scenario.py +95 -0
helm/benchmark/scenarios/shc_conf_scenario.py +99 -0
helm/benchmark/scenarios/shc_ent_scenario.py +98 -0
helm/benchmark/scenarios/shc_gip_scenario.py +94 -0
helm/benchmark/scenarios/shc_privacy_scenario.py +100 -0
helm/benchmark/scenarios/shc_proxy_scenario.py +98 -0
helm/benchmark/scenarios/shc_ptbm_scenario.py +104 -0
helm/benchmark/scenarios/shc_sei_scenario.py +94 -0
helm/benchmark/scenarios/shc_sequoia_scenario.py +98 -0
helm/benchmark/scenarios/simple_safety_tests_scenario.py +44 -0
helm/benchmark/scenarios/simple_scenarios.py +122 -1
helm/benchmark/scenarios/situation_prompts.yaml +49 -0
helm/benchmark/scenarios/spider_scenario.py +109 -0
helm/benchmark/scenarios/starr_patient_instructions_scenario.py +119 -0
helm/benchmark/scenarios/summarization_scenario.py +48 -1
helm/benchmark/scenarios/sumosum_scenario.py +157 -0
helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +24 -1
helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
helm/benchmark/scenarios/test_air_bench_scenario.py +27 -0
helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
helm/benchmark/scenarios/test_commonsense_scenario.py +21 -0
helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
helm/benchmark/scenarios/test_ewok_scenario.py +29 -0
helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
helm/benchmark/scenarios/test_financebench_scenario.py +26 -0
helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
helm/benchmark/scenarios/test_gsm_scenario.py +31 -0
helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
helm/benchmark/scenarios/test_legalbench_scenario.py +30 -0
helm/benchmark/scenarios/test_math_scenario.py +4 -3
helm/benchmark/scenarios/test_med_qa_scenario.py +30 -0
helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
helm/benchmark/scenarios/test_mmlu_scenario.py +33 -0
helm/benchmark/scenarios/test_narrativeqa_scenario.py +73 -0
helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
helm/benchmark/scenarios/test_scenario.py +6 -3
helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
helm/benchmark/scenarios/thai_exam_scenario.py +239 -0
helm/benchmark/scenarios/the_pile_scenario.py +13 -1
helm/benchmark/scenarios/truthful_qa_scenario.py +26 -2
helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
helm/benchmark/scenarios/unitxt_scenario.py +62 -0
helm/benchmark/scenarios/verifiability_judgment_scenario.py +4 -2
helm/benchmark/scenarios/vicuna_scenario.py +22 -2
helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +83 -0
helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +135 -0
helm/benchmark/scenarios/vision_language/exams_v_scenario.py +104 -0
helm/benchmark/scenarios/vision_language/fair_face_scenario.py +136 -0
helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +74 -0
helm/benchmark/scenarios/vision_language/gqa_scenario.py +91 -0
helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +94 -0
helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
helm/benchmark/scenarios/vision_language/image2struct/__init__.py +0 -0
helm/benchmark/scenarios/vision_language/image2struct/chart2csv_scenario.py +55 -0
helm/benchmark/scenarios/vision_language/image2struct/image2struct_scenario.py +225 -0
helm/benchmark/scenarios/vision_language/image2struct/latex_scenario.py +21 -0
helm/benchmark/scenarios/vision_language/image2struct/musicsheet_scenario.py +16 -0
helm/benchmark/scenarios/vision_language/image2struct/utils_latex.py +339 -0
helm/benchmark/scenarios/vision_language/image2struct/webpage/__init__.py +0 -0
helm/benchmark/scenarios/vision_language/image2struct/webpage/driver.py +84 -0
helm/benchmark/scenarios/vision_language/image2struct/webpage/jekyll_server.py +182 -0
helm/benchmark/scenarios/vision_language/image2struct/webpage/utils.py +31 -0
helm/benchmark/scenarios/vision_language/image2struct/webpage_scenario.py +256 -0
helm/benchmark/scenarios/vision_language/math_vista_scenario.py +117 -0
helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +103 -0
helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
helm/benchmark/scenarios/vision_language/mme_scenario.py +148 -0
helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
helm/benchmark/scenarios/vision_language/mscoco_captioning_scenario.py +92 -0
helm/benchmark/scenarios/vision_language/mscoco_categorization_scenario.py +117 -0
helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
helm/benchmark/scenarios/vision_language/originality_scenario.py +35 -0
helm/benchmark/scenarios/vision_language/pairs_scenario.py +247 -0
helm/benchmark/scenarios/vision_language/pope_scenario.py +105 -0
helm/benchmark/scenarios/vision_language/real_world_qa_scenario.py +57 -0
helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +131 -0
helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +98 -0
helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +4 -5
helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
helm/benchmark/scenarios/vision_language/vqa_scenario.py +8 -4
helm/benchmark/scenarios/wikifact_scenario.py +31 -1
helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
helm/benchmark/scenarios/wildbench_scenario.py +101 -0
helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
helm/benchmark/scenarios/wmt_14_scenario.py +33 -2
helm/benchmark/scenarios/xstest_scenario.py +35 -0
helm/benchmark/server.py +32 -2
helm/benchmark/slurm_jobs.py +1 -2
helm/benchmark/slurm_runner.py +78 -50
helm/benchmark/static/schema_air_bench.yaml +3149 -0
helm/benchmark/static/schema_arabic.yaml +271 -0
helm/benchmark/static/schema_audio.yaml +763 -0
helm/benchmark/static/schema_autobencher.yaml +150 -0
helm/benchmark/static/schema_call_center.yaml +269 -0
helm/benchmark/static/schema_capabilities.yaml +254 -0
helm/benchmark/static/schema_classic.yaml +259 -1140
helm/benchmark/static/schema_cleva.yaml +768 -0
helm/benchmark/static/schema_czech_bank.yaml +148 -0
helm/benchmark/static/schema_decodingtrust.yaml +444 -0
helm/benchmark/static/schema_enem_challenge.yaml +146 -0
helm/benchmark/static/schema_enterprise.yaml +319 -0
helm/benchmark/static/schema_ewok.yaml +367 -0
helm/benchmark/static/schema_finance.yaml +191 -0
helm/benchmark/static/schema_heim.yaml +1389 -0
helm/benchmark/static/schema_image2struct.yaml +588 -0
helm/benchmark/static/schema_instruction_following.yaml +161 -0
helm/benchmark/static/schema_legal.yaml +566 -0
helm/benchmark/static/schema_lite.yaml +3 -286
helm/benchmark/static/schema_long_context.yaml +282 -0
helm/benchmark/static/schema_medhelm.yaml +1176 -0
helm/benchmark/static/schema_melt.yaml +1257 -0
helm/benchmark/static/schema_mmlu.yaml +1449 -0
helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
helm/benchmark/static/schema_safety.yaml +283 -0
helm/benchmark/static/schema_seahelm.yaml +723 -0
helm/benchmark/static/schema_slp.yaml +219 -0
helm/benchmark/static/schema_slphelm.yaml +162 -0
helm/benchmark/static/schema_social_audio.yaml +224 -0
helm/benchmark/static/schema_sql.yaml +171 -0
helm/benchmark/static/schema_thai.yaml +244 -0
helm/benchmark/static/schema_torr.yaml +474 -0
helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
helm/benchmark/static/schema_unitxt.yaml +370 -0
helm/benchmark/static/schema_vhelm.yaml +933 -0
helm/benchmark/static/schema_vhelm_lite.yaml +109 -0
helm/benchmark/static/schema_video.yaml +219 -0
helm/benchmark/static_build/assets/air-overview-DpBbyagA.png +0 -0
helm/benchmark/static_build/assets/audio-table-Dn5NMMeJ.png +0 -0
helm/benchmark/static_build/assets/heim-logo-BJtQlEbV.png +0 -0
helm/benchmark/static_build/assets/helm-safety-COfndXuS.png +0 -0
helm/benchmark/static_build/assets/helmhero-D9TvmJsp.png +0 -0
helm/benchmark/static_build/assets/index-oIeiQW2g.css +1 -0
helm/benchmark/static_build/assets/index-qOFpOyHb.js +10 -0
helm/benchmark/static_build/assets/medhelm-overview-CND0EIsy.png +0 -0
helm/benchmark/static_build/assets/medhelm-v1-overview-Cu2tphBB.png +0 -0
helm/benchmark/static_build/assets/overview-BwypNWnk.png +0 -0
helm/benchmark/static_build/assets/process-flow-DWDJC733.png +0 -0
helm/benchmark/static_build/assets/react-BteFIppM.js +85 -0
helm/benchmark/static_build/assets/recharts-DxuQtTOs.js +97 -0
helm/benchmark/static_build/assets/tremor-DR4fE7ko.js +10 -0
helm/benchmark/static_build/assets/vhelm-aspects-NiDQofvP.png +0 -0
helm/benchmark/static_build/assets/vhelm-framework-NxJE4fdA.png +0 -0
helm/benchmark/static_build/assets/vhelm-model-ypCL5Yvq.png +0 -0
helm/benchmark/static_build/config.js +4 -0
helm/benchmark/static_build/index.html +19 -0
helm/benchmark/test_data_preprocessor.py +3 -3
helm/benchmark/test_run_expander.py +1 -1
helm/benchmark/window_services/default_window_service.py +3 -45
helm/benchmark/window_services/encoder_decoder_window_service.py +4 -15
helm/benchmark/window_services/ice_window_service.py +1 -35
helm/benchmark/window_services/image_generation/__init__.py +0 -0
helm/benchmark/window_services/image_generation/clip_window_service.py +13 -0
helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
helm/benchmark/window_services/local_window_service.py +22 -5
helm/benchmark/window_services/test_anthropic_window_service.py +5 -4
helm/benchmark/window_services/test_bloom_window_service.py +5 -4
helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
helm/benchmark/window_services/test_gpt2_window_service.py +9 -4
helm/benchmark/window_services/test_gpt4_window_service.py +10 -4
helm/benchmark/window_services/test_gptj_window_service.py +11 -5
helm/benchmark/window_services/test_gptneox_window_service.py +6 -5
helm/benchmark/window_services/test_openai_window_service.py +18 -12
helm/benchmark/window_services/test_opt_window_service.py +6 -5
helm/benchmark/window_services/test_palmyra_window_service.py +5 -4
helm/benchmark/window_services/test_t0pp_window_service.py +5 -4
helm/benchmark/window_services/test_t511b_window_service.py +5 -4
helm/benchmark/window_services/test_ul2_window_service.py +5 -4
helm/benchmark/window_services/test_utils.py +6 -6
helm/benchmark/window_services/test_yalm_window_service.py +5 -4
helm/benchmark/window_services/tokenizer_service.py +7 -13
helm/benchmark/window_services/window_service.py +42 -0
helm/benchmark/window_services/window_service_factory.py +4 -1
helm/benchmark/window_services/yalm_window_service.py +1 -28
helm/clients/__init__.py +0 -0
helm/{proxy/clients → clients}/ai21_client.py +78 -12
helm/clients/aleph_alpha_client.py +114 -0
helm/{proxy/clients → clients}/anthropic_client.py +304 -21
helm/clients/audio_language/__init__.py +0 -0
helm/clients/audio_language/diva_llama_client.py +122 -0
helm/clients/audio_language/llama_omni/arguments.py +61 -0
helm/clients/audio_language/llama_omni/constants.py +9 -0
helm/clients/audio_language/llama_omni/conversation.py +213 -0
helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
helm/clients/audio_language/llama_omni/model/builder.py +88 -0
helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
helm/clients/audio_language/llama_omni/preprocess.py +295 -0
helm/clients/audio_language/llama_omni/utils.py +202 -0
helm/clients/audio_language/llama_omni_client.py +199 -0
helm/clients/audio_language/qwen2_5_omni_client.py +210 -0
helm/clients/audio_language/qwen2_audiolm_client.py +191 -0
helm/clients/audio_language/qwen_audiolm_client.py +153 -0
helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
helm/clients/audio_language/test.py +62 -0
helm/{proxy/clients → clients}/auto_client.py +72 -31
helm/clients/azure_openai_client.py +55 -0
helm/clients/bedrock_client.py +381 -0
helm/clients/bedrock_utils.py +105 -0
helm/{proxy/clients → clients}/client.py +92 -17
helm/clients/clip_score_client.py +49 -0
helm/clients/clip_scorers/__init__.py +0 -0
helm/clients/clip_scorers/base_clip_scorer.py +18 -0
helm/clients/clip_scorers/clip_scorer.py +50 -0
helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
helm/{proxy/clients → clients}/cohere_client.py +105 -14
helm/clients/dspy_client.py +135 -0
helm/clients/gcs_client.py +82 -0
helm/{proxy/clients → clients}/google_client.py +8 -6
helm/clients/google_translate_client.py +35 -0
helm/clients/grok_client.py +36 -0
helm/{proxy/clients → clients}/http_model_client.py +8 -8
helm/{proxy/clients → clients}/huggingface_client.py +157 -86
helm/clients/huggingface_pipeline_client.py +138 -0
helm/clients/ibm_client.py +269 -0
helm/clients/image_generation/__init__.py +0 -0
helm/clients/image_generation/adobe_vision_client.py +80 -0
helm/clients/image_generation/aleph_alpha_image_generation_client.py +100 -0
helm/clients/image_generation/cogview2/__init__.py +0 -0
helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +99 -0
helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +144 -0
helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
helm/clients/image_generation/cogview2_client.py +192 -0
helm/clients/image_generation/dalle2_client.py +194 -0
helm/clients/image_generation/dalle3_client.py +108 -0
helm/clients/image_generation/dalle_mini/__init__.py +3 -0
helm/clients/image_generation/dalle_mini/data.py +442 -0
helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
helm/clients/image_generation/dalle_mini/model/text.py +251 -0
helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
helm/clients/image_generation/dalle_mini_client.py +191 -0
helm/clients/image_generation/deep_floyd_client.py +80 -0
helm/clients/image_generation/huggingface_diffusers_client.py +250 -0
helm/clients/image_generation/image_generation_client_utils.py +9 -0
helm/clients/image_generation/lexica_client.py +88 -0
helm/clients/image_generation/mindalle/__init__.py +0 -0
helm/clients/image_generation/mindalle/models/__init__.py +216 -0
helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
helm/clients/image_generation/mindalle/utils/config.py +129 -0
helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
helm/clients/image_generation/mindalle/utils/utils.py +89 -0
helm/clients/image_generation/mindalle_client.py +116 -0
helm/clients/image_generation/nudity_check_client.py +64 -0
helm/clients/image_generation/together_image_generation_client.py +113 -0
helm/{proxy/clients → clients}/lit_gpt_client.py +6 -6
helm/{proxy/clients → clients}/megatron_client.py +7 -5
helm/clients/mistral_client.py +180 -0
helm/clients/moderation_api_client.py +111 -0
helm/clients/nvidia_nim_client.py +32 -0
helm/clients/open_lm_client.py +43 -0
helm/clients/openai_client.py +604 -0
helm/clients/openai_responses_client.py +200 -0
helm/clients/openrouter_client.py +31 -0
helm/{proxy/clients → clients}/palmyra_client.py +31 -14
helm/{proxy/clients → clients}/perspective_api_client.py +18 -14
helm/clients/reka_client.py +190 -0
helm/clients/simple_client.py +64 -0
helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
helm/clients/stanfordhealthcare_claude_client.py +31 -0
helm/clients/stanfordhealthcare_google_client.py +43 -0
helm/clients/stanfordhealthcare_http_model_client.py +95 -0
helm/clients/stanfordhealthcare_openai_client.py +62 -0
helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
helm/{proxy/clients → clients}/test_auto_client.py +13 -15
helm/clients/test_client.py +98 -0
helm/{proxy/clients → clients}/test_huggingface_client.py +31 -16
helm/clients/test_openrouter_client.py +69 -0
helm/clients/test_simple_client.py +19 -0
helm/clients/test_together_client.py +184 -0
helm/clients/together_client.py +599 -0
helm/clients/upstage_client.py +23 -0
helm/clients/vertexai_client.py +488 -0
helm/clients/vision_language/__init__.py +0 -0
helm/clients/vision_language/huggingface_vision2seq_client.py +148 -0
helm/clients/vision_language/huggingface_vlm_client.py +114 -0
helm/{proxy/clients → clients}/vision_language/idefics_client.py +61 -51
helm/clients/vision_language/open_flamingo/__init__.py +2 -0
helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
helm/clients/vision_language/open_flamingo_client.py +155 -0
helm/clients/vision_language/paligemma_client.py +147 -0
helm/clients/vision_language/palmyra_vision_client.py +101 -0
helm/clients/vision_language/qwen2_vlm_client.py +189 -0
helm/clients/vision_language/qwen_vlm_client.py +174 -0
helm/clients/vllm_client.py +80 -0
helm/clients/vllm_granite_thinking_client.py +56 -0
helm/clients/writer_client.py +105 -0
helm/clients/yi_client.py +28 -0
helm/common/audio_utils.py +111 -0
helm/common/cache.py +23 -33
helm/common/cache_backend_config.py +47 -0
helm/common/clip_score_request.py +41 -0
helm/common/context.py +80 -0
helm/common/credentials_utils.py +5 -5
helm/common/critique_request.py +10 -2
helm/common/file_caches/__init__.py +0 -0
helm/common/file_caches/file_cache.py +16 -0
helm/common/file_caches/local_file_cache.py +61 -0
helm/common/file_caches/test_local_file_cache.py +25 -0
helm/common/file_upload_request.py +27 -0
helm/common/general.py +10 -3
helm/common/hierarchical_logger.py +124 -12
helm/common/image_generation_parameters.py +25 -0
helm/common/images_utils.py +60 -5
helm/common/key_value_store.py +41 -10
helm/common/local_context.py +140 -0
helm/common/media_object.py +14 -1
helm/common/moderations_api_request.py +71 -0
helm/common/mongo_key_value_store.py +8 -7
helm/common/multimodal_request_utils.py +57 -0
helm/common/nudity_check_request.py +29 -0
helm/common/object_spec.py +23 -8
helm/common/optional_dependencies.py +1 -1
helm/common/reeval_parameters.py +12 -0
helm/common/remote_context.py +61 -0
helm/common/request.py +45 -19
helm/common/response_format.py +18 -0
helm/common/test_cache.py +1 -48
helm/common/test_general.py +10 -0
helm/common/test_logging.py +94 -0
helm/common/test_media_object.py +1 -1
helm/common/tokenization_request.py +1 -10
helm/config/model_deployments.yaml +4713 -1005
helm/config/model_metadata.yaml +4045 -255
helm/config/tokenizer_configs.yaml +1091 -50
helm/proxy/accounts.py +31 -4
helm/proxy/cli.py +6 -4
helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
helm/proxy/critique/mechanical_turk_utils.py +1 -1
helm/proxy/critique/model_critique_client.py +40 -10
helm/proxy/example_queries.py +33 -28
helm/proxy/retry.py +5 -0
helm/proxy/server.py +82 -18
helm/proxy/services/remote_service.py +32 -7
helm/proxy/services/server_service.py +71 -69
helm/proxy/services/service.py +30 -6
helm/proxy/services/test_remote_service.py +6 -5
helm/proxy/services/test_service.py +1 -13
helm/proxy/static/help.html +99 -0
helm/proxy/static/index.css +61 -0
helm/proxy/static/index.html +40 -0
helm/proxy/static/index.js +462 -0
helm/proxy/test_accounts.py +32 -0
helm/proxy/test_retry.py +1 -1
helm/proxy/token_counters/auto_token_counter.py +37 -37
helm/proxy/token_counters/test_auto_token_counter.py +164 -0
helm/proxy/token_counters/token_counter.py +3 -5
helm/tokenizers/__init__.py +0 -0
helm/tokenizers/ai21_tokenizer.py +52 -0
helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +1 -1
helm/{proxy/tokenizers → tokenizers}/auto_tokenizer.py +9 -12
helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +2 -30
helm/tokenizers/cohere_tokenizer.py +50 -0
helm/tokenizers/grok_tokenizer.py +55 -0
helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +4 -4
helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +44 -41
helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +1 -1
helm/tokenizers/simple_tokenizer.py +33 -0
helm/tokenizers/test_ai21_tokenizer.py +48 -0
helm/{proxy/tokenizers → tokenizers}/test_anthropic_tokenizer.py +6 -2
helm/tokenizers/test_cohere_tokenizer.py +39 -0
helm/tokenizers/test_grok_tokenizer.py +33 -0
helm/{proxy/tokenizers → tokenizers}/test_huggingface_tokenizer.py +9 -2
helm/tokenizers/test_simple_tokenizer.py +33 -0
helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +1 -1
helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +1 -1
helm/{proxy/tokenizers → tokenizers}/tokenizer.py +3 -1
helm/{proxy/tokenizers → tokenizers}/vertexai_tokenizer.py +1 -1
helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +8 -6
helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
crfm_helm-0.4.0.dist-info/METADATA +0 -264
crfm_helm-0.4.0.dist-info/RECORD +0 -397
helm/benchmark/data_overlap/data_overlap_spec.py +0 -86
helm/benchmark/data_overlap/export_scenario_text.py +0 -119
helm/benchmark/data_overlap/light_scenario.py +0 -60
helm/benchmark/metrics/numeracy_metrics.py +0 -72
helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
helm/benchmark/run_specs.py +0 -2762
helm/benchmark/scenarios/numeracy_scenario.py +0 -784
helm/benchmark/static/benchmarking.css +0 -156
helm/benchmark/static/benchmarking.js +0 -1705
helm/benchmark/static/config.js +0 -3
helm/benchmark/static/images/helm-logo.png +0 -0
helm/benchmark/static/images/language-model-helm.png +0 -0
helm/benchmark/static/images/organizations/ai21.png +0 -0
helm/benchmark/static/images/organizations/anthropic.png +0 -0
helm/benchmark/static/images/organizations/bigscience.png +0 -0
helm/benchmark/static/images/organizations/cohere.png +0 -0
helm/benchmark/static/images/organizations/eleutherai.png +0 -0
helm/benchmark/static/images/organizations/google.png +0 -0
helm/benchmark/static/images/organizations/meta.png +0 -0
helm/benchmark/static/images/organizations/microsoft.png +0 -0
helm/benchmark/static/images/organizations/nvidia.png +0 -0
helm/benchmark/static/images/organizations/openai.png +0 -0
helm/benchmark/static/images/organizations/together.png +0 -0
helm/benchmark/static/images/organizations/tsinghua-keg.png +0 -0
helm/benchmark/static/images/organizations/yandex.png +0 -0
helm/benchmark/static/images/scenarios-by-metrics.png +0 -0
helm/benchmark/static/images/taxonomy-scenarios.png +0 -0
helm/benchmark/static/index.html +0 -68
helm/benchmark/static/json-urls.js +0 -69
helm/benchmark/static/plot-captions.js +0 -27
helm/benchmark/static/utils.js +0 -285
helm/benchmark/test_model_deployment_definition.py +0 -92
helm/benchmark/test_model_properties.py +0 -1570
helm/benchmark/vlm_run_specs.py +0 -97
helm/benchmark/window_services/ai21_window_service.py +0 -258
helm/benchmark/window_services/cohere_window_service.py +0 -163
helm/benchmark/window_services/flan_t5_window_service.py +0 -29
helm/benchmark/window_services/gpt2_window_service.py +0 -32
helm/benchmark/window_services/huggingface_window_service.py +0 -60
helm/benchmark/window_services/t0pp_window_service.py +0 -35
helm/benchmark/window_services/t511b_window_service.py +0 -30
helm/benchmark/window_services/test_ai21_window_service.py +0 -163
helm/benchmark/window_services/test_cohere_window_service.py +0 -74
helm/benchmark/window_services/test_cohere_window_service_utils.py +0 -8328
helm/benchmark/window_services/test_ice_window_service.py +0 -326
helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
helm/benchmark/window_services/ul2_window_service.py +0 -30
helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
helm/common/cache_utils.py +0 -14
helm/proxy/clients/aleph_alpha_client.py +0 -95
helm/proxy/clients/goose_ai_client.py +0 -99
helm/proxy/clients/microsoft_client.py +0 -180
helm/proxy/clients/openai_client.py +0 -206
helm/proxy/clients/simple_client.py +0 -60
helm/proxy/clients/test_client.py +0 -49
helm/proxy/clients/test_together_client.py +0 -97
helm/proxy/clients/together_client.py +0 -334
helm/proxy/clients/vertexai_client.py +0 -115
helm/proxy/token_counters/ai21_token_counter.py +0 -20
helm/proxy/token_counters/cohere_token_counter.py +0 -13
helm/proxy/token_counters/free_token_counter.py +0 -12
helm/proxy/token_counters/gooseai_token_counter.py +0 -24
helm/proxy/token_counters/openai_token_counter.py +0 -22
helm/proxy/token_counters/test_ai21_token_counter.py +0 -88
helm/proxy/token_counters/test_openai_token_counter.py +0 -81
helm/proxy/tokenizers/ai21_tokenizer.py +0 -60
helm/proxy/tokenizers/anthropic_tokenizer.py +0 -52
helm/proxy/tokenizers/cohere_tokenizer.py +0 -83
helm/proxy/tokenizers/ice_tokenizer.py +0 -30
helm/proxy/tokenizers/simple_tokenizer.py +0 -32
helm/proxy/tokenizers/test_ice_tokenizer.py +0 -57
{crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info}/entry_points.txt +0 -0
{crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info/licenses}/LICENSE +0 -0
{crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info}/top_level.txt +0 -0
/helm/benchmark/{data_overlap → annotation}/__init__.py +0 -0
/helm/{proxy/clients → benchmark/annotation/image2struct}/__init__.py +0 -0
/helm/{proxy/clients/vision_language → benchmark/metrics/ifeval}/__init__.py +0 -0
/helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
/helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
/helm/benchmark/{static/images/crfm-logo.png → static_build/assets/crfm-logo-Du4T1uWZ.png} +0 -0
/helm/benchmark/{static/images/helm-logo-simple.png → static_build/assets/helm-logo-simple-DzOhNN41.png} +0 -0
/helm/{proxy/clients → clients}/ai21_utils.py +0 -0
/helm/{proxy/clients → clients}/cohere_utils.py +0 -0
/helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
/helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
/helm/{benchmark → proxy}/static/general.js +0 -0
/helm/{benchmark → proxy}/static/info-icon.png +0 -0

helm/benchmark/metrics/image_generation/clip_score_metrics.py ADDED Viewed

@@ -0,0 +1,84 @@
+from statistics import mean
+from typing import List
+from helm.common.general import singleton
+from helm.common.request import RequestResult
+from helm.common.clip_score_request import DEFAULT_CLIP_SCORE_MODEL, CLIPScoreResult, CLIPScoreRequest
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+from helm.benchmark.metrics.statistic import Stat
+from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.window_services.image_generation.clip_window_service import CLIPWindowService
+from helm.common.images_utils import is_blacked_out_image
+from helm.common.multimodal_request_utils import gather_generated_image_locations
+class CLIPScoreMetric(Metric):  # Per-request metric: CLIP-scores each generated image against the (truncated) prompt.
+    """
+    Defines CLIPScore-based metrics (https://arxiv.org/abs/2104.08718).
+    CLIPScore is a reference free metric that can be used to evaluate the correlation between an image
+    caption and the content of the image. It has been found to be highly correlated with human judgement.
+    """
+    def __init__(self, multilingual: bool = False):  # flag is forwarded to CLIPScoreRequest and changes the stat names
+        self._multilingual: bool = multilingual
+    def __repr__(self):
+        return f"CLIPScoreMetric(multilingual={self._multilingual})"
+    def evaluate_generation(  # returns two stats: mean and max CLIP score over the scored images
+        self,
+        adapter_spec: AdapterSpec,  # not used in this body
+        request_state: RequestState,  # supplies the prompt, the instance and the request result
+        metric_service: MetricService,  # used for tokenizer-based truncation and for compute_clip_score
+        eval_cache_path: str,  # not used in this body
+    ) -> List[Stat]:
+        def get_metric_name(base_name: str) -> str:  # appends "_multilingual" when the flag is set
+            if self._multilingual:
+                base_name = f"{base_name}_multilingual"
+            return base_name
+        assert request_state.result is not None  # a result must already be attached to the request state
+        request_result: RequestResult = request_state.result
+        prompt: str = request_state.request.prompt
+        perturbation_name: str = request_state.instance.perturbation.name if request_state.instance.perturbation else ""
+        if (  # for these three perturbations, score against the contrast input text instead of the request prompt
+            request_state.instance.contrast_inputs is not None
+            and len(request_state.instance.contrast_inputs) > 0
+            and perturbation_name in ["translate", "dialect", "mild_mix"]
+        ):
+            prompt = singleton(request_state.instance.contrast_inputs).text  # presumably expects exactly one contrast input - TODO confirm
+        # Truncate the prompt using the CLIP tokenizer before feeding into the CLIP model.
+        # Otherwise, the library will throw an error.
+        model = DEFAULT_CLIP_SCORE_MODEL  # the same constant is passed as tokenizer_name below
+        # The max length is 77, but we also need to account for <|startoftext|> and <|endoftext|>.
+        # This max length is hardcoded for DEFAULT_CLIP_SCORE_MODEL i.e. openai/clip-vit-large-patch14
+        max_sequence_length = 77 - 2
+        prompt = CLIPWindowService(
+            service=metric_service,
+            tokenizer_name=DEFAULT_CLIP_SCORE_MODEL,
+            max_sequence_length=max_sequence_length,
+            max_request_length=max_sequence_length,
+            end_of_text_token="",
+            prefix_token="",
+        ).truncate_from_right(prompt)  # prompt now holds the truncated text that is sent for scoring
+        scores: List[float] = []
+        image_locations: List[str] = gather_generated_image_locations(request_result)
+        for location in image_locations:
+            if not is_blacked_out_image(location):  # blacked-out images are skipped and contribute no score
+                result: CLIPScoreResult = metric_service.compute_clip_score(
+                    CLIPScoreRequest(prompt, location, model=model, multilingual=self._multilingual)
+                )
+                scores.append(result.score)
+        stats: List[Stat] = [  # both stats report 0 when no image was scored
+            Stat(MetricName(get_metric_name("expected_clip_score"))).add(mean(scores) if len(scores) > 0 else 0),
+            Stat(MetricName(get_metric_name("max_clip_score"))).add(max(scores) if len(scores) > 0 else 0),
+        ]
+        return stats

helm/benchmark/metrics/image_generation/denoised_runtime_metric.py ADDED Viewed

@@ -0,0 +1,42 @@
+from collections import defaultdict
+from tqdm import tqdm
+from typing import Dict
+import math
+import numpy as np
+from helm.common.request import RequestResult
+from helm.benchmark.scenarios.scenario import Instance
+from helm.benchmark.adaptation.scenario_state import ScenarioState
+from helm.benchmark.metrics.statistic import Stat
+from helm.benchmark.metrics.metric import MetricInterface, MetricResult
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+class DenoisedRuntimeMetric(MetricInterface):
+    def __repr__(self):
+        return "DenoisedRuntimeMetric()"
+    def evaluate(
+        self,
+        scenario_state: ScenarioState,
+        metric_service: MetricService,
+        eval_cache_path: str,
+        parallelism: int,
+    ) -> MetricResult:
+        instance_to_min_request_times: Dict[Instance, float] = defaultdict(lambda: math.inf)
+        for request_state in tqdm(scenario_state.request_states):
+            assert request_state.result is not None
+            request_result: RequestResult = request_state.result
+            assert request_result.request_time is not None
+            request_time: float = request_result.request_time
+            instance: Instance = request_state.instance
+            instance_to_min_request_times[instance] = min(instance_to_min_request_times[instance], request_time)
+        denoised_runtime: float = float(np.mean(list(instance_to_min_request_times.values())))
+        return MetricResult(
+            aggregated_stats=[Stat(MetricName("denoised_runtime")).add(denoised_runtime)], per_instance_stats=[]
+        )

helm/benchmark/metrics/image_generation/detection_metrics.py ADDED Viewed

@@ -0,0 +1,57 @@
+from typing import List, Dict, Any
+import json
+from statistics import mean
+from helm.common.request import RequestResult
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+from helm.benchmark.metrics.statistic import Stat
+from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.common.multimodal_request_utils import gather_generated_image_locations
+from helm.benchmark.metrics.image_generation.detectors.vitdet import ViTDetDetector
+class DetectionMetric(Metric):
+    """
+    Define metrics following DALL-EVAL (https://arxiv.org/abs/2202.04053),
+    which measure whether generated images contain the correct objects, counts, and relations
+    as specified in input text prompts.
+    """
+    def __init__(self):
+        self._detection_model = None
+    def __repr__(self):
+        return "DetectionMetric()"
+    def evaluate_generation(
+        self,
+        adapter_spec: AdapterSpec,
+        request_state: RequestState,
+        metric_service: MetricService,
+        eval_cache_path: str,
+    ) -> List[Stat]:
+        assert request_state.result is not None
+        request_result: RequestResult = request_state.result
+        image_locations: List[str] = gather_generated_image_locations(request_result)
+        if len(image_locations) == 0:
+            return []
+        if self._detection_model is None:
+            self._detection_model = ViTDetDetector()
+        instance = request_state.instance
+        references: Dict[str, Any] = {**json.loads(instance.references[0].output.text), "skill": instance.sub_split}
+        prompt: str = request_state.request.prompt
+        scores: List[float] = []
+        for image_location in image_locations:
+            score: float = self._detection_model.compute_score(prompt, image_location, references)
+            scores.append(score)
+        stats: List[Stat] = [
+            Stat(MetricName("detection_correct_frac")).add(mean(scores) if len(scores) > 0 else 0),
+        ]
+        return stats

helm/benchmark/metrics/image_generation/detectors/base_detector.py ADDED Viewed

@@ -0,0 +1,8 @@
+from abc import abstractmethod, ABC
+from typing import Any, Dict
+class BaseDetector(ABC):
+    @abstractmethod
+    def compute_score(self, caption: str, image_location: str, references: Dict[str, Any]) -> float:
+        pass

helm/benchmark/metrics/image_generation/detectors/vitdet.py ADDED Viewed

@@ -0,0 +1,178 @@
+import os
+from typing import Dict, Any
+import torch
+from helm.benchmark.runner import get_cached_models_path
+from helm.common.general import ensure_file_downloaded, hlog
+from helm.common.images_utils import open_image
+from helm.common.optional_dependencies import handle_module_not_found_error
+from helm.common.gpu_utils import get_torch_device
+from helm.benchmark.metrics.image_generation.detectors.base_detector import BaseDetector
+# Source URL of the ViTDet model config; ViTDetDetector stores it as "vitdet_model.yaml" under the cached-models path.
+MODEL_CONFIG_DOWNLOAD_URL: str = "https://drive.google.com/uc?id=1MLuwQ0ZN0gJQ42oVCc0aFz6Rneb1g3Rt"
+# Source URL of the ViTDet checkpoint; ViTDetDetector stores it as "vitdet_model.pkl" under the cached-models path.
+MODEL_CHECKPOINT_DOWNLOAD_URL: str = (
+    "https://dl.fbaipublicfiles.com/detectron2/ViTDet/COCO/mask_rcnn_vitdet_b/f325346929/model_final_61ccd1.pkl"
+)
+class ViTDetDetector(BaseDetector):
+    def __init__(self):
+        try:
+            from detectron2.checkpoint import DetectionCheckpointer
+            from detectron2.config import LazyConfig
+            from detectron2.config import instantiate
+            from detectron2.data.catalog import MetadataCatalog
+        except ModuleNotFoundError as e:
+            handle_module_not_found_error(e, ["heim"])
+        super().__init__()
+        cache_path: str = get_cached_models_path()
+        cfg_path: str = os.path.join(cache_path, "vitdet_model.yaml")
+        ensure_file_downloaded(source_url=MODEL_CONFIG_DOWNLOAD_URL, target_path=cfg_path)
+        cfg = LazyConfig.load(cfg_path)
+        model_path: str = os.path.join(cache_path, "vitdet_model.pkl")
+        ensure_file_downloaded(source_url=MODEL_CHECKPOINT_DOWNLOAD_URL, target_path=model_path)
+        cfg.train.init_checkpoint = model_path
+        model = instantiate(cfg.model).cuda()
+        model = model.eval()
+        for p in model.parameters():
+            p.requires_grad = False
+        DetectionCheckpointer(model).load(cfg.train.init_checkpoint)
+        self._cfg = cfg
+        self._model = model
+        self._device: torch.device = get_torch_device()
+        hlog("Initialized the ViTDet model.")
+        # COCO classes
+        self._coco_classes = MetadataCatalog.get("coco_2017_val").thing_classes
+    def forward_model(self, image_location: str) -> float:
+        try:
+            from detectron2.data.common import DatasetFromList, MapDataset
+            from detectron2.config import instantiate
+        except ModuleNotFoundError as e:
+            handle_module_not_found_error(e, ["heim"])
+        image = open_image(image_location)
+        dataset_dicts = [
+            {
+                "file_name": image_location,
+                "width": image.width,
+                "height": image.height,
+            }
+        ]
+        dataset = DatasetFromList(dataset_dicts, copy=False)
+        mapper = instantiate(self._cfg.dataloader.test.mapper)
+        dataset = MapDataset(dataset, mapper)
+        inputs = [dataset[0]]
+        outputs = self._model(inputs)
+        return outputs[0]["instances"]
+    def compute_score(self, caption: str, image_location: str, references: Dict[str, Any]) -> float:
+        # hlog(f'compute score for prompt: {caption}, file: {image_location}, skill: {references["skill"]}')
+        instances = self.forward_model(image_location)
+        if references["skill"] == "object":
+            return self.compute_score_object(instances, references)
+        if references["skill"] == "count":
+            return self.compute_score_count(instances, references)
+        if references["skill"] == "spatial":
+            return self.compute_score_spatial(instances, references)
+        raise NotImplementedError(references["skill"])
+    def compute_score_object(self, instances, references):
+        gt_class_name = references["object"]
+        gt_class = self._coco_classes.index(gt_class_name)
+        if len(instances.scores) == 0:
+            pred_id = None
+            pred_score = torch.zeros(())
+            pred_class = None
+            pred_class_name = None
+            correct = 0.0
+        else:
+            pred_id = instances.scores.max(-1).indices
+            pred_score = instances.scores[pred_id]  # (num_instances,) -> ()    # noqa
+            pred_class = instances.pred_classes[pred_id]  # (num_instances,) -> ()
+            pred_class_name = self._coco_classes[pred_class.item()]  # noqa
+            correct = float(pred_class == gt_class)
+        # hlog(f"pred_class: {pred_class_name}, gt_class: {gt_class_name}, correct: {correct}")
+        return correct
+    def compute_score_count(self, instances, references):
+        # assume that there is only one type of object
+        gt_class_name = references["object"]
+        gt_class_idx = self._coco_classes.index(gt_class_name)
+        gt_count = references["count"]
+        if len(instances.scores) == 0:
+            pred_count = 0
+            correct = 0.0
+        else:
+            pred_count = (instances.pred_classes == gt_class_idx).sum().item()
+            correct = float(pred_count == gt_count)
+        return correct
+    def compute_score_spatial(self, instances, references):
+        gt_class_name_1, gt_class_name_2 = references["objects"]
+        gt_class_idx_1 = self._coco_classes.index(gt_class_name_1)
+        gt_class_idx_2 = self._coco_classes.index(gt_class_name_2)
+        relation = references["relation"].split("_")[0]
+        if len(instances.scores) == 0:
+            correct = 0
+            pred_rel = "no_pred"
+        else:
+            pred_count_1 = (instances.pred_classes == gt_class_idx_1).sum().item()
+            pred_count_2 = (instances.pred_classes == gt_class_idx_2).sum().item()
+            if pred_count_1 != 1 or pred_count_2 != 1:
+                correct = 0
+                pred_rel = "obj_count_mismatch"
+            else:
+                x11, y11 = instances.pred_boxes[instances.pred_classes == gt_class_idx_1].tensor[0, :2]
+                x21, y21 = instances.pred_boxes[instances.pred_classes == gt_class_idx_2].tensor[0, :2]
+                x_diff = x11 - x21
+                y_diff = y11 - y21
+                # FIXME: The code below mimics dall-eval logic. I don't think
+                # we need to follow it. Does the case of two objects of same
+                # category make sense? Also, I don't know why we need to
+                # to ensure something is more "right" than it is "above".
+                if gt_class_name_1 == gt_class_name_2:
+                    if abs(x_diff) > abs(y_diff):
+                        if relation in ["left", "right"]:
+                            correct = 1
+                            pred_rel = "relation_correct"
+                        else:
+                            pred_rel = "relation_incorrect"
+                            correct = 0
+                    else:
+                        if relation in ["above", "below"]:
+                            pred_rel = "relation_correct"
+                            correct = 1
+                        else:
+                            pred_rel = "relation_incorrect"
+                            correct = 0
+                else:
+                    if abs(x_diff) > abs(y_diff):
+                        if x11 < x21:
+                            pred_rel = "right"
+                        else:
+                            pred_rel = "left"
+                    else:
+                        if y11 > y21:
+                            pred_rel = "above"
+                        else:
+                            pred_rel = "below"
+                    if relation == pred_rel:
+                        correct = 1
+                    else:
+                        correct = 0
+        return correct

helm/benchmark/metrics/image_generation/efficiency_metrics.py ADDED Viewed

@@ -0,0 +1,41 @@
+from typing import List
+from helm.common.request import RequestResult
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+from helm.benchmark.metrics.statistic import Stat
+from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.common.multimodal_request_utils import gather_generated_image_locations
+class EfficiencyMetric(Metric):
+    """
+    Defines the efficiency metrics for text-to-image models.
+    """
+    def __repr__(self):
+        return "EfficiencyMetric()"
+    def evaluate_generation(
+        self,
+        adapter_spec: AdapterSpec,
+        request_state: RequestState,
+        metric_service: MetricService,
+        eval_cache_path: str,
+    ) -> List[Stat]:
+        prompt: str = request_state.request.prompt
+        assert request_state.result is not None
+        request_result: RequestResult = request_state.result
+        image_locations: List[str] = gather_generated_image_locations(request_result)
+        if len(image_locations) == 0:
+            return []
+        # inference_runtime is computed in BasicMetric
+        stats: List[Stat] = [
+            Stat(MetricName("prompt_length")).add(len(prompt)),
+            Stat(MetricName("num_generated_images")).add(len(request_result.completions)),
+        ]
+        return stats

helm/benchmark/metrics/image_generation/fidelity_metrics.py ADDED Viewed

@@ -0,0 +1,168 @@
+from tqdm import tqdm
+from typing import Dict, List, Set, Optional
+import math
+import os
+import shutil
+from helm.common.general import ensure_directory_exists, generate_unique_id, get_file_name, hlog
+from helm.common.gpu_utils import is_cuda_available, get_torch_device
+from helm.common.request import RequestResult
+from helm.benchmark.augmentations.perturbation_description import PerturbationDescription
+from helm.benchmark.scenarios.scenario import Instance
+from helm.benchmark.adaptation.scenario_state import ScenarioState
+from helm.benchmark.metrics.statistic import Stat
+from helm.benchmark.metrics.metric import MetricInterface, MetricResult
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.common.images_utils import is_blacked_out_image, copy_image
+from helm.common.optional_dependencies import handle_module_not_found_error
+class FidelityMetric(MetricInterface):
+    """
+    Frechet Inception Distance (FID) is a measure of similarity between two sets of images.
+    Inception Score (IS) measures quality and diversity of images.
+    Both metrics require a large number of samples to compute.
+    @misc{Seitzer2020FID,
+      author={Maximilian Seitzer},
+      title={{pytorch-fid: FID Score for PyTorch}},
+      month={August},
+      year={2020},
+      note={Version 0.3.0},
+      howpublished={https://github.com/mseitzer/pytorch-fid},
+    }
+    @misc{obukhov2020torchfidelity,
+      author={Anton Obukhov and Maximilian Seitzer and Po-Wei Wu and Semen Zhydenko and Jonathan Kyl
+              and Elvis Yu-Jing Lin},
+      year=2020,
+      title={High-fidelity performance metrics for generative models in PyTorch},
+      url={https://github.com/toshas/torch-fidelity},
+      publisher={Zenodo},
+      version={v0.3.0},
+      doi={10.5281/zenodo.4957738},
+      note={Version: 0.3.0, DOI: 10.5281/zenodo.4957738}
+    }
+    """
+    IMAGE_WIDTH: int = 512
+    IMAGE_HEIGHT: int = 512
+    def __repr__(self):
+        return "FidelityMetric()"
+    def evaluate(
+        self,
+        scenario_state: ScenarioState,
+        metric_service: MetricService,
+        eval_cache_path: str,
+        parallelism: int,
+    ) -> MetricResult:
+        try:
+            import torch_fidelity
+            from pytorch_fid.fid_score import calculate_fid_given_paths
+        except ModuleNotFoundError as e:
+            handle_module_not_found_error(e, ["heim"])
+        dest_path: str
+        unique_perturbations: Set[Optional[PerturbationDescription]] = set()
+        gold_images_path: str = os.path.join(eval_cache_path, generate_unique_id())
+        ensure_directory_exists(gold_images_path)
+        # The library requires the gold and generated images to be in two separate directories.
+        # Gather the gold images and the unique perturbations
+        num_gold_images: int = 0
+        for request_state in tqdm(scenario_state.request_states):
+            instance: Instance = request_state.instance
+            unique_perturbations.add(instance.perturbation)
+            for reference in instance.references:
+                if not reference.is_correct:
+                    continue
+                assert (
+                    reference.output.multimedia_content is not None
+                    and reference.output.multimedia_content.media_objects[0].location is not None
+                )
+                file_path: str = reference.output.multimedia_content.media_objects[0].location
+                dest_path = os.path.join(gold_images_path, get_file_name(file_path))
+                copy_image(file_path, dest_path, width=self.IMAGE_WIDTH, height=self.IMAGE_HEIGHT)
+                num_gold_images += 1
+        hlog(f"Resized {num_gold_images} gold images to {self.IMAGE_WIDTH}x{self.IMAGE_HEIGHT}.")
+        # Compute the FID for each perturbation group
+        stats: List[Stat] = []
+        for perturbation in unique_perturbations:
+            perturbation_name: str = "" if perturbation is None else str(perturbation)
+            generated_images_path: str = os.path.join(eval_cache_path, generate_unique_id())
+            ensure_directory_exists(generated_images_path)
+            num_generated_images: int = 0
+            for request_state in tqdm(scenario_state.request_states):
+                if request_state.instance.perturbation != perturbation:
+                    continue
+                assert request_state.result is not None
+                request_result: RequestResult = request_state.result
+                # Gather the model-generated images
+                for image in request_result.completions:
+                    assert image.multimodal_content is not None
+                    location = image.multimodal_content.media_objects[0].location
+                    if location is not None and not is_blacked_out_image(location):
+                        dest_path = os.path.join(generated_images_path, get_file_name(location))
+                        copy_image(location, dest_path, width=self.IMAGE_WIDTH, height=self.IMAGE_HEIGHT)
+                        num_generated_images += 1
+            compute_kid: bool = num_generated_images >= 1000
+            hlog(f"Resized {num_generated_images} images to {self.IMAGE_WIDTH}x{self.IMAGE_HEIGHT}.")
+            try:
+                hlog(f"Computing FID between {generated_images_path} and {gold_images_path}...")
+                fid: float = calculate_fid_given_paths(
+                    paths=[generated_images_path, gold_images_path],
+                    device=get_torch_device(),
+                    # Following defaults set in
+                    # https://github.com/mseitzer/pytorch-fid/blob/master/src/pytorch_fid/fid_score.py#L54
+                    batch_size=50,
+                    dims=2048,
+                    num_workers=8,
+                )
+                hlog(f"Done. FID score: {fid}")
+                # The torch_fidelity library fails when there are too few images (i.e., `max_eval_instances` is small).
+                hlog("Computing the other fidelity metrics...")
+                metrics_dict: Dict[str, float] = torch_fidelity.calculate_metrics(
+                    input1=generated_images_path,
+                    input2=gold_images_path,
+                    isc=True,
+                    fid=False,
+                    kid=compute_kid,
+                    ppl=False,  # Requires `GenerativeModel`
+                    cuda=is_cuda_available(),
+                    save_cpu_ram=not is_cuda_available(),
+                )
+                inception_score: float = metrics_dict["inception_score_mean"]
+                if math.isnan(inception_score):
+                    inception_score = 0
+                stats.extend(
+                    [
+                        Stat(MetricName("fid", perturbation=perturbation)).add(fid),
+                        Stat(MetricName("inception_score", perturbation=perturbation)).add(inception_score),
+                    ]
+                )
+                if compute_kid:
+                    kid: float = metrics_dict["kernel_inception_distance_mean"]
+                    stats.append(Stat(MetricName("kernel_inception_distance", perturbation=perturbation)).add(kid))
+            except AssertionError as e:
+                hlog(f"Error occurred when computing fidelity metrics for perturbation: {perturbation_name} Error: {e}")
+            shutil.rmtree(generated_images_path)
+        # Delete the gold images directory
+        shutil.rmtree(gold_images_path)
+        return MetricResult(aggregated_stats=stats, per_instance_stats=[])

helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py ADDED Viewed

File without changes

helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py ADDED Viewed

@@ -0,0 +1,63 @@
+import numpy as np
+from helm.common.optional_dependencies import handle_module_not_found_error
+def compute_fractal_dimension(image_path: str) -> float:
+    """
+    Compute the fractal coefficient of an image.
+    From https://en.wikipedia.org/wiki/Minkowski–Bouligand_dimension, in fractal
+    geometry, the Minkowski–Bouligand dimension, also known as Minkowski dimension
+    or box-counting dimension, is a way of determining the fractal dimension of a
+    set S in a Euclidean space Rn, or more generally in a metric space (X, d).
+    Adapted from https://gist.github.com/viveksck/1110dfca01e4ec2c608515f0d5a5b1d1.
+    :param image_path: Path to the image.
+    """
+    # Box-counting estimate for a 2-D array Z. Entries below `threshold` form the set being measured.
+    def fractal_dimension(Z, threshold=0.2):
+        # Only for 2d image
+        assert len(Z.shape) == 2
+        # From https://github.com/rougier/numpy-100 (#87)
+        # Sums Z over k-by-k boxes (boxes at the right/bottom edge may be smaller) and counts the mixed ones.
+        def boxcount(Z, k):
+            S = np.add.reduceat(
+                np.add.reduceat(Z, np.arange(0, Z.shape[0], k), axis=0), np.arange(0, Z.shape[1], k), axis=1
+            )
+            # Count the boxes that are neither empty (sum 0) nor full (sum k*k)
+            return len(np.where((S > 0) & (S < k * k))[0])
+        # Transform Z into a binary array
+        Z = Z < threshold
+        # Minimal dimension of image
+        p = min(Z.shape)
+        # Greatest power of 2 less than or equal to p
+        n = 2 ** np.floor(np.log(p) / np.log(2))
+        # Extract the exponent
+        n = int(np.log(n) / np.log(2))
+        # Build successive box sizes (from 2**n down to 2**2; np.arange stops before exponent 1)
+        sizes = 2 ** np.arange(n, 1, -1)
+        # Actual box counting with decreasing size
+        counts = []
+        for size in sizes:
+            counts.append(boxcount(Z, size))
+        # Fit the successive log(sizes) with log (counts)
+        # NOTE(review): a box size with zero mixed boxes gives log(0) = -inf here -- confirm inputs avoid this.
+        coeffs = np.polyfit(np.log(sizes), np.log(counts), 1)
+        # The estimate is the negated slope of the degree-1 fit.
+        return -coeffs[0]
+    try:
+        import cv2
+    except ModuleNotFoundError as e:
+        handle_module_not_found_error(e, ["heim"])
+    # Flag 0 requests a grayscale read; dividing by 255 rescales the pixel values to [0, 1].
+    # NOTE(review): cv2.imread presumably returns None for an unreadable path, in which case the
+    # division raises TypeError -- confirm callers only pass readable image paths.
+    image: np.ndarray = cv2.imread(image_path, 0) / 255.0  # type: ignore
+    assert image.min() >= 0 and image.max() <= 1
+    return fractal_dimension(image)

helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py ADDED Viewed

@@ -0,0 +1,33 @@
+import os
+from helm.benchmark.metrics.image_generation.fractal_dimension.fractal_dimension_util import compute_fractal_dimension
+def fractal_dimension_test(image_filename: str, expected_fractal_dimension: float):
+    image_path: str = os.path.join(os.path.dirname(__file__), "test_images", image_filename)
+    dim: float = compute_fractal_dimension(image_path)
+    assert round(dim, 2) == expected_fractal_dimension
+# Test cases are inspired by https://www.sciencedirect.com/science/article/pii/S0097849303001547
+def test_compute_fractal_dimension_cloud():
+    # Clouds have a fractal dimension (D) of 1.30-1.33.
+    fractal_dimension_test("cloud.png", 1.34)
+def test_compute_fractal_dimension_sea_anemone():
+    # Sea anemones have a D of 1.6.
+    fractal_dimension_test("sea_anemone.png", 1.54)
+def test_compute_fractal_dimension_snowflake():
+    # Snowflakes have a D of 1.7.
+    fractal_dimension_test("snowflakes.png", 1.69)
+def test_compute_fractal_dimension_convergence():
+    # "Pollock continued to drip paint for a period lasting up to six months, depositing layer upon layer,
+    # and gradually creating a highly dense fractal pattern. As a result, the D value of his paintings rose
+    # gradually as they neared completion, starting in the range of 1.3–1.5 for the initial springboard layer
+    # and reaching a final value as high as 1.9". Convergence was produced in 1952 by Jackson Pollock.
+    fractal_dimension_test("convergence.png", 1.83)

crfm-helm 0.4.0__py3-none-any.whl → 0.5.10__py3-none-any.whl

Potentially problematic release.

crfm-helm 0.4.0py3-none-any.whl → 0.5.10py3-none-any.whl