crfm-helm 0.4.0__py3-none-any.whl → 0.5.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crfm-helm might be problematic. Click here for more details.
- crfm_helm-0.5.10.dist-info/METADATA +369 -0
- crfm_helm-0.5.10.dist-info/RECORD +1008 -0
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +80 -29
- helm/benchmark/adaptation/adapters/adapter.py +2 -2
- helm/benchmark/adaptation/adapters/adapter_factory.py +39 -28
- helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
- helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
- helm/benchmark/adaptation/adapters/generation_adapter.py +2 -1
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +24 -8
- helm/benchmark/adaptation/adapters/language_modeling_adapter.py +3 -4
- helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +2 -1
- helm/benchmark/adaptation/adapters/multimodal/multimodal_prompt.py +7 -0
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +112 -0
- helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +6 -3
- helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +3 -1
- helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +18 -8
- helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
- helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/test_adapter.py +5 -4
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +46 -22
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +17 -29
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +138 -16
- helm/benchmark/adaptation/common_adapter_specs.py +443 -0
- helm/benchmark/adaptation/prompt.py +1 -1
- helm/benchmark/adaptation/request_state.py +6 -1
- helm/benchmark/adaptation/scenario_state.py +6 -2
- helm/benchmark/annotation/aci_bench_annotator.py +84 -0
- helm/benchmark/annotation/air_bench_annotator.py +79 -0
- helm/benchmark/annotation/alrage_annotator.py +90 -0
- helm/benchmark/annotation/annotator.py +48 -0
- helm/benchmark/annotation/annotator_factory.py +50 -0
- helm/benchmark/annotation/anthropic_red_team_annotator.py +57 -0
- helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
- helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
- helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
- helm/benchmark/annotation/bird_sql_annotator.py +58 -0
- helm/benchmark/annotation/call_center_annotator.py +258 -0
- helm/benchmark/annotation/chw_care_plan_annotator.py +82 -0
- helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
- helm/benchmark/annotation/dischargeme_annotator.py +96 -0
- helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
- helm/benchmark/annotation/financebench_annotator.py +79 -0
- helm/benchmark/annotation/harm_bench_annotator.py +55 -0
- helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
- helm/benchmark/annotation/image2struct/image_compiler_annotator.py +93 -0
- helm/benchmark/annotation/image2struct/latex_compiler_annotator.py +59 -0
- helm/benchmark/annotation/image2struct/lilypond_compiler_annotator.py +86 -0
- helm/benchmark/annotation/image2struct/webpage_compiler_annotator.py +132 -0
- helm/benchmark/annotation/live_qa_annotator.py +76 -0
- helm/benchmark/annotation/med_dialog_annotator.py +88 -0
- helm/benchmark/annotation/medalign_annotator.py +89 -0
- helm/benchmark/annotation/medi_qa_annotator.py +87 -0
- helm/benchmark/annotation/medication_qa_annotator.py +86 -0
- helm/benchmark/annotation/mental_health_annotator.py +87 -0
- helm/benchmark/annotation/mimic_bhc_annotator.py +89 -0
- helm/benchmark/annotation/mimic_rrs_annotator.py +89 -0
- helm/benchmark/annotation/model_as_judge.py +309 -0
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +87 -0
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +90 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
- helm/benchmark/annotation/omni_math_annotator.py +131 -0
- helm/benchmark/annotation/simple_safety_tests_annotator.py +50 -0
- helm/benchmark/annotation/spider_annotator.py +18 -0
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +87 -0
- helm/benchmark/annotation/test_annotator_factory.py +26 -0
- helm/benchmark/annotation/test_dummy_annotator.py +44 -0
- helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
- helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
- helm/benchmark/annotation/wildbench_annotator.py +119 -0
- helm/benchmark/annotation/xstest_annotator.py +100 -0
- helm/benchmark/annotation_executor.py +144 -0
- helm/benchmark/augmentations/cleva_perturbation.py +9 -8
- helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
- helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
- helm/benchmark/augmentations/data_augmenter.py +0 -2
- helm/benchmark/augmentations/dialect_perturbation.py +4 -5
- helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
- helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
- helm/benchmark/augmentations/gender_perturbation.py +3 -3
- helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
- helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
- helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
- helm/benchmark/augmentations/person_name_perturbation.py +4 -5
- helm/benchmark/augmentations/perturbation.py +26 -4
- helm/benchmark/augmentations/perturbation_description.py +1 -1
- helm/benchmark/augmentations/space_perturbation.py +2 -2
- helm/benchmark/augmentations/suffix_perturbation.py +29 -0
- helm/benchmark/augmentations/synonym_perturbation.py +4 -3
- helm/benchmark/augmentations/test_perturbation.py +56 -19
- helm/benchmark/augmentations/translate_perturbation.py +31 -0
- helm/benchmark/augmentations/typos_perturbation.py +2 -2
- helm/benchmark/config_registry.py +7 -1
- helm/benchmark/data_preprocessor.py +2 -2
- helm/benchmark/executor.py +54 -25
- helm/benchmark/huggingface_registration.py +28 -10
- helm/benchmark/metrics/air_bench_metrics.py +3212 -0
- helm/benchmark/metrics/alrage_metric.py +35 -0
- helm/benchmark/metrics/annotation_metrics.py +108 -0
- helm/benchmark/metrics/basic_metrics.py +437 -667
- helm/benchmark/metrics/bbq_metrics.py +17 -6
- helm/benchmark/metrics/bias_metrics.py +18 -9
- helm/benchmark/metrics/bias_word_lists.py +1 -1
- helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
- helm/benchmark/metrics/bird_sql_metrics.py +28 -0
- helm/benchmark/metrics/classification_metrics.py +107 -22
- helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
- helm/benchmark/metrics/cleva_harms_metrics.py +12 -11
- helm/benchmark/metrics/code_metrics.py +5 -5
- helm/benchmark/metrics/code_metrics_helper.py +11 -3
- helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
- helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
- helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
- helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
- helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
- helm/benchmark/metrics/comet_metric.py +125 -0
- helm/benchmark/metrics/common_metric_specs.py +174 -0
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +83 -0
- helm/benchmark/metrics/copyright_metrics.py +5 -5
- helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
- helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
- helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
- helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
- helm/benchmark/metrics/disinformation_metrics.py +8 -114
- helm/benchmark/metrics/dry_run_metrics.py +35 -6
- helm/benchmark/metrics/efficiency_metrics.py +287 -0
- helm/benchmark/metrics/ehr_sql_metrics.py +159 -0
- helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
- helm/benchmark/metrics/evaluate_reference_metrics.py +831 -0
- helm/benchmark/metrics/fin_qa_metrics.py +60 -0
- helm/benchmark/metrics/fin_qa_metrics_helper.py +398 -0
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +115 -0
- helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
- helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
- helm/benchmark/metrics/gpt4v_originality_critique_metrics.py +126 -0
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +48 -0
- helm/benchmark/metrics/ifeval/instructions.py +1574 -0
- helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
- helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
- helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
- helm/benchmark/metrics/ifeval_metrics.py +67 -0
- helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
- helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
- helm/benchmark/metrics/image_generation/clip_score_metrics.py +84 -0
- helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
- helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
- helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
- helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
- helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
- helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
- helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
- helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
- helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
- helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
- helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
- helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
- helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
- helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
- helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
- helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
- helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
- helm/benchmark/metrics/image_generation/q16/test_q16.py +20 -0
- helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
- helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
- helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
- helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
- helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
- helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
- helm/benchmark/metrics/instruction_following_critique_metrics.py +48 -5
- helm/benchmark/metrics/kpi_edgar_metrics.py +142 -0
- helm/benchmark/metrics/language_modeling_metrics.py +111 -0
- helm/benchmark/metrics/live_qa_metrics.py +35 -0
- helm/benchmark/metrics/llm_jury_metrics.py +58 -0
- helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
- helm/benchmark/metrics/lmkt_metrics.py +47 -0
- helm/benchmark/metrics/machine_translation_metrics.py +89 -0
- helm/benchmark/metrics/medcalc_bench_metrics.py +137 -0
- helm/benchmark/metrics/medec_metrics.py +124 -0
- helm/benchmark/metrics/melt_bias_metric.py +234 -0
- helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
- helm/benchmark/metrics/melt_metric_specs.py +43 -0
- helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
- helm/benchmark/metrics/metric.py +121 -175
- helm/benchmark/metrics/metric_name.py +0 -1
- helm/benchmark/metrics/metric_service.py +23 -7
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +127 -0
- helm/benchmark/metrics/nltk_helper.py +32 -0
- helm/benchmark/metrics/omni_math_metrics.py +44 -0
- helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
- helm/benchmark/metrics/output_processing_metric.py +60 -0
- helm/benchmark/metrics/output_processors.py +15 -0
- helm/benchmark/metrics/paraphrase_generation_metrics.py +5 -6
- helm/benchmark/metrics/prometheus_vision_critique_metrics.py +185 -0
- helm/benchmark/metrics/ranking_metrics.py +5 -5
- helm/benchmark/metrics/reference_metric.py +148 -0
- helm/benchmark/metrics/reka_vibe_critique_metrics.py +158 -0
- helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
- helm/benchmark/metrics/safety_metrics.py +91 -0
- helm/benchmark/metrics/seahelm_metrics.py +201 -0
- helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
- helm/benchmark/metrics/spider_metrics.py +7 -0
- helm/benchmark/metrics/statistic.py +1 -1
- helm/benchmark/metrics/summac/model_summac.py +8 -11
- helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
- helm/benchmark/metrics/summarization_metrics.py +150 -11
- helm/benchmark/metrics/test_bias_metrics.py +5 -1
- helm/benchmark/metrics/test_classification_metrics.py +145 -70
- helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
- helm/benchmark/metrics/{test_basic_metrics.py → test_evaluate_reference_metrics.py} +20 -1
- helm/benchmark/metrics/test_metric.py +3 -3
- helm/benchmark/metrics/test_statistic.py +2 -2
- helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
- helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +11 -3
- helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +3 -3
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +7 -7
- helm/benchmark/metrics/toxicity_metrics.py +37 -7
- helm/benchmark/metrics/toxicity_utils.py +23 -0
- helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py +52 -0
- helm/benchmark/metrics/unitxt_metrics.py +107 -0
- helm/benchmark/metrics/vision_language/__init__.py +0 -0
- helm/benchmark/metrics/vision_language/emd_utils.py +347 -0
- helm/benchmark/metrics/vision_language/image_metrics.py +537 -0
- helm/benchmark/metrics/vision_language/image_utils.py +100 -0
- helm/benchmark/metrics/wildbench_metrics.py +54 -0
- helm/benchmark/model_deployment_registry.py +69 -5
- helm/benchmark/model_metadata_registry.py +58 -2
- helm/benchmark/multi_gpu_runner.py +133 -0
- helm/benchmark/presentation/contamination.py +3 -3
- helm/benchmark/presentation/create_plots.py +51 -20
- helm/benchmark/presentation/run_display.py +51 -12
- helm/benchmark/presentation/run_entry.py +2 -2
- helm/benchmark/presentation/schema.py +83 -66
- helm/benchmark/presentation/summarize.py +483 -388
- helm/benchmark/presentation/table.py +8 -8
- helm/benchmark/presentation/taxonomy_info.py +20 -0
- helm/benchmark/presentation/test_contamination.py +2 -2
- helm/benchmark/presentation/test_create_plots.py +4 -1
- helm/benchmark/presentation/test_run_entry.py +2 -2
- helm/benchmark/presentation/test_schema.py +11 -0
- helm/benchmark/presentation/test_summarize.py +148 -6
- helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
- helm/benchmark/reeval_run.py +202 -0
- helm/benchmark/reeval_runner.py +355 -0
- helm/benchmark/run.py +151 -87
- helm/benchmark/run_expander.py +418 -33
- helm/benchmark/run_spec.py +93 -0
- helm/benchmark/run_spec_factory.py +180 -0
- helm/benchmark/run_specs/__init__.py +0 -0
- helm/benchmark/run_specs/air_bench_run_specs.py +58 -0
- helm/benchmark/run_specs/arabic_run_specs.py +197 -0
- helm/benchmark/run_specs/audio_run_specs.py +657 -0
- helm/benchmark/run_specs/bluex_run_specs.py +40 -0
- helm/benchmark/run_specs/call_center_run_specs.py +201 -0
- helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
- helm/benchmark/run_specs/classic_run_specs.py +1393 -0
- helm/benchmark/run_specs/cleva_run_specs.py +277 -0
- helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
- helm/benchmark/run_specs/decodingtrust_run_specs.py +316 -0
- helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
- helm/benchmark/run_specs/enterprise_run_specs.py +280 -0
- helm/benchmark/run_specs/experimental_run_specs.py +224 -0
- helm/benchmark/run_specs/finance_run_specs.py +114 -0
- helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
- helm/benchmark/run_specs/heim_run_specs.py +625 -0
- helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
- helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
- helm/benchmark/run_specs/lite_run_specs.py +307 -0
- helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
- helm/benchmark/run_specs/long_context_run_specs.py +188 -0
- helm/benchmark/run_specs/medhelm/__init__.py +0 -0
- helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +1570 -0
- helm/benchmark/run_specs/melt_run_specs.py +783 -0
- helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
- helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
- helm/benchmark/run_specs/oab_exams_specs.py +32 -0
- helm/benchmark/run_specs/safety_run_specs.py +191 -0
- helm/benchmark/run_specs/seahelm_run_specs.py +652 -0
- helm/benchmark/run_specs/simple_run_specs.py +104 -0
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +167 -0
- helm/benchmark/run_specs/sql_run_specs.py +54 -0
- helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
- helm/benchmark/run_specs/unitxt_run_specs.py +51 -0
- helm/benchmark/run_specs/vlm_run_specs.py +1057 -0
- helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
- helm/benchmark/runner.py +63 -62
- helm/benchmark/runner_config_registry.py +21 -0
- helm/benchmark/scenarios/aci_bench_scenario.py +149 -0
- helm/benchmark/scenarios/air_bench_scenario.py +76 -0
- helm/benchmark/scenarios/alghafa_scenario.py +126 -0
- helm/benchmark/scenarios/alrage_scenario.py +54 -0
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +27 -3
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +82 -0
- helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
- helm/benchmark/scenarios/arabic_mmlu_scenario.py +82 -0
- helm/benchmark/scenarios/aratrust_scenario.py +95 -0
- helm/benchmark/scenarios/audio_language/__init__.py +0 -0
- helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +130 -0
- helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
- helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
- helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
- helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
- helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
- helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
- helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
- helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
- helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
- helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +74 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +70 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +79 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +78 -0
- helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
- helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +105 -0
- helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
- helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
- helm/benchmark/scenarios/babi_qa_scenario.py +16 -1
- helm/benchmark/scenarios/banking77_scenario.py +77 -0
- helm/benchmark/scenarios/bbq_scenario.py +17 -2
- helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
- helm/benchmark/scenarios/big_bench_scenario.py +11 -1
- helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
- helm/benchmark/scenarios/bird_sql_scenario.py +112 -0
- helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
- helm/benchmark/scenarios/blimp_scenario.py +1 -1
- helm/benchmark/scenarios/bluex_scenario.py +70 -0
- helm/benchmark/scenarios/bold_scenario.py +18 -3
- helm/benchmark/scenarios/boolq_scenario.py +21 -1
- helm/benchmark/scenarios/call_center_scenario.py +84 -0
- helm/benchmark/scenarios/casehold_scenario.py +79 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +129 -0
- helm/benchmark/scenarios/ci_mcqa_scenario.py +80 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +14 -1
- helm/benchmark/scenarios/clear_scenario.py +180 -0
- helm/benchmark/scenarios/cleva_scenario.py +482 -3
- helm/benchmark/scenarios/code_scenario.py +46 -4
- helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
- helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
- helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
- helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
- helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
- helm/benchmark/scenarios/commonsense_scenario.py +33 -1
- helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +118 -0
- helm/benchmark/scenarios/copyright_scenario.py +35 -1
- helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +261 -0
- helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
- helm/benchmark/scenarios/czech_bank_qa_scenario.py +148 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +190 -0
- helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +143 -0
- helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +98 -0
- helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +344 -0
- helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +217 -0
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +571 -0
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +80 -0
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +90 -0
- helm/benchmark/scenarios/dialogue_scenarios.py +13 -3
- helm/benchmark/scenarios/dischargeme_scenario.py +196 -0
- helm/benchmark/scenarios/disinformation_scenario.py +32 -1
- helm/benchmark/scenarios/dyck_language_scenario.py +25 -1
- helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
- helm/benchmark/scenarios/ehr_sql_scenario.py +137 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +1541 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +77 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +33 -3
- helm/benchmark/scenarios/entity_matching_scenario.py +26 -2
- helm/benchmark/scenarios/ewok_scenario.py +116 -0
- helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
- helm/benchmark/scenarios/fin_qa_scenario.py +139 -0
- helm/benchmark/scenarios/financebench_scenario.py +74 -0
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +115 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +145 -0
- helm/benchmark/scenarios/gpqa_scenario.py +98 -0
- helm/benchmark/scenarios/grammar.py +2 -2
- helm/benchmark/scenarios/grammar_scenario.py +21 -2
- helm/benchmark/scenarios/gsm_scenario.py +31 -1
- helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +61 -0
- helm/benchmark/scenarios/harm_bench_scenario.py +70 -0
- helm/benchmark/scenarios/headqa_scenario.py +158 -0
- helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +50 -0
- helm/benchmark/scenarios/ice_scenario.py +28 -4
- helm/benchmark/scenarios/ifeval_scenario.py +71 -0
- helm/benchmark/scenarios/image_generation/__init__.py +0 -0
- helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
- helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
- helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
- helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
- helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
- helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
- helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
- helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
- helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
- helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
- helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
- helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
- helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
- helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
- helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
- helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
- helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
- helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
- helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
- helm/benchmark/scenarios/imdb_scenario.py +26 -3
- helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +111 -0
- helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
- helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +98 -0
- helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
- helm/benchmark/scenarios/koala_scenario.py +21 -1
- helm/benchmark/scenarios/kpi_edgar_scenario.py +172 -0
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +149 -0
- helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +61 -1
- helm/benchmark/scenarios/legal_support_scenario.py +24 -1
- helm/benchmark/scenarios/legalbench_scenario.py +45 -3
- helm/benchmark/scenarios/lex_glue_scenario.py +23 -2
- helm/benchmark/scenarios/lextreme_scenario.py +22 -1
- helm/benchmark/scenarios/live_qa_scenario.py +94 -0
- helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
- helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
- helm/benchmark/scenarios/lsat_qa_scenario.py +15 -1
- helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
- helm/benchmark/scenarios/math_scenario.py +81 -22
- helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
- helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
- helm/benchmark/scenarios/med_dialog_scenario.py +56 -22
- helm/benchmark/scenarios/med_mcqa_scenario.py +24 -1
- helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
- helm/benchmark/scenarios/med_qa_scenario.py +30 -1
- helm/benchmark/scenarios/medalign_scenario.py +117 -0
- helm/benchmark/scenarios/medalign_scenario_helper.py +326 -0
- helm/benchmark/scenarios/medbullets_scenario.py +167 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +149 -0
- helm/benchmark/scenarios/medec_scenario.py +148 -0
- helm/benchmark/scenarios/medhallu_scenario.py +95 -0
- helm/benchmark/scenarios/medhelm/__init__.py +0 -0
- helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
- helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +134 -0
- helm/benchmark/scenarios/medication_qa_scenario.py +96 -0
- helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
- helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
- helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
- helm/benchmark/scenarios/melt_scenarios.py +793 -0
- helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
- helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
- helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
- helm/benchmark/scenarios/mental_health_scenario.py +146 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +127 -0
- helm/benchmark/scenarios/mimic_rrs_scenario.py +121 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +99 -0
- helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +113 -0
- helm/benchmark/scenarios/mmlu_scenario.py +32 -1
- helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
- helm/benchmark/scenarios/msmarco_scenario.py +31 -1
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +166 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +164 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +297 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +20 -1
- helm/benchmark/scenarios/natural_qa_scenario.py +33 -1
- helm/benchmark/scenarios/newsqa_scenario.py +1 -1
- helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
- helm/benchmark/scenarios/omni_math_scenario.py +71 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +33 -2
- helm/benchmark/scenarios/openai_mrcr_scenario.py +94 -0
- helm/benchmark/scenarios/opinions_qa_scenario.py +1 -5
- helm/benchmark/scenarios/pubmed_qa_scenario.py +81 -43
- helm/benchmark/scenarios/quac_scenario.py +24 -1
- helm/benchmark/scenarios/race_based_med_scenario.py +175 -0
- helm/benchmark/scenarios/raft_scenario.py +33 -3
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
- helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
- helm/benchmark/scenarios/ruler_qa_scenarios.py +128 -0
- helm/benchmark/scenarios/scenario.py +44 -1
- helm/benchmark/scenarios/seahelm_scenario.py +2295 -0
- helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +97 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +95 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +99 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +98 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +94 -0
- helm/benchmark/scenarios/shc_privacy_scenario.py +100 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +98 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +104 -0
- helm/benchmark/scenarios/shc_sei_scenario.py +94 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +98 -0
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +44 -0
- helm/benchmark/scenarios/simple_scenarios.py +122 -1
- helm/benchmark/scenarios/situation_prompts.yaml +49 -0
- helm/benchmark/scenarios/spider_scenario.py +109 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +119 -0
- helm/benchmark/scenarios/summarization_scenario.py +48 -1
- helm/benchmark/scenarios/sumosum_scenario.py +157 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +24 -1
- helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
- helm/benchmark/scenarios/test_air_bench_scenario.py +27 -0
- helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
- helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
- helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
- helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
- helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
- helm/benchmark/scenarios/test_commonsense_scenario.py +21 -0
- helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
- helm/benchmark/scenarios/test_ewok_scenario.py +29 -0
- helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
- helm/benchmark/scenarios/test_financebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
- helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
- helm/benchmark/scenarios/test_gsm_scenario.py +31 -0
- helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
- helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
- helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
- helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
- helm/benchmark/scenarios/test_legalbench_scenario.py +30 -0
- helm/benchmark/scenarios/test_math_scenario.py +4 -3
- helm/benchmark/scenarios/test_med_qa_scenario.py +30 -0
- helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
- helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
- helm/benchmark/scenarios/test_mmlu_scenario.py +33 -0
- helm/benchmark/scenarios/test_narrativeqa_scenario.py +73 -0
- helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
- helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
- helm/benchmark/scenarios/test_scenario.py +6 -3
- helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
- helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
- helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
- helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +239 -0
- helm/benchmark/scenarios/the_pile_scenario.py +13 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +26 -2
- helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
- helm/benchmark/scenarios/unitxt_scenario.py +62 -0
- helm/benchmark/scenarios/verifiability_judgment_scenario.py +4 -2
- helm/benchmark/scenarios/vicuna_scenario.py +22 -2
- helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +83 -0
- helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
- helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
- helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +135 -0
- helm/benchmark/scenarios/vision_language/exams_v_scenario.py +104 -0
- helm/benchmark/scenarios/vision_language/fair_face_scenario.py +136 -0
- helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +74 -0
- helm/benchmark/scenarios/vision_language/gqa_scenario.py +91 -0
- helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +94 -0
- helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
- helm/benchmark/scenarios/vision_language/image2struct/__init__.py +0 -0
- helm/benchmark/scenarios/vision_language/image2struct/chart2csv_scenario.py +55 -0
- helm/benchmark/scenarios/vision_language/image2struct/image2struct_scenario.py +225 -0
- helm/benchmark/scenarios/vision_language/image2struct/latex_scenario.py +21 -0
- helm/benchmark/scenarios/vision_language/image2struct/musicsheet_scenario.py +16 -0
- helm/benchmark/scenarios/vision_language/image2struct/utils_latex.py +339 -0
- helm/benchmark/scenarios/vision_language/image2struct/webpage/__init__.py +0 -0
- helm/benchmark/scenarios/vision_language/image2struct/webpage/driver.py +84 -0
- helm/benchmark/scenarios/vision_language/image2struct/webpage/jekyll_server.py +182 -0
- helm/benchmark/scenarios/vision_language/image2struct/webpage/utils.py +31 -0
- helm/benchmark/scenarios/vision_language/image2struct/webpage_scenario.py +256 -0
- helm/benchmark/scenarios/vision_language/math_vista_scenario.py +117 -0
- helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
- helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +103 -0
- helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
- helm/benchmark/scenarios/vision_language/mme_scenario.py +148 -0
- helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
- helm/benchmark/scenarios/vision_language/mscoco_captioning_scenario.py +92 -0
- helm/benchmark/scenarios/vision_language/mscoco_categorization_scenario.py +117 -0
- helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
- helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
- helm/benchmark/scenarios/vision_language/originality_scenario.py +35 -0
- helm/benchmark/scenarios/vision_language/pairs_scenario.py +247 -0
- helm/benchmark/scenarios/vision_language/pope_scenario.py +105 -0
- helm/benchmark/scenarios/vision_language/real_world_qa_scenario.py +57 -0
- helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +131 -0
- helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
- helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +98 -0
- helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +4 -5
- helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
- helm/benchmark/scenarios/vision_language/vqa_scenario.py +8 -4
- helm/benchmark/scenarios/wikifact_scenario.py +31 -1
- helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
- helm/benchmark/scenarios/wildbench_scenario.py +101 -0
- helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +33 -2
- helm/benchmark/scenarios/xstest_scenario.py +35 -0
- helm/benchmark/server.py +32 -2
- helm/benchmark/slurm_jobs.py +1 -2
- helm/benchmark/slurm_runner.py +78 -50
- helm/benchmark/static/schema_air_bench.yaml +3149 -0
- helm/benchmark/static/schema_arabic.yaml +271 -0
- helm/benchmark/static/schema_audio.yaml +763 -0
- helm/benchmark/static/schema_autobencher.yaml +150 -0
- helm/benchmark/static/schema_call_center.yaml +269 -0
- helm/benchmark/static/schema_capabilities.yaml +254 -0
- helm/benchmark/static/schema_classic.yaml +259 -1140
- helm/benchmark/static/schema_cleva.yaml +768 -0
- helm/benchmark/static/schema_czech_bank.yaml +148 -0
- helm/benchmark/static/schema_decodingtrust.yaml +444 -0
- helm/benchmark/static/schema_enem_challenge.yaml +146 -0
- helm/benchmark/static/schema_enterprise.yaml +319 -0
- helm/benchmark/static/schema_ewok.yaml +367 -0
- helm/benchmark/static/schema_finance.yaml +191 -0
- helm/benchmark/static/schema_heim.yaml +1389 -0
- helm/benchmark/static/schema_image2struct.yaml +588 -0
- helm/benchmark/static/schema_instruction_following.yaml +161 -0
- helm/benchmark/static/schema_legal.yaml +566 -0
- helm/benchmark/static/schema_lite.yaml +3 -286
- helm/benchmark/static/schema_long_context.yaml +282 -0
- helm/benchmark/static/schema_medhelm.yaml +1176 -0
- helm/benchmark/static/schema_melt.yaml +1257 -0
- helm/benchmark/static/schema_mmlu.yaml +1449 -0
- helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
- helm/benchmark/static/schema_safety.yaml +283 -0
- helm/benchmark/static/schema_seahelm.yaml +723 -0
- helm/benchmark/static/schema_slp.yaml +219 -0
- helm/benchmark/static/schema_slphelm.yaml +162 -0
- helm/benchmark/static/schema_social_audio.yaml +224 -0
- helm/benchmark/static/schema_sql.yaml +171 -0
- helm/benchmark/static/schema_thai.yaml +244 -0
- helm/benchmark/static/schema_torr.yaml +474 -0
- helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
- helm/benchmark/static/schema_unitxt.yaml +370 -0
- helm/benchmark/static/schema_vhelm.yaml +933 -0
- helm/benchmark/static/schema_vhelm_lite.yaml +109 -0
- helm/benchmark/static/schema_video.yaml +219 -0
- helm/benchmark/static_build/assets/air-overview-DpBbyagA.png +0 -0
- helm/benchmark/static_build/assets/audio-table-Dn5NMMeJ.png +0 -0
- helm/benchmark/static_build/assets/heim-logo-BJtQlEbV.png +0 -0
- helm/benchmark/static_build/assets/helm-safety-COfndXuS.png +0 -0
- helm/benchmark/static_build/assets/helmhero-D9TvmJsp.png +0 -0
- helm/benchmark/static_build/assets/index-oIeiQW2g.css +1 -0
- helm/benchmark/static_build/assets/index-qOFpOyHb.js +10 -0
- helm/benchmark/static_build/assets/medhelm-overview-CND0EIsy.png +0 -0
- helm/benchmark/static_build/assets/medhelm-v1-overview-Cu2tphBB.png +0 -0
- helm/benchmark/static_build/assets/overview-BwypNWnk.png +0 -0
- helm/benchmark/static_build/assets/process-flow-DWDJC733.png +0 -0
- helm/benchmark/static_build/assets/react-BteFIppM.js +85 -0
- helm/benchmark/static_build/assets/recharts-DxuQtTOs.js +97 -0
- helm/benchmark/static_build/assets/tremor-DR4fE7ko.js +10 -0
- helm/benchmark/static_build/assets/vhelm-aspects-NiDQofvP.png +0 -0
- helm/benchmark/static_build/assets/vhelm-framework-NxJE4fdA.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-ypCL5Yvq.png +0 -0
- helm/benchmark/static_build/config.js +4 -0
- helm/benchmark/static_build/index.html +19 -0
- helm/benchmark/test_data_preprocessor.py +3 -3
- helm/benchmark/test_run_expander.py +1 -1
- helm/benchmark/window_services/default_window_service.py +3 -45
- helm/benchmark/window_services/encoder_decoder_window_service.py +4 -15
- helm/benchmark/window_services/ice_window_service.py +1 -35
- helm/benchmark/window_services/image_generation/__init__.py +0 -0
- helm/benchmark/window_services/image_generation/clip_window_service.py +13 -0
- helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
- helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
- helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
- helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
- helm/benchmark/window_services/local_window_service.py +22 -5
- helm/benchmark/window_services/test_anthropic_window_service.py +5 -4
- helm/benchmark/window_services/test_bloom_window_service.py +5 -4
- helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
- helm/benchmark/window_services/test_gpt2_window_service.py +9 -4
- helm/benchmark/window_services/test_gpt4_window_service.py +10 -4
- helm/benchmark/window_services/test_gptj_window_service.py +11 -5
- helm/benchmark/window_services/test_gptneox_window_service.py +6 -5
- helm/benchmark/window_services/test_openai_window_service.py +18 -12
- helm/benchmark/window_services/test_opt_window_service.py +6 -5
- helm/benchmark/window_services/test_palmyra_window_service.py +5 -4
- helm/benchmark/window_services/test_t0pp_window_service.py +5 -4
- helm/benchmark/window_services/test_t511b_window_service.py +5 -4
- helm/benchmark/window_services/test_ul2_window_service.py +5 -4
- helm/benchmark/window_services/test_utils.py +6 -6
- helm/benchmark/window_services/test_yalm_window_service.py +5 -4
- helm/benchmark/window_services/tokenizer_service.py +7 -13
- helm/benchmark/window_services/window_service.py +42 -0
- helm/benchmark/window_services/window_service_factory.py +4 -1
- helm/benchmark/window_services/yalm_window_service.py +1 -28
- helm/clients/__init__.py +0 -0
- helm/{proxy/clients → clients}/ai21_client.py +78 -12
- helm/clients/aleph_alpha_client.py +114 -0
- helm/{proxy/clients → clients}/anthropic_client.py +304 -21
- helm/clients/audio_language/__init__.py +0 -0
- helm/clients/audio_language/diva_llama_client.py +122 -0
- helm/clients/audio_language/llama_omni/arguments.py +61 -0
- helm/clients/audio_language/llama_omni/constants.py +9 -0
- helm/clients/audio_language/llama_omni/conversation.py +213 -0
- helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
- helm/clients/audio_language/llama_omni/model/builder.py +88 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
- helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
- helm/clients/audio_language/llama_omni/preprocess.py +295 -0
- helm/clients/audio_language/llama_omni/utils.py +202 -0
- helm/clients/audio_language/llama_omni_client.py +199 -0
- helm/clients/audio_language/qwen2_5_omni_client.py +210 -0
- helm/clients/audio_language/qwen2_audiolm_client.py +191 -0
- helm/clients/audio_language/qwen_audiolm_client.py +153 -0
- helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
- helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
- helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
- helm/clients/audio_language/test.py +62 -0
- helm/{proxy/clients → clients}/auto_client.py +72 -31
- helm/clients/azure_openai_client.py +55 -0
- helm/clients/bedrock_client.py +381 -0
- helm/clients/bedrock_utils.py +105 -0
- helm/{proxy/clients → clients}/client.py +92 -17
- helm/clients/clip_score_client.py +49 -0
- helm/clients/clip_scorers/__init__.py +0 -0
- helm/clients/clip_scorers/base_clip_scorer.py +18 -0
- helm/clients/clip_scorers/clip_scorer.py +50 -0
- helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
- helm/{proxy/clients → clients}/cohere_client.py +105 -14
- helm/clients/dspy_client.py +135 -0
- helm/clients/gcs_client.py +82 -0
- helm/{proxy/clients → clients}/google_client.py +8 -6
- helm/clients/google_translate_client.py +35 -0
- helm/clients/grok_client.py +36 -0
- helm/{proxy/clients → clients}/http_model_client.py +8 -8
- helm/{proxy/clients → clients}/huggingface_client.py +157 -86
- helm/clients/huggingface_pipeline_client.py +138 -0
- helm/clients/ibm_client.py +269 -0
- helm/clients/image_generation/__init__.py +0 -0
- helm/clients/image_generation/adobe_vision_client.py +80 -0
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +100 -0
- helm/clients/image_generation/cogview2/__init__.py +0 -0
- helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
- helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
- helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
- helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +99 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
- helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +144 -0
- helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
- helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
- helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
- helm/clients/image_generation/cogview2_client.py +192 -0
- helm/clients/image_generation/dalle2_client.py +194 -0
- helm/clients/image_generation/dalle3_client.py +108 -0
- helm/clients/image_generation/dalle_mini/__init__.py +3 -0
- helm/clients/image_generation/dalle_mini/data.py +442 -0
- helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
- helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
- helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
- helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
- helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
- helm/clients/image_generation/dalle_mini/model/text.py +251 -0
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
- helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
- helm/clients/image_generation/dalle_mini_client.py +191 -0
- helm/clients/image_generation/deep_floyd_client.py +80 -0
- helm/clients/image_generation/huggingface_diffusers_client.py +250 -0
- helm/clients/image_generation/image_generation_client_utils.py +9 -0
- helm/clients/image_generation/lexica_client.py +88 -0
- helm/clients/image_generation/mindalle/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/__init__.py +216 -0
- helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
- helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
- helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
- helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
- helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
- helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
- helm/clients/image_generation/mindalle/utils/config.py +129 -0
- helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
- helm/clients/image_generation/mindalle/utils/utils.py +89 -0
- helm/clients/image_generation/mindalle_client.py +116 -0
- helm/clients/image_generation/nudity_check_client.py +64 -0
- helm/clients/image_generation/together_image_generation_client.py +113 -0
- helm/{proxy/clients → clients}/lit_gpt_client.py +6 -6
- helm/{proxy/clients → clients}/megatron_client.py +7 -5
- helm/clients/mistral_client.py +180 -0
- helm/clients/moderation_api_client.py +111 -0
- helm/clients/nvidia_nim_client.py +32 -0
- helm/clients/open_lm_client.py +43 -0
- helm/clients/openai_client.py +604 -0
- helm/clients/openai_responses_client.py +200 -0
- helm/clients/openrouter_client.py +31 -0
- helm/{proxy/clients → clients}/palmyra_client.py +31 -14
- helm/{proxy/clients → clients}/perspective_api_client.py +18 -14
- helm/clients/reka_client.py +190 -0
- helm/clients/simple_client.py +64 -0
- helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
- helm/clients/stanfordhealthcare_claude_client.py +31 -0
- helm/clients/stanfordhealthcare_google_client.py +43 -0
- helm/clients/stanfordhealthcare_http_model_client.py +95 -0
- helm/clients/stanfordhealthcare_openai_client.py +62 -0
- helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
- helm/{proxy/clients → clients}/test_auto_client.py +13 -15
- helm/clients/test_client.py +98 -0
- helm/{proxy/clients → clients}/test_huggingface_client.py +31 -16
- helm/clients/test_openrouter_client.py +69 -0
- helm/clients/test_simple_client.py +19 -0
- helm/clients/test_together_client.py +184 -0
- helm/clients/together_client.py +599 -0
- helm/clients/upstage_client.py +23 -0
- helm/clients/vertexai_client.py +488 -0
- helm/clients/vision_language/__init__.py +0 -0
- helm/clients/vision_language/huggingface_vision2seq_client.py +148 -0
- helm/clients/vision_language/huggingface_vlm_client.py +114 -0
- helm/{proxy/clients → clients}/vision_language/idefics_client.py +61 -51
- helm/clients/vision_language/open_flamingo/__init__.py +2 -0
- helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
- helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
- helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
- helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
- helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
- helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
- helm/clients/vision_language/open_flamingo_client.py +155 -0
- helm/clients/vision_language/paligemma_client.py +147 -0
- helm/clients/vision_language/palmyra_vision_client.py +101 -0
- helm/clients/vision_language/qwen2_vlm_client.py +189 -0
- helm/clients/vision_language/qwen_vlm_client.py +174 -0
- helm/clients/vllm_client.py +80 -0
- helm/clients/vllm_granite_thinking_client.py +56 -0
- helm/clients/writer_client.py +105 -0
- helm/clients/yi_client.py +28 -0
- helm/common/audio_utils.py +111 -0
- helm/common/cache.py +23 -33
- helm/common/cache_backend_config.py +47 -0
- helm/common/clip_score_request.py +41 -0
- helm/common/context.py +80 -0
- helm/common/credentials_utils.py +5 -5
- helm/common/critique_request.py +10 -2
- helm/common/file_caches/__init__.py +0 -0
- helm/common/file_caches/file_cache.py +16 -0
- helm/common/file_caches/local_file_cache.py +61 -0
- helm/common/file_caches/test_local_file_cache.py +25 -0
- helm/common/file_upload_request.py +27 -0
- helm/common/general.py +10 -3
- helm/common/hierarchical_logger.py +124 -12
- helm/common/image_generation_parameters.py +25 -0
- helm/common/images_utils.py +60 -5
- helm/common/key_value_store.py +41 -10
- helm/common/local_context.py +140 -0
- helm/common/media_object.py +14 -1
- helm/common/moderations_api_request.py +71 -0
- helm/common/mongo_key_value_store.py +8 -7
- helm/common/multimodal_request_utils.py +57 -0
- helm/common/nudity_check_request.py +29 -0
- helm/common/object_spec.py +23 -8
- helm/common/optional_dependencies.py +1 -1
- helm/common/reeval_parameters.py +12 -0
- helm/common/remote_context.py +61 -0
- helm/common/request.py +45 -19
- helm/common/response_format.py +18 -0
- helm/common/test_cache.py +1 -48
- helm/common/test_general.py +10 -0
- helm/common/test_logging.py +94 -0
- helm/common/test_media_object.py +1 -1
- helm/common/tokenization_request.py +1 -10
- helm/config/model_deployments.yaml +4713 -1005
- helm/config/model_metadata.yaml +4045 -255
- helm/config/tokenizer_configs.yaml +1091 -50
- helm/proxy/accounts.py +31 -4
- helm/proxy/cli.py +6 -4
- helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
- helm/proxy/critique/mechanical_turk_utils.py +1 -1
- helm/proxy/critique/model_critique_client.py +40 -10
- helm/proxy/example_queries.py +33 -28
- helm/proxy/retry.py +5 -0
- helm/proxy/server.py +82 -18
- helm/proxy/services/remote_service.py +32 -7
- helm/proxy/services/server_service.py +71 -69
- helm/proxy/services/service.py +30 -6
- helm/proxy/services/test_remote_service.py +6 -5
- helm/proxy/services/test_service.py +1 -13
- helm/proxy/static/help.html +99 -0
- helm/proxy/static/index.css +61 -0
- helm/proxy/static/index.html +40 -0
- helm/proxy/static/index.js +462 -0
- helm/proxy/test_accounts.py +32 -0
- helm/proxy/test_retry.py +1 -1
- helm/proxy/token_counters/auto_token_counter.py +37 -37
- helm/proxy/token_counters/test_auto_token_counter.py +164 -0
- helm/proxy/token_counters/token_counter.py +3 -5
- helm/tokenizers/__init__.py +0 -0
- helm/tokenizers/ai21_tokenizer.py +52 -0
- helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/auto_tokenizer.py +9 -12
- helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +2 -30
- helm/tokenizers/cohere_tokenizer.py +50 -0
- helm/tokenizers/grok_tokenizer.py +55 -0
- helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +4 -4
- helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +44 -41
- helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +1 -1
- helm/tokenizers/simple_tokenizer.py +33 -0
- helm/tokenizers/test_ai21_tokenizer.py +48 -0
- helm/{proxy/tokenizers → tokenizers}/test_anthropic_tokenizer.py +6 -2
- helm/tokenizers/test_cohere_tokenizer.py +39 -0
- helm/tokenizers/test_grok_tokenizer.py +33 -0
- helm/{proxy/tokenizers → tokenizers}/test_huggingface_tokenizer.py +9 -2
- helm/tokenizers/test_simple_tokenizer.py +33 -0
- helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/tokenizer.py +3 -1
- helm/{proxy/tokenizers → tokenizers}/vertexai_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +8 -6
- helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
- helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
- crfm_helm-0.4.0.dist-info/METADATA +0 -264
- crfm_helm-0.4.0.dist-info/RECORD +0 -397
- helm/benchmark/data_overlap/data_overlap_spec.py +0 -86
- helm/benchmark/data_overlap/export_scenario_text.py +0 -119
- helm/benchmark/data_overlap/light_scenario.py +0 -60
- helm/benchmark/metrics/numeracy_metrics.py +0 -72
- helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
- helm/benchmark/run_specs.py +0 -2762
- helm/benchmark/scenarios/numeracy_scenario.py +0 -784
- helm/benchmark/static/benchmarking.css +0 -156
- helm/benchmark/static/benchmarking.js +0 -1705
- helm/benchmark/static/config.js +0 -3
- helm/benchmark/static/images/helm-logo.png +0 -0
- helm/benchmark/static/images/language-model-helm.png +0 -0
- helm/benchmark/static/images/organizations/ai21.png +0 -0
- helm/benchmark/static/images/organizations/anthropic.png +0 -0
- helm/benchmark/static/images/organizations/bigscience.png +0 -0
- helm/benchmark/static/images/organizations/cohere.png +0 -0
- helm/benchmark/static/images/organizations/eleutherai.png +0 -0
- helm/benchmark/static/images/organizations/google.png +0 -0
- helm/benchmark/static/images/organizations/meta.png +0 -0
- helm/benchmark/static/images/organizations/microsoft.png +0 -0
- helm/benchmark/static/images/organizations/nvidia.png +0 -0
- helm/benchmark/static/images/organizations/openai.png +0 -0
- helm/benchmark/static/images/organizations/together.png +0 -0
- helm/benchmark/static/images/organizations/tsinghua-keg.png +0 -0
- helm/benchmark/static/images/organizations/yandex.png +0 -0
- helm/benchmark/static/images/scenarios-by-metrics.png +0 -0
- helm/benchmark/static/images/taxonomy-scenarios.png +0 -0
- helm/benchmark/static/index.html +0 -68
- helm/benchmark/static/json-urls.js +0 -69
- helm/benchmark/static/plot-captions.js +0 -27
- helm/benchmark/static/utils.js +0 -285
- helm/benchmark/test_model_deployment_definition.py +0 -92
- helm/benchmark/test_model_properties.py +0 -1570
- helm/benchmark/vlm_run_specs.py +0 -97
- helm/benchmark/window_services/ai21_window_service.py +0 -258
- helm/benchmark/window_services/cohere_window_service.py +0 -163
- helm/benchmark/window_services/flan_t5_window_service.py +0 -29
- helm/benchmark/window_services/gpt2_window_service.py +0 -32
- helm/benchmark/window_services/huggingface_window_service.py +0 -60
- helm/benchmark/window_services/t0pp_window_service.py +0 -35
- helm/benchmark/window_services/t511b_window_service.py +0 -30
- helm/benchmark/window_services/test_ai21_window_service.py +0 -163
- helm/benchmark/window_services/test_cohere_window_service.py +0 -74
- helm/benchmark/window_services/test_cohere_window_service_utils.py +0 -8328
- helm/benchmark/window_services/test_ice_window_service.py +0 -326
- helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
- helm/benchmark/window_services/ul2_window_service.py +0 -30
- helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
- helm/common/cache_utils.py +0 -14
- helm/proxy/clients/aleph_alpha_client.py +0 -95
- helm/proxy/clients/goose_ai_client.py +0 -99
- helm/proxy/clients/microsoft_client.py +0 -180
- helm/proxy/clients/openai_client.py +0 -206
- helm/proxy/clients/simple_client.py +0 -60
- helm/proxy/clients/test_client.py +0 -49
- helm/proxy/clients/test_together_client.py +0 -97
- helm/proxy/clients/together_client.py +0 -334
- helm/proxy/clients/vertexai_client.py +0 -115
- helm/proxy/token_counters/ai21_token_counter.py +0 -20
- helm/proxy/token_counters/cohere_token_counter.py +0 -13
- helm/proxy/token_counters/free_token_counter.py +0 -12
- helm/proxy/token_counters/gooseai_token_counter.py +0 -24
- helm/proxy/token_counters/openai_token_counter.py +0 -22
- helm/proxy/token_counters/test_ai21_token_counter.py +0 -88
- helm/proxy/token_counters/test_openai_token_counter.py +0 -81
- helm/proxy/tokenizers/ai21_tokenizer.py +0 -60
- helm/proxy/tokenizers/anthropic_tokenizer.py +0 -52
- helm/proxy/tokenizers/cohere_tokenizer.py +0 -83
- helm/proxy/tokenizers/ice_tokenizer.py +0 -30
- helm/proxy/tokenizers/simple_tokenizer.py +0 -32
- helm/proxy/tokenizers/test_ice_tokenizer.py +0 -57
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info/licenses}/LICENSE +0 -0
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info}/top_level.txt +0 -0
- /helm/benchmark/{data_overlap → annotation}/__init__.py +0 -0
- /helm/{proxy/clients → benchmark/annotation/image2struct}/__init__.py +0 -0
- /helm/{proxy/clients/vision_language → benchmark/metrics/ifeval}/__init__.py +0 -0
- /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
- /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
- /helm/benchmark/{static/images/crfm-logo.png → static_build/assets/crfm-logo-Du4T1uWZ.png} +0 -0
- /helm/benchmark/{static/images/helm-logo-simple.png → static_build/assets/helm-logo-simple-DzOhNN41.png} +0 -0
- /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
- /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
- /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
- /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
- /helm/{benchmark → proxy}/static/general.js +0 -0
- /helm/{benchmark → proxy}/static/info-icon.png +0 -0
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
from typing import Optional
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
@dataclass(frozen=True)
|
|
6
|
+
class ModerationAPIRequest:
|
|
7
|
+
# Text to check against OpenAI's content policy
|
|
8
|
+
text: str
|
|
9
|
+
|
|
10
|
+
# From https://beta.openai.com/docs/api-reference/moderations/create,
|
|
11
|
+
# "the default is text-moderation-latest which will be automatically upgraded over time.
|
|
12
|
+
# This ensures you are always using our most accurate model. If you use text-moderation-stable,
|
|
13
|
+
# we will provide advanced notice before updating the model. Accuracy of text-moderation-stable
|
|
14
|
+
# may be slightly lower than for text-moderation-latest."
|
|
15
|
+
use_latest_model: bool = False
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@dataclass(frozen=True)
|
|
19
|
+
class ModerationCategoryFlaggedResults:
|
|
20
|
+
"""
|
|
21
|
+
Contains per-category binary content violation flags.
|
|
22
|
+
For descriptions of the categories, see https://beta.openai.com/docs/guides/moderation/overview.
|
|
23
|
+
"""
|
|
24
|
+
|
|
25
|
+
hate_flagged: bool
|
|
26
|
+
hate_threatening_flagged: bool
|
|
27
|
+
self_harm_flagged: bool
|
|
28
|
+
sexual_flagged: bool
|
|
29
|
+
sexual_minors_flagged: bool
|
|
30
|
+
violence_flagged: bool
|
|
31
|
+
violence_graphic_flagged: bool
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
@dataclass(frozen=True)
|
|
35
|
+
class ModerationCategoryScores:
|
|
36
|
+
"""
|
|
37
|
+
Contains per-category scores. Values are between 0 and 1, where higher values denote higher
|
|
38
|
+
confidence. The scores should not be interpreted as probabilities.
|
|
39
|
+
For descriptions of the categories, see https://beta.openai.com/docs/guides/moderation/overview.
|
|
40
|
+
"""
|
|
41
|
+
|
|
42
|
+
hate_score: float
|
|
43
|
+
hate_threatening_score: float
|
|
44
|
+
self_harm_score: float
|
|
45
|
+
sexual_score: float
|
|
46
|
+
sexual_minors_score: float
|
|
47
|
+
violence_score: float
|
|
48
|
+
violence_graphic_score: float
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
@dataclass(frozen=True)
|
|
52
|
+
class ModerationAPIRequestResult:
|
|
53
|
+
"""Result after sending a `ModerationAPIRequest`."""
|
|
54
|
+
|
|
55
|
+
# Whether the request was successful
|
|
56
|
+
success: bool
|
|
57
|
+
|
|
58
|
+
# Whether the request was cached
|
|
59
|
+
cached: bool
|
|
60
|
+
|
|
61
|
+
# True if the model classifies the content as violating OpenAI's content policy, False otherwise
|
|
62
|
+
flagged: Optional[bool]
|
|
63
|
+
|
|
64
|
+
# Flagged results
|
|
65
|
+
flagged_results: Optional[ModerationCategoryFlaggedResults]
|
|
66
|
+
|
|
67
|
+
# Score results
|
|
68
|
+
scores: Optional[ModerationCategoryScores]
|
|
69
|
+
|
|
70
|
+
# If `success` is false, what was the error?
|
|
71
|
+
error: Optional[str] = None
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import json
|
|
2
|
-
from typing import Dict, Generator, Iterable, Optional, Tuple
|
|
2
|
+
from typing import Dict, Generator, Iterable, Mapping, Optional, Tuple
|
|
3
3
|
|
|
4
4
|
from helm.common.key_value_store import KeyValueStore
|
|
5
5
|
from helm.common.optional_dependencies import handle_module_not_found_error
|
|
@@ -35,15 +35,15 @@ class MongoKeyValueStore(KeyValueStore):
|
|
|
35
35
|
def __exit__(self, exc_type, exc_value, traceback) -> None:
|
|
36
36
|
return
|
|
37
37
|
|
|
38
|
-
def _canonicalize_key(self, key:
|
|
38
|
+
def _canonicalize_key(self, key: Mapping) -> SON:
|
|
39
39
|
serialized = json.dumps(key, sort_keys=True)
|
|
40
40
|
return json.loads(serialized, object_pairs_hook=SON)
|
|
41
41
|
|
|
42
|
-
def contains(self, key:
|
|
42
|
+
def contains(self, key: Mapping) -> bool:
|
|
43
43
|
query = {self._REQUEST_KEY: self._canonicalize_key(key)}
|
|
44
44
|
return self._collection.find_one(query) is not None
|
|
45
45
|
|
|
46
|
-
def get(self, key:
|
|
46
|
+
def get(self, key: Mapping) -> Optional[Dict]:
|
|
47
47
|
query = {self._REQUEST_KEY: self._canonicalize_key(key)}
|
|
48
48
|
document = self._collection.find_one(query)
|
|
49
49
|
if document is not None:
|
|
@@ -63,7 +63,7 @@ class MongoKeyValueStore(KeyValueStore):
|
|
|
63
63
|
else:
|
|
64
64
|
yield (request, response)
|
|
65
65
|
|
|
66
|
-
def put(self, key:
|
|
66
|
+
def put(self, key: Mapping, value: Dict) -> None:
|
|
67
67
|
request = self._canonicalize_key(key)
|
|
68
68
|
document = SON([(self._REQUEST_KEY, request), (self._RESPONSE_KEY, value)])
|
|
69
69
|
# The MongoDB collection should have a unique indexed on "request"
|
|
@@ -84,5 +84,6 @@ class MongoKeyValueStore(KeyValueStore):
|
|
|
84
84
|
# Note: unlike put, multi_put does not support documents with null bytes in keys.
|
|
85
85
|
self._collection.bulk_write(operations)
|
|
86
86
|
|
|
87
|
-
def remove(self, key:
|
|
88
|
-
self.
|
|
87
|
+
def remove(self, key: Mapping) -> None:
|
|
88
|
+
query = {self._REQUEST_KEY: self._canonicalize_key(key)}
|
|
89
|
+
self._collection.delete_one(query)
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
import base64
|
|
2
|
+
from typing import List, Optional
|
|
3
|
+
|
|
4
|
+
import requests
|
|
5
|
+
import urllib.parse
|
|
6
|
+
|
|
7
|
+
from helm.benchmark.adaptation.request_state import RequestState
|
|
8
|
+
from helm.benchmark.scenarios.scenario import Reference
|
|
9
|
+
from helm.common.request import RequestResult
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def get_contents_as_bytes(path: str) -> bytes:
|
|
13
|
+
"""Get the contents at the location as bytes.
|
|
14
|
+
|
|
15
|
+
The location can be a local path or a URL."""
|
|
16
|
+
# Fetch the audio file and convert it to a base64 encoded string
|
|
17
|
+
is_remote = urllib.parse.urlparse(path).scheme in ["http", "https"]
|
|
18
|
+
if is_remote:
|
|
19
|
+
response = requests.get(path)
|
|
20
|
+
response.raise_for_status()
|
|
21
|
+
return response.content
|
|
22
|
+
else:
|
|
23
|
+
with open(path, "rb") as f:
|
|
24
|
+
return f.read()
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def get_contents_as_base64(path: str) -> str:
|
|
28
|
+
"""Get the contents at the location as a base64-encoded string.
|
|
29
|
+
|
|
30
|
+
The location can be a local path or a URL."""
|
|
31
|
+
return base64.b64encode(get_contents_as_bytes(path)).decode("utf-8")
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def gather_generated_image_locations(request_result: RequestResult) -> List[str]:
|
|
35
|
+
"""Gathers the locations (file paths or URLs) of the generated images."""
|
|
36
|
+
image_locations: List[str] = []
|
|
37
|
+
for image in request_result.completions:
|
|
38
|
+
# Models like DALL-E 2 can skip generating images for prompts that violate their content policy
|
|
39
|
+
if image.multimodal_content is None or image.multimodal_content.size == 0:
|
|
40
|
+
return []
|
|
41
|
+
|
|
42
|
+
location: Optional[str] = image.multimodal_content.media_objects[0].location
|
|
43
|
+
if location is not None:
|
|
44
|
+
image_locations.append(location)
|
|
45
|
+
return image_locations
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def get_gold_image_location(request_state: RequestState) -> str:
|
|
49
|
+
"""Returns the first gold image location."""
|
|
50
|
+
references: List[Reference] = request_state.instance.references
|
|
51
|
+
assert (
|
|
52
|
+
len(references) > 0
|
|
53
|
+
and references[0].output.multimedia_content is not None
|
|
54
|
+
and references[0].output.multimedia_content.size > 0
|
|
55
|
+
and references[0].output.multimedia_content.media_objects[0].location is not None
|
|
56
|
+
), "Expected at least one gold image"
|
|
57
|
+
return references[0].output.multimedia_content.media_objects[0].location
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
from dataclasses import dataclass, field
|
|
2
|
+
from typing import List, Optional, Dict
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
@dataclass(frozen=True)
|
|
6
|
+
class NudityCheckRequest:
|
|
7
|
+
"""
|
|
8
|
+
Checks for nudity for a given set of images.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
# Batch of images
|
|
12
|
+
image_locations: List[str] = field(default_factory=list)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@dataclass(frozen=True)
|
|
16
|
+
class NudityCheckResult:
|
|
17
|
+
"""Result after sending a `NudityCheckRequest`."""
|
|
18
|
+
|
|
19
|
+
# Whether the request was successful
|
|
20
|
+
success: bool
|
|
21
|
+
|
|
22
|
+
# Whether the request was cached
|
|
23
|
+
cached: bool
|
|
24
|
+
|
|
25
|
+
# Nudity results. True indicates the particular image contains nudity.
|
|
26
|
+
image_to_nudity: Dict[str, bool] = field(default_factory=dict)
|
|
27
|
+
|
|
28
|
+
# If `success` is false, what was the error?
|
|
29
|
+
error: Optional[str] = None
|
helm/common/object_spec.py
CHANGED
|
@@ -55,14 +55,23 @@ def inject_object_spec_args(
|
|
|
55
55
|
This is loosely based on instance (constant) bindings and provider bindings in Guice dependency injection.
|
|
56
56
|
|
|
57
57
|
Example:
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
58
|
+
>>> from helm.common.object_spec import * # NOQA
|
|
59
|
+
>>> import sys, types
|
|
60
|
+
>>> # Given a custom class with hashable arguments
|
|
61
|
+
>>> class MyClass:
|
|
62
|
+
... def __init__(a: int, b: int, c: int, d: int = 0):
|
|
63
|
+
... pass
|
|
64
|
+
>>> #
|
|
65
|
+
>>> # <boilerplate>: make a dummy module for MyClass to make this doctest exectuable
|
|
66
|
+
>>> sys.modules["my_module"] = type("MyModule", (types.ModuleType,), {"MyClass": MyClass})("my_module")
|
|
67
|
+
>>> # </boilerplate>
|
|
68
|
+
>>> #
|
|
69
|
+
>>> # Define new style and old style object specs
|
|
70
|
+
>>> old_object_spec = ObjectSpec(class_name="my_module.MyClass", args={"a": 11})
|
|
71
|
+
>>> new_object_spec = inject_object_spec_args(old_object_spec, {"b": 12}, {"c": lambda: 13})
|
|
72
|
+
>>> # new_object_spec is now
|
|
73
|
+
>>> print(new_object_spec)
|
|
74
|
+
ObjectSpec(class_name='my_module.MyClass', args={'a': 11, 'b': 12, 'c': 13})
|
|
66
75
|
"""
|
|
67
76
|
cls = get_class_by_name(spec.class_name)
|
|
68
77
|
init_signature = inspect.signature(cls.__init__)
|
|
@@ -93,6 +102,12 @@ def parse_object_spec(description: str) -> ObjectSpec:
|
|
|
93
102
|
<class_name>:<key>=<value>,<key>=<value>
|
|
94
103
|
Usually, the description is something that's succinct and can be typed on the command-line.
|
|
95
104
|
Here, value defaults to string.
|
|
105
|
+
|
|
106
|
+
Example:
|
|
107
|
+
>>> from helm.common.object_spec import * # NOQA
|
|
108
|
+
>>> description = 'mscoco:model=huggingface_stable-diffusion-v1-4'
|
|
109
|
+
>>> parse_object_spec(description)
|
|
110
|
+
ObjectSpec(class_name='mscoco', args={'model': 'huggingface_stable-diffusion-v1-4'})
|
|
96
111
|
"""
|
|
97
112
|
|
|
98
113
|
def parse_arg(arg: str) -> Tuple[str, Any]:
|
|
@@ -9,7 +9,7 @@ def handle_module_not_found_error(e: ModuleNotFoundError, suggestions: Optional[
|
|
|
9
9
|
# TODO: Ask user to install more specific optional dependencies
|
|
10
10
|
# e.g. crfm-helm[plots] or crfm-helm[server]
|
|
11
11
|
suggested_commands = " or ".join(
|
|
12
|
-
[f
|
|
12
|
+
[f'`pip install "crfm-helm[{suggestion}]"`' for suggestion in (suggestions or []) + ["all"]]
|
|
13
13
|
)
|
|
14
14
|
raise OptionalDependencyNotInstalled(
|
|
15
15
|
f"Optional dependency {e.name} is not installed. Please run {suggested_commands} to install it."
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
from typing import Optional
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
@dataclass(frozen=True)
|
|
6
|
+
class REEvalParameters:
|
|
7
|
+
"""
|
|
8
|
+
Parameters for reeval evaluation.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
model_ability: Optional[float] = None
|
|
12
|
+
"""The inital ability of the model to perform the task. Used for reeval evaluation."""
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
from helm.common.context import Context
|
|
2
|
+
from helm.common.cache import CacheConfig
|
|
3
|
+
from helm.common.authentication import Authentication
|
|
4
|
+
from helm.common.moderations_api_request import ModerationAPIRequest, ModerationAPIRequestResult
|
|
5
|
+
from helm.common.critique_request import CritiqueRequest, CritiqueRequestResult
|
|
6
|
+
from helm.common.nudity_check_request import NudityCheckRequest, NudityCheckResult
|
|
7
|
+
from helm.common.file_upload_request import FileUploadRequest, FileUploadResult
|
|
8
|
+
from helm.common.perspective_api_request import PerspectiveAPIRequest, PerspectiveAPIRequestResult
|
|
9
|
+
from helm.common.clip_score_request import CLIPScoreRequest, CLIPScoreResult
|
|
10
|
+
from helm.common.tokenization_request import (
|
|
11
|
+
TokenizationRequest,
|
|
12
|
+
TokenizationRequestResult,
|
|
13
|
+
DecodeRequestResult,
|
|
14
|
+
DecodeRequest,
|
|
15
|
+
)
|
|
16
|
+
from helm.common.request import Request, RequestResult
|
|
17
|
+
from helm.proxy.query import Query, QueryResult
|
|
18
|
+
from helm.proxy.services.remote_service import RemoteService
|
|
19
|
+
from helm.proxy.services.service import GeneralInfo, Service
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class RemoteContext(Context):
|
|
23
|
+
def __init__(self, base_url: str, auth: Authentication):
|
|
24
|
+
self.service: Service = RemoteService(base_url)
|
|
25
|
+
self.auth = auth
|
|
26
|
+
|
|
27
|
+
def get_general_info(self) -> GeneralInfo:
|
|
28
|
+
return self.service.get_general_info()
|
|
29
|
+
|
|
30
|
+
def expand_query(self, query: Query) -> QueryResult:
|
|
31
|
+
return self.service.expand_query(query)
|
|
32
|
+
|
|
33
|
+
def make_request(self, request: Request) -> RequestResult:
|
|
34
|
+
return self.service.make_request(self.auth, request)
|
|
35
|
+
|
|
36
|
+
def tokenize(self, request: TokenizationRequest) -> TokenizationRequestResult:
|
|
37
|
+
return self.service.tokenize(self.auth, request)
|
|
38
|
+
|
|
39
|
+
def decode(self, request: DecodeRequest) -> DecodeRequestResult:
|
|
40
|
+
return self.service.decode(self.auth, request)
|
|
41
|
+
|
|
42
|
+
def upload(self, request: FileUploadRequest) -> FileUploadResult:
|
|
43
|
+
return self.service.upload(self.auth, request)
|
|
44
|
+
|
|
45
|
+
def check_nudity(self, request: NudityCheckRequest) -> NudityCheckResult:
|
|
46
|
+
return self.service.check_nudity(self.auth, request)
|
|
47
|
+
|
|
48
|
+
def compute_clip_score(self, request: CLIPScoreRequest) -> CLIPScoreResult:
|
|
49
|
+
return self.service.compute_clip_score(self.auth, request)
|
|
50
|
+
|
|
51
|
+
def get_toxicity_scores(self, request: PerspectiveAPIRequest) -> PerspectiveAPIRequestResult:
|
|
52
|
+
return self.service.get_toxicity_scores(self.auth, request)
|
|
53
|
+
|
|
54
|
+
def get_moderation_results(self, request: ModerationAPIRequest) -> ModerationAPIRequestResult:
|
|
55
|
+
return self.service.get_moderation_results(self.auth, request)
|
|
56
|
+
|
|
57
|
+
def make_critique_request(self, request: CritiqueRequest) -> CritiqueRequestResult:
|
|
58
|
+
return self.service.make_critique_request(self.auth, request)
|
|
59
|
+
|
|
60
|
+
def get_cache_config(self, shard_name: str) -> CacheConfig:
|
|
61
|
+
return self.service.get_cache_config(shard_name)
|
helm/common/request.py
CHANGED
|
@@ -3,7 +3,9 @@ from dataclasses import dataclass, field
|
|
|
3
3
|
from typing import Any, Callable, Dict, List, Optional
|
|
4
4
|
|
|
5
5
|
from helm.common.media_object import MultimediaObject
|
|
6
|
-
from .
|
|
6
|
+
from helm.common.image_generation_parameters import ImageGenerationParameters
|
|
7
|
+
from helm.common.general import indent_lines, format_text
|
|
8
|
+
from helm.common.response_format import ResponseFormat
|
|
7
9
|
|
|
8
10
|
|
|
9
11
|
@dataclass(frozen=True)
|
|
@@ -68,6 +70,28 @@ class Request:
|
|
|
68
70
|
multimodal_prompt: Optional[MultimediaObject] = None
|
|
69
71
|
"""Multimodal prompt with media objects interleaved (e.g., text, video, image, text, ...)"""
|
|
70
72
|
|
|
73
|
+
image_generation_parameters: Optional[ImageGenerationParameters] = None
|
|
74
|
+
"""Parameters for image generation."""
|
|
75
|
+
|
|
76
|
+
response_format: Optional[ResponseFormat] = None
|
|
77
|
+
"""EXPERIMENTAL: Response format. Currently only supported by OpenAI and Together."""
|
|
78
|
+
|
|
79
|
+
def validate(self):
|
|
80
|
+
if (
|
|
81
|
+
(self.messages and self.prompt)
|
|
82
|
+
or (self.messages and self.multimodal_prompt)
|
|
83
|
+
or (self.prompt and self.multimodal_prompt)
|
|
84
|
+
):
|
|
85
|
+
raise ValueError("Exactly one of the messages, prompt, multimodal_prompt fields should be set")
|
|
86
|
+
|
|
87
|
+
if self.multimodal_prompt:
|
|
88
|
+
for media_object in self.multimodal_prompt.media_objects:
|
|
89
|
+
if media_object.content_type == "text" and media_object.text is None:
|
|
90
|
+
raise ValueError("Media object with text content type must have text set")
|
|
91
|
+
|
|
92
|
+
if media_object.content_type == "image" and media_object.location is None:
|
|
93
|
+
raise ValueError("Media object with image content type must have location set")
|
|
94
|
+
|
|
71
95
|
@property
|
|
72
96
|
def model_host(self) -> str:
|
|
73
97
|
"""Returns the model host (referring to the deployment).
|
|
@@ -93,8 +117,6 @@ class Token:
|
|
|
93
117
|
"""
|
|
94
118
|
A `Token` represents one token position in a `Sequence`, which has the
|
|
95
119
|
chosen `text` as well as the top probabilities under the model.
|
|
96
|
-
|
|
97
|
-
Note: (text, logprob) could exist or not exist in `top_logprobs`.
|
|
98
120
|
"""
|
|
99
121
|
|
|
100
122
|
# Text that was chosen
|
|
@@ -103,22 +125,20 @@ class Token:
|
|
|
103
125
|
# Log probability of generating that
|
|
104
126
|
logprob: float
|
|
105
127
|
|
|
106
|
-
# text -> log probability of generating that
|
|
107
|
-
top_logprobs: Dict[str, float]
|
|
108
|
-
|
|
109
128
|
def render_lines(self) -> List[str]:
|
|
110
|
-
top_logprobs_entries = sorted(self.top_logprobs.items(), key=lambda entry: -entry[1])
|
|
111
|
-
top_logprobs_str = (
|
|
112
|
-
"{" + ", ".join(f"{format_text(text)}: {logprob}" for text, logprob in top_logprobs_entries) + "}"
|
|
113
|
-
)
|
|
114
129
|
return [
|
|
115
|
-
f"{format_text(self.text)} logprob={self.logprob}
|
|
130
|
+
f"{format_text(self.text)} logprob={self.logprob}",
|
|
116
131
|
]
|
|
117
132
|
|
|
118
133
|
|
|
119
134
|
@dataclass(frozen=True)
|
|
120
|
-
class
|
|
121
|
-
|
|
135
|
+
class Thinking:
|
|
136
|
+
text: Optional[str] = None
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
@dataclass(frozen=True)
|
|
140
|
+
class GeneratedOutput:
|
|
141
|
+
"""A `GeneratedOutput` is a single generated output that may contain text or multimodal content."""
|
|
122
142
|
|
|
123
143
|
# The concatenation of all the tokens
|
|
124
144
|
text: str
|
|
@@ -130,10 +150,16 @@ class Sequence:
|
|
|
130
150
|
tokens: List[Token]
|
|
131
151
|
|
|
132
152
|
# Why did the sequence finish?
|
|
133
|
-
finish_reason: Optional[Dict] = None
|
|
153
|
+
finish_reason: Optional[Dict[str, Any]] = None
|
|
154
|
+
|
|
155
|
+
# Could be a sequence made up of multimedia content
|
|
156
|
+
multimodal_content: Optional[MultimediaObject] = None
|
|
157
|
+
|
|
158
|
+
# Could be reasoning
|
|
159
|
+
thinking: Optional[Thinking] = None
|
|
134
160
|
|
|
135
|
-
def __add__(self, other: "
|
|
136
|
-
return
|
|
161
|
+
def __add__(self, other: "GeneratedOutput") -> "GeneratedOutput":
|
|
162
|
+
return GeneratedOutput(self.text + other.text, self.logprob + other.logprob, self.tokens + other.tokens)
|
|
137
163
|
|
|
138
164
|
def render_lines(self) -> List[str]:
|
|
139
165
|
result = [
|
|
@@ -172,14 +198,14 @@ class RequestResult:
|
|
|
172
198
|
embedding: List[float]
|
|
173
199
|
"""Fixed dimensional embedding corresponding to the entire prompt"""
|
|
174
200
|
|
|
175
|
-
completions: List[
|
|
201
|
+
completions: List[GeneratedOutput]
|
|
176
202
|
"""List of completion"""
|
|
177
203
|
|
|
178
204
|
cached: bool
|
|
179
205
|
"""Whether the request was actually cached"""
|
|
180
206
|
|
|
181
207
|
request_time: Optional[float] = None
|
|
182
|
-
"""How long
|
|
208
|
+
"""How long the request took in seconds"""
|
|
183
209
|
|
|
184
210
|
request_datetime: Optional[int] = None
|
|
185
211
|
"""When was the request sent?
|
|
@@ -227,7 +253,7 @@ EMBEDDING_UNAVAILABLE_REQUEST_RESULT = RequestResult(
|
|
|
227
253
|
)
|
|
228
254
|
|
|
229
255
|
|
|
230
|
-
def wrap_request_time(compute: Callable[[], Dict[str, Any]]) -> Callable[[], Any]:
|
|
256
|
+
def wrap_request_time(compute: Callable[[], Dict[str, Any]]) -> Callable[[], Dict[str, Any]]:
|
|
231
257
|
"""Return a version of `compute` that puts `request_time` into its output."""
|
|
232
258
|
|
|
233
259
|
def wrapped_compute():
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
from typing import Any, Dict, Optional
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
@dataclass(frozen=True)
|
|
6
|
+
class ResponseFormat:
|
|
7
|
+
"""EXPERIMENTAL: Model response format.
|
|
8
|
+
|
|
9
|
+
Currently only supports JSON schema.
|
|
10
|
+
|
|
11
|
+
Currently only supported by OpenAI and Together.
|
|
12
|
+
|
|
13
|
+
See:
|
|
14
|
+
- https://platform.openai.com/docs/guides/structured-outputs
|
|
15
|
+
- https://docs.together.ai/docs/json-mode"""
|
|
16
|
+
|
|
17
|
+
json_schema: Optional[Dict[str, Any]] = None
|
|
18
|
+
"""EXPERIMENTAL: The JSON schema that the model output should conform to."""
|
helm/common/test_cache.py
CHANGED
|
@@ -3,9 +3,7 @@ import tempfile
|
|
|
3
3
|
import unittest
|
|
4
4
|
import threading
|
|
5
5
|
|
|
6
|
-
from helm.common.cache import Cache, SqliteCacheConfig,
|
|
7
|
-
|
|
8
|
-
from sqlitedict import SqliteDict
|
|
6
|
+
from helm.common.cache import Cache, SqliteCacheConfig, cache_stats, get_all_from_sqlite
|
|
9
7
|
|
|
10
8
|
|
|
11
9
|
class TestCache(unittest.TestCase):
|
|
@@ -99,51 +97,6 @@ class TestCache(unittest.TestCase):
|
|
|
99
97
|
assert cache_stats.num_computes[self.cache_path] >= num_items
|
|
100
98
|
assert cache_stats.num_computes[self.cache_path] <= num_items * num_threads
|
|
101
99
|
|
|
102
|
-
def test_follower(self):
|
|
103
|
-
cache = Cache(SqliteCacheConfig(self.cache_path))
|
|
104
|
-
request_1 = {"name": "request1"}
|
|
105
|
-
compute_1 = lambda: {"response": "response1"}
|
|
106
|
-
|
|
107
|
-
response, cached = cache.get(request_1, compute_1)
|
|
108
|
-
assert response == {"response": "response1"}
|
|
109
|
-
assert not cached
|
|
110
|
-
assert cache_stats.num_queries[self.cache_path] == 1
|
|
111
|
-
assert cache_stats.num_computes[self.cache_path] == 1
|
|
112
|
-
|
|
113
|
-
follower_cache_file = tempfile.NamedTemporaryFile(delete=False)
|
|
114
|
-
follower_cache_path = follower_cache_file.name
|
|
115
|
-
with follower_cache_file:
|
|
116
|
-
cache_with_follower_config = WithFollowerCacheConfig(
|
|
117
|
-
main=SqliteCacheConfig(self.cache_path),
|
|
118
|
-
follower=SqliteCacheConfig(follower_cache_path),
|
|
119
|
-
)
|
|
120
|
-
cache_with_follower = Cache(cache_with_follower_config)
|
|
121
|
-
|
|
122
|
-
response, cached = cache_with_follower.get(request_1, compute_1)
|
|
123
|
-
assert response == {"response": "response1"}
|
|
124
|
-
assert cached
|
|
125
|
-
assert cache_stats.num_queries[self.cache_path] == 2
|
|
126
|
-
assert cache_stats.num_computes[self.cache_path] == 1
|
|
127
|
-
assert cache_stats.num_queries[follower_cache_path] == 0
|
|
128
|
-
assert cache_stats.num_computes[follower_cache_path] == 0
|
|
129
|
-
|
|
130
|
-
request_2 = {"name": "request2"}
|
|
131
|
-
compute_2 = lambda: {"response": "response2"}
|
|
132
|
-
|
|
133
|
-
response, cached = cache_with_follower.get(request_2, compute_2)
|
|
134
|
-
assert response == {"response": "response2"}
|
|
135
|
-
assert not cached
|
|
136
|
-
assert cache_stats.num_queries[self.cache_path] == 3
|
|
137
|
-
assert cache_stats.num_computes[self.cache_path] == 2
|
|
138
|
-
assert cache_stats.num_queries[follower_cache_path] == 0
|
|
139
|
-
assert cache_stats.num_computes[follower_cache_path] == 0
|
|
140
|
-
|
|
141
|
-
expected_dict = {
|
|
142
|
-
'{"name": "request1"}': {"response": "response1"},
|
|
143
|
-
'{"name": "request2"}': {"response": "response2"},
|
|
144
|
-
}
|
|
145
|
-
self.assertCountEqual(SqliteDict(follower_cache_path).items(), expected_dict.items())
|
|
146
|
-
|
|
147
100
|
def test_get_all_from_sqlite(self):
|
|
148
101
|
cache = Cache(SqliteCacheConfig(self.cache_path))
|
|
149
102
|
num_items = 10 # TODO: Inrcrease to 100
|
helm/common/test_general.py
CHANGED
|
@@ -1,5 +1,8 @@
|
|
|
1
1
|
import shutil
|
|
2
2
|
import os
|
|
3
|
+
|
|
4
|
+
import pytest
|
|
5
|
+
|
|
3
6
|
from helm.common.general import (
|
|
4
7
|
ensure_file_downloaded,
|
|
5
8
|
format_tags,
|
|
@@ -7,10 +10,12 @@ from helm.common.general import (
|
|
|
7
10
|
format_split,
|
|
8
11
|
get_file_name,
|
|
9
12
|
unique_simplification,
|
|
13
|
+
is_url,
|
|
10
14
|
)
|
|
11
15
|
|
|
12
16
|
|
|
13
17
|
def test_ensure_file_downloaded():
|
|
18
|
+
pytest.skip("Skipping download tests because these downloads are not reliable and may be throttled")
|
|
14
19
|
ensure_file_downloaded("https://ftp.gnu.org/gnu/tar/tar-1.34.tar.gz", "test-tar", unpack=True, unpack_type="untar")
|
|
15
20
|
assert os.path.isdir("test-tar")
|
|
16
21
|
shutil.rmtree("test-tar")
|
|
@@ -58,3 +63,8 @@ def test_unique_simplification():
|
|
|
58
63
|
|
|
59
64
|
def test_get_file_name():
|
|
60
65
|
assert get_file_name("/path/to/image.png") == "image.png"
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def test_is_url():
|
|
69
|
+
assert is_url("https://crfm.stanford.edu")
|
|
70
|
+
assert not is_url("/some/path")
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
import tempfile
|
|
3
|
+
import textwrap
|
|
4
|
+
import pathlib
|
|
5
|
+
from helm.benchmark import run
|
|
6
|
+
from typing import List, Optional
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class ArgvContext:
|
|
10
|
+
"""
|
|
11
|
+
Helper to assign a temporary value to sys.argv and then restore it
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
def __init__(self, argv: Optional[List[str]]):
|
|
15
|
+
self.argv = argv
|
|
16
|
+
self._original_argv: Optional[List[str]] = None
|
|
17
|
+
|
|
18
|
+
def __enter__(self):
|
|
19
|
+
self._original_argv = sys.argv[:]
|
|
20
|
+
sys.argv = self.argv or []
|
|
21
|
+
|
|
22
|
+
def __exit__(self, exc_type, exc_val, exc_tb):
|
|
23
|
+
assert self._original_argv is not None # Satisfies mypy
|
|
24
|
+
sys.argv = self._original_argv
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def test_run_with_custom_logging_config():
|
|
28
|
+
# Setup temporary directory
|
|
29
|
+
with tempfile.TemporaryDirectory(prefix="helm_test_") as tmp_dir_str:
|
|
30
|
+
tmp_dir = pathlib.Path(tmp_dir_str)
|
|
31
|
+
log_path = tmp_dir / "test.log"
|
|
32
|
+
log_config_path = tmp_dir / "test_config.yaml"
|
|
33
|
+
|
|
34
|
+
# Write custom YAML log config to file
|
|
35
|
+
log_config_text = textwrap.dedent(
|
|
36
|
+
f"""
|
|
37
|
+
version: 1
|
|
38
|
+
disable_existing_loggers: false
|
|
39
|
+
formatters:
|
|
40
|
+
simple:
|
|
41
|
+
datefmt: '%Y-%m-%dT%H:%M:%S'
|
|
42
|
+
format: '%(asctime)s %(levelname)s %(name)s %(message)s'
|
|
43
|
+
handlers:
|
|
44
|
+
file:
|
|
45
|
+
class: logging.FileHandler
|
|
46
|
+
filename: {log_path}
|
|
47
|
+
formatter: simple
|
|
48
|
+
level: DEBUG
|
|
49
|
+
mode: w
|
|
50
|
+
loggers:
|
|
51
|
+
helm:
|
|
52
|
+
handlers:
|
|
53
|
+
- file
|
|
54
|
+
level: DEBUG
|
|
55
|
+
propagate: false
|
|
56
|
+
"""
|
|
57
|
+
).strip()
|
|
58
|
+
|
|
59
|
+
log_config_path.write_text(log_config_text)
|
|
60
|
+
|
|
61
|
+
# Simulate command-line arguments
|
|
62
|
+
argv = [
|
|
63
|
+
"run.py", # Fake script name
|
|
64
|
+
"--run-entries",
|
|
65
|
+
"mmlu:subject=philosophy,model=openai/gpt2",
|
|
66
|
+
"-m",
|
|
67
|
+
"1",
|
|
68
|
+
"--suite",
|
|
69
|
+
"my-suite",
|
|
70
|
+
"--dry-run",
|
|
71
|
+
"--log-config",
|
|
72
|
+
str(log_config_path),
|
|
73
|
+
]
|
|
74
|
+
|
|
75
|
+
# Call main
|
|
76
|
+
with ArgvContext(argv):
|
|
77
|
+
run.main()
|
|
78
|
+
|
|
79
|
+
# Check log file contents
|
|
80
|
+
assert log_path.exists(), "Log file was not created"
|
|
81
|
+
log_contents = log_path.read_text()
|
|
82
|
+
|
|
83
|
+
# Test that log file was written to disk as requested
|
|
84
|
+
print("Log Contents")
|
|
85
|
+
print("------------")
|
|
86
|
+
print(log_contents)
|
|
87
|
+
|
|
88
|
+
assert (
|
|
89
|
+
"mscoco" in log_contents or "huggingface" in log_contents or "dry-run" in log_contents
|
|
90
|
+
), "Expected log content not found in log file:\n"
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
if __name__ == "__main__":
|
|
94
|
+
test_run_with_custom_logging_config()
|