crfm-helm 0.4.0__py3-none-any.whl → 0.5.10__py3-none-any.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crfm-helm might be problematic. Click here for more details.
- crfm_helm-0.5.10.dist-info/METADATA +369 -0
- crfm_helm-0.5.10.dist-info/RECORD +1008 -0
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +80 -29
- helm/benchmark/adaptation/adapters/adapter.py +2 -2
- helm/benchmark/adaptation/adapters/adapter_factory.py +39 -28
- helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
- helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
- helm/benchmark/adaptation/adapters/generation_adapter.py +2 -1
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +24 -8
- helm/benchmark/adaptation/adapters/language_modeling_adapter.py +3 -4
- helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +2 -1
- helm/benchmark/adaptation/adapters/multimodal/multimodal_prompt.py +7 -0
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +112 -0
- helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +6 -3
- helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +3 -1
- helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +18 -8
- helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
- helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/test_adapter.py +5 -4
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +46 -22
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +17 -29
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +138 -16
- helm/benchmark/adaptation/common_adapter_specs.py +443 -0
- helm/benchmark/adaptation/prompt.py +1 -1
- helm/benchmark/adaptation/request_state.py +6 -1
- helm/benchmark/adaptation/scenario_state.py +6 -2
- helm/benchmark/annotation/aci_bench_annotator.py +84 -0
- helm/benchmark/annotation/air_bench_annotator.py +79 -0
- helm/benchmark/annotation/alrage_annotator.py +90 -0
- helm/benchmark/annotation/annotator.py +48 -0
- helm/benchmark/annotation/annotator_factory.py +50 -0
- helm/benchmark/annotation/anthropic_red_team_annotator.py +57 -0
- helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
- helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
- helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
- helm/benchmark/annotation/bird_sql_annotator.py +58 -0
- helm/benchmark/annotation/call_center_annotator.py +258 -0
- helm/benchmark/annotation/chw_care_plan_annotator.py +82 -0
- helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
- helm/benchmark/annotation/dischargeme_annotator.py +96 -0
- helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
- helm/benchmark/annotation/financebench_annotator.py +79 -0
- helm/benchmark/annotation/harm_bench_annotator.py +55 -0
- helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
- helm/benchmark/annotation/image2struct/image_compiler_annotator.py +93 -0
- helm/benchmark/annotation/image2struct/latex_compiler_annotator.py +59 -0
- helm/benchmark/annotation/image2struct/lilypond_compiler_annotator.py +86 -0
- helm/benchmark/annotation/image2struct/webpage_compiler_annotator.py +132 -0
- helm/benchmark/annotation/live_qa_annotator.py +76 -0
- helm/benchmark/annotation/med_dialog_annotator.py +88 -0
- helm/benchmark/annotation/medalign_annotator.py +89 -0
- helm/benchmark/annotation/medi_qa_annotator.py +87 -0
- helm/benchmark/annotation/medication_qa_annotator.py +86 -0
- helm/benchmark/annotation/mental_health_annotator.py +87 -0
- helm/benchmark/annotation/mimic_bhc_annotator.py +89 -0
- helm/benchmark/annotation/mimic_rrs_annotator.py +89 -0
- helm/benchmark/annotation/model_as_judge.py +309 -0
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +87 -0
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +90 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
- helm/benchmark/annotation/omni_math_annotator.py +131 -0
- helm/benchmark/annotation/simple_safety_tests_annotator.py +50 -0
- helm/benchmark/annotation/spider_annotator.py +18 -0
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +87 -0
- helm/benchmark/annotation/test_annotator_factory.py +26 -0
- helm/benchmark/annotation/test_dummy_annotator.py +44 -0
- helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
- helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
- helm/benchmark/annotation/wildbench_annotator.py +119 -0
- helm/benchmark/annotation/xstest_annotator.py +100 -0
- helm/benchmark/annotation_executor.py +144 -0
- helm/benchmark/augmentations/cleva_perturbation.py +9 -8
- helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
- helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
- helm/benchmark/augmentations/data_augmenter.py +0 -2
- helm/benchmark/augmentations/dialect_perturbation.py +4 -5
- helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
- helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
- helm/benchmark/augmentations/gender_perturbation.py +3 -3
- helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
- helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
- helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
- helm/benchmark/augmentations/person_name_perturbation.py +4 -5
- helm/benchmark/augmentations/perturbation.py +26 -4
- helm/benchmark/augmentations/perturbation_description.py +1 -1
- helm/benchmark/augmentations/space_perturbation.py +2 -2
- helm/benchmark/augmentations/suffix_perturbation.py +29 -0
- helm/benchmark/augmentations/synonym_perturbation.py +4 -3
- helm/benchmark/augmentations/test_perturbation.py +56 -19
- helm/benchmark/augmentations/translate_perturbation.py +31 -0
- helm/benchmark/augmentations/typos_perturbation.py +2 -2
- helm/benchmark/config_registry.py +7 -1
- helm/benchmark/data_preprocessor.py +2 -2
- helm/benchmark/executor.py +54 -25
- helm/benchmark/huggingface_registration.py +28 -10
- helm/benchmark/metrics/air_bench_metrics.py +3212 -0
- helm/benchmark/metrics/alrage_metric.py +35 -0
- helm/benchmark/metrics/annotation_metrics.py +108 -0
- helm/benchmark/metrics/basic_metrics.py +437 -667
- helm/benchmark/metrics/bbq_metrics.py +17 -6
- helm/benchmark/metrics/bias_metrics.py +18 -9
- helm/benchmark/metrics/bias_word_lists.py +1 -1
- helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
- helm/benchmark/metrics/bird_sql_metrics.py +28 -0
- helm/benchmark/metrics/classification_metrics.py +107 -22
- helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
- helm/benchmark/metrics/cleva_harms_metrics.py +12 -11
- helm/benchmark/metrics/code_metrics.py +5 -5
- helm/benchmark/metrics/code_metrics_helper.py +11 -3
- helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
- helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
- helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
- helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
- helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
- helm/benchmark/metrics/comet_metric.py +125 -0
- helm/benchmark/metrics/common_metric_specs.py +174 -0
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +83 -0
- helm/benchmark/metrics/copyright_metrics.py +5 -5
- helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
- helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
- helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
- helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
- helm/benchmark/metrics/disinformation_metrics.py +8 -114
- helm/benchmark/metrics/dry_run_metrics.py +35 -6
- helm/benchmark/metrics/efficiency_metrics.py +287 -0
- helm/benchmark/metrics/ehr_sql_metrics.py +159 -0
- helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
- helm/benchmark/metrics/evaluate_reference_metrics.py +831 -0
- helm/benchmark/metrics/fin_qa_metrics.py +60 -0
- helm/benchmark/metrics/fin_qa_metrics_helper.py +398 -0
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +115 -0
- helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
- helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
- helm/benchmark/metrics/gpt4v_originality_critique_metrics.py +126 -0
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +48 -0
- helm/benchmark/metrics/ifeval/instructions.py +1574 -0
- helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
- helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
- helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
- helm/benchmark/metrics/ifeval_metrics.py +67 -0
- helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
- helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
- helm/benchmark/metrics/image_generation/clip_score_metrics.py +84 -0
- helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
- helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
- helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
- helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
- helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
- helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
- helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
- helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
- helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
- helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
- helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
- helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
- helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
- helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
- helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
- helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
- helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
- helm/benchmark/metrics/image_generation/q16/test_q16.py +20 -0
- helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
- helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
- helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
- helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
- helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
- helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
- helm/benchmark/metrics/instruction_following_critique_metrics.py +48 -5
- helm/benchmark/metrics/kpi_edgar_metrics.py +142 -0
- helm/benchmark/metrics/language_modeling_metrics.py +111 -0
- helm/benchmark/metrics/live_qa_metrics.py +35 -0
- helm/benchmark/metrics/llm_jury_metrics.py +58 -0
- helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
- helm/benchmark/metrics/lmkt_metrics.py +47 -0
- helm/benchmark/metrics/machine_translation_metrics.py +89 -0
- helm/benchmark/metrics/medcalc_bench_metrics.py +137 -0
- helm/benchmark/metrics/medec_metrics.py +124 -0
- helm/benchmark/metrics/melt_bias_metric.py +234 -0
- helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
- helm/benchmark/metrics/melt_metric_specs.py +43 -0
- helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
- helm/benchmark/metrics/metric.py +121 -175
- helm/benchmark/metrics/metric_name.py +0 -1
- helm/benchmark/metrics/metric_service.py +23 -7
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +127 -0
- helm/benchmark/metrics/nltk_helper.py +32 -0
- helm/benchmark/metrics/omni_math_metrics.py +44 -0
- helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
- helm/benchmark/metrics/output_processing_metric.py +60 -0
- helm/benchmark/metrics/output_processors.py +15 -0
- helm/benchmark/metrics/paraphrase_generation_metrics.py +5 -6
- helm/benchmark/metrics/prometheus_vision_critique_metrics.py +185 -0
- helm/benchmark/metrics/ranking_metrics.py +5 -5
- helm/benchmark/metrics/reference_metric.py +148 -0
- helm/benchmark/metrics/reka_vibe_critique_metrics.py +158 -0
- helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
- helm/benchmark/metrics/safety_metrics.py +91 -0
- helm/benchmark/metrics/seahelm_metrics.py +201 -0
- helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
- helm/benchmark/metrics/spider_metrics.py +7 -0
- helm/benchmark/metrics/statistic.py +1 -1
- helm/benchmark/metrics/summac/model_summac.py +8 -11
- helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
- helm/benchmark/metrics/summarization_metrics.py +150 -11
- helm/benchmark/metrics/test_bias_metrics.py +5 -1
- helm/benchmark/metrics/test_classification_metrics.py +145 -70
- helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
- helm/benchmark/metrics/{test_basic_metrics.py → test_evaluate_reference_metrics.py} +20 -1
- helm/benchmark/metrics/test_metric.py +3 -3
- helm/benchmark/metrics/test_statistic.py +2 -2
- helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
- helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +11 -3
- helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +3 -3
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +7 -7
- helm/benchmark/metrics/toxicity_metrics.py +37 -7
- helm/benchmark/metrics/toxicity_utils.py +23 -0
- helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py +52 -0
- helm/benchmark/metrics/unitxt_metrics.py +107 -0
- helm/benchmark/metrics/vision_language/__init__.py +0 -0
- helm/benchmark/metrics/vision_language/emd_utils.py +347 -0
- helm/benchmark/metrics/vision_language/image_metrics.py +537 -0
- helm/benchmark/metrics/vision_language/image_utils.py +100 -0
- helm/benchmark/metrics/wildbench_metrics.py +54 -0
- helm/benchmark/model_deployment_registry.py +69 -5
- helm/benchmark/model_metadata_registry.py +58 -2
- helm/benchmark/multi_gpu_runner.py +133 -0
- helm/benchmark/presentation/contamination.py +3 -3
- helm/benchmark/presentation/create_plots.py +51 -20
- helm/benchmark/presentation/run_display.py +51 -12
- helm/benchmark/presentation/run_entry.py +2 -2
- helm/benchmark/presentation/schema.py +83 -66
- helm/benchmark/presentation/summarize.py +483 -388
- helm/benchmark/presentation/table.py +8 -8
- helm/benchmark/presentation/taxonomy_info.py +20 -0
- helm/benchmark/presentation/test_contamination.py +2 -2
- helm/benchmark/presentation/test_create_plots.py +4 -1
- helm/benchmark/presentation/test_run_entry.py +2 -2
- helm/benchmark/presentation/test_schema.py +11 -0
- helm/benchmark/presentation/test_summarize.py +148 -6
- helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
- helm/benchmark/reeval_run.py +202 -0
- helm/benchmark/reeval_runner.py +355 -0
- helm/benchmark/run.py +151 -87
- helm/benchmark/run_expander.py +418 -33
- helm/benchmark/run_spec.py +93 -0
- helm/benchmark/run_spec_factory.py +180 -0
- helm/benchmark/run_specs/__init__.py +0 -0
- helm/benchmark/run_specs/air_bench_run_specs.py +58 -0
- helm/benchmark/run_specs/arabic_run_specs.py +197 -0
- helm/benchmark/run_specs/audio_run_specs.py +657 -0
- helm/benchmark/run_specs/bluex_run_specs.py +40 -0
- helm/benchmark/run_specs/call_center_run_specs.py +201 -0
- helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
- helm/benchmark/run_specs/classic_run_specs.py +1393 -0
- helm/benchmark/run_specs/cleva_run_specs.py +277 -0
- helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
- helm/benchmark/run_specs/decodingtrust_run_specs.py +316 -0
- helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
- helm/benchmark/run_specs/enterprise_run_specs.py +280 -0
- helm/benchmark/run_specs/experimental_run_specs.py +224 -0
- helm/benchmark/run_specs/finance_run_specs.py +114 -0
- helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
- helm/benchmark/run_specs/heim_run_specs.py +625 -0
- helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
- helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
- helm/benchmark/run_specs/lite_run_specs.py +307 -0
- helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
- helm/benchmark/run_specs/long_context_run_specs.py +188 -0
- helm/benchmark/run_specs/medhelm/__init__.py +0 -0
- helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +1570 -0
- helm/benchmark/run_specs/melt_run_specs.py +783 -0
- helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
- helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
- helm/benchmark/run_specs/oab_exams_specs.py +32 -0
- helm/benchmark/run_specs/safety_run_specs.py +191 -0
- helm/benchmark/run_specs/seahelm_run_specs.py +652 -0
- helm/benchmark/run_specs/simple_run_specs.py +104 -0
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +167 -0
- helm/benchmark/run_specs/sql_run_specs.py +54 -0
- helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
- helm/benchmark/run_specs/unitxt_run_specs.py +51 -0
- helm/benchmark/run_specs/vlm_run_specs.py +1057 -0
- helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
- helm/benchmark/runner.py +63 -62
- helm/benchmark/runner_config_registry.py +21 -0
- helm/benchmark/scenarios/aci_bench_scenario.py +149 -0
- helm/benchmark/scenarios/air_bench_scenario.py +76 -0
- helm/benchmark/scenarios/alghafa_scenario.py +126 -0
- helm/benchmark/scenarios/alrage_scenario.py +54 -0
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +27 -3
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +82 -0
- helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
- helm/benchmark/scenarios/arabic_mmlu_scenario.py +82 -0
- helm/benchmark/scenarios/aratrust_scenario.py +95 -0
- helm/benchmark/scenarios/audio_language/__init__.py +0 -0
- helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +130 -0
- helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
- helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
- helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
- helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
- helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
- helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
- helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
- helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
- helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
- helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +74 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +70 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +79 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +78 -0
- helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
- helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +105 -0
- helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
- helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
- helm/benchmark/scenarios/babi_qa_scenario.py +16 -1
- helm/benchmark/scenarios/banking77_scenario.py +77 -0
- helm/benchmark/scenarios/bbq_scenario.py +17 -2
- helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
- helm/benchmark/scenarios/big_bench_scenario.py +11 -1
- helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
- helm/benchmark/scenarios/bird_sql_scenario.py +112 -0
- helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
- helm/benchmark/scenarios/blimp_scenario.py +1 -1
- helm/benchmark/scenarios/bluex_scenario.py +70 -0
- helm/benchmark/scenarios/bold_scenario.py +18 -3
- helm/benchmark/scenarios/boolq_scenario.py +21 -1
- helm/benchmark/scenarios/call_center_scenario.py +84 -0
- helm/benchmark/scenarios/casehold_scenario.py +79 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +129 -0
- helm/benchmark/scenarios/ci_mcqa_scenario.py +80 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +14 -1
- helm/benchmark/scenarios/clear_scenario.py +180 -0
- helm/benchmark/scenarios/cleva_scenario.py +482 -3
- helm/benchmark/scenarios/code_scenario.py +46 -4
- helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
- helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
- helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
- helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
- helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
- helm/benchmark/scenarios/commonsense_scenario.py +33 -1
- helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +118 -0
- helm/benchmark/scenarios/copyright_scenario.py +35 -1
- helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +261 -0
- helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
- helm/benchmark/scenarios/czech_bank_qa_scenario.py +148 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +190 -0
- helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +143 -0
- helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +98 -0
- helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +344 -0
- helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +217 -0
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +571 -0
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +80 -0
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +90 -0
- helm/benchmark/scenarios/dialogue_scenarios.py +13 -3
- helm/benchmark/scenarios/dischargeme_scenario.py +196 -0
- helm/benchmark/scenarios/disinformation_scenario.py +32 -1
- helm/benchmark/scenarios/dyck_language_scenario.py +25 -1
- helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
- helm/benchmark/scenarios/ehr_sql_scenario.py +137 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +1541 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +77 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +33 -3
- helm/benchmark/scenarios/entity_matching_scenario.py +26 -2
- helm/benchmark/scenarios/ewok_scenario.py +116 -0
- helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
- helm/benchmark/scenarios/fin_qa_scenario.py +139 -0
- helm/benchmark/scenarios/financebench_scenario.py +74 -0
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +115 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +145 -0
- helm/benchmark/scenarios/gpqa_scenario.py +98 -0
- helm/benchmark/scenarios/grammar.py +2 -2
- helm/benchmark/scenarios/grammar_scenario.py +21 -2
- helm/benchmark/scenarios/gsm_scenario.py +31 -1
- helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +61 -0
- helm/benchmark/scenarios/harm_bench_scenario.py +70 -0
- helm/benchmark/scenarios/headqa_scenario.py +158 -0
- helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +50 -0
- helm/benchmark/scenarios/ice_scenario.py +28 -4
- helm/benchmark/scenarios/ifeval_scenario.py +71 -0
- helm/benchmark/scenarios/image_generation/__init__.py +0 -0
- helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
- helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
- helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
- helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
- helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
- helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
- helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
- helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
- helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
- helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
- helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
- helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
- helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
- helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
- helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
- helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
- helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
- helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
- helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
- helm/benchmark/scenarios/imdb_scenario.py +26 -3
- helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +111 -0
- helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
- helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +98 -0
- helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
- helm/benchmark/scenarios/koala_scenario.py +21 -1
- helm/benchmark/scenarios/kpi_edgar_scenario.py +172 -0
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +149 -0
- helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +61 -1
- helm/benchmark/scenarios/legal_support_scenario.py +24 -1
- helm/benchmark/scenarios/legalbench_scenario.py +45 -3
- helm/benchmark/scenarios/lex_glue_scenario.py +23 -2
- helm/benchmark/scenarios/lextreme_scenario.py +22 -1
- helm/benchmark/scenarios/live_qa_scenario.py +94 -0
- helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
- helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
- helm/benchmark/scenarios/lsat_qa_scenario.py +15 -1
- helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
- helm/benchmark/scenarios/math_scenario.py +81 -22
- helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
- helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
- helm/benchmark/scenarios/med_dialog_scenario.py +56 -22
- helm/benchmark/scenarios/med_mcqa_scenario.py +24 -1
- helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
- helm/benchmark/scenarios/med_qa_scenario.py +30 -1
- helm/benchmark/scenarios/medalign_scenario.py +117 -0
- helm/benchmark/scenarios/medalign_scenario_helper.py +326 -0
- helm/benchmark/scenarios/medbullets_scenario.py +167 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +149 -0
- helm/benchmark/scenarios/medec_scenario.py +148 -0
- helm/benchmark/scenarios/medhallu_scenario.py +95 -0
- helm/benchmark/scenarios/medhelm/__init__.py +0 -0
- helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
- helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +134 -0
- helm/benchmark/scenarios/medication_qa_scenario.py +96 -0
- helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
- helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
- helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
- helm/benchmark/scenarios/melt_scenarios.py +793 -0
- helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
- helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
- helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
- helm/benchmark/scenarios/mental_health_scenario.py +146 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +127 -0
- helm/benchmark/scenarios/mimic_rrs_scenario.py +121 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +99 -0
- helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +113 -0
- helm/benchmark/scenarios/mmlu_scenario.py +32 -1
- helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
- helm/benchmark/scenarios/msmarco_scenario.py +31 -1
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +166 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +164 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +297 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +20 -1
- helm/benchmark/scenarios/natural_qa_scenario.py +33 -1
- helm/benchmark/scenarios/newsqa_scenario.py +1 -1
- helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
- helm/benchmark/scenarios/omni_math_scenario.py +71 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +33 -2
- helm/benchmark/scenarios/openai_mrcr_scenario.py +94 -0
- helm/benchmark/scenarios/opinions_qa_scenario.py +1 -5
- helm/benchmark/scenarios/pubmed_qa_scenario.py +81 -43
- helm/benchmark/scenarios/quac_scenario.py +24 -1
- helm/benchmark/scenarios/race_based_med_scenario.py +175 -0
- helm/benchmark/scenarios/raft_scenario.py +33 -3
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
- helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
- helm/benchmark/scenarios/ruler_qa_scenarios.py +128 -0
- helm/benchmark/scenarios/scenario.py +44 -1
- helm/benchmark/scenarios/seahelm_scenario.py +2295 -0
- helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +97 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +95 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +99 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +98 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +94 -0
- helm/benchmark/scenarios/shc_privacy_scenario.py +100 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +98 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +104 -0
- helm/benchmark/scenarios/shc_sei_scenario.py +94 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +98 -0
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +44 -0
- helm/benchmark/scenarios/simple_scenarios.py +122 -1
- helm/benchmark/scenarios/situation_prompts.yaml +49 -0
- helm/benchmark/scenarios/spider_scenario.py +109 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +119 -0
- helm/benchmark/scenarios/summarization_scenario.py +48 -1
- helm/benchmark/scenarios/sumosum_scenario.py +157 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +24 -1
- helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
- helm/benchmark/scenarios/test_air_bench_scenario.py +27 -0
- helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
- helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
- helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
- helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
- helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
- helm/benchmark/scenarios/test_commonsense_scenario.py +21 -0
- helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
- helm/benchmark/scenarios/test_ewok_scenario.py +29 -0
- helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
- helm/benchmark/scenarios/test_financebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
- helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
- helm/benchmark/scenarios/test_gsm_scenario.py +31 -0
- helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
- helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
- helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
- helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
- helm/benchmark/scenarios/test_legalbench_scenario.py +30 -0
- helm/benchmark/scenarios/test_math_scenario.py +4 -3
- helm/benchmark/scenarios/test_med_qa_scenario.py +30 -0
- helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
- helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
- helm/benchmark/scenarios/test_mmlu_scenario.py +33 -0
- helm/benchmark/scenarios/test_narrativeqa_scenario.py +73 -0
- helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
- helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
- helm/benchmark/scenarios/test_scenario.py +6 -3
- helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
- helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
- helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
- helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +239 -0
- helm/benchmark/scenarios/the_pile_scenario.py +13 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +26 -2
- helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
- helm/benchmark/scenarios/unitxt_scenario.py +62 -0
- helm/benchmark/scenarios/verifiability_judgment_scenario.py +4 -2
- helm/benchmark/scenarios/vicuna_scenario.py +22 -2
- helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +83 -0
- helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
- helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
- helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +135 -0
- helm/benchmark/scenarios/vision_language/exams_v_scenario.py +104 -0
- helm/benchmark/scenarios/vision_language/fair_face_scenario.py +136 -0
- helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +74 -0
- helm/benchmark/scenarios/vision_language/gqa_scenario.py +91 -0
- helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +94 -0
- helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
- helm/benchmark/scenarios/vision_language/image2struct/__init__.py +0 -0
- helm/benchmark/scenarios/vision_language/image2struct/chart2csv_scenario.py +55 -0
- helm/benchmark/scenarios/vision_language/image2struct/image2struct_scenario.py +225 -0
- helm/benchmark/scenarios/vision_language/image2struct/latex_scenario.py +21 -0
- helm/benchmark/scenarios/vision_language/image2struct/musicsheet_scenario.py +16 -0
- helm/benchmark/scenarios/vision_language/image2struct/utils_latex.py +339 -0
- helm/benchmark/scenarios/vision_language/image2struct/webpage/__init__.py +0 -0
- helm/benchmark/scenarios/vision_language/image2struct/webpage/driver.py +84 -0
- helm/benchmark/scenarios/vision_language/image2struct/webpage/jekyll_server.py +182 -0
- helm/benchmark/scenarios/vision_language/image2struct/webpage/utils.py +31 -0
- helm/benchmark/scenarios/vision_language/image2struct/webpage_scenario.py +256 -0
- helm/benchmark/scenarios/vision_language/math_vista_scenario.py +117 -0
- helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
- helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +103 -0
- helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
- helm/benchmark/scenarios/vision_language/mme_scenario.py +148 -0
- helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
- helm/benchmark/scenarios/vision_language/mscoco_captioning_scenario.py +92 -0
- helm/benchmark/scenarios/vision_language/mscoco_categorization_scenario.py +117 -0
- helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
- helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
- helm/benchmark/scenarios/vision_language/originality_scenario.py +35 -0
- helm/benchmark/scenarios/vision_language/pairs_scenario.py +247 -0
- helm/benchmark/scenarios/vision_language/pope_scenario.py +105 -0
- helm/benchmark/scenarios/vision_language/real_world_qa_scenario.py +57 -0
- helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +131 -0
- helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
- helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +98 -0
- helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +4 -5
- helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
- helm/benchmark/scenarios/vision_language/vqa_scenario.py +8 -4
- helm/benchmark/scenarios/wikifact_scenario.py +31 -1
- helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
- helm/benchmark/scenarios/wildbench_scenario.py +101 -0
- helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +33 -2
- helm/benchmark/scenarios/xstest_scenario.py +35 -0
- helm/benchmark/server.py +32 -2
- helm/benchmark/slurm_jobs.py +1 -2
- helm/benchmark/slurm_runner.py +78 -50
- helm/benchmark/static/schema_air_bench.yaml +3149 -0
- helm/benchmark/static/schema_arabic.yaml +271 -0
- helm/benchmark/static/schema_audio.yaml +763 -0
- helm/benchmark/static/schema_autobencher.yaml +150 -0
- helm/benchmark/static/schema_call_center.yaml +269 -0
- helm/benchmark/static/schema_capabilities.yaml +254 -0
- helm/benchmark/static/schema_classic.yaml +259 -1140
- helm/benchmark/static/schema_cleva.yaml +768 -0
- helm/benchmark/static/schema_czech_bank.yaml +148 -0
- helm/benchmark/static/schema_decodingtrust.yaml +444 -0
- helm/benchmark/static/schema_enem_challenge.yaml +146 -0
- helm/benchmark/static/schema_enterprise.yaml +319 -0
- helm/benchmark/static/schema_ewok.yaml +367 -0
- helm/benchmark/static/schema_finance.yaml +191 -0
- helm/benchmark/static/schema_heim.yaml +1389 -0
- helm/benchmark/static/schema_image2struct.yaml +588 -0
- helm/benchmark/static/schema_instruction_following.yaml +161 -0
- helm/benchmark/static/schema_legal.yaml +566 -0
- helm/benchmark/static/schema_lite.yaml +3 -286
- helm/benchmark/static/schema_long_context.yaml +282 -0
- helm/benchmark/static/schema_medhelm.yaml +1176 -0
- helm/benchmark/static/schema_melt.yaml +1257 -0
- helm/benchmark/static/schema_mmlu.yaml +1449 -0
- helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
- helm/benchmark/static/schema_safety.yaml +283 -0
- helm/benchmark/static/schema_seahelm.yaml +723 -0
- helm/benchmark/static/schema_slp.yaml +219 -0
- helm/benchmark/static/schema_slphelm.yaml +162 -0
- helm/benchmark/static/schema_social_audio.yaml +224 -0
- helm/benchmark/static/schema_sql.yaml +171 -0
- helm/benchmark/static/schema_thai.yaml +244 -0
- helm/benchmark/static/schema_torr.yaml +474 -0
- helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
- helm/benchmark/static/schema_unitxt.yaml +370 -0
- helm/benchmark/static/schema_vhelm.yaml +933 -0
- helm/benchmark/static/schema_vhelm_lite.yaml +109 -0
- helm/benchmark/static/schema_video.yaml +219 -0
- helm/benchmark/static_build/assets/air-overview-DpBbyagA.png +0 -0
- helm/benchmark/static_build/assets/audio-table-Dn5NMMeJ.png +0 -0
- helm/benchmark/static_build/assets/heim-logo-BJtQlEbV.png +0 -0
- helm/benchmark/static_build/assets/helm-safety-COfndXuS.png +0 -0
- helm/benchmark/static_build/assets/helmhero-D9TvmJsp.png +0 -0
- helm/benchmark/static_build/assets/index-oIeiQW2g.css +1 -0
- helm/benchmark/static_build/assets/index-qOFpOyHb.js +10 -0
- helm/benchmark/static_build/assets/medhelm-overview-CND0EIsy.png +0 -0
- helm/benchmark/static_build/assets/medhelm-v1-overview-Cu2tphBB.png +0 -0
- helm/benchmark/static_build/assets/overview-BwypNWnk.png +0 -0
- helm/benchmark/static_build/assets/process-flow-DWDJC733.png +0 -0
- helm/benchmark/static_build/assets/react-BteFIppM.js +85 -0
- helm/benchmark/static_build/assets/recharts-DxuQtTOs.js +97 -0
- helm/benchmark/static_build/assets/tremor-DR4fE7ko.js +10 -0
- helm/benchmark/static_build/assets/vhelm-aspects-NiDQofvP.png +0 -0
- helm/benchmark/static_build/assets/vhelm-framework-NxJE4fdA.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-ypCL5Yvq.png +0 -0
- helm/benchmark/static_build/config.js +4 -0
- helm/benchmark/static_build/index.html +19 -0
- helm/benchmark/test_data_preprocessor.py +3 -3
- helm/benchmark/test_run_expander.py +1 -1
- helm/benchmark/window_services/default_window_service.py +3 -45
- helm/benchmark/window_services/encoder_decoder_window_service.py +4 -15
- helm/benchmark/window_services/ice_window_service.py +1 -35
- helm/benchmark/window_services/image_generation/__init__.py +0 -0
- helm/benchmark/window_services/image_generation/clip_window_service.py +13 -0
- helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
- helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
- helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
- helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
- helm/benchmark/window_services/local_window_service.py +22 -5
- helm/benchmark/window_services/test_anthropic_window_service.py +5 -4
- helm/benchmark/window_services/test_bloom_window_service.py +5 -4
- helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
- helm/benchmark/window_services/test_gpt2_window_service.py +9 -4
- helm/benchmark/window_services/test_gpt4_window_service.py +10 -4
- helm/benchmark/window_services/test_gptj_window_service.py +11 -5
- helm/benchmark/window_services/test_gptneox_window_service.py +6 -5
- helm/benchmark/window_services/test_openai_window_service.py +18 -12
- helm/benchmark/window_services/test_opt_window_service.py +6 -5
- helm/benchmark/window_services/test_palmyra_window_service.py +5 -4
- helm/benchmark/window_services/test_t0pp_window_service.py +5 -4
- helm/benchmark/window_services/test_t511b_window_service.py +5 -4
- helm/benchmark/window_services/test_ul2_window_service.py +5 -4
- helm/benchmark/window_services/test_utils.py +6 -6
- helm/benchmark/window_services/test_yalm_window_service.py +5 -4
- helm/benchmark/window_services/tokenizer_service.py +7 -13
- helm/benchmark/window_services/window_service.py +42 -0
- helm/benchmark/window_services/window_service_factory.py +4 -1
- helm/benchmark/window_services/yalm_window_service.py +1 -28
- helm/clients/__init__.py +0 -0
- helm/{proxy/clients → clients}/ai21_client.py +78 -12
- helm/clients/aleph_alpha_client.py +114 -0
- helm/{proxy/clients → clients}/anthropic_client.py +304 -21
- helm/clients/audio_language/__init__.py +0 -0
- helm/clients/audio_language/diva_llama_client.py +122 -0
- helm/clients/audio_language/llama_omni/arguments.py +61 -0
- helm/clients/audio_language/llama_omni/constants.py +9 -0
- helm/clients/audio_language/llama_omni/conversation.py +213 -0
- helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
- helm/clients/audio_language/llama_omni/model/builder.py +88 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
- helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
- helm/clients/audio_language/llama_omni/preprocess.py +295 -0
- helm/clients/audio_language/llama_omni/utils.py +202 -0
- helm/clients/audio_language/llama_omni_client.py +199 -0
- helm/clients/audio_language/qwen2_5_omni_client.py +210 -0
- helm/clients/audio_language/qwen2_audiolm_client.py +191 -0
- helm/clients/audio_language/qwen_audiolm_client.py +153 -0
- helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
- helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
- helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
- helm/clients/audio_language/test.py +62 -0
- helm/{proxy/clients → clients}/auto_client.py +72 -31
- helm/clients/azure_openai_client.py +55 -0
- helm/clients/bedrock_client.py +381 -0
- helm/clients/bedrock_utils.py +105 -0
- helm/{proxy/clients → clients}/client.py +92 -17
- helm/clients/clip_score_client.py +49 -0
- helm/clients/clip_scorers/__init__.py +0 -0
- helm/clients/clip_scorers/base_clip_scorer.py +18 -0
- helm/clients/clip_scorers/clip_scorer.py +50 -0
- helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
- helm/{proxy/clients → clients}/cohere_client.py +105 -14
- helm/clients/dspy_client.py +135 -0
- helm/clients/gcs_client.py +82 -0
- helm/{proxy/clients → clients}/google_client.py +8 -6
- helm/clients/google_translate_client.py +35 -0
- helm/clients/grok_client.py +36 -0
- helm/{proxy/clients → clients}/http_model_client.py +8 -8
- helm/{proxy/clients → clients}/huggingface_client.py +157 -86
- helm/clients/huggingface_pipeline_client.py +138 -0
- helm/clients/ibm_client.py +269 -0
- helm/clients/image_generation/__init__.py +0 -0
- helm/clients/image_generation/adobe_vision_client.py +80 -0
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +100 -0
- helm/clients/image_generation/cogview2/__init__.py +0 -0
- helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
- helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
- helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
- helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +99 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
- helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +144 -0
- helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
- helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
- helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
- helm/clients/image_generation/cogview2_client.py +192 -0
- helm/clients/image_generation/dalle2_client.py +194 -0
- helm/clients/image_generation/dalle3_client.py +108 -0
- helm/clients/image_generation/dalle_mini/__init__.py +3 -0
- helm/clients/image_generation/dalle_mini/data.py +442 -0
- helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
- helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
- helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
- helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
- helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
- helm/clients/image_generation/dalle_mini/model/text.py +251 -0
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
- helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
- helm/clients/image_generation/dalle_mini_client.py +191 -0
- helm/clients/image_generation/deep_floyd_client.py +80 -0
- helm/clients/image_generation/huggingface_diffusers_client.py +250 -0
- helm/clients/image_generation/image_generation_client_utils.py +9 -0
- helm/clients/image_generation/lexica_client.py +88 -0
- helm/clients/image_generation/mindalle/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/__init__.py +216 -0
- helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
- helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
- helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
- helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
- helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
- helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
- helm/clients/image_generation/mindalle/utils/config.py +129 -0
- helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
- helm/clients/image_generation/mindalle/utils/utils.py +89 -0
- helm/clients/image_generation/mindalle_client.py +116 -0
- helm/clients/image_generation/nudity_check_client.py +64 -0
- helm/clients/image_generation/together_image_generation_client.py +113 -0
- helm/{proxy/clients → clients}/lit_gpt_client.py +6 -6
- helm/{proxy/clients → clients}/megatron_client.py +7 -5
- helm/clients/mistral_client.py +180 -0
- helm/clients/moderation_api_client.py +111 -0
- helm/clients/nvidia_nim_client.py +32 -0
- helm/clients/open_lm_client.py +43 -0
- helm/clients/openai_client.py +604 -0
- helm/clients/openai_responses_client.py +200 -0
- helm/clients/openrouter_client.py +31 -0
- helm/{proxy/clients → clients}/palmyra_client.py +31 -14
- helm/{proxy/clients → clients}/perspective_api_client.py +18 -14
- helm/clients/reka_client.py +190 -0
- helm/clients/simple_client.py +64 -0
- helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
- helm/clients/stanfordhealthcare_claude_client.py +31 -0
- helm/clients/stanfordhealthcare_google_client.py +43 -0
- helm/clients/stanfordhealthcare_http_model_client.py +95 -0
- helm/clients/stanfordhealthcare_openai_client.py +62 -0
- helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
- helm/{proxy/clients → clients}/test_auto_client.py +13 -15
- helm/clients/test_client.py +98 -0
- helm/{proxy/clients → clients}/test_huggingface_client.py +31 -16
- helm/clients/test_openrouter_client.py +69 -0
- helm/clients/test_simple_client.py +19 -0
- helm/clients/test_together_client.py +184 -0
- helm/clients/together_client.py +599 -0
- helm/clients/upstage_client.py +23 -0
- helm/clients/vertexai_client.py +488 -0
- helm/clients/vision_language/__init__.py +0 -0
- helm/clients/vision_language/huggingface_vision2seq_client.py +148 -0
- helm/clients/vision_language/huggingface_vlm_client.py +114 -0
- helm/{proxy/clients → clients}/vision_language/idefics_client.py +61 -51
- helm/clients/vision_language/open_flamingo/__init__.py +2 -0
- helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
- helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
- helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
- helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
- helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
- helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
- helm/clients/vision_language/open_flamingo_client.py +155 -0
- helm/clients/vision_language/paligemma_client.py +147 -0
- helm/clients/vision_language/palmyra_vision_client.py +101 -0
- helm/clients/vision_language/qwen2_vlm_client.py +189 -0
- helm/clients/vision_language/qwen_vlm_client.py +174 -0
- helm/clients/vllm_client.py +80 -0
- helm/clients/vllm_granite_thinking_client.py +56 -0
- helm/clients/writer_client.py +105 -0
- helm/clients/yi_client.py +28 -0
- helm/common/audio_utils.py +111 -0
- helm/common/cache.py +23 -33
- helm/common/cache_backend_config.py +47 -0
- helm/common/clip_score_request.py +41 -0
- helm/common/context.py +80 -0
- helm/common/credentials_utils.py +5 -5
- helm/common/critique_request.py +10 -2
- helm/common/file_caches/__init__.py +0 -0
- helm/common/file_caches/file_cache.py +16 -0
- helm/common/file_caches/local_file_cache.py +61 -0
- helm/common/file_caches/test_local_file_cache.py +25 -0
- helm/common/file_upload_request.py +27 -0
- helm/common/general.py +10 -3
- helm/common/hierarchical_logger.py +124 -12
- helm/common/image_generation_parameters.py +25 -0
- helm/common/images_utils.py +60 -5
- helm/common/key_value_store.py +41 -10
- helm/common/local_context.py +140 -0
- helm/common/media_object.py +14 -1
- helm/common/moderations_api_request.py +71 -0
- helm/common/mongo_key_value_store.py +8 -7
- helm/common/multimodal_request_utils.py +57 -0
- helm/common/nudity_check_request.py +29 -0
- helm/common/object_spec.py +23 -8
- helm/common/optional_dependencies.py +1 -1
- helm/common/reeval_parameters.py +12 -0
- helm/common/remote_context.py +61 -0
- helm/common/request.py +45 -19
- helm/common/response_format.py +18 -0
- helm/common/test_cache.py +1 -48
- helm/common/test_general.py +10 -0
- helm/common/test_logging.py +94 -0
- helm/common/test_media_object.py +1 -1
- helm/common/tokenization_request.py +1 -10
- helm/config/model_deployments.yaml +4713 -1005
- helm/config/model_metadata.yaml +4045 -255
- helm/config/tokenizer_configs.yaml +1091 -50
- helm/proxy/accounts.py +31 -4
- helm/proxy/cli.py +6 -4
- helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
- helm/proxy/critique/mechanical_turk_utils.py +1 -1
- helm/proxy/critique/model_critique_client.py +40 -10
- helm/proxy/example_queries.py +33 -28
- helm/proxy/retry.py +5 -0
- helm/proxy/server.py +82 -18
- helm/proxy/services/remote_service.py +32 -7
- helm/proxy/services/server_service.py +71 -69
- helm/proxy/services/service.py +30 -6
- helm/proxy/services/test_remote_service.py +6 -5
- helm/proxy/services/test_service.py +1 -13
- helm/proxy/static/help.html +99 -0
- helm/proxy/static/index.css +61 -0
- helm/proxy/static/index.html +40 -0
- helm/proxy/static/index.js +462 -0
- helm/proxy/test_accounts.py +32 -0
- helm/proxy/test_retry.py +1 -1
- helm/proxy/token_counters/auto_token_counter.py +37 -37
- helm/proxy/token_counters/test_auto_token_counter.py +164 -0
- helm/proxy/token_counters/token_counter.py +3 -5
- helm/tokenizers/__init__.py +0 -0
- helm/tokenizers/ai21_tokenizer.py +52 -0
- helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/auto_tokenizer.py +9 -12
- helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +2 -30
- helm/tokenizers/cohere_tokenizer.py +50 -0
- helm/tokenizers/grok_tokenizer.py +55 -0
- helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +4 -4
- helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +44 -41
- helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +1 -1
- helm/tokenizers/simple_tokenizer.py +33 -0
- helm/tokenizers/test_ai21_tokenizer.py +48 -0
- helm/{proxy/tokenizers → tokenizers}/test_anthropic_tokenizer.py +6 -2
- helm/tokenizers/test_cohere_tokenizer.py +39 -0
- helm/tokenizers/test_grok_tokenizer.py +33 -0
- helm/{proxy/tokenizers → tokenizers}/test_huggingface_tokenizer.py +9 -2
- helm/tokenizers/test_simple_tokenizer.py +33 -0
- helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/tokenizer.py +3 -1
- helm/{proxy/tokenizers → tokenizers}/vertexai_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +8 -6
- helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
- helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
- crfm_helm-0.4.0.dist-info/METADATA +0 -264
- crfm_helm-0.4.0.dist-info/RECORD +0 -397
- helm/benchmark/data_overlap/data_overlap_spec.py +0 -86
- helm/benchmark/data_overlap/export_scenario_text.py +0 -119
- helm/benchmark/data_overlap/light_scenario.py +0 -60
- helm/benchmark/metrics/numeracy_metrics.py +0 -72
- helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
- helm/benchmark/run_specs.py +0 -2762
- helm/benchmark/scenarios/numeracy_scenario.py +0 -784
- helm/benchmark/static/benchmarking.css +0 -156
- helm/benchmark/static/benchmarking.js +0 -1705
- helm/benchmark/static/config.js +0 -3
- helm/benchmark/static/images/helm-logo.png +0 -0
- helm/benchmark/static/images/language-model-helm.png +0 -0
- helm/benchmark/static/images/organizations/ai21.png +0 -0
- helm/benchmark/static/images/organizations/anthropic.png +0 -0
- helm/benchmark/static/images/organizations/bigscience.png +0 -0
- helm/benchmark/static/images/organizations/cohere.png +0 -0
- helm/benchmark/static/images/organizations/eleutherai.png +0 -0
- helm/benchmark/static/images/organizations/google.png +0 -0
- helm/benchmark/static/images/organizations/meta.png +0 -0
- helm/benchmark/static/images/organizations/microsoft.png +0 -0
- helm/benchmark/static/images/organizations/nvidia.png +0 -0
- helm/benchmark/static/images/organizations/openai.png +0 -0
- helm/benchmark/static/images/organizations/together.png +0 -0
- helm/benchmark/static/images/organizations/tsinghua-keg.png +0 -0
- helm/benchmark/static/images/organizations/yandex.png +0 -0
- helm/benchmark/static/images/scenarios-by-metrics.png +0 -0
- helm/benchmark/static/images/taxonomy-scenarios.png +0 -0
- helm/benchmark/static/index.html +0 -68
- helm/benchmark/static/json-urls.js +0 -69
- helm/benchmark/static/plot-captions.js +0 -27
- helm/benchmark/static/utils.js +0 -285
- helm/benchmark/test_model_deployment_definition.py +0 -92
- helm/benchmark/test_model_properties.py +0 -1570
- helm/benchmark/vlm_run_specs.py +0 -97
- helm/benchmark/window_services/ai21_window_service.py +0 -258
- helm/benchmark/window_services/cohere_window_service.py +0 -163
- helm/benchmark/window_services/flan_t5_window_service.py +0 -29
- helm/benchmark/window_services/gpt2_window_service.py +0 -32
- helm/benchmark/window_services/huggingface_window_service.py +0 -60
- helm/benchmark/window_services/t0pp_window_service.py +0 -35
- helm/benchmark/window_services/t511b_window_service.py +0 -30
- helm/benchmark/window_services/test_ai21_window_service.py +0 -163
- helm/benchmark/window_services/test_cohere_window_service.py +0 -74
- helm/benchmark/window_services/test_cohere_window_service_utils.py +0 -8328
- helm/benchmark/window_services/test_ice_window_service.py +0 -326
- helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
- helm/benchmark/window_services/ul2_window_service.py +0 -30
- helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
- helm/common/cache_utils.py +0 -14
- helm/proxy/clients/aleph_alpha_client.py +0 -95
- helm/proxy/clients/goose_ai_client.py +0 -99
- helm/proxy/clients/microsoft_client.py +0 -180
- helm/proxy/clients/openai_client.py +0 -206
- helm/proxy/clients/simple_client.py +0 -60
- helm/proxy/clients/test_client.py +0 -49
- helm/proxy/clients/test_together_client.py +0 -97
- helm/proxy/clients/together_client.py +0 -334
- helm/proxy/clients/vertexai_client.py +0 -115
- helm/proxy/token_counters/ai21_token_counter.py +0 -20
- helm/proxy/token_counters/cohere_token_counter.py +0 -13
- helm/proxy/token_counters/free_token_counter.py +0 -12
- helm/proxy/token_counters/gooseai_token_counter.py +0 -24
- helm/proxy/token_counters/openai_token_counter.py +0 -22
- helm/proxy/token_counters/test_ai21_token_counter.py +0 -88
- helm/proxy/token_counters/test_openai_token_counter.py +0 -81
- helm/proxy/tokenizers/ai21_tokenizer.py +0 -60
- helm/proxy/tokenizers/anthropic_tokenizer.py +0 -52
- helm/proxy/tokenizers/cohere_tokenizer.py +0 -83
- helm/proxy/tokenizers/ice_tokenizer.py +0 -30
- helm/proxy/tokenizers/simple_tokenizer.py +0 -32
- helm/proxy/tokenizers/test_ice_tokenizer.py +0 -57
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info/licenses}/LICENSE +0 -0
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info}/top_level.txt +0 -0
- /helm/benchmark/{data_overlap → annotation}/__init__.py +0 -0
- /helm/{proxy/clients → benchmark/annotation/image2struct}/__init__.py +0 -0
- /helm/{proxy/clients/vision_language → benchmark/metrics/ifeval}/__init__.py +0 -0
- /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
- /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
- /helm/benchmark/{static/images/crfm-logo.png → static_build/assets/crfm-logo-Du4T1uWZ.png} +0 -0
- /helm/benchmark/{static/images/helm-logo-simple.png → static_build/assets/helm-logo-simple-DzOhNN41.png} +0 -0
- /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
- /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
- /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
- /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
- /helm/{benchmark → proxy}/static/general.js +0 -0
- /helm/{benchmark → proxy}/static/info-icon.png +0 -0
helm/proxy/accounts.py
CHANGED
|
@@ -23,6 +23,9 @@ DEFAULT_QUOTAS = {
|
|
|
23
23
|
"jurassic": {"daily": 10000},
|
|
24
24
|
"gooseai": {"daily": 10000},
|
|
25
25
|
"cohere": {"daily": 10000},
|
|
26
|
+
"dall_e": {"daily": 5}, # In terms of the number of generated images
|
|
27
|
+
"together_vision": {"daily": 30},
|
|
28
|
+
"simple": {"daily": 10000},
|
|
26
29
|
}
|
|
27
30
|
|
|
28
31
|
|
|
@@ -303,7 +306,7 @@ class Accounts:
|
|
|
303
306
|
model_group: str,
|
|
304
307
|
granularity: str,
|
|
305
308
|
compute_period: Callable[[], str],
|
|
306
|
-
):
|
|
309
|
+
) -> None:
|
|
307
310
|
"""Helper that checks the usage at a certain granularity (e.g., daily, monthly, total)."""
|
|
308
311
|
|
|
309
312
|
model_group_usages = account.usages.get(model_group)
|
|
@@ -321,14 +324,38 @@ class Accounts:
|
|
|
321
324
|
if not usage.can_use():
|
|
322
325
|
raise InsufficientQuotaError(f"{granularity} quota ({usage.quota}) for {model_group} already used up")
|
|
323
326
|
|
|
327
|
+
def check_non_empty_quota(
|
|
328
|
+
account: Account,
|
|
329
|
+
model_group: str,
|
|
330
|
+
) -> None:
|
|
331
|
+
"""Helper that checks that the account has quota at some granularity.
|
|
332
|
+
|
|
333
|
+
At each granularity, a quota of None means unlimited quota.
|
|
334
|
+
However, if the quota is None at every granularity, it means that there is no quota.
|
|
335
|
+
To enforce this rule, this helper raises an InsufficientQuotaError if the quota is None
|
|
336
|
+
at every granularity."""
|
|
337
|
+
model_group_usages = account.usages.get(model_group)
|
|
338
|
+
if model_group_usages is None:
|
|
339
|
+
raise InsufficientQuotaError(f"No quota for {model_group}")
|
|
340
|
+
if all(
|
|
341
|
+
[
|
|
342
|
+
granularity_usage.quota is None or granularity_usage.quota <= 0
|
|
343
|
+
for granularity_usage in model_group_usages.values()
|
|
344
|
+
]
|
|
345
|
+
):
|
|
346
|
+
raise InsufficientQuotaError(f"No quota for {model_group}")
|
|
347
|
+
|
|
324
348
|
if self.root_mode:
|
|
325
349
|
return
|
|
326
350
|
|
|
327
351
|
with SqliteDict(self.path) as cache:
|
|
328
352
|
account: Account = from_dict(Account, cache[api_key])
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
353
|
+
if account.is_admin:
|
|
354
|
+
return
|
|
355
|
+
granular_check_can_use(account, model_group, "daily", compute_daily_period)
|
|
356
|
+
granular_check_can_use(account, model_group, "monthly", compute_monthly_period)
|
|
357
|
+
granular_check_can_use(account, model_group, "total", compute_total_period)
|
|
358
|
+
check_non_empty_quota(account, model_group)
|
|
332
359
|
|
|
333
360
|
def use(self, api_key: str, model_group: str, delta: int):
|
|
334
361
|
"""
|
helm/proxy/cli.py
CHANGED
|
@@ -21,10 +21,10 @@ from typing import List, Dict
|
|
|
21
21
|
import re
|
|
22
22
|
import sys
|
|
23
23
|
|
|
24
|
-
from helm.common.hierarchical_logger import hlog
|
|
24
|
+
from helm.common.hierarchical_logger import hlog, setup_default_logging
|
|
25
25
|
from helm.common.authentication import Authentication
|
|
26
|
-
from .accounts import Usage, Account
|
|
27
|
-
from .services.remote_service import RemoteService, add_service_args, create_authentication
|
|
26
|
+
from helm.proxy.accounts import Usage, Account
|
|
27
|
+
from helm.proxy.services.remote_service import RemoteService, add_service_args, create_authentication
|
|
28
28
|
|
|
29
29
|
GRANULARITIES = ["daily", "monthly", "total"]
|
|
30
30
|
UNLIMITED_QUOTA = "unlimited"
|
|
@@ -123,7 +123,7 @@ def do_create_update_command(service: RemoteService, auth: Authentication, args)
|
|
|
123
123
|
|
|
124
124
|
# Update quotas
|
|
125
125
|
for quota_str in args.quotas:
|
|
126
|
-
m = re.match(
|
|
126
|
+
m = re.match(rf"(\w+)\.(\w+)=(\d+|{UNLIMITED_QUOTA})", quota_str)
|
|
127
127
|
if not m:
|
|
128
128
|
raise Exception(
|
|
129
129
|
f"Invalid format: {quota_str}, expect <model_group>.<granularity>=<quota> "
|
|
@@ -198,6 +198,8 @@ def main():
|
|
|
198
198
|
|
|
199
199
|
args = parser.parse_args()
|
|
200
200
|
|
|
201
|
+
setup_default_logging()
|
|
202
|
+
|
|
201
203
|
service = create_remote_service(args)
|
|
202
204
|
auth = create_authentication(args)
|
|
203
205
|
|
|
@@ -4,6 +4,7 @@ import os
|
|
|
4
4
|
from threading import Lock
|
|
5
5
|
from typing import Dict, List, Optional, Tuple, Union
|
|
6
6
|
import re
|
|
7
|
+
import sys
|
|
7
8
|
|
|
8
9
|
from helm.common.critique_request import (
|
|
9
10
|
CritiqueRequest,
|
|
@@ -15,6 +16,8 @@ from helm.common.critique_request import (
|
|
|
15
16
|
from helm.common.hierarchical_logger import hlog
|
|
16
17
|
from helm.proxy.critique.mechanical_turk_utils import replace_emoji_characters
|
|
17
18
|
|
|
19
|
+
csv.field_size_limit(sys.maxsize)
|
|
20
|
+
|
|
18
21
|
# A representation of fields that can be used as a dict key.
|
|
19
22
|
_CritiqueRequestKey = Tuple[Tuple[str, str], ...]
|
|
20
23
|
|
|
@@ -38,7 +38,7 @@ def replace_emoji_characters(s: str) -> str:
|
|
|
38
38
|
highpoints = re.compile("[\U00010000-\U0010ffff]")
|
|
39
39
|
elif sys.maxunicode == 65535:
|
|
40
40
|
# Python was built with '--enable-unicode=ucs2'
|
|
41
|
-
highpoints = re.compile("[\
|
|
41
|
+
highpoints = re.compile("[\ud800-\udbff][\udc00-\udfff]")
|
|
42
42
|
else:
|
|
43
43
|
raise UnicodeError("Unable to determine if Python was built using UCS-2 or UCS-4")
|
|
44
44
|
|
|
@@ -2,7 +2,7 @@ from typing import Dict, List, Union, Optional
|
|
|
2
2
|
import string
|
|
3
3
|
import dataclasses
|
|
4
4
|
|
|
5
|
-
from helm.benchmark.
|
|
5
|
+
from helm.benchmark.run_spec_factory import get_default_model_deployment_for_model
|
|
6
6
|
from helm.common.critique_request import (
|
|
7
7
|
CritiqueRequest,
|
|
8
8
|
CritiqueRequestResult,
|
|
@@ -12,9 +12,10 @@ from helm.common.critique_request import (
|
|
|
12
12
|
)
|
|
13
13
|
from helm.common.hierarchical_logger import hlog
|
|
14
14
|
from helm.common.optional_dependencies import handle_module_not_found_error
|
|
15
|
-
from helm.common.request import Request, RequestResult,
|
|
16
|
-
from helm.
|
|
15
|
+
from helm.common.request import Request, RequestResult, GeneratedOutput
|
|
16
|
+
from helm.clients.client import Client
|
|
17
17
|
from helm.proxy.critique.critique_client import CritiqueClient
|
|
18
|
+
from helm.common.media_object import MultimediaObject, MediaObject
|
|
18
19
|
|
|
19
20
|
|
|
20
21
|
class CritiqueParseError(Exception):
|
|
@@ -24,6 +25,8 @@ class CritiqueParseError(Exception):
|
|
|
24
25
|
class ModelCritiqueClient(CritiqueClient):
|
|
25
26
|
"""A CritiqueClient that queries a Model to answer CritiqueRequests."""
|
|
26
27
|
|
|
28
|
+
VISION_LANGUAGE_MODELS = ["openai/gpt-4-vision", "reka/reka", "huggingface/prometheus-vision"]
|
|
29
|
+
|
|
27
30
|
def __init__(self, client: Client, model_name):
|
|
28
31
|
self._client = client
|
|
29
32
|
self._model_name = model_name
|
|
@@ -31,6 +34,11 @@ class ModelCritiqueClient(CritiqueClient):
|
|
|
31
34
|
get_default_model_deployment_for_model(model_name, warn_arg_deprecated=False, ignore_deprecated=True)
|
|
32
35
|
or self._model_name
|
|
33
36
|
)
|
|
37
|
+
self.vision_language = False
|
|
38
|
+
for vision_language_model_name in self.VISION_LANGUAGE_MODELS:
|
|
39
|
+
if model_name.startswith(vision_language_model_name):
|
|
40
|
+
self.vision_language = True
|
|
41
|
+
break
|
|
34
42
|
|
|
35
43
|
def _interpolate_fields(self, text: str, fields: Dict[str, str]) -> str:
|
|
36
44
|
for key, value in fields.items():
|
|
@@ -58,10 +66,15 @@ class ModelCritiqueClient(CritiqueClient):
|
|
|
58
66
|
|
|
59
67
|
requests: List[Request] = []
|
|
60
68
|
for question in task.questions:
|
|
61
|
-
prompt: str
|
|
69
|
+
prompt: str
|
|
70
|
+
if len(question.text) > 0:
|
|
71
|
+
prompt = base_prompt + "\n\n" + self._question_to_prompt(question, fields)
|
|
72
|
+
else:
|
|
73
|
+
# We may not want to add extra newlines and prompts
|
|
74
|
+
# if the question text is empty (e.g., the Vibe-Eval evaluator).
|
|
75
|
+
prompt = base_prompt
|
|
62
76
|
if question.question_type == "free_response":
|
|
63
|
-
|
|
64
|
-
max_tokens = 100
|
|
77
|
+
max_tokens = 100 if task.max_tokens is None else task.max_tokens
|
|
65
78
|
elif question.question_type == "checkbox":
|
|
66
79
|
# We multiply by 2 because the model will generate a comma after each option.
|
|
67
80
|
max_tokens = len(question.options) * 2
|
|
@@ -78,12 +91,21 @@ class ModelCritiqueClient(CritiqueClient):
|
|
|
78
91
|
|
|
79
92
|
prompt = anthropic.HUMAN_PROMPT + prompt + anthropic.AI_PROMPT
|
|
80
93
|
|
|
94
|
+
multimodal_prompt: Optional[MultimediaObject] = None
|
|
95
|
+
if self.vision_language:
|
|
96
|
+
assert question.media_object is not None, "Expect media_object for vision-language models"
|
|
97
|
+
image_media: MediaObject = question.media_object
|
|
98
|
+
text_media: MediaObject = MediaObject(text=prompt, content_type="text/plain")
|
|
99
|
+
multimodal_prompt = MultimediaObject(media_objects=[image_media, text_media])
|
|
100
|
+
prompt = "" # set to empty string to avoid conflicts with multimodal_prompt
|
|
101
|
+
|
|
81
102
|
request = Request(
|
|
82
103
|
model=self._model_name,
|
|
83
104
|
model_deployment=self._model_deployment_name,
|
|
84
105
|
prompt=prompt,
|
|
85
106
|
max_tokens=max_tokens,
|
|
86
107
|
echo_prompt=False,
|
|
108
|
+
multimodal_prompt=multimodal_prompt,
|
|
87
109
|
)
|
|
88
110
|
requests.append(request)
|
|
89
111
|
return requests
|
|
@@ -114,7 +136,7 @@ class ModelCritiqueClient(CritiqueClient):
|
|
|
114
136
|
return answers
|
|
115
137
|
|
|
116
138
|
def _multiple_choice_completion_to_answer(
|
|
117
|
-
self, question: CritiqueQuestionTemplate, completion:
|
|
139
|
+
self, question: CritiqueQuestionTemplate, completion: GeneratedOutput
|
|
118
140
|
) -> Optional[str]:
|
|
119
141
|
"""Convert a multiple choice completion to an answer."""
|
|
120
142
|
assert question.question_type == "multiple_choice"
|
|
@@ -124,14 +146,20 @@ class ModelCritiqueClient(CritiqueClient):
|
|
|
124
146
|
raise CritiqueParseError(
|
|
125
147
|
f"Invalid answer: {completion}. Multiple choice questions should have one answer."
|
|
126
148
|
)
|
|
127
|
-
|
|
149
|
+
letter_answer = answers[0]
|
|
150
|
+
choice_rank = string.ascii_uppercase.index(letter_answer)
|
|
151
|
+
if choice_rank >= len(question.options):
|
|
152
|
+
raise CritiqueParseError(
|
|
153
|
+
f"Invalid answer: {completion}. The answer is out of range of the options: {question.options}"
|
|
154
|
+
)
|
|
155
|
+
return letter_answer
|
|
128
156
|
except CritiqueParseError as e:
|
|
129
157
|
# If there was an error parsing the answer, we assume the user did not answer the question.
|
|
130
158
|
hlog(f"Error parsing answer: {e}. Skipping question (and so the respondent entirely)")
|
|
131
159
|
return None
|
|
132
160
|
|
|
133
161
|
def _checkbox_completion_to_answer(
|
|
134
|
-
self, question: CritiqueQuestionTemplate, completion:
|
|
162
|
+
self, question: CritiqueQuestionTemplate, completion: GeneratedOutput
|
|
135
163
|
) -> Optional[List[str]]:
|
|
136
164
|
"""Convert a checkbox completion to an answer."""
|
|
137
165
|
assert question.question_type == "checkbox"
|
|
@@ -147,7 +175,9 @@ class ModelCritiqueClient(CritiqueClient):
|
|
|
147
175
|
hlog(f"Error parsing answer: {e}. Skipping question (and so the respondent entirely)")
|
|
148
176
|
return None
|
|
149
177
|
|
|
150
|
-
def _free_response_completion_to_answer(
|
|
178
|
+
def _free_response_completion_to_answer(
|
|
179
|
+
self, question: CritiqueQuestionTemplate, completion: GeneratedOutput
|
|
180
|
+
) -> str:
|
|
151
181
|
"""Convert a free response completion to an answer."""
|
|
152
182
|
assert question.question_type == "free_response"
|
|
153
183
|
return completion.text
|
helm/proxy/example_queries.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import textwrap
|
|
2
2
|
|
|
3
|
-
from .query import Query
|
|
3
|
+
from helm.proxy.query import Query
|
|
4
4
|
|
|
5
5
|
|
|
6
6
|
def dedent(text: str) -> str:
|
|
@@ -21,6 +21,7 @@ example_queries = [
|
|
|
21
21
|
"""
|
|
22
22
|
temperature: 0.5 # Medium amount of randomness
|
|
23
23
|
stop_sequences: [.] # Stop when you hit a period
|
|
24
|
+
model: openai/gpt-4.1-nano-2025-04-14
|
|
24
25
|
"""
|
|
25
26
|
),
|
|
26
27
|
environments="",
|
|
@@ -31,28 +32,33 @@ example_queries = [
|
|
|
31
32
|
"""
|
|
32
33
|
temperature: 0.5 # Medium amount of randomness
|
|
33
34
|
stop_sequences: [\\n] # Stop when you hit a newline
|
|
34
|
-
num_completions:
|
|
35
|
+
num_completions: 5 # Generate many samples
|
|
36
|
+
model: openai/gpt-4.1-nano-2025-04-14
|
|
35
37
|
"""
|
|
36
38
|
),
|
|
37
39
|
environments="",
|
|
38
40
|
),
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
41
|
+
# Disabled because `max_tokens: 0` no longer works on the OpenAI API
|
|
42
|
+
# Query(
|
|
43
|
+
# prompt="The quick brown fox jumps over the lazy dog.",
|
|
44
|
+
# settings=dedent(
|
|
45
|
+
# """
|
|
46
|
+
# echo_prompt: true # Analyze the prompt
|
|
47
|
+
# max_tokens: 0 # Don't generate any more
|
|
48
|
+
# top_k_per_token: 5 # Show alternatives for each position
|
|
49
|
+
# model: openai/text-davinci-002
|
|
50
|
+
# model_deployment: openai/text-davinci-002
|
|
51
|
+
# """
|
|
52
|
+
# ),
|
|
53
|
+
# environments=dedent(""),
|
|
54
|
+
# ),
|
|
50
55
|
Query(
|
|
51
56
|
prompt="Odd numbers: 1 -> 3 -> 5",
|
|
52
57
|
settings=dedent(
|
|
53
58
|
"""
|
|
54
59
|
temperature: 0 # Deterministic
|
|
55
60
|
max_tokens: 50
|
|
61
|
+
model: openai/gpt-4.1-nano-2025-04-14
|
|
56
62
|
"""
|
|
57
63
|
),
|
|
58
64
|
environments="",
|
|
@@ -63,13 +69,14 @@ example_queries = [
|
|
|
63
69
|
"""
|
|
64
70
|
temperature: 0
|
|
65
71
|
stop_sequences: [.]
|
|
66
|
-
|
|
72
|
+
# Try out multiple models
|
|
73
|
+
model: ${model}
|
|
67
74
|
"""
|
|
68
75
|
),
|
|
69
76
|
environments=dedent(
|
|
70
77
|
"""
|
|
71
78
|
occupation: [mathematician, lawyer, doctor]
|
|
72
|
-
|
|
79
|
+
model: [openai/gpt-4.1-nano-2025-04-14, openai/gpt-4.1-mini-2025-04-14]
|
|
73
80
|
"""
|
|
74
81
|
),
|
|
75
82
|
),
|
|
@@ -88,12 +95,13 @@ example_queries = [
|
|
|
88
95
|
temperature: 0.5
|
|
89
96
|
stop_sequences: [\\n]
|
|
90
97
|
num_completions: 5
|
|
91
|
-
|
|
98
|
+
# Try out multiple models
|
|
99
|
+
model: ${model}
|
|
92
100
|
"""
|
|
93
101
|
),
|
|
94
102
|
environments=dedent(
|
|
95
103
|
"""
|
|
96
|
-
|
|
104
|
+
model: [openai/gpt-4.1-nano-2025-04-14, openai/gpt-4.1-mini-2025-04-14]
|
|
97
105
|
"""
|
|
98
106
|
),
|
|
99
107
|
),
|
|
@@ -122,20 +130,21 @@ example_queries = [
|
|
|
122
130
|
temperature: 0
|
|
123
131
|
max_tokens: 1
|
|
124
132
|
top_k_per_token: 4
|
|
125
|
-
|
|
133
|
+
# Try out multiple models
|
|
134
|
+
model: ${model}
|
|
126
135
|
"""
|
|
127
136
|
),
|
|
128
137
|
environments=dedent(
|
|
129
138
|
"""
|
|
130
|
-
|
|
139
|
+
model: [openai/gpt-4.1-nano-2025-04-14, openai/gpt-4.1-mini-2025-04-14]
|
|
131
140
|
"""
|
|
132
141
|
),
|
|
133
142
|
),
|
|
134
143
|
Query(
|
|
135
|
-
prompt="
|
|
144
|
+
prompt="Write a Python function that takes two vectors a and b and returns their Euclidean distance.",
|
|
136
145
|
settings=dedent(
|
|
137
146
|
"""
|
|
138
|
-
|
|
147
|
+
model: openai/gpt-4.1-nano-2025-04-14
|
|
139
148
|
"""
|
|
140
149
|
),
|
|
141
150
|
environments="",
|
|
@@ -144,19 +153,15 @@ example_queries = [
|
|
|
144
153
|
prompt="The quick brown fox",
|
|
145
154
|
settings=dedent(
|
|
146
155
|
"""
|
|
147
|
-
model_deployment: ${model_deployment}
|
|
148
156
|
temperature: 0.3
|
|
149
157
|
stop_sequences: [\\n]
|
|
158
|
+
# Try out multiple models
|
|
159
|
+
model: ${model}
|
|
150
160
|
"""
|
|
151
161
|
),
|
|
152
162
|
environments=dedent(
|
|
153
163
|
"""
|
|
154
|
-
|
|
155
|
-
"openai/davinci", "openai/text-davinci-002",
|
|
156
|
-
"openai/text-davinci-003", "ai21/j1-grande-v2-beta",
|
|
157
|
-
"together/gpt-j-6b", "together/gpt-jt-6b-v1",
|
|
158
|
-
"together/bloom", "together/opt-175b"
|
|
159
|
-
]
|
|
164
|
+
model: [openai/gpt-4.1-nano-2025-04-14, openai/gpt-4.1-mini-2025-04-14]
|
|
160
165
|
"""
|
|
161
166
|
),
|
|
162
167
|
),
|
helm/proxy/retry.py
CHANGED
|
@@ -5,6 +5,7 @@ from retrying import Retrying
|
|
|
5
5
|
from helm.common.request import RequestResult
|
|
6
6
|
from helm.common.tokenization_request import TokenizationRequestResult
|
|
7
7
|
from helm.common.hierarchical_logger import hlog
|
|
8
|
+
import os
|
|
8
9
|
import traceback
|
|
9
10
|
import threading
|
|
10
11
|
|
|
@@ -19,6 +20,10 @@ Example usage:
|
|
|
19
20
|
...
|
|
20
21
|
"""
|
|
21
22
|
|
|
23
|
+
# TODO: make these configurable at a config / cli level
|
|
24
|
+
HELM_RETRIES = int(os.environ.get("HELM_RETRIES", "5"))
|
|
25
|
+
HELM_TOKENIZER_RETRIES = int(os.environ.get("HELM_TOKENIZER_RETRIES", HELM_RETRIES))
|
|
26
|
+
|
|
22
27
|
# The lock is used to prevent multiple threads from printing at the same time.
|
|
23
28
|
# This can cause issues when printing the stack trace.
|
|
24
29
|
# (The stack traces can get mixed up and become unreadable.)
|
helm/proxy/server.py
CHANGED
|
@@ -1,5 +1,3 @@
|
|
|
1
|
-
# mypy: check_untyped_defs = False
|
|
2
|
-
|
|
3
1
|
"""
|
|
4
2
|
Starts a REST server for the frontend to interact with.
|
|
5
3
|
Look at `index.js` to see how the functionality is invoked.
|
|
@@ -8,6 +6,7 @@ Look at `index.js` to see how the functionality is invoked.
|
|
|
8
6
|
from urllib.parse import unquote_plus
|
|
9
7
|
import argparse
|
|
10
8
|
import dataclasses
|
|
9
|
+
import importlib_resources as resources
|
|
11
10
|
import json
|
|
12
11
|
import os
|
|
13
12
|
import sys
|
|
@@ -20,15 +19,20 @@ from helm.benchmark.config_registry import (
|
|
|
20
19
|
register_configs_from_directory,
|
|
21
20
|
register_builtin_configs_from_helm_package,
|
|
22
21
|
)
|
|
22
|
+
from helm.benchmark.model_deployment_registry import get_default_model_deployment_for_model
|
|
23
23
|
from helm.common.authentication import Authentication
|
|
24
|
-
from helm.common.
|
|
24
|
+
from helm.common.cache_backend_config import CacheBackendConfig, MongoCacheBackendConfig, SqliteCacheBackendConfig
|
|
25
|
+
from helm.common.general import ensure_directory_exists
|
|
26
|
+
from helm.common.hierarchical_logger import hlog, setup_default_logging
|
|
25
27
|
from helm.common.optional_dependencies import handle_module_not_found_error
|
|
26
28
|
from helm.common.request import Request
|
|
27
29
|
from helm.common.perspective_api_request import PerspectiveAPIRequest
|
|
30
|
+
from helm.common.moderations_api_request import ModerationAPIRequest
|
|
28
31
|
from helm.common.tokenization_request import TokenizationRequest, DecodeRequest
|
|
29
|
-
from .
|
|
30
|
-
from .
|
|
31
|
-
from .
|
|
32
|
+
from helm.proxy.services.service import CACHE_DIR
|
|
33
|
+
from helm.proxy.accounts import Account
|
|
34
|
+
from helm.proxy.services.server_service import ServerService
|
|
35
|
+
from helm.proxy.query import Query
|
|
32
36
|
|
|
33
37
|
try:
|
|
34
38
|
import gunicorn # noqa
|
|
@@ -39,6 +43,7 @@ except ModuleNotFoundError as e:
|
|
|
39
43
|
bottle.BaseRequest.MEMFILE_MAX = 1024 * 1024
|
|
40
44
|
|
|
41
45
|
app = bottle.default_app()
|
|
46
|
+
service: ServerService
|
|
42
47
|
|
|
43
48
|
|
|
44
49
|
def safe_call(func, to_json=True):
|
|
@@ -82,30 +87,30 @@ def handle_root():
|
|
|
82
87
|
|
|
83
88
|
@app.get("/static/<filename:path>")
|
|
84
89
|
def handle_static_filename(filename):
|
|
85
|
-
resp = bottle.static_file(filename, root=
|
|
90
|
+
resp = bottle.static_file(filename, root=app.config["helm.staticpath"])
|
|
86
91
|
resp.add_header("Cache-Control", "no-store, must-revalidate ")
|
|
87
92
|
return resp
|
|
88
93
|
|
|
89
94
|
|
|
95
|
+
@app.get("/output/<filename:path>")
|
|
96
|
+
def handle_output_filename(filename):
|
|
97
|
+
resp = bottle.static_file(filename, root=app.config["crfm.proxy.outputpath"])
|
|
98
|
+
return resp
|
|
99
|
+
|
|
100
|
+
|
|
90
101
|
@app.get("/api/general_info")
|
|
91
102
|
def handle_get_general_info():
|
|
92
103
|
def perform(args):
|
|
104
|
+
global service
|
|
93
105
|
return dataclasses.asdict(service.get_general_info())
|
|
94
106
|
|
|
95
107
|
return safe_call(perform)
|
|
96
108
|
|
|
97
109
|
|
|
98
|
-
@app.get("/api/window_service_info")
|
|
99
|
-
def handle_get_window_service_info():
|
|
100
|
-
def perform(args):
|
|
101
|
-
return dataclasses.asdict(service.get_window_service_info(args["model_name"]))
|
|
102
|
-
|
|
103
|
-
return safe_call(perform)
|
|
104
|
-
|
|
105
|
-
|
|
106
110
|
@app.post("/api/account")
|
|
107
111
|
def handle_create_account():
|
|
108
112
|
def perform(args):
|
|
113
|
+
global service
|
|
109
114
|
auth = Authentication(**json.loads(args["auth"]))
|
|
110
115
|
return dataclasses.asdict(service.create_account(auth))
|
|
111
116
|
|
|
@@ -115,6 +120,7 @@ def handle_create_account():
|
|
|
115
120
|
@app.delete("/api/account")
|
|
116
121
|
def handle_delete_account():
|
|
117
122
|
def perform(args):
|
|
123
|
+
global service
|
|
118
124
|
auth = Authentication(**json.loads(args["auth"]))
|
|
119
125
|
api_key = args["api_key"]
|
|
120
126
|
return dataclasses.asdict(service.delete_account(auth, api_key))
|
|
@@ -125,6 +131,7 @@ def handle_delete_account():
|
|
|
125
131
|
@app.get("/api/account")
|
|
126
132
|
def handle_get_account():
|
|
127
133
|
def perform(args):
|
|
134
|
+
global service
|
|
128
135
|
auth = Authentication(**json.loads(args["auth"]))
|
|
129
136
|
if "all" in args and args["all"].lower() == "true":
|
|
130
137
|
return [dataclasses.asdict(account) for account in service.get_accounts(auth)]
|
|
@@ -137,6 +144,7 @@ def handle_get_account():
|
|
|
137
144
|
@app.put("/api/account")
|
|
138
145
|
def handle_update_account():
|
|
139
146
|
def perform(args):
|
|
147
|
+
global service
|
|
140
148
|
auth = Authentication(**json.loads(args["auth"]))
|
|
141
149
|
account = from_dict(Account, json.loads(args["account"]))
|
|
142
150
|
return dataclasses.asdict(service.update_account(auth, account))
|
|
@@ -147,6 +155,7 @@ def handle_update_account():
|
|
|
147
155
|
@app.put("/api/account/api_key")
|
|
148
156
|
def handle_update_api_key():
|
|
149
157
|
def perform(args):
|
|
158
|
+
global service
|
|
150
159
|
auth = Authentication(**json.loads(args["auth"]))
|
|
151
160
|
account = from_dict(Account, json.loads(args["account"]))
|
|
152
161
|
return dataclasses.asdict(service.rotate_api_key(auth, account))
|
|
@@ -157,6 +166,7 @@ def handle_update_api_key():
|
|
|
157
166
|
@app.get("/api/query")
|
|
158
167
|
def handle_query():
|
|
159
168
|
def perform(args):
|
|
169
|
+
global service
|
|
160
170
|
query = Query(**args)
|
|
161
171
|
return dataclasses.asdict(service.expand_query(query))
|
|
162
172
|
|
|
@@ -166,9 +176,28 @@ def handle_query():
|
|
|
166
176
|
@app.get("/api/request")
|
|
167
177
|
def handle_request():
|
|
168
178
|
def perform(args):
|
|
179
|
+
global service
|
|
169
180
|
auth = Authentication(**json.loads(args["auth"]))
|
|
170
181
|
request = Request(**json.loads(args["request"]))
|
|
171
|
-
|
|
182
|
+
# Hack to maintain backward compatibility with clients with version <= 0.3.0.
|
|
183
|
+
# Clients with version <= 0.3.0 do not set model_deployment, but this is now
|
|
184
|
+
# required by Request.
|
|
185
|
+
if not request.model_deployment:
|
|
186
|
+
model_deployment = get_default_model_deployment_for_model(request.model)
|
|
187
|
+
if model_deployment is None:
|
|
188
|
+
raise ValueError(f"Unknown model '{request.model}'")
|
|
189
|
+
request = dataclasses.replace(request, model_deployment=model_deployment)
|
|
190
|
+
|
|
191
|
+
raw_response = dataclasses.asdict(service.make_request(auth, request))
|
|
192
|
+
|
|
193
|
+
# Hack to maintain backward compatibility with clients with version <= 1.0.0.
|
|
194
|
+
# Clients with version <= 1.0.0 expect each token to contain a `top_logprobs`
|
|
195
|
+
# field of type dict.
|
|
196
|
+
for completion in raw_response["completions"]:
|
|
197
|
+
for token in completion["tokens"]:
|
|
198
|
+
token["top_logprobs"] = {}
|
|
199
|
+
|
|
200
|
+
return raw_response
|
|
172
201
|
|
|
173
202
|
return safe_call(perform)
|
|
174
203
|
|
|
@@ -176,6 +205,7 @@ def handle_request():
|
|
|
176
205
|
@app.get("/api/tokenize")
|
|
177
206
|
def handle_tokenization():
|
|
178
207
|
def perform(args):
|
|
208
|
+
global service
|
|
179
209
|
auth = Authentication(**json.loads(args["auth"]))
|
|
180
210
|
request = TokenizationRequest(**json.loads(args["request"]))
|
|
181
211
|
return dataclasses.asdict(service.tokenize(auth, request))
|
|
@@ -186,6 +216,7 @@ def handle_tokenization():
|
|
|
186
216
|
@app.get("/api/decode")
|
|
187
217
|
def handle_decode():
|
|
188
218
|
def perform(args):
|
|
219
|
+
global service
|
|
189
220
|
auth = Authentication(**json.loads(args["auth"]))
|
|
190
221
|
request = DecodeRequest(**json.loads(args["request"]))
|
|
191
222
|
return dataclasses.asdict(service.decode(auth, request))
|
|
@@ -196,6 +227,7 @@ def handle_decode():
|
|
|
196
227
|
@app.get("/api/toxicity")
|
|
197
228
|
def handle_toxicity_request():
|
|
198
229
|
def perform(args):
|
|
230
|
+
global service
|
|
199
231
|
auth = Authentication(**json.loads(args["auth"]))
|
|
200
232
|
request = PerspectiveAPIRequest(**json.loads(args["request"]))
|
|
201
233
|
return dataclasses.asdict(service.get_toxicity_scores(auth, request))
|
|
@@ -203,9 +235,21 @@ def handle_toxicity_request():
|
|
|
203
235
|
return safe_call(perform)
|
|
204
236
|
|
|
205
237
|
|
|
238
|
+
@app.get("/api/moderation")
|
|
239
|
+
def handle_moderation_request():
|
|
240
|
+
def perform(args):
|
|
241
|
+
global service
|
|
242
|
+
auth = Authentication(**json.loads(args["auth"]))
|
|
243
|
+
request = ModerationAPIRequest(**json.loads(args["request"]))
|
|
244
|
+
return dataclasses.asdict(service.get_moderation_results(auth, request))
|
|
245
|
+
|
|
246
|
+
return safe_call(perform)
|
|
247
|
+
|
|
248
|
+
|
|
206
249
|
@app.get("/api/shutdown")
|
|
207
250
|
def handle_shutdown():
|
|
208
251
|
def perform(args):
|
|
252
|
+
global service
|
|
209
253
|
auth = Authentication(**json.loads(args["auth"]))
|
|
210
254
|
service.shutdown(auth)
|
|
211
255
|
|
|
@@ -218,6 +262,7 @@ def main():
|
|
|
218
262
|
parser.add_argument("-p", "--port", type=int, help="What port to listen on", default=1959)
|
|
219
263
|
parser.add_argument("--ssl-key-file", type=str, help="Path to SSL key file")
|
|
220
264
|
parser.add_argument("--ssl-cert-file", type=str, help="Path to SSL cert file")
|
|
265
|
+
parser.add_argument("--ssl-ca-certs", type=str, help="Path to SSL CA certs")
|
|
221
266
|
parser.add_argument("-b", "--base-path", help="What directory has credentials, etc.", default="prod_env")
|
|
222
267
|
parser.add_argument("-w", "--workers", type=int, help="Number of worker processes to handle requests", default=8)
|
|
223
268
|
parser.add_argument("-t", "--timeout", type=int, help="Request timeout in seconds", default=5 * 60)
|
|
@@ -228,21 +273,40 @@ def main():
|
|
|
228
273
|
default="",
|
|
229
274
|
)
|
|
230
275
|
args = parser.parse_args()
|
|
276
|
+
setup_default_logging()
|
|
231
277
|
|
|
232
278
|
register_builtin_configs_from_helm_package()
|
|
233
279
|
register_configs_from_directory(args.base_path)
|
|
234
280
|
|
|
235
|
-
|
|
281
|
+
cache_backend_config: CacheBackendConfig
|
|
282
|
+
if args.mongo_uri:
|
|
283
|
+
cache_backend_config = MongoCacheBackendConfig(args.mongo_uri)
|
|
284
|
+
else:
|
|
285
|
+
sqlite_cache_path = os.path.join(args.base_path, CACHE_DIR)
|
|
286
|
+
ensure_directory_exists(sqlite_cache_path)
|
|
287
|
+
cache_backend_config = SqliteCacheBackendConfig(sqlite_cache_path)
|
|
288
|
+
|
|
289
|
+
static_package_name = "helm.proxy.static"
|
|
290
|
+
resource_path = resources.files(static_package_name).joinpath("index.html")
|
|
291
|
+
with resources.as_file(resource_path) as resource_filename:
|
|
292
|
+
static_path = str(resource_filename.parent)
|
|
293
|
+
app.config["helm.staticpath"] = static_path
|
|
294
|
+
|
|
295
|
+
service = ServerService(base_path=args.base_path, cache_backend_config=cache_backend_config)
|
|
236
296
|
|
|
237
297
|
gunicorn_args = {
|
|
238
298
|
"workers": args.workers,
|
|
239
299
|
"timeout": args.timeout,
|
|
240
300
|
"limit_request_line": 0, # Controls the maximum size of HTTP request line in bytes. 0 = unlimited.
|
|
241
301
|
}
|
|
242
|
-
if args.ssl_key_file
|
|
302
|
+
if args.ssl_key_file:
|
|
243
303
|
gunicorn_args["keyfile"] = args.ssl_key_file
|
|
304
|
+
if args.ssl_cert_file:
|
|
244
305
|
gunicorn_args["certfile"] = args.ssl_cert_file
|
|
306
|
+
if args.ssl_ca_certs:
|
|
307
|
+
gunicorn_args["ca_certs"] = args.ssl_ca_certs
|
|
245
308
|
|
|
246
309
|
# Clear arguments before running gunicorn as it also uses argparse
|
|
247
310
|
sys.argv = [sys.argv[0]]
|
|
311
|
+
app.config["crfm.proxy.outputpath"] = os.path.join(os.path.realpath(args.base_path), "cache", "output")
|
|
248
312
|
app.run(host="0.0.0.0", port=args.port, server="gunicorn", **gunicorn_args)
|