crfm-helm 0.4.0__py3-none-any.whl → 0.5.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crfm-helm might be problematic. Click here for more details.
- crfm_helm-0.5.10.dist-info/METADATA +369 -0
- crfm_helm-0.5.10.dist-info/RECORD +1008 -0
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +80 -29
- helm/benchmark/adaptation/adapters/adapter.py +2 -2
- helm/benchmark/adaptation/adapters/adapter_factory.py +39 -28
- helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
- helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
- helm/benchmark/adaptation/adapters/generation_adapter.py +2 -1
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +24 -8
- helm/benchmark/adaptation/adapters/language_modeling_adapter.py +3 -4
- helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +2 -1
- helm/benchmark/adaptation/adapters/multimodal/multimodal_prompt.py +7 -0
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +112 -0
- helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +6 -3
- helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +3 -1
- helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +18 -8
- helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
- helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/test_adapter.py +5 -4
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +46 -22
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +17 -29
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +138 -16
- helm/benchmark/adaptation/common_adapter_specs.py +443 -0
- helm/benchmark/adaptation/prompt.py +1 -1
- helm/benchmark/adaptation/request_state.py +6 -1
- helm/benchmark/adaptation/scenario_state.py +6 -2
- helm/benchmark/annotation/aci_bench_annotator.py +84 -0
- helm/benchmark/annotation/air_bench_annotator.py +79 -0
- helm/benchmark/annotation/alrage_annotator.py +90 -0
- helm/benchmark/annotation/annotator.py +48 -0
- helm/benchmark/annotation/annotator_factory.py +50 -0
- helm/benchmark/annotation/anthropic_red_team_annotator.py +57 -0
- helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
- helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
- helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
- helm/benchmark/annotation/bird_sql_annotator.py +58 -0
- helm/benchmark/annotation/call_center_annotator.py +258 -0
- helm/benchmark/annotation/chw_care_plan_annotator.py +82 -0
- helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
- helm/benchmark/annotation/dischargeme_annotator.py +96 -0
- helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
- helm/benchmark/annotation/financebench_annotator.py +79 -0
- helm/benchmark/annotation/harm_bench_annotator.py +55 -0
- helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
- helm/benchmark/annotation/image2struct/image_compiler_annotator.py +93 -0
- helm/benchmark/annotation/image2struct/latex_compiler_annotator.py +59 -0
- helm/benchmark/annotation/image2struct/lilypond_compiler_annotator.py +86 -0
- helm/benchmark/annotation/image2struct/webpage_compiler_annotator.py +132 -0
- helm/benchmark/annotation/live_qa_annotator.py +76 -0
- helm/benchmark/annotation/med_dialog_annotator.py +88 -0
- helm/benchmark/annotation/medalign_annotator.py +89 -0
- helm/benchmark/annotation/medi_qa_annotator.py +87 -0
- helm/benchmark/annotation/medication_qa_annotator.py +86 -0
- helm/benchmark/annotation/mental_health_annotator.py +87 -0
- helm/benchmark/annotation/mimic_bhc_annotator.py +89 -0
- helm/benchmark/annotation/mimic_rrs_annotator.py +89 -0
- helm/benchmark/annotation/model_as_judge.py +309 -0
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +87 -0
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +90 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
- helm/benchmark/annotation/omni_math_annotator.py +131 -0
- helm/benchmark/annotation/simple_safety_tests_annotator.py +50 -0
- helm/benchmark/annotation/spider_annotator.py +18 -0
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +87 -0
- helm/benchmark/annotation/test_annotator_factory.py +26 -0
- helm/benchmark/annotation/test_dummy_annotator.py +44 -0
- helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
- helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
- helm/benchmark/annotation/wildbench_annotator.py +119 -0
- helm/benchmark/annotation/xstest_annotator.py +100 -0
- helm/benchmark/annotation_executor.py +144 -0
- helm/benchmark/augmentations/cleva_perturbation.py +9 -8
- helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
- helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
- helm/benchmark/augmentations/data_augmenter.py +0 -2
- helm/benchmark/augmentations/dialect_perturbation.py +4 -5
- helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
- helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
- helm/benchmark/augmentations/gender_perturbation.py +3 -3
- helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
- helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
- helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
- helm/benchmark/augmentations/person_name_perturbation.py +4 -5
- helm/benchmark/augmentations/perturbation.py +26 -4
- helm/benchmark/augmentations/perturbation_description.py +1 -1
- helm/benchmark/augmentations/space_perturbation.py +2 -2
- helm/benchmark/augmentations/suffix_perturbation.py +29 -0
- helm/benchmark/augmentations/synonym_perturbation.py +4 -3
- helm/benchmark/augmentations/test_perturbation.py +56 -19
- helm/benchmark/augmentations/translate_perturbation.py +31 -0
- helm/benchmark/augmentations/typos_perturbation.py +2 -2
- helm/benchmark/config_registry.py +7 -1
- helm/benchmark/data_preprocessor.py +2 -2
- helm/benchmark/executor.py +54 -25
- helm/benchmark/huggingface_registration.py +28 -10
- helm/benchmark/metrics/air_bench_metrics.py +3212 -0
- helm/benchmark/metrics/alrage_metric.py +35 -0
- helm/benchmark/metrics/annotation_metrics.py +108 -0
- helm/benchmark/metrics/basic_metrics.py +437 -667
- helm/benchmark/metrics/bbq_metrics.py +17 -6
- helm/benchmark/metrics/bias_metrics.py +18 -9
- helm/benchmark/metrics/bias_word_lists.py +1 -1
- helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
- helm/benchmark/metrics/bird_sql_metrics.py +28 -0
- helm/benchmark/metrics/classification_metrics.py +107 -22
- helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
- helm/benchmark/metrics/cleva_harms_metrics.py +12 -11
- helm/benchmark/metrics/code_metrics.py +5 -5
- helm/benchmark/metrics/code_metrics_helper.py +11 -3
- helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
- helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
- helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
- helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
- helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
- helm/benchmark/metrics/comet_metric.py +125 -0
- helm/benchmark/metrics/common_metric_specs.py +174 -0
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +83 -0
- helm/benchmark/metrics/copyright_metrics.py +5 -5
- helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
- helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
- helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
- helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
- helm/benchmark/metrics/disinformation_metrics.py +8 -114
- helm/benchmark/metrics/dry_run_metrics.py +35 -6
- helm/benchmark/metrics/efficiency_metrics.py +287 -0
- helm/benchmark/metrics/ehr_sql_metrics.py +159 -0
- helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
- helm/benchmark/metrics/evaluate_reference_metrics.py +831 -0
- helm/benchmark/metrics/fin_qa_metrics.py +60 -0
- helm/benchmark/metrics/fin_qa_metrics_helper.py +398 -0
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +115 -0
- helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
- helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
- helm/benchmark/metrics/gpt4v_originality_critique_metrics.py +126 -0
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +48 -0
- helm/benchmark/metrics/ifeval/instructions.py +1574 -0
- helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
- helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
- helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
- helm/benchmark/metrics/ifeval_metrics.py +67 -0
- helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
- helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
- helm/benchmark/metrics/image_generation/clip_score_metrics.py +84 -0
- helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
- helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
- helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
- helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
- helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
- helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
- helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
- helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
- helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
- helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
- helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
- helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
- helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
- helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
- helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
- helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
- helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
- helm/benchmark/metrics/image_generation/q16/test_q16.py +20 -0
- helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
- helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
- helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
- helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
- helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
- helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
- helm/benchmark/metrics/instruction_following_critique_metrics.py +48 -5
- helm/benchmark/metrics/kpi_edgar_metrics.py +142 -0
- helm/benchmark/metrics/language_modeling_metrics.py +111 -0
- helm/benchmark/metrics/live_qa_metrics.py +35 -0
- helm/benchmark/metrics/llm_jury_metrics.py +58 -0
- helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
- helm/benchmark/metrics/lmkt_metrics.py +47 -0
- helm/benchmark/metrics/machine_translation_metrics.py +89 -0
- helm/benchmark/metrics/medcalc_bench_metrics.py +137 -0
- helm/benchmark/metrics/medec_metrics.py +124 -0
- helm/benchmark/metrics/melt_bias_metric.py +234 -0
- helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
- helm/benchmark/metrics/melt_metric_specs.py +43 -0
- helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
- helm/benchmark/metrics/metric.py +121 -175
- helm/benchmark/metrics/metric_name.py +0 -1
- helm/benchmark/metrics/metric_service.py +23 -7
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +127 -0
- helm/benchmark/metrics/nltk_helper.py +32 -0
- helm/benchmark/metrics/omni_math_metrics.py +44 -0
- helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
- helm/benchmark/metrics/output_processing_metric.py +60 -0
- helm/benchmark/metrics/output_processors.py +15 -0
- helm/benchmark/metrics/paraphrase_generation_metrics.py +5 -6
- helm/benchmark/metrics/prometheus_vision_critique_metrics.py +185 -0
- helm/benchmark/metrics/ranking_metrics.py +5 -5
- helm/benchmark/metrics/reference_metric.py +148 -0
- helm/benchmark/metrics/reka_vibe_critique_metrics.py +158 -0
- helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
- helm/benchmark/metrics/safety_metrics.py +91 -0
- helm/benchmark/metrics/seahelm_metrics.py +201 -0
- helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
- helm/benchmark/metrics/spider_metrics.py +7 -0
- helm/benchmark/metrics/statistic.py +1 -1
- helm/benchmark/metrics/summac/model_summac.py +8 -11
- helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
- helm/benchmark/metrics/summarization_metrics.py +150 -11
- helm/benchmark/metrics/test_bias_metrics.py +5 -1
- helm/benchmark/metrics/test_classification_metrics.py +145 -70
- helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
- helm/benchmark/metrics/{test_basic_metrics.py → test_evaluate_reference_metrics.py} +20 -1
- helm/benchmark/metrics/test_metric.py +3 -3
- helm/benchmark/metrics/test_statistic.py +2 -2
- helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
- helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +11 -3
- helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +3 -3
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +7 -7
- helm/benchmark/metrics/toxicity_metrics.py +37 -7
- helm/benchmark/metrics/toxicity_utils.py +23 -0
- helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py +52 -0
- helm/benchmark/metrics/unitxt_metrics.py +107 -0
- helm/benchmark/metrics/vision_language/__init__.py +0 -0
- helm/benchmark/metrics/vision_language/emd_utils.py +347 -0
- helm/benchmark/metrics/vision_language/image_metrics.py +537 -0
- helm/benchmark/metrics/vision_language/image_utils.py +100 -0
- helm/benchmark/metrics/wildbench_metrics.py +54 -0
- helm/benchmark/model_deployment_registry.py +69 -5
- helm/benchmark/model_metadata_registry.py +58 -2
- helm/benchmark/multi_gpu_runner.py +133 -0
- helm/benchmark/presentation/contamination.py +3 -3
- helm/benchmark/presentation/create_plots.py +51 -20
- helm/benchmark/presentation/run_display.py +51 -12
- helm/benchmark/presentation/run_entry.py +2 -2
- helm/benchmark/presentation/schema.py +83 -66
- helm/benchmark/presentation/summarize.py +483 -388
- helm/benchmark/presentation/table.py +8 -8
- helm/benchmark/presentation/taxonomy_info.py +20 -0
- helm/benchmark/presentation/test_contamination.py +2 -2
- helm/benchmark/presentation/test_create_plots.py +4 -1
- helm/benchmark/presentation/test_run_entry.py +2 -2
- helm/benchmark/presentation/test_schema.py +11 -0
- helm/benchmark/presentation/test_summarize.py +148 -6
- helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
- helm/benchmark/reeval_run.py +202 -0
- helm/benchmark/reeval_runner.py +355 -0
- helm/benchmark/run.py +151 -87
- helm/benchmark/run_expander.py +418 -33
- helm/benchmark/run_spec.py +93 -0
- helm/benchmark/run_spec_factory.py +180 -0
- helm/benchmark/run_specs/__init__.py +0 -0
- helm/benchmark/run_specs/air_bench_run_specs.py +58 -0
- helm/benchmark/run_specs/arabic_run_specs.py +197 -0
- helm/benchmark/run_specs/audio_run_specs.py +657 -0
- helm/benchmark/run_specs/bluex_run_specs.py +40 -0
- helm/benchmark/run_specs/call_center_run_specs.py +201 -0
- helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
- helm/benchmark/run_specs/classic_run_specs.py +1393 -0
- helm/benchmark/run_specs/cleva_run_specs.py +277 -0
- helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
- helm/benchmark/run_specs/decodingtrust_run_specs.py +316 -0
- helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
- helm/benchmark/run_specs/enterprise_run_specs.py +280 -0
- helm/benchmark/run_specs/experimental_run_specs.py +224 -0
- helm/benchmark/run_specs/finance_run_specs.py +114 -0
- helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
- helm/benchmark/run_specs/heim_run_specs.py +625 -0
- helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
- helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
- helm/benchmark/run_specs/lite_run_specs.py +307 -0
- helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
- helm/benchmark/run_specs/long_context_run_specs.py +188 -0
- helm/benchmark/run_specs/medhelm/__init__.py +0 -0
- helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +1570 -0
- helm/benchmark/run_specs/melt_run_specs.py +783 -0
- helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
- helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
- helm/benchmark/run_specs/oab_exams_specs.py +32 -0
- helm/benchmark/run_specs/safety_run_specs.py +191 -0
- helm/benchmark/run_specs/seahelm_run_specs.py +652 -0
- helm/benchmark/run_specs/simple_run_specs.py +104 -0
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +167 -0
- helm/benchmark/run_specs/sql_run_specs.py +54 -0
- helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
- helm/benchmark/run_specs/unitxt_run_specs.py +51 -0
- helm/benchmark/run_specs/vlm_run_specs.py +1057 -0
- helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
- helm/benchmark/runner.py +63 -62
- helm/benchmark/runner_config_registry.py +21 -0
- helm/benchmark/scenarios/aci_bench_scenario.py +149 -0
- helm/benchmark/scenarios/air_bench_scenario.py +76 -0
- helm/benchmark/scenarios/alghafa_scenario.py +126 -0
- helm/benchmark/scenarios/alrage_scenario.py +54 -0
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +27 -3
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +82 -0
- helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
- helm/benchmark/scenarios/arabic_mmlu_scenario.py +82 -0
- helm/benchmark/scenarios/aratrust_scenario.py +95 -0
- helm/benchmark/scenarios/audio_language/__init__.py +0 -0
- helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +130 -0
- helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
- helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
- helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
- helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
- helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
- helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
- helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
- helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
- helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
- helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +74 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +70 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +79 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +78 -0
- helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
- helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +105 -0
- helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
- helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
- helm/benchmark/scenarios/babi_qa_scenario.py +16 -1
- helm/benchmark/scenarios/banking77_scenario.py +77 -0
- helm/benchmark/scenarios/bbq_scenario.py +17 -2
- helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
- helm/benchmark/scenarios/big_bench_scenario.py +11 -1
- helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
- helm/benchmark/scenarios/bird_sql_scenario.py +112 -0
- helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
- helm/benchmark/scenarios/blimp_scenario.py +1 -1
- helm/benchmark/scenarios/bluex_scenario.py +70 -0
- helm/benchmark/scenarios/bold_scenario.py +18 -3
- helm/benchmark/scenarios/boolq_scenario.py +21 -1
- helm/benchmark/scenarios/call_center_scenario.py +84 -0
- helm/benchmark/scenarios/casehold_scenario.py +79 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +129 -0
- helm/benchmark/scenarios/ci_mcqa_scenario.py +80 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +14 -1
- helm/benchmark/scenarios/clear_scenario.py +180 -0
- helm/benchmark/scenarios/cleva_scenario.py +482 -3
- helm/benchmark/scenarios/code_scenario.py +46 -4
- helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
- helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
- helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
- helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
- helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
- helm/benchmark/scenarios/commonsense_scenario.py +33 -1
- helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +118 -0
- helm/benchmark/scenarios/copyright_scenario.py +35 -1
- helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +261 -0
- helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
- helm/benchmark/scenarios/czech_bank_qa_scenario.py +148 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +190 -0
- helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +143 -0
- helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +98 -0
- helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +344 -0
- helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +217 -0
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +571 -0
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +80 -0
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +90 -0
- helm/benchmark/scenarios/dialogue_scenarios.py +13 -3
- helm/benchmark/scenarios/dischargeme_scenario.py +196 -0
- helm/benchmark/scenarios/disinformation_scenario.py +32 -1
- helm/benchmark/scenarios/dyck_language_scenario.py +25 -1
- helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
- helm/benchmark/scenarios/ehr_sql_scenario.py +137 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +1541 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +77 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +33 -3
- helm/benchmark/scenarios/entity_matching_scenario.py +26 -2
- helm/benchmark/scenarios/ewok_scenario.py +116 -0
- helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
- helm/benchmark/scenarios/fin_qa_scenario.py +139 -0
- helm/benchmark/scenarios/financebench_scenario.py +74 -0
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +115 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +145 -0
- helm/benchmark/scenarios/gpqa_scenario.py +98 -0
- helm/benchmark/scenarios/grammar.py +2 -2
- helm/benchmark/scenarios/grammar_scenario.py +21 -2
- helm/benchmark/scenarios/gsm_scenario.py +31 -1
- helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +61 -0
- helm/benchmark/scenarios/harm_bench_scenario.py +70 -0
- helm/benchmark/scenarios/headqa_scenario.py +158 -0
- helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +50 -0
- helm/benchmark/scenarios/ice_scenario.py +28 -4
- helm/benchmark/scenarios/ifeval_scenario.py +71 -0
- helm/benchmark/scenarios/image_generation/__init__.py +0 -0
- helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
- helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
- helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
- helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
- helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
- helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
- helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
- helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
- helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
- helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
- helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
- helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
- helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
- helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
- helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
- helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
- helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
- helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
- helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
- helm/benchmark/scenarios/imdb_scenario.py +26 -3
- helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +111 -0
- helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
- helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +98 -0
- helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
- helm/benchmark/scenarios/koala_scenario.py +21 -1
- helm/benchmark/scenarios/kpi_edgar_scenario.py +172 -0
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +149 -0
- helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +61 -1
- helm/benchmark/scenarios/legal_support_scenario.py +24 -1
- helm/benchmark/scenarios/legalbench_scenario.py +45 -3
- helm/benchmark/scenarios/lex_glue_scenario.py +23 -2
- helm/benchmark/scenarios/lextreme_scenario.py +22 -1
- helm/benchmark/scenarios/live_qa_scenario.py +94 -0
- helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
- helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
- helm/benchmark/scenarios/lsat_qa_scenario.py +15 -1
- helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
- helm/benchmark/scenarios/math_scenario.py +81 -22
- helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
- helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
- helm/benchmark/scenarios/med_dialog_scenario.py +56 -22
- helm/benchmark/scenarios/med_mcqa_scenario.py +24 -1
- helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
- helm/benchmark/scenarios/med_qa_scenario.py +30 -1
- helm/benchmark/scenarios/medalign_scenario.py +117 -0
- helm/benchmark/scenarios/medalign_scenario_helper.py +326 -0
- helm/benchmark/scenarios/medbullets_scenario.py +167 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +149 -0
- helm/benchmark/scenarios/medec_scenario.py +148 -0
- helm/benchmark/scenarios/medhallu_scenario.py +95 -0
- helm/benchmark/scenarios/medhelm/__init__.py +0 -0
- helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
- helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +134 -0
- helm/benchmark/scenarios/medication_qa_scenario.py +96 -0
- helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
- helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
- helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
- helm/benchmark/scenarios/melt_scenarios.py +793 -0
- helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
- helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
- helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
- helm/benchmark/scenarios/mental_health_scenario.py +146 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +127 -0
- helm/benchmark/scenarios/mimic_rrs_scenario.py +121 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +99 -0
- helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +113 -0
- helm/benchmark/scenarios/mmlu_scenario.py +32 -1
- helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
- helm/benchmark/scenarios/msmarco_scenario.py +31 -1
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +166 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +164 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +297 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +20 -1
- helm/benchmark/scenarios/natural_qa_scenario.py +33 -1
- helm/benchmark/scenarios/newsqa_scenario.py +1 -1
- helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
- helm/benchmark/scenarios/omni_math_scenario.py +71 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +33 -2
- helm/benchmark/scenarios/openai_mrcr_scenario.py +94 -0
- helm/benchmark/scenarios/opinions_qa_scenario.py +1 -5
- helm/benchmark/scenarios/pubmed_qa_scenario.py +81 -43
- helm/benchmark/scenarios/quac_scenario.py +24 -1
- helm/benchmark/scenarios/race_based_med_scenario.py +175 -0
- helm/benchmark/scenarios/raft_scenario.py +33 -3
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
- helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
- helm/benchmark/scenarios/ruler_qa_scenarios.py +128 -0
- helm/benchmark/scenarios/scenario.py +44 -1
- helm/benchmark/scenarios/seahelm_scenario.py +2295 -0
- helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +97 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +95 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +99 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +98 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +94 -0
- helm/benchmark/scenarios/shc_privacy_scenario.py +100 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +98 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +104 -0
- helm/benchmark/scenarios/shc_sei_scenario.py +94 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +98 -0
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +44 -0
- helm/benchmark/scenarios/simple_scenarios.py +122 -1
- helm/benchmark/scenarios/situation_prompts.yaml +49 -0
- helm/benchmark/scenarios/spider_scenario.py +109 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +119 -0
- helm/benchmark/scenarios/summarization_scenario.py +48 -1
- helm/benchmark/scenarios/sumosum_scenario.py +157 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +24 -1
- helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
- helm/benchmark/scenarios/test_air_bench_scenario.py +27 -0
- helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
- helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
- helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
- helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
- helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
- helm/benchmark/scenarios/test_commonsense_scenario.py +21 -0
- helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
- helm/benchmark/scenarios/test_ewok_scenario.py +29 -0
- helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
- helm/benchmark/scenarios/test_financebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
- helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
- helm/benchmark/scenarios/test_gsm_scenario.py +31 -0
- helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
- helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
- helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
- helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
- helm/benchmark/scenarios/test_legalbench_scenario.py +30 -0
- helm/benchmark/scenarios/test_math_scenario.py +4 -3
- helm/benchmark/scenarios/test_med_qa_scenario.py +30 -0
- helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
- helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
- helm/benchmark/scenarios/test_mmlu_scenario.py +33 -0
- helm/benchmark/scenarios/test_narrativeqa_scenario.py +73 -0
- helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
- helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
- helm/benchmark/scenarios/test_scenario.py +6 -3
- helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
- helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
- helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
- helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +239 -0
- helm/benchmark/scenarios/the_pile_scenario.py +13 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +26 -2
- helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
- helm/benchmark/scenarios/unitxt_scenario.py +62 -0
- helm/benchmark/scenarios/verifiability_judgment_scenario.py +4 -2
- helm/benchmark/scenarios/vicuna_scenario.py +22 -2
- helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +83 -0
- helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
- helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
- helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +135 -0
- helm/benchmark/scenarios/vision_language/exams_v_scenario.py +104 -0
- helm/benchmark/scenarios/vision_language/fair_face_scenario.py +136 -0
- helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +74 -0
- helm/benchmark/scenarios/vision_language/gqa_scenario.py +91 -0
- helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +94 -0
- helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
- helm/benchmark/scenarios/vision_language/image2struct/__init__.py +0 -0
- helm/benchmark/scenarios/vision_language/image2struct/chart2csv_scenario.py +55 -0
- helm/benchmark/scenarios/vision_language/image2struct/image2struct_scenario.py +225 -0
- helm/benchmark/scenarios/vision_language/image2struct/latex_scenario.py +21 -0
- helm/benchmark/scenarios/vision_language/image2struct/musicsheet_scenario.py +16 -0
- helm/benchmark/scenarios/vision_language/image2struct/utils_latex.py +339 -0
- helm/benchmark/scenarios/vision_language/image2struct/webpage/__init__.py +0 -0
- helm/benchmark/scenarios/vision_language/image2struct/webpage/driver.py +84 -0
- helm/benchmark/scenarios/vision_language/image2struct/webpage/jekyll_server.py +182 -0
- helm/benchmark/scenarios/vision_language/image2struct/webpage/utils.py +31 -0
- helm/benchmark/scenarios/vision_language/image2struct/webpage_scenario.py +256 -0
- helm/benchmark/scenarios/vision_language/math_vista_scenario.py +117 -0
- helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
- helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +103 -0
- helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
- helm/benchmark/scenarios/vision_language/mme_scenario.py +148 -0
- helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
- helm/benchmark/scenarios/vision_language/mscoco_captioning_scenario.py +92 -0
- helm/benchmark/scenarios/vision_language/mscoco_categorization_scenario.py +117 -0
- helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
- helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
- helm/benchmark/scenarios/vision_language/originality_scenario.py +35 -0
- helm/benchmark/scenarios/vision_language/pairs_scenario.py +247 -0
- helm/benchmark/scenarios/vision_language/pope_scenario.py +105 -0
- helm/benchmark/scenarios/vision_language/real_world_qa_scenario.py +57 -0
- helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +131 -0
- helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
- helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +98 -0
- helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +4 -5
- helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
- helm/benchmark/scenarios/vision_language/vqa_scenario.py +8 -4
- helm/benchmark/scenarios/wikifact_scenario.py +31 -1
- helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
- helm/benchmark/scenarios/wildbench_scenario.py +101 -0
- helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +33 -2
- helm/benchmark/scenarios/xstest_scenario.py +35 -0
- helm/benchmark/server.py +32 -2
- helm/benchmark/slurm_jobs.py +1 -2
- helm/benchmark/slurm_runner.py +78 -50
- helm/benchmark/static/schema_air_bench.yaml +3149 -0
- helm/benchmark/static/schema_arabic.yaml +271 -0
- helm/benchmark/static/schema_audio.yaml +763 -0
- helm/benchmark/static/schema_autobencher.yaml +150 -0
- helm/benchmark/static/schema_call_center.yaml +269 -0
- helm/benchmark/static/schema_capabilities.yaml +254 -0
- helm/benchmark/static/schema_classic.yaml +259 -1140
- helm/benchmark/static/schema_cleva.yaml +768 -0
- helm/benchmark/static/schema_czech_bank.yaml +148 -0
- helm/benchmark/static/schema_decodingtrust.yaml +444 -0
- helm/benchmark/static/schema_enem_challenge.yaml +146 -0
- helm/benchmark/static/schema_enterprise.yaml +319 -0
- helm/benchmark/static/schema_ewok.yaml +367 -0
- helm/benchmark/static/schema_finance.yaml +191 -0
- helm/benchmark/static/schema_heim.yaml +1389 -0
- helm/benchmark/static/schema_image2struct.yaml +588 -0
- helm/benchmark/static/schema_instruction_following.yaml +161 -0
- helm/benchmark/static/schema_legal.yaml +566 -0
- helm/benchmark/static/schema_lite.yaml +3 -286
- helm/benchmark/static/schema_long_context.yaml +282 -0
- helm/benchmark/static/schema_medhelm.yaml +1176 -0
- helm/benchmark/static/schema_melt.yaml +1257 -0
- helm/benchmark/static/schema_mmlu.yaml +1449 -0
- helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
- helm/benchmark/static/schema_safety.yaml +283 -0
- helm/benchmark/static/schema_seahelm.yaml +723 -0
- helm/benchmark/static/schema_slp.yaml +219 -0
- helm/benchmark/static/schema_slphelm.yaml +162 -0
- helm/benchmark/static/schema_social_audio.yaml +224 -0
- helm/benchmark/static/schema_sql.yaml +171 -0
- helm/benchmark/static/schema_thai.yaml +244 -0
- helm/benchmark/static/schema_torr.yaml +474 -0
- helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
- helm/benchmark/static/schema_unitxt.yaml +370 -0
- helm/benchmark/static/schema_vhelm.yaml +933 -0
- helm/benchmark/static/schema_vhelm_lite.yaml +109 -0
- helm/benchmark/static/schema_video.yaml +219 -0
- helm/benchmark/static_build/assets/air-overview-DpBbyagA.png +0 -0
- helm/benchmark/static_build/assets/audio-table-Dn5NMMeJ.png +0 -0
- helm/benchmark/static_build/assets/heim-logo-BJtQlEbV.png +0 -0
- helm/benchmark/static_build/assets/helm-safety-COfndXuS.png +0 -0
- helm/benchmark/static_build/assets/helmhero-D9TvmJsp.png +0 -0
- helm/benchmark/static_build/assets/index-oIeiQW2g.css +1 -0
- helm/benchmark/static_build/assets/index-qOFpOyHb.js +10 -0
- helm/benchmark/static_build/assets/medhelm-overview-CND0EIsy.png +0 -0
- helm/benchmark/static_build/assets/medhelm-v1-overview-Cu2tphBB.png +0 -0
- helm/benchmark/static_build/assets/overview-BwypNWnk.png +0 -0
- helm/benchmark/static_build/assets/process-flow-DWDJC733.png +0 -0
- helm/benchmark/static_build/assets/react-BteFIppM.js +85 -0
- helm/benchmark/static_build/assets/recharts-DxuQtTOs.js +97 -0
- helm/benchmark/static_build/assets/tremor-DR4fE7ko.js +10 -0
- helm/benchmark/static_build/assets/vhelm-aspects-NiDQofvP.png +0 -0
- helm/benchmark/static_build/assets/vhelm-framework-NxJE4fdA.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-ypCL5Yvq.png +0 -0
- helm/benchmark/static_build/config.js +4 -0
- helm/benchmark/static_build/index.html +19 -0
- helm/benchmark/test_data_preprocessor.py +3 -3
- helm/benchmark/test_run_expander.py +1 -1
- helm/benchmark/window_services/default_window_service.py +3 -45
- helm/benchmark/window_services/encoder_decoder_window_service.py +4 -15
- helm/benchmark/window_services/ice_window_service.py +1 -35
- helm/benchmark/window_services/image_generation/__init__.py +0 -0
- helm/benchmark/window_services/image_generation/clip_window_service.py +13 -0
- helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
- helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
- helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
- helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
- helm/benchmark/window_services/local_window_service.py +22 -5
- helm/benchmark/window_services/test_anthropic_window_service.py +5 -4
- helm/benchmark/window_services/test_bloom_window_service.py +5 -4
- helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
- helm/benchmark/window_services/test_gpt2_window_service.py +9 -4
- helm/benchmark/window_services/test_gpt4_window_service.py +10 -4
- helm/benchmark/window_services/test_gptj_window_service.py +11 -5
- helm/benchmark/window_services/test_gptneox_window_service.py +6 -5
- helm/benchmark/window_services/test_openai_window_service.py +18 -12
- helm/benchmark/window_services/test_opt_window_service.py +6 -5
- helm/benchmark/window_services/test_palmyra_window_service.py +5 -4
- helm/benchmark/window_services/test_t0pp_window_service.py +5 -4
- helm/benchmark/window_services/test_t511b_window_service.py +5 -4
- helm/benchmark/window_services/test_ul2_window_service.py +5 -4
- helm/benchmark/window_services/test_utils.py +6 -6
- helm/benchmark/window_services/test_yalm_window_service.py +5 -4
- helm/benchmark/window_services/tokenizer_service.py +7 -13
- helm/benchmark/window_services/window_service.py +42 -0
- helm/benchmark/window_services/window_service_factory.py +4 -1
- helm/benchmark/window_services/yalm_window_service.py +1 -28
- helm/clients/__init__.py +0 -0
- helm/{proxy/clients → clients}/ai21_client.py +78 -12
- helm/clients/aleph_alpha_client.py +114 -0
- helm/{proxy/clients → clients}/anthropic_client.py +304 -21
- helm/clients/audio_language/__init__.py +0 -0
- helm/clients/audio_language/diva_llama_client.py +122 -0
- helm/clients/audio_language/llama_omni/arguments.py +61 -0
- helm/clients/audio_language/llama_omni/constants.py +9 -0
- helm/clients/audio_language/llama_omni/conversation.py +213 -0
- helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
- helm/clients/audio_language/llama_omni/model/builder.py +88 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
- helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
- helm/clients/audio_language/llama_omni/preprocess.py +295 -0
- helm/clients/audio_language/llama_omni/utils.py +202 -0
- helm/clients/audio_language/llama_omni_client.py +199 -0
- helm/clients/audio_language/qwen2_5_omni_client.py +210 -0
- helm/clients/audio_language/qwen2_audiolm_client.py +191 -0
- helm/clients/audio_language/qwen_audiolm_client.py +153 -0
- helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
- helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
- helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
- helm/clients/audio_language/test.py +62 -0
- helm/{proxy/clients → clients}/auto_client.py +72 -31
- helm/clients/azure_openai_client.py +55 -0
- helm/clients/bedrock_client.py +381 -0
- helm/clients/bedrock_utils.py +105 -0
- helm/{proxy/clients → clients}/client.py +92 -17
- helm/clients/clip_score_client.py +49 -0
- helm/clients/clip_scorers/__init__.py +0 -0
- helm/clients/clip_scorers/base_clip_scorer.py +18 -0
- helm/clients/clip_scorers/clip_scorer.py +50 -0
- helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
- helm/{proxy/clients → clients}/cohere_client.py +105 -14
- helm/clients/dspy_client.py +135 -0
- helm/clients/gcs_client.py +82 -0
- helm/{proxy/clients → clients}/google_client.py +8 -6
- helm/clients/google_translate_client.py +35 -0
- helm/clients/grok_client.py +36 -0
- helm/{proxy/clients → clients}/http_model_client.py +8 -8
- helm/{proxy/clients → clients}/huggingface_client.py +157 -86
- helm/clients/huggingface_pipeline_client.py +138 -0
- helm/clients/ibm_client.py +269 -0
- helm/clients/image_generation/__init__.py +0 -0
- helm/clients/image_generation/adobe_vision_client.py +80 -0
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +100 -0
- helm/clients/image_generation/cogview2/__init__.py +0 -0
- helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
- helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
- helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
- helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +99 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
- helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +144 -0
- helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
- helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
- helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
- helm/clients/image_generation/cogview2_client.py +192 -0
- helm/clients/image_generation/dalle2_client.py +194 -0
- helm/clients/image_generation/dalle3_client.py +108 -0
- helm/clients/image_generation/dalle_mini/__init__.py +3 -0
- helm/clients/image_generation/dalle_mini/data.py +442 -0
- helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
- helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
- helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
- helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
- helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
- helm/clients/image_generation/dalle_mini/model/text.py +251 -0
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
- helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
- helm/clients/image_generation/dalle_mini_client.py +191 -0
- helm/clients/image_generation/deep_floyd_client.py +80 -0
- helm/clients/image_generation/huggingface_diffusers_client.py +250 -0
- helm/clients/image_generation/image_generation_client_utils.py +9 -0
- helm/clients/image_generation/lexica_client.py +88 -0
- helm/clients/image_generation/mindalle/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/__init__.py +216 -0
- helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
- helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
- helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
- helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
- helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
- helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
- helm/clients/image_generation/mindalle/utils/config.py +129 -0
- helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
- helm/clients/image_generation/mindalle/utils/utils.py +89 -0
- helm/clients/image_generation/mindalle_client.py +116 -0
- helm/clients/image_generation/nudity_check_client.py +64 -0
- helm/clients/image_generation/together_image_generation_client.py +113 -0
- helm/{proxy/clients → clients}/lit_gpt_client.py +6 -6
- helm/{proxy/clients → clients}/megatron_client.py +7 -5
- helm/clients/mistral_client.py +180 -0
- helm/clients/moderation_api_client.py +111 -0
- helm/clients/nvidia_nim_client.py +32 -0
- helm/clients/open_lm_client.py +43 -0
- helm/clients/openai_client.py +604 -0
- helm/clients/openai_responses_client.py +200 -0
- helm/clients/openrouter_client.py +31 -0
- helm/{proxy/clients → clients}/palmyra_client.py +31 -14
- helm/{proxy/clients → clients}/perspective_api_client.py +18 -14
- helm/clients/reka_client.py +190 -0
- helm/clients/simple_client.py +64 -0
- helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
- helm/clients/stanfordhealthcare_claude_client.py +31 -0
- helm/clients/stanfordhealthcare_google_client.py +43 -0
- helm/clients/stanfordhealthcare_http_model_client.py +95 -0
- helm/clients/stanfordhealthcare_openai_client.py +62 -0
- helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
- helm/{proxy/clients → clients}/test_auto_client.py +13 -15
- helm/clients/test_client.py +98 -0
- helm/{proxy/clients → clients}/test_huggingface_client.py +31 -16
- helm/clients/test_openrouter_client.py +69 -0
- helm/clients/test_simple_client.py +19 -0
- helm/clients/test_together_client.py +184 -0
- helm/clients/together_client.py +599 -0
- helm/clients/upstage_client.py +23 -0
- helm/clients/vertexai_client.py +488 -0
- helm/clients/vision_language/__init__.py +0 -0
- helm/clients/vision_language/huggingface_vision2seq_client.py +148 -0
- helm/clients/vision_language/huggingface_vlm_client.py +114 -0
- helm/{proxy/clients → clients}/vision_language/idefics_client.py +61 -51
- helm/clients/vision_language/open_flamingo/__init__.py +2 -0
- helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
- helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
- helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
- helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
- helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
- helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
- helm/clients/vision_language/open_flamingo_client.py +155 -0
- helm/clients/vision_language/paligemma_client.py +147 -0
- helm/clients/vision_language/palmyra_vision_client.py +101 -0
- helm/clients/vision_language/qwen2_vlm_client.py +189 -0
- helm/clients/vision_language/qwen_vlm_client.py +174 -0
- helm/clients/vllm_client.py +80 -0
- helm/clients/vllm_granite_thinking_client.py +56 -0
- helm/clients/writer_client.py +105 -0
- helm/clients/yi_client.py +28 -0
- helm/common/audio_utils.py +111 -0
- helm/common/cache.py +23 -33
- helm/common/cache_backend_config.py +47 -0
- helm/common/clip_score_request.py +41 -0
- helm/common/context.py +80 -0
- helm/common/credentials_utils.py +5 -5
- helm/common/critique_request.py +10 -2
- helm/common/file_caches/__init__.py +0 -0
- helm/common/file_caches/file_cache.py +16 -0
- helm/common/file_caches/local_file_cache.py +61 -0
- helm/common/file_caches/test_local_file_cache.py +25 -0
- helm/common/file_upload_request.py +27 -0
- helm/common/general.py +10 -3
- helm/common/hierarchical_logger.py +124 -12
- helm/common/image_generation_parameters.py +25 -0
- helm/common/images_utils.py +60 -5
- helm/common/key_value_store.py +41 -10
- helm/common/local_context.py +140 -0
- helm/common/media_object.py +14 -1
- helm/common/moderations_api_request.py +71 -0
- helm/common/mongo_key_value_store.py +8 -7
- helm/common/multimodal_request_utils.py +57 -0
- helm/common/nudity_check_request.py +29 -0
- helm/common/object_spec.py +23 -8
- helm/common/optional_dependencies.py +1 -1
- helm/common/reeval_parameters.py +12 -0
- helm/common/remote_context.py +61 -0
- helm/common/request.py +45 -19
- helm/common/response_format.py +18 -0
- helm/common/test_cache.py +1 -48
- helm/common/test_general.py +10 -0
- helm/common/test_logging.py +94 -0
- helm/common/test_media_object.py +1 -1
- helm/common/tokenization_request.py +1 -10
- helm/config/model_deployments.yaml +4713 -1005
- helm/config/model_metadata.yaml +4045 -255
- helm/config/tokenizer_configs.yaml +1091 -50
- helm/proxy/accounts.py +31 -4
- helm/proxy/cli.py +6 -4
- helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
- helm/proxy/critique/mechanical_turk_utils.py +1 -1
- helm/proxy/critique/model_critique_client.py +40 -10
- helm/proxy/example_queries.py +33 -28
- helm/proxy/retry.py +5 -0
- helm/proxy/server.py +82 -18
- helm/proxy/services/remote_service.py +32 -7
- helm/proxy/services/server_service.py +71 -69
- helm/proxy/services/service.py +30 -6
- helm/proxy/services/test_remote_service.py +6 -5
- helm/proxy/services/test_service.py +1 -13
- helm/proxy/static/help.html +99 -0
- helm/proxy/static/index.css +61 -0
- helm/proxy/static/index.html +40 -0
- helm/proxy/static/index.js +462 -0
- helm/proxy/test_accounts.py +32 -0
- helm/proxy/test_retry.py +1 -1
- helm/proxy/token_counters/auto_token_counter.py +37 -37
- helm/proxy/token_counters/test_auto_token_counter.py +164 -0
- helm/proxy/token_counters/token_counter.py +3 -5
- helm/tokenizers/__init__.py +0 -0
- helm/tokenizers/ai21_tokenizer.py +52 -0
- helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/auto_tokenizer.py +9 -12
- helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +2 -30
- helm/tokenizers/cohere_tokenizer.py +50 -0
- helm/tokenizers/grok_tokenizer.py +55 -0
- helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +4 -4
- helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +44 -41
- helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +1 -1
- helm/tokenizers/simple_tokenizer.py +33 -0
- helm/tokenizers/test_ai21_tokenizer.py +48 -0
- helm/{proxy/tokenizers → tokenizers}/test_anthropic_tokenizer.py +6 -2
- helm/tokenizers/test_cohere_tokenizer.py +39 -0
- helm/tokenizers/test_grok_tokenizer.py +33 -0
- helm/{proxy/tokenizers → tokenizers}/test_huggingface_tokenizer.py +9 -2
- helm/tokenizers/test_simple_tokenizer.py +33 -0
- helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/tokenizer.py +3 -1
- helm/{proxy/tokenizers → tokenizers}/vertexai_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +8 -6
- helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
- helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
- crfm_helm-0.4.0.dist-info/METADATA +0 -264
- crfm_helm-0.4.0.dist-info/RECORD +0 -397
- helm/benchmark/data_overlap/data_overlap_spec.py +0 -86
- helm/benchmark/data_overlap/export_scenario_text.py +0 -119
- helm/benchmark/data_overlap/light_scenario.py +0 -60
- helm/benchmark/metrics/numeracy_metrics.py +0 -72
- helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
- helm/benchmark/run_specs.py +0 -2762
- helm/benchmark/scenarios/numeracy_scenario.py +0 -784
- helm/benchmark/static/benchmarking.css +0 -156
- helm/benchmark/static/benchmarking.js +0 -1705
- helm/benchmark/static/config.js +0 -3
- helm/benchmark/static/images/helm-logo.png +0 -0
- helm/benchmark/static/images/language-model-helm.png +0 -0
- helm/benchmark/static/images/organizations/ai21.png +0 -0
- helm/benchmark/static/images/organizations/anthropic.png +0 -0
- helm/benchmark/static/images/organizations/bigscience.png +0 -0
- helm/benchmark/static/images/organizations/cohere.png +0 -0
- helm/benchmark/static/images/organizations/eleutherai.png +0 -0
- helm/benchmark/static/images/organizations/google.png +0 -0
- helm/benchmark/static/images/organizations/meta.png +0 -0
- helm/benchmark/static/images/organizations/microsoft.png +0 -0
- helm/benchmark/static/images/organizations/nvidia.png +0 -0
- helm/benchmark/static/images/organizations/openai.png +0 -0
- helm/benchmark/static/images/organizations/together.png +0 -0
- helm/benchmark/static/images/organizations/tsinghua-keg.png +0 -0
- helm/benchmark/static/images/organizations/yandex.png +0 -0
- helm/benchmark/static/images/scenarios-by-metrics.png +0 -0
- helm/benchmark/static/images/taxonomy-scenarios.png +0 -0
- helm/benchmark/static/index.html +0 -68
- helm/benchmark/static/json-urls.js +0 -69
- helm/benchmark/static/plot-captions.js +0 -27
- helm/benchmark/static/utils.js +0 -285
- helm/benchmark/test_model_deployment_definition.py +0 -92
- helm/benchmark/test_model_properties.py +0 -1570
- helm/benchmark/vlm_run_specs.py +0 -97
- helm/benchmark/window_services/ai21_window_service.py +0 -258
- helm/benchmark/window_services/cohere_window_service.py +0 -163
- helm/benchmark/window_services/flan_t5_window_service.py +0 -29
- helm/benchmark/window_services/gpt2_window_service.py +0 -32
- helm/benchmark/window_services/huggingface_window_service.py +0 -60
- helm/benchmark/window_services/t0pp_window_service.py +0 -35
- helm/benchmark/window_services/t511b_window_service.py +0 -30
- helm/benchmark/window_services/test_ai21_window_service.py +0 -163
- helm/benchmark/window_services/test_cohere_window_service.py +0 -74
- helm/benchmark/window_services/test_cohere_window_service_utils.py +0 -8328
- helm/benchmark/window_services/test_ice_window_service.py +0 -326
- helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
- helm/benchmark/window_services/ul2_window_service.py +0 -30
- helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
- helm/common/cache_utils.py +0 -14
- helm/proxy/clients/aleph_alpha_client.py +0 -95
- helm/proxy/clients/goose_ai_client.py +0 -99
- helm/proxy/clients/microsoft_client.py +0 -180
- helm/proxy/clients/openai_client.py +0 -206
- helm/proxy/clients/simple_client.py +0 -60
- helm/proxy/clients/test_client.py +0 -49
- helm/proxy/clients/test_together_client.py +0 -97
- helm/proxy/clients/together_client.py +0 -334
- helm/proxy/clients/vertexai_client.py +0 -115
- helm/proxy/token_counters/ai21_token_counter.py +0 -20
- helm/proxy/token_counters/cohere_token_counter.py +0 -13
- helm/proxy/token_counters/free_token_counter.py +0 -12
- helm/proxy/token_counters/gooseai_token_counter.py +0 -24
- helm/proxy/token_counters/openai_token_counter.py +0 -22
- helm/proxy/token_counters/test_ai21_token_counter.py +0 -88
- helm/proxy/token_counters/test_openai_token_counter.py +0 -81
- helm/proxy/tokenizers/ai21_tokenizer.py +0 -60
- helm/proxy/tokenizers/anthropic_tokenizer.py +0 -52
- helm/proxy/tokenizers/cohere_tokenizer.py +0 -83
- helm/proxy/tokenizers/ice_tokenizer.py +0 -30
- helm/proxy/tokenizers/simple_tokenizer.py +0 -32
- helm/proxy/tokenizers/test_ice_tokenizer.py +0 -57
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info/licenses}/LICENSE +0 -0
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info}/top_level.txt +0 -0
- /helm/benchmark/{data_overlap → annotation}/__init__.py +0 -0
- /helm/{proxy/clients → benchmark/annotation/image2struct}/__init__.py +0 -0
- /helm/{proxy/clients/vision_language → benchmark/metrics/ifeval}/__init__.py +0 -0
- /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
- /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
- /helm/benchmark/{static/images/crfm-logo.png → static_build/assets/crfm-logo-Du4T1uWZ.png} +0 -0
- /helm/benchmark/{static/images/helm-logo-simple.png → static_build/assets/helm-logo-simple-DzOhNN41.png} +0 -0
- /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
- /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
- /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
- /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
- /helm/{benchmark → proxy}/static/general.js +0 -0
- /helm/{benchmark → proxy}/static/info-icon.png +0 -0
|
@@ -5,11 +5,16 @@ import urllib.parse
|
|
|
5
5
|
from dataclasses import asdict
|
|
6
6
|
from typing import Any, List, Optional
|
|
7
7
|
|
|
8
|
+
from helm.common.cache import CacheConfig
|
|
9
|
+
from helm.common.cache_backend_config import BlackHoleCacheBackendConfig
|
|
8
10
|
from helm.common.authentication import Authentication
|
|
11
|
+
from helm.common.moderations_api_request import ModerationAPIRequest, ModerationAPIRequestResult
|
|
9
12
|
from helm.common.critique_request import CritiqueRequest, CritiqueRequestResult
|
|
13
|
+
from helm.common.nudity_check_request import NudityCheckRequest, NudityCheckResult
|
|
14
|
+
from helm.common.file_upload_request import FileUploadRequest, FileUploadResult
|
|
10
15
|
from helm.common.perspective_api_request import PerspectiveAPIRequest, PerspectiveAPIRequestResult
|
|
16
|
+
from helm.common.clip_score_request import CLIPScoreRequest, CLIPScoreResult
|
|
11
17
|
from helm.common.tokenization_request import (
|
|
12
|
-
WindowServiceInfo,
|
|
13
18
|
TokenizationRequest,
|
|
14
19
|
TokenizationRequestResult,
|
|
15
20
|
DecodeRequestResult,
|
|
@@ -19,7 +24,7 @@ from helm.common.request import Request, RequestResult
|
|
|
19
24
|
from dacite import from_dict
|
|
20
25
|
from helm.proxy.accounts import Account
|
|
21
26
|
from helm.proxy.query import Query, QueryResult
|
|
22
|
-
from .service import Service, GeneralInfo
|
|
27
|
+
from helm.proxy.services.service import Service, GeneralInfo
|
|
23
28
|
|
|
24
29
|
|
|
25
30
|
class RemoteServiceError(Exception):
|
|
@@ -27,6 +32,8 @@ class RemoteServiceError(Exception):
|
|
|
27
32
|
|
|
28
33
|
|
|
29
34
|
class RemoteService(Service):
|
|
35
|
+
NOT_SUPPORTED_ERROR: str = "Not supported through the remote service."
|
|
36
|
+
|
|
30
37
|
def __init__(self, base_url):
|
|
31
38
|
self.base_url: str = base_url
|
|
32
39
|
|
|
@@ -43,11 +50,6 @@ class RemoteService(Service):
|
|
|
43
50
|
response = requests.get(f"{self.base_url}/api/general_info").json()
|
|
44
51
|
return from_dict(GeneralInfo, response)
|
|
45
52
|
|
|
46
|
-
def get_window_service_info(self, model_name) -> WindowServiceInfo:
|
|
47
|
-
params = {"model_name": model_name}
|
|
48
|
-
response = requests.get(f"{self.base_url}/api/window_service_info?{urllib.parse.urlencode(params)}").json()
|
|
49
|
-
return from_dict(WindowServiceInfo, response)
|
|
50
|
-
|
|
51
53
|
def expand_query(self, query: Query) -> QueryResult:
|
|
52
54
|
params = asdict(query)
|
|
53
55
|
response = requests.get(f"{self.base_url}/api/query?{urllib.parse.urlencode(params)}").json()
|
|
@@ -84,6 +86,15 @@ class RemoteService(Service):
|
|
|
84
86
|
RemoteService._check_response(response, request_json)
|
|
85
87
|
return from_dict(DecodeRequestResult, response)
|
|
86
88
|
|
|
89
|
+
def upload(self, auth: Authentication, request: FileUploadRequest) -> FileUploadResult:
|
|
90
|
+
raise NotImplementedError(self.NOT_SUPPORTED_ERROR)
|
|
91
|
+
|
|
92
|
+
def check_nudity(self, auth: Authentication, request: NudityCheckRequest) -> NudityCheckResult:
|
|
93
|
+
raise NotImplementedError(self.NOT_SUPPORTED_ERROR)
|
|
94
|
+
|
|
95
|
+
def compute_clip_score(self, auth: Authentication, request: CLIPScoreRequest) -> CLIPScoreResult:
|
|
96
|
+
raise NotImplementedError(self.NOT_SUPPORTED_ERROR)
|
|
97
|
+
|
|
87
98
|
def get_toxicity_scores(self, auth: Authentication, request: PerspectiveAPIRequest) -> PerspectiveAPIRequestResult:
|
|
88
99
|
request_json: str = json.dumps(asdict(request))
|
|
89
100
|
params = {
|
|
@@ -94,6 +105,16 @@ class RemoteService(Service):
|
|
|
94
105
|
RemoteService._check_response(response, request_json)
|
|
95
106
|
return from_dict(PerspectiveAPIRequestResult, response)
|
|
96
107
|
|
|
108
|
+
def get_moderation_results(self, auth: Authentication, request: ModerationAPIRequest) -> ModerationAPIRequestResult:
|
|
109
|
+
request_json: str = json.dumps(asdict(request))
|
|
110
|
+
params = {
|
|
111
|
+
"auth": json.dumps(asdict(auth)),
|
|
112
|
+
"request": request_json,
|
|
113
|
+
}
|
|
114
|
+
response = requests.get(f"{self.base_url}/api/moderation?{urllib.parse.urlencode(params)}").json()
|
|
115
|
+
RemoteService._check_response(response, request_json)
|
|
116
|
+
return from_dict(ModerationAPIRequestResult, response)
|
|
117
|
+
|
|
97
118
|
def make_critique_request(self, auth: Authentication, request: CritiqueRequest) -> CritiqueRequestResult:
|
|
98
119
|
raise NotImplementedError("make_critique_request is not supported by RemoteServer")
|
|
99
120
|
|
|
@@ -153,6 +174,10 @@ class RemoteService(Service):
|
|
|
153
174
|
# A ConnectionError is expected when shutting down the server.
|
|
154
175
|
pass
|
|
155
176
|
|
|
177
|
+
def get_cache_config(self, shard_name: str) -> CacheConfig:
|
|
178
|
+
"""Returns a CacheConfig"""
|
|
179
|
+
return BlackHoleCacheBackendConfig().get_cache_config(shard_name)
|
|
180
|
+
|
|
156
181
|
|
|
157
182
|
def add_service_args(parser: argparse.ArgumentParser):
|
|
158
183
|
"""Add command-line arguments to enable command-line utilities to specify how to connect to a remote server."""
|
|
@@ -1,15 +1,18 @@
|
|
|
1
|
-
import dataclasses
|
|
2
1
|
import os
|
|
3
2
|
import signal
|
|
4
|
-
from typing import List
|
|
5
|
-
from helm.common.cache_utils import build_cache_config
|
|
3
|
+
from typing import List
|
|
6
4
|
|
|
5
|
+
from helm.common.local_context import LocalContext
|
|
6
|
+
from helm.common.cache import CacheConfig
|
|
7
|
+
from helm.common.cache_backend_config import CacheBackendConfig, BlackHoleCacheBackendConfig
|
|
7
8
|
from helm.common.critique_request import CritiqueRequest, CritiqueRequestResult
|
|
8
9
|
from helm.common.authentication import Authentication
|
|
9
|
-
from helm.common.
|
|
10
|
+
from helm.common.moderations_api_request import ModerationAPIRequest, ModerationAPIRequestResult
|
|
11
|
+
from helm.common.clip_score_request import CLIPScoreRequest, CLIPScoreResult
|
|
12
|
+
from helm.common.nudity_check_request import NudityCheckRequest, NudityCheckResult
|
|
13
|
+
from helm.common.file_upload_request import FileUploadRequest, FileUploadResult
|
|
10
14
|
from helm.common.perspective_api_request import PerspectiveAPIRequest, PerspectiveAPIRequestResult
|
|
11
15
|
from helm.common.tokenization_request import (
|
|
12
|
-
WindowServiceInfo,
|
|
13
16
|
TokenizationRequest,
|
|
14
17
|
TokenizationRequestResult,
|
|
15
18
|
DecodeRequest,
|
|
@@ -18,24 +21,13 @@ from helm.common.tokenization_request import (
|
|
|
18
21
|
from helm.common.request import Request, RequestResult
|
|
19
22
|
from helm.common.hierarchical_logger import hlog
|
|
20
23
|
from helm.proxy.accounts import Accounts, Account
|
|
21
|
-
from helm.proxy.clients.auto_client import AutoClient
|
|
22
|
-
from helm.proxy.clients.toxicity_classifier_client import ToxicityClassifierClient
|
|
23
|
-
from helm.proxy.example_queries import example_queries
|
|
24
|
-
from helm.benchmark.model_metadata_registry import ALL_MODELS_METADATA
|
|
25
24
|
from helm.benchmark.model_deployment_registry import get_model_deployment_host_organization
|
|
26
25
|
from helm.proxy.query import Query, QueryResult
|
|
27
|
-
from helm.proxy.retry import retry_request
|
|
28
26
|
from helm.proxy.token_counters.auto_token_counter import AutoTokenCounter
|
|
29
|
-
from helm.proxy.
|
|
30
|
-
from helm.proxy.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer
|
|
31
|
-
from .service import (
|
|
27
|
+
from helm.proxy.services.service import (
|
|
32
28
|
Service,
|
|
33
|
-
CACHE_DIR,
|
|
34
29
|
ACCOUNTS_FILE,
|
|
35
30
|
GeneralInfo,
|
|
36
|
-
VERSION,
|
|
37
|
-
expand_environments,
|
|
38
|
-
synthesize_request,
|
|
39
31
|
)
|
|
40
32
|
|
|
41
33
|
|
|
@@ -44,52 +36,46 @@ class ServerService(Service):
|
|
|
44
36
|
Main class that supports various functionality for the server.
|
|
45
37
|
"""
|
|
46
38
|
|
|
47
|
-
def __init__(
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
39
|
+
def __init__(
|
|
40
|
+
self,
|
|
41
|
+
base_path: str = "prod_env",
|
|
42
|
+
root_mode: bool = False,
|
|
43
|
+
cache_backend_config: CacheBackendConfig = BlackHoleCacheBackendConfig(),
|
|
44
|
+
):
|
|
51
45
|
accounts_path = os.path.join(base_path, ACCOUNTS_FILE)
|
|
52
46
|
|
|
53
|
-
self.
|
|
54
|
-
self.
|
|
55
|
-
cache_config = build_cache_config(cache_path, mongo_uri, "huggingface")
|
|
56
|
-
self.token_counter = AutoTokenCounter(HuggingFaceTokenizer(cache_config=cache_config))
|
|
47
|
+
self.context = LocalContext(base_path, cache_backend_config)
|
|
48
|
+
self.token_counter = AutoTokenCounter(self.context.tokenizer)
|
|
57
49
|
self.accounts = Accounts(accounts_path, root_mode=root_mode)
|
|
58
|
-
# Lazily instantiated by get_toxicity_scores()
|
|
59
|
-
self.toxicity_classifier_client: Optional[ToxicityClassifierClient] = None
|
|
60
50
|
|
|
61
51
|
def get_general_info(self) -> GeneralInfo:
|
|
62
|
-
|
|
63
|
-
# TODO(#2158): Either fix this or delete get_general_info.
|
|
64
|
-
all_models = [dataclasses.replace(model_metadata, release_date=None) for model_metadata in ALL_MODELS_METADATA]
|
|
65
|
-
return GeneralInfo(version=VERSION, example_queries=example_queries, all_models=all_models)
|
|
66
|
-
|
|
67
|
-
def get_window_service_info(self, model_name) -> WindowServiceInfo:
|
|
68
|
-
# The import statement is placed here to avoid two problems, please refer to the link for details
|
|
69
|
-
# https://github.com/stanford-crfm/helm/pull/1430#discussion_r1156686624
|
|
70
|
-
from helm.benchmark.window_services.tokenizer_service import TokenizerService
|
|
71
|
-
from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
|
|
72
|
-
|
|
73
|
-
token_service = TokenizerService(self, Authentication(""))
|
|
74
|
-
window_service = WindowServiceFactory.get_window_service(model_name, token_service)
|
|
75
|
-
return WindowServiceInfo(
|
|
76
|
-
tokenizer_name=window_service.tokenizer_name,
|
|
77
|
-
max_sequence_length=window_service.max_sequence_length,
|
|
78
|
-
max_request_length=window_service.max_request_length,
|
|
79
|
-
end_of_text_token=window_service.end_of_text_token,
|
|
80
|
-
prefix_token=window_service.prefix_token,
|
|
81
|
-
)
|
|
52
|
+
return self.context.get_general_info()
|
|
82
53
|
|
|
83
54
|
def expand_query(self, query: Query) -> QueryResult:
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
55
|
+
return self.context.expand_query(query)
|
|
56
|
+
|
|
57
|
+
def _get_model_group_for_model_deployment(self, model_deployment: str) -> str:
|
|
58
|
+
if model_deployment.startswith("openai/"):
|
|
59
|
+
if model_deployment.startswith("openai/code-"):
|
|
60
|
+
return "codex"
|
|
61
|
+
elif model_deployment.startswith("openai/dall-e-"):
|
|
62
|
+
return "dall_e"
|
|
63
|
+
elif model_deployment.startswith("openai/gpt-4"):
|
|
64
|
+
return "gpt4"
|
|
65
|
+
elif model_deployment.startswith("openai/gpt-3"):
|
|
66
|
+
return "gpt3"
|
|
67
|
+
elif (
|
|
68
|
+
model_deployment.startswith("openai/o1")
|
|
69
|
+
or model_deployment.startswith("openai/o3")
|
|
70
|
+
or model_deployment.startswith("openai/o4")
|
|
71
|
+
):
|
|
72
|
+
return "o1"
|
|
73
|
+
else:
|
|
74
|
+
return "openai"
|
|
75
|
+
elif model_deployment.startswith("ai21/"):
|
|
76
|
+
return "jurassic"
|
|
77
|
+
else:
|
|
78
|
+
return get_model_deployment_host_organization(model_deployment)
|
|
93
79
|
|
|
94
80
|
def make_request(self, auth: Authentication, request: Request) -> RequestResult:
|
|
95
81
|
"""Actually make a request to an API."""
|
|
@@ -98,44 +84,57 @@ class ServerService(Service):
|
|
|
98
84
|
# https://github.com/stanford-crfm/benchmarking/issues/56
|
|
99
85
|
|
|
100
86
|
self.accounts.authenticate(auth)
|
|
101
|
-
|
|
87
|
+
model_group: str = self._get_model_group_for_model_deployment(request.model_deployment)
|
|
102
88
|
# Make sure we can use
|
|
103
|
-
self.accounts.check_can_use(auth.api_key,
|
|
89
|
+
self.accounts.check_can_use(auth.api_key, model_group)
|
|
104
90
|
|
|
105
91
|
# Use!
|
|
106
|
-
request_result: RequestResult = self.
|
|
92
|
+
request_result: RequestResult = self.context.make_request(request)
|
|
107
93
|
|
|
108
94
|
# Only deduct if not cached
|
|
109
95
|
if not request_result.cached:
|
|
110
96
|
# Count the number of tokens used
|
|
111
97
|
count: int = self.token_counter.count_tokens(request, request_result.completions)
|
|
112
|
-
self.accounts.use(auth.api_key,
|
|
98
|
+
self.accounts.use(auth.api_key, model_group, count)
|
|
113
99
|
|
|
114
100
|
return request_result
|
|
115
101
|
|
|
116
102
|
def tokenize(self, auth: Authentication, request: TokenizationRequest) -> TokenizationRequestResult:
|
|
117
103
|
"""Tokenize via an API."""
|
|
118
104
|
self.accounts.authenticate(auth)
|
|
119
|
-
return self.
|
|
105
|
+
return self.context.tokenize(request)
|
|
120
106
|
|
|
121
107
|
def decode(self, auth: Authentication, request: DecodeRequest) -> DecodeRequestResult:
|
|
122
108
|
"""Decodes to text."""
|
|
123
109
|
self.accounts.authenticate(auth)
|
|
124
|
-
return self.
|
|
110
|
+
return self.context.decode(request)
|
|
111
|
+
|
|
112
|
+
def upload(self, auth: Authentication, request: FileUploadRequest) -> FileUploadResult:
|
|
113
|
+
"""Uploads a file to external storage."""
|
|
114
|
+
self.accounts.authenticate(auth)
|
|
115
|
+
return self.context.upload(request)
|
|
116
|
+
|
|
117
|
+
def check_nudity(self, auth: Authentication, request: NudityCheckRequest) -> NudityCheckResult:
|
|
118
|
+
"""Check for nudity."""
|
|
119
|
+
self.accounts.authenticate(auth)
|
|
120
|
+
return self.context.check_nudity(request)
|
|
121
|
+
|
|
122
|
+
def compute_clip_score(self, auth: Authentication, request: CLIPScoreRequest) -> CLIPScoreResult:
|
|
123
|
+
"""Computes CLIPScore for a given caption and image."""
|
|
124
|
+
self.accounts.authenticate(auth)
|
|
125
|
+
return self.context.compute_clip_score(request)
|
|
125
126
|
|
|
126
127
|
def get_toxicity_scores(self, auth: Authentication, request: PerspectiveAPIRequest) -> PerspectiveAPIRequestResult:
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
if not self.toxicity_classifier_client:
|
|
130
|
-
self.toxicity_classifier_client = self.client.get_toxicity_classifier_client()
|
|
131
|
-
return self.toxicity_classifier_client.get_toxicity_scores(request)
|
|
128
|
+
self.accounts.authenticate(auth)
|
|
129
|
+
return self.context.get_toxicity_scores(request)
|
|
132
130
|
|
|
131
|
+
def get_moderation_results(self, auth: Authentication, request: ModerationAPIRequest) -> ModerationAPIRequestResult:
|
|
133
132
|
self.accounts.authenticate(auth)
|
|
134
|
-
return
|
|
133
|
+
return self.context.get_moderation_results(request)
|
|
135
134
|
|
|
136
135
|
def make_critique_request(self, auth: Authentication, request: CritiqueRequest) -> CritiqueRequestResult:
|
|
137
136
|
self.accounts.authenticate(auth)
|
|
138
|
-
return self.
|
|
137
|
+
return self.context.make_critique_request(request)
|
|
139
138
|
|
|
140
139
|
def create_account(self, auth: Authentication) -> Account:
|
|
141
140
|
"""Creates a new account."""
|
|
@@ -168,3 +167,6 @@ class ServerService(Service):
|
|
|
168
167
|
hlog(f"Shutting down server by killing its own process {pid}...")
|
|
169
168
|
os.kill(pid, signal.SIGTERM)
|
|
170
169
|
hlog("Done.")
|
|
170
|
+
|
|
171
|
+
def get_cache_config(self, shard_name: str) -> CacheConfig:
|
|
172
|
+
return self.context.get_cache_config(shard_name)
|
helm/proxy/services/service.py
CHANGED
|
@@ -5,9 +5,12 @@ from typing import Dict, List, Tuple, Any
|
|
|
5
5
|
|
|
6
6
|
from helm.common.general import parse_hocon
|
|
7
7
|
from helm.common.critique_request import CritiqueRequest, CritiqueRequestResult
|
|
8
|
+
from helm.common.clip_score_request import CLIPScoreRequest, CLIPScoreResult
|
|
9
|
+
from helm.common.file_upload_request import FileUploadResult, FileUploadRequest
|
|
10
|
+
from helm.common.nudity_check_request import NudityCheckRequest, NudityCheckResult
|
|
8
11
|
from helm.common.perspective_api_request import PerspectiveAPIRequestResult, PerspectiveAPIRequest
|
|
12
|
+
from helm.common.moderations_api_request import ModerationAPIRequest, ModerationAPIRequestResult
|
|
9
13
|
from helm.common.tokenization_request import (
|
|
10
|
-
WindowServiceInfo,
|
|
11
14
|
TokenizationRequest,
|
|
12
15
|
TokenizationRequestResult,
|
|
13
16
|
DecodeRequest,
|
|
@@ -17,6 +20,7 @@ from helm.common.request import Request, RequestResult
|
|
|
17
20
|
from helm.benchmark.model_metadata_registry import ModelMetadata
|
|
18
21
|
from helm.proxy.query import Query, QueryResult
|
|
19
22
|
from helm.proxy.accounts import Authentication, Account
|
|
23
|
+
from helm.common.cache import CacheConfig
|
|
20
24
|
|
|
21
25
|
VERSION = "1.0"
|
|
22
26
|
ACCOUNTS_FILE = "accounts.sqlite"
|
|
@@ -80,11 +84,6 @@ class Service(ABC):
|
|
|
80
84
|
"""Get general info."""
|
|
81
85
|
pass
|
|
82
86
|
|
|
83
|
-
@abstractmethod
|
|
84
|
-
def get_window_service_info(self, model_name: str) -> WindowServiceInfo:
|
|
85
|
-
"""Get window service info."""
|
|
86
|
-
pass
|
|
87
|
-
|
|
88
87
|
@abstractmethod
|
|
89
88
|
def expand_query(self, query: Query) -> QueryResult:
|
|
90
89
|
"""Turn the `query` into requests."""
|
|
@@ -105,11 +104,31 @@ class Service(ABC):
|
|
|
105
104
|
"""Decodes to text."""
|
|
106
105
|
pass
|
|
107
106
|
|
|
107
|
+
@abstractmethod
|
|
108
|
+
def upload(self, auth: Authentication, request: FileUploadRequest) -> FileUploadResult:
|
|
109
|
+
"""Uploads a file to external storage."""
|
|
110
|
+
pass
|
|
111
|
+
|
|
112
|
+
@abstractmethod
|
|
113
|
+
def check_nudity(self, auth: Authentication, request: NudityCheckRequest) -> NudityCheckResult:
|
|
114
|
+
"""Check for nudity for a batch of images."""
|
|
115
|
+
pass
|
|
116
|
+
|
|
117
|
+
@abstractmethod
|
|
118
|
+
def compute_clip_score(self, auth: Authentication, request: CLIPScoreRequest) -> CLIPScoreResult:
|
|
119
|
+
"""Computes CLIPScore for a given caption and image."""
|
|
120
|
+
pass
|
|
121
|
+
|
|
108
122
|
@abstractmethod
|
|
109
123
|
def get_toxicity_scores(self, auth: Authentication, request: PerspectiveAPIRequest) -> PerspectiveAPIRequestResult:
|
|
110
124
|
"""Get toxicity scores for a batch of text."""
|
|
111
125
|
pass
|
|
112
126
|
|
|
127
|
+
@abstractmethod
|
|
128
|
+
def get_moderation_results(self, auth: Authentication, request: ModerationAPIRequest) -> ModerationAPIRequestResult:
|
|
129
|
+
"""Get OpenAI's moderation results for some text."""
|
|
130
|
+
pass
|
|
131
|
+
|
|
113
132
|
@abstractmethod
|
|
114
133
|
def make_critique_request(self, auth: Authentication, request: CritiqueRequest) -> CritiqueRequestResult:
|
|
115
134
|
"""Get responses to a critique request."""
|
|
@@ -149,3 +168,8 @@ class Service(ABC):
|
|
|
149
168
|
def shutdown(self, auth: Authentication):
|
|
150
169
|
"""Shutdown server."""
|
|
151
170
|
pass
|
|
171
|
+
|
|
172
|
+
@abstractmethod
|
|
173
|
+
def get_cache_config(self, shard_name: str) -> CacheConfig:
|
|
174
|
+
"""Returns a CacheConfig"""
|
|
175
|
+
pass
|
|
@@ -17,9 +17,9 @@ from sqlitedict import SqliteDict
|
|
|
17
17
|
from helm.common.authentication import Authentication
|
|
18
18
|
from helm.common.request import Request, RequestResult
|
|
19
19
|
from helm.common.tokenization_request import TokenizationRequest, TokenizationRequestResult
|
|
20
|
-
from helm.proxy.accounts import Account
|
|
21
|
-
from .remote_service import RemoteService
|
|
22
|
-
from .service import ACCOUNTS_FILE
|
|
20
|
+
from helm.proxy.accounts import Account, set_default_quotas
|
|
21
|
+
from helm.proxy.services.remote_service import RemoteService
|
|
22
|
+
from helm.proxy.services.service import ACCOUNTS_FILE
|
|
23
23
|
|
|
24
24
|
|
|
25
25
|
@dataclass(frozen=True)
|
|
@@ -55,6 +55,7 @@ class TestRemoteServerService:
|
|
|
55
55
|
|
|
56
56
|
with SqliteDict(os.path.join(path, ACCOUNTS_FILE)) as cache:
|
|
57
57
|
account: Account = Account(TestRemoteServerService._ADMIN_API_KEY, is_admin=True)
|
|
58
|
+
set_default_quotas(account)
|
|
58
59
|
cache[TestRemoteServerService._ADMIN_API_KEY] = asdict(account)
|
|
59
60
|
cache.commit()
|
|
60
61
|
return path
|
|
@@ -126,9 +127,9 @@ class TestRemoteServerService:
|
|
|
126
127
|
assert response.success
|
|
127
128
|
|
|
128
129
|
def test_tokenize(self):
|
|
129
|
-
request = TokenizationRequest(text="1 2 3", tokenizer="simple/
|
|
130
|
+
request = TokenizationRequest(text="1 2 3", tokenizer="simple/tokenizer1")
|
|
130
131
|
response: TokenizationRequestResult = self.service.tokenize(self.auth, request)
|
|
131
|
-
assert [token.value for token in response.tokens] == ["1", "2", "3"]
|
|
132
|
+
assert [token.value for token in response.tokens] == ["1", " ", "2", " ", "3"]
|
|
132
133
|
|
|
133
134
|
def test_make_request_plus_sign(self):
|
|
134
135
|
# Ensure + in prompt doesn't get replaced by a blank space
|
|
@@ -8,7 +8,7 @@ from helm.common.authentication import Authentication
|
|
|
8
8
|
from helm.common.request import Request
|
|
9
9
|
from helm.proxy.accounts import AuthenticationError, Accounts
|
|
10
10
|
from helm.proxy.query import Query
|
|
11
|
-
from .server_service import ServerService
|
|
11
|
+
from helm.proxy.services.server_service import ServerService
|
|
12
12
|
|
|
13
13
|
|
|
14
14
|
def get_authentication():
|
|
@@ -197,18 +197,6 @@ def helper_prod_test_service(request: Request, expected_text: str):
|
|
|
197
197
|
# Consistency of log probs
|
|
198
198
|
assert completion.logprob == sum(token.logprob for token in completion.tokens)
|
|
199
199
|
|
|
200
|
-
for token in completion.tokens[1:]:
|
|
201
|
-
assert len(token.top_logprobs) == request.top_k_per_token
|
|
202
|
-
|
|
203
|
-
# If generated token was one of the top, make sure has the right probability
|
|
204
|
-
if token.text in token.top_logprobs:
|
|
205
|
-
assert token.logprob == token.top_logprobs[token.text]
|
|
206
|
-
|
|
207
|
-
# If temperature = 0, then make sure we're getting the top probability token
|
|
208
|
-
if request.temperature == 0:
|
|
209
|
-
assert token.text in token.top_logprobs
|
|
210
|
-
assert token.logprob == max(token.top_logprobs.values())
|
|
211
|
-
|
|
212
200
|
# Make sure we get the expected_text in one of the completions
|
|
213
201
|
assert any(completion.text == expected_text for completion in result.completions)
|
|
214
202
|
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
<!DOCTYPE html>
|
|
2
|
+
<html lang="en">
|
|
3
|
+
<head>
|
|
4
|
+
<title>CRFM Models API</title>
|
|
5
|
+
<meta charset="utf-8">
|
|
6
|
+
<meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
|
|
7
|
+
<link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/4.0.0-beta/css/bootstrap.min.css" integrity="sha384-/Y6pD6FV/Vv2HJnA6t+vslU6fwYXjCFtcEpHbNJ0lyAFsXTsjBbfaDjzALeQsN6M" crossorigin="anonymous">
|
|
8
|
+
<link rel="stylesheet" type="text/css" href="index.css">
|
|
9
|
+
</head>
|
|
10
|
+
|
|
11
|
+
<body>
|
|
12
|
+
<div class="container-fluid">
|
|
13
|
+
<nav class="navbar navbar-expand-sm navbar-light bg-faded">
|
|
14
|
+
<button class="navbar-toggler" type="button" data-toggle="collapse" data-target="#nav-content" aria-controls="nav-content" aria-expanded="false" aria-label="Toggle navigation">
|
|
15
|
+
<span class="navbar-toggler-icon"></span>
|
|
16
|
+
</button>
|
|
17
|
+
|
|
18
|
+
<div class="collapse navbar-collapse" id="nav-content">
|
|
19
|
+
<ul class="navbar-nav">
|
|
20
|
+
<li class="nav-item"><a class="nav-link" href="index.html">Query</a></li>
|
|
21
|
+
<li class="nav-item"><a class="nav-link active" href="help.html">Help</a></li>
|
|
22
|
+
<li class="nav-item" id="loginInfo"></li>
|
|
23
|
+
</ul>
|
|
24
|
+
</div>
|
|
25
|
+
</nav>
|
|
26
|
+
|
|
27
|
+
<div class="row">
|
|
28
|
+
<div class="col-sm-12">
|
|
29
|
+
<p>
|
|
30
|
+
This web interface and API allows you to experiment with multiple large language models using a unified interface.
|
|
31
|
+
To log in, you need an API key.
|
|
32
|
+
</p>
|
|
33
|
+
<h2>Query</h2>
|
|
34
|
+
<ul>
|
|
35
|
+
<li>In the web interface, you can enter a <b>query</b>, which consists of the following components:
|
|
36
|
+
<ul>
|
|
37
|
+
<li><b>prompt</b>, which is what text we want to feed into the language model. The prompt can have variables (e.g., <tt>${name}</tt>) which are filled in later.</li>
|
|
38
|
+
<li><b>settings</b>, which configures how we're going to call the backend API (HOCON format):
|
|
39
|
+
<ul>
|
|
40
|
+
<li><tt>model</tt>: which model to query; options are:
|
|
41
|
+
<div id="help-models" />
|
|
42
|
+
</li>
|
|
43
|
+
<li><tt>temperature</tt>: a non-negative number determining amount of stochasticity (e.g., <tt>1</tt> is sampling from the model, <tt>0</tt> is returning the maximum probability output)</li>
|
|
44
|
+
<li><tt>num_completions</tt>: number of completions (sequences, independent sampled) to return</li>
|
|
45
|
+
<li><tt>top_k_per_token</tt>: number of candidates per token position in each completion</li>
|
|
46
|
+
<li><tt>max_tokens</tt>: maximum number of tokens before generation stops</li>
|
|
47
|
+
<li><tt>stop_sequences</tt>: list of strings that will stop generation (e.g., <tt>'.'</tt> or <tt>'\n'</tt>)</li>
|
|
48
|
+
<li><tt>echo_prompt</tt>: Whether to include the prompt as a prefix of the completion</li>
|
|
49
|
+
<li><tt>top_p</tt>: An alternative to sampling with temperature, called nucleus sampling, where the model considers the results of the tokens with top_p probability mass. So 0.1 means only the tokens comprising the top 10% probability mass are considered.</li>
|
|
50
|
+
<li><tt>presence_penalty</tt>: Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics. (OpenAI only)</li>
|
|
51
|
+
<li><tt>frequency_penalty</tt>: Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim. (OpenAI only)</li>
|
|
52
|
+
</ul>
|
|
53
|
+
Settings can also have variables in them (e.g., <tt>temperature</tt>).
|
|
54
|
+
</li>
|
|
55
|
+
<li><b>environments</b>, which specifies for each variable, a list of values (HOCON format).
|
|
56
|
+
</ul>
|
|
57
|
+
</li>
|
|
58
|
+
<li>
|
|
59
|
+
When the query is submitted,
|
|
60
|
+
we consider all possible assignments of values to variables.
|
|
61
|
+
For example:
|
|
62
|
+
<ul>
|
|
63
|
+
<li>environments has <tt>name: [Boston, New York]</tt> and <tt>temperature: [0, 1]</tt></li>
|
|
64
|
+
<li>prompt is <tt>${name} is a</tt></li>
|
|
65
|
+
<li>settings is <tt>temperature: ${temperature}</tt></li>
|
|
66
|
+
</ul>
|
|
67
|
+
This gives rise to 4 <b>requests</b>:
|
|
68
|
+
<ul>
|
|
69
|
+
<li>prompt: <tt>Boston is a</tt>, temperature: <tt>0</tt></li>
|
|
70
|
+
<li>prompt: <tt>Boston is a</tt>, temperature: <tt>1</tt></li>
|
|
71
|
+
<li>prompt: <tt>New York is a</tt>, temperature: <tt>0</tt></li>
|
|
72
|
+
<li>prompt: <tt>New York is a</tt>, temperature: <tt>1</tt></li>
|
|
73
|
+
</ul>
|
|
74
|
+
</li>
|
|
75
|
+
</ul>
|
|
76
|
+
|
|
77
|
+
<h2><a name="quotas">Quotas</a></h2>
|
|
78
|
+
<p>
|
|
79
|
+
For each model group (e.g., <tt>gpt3</tt>) and time granularity
|
|
80
|
+
(e.g., <tt>daily</tt>, <tt>monthly</tt>, <tt>total</tt>), you are
|
|
81
|
+
given a quota of a certain number of tokens.
|
|
82
|
+
Once you go over that number, you won't be able to use the API.
|
|
83
|
+
However, note that requests that have already been made (by you or
|
|
84
|
+
anyone) that are cached are not counted towards your quota.
|
|
85
|
+
For example, if your daily quota for <tt>gpt3</tt> is <tt>10000</tt>,
|
|
86
|
+
that means each day, you will get 10000 tokens.
|
|
87
|
+
</p>
|
|
88
|
+
</div>
|
|
89
|
+
</div>
|
|
90
|
+
</div>
|
|
91
|
+
|
|
92
|
+
<script src="https://code.jquery.com/jquery-3.2.1.min.js"></script>
|
|
93
|
+
<script src="https://cdnjs.cloudflare.com/ajax/libs/popper.js/1.11.0/umd/popper.min.js" integrity="sha384-b/U6ypiBEHpOf/4+1nzFpr53nxSS+GLCkfwBdFNTxtclqqenISfwAzpKaMNFNmj4" crossorigin="anonymous"></script>
|
|
94
|
+
<script src="https://maxcdn.bootstrapcdn.com/bootstrap/4.0.0-beta/js/bootstrap.min.js" integrity="sha384-h0AbiXch4ZDo7tp9hKZ4TsHbi047NrKGLO3SEJAg45jXxnGIfYzk4Si90RDIqNm1" crossorigin="anonymous"></script>
|
|
95
|
+
<script src="https://cdnjs.cloudflare.com/ajax/libs/jquery.tablesorter/2.29.0/js/jquery.tablesorter.min.js"></script>
|
|
96
|
+
<script src="general.js"></script>
|
|
97
|
+
<script src="index.js"></script>
|
|
98
|
+
</body>
|
|
99
|
+
</html>
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
.warning {
|
|
2
|
+
color: red;
|
|
3
|
+
}
|
|
4
|
+
|
|
5
|
+
.active {
|
|
6
|
+
background-color: #f0f0f0;
|
|
7
|
+
}
|
|
8
|
+
|
|
9
|
+
.examples-block {
|
|
10
|
+
margin-left: 120px;
|
|
11
|
+
margin-top: 10px;
|
|
12
|
+
margin-bottom: 10px;
|
|
13
|
+
}
|
|
14
|
+
|
|
15
|
+
.request {
|
|
16
|
+
background-color: #f0f0f0;
|
|
17
|
+
margin-top: 20px;
|
|
18
|
+
margin-bottom: 20px;
|
|
19
|
+
}
|
|
20
|
+
|
|
21
|
+
.completion {
|
|
22
|
+
border: solid 1px;
|
|
23
|
+
border-color: #c0c0c0;
|
|
24
|
+
padding: 3px;
|
|
25
|
+
margin-top: 5px;
|
|
26
|
+
margin-bottom: 5px;
|
|
27
|
+
}
|
|
28
|
+
|
|
29
|
+
.completion:hover {
|
|
30
|
+
background-color: #e8e8e8;
|
|
31
|
+
}
|
|
32
|
+
|
|
33
|
+
.metadata {
|
|
34
|
+
float: right;
|
|
35
|
+
font-style: italic;
|
|
36
|
+
}
|
|
37
|
+
|
|
38
|
+
.thinking {
|
|
39
|
+
font-style: italic;
|
|
40
|
+
}
|
|
41
|
+
|
|
42
|
+
.token:hover {
|
|
43
|
+
background-color: lightgreen;
|
|
44
|
+
}
|
|
45
|
+
|
|
46
|
+
.block {
|
|
47
|
+
margin: 15px;
|
|
48
|
+
padding: 15px;
|
|
49
|
+
border: solid 1px;
|
|
50
|
+
border-color: lightgray;
|
|
51
|
+
}
|
|
52
|
+
|
|
53
|
+
td {
|
|
54
|
+
text-align: left;
|
|
55
|
+
vertical-align: top;
|
|
56
|
+
}
|
|
57
|
+
|
|
58
|
+
.help-icon {
|
|
59
|
+
margin-left: 3px;
|
|
60
|
+
margin-right: 3px;
|
|
61
|
+
}
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
<!DOCTYPE html>
|
|
2
|
+
<html lang="en">
|
|
3
|
+
<head>
|
|
4
|
+
<title>CRFM Models API</title>
|
|
5
|
+
<meta charset="utf-8">
|
|
6
|
+
<meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
|
|
7
|
+
<link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/4.0.0-beta/css/bootstrap.min.css" integrity="sha384-/Y6pD6FV/Vv2HJnA6t+vslU6fwYXjCFtcEpHbNJ0lyAFsXTsjBbfaDjzALeQsN6M" crossorigin="anonymous">
|
|
8
|
+
<link rel="stylesheet" type="text/css" href="index.css">
|
|
9
|
+
</head>
|
|
10
|
+
|
|
11
|
+
<body>
|
|
12
|
+
<div class="container-fluid">
|
|
13
|
+
<nav class="navbar navbar-expand-sm navbar-light bg-faded">
|
|
14
|
+
<button class="navbar-toggler" type="button" data-toggle="collapse" data-target="#nav-content" aria-controls="nav-content" aria-expanded="false" aria-label="Toggle navigation">
|
|
15
|
+
<span class="navbar-toggler-icon"></span>
|
|
16
|
+
</button>
|
|
17
|
+
|
|
18
|
+
<div class="collapse navbar-collapse" id="nav-content">
|
|
19
|
+
<ul class="navbar-nav">
|
|
20
|
+
<li class="nav-item"><a class="nav-link active" href="index.html">Query</a></li>
|
|
21
|
+
<li class="nav-item"><a class="nav-link" href="help.html">Help</a></li>
|
|
22
|
+
<li class="nav-item" id="loginInfo"></li>
|
|
23
|
+
</ul>
|
|
24
|
+
</div>
|
|
25
|
+
</nav>
|
|
26
|
+
|
|
27
|
+
<div class="row">
|
|
28
|
+
<div class="col-sm-12" id="main">
|
|
29
|
+
</div>
|
|
30
|
+
</div>
|
|
31
|
+
</div>
|
|
32
|
+
|
|
33
|
+
<script src="https://code.jquery.com/jquery-3.2.1.min.js"></script>
|
|
34
|
+
<script src="https://cdnjs.cloudflare.com/ajax/libs/popper.js/1.11.0/umd/popper.min.js" integrity="sha384-b/U6ypiBEHpOf/4+1nzFpr53nxSS+GLCkfwBdFNTxtclqqenISfwAzpKaMNFNmj4" crossorigin="anonymous"></script>
|
|
35
|
+
<script src="https://maxcdn.bootstrapcdn.com/bootstrap/4.0.0-beta/js/bootstrap.min.js" integrity="sha384-h0AbiXch4ZDo7tp9hKZ4TsHbi047NrKGLO3SEJAg45jXxnGIfYzk4Si90RDIqNm1" crossorigin="anonymous"></script>
|
|
36
|
+
<script src="https://cdnjs.cloudflare.com/ajax/libs/jquery.tablesorter/2.29.0/js/jquery.tablesorter.min.js"></script>
|
|
37
|
+
<script src="general.js"></script>
|
|
38
|
+
<script src="index.js"></script>
|
|
39
|
+
</body>
|
|
40
|
+
</html>
|