crfm-helm 0.3.0__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/METADATA +144 -36
- crfm_helm-0.5.0.dist-info/RECORD +642 -0
- {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +37 -2
- helm/benchmark/adaptation/adapters/adapter.py +4 -42
- helm/benchmark/adaptation/adapters/adapter_factory.py +24 -27
- helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/generation_adapter.py +2 -0
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +21 -4
- helm/benchmark/adaptation/adapters/language_modeling_adapter.py +12 -5
- helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +104 -0
- helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +5 -1
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/test_adapter.py +2 -1
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +59 -14
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +40 -5
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +78 -10
- helm/benchmark/adaptation/common_adapter_specs.py +376 -0
- helm/benchmark/adaptation/prompt.py +7 -1
- helm/benchmark/adaptation/request_state.py +6 -1
- helm/benchmark/adaptation/scenario_state.py +6 -2
- helm/benchmark/annotation/annotator.py +43 -0
- helm/benchmark/annotation/annotator_factory.py +61 -0
- helm/benchmark/annotation/image2structure/image_compiler_annotator.py +88 -0
- helm/benchmark/annotation/image2structure/latex_compiler_annotator.py +59 -0
- helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +84 -0
- helm/benchmark/annotation/image2structure/webpage_compiler_annotator.py +132 -0
- helm/benchmark/annotation/test_annotator_factory.py +26 -0
- helm/benchmark/annotation/test_dummy_annotator.py +44 -0
- helm/benchmark/annotation_executor.py +124 -0
- helm/benchmark/augmentations/cleva_perturbation.py +7 -14
- helm/benchmark/augmentations/contraction_expansion_perturbation.py +3 -3
- helm/benchmark/augmentations/contrast_sets_perturbation.py +0 -3
- helm/benchmark/augmentations/data_augmenter.py +0 -2
- helm/benchmark/augmentations/dialect_perturbation.py +2 -2
- helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
- helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
- helm/benchmark/augmentations/gender_perturbation.py +3 -3
- helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
- helm/benchmark/augmentations/mild_mix_perturbation.py +2 -2
- helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
- helm/benchmark/augmentations/person_name_perturbation.py +0 -7
- helm/benchmark/augmentations/perturbation.py +20 -7
- helm/benchmark/augmentations/perturbation_description.py +1 -1
- helm/benchmark/augmentations/space_perturbation.py +2 -2
- helm/benchmark/augmentations/suffix_perturbation.py +29 -0
- helm/benchmark/augmentations/synonym_perturbation.py +2 -2
- helm/benchmark/augmentations/test_perturbation.py +11 -7
- helm/benchmark/augmentations/translate_perturbation.py +30 -0
- helm/benchmark/augmentations/typos_perturbation.py +2 -2
- helm/benchmark/config_registry.py +38 -0
- helm/benchmark/executor.py +46 -16
- helm/benchmark/huggingface_registration.py +37 -7
- helm/benchmark/metrics/basic_metrics.py +172 -641
- helm/benchmark/metrics/bbq_metrics.py +3 -4
- helm/benchmark/metrics/bias_metrics.py +6 -6
- helm/benchmark/metrics/classification_metrics.py +11 -8
- helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
- helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
- helm/benchmark/metrics/code_metrics.py +4 -3
- helm/benchmark/metrics/code_metrics_helper.py +0 -2
- helm/benchmark/metrics/common_metric_specs.py +167 -0
- helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
- helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
- helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
- helm/benchmark/metrics/disinformation_metrics.py +6 -112
- helm/benchmark/metrics/dry_run_metrics.py +5 -3
- helm/benchmark/metrics/efficiency_metrics.py +206 -0
- helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
- helm/benchmark/metrics/evaluate_reference_metrics.py +376 -0
- helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
- helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
- helm/benchmark/metrics/image_generation/clip_score_metrics.py +73 -0
- helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
- helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
- helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
- helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
- helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
- helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
- helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
- helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
- helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
- helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
- helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
- helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
- helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
- helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
- helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
- helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
- helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
- helm/benchmark/metrics/image_generation/q16/test_q16.py +18 -0
- helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
- helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
- helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
- helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
- helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
- helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
- helm/benchmark/metrics/instruction_following_critique_metrics.py +3 -1
- helm/benchmark/metrics/language_modeling_metrics.py +99 -0
- helm/benchmark/metrics/machine_translation_metrics.py +5 -5
- helm/benchmark/metrics/metric.py +93 -172
- helm/benchmark/metrics/metric_name.py +0 -1
- helm/benchmark/metrics/metric_service.py +16 -0
- helm/benchmark/metrics/paraphrase_generation_metrics.py +3 -4
- helm/benchmark/metrics/ranking_metrics.py +6 -7
- helm/benchmark/metrics/reference_metric.py +148 -0
- helm/benchmark/metrics/summac/model_summac.py +0 -2
- helm/benchmark/metrics/summarization_metrics.py +8 -8
- helm/benchmark/metrics/test_classification_metrics.py +9 -6
- helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
- helm/benchmark/metrics/test_evaluate_reference_metrics.py +30 -0
- helm/benchmark/metrics/test_metric.py +2 -2
- helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +13 -3
- helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +2 -0
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +9 -2
- helm/benchmark/metrics/toxicity_metrics.py +1 -1
- helm/benchmark/metrics/toxicity_utils.py +23 -0
- helm/benchmark/metrics/unitxt_metrics.py +81 -0
- helm/benchmark/metrics/vision_language/__init__.py +0 -0
- helm/benchmark/metrics/vision_language/emd_utils.py +341 -0
- helm/benchmark/metrics/vision_language/image_metrics.py +450 -0
- helm/benchmark/metrics/vision_language/image_utils.py +100 -0
- helm/benchmark/model_deployment_registry.py +164 -41
- helm/benchmark/model_metadata_registry.py +181 -35
- helm/benchmark/multi_gpu_runner.py +133 -0
- helm/benchmark/presentation/contamination.py +3 -3
- helm/benchmark/presentation/create_plots.py +8 -7
- helm/benchmark/presentation/run_display.py +50 -17
- helm/benchmark/presentation/schema.py +28 -46
- helm/benchmark/presentation/summarize.py +213 -96
- helm/benchmark/presentation/table.py +8 -8
- helm/benchmark/presentation/test_contamination.py +2 -2
- helm/benchmark/presentation/test_run_entry.py +14 -9
- helm/benchmark/presentation/test_summarize.py +5 -0
- helm/benchmark/run.py +66 -54
- helm/benchmark/run_expander.py +342 -31
- helm/benchmark/run_spec.py +93 -0
- helm/benchmark/run_spec_factory.py +162 -0
- helm/benchmark/run_specs/__init__.py +0 -0
- helm/benchmark/{run_specs.py → run_specs/classic_run_specs.py} +217 -1330
- helm/benchmark/run_specs/cleva_run_specs.py +277 -0
- helm/benchmark/run_specs/decodingtrust_run_specs.py +314 -0
- helm/benchmark/run_specs/heim_run_specs.py +623 -0
- helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
- helm/benchmark/run_specs/lite_run_specs.py +307 -0
- helm/benchmark/run_specs/simple_run_specs.py +104 -0
- helm/benchmark/run_specs/unitxt_run_specs.py +42 -0
- helm/benchmark/run_specs/vlm_run_specs.py +501 -0
- helm/benchmark/runner.py +116 -69
- helm/benchmark/runner_config_registry.py +21 -0
- helm/benchmark/scenarios/bbq_scenario.py +1 -1
- helm/benchmark/scenarios/bold_scenario.py +2 -2
- helm/benchmark/scenarios/cleva_scenario.py +43 -46
- helm/benchmark/scenarios/code_scenario.py +3 -2
- helm/benchmark/scenarios/commonsense_scenario.py +171 -191
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +169 -0
- helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +121 -0
- helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +77 -0
- helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +324 -0
- helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +204 -0
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +559 -0
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +67 -0
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +78 -0
- helm/benchmark/scenarios/dialogue_scenarios.py +0 -1
- helm/benchmark/scenarios/entity_matching_scenario.py +1 -1
- helm/benchmark/scenarios/image_generation/__init__.py +0 -0
- helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
- helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
- helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
- helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
- helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
- helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
- helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
- helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
- helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
- helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
- helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
- helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
- helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
- helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
- helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
- helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
- helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
- helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
- helm/benchmark/scenarios/imdb_scenario.py +0 -1
- helm/benchmark/scenarios/legalbench_scenario.py +123 -0
- helm/benchmark/scenarios/live_qa_scenario.py +94 -0
- helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
- helm/benchmark/scenarios/lsat_qa_scenario.py +4 -2
- helm/benchmark/scenarios/math_scenario.py +19 -2
- helm/benchmark/scenarios/medication_qa_scenario.py +60 -0
- helm/benchmark/scenarios/numeracy_scenario.py +3 -3
- helm/benchmark/scenarios/opinions_qa_scenario.py +6 -10
- helm/benchmark/scenarios/raft_scenario.py +2 -6
- helm/benchmark/scenarios/scenario.py +14 -2
- helm/benchmark/scenarios/simple_scenarios.py +122 -1
- helm/benchmark/scenarios/test_math_scenario.py +22 -0
- helm/benchmark/scenarios/test_scenario.py +6 -3
- helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +135 -0
- helm/benchmark/scenarios/the_pile_scenario.py +6 -7
- helm/benchmark/scenarios/unitxt_scenario.py +56 -0
- helm/benchmark/scenarios/verifiability_judgment_scenario.py +3 -1
- helm/benchmark/scenarios/vicuna_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
- helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +92 -0
- helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
- helm/benchmark/scenarios/vision_language/image2structure/__init__.py +0 -0
- helm/benchmark/scenarios/vision_language/image2structure/chart2csv_scenario.py +55 -0
- helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +214 -0
- helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +25 -0
- helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +20 -0
- helm/benchmark/scenarios/vision_language/image2structure/utils_latex.py +347 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/driver.py +84 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/jekyll_server.py +182 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/utils.py +31 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +225 -0
- helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
- helm/benchmark/scenarios/vision_language/mme_scenario.py +145 -0
- helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
- helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
- helm/benchmark/scenarios/vision_language/pope_scenario.py +104 -0
- helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +129 -0
- helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
- helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +107 -0
- helm/benchmark/scenarios/vision_language/vqa_scenario.py +1 -1
- helm/benchmark/scenarios/wmt_14_scenario.py +18 -18
- helm/benchmark/server.py +59 -2
- helm/benchmark/slurm_jobs.py +12 -0
- helm/benchmark/slurm_runner.py +79 -51
- helm/benchmark/static/benchmarking.js +3 -4
- helm/benchmark/static/contamination.yaml +1 -1
- helm/benchmark/static/images/organizations/together.png +0 -0
- helm/benchmark/static/json-urls.js +4 -0
- helm/benchmark/static/{schema.yaml → schema_classic.yaml} +346 -930
- helm/benchmark/static/schema_instruction_following.yaml +210 -0
- helm/benchmark/static/schema_lite.yaml +824 -0
- helm/benchmark/static/schema_mmlu.yaml +1507 -0
- helm/benchmark/static/schema_unitxt.yaml +428 -0
- helm/benchmark/static/schema_vlm.yaml +576 -0
- helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
- helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
- helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
- helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
- helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
- helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
- helm/benchmark/static_build/assets/crfm-logo-74391ab8.png +0 -0
- helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
- helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
- helm/benchmark/static_build/assets/heim-logo-3e5e3aa4.png +0 -0
- helm/benchmark/static_build/assets/helm-logo-simple-2ed5400b.png +0 -0
- helm/benchmark/static_build/assets/helmhero-28e90f4d.png +0 -0
- helm/benchmark/static_build/assets/index-5088afcb.css +1 -0
- helm/benchmark/static_build/assets/index-d839df55.js +9 -0
- helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
- helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
- helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
- helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
- helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
- helm/benchmark/static_build/assets/react-d4a0b69b.js +85 -0
- helm/benchmark/static_build/assets/recharts-6d337683.js +97 -0
- helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
- helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
- helm/benchmark/static_build/assets/tremor-54a99cc4.js +10 -0
- helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
- helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
- helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
- helm/benchmark/static_build/config.js +4 -0
- helm/benchmark/static_build/index.html +20 -0
- helm/benchmark/test_data_preprocessor.py +3 -3
- helm/benchmark/test_model_deployment_definition.py +90 -0
- helm/benchmark/test_run_expander.py +1 -1
- helm/benchmark/tokenizer_config_registry.py +10 -14
- helm/benchmark/window_services/ai21_window_service.py +22 -33
- helm/benchmark/window_services/cohere_window_service.py +1 -63
- helm/benchmark/window_services/default_window_service.py +2 -35
- helm/benchmark/window_services/encoder_decoder_window_service.py +0 -11
- helm/benchmark/window_services/ice_window_service.py +0 -34
- helm/benchmark/window_services/image_generation/__init__.py +0 -0
- helm/benchmark/window_services/image_generation/clip_window_service.py +15 -0
- helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
- helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
- helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
- helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
- helm/benchmark/window_services/local_window_service.py +21 -4
- helm/benchmark/window_services/no_decoding_window_service.py +32 -0
- helm/benchmark/window_services/test_anthropic_window_service.py +2 -1
- helm/benchmark/window_services/test_bloom_window_service.py +2 -1
- helm/benchmark/window_services/test_cohere_window_service.py +2 -1
- helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
- helm/benchmark/window_services/test_gpt2_window_service.py +2 -2
- helm/benchmark/window_services/test_gpt4_window_service.py +2 -1
- helm/benchmark/window_services/test_gptj_window_service.py +3 -2
- helm/benchmark/window_services/test_gptneox_window_service.py +3 -2
- helm/benchmark/window_services/test_ice_window_service.py +2 -1
- helm/benchmark/window_services/test_openai_window_service.py +2 -1
- helm/benchmark/window_services/test_opt_window_service.py +3 -2
- helm/benchmark/window_services/test_palmyra_window_service.py +2 -1
- helm/benchmark/window_services/test_t0pp_window_service.py +2 -1
- helm/benchmark/window_services/test_t511b_window_service.py +2 -1
- helm/benchmark/window_services/test_ul2_window_service.py +2 -1
- helm/benchmark/window_services/test_utils.py +3 -2
- helm/benchmark/window_services/test_yalm_window_service.py +2 -1
- helm/benchmark/window_services/window_service.py +42 -0
- helm/benchmark/window_services/window_service_factory.py +24 -269
- helm/benchmark/window_services/yalm_window_service.py +0 -27
- helm/clients/__init__.py +0 -0
- helm/{proxy/clients → clients}/ai21_client.py +5 -12
- helm/clients/aleph_alpha_client.py +112 -0
- helm/{proxy/clients → clients}/anthropic_client.py +213 -24
- helm/clients/auto_client.py +215 -0
- helm/clients/bedrock_client.py +128 -0
- helm/clients/bedrock_utils.py +72 -0
- helm/{proxy/clients → clients}/client.py +67 -55
- helm/clients/clip_score_client.py +49 -0
- helm/clients/clip_scorers/__init__.py +0 -0
- helm/clients/clip_scorers/base_clip_scorer.py +18 -0
- helm/clients/clip_scorers/clip_scorer.py +50 -0
- helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
- helm/{proxy/clients → clients}/cohere_client.py +6 -17
- helm/clients/gcs_client.py +82 -0
- helm/{proxy/clients → clients}/google_client.py +7 -8
- helm/clients/google_translate_client.py +35 -0
- helm/{proxy/clients → clients}/http_model_client.py +6 -10
- helm/{proxy/clients → clients}/huggingface_client.py +134 -92
- helm/clients/image_generation/__init__.py +0 -0
- helm/clients/image_generation/adobe_vision_client.py +78 -0
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +98 -0
- helm/clients/image_generation/cogview2/__init__.py +0 -0
- helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
- helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
- helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
- helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +96 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
- helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +141 -0
- helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
- helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
- helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
- helm/clients/image_generation/cogview2_client.py +191 -0
- helm/clients/image_generation/dalle2_client.py +192 -0
- helm/clients/image_generation/dalle3_client.py +108 -0
- helm/clients/image_generation/dalle_mini/__init__.py +3 -0
- helm/clients/image_generation/dalle_mini/data.py +442 -0
- helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
- helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
- helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
- helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
- helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
- helm/clients/image_generation/dalle_mini/model/text.py +251 -0
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
- helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
- helm/clients/image_generation/dalle_mini_client.py +190 -0
- helm/clients/image_generation/deep_floyd_client.py +78 -0
- helm/clients/image_generation/huggingface_diffusers_client.py +249 -0
- helm/clients/image_generation/image_generation_client_utils.py +9 -0
- helm/clients/image_generation/lexica_client.py +86 -0
- helm/clients/image_generation/mindalle/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/__init__.py +216 -0
- helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
- helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
- helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
- helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
- helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
- helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
- helm/clients/image_generation/mindalle/utils/config.py +129 -0
- helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
- helm/clients/image_generation/mindalle/utils/utils.py +89 -0
- helm/clients/image_generation/mindalle_client.py +115 -0
- helm/clients/image_generation/nudity_check_client.py +64 -0
- helm/clients/image_generation/together_image_generation_client.py +111 -0
- helm/{proxy/clients → clients}/lit_gpt_client.py +7 -5
- helm/{proxy/clients → clients}/megatron_client.py +13 -7
- helm/clients/mistral_client.py +134 -0
- helm/clients/moderation_api_client.py +109 -0
- helm/clients/open_lm_client.py +43 -0
- helm/clients/openai_client.py +302 -0
- helm/{proxy/clients → clients}/palmyra_client.py +15 -12
- helm/{proxy/clients → clients}/perspective_api_client.py +7 -8
- helm/clients/simple_client.py +64 -0
- helm/{proxy/clients → clients}/test_auto_client.py +15 -15
- helm/clients/test_client.py +100 -0
- helm/clients/test_huggingface_client.py +70 -0
- helm/clients/test_simple_client.py +19 -0
- helm/{proxy/clients → clients}/test_together_client.py +23 -12
- helm/{proxy/clients → clients}/together_client.py +18 -71
- helm/clients/vertexai_client.py +391 -0
- helm/clients/vision_language/__init__.py +0 -0
- helm/clients/vision_language/huggingface_vlm_client.py +104 -0
- helm/{proxy/clients → clients}/vision_language/idefics_client.py +59 -52
- helm/clients/vision_language/open_flamingo/__init__.py +2 -0
- helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
- helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
- helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
- helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
- helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
- helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
- helm/clients/vision_language/open_flamingo_client.py +155 -0
- helm/clients/vision_language/qwen_vlm_client.py +171 -0
- helm/clients/vllm_client.py +46 -0
- helm/common/cache.py +24 -179
- helm/common/cache_backend_config.py +47 -0
- helm/common/clip_score_request.py +41 -0
- helm/common/concurrency.py +32 -0
- helm/common/credentials_utils.py +28 -0
- helm/common/file_caches/__init__.py +0 -0
- helm/common/file_caches/file_cache.py +16 -0
- helm/common/file_caches/local_file_cache.py +61 -0
- helm/common/file_caches/test_local_file_cache.py +25 -0
- helm/common/file_upload_request.py +27 -0
- helm/common/general.py +29 -10
- helm/common/image_generation_parameters.py +25 -0
- helm/common/images_utils.py +24 -1
- helm/common/key_value_store.py +113 -0
- helm/common/media_object.py +13 -0
- helm/common/moderations_api_request.py +71 -0
- helm/common/mongo_key_value_store.py +88 -0
- helm/common/multimodal_request_utils.py +31 -0
- helm/common/nudity_check_request.py +29 -0
- helm/common/object_spec.py +2 -2
- helm/common/request.py +36 -27
- helm/common/test_general.py +6 -0
- helm/common/tokenization_request.py +6 -3
- helm/config/__init__.py +0 -0
- helm/config/model_deployments.yaml +1942 -0
- helm/config/model_metadata.yaml +2201 -0
- helm/config/tokenizer_configs.yaml +362 -0
- helm/proxy/accounts.py +31 -4
- helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
- helm/proxy/critique/model_critique_client.py +13 -5
- helm/proxy/example_queries.py +29 -17
- helm/proxy/retry.py +8 -2
- helm/proxy/server.py +77 -5
- helm/proxy/services/remote_service.py +31 -0
- helm/proxy/services/server_service.py +103 -20
- helm/proxy/services/service.py +34 -2
- helm/proxy/services/test_remote_service.py +7 -6
- helm/proxy/services/test_service.py +27 -18
- helm/proxy/test_accounts.py +32 -0
- helm/proxy/token_counters/auto_token_counter.py +37 -37
- helm/proxy/token_counters/test_auto_token_counter.py +164 -0
- helm/proxy/token_counters/token_counter.py +3 -5
- helm/py.typed +0 -0
- helm/tokenizers/__init__.py +0 -0
- helm/{proxy/tokenizers → tokenizers}/ai21_tokenizer.py +3 -3
- helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +3 -1
- helm/{proxy/tokenizers → tokenizers}/anthropic_tokenizer.py +17 -11
- helm/tokenizers/auto_tokenizer.py +93 -0
- helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +8 -2
- helm/{proxy/tokenizers → tokenizers}/cohere_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +3 -3
- helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +56 -60
- helm/tokenizers/simple_tokenizer.py +33 -0
- helm/tokenizers/test_anthropic_tokenizer.py +82 -0
- helm/tokenizers/test_huggingface_tokenizer.py +136 -0
- helm/tokenizers/test_simple_tokenizer.py +33 -0
- helm/tokenizers/vertexai_tokenizer.py +97 -0
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +5 -3
- helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
- helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
- crfm_helm-0.3.0.dist-info/RECORD +0 -396
- helm/benchmark/vlm_run_specs.py +0 -71
- helm/benchmark/window_services/anthropic_window_service.py +0 -68
- helm/benchmark/window_services/bloom_window_service.py +0 -35
- helm/benchmark/window_services/flan_t5_window_service.py +0 -29
- helm/benchmark/window_services/gpt2_window_service.py +0 -32
- helm/benchmark/window_services/gptj_window_service.py +0 -38
- helm/benchmark/window_services/gptneox_window_service.py +0 -41
- helm/benchmark/window_services/http_model_window_service.py +0 -28
- helm/benchmark/window_services/huggingface_window_service.py +0 -59
- helm/benchmark/window_services/lit_gpt_window_service.py +0 -27
- helm/benchmark/window_services/llama_window_service.py +0 -28
- helm/benchmark/window_services/luminous_window_service.py +0 -67
- helm/benchmark/window_services/megatron_window_service.py +0 -10
- helm/benchmark/window_services/mt_nlg_window_service.py +0 -27
- helm/benchmark/window_services/openai_window_service.py +0 -13
- helm/benchmark/window_services/opt_window_service.py +0 -35
- helm/benchmark/window_services/palmyra_window_service.py +0 -45
- helm/benchmark/window_services/remote_window_service.py +0 -48
- helm/benchmark/window_services/santacoder_window_service.py +0 -27
- helm/benchmark/window_services/starcoder_window_service.py +0 -27
- helm/benchmark/window_services/t0pp_window_service.py +0 -35
- helm/benchmark/window_services/t511b_window_service.py +0 -30
- helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
- helm/benchmark/window_services/ul2_window_service.py +0 -30
- helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
- helm/benchmark/window_services/wider_openai_window_service.py +0 -52
- helm/proxy/clients/aleph_alpha_client.py +0 -99
- helm/proxy/clients/auto_client.py +0 -461
- helm/proxy/clients/goose_ai_client.py +0 -100
- helm/proxy/clients/microsoft_client.py +0 -182
- helm/proxy/clients/openai_client.py +0 -206
- helm/proxy/clients/remote_model_registry.py +0 -28
- helm/proxy/clients/simple_client.py +0 -61
- helm/proxy/clients/test_anthropic_client.py +0 -63
- helm/proxy/clients/test_client.py +0 -31
- helm/proxy/clients/test_huggingface_client.py +0 -87
- helm/proxy/models.py +0 -963
- helm/proxy/test_models.py +0 -27
- helm/proxy/token_counters/ai21_token_counter.py +0 -20
- helm/proxy/token_counters/cohere_token_counter.py +0 -13
- helm/proxy/token_counters/free_token_counter.py +0 -12
- helm/proxy/token_counters/gooseai_token_counter.py +0 -24
- helm/proxy/token_counters/openai_token_counter.py +0 -22
- helm/proxy/token_counters/test_ai21_token_counter.py +0 -86
- helm/proxy/token_counters/test_openai_token_counter.py +0 -79
- helm/proxy/tokenizers/simple_tokenizer.py +0 -32
- helm/proxy/tokenizers/test_huggingface_tokenizer.py +0 -56
- {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/LICENSE +0 -0
- {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/top_level.txt +0 -0
- /helm/{proxy/clients → benchmark/annotation}/__init__.py +0 -0
- /helm/{proxy/clients/vision_language → benchmark/annotation/image2structure}/__init__.py +0 -0
- /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
- /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
- /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
- /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
- /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
- /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/ice_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/test_ice_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +0 -0
|
@@ -10,9 +10,10 @@ from typing import List, Dict, Optional, Any, Callable, Union, Mapping, Tuple, S
|
|
|
10
10
|
import numpy as np
|
|
11
11
|
from scipy.stats import pearsonr
|
|
12
12
|
|
|
13
|
+
from helm.benchmark.config_registry import register_builtin_configs_from_helm_package
|
|
13
14
|
from helm.common.hierarchical_logger import hlog
|
|
14
15
|
from helm.common.optional_dependencies import handle_module_not_found_error
|
|
15
|
-
from helm.benchmark.
|
|
16
|
+
from helm.benchmark.model_metadata_registry import MODEL_NAME_TO_MODEL_METADATA
|
|
16
17
|
from helm.benchmark.presentation.summarize import AGGREGATE_WIN_RATE_COLUMN
|
|
17
18
|
|
|
18
19
|
try:
|
|
@@ -133,9 +134,6 @@ class Plotter:
|
|
|
133
134
|
self.plot_format = plot_format
|
|
134
135
|
self._tables_cache: Dict[str, Dict[str, Table]] = {}
|
|
135
136
|
|
|
136
|
-
schema = read_schema()
|
|
137
|
-
self.model_metadata = {model_field.display_name: model_field for model_field in schema.models}
|
|
138
|
-
|
|
139
137
|
def get_group_tables(self, group_name: str) -> Dict[str, Table]:
|
|
140
138
|
"""Reads and parses group tables. Uses _tables_cache to avoid reprocessing the same table multiple times."""
|
|
141
139
|
if group_name in self._tables_cache:
|
|
@@ -338,14 +336,14 @@ class Plotter:
|
|
|
338
336
|
|
|
339
337
|
def get_model_release_date(model_name: str) -> Optional[date]:
|
|
340
338
|
"""Maps a model name to the month of model release."""
|
|
341
|
-
release_date =
|
|
339
|
+
release_date = MODEL_NAME_TO_MODEL_METADATA[model_name].release_date
|
|
342
340
|
if release_date is None:
|
|
343
341
|
return None
|
|
344
342
|
return release_date.replace(day=1)
|
|
345
343
|
|
|
346
344
|
def get_model_size(model_name: str) -> Optional[int]:
|
|
347
345
|
"""Maps a model name to the number of parameters, rounding to the nearest leading digit."""
|
|
348
|
-
size =
|
|
346
|
+
size = MODEL_NAME_TO_MODEL_METADATA[model_name].num_parameters
|
|
349
347
|
if size is None:
|
|
350
348
|
return None
|
|
351
349
|
grain = 10 ** (len(str(size)) - 1)
|
|
@@ -401,7 +399,9 @@ class Plotter:
|
|
|
401
399
|
|
|
402
400
|
for i, access_level in enumerate(access_levels):
|
|
403
401
|
model_indices: List[int] = [
|
|
404
|
-
idx
|
|
402
|
+
idx
|
|
403
|
+
for idx, model in enumerate(table.adapters)
|
|
404
|
+
if MODEL_NAME_TO_MODEL_METADATA[model].access == access_level
|
|
405
405
|
]
|
|
406
406
|
best_model_index = model_indices[table.mean_win_rates[model_indices].argmax()]
|
|
407
407
|
|
|
@@ -611,6 +611,7 @@ def main():
|
|
|
611
611
|
parser.add_argument("--suite", type=str, help="Name of the suite that we are plotting", required=True)
|
|
612
612
|
parser.add_argument("--plot-format", help="Format for saving plots", default="png", choices=["png", "pdf"])
|
|
613
613
|
args = parser.parse_args()
|
|
614
|
+
register_builtin_configs_from_helm_package()
|
|
614
615
|
base_path = os.path.join(args.output_path, "runs", args.suite)
|
|
615
616
|
if not os.path.exists(os.path.join(base_path, "groups")):
|
|
616
617
|
hlog(f"ERROR: Could not find `groups` directory under {base_path}. Did you run `summarize.py` first?")
|
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
from collections import OrderedDict, defaultdict
|
|
2
2
|
from dataclasses import dataclass
|
|
3
3
|
import os
|
|
4
|
-
from typing import Dict, Iterable, List, Optional, Set, Tuple
|
|
4
|
+
from typing import Dict, Iterable, List, Optional, Set, Tuple, Any
|
|
5
5
|
|
|
6
|
-
from helm.benchmark.adaptation.
|
|
6
|
+
from helm.benchmark.adaptation.adapter_spec import (
|
|
7
7
|
ADAPT_MULTIPLE_CHOICE_SEPARATE_METHODS,
|
|
8
8
|
ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED,
|
|
9
9
|
)
|
|
@@ -12,11 +12,13 @@ from helm.benchmark.adaptation.request_state import RequestState
|
|
|
12
12
|
from helm.benchmark.adaptation.scenario_state import ScenarioState
|
|
13
13
|
from helm.benchmark.augmentations.perturbation_description import PerturbationDescription
|
|
14
14
|
from helm.benchmark.metrics.metric import PerInstanceStats
|
|
15
|
+
from helm.common.multimodal_request_utils import gather_generated_image_locations
|
|
15
16
|
from helm.benchmark.presentation.schema import Schema
|
|
16
|
-
from helm.benchmark.
|
|
17
|
+
from helm.benchmark.run_spec import RunSpec
|
|
17
18
|
from helm.benchmark.scenarios.scenario import Instance
|
|
18
19
|
from helm.common.general import write
|
|
19
20
|
from helm.common.hierarchical_logger import hlog, htrack
|
|
21
|
+
from helm.common.images_utils import encode_base64
|
|
20
22
|
from helm.common.request import Request
|
|
21
23
|
from helm.common.codec import from_json, to_json
|
|
22
24
|
|
|
@@ -43,6 +45,9 @@ class DisplayPrediction:
|
|
|
43
45
|
truncated_predicted_text: Optional[str]
|
|
44
46
|
"""The truncated prediction text, if truncation is required by the Adapter method."""
|
|
45
47
|
|
|
48
|
+
base64_images: Optional[List[str]]
|
|
49
|
+
"""Images in base64."""
|
|
50
|
+
|
|
46
51
|
mapped_output: Optional[str]
|
|
47
52
|
"""The mapped output, if an output mapping exists and the prediction can be mapped"""
|
|
48
53
|
|
|
@@ -52,6 +57,8 @@ class DisplayPrediction:
|
|
|
52
57
|
stats: Dict[str, float]
|
|
53
58
|
"""Statistics computed from the predicted output"""
|
|
54
59
|
|
|
60
|
+
annotations: Optional[Dict[str, Any]]
|
|
61
|
+
|
|
55
62
|
|
|
56
63
|
@dataclass(frozen=True)
|
|
57
64
|
class DisplayRequest:
|
|
@@ -73,19 +80,17 @@ class DisplayRequest:
|
|
|
73
80
|
"""The actual Request to display in the web frontend.
|
|
74
81
|
|
|
75
82
|
There can be multiple requests per trial. The displayed request should be the
|
|
76
|
-
most relevant request e.g. the request for the chosen
|
|
83
|
+
most relevant request e.g. the request for the chosen choice for multiple choice questions."""
|
|
77
84
|
|
|
78
85
|
|
|
79
|
-
def _read_scenario_state(
|
|
80
|
-
scenario_state_path: str = os.path.join(run_path, "scenario_state.json")
|
|
86
|
+
def _read_scenario_state(scenario_state_path: str) -> ScenarioState:
|
|
81
87
|
if not os.path.exists(scenario_state_path):
|
|
82
88
|
raise ValueError(f"Could not load ScenarioState from {scenario_state_path}")
|
|
83
89
|
with open(scenario_state_path) as f:
|
|
84
90
|
return from_json(f.read(), ScenarioState)
|
|
85
91
|
|
|
86
92
|
|
|
87
|
-
def _read_per_instance_stats(
|
|
88
|
-
per_instance_stats_path: str = os.path.join(run_path, "per_instance_stats.json")
|
|
93
|
+
def _read_per_instance_stats(per_instance_stats_path: str) -> List[PerInstanceStats]:
|
|
89
94
|
if not os.path.exists(per_instance_stats_path):
|
|
90
95
|
raise ValueError(f"Could not load PerInstanceStats from {per_instance_stats_path}")
|
|
91
96
|
with open(per_instance_stats_path) as f:
|
|
@@ -104,8 +109,7 @@ def _truncate_predicted_text(
|
|
|
104
109
|
tokens = request_state.result.completions[0].tokens
|
|
105
110
|
if tokens:
|
|
106
111
|
first_token = tokens[0]
|
|
107
|
-
|
|
108
|
-
prefix = first_token.text
|
|
112
|
+
prefix = first_token.text
|
|
109
113
|
if prefix:
|
|
110
114
|
predicted_text = predicted_text
|
|
111
115
|
prefix = prefix
|
|
@@ -128,7 +132,7 @@ def _get_metric_names_for_group(run_group_name: str, schema: Schema) -> Set[str]
|
|
|
128
132
|
if metric_group is None:
|
|
129
133
|
continue
|
|
130
134
|
for metric_name_matcher in metric_group.metrics:
|
|
131
|
-
if metric_name_matcher.perturbation_name:
|
|
135
|
+
if metric_name_matcher.perturbation_name and metric_name_matcher.perturbation_name != "__all__":
|
|
132
136
|
continue
|
|
133
137
|
result.add(metric_name_matcher.substitute(run_group.environment).name)
|
|
134
138
|
return result
|
|
@@ -168,16 +172,35 @@ def write_run_display_json(run_path: str, run_spec: RunSpec, schema: Schema, ski
|
|
|
168
172
|
display_predictions_file_path = os.path.join(run_path, _DISPLAY_PREDICTIONS_JSON_FILE_NAME)
|
|
169
173
|
display_requests_file_path = os.path.join(run_path, _DISPLAY_REQUESTS_JSON_FILE_NAME)
|
|
170
174
|
|
|
175
|
+
scenario_state_path = os.path.join(run_path, "scenario_state.json")
|
|
176
|
+
per_instance_stats_path = os.path.join(run_path, "per_instance_stats.json")
|
|
177
|
+
|
|
171
178
|
if (
|
|
172
179
|
skip_completed
|
|
173
180
|
and os.path.exists(instances_file_path)
|
|
174
181
|
and os.path.exists(display_predictions_file_path)
|
|
175
182
|
and os.path.exists(display_requests_file_path)
|
|
176
183
|
):
|
|
177
|
-
hlog(
|
|
184
|
+
hlog(
|
|
185
|
+
f"Skipping writing display JSON for run {run_spec.name} "
|
|
186
|
+
"because all output display JSON files already exist."
|
|
187
|
+
)
|
|
188
|
+
return
|
|
189
|
+
elif not os.path.exists(scenario_state_path):
|
|
190
|
+
hlog(
|
|
191
|
+
f"Skipping writing display JSON for run {run_spec.name} because "
|
|
192
|
+
f"the scenario state JSON file does not exist at {scenario_state_path}"
|
|
193
|
+
)
|
|
178
194
|
return
|
|
179
|
-
|
|
180
|
-
|
|
195
|
+
elif not os.path.exists(per_instance_stats_path):
|
|
196
|
+
hlog(
|
|
197
|
+
f"Skipping writing display JSON for run {run_spec.name} because "
|
|
198
|
+
f"the per instance stats JSON file does not exist at {per_instance_stats_path}"
|
|
199
|
+
)
|
|
200
|
+
return
|
|
201
|
+
|
|
202
|
+
scenario_state = _read_scenario_state(scenario_state_path)
|
|
203
|
+
per_instance_stats = _read_per_instance_stats(per_instance_stats_path)
|
|
181
204
|
|
|
182
205
|
metric_names = _get_metric_names_for_groups(run_spec.groups, schema)
|
|
183
206
|
|
|
@@ -239,9 +262,17 @@ def write_run_display_json(run_path: str, run_spec: RunSpec, schema: Schema, ski
|
|
|
239
262
|
mapped_output = (
|
|
240
263
|
request_state.output_mapping.get(predicted_text.strip()) if request_state.output_mapping else None
|
|
241
264
|
)
|
|
242
|
-
instance_id_to_instance[
|
|
243
|
-
|
|
244
|
-
|
|
265
|
+
instance_id_to_instance[(request_state.instance.id, request_state.instance.perturbation)] = (
|
|
266
|
+
request_state.instance
|
|
267
|
+
)
|
|
268
|
+
|
|
269
|
+
# Process images and include if they exist
|
|
270
|
+
images: List[str] = [
|
|
271
|
+
encode_base64(image_location)
|
|
272
|
+
for image_location in gather_generated_image_locations(request_state.result)
|
|
273
|
+
if os.path.exists(image_location)
|
|
274
|
+
]
|
|
275
|
+
|
|
245
276
|
predictions.append(
|
|
246
277
|
DisplayPrediction(
|
|
247
278
|
instance_id=request_state.instance.id,
|
|
@@ -249,9 +280,11 @@ def write_run_display_json(run_path: str, run_spec: RunSpec, schema: Schema, ski
|
|
|
249
280
|
train_trial_index=request_state.train_trial_index,
|
|
250
281
|
predicted_text=predicted_text,
|
|
251
282
|
truncated_predicted_text=_truncate_predicted_text(predicted_text, request_state, run_spec.adapter_spec),
|
|
283
|
+
base64_images=images,
|
|
252
284
|
mapped_output=mapped_output,
|
|
253
285
|
reference_index=request_state.reference_index,
|
|
254
286
|
stats=trial_stats,
|
|
287
|
+
annotations=request_state.annotations,
|
|
255
288
|
)
|
|
256
289
|
)
|
|
257
290
|
requests.append(
|
|
@@ -1,9 +1,8 @@
|
|
|
1
1
|
from dataclasses import dataclass, field
|
|
2
|
-
from datetime import date
|
|
3
2
|
from typing import List, Optional, Dict
|
|
4
3
|
import dacite
|
|
5
4
|
import mako.template
|
|
6
|
-
import yaml
|
|
5
|
+
import yaml
|
|
7
6
|
import importlib_resources as resources
|
|
8
7
|
|
|
9
8
|
from helm.common.general import hlog
|
|
@@ -11,8 +10,11 @@ from helm.benchmark.metrics.metric_name import MetricName
|
|
|
11
10
|
from helm.benchmark.augmentations.perturbation_description import PERTURBATION_WORST
|
|
12
11
|
|
|
13
12
|
|
|
13
|
+
# TODO: change to `helm.benchmark.config`
|
|
14
14
|
SCHEMA_YAML_PACKAGE: str = "helm.benchmark.static"
|
|
15
|
-
|
|
15
|
+
|
|
16
|
+
# TODO: add heim, vhelm, etc.
|
|
17
|
+
SCHEMA_CLASSIC_YAML_FILENAME: str = "schema_classic.yaml"
|
|
16
18
|
|
|
17
19
|
|
|
18
20
|
@dataclass(frozen=True)
|
|
@@ -43,34 +45,6 @@ class Field:
|
|
|
43
45
|
return name
|
|
44
46
|
|
|
45
47
|
|
|
46
|
-
# Note: also see Model from `models.py`.
|
|
47
|
-
@dataclass(frozen=True)
|
|
48
|
-
class ModelField(Field):
|
|
49
|
-
# Organization that originally created the model (e.g. "EleutherAI")
|
|
50
|
-
# Note that this may be different from group or the prefix of the model `name`
|
|
51
|
-
# ("together" in "together/gpt-j-6b") as the hosting organization
|
|
52
|
-
# may be different from the creator organization. We also capitalize
|
|
53
|
-
# this field properly to later display in the UI.
|
|
54
|
-
# TODO: in the future, we want to cleanup the naming in the following ways:
|
|
55
|
-
# - make the creator_organization an identifier with a separate display name
|
|
56
|
-
# - have a convention like <hosting_organization><creator_organization>/<model_name>
|
|
57
|
-
creator_organization: Optional[str] = None
|
|
58
|
-
|
|
59
|
-
# How this model is available (e.g., limited)
|
|
60
|
-
access: Optional[str] = None
|
|
61
|
-
|
|
62
|
-
# Whether we have yet to evaluate this model
|
|
63
|
-
todo: bool = False
|
|
64
|
-
|
|
65
|
-
# When was the model released
|
|
66
|
-
release_date: Optional[date] = None
|
|
67
|
-
|
|
68
|
-
# The number of parameters
|
|
69
|
-
# This should be a string as the number of parameters is usually a round number (175B),
|
|
70
|
-
# but we set it as an int for plotting purposes.
|
|
71
|
-
num_parameters: Optional[int] = None
|
|
72
|
-
|
|
73
|
-
|
|
74
48
|
@dataclass(frozen=True)
|
|
75
49
|
class MetricNameMatcher:
|
|
76
50
|
"""
|
|
@@ -97,7 +71,7 @@ class MetricNameMatcher:
|
|
|
97
71
|
if self.name != metric_name.name:
|
|
98
72
|
return False
|
|
99
73
|
|
|
100
|
-
if self.split != metric_name.split:
|
|
74
|
+
if self.split != "__all__" and self.split != metric_name.split:
|
|
101
75
|
return False
|
|
102
76
|
|
|
103
77
|
# Optional
|
|
@@ -118,9 +92,11 @@ class MetricNameMatcher:
|
|
|
118
92
|
return MetricNameMatcher(
|
|
119
93
|
name=mako.template.Template(self.name).render(**environment),
|
|
120
94
|
split=mako.template.Template(self.split).render(**environment),
|
|
121
|
-
perturbation_name=
|
|
122
|
-
|
|
123
|
-
|
|
95
|
+
perturbation_name=(
|
|
96
|
+
mako.template.Template(self.perturbation_name).render(**environment)
|
|
97
|
+
if self.perturbation_name is not None
|
|
98
|
+
else None
|
|
99
|
+
),
|
|
124
100
|
)
|
|
125
101
|
|
|
126
102
|
|
|
@@ -132,6 +108,9 @@ class MetricGroup(Field):
|
|
|
132
108
|
|
|
133
109
|
metrics: List[MetricNameMatcher] = field(default_factory=list)
|
|
134
110
|
|
|
111
|
+
hide_win_rates: Optional[bool] = None
|
|
112
|
+
"""If set to true, do not compute win rates."""
|
|
113
|
+
|
|
135
114
|
|
|
136
115
|
BY_METRIC = "by_metric"
|
|
137
116
|
BY_GROUP = "by_group"
|
|
@@ -207,18 +186,18 @@ class RunGroup(Field):
|
|
|
207
186
|
|
|
208
187
|
# Which adapter_spec fields we should preserve when displaying methods for this group
|
|
209
188
|
# When we are constructing a table where the rows are methods, what constitutes a "method" is given by the set of
|
|
210
|
-
# adapter keys. By default, this should just be "
|
|
211
|
-
# "marginalized out". However, for ablations, we want to include both "
|
|
212
|
-
|
|
189
|
+
# adapter keys. By default, this should just be "model_deployment" (e.g., BLOOM), where details like
|
|
190
|
+
# "num_train_instances" are "marginalized out". However, for ablations, we want to include both "model_deployment"
|
|
191
|
+
# and "num_train_instances".
|
|
192
|
+
# NOTE: "model" is kept for backward compatibility reason.
|
|
193
|
+
# TODO: remove when we don't want helm-summarize to support runs before November 2023 anymore.
|
|
194
|
+
adapter_keys_shown: List[str] = field(default_factory=lambda: ["model_deployment", "model"])
|
|
213
195
|
|
|
214
196
|
|
|
215
197
|
@dataclass
|
|
216
198
|
class Schema:
|
|
217
199
|
"""Specifies information about what to display on the frontend."""
|
|
218
200
|
|
|
219
|
-
# Models
|
|
220
|
-
models: List[ModelField]
|
|
221
|
-
|
|
222
201
|
# Adapter fields (e.g., temperature)
|
|
223
202
|
adapter: List[Field]
|
|
224
203
|
|
|
@@ -235,16 +214,19 @@ class Schema:
|
|
|
235
214
|
run_groups: List[RunGroup]
|
|
236
215
|
|
|
237
216
|
def __post_init__(self):
|
|
238
|
-
self.name_to_model = {model.name: model for model in self.models}
|
|
239
217
|
self.name_to_metric = {metric.name: metric for metric in self.metrics}
|
|
240
218
|
self.name_to_perturbation = {perturbation.name: perturbation for perturbation in self.perturbations}
|
|
241
219
|
self.name_to_metric_group = {metric_group.name: metric_group for metric_group in self.metric_groups}
|
|
242
220
|
self.name_to_run_group = {run_group.name: run_group for run_group in self.run_groups}
|
|
243
221
|
|
|
244
222
|
|
|
245
|
-
def
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
223
|
+
def get_default_schema_path() -> str:
|
|
224
|
+
return resources.files(SCHEMA_YAML_PACKAGE).joinpath(SCHEMA_CLASSIC_YAML_FILENAME)
|
|
225
|
+
|
|
226
|
+
|
|
227
|
+
def read_schema(schema_path: str) -> Schema:
|
|
228
|
+
# TODO: merge in model metadata from `model_metadata.yaml`
|
|
229
|
+
hlog(f"Reading schema file {schema_path}...")
|
|
230
|
+
with open(schema_path, "r") as f:
|
|
249
231
|
raw = yaml.safe_load(f)
|
|
250
232
|
return dacite.from_dict(Schema, raw)
|