crfm-helm 0.4.0__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/METADATA +134 -31
- crfm_helm-0.5.0.dist-info/RECORD +642 -0
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +31 -3
- helm/benchmark/adaptation/adapters/adapter.py +2 -2
- helm/benchmark/adaptation/adapters/adapter_factory.py +24 -27
- helm/benchmark/adaptation/adapters/generation_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +20 -4
- helm/benchmark/adaptation/adapters/language_modeling_adapter.py +2 -3
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +104 -0
- helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +2 -1
- helm/benchmark/adaptation/adapters/test_adapter.py +2 -1
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +32 -8
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +7 -19
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +60 -6
- helm/benchmark/adaptation/common_adapter_specs.py +376 -0
- helm/benchmark/adaptation/request_state.py +6 -1
- helm/benchmark/adaptation/scenario_state.py +6 -2
- helm/benchmark/annotation/annotator.py +43 -0
- helm/benchmark/annotation/annotator_factory.py +61 -0
- helm/benchmark/annotation/image2structure/image_compiler_annotator.py +88 -0
- helm/benchmark/annotation/image2structure/latex_compiler_annotator.py +59 -0
- helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +84 -0
- helm/benchmark/annotation/image2structure/webpage_compiler_annotator.py +132 -0
- helm/benchmark/annotation/test_annotator_factory.py +26 -0
- helm/benchmark/annotation/test_dummy_annotator.py +44 -0
- helm/benchmark/annotation_executor.py +124 -0
- helm/benchmark/augmentations/data_augmenter.py +0 -2
- helm/benchmark/augmentations/gender_perturbation.py +1 -1
- helm/benchmark/augmentations/perturbation.py +8 -2
- helm/benchmark/augmentations/perturbation_description.py +1 -1
- helm/benchmark/augmentations/suffix_perturbation.py +29 -0
- helm/benchmark/augmentations/test_perturbation.py +11 -7
- helm/benchmark/augmentations/translate_perturbation.py +30 -0
- helm/benchmark/config_registry.py +7 -1
- helm/benchmark/executor.py +46 -16
- helm/benchmark/huggingface_registration.py +20 -7
- helm/benchmark/metrics/basic_metrics.py +169 -664
- helm/benchmark/metrics/bbq_metrics.py +3 -4
- helm/benchmark/metrics/bias_metrics.py +6 -6
- helm/benchmark/metrics/classification_metrics.py +11 -8
- helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
- helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
- helm/benchmark/metrics/code_metrics_helper.py +0 -2
- helm/benchmark/metrics/common_metric_specs.py +167 -0
- helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
- helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
- helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
- helm/benchmark/metrics/disinformation_metrics.py +4 -110
- helm/benchmark/metrics/dry_run_metrics.py +2 -2
- helm/benchmark/metrics/efficiency_metrics.py +206 -0
- helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
- helm/benchmark/metrics/evaluate_reference_metrics.py +376 -0
- helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
- helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
- helm/benchmark/metrics/image_generation/clip_score_metrics.py +73 -0
- helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
- helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
- helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
- helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
- helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
- helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
- helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
- helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
- helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
- helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
- helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
- helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
- helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
- helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
- helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
- helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
- helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
- helm/benchmark/metrics/image_generation/q16/test_q16.py +18 -0
- helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
- helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
- helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
- helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
- helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
- helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
- helm/benchmark/metrics/instruction_following_critique_metrics.py +3 -1
- helm/benchmark/metrics/language_modeling_metrics.py +99 -0
- helm/benchmark/metrics/machine_translation_metrics.py +89 -0
- helm/benchmark/metrics/metric.py +93 -172
- helm/benchmark/metrics/metric_name.py +0 -1
- helm/benchmark/metrics/metric_service.py +16 -0
- helm/benchmark/metrics/paraphrase_generation_metrics.py +3 -4
- helm/benchmark/metrics/ranking_metrics.py +2 -2
- helm/benchmark/metrics/reference_metric.py +148 -0
- helm/benchmark/metrics/summac/model_summac.py +0 -2
- helm/benchmark/metrics/summarization_metrics.py +2 -2
- helm/benchmark/metrics/test_classification_metrics.py +8 -5
- helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
- helm/benchmark/metrics/{test_basic_metrics.py → test_evaluate_reference_metrics.py} +5 -1
- helm/benchmark/metrics/test_metric.py +2 -2
- helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +10 -2
- helm/benchmark/metrics/toxicity_metrics.py +1 -1
- helm/benchmark/metrics/toxicity_utils.py +23 -0
- helm/benchmark/metrics/unitxt_metrics.py +81 -0
- helm/benchmark/metrics/vision_language/__init__.py +0 -0
- helm/benchmark/metrics/vision_language/emd_utils.py +341 -0
- helm/benchmark/metrics/vision_language/image_metrics.py +450 -0
- helm/benchmark/metrics/vision_language/image_utils.py +100 -0
- helm/benchmark/model_deployment_registry.py +74 -0
- helm/benchmark/model_metadata_registry.py +36 -0
- helm/benchmark/multi_gpu_runner.py +133 -0
- helm/benchmark/presentation/create_plots.py +8 -7
- helm/benchmark/presentation/run_display.py +26 -10
- helm/benchmark/presentation/schema.py +15 -40
- helm/benchmark/presentation/summarize.py +119 -79
- helm/benchmark/presentation/table.py +8 -8
- helm/benchmark/presentation/test_contamination.py +2 -2
- helm/benchmark/presentation/test_run_entry.py +1 -2
- helm/benchmark/presentation/test_summarize.py +3 -3
- helm/benchmark/run.py +54 -26
- helm/benchmark/run_expander.py +214 -16
- helm/benchmark/run_spec.py +93 -0
- helm/benchmark/run_spec_factory.py +162 -0
- helm/benchmark/run_specs/__init__.py +0 -0
- helm/benchmark/run_specs/classic_run_specs.py +1510 -0
- helm/benchmark/run_specs/cleva_run_specs.py +277 -0
- helm/benchmark/run_specs/decodingtrust_run_specs.py +314 -0
- helm/benchmark/run_specs/heim_run_specs.py +623 -0
- helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
- helm/benchmark/run_specs/lite_run_specs.py +307 -0
- helm/benchmark/run_specs/simple_run_specs.py +104 -0
- helm/benchmark/run_specs/unitxt_run_specs.py +42 -0
- helm/benchmark/run_specs/vlm_run_specs.py +501 -0
- helm/benchmark/runner.py +51 -57
- helm/benchmark/runner_config_registry.py +21 -0
- helm/benchmark/scenarios/bbq_scenario.py +1 -1
- helm/benchmark/scenarios/bold_scenario.py +2 -2
- helm/benchmark/scenarios/code_scenario.py +1 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +169 -0
- helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +121 -0
- helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +77 -0
- helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +324 -0
- helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +204 -0
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +559 -0
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +67 -0
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +78 -0
- helm/benchmark/scenarios/dialogue_scenarios.py +0 -1
- helm/benchmark/scenarios/image_generation/__init__.py +0 -0
- helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
- helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
- helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
- helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
- helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
- helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
- helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
- helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
- helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
- helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
- helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
- helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
- helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
- helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
- helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
- helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
- helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
- helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
- helm/benchmark/scenarios/imdb_scenario.py +0 -1
- helm/benchmark/scenarios/live_qa_scenario.py +94 -0
- helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
- helm/benchmark/scenarios/math_scenario.py +19 -2
- helm/benchmark/scenarios/medication_qa_scenario.py +60 -0
- helm/benchmark/scenarios/numeracy_scenario.py +1 -1
- helm/benchmark/scenarios/opinions_qa_scenario.py +0 -4
- helm/benchmark/scenarios/scenario.py +4 -0
- helm/benchmark/scenarios/simple_scenarios.py +122 -1
- helm/benchmark/scenarios/test_math_scenario.py +6 -0
- helm/benchmark/scenarios/test_scenario.py +6 -3
- helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +135 -0
- helm/benchmark/scenarios/unitxt_scenario.py +56 -0
- helm/benchmark/scenarios/verifiability_judgment_scenario.py +3 -1
- helm/benchmark/scenarios/vicuna_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
- helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +92 -0
- helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
- helm/benchmark/scenarios/vision_language/image2structure/__init__.py +0 -0
- helm/benchmark/scenarios/vision_language/image2structure/chart2csv_scenario.py +55 -0
- helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +214 -0
- helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +25 -0
- helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +20 -0
- helm/benchmark/scenarios/vision_language/image2structure/utils_latex.py +347 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/driver.py +84 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/jekyll_server.py +182 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/utils.py +31 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +225 -0
- helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
- helm/benchmark/scenarios/vision_language/mme_scenario.py +145 -0
- helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
- helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
- helm/benchmark/scenarios/vision_language/pope_scenario.py +104 -0
- helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +129 -0
- helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
- helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +1 -2
- helm/benchmark/scenarios/vision_language/vqa_scenario.py +1 -1
- helm/benchmark/scenarios/wmt_14_scenario.py +1 -1
- helm/benchmark/server.py +24 -1
- helm/benchmark/slurm_runner.py +70 -49
- helm/benchmark/static/benchmarking.js +1 -1
- helm/benchmark/static/schema_classic.yaml +258 -1066
- helm/benchmark/static/schema_instruction_following.yaml +210 -0
- helm/benchmark/static/schema_lite.yaml +2 -227
- helm/benchmark/static/schema_mmlu.yaml +1507 -0
- helm/benchmark/static/schema_unitxt.yaml +428 -0
- helm/benchmark/static/schema_vlm.yaml +576 -0
- helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
- helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
- helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
- helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
- helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
- helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
- helm/benchmark/static_build/assets/crfm-logo-74391ab8.png +0 -0
- helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
- helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
- helm/benchmark/static_build/assets/heim-logo-3e5e3aa4.png +0 -0
- helm/benchmark/static_build/assets/helm-logo-simple-2ed5400b.png +0 -0
- helm/benchmark/static_build/assets/helmhero-28e90f4d.png +0 -0
- helm/benchmark/static_build/assets/index-5088afcb.css +1 -0
- helm/benchmark/static_build/assets/index-d839df55.js +9 -0
- helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
- helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
- helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
- helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
- helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
- helm/benchmark/static_build/assets/react-d4a0b69b.js +85 -0
- helm/benchmark/static_build/assets/recharts-6d337683.js +97 -0
- helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
- helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
- helm/benchmark/static_build/assets/tremor-54a99cc4.js +10 -0
- helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
- helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
- helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
- helm/benchmark/static_build/config.js +4 -0
- helm/benchmark/static_build/index.html +20 -0
- helm/benchmark/test_data_preprocessor.py +3 -3
- helm/benchmark/test_model_deployment_definition.py +14 -16
- helm/benchmark/test_run_expander.py +1 -1
- helm/benchmark/window_services/ai21_window_service.py +22 -33
- helm/benchmark/window_services/cohere_window_service.py +1 -63
- helm/benchmark/window_services/default_window_service.py +2 -44
- helm/benchmark/window_services/encoder_decoder_window_service.py +0 -11
- helm/benchmark/window_services/ice_window_service.py +0 -34
- helm/benchmark/window_services/image_generation/__init__.py +0 -0
- helm/benchmark/window_services/image_generation/clip_window_service.py +15 -0
- helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
- helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
- helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
- helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
- helm/benchmark/window_services/local_window_service.py +21 -4
- helm/benchmark/window_services/test_anthropic_window_service.py +2 -1
- helm/benchmark/window_services/test_bloom_window_service.py +2 -1
- helm/benchmark/window_services/test_cohere_window_service.py +2 -1
- helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
- helm/benchmark/window_services/test_gpt2_window_service.py +2 -2
- helm/benchmark/window_services/test_gpt4_window_service.py +2 -1
- helm/benchmark/window_services/test_gptj_window_service.py +3 -2
- helm/benchmark/window_services/test_gptneox_window_service.py +3 -2
- helm/benchmark/window_services/test_ice_window_service.py +2 -1
- helm/benchmark/window_services/test_openai_window_service.py +2 -1
- helm/benchmark/window_services/test_opt_window_service.py +3 -2
- helm/benchmark/window_services/test_palmyra_window_service.py +2 -1
- helm/benchmark/window_services/test_t0pp_window_service.py +2 -1
- helm/benchmark/window_services/test_t511b_window_service.py +2 -1
- helm/benchmark/window_services/test_ul2_window_service.py +2 -1
- helm/benchmark/window_services/test_utils.py +3 -2
- helm/benchmark/window_services/test_yalm_window_service.py +2 -1
- helm/benchmark/window_services/window_service.py +42 -0
- helm/benchmark/window_services/window_service_factory.py +4 -1
- helm/benchmark/window_services/yalm_window_service.py +0 -27
- helm/clients/__init__.py +0 -0
- helm/{proxy/clients → clients}/ai21_client.py +3 -9
- helm/clients/aleph_alpha_client.py +112 -0
- helm/{proxy/clients → clients}/anthropic_client.py +203 -18
- helm/{proxy/clients → clients}/auto_client.py +59 -31
- helm/clients/bedrock_client.py +128 -0
- helm/clients/bedrock_utils.py +72 -0
- helm/{proxy/clients → clients}/client.py +65 -7
- helm/clients/clip_score_client.py +49 -0
- helm/clients/clip_scorers/__init__.py +0 -0
- helm/clients/clip_scorers/base_clip_scorer.py +18 -0
- helm/clients/clip_scorers/clip_scorer.py +50 -0
- helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
- helm/{proxy/clients → clients}/cohere_client.py +4 -11
- helm/clients/gcs_client.py +82 -0
- helm/{proxy/clients → clients}/google_client.py +5 -5
- helm/clients/google_translate_client.py +35 -0
- helm/{proxy/clients → clients}/http_model_client.py +5 -7
- helm/{proxy/clients → clients}/huggingface_client.py +43 -64
- helm/clients/image_generation/__init__.py +0 -0
- helm/clients/image_generation/adobe_vision_client.py +78 -0
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +98 -0
- helm/clients/image_generation/cogview2/__init__.py +0 -0
- helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
- helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
- helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
- helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +96 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
- helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +141 -0
- helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
- helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
- helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
- helm/clients/image_generation/cogview2_client.py +191 -0
- helm/clients/image_generation/dalle2_client.py +192 -0
- helm/clients/image_generation/dalle3_client.py +108 -0
- helm/clients/image_generation/dalle_mini/__init__.py +3 -0
- helm/clients/image_generation/dalle_mini/data.py +442 -0
- helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
- helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
- helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
- helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
- helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
- helm/clients/image_generation/dalle_mini/model/text.py +251 -0
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
- helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
- helm/clients/image_generation/dalle_mini_client.py +190 -0
- helm/clients/image_generation/deep_floyd_client.py +78 -0
- helm/clients/image_generation/huggingface_diffusers_client.py +249 -0
- helm/clients/image_generation/image_generation_client_utils.py +9 -0
- helm/clients/image_generation/lexica_client.py +86 -0
- helm/clients/image_generation/mindalle/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/__init__.py +216 -0
- helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
- helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
- helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
- helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
- helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
- helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
- helm/clients/image_generation/mindalle/utils/config.py +129 -0
- helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
- helm/clients/image_generation/mindalle/utils/utils.py +89 -0
- helm/clients/image_generation/mindalle_client.py +115 -0
- helm/clients/image_generation/nudity_check_client.py +64 -0
- helm/clients/image_generation/together_image_generation_client.py +111 -0
- helm/{proxy/clients → clients}/lit_gpt_client.py +4 -4
- helm/{proxy/clients → clients}/megatron_client.py +5 -5
- helm/clients/mistral_client.py +134 -0
- helm/clients/moderation_api_client.py +109 -0
- helm/clients/open_lm_client.py +43 -0
- helm/clients/openai_client.py +302 -0
- helm/{proxy/clients → clients}/palmyra_client.py +6 -8
- helm/{proxy/clients → clients}/perspective_api_client.py +7 -8
- helm/clients/simple_client.py +64 -0
- helm/{proxy/clients → clients}/test_auto_client.py +13 -15
- helm/clients/test_client.py +100 -0
- helm/{proxy/clients → clients}/test_huggingface_client.py +15 -16
- helm/clients/test_simple_client.py +19 -0
- helm/{proxy/clients → clients}/test_together_client.py +20 -8
- helm/{proxy/clients → clients}/together_client.py +12 -72
- helm/clients/vertexai_client.py +391 -0
- helm/clients/vision_language/__init__.py +0 -0
- helm/clients/vision_language/huggingface_vlm_client.py +104 -0
- helm/{proxy/clients → clients}/vision_language/idefics_client.py +53 -48
- helm/clients/vision_language/open_flamingo/__init__.py +2 -0
- helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
- helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
- helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
- helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
- helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
- helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
- helm/clients/vision_language/open_flamingo_client.py +155 -0
- helm/clients/vision_language/qwen_vlm_client.py +171 -0
- helm/clients/vllm_client.py +46 -0
- helm/common/cache.py +16 -4
- helm/common/cache_backend_config.py +47 -0
- helm/common/clip_score_request.py +41 -0
- helm/common/file_caches/__init__.py +0 -0
- helm/common/file_caches/file_cache.py +16 -0
- helm/common/file_caches/local_file_cache.py +61 -0
- helm/common/file_caches/test_local_file_cache.py +25 -0
- helm/common/file_upload_request.py +27 -0
- helm/common/general.py +1 -1
- helm/common/image_generation_parameters.py +25 -0
- helm/common/images_utils.py +24 -1
- helm/common/key_value_store.py +35 -4
- helm/common/media_object.py +13 -0
- helm/common/moderations_api_request.py +71 -0
- helm/common/mongo_key_value_store.py +3 -3
- helm/common/multimodal_request_utils.py +31 -0
- helm/common/nudity_check_request.py +29 -0
- helm/common/request.py +15 -17
- helm/common/test_general.py +6 -0
- helm/common/tokenization_request.py +1 -1
- helm/config/model_deployments.yaml +1069 -546
- helm/config/model_metadata.yaml +753 -31
- helm/config/tokenizer_configs.yaml +142 -43
- helm/proxy/accounts.py +31 -4
- helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
- helm/proxy/critique/model_critique_client.py +8 -6
- helm/proxy/example_queries.py +29 -17
- helm/proxy/server.py +70 -5
- helm/proxy/services/remote_service.py +31 -0
- helm/proxy/services/server_service.py +96 -16
- helm/proxy/services/service.py +30 -0
- helm/proxy/services/test_remote_service.py +4 -3
- helm/proxy/services/test_service.py +0 -12
- helm/proxy/test_accounts.py +32 -0
- helm/proxy/token_counters/auto_token_counter.py +37 -37
- helm/proxy/token_counters/test_auto_token_counter.py +164 -0
- helm/proxy/token_counters/token_counter.py +3 -5
- helm/tokenizers/__init__.py +0 -0
- helm/{proxy/tokenizers → tokenizers}/ai21_tokenizer.py +3 -3
- helm/{proxy/tokenizers → tokenizers}/anthropic_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/auto_tokenizer.py +6 -9
- helm/{proxy/tokenizers → tokenizers}/cohere_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +3 -3
- helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +7 -26
- helm/tokenizers/simple_tokenizer.py +33 -0
- helm/{proxy/tokenizers → tokenizers}/test_anthropic_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/test_huggingface_tokenizer.py +3 -0
- helm/tokenizers/test_simple_tokenizer.py +33 -0
- helm/{proxy/tokenizers → tokenizers}/vertexai_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +5 -3
- helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
- helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
- crfm_helm-0.4.0.dist-info/RECORD +0 -397
- helm/benchmark/run_specs.py +0 -2762
- helm/benchmark/test_model_properties.py +0 -1570
- helm/benchmark/vlm_run_specs.py +0 -97
- helm/benchmark/window_services/flan_t5_window_service.py +0 -29
- helm/benchmark/window_services/gpt2_window_service.py +0 -32
- helm/benchmark/window_services/huggingface_window_service.py +0 -60
- helm/benchmark/window_services/t0pp_window_service.py +0 -35
- helm/benchmark/window_services/t511b_window_service.py +0 -30
- helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
- helm/benchmark/window_services/ul2_window_service.py +0 -30
- helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
- helm/common/cache_utils.py +0 -14
- helm/proxy/clients/aleph_alpha_client.py +0 -95
- helm/proxy/clients/goose_ai_client.py +0 -99
- helm/proxy/clients/microsoft_client.py +0 -180
- helm/proxy/clients/openai_client.py +0 -206
- helm/proxy/clients/simple_client.py +0 -60
- helm/proxy/clients/test_client.py +0 -49
- helm/proxy/clients/vertexai_client.py +0 -115
- helm/proxy/token_counters/ai21_token_counter.py +0 -20
- helm/proxy/token_counters/cohere_token_counter.py +0 -13
- helm/proxy/token_counters/free_token_counter.py +0 -12
- helm/proxy/token_counters/gooseai_token_counter.py +0 -24
- helm/proxy/token_counters/openai_token_counter.py +0 -22
- helm/proxy/token_counters/test_ai21_token_counter.py +0 -88
- helm/proxy/token_counters/test_openai_token_counter.py +0 -81
- helm/proxy/tokenizers/simple_tokenizer.py +0 -32
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/LICENSE +0 -0
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/top_level.txt +0 -0
- /helm/{proxy/clients → benchmark/annotation}/__init__.py +0 -0
- /helm/{proxy/clients/vision_language → benchmark/annotation/image2structure}/__init__.py +0 -0
- /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
- /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
- /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
- /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
- /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
- /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/ice_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/test_ice_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +0 -0

helm/benchmark/adaptation/common_adapter_specs.py (new file)
@@ -0,0 +1,376 @@
+from typing import List, Optional
+
+from helm.benchmark.adaptation.adapter_spec import (
+    ADAPT_GENERATION,
+    ADAPT_LANGUAGE_MODELING,
+    ADAPT_MULTIPLE_CHOICE_JOINT,
+    ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED,
+    ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL,
+    ADAPT_RANKING_BINARY,
+    AdapterSpec,
+)
+
+
+def format_instructions(instructions: str) -> str:
+    if len(instructions) > 0:
+        instructions += "\n"
+    return instructions
+
+
+def get_multiple_choice_joint_adapter_spec(
+    instructions: str,
+    input_noun: Optional[str],
+    output_noun: str,
+    num_outputs: int = 5,
+    max_train_instances: int = 5,
+    max_tokens: int = 5,
+    sample_train: bool = True,
+    **kwargs,
+) -> AdapterSpec:
+    """
+    [instructions]
+
+    [input_noun]: [input]
+    [reference_1]
+    ...
+    [reference_k]
+    [output_noun]: [output]
+
+    [input_noun]: [input]
+    [reference_1]
+    ...
+    [reference_k]
+    [output_noun]:
+    """
+
+    return AdapterSpec(
+        method=ADAPT_MULTIPLE_CHOICE_JOINT,
+        instructions=format_instructions(instructions),
+        input_prefix=f"{input_noun}: " if input_noun is not None else "",
+        input_suffix="\n" if input_noun is not None else "",
+        output_prefix=f"{output_noun}: ",
+        output_suffix="\n",
+        max_train_instances=max_train_instances,
+        num_outputs=num_outputs,
+        max_tokens=max_tokens,
+        temperature=0.0,
+        stop_sequences=["\n"],
+        sample_train=sample_train,
+        **kwargs,
+    )
+
+
+def get_multiple_choice_separate_adapter_spec(method: str, empty_input: bool = False) -> AdapterSpec:
+    """
+    [input] [reference_i]
+    or
+    [reference_i]
+    """
+    assert method in {ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL, ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED}
+
+    return AdapterSpec(
+        method=method,
+        instructions="",
+        input_prefix="",
+        input_suffix="",
+        output_prefix=" " if not empty_input else "",
+        output_suffix="",
+        # Separate is basically language modeling, so can't easily use in-context examples
+        max_train_instances=0,
+        num_outputs=1,
+        max_tokens=0,
+        temperature=0.0,
+    )
+
+
+def get_multiple_choice_adapter_spec(
+    method: str,
+    instructions: str,
+    input_noun: Optional[str],
+    output_noun: str,
+    max_train_instances: int = 5,
+    num_outputs: int = 5,
+    max_tokens: int = 1,
+    empty_input: bool = False,
+    sample_train: bool = True,
+    **kwargs,
+):
+    """
+    Toggle between joint and separate adapters.
+    """
+    if method == ADAPT_MULTIPLE_CHOICE_JOINT:
+        return get_multiple_choice_joint_adapter_spec(
+            instructions,
+            input_noun,
+            output_noun,
+            max_train_instances=max_train_instances,
+            num_outputs=num_outputs,
+            max_tokens=max_tokens,
+            sample_train=sample_train,
+            **kwargs,
+        )
+    elif method in {ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL, ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED}:
+        return get_multiple_choice_separate_adapter_spec(method, empty_input)
+    else:
+        raise ValueError(f"Invalid adaptation method: {method}")
+
+
+def get_ranking_binary_adapter_spec(
+    instructions: str = "",
+    document_noun: str = "Passage",
+    query_noun: str = "Query",
+    output_prefix: str = "Does the passage answer the query?",
+    output_noun: str = "Answer",
+    max_train_instances: int = 4,
+    num_outputs: int = 1,
+    num_train_trials: int = 1,
+    temperature: float = 0.0,
+    max_tokens: int = 5,
+    **kwargs,
+) -> AdapterSpec:
+    """
+    [instructions]
+
+    [object_noun]: [object]
+    [query_noun]: [query]
+    [prompt_noun]: [prompt_content]
+    [output_noun]: [output]
+
+    ...
+
+    [object_noun]: [object]
+    [query_noun]: [query]
+    [prompt_noun]: [prompt_content]
+    [output_noun]: [output]
+
+    [object_noun]: [object]
+    [query_noun]: [query]
+    [prompt_noun]: [prompt_content]
+    [output_noun]: [output]
+    """
+    msg = (
+        "There must be an even number of in-context examples to ensure that"
+        "an equal number of positive and negative examples are included."
+    )
+    assert max_train_instances % 2 == 0, msg
+    max_train_instances = int(max_train_instances / 2)
+
+    return AdapterSpec(
+        method=ADAPT_RANKING_BINARY,
+        instructions=format_instructions(instructions),
+        input_prefix=f"{query_noun}: ",
+        input_suffix="\n",
+        reference_prefix=f"{document_noun}: ",
+        reference_suffix="\n",
+        output_prefix=f"{output_prefix}\n{output_noun}: ",
+        max_train_instances=max_train_instances,
+        num_outputs=num_outputs,
+        num_train_trials=num_train_trials,
+        temperature=temperature,
+        max_tokens=max_tokens,
+        **kwargs,
+    )
+
+
+def get_completion_adapter_spec(
+    instructions: str = "",
+    input_prefix: str = "",
+    output_prefix: str = "",
+    output_suffix: str = "",
+    max_train_instances: int = 0,
+    temperature: float = 0.0,
+    num_outputs: int = 1,
+    max_tokens: int = 100,
+    stop_sequences: Optional[List] = None,  # default value of `stop_sequences` is no stop sequence,
+    **kwargs,
+) -> AdapterSpec:
+    """
+    [input][output_prefix][output][output_suffix]
+
+    [input][output_prefix]
+    """
+    if stop_sequences is None:
+        stop_sequences = []
+
+    return AdapterSpec(
+        method=ADAPT_GENERATION,
+        instructions=format_instructions(instructions),
+        input_prefix=input_prefix,
+        input_suffix="",
+        output_prefix=output_prefix,
+        output_suffix=output_suffix,
+        max_train_instances=max_train_instances,
+        temperature=temperature,
+        num_outputs=num_outputs,
+        max_tokens=max_tokens,
+        stop_sequences=stop_sequences,
+        **kwargs,
+    )
+
+
+def get_generation_adapter_spec(
+    instructions: str = "",
+    input_noun: Optional[str] = None,
+    newline_after_input_noun: bool = False,
+    output_noun: Optional[str] = None,
+    newline_after_output_noun: bool = False,
+    max_train_instances: int = 5,
+    num_outputs: int = 1,
+    max_tokens: int = 5,
+    stop_sequences: Optional[List] = None,  # default value of `stop_sequences` is ["\n"]
+    temperature: float = 0.0,
+    multi_label: bool = False,
+) -> AdapterSpec:
+    """
+    [instructions]
+
+    [input_noun]: [input]
+    [output_noun]: [output]
+
+    [input_noun]: [input]
+    [output_noun]:
+    """
+
+    def format_prefix(noun: Optional[str], append_new_line: bool) -> str:
+        """
+        When `append_new_line` is False:
+            [input_noun]: [input]
+
+        When `append_new_line` is True:
+            [input_noun]:
+            [input]
+        """
+        prefix: str = f"{noun}:" if noun is not None else ""
+        if len(prefix) > 0:
+            prefix += "\n" if append_new_line else " "
+        return prefix
+
+    if stop_sequences is None:
+        stop_sequences = ["\n"]
+
+    return AdapterSpec(
+        method=ADAPT_GENERATION,
+        instructions=format_instructions(instructions),
+        input_prefix=format_prefix(input_noun, append_new_line=newline_after_input_noun),
+        input_suffix="\n",
+        output_prefix=format_prefix(output_noun, append_new_line=newline_after_output_noun),
+        output_suffix="\n",
+        max_train_instances=max_train_instances,
+        num_outputs=num_outputs,
+        max_tokens=max_tokens,
+        temperature=temperature,
+        stop_sequences=stop_sequences,
+        multi_label=multi_label,
+    )
+
+
+def get_instruct_adapter_spec(
+    num_outputs: int = 1,
+    max_tokens: int = 512,
+    temperature: float = 0.7,
+) -> AdapterSpec:
+    """
+    Zero-shot instruction-following.
+    """
+    return AdapterSpec(
+        method=ADAPT_GENERATION,
+        instructions="",
+        input_prefix="",
+        input_suffix="\n",
+        output_prefix="",
+        output_suffix="",
+        max_train_instances=0,
+        num_outputs=num_outputs,
+        max_tokens=max_tokens,
+        temperature=temperature,
+        stop_sequences=[],
+    )
+
+
+def get_few_shot_instruct_adapter_spec(
+    num_outputs: int = 1,
+    max_tokens: int = 512,
+    temperature: float = 0.7,
+    max_train_instances: int = 0,
+) -> AdapterSpec:
+    """
+    Few-shot instruction-following.
+    """
+    return AdapterSpec(
+        method=ADAPT_GENERATION,
+        instructions="",
+        input_prefix="",
+        input_suffix="\n",
+        output_prefix="",
+        output_suffix="",
+        max_train_instances=max_train_instances,
+        num_outputs=num_outputs,
+        max_tokens=max_tokens,
+        temperature=temperature,
+        stop_sequences=[],
+    )
+
+
+def get_language_modeling_adapter_spec() -> AdapterSpec:
+    """
+    Used for language modeling.
+    """
+    return AdapterSpec(
+        method=ADAPT_LANGUAGE_MODELING,
+        instructions="",
+        input_prefix="",
+        input_suffix="",
+        output_prefix="",
+        output_suffix="",
+        max_train_instances=0,
+        num_outputs=1,
+        max_tokens=0,
+        temperature=0.0,
+    )
+
+
+def get_summarization_adapter_spec(num_sents: Optional[int], max_train_instances: int = 5, **kwargs) -> AdapterSpec:
+    """
+    Used for summarization.
+    """
+
+    if num_sents == 1:
+        out_pref = "Summarize the above article in 1 sentence.\n"
+    elif num_sents is None:
+        out_pref = "Summarize the above article.\n"
+    else:
+        out_pref = f"Summarize the above article in {num_sents} sentences.\n"
+
+    return AdapterSpec(
+        method=ADAPT_GENERATION,
+        instructions="",
+        input_prefix="###\nArticle: ",
+        input_suffix="\n\n",
+        output_prefix=out_pref,
+        output_suffix="\n",
+        max_train_instances=max_train_instances,
+        num_outputs=1,
+        stop_sequences=["###"],  # Separator between few-shot instances.
+        **kwargs,
+    )
+
+
+def get_machine_translation_adapter_spec(
+    source_language, target_language, max_train_instances, **kwargs
+) -> AdapterSpec:
+    """
+    Used for machine translation.
+    """
+    return AdapterSpec(
+        method=ADAPT_GENERATION,
+        instructions=f"Translate the following sentences from {source_language} to {target_language}.",
+        input_prefix=f"{source_language}: ",
+        input_suffix="\n",
+        output_prefix=f"{target_language}: ",
+        output_suffix="\n",
+        max_train_instances=max_train_instances,
+        num_outputs=1,
+        stop_sequences=["\n\n"],
+        temperature=0.0,
+        **kwargs,
+    )
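
These helpers are presumably what the new run spec modules listed above build their prompts from. As an illustrative sketch (not part of this diff), a generation-style spec could be constructed like this; the instruction text, nouns, and token limit are placeholder values:

from helm.benchmark.adaptation.common_adapter_specs import get_generation_adapter_spec

# Placeholder values for illustration only.
adapter_spec = get_generation_adapter_spec(
    instructions="Answer the question in one word.",
    input_noun="Question",
    output_noun="Answer",
    max_tokens=10,
)
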
helm/benchmark/adaptation/request_state.py
@@ -1,5 +1,5 @@
 from dataclasses import dataclass
-from typing import Optional, Dict, List
+from typing import Optional, Dict, List, Any
 
 from helm.benchmark.scenarios.scenario import Instance
 from helm.common.general import indent_lines, format_text_lines, serialize
@@ -45,6 +45,11 @@ class RequestState:
     num_conditioning_tokens: int = 0
     """The number of initial tokens that will be ignored when computing language modeling metrics"""
 
+    annotations: Optional[Dict[str, Any]] = None
+    """Output of some post-processing step that is needed for the metric to understand the request
+    Should match the annotator's name to an Annotation (usually a list of dictionaries for each completion)
+    Example: parsing, rendering an image based on the text completion, etc."""
+
     def __post_init__(self):
         if self.request_mode:
             assert self.request_mode in ["original", "calibration"], f"Invalid request_mode: {self.request_mode}"
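
A sketch (not from the diff) of how a metric might read the new `annotations` field; the "latex_compiler" key is only an example of an annotator name:

def get_compiler_annotations(request_state):
    # `annotations` maps an annotator name to its output, typically one entry per completion.
    annotations = request_state.annotations or {}
    return annotations.get("latex_compiler", [])
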
helm/benchmark/adaptation/scenario_state.py
@@ -3,8 +3,9 @@ from dataclasses import dataclass
 from typing import List, Dict, Tuple, Optional
 
 from helm.benchmark.scenarios.scenario import Instance
-from .adapter_spec import AdapterSpec
-from .request_state import RequestState
+from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.annotation.annotator import AnnotatorSpec
 
 
 @dataclass
@@ -21,6 +22,9 @@ class ScenarioState:
     # List of `RequestState`s that were produced by adaptation (and execution)
     request_states: List[RequestState]
 
+    # Annotations to use for this run spec
+    annotator_specs: Optional[List[AnnotatorSpec]] = None
+
     def __post_init__(self):
         # Create derived indices based on `request_states` so it's easier for
         # the `Metric` later to access them. Two things are produced:
helm/benchmark/annotation/annotator.py (new file)
@@ -0,0 +1,43 @@
+from typing import Dict, List, Any
+from abc import abstractmethod, ABC
+from dataclasses import dataclass
+
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.common.object_spec import ObjectSpec
+
+
+class Annotator(ABC):
+    """Annotator is an abstract class for annotating a request state. Annotators are used to add additional
+    information to a request state that is needed for a metric to understand the request. This could be
+    parsing, rendering an image based on the text completion, etc."""
+
+    name: str
+    """Name of the annotator. Should be filled in by the subclass."""
+
+    @abstractmethod
+    def annotate(self, request_state: RequestState) -> Any:
+        """Fills the annotations field of the request state with additional information
+        that are implementation specific."""
+        pass
+
+
+@dataclass(frozen=True)
+class AnnotatorSpec(ObjectSpec):
+    """Specifies how to create an `Annotator`.
+    The user should only specify the class name.
+    The arguments will be filled in by the `AnnotatorFactory`.
+    """
+
+    pass
+
+
+class DummyAnnotator(Annotator):
+    """A dummy annotator that does nothing."""
+
+    name = "dummy"
+
+    def annotate(self, request_state: RequestState) -> List[Dict[str, Any]]:
+        if request_state.result is None:
+            raise ValueError("Annotation requires a result")
+        annotation_values: List[str] = [completion.text.upper() for completion in request_state.result.completions]
+        return [{"all_caps": value} for value in annotation_values]
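
A minimal sketch of a custom annotator against this interface, following the same shape as DummyAnnotator; the class name and the "word_count" annotation key are hypothetical and not part of the package:

from typing import Any, Dict, List

from helm.benchmark.adaptation.request_state import RequestState
from helm.benchmark.annotation.annotator import Annotator


class WordCountAnnotator(Annotator):
    """Hypothetical annotator that records the word count of each completion."""

    name = "word_count"

    def annotate(self, request_state: RequestState) -> List[Dict[str, Any]]:
        if request_state.result is None:
            raise ValueError("Annotation requires a result")
        return [{"word_count": len(completion.text.split())} for completion in request_state.result.completions]
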
helm/benchmark/annotation/annotator_factory.py (new file)
@@ -0,0 +1,61 @@
+import os
+from typing import Any, Dict, Mapping, Optional
+
+from helm.common.credentials_utils import provide_api_key
+from helm.common.cache_backend_config import CacheBackendConfig, CacheConfig
+from helm.common.hierarchical_logger import hlog
+from helm.common.object_spec import create_object, inject_object_spec_args
+from helm.benchmark.annotation.annotator import Annotator, AnnotatorSpec
+
+
+class AnnotatorFactory:
+    """Factory for creating annotators."""
+
+    def __init__(
+        self, credentials: Mapping[str, Any], file_storage_path: str, cache_backend_config: CacheBackendConfig
+    ):
+        self.credentials = credentials
+        self.file_storage_path = file_storage_path
+        self.cache_backend_config = cache_backend_config
+        hlog(f"AnnotatorFactory: file_storage_path = {file_storage_path}")
+        hlog(f"AnnotatorFactory: cache_backend_config = {cache_backend_config}")
+
+        # Cache for annotators
+        # This is used to prevent duplicate creation of annotators
+        # It is especially important as annotation is a multi-threaded
+        # process and creating a new annotator for each request can cause
+        # race conditions.
+        self.annotators: Dict[str, Annotator] = {}
+
+    def get_annotator(self, annotator_spec: AnnotatorSpec) -> Annotator:
+        """Return a annotator based on the name."""
+        # First try to find the annotator in the cache
+        assert annotator_spec.args is None or annotator_spec.args == {}
+        annotator_name: str = annotator_spec.class_name.split(".")[-1].lower().replace("annotator", "")
+        annotator: Optional[Annotator] = self.annotators.get(annotator_name)
+        if annotator is not None:
+            return annotator
+
+        # Otherwise, create the client
+        cache_config: CacheConfig = self.cache_backend_config.get_cache_config(annotator_name)
+        annotator_spec = inject_object_spec_args(
+            annotator_spec,
+            constant_bindings={
+                "cache_config": cache_config,
+            },
+            provider_bindings={
+                "api_key": lambda: provide_api_key(self.credentials, annotator_name),
+                "file_storage_path": lambda: self._get_file_storage_path(annotator_name),
+            },
+        )
+        annotator = create_object(annotator_spec)
+
+        # Cache the client
+        self.annotators[annotator_name] = annotator
+
+        return annotator
+
+    def _get_file_storage_path(self, annotator_name: str) -> str:
+        # Returns the path to use for a local file cache for the given annotator
+        local_file_cache_path: str = os.path.join(self.file_storage_path, "output", annotator_name)
+        return local_file_cache_path
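
Usage sketch, assuming a local storage path and a no-op cache backend; the BlackHoleCacheBackendConfig name is an assumption about helm/common/cache_backend_config.py (added in this release but not shown here), and the path is a placeholder:

from helm.benchmark.annotation.annotator import AnnotatorSpec
from helm.benchmark.annotation.annotator_factory import AnnotatorFactory
from helm.common.cache_backend_config import BlackHoleCacheBackendConfig  # assumed class name

factory = AnnotatorFactory(
    credentials={},
    file_storage_path="prod_env/cache",  # placeholder path
    cache_backend_config=BlackHoleCacheBackendConfig(),
)
dummy = factory.get_annotator(
    AnnotatorSpec(class_name="helm.benchmark.annotation.annotator.DummyAnnotator", args={})
)
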
helm/benchmark/annotation/image2structure/image_compiler_annotator.py (new file)
@@ -0,0 +1,88 @@
+from abc import ABC, abstractmethod
+from typing import Any, Dict, List, Tuple, Callable
+
+from dacite import from_dict
+
+from helm.benchmark.annotation.annotator import Annotator
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.common.cache import Cache, CacheConfig
+from helm.common.file_caches.local_file_cache import LocalPILFileCache
+from helm.common.optional_dependencies import handle_module_not_found_error
+from helm.common.media_object import MediaObject
+from helm.proxy.retry import get_retry_decorator
+
+try:
+    from PIL import Image
+except ModuleNotFoundError as e:
+    handle_module_not_found_error(e, suggestions=["images"])
+
+
+def retry_if_compilation_failed(result: Dict[str, Any]) -> bool:
+    """Retries when the compilation fails."""
+    return "unknown_error" in result
+
+
+retry: Callable = get_retry_decorator(
+    "Compilation", max_attempts=5, wait_exponential_multiplier_seconds=2, retry_on_result=retry_if_compilation_failed
+)
+
+
+class CompilationError(Exception):
+    pass
+
+
+class ImageCompilerAnnotator(Annotator, ABC):
+    """Annotator that compiles the text completions into an image."""
+
+    def __init__(self, cache_config: CacheConfig, file_storage_path: str):
+        self._cache = Cache(cache_config)
+        self._file_cache = LocalPILFileCache(file_storage_path)
+
+    @abstractmethod
+    def compile_completion_into_image(
+        self, request_state: RequestState, completion_text: str
+    ) -> Tuple[Image.Image, Dict[str, Any]]:
+        raise NotImplementedError
+
+    def postprocess_infos(self, infos: Dict[str, Any]) -> Dict[str, Any]:
+        """Postprocess the infos."""
+        return infos
+
+    def annotate(self, request_state: RequestState) -> List[Dict[str, Any]]:
+        """Fills the annotations field of the request state with the compiled image."""
+        assert request_state.result is not None, "Annotator can only be used after the request has been processed."
+        annotations: List[Dict[str, Any]] = []
+        for completion in request_state.result.completions:
+            completion_text: str = completion.text.strip()
+            raw_response: Dict[str, Any]
+
+            @retry
+            def compile() -> Dict[str, Any]:
+                def do_it() -> Dict[str, Any]:
+                    try:
+                        assert self._file_cache is not None
+                        image, infos = self.compile_completion_into_image(request_state, completion_text)
+                        infos = self.postprocess_infos(infos)
+                        image_path: str = self._file_cache.store_image(lambda: image)
+                        return {
+                            "media_object": MediaObject(location=image_path, content_type="image/png").to_dict(),
+                            **infos,
+                        }
+                    except CompilationError as e:
+                        return {"error": str(e)}
+
+                try:
+                    cache_key: Dict[str, str] = {"completion": completion_text}
+                    raw_response, _ = self._cache.get(cache_key, do_it)
+                    return raw_response
+                except Exception as e:
+                    return {"unknown_error": str(e)}
+
+            raw_response = compile()
+            response = {**raw_response}
+            if "media_object" in response:
+                response["media_object"] = from_dict(MediaObject, response["media_object"])
+
+            # Merge annotations
+            annotations.append(response)
+        return annotations
helm/benchmark/annotation/image2structure/latex_compiler_annotator.py (new file)
@@ -0,0 +1,59 @@
+from typing import List, Tuple, Dict, Any
+
+from helm.benchmark.annotation.image2structure.image_compiler_annotator import ImageCompilerAnnotator, CompilationError
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.common.optional_dependencies import handle_module_not_found_error
+from helm.benchmark.scenarios.vision_language.image2structure.utils_latex import (
+    latex_to_image,
+    strip_unnecessary_latex_parts,
+)
+
+try:
+    from PIL import Image
+except ModuleNotFoundError as e:
+    handle_module_not_found_error(e, suggestions=["images"])
+
+
+class LatexCompilerAnnotator(ImageCompilerAnnotator):
+    """Annotator that compiles the text completions into a LaTeX document."""
+
+    name: str = "latex_compiler"
+
+    # Delimiters for the code block
+    DELIMITERS: List[Tuple[str, str]] = [
+        ("```latex", "```"),
+        ("```", "```"),
+    ]
+
+    def postprocess_infos(self, infos: Dict[str, Any]) -> Dict[str, Any]:
+        """Postprocess the infos."""
+        annotations = super().postprocess_infos(infos)
+        assert "latex_code" in annotations, "The latex_code field should be present in the infos"
+        annotations["text"] = strip_unnecessary_latex_parts(annotations["latex_code"])
+        return annotations
+
+    def compile_completion_into_image(
+        self, request_state: RequestState, completion_text: str
+    ) -> Tuple[Image.Image, Dict[str, Any]]:
+        """Given a completion, parse the LaTeX and compile it into an image."""
+        # Get the assets path
+        assets_path: str = ""
+
+        # Check for code block delimiters
+        # After this completion should be a valid latex code block
+        for start, end in self.DELIMITERS:
+            if start in completion_text and end in completion_text[completion_text.index(start) + len(start) :]:
+                start_index = completion_text.index(start) + len(start)
+                end_index = completion_text.index(end, start_index)
+                completion_text = completion_text[start_index:end_index]
+                break
+
+        # Convert the latex code to an image
+        try:
+            image, infos = latex_to_image(completion_text, assets_path, crop=True)
+        except RuntimeError as e:
+            # We do not want to catch OptionalDependencyNotInstalled (error with latex installation)
+            # because it is a fatal error and should be handled by the user
+            raise CompilationError(str(e)) from e
+
+        return image, infos
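
A quick illustrative check (hypothetical input) of the delimiter handling above: the text between the fences is what ends up being passed to latex_to_image.

completion_text = "```latex\n$x^2 + y^2$\n```"
start, end = "```latex", "```"
start_index = completion_text.index(start) + len(start)
end_index = completion_text.index(end, start_index)
print(completion_text[start_index:end_index])  # "\n$x^2 + y^2$\n"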