crfm-helm 0.4.0__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff shows the changes between two publicly released versions of this package, as published to a supported public registry. It is provided for informational purposes only.
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/METADATA +134 -31
- crfm_helm-0.5.0.dist-info/RECORD +642 -0
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +31 -3
- helm/benchmark/adaptation/adapters/adapter.py +2 -2
- helm/benchmark/adaptation/adapters/adapter_factory.py +24 -27
- helm/benchmark/adaptation/adapters/generation_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +20 -4
- helm/benchmark/adaptation/adapters/language_modeling_adapter.py +2 -3
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +104 -0
- helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +2 -1
- helm/benchmark/adaptation/adapters/test_adapter.py +2 -1
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +32 -8
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +7 -19
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +60 -6
- helm/benchmark/adaptation/common_adapter_specs.py +376 -0
- helm/benchmark/adaptation/request_state.py +6 -1
- helm/benchmark/adaptation/scenario_state.py +6 -2
- helm/benchmark/annotation/annotator.py +43 -0
- helm/benchmark/annotation/annotator_factory.py +61 -0
- helm/benchmark/annotation/image2structure/image_compiler_annotator.py +88 -0
- helm/benchmark/annotation/image2structure/latex_compiler_annotator.py +59 -0
- helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +84 -0
- helm/benchmark/annotation/image2structure/webpage_compiler_annotator.py +132 -0
- helm/benchmark/annotation/test_annotator_factory.py +26 -0
- helm/benchmark/annotation/test_dummy_annotator.py +44 -0
- helm/benchmark/annotation_executor.py +124 -0
- helm/benchmark/augmentations/data_augmenter.py +0 -2
- helm/benchmark/augmentations/gender_perturbation.py +1 -1
- helm/benchmark/augmentations/perturbation.py +8 -2
- helm/benchmark/augmentations/perturbation_description.py +1 -1
- helm/benchmark/augmentations/suffix_perturbation.py +29 -0
- helm/benchmark/augmentations/test_perturbation.py +11 -7
- helm/benchmark/augmentations/translate_perturbation.py +30 -0
- helm/benchmark/config_registry.py +7 -1
- helm/benchmark/executor.py +46 -16
- helm/benchmark/huggingface_registration.py +20 -7
- helm/benchmark/metrics/basic_metrics.py +169 -664
- helm/benchmark/metrics/bbq_metrics.py +3 -4
- helm/benchmark/metrics/bias_metrics.py +6 -6
- helm/benchmark/metrics/classification_metrics.py +11 -8
- helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
- helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
- helm/benchmark/metrics/code_metrics_helper.py +0 -2
- helm/benchmark/metrics/common_metric_specs.py +167 -0
- helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
- helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
- helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
- helm/benchmark/metrics/disinformation_metrics.py +4 -110
- helm/benchmark/metrics/dry_run_metrics.py +2 -2
- helm/benchmark/metrics/efficiency_metrics.py +206 -0
- helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
- helm/benchmark/metrics/evaluate_reference_metrics.py +376 -0
- helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
- helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
- helm/benchmark/metrics/image_generation/clip_score_metrics.py +73 -0
- helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
- helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
- helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
- helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
- helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
- helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
- helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
- helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
- helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
- helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
- helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
- helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
- helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
- helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
- helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
- helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
- helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
- helm/benchmark/metrics/image_generation/q16/test_q16.py +18 -0
- helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
- helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
- helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
- helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
- helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
- helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
- helm/benchmark/metrics/instruction_following_critique_metrics.py +3 -1
- helm/benchmark/metrics/language_modeling_metrics.py +99 -0
- helm/benchmark/metrics/machine_translation_metrics.py +89 -0
- helm/benchmark/metrics/metric.py +93 -172
- helm/benchmark/metrics/metric_name.py +0 -1
- helm/benchmark/metrics/metric_service.py +16 -0
- helm/benchmark/metrics/paraphrase_generation_metrics.py +3 -4
- helm/benchmark/metrics/ranking_metrics.py +2 -2
- helm/benchmark/metrics/reference_metric.py +148 -0
- helm/benchmark/metrics/summac/model_summac.py +0 -2
- helm/benchmark/metrics/summarization_metrics.py +2 -2
- helm/benchmark/metrics/test_classification_metrics.py +8 -5
- helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
- helm/benchmark/metrics/{test_basic_metrics.py → test_evaluate_reference_metrics.py} +5 -1
- helm/benchmark/metrics/test_metric.py +2 -2
- helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +10 -2
- helm/benchmark/metrics/toxicity_metrics.py +1 -1
- helm/benchmark/metrics/toxicity_utils.py +23 -0
- helm/benchmark/metrics/unitxt_metrics.py +81 -0
- helm/benchmark/metrics/vision_language/__init__.py +0 -0
- helm/benchmark/metrics/vision_language/emd_utils.py +341 -0
- helm/benchmark/metrics/vision_language/image_metrics.py +450 -0
- helm/benchmark/metrics/vision_language/image_utils.py +100 -0
- helm/benchmark/model_deployment_registry.py +74 -0
- helm/benchmark/model_metadata_registry.py +36 -0
- helm/benchmark/multi_gpu_runner.py +133 -0
- helm/benchmark/presentation/create_plots.py +8 -7
- helm/benchmark/presentation/run_display.py +26 -10
- helm/benchmark/presentation/schema.py +15 -40
- helm/benchmark/presentation/summarize.py +119 -79
- helm/benchmark/presentation/table.py +8 -8
- helm/benchmark/presentation/test_contamination.py +2 -2
- helm/benchmark/presentation/test_run_entry.py +1 -2
- helm/benchmark/presentation/test_summarize.py +3 -3
- helm/benchmark/run.py +54 -26
- helm/benchmark/run_expander.py +214 -16
- helm/benchmark/run_spec.py +93 -0
- helm/benchmark/run_spec_factory.py +162 -0
- helm/benchmark/run_specs/__init__.py +0 -0
- helm/benchmark/run_specs/classic_run_specs.py +1510 -0
- helm/benchmark/run_specs/cleva_run_specs.py +277 -0
- helm/benchmark/run_specs/decodingtrust_run_specs.py +314 -0
- helm/benchmark/run_specs/heim_run_specs.py +623 -0
- helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
- helm/benchmark/run_specs/lite_run_specs.py +307 -0
- helm/benchmark/run_specs/simple_run_specs.py +104 -0
- helm/benchmark/run_specs/unitxt_run_specs.py +42 -0
- helm/benchmark/run_specs/vlm_run_specs.py +501 -0
- helm/benchmark/runner.py +51 -57
- helm/benchmark/runner_config_registry.py +21 -0
- helm/benchmark/scenarios/bbq_scenario.py +1 -1
- helm/benchmark/scenarios/bold_scenario.py +2 -2
- helm/benchmark/scenarios/code_scenario.py +1 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +169 -0
- helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +121 -0
- helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +77 -0
- helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +324 -0
- helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +204 -0
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +559 -0
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +67 -0
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +78 -0
- helm/benchmark/scenarios/dialogue_scenarios.py +0 -1
- helm/benchmark/scenarios/image_generation/__init__.py +0 -0
- helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
- helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
- helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
- helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
- helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
- helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
- helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
- helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
- helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
- helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
- helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
- helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
- helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
- helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
- helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
- helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
- helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
- helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
- helm/benchmark/scenarios/imdb_scenario.py +0 -1
- helm/benchmark/scenarios/live_qa_scenario.py +94 -0
- helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
- helm/benchmark/scenarios/math_scenario.py +19 -2
- helm/benchmark/scenarios/medication_qa_scenario.py +60 -0
- helm/benchmark/scenarios/numeracy_scenario.py +1 -1
- helm/benchmark/scenarios/opinions_qa_scenario.py +0 -4
- helm/benchmark/scenarios/scenario.py +4 -0
- helm/benchmark/scenarios/simple_scenarios.py +122 -1
- helm/benchmark/scenarios/test_math_scenario.py +6 -0
- helm/benchmark/scenarios/test_scenario.py +6 -3
- helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +135 -0
- helm/benchmark/scenarios/unitxt_scenario.py +56 -0
- helm/benchmark/scenarios/verifiability_judgment_scenario.py +3 -1
- helm/benchmark/scenarios/vicuna_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
- helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +92 -0
- helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
- helm/benchmark/scenarios/vision_language/image2structure/__init__.py +0 -0
- helm/benchmark/scenarios/vision_language/image2structure/chart2csv_scenario.py +55 -0
- helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +214 -0
- helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +25 -0
- helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +20 -0
- helm/benchmark/scenarios/vision_language/image2structure/utils_latex.py +347 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/driver.py +84 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/jekyll_server.py +182 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/utils.py +31 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +225 -0
- helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
- helm/benchmark/scenarios/vision_language/mme_scenario.py +145 -0
- helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
- helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
- helm/benchmark/scenarios/vision_language/pope_scenario.py +104 -0
- helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +129 -0
- helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
- helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +1 -2
- helm/benchmark/scenarios/vision_language/vqa_scenario.py +1 -1
- helm/benchmark/scenarios/wmt_14_scenario.py +1 -1
- helm/benchmark/server.py +24 -1
- helm/benchmark/slurm_runner.py +70 -49
- helm/benchmark/static/benchmarking.js +1 -1
- helm/benchmark/static/schema_classic.yaml +258 -1066
- helm/benchmark/static/schema_instruction_following.yaml +210 -0
- helm/benchmark/static/schema_lite.yaml +2 -227
- helm/benchmark/static/schema_mmlu.yaml +1507 -0
- helm/benchmark/static/schema_unitxt.yaml +428 -0
- helm/benchmark/static/schema_vlm.yaml +576 -0
- helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
- helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
- helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
- helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
- helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
- helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
- helm/benchmark/static_build/assets/crfm-logo-74391ab8.png +0 -0
- helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
- helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
- helm/benchmark/static_build/assets/heim-logo-3e5e3aa4.png +0 -0
- helm/benchmark/static_build/assets/helm-logo-simple-2ed5400b.png +0 -0
- helm/benchmark/static_build/assets/helmhero-28e90f4d.png +0 -0
- helm/benchmark/static_build/assets/index-5088afcb.css +1 -0
- helm/benchmark/static_build/assets/index-d839df55.js +9 -0
- helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
- helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
- helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
- helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
- helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
- helm/benchmark/static_build/assets/react-d4a0b69b.js +85 -0
- helm/benchmark/static_build/assets/recharts-6d337683.js +97 -0
- helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
- helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
- helm/benchmark/static_build/assets/tremor-54a99cc4.js +10 -0
- helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
- helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
- helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
- helm/benchmark/static_build/config.js +4 -0
- helm/benchmark/static_build/index.html +20 -0
- helm/benchmark/test_data_preprocessor.py +3 -3
- helm/benchmark/test_model_deployment_definition.py +14 -16
- helm/benchmark/test_run_expander.py +1 -1
- helm/benchmark/window_services/ai21_window_service.py +22 -33
- helm/benchmark/window_services/cohere_window_service.py +1 -63
- helm/benchmark/window_services/default_window_service.py +2 -44
- helm/benchmark/window_services/encoder_decoder_window_service.py +0 -11
- helm/benchmark/window_services/ice_window_service.py +0 -34
- helm/benchmark/window_services/image_generation/__init__.py +0 -0
- helm/benchmark/window_services/image_generation/clip_window_service.py +15 -0
- helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
- helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
- helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
- helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
- helm/benchmark/window_services/local_window_service.py +21 -4
- helm/benchmark/window_services/test_anthropic_window_service.py +2 -1
- helm/benchmark/window_services/test_bloom_window_service.py +2 -1
- helm/benchmark/window_services/test_cohere_window_service.py +2 -1
- helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
- helm/benchmark/window_services/test_gpt2_window_service.py +2 -2
- helm/benchmark/window_services/test_gpt4_window_service.py +2 -1
- helm/benchmark/window_services/test_gptj_window_service.py +3 -2
- helm/benchmark/window_services/test_gptneox_window_service.py +3 -2
- helm/benchmark/window_services/test_ice_window_service.py +2 -1
- helm/benchmark/window_services/test_openai_window_service.py +2 -1
- helm/benchmark/window_services/test_opt_window_service.py +3 -2
- helm/benchmark/window_services/test_palmyra_window_service.py +2 -1
- helm/benchmark/window_services/test_t0pp_window_service.py +2 -1
- helm/benchmark/window_services/test_t511b_window_service.py +2 -1
- helm/benchmark/window_services/test_ul2_window_service.py +2 -1
- helm/benchmark/window_services/test_utils.py +3 -2
- helm/benchmark/window_services/test_yalm_window_service.py +2 -1
- helm/benchmark/window_services/window_service.py +42 -0
- helm/benchmark/window_services/window_service_factory.py +4 -1
- helm/benchmark/window_services/yalm_window_service.py +0 -27
- helm/clients/__init__.py +0 -0
- helm/{proxy/clients → clients}/ai21_client.py +3 -9
- helm/clients/aleph_alpha_client.py +112 -0
- helm/{proxy/clients → clients}/anthropic_client.py +203 -18
- helm/{proxy/clients → clients}/auto_client.py +59 -31
- helm/clients/bedrock_client.py +128 -0
- helm/clients/bedrock_utils.py +72 -0
- helm/{proxy/clients → clients}/client.py +65 -7
- helm/clients/clip_score_client.py +49 -0
- helm/clients/clip_scorers/__init__.py +0 -0
- helm/clients/clip_scorers/base_clip_scorer.py +18 -0
- helm/clients/clip_scorers/clip_scorer.py +50 -0
- helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
- helm/{proxy/clients → clients}/cohere_client.py +4 -11
- helm/clients/gcs_client.py +82 -0
- helm/{proxy/clients → clients}/google_client.py +5 -5
- helm/clients/google_translate_client.py +35 -0
- helm/{proxy/clients → clients}/http_model_client.py +5 -7
- helm/{proxy/clients → clients}/huggingface_client.py +43 -64
- helm/clients/image_generation/__init__.py +0 -0
- helm/clients/image_generation/adobe_vision_client.py +78 -0
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +98 -0
- helm/clients/image_generation/cogview2/__init__.py +0 -0
- helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
- helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
- helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
- helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +96 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
- helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +141 -0
- helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
- helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
- helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
- helm/clients/image_generation/cogview2_client.py +191 -0
- helm/clients/image_generation/dalle2_client.py +192 -0
- helm/clients/image_generation/dalle3_client.py +108 -0
- helm/clients/image_generation/dalle_mini/__init__.py +3 -0
- helm/clients/image_generation/dalle_mini/data.py +442 -0
- helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
- helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
- helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
- helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
- helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
- helm/clients/image_generation/dalle_mini/model/text.py +251 -0
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
- helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
- helm/clients/image_generation/dalle_mini_client.py +190 -0
- helm/clients/image_generation/deep_floyd_client.py +78 -0
- helm/clients/image_generation/huggingface_diffusers_client.py +249 -0
- helm/clients/image_generation/image_generation_client_utils.py +9 -0
- helm/clients/image_generation/lexica_client.py +86 -0
- helm/clients/image_generation/mindalle/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/__init__.py +216 -0
- helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
- helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
- helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
- helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
- helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
- helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
- helm/clients/image_generation/mindalle/utils/config.py +129 -0
- helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
- helm/clients/image_generation/mindalle/utils/utils.py +89 -0
- helm/clients/image_generation/mindalle_client.py +115 -0
- helm/clients/image_generation/nudity_check_client.py +64 -0
- helm/clients/image_generation/together_image_generation_client.py +111 -0
- helm/{proxy/clients → clients}/lit_gpt_client.py +4 -4
- helm/{proxy/clients → clients}/megatron_client.py +5 -5
- helm/clients/mistral_client.py +134 -0
- helm/clients/moderation_api_client.py +109 -0
- helm/clients/open_lm_client.py +43 -0
- helm/clients/openai_client.py +302 -0
- helm/{proxy/clients → clients}/palmyra_client.py +6 -8
- helm/{proxy/clients → clients}/perspective_api_client.py +7 -8
- helm/clients/simple_client.py +64 -0
- helm/{proxy/clients → clients}/test_auto_client.py +13 -15
- helm/clients/test_client.py +100 -0
- helm/{proxy/clients → clients}/test_huggingface_client.py +15 -16
- helm/clients/test_simple_client.py +19 -0
- helm/{proxy/clients → clients}/test_together_client.py +20 -8
- helm/{proxy/clients → clients}/together_client.py +12 -72
- helm/clients/vertexai_client.py +391 -0
- helm/clients/vision_language/__init__.py +0 -0
- helm/clients/vision_language/huggingface_vlm_client.py +104 -0
- helm/{proxy/clients → clients}/vision_language/idefics_client.py +53 -48
- helm/clients/vision_language/open_flamingo/__init__.py +2 -0
- helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
- helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
- helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
- helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
- helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
- helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
- helm/clients/vision_language/open_flamingo_client.py +155 -0
- helm/clients/vision_language/qwen_vlm_client.py +171 -0
- helm/clients/vllm_client.py +46 -0
- helm/common/cache.py +16 -4
- helm/common/cache_backend_config.py +47 -0
- helm/common/clip_score_request.py +41 -0
- helm/common/file_caches/__init__.py +0 -0
- helm/common/file_caches/file_cache.py +16 -0
- helm/common/file_caches/local_file_cache.py +61 -0
- helm/common/file_caches/test_local_file_cache.py +25 -0
- helm/common/file_upload_request.py +27 -0
- helm/common/general.py +1 -1
- helm/common/image_generation_parameters.py +25 -0
- helm/common/images_utils.py +24 -1
- helm/common/key_value_store.py +35 -4
- helm/common/media_object.py +13 -0
- helm/common/moderations_api_request.py +71 -0
- helm/common/mongo_key_value_store.py +3 -3
- helm/common/multimodal_request_utils.py +31 -0
- helm/common/nudity_check_request.py +29 -0
- helm/common/request.py +15 -17
- helm/common/test_general.py +6 -0
- helm/common/tokenization_request.py +1 -1
- helm/config/model_deployments.yaml +1069 -546
- helm/config/model_metadata.yaml +753 -31
- helm/config/tokenizer_configs.yaml +142 -43
- helm/proxy/accounts.py +31 -4
- helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
- helm/proxy/critique/model_critique_client.py +8 -6
- helm/proxy/example_queries.py +29 -17
- helm/proxy/server.py +70 -5
- helm/proxy/services/remote_service.py +31 -0
- helm/proxy/services/server_service.py +96 -16
- helm/proxy/services/service.py +30 -0
- helm/proxy/services/test_remote_service.py +4 -3
- helm/proxy/services/test_service.py +0 -12
- helm/proxy/test_accounts.py +32 -0
- helm/proxy/token_counters/auto_token_counter.py +37 -37
- helm/proxy/token_counters/test_auto_token_counter.py +164 -0
- helm/proxy/token_counters/token_counter.py +3 -5
- helm/tokenizers/__init__.py +0 -0
- helm/{proxy/tokenizers → tokenizers}/ai21_tokenizer.py +3 -3
- helm/{proxy/tokenizers → tokenizers}/anthropic_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/auto_tokenizer.py +6 -9
- helm/{proxy/tokenizers → tokenizers}/cohere_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +3 -3
- helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +7 -26
- helm/tokenizers/simple_tokenizer.py +33 -0
- helm/{proxy/tokenizers → tokenizers}/test_anthropic_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/test_huggingface_tokenizer.py +3 -0
- helm/tokenizers/test_simple_tokenizer.py +33 -0
- helm/{proxy/tokenizers → tokenizers}/vertexai_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +5 -3
- helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
- helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
- crfm_helm-0.4.0.dist-info/RECORD +0 -397
- helm/benchmark/run_specs.py +0 -2762
- helm/benchmark/test_model_properties.py +0 -1570
- helm/benchmark/vlm_run_specs.py +0 -97
- helm/benchmark/window_services/flan_t5_window_service.py +0 -29
- helm/benchmark/window_services/gpt2_window_service.py +0 -32
- helm/benchmark/window_services/huggingface_window_service.py +0 -60
- helm/benchmark/window_services/t0pp_window_service.py +0 -35
- helm/benchmark/window_services/t511b_window_service.py +0 -30
- helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
- helm/benchmark/window_services/ul2_window_service.py +0 -30
- helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
- helm/common/cache_utils.py +0 -14
- helm/proxy/clients/aleph_alpha_client.py +0 -95
- helm/proxy/clients/goose_ai_client.py +0 -99
- helm/proxy/clients/microsoft_client.py +0 -180
- helm/proxy/clients/openai_client.py +0 -206
- helm/proxy/clients/simple_client.py +0 -60
- helm/proxy/clients/test_client.py +0 -49
- helm/proxy/clients/vertexai_client.py +0 -115
- helm/proxy/token_counters/ai21_token_counter.py +0 -20
- helm/proxy/token_counters/cohere_token_counter.py +0 -13
- helm/proxy/token_counters/free_token_counter.py +0 -12
- helm/proxy/token_counters/gooseai_token_counter.py +0 -24
- helm/proxy/token_counters/openai_token_counter.py +0 -22
- helm/proxy/token_counters/test_ai21_token_counter.py +0 -88
- helm/proxy/token_counters/test_openai_token_counter.py +0 -81
- helm/proxy/tokenizers/simple_tokenizer.py +0 -32
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/LICENSE +0 -0
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/top_level.txt +0 -0
- /helm/{proxy/clients → benchmark/annotation}/__init__.py +0 -0
- /helm/{proxy/clients/vision_language → benchmark/annotation/image2structure}/__init__.py +0 -0
- /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
- /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
- /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
- /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
- /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
- /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/ice_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/test_ice_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +0 -0

helm/benchmark/scenarios/math_scenario.py
@@ -1,9 +1,20 @@
 import collections
+import os
 import typing
 from typing import Dict, List, Optional
 from datasets import load_dataset, DatasetDict
 
-from .
+from helm.common.general import ensure_directory_exists
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TRAIN_SPLIT,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
 
 
 def remove_boxed(string: str) -> Optional[str]:
@@ -354,7 +365,13 @@ class MATHScenario(Scenario):
 
     def get_instances(self, output_path: str) -> List[Instance]:
         dataset = {}
-
+        cache_dir = os.path.join(output_path, "data")
+        ensure_directory_exists(cache_dir)
+        data = (
+            typing.cast(DatasetDict, load_dataset("competition_math", cache_dir=cache_dir))
+            .sort("problem")
+            .shuffle(seed=42)
+        )
 
         def group_by_key(dataset_list, key):
             dataset_per_key = collections.defaultdict(list)
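
Note: the rewritten loader caches the dataset under the scenario's own output path and fixes the instance order (sort by "problem", then shuffle with a fixed seed), so the example order is reproducible regardless of how the rows come back from the Hub. A minimal standalone sketch of the same pattern, assuming the `datasets` library is installed; the cache path here is arbitrary:

    from datasets import load_dataset

    data = load_dataset("competition_math", cache_dir="/tmp/math_data")  # arbitrary cache location
    data = data.sort("problem").shuffle(seed=42)  # deterministic example order across runs
    print(data["train"][0]["problem"])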

helm/benchmark/scenarios/medication_qa_scenario.py
@@ -0,0 +1,60 @@
+import os
+from typing import List
+
+import pandas as pd
+
+from helm.common.general import ensure_file_downloaded
+
+from .scenario import CORRECT_TAG, TEST_SPLIT, Input, Instance, Output, Reference, Scenario
+
+
+class MedicationQAScenario(Scenario):
+    """
+    The gold standard corpus for medication question answering introduced in the MedInfo 2019 paper
+    "Bridging the Gap between Consumers’ Medication Questions and Trusted Answers":
+    http://ebooks.iospress.nl/publication/51941
+
+    This dataset has consumer questions, as opposed to very clinical questions.
+
+    Paper citation:
+
+        @inproceedings{BenAbacha:MEDINFO19,
+            author = {Asma {Ben Abacha} and Yassine Mrabet and Mark Sharp and
+                      Travis Goodwin and Sonya E. Shooshan and Dina Demner{-}Fushman},
+            title = {Bridging the Gap between Consumers’ Medication Questions and Trusted Answers},
+            booktitle = {MEDINFO 2019},
+            year = {2019},
+        }
+    """
+
+    SOURCE_REPO_URL = "https://github.com/abachaa/Medication_QA_MedInfo2019/raw/master/"
+    FILENAME = "MedInfo2019-QA-Medications.xlsx"
+
+    name = "medication_qa"
+    description = "MedInfo 2019 MedicationQA medication question answering task"
+    tags = ["knowledge", "generation", "question_answering", "biomedical"]
+
+    def download_medication_qa(self, path: str):
+        """download the .xlsx spreadsheet containing the question-answer pairs"""
+        ensure_file_downloaded(
+            source_url=os.path.join(self.SOURCE_REPO_URL, self.FILENAME),
+            target_path=os.path.join(path, self.FILENAME),
+            unpack=False,
+        )
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        self.download_medication_qa(output_path)
+        data_path = os.path.join(output_path, self.FILENAME)
+
+        data = pd.read_excel(data_path)
+        data = data[~data.Answer.isna()]  # remove rows missing answers
+        instances = [
+            Instance(
+                input=Input(row.Question),
+                references=[Reference(Output(row.Answer), tags=[CORRECT_TAG])],
+                split=TEST_SPLIT,
+            )
+            for _, row in data.iterrows()
+        ]
+
+        return instances
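
Note: get_instances keeps only spreadsheet rows that actually have an answer before building test instances. A tiny self-contained sketch of that pandas filtering step, on made-up data:

    import pandas as pd

    df = pd.DataFrame({"Question": ["q1", "q2"], "Answer": ["a1", None]})
    df = df[~df.Answer.isna()]  # keep only rows with a non-missing Answer
    assert list(df.Question) == ["q1"]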

helm/benchmark/scenarios/numeracy_scenario.py
@@ -358,7 +358,7 @@ def distance_paraboloid(point: List[int], rel_str: str, TOL: float = 1e-10):
     sols = []
     # Try each possible combined solution for x, y, z, λ
     for sol_xyz, val_λs in zip(sols_xyz, vals_λ):
-        val_λs =
+        val_λs = tuple(set(filter(lambda _: not _.is_symbol, val_λs)))  # get distinct values for λ if there are any
         if len(val_λs) > 1:  # there can be at most one distinct value for λ
             continue
         val_λ = val_λs[0] if val_λs else λ

helm/benchmark/scenarios/opinions_qa_scenario.py
@@ -108,7 +108,6 @@ class OpinionsQAScenario(Scenario):
         self.context: str = context
 
     def download_data(self, output_path: str):
-
         data_dir: str = os.path.join(output_path, "data")
         if not os.path.exists(data_dir):
             os.makedirs(data_dir)
@@ -150,14 +149,12 @@ class OpinionsQAScenario(Scenario):
         bios_df = pd.read_csv(bios_path, sep="\t")
 
         for split in all_splits:
-
             csv_path: str = csv_dict[split]
             assert os.path.exists(csv_path)
 
             question_df = self.read_survey_questions(csv_path)
 
             for qidx, (question, answers) in enumerate(zip(question_df["question"], question_df["options"])):
-
                 # Opinions QA test questions have no correct answer and thus we set it to be None by default
                 # for all test instances.
                 # In the case where context = steer-qa, we add demographic information in the form of a
@@ -182,7 +179,6 @@ class OpinionsQAScenario(Scenario):
             else:
                 # context = "steer-bio"or "steer-portray"
                 for bio in bios_df["question"].values:
-
                     context = PassageQuestionInput(passage=bio, question=question + "\n")
                     instance = Instance(
                         context,

helm/benchmark/scenarios/scenario.py
@@ -25,6 +25,10 @@ DEFAULT_TEST_SIZE: int = 1000
 """ Reference tags """
 CORRECT_TAG: str = "correct"
 
+""" Asset tags (used for compiled outputs such as image2structure)"""
+ASSET_NAME_TAG: str = "asset_name"
+ASSET_PATH_TAG: str = "asset_path"
+
 # Reference tag functions for ranking scenarios.
 # @TODO: (For future) Should there be a base RankingScenario class?
 

helm/benchmark/scenarios/simple_scenarios.py
@@ -1,7 +1,128 @@
+"""Simple scenarios for debugging and for tutorials.
+
+NOTE: Typically, each scenario should be in its own file,
+but these scenarios are placed in the same module for
+tutorial purposes."""
+
 import random
 from typing import List
 
-from .scenario import
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TRAIN_SPLIT,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+
+
+class SimpleMCQAScenario(Scenario):
+    """Simple multiple-choice question answering scenario for tutorials and debugging.
+
+    The task is to answer questions about whether two-digit numbers are even or odd.
+
+    Example:
+
+        Answer the following questions with a single letter only.
+
+        Question: Is 24 even or odd?
+        A. Even
+        B. Odd
+        Answer: A"""
+
+    name = "simple_mcqa"
+    description = "Answer if two-digit numbers are even or odd."
+    tags = ["question answering"]
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        instances: List[Instance] = []
+        for i in range(10, 100):
+            # NOTE: For simplicity, the input text and reference output text
+            # is the same for all instances.
+            # However, for most question answering scenarios, the input text
+            # and reference output text can vary between questions.
+            input = Input(text=f"Is {i} even or odd?")
+            references = [
+                Reference(Output(text="Even"), tags=[CORRECT_TAG] if i % 2 == 0 else []),
+                Reference(Output(text="Odd"), tags=[CORRECT_TAG] if i % 2 == 1 else []),
+            ]
+            split = TRAIN_SPLIT if i <= 20 else TEST_SPLIT
+            instance = Instance(input=input, references=references, split=split)
+            instances.append(instance)
+        return instances
+
+
+class SimpleShortAnswerQAScenario(Scenario):
+    """Simple short answer question answering scenario for tutorials and debugging.
+
+    The task is to answer questions about whether two-digit numbers are even or odd.
+
+    Example:
+
+        Answer the following questions with a single word only.
+
+        Question: Is 24 even or odd?
+        Answer: Even"""
+
+    name = "simple_mcqa"
+    description = "Answer if two-digit numbers are even or odd."
+    tags = ["question answering"]
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        instances: List[Instance] = []
+        for i in range(10, 100):
+            # NOTE: For simplicity, the input text and reference output text
+            # is the same for all instances.
+            # However, for most question answering scenarios, the input text
+            # and reference output text can vary between questions.
+            input = Input(text=f"Is {i} even or odd?")
+            correct_answer = "Even" if i % 2 == 0 else "Odd"
+            # NOTE: Unlike multiple-choice question answering, only the correct
+            # references are needed for short-answer question answering.
+            references = [
+                Reference(Output(text=correct_answer), tags=[CORRECT_TAG]),
+            ]
+            split = TRAIN_SPLIT if i <= 20 else TEST_SPLIT
+            instance = Instance(input=input, references=references, split=split)
+            instances.append(instance)
+        return instances
+
+
+class SimpleClassificationScenario(Scenario):
+    """Simple multiple-choice question answering scenario for tutorials and debugging.
+
+    The task is to classify two-digit numbers as even or odd.
+
+    Example:
+
+        Classify the following numbers by their pairity. The classes are "Even" and "Odd".
+
+        Number: 24
+        Pairity: Even"""
+
+    name = "simple_classification"
+    description = "Classify numbers by pairity."
+    tags = ["classification"]
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        instances: List[Instance] = []
+        for i in range(10, 100):
+            input = Input(text=str(i))
+            # NOTE: For classification scenarios, the reference outputs should be the same
+            # for all instances, and should include both correct and incorrect classes.
+            # HELM only supports single-label classification. Exactly one reference
+            # should have the CORRECT_TAG tag.
+            references = [
+                Reference(Output(text="Even"), tags=[CORRECT_TAG] if i % 2 == 0 else []),
+                Reference(Output(text="Odd"), tags=[CORRECT_TAG] if i % 2 == 1 else []),
+            ]
+            split = TRAIN_SPLIT if i <= 20 else TEST_SPLIT
+            instance = Instance(input=input, references=references, split=split)
+            instances.append(instance)
+        return instances
 
 
 class Simple1Scenario(Scenario):
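
Note: these tutorial scenarios go through the same machinery as any other HELM scenario. A minimal sketch of instantiating one dynamically via the ScenarioSpec/create_scenario pattern (the same pattern the test_scenario.py change below uses); the output path here is arbitrary and unused by these scenarios:

    from helm.benchmark.scenarios.scenario import ScenarioSpec, create_scenario

    spec = ScenarioSpec(class_name="helm.benchmark.scenarios.simple_scenarios.SimpleMCQAScenario", args={})
    scenario = create_scenario(spec)
    instances = scenario.get_instances(output_path="/tmp/simple_mcqa")
    assert len(instances) == 90  # one instance per two-digit number 10..99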

helm/benchmark/scenarios/test_math_scenario.py
@@ -1,9 +1,15 @@
+import pytest
 from tempfile import TemporaryDirectory
 
 from helm.benchmark.scenarios.math_scenario import MATHScenario
 from helm.benchmark.scenarios.scenario import Input, Output, Reference
 
 
+# TODO: Fix the test for newer versions of diffusers: https://github.com/stanford-crfm/helm/issues/2168
+@pytest.mark.skip(
+    reason="Incompatible with newer versions with diffusers>0.24.0. Fails with "
+    '"Loading a dataset cached in a LocalFileSystem is not supported"'
+)
 def test_math_scenario_get_instances():
     math_scenario = MATHScenario(subject="number_theory", level="1")
     with TemporaryDirectory() as tmpdir:

helm/benchmark/scenarios/test_scenario.py
@@ -1,10 +1,13 @@
-from helm.benchmark.
-from helm.benchmark.scenarios.scenario import create_scenario, Scenario, Input, PassageQuestionInput
+from helm.benchmark.scenarios.scenario import ScenarioSpec, create_scenario, Scenario, Input, PassageQuestionInput
 
 
 class TestScenario:
     def setup_method(self, method):
-
+        scenario_spec: ScenarioSpec = ScenarioSpec(
+            class_name="helm.benchmark.scenarios.simple_scenarios.Simple1Scenario",
+            args={"num_input_tokens": 5, "vocab_size": 20, "num_train_instances": 2, "num_test_instances": 2},
+        )
+        self.scenario: Scenario = create_scenario(scenario_spec)
 
     def test_render_lines(self):
         instances = self.scenario.get_instances(output_path="")

helm/benchmark/scenarios/test_simple_scenarios.py
@@ -0,0 +1,50 @@
+import pytest
+from tempfile import TemporaryDirectory
+
+from helm.benchmark.scenarios.simple_scenarios import (
+    SimpleMCQAScenario,
+    SimpleShortAnswerQAScenario,
+    SimpleClassificationScenario,
+)
+from helm.benchmark.scenarios.scenario import CORRECT_TAG, Input, Output, Reference
+
+
+@pytest.mark.scenarios
+def test_simple_mcqa_scenario():
+    scenario = SimpleMCQAScenario()
+    with TemporaryDirectory() as tmpdir:
+        instances = scenario.get_instances(tmpdir)
+        assert len(instances) == 90
+        assert instances[0].input == Input(text="Is 10 even or odd?")
+        assert instances[0].references == [
+            Reference(output=Output(text="Even"), tags=[CORRECT_TAG]),
+            Reference(output=Output(text="Odd"), tags=[]),
+        ]
+        assert instances[0].split == "train"
+
+
+@pytest.mark.scenarios
+def test_simple_short_answer_qa_scenario():
+    scenario = SimpleShortAnswerQAScenario()
+    with TemporaryDirectory() as tmpdir:
+        instances = scenario.get_instances(tmpdir)
+        assert len(instances) == 90
+        assert instances[0].input == Input(text="Is 10 even or odd?")
+        assert instances[0].references == [
+            Reference(output=Output(text="Even"), tags=[CORRECT_TAG]),
+        ]
+        assert instances[0].split == "train"
+
+
+@pytest.mark.scenarios
+def test_simple_classification_scenario():
+    scenario = SimpleClassificationScenario()
+    with TemporaryDirectory() as tmpdir:
+        instances = scenario.get_instances(tmpdir)
+        assert len(instances) == 90
+        assert instances[0].input == Input(text="10")
+        assert instances[0].references == [
+            Reference(output=Output(text="Even"), tags=[CORRECT_TAG]),
+            Reference(output=Output(text="Odd"), tags=[]),
+        ]
+        assert instances[0].split == "train"

helm/benchmark/scenarios/thai_exam_scenario.py
@@ -0,0 +1,135 @@
+import os
+from typing import Dict, List
+import json
+
+from helm.common.general import ensure_file_downloaded
+from helm.common.hierarchical_logger import hlog
+from .scenario import Scenario, Instance, Reference, TRAIN_SPLIT, TEST_SPLIT, CORRECT_TAG, Input, Output
+
+
+class ThaiExamScenario(Scenario):
+    """
+    ThaiExam, a benchmark comprising Thai multiple-choice examinations as follows:
+
+    ∙ ONET: The Ordinary National Educational Test (ONET) is an examination for students in Thailand.
+    We select the grade-12 ONET exam, which comprises 5 subjects and each question has 5 choices.
+    These subjects are Thai, English, Mathematics, Social Studies, and Science.
+    Amounting to a total of 170 questions and options.
+
+    ∙ IC: The Investment Consultant (IC) examination, a licensing test for investment professionals in Thailand.
+    Developed by the Stock Exchange of Thailand (SET), features 4 choices per question.
+    We extracted questions for levels 1, 2, and 3 resulting in a total of 95 questions and options.
+
+    ∙ TGAT: The Thai General Aptitude Test (TGAT), a national high school examination in Thailand.
+    Focuses on critical and logical thinking skills.
+    We collected a total of 90 questions and answers. The TGAT consists of four choices per question.
+
+    ∙ TPAT-1: The Thai Professional Aptitude Test 1 (TPAT-1) is a national high school examination in Thailand.
+    The Exam assesses students’ professional skills requirement in medical schools.
+    This subset contains reasoning and medical ethics. We collected a total of 116 questions and answers.
+    The TPAT-1 consists of 5 choices per question.
+
+    ∙ A-Level: An academic knowledge assessment examination (Applied Knowledge Level)
+    that covers general foundational subjects taught in schools.
+    The content assessed in this examination aligns with the curriculum guidelines
+    and emphasizes the practical application of knowledge in daily life.
+    We collected a total of 175 questions and answers.
+
+    We created and used these exams to evaluate the performance of the Typhoon models(https://arxiv.org/abs/2312.13951).
+
+    Prompt models using the following format
+
+        <input>                  # train
+        A. <reference>
+        B. <reference>
+        C. <reference>
+        D. <reference>
+        E. <reference>
+        Answer: <A/B/C/D/E>
+
+        x N (N-shot)
+
+        <input>                  # test
+        A. <reference1>
+        B. <reference2>
+        C. <reference3>
+        D. <reference4>
+        E. <reference5>
+        Answer:
+
+    For example:
+
+        ในระบบย่อยอาหารของมนุษย์ การดูดซึมสารอาหารส่วนใหญ่เกิดขึ้นที่อวัยวะใด?
+        A. ลำไส้เล็ก
+        B. ตับอ่อน
+        C. ลำไส้ใหญ่
+        D. กระเพาะอาหาร
+        E. หัวใจ
+        Answer: A
+
+        ข้อใดอธิบายเกี่ยวกับแรงไฟฟ้าได้ถูกต้อง?
+        A. เกิดได้โดยที่วัตถุไม่ต้องสัมผัสกัน
+        B. เป็นได้เฉพาะแรงผลักเท่านั้น
+        C. เป็นได้เฉพาะแรงดูดเท่านั้น
+        D. เป็นแรงต้านระหว่างวัตถุเท่านั้น
+        E. ถูกทุกข้อ
+        Answer:
+
+        Target: A
+    """
+
+    name = "thai_exam"
+    description = "ThaiExam benchmark comprising Thai multiple-choice examinations."
+    tags = ["knowledge", "multiple_choice"]
+
+    def __init__(self, exam: str):
+        super().__init__()
+        self.exam = exam
+
+    def download_thai_exam(self, path: str):
+        ensure_file_downloaded(
+            "https://storage.googleapis.com/thai_dataset/thai_exam.tar.gz",
+            target_path=path,
+            unpack=True,
+        )
+
+    def process_jsonl(self, jsonl_path: str, split: str) -> List[Instance]:
+        instances: List[Instance] = []
+        hlog(f"Reading {jsonl_path}")
+        with open(jsonl_path, "r") as f:
+            for line in f:
+                data = json.loads(line)
+                # for handle missing key incase of some subject doesn't have all 5 choices
+                answers = [data[key] for key in ["a", "b", "c", "d", "e"] if key in data and data[key] != ""]
+                answers_dict = dict(zip(["A", "B", "C", "D", "E"], answers))
+
+                question, correct_answer = data["question"], answers_dict[data["answer"].upper()]
+
+                def answer_to_reference(answer: str) -> Reference:
+                    return Reference(Output(text=answer), tags=[CORRECT_TAG] if answer == correct_answer else [])
+
+                instance = Instance(
+                    input=Input(text=question),
+                    references=list(map(answer_to_reference, answers)),
+                    split=split,
+                )
+                instances.append(instance)
+        return instances
+
+    def get_instances(self, output_path) -> List[Instance]:
+        data_path: str = os.path.join(output_path, "data")
+        self.download_thai_exam(data_path)
+
+        instances: List[Instance] = []
+        splits: Dict[str, str] = {
+            "train": TRAIN_SPLIT,
+            "test": TEST_SPLIT,
+        }
+        for split in splits:
+            jsonl_path: str = os.path.join(data_path, self.exam, f"{self.exam}_{split}.jsonl")
+            if not os.path.exists(jsonl_path):
+                hlog(f"{jsonl_path} doesn't exist, skipping")
+                continue
+            instances.extend(self.process_jsonl(jsonl_path, splits[split]))
+
+        return instances
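
Note: process_jsonl expects one JSON object per line, with option texts under the lettered keys "a".."e" (some may be absent or empty) and the correct letter under "answer". A hypothetical row traced through the same logic as the code above:

    import json

    line = '{"question": "2 + 2 = ?", "a": "3", "b": "4", "c": "5", "answer": "b"}'
    data = json.loads(line)
    answers = [data[k] for k in ["a", "b", "c", "d", "e"] if k in data and data[k] != ""]
    answers_dict = dict(zip(["A", "B", "C", "D", "E"], answers))
    assert answers_dict[data["answer"].upper()] == "4"  # "b" maps to the second option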

helm/benchmark/scenarios/unitxt_scenario.py
@@ -0,0 +1,56 @@
+from typing import List
+from datasets import load_dataset
+
+from helm.benchmark.scenarios.scenario import (
+    Output,
+    Reference,
+    Scenario,
+    Instance,
+    Input,
+    CORRECT_TAG,
+    TRAIN_SPLIT,
+    TEST_SPLIT,
+    VALID_SPLIT,
+)
+
+
+class UnitxtScenario(Scenario):
+    """Integration with Unitxt: https://unitxt.rtfd.io/"""
+
+    name = "unitxt"
+    description = "Unitxt Scenarios"
+    tags = ["unitxt"]
+
+    UNITXT_SPLIT_NAME_TO_HELM_SPLIT_NAME = {
+        "train": TRAIN_SPLIT,
+        "test": TEST_SPLIT,
+        "validation": VALID_SPLIT,
+    }
+
+    def __init__(self, **kwargs):
+        super().__init__()
+        self.kwargs = kwargs
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        dataset_name = ",".join(f"{key}={value}" for key, value in self.kwargs.items())
+        dataset = load_dataset("unitxt/data", dataset_name, trust_remote_code=True)
+
+        instances: List[Instance] = []
+
+        for unitxt_split_name, helm_split_name in UnitxtScenario.UNITXT_SPLIT_NAME_TO_HELM_SPLIT_NAME.items():
+            for index, row in enumerate(dataset[unitxt_split_name]):
+                references = [
+                    Reference(
+                        output=Output(text=reference_text),
+                        tags=[CORRECT_TAG],
+                    )
+                    for reference_text in row["references"]
+                ]
+                instance = Instance(
+                    id=f"{unitxt_split_name}{index}",
+                    input=Input(text=row["source"]),
+                    references=references,
+                    split=helm_split_name,
+                )
+                instances.append(instance)
+        return instances
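
Note: UnitxtScenario forwards its keyword arguments to Unitxt by flattening them into the comma-separated dataset name that the "unitxt/data" loader expects. A sketch of that join; the card and template names are hypothetical:

    kwargs = {"card": "cards.wnli", "template": "templates.classification.multi_class.relation.default"}
    dataset_name = ",".join(f"{key}={value}" for key, value in kwargs.items())
    # dataset_name == "card=cards.wnli,template=templates.classification.multi_class.relation.default"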

helm/benchmark/scenarios/verifiability_judgment_scenario.py
@@ -137,7 +137,9 @@ class VerifiabilityJudgementScenario(Scenario):
         for _, filesplit in split_to_filesplit.items():
             target_name = f"verifiability_judgments_{filesplit}.jsonl"
             target_path: str = os.path.join(data_path, target_name)
-            url: str =
+            url: str = (
+                f"https://github.com/nelson-liu/evaluating-verifiability-in-generative-search-engines/raw/40bf37e3a4eca7d82515df2c800ec9605458d637/verifiability_judgments/{target_name}.gz"  # noqa: E501
+            )
             ensure_file_downloaded(source_url=url, target_path=target_path)
             assert os.path.exists(target_path)
 

helm/benchmark/scenarios/vicuna_scenario.py
@@ -26,7 +26,7 @@ class VicunaScenario(Scenario):
             return self.category == "all" or raw["category"] == self.category
 
         # Download the raw data
-        source_url = "https://raw.githubusercontent.com/lm-sys/FastChat/
+        source_url = "https://raw.githubusercontent.com/lm-sys/FastChat/v0.2.5/fastchat/eval/table/question.jsonl"
         data_path: str = os.path.join(output_path, "vicuna_questions.jsonl")
 
         ensure_file_downloaded(