crfm-helm 0.4.0__py3-none-any.whl → 0.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of crfm-helm might be problematic.
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/METADATA +138 -31
- crfm_helm-0.5.1.dist-info/RECORD +654 -0
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +31 -3
- helm/benchmark/adaptation/adapters/adapter.py +2 -2
- helm/benchmark/adaptation/adapters/adapter_factory.py +24 -27
- helm/benchmark/adaptation/adapters/generation_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +20 -4
- helm/benchmark/adaptation/adapters/language_modeling_adapter.py +2 -3
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/multimodal/multimodal_prompt.py +7 -0
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +104 -0
- helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +2 -1
- helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +2 -0
- helm/benchmark/adaptation/adapters/test_adapter.py +2 -1
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +32 -8
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +7 -19
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +60 -6
- helm/benchmark/adaptation/common_adapter_specs.py +376 -0
- helm/benchmark/adaptation/request_state.py +6 -1
- helm/benchmark/adaptation/scenario_state.py +6 -2
- helm/benchmark/annotation/annotator.py +43 -0
- helm/benchmark/annotation/annotator_factory.py +61 -0
- helm/benchmark/annotation/image2structure/image_compiler_annotator.py +88 -0
- helm/benchmark/annotation/image2structure/latex_compiler_annotator.py +59 -0
- helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +84 -0
- helm/benchmark/annotation/image2structure/webpage_compiler_annotator.py +132 -0
- helm/benchmark/annotation/test_annotator_factory.py +26 -0
- helm/benchmark/annotation/test_dummy_annotator.py +44 -0
- helm/benchmark/annotation_executor.py +124 -0
- helm/benchmark/augmentations/data_augmenter.py +0 -2
- helm/benchmark/augmentations/gender_perturbation.py +1 -1
- helm/benchmark/augmentations/perturbation.py +25 -3
- helm/benchmark/augmentations/perturbation_description.py +1 -1
- helm/benchmark/augmentations/suffix_perturbation.py +29 -0
- helm/benchmark/augmentations/test_perturbation.py +41 -7
- helm/benchmark/augmentations/translate_perturbation.py +30 -0
- helm/benchmark/config_registry.py +7 -1
- helm/benchmark/executor.py +46 -16
- helm/benchmark/huggingface_registration.py +20 -7
- helm/benchmark/metrics/basic_metrics.py +169 -664
- helm/benchmark/metrics/bbq_metrics.py +3 -4
- helm/benchmark/metrics/bias_metrics.py +6 -6
- helm/benchmark/metrics/classification_metrics.py +11 -8
- helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
- helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
- helm/benchmark/metrics/code_metrics_helper.py +0 -2
- helm/benchmark/metrics/common_metric_specs.py +167 -0
- helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
- helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
- helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
- helm/benchmark/metrics/disinformation_metrics.py +4 -110
- helm/benchmark/metrics/dry_run_metrics.py +2 -2
- helm/benchmark/metrics/efficiency_metrics.py +213 -0
- helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
- helm/benchmark/metrics/evaluate_reference_metrics.py +392 -0
- helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
- helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
- helm/benchmark/metrics/image_generation/clip_score_metrics.py +73 -0
- helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
- helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
- helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
- helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
- helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
- helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
- helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
- helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
- helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
- helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
- helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
- helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
- helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
- helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
- helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
- helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
- helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
- helm/benchmark/metrics/image_generation/q16/test_q16.py +18 -0
- helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
- helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
- helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
- helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
- helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
- helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
- helm/benchmark/metrics/instruction_following_critique_metrics.py +3 -1
- helm/benchmark/metrics/language_modeling_metrics.py +99 -0
- helm/benchmark/metrics/machine_translation_metrics.py +89 -0
- helm/benchmark/metrics/metric.py +93 -172
- helm/benchmark/metrics/metric_name.py +0 -1
- helm/benchmark/metrics/metric_service.py +16 -0
- helm/benchmark/metrics/paraphrase_generation_metrics.py +3 -4
- helm/benchmark/metrics/ranking_metrics.py +2 -2
- helm/benchmark/metrics/reference_metric.py +148 -0
- helm/benchmark/metrics/summac/model_summac.py +0 -2
- helm/benchmark/metrics/summarization_metrics.py +2 -2
- helm/benchmark/metrics/test_classification_metrics.py +8 -5
- helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
- helm/benchmark/metrics/{test_basic_metrics.py → test_evaluate_reference_metrics.py} +5 -1
- helm/benchmark/metrics/test_metric.py +2 -2
- helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +10 -2
- helm/benchmark/metrics/toxicity_metrics.py +1 -1
- helm/benchmark/metrics/toxicity_utils.py +23 -0
- helm/benchmark/metrics/unitxt_metrics.py +81 -0
- helm/benchmark/metrics/vision_language/__init__.py +0 -0
- helm/benchmark/metrics/vision_language/emd_utils.py +341 -0
- helm/benchmark/metrics/vision_language/image_metrics.py +575 -0
- helm/benchmark/metrics/vision_language/image_utils.py +100 -0
- helm/benchmark/model_deployment_registry.py +74 -0
- helm/benchmark/model_metadata_registry.py +41 -1
- helm/benchmark/multi_gpu_runner.py +133 -0
- helm/benchmark/presentation/create_plots.py +8 -7
- helm/benchmark/presentation/run_display.py +26 -10
- helm/benchmark/presentation/schema.py +15 -40
- helm/benchmark/presentation/summarize.py +119 -79
- helm/benchmark/presentation/table.py +8 -8
- helm/benchmark/presentation/test_contamination.py +2 -2
- helm/benchmark/presentation/test_run_entry.py +1 -2
- helm/benchmark/presentation/test_summarize.py +3 -3
- helm/benchmark/run.py +54 -26
- helm/benchmark/run_expander.py +205 -35
- helm/benchmark/run_spec.py +93 -0
- helm/benchmark/run_spec_factory.py +163 -0
- helm/benchmark/run_specs/__init__.py +0 -0
- helm/benchmark/run_specs/classic_run_specs.py +1510 -0
- helm/benchmark/run_specs/cleva_run_specs.py +277 -0
- helm/benchmark/run_specs/decodingtrust_run_specs.py +314 -0
- helm/benchmark/run_specs/heim_run_specs.py +623 -0
- helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
- helm/benchmark/run_specs/lite_run_specs.py +307 -0
- helm/benchmark/run_specs/simple_run_specs.py +104 -0
- helm/benchmark/run_specs/unitxt_run_specs.py +42 -0
- helm/benchmark/run_specs/vlm_run_specs.py +757 -0
- helm/benchmark/runner.py +51 -57
- helm/benchmark/runner_config_registry.py +21 -0
- helm/benchmark/scenarios/bbq_scenario.py +1 -1
- helm/benchmark/scenarios/bold_scenario.py +2 -2
- helm/benchmark/scenarios/code_scenario.py +1 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +169 -0
- helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +121 -0
- helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +77 -0
- helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +324 -0
- helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +204 -0
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +559 -0
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +67 -0
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +78 -0
- helm/benchmark/scenarios/dialogue_scenarios.py +0 -1
- helm/benchmark/scenarios/image_generation/__init__.py +0 -0
- helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
- helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
- helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
- helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
- helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
- helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
- helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
- helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
- helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
- helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
- helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
- helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
- helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
- helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
- helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
- helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
- helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
- helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
- helm/benchmark/scenarios/imdb_scenario.py +0 -1
- helm/benchmark/scenarios/legalbench_scenario.py +6 -2
- helm/benchmark/scenarios/live_qa_scenario.py +94 -0
- helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
- helm/benchmark/scenarios/math_scenario.py +19 -2
- helm/benchmark/scenarios/medication_qa_scenario.py +60 -0
- helm/benchmark/scenarios/numeracy_scenario.py +1 -1
- helm/benchmark/scenarios/opinions_qa_scenario.py +0 -4
- helm/benchmark/scenarios/scenario.py +4 -0
- helm/benchmark/scenarios/simple_scenarios.py +122 -1
- helm/benchmark/scenarios/test_math_scenario.py +6 -0
- helm/benchmark/scenarios/test_scenario.py +6 -3
- helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +135 -0
- helm/benchmark/scenarios/unitxt_scenario.py +56 -0
- helm/benchmark/scenarios/verifiability_judgment_scenario.py +3 -1
- helm/benchmark/scenarios/vicuna_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +83 -0
- helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
- helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +134 -0
- helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +74 -0
- helm/benchmark/scenarios/vision_language/gqa_scenario.py +91 -0
- helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +94 -0
- helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
- helm/benchmark/scenarios/vision_language/image2structure/__init__.py +0 -0
- helm/benchmark/scenarios/vision_language/image2structure/chart2csv_scenario.py +55 -0
- helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +214 -0
- helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +25 -0
- helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +20 -0
- helm/benchmark/scenarios/vision_language/image2structure/utils_latex.py +347 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/driver.py +84 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/jekyll_server.py +182 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/utils.py +31 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +225 -0
- helm/benchmark/scenarios/vision_language/math_vista_scenario.py +117 -0
- helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
- helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +103 -0
- helm/benchmark/scenarios/vision_language/mme_scenario.py +145 -0
- helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
- helm/benchmark/scenarios/vision_language/mscoco_captioning_scenario.py +92 -0
- helm/benchmark/scenarios/vision_language/mscoco_categorization_scenario.py +117 -0
- helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
- helm/benchmark/scenarios/vision_language/originality_scenario.py +35 -0
- helm/benchmark/scenarios/vision_language/pairs_scenario.py +246 -0
- helm/benchmark/scenarios/vision_language/pope_scenario.py +104 -0
- helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +129 -0
- helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
- helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +3 -4
- helm/benchmark/scenarios/vision_language/vqa_scenario.py +5 -3
- helm/benchmark/scenarios/wmt_14_scenario.py +1 -1
- helm/benchmark/server.py +24 -1
- helm/benchmark/slurm_runner.py +70 -49
- helm/benchmark/static/benchmarking.js +1 -1
- helm/benchmark/static/schema_classic.yaml +258 -1066
- helm/benchmark/static/schema_image2structure.yaml +304 -0
- helm/benchmark/static/schema_instruction_following.yaml +210 -0
- helm/benchmark/static/schema_lite.yaml +2 -227
- helm/benchmark/static/schema_mmlu.yaml +1507 -0
- helm/benchmark/static/schema_unitxt.yaml +428 -0
- helm/benchmark/static/schema_vhelm_lite.yaml +164 -0
- helm/benchmark/static/schema_vlm.yaml +823 -0
- helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
- helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
- helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
- helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
- helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
- helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
- helm/benchmark/static_build/assets/crfm-logo-74391ab8.png +0 -0
- helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
- helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
- helm/benchmark/static_build/assets/heim-logo-3e5e3aa4.png +0 -0
- helm/benchmark/static_build/assets/helm-logo-simple-2ed5400b.png +0 -0
- helm/benchmark/static_build/assets/helmhero-28e90f4d.png +0 -0
- helm/benchmark/static_build/assets/index-737eef9e.js +10 -0
- helm/benchmark/static_build/assets/index-878a1094.css +1 -0
- helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
- helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
- helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
- helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
- helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
- helm/benchmark/static_build/assets/react-d4a0b69b.js +85 -0
- helm/benchmark/static_build/assets/recharts-6d337683.js +97 -0
- helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
- helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
- helm/benchmark/static_build/assets/tremor-54a99cc4.js +10 -0
- helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
- helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
- helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
- helm/benchmark/static_build/config.js +4 -0
- helm/benchmark/static_build/index.html +20 -0
- helm/benchmark/test_data_preprocessor.py +3 -3
- helm/benchmark/test_run_expander.py +1 -1
- helm/benchmark/window_services/ai21_window_service.py +22 -33
- helm/benchmark/window_services/cohere_window_service.py +1 -63
- helm/benchmark/window_services/default_window_service.py +2 -44
- helm/benchmark/window_services/encoder_decoder_window_service.py +0 -11
- helm/benchmark/window_services/ice_window_service.py +0 -34
- helm/benchmark/window_services/image_generation/__init__.py +0 -0
- helm/benchmark/window_services/image_generation/clip_window_service.py +15 -0
- helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
- helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
- helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
- helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
- helm/benchmark/window_services/local_window_service.py +21 -4
- helm/benchmark/window_services/test_anthropic_window_service.py +2 -1
- helm/benchmark/window_services/test_bloom_window_service.py +2 -1
- helm/benchmark/window_services/test_cohere_window_service.py +2 -1
- helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
- helm/benchmark/window_services/test_gpt2_window_service.py +2 -2
- helm/benchmark/window_services/test_gpt4_window_service.py +2 -1
- helm/benchmark/window_services/test_gptj_window_service.py +3 -2
- helm/benchmark/window_services/test_gptneox_window_service.py +3 -2
- helm/benchmark/window_services/test_ice_window_service.py +2 -1
- helm/benchmark/window_services/test_openai_window_service.py +2 -1
- helm/benchmark/window_services/test_opt_window_service.py +3 -2
- helm/benchmark/window_services/test_palmyra_window_service.py +2 -1
- helm/benchmark/window_services/test_t0pp_window_service.py +2 -1
- helm/benchmark/window_services/test_t511b_window_service.py +2 -1
- helm/benchmark/window_services/test_ul2_window_service.py +2 -1
- helm/benchmark/window_services/test_utils.py +3 -2
- helm/benchmark/window_services/test_yalm_window_service.py +2 -1
- helm/benchmark/window_services/window_service.py +42 -0
- helm/benchmark/window_services/window_service_factory.py +4 -1
- helm/benchmark/window_services/yalm_window_service.py +0 -27
- helm/clients/__init__.py +0 -0
- helm/{proxy/clients → clients}/ai21_client.py +3 -9
- helm/clients/aleph_alpha_client.py +112 -0
- helm/{proxy/clients → clients}/anthropic_client.py +233 -18
- helm/{proxy/clients → clients}/auto_client.py +59 -31
- helm/clients/bedrock_client.py +128 -0
- helm/clients/bedrock_utils.py +72 -0
- helm/{proxy/clients → clients}/client.py +65 -7
- helm/clients/clip_score_client.py +49 -0
- helm/clients/clip_scorers/__init__.py +0 -0
- helm/clients/clip_scorers/base_clip_scorer.py +18 -0
- helm/clients/clip_scorers/clip_scorer.py +50 -0
- helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
- helm/{proxy/clients → clients}/cohere_client.py +4 -11
- helm/clients/gcs_client.py +82 -0
- helm/{proxy/clients → clients}/google_client.py +5 -5
- helm/clients/google_translate_client.py +35 -0
- helm/{proxy/clients → clients}/http_model_client.py +5 -7
- helm/{proxy/clients → clients}/huggingface_client.py +43 -64
- helm/clients/image_generation/__init__.py +0 -0
- helm/clients/image_generation/adobe_vision_client.py +78 -0
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +98 -0
- helm/clients/image_generation/cogview2/__init__.py +0 -0
- helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
- helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
- helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
- helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +96 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
- helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +141 -0
- helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
- helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
- helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
- helm/clients/image_generation/cogview2_client.py +191 -0
- helm/clients/image_generation/dalle2_client.py +192 -0
- helm/clients/image_generation/dalle3_client.py +108 -0
- helm/clients/image_generation/dalle_mini/__init__.py +3 -0
- helm/clients/image_generation/dalle_mini/data.py +442 -0
- helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
- helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
- helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
- helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
- helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
- helm/clients/image_generation/dalle_mini/model/text.py +251 -0
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
- helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
- helm/clients/image_generation/dalle_mini_client.py +190 -0
- helm/clients/image_generation/deep_floyd_client.py +78 -0
- helm/clients/image_generation/huggingface_diffusers_client.py +249 -0
- helm/clients/image_generation/image_generation_client_utils.py +9 -0
- helm/clients/image_generation/lexica_client.py +86 -0
- helm/clients/image_generation/mindalle/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/__init__.py +216 -0
- helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
- helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
- helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
- helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
- helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
- helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
- helm/clients/image_generation/mindalle/utils/config.py +129 -0
- helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
- helm/clients/image_generation/mindalle/utils/utils.py +89 -0
- helm/clients/image_generation/mindalle_client.py +115 -0
- helm/clients/image_generation/nudity_check_client.py +64 -0
- helm/clients/image_generation/together_image_generation_client.py +111 -0
- helm/{proxy/clients → clients}/lit_gpt_client.py +4 -4
- helm/{proxy/clients → clients}/megatron_client.py +5 -5
- helm/clients/mistral_client.py +134 -0
- helm/clients/moderation_api_client.py +109 -0
- helm/clients/open_lm_client.py +43 -0
- helm/clients/openai_client.py +301 -0
- helm/{proxy/clients → clients}/palmyra_client.py +6 -8
- helm/{proxy/clients → clients}/perspective_api_client.py +7 -8
- helm/clients/simple_client.py +64 -0
- helm/{proxy/clients → clients}/test_auto_client.py +13 -15
- helm/clients/test_client.py +100 -0
- helm/{proxy/clients → clients}/test_huggingface_client.py +15 -16
- helm/clients/test_simple_client.py +19 -0
- helm/{proxy/clients → clients}/test_together_client.py +20 -8
- helm/{proxy/clients → clients}/together_client.py +104 -73
- helm/clients/vertexai_client.py +400 -0
- helm/clients/vision_language/__init__.py +0 -0
- helm/clients/vision_language/huggingface_vision2seq_client.py +145 -0
- helm/clients/vision_language/huggingface_vlm_client.py +111 -0
- helm/{proxy/clients → clients}/vision_language/idefics_client.py +54 -49
- helm/clients/vision_language/open_flamingo/__init__.py +2 -0
- helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
- helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
- helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
- helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
- helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
- helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
- helm/clients/vision_language/open_flamingo_client.py +155 -0
- helm/clients/vision_language/qwen_vlm_client.py +171 -0
- helm/clients/vllm_client.py +46 -0
- helm/common/cache.py +16 -4
- helm/common/cache_backend_config.py +47 -0
- helm/common/clip_score_request.py +41 -0
- helm/common/file_caches/__init__.py +0 -0
- helm/common/file_caches/file_cache.py +16 -0
- helm/common/file_caches/local_file_cache.py +61 -0
- helm/common/file_caches/test_local_file_cache.py +25 -0
- helm/common/file_upload_request.py +27 -0
- helm/common/general.py +1 -1
- helm/common/image_generation_parameters.py +25 -0
- helm/common/images_utils.py +33 -3
- helm/common/key_value_store.py +35 -4
- helm/common/media_object.py +13 -0
- helm/common/moderations_api_request.py +71 -0
- helm/common/mongo_key_value_store.py +3 -3
- helm/common/multimodal_request_utils.py +31 -0
- helm/common/nudity_check_request.py +29 -0
- helm/common/request.py +15 -17
- helm/common/test_general.py +6 -0
- helm/common/tokenization_request.py +1 -1
- helm/config/model_deployments.yaml +1159 -538
- helm/config/model_metadata.yaml +868 -41
- helm/config/tokenizer_configs.yaml +149 -43
- helm/proxy/accounts.py +31 -4
- helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
- helm/proxy/critique/model_critique_client.py +8 -6
- helm/proxy/example_queries.py +29 -17
- helm/proxy/server.py +70 -5
- helm/proxy/services/remote_service.py +31 -0
- helm/proxy/services/server_service.py +96 -16
- helm/proxy/services/service.py +30 -0
- helm/proxy/services/test_remote_service.py +4 -3
- helm/proxy/services/test_service.py +0 -12
- helm/proxy/test_accounts.py +32 -0
- helm/proxy/token_counters/auto_token_counter.py +37 -37
- helm/proxy/token_counters/test_auto_token_counter.py +164 -0
- helm/proxy/token_counters/token_counter.py +3 -5
- helm/tokenizers/__init__.py +0 -0
- helm/{proxy/tokenizers → tokenizers}/ai21_tokenizer.py +3 -3
- helm/{proxy/tokenizers → tokenizers}/anthropic_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/auto_tokenizer.py +6 -9
- helm/{proxy/tokenizers → tokenizers}/cohere_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +3 -3
- helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +7 -26
- helm/tokenizers/simple_tokenizer.py +33 -0
- helm/{proxy/tokenizers → tokenizers}/test_anthropic_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/test_huggingface_tokenizer.py +3 -0
- helm/tokenizers/test_simple_tokenizer.py +33 -0
- helm/{proxy/tokenizers → tokenizers}/vertexai_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +5 -3
- helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
- helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
- crfm_helm-0.4.0.dist-info/RECORD +0 -397
- helm/benchmark/run_specs.py +0 -2762
- helm/benchmark/test_model_deployment_definition.py +0 -92
- helm/benchmark/test_model_properties.py +0 -1570
- helm/benchmark/vlm_run_specs.py +0 -97
- helm/benchmark/window_services/flan_t5_window_service.py +0 -29
- helm/benchmark/window_services/gpt2_window_service.py +0 -32
- helm/benchmark/window_services/huggingface_window_service.py +0 -60
- helm/benchmark/window_services/t0pp_window_service.py +0 -35
- helm/benchmark/window_services/t511b_window_service.py +0 -30
- helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
- helm/benchmark/window_services/ul2_window_service.py +0 -30
- helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
- helm/common/cache_utils.py +0 -14
- helm/proxy/clients/aleph_alpha_client.py +0 -95
- helm/proxy/clients/goose_ai_client.py +0 -99
- helm/proxy/clients/microsoft_client.py +0 -180
- helm/proxy/clients/openai_client.py +0 -206
- helm/proxy/clients/simple_client.py +0 -60
- helm/proxy/clients/test_client.py +0 -49
- helm/proxy/clients/vertexai_client.py +0 -115
- helm/proxy/token_counters/ai21_token_counter.py +0 -20
- helm/proxy/token_counters/cohere_token_counter.py +0 -13
- helm/proxy/token_counters/free_token_counter.py +0 -12
- helm/proxy/token_counters/gooseai_token_counter.py +0 -24
- helm/proxy/token_counters/openai_token_counter.py +0 -22
- helm/proxy/token_counters/test_ai21_token_counter.py +0 -88
- helm/proxy/token_counters/test_openai_token_counter.py +0 -81
- helm/proxy/tokenizers/simple_tokenizer.py +0 -32
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/LICENSE +0 -0
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/top_level.txt +0 -0
- /helm/{proxy/clients → benchmark/annotation}/__init__.py +0 -0
- /helm/{proxy/clients/vision_language → benchmark/annotation/image2structure}/__init__.py +0 -0
- /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
- /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
- /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
- /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
- /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
- /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/ice_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/test_ice_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +0 -0

helm/clients/bedrock_utils.py

@@ -0,0 +1,72 @@
+"""Helper utilities for working with Amazon Bedrock."""
+
+import os
+from typing import Optional
+
+from helm.common.hierarchical_logger import hlog
+from helm.common.optional_dependencies import handle_module_not_found_error
+
+try:
+    import boto3
+    from botocore.config import Config
+except ModuleNotFoundError as e:
+    handle_module_not_found_error(e, ["aws"])
+
+
+# From https://github.com/aws-samples/amazon-bedrock-workshop/blob/main/01_Generation/00_generate_w_bedrock.ipynb
+# MIT-0 Licensed
+def get_bedrock_client(
+    assumed_role: Optional[str] = None,
+    region: Optional[str] = None,
+    runtime: Optional[bool] = True,
+):
+    """Create a boto3 client for Amazon Bedrock, with optional configuration overrides
+
+    Parameters
+    ----------
+    assumed_role :
+        Optional ARN of an AWS IAM role to assume for calling the Bedrock service. If not
+        specified, the current active credentials will be used.
+    region :
+        Optional name of the AWS Region in which the service should be called (e.g. "us-east-1").
+        If not specified, AWS_REGION or AWS_DEFAULT_REGION environment variable will be used.
+    runtime :
+        Optional choice of getting different client to perform operations with the Amazon Bedrock service.
+    """
+    if region is None:
+        target_region = os.environ.get("AWS_REGION", os.environ.get("AWS_DEFAULT_REGION"))
+    else:
+        target_region = region
+
+    session_kwargs = {"region_name": target_region}
+    client_kwargs = {**session_kwargs}
+
+    profile_name = os.environ.get("AWS_PROFILE")
+    if profile_name:
+        session_kwargs["profile_name"] = profile_name
+
+    retry_config = Config(
+        region_name=target_region,
+        retries={
+            "max_attempts": 10,
+            "mode": "standard",
+        },
+    )
+    session = boto3.Session(**session_kwargs)
+
+    if assumed_role:
+        sts = session.client("sts")
+        response = sts.assume_role(RoleArn=str(assumed_role), RoleSessionName="crfm-helm")
+        client_kwargs["aws_access_key_id"] = response["Credentials"]["AccessKeyId"]
+        client_kwargs["aws_secret_access_key"] = response["Credentials"]["SecretAccessKey"]
+        client_kwargs["aws_session_token"] = response["Credentials"]["SessionToken"]
+
+    if runtime:
+        service_name = "bedrock-runtime"
+    else:
+        service_name = "bedrock"
+
+    bedrock_client = session.client(service_name=service_name, config=retry_config, **client_kwargs)
+
+    hlog(f"Amazon Bedrock client successfully created with endpoint {bedrock_client._endpoint}")
+    return bedrock_client
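
For orientation, a minimal usage sketch of the new helper (not part of the diff). It assumes the `aws` extra (boto3) is installed and that AWS credentials are available in the environment; the model ID and request body are placeholders:

```python
import json

from helm.clients.bedrock_utils import get_bedrock_client

# Region falls back to AWS_REGION / AWS_DEFAULT_REGION when not passed.
client = get_bedrock_client(region="us-east-1")

# runtime=True (the default) yields a "bedrock-runtime" client, which exposes
# invoke_model(); the model ID and body schema below are illustrative only.
response = client.invoke_model(
    modelId="anthropic.claude-v2",
    body=json.dumps({"prompt": "\n\nHuman: Hello\n\nAssistant:", "max_tokens_to_sample": 16}),
)
print(json.loads(response["body"].read()))
```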

helm/{proxy/clients → clients}/client.py

@@ -1,11 +1,13 @@
 import json
 from abc import ABC, abstractmethod
-from typing import
+from typing import List, Mapping, Optional, cast
 
 from helm.common.hierarchical_logger import hlog
 from helm.common.media_object import MultimediaObject, TEXT_TYPE
-from helm.common.request import Request, RequestResult, Sequence, Token
+from helm.common.request import Request, RequestResult, GeneratedOutput, Token
 from helm.common.cache import Cache, CacheConfig
+from helm.common.tokenization_request import DecodeRequest, TokenizationRequest
+from helm.tokenizers.tokenizer import Tokenizer
 
 
 class Client(ABC):

@@ -30,24 +32,28 @@ class CachingClient(Client):
         self.cache = Cache(cache_config) if cache_config is not None else None
 
     @staticmethod
-    def make_cache_key(raw_request: Dict, request: Request) -> Dict:
+    def make_cache_key(raw_request: Mapping, request: Request) -> Mapping:
         """
         Construct the key for the cache using the raw request.
         Add `request.random` to the key, if defined.
         """
         if request.random is not None:
             assert "random" not in raw_request
-            cache_key = {**raw_request, "random": request.random}
+            cache_key: Mapping = {**raw_request, "random": request.random}
         else:
             cache_key = raw_request
         return cache_key
 
 
-def truncate_sequence(sequence: Sequence, request: Request, print_warning: bool = True) -> Sequence:
+def truncate_sequence(sequence: GeneratedOutput, request: Request, print_warning: bool = True) -> GeneratedOutput:
     """
     Certain providers have bugs where they aren't respecting max_tokens,
     stop_sequences and the end of text token, so as a hack, we have to manually
     truncate the suffix of `sequence` and `tokens` as a post-hoc process.
+
+    This method is unsafe and may produce warnings or incorrect results.
+    Prefer using the safer truncate_and_tokenize_response_text() method instead
+    if your use case satisfies its requirements.
     """
     # TODO: if echo_prompt, then we should only ignore the prompt, but we don't
     # know how many tokens the prompt takes up.
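
As a quick illustration of the `make_cache_key` contract above (example values are made up):

```python
raw_request = {"prompt": "Hello", "max_tokens": 8}

# request.random is None -> the raw request itself is the cache key:
#   {"prompt": "Hello", "max_tokens": 8}
# request.random == "1"  -> the seed is folded in, so repeated sampling with an
# otherwise identical raw request does not collide in the cache:
#   {"prompt": "Hello", "max_tokens": 8, "random": "1"}
```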

@@ -87,7 +93,7 @@ def truncate_sequence(sequence: Sequence, request: Request, print_warning: bool
         if print_warning:
             hlog(f"WARNING: truncate_sequence needs to strip {json.dumps(stop)}")
 
-        sequence = Sequence(text=new_text, logprob=new_logprob, tokens=new_tokens)
+        sequence = GeneratedOutput(text=new_text, logprob=new_logprob, tokens=new_tokens)
 
     # Truncate based on the max number of tokens.
     if len(sequence.tokens) > request.max_tokens:

@@ -104,11 +110,63 @@ def truncate_sequence(sequence: Sequence, request: Request, print_warning: bool
 
         new_logprob = sum(token.logprob for token in new_tokens)
 
-        sequence = Sequence(text=new_text, logprob=new_logprob, tokens=new_tokens)
+        sequence = GeneratedOutput(text=new_text, logprob=new_logprob, tokens=new_tokens)
 
     return sequence
 
 
+def truncate_and_tokenize_response_text(
+    text: str, request: Request, tokenizer: Tokenizer, tokenizer_name: str, original_finish_reason: str = "endoftext"
+) -> GeneratedOutput:
+    """Truncate a string-only response to respect stop_sequences and max_tokens.
+
+    This can only be used if all of the following conditions are true:
+
+    - You have access to the tokenizer.
+    - The request has echo_prompt = False.
+    - The tokenizer supports encoding and decoding.
+    - The tokenizer's tokenize() method supports truncation.
+    - The model's response is text-only.
+    - The model's response not already provide the tokenized text.
+    - The model's response does not provide logprobs.
+
+    This method is safer than truncate_sequence() and should be preferred if the above conditions are met.
+    Unlike truncate_sequence(), this method will not produce warnings or incorrect results.
+    This is because the the tokens are derived from the truncated text using the tokenizer,
+    so the text and the tokens in the resulting result are guranteed to match."""
+    # Finish reason strings are token from basic_metrics._compute_finish_reason_metrics()
+    finish_reason: str = original_finish_reason
+    if request.echo_prompt:
+        raise Exception("truncate_and_tokenize_response_text() does not support requests with echo_prompt = True")
+
+    for stop_sequence in request.stop_sequences:
+        try:
+            text = text[: text.index(stop_sequence)]
+            finish_reason = "stop"
+        except ValueError:
+            pass
+
+    token_strings = cast(
+        List[str], tokenizer.tokenize(TokenizationRequest(text=text, tokenizer=tokenizer_name)).raw_tokens
+    )
+    if len(token_strings) > request.max_tokens:
+        encoded_ints = cast(
+            List[int],
+            tokenizer.tokenize(
+                TokenizationRequest(
+                    text=text, tokenizer=tokenizer_name, encode=True, truncation=True, max_length=request.max_tokens
+                )
+            ).raw_tokens,
+        )
+        text = tokenizer.decode(DecodeRequest(encoded_ints, tokenizer_name)).text
+        token_strings = cast(
+            List[str], tokenizer.tokenize(TokenizationRequest(text=text, tokenizer=tokenizer_name)).raw_tokens
+        )
+        finish_reason = "length"
+    tokens = [Token(text=token_string, logprob=0.0) for token_string in token_strings]
+    return GeneratedOutput(text=text, logprob=0.0, tokens=tokens, finish_reason={"reason": finish_reason})
+
+
 def cleanup_str(token: str, tokenizer_name: Optional[str] = None) -> str:
     """
     Certain tokenizers introduce special characters to represent spaces, such as
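
A sketch of how a text-only client might adopt the safer helper (hypothetical wrapper, not part of the diff; the tokenizer and tokenizer_name would come from the client's own configuration):

```python
from helm.clients.client import truncate_and_tokenize_response_text
from helm.common.request import GeneratedOutput, Request
from helm.tokenizers.tokenizer import Tokenizer


def to_completion(text: str, request: Request, tokenizer: Tokenizer, tokenizer_name: str) -> GeneratedOutput:
    # Tokens are re-derived from the truncated text, so text and tokens always
    # agree; logprobs come back as 0.0 and finish_reason is "stop", "length",
    # or the original default "endoftext".
    return truncate_and_tokenize_response_text(text, request, tokenizer, tokenizer_name)
```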

helm/clients/clip_score_client.py

@@ -0,0 +1,49 @@
+from typing import Dict, Optional
+from dataclasses import asdict
+
+from helm.common.cache import Cache, CacheConfig
+from helm.common.clip_score_request import DEFAULT_CLIP_SCORE_MODEL, CLIPScoreRequest, CLIPScoreResult
+from helm.clients.clip_scorers.base_clip_scorer import BaseCLIPScorer
+
+
+class CLIPScoreClientError(Exception):
+    pass
+
+
+class CLIPScoreClient:
+    def __init__(self, cache_config: CacheConfig):
+        self.cache = Cache(cache_config)
+        self._clip_scorer: Optional[BaseCLIPScorer] = None
+
+    def compute_score(self, request: CLIPScoreRequest) -> CLIPScoreResult:
+        """
+        Compute a CLIPScore for a given caption and image.
+        """
+        # TODO: support multilingual CLIPScore and other CLIP models.
+        assert request.model == DEFAULT_CLIP_SCORE_MODEL, f"Unsupported model: {request.model}"
+        assert not request.multilingual
+
+        try:
+
+            def do_it():
+                if self._clip_scorer is None:
+                    from helm.clients.clip_scorers.clip_scorer import CLIPScorer
+
+                    self._clip_scorer = CLIPScorer()
+
+                score: float = self._clip_scorer.compute_score(
+                    caption=request.caption, image_location=request.image_location
+                )
+                return {"score": score}
+
+            cache_key: Dict = asdict(request)
+            results, cached = self.cache.get(cache_key, do_it)
+
+        except Exception as e:
+            raise CLIPScoreClientError(e)
+
+        return CLIPScoreResult(
+            success=True,
+            cached=cached,
+            score=results["score"],
+        )
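
A sketch of calling the new client (not part of the diff; it assumes the heim extras are installed and that CLIPScoreRequest accepts caption and image_location, with model and multilingual defaulting to DEFAULT_CLIP_SCORE_MODEL and False, as the assertions above imply):

```python
from helm.clients.clip_score_client import CLIPScoreClient
from helm.common.cache import SqliteCacheConfig
from helm.common.clip_score_request import CLIPScoreRequest

client = CLIPScoreClient(cache_config=SqliteCacheConfig("clip_score.sqlite"))
result = client.compute_score(
    CLIPScoreRequest(caption="a photo of a dog", image_location="dog.png")
)
print(result.score)  # served from the cache on repeat calls with the same request
```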

helm/clients/clip_scorers/__init__.py: file without changes

helm/clients/clip_scorers/base_clip_scorer.py

@@ -0,0 +1,18 @@
+from abc import abstractmethod, ABC
+from typing import List
+
+
+class BaseCLIPScorer(ABC):
+    @abstractmethod
+    def compute_score(self, caption: str, image_location: str) -> float:
+        pass
+
+    def select_best_image(self, caption: str, image_locations: List[str]) -> str:
+        """Selects the image from a list of images with the highest CLIPScore given the caption."""
+        assert len(image_locations) > 0, "Need at least one image"
+
+        if len(image_locations) == 1:
+            return image_locations[0]
+
+        scores: List[float] = [self.compute_score(caption, image_location) for image_location in image_locations]
+        return image_locations[scores.index(max(scores))]

helm/clients/clip_scorers/clip_scorer.py

@@ -0,0 +1,50 @@
+from typing import Literal
+
+from torchvision import transforms
+import torch
+
+from helm.common.gpu_utils import get_torch_device
+from helm.common.images_utils import open_image
+from helm.common.optional_dependencies import handle_module_not_found_error
+from .base_clip_scorer import BaseCLIPScorer
+
+
+_ = torch.manual_seed(42)
+
+
+class CLIPScorer(BaseCLIPScorer):
+    """
+    CLIPScore is a reference free metric that can be used to evaluate the correlation between an image
+    caption and the content of the image. It has been found to be highly correlated with human judgement.
+    Paper: https://arxiv.org/abs/2104.08718
+
+    We use the TorchMetrics implementation:
+    https://torchmetrics.readthedocs.io/en/stable/multimodal/clip_score.html.
+    The score is bound between 0 and 100, where a score closer to 100 is better.
+
+    Verified implementation against the scores of image-caption pairs from
+    https://wandb.ai/dalle-mini/dalle-mini/reports/OpenAI-CLIP-Score-exploration--VmlldzoxNjMwODM1.
+    """
+
+    def __init__(
+        self,
+        model_name: Literal[
+            "openai/clip-vit-base-patch16",
+            "openai/clip-vit-base-patch32",
+            "openai/clip-vit-large-patch14-336",
+            "openai/clip-vit-large-patch14",
+        ] = "openai/clip-vit-large-patch14",
+    ):
+        try:
+            from torchmetrics.multimodal import CLIPScore
+        except ModuleNotFoundError as e:
+            handle_module_not_found_error(e, ["heim"])
+
+        self._device: torch.device = get_torch_device()
+        self._metric = CLIPScore(model_name_or_path=model_name).to(self._device)
+
+    def compute_score(self, caption: str, image_location: str) -> float:
+        image = open_image(image_location)
+        image_tensor: torch.Tensor = transforms.ToTensor()(image).to(self._device)
+        score: float = self._metric(image_tensor, caption).detach().item()
+        return score
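
For context, a small sketch of the scorer in isolation (assumes the heim extras, i.e. torchmetrics and torchvision, are installed; file paths are placeholders):

```python
from helm.clients.clip_scorers.clip_scorer import CLIPScorer

scorer = CLIPScorer()  # defaults to "openai/clip-vit-large-patch14"
print(scorer.compute_score(caption="a photo of a dog", image_location="dog.png"))

# Inherited from BaseCLIPScorer: pick the best-matching candidate image.
best = scorer.select_best_image("a photo of a dog", ["candidate_1.png", "candidate_2.png"])
```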

helm/clients/clip_scorers/multilingual_clip_scorer.py

@@ -0,0 +1,50 @@
+import torch
+import transformers
+
+from helm.common.gpu_utils import get_torch_device, get_torch_device_name
+from helm.common.images_utils import open_image
+from helm.common.optional_dependencies import handle_module_not_found_error
+from .base_clip_scorer import BaseCLIPScorer
+
+_ = torch.manual_seed(42)
+
+
+class MultilingualCLIPScorer(BaseCLIPScorer):
+    """
+    Multilingual-CLIP extends OpenAI's English text encoders to multiple other languages.
+    Adapted from https://huggingface.co/M-CLIP/XLM-Roberta-Large-Vit-L-14
+    """
+
+    TEXT_MODEL_NAME: str = "M-CLIP/XLM-Roberta-Large-Vit-L-14"
+    IMAGE_MODEL_NAME: str = "ViT-L/14"
+
+    def __init__(self):
+        try:
+            import clip
+            from multilingual_clip import pt_multilingual_clip
+        except ModuleNotFoundError as e:
+            handle_module_not_found_error(e, ["heim"])
+
+        super().__init__()
+        self._device: torch.device = get_torch_device()
+        self._text_model = pt_multilingual_clip.MultilingualCLIP.from_pretrained(self.TEXT_MODEL_NAME)
+        self._tokenizer = transformers.AutoTokenizer.from_pretrained(self.TEXT_MODEL_NAME)
+        self._model, self._preprocess = clip.load(self.IMAGE_MODEL_NAME, device=get_torch_device_name())
+
+    def compute_score(self, caption: str, image_location: str) -> float:
+        # Get text features
+        text_features = self._text_model.forward(caption, self._tokenizer)
+        text_features = text_features / text_features.norm(p=2, dim=-1, keepdim=True)
+        text_features = text_features.to(self._device)
+
+        image = open_image(image_location)
+        image = self._preprocess(image).unsqueeze(0).to(self._device)
+
+        # Get image features
+        with torch.no_grad():
+            image_features = self._model.encode_image(image)
+        image_features = image_features / image_features.norm(p=2, dim=-1, keepdim=True)
+
+        # Compute score using text and image features
+        score = 100 * (image_features * text_features).sum(axis=-1)
+        return score.detach().item()

helm/{proxy/clients → clients}/cohere_client.py

@@ -8,7 +8,7 @@ from helm.common.request import (
     EMBEDDING_UNAVAILABLE_REQUEST_RESULT,
     Request,
     RequestResult,
-    Sequence,
+    GeneratedOutput,
     Token,
 )
 from .client import CachingClient, truncate_sequence

@@ -120,7 +120,7 @@ class CohereClient(CachingClient):
             error: str = f"CohereClient error: {e}"
             return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
 
-        completions: List[Sequence] = []
+        completions: List[GeneratedOutput] = []
         for generation in response["generations"]:
             # From https://docs.cohere.ai/generate-reference, "the likelihood refers to the average log-likelihood
             # of the entire specified string..." What we want is the sum of the log probabilities of all tokens.

@@ -132,14 +132,7 @@ class CohereClient(CachingClient):
                 logprob: float = token_likelihood.get("likelihood", 0)
                 sequence_logprob += logprob
 
-                tokens.append(
-                    Token(
-                        text=token_likelihood["token"],
-                        logprob=logprob,
-                        # Cohere does not include the top log probs in the response
-                        top_logprobs={},
-                    )
-                )
+                tokens.append(Token(text=token_likelihood["token"], logprob=logprob))
 
             sequence_text: str = generation["text"]
             if request.echo_prompt and request.max_tokens > 0:

@@ -147,7 +140,7 @@ class CohereClient(CachingClient):
                 # `return_likelihoods` is "ALL" and `max_tokens` is greater than 0.
                 sequence_text = request.prompt + sequence_text
 
-            completion: Sequence = Sequence(text=sequence_text, logprob=sequence_logprob, tokens=tokens)
+            completion: GeneratedOutput = GeneratedOutput(text=sequence_text, logprob=sequence_logprob, tokens=tokens)
             completion = truncate_sequence(completion, request)
             completions.append(completion)
 
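
The recurring substitution across these client hunks is the helm.common.request rename; a compact illustration (not from the diff itself):

```python
from helm.common.request import GeneratedOutput, Token  # 0.4.0 called this class Sequence

# Token also lost its top_logprobs field, so clients now build completions as:
completion = GeneratedOutput(
    text="hello world",
    logprob=-1.2,
    tokens=[Token(text="hello", logprob=-0.7), Token(text=" world", logprob=-0.5)],
)
```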

helm/clients/gcs_client.py

@@ -0,0 +1,82 @@
+from dataclasses import asdict
+from typing import Dict, Optional
+import requests
+
+from helm.common.cache import Cache, CacheConfig
+from helm.common.optional_dependencies import handle_module_not_found_error
+from helm.common.hierarchical_logger import hlog
+from helm.common.file_upload_request import FileUploadRequest, FileUploadResult
+
+
+class GCSClientError(Exception):
+    pass
+
+
+class GCSClient:
+    """
+    Uploads files to GCS. Ensure the GOOGLE_APPLICATION_CREDENTIALS=/path/to/credentials.json
+    environment variable is set.
+    """
+
+    MAX_CHECK_ATTEMPTS: int = 10
+
+    def __init__(self, bucket_name: str, cache_config: CacheConfig):
+        try:
+            from google.cloud import storage  # type: ignore
+        except ModuleNotFoundError as e:
+            handle_module_not_found_error(e, ["heim"])
+
+        self._bucket_name: str = bucket_name
+        self._cache = Cache(cache_config)
+        self._storage_client: Optional[storage.Client] = None
+
+    def upload(self, request: FileUploadRequest) -> FileUploadResult:
+        """Uploads a file to GCS."""
+        try:
+            from google.cloud import storage  # type: ignore
+        except ModuleNotFoundError as e:
+            handle_module_not_found_error(e, ["heim"])
+
+        try:
+
+            def do_it():
+                if self._storage_client is None:
+                    self._storage_client = storage.Client()
+
+                bucket = self._storage_client.bucket(self._bucket_name)
+                file_path: str = request.path
+                blob = bucket.blob(file_path)
+
+                # Optional: set a generation-match precondition to avoid potential race conditions
+                # and data corruptions. The request to upload is aborted if the object's
+                # generation number does not match your precondition. For a destination
+                # object that does not yet exist, set the if_generation_match precondition to 0.
+                # If the destination object already exists in your bucket, set instead a
+                # generation-match precondition using its generation number.
+                generation_match_precondition: int = 0
+
+                blob.upload_from_filename(file_path, if_generation_match=generation_match_precondition)
+                url: str = self._get_url(file_path)
+
+                # Ensure the file was uploaded successfully
+                uploaded: bool = False
+                for _ in range(0, self.MAX_CHECK_ATTEMPTS):
+                    check_response = requests.head(url)
+                    if check_response.status_code == 200:
+                        uploaded = True
+                        break
+                assert uploaded, f"File {file_path} was not uploaded successfully."
+
+                hlog(f"File {file_path} uploaded and is available at {url}.")
+                return {"url": url}
+
+            cache_key: Dict = asdict(request)
+            result, cached = self._cache.get(cache_key, do_it)
+
+        except Exception as e:
+            raise GCSClientError(e)
+
+        return FileUploadResult(success=True, cached=cached, url=result["url"])
+
+    def _get_url(self, path: str) -> str:
+        return f"https://storage.googleapis.com/{self._bucket_name}/{path}"
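
A minimal sketch of the new uploader (not part of the diff; it assumes GOOGLE_APPLICATION_CREDENTIALS is set, the bucket exists, and that FileUploadRequest takes the destination path as its field, which is inferred from request.path above):

```python
from helm.clients.gcs_client import GCSClient
from helm.common.cache import SqliteCacheConfig
from helm.common.file_upload_request import FileUploadRequest

client = GCSClient(bucket_name="my-bucket", cache_config=SqliteCacheConfig("gcs.sqlite"))
result = client.upload(FileUploadRequest("images/sample.png"))
print(result.url)  # https://storage.googleapis.com/my-bucket/images/sample.png
```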

helm/{proxy/clients → clients}/google_client.py

@@ -1,7 +1,7 @@
 from typing import List, Dict
 
 from helm.common.cache import CacheConfig
-from helm.common.request import Request, RequestResult, Sequence, Token
+from helm.common.request import Request, RequestResult, GeneratedOutput, Token
 from .client import CachingClient, truncate_sequence
 
 

@@ -32,7 +32,7 @@ class GoogleClient(CachingClient):
 
     def make_request(self, request: Request) -> RequestResult:
         raw_request = GoogleClient.convert_to_raw_request(request)
-        cache_key
+        cache_key = CachingClient.make_cache_key(raw_request, request)
 
         try:
 

@@ -48,17 +48,17 @@ class GoogleClient(CachingClient):
             return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
 
         # Expect the result to be structured the same way as a response from OpenAI API.
-        completions: List[Sequence] = []
+        completions: List[GeneratedOutput] = []
         for raw_completion in response["choices"]:
             sequence_logprob = 0
             tokens: List[Token] = []
 
             raw_data = raw_completion["logprobs"]
             for text, logprob in zip(raw_data["tokens"], raw_data["token_logprobs"]):
-                tokens.append(Token(text=text, logprob=logprob or 0, top_logprobs={}))
+                tokens.append(Token(text=text, logprob=logprob or 0))
                 sequence_logprob += logprob or 0
 
-            completion = Sequence(
+            completion = GeneratedOutput(
                 text=raw_completion["text"],
                 logprob=sequence_logprob,
                 tokens=tokens,

helm/clients/google_translate_client.py

@@ -0,0 +1,35 @@
+from typing import Optional
+
+from helm.common.cache import Cache, SqliteCacheConfig
+from helm.common.optional_dependencies import handle_module_not_found_error
+
+try:
+    from google.cloud import translate_v2 as translate  # type: ignore
+except ModuleNotFoundError as e:
+    handle_module_not_found_error(e, ["heim"])
+
+
+class GoogleTranslateClient:
+    """
+    Client for Google Translate.
+    Follow the instructions at https://cloud.google.com/translate/docs/setup to use this client.
+
+    # TODO: add this as a central service
+    """
+
+    def __init__(self, cache_path: str = "prod_env/cache/google_translate.sqlite"):
+        self.translate_client: Optional[translate.Client] = None
+        self.cache = Cache(SqliteCacheConfig(cache_path))
+
+    def translate(self, text: str, target_language: str) -> str:
+        def do_it():
+            if self.translate_client is None:
+                self.translate_client = translate.Client()
+
+            result = self.translate_client.translate(text, target_language=target_language)
+            del result["input"]
+            assert "translatedText" in result, f"Invalid response: {result}"
+            return result
+
+        response, _ = self.cache.get({"text": text, "target_language": target_language}, do_it)
+        return response["translatedText"]
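
Usage is a one-liner once Cloud Translation credentials are configured per the linked setup docs (sketch, not from the diff; the language code is illustrative):

```python
from helm.clients.google_translate_client import GoogleTranslateClient

client = GoogleTranslateClient()  # caches responses in prod_env/cache/google_translate.sqlite
print(client.translate("The quick brown fox", target_language="fr"))
```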

helm/{proxy/clients → clients}/http_model_client.py

@@ -1,12 +1,13 @@
 import os
 from dataclasses import asdict
+from typing import Any, Dict
 
 from helm.common.cache import CacheConfig
 from helm.common.request import (
     wrap_request_time,
     Request,
     RequestResult,
-    Sequence,
+    GeneratedOutput,
     Token,
     EMBEDDING_UNAVAILABLE_REQUEST_RESULT,
 )

@@ -51,7 +52,7 @@ class HTTPModelClient(CachingClient):
 
         try:
 
-            def do_it():
+            def do_it() -> Dict[str, Any]:
                 url = f"{self.base_url}/process"
                 response = requests.post(url, json=raw_request, timeout=self.timeout)
                 response.raise_for_status()

@@ -63,11 +64,8 @@
             else:
                 response, cached = do_it(), False
 
-            tokens = [
-                Token(text=token["text"], logprob=token["logprob"], top_logprobs={})
-                for token in response["tokens"]
-            ]
-            completions = [Sequence(text=response["text"], logprob=response["logprob"], tokens=tokens)]
+            tokens = [Token(text=token["text"], logprob=token["logprob"]) for token in response["tokens"]]
+            completions = [GeneratedOutput(text=response["text"], logprob=response["logprob"], tokens=tokens)]
 
             return RequestResult(
                 success=True,
success=True,
|