crfm-helm 0.3.0__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/METADATA +144 -36
- crfm_helm-0.5.0.dist-info/RECORD +642 -0
- {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +37 -2
- helm/benchmark/adaptation/adapters/adapter.py +4 -42
- helm/benchmark/adaptation/adapters/adapter_factory.py +24 -27
- helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/generation_adapter.py +2 -0
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +21 -4
- helm/benchmark/adaptation/adapters/language_modeling_adapter.py +12 -5
- helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +104 -0
- helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +5 -1
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/test_adapter.py +2 -1
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +59 -14
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +40 -5
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +78 -10
- helm/benchmark/adaptation/common_adapter_specs.py +376 -0
- helm/benchmark/adaptation/prompt.py +7 -1
- helm/benchmark/adaptation/request_state.py +6 -1
- helm/benchmark/adaptation/scenario_state.py +6 -2
- helm/benchmark/annotation/annotator.py +43 -0
- helm/benchmark/annotation/annotator_factory.py +61 -0
- helm/benchmark/annotation/image2structure/image_compiler_annotator.py +88 -0
- helm/benchmark/annotation/image2structure/latex_compiler_annotator.py +59 -0
- helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +84 -0
- helm/benchmark/annotation/image2structure/webpage_compiler_annotator.py +132 -0
- helm/benchmark/annotation/test_annotator_factory.py +26 -0
- helm/benchmark/annotation/test_dummy_annotator.py +44 -0
- helm/benchmark/annotation_executor.py +124 -0
- helm/benchmark/augmentations/cleva_perturbation.py +7 -14
- helm/benchmark/augmentations/contraction_expansion_perturbation.py +3 -3
- helm/benchmark/augmentations/contrast_sets_perturbation.py +0 -3
- helm/benchmark/augmentations/data_augmenter.py +0 -2
- helm/benchmark/augmentations/dialect_perturbation.py +2 -2
- helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
- helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
- helm/benchmark/augmentations/gender_perturbation.py +3 -3
- helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
- helm/benchmark/augmentations/mild_mix_perturbation.py +2 -2
- helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
- helm/benchmark/augmentations/person_name_perturbation.py +0 -7
- helm/benchmark/augmentations/perturbation.py +20 -7
- helm/benchmark/augmentations/perturbation_description.py +1 -1
- helm/benchmark/augmentations/space_perturbation.py +2 -2
- helm/benchmark/augmentations/suffix_perturbation.py +29 -0
- helm/benchmark/augmentations/synonym_perturbation.py +2 -2
- helm/benchmark/augmentations/test_perturbation.py +11 -7
- helm/benchmark/augmentations/translate_perturbation.py +30 -0
- helm/benchmark/augmentations/typos_perturbation.py +2 -2
- helm/benchmark/config_registry.py +38 -0
- helm/benchmark/executor.py +46 -16
- helm/benchmark/huggingface_registration.py +37 -7
- helm/benchmark/metrics/basic_metrics.py +172 -641
- helm/benchmark/metrics/bbq_metrics.py +3 -4
- helm/benchmark/metrics/bias_metrics.py +6 -6
- helm/benchmark/metrics/classification_metrics.py +11 -8
- helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
- helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
- helm/benchmark/metrics/code_metrics.py +4 -3
- helm/benchmark/metrics/code_metrics_helper.py +0 -2
- helm/benchmark/metrics/common_metric_specs.py +167 -0
- helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
- helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
- helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
- helm/benchmark/metrics/disinformation_metrics.py +6 -112
- helm/benchmark/metrics/dry_run_metrics.py +5 -3
- helm/benchmark/metrics/efficiency_metrics.py +206 -0
- helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
- helm/benchmark/metrics/evaluate_reference_metrics.py +376 -0
- helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
- helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
- helm/benchmark/metrics/image_generation/clip_score_metrics.py +73 -0
- helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
- helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
- helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
- helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
- helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
- helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
- helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
- helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
- helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
- helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
- helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
- helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
- helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
- helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
- helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
- helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
- helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
- helm/benchmark/metrics/image_generation/q16/test_q16.py +18 -0
- helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
- helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
- helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
- helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
- helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
- helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
- helm/benchmark/metrics/instruction_following_critique_metrics.py +3 -1
- helm/benchmark/metrics/language_modeling_metrics.py +99 -0
- helm/benchmark/metrics/machine_translation_metrics.py +5 -5
- helm/benchmark/metrics/metric.py +93 -172
- helm/benchmark/metrics/metric_name.py +0 -1
- helm/benchmark/metrics/metric_service.py +16 -0
- helm/benchmark/metrics/paraphrase_generation_metrics.py +3 -4
- helm/benchmark/metrics/ranking_metrics.py +6 -7
- helm/benchmark/metrics/reference_metric.py +148 -0
- helm/benchmark/metrics/summac/model_summac.py +0 -2
- helm/benchmark/metrics/summarization_metrics.py +8 -8
- helm/benchmark/metrics/test_classification_metrics.py +9 -6
- helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
- helm/benchmark/metrics/test_evaluate_reference_metrics.py +30 -0
- helm/benchmark/metrics/test_metric.py +2 -2
- helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +13 -3
- helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +2 -0
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +9 -2
- helm/benchmark/metrics/toxicity_metrics.py +1 -1
- helm/benchmark/metrics/toxicity_utils.py +23 -0
- helm/benchmark/metrics/unitxt_metrics.py +81 -0
- helm/benchmark/metrics/vision_language/__init__.py +0 -0
- helm/benchmark/metrics/vision_language/emd_utils.py +341 -0
- helm/benchmark/metrics/vision_language/image_metrics.py +450 -0
- helm/benchmark/metrics/vision_language/image_utils.py +100 -0
- helm/benchmark/model_deployment_registry.py +164 -41
- helm/benchmark/model_metadata_registry.py +181 -35
- helm/benchmark/multi_gpu_runner.py +133 -0
- helm/benchmark/presentation/contamination.py +3 -3
- helm/benchmark/presentation/create_plots.py +8 -7
- helm/benchmark/presentation/run_display.py +50 -17
- helm/benchmark/presentation/schema.py +28 -46
- helm/benchmark/presentation/summarize.py +213 -96
- helm/benchmark/presentation/table.py +8 -8
- helm/benchmark/presentation/test_contamination.py +2 -2
- helm/benchmark/presentation/test_run_entry.py +14 -9
- helm/benchmark/presentation/test_summarize.py +5 -0
- helm/benchmark/run.py +66 -54
- helm/benchmark/run_expander.py +342 -31
- helm/benchmark/run_spec.py +93 -0
- helm/benchmark/run_spec_factory.py +162 -0
- helm/benchmark/run_specs/__init__.py +0 -0
- helm/benchmark/{run_specs.py → run_specs/classic_run_specs.py} +217 -1330
- helm/benchmark/run_specs/cleva_run_specs.py +277 -0
- helm/benchmark/run_specs/decodingtrust_run_specs.py +314 -0
- helm/benchmark/run_specs/heim_run_specs.py +623 -0
- helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
- helm/benchmark/run_specs/lite_run_specs.py +307 -0
- helm/benchmark/run_specs/simple_run_specs.py +104 -0
- helm/benchmark/run_specs/unitxt_run_specs.py +42 -0
- helm/benchmark/run_specs/vlm_run_specs.py +501 -0
- helm/benchmark/runner.py +116 -69
- helm/benchmark/runner_config_registry.py +21 -0
- helm/benchmark/scenarios/bbq_scenario.py +1 -1
- helm/benchmark/scenarios/bold_scenario.py +2 -2
- helm/benchmark/scenarios/cleva_scenario.py +43 -46
- helm/benchmark/scenarios/code_scenario.py +3 -2
- helm/benchmark/scenarios/commonsense_scenario.py +171 -191
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +169 -0
- helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +121 -0
- helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +77 -0
- helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +324 -0
- helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +204 -0
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +559 -0
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +67 -0
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +78 -0
- helm/benchmark/scenarios/dialogue_scenarios.py +0 -1
- helm/benchmark/scenarios/entity_matching_scenario.py +1 -1
- helm/benchmark/scenarios/image_generation/__init__.py +0 -0
- helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
- helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
- helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
- helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
- helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
- helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
- helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
- helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
- helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
- helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
- helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
- helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
- helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
- helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
- helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
- helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
- helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
- helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
- helm/benchmark/scenarios/imdb_scenario.py +0 -1
- helm/benchmark/scenarios/legalbench_scenario.py +123 -0
- helm/benchmark/scenarios/live_qa_scenario.py +94 -0
- helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
- helm/benchmark/scenarios/lsat_qa_scenario.py +4 -2
- helm/benchmark/scenarios/math_scenario.py +19 -2
- helm/benchmark/scenarios/medication_qa_scenario.py +60 -0
- helm/benchmark/scenarios/numeracy_scenario.py +3 -3
- helm/benchmark/scenarios/opinions_qa_scenario.py +6 -10
- helm/benchmark/scenarios/raft_scenario.py +2 -6
- helm/benchmark/scenarios/scenario.py +14 -2
- helm/benchmark/scenarios/simple_scenarios.py +122 -1
- helm/benchmark/scenarios/test_math_scenario.py +22 -0
- helm/benchmark/scenarios/test_scenario.py +6 -3
- helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +135 -0
- helm/benchmark/scenarios/the_pile_scenario.py +6 -7
- helm/benchmark/scenarios/unitxt_scenario.py +56 -0
- helm/benchmark/scenarios/verifiability_judgment_scenario.py +3 -1
- helm/benchmark/scenarios/vicuna_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
- helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +92 -0
- helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
- helm/benchmark/scenarios/vision_language/image2structure/__init__.py +0 -0
- helm/benchmark/scenarios/vision_language/image2structure/chart2csv_scenario.py +55 -0
- helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +214 -0
- helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +25 -0
- helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +20 -0
- helm/benchmark/scenarios/vision_language/image2structure/utils_latex.py +347 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/driver.py +84 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/jekyll_server.py +182 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/utils.py +31 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +225 -0
- helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
- helm/benchmark/scenarios/vision_language/mme_scenario.py +145 -0
- helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
- helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
- helm/benchmark/scenarios/vision_language/pope_scenario.py +104 -0
- helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +129 -0
- helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
- helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +107 -0
- helm/benchmark/scenarios/vision_language/vqa_scenario.py +1 -1
- helm/benchmark/scenarios/wmt_14_scenario.py +18 -18
- helm/benchmark/server.py +59 -2
- helm/benchmark/slurm_jobs.py +12 -0
- helm/benchmark/slurm_runner.py +79 -51
- helm/benchmark/static/benchmarking.js +3 -4
- helm/benchmark/static/contamination.yaml +1 -1
- helm/benchmark/static/images/organizations/together.png +0 -0
- helm/benchmark/static/json-urls.js +4 -0
- helm/benchmark/static/{schema.yaml → schema_classic.yaml} +346 -930
- helm/benchmark/static/schema_instruction_following.yaml +210 -0
- helm/benchmark/static/schema_lite.yaml +824 -0
- helm/benchmark/static/schema_mmlu.yaml +1507 -0
- helm/benchmark/static/schema_unitxt.yaml +428 -0
- helm/benchmark/static/schema_vlm.yaml +576 -0
- helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
- helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
- helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
- helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
- helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
- helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
- helm/benchmark/static_build/assets/crfm-logo-74391ab8.png +0 -0
- helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
- helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
- helm/benchmark/static_build/assets/heim-logo-3e5e3aa4.png +0 -0
- helm/benchmark/static_build/assets/helm-logo-simple-2ed5400b.png +0 -0
- helm/benchmark/static_build/assets/helmhero-28e90f4d.png +0 -0
- helm/benchmark/static_build/assets/index-5088afcb.css +1 -0
- helm/benchmark/static_build/assets/index-d839df55.js +9 -0
- helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
- helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
- helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
- helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
- helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
- helm/benchmark/static_build/assets/react-d4a0b69b.js +85 -0
- helm/benchmark/static_build/assets/recharts-6d337683.js +97 -0
- helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
- helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
- helm/benchmark/static_build/assets/tremor-54a99cc4.js +10 -0
- helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
- helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
- helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
- helm/benchmark/static_build/config.js +4 -0
- helm/benchmark/static_build/index.html +20 -0
- helm/benchmark/test_data_preprocessor.py +3 -3
- helm/benchmark/test_model_deployment_definition.py +90 -0
- helm/benchmark/test_run_expander.py +1 -1
- helm/benchmark/tokenizer_config_registry.py +10 -14
- helm/benchmark/window_services/ai21_window_service.py +22 -33
- helm/benchmark/window_services/cohere_window_service.py +1 -63
- helm/benchmark/window_services/default_window_service.py +2 -35
- helm/benchmark/window_services/encoder_decoder_window_service.py +0 -11
- helm/benchmark/window_services/ice_window_service.py +0 -34
- helm/benchmark/window_services/image_generation/__init__.py +0 -0
- helm/benchmark/window_services/image_generation/clip_window_service.py +15 -0
- helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
- helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
- helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
- helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
- helm/benchmark/window_services/local_window_service.py +21 -4
- helm/benchmark/window_services/no_decoding_window_service.py +32 -0
- helm/benchmark/window_services/test_anthropic_window_service.py +2 -1
- helm/benchmark/window_services/test_bloom_window_service.py +2 -1
- helm/benchmark/window_services/test_cohere_window_service.py +2 -1
- helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
- helm/benchmark/window_services/test_gpt2_window_service.py +2 -2
- helm/benchmark/window_services/test_gpt4_window_service.py +2 -1
- helm/benchmark/window_services/test_gptj_window_service.py +3 -2
- helm/benchmark/window_services/test_gptneox_window_service.py +3 -2
- helm/benchmark/window_services/test_ice_window_service.py +2 -1
- helm/benchmark/window_services/test_openai_window_service.py +2 -1
- helm/benchmark/window_services/test_opt_window_service.py +3 -2
- helm/benchmark/window_services/test_palmyra_window_service.py +2 -1
- helm/benchmark/window_services/test_t0pp_window_service.py +2 -1
- helm/benchmark/window_services/test_t511b_window_service.py +2 -1
- helm/benchmark/window_services/test_ul2_window_service.py +2 -1
- helm/benchmark/window_services/test_utils.py +3 -2
- helm/benchmark/window_services/test_yalm_window_service.py +2 -1
- helm/benchmark/window_services/window_service.py +42 -0
- helm/benchmark/window_services/window_service_factory.py +24 -269
- helm/benchmark/window_services/yalm_window_service.py +0 -27
- helm/clients/__init__.py +0 -0
- helm/{proxy/clients → clients}/ai21_client.py +5 -12
- helm/clients/aleph_alpha_client.py +112 -0
- helm/{proxy/clients → clients}/anthropic_client.py +213 -24
- helm/clients/auto_client.py +215 -0
- helm/clients/bedrock_client.py +128 -0
- helm/clients/bedrock_utils.py +72 -0
- helm/{proxy/clients → clients}/client.py +67 -55
- helm/clients/clip_score_client.py +49 -0
- helm/clients/clip_scorers/__init__.py +0 -0
- helm/clients/clip_scorers/base_clip_scorer.py +18 -0
- helm/clients/clip_scorers/clip_scorer.py +50 -0
- helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
- helm/{proxy/clients → clients}/cohere_client.py +6 -17
- helm/clients/gcs_client.py +82 -0
- helm/{proxy/clients → clients}/google_client.py +7 -8
- helm/clients/google_translate_client.py +35 -0
- helm/{proxy/clients → clients}/http_model_client.py +6 -10
- helm/{proxy/clients → clients}/huggingface_client.py +134 -92
- helm/clients/image_generation/__init__.py +0 -0
- helm/clients/image_generation/adobe_vision_client.py +78 -0
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +98 -0
- helm/clients/image_generation/cogview2/__init__.py +0 -0
- helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
- helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
- helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
- helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +96 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
- helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +141 -0
- helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
- helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
- helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
- helm/clients/image_generation/cogview2_client.py +191 -0
- helm/clients/image_generation/dalle2_client.py +192 -0
- helm/clients/image_generation/dalle3_client.py +108 -0
- helm/clients/image_generation/dalle_mini/__init__.py +3 -0
- helm/clients/image_generation/dalle_mini/data.py +442 -0
- helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
- helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
- helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
- helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
- helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
- helm/clients/image_generation/dalle_mini/model/text.py +251 -0
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
- helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
- helm/clients/image_generation/dalle_mini_client.py +190 -0
- helm/clients/image_generation/deep_floyd_client.py +78 -0
- helm/clients/image_generation/huggingface_diffusers_client.py +249 -0
- helm/clients/image_generation/image_generation_client_utils.py +9 -0
- helm/clients/image_generation/lexica_client.py +86 -0
- helm/clients/image_generation/mindalle/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/__init__.py +216 -0
- helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
- helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
- helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
- helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
- helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
- helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
- helm/clients/image_generation/mindalle/utils/config.py +129 -0
- helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
- helm/clients/image_generation/mindalle/utils/utils.py +89 -0
- helm/clients/image_generation/mindalle_client.py +115 -0
- helm/clients/image_generation/nudity_check_client.py +64 -0
- helm/clients/image_generation/together_image_generation_client.py +111 -0
- helm/{proxy/clients → clients}/lit_gpt_client.py +7 -5
- helm/{proxy/clients → clients}/megatron_client.py +13 -7
- helm/clients/mistral_client.py +134 -0
- helm/clients/moderation_api_client.py +109 -0
- helm/clients/open_lm_client.py +43 -0
- helm/clients/openai_client.py +302 -0
- helm/{proxy/clients → clients}/palmyra_client.py +15 -12
- helm/{proxy/clients → clients}/perspective_api_client.py +7 -8
- helm/clients/simple_client.py +64 -0
- helm/{proxy/clients → clients}/test_auto_client.py +15 -15
- helm/clients/test_client.py +100 -0
- helm/clients/test_huggingface_client.py +70 -0
- helm/clients/test_simple_client.py +19 -0
- helm/{proxy/clients → clients}/test_together_client.py +23 -12
- helm/{proxy/clients → clients}/together_client.py +18 -71
- helm/clients/vertexai_client.py +391 -0
- helm/clients/vision_language/__init__.py +0 -0
- helm/clients/vision_language/huggingface_vlm_client.py +104 -0
- helm/{proxy/clients → clients}/vision_language/idefics_client.py +59 -52
- helm/clients/vision_language/open_flamingo/__init__.py +2 -0
- helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
- helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
- helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
- helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
- helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
- helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
- helm/clients/vision_language/open_flamingo_client.py +155 -0
- helm/clients/vision_language/qwen_vlm_client.py +171 -0
- helm/clients/vllm_client.py +46 -0
- helm/common/cache.py +24 -179
- helm/common/cache_backend_config.py +47 -0
- helm/common/clip_score_request.py +41 -0
- helm/common/concurrency.py +32 -0
- helm/common/credentials_utils.py +28 -0
- helm/common/file_caches/__init__.py +0 -0
- helm/common/file_caches/file_cache.py +16 -0
- helm/common/file_caches/local_file_cache.py +61 -0
- helm/common/file_caches/test_local_file_cache.py +25 -0
- helm/common/file_upload_request.py +27 -0
- helm/common/general.py +29 -10
- helm/common/image_generation_parameters.py +25 -0
- helm/common/images_utils.py +24 -1
- helm/common/key_value_store.py +113 -0
- helm/common/media_object.py +13 -0
- helm/common/moderations_api_request.py +71 -0
- helm/common/mongo_key_value_store.py +88 -0
- helm/common/multimodal_request_utils.py +31 -0
- helm/common/nudity_check_request.py +29 -0
- helm/common/object_spec.py +2 -2
- helm/common/request.py +36 -27
- helm/common/test_general.py +6 -0
- helm/common/tokenization_request.py +6 -3
- helm/config/__init__.py +0 -0
- helm/config/model_deployments.yaml +1942 -0
- helm/config/model_metadata.yaml +2201 -0
- helm/config/tokenizer_configs.yaml +362 -0
- helm/proxy/accounts.py +31 -4
- helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
- helm/proxy/critique/model_critique_client.py +13 -5
- helm/proxy/example_queries.py +29 -17
- helm/proxy/retry.py +8 -2
- helm/proxy/server.py +77 -5
- helm/proxy/services/remote_service.py +31 -0
- helm/proxy/services/server_service.py +103 -20
- helm/proxy/services/service.py +34 -2
- helm/proxy/services/test_remote_service.py +7 -6
- helm/proxy/services/test_service.py +27 -18
- helm/proxy/test_accounts.py +32 -0
- helm/proxy/token_counters/auto_token_counter.py +37 -37
- helm/proxy/token_counters/test_auto_token_counter.py +164 -0
- helm/proxy/token_counters/token_counter.py +3 -5
- helm/py.typed +0 -0
- helm/tokenizers/__init__.py +0 -0
- helm/{proxy/tokenizers → tokenizers}/ai21_tokenizer.py +3 -3
- helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +3 -1
- helm/{proxy/tokenizers → tokenizers}/anthropic_tokenizer.py +17 -11
- helm/tokenizers/auto_tokenizer.py +93 -0
- helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +8 -2
- helm/{proxy/tokenizers → tokenizers}/cohere_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +3 -3
- helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +56 -60
- helm/tokenizers/simple_tokenizer.py +33 -0
- helm/tokenizers/test_anthropic_tokenizer.py +82 -0
- helm/tokenizers/test_huggingface_tokenizer.py +136 -0
- helm/tokenizers/test_simple_tokenizer.py +33 -0
- helm/tokenizers/vertexai_tokenizer.py +97 -0
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +5 -3
- helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
- helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
- crfm_helm-0.3.0.dist-info/RECORD +0 -396
- helm/benchmark/vlm_run_specs.py +0 -71
- helm/benchmark/window_services/anthropic_window_service.py +0 -68
- helm/benchmark/window_services/bloom_window_service.py +0 -35
- helm/benchmark/window_services/flan_t5_window_service.py +0 -29
- helm/benchmark/window_services/gpt2_window_service.py +0 -32
- helm/benchmark/window_services/gptj_window_service.py +0 -38
- helm/benchmark/window_services/gptneox_window_service.py +0 -41
- helm/benchmark/window_services/http_model_window_service.py +0 -28
- helm/benchmark/window_services/huggingface_window_service.py +0 -59
- helm/benchmark/window_services/lit_gpt_window_service.py +0 -27
- helm/benchmark/window_services/llama_window_service.py +0 -28
- helm/benchmark/window_services/luminous_window_service.py +0 -67
- helm/benchmark/window_services/megatron_window_service.py +0 -10
- helm/benchmark/window_services/mt_nlg_window_service.py +0 -27
- helm/benchmark/window_services/openai_window_service.py +0 -13
- helm/benchmark/window_services/opt_window_service.py +0 -35
- helm/benchmark/window_services/palmyra_window_service.py +0 -45
- helm/benchmark/window_services/remote_window_service.py +0 -48
- helm/benchmark/window_services/santacoder_window_service.py +0 -27
- helm/benchmark/window_services/starcoder_window_service.py +0 -27
- helm/benchmark/window_services/t0pp_window_service.py +0 -35
- helm/benchmark/window_services/t511b_window_service.py +0 -30
- helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
- helm/benchmark/window_services/ul2_window_service.py +0 -30
- helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
- helm/benchmark/window_services/wider_openai_window_service.py +0 -52
- helm/proxy/clients/aleph_alpha_client.py +0 -99
- helm/proxy/clients/auto_client.py +0 -461
- helm/proxy/clients/goose_ai_client.py +0 -100
- helm/proxy/clients/microsoft_client.py +0 -182
- helm/proxy/clients/openai_client.py +0 -206
- helm/proxy/clients/remote_model_registry.py +0 -28
- helm/proxy/clients/simple_client.py +0 -61
- helm/proxy/clients/test_anthropic_client.py +0 -63
- helm/proxy/clients/test_client.py +0 -31
- helm/proxy/clients/test_huggingface_client.py +0 -87
- helm/proxy/models.py +0 -963
- helm/proxy/test_models.py +0 -27
- helm/proxy/token_counters/ai21_token_counter.py +0 -20
- helm/proxy/token_counters/cohere_token_counter.py +0 -13
- helm/proxy/token_counters/free_token_counter.py +0 -12
- helm/proxy/token_counters/gooseai_token_counter.py +0 -24
- helm/proxy/token_counters/openai_token_counter.py +0 -22
- helm/proxy/token_counters/test_ai21_token_counter.py +0 -86
- helm/proxy/token_counters/test_openai_token_counter.py +0 -79
- helm/proxy/tokenizers/simple_tokenizer.py +0 -32
- helm/proxy/tokenizers/test_huggingface_tokenizer.py +0 -56
- {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/LICENSE +0 -0
- {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/top_level.txt +0 -0
- /helm/{proxy/clients → benchmark/annotation}/__init__.py +0 -0
- /helm/{proxy/clients/vision_language → benchmark/annotation/image2structure}/__init__.py +0 -0
- /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
- /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
- /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
- /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
- /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
- /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/ice_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/test_ice_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +0 -0
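One practical consequence of the renames above: client and tokenizer modules moved out of the `helm.proxy` package (`helm/{proxy/clients → clients}`, `helm/{proxy/tokenizers → tokenizers}`), so imports written against 0.3.0 need updating. A minimal illustration, using module paths taken from the rename entries above (the class names are the conventional ones for those modules and are shown for illustration):

    # 0.3.0
    from helm.proxy.clients.huggingface_client import HuggingFaceClient
    from helm.proxy.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer

    # 0.5.0
    from helm.clients.huggingface_client import HuggingFaceClient
    from helm.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer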
helm/{proxy/clients → clients}/anthropic_client.py

@@ -1,4 +1,4 @@
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List, Optional, TypedDict, Union, cast
 import json
 import requests
 import time
@@ -6,13 +6,14 @@ import urllib.parse

 from helm.common.cache import CacheConfig
 from helm.common.hierarchical_logger import htrack_block, hlog
+from helm.common.media_object import IMAGE_TYPE, TEXT_TYPE
 from helm.common.optional_dependencies import handle_module_not_found_error
 from helm.common.request import (
     wrap_request_time,
     EMBEDDING_UNAVAILABLE_REQUEST_RESULT,
     Request,
     RequestResult,
-    Sequence,
+    GeneratedOutput,
     Token,
     ErrorFlags,
 )
@@ -20,16 +21,30 @@ from helm.common.tokenization_request import (
     TokenizationRequest,
     TokenizationRequestResult,
 )
-from helm.proxy.…
-from .…
+from helm.proxy.retry import NonRetriableException
+from helm.tokenizers.tokenizer import Tokenizer
+from helm.clients.client import CachingClient, truncate_sequence, truncate_and_tokenize_response_text

 try:
-    import anthropic
+    from anthropic import Anthropic, BadRequestError
+    from anthropic.types import MessageParam
+    from anthropic.types.image_block_param import ImageBlockParam
+    from anthropic.types.text_block_param import TextBlockParam
     import websocket
 except ModuleNotFoundError as e:
     handle_module_not_found_error(e, ["anthropic"])


+class AnthropicCompletionRequest(TypedDict):
+    prompt: str
+    stop_sequences: List[str]
+    model: str
+    max_tokens_to_sample: int
+    temperature: float
+    top_p: float
+    top_k: int
+
+
 class AnthropicClient(CachingClient):
     """
     Client for the Anthropic models (https://arxiv.org/abs/2204.05862).
@@ -56,15 +71,19 @@ class AnthropicClient(CachingClient):
     ADDITIONAL_TOKENS: int = 5
     PROMPT_ANSWER_START: str = "The answer is "

-    def __init__(…
-    …
+    def __init__(
+        self, tokenizer: Tokenizer, tokenizer_name: str, cache_config: CacheConfig, api_key: Optional[str] = None
+    ):
+        super().__init__(cache_config=cache_config)
+        self.tokenizer = tokenizer
+        self.tokenizer_name = tokenizer_name
         self.api_key: Optional[str] = api_key
-        self.…
+        self.client = Anthropic(api_key=api_key)

-    def _send_request(self, raw_request: …
+    def _send_request(self, raw_request: AnthropicCompletionRequest) -> Dict[str, Any]:
         if self.api_key is None:
             raise Exception("API key is not set. Please set it in the HELM config file.")
-        result = self.…
+        result = self.client.completions.create(**raw_request).model_dump()
         assert "error" not in result, f"Request failed with error: {result['error']}"
         return result

@@ -99,7 +118,7 @@ class AnthropicClient(CachingClient):
         if request.max_tokens == 0 and not request.echo_prompt:
             raise ValueError("echo_prompt must be True when max_tokens=0.")

-        raw_request = {
+        raw_request: AnthropicCompletionRequest = {
             "prompt": request.prompt,
             "stop_sequences": request.stop_sequences,
             "model": request.model_engine,
@@ -109,7 +128,7 @@ class AnthropicClient(CachingClient):
             "top_k": request.top_k_per_token,
         }

-        completions: List[Sequence] = []
+        completions: List[GeneratedOutput] = []

         # `num_completions` is not supported, so instead make `num_completions` separate requests.
         for completion_index in range(request.num_completions):
@@ -164,15 +183,13 @@ class AnthropicClient(CachingClient):
         # The Anthropic API doesn't return us tokens or logprobs, so we tokenize ourselves.
         tokenization_result: TokenizationRequestResult = self.tokenizer.tokenize(
             # Anthropic uses their own tokenizer
-            TokenizationRequest(text, tokenizer=…
+            TokenizationRequest(text, tokenizer=self.tokenizer_name)
         )

         # Log probs are not currently not supported by the Anthropic, so set to 0 for now.
-        tokens: List[Token] = [
-            Token(text=str(text), logprob=0, top_logprobs={}) for text in tokenization_result.raw_tokens
-        ]
+        tokens: List[Token] = [Token(text=str(text), logprob=0) for text in tokenization_result.raw_tokens]

-        completion = Sequence(text=response["completion"], logprob=0, tokens=tokens)
+        completion = GeneratedOutput(text=response["completion"], logprob=0, tokens=tokens)
         # See NOTE() in _filter_completion() to understand why warnings are printed for truncation.
         # TODO(#1512): Fix this with post-processing.
         sequence = truncate_sequence(completion, request, print_warning=True)
@@ -188,6 +205,179 @@ class AnthropicClient(CachingClient):
         )


+def _is_content_moderation_failure(response: Dict) -> bool:
+    """Return whether a a response failed because of the content moderation filter."""
+    if (
+        "error" in response
+        and "message" in response["error"]
+        and response["error"]["message"] == "Output blocked by content filtering policy"
+    ):
+        hlog(f"Anthropic - output blocked by content filtering policy: {response}")
+        return True
+    return False
+
+
+class AnthropicMessagesRequest(TypedDict, total=False):
+    messages: List[MessageParam]
+    model: str
+    stop_sequences: List[str]
+    system: str
+    max_tokens: int
+    temperature: float
+    top_k: int
+    top_p: float
+
+
+class AnthropicMessagesRequestError(NonRetriableException):
+    pass
+
+
+class AnthropicMessagesResponseError(Exception):
+    pass
+
+
+class AnthropicMessagesClient(CachingClient):
+    # Source: https://docs.anthropic.com/claude/docs/models-overview
+    MAX_OUTPUT_TOKENS = 4096
+
+    def __init__(
+        self, tokenizer: Tokenizer, tokenizer_name: str, cache_config: CacheConfig, api_key: Optional[str] = None
+    ):
+        super().__init__(cache_config=cache_config)
+        self.tokenizer = tokenizer
+        self.tokenizer_name = tokenizer_name
+        self.client = Anthropic(api_key=api_key)
+        self.api_key: Optional[str] = api_key
+
+    def make_request(self, request: Request) -> RequestResult:
+        if request.max_tokens > AnthropicMessagesClient.MAX_OUTPUT_TOKENS:
+            raise AnthropicMessagesRequestError(
+                f"Request.max_tokens must be <= {AnthropicMessagesClient.MAX_OUTPUT_TOKENS}"
+            )
+
+        messages: List[MessageParam] = []
+        system_message: Optional[MessageParam] = None
+
+        if request.messages is not None:
+            # TODO(#2439): Refactor out Request validation
+            if request.multimodal_prompt is not None or request.prompt:
+                raise AnthropicMessagesRequestError(
+                    "Exactly one of Request.messages, Request.prompt or Request.multimodel_prompt should be set"
+                )
+            messages = cast(List[MessageParam], request.messages)
+            if messages[0]["role"] == "system":
+                system_message = messages[0]
+                messages = messages[1:]
+
+        elif request.multimodal_prompt is not None:
+            # TODO(#2439): Refactor out Request validation
+            if request.messages is not None or request.prompt:
+                raise AnthropicMessagesRequestError(
+                    "Exactly one of Request.messages, Request.prompt or Request.multimodel_prompt should be set"
+                )
+            blocks: List[Union[TextBlockParam, ImageBlockParam]] = []
+            for media_object in request.multimodal_prompt.media_objects:
+                if media_object.is_type(IMAGE_TYPE):
+                    # TODO(#2439): Refactor out Request validation
+                    if not media_object.location:
+                        raise Exception("MediaObject of image type has missing location field value")
+
+                    from helm.common.images_utils import encode_base64
+
+                    base64_image: str = encode_base64(media_object.location, format="JPEG")
+                    image_block: ImageBlockParam = {
+                        "type": "image",
+                        "source": {
+                            "type": "base64",
+                            "media_type": "image/jpeg",
+                            "data": base64_image,
+                        },
+                    }
+                    blocks.append(image_block)
+                if media_object.is_type(TEXT_TYPE):
+                    # TODO(#2439): Refactor out Request validation
+                    if media_object.text is None:
+                        raise ValueError("MediaObject of text type has missing text field value")
+                    text_block: TextBlockParam = {
+                        "type": "text",
+                        "text": media_object.text,
+                    }
+                    blocks.append(text_block)
+            messages = [{"role": "user", "content": blocks}]
+
+        else:
+            messages = [{"role": "user", "content": request.prompt}]
+
+        raw_request: AnthropicMessagesRequest = {
+            "messages": messages,
+            "model": request.model_engine,
+            "stop_sequences": request.stop_sequences,
+            "max_tokens": request.max_tokens,
+            "temperature": request.temperature,
+            "top_p": request.top_p,
+            "top_k": request.top_k_per_token,
+        }
+        if system_message is not None:
+            raw_request["system"] = cast(str, system_message["content"])
+        completions: List[GeneratedOutput] = []
+
+        # `num_completions` is not supported, so instead make `num_completions` separate requests.
+        for completion_index in range(request.num_completions):
+
+            def do_it() -> Dict[str, Any]:
+                try:
+                    result = self.client.messages.create(**raw_request).model_dump()
+                    if "content" not in result or not result["content"]:
+                        raise AnthropicMessagesResponseError(f"Anthropic response has empty content: {result}")
+                    elif "text" not in result["content"][0]:
+                        raise AnthropicMessagesResponseError(f"Anthropic response has non-text content: {result}")
+                    return result
+                except BadRequestError as e:
+                    response = e.response.json()
+                    if _is_content_moderation_failure(response):
+                        return response
+                    raise
+
+            cache_key = CachingClient.make_cache_key(
+                {
+                    "completion_index": completion_index,
+                    **raw_request,
+                },
+                request,
+            )
+            response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
+
+            if _is_content_moderation_failure(response):
+                hlog(
+                    f"WARNING: Returning empty request for {request.model_deployment} "
+                    "due to content moderation filter"
+                )
+                return RequestResult(
+                    success=False,
+                    cached=cached,
+                    error=response["error"]["message"],
+                    completions=[],
+                    embedding=[],
+                    error_flags=ErrorFlags(is_retriable=False, is_fatal=False),
+                    request_time=response["request_time"],
+                    request_datetime=response["request_datetime"],
+                )
+
+            completion = truncate_and_tokenize_response_text(
+                response["content"][0]["text"], request, self.tokenizer, self.tokenizer_name, original_finish_reason=""
+            )
+            completions.append(completion)
+
+        return RequestResult(
+            success=True,
+            cached=cached,
+            request_time=response["request_time"],
+            request_datetime=response["request_datetime"],
+            completions=completions,
+            embedding=[],
+        )
+
+
 class AnthropicRequestError(Exception):
     pass

@@ -239,9 +429,9 @@ class AnthropicLegacyClient(CachingClient):
        hlog(f"Invalid logprobs response: {raw_response}")
        return False

-    def __init__(self, api_key: str, …
+    def __init__(self, api_key: str, cache_config: CacheConfig):
         hlog("This client is deprecated. Please use AnthropicClient instead.")
-        super().__init__(cache_config=cache_config…
+        super().__init__(cache_config=cache_config)
         self.api_key = api_key

     def make_request(self, request: Request) -> RequestResult:
@@ -249,7 +439,7 @@ class AnthropicLegacyClient(CachingClient):
         if request.embedding:
             return EMBEDDING_UNAVAILABLE_REQUEST_RESULT
         # Validate the fields of `Request`
-        if request.…
+        if request.model_engine != "stanford-online-all-v4-s3":
             raise ValueError(f"Invalid model: {request.model}")
         if request.max_tokens > AnthropicLegacyClient.MAX_COMPLETION_LENGTH:
             raise ValueError(
@@ -390,7 +580,7 @@ class AnthropicLegacyClient(CachingClient):

         # Since Anthropic doesn't support multiple completions, we have to manually call it multiple times,
         # and aggregate the results into `completions` and `request_time`.
-        completions: List[Sequence] = []
+        completions: List[GeneratedOutput] = []
         all_cached = True
         request_time = 0
         request_datetime: Optional[int] = None
@@ -423,8 +613,7 @@ class AnthropicLegacyClient(CachingClient):
         for text, token_logprob, all_logprobs, all_tokens in zip(
             log_probs["tokens"], log_probs["logprobs"], log_probs["topk_logprobs"], log_probs["topk_tokens"]
         ):
-            …
-            tokens.append(Token(text=text, logprob=token_logprob, top_logprobs=top_logprobs))
+            tokens.append(Token(text=text, logprob=token_logprob))
             sequence_logprob += token_logprob

         finish_reason: str = response["stop_reason"]
@@ -432,7 +621,7 @@ class AnthropicLegacyClient(CachingClient):
         if finish_reason == AnthropicLegacyClient.STOP_SEQUENCE_STOP_REASON:
             finish_reason = "stop"

-        completion = Sequence(
+        completion = GeneratedOutput(
             text=response["text"],
             logprob=sequence_logprob,
             tokens=tokens,
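For orientation: the new `AnthropicMessagesClient` above drives the Anthropic SDK's Messages endpoint (`client.messages.create`), whereas `AnthropicClient` still uses the completions endpoint. A minimal sketch of the call shape it assembles (the API key, model name, and prompt here are illustrative placeholders, not values from the package):

    from anthropic import Anthropic

    client = Anthropic(api_key="sk-...")  # placeholder key
    # A leading system turn is split out into `system`, as in make_request() above.
    result = client.messages.create(
        model="claude-3-opus-20240229",  # illustrative model name
        system="You are a concise assistant.",
        messages=[{"role": "user", "content": "Say hello."}],
        max_tokens=128,
    ).model_dump()
    # The client reads the generated text the same way:
    text = result["content"][0]["text"]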
helm/clients/auto_client.py (new file)

@@ -0,0 +1,215 @@
+from dataclasses import replace
+import os
+from typing import Any, Dict, Mapping, Optional
+
+from retrying import Attempt, RetryError
+
+from helm.benchmark.model_deployment_registry import ModelDeployment, get_model_deployment
+from helm.common.file_caches.file_cache import FileCache
+from helm.common.file_caches.local_file_cache import LocalFileCache
+from helm.common.credentials_utils import provide_api_key
+from helm.common.cache_backend_config import CacheBackendConfig, CacheConfig
+from helm.common.hierarchical_logger import hlog
+from helm.common.object_spec import create_object, inject_object_spec_args
+from helm.common.request import Request, RequestResult
+from helm.clients.client import Client
+from helm.clients.moderation_api_client import ModerationAPIClient
+from helm.proxy.critique.critique_client import CritiqueClient
+from helm.clients.toxicity_classifier_client import ToxicityClassifierClient
+from helm.proxy.retry import NonRetriableException, retry_request
+from helm.tokenizers.auto_tokenizer import AutoTokenizer
+
+
+class AuthenticationError(NonRetriableException):
+    pass
+
+
+class AutoClient(Client):
+    """Automatically dispatch to the proper `Client` based on the model deployment name."""
+
+    def __init__(
+        self, credentials: Mapping[str, Any], file_storage_path: str, cache_backend_config: CacheBackendConfig
+    ):
+        self._auto_tokenizer = AutoTokenizer(credentials, cache_backend_config)
+        self.credentials = credentials
+        self.file_storage_path = file_storage_path
+        self.cache_backend_config = cache_backend_config
+        self.clients: Dict[str, Client] = {}
+        self._critique_client: Optional[CritiqueClient] = None
+        hlog(f"AutoClient: file_storage_path = {file_storage_path}")
+        hlog(f"AutoClient: cache_backend_config = {cache_backend_config}")
+
+    def _get_client(self, model_deployment_name: str) -> Client:
+        """Return a client based on the model, creating it if necessary."""
+        # First try to find the client in the cache
+        client: Optional[Client] = self.clients.get(model_deployment_name)
+        if client is not None:
+            return client
+
+        # Otherwise, create the client
+        model_deployment: ModelDeployment = get_model_deployment(model_deployment_name)
+        if model_deployment:
+            # Perform dependency injection to fill in remaining arguments.
+            # Dependency injection is needed here for these reasons:
+            #
+            # 1. Different clients have different parameters. Dependency injection provides arguments
+            #    that match the parameters of the client.
+            # 2. Some arguments, such as the tokenizer, are not static data objects that can be
+            #    in the users configuration file. Instead, they have to be constructed dynamically at
+            #    runtime.
+            # 3. The providers must be lazily-evaluated, because eager evaluation can result in an
+            #    exception. For instance, some clients do not require an API key, so trying to fetch
+            #    the API key from configuration eagerly will result in an exception because the user
+            #    will not have configured an API key.
+
+            # Prepare a cache
+            host_organization: str = model_deployment.host_organization
+            cache_config: CacheConfig = self.cache_backend_config.get_cache_config(host_organization)
+
+            client_spec = inject_object_spec_args(
+                model_deployment.client_spec,
+                constant_bindings={
+                    "cache_config": cache_config,
+                    "tokenizer_name": model_deployment.tokenizer_name,
+                },
+                provider_bindings={
+                    "api_key": lambda: provide_api_key(self.credentials, host_organization, model_deployment_name),
+                    "tokenizer": lambda: self._auto_tokenizer._get_tokenizer(
+                        tokenizer_name=model_deployment.tokenizer_name or model_deployment.name
+                    ),
+                    "org_id": lambda: self.credentials.get(
+                        host_organization + "OrgId", None
+                    ),  # OpenAI, GooseAI, Microsoft
+                    "moderation_api_client": lambda: self.get_moderation_api_client(),  # OpenAI DALL-E
+                    "lock_file_path": lambda: os.path.join(
+                        self.file_storage_path, f"{host_organization}.lock"
+                    ),  # Microsoft
+                    "project_id": lambda: self.credentials.get(host_organization + "ProjectId", None),  # VertexAI
+                    "location": lambda: self.credentials.get(host_organization + "Location", None),  # VertexAI
+                    "hf_auth_token": lambda: self.credentials.get("huggingfaceAuthToken", None),  # HuggingFace
+                    "file_cache": lambda: self._get_file_cache(host_organization),  # Text-to-image models
+                },
+            )
+            client = create_object(client_spec)
+        else:
+            raise ValueError(f"Could not find client for model deployment: {model_deployment_name}")
+
+        # Cache the client
+        self.clients[model_deployment_name] = client
+
+        return client
+
+    def make_request(self, request: Request) -> RequestResult:
+        """
+        Dispatch based on the name of the model (e.g., openai/davinci).
+        Retries if request fails.
+        """
+
+        # TODO: need to revisit this because this swallows up any exceptions that are raised.
+        @retry_request
+        def make_request_with_retry(client: Client, request: Request) -> RequestResult:
+            return client.make_request(request)
+
+        client: Client = self._get_client(request.model_deployment)
+
+        try:
+            return make_request_with_retry(client=client, request=request)
+        except RetryError as e:
+            last_attempt: Attempt = e.last_attempt
+            retry_error: str = (
+                f"Failed to make request to {request.model_deployment} after retrying "
+                f"{last_attempt.attempt_number} times"
+            )
+            hlog(retry_error)
+
+            # Notify our user that we failed to make the request even after retrying.
+            return replace(last_attempt.value, error=f"{retry_error}. Error: {last_attempt.value.error}")
+
+    def get_gcs_client(self):
+        from .gcs_client import GCSClient
+
+        bucket_name: str = self.credentials["gcsBucketName"]
+        cache_config: CacheConfig = self.cache_backend_config.get_cache_config("gcs")
+        return GCSClient(bucket_name, cache_config)
+
+    def get_nudity_check_client(self):
+        from helm.clients.image_generation.nudity_check_client import NudityCheckClient
+
+        cache_config: CacheConfig = self.cache_backend_config.get_cache_config("nudity")
+        return NudityCheckClient(cache_config)
+
+    def get_clip_score_client(self):
+        from .clip_score_client import CLIPScoreClient
+
+        cache_config: CacheConfig = self.cache_backend_config.get_cache_config("clip_score")
+        return CLIPScoreClient(cache_config)
+
+    def get_toxicity_classifier_client(self) -> ToxicityClassifierClient:
+        """Get the toxicity classifier client. We currently only support Perspective API."""
+        from helm.clients.perspective_api_client import PerspectiveAPIClient
+
+        cache_config: CacheConfig = self.cache_backend_config.get_cache_config("perspectiveapi")
+        return PerspectiveAPIClient(self.credentials.get("perspectiveApiKey", ""), cache_config)
+
+    def get_moderation_api_client(self) -> ModerationAPIClient:
+        """Get the ModerationAPI client."""
+        cache_config: CacheConfig = self.cache_backend_config.get_cache_config("ModerationAPI")
+        return ModerationAPIClient(self.credentials.get("openaiApiKey", ""), cache_config)
+
+    def get_critique_client(self) -> CritiqueClient:
+        """Get the critique client."""
+        if self._critique_client:
+            return self._critique_client
+        critique_type = self.credentials.get("critiqueType")
+        if critique_type == "random":
+            from helm.proxy.critique.critique_client import RandomCritiqueClient
+
+            self._critique_client = RandomCritiqueClient()
+        elif critique_type == "mturk":
+            from helm.proxy.critique.mechanical_turk_critique_client import (
+                MechanicalTurkCritiqueClient,
+            )
+
+            self._critique_client = MechanicalTurkCritiqueClient()
+        elif critique_type == "surgeai":
+            from helm.proxy.critique.surge_ai_critique_client import (
+                SurgeAICritiqueClient,
+            )
+
+            surgeai_credentials = self.credentials.get("surgeaiApiKey")
+            if not surgeai_credentials:
+                raise ValueError("surgeaiApiKey credentials are required for SurgeAICritiqueClient")
+            self._critique_client = SurgeAICritiqueClient(
+                surgeai_credentials, self.cache_backend_config.get_cache_config("surgeai")
+            )
+        elif critique_type == "model":
+            from helm.proxy.critique.model_critique_client import ModelCritiqueClient
+
+            model_name: Optional[str] = self.credentials.get("critiqueModelName")
+            if model_name is None:
+                raise ValueError("critiqueModelName is required for ModelCritiqueClient")
+            client: Client = self._get_client(model_name)
+            self._critique_client = ModelCritiqueClient(client, model_name)
+        elif critique_type == "scale":
+            from helm.proxy.critique.scale_critique_client import ScaleCritiqueClient
+
+            scale_credentials = self.credentials.get("scaleApiKey")
+            scale_project = self.credentials.get("scaleProject", None)
+            if not scale_project:
+                raise ValueError("scaleProject is required for ScaleCritiqueClient.")
+            if not scale_credentials:
+                raise ValueError("scaleApiKey is required for ScaleCritiqueClient")
+            self._critique_client = ScaleCritiqueClient(
+                scale_credentials, self.cache_backend_config.get_cache_config("scale"), scale_project
+            )
+        else:
+            raise ValueError(
+                "CritiqueClient is not configured; set critiqueType to 'mturk',"
+                "'mturk-sandbox', 'surgeai', 'scale' or 'random'"
+            )
+        return self._critique_client
+
+    def _get_file_cache(self, host_organization: str) -> FileCache:
+        # Initialize `FileCache` for text-to-image model APIs
+        local_file_cache_path: str = os.path.join(self.file_storage_path, "output", host_organization)
+        return LocalFileCache(local_file_cache_path, file_extension="png")