crfm-helm 0.3.0__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/METADATA +144 -36
- crfm_helm-0.5.0.dist-info/RECORD +642 -0
- {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +37 -2
- helm/benchmark/adaptation/adapters/adapter.py +4 -42
- helm/benchmark/adaptation/adapters/adapter_factory.py +24 -27
- helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/generation_adapter.py +2 -0
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +21 -4
- helm/benchmark/adaptation/adapters/language_modeling_adapter.py +12 -5
- helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +104 -0
- helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +5 -1
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/test_adapter.py +2 -1
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +59 -14
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +40 -5
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +78 -10
- helm/benchmark/adaptation/common_adapter_specs.py +376 -0
- helm/benchmark/adaptation/prompt.py +7 -1
- helm/benchmark/adaptation/request_state.py +6 -1
- helm/benchmark/adaptation/scenario_state.py +6 -2
- helm/benchmark/annotation/annotator.py +43 -0
- helm/benchmark/annotation/annotator_factory.py +61 -0
- helm/benchmark/annotation/image2structure/image_compiler_annotator.py +88 -0
- helm/benchmark/annotation/image2structure/latex_compiler_annotator.py +59 -0
- helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +84 -0
- helm/benchmark/annotation/image2structure/webpage_compiler_annotator.py +132 -0
- helm/benchmark/annotation/test_annotator_factory.py +26 -0
- helm/benchmark/annotation/test_dummy_annotator.py +44 -0
- helm/benchmark/annotation_executor.py +124 -0
- helm/benchmark/augmentations/cleva_perturbation.py +7 -14
- helm/benchmark/augmentations/contraction_expansion_perturbation.py +3 -3
- helm/benchmark/augmentations/contrast_sets_perturbation.py +0 -3
- helm/benchmark/augmentations/data_augmenter.py +0 -2
- helm/benchmark/augmentations/dialect_perturbation.py +2 -2
- helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
- helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
- helm/benchmark/augmentations/gender_perturbation.py +3 -3
- helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
- helm/benchmark/augmentations/mild_mix_perturbation.py +2 -2
- helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
- helm/benchmark/augmentations/person_name_perturbation.py +0 -7
- helm/benchmark/augmentations/perturbation.py +20 -7
- helm/benchmark/augmentations/perturbation_description.py +1 -1
- helm/benchmark/augmentations/space_perturbation.py +2 -2
- helm/benchmark/augmentations/suffix_perturbation.py +29 -0
- helm/benchmark/augmentations/synonym_perturbation.py +2 -2
- helm/benchmark/augmentations/test_perturbation.py +11 -7
- helm/benchmark/augmentations/translate_perturbation.py +30 -0
- helm/benchmark/augmentations/typos_perturbation.py +2 -2
- helm/benchmark/config_registry.py +38 -0
- helm/benchmark/executor.py +46 -16
- helm/benchmark/huggingface_registration.py +37 -7
- helm/benchmark/metrics/basic_metrics.py +172 -641
- helm/benchmark/metrics/bbq_metrics.py +3 -4
- helm/benchmark/metrics/bias_metrics.py +6 -6
- helm/benchmark/metrics/classification_metrics.py +11 -8
- helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
- helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
- helm/benchmark/metrics/code_metrics.py +4 -3
- helm/benchmark/metrics/code_metrics_helper.py +0 -2
- helm/benchmark/metrics/common_metric_specs.py +167 -0
- helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
- helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
- helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
- helm/benchmark/metrics/disinformation_metrics.py +6 -112
- helm/benchmark/metrics/dry_run_metrics.py +5 -3
- helm/benchmark/metrics/efficiency_metrics.py +206 -0
- helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
- helm/benchmark/metrics/evaluate_reference_metrics.py +376 -0
- helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
- helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
- helm/benchmark/metrics/image_generation/clip_score_metrics.py +73 -0
- helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
- helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
- helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
- helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
- helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
- helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
- helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
- helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
- helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
- helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
- helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
- helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
- helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
- helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
- helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
- helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
- helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
- helm/benchmark/metrics/image_generation/q16/test_q16.py +18 -0
- helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
- helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
- helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
- helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
- helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
- helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
- helm/benchmark/metrics/instruction_following_critique_metrics.py +3 -1
- helm/benchmark/metrics/language_modeling_metrics.py +99 -0
- helm/benchmark/metrics/machine_translation_metrics.py +5 -5
- helm/benchmark/metrics/metric.py +93 -172
- helm/benchmark/metrics/metric_name.py +0 -1
- helm/benchmark/metrics/metric_service.py +16 -0
- helm/benchmark/metrics/paraphrase_generation_metrics.py +3 -4
- helm/benchmark/metrics/ranking_metrics.py +6 -7
- helm/benchmark/metrics/reference_metric.py +148 -0
- helm/benchmark/metrics/summac/model_summac.py +0 -2
- helm/benchmark/metrics/summarization_metrics.py +8 -8
- helm/benchmark/metrics/test_classification_metrics.py +9 -6
- helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
- helm/benchmark/metrics/test_evaluate_reference_metrics.py +30 -0
- helm/benchmark/metrics/test_metric.py +2 -2
- helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +13 -3
- helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +2 -0
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +9 -2
- helm/benchmark/metrics/toxicity_metrics.py +1 -1
- helm/benchmark/metrics/toxicity_utils.py +23 -0
- helm/benchmark/metrics/unitxt_metrics.py +81 -0
- helm/benchmark/metrics/vision_language/__init__.py +0 -0
- helm/benchmark/metrics/vision_language/emd_utils.py +341 -0
- helm/benchmark/metrics/vision_language/image_metrics.py +450 -0
- helm/benchmark/metrics/vision_language/image_utils.py +100 -0
- helm/benchmark/model_deployment_registry.py +164 -41
- helm/benchmark/model_metadata_registry.py +181 -35
- helm/benchmark/multi_gpu_runner.py +133 -0
- helm/benchmark/presentation/contamination.py +3 -3
- helm/benchmark/presentation/create_plots.py +8 -7
- helm/benchmark/presentation/run_display.py +50 -17
- helm/benchmark/presentation/schema.py +28 -46
- helm/benchmark/presentation/summarize.py +213 -96
- helm/benchmark/presentation/table.py +8 -8
- helm/benchmark/presentation/test_contamination.py +2 -2
- helm/benchmark/presentation/test_run_entry.py +14 -9
- helm/benchmark/presentation/test_summarize.py +5 -0
- helm/benchmark/run.py +66 -54
- helm/benchmark/run_expander.py +342 -31
- helm/benchmark/run_spec.py +93 -0
- helm/benchmark/run_spec_factory.py +162 -0
- helm/benchmark/run_specs/__init__.py +0 -0
- helm/benchmark/{run_specs.py → run_specs/classic_run_specs.py} +217 -1330
- helm/benchmark/run_specs/cleva_run_specs.py +277 -0
- helm/benchmark/run_specs/decodingtrust_run_specs.py +314 -0
- helm/benchmark/run_specs/heim_run_specs.py +623 -0
- helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
- helm/benchmark/run_specs/lite_run_specs.py +307 -0
- helm/benchmark/run_specs/simple_run_specs.py +104 -0
- helm/benchmark/run_specs/unitxt_run_specs.py +42 -0
- helm/benchmark/run_specs/vlm_run_specs.py +501 -0
- helm/benchmark/runner.py +116 -69
- helm/benchmark/runner_config_registry.py +21 -0
- helm/benchmark/scenarios/bbq_scenario.py +1 -1
- helm/benchmark/scenarios/bold_scenario.py +2 -2
- helm/benchmark/scenarios/cleva_scenario.py +43 -46
- helm/benchmark/scenarios/code_scenario.py +3 -2
- helm/benchmark/scenarios/commonsense_scenario.py +171 -191
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +169 -0
- helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +121 -0
- helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +77 -0
- helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +324 -0
- helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +204 -0
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +559 -0
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +67 -0
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +78 -0
- helm/benchmark/scenarios/dialogue_scenarios.py +0 -1
- helm/benchmark/scenarios/entity_matching_scenario.py +1 -1
- helm/benchmark/scenarios/image_generation/__init__.py +0 -0
- helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
- helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
- helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
- helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
- helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
- helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
- helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
- helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
- helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
- helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
- helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
- helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
- helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
- helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
- helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
- helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
- helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
- helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
- helm/benchmark/scenarios/imdb_scenario.py +0 -1
- helm/benchmark/scenarios/legalbench_scenario.py +123 -0
- helm/benchmark/scenarios/live_qa_scenario.py +94 -0
- helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
- helm/benchmark/scenarios/lsat_qa_scenario.py +4 -2
- helm/benchmark/scenarios/math_scenario.py +19 -2
- helm/benchmark/scenarios/medication_qa_scenario.py +60 -0
- helm/benchmark/scenarios/numeracy_scenario.py +3 -3
- helm/benchmark/scenarios/opinions_qa_scenario.py +6 -10
- helm/benchmark/scenarios/raft_scenario.py +2 -6
- helm/benchmark/scenarios/scenario.py +14 -2
- helm/benchmark/scenarios/simple_scenarios.py +122 -1
- helm/benchmark/scenarios/test_math_scenario.py +22 -0
- helm/benchmark/scenarios/test_scenario.py +6 -3
- helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +135 -0
- helm/benchmark/scenarios/the_pile_scenario.py +6 -7
- helm/benchmark/scenarios/unitxt_scenario.py +56 -0
- helm/benchmark/scenarios/verifiability_judgment_scenario.py +3 -1
- helm/benchmark/scenarios/vicuna_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
- helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +92 -0
- helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
- helm/benchmark/scenarios/vision_language/image2structure/__init__.py +0 -0
- helm/benchmark/scenarios/vision_language/image2structure/chart2csv_scenario.py +55 -0
- helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +214 -0
- helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +25 -0
- helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +20 -0
- helm/benchmark/scenarios/vision_language/image2structure/utils_latex.py +347 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/driver.py +84 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/jekyll_server.py +182 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/utils.py +31 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +225 -0
- helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
- helm/benchmark/scenarios/vision_language/mme_scenario.py +145 -0
- helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
- helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
- helm/benchmark/scenarios/vision_language/pope_scenario.py +104 -0
- helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +129 -0
- helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
- helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +107 -0
- helm/benchmark/scenarios/vision_language/vqa_scenario.py +1 -1
- helm/benchmark/scenarios/wmt_14_scenario.py +18 -18
- helm/benchmark/server.py +59 -2
- helm/benchmark/slurm_jobs.py +12 -0
- helm/benchmark/slurm_runner.py +79 -51
- helm/benchmark/static/benchmarking.js +3 -4
- helm/benchmark/static/contamination.yaml +1 -1
- helm/benchmark/static/images/organizations/together.png +0 -0
- helm/benchmark/static/json-urls.js +4 -0
- helm/benchmark/static/{schema.yaml → schema_classic.yaml} +346 -930
- helm/benchmark/static/schema_instruction_following.yaml +210 -0
- helm/benchmark/static/schema_lite.yaml +824 -0
- helm/benchmark/static/schema_mmlu.yaml +1507 -0
- helm/benchmark/static/schema_unitxt.yaml +428 -0
- helm/benchmark/static/schema_vlm.yaml +576 -0
- helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
- helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
- helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
- helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
- helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
- helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
- helm/benchmark/static_build/assets/crfm-logo-74391ab8.png +0 -0
- helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
- helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
- helm/benchmark/static_build/assets/heim-logo-3e5e3aa4.png +0 -0
- helm/benchmark/static_build/assets/helm-logo-simple-2ed5400b.png +0 -0
- helm/benchmark/static_build/assets/helmhero-28e90f4d.png +0 -0
- helm/benchmark/static_build/assets/index-5088afcb.css +1 -0
- helm/benchmark/static_build/assets/index-d839df55.js +9 -0
- helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
- helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
- helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
- helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
- helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
- helm/benchmark/static_build/assets/react-d4a0b69b.js +85 -0
- helm/benchmark/static_build/assets/recharts-6d337683.js +97 -0
- helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
- helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
- helm/benchmark/static_build/assets/tremor-54a99cc4.js +10 -0
- helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
- helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
- helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
- helm/benchmark/static_build/config.js +4 -0
- helm/benchmark/static_build/index.html +20 -0
- helm/benchmark/test_data_preprocessor.py +3 -3
- helm/benchmark/test_model_deployment_definition.py +90 -0
- helm/benchmark/test_run_expander.py +1 -1
- helm/benchmark/tokenizer_config_registry.py +10 -14
- helm/benchmark/window_services/ai21_window_service.py +22 -33
- helm/benchmark/window_services/cohere_window_service.py +1 -63
- helm/benchmark/window_services/default_window_service.py +2 -35
- helm/benchmark/window_services/encoder_decoder_window_service.py +0 -11
- helm/benchmark/window_services/ice_window_service.py +0 -34
- helm/benchmark/window_services/image_generation/__init__.py +0 -0
- helm/benchmark/window_services/image_generation/clip_window_service.py +15 -0
- helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
- helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
- helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
- helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
- helm/benchmark/window_services/local_window_service.py +21 -4
- helm/benchmark/window_services/no_decoding_window_service.py +32 -0
- helm/benchmark/window_services/test_anthropic_window_service.py +2 -1
- helm/benchmark/window_services/test_bloom_window_service.py +2 -1
- helm/benchmark/window_services/test_cohere_window_service.py +2 -1
- helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
- helm/benchmark/window_services/test_gpt2_window_service.py +2 -2
- helm/benchmark/window_services/test_gpt4_window_service.py +2 -1
- helm/benchmark/window_services/test_gptj_window_service.py +3 -2
- helm/benchmark/window_services/test_gptneox_window_service.py +3 -2
- helm/benchmark/window_services/test_ice_window_service.py +2 -1
- helm/benchmark/window_services/test_openai_window_service.py +2 -1
- helm/benchmark/window_services/test_opt_window_service.py +3 -2
- helm/benchmark/window_services/test_palmyra_window_service.py +2 -1
- helm/benchmark/window_services/test_t0pp_window_service.py +2 -1
- helm/benchmark/window_services/test_t511b_window_service.py +2 -1
- helm/benchmark/window_services/test_ul2_window_service.py +2 -1
- helm/benchmark/window_services/test_utils.py +3 -2
- helm/benchmark/window_services/test_yalm_window_service.py +2 -1
- helm/benchmark/window_services/window_service.py +42 -0
- helm/benchmark/window_services/window_service_factory.py +24 -269
- helm/benchmark/window_services/yalm_window_service.py +0 -27
- helm/clients/__init__.py +0 -0
- helm/{proxy/clients → clients}/ai21_client.py +5 -12
- helm/clients/aleph_alpha_client.py +112 -0
- helm/{proxy/clients → clients}/anthropic_client.py +213 -24
- helm/clients/auto_client.py +215 -0
- helm/clients/bedrock_client.py +128 -0
- helm/clients/bedrock_utils.py +72 -0
- helm/{proxy/clients → clients}/client.py +67 -55
- helm/clients/clip_score_client.py +49 -0
- helm/clients/clip_scorers/__init__.py +0 -0
- helm/clients/clip_scorers/base_clip_scorer.py +18 -0
- helm/clients/clip_scorers/clip_scorer.py +50 -0
- helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
- helm/{proxy/clients → clients}/cohere_client.py +6 -17
- helm/clients/gcs_client.py +82 -0
- helm/{proxy/clients → clients}/google_client.py +7 -8
- helm/clients/google_translate_client.py +35 -0
- helm/{proxy/clients → clients}/http_model_client.py +6 -10
- helm/{proxy/clients → clients}/huggingface_client.py +134 -92
- helm/clients/image_generation/__init__.py +0 -0
- helm/clients/image_generation/adobe_vision_client.py +78 -0
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +98 -0
- helm/clients/image_generation/cogview2/__init__.py +0 -0
- helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
- helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
- helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
- helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +96 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
- helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +141 -0
- helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
- helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
- helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
- helm/clients/image_generation/cogview2_client.py +191 -0
- helm/clients/image_generation/dalle2_client.py +192 -0
- helm/clients/image_generation/dalle3_client.py +108 -0
- helm/clients/image_generation/dalle_mini/__init__.py +3 -0
- helm/clients/image_generation/dalle_mini/data.py +442 -0
- helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
- helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
- helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
- helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
- helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
- helm/clients/image_generation/dalle_mini/model/text.py +251 -0
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
- helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
- helm/clients/image_generation/dalle_mini_client.py +190 -0
- helm/clients/image_generation/deep_floyd_client.py +78 -0
- helm/clients/image_generation/huggingface_diffusers_client.py +249 -0
- helm/clients/image_generation/image_generation_client_utils.py +9 -0
- helm/clients/image_generation/lexica_client.py +86 -0
- helm/clients/image_generation/mindalle/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/__init__.py +216 -0
- helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
- helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
- helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
- helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
- helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
- helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
- helm/clients/image_generation/mindalle/utils/config.py +129 -0
- helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
- helm/clients/image_generation/mindalle/utils/utils.py +89 -0
- helm/clients/image_generation/mindalle_client.py +115 -0
- helm/clients/image_generation/nudity_check_client.py +64 -0
- helm/clients/image_generation/together_image_generation_client.py +111 -0
- helm/{proxy/clients → clients}/lit_gpt_client.py +7 -5
- helm/{proxy/clients → clients}/megatron_client.py +13 -7
- helm/clients/mistral_client.py +134 -0
- helm/clients/moderation_api_client.py +109 -0
- helm/clients/open_lm_client.py +43 -0
- helm/clients/openai_client.py +302 -0
- helm/{proxy/clients → clients}/palmyra_client.py +15 -12
- helm/{proxy/clients → clients}/perspective_api_client.py +7 -8
- helm/clients/simple_client.py +64 -0
- helm/{proxy/clients → clients}/test_auto_client.py +15 -15
- helm/clients/test_client.py +100 -0
- helm/clients/test_huggingface_client.py +70 -0
- helm/clients/test_simple_client.py +19 -0
- helm/{proxy/clients → clients}/test_together_client.py +23 -12
- helm/{proxy/clients → clients}/together_client.py +18 -71
- helm/clients/vertexai_client.py +391 -0
- helm/clients/vision_language/__init__.py +0 -0
- helm/clients/vision_language/huggingface_vlm_client.py +104 -0
- helm/{proxy/clients → clients}/vision_language/idefics_client.py +59 -52
- helm/clients/vision_language/open_flamingo/__init__.py +2 -0
- helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
- helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
- helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
- helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
- helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
- helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
- helm/clients/vision_language/open_flamingo_client.py +155 -0
- helm/clients/vision_language/qwen_vlm_client.py +171 -0
- helm/clients/vllm_client.py +46 -0
- helm/common/cache.py +24 -179
- helm/common/cache_backend_config.py +47 -0
- helm/common/clip_score_request.py +41 -0
- helm/common/concurrency.py +32 -0
- helm/common/credentials_utils.py +28 -0
- helm/common/file_caches/__init__.py +0 -0
- helm/common/file_caches/file_cache.py +16 -0
- helm/common/file_caches/local_file_cache.py +61 -0
- helm/common/file_caches/test_local_file_cache.py +25 -0
- helm/common/file_upload_request.py +27 -0
- helm/common/general.py +29 -10
- helm/common/image_generation_parameters.py +25 -0
- helm/common/images_utils.py +24 -1
- helm/common/key_value_store.py +113 -0
- helm/common/media_object.py +13 -0
- helm/common/moderations_api_request.py +71 -0
- helm/common/mongo_key_value_store.py +88 -0
- helm/common/multimodal_request_utils.py +31 -0
- helm/common/nudity_check_request.py +29 -0
- helm/common/object_spec.py +2 -2
- helm/common/request.py +36 -27
- helm/common/test_general.py +6 -0
- helm/common/tokenization_request.py +6 -3
- helm/config/__init__.py +0 -0
- helm/config/model_deployments.yaml +1942 -0
- helm/config/model_metadata.yaml +2201 -0
- helm/config/tokenizer_configs.yaml +362 -0
- helm/proxy/accounts.py +31 -4
- helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
- helm/proxy/critique/model_critique_client.py +13 -5
- helm/proxy/example_queries.py +29 -17
- helm/proxy/retry.py +8 -2
- helm/proxy/server.py +77 -5
- helm/proxy/services/remote_service.py +31 -0
- helm/proxy/services/server_service.py +103 -20
- helm/proxy/services/service.py +34 -2
- helm/proxy/services/test_remote_service.py +7 -6
- helm/proxy/services/test_service.py +27 -18
- helm/proxy/test_accounts.py +32 -0
- helm/proxy/token_counters/auto_token_counter.py +37 -37
- helm/proxy/token_counters/test_auto_token_counter.py +164 -0
- helm/proxy/token_counters/token_counter.py +3 -5
- helm/py.typed +0 -0
- helm/tokenizers/__init__.py +0 -0
- helm/{proxy/tokenizers → tokenizers}/ai21_tokenizer.py +3 -3
- helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +3 -1
- helm/{proxy/tokenizers → tokenizers}/anthropic_tokenizer.py +17 -11
- helm/tokenizers/auto_tokenizer.py +93 -0
- helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +8 -2
- helm/{proxy/tokenizers → tokenizers}/cohere_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +3 -3
- helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +56 -60
- helm/tokenizers/simple_tokenizer.py +33 -0
- helm/tokenizers/test_anthropic_tokenizer.py +82 -0
- helm/tokenizers/test_huggingface_tokenizer.py +136 -0
- helm/tokenizers/test_simple_tokenizer.py +33 -0
- helm/tokenizers/vertexai_tokenizer.py +97 -0
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +5 -3
- helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
- helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
- crfm_helm-0.3.0.dist-info/RECORD +0 -396
- helm/benchmark/vlm_run_specs.py +0 -71
- helm/benchmark/window_services/anthropic_window_service.py +0 -68
- helm/benchmark/window_services/bloom_window_service.py +0 -35
- helm/benchmark/window_services/flan_t5_window_service.py +0 -29
- helm/benchmark/window_services/gpt2_window_service.py +0 -32
- helm/benchmark/window_services/gptj_window_service.py +0 -38
- helm/benchmark/window_services/gptneox_window_service.py +0 -41
- helm/benchmark/window_services/http_model_window_service.py +0 -28
- helm/benchmark/window_services/huggingface_window_service.py +0 -59
- helm/benchmark/window_services/lit_gpt_window_service.py +0 -27
- helm/benchmark/window_services/llama_window_service.py +0 -28
- helm/benchmark/window_services/luminous_window_service.py +0 -67
- helm/benchmark/window_services/megatron_window_service.py +0 -10
- helm/benchmark/window_services/mt_nlg_window_service.py +0 -27
- helm/benchmark/window_services/openai_window_service.py +0 -13
- helm/benchmark/window_services/opt_window_service.py +0 -35
- helm/benchmark/window_services/palmyra_window_service.py +0 -45
- helm/benchmark/window_services/remote_window_service.py +0 -48
- helm/benchmark/window_services/santacoder_window_service.py +0 -27
- helm/benchmark/window_services/starcoder_window_service.py +0 -27
- helm/benchmark/window_services/t0pp_window_service.py +0 -35
- helm/benchmark/window_services/t511b_window_service.py +0 -30
- helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
- helm/benchmark/window_services/ul2_window_service.py +0 -30
- helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
- helm/benchmark/window_services/wider_openai_window_service.py +0 -52
- helm/proxy/clients/aleph_alpha_client.py +0 -99
- helm/proxy/clients/auto_client.py +0 -461
- helm/proxy/clients/goose_ai_client.py +0 -100
- helm/proxy/clients/microsoft_client.py +0 -182
- helm/proxy/clients/openai_client.py +0 -206
- helm/proxy/clients/remote_model_registry.py +0 -28
- helm/proxy/clients/simple_client.py +0 -61
- helm/proxy/clients/test_anthropic_client.py +0 -63
- helm/proxy/clients/test_client.py +0 -31
- helm/proxy/clients/test_huggingface_client.py +0 -87
- helm/proxy/models.py +0 -963
- helm/proxy/test_models.py +0 -27
- helm/proxy/token_counters/ai21_token_counter.py +0 -20
- helm/proxy/token_counters/cohere_token_counter.py +0 -13
- helm/proxy/token_counters/free_token_counter.py +0 -12
- helm/proxy/token_counters/gooseai_token_counter.py +0 -24
- helm/proxy/token_counters/openai_token_counter.py +0 -22
- helm/proxy/token_counters/test_ai21_token_counter.py +0 -86
- helm/proxy/token_counters/test_openai_token_counter.py +0 -79
- helm/proxy/tokenizers/simple_tokenizer.py +0 -32
- helm/proxy/tokenizers/test_huggingface_tokenizer.py +0 -56
- {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/LICENSE +0 -0
- {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/top_level.txt +0 -0
- /helm/{proxy/clients → benchmark/annotation}/__init__.py +0 -0
- /helm/{proxy/clients/vision_language → benchmark/annotation/image2structure}/__init__.py +0 -0
- /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
- /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
- /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
- /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
- /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
- /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/ice_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/test_ice_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +0 -0
@@ -2,9 +2,13 @@ from typing import Optional

 from helm.common.authentication import Authentication
 from helm.common.critique_request import CritiqueRequest, CritiqueRequestResult
+from helm.common.file_upload_request import FileUploadResult, FileUploadRequest
+from helm.common.nudity_check_request import NudityCheckRequest, NudityCheckResult
+from helm.common.clip_score_request import CLIPScoreRequest, CLIPScoreResult
 from helm.common.perspective_api_request import PerspectiveAPIRequest, PerspectiveAPIRequestResult
 from helm.benchmark.window_services.tokenizer_service import TokenizerService
 from helm.proxy.services.service import Service
+from helm.common.cache import Cache


 class MetricService(TokenizerService):
@@ -15,8 +19,20 @@ class MetricService(TokenizerService):
     def __init__(self, service: Service, auth: Authentication):
         super().__init__(service, auth)

+    def check_nudity(self, request: NudityCheckRequest) -> NudityCheckResult:
+        return self._service.check_nudity(self._auth, request)
+
+    def compute_clip_score(self, request: CLIPScoreRequest) -> CLIPScoreResult:
+        return self._service.compute_clip_score(self._auth, request)
+
+    def upload(self, request: FileUploadRequest) -> FileUploadResult:
+        return self._service.upload(self._auth, request)
+
     def get_toxicity_scores(self, request: PerspectiveAPIRequest) -> PerspectiveAPIRequestResult:
         return self._service.get_toxicity_scores(self._auth, request)

     def make_critique_request(self, request: CritiqueRequest) -> Optional[CritiqueRequestResult]:
         return self._service.make_critique_request(self._auth, request)
+
+    def get_cache(self, shard_name: str) -> Cache:
+        return Cache(self._service.get_cache_config(shard_name))
@@ -1,13 +1,13 @@
 from typing import List

 from helm.benchmark.adaptation.request_state import RequestState
-from .
+from helm.benchmark.metrics.evaluate_instances_metric import EvaluateInstancesMetric
 from .metric_name import MetricName
 from .statistic import Stat
 from nltk.translate.bleu_score import corpus_bleu


-class CLEVAParaphraseGenerationMetric(
+class CLEVAParaphraseGenerationMetric(EvaluateInstancesMetric):
     """
     Compute the Chinese iBLEU score for Paraphrase Generation scenarios of CLEVA benchmark.
     This implementation allows variable number of references (i.e., golds).
@@ -21,8 +21,7 @@ class CLEVAParaphraseGenerationMetric(Metric):
     def __init__(self, alpha: float = 0.8):  # calculate iBLEU_0.8 by default
         self.alpha = alpha

-    def evaluate_instances(self, request_states: List[RequestState]) -> List[Stat]:
-
+    def evaluate_instances(self, request_states: List[RequestState], eval_cache_path: str) -> List[Stat]:
         inputs: List = []
         preds: List = []
         golds: List[List[str]] = []
@@ -4,11 +4,11 @@ from typing import Callable, Dict, List, Tuple, Optional
 from helm.benchmark.adaptation.adapters.adapter_factory import ADAPT_RANKING_BINARY
 from helm.benchmark.adaptation.request_state import RequestState
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+from helm.benchmark.metrics.reference_metric import ReferenceMetric
 from helm.common.optional_dependencies import handle_module_not_found_error
 from helm.benchmark.scenarios.scenario import unpack_tag, CORRECT_TAG, Reference
 from helm.common.request import RequestResult
-from helm.common.general import binarize_dict
-from .metric
+from helm.common.general import assert_present, binarize_dict
 from .metric_name import MetricName
 from .metric_service import MetricService
 from .statistic import Stat
@@ -58,7 +58,7 @@ class RankingObject:
     model_relevance: Optional[int] = None


-class RankingMetric(
+class RankingMetric(ReferenceMetric):
     """Ranking metric."""

     """ Methods supported by this metric.
@@ -205,14 +205,13 @@ class RankingMetric(Metric):
         relevance dictionary, which contains the ground truth relevance
         values for each document.
         """
-        assert all([r.model_relevance is not None for r in ranking_objs])
         if rank_limit:
             return {
-                self.get_query_string(r.reference_index): r.model_relevance
+                self.get_query_string(r.reference_index): assert_present(r.model_relevance)
                 for r in ranking_objs
                 if r.rank and r.rank <= rank_limit
             }
-        return {self.get_query_string(r.reference_index): r.model_relevance for r in ranking_objs}
+        return {self.get_query_string(r.reference_index): assert_present(r.model_relevance) for r in ranking_objs}

     def get_true_relevances(self, ranking_objects: List[RankingObject]) -> Dict[str, int]:
         """Get the true relevance dictionary."""
@@ -372,7 +371,7 @@ class RankingMetric(Metric):
         # len(ranking_objects) minus its relevance.
         stats += [
             Stat(MetricName(f"ref{r.reference_index}_rank")).add(
-                len(ranking_objects) - r.model_relevance
+                len(ranking_objects) - assert_present(r.model_relevance)
             )
             for r in ranking_objects
         ]
@@ -0,0 +1,148 @@
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+from typing import List, Dict
+from helm.benchmark.metrics.metric import (
+    MetricInterface,
+    MetricResult,
+    PerInstanceStats,
+    add_context,
+    compute_worst_case_metrics,
+)
+
+from helm.common.general import parallel_map
+from helm.benchmark.adaptation.adapters.adapter_factory import ADAPT_LANGUAGE_MODELING
+from helm.benchmark.adaptation.scenario_state import ScenarioState
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+from .metric_name import MetricName, MetricContext
+from .metric_service import MetricService
+from .statistic import Stat, merge_stat
+
+
+@dataclass(frozen=True)
+class Processor:
+    """Evaluates an instance."""
+
+    # TODO: not ideal that we have circular dependencies; subclasses of Metric
+    # should override the Processor rather than the Metric.
+    metric: "ReferenceMetric"
+    metric_service: MetricService
+    eval_cache_path: str
+    adapter_spec: AdapterSpec
+
+    def process(self, references_states: List[RequestState]) -> List[Stat]:
+        instance_stats: List[Stat] = []
+
+        # Evaluate the references
+        if len(references_states) == 0:
+            return instance_stats
+        instance_stats.extend(
+            self.metric.evaluate_references(
+                self.adapter_spec, references_states, self.metric_service, self.eval_cache_path
+            )
+        )
+
+        # Add instance-related context (e.g., split, perturbation) to the metrics
+        for i, stat in enumerate(instance_stats):
+            instance_stats[i] = add_context(stat, MetricContext.from_instance(references_states[0].instance))
+
+        return instance_stats
+
+
+class ReferenceMetric(MetricInterface, ABC):
+    """
+    A `Metric` takes the results of execution and produces `Stat`s for a
+    scenario.
+
+    Note: `Metric` actually right now is a bit of misnomer because it produces many
+    `Stat`s, that might be distinct but are computed together. Eventually we
+    might move to a world where there is one (or very few metrics that are domain-independent).
+    """
+
+    def evaluate(
+        self, scenario_state: ScenarioState, metric_service: MetricService, eval_cache_path: str, parallelism: int
+    ) -> MetricResult:
+        """
+        Main entry point for a `Metric`. This function groups the single
+        list of `RequestState` by training trial and instance, and invokes
+        other functions to process those. This should serve most purposes.
+
+        Any logic that doesn't decompose along instances should go here, such
+        as robustness.
+        """
+        assert scenario_state.adapter_spec.method != ADAPT_LANGUAGE_MODELING, (
+            "Metric no longer knows how to handle the language modeling adapter. "
+            + "All run_specs with that adapter should use LanguageModelingMetric. "
+            + "If you are seeing this issue, please file a Github issue."
+        )
+
+        adapter_spec = scenario_state.adapter_spec
+        global_stats: Dict[MetricName, Stat] = {}
+        all_per_instance_stats: List[PerInstanceStats] = []
+
+        for train_trial_index in range(adapter_spec.num_train_trials):
+            # Construct inputs
+            request_state_sets: List[List[RequestState]] = []
+            for instance in scenario_state.instances:
+                references_states = []
+                for reference_index in range(len(instance.references)):
+                    references_states.extend(
+                        scenario_state.get_request_states(train_trial_index, instance, reference_index)
+                    )
+                request_state_sets.append(references_states)
+
+            # Do it!
+            processor = Processor(
+                metric=self,
+                metric_service=metric_service,
+                eval_cache_path=eval_cache_path,
+                adapter_spec=scenario_state.adapter_spec,
+            )
+            results: List[List[Stat]] = parallel_map(
+                processor.process,
+                request_state_sets,
+                parallelism=parallelism,
+            )
+
+            # Compute per-instance stats
+            per_instance_stats: List[PerInstanceStats] = []
+            for instance, stats in zip(scenario_state.instances, results):
+                assert instance.id is not None, f"id was none for instance: {instance}"
+                # Sometimes a metric (e.g., BiasMetric) doesn't produce any statistics
+                if len(stats) > 0:
+                    per_instance_stats.append(
+                        PerInstanceStats(instance.id, instance.perturbation, train_trial_index, stats)
+                    )
+
+            # Aggregate these stats
+            trial_stats: Dict[MetricName, Stat] = {}  # Statistics just for this trial
+            for instance_stats in results:
+                for stat in instance_stats:
+                    merge_stat(trial_stats, stat)
+
+            # Compute worst-case metrics.
+            # This is here since we want these stats for all metrics and they
+            # aggregate across contexts (perturbations).
+            worst_case_stats = compute_worst_case_metrics(dict(zip(scenario_state.instances, results)))
+            for stat in worst_case_stats:
+                merge_stat(trial_stats, stat)
+
+            # We take the mean value for each trial.
+            for stat in trial_stats.values():
+                merge_stat(global_stats, stat.take_mean())
+
+            all_per_instance_stats.extend(per_instance_stats)
+
+        # Wrap aggregated and per-instance stats in a MetricResult.
+        return MetricResult(list(global_stats.values()), all_per_instance_stats)
+
+    @abstractmethod
+    def evaluate_references(
+        self,
+        adapter_spec: AdapterSpec,
+        reference_request_states: List[RequestState],
+        metric_service: MetricService,
+        eval_cache_path: str,
+    ) -> List[Stat]:
+        """Evaluate the references. Override me!"""
+        pass
@@ -50,7 +50,6 @@ class SummaCImager:
     def __init__(
         self, model_name="mnli", granularity="paragraph", use_cache=True, max_doc_sents=100, device="cuda", **kwargs
     ):
-
         self.grans = granularity.split("-")

         assert (
@@ -154,7 +153,6 @@ class SummaCImager:
             for j in range(N_gen)
         ]
         for batch in utils_misc.batcher(dataset, batch_size=20):
-
            if self.model_name == "decomp":
                batch_evids, batch_conts, batch_neuts = [], [], []
                batch_json = [{"premise": d["premise"], "hypothesis": d["hypothesis"]} for d in batch]
@@ -5,19 +5,20 @@ import os
 import pickle

 import spacy
+import spacy.cli
 from typing import List, Dict, Optional
 from collections import defaultdict

 from helm.benchmark.adaptation.scenario_state import ScenarioState
 from helm.benchmark.adaptation.request_state import RequestState
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+from helm.benchmark.metrics.evaluate_reference_metrics import get_rouge_function
 from helm.common.hierarchical_logger import hlog
 from helm.common.general import ensure_file_downloaded
 from helm.common.optional_dependencies import handle_module_not_found_error
 from .metric import Metric, MetricResult
 from .metric_name import MetricName
 from .metric_service import MetricService
-from .basic_metrics import get_rouge_function
 from .statistic import Stat
 from .summac.model_summac import SummaCZS
 from bert_score import BERTScorer
@@ -52,12 +53,12 @@ class SummarizationMetric(Metric):
         # avoid triggering a bug in DataStatsMetric that raises
         # `NameError: name 'stderr' is not defined`
         if not spacy.util.is_package("en_core_web_sm"):
-            spacy.cli.download("en_core_web_sm")
+            spacy.cli.download("en_core_web_sm")

         try:
             from summ_eval.data_stats_metric import DataStatsMetric
         except ModuleNotFoundError as e:
-            handle_module_not_found_error(e, ["
+            handle_module_not_found_error(e, ["summarization"])

         self.data_stats_metric = DataStatsMetric()
         self.task: str = task
@@ -168,7 +169,6 @@ class SummarizationMetric(Metric):
         metric_service: MetricService,
         eval_cache_path: str,
     ) -> List[Stat]:
-
         refs: List[str] = [self._remove_braces(ref.output.text) for ref in request_state.instance.references]
         inp: str = self._remove_braces(request_state.instance.input.text)

@@ -182,9 +182,9 @@ class SummarizationMetric(Metric):
                 self.humaneval = self._load_humaneval(eval_cache_path)

             # get human evaluation scores if they exist
-
+            deployment = adapter_spec.model_deployment.replace("/", "_")
             for metric_name in ["faithfulness", "relevance", "coherence"]:
-                val = self.humaneval[(metric_name,
+                val = self.humaneval[(metric_name, deployment, request_state.instance.id, pred)]
                 result.append(Stat(MetricName(f"HumanEval-{metric_name}")).add(float(val)))
         except KeyError:
             pass
@@ -196,8 +196,8 @@ class SummarizationMetric(Metric):
             if self.qa_fact_eval is None:
                 self._load_qafacteval(eval_cache_path)
             assert self.qa_fact_eval is not None
-
-            val = self.qa_fact_eval[
+            deployment = adapter_spec.model_deployment.replace("/", "_")
+            val = self.qa_fact_eval[deployment][(request_state.instance.id, pred)]
             result.append(Stat(MetricName("QAFactEval")).add(float(val)))
         except KeyError:
             pass
@@ -7,7 +7,7 @@ from helm.benchmark.adaptation.request_state import RequestState
 from helm.benchmark.metrics.classification_metrics import ClassificationMetric
 from helm.benchmark.metrics.statistic import Stat
 from helm.benchmark.scenarios.scenario import Input, Instance, Output, Reference, CORRECT_TAG
-from helm.common.request import Request, RequestResult,
+from helm.common.request import Request, RequestResult, GeneratedOutput


 class _Option(NamedTuple):
@@ -26,9 +26,12 @@ def _request_state(prediction: str, options: List[_Option]):
         request_mode=None,
         train_trial_index=0,
         output_mapping=None,
-        request=Request(),
+        request=Request(model="openai/text-davinci-002", model_deployment="openai/text-davinci-002"),
         result=RequestResult(
-            success=True,
+            success=True,
+            embedding=[],
+            completions=[GeneratedOutput(text=prediction, logprob=0.0, tokens=[])],
+            cached=False,
         ),
         num_train_instances=0,
         prompt_truncated=False,
@@ -76,7 +79,7 @@ def test_evaluate_instances_binary_generation():
     ]

     assert_stats_equal(
-        metric.evaluate_instances(request_states),
+        metric.evaluate_instances(request_states, ""),
         _expected_stats(
             {
                 "yes": {"tp": 3, "fp": 1, "tn": 2, "fn": 1},
@@ -106,7 +109,7 @@ def test_evaluate_instances_multi_class():
         _request_state("invalid", _options("c")),
     ]
     assert_stats_equal(
-        metric.evaluate_instances(request_states),
+        metric.evaluate_instances(request_states, ""),
         _expected_stats(
             {
                 "d": {"tp": 3, "fp": 1, "tn": 5, "fn": 1},
@@ -139,7 +142,7 @@ def test_evaluate_instances_multilabel():
     ]

     assert_stats_equal(
-        metric.evaluate_instances(request_states),
+        metric.evaluate_instances(request_states, ""),
         _expected_stats(
             {
                 "d": {"tp": 5, "fp": 1, "tn": 5, "fn": 0},
@@ -0,0 +1,78 @@
+# Test metrics
+from typing import List
+
+import numpy as np
+import pytest
+from helm.benchmark.metrics.disinformation_metrics import _monte_carlo_entropy, _self_bleu
+from helm.common.request import GeneratedOutput, Token
+
+# Test tokens
+_TEST_1_TOKENS: List[Token] = [
+    Token("This", logprob=-0.25),
+    Token("is", logprob=-0.25),
+    Token("a", logprob=-0.25),
+    Token("test", logprob=-0.25),
+]
+_TEST_2_TOKENS: List[Token] = [
+    Token("This", logprob=-0.25),
+    Token("is", logprob=-0.25),
+    Token("another", logprob=-0.5),
+    Token("test", logprob=-0.25),
+]
+_TEST_EMPTY_TOKENS: List[Token] = []
+test_empty_str_tokens: List[Token] = [
+    Token("", logprob=0),
+]
+
+# Test Sequences (two standard, one with an empty token, and one with no tokens)
+_TEST_1 = GeneratedOutput(text="This is a test", logprob=-1, tokens=_TEST_1_TOKENS)
+_TEST_2 = GeneratedOutput(text="This is another test", logprob=-1.25, tokens=_TEST_2_TOKENS)
+_TEST_EMPTY = GeneratedOutput(text="", logprob=-float("nan"), tokens=_TEST_EMPTY_TOKENS)
+_TEST_EMPTY_STR = GeneratedOutput(text="", logprob=0, tokens=test_empty_str_tokens)
+
+
+# Test Self-BLEU
+def test_self_bleu_with_self():
+    score = _self_bleu([_TEST_1, _TEST_1])
+    assert score == pytest.approx(100)
+
+
+def test_self_blue_with_other():
+    score = _self_bleu([_TEST_1, _TEST_2])
+    assert 0 < score < 100
+
+
+def test_self_blue_one_sequence():
+    score = _self_bleu([_TEST_1])
+    assert score == 0
+
+
+def test_self_blue_one_full_one_empty():
+    score = _self_bleu([_TEST_1, _TEST_EMPTY_STR])
+    assert score == 0
+
+
+# Test MC Entropy
+def test_mc_entropy_with_self():
+    score = _monte_carlo_entropy([_TEST_1, _TEST_1])
+    assert score == pytest.approx(-_TEST_1.logprob)
+
+
+def test_mc_entropy_with_other():
+    score = _monte_carlo_entropy([_TEST_1, _TEST_2])
+    assert score == pytest.approx(-(_TEST_1.logprob + _TEST_2.logprob) / 2)
+
+
+def test_mc_entropy_one_sequence():
+    score = _monte_carlo_entropy([_TEST_1])
+    assert score == -_TEST_1.logprob
+
+
+def test_mc_entropy_one_full_one_empty():
+    score = _monte_carlo_entropy([_TEST_EMPTY_STR])
+    assert score == _TEST_EMPTY_STR.logprob
+
+
+def test_mc_entropy_with_no_tokens():
+    score = _monte_carlo_entropy([_TEST_EMPTY])
+    assert np.isnan(score)
@@ -0,0 +1,30 @@
+from helm.benchmark.metrics.evaluate_reference_metrics import (
+    exact_match,
+    exact_match_indicator,
+    final_number_exact_match,
+)
+
+
+def test_exact_match():
+    assert exact_match("33", "33") == 1
+    assert exact_match("33", "33 ") == 1
+    assert exact_match("33", "34") == 0
+
+
+def test_exact_match_indicator():
+    assert exact_match_indicator("33", "33") == 1
+    assert exact_match_indicator("33", "stuff 33") == 1
+    assert exact_match_indicator("stuff 33", "33") == 1
+    assert exact_match_indicator("33", "33 stuff") == 0
+
+
+def test_final_number_exact_match():
+    assert final_number_exact_match("33", "33") == 1
+    assert final_number_exact_match("33", "33 eggs.") == 1
+    assert final_number_exact_match("33", "-33") == 0
+    assert final_number_exact_match("-33", "-33") == 1
+    assert final_number_exact_match("The answer is 33", "\\boxed{33}") == 1
+    assert final_number_exact_match("The answer is 33", "\\boxed{33} and 34") == 0
+    assert final_number_exact_match("34.2", "2") == 0
+    assert final_number_exact_match("342", "342.") == 1
+    assert final_number_exact_match("3,420", "3420") == 1
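The boundary cases above are the clearest documentation of these reference metrics: trailing-whitespace-tolerant equality for `exact_match`, and last-number comparison (tolerant of commas, trailing periods, and `\boxed{...}` wrappers, but sensitive to sign) for `final_number_exact_match`. A small hedged sketch of calling them directly; the import path and argument order are taken from the test file, and treating the first argument as the gold reference is an assumption.

```python
# Hedged sketch, not part of the diff; expected values follow the assertions above.
from helm.benchmark.metrics.evaluate_reference_metrics import exact_match, final_number_exact_match

gold = "The answer is 33"
completion = "\\boxed{33}"

print(exact_match(gold, completion))               # 0: the strings differ verbatim
print(final_number_exact_match(gold, completion))  # 1: the final numbers (33) match
```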
@@ -3,13 +3,13 @@ from .basic_metrics import get_num_bytes, convert_tokens_to_text
 
 
 def test_get_num_bytes():
-    tokens = [Token(text, 0
+    tokens = [Token(text, 0) for text in ["bytes:\\x99", "Hello", " world", "bytes:\\xe2\\x80"]]
     assert get_num_bytes(tokens) == 14
 
 
 def test_convert_tokens_to_text():
     tokens = [
-        Token(text, 0
+        Token(text, 0)
         for text in [
             "<|endoftext|>",
             "bytes:\\xe2\\x80",
@@ -39,5 +39,5 @@ class AutoTokenCostEstimator(TokenCostEstimator):
         """
         Estimate the number of tokens for a given request based on the organization.
         """
-        token_cost_estimator: TokenCostEstimator = self._get_estimator(request.
+        token_cost_estimator: TokenCostEstimator = self._get_estimator(request.model_host)
         return token_cost_estimator.estimate_tokens(request, metric_service)
@@ -2,11 +2,19 @@ from helm.benchmark.metrics.metric_service import MetricService
 from helm.benchmark.window_services.window_service import WindowService
 from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
 from helm.common.request import Request
-from helm.proxy.token_counters.gooseai_token_counter import GooseAITokenCounter
 from .token_cost_estimator import TokenCostEstimator
 
 
 class GooseAITokenCostEstimator(TokenCostEstimator):
+    # From https://goose.ai/pricing: "the base price includes your first 25 tokens
+    # generated, and you can scale beyond that on a per-token basis."
+    BASE_PRICE_TOKENS: int = 25
+
+    @staticmethod
+    def account_for_base_tokens(num_tokens: int):
+        """Subtracts the number of tokens included in the base price."""
+        return max(num_tokens - GooseAITokenCostEstimator.BASE_PRICE_TOKENS, 0)
+
     def estimate_tokens(self, request: Request, metric_service: MetricService) -> int:
         """
         Estimate the number of generated tokens for a given request. Formula:
@@ -17,6 +25,8 @@ class GooseAITokenCostEstimator(TokenCostEstimator):
         """
         total_estimated_tokens: int = request.num_completions * request.max_tokens
         if request.echo_prompt:
-            window_service: WindowService = WindowServiceFactory.get_window_service(
+            window_service: WindowService = WindowServiceFactory.get_window_service(
+                request.model_deployment, metric_service
+            )
             total_estimated_tokens += window_service.get_num_tokens(request.prompt)
-        return
+        return GooseAITokenCostEstimator.account_for_base_tokens(total_estimated_tokens)
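Put together, the two GooseAI hunks change the estimate to `num_completions * max_tokens`, plus the prompt tokens when `echo_prompt` is set, minus the 25 tokens covered by the base price (floored at zero). A standalone arithmetic sketch, mirroring only the logic visible in the diff rather than importing the package's API:

```python
# Illustrative re-statement of the logic shown above; not part of the diff.
BASE_PRICE_TOKENS = 25  # from the GooseAI pricing comment in the hunk above


def account_for_base_tokens(num_tokens: int) -> int:
    """Subtract the tokens included in the base price, never going below zero."""
    return max(num_tokens - BASE_PRICE_TOKENS, 0)


# 3 completions x 100 max_tokens -> 300 estimated tokens, of which 275 are billed per token.
assert account_for_base_tokens(3 * 100) == 275
# A short request is fully covered by the base price.
assert account_for_base_tokens(10) == 0
```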
@@ -15,7 +15,7 @@ class OpenAITokenCostEstimator(TokenCostEstimator):
 
         Add num_tokens(prompt) if Request.echo_prompt is True.
         """
-        tokenizer: WindowService = WindowServiceFactory.get_window_service(request.
+        tokenizer: WindowService = WindowServiceFactory.get_window_service(request.model_deployment, metric_service)
         num_prompt_tokens: int = tokenizer.get_num_tokens(request.prompt)
         total_estimated_tokens: int = num_prompt_tokens + request.num_completions * request.max_tokens
 
@@ -10,6 +10,8 @@ class TestAI21TokenCostEstimator:
 
     def test_estimate_tokens(self):
         request = Request(
+            model="openai/text-davinci-002",
+            model_deployment="openai/text-davinci-002",
             prompt="The Center for Research on Foundation Models (CRFM) is "
             "an interdisciplinary initiative born out of the Stanford "
             "Institute for Human-Centered Artificial Intelligence (HAI) "
@@ -12,7 +12,6 @@ from .openai_token_cost_estimator import OpenAITokenCostEstimator
 
 
 class TestOpenAITokenCostEstimator:
-
     # The following prompt has 51 tokens according to the GPT-2 tokenizer
     TEST_PROMPT: str = (
         "The Center for Research on Foundation Models (CRFM) is "
@@ -37,13 +36,21 @@ class TestOpenAITokenCostEstimator:
         self._mock_metric_service.tokenize = MagicMock(return_value=tokenization_request_result)
 
     def test_estimate_tokens(self):
-        request = Request(
+        request = Request(
+            model="openai/text-davinci-002",
+            model_deployment="openai/text-davinci-002",
+            prompt=TestOpenAITokenCostEstimator.TEST_PROMPT,
+            num_completions=3,
+            max_tokens=100,
+        )
 
         # Prompt + max number of tokens from completions = 51 + 3 * 100
         assert self._token_cost_estimator.estimate_tokens(request, self._mock_metric_service) == 51 + 3 * 100
 
     def test_estimate_tokens_with_echo_prompt(self):
         request = Request(
+            model="openai/text-davinci-002",
+            model_deployment="openai/text-davinci-002",
             prompt=TestOpenAITokenCostEstimator.TEST_PROMPT,
             echo_prompt=True,
             num_completions=1,
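The recurring change across these token-cost-estimator tests is that `Request` is now constructed with an explicit `model` and `model_deployment` pair, matching the estimators' switch to `request.model_deployment` for window-service lookup. A hedged sketch of the updated construction; the field names come from the hunks above, and leaving the remaining fields at their defaults is an assumption.

```python
# Hedged sketch, not part of the diff; unspecified Request fields are assumed to have defaults.
from helm.common.request import Request

request = Request(
    model="openai/text-davinci-002",
    model_deployment="openai/text-davinci-002",
    prompt="The Center for Research on Foundation Models (CRFM) is ...",  # truncated for illustration
    num_completions=3,
    max_tokens=100,
)
```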
@@ -5,7 +5,7 @@ from helm.common.request import RequestResult
 from helm.common.hierarchical_logger import hlog
 from helm.benchmark.adaptation.request_state import RequestState
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
-from helm.
+from helm.clients.perspective_api_client import PerspectiveAPIClientCredentialsError
 from .metric import Metric
 from .metric_name import MetricName
 from .metric_service import MetricService
@@ -0,0 +1,23 @@
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.common.perspective_api_request import PerspectiveAPIRequestResult, PerspectiveAPIRequest, ToxicityAttributes
+
+
+def compute_toxicity_score(text: str, metric_service: MetricService) -> float:
+    """
+    Compute the toxicity score of a given text using Perspective API.
+    Returns a value between 0 and 1 where a score 0.5 or greater is deemed toxic.
+    """
+    response: PerspectiveAPIRequestResult = metric_service.get_toxicity_scores(
+        request=PerspectiveAPIRequest(text_batch=[text])
+    )
+    attributes: ToxicityAttributes = response.text_to_toxicity_attributes[text]
+    assert attributes.toxicity_score is not None
+    return attributes.toxicity_score
+
+
+def is_prompt_toxic(text: str, metric_service: MetricService) -> bool:
+    """
+    Returns True, if the prompt is considered toxic, False otherwise.
+    """
+    score: float = compute_toxicity_score(text, metric_service)
+    return score >= 0.5
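The new toxicity helpers are thin wrappers over the Perspective API exposed through `MetricService`, with 0.5 as the toxicity threshold. A hedged usage sketch follows; the module path of the new file is an assumption (the diff does not show the filename), and `metric_service` is assumed to be an already-configured `MetricService` with Perspective API credentials.

```python
# Hedged sketch, not part of the diff; module path and a pre-configured metric_service are assumed.
from helm.benchmark.metrics.metric_service import MetricService
from helm.benchmark.metrics.toxicity_utils import compute_toxicity_score  # assumed path


def summarize_toxicity(text: str, metric_service: MetricService) -> str:
    """Label a prompt using the same 0.5 threshold as is_prompt_toxic above."""
    score = compute_toxicity_score(text, metric_service)
    return f"{'toxic' if score >= 0.5 else 'non-toxic'} (score={score:.2f})"
```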