crfm-helm 0.3.0__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/METADATA +144 -36
- crfm_helm-0.5.0.dist-info/RECORD +642 -0
- {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +37 -2
- helm/benchmark/adaptation/adapters/adapter.py +4 -42
- helm/benchmark/adaptation/adapters/adapter_factory.py +24 -27
- helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/generation_adapter.py +2 -0
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +21 -4
- helm/benchmark/adaptation/adapters/language_modeling_adapter.py +12 -5
- helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +104 -0
- helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +5 -1
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/test_adapter.py +2 -1
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +59 -14
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +40 -5
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +78 -10
- helm/benchmark/adaptation/common_adapter_specs.py +376 -0
- helm/benchmark/adaptation/prompt.py +7 -1
- helm/benchmark/adaptation/request_state.py +6 -1
- helm/benchmark/adaptation/scenario_state.py +6 -2
- helm/benchmark/annotation/annotator.py +43 -0
- helm/benchmark/annotation/annotator_factory.py +61 -0
- helm/benchmark/annotation/image2structure/image_compiler_annotator.py +88 -0
- helm/benchmark/annotation/image2structure/latex_compiler_annotator.py +59 -0
- helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +84 -0
- helm/benchmark/annotation/image2structure/webpage_compiler_annotator.py +132 -0
- helm/benchmark/annotation/test_annotator_factory.py +26 -0
- helm/benchmark/annotation/test_dummy_annotator.py +44 -0
- helm/benchmark/annotation_executor.py +124 -0
- helm/benchmark/augmentations/cleva_perturbation.py +7 -14
- helm/benchmark/augmentations/contraction_expansion_perturbation.py +3 -3
- helm/benchmark/augmentations/contrast_sets_perturbation.py +0 -3
- helm/benchmark/augmentations/data_augmenter.py +0 -2
- helm/benchmark/augmentations/dialect_perturbation.py +2 -2
- helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
- helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
- helm/benchmark/augmentations/gender_perturbation.py +3 -3
- helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
- helm/benchmark/augmentations/mild_mix_perturbation.py +2 -2
- helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
- helm/benchmark/augmentations/person_name_perturbation.py +0 -7
- helm/benchmark/augmentations/perturbation.py +20 -7
- helm/benchmark/augmentations/perturbation_description.py +1 -1
- helm/benchmark/augmentations/space_perturbation.py +2 -2
- helm/benchmark/augmentations/suffix_perturbation.py +29 -0
- helm/benchmark/augmentations/synonym_perturbation.py +2 -2
- helm/benchmark/augmentations/test_perturbation.py +11 -7
- helm/benchmark/augmentations/translate_perturbation.py +30 -0
- helm/benchmark/augmentations/typos_perturbation.py +2 -2
- helm/benchmark/config_registry.py +38 -0
- helm/benchmark/executor.py +46 -16
- helm/benchmark/huggingface_registration.py +37 -7
- helm/benchmark/metrics/basic_metrics.py +172 -641
- helm/benchmark/metrics/bbq_metrics.py +3 -4
- helm/benchmark/metrics/bias_metrics.py +6 -6
- helm/benchmark/metrics/classification_metrics.py +11 -8
- helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
- helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
- helm/benchmark/metrics/code_metrics.py +4 -3
- helm/benchmark/metrics/code_metrics_helper.py +0 -2
- helm/benchmark/metrics/common_metric_specs.py +167 -0
- helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
- helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
- helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
- helm/benchmark/metrics/disinformation_metrics.py +6 -112
- helm/benchmark/metrics/dry_run_metrics.py +5 -3
- helm/benchmark/metrics/efficiency_metrics.py +206 -0
- helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
- helm/benchmark/metrics/evaluate_reference_metrics.py +376 -0
- helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
- helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
- helm/benchmark/metrics/image_generation/clip_score_metrics.py +73 -0
- helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
- helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
- helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
- helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
- helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
- helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
- helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
- helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
- helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
- helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
- helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
- helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
- helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
- helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
- helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
- helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
- helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
- helm/benchmark/metrics/image_generation/q16/test_q16.py +18 -0
- helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
- helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
- helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
- helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
- helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
- helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
- helm/benchmark/metrics/instruction_following_critique_metrics.py +3 -1
- helm/benchmark/metrics/language_modeling_metrics.py +99 -0
- helm/benchmark/metrics/machine_translation_metrics.py +5 -5
- helm/benchmark/metrics/metric.py +93 -172
- helm/benchmark/metrics/metric_name.py +0 -1
- helm/benchmark/metrics/metric_service.py +16 -0
- helm/benchmark/metrics/paraphrase_generation_metrics.py +3 -4
- helm/benchmark/metrics/ranking_metrics.py +6 -7
- helm/benchmark/metrics/reference_metric.py +148 -0
- helm/benchmark/metrics/summac/model_summac.py +0 -2
- helm/benchmark/metrics/summarization_metrics.py +8 -8
- helm/benchmark/metrics/test_classification_metrics.py +9 -6
- helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
- helm/benchmark/metrics/test_evaluate_reference_metrics.py +30 -0
- helm/benchmark/metrics/test_metric.py +2 -2
- helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +13 -3
- helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +2 -0
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +9 -2
- helm/benchmark/metrics/toxicity_metrics.py +1 -1
- helm/benchmark/metrics/toxicity_utils.py +23 -0
- helm/benchmark/metrics/unitxt_metrics.py +81 -0
- helm/benchmark/metrics/vision_language/__init__.py +0 -0
- helm/benchmark/metrics/vision_language/emd_utils.py +341 -0
- helm/benchmark/metrics/vision_language/image_metrics.py +450 -0
- helm/benchmark/metrics/vision_language/image_utils.py +100 -0
- helm/benchmark/model_deployment_registry.py +164 -41
- helm/benchmark/model_metadata_registry.py +181 -35
- helm/benchmark/multi_gpu_runner.py +133 -0
- helm/benchmark/presentation/contamination.py +3 -3
- helm/benchmark/presentation/create_plots.py +8 -7
- helm/benchmark/presentation/run_display.py +50 -17
- helm/benchmark/presentation/schema.py +28 -46
- helm/benchmark/presentation/summarize.py +213 -96
- helm/benchmark/presentation/table.py +8 -8
- helm/benchmark/presentation/test_contamination.py +2 -2
- helm/benchmark/presentation/test_run_entry.py +14 -9
- helm/benchmark/presentation/test_summarize.py +5 -0
- helm/benchmark/run.py +66 -54
- helm/benchmark/run_expander.py +342 -31
- helm/benchmark/run_spec.py +93 -0
- helm/benchmark/run_spec_factory.py +162 -0
- helm/benchmark/run_specs/__init__.py +0 -0
- helm/benchmark/{run_specs.py → run_specs/classic_run_specs.py} +217 -1330
- helm/benchmark/run_specs/cleva_run_specs.py +277 -0
- helm/benchmark/run_specs/decodingtrust_run_specs.py +314 -0
- helm/benchmark/run_specs/heim_run_specs.py +623 -0
- helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
- helm/benchmark/run_specs/lite_run_specs.py +307 -0
- helm/benchmark/run_specs/simple_run_specs.py +104 -0
- helm/benchmark/run_specs/unitxt_run_specs.py +42 -0
- helm/benchmark/run_specs/vlm_run_specs.py +501 -0
- helm/benchmark/runner.py +116 -69
- helm/benchmark/runner_config_registry.py +21 -0
- helm/benchmark/scenarios/bbq_scenario.py +1 -1
- helm/benchmark/scenarios/bold_scenario.py +2 -2
- helm/benchmark/scenarios/cleva_scenario.py +43 -46
- helm/benchmark/scenarios/code_scenario.py +3 -2
- helm/benchmark/scenarios/commonsense_scenario.py +171 -191
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +169 -0
- helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +121 -0
- helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +77 -0
- helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +324 -0
- helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +204 -0
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +559 -0
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +67 -0
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +78 -0
- helm/benchmark/scenarios/dialogue_scenarios.py +0 -1
- helm/benchmark/scenarios/entity_matching_scenario.py +1 -1
- helm/benchmark/scenarios/image_generation/__init__.py +0 -0
- helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
- helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
- helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
- helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
- helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
- helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
- helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
- helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
- helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
- helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
- helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
- helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
- helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
- helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
- helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
- helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
- helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
- helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
- helm/benchmark/scenarios/imdb_scenario.py +0 -1
- helm/benchmark/scenarios/legalbench_scenario.py +123 -0
- helm/benchmark/scenarios/live_qa_scenario.py +94 -0
- helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
- helm/benchmark/scenarios/lsat_qa_scenario.py +4 -2
- helm/benchmark/scenarios/math_scenario.py +19 -2
- helm/benchmark/scenarios/medication_qa_scenario.py +60 -0
- helm/benchmark/scenarios/numeracy_scenario.py +3 -3
- helm/benchmark/scenarios/opinions_qa_scenario.py +6 -10
- helm/benchmark/scenarios/raft_scenario.py +2 -6
- helm/benchmark/scenarios/scenario.py +14 -2
- helm/benchmark/scenarios/simple_scenarios.py +122 -1
- helm/benchmark/scenarios/test_math_scenario.py +22 -0
- helm/benchmark/scenarios/test_scenario.py +6 -3
- helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +135 -0
- helm/benchmark/scenarios/the_pile_scenario.py +6 -7
- helm/benchmark/scenarios/unitxt_scenario.py +56 -0
- helm/benchmark/scenarios/verifiability_judgment_scenario.py +3 -1
- helm/benchmark/scenarios/vicuna_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
- helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +92 -0
- helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
- helm/benchmark/scenarios/vision_language/image2structure/__init__.py +0 -0
- helm/benchmark/scenarios/vision_language/image2structure/chart2csv_scenario.py +55 -0
- helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +214 -0
- helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +25 -0
- helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +20 -0
- helm/benchmark/scenarios/vision_language/image2structure/utils_latex.py +347 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/driver.py +84 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/jekyll_server.py +182 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/utils.py +31 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +225 -0
- helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
- helm/benchmark/scenarios/vision_language/mme_scenario.py +145 -0
- helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
- helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
- helm/benchmark/scenarios/vision_language/pope_scenario.py +104 -0
- helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +129 -0
- helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
- helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +107 -0
- helm/benchmark/scenarios/vision_language/vqa_scenario.py +1 -1
- helm/benchmark/scenarios/wmt_14_scenario.py +18 -18
- helm/benchmark/server.py +59 -2
- helm/benchmark/slurm_jobs.py +12 -0
- helm/benchmark/slurm_runner.py +79 -51
- helm/benchmark/static/benchmarking.js +3 -4
- helm/benchmark/static/contamination.yaml +1 -1
- helm/benchmark/static/images/organizations/together.png +0 -0
- helm/benchmark/static/json-urls.js +4 -0
- helm/benchmark/static/{schema.yaml → schema_classic.yaml} +346 -930
- helm/benchmark/static/schema_instruction_following.yaml +210 -0
- helm/benchmark/static/schema_lite.yaml +824 -0
- helm/benchmark/static/schema_mmlu.yaml +1507 -0
- helm/benchmark/static/schema_unitxt.yaml +428 -0
- helm/benchmark/static/schema_vlm.yaml +576 -0
- helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
- helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
- helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
- helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
- helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
- helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
- helm/benchmark/static_build/assets/crfm-logo-74391ab8.png +0 -0
- helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
- helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
- helm/benchmark/static_build/assets/heim-logo-3e5e3aa4.png +0 -0
- helm/benchmark/static_build/assets/helm-logo-simple-2ed5400b.png +0 -0
- helm/benchmark/static_build/assets/helmhero-28e90f4d.png +0 -0
- helm/benchmark/static_build/assets/index-5088afcb.css +1 -0
- helm/benchmark/static_build/assets/index-d839df55.js +9 -0
- helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
- helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
- helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
- helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
- helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
- helm/benchmark/static_build/assets/react-d4a0b69b.js +85 -0
- helm/benchmark/static_build/assets/recharts-6d337683.js +97 -0
- helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
- helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
- helm/benchmark/static_build/assets/tremor-54a99cc4.js +10 -0
- helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
- helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
- helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
- helm/benchmark/static_build/config.js +4 -0
- helm/benchmark/static_build/index.html +20 -0
- helm/benchmark/test_data_preprocessor.py +3 -3
- helm/benchmark/test_model_deployment_definition.py +90 -0
- helm/benchmark/test_run_expander.py +1 -1
- helm/benchmark/tokenizer_config_registry.py +10 -14
- helm/benchmark/window_services/ai21_window_service.py +22 -33
- helm/benchmark/window_services/cohere_window_service.py +1 -63
- helm/benchmark/window_services/default_window_service.py +2 -35
- helm/benchmark/window_services/encoder_decoder_window_service.py +0 -11
- helm/benchmark/window_services/ice_window_service.py +0 -34
- helm/benchmark/window_services/image_generation/__init__.py +0 -0
- helm/benchmark/window_services/image_generation/clip_window_service.py +15 -0
- helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
- helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
- helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
- helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
- helm/benchmark/window_services/local_window_service.py +21 -4
- helm/benchmark/window_services/no_decoding_window_service.py +32 -0
- helm/benchmark/window_services/test_anthropic_window_service.py +2 -1
- helm/benchmark/window_services/test_bloom_window_service.py +2 -1
- helm/benchmark/window_services/test_cohere_window_service.py +2 -1
- helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
- helm/benchmark/window_services/test_gpt2_window_service.py +2 -2
- helm/benchmark/window_services/test_gpt4_window_service.py +2 -1
- helm/benchmark/window_services/test_gptj_window_service.py +3 -2
- helm/benchmark/window_services/test_gptneox_window_service.py +3 -2
- helm/benchmark/window_services/test_ice_window_service.py +2 -1
- helm/benchmark/window_services/test_openai_window_service.py +2 -1
- helm/benchmark/window_services/test_opt_window_service.py +3 -2
- helm/benchmark/window_services/test_palmyra_window_service.py +2 -1
- helm/benchmark/window_services/test_t0pp_window_service.py +2 -1
- helm/benchmark/window_services/test_t511b_window_service.py +2 -1
- helm/benchmark/window_services/test_ul2_window_service.py +2 -1
- helm/benchmark/window_services/test_utils.py +3 -2
- helm/benchmark/window_services/test_yalm_window_service.py +2 -1
- helm/benchmark/window_services/window_service.py +42 -0
- helm/benchmark/window_services/window_service_factory.py +24 -269
- helm/benchmark/window_services/yalm_window_service.py +0 -27
- helm/clients/__init__.py +0 -0
- helm/{proxy/clients → clients}/ai21_client.py +5 -12
- helm/clients/aleph_alpha_client.py +112 -0
- helm/{proxy/clients → clients}/anthropic_client.py +213 -24
- helm/clients/auto_client.py +215 -0
- helm/clients/bedrock_client.py +128 -0
- helm/clients/bedrock_utils.py +72 -0
- helm/{proxy/clients → clients}/client.py +67 -55
- helm/clients/clip_score_client.py +49 -0
- helm/clients/clip_scorers/__init__.py +0 -0
- helm/clients/clip_scorers/base_clip_scorer.py +18 -0
- helm/clients/clip_scorers/clip_scorer.py +50 -0
- helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
- helm/{proxy/clients → clients}/cohere_client.py +6 -17
- helm/clients/gcs_client.py +82 -0
- helm/{proxy/clients → clients}/google_client.py +7 -8
- helm/clients/google_translate_client.py +35 -0
- helm/{proxy/clients → clients}/http_model_client.py +6 -10
- helm/{proxy/clients → clients}/huggingface_client.py +134 -92
- helm/clients/image_generation/__init__.py +0 -0
- helm/clients/image_generation/adobe_vision_client.py +78 -0
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +98 -0
- helm/clients/image_generation/cogview2/__init__.py +0 -0
- helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
- helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
- helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
- helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +96 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
- helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +141 -0
- helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
- helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
- helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
- helm/clients/image_generation/cogview2_client.py +191 -0
- helm/clients/image_generation/dalle2_client.py +192 -0
- helm/clients/image_generation/dalle3_client.py +108 -0
- helm/clients/image_generation/dalle_mini/__init__.py +3 -0
- helm/clients/image_generation/dalle_mini/data.py +442 -0
- helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
- helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
- helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
- helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
- helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
- helm/clients/image_generation/dalle_mini/model/text.py +251 -0
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
- helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
- helm/clients/image_generation/dalle_mini_client.py +190 -0
- helm/clients/image_generation/deep_floyd_client.py +78 -0
- helm/clients/image_generation/huggingface_diffusers_client.py +249 -0
- helm/clients/image_generation/image_generation_client_utils.py +9 -0
- helm/clients/image_generation/lexica_client.py +86 -0
- helm/clients/image_generation/mindalle/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/__init__.py +216 -0
- helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
- helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
- helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
- helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
- helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
- helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
- helm/clients/image_generation/mindalle/utils/config.py +129 -0
- helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
- helm/clients/image_generation/mindalle/utils/utils.py +89 -0
- helm/clients/image_generation/mindalle_client.py +115 -0
- helm/clients/image_generation/nudity_check_client.py +64 -0
- helm/clients/image_generation/together_image_generation_client.py +111 -0
- helm/{proxy/clients → clients}/lit_gpt_client.py +7 -5
- helm/{proxy/clients → clients}/megatron_client.py +13 -7
- helm/clients/mistral_client.py +134 -0
- helm/clients/moderation_api_client.py +109 -0
- helm/clients/open_lm_client.py +43 -0
- helm/clients/openai_client.py +302 -0
- helm/{proxy/clients → clients}/palmyra_client.py +15 -12
- helm/{proxy/clients → clients}/perspective_api_client.py +7 -8
- helm/clients/simple_client.py +64 -0
- helm/{proxy/clients → clients}/test_auto_client.py +15 -15
- helm/clients/test_client.py +100 -0
- helm/clients/test_huggingface_client.py +70 -0
- helm/clients/test_simple_client.py +19 -0
- helm/{proxy/clients → clients}/test_together_client.py +23 -12
- helm/{proxy/clients → clients}/together_client.py +18 -71
- helm/clients/vertexai_client.py +391 -0
- helm/clients/vision_language/__init__.py +0 -0
- helm/clients/vision_language/huggingface_vlm_client.py +104 -0
- helm/{proxy/clients → clients}/vision_language/idefics_client.py +59 -52
- helm/clients/vision_language/open_flamingo/__init__.py +2 -0
- helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
- helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
- helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
- helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
- helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
- helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
- helm/clients/vision_language/open_flamingo_client.py +155 -0
- helm/clients/vision_language/qwen_vlm_client.py +171 -0
- helm/clients/vllm_client.py +46 -0
- helm/common/cache.py +24 -179
- helm/common/cache_backend_config.py +47 -0
- helm/common/clip_score_request.py +41 -0
- helm/common/concurrency.py +32 -0
- helm/common/credentials_utils.py +28 -0
- helm/common/file_caches/__init__.py +0 -0
- helm/common/file_caches/file_cache.py +16 -0
- helm/common/file_caches/local_file_cache.py +61 -0
- helm/common/file_caches/test_local_file_cache.py +25 -0
- helm/common/file_upload_request.py +27 -0
- helm/common/general.py +29 -10
- helm/common/image_generation_parameters.py +25 -0
- helm/common/images_utils.py +24 -1
- helm/common/key_value_store.py +113 -0
- helm/common/media_object.py +13 -0
- helm/common/moderations_api_request.py +71 -0
- helm/common/mongo_key_value_store.py +88 -0
- helm/common/multimodal_request_utils.py +31 -0
- helm/common/nudity_check_request.py +29 -0
- helm/common/object_spec.py +2 -2
- helm/common/request.py +36 -27
- helm/common/test_general.py +6 -0
- helm/common/tokenization_request.py +6 -3
- helm/config/__init__.py +0 -0
- helm/config/model_deployments.yaml +1942 -0
- helm/config/model_metadata.yaml +2201 -0
- helm/config/tokenizer_configs.yaml +362 -0
- helm/proxy/accounts.py +31 -4
- helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
- helm/proxy/critique/model_critique_client.py +13 -5
- helm/proxy/example_queries.py +29 -17
- helm/proxy/retry.py +8 -2
- helm/proxy/server.py +77 -5
- helm/proxy/services/remote_service.py +31 -0
- helm/proxy/services/server_service.py +103 -20
- helm/proxy/services/service.py +34 -2
- helm/proxy/services/test_remote_service.py +7 -6
- helm/proxy/services/test_service.py +27 -18
- helm/proxy/test_accounts.py +32 -0
- helm/proxy/token_counters/auto_token_counter.py +37 -37
- helm/proxy/token_counters/test_auto_token_counter.py +164 -0
- helm/proxy/token_counters/token_counter.py +3 -5
- helm/py.typed +0 -0
- helm/tokenizers/__init__.py +0 -0
- helm/{proxy/tokenizers → tokenizers}/ai21_tokenizer.py +3 -3
- helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +3 -1
- helm/{proxy/tokenizers → tokenizers}/anthropic_tokenizer.py +17 -11
- helm/tokenizers/auto_tokenizer.py +93 -0
- helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +8 -2
- helm/{proxy/tokenizers → tokenizers}/cohere_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +3 -3
- helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +56 -60
- helm/tokenizers/simple_tokenizer.py +33 -0
- helm/tokenizers/test_anthropic_tokenizer.py +82 -0
- helm/tokenizers/test_huggingface_tokenizer.py +136 -0
- helm/tokenizers/test_simple_tokenizer.py +33 -0
- helm/tokenizers/vertexai_tokenizer.py +97 -0
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +5 -3
- helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
- helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
- crfm_helm-0.3.0.dist-info/RECORD +0 -396
- helm/benchmark/vlm_run_specs.py +0 -71
- helm/benchmark/window_services/anthropic_window_service.py +0 -68
- helm/benchmark/window_services/bloom_window_service.py +0 -35
- helm/benchmark/window_services/flan_t5_window_service.py +0 -29
- helm/benchmark/window_services/gpt2_window_service.py +0 -32
- helm/benchmark/window_services/gptj_window_service.py +0 -38
- helm/benchmark/window_services/gptneox_window_service.py +0 -41
- helm/benchmark/window_services/http_model_window_service.py +0 -28
- helm/benchmark/window_services/huggingface_window_service.py +0 -59
- helm/benchmark/window_services/lit_gpt_window_service.py +0 -27
- helm/benchmark/window_services/llama_window_service.py +0 -28
- helm/benchmark/window_services/luminous_window_service.py +0 -67
- helm/benchmark/window_services/megatron_window_service.py +0 -10
- helm/benchmark/window_services/mt_nlg_window_service.py +0 -27
- helm/benchmark/window_services/openai_window_service.py +0 -13
- helm/benchmark/window_services/opt_window_service.py +0 -35
- helm/benchmark/window_services/palmyra_window_service.py +0 -45
- helm/benchmark/window_services/remote_window_service.py +0 -48
- helm/benchmark/window_services/santacoder_window_service.py +0 -27
- helm/benchmark/window_services/starcoder_window_service.py +0 -27
- helm/benchmark/window_services/t0pp_window_service.py +0 -35
- helm/benchmark/window_services/t511b_window_service.py +0 -30
- helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
- helm/benchmark/window_services/ul2_window_service.py +0 -30
- helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
- helm/benchmark/window_services/wider_openai_window_service.py +0 -52
- helm/proxy/clients/aleph_alpha_client.py +0 -99
- helm/proxy/clients/auto_client.py +0 -461
- helm/proxy/clients/goose_ai_client.py +0 -100
- helm/proxy/clients/microsoft_client.py +0 -182
- helm/proxy/clients/openai_client.py +0 -206
- helm/proxy/clients/remote_model_registry.py +0 -28
- helm/proxy/clients/simple_client.py +0 -61
- helm/proxy/clients/test_anthropic_client.py +0 -63
- helm/proxy/clients/test_client.py +0 -31
- helm/proxy/clients/test_huggingface_client.py +0 -87
- helm/proxy/models.py +0 -963
- helm/proxy/test_models.py +0 -27
- helm/proxy/token_counters/ai21_token_counter.py +0 -20
- helm/proxy/token_counters/cohere_token_counter.py +0 -13
- helm/proxy/token_counters/free_token_counter.py +0 -12
- helm/proxy/token_counters/gooseai_token_counter.py +0 -24
- helm/proxy/token_counters/openai_token_counter.py +0 -22
- helm/proxy/token_counters/test_ai21_token_counter.py +0 -86
- helm/proxy/token_counters/test_openai_token_counter.py +0 -79
- helm/proxy/tokenizers/simple_tokenizer.py +0 -32
- helm/proxy/tokenizers/test_huggingface_tokenizer.py +0 -56
- {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/LICENSE +0 -0
- {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/top_level.txt +0 -0
- /helm/{proxy/clients → benchmark/annotation}/__init__.py +0 -0
- /helm/{proxy/clients/vision_language → benchmark/annotation/image2structure}/__init__.py +0 -0
- /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
- /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
- /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
- /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
- /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
- /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/ice_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/test_ice_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +0 -0
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
from typing import Tuple, Dict, Any
|
|
2
|
+
import os
|
|
3
|
+
import subprocess
|
|
4
|
+
import tempfile
|
|
5
|
+
|
|
6
|
+
from helm.benchmark.annotation.image2structure.image_compiler_annotator import ImageCompilerAnnotator, CompilationError
|
|
7
|
+
from helm.benchmark.adaptation.request_state import RequestState
|
|
8
|
+
from helm.common.cache import CacheConfig
|
|
9
|
+
from helm.common.optional_dependencies import handle_module_not_found_error, OptionalDependencyNotInstalled
|
|
10
|
+
|
|
11
|
+
try:
|
|
12
|
+
from PIL import Image, ImageOps
|
|
13
|
+
except ModuleNotFoundError as ex:
|
|
14
|
+
handle_module_not_found_error(ex, suggestions=["images"])
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class LilypondCompilerAnnotator(ImageCompilerAnnotator):
    """Annotator that compiles the text completions into a music sheet with LilyPond."""

    name: str = "lilypond_compiler"
    # NOTE(review): this is a machine-specific install directory. Unless LilyPond is installed at
    # exactly this location, the constructor raises OptionalDependencyNotInstalled.
    # Confirm whether this should be configurable.
    base_path = "/home/josselin/installs/lilypond-2.24.3/bin"

    def __init__(self, cache_config: CacheConfig, file_storage_path: str):
        """Initialize the annotator and verify that the LilyPond binary can be executed.

        Raises:
            OptionalDependencyNotInstalled: if `lilypond --version` cannot be launched
                or exits with a non-zero return code.
        """
        super().__init__(cache_config, file_storage_path)
        try:
            result = subprocess.run([f"{self.base_path}/lilypond", "--version"], capture_output=True, text=True)
        except FileNotFoundError as e:
            raise OptionalDependencyNotInstalled(
                "LilyPond is not installed. Download and install it from https://lilypond.org/download.html.\n"
                f"Original error: {e}"
            ) from e
        if result.returncode != 0:
            raise OptionalDependencyNotInstalled(
                "LilyPond is not installed. Download and install it from https://lilypond.org/download.html"
            )

    def compile_completion_into_image(
        self, request_state: RequestState, completion_text: str
    ) -> Tuple[Image.Image, Dict[str, Any]]:
        """Given a completion, compile it with LilyPond.

        Args:
            request_state: The request state the completion belongs to (not read here).
            completion_text: The LilyPond source code to compile.

        Returns:
            The cropped image of the music sheet and an empty dictionary of infos.

        Raises:
            CompilationError: if `convert-ly` fails or LilyPond does not produce an image.
        """
        # The LilyPond command requires a file on disk. Everything is written into a private
        # temporary directory so that the source file, the image and any by-product of the tools
        # (e.g. the backup file written by `convert-ly -e`) are removed together, even on failure.
        with tempfile.TemporaryDirectory() as tmp_dir:
            # What we pass in as -o should be the same name as the .ly file, but without the extension
            output_path: str = os.path.join(tmp_dir, "sheet_music")
            ly_file_path: str = f"{output_path}.ly"
            # The image file of the music sheet should have the same path as the .ly file, but with .png extension
            sheet_music_path: str = f"{output_path}.png"

            with open(ly_file_path, "w") as f:
                f.write(completion_text)

            try:
                # Edits the LilyPond file to be compatible with the current version
                result = subprocess.run(
                    [f"{self.base_path}/convert-ly", "-e", ly_file_path], capture_output=True, text=True
                )
                # Explicit checks instead of `assert` so that validation still runs under `python -O`
                if result.returncode != 0:
                    raise CompilationError(f"convert-ly failed: {result.stderr}")

                # Generate PNG image from the LilyPond file
                # LilyPond supports partial compilation, which means it attempts to produce an image
                # for the correct portions of the code, even if there are errors elsewhere
                subprocess.run(
                    [f"{self.base_path}/lilypond", "--png", "-o", output_path, ly_file_path],
                    capture_output=True,
                    text=True,
                )
                # If an image file is not generated, we consider it an absolute compilation failure
                if not os.path.exists(sheet_music_path):
                    raise CompilationError("lilypond did not generate the image")

                # Load the image as a PIL Image object. The context manager closes the file handle
                # before the temporary directory is deleted; the crops below are in-memory copies.
                with Image.open(sheet_music_path) as sheet:
                    # Crop the image to remove the white space around the music sheet
                    (w, h) = sheet.size
                    image = sheet.crop((0, 0, w, h - int(h * 0.2)))  # Remove pagination
                image = image.crop(ImageOps.invert(image).getbbox())  # Remove white border
            except (AssertionError, RuntimeError) as e:
                raise CompilationError(str(e)) from e

        return image, dict()
|
|
@@ -0,0 +1,132 @@
|
|
|
1
|
+
from typing import List, Tuple, Optional, Dict, Any
|
|
2
|
+
import json
|
|
3
|
+
import os
|
|
4
|
+
import shutil
|
|
5
|
+
import threading
|
|
6
|
+
|
|
7
|
+
from helm.benchmark.annotation.image2structure.image_compiler_annotator import ImageCompilerAnnotator, CompilationError
|
|
8
|
+
from helm.benchmark.adaptation.request_state import RequestState
|
|
9
|
+
from helm.common.optional_dependencies import handle_module_not_found_error
|
|
10
|
+
from helm.benchmark.scenarios.vision_language.image2structure.webpage.driver import ScreenshotOptions
|
|
11
|
+
from helm.benchmark.scenarios.vision_language.image2structure.webpage.utils import convert_html_to_text
|
|
12
|
+
from helm.benchmark.scenarios.vision_language.image2structure.webpage_scenario import serve_and_take_screenshot
|
|
13
|
+
from helm.benchmark.scenarios.scenario import ASSET_NAME_TAG, ASSET_PATH_TAG
|
|
14
|
+
from helm.common.general import ensure_directory_exists
|
|
15
|
+
from helm.common.cache import CacheConfig
|
|
16
|
+
|
|
17
|
+
try:
|
|
18
|
+
from PIL import Image
|
|
19
|
+
from html2text import HTML2Text
|
|
20
|
+
except ModuleNotFoundError as e:
|
|
21
|
+
handle_module_not_found_error(e, suggestions=["image2structure"])
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class WebpageCompilerAnnotator(ImageCompilerAnnotator):
    """Annotator that compiles the text completions into a webpage
    And takes a screenshot of the webpage."""

    name: str = "webpage_compiler"

    # Delimiters for the code block
    DELIMITERS: List[Tuple[str, str]] = [
        ("```json", "```"),
        ("```", "```"),
    ]

    def __init__(self, cache_config: CacheConfig, file_storage_path: str):
        super().__init__(cache_config, file_storage_path)
        # HTML-to-text converter used by `postprocess_infos`; links are ignored
        self._html2text = HTML2Text()
        self._html2text.ignore_links = True

    def postprocess_infos(self, infos: Dict[str, Any]) -> Dict[str, Any]:
        """Postprocess the infos: add a "text" entry holding the text version of the "html" entry."""
        annotations = super().postprocess_infos(infos)
        assert "html" in annotations, "The html field should be present in the infos"
        annotations["text"] = convert_html_to_text(self._html2text, infos["html"])
        return annotations

    def compile_completion_into_image(
        self, request_state: RequestState, completion_text: str
    ) -> Tuple[Image.Image, Dict[str, Any]]:
        """Given a completion, parse the code and compile it into an image and return the image and the infos.

        The completion is expected to be a JSON list of `{"filename": ..., "content": ...}` objects,
        optionally wrapped in a Markdown code block. The files are written to a per-thread working
        directory together with the assets of the instance, the page is served and a screenshot is taken.
        The working directory is always removed afterwards, including when the compilation fails.

        Raises:
            CompilationError: if the completion is not valid JSON, is not a list of files, contains
                an entry without a string filename/content, or names a file outside of the
                working directory.
        """
        # Check for code block delimiters
        # After this completion should be a valid json object
        for start, end in self.DELIMITERS:
            if start not in completion_text:
                continue
            start_index = completion_text.index(start) + len(start)
            end_index = completion_text.find(end, start_index)
            if end_index != -1:
                completion_text = completion_text[start_index:end_index]
                break

        # Parse code into json object
        try:
            structure: Any = json.loads(completion_text)
        except json.JSONDecodeError as e:
            raise CompilationError(f"Failed to parse the completion as a JSON object: {e}") from e
        if not isinstance(structure, list):
            raise CompilationError("The completion should be a list of files")

        # Collect the assets
        assets_paths: List[str] = []
        assets_names: List[str] = []
        for reference in request_state.instance.references:
            if ASSET_PATH_TAG in reference.tags:
                assert reference.output.multimedia_content is not None
                for media_object in reference.output.multimedia_content.media_objects:
                    assert media_object.is_local_file
                    assert media_object.is_type("image")
                    assert isinstance(media_object.location, str)
                    assets_paths.append(media_object.location)
            if ASSET_NAME_TAG in reference.tags:
                assert reference.output.multimedia_content is not None
                for media_object in reference.output.multimedia_content.media_objects:
                    assert media_object.is_type("text")
                    assert isinstance(media_object.text, str)
                    assets_names.append(media_object.text)
        assert len(assets_paths) == len(assets_names)

        # Create a temporary directory to store the files
        cache_config: CacheConfig = self._cache.config
        repo_path: str = "prod_env/tmp"
        if hasattr(cache_config, "path"):
            repo_path = os.path.join(os.path.dirname(cache_config.path), "tmp")
        # Make the repo path thread safe by adding the thread id
        repo_path = f"{repo_path}_{threading.get_ident()}"

        try:
            ensure_directory_exists(repo_path)
            repo_root: str = os.path.abspath(repo_path)

            # Copy the assets
            for asset_path, asset_name in zip(assets_paths, assets_names):
                dest_path: str = os.path.join(repo_path, asset_name)
                # Make sure the parent directory exists
                os.makedirs(os.path.dirname(dest_path), exist_ok=True)
                shutil.copyfile(asset_path, dest_path)

            # Create each file in the temporary directory
            for item in structure:
                if not isinstance(item, dict):
                    raise CompilationError("Each file should have a valid filename and content")
                filename: Optional[str] = item.get("filename")
                content: Optional[str] = item.get("content")
                if not isinstance(filename, str) or not isinstance(content, str):
                    raise CompilationError("Each file should have a valid filename and content")
                if filename in assets_names:
                    # Some models will include assets in their response like this:
                    # {
                    #     "filename": "chmber.jpg",
                    #     "content": "The content of the chmber.jpg file is a binary image and cannot be displayed as text."
                    # }
                    # In this case, we skip the file creation
                    continue
                # The filename comes from a model completion: refuse absolute paths and `..`
                # components that would make us write outside of the working directory.
                resolved_path: str = os.path.abspath(os.path.join(repo_path, filename))
                if not resolved_path.startswith(repo_root + os.sep):
                    raise CompilationError(f"The file {filename} is outside of the compilation directory")
                # Create parent directories if they do not exist
                parent_dir = os.path.join(repo_path, os.path.dirname(filename))
                os.makedirs(parent_dir, exist_ok=True)
                with open(os.path.join(repo_path, filename), "w") as f:
                    f.write(content)

            # Save the screenshot and load the image
            destination_path: str = os.path.join(repo_path, "output.png")
            infos: Dict[str, Any] = serve_and_take_screenshot(repo_path, destination_path, ScreenshotOptions())
            image: Image.Image = Image.open(destination_path)
            # Image.open is lazy: read the pixel data now, before the file is deleted below
            image.load()
        finally:
            # Delete the repository, even on failure, so that a later compilation running on the
            # same thread does not pick up stale files
            shutil.rmtree(repo_path, ignore_errors=True)

        return image, infos
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
from typing import Any, Dict
|
|
2
|
+
import os
|
|
3
|
+
import shutil
|
|
4
|
+
|
|
5
|
+
from helm.benchmark.annotation.annotator_factory import AnnotatorFactory
|
|
6
|
+
from helm.benchmark.annotation.annotator import Annotator, AnnotatorSpec
|
|
7
|
+
from helm.common.cache_backend_config import BlackHoleCacheBackendConfig
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class TestAnnotatorFactory:
    """Checks that `AnnotatorFactory` builds an annotator from an `AnnotatorSpec`."""

    def setup_method(self):
        # Storage directory handed to the factory; removed again in `teardown_method`
        self.file_storage_path: str = "tmp"
        no_credentials: Dict[str, Any] = {}
        self.annotator_factory = AnnotatorFactory(
            no_credentials,
            self.file_storage_path,
            BlackHoleCacheBackendConfig(),
        )

    def teardown_method(self):
        # Nothing to clean up if the storage directory does not exist
        if not os.path.exists(self.file_storage_path):
            return
        shutil.rmtree(self.file_storage_path)

    def test_get_annotator(self):
        dummy_spec = AnnotatorSpec(class_name="helm.benchmark.annotation.annotator.DummyAnnotator")
        annotator = self.annotator_factory.get_annotator(dummy_spec)
        assert isinstance(annotator, Annotator)
        assert annotator.name == "dummy"
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
from typing import Any
|
|
2
|
+
from dataclasses import replace
|
|
3
|
+
import pytest
|
|
4
|
+
|
|
5
|
+
from helm.benchmark.annotation.annotator import Annotator, DummyAnnotator
|
|
6
|
+
from helm.benchmark.adaptation.request_state import RequestState
|
|
7
|
+
from helm.benchmark.scenarios.scenario import Instance, Input
|
|
8
|
+
from helm.common.request import Request, RequestResult, GeneratedOutput
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class TestDummyAnnotator:
    """Tests for `DummyAnnotator`, with and without a request result."""

    def setup_method(self):
        self.annotator: Annotator = DummyAnnotator()
        # Request state without a result; `test_annotate` attaches one with `replace`
        hello_instance = Instance(input=Input(text="hello world"), references=[])
        self.request_state = RequestState(
            instance=hello_instance,
            request=Request(),
            request_mode="original",
            output_mapping=None,
            result=None,
            train_trial_index=0,
            num_train_instances=0,
            prompt_truncated=False,
            reference_index=None,
        )

    def test_annotate(self):
        successful_result = RequestResult(
            success=True,
            embedding=[],
            completions=[GeneratedOutput(text="How are you?", logprob=0, tokens=[])],
            cached=True,
        )
        request_state: RequestState = replace(self.request_state, result=successful_result)

        annotations: Any = self.annotator.annotate(request_state)

        assert annotations is not None
        assert "all_caps" in annotations[0]
        assert request_state.result is not None  # To make mypy happy
        expected_text = request_state.result.completions[0].text.upper()
        assert annotations[0]["all_caps"] == expected_text

    def test_annotate_no_result(self):
        # Annotating a request state that has no result is expected to raise ValueError
        with pytest.raises(ValueError):
            self.annotator.annotate(self.request_state)
|
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
import os
|
|
2
|
+
|
|
3
|
+
from typing import Optional, List, Dict, Any
|
|
4
|
+
from dataclasses import dataclass, replace
|
|
5
|
+
from helm.common.cache_backend_config import (
|
|
6
|
+
CacheBackendConfig,
|
|
7
|
+
BlackHoleCacheBackendConfig,
|
|
8
|
+
MongoCacheBackendConfig,
|
|
9
|
+
SqliteCacheBackendConfig,
|
|
10
|
+
)
|
|
11
|
+
|
|
12
|
+
from helm.common.general import ensure_directory_exists, parallel_map, get_credentials
|
|
13
|
+
from helm.common.hierarchical_logger import htrack, hlog
|
|
14
|
+
from helm.benchmark.adaptation.scenario_state import ScenarioState
|
|
15
|
+
from helm.benchmark.adaptation.request_state import RequestState
|
|
16
|
+
from helm.benchmark.annotation.annotator import AnnotatorSpec, Annotator
|
|
17
|
+
from helm.benchmark.annotation.annotator_factory import AnnotatorFactory
|
|
18
|
+
from helm.proxy.services.service import CACHE_DIR
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class AnnotationExecutorError(Exception):
    """Raised by `AnnotationExecutor` on an invalid cache configuration or when an annotator fails."""
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
@dataclass(frozen=True)
class AnnotationExecutionSpec:
    """Settings that control how `AnnotationExecutor` runs the annotators."""

    # Path where API credentials and cache is stored.
    # This path is the same as `--base-path` when launching the proxy server (see server.py).
    # NOTE(review): the previous note "Required when url is not set" refers to a `url` field
    # that this dataclass does not define; confirm whether it is still relevant.
    local_path: str

    # How many threads to have at once
    parallelism: int

    # Whether to skip execution
    dry_run: bool = False

    # If set, SQLite will be used for the cache.
    # This specifies the directory in which the SQLite cache will store files.
    # At most one of sqlite_cache_backend_config and mongo_cache_backend_config can be set.
    sqlite_cache_backend_config: Optional[SqliteCacheBackendConfig] = None

    # If set, MongoDB will be used for the cache.
    # This specifies the MongoDB database to be used by the MongoDB cache.
    # At most one of sqlite_cache_backend_config and mongo_cache_backend_config can be set.
    mongo_cache_backend_config: Optional[MongoCacheBackendConfig] = None
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
class AnnotationExecutor:
    """
    An `AnnotationExecutor` takes a `ScenarioState` and runs every annotator listed in its
    `annotator_specs` on each of its request states.
    It returns a new `ScenarioState` whose request states carry the annotations.
    """

    def __init__(self, execution_spec: AnnotationExecutionSpec):
        """Pick the cache backend and build the `AnnotatorFactory`.

        Raises:
            AnnotationExecutorError: if both a SQLite and a MongoDB cache backend are configured.
        """
        self.execution_spec = execution_spec

        sqlite_config = execution_spec.sqlite_cache_backend_config
        mongo_config = execution_spec.mongo_cache_backend_config
        if sqlite_config and mongo_config:
            raise AnnotationExecutorError(
                "At most one of sqlite_cache_backend_config and mongo_cache_backend_config can be set."
            )
        # Falls back to the black hole cache backend when neither backend is configured
        cache_backend_config: CacheBackendConfig = sqlite_config or mongo_config or BlackHoleCacheBackendConfig()

        base_path: str = execution_spec.local_path
        ensure_directory_exists(base_path)
        client_file_storage_path = os.path.join(base_path, CACHE_DIR)
        ensure_directory_exists(client_file_storage_path)
        self.factory = AnnotatorFactory(
            credentials=get_credentials(base_path),
            file_storage_path=client_file_storage_path,
            cache_backend_config=cache_backend_config,
        )

    @htrack(None)
    def execute(self, scenario_state: ScenarioState) -> ScenarioState:
        """Annotate every request state of `scenario_state`.

        Returns `scenario_state` unchanged on a dry run or when it lists no annotators.
        """
        if self.execution_spec.dry_run:
            hlog("Skipped annotation.")
            return scenario_state

        specs = scenario_state.annotator_specs
        if specs is None or len(specs) == 0:
            hlog("No annotators to run.")
            return scenario_state

        def annotate_one(request_state: RequestState) -> RequestState:
            assert scenario_state.annotator_specs is not None
            return self.process(scenario_state.annotator_specs, request_state)

        self.annotator_specs = scenario_state.annotator_specs

        annotated_states = parallel_map(
            annotate_one,
            scenario_state.request_states,
            parallelism=self.execution_spec.parallelism,
        )
        hlog(f"Annotated {len(annotated_states)} requests")

        return ScenarioState(
            adapter_spec=scenario_state.adapter_spec,
            request_states=annotated_states,
            annotator_specs=scenario_state.annotator_specs,
        )

    def process(self, annotator_specs: List[AnnotatorSpec], state: RequestState) -> RequestState:
        """Run each annotator on `state` and return a copy holding the annotations keyed by annotator name.

        Raises:
            AnnotationExecutorError: if creating or running an annotator raises.
        """
        results_by_name: Dict[str, Any] = {}
        try:
            for spec in annotator_specs:
                annotator: Annotator = self.factory.get_annotator(spec)
                results_by_name[annotator.name] = annotator.annotate(state)
        except Exception as e:
            raise AnnotationExecutorError(f"{str(e)} Request: {state.request}") from e
        return replace(state, annotations=results_by_name)
|
|
@@ -10,13 +10,13 @@ from helm.common.general import ensure_file_downloaded, ensure_directory_exists
|
|
|
10
10
|
from helm.common.optional_dependencies import handle_module_not_found_error
|
|
11
11
|
from helm.benchmark.scenarios.scenario import Input, Instance, Reference, Output
|
|
12
12
|
from .perturbation_description import PerturbationDescription
|
|
13
|
-
from .perturbation import Perturbation
|
|
13
|
+
from .perturbation import Perturbation, TextPerturbation
|
|
14
14
|
|
|
15
15
|
|
|
16
16
|
############################################################
|
|
17
17
|
|
|
18
18
|
|
|
19
|
-
class ChineseTyposPerturbation(
|
|
19
|
+
class ChineseTyposPerturbation(TextPerturbation):
|
|
20
20
|
"""
|
|
21
21
|
Chinese typos. For implementation details, see
|
|
22
22
|
https://github.com/GEM-benchmark/NL-Augmenter/tree/main/nlaugmenter/transformations/chinese_butter_fingers_perturbation
|
|
@@ -271,7 +271,7 @@ class ChineseTyposPerturbation(Perturbation):
|
|
|
271
271
|
return chars_with_similar_pinyin
|
|
272
272
|
|
|
273
273
|
|
|
274
|
-
class ChineseSynonymPerturbation(
|
|
274
|
+
class ChineseSynonymPerturbation(TextPerturbation):
|
|
275
275
|
"""
|
|
276
276
|
Chinese synonyms. For implementation details, see
|
|
277
277
|
https://github.com/GEM-benchmark/NL-Augmenter/blob/main/nlaugmenter/transformations/chinese_antonym_synonym_substitution
|
|
@@ -340,7 +340,7 @@ class ChineseSynonymPerturbation(Perturbation):
|
|
|
340
340
|
return sample_list[index]
|
|
341
341
|
|
|
342
342
|
|
|
343
|
-
class CLEVAMildMixPerturbation(
|
|
343
|
+
class CLEVAMildMixPerturbation(TextPerturbation):
|
|
344
344
|
"""
|
|
345
345
|
CLEVA robustness perturbation that composes several perturbations.
|
|
346
346
|
"""
|
|
@@ -370,7 +370,7 @@ class CLEVAMildMixPerturbation(Perturbation):
|
|
|
370
370
|
############################################################
|
|
371
371
|
|
|
372
372
|
|
|
373
|
-
class ChineseGenderPerturbation(
|
|
373
|
+
class ChineseGenderPerturbation(TextPerturbation):
|
|
374
374
|
"""Individual fairness perturbation for Chinese gender terms and pronouns."""
|
|
375
375
|
|
|
376
376
|
name: str = "chinese_gender"
|
|
@@ -601,13 +601,6 @@ class ChinesePersonNamePerturbation(Perturbation):
|
|
|
601
601
|
name = rng.choice(list(options))
|
|
602
602
|
return name
|
|
603
603
|
|
|
604
|
-
def perturb(self, text: str, rng: Random) -> str:
|
|
605
|
-
"""
|
|
606
|
-
Perturbing the text is handled in `perturb_with_persistency` to ensure that perturbed names
|
|
607
|
-
in `Instance`s and `Reference`s match.
|
|
608
|
-
"""
|
|
609
|
-
raise NotImplementedError("Should never be called")
|
|
610
|
-
|
|
611
604
|
def perturb_with_persistency(
|
|
612
605
|
self, text: str, rng: Random, name_substitution_mapping: Dict[str, str], skipped_tokens: Set[str]
|
|
613
606
|
) -> str:
|
|
@@ -686,7 +679,7 @@ class ChinesePersonNamePerturbation(Perturbation):
|
|
|
686
679
|
return tokens, tags
|
|
687
680
|
|
|
688
681
|
|
|
689
|
-
class SimplifiedToTraditionalPerturbation(
|
|
682
|
+
class SimplifiedToTraditionalPerturbation(TextPerturbation):
|
|
690
683
|
"""Individual fairness perturbation for Chinese simplified to Chinese traditional."""
|
|
691
684
|
|
|
692
685
|
name: str = "simplified_to_traditional"
|
|
@@ -713,7 +706,7 @@ class SimplifiedToTraditionalPerturbation(Perturbation):
|
|
|
713
706
|
return perturbed_text
|
|
714
707
|
|
|
715
708
|
|
|
716
|
-
class MandarinToCantonesePerturbation(
|
|
709
|
+
class MandarinToCantonesePerturbation(TextPerturbation):
|
|
717
710
|
"""
|
|
718
711
|
Individual fairness perturbation for Mandarin to Cantonese translation.
|
|
719
712
|
The implementation is inspired by https://justyy.com/tools/chinese-converter/
|
|
@@ -5,7 +5,7 @@ import re
|
|
|
5
5
|
from random import Random
|
|
6
6
|
|
|
7
7
|
from helm.common.general import match_case
|
|
8
|
-
from .perturbation import
|
|
8
|
+
from .perturbation import TextPerturbation
|
|
9
9
|
from .perturbation_description import PerturbationDescription
|
|
10
10
|
|
|
11
11
|
|
|
@@ -92,7 +92,7 @@ CONTRACTION_MAP: Dict[str, str] = {
|
|
|
92
92
|
|
|
93
93
|
# The implementations below are based on
|
|
94
94
|
# https://github.com/GEM-benchmark/NL-Augmenter/blob/main/transformations/contraction_expansions/transformation.py
|
|
95
|
-
class ContractionPerturbation(
|
|
95
|
+
class ContractionPerturbation(TextPerturbation):
|
|
96
96
|
"""
|
|
97
97
|
Contractions.
|
|
98
98
|
Replaces each expansion with its contracted version.
|
|
@@ -132,7 +132,7 @@ class ContractionPerturbation(Perturbation):
|
|
|
132
132
|
return self.reverse_contraction_pattern.sub(cont, text)
|
|
133
133
|
|
|
134
134
|
|
|
135
|
-
class ExpansionPerturbation(
|
|
135
|
+
class ExpansionPerturbation(TextPerturbation):
|
|
136
136
|
"""
|
|
137
137
|
Expansions.
|
|
138
138
|
Replaces each contraction with its expanded version.
|
|
@@ -81,6 +81,3 @@ class ContrastSetsPerturbation(Perturbation):
|
|
|
81
81
|
references=perturbed_references,
|
|
82
82
|
perturbation=description,
|
|
83
83
|
)
|
|
84
|
-
|
|
85
|
-
def perturb(self, text: str, rng: Random) -> str: # we need this since parent method is abstract
|
|
86
|
-
raise NotImplementedError("Should never be called since apply() was overridden")
|
|
@@ -35,7 +35,6 @@ class Processor:
|
|
|
35
35
|
|
|
36
36
|
@dataclass(frozen=True)
|
|
37
37
|
class DataAugmenter:
|
|
38
|
-
|
|
39
38
|
# Perturbations to apply to generate new instances
|
|
40
39
|
perturbations: List[Perturbation]
|
|
41
40
|
|
|
@@ -72,7 +71,6 @@ class DataAugmenter:
|
|
|
72
71
|
|
|
73
72
|
@dataclass(frozen=True)
|
|
74
73
|
class DataAugmenterSpec:
|
|
75
|
-
|
|
76
74
|
# List of perturbation specs to use to augment the data
|
|
77
75
|
perturbation_specs: List[PerturbationSpec] = field(default_factory=list)
|
|
78
76
|
|
|
@@ -8,10 +8,10 @@ from typing import Dict, Optional, List
|
|
|
8
8
|
|
|
9
9
|
from helm.common.general import match_case, ensure_file_downloaded
|
|
10
10
|
from .perturbation_description import PerturbationDescription
|
|
11
|
-
from .perturbation import
|
|
11
|
+
from .perturbation import TextPerturbation
|
|
12
12
|
|
|
13
13
|
|
|
14
|
-
class DialectPerturbation(
|
|
14
|
+
class DialectPerturbation(TextPerturbation):
|
|
15
15
|
"""Individual fairness perturbation for dialect."""
|
|
16
16
|
|
|
17
17
|
""" Short unique identifier of the perturbation (e.g., extra_space) """
|
|
@@ -1,11 +1,11 @@
|
|
|
1
1
|
from dataclasses import dataclass
|
|
2
2
|
from random import Random
|
|
3
3
|
|
|
4
|
-
from .perturbation import
|
|
4
|
+
from .perturbation import TextPerturbation
|
|
5
5
|
from .perturbation_description import PerturbationDescription
|
|
6
6
|
|
|
7
7
|
|
|
8
|
-
class ExtraSpacePerturbation(
|
|
8
|
+
class ExtraSpacePerturbation(TextPerturbation):
|
|
9
9
|
"""
|
|
10
10
|
A toy perturbation that replaces existing spaces in the text with
|
|
11
11
|
`num_spaces` number of spaces.
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
from dataclasses import dataclass
|
|
2
2
|
|
|
3
|
-
from .perturbation import
|
|
3
|
+
from .perturbation import TextPerturbation
|
|
4
4
|
from .perturbation_description import PerturbationDescription
|
|
5
5
|
|
|
6
6
|
from random import Random
|
|
@@ -31,7 +31,7 @@ UNCERTAIN_PHRASES = ["maybe", "perhaps", "probably", "possibly", "most likely"]
|
|
|
31
31
|
FILL_PHRASE = ["uhm", "umm", "ahh", "err", "actually", "obviously", "naturally", "like", "you know"]
|
|
32
32
|
|
|
33
33
|
|
|
34
|
-
class FillerWordsPerturbation(
|
|
34
|
+
class FillerWordsPerturbation(TextPerturbation):
|
|
35
35
|
"""
|
|
36
36
|
Randomly inserts filler words and phrases in the sentence.
|
|
37
37
|
Perturbation example:
|
|
@@ -6,7 +6,7 @@ from typing import Dict, List, Optional, Tuple
|
|
|
6
6
|
|
|
7
7
|
from helm.common.general import match_case
|
|
8
8
|
from .perturbation_description import PerturbationDescription
|
|
9
|
-
from .perturbation import
|
|
9
|
+
from .perturbation import TextPerturbation
|
|
10
10
|
|
|
11
11
|
|
|
12
12
|
""" Gender term mappings """
|
|
@@ -62,7 +62,7 @@ GENDER_PRONOUN_MAPPINGS: List[Tuple[str, ...]] = [
|
|
|
62
62
|
]
|
|
63
63
|
|
|
64
64
|
|
|
65
|
-
class GenderPerturbation(
|
|
65
|
+
class GenderPerturbation(TextPerturbation):
|
|
66
66
|
"""Individual fairness perturbation for gender terms and pronouns."""
|
|
67
67
|
|
|
68
68
|
""" Short unique identifier of the perturbation (e.g., extra_space) """
|
|
@@ -214,7 +214,7 @@ class GenderPerturbation(Perturbation):
|
|
|
214
214
|
def perturb(self, text: str, rng: Random) -> str:
|
|
215
215
|
"""Perform the perturbations on the provided text."""
|
|
216
216
|
# Substitute the words
|
|
217
|
-
for
|
|
217
|
+
for word, synonym in self.word_synonym_pairs:
|
|
218
218
|
text = self.substitute_word(text, word, synonym, rng)
|
|
219
219
|
|
|
220
220
|
return text
|
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
from random import Random
|
|
2
2
|
|
|
3
|
-
from .perturbation import
|
|
3
|
+
from .perturbation import TextPerturbation
|
|
4
4
|
from .perturbation_description import PerturbationDescription
|
|
5
5
|
|
|
6
6
|
|
|
7
|
-
class LowerCasePerturbation(
|
|
7
|
+
class LowerCasePerturbation(TextPerturbation):
|
|
8
8
|
"""
|
|
9
9
|
Simple perturbation turning input and references into lowercase.
|
|
10
10
|
"""
|
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
from random import Random
|
|
2
2
|
|
|
3
3
|
from .perturbation_description import PerturbationDescription
|
|
4
|
-
from .perturbation import
|
|
4
|
+
from .perturbation import TextPerturbation
|
|
5
5
|
from .lowercase_perturbation import LowerCasePerturbation
|
|
6
6
|
from .contraction_expansion_perturbation import ContractionPerturbation
|
|
7
7
|
from .space_perturbation import SpacePerturbation
|
|
8
8
|
from .misspelling_perturbation import MisspellingPerturbation
|
|
9
9
|
|
|
10
10
|
|
|
11
|
-
class MildMixPerturbation(
|
|
11
|
+
class MildMixPerturbation(TextPerturbation):
|
|
12
12
|
"""
|
|
13
13
|
Canonical robustness perturbation that composes several perturbations.
|
|
14
14
|
These perturbations are chosen to be reasonable.
|
|
@@ -6,13 +6,13 @@ from random import Random
|
|
|
6
6
|
from typing import Dict, List
|
|
7
7
|
|
|
8
8
|
from helm.common.general import match_case
|
|
9
|
-
from .perturbation import
|
|
9
|
+
from .perturbation import TextPerturbation
|
|
10
10
|
from .perturbation_description import PerturbationDescription
|
|
11
11
|
|
|
12
12
|
|
|
13
13
|
# The implementation below is based on the following list of common misspellings:
|
|
14
14
|
# https://en.wikipedia.org/wiki/Wikipedia:Lists_of_common_misspellings/For_machines
|
|
15
|
-
class MisspellingPerturbation(
|
|
15
|
+
class MisspellingPerturbation(TextPerturbation):
|
|
16
16
|
"""
|
|
17
17
|
Replaces words randomly with common misspellings, from a list of common misspellings.
|
|
18
18
|
|