crfm-helm 0.3.0__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/METADATA +144 -36
- crfm_helm-0.5.0.dist-info/RECORD +642 -0
- {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +37 -2
- helm/benchmark/adaptation/adapters/adapter.py +4 -42
- helm/benchmark/adaptation/adapters/adapter_factory.py +24 -27
- helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/generation_adapter.py +2 -0
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +21 -4
- helm/benchmark/adaptation/adapters/language_modeling_adapter.py +12 -5
- helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +104 -0
- helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +5 -1
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/test_adapter.py +2 -1
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +59 -14
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +40 -5
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +78 -10
- helm/benchmark/adaptation/common_adapter_specs.py +376 -0
- helm/benchmark/adaptation/prompt.py +7 -1
- helm/benchmark/adaptation/request_state.py +6 -1
- helm/benchmark/adaptation/scenario_state.py +6 -2
- helm/benchmark/annotation/annotator.py +43 -0
- helm/benchmark/annotation/annotator_factory.py +61 -0
- helm/benchmark/annotation/image2structure/image_compiler_annotator.py +88 -0
- helm/benchmark/annotation/image2structure/latex_compiler_annotator.py +59 -0
- helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +84 -0
- helm/benchmark/annotation/image2structure/webpage_compiler_annotator.py +132 -0
- helm/benchmark/annotation/test_annotator_factory.py +26 -0
- helm/benchmark/annotation/test_dummy_annotator.py +44 -0
- helm/benchmark/annotation_executor.py +124 -0
- helm/benchmark/augmentations/cleva_perturbation.py +7 -14
- helm/benchmark/augmentations/contraction_expansion_perturbation.py +3 -3
- helm/benchmark/augmentations/contrast_sets_perturbation.py +0 -3
- helm/benchmark/augmentations/data_augmenter.py +0 -2
- helm/benchmark/augmentations/dialect_perturbation.py +2 -2
- helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
- helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
- helm/benchmark/augmentations/gender_perturbation.py +3 -3
- helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
- helm/benchmark/augmentations/mild_mix_perturbation.py +2 -2
- helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
- helm/benchmark/augmentations/person_name_perturbation.py +0 -7
- helm/benchmark/augmentations/perturbation.py +20 -7
- helm/benchmark/augmentations/perturbation_description.py +1 -1
- helm/benchmark/augmentations/space_perturbation.py +2 -2
- helm/benchmark/augmentations/suffix_perturbation.py +29 -0
- helm/benchmark/augmentations/synonym_perturbation.py +2 -2
- helm/benchmark/augmentations/test_perturbation.py +11 -7
- helm/benchmark/augmentations/translate_perturbation.py +30 -0
- helm/benchmark/augmentations/typos_perturbation.py +2 -2
- helm/benchmark/config_registry.py +38 -0
- helm/benchmark/executor.py +46 -16
- helm/benchmark/huggingface_registration.py +37 -7
- helm/benchmark/metrics/basic_metrics.py +172 -641
- helm/benchmark/metrics/bbq_metrics.py +3 -4
- helm/benchmark/metrics/bias_metrics.py +6 -6
- helm/benchmark/metrics/classification_metrics.py +11 -8
- helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
- helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
- helm/benchmark/metrics/code_metrics.py +4 -3
- helm/benchmark/metrics/code_metrics_helper.py +0 -2
- helm/benchmark/metrics/common_metric_specs.py +167 -0
- helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
- helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
- helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
- helm/benchmark/metrics/disinformation_metrics.py +6 -112
- helm/benchmark/metrics/dry_run_metrics.py +5 -3
- helm/benchmark/metrics/efficiency_metrics.py +206 -0
- helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
- helm/benchmark/metrics/evaluate_reference_metrics.py +376 -0
- helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
- helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
- helm/benchmark/metrics/image_generation/clip_score_metrics.py +73 -0
- helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
- helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
- helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
- helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
- helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
- helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
- helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
- helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
- helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
- helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
- helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
- helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
- helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
- helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
- helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
- helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
- helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
- helm/benchmark/metrics/image_generation/q16/test_q16.py +18 -0
- helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
- helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
- helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
- helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
- helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
- helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
- helm/benchmark/metrics/instruction_following_critique_metrics.py +3 -1
- helm/benchmark/metrics/language_modeling_metrics.py +99 -0
- helm/benchmark/metrics/machine_translation_metrics.py +5 -5
- helm/benchmark/metrics/metric.py +93 -172
- helm/benchmark/metrics/metric_name.py +0 -1
- helm/benchmark/metrics/metric_service.py +16 -0
- helm/benchmark/metrics/paraphrase_generation_metrics.py +3 -4
- helm/benchmark/metrics/ranking_metrics.py +6 -7
- helm/benchmark/metrics/reference_metric.py +148 -0
- helm/benchmark/metrics/summac/model_summac.py +0 -2
- helm/benchmark/metrics/summarization_metrics.py +8 -8
- helm/benchmark/metrics/test_classification_metrics.py +9 -6
- helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
- helm/benchmark/metrics/test_evaluate_reference_metrics.py +30 -0
- helm/benchmark/metrics/test_metric.py +2 -2
- helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +13 -3
- helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +2 -0
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +9 -2
- helm/benchmark/metrics/toxicity_metrics.py +1 -1
- helm/benchmark/metrics/toxicity_utils.py +23 -0
- helm/benchmark/metrics/unitxt_metrics.py +81 -0
- helm/benchmark/metrics/vision_language/__init__.py +0 -0
- helm/benchmark/metrics/vision_language/emd_utils.py +341 -0
- helm/benchmark/metrics/vision_language/image_metrics.py +450 -0
- helm/benchmark/metrics/vision_language/image_utils.py +100 -0
- helm/benchmark/model_deployment_registry.py +164 -41
- helm/benchmark/model_metadata_registry.py +181 -35
- helm/benchmark/multi_gpu_runner.py +133 -0
- helm/benchmark/presentation/contamination.py +3 -3
- helm/benchmark/presentation/create_plots.py +8 -7
- helm/benchmark/presentation/run_display.py +50 -17
- helm/benchmark/presentation/schema.py +28 -46
- helm/benchmark/presentation/summarize.py +213 -96
- helm/benchmark/presentation/table.py +8 -8
- helm/benchmark/presentation/test_contamination.py +2 -2
- helm/benchmark/presentation/test_run_entry.py +14 -9
- helm/benchmark/presentation/test_summarize.py +5 -0
- helm/benchmark/run.py +66 -54
- helm/benchmark/run_expander.py +342 -31
- helm/benchmark/run_spec.py +93 -0
- helm/benchmark/run_spec_factory.py +162 -0
- helm/benchmark/run_specs/__init__.py +0 -0
- helm/benchmark/{run_specs.py → run_specs/classic_run_specs.py} +217 -1330
- helm/benchmark/run_specs/cleva_run_specs.py +277 -0
- helm/benchmark/run_specs/decodingtrust_run_specs.py +314 -0
- helm/benchmark/run_specs/heim_run_specs.py +623 -0
- helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
- helm/benchmark/run_specs/lite_run_specs.py +307 -0
- helm/benchmark/run_specs/simple_run_specs.py +104 -0
- helm/benchmark/run_specs/unitxt_run_specs.py +42 -0
- helm/benchmark/run_specs/vlm_run_specs.py +501 -0
- helm/benchmark/runner.py +116 -69
- helm/benchmark/runner_config_registry.py +21 -0
- helm/benchmark/scenarios/bbq_scenario.py +1 -1
- helm/benchmark/scenarios/bold_scenario.py +2 -2
- helm/benchmark/scenarios/cleva_scenario.py +43 -46
- helm/benchmark/scenarios/code_scenario.py +3 -2
- helm/benchmark/scenarios/commonsense_scenario.py +171 -191
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +169 -0
- helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +121 -0
- helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +77 -0
- helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +324 -0
- helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +204 -0
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +559 -0
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +67 -0
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +78 -0
- helm/benchmark/scenarios/dialogue_scenarios.py +0 -1
- helm/benchmark/scenarios/entity_matching_scenario.py +1 -1
- helm/benchmark/scenarios/image_generation/__init__.py +0 -0
- helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
- helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
- helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
- helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
- helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
- helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
- helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
- helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
- helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
- helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
- helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
- helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
- helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
- helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
- helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
- helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
- helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
- helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
- helm/benchmark/scenarios/imdb_scenario.py +0 -1
- helm/benchmark/scenarios/legalbench_scenario.py +123 -0
- helm/benchmark/scenarios/live_qa_scenario.py +94 -0
- helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
- helm/benchmark/scenarios/lsat_qa_scenario.py +4 -2
- helm/benchmark/scenarios/math_scenario.py +19 -2
- helm/benchmark/scenarios/medication_qa_scenario.py +60 -0
- helm/benchmark/scenarios/numeracy_scenario.py +3 -3
- helm/benchmark/scenarios/opinions_qa_scenario.py +6 -10
- helm/benchmark/scenarios/raft_scenario.py +2 -6
- helm/benchmark/scenarios/scenario.py +14 -2
- helm/benchmark/scenarios/simple_scenarios.py +122 -1
- helm/benchmark/scenarios/test_math_scenario.py +22 -0
- helm/benchmark/scenarios/test_scenario.py +6 -3
- helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +135 -0
- helm/benchmark/scenarios/the_pile_scenario.py +6 -7
- helm/benchmark/scenarios/unitxt_scenario.py +56 -0
- helm/benchmark/scenarios/verifiability_judgment_scenario.py +3 -1
- helm/benchmark/scenarios/vicuna_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
- helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +92 -0
- helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
- helm/benchmark/scenarios/vision_language/image2structure/__init__.py +0 -0
- helm/benchmark/scenarios/vision_language/image2structure/chart2csv_scenario.py +55 -0
- helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +214 -0
- helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +25 -0
- helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +20 -0
- helm/benchmark/scenarios/vision_language/image2structure/utils_latex.py +347 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/driver.py +84 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/jekyll_server.py +182 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/utils.py +31 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +225 -0
- helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
- helm/benchmark/scenarios/vision_language/mme_scenario.py +145 -0
- helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
- helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
- helm/benchmark/scenarios/vision_language/pope_scenario.py +104 -0
- helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +129 -0
- helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
- helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +107 -0
- helm/benchmark/scenarios/vision_language/vqa_scenario.py +1 -1
- helm/benchmark/scenarios/wmt_14_scenario.py +18 -18
- helm/benchmark/server.py +59 -2
- helm/benchmark/slurm_jobs.py +12 -0
- helm/benchmark/slurm_runner.py +79 -51
- helm/benchmark/static/benchmarking.js +3 -4
- helm/benchmark/static/contamination.yaml +1 -1
- helm/benchmark/static/images/organizations/together.png +0 -0
- helm/benchmark/static/json-urls.js +4 -0
- helm/benchmark/static/{schema.yaml → schema_classic.yaml} +346 -930
- helm/benchmark/static/schema_instruction_following.yaml +210 -0
- helm/benchmark/static/schema_lite.yaml +824 -0
- helm/benchmark/static/schema_mmlu.yaml +1507 -0
- helm/benchmark/static/schema_unitxt.yaml +428 -0
- helm/benchmark/static/schema_vlm.yaml +576 -0
- helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
- helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
- helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
- helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
- helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
- helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
- helm/benchmark/static_build/assets/crfm-logo-74391ab8.png +0 -0
- helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
- helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
- helm/benchmark/static_build/assets/heim-logo-3e5e3aa4.png +0 -0
- helm/benchmark/static_build/assets/helm-logo-simple-2ed5400b.png +0 -0
- helm/benchmark/static_build/assets/helmhero-28e90f4d.png +0 -0
- helm/benchmark/static_build/assets/index-5088afcb.css +1 -0
- helm/benchmark/static_build/assets/index-d839df55.js +9 -0
- helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
- helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
- helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
- helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
- helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
- helm/benchmark/static_build/assets/react-d4a0b69b.js +85 -0
- helm/benchmark/static_build/assets/recharts-6d337683.js +97 -0
- helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
- helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
- helm/benchmark/static_build/assets/tremor-54a99cc4.js +10 -0
- helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
- helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
- helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
- helm/benchmark/static_build/config.js +4 -0
- helm/benchmark/static_build/index.html +20 -0
- helm/benchmark/test_data_preprocessor.py +3 -3
- helm/benchmark/test_model_deployment_definition.py +90 -0
- helm/benchmark/test_run_expander.py +1 -1
- helm/benchmark/tokenizer_config_registry.py +10 -14
- helm/benchmark/window_services/ai21_window_service.py +22 -33
- helm/benchmark/window_services/cohere_window_service.py +1 -63
- helm/benchmark/window_services/default_window_service.py +2 -35
- helm/benchmark/window_services/encoder_decoder_window_service.py +0 -11
- helm/benchmark/window_services/ice_window_service.py +0 -34
- helm/benchmark/window_services/image_generation/__init__.py +0 -0
- helm/benchmark/window_services/image_generation/clip_window_service.py +15 -0
- helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
- helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
- helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
- helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
- helm/benchmark/window_services/local_window_service.py +21 -4
- helm/benchmark/window_services/no_decoding_window_service.py +32 -0
- helm/benchmark/window_services/test_anthropic_window_service.py +2 -1
- helm/benchmark/window_services/test_bloom_window_service.py +2 -1
- helm/benchmark/window_services/test_cohere_window_service.py +2 -1
- helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
- helm/benchmark/window_services/test_gpt2_window_service.py +2 -2
- helm/benchmark/window_services/test_gpt4_window_service.py +2 -1
- helm/benchmark/window_services/test_gptj_window_service.py +3 -2
- helm/benchmark/window_services/test_gptneox_window_service.py +3 -2
- helm/benchmark/window_services/test_ice_window_service.py +2 -1
- helm/benchmark/window_services/test_openai_window_service.py +2 -1
- helm/benchmark/window_services/test_opt_window_service.py +3 -2
- helm/benchmark/window_services/test_palmyra_window_service.py +2 -1
- helm/benchmark/window_services/test_t0pp_window_service.py +2 -1
- helm/benchmark/window_services/test_t511b_window_service.py +2 -1
- helm/benchmark/window_services/test_ul2_window_service.py +2 -1
- helm/benchmark/window_services/test_utils.py +3 -2
- helm/benchmark/window_services/test_yalm_window_service.py +2 -1
- helm/benchmark/window_services/window_service.py +42 -0
- helm/benchmark/window_services/window_service_factory.py +24 -269
- helm/benchmark/window_services/yalm_window_service.py +0 -27
- helm/clients/__init__.py +0 -0
- helm/{proxy/clients → clients}/ai21_client.py +5 -12
- helm/clients/aleph_alpha_client.py +112 -0
- helm/{proxy/clients → clients}/anthropic_client.py +213 -24
- helm/clients/auto_client.py +215 -0
- helm/clients/bedrock_client.py +128 -0
- helm/clients/bedrock_utils.py +72 -0
- helm/{proxy/clients → clients}/client.py +67 -55
- helm/clients/clip_score_client.py +49 -0
- helm/clients/clip_scorers/__init__.py +0 -0
- helm/clients/clip_scorers/base_clip_scorer.py +18 -0
- helm/clients/clip_scorers/clip_scorer.py +50 -0
- helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
- helm/{proxy/clients → clients}/cohere_client.py +6 -17
- helm/clients/gcs_client.py +82 -0
- helm/{proxy/clients → clients}/google_client.py +7 -8
- helm/clients/google_translate_client.py +35 -0
- helm/{proxy/clients → clients}/http_model_client.py +6 -10
- helm/{proxy/clients → clients}/huggingface_client.py +134 -92
- helm/clients/image_generation/__init__.py +0 -0
- helm/clients/image_generation/adobe_vision_client.py +78 -0
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +98 -0
- helm/clients/image_generation/cogview2/__init__.py +0 -0
- helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
- helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
- helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
- helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +96 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
- helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +141 -0
- helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
- helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
- helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
- helm/clients/image_generation/cogview2_client.py +191 -0
- helm/clients/image_generation/dalle2_client.py +192 -0
- helm/clients/image_generation/dalle3_client.py +108 -0
- helm/clients/image_generation/dalle_mini/__init__.py +3 -0
- helm/clients/image_generation/dalle_mini/data.py +442 -0
- helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
- helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
- helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
- helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
- helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
- helm/clients/image_generation/dalle_mini/model/text.py +251 -0
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
- helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
- helm/clients/image_generation/dalle_mini_client.py +190 -0
- helm/clients/image_generation/deep_floyd_client.py +78 -0
- helm/clients/image_generation/huggingface_diffusers_client.py +249 -0
- helm/clients/image_generation/image_generation_client_utils.py +9 -0
- helm/clients/image_generation/lexica_client.py +86 -0
- helm/clients/image_generation/mindalle/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/__init__.py +216 -0
- helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
- helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
- helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
- helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
- helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
- helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
- helm/clients/image_generation/mindalle/utils/config.py +129 -0
- helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
- helm/clients/image_generation/mindalle/utils/utils.py +89 -0
- helm/clients/image_generation/mindalle_client.py +115 -0
- helm/clients/image_generation/nudity_check_client.py +64 -0
- helm/clients/image_generation/together_image_generation_client.py +111 -0
- helm/{proxy/clients → clients}/lit_gpt_client.py +7 -5
- helm/{proxy/clients → clients}/megatron_client.py +13 -7
- helm/clients/mistral_client.py +134 -0
- helm/clients/moderation_api_client.py +109 -0
- helm/clients/open_lm_client.py +43 -0
- helm/clients/openai_client.py +302 -0
- helm/{proxy/clients → clients}/palmyra_client.py +15 -12
- helm/{proxy/clients → clients}/perspective_api_client.py +7 -8
- helm/clients/simple_client.py +64 -0
- helm/{proxy/clients → clients}/test_auto_client.py +15 -15
- helm/clients/test_client.py +100 -0
- helm/clients/test_huggingface_client.py +70 -0
- helm/clients/test_simple_client.py +19 -0
- helm/{proxy/clients → clients}/test_together_client.py +23 -12
- helm/{proxy/clients → clients}/together_client.py +18 -71
- helm/clients/vertexai_client.py +391 -0
- helm/clients/vision_language/__init__.py +0 -0
- helm/clients/vision_language/huggingface_vlm_client.py +104 -0
- helm/{proxy/clients → clients}/vision_language/idefics_client.py +59 -52
- helm/clients/vision_language/open_flamingo/__init__.py +2 -0
- helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
- helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
- helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
- helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
- helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
- helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
- helm/clients/vision_language/open_flamingo_client.py +155 -0
- helm/clients/vision_language/qwen_vlm_client.py +171 -0
- helm/clients/vllm_client.py +46 -0
- helm/common/cache.py +24 -179
- helm/common/cache_backend_config.py +47 -0
- helm/common/clip_score_request.py +41 -0
- helm/common/concurrency.py +32 -0
- helm/common/credentials_utils.py +28 -0
- helm/common/file_caches/__init__.py +0 -0
- helm/common/file_caches/file_cache.py +16 -0
- helm/common/file_caches/local_file_cache.py +61 -0
- helm/common/file_caches/test_local_file_cache.py +25 -0
- helm/common/file_upload_request.py +27 -0
- helm/common/general.py +29 -10
- helm/common/image_generation_parameters.py +25 -0
- helm/common/images_utils.py +24 -1
- helm/common/key_value_store.py +113 -0
- helm/common/media_object.py +13 -0
- helm/common/moderations_api_request.py +71 -0
- helm/common/mongo_key_value_store.py +88 -0
- helm/common/multimodal_request_utils.py +31 -0
- helm/common/nudity_check_request.py +29 -0
- helm/common/object_spec.py +2 -2
- helm/common/request.py +36 -27
- helm/common/test_general.py +6 -0
- helm/common/tokenization_request.py +6 -3
- helm/config/__init__.py +0 -0
- helm/config/model_deployments.yaml +1942 -0
- helm/config/model_metadata.yaml +2201 -0
- helm/config/tokenizer_configs.yaml +362 -0
- helm/proxy/accounts.py +31 -4
- helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
- helm/proxy/critique/model_critique_client.py +13 -5
- helm/proxy/example_queries.py +29 -17
- helm/proxy/retry.py +8 -2
- helm/proxy/server.py +77 -5
- helm/proxy/services/remote_service.py +31 -0
- helm/proxy/services/server_service.py +103 -20
- helm/proxy/services/service.py +34 -2
- helm/proxy/services/test_remote_service.py +7 -6
- helm/proxy/services/test_service.py +27 -18
- helm/proxy/test_accounts.py +32 -0
- helm/proxy/token_counters/auto_token_counter.py +37 -37
- helm/proxy/token_counters/test_auto_token_counter.py +164 -0
- helm/proxy/token_counters/token_counter.py +3 -5
- helm/py.typed +0 -0
- helm/tokenizers/__init__.py +0 -0
- helm/{proxy/tokenizers → tokenizers}/ai21_tokenizer.py +3 -3
- helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +3 -1
- helm/{proxy/tokenizers → tokenizers}/anthropic_tokenizer.py +17 -11
- helm/tokenizers/auto_tokenizer.py +93 -0
- helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +8 -2
- helm/{proxy/tokenizers → tokenizers}/cohere_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +3 -3
- helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +56 -60
- helm/tokenizers/simple_tokenizer.py +33 -0
- helm/tokenizers/test_anthropic_tokenizer.py +82 -0
- helm/tokenizers/test_huggingface_tokenizer.py +136 -0
- helm/tokenizers/test_simple_tokenizer.py +33 -0
- helm/tokenizers/vertexai_tokenizer.py +97 -0
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +5 -3
- helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
- helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
- crfm_helm-0.3.0.dist-info/RECORD +0 -396
- helm/benchmark/vlm_run_specs.py +0 -71
- helm/benchmark/window_services/anthropic_window_service.py +0 -68
- helm/benchmark/window_services/bloom_window_service.py +0 -35
- helm/benchmark/window_services/flan_t5_window_service.py +0 -29
- helm/benchmark/window_services/gpt2_window_service.py +0 -32
- helm/benchmark/window_services/gptj_window_service.py +0 -38
- helm/benchmark/window_services/gptneox_window_service.py +0 -41
- helm/benchmark/window_services/http_model_window_service.py +0 -28
- helm/benchmark/window_services/huggingface_window_service.py +0 -59
- helm/benchmark/window_services/lit_gpt_window_service.py +0 -27
- helm/benchmark/window_services/llama_window_service.py +0 -28
- helm/benchmark/window_services/luminous_window_service.py +0 -67
- helm/benchmark/window_services/megatron_window_service.py +0 -10
- helm/benchmark/window_services/mt_nlg_window_service.py +0 -27
- helm/benchmark/window_services/openai_window_service.py +0 -13
- helm/benchmark/window_services/opt_window_service.py +0 -35
- helm/benchmark/window_services/palmyra_window_service.py +0 -45
- helm/benchmark/window_services/remote_window_service.py +0 -48
- helm/benchmark/window_services/santacoder_window_service.py +0 -27
- helm/benchmark/window_services/starcoder_window_service.py +0 -27
- helm/benchmark/window_services/t0pp_window_service.py +0 -35
- helm/benchmark/window_services/t511b_window_service.py +0 -30
- helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
- helm/benchmark/window_services/ul2_window_service.py +0 -30
- helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
- helm/benchmark/window_services/wider_openai_window_service.py +0 -52
- helm/proxy/clients/aleph_alpha_client.py +0 -99
- helm/proxy/clients/auto_client.py +0 -461
- helm/proxy/clients/goose_ai_client.py +0 -100
- helm/proxy/clients/microsoft_client.py +0 -182
- helm/proxy/clients/openai_client.py +0 -206
- helm/proxy/clients/remote_model_registry.py +0 -28
- helm/proxy/clients/simple_client.py +0 -61
- helm/proxy/clients/test_anthropic_client.py +0 -63
- helm/proxy/clients/test_client.py +0 -31
- helm/proxy/clients/test_huggingface_client.py +0 -87
- helm/proxy/models.py +0 -963
- helm/proxy/test_models.py +0 -27
- helm/proxy/token_counters/ai21_token_counter.py +0 -20
- helm/proxy/token_counters/cohere_token_counter.py +0 -13
- helm/proxy/token_counters/free_token_counter.py +0 -12
- helm/proxy/token_counters/gooseai_token_counter.py +0 -24
- helm/proxy/token_counters/openai_token_counter.py +0 -22
- helm/proxy/token_counters/test_ai21_token_counter.py +0 -86
- helm/proxy/token_counters/test_openai_token_counter.py +0 -79
- helm/proxy/tokenizers/simple_tokenizer.py +0 -32
- helm/proxy/tokenizers/test_huggingface_tokenizer.py +0 -56
- {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/LICENSE +0 -0
- {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/top_level.txt +0 -0
- /helm/{proxy/clients → benchmark/annotation}/__init__.py +0 -0
- /helm/{proxy/clients/vision_language → benchmark/annotation/image2structure}/__init__.py +0 -0
- /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
- /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
- /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
- /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
- /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
- /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/ice_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/test_ice_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +0 -0
|
@@ -1,41 +1,20 @@
|
|
|
1
|
-
from
|
|
2
|
-
from helm.proxy.models import (
|
|
3
|
-
get_model,
|
|
4
|
-
get_model_names_with_tag,
|
|
5
|
-
Model,
|
|
6
|
-
AI21_WIDER_CONTEXT_WINDOW_TAG,
|
|
7
|
-
AI21_JURASSIC_2_JUMBO_CONTEXT_WINDOW_TAG,
|
|
8
|
-
WIDER_CONTEXT_WINDOW_TAG,
|
|
9
|
-
GPT_TURBO_CONTEXT_WINDOW_TAG,
|
|
10
|
-
GPT_TURBO_16K_CONTEXT_WINDOW_TAG,
|
|
11
|
-
GPT4_CONTEXT_WINDOW_TAG,
|
|
12
|
-
GPT4_32K_CONTEXT_WINDOW_TAG,
|
|
13
|
-
)
|
|
1
|
+
from typing import Optional
|
|
14
2
|
|
|
15
|
-
from helm.benchmark.
|
|
16
|
-
from helm.benchmark.
|
|
17
|
-
from helm.benchmark.window_services.remote_window_service import get_remote_window_service
|
|
3
|
+
from helm.benchmark.model_deployment_registry import ModelDeployment, WindowServiceSpec, get_model_deployment
|
|
4
|
+
from helm.benchmark.tokenizer_config_registry import TokenizerConfig, get_tokenizer_config
|
|
18
5
|
from helm.benchmark.window_services.window_service import WindowService
|
|
19
6
|
from helm.benchmark.window_services.tokenizer_service import TokenizerService
|
|
20
|
-
from helm.proxy.clients.remote_model_registry import get_remote_model
|
|
21
7
|
from helm.common.object_spec import create_object, inject_object_spec_args
|
|
22
8
|
|
|
23
9
|
|
|
24
10
|
class WindowServiceFactory:
|
|
25
11
|
@staticmethod
|
|
26
|
-
def get_window_service(
|
|
12
|
+
def get_window_service(model_deployment_name: str, service: TokenizerService) -> WindowService:
|
|
27
13
|
"""
|
|
28
14
|
Returns a `WindowService` given the name of the model.
|
|
29
15
|
Make sure this function returns instantaneously on repeated calls.
|
|
30
16
|
"""
|
|
31
|
-
|
|
32
|
-
organization: str = model.organization
|
|
33
|
-
engine: str = model.engine
|
|
34
|
-
|
|
35
|
-
window_service: WindowService
|
|
36
|
-
|
|
37
|
-
# TODO: Migrate all window services to use use model deployments
|
|
38
|
-
model_deployment = get_model_deployment(model_name)
|
|
17
|
+
model_deployment: Optional[ModelDeployment] = get_model_deployment(model_deployment_name)
|
|
39
18
|
if model_deployment:
|
|
40
19
|
# If the model deployment specifies a WindowServiceSpec, instantiate it.
|
|
41
20
|
window_service_spec: WindowServiceSpec
|
|
@@ -45,6 +24,16 @@ class WindowServiceFactory:
|
|
|
45
24
|
window_service_spec = WindowServiceSpec(
|
|
46
25
|
class_name="helm.benchmark.window_services.default_window_service.DefaultWindowService", args={}
|
|
47
26
|
)
|
|
27
|
+
|
|
28
|
+
# If provided, look up special tokens from TokenizerConfig.
|
|
29
|
+
end_of_text_token: Optional[str] = None
|
|
30
|
+
prefix_token: Optional[str] = None
|
|
31
|
+
if model_deployment.tokenizer_name:
|
|
32
|
+
tokenizer_config: Optional[TokenizerConfig] = get_tokenizer_config(model_deployment.tokenizer_name)
|
|
33
|
+
if tokenizer_config:
|
|
34
|
+
end_of_text_token = tokenizer_config.end_of_text_token
|
|
35
|
+
prefix_token = tokenizer_config.prefix_token
|
|
36
|
+
|
|
48
37
|
# Perform dependency injection to fill in remaining arguments.
|
|
49
38
|
# Dependency injection is needed here for these reasons:
|
|
50
39
|
#
|
|
@@ -54,253 +43,19 @@ class WindowServiceFactory:
|
|
|
54
43
|
# in the users configuration file. Instead, they have to be constructed dynamically at runtime.
|
|
55
44
|
window_service_spec = inject_object_spec_args(
|
|
56
45
|
window_service_spec,
|
|
57
|
-
{
|
|
46
|
+
constant_bindings={
|
|
58
47
|
"service": service,
|
|
59
48
|
"tokenizer_name": model_deployment.tokenizer_name,
|
|
60
49
|
"max_sequence_length": model_deployment.max_sequence_length,
|
|
61
50
|
"max_request_length": model_deployment.max_request_length,
|
|
51
|
+
"max_sequence_and_generated_tokens_length": model_deployment.max_sequence_and_generated_tokens_length, # noqa
|
|
52
|
+
"end_of_text_token": end_of_text_token,
|
|
53
|
+
"prefix_token": prefix_token,
|
|
54
|
+
},
|
|
55
|
+
provider_bindings={
|
|
56
|
+
"gpt2_window_service": lambda: WindowServiceFactory.get_window_service("huggingface/gpt2", service)
|
|
62
57
|
},
|
|
63
58
|
)
|
|
64
|
-
|
|
65
|
-
elif get_remote_model(model_name):
|
|
66
|
-
window_service = get_remote_window_service(service, model_name)
|
|
67
|
-
elif organization == "neurips":
|
|
68
|
-
from helm.benchmark.window_services.http_model_window_service import HTTPModelWindowServce
|
|
69
|
-
|
|
70
|
-
window_service = HTTPModelWindowServce(service)
|
|
71
|
-
elif organization == "openai":
|
|
72
|
-
from helm.benchmark.window_services.openai_window_service import OpenAIWindowService
|
|
73
|
-
from helm.benchmark.window_services.wider_openai_window_service import (
|
|
74
|
-
WiderOpenAIWindowService,
|
|
75
|
-
GPTTurboWindowService,
|
|
76
|
-
GPTTurbo16KWindowService,
|
|
77
|
-
GPT4WindowService,
|
|
78
|
-
GPT432KWindowService,
|
|
79
|
-
)
|
|
80
|
-
|
|
81
|
-
if model_name in get_model_names_with_tag(GPT4_CONTEXT_WINDOW_TAG):
|
|
82
|
-
window_service = GPT4WindowService(service)
|
|
83
|
-
elif model_name in get_model_names_with_tag(GPT4_32K_CONTEXT_WINDOW_TAG):
|
|
84
|
-
window_service = GPT432KWindowService(service)
|
|
85
|
-
if model_name in get_model_names_with_tag(GPT_TURBO_CONTEXT_WINDOW_TAG):
|
|
86
|
-
window_service = GPTTurboWindowService(service)
|
|
87
|
-
elif model_name in get_model_names_with_tag(GPT_TURBO_16K_CONTEXT_WINDOW_TAG):
|
|
88
|
-
window_service = GPTTurbo16KWindowService(service)
|
|
89
|
-
elif model_name in get_model_names_with_tag(WIDER_CONTEXT_WINDOW_TAG):
|
|
90
|
-
window_service = WiderOpenAIWindowService(service)
|
|
91
|
-
else:
|
|
92
|
-
window_service = OpenAIWindowService(service)
|
|
93
|
-
# For the Google models, we approximate with the OpenAIWindowService
|
|
94
|
-
elif organization == "simple" or organization == "google":
|
|
95
|
-
from helm.benchmark.window_services.openai_window_service import OpenAIWindowService
|
|
96
|
-
|
|
97
|
-
window_service = OpenAIWindowService(service)
|
|
98
|
-
elif organization == "AlephAlpha":
|
|
99
|
-
from helm.benchmark.window_services.luminous_window_service import (
|
|
100
|
-
LuminousBaseWindowService,
|
|
101
|
-
LuminousExtendedWindowService,
|
|
102
|
-
LuminousSupremeWindowService,
|
|
103
|
-
LuminousWorldWindowService,
|
|
104
|
-
)
|
|
105
|
-
|
|
106
|
-
if engine == "luminous-base":
|
|
107
|
-
window_service = LuminousBaseWindowService(service)
|
|
108
|
-
elif engine == "luminous-extended":
|
|
109
|
-
window_service = LuminousExtendedWindowService(service)
|
|
110
|
-
elif engine == "luminous-supreme":
|
|
111
|
-
window_service = LuminousSupremeWindowService(service)
|
|
112
|
-
elif engine == "luminous-world":
|
|
113
|
-
window_service = LuminousWorldWindowService(service)
|
|
114
|
-
else:
|
|
115
|
-
raise ValueError(f"Unhandled Aleph Alpha model: {engine}")
|
|
116
|
-
elif organization == "microsoft":
|
|
117
|
-
from helm.benchmark.window_services.mt_nlg_window_service import MTNLGWindowService
|
|
118
|
-
|
|
119
|
-
window_service = MTNLGWindowService(service)
|
|
120
|
-
elif organization == "anthropic":
|
|
121
|
-
from helm.benchmark.window_services.anthropic_window_service import (
|
|
122
|
-
AnthropicWindowService,
|
|
123
|
-
LegacyAnthropicWindowService,
|
|
124
|
-
)
|
|
125
|
-
|
|
126
|
-
if engine == "stanford-online-all-v4-s3":
|
|
127
|
-
window_service = LegacyAnthropicWindowService(service)
|
|
128
|
-
else:
|
|
129
|
-
window_service = AnthropicWindowService(service)
|
|
130
|
-
elif organization == "writer":
|
|
131
|
-
from helm.benchmark.window_services.palmyra_window_service import (
|
|
132
|
-
PalmyraWindowService,
|
|
133
|
-
LongerPalmyraWindowService,
|
|
134
|
-
)
|
|
135
|
-
|
|
136
|
-
if engine in ["palmyra-base", "palmyra-large", "palmyra-instruct-30", "palmyra-e"]:
|
|
137
|
-
window_service = PalmyraWindowService(service)
|
|
138
|
-
elif engine in ["palmyra-x", "silk-road"]:
|
|
139
|
-
window_service = LongerPalmyraWindowService(service)
|
|
140
|
-
else:
|
|
141
|
-
raise ValueError(f"Unhandled Writer model: {engine}")
|
|
142
|
-
elif engine == "santacoder":
|
|
143
|
-
from helm.benchmark.window_services.santacoder_window_service import SantaCoderWindowService
|
|
144
|
-
|
|
145
|
-
window_service = SantaCoderWindowService(service)
|
|
146
|
-
elif engine == "starcoder":
|
|
147
|
-
from helm.benchmark.window_services.starcoder_window_service import StarCoderWindowService
|
|
148
|
-
|
|
149
|
-
window_service = StarCoderWindowService(service)
|
|
150
|
-
elif model_name == "huggingface/gpt2":
|
|
151
|
-
window_service = GPT2WindowService(service)
|
|
152
|
-
elif model_name == "together/bloom":
|
|
153
|
-
from helm.benchmark.window_services.bloom_window_service import BloomWindowService
|
|
154
|
-
|
|
155
|
-
window_service = BloomWindowService(service)
|
|
156
|
-
elif model_name == "together/glm":
|
|
157
|
-
# From https://github.com/THUDM/GLM-130B, "the tokenizer is implemented based on
|
|
158
|
-
# icetk---a unified multimodal tokenizer for images, Chinese, and English."
|
|
159
|
-
from helm.benchmark.window_services.ice_window_service import ICEWindowService
|
|
160
|
-
|
|
161
|
-
window_service = ICEWindowService(service)
|
|
162
|
-
elif model_name in ["huggingface/gpt-j-6b", "together/gpt-j-6b", "together/gpt-jt-6b-v1", "gooseai/gpt-j-6b"]:
|
|
163
|
-
from helm.benchmark.window_services.gptj_window_service import GPTJWindowService
|
|
164
|
-
|
|
165
|
-
window_service = GPTJWindowService(service)
|
|
166
|
-
elif model_name in [
|
|
167
|
-
"together/gpt-neox-20b",
|
|
168
|
-
"gooseai/gpt-neo-20b",
|
|
169
|
-
"together/gpt-neoxt-chat-base-20b",
|
|
170
|
-
"together/redpajama-incite-base-3b-v1",
|
|
171
|
-
"together/redpajama-incite-instruct-3b-v1",
|
|
172
|
-
"together/redpajama-incite-base-7b",
|
|
173
|
-
"together/redpajama-incite-instruct-7b",
|
|
174
|
-
# Pythia uses the same tokenizer as GPT-NeoX-20B.
|
|
175
|
-
# See: https://huggingface.co/EleutherAI/pythia-6.9b#training-procedure
|
|
176
|
-
"eleutherai/pythia-1b-v0",
|
|
177
|
-
"eleutherai/pythia-2.8b-v0",
|
|
178
|
-
"eleutherai/pythia-6.9b",
|
|
179
|
-
"eleutherai/pythia-12b-v0",
|
|
180
|
-
# MPT-7B model was trained with the EleutherAI/gpt-neox-20b tokenizer
|
|
181
|
-
# See: https://huggingface.co/mosaicml/mpt-7b
|
|
182
|
-
"mosaicml/mpt-7b",
|
|
183
|
-
"mosaicml/mpt-instruct-7b",
|
|
184
|
-
"mosaicml/mpt-30b",
|
|
185
|
-
"mosaicml/mpt-instruct-30b",
|
|
186
|
-
# Dolly models are based on Pythia.
|
|
187
|
-
# See: https://github.com/databrickslabs/dolly
|
|
188
|
-
"databricks/dolly-v2-3b",
|
|
189
|
-
"databricks/dolly-v2-7b",
|
|
190
|
-
"databricks/dolly-v2-12b",
|
|
191
|
-
]:
|
|
192
|
-
from helm.benchmark.window_services.gptneox_window_service import GPTNeoXWindowService
|
|
193
|
-
|
|
194
|
-
window_service = GPTNeoXWindowService(service)
|
|
195
|
-
elif model_name in [
|
|
196
|
-
"tiiuae/falcon-7b",
|
|
197
|
-
"tiiuae/falcon-7b-instruct",
|
|
198
|
-
"tiiuae/falcon-40b",
|
|
199
|
-
"tiiuae/falcon-40b-instruct",
|
|
200
|
-
]:
|
|
201
|
-
window_service = HuggingFaceWindowService(service=service, tokenizer_name="tiiuae/falcon-7b")
|
|
202
|
-
elif model_name in [
|
|
203
|
-
"stabilityai/stablelm-base-alpha-3b",
|
|
204
|
-
"stabilityai/stablelm-base-alpha-7b",
|
|
205
|
-
]:
|
|
206
|
-
from helm.benchmark.window_services.gptneox_window_service import StableLMAlphaWindowService
|
|
207
|
-
|
|
208
|
-
window_service = StableLMAlphaWindowService(service)
|
|
209
|
-
elif model_name == "together/h3-2.7b":
|
|
210
|
-
window_service = GPT2WindowService(service)
|
|
211
|
-
elif model_name in [
|
|
212
|
-
"together/opt-1.3b",
|
|
213
|
-
"together/opt-6.7b",
|
|
214
|
-
"together/opt-66b",
|
|
215
|
-
"together/opt-175b",
|
|
216
|
-
]:
|
|
217
|
-
from helm.benchmark.window_services.opt_window_service import OPTWindowService
|
|
218
|
-
|
|
219
|
-
window_service = OPTWindowService(service)
|
|
220
|
-
elif model_name == "together/t0pp":
|
|
221
|
-
from helm.benchmark.window_services.t0pp_window_service import T0ppWindowService
|
|
222
|
-
|
|
223
|
-
window_service = T0ppWindowService(service)
|
|
224
|
-
elif model_name == "together/t5-11b":
|
|
225
|
-
from helm.benchmark.window_services.t511b_window_service import T511bWindowService
|
|
226
|
-
|
|
227
|
-
window_service = T511bWindowService(service)
|
|
228
|
-
elif model_name == "together/flan-t5-xxl":
|
|
229
|
-
from helm.benchmark.window_services.flan_t5_window_service import FlanT5WindowService
|
|
230
|
-
|
|
231
|
-
window_service = FlanT5WindowService(service)
|
|
232
|
-
elif model_name == "together/ul2":
|
|
233
|
-
from helm.benchmark.window_services.ul2_window_service import UL2WindowService
|
|
234
|
-
|
|
235
|
-
window_service = UL2WindowService(service)
|
|
236
|
-
elif model_name == "together/yalm":
|
|
237
|
-
from helm.benchmark.window_services.yalm_window_service import YaLMWindowService
|
|
238
|
-
|
|
239
|
-
window_service = YaLMWindowService(service)
|
|
240
|
-
elif model_name == "nvidia/megatron-gpt2":
|
|
241
|
-
from helm.benchmark.window_services.megatron_window_service import MegatronWindowService
|
|
242
|
-
|
|
243
|
-
window_service = MegatronWindowService(service)
|
|
244
|
-
elif model_name in [
|
|
245
|
-
"lmsys/vicuna-7b-v1.3",
|
|
246
|
-
"lmsys/vicuna-13b-v1.3",
|
|
247
|
-
"meta/llama-7b",
|
|
248
|
-
"meta/llama-13b",
|
|
249
|
-
"meta/llama-30b",
|
|
250
|
-
"meta/llama-65b",
|
|
251
|
-
"stanford/alpaca-7b",
|
|
252
|
-
]:
|
|
253
|
-
from helm.benchmark.window_services.llama_window_service import LlamaWindowService
|
|
254
|
-
|
|
255
|
-
window_service = LlamaWindowService(service)
|
|
256
|
-
elif model_name in [
|
|
257
|
-
"meta/llama-2-7b",
|
|
258
|
-
"meta/llama-2-13b",
|
|
259
|
-
"meta/llama-2-70b",
|
|
260
|
-
]:
|
|
261
|
-
from helm.benchmark.window_services.llama_window_service import Llama2WindowService
|
|
262
|
-
|
|
263
|
-
window_service = Llama2WindowService(service)
|
|
264
|
-
elif organization == "cohere":
|
|
265
|
-
from helm.benchmark.window_services.cohere_window_service import (
|
|
266
|
-
CohereWindowService,
|
|
267
|
-
CohereCommandWindowService,
|
|
268
|
-
)
|
|
269
|
-
|
|
270
|
-
if "command" in engine:
|
|
271
|
-
window_service = CohereCommandWindowService(service)
|
|
272
|
-
else:
|
|
273
|
-
window_service = CohereWindowService(service)
|
|
274
|
-
elif organization == "ai21":
|
|
275
|
-
from helm.benchmark.window_services.wider_ai21_window_service import (
|
|
276
|
-
WiderAI21WindowService,
|
|
277
|
-
AI21Jurassic2JumboWindowService,
|
|
278
|
-
)
|
|
279
|
-
from helm.benchmark.window_services.ai21_window_service import AI21WindowService
|
|
280
|
-
|
|
281
|
-
if model_name in get_model_names_with_tag(AI21_WIDER_CONTEXT_WINDOW_TAG):
|
|
282
|
-
window_service = WiderAI21WindowService(service=service, gpt2_window_service=GPT2WindowService(service))
|
|
283
|
-
if model_name in get_model_names_with_tag(AI21_JURASSIC_2_JUMBO_CONTEXT_WINDOW_TAG):
|
|
284
|
-
window_service = AI21Jurassic2JumboWindowService(
|
|
285
|
-
service=service, gpt2_window_service=GPT2WindowService(service)
|
|
286
|
-
)
|
|
287
|
-
else:
|
|
288
|
-
window_service = AI21WindowService(service=service, gpt2_window_service=GPT2WindowService(service))
|
|
289
|
-
|
|
290
|
-
elif organization == "lightningai":
|
|
291
|
-
from helm.benchmark.window_services.lit_gpt_window_service import LitGPTWindowServce
|
|
292
|
-
|
|
293
|
-
window_service = LitGPTWindowServce(service)
|
|
294
|
-
elif organization == "mistralai":
|
|
295
|
-
window_service = HuggingFaceWindowService(service, tokenizer_name="mistralai/Mistral-7B-v0.1")
|
|
296
|
-
elif model_name in [
|
|
297
|
-
"HuggingFaceM4/idefics-9b",
|
|
298
|
-
"HuggingFaceM4/idefics-9b-instruct",
|
|
299
|
-
"HuggingFaceM4/idefics-80b",
|
|
300
|
-
"HuggingFaceM4/idefics-80b-instruct",
|
|
301
|
-
]:
|
|
302
|
-
window_service = HuggingFaceWindowService(service, model_name)
|
|
303
|
-
else:
|
|
304
|
-
raise ValueError(f"Unhandled model name: {model_name}")
|
|
59
|
+
return create_object(window_service_spec)
|
|
305
60
|
|
|
306
|
-
|
|
61
|
+
raise ValueError(f"Unhandled model deployment name: {model_deployment_name}")
|
|
@@ -1,34 +1,7 @@
|
|
|
1
|
-
from helm.proxy.tokenizers.yalm_tokenizer_data.yalm_tokenizer import YaLMTokenizer
|
|
2
1
|
from .local_window_service import LocalWindowService
|
|
3
|
-
from .tokenizer_service import TokenizerService
|
|
4
2
|
|
|
5
3
|
|
|
6
4
|
class YaLMWindowService(LocalWindowService):
|
|
7
|
-
def __init__(self, service: TokenizerService):
|
|
8
|
-
super().__init__(service)
|
|
9
|
-
|
|
10
|
-
@property
|
|
11
|
-
def tokenizer_name(self) -> str:
|
|
12
|
-
return "Yandex/yalm"
|
|
13
|
-
|
|
14
|
-
@property
|
|
15
|
-
def max_sequence_length(self) -> int:
|
|
16
|
-
return YaLMTokenizer.MAX_SEQUENCE_LENGTH
|
|
17
|
-
|
|
18
|
-
@property
|
|
19
|
-
def max_request_length(self) -> int:
|
|
20
|
-
return self.max_sequence_length + 1
|
|
21
|
-
|
|
22
|
-
@property
|
|
23
|
-
def end_of_text_token(self) -> str:
|
|
24
|
-
"""The end of text token."""
|
|
25
|
-
return YaLMTokenizer.EOS_TOKEN
|
|
26
|
-
|
|
27
|
-
@property
|
|
28
|
-
def prefix_token(self) -> str:
|
|
29
|
-
"""The prefix token"""
|
|
30
|
-
return self.end_of_text_token
|
|
31
|
-
|
|
32
5
|
def truncate_from_right(self, text: str, expected_completion_token_length: int = 0) -> str:
|
|
33
6
|
"""
|
|
34
7
|
Truncates text from the right to fit within the context window given by `max_request_length`
|
helm/clients/__init__.py
ADDED
|
File without changes
|
|
@@ -7,10 +7,9 @@ from helm.common.request import (
|
|
|
7
7
|
EMBEDDING_UNAVAILABLE_REQUEST_RESULT,
|
|
8
8
|
Request,
|
|
9
9
|
RequestResult,
|
|
10
|
-
|
|
10
|
+
GeneratedOutput,
|
|
11
11
|
Token,
|
|
12
12
|
)
|
|
13
|
-
from helm.proxy.tokenizers.tokenizer import Tokenizer
|
|
14
13
|
from .client import CachingClient, truncate_sequence, cleanup_str
|
|
15
14
|
from .ai21_utils import AI21RequestError, handle_failed_request
|
|
16
15
|
|
|
@@ -24,8 +23,8 @@ class AI21Client(CachingClient):
|
|
|
24
23
|
COMPLETION_URL_TEMPLATE: str = "https://api.ai21.com/studio/v1/{model}/complete"
|
|
25
24
|
EXPERIMENTAL_COMPLETION_URL_TEMPLATE: str = "https://api.ai21.com/studio/v1/experimental/{model}/complete"
|
|
26
25
|
|
|
27
|
-
def __init__(self, api_key: str,
|
|
28
|
-
super().__init__(cache_config=cache_config
|
|
26
|
+
def __init__(self, api_key: str, cache_config: CacheConfig, url: Optional[str] = None):
|
|
27
|
+
super().__init__(cache_config=cache_config)
|
|
29
28
|
self.api_key = api_key
|
|
30
29
|
self.url = url
|
|
31
30
|
|
|
@@ -98,25 +97,19 @@ class AI21Client(CachingClient):
|
|
|
98
97
|
# Compute the actual length of the token text
|
|
99
98
|
# e.g. "▁burying"(0,8) -> 8 - 0 = 8; "▁burying"(0,7) -> 7 - 0 = 7
|
|
100
99
|
text_length: int = raw["textRange"]["end"] - raw["textRange"]["start"]
|
|
101
|
-
# "topTokens" can be None when sending a request with topKReturn=0
|
|
102
|
-
# AI21 sends unscaled logprobs as `raw_logprob` so use this instead of `logprob`.
|
|
103
|
-
top_logprobs: Dict[str, float] = dict(
|
|
104
|
-
(fix_text(x["token"], first), x["raw_logprob"]) for x in raw["topTokens"] or []
|
|
105
|
-
)
|
|
106
100
|
|
|
107
101
|
return Token(
|
|
108
102
|
# Text should not be longer than text_length. Since "▁" is always inserted
|
|
109
103
|
# in the beginning, we truncate the text from the right.
|
|
110
104
|
text=fix_text(raw["generatedToken"]["token"], first)[-text_length:] if text_length else "",
|
|
111
105
|
logprob=raw["generatedToken"]["raw_logprob"],
|
|
112
|
-
top_logprobs=top_logprobs,
|
|
113
106
|
)
|
|
114
107
|
|
|
115
|
-
def parse_sequence(raw: Dict, first: bool, finish_reason: Optional[Dict] = None) ->
|
|
108
|
+
def parse_sequence(raw: Dict, first: bool, finish_reason: Optional[Dict] = None) -> GeneratedOutput:
|
|
116
109
|
text = raw["text"]
|
|
117
110
|
tokens = [parse_token(token, first and i == 0) for i, token in enumerate(raw["tokens"])]
|
|
118
111
|
logprob = sum(token.logprob for token in tokens)
|
|
119
|
-
return
|
|
112
|
+
return GeneratedOutput(text=text, logprob=logprob, tokens=tokens, finish_reason=finish_reason)
|
|
120
113
|
|
|
121
114
|
prompt = parse_sequence(response["prompt"], True)
|
|
122
115
|
completions = []
|
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
from typing import List
|
|
2
|
+
|
|
3
|
+
from helm.common.cache import CacheConfig
|
|
4
|
+
from helm.common.media_object import TEXT_TYPE
|
|
5
|
+
from helm.common.optional_dependencies import handle_module_not_found_error
|
|
6
|
+
from helm.common.request import wrap_request_time, Request, RequestResult, GeneratedOutput, Token
|
|
7
|
+
from .client import CachingClient, truncate_sequence, generate_uid_for_multimodal_prompt
|
|
8
|
+
|
|
9
|
+
try:
|
|
10
|
+
from aleph_alpha_client import Client, CompletionRequest, CompletionResponse, Image, Prompt
|
|
11
|
+
except ModuleNotFoundError as e:
|
|
12
|
+
handle_module_not_found_error(e, ["aleph-alpha"])
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class AlephAlphaClient(CachingClient):
|
|
16
|
+
def __init__(self, api_key: str, cache_config: CacheConfig):
|
|
17
|
+
super().__init__(cache_config=cache_config)
|
|
18
|
+
self._api_key: str = api_key
|
|
19
|
+
self._aleph_alpha_client = Client(token=self._api_key) if self._api_key else None
|
|
20
|
+
|
|
21
|
+
def make_request(self, request: Request) -> RequestResult:
|
|
22
|
+
"""Make a request following https://docs.aleph-alpha.com/api/complete."""
|
|
23
|
+
assert self._aleph_alpha_client is not None
|
|
24
|
+
|
|
25
|
+
model: str = request.model_engine
|
|
26
|
+
prompt: Prompt
|
|
27
|
+
|
|
28
|
+
# The prompt key is a unique identifier for the prompt
|
|
29
|
+
prompt_key: str = request.prompt
|
|
30
|
+
|
|
31
|
+
# Contents can either be text or a list of multimodal content made up of text, images or other content
|
|
32
|
+
if request.multimodal_prompt is not None:
|
|
33
|
+
from helm.common.images_utils import encode_base64
|
|
34
|
+
|
|
35
|
+
items = []
|
|
36
|
+
for media_object in request.multimodal_prompt.media_objects:
|
|
37
|
+
if media_object.is_type("image") and media_object.location:
|
|
38
|
+
items.append(Image(base_64=encode_base64(media_object.location), cropping=None, controls=[]))
|
|
39
|
+
elif media_object.is_type(TEXT_TYPE):
|
|
40
|
+
if media_object.text is None:
|
|
41
|
+
raise ValueError("MediaObject of text type has missing text field value")
|
|
42
|
+
items.append(media_object.text)
|
|
43
|
+
else:
|
|
44
|
+
raise ValueError(f"Unrecognized MediaObject type {media_object.type}")
|
|
45
|
+
|
|
46
|
+
prompt = Prompt(items=items)
|
|
47
|
+
prompt_key = generate_uid_for_multimodal_prompt(request.multimodal_prompt)
|
|
48
|
+
else:
|
|
49
|
+
prompt = Prompt.from_text(request.prompt)
|
|
50
|
+
|
|
51
|
+
parameters = {
|
|
52
|
+
"maximum_tokens": request.max_tokens,
|
|
53
|
+
"temperature": request.temperature,
|
|
54
|
+
"top_k": request.top_k_per_token,
|
|
55
|
+
"top_p": request.top_p,
|
|
56
|
+
"presence_penalty": request.presence_penalty,
|
|
57
|
+
"frequency_penalty": request.frequency_penalty,
|
|
58
|
+
"n": request.num_completions,
|
|
59
|
+
"stop_sequences": request.stop_sequences,
|
|
60
|
+
"log_probs": request.top_k_per_token,
|
|
61
|
+
"echo": request.echo_prompt,
|
|
62
|
+
"tokens": True, # Setting to True returns individual tokens of the completion
|
|
63
|
+
}
|
|
64
|
+
|
|
65
|
+
try:
|
|
66
|
+
|
|
67
|
+
def do_it():
|
|
68
|
+
assert self._aleph_alpha_client is not None
|
|
69
|
+
completion_response: CompletionResponse = self._aleph_alpha_client.complete(
|
|
70
|
+
request=CompletionRequest(prompt=prompt, **parameters), model=model
|
|
71
|
+
)
|
|
72
|
+
result = dict(completion_response.to_json())
|
|
73
|
+
assert "completions" in result, f"Invalid response: {result}"
|
|
74
|
+
return result
|
|
75
|
+
|
|
76
|
+
cache_key = CachingClient.make_cache_key({"model": model, "prompt": prompt_key, **parameters}, request)
|
|
77
|
+
response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
|
|
78
|
+
except Exception as e:
|
|
79
|
+
error: str = f"AlephAlphaClient error: {e}"
|
|
80
|
+
return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
|
|
81
|
+
|
|
82
|
+
completions: List[GeneratedOutput] = []
|
|
83
|
+
for completion in response["completions"]:
|
|
84
|
+
sequence_logprob: float = 0
|
|
85
|
+
tokens: List[Token] = []
|
|
86
|
+
|
|
87
|
+
# `completion_tokens` is the list of selected tokens.
|
|
88
|
+
for i, token in enumerate(completion.get("completion_tokens", [])):
|
|
89
|
+
# Use the selected token value to get the logprob
|
|
90
|
+
logprob: float = completion["log_probs"][i][token]
|
|
91
|
+
sequence_logprob += logprob
|
|
92
|
+
tokens.append(
|
|
93
|
+
Token(
|
|
94
|
+
text=token,
|
|
95
|
+
logprob=logprob,
|
|
96
|
+
)
|
|
97
|
+
)
|
|
98
|
+
|
|
99
|
+
sequence: GeneratedOutput = GeneratedOutput(
|
|
100
|
+
text=completion["completion"], logprob=sequence_logprob, tokens=tokens
|
|
101
|
+
)
|
|
102
|
+
sequence = truncate_sequence(sequence, request)
|
|
103
|
+
completions.append(sequence)
|
|
104
|
+
|
|
105
|
+
return RequestResult(
|
|
106
|
+
success=True,
|
|
107
|
+
cached=cached,
|
|
108
|
+
request_time=response["request_time"],
|
|
109
|
+
request_datetime=response["request_datetime"],
|
|
110
|
+
completions=completions,
|
|
111
|
+
embedding=[],
|
|
112
|
+
)
|