crfm-helm 0.3.0__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/METADATA +144 -36
- crfm_helm-0.5.0.dist-info/RECORD +642 -0
- {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +37 -2
- helm/benchmark/adaptation/adapters/adapter.py +4 -42
- helm/benchmark/adaptation/adapters/adapter_factory.py +24 -27
- helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/generation_adapter.py +2 -0
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +21 -4
- helm/benchmark/adaptation/adapters/language_modeling_adapter.py +12 -5
- helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +104 -0
- helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +5 -1
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/test_adapter.py +2 -1
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +59 -14
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +40 -5
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +78 -10
- helm/benchmark/adaptation/common_adapter_specs.py +376 -0
- helm/benchmark/adaptation/prompt.py +7 -1
- helm/benchmark/adaptation/request_state.py +6 -1
- helm/benchmark/adaptation/scenario_state.py +6 -2
- helm/benchmark/annotation/annotator.py +43 -0
- helm/benchmark/annotation/annotator_factory.py +61 -0
- helm/benchmark/annotation/image2structure/image_compiler_annotator.py +88 -0
- helm/benchmark/annotation/image2structure/latex_compiler_annotator.py +59 -0
- helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +84 -0
- helm/benchmark/annotation/image2structure/webpage_compiler_annotator.py +132 -0
- helm/benchmark/annotation/test_annotator_factory.py +26 -0
- helm/benchmark/annotation/test_dummy_annotator.py +44 -0
- helm/benchmark/annotation_executor.py +124 -0
- helm/benchmark/augmentations/cleva_perturbation.py +7 -14
- helm/benchmark/augmentations/contraction_expansion_perturbation.py +3 -3
- helm/benchmark/augmentations/contrast_sets_perturbation.py +0 -3
- helm/benchmark/augmentations/data_augmenter.py +0 -2
- helm/benchmark/augmentations/dialect_perturbation.py +2 -2
- helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
- helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
- helm/benchmark/augmentations/gender_perturbation.py +3 -3
- helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
- helm/benchmark/augmentations/mild_mix_perturbation.py +2 -2
- helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
- helm/benchmark/augmentations/person_name_perturbation.py +0 -7
- helm/benchmark/augmentations/perturbation.py +20 -7
- helm/benchmark/augmentations/perturbation_description.py +1 -1
- helm/benchmark/augmentations/space_perturbation.py +2 -2
- helm/benchmark/augmentations/suffix_perturbation.py +29 -0
- helm/benchmark/augmentations/synonym_perturbation.py +2 -2
- helm/benchmark/augmentations/test_perturbation.py +11 -7
- helm/benchmark/augmentations/translate_perturbation.py +30 -0
- helm/benchmark/augmentations/typos_perturbation.py +2 -2
- helm/benchmark/config_registry.py +38 -0
- helm/benchmark/executor.py +46 -16
- helm/benchmark/huggingface_registration.py +37 -7
- helm/benchmark/metrics/basic_metrics.py +172 -641
- helm/benchmark/metrics/bbq_metrics.py +3 -4
- helm/benchmark/metrics/bias_metrics.py +6 -6
- helm/benchmark/metrics/classification_metrics.py +11 -8
- helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
- helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
- helm/benchmark/metrics/code_metrics.py +4 -3
- helm/benchmark/metrics/code_metrics_helper.py +0 -2
- helm/benchmark/metrics/common_metric_specs.py +167 -0
- helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
- helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
- helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
- helm/benchmark/metrics/disinformation_metrics.py +6 -112
- helm/benchmark/metrics/dry_run_metrics.py +5 -3
- helm/benchmark/metrics/efficiency_metrics.py +206 -0
- helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
- helm/benchmark/metrics/evaluate_reference_metrics.py +376 -0
- helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
- helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
- helm/benchmark/metrics/image_generation/clip_score_metrics.py +73 -0
- helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
- helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
- helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
- helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
- helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
- helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
- helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
- helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
- helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
- helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
- helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
- helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
- helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
- helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
- helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
- helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
- helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
- helm/benchmark/metrics/image_generation/q16/test_q16.py +18 -0
- helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
- helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
- helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
- helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
- helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
- helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
- helm/benchmark/metrics/instruction_following_critique_metrics.py +3 -1
- helm/benchmark/metrics/language_modeling_metrics.py +99 -0
- helm/benchmark/metrics/machine_translation_metrics.py +5 -5
- helm/benchmark/metrics/metric.py +93 -172
- helm/benchmark/metrics/metric_name.py +0 -1
- helm/benchmark/metrics/metric_service.py +16 -0
- helm/benchmark/metrics/paraphrase_generation_metrics.py +3 -4
- helm/benchmark/metrics/ranking_metrics.py +6 -7
- helm/benchmark/metrics/reference_metric.py +148 -0
- helm/benchmark/metrics/summac/model_summac.py +0 -2
- helm/benchmark/metrics/summarization_metrics.py +8 -8
- helm/benchmark/metrics/test_classification_metrics.py +9 -6
- helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
- helm/benchmark/metrics/test_evaluate_reference_metrics.py +30 -0
- helm/benchmark/metrics/test_metric.py +2 -2
- helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +13 -3
- helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +2 -0
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +9 -2
- helm/benchmark/metrics/toxicity_metrics.py +1 -1
- helm/benchmark/metrics/toxicity_utils.py +23 -0
- helm/benchmark/metrics/unitxt_metrics.py +81 -0
- helm/benchmark/metrics/vision_language/__init__.py +0 -0
- helm/benchmark/metrics/vision_language/emd_utils.py +341 -0
- helm/benchmark/metrics/vision_language/image_metrics.py +450 -0
- helm/benchmark/metrics/vision_language/image_utils.py +100 -0
- helm/benchmark/model_deployment_registry.py +164 -41
- helm/benchmark/model_metadata_registry.py +181 -35
- helm/benchmark/multi_gpu_runner.py +133 -0
- helm/benchmark/presentation/contamination.py +3 -3
- helm/benchmark/presentation/create_plots.py +8 -7
- helm/benchmark/presentation/run_display.py +50 -17
- helm/benchmark/presentation/schema.py +28 -46
- helm/benchmark/presentation/summarize.py +213 -96
- helm/benchmark/presentation/table.py +8 -8
- helm/benchmark/presentation/test_contamination.py +2 -2
- helm/benchmark/presentation/test_run_entry.py +14 -9
- helm/benchmark/presentation/test_summarize.py +5 -0
- helm/benchmark/run.py +66 -54
- helm/benchmark/run_expander.py +342 -31
- helm/benchmark/run_spec.py +93 -0
- helm/benchmark/run_spec_factory.py +162 -0
- helm/benchmark/run_specs/__init__.py +0 -0
- helm/benchmark/{run_specs.py → run_specs/classic_run_specs.py} +217 -1330
- helm/benchmark/run_specs/cleva_run_specs.py +277 -0
- helm/benchmark/run_specs/decodingtrust_run_specs.py +314 -0
- helm/benchmark/run_specs/heim_run_specs.py +623 -0
- helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
- helm/benchmark/run_specs/lite_run_specs.py +307 -0
- helm/benchmark/run_specs/simple_run_specs.py +104 -0
- helm/benchmark/run_specs/unitxt_run_specs.py +42 -0
- helm/benchmark/run_specs/vlm_run_specs.py +501 -0
- helm/benchmark/runner.py +116 -69
- helm/benchmark/runner_config_registry.py +21 -0
- helm/benchmark/scenarios/bbq_scenario.py +1 -1
- helm/benchmark/scenarios/bold_scenario.py +2 -2
- helm/benchmark/scenarios/cleva_scenario.py +43 -46
- helm/benchmark/scenarios/code_scenario.py +3 -2
- helm/benchmark/scenarios/commonsense_scenario.py +171 -191
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +169 -0
- helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +121 -0
- helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +77 -0
- helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +324 -0
- helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +204 -0
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +559 -0
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +67 -0
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +78 -0
- helm/benchmark/scenarios/dialogue_scenarios.py +0 -1
- helm/benchmark/scenarios/entity_matching_scenario.py +1 -1
- helm/benchmark/scenarios/image_generation/__init__.py +0 -0
- helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
- helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
- helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
- helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
- helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
- helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
- helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
- helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
- helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
- helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
- helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
- helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
- helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
- helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
- helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
- helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
- helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
- helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
- helm/benchmark/scenarios/imdb_scenario.py +0 -1
- helm/benchmark/scenarios/legalbench_scenario.py +123 -0
- helm/benchmark/scenarios/live_qa_scenario.py +94 -0
- helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
- helm/benchmark/scenarios/lsat_qa_scenario.py +4 -2
- helm/benchmark/scenarios/math_scenario.py +19 -2
- helm/benchmark/scenarios/medication_qa_scenario.py +60 -0
- helm/benchmark/scenarios/numeracy_scenario.py +3 -3
- helm/benchmark/scenarios/opinions_qa_scenario.py +6 -10
- helm/benchmark/scenarios/raft_scenario.py +2 -6
- helm/benchmark/scenarios/scenario.py +14 -2
- helm/benchmark/scenarios/simple_scenarios.py +122 -1
- helm/benchmark/scenarios/test_math_scenario.py +22 -0
- helm/benchmark/scenarios/test_scenario.py +6 -3
- helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +135 -0
- helm/benchmark/scenarios/the_pile_scenario.py +6 -7
- helm/benchmark/scenarios/unitxt_scenario.py +56 -0
- helm/benchmark/scenarios/verifiability_judgment_scenario.py +3 -1
- helm/benchmark/scenarios/vicuna_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
- helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +92 -0
- helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
- helm/benchmark/scenarios/vision_language/image2structure/__init__.py +0 -0
- helm/benchmark/scenarios/vision_language/image2structure/chart2csv_scenario.py +55 -0
- helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +214 -0
- helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +25 -0
- helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +20 -0
- helm/benchmark/scenarios/vision_language/image2structure/utils_latex.py +347 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/driver.py +84 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/jekyll_server.py +182 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/utils.py +31 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +225 -0
- helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
- helm/benchmark/scenarios/vision_language/mme_scenario.py +145 -0
- helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
- helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
- helm/benchmark/scenarios/vision_language/pope_scenario.py +104 -0
- helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +129 -0
- helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
- helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +107 -0
- helm/benchmark/scenarios/vision_language/vqa_scenario.py +1 -1
- helm/benchmark/scenarios/wmt_14_scenario.py +18 -18
- helm/benchmark/server.py +59 -2
- helm/benchmark/slurm_jobs.py +12 -0
- helm/benchmark/slurm_runner.py +79 -51
- helm/benchmark/static/benchmarking.js +3 -4
- helm/benchmark/static/contamination.yaml +1 -1
- helm/benchmark/static/images/organizations/together.png +0 -0
- helm/benchmark/static/json-urls.js +4 -0
- helm/benchmark/static/{schema.yaml → schema_classic.yaml} +346 -930
- helm/benchmark/static/schema_instruction_following.yaml +210 -0
- helm/benchmark/static/schema_lite.yaml +824 -0
- helm/benchmark/static/schema_mmlu.yaml +1507 -0
- helm/benchmark/static/schema_unitxt.yaml +428 -0
- helm/benchmark/static/schema_vlm.yaml +576 -0
- helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
- helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
- helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
- helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
- helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
- helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
- helm/benchmark/static_build/assets/crfm-logo-74391ab8.png +0 -0
- helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
- helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
- helm/benchmark/static_build/assets/heim-logo-3e5e3aa4.png +0 -0
- helm/benchmark/static_build/assets/helm-logo-simple-2ed5400b.png +0 -0
- helm/benchmark/static_build/assets/helmhero-28e90f4d.png +0 -0
- helm/benchmark/static_build/assets/index-5088afcb.css +1 -0
- helm/benchmark/static_build/assets/index-d839df55.js +9 -0
- helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
- helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
- helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
- helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
- helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
- helm/benchmark/static_build/assets/react-d4a0b69b.js +85 -0
- helm/benchmark/static_build/assets/recharts-6d337683.js +97 -0
- helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
- helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
- helm/benchmark/static_build/assets/tremor-54a99cc4.js +10 -0
- helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
- helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
- helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
- helm/benchmark/static_build/config.js +4 -0
- helm/benchmark/static_build/index.html +20 -0
- helm/benchmark/test_data_preprocessor.py +3 -3
- helm/benchmark/test_model_deployment_definition.py +90 -0
- helm/benchmark/test_run_expander.py +1 -1
- helm/benchmark/tokenizer_config_registry.py +10 -14
- helm/benchmark/window_services/ai21_window_service.py +22 -33
- helm/benchmark/window_services/cohere_window_service.py +1 -63
- helm/benchmark/window_services/default_window_service.py +2 -35
- helm/benchmark/window_services/encoder_decoder_window_service.py +0 -11
- helm/benchmark/window_services/ice_window_service.py +0 -34
- helm/benchmark/window_services/image_generation/__init__.py +0 -0
- helm/benchmark/window_services/image_generation/clip_window_service.py +15 -0
- helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
- helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
- helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
- helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
- helm/benchmark/window_services/local_window_service.py +21 -4
- helm/benchmark/window_services/no_decoding_window_service.py +32 -0
- helm/benchmark/window_services/test_anthropic_window_service.py +2 -1
- helm/benchmark/window_services/test_bloom_window_service.py +2 -1
- helm/benchmark/window_services/test_cohere_window_service.py +2 -1
- helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
- helm/benchmark/window_services/test_gpt2_window_service.py +2 -2
- helm/benchmark/window_services/test_gpt4_window_service.py +2 -1
- helm/benchmark/window_services/test_gptj_window_service.py +3 -2
- helm/benchmark/window_services/test_gptneox_window_service.py +3 -2
- helm/benchmark/window_services/test_ice_window_service.py +2 -1
- helm/benchmark/window_services/test_openai_window_service.py +2 -1
- helm/benchmark/window_services/test_opt_window_service.py +3 -2
- helm/benchmark/window_services/test_palmyra_window_service.py +2 -1
- helm/benchmark/window_services/test_t0pp_window_service.py +2 -1
- helm/benchmark/window_services/test_t511b_window_service.py +2 -1
- helm/benchmark/window_services/test_ul2_window_service.py +2 -1
- helm/benchmark/window_services/test_utils.py +3 -2
- helm/benchmark/window_services/test_yalm_window_service.py +2 -1
- helm/benchmark/window_services/window_service.py +42 -0
- helm/benchmark/window_services/window_service_factory.py +24 -269
- helm/benchmark/window_services/yalm_window_service.py +0 -27
- helm/clients/__init__.py +0 -0
- helm/{proxy/clients → clients}/ai21_client.py +5 -12
- helm/clients/aleph_alpha_client.py +112 -0
- helm/{proxy/clients → clients}/anthropic_client.py +213 -24
- helm/clients/auto_client.py +215 -0
- helm/clients/bedrock_client.py +128 -0
- helm/clients/bedrock_utils.py +72 -0
- helm/{proxy/clients → clients}/client.py +67 -55
- helm/clients/clip_score_client.py +49 -0
- helm/clients/clip_scorers/__init__.py +0 -0
- helm/clients/clip_scorers/base_clip_scorer.py +18 -0
- helm/clients/clip_scorers/clip_scorer.py +50 -0
- helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
- helm/{proxy/clients → clients}/cohere_client.py +6 -17
- helm/clients/gcs_client.py +82 -0
- helm/{proxy/clients → clients}/google_client.py +7 -8
- helm/clients/google_translate_client.py +35 -0
- helm/{proxy/clients → clients}/http_model_client.py +6 -10
- helm/{proxy/clients → clients}/huggingface_client.py +134 -92
- helm/clients/image_generation/__init__.py +0 -0
- helm/clients/image_generation/adobe_vision_client.py +78 -0
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +98 -0
- helm/clients/image_generation/cogview2/__init__.py +0 -0
- helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
- helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
- helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
- helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +96 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
- helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +141 -0
- helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
- helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
- helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
- helm/clients/image_generation/cogview2_client.py +191 -0
- helm/clients/image_generation/dalle2_client.py +192 -0
- helm/clients/image_generation/dalle3_client.py +108 -0
- helm/clients/image_generation/dalle_mini/__init__.py +3 -0
- helm/clients/image_generation/dalle_mini/data.py +442 -0
- helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
- helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
- helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
- helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
- helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
- helm/clients/image_generation/dalle_mini/model/text.py +251 -0
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
- helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
- helm/clients/image_generation/dalle_mini_client.py +190 -0
- helm/clients/image_generation/deep_floyd_client.py +78 -0
- helm/clients/image_generation/huggingface_diffusers_client.py +249 -0
- helm/clients/image_generation/image_generation_client_utils.py +9 -0
- helm/clients/image_generation/lexica_client.py +86 -0
- helm/clients/image_generation/mindalle/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/__init__.py +216 -0
- helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
- helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
- helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
- helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
- helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
- helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
- helm/clients/image_generation/mindalle/utils/config.py +129 -0
- helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
- helm/clients/image_generation/mindalle/utils/utils.py +89 -0
- helm/clients/image_generation/mindalle_client.py +115 -0
- helm/clients/image_generation/nudity_check_client.py +64 -0
- helm/clients/image_generation/together_image_generation_client.py +111 -0
- helm/{proxy/clients → clients}/lit_gpt_client.py +7 -5
- helm/{proxy/clients → clients}/megatron_client.py +13 -7
- helm/clients/mistral_client.py +134 -0
- helm/clients/moderation_api_client.py +109 -0
- helm/clients/open_lm_client.py +43 -0
- helm/clients/openai_client.py +302 -0
- helm/{proxy/clients → clients}/palmyra_client.py +15 -12
- helm/{proxy/clients → clients}/perspective_api_client.py +7 -8
- helm/clients/simple_client.py +64 -0
- helm/{proxy/clients → clients}/test_auto_client.py +15 -15
- helm/clients/test_client.py +100 -0
- helm/clients/test_huggingface_client.py +70 -0
- helm/clients/test_simple_client.py +19 -0
- helm/{proxy/clients → clients}/test_together_client.py +23 -12
- helm/{proxy/clients → clients}/together_client.py +18 -71
- helm/clients/vertexai_client.py +391 -0
- helm/clients/vision_language/__init__.py +0 -0
- helm/clients/vision_language/huggingface_vlm_client.py +104 -0
- helm/{proxy/clients → clients}/vision_language/idefics_client.py +59 -52
- helm/clients/vision_language/open_flamingo/__init__.py +2 -0
- helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
- helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
- helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
- helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
- helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
- helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
- helm/clients/vision_language/open_flamingo_client.py +155 -0
- helm/clients/vision_language/qwen_vlm_client.py +171 -0
- helm/clients/vllm_client.py +46 -0
- helm/common/cache.py +24 -179
- helm/common/cache_backend_config.py +47 -0
- helm/common/clip_score_request.py +41 -0
- helm/common/concurrency.py +32 -0
- helm/common/credentials_utils.py +28 -0
- helm/common/file_caches/__init__.py +0 -0
- helm/common/file_caches/file_cache.py +16 -0
- helm/common/file_caches/local_file_cache.py +61 -0
- helm/common/file_caches/test_local_file_cache.py +25 -0
- helm/common/file_upload_request.py +27 -0
- helm/common/general.py +29 -10
- helm/common/image_generation_parameters.py +25 -0
- helm/common/images_utils.py +24 -1
- helm/common/key_value_store.py +113 -0
- helm/common/media_object.py +13 -0
- helm/common/moderations_api_request.py +71 -0
- helm/common/mongo_key_value_store.py +88 -0
- helm/common/multimodal_request_utils.py +31 -0
- helm/common/nudity_check_request.py +29 -0
- helm/common/object_spec.py +2 -2
- helm/common/request.py +36 -27
- helm/common/test_general.py +6 -0
- helm/common/tokenization_request.py +6 -3
- helm/config/__init__.py +0 -0
- helm/config/model_deployments.yaml +1942 -0
- helm/config/model_metadata.yaml +2201 -0
- helm/config/tokenizer_configs.yaml +362 -0
- helm/proxy/accounts.py +31 -4
- helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
- helm/proxy/critique/model_critique_client.py +13 -5
- helm/proxy/example_queries.py +29 -17
- helm/proxy/retry.py +8 -2
- helm/proxy/server.py +77 -5
- helm/proxy/services/remote_service.py +31 -0
- helm/proxy/services/server_service.py +103 -20
- helm/proxy/services/service.py +34 -2
- helm/proxy/services/test_remote_service.py +7 -6
- helm/proxy/services/test_service.py +27 -18
- helm/proxy/test_accounts.py +32 -0
- helm/proxy/token_counters/auto_token_counter.py +37 -37
- helm/proxy/token_counters/test_auto_token_counter.py +164 -0
- helm/proxy/token_counters/token_counter.py +3 -5
- helm/py.typed +0 -0
- helm/tokenizers/__init__.py +0 -0
- helm/{proxy/tokenizers → tokenizers}/ai21_tokenizer.py +3 -3
- helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +3 -1
- helm/{proxy/tokenizers → tokenizers}/anthropic_tokenizer.py +17 -11
- helm/tokenizers/auto_tokenizer.py +93 -0
- helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +8 -2
- helm/{proxy/tokenizers → tokenizers}/cohere_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +3 -3
- helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +56 -60
- helm/tokenizers/simple_tokenizer.py +33 -0
- helm/tokenizers/test_anthropic_tokenizer.py +82 -0
- helm/tokenizers/test_huggingface_tokenizer.py +136 -0
- helm/tokenizers/test_simple_tokenizer.py +33 -0
- helm/tokenizers/vertexai_tokenizer.py +97 -0
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +5 -3
- helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
- helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
- crfm_helm-0.3.0.dist-info/RECORD +0 -396
- helm/benchmark/vlm_run_specs.py +0 -71
- helm/benchmark/window_services/anthropic_window_service.py +0 -68
- helm/benchmark/window_services/bloom_window_service.py +0 -35
- helm/benchmark/window_services/flan_t5_window_service.py +0 -29
- helm/benchmark/window_services/gpt2_window_service.py +0 -32
- helm/benchmark/window_services/gptj_window_service.py +0 -38
- helm/benchmark/window_services/gptneox_window_service.py +0 -41
- helm/benchmark/window_services/http_model_window_service.py +0 -28
- helm/benchmark/window_services/huggingface_window_service.py +0 -59
- helm/benchmark/window_services/lit_gpt_window_service.py +0 -27
- helm/benchmark/window_services/llama_window_service.py +0 -28
- helm/benchmark/window_services/luminous_window_service.py +0 -67
- helm/benchmark/window_services/megatron_window_service.py +0 -10
- helm/benchmark/window_services/mt_nlg_window_service.py +0 -27
- helm/benchmark/window_services/openai_window_service.py +0 -13
- helm/benchmark/window_services/opt_window_service.py +0 -35
- helm/benchmark/window_services/palmyra_window_service.py +0 -45
- helm/benchmark/window_services/remote_window_service.py +0 -48
- helm/benchmark/window_services/santacoder_window_service.py +0 -27
- helm/benchmark/window_services/starcoder_window_service.py +0 -27
- helm/benchmark/window_services/t0pp_window_service.py +0 -35
- helm/benchmark/window_services/t511b_window_service.py +0 -30
- helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
- helm/benchmark/window_services/ul2_window_service.py +0 -30
- helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
- helm/benchmark/window_services/wider_openai_window_service.py +0 -52
- helm/proxy/clients/aleph_alpha_client.py +0 -99
- helm/proxy/clients/auto_client.py +0 -461
- helm/proxy/clients/goose_ai_client.py +0 -100
- helm/proxy/clients/microsoft_client.py +0 -182
- helm/proxy/clients/openai_client.py +0 -206
- helm/proxy/clients/remote_model_registry.py +0 -28
- helm/proxy/clients/simple_client.py +0 -61
- helm/proxy/clients/test_anthropic_client.py +0 -63
- helm/proxy/clients/test_client.py +0 -31
- helm/proxy/clients/test_huggingface_client.py +0 -87
- helm/proxy/models.py +0 -963
- helm/proxy/test_models.py +0 -27
- helm/proxy/token_counters/ai21_token_counter.py +0 -20
- helm/proxy/token_counters/cohere_token_counter.py +0 -13
- helm/proxy/token_counters/free_token_counter.py +0 -12
- helm/proxy/token_counters/gooseai_token_counter.py +0 -24
- helm/proxy/token_counters/openai_token_counter.py +0 -22
- helm/proxy/token_counters/test_ai21_token_counter.py +0 -86
- helm/proxy/token_counters/test_openai_token_counter.py +0 -79
- helm/proxy/tokenizers/simple_tokenizer.py +0 -32
- helm/proxy/tokenizers/test_huggingface_tokenizer.py +0 -56
- {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/LICENSE +0 -0
- {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/top_level.txt +0 -0
- /helm/{proxy/clients → benchmark/annotation}/__init__.py +0 -0
- /helm/{proxy/clients/vision_language → benchmark/annotation/image2structure}/__init__.py +0 -0
- /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
- /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
- /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
- /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
- /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
- /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/ice_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/test_ice_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +0 -0
helm/proxy/test_models.py
DELETED
|
@@ -1,27 +0,0 @@
|
|
|
1
|
-
from .models import get_model, get_model_group, get_models_by_organization, get_all_code_models, Model
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
def test_get_model():
|
|
5
|
-
model: Model = get_model("ai21/j1-jumbo")
|
|
6
|
-
assert model.organization == "ai21"
|
|
7
|
-
assert model.engine == "j1-jumbo"
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
def test_get_model_with_invalid_model_name():
|
|
11
|
-
try:
|
|
12
|
-
get_model("invalid/model")
|
|
13
|
-
assert False, "Expected to throw ValueError"
|
|
14
|
-
except ValueError:
|
|
15
|
-
pass
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
def test_get_model_group():
|
|
19
|
-
assert get_model_group("openai/text-curie-001") == "gpt3"
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
def test_get_models_by_organization():
|
|
23
|
-
assert get_models_by_organization("simple") == ["simple/model1"]
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
def test_all_code_models():
|
|
27
|
-
assert "openai/code-davinci-002" in get_all_code_models()
|
|
@@ -1,20 +0,0 @@
|
|
|
1
|
-
from typing import List
|
|
2
|
-
|
|
3
|
-
from helm.common.request import Request, Sequence
|
|
4
|
-
from .token_counter import TokenCounter
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
class AI21TokenCounter(TokenCounter):
|
|
8
|
-
def count_tokens(self, request: Request, completions: List[Sequence]) -> int:
|
|
9
|
-
"""
|
|
10
|
-
Counts the number of generated tokens and NOT the number of tokens in the prompt
|
|
11
|
-
(https://studio.ai21.com/docs/calculating-usage).
|
|
12
|
-
|
|
13
|
-
The AI21 documentation (https://studio.ai21.com/docs/calculating-usage/) defines
|
|
14
|
-
generated tokens as:
|
|
15
|
-
"the total number of all completion tokens you generate. For example, assume you post
|
|
16
|
-
a complete request for J1-Jumbo with a prompt consisting of 10 tokens and requiring 3
|
|
17
|
-
completions, i.e. numResults = 3, and the model generates completions with 5, 15, and
|
|
18
|
-
20 tokens. In total this request will consume 5+15+20=40 generated tokens."
|
|
19
|
-
"""
|
|
20
|
-
return sum(len(sequence.tokens) for sequence in completions)
|
|
@@ -1,13 +0,0 @@
|
|
|
1
|
-
from typing import List
|
|
2
|
-
|
|
3
|
-
from helm.common.request import Request, Sequence
|
|
4
|
-
from .token_counter import TokenCounter
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
class CohereTokenCounter(TokenCounter):
|
|
8
|
-
def count_tokens(self, request: Request, completions: List[Sequence]) -> int:
|
|
9
|
-
"""
|
|
10
|
-
Counts the number of generated tokens.
|
|
11
|
-
TODO: Cohere simply counts the number of generations, but we currently only support counting tokens.
|
|
12
|
-
"""
|
|
13
|
-
return sum(len(sequence.tokens) for sequence in completions)
|
|
@@ -1,12 +0,0 @@
|
|
|
1
|
-
from typing import List
|
|
2
|
-
|
|
3
|
-
from helm.common.request import Request, Sequence
|
|
4
|
-
from .token_counter import TokenCounter
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
class FreeTokenCounter(TokenCounter):
|
|
8
|
-
"""For when we don't care about keeping track of the number of tokens."""
|
|
9
|
-
|
|
10
|
-
def count_tokens(self, request: Request, completions: List[Sequence]) -> int:
|
|
11
|
-
"""No need to count tokens, since it's free. Return 0."""
|
|
12
|
-
return 0
|
|
@@ -1,24 +0,0 @@
|
|
|
1
|
-
from typing import List
|
|
2
|
-
|
|
3
|
-
from helm.common.request import Request, Sequence
|
|
4
|
-
from .token_counter import TokenCounter
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
class GooseAITokenCounter(TokenCounter):
|
|
8
|
-
# From https://goose.ai/pricing: "the base price includes your first 25 tokens
|
|
9
|
-
# generated, and you can scale beyond that on a per-token basis."
|
|
10
|
-
BASE_PRICE_TOKENS: int = 25
|
|
11
|
-
|
|
12
|
-
@staticmethod
|
|
13
|
-
def account_for_base_tokens(num_tokens: int):
|
|
14
|
-
"""Subtracts the number of tokens included in the base price."""
|
|
15
|
-
return max(num_tokens - GooseAITokenCounter.BASE_PRICE_TOKENS, 0)
|
|
16
|
-
|
|
17
|
-
def count_tokens(self, request: Request, completions: List[Sequence]) -> int:
|
|
18
|
-
"""
|
|
19
|
-
Counts the number of generated tokens and NOT the number of tokens in the prompt.
|
|
20
|
-
From https://goose.ai/pricing: "by charging only for output, you have control since
|
|
21
|
-
you can configure the maximum number of tokens generated per API call
|
|
22
|
-
(up to 2,048 tokens)."
|
|
23
|
-
"""
|
|
24
|
-
return GooseAITokenCounter.account_for_base_tokens(sum(len(sequence.tokens) for sequence in completions))
|
|
@@ -1,22 +0,0 @@
|
|
|
1
|
-
from typing import List
|
|
2
|
-
|
|
3
|
-
from helm.common.request import Request, Sequence
|
|
4
|
-
from helm.common.tokenization_request import TokenizationRequest, TokenizationRequestResult
|
|
5
|
-
from helm.proxy.clients.huggingface_client import HuggingFaceClient
|
|
6
|
-
from .token_counter import TokenCounter
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
class OpenAITokenCounter(TokenCounter):
|
|
10
|
-
def __init__(self, huggingface_client: HuggingFaceClient):
|
|
11
|
-
self.huggingface_client: HuggingFaceClient = huggingface_client
|
|
12
|
-
|
|
13
|
-
def count_tokens(self, request: Request, completions: List[Sequence]) -> int:
|
|
14
|
-
"""
|
|
15
|
-
Counts the total number of tokens using the suggestion here:
|
|
16
|
-
https://community.openai.com/t/how-do-i-calculate-the-pricing-for-generation-of-text/11662/5
|
|
17
|
-
"""
|
|
18
|
-
tokenized_prompt: TokenizationRequestResult = self.huggingface_client.tokenize(
|
|
19
|
-
TokenizationRequest(request.prompt)
|
|
20
|
-
)
|
|
21
|
-
# Number of tokens in the prompt + number of tokens in all the completions
|
|
22
|
-
return len(tokenized_prompt.tokens) + sum([len(sequence.tokens) for sequence in completions])
|
|
@@ -1,86 +0,0 @@
|
|
|
1
|
-
from typing import List
|
|
2
|
-
|
|
3
|
-
from helm.common.request import Request, Sequence, Token
|
|
4
|
-
from .ai21_token_counter import AI21TokenCounter
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
class TestAI21TokenCounter:
|
|
8
|
-
def setup_method(self, method):
|
|
9
|
-
self.token_counter = AI21TokenCounter()
|
|
10
|
-
|
|
11
|
-
def test_count_tokens(self):
|
|
12
|
-
request = Request(
|
|
13
|
-
prompt="The Center for Research on Foundation Models (CRFM) is "
|
|
14
|
-
"an interdisciplinary initiative born out of the Stanford "
|
|
15
|
-
"Institute for Human-Centered Artificial Intelligence (HAI) "
|
|
16
|
-
"that aims to make fundamental advances in the study, development, "
|
|
17
|
-
"and deployment of foundation models."
|
|
18
|
-
)
|
|
19
|
-
completions: List[Sequence] = [
|
|
20
|
-
Sequence(
|
|
21
|
-
text="\n\nFoundation models are an artificial intelligence paradigm that emphasizes: (1) reasoning "
|
|
22
|
-
"about symbolic, structured knowledge, (2) learning to perform tasks from observation, ",
|
|
23
|
-
logprob=-66.24831021729915,
|
|
24
|
-
tokens=[
|
|
25
|
-
Token(text="\n", logprob=-1.034758448600769, top_logprobs={"\n": -1.034758448600769}),
|
|
26
|
-
Token(text="\n", logprob=-2.325321674346924, top_logprobs={" Foundation": -1.2628217935562134}),
|
|
27
|
-
Token(
|
|
28
|
-
text=" Foundation",
|
|
29
|
-
logprob=-1.2575088739395142,
|
|
30
|
-
top_logprobs={" Foundation": -1.2575088739395142},
|
|
31
|
-
),
|
|
32
|
-
Token(
|
|
33
|
-
text=" models are",
|
|
34
|
-
logprob=-0.9496442079544067,
|
|
35
|
-
top_logprobs={" models are": -0.9496442079544067},
|
|
36
|
-
),
|
|
37
|
-
Token(text=" an", logprob=-5.785625457763672, top_logprobs={" a class of": -2.762187957763672}),
|
|
38
|
-
Token(
|
|
39
|
-
text=" artificial intelligence",
|
|
40
|
-
logprob=-2.073937177658081,
|
|
41
|
-
top_logprobs={" increasingly popular": -1.714562177658081},
|
|
42
|
-
),
|
|
43
|
-
Token(text=" paradigm", logprob=-2.855092763900757, top_logprobs={" ": -1.2613427639007568}),
|
|
44
|
-
Token(text=" that", logprob=-1.380897879600525, top_logprobs={" that": -1.380897879600525}),
|
|
45
|
-
Token(
|
|
46
|
-
text=" emphasizes",
|
|
47
|
-
logprob=-4.230795860290527,
|
|
48
|
-
top_logprobs={" attempts to": -3.5276708602905273},
|
|
49
|
-
),
|
|
50
|
-
Token(text=":", logprob=-5.380178451538086, top_logprobs={" reasoning": -2.192678689956665}),
|
|
51
|
-
Token(text=" ", logprob=-1.1408348083496094, top_logprobs={"\n": -0.6095848083496094}),
|
|
52
|
-
Token(text="(", logprob=-0.41460439562797546, top_logprobs={"(": -0.41460439562797546}),
|
|
53
|
-
Token(text="1", logprob=-0.5666008591651917, top_logprobs={"1": -0.5666008591651917}),
|
|
54
|
-
Token(text=")", logprob=-0.001801535952836275, top_logprobs={")": -0.001801535952836275}),
|
|
55
|
-
Token(text=" reasoning", logprob=-3.4144058227539062, top_logprobs={" the": -2.3987808227539062}),
|
|
56
|
-
Token(text=" about", logprob=-1.3604949712753296, top_logprobs={" about": -1.3604949712753296}),
|
|
57
|
-
Token(text=" symbolic", logprob=-7.108627796173096, top_logprobs={" and": -2.5617527961730957}),
|
|
58
|
-
Token(text=",", logprob=-2.8421378135681152, top_logprobs={" knowledge": -1.6233878135681152}),
|
|
59
|
-
Token(
|
|
60
|
-
text=" structured",
|
|
61
|
-
logprob=-2.6082611083984375,
|
|
62
|
-
top_logprobs={" structured": -2.6082611083984375},
|
|
63
|
-
),
|
|
64
|
-
Token(text=" knowledge", logprob=-0.91008061170578, top_logprobs={" knowledge": -0.91008061170578}),
|
|
65
|
-
Token(text=",", logprob=-1.0750247240066528, top_logprobs={",": -1.0750247240066528}),
|
|
66
|
-
Token(text=" ", logprob=-0.5834965705871582, top_logprobs={" ": -0.5834965705871582}),
|
|
67
|
-
Token(text="(", logprob=-0.0004963834653608501, top_logprobs={"(": -0.0004963834653608501}),
|
|
68
|
-
Token(text="2", logprob=-0.0009141556802205741, top_logprobs={"2": -0.0009141556802205741}),
|
|
69
|
-
Token(text=")", logprob=-5.686121585313231e-05, top_logprobs={")": -5.686121585313231e-05}),
|
|
70
|
-
Token(text=" learning", logprob=-2.123058319091797, top_logprobs={" learning": -2.123058319091797}),
|
|
71
|
-
Token(
|
|
72
|
-
text=" to perform", logprob=-5.197870254516602, top_logprobs={" through": -1.7916204929351807}
|
|
73
|
-
),
|
|
74
|
-
Token(text=" tasks", logprob=-1.5782833099365234, top_logprobs={" complex": -1.5470333099365234}),
|
|
75
|
-
Token(text=" from", logprob=-1.1503676176071167, top_logprobs={" from": -1.1503676176071167}),
|
|
76
|
-
Token(
|
|
77
|
-
text=" observation", logprob=-4.8489789962768555, top_logprobs={" human": -2.5989787578582764}
|
|
78
|
-
),
|
|
79
|
-
Token(text=",", logprob=-0.7239797711372375, top_logprobs={",": -0.7239797711372375}),
|
|
80
|
-
Token(text=" ", logprob=-1.3241727352142334, top_logprobs={" and": -0.5897976756095886}),
|
|
81
|
-
],
|
|
82
|
-
)
|
|
83
|
-
]
|
|
84
|
-
|
|
85
|
-
# Verified against https://studio.ai21.com/playground.
|
|
86
|
-
assert self.token_counter.count_tokens(request, completions) == 32
|
|
@@ -1,79 +0,0 @@
|
|
|
1
|
-
import os
|
|
2
|
-
import tempfile
|
|
3
|
-
from typing import List
|
|
4
|
-
|
|
5
|
-
from helm.common.cache import SqliteCacheConfig
|
|
6
|
-
from helm.common.request import Request, Sequence, Token
|
|
7
|
-
from helm.proxy.clients.huggingface_client import HuggingFaceClient
|
|
8
|
-
from helm.proxy.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer
|
|
9
|
-
from .openai_token_counter import OpenAITokenCounter
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
class TestOpenAITokenCounter:
|
|
13
|
-
# The following prompt has 51 tokens according to the GPT-2 tokenizer
|
|
14
|
-
TEST_PROMPT: str = (
|
|
15
|
-
"The Center for Research on Foundation Models (CRFM) is "
|
|
16
|
-
"an interdisciplinary initiative born out of the Stanford "
|
|
17
|
-
"Institute for Human-Centered Artificial Intelligence (HAI) "
|
|
18
|
-
"that aims to make fundamental advances in the study, development, "
|
|
19
|
-
"and deployment of foundation models."
|
|
20
|
-
)
|
|
21
|
-
|
|
22
|
-
def setup_method(self, method):
|
|
23
|
-
self.cache_path: str = tempfile.NamedTemporaryFile(delete=False).name
|
|
24
|
-
self.token_counter = OpenAITokenCounter(
|
|
25
|
-
HuggingFaceClient(
|
|
26
|
-
tokenizer=HuggingFaceTokenizer(cache_config=SqliteCacheConfig(self.cache_path)),
|
|
27
|
-
cache_config=SqliteCacheConfig(self.cache_path),
|
|
28
|
-
)
|
|
29
|
-
)
|
|
30
|
-
|
|
31
|
-
def teardown_method(self, method):
|
|
32
|
-
os.remove(self.cache_path)
|
|
33
|
-
|
|
34
|
-
def test_count_tokens(self):
|
|
35
|
-
request = Request(prompt=TestOpenAITokenCounter.TEST_PROMPT)
|
|
36
|
-
completions: List[Sequence] = [
|
|
37
|
-
Sequence(
|
|
38
|
-
text=" The CRFM is dedicated to advancing our knowledge of the foundations of artificial intelligence "
|
|
39
|
-
"(AI) and related fields. It focuses on foundational questions in AI, which are",
|
|
40
|
-
logprob=-49.00783279519999,
|
|
41
|
-
tokens=[
|
|
42
|
-
Token(text=" The", logprob=-1.8096403, top_logprobs={"\n": -1.6654028}),
|
|
43
|
-
Token(text=" CR", logprob=-1.2861944, top_logprobs={" CR": -1.2861944}),
|
|
44
|
-
Token(text="FM", logprob=-0.0032369632, top_logprobs={"FM": -0.0032369632}),
|
|
45
|
-
Token(text=" is", logprob=-1.4355252, top_logprobs={" is": -1.4355252}),
|
|
46
|
-
Token(text=" dedicated", logprob=-3.814422, top_logprobs={" a": -1.8003343}),
|
|
47
|
-
Token(text=" to", logprob=-0.009623392, top_logprobs={" to": -0.009623392}),
|
|
48
|
-
Token(text=" advancing", logprob=-2.6732886, top_logprobs={" the": -1.859751}),
|
|
49
|
-
Token(text=" our", logprob=-3.123714, top_logprobs={" the": -1.0504603}),
|
|
50
|
-
Token(text=" knowledge", logprob=-3.030337, top_logprobs={" understanding": -0.34646907}),
|
|
51
|
-
Token(text=" of", logprob=-0.46280858, top_logprobs={" of": -0.46280858}),
|
|
52
|
-
Token(text=" the", logprob=-1.4058315, top_logprobs={" the": -1.4058315}),
|
|
53
|
-
Token(text=" foundations", logprob=-2.0638132, top_logprobs={" foundations": -2.0638132}),
|
|
54
|
-
Token(text=" of", logprob=-0.2607486, top_logprobs={" of": -0.2607486}),
|
|
55
|
-
Token(text=" artificial", logprob=-1.1653417, top_logprobs={" artificial": -1.1653417}),
|
|
56
|
-
Token(text=" intelligence", logprob=-0.03756146, top_logprobs={" intelligence": -0.03756146}),
|
|
57
|
-
Token(text=" (", logprob=-2.019812, top_logprobs={",": -1.3503861}),
|
|
58
|
-
Token(text="AI", logprob=-0.03869382, top_logprobs={"AI": -0.03869382}),
|
|
59
|
-
Token(text=")", logprob=-0.49895737, top_logprobs={")": -0.49895737}),
|
|
60
|
-
Token(text=" and", logprob=-0.81909865, top_logprobs={" and": -0.81909865}),
|
|
61
|
-
Token(text=" related", logprob=-2.611718, top_logprobs={" to": -2.3555496}),
|
|
62
|
-
Token(text=" fields", logprob=-0.7640527, top_logprobs={" fields": -0.7640527}),
|
|
63
|
-
Token(text=".", logprob=-1.8066244, top_logprobs={",": -1.2972366}),
|
|
64
|
-
Token(text=" It", logprob=-2.2856107, top_logprobs={"\n": -1.7865871}),
|
|
65
|
-
Token(text=" focuses", logprob=-3.3735154, top_logprobs={" is": -1.4982711}),
|
|
66
|
-
Token(text=" on", logprob=-0.13244776, top_logprobs={" on": -0.13244776}),
|
|
67
|
-
Token(text=" foundational", logprob=-1.2640914, top_logprobs={" foundational": -1.2640914}),
|
|
68
|
-
Token(text=" questions", logprob=-2.010647, top_logprobs={" issues": -1.673752}),
|
|
69
|
-
Token(text=" in", logprob=-1.980726, top_logprobs={" that": -1.8133409}),
|
|
70
|
-
Token(text=" AI", logprob=-0.5709368, top_logprobs={" AI": -0.5709368}),
|
|
71
|
-
Token(text=",", logprob=-1.036094, top_logprobs={",": -1.036094}),
|
|
72
|
-
Token(text=" which", logprob=-3.826836, top_logprobs={" such": -2.0843854}),
|
|
73
|
-
Token(text=" are", logprob=-1.3858839, top_logprobs={" are": -1.3858839}),
|
|
74
|
-
],
|
|
75
|
-
)
|
|
76
|
-
]
|
|
77
|
-
|
|
78
|
-
# Verified against https://beta.openai.com/tokenizer. Prompt + completions = 51 + 32.
|
|
79
|
-
assert self.token_counter.count_tokens(request, completions) == 51 + 32
|
|
@@ -1,32 +0,0 @@
|
|
|
1
|
-
from typing import List
|
|
2
|
-
|
|
3
|
-
from helm.common.tokenization_request import (
|
|
4
|
-
DecodeRequest,
|
|
5
|
-
DecodeRequestResult,
|
|
6
|
-
TokenizationRequest,
|
|
7
|
-
TokenizationRequestResult,
|
|
8
|
-
TokenizationToken,
|
|
9
|
-
)
|
|
10
|
-
from .tokenizer import Tokenizer
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
class SimpleTokenizer(Tokenizer):
|
|
14
|
-
"""Implements some "models" that just generate silly things quickly just to debug the infrastructure."""
|
|
15
|
-
|
|
16
|
-
@staticmethod
|
|
17
|
-
def tokenize_by_space(text: str) -> List[str]:
|
|
18
|
-
"""Simply tokenizes by a single white space."""
|
|
19
|
-
return text.split(" ")
|
|
20
|
-
|
|
21
|
-
def tokenize(self, request: TokenizationRequest) -> TokenizationRequestResult:
|
|
22
|
-
# TODO: Does not support encoding
|
|
23
|
-
if request.tokenizer == "simple/model1":
|
|
24
|
-
raw_tokens: List[str] = SimpleTokenizer.tokenize_by_space(request.text)
|
|
25
|
-
return TokenizationRequestResult(
|
|
26
|
-
success=True, cached=False, tokens=[TokenizationToken(text) for text in raw_tokens], text=request.text
|
|
27
|
-
)
|
|
28
|
-
else:
|
|
29
|
-
raise ValueError("Unknown model")
|
|
30
|
-
|
|
31
|
-
def decode(self, request: DecodeRequest) -> DecodeRequestResult:
|
|
32
|
-
raise NotImplementedError
|
|
@@ -1,56 +0,0 @@
|
|
|
1
|
-
from typing import Optional
|
|
2
|
-
from helm.common.general import singleton
|
|
3
|
-
from .huggingface_tokenizer import HuggingFaceTokenizer
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
class TestHuggingFaceTokenizer:
|
|
7
|
-
# The following prompt has 51 tokens according to the GPT-2 tokenizer
|
|
8
|
-
TEST_PROMPT: str = (
|
|
9
|
-
"The Center for Research on Foundation Models (CRFM) is "
|
|
10
|
-
"an interdisciplinary initiative born out of the Stanford "
|
|
11
|
-
"Institute for Human-Centered Artificial Intelligence (HAI) "
|
|
12
|
-
"that aims to make fundamental advances in the study, development, "
|
|
13
|
-
"and deployment of foundation models."
|
|
14
|
-
)
|
|
15
|
-
|
|
16
|
-
@staticmethod
|
|
17
|
-
def verify_get_tokenizer(
|
|
18
|
-
tokenizer_name: str, expected_num_tokens: int, pretrained_model_name_or_path: Optional[str] = None
|
|
19
|
-
):
|
|
20
|
-
tokenizer = HuggingFaceTokenizer.get_tokenizer(
|
|
21
|
-
helm_tokenizer_name=tokenizer_name,
|
|
22
|
-
pretrained_model_name_or_path=pretrained_model_name_or_path or tokenizer_name,
|
|
23
|
-
)
|
|
24
|
-
assert tokenizer_name in HuggingFaceTokenizer._tokenizers, "Tokenizer should be cached"
|
|
25
|
-
assert len(tokenizer.encode(TestHuggingFaceTokenizer.TEST_PROMPT)) == expected_num_tokens
|
|
26
|
-
|
|
27
|
-
def test_get_tokenizer_gpt2(self):
|
|
28
|
-
TestHuggingFaceTokenizer.verify_get_tokenizer("huggingface/gpt2", 51, pretrained_model_name_or_path="gpt2")
|
|
29
|
-
|
|
30
|
-
def test_get_tokenizer_gptj(self):
|
|
31
|
-
TestHuggingFaceTokenizer.verify_get_tokenizer("EleutherAI/gpt-j-6B", 51)
|
|
32
|
-
|
|
33
|
-
def test_get_tokenizer_gptneox(self):
|
|
34
|
-
TestHuggingFaceTokenizer.verify_get_tokenizer("EleutherAI/gpt-neox-20b", 52)
|
|
35
|
-
|
|
36
|
-
def test_get_tokenizer_bloom(self):
|
|
37
|
-
TestHuggingFaceTokenizer.verify_get_tokenizer("bigscience/bloom", 51)
|
|
38
|
-
|
|
39
|
-
def test_get_tokenizer_t0pp(self):
|
|
40
|
-
TestHuggingFaceTokenizer.verify_get_tokenizer("bigscience/T0pp", 58)
|
|
41
|
-
|
|
42
|
-
def test_get_tokenizer_t511b(self):
|
|
43
|
-
TestHuggingFaceTokenizer.verify_get_tokenizer("google/t5-11b", 58, pretrained_model_name_or_path="t5-11b")
|
|
44
|
-
|
|
45
|
-
def test_get_tokenizer_ul2(self):
|
|
46
|
-
TestHuggingFaceTokenizer.verify_get_tokenizer("google/ul2", 58)
|
|
47
|
-
|
|
48
|
-
def test_get_santacoder(self):
|
|
49
|
-
TestHuggingFaceTokenizer.verify_get_tokenizer("bigcode/santacoder", 62)
|
|
50
|
-
|
|
51
|
-
def test_gpt2_tokenize_eos(self):
|
|
52
|
-
eos_token: str = "<|endoftext|>"
|
|
53
|
-
tokenizer = HuggingFaceTokenizer.get_tokenizer("huggingface/gpt2", pretrained_model_name_or_path="gpt2")
|
|
54
|
-
token_ids = tokenizer.encode(eos_token)
|
|
55
|
-
assert singleton(token_ids) == 50256
|
|
56
|
-
assert tokenizer.decode(token_ids) == eos_token
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|