crfm-helm 0.5.4__py3-none-any.whl → 0.5.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crfm-helm might be problematic.
- crfm_helm-0.5.6.dist-info/METADATA +427 -0
- crfm_helm-0.5.6.dist-info/RECORD +941 -0
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +13 -1
- helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
- helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
- helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
- helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +4 -4
- helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
- helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/common_adapter_specs.py +69 -4
- helm/benchmark/adaptation/prompt.py +1 -1
- helm/benchmark/annotation/aci_bench_annotator.py +95 -0
- helm/benchmark/annotation/air_bench_annotator.py +21 -6
- helm/benchmark/annotation/annotator.py +5 -0
- helm/benchmark/annotation/annotator_factory.py +3 -20
- helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
- helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
- helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
- helm/benchmark/annotation/bird_sql_annotator.py +58 -0
- helm/benchmark/annotation/chw_care_plan_annotator.py +93 -0
- helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
- helm/benchmark/annotation/dischargeme_annotator.py +107 -0
- helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
- helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
- helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
- helm/benchmark/annotation/live_qa_annotator.py +1 -1
- helm/benchmark/annotation/med_dialog_annotator.py +99 -0
- helm/benchmark/annotation/medalign_annotator.py +100 -0
- helm/benchmark/annotation/medi_qa_annotator.py +98 -0
- helm/benchmark/annotation/medication_qa_annotator.py +87 -63
- helm/benchmark/annotation/mental_health_annotator.py +98 -0
- helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
- helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
- helm/benchmark/annotation/model_as_judge.py +214 -6
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
- helm/benchmark/annotation/omni_math_annotator.py +131 -0
- helm/benchmark/annotation/spider_annotator.py +18 -0
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
- helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
- helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
- helm/benchmark/annotation/wildbench_annotator.py +119 -0
- helm/benchmark/annotation_executor.py +35 -15
- helm/benchmark/augmentations/cleva_perturbation.py +9 -8
- helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
- helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
- helm/benchmark/augmentations/dialect_perturbation.py +4 -5
- helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
- helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
- helm/benchmark/augmentations/gender_perturbation.py +2 -2
- helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
- helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
- helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
- helm/benchmark/augmentations/person_name_perturbation.py +4 -5
- helm/benchmark/augmentations/perturbation.py +1 -1
- helm/benchmark/augmentations/space_perturbation.py +2 -2
- helm/benchmark/augmentations/suffix_perturbation.py +2 -2
- helm/benchmark/augmentations/synonym_perturbation.py +4 -3
- helm/benchmark/augmentations/test_perturbation.py +16 -13
- helm/benchmark/augmentations/translate_perturbation.py +2 -2
- helm/benchmark/augmentations/typos_perturbation.py +2 -2
- helm/benchmark/data_preprocessor.py +2 -2
- helm/benchmark/executor.py +11 -12
- helm/benchmark/huggingface_registration.py +2 -7
- helm/benchmark/metrics/aci_bench_metrics.py +14 -0
- helm/benchmark/metrics/basic_metrics.py +6 -6
- helm/benchmark/metrics/bbq_metrics.py +2 -2
- helm/benchmark/metrics/bias_metrics.py +12 -3
- helm/benchmark/metrics/bias_word_lists.py +1 -1
- helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
- helm/benchmark/metrics/bird_sql_metrics.py +28 -0
- helm/benchmark/metrics/chw_care_plan_metrics.py +14 -0
- helm/benchmark/metrics/classification_metrics.py +76 -12
- helm/benchmark/metrics/cleva_harms_metrics.py +10 -9
- helm/benchmark/metrics/code_metrics.py +5 -5
- helm/benchmark/metrics/comet_metric.py +125 -0
- helm/benchmark/metrics/common_metric_specs.py +9 -2
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
- helm/benchmark/metrics/copyright_metrics.py +4 -4
- helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
- helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
- helm/benchmark/metrics/dischargeme_metrics.py +14 -0
- helm/benchmark/metrics/disinformation_metrics.py +4 -4
- helm/benchmark/metrics/dry_run_metrics.py +5 -5
- helm/benchmark/metrics/efficiency_metrics.py +6 -6
- helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
- helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
- helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
- helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
- helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
- helm/benchmark/metrics/ifeval/__init__.py +0 -0
- helm/benchmark/metrics/ifeval/instructions.py +1574 -0
- helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
- helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
- helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
- helm/benchmark/metrics/ifeval_metrics.py +55 -0
- helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
- helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
- helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
- helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
- helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
- helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
- helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
- helm/benchmark/metrics/language_modeling_metrics.py +4 -4
- helm/benchmark/metrics/llm_jury_metrics.py +46 -0
- helm/benchmark/metrics/machine_translation_metrics.py +2 -2
- helm/benchmark/metrics/med_dialog_metrics.py +14 -0
- helm/benchmark/metrics/medalign_metrics.py +14 -0
- helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
- helm/benchmark/metrics/medec_metrics.py +101 -0
- helm/benchmark/metrics/medi_qa_metrics.py +14 -0
- helm/benchmark/metrics/medication_qa_metrics.py +10 -19
- helm/benchmark/metrics/melt_bias_metric.py +234 -0
- helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
- helm/benchmark/metrics/melt_metric_specs.py +43 -0
- helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
- helm/benchmark/metrics/mental_health_metrics.py +14 -0
- helm/benchmark/metrics/metric.py +3 -3
- helm/benchmark/metrics/metric_service.py +11 -11
- helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
- helm/benchmark/metrics/mimic_rrs_metrics.py +14 -0
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +14 -0
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +14 -0
- helm/benchmark/metrics/nltk_helper.py +32 -0
- helm/benchmark/metrics/numeracy_metrics.py +4 -4
- helm/benchmark/metrics/omni_math_metrics.py +32 -0
- helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
- helm/benchmark/metrics/output_processing_metric.py +60 -0
- helm/benchmark/metrics/output_processors.py +15 -0
- helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
- helm/benchmark/metrics/ranking_metrics.py +3 -3
- helm/benchmark/metrics/reference_metric.py +3 -3
- helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
- helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
- helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
- helm/benchmark/metrics/spider_metrics.py +7 -0
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +14 -0
- helm/benchmark/metrics/statistic.py +1 -1
- helm/benchmark/metrics/summac/model_summac.py +2 -3
- helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
- helm/benchmark/metrics/summarization_metrics.py +20 -9
- helm/benchmark/metrics/test_bias_metrics.py +5 -1
- helm/benchmark/metrics/test_classification_metrics.py +140 -68
- helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
- helm/benchmark/metrics/test_metric.py +1 -1
- helm/benchmark/metrics/test_statistic.py +2 -2
- helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
- helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +3 -3
- helm/benchmark/metrics/toxicity_metrics.py +6 -6
- helm/benchmark/metrics/unitxt_metrics.py +7 -5
- helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
- helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
- helm/benchmark/metrics/vision_language/image_utils.py +2 -2
- helm/benchmark/metrics/wildbench_metrics.py +34 -0
- helm/benchmark/model_deployment_registry.py +6 -8
- helm/benchmark/model_metadata_registry.py +16 -0
- helm/benchmark/presentation/contamination.py +3 -3
- helm/benchmark/presentation/create_plots.py +33 -12
- helm/benchmark/presentation/run_display.py +13 -0
- helm/benchmark/presentation/schema.py +2 -1
- helm/benchmark/presentation/summarize.py +97 -67
- helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
- helm/benchmark/reeval_run.py +202 -0
- helm/benchmark/reeval_runner.py +355 -0
- helm/benchmark/run.py +86 -90
- helm/benchmark/run_expander.py +90 -9
- helm/benchmark/run_spec_factory.py +13 -0
- helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
- helm/benchmark/run_specs/audio_run_specs.py +657 -0
- helm/benchmark/run_specs/call_center_run_specs.py +49 -0
- helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
- helm/benchmark/run_specs/classic_run_specs.py +1 -69
- helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
- helm/benchmark/run_specs/enterprise_run_specs.py +280 -0
- helm/benchmark/run_specs/experimental_run_specs.py +142 -3
- helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
- helm/benchmark/run_specs/lite_run_specs.py +2 -2
- helm/benchmark/run_specs/long_context_run_specs.py +141 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +1260 -0
- helm/benchmark/run_specs/melt_run_specs.py +783 -0
- helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
- helm/benchmark/run_specs/oab_exams_specs.py +32 -0
- helm/benchmark/run_specs/safety_run_specs.py +37 -0
- helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +44 -44
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +169 -0
- helm/benchmark/run_specs/sql_run_specs.py +54 -0
- helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
- helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
- helm/benchmark/run_specs/vlm_run_specs.py +103 -2
- helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
- helm/benchmark/runner.py +5 -5
- helm/benchmark/scenarios/aci_bench_scenario.py +126 -0
- helm/benchmark/scenarios/air_bench_scenario.py +6 -1
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/__init__.py +0 -0
- helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +130 -0
- helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
- helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
- helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
- helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
- helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
- helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
- helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
- helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
- helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
- helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +103 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +110 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +109 -0
- helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
- helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +105 -0
- helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
- helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
- helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
- helm/benchmark/scenarios/banking77_scenario.py +6 -1
- helm/benchmark/scenarios/bbq_scenario.py +1 -1
- helm/benchmark/scenarios/big_bench_scenario.py +11 -1
- helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
- helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
- helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
- helm/benchmark/scenarios/blimp_scenario.py +1 -1
- helm/benchmark/scenarios/bold_scenario.py +1 -1
- helm/benchmark/scenarios/boolq_scenario.py +1 -1
- helm/benchmark/scenarios/casehold_scenario.py +79 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +106 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
- helm/benchmark/scenarios/clear_scenario.py +157 -0
- helm/benchmark/scenarios/cleva_scenario.py +2 -2
- helm/benchmark/scenarios/code_scenario.py +17 -4
- helm/benchmark/scenarios/commonsense_scenario.py +1 -1
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
- helm/benchmark/scenarios/copyright_scenario.py +1 -1
- helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
- helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
- helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
- helm/benchmark/scenarios/dischargeme_scenario.py +172 -0
- helm/benchmark/scenarios/disinformation_scenario.py +10 -1
- helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
- helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
- helm/benchmark/scenarios/ehr_sql_scenario.py +137 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +1519 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
- helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
- helm/benchmark/scenarios/gpqa_scenario.py +80 -0
- helm/benchmark/scenarios/grammar.py +2 -2
- helm/benchmark/scenarios/grammar_scenario.py +2 -2
- helm/benchmark/scenarios/gsm_scenario.py +10 -1
- helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
- helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
- helm/benchmark/scenarios/headqa_scenario.py +136 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
- helm/benchmark/scenarios/ice_scenario.py +8 -4
- helm/benchmark/scenarios/ifeval_scenario.py +53 -0
- helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
- helm/benchmark/scenarios/imdb_scenario.py +11 -2
- helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
- helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +79 -0
- helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
- helm/benchmark/scenarios/koala_scenario.py +1 -1
- helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
- helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
- helm/benchmark/scenarios/legal_support_scenario.py +11 -1
- helm/benchmark/scenarios/legalbench_scenario.py +22 -3
- helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
- helm/benchmark/scenarios/lextreme_scenario.py +11 -1
- helm/benchmark/scenarios/live_qa_scenario.py +1 -1
- helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
- helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
- helm/benchmark/scenarios/math_scenario.py +9 -1
- helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
- helm/benchmark/scenarios/med_dialog_scenario.py +25 -22
- helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
- helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
- helm/benchmark/scenarios/med_qa_scenario.py +10 -1
- helm/benchmark/scenarios/medalign_scenario.py +94 -0
- helm/benchmark/scenarios/medalign_scenario_helper.py +432 -0
- helm/benchmark/scenarios/medbullets_scenario.py +145 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +127 -0
- helm/benchmark/scenarios/medec_scenario.py +125 -0
- helm/benchmark/scenarios/medhallu_scenario.py +72 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +111 -0
- helm/benchmark/scenarios/medication_qa_scenario.py +8 -2
- helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
- helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
- helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
- helm/benchmark/scenarios/melt_scenarios.py +793 -0
- helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
- helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
- helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
- helm/benchmark/scenarios/mental_health_scenario.py +123 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +103 -0
- helm/benchmark/scenarios/mimic_rrs_scenario.py +98 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +77 -0
- helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
- helm/benchmark/scenarios/mmlu_scenario.py +11 -1
- helm/benchmark/scenarios/msmarco_scenario.py +1 -1
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +144 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +142 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +277 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
- helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
- helm/benchmark/scenarios/newsqa_scenario.py +1 -1
- helm/benchmark/scenarios/numeracy_scenario.py +12 -2
- helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
- helm/benchmark/scenarios/omni_math_scenario.py +53 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
- helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
- helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
- helm/benchmark/scenarios/pubmed_qa_scenario.py +59 -43
- helm/benchmark/scenarios/quac_scenario.py +10 -1
- helm/benchmark/scenarios/race_based_med_scenario.py +152 -0
- helm/benchmark/scenarios/raft_scenario.py +17 -2
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
- helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
- helm/benchmark/scenarios/scenario.py +9 -1
- helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +7 -2
- helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +75 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +75 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +76 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +77 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +74 -0
- helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +81 -0
- helm/benchmark/scenarios/shc_sei_scenario.py +94 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +77 -0
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
- helm/benchmark/scenarios/spider_scenario.py +91 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +97 -0
- helm/benchmark/scenarios/summarization_scenario.py +11 -1
- helm/benchmark/scenarios/sumosum_scenario.py +157 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
- helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
- helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
- helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
- helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
- helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
- helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
- helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
- helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
- helm/benchmark/scenarios/test_math_scenario.py +1 -0
- helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
- helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
- helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
- helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
- helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
- helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
- helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
- helm/benchmark/scenarios/the_pile_scenario.py +1 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +12 -2
- helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
- helm/benchmark/scenarios/unitxt_scenario.py +8 -2
- helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
- helm/benchmark/scenarios/vicuna_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
- helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
- helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
- helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
- helm/benchmark/scenarios/wikifact_scenario.py +11 -1
- helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
- helm/benchmark/scenarios/wildbench_scenario.py +83 -0
- helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
- helm/benchmark/scenarios/xstest_scenario.py +1 -1
- helm/benchmark/server.py +13 -1
- helm/benchmark/slurm_runner.py +1 -1
- helm/benchmark/static/schema_audio.yaml +763 -0
- helm/benchmark/static/schema_autobencher.yaml +150 -0
- helm/benchmark/static/schema_call_center.yaml +97 -60
- helm/benchmark/static/{schema_medical.yaml → schema_capabilities.yaml} +100 -101
- helm/benchmark/static/schema_czech_bank.yaml +148 -0
- helm/benchmark/static/schema_enem_challenge.yaml +146 -0
- helm/benchmark/static/schema_enterprise.yaml +319 -0
- helm/benchmark/static/schema_finance.yaml +14 -12
- helm/benchmark/static/schema_heim.yaml +1389 -0
- helm/benchmark/static/schema_long_context.yaml +283 -0
- helm/benchmark/static/schema_medhelm.yaml +1140 -0
- helm/benchmark/static/schema_melt.yaml +1257 -0
- helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
- helm/benchmark/static/schema_safety.yaml +18 -1
- helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +30 -16
- helm/benchmark/static/schema_slphelm.yaml +162 -0
- helm/benchmark/static/schema_social_audio.yaml +224 -0
- helm/benchmark/static/schema_sql.yaml +171 -0
- helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +169 -36
- helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
- helm/benchmark/static/schema_vhelm.yaml +129 -56
- helm/benchmark/static/schema_video.yaml +219 -0
- helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
- helm/benchmark/static_build/assets/index-94295e78.js +10 -0
- helm/benchmark/static_build/assets/index-b9779128.css +1 -0
- helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
- helm/benchmark/static_build/assets/medhelm-v1-overview-3ddfcd65.png +0 -0
- helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
- helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
- helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-38a10867.js} +2 -2
- helm/benchmark/static_build/config.js +1 -1
- helm/benchmark/static_build/index.html +6 -6
- helm/benchmark/window_services/default_window_service.py +1 -1
- helm/benchmark/window_services/encoder_decoder_window_service.py +4 -4
- helm/benchmark/window_services/ice_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
- helm/benchmark/window_services/local_window_service.py +2 -2
- helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
- helm/benchmark/window_services/test_bloom_window_service.py +3 -3
- helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
- helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
- helm/benchmark/window_services/test_gptj_window_service.py +8 -3
- helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
- helm/benchmark/window_services/test_openai_window_service.py +8 -3
- helm/benchmark/window_services/test_opt_window_service.py +3 -3
- helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
- helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
- helm/benchmark/window_services/test_t511b_window_service.py +3 -3
- helm/benchmark/window_services/test_ul2_window_service.py +3 -3
- helm/benchmark/window_services/test_utils.py +4 -5
- helm/benchmark/window_services/test_yalm_window_service.py +3 -3
- helm/benchmark/window_services/tokenizer_service.py +7 -8
- helm/benchmark/window_services/yalm_window_service.py +1 -1
- helm/clients/ai21_client.py +3 -3
- helm/clients/aleph_alpha_client.py +1 -1
- helm/clients/anthropic_client.py +69 -29
- helm/clients/audio_language/__init__.py +0 -0
- helm/clients/audio_language/diva_llama_client.py +120 -0
- helm/clients/audio_language/llama_omni_client.py +198 -0
- helm/clients/audio_language/qwen2_5_omni_client.py +197 -0
- helm/clients/audio_language/qwen2_audiolm_client.py +190 -0
- helm/clients/audio_language/qwen_audiolm_client.py +152 -0
- helm/clients/audio_language/test.py +62 -0
- helm/clients/auto_client.py +4 -2
- helm/clients/azure_openai_client.py +55 -0
- helm/clients/bedrock_client.py +203 -7
- helm/clients/bedrock_utils.py +33 -0
- helm/clients/client.py +7 -7
- helm/clients/clip_scorers/clip_scorer.py +1 -1
- helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
- helm/clients/cohere_client.py +3 -3
- helm/clients/google_client.py +1 -1
- helm/clients/grok_client.py +36 -0
- helm/clients/http_model_client.py +1 -1
- helm/clients/huggingface_client.py +52 -21
- helm/clients/huggingface_pipeline_client.py +138 -0
- helm/clients/ibm_client.py +267 -0
- helm/clients/image_generation/adobe_vision_client.py +1 -1
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
- helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
- helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
- helm/clients/image_generation/cogview2_client.py +1 -1
- helm/clients/image_generation/dalle2_client.py +1 -1
- helm/clients/image_generation/dalle3_client.py +2 -2
- helm/clients/image_generation/dalle_mini/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/data.py +1 -1
- helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
- helm/clients/image_generation/dalle_mini/model/configuration.py +2 -2
- helm/clients/image_generation/dalle_mini/model/modeling.py +3 -3
- helm/clients/image_generation/dalle_mini/model/processor.py +5 -5
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +2 -2
- helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
- helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
- helm/clients/image_generation/dalle_mini_client.py +1 -1
- helm/clients/image_generation/deep_floyd_client.py +1 -1
- helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
- helm/clients/image_generation/lexica_client.py +1 -1
- helm/clients/image_generation/mindalle/models/__init__.py +6 -6
- helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
- helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
- helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
- helm/clients/image_generation/mindalle_client.py +1 -1
- helm/clients/image_generation/together_image_generation_client.py +1 -1
- helm/clients/lit_gpt_client.py +2 -2
- helm/clients/mistral_client.py +62 -18
- helm/clients/nvidia_nim_client.py +0 -3
- helm/clients/openai_client.py +308 -43
- helm/clients/openai_responses_client.py +174 -0
- helm/clients/palmyra_client.py +3 -9
- helm/clients/reka_client.py +3 -3
- helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
- helm/clients/stanfordhealthcare_claude_client.py +31 -0
- helm/clients/stanfordhealthcare_google_client.py +43 -0
- helm/clients/stanfordhealthcare_http_model_client.py +93 -0
- helm/clients/stanfordhealthcare_openai_client.py +62 -0
- helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
- helm/clients/test_client.py +1 -1
- helm/clients/test_together_client.py +6 -1
- helm/clients/together_client.py +76 -9
- helm/clients/upstage_client.py +23 -0
- helm/clients/vertexai_client.py +45 -13
- helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
- helm/clients/vision_language/huggingface_vlm_client.py +2 -2
- helm/clients/vision_language/idefics_client.py +6 -2
- helm/clients/vision_language/open_flamingo/__init__.py +2 -2
- helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
- helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
- helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
- helm/clients/vision_language/paligemma_client.py +2 -2
- helm/clients/vision_language/qwen2_vlm_client.py +188 -0
- helm/clients/vision_language/qwen_vlm_client.py +7 -5
- helm/clients/vllm_client.py +4 -6
- helm/clients/writer_client.py +102 -0
- helm/clients/yi_client.py +0 -3
- helm/common/audio_utils.py +111 -0
- helm/common/context.py +80 -0
- helm/common/credentials_utils.py +5 -5
- helm/common/file_caches/local_file_cache.py +1 -1
- helm/common/file_caches/test_local_file_cache.py +1 -1
- helm/common/general.py +9 -2
- helm/common/hierarchical_logger.py +46 -3
- helm/common/images_utils.py +2 -2
- helm/common/local_context.py +140 -0
- helm/common/media_object.py +2 -2
- helm/common/multimodal_request_utils.py +26 -0
- helm/common/reeval_parameters.py +12 -0
- helm/common/remote_context.py +61 -0
- helm/common/request.py +14 -2
- helm/common/response_format.py +18 -0
- helm/common/test_media_object.py +1 -1
- helm/config/model_deployments.yaml +1792 -28
- helm/config/model_metadata.yaml +1606 -51
- helm/config/tokenizer_configs.yaml +521 -4
- helm/proxy/cli.py +5 -3
- helm/proxy/critique/mechanical_turk_utils.py +1 -1
- helm/proxy/example_queries.py +1 -1
- helm/proxy/server.py +11 -4
- helm/proxy/services/remote_service.py +1 -1
- helm/proxy/services/server_service.py +22 -86
- helm/proxy/services/test_remote_service.py +2 -2
- helm/proxy/services/test_service.py +1 -1
- helm/proxy/static/general.js +122 -0
- helm/proxy/static/help.html +99 -0
- helm/proxy/static/index.css +57 -0
- helm/proxy/static/index.html +40 -0
- helm/proxy/static/index.js +456 -0
- helm/proxy/static/info-icon.png +0 -0
- helm/proxy/test_retry.py +1 -1
- helm/proxy/token_counters/auto_token_counter.py +1 -1
- helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
- helm/tokenizers/caching_tokenizer.py +2 -30
- helm/tokenizers/grok_tokenizer.py +53 -0
- helm/tokenizers/http_model_tokenizer.py +1 -1
- helm/tokenizers/huggingface_tokenizer.py +3 -3
- helm/tokenizers/lit_gpt_tokenizer.py +1 -1
- helm/tokenizers/test_anthropic_tokenizer.py +6 -2
- helm/tokenizers/test_grok_tokenizer.py +33 -0
- helm/tokenizers/test_huggingface_tokenizer.py +1 -1
- helm/tokenizers/test_yalm_tokenizer.py +1 -1
- helm/tokenizers/tiktoken_tokenizer.py +1 -1
- helm/tokenizers/tokenizer.py +3 -1
- helm/tokenizers/yalm_tokenizer.py +3 -3
- helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
- crfm_helm-0.5.4.dist-info/METADATA +0 -350
- crfm_helm-0.5.4.dist-info/RECORD +0 -697
- helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
- helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
- helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
- helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
- helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
- helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
- helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
- helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
- helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
- helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
- helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
- helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
- helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
- helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
- helm/benchmark/static_build/assets/index-3ee38b3d.js +0 -10
- helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
- helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
- helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
- helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
- helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
- helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
- helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
- helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
- helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
- helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
- helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
- helm/tokenizers/anthropic_tokenizer.py +0 -52
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info/licenses}/LICENSE +0 -0
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/top_level.txt +0 -0
helm/clients/palmyra_client.py
CHANGED
@@ -5,14 +5,14 @@ from typing import Any, Dict, List

 from helm.clients.openai_client import OpenAIClient
 from helm.common.cache import CacheConfig
-from helm.common.hierarchical_logger import
+from helm.common.hierarchical_logger import hwarn
 from helm.common.request import wrap_request_time, Request, RequestResult, GeneratedOutput, Token, ErrorFlags
 from helm.common.tokenization_request import (
     TokenizationRequest,
     TokenizationRequestResult,
 )
 from helm.tokenizers.tokenizer import Tokenizer
-from .client import CachingClient, truncate_sequence
+from helm.clients.client import CachingClient, truncate_sequence


 _CONTENT_MODERATION_KEY = "fail.content.moderation.failed"
@@ -103,10 +103,7 @@ class PalmyraClient(CachingClient):
             return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])

         if _is_content_moderation_failure(response):
-
-                f"WARNING: Returning empty request for {request.model_deployment} "
-                "due to content moderation filter"
-            )
+            hwarn(f"Returning empty request for {request.model_deployment} " "due to content moderation filter")
             return RequestResult(
                 success=False,
                 cached=False,
@@ -163,6 +160,3 @@ class PalmyraChatClient(OpenAIClient):
             org_id=None,
             base_url="https://api.writer.com/v1/chat",
         )
-
-    def _is_chat_model_engine(self, model_engine: str) -> bool:
-        return True
helm/clients/reka_client.py
CHANGED
@@ -6,10 +6,10 @@ from helm.proxy.retry import NonRetriableException
 from helm.common.cache import CacheConfig
 from helm.common.media_object import TEXT_TYPE
 from helm.common.request import wrap_request_time, Request, RequestResult, GeneratedOutput
-from helm.common.hierarchical_logger import
+from helm.common.hierarchical_logger import hwarn
 from helm.common.optional_dependencies import handle_module_not_found_error
 from helm.tokenizers.tokenizer import Tokenizer
-from .client import CachingClient, truncate_and_tokenize_response_text
+from helm.clients.client import CachingClient, truncate_and_tokenize_response_text

 try:
     import reka
@@ -121,7 +121,7 @@ class RekaClient(CachingClient):
         if messages[-1]["role"] != "user":
             raise ValueError("Last message must have role 'user'")
         if request.prompt != "":
-
+            hwarn("Since message is set, prompt will be ignored")
             reka_chat_history = self._convert_messages_to_reka_chat_history(messages)
         else:
             current_chat_history: Dict[str, Any] = {
helm/clients/stanfordhealthcare_azure_openai_client.py
ADDED
@@ -0,0 +1,58 @@
+from typing import Optional
+
+from helm.clients.azure_openai_client import AzureOpenAIClient
+from helm.common.cache import CacheConfig
+from helm.proxy.retry import NonRetriableException
+from helm.tokenizers.tokenizer import Tokenizer
+
+
+class StanfordHealthCareAzureOpenAIClient(AzureOpenAIClient):
+    """
+    Client for accessing OpenAI models hosted on Stanford Health Care's model API.
+
+    Configure by setting the following in prod_env/credentials.conf:
+
+    ```
+    stanfordhealthcareEndpoint: https://your-domain-name/
+    stanfordhealthcareApiKey: your-private-key
+    ```
+    """
+
+    CREDENTIAL_HEADER_NAME = "Ocp-Apim-Subscription-Key"
+
+    def __init__(
+        self,
+        tokenizer: Tokenizer,
+        tokenizer_name: str,
+        cache_config: CacheConfig,
+        openai_model_name: str,
+        api_version: str,
+        api_key: Optional[str] = None,
+        endpoint: Optional[str] = None,
+        base_url: Optional[str] = None,
+    ):
+        if not api_key:
+            raise NonRetriableException("Must provide API key through credentials.conf")
+        if base_url:
+            base_url = base_url.format(endpoint=endpoint)
+            super().__init__(
+                tokenizer=tokenizer,
+                tokenizer_name=tokenizer_name,
+                cache_config=cache_config,
+                api_key="unused",
+                base_url=base_url,
+                azure_openai_deployment_name=openai_model_name,
+                api_version=api_version,
+                default_headers={StanfordHealthCareAzureOpenAIClient.CREDENTIAL_HEADER_NAME: api_key},
+            )
+        else:
+            super().__init__(
+                tokenizer=tokenizer,
+                tokenizer_name=tokenizer_name,
+                cache_config=cache_config,
+                api_key="unused",
+                endpoint=endpoint,
+                azure_openai_deployment_name=openai_model_name,
+                api_version=api_version,
+                default_headers={StanfordHealthCareAzureOpenAIClient.CREDENTIAL_HEADER_NAME: api_key},
+            )
helm/clients/stanfordhealthcare_claude_client.py
ADDED
@@ -0,0 +1,31 @@
+from typing import Any, Dict, List
+
+from helm.common.request import (
+    Request,
+    GeneratedOutput,
+)
+from helm.clients.stanfordhealthcare_http_model_client import StanfordHealthCareHTTPModelClient
+
+
+class StanfordHealthCareClaudeClient(StanfordHealthCareHTTPModelClient):
+    """
+    Client for accessing Claude models hosted on Stanford Health Care's model API.
+
+    Configure by setting the following in prod_env/credentials.conf:
+
+    ```
+    stanfordhealthcareEndpoint: https://your-domain-name/
+    stanfordhealthcareApiKey: your-private-key
+    ```
+    """
+
+    def get_request(self, request: Request) -> Dict[str, Any]:
+        return {
+            "model_id": self.model,
+            "prompt_text": request.prompt,
+            "max_tokens": request.max_tokens,
+            "temperature": request.temperature,
+        }
+
+    def parse_response(self, response: Dict[str, Any]) -> List[GeneratedOutput]:
+        return [GeneratedOutput(text=item["text"], logprob=0, tokens=[]) for item in response["content"]]
helm/clients/stanfordhealthcare_google_client.py
ADDED
@@ -0,0 +1,43 @@
+from typing import Any, Dict, List
+
+from helm.common.request import (
+    Request,
+    GeneratedOutput,
+)
+from helm.clients.stanfordhealthcare_http_model_client import StanfordHealthCareHTTPModelClient
+
+
+class StanfordHealthCareGoogleClient(StanfordHealthCareHTTPModelClient):
+    """
+    Client for accessing Google models hosted on Stanford Health Care's model API.
+
+    Configure by setting the following in prod_env/credentials.conf:
+
+    ```
+    stanfordhealthcareEndpoint: https://your-domain-name/
+    stanfordhealthcareApiKey: your-private-key
+    ```
+    """
+
+    def get_request(self, request: Request) -> Dict[str, Any]:
+        return {
+            "contents": {
+                "role": "user",
+                "parts": {"text": request.prompt},
+            },
+            "generation_config": {
+                "temperature": request.temperature,
+                "topP": request.top_p,
+                "topK": request.top_k_per_token,
+                "candidateCount": 1,
+                "maxOutputTokens": request.max_tokens,
+                "stopSequences": request.stop_sequences,
+            },
+        }
+
+    def parse_response(self, response: Dict[str, Any]) -> List[GeneratedOutput]:
+        completion = ""
+        for item in response["content"]:
+            if "content" in item["candidates"][0]:
+                completion += item["candidates"][0]["content"]["parts"][0]["text"]
+        return [GeneratedOutput(text=completion, logprob=0, tokens=[])]
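To make the nested indexing in `parse_response` above concrete, here is an illustrative walk-through in Python. The payload below is hypothetical, shaped like the `{"content": [...]}` wrapper that `StanfordHealthCareHTTPModelClient` (shown next in this diff) puts around list responses; it is not an actual API response.

```python
# Hypothetical payload, shaped like the {"content": [...]} wrapper built by
# StanfordHealthCareHTTPModelClient.make_request for list responses.
response = {
    "content": [
        {"candidates": [{"content": {"parts": [{"text": "Hello"}]}}]},
        {"candidates": [{"content": {"parts": [{"text": ", world."}]}}]},
        {"candidates": [{"finishReason": "STOP"}]},  # no "content" key, so it contributes nothing
    ]
}

# Same concatenation logic as StanfordHealthCareGoogleClient.parse_response above:
completion = ""
for item in response["content"]:
    if "content" in item["candidates"][0]:
        completion += item["candidates"][0]["content"]["parts"][0]["text"]

assert completion == "Hello, world."
```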
helm/clients/stanfordhealthcare_http_model_client.py
ADDED
@@ -0,0 +1,93 @@
+import requests
+
+from abc import ABC, abstractmethod
+from dataclasses import asdict
+from typing import Any, Dict, List, Optional
+
+from helm.common.cache import CacheConfig
+from helm.common.request import (
+    wrap_request_time,
+    Request,
+    RequestResult,
+    GeneratedOutput,
+    EMBEDDING_UNAVAILABLE_REQUEST_RESULT,
+)
+from helm.clients.client import CachingClient
+
+
+class StanfordHealthCareHTTPModelClient(CachingClient, ABC):
+    """
+    Client for accessing Stanford Health Care models via HTTP requests.
+
+    Configure by setting the following in prod_env/credentials.conf:
+
+    ```
+    stanfordhealthcareEndpoint: https://your-domain-name/
+    stanfordhealthcareApiKey: your-private-key
+    ```
+    """
+
+    CREDENTIAL_HEADER_NAME = "Ocp-Apim-Subscription-Key"
+
+    def __init__(
+        self,
+        cache_config: CacheConfig,
+        deployment: str,
+        endpoint: str = "http://localhost:8080",
+        do_cache: bool = False,
+        timeout: int = 3000,
+        api_key: Optional[str] = None,
+        model: Optional[str] = None,
+    ):
+        super().__init__(cache_config=cache_config)
+        assert api_key, "API key must be provided"
+        self.endpoint = endpoint
+        self.timeout = timeout
+        self.do_cache = do_cache
+        self.deployment = deployment
+        self.model = model
+        self.default_headers = {StanfordHealthCareHTTPModelClient.CREDENTIAL_HEADER_NAME: api_key}
+
+    def make_request(self, request: Request) -> RequestResult:
+        cache_key = asdict(request)
+        if request.embedding:
+            return EMBEDDING_UNAVAILABLE_REQUEST_RESULT
+
+        raw_request = self.get_request(request)
+
+        try:
+
+            def do_it() -> Dict[str, Any]:
+                url = f"{self.endpoint}/{self.deployment}"
+                response = requests.post(url, json=raw_request, headers=self.default_headers, timeout=self.timeout)
+                response.raise_for_status()
+                response_json = response.json()
+                if type(response_json) == list:
+                    response_json = {"content": response_json}
+                return response_json
+
+            if self.do_cache:
+                response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
+            else:
+                response, cached = wrap_request_time(do_it)(), False
+
+            completions = self.parse_response(response)
+
+            return RequestResult(
+                success=True,
+                cached=cached,
+                error=None,
+                completions=completions,
+                embedding=[],
+                request_time=response["request_time"],
+            )
+        except requests.exceptions.RequestException as e:
+            return RequestResult(success=False, cached=False, error=f"Request error: {e}", completions=[], embedding=[])
+
+    @abstractmethod
+    def get_request(self, request: Request) -> Dict[str, Any]:
+        pass
+
+    @abstractmethod
+    def parse_response(self, response: Dict[str, Any]) -> List[GeneratedOutput]:
+        pass
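The two abstract methods at the bottom are the entire extension surface: a concrete client only maps a HELM `Request` onto the endpoint's JSON body and turns the JSON reply back into `GeneratedOutput`s, while `make_request` above handles caching, headers, and error handling. As a rough sketch only (the class name and the `"prompt"`/`"text"` field names below are made up for illustration, not part of this release), a minimal subclass could look like this; the Claude and Google clients earlier in this diff follow the same pattern.

```python
from typing import Any, Dict, List

from helm.clients.stanfordhealthcare_http_model_client import StanfordHealthCareHTTPModelClient
from helm.common.request import GeneratedOutput, Request


class MyHTTPModelClient(StanfordHealthCareHTTPModelClient):
    """Hypothetical client for an endpoint that accepts {"prompt": ...} and returns {"content": [{"text": ...}]}."""

    def get_request(self, request: Request) -> Dict[str, Any]:
        # Translate the HELM Request into the endpoint's JSON body.
        return {"prompt": request.prompt, "max_tokens": request.max_tokens}

    def parse_response(self, response: Dict[str, Any]) -> List[GeneratedOutput]:
        # Turn the endpoint's JSON reply into HELM completions.
        return [GeneratedOutput(text=item["text"], logprob=0, tokens=[]) for item in response["content"]]
```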
helm/clients/stanfordhealthcare_openai_client.py
ADDED
@@ -0,0 +1,62 @@
+from typing import Optional
+
+from helm.clients.openai_client import OpenAIClient
+from helm.common.cache import CacheConfig
+from helm.common.optional_dependencies import handle_module_not_found_error
+from helm.proxy.retry import NonRetriableException
+from helm.tokenizers.tokenizer import Tokenizer
+
+try:
+    from openai import OpenAI
+except ModuleNotFoundError as e:
+    handle_module_not_found_error(e, ["openai"])
+
+
+class StanfordHealthCareOpenAIClient(OpenAIClient):
+    """
+    Client for accessing OpenAI models hosted on Stanford Health Care's model API.
+
+    Configure by setting the following in prod_env/credentials.conf:
+
+    ```
+    stanfordhealthcareEndpoint: https://your-domain-name/
+    stanfordhealthcareApiKey: your-private-key
+    ```
+    """
+
+    CREDENTIAL_HEADER_NAME = "Ocp-Apim-Subscription-Key"
+
+    def __init__(
+        self,
+        tokenizer: Tokenizer,
+        tokenizer_name: str,
+        cache_config: CacheConfig,
+        model_name: str,
+        api_key: Optional[str] = None,
+        endpoint: Optional[str] = None,
+        base_url: Optional[str] = None,
+        openai_model_name: Optional[str] = None,
+        output_processor: Optional[str] = None,
+    ):
+        super().__init__(
+            tokenizer=tokenizer,
+            tokenizer_name=tokenizer_name,
+            cache_config=cache_config,
+            api_key="unused",
+            openai_model_name=openai_model_name,
+            output_processor=output_processor,
+        )
+        if not endpoint:
+            raise NonRetriableException("Must provide endpoint through credentials.conf")
+        if not api_key:
+            raise NonRetriableException("Must provide API key through credentials.conf")
+        if not base_url:
+            raise NonRetriableException("Must provide base url through model_deployments")
+        # Guess the base URL part based on the model name
+        # Maybe make this configurable instead?
+        base_url = base_url.format(endpoint=endpoint)
+        self.client = OpenAI(
+            api_key="dummy",
+            base_url=base_url,
+            default_headers={StanfordHealthCareOpenAIClient.CREDENTIAL_HEADER_NAME: api_key},
+        )
helm/clients/stanfordhealthcare_shc_openai_client.py
ADDED
@@ -0,0 +1,42 @@
+from typing import Dict, Optional
+
+from helm.clients.azure_openai_client import AzureOpenAIClient
+from helm.common.cache import CacheConfig
+from helm.proxy.retry import NonRetriableException
+from helm.tokenizers.tokenizer import Tokenizer
+
+
+class StanfordHealthCareSHCOpenAIClient(AzureOpenAIClient):
+    """
+    Client for accessing OpenAI models hosted on Stanford Health Care's model API.
+
+    Configure by setting the following in prod_env/credentials.conf:
+
+    ```
+    stanfordhealthcareEndpoint: https://your-domain-name/
+    stanfordhealthcareApiKey: your-private-key
+    ```
+    """
+
+    API_VERSION = "2024-08-01-preview"
+
+    def __init__(
+        self,
+        tokenizer: Tokenizer,
+        tokenizer_name: str,
+        cache_config: CacheConfig,
+        api_key: Optional[str] = None,
+        endpoint: Optional[str] = None,
+        default_headers: Optional[Dict[str, str]] = None,
+    ):
+        if not api_key:
+            raise NonRetriableException("Must provide API key through credentials.conf")
+        super().__init__(
+            tokenizer=tokenizer,
+            tokenizer_name=tokenizer_name,
+            cache_config=cache_config,
+            api_key=api_key,
+            endpoint=endpoint,
+            api_version=StanfordHealthCareSHCOpenAIClient.API_VERSION,
+            default_headers=default_headers,
+        )
helm/clients/test_client.py
CHANGED
@@ -1,6 +1,6 @@
 from helm.common.cache_backend_config import BlackHoleCacheBackendConfig
 from helm.tokenizers.auto_tokenizer import AutoTokenizer
-from .client import truncate_sequence, truncate_and_tokenize_response_text
+from helm.clients.client import truncate_sequence, truncate_and_tokenize_response_text
 from typing import List
 from helm.common.request import Request, GeneratedOutput, Token

helm/clients/test_together_client.py
CHANGED
@@ -5,7 +5,12 @@ import tempfile
 from helm.common.cache import BlackHoleCacheConfig, SqliteCacheConfig
 from helm.common.request import Request

-from .together_client import
+from helm.clients.together_client import (
+    TogetherClient,
+    TogetherChatClient,
+    TogetherCompletionClient,
+    TogetherClientError,
+)


 class TestTogetherClient:
helm/clients/together_client.py
CHANGED
@@ -1,15 +1,18 @@
 from copy import deepcopy
 from itertools import zip_longest
+import re
 import threading
-from typing import List, Dict, Any, Mapping, Optional, TypedDict, Union
+from typing import Callable, List, Dict, Any, Mapping, Optional, Tuple, TypedDict, Union
+from typing_extensions import NotRequired
 
 import requests
 from retrying import retry
 
 from helm.common.cache import CacheConfig
 from helm.common.media_object import IMAGE_TYPE, TEXT_TYPE
+from helm.common.object_spec import get_class_by_name
 from helm.common.optional_dependencies import handle_module_not_found_error
-from helm.common.request import wrap_request_time, Request, RequestResult, GeneratedOutput, Token
+from helm.common.request import Thinking, wrap_request_time, Request, RequestResult, GeneratedOutput, Token
 from helm.clients.client import CachingClient, truncate_sequence, cleanup_str
 
 try:
@@ -98,6 +101,19 @@ class JobNotFinishedError(TogetherClientError):
     pass
 
 
+def _parse_thinking(input: str) -> Tuple[str, str]:
+    """Return a tuple of thinking text and output text."""
+    match = re.match(r"<think>\n(.*)\n</think>\n{0,2}(.*)", input, re.DOTALL)
+    if match:
+        return (match.group(1), match.group(2))
+
+    match = re.match(r"<think>\n?(.*)", input, re.DOTALL)
+    if match:
+        return (match.group(1), "")
+
+    return (input, "")
+
+
 class TogetherClient(CachingClient):
     """
     Client for the models where we evaluate offline. Since the queries are handled offline, the `TogetherClient` just
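To make the regex behavior above concrete, here is a small illustration (not part of the diff); the function body is copied from the hunk and the sample strings are made up:

```
import re
from typing import Tuple


def _parse_thinking(input: str) -> Tuple[str, str]:
    """Return a tuple of thinking text and output text (copied from the hunk above)."""
    match = re.match(r"<think>\n(.*)\n</think>\n{0,2}(.*)", input, re.DOTALL)
    if match:
        return (match.group(1), match.group(2))

    match = re.match(r"<think>\n?(.*)", input, re.DOTALL)
    if match:
        return (match.group(1), "")

    return (input, "")


# Closed tag: the reasoning and the final answer are separated.
assert _parse_thinking("<think>\nstep 1\nstep 2\n</think>\n\nfinal answer") == ("step 1\nstep 2", "final answer")
# Unclosed tag (e.g. generation stopped mid-thought): everything becomes thinking text.
assert _parse_thinking("<think>\nstill reasoning") == ("still reasoning", "")
# No tag: the text passes through unchanged.
assert _parse_thinking("plain output") == ("plain output", "")
```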
@@ -313,15 +329,31 @@ class TogetherRawChatRequest(TypedDict):
     logprobs: int
     echo: bool
     n: int
+    response_format: NotRequired[Dict[str, Any]]
 
 
 class TogetherChatClient(CachingClient):
     """Client that uses the Python Together library for chat models."""
 
-    def __init__(
+    def __init__(
+        self,
+        cache_config: CacheConfig,
+        api_key: Optional[str],
+        together_model: Optional[str] = None,
+        disable_logprobs: Optional[bool] = None,
+        output_processor: Optional[str] = None,
+        parse_thinking: Optional[bool] = None,
+    ):
         super().__init__(cache_config=cache_config)
         self._client = Together(api_key=api_key)
         self._together_model = together_model
+        self._disable_logprobs = bool(disable_logprobs)
+        # self.output_processor is actually a function, not a class
+        self._parse_thinking = bool(parse_thinking)
+
+        self.output_processor: Optional[Callable[[str], str]] = (
+            get_class_by_name(output_processor) if output_processor else None
+        )
 
     def convert_to_raw_chat_request(self, request: Request) -> TogetherRawChatRequest:
         request.validate()
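A minimal construction sketch (not from the diff) showing the new keyword arguments; the model name and key are placeholders, and in practice HELM wires these through its model deployment configuration rather than direct instantiation. Assumes the `together` optional dependency is installed:

```
from helm.common.cache import BlackHoleCacheConfig
from helm.clients.together_client import TogetherChatClient

client = TogetherChatClient(
    cache_config=BlackHoleCacheConfig(),
    api_key="your-together-api-key",           # placeholder
    together_model="deepseek-ai/DeepSeek-R1",  # placeholder model name
    disable_logprobs=True,   # send logprobs=0 for models that reject logprob requests
    output_processor=None,   # or a fully qualified function name, resolved via get_class_by_name
    parse_thinking=True,     # split <think>...</think> into GeneratedOutput.thinking
)
```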
@@ -353,7 +385,11 @@ class TogetherChatClient(CachingClient):
             model = self._together_model
         else:
             model = request.model
-
+        if self._disable_logprobs:
+            logprobs = 0
+        else:
+            logprobs = min(request.top_k_per_token, 1)
+        raw_chat_request: TogetherRawChatRequest = {
             "messages": messages,
             "model": model,
             "max_tokens": request.max_tokens,
@@ -361,10 +397,16 @@ class TogetherChatClient(CachingClient):
             "temperature": request.temperature,
             "top_p": request.top_p,
             "top_k": request.top_k_per_token,
-            "logprobs":
+            "logprobs": logprobs,
             "echo": request.echo_prompt,
             "n": request.num_completions,
         }
+        if request.response_format and request.response_format.json_schema:
+            raw_chat_request["response_format"] = {
+                "type": "json_object",
+                "schema": request.response_format.json_schema,
+            }
+        return raw_chat_request
 
     def make_request(self, request: Request) -> RequestResult:
         raw_request = self.convert_to_raw_chat_request(request)
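For reference, when a request carries a JSON schema the Together payload gains a `response_format` entry of roughly this shape; the schema values below are illustrative only, not from the diff:

```
# Illustrative stand-in for request.response_format.json_schema.
json_schema = {
    "type": "object",
    "properties": {"answer": {"type": "string"}},
    "required": ["answer"],
}

# Mirrors the mapping added in convert_to_raw_chat_request above.
response_format_payload = {"type": "json_object", "schema": json_schema}
print(response_format_payload)
```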
@@ -398,8 +440,21 @@ class TogetherChatClient(CachingClient):
                     if token_text is None:
                         break
                     tokens.append(Token(text=token_text, logprob=token_logprob or 0.0))
+            logprob = sum([token.logprob for token in tokens]) if tokens else 0.0
             assert choice.message.role == "assistant"
-
+            output_text = choice.message.content
+            if self.output_processor:
+                output_text = self.output_processor(output_text)
+
+            if self._parse_thinking:
+                thinking_text, output_text = _parse_thinking(output_text)
+                generated_outputs.append(
+                    GeneratedOutput(
+                        text=output_text, logprob=logprob, tokens=tokens, thinking=Thinking(text=thinking_text)
+                    )
+                )
+            else:
+                generated_outputs.append(GeneratedOutput(text=output_text, logprob=logprob, tokens=tokens))
         return RequestResult(
             success=True,
             cached=cached,
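The parsed reasoning ends up on the `thinking` field of `GeneratedOutput` rather than in the scored `text`. A small sketch (not from the diff) with made-up values:

```
from helm.common.request import GeneratedOutput, Thinking, Token

output = GeneratedOutput(
    text="final answer",
    logprob=0.0,
    tokens=[Token(text="final", logprob=0.0), Token(text=" answer", logprob=0.0)],
    thinking=Thinking(text="step 1\nstep 2"),
)
assert output.thinking is not None
print(output.thinking.text)  # reasoning trace, kept separate from output.text
```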
@@ -426,16 +481,27 @@ class TogetherRawCompletionRequest(TypedDict):
 class TogetherCompletionClient(CachingClient):
     """Client that uses the Python Together library for text completion models."""
 
-    def __init__(
+    def __init__(
+        self,
+        cache_config: CacheConfig,
+        api_key: Optional[str],
+        together_model: Optional[str] = None,
+        disable_logprobs: Optional[bool] = None,
+    ):
         super().__init__(cache_config=cache_config)
         self._client = Together(api_key=api_key)
         self._together_model = together_model
+        self._disable_logprobs = bool(disable_logprobs)
 
     def convert_to_raw_completion_request(self, request: Request) -> TogetherRawCompletionRequest:
         if self._together_model is not None:
             model = self._together_model
         else:
             model = request.model
+        if self._disable_logprobs:
+            logprobs = 0
+        else:
+            logprobs = min(request.top_k_per_token, 1)
         return {
             "prompt": request.prompt,
             "model": model,
@@ -444,7 +510,7 @@ class TogetherCompletionClient(CachingClient):
             "temperature": request.temperature,
             "top_p": request.top_p,
             "top_k": request.top_k_per_token,
-            "logprobs":
+            "logprobs": logprobs,
             "echo": request.echo_prompt,
             "n": request.num_completions,
         }
@@ -481,8 +547,9 @@ class TogetherCompletionClient(CachingClient):
                     if token_text is None:
                         break
                     tokens.append(Token(text=token_text, logprob=token_logprob or 0.0))
+            logprob = sum([token.logprob for token in tokens]) if tokens else 0.0
             assert choice.text
-            generated_outputs.append(GeneratedOutput(text=choice.text, logprob=
+            generated_outputs.append(GeneratedOutput(text=choice.text, logprob=logprob, tokens=tokens))
         return RequestResult(
             success=True,
             cached=cached,
@@ -0,0 +1,23 @@
+from helm.clients.openai_client import OpenAIClient
+from helm.common.cache import CacheConfig
+from helm.tokenizers.tokenizer import Tokenizer
+
+
+class UpstageChatClient(OpenAIClient):
+    """Sends request to a Upstage model using a OpenAI-compatible Chat API."""
+
+    def __init__(
+        self,
+        tokenizer: Tokenizer,
+        tokenizer_name: str,
+        cache_config: CacheConfig,
+        api_key: str,
+    ):
+        super().__init__(
+            tokenizer=tokenizer,
+            tokenizer_name=tokenizer_name,
+            cache_config=cache_config,
+            api_key=api_key,
+            org_id=None,
+            base_url="https://api.upstage.ai/v1/solar",
+        )
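Since the client above just points the OpenAI-compatible machinery at Upstage's Solar endpoint, the equivalent raw call looks roughly like this sketch (not from the diff); the model name and key are placeholders and the `openai` package is assumed to be installed:

```
from openai import OpenAI

client = OpenAI(
    api_key="your-upstage-api-key",              # placeholder
    base_url="https://api.upstage.ai/v1/solar",  # same base_url as UpstageChatClient above
)
response = client.chat.completions.create(
    model="solar-pro",  # placeholder model name
    messages=[{"role": "user", "content": "Hello"}],
)
print(response.choices[0].message.content)
```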