crfm-helm: crfm_helm-0.5.4-py3-none-any.whl → crfm_helm-0.5.6-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crfm-helm might be problematic. Click here for more details.
- crfm_helm-0.5.6.dist-info/METADATA +427 -0
- crfm_helm-0.5.6.dist-info/RECORD +941 -0
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +13 -1
- helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
- helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
- helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
- helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +4 -4
- helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
- helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/common_adapter_specs.py +69 -4
- helm/benchmark/adaptation/prompt.py +1 -1
- helm/benchmark/annotation/aci_bench_annotator.py +95 -0
- helm/benchmark/annotation/air_bench_annotator.py +21 -6
- helm/benchmark/annotation/annotator.py +5 -0
- helm/benchmark/annotation/annotator_factory.py +3 -20
- helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
- helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
- helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
- helm/benchmark/annotation/bird_sql_annotator.py +58 -0
- helm/benchmark/annotation/chw_care_plan_annotator.py +93 -0
- helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
- helm/benchmark/annotation/dischargeme_annotator.py +107 -0
- helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
- helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
- helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
- helm/benchmark/annotation/live_qa_annotator.py +1 -1
- helm/benchmark/annotation/med_dialog_annotator.py +99 -0
- helm/benchmark/annotation/medalign_annotator.py +100 -0
- helm/benchmark/annotation/medi_qa_annotator.py +98 -0
- helm/benchmark/annotation/medication_qa_annotator.py +87 -63
- helm/benchmark/annotation/mental_health_annotator.py +98 -0
- helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
- helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
- helm/benchmark/annotation/model_as_judge.py +214 -6
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
- helm/benchmark/annotation/omni_math_annotator.py +131 -0
- helm/benchmark/annotation/spider_annotator.py +18 -0
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
- helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
- helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
- helm/benchmark/annotation/wildbench_annotator.py +119 -0
- helm/benchmark/annotation_executor.py +35 -15
- helm/benchmark/augmentations/cleva_perturbation.py +9 -8
- helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
- helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
- helm/benchmark/augmentations/dialect_perturbation.py +4 -5
- helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
- helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
- helm/benchmark/augmentations/gender_perturbation.py +2 -2
- helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
- helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
- helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
- helm/benchmark/augmentations/person_name_perturbation.py +4 -5
- helm/benchmark/augmentations/perturbation.py +1 -1
- helm/benchmark/augmentations/space_perturbation.py +2 -2
- helm/benchmark/augmentations/suffix_perturbation.py +2 -2
- helm/benchmark/augmentations/synonym_perturbation.py +4 -3
- helm/benchmark/augmentations/test_perturbation.py +16 -13
- helm/benchmark/augmentations/translate_perturbation.py +2 -2
- helm/benchmark/augmentations/typos_perturbation.py +2 -2
- helm/benchmark/data_preprocessor.py +2 -2
- helm/benchmark/executor.py +11 -12
- helm/benchmark/huggingface_registration.py +2 -7
- helm/benchmark/metrics/aci_bench_metrics.py +14 -0
- helm/benchmark/metrics/basic_metrics.py +6 -6
- helm/benchmark/metrics/bbq_metrics.py +2 -2
- helm/benchmark/metrics/bias_metrics.py +12 -3
- helm/benchmark/metrics/bias_word_lists.py +1 -1
- helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
- helm/benchmark/metrics/bird_sql_metrics.py +28 -0
- helm/benchmark/metrics/chw_care_plan_metrics.py +14 -0
- helm/benchmark/metrics/classification_metrics.py +76 -12
- helm/benchmark/metrics/cleva_harms_metrics.py +10 -9
- helm/benchmark/metrics/code_metrics.py +5 -5
- helm/benchmark/metrics/comet_metric.py +125 -0
- helm/benchmark/metrics/common_metric_specs.py +9 -2
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
- helm/benchmark/metrics/copyright_metrics.py +4 -4
- helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
- helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
- helm/benchmark/metrics/dischargeme_metrics.py +14 -0
- helm/benchmark/metrics/disinformation_metrics.py +4 -4
- helm/benchmark/metrics/dry_run_metrics.py +5 -5
- helm/benchmark/metrics/efficiency_metrics.py +6 -6
- helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
- helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
- helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
- helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
- helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
- helm/benchmark/metrics/ifeval/__init__.py +0 -0
- helm/benchmark/metrics/ifeval/instructions.py +1574 -0
- helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
- helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
- helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
- helm/benchmark/metrics/ifeval_metrics.py +55 -0
- helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
- helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
- helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
- helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
- helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
- helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
- helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
- helm/benchmark/metrics/language_modeling_metrics.py +4 -4
- helm/benchmark/metrics/llm_jury_metrics.py +46 -0
- helm/benchmark/metrics/machine_translation_metrics.py +2 -2
- helm/benchmark/metrics/med_dialog_metrics.py +14 -0
- helm/benchmark/metrics/medalign_metrics.py +14 -0
- helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
- helm/benchmark/metrics/medec_metrics.py +101 -0
- helm/benchmark/metrics/medi_qa_metrics.py +14 -0
- helm/benchmark/metrics/medication_qa_metrics.py +10 -19
- helm/benchmark/metrics/melt_bias_metric.py +234 -0
- helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
- helm/benchmark/metrics/melt_metric_specs.py +43 -0
- helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
- helm/benchmark/metrics/mental_health_metrics.py +14 -0
- helm/benchmark/metrics/metric.py +3 -3
- helm/benchmark/metrics/metric_service.py +11 -11
- helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
- helm/benchmark/metrics/mimic_rrs_metrics.py +14 -0
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +14 -0
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +14 -0
- helm/benchmark/metrics/nltk_helper.py +32 -0
- helm/benchmark/metrics/numeracy_metrics.py +4 -4
- helm/benchmark/metrics/omni_math_metrics.py +32 -0
- helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
- helm/benchmark/metrics/output_processing_metric.py +60 -0
- helm/benchmark/metrics/output_processors.py +15 -0
- helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
- helm/benchmark/metrics/ranking_metrics.py +3 -3
- helm/benchmark/metrics/reference_metric.py +3 -3
- helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
- helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
- helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
- helm/benchmark/metrics/spider_metrics.py +7 -0
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +14 -0
- helm/benchmark/metrics/statistic.py +1 -1
- helm/benchmark/metrics/summac/model_summac.py +2 -3
- helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
- helm/benchmark/metrics/summarization_metrics.py +20 -9
- helm/benchmark/metrics/test_bias_metrics.py +5 -1
- helm/benchmark/metrics/test_classification_metrics.py +140 -68
- helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
- helm/benchmark/metrics/test_metric.py +1 -1
- helm/benchmark/metrics/test_statistic.py +2 -2
- helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
- helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +3 -3
- helm/benchmark/metrics/toxicity_metrics.py +6 -6
- helm/benchmark/metrics/unitxt_metrics.py +7 -5
- helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
- helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
- helm/benchmark/metrics/vision_language/image_utils.py +2 -2
- helm/benchmark/metrics/wildbench_metrics.py +34 -0
- helm/benchmark/model_deployment_registry.py +6 -8
- helm/benchmark/model_metadata_registry.py +16 -0
- helm/benchmark/presentation/contamination.py +3 -3
- helm/benchmark/presentation/create_plots.py +33 -12
- helm/benchmark/presentation/run_display.py +13 -0
- helm/benchmark/presentation/schema.py +2 -1
- helm/benchmark/presentation/summarize.py +97 -67
- helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
- helm/benchmark/reeval_run.py +202 -0
- helm/benchmark/reeval_runner.py +355 -0
- helm/benchmark/run.py +86 -90
- helm/benchmark/run_expander.py +90 -9
- helm/benchmark/run_spec_factory.py +13 -0
- helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
- helm/benchmark/run_specs/audio_run_specs.py +657 -0
- helm/benchmark/run_specs/call_center_run_specs.py +49 -0
- helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
- helm/benchmark/run_specs/classic_run_specs.py +1 -69
- helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
- helm/benchmark/run_specs/enterprise_run_specs.py +280 -0
- helm/benchmark/run_specs/experimental_run_specs.py +142 -3
- helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
- helm/benchmark/run_specs/lite_run_specs.py +2 -2
- helm/benchmark/run_specs/long_context_run_specs.py +141 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +1260 -0
- helm/benchmark/run_specs/melt_run_specs.py +783 -0
- helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
- helm/benchmark/run_specs/oab_exams_specs.py +32 -0
- helm/benchmark/run_specs/safety_run_specs.py +37 -0
- helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +44 -44
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +169 -0
- helm/benchmark/run_specs/sql_run_specs.py +54 -0
- helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
- helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
- helm/benchmark/run_specs/vlm_run_specs.py +103 -2
- helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
- helm/benchmark/runner.py +5 -5
- helm/benchmark/scenarios/aci_bench_scenario.py +126 -0
- helm/benchmark/scenarios/air_bench_scenario.py +6 -1
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/__init__.py +0 -0
- helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +130 -0
- helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
- helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
- helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
- helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
- helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
- helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
- helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
- helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
- helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
- helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +103 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +110 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +109 -0
- helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
- helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +105 -0
- helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
- helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
- helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
- helm/benchmark/scenarios/banking77_scenario.py +6 -1
- helm/benchmark/scenarios/bbq_scenario.py +1 -1
- helm/benchmark/scenarios/big_bench_scenario.py +11 -1
- helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
- helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
- helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
- helm/benchmark/scenarios/blimp_scenario.py +1 -1
- helm/benchmark/scenarios/bold_scenario.py +1 -1
- helm/benchmark/scenarios/boolq_scenario.py +1 -1
- helm/benchmark/scenarios/casehold_scenario.py +79 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +106 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
- helm/benchmark/scenarios/clear_scenario.py +157 -0
- helm/benchmark/scenarios/cleva_scenario.py +2 -2
- helm/benchmark/scenarios/code_scenario.py +17 -4
- helm/benchmark/scenarios/commonsense_scenario.py +1 -1
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
- helm/benchmark/scenarios/copyright_scenario.py +1 -1
- helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
- helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
- helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
- helm/benchmark/scenarios/dischargeme_scenario.py +172 -0
- helm/benchmark/scenarios/disinformation_scenario.py +10 -1
- helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
- helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
- helm/benchmark/scenarios/ehr_sql_scenario.py +137 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +1519 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
- helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
- helm/benchmark/scenarios/gpqa_scenario.py +80 -0
- helm/benchmark/scenarios/grammar.py +2 -2
- helm/benchmark/scenarios/grammar_scenario.py +2 -2
- helm/benchmark/scenarios/gsm_scenario.py +10 -1
- helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
- helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
- helm/benchmark/scenarios/headqa_scenario.py +136 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
- helm/benchmark/scenarios/ice_scenario.py +8 -4
- helm/benchmark/scenarios/ifeval_scenario.py +53 -0
- helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
- helm/benchmark/scenarios/imdb_scenario.py +11 -2
- helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
- helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +79 -0
- helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
- helm/benchmark/scenarios/koala_scenario.py +1 -1
- helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
- helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
- helm/benchmark/scenarios/legal_support_scenario.py +11 -1
- helm/benchmark/scenarios/legalbench_scenario.py +22 -3
- helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
- helm/benchmark/scenarios/lextreme_scenario.py +11 -1
- helm/benchmark/scenarios/live_qa_scenario.py +1 -1
- helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
- helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
- helm/benchmark/scenarios/math_scenario.py +9 -1
- helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
- helm/benchmark/scenarios/med_dialog_scenario.py +25 -22
- helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
- helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
- helm/benchmark/scenarios/med_qa_scenario.py +10 -1
- helm/benchmark/scenarios/medalign_scenario.py +94 -0
- helm/benchmark/scenarios/medalign_scenario_helper.py +432 -0
- helm/benchmark/scenarios/medbullets_scenario.py +145 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +127 -0
- helm/benchmark/scenarios/medec_scenario.py +125 -0
- helm/benchmark/scenarios/medhallu_scenario.py +72 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +111 -0
- helm/benchmark/scenarios/medication_qa_scenario.py +8 -2
- helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
- helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
- helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
- helm/benchmark/scenarios/melt_scenarios.py +793 -0
- helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
- helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
- helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
- helm/benchmark/scenarios/mental_health_scenario.py +123 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +103 -0
- helm/benchmark/scenarios/mimic_rrs_scenario.py +98 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +77 -0
- helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
- helm/benchmark/scenarios/mmlu_scenario.py +11 -1
- helm/benchmark/scenarios/msmarco_scenario.py +1 -1
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +144 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +142 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +277 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
- helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
- helm/benchmark/scenarios/newsqa_scenario.py +1 -1
- helm/benchmark/scenarios/numeracy_scenario.py +12 -2
- helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
- helm/benchmark/scenarios/omni_math_scenario.py +53 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
- helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
- helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
- helm/benchmark/scenarios/pubmed_qa_scenario.py +59 -43
- helm/benchmark/scenarios/quac_scenario.py +10 -1
- helm/benchmark/scenarios/race_based_med_scenario.py +152 -0
- helm/benchmark/scenarios/raft_scenario.py +17 -2
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
- helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
- helm/benchmark/scenarios/scenario.py +9 -1
- helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +7 -2
- helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +75 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +75 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +76 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +77 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +74 -0
- helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +81 -0
- helm/benchmark/scenarios/shc_sei_scenario.py +94 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +77 -0
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
- helm/benchmark/scenarios/spider_scenario.py +91 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +97 -0
- helm/benchmark/scenarios/summarization_scenario.py +11 -1
- helm/benchmark/scenarios/sumosum_scenario.py +157 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
- helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
- helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
- helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
- helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
- helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
- helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
- helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
- helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
- helm/benchmark/scenarios/test_math_scenario.py +1 -0
- helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
- helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
- helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
- helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
- helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
- helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
- helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
- helm/benchmark/scenarios/the_pile_scenario.py +1 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +12 -2
- helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
- helm/benchmark/scenarios/unitxt_scenario.py +8 -2
- helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
- helm/benchmark/scenarios/vicuna_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
- helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
- helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
- helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
- helm/benchmark/scenarios/wikifact_scenario.py +11 -1
- helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
- helm/benchmark/scenarios/wildbench_scenario.py +83 -0
- helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
- helm/benchmark/scenarios/xstest_scenario.py +1 -1
- helm/benchmark/server.py +13 -1
- helm/benchmark/slurm_runner.py +1 -1
- helm/benchmark/static/schema_audio.yaml +763 -0
- helm/benchmark/static/schema_autobencher.yaml +150 -0
- helm/benchmark/static/schema_call_center.yaml +97 -60
- helm/benchmark/static/{schema_medical.yaml → schema_capabilities.yaml} +100 -101
- helm/benchmark/static/schema_czech_bank.yaml +148 -0
- helm/benchmark/static/schema_enem_challenge.yaml +146 -0
- helm/benchmark/static/schema_enterprise.yaml +319 -0
- helm/benchmark/static/schema_finance.yaml +14 -12
- helm/benchmark/static/schema_heim.yaml +1389 -0
- helm/benchmark/static/schema_long_context.yaml +283 -0
- helm/benchmark/static/schema_medhelm.yaml +1140 -0
- helm/benchmark/static/schema_melt.yaml +1257 -0
- helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
- helm/benchmark/static/schema_safety.yaml +18 -1
- helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +30 -16
- helm/benchmark/static/schema_slphelm.yaml +162 -0
- helm/benchmark/static/schema_social_audio.yaml +224 -0
- helm/benchmark/static/schema_sql.yaml +171 -0
- helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +169 -36
- helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
- helm/benchmark/static/schema_vhelm.yaml +129 -56
- helm/benchmark/static/schema_video.yaml +219 -0
- helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
- helm/benchmark/static_build/assets/index-94295e78.js +10 -0
- helm/benchmark/static_build/assets/index-b9779128.css +1 -0
- helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
- helm/benchmark/static_build/assets/medhelm-v1-overview-3ddfcd65.png +0 -0
- helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
- helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
- helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-38a10867.js} +2 -2
- helm/benchmark/static_build/config.js +1 -1
- helm/benchmark/static_build/index.html +6 -6
- helm/benchmark/window_services/default_window_service.py +1 -1
- helm/benchmark/window_services/encoder_decoder_window_service.py +4 -4
- helm/benchmark/window_services/ice_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
- helm/benchmark/window_services/local_window_service.py +2 -2
- helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
- helm/benchmark/window_services/test_bloom_window_service.py +3 -3
- helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
- helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
- helm/benchmark/window_services/test_gptj_window_service.py +8 -3
- helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
- helm/benchmark/window_services/test_openai_window_service.py +8 -3
- helm/benchmark/window_services/test_opt_window_service.py +3 -3
- helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
- helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
- helm/benchmark/window_services/test_t511b_window_service.py +3 -3
- helm/benchmark/window_services/test_ul2_window_service.py +3 -3
- helm/benchmark/window_services/test_utils.py +4 -5
- helm/benchmark/window_services/test_yalm_window_service.py +3 -3
- helm/benchmark/window_services/tokenizer_service.py +7 -8
- helm/benchmark/window_services/yalm_window_service.py +1 -1
- helm/clients/ai21_client.py +3 -3
- helm/clients/aleph_alpha_client.py +1 -1
- helm/clients/anthropic_client.py +69 -29
- helm/clients/audio_language/__init__.py +0 -0
- helm/clients/audio_language/diva_llama_client.py +120 -0
- helm/clients/audio_language/llama_omni_client.py +198 -0
- helm/clients/audio_language/qwen2_5_omni_client.py +197 -0
- helm/clients/audio_language/qwen2_audiolm_client.py +190 -0
- helm/clients/audio_language/qwen_audiolm_client.py +152 -0
- helm/clients/audio_language/test.py +62 -0
- helm/clients/auto_client.py +4 -2
- helm/clients/azure_openai_client.py +55 -0
- helm/clients/bedrock_client.py +203 -7
- helm/clients/bedrock_utils.py +33 -0
- helm/clients/client.py +7 -7
- helm/clients/clip_scorers/clip_scorer.py +1 -1
- helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
- helm/clients/cohere_client.py +3 -3
- helm/clients/google_client.py +1 -1
- helm/clients/grok_client.py +36 -0
- helm/clients/http_model_client.py +1 -1
- helm/clients/huggingface_client.py +52 -21
- helm/clients/huggingface_pipeline_client.py +138 -0
- helm/clients/ibm_client.py +267 -0
- helm/clients/image_generation/adobe_vision_client.py +1 -1
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
- helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
- helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
- helm/clients/image_generation/cogview2_client.py +1 -1
- helm/clients/image_generation/dalle2_client.py +1 -1
- helm/clients/image_generation/dalle3_client.py +2 -2
- helm/clients/image_generation/dalle_mini/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/data.py +1 -1
- helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
- helm/clients/image_generation/dalle_mini/model/configuration.py +2 -2
- helm/clients/image_generation/dalle_mini/model/modeling.py +3 -3
- helm/clients/image_generation/dalle_mini/model/processor.py +5 -5
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +2 -2
- helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
- helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
- helm/clients/image_generation/dalle_mini_client.py +1 -1
- helm/clients/image_generation/deep_floyd_client.py +1 -1
- helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
- helm/clients/image_generation/lexica_client.py +1 -1
- helm/clients/image_generation/mindalle/models/__init__.py +6 -6
- helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
- helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
- helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
- helm/clients/image_generation/mindalle_client.py +1 -1
- helm/clients/image_generation/together_image_generation_client.py +1 -1
- helm/clients/lit_gpt_client.py +2 -2
- helm/clients/mistral_client.py +62 -18
- helm/clients/nvidia_nim_client.py +0 -3
- helm/clients/openai_client.py +308 -43
- helm/clients/openai_responses_client.py +174 -0
- helm/clients/palmyra_client.py +3 -9
- helm/clients/reka_client.py +3 -3
- helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
- helm/clients/stanfordhealthcare_claude_client.py +31 -0
- helm/clients/stanfordhealthcare_google_client.py +43 -0
- helm/clients/stanfordhealthcare_http_model_client.py +93 -0
- helm/clients/stanfordhealthcare_openai_client.py +62 -0
- helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
- helm/clients/test_client.py +1 -1
- helm/clients/test_together_client.py +6 -1
- helm/clients/together_client.py +76 -9
- helm/clients/upstage_client.py +23 -0
- helm/clients/vertexai_client.py +45 -13
- helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
- helm/clients/vision_language/huggingface_vlm_client.py +2 -2
- helm/clients/vision_language/idefics_client.py +6 -2
- helm/clients/vision_language/open_flamingo/__init__.py +2 -2
- helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
- helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
- helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
- helm/clients/vision_language/paligemma_client.py +2 -2
- helm/clients/vision_language/qwen2_vlm_client.py +188 -0
- helm/clients/vision_language/qwen_vlm_client.py +7 -5
- helm/clients/vllm_client.py +4 -6
- helm/clients/writer_client.py +102 -0
- helm/clients/yi_client.py +0 -3
- helm/common/audio_utils.py +111 -0
- helm/common/context.py +80 -0
- helm/common/credentials_utils.py +5 -5
- helm/common/file_caches/local_file_cache.py +1 -1
- helm/common/file_caches/test_local_file_cache.py +1 -1
- helm/common/general.py +9 -2
- helm/common/hierarchical_logger.py +46 -3
- helm/common/images_utils.py +2 -2
- helm/common/local_context.py +140 -0
- helm/common/media_object.py +2 -2
- helm/common/multimodal_request_utils.py +26 -0
- helm/common/reeval_parameters.py +12 -0
- helm/common/remote_context.py +61 -0
- helm/common/request.py +14 -2
- helm/common/response_format.py +18 -0
- helm/common/test_media_object.py +1 -1
- helm/config/model_deployments.yaml +1792 -28
- helm/config/model_metadata.yaml +1606 -51
- helm/config/tokenizer_configs.yaml +521 -4
- helm/proxy/cli.py +5 -3
- helm/proxy/critique/mechanical_turk_utils.py +1 -1
- helm/proxy/example_queries.py +1 -1
- helm/proxy/server.py +11 -4
- helm/proxy/services/remote_service.py +1 -1
- helm/proxy/services/server_service.py +22 -86
- helm/proxy/services/test_remote_service.py +2 -2
- helm/proxy/services/test_service.py +1 -1
- helm/proxy/static/general.js +122 -0
- helm/proxy/static/help.html +99 -0
- helm/proxy/static/index.css +57 -0
- helm/proxy/static/index.html +40 -0
- helm/proxy/static/index.js +456 -0
- helm/proxy/static/info-icon.png +0 -0
- helm/proxy/test_retry.py +1 -1
- helm/proxy/token_counters/auto_token_counter.py +1 -1
- helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
- helm/tokenizers/caching_tokenizer.py +2 -30
- helm/tokenizers/grok_tokenizer.py +53 -0
- helm/tokenizers/http_model_tokenizer.py +1 -1
- helm/tokenizers/huggingface_tokenizer.py +3 -3
- helm/tokenizers/lit_gpt_tokenizer.py +1 -1
- helm/tokenizers/test_anthropic_tokenizer.py +6 -2
- helm/tokenizers/test_grok_tokenizer.py +33 -0
- helm/tokenizers/test_huggingface_tokenizer.py +1 -1
- helm/tokenizers/test_yalm_tokenizer.py +1 -1
- helm/tokenizers/tiktoken_tokenizer.py +1 -1
- helm/tokenizers/tokenizer.py +3 -1
- helm/tokenizers/yalm_tokenizer.py +3 -3
- helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
- crfm_helm-0.5.4.dist-info/METADATA +0 -350
- crfm_helm-0.5.4.dist-info/RECORD +0 -697
- helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
- helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
- helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
- helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
- helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
- helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
- helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
- helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
- helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
- helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
- helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
- helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
- helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
- helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
- helm/benchmark/static_build/assets/index-3ee38b3d.js +0 -10
- helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
- helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
- helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
- helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
- helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
- helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
- helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
- helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
- helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
- helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
- helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
- helm/tokenizers/anthropic_tokenizer.py +0 -52
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info/licenses}/LICENSE +0 -0
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/top_level.txt +0 -0
helm/proxy/server.py
CHANGED
|
@@ -6,6 +6,7 @@ Look at `index.js` to see how the functionality is invoked.
|
|
|
6
6
|
from urllib.parse import unquote_plus
|
|
7
7
|
import argparse
|
|
8
8
|
import dataclasses
|
|
9
|
+
import importlib_resources as resources
|
|
9
10
|
import json
|
|
10
11
|
import os
|
|
11
12
|
import sys
|
|
@@ -29,9 +30,9 @@ from helm.common.perspective_api_request import PerspectiveAPIRequest
|
|
|
29
30
|
from helm.common.moderations_api_request import ModerationAPIRequest
|
|
30
31
|
from helm.common.tokenization_request import TokenizationRequest, DecodeRequest
|
|
31
32
|
from helm.proxy.services.service import CACHE_DIR
|
|
32
|
-
from .accounts import Account
|
|
33
|
-
from .services.server_service import ServerService
|
|
34
|
-
from .query import Query
|
|
33
|
+
from helm.proxy.accounts import Account
|
|
34
|
+
from helm.proxy.services.server_service import ServerService
|
|
35
|
+
from helm.proxy.query import Query
|
|
35
36
|
|
|
36
37
|
try:
|
|
37
38
|
import gunicorn # noqa
|
|
@@ -86,7 +87,7 @@ def handle_root():
|
|
|
86
87
|
|
|
87
88
|
@app.get("/static/<filename:path>")
|
|
88
89
|
def handle_static_filename(filename):
|
|
89
|
-
resp = bottle.static_file(filename, root=
|
|
90
|
+
resp = bottle.static_file(filename, root=app.config["helm.staticpath"])
|
|
90
91
|
resp.add_header("Cache-Control", "no-store, must-revalidate ")
|
|
91
92
|
return resp
|
|
92
93
|
|
|
@@ -284,6 +285,12 @@ def main():
|
|
|
284
285
|
ensure_directory_exists(sqlite_cache_path)
|
|
285
286
|
cache_backend_config = SqliteCacheBackendConfig(sqlite_cache_path)
|
|
286
287
|
|
|
288
|
+
static_package_name = "helm.proxy.static"
|
|
289
|
+
resource_path = resources.files(static_package_name).joinpath("index.html")
|
|
290
|
+
with resources.as_file(resource_path) as resource_filename:
|
|
291
|
+
static_path = str(resource_filename.parent)
|
|
292
|
+
app.config["helm.staticpath"] = static_path
|
|
293
|
+
|
|
287
294
|
service = ServerService(base_path=args.base_path, cache_backend_config=cache_backend_config)
|
|
288
295
|
|
|
289
296
|
gunicorn_args = {
|
|
@@ -24,7 +24,7 @@ from helm.common.request import Request, RequestResult
|
|
|
24
24
|
from dacite import from_dict
|
|
25
25
|
from helm.proxy.accounts import Account
|
|
26
26
|
from helm.proxy.query import Query, QueryResult
|
|
27
|
-
from .service import Service, GeneralInfo
|
|
27
|
+
from helm.proxy.services.service import Service, GeneralInfo
|
|
28
28
|
|
|
29
29
|
|
|
30
30
|
class RemoteServiceError(Exception):
|
|
@@ -1,8 +1,8 @@
|
|
|
1
|
-
import dataclasses
|
|
2
1
|
import os
|
|
3
2
|
import signal
|
|
4
|
-
from typing import List
|
|
3
|
+
from typing import List
|
|
5
4
|
|
|
5
|
+
from helm.common.local_context import LocalContext
|
|
6
6
|
from helm.common.cache import CacheConfig
|
|
7
7
|
from helm.common.cache_backend_config import CacheBackendConfig, BlackHoleCacheBackendConfig
|
|
8
8
|
from helm.common.critique_request import CritiqueRequest, CritiqueRequestResult
|
|
@@ -11,7 +11,6 @@ from helm.common.moderations_api_request import ModerationAPIRequest, Moderation
|
|
|
11
11
|
from helm.common.clip_score_request import CLIPScoreRequest, CLIPScoreResult
|
|
12
12
|
from helm.common.nudity_check_request import NudityCheckRequest, NudityCheckResult
|
|
13
13
|
from helm.common.file_upload_request import FileUploadRequest, FileUploadResult
|
|
14
|
-
from helm.common.general import ensure_directory_exists, parse_hocon, get_credentials
|
|
15
14
|
from helm.common.perspective_api_request import PerspectiveAPIRequest, PerspectiveAPIRequestResult
|
|
16
15
|
from helm.common.tokenization_request import (
|
|
17
16
|
TokenizationRequest,
|
|
@@ -22,27 +21,13 @@ from helm.common.tokenization_request import (
|
|
|
22
21
|
from helm.common.request import Request, RequestResult
|
|
23
22
|
from helm.common.hierarchical_logger import hlog
|
|
24
23
|
from helm.proxy.accounts import Accounts, Account
|
|
25
|
-
from helm.clients.auto_client import AutoClient
|
|
26
|
-
from helm.clients.moderation_api_client import ModerationAPIClient
|
|
27
|
-
from helm.clients.image_generation.nudity_check_client import NudityCheckClient
|
|
28
|
-
from helm.clients.gcs_client import GCSClient
|
|
29
|
-
from helm.clients.clip_score_client import CLIPScoreClient
|
|
30
|
-
from helm.clients.toxicity_classifier_client import ToxicityClassifierClient
|
|
31
|
-
from helm.proxy.example_queries import example_queries
|
|
32
|
-
from helm.benchmark.model_metadata_registry import ALL_MODELS_METADATA
|
|
33
24
|
from helm.benchmark.model_deployment_registry import get_model_deployment_host_organization
|
|
34
25
|
from helm.proxy.query import Query, QueryResult
|
|
35
|
-
from helm.proxy.retry import retry_request
|
|
36
26
|
from helm.proxy.token_counters.auto_token_counter import AutoTokenCounter
|
|
37
|
-
from helm.
|
|
38
|
-
from .service import (
|
|
27
|
+
from helm.proxy.services.service import (
|
|
39
28
|
Service,
|
|
40
|
-
CACHE_DIR,
|
|
41
29
|
ACCOUNTS_FILE,
|
|
42
30
|
GeneralInfo,
|
|
43
|
-
VERSION,
|
|
44
|
-
expand_environments,
|
|
45
|
-
synthesize_request,
|
|
46
31
|
)
|
|
47
32
|
|
|
48
33
|
|
|
@@ -57,43 +42,17 @@ class ServerService(Service):
|
|
|
57
42
|
root_mode: bool = False,
|
|
58
43
|
cache_backend_config: CacheBackendConfig = BlackHoleCacheBackendConfig(),
|
|
59
44
|
):
|
|
60
|
-
ensure_directory_exists(base_path)
|
|
61
|
-
client_file_storage_path = os.path.join(base_path, CACHE_DIR)
|
|
62
|
-
ensure_directory_exists(client_file_storage_path)
|
|
63
|
-
|
|
64
|
-
credentials = get_credentials(base_path)
|
|
65
45
|
accounts_path = os.path.join(base_path, ACCOUNTS_FILE)
|
|
66
46
|
|
|
67
|
-
self.
|
|
68
|
-
self.
|
|
69
|
-
self.tokenizer = AutoTokenizer(credentials, cache_backend_config)
|
|
70
|
-
self.token_counter = AutoTokenCounter(self.tokenizer)
|
|
47
|
+
self.context = LocalContext(base_path, cache_backend_config)
|
|
48
|
+
self.token_counter = AutoTokenCounter(self.context.tokenizer)
|
|
71
49
|
self.accounts = Accounts(accounts_path, root_mode=root_mode)
|
|
72
50
|
|
|
73
|
-
# Lazily instantiate the following clients
|
|
74
|
-
self.moderation_api_client: Optional[ModerationAPIClient] = None
|
|
75
|
-
self.toxicity_classifier_client: Optional[ToxicityClassifierClient] = None
|
|
76
|
-
self.perspective_api_client: Optional[ToxicityClassifierClient] = None
|
|
77
|
-
self.nudity_check_client: Optional[NudityCheckClient] = None
|
|
78
|
-
self.clip_score_client: Optional[CLIPScoreClient] = None
|
|
79
|
-
self.gcs_client: Optional[GCSClient] = None
|
|
80
|
-
|
|
81
51
|
def get_general_info(self) -> GeneralInfo:
|
|
82
|
-
|
|
83
|
-
# TODO(#2158): Either fix this or delete get_general_info.
|
|
84
|
-
all_models = [dataclasses.replace(model_metadata, release_date=None) for model_metadata in ALL_MODELS_METADATA]
|
|
85
|
-
return GeneralInfo(version=VERSION, example_queries=example_queries, all_models=all_models)
|
|
52
|
+
return self.context.get_general_info()
|
|
86
53
|
|
|
87
54
|
def expand_query(self, query: Query) -> QueryResult:
|
|
88
|
-
|
|
89
|
-
prompt = query.prompt
|
|
90
|
-
settings = query.settings
|
|
91
|
-
environments = parse_hocon(query.environments)
|
|
92
|
-
requests = []
|
|
93
|
-
for environment in expand_environments(environments):
|
|
94
|
-
request = synthesize_request(prompt, settings, environment)
|
|
95
|
-
requests.append(request)
|
|
96
|
-
return QueryResult(requests=requests)
|
|
55
|
+
return self.context.expand_query(query)
|
|
97
56
|
|
|
98
57
|
def _get_model_group_for_model_deployment(self, model_deployment: str) -> str:
|
|
99
58
|
if model_deployment.startswith("openai/"):
|
|
@@ -105,7 +64,11 @@ class ServerService(Service):
|
|
|
105
64
|
return "gpt4"
|
|
106
65
|
elif model_deployment.startswith("openai/gpt-3"):
|
|
107
66
|
return "gpt3"
|
|
108
|
-
elif
|
|
67
|
+
elif (
|
|
68
|
+
model_deployment.startswith("openai/o1")
|
|
69
|
+
or model_deployment.startswith("openai/o3")
|
|
70
|
+
or model_deployment.startswith("openai/o4")
|
|
71
|
+
):
|
|
109
72
|
return "o1"
|
|
110
73
|
else:
|
|
111
74
|
return "openai"
|
|
@@ -126,7 +89,7 @@ class ServerService(Service):
|
|
|
126
89
|
self.accounts.check_can_use(auth.api_key, model_group)
|
|
127
90
|
|
|
128
91
|
# Use!
|
|
129
|
-
request_result: RequestResult = self.
|
|
92
|
+
request_result: RequestResult = self.context.make_request(request)
|
|
130
93
|
|
|
131
94
|
# Only deduct if not cached
|
|
132
95
|
if not request_result.cached:
|
|
@@ -139,66 +102,39 @@ class ServerService(Service):
|
|
|
139
102
|
def tokenize(self, auth: Authentication, request: TokenizationRequest) -> TokenizationRequestResult:
|
|
140
103
|
"""Tokenize via an API."""
|
|
141
104
|
self.accounts.authenticate(auth)
|
|
142
|
-
return self.
|
|
105
|
+
return self.context.tokenize(request)
|
|
143
106
|
|
|
144
107
|
def decode(self, auth: Authentication, request: DecodeRequest) -> DecodeRequestResult:
|
|
145
108
|
"""Decodes to text."""
|
|
146
109
|
self.accounts.authenticate(auth)
|
|
147
|
-
return self.
|
|
110
|
+
return self.context.decode(request)
|
|
148
111
|
|
|
149
112
|
def upload(self, auth: Authentication, request: FileUploadRequest) -> FileUploadResult:
|
|
150
113
|
"""Uploads a file to external storage."""
|
|
151
114
|
self.accounts.authenticate(auth)
|
|
152
|
-
|
|
153
|
-
if not self.gcs_client:
|
|
154
|
-
self.gcs_client = self.client.get_gcs_client()
|
|
155
|
-
|
|
156
|
-
assert self.gcs_client
|
|
157
|
-
return self.gcs_client.upload(request)
|
|
115
|
+
return self.context.upload(request)
|
|
158
116
|
|
|
159
117
|
def check_nudity(self, auth: Authentication, request: NudityCheckRequest) -> NudityCheckResult:
|
|
160
118
|
"""Check for nudity."""
|
|
161
119
|
self.accounts.authenticate(auth)
|
|
162
|
-
|
|
163
|
-
if not self.nudity_check_client:
|
|
164
|
-
self.nudity_check_client = self.client.get_nudity_check_client()
|
|
165
|
-
|
|
166
|
-
assert self.nudity_check_client
|
|
167
|
-
return self.nudity_check_client.check_nudity(request)
|
|
120
|
+
return self.context.check_nudity(request)
|
|
168
121
|
|
|
169
122
|
def compute_clip_score(self, auth: Authentication, request: CLIPScoreRequest) -> CLIPScoreResult:
|
|
170
123
|
"""Computes CLIPScore for a given caption and image."""
|
|
171
124
|
self.accounts.authenticate(auth)
|
|
172
|
-
|
|
173
|
-
if not self.clip_score_client:
|
|
174
|
-
self.clip_score_client = self.client.get_clip_score_client()
|
|
175
|
-
|
|
176
|
-
assert self.clip_score_client
|
|
177
|
-
return self.clip_score_client.compute_score(request)
|
|
125
|
+
return self.context.compute_clip_score(request)
|
|
178
126
|
|
|
179
127
|
def get_toxicity_scores(self, auth: Authentication, request: PerspectiveAPIRequest) -> PerspectiveAPIRequestResult:
|
|
180
|
-
@retry_request
|
|
181
|
-
def get_toxicity_scores_with_retry(request: PerspectiveAPIRequest) -> PerspectiveAPIRequestResult:
|
|
182
|
-
if not self.toxicity_classifier_client:
|
|
183
|
-
self.toxicity_classifier_client = self.client.get_toxicity_classifier_client()
|
|
184
|
-
return self.toxicity_classifier_client.get_toxicity_scores(request)
|
|
185
|
-
|
|
186
128
|
self.accounts.authenticate(auth)
|
|
187
|
-
return
|
|
129
|
+
return self.context.get_toxicity_scores(request)
|
|
188
130
|
|
|
189
131
|
def get_moderation_results(self, auth: Authentication, request: ModerationAPIRequest) -> ModerationAPIRequestResult:
|
|
190
|
-
@retry_request
|
|
191
|
-
def get_moderation_results_with_retry(request: ModerationAPIRequest) -> ModerationAPIRequestResult:
|
|
192
|
-
if not self.moderation_api_client:
|
|
193
|
-
self.moderation_api_client = self.client.get_moderation_api_client()
|
|
194
|
-
return self.moderation_api_client.get_moderation_results(request)
|
|
195
|
-
|
|
196
132
|
self.accounts.authenticate(auth)
|
|
197
|
-
return
|
|
133
|
+
return self.context.get_moderation_results(request)
|
|
198
134
|
|
|
199
135
|
def make_critique_request(self, auth: Authentication, request: CritiqueRequest) -> CritiqueRequestResult:
|
|
200
136
|
self.accounts.authenticate(auth)
|
|
201
|
-
return self.
|
|
137
|
+
return self.context.make_critique_request(request)
|
|
202
138
|
|
|
203
139
|
def create_account(self, auth: Authentication) -> Account:
|
|
204
140
|
"""Creates a new account."""
|
|
@@ -233,4 +169,4 @@ class ServerService(Service):
|
|
|
233
169
|
hlog("Done.")
|
|
234
170
|
|
|
235
171
|
def get_cache_config(self, shard_name: str) -> CacheConfig:
|
|
236
|
-
return self.
|
|
172
|
+
return self.context.get_cache_config(shard_name)
|
|
@@ -18,8 +18,8 @@ from helm.common.authentication import Authentication
|
|
|
18
18
|
from helm.common.request import Request, RequestResult
|
|
19
19
|
from helm.common.tokenization_request import TokenizationRequest, TokenizationRequestResult
|
|
20
20
|
from helm.proxy.accounts import Account, set_default_quotas
|
|
21
|
-
from .remote_service import RemoteService
|
|
22
|
-
from .service import ACCOUNTS_FILE
|
|
21
|
+
from helm.proxy.services.remote_service import RemoteService
|
|
22
|
+
from helm.proxy.services.service import ACCOUNTS_FILE
|
|
23
23
|
|
|
24
24
|
|
|
25
25
|
@dataclass(frozen=True)
|
|
@@ -8,7 +8,7 @@ from helm.common.authentication import Authentication
|
|
|
8
8
|
from helm.common.request import Request
|
|
9
9
|
from helm.proxy.accounts import AuthenticationError, Accounts
|
|
10
10
|
from helm.proxy.query import Query
|
|
11
|
-
from .server_service import ServerService
|
|
11
|
+
from helm.proxy.services.server_service import ServerService
|
|
12
12
|
|
|
13
13
|
|
|
14
14
|
def get_authentication():
|
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
function assert(condition, message) {
|
|
2
|
+
if (!condition) {
|
|
3
|
+
throw message || "Assertion failed";
|
|
4
|
+
}
|
|
5
|
+
}
|
|
6
|
+
|
|
7
|
+
function encodeUrlParams(params) {
|
|
8
|
+
let s = "";
|
|
9
|
+
for (let k in params) {
|
|
10
|
+
if (params[k] != null) {
|
|
11
|
+
s += (s === "" ? "?" : "&") + k + "=" + encodeURIComponent(params[k]);
|
|
12
|
+
}
|
|
13
|
+
}
|
|
14
|
+
return s;
|
|
15
|
+
}
|
|
16
|
+
|
|
17
|
+
function decodeUrlParams(str) {
|
|
18
|
+
const params = {};
|
|
19
|
+
if (str === "") return params;
|
|
20
|
+
const items = str.substring(1).split(/&/);
|
|
21
|
+
for (let i = 0; i < items.length; i++) {
|
|
22
|
+
const pair = items[i].split(/=/);
|
|
23
|
+
params[pair[0]] = decodeURIComponent(pair[1]);
|
|
24
|
+
}
|
|
25
|
+
return params;
|
|
26
|
+
}
|
|
27
|
+
|
|
28
|
+
function updateBrowserLocation(params) {
|
|
29
|
+
// Update the address bar
|
|
30
|
+
window.history.pushState(
|
|
31
|
+
{},
|
|
32
|
+
"",
|
|
33
|
+
window.location.pathname + encodeUrlParams(params),
|
|
34
|
+
);
|
|
35
|
+
}
|
|
36
|
+
|
|
37
|
+
function createCookie(key, value, days) {
|
|
38
|
+
let expires = "";
|
|
39
|
+
if (days) {
|
|
40
|
+
const date = new Date();
|
|
41
|
+
date.setTime(date.getTime() + days * 24 * 60 * 60 * 1000);
|
|
42
|
+
expires = "; expires=" + date.toUTCString();
|
|
43
|
+
}
|
|
44
|
+
document.cookie = key + "=" + value + expires + "; path=/";
|
|
45
|
+
}
|
|
46
|
+
|
|
47
|
+
function readCookie(key) {
|
|
48
|
+
let tokens = document.cookie.split(";");
|
|
49
|
+
for (let i = 0; i < tokens.length; i++) {
|
|
50
|
+
const [k, v] = tokens[i].trim().split("=", 2);
|
|
51
|
+
if (key === k) return v;
|
|
52
|
+
}
|
|
53
|
+
return null;
|
|
54
|
+
}
|
|
55
|
+
|
|
56
|
+
function eraseCookie(key) {
|
|
57
|
+
createCookie(key, "", -1);
|
|
58
|
+
}
|
|
59
|
+
|
|
60
|
+
function renderTimestamp(timestamp) {
|
|
61
|
+
if (!timestamp) return null;
|
|
62
|
+
const d = new Date(timestamp * 1000);
|
|
63
|
+
return d.toLocaleString();
|
|
64
|
+
}
|
|
65
|
+
|
|
66
|
+
function renderDict(data) {
|
|
67
|
+
return JSON.stringify(data).substring(0, 10000);
|
|
68
|
+
}
|
|
69
|
+
|
|
70
|
+
function loadScript(src, onload, onerror) {
|
|
71
|
+
// Using jquery doesn't work, so do it in with our bare hands.
|
|
72
|
+
const s = document.createElement("script");
|
|
73
|
+
s.src = src;
|
|
74
|
+
s.onload = onload;
|
|
75
|
+
s.onerror = onerror;
|
|
76
|
+
document.head.appendChild(s);
|
|
77
|
+
}
|
|
78
|
+
|
|
79
|
+
function getRandomString() {
|
|
80
|
+
const vocab =
|
|
81
|
+
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789";
|
|
82
|
+
let text = "";
|
|
83
|
+
for (let i = 0; i < 6; i++)
|
|
84
|
+
text += vocab.charAt(Math.floor(Math.random() * vocab.length));
|
|
85
|
+
return text;
|
|
86
|
+
}
|
|
87
|
+
|
|
88
|
+
function round(x, n) {
|
|
89
|
+
const base = Math.pow(10, n);
|
|
90
|
+
return Math.round(x * base) / base;
|
|
91
|
+
}
|
|
92
|
+
|
|
93
|
+
function multilineHtml(s) {
|
|
94
|
+
return s.replace(/\n/g, "<br>");
|
|
95
|
+
}
|
|
96
|
+
|
|
97
|
+
function renderError(e) {
|
|
98
|
+
return $("<div>").addClass("alert alert-danger").append(multilineHtml(e));
|
|
99
|
+
}
|
|
100
|
+
|
|
101
|
+
function helpIcon(help, link) {
|
|
102
|
+
// Show a ?
|
|
103
|
+
return $("<a>", { href: link, target: "blank_", class: "help-icon" }).append(
|
|
104
|
+
$("<img>", { src: "info-icon.png", width: 15, title: help }),
|
|
105
|
+
);
|
|
106
|
+
}
|
|
107
|
+
|
|
108
|
+
const markdownConverter = new showdown.Converter({ optionKey: "value" });
|
|
109
|
+
function renderMarkdown(markdown) {
|
|
110
|
+
return markdown && markdownConverter.makeHtml(markdown);
|
|
111
|
+
}
|
|
112
|
+
|
|
113
|
+
function refreshHashLocation() {
|
|
114
|
+
// If we request a hash location (URL contains #foo), the problem is #foo
|
|
115
|
+
// might not exist (since it's generated). Call this function to jump to the
|
|
116
|
+
// hash location once all the anchors are generated.
|
|
117
|
+
if (location.hash) {
|
|
118
|
+
const hash = location.hash;
|
|
119
|
+
location.hash = "";
|
|
120
|
+
location.hash = hash;
|
|
121
|
+
}
|
|
122
|
+
}
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
<!DOCTYPE html>
|
|
2
|
+
<html lang="en">
|
|
3
|
+
<head>
|
|
4
|
+
<title>CRFM Models API</title>
|
|
5
|
+
<meta charset="utf-8">
|
|
6
|
+
<meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
|
|
7
|
+
<link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/4.0.0-beta/css/bootstrap.min.css" integrity="sha384-/Y6pD6FV/Vv2HJnA6t+vslU6fwYXjCFtcEpHbNJ0lyAFsXTsjBbfaDjzALeQsN6M" crossorigin="anonymous">
|
|
8
|
+
<link rel="stylesheet" type="text/css" href="index.css">
|
|
9
|
+
</head>
|
|
10
|
+
|
|
11
|
+
<body>
|
|
12
|
+
<div class="container-fluid">
|
|
13
|
+
<nav class="navbar navbar-expand-sm navbar-light bg-faded">
|
|
14
|
+
<button class="navbar-toggler" type="button" data-toggle="collapse" data-target="#nav-content" aria-controls="nav-content" aria-expanded="false" aria-label="Toggle navigation">
|
|
15
|
+
<span class="navbar-toggler-icon"></span>
|
|
16
|
+
</button>
|
|
17
|
+
|
|
18
|
+
<div class="collapse navbar-collapse" id="nav-content">
|
|
19
|
+
<ul class="navbar-nav">
|
|
20
|
+
<li class="nav-item"><a class="nav-link" href="index.html">Query</a></li>
|
|
21
|
+
<li class="nav-item"><a class="nav-link active" href="help.html">Help</a></li>
|
|
22
|
+
<li class="nav-item" id="loginInfo"></li>
|
|
23
|
+
</ul>
|
|
24
|
+
</div>
|
|
25
|
+
</nav>
|
|
26
|
+
|
|
27
|
+
<div class="row">
|
|
28
|
+
<div class="col-sm-12">
|
|
29
|
+
<p>
|
|
30
|
+
This web interface and API allows you to experiment with multiple large language models using a unified interface.
|
|
31
|
+
To log in, you need an API key.
|
|
32
|
+
</p>
|
|
33
|
+
<h2>Query</h2>
|
|
34
|
+
<ul>
|
|
35
|
+
<li>In the web interface, you can enter a <b>query</b>, which consists of the following components:
|
|
36
|
+
<ul>
|
|
37
|
+
<li><b>prompt</b>, which is what text we want to feed into the language model. The prompt can have variables (e.g., <tt>${name}</tt>) which are filled in later.</li>
|
|
38
|
+
<li><b>settings</b>, which configures how we're going to call the backend API (HOCON format):
|
|
39
|
+
<ul>
|
|
40
|
+
<li><tt>model</tt>: which model to query; options are:
|
|
41
|
+
<div id="help-models" />
|
|
42
|
+
</li>
|
|
43
|
+
<li><tt>temperature</tt>: a non-negative number determining amount of stochasticity (e.g., <tt>1</tt> is sampling from the model, <tt>0</tt> is returning the maximum probability output)</li>
|
|
44
|
+
<li><tt>num_completions</tt>: number of completions (sequences, independent sampled) to return</li>
|
|
45
|
+
<li><tt>top_k_per_token</tt>: number of candidates per token position in each completion</li>
|
|
46
|
+
<li><tt>max_tokens</tt>: maximum number of tokens before generation stops</li>
|
|
47
|
+
<li><tt>stop_sequences</tt>: list of strings that will stop generation (e.g., <tt>'.'</tt> or <tt>'\n'</tt>)</li>
|
|
48
|
+
<li><tt>echo_prompt</tt>: Whether to include the prompt as a prefix of the completion</li>
|
|
49
|
+
<li><tt>top_p</tt>: An alternative to sampling with temperature, called nucleus sampling, where the model considers the results of the tokens with top_p probability mass. So 0.1 means only the tokens comprising the top 10% probability mass are considered.</li>
|
|
50
|
+
<li><tt>presence_penalty</tt>: Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics. (OpenAI only)</li>
|
|
51
|
+
<li><tt>frequency_penalty</tt>: Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim. (OpenAI only)</li>
|
|
52
|
+
</ul>
|
|
53
|
+
Settings can also have variables in them (e.g., <tt>temperature</tt>).
|
|
54
|
+
</li>
|
|
55
|
+
<li><b>environments</b>, which specifies for each variable, a list of values (HOCON format).
|
|
56
|
+
</ul>
|
|
57
|
+
</li>
|
|
58
|
+
<li>
|
|
59
|
+
When the query is submitted,
|
|
60
|
+
we consider all possible assignments of values to variables.
|
|
61
|
+
For example:
|
|
62
|
+
<ul>
|
|
63
|
+
<li>environments has <tt>name: [Boston, New York]</tt> and <tt>temperature: [0, 1]</tt></li>
|
|
64
|
+
<li>prompt is <tt>${name} is a</tt></li>
|
|
65
|
+
<li>settings is <tt>temperature: ${temperature}</tt></li>
|
|
66
|
+
</ul>
|
|
67
|
+
This gives rise to 4 <b>requests</b>:
|
|
68
|
+
<ul>
|
|
69
|
+
<li>prompt: <tt>Boston is a</tt>, temperature: <tt>0</tt></li>
|
|
70
|
+
<li>prompt: <tt>Boston is a</tt>, temperature: <tt>1</tt></li>
|
|
71
|
+
<li>prompt: <tt>New York is a</tt>, temperature: <tt>0</tt></li>
|
|
72
|
+
<li>prompt: <tt>New York is a</tt>, temperature: <tt>1</tt></li>
|
|
73
|
+
</ul>
|
|
74
|
+
</li>
|
|
75
|
+
</ul>
|
|
76
|
+
|
|
77
|
+
<h2><a name="quotas">Quotas</a></h2>
|
|
78
|
+
<p>
|
|
79
|
+
For each model group (e.g., <tt>gpt3</tt>) and time granularity
|
|
80
|
+
(e.g., <tt>daily</tt>, <tt>monthly</tt>, <tt>total</tt>), you are
|
|
81
|
+
given a quota of a certain number of tokens.
|
|
82
|
+
Once you go over that number, you won't be able to use the API.
|
|
83
|
+
However, note that requests that have already been made (by you or
|
|
84
|
+
anyone) that are cached are not counted towards your quota.
|
|
85
|
+
For example, if your daily quota for <tt>gpt3</tt> is <tt>10000</tt>,
|
|
86
|
+
that means each day, you will get 10000 tokens.
|
|
87
|
+
</p>
|
|
88
|
+
</div>
|
|
89
|
+
</div>
|
|
90
|
+
</div>
|
|
91
|
+
|
|
92
|
+
<script src="https://code.jquery.com/jquery-3.2.1.min.js"></script>
|
|
93
|
+
<script src="https://cdnjs.cloudflare.com/ajax/libs/popper.js/1.11.0/umd/popper.min.js" integrity="sha384-b/U6ypiBEHpOf/4+1nzFpr53nxSS+GLCkfwBdFNTxtclqqenISfwAzpKaMNFNmj4" crossorigin="anonymous"></script>
|
|
94
|
+
<script src="https://maxcdn.bootstrapcdn.com/bootstrap/4.0.0-beta/js/bootstrap.min.js" integrity="sha384-h0AbiXch4ZDo7tp9hKZ4TsHbi047NrKGLO3SEJAg45jXxnGIfYzk4Si90RDIqNm1" crossorigin="anonymous"></script>
|
|
95
|
+
<script src="https://cdnjs.cloudflare.com/ajax/libs/jquery.tablesorter/2.29.0/js/jquery.tablesorter.min.js"></script>
|
|
96
|
+
<script src="general.js"></script>
|
|
97
|
+
<script src="index.js"></script>
|
|
98
|
+
</body>
|
|
99
|
+
</html>
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
.warning {
|
|
2
|
+
color: red;
|
|
3
|
+
}
|
|
4
|
+
|
|
5
|
+
.active {
|
|
6
|
+
background-color: #f0f0f0;
|
|
7
|
+
}
|
|
8
|
+
|
|
9
|
+
.examples-block {
|
|
10
|
+
margin-left: 120px;
|
|
11
|
+
margin-top: 10px;
|
|
12
|
+
margin-bottom: 10px;
|
|
13
|
+
}
|
|
14
|
+
|
|
15
|
+
.request {
|
|
16
|
+
background-color: #f0f0f0;
|
|
17
|
+
margin-top: 20px;
|
|
18
|
+
margin-bottom: 20px;
|
|
19
|
+
}
|
|
20
|
+
|
|
21
|
+
.completion {
|
|
22
|
+
border: solid 1px;
|
|
23
|
+
border-color: #c0c0c0;
|
|
24
|
+
padding: 3px;
|
|
25
|
+
margin-top: 5px;
|
|
26
|
+
margin-bottom: 5px;
|
|
27
|
+
}
|
|
28
|
+
|
|
29
|
+
.completion:hover {
|
|
30
|
+
background-color: #e8e8e8;
|
|
31
|
+
}
|
|
32
|
+
|
|
33
|
+
.metadata {
|
|
34
|
+
float: right;
|
|
35
|
+
font-style: italic;
|
|
36
|
+
}
|
|
37
|
+
|
|
38
|
+
.token:hover {
|
|
39
|
+
background-color: lightgreen;
|
|
40
|
+
}
|
|
41
|
+
|
|
42
|
+
.block {
|
|
43
|
+
margin: 15px;
|
|
44
|
+
padding: 15px;
|
|
45
|
+
border: solid 1px;
|
|
46
|
+
border-color: lightgray;
|
|
47
|
+
}
|
|
48
|
+
|
|
49
|
+
td {
|
|
50
|
+
text-align: left;
|
|
51
|
+
vertical-align: top;
|
|
52
|
+
}
|
|
53
|
+
|
|
54
|
+
.help-icon {
|
|
55
|
+
margin-left: 3px;
|
|
56
|
+
margin-right: 3px;
|
|
57
|
+
}
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
<!DOCTYPE html>
|
|
2
|
+
<html lang="en">
|
|
3
|
+
<head>
|
|
4
|
+
<title>CRFM Models API</title>
|
|
5
|
+
<meta charset="utf-8">
|
|
6
|
+
<meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
|
|
7
|
+
<link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/4.0.0-beta/css/bootstrap.min.css" integrity="sha384-/Y6pD6FV/Vv2HJnA6t+vslU6fwYXjCFtcEpHbNJ0lyAFsXTsjBbfaDjzALeQsN6M" crossorigin="anonymous">
|
|
8
|
+
<link rel="stylesheet" type="text/css" href="index.css">
|
|
9
|
+
</head>
|
|
10
|
+
|
|
11
|
+
<body>
|
|
12
|
+
<div class="container-fluid">
|
|
13
|
+
<nav class="navbar navbar-expand-sm navbar-light bg-faded">
|
|
14
|
+
<button class="navbar-toggler" type="button" data-toggle="collapse" data-target="#nav-content" aria-controls="nav-content" aria-expanded="false" aria-label="Toggle navigation">
|
|
15
|
+
<span class="navbar-toggler-icon"></span>
|
|
16
|
+
</button>
|
|
17
|
+
|
|
18
|
+
<div class="collapse navbar-collapse" id="nav-content">
|
|
19
|
+
<ul class="navbar-nav">
|
|
20
|
+
<li class="nav-item"><a class="nav-link active" href="index.html">Query</a></li>
|
|
21
|
+
<li class="nav-item"><a class="nav-link" href="help.html">Help</a></li>
|
|
22
|
+
<li class="nav-item" id="loginInfo"></li>
|
|
23
|
+
</ul>
|
|
24
|
+
</div>
|
|
25
|
+
</nav>
|
|
26
|
+
|
|
27
|
+
<div class="row">
|
|
28
|
+
<div class="col-sm-12" id="main">
|
|
29
|
+
</div>
|
|
30
|
+
</div>
|
|
31
|
+
</div>
|
|
32
|
+
|
|
33
|
+
<script src="https://code.jquery.com/jquery-3.2.1.min.js"></script>
|
|
34
|
+
<script src="https://cdnjs.cloudflare.com/ajax/libs/popper.js/1.11.0/umd/popper.min.js" integrity="sha384-b/U6ypiBEHpOf/4+1nzFpr53nxSS+GLCkfwBdFNTxtclqqenISfwAzpKaMNFNmj4" crossorigin="anonymous"></script>
|
|
35
|
+
<script src="https://maxcdn.bootstrapcdn.com/bootstrap/4.0.0-beta/js/bootstrap.min.js" integrity="sha384-h0AbiXch4ZDo7tp9hKZ4TsHbi047NrKGLO3SEJAg45jXxnGIfYzk4Si90RDIqNm1" crossorigin="anonymous"></script>
|
|
36
|
+
<script src="https://cdnjs.cloudflare.com/ajax/libs/jquery.tablesorter/2.29.0/js/jquery.tablesorter.min.js"></script>
|
|
37
|
+
<script src="general.js"></script>
|
|
38
|
+
<script src="index.js"></script>
|
|
39
|
+
</body>
|
|
40
|
+
</html>
|