crfm-helm 0.2.0__tar.gz → 0.2.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/MANIFEST.in +2 -1
- {crfm-helm-0.2.0/src/crfm_helm.egg-info → crfm-helm-0.2.2}/PKG-INFO +1 -1
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/README.md +1 -1
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/requirements.txt +13 -9
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/setup.py +3 -2
- {crfm-helm-0.2.0 → crfm-helm-0.2.2/src/crfm_helm.egg-info}/PKG-INFO +1 -1
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/crfm_helm.egg-info/SOURCES.txt +30 -1
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/crfm_helm.egg-info/entry_points.txt +2 -1
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/crfm_helm.egg-info/requires.txt +10 -7
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/__init__.py +13 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/adaptation/adapter_spec.py +3 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +20 -7
- crfm-helm-0.2.2/src/helm/benchmark/augmentations/correct_to_misspelling.json +1 -0
- crfm-helm-0.2.2/src/helm/benchmark/metrics/classification_metrics.py +70 -0
- crfm-helm-0.2.2/src/helm/benchmark/metrics/machine_translation_metrics.py +36 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/summarization_metrics.py +7 -8
- crfm-helm-0.2.2/src/helm/benchmark/metrics/test_classification_metrics.py +150 -0
- crfm-helm-0.2.2/src/helm/benchmark/presentation/create_plots.py +617 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/presentation/run_display.py +7 -48
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/presentation/summarize.py +4 -2
- crfm-helm-0.2.2/src/helm/benchmark/presentation/test_create_plots.py +32 -0
- crfm-helm-0.2.2/src/helm/benchmark/run.py +276 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/run_expander.py +164 -47
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/run_specs.py +346 -39
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/runner.py +34 -6
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/copyright_scenario.py +1 -1
- crfm-helm-0.2.2/src/helm/benchmark/scenarios/covid_dialog_scenario.py +84 -0
- crfm-helm-0.2.2/src/helm/benchmark/scenarios/imdb_listdir.json +50014 -0
- crfm-helm-0.2.2/src/helm/benchmark/scenarios/lex_glue_scenario.py +253 -0
- crfm-helm-0.2.2/src/helm/benchmark/scenarios/lextreme_scenario.py +458 -0
- crfm-helm-0.2.2/src/helm/benchmark/scenarios/me_q_sum_scenario.py +86 -0
- crfm-helm-0.2.2/src/helm/benchmark/scenarios/med_dialog_scenario.py +132 -0
- crfm-helm-0.2.2/src/helm/benchmark/scenarios/med_mcqa_scenario.py +102 -0
- crfm-helm-0.2.2/src/helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +119 -0
- crfm-helm-0.2.2/src/helm/benchmark/scenarios/med_qa_scenario.py +96 -0
- crfm-helm-0.2.2/src/helm/benchmark/scenarios/opinions_qa_scenario.py +194 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/scenario.py +5 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/the_pile_scenario.py +1 -1
- crfm-helm-0.2.2/src/helm/benchmark/scenarios/wmt_14_scenario.py +96 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/static/benchmarking.css +14 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/static/benchmarking.js +43 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/static/index.html +2 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/static/json-urls.js +4 -0
- crfm-helm-0.2.2/src/helm/benchmark/static/plot-captions.js +16 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/static/schema.yaml +154 -1
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/cohere_window_service.py +20 -0
- crfm-helm-0.2.2/src/helm/benchmark/window_services/flan_t5_window_service.py +29 -0
- crfm-helm-0.2.2/src/helm/benchmark/window_services/huggingface_window_service.py +39 -0
- crfm-helm-0.2.2/src/helm/benchmark/window_services/santacoder_window_service.py +27 -0
- crfm-helm-0.2.2/src/helm/benchmark/window_services/test_flan_t5_window_service.py +12 -0
- crfm-helm-0.2.2/src/helm/benchmark/window_services/wider_ai21_window_service.py +13 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/window_service_factory.py +34 -7
- crfm-helm-0.2.2/src/helm/common/codec.py +123 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/common/general.py +12 -5
- crfm-helm-0.2.2/src/helm/common/test_codec.py +144 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/proxy/clients/aleph_alpha_client.py +47 -28
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/proxy/clients/auto_client.py +32 -24
- crfm-helm-0.2.2/src/helm/proxy/clients/google_client.py +88 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/proxy/clients/huggingface_client.py +32 -16
- crfm-helm-0.2.2/src/helm/proxy/clients/huggingface_model_registry.py +111 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/proxy/clients/huggingface_tokenizer.py +25 -7
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/proxy/clients/openai_client.py +60 -2
- crfm-helm-0.2.2/src/helm/proxy/clients/test_huggingface_model_registry.py +57 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/proxy/clients/test_huggingface_tokenizer.py +3 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/proxy/clients/together_client.py +17 -2
- crfm-helm-0.2.2/src/helm/proxy/clients/yalm_tokenizer/voc_100b.sp +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/proxy/clients/yalm_tokenizer/yalm_tokenizer.py +8 -2
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/proxy/models.py +115 -7
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/proxy/test_models.py +1 -1
- crfm-helm-0.2.2/src/helm/proxy/token_counters/__init__.py +0 -0
- crfm-helm-0.2.0/src/helm/benchmark/presentation/present.py +0 -249
- crfm-helm-0.2.0/src/helm/benchmark/run.py +0 -180
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/LICENSE +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/pyproject.toml +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/setup.cfg +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/crfm_helm.egg-info/dependency_links.txt +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/crfm_helm.egg-info/not-zip-safe +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/crfm_helm.egg-info/top_level.txt +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/__init__.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/adaptation/__init__.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/adaptation/adapters/__init__.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/adaptation/adapters/adapter.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/adaptation/adapters/adapter_factory.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/adaptation/adapters/generation_adapter.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/adaptation/adapters/language_modeling_adapter.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/adaptation/adapters/test_adapter.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/adaptation/adapters/test_generation_adapter.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/adaptation/prompt.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/adaptation/request_state.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/adaptation/scenario_state.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/augmentations/__init__.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/augmentations/contraction_expansion_perturbation.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/augmentations/contrast_sets_perturbation.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/augmentations/data_augmenter.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/augmentations/dialect_perturbation.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/augmentations/extra_space_perturbation.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/augmentations/filler_words_perturbation.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/augmentations/gender_perturbation.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/augmentations/lowercase_perturbation.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/augmentations/mild_mix_perturbation.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/augmentations/misspelling_perturbation.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/augmentations/person_name_perturbation.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/augmentations/perturbation.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/augmentations/perturbation_description.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/augmentations/space_perturbation.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/augmentations/synonym_perturbation.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/augmentations/test_perturbation.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/augmentations/typos_perturbation.py +0 -0
- {crfm-helm-0.2.0/src/helm/benchmark/metrics → crfm-helm-0.2.2/src/helm/benchmark/contamination}/__init__.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/data_preprocessor.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/efficiency_data/inference_denoised_runtimes.json +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/efficiency_data/inference_idealized_runtimes.json +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/efficiency_data/training_efficiency.json +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/executor.py +0 -0
- {crfm-helm-0.2.0/src/helm/benchmark/metrics/summac → crfm-helm-0.2.2/src/helm/benchmark/metrics}/__init__.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/basic_metrics.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/bbq_metrics.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/bias_metrics.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/bias_word_lists.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/code_metrics.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/code_metrics_helper.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/copyright_metrics.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/disinformation_metrics.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/metric.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/metric_name.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/metric_service.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/numeracy_metrics.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/ranking_metrics.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/statistic.py +0 -0
- {crfm-helm-0.2.0/src/helm/benchmark/metrics/tokens → crfm-helm-0.2.2/src/helm/benchmark/metrics/summac}/__init__.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/summac/model_summac.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/summac/utils_misc.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/test_bias_metrics.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/test_metric.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/test_numeracy_metrics.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/test_statistic.py +0 -0
- {crfm-helm-0.2.0/src/helm/benchmark/presentation → crfm-helm-0.2.2/src/helm/benchmark/metrics/tokens}/__init__.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/tokens/free_token_cost_estimator.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/tokens/token_cost_estimator.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/tokens_metric.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/toxicity_metrics.py +0 -0
- {crfm-helm-0.2.0/src/helm/benchmark/scenarios → crfm-helm-0.2.2/src/helm/benchmark/presentation}/__init__.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/presentation/contamination.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/presentation/run_entry.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/presentation/schema.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/presentation/table.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/presentation/test_contamination.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/presentation/test_run_entry.py +0 -0
- {crfm-helm-0.2.0/src/helm/benchmark/window_services → crfm-helm-0.2.2/src/helm/benchmark/scenarios}/__init__.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/babi_qa_scenario.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/bbq_scenario.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/big_bench_scenario.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/blimp_scenario.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/bold_scenario.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/boolq_scenario.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/civil_comments_scenario.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/code_scenario.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/code_scenario_helper.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/commonsense_scenario.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/dialogue_scenarios.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/disinformation_scenario.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/dyck_language_scenario.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/entity_data_imputation_scenario.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/entity_matching_scenario.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/gsm_scenario.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/ice_scenario.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/imdb_scenario.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/legal_support_scenario.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/lsat_qa_scenario.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/math_scenario.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/mmlu_scenario.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/msmarco_scenario.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/narrativeqa_scenario.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/natural_qa_scenario.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/newsqa_scenario.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/numeracy_scenario.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/pubmed_qa_scenario.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/quac_scenario.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/raft_scenario.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/simple_scenarios.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/summarization_scenario.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/synthetic_efficiency_scenario.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/synthetic_reasoning_scenario.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/test_scenario.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/truthful_qa_scenario.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/twitter_aae_scenario.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/wikifact_scenario.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/wikitext_103_scenario.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/server.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/static/contamination.yaml +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/static/general.js +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/static/images/crfm-logo.png +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/static/images/helm-logo-simple.png +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/static/images/helm-logo.png +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/static/images/language-model-helm.png +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/static/images/organizations/ai21.png +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/static/images/organizations/anthropic.png +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/static/images/organizations/bigscience.png +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/static/images/organizations/cohere.png +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/static/images/organizations/eleutherai.png +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/static/images/organizations/google.png +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/static/images/organizations/meta.png +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/static/images/organizations/microsoft.png +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/static/images/organizations/nvidia.png +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/static/images/organizations/openai.png +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/static/images/organizations/together.png +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/static/images/organizations/tsinghua-keg.png +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/static/images/organizations/yandex.png +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/static/images/scenarios-by-metrics.png +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/static/images/taxonomy-scenarios.png +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/static/info-icon.png +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/static/json-urls-root.js +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/static/utils.js +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/test_data_preprocessor.py +0 -0
- {crfm-helm-0.2.0/src/helm/common → crfm-helm-0.2.2/src/helm/benchmark/window_services}/__init__.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/ai21_window_service.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/anthropic_window_service.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/bloom_window_service.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/encoder_decoder_window_service.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/gpt2_window_service.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/gptj_window_service.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/gptneox_window_service.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/ice_window_service.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/local_window_service.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/luminous_window_service.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/mt_nlg_window_service.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/openai_window_service.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/opt_window_service.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/t0pp_window_service.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/t511b_window_service.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/test_ai21_window_service.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/test_bloom_window_service.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/test_cohere_window_service.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/test_cohere_window_service_utils.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/test_gpt2_window_service.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/test_gptj_window_service.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/test_gptneox_window_service.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/test_ice_window_service.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/test_openai_window_service.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/test_opt_window_service.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/test_t0pp_window_service.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/test_t511b_window_service.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/test_ul2_window_service.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/test_utils.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/test_yalm_window_service.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/tokenizer_service.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/ul2_window_service.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/wider_openai_window_service.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/window_service.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/yalm_window_service.py +0 -0
- {crfm-helm-0.2.0/src/helm/proxy → crfm-helm-0.2.2/src/helm/common}/__init__.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/common/authentication.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/common/cache.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/common/hierarchical_logger.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/common/object_spec.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/common/perspective_api_request.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/common/request.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/common/test_cache.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/common/test_general.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/common/tokenization_request.py +0 -0
- {crfm-helm-0.2.0/src/helm/proxy/clients → crfm-helm-0.2.2/src/helm/proxy}/__init__.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/proxy/accounts.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/proxy/cli.py +0 -0
- {crfm-helm-0.2.0/src/helm/proxy/clients/yalm_tokenizer → crfm-helm-0.2.2/src/helm/proxy/clients}/__init__.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/proxy/clients/ai21_client.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/proxy/clients/anthropic_client.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/proxy/clients/chat_gpt_client.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/proxy/clients/client.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/proxy/clients/cohere_client.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/proxy/clients/goose_ai_client.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/proxy/clients/ice_tokenizer_client.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/proxy/clients/microsoft_client.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/proxy/clients/perspective_api_client.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/proxy/clients/simple_client.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/proxy/clients/test_client.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/proxy/clients/test_huggingface_client.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/proxy/clients/test_ice_tokenizer_client.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/proxy/clients/test_yalm_tokenizer_client.py +0 -0
- {crfm-helm-0.2.0/src/helm/proxy/services → crfm-helm-0.2.2/src/helm/proxy/clients/yalm_tokenizer}/__init__.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/proxy/clients/yalm_tokenizer/test_yalm_tokenizer.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/proxy/clients/yalm_tokenizer_client.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/proxy/example_queries.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/proxy/query.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/proxy/retry.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/proxy/server.py +0 -0
- {crfm-helm-0.2.0/src/helm/proxy/token_counters → crfm-helm-0.2.2/src/helm/proxy/services}/__init__.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/proxy/services/remote_service.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/proxy/services/server_service.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/proxy/services/service.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/proxy/services/test_remote_service.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/proxy/services/test_service.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/proxy/test_retry.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/proxy/token_counters/ai21_token_counter.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/proxy/token_counters/auto_token_counter.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/proxy/token_counters/cohere_token_counter.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/proxy/token_counters/free_token_counter.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/proxy/token_counters/gooseai_token_counter.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/proxy/token_counters/openai_token_counter.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/proxy/token_counters/test_ai21_token_counter.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/proxy/token_counters/test_openai_token_counter.py +0 -0
- {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/proxy/token_counters/token_counter.py +0 -0
|
@@ -4,7 +4,7 @@
|
|
|
4
4
|
[comment]: <> (When using the img tag, which allows us to specify size, src has to be a URL.)
|
|
5
5
|
<img src="https://github.com/stanford-crfm/helm/raw/main/src/helm/benchmark/static/images/helm-logo.png" alt="" width="800"/>
|
|
6
6
|
|
|
7
|
-
Welcome! The **`crfm-helm`** Python package contains code used in the **Holistic Evaluation of Language Models** project ([paper](https://arxiv.org/abs/2211.09110), [website](https://crfm.stanford.edu/helm/
|
|
7
|
+
Welcome! The **`crfm-helm`** Python package contains code used in the **Holistic Evaluation of Language Models** project ([paper](https://arxiv.org/abs/2211.09110), [website](https://crfm.stanford.edu/helm/latest/)) by [Stanford CRFM](https://crfm.stanford.edu/). This package includes the following features:
|
|
8
8
|
|
|
9
9
|
- Collection of datasets in a standard format (e.g., NaturalQuestions)
|
|
10
10
|
- Collection of models accessible via a unified API (e.g., GPT-3, MT-NLG, OPT, BLOOM)
|
|
@@ -6,19 +6,13 @@
|
|
|
6
6
|
#
|
|
7
7
|
# pip freeze | xargs pip uninstall -y
|
|
8
8
|
# pip install -r requirements.txt
|
|
9
|
+
# pip install -r requirements-dev.txt
|
|
9
10
|
# pip freeze | grep -v en-core-web-sm > requirements-freeze.txt
|
|
10
11
|
#
|
|
11
12
|
# Also update the versions in the manual installation steps in pre-commit.sh.
|
|
12
13
|
#
|
|
13
14
|
# Check that everything works because the versions might be upgraded.
|
|
14
15
|
|
|
15
|
-
# Development
|
|
16
|
-
pytest~=7.2.0
|
|
17
|
-
black~=22.10.0
|
|
18
|
-
mypy~=0.982
|
|
19
|
-
pre-commit~=2.20.0
|
|
20
|
-
flake8~=5.0.4
|
|
21
|
-
|
|
22
16
|
# Common
|
|
23
17
|
zstandard~=0.18.0
|
|
24
18
|
tqdm~=4.64.1
|
|
@@ -26,6 +20,7 @@ pyhocon~=0.3.59
|
|
|
26
20
|
dacite~=1.6.0
|
|
27
21
|
|
|
28
22
|
# Proxy
|
|
23
|
+
aleph-alpha-client~=2.14.0
|
|
29
24
|
bottle~=0.12.23
|
|
30
25
|
gunicorn~=20.1.0
|
|
31
26
|
Mako~=1.2.3
|
|
@@ -35,8 +30,9 @@ sqlitedict~=1.7.0
|
|
|
35
30
|
pymongo~=4.2.0
|
|
36
31
|
retrying~=1.3.3
|
|
37
32
|
websocket-client~=1.3.2 # For Anthropic
|
|
38
|
-
openai~=0.
|
|
39
|
-
transformers~=4.
|
|
33
|
+
openai~=0.27.0
|
|
34
|
+
transformers~=4.26.1
|
|
35
|
+
tokenizers~=0.13.2
|
|
40
36
|
icetk~=0.0.4
|
|
41
37
|
protobuf~=3.20.2 # Can't use 4.21.0 due to backward incompatibility
|
|
42
38
|
google-api-python-client~=2.64.0
|
|
@@ -49,6 +45,8 @@ jsonlines~=3.1.0 # Not really needed
|
|
|
49
45
|
sympy~=1.11.1 # For math scenarios
|
|
50
46
|
sentencepiece~=0.1.97
|
|
51
47
|
numba~=0.56.4
|
|
48
|
+
cattrs~=22.2.0
|
|
49
|
+
xlrd~=2.0.1 # Used by pandas.read_excel in ice_scenario
|
|
52
50
|
|
|
53
51
|
# Metrics
|
|
54
52
|
importlib-resources~=5.10.0
|
|
@@ -67,3 +65,9 @@ summ-eval~=0.892
|
|
|
67
65
|
# End users should install a CUDA version of PyTorch manually if needed
|
|
68
66
|
torch~=1.12.1 # Summarization metrics
|
|
69
67
|
torchvision~=0.13.1
|
|
68
|
+
|
|
69
|
+
# Plotting
|
|
70
|
+
colorcet~=3.0.1
|
|
71
|
+
matplotlib~=3.6.0
|
|
72
|
+
numpy~=1.23.3
|
|
73
|
+
seaborn~=0.11.0
|
|
@@ -11,7 +11,7 @@ def get_requirements(path: str):
|
|
|
11
11
|
|
|
12
12
|
setup(
|
|
13
13
|
name="crfm-helm",
|
|
14
|
-
version="0.2.
|
|
14
|
+
version="0.2.2",
|
|
15
15
|
description="Benchmark for language models",
|
|
16
16
|
long_description="Benchmark for language models",
|
|
17
17
|
url="https://github.com/stanford-crfm/helm",
|
|
@@ -31,9 +31,10 @@ setup(
|
|
|
31
31
|
install_requires=get_requirements("requirements.txt"),
|
|
32
32
|
entry_points={
|
|
33
33
|
"console_scripts": [
|
|
34
|
-
"helm-run=helm.benchmark.
|
|
34
|
+
"helm-run=helm.benchmark.run:main",
|
|
35
35
|
"helm-summarize=helm.benchmark.presentation.summarize:main",
|
|
36
36
|
"helm-server=helm.benchmark.server:main",
|
|
37
|
+
"helm-create-plots=helm.benchmark.presentation.create_plots:main",
|
|
37
38
|
"crfm-proxy-server=helm.proxy.server:main",
|
|
38
39
|
"crfm-proxy-cli=helm.proxy.cli:main",
|
|
39
40
|
]
|
|
@@ -44,6 +44,7 @@ src/helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py
|
|
|
44
44
|
src/helm/benchmark/augmentations/__init__.py
|
|
45
45
|
src/helm/benchmark/augmentations/contraction_expansion_perturbation.py
|
|
46
46
|
src/helm/benchmark/augmentations/contrast_sets_perturbation.py
|
|
47
|
+
src/helm/benchmark/augmentations/correct_to_misspelling.json
|
|
47
48
|
src/helm/benchmark/augmentations/data_augmenter.py
|
|
48
49
|
src/helm/benchmark/augmentations/dialect_perturbation.py
|
|
49
50
|
src/helm/benchmark/augmentations/extra_space_perturbation.py
|
|
@@ -59,6 +60,7 @@ src/helm/benchmark/augmentations/space_perturbation.py
|
|
|
59
60
|
src/helm/benchmark/augmentations/synonym_perturbation.py
|
|
60
61
|
src/helm/benchmark/augmentations/test_perturbation.py
|
|
61
62
|
src/helm/benchmark/augmentations/typos_perturbation.py
|
|
63
|
+
src/helm/benchmark/contamination/__init__.py
|
|
62
64
|
src/helm/benchmark/efficiency_data/inference_denoised_runtimes.json
|
|
63
65
|
src/helm/benchmark/efficiency_data/inference_idealized_runtimes.json
|
|
64
66
|
src/helm/benchmark/efficiency_data/training_efficiency.json
|
|
@@ -67,10 +69,12 @@ src/helm/benchmark/metrics/basic_metrics.py
|
|
|
67
69
|
src/helm/benchmark/metrics/bbq_metrics.py
|
|
68
70
|
src/helm/benchmark/metrics/bias_metrics.py
|
|
69
71
|
src/helm/benchmark/metrics/bias_word_lists.py
|
|
72
|
+
src/helm/benchmark/metrics/classification_metrics.py
|
|
70
73
|
src/helm/benchmark/metrics/code_metrics.py
|
|
71
74
|
src/helm/benchmark/metrics/code_metrics_helper.py
|
|
72
75
|
src/helm/benchmark/metrics/copyright_metrics.py
|
|
73
76
|
src/helm/benchmark/metrics/disinformation_metrics.py
|
|
77
|
+
src/helm/benchmark/metrics/machine_translation_metrics.py
|
|
74
78
|
src/helm/benchmark/metrics/metric.py
|
|
75
79
|
src/helm/benchmark/metrics/metric_name.py
|
|
76
80
|
src/helm/benchmark/metrics/metric_service.py
|
|
@@ -79,6 +83,7 @@ src/helm/benchmark/metrics/ranking_metrics.py
|
|
|
79
83
|
src/helm/benchmark/metrics/statistic.py
|
|
80
84
|
src/helm/benchmark/metrics/summarization_metrics.py
|
|
81
85
|
src/helm/benchmark/metrics/test_bias_metrics.py
|
|
86
|
+
src/helm/benchmark/metrics/test_classification_metrics.py
|
|
82
87
|
src/helm/benchmark/metrics/test_metric.py
|
|
83
88
|
src/helm/benchmark/metrics/test_numeracy_metrics.py
|
|
84
89
|
src/helm/benchmark/metrics/test_statistic.py
|
|
@@ -99,13 +104,14 @@ src/helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py
|
|
|
99
104
|
src/helm/benchmark/metrics/tokens/token_cost_estimator.py
|
|
100
105
|
src/helm/benchmark/presentation/__init__.py
|
|
101
106
|
src/helm/benchmark/presentation/contamination.py
|
|
102
|
-
src/helm/benchmark/presentation/
|
|
107
|
+
src/helm/benchmark/presentation/create_plots.py
|
|
103
108
|
src/helm/benchmark/presentation/run_display.py
|
|
104
109
|
src/helm/benchmark/presentation/run_entry.py
|
|
105
110
|
src/helm/benchmark/presentation/schema.py
|
|
106
111
|
src/helm/benchmark/presentation/summarize.py
|
|
107
112
|
src/helm/benchmark/presentation/table.py
|
|
108
113
|
src/helm/benchmark/presentation/test_contamination.py
|
|
114
|
+
src/helm/benchmark/presentation/test_create_plots.py
|
|
109
115
|
src/helm/benchmark/presentation/test_run_entry.py
|
|
110
116
|
src/helm/benchmark/scenarios/__init__.py
|
|
111
117
|
src/helm/benchmark/scenarios/babi_qa_scenario.py
|
|
@@ -119,6 +125,7 @@ src/helm/benchmark/scenarios/code_scenario.py
|
|
|
119
125
|
src/helm/benchmark/scenarios/code_scenario_helper.py
|
|
120
126
|
src/helm/benchmark/scenarios/commonsense_scenario.py
|
|
121
127
|
src/helm/benchmark/scenarios/copyright_scenario.py
|
|
128
|
+
src/helm/benchmark/scenarios/covid_dialog_scenario.py
|
|
122
129
|
src/helm/benchmark/scenarios/dialogue_scenarios.py
|
|
123
130
|
src/helm/benchmark/scenarios/disinformation_scenario.py
|
|
124
131
|
src/helm/benchmark/scenarios/dyck_language_scenario.py
|
|
@@ -126,17 +133,26 @@ src/helm/benchmark/scenarios/entity_data_imputation_scenario.py
|
|
|
126
133
|
src/helm/benchmark/scenarios/entity_matching_scenario.py
|
|
127
134
|
src/helm/benchmark/scenarios/gsm_scenario.py
|
|
128
135
|
src/helm/benchmark/scenarios/ice_scenario.py
|
|
136
|
+
src/helm/benchmark/scenarios/imdb_listdir.json
|
|
129
137
|
src/helm/benchmark/scenarios/imdb_scenario.py
|
|
130
138
|
src/helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py
|
|
131
139
|
src/helm/benchmark/scenarios/legal_support_scenario.py
|
|
140
|
+
src/helm/benchmark/scenarios/lex_glue_scenario.py
|
|
141
|
+
src/helm/benchmark/scenarios/lextreme_scenario.py
|
|
132
142
|
src/helm/benchmark/scenarios/lsat_qa_scenario.py
|
|
133
143
|
src/helm/benchmark/scenarios/math_scenario.py
|
|
144
|
+
src/helm/benchmark/scenarios/me_q_sum_scenario.py
|
|
145
|
+
src/helm/benchmark/scenarios/med_dialog_scenario.py
|
|
146
|
+
src/helm/benchmark/scenarios/med_mcqa_scenario.py
|
|
147
|
+
src/helm/benchmark/scenarios/med_paragraph_simplification_scenario.py
|
|
148
|
+
src/helm/benchmark/scenarios/med_qa_scenario.py
|
|
134
149
|
src/helm/benchmark/scenarios/mmlu_scenario.py
|
|
135
150
|
src/helm/benchmark/scenarios/msmarco_scenario.py
|
|
136
151
|
src/helm/benchmark/scenarios/narrativeqa_scenario.py
|
|
137
152
|
src/helm/benchmark/scenarios/natural_qa_scenario.py
|
|
138
153
|
src/helm/benchmark/scenarios/newsqa_scenario.py
|
|
139
154
|
src/helm/benchmark/scenarios/numeracy_scenario.py
|
|
155
|
+
src/helm/benchmark/scenarios/opinions_qa_scenario.py
|
|
140
156
|
src/helm/benchmark/scenarios/pubmed_qa_scenario.py
|
|
141
157
|
src/helm/benchmark/scenarios/quac_scenario.py
|
|
142
158
|
src/helm/benchmark/scenarios/raft_scenario.py
|
|
@@ -153,6 +169,7 @@ src/helm/benchmark/scenarios/truthful_qa_scenario.py
|
|
|
153
169
|
src/helm/benchmark/scenarios/twitter_aae_scenario.py
|
|
154
170
|
src/helm/benchmark/scenarios/wikifact_scenario.py
|
|
155
171
|
src/helm/benchmark/scenarios/wikitext_103_scenario.py
|
|
172
|
+
src/helm/benchmark/scenarios/wmt_14_scenario.py
|
|
156
173
|
src/helm/benchmark/static/benchmarking.css
|
|
157
174
|
src/helm/benchmark/static/benchmarking.js
|
|
158
175
|
src/helm/benchmark/static/contamination.yaml
|
|
@@ -161,6 +178,7 @@ src/helm/benchmark/static/index.html
|
|
|
161
178
|
src/helm/benchmark/static/info-icon.png
|
|
162
179
|
src/helm/benchmark/static/json-urls-root.js
|
|
163
180
|
src/helm/benchmark/static/json-urls.js
|
|
181
|
+
src/helm/benchmark/static/plot-captions.js
|
|
164
182
|
src/helm/benchmark/static/schema.yaml
|
|
165
183
|
src/helm/benchmark/static/utils.js
|
|
166
184
|
src/helm/benchmark/static/images/crfm-logo.png
|
|
@@ -188,21 +206,25 @@ src/helm/benchmark/window_services/anthropic_window_service.py
|
|
|
188
206
|
src/helm/benchmark/window_services/bloom_window_service.py
|
|
189
207
|
src/helm/benchmark/window_services/cohere_window_service.py
|
|
190
208
|
src/helm/benchmark/window_services/encoder_decoder_window_service.py
|
|
209
|
+
src/helm/benchmark/window_services/flan_t5_window_service.py
|
|
191
210
|
src/helm/benchmark/window_services/gpt2_window_service.py
|
|
192
211
|
src/helm/benchmark/window_services/gptj_window_service.py
|
|
193
212
|
src/helm/benchmark/window_services/gptneox_window_service.py
|
|
213
|
+
src/helm/benchmark/window_services/huggingface_window_service.py
|
|
194
214
|
src/helm/benchmark/window_services/ice_window_service.py
|
|
195
215
|
src/helm/benchmark/window_services/local_window_service.py
|
|
196
216
|
src/helm/benchmark/window_services/luminous_window_service.py
|
|
197
217
|
src/helm/benchmark/window_services/mt_nlg_window_service.py
|
|
198
218
|
src/helm/benchmark/window_services/openai_window_service.py
|
|
199
219
|
src/helm/benchmark/window_services/opt_window_service.py
|
|
220
|
+
src/helm/benchmark/window_services/santacoder_window_service.py
|
|
200
221
|
src/helm/benchmark/window_services/t0pp_window_service.py
|
|
201
222
|
src/helm/benchmark/window_services/t511b_window_service.py
|
|
202
223
|
src/helm/benchmark/window_services/test_ai21_window_service.py
|
|
203
224
|
src/helm/benchmark/window_services/test_bloom_window_service.py
|
|
204
225
|
src/helm/benchmark/window_services/test_cohere_window_service.py
|
|
205
226
|
src/helm/benchmark/window_services/test_cohere_window_service_utils.py
|
|
227
|
+
src/helm/benchmark/window_services/test_flan_t5_window_service.py
|
|
206
228
|
src/helm/benchmark/window_services/test_gpt2_window_service.py
|
|
207
229
|
src/helm/benchmark/window_services/test_gptj_window_service.py
|
|
208
230
|
src/helm/benchmark/window_services/test_gptneox_window_service.py
|
|
@@ -217,6 +239,7 @@ src/helm/benchmark/window_services/test_utils.py
|
|
|
217
239
|
src/helm/benchmark/window_services/test_yalm_window_service.py
|
|
218
240
|
src/helm/benchmark/window_services/tokenizer_service.py
|
|
219
241
|
src/helm/benchmark/window_services/ul2_window_service.py
|
|
242
|
+
src/helm/benchmark/window_services/wider_ai21_window_service.py
|
|
220
243
|
src/helm/benchmark/window_services/wider_openai_window_service.py
|
|
221
244
|
src/helm/benchmark/window_services/window_service.py
|
|
222
245
|
src/helm/benchmark/window_services/window_service_factory.py
|
|
@@ -224,12 +247,14 @@ src/helm/benchmark/window_services/yalm_window_service.py
|
|
|
224
247
|
src/helm/common/__init__.py
|
|
225
248
|
src/helm/common/authentication.py
|
|
226
249
|
src/helm/common/cache.py
|
|
250
|
+
src/helm/common/codec.py
|
|
227
251
|
src/helm/common/general.py
|
|
228
252
|
src/helm/common/hierarchical_logger.py
|
|
229
253
|
src/helm/common/object_spec.py
|
|
230
254
|
src/helm/common/perspective_api_request.py
|
|
231
255
|
src/helm/common/request.py
|
|
232
256
|
src/helm/common/test_cache.py
|
|
257
|
+
src/helm/common/test_codec.py
|
|
233
258
|
src/helm/common/test_general.py
|
|
234
259
|
src/helm/common/tokenization_request.py
|
|
235
260
|
src/helm/proxy/__init__.py
|
|
@@ -250,8 +275,10 @@ src/helm/proxy/clients/auto_client.py
|
|
|
250
275
|
src/helm/proxy/clients/chat_gpt_client.py
|
|
251
276
|
src/helm/proxy/clients/client.py
|
|
252
277
|
src/helm/proxy/clients/cohere_client.py
|
|
278
|
+
src/helm/proxy/clients/google_client.py
|
|
253
279
|
src/helm/proxy/clients/goose_ai_client.py
|
|
254
280
|
src/helm/proxy/clients/huggingface_client.py
|
|
281
|
+
src/helm/proxy/clients/huggingface_model_registry.py
|
|
255
282
|
src/helm/proxy/clients/huggingface_tokenizer.py
|
|
256
283
|
src/helm/proxy/clients/ice_tokenizer_client.py
|
|
257
284
|
src/helm/proxy/clients/microsoft_client.py
|
|
@@ -260,6 +287,7 @@ src/helm/proxy/clients/perspective_api_client.py
|
|
|
260
287
|
src/helm/proxy/clients/simple_client.py
|
|
261
288
|
src/helm/proxy/clients/test_client.py
|
|
262
289
|
src/helm/proxy/clients/test_huggingface_client.py
|
|
290
|
+
src/helm/proxy/clients/test_huggingface_model_registry.py
|
|
263
291
|
src/helm/proxy/clients/test_huggingface_tokenizer.py
|
|
264
292
|
src/helm/proxy/clients/test_ice_tokenizer_client.py
|
|
265
293
|
src/helm/proxy/clients/test_yalm_tokenizer_client.py
|
|
@@ -267,6 +295,7 @@ src/helm/proxy/clients/together_client.py
|
|
|
267
295
|
src/helm/proxy/clients/yalm_tokenizer_client.py
|
|
268
296
|
src/helm/proxy/clients/yalm_tokenizer/__init__.py
|
|
269
297
|
src/helm/proxy/clients/yalm_tokenizer/test_yalm_tokenizer.py
|
|
298
|
+
src/helm/proxy/clients/yalm_tokenizer/voc_100b.sp
|
|
270
299
|
src/helm/proxy/clients/yalm_tokenizer/yalm_tokenizer.py
|
|
271
300
|
src/helm/proxy/services/__init__.py
|
|
272
301
|
src/helm/proxy/services/remote_service.py
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
[console_scripts]
|
|
2
2
|
crfm-proxy-cli = helm.proxy.cli:main
|
|
3
3
|
crfm-proxy-server = helm.proxy.server:main
|
|
4
|
-
helm-
|
|
4
|
+
helm-create-plots = helm.benchmark.presentation.create_plots:main
|
|
5
|
+
helm-run = helm.benchmark.run:main
|
|
5
6
|
helm-server = helm.benchmark.server:main
|
|
6
7
|
helm-summarize = helm.benchmark.presentation.summarize:main
|
|
@@ -1,12 +1,8 @@
|
|
|
1
|
-
pytest~=7.2.0
|
|
2
|
-
black~=22.10.0
|
|
3
|
-
mypy~=0.982
|
|
4
|
-
pre-commit~=2.20.0
|
|
5
|
-
flake8~=5.0.4
|
|
6
1
|
zstandard~=0.18.0
|
|
7
2
|
tqdm~=4.64.1
|
|
8
3
|
pyhocon~=0.3.59
|
|
9
4
|
dacite~=1.6.0
|
|
5
|
+
aleph-alpha-client~=2.14.0
|
|
10
6
|
bottle~=0.12.23
|
|
11
7
|
gunicorn~=20.1.0
|
|
12
8
|
Mako~=1.2.3
|
|
@@ -14,8 +10,9 @@ sqlitedict~=1.7.0
|
|
|
14
10
|
pymongo~=4.2.0
|
|
15
11
|
retrying~=1.3.3
|
|
16
12
|
websocket-client~=1.3.2
|
|
17
|
-
openai~=0.
|
|
18
|
-
transformers~=4.
|
|
13
|
+
openai~=0.27.0
|
|
14
|
+
transformers~=4.26.1
|
|
15
|
+
tokenizers~=0.13.2
|
|
19
16
|
icetk~=0.0.4
|
|
20
17
|
protobuf~=3.20.2
|
|
21
18
|
google-api-python-client~=2.64.0
|
|
@@ -26,6 +23,8 @@ jsonlines~=3.1.0
|
|
|
26
23
|
sympy~=1.11.1
|
|
27
24
|
sentencepiece~=0.1.97
|
|
28
25
|
numba~=0.56.4
|
|
26
|
+
cattrs~=22.2.0
|
|
27
|
+
xlrd~=2.0.1
|
|
29
28
|
importlib-resources~=5.10.0
|
|
30
29
|
nltk~=3.7
|
|
31
30
|
scipy~=1.9.1
|
|
@@ -39,3 +38,7 @@ spacy~=3.2.4
|
|
|
39
38
|
summ-eval~=0.892
|
|
40
39
|
torch~=1.12.1
|
|
41
40
|
torchvision~=0.13.1
|
|
41
|
+
colorcet~=3.0.1
|
|
42
|
+
matplotlib~=3.6.0
|
|
43
|
+
numpy~=1.23.3
|
|
44
|
+
seaborn~=0.11.0
|
|
@@ -42,12 +42,24 @@ from .scenarios import legal_support_scenario # noqa
|
|
|
42
42
|
from .scenarios import entity_matching_scenario # noqa
|
|
43
43
|
from .scenarios import entity_data_imputation_scenario # noqa
|
|
44
44
|
from .scenarios import big_bench_scenario # noqa
|
|
45
|
+
from .scenarios import opinions_qa_scenario # noqa
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
# Biomedical
|
|
49
|
+
from .scenarios import covid_dialog_scenario # noqa
|
|
50
|
+
from .scenarios import me_q_sum_scenario # noqa
|
|
51
|
+
from .scenarios import med_dialog_scenario # noqa
|
|
52
|
+
from .scenarios import med_mcqa_scenario # noqa
|
|
53
|
+
from .scenarios import med_paragraph_simplification_scenario # noqa
|
|
54
|
+
from .scenarios import med_qa_scenario # noqa
|
|
45
55
|
from .scenarios import pubmed_qa_scenario # noqa
|
|
56
|
+
from .scenarios import wmt_14_scenario # noqa
|
|
46
57
|
|
|
47
58
|
# Metrics
|
|
48
59
|
from .metrics import basic_metrics # noqa
|
|
49
60
|
from .metrics import bbq_metrics # noqa
|
|
50
61
|
from .metrics import bias_metrics # noqa
|
|
62
|
+
from .metrics import classification_metrics # noqa
|
|
51
63
|
from .metrics import code_metrics # noqa
|
|
52
64
|
from .metrics import copyright_metrics # noqa
|
|
53
65
|
from .metrics import disinformation_metrics # noqa
|
|
@@ -56,6 +68,7 @@ from .metrics import ranking_metrics # noqa
|
|
|
56
68
|
from .metrics import summarization_metrics # noqa
|
|
57
69
|
from .metrics import toxicity_metrics # noqa
|
|
58
70
|
from .metrics import tokens_metric # noqa
|
|
71
|
+
from .metrics import machine_translation_metrics # noqa
|
|
59
72
|
|
|
60
73
|
# Perturbations for data augmentation
|
|
61
74
|
from .augmentations.extra_space_perturbation import ExtraSpacePerturbation # noqa
|
|
@@ -68,6 +68,9 @@ class AdapterSpec:
|
|
|
68
68
|
# set of training instances. Used to compute error bars.
|
|
69
69
|
num_train_trials: int = 1
|
|
70
70
|
|
|
71
|
+
# If true, randomly sample N training examples; if false, take the N consecutive training examples starting at offset N * train_trial_index (N is at most `max_train_instances`)
|
|
72
|
+
sample_train: bool = True
|
|
73
|
+
|
|
71
74
|
# Decoding parameters (inherited by `Request`)
|
|
72
75
|
|
|
73
76
|
# Model to make the request to (need to fill in)
|
|
@@ -23,7 +23,7 @@ class InContextLearningAdapter(Adapter, ABC):
|
|
|
23
23
|
@htrack(None)
|
|
24
24
|
def adapt(self, instances: List[Instance], parallelism: int) -> ScenarioState:
|
|
25
25
|
"""
|
|
26
|
-
Takes a
|
|
26
|
+
Takes a list of `Instance`s and builds a list of corresponding `RequestState`s.
|
|
27
27
|
The reason we don't do this per eval instance is that we create a common set of
|
|
28
28
|
training instances which is shared across all eval instances.
|
|
29
29
|
"""
|
|
@@ -65,7 +65,9 @@ class InContextLearningAdapter(Adapter, ABC):
|
|
|
65
65
|
parallelism: int,
|
|
66
66
|
) -> List[RequestState]:
|
|
67
67
|
self.train_trial_index: int = train_trial_index
|
|
68
|
-
self.train_instances: List[Instance] = self.sample_examples(
|
|
68
|
+
self.train_instances: List[Instance] = self.sample_examples(
|
|
69
|
+
all_train_instances, seed=train_trial_index, sample_train=self.adapter_spec.sample_train
|
|
70
|
+
)
|
|
69
71
|
hlog(f"Sampled {len(self.train_instances)} examples for trial #{self.train_trial_index}.")
|
|
70
72
|
|
|
71
73
|
# Generate request_states
|
|
@@ -93,7 +95,9 @@ class InContextLearningAdapter(Adapter, ABC):
|
|
|
93
95
|
|
|
94
96
|
return [request_state for result in results for request_state in result]
|
|
95
97
|
|
|
96
|
-
def sample_examples(
|
|
98
|
+
def sample_examples(
|
|
99
|
+
self, all_train_instances: List[Instance], seed: int, sample_train: bool = True
|
|
100
|
+
) -> List[Instance]:
|
|
97
101
|
"""
|
|
98
102
|
Sample a random set of train instances to use as examples by following the steps below:
|
|
99
103
|
1. Sort the class labels (i.e., correct References) by the number of Instances that belong to the
|
|
@@ -121,9 +125,14 @@ class InContextLearningAdapter(Adapter, ABC):
|
|
|
121
125
|
random.seed(seed)
|
|
122
126
|
num_instances_to_sample: int = min(len(all_train_instances), self.adapter_spec.max_train_instances)
|
|
123
127
|
|
|
128
|
+
examples: List[Instance] = []
|
|
129
|
+
if not sample_train:
|
|
130
|
+
# Select a consecutive window from the train set; the seed (train trial index) picks which window
|
|
131
|
+
examples = all_train_instances[num_instances_to_sample * seed : num_instances_to_sample * (seed + 1)]
|
|
132
|
+
return examples
|
|
133
|
+
|
|
124
134
|
unlabeled_instances: List[Instance] = []
|
|
125
135
|
label_to_instances: Dict[str, List[Instance]] = defaultdict(list)
|
|
126
|
-
|
|
127
136
|
for instance in all_train_instances:
|
|
128
137
|
if instance.first_correct_reference:
|
|
129
138
|
label_to_instances[instance.first_correct_reference.output.text].append(instance)
|
|
@@ -145,7 +154,6 @@ class InContextLearningAdapter(Adapter, ABC):
|
|
|
145
154
|
sorted_labels.extend(labels)
|
|
146
155
|
|
|
147
156
|
labels_iterable = cycle(sorted_labels)
|
|
148
|
-
examples: List[Instance] = []
|
|
149
157
|
while num_instances_to_sample > 0:
|
|
150
158
|
next_label: Optional[str] = next(labels_iterable, None)
|
|
151
159
|
if not next_label:
|
|
@@ -218,10 +226,15 @@ class InContextLearningAdapter(Adapter, ABC):
|
|
|
218
226
|
|
|
219
227
|
# References (optionally) and output
|
|
220
228
|
output: str
|
|
229
|
+
|
|
230
|
+
delimiter = ","
|
|
221
231
|
if reference_index is None:
|
|
222
232
|
# Put only the correct reference as the output
|
|
223
|
-
|
|
224
|
-
|
|
233
|
+
correct_references: List[Reference] = instance.all_correct_references
|
|
234
|
+
if not correct_references:
|
|
235
|
+
output = "n/a"
|
|
236
|
+
else:
|
|
237
|
+
output = delimiter.join([correct_reference.output.text for correct_reference in correct_references])
|
|
225
238
|
else:
|
|
226
239
|
reference = instance.references[reference_index]
|
|
227
240
|
output = reference.output.text
|