crfm-helm 0.2.1__tar.gz → 0.2.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/MANIFEST.in +1 -0
- {crfm-helm-0.2.1/src/crfm_helm.egg-info → crfm-helm-0.2.2}/PKG-INFO +1 -1
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/README.md +1 -1
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/requirements.txt +12 -9
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/setup.py +2 -1
- {crfm-helm-0.2.1 → crfm-helm-0.2.2/src/crfm_helm.egg-info}/PKG-INFO +1 -1
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/crfm_helm.egg-info/SOURCES.txt +13 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/crfm_helm.egg-info/entry_points.txt +1 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/crfm_helm.egg-info/requires.txt +9 -7
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/__init__.py +2 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/adaptation/adapter_spec.py +3 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +20 -7
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/classification_metrics.py +28 -23
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/test_classification_metrics.py +44 -9
- crfm-helm-0.2.2/src/helm/benchmark/presentation/create_plots.py +617 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/presentation/summarize.py +4 -2
- crfm-helm-0.2.2/src/helm/benchmark/presentation/test_create_plots.py +32 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/run.py +23 -1
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/run_expander.py +161 -47
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/run_specs.py +84 -10
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/runner.py +31 -3
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/copyright_scenario.py +1 -1
- crfm-helm-0.2.2/src/helm/benchmark/scenarios/imdb_listdir.json +50014 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/lex_glue_scenario.py +58 -17
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/lextreme_scenario.py +37 -25
- crfm-helm-0.2.2/src/helm/benchmark/scenarios/opinions_qa_scenario.py +194 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/scenario.py +5 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/the_pile_scenario.py +1 -1
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/static/benchmarking.css +14 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/static/benchmarking.js +43 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/static/index.html +2 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/static/json-urls.js +4 -0
- crfm-helm-0.2.2/src/helm/benchmark/static/plot-captions.js +16 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/static/schema.yaml +66 -8
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/cohere_window_service.py +20 -0
- crfm-helm-0.2.2/src/helm/benchmark/window_services/flan_t5_window_service.py +29 -0
- crfm-helm-0.2.2/src/helm/benchmark/window_services/huggingface_window_service.py +39 -0
- crfm-helm-0.2.2/src/helm/benchmark/window_services/test_flan_t5_window_service.py +12 -0
- crfm-helm-0.2.2/src/helm/benchmark/window_services/wider_ai21_window_service.py +13 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/window_service_factory.py +27 -6
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/common/general.py +12 -5
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/proxy/clients/aleph_alpha_client.py +47 -28
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/proxy/clients/auto_client.py +28 -24
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/proxy/clients/huggingface_client.py +30 -17
- crfm-helm-0.2.2/src/helm/proxy/clients/huggingface_model_registry.py +111 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/proxy/clients/huggingface_tokenizer.py +23 -7
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/proxy/clients/openai_client.py +60 -2
- crfm-helm-0.2.2/src/helm/proxy/clients/test_huggingface_model_registry.py +57 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/proxy/clients/together_client.py +17 -2
- crfm-helm-0.2.2/src/helm/proxy/clients/yalm_tokenizer/voc_100b.sp +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/proxy/clients/yalm_tokenizer/yalm_tokenizer.py +8 -2
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/proxy/models.py +82 -2
- crfm-helm-0.2.2/src/helm/proxy/token_counters/__init__.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/LICENSE +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/pyproject.toml +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/setup.cfg +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/crfm_helm.egg-info/dependency_links.txt +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/crfm_helm.egg-info/not-zip-safe +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/crfm_helm.egg-info/top_level.txt +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/__init__.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/adaptation/__init__.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/adaptation/adapters/__init__.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/adaptation/adapters/adapter.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/adaptation/adapters/adapter_factory.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/adaptation/adapters/generation_adapter.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/adaptation/adapters/language_modeling_adapter.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/adaptation/adapters/test_adapter.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/adaptation/adapters/test_generation_adapter.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/adaptation/prompt.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/adaptation/request_state.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/adaptation/scenario_state.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/augmentations/__init__.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/augmentations/contraction_expansion_perturbation.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/augmentations/contrast_sets_perturbation.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/augmentations/correct_to_misspelling.json +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/augmentations/data_augmenter.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/augmentations/dialect_perturbation.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/augmentations/extra_space_perturbation.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/augmentations/filler_words_perturbation.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/augmentations/gender_perturbation.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/augmentations/lowercase_perturbation.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/augmentations/mild_mix_perturbation.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/augmentations/misspelling_perturbation.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/augmentations/person_name_perturbation.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/augmentations/perturbation.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/augmentations/perturbation_description.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/augmentations/space_perturbation.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/augmentations/synonym_perturbation.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/augmentations/test_perturbation.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/augmentations/typos_perturbation.py +0 -0
- {crfm-helm-0.2.1/src/helm/benchmark/metrics → crfm-helm-0.2.2/src/helm/benchmark/contamination}/__init__.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/data_preprocessor.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/efficiency_data/inference_denoised_runtimes.json +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/efficiency_data/inference_idealized_runtimes.json +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/efficiency_data/training_efficiency.json +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/executor.py +0 -0
- {crfm-helm-0.2.1/src/helm/benchmark/metrics/summac → crfm-helm-0.2.2/src/helm/benchmark/metrics}/__init__.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/basic_metrics.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/bbq_metrics.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/bias_metrics.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/bias_word_lists.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/code_metrics.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/code_metrics_helper.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/copyright_metrics.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/disinformation_metrics.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/machine_translation_metrics.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/metric.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/metric_name.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/metric_service.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/numeracy_metrics.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/ranking_metrics.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/statistic.py +0 -0
- {crfm-helm-0.2.1/src/helm/benchmark/metrics/tokens → crfm-helm-0.2.2/src/helm/benchmark/metrics/summac}/__init__.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/summac/model_summac.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/summac/utils_misc.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/summarization_metrics.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/test_bias_metrics.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/test_metric.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/test_numeracy_metrics.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/test_statistic.py +0 -0
- {crfm-helm-0.2.1/src/helm/benchmark/presentation → crfm-helm-0.2.2/src/helm/benchmark/metrics/tokens}/__init__.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/tokens/free_token_cost_estimator.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/tokens/token_cost_estimator.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/tokens_metric.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/toxicity_metrics.py +0 -0
- {crfm-helm-0.2.1/src/helm/benchmark/scenarios → crfm-helm-0.2.2/src/helm/benchmark/presentation}/__init__.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/presentation/contamination.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/presentation/run_display.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/presentation/run_entry.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/presentation/schema.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/presentation/table.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/presentation/test_contamination.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/presentation/test_run_entry.py +0 -0
- {crfm-helm-0.2.1/src/helm/benchmark/window_services → crfm-helm-0.2.2/src/helm/benchmark/scenarios}/__init__.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/babi_qa_scenario.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/bbq_scenario.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/big_bench_scenario.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/blimp_scenario.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/bold_scenario.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/boolq_scenario.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/civil_comments_scenario.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/code_scenario.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/code_scenario_helper.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/commonsense_scenario.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/covid_dialog_scenario.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/dialogue_scenarios.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/disinformation_scenario.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/dyck_language_scenario.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/entity_data_imputation_scenario.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/entity_matching_scenario.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/gsm_scenario.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/ice_scenario.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/imdb_scenario.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/legal_support_scenario.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/lsat_qa_scenario.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/math_scenario.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/me_q_sum_scenario.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/med_dialog_scenario.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/med_mcqa_scenario.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/med_qa_scenario.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/mmlu_scenario.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/msmarco_scenario.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/narrativeqa_scenario.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/natural_qa_scenario.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/newsqa_scenario.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/numeracy_scenario.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/pubmed_qa_scenario.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/quac_scenario.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/raft_scenario.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/simple_scenarios.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/summarization_scenario.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/synthetic_efficiency_scenario.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/synthetic_reasoning_scenario.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/test_scenario.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/truthful_qa_scenario.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/twitter_aae_scenario.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/wikifact_scenario.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/wikitext_103_scenario.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/wmt_14_scenario.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/server.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/static/contamination.yaml +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/static/general.js +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/static/images/crfm-logo.png +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/static/images/helm-logo-simple.png +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/static/images/helm-logo.png +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/static/images/language-model-helm.png +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/static/images/organizations/ai21.png +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/static/images/organizations/anthropic.png +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/static/images/organizations/bigscience.png +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/static/images/organizations/cohere.png +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/static/images/organizations/eleutherai.png +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/static/images/organizations/google.png +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/static/images/organizations/meta.png +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/static/images/organizations/microsoft.png +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/static/images/organizations/nvidia.png +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/static/images/organizations/openai.png +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/static/images/organizations/together.png +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/static/images/organizations/tsinghua-keg.png +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/static/images/organizations/yandex.png +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/static/images/scenarios-by-metrics.png +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/static/images/taxonomy-scenarios.png +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/static/info-icon.png +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/static/json-urls-root.js +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/static/utils.js +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/test_data_preprocessor.py +0 -0
- {crfm-helm-0.2.1/src/helm/common → crfm-helm-0.2.2/src/helm/benchmark/window_services}/__init__.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/ai21_window_service.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/anthropic_window_service.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/bloom_window_service.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/encoder_decoder_window_service.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/gpt2_window_service.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/gptj_window_service.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/gptneox_window_service.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/ice_window_service.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/local_window_service.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/luminous_window_service.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/mt_nlg_window_service.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/openai_window_service.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/opt_window_service.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/santacoder_window_service.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/t0pp_window_service.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/t511b_window_service.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/test_ai21_window_service.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/test_bloom_window_service.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/test_cohere_window_service.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/test_cohere_window_service_utils.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/test_gpt2_window_service.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/test_gptj_window_service.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/test_gptneox_window_service.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/test_ice_window_service.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/test_openai_window_service.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/test_opt_window_service.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/test_t0pp_window_service.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/test_t511b_window_service.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/test_ul2_window_service.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/test_utils.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/test_yalm_window_service.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/tokenizer_service.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/ul2_window_service.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/wider_openai_window_service.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/window_service.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/yalm_window_service.py +0 -0
- {crfm-helm-0.2.1/src/helm/proxy → crfm-helm-0.2.2/src/helm/common}/__init__.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/common/authentication.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/common/cache.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/common/codec.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/common/hierarchical_logger.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/common/object_spec.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/common/perspective_api_request.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/common/request.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/common/test_cache.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/common/test_codec.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/common/test_general.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/common/tokenization_request.py +0 -0
- {crfm-helm-0.2.1/src/helm/proxy/clients → crfm-helm-0.2.2/src/helm/proxy}/__init__.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/proxy/accounts.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/proxy/cli.py +0 -0
- {crfm-helm-0.2.1/src/helm/proxy/clients/yalm_tokenizer → crfm-helm-0.2.2/src/helm/proxy/clients}/__init__.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/proxy/clients/ai21_client.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/proxy/clients/anthropic_client.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/proxy/clients/chat_gpt_client.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/proxy/clients/client.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/proxy/clients/cohere_client.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/proxy/clients/google_client.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/proxy/clients/goose_ai_client.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/proxy/clients/ice_tokenizer_client.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/proxy/clients/microsoft_client.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/proxy/clients/perspective_api_client.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/proxy/clients/simple_client.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/proxy/clients/test_client.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/proxy/clients/test_huggingface_client.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/proxy/clients/test_huggingface_tokenizer.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/proxy/clients/test_ice_tokenizer_client.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/proxy/clients/test_yalm_tokenizer_client.py +0 -0
- {crfm-helm-0.2.1/src/helm/proxy/services → crfm-helm-0.2.2/src/helm/proxy/clients/yalm_tokenizer}/__init__.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/proxy/clients/yalm_tokenizer/test_yalm_tokenizer.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/proxy/clients/yalm_tokenizer_client.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/proxy/example_queries.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/proxy/query.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/proxy/retry.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/proxy/server.py +0 -0
- {crfm-helm-0.2.1/src/helm/proxy/token_counters → crfm-helm-0.2.2/src/helm/proxy/services}/__init__.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/proxy/services/remote_service.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/proxy/services/server_service.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/proxy/services/service.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/proxy/services/test_remote_service.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/proxy/services/test_service.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/proxy/test_models.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/proxy/test_retry.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/proxy/token_counters/ai21_token_counter.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/proxy/token_counters/auto_token_counter.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/proxy/token_counters/cohere_token_counter.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/proxy/token_counters/free_token_counter.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/proxy/token_counters/gooseai_token_counter.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/proxy/token_counters/openai_token_counter.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/proxy/token_counters/test_ai21_token_counter.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/proxy/token_counters/test_openai_token_counter.py +0 -0
- {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/proxy/token_counters/token_counter.py +0 -0
|
@@ -4,7 +4,7 @@
|
|
|
4
4
|
[comment]: <> (When using the img tag, which allows us to specify size, src has to be a URL.)
|
|
5
5
|
<img src="https://github.com/stanford-crfm/helm/raw/main/src/helm/benchmark/static/images/helm-logo.png" alt="" width="800"/>
|
|
6
6
|
|
|
7
|
-
Welcome! The **`crfm-helm`** Python package contains code used in the **Holistic Evaluation of Language Models** project ([paper](https://arxiv.org/abs/2211.09110), [website](https://crfm.stanford.edu/helm/
|
|
7
|
+
Welcome! The **`crfm-helm`** Python package contains code used in the **Holistic Evaluation of Language Models** project ([paper](https://arxiv.org/abs/2211.09110), [website](https://crfm.stanford.edu/helm/latest/)) by [Stanford CRFM](https://crfm.stanford.edu/). This package includes the following features:
|
|
8
8
|
|
|
9
9
|
- Collection of datasets in a standard format (e.g., NaturalQuestions)
|
|
10
10
|
- Collection of models accessible via a unified API (e.g., GPT-3, MT-NLG, OPT, BLOOM)
|
|
@@ -6,19 +6,13 @@
|
|
|
6
6
|
#
|
|
7
7
|
# pip freeze | xargs pip uninstall -y
|
|
8
8
|
# pip install -r requirements.txt
|
|
9
|
+
# pip install -r requirements-dev.txt
|
|
9
10
|
# pip freeze | grep -v en-core-web-sm > requirements-freeze.txt
|
|
10
11
|
#
|
|
11
12
|
# Also update the versions in the manual installation steps in pre-commit.sh.
|
|
12
13
|
#
|
|
13
14
|
# Check that everything works because the versions might be upgraded.
|
|
14
15
|
|
|
15
|
-
# Development
|
|
16
|
-
pytest~=7.2.0
|
|
17
|
-
black~=22.10.0
|
|
18
|
-
mypy~=0.982
|
|
19
|
-
pre-commit~=2.20.0
|
|
20
|
-
flake8~=5.0.4
|
|
21
|
-
|
|
22
16
|
# Common
|
|
23
17
|
zstandard~=0.18.0
|
|
24
18
|
tqdm~=4.64.1
|
|
@@ -26,6 +20,7 @@ pyhocon~=0.3.59
|
|
|
26
20
|
dacite~=1.6.0
|
|
27
21
|
|
|
28
22
|
# Proxy
|
|
23
|
+
aleph-alpha-client~=2.14.0
|
|
29
24
|
bottle~=0.12.23
|
|
30
25
|
gunicorn~=20.1.0
|
|
31
26
|
Mako~=1.2.3
|
|
@@ -35,8 +30,9 @@ sqlitedict~=1.7.0
|
|
|
35
30
|
pymongo~=4.2.0
|
|
36
31
|
retrying~=1.3.3
|
|
37
32
|
websocket-client~=1.3.2 # For Anthropic
|
|
38
|
-
openai~=0.
|
|
39
|
-
transformers~=4.
|
|
33
|
+
openai~=0.27.0
|
|
34
|
+
transformers~=4.26.1
|
|
35
|
+
tokenizers~=0.13.2
|
|
40
36
|
icetk~=0.0.4
|
|
41
37
|
protobuf~=3.20.2 # Can't use 4.21.0 due to backward incompatibility
|
|
42
38
|
google-api-python-client~=2.64.0
|
|
@@ -50,6 +46,7 @@ sympy~=1.11.1 # For math scenarios
|
|
|
50
46
|
sentencepiece~=0.1.97
|
|
51
47
|
numba~=0.56.4
|
|
52
48
|
cattrs~=22.2.0
|
|
49
|
+
xlrd~=2.0.1 # Used by pandas.read_excel in ice_scenario
|
|
53
50
|
|
|
54
51
|
# Metrics
|
|
55
52
|
importlib-resources~=5.10.0
|
|
@@ -68,3 +65,9 @@ summ-eval~=0.892
|
|
|
68
65
|
# End users should install a CUDA version of PyTorch manually if needed
|
|
69
66
|
torch~=1.12.1 # Summarization metrics
|
|
70
67
|
torchvision~=0.13.1
|
|
68
|
+
|
|
69
|
+
# plotting
|
|
70
|
+
colorcet~=3.0.1
|
|
71
|
+
matplotlib~=3.6.0
|
|
72
|
+
numpy~=1.23.3
|
|
73
|
+
seaborn~=0.11.0
|
|
@@ -11,7 +11,7 @@ def get_requirements(path: str):
|
|
|
11
11
|
|
|
12
12
|
setup(
|
|
13
13
|
name="crfm-helm",
|
|
14
|
-
version="0.2.1",
|
|
14
|
+
version="0.2.2",
|
|
15
15
|
description="Benchmark for language models",
|
|
16
16
|
long_description="Benchmark for language models",
|
|
17
17
|
url="https://github.com/stanford-crfm/helm",
|
|
@@ -34,6 +34,7 @@ setup(
|
|
|
34
34
|
"helm-run=helm.benchmark.run:main",
|
|
35
35
|
"helm-summarize=helm.benchmark.presentation.summarize:main",
|
|
36
36
|
"helm-server=helm.benchmark.server:main",
|
|
37
|
+
"helm-create-plots=helm.benchmark.presentation.create_plots:main",
|
|
37
38
|
"crfm-proxy-server=helm.proxy.server:main",
|
|
38
39
|
"crfm-proxy-cli=helm.proxy.cli:main",
|
|
39
40
|
]
|
|
@@ -60,6 +60,7 @@ src/helm/benchmark/augmentations/space_perturbation.py
|
|
|
60
60
|
src/helm/benchmark/augmentations/synonym_perturbation.py
|
|
61
61
|
src/helm/benchmark/augmentations/test_perturbation.py
|
|
62
62
|
src/helm/benchmark/augmentations/typos_perturbation.py
|
|
63
|
+
src/helm/benchmark/contamination/__init__.py
|
|
63
64
|
src/helm/benchmark/efficiency_data/inference_denoised_runtimes.json
|
|
64
65
|
src/helm/benchmark/efficiency_data/inference_idealized_runtimes.json
|
|
65
66
|
src/helm/benchmark/efficiency_data/training_efficiency.json
|
|
@@ -103,12 +104,14 @@ src/helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py
|
|
|
103
104
|
src/helm/benchmark/metrics/tokens/token_cost_estimator.py
|
|
104
105
|
src/helm/benchmark/presentation/__init__.py
|
|
105
106
|
src/helm/benchmark/presentation/contamination.py
|
|
107
|
+
src/helm/benchmark/presentation/create_plots.py
|
|
106
108
|
src/helm/benchmark/presentation/run_display.py
|
|
107
109
|
src/helm/benchmark/presentation/run_entry.py
|
|
108
110
|
src/helm/benchmark/presentation/schema.py
|
|
109
111
|
src/helm/benchmark/presentation/summarize.py
|
|
110
112
|
src/helm/benchmark/presentation/table.py
|
|
111
113
|
src/helm/benchmark/presentation/test_contamination.py
|
|
114
|
+
src/helm/benchmark/presentation/test_create_plots.py
|
|
112
115
|
src/helm/benchmark/presentation/test_run_entry.py
|
|
113
116
|
src/helm/benchmark/scenarios/__init__.py
|
|
114
117
|
src/helm/benchmark/scenarios/babi_qa_scenario.py
|
|
@@ -130,6 +133,7 @@ src/helm/benchmark/scenarios/entity_data_imputation_scenario.py
|
|
|
130
133
|
src/helm/benchmark/scenarios/entity_matching_scenario.py
|
|
131
134
|
src/helm/benchmark/scenarios/gsm_scenario.py
|
|
132
135
|
src/helm/benchmark/scenarios/ice_scenario.py
|
|
136
|
+
src/helm/benchmark/scenarios/imdb_listdir.json
|
|
133
137
|
src/helm/benchmark/scenarios/imdb_scenario.py
|
|
134
138
|
src/helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py
|
|
135
139
|
src/helm/benchmark/scenarios/legal_support_scenario.py
|
|
@@ -148,6 +152,7 @@ src/helm/benchmark/scenarios/narrativeqa_scenario.py
|
|
|
148
152
|
src/helm/benchmark/scenarios/natural_qa_scenario.py
|
|
149
153
|
src/helm/benchmark/scenarios/newsqa_scenario.py
|
|
150
154
|
src/helm/benchmark/scenarios/numeracy_scenario.py
|
|
155
|
+
src/helm/benchmark/scenarios/opinions_qa_scenario.py
|
|
151
156
|
src/helm/benchmark/scenarios/pubmed_qa_scenario.py
|
|
152
157
|
src/helm/benchmark/scenarios/quac_scenario.py
|
|
153
158
|
src/helm/benchmark/scenarios/raft_scenario.py
|
|
@@ -173,6 +178,7 @@ src/helm/benchmark/static/index.html
|
|
|
173
178
|
src/helm/benchmark/static/info-icon.png
|
|
174
179
|
src/helm/benchmark/static/json-urls-root.js
|
|
175
180
|
src/helm/benchmark/static/json-urls.js
|
|
181
|
+
src/helm/benchmark/static/plot-captions.js
|
|
176
182
|
src/helm/benchmark/static/schema.yaml
|
|
177
183
|
src/helm/benchmark/static/utils.js
|
|
178
184
|
src/helm/benchmark/static/images/crfm-logo.png
|
|
@@ -200,9 +206,11 @@ src/helm/benchmark/window_services/anthropic_window_service.py
|
|
|
200
206
|
src/helm/benchmark/window_services/bloom_window_service.py
|
|
201
207
|
src/helm/benchmark/window_services/cohere_window_service.py
|
|
202
208
|
src/helm/benchmark/window_services/encoder_decoder_window_service.py
|
|
209
|
+
src/helm/benchmark/window_services/flan_t5_window_service.py
|
|
203
210
|
src/helm/benchmark/window_services/gpt2_window_service.py
|
|
204
211
|
src/helm/benchmark/window_services/gptj_window_service.py
|
|
205
212
|
src/helm/benchmark/window_services/gptneox_window_service.py
|
|
213
|
+
src/helm/benchmark/window_services/huggingface_window_service.py
|
|
206
214
|
src/helm/benchmark/window_services/ice_window_service.py
|
|
207
215
|
src/helm/benchmark/window_services/local_window_service.py
|
|
208
216
|
src/helm/benchmark/window_services/luminous_window_service.py
|
|
@@ -216,6 +224,7 @@ src/helm/benchmark/window_services/test_ai21_window_service.py
|
|
|
216
224
|
src/helm/benchmark/window_services/test_bloom_window_service.py
|
|
217
225
|
src/helm/benchmark/window_services/test_cohere_window_service.py
|
|
218
226
|
src/helm/benchmark/window_services/test_cohere_window_service_utils.py
|
|
227
|
+
src/helm/benchmark/window_services/test_flan_t5_window_service.py
|
|
219
228
|
src/helm/benchmark/window_services/test_gpt2_window_service.py
|
|
220
229
|
src/helm/benchmark/window_services/test_gptj_window_service.py
|
|
221
230
|
src/helm/benchmark/window_services/test_gptneox_window_service.py
|
|
@@ -230,6 +239,7 @@ src/helm/benchmark/window_services/test_utils.py
|
|
|
230
239
|
src/helm/benchmark/window_services/test_yalm_window_service.py
|
|
231
240
|
src/helm/benchmark/window_services/tokenizer_service.py
|
|
232
241
|
src/helm/benchmark/window_services/ul2_window_service.py
|
|
242
|
+
src/helm/benchmark/window_services/wider_ai21_window_service.py
|
|
233
243
|
src/helm/benchmark/window_services/wider_openai_window_service.py
|
|
234
244
|
src/helm/benchmark/window_services/window_service.py
|
|
235
245
|
src/helm/benchmark/window_services/window_service_factory.py
|
|
@@ -268,6 +278,7 @@ src/helm/proxy/clients/cohere_client.py
|
|
|
268
278
|
src/helm/proxy/clients/google_client.py
|
|
269
279
|
src/helm/proxy/clients/goose_ai_client.py
|
|
270
280
|
src/helm/proxy/clients/huggingface_client.py
|
|
281
|
+
src/helm/proxy/clients/huggingface_model_registry.py
|
|
271
282
|
src/helm/proxy/clients/huggingface_tokenizer.py
|
|
272
283
|
src/helm/proxy/clients/ice_tokenizer_client.py
|
|
273
284
|
src/helm/proxy/clients/microsoft_client.py
|
|
@@ -276,6 +287,7 @@ src/helm/proxy/clients/perspective_api_client.py
|
|
|
276
287
|
src/helm/proxy/clients/simple_client.py
|
|
277
288
|
src/helm/proxy/clients/test_client.py
|
|
278
289
|
src/helm/proxy/clients/test_huggingface_client.py
|
|
290
|
+
src/helm/proxy/clients/test_huggingface_model_registry.py
|
|
279
291
|
src/helm/proxy/clients/test_huggingface_tokenizer.py
|
|
280
292
|
src/helm/proxy/clients/test_ice_tokenizer_client.py
|
|
281
293
|
src/helm/proxy/clients/test_yalm_tokenizer_client.py
|
|
@@ -283,6 +295,7 @@ src/helm/proxy/clients/together_client.py
|
|
|
283
295
|
src/helm/proxy/clients/yalm_tokenizer_client.py
|
|
284
296
|
src/helm/proxy/clients/yalm_tokenizer/__init__.py
|
|
285
297
|
src/helm/proxy/clients/yalm_tokenizer/test_yalm_tokenizer.py
|
|
298
|
+
src/helm/proxy/clients/yalm_tokenizer/voc_100b.sp
|
|
286
299
|
src/helm/proxy/clients/yalm_tokenizer/yalm_tokenizer.py
|
|
287
300
|
src/helm/proxy/services/__init__.py
|
|
288
301
|
src/helm/proxy/services/remote_service.py
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
[console_scripts]
|
|
2
2
|
crfm-proxy-cli = helm.proxy.cli:main
|
|
3
3
|
crfm-proxy-server = helm.proxy.server:main
|
|
4
|
+
helm-create-plots = helm.benchmark.presentation.create_plots:main
|
|
4
5
|
helm-run = helm.benchmark.run:main
|
|
5
6
|
helm-server = helm.benchmark.server:main
|
|
6
7
|
helm-summarize = helm.benchmark.presentation.summarize:main
|
|
@@ -1,12 +1,8 @@
|
|
|
1
|
-
pytest~=7.2.0
|
|
2
|
-
black~=22.10.0
|
|
3
|
-
mypy~=0.982
|
|
4
|
-
pre-commit~=2.20.0
|
|
5
|
-
flake8~=5.0.4
|
|
6
1
|
zstandard~=0.18.0
|
|
7
2
|
tqdm~=4.64.1
|
|
8
3
|
pyhocon~=0.3.59
|
|
9
4
|
dacite~=1.6.0
|
|
5
|
+
aleph-alpha-client~=2.14.0
|
|
10
6
|
bottle~=0.12.23
|
|
11
7
|
gunicorn~=20.1.0
|
|
12
8
|
Mako~=1.2.3
|
|
@@ -14,8 +10,9 @@ sqlitedict~=1.7.0
|
|
|
14
10
|
pymongo~=4.2.0
|
|
15
11
|
retrying~=1.3.3
|
|
16
12
|
websocket-client~=1.3.2
|
|
17
|
-
openai~=0.
|
|
18
|
-
transformers~=4.
|
|
13
|
+
openai~=0.27.0
|
|
14
|
+
transformers~=4.26.1
|
|
15
|
+
tokenizers~=0.13.2
|
|
19
16
|
icetk~=0.0.4
|
|
20
17
|
protobuf~=3.20.2
|
|
21
18
|
google-api-python-client~=2.64.0
|
|
@@ -27,6 +24,7 @@ sympy~=1.11.1
|
|
|
27
24
|
sentencepiece~=0.1.97
|
|
28
25
|
numba~=0.56.4
|
|
29
26
|
cattrs~=22.2.0
|
|
27
|
+
xlrd~=2.0.1
|
|
30
28
|
importlib-resources~=5.10.0
|
|
31
29
|
nltk~=3.7
|
|
32
30
|
scipy~=1.9.1
|
|
@@ -40,3 +38,7 @@ spacy~=3.2.4
|
|
|
40
38
|
summ-eval~=0.892
|
|
41
39
|
torch~=1.12.1
|
|
42
40
|
torchvision~=0.13.1
|
|
41
|
+
colorcet~=3.0.1
|
|
42
|
+
matplotlib~=3.6.0
|
|
43
|
+
numpy~=1.23.3
|
|
44
|
+
seaborn~=0.11.0
|
|
@@ -42,6 +42,8 @@ from .scenarios import legal_support_scenario # noqa
|
|
|
42
42
|
from .scenarios import entity_matching_scenario # noqa
|
|
43
43
|
from .scenarios import entity_data_imputation_scenario # noqa
|
|
44
44
|
from .scenarios import big_bench_scenario # noqa
|
|
45
|
+
from .scenarios import opinions_qa_scenario # noqa
|
|
46
|
+
|
|
45
47
|
|
|
46
48
|
# Biomedical
|
|
47
49
|
from .scenarios import covid_dialog_scenario # noqa
|
|
@@ -68,6 +68,9 @@ class AdapterSpec:
|
|
|
68
68
|
# set of training instances. Used to compute error bars.
|
|
69
69
|
num_train_trials: int = 1
|
|
70
70
|
|
|
71
|
+
# If true, randomly sample N training examples; if false, select N consecutive training examples
|
|
72
|
+
sample_train: bool = True
|
|
73
|
+
|
|
71
74
|
# Decoding parameters (inherited by `Request`)
|
|
72
75
|
|
|
73
76
|
# Model to make the request to (need to fill in)
|
|
@@ -23,7 +23,7 @@ class InContextLearningAdapter(Adapter, ABC):
|
|
|
23
23
|
@htrack(None)
|
|
24
24
|
def adapt(self, instances: List[Instance], parallelism: int) -> ScenarioState:
|
|
25
25
|
"""
|
|
26
|
-
Takes a
|
|
26
|
+
Takes a list of `Instance`s and builds a list of corresponding `RequestState`s.
|
|
27
27
|
The reason we don't do this per eval instance is that we create a common set of
|
|
28
28
|
training instances which is shared across all eval instances.
|
|
29
29
|
"""
|
|
@@ -65,7 +65,9 @@ class InContextLearningAdapter(Adapter, ABC):
|
|
|
65
65
|
parallelism: int,
|
|
66
66
|
) -> List[RequestState]:
|
|
67
67
|
self.train_trial_index: int = train_trial_index
|
|
68
|
-
self.train_instances: List[Instance] = self.sample_examples(
|
|
68
|
+
self.train_instances: List[Instance] = self.sample_examples(
|
|
69
|
+
all_train_instances, seed=train_trial_index, sample_train=self.adapter_spec.sample_train
|
|
70
|
+
)
|
|
69
71
|
hlog(f"Sampled {len(self.train_instances)} examples for trial #{self.train_trial_index}.")
|
|
70
72
|
|
|
71
73
|
# Generate request_states
|
|
@@ -93,7 +95,9 @@ class InContextLearningAdapter(Adapter, ABC):
|
|
|
93
95
|
|
|
94
96
|
return [request_state for result in results for request_state in result]
|
|
95
97
|
|
|
96
|
-
def sample_examples(
|
|
98
|
+
def sample_examples(
|
|
99
|
+
self, all_train_instances: List[Instance], seed: int, sample_train: bool = True
|
|
100
|
+
) -> List[Instance]:
|
|
97
101
|
"""
|
|
98
102
|
Sample a random set of train instances to use as examples by following the steps below:
|
|
99
103
|
1. Sort the class labels (i.e., correct References) by the number of Instances that belong to the
|
|
@@ -121,9 +125,14 @@ class InContextLearningAdapter(Adapter, ABC):
|
|
|
121
125
|
random.seed(seed)
|
|
122
126
|
num_instances_to_sample: int = min(len(all_train_instances), self.adapter_spec.max_train_instances)
|
|
123
127
|
|
|
128
|
+
examples: List[Instance] = []
|
|
129
|
+
if not sample_train:
|
|
130
|
+
# Select sequentially from the train set
|
|
131
|
+
examples = all_train_instances[num_instances_to_sample * seed : num_instances_to_sample * (seed + 1)]
|
|
132
|
+
return examples
|
|
133
|
+
|
|
124
134
|
unlabeled_instances: List[Instance] = []
|
|
125
135
|
label_to_instances: Dict[str, List[Instance]] = defaultdict(list)
|
|
126
|
-
|
|
127
136
|
for instance in all_train_instances:
|
|
128
137
|
if instance.first_correct_reference:
|
|
129
138
|
label_to_instances[instance.first_correct_reference.output.text].append(instance)
|
|
@@ -145,7 +154,6 @@ class InContextLearningAdapter(Adapter, ABC):
|
|
|
145
154
|
sorted_labels.extend(labels)
|
|
146
155
|
|
|
147
156
|
labels_iterable = cycle(sorted_labels)
|
|
148
|
-
examples: List[Instance] = []
|
|
149
157
|
while num_instances_to_sample > 0:
|
|
150
158
|
next_label: Optional[str] = next(labels_iterable, None)
|
|
151
159
|
if not next_label:
|
|
@@ -218,10 +226,15 @@ class InContextLearningAdapter(Adapter, ABC):
|
|
|
218
226
|
|
|
219
227
|
# References (optionally) and output
|
|
220
228
|
output: str
|
|
229
|
+
|
|
230
|
+
delimiter = ","
|
|
221
231
|
if reference_index is None:
|
|
222
232
|
# Put only the correct reference as the output
|
|
223
|
-
|
|
224
|
-
|
|
233
|
+
correct_references: List[Reference] = instance.all_correct_references
|
|
234
|
+
if not correct_references:
|
|
235
|
+
output = "n/a"
|
|
236
|
+
else:
|
|
237
|
+
output = delimiter.join([correct_reference.output.text for correct_reference in correct_references])
|
|
225
238
|
else:
|
|
226
239
|
reference = instance.references[reference_index]
|
|
227
240
|
output = reference.output.text
|
|
@@ -1,6 +1,7 @@
|
|
|
1
|
-
from typing import List
|
|
1
|
+
from typing import List, Optional
|
|
2
2
|
|
|
3
3
|
from sklearn.metrics import f1_score
|
|
4
|
+
from sklearn.preprocessing import MultiLabelBinarizer
|
|
4
5
|
|
|
5
6
|
from helm.benchmark.adaptation.request_state import RequestState
|
|
6
7
|
from helm.benchmark.metrics.basic_metrics import normalize_text
|
|
@@ -20,8 +21,7 @@ class ClassificationMetric(Metric):
|
|
|
20
21
|
|
|
21
22
|
Note:
|
|
22
23
|
- The set of classes is derived from the correct references from all the instances.
|
|
23
|
-
This means that classes may be omitted if they
|
|
24
|
-
reference.
|
|
24
|
+
This means that classes may be omitted if they are never used as a correct reference.
|
|
25
25
|
- Generations that are not in any of the known classes are counted as a
|
|
26
26
|
negative prediction for every class.
|
|
27
27
|
- Perturbed classes are considered different classes from unperturbed
|
|
@@ -29,10 +29,16 @@ class ClassificationMetric(Metric):
|
|
|
29
29
|
- Currently, multi-label classification is not supported.
|
|
30
30
|
"""
|
|
31
31
|
|
|
32
|
+
def __init__(self, delimiter: Optional[str] = None):
|
|
33
|
+
self.delimiter = delimiter
|
|
34
|
+
|
|
35
|
+
def is_multi_label(self) -> bool:
|
|
36
|
+
return bool(self.delimiter)
|
|
37
|
+
|
|
32
38
|
def evaluate_instances(self, request_states: List[RequestState]) -> List[Stat]:
|
|
33
|
-
y_pred: List[str] = []
|
|
34
|
-
y_true: List[str] = []
|
|
35
|
-
for request_state in request_states:
|
|
39
|
+
y_pred: List[List[str]] = []
|
|
40
|
+
y_true: List[List[str]] = []
|
|
41
|
+
for request_state in request_states: # one request state per instance
|
|
36
42
|
# Only the generation adapter is supported.
|
|
37
43
|
# TODO: Support multiple_choice_* adapters.
|
|
38
44
|
if request_state.reference_index is not None:
|
|
@@ -42,24 +48,23 @@ class ClassificationMetric(Metric):
|
|
|
42
48
|
assert request_state.result is not None
|
|
43
49
|
if len(request_state.result.completions) != 1:
|
|
44
50
|
raise ValueError("Result must contain exactly one completion")
|
|
45
|
-
|
|
46
|
-
num_correct = 0
|
|
47
|
-
for reference in request_state.instance.references:
|
|
48
|
-
if reference.is_correct:
|
|
49
|
-
num_correct += 1
|
|
50
|
-
y_true.append(normalize_text(reference.output.text))
|
|
51
|
-
if num_correct != 1:
|
|
52
|
-
# TODO: Support multi-label classification.
|
|
53
|
-
raise ValueError("ClassificationMetric does not support multi-label classification")
|
|
54
51
|
if request_state.output_mapping:
|
|
55
52
|
raise ValueError("ClassificationMetric does not support multiple choice adapters")
|
|
56
|
-
|
|
57
|
-
|
|
53
|
+
|
|
54
|
+
references = request_state.instance.all_correct_references
|
|
55
|
+
if not self.is_multi_label():
|
|
56
|
+
assert len(references) == 1
|
|
57
|
+
correct_ref_texts = [normalize_text(ref.output.text) for ref in references if ref.output.text]
|
|
58
|
+
y_true.append(correct_ref_texts)
|
|
59
|
+
|
|
60
|
+
input_text = request_state.result.completions[0].text
|
|
61
|
+
predictions = input_text.split(self.delimiter) if self.is_multi_label() else [input_text]
|
|
62
|
+
y_pred.append([normalize_text(pred) for pred in predictions if pred])
|
|
63
|
+
labels: List[str] = list(set(y for ys in y_true for y in ys))
|
|
64
|
+
mlb = MultiLabelBinarizer().fit([labels])
|
|
65
|
+
y_true = mlb.transform(y_true)
|
|
66
|
+
y_pred = mlb.transform(y_pred)
|
|
58
67
|
return [
|
|
59
|
-
Stat(MetricName("classification_macro_f1")).add(
|
|
60
|
-
|
|
61
|
-
),
|
|
62
|
-
Stat(MetricName("classification_micro_f1")).add(
|
|
63
|
-
f1_score(y_pred=y_pred, y_true=y_true, labels=list(labels), average="micro")
|
|
64
|
-
),
|
|
68
|
+
Stat(MetricName("classification_macro_f1")).add(f1_score(y_pred=y_pred, y_true=y_true, average="macro")),
|
|
69
|
+
Stat(MetricName("classification_micro_f1")).add(f1_score(y_pred=y_pred, y_true=y_true, average="micro")),
|
|
65
70
|
]
|
{crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/test_classification_metrics.py
RENAMED
|
@@ -63,7 +63,8 @@ def _expected_stats(all_classes_counts: Dict[str, Dict[str, int]]):
|
|
|
63
63
|
|
|
64
64
|
|
|
65
65
|
def test_evaluate_instances_binary_generation():
|
|
66
|
-
metric = ClassificationMetric()
|
|
66
|
+
metric = ClassificationMetric(delimiter=None)
|
|
67
|
+
|
|
67
68
|
request_states = [
|
|
68
69
|
_request_state("yes", [_Option("yes", True)]),
|
|
69
70
|
_request_state("yes", [_Option("yes", True)]),
|
|
@@ -86,20 +87,21 @@ def test_evaluate_instances_binary_generation():
|
|
|
86
87
|
|
|
87
88
|
|
|
88
89
|
def test_evaluate_instances_multi_class():
|
|
89
|
-
|
|
90
|
+
# Note: no "a" because it would get filtered out by normalize_text()
|
|
91
|
+
metric = ClassificationMetric(delimiter=None)
|
|
90
92
|
|
|
91
93
|
def _options(correct: str):
|
|
92
|
-
return [_Option(text, text == correct) for text in ["a", "b", "c"]]
|
|
94
|
+
return [_Option(text, text == correct) for text in ["d", "b", "c"]]
|
|
93
95
|
|
|
94
96
|
request_states = [
|
|
95
|
-
_request_state("a", _options("a")),
|
|
96
|
-
_request_state("a", _options("a")),
|
|
97
|
-
_request_state("a", _options("a")),
|
|
98
|
-
_request_state("a", _options("b")),
|
|
97
|
+
_request_state("d", _options("d")),
|
|
98
|
+
_request_state("d", _options("d")),
|
|
99
|
+
_request_state("d", _options("d")),
|
|
100
|
+
_request_state("d", _options("b")),
|
|
99
101
|
_request_state("b", _options("b")),
|
|
100
102
|
_request_state("b", _options("b")),
|
|
101
103
|
_request_state("b", _options("c")),
|
|
102
|
-
_request_state("c", _options("a")),
|
|
104
|
+
_request_state("c", _options("d")),
|
|
103
105
|
_request_state("c", _options("c")),
|
|
104
106
|
_request_state("invalid", _options("c")),
|
|
105
107
|
]
|
|
@@ -107,9 +109,42 @@ def test_evaluate_instances_multi_class():
|
|
|
107
109
|
metric.evaluate_instances(request_states),
|
|
108
110
|
_expected_stats(
|
|
109
111
|
{
|
|
110
|
-
"
|
|
112
|
+
"d": {"tp": 3, "fp": 1, "tn": 5, "fn": 1},
|
|
111
113
|
"b": {"tp": 2, "fp": 1, "tn": 6, "fn": 1},
|
|
112
114
|
"c": {"tp": 1, "fp": 1, "tn": 6, "fn": 2},
|
|
113
115
|
}
|
|
114
116
|
),
|
|
115
117
|
)
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def test_evaluate_instances_multilabel():
|
|
121
|
+
# Note: no "a" because it would get filtered out by normalize_text()
|
|
122
|
+
metric = ClassificationMetric(delimiter=",")
|
|
123
|
+
|
|
124
|
+
def _options(correct: List[str]):
|
|
125
|
+
return [_Option(text, text in correct) for text in ["d", "b", "c"]]
|
|
126
|
+
|
|
127
|
+
request_states = [
|
|
128
|
+
_request_state("d,b", _options(["d", "b"])),
|
|
129
|
+
_request_state("d,b", _options(["d", "c"])),
|
|
130
|
+
_request_state("d", _options(["d"])),
|
|
131
|
+
_request_state("c", _options(["b"])),
|
|
132
|
+
_request_state("b", _options(["b", "c"])),
|
|
133
|
+
_request_state("d,b", _options(["c"])),
|
|
134
|
+
_request_state("d,c", _options(["d"])),
|
|
135
|
+
_request_state("d,b,c", _options(["d", "b", "c"])),
|
|
136
|
+
_request_state("", []),
|
|
137
|
+
_request_state("n/a", []),
|
|
138
|
+
_request_state("invalid", _options(["c"])),
|
|
139
|
+
]
|
|
140
|
+
|
|
141
|
+
assert_stats_equal(
|
|
142
|
+
metric.evaluate_instances(request_states),
|
|
143
|
+
_expected_stats(
|
|
144
|
+
{
|
|
145
|
+
"d": {"tp": 5, "fp": 1, "tn": 5, "fn": 0},
|
|
146
|
+
"b": {"tp": 3, "fp": 2, "tn": 5, "fn": 1},
|
|
147
|
+
"c": {"tp": 1, "fp": 2, "tn": 4, "fn": 4},
|
|
148
|
+
}
|
|
149
|
+
),
|
|
150
|
+
)
|