crfm-helm 0.2.1__tar.gz → 0.2.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (316) hide show
  1. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/MANIFEST.in +1 -0
  2. {crfm-helm-0.2.1/src/crfm_helm.egg-info → crfm-helm-0.2.2}/PKG-INFO +1 -1
  3. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/README.md +1 -1
  4. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/requirements.txt +12 -9
  5. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/setup.py +2 -1
  6. {crfm-helm-0.2.1 → crfm-helm-0.2.2/src/crfm_helm.egg-info}/PKG-INFO +1 -1
  7. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/crfm_helm.egg-info/SOURCES.txt +13 -0
  8. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/crfm_helm.egg-info/entry_points.txt +1 -0
  9. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/crfm_helm.egg-info/requires.txt +9 -7
  10. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/__init__.py +2 -0
  11. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/adaptation/adapter_spec.py +3 -0
  12. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +20 -7
  13. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/classification_metrics.py +28 -23
  14. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/test_classification_metrics.py +44 -9
  15. crfm-helm-0.2.2/src/helm/benchmark/presentation/create_plots.py +617 -0
  16. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/presentation/summarize.py +4 -2
  17. crfm-helm-0.2.2/src/helm/benchmark/presentation/test_create_plots.py +32 -0
  18. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/run.py +23 -1
  19. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/run_expander.py +161 -47
  20. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/run_specs.py +84 -10
  21. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/runner.py +31 -3
  22. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/copyright_scenario.py +1 -1
  23. crfm-helm-0.2.2/src/helm/benchmark/scenarios/imdb_listdir.json +50014 -0
  24. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/lex_glue_scenario.py +58 -17
  25. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/lextreme_scenario.py +37 -25
  26. crfm-helm-0.2.2/src/helm/benchmark/scenarios/opinions_qa_scenario.py +194 -0
  27. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/scenario.py +5 -0
  28. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/the_pile_scenario.py +1 -1
  29. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/static/benchmarking.css +14 -0
  30. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/static/benchmarking.js +43 -0
  31. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/static/index.html +2 -0
  32. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/static/json-urls.js +4 -0
  33. crfm-helm-0.2.2/src/helm/benchmark/static/plot-captions.js +16 -0
  34. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/static/schema.yaml +66 -8
  35. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/cohere_window_service.py +20 -0
  36. crfm-helm-0.2.2/src/helm/benchmark/window_services/flan_t5_window_service.py +29 -0
  37. crfm-helm-0.2.2/src/helm/benchmark/window_services/huggingface_window_service.py +39 -0
  38. crfm-helm-0.2.2/src/helm/benchmark/window_services/test_flan_t5_window_service.py +12 -0
  39. crfm-helm-0.2.2/src/helm/benchmark/window_services/wider_ai21_window_service.py +13 -0
  40. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/window_service_factory.py +27 -6
  41. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/common/general.py +12 -5
  42. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/proxy/clients/aleph_alpha_client.py +47 -28
  43. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/proxy/clients/auto_client.py +28 -24
  44. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/proxy/clients/huggingface_client.py +30 -17
  45. crfm-helm-0.2.2/src/helm/proxy/clients/huggingface_model_registry.py +111 -0
  46. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/proxy/clients/huggingface_tokenizer.py +23 -7
  47. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/proxy/clients/openai_client.py +60 -2
  48. crfm-helm-0.2.2/src/helm/proxy/clients/test_huggingface_model_registry.py +57 -0
  49. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/proxy/clients/together_client.py +17 -2
  50. crfm-helm-0.2.2/src/helm/proxy/clients/yalm_tokenizer/voc_100b.sp +0 -0
  51. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/proxy/clients/yalm_tokenizer/yalm_tokenizer.py +8 -2
  52. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/proxy/models.py +82 -2
  53. crfm-helm-0.2.2/src/helm/proxy/token_counters/__init__.py +0 -0
  54. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/LICENSE +0 -0
  55. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/pyproject.toml +0 -0
  56. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/setup.cfg +0 -0
  57. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/crfm_helm.egg-info/dependency_links.txt +0 -0
  58. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/crfm_helm.egg-info/not-zip-safe +0 -0
  59. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/crfm_helm.egg-info/top_level.txt +0 -0
  60. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/__init__.py +0 -0
  61. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/adaptation/__init__.py +0 -0
  62. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/adaptation/adapters/__init__.py +0 -0
  63. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/adaptation/adapters/adapter.py +0 -0
  64. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/adaptation/adapters/adapter_factory.py +0 -0
  65. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +0 -0
  66. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/adaptation/adapters/generation_adapter.py +0 -0
  67. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/adaptation/adapters/language_modeling_adapter.py +0 -0
  68. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +0 -0
  69. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +0 -0
  70. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +0 -0
  71. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/adaptation/adapters/test_adapter.py +0 -0
  72. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/adaptation/adapters/test_generation_adapter.py +0 -0
  73. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +0 -0
  74. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +0 -0
  75. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/adaptation/prompt.py +0 -0
  76. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/adaptation/request_state.py +0 -0
  77. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/adaptation/scenario_state.py +0 -0
  78. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/augmentations/__init__.py +0 -0
  79. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/augmentations/contraction_expansion_perturbation.py +0 -0
  80. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/augmentations/contrast_sets_perturbation.py +0 -0
  81. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/augmentations/correct_to_misspelling.json +0 -0
  82. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/augmentations/data_augmenter.py +0 -0
  83. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/augmentations/dialect_perturbation.py +0 -0
  84. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/augmentations/extra_space_perturbation.py +0 -0
  85. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/augmentations/filler_words_perturbation.py +0 -0
  86. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/augmentations/gender_perturbation.py +0 -0
  87. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/augmentations/lowercase_perturbation.py +0 -0
  88. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/augmentations/mild_mix_perturbation.py +0 -0
  89. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/augmentations/misspelling_perturbation.py +0 -0
  90. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/augmentations/person_name_perturbation.py +0 -0
  91. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/augmentations/perturbation.py +0 -0
  92. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/augmentations/perturbation_description.py +0 -0
  93. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/augmentations/space_perturbation.py +0 -0
  94. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/augmentations/synonym_perturbation.py +0 -0
  95. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/augmentations/test_perturbation.py +0 -0
  96. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/augmentations/typos_perturbation.py +0 -0
  97. {crfm-helm-0.2.1/src/helm/benchmark/metrics → crfm-helm-0.2.2/src/helm/benchmark/contamination}/__init__.py +0 -0
  98. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/data_preprocessor.py +0 -0
  99. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/efficiency_data/inference_denoised_runtimes.json +0 -0
  100. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/efficiency_data/inference_idealized_runtimes.json +0 -0
  101. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/efficiency_data/training_efficiency.json +0 -0
  102. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/executor.py +0 -0
  103. {crfm-helm-0.2.1/src/helm/benchmark/metrics/summac → crfm-helm-0.2.2/src/helm/benchmark/metrics}/__init__.py +0 -0
  104. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/basic_metrics.py +0 -0
  105. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/bbq_metrics.py +0 -0
  106. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/bias_metrics.py +0 -0
  107. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/bias_word_lists.py +0 -0
  108. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/code_metrics.py +0 -0
  109. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/code_metrics_helper.py +0 -0
  110. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/copyright_metrics.py +0 -0
  111. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/disinformation_metrics.py +0 -0
  112. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/machine_translation_metrics.py +0 -0
  113. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/metric.py +0 -0
  114. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/metric_name.py +0 -0
  115. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/metric_service.py +0 -0
  116. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/numeracy_metrics.py +0 -0
  117. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/ranking_metrics.py +0 -0
  118. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/statistic.py +0 -0
  119. {crfm-helm-0.2.1/src/helm/benchmark/metrics/tokens → crfm-helm-0.2.2/src/helm/benchmark/metrics/summac}/__init__.py +0 -0
  120. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/summac/model_summac.py +0 -0
  121. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/summac/utils_misc.py +0 -0
  122. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/summarization_metrics.py +0 -0
  123. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/test_bias_metrics.py +0 -0
  124. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/test_metric.py +0 -0
  125. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/test_numeracy_metrics.py +0 -0
  126. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/test_statistic.py +0 -0
  127. {crfm-helm-0.2.1/src/helm/benchmark/presentation → crfm-helm-0.2.2/src/helm/benchmark/metrics/tokens}/__init__.py +0 -0
  128. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +0 -0
  129. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +0 -0
  130. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +0 -0
  131. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/tokens/free_token_cost_estimator.py +0 -0
  132. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +0 -0
  133. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +0 -0
  134. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +0 -0
  135. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +0 -0
  136. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/tokens/token_cost_estimator.py +0 -0
  137. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/tokens_metric.py +0 -0
  138. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/toxicity_metrics.py +0 -0
  139. {crfm-helm-0.2.1/src/helm/benchmark/scenarios → crfm-helm-0.2.2/src/helm/benchmark/presentation}/__init__.py +0 -0
  140. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/presentation/contamination.py +0 -0
  141. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/presentation/run_display.py +0 -0
  142. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/presentation/run_entry.py +0 -0
  143. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/presentation/schema.py +0 -0
  144. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/presentation/table.py +0 -0
  145. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/presentation/test_contamination.py +0 -0
  146. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/presentation/test_run_entry.py +0 -0
  147. {crfm-helm-0.2.1/src/helm/benchmark/window_services → crfm-helm-0.2.2/src/helm/benchmark/scenarios}/__init__.py +0 -0
  148. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/babi_qa_scenario.py +0 -0
  149. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/bbq_scenario.py +0 -0
  150. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/big_bench_scenario.py +0 -0
  151. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/blimp_scenario.py +0 -0
  152. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/bold_scenario.py +0 -0
  153. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/boolq_scenario.py +0 -0
  154. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/civil_comments_scenario.py +0 -0
  155. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/code_scenario.py +0 -0
  156. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/code_scenario_helper.py +0 -0
  157. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/commonsense_scenario.py +0 -0
  158. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/covid_dialog_scenario.py +0 -0
  159. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/dialogue_scenarios.py +0 -0
  160. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/disinformation_scenario.py +0 -0
  161. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/dyck_language_scenario.py +0 -0
  162. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/entity_data_imputation_scenario.py +0 -0
  163. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/entity_matching_scenario.py +0 -0
  164. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/gsm_scenario.py +0 -0
  165. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/ice_scenario.py +0 -0
  166. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/imdb_scenario.py +0 -0
  167. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +0 -0
  168. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/legal_support_scenario.py +0 -0
  169. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/lsat_qa_scenario.py +0 -0
  170. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/math_scenario.py +0 -0
  171. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/me_q_sum_scenario.py +0 -0
  172. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/med_dialog_scenario.py +0 -0
  173. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/med_mcqa_scenario.py +0 -0
  174. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +0 -0
  175. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/med_qa_scenario.py +0 -0
  176. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/mmlu_scenario.py +0 -0
  177. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/msmarco_scenario.py +0 -0
  178. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/narrativeqa_scenario.py +0 -0
  179. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/natural_qa_scenario.py +0 -0
  180. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/newsqa_scenario.py +0 -0
  181. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/numeracy_scenario.py +0 -0
  182. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/pubmed_qa_scenario.py +0 -0
  183. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/quac_scenario.py +0 -0
  184. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/raft_scenario.py +0 -0
  185. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +0 -0
  186. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/simple_scenarios.py +0 -0
  187. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/summarization_scenario.py +0 -0
  188. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/synthetic_efficiency_scenario.py +0 -0
  189. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +0 -0
  190. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/synthetic_reasoning_scenario.py +0 -0
  191. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/test_scenario.py +0 -0
  192. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/truthful_qa_scenario.py +0 -0
  193. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/twitter_aae_scenario.py +0 -0
  194. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/wikifact_scenario.py +0 -0
  195. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/wikitext_103_scenario.py +0 -0
  196. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/wmt_14_scenario.py +0 -0
  197. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/server.py +0 -0
  198. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/static/contamination.yaml +0 -0
  199. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/static/general.js +0 -0
  200. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/static/images/crfm-logo.png +0 -0
  201. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/static/images/helm-logo-simple.png +0 -0
  202. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/static/images/helm-logo.png +0 -0
  203. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/static/images/language-model-helm.png +0 -0
  204. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/static/images/organizations/ai21.png +0 -0
  205. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/static/images/organizations/anthropic.png +0 -0
  206. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/static/images/organizations/bigscience.png +0 -0
  207. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/static/images/organizations/cohere.png +0 -0
  208. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/static/images/organizations/eleutherai.png +0 -0
  209. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/static/images/organizations/google.png +0 -0
  210. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/static/images/organizations/meta.png +0 -0
  211. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/static/images/organizations/microsoft.png +0 -0
  212. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/static/images/organizations/nvidia.png +0 -0
  213. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/static/images/organizations/openai.png +0 -0
  214. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/static/images/organizations/together.png +0 -0
  215. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/static/images/organizations/tsinghua-keg.png +0 -0
  216. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/static/images/organizations/yandex.png +0 -0
  217. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/static/images/scenarios-by-metrics.png +0 -0
  218. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/static/images/taxonomy-scenarios.png +0 -0
  219. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/static/info-icon.png +0 -0
  220. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/static/json-urls-root.js +0 -0
  221. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/static/utils.js +0 -0
  222. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/test_data_preprocessor.py +0 -0
  223. {crfm-helm-0.2.1/src/helm/common → crfm-helm-0.2.2/src/helm/benchmark/window_services}/__init__.py +0 -0
  224. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/ai21_window_service.py +0 -0
  225. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/anthropic_window_service.py +0 -0
  226. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/bloom_window_service.py +0 -0
  227. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/encoder_decoder_window_service.py +0 -0
  228. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/gpt2_window_service.py +0 -0
  229. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/gptj_window_service.py +0 -0
  230. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/gptneox_window_service.py +0 -0
  231. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/ice_window_service.py +0 -0
  232. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/local_window_service.py +0 -0
  233. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/luminous_window_service.py +0 -0
  234. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/mt_nlg_window_service.py +0 -0
  235. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/openai_window_service.py +0 -0
  236. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/opt_window_service.py +0 -0
  237. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/santacoder_window_service.py +0 -0
  238. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/t0pp_window_service.py +0 -0
  239. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/t511b_window_service.py +0 -0
  240. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/test_ai21_window_service.py +0 -0
  241. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/test_bloom_window_service.py +0 -0
  242. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/test_cohere_window_service.py +0 -0
  243. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/test_cohere_window_service_utils.py +0 -0
  244. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/test_gpt2_window_service.py +0 -0
  245. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/test_gptj_window_service.py +0 -0
  246. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/test_gptneox_window_service.py +0 -0
  247. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/test_ice_window_service.py +0 -0
  248. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -0
  249. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/test_openai_window_service.py +0 -0
  250. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/test_opt_window_service.py +0 -0
  251. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/test_t0pp_window_service.py +0 -0
  252. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/test_t511b_window_service.py +0 -0
  253. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/test_ul2_window_service.py +0 -0
  254. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/test_utils.py +0 -0
  255. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/test_yalm_window_service.py +0 -0
  256. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/tokenizer_service.py +0 -0
  257. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/ul2_window_service.py +0 -0
  258. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/wider_openai_window_service.py +0 -0
  259. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/window_service.py +0 -0
  260. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/yalm_window_service.py +0 -0
  261. {crfm-helm-0.2.1/src/helm/proxy → crfm-helm-0.2.2/src/helm/common}/__init__.py +0 -0
  262. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/common/authentication.py +0 -0
  263. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/common/cache.py +0 -0
  264. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/common/codec.py +0 -0
  265. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/common/hierarchical_logger.py +0 -0
  266. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/common/object_spec.py +0 -0
  267. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/common/perspective_api_request.py +0 -0
  268. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/common/request.py +0 -0
  269. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/common/test_cache.py +0 -0
  270. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/common/test_codec.py +0 -0
  271. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/common/test_general.py +0 -0
  272. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/common/tokenization_request.py +0 -0
  273. {crfm-helm-0.2.1/src/helm/proxy/clients → crfm-helm-0.2.2/src/helm/proxy}/__init__.py +0 -0
  274. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/proxy/accounts.py +0 -0
  275. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/proxy/cli.py +0 -0
  276. {crfm-helm-0.2.1/src/helm/proxy/clients/yalm_tokenizer → crfm-helm-0.2.2/src/helm/proxy/clients}/__init__.py +0 -0
  277. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/proxy/clients/ai21_client.py +0 -0
  278. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/proxy/clients/anthropic_client.py +0 -0
  279. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/proxy/clients/chat_gpt_client.py +0 -0
  280. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/proxy/clients/client.py +0 -0
  281. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/proxy/clients/cohere_client.py +0 -0
  282. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/proxy/clients/google_client.py +0 -0
  283. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/proxy/clients/goose_ai_client.py +0 -0
  284. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/proxy/clients/ice_tokenizer_client.py +0 -0
  285. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/proxy/clients/microsoft_client.py +0 -0
  286. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/proxy/clients/perspective_api_client.py +0 -0
  287. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/proxy/clients/simple_client.py +0 -0
  288. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/proxy/clients/test_client.py +0 -0
  289. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/proxy/clients/test_huggingface_client.py +0 -0
  290. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/proxy/clients/test_huggingface_tokenizer.py +0 -0
  291. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/proxy/clients/test_ice_tokenizer_client.py +0 -0
  292. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/proxy/clients/test_yalm_tokenizer_client.py +0 -0
  293. {crfm-helm-0.2.1/src/helm/proxy/services → crfm-helm-0.2.2/src/helm/proxy/clients/yalm_tokenizer}/__init__.py +0 -0
  294. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/proxy/clients/yalm_tokenizer/test_yalm_tokenizer.py +0 -0
  295. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/proxy/clients/yalm_tokenizer_client.py +0 -0
  296. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/proxy/example_queries.py +0 -0
  297. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/proxy/query.py +0 -0
  298. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/proxy/retry.py +0 -0
  299. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/proxy/server.py +0 -0
  300. {crfm-helm-0.2.1/src/helm/proxy/token_counters → crfm-helm-0.2.2/src/helm/proxy/services}/__init__.py +0 -0
  301. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/proxy/services/remote_service.py +0 -0
  302. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/proxy/services/server_service.py +0 -0
  303. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/proxy/services/service.py +0 -0
  304. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/proxy/services/test_remote_service.py +0 -0
  305. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/proxy/services/test_service.py +0 -0
  306. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/proxy/test_models.py +0 -0
  307. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/proxy/test_retry.py +0 -0
  308. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/proxy/token_counters/ai21_token_counter.py +0 -0
  309. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/proxy/token_counters/auto_token_counter.py +0 -0
  310. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/proxy/token_counters/cohere_token_counter.py +0 -0
  311. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/proxy/token_counters/free_token_counter.py +0 -0
  312. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/proxy/token_counters/gooseai_token_counter.py +0 -0
  313. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/proxy/token_counters/openai_token_counter.py +0 -0
  314. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/proxy/token_counters/test_ai21_token_counter.py +0 -0
  315. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/proxy/token_counters/test_openai_token_counter.py +0 -0
  316. {crfm-helm-0.2.1 → crfm-helm-0.2.2}/src/helm/proxy/token_counters/token_counter.py +0 -0
@@ -1,3 +1,4 @@
1
+ recursive-include src/helm/proxy/clients/ *.sp
1
2
  recursive-include src/helm/benchmark/ *.json
2
3
  recursive-include src/helm/benchmark/static/ *.css *.html *.js *.png *.yaml
3
4
  include requirements.txt
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: crfm-helm
3
- Version: 0.2.1
3
+ Version: 0.2.2
4
4
  Summary: Benchmark for language models
5
5
  Home-page: https://github.com/stanford-crfm/helm
6
6
  Author: Stanford CRFM
@@ -4,7 +4,7 @@
4
4
  [comment]: <> (When using the img tag, which allows us to specify size, src has to be a URL.)
5
5
  <img src="https://github.com/stanford-crfm/helm/raw/main/src/helm/benchmark/static/images/helm-logo.png" alt="" width="800"/>
6
6
 
7
- Welcome! The **`crfm-helm`** Python package contains code used in the **Holistic Evaluation of Language Models** project ([paper](https://arxiv.org/abs/2211.09110), [website](https://crfm.stanford.edu/helm/v1.0/)) by [Stanford CRFM](https://crfm.stanford.edu/). This package includes the following features:
7
+ Welcome! The **`crfm-helm`** Python package contains code used in the **Holistic Evaluation of Language Models** project ([paper](https://arxiv.org/abs/2211.09110), [website](https://crfm.stanford.edu/helm/latest/)) by [Stanford CRFM](https://crfm.stanford.edu/). This package includes the following features:
8
8
 
9
9
  - Collection of datasets in a standard format (e.g., NaturalQuestions)
10
10
  - Collection of models accessible via a unified API (e.g., GPT-3, MT-NLG, OPT, BLOOM)
@@ -6,19 +6,13 @@
6
6
  #
7
7
  # pip freeze | xargs pip uninstall -y
8
8
  # pip install -r requirements.txt
9
+ # pip install -r requirements-dev.txt
9
10
  # pip freeze | grep -v en-core-web-sm > requirements-freeze.txt
10
11
  #
11
12
  # Also update the versions in the manual installation steps in pre-commit.sh.
12
13
  #
13
14
  # Check that everything works because the versions might be upgraded.
14
15
 
15
- # Development
16
- pytest~=7.2.0
17
- black~=22.10.0
18
- mypy~=0.982
19
- pre-commit~=2.20.0
20
- flake8~=5.0.4
21
-
22
16
  # Common
23
17
  zstandard~=0.18.0
24
18
  tqdm~=4.64.1
@@ -26,6 +20,7 @@ pyhocon~=0.3.59
26
20
  dacite~=1.6.0
27
21
 
28
22
  # Proxy
23
+ aleph-alpha-client~=2.14.0
29
24
  bottle~=0.12.23
30
25
  gunicorn~=20.1.0
31
26
  Mako~=1.2.3
@@ -35,8 +30,9 @@ sqlitedict~=1.7.0
35
30
  pymongo~=4.2.0
36
31
  retrying~=1.3.3
37
32
  websocket-client~=1.3.2 # For Anthropic
38
- openai~=0.25.0
39
- transformers~=4.22.2 # For HuggingFace tokenizer
33
+ openai~=0.27.0
34
+ transformers~=4.26.1
35
+ tokenizers~=0.13.2
40
36
  icetk~=0.0.4
41
37
  protobuf~=3.20.2 # Can't use 4.21.0 due to backward incompatibility
42
38
  google-api-python-client~=2.64.0
@@ -50,6 +46,7 @@ sympy~=1.11.1 # For math scenarios
50
46
  sentencepiece~=0.1.97
51
47
  numba~=0.56.4
52
48
  cattrs~=22.2.0
49
+ xlrd~=2.0.1 # Used by pandas.read_excel in ice_scenario
53
50
 
54
51
  # Metrics
55
52
  importlib-resources~=5.10.0
@@ -68,3 +65,9 @@ summ-eval~=0.892
68
65
  # End users should install a CUDA version of PyTorch manually if needed
69
66
  torch~=1.12.1 # Summarization metrics
70
67
  torchvision~=0.13.1
68
+
69
+ # plotting
70
+ colorcet~=3.0.1
71
+ matplotlib~=3.6.0
72
+ numpy~=1.23.3
73
+ seaborn~=0.11.0
@@ -11,7 +11,7 @@ def get_requirements(path: str):
11
11
 
12
12
  setup(
13
13
  name="crfm-helm",
14
- version="0.2.1",
14
+ version="0.2.2",
15
15
  description="Benchmark for language models",
16
16
  long_description="Benchmark for language models",
17
17
  url="https://github.com/stanford-crfm/helm",
@@ -34,6 +34,7 @@ setup(
34
34
  "helm-run=helm.benchmark.run:main",
35
35
  "helm-summarize=helm.benchmark.presentation.summarize:main",
36
36
  "helm-server=helm.benchmark.server:main",
37
+ "helm-create-plots=helm.benchmark.presentation.create_plots:main",
37
38
  "crfm-proxy-server=helm.proxy.server:main",
38
39
  "crfm-proxy-cli=helm.proxy.cli:main",
39
40
  ]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: crfm-helm
3
- Version: 0.2.1
3
+ Version: 0.2.2
4
4
  Summary: Benchmark for language models
5
5
  Home-page: https://github.com/stanford-crfm/helm
6
6
  Author: Stanford CRFM
@@ -60,6 +60,7 @@ src/helm/benchmark/augmentations/space_perturbation.py
60
60
  src/helm/benchmark/augmentations/synonym_perturbation.py
61
61
  src/helm/benchmark/augmentations/test_perturbation.py
62
62
  src/helm/benchmark/augmentations/typos_perturbation.py
63
+ src/helm/benchmark/contamination/__init__.py
63
64
  src/helm/benchmark/efficiency_data/inference_denoised_runtimes.json
64
65
  src/helm/benchmark/efficiency_data/inference_idealized_runtimes.json
65
66
  src/helm/benchmark/efficiency_data/training_efficiency.json
@@ -103,12 +104,14 @@ src/helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py
103
104
  src/helm/benchmark/metrics/tokens/token_cost_estimator.py
104
105
  src/helm/benchmark/presentation/__init__.py
105
106
  src/helm/benchmark/presentation/contamination.py
107
+ src/helm/benchmark/presentation/create_plots.py
106
108
  src/helm/benchmark/presentation/run_display.py
107
109
  src/helm/benchmark/presentation/run_entry.py
108
110
  src/helm/benchmark/presentation/schema.py
109
111
  src/helm/benchmark/presentation/summarize.py
110
112
  src/helm/benchmark/presentation/table.py
111
113
  src/helm/benchmark/presentation/test_contamination.py
114
+ src/helm/benchmark/presentation/test_create_plots.py
112
115
  src/helm/benchmark/presentation/test_run_entry.py
113
116
  src/helm/benchmark/scenarios/__init__.py
114
117
  src/helm/benchmark/scenarios/babi_qa_scenario.py
@@ -130,6 +133,7 @@ src/helm/benchmark/scenarios/entity_data_imputation_scenario.py
130
133
  src/helm/benchmark/scenarios/entity_matching_scenario.py
131
134
  src/helm/benchmark/scenarios/gsm_scenario.py
132
135
  src/helm/benchmark/scenarios/ice_scenario.py
136
+ src/helm/benchmark/scenarios/imdb_listdir.json
133
137
  src/helm/benchmark/scenarios/imdb_scenario.py
134
138
  src/helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py
135
139
  src/helm/benchmark/scenarios/legal_support_scenario.py
@@ -148,6 +152,7 @@ src/helm/benchmark/scenarios/narrativeqa_scenario.py
148
152
  src/helm/benchmark/scenarios/natural_qa_scenario.py
149
153
  src/helm/benchmark/scenarios/newsqa_scenario.py
150
154
  src/helm/benchmark/scenarios/numeracy_scenario.py
155
+ src/helm/benchmark/scenarios/opinions_qa_scenario.py
151
156
  src/helm/benchmark/scenarios/pubmed_qa_scenario.py
152
157
  src/helm/benchmark/scenarios/quac_scenario.py
153
158
  src/helm/benchmark/scenarios/raft_scenario.py
@@ -173,6 +178,7 @@ src/helm/benchmark/static/index.html
173
178
  src/helm/benchmark/static/info-icon.png
174
179
  src/helm/benchmark/static/json-urls-root.js
175
180
  src/helm/benchmark/static/json-urls.js
181
+ src/helm/benchmark/static/plot-captions.js
176
182
  src/helm/benchmark/static/schema.yaml
177
183
  src/helm/benchmark/static/utils.js
178
184
  src/helm/benchmark/static/images/crfm-logo.png
@@ -200,9 +206,11 @@ src/helm/benchmark/window_services/anthropic_window_service.py
200
206
  src/helm/benchmark/window_services/bloom_window_service.py
201
207
  src/helm/benchmark/window_services/cohere_window_service.py
202
208
  src/helm/benchmark/window_services/encoder_decoder_window_service.py
209
+ src/helm/benchmark/window_services/flan_t5_window_service.py
203
210
  src/helm/benchmark/window_services/gpt2_window_service.py
204
211
  src/helm/benchmark/window_services/gptj_window_service.py
205
212
  src/helm/benchmark/window_services/gptneox_window_service.py
213
+ src/helm/benchmark/window_services/huggingface_window_service.py
206
214
  src/helm/benchmark/window_services/ice_window_service.py
207
215
  src/helm/benchmark/window_services/local_window_service.py
208
216
  src/helm/benchmark/window_services/luminous_window_service.py
@@ -216,6 +224,7 @@ src/helm/benchmark/window_services/test_ai21_window_service.py
216
224
  src/helm/benchmark/window_services/test_bloom_window_service.py
217
225
  src/helm/benchmark/window_services/test_cohere_window_service.py
218
226
  src/helm/benchmark/window_services/test_cohere_window_service_utils.py
227
+ src/helm/benchmark/window_services/test_flan_t5_window_service.py
219
228
  src/helm/benchmark/window_services/test_gpt2_window_service.py
220
229
  src/helm/benchmark/window_services/test_gptj_window_service.py
221
230
  src/helm/benchmark/window_services/test_gptneox_window_service.py
@@ -230,6 +239,7 @@ src/helm/benchmark/window_services/test_utils.py
230
239
  src/helm/benchmark/window_services/test_yalm_window_service.py
231
240
  src/helm/benchmark/window_services/tokenizer_service.py
232
241
  src/helm/benchmark/window_services/ul2_window_service.py
242
+ src/helm/benchmark/window_services/wider_ai21_window_service.py
233
243
  src/helm/benchmark/window_services/wider_openai_window_service.py
234
244
  src/helm/benchmark/window_services/window_service.py
235
245
  src/helm/benchmark/window_services/window_service_factory.py
@@ -268,6 +278,7 @@ src/helm/proxy/clients/cohere_client.py
268
278
  src/helm/proxy/clients/google_client.py
269
279
  src/helm/proxy/clients/goose_ai_client.py
270
280
  src/helm/proxy/clients/huggingface_client.py
281
+ src/helm/proxy/clients/huggingface_model_registry.py
271
282
  src/helm/proxy/clients/huggingface_tokenizer.py
272
283
  src/helm/proxy/clients/ice_tokenizer_client.py
273
284
  src/helm/proxy/clients/microsoft_client.py
@@ -276,6 +287,7 @@ src/helm/proxy/clients/perspective_api_client.py
276
287
  src/helm/proxy/clients/simple_client.py
277
288
  src/helm/proxy/clients/test_client.py
278
289
  src/helm/proxy/clients/test_huggingface_client.py
290
+ src/helm/proxy/clients/test_huggingface_model_registry.py
279
291
  src/helm/proxy/clients/test_huggingface_tokenizer.py
280
292
  src/helm/proxy/clients/test_ice_tokenizer_client.py
281
293
  src/helm/proxy/clients/test_yalm_tokenizer_client.py
@@ -283,6 +295,7 @@ src/helm/proxy/clients/together_client.py
283
295
  src/helm/proxy/clients/yalm_tokenizer_client.py
284
296
  src/helm/proxy/clients/yalm_tokenizer/__init__.py
285
297
  src/helm/proxy/clients/yalm_tokenizer/test_yalm_tokenizer.py
298
+ src/helm/proxy/clients/yalm_tokenizer/voc_100b.sp
286
299
  src/helm/proxy/clients/yalm_tokenizer/yalm_tokenizer.py
287
300
  src/helm/proxy/services/__init__.py
288
301
  src/helm/proxy/services/remote_service.py
@@ -1,6 +1,7 @@
1
1
  [console_scripts]
2
2
  crfm-proxy-cli = helm.proxy.cli:main
3
3
  crfm-proxy-server = helm.proxy.server:main
4
+ helm-create-plots = helm.benchmark.presentation.create_plots:main
4
5
  helm-run = helm.benchmark.run:main
5
6
  helm-server = helm.benchmark.server:main
6
7
  helm-summarize = helm.benchmark.presentation.summarize:main
@@ -1,12 +1,8 @@
1
- pytest~=7.2.0
2
- black~=22.10.0
3
- mypy~=0.982
4
- pre-commit~=2.20.0
5
- flake8~=5.0.4
6
1
  zstandard~=0.18.0
7
2
  tqdm~=4.64.1
8
3
  pyhocon~=0.3.59
9
4
  dacite~=1.6.0
5
+ aleph-alpha-client~=2.14.0
10
6
  bottle~=0.12.23
11
7
  gunicorn~=20.1.0
12
8
  Mako~=1.2.3
@@ -14,8 +10,9 @@ sqlitedict~=1.7.0
14
10
  pymongo~=4.2.0
15
11
  retrying~=1.3.3
16
12
  websocket-client~=1.3.2
17
- openai~=0.25.0
18
- transformers~=4.22.2
13
+ openai~=0.27.0
14
+ transformers~=4.26.1
15
+ tokenizers~=0.13.2
19
16
  icetk~=0.0.4
20
17
  protobuf~=3.20.2
21
18
  google-api-python-client~=2.64.0
@@ -27,6 +24,7 @@ sympy~=1.11.1
27
24
  sentencepiece~=0.1.97
28
25
  numba~=0.56.4
29
26
  cattrs~=22.2.0
27
+ xlrd~=2.0.1
30
28
  importlib-resources~=5.10.0
31
29
  nltk~=3.7
32
30
  scipy~=1.9.1
@@ -40,3 +38,7 @@ spacy~=3.2.4
40
38
  summ-eval~=0.892
41
39
  torch~=1.12.1
42
40
  torchvision~=0.13.1
41
+ colorcet~=3.0.1
42
+ matplotlib~=3.6.0
43
+ numpy~=1.23.3
44
+ seaborn~=0.11.0
@@ -42,6 +42,8 @@ from .scenarios import legal_support_scenario # noqa
42
42
  from .scenarios import entity_matching_scenario # noqa
43
43
  from .scenarios import entity_data_imputation_scenario # noqa
44
44
  from .scenarios import big_bench_scenario # noqa
45
+ from .scenarios import opinions_qa_scenario # noqa
46
+
45
47
 
46
48
  # Biomedical
47
49
  from .scenarios import covid_dialog_scenario # noqa
@@ -68,6 +68,9 @@ class AdapterSpec:
68
68
  # set of training instances. Used to compute error bars.
69
69
  num_train_trials: int = 1
70
70
 
71
+ # If true, randomly sample N training examples; if false, select N consecutive training examples
72
+ sample_train: bool = True
73
+
71
74
  # Decoding parameters (inherited by `Request`)
72
75
 
73
76
  # Model to make the request to (need to fill in)
@@ -23,7 +23,7 @@ class InContextLearningAdapter(Adapter, ABC):
23
23
  @htrack(None)
24
24
  def adapt(self, instances: List[Instance], parallelism: int) -> ScenarioState:
25
25
  """
26
- Takes a a list of `Instance`s and builds a list of corresponding `RequestState`s.
26
+ Takes a list of `Instance`s and builds a list of corresponding `RequestState`s.
27
27
  The reason we don't do this per eval instance is that we create a common set of
28
28
  training instances which is shared across all eval instances.
29
29
  """
@@ -65,7 +65,9 @@ class InContextLearningAdapter(Adapter, ABC):
65
65
  parallelism: int,
66
66
  ) -> List[RequestState]:
67
67
  self.train_trial_index: int = train_trial_index
68
- self.train_instances: List[Instance] = self.sample_examples(all_train_instances, seed=train_trial_index)
68
+ self.train_instances: List[Instance] = self.sample_examples(
69
+ all_train_instances, seed=train_trial_index, sample_train=self.adapter_spec.sample_train
70
+ )
69
71
  hlog(f"Sampled {len(self.train_instances)} examples for trial #{self.train_trial_index}.")
70
72
 
71
73
  # Generate request_states
@@ -93,7 +95,9 @@ class InContextLearningAdapter(Adapter, ABC):
93
95
 
94
96
  return [request_state for result in results for request_state in result]
95
97
 
96
- def sample_examples(self, all_train_instances: List[Instance], seed: int) -> List[Instance]:
98
+ def sample_examples(
99
+ self, all_train_instances: List[Instance], seed: int, sample_train: bool = True
100
+ ) -> List[Instance]:
97
101
  """
98
102
  Sample a random set of train instances to use as examples by following the steps below:
99
103
  1. Sort the class labels (i.e., correct References) by the number of Instances that belong to the
@@ -121,9 +125,14 @@ class InContextLearningAdapter(Adapter, ABC):
121
125
  random.seed(seed)
122
126
  num_instances_to_sample: int = min(len(all_train_instances), self.adapter_spec.max_train_instances)
123
127
 
128
+ examples: List[Instance] = []
129
+ if not sample_train:
130
+ # Select sequentially from the train set
131
+ examples = all_train_instances[num_instances_to_sample * seed : num_instances_to_sample * (seed + 1)]
132
+ return examples
133
+
124
134
  unlabeled_instances: List[Instance] = []
125
135
  label_to_instances: Dict[str, List[Instance]] = defaultdict(list)
126
-
127
136
  for instance in all_train_instances:
128
137
  if instance.first_correct_reference:
129
138
  label_to_instances[instance.first_correct_reference.output.text].append(instance)
@@ -145,7 +154,6 @@ class InContextLearningAdapter(Adapter, ABC):
145
154
  sorted_labels.extend(labels)
146
155
 
147
156
  labels_iterable = cycle(sorted_labels)
148
- examples: List[Instance] = []
149
157
  while num_instances_to_sample > 0:
150
158
  next_label: Optional[str] = next(labels_iterable, None)
151
159
  if not next_label:
@@ -218,10 +226,15 @@ class InContextLearningAdapter(Adapter, ABC):
218
226
 
219
227
  # References (optionally) and output
220
228
  output: str
229
+
230
+ delimiter = ","
221
231
  if reference_index is None:
222
232
  # Put only the correct reference as the output
223
- correct_reference: Optional[Reference] = instance.first_correct_reference
224
- output = correct_reference.output.text if correct_reference is not None else "n/a"
233
+ correct_references: List[Reference] = instance.all_correct_references
234
+ if not correct_references:
235
+ output = "n/a"
236
+ else:
237
+ output = delimiter.join([correct_reference.output.text for correct_reference in correct_references])
225
238
  else:
226
239
  reference = instance.references[reference_index]
227
240
  output = reference.output.text
@@ -1,6 +1,7 @@
1
- from typing import List
1
+ from typing import List, Optional
2
2
 
3
3
  from sklearn.metrics import f1_score
4
+ from sklearn.preprocessing import MultiLabelBinarizer
4
5
 
5
6
  from helm.benchmark.adaptation.request_state import RequestState
6
7
  from helm.benchmark.metrics.basic_metrics import normalize_text
@@ -20,8 +21,7 @@ class ClassificationMetric(Metric):
20
21
 
21
22
  Note:
22
23
  - The set of classes is derived from the correct references from all the instances.
23
- This means that classes may be omitted if they never are never used as a correct
24
- reference.
24
+ This means that classes may be omitted if they are never used as a correct reference.
25
25
  - Generations that are not in any of the known classes are counted as a
26
26
  negative prediction for every class.
27
27
  - Perturbed classes are considered different classes from unperturbed
@@ -29,10 +29,16 @@ class ClassificationMetric(Metric):
29
29
  - Currently, multi-label classification is not supported.
30
30
  """
31
31
 
32
+ def __init__(self, delimiter: Optional[str] = None):
33
+ self.delimiter = delimiter
34
+
35
+ def is_multi_label(self) -> bool:
36
+ return bool(self.delimiter)
37
+
32
38
  def evaluate_instances(self, request_states: List[RequestState]) -> List[Stat]:
33
- y_pred: List[str] = []
34
- y_true: List[str] = []
35
- for request_state in request_states:
39
+ y_pred: List[List[str]] = []
40
+ y_true: List[List[str]] = []
41
+ for request_state in request_states: # one request state per instance
36
42
  # Only the generation adapter is supported.
37
43
  # TODO: Support multiple_choice_* adapters.
38
44
  if request_state.reference_index is not None:
@@ -42,24 +48,23 @@ class ClassificationMetric(Metric):
42
48
  assert request_state.result is not None
43
49
  if len(request_state.result.completions) != 1:
44
50
  raise ValueError("Result must contain exactly one completion")
45
-
46
- num_correct = 0
47
- for reference in request_state.instance.references:
48
- if reference.is_correct:
49
- num_correct += 1
50
- y_true.append(normalize_text(reference.output.text))
51
- if num_correct != 1:
52
- # TODO: Support multi-label classification.
53
- raise ValueError("ClassificationMetric does not support multi-label classification")
54
51
  if request_state.output_mapping:
55
52
  raise ValueError("ClassificationMetric does not support multiple choice adapters")
56
- y_pred.append(normalize_text(request_state.result.completions[0].text))
57
- labels = list(set(y_true))
53
+
54
+ references = request_state.instance.all_correct_references
55
+ if not self.is_multi_label():
56
+ assert len(references) == 1
57
+ correct_ref_texts = [normalize_text(ref.output.text) for ref in references if ref.output.text]
58
+ y_true.append(correct_ref_texts)
59
+
60
+ input_text = request_state.result.completions[0].text
61
+ predictions = input_text.split(self.delimiter) if self.is_multi_label() else [input_text]
62
+ y_pred.append([normalize_text(pred) for pred in predictions if pred])
63
+ labels: List[str] = list(set(y for ys in y_true for y in ys))
64
+ mlb = MultiLabelBinarizer().fit([labels])
65
+ y_true = mlb.transform(y_true)
66
+ y_pred = mlb.transform(y_pred)
58
67
  return [
59
- Stat(MetricName("classification_macro_f1")).add(
60
- f1_score(y_pred=y_pred, y_true=y_true, labels=list(labels), average="macro")
61
- ),
62
- Stat(MetricName("classification_micro_f1")).add(
63
- f1_score(y_pred=y_pred, y_true=y_true, labels=list(labels), average="micro")
64
- ),
68
+ Stat(MetricName("classification_macro_f1")).add(f1_score(y_pred=y_pred, y_true=y_true, average="macro")),
69
+ Stat(MetricName("classification_micro_f1")).add(f1_score(y_pred=y_pred, y_true=y_true, average="micro")),
65
70
  ]
@@ -63,7 +63,8 @@ def _expected_stats(all_classes_counts: Dict[str, Dict[str, int]]):
63
63
 
64
64
 
65
65
  def test_evaluate_instances_binary_generation():
66
- metric = ClassificationMetric()
66
+ metric = ClassificationMetric(delimiter=None)
67
+
67
68
  request_states = [
68
69
  _request_state("yes", [_Option("yes", True)]),
69
70
  _request_state("yes", [_Option("yes", True)]),
@@ -86,20 +87,21 @@ def test_evaluate_instances_binary_generation():
86
87
 
87
88
 
88
89
  def test_evaluate_instances_multi_class():
89
- metric = ClassificationMetric()
90
+ # Note: no "a" because it would get filtered out by normalize_text()
91
+ metric = ClassificationMetric(delimiter=None)
90
92
 
91
93
  def _options(correct: str):
92
- return [_Option(text, text == correct) for text in ["a", "b", "c"]]
94
+ return [_Option(text, text == correct) for text in ["d", "b", "c"]]
93
95
 
94
96
  request_states = [
95
- _request_state("a", _options("a")),
96
- _request_state("a", _options("a")),
97
- _request_state("a", _options("a")),
98
- _request_state("a", _options("b")),
97
+ _request_state("d", _options("d")),
98
+ _request_state("d", _options("d")),
99
+ _request_state("d", _options("d")),
100
+ _request_state("d", _options("b")),
99
101
  _request_state("b", _options("b")),
100
102
  _request_state("b", _options("b")),
101
103
  _request_state("b", _options("c")),
102
- _request_state("c", _options("a")),
104
+ _request_state("c", _options("d")),
103
105
  _request_state("c", _options("c")),
104
106
  _request_state("invalid", _options("c")),
105
107
  ]
@@ -107,9 +109,42 @@ def test_evaluate_instances_multi_class():
107
109
  metric.evaluate_instances(request_states),
108
110
  _expected_stats(
109
111
  {
110
- "a": {"tp": 3, "fp": 1, "tn": 5, "fn": 1},
112
+ "d": {"tp": 3, "fp": 1, "tn": 5, "fn": 1},
111
113
  "b": {"tp": 2, "fp": 1, "tn": 6, "fn": 1},
112
114
  "c": {"tp": 1, "fp": 1, "tn": 6, "fn": 2},
113
115
  }
114
116
  ),
115
117
  )
118
+
119
+
120
+ def test_evaluate_instances_multilabel():
121
+ # Note: no "a" because it would get filtered out by normalize_text()
122
+ metric = ClassificationMetric(delimiter=",")
123
+
124
+ def _options(correct: List[str]):
125
+ return [_Option(text, text in correct) for text in ["d", "b", "c"]]
126
+
127
+ request_states = [
128
+ _request_state("d,b", _options(["d", "b"])),
129
+ _request_state("d,b", _options(["d", "c"])),
130
+ _request_state("d", _options(["d"])),
131
+ _request_state("c", _options(["b"])),
132
+ _request_state("b", _options(["b", "c"])),
133
+ _request_state("d,b", _options(["c"])),
134
+ _request_state("d,c", _options(["d"])),
135
+ _request_state("d,b,c", _options(["d", "b", "c"])),
136
+ _request_state("", []),
137
+ _request_state("n/a", []),
138
+ _request_state("invalid", _options(["c"])),
139
+ ]
140
+
141
+ assert_stats_equal(
142
+ metric.evaluate_instances(request_states),
143
+ _expected_stats(
144
+ {
145
+ "d": {"tp": 5, "fp": 1, "tn": 5, "fn": 0},
146
+ "b": {"tp": 3, "fp": 2, "tn": 5, "fn": 1},
147
+ "c": {"tp": 1, "fp": 2, "tn": 4, "fn": 4},
148
+ }
149
+ ),
150
+ )