crfm-helm 0.2.0__tar.gz → 0.2.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (318)
  1. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/MANIFEST.in +2 -1
  2. {crfm-helm-0.2.0/src/crfm_helm.egg-info → crfm-helm-0.2.2}/PKG-INFO +1 -1
  3. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/README.md +1 -1
  4. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/requirements.txt +13 -9
  5. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/setup.py +3 -2
  6. {crfm-helm-0.2.0 → crfm-helm-0.2.2/src/crfm_helm.egg-info}/PKG-INFO +1 -1
  7. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/crfm_helm.egg-info/SOURCES.txt +30 -1
  8. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/crfm_helm.egg-info/entry_points.txt +2 -1
  9. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/crfm_helm.egg-info/requires.txt +10 -7
  10. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/__init__.py +13 -0
  11. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/adaptation/adapter_spec.py +3 -0
  12. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +20 -7
  13. crfm-helm-0.2.2/src/helm/benchmark/augmentations/correct_to_misspelling.json +1 -0
  14. crfm-helm-0.2.2/src/helm/benchmark/metrics/classification_metrics.py +70 -0
  15. crfm-helm-0.2.2/src/helm/benchmark/metrics/machine_translation_metrics.py +36 -0
  16. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/summarization_metrics.py +7 -8
  17. crfm-helm-0.2.2/src/helm/benchmark/metrics/test_classification_metrics.py +150 -0
  18. crfm-helm-0.2.2/src/helm/benchmark/presentation/create_plots.py +617 -0
  19. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/presentation/run_display.py +7 -48
  20. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/presentation/summarize.py +4 -2
  21. crfm-helm-0.2.2/src/helm/benchmark/presentation/test_create_plots.py +32 -0
  22. crfm-helm-0.2.2/src/helm/benchmark/run.py +276 -0
  23. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/run_expander.py +164 -47
  24. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/run_specs.py +346 -39
  25. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/runner.py +34 -6
  26. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/copyright_scenario.py +1 -1
  27. crfm-helm-0.2.2/src/helm/benchmark/scenarios/covid_dialog_scenario.py +84 -0
  28. crfm-helm-0.2.2/src/helm/benchmark/scenarios/imdb_listdir.json +50014 -0
  29. crfm-helm-0.2.2/src/helm/benchmark/scenarios/lex_glue_scenario.py +253 -0
  30. crfm-helm-0.2.2/src/helm/benchmark/scenarios/lextreme_scenario.py +458 -0
  31. crfm-helm-0.2.2/src/helm/benchmark/scenarios/me_q_sum_scenario.py +86 -0
  32. crfm-helm-0.2.2/src/helm/benchmark/scenarios/med_dialog_scenario.py +132 -0
  33. crfm-helm-0.2.2/src/helm/benchmark/scenarios/med_mcqa_scenario.py +102 -0
  34. crfm-helm-0.2.2/src/helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +119 -0
  35. crfm-helm-0.2.2/src/helm/benchmark/scenarios/med_qa_scenario.py +96 -0
  36. crfm-helm-0.2.2/src/helm/benchmark/scenarios/opinions_qa_scenario.py +194 -0
  37. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/scenario.py +5 -0
  38. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/the_pile_scenario.py +1 -1
  39. crfm-helm-0.2.2/src/helm/benchmark/scenarios/wmt_14_scenario.py +96 -0
  40. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/static/benchmarking.css +14 -0
  41. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/static/benchmarking.js +43 -0
  42. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/static/index.html +2 -0
  43. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/static/json-urls.js +4 -0
  44. crfm-helm-0.2.2/src/helm/benchmark/static/plot-captions.js +16 -0
  45. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/static/schema.yaml +154 -1
  46. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/cohere_window_service.py +20 -0
  47. crfm-helm-0.2.2/src/helm/benchmark/window_services/flan_t5_window_service.py +29 -0
  48. crfm-helm-0.2.2/src/helm/benchmark/window_services/huggingface_window_service.py +39 -0
  49. crfm-helm-0.2.2/src/helm/benchmark/window_services/santacoder_window_service.py +27 -0
  50. crfm-helm-0.2.2/src/helm/benchmark/window_services/test_flan_t5_window_service.py +12 -0
  51. crfm-helm-0.2.2/src/helm/benchmark/window_services/wider_ai21_window_service.py +13 -0
  52. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/window_service_factory.py +34 -7
  53. crfm-helm-0.2.2/src/helm/common/codec.py +123 -0
  54. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/common/general.py +12 -5
  55. crfm-helm-0.2.2/src/helm/common/test_codec.py +144 -0
  56. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/proxy/clients/aleph_alpha_client.py +47 -28
  57. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/proxy/clients/auto_client.py +32 -24
  58. crfm-helm-0.2.2/src/helm/proxy/clients/google_client.py +88 -0
  59. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/proxy/clients/huggingface_client.py +32 -16
  60. crfm-helm-0.2.2/src/helm/proxy/clients/huggingface_model_registry.py +111 -0
  61. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/proxy/clients/huggingface_tokenizer.py +25 -7
  62. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/proxy/clients/openai_client.py +60 -2
  63. crfm-helm-0.2.2/src/helm/proxy/clients/test_huggingface_model_registry.py +57 -0
  64. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/proxy/clients/test_huggingface_tokenizer.py +3 -0
  65. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/proxy/clients/together_client.py +17 -2
  66. crfm-helm-0.2.2/src/helm/proxy/clients/yalm_tokenizer/voc_100b.sp +0 -0
  67. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/proxy/clients/yalm_tokenizer/yalm_tokenizer.py +8 -2
  68. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/proxy/models.py +115 -7
  69. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/proxy/test_models.py +1 -1
  70. crfm-helm-0.2.2/src/helm/proxy/token_counters/__init__.py +0 -0
  71. crfm-helm-0.2.0/src/helm/benchmark/presentation/present.py +0 -249
  72. crfm-helm-0.2.0/src/helm/benchmark/run.py +0 -180
  73. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/LICENSE +0 -0
  74. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/pyproject.toml +0 -0
  75. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/setup.cfg +0 -0
  76. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/crfm_helm.egg-info/dependency_links.txt +0 -0
  77. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/crfm_helm.egg-info/not-zip-safe +0 -0
  78. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/crfm_helm.egg-info/top_level.txt +0 -0
  79. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/__init__.py +0 -0
  80. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/adaptation/__init__.py +0 -0
  81. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/adaptation/adapters/__init__.py +0 -0
  82. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/adaptation/adapters/adapter.py +0 -0
  83. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/adaptation/adapters/adapter_factory.py +0 -0
  84. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +0 -0
  85. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/adaptation/adapters/generation_adapter.py +0 -0
  86. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/adaptation/adapters/language_modeling_adapter.py +0 -0
  87. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +0 -0
  88. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +0 -0
  89. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +0 -0
  90. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/adaptation/adapters/test_adapter.py +0 -0
  91. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/adaptation/adapters/test_generation_adapter.py +0 -0
  92. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +0 -0
  93. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +0 -0
  94. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/adaptation/prompt.py +0 -0
  95. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/adaptation/request_state.py +0 -0
  96. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/adaptation/scenario_state.py +0 -0
  97. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/augmentations/__init__.py +0 -0
  98. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/augmentations/contraction_expansion_perturbation.py +0 -0
  99. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/augmentations/contrast_sets_perturbation.py +0 -0
  100. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/augmentations/data_augmenter.py +0 -0
  101. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/augmentations/dialect_perturbation.py +0 -0
  102. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/augmentations/extra_space_perturbation.py +0 -0
  103. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/augmentations/filler_words_perturbation.py +0 -0
  104. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/augmentations/gender_perturbation.py +0 -0
  105. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/augmentations/lowercase_perturbation.py +0 -0
  106. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/augmentations/mild_mix_perturbation.py +0 -0
  107. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/augmentations/misspelling_perturbation.py +0 -0
  108. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/augmentations/person_name_perturbation.py +0 -0
  109. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/augmentations/perturbation.py +0 -0
  110. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/augmentations/perturbation_description.py +0 -0
  111. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/augmentations/space_perturbation.py +0 -0
  112. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/augmentations/synonym_perturbation.py +0 -0
  113. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/augmentations/test_perturbation.py +0 -0
  114. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/augmentations/typos_perturbation.py +0 -0
  115. {crfm-helm-0.2.0/src/helm/benchmark/metrics → crfm-helm-0.2.2/src/helm/benchmark/contamination}/__init__.py +0 -0
  116. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/data_preprocessor.py +0 -0
  117. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/efficiency_data/inference_denoised_runtimes.json +0 -0
  118. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/efficiency_data/inference_idealized_runtimes.json +0 -0
  119. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/efficiency_data/training_efficiency.json +0 -0
  120. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/executor.py +0 -0
  121. {crfm-helm-0.2.0/src/helm/benchmark/metrics/summac → crfm-helm-0.2.2/src/helm/benchmark/metrics}/__init__.py +0 -0
  122. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/basic_metrics.py +0 -0
  123. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/bbq_metrics.py +0 -0
  124. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/bias_metrics.py +0 -0
  125. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/bias_word_lists.py +0 -0
  126. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/code_metrics.py +0 -0
  127. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/code_metrics_helper.py +0 -0
  128. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/copyright_metrics.py +0 -0
  129. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/disinformation_metrics.py +0 -0
  130. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/metric.py +0 -0
  131. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/metric_name.py +0 -0
  132. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/metric_service.py +0 -0
  133. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/numeracy_metrics.py +0 -0
  134. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/ranking_metrics.py +0 -0
  135. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/statistic.py +0 -0
  136. {crfm-helm-0.2.0/src/helm/benchmark/metrics/tokens → crfm-helm-0.2.2/src/helm/benchmark/metrics/summac}/__init__.py +0 -0
  137. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/summac/model_summac.py +0 -0
  138. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/summac/utils_misc.py +0 -0
  139. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/test_bias_metrics.py +0 -0
  140. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/test_metric.py +0 -0
  141. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/test_numeracy_metrics.py +0 -0
  142. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/test_statistic.py +0 -0
  143. {crfm-helm-0.2.0/src/helm/benchmark/presentation → crfm-helm-0.2.2/src/helm/benchmark/metrics/tokens}/__init__.py +0 -0
  144. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +0 -0
  145. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +0 -0
  146. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +0 -0
  147. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/tokens/free_token_cost_estimator.py +0 -0
  148. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +0 -0
  149. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +0 -0
  150. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +0 -0
  151. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +0 -0
  152. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/tokens/token_cost_estimator.py +0 -0
  153. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/tokens_metric.py +0 -0
  154. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/metrics/toxicity_metrics.py +0 -0
  155. {crfm-helm-0.2.0/src/helm/benchmark/scenarios → crfm-helm-0.2.2/src/helm/benchmark/presentation}/__init__.py +0 -0
  156. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/presentation/contamination.py +0 -0
  157. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/presentation/run_entry.py +0 -0
  158. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/presentation/schema.py +0 -0
  159. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/presentation/table.py +0 -0
  160. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/presentation/test_contamination.py +0 -0
  161. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/presentation/test_run_entry.py +0 -0
  162. {crfm-helm-0.2.0/src/helm/benchmark/window_services → crfm-helm-0.2.2/src/helm/benchmark/scenarios}/__init__.py +0 -0
  163. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/babi_qa_scenario.py +0 -0
  164. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/bbq_scenario.py +0 -0
  165. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/big_bench_scenario.py +0 -0
  166. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/blimp_scenario.py +0 -0
  167. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/bold_scenario.py +0 -0
  168. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/boolq_scenario.py +0 -0
  169. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/civil_comments_scenario.py +0 -0
  170. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/code_scenario.py +0 -0
  171. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/code_scenario_helper.py +0 -0
  172. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/commonsense_scenario.py +0 -0
  173. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/dialogue_scenarios.py +0 -0
  174. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/disinformation_scenario.py +0 -0
  175. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/dyck_language_scenario.py +0 -0
  176. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/entity_data_imputation_scenario.py +0 -0
  177. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/entity_matching_scenario.py +0 -0
  178. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/gsm_scenario.py +0 -0
  179. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/ice_scenario.py +0 -0
  180. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/imdb_scenario.py +0 -0
  181. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +0 -0
  182. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/legal_support_scenario.py +0 -0
  183. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/lsat_qa_scenario.py +0 -0
  184. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/math_scenario.py +0 -0
  185. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/mmlu_scenario.py +0 -0
  186. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/msmarco_scenario.py +0 -0
  187. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/narrativeqa_scenario.py +0 -0
  188. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/natural_qa_scenario.py +0 -0
  189. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/newsqa_scenario.py +0 -0
  190. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/numeracy_scenario.py +0 -0
  191. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/pubmed_qa_scenario.py +0 -0
  192. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/quac_scenario.py +0 -0
  193. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/raft_scenario.py +0 -0
  194. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +0 -0
  195. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/simple_scenarios.py +0 -0
  196. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/summarization_scenario.py +0 -0
  197. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/synthetic_efficiency_scenario.py +0 -0
  198. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +0 -0
  199. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/synthetic_reasoning_scenario.py +0 -0
  200. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/test_scenario.py +0 -0
  201. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/truthful_qa_scenario.py +0 -0
  202. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/twitter_aae_scenario.py +0 -0
  203. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/wikifact_scenario.py +0 -0
  204. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/scenarios/wikitext_103_scenario.py +0 -0
  205. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/server.py +0 -0
  206. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/static/contamination.yaml +0 -0
  207. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/static/general.js +0 -0
  208. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/static/images/crfm-logo.png +0 -0
  209. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/static/images/helm-logo-simple.png +0 -0
  210. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/static/images/helm-logo.png +0 -0
  211. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/static/images/language-model-helm.png +0 -0
  212. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/static/images/organizations/ai21.png +0 -0
  213. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/static/images/organizations/anthropic.png +0 -0
  214. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/static/images/organizations/bigscience.png +0 -0
  215. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/static/images/organizations/cohere.png +0 -0
  216. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/static/images/organizations/eleutherai.png +0 -0
  217. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/static/images/organizations/google.png +0 -0
  218. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/static/images/organizations/meta.png +0 -0
  219. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/static/images/organizations/microsoft.png +0 -0
  220. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/static/images/organizations/nvidia.png +0 -0
  221. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/static/images/organizations/openai.png +0 -0
  222. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/static/images/organizations/together.png +0 -0
  223. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/static/images/organizations/tsinghua-keg.png +0 -0
  224. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/static/images/organizations/yandex.png +0 -0
  225. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/static/images/scenarios-by-metrics.png +0 -0
  226. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/static/images/taxonomy-scenarios.png +0 -0
  227. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/static/info-icon.png +0 -0
  228. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/static/json-urls-root.js +0 -0
  229. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/static/utils.js +0 -0
  230. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/test_data_preprocessor.py +0 -0
  231. {crfm-helm-0.2.0/src/helm/common → crfm-helm-0.2.2/src/helm/benchmark/window_services}/__init__.py +0 -0
  232. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/ai21_window_service.py +0 -0
  233. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/anthropic_window_service.py +0 -0
  234. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/bloom_window_service.py +0 -0
  235. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/encoder_decoder_window_service.py +0 -0
  236. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/gpt2_window_service.py +0 -0
  237. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/gptj_window_service.py +0 -0
  238. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/gptneox_window_service.py +0 -0
  239. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/ice_window_service.py +0 -0
  240. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/local_window_service.py +0 -0
  241. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/luminous_window_service.py +0 -0
  242. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/mt_nlg_window_service.py +0 -0
  243. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/openai_window_service.py +0 -0
  244. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/opt_window_service.py +0 -0
  245. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/t0pp_window_service.py +0 -0
  246. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/t511b_window_service.py +0 -0
  247. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/test_ai21_window_service.py +0 -0
  248. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/test_bloom_window_service.py +0 -0
  249. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/test_cohere_window_service.py +0 -0
  250. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/test_cohere_window_service_utils.py +0 -0
  251. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/test_gpt2_window_service.py +0 -0
  252. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/test_gptj_window_service.py +0 -0
  253. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/test_gptneox_window_service.py +0 -0
  254. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/test_ice_window_service.py +0 -0
  255. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -0
  256. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/test_openai_window_service.py +0 -0
  257. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/test_opt_window_service.py +0 -0
  258. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/test_t0pp_window_service.py +0 -0
  259. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/test_t511b_window_service.py +0 -0
  260. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/test_ul2_window_service.py +0 -0
  261. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/test_utils.py +0 -0
  262. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/test_yalm_window_service.py +0 -0
  263. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/tokenizer_service.py +0 -0
  264. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/ul2_window_service.py +0 -0
  265. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/wider_openai_window_service.py +0 -0
  266. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/window_service.py +0 -0
  267. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/benchmark/window_services/yalm_window_service.py +0 -0
  268. {crfm-helm-0.2.0/src/helm/proxy → crfm-helm-0.2.2/src/helm/common}/__init__.py +0 -0
  269. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/common/authentication.py +0 -0
  270. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/common/cache.py +0 -0
  271. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/common/hierarchical_logger.py +0 -0
  272. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/common/object_spec.py +0 -0
  273. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/common/perspective_api_request.py +0 -0
  274. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/common/request.py +0 -0
  275. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/common/test_cache.py +0 -0
  276. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/common/test_general.py +0 -0
  277. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/common/tokenization_request.py +0 -0
  278. {crfm-helm-0.2.0/src/helm/proxy/clients → crfm-helm-0.2.2/src/helm/proxy}/__init__.py +0 -0
  279. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/proxy/accounts.py +0 -0
  280. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/proxy/cli.py +0 -0
  281. {crfm-helm-0.2.0/src/helm/proxy/clients/yalm_tokenizer → crfm-helm-0.2.2/src/helm/proxy/clients}/__init__.py +0 -0
  282. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/proxy/clients/ai21_client.py +0 -0
  283. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/proxy/clients/anthropic_client.py +0 -0
  284. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/proxy/clients/chat_gpt_client.py +0 -0
  285. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/proxy/clients/client.py +0 -0
  286. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/proxy/clients/cohere_client.py +0 -0
  287. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/proxy/clients/goose_ai_client.py +0 -0
  288. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/proxy/clients/ice_tokenizer_client.py +0 -0
  289. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/proxy/clients/microsoft_client.py +0 -0
  290. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/proxy/clients/perspective_api_client.py +0 -0
  291. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/proxy/clients/simple_client.py +0 -0
  292. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/proxy/clients/test_client.py +0 -0
  293. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/proxy/clients/test_huggingface_client.py +0 -0
  294. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/proxy/clients/test_ice_tokenizer_client.py +0 -0
  295. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/proxy/clients/test_yalm_tokenizer_client.py +0 -0
  296. {crfm-helm-0.2.0/src/helm/proxy/services → crfm-helm-0.2.2/src/helm/proxy/clients/yalm_tokenizer}/__init__.py +0 -0
  297. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/proxy/clients/yalm_tokenizer/test_yalm_tokenizer.py +0 -0
  298. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/proxy/clients/yalm_tokenizer_client.py +0 -0
  299. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/proxy/example_queries.py +0 -0
  300. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/proxy/query.py +0 -0
  301. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/proxy/retry.py +0 -0
  302. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/proxy/server.py +0 -0
  303. {crfm-helm-0.2.0/src/helm/proxy/token_counters → crfm-helm-0.2.2/src/helm/proxy/services}/__init__.py +0 -0
  304. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/proxy/services/remote_service.py +0 -0
  305. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/proxy/services/server_service.py +0 -0
  306. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/proxy/services/service.py +0 -0
  307. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/proxy/services/test_remote_service.py +0 -0
  308. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/proxy/services/test_service.py +0 -0
  309. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/proxy/test_retry.py +0 -0
  310. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/proxy/token_counters/ai21_token_counter.py +0 -0
  311. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/proxy/token_counters/auto_token_counter.py +0 -0
  312. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/proxy/token_counters/cohere_token_counter.py +0 -0
  313. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/proxy/token_counters/free_token_counter.py +0 -0
  314. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/proxy/token_counters/gooseai_token_counter.py +0 -0
  315. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/proxy/token_counters/openai_token_counter.py +0 -0
  316. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/proxy/token_counters/test_ai21_token_counter.py +0 -0
  317. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/proxy/token_counters/test_openai_token_counter.py +0 -0
  318. {crfm-helm-0.2.0 → crfm-helm-0.2.2}/src/helm/proxy/token_counters/token_counter.py +0 -0
@@ -1,3 +1,4 @@
1
- recursive-include src/helm/benchmark/efficiency_data/ *.json
1
+ recursive-include src/helm/proxy/clients/ *.sp
2
+ recursive-include src/helm/benchmark/ *.json
2
3
  recursive-include src/helm/benchmark/static/ *.css *.html *.js *.png *.yaml
3
4
  include requirements.txt
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: crfm-helm
3
- Version: 0.2.0
3
+ Version: 0.2.2
4
4
  Summary: Benchmark for language models
5
5
  Home-page: https://github.com/stanford-crfm/helm
6
6
  Author: Stanford CRFM
@@ -4,7 +4,7 @@
4
4
  [comment]: <> (When using the img tag, which allows us to specify size, src has to be a URL.)
5
5
  <img src="https://github.com/stanford-crfm/helm/raw/main/src/helm/benchmark/static/images/helm-logo.png" alt="" width="800"/>
6
6
 
7
- Welcome! The **`crfm-helm`** Python package contains code used in the **Holistic Evaluation of Language Models** project ([paper](https://arxiv.org/abs/2211.09110), [website](https://crfm.stanford.edu/helm/v1.0/)) by [Stanford CRFM](https://crfm.stanford.edu/). This package includes the following features:
7
+ Welcome! The **`crfm-helm`** Python package contains code used in the **Holistic Evaluation of Language Models** project ([paper](https://arxiv.org/abs/2211.09110), [website](https://crfm.stanford.edu/helm/latest/)) by [Stanford CRFM](https://crfm.stanford.edu/). This package includes the following features:
8
8
 
9
9
  - Collection of datasets in a standard format (e.g., NaturalQuestions)
10
10
  - Collection of models accessible via a unified API (e.g., GPT-3, MT-NLG, OPT, BLOOM)
@@ -6,19 +6,13 @@
6
6
  #
7
7
  # pip freeze | xargs pip uninstall -y
8
8
  # pip install -r requirements.txt
9
+ # pip install -r requirements-dev.txt
9
10
  # pip freeze | grep -v en-core-web-sm > requirements-freeze.txt
10
11
  #
11
12
  # Also update the versions in the manual installation steps in pre-commit.sh.
12
13
  #
13
14
  # Check that everything works because the versions might be upgraded.
14
15
 
15
- # Development
16
- pytest~=7.2.0
17
- black~=22.10.0
18
- mypy~=0.982
19
- pre-commit~=2.20.0
20
- flake8~=5.0.4
21
-
22
16
  # Common
23
17
  zstandard~=0.18.0
24
18
  tqdm~=4.64.1
@@ -26,6 +20,7 @@ pyhocon~=0.3.59
26
20
  dacite~=1.6.0
27
21
 
28
22
  # Proxy
23
+ aleph-alpha-client~=2.14.0
29
24
  bottle~=0.12.23
30
25
  gunicorn~=20.1.0
31
26
  Mako~=1.2.3
@@ -35,8 +30,9 @@ sqlitedict~=1.7.0
35
30
  pymongo~=4.2.0
36
31
  retrying~=1.3.3
37
32
  websocket-client~=1.3.2 # For Anthropic
38
- openai~=0.25.0
39
- transformers~=4.22.2 # For HuggingFace tokenizer
33
+ openai~=0.27.0
34
+ transformers~=4.26.1
35
+ tokenizers~=0.13.2
40
36
  icetk~=0.0.4
41
37
  protobuf~=3.20.2 # Can't use 4.21.0 due to backward incompatibility
42
38
  google-api-python-client~=2.64.0
@@ -49,6 +45,8 @@ jsonlines~=3.1.0 # Not really needed
49
45
  sympy~=1.11.1 # For math scenarios
50
46
  sentencepiece~=0.1.97
51
47
  numba~=0.56.4
48
+ cattrs~=22.2.0
49
+ xlrd~=2.0.1 # Used by pandas.read_excel in ice_scenario
52
50
 
53
51
  # Metrics
54
52
  importlib-resources~=5.10.0
@@ -67,3 +65,9 @@ summ-eval~=0.892
67
65
  # End users should install a CUDA version of PyTorch manually if needed
68
66
  torch~=1.12.1 # Summarization metrics
69
67
  torchvision~=0.13.1
68
+
69
+ # plotting
70
+ colorcet~=3.0.1
71
+ matplotlib~=3.6.0
72
+ numpy~=1.23.3
73
+ seaborn~=0.11.0
@@ -11,7 +11,7 @@ def get_requirements(path: str):
11
11
 
12
12
  setup(
13
13
  name="crfm-helm",
14
- version="0.2.0",
14
+ version="0.2.2",
15
15
  description="Benchmark for language models",
16
16
  long_description="Benchmark for language models",
17
17
  url="https://github.com/stanford-crfm/helm",
@@ -31,9 +31,10 @@ setup(
31
31
  install_requires=get_requirements("requirements.txt"),
32
32
  entry_points={
33
33
  "console_scripts": [
34
- "helm-run=helm.benchmark.presentation.present:main",
34
+ "helm-run=helm.benchmark.run:main",
35
35
  "helm-summarize=helm.benchmark.presentation.summarize:main",
36
36
  "helm-server=helm.benchmark.server:main",
37
+ "helm-create-plots=helm.benchmark.presentation.create_plots:main",
37
38
  "crfm-proxy-server=helm.proxy.server:main",
38
39
  "crfm-proxy-cli=helm.proxy.cli:main",
39
40
  ]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: crfm-helm
3
- Version: 0.2.0
3
+ Version: 0.2.2
4
4
  Summary: Benchmark for language models
5
5
  Home-page: https://github.com/stanford-crfm/helm
6
6
  Author: Stanford CRFM
@@ -44,6 +44,7 @@ src/helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py
44
44
  src/helm/benchmark/augmentations/__init__.py
45
45
  src/helm/benchmark/augmentations/contraction_expansion_perturbation.py
46
46
  src/helm/benchmark/augmentations/contrast_sets_perturbation.py
47
+ src/helm/benchmark/augmentations/correct_to_misspelling.json
47
48
  src/helm/benchmark/augmentations/data_augmenter.py
48
49
  src/helm/benchmark/augmentations/dialect_perturbation.py
49
50
  src/helm/benchmark/augmentations/extra_space_perturbation.py
@@ -59,6 +60,7 @@ src/helm/benchmark/augmentations/space_perturbation.py
59
60
  src/helm/benchmark/augmentations/synonym_perturbation.py
60
61
  src/helm/benchmark/augmentations/test_perturbation.py
61
62
  src/helm/benchmark/augmentations/typos_perturbation.py
63
+ src/helm/benchmark/contamination/__init__.py
62
64
  src/helm/benchmark/efficiency_data/inference_denoised_runtimes.json
63
65
  src/helm/benchmark/efficiency_data/inference_idealized_runtimes.json
64
66
  src/helm/benchmark/efficiency_data/training_efficiency.json
@@ -67,10 +69,12 @@ src/helm/benchmark/metrics/basic_metrics.py
67
69
  src/helm/benchmark/metrics/bbq_metrics.py
68
70
  src/helm/benchmark/metrics/bias_metrics.py
69
71
  src/helm/benchmark/metrics/bias_word_lists.py
72
+ src/helm/benchmark/metrics/classification_metrics.py
70
73
  src/helm/benchmark/metrics/code_metrics.py
71
74
  src/helm/benchmark/metrics/code_metrics_helper.py
72
75
  src/helm/benchmark/metrics/copyright_metrics.py
73
76
  src/helm/benchmark/metrics/disinformation_metrics.py
77
+ src/helm/benchmark/metrics/machine_translation_metrics.py
74
78
  src/helm/benchmark/metrics/metric.py
75
79
  src/helm/benchmark/metrics/metric_name.py
76
80
  src/helm/benchmark/metrics/metric_service.py
@@ -79,6 +83,7 @@ src/helm/benchmark/metrics/ranking_metrics.py
79
83
  src/helm/benchmark/metrics/statistic.py
80
84
  src/helm/benchmark/metrics/summarization_metrics.py
81
85
  src/helm/benchmark/metrics/test_bias_metrics.py
86
+ src/helm/benchmark/metrics/test_classification_metrics.py
82
87
  src/helm/benchmark/metrics/test_metric.py
83
88
  src/helm/benchmark/metrics/test_numeracy_metrics.py
84
89
  src/helm/benchmark/metrics/test_statistic.py
@@ -99,13 +104,14 @@ src/helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py
99
104
  src/helm/benchmark/metrics/tokens/token_cost_estimator.py
100
105
  src/helm/benchmark/presentation/__init__.py
101
106
  src/helm/benchmark/presentation/contamination.py
102
- src/helm/benchmark/presentation/present.py
107
+ src/helm/benchmark/presentation/create_plots.py
103
108
  src/helm/benchmark/presentation/run_display.py
104
109
  src/helm/benchmark/presentation/run_entry.py
105
110
  src/helm/benchmark/presentation/schema.py
106
111
  src/helm/benchmark/presentation/summarize.py
107
112
  src/helm/benchmark/presentation/table.py
108
113
  src/helm/benchmark/presentation/test_contamination.py
114
+ src/helm/benchmark/presentation/test_create_plots.py
109
115
  src/helm/benchmark/presentation/test_run_entry.py
110
116
  src/helm/benchmark/scenarios/__init__.py
111
117
  src/helm/benchmark/scenarios/babi_qa_scenario.py
@@ -119,6 +125,7 @@ src/helm/benchmark/scenarios/code_scenario.py
119
125
  src/helm/benchmark/scenarios/code_scenario_helper.py
120
126
  src/helm/benchmark/scenarios/commonsense_scenario.py
121
127
  src/helm/benchmark/scenarios/copyright_scenario.py
128
+ src/helm/benchmark/scenarios/covid_dialog_scenario.py
122
129
  src/helm/benchmark/scenarios/dialogue_scenarios.py
123
130
  src/helm/benchmark/scenarios/disinformation_scenario.py
124
131
  src/helm/benchmark/scenarios/dyck_language_scenario.py
@@ -126,17 +133,26 @@ src/helm/benchmark/scenarios/entity_data_imputation_scenario.py
126
133
  src/helm/benchmark/scenarios/entity_matching_scenario.py
127
134
  src/helm/benchmark/scenarios/gsm_scenario.py
128
135
  src/helm/benchmark/scenarios/ice_scenario.py
136
+ src/helm/benchmark/scenarios/imdb_listdir.json
129
137
  src/helm/benchmark/scenarios/imdb_scenario.py
130
138
  src/helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py
131
139
  src/helm/benchmark/scenarios/legal_support_scenario.py
140
+ src/helm/benchmark/scenarios/lex_glue_scenario.py
141
+ src/helm/benchmark/scenarios/lextreme_scenario.py
132
142
  src/helm/benchmark/scenarios/lsat_qa_scenario.py
133
143
  src/helm/benchmark/scenarios/math_scenario.py
144
+ src/helm/benchmark/scenarios/me_q_sum_scenario.py
145
+ src/helm/benchmark/scenarios/med_dialog_scenario.py
146
+ src/helm/benchmark/scenarios/med_mcqa_scenario.py
147
+ src/helm/benchmark/scenarios/med_paragraph_simplification_scenario.py
148
+ src/helm/benchmark/scenarios/med_qa_scenario.py
134
149
  src/helm/benchmark/scenarios/mmlu_scenario.py
135
150
  src/helm/benchmark/scenarios/msmarco_scenario.py
136
151
  src/helm/benchmark/scenarios/narrativeqa_scenario.py
137
152
  src/helm/benchmark/scenarios/natural_qa_scenario.py
138
153
  src/helm/benchmark/scenarios/newsqa_scenario.py
139
154
  src/helm/benchmark/scenarios/numeracy_scenario.py
155
+ src/helm/benchmark/scenarios/opinions_qa_scenario.py
140
156
  src/helm/benchmark/scenarios/pubmed_qa_scenario.py
141
157
  src/helm/benchmark/scenarios/quac_scenario.py
142
158
  src/helm/benchmark/scenarios/raft_scenario.py
@@ -153,6 +169,7 @@ src/helm/benchmark/scenarios/truthful_qa_scenario.py
153
169
  src/helm/benchmark/scenarios/twitter_aae_scenario.py
154
170
  src/helm/benchmark/scenarios/wikifact_scenario.py
155
171
  src/helm/benchmark/scenarios/wikitext_103_scenario.py
172
+ src/helm/benchmark/scenarios/wmt_14_scenario.py
156
173
  src/helm/benchmark/static/benchmarking.css
157
174
  src/helm/benchmark/static/benchmarking.js
158
175
  src/helm/benchmark/static/contamination.yaml
@@ -161,6 +178,7 @@ src/helm/benchmark/static/index.html
161
178
  src/helm/benchmark/static/info-icon.png
162
179
  src/helm/benchmark/static/json-urls-root.js
163
180
  src/helm/benchmark/static/json-urls.js
181
+ src/helm/benchmark/static/plot-captions.js
164
182
  src/helm/benchmark/static/schema.yaml
165
183
  src/helm/benchmark/static/utils.js
166
184
  src/helm/benchmark/static/images/crfm-logo.png
@@ -188,21 +206,25 @@ src/helm/benchmark/window_services/anthropic_window_service.py
188
206
  src/helm/benchmark/window_services/bloom_window_service.py
189
207
  src/helm/benchmark/window_services/cohere_window_service.py
190
208
  src/helm/benchmark/window_services/encoder_decoder_window_service.py
209
+ src/helm/benchmark/window_services/flan_t5_window_service.py
191
210
  src/helm/benchmark/window_services/gpt2_window_service.py
192
211
  src/helm/benchmark/window_services/gptj_window_service.py
193
212
  src/helm/benchmark/window_services/gptneox_window_service.py
213
+ src/helm/benchmark/window_services/huggingface_window_service.py
194
214
  src/helm/benchmark/window_services/ice_window_service.py
195
215
  src/helm/benchmark/window_services/local_window_service.py
196
216
  src/helm/benchmark/window_services/luminous_window_service.py
197
217
  src/helm/benchmark/window_services/mt_nlg_window_service.py
198
218
  src/helm/benchmark/window_services/openai_window_service.py
199
219
  src/helm/benchmark/window_services/opt_window_service.py
220
+ src/helm/benchmark/window_services/santacoder_window_service.py
200
221
  src/helm/benchmark/window_services/t0pp_window_service.py
201
222
  src/helm/benchmark/window_services/t511b_window_service.py
202
223
  src/helm/benchmark/window_services/test_ai21_window_service.py
203
224
  src/helm/benchmark/window_services/test_bloom_window_service.py
204
225
  src/helm/benchmark/window_services/test_cohere_window_service.py
205
226
  src/helm/benchmark/window_services/test_cohere_window_service_utils.py
227
+ src/helm/benchmark/window_services/test_flan_t5_window_service.py
206
228
  src/helm/benchmark/window_services/test_gpt2_window_service.py
207
229
  src/helm/benchmark/window_services/test_gptj_window_service.py
208
230
  src/helm/benchmark/window_services/test_gptneox_window_service.py
@@ -217,6 +239,7 @@ src/helm/benchmark/window_services/test_utils.py
217
239
  src/helm/benchmark/window_services/test_yalm_window_service.py
218
240
  src/helm/benchmark/window_services/tokenizer_service.py
219
241
  src/helm/benchmark/window_services/ul2_window_service.py
242
+ src/helm/benchmark/window_services/wider_ai21_window_service.py
220
243
  src/helm/benchmark/window_services/wider_openai_window_service.py
221
244
  src/helm/benchmark/window_services/window_service.py
222
245
  src/helm/benchmark/window_services/window_service_factory.py
@@ -224,12 +247,14 @@ src/helm/benchmark/window_services/yalm_window_service.py
224
247
  src/helm/common/__init__.py
225
248
  src/helm/common/authentication.py
226
249
  src/helm/common/cache.py
250
+ src/helm/common/codec.py
227
251
  src/helm/common/general.py
228
252
  src/helm/common/hierarchical_logger.py
229
253
  src/helm/common/object_spec.py
230
254
  src/helm/common/perspective_api_request.py
231
255
  src/helm/common/request.py
232
256
  src/helm/common/test_cache.py
257
+ src/helm/common/test_codec.py
233
258
  src/helm/common/test_general.py
234
259
  src/helm/common/tokenization_request.py
235
260
  src/helm/proxy/__init__.py
@@ -250,8 +275,10 @@ src/helm/proxy/clients/auto_client.py
250
275
  src/helm/proxy/clients/chat_gpt_client.py
251
276
  src/helm/proxy/clients/client.py
252
277
  src/helm/proxy/clients/cohere_client.py
278
+ src/helm/proxy/clients/google_client.py
253
279
  src/helm/proxy/clients/goose_ai_client.py
254
280
  src/helm/proxy/clients/huggingface_client.py
281
+ src/helm/proxy/clients/huggingface_model_registry.py
255
282
  src/helm/proxy/clients/huggingface_tokenizer.py
256
283
  src/helm/proxy/clients/ice_tokenizer_client.py
257
284
  src/helm/proxy/clients/microsoft_client.py
@@ -260,6 +287,7 @@ src/helm/proxy/clients/perspective_api_client.py
260
287
  src/helm/proxy/clients/simple_client.py
261
288
  src/helm/proxy/clients/test_client.py
262
289
  src/helm/proxy/clients/test_huggingface_client.py
290
+ src/helm/proxy/clients/test_huggingface_model_registry.py
263
291
  src/helm/proxy/clients/test_huggingface_tokenizer.py
264
292
  src/helm/proxy/clients/test_ice_tokenizer_client.py
265
293
  src/helm/proxy/clients/test_yalm_tokenizer_client.py
@@ -267,6 +295,7 @@ src/helm/proxy/clients/together_client.py
267
295
  src/helm/proxy/clients/yalm_tokenizer_client.py
268
296
  src/helm/proxy/clients/yalm_tokenizer/__init__.py
269
297
  src/helm/proxy/clients/yalm_tokenizer/test_yalm_tokenizer.py
298
+ src/helm/proxy/clients/yalm_tokenizer/voc_100b.sp
270
299
  src/helm/proxy/clients/yalm_tokenizer/yalm_tokenizer.py
271
300
  src/helm/proxy/services/__init__.py
272
301
  src/helm/proxy/services/remote_service.py
@@ -1,6 +1,7 @@
1
1
  [console_scripts]
2
2
  crfm-proxy-cli = helm.proxy.cli:main
3
3
  crfm-proxy-server = helm.proxy.server:main
4
- helm-run = helm.benchmark.presentation.present:main
4
+ helm-create-plots = helm.benchmark.presentation.create_plots:main
5
+ helm-run = helm.benchmark.run:main
5
6
  helm-server = helm.benchmark.server:main
6
7
  helm-summarize = helm.benchmark.presentation.summarize:main
@@ -1,12 +1,8 @@
1
- pytest~=7.2.0
2
- black~=22.10.0
3
- mypy~=0.982
4
- pre-commit~=2.20.0
5
- flake8~=5.0.4
6
1
  zstandard~=0.18.0
7
2
  tqdm~=4.64.1
8
3
  pyhocon~=0.3.59
9
4
  dacite~=1.6.0
5
+ aleph-alpha-client~=2.14.0
10
6
  bottle~=0.12.23
11
7
  gunicorn~=20.1.0
12
8
  Mako~=1.2.3
@@ -14,8 +10,9 @@ sqlitedict~=1.7.0
14
10
  pymongo~=4.2.0
15
11
  retrying~=1.3.3
16
12
  websocket-client~=1.3.2
17
- openai~=0.25.0
18
- transformers~=4.22.2
13
+ openai~=0.27.0
14
+ transformers~=4.26.1
15
+ tokenizers~=0.13.2
19
16
  icetk~=0.0.4
20
17
  protobuf~=3.20.2
21
18
  google-api-python-client~=2.64.0
@@ -26,6 +23,8 @@ jsonlines~=3.1.0
26
23
  sympy~=1.11.1
27
24
  sentencepiece~=0.1.97
28
25
  numba~=0.56.4
26
+ cattrs~=22.2.0
27
+ xlrd~=2.0.1
29
28
  importlib-resources~=5.10.0
30
29
  nltk~=3.7
31
30
  scipy~=1.9.1
@@ -39,3 +38,7 @@ spacy~=3.2.4
39
38
  summ-eval~=0.892
40
39
  torch~=1.12.1
41
40
  torchvision~=0.13.1
41
+ colorcet~=3.0.1
42
+ matplotlib~=3.6.0
43
+ numpy~=1.23.3
44
+ seaborn~=0.11.0
@@ -42,12 +42,24 @@ from .scenarios import legal_support_scenario # noqa
42
42
  from .scenarios import entity_matching_scenario # noqa
43
43
  from .scenarios import entity_data_imputation_scenario # noqa
44
44
  from .scenarios import big_bench_scenario # noqa
45
+ from .scenarios import opinions_qa_scenario # noqa
46
+
47
+
48
+ # Biomedical
49
+ from .scenarios import covid_dialog_scenario # noqa
50
+ from .scenarios import me_q_sum_scenario # noqa
51
+ from .scenarios import med_dialog_scenario # noqa
52
+ from .scenarios import med_mcqa_scenario # noqa
53
+ from .scenarios import med_paragraph_simplification_scenario # noqa
54
+ from .scenarios import med_qa_scenario # noqa
45
55
  from .scenarios import pubmed_qa_scenario # noqa
56
+ from .scenarios import wmt_14_scenario # noqa
46
57
 
47
58
  # Metrics
48
59
  from .metrics import basic_metrics # noqa
49
60
  from .metrics import bbq_metrics # noqa
50
61
  from .metrics import bias_metrics # noqa
62
+ from .metrics import classification_metrics # noqa
51
63
  from .metrics import code_metrics # noqa
52
64
  from .metrics import copyright_metrics # noqa
53
65
  from .metrics import disinformation_metrics # noqa
@@ -56,6 +68,7 @@ from .metrics import ranking_metrics # noqa
56
68
  from .metrics import summarization_metrics # noqa
57
69
  from .metrics import toxicity_metrics # noqa
58
70
  from .metrics import tokens_metric # noqa
71
+ from .metrics import machine_translation_metrics # noqa
59
72
 
60
73
  # Perturbations for data augmentation
61
74
  from .augmentations.extra_space_perturbation import ExtraSpacePerturbation # noqa
@@ -68,6 +68,9 @@ class AdapterSpec:
68
68
  # set of training instances. Used to compute error bars.
69
69
  num_train_trials: int = 1
70
70
 
71
+ # If true, randomly sample N training examples; if false, select N consecutive training examples
72
+ sample_train: bool = True
73
+
71
74
  # Decoding parameters (inherited by `Request`)
72
75
 
73
76
  # Model to make the request to (need to fill in)
@@ -23,7 +23,7 @@ class InContextLearningAdapter(Adapter, ABC):
23
23
  @htrack(None)
24
24
  def adapt(self, instances: List[Instance], parallelism: int) -> ScenarioState:
25
25
  """
26
- Takes a a list of `Instance`s and builds a list of corresponding `RequestState`s.
26
+ Takes a list of `Instance`s and builds a list of corresponding `RequestState`s.
27
27
  The reason we don't do this per eval instance is that we create a common set of
28
28
  training instances which is shared across all eval instances.
29
29
  """
@@ -65,7 +65,9 @@ class InContextLearningAdapter(Adapter, ABC):
65
65
  parallelism: int,
66
66
  ) -> List[RequestState]:
67
67
  self.train_trial_index: int = train_trial_index
68
- self.train_instances: List[Instance] = self.sample_examples(all_train_instances, seed=train_trial_index)
68
+ self.train_instances: List[Instance] = self.sample_examples(
69
+ all_train_instances, seed=train_trial_index, sample_train=self.adapter_spec.sample_train
70
+ )
69
71
  hlog(f"Sampled {len(self.train_instances)} examples for trial #{self.train_trial_index}.")
70
72
 
71
73
  # Generate request_states
@@ -93,7 +95,9 @@ class InContextLearningAdapter(Adapter, ABC):
93
95
 
94
96
  return [request_state for result in results for request_state in result]
95
97
 
96
- def sample_examples(self, all_train_instances: List[Instance], seed: int) -> List[Instance]:
98
+ def sample_examples(
99
+ self, all_train_instances: List[Instance], seed: int, sample_train: bool = True
100
+ ) -> List[Instance]:
97
101
  """
98
102
  Sample a random set of train instances to use as examples by following the steps below:
99
103
  1. Sort the class labels (i.e., correct References) by the number of Instances that belong to the
@@ -121,9 +125,14 @@ class InContextLearningAdapter(Adapter, ABC):
121
125
  random.seed(seed)
122
126
  num_instances_to_sample: int = min(len(all_train_instances), self.adapter_spec.max_train_instances)
123
127
 
128
+ examples: List[Instance] = []
129
+ if not sample_train:
130
+ # Select sequentially from the train set
131
+ examples = all_train_instances[num_instances_to_sample * seed : num_instances_to_sample * (seed + 1)]
132
+ return examples
133
+
124
134
  unlabeled_instances: List[Instance] = []
125
135
  label_to_instances: Dict[str, List[Instance]] = defaultdict(list)
126
-
127
136
  for instance in all_train_instances:
128
137
  if instance.first_correct_reference:
129
138
  label_to_instances[instance.first_correct_reference.output.text].append(instance)
@@ -145,7 +154,6 @@ class InContextLearningAdapter(Adapter, ABC):
145
154
  sorted_labels.extend(labels)
146
155
 
147
156
  labels_iterable = cycle(sorted_labels)
148
- examples: List[Instance] = []
149
157
  while num_instances_to_sample > 0:
150
158
  next_label: Optional[str] = next(labels_iterable, None)
151
159
  if not next_label:
@@ -218,10 +226,15 @@ class InContextLearningAdapter(Adapter, ABC):
218
226
 
219
227
  # References (optionally) and output
220
228
  output: str
229
+
230
+ delimiter = ","
221
231
  if reference_index is None:
222
232
  # Put only the correct reference as the output
223
- correct_reference: Optional[Reference] = instance.first_correct_reference
224
- output = correct_reference.output.text if correct_reference is not None else "n/a"
233
+ correct_references: List[Reference] = instance.all_correct_references
234
+ if not correct_references:
235
+ output = "n/a"
236
+ else:
237
+ output = delimiter.join([correct_reference.output.text for correct_reference in correct_references])
225
238
  else:
226
239
  reference = instance.references[reference_index]
227
240
  output = reference.output.text